femto fixes to rparsexml; add rl_settings.xmlParser option prefer 'lxml'
author robin
Wed, 03 Aug 2022 13:24:40 +0100
changeset 4734 4f30137c136e
parent 4733 58ea94e4c517
child 4735 637b714bc032
femto fixes to rparsexml; add rl_settings.xmlParser option prefer 'lxml'
CHANGES.md
src/reportlab/__init__.py
src/reportlab/lib/rparsexml.py
src/reportlab/platypus/paraparser.py
src/reportlab/rl_settings.py
--- a/CHANGES.md	Thu Jul 21 09:29:56 2022 +0100
+++ b/CHANGES.md	Wed Aug 03 13:24:40 2022 +0100
@@ -11,9 +11,11 @@
 The contributors lists are in no order and apologies to those accidentally not
 mentioned. If we missed you, please let us know!
 
-CHANGES  3.6.12  20/07/2022
+CHANGES  3.6.12  03/08/2022
 ---------------------------
 	* fix dpi handling in renderPM.py; bug found by Terry Zhao Terry dot Zhao at fil dot com
+	* attempt fix in rparsexml.py
+	* add rl_settings.xmlParser with default 'lxml'
 
 CHANGES  3.6.11  24/06/2022
 ---------------------------
--- a/src/reportlab/__init__.py	Thu Jul 21 09:29:56 2022 +0100
+++ b/src/reportlab/__init__.py	Wed Aug 03 13:24:40 2022 +0100
@@ -3,7 +3,7 @@
 __doc__="""The Reportlab PDF generation library."""
 Version = "3.6.12"
 __version__=Version
-__date__='20220720'
+__date__='20220803'
 
 import sys, os
 
--- a/src/reportlab/lib/rparsexml.py	Thu Jul 21 09:29:56 2022 +0100
+++ b/src/reportlab/lib/rparsexml.py	Wed Aug 03 13:24:40 2022 +0100
@@ -77,6 +77,20 @@
 except ImportError:
     simpleparse = 1
 
+class smartDecode:
+    """Callable that returns text: str is passed through, bytes are decoded.
+
+    The encoding of bytes input is guessed with chardet.detect.
+    """
+    @staticmethod
+    def __call__(s):
+        # str needs no decoding, so chardet is only imported for bytes input
+        if isinstance(s,str): return s
+        import chardet
+        # chardet reports encoding None when it cannot guess; fall back to utf8
+        return s.decode(chardet.detect(s)["encoding"] or 'utf8')
+smartDecode = smartDecode()
+
 NONAME = ""
 NAMEKEY = 0
 CONTENTSKEY = 1
@@ -146,6 +160,7 @@
     """simple recursive descent xml parser...
        return (dictionary, endcharacter)
        special case: comment returns (None, endcharacter)"""
+    xmltext = smartDecode(xmltext)
     #print "parsexml0", repr(xmltext[startingat: startingat+10])
     # DEFAULTS
     NameString = NONAME
@@ -370,8 +385,7 @@
 
 def pprettyprint(parsedxml):
     """pretty printer mainly for testing"""
-    st = bytes
-    if type(parsedxml) is st:
+    if isinstance(parsedxml,(str,bytes)):
         return parsedxml
     (name, attdict, textlist, extra) = parsedxml
     if not attdict: attdict={}
@@ -395,12 +409,12 @@
     # otherwise must be a simple tag
     return "<%s %s/>" % (name, attributes)
 
-dump = 0
-def testparse(s):
+def testparse(s,dump=0):
     from time import time
     from pprint import pprint
     now = time()
-    D = parsexmlSimple(s)
+    breakpoint()
+    D = parsexmlSimple(s,oneOutermostTag=1)
     print("DONE", time()-now)
     if dump&4:
         pprint(D)
@@ -410,26 +424,29 @@
         p = pprettyprint(D)
         print(p)
 
-def test():
+def test(dump=0):    #runs testparse on the inline sample below; dump is passed through unchanged
     testparse("""<this type="xml">text &lt;&gt;<b>in</b> <funnytag foo="bar"/> xml</this>
                  <!-- comment -->
                  <![CDATA[
                  <this type="xml">text <b>in</b> xml</this> ]]>
                  <tag with="<brackets in values>">just testing brackets feature</tag>
-                 """)
+                 """,dump=dump)
 
-filenames = [ #"../../reportlab/demos/pythonpoint/pythonpoint.xml",
-              "samples/hamlet.xml"]
-
-#filenames = ["moa.xml"]
-
-dump=1
 if __name__=="__main__":
-    test()
+    test(dump=1)
+    import sys, os
     from time import time
+    import reportlab
     now = time()
-    for f in filenames:
-        t = open(f).read()
-        print("parsing", f)
-        testparse(t)
-    print("elapsed", time()-now)
+    seen = 0    #number of files actually parsed; gates the timing line below
+    for f in sys.argv[1:]:    #every command line argument is treated as a file path
+        if not os.path.isfile(f):
+            print(f"!!!!! no file at {f!r}")    #f-string so the offending path is shown
+        else:
+            with open(f) as _f:
+                t = _f.read()
+            print(f"parsing {f!r} |t|={len(t)}")
+            testparse(t,dump=1)
+            seen += 1
+    if seen:
+        print(f"timed at {time()-now:.2f} secs.")
--- a/src/reportlab/platypus/paraparser.py	Thu Jul 21 09:29:56 2022 +0100
+++ b/src/reportlab/platypus/paraparser.py	Wed Aug 03 13:24:40 2022 +0100
@@ -3163,7 +3163,7 @@
         return style, fragList, bFragList
 
     def _tt_handle(self,tt):
-        "Iterate through a pre-parsed tuple tree (e.g. from pyRXP)"
+        "Iterate through a pre-parsed tuple tree (e.g. from pyrxp)"
         #import pprint
         #pprint.pprint(tt)
         #find the corresponding start_tagname and end_tagname methods.
--- a/src/reportlab/rl_settings.py	Thu Jul 21 09:29:56 2022 +0100
+++ b/src/reportlab/rl_settings.py	Wed Aug 03 13:24:40 2022 +0100
@@ -67,7 +67,8 @@
 encryptionStrength
 trustedHosts
 trustedSchemes
-renderPMBackend'''.split())
+renderPMBackend
+xmlParser'''.split())
 
 allowTableBoundsErrors =    1 # set to 0 to die on too large elements in tables in debug (recommend 1 for production use)
 shapeChecking =             1
@@ -157,6 +158,7 @@
 trustedSchemes=['file', 'rml', 'data', 'https',     #these url schemes are trusted
                 'http', 'ftp']
 renderPMBackend='_renderPM'                         #or 'rlPyCairo' if available
+xmlParser='lxml'                                    #or 'pyrxp' for preferred xml parsing
 
 # places to look for T1Font information
 T1SearchPath =  (