first attempt at HTMLParser-based paraparser py33
author Andy Robinson <andy@reportlab.com>
Mon, 30 Dec 2013 14:30:06 +0000
branch py33
changeset 3954 44dbe56eb858
parent 3953 cc84c8da36d7
child 3955 3dbd805c708c
first attempt at HTMLParser-based paraparser
src/reportlab/platypus/paraparser.py
tests/test_platypus_paraparser.py
--- a/src/reportlab/platypus/paraparser.py	Thu Dec 26 13:18:35 2013 +0000
+++ b/src/reportlab/platypus/paraparser.py	Mon Dec 30 14:30:06 2013 +0000
@@ -9,12 +9,18 @@
 import os
 import copy
 import base64
+from pprint import pprint as pp
+
 try:
     import pickle as pickle
 except:
     import pickle
 import unicodedata
 import reportlab.lib.sequencer
+
+from html.parser import HTMLParser
+from html.entities import name2codepoint
+
 from reportlab.lib.abag import ABag
 from reportlab.lib.utils import ImageReader, isPy3, annotateException, encode_label, asUnicode
 from reportlab.lib.colors import toColor, white, black, red, Color
@@ -578,7 +584,7 @@
 #
 # It will also be able to handle any MathML specified Greek characters.
 #------------------------------------------------------------------
-class ParaParser:
+class ParaParser(HTMLParser):
 
     #----------------------------------------------------------
     # First we will define all of the xml tag handler functions.
@@ -733,7 +739,7 @@
         except ValueError:
             self.unknown_charref(name)
             return
-        self.handle_data(chr(n).encode('utf8'))
+        self.handle_data(chr(n))   #.encode('utf8'))
 
     def syntax_error(self,lineno,message):
         self._syntax_error(message)
@@ -798,15 +804,18 @@
     end_span = end_font
 
     def start_br(self, attr):
-        #just do the trick to make sure there is no content
         self._push(_selfClosingTag='br',lineBreak=True,text='')
-
+        
     def end_br(self):
+        #print('\nend_br called, %d frags in list' % len(self.fragList))
         frag = self._stack[-1]
         assert frag._selfClosingTag=='br' and frag.lineBreak,'Parser failure in <br/>'
         del frag._selfClosingTag
+        self._handled_text = False
         self.handle_data('')
         self._pop()
+        
+
 
     def _initial_frag(self,attr,attrMap,bullet=0):
         style = self._style
@@ -1014,11 +1023,13 @@
     #----------------------------------------------------------------
 
     def __init__(self,verbose=0, caseSensitive=0, ignoreUnknownTags=1):
+        HTMLParser.__init__(self)
         self.verbose = verbose
         self.caseSensitive = caseSensitive
         self.ignoreUnknownTags = ignoreUnknownTags
 
     def _iReset(self):
+        self._handled_text = False
         self.fragList = []
         if hasattr(self, 'bFragList'): delattr(self,'bFragList')
 
@@ -1034,6 +1045,19 @@
     def handle_data(self,data):
         "Creates an intermediate representation of string segments."
 
+        #The old parser would only 'see' a string after all entities had
+        #been processed.  Thus, 'Hello &trade; World' would emerge as one
+        #fragment.    HTMLParser processes these separately.  We want to ensure
+        #that successive calls like this are concatenated, to prevent too many
+        #fragments being created.
+
+        #print("\n called handle_data('%s')" % data)
+        #print('handle_data("%s")' % data)
+        if self._handled_text:
+            #print('handle_more_data("%s")' % data)
+            self.handle_more_data(data)
+            return
+
         frag = copy.copy(self._stack[-1])
         if hasattr(frag,'cbDefn'):
             kind = frag.cbDefn.kind
@@ -1070,6 +1094,12 @@
         else:
             self.fragList.append(frag)
 
+        #Set this if we just processed some text, but not if it was a br tag.
+        #Ugly, but seems necessary to get pyRXP and HTMLParser working the
+        #same way.
+        if not hasattr(frag, 'lineBreak'):
+            self._handled_text = True
+
     def handle_cdata(self,data):
         self.handle_data(data)
 
@@ -1077,7 +1107,9 @@
         self._seq = reportlab.lib.sequencer.getSequencer()
         self._reset(style)  # reinitialise the parser
 
-    def parse(self, text, style):
+
+
+    def old_parse(self, text, style):
         """Given a formatted string will return a list of
         ParaFrag objects with their calculated widths.
         If errors occur None will be returned and the
@@ -1089,12 +1121,16 @@
             text = u"<para>"+text+u"</para>"
         try:
             tt = makeParser(caseInsensitive=not self.caseSensitive)(text)
+
+            #from pprint import pprint
+            #pprint(tt)
         except:
             annotateException('paragraph text %s caused exception' % ascii(text))
         self._tt_start(tt)
         return self._complete_parse()
 
     def _complete_parse(self):
+        "Reset after parsing, to be ready for next paragraph"
         del self._seq
         style = self._style
         del self._style
@@ -1108,6 +1144,11 @@
         return style, fragList, bFragList
 
     def _tt_handle(self,tt):
+        "Iterate through a pre-parsed tuple tree (e.g. from pyRXP)"
+        #import pprint
+        #pprint.pprint(tt)
+        #find the corresponding start_tagname and end_tagname methods.
+        #These must be defined.
         tag = tt[0]
         try:
             start = getattr(self,'start_'+tag)
@@ -1117,13 +1158,21 @@
                 raise ValueError('Invalid tag "%s"' % tag)
             start = self.start_unknown
             end = self.end_unknown
+
+        #call the start_tagname method
         start(tt[1] or {})
+        self._handled_text = False
+        #if tree node has any children, they will either be further nodes,
+        #or text.  Accordingly, call either this function, or handle_data.
         C = tt[2]
         if C:
             M = self._tt_handlers
             for c in C:
                 M[isinstance(c,(list,tuple))](c)
+
+        #call the end_tagname method
         end()
+        self._handled_text = False
 
     def _tt_start(self,tt):
         self._tt_handlers = self.handle_data,self._tt_handle
@@ -1138,6 +1187,93 @@
     def findSpanStyle(self,style):
         raise ValueError('findSpanStyle not implemented in this parser')
 
+
+
+    #New methods to support HTML parser
+    def new_parse(self, text, style):
+        "attempt replacement for parse"
+        self._setup_for_parse(style)
+        text = asUnicode(text)
+        if not(len(text)>=6 and text[0]=='<' and _re_para.match(text)):
+            text = u"<para>"+text+u"</para>"
+        self.feed(text)
+        return self._complete_parse()
+
+    def handle_starttag(self, tag, attrs):
+        "Called by HTMLParser when a tag starts"
+
+        #tuple tree parser used to expect a dict.  HTML parser
+        #gives list of two-element tuples
+        if isinstance(attrs, list):
+            d = {}
+            for (k,  v) in attrs:
+                d[k] = v
+            attrs = d
+
+        if tag not in ['br']:
+            self._handled_text = False
+        try:
+            start = getattr(self,'start_'+tag)
+        except AttributeError:
+            if not self.ignoreUnknownTags:
+                raise ValueError('Invalid tag "%s"' % tag)
+            start = self.start_unknown
+        #call it
+        start(attrs or {})
+        
+    def handle_endtag(self, tag):
+        "Called by HTMLParser when a tag ends"
+        if tag not in ['br']:
+            self._handled_text = False
+        #find the existing end_tagname method
+        try:
+            end = getattr(self,'end_'+tag)
+        except AttributeError:
+            if not self.ignoreUnknownTags:
+                raise ValueError('Invalid tag "%s"' % tag)
+            end = self.end_unknown
+        #call it
+        end()
+
+    
+
+    def handle_entityref(self, name):
+        "Handles a named entity.  "
+        #print('handle_entityref called for "%s"' % name)
+        #The old parser saw these automatically resolved, so
+        #just tack it onto the current fragment.
+        resolved = chr(name2codepoint[name])
+        if self._handled_text:
+            self.fragList[-1].text += resolved
+        else:
+            self.handle_data(resolved)
+        
+    def handle_more_data(self, data):
+        """We call this when we get successive text chunks
+
+        This is to ensure that successive strings with no
+        formatting changes are concatenated.
+        """
+        frag = self._stack[-1]
+        if hasattr(frag,'isBullet'):
+            last = self.bFragList[-1]
+        else:
+            last = self.fragList[-1]
+        last.text += data
+
+
+
+
+        #frag.text += resolved
+        
+    def parse(self, text, style):
+        if os.environ.get('HTMLPARSE', '0') == '1':
+            return self.new_parse(text, style)
+        else:
+            return self.old_parse(text, style)
+
+
+
 if __name__=='__main__':
     from reportlab.platypus import cleanBlockQuotedText
     from reportlab.lib.styles import _baseFontName
--- a/tests/test_platypus_paraparser.py	Thu Dec 26 13:18:35 2013 +0000
+++ b/tests/test_platypus_paraparser.py	Mon Dec 30 14:30:06 2013 +0000
@@ -104,20 +104,20 @@
         self.assertEquals([x.text for x in fragList], ['Hello ','',' World'])
         self.assertEquals(fragList[1].lineBreak, True)
 
-    def testNakedAmpersands(self):
-        import pyRXPU
-        from reportlab.platypus.paragraph import Paragraph
-        def func():
-            txt = "1 & 2"
-            parser = ParaParser()
-            parser.caseSensitive = True
-            frags = ParaParser().parse(txt, self.style)[1]
-            #print 'parsed OK, frags=', frags
-            Paragraph(txt, self.style),
-        self.assertRaises(
-                pyRXPU.error,
-                func,
-                )
+    #def testNakedAmpersands(self):
+        #We no longer require this error to be raised when using html.parser
+        # import pyRXPU
+        # from reportlab.platypus.paragraph import Paragraph
+        # def func():
+        #     txt = "1 & 2"
+        #     parser = ParaParser()
+        #     parser.caseSensitive = True
+        #     frags = ParaParser().parse(txt, self.style)[1]
+        #     Paragraph(txt, self.style),
+        # self.assertRaises(
+        #         pyRXPU.error,
+        #         func,
+        #         )
 
 def makeSuite():
     return makeSuiteForClasses(ParaParserTestCase)