initial hyphenation work; all tests pass, but hyphenation is not very well controlled. add function for long uris etc hyphenation
author robin
Sun, 24 Jun 2018 11:58:58 +0100
branch hyphenation
changeset 4410 bd848827483f
parent 4409 e1546608f841
child 4411 4c8ad7aa3cb7
initial hyphenation work; all tests pass, but hyphenation is not very well controlled. add function for long uris etc
src/reportlab/lib/styles.py
src/reportlab/lib/uri_split_pairs.py
src/reportlab/platypus/paragraph.py
src/reportlab/platypus/paraparser.py
src/reportlab/rl_settings.py
--- a/src/reportlab/lib/styles.py	Wed Jun 20 14:43:17 2018 +0100
+++ b/src/reportlab/lib/styles.py	Sun Jun 24 11:58:58 2018 +0100
@@ -32,7 +32,8 @@
                                 strikeWidth as _baseStrikeWidth, \
                                 strikeOffset as _baseStrikeOffset, \
                                 strikeGap as _baseStrikeGap, \
-                                spaceShrinkage, platypus_link_underline
+                                spaceShrinkage, platypus_link_underline, \
+                                hyphenationLang
 _baseFontNameB = tt2ps(_baseFontName,1,0)
 _baseFontNameI = tt2ps(_baseFontName,0,1)
 _baseFontNameBI = tt2ps(_baseFontName,1,1)
@@ -148,6 +149,7 @@
         'linkUnderline': platypus_link_underline,
         #'underlineColor':  None,
         #'strikeColor': None,
+        'hyphenationLang': hyphenationLang,
         }
 
 class LineStyle(PropertySet):
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/reportlab/lib/uri_split_pairs.py	Sun Jun 24 11:58:58 2018 +0100
@@ -0,0 +1,55 @@
+import re
+from reportlab.lib.utils import isBytes
+#uri schemes from https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
+uri_scheme_pattern=re.compile(u'^(?:'+(u'|'.join(u'''aaas about acap acct acr adiumxtra afp afs aim appdata apt
+attachment aw barion beshare bitcoin blob bolo browserext callto cap chrome chrome-extension cid coap coap+tcp
+coap+ws coaps coaps+tcp coaps+ws com-eventbrite-attendee content conti crid cvs data dav diaspora dict did dis
+dlna-playcontainer dlna-playsingle dns dntp dtn dvb ed2k elsi example facetime fax feed feedready file filesystem
+finger fish ftp geo gg git gizmoproject go gopher graph gtalk h323 ham hcp http https hxxp hxxps hydrazone iax
+icap icon im imap info iotdisco ipn ipp ipps irc irc6 ircs iris iris.beep iris.lwz iris.xpc iris.xpcs isostore
+itms jabber jar jms keyparc lastfm ldap ldaps lvlt magnet mailserver mailto maps market message
+microsoft.windows.camera microsoft.windows.camera.multipicker microsoft.windows.camera.picker mid mms modem
+mongodb moz ms-access ms-browser-extension ms-drive-to ms-enrollment ms-excel ms-gamebarservices ms-gamingoverlay
+ms-getoffice ms-help ms-infopath ms-inputapp ms-lockscreencomponent-config ms-media-stream-id
+ms-mixedrealitycapture ms-officeapp ms-people ms-project ms-powerpoint ms-publisher ms-restoretabcompanion
+ms-search-repair ms-secondary-screen-controller ms-secondary-screen-setup ms-settings ms-settings-airplanemode
+ms-settings-bluetooth ms-settings-camera ms-settings-cellular ms-settings-cloudstorage ms-settings-connectabledevices
+ms-settings-displays-topology ms-settings-emailandaccounts ms-settings-language ms-settings-location ms-settings-lock
+ms-settings-nfctransactions ms-settings-notifications ms-settings-power ms-settings-privacy ms-settings-proximity
+ms-settings-screenrotation ms-settings-wifi ms-settings-workplace ms-spd ms-sttoverlay ms-transit-to
+ms-useractivityset ms-virtualtouchpad ms-visio ms-walk-to ms-whiteboard ms-whiteboard-cmd ms-word msnim msrp
+msrps mtqp mumble mupdate mvn news nfs ni nih nntp notes ocf oid onenote onenote-cmd opaquelocktoken openpgp4fpr
+pack palm paparazzi pkcs11 platform pop pres prospero proxy pwid psyc qb query redis rediss reload res resource
+rmi rsync rtmfp rtmp rtsp rtsps rtspu secondlife service session sftp sgn shttp sieve sip sips skype smb sms
+smtp snews snmp soap.beep soap.beeps soldat spiffe spotify ssh steam stun stuns submit svn tag teamspeak
+tel teliaeid telnet tftp things thismessage tip tn3270 tool turn turns tv udp unreal urn ut2004 v-event
+vemmi ventrilo videotex vnc view-source wais webcal wpid ws wss wtai wyciwyg xcon xcon-userid xfire xmlrpc.beep
+xmlrpc.beeps xmpp xri ymsgr z39.50 z39.50r z39.50s'''.replace('.','\\.').replace('+','\\+').split()))+u'):',re.I)
+
+def _slash_parts(uri,scheme,slash):    #generator of (head,tail) pairs, one for each slash found in uri
+    tail = u''                          #trailing slashes taken off uri; re-attached to every tail yielded
+    while uri.endswith(slash):
+        tail += slash
+        uri = uri[:-1]
+
+    i = 2                               #searching starts at index 2 so a slash at index 0 or 1 is never a split point
+    while True:
+        i = uri.find(slash,i)
+        if i<0: break
+        i += 1                          #step past the slash so that it ends the head
+        yield scheme+uri[:i],uri[i:]+tail
+
+def uri_split_pairs(uri):               #generator of (head,tail) string pairs splitting uri at slashes
+    if isBytes(uri): uri = uri.decode('utf8')
+    scheme = uri_scheme_pattern.match(uri)
+    scheme = scheme.group(0) if scheme else ''  #the matched 'name:' prefix or '' when the pattern does not match
+    uri = uri[len(scheme):] #strip off scheme
+    while uri.startswith('/'):           #leading slashes are moved from uri onto the prefix
+        scheme += u'/'
+        uri = uri[1:]
+    if scheme and uri:                   #first candidate: split directly after the prefix
+        yield scheme, uri
+    slash = (u'\\' if not scheme and u'/' not in uri #might be a microsoft pattern
+            else u'/')
+    for p in _slash_parts(uri,scheme,slash):
+        yield p
--- a/src/reportlab/platypus/paragraph.py	Wed Jun 20 14:43:17 2018 +0100
+++ b/src/reportlab/platypus/paragraph.py	Sun Jun 24 11:58:58 2018 +0100
@@ -22,6 +22,10 @@
 from reportlab import xrange
 import re
 from types import MethodType
+try:
+    import pyphen
+except:
+    pyphen = None
 
 #on UTF8/py33 branch, split and strip must be unicode-safe!
 #thanks to Dirk Holtwick for helpful discussions/insight
@@ -488,16 +492,19 @@
     ws = _wsc_end_search(text)
     return tx._canvas.stringWidth(ws.group(), tx._fontname, tx._fontsize) if ws else 0
 
-class _HSWord(list):
+class _HSFrag(list):
+    pass
+
+class _SplitFrag(list):
+    '''a split frag'''
     pass
 
-class _SplitList(list):
+class _SplitFragHY(_SplitFrag):
+    '''a split frag that needs its trailing '-' removing before rejoining'''
     pass
 
-class _SplitListLast(_SplitList):
-    pass
-
-class _HSSplitList(_HSWord):
+class _SplitFragHS(_SplitFrag,_HSFrag):
+    """a split frag that's followed by a space"""
     pass
 
 def _processed_frags(frags):
@@ -525,31 +532,42 @@
                 w = w.normalizedValue(maxWidth)
                 f[0] = w
     R = []
-    R_append = R.append
+    aR = R.append
     if _processed_frags(frags):
-        i = 0
-        n = len(frags)
-        while i<n:
-            f = frags[i]
+        W = []
+        aW = W.append
+        for f in frags:
             _rescaleFrag(f)
-            if isinstance(f,(_SplitList,_HSSplitList,_SplitListLast)):
-                #we need to re-join these to make a single word
-                W = [0]
-                while True:
-                    W[0] += f[0]
-                    W.extend(f[1:])
-                    if isinstance(f,(_SplitListLast,_HSSplitList)):
-                        break
-                    i += 1
-                    if i==n: break  #exceptional case when a split paragraph split in the middle of such a sequence
-                    f = frags[i]
-                    _rescaleFrag(f)
-                if isinstance(f,_HSSplitList):
-                    f = _HSWord(W)
+            if isinstance(f,_SplitFrag):
+                f0 = f[0]
+                if not W:
+                    W0t = type(f0)
+                    Wlen = 0
+                    sty = None
                 else:
-                    f = W
-            R_append(f)
-            i += 1
+                    if isinstance(lf,_SplitFragHY):
+                        sty, t = W[-1]
+                        Wlen -= stringWidth(t[-1],sty.fontName,sty.fontSize) + 1e-8
+                        W[-1] = (sty,t[:-1]) #strip the '-'
+                Wlen += f0
+                for ts,t in f[1:]:
+                    if ts is sty:
+                        W[-1] = (sty,W[-1][1]+t)
+                    else:
+                        aW((ts,t))
+                        sty = ts
+                #W.extend(f[1:])
+                lf = f          #latest f in W
+                continue
+            else:
+                if W:
+                    #must end a joining
+                    aR((_HSFrag if isinstance(lf,_HSFrag) else list)([W0t(Wlen)]+W))
+                    del W[:]
+                aR(f)
+        if W:
+            #must end a joining
+            aR((_HSFrag if isinstance(lf,_HSFrag) else list)([W0t(Wlen)]+W))
     else:
         W = []
         hangingSpace = False
@@ -567,13 +585,13 @@
                 if text[0] in whitespace:
                     if W:
                         W.insert(0,n)   #end preceding word
-                        R_append(W)
+                        aR(W)
                         whs = hangingSpace
                         W = []
                         hangingSpace = False
                         n = 0
                     else:
-                        whs = R and isinstance(R[-1],_HSWord)
+                        whs = R and isinstance(R[-1],_HSFrag)
                     if not whs:
                         S.insert(0,'')
                     elif not S:
@@ -583,7 +601,7 @@
                     W.append((f,w))
                     n += stringWidth(w, f.fontName, f.fontSize)
                     W.insert(0,n)
-                    R_append(_HSWord(W))
+                    aR(_HSFrag(W))
                     W = []
                     n = 0
 
@@ -593,7 +611,7 @@
                 n += stringWidth(w, f.fontName, f.fontSize)
                 if text and text[-1] in whitespace:
                     W.insert(0,n)
-                    R_append(_HSWord(W))
+                    aR(_HSFrag(W))
                     W = []
                     n = 0
             elif hasattr(f,'cbDefn'):
@@ -605,16 +623,16 @@
                         w = w.normalizedValue(maxWidth)
                     if W:
                         W.insert(0,n)
-                        R_append(_HSWord(W) if hangingSpace else W)
+                        aR(_HSFrag(W) if hangingSpace else W)
                         W = []
                         hangingSpace = False
                         n = 0
                     f._fkind = _FK_IMG
-                    R_append([w,(f,'')])
+                    aR([w,(f,'')])
                     hangingStrip = False
                 else:
                     f._fkind = _FK_APPEND
-                    if not W and R and isinstance(R[-1],_HSWord):
+                    if not W and R and isinstance(R[-1],_HSFrag):
                         R[-1].append((f,''))
                     else:
                         W.append((f,''))
@@ -622,17 +640,17 @@
                 #pass the frag through.  The line breaker will scan for it.
                 if W:
                     W.insert(0,n)
-                    R_append(W)
+                    aR(W)
                     W = []
                     n = 0
                     hangingSpace = False
                 f._fkind = _FK_BREAK
-                R_append([0,(f,'')])
+                aR([0,(f,'')])
                 hangingStrip = True
 
         if W:
             W.insert(0,n)
-            R_append(W)
+            aR(W)
 
     return R
 
@@ -657,8 +675,8 @@
     maxWidths[lineno+n]
 
     return the new word list which is either 
-    _SplitList....._SPlitList or
-    _splitList....._HSSplitList if the word is hanging space.
+    _SplitFrag....._SplitFrag or
+    _SplitFrag....._SplitFragHS if the word is hanging space.
     '''
     R = []
     maxlineno = len(maxWidths)-1
@@ -676,7 +694,7 @@
                 f.text = fragText
             W.append((f,fragText))
             if tooLong:
-                W = _SplitList([wordWidth]+W)
+                W = _SplitFrag([wordWidth]+W)
                 R.append(W)
                 lineno += 1
                 maxWidth = maxWidths[min(maxlineno,lineno)]
@@ -689,14 +707,62 @@
         fragText += c
         lineWidth = newLineWidth
     W.append((f,fragText))
-    W = _HSSplitList([wordWidth]+W) if isinstance(w,_HSWord) else _SplitListLast([wordWidth]+W)
+    W = (_SplitFragHS if isinstance(w,_HSFrag) else _SplitFrag)([wordWidth]+W)
 
     R.append(W)
     return R
 
-class _SplitText(unicodeT):
+def _hyphenateFragWord(hyphenator,w,newWidth,maxWidth):   #try to hyphenate the frag word w so that its head fits the line
+    ww = w[0]                                               #w is [width,(frag,text),...]
+    if ww==0 or len(w)>2 or w[1][0].rise!=0: return []     #only non-empty, single frag, unrisen words are attempted
+    f,s = w[1]
+    if isBytes(s): s = s.decode('utf8') #only encoding allowed
+    fn = f.fontName
+    fs = f.fontSize
+    hylen = stringWidth(u'-',fn,fs)
+    R = []
+    w0 = newWidth - ww + hylen                              #width used apart from this word, plus the '-' to be added
+    for h,t in hyphenator(s):                               #hyphenator yields (head,tail) candidate pairs
+        hw = stringWidth(h,fn,fs)
+        tw = hw+w0
+        if tw<=maxWidth:                                    #head and '-' fit
+            R.append((hw,ww-hw,h,t))                        #(head width, remaining width, head, tail)
+    if not R: return []                                     #nothing fits; was an implicit None, now [] like the early exit
+    if len(R)>1:
+        R = [(abs(r[0]-r[1]),r) for r in R]                 #rank by how evenly the split divides the width
+        R.sort()
+        hw, tw, h, t = R[0][1]
+    else:
+        hw, tw, h, t = R[0]
+    return [_SplitFragHY([hw+hylen,(f,h+u'-')]),(_SplitFragHS if isinstance(w,_HSFrag) else _SplitFrag)([tw,(f,t)])]
+
+class _SplitWord(unicodeT):
     pass
 
+class _SplitWordHY(_SplitWord):
+    '''head part of a hyphenation word pair'''
+    pass
+
+def _hyphenateWord(hyphenator,fontName,fontSize,w,ww,newWidth,maxWidth):  #try to hyphenate the plain word w so its head fits
+    if ww==0: return []                                     #zero width word, nothing to do
+    s = w.decode('utf8') if isBytes(w) else w               #was s.decode(...): s was read before assignment for bytes input
+    hylen = stringWidth('-',fontName,fontSize)
+    R = []
+    w0 = newWidth - ww + hylen                              #width used apart from this word, plus the '-' to be added
+    for h,t in hyphenator(s):                               #hyphenator yields (head,tail) candidate pairs
+        hw = stringWidth(h,fontName,fontSize)
+        tw = hw+w0
+        if tw<=maxWidth:                                    #head and '-' fit
+            R.append((hw,ww-hw,h,t))                        #(head width, remaining width, head, tail)
+    if not R: return []                                     #nothing fits; was an implicit None, now [] like the early exit
+    if len(R)>1:
+        R = [(abs(r[0]-r[1]),r) for r in R]                 #rank by how evenly the split divides the width
+        R.sort()
+        hw, tw, h, t = R[0][1]
+    else:
+        hw, tw, h, t = R[0]
+    return [_SplitWordHY(h+u'-'),_SplitWord(t)]
+
 def _splitWord(w,maxWidth,maxWidths,lineno,fontName,fontSize,encoding='utf8'):
     '''
     split w into words that fit in lines of length
@@ -719,26 +785,41 @@
         cw = stringWidth(c,fontName,fontSize,encoding)
         newLineWidth = lineWidth+cw
         if newLineWidth>maxWidth:
-            aR(_SplitText(wordText))
+            aR(_SplitWord(wordText))
             lineno += 1
             maxWidth = maxWidths[min(maxlineno,lineno)]
             newLineWidth = cw
             wordText = u''
         wordText += c
         lineWidth = newLineWidth
-    aR(_SplitText(wordText))
+    aR(_SplitWord(wordText))
     if not R[0]: del R[0]   #delete in special case
     return R
 
+def _yieldBLParaWords(blPara,start,stop):  #yield the words of blPara.lines[start:stop], rejoining split word pieces
+    #R collects consecutive _SplitWord pieces; NOTE(review): adjacent split words would be merged into one - confirm
+    R = []
+    aR = R.append
+    for l in blPara.lines[start:stop]:
+        for w in l[1]:
+            if isinstance(w,_SplitWord):
+                if R and isinstance(R[-1],_SplitWordHY):
+                    R[-1] = R[-1][:-1]  #remove unwanted -
+                aR(w)
+                continue
+            else:
+                if R:
+                    yield ''.join(R)
+                    del R[:]
+            yield w
+    if R:
+        yield ''.join(R)
+
 def _split_blParaSimple(blPara,start,stop):
     f = blPara.clone()
     for a in ('lines', 'kind', 'text'):
         if hasattr(f,a): delattr(f,a)
-
-    f.words = []
-    for l in blPara.lines[start:stop]:
-        for w in l[1]:
-            f.words.append(w)
+    f.words = list(_yieldBLParaWords(blPara,start,stop))
     return [f]
 
 def _split_blParaHard(blPara,start,stop):
@@ -1401,6 +1482,7 @@
             if hasattr(self,a):
                 setattr(P1,a,getattr(self,a))
                 setattr(P2,a,getattr(self,a))
+
         return [P1,P2]
 
     def draw(self):
@@ -1448,9 +1530,21 @@
         self.height = lineno = 0
         maxlineno = len(maxWidths)-1
         style = self.style
+        hyphenator = getattr(self,'hyphenator',getattr(style,'hyphenationLang','').strip())
+        if hyphenator:
+            if isStr(hyphenator):
+                if pyphen:
+                    hyphenator = lambda s, i=pyphen.Pyphen(lang=hyphenator).iterate: i(s)
+                else:
+                    hyphenator = None
+            elif not callable(hyphenator):
+                raise ValueError('hyphenator should be a language spec or a callable unicode -->  pairs not %r' % hyphenator) 
+                
+        else:
+            hyphenator = None
         spaceShrinkage = style.spaceShrinkage
         splitLongWords = style.splitLongWords
-        self._splitLongWordCount = 0
+        self._splitLongWordCount = self._hyphenations = 0
 
         #for bullets, work out width and ensure we wrap the right amount onto line one
         _handleBulletWidth(self.bulletText,style,maxWidths)
@@ -1491,8 +1585,14 @@
                 wordWidth = stringWidth(word, fontName, fontSize, self.encoding)
                 newWidth = currentWidth + spaceWidth + wordWidth
                 if newWidth>maxWidth+spaceShrink:
+                    if hyphenator:
+                        hsw = _hyphenateWord(hyphenator, fontName, fontSize, word, wordWidth, newWidth, maxWidth+spaceShrink)
+                        if hsw:
+                            words[0:0] = hsw
+                            self._hyphenations += 1
+                            continue
                     nmw = min(lineno,maxlineno)
-                    if wordWidth>max(maxWidths[nmw:nmw+1]) and not isinstance(word,_SplitText) and splitLongWords:
+                    if wordWidth>max(maxWidths[nmw:nmw+1]) and not isinstance(word,_SplitWord) and splitLongWords:
                         #a long word
                         words[0:0] = _splitWord(word,maxWidth-spaceWidth-currentWidth,maxWidths,lineno,fontName,fontSize,self.encoding)
                         self._splitLongWordCount += 1
@@ -1551,7 +1651,14 @@
                 #test to see if this frag is a line break. If it is we will only act on it
                 #if the current width is non-negative or the previous thing was a deliberate lineBreak
                 lineBreak = f._fkind==_FK_BREAK
-                if not lineBreak and newWidth>(maxWidth+spaceShrink) and not isinstance(w,_SplitList) and splitLongWords:
+                if not lineBreak and newWidth>(maxWidth+spaceShrink) and not isinstance(w,_SplitFrag) and splitLongWords:
+                    if hyphenator and not isinstance(w,_SplitFragHY):
+                        hsw = _hyphenateFragWord(hyphenator,w,newWidth,maxWidth+spaceShrink)
+                        if hsw:
+                            _words[0:0] = hsw
+                            FW.pop(-1)  #remove this as we are doing this one again
+                            self._hyphenations += 1
+                            continue
                     nmw = min(lineno,maxlineno)
                     if wordWidth>max(maxWidths[nmw:nmw+1]):
                         #a long word
@@ -1602,7 +1709,7 @@
                     else:
                         g.text += nText
 
-                    spaceWidth = stringWidth(' ',fontName,fontSize) if isinstance(w,_HSWord) else 0 #of the space following this word
+                    spaceWidth = stringWidth(' ',fontName,fontSize) if isinstance(w,_HSFrag) else 0 #of the space following this word
                     dSpaceShrink = spaceWidth*spaceShrinkage
 
                     ni = 0
@@ -1648,7 +1755,7 @@
                         words = []
                         continue
 
-                    spaceWidth = stringWidth(' ',fontName,fontSize) if isinstance(w,_HSWord) else 0 #of the space following this word
+                    spaceWidth = stringWidth(' ',fontName,fontSize) if isinstance(w,_HSFrag) else 0 #of the space following this word
                     dSpaceShrink = spaceWidth*spaceShrinkage
                     currentWidth = wordWidth
                     n = 1
@@ -1815,7 +1922,7 @@
                 else:
                     cur_y = self.height - getattr(f,'ascent',f.fontSize)
                 if bulletText:
-                    offset = _drawBullet(canvas,offset,cur_y,bulletText,style,rtl=style.wordWrap=='RTL' and self._wrapWidths or False)
+                    offset = _drawBullet(canvas,offset,cur_y,bulletText,style,rtl=style.wordWrap=='RTL' and self._wrapWidths or False)
 
                 #set up the font etc.
                 canvas.setFillColor(f.textColor)
@@ -1962,7 +2069,7 @@
                 for word in frags:
                     for style,text in word[1:]:
                         plains_append(text)
-                    if isinstance(word,_HSWord):
+                    if isinstance(word,_HSFrag):
                         plains_append(' ')
             else:
                 for frag in frags:
@@ -1988,6 +2095,23 @@
             func = lambda frag, w=self.width: w - frag[0]
         return list(map(func,self.blPara.lines))
 
+    @staticmethod
+    def dumpFrags(frags,indent=4,full=False):   #debug aid: return a text dump of frags, each line prefixed by indent spaces
+        R = ['[']                               #NOTE(review): no closing ']' for this opening bracket is ever appended - confirm
+        aR = R.append
+        for i,f in enumerate(frags):
+            if full:                            #one output line per item
+                aR('    [%r,' % f[0])
+                for fx in f[1:]:
+                    aR('        (%s,)' % repr(fx[0]))       #NOTE(review): '(%s,)' closes the tuple which the next line closes again - confirm
+                    aR('        %r),' % fx[1])
+                    aR('    ], #%d %s' % (i,f.__class__.__name__))  #NOTE(review): emitted once per pair & followed by another ']' below - confirm
+                aR('    ]')
+            else:                               #compact form: one output line per frag word, showing only the class name of fx[0]
+                aR('[%r, %s], #%d %s' % (f[0],', '.join(('(%s,%r)' % (fx[0].__class__.__name__,fx[1]) for fx in f[1:])),i,f.__class__.__name__))
+        i = indent*' '                          #i is reused here as the indent string
+        return i + ('\n'+i).join(R)
+
 if __name__=='__main__':    #NORUNTESTS
     def dumpParagraphLines(P):
         print('dumpParagraphLines(<Paragraph @ %d>)' % id(P))
--- a/src/reportlab/platypus/paraparser.py	Wed Jun 20 14:43:17 2018 +0100
+++ b/src/reportlab/platypus/paraparser.py	Sun Jun 24 11:58:58 2018 +0100
@@ -97,6 +97,15 @@
         r._normalizer = normalizer
         return r
 
+    def __copy__(self):                 #return a new _PCT carrying the same _value and _normalizer
+        r = _PCT(float(self))
+        r._value = self._value
+        r._normalizer = self._normalizer   #was the bare name normalizer, which is neither a local nor a parameter here
+        return r
+
+    def __deepcopy__(self,mem):         #mem is not used; a deep copy is made exactly as a shallow one
+        return self.__copy__()
+
 def fontSizeNormalize(frag,attr,default):
     if not hasattr(frag,attr): return default
     v = _numpct(getattr(frag,attr),allowRelative=True)
--- a/src/reportlab/rl_settings.py	Wed Jun 20 14:43:17 2018 +0100
+++ b/src/reportlab/rl_settings.py	Sun Jun 24 11:58:58 2018 +0100
@@ -58,7 +58,8 @@
 underlineGap
 strikeWidth
 strikeOffset
-strikeGap'''.split())
+strikeGap
+hyphenationLang'''.split())
 
 allowTableBoundsErrors =    1 # set to 0 to die on too large elements in tables in debug (recommend 1 for production use)
 shapeChecking =             1
@@ -135,7 +136,8 @@
                                                     #ReportLab takes no responsibility for the use of this setting.
 
 spaceShrinkage=0.05                                 #allowable space shrinkage to make lines fit
-
+hyphenationLang=''                                  #if pyphen installed set this to the language of your choice
+                                                    #eg 'en_GB'
 
 # places to look for T1Font information
 T1SearchPath =  (