src/reportlab/lib/textsplit.py
author robin <robin@reportlab.com>
Tue, 07 Mar 2017 10:00:34 +0000
changeset 4330 617ffa6bbdc8
parent 4252 fe660f227cac
child 4370 823a8c33ce43
permissions -rw-r--r--
changes for release 3.4.0
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
4330
617ffa6bbdc8 changes for release 3.4.0
robin <robin@reportlab.com>
parents: 4252
diff changeset
     1
#Copyright ReportLab Europe Ltd. 2000-2017
2577
f442326a11e9 reportlab: add files from utf8 branch
rgbecker
parents:
diff changeset
     2
#see license.txt for license details
f442326a11e9 reportlab: add files from utf8 branch
rgbecker
parents:
diff changeset
     3
#history http://www.reportlab.co.uk/cgi-bin/viewcvs.cgi/public/reportlab/trunk/reportlab/lib/textsplit.py
f442326a11e9 reportlab: add files from utf8 branch
rgbecker
parents:
diff changeset
     4
f442326a11e9 reportlab: add files from utf8 branch
rgbecker
parents:
diff changeset
     5
"""Helpers for text wrapping, hyphenation, Asian text splitting and kinsoku shori.
f442326a11e9 reportlab: add files from utf8 branch
rgbecker
parents:
diff changeset
     6
f442326a11e9 reportlab: add files from utf8 branch
rgbecker
parents:
diff changeset
     7
How to split a 'big word' depends on the language and the writing system.  This module
f442326a11e9 reportlab: add files from utf8 branch
rgbecker
parents:
diff changeset
     8
works on a Unicode string.  It ought to grow by allowing more algorithms to be plugged
f442326a11e9 reportlab: add files from utf8 branch
rgbecker
parents:
diff changeset
     9
in based on possible knowledge of the language and desirable 'niceness' of the algorithm.
f442326a11e9 reportlab: add files from utf8 branch
rgbecker
parents:
diff changeset
    10
f442326a11e9 reportlab: add files from utf8 branch
rgbecker
parents:
diff changeset
    11
"""
f442326a11e9 reportlab: add files from utf8 branch
rgbecker
parents:
diff changeset
    12
4252
fe660f227cac changes for release 3.3.0
robin
parents: 4027
diff changeset
    13
__version__='3.3.0'
2577
f442326a11e9 reportlab: add files from utf8 branch
rgbecker
parents:
diff changeset
    14
3545
538ad211299b english-in-japanese splitting
andy
parents: 3492
diff changeset
    15
from unicodedata import category
2577
f442326a11e9 reportlab: add files from utf8 branch
rgbecker
parents:
diff changeset
    16
from reportlab.pdfbase.pdfmetrics import stringWidth
f442326a11e9 reportlab: add files from utf8 branch
rgbecker
parents:
diff changeset
    17
from reportlab.rl_config import _FUZZ
3731
b233dd0577ff another round of changes mostly type related
rptlab
parents: 3721
diff changeset
    18
from reportlab.lib.utils import isUnicode
2577
f442326a11e9 reportlab: add files from utf8 branch
rgbecker
parents:
diff changeset
    19
f442326a11e9 reportlab: add files from utf8 branch
rgbecker
parents:
diff changeset
    20
# Kinsoku (Japanese line-breaking) tables.  Each list is ordered from the
# most strongly prohibited group of characters to the weakest; every entry is
# a string whose characters form one priority group.
CANNOT_START_LINE = [
    #strongly prohibited e.g. end brackets, stop, exclamation...
    u'!\',.:;?!")]\u3001\u3002\u300d\u300f\u3011\u3015\uff3d\u3011\uff09',
    #middle priority e.g. continuation small vowels - wrapped on two lines but one string...
    u'\u3005\u2015\u3041\u3043\u3045\u3047\u3049\u3063\u3083\u3085\u3087\u308e\u30a1\u30a3'
    u'\u30a5\u30a7\u30a9\u30c3\u30e3\u30e5\u30e7\u30ee\u30fc\u30f5\u30f6',
    #weakly prohibited - continuations, celsius symbol etc.
    u'\u309b\u309c\u30fb\u30fd\u30fe\u309d\u309e\u2015\u2010\xb0\u2032\u2033\u2103\uffe0\uff05\u2030'
    ]

# Every character that may not start a line, regardless of priority group.
ALL_CANNOT_START = u''.join(CANNOT_START_LINE)
CANNOT_END_LINE = [
    #strongly prohibited
    u'\u2018\u201c\uff08[{\uff08\u3014\uff3b\uff5b\u3008\u300a\u300c\u300e\u3010',
    #weaker - currency symbols, hash, postcode - prefixes
    u'$\u00a3@#\uffe5\uff04\uffe1\uff20\u3012\u00a7'
    ]
# Every character that may not end a line, regardless of priority group.
ALL_CANNOT_END = u''.join(CANNOT_END_LINE)
3545
538ad211299b english-in-japanese splitting
andy
parents: 3492
diff changeset
    38
538ad211299b english-in-japanese splitting
andy
parents: 3492
diff changeset
    39
def is_multi_byte(ch):
    """Is this an Asian character?

    This is a crude but cheap test: a single character *ch* is treated as
    Asian when its code point is U+3000 or above.
    """
    return ord(ch) >= 0x3000
538ad211299b english-in-japanese splitting
andy
parents: 3492
diff changeset
    42
    
2577
f442326a11e9 reportlab: add files from utf8 branch
rgbecker
parents:
diff changeset
    43
def getCharWidths(word, fontName, fontSize):
    """Returns a list of glyph widths.

    One entry is produced per character of *word*, each obtained by calling
    stringWidth on that single character with *fontName* and *fontSize*.

    >>> getCharWidths('Hello', 'Courier', 10)
    [6.0, 6.0, 6.0, 6.0, 6.0]
    >>> from reportlab.pdfbase.cidfonts import UnicodeCIDFont
    >>> from reportlab.pdfbase.pdfmetrics import registerFont
    >>> registerFont(UnicodeCIDFont('HeiseiMin-W3'))
    >>> getCharWidths(u'\u6771\u4EAC', 'HeiseiMin-W3', 10)   #most kanji are 100 ems
    [10.0, 10.0]
    """
    #one stringWidth call per character; expect this to be slow on long input
    return [stringWidth(uChar, fontName, fontSize) for uChar in word]
f442326a11e9 reportlab: add files from utf8 branch
rgbecker
parents:
diff changeset
    57
3492
9f7288085d44 reportlab: fix support for cjk splitting
rgbecker
parents: 3328
diff changeset
    58
def wordSplit(word, maxWidths, fontName, fontSize, encoding='utf8'):
    """Attempts to break a word which lacks spaces into two parts, the first of which
    fits in the remaining space.  It is allowed to add hyphens or whatever it wishes.

    This is intended as a wrapper for some language- and user-choice-specific splitting
    algorithms.  It should only be called after line breaking on spaces, which covers western
    languages and is highly optimised already.  It works on the 'last unsplit word'.

    Presumably with further study one could write a Unicode splitting algorithm for text
    fragments which was much faster.

    word may be a Unicode string or an encoded byte string; byte strings are
    decoded with encoding before splitting and each returned piece is encoded
    back, so the pieces have the same type as the input.  maxWidths is passed
    straight through to dumbSplit.  The result is dumbSplit's list of
    [extraSpace, text] items.

    Courier characters should be 6 points wide.
    >>> wordSplit('HelloWorld', 30, 'Courier', 10)
    [[0.0, 'Hello'], [0.0, 'World']]
    >>> wordSplit('HelloWorld', 31, 'Courier', 10)
    [[1.0, 'Hello'], [1.0, 'World']]
    """
    wasEncoded = not isUnicode(word)
    if wasEncoded:
        uword = word.decode(encoding)
    else:
        uword = word

    charWidths = getCharWidths(uword, fontName, fontSize)
    lines = dumbSplit(uword, charWidths, maxWidths)

    if wasEncoded:
        #convert back to the caller's encoding
        lines = [[extraSpace, text.encode(encoding)] for (extraSpace, text) in lines]

    return lines
f442326a11e9 reportlab: add files from utf8 branch
rgbecker
parents:
diff changeset
    91
3492
9f7288085d44 reportlab: fix support for cjk splitting
rgbecker
parents: 3328
diff changeset
    92
def dumbSplit(word, widths, maxWidths):
    """This function attempts to fit as many characters as possible into the available
    space, cutting "like a knife" between characters.  This would do for Chinese.
    It returns a list of [extraSpace, text] items where text is a Unicode string,
    and extraSpace is the points of unused space available on the line.  This is a
    structure which is fairly easy to display, and supports 'backtracking' approaches
    after the fact.

    word must be a Unicode string and widths the per-character widths, indexed
    in step with word.  maxWidths is either a single width or a list/tuple of
    widths, one per output line; once the list is exhausted its last entry is
    reused for all further lines.

    extraSpace can be negative: a character that may not start a line is left
    hanging at the end of the line it overflowed.

    Test cases assume each character is ten points wide...

    >>> dumbSplit(u'Hello', [10]*5, 60)
    [[10, u'Hello']]
    >>> dumbSplit(u'Hello', [10]*5, 50)
    [[0, u'Hello']]
    >>> dumbSplit(u'Hello', [10]*5, 40)
    [[0, u'Hell'], [30, u'o']]
    """
    #NOTE: an unused local string of disabled, out-of-date examples (a width
    #smaller than one character, and a 30 point split of Japanese text) used
    #to live here; their expected values predated the [extraSpace, text] format.
    if not isinstance(maxWidths,(list,tuple)): maxWidths = [maxWidths]
    assert isUnicode(word)
    lines = []
    i = widthUsed = lineStartPos = 0
    maxWidth = maxWidths[0]
    nW = len(word)
    while i<nW:
        w = widths[i]
        c = word[i]
        widthUsed += w
        i += 1
        if widthUsed > maxWidth + _FUZZ and widthUsed>0:
            #character c (now at i-1) overflowed the line
            extraSpace = maxWidth - widthUsed
            if ord(c)<0x3000:
                # we appear to be inside a non-Asian script section.
                # (this is a very crude test but quick to compute).
                # This is likely to be quite rare so the speed of the
                # code below is hopefully not a big issue.  The main
                # situation requiring this is that a document title
                # with an english product name in it got cut.

                # we count back and look for
                #  - a space-like character
                #  - reversion to Kanji (which would be a good split point)
                #  - in the worst case, roughly half way back along the line
                limitCheck = (lineStartPos+i)>>1        #(arbitrary taste issue)
                for j in range(i-1,limitCheck,-1):
                    cj = word[j]
                    if category(cj)=='Zs' or ord(cj)>=0x3000:
                        k = j+1
                        if k<i:
                            #rewind so that word[k] becomes the overflowing
                            #character; give back the widths we un-consume
                            j = k+1
                            extraSpace += sum(widths[j:i])
                            w = widths[k]
                            c = word[k]
                            i = j
                            break

                #end of English-within-Asian special case

            #we are pushing this character back, but
            #the most important of the Japanese typography rules
            #if this character cannot start a line, wrap it up to this line so it hangs
            #in the right margin. We won't do two or more though - that's unlikely and
            #would result in growing ugliness.
            #and increase the extra space
            #bug fix contributed by Alexander Vasilenko <alexs.vasilenko@gmail.com>
            if c not in ALL_CANNOT_START and i>lineStartPos+1:
                #otherwise we need to push the character back
                #the i>lineStart+1 condition ensures progress
                i -= 1
                extraSpace += w

            lines.append([extraSpace, word[lineStartPos:i].strip()])
            try:
                maxWidth = maxWidths[len(lines)]
            except IndexError:
                maxWidth = maxWidths[-1]  # use the last one
            lineStartPos = i
            widthUsed = 0

    #any characters left?
    if widthUsed > 0:
        lines.append([maxWidth - widthUsed, word[lineStartPos:]])

    return lines
f442326a11e9 reportlab: add files from utf8 branch
rgbecker
parents:
diff changeset
   185
f442326a11e9 reportlab: add files from utf8 branch
rgbecker
parents:
diff changeset
   186
def kinsokuShoriSplit(word, widths, availWidth):
    #NOT USED BY THE REST OF THIS MODULE YET
    """Split according to Japanese rules according to CJKV (Lunde).

    Essentially look for "nice splits" so that we don't end a line
    with an open bracket, or start one with a full stop, or stuff like
    that.  There is no attempt to try to split compound words into
    constituent kanji.  It currently uses wrap-down: packs as much
    on a line as possible, then backtracks if needed

    This returns a number of words each of which should just about fit
    on a line.  If you give it a whole paragraph at once, it will
    do all the splits.

    It's possible we might slightly step over the width limit
    if we do hanging punctuation marks in future (e.g. dangle a Japanese
    full stop in the right margin rather than using a whole character
    box).

    word and widths must have the same length.  The return value is a list
    of strings, one per line; an empty word gives an empty list.  Only the
    strongly prohibited groups (CANNOT_START_LINE[0], CANNOT_END_LINE[0])
    are honoured.  When no legal break exists on a line, the line is cut
    where it overflowed.
    """
    assert len(word) == len(widths)
    lines = []
    nW = len(word)
    noStart = CANNOT_START_LINE[0]
    noEnd = CANNOT_END_LINE[0]

    def illegalBreak(pos):
        #a break before word[pos] is illegal if word[pos] may not start a
        #line or the preceding character may not end one
        return word[pos] in noStart or word[pos-1] in noEnd

    lineStart = 0
    curWidth = 0.0
    i = 0   #character index - we backtrack at times so cannot use for loop
    while i < nW:
        w = widths[i]
        #strict '<' as in the original draft; the first character of a line
        #is always accepted so that every line makes progress
        if curWidth + w < availWidth or i == lineStart:
            curWidth += w
            i += 1
            continue

        #end of line.  check legality, backtracking towards the line start
        breakPos = i
        while breakPos > lineStart + 1 and illegalBreak(breakPos):
            breakPos -= 1
        if illegalBreak(breakPos):
            #nothing legal on this line; fall back to the overflow point
            breakPos = i

        lines.append(word[lineStart:breakPos])
        lineStart = i = breakPos
        curWidth = 0.0

    #any characters left?
    if lineStart < nW:
        lines.append(word[lineStart:])
    return lines
f442326a11e9 reportlab: add files from utf8 branch
rgbecker
parents:
diff changeset
   222
f442326a11e9 reportlab: add files from utf8 branch
rgbecker
parents:
diff changeset
   223
# This recipe refers:
#
#  http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/148061
import re
from functools import reduce   # not a builtin on Python 3

# Matches one character in the range U+2E80..U+FFFF; the u prefix keeps the
# \u escapes meaningful on Python 2 as well as Python 3.
rx=re.compile(u"([\u2e80-\uffff])", re.UNICODE)
def cjkwrap(text, width, encoding="utf8"):
    """Wrap text to width columns, allowing a break after any CJK character.

    text may be a byte string in encoding or a Unicode string; the result
    has the same type as the input.  Every character matched by rx is
    followed by a NUL marker and a space before the text is split on
    spaces, so each such character becomes its own piece.  Because the
    marker is counted when the current line length is measured, each of
    those characters counts as two columns.  Pieces are then joined with:

      - a newline, when the current line plus the first line of the next
        piece would reach width;
      - nothing, when the previous piece ended with the NUL marker;
      - a single space otherwise.

    The NUL markers are removed before returning.
    """
    isEncoded = isinstance(text, bytes)
    utext = text.decode(encoding) if isEncoded else text

    def joinPieces(line, word):
        #length of the last physical line built so far
        col = len(line) - line.rfind('\n') - 1
        if col + len(word.split('\n', 1)[0]) >= width:
            sep = '\n'
        elif line[-1:] == '\0':
            sep = ''
        else:
            sep = ' '
        return '%s%s%s' % (line, sep, word)

    pieces = rx.sub(r'\1\0 ', utext).split(' ')
    wrapped = reduce(joinPieces, pieces).replace('\0', '')
    if isEncoded:
        return wrapped.encode(encoding)
    return wrapped
f442326a11e9 reportlab: add files from utf8 branch
rgbecker
parents:
diff changeset
   237
f442326a11e9 reportlab: add files from utf8 branch
rgbecker
parents:
diff changeset
   238
if __name__=='__main__':
    #run the doctests embedded in this module's docstrings
    import doctest
    from reportlab.lib import textsplit
    doctest.testmod(textsplit)