src/reportlab/pdfbase/pdfutils.py
author rptlab
Tue, 30 Apr 2013 14:28:14 +0100
branch py33
changeset 3723 99aa837b6703
parent 3721 0c93dd8ff567
child 3731 b233dd0577ff
permissions -rwxr-xr-x
second stage of port to Python 3.3; working hello world

#Copyright ReportLab Europe Ltd. 2000-2012
#see license.txt for license details
#history http://www.reportlab.co.uk/cgi-bin/viewcvs.cgi/public/reportlab/trunk/reportlab/pdfbase/pdfutils.py
__version__=''' $Id$ '''
__doc__=''
# pdfutils.py - everything to do with images, streams,
# compression, and some constants

import sys
import os
import binascii
from reportlab import rl_config
from reportlab.lib.utils import getBytesIO, ImageReader, isStrType, isUnicodeType, isPython3

# Carriage return + line feed (octal 015, 012); the line separator this module
# uses when joining and splitting cached image data and wrapped text.
LINEEND = '\015\012'

def _chunker(src,dst=None,chunkSize=60):
    """Splits src into consecutive slices of at most chunkSize items.

    The slices are appended to dst, which is returned.  When dst is
    omitted a fresh list is created for every call (a shared default
    list would keep accumulating chunks from earlier calls)."""
    if dst is None:
        dst = []
    dst.extend([src[i:i+chunkSize] for i in range(0,len(src),chunkSize)])
    return dst

##########################################################
#
#  Image compression helpers.  Preprocessing a directory
#  of images will offer a vast speedup.
#
##########################################################
# image mode -> colour space name written after /CS in the inline image header
_mode2cs = {'RGB':'RGB', 'CMYK': 'CMYK', 'L': 'G'}
# image mode -> bytes per pixel, used to check the length of the raw pixel data
_mode2bpp = {'RGB': 3, 'CMYK':4, 'L':1}
def makeA85Image(filename,IMG=None):
    """Builds the lines of a PDF inline image (BI ... ID ... EI) for filename.

    The pixel data is zlib compressed and then ASCII base 85 encoded,
    and the encoded text is cut into 60 character pieces.  If IMG is a
    list the ImageReader created for the file is appended to it.
    Returns the list of lines."""
    import zlib
    img = ImageReader(filename)
    if IMG is not None:
        IMG.append(img)

    width, height = img.getSize()
    pixels = img.getRGBData()

    # dictionary part of the inline image: size, bit depth, colour space, filters
    header = '/W %s /H %s /BPC 8 /CS /%s /F [/A85 /Fl]' % (width, height, _mode2cs[img.mode])
    expected = width * height * _mode2bpp[img.mode]
    assert len(pixels) == expected, "Wrong amount of data for image"

    code = ['BI', header, 'ID']
    # flate first (fast), then base 85 (may be slow without the accelerator)
    encoded = _AsciiBase85Encode(zlib.compress(pixels))
    _chunker(encoded, code)
    code.append('EI')
    return code
def makeRawImage(filename,IMG=None):
    """Builds the lines of a PDF inline image (BI ... ID ... EI) for filename.

    Like makeA85Image, but the pixel data is only zlib compressed (no
    ASCII base 85 step); the compressed data is cut into 60 item pieces.
    If IMG is a list the ImageReader created for the file is appended
    to it.  Returns the list of lines."""
    import zlib
    img = ImageReader(filename)
    if IMG is not None:
        IMG.append(img)

    width, height = img.getSize()
    pixels = img.getRGBData()

    # dictionary part of the inline image: size, bit depth, colour space, filter
    header = '/W %s /H %s /BPC 8 /CS /%s /F [/Fl]' % (width, height, _mode2cs[img.mode])
    expected = width * height * _mode2bpp[img.mode]
    assert len(pixels) == expected, "Wrong amount of data for image"

    code = ['BI', header, 'ID']
    _chunker(zlib.compress(pixels), code)
    code.append('EI')
    return code

def cacheImageFile(filename, returnInMemory=0, IMG=None):
    """Processes image as if for encoding and caches the result.

    The cache file has the same base name as filename and the extension
    '.a85' (when rl_config.useA85 is true) or '.bin'.

    If filename is itself a cache file its non-empty lines are returned
    as a list when returnInMemory is true; IOError is raised if it does
    not exist.  Otherwise the image is encoded; the list of lines is
    returned when returnInMemory is true, else it is written to the
    cache file and nothing is returned.  IMG is passed through to the
    encoder, which appends the ImageReader it creates to it."""

    cachedname = os.path.splitext(filename)[0] + (rl_config.useA85 and '.a85' or '.bin')
    if filename==cachedname:
        if cachedImageExists(filename):
            from reportlab.lib.utils import open_for_read
            if returnInMemory:
                # list(): filter() is lazy on Python 3, and the other
                # branch of this function returns a real list
                return list(filter(None,open_for_read(cachedname).read().split(LINEEND)))
        else:
            raise IOError('No such cached image %s' % filename)
    else:
        if rl_config.useA85:
            code = makeA85Image(filename,IMG)
        else:
            code = makeRawImage(filename,IMG)
        if returnInMemory: return code

        #save it to a file; 'with' closes it even if the write fails
        # NOTE(review): the file is binary but LINEEND is text - confirm
        # the intended str/bytes handling on Python 3
        with open(cachedname,'wb') as f:
            f.write(LINEEND.join(code)+LINEEND)
        if rl_config.verbose:
            print('cached image as %s' % cachedname)


def preProcessImages(spec):
    r"""Preprocesses one or more image files.

    Accepts either a filespec ('C:\mydir\*.jpg') or a list
    of image filenames, crunches them all to save time.  Run this
    to save huge amounts of time when repeatedly building image
    documents.

    Files which already have an up to date cached version are
    skipped; the others are handed to cacheImageFile."""

    import glob

    # types.StringType does not exist on Python 3; str is the same
    # type on Python 2 and works on both
    if isinstance(spec, str):
        filelist = glob.glob(spec)
    else:  #list or tuple OK
        filelist = spec

    for filename in filelist:
        if cachedImageExists(filename):
            if rl_config.verbose:
                print('cached version of %s already exists' % filename)
        else:
            cacheImageFile(filename)


def cachedImageExists(filename):
    """Determines if a cached image already exists for a given file.

    Determines if a cached image exists which has the same name
    and equal or newer date to the given file.  Returns 1 if so,
    otherwise 0."""
    # '.bin' needs its dot so that the name agrees with the one
    # cacheImageFile writes
    cachedname = os.path.splitext(filename)[0] + (rl_config.useA85 and '.a85' or '.bin')
    if not os.path.isfile(cachedname):
        return 0
    #see if it is newer; item 8 of the stat result is the modification time
    original_date = os.stat(filename)[8]
    cached_date = os.stat(cachedname)[8]
    if original_date > cached_date:
        return 0
    return 1


##############################################################
#
#            PDF Helper functions
#
##############################################################

# Use escapePDF from the _rl_accel module when it can be imported (either
# directly or from reportlab.lib); otherwise build a table driven fallback.
try:
    from _rl_accel import escapePDF, _instanceEscapePDF
    _escape = escapePDF
except ImportError:
    try:
        from reportlab.lib._rl_accel import escapePDF, _instanceEscapePDF
        _escape = escapePDF
    except ImportError:
        _instanceEscapePDF=None
        # byte value -> escaped text: printable ASCII is kept, backslash and
        # parentheses get a backslash prefix, everything else becomes a
        # three digit octal escape
        _ESCAPEDICT={}
        for c in range(256):
            if 32<=c<127:
                if chr(c) in '\\()':
                    _ESCAPEDICT[c] = '\\'+chr(c)
                else:
                    _ESCAPEDICT[c] = chr(c)
            else:
                _ESCAPEDICT[c]= '\\%03o' % c
        del c
        #Michael Hudson donated this
        def _escape(s):
            "Returns s with each item replaced by its entry in _ESCAPEDICT."
            table = _ESCAPEDICT
            # items are ints when iterating Python 3 bytes, characters otherwise
            return ''.join([table[ch if type(ch) is int else ord(ch)] for ch in s])

def _normalizeLineEnds(text,desired=LINEEND,unlikely='\x00\x01\x02\x03'):
    """Normalizes different line end character(s).

    Every CRLF, CR and LF in text is first turned into the
    placeholder 'unlikely' and the placeholders are then turned into
    'desired'; the result is returned."""
    # CRLF has to go first so that it is not seen as a CR plus an LF
    for ending in ('\r\n', '\r', '\n'):
        text = text.replace(ending, unlikely)
    return text.replace(unlikely, desired)

def _AsciiHexEncode(input):
    """Encodes input using ASCII-Hex coding.

    Each byte of input becomes two hexadecimal digits and the result
    is terminated with '>'.  Unicode input is encoded as UTF-8 first.
    Helper function used by images."""
    data = input.encode('utf-8') if isUnicodeType(input) else input
    return binascii.b2a_hex(data) + b'>'


def _AsciiHexDecode(input):
    """Decodes input using ASCII-Hex coding.

    Whitespace is ignored and the data must end with '>'.  Returns a
    string with one character per pair of hexadecimal digits.
    Not used except to provide a test of the inverse function."""
    if not isStrType(input):
        input = input.decode('utf-8')

    #drop every kind of whitespace
    compact = ''.join(input.split())
    assert compact[-1] == '>', 'Invalid terminator for Ascii Hex Stream'
    hexdigits = compact[:-1]  #without the terminator
    assert len(hexdigits) % 2 == 0, 'Ascii Hex stream has odd number of bytes'

    chars = []
    for pos in range(0, len(hexdigits), 2):
        chars.append(chr(int(hexdigits[pos:pos+2], 16)))
    return ''.join(chars)
        
if 1: # for testing always define this
    def _AsciiBase85EncodePYTHON(input):
        """Encodes input using ASCII-Base85 coding.

        This is a compact encoding used for binary data within
        a PDF file.  Four bytes of binary data become five bytes of
        ASCII.

        input may be a byte string or a text string.  Every item is
        reduced to its integer code (characters via ord()), so a text
        string and the byte string with the same code values encode
        identically.  Four zero bytes are written as the single
        character 'z'; a final group of 1-3 bytes is padded with zeros
        and written as 2-4 characters.  Returns the encoded text
        followed by the terminator '~>'."""
        # Work on integer codes throughout: indexing gives ints for
        # Python 3 bytes but one character strings otherwise.  Treating
        # the whole input the same way keeps the trailing group
        # consistent with the full groups.
        codes = [c if isinstance(c, int) else ord(c) for c in input]

        # special rules apply if not a multiple of four bytes.
        whole_word_count, remainder_size = divmod(len(codes), 4)
        cut = 4 * whole_word_count

        out = []
        append = out.append
        for offset in range(0, cut, 4):
            b1, b2, b3, b4 = codes[offset:offset+4]
            num = 16777216 * b1 + 65536 * b2 + 256 * b3 + b4

            if num == 0:
                #special case
                append('z')
            else:
                #solve for five base-85 numbers
                temp, c5 = divmod(num, 85)
                temp, c4 = divmod(temp, 85)
                temp, c3 = divmod(temp, 85)
                c1, c2 = divmod(temp, 85)
                append(chr(c1+33) + chr(c2+33) + chr(c3+33) + chr(c4+33) + chr(c5+33))

        # now the final bit at the end: pad it to four bytes, encode it
        # as usual (no 'z' shortcut here) and keep one character more
        # than there were bytes.
        if remainder_size > 0:
            b1, b2, b3, b4 = codes[cut:] + [0] * (4 - remainder_size)
            num = 16777216 * b1 + 65536 * b2 + 256 * b3 + b4

            #solve for c1..c5
            temp, c5 = divmod(num, 85)
            temp, c4 = divmod(temp, 85)
            temp, c3 = divmod(temp, 85)
            c1, c2 = divmod(temp, 85)

            lastword = chr(c1+33) + chr(c2+33) + chr(c3+33) + chr(c4+33) + chr(c5+33)
            #write out most of the bytes.
            append(lastword[0:remainder_size + 1])

        #terminator code for ascii 85
        append('~>')
        return ''.join(out)

    def _AsciiBase85DecodePYTHON(input):
        """Decodes input using ASCII-Base85 coding.

        This is not used - Acrobat Reader decodes for you
        - but a round trip is essential for testing.

        input is the encoded text (a byte string is accepted too);
        whitespace is ignored and the data must end with '~>'.
        Returns a text string holding one character per decoded byte."""
        # the rest of the function works on text; latin-1 maps every
        # byte to the character with the same code
        if isinstance(input, bytes) and not isinstance(input, str):
            input = input.decode('latin-1')

        #strip all whitespace
        stripped = ''.join(input.split())
        #check end
        assert stripped[-2:] == '~>', 'Invalid terminator for Ascii Base 85 Stream'
        #chop off terminator and expand the 'z' shorthand for four zero bytes
        stripped = stripped[:-2].replace('z','!!!!!')

        # special rules apply if not a multiple of five bytes.
        whole_word_count, remainder_size = divmod(len(stripped), 5)
        cut = 5 * whole_word_count
        digits = [ord(ch) - 33 for ch in stripped]

        out = []
        append = out.append
        for offset in range(0, cut, 5):
            c1, c2, c3, c4, c5 = digits[offset:offset+5]
            num = (((85*c1+c2)*85+c3)*85+c4)*85 + c5

            temp, b4 = divmod(num,256)
            temp, b3 = divmod(temp,256)
            b1, b2 = divmod(temp, 256)
            append(chr(b1) + chr(b2) + chr(b3) + chr(b4))

        #decode however many characters are left over
        if remainder_size > 0:
            # pad with zero digits (the character '!')
            c1, c2, c3, c4, c5 = digits[cut:] + [0] * (5 - remainder_size)
            # the encoder dropped the low digits, which rounds the
            # number down; adding the largest value the dropped bytes
            # could hold brings the leading bytes back to their
            # original values
            num = (((85*c1+c2)*85+c3)*85+c4)*85 + (c5
                     +(0,0,0xFFFFFF,0xFFFF,0xFF)[remainder_size])
            temp, b4 = divmod(num,256)
            temp, b3 = divmod(temp,256)
            b1, b2 = divmod(temp, 256)

            # n leftover characters carry n-1 bytes (a single leftover
            # character carries none)
            append(''.join([chr(b) for b in (b1, b2, b3)[:remainder_size-1]]))

        return ''.join(out)

# Pick the ASCII base 85 encoder: the one from the optional _rl_accel module
# if it can be imported, else the pure Python version defined in this file.
try:
    from _rl_accel import _AsciiBase85Encode                    # builtin or on the path
except ImportError:
    try:
        from reportlab.lib._rl_accel import _AsciiBase85Encode  # where we think it should be
    except ImportError:
        _AsciiBase85Encode = _AsciiBase85EncodePYTHON

# Same selection for the decoder.
try:
    from _rl_accel import _AsciiBase85Decode                    # builtin or on the path
except ImportError:
    try:
        from reportlab.lib._rl_accel import _AsciiBase85Decode  # where we think it should be
    except ImportError:
        _AsciiBase85Decode = _AsciiBase85DecodePYTHON

def _wrap(input, columns=60):
    """Wraps input at a given column size by inserting LINEEND characters.

    input is cut into pieces of 'columns' characters which are joined
    with LINEEND.  If the last piece would be a single character, the
    last character of the piece before it is moved onto it.  Empty
    input gives an empty string."""
    if not input:
        return ''
    output = [input[pos:pos+columns] for pos in range(0, len(input), columns)]
    #avoid HP printer problem; only possible when there is a previous
    #line to borrow a character from
    if len(output)>1 and len(output[-1])==1:
        output[-2:] = [output[-2][:-1],output[-2][-1]+output[-1]]
    return LINEEND.join(output)


#########################################################################
#
#  JPEG processing code - contributed by Eric Johnson
#
#########################################################################

# Read data from the JPEG file. We should probably be using PIL to
# get this information for us -- but this way is more fun!
# Returns (width, height, color components) as a triple
# This is based on Thomas Merz's code from GhostScript (viewjpeg.ps)
def readJPEGInfo(image):
    """Read width, height and number of components from open JPEG file.

    image is read byte by byte until a supported start-of-frame marker
    is found.  Returns the tuple (width, height, components).  Raises
    PDFError for a frame that is not 8 bits per component or for an
    unsupported marker."""

    import struct
    from .pdfdoc import PDFError

    unpack = struct.unpack

    #Acceptable JPEG Markers:
    #  SOF0=baseline, SOF1=extended sequential or SOF2=progressive
    sofMarkers = (0xC0, 0xC1, 0xC2)

    #JPEG markers without additional parameters
    bareMarkers = (0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0x01)

    #Unsupported JPEG Markers
    rejectedMarkers = (0xC3, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCD, 0xCE, 0xCF)

    #read JPEG marker segments until we find SOFn marker
    while True:
        (byte,) = unpack('B', image.read(1))
        if byte != 0xFF:
            continue

        #found marker prefix; the next byte says which marker it is
        (marker,) = unpack('B', image.read(1))
        if marker in sofMarkers:
            image.seek(2, 1)            #skip segment length
            (precision,) = unpack('B', image.read(1))
            if precision != 8:
                raise PDFError('JPEG must have 8 bits per component')
            #height and width are stored high byte first
            high, low = unpack('BB', image.read(2))
            height = (high << 8) + low
            high, low = unpack('BB', image.read(2))
            width = (high << 8) + low
            (color,) = unpack('B', image.read(1))
            return width, height, color

        if marker in rejectedMarkers:
            raise PDFError('JPEG Unsupported JPEG marker: %0.2x' % marker)

        if marker not in bareMarkers:
            #segment with parameters: the two byte length counts
            #itself, so skip length - 2 further bytes
            high, low = unpack('BB', image.read(2))
            image.seek((high << 8) + low - 2, 1)

class _fusc:
    """XORs text with a repeating key, base 85 encodes it and rotates the result.

    k is the non empty key string; n is the rotation distance (7 is
    used when int(n) is zero)."""
    def __init__(self,k, n):
        assert k, 'Argument k should be a non empty string'
        self._k = k
        self._klen = len(k)
        self._n = int(n) or 7

    def encrypt(self,s):
        "XOR s with the key, base 85 encode it and rotate by n."
        mixed = self.__fusc([ord(ch) for ch in s])
        armoured = _AsciiBase85Encode(''.join([chr(v) for v in mixed]))
        return self.__rotate(armoured,self._n)

    def decrypt(self,s):
        "Inverse of encrypt: rotate back, base 85 decode and XOR with the key."
        armoured = self.__rotate(s,-self._n)
        mixed = self.__fusc([ord(ch) for ch in _AsciiBase85Decode(armoured)])
        return ''.join([chr(v) for v in mixed])

    def __rotate(self,s,n):
        "Returns s with its last n items moved to the front (negative n goes the other way)."
        size = len(s)
        if n<0:
            n += size
        n %= size
        if n == 0:
            return s
        return s[-n:]+s[:size-n]

    def __fusc(self,s):
        "XORs the list of ints s with the codes of the key, repeated to the same length."
        slen = len(s)
        repeats = int(slen/self._klen)+1
        keystream = [ord(ch) for ch in (repeats*self._k)[:slen]]
        return [a ^ b for a, b in zip(s, keystream)]