author rptlab
Tue, 30 Apr 2013 14:28:14 +0100
changeset 3723 99aa837b6703
parent 3721 0c93dd8ff567
child 3789 5bc95e1f3dd4
permissions -rw-r--r--
second stage of port to Python 3.3; working hello world

#Copyright ReportLab Europe Ltd. 2000-2012
#see license.txt for license details
Template parsing module inspired by REXX (with thanks to Donn Cave for discussion).

Template initialization has the form:
   T = Template(template_string, wild_card_marker, single_char_marker,
             x = regex_x, y = regex_y, ...)
Parsing has the form
   ([match1, match2, ..., matchn], lastindex) = T.PARSE(string)

Only the first argument is mandatory.

The resultant object efficiently parses strings that match the template_string,
giving a list of substrings that correspond to each "directive" of the template.

Template directives:

    The template may be initialized with a wildcard that matches any string
    up to the string matching the next directive (which may not be a wild
    card or single character marker) or the next literal sequence of characters
    of the template.  The character that represents a wildcard is specified
    by the wild_card_marker parameter, which has no default.

    For example, using X as the wildcard:

    >>> T = Template("prefixXinteriorX", "X")
    >>> T.PARSE("prefix this is before interior and this is after")
    ([' this is before ', ' and this is after'], 47)
    >>> T = Template("<X>X<X>", "X")
    >>> T.PARSE('<A HREF="index.html">go to index</A>')
    (['A HREF="index.html"', 'go to index', '/A'], 36)

    Obviously the character used to represent the wildcard must be distinct
    from the characters used to represent literals or other directives.

  Fixed length character sequences:
    The template may have a marker character which indicates a fixed
    length field.  All adjacent instances of this marker will be matched
    by a substring of the same length in the parsed string.  For example:

      >>> T = Template("NNN-NN-NNNN", single_char_marker="N")
      >>> T.PARSE("1-2-34-5-12")
      (['1-2', '34', '5-12'], 11)
      >>> T.PARSE("111-22-3333")
      (['111', '22', '3333'], 11)
      >>> T.PARSE("1111-22-3333")
      ValueError: literal not found at (3, '-')

    A template may have multiple fixed length markers, which allows fixed
    length fields to be adjacent, but recognized separately.  For example:

      >>> T = Template("MMDDYYX", "X", "MDY")
      >>> T.PARSE("112489 Somebody's birthday!")
      (['11', '24', '89', " Somebody's birthday!"], 27)

  Regular expression markers:
    The template may have markers associated with regular expressions.
    the regular expressions may be either string represenations of compiled.
    For example:
      >>> T = Template("v: s i", v=id, s=str, i=int)
      >>> T.PARSE("this_is_an_identifier: 'a string' 12344")
      (['this_is_an_identifier', "'a string'", '12344'], 39)

    Here id, str, and int are regular expression conveniences provided by
    this module.

  Directive markers may be mixed and matched, except that wildcards cannot precede
  wildcards or single character markers.
>>> T = Template("ssnum: NNN-NN-NNNN, fn=X, ln=X, age=I, quote=Q", "X", "N", I=int, Q=str)
>>> T.PARSE("ssnum: 123-45-6789, fn=Aaron, ln=Watters, age=13, quote='do be do be do'")
(['123', '45', '6789', 'Aaron', 'Watters', '13', "'do be do be do'"], 72)


import re, string
from types import StringType
from string import find

# template parsing
# EG: T = Template("(NNN)NNN-NNNN X X", "X", "N")
#     ([area, exch, ext, fn, ln], index) = T.PARSE("(908)949-2726 Aaron Watters")
class Template:

   def __init__(self,
       self.template = template
       self.wild_card = wild_card_marker
       self.char = single_char_marker
       # determine the set of markers for this template
       markers = list(marker_to_regex_dict.keys())
       if wild_card_marker:
       if single_char_marker:
          for ch in single_char_marker: # allow multiple scm's
          self.char = single_char_primary = single_char_marker[0]
       self.markers = markers
       for mark in markers:
           if len(mark)>1:
              raise ValueError("Marks must be single characters: "+repr(mark))
       # compile the regular expressions if needed
       self.marker_dict = marker_dict = {}
       for mark, rgex in marker_to_regex_dict.items():
           if type(rgex) == StringType:
              rgex = re.compile(rgex)
           marker_dict[mark] = rgex
       # determine the parse sequence
       parse_seq = []
       # dummy last char
       lastchar = None
       index = 0
       last = len(template)
       # count the number of directives encountered
       ndirectives = 0
       while index<last:
          start = index
          thischar = template[index]
          # is it a wildcard?
          if thischar == wild_card_marker:
             if lastchar == wild_card_marker:
                raise ValueError("two wild cards in sequence is not allowed")
             parse_seq.append( (wild_card_marker, None) )
             index = index+1
             ndirectives = ndirectives+1
          # is it a sequence of single character markers?
          elif single_char_marker and thischar in single_char_marker:
             if lastchar == wild_card_marker:
                raise ValueError("wild card cannot precede single char marker")
             while index<last and template[index] == thischar:
                index = index+1
             parse_seq.append( (single_char_primary, index-start) )
             ndirectives = ndirectives+1
          # is it a literal sequence?
          elif not thischar in markers:
             while index<last and not template[index] in markers:
                index = index+1
             parse_seq.append( (None, template[start:index]) )
          # otherwise it must be a re marker
             rgex = marker_dict[thischar]
             parse_seq.append( (thischar, rgex) )
             ndirectives = ndirectives+1
             index = index+1
          lastchar = template[index-1]
       self.parse_seq = parse_seq
       self.ndirectives = ndirectives

   def PARSE(self, str, start=0):
       ndirectives = self.ndirectives
       wild_card = self.wild_card
       single_char = self.char
       parse_seq = self.parse_seq
       lparse_seq = len(parse_seq) - 1
       # make a list long enough for substitutions for directives
       result = [None] * ndirectives
       current_directive_index = 0
       currentindex = start
       # scan through the parse sequence, recognizing
       for parse_index in range(lparse_seq + 1):
           (indicator, data) = parse_seq[parse_index]
           # is it a literal indicator?
           if indicator is None:
              if find(str, data, currentindex) != currentindex:
                 raise ValueError("literal not found at "+repr((currentindex,data)))
              currentindex = currentindex + len(data)
              # anything else is a directive
              # is it a wildcard?
              if indicator == wild_card:
                 # if it is the last directive then it matches the rest of the string
                 if parse_index == lparse_seq:
                    last = len(str)
                 # otherwise must look at next directive to find end of wildcard
                    # next directive must be re or literal
                    (nextindicator, nextdata) = parse_seq[parse_index+1]
                    if nextindicator is None:
                       # search for literal
                       last = find(str, nextdata, currentindex)
                       if last<currentindex:
                          raise ValueError("couldn't terminate wild with lit "+repr(currentindex))
                       # data is a re, search for it
                       last =, currentindex)
                       if last<currentindex:
                          raise ValueError("couldn't terminate wild with re "+repr(currentindex))
              elif indicator == single_char:
                 # data is length to eat
                 last = currentindex + data
                 # other directives are always regular expressions
                 last = data.match(str, currentindex) + currentindex
                 if last<currentindex:
                    raise ValueError("couldn't match re at "+repr(currentindex))
              #print "accepting", str[currentindex:last]
              result[current_directive_index] = str[currentindex:last]
              current_directive_index = current_directive_index+1
              currentindex = last
       # sanity check
       if current_directive_index != ndirectives:
          raise SystemError("not enough directives found?")
       return (result, currentindex)

# some useful regular expressions
STRINGLITREGEX = "'[^\n']*'"
SIMPLEINTREGEX = "["+string.digits+"]+"
id = re.compile(USERNAMEREGEX)
str = re.compile(STRINGLITREGEX)
int = re.compile(SIMPLEINTREGEX)

def test():
    global T, T1, T2, T3

    T = Template("(NNN)NNN-NNNN X X", "X", "N")
    print(T.PARSE("(908)949-2726 Aaron Watters"))

    T1 = Template("s --> s blah", s=str)
    s = "' <-- a string --> ' --> 'blah blah another string blah' blah"

    T2 = Template("s --> NNNiX", "X", "N", s=str, i=int)
    print(T2.PARSE("'A STRING' --> 15964653alpha beta gamma"))

    T3 = Template("XsXi", "X", "N", s=str, i=int)
    print(T3.PARSE("prefix'string'interior1234junk not parsed"))

    T4 = Template("MMDDYYX", "X", "MDY")
    print(T4.PARSE("122961 Somebody's birthday!"))

if __name__=="__main__": test()