# author andy
# Fri, 17 Apr 2009 14:29:52 +0000
# changeset 3138 3c1f87352b7b
# parent 3029 eded59f94021
# child 3326 ce725978d11c
# permissions -rw-r--r--
# fixed a docstring causing a test to fail

"""Very simple and fast XML parser, used for intra-paragraph text.

Devised by Aaron Watters in the bad old days before Python had fast
parsers available.  Constructs the lightest possible in-memory
representation; parses most files we have seen in pure python very
quickly.

The output structure is the same as the one produced by pyRXP,
our validating C-based parser, which was written later.  It will
use pyRXP if available.

This is used to parse intra-paragraph markup.

Example parse::

    <this type="xml">text <b>in</b> xml</this>

    ( "this",
      {"type": "xml"},
      [ "text ",
        ("b", None, ["in"], None),
        " xml"
        ],
       None )

    { 0: "this",
      "type": "xml",
      1: ["text ",
          {0: "b", 1:["in"]},
          " xml"]
    }

Ie, xml tag translates to a tuple:
 (name, dictofattributes, contentlist, miscellaneousinfo)

where miscellaneousinfo can be anything, (but defaults to None)
(with the intention of adding, eg, line number information)

special cases: name of "" means "top level, no containing tag".
Top level parse always looks like this::

    ("", None, list, None)

 contained text of None means <simple_tag/>

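In order to support stuff like::

    <this></this><one></one>

(more than one outermost tag in the same text), the top level result is
always wrapped in the unnamed tuple shown above.
"""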

RequirePyRXP = 0        # set this to 1 to disable the nonvalidating fallback parser.

import string
try:
    #raise ImportError, "dummy error"
    simpleparse = 0
    import pyRXPU
    def warnCB(s):
        """Warning callback handed to the pyRXP parser: echo the message."""
        print(s)
    # NOTE(review): this call had lost its closing parenthesis (and the
    # enclosing ``try:``); further keyword options may have been lost with
    # them -- confirm against the upstream file.
    pyRXP_parser = pyRXPU.Parser(
                        warnCB = warnCB,
                        srcName='string input',
                        ReturnUTF8 = 1,
                        )
    def parsexml(xmlText, oneOutermostTag=0,eoCB=None,entityReplacer=None,parseOpts={}):
        """Parse xmlText with the shared pyRXP parser.

        xmlText -- the text to parse.
        oneOutermostTag -- if true, return the parsed tuple as produced by
            pyRXP; otherwise wrap it in the unnamed top level tuple
            ('', None, [tree], None).
        eoCB -- stored on the shared parser as its eoCB attribute before
            parsing.
        entityReplacer -- accepted but not used by this implementation.
        parseOpts -- extra keyword options forwarded to parser.parse; the
            default dictionary is only unpacked, never modified.
        """
        pyRXP_parser.eoCB = eoCB
        p = pyRXP_parser.parse(xmlText,**parseOpts)
        if oneOutermostTag and p:
            return p
        return ('',None,[p],None)
except ImportError:
    # pyRXPU is not installed: fall back to the pure python parser
    simpleparse = 1

replacelist = [("&lt;", "<"), ("&gt;", ">"), ("&amp;", "&")] # amp must be last
#replacelist = []
def unEscapeContentList(contentList):
    """Return a copy of contentList with the standard entities unescaped.

    Every entry containing "&" has the (old, new) pairs of ``replacelist``
    applied in order; all other entries are copied through unchanged.
    "&amp;" is replaced last so that text such as "&amp;lt;" becomes the
    literal "&lt;" rather than "<".

    contentList -- sequence of content entries (text, or parsed sub tags).
    Returns a new list; the input is not modified.
    """
    result = []
    for e in contentList:
        if "&" in e:
            for (old, new) in replacelist:
                e = e.replace(old, new)
        # every entry is kept, whether or not it needed unescaping
        result.append(e)
    return result

def parsexmlSimple(xmltext, oneOutermostTag=0,eoCB=None,entityReplacer=unEscapeContentList):
    """official interface: discard unused cursor info

    xmltext -- the text to parse.
    oneOutermostTag -- if true, return only the first entry of the top
        level content list; otherwise return the whole top level tuple.
    eoCB -- accepted but not used by this implementation.
    entityReplacer -- passed through to parsexml0.

    Raises ImportError when RequirePyRXP is set, because the fallback
    parser has then been disabled.
    """
    if RequirePyRXP:
        raise ImportError("pyRXP not found, fallback parser disabled")
    (result, cursor) = parsexml0(xmltext,entityReplacer=entityReplacer)
    if oneOutermostTag:
        return result[2][0]
    return result

if simpleparse:
    parsexml = parsexmlSimple

def parseFile(filename):
    """Read the named file and return the result of parsexml on its text."""
    f = open(filename, 'r')
    try:
        raw = f.read()
    finally:
        # close explicitly instead of relying on garbage collection
        f.close()
    return parsexml(raw)

verbose = 0

def skip_prologue(text, cursor):
    """skip any prologue found after cursor, return index of rest of text

    Starting at ``cursor``, repeatedly looks at the next "<".  While it
    opens one of the prologue elements (<!DOCTYPE, <?xml, <!--) the cursor
    is moved just past the next ">".  Scanning stops at the first "<" that
    does not open a prologue element, or when no "<" is left.

    Raises ValueError if a prologue element has no closing ">".
    """
    ### NOT AT ALL COMPLETE!!! definitely can be confused!!!
    # (eg by a ">" inside a comment or DOCTYPE: the first ">" always wins)
    prologue_elements = ("!DOCTYPE", "?xml", "!--")
    done = None
    while done is None:
        openbracket = text.find("<", cursor)
        if openbracket<0: break
        past = openbracket+1
        found = None
        for e in prologue_elements:
            le = len(e)
            if text[past:past+le]==e:
                found = 1
                cursor = text.find(">", past)
                if cursor<0:
                    raise ValueError("can't close prologue %s" % repr(e))
                cursor = cursor+1
                # the markers are mutually exclusive, no need to try the rest
                break
        if found is None:
            # the next tag is not a prologue element: stop skipping
            done = 1
    return cursor

def parsexml0(xmltext, startingat=0, toplevel=1,
        # snarf in some globals
        strip=string.strip, split=string.split, find=string.find, entityReplacer=unEscapeContentList,
        ):
    """simple recursive descent xml parser...

    Parses one element of ``xmltext``, looking for it from ``startingat``.

    xmltext -- the complete text being parsed.
    startingat -- index from which to look for the element.
    toplevel -- anything but None means "top level": the text is stripped,
        any prologue is skipped and all content is collected under an
        unnamed (NONAME) pseudo tag.  Recursive calls pass None.
    strip, split, find -- string helpers bound as default arguments.
    entityReplacer -- callable applied to a non-empty content list before
        it is returned; a false value leaves the content untouched.

    Returns ((name, attributes, contentlist, extra), cursor) where cursor
    is the index just past the parsed element.  attributes is None when
    the tag has none, contentlist is None for a <simple/> tag, and extra
    is always None.
    special case: comment returns (None, cursor)

    Raises ValueError for malformed input: unclosed tags, comments, CDATA
    or attribute values, unquoted attribute values, tags with no name, and
    close tags that do not match the open tag.
    """
    NameString = NONAME
    ContentList = AttDict = ExtraStuff = None
    if toplevel is not None:
        xmltext = strip(xmltext)
    cursor = startingat
    #look for interesting starting points
    firstbracket = find(xmltext, "<", cursor)
    afterbracket2char = xmltext[firstbracket+1:firstbracket+3]
    docontents = 1
    if firstbracket<0:
        # no tags
        if toplevel is None:
            raise ValueError("no tags at non-toplevel %s" % repr(xmltext[cursor:cursor+20]))
        ContentList = [xmltext[cursor:]]
        if entityReplacer: ContentList = entityReplacer(ContentList)
        return (NameString, AttDict, ContentList, ExtraStuff), len(xmltext)
    L = []
    # look for start tag
    if toplevel is not None:
        # the outer level is always unnamed; the tags themselves are picked
        # up by the content loop below
        NameString = name = NONAME
        cursor = skip_prologue(xmltext, cursor)
    # special case: CDATA
    elif afterbracket2char=="![" and xmltext[firstbracket:firstbracket+9]=="<![CDATA[":
        # skip straight to the close marker
        startcdata = firstbracket+9
        endcdata = find(xmltext, CDATAENDMARKER, startcdata)
        if endcdata<0:
            raise ValueError("unclosed CDATA %s" % repr(xmltext[cursor:cursor+20]))
        NameString = CDATAMARKER
        # NOTE(review): this list still goes through entityReplacer at the
        # end of the function, so entities inside CDATA get unescaped too.
        ContentList = [xmltext[startcdata: endcdata]]
        cursor = endcdata+len(CDATAENDMARKER)
        docontents = None
    # special case COMMENT
    elif afterbracket2char=="!-" and xmltext[firstbracket:firstbracket+4]=="<!--":
        endcommentdashes = find(xmltext, "--", firstbracket+4)
        if endcommentdashes<firstbracket:
            raise ValueError("unterminated comment %s" % repr(xmltext[cursor:cursor+20]))
        endcomment = endcommentdashes+2
        # slice rather than index: the text may end right after the dashes
        if xmltext[endcomment:endcomment+1]!=">":
            raise ValueError("invalid comment: contains double dashes %s" % repr(xmltext[cursor:cursor+20]))
        return (None, endcomment+1) # shortcut exit
    else:
        # get the rest of the tag
        closebracket = find(xmltext, ">", firstbracket)
        noclose = closebracket<0
        startsearch = closebracket+1
        pastfirstbracket = firstbracket+1
        tagcontent = xmltext[pastfirstbracket:closebracket]
        # shortcut, no equal means nothing but name in the tag content
        if '=' not in tagcontent:
            if noclose:
                # without this check the cursor would be reset to 0 and the
                # same text would be parsed again and again
                raise ValueError("unclosed start tag %s" % repr(xmltext[firstbracket:firstbracket+20]))
            if tagcontent[-1:]=="/":
                # simple case
                tagcontent = tagcontent[:-1]
                docontents = None
            name = strip(tagcontent)
            if not name:
                raise ValueError("tag with no name %s" % repr(xmltext[firstbracket:firstbracket+20]))
            NameString = name
            cursor = startsearch
        else:
            if '"' in tagcontent:
                # make sure the close bracket isn't inside doublequote pairs
                stop = None
                # not inside double quotes! (the split should have odd length)
                if noclose or len(split(tagcontent+".", '"'))% 2:
                    stop = 1
                while stop is None:
                    closebracket = find(xmltext, ">", startsearch)
                    startsearch = closebracket+1
                    noclose = closebracket<0
                    tagcontent = xmltext[pastfirstbracket:closebracket]
                    # not inside double quotes! (the split should have odd length)
                    if noclose or len(split(tagcontent+".", '"'))% 2:
                        stop = 1
            if noclose:
                raise ValueError("unclosed start tag %s" % repr(xmltext[firstbracket:firstbracket+20]))
            cursor = startsearch
            # handle simple tag /> syntax
            if xmltext[closebracket-1]=="/":
                tagcontent = tagcontent[:-1]
                docontents = None
            tagcontent = strip(tagcontent)
            # splitting on "=" gives: "name att1", 'value1 att2', ..., 'valueN'
            taglist = split(tagcontent, "=")
            taglist0 = taglist[0]
            taglist0list = split(taglist0)
            name = taglist0list[0]
            NameString = name
            # now parse the attributes
            attributename = taglist0list[-1]
            # put a fake att name at end of last taglist entry for consistent parsing
            taglist[-1] = taglist[-1]+" f"
            AttDict = D = {}
            taglistindex = 1
            lasttaglistindex = len(taglist)
            while taglistindex<lasttaglistindex:
                attentry = taglist[taglistindex]
                taglistindex = taglistindex+1
                attentry = strip(attentry)
                if attentry[:1]!='"':
                    raise ValueError("attribute value must start with double quotes" + repr(attentry))
                while '"' not in attentry[1:]:
                    # must have an = inside the attribute value...
                    # (>= : taglistindex must stay a valid index into taglist)
                    if taglistindex>=lasttaglistindex:
                        raise ValueError("unclosed value " + repr(attentry))
                    nextattentry = taglist[taglistindex]
                    taglistindex = taglistindex+1
                    attentry = "%s=%s" % (attentry, nextattentry)
                attentry = strip(attentry) # only needed for while loop...
                attlist = split(attentry)
                # the last word is the name of the next attribute (or the fake one)
                nextattname = attlist[-1]
                attvalue = attentry[:-len(nextattname)]
                attvalue = strip(attvalue)
                try:
                    first = attvalue[0]; last=attvalue[-1]
                except IndexError:
                    raise ValueError("attvalue,attentry,attlist="+repr((attvalue, attentry,attlist)))
                if first==last=='"' or first==last=="'":
                    attvalue = attvalue[1:-1]
                D[attributename] = attvalue
                attributename = nextattname
    # pass over other tags and content looking for end tag
    if docontents is not None:
        ContentList = L
    while docontents is not None:
        nextopenbracket = find(xmltext, "<", cursor)
        if nextopenbracket<cursor:
            # no next open bracket found
            if name==NONAME:
                docontents=None # done
                remainder = xmltext[cursor:]
                cursor = len(xmltext)
                if remainder:
                    L.append(remainder)
            else:
                raise ValueError("no close bracket for %s found after %s" % (name,repr(xmltext[cursor: cursor+20])))
        # is it a close bracket?
        # (sliced so that a "<" ending the text does not raise IndexError)
        elif xmltext[nextopenbracket+1:nextopenbracket+2]=="/":
            nextclosebracket = find(xmltext, ">", nextopenbracket)
            if nextclosebracket<nextopenbracket:
                raise ValueError("unclosed close tag %s" % repr(xmltext[nextopenbracket: nextopenbracket+20]))
            closetagcontents = xmltext[nextopenbracket+2: nextclosebracket]
            closetaglist = split(closetagcontents)
            # name should match
            closename = closetaglist[0]
            if name!=closename:
                prefix = xmltext[:cursor]
                endlinenum = len(split(prefix, "\n"))
                prefix = xmltext[:startingat]
                linenum = len(split(prefix, "\n"))
                raise ValueError(
                   "at lines %s...%s close tag name doesn't match %s...%s %s" %(
                   linenum, endlinenum, repr(name), repr(closename), repr(xmltext[cursor: cursor+100])))
            remainder = xmltext[cursor:nextopenbracket]
            if remainder:
                L.append(remainder)
            cursor = nextclosebracket+1
            docontents = None # done
        # otherwise we are looking at a new tag, recursively parse it...
        else:
            # first record any intervening content
            remainder = xmltext[cursor:nextopenbracket]
            if remainder:
                L.append(remainder)
            (parsetree, cursor) = parsexml0(xmltext, startingat=nextopenbracket, toplevel=None, entityReplacer=entityReplacer)
            # comments come back as None and are dropped
            if parsetree:
                L.append(parsetree)
    # maybe should check for trailing garbage at top level?
    if ContentList:
        if entityReplacer: ContentList = entityReplacer(ContentList)
    t = (NameString, AttDict, ContentList, ExtraStuff)
    return (t, cursor)

import types
def pprettyprint(parsedxml):
    """pretty printer mainly for testing

    parsedxml -- either a text string, which is returned unchanged, or a
        (name, attributes, contentlist, extra) tuple.

    Returns the markup as a string with nested content indented by three
    spaces.  An unnamed tuple yields just its content; a contentlist of
    None yields a <simple/> tag.

    Raises ValueError if an unnamed tuple carries attributes.
    """
    try:
        text_types = (basestring,)  # Python 2: str and unicode
    except NameError:
        text_types = (str,)
    if isinstance(parsedxml, text_types):
        return parsedxml
    (name, attdict, textlist, extra) = parsedxml
    if not attdict: attdict={}
    attlist = ["%s=%s" % (k, repr(v)) for (k, v) in attdict.items()]
    attributes = " ".join(attlist)
    if not name and attributes:
        raise ValueError("name missing with attributes???")
    if textlist is not None:
        # with content
        textpprint = "\n".join([pprettyprint(t) for t in textlist])
        if not name:
            return textpprint # no outer tag
        # indent it
        textpprint = "   " + textpprint.replace("\n", "\n   ")
        return "<%s %s>\n%s\n</%s>" % (name, attributes, textpprint, name)
    # otherwise must be a simple tag
    return "<%s %s/>" % (name, attributes)

dump = 0
def testparse(s):
    """Parse s with parsexmlSimple, report the time taken and optionally dump.

    The module level ``dump`` flag selects the output: bit 4 pretty-prints
    the raw parse tree, bit 1 prints it reformatted by pprettyprint.
    """
    from time import time
    from pprint import pprint
    now = time()
    D = parsexmlSimple(s)
    print("DONE %s" % (time()-now))
    if dump&4:
        pprint(D)
    if dump&1:
        print("============== reformatting")
        p = pprettyprint(D)
        print(p)

def test():
    """Run testparse on a sample with entities, a comment, CDATA and brackets inside an attribute value."""
    testparse("""<this type="xml">text &lt;&gt;<b>in</b> <funnytag foo="bar"/> xml</this>
                 <!-- comment -->
                 <![CDATA[
                 <this type="xml">text <b>in</b> xml</this> ]]>
                 <tag with="<brackets in values>">just testing brackets feature</tag>
                 """)

# NOTE(review): the entries of this list were lost; only the commented-out
# one survives.  Add the sample files to be timed here.
filenames = [ #"../../reportlab/demos/pythonpoint/pythonpoint.xml",
              ]

#filenames = ["moa.xml"]

if __name__=="__main__":
    # time the parsing of every listed sample file
    from time import time
    now = time()
    for f in filenames:
        fp = open(f)
        try:
            t = fp.read()
        finally:
            fp.close()
        print("parsing %s" % f)
        testparse(t)
    print("elapsed %s" % (time()-now))