author | rgbecker |
Wed, 03 Sep 2008 16:10:51 +0000 | |
changeset 2964 | 32352db0d71e |
parent 2945 | reportlab/lib/rparsexml.py@a6fdc0a2035b |
child 3028 | 082f5208644e |
permissions | -rw-r--r-- |
1724 | 1 |
"""Radically simple xml parsing |
2 |
||
3 |
Example parse |
|
4 |
||
5 |
<this type="xml">text <b>in</b> xml</this> |
|
6 |
||
7 |
( "this", |
|
8 |
{"type": "xml"}, |
|
9 |
[ "text ", |
|
10 |
("b", None, ["in"], None), |
|
11 |
" xml" |
|
12 |
] |
|
13 |
None ) |
|
14 |
||
15 |
{ 0: "this" |
|
16 |
"type": "xml" |
|
17 |
1: ["text ", |
|
18 |
{0: "b", 1:["in"]}, |
|
19 |
" xml"] |
|
20 |
} |
|
21 |
||
22 |
Ie, xml tag translates to a tuple: |
|
23 |
(name, dictofattributes, contentlist, miscellaneousinfo) |
|
24 |
||
25 |
where miscellaneousinfo can be anything, (but defaults to None) |
|
26 |
(with the intention of adding, eg, line number information) |
|
1771 | 27 |
|
1724 | 28 |
special cases: name of "" means "top level, no containing tag". |
29 |
Top level parse always looks like this

("", None, list, None)
|
32 |
||
33 |
contained text of None means <simple_tag/>
|
1771 | 34 |
|
1724 | 35 |
The unnamed top level exists in order to support stuff like
36 |
||
37 |
<this></this><one></one> |
|
1771 | 38 |
|
1724 | 39 |
AT THE MOMENT &amp; ETCETERA ARE IGNORED. THEY MUST BE PROCESSED
40 |
IN A POST-PROCESSING STEP. |
|
41 |
||
42 |
PROLOGUES ARE NOT UNDERSTOOD. OTHER STUFF IS PROBABLY MISSING. |
|
43 |
""" |
|
44 |
||
45 |
# Set this to 1 to disable the nonvalidating fallback parser.
RequirePyRXP = 0
|
46 |
||
47 |
import string |
|
48 |
try:
    # Prefer the pyRXPU C parser when it is installed; the ImportError handler
    # below switches the module over to the pure-Python fallback.
    simpleparse = 0
    import pyRXPU

    def warnCB(s):
        """Warning callback handed to the pyRXPU parser: echo the message."""
        print(s)

    pyRXP_parser = pyRXPU.Parser(
        ErrorOnValidityErrors=1,
        NoNoDTDWarning=1,
        ExpandCharacterEntities=1,
        ExpandGeneralEntities=1,
        warnCB=warnCB,
        srcName='string input',
        ReturnUTF8=1,
    )

    def parsexml(xmlText, oneOutermostTag=0, eoCB=None, entityReplacer=None, parseOpts=None):
        """Parse xmlText with the shared pyRXPU parser.

        eoCB is installed on the shared parser object before parsing.
        parseOpts, when given, is a dict of keyword options forwarded to
        pyRXP_parser.parse().  entityReplacer is accepted for signature
        compatibility with parsexmlSimple and is not used here.

        Returns the parser's result as is when oneOutermostTag is true,
        otherwise that result wrapped in an unnamed top-level tuple
        ('', None, [result], None).
        """
        pyRXP_parser.eoCB = eoCB
        p = pyRXP_parser.parse(xmlText, **(parseOpts or {}))
        # Explicit branch instead of "a and p or b", which silently picks the
        # wrapper whenever p happens to be falsy.
        if oneOutermostTag:
            return p
        return ('', None, [p], None)
except ImportError:
    simpleparse = 1
|
1771 | 69 |
|
1724 | 70 |
NONAME = ""
NAMEKEY = 0
CONTENTSKEY = 1
CDATAMARKER = "<![CDATA["
LENCDATAMARKER = len(CDATAMARKER)
CDATAENDMARKER = "]]>"
# (entity, replacement) pairs.  "&amp;" must be last: otherwise text such as
# "&amp;lt;" would first become "&lt;" and then be unescaped a second time.
replacelist = [("&lt;", "<"), ("&gt;", ">"), ("&amp;", "&")]


def unEscapeContentList(contentList):
    """Return a copy of contentList with the entities in replacelist expanded.

    Every element that contains "&" has each (old, new) pair of replacelist
    applied in order; all other elements are copied through unchanged.
    The input list is not modified.
    """
    result = []
    for e in contentList:
        if "&" in e:
            for (old, new) in replacelist:
                # str method rather than string.replace(), which no longer
                # exists on Python 3.
                e = e.replace(old, new)
        result.append(e)
    return result
|
87 |
||
1984 | 88 |
def parsexmlSimple(xmltext, oneOutermostTag=0, eoCB=None, entityReplacer=unEscapeContentList):
    """Official interface of the fallback parser: discard unused cursor info.

    Runs parsexml0 on xmltext and returns only the parse tree.  When
    oneOutermostTag is true the first entry of the top-level content list is
    returned instead of the unnamed top-level tuple.  eoCB is accepted so the
    signature matches the pyRXP-based parsexml; it is not used here.

    Raises ImportError when RequirePyRXP is set, i.e. when the fallback
    parser has been disabled.
    """
    if RequirePyRXP:
        raise ImportError("pyRXP not found, fallback parser disabled")
    result, _cursor = parsexml0(xmltext, entityReplacer=entityReplacer)
    if oneOutermostTag:
        return result[2][0]
    return result


# pyRXPU could not be imported: publish the fallback under the public name.
if simpleparse:
    parsexml = parsexmlSimple
|
100 |
||
101 |
def parseFile(filename):
    """Read the whole of filename in text mode and return parsexml() of it."""
    # Context manager so the handle is closed even if read() raises.
    with open(filename, 'r') as f:
        raw = f.read()
    return parsexml(raw)
|
104 |
||
105 |
verbose = 0


def skip_prologue(text, cursor):
    """Skip any prologue found after cursor, return index of rest of text.

    Starting at cursor, repeatedly looks at the next "<".  While it opens a
    "<!DOCTYPE", "<?xml" or "<!--" construct the cursor is moved just past
    that construct.  The cursor is returned unchanged when the next "<" is
    not a prologue element or when there is no "<" at all.

    Raises ValueError when a prologue element is never closed.
    """
    ### NOT AT ALL COMPLETE!!! definitely can be confused!!!
    # (e.g. a DOCTYPE with an internal subset containing ">")
    # Text expected right after "<" -> the terminator that ends the element.
    # Comments end at "-->" rather than the first ">", so a ">" inside a
    # comment no longer leaves the cursor in the middle of that comment.
    prologue_elements = (("!DOCTYPE", ">"), ("?xml", ">"), ("!--", "-->"))
    while True:
        openbracket = text.find("<", cursor)
        if openbracket < 0:
            break
        past = openbracket + 1
        for opener, closer in prologue_elements:
            if text.startswith(opener, past):
                end = text.find(closer, past + len(opener))
                if end < 0:
                    raise ValueError("can't close prologue %s" % repr(opener))
                cursor = end + len(closer)
                break
        else:
            # the next "<" is not a prologue element: nothing more to skip
            break
    return cursor
|
1771 | 131 |
|
132 |
def _strip(s):
    """Method-based stand-in for string.strip(), which Python 3 removed."""
    return s.strip()


def _split(s, sep=None):
    """Method-based stand-in for string.split(), which Python 3 removed."""
    return s.split(sep)


def _find(s, sub, start=0):
    """Method-based stand-in for string.find(), which Python 3 removed."""
    return s.find(sub, start)


def parsexml0(xmltext, startingat=0, toplevel=1,
        # helpers bound as defaults so they are local names inside the parser
        strip=_strip, split=_split, find=_find, entityReplacer=unEscapeContentList,
        ):
    """Simple recursive descent xml parser.

    Parses the construct that starts at the first "<" at or after
    startingat.  With toplevel set (anything but None) the text is stripped,
    any prologue is skipped and everything is collected under an unnamed
    (NONAME) node.  Nested constructs are parsed by recursive calls made
    with toplevel=None.

    Returns ((name, attributes, contents, extra), cursor), where cursor is
    the index just past the parsed construct.  attributes is a dict, or None
    when the tag has no "=" in it.  contents is a list of strings and nested
    tuples, or None for a <simple/> tag.  extra is always None.  A CDATA
    section is returned as a node named CDATAMARKER holding its raw text.
    Special case: a comment returns (None, cursor).

    When entityReplacer is true it is applied to every non-empty content
    list before that list is returned.

    Raises ValueError on input the parser cannot make sense of.
    """
    # DEFAULTS
    NameString = NONAME
    ContentList = AttDict = ExtraStuff = None
    if toplevel is not None:
        xmltext = strip(xmltext)
    cursor = startingat
    # look for interesting starting points
    firstbracket = find(xmltext, "<", cursor)
    afterbracket2char = xmltext[firstbracket+1:firstbracket+3]
    docontents = 1
    if firstbracket < 0:
        # no tags at all
        if toplevel is not None:
            ContentList = [xmltext[cursor:]]
            if entityReplacer:
                ContentList = entityReplacer(ContentList)
            return (NameString, AttDict, ContentList, ExtraStuff), len(xmltext)
        raise ValueError("no tags at non-toplevel %s" % repr(xmltext[cursor:cursor+20]))
    L = []
    # look for start tag
    if toplevel is not None:
        # the outer level is always unnamed
        NameString = name = NONAME
        cursor = skip_prologue(xmltext, cursor)
    # special case: CDATA
    elif afterbracket2char == "![" and xmltext[firstbracket:firstbracket+LENCDATAMARKER] == CDATAMARKER:
        # skip straight to the close marker
        startcdata = firstbracket + LENCDATAMARKER
        endcdata = find(xmltext, CDATAENDMARKER, startcdata)
        if endcdata < 0:
            raise ValueError("unclosed CDATA %s" % repr(xmltext[cursor:cursor+20]))
        NameString = CDATAMARKER
        ContentList = [xmltext[startcdata:endcdata]]
        cursor = endcdata + len(CDATAENDMARKER)
        docontents = None
    # special case COMMENT
    elif afterbracket2char == "!-" and xmltext[firstbracket:firstbracket+4] == "<!--":
        endcommentdashes = find(xmltext, "--", firstbracket+4)
        if endcommentdashes < firstbracket:
            raise ValueError("unterminated comment %s" % repr(xmltext[cursor:cursor+20]))
        endcomment = endcommentdashes + 2
        # slice, not index: "--" may be the very last thing in the text
        if xmltext[endcomment:endcomment+1] != ">":
            raise ValueError("invalid comment: contains double dashes %s" % repr(xmltext[cursor:cursor+20]))
        return (None, endcomment+1)  # shortcut exit
    else:
        # get the rest of the tag
        closebracket = find(xmltext, ">", firstbracket)
        if closebracket < 0:
            # Checked up front for every tag.  Without this a tag with no "="
            # would continue with cursor reset to 0 and recurse without bound.
            raise ValueError("unclosed start tag %s" % repr(xmltext[firstbracket:firstbracket+20]))
        startsearch = closebracket + 1
        pastfirstbracket = firstbracket + 1
        tagcontent = xmltext[pastfirstbracket:closebracket]
        # shortcut, no equal means nothing but name in the tag content
        if '=' not in tagcontent:
            if tagcontent[-1:] == "/":
                # simple case: <name/>
                tagcontent = tagcontent[:-1]
                docontents = None
            name = strip(tagcontent)
            if not name:
                raise ValueError("tag with no name %s" % repr(xmltext[firstbracket:firstbracket+20]))
            NameString = name
            cursor = startsearch
        else:
            if '"' in tagcontent:
                # make sure the closing ">" isn't inside a doublequote pair:
                # with balanced quotes the split has odd length.  While it is
                # even, extend the tag to the next ">".
                while len(split(tagcontent+".", '"')) % 2 == 0:
                    closebracket = find(xmltext, ">", startsearch)
                    if closebracket < 0:
                        raise ValueError("unclosed start tag %s" % repr(xmltext[firstbracket:firstbracket+20]))
                    startsearch = closebracket + 1
                    tagcontent = xmltext[pastfirstbracket:closebracket]
            cursor = startsearch
            # handle simple tag /> syntax
            if xmltext[closebracket-1] == "/":
                closebracket = closebracket - 1
                tagcontent = tagcontent[:-1]
                docontents = None
            tagcontent = strip(tagcontent)
            # Pieces between "=" signs: the first holds the tag name and the
            # first attribute name; each later piece holds a value followed by
            # the next attribute's name.
            taglist = split(tagcontent, "=")
            taglist0list = split(taglist[0])
            if not taglist0list:
                raise ValueError("tag with no name %s" % repr(xmltext[firstbracket:firstbracket+20]))
            name = taglist0list[0]
            NameString = name
            # now parse the attributes
            attributename = taglist0list[-1]
            # put a fake att name at end of last taglist entry for consistent parsing
            taglist[-1] = taglist[-1] + " f"
            AttDict = D = {}
            taglistindex = 1
            lasttaglistindex = len(taglist)
            while taglistindex < lasttaglistindex:
                attentry = strip(taglist[taglistindex])
                taglistindex = taglistindex + 1
                if attentry[:1] != '"':
                    raise ValueError("attribute value must start with double quotes" + repr(attentry))
                while '"' not in attentry[1:]:
                    # must have an = inside the attribute value...
                    # ">=": indexing taglist at lasttaglistindex would be an
                    # IndexError rather than the intended error.
                    if taglistindex >= lasttaglistindex:
                        raise ValueError("unclosed value " + repr(attentry))
                    nextattentry = taglist[taglistindex]
                    taglistindex = taglistindex + 1
                    attentry = "%s=%s" % (attentry, nextattentry)
                attentry = strip(attentry)  # only needed for while loop...
                attlist = split(attentry)
                nextattname = attlist[-1]
                attvalue = strip(attentry[:-len(nextattname)])
                try:
                    first = attvalue[0]
                    last = attvalue[-1]
                except IndexError:
                    raise ValueError("attvalue,attentry,attlist=" + repr((attvalue, attentry, attlist)))
                if first == last == '"' or first == last == "'":
                    attvalue = attvalue[1:-1]
                D[attributename] = attvalue
                attributename = nextattname
    # pass over other tags and content looking for end tag
    if docontents is not None:
        ContentList = L
    while docontents is not None:
        nextopenbracket = find(xmltext, "<", cursor)
        if nextopenbracket < cursor:
            # no further "<": only legal for the unnamed top level
            if name == NONAME:
                docontents = None  # done
                remainder = xmltext[cursor:]
                cursor = len(xmltext)
                if remainder:
                    L.append(remainder)
            else:
                raise ValueError("no close bracket for %s found after %s" % (name, repr(xmltext[cursor:cursor+20])))
        # is it a close bracket?  (slice: "<" may be the last character)
        elif xmltext[nextopenbracket+1:nextopenbracket+2] == "/":
            nextclosebracket = find(xmltext, ">", nextopenbracket)
            if nextclosebracket < nextopenbracket:
                raise ValueError("unclosed close tag %s" % repr(xmltext[nextopenbracket:nextopenbracket+20]))
            closetagcontents = xmltext[nextopenbracket+2:nextclosebracket]
            closetaglist = split(closetagcontents)
            if not closetaglist:
                raise ValueError("bad close tag format %s" % repr(xmltext[nextopenbracket:nextopenbracket+20]))
            # name should match
            closename = closetaglist[0]
            if name != closename:
                endlinenum = len(split(xmltext[:cursor], "\n"))
                linenum = len(split(xmltext[:startingat], "\n"))
                raise ValueError(
                    "at lines %s...%s close tag name doesn't match %s...%s %s" % (
                    linenum, endlinenum, repr(name), repr(closename), repr(xmltext[cursor:cursor+100])))
            remainder = xmltext[cursor:nextopenbracket]
            if remainder:
                L.append(remainder)
            cursor = nextclosebracket + 1
            docontents = None  # done
        # otherwise we are looking at a new tag, recursively parse it...
        else:
            # first record any intervening content
            remainder = xmltext[cursor:nextopenbracket]
            if remainder:
                L.append(remainder)
            (parsetree, cursor) = parsexml0(xmltext, startingat=nextopenbracket, toplevel=None, entityReplacer=entityReplacer)
            # comments come back as None and are dropped
            if parsetree:
                L.append(parsetree)
    # maybe should check for trailing garbage?
    if ContentList:
        if entityReplacer:
            ContentList = entityReplacer(ContentList)
    t = (NameString, AttDict, ContentList, ExtraStuff)
    return (t, cursor)
|
1771 | 364 |
|
1724 | 365 |
import types |
366 |
def pprettyprint(parsedxml):
    """Pretty printer mainly for testing.

    parsedxml is either a string, which is returned unchanged, or a
    (name, attdict, textlist, extra) tuple.  A tuple is rendered as a tag
    whose children are rendered recursively, one per line and indented by one
    space.  A textlist of None renders as a simple <name attrs/> tag, and an
    unnamed node renders as just its children.  Attribute values are shown
    with repr().

    Raises ValueError for an unnamed node that carries attributes.
    """
    try:
        string_types = basestring  # Python 2: covers both str and unicode
    except NameError:
        string_types = str  # Python 3
    if isinstance(parsedxml, string_types):
        return parsedxml
    (name, attdict, textlist, _extra) = parsedxml
    if not attdict:
        attdict = {}
    attributes = " ".join(["%s=%s" % (k, repr(v)) for k, v in attdict.items()])
    if not name and attributes:
        raise ValueError("name missing with attributes???")
    if textlist is not None:
        # with content
        textpprint = "\n".join([pprettyprint(item) for item in textlist])
        if not name:
            return textpprint  # no outer tag
        # indent it
        textpprint = " " + "\n ".join(textpprint.split("\n"))
        return "<%s %s>\n%s\n</%s>" % (name, attributes, textpprint, name)
    # otherwise must be a simple tag
    return "<%s %s/>" % (name, attributes)
|
393 |
||
394 |
# Bit flags read by testparse: 4 = pprint the parse tree, 1 = print it
# re-formatted by pprettyprint.
dump = 0


def testparse(s):
    """Parse s with the fallback parser, print the elapsed time and,
    depending on the dump flags, the resulting tree."""
    from time import time
    from pprint import pprint
    now = time()
    D = parsexmlSimple(s)
    # single pre-formatted argument: prints the same text on Python 2 and 3
    print("DONE %s" % (time() - now))
    if dump & 4:
        pprint(D)
    if dump & 1:
        print("============== reformatting")
        p = pprettyprint(D)
        print(p)
|
1771 | 408 |
|
1724 | 409 |
def test():
    """Run testparse over a small sample exercising escaped text, a simple
    tag, a comment, a CDATA section and brackets inside an attribute value."""
    # The first line must carry the escaped "&lt;&gt;": a literal "<>" is an
    # empty tag, which the parser cannot handle.
    testparse("""<this type="xml">text &lt;&gt;<b>in</b> <funnytag foo="bar"/> xml</this>
<!-- comment -->
<![CDATA[
<this type="xml">text <b>in</b> xml</this> ]]>
<tag with="<brackets in values>">just testing brackets feature</tag>
""")
|
1771 | 416 |
|
1724 | 417 |
filenames = [  # "../../reportlab/demos/pythonpoint/pythonpoint.xml",
    "samples/hamlet.xml"]

#filenames = ["moa.xml"]

# Turn on the "print re-formatted tree" flag read by testparse.
dump = 1
if __name__ == "__main__":
    test()
    from time import time
    now = time()
    for f in filenames:
        # close each sample file as soon as it has been read
        with open(f) as fh:
            t = fh.read()
        print("parsing %s" % f)
        testparse(t)
    print("elapsed %s" % (time() - now))