# blob: 6f8fea1dc3b1f80e6afa31b46694810e8b279ecc [file] [log] [blame]
# -*- test-case-name: twisted.web.test.test_xml -*-
#
# Copyright (c) 2001-2004 Twisted Matrix Laboratories.
# See LICENSE for details.
"""
*S*mall, *U*ncomplicated *X*ML.
This is a very simple implementation of XML/HTML as a network
protocol. It is not at all clever. Its main features are that it
does not:
- support namespaces
- mung mnemonic entity references
- validate
- perform *any* external actions (such as fetching URLs or writing files)
under *any* circumstances
It does, however, have lots and lots of horrible hacks for supporting broken
HTML (as an option; they're not on by default).
"""
from twisted.internet.protocol import Protocol, FileWrapper
from twisted.python.reflect import prefixedMethodNames
# Positions of the begin/do/end handlers inside each three-tuple stored in
# the parser's state table.
BEGIN_HANDLER, DO_HANDLER, END_HANDLER = range(3)

# Punctuation accepted (alongside alphanumerics) in tag and attribute names.
identChars = '.-_:'
# Extra punctuation tolerated in the lenient-mode attribute branches.
lenientIdentChars = identChars + ';+#/%~'
def nop(*args, **kw):
    """Accept any positional or keyword arguments, do nothing, return None."""
    return None
def unionlist(*args):
    """
    Return the union of several iterables as a list without duplicates.

    Items are kept in the order in which they are first seen, so the result
    is deterministic and is always a real list (not a dictionary view).

    @param args: any number of iterables of hashable items.
    @return: a new list holding each distinct item exactly once.
    """
    seen = set()
    merged = []
    for sequence in args:
        for item in sequence:
            if item not in seen:
                seen.add(item)
                merged.append(item)
    return merged
def zipfndict(*args, **kw):
    """
    Zip several dictionaries together key by key.

    Every key present in any of the given dictionaries maps, in the result,
    to a tuple with one entry per input dictionary (in argument order).  A
    dictionary lacking the key contributes the C{default} keyword argument,
    which itself defaults to L{nop}.
    """
    fallback = kw.get('default', nop)
    keyLists = [fndict.keys() for fndict in args]
    zipped = {}
    for name in unionlist(*keyLists):
        zipped[name] = tuple([fndict.get(name, fallback) for fndict in args])
    return zipped
def prefixedMethodClassDict(clazz, prefix):
    """
    Map each name reported by C{prefixedMethodNames(clazz, prefix)} to the
    attribute C{prefix + name} looked up on the class C{clazz}.
    """
    methods = {}
    for name in prefixedMethodNames(clazz, prefix):
        methods[name] = getattr(clazz, prefix + name)
    return methods
def prefixedMethodObjDict(obj, prefix):
    """
    Map each name reported by C{prefixedMethodNames(obj.__class__, prefix)}
    to the attribute C{prefix + name} looked up on the instance C{obj}.
    """
    methods = {}
    for name in prefixedMethodNames(obj.__class__, prefix):
        methods[name] = getattr(obj, prefix + name)
    return methods
class ParseError(Exception):
    """
    Error raised by the parser, carrying the position at which it occurred.

    The string form is C{filename:line:col: message}.
    """

    def __init__(self, filename, line, col, message):
        # Keep every piece of the location available as a plain attribute.
        self.filename, self.line = filename, line
        self.col, self.message = col, message

    def __str__(self):
        location = (self.filename, self.line, self.col, self.message)
        return "%s:%s:%s: %s" % location
class XMLParser(Protocol):
    """
    A character-at-a-time state machine for XML/HTML.

    Each state C{foo} is implemented by up to three methods:
    C{begin_foo(byte)}, C{do_foo(byte)} and C{end_foo()}.  L{dataReceived}
    feeds every character to the current state's C{do_} handler; when that
    handler returns the name of a different state, the old state's C{end_}
    handler and the new state's C{begin_} handler are called.  Missing
    handlers default to L{nop}.

    Parse results are reported through the C{got*} methods at the bottom of
    the class, which subclasses are expected to override.
    """

    # Name of the current state; None until the first data arrives.
    state = None
    # Encodings applied (in order) to incoming data; a list once connected.
    encodings = None
    # File name reported in ParseError messages.
    filename = "<xml />"
    # When true, enables the many hacks for tolerating broken HTML.
    beExtremelyLenient = 0
    # Byte-order mark stripped from the first chunk, re-added before decoding.
    _prepend = None
    # Per-instance cache of the state table built by _buildStateTable.
    _stateTable = None

    # _leadingBodyData will sometimes be set before switching to the
    # 'bodydata' state, when we "accidentally" read a byte of bodydata
    # in a different state.
    _leadingBodyData = None

    def connectionMade(self):
        """Reset the position counters and the list of encodings."""
        self.lineno = 1
        self.colno = 0
        self.encodings = []

    def saveMark(self):
        """Get the line number and column of the last character parsed"""
        # This gets replaced during dataReceived, restored afterwards
        return (self.lineno, self.colno)

    def _parseError(self, message):
        """Raise a L{ParseError} for C{message} at the current position."""
        raise ParseError(*((self.filename,) + self.saveMark() + (message,)))

    def _buildStateTable(self):
        """
        Return a dictionary of begin, do, end state function tuples.

        The table is built once per instance and then reused.  It has to be
        per instance (not per class) because it holds methods bound to
        C{self}.  The earlier class-level cache was written through the
        name-mangled attribute C{_XMLParser__stateTable} but read back with
        the unmangled string C{'__stateTable'}, so it never hit and the
        table was rebuilt on every call.
        """
        stateTable = self._stateTable
        if stateTable is None:
            stateTable = self._stateTable = zipfndict(
                *[prefixedMethodObjDict(self, prefix)
                  for prefix in ('begin_', 'do_', 'end_')])
        return stateTable

    def _decode(self, data):
        """
        Decode C{data} through every encoding in C{self.encodings}, first
        re-attaching the byte-order mark saved in C{self._prepend}, if any.
        """
        if 'UTF-16' in self.encodings or 'UCS-2' in self.encodings:
            assert not len(data) & 1, 'UTF-16 must come in pairs for now'
        if self._prepend:
            data = self._prepend + data
        for encoding in self.encodings:
            data = unicode(data, encoding)
        return data

    def maybeBodyData(self):
        """
        Pick the state that follows the end of a tag in lenient mode:
        C{'waitforendscript'} after an opening C{<script>} tag without a
        C{src} attribute, C{'bodydata'} otherwise.
        """
        if self.endtag:
            return 'bodydata'

        # Get ready for fun! We're going to allow
        # <script>if (foo < bar)</script> to work!
        # We do this by making everything between <script> and
        # </script> a Text
        # BUT <script src="foo"> will be special-cased to do regular,
        # lenient behavior, because those may not have </script>
        # -radix
        if self.tagName == 'script' and 'src' not in self.tagAttributes:
            # we do this ourselves rather than having begin_waitforendscript
            # because that can get called multiple times and we don't want
            # bodydata to get reset other than the first time.
            self.begin_bodydata(None)
            return 'waitforendscript'
        return 'bodydata'

    def dataReceived(self, data):
        """
        Run the state machine over every character of C{data}.

        On the very first call a UTF-16 byte-order mark, if present, is
        stripped and remembered so that this and later chunks are decoded.
        """
        stateTable = self._buildStateTable()
        if not self.state:
            # all UTF-16 starts with this string
            if data.startswith('\xff\xfe'):
                self._prepend = '\xff\xfe'
                self.encodings.append('UTF-16')
                data = data[2:]
            elif data.startswith('\xfe\xff'):
                self._prepend = '\xfe\xff'
                self.encodings.append('UTF-16')
                data = data[2:]
            self.state = 'begin'
        if self.encodings:
            data = self._decode(data)
        # bring state, lineno, colno into local scope
        lineno, colno = self.lineno, self.colno
        curState = self.state
        # replace saveMark with a nested scope function so that error
        # positions reflect the local counters while the loop is running
        _saveMark = self.saveMark

        def saveMark():
            return (lineno, colno)
        self.saveMark = saveMark
        # fetch functions from the stateTable
        beginFn, doFn, endFn = stateTable[curState]
        try:
            for byte in data:
                # do newline stuff
                if byte == '\n':
                    lineno += 1
                    colno = 0
                else:
                    colno += 1
                newState = doFn(byte)
                if newState is not None and newState != curState:
                    # this is the endFn from the previous state
                    endFn()
                    curState = newState
                    beginFn, doFn, endFn = stateTable[curState]
                    beginFn(byte)
        finally:
            self.saveMark = _saveMark
            self.lineno, self.colno = lineno, colno
        # state doesn't make sense if there's an exception..
        self.state = curState

    def connectionLost(self, reason):
        """
        End the last state we were in.

        If no data was ever received there is no state to end, so nothing
        is done (previously this raised C{KeyError}).
        """
        if not self.state:
            return
        stateTable = self._buildStateTable()
        stateTable[self.state][END_HANDLER]()

    # state methods

    def do_begin(self, byte):
        # Skip leading whitespace; the first real character must be '<'
        # unless we are lenient, in which case it starts the body text.
        if byte.isspace():
            return
        if byte != '<':
            if self.beExtremelyLenient:
                self._leadingBodyData = byte
                return 'bodydata'
            self._parseError("First char of document [%r] wasn't <" % (byte,))
        return 'tagstart'

    def begin_comment(self, byte):
        self.commentbuf = ''

    def do_comment(self, byte):
        # Accumulate until the closing '-->', then report the comment body.
        self.commentbuf += byte
        if self.commentbuf.endswith('-->'):
            self.gotComment(self.commentbuf[:-3])
            return 'bodydata'

    def begin_tagstart(self, byte):
        self.tagName = ''        # name of the tag
        self.tagAttributes = {}  # attributes of the tag
        self.termtag = 0         # is the tag self-terminating
        self.endtag = 0

    def do_tagstart(self, byte):
        if byte.isalnum() or byte in identChars:
            self.tagName += byte
            if self.tagName == '!--':
                return 'comment'
        elif byte.isspace():
            if self.tagName:
                if self.endtag:
                    # properly strict thing to do here is probably to only
                    # accept whitespace
                    return 'waitforgt'
                return 'attrs'
            else:
                self._parseError("Whitespace before tag-name")
        elif byte == '>':
            if self.endtag:
                self.gotTagEnd(self.tagName)
                return 'bodydata'
            else:
                self.gotTagStart(self.tagName, {})
                if not self.beExtremelyLenient:
                    return 'bodydata'
                return self.maybeBodyData()
        elif byte == '/':
            if self.tagName:
                return 'afterslash'
            else:
                self.endtag = 1
        elif byte in '!?':
            if self.tagName:
                if not self.beExtremelyLenient:
                    self._parseError("Invalid character in tag-name")
            else:
                self.tagName += byte
                self.termtag = 1
        elif byte == '[':
            if self.tagName == '!':
                return 'expectcdata'
            else:
                self._parseError("Invalid '[' in tag-name")
        else:
            if self.beExtremelyLenient:
                # Treat the '<' as literal text rather than a tag opener.
                self.bodydata = '<'
                return 'unentity'
            self._parseError('Invalid tag character: %r' % byte)

    def begin_unentity(self, byte):
        self.bodydata += byte

    def do_unentity(self, byte):
        self.bodydata += byte
        return 'bodydata'

    def end_unentity(self):
        self.gotText(self.bodydata)

    def begin_expectcdata(self, byte):
        self.cdatabuf = byte

    def do_expectcdata(self, byte):
        # Match the input, one character at a time, against '[CDATA['.
        self.cdatabuf += byte
        cdb = self.cdatabuf
        cd = '[CDATA['
        if len(cd) > len(cdb):
            if cd.startswith(cdb):
                return
            elif self.beExtremelyLenient:
                ## WHAT THE CRAP!?  MSWord9 generates HTML that includes these
                ## bizarre <![if !foo]> <![endif]> chunks, so I've gotta ignore
                ## 'em as best I can.  this should really be a separate parse
                ## state but I don't even have any idea what these _are_.
                return 'waitforgt'
            else:
                self._parseError("Mal-formed CDATA header")
        if cd == cdb:
            self.cdatabuf = ''
            return 'cdata'
        self._parseError("Mal-formed CDATA header")

    def do_cdata(self, byte):
        self.cdatabuf += byte
        if self.cdatabuf.endswith("]]>"):
            self.cdatabuf = self.cdatabuf[:-3]
            return 'bodydata'

    def end_cdata(self):
        self.gotCData(self.cdatabuf)
        self.cdatabuf = ''

    def do_attrs(self, byte):
        if byte.isalnum() or byte in identChars:
            # XXX FIXME really handle !DOCTYPE at some point
            if self.tagName == '!DOCTYPE':
                return 'doctype'
            if self.tagName[0] in '!?':
                return 'waitforgt'
            return 'attrname'
        elif byte.isspace():
            return
        elif byte == '>':
            self.gotTagStart(self.tagName, self.tagAttributes)
            if not self.beExtremelyLenient:
                return 'bodydata'
            return self.maybeBodyData()
        elif byte == '/':
            return 'afterslash'
        elif self.beExtremelyLenient:
            # discard and move on?  Only case I've seen of this so far was:
            # <foo bar="baz"">
            return
        self._parseError("Unexpected character: %r" % byte)

    def begin_doctype(self, byte):
        self.doctype = byte

    def do_doctype(self, byte):
        if byte == '>':
            return 'bodydata'
        self.doctype += byte

    def end_doctype(self):
        self.gotDoctype(self.doctype)
        self.doctype = None

    def do_waitforgt(self, byte):
        # Ignore everything up to the closing '>'.
        if byte == '>':
            if self.endtag or not self.beExtremelyLenient:
                return 'bodydata'
            return self.maybeBodyData()

    def begin_attrname(self, byte):
        self.attrname = byte
        self._attrname_termtag = 0

    def do_attrname(self, byte):
        if byte.isalnum() or byte in identChars:
            self.attrname += byte
            return
        elif byte == '=':
            return 'beforeattrval'
        elif byte.isspace():
            return 'beforeeq'
        elif self.beExtremelyLenient:
            if byte in '"\'':
                return 'attrval'
            if byte in lenientIdentChars or byte.isalnum():
                self.attrname += byte
                return
            # NOTE(review): '/' is a member of lenientIdentChars, so the
            # branch above always consumes it and this one looks
            # unreachable.  Left as-is to avoid changing parse results.
            if byte == '/':
                self._attrname_termtag = 1
                return
            if byte == '>':
                # Attribute without a value: record it as 'True'.
                self.attrval = 'True'
                self.tagAttributes[self.attrname] = self.attrval
                self.gotTagStart(self.tagName, self.tagAttributes)
                if self._attrname_termtag:
                    self.gotTagEnd(self.tagName)
                    return 'bodydata'
                return self.maybeBodyData()
            # something is really broken. let's leave this attribute where it
            # is and move on to the next thing
            return
        self._parseError("Invalid attribute name: %r %r" % (self.attrname, byte))

    def do_beforeattrval(self, byte):
        if byte in '"\'':
            return 'attrval'
        elif byte.isspace():
            return
        elif self.beExtremelyLenient:
            if byte in lenientIdentChars or byte.isalnum():
                return 'messyattr'
            if byte == '>':
                self.attrval = 'True'
                self.tagAttributes[self.attrname] = self.attrval
                self.gotTagStart(self.tagName, self.tagAttributes)
                return self.maybeBodyData()
            if byte == '\\':
                # I saw this in actual HTML once:
                # <font size=\"3\"><sup>SM</sup></font>
                return
        self._parseError("Invalid initial attribute value: %r; Attribute values must be quoted." % byte)

    # Class-level defaults for the attribute currently being parsed.
    attrname = ''
    attrval = ''

    def begin_beforeeq(self, byte):
        self._beforeeq_termtag = 0

    def do_beforeeq(self, byte):
        if byte == '=':
            return 'beforeattrval'
        elif byte.isspace():
            return
        elif self.beExtremelyLenient:
            if byte.isalnum() or byte in identChars:
                # Previous attribute had no value; a new attribute starts.
                self.attrval = 'True'
                self.tagAttributes[self.attrname] = self.attrval
                return 'attrname'
            elif byte == '>':
                self.attrval = 'True'
                self.tagAttributes[self.attrname] = self.attrval
                self.gotTagStart(self.tagName, self.tagAttributes)
                if self._beforeeq_termtag:
                    self.gotTagEnd(self.tagName)
                    return 'bodydata'
                return self.maybeBodyData()
            elif byte == '/':
                self._beforeeq_termtag = 1
                return
        self._parseError("Invalid attribute")

    def begin_attrval(self, byte):
        # Remember which quote character opened the value.
        self.quotetype = byte
        self.attrval = ''

    def do_attrval(self, byte):
        if byte == self.quotetype:
            return 'attrs'
        self.attrval += byte

    def end_attrval(self):
        self.tagAttributes[self.attrname] = self.attrval
        self.attrname = self.attrval = ''

    def begin_messyattr(self, byte):
        self.attrval = byte

    def do_messyattr(self, byte):
        # Unquoted attribute value (lenient mode only).
        if byte.isspace():
            return 'attrs'
        elif byte == '>':
            endTag = 0
            if self.attrval.endswith('/'):
                # A trailing '/' closes the tag rather than ending the value.
                endTag = 1
                self.attrval = self.attrval[:-1]
            self.tagAttributes[self.attrname] = self.attrval
            self.gotTagStart(self.tagName, self.tagAttributes)
            if endTag:
                self.gotTagEnd(self.tagName)
                return 'bodydata'
            return self.maybeBodyData()
        else:
            self.attrval += byte

    def end_messyattr(self):
        if self.attrval:
            self.tagAttributes[self.attrname] = self.attrval

    def begin_afterslash(self, byte):
        self._after_slash_closed = 0

    def do_afterslash(self, byte):
        # this state is only after a self-terminating slash, e.g. <foo/>
        if self._after_slash_closed:
            self._parseError("Mal-formed")  # XXX When does this happen??
        if byte != '>':
            if self.beExtremelyLenient:
                return
            else:
                self._parseError("No data allowed after '/'")
        self._after_slash_closed = 1
        self.gotTagStart(self.tagName, self.tagAttributes)
        self.gotTagEnd(self.tagName)
        # don't need maybeBodyData here because there better not be
        # any javascript code after a <script/>... we'll see :(
        return 'bodydata'

    def begin_bodydata(self, byte):
        if self._leadingBodyData:
            self.bodydata = self._leadingBodyData
            # Drop the instance attribute so the class default (None) shows.
            del self._leadingBodyData
        else:
            self.bodydata = ''

    def do_bodydata(self, byte):
        if byte == '<':
            return 'tagstart'
        if byte == '&':
            return 'entityref'
        self.bodydata += byte

    def end_bodydata(self):
        self.gotText(self.bodydata)
        self.bodydata = ''

    def do_waitforendscript(self, byte):
        if byte == '<':
            return 'waitscriptendtag'
        self.bodydata += byte

    def begin_waitscriptendtag(self, byte):
        self.temptagdata = ''
        self.tagName = ''
        self.endtag = 0

    def do_waitscriptendtag(self, byte):
        # 1 enforce / as first byte read
        # 2 enforce following bytes to be subset of "script" until
        #   tagName == "script"
        #   2a when that happens, gotText(self.bodydata) and gotTagEnd(self.tagName)
        # 3 spaces can happen anywhere, they're ignored
        #   e.g. < / script >
        # 4 anything else causes all data I've read to be moved to the
        #   bodydata, and switch back to waitforendscript state

        # If it turns out this _isn't_ a </script>, we need to
        # remember all the data we've been through so we can append it
        # to bodydata
        self.temptagdata += byte

        # 1
        if byte == '/':
            self.endtag = True
        elif not self.endtag:
            self.bodydata += "<" + self.temptagdata
            return 'waitforendscript'
        # 2
        elif byte.isalnum() or byte in identChars:
            self.tagName += byte
            if not 'script'.startswith(self.tagName):
                self.bodydata += "<" + self.temptagdata
                return 'waitforendscript'
            elif self.tagName == 'script':
                self.gotText(self.bodydata)
                self.gotTagEnd(self.tagName)
                return 'waitforgt'
        # 3
        elif byte.isspace():
            return 'waitscriptendtag'
        # 4
        else:
            self.bodydata += "<" + self.temptagdata
            return 'waitforendscript'

    def begin_entityref(self, byte):
        self.erefbuf = ''
        self.erefextra = ''  # extra bit for lenient mode

    def do_entityref(self, byte):
        if byte.isspace() or byte == "<":
            if self.beExtremelyLenient:
                # '&foo' probably was '&amp;foo'
                if self.erefbuf and self.erefbuf != "amp":
                    self.erefextra = self.erefbuf
                self.erefbuf = "amp"
                if byte == "<":
                    return "tagstart"
                else:
                    self.erefextra += byte
                    return 'spacebodydata'
            self._parseError("Bad entity reference")
        elif byte != ';':
            self.erefbuf += byte
        else:
            return 'bodydata'

    def end_entityref(self):
        self.gotEntityReference(self.erefbuf)

    # hacky support for space after & in entityref in beExtremelyLenient
    # state should only happen in that case
    def begin_spacebodydata(self, byte):
        self.bodydata = self.erefextra
        self.erefextra = None
    do_spacebodydata = do_bodydata
    end_spacebodydata = end_bodydata

    # Sorta SAX-ish API

    def gotTagStart(self, name, attributes):
        """Encountered an opening tag.

        Default behaviour is to print."""
        # Single-argument print: same output whether print is a statement
        # or a function.
        print('begin %s %s' % (name, attributes))

    def gotText(self, data):
        """Encountered text

        Default behaviour is to print."""
        print('text: %s' % (repr(data),))

    def gotEntityReference(self, entityRef):
        """Encountered mnemonic entity reference

        Default behaviour is to print."""
        print('entityRef: &%s;' % entityRef)

    def gotComment(self, comment):
        """Encountered comment.

        Default behaviour is to ignore."""
        pass

    def gotCData(self, cdata):
        """Encountered CDATA

        Default behaviour is to call the gotText method"""
        self.gotText(cdata)

    def gotDoctype(self, doctype):
        """Encountered DOCTYPE

        This is really grotty: it basically just gives you everything between
        '<!DOCTYPE' and '>' as an argument.
        """
        print('!DOCTYPE %s' % (repr(doctype),))

    def gotTagEnd(self, name):
        """Encountered closing tag

        Default behaviour is to print."""
        print('end %s' % (name,))
if __name__ == '__main__':
    # Ad-hoc demonstration: push a small document through a bare XMLParser,
    # whose default got* callbacks print each event they receive.
    from cStringIO import StringIO
    testDocument = '''
<!DOCTYPE ignore all this shit, hah its malformed!!!!@$>
<?xml version="suck it"?>
<foo>
&#65;
<bar />
<baz boz="buz">boz &zop;</baz>
<![CDATA[ foo bar baz ]]>
</foo>
'''
    # Alternative input files; only used if the read below is re-enabled.
    # fn = "/home/glyph/Projects/Twisted/doc/howto/ipc10paper.html"
    fn = "/home/glyph/gruesome.xml"
    # testDocument = open(fn).read()
    parser = XMLParser()
    # Attach an in-memory transport so that connectionMade() runs.
    parser.makeConnection(FileWrapper(StringIO()))
    parser.dataReceived(testDocument)