lib/python2.7/test/test_sgmllib.py - platform/prebuilts/python/darwin-x86/2.7.5 - Git at Google

 import pprint
 import re
 import unittest
 from test import test_support
 sgmllib = test_support.import_module('sgmllib', deprecated=True)


 class EventCollector(sgmllib.SGMLParser):

     def __init__(self):
         self.events = []
         self.append = self.events.append
         sgmllib.SGMLParser.__init__(self)

     def get_events(self):
         # Normalize the list of events so that buffer artefacts don't
         # separate runs of contiguous characters.
         L = []
         prevtype = None
         for event in self.events:
             type = event[0]
             if type == prevtype == "data":
                 L[-1] = ("data", L[-1][1] + event[1])
             else:
                 L.append(event)
             prevtype = type
         self.events = L
         return L

     # structure markup

     def unknown_starttag(self, tag, attrs):
         self.append(("starttag", tag, attrs))

     def unknown_endtag(self, tag):
         self.append(("endtag", tag))

     # all other markup

     def handle_comment(self, data):
         self.append(("comment", data))

     def handle_charref(self, data):
         self.append(("charref", data))

     def handle_data(self, data):
         self.append(("data", data))

     def handle_decl(self, decl):
         self.append(("decl", decl))

     def handle_entityref(self, data):
         self.append(("entityref", data))

     def handle_pi(self, data):
         self.append(("pi", data))

     def unknown_decl(self, decl):
         self.append(("unknown decl", decl))


 class CDATAEventCollector(EventCollector):
     def start_cdata(self, attrs):
         self.append(("starttag", "cdata", attrs))
         self.setliteral()


 class HTMLEntityCollector(EventCollector):

     entity_or_charref = re.compile('(?:&([a-zA-Z][-.a-zA-Z0-9]*)'
         '|&#(x[0-9a-zA-Z]+|[0-9]+))(;?)')

     def convert_charref(self, name):
         self.append(("charref", "convert", name))
         if name[0] != "x":
             return EventCollector.convert_charref(self, name)

     def convert_codepoint(self, codepoint):
         self.append(("codepoint", "convert", codepoint))
         EventCollector.convert_codepoint(self, codepoint)

     def convert_entityref(self, name):
         self.append(("entityref", "convert", name))
         return EventCollector.convert_entityref(self, name)

     # These to record that they were called, then pass the call along
     # to the default implementation so that it's actions can be
     # recorded.

     def handle_charref(self, data):
         self.append(("charref", data))
         sgmllib.SGMLParser.handle_charref(self, data)

     def handle_entityref(self, data):
         self.append(("entityref", data))
         sgmllib.SGMLParser.handle_entityref(self, data)


 class SGMLParserTestCase(unittest.TestCase):

     collector = EventCollector

     def get_events(self, source):
         parser = self.collector()
         try:
             for s in source:
                 parser.feed(s)
             parser.close()
         except:
             #self.events = parser.events
             raise
         return parser.get_events()

     def check_events(self, source, expected_events):
         try:
             events = self.get_events(source)
         except:
             #import sys
             #print >>sys.stderr, pprint.pformat(self.events)
             raise
         if events != expected_events:
             self.fail("received events did not match expected events\n"
                       "Expected:\n" + pprint.pformat(expected_events) +
                       "\nReceived:\n" + pprint.pformat(events))

     def check_parse_error(self, source):
         parser = EventCollector()
         try:
             parser.feed(source)
             parser.close()
         except sgmllib.SGMLParseError:
             pass
         else:
             self.fail("expected SGMLParseError for %r\nReceived:\n%s"
                       % (source, pprint.pformat(parser.get_events())))

     def test_doctype_decl_internal(self):
         inside = """\
 DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'
              SYSTEM 'http://www.w3.org/TR/html401/strict.dtd' [
   <!ELEMENT html - O EMPTY>
   <!ATTLIST html
       version CDATA #IMPLIED
       profile CDATA 'DublinCore'>
   <!NOTATION datatype SYSTEM 'http://xml.python.org/notations/python-module'>
   <!ENTITY myEntity 'internal parsed entity'>
   <!ENTITY anEntity SYSTEM 'http://xml.python.org/entities/something.xml'>
   <!ENTITY % paramEntity 'name|name|name'>
   %paramEntity;
   <!-- comment -->
 ]"""
         self.check_events(["<!%s>" % inside], [
             ("decl", inside),
             ])

     def test_doctype_decl_external(self):
         inside = "DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'"
         self.check_events("<!%s>" % inside, [
             ("decl", inside),
             ])

     def test_underscore_in_attrname(self):
         # SF bug #436621
         """Make sure attribute names with underscores are accepted"""
         self.check_events("<a has_under _under>", [
             ("starttag", "a", [("has_under", "has_under"),
                                ("_under", "_under")]),
             ])

     def test_underscore_in_tagname(self):
         # SF bug #436621
         """Make sure tag names with underscores are accepted"""
         self.check_events("<has_under></has_under>", [
             ("starttag", "has_under", []),
             ("endtag", "has_under"),
             ])

     def test_quotes_in_unquoted_attrs(self):
         # SF bug #436621
         """Be sure quotes in unquoted attributes are made part of the value"""
         self.check_events("<a href=foo'bar\"baz>", [
             ("starttag", "a", [("href", "foo'bar\"baz")]),
             ])

     def test_xhtml_empty_tag(self):
         """Handling of XHTML-style empty start tags"""
         self.check_events("<br />text<i></i>", [
             ("starttag", "br", []),
             ("data", "text"),
             ("starttag", "i", []),
             ("endtag", "i"),
             ])

     def test_processing_instruction_only(self):
         self.check_events("<?processing instruction>", [
             ("pi", "processing instruction"),
             ])

     def test_bad_nesting(self):
         self.check_events("<a><b></a></b>", [
             ("starttag", "a", []),
             ("starttag", "b", []),
             ("endtag", "a"),
             ("endtag", "b"),
             ])

     def test_bare_ampersands(self):
         self.check_events("this text & contains & ampersands &", [
             ("data", "this text & contains & ampersands &"),
             ])

     def test_bare_pointy_brackets(self):
         self.check_events("this < text > contains < bare>pointy< brackets", [
             ("data", "this < text > contains < bare>pointy< brackets"),
             ])

     def test_attr_syntax(self):
         output = [
           ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", "e")])
           ]
         self.check_events("""<a b='v' c="v" d=v e>""", output)
         self.check_events("""<a  b = 'v' c = "v" d = v e>""", output)
         self.check_events("""<a\nb\n=\n'v'\nc\n=\n"v"\nd\n=\nv\ne>""", output)
         self.check_events("""<a\tb\t=\t'v'\tc\t=\t"v"\td\t=\tv\te>""", output)

     def test_attr_values(self):
         self.check_events("""<a b='xxx\n\txxx' c="yyy\t\nyyy" d='\txyz\n'>""",
                         [("starttag", "a", [("b", "xxx\n\txxx"),
                                             ("c", "yyy\t\nyyy"),
                                             ("d", "\txyz\n")])
                          ])
         self.check_events("""<a b='' c="">""", [
             ("starttag", "a", [("b", ""), ("c", "")]),
             ])
         # URL construction stuff from RFC 1808:
         safe = "$-_.+"
         extra = "!*'(),"
         reserved = ";/?:@&="
         url = "http://example.com:8080/path/to/file?%s%s%s" % (
             safe, extra, reserved)
         self.check_events("""<e a=%s>""" % url, [
             ("starttag", "e", [("a", url)]),
             ])
         # Regression test for SF patch #669683.
         self.check_events("<e a=rgb(1,2,3)>", [
             ("starttag", "e", [("a", "rgb(1,2,3)")]),
             ])

     def test_attr_values_entities(self):
         """Substitution of entities and charrefs in attribute values"""
         # SF bug #1452246
         self.check_events("""<a b=&lt; c=&lt;&gt; d=&lt-&gt; e='&lt; '
                                 f="&xxx;" g='&#32;&#33;' h='&#500;'
                                 i='x?a=b&c=d;'
                                 j='&amp;#42;' k='&#38;#42;'>""",
             [("starttag", "a", [("b", "<"),
                                 ("c", "<>"),
                                 ("d", "&lt->"),
                                 ("e", "< "),
                                 ("f", "&xxx;"),
                                 ("g", " !"),
                                 ("h", "&#500;"),
                                 ("i", "x?a=b&c=d;"),
                                 ("j", "&#42;"),
                                 ("k", "&#42;"),
                                 ])])

     def test_convert_overrides(self):
         # This checks that the character and entity reference
         # conversion helpers are called at the documented times.  No
         # attempt is made to really change what the parser accepts.
         #
         self.collector = HTMLEntityCollector
         self.check_events(('<a title="&ldquo;test&#x201d;">foo</a>'
                            '&foobar;&#42;'), [
             ('entityref', 'convert', 'ldquo'),
             ('charref', 'convert', 'x201d'),
             ('starttag', 'a', [('title', '&ldquo;test&#x201d;')]),
             ('data', 'foo'),
             ('endtag', 'a'),
             ('entityref', 'foobar'),
             ('entityref', 'convert', 'foobar'),
             ('charref', '42'),
             ('charref', 'convert', '42'),
             ('codepoint', 'convert', 42),
             ])

     def test_attr_funky_names(self):
         self.check_events("""<a a.b='v' c:d=v e-f=v>""", [
             ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
             ])

     def test_attr_value_ip6_url(self):
         # http://www.python.org/sf/853506
         self.check_events(("<a href='http://[1080::8:800:200C:417A]/'>"
                            "<a href=http://[1080::8:800:200C:417A]/>"), [
             ("starttag", "a", [("href", "http://[1080::8:800:200C:417A]/")]),
             ("starttag", "a", [("href", "http://[1080::8:800:200C:417A]/")]),
             ])

     def test_weird_starttags(self):
         self.check_events("<a<a>", [
             ("starttag", "a", []),
             ("starttag", "a", []),
             ])
         self.check_events("</a<a>", [
             ("endtag", "a"),
             ("starttag", "a", []),
             ])

     def test_declaration_junk_chars(self):
         self.check_parse_error("<!DOCTYPE foo $ >")

     def test_get_starttag_text(self):
         s = """<foobar   \n   one="1"\ttwo=2   >"""
         self.check_events(s, [
             ("starttag", "foobar", [("one", "1"), ("two", "2")]),
             ])

     def test_cdata_content(self):
         s = ("<cdata> <!-- not a comment --> &not-an-entity-ref; </cdata>"
              "<notcdata> <!-- comment --> </notcdata>")
         self.collector = CDATAEventCollector
         self.check_events(s, [
             ("starttag", "cdata", []),
             ("data", " <!-- not a comment --> &not-an-entity-ref; "),
             ("endtag", "cdata"),
             ("starttag", "notcdata", []),
             ("data", " "),
             ("comment", " comment "),
             ("data", " "),
             ("endtag", "notcdata"),
             ])
         s = """<cdata> <not a='start tag'> </cdata>"""
         self.check_events(s, [
             ("starttag", "cdata", []),
             ("data", " <not a='start tag'> "),
             ("endtag", "cdata"),
             ])

     def test_illegal_declarations(self):
         s = 'abc<!spacer type="block" height="25">def'
         self.check_events(s, [
             ("data", "abc"),
             ("unknown decl", 'spacer type="block" height="25"'),
             ("data", "def"),
             ])

     def test_enumerated_attr_type(self):
         s = "<!DOCTYPE doc [<!ATTLIST doc attr (a | b) >]>"
         self.check_events(s, [
             ('decl', 'DOCTYPE doc [<!ATTLIST doc attr (a | b) >]'),
             ])

     def test_read_chunks(self):
         # SF bug #1541697, this caused sgml parser to hang
         # Just verify this code doesn't cause a hang.
         CHUNK = 1024  # increasing this to 8212 makes the problem go away

         f = open(test_support.findfile('sgml_input.html'))
         fp = sgmllib.SGMLParser()
         while 1:
             data = f.read(CHUNK)
             fp.feed(data)
             if len(data) != CHUNK:
                 break

     def test_only_decode_ascii(self):
         # SF bug #1651995, make sure non-ascii character references are not decoded
         s = '<signs exclamation="&#33" copyright="&#169" quoteleft="&#8216;">'
         self.check_events(s, [
             ('starttag', 'signs',
              [('exclamation', '!'), ('copyright', '&#169'),
               ('quoteleft', '&#8216;')]),
             ])

     # XXX These tests have been disabled by prefixing their names with
     # an underscore.  The first two exercise outstanding bugs in the
     # sgmllib module, and the third exhibits questionable behavior
     # that needs to be carefully considered before changing it.

     def _test_starttag_end_boundary(self):
         self.check_events("<a b='<'>", [("starttag", "a", [("b", "<")])])
         self.check_events("<a b='>'>", [("starttag", "a", [("b", ">")])])

     def _test_buffer_artefacts(self):
         output = [("starttag", "a", [("b", "<")])]
         self.check_events(["<a b='<'>"], output)
         self.check_events(["<a ", "b='<'>"], output)
         self.check_events(["<a b", "='<'>"], output)
         self.check_events(["<a b=", "'<'>"], output)
         self.check_events(["<a b='<", "'>"], output)
         self.check_events(["<a b='<'", ">"], output)

         output = [("starttag", "a", [("b", ">")])]
         self.check_events(["<a b='>'>"], output)
         self.check_events(["<a ", "b='>'>"], output)
         self.check_events(["<a b", "='>'>"], output)
         self.check_events(["<a b=", "'>'>"], output)
         self.check_events(["<a b='>", "'>"], output)
         self.check_events(["<a b='>'", ">"], output)

         output = [("comment", "abc")]
         self.check_events(["", "<!--abc-->"], output)
         self.check_events(["<", "!--abc-->"], output)
         self.check_events(["<!", "--abc-->"], output)
         self.check_events(["<!-", "-abc-->"], output)
         self.check_events(["<!--", "abc-->"], output)
         self.check_events(["<!--a", "bc-->"], output)
         self.check_events(["<!--ab", "c-->"], output)
         self.check_events(["<!--abc", "-->"], output)
         self.check_events(["<!--abc-", "->"], output)
         self.check_events(["<!--abc--", ">"], output)
         self.check_events(["<!--abc-->", ""], output)

     def _test_starttag_junk_chars(self):
         self.check_parse_error("<")
         self.check_parse_error("<>")
         self.check_parse_error("</$>")
         self.check_parse_error("</")
         self.check_parse_error("</a")
         self.check_parse_error("<$")
         self.check_parse_error("<$>")
         self.check_parse_error("<!")
         self.check_parse_error("<a $>")
         self.check_parse_error("<a")
         self.check_parse_error("<a foo='bar'")
         self.check_parse_error("<a foo='bar")
         self.check_parse_error("<a foo='>'")
         self.check_parse_error("<a foo='>")
         self.check_parse_error("<a foo=>")


 def test_main():
     test_support.run_unittest(SGMLParserTestCase)


 if __name__ == "__main__":
     test_main()
	import pprint
	import re
	import unittest
	from test import test_support
	sgmllib = test_support.import_module('sgmllib', deprecated=True)


	class EventCollector(sgmllib.SGMLParser):

	def __init__(self):
	self.events = []
	self.append = self.events.append
	sgmllib.SGMLParser.__init__(self)

	def get_events(self):
	# Normalize the list of events so that buffer artefacts don't
	# separate runs of contiguous characters.
	L = []
	prevtype = None
	for event in self.events:
	type = event[0]
	if type == prevtype == "data":
	L[-1] = ("data", L[-1][1] + event[1])
	else:
	L.append(event)
	prevtype = type
	self.events = L
	return L

	# structure markup

	def unknown_starttag(self, tag, attrs):
	self.append(("starttag", tag, attrs))

	def unknown_endtag(self, tag):
	self.append(("endtag", tag))

	# all other markup

	def handle_comment(self, data):
	self.append(("comment", data))

	def handle_charref(self, data):
	self.append(("charref", data))

	def handle_data(self, data):
	self.append(("data", data))

	def handle_decl(self, decl):
	self.append(("decl", decl))

	def handle_entityref(self, data):
	self.append(("entityref", data))

	def handle_pi(self, data):
	self.append(("pi", data))

	def unknown_decl(self, decl):
	self.append(("unknown decl", decl))


	class CDATAEventCollector(EventCollector):
	def start_cdata(self, attrs):
	self.append(("starttag", "cdata", attrs))
	self.setliteral()


	class HTMLEntityCollector(EventCollector):

	entity_or_charref = re.compile('(?:&([a-zA-Z][-.a-zA-Z0-9]*)'
	'\|&#(x[0-9a-zA-Z]+\|[0-9]+))(;?)')

	def convert_charref(self, name):
	self.append(("charref", "convert", name))
	if name[0] != "x":
	return EventCollector.convert_charref(self, name)

	def convert_codepoint(self, codepoint):
	self.append(("codepoint", "convert", codepoint))
	EventCollector.convert_codepoint(self, codepoint)

	def convert_entityref(self, name):
	self.append(("entityref", "convert", name))
	return EventCollector.convert_entityref(self, name)

	# These to record that they were called, then pass the call along
	# to the default implementation so that it's actions can be
	# recorded.

	def handle_charref(self, data):
	self.append(("charref", data))
	sgmllib.SGMLParser.handle_charref(self, data)

	def handle_entityref(self, data):
	self.append(("entityref", data))
	sgmllib.SGMLParser.handle_entityref(self, data)


	class SGMLParserTestCase(unittest.TestCase):

	collector = EventCollector

	def get_events(self, source):
	parser = self.collector()
	try:
	for s in source:
	parser.feed(s)
	parser.close()
	except:
	#self.events = parser.events
	raise
	return parser.get_events()

	def check_events(self, source, expected_events):
	try:
	events = self.get_events(source)
	except:
	#import sys
	#print >>sys.stderr, pprint.pformat(self.events)
	raise
	if events != expected_events:
	self.fail("received events did not match expected events\n"
	"Expected:\n" + pprint.pformat(expected_events) +
	"\nReceived:\n" + pprint.pformat(events))

	def check_parse_error(self, source):
	parser = EventCollector()
	try:
	parser.feed(source)
	parser.close()
	except sgmllib.SGMLParseError:
	pass
	else:
	self.fail("expected SGMLParseError for %r\nReceived:\n%s"
	% (source, pprint.pformat(parser.get_events())))

	def test_doctype_decl_internal(self):
	inside = """\
	DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'
	SYSTEM 'http://www.w3.org/TR/html401/strict.dtd' [
	<!ELEMENT html - O EMPTY>
	<!ATTLIST html
	version CDATA #IMPLIED
	profile CDATA 'DublinCore'>
	<!NOTATION datatype SYSTEM 'http://xml.python.org/notations/python-module'>
	<!ENTITY myEntity 'internal parsed entity'>
	<!ENTITY anEntity SYSTEM 'http://xml.python.org/entities/something.xml'>
	<!ENTITY % paramEntity 'name\|name\|name'>
	%paramEntity;
	<!-- comment -->
	]"""
	self.check_events(["<!%s>" % inside], [
	("decl", inside),
	])

	def test_doctype_decl_external(self):
	inside = "DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'"
	self.check_events("<!%s>" % inside, [
	("decl", inside),
	])

	def test_underscore_in_attrname(self):
	# SF bug #436621
	"""Make sure attribute names with underscores are accepted"""
	self.check_events("<a has_under _under>", [
	("starttag", "a", [("has_under", "has_under"),
	("_under", "_under")]),
	])

	def test_underscore_in_tagname(self):
	# SF bug #436621
	"""Make sure tag names with underscores are accepted"""
	self.check_events("<has_under></has_under>", [
	("starttag", "has_under", []),
	("endtag", "has_under"),
	])

	def test_quotes_in_unquoted_attrs(self):
	# SF bug #436621
	"""Be sure quotes in unquoted attributes are made part of the value"""
	self.check_events("<a href=foo'bar\"baz>", [
	("starttag", "a", [("href", "foo'bar\"baz")]),
	])

	def test_xhtml_empty_tag(self):
	"""Handling of XHTML-style empty start tags"""
	self.check_events("<br />text<i></i>", [
	("starttag", "br", []),
	("data", "text"),
	("starttag", "i", []),
	("endtag", "i"),
	])

	def test_processing_instruction_only(self):
	self.check_events("<?processing instruction>", [
	("pi", "processing instruction"),
	])

	def test_bad_nesting(self):
	self.check_events("<a><b></a></b>", [
	("starttag", "a", []),
	("starttag", "b", []),
	("endtag", "a"),
	("endtag", "b"),
	])

	def test_bare_ampersands(self):
	self.check_events("this text & contains & ampersands &", [
	("data", "this text & contains & ampersands &"),
	])

	def test_bare_pointy_brackets(self):
	self.check_events("this < text > contains < bare>pointy< brackets", [
	("data", "this < text > contains < bare>pointy< brackets"),
	])

	def test_attr_syntax(self):
	output = [
	("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", "e")])
	]
	self.check_events("""<a b='v' c="v" d=v e>""", output)
	self.check_events("""<a b = 'v' c = "v" d = v e>""", output)
	self.check_events("""<a\nb\n=\n'v'\nc\n=\n"v"\nd\n=\nv\ne>""", output)
	self.check_events("""<a\tb\t=\t'v'\tc\t=\t"v"\td\t=\tv\te>""", output)

	def test_attr_values(self):
	self.check_events("""<a b='xxx\n\txxx' c="yyy\t\nyyy" d='\txyz\n'>""",
	[("starttag", "a", [("b", "xxx\n\txxx"),
	("c", "yyy\t\nyyy"),
	("d", "\txyz\n")])
	])
	self.check_events("""<a b='' c="">""", [
	("starttag", "a", [("b", ""), ("c", "")]),
	])
	# URL construction stuff from RFC 1808:
	safe = "$-_.+"
	extra = "!*'(),"
	reserved = ";/?:@&="
	url = "http://example.com:8080/path/to/file?%s%s%s" % (
	safe, extra, reserved)
	self.check_events("""<e a=%s>""" % url, [
	("starttag", "e", [("a", url)]),
	])
	# Regression test for SF patch #669683.
	self.check_events("<e a=rgb(1,2,3)>", [
	("starttag", "e", [("a", "rgb(1,2,3)")]),
	])

	def test_attr_values_entities(self):
	"""Substitution of entities and charrefs in attribute values"""
	# SF bug #1452246
	self.check_events("""<a b=< c=<> d=&lt-> e='< '
	f="&xxx;" g=' !' h='Ǵ'
	i='x?a=b&c=d;'
	j='&#42;' k='&#42;'>""",
	[("starttag", "a", [("b", "<"),
	("c", "<>"),
	("d", "&lt->"),
	("e", "< "),
	("f", "&xxx;"),
	("g", " !"),
	("h", "Ǵ"),
	("i", "x?a=b&c=d;"),
	("j", "*"),
	("k", "*"),
	])])

	def test_convert_overrides(self):
	# This checks that the character and entity reference
	# conversion helpers are called at the documented times. No
	# attempt is made to really change what the parser accepts.
	#
	self.collector = HTMLEntityCollector
	self.check_events(('<a title="“test”">foo</a>'
	'&foobar;*'), [
	('entityref', 'convert', 'ldquo'),
	('charref', 'convert', 'x201d'),
	('starttag', 'a', [('title', '“test”')]),
	('data', 'foo'),
	('endtag', 'a'),
	('entityref', 'foobar'),
	('entityref', 'convert', 'foobar'),
	('charref', '42'),
	('charref', 'convert', '42'),
	('codepoint', 'convert', 42),
	])

	def test_attr_funky_names(self):
	self.check_events("""<a a.b='v' c:d=v e-f=v>""", [
	("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
	])

	def test_attr_value_ip6_url(self):
	# http://www.python.org/sf/853506
	self.check_events(("<a href='http://[1080::8:800:200C:417A]/'>"
	"<a href=http://[1080::8:800:200C:417A]/>"), [
	("starttag", "a", [("href", "http://[1080::8:800:200C:417A]/")]),
	("starttag", "a", [("href", "http://[1080::8:800:200C:417A]/")]),
	])

	def test_weird_starttags(self):
	self.check_events("<a<a>", [
	("starttag", "a", []),
	("starttag", "a", []),
	])
	self.check_events("</a<a>", [
	("endtag", "a"),
	("starttag", "a", []),
	])

	def test_declaration_junk_chars(self):
	self.check_parse_error("<!DOCTYPE foo $ >")

	def test_get_starttag_text(self):
	s = """<foobar \n one="1"\ttwo=2 >"""
	self.check_events(s, [
	("starttag", "foobar", [("one", "1"), ("two", "2")]),
	])

	def test_cdata_content(self):
	s = ("<cdata> <!-- not a comment --> &not-an-entity-ref; </cdata>"
	"<notcdata> <!-- comment --> </notcdata>")
	self.collector = CDATAEventCollector
	self.check_events(s, [
	("starttag", "cdata", []),
	("data", " <!-- not a comment --> &not-an-entity-ref; "),
	("endtag", "cdata"),
	("starttag", "notcdata", []),
	("data", " "),
	("comment", " comment "),
	("data", " "),
	("endtag", "notcdata"),
	])
	s = """<cdata> <not a='start tag'> </cdata>"""
	self.check_events(s, [
	("starttag", "cdata", []),
	("data", " <not a='start tag'> "),
	("endtag", "cdata"),
	])

	def test_illegal_declarations(self):
	s = 'abc<!spacer type="block" height="25">def'
	self.check_events(s, [
	("data", "abc"),
	("unknown decl", 'spacer type="block" height="25"'),
	("data", "def"),
	])

	def test_enumerated_attr_type(self):
	s = "<!DOCTYPE doc [<!ATTLIST doc attr (a \| b) >]>"
	self.check_events(s, [
	('decl', 'DOCTYPE doc [<!ATTLIST doc attr (a \| b) >]'),
	])

	def test_read_chunks(self):
	# SF bug #1541697, this caused sgml parser to hang
	# Just verify this code doesn't cause a hang.
	CHUNK = 1024 # increasing this to 8212 makes the problem go away

	f = open(test_support.findfile('sgml_input.html'))
	fp = sgmllib.SGMLParser()
	while 1:
	data = f.read(CHUNK)
	fp.feed(data)
	if len(data) != CHUNK:
	break

	def test_only_decode_ascii(self):
	# SF bug #1651995, make sure non-ascii character references are not decoded
	s = '<signs exclamation="&#33" copyright="&#169" quoteleft="‘">'
	self.check_events(s, [
	('starttag', 'signs',
	[('exclamation', '!'), ('copyright', '&#169'),
	('quoteleft', '‘')]),
	])

	# XXX These tests have been disabled by prefixing their names with
	# an underscore. The first two exercise outstanding bugs in the
	# sgmllib module, and the third exhibits questionable behavior
	# that needs to be carefully considered before changing it.

	def _test_starttag_end_boundary(self):
	self.check_events("<a b='<'>", [("starttag", "a", [("b", "<")])])
	self.check_events("<a b='>'>", [("starttag", "a", [("b", ">")])])

	def _test_buffer_artefacts(self):
	output = [("starttag", "a", [("b", "<")])]
	self.check_events(["<a b='<'>"], output)
	self.check_events(["<a ", "b='<'>"], output)
	self.check_events(["<a b", "='<'>"], output)
	self.check_events(["<a b=", "'<'>"], output)
	self.check_events(["<a b='<", "'>"], output)
	self.check_events(["<a b='<'", ">"], output)

	output = [("starttag", "a", [("b", ">")])]
	self.check_events(["<a b='>'>"], output)
	self.check_events(["<a ", "b='>'>"], output)
	self.check_events(["<a b", "='>'>"], output)
	self.check_events(["<a b=", "'>'>"], output)
	self.check_events(["<a b='>", "'>"], output)
	self.check_events(["<a b='>'", ">"], output)

	output = [("comment", "abc")]
	self.check_events(["", "<!--abc-->"], output)
	self.check_events(["<", "!--abc-->"], output)
	self.check_events(["<!", "--abc-->"], output)
	self.check_events(["<!-", "-abc-->"], output)
	self.check_events(["<!--", "abc-->"], output)
	self.check_events(["<!--a", "bc-->"], output)
	self.check_events(["<!--ab", "c-->"], output)
	self.check_events(["<!--abc", "-->"], output)
	self.check_events(["<!--abc-", "->"], output)
	self.check_events(["<!--abc--", ">"], output)
	self.check_events(["<!--abc-->", ""], output)

	def _test_starttag_junk_chars(self):
	self.check_parse_error("<")
	self.check_parse_error("<>")
	self.check_parse_error("</$>")
	self.check_parse_error("</")
	self.check_parse_error("</a")
	self.check_parse_error("<$")
	self.check_parse_error("<$>")
	self.check_parse_error("<!")
	self.check_parse_error("<a $>")
	self.check_parse_error("<a")
	self.check_parse_error("<a foo='bar'")
	self.check_parse_error("<a foo='bar")
	self.check_parse_error("<a foo='>'")
	self.check_parse_error("<a foo='>")
	self.check_parse_error("<a foo=>")


	def test_main():
	test_support.run_unittest(SGMLParserTestCase)


	if __name__ == "__main__":
	test_main()