chrome/common/extensions/docs/server2/document_parser_test.py - platform/external/chromium_org - Git at Google

 #!/usr/bin/env python
 # Copyright 2013 The Chromium Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.

 import unittest

 from document_parser import ParseDocument, RemoveTitle


 # Fixture document used by testWholeDocument. It contains a main <h1>,
 # <h2>/<h3>/<h4> headings, a <h2> closed by </h3>, a second <h1>, a <h3>
 # opened inside a <h2>, and a trailing <h5>.
 _WHOLE_DOCUMENT = '''
 Preamble before heading.

 <h1 id='main' class='header'>Main header</h1>
 Some intro to the content.

 <h2 id='banana' class='header' title=''>Bananas</h2>
 Something about bananas.

 <h2 id='orange' title='hello'>Oranges</h2>
 Something about oranges.

 <h3 id='valencia'>Valencia Oranges</h3>
 A description of valencia oranges.

 <h3 id='seville'>Seville Oranges</h3>
 A description of seville oranges.

 <h2>Grapefruit</h3>
 Grapefruit closed a h2 with a h3. This should be a warning.

 <h1 id='not-main'>Not the main header</h1>
 But it should still show up in the TOC as though it were an h2.

 <h2>Not <h3>a banana</h2>
 The embedded h3 should be ignored.

 <h4>It's a h4</h4>
 h4 are part of the document structure, but this is not inside a h3.

 <h3>Plantains</h3>
 Now I'm just getting lazy.

 <h4>Another h4</h4>
 This h4 is inside a h3 so will show up.

 <h5>Header 5</h5>
 Header 5s are not parsed.
 '''


 # The same text as _WHOLE_DOCUMENT minus its 'Main header' <h1> element.
 # testWholeDocument expects RemoveTitle(_WHOLE_DOCUMENT) to return exactly
 # this string.
 _WHOLE_DOCUMENT_WITHOUT_TITLE = '''
 Preamble before heading.


 Some intro to the content.

 <h2 id='banana' class='header' title=''>Bananas</h2>
 Something about bananas.

 <h2 id='orange' title='hello'>Oranges</h2>
 Something about oranges.

 <h3 id='valencia'>Valencia Oranges</h3>
 A description of valencia oranges.

 <h3 id='seville'>Seville Oranges</h3>
 A description of seville oranges.

 <h2>Grapefruit</h3>
 Grapefruit closed a h2 with a h3. This should be a warning.

 <h1 id='not-main'>Not the main header</h1>
 But it should still show up in the TOC as though it were an h2.

 <h2>Not <h3>a banana</h2>
 The embedded h3 should be ignored.

 <h4>It's a h4</h4>
 h4 are part of the document structure, but this is not inside a h3.

 <h3>Plantains</h3>
 Now I'm just getting lazy.

 <h4>Another h4</h4>
 This h4 is inside a h3 so will show up.

 <h5>Header 5</h5>
 Header 5s are not parsed.
 '''


 class DocumentParserUnittest(unittest.TestCase):
   '''Exercises RemoveTitle and ParseDocument from document_parser.'''

   def testEmptyDocument(self):
     '''The empty string: no title to remove, no sections to report.'''
     self.assertEqual(('', 'No opening <h1> was found'), RemoveTitle(''))

     # Without expect_title, everything is expected to come back empty.
     untitled = ParseDocument('')
     self.assertEqual(None, untitled.title)
     self.assertEqual(None, untitled.title_attributes)
     self.assertEqual([], untitled.sections)
     self.assertEqual([], untitled.warnings)

     # With expect_title, an empty title plus a single warning is expected.
     titled = ParseDocument('', expect_title=True)
     self.assertEqual('', titled.title)
     self.assertEqual({}, titled.title_attributes)
     self.assertEqual([], titled.sections)
     self.assertEqual(['Expected a title'], titled.warnings)

   def testRemoveTitle(self):
     '''RemoveTitle on malformed, repeated and differently-cased <h1> tags.'''
     # Malformed input is expected back untouched, paired with a warning.
     left_untouched = (
       ('<h1>No closing tag', 'No closing </h1> was found'),
       ('No opening tag</h1>', 'No opening <h1> was found'),
       ('</h1>Tags in wrong order<h1>', 'The </h1> appeared before the <h1>'),
     )
     for document, warning in left_untouched:
       self.assertEqual((document, warning), RemoveTitle(document))

     # Only the first <h1> element is expected to go, whatever the tag case.
     title_stripped = (
       ('<h1>First header</h1> and <h1>Second header</h1>',
        ' and <h1>Second header</h1>'),
       ('<H1>Upper case header tag</H1> hi', ' hi'),
       ('<H1>Mixed case header tag</h1> hi', ' hi'),
     )
     for document, remainder in title_stripped:
       self.assertEqual((remainder, None), RemoveTitle(document))

   def testOnlyTitleDocument(self):
     '''A document made of nothing but a single <h1> element.'''
     title_only = '<h1 id="header">heading</h1>'
     self.assertEqual(('', None), RemoveTitle(title_only))

     # Not asking for a title: the <h1> is expected to surface as a warning.
     unrequested = ParseDocument(title_only)
     self.assertEqual(None, unrequested.title)
     self.assertEqual(None, unrequested.title_attributes)
     self.assertEqual([], unrequested.sections)
     self.assertEqual(['Found unexpected title "heading"'],
                      unrequested.warnings)

     # Asking for a title: text and attributes are expected, with no warnings.
     requested = ParseDocument(title_only, expect_title=True)
     self.assertEqual('heading', requested.title)
     self.assertEqual({'id': 'header'}, requested.title_attributes)
     self.assertEqual([], requested.sections)
     self.assertEqual([], requested.warnings)

   def testWholeDocument(self):
     '''Title removal, warnings and TOC structure for _WHOLE_DOCUMENT.'''
     def expect_leaf(entry, name, attributes):
       # Asserts the entry's name and attributes and that it has no children.
       self.assertEqual(name, entry.name)
       self.assertEqual(attributes, entry.attributes)
       self.assertEqual([], entry.entries)

     def expect_parent(entry, name, attributes, child_count):
       # Asserts name, attributes and child count; hands back the children.
       self.assertEqual(name, entry.name)
       self.assertEqual(attributes, entry.attributes)
       self.assertEqual(child_count, len(entry.entries))
       return entry.entries

     self.assertEqual((_WHOLE_DOCUMENT_WITHOUT_TITLE, None),
                      RemoveTitle(_WHOLE_DOCUMENT))

     parsed = ParseDocument(_WHOLE_DOCUMENT, expect_title=True)
     self.assertEqual('Main header', parsed.title)
     self.assertEqual({'id': 'main', 'class': 'header'},
                      parsed.title_attributes)
     expected_warnings = [
       'Found closing </h3> while processing a <h2> (line 19, column 15)',
       'Found multiple <h1> tags. Subsequent <h1> tags will be classified as '
           '<h2> for the purpose of the structure (line 22, column 1)',
       'Found <h3> in the middle of processing a <h2> (line 25, column 9)',
       # TODO(kalman): Re-enable this warning once the reference pages have
       # their references fixed.
       #'Found <h4> without any preceding <h3> (line 28, column 1)',
     ]
     self.assertEqual(expected_warnings, parsed.warnings)

     # The non-trivial table of contents assertions...
     self.assertEqual(1, len(parsed.sections))
     toc = parsed.sections[0].structure

     self.assertEqual(4, len(toc), toc)
     oranges, grapefruit, second_h1, not_a_banana = toc

     # The expected name of this entry is its title attribute value, and the
     # expected attributes hold only the id.
     valencia, seville = expect_parent(oranges, 'hello', {'id': 'orange'}, 2)
     expect_leaf(valencia, 'Valencia Oranges', {'id': 'valencia'})
     expect_leaf(seville, 'Seville Oranges', {'id': 'seville'})

     expect_leaf(grapefruit, 'Grapefruit', {})

     expect_leaf(second_h1, 'Not the main header', {'id': 'not-main'})

     stray_h4, plantains = expect_parent(not_a_banana, 'Not a banana', {}, 2)
     expect_leaf(stray_h4, 'It\'s a h4', {})

     another_h4, = expect_parent(plantains, 'Plantains', {}, 1)
     expect_leaf(another_h4, 'Another h4', {})

   def testSingleExplicitSection(self):
     '''One <section>, with the <h1> placed either outside or inside it.'''
     # A single section, one with the title inside the section, the other out.
     documents = (
       '<h1>Header</h1>'
       '<section>'
       'Just a single section here.'
       '<h2>An inner header</h2>'
       '</section>',

       '<section>'
       'Another single section here.'
       '<h1>Header</h1>'
       '<h2>An inner header</h2>'
       '</section>',
     )
     for document in documents:
       parsed = ParseDocument(document, expect_title=True)
       self.assertEqual([], parsed.warnings)
       self.assertEqual('Header', parsed.title)
       self.assertEqual(1, len(parsed.sections))
       only_section, = parsed.sections
       only_entry, = only_section.structure
       self.assertEqual('An inner header', only_entry.name)

   def testMultipleSections(self):
     '''Content before the first <section> counts as a section of its own.'''
     document = (
         '<h1>Header</h1>'
         '<h2>First header</h2>'
         'This content outside a section is the first section.'
         '<section>'
         'Second section'
         '<h2>Second header</h2>'
         '</section>'
         '<section>'
         'Third section'
         '<h2>Third header</h2>'
         '</section>')
     parsed = ParseDocument(document, expect_title=True)
     self.assertEqual([], parsed.warnings)
     self.assertEqual('Header', parsed.title)
     self.assertEqual(3, len(parsed.sections))
     first, second, third = parsed.sections

     # Each section is expected to hold exactly one heading entry.
     expected_headings = (
       (first, 'First header'),
       (second, 'Second header'),
       (third, 'Third header'),
     )
     for section, heading in expected_headings:
       self.assertEqual(1, len(section.structure))
       self.assertEqual(heading, section.structure[0].name)


 # Run the tests in this module when the file is executed as a script.
 if __name__ == '__main__':
   unittest.main()
	#!/usr/bin/env python
	# Copyright 2013 The Chromium Authors. All rights reserved.
	# Use of this source code is governed by a BSD-style license that can be
	# found in the LICENSE file.

	import unittest

	from document_parser import ParseDocument, RemoveTitle


	# Fixture document used by testWholeDocument. It contains a main <h1>,
	# <h2>/<h3>/<h4> headings, a <h2> closed by </h3>, a second <h1>, a <h3>
	# opened inside a <h2>, and a trailing <h5>.
	_WHOLE_DOCUMENT = '''
	Preamble before heading.

	<h1 id='main' class='header'>Main header</h1>
	Some intro to the content.

	<h2 id='banana' class='header' title=''>Bananas</h2>
	Something about bananas.

	<h2 id='orange' title='hello'>Oranges</h2>
	Something about oranges.

	<h3 id='valencia'>Valencia Oranges</h3>
	A description of valencia oranges.

	<h3 id='seville'>Seville Oranges</h3>
	A description of seville oranges.

	<h2>Grapefruit</h3>
	Grapefruit closed a h2 with a h3. This should be a warning.

	<h1 id='not-main'>Not the main header</h1>
	But it should still show up in the TOC as though it were an h2.

	<h2>Not <h3>a banana</h2>
	The embedded h3 should be ignored.

	<h4>It's a h4</h4>
	h4 are part of the document structure, but this is not inside a h3.

	<h3>Plantains</h3>
	Now I'm just getting lazy.

	<h4>Another h4</h4>
	This h4 is inside a h3 so will show up.

	<h5>Header 5</h5>
	Header 5s are not parsed.
	'''


	# The same text as _WHOLE_DOCUMENT minus its 'Main header' <h1> element.
	# testWholeDocument expects RemoveTitle(_WHOLE_DOCUMENT) to return exactly
	# this string.
	_WHOLE_DOCUMENT_WITHOUT_TITLE = '''
	Preamble before heading.


	Some intro to the content.

	<h2 id='banana' class='header' title=''>Bananas</h2>
	Something about bananas.

	<h2 id='orange' title='hello'>Oranges</h2>
	Something about oranges.

	<h3 id='valencia'>Valencia Oranges</h3>
	A description of valencia oranges.

	<h3 id='seville'>Seville Oranges</h3>
	A description of seville oranges.

	<h2>Grapefruit</h3>
	Grapefruit closed a h2 with a h3. This should be a warning.

	<h1 id='not-main'>Not the main header</h1>
	But it should still show up in the TOC as though it were an h2.

	<h2>Not <h3>a banana</h2>
	The embedded h3 should be ignored.

	<h4>It's a h4</h4>
	h4 are part of the document structure, but this is not inside a h3.

	<h3>Plantains</h3>
	Now I'm just getting lazy.

	<h4>Another h4</h4>
	This h4 is inside a h3 so will show up.

	<h5>Header 5</h5>
	Header 5s are not parsed.
	'''


	class DocumentParserUnittest(unittest.TestCase):
	  '''Tests for RemoveTitle and ParseDocument from document_parser.'''

	  def testEmptyDocument(self):
	    '''The empty string: no title to remove, no sections to report.'''
	    self.assertEqual(('', 'No opening <h1> was found'), RemoveTitle(''))

	    # Without expect_title, everything is expected to come back empty.
	    result = ParseDocument('')
	    self.assertEqual(None, result.title)
	    self.assertEqual(None, result.title_attributes)
	    self.assertEqual([], result.sections)
	    self.assertEqual([], result.warnings)

	    # With expect_title, an empty title plus a single warning is expected.
	    result = ParseDocument('', expect_title=True)
	    self.assertEqual('', result.title)
	    self.assertEqual({}, result.title_attributes)
	    self.assertEqual([], result.sections)
	    self.assertEqual(['Expected a title'], result.warnings)

	  def testRemoveTitle(self):
	    '''RemoveTitle on malformed, repeated and differently-cased <h1> tags.'''
	    # Malformed input is expected back untouched, paired with a warning.
	    no_closing_tag = '<h1>No closing tag'
	    self.assertEqual((no_closing_tag, 'No closing </h1> was found'),
	                     RemoveTitle(no_closing_tag))

	    no_opening_tag = 'No opening tag</h1>'
	    self.assertEqual((no_opening_tag, 'No opening <h1> was found'),
	                     RemoveTitle(no_opening_tag))

	    tags_wrong_order = '</h1>Tags in wrong order<h1>'
	    self.assertEqual((tags_wrong_order, 'The </h1> appeared before the <h1>'),
	                     RemoveTitle(tags_wrong_order))

	    # Only the first <h1> element is expected to be removed.
	    multiple_titles = '<h1>First header</h1> and <h1>Second header</h1>'
	    self.assertEqual((' and <h1>Second header</h1>', None),
	                     RemoveTitle(multiple_titles))

	    # Tag matching is expected to ignore case.
	    upper_case = '<H1>Upper case header tag</H1> hi'
	    self.assertEqual((' hi', None), RemoveTitle(upper_case))
	    mixed_case = '<H1>Mixed case header tag</h1> hi'
	    self.assertEqual((' hi', None), RemoveTitle(mixed_case))

	  def testOnlyTitleDocument(self):
	    '''A document made of nothing but a single <h1> element.'''
	    document = '<h1 id="header">heading</h1>'
	    self.assertEqual(('', None), RemoveTitle(document))

	    # Not asking for a title: the <h1> is expected to surface as a warning.
	    result = ParseDocument(document)
	    self.assertEqual(None, result.title)
	    self.assertEqual(None, result.title_attributes)
	    self.assertEqual([], result.sections)
	    self.assertEqual(['Found unexpected title "heading"'], result.warnings)

	    # Asking for a title: text and attributes are expected, with no warnings.
	    result = ParseDocument(document, expect_title=True)
	    self.assertEqual('heading', result.title)
	    self.assertEqual({'id': 'header'}, result.title_attributes)
	    self.assertEqual([], result.sections)
	    self.assertEqual([], result.warnings)

	  def testWholeDocument(self):
	    '''Title removal, warnings and TOC structure for _WHOLE_DOCUMENT.'''
	    self.assertEqual((_WHOLE_DOCUMENT_WITHOUT_TITLE, None),
	                     RemoveTitle(_WHOLE_DOCUMENT))
	    result = ParseDocument(_WHOLE_DOCUMENT, expect_title=True)
	    self.assertEqual('Main header', result.title)
	    self.assertEqual({'id': 'main', 'class': 'header'},
	                     result.title_attributes)
	    self.assertEqual([
	      'Found closing </h3> while processing a <h2> (line 19, column 15)',
	      'Found multiple <h1> tags. Subsequent <h1> tags will be classified as '
	          '<h2> for the purpose of the structure (line 22, column 1)',
	      'Found <h3> in the middle of processing a <h2> (line 25, column 9)',
	      # TODO(kalman): Re-enable this warning once the reference pages have
	      # their references fixed.
	      #'Found <h4> without any preceding <h3> (line 28, column 1)',
	    ], result.warnings)

	    # The non-trivial table of contents assertions...
	    self.assertEqual(1, len(result.sections))
	    entries = result.sections[0].structure

	    self.assertEqual(4, len(entries), entries)
	    entry0, entry1, entry2, entry3 = entries

	    # The expected name of this entry is its title attribute value, and the
	    # expected attributes hold only the id.
	    self.assertEqual('hello', entry0.name)
	    self.assertEqual({'id': 'orange'}, entry0.attributes)
	    self.assertEqual(2, len(entry0.entries))
	    entry0_0, entry0_1 = entry0.entries

	    self.assertEqual('Valencia Oranges', entry0_0.name)
	    self.assertEqual({'id': 'valencia'}, entry0_0.attributes)
	    self.assertEqual([], entry0_0.entries)
	    self.assertEqual('Seville Oranges', entry0_1.name)
	    self.assertEqual({'id': 'seville'}, entry0_1.attributes)
	    self.assertEqual([], entry0_1.entries)

	    self.assertEqual('Grapefruit', entry1.name)
	    self.assertEqual({}, entry1.attributes)
	    self.assertEqual([], entry1.entries)

	    self.assertEqual('Not the main header', entry2.name)
	    self.assertEqual({'id': 'not-main'}, entry2.attributes)
	    self.assertEqual([], entry2.entries)

	    self.assertEqual('Not a banana', entry3.name)
	    self.assertEqual({}, entry3.attributes)
	    self.assertEqual(2, len(entry3.entries))
	    entry3_1, entry3_2 = entry3.entries

	    self.assertEqual('It\'s a h4', entry3_1.name)
	    self.assertEqual({}, entry3_1.attributes)
	    self.assertEqual([], entry3_1.entries)

	    self.assertEqual('Plantains', entry3_2.name)
	    self.assertEqual({}, entry3_2.attributes)
	    self.assertEqual(1, len(entry3_2.entries))
	    entry3_2_1, = entry3_2.entries

	    self.assertEqual('Another h4', entry3_2_1.name)
	    self.assertEqual({}, entry3_2_1.attributes)
	    self.assertEqual([], entry3_2_1.entries)

	  def testSingleExplicitSection(self):
	    '''One <section>, with the <h1> placed either outside or inside it.'''
	    def test(document):
	      # Both layouts are expected to give one section holding one entry.
	      result = ParseDocument(document, expect_title=True)
	      self.assertEqual([], result.warnings)
	      self.assertEqual('Header', result.title)
	      self.assertEqual(1, len(result.sections))
	      section0, = result.sections
	      entry0, = section0.structure
	      self.assertEqual('An inner header', entry0.name)
	    # A single section, one with the title inside the section, the other out.
	    test('<h1>Header</h1>'
	         '<section>'
	         'Just a single section here.'
	         '<h2>An inner header</h2>'
	         '</section>')
	    test('<section>'
	         'Another single section here.'
	         '<h1>Header</h1>'
	         '<h2>An inner header</h2>'
	         '</section>')

	  def testMultipleSections(self):
	    '''Content before the first <section> counts as a section of its own.'''
	    result = ParseDocument(
	        '<h1>Header</h1>'
	        '<h2>First header</h2>'
	        'This content outside a section is the first section.'
	        '<section>'
	        'Second section'
	        '<h2>Second header</h2>'
	        '</section>'
	        '<section>'
	        'Third section'
	        '<h2>Third header</h2>'
	        '</section>',
	        expect_title=True)
	    self.assertEqual([], result.warnings)
	    self.assertEqual('Header', result.title)
	    self.assertEqual(3, len(result.sections))
	    section0, section1, section2 = result.sections
	    def assert_single_header(section, name):
	      # Each section is expected to hold exactly one heading entry.
	      self.assertEqual(1, len(section.structure))
	      self.assertEqual(name, section.structure[0].name)
	    assert_single_header(section0, 'First header')
	    assert_single_header(section1, 'Second header')
	    assert_single_header(section2, 'Third header')


	# Run the tests in this module when the file is executed as a script.
	if __name__ == '__main__':
	  unittest.main()