# blob: 855561a06ff4e603e4015821bcb78b3fbf513fda [file] [log] [blame]
#!/usr/bin/env python
# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
import unittest
from document_parser import ParseDocument, RemoveTitle
# HTML fixture parsed by testWholeDocument. Alongside well-formed
# <h1>/<h2>/<h3> headings it deliberately contains awkward markup: an <h2>
# closed by </h3>, a second <h1>, an <h3> nested inside an <h2>, an <h4> that
# is not inside any <h3>, and an <h5>.
#
# testWholeDocument asserts the line/column reported in each warning, so the
# exact text and line layout of this literal are significant.
# NOTE(review): the asserted positions (e.g. "line 19" for the Grapefruit
# heading) do not match the line count of the literal as it appears here;
# presumably blank lines between paragraphs were lost - confirm against the
# upstream file before relying on this copy.
_WHOLE_DOCUMENT = '''
Preamble before heading.
<h1 id='main' class='header'>Main header</h1>
Some intro to the content.
<h2 id='banana' class='header' title=''>Bananas</h2>
Something about bananas.
<h2 id='orange' title='hello'>Oranges</h2>
Something about oranges.
<h3 id='valencia'>Valencia Oranges</h3>
A description of valencia oranges.
<h3 id='seville'>Seville Oranges</h3>
A description of seville oranges.
<h2>Grapefruit</h3>
Grapefruit closed a h2 with a h3. This should be a warning.
<h1 id='not-main'>Not the main header</h1>
But it should still show up in the TOC as though it were an h2.
<h2>Not <h3>a banana</h2>
The embedded h3 should be ignored.
<h4>It's a h4</h4>
h4 are part of the document structure, but this is not inside a h3.
<h3>Plantains</h3>
Now I'm just getting lazy.
<h4>Another h4</h4>
This h4 is inside a h3 so will show up.
<h5>Header 5</h5>
Header 5s are not parsed.
'''
# The document that testWholeDocument expects RemoveTitle(_WHOLE_DOCUMENT) to
# return: the same text as _WHOLE_DOCUMENT minus the leading
# <h1 id='main' ...>Main header</h1> element. The later
# <h1 id='not-main'> heading is still present.
# It is written out in full rather than derived from _WHOLE_DOCUMENT so the
# expected value is independent of the code under test.
_WHOLE_DOCUMENT_WITHOUT_TITLE = '''
Preamble before heading.
Some intro to the content.
<h2 id='banana' class='header' title=''>Bananas</h2>
Something about bananas.
<h2 id='orange' title='hello'>Oranges</h2>
Something about oranges.
<h3 id='valencia'>Valencia Oranges</h3>
A description of valencia oranges.
<h3 id='seville'>Seville Oranges</h3>
A description of seville oranges.
<h2>Grapefruit</h3>
Grapefruit closed a h2 with a h3. This should be a warning.
<h1 id='not-main'>Not the main header</h1>
But it should still show up in the TOC as though it were an h2.
<h2>Not <h3>a banana</h2>
The embedded h3 should be ignored.
<h4>It's a h4</h4>
h4 are part of the document structure, but this is not inside a h3.
<h3>Plantains</h3>
Now I'm just getting lazy.
<h4>Another h4</h4>
This h4 is inside a h3 so will show up.
<h5>Header 5</h5>
Header 5s are not parsed.
'''
class DocumentParserUnittest(unittest.TestCase):
  '''Exercises RemoveTitle and ParseDocument from document_parser.'''

  def _AssertNameAndAttributes(self, entry, name, attributes):
    '''Asserts that |entry| has the given |name| and |attributes|.'''
    self.assertEqual(name, entry.name)
    self.assertEqual(attributes, entry.attributes)

  def testEmptyDocument(self):
    '''Parses the empty string, with and without a title being expected.'''
    self.assertEqual(('', 'No opening <h1> was found'), RemoveTitle(''))

    # No title expected: nothing is found and nothing is reported.
    untitled = ParseDocument('')
    self.assertEqual(None, untitled.title)
    self.assertEqual(None, untitled.title_attributes)
    self.assertEqual([], untitled.sections)
    self.assertEqual([], untitled.warnings)

    # Title expected: its absence is reported as a warning.
    titled = ParseDocument('', expect_title=True)
    self.assertEqual('', titled.title)
    self.assertEqual({}, titled.title_attributes)
    self.assertEqual([], titled.sections)
    self.assertEqual(['Expected a title'], titled.warnings)

  def testRemoveTitle(self):
    '''Checks the (document, warning) pair RemoveTitle returns for malformed,
    repeated and differently-cased <h1> tags.
    '''
    no_closing_tag = '<h1>No closing tag'
    no_opening_tag = 'No opening tag</h1>'
    tags_wrong_order = '</h1>Tags in wrong order<h1>'
    # Each case is (input document, expected RemoveTitle result).
    cases = (
      # Malformed titles: the input is expected back unchanged with a warning.
      (no_closing_tag, (no_closing_tag, 'No closing </h1> was found')),
      (no_opening_tag, (no_opening_tag, 'No opening <h1> was found')),
      (tags_wrong_order,
       (tags_wrong_order, 'The </h1> appeared before the <h1>')),
      # Only the first of several titles is expected to be removed.
      ('<h1>First header</h1> and <h1>Second header</h1>',
       (' and <h1>Second header</h1>', None)),
      # Upper- and mixed-case tags are expected to be recognised.
      ('<H1>Upper case header tag</H1> hi', (' hi', None)),
      ('<H1>Mixed case header tag</h1> hi', (' hi', None)),
    )
    for document, expected in cases:
      self.assertEqual(expected, RemoveTitle(document))

  def testOnlyTitleDocument(self):
    '''Parses a document that consists solely of an <h1> title.'''
    title_only = '<h1 id="header">heading</h1>'
    self.assertEqual(('', None), RemoveTitle(title_only))

    # Without expect_title the title is reported as unexpected.
    unexpected = ParseDocument(title_only)
    self.assertEqual(None, unexpected.title)
    self.assertEqual(None, unexpected.title_attributes)
    self.assertEqual([], unexpected.sections)
    self.assertEqual(['Found unexpected title "heading"'], unexpected.warnings)

    # With expect_title the title and its attributes are returned.
    expected = ParseDocument(title_only, expect_title=True)
    self.assertEqual('heading', expected.title)
    self.assertEqual({'id': 'header'}, expected.title_attributes)
    self.assertEqual([], expected.sections)
    self.assertEqual([], expected.warnings)

  def testWholeDocument(self):
    '''Parses _WHOLE_DOCUMENT and checks the title, the warnings and the
    nested structure of headings.
    '''
    self.assertEqual((_WHOLE_DOCUMENT_WITHOUT_TITLE, None),
                     RemoveTitle(_WHOLE_DOCUMENT))

    parsed = ParseDocument(_WHOLE_DOCUMENT, expect_title=True)
    self.assertEqual('Main header', parsed.title)
    self.assertEqual({'id': 'main', 'class': 'header'}, parsed.title_attributes)
    expected_warnings = [
      'Found closing </h3> while processing a <h2> (line 19, column 15)',
      'Found multiple <h1> tags. Subsequent <h1> tags will be classified as '
      '<h2> for the purpose of the structure (line 22, column 1)',
      'Found <h3> in the middle of processing a <h2> (line 25, column 9)',
      # TODO(kalman): Re-enable this warning once the reference pages have
      # their references fixed.
      #'Found <h4> without any preceding <h3> (line 28, column 1)',
    ]
    self.assertEqual(expected_warnings, parsed.warnings)

    # The non-trivial table of contents assertions...
    self.assertEqual(1, len(parsed.sections))
    top_level = parsed.sections[0].structure
    self.assertEqual(4, len(top_level), top_level)
    oranges, grapefruit, second_h1, not_a_banana = top_level

    # First entry: expected to carry the name 'hello' and two children.
    self._AssertNameAndAttributes(oranges, 'hello', {'id': 'orange'})
    self.assertEqual(2, len(oranges.entries))
    valencia, seville = oranges.entries
    self._AssertNameAndAttributes(
        valencia, 'Valencia Oranges', {'id': 'valencia'})
    self.assertEqual([], valencia.entries)
    self._AssertNameAndAttributes(
        seville, 'Seville Oranges', {'id': 'seville'})
    self.assertEqual([], seville.entries)

    # Second and third entries: leaves with no children.
    self._AssertNameAndAttributes(grapefruit, 'Grapefruit', {})
    self.assertEqual([], grapefruit.entries)
    self._AssertNameAndAttributes(
        second_h1, 'Not the main header', {'id': 'not-main'})
    self.assertEqual([], second_h1.entries)

    # Fourth entry: two children, the second of which has one child itself.
    self._AssertNameAndAttributes(not_a_banana, 'Not a banana', {})
    self.assertEqual(2, len(not_a_banana.entries))
    first_h4, plantains = not_a_banana.entries
    self._AssertNameAndAttributes(first_h4, 'It\'s a h4', {})
    self.assertEqual([], first_h4.entries)
    self._AssertNameAndAttributes(plantains, 'Plantains', {})
    self.assertEqual(1, len(plantains.entries))
    (another_h4,) = plantains.entries
    self._AssertNameAndAttributes(another_h4, 'Another h4', {})
    self.assertEqual([], another_h4.entries)

  def testSingleExplicitSection(self):
    '''Parses documents with one explicit <section>, with the title placed
    outside and then inside that section.
    '''
    # A single section, one with the title inside the section, the other out.
    documents = (
      ('<h1>Header</h1>'
       '<section>'
       'Just a single section here.'
       '<h2>An inner header</h2>'
       '</section>'),
      ('<section>'
       'Another single section here.'
       '<h1>Header</h1>'
       '<h2>An inner header</h2>'
       '</section>'),
    )
    for document in documents:
      parsed = ParseDocument(document, expect_title=True)
      self.assertEqual([], parsed.warnings)
      self.assertEqual('Header', parsed.title)
      self.assertEqual(1, len(parsed.sections))
      (only_section,) = parsed.sections
      (only_entry,) = only_section.structure
      self.assertEqual('An inner header', only_entry.name)

  def testMultipleSections(self):
    '''Parses content outside any <section> followed by two explicit
    sections, expecting three sections with one header each.
    '''
    parsed = ParseDocument(
        '<h1>Header</h1>'
        '<h2>First header</h2>'
        'This content outside a section is the first section.'
        '<section>'
        'Second section'
        '<h2>Second header</h2>'
        '</section>'
        '<section>'
        'Third section'
        '<h2>Third header</h2>'
        '</section>',
        expect_title=True)
    self.assertEqual([], parsed.warnings)
    self.assertEqual('Header', parsed.title)
    self.assertEqual(3, len(parsed.sections))

    # Each section, in document order, should hold exactly one header.
    expected_names = ('First header', 'Second header', 'Third header')
    for section, expected_name in zip(parsed.sections, expected_names):
      self.assertEqual(1, len(section.structure))
      self.assertEqual(expected_name, section.structure[0].name)
# Run every test in this module when the file is executed as a script.
if __name__ == '__main__':
  unittest.main()