| #!/usr/bin/env python |
| # Copyright 2013 The Chromium Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| import unittest |
| |
| from document_parser import ParseDocument, RemoveTitle |
| |
| |
# HTML fixture for testWholeDocument, which feeds it to both RemoveTitle and
# ParseDocument(expect_title=True). The markup is deliberately irregular (an
# <h2> closed by </h3>, a second <h1>, an <h3> opened inside an <h2>, an <h4>
# that is not inside any <h3>). The warnings expected by that test quote line
# and column positions within this text, so its content must not be re-wrapped
# or re-indented.
| _WHOLE_DOCUMENT = ''' |
| Preamble before heading. |
| |
| <h1 id='main' class='header'>Main header</h1> |
| Some intro to the content. |
| |
| <h2 id='banana' class='header' title=''>Bananas</h2> |
| Something about bananas. |
| |
| <h2 id='orange' title='hello'>Oranges</h2> |
| Something about oranges. |
| |
| <h3 id='valencia'>Valencia Oranges</h3> |
| A description of valencia oranges. |
| |
| <h3 id='seville'>Seville Oranges</h3> |
| A description of seville oranges. |
| |
| <h2>Grapefruit</h3> |
| Grapefruit closed a h2 with a h3. This should be a warning. |
| |
| <h1 id='not-main'>Not the main header</h1> |
| But it should still show up in the TOC as though it were an h2. |
| |
| <h2>Not <h3>a banana</h2> |
| The embedded h3 should be ignored. |
| |
| <h4>It's a h4</h4> |
| h4 are part of the document structure, but this is not inside a h3. |
| |
| <h3>Plantains</h3> |
| Now I'm just getting lazy. |
| |
| <h4>Another h4</h4> |
| This h4 is inside a h3 so will show up. |
| |
| <h5>Header 5</h5> |
| Header 5s are not parsed. |
| ''' |
| |
| |
# Document text that testWholeDocument expects back from
# RemoveTitle(_WHOLE_DOCUMENT): the same fixture with the leading
# <h1 id='main'> element removed, leaving an empty line where it stood. It is
# compared for exact equality, so it must stay in step with _WHOLE_DOCUMENT.
| _WHOLE_DOCUMENT_WITHOUT_TITLE = ''' |
| Preamble before heading. |
| |
| |
| Some intro to the content. |
| |
| <h2 id='banana' class='header' title=''>Bananas</h2> |
| Something about bananas. |
| |
| <h2 id='orange' title='hello'>Oranges</h2> |
| Something about oranges. |
| |
| <h3 id='valencia'>Valencia Oranges</h3> |
| A description of valencia oranges. |
| |
| <h3 id='seville'>Seville Oranges</h3> |
| A description of seville oranges. |
| |
| <h2>Grapefruit</h3> |
| Grapefruit closed a h2 with a h3. This should be a warning. |
| |
| <h1 id='not-main'>Not the main header</h1> |
| But it should still show up in the TOC as though it were an h2. |
| |
| <h2>Not <h3>a banana</h2> |
| The embedded h3 should be ignored. |
| |
| <h4>It's a h4</h4> |
| h4 are part of the document structure, but this is not inside a h3. |
| |
| <h3>Plantains</h3> |
| Now I'm just getting lazy. |
| |
| <h4>Another h4</h4> |
| This h4 is inside a h3 so will show up. |
| |
| <h5>Header 5</h5> |
| Header 5s are not parsed. |
| ''' |
| |
| |
class DocumentParserUnittest(unittest.TestCase):
  '''Tests for RemoveTitle and ParseDocument from document_parser.'''

  def _AssertNoSections(self, parsed, title, title_attributes, warnings):
    '''Checks a parse result that is expected to have an empty section list.

    |parsed| is the object returned by ParseDocument; its title,
    title_attributes and warnings must equal the given values.
    '''
    self.assertEqual(title, parsed.title)
    self.assertEqual(title_attributes, parsed.title_attributes)
    self.assertEqual([], parsed.sections)
    self.assertEqual(warnings, parsed.warnings)

  def _AssertLeafEntry(self, entry, name, attributes):
    '''Checks |entry|'s name and attributes, and that it has no sub-entries.'''
    self.assertEqual(name, entry.name)
    self.assertEqual(attributes, entry.attributes)
    self.assertEqual([], entry.entries)

  def _AssertParentEntry(self, entry, name, attributes, child_count):
    '''Checks |entry|'s name, attributes and number of sub-entries.

    Returns |entry.entries| so that the caller can unpack the children.
    '''
    self.assertEqual(name, entry.name)
    self.assertEqual(attributes, entry.attributes)
    self.assertEqual(child_count, len(entry.entries))
    return entry.entries

  def testEmptyDocument(self):
    '''Checks RemoveTitle and ParseDocument on the empty string.'''
    empty = ''
    self.assertEqual((empty, 'No opening <h1> was found'), RemoveTitle(empty))

    # No title requested: title fields are None and nothing is warned about.
    self._AssertNoSections(ParseDocument(empty),
                           title=None,
                           title_attributes=None,
                           warnings=[])

    # Title requested: title fields are empty and its absence is reported.
    self._AssertNoSections(ParseDocument(empty, expect_title=True),
                           title='',
                           title_attributes={},
                           warnings=['Expected a title'])

  def testRemoveTitle(self):
    '''Checks RemoveTitle's (document, warning) result for assorted inputs.'''
    cases = [
        # Malformed title markup: the input is expected back unchanged,
        # together with a warning.
        ('<h1>No closing tag',
         ('<h1>No closing tag', 'No closing </h1> was found')),
        ('No opening tag</h1>',
         ('No opening tag</h1>', 'No opening <h1> was found')),
        ('</h1>Tags in wrong order<h1>',
         ('</h1>Tags in wrong order<h1>',
          'The </h1> appeared before the <h1>')),
        # Several titles: only the first is expected to be removed.
        ('<h1>First header</h1> and <h1>Second header</h1>',
         (' and <h1>Second header</h1>', None)),
        # Upper and mixed case tags are expected to be recognised too.
        ('<H1>Upper case header tag</H1> hi', (' hi', None)),
        ('<H1>Mixed case header tag</h1> hi', (' hi', None)),
    ]
    for document, expected in cases:
      self.assertEqual(expected, RemoveTitle(document))

  def testOnlyTitleDocument(self):
    '''Checks a document that consists solely of an <h1> title.'''
    title_only = '<h1 id="header">heading</h1>'
    self.assertEqual(('', None), RemoveTitle(title_only))

    # A title that was not asked for is reported and not returned.
    self._AssertNoSections(ParseDocument(title_only),
                           title=None,
                           title_attributes=None,
                           warnings=['Found unexpected title "heading"'])

    # When asked for, the title and its attributes come back without warnings.
    self._AssertNoSections(ParseDocument(title_only, expect_title=True),
                           title='heading',
                           title_attributes={'id': 'header'},
                           warnings=[])

  def testWholeDocument(self):
    '''Checks title removal, warnings and TOC structure for _WHOLE_DOCUMENT.'''
    self.assertEqual((_WHOLE_DOCUMENT_WITHOUT_TITLE, None),
                     RemoveTitle(_WHOLE_DOCUMENT))

    parsed = ParseDocument(_WHOLE_DOCUMENT, expect_title=True)
    self.assertEqual('Main header', parsed.title)
    self.assertEqual({'id': 'main', 'class': 'header'},
                     parsed.title_attributes)
    expected_warnings = [
        'Found closing </h3> while processing a <h2> (line 19, column 15)',
        'Found multiple <h1> tags. Subsequent <h1> tags will be classified as '
        '<h2> for the purpose of the structure (line 22, column 1)',
        'Found <h3> in the middle of processing a <h2> (line 25, column 9)',
        # TODO(kalman): Re-enable this warning once the reference pages have
        # their references fixed.
        #'Found <h4> without any preceding <h3> (line 28, column 1)',
    ]
    self.assertEqual(expected_warnings, parsed.warnings)

    # The non-trivial table of contents assertions...
    self.assertEqual(1, len(parsed.sections))
    toc = parsed.sections[0].structure
    self.assertEqual(4, len(toc), toc)
    oranges, grapefruit, second_h1, not_a_banana = toc

    # The 'orange' heading is expected to be named after its title attribute.
    valencia, seville = self._AssertParentEntry(
        oranges, 'hello', {'id': 'orange'}, 2)
    self._AssertLeafEntry(valencia, 'Valencia Oranges', {'id': 'valencia'})
    self._AssertLeafEntry(seville, 'Seville Oranges', {'id': 'seville'})

    self._AssertLeafEntry(grapefruit, 'Grapefruit', {})

    self._AssertLeafEntry(
        second_h1, 'Not the main header', {'id': 'not-main'})

    stray_h4, plantains = self._AssertParentEntry(
        not_a_banana, 'Not a banana', {}, 2)
    self._AssertLeafEntry(stray_h4, 'It\'s a h4', {})

    another_h4, = self._AssertParentEntry(plantains, 'Plantains', {}, 1)
    self._AssertLeafEntry(another_h4, 'Another h4', {})

  def testSingleExplicitSection(self):
    '''Checks documents holding one <section>, title inside or outside it.'''
    documents = (
        # The title precedes the section.
        ('<h1>Header</h1>'
         '<section>'
         'Just a single section here.'
         '<h2>An inner header</h2>'
         '</section>'),
        # The title sits inside the section.
        ('<section>'
         'Another single section here.'
         '<h1>Header</h1>'
         '<h2>An inner header</h2>'
         '</section>'),
    )
    for document in documents:
      parsed = ParseDocument(document, expect_title=True)
      self.assertEqual([], parsed.warnings)
      self.assertEqual('Header', parsed.title)
      self.assertEqual(1, len(parsed.sections))
      # Single-element unpacking also enforces that exactly one item exists.
      (only_section,) = parsed.sections
      (only_entry,) = only_section.structure
      self.assertEqual('An inner header', only_entry.name)

  def testMultipleSections(self):
    '''Checks that leading content and each <section> form their own section.'''
    document = ('<h1>Header</h1>'
                '<h2>First header</h2>'
                'This content outside a section is the first section.'
                '<section>'
                'Second section'
                '<h2>Second header</h2>'
                '</section>'
                '<section>'
                'Third section'
                '<h2>Third header</h2>'
                '</section>')
    parsed = ParseDocument(document, expect_title=True)
    self.assertEqual([], parsed.warnings)
    self.assertEqual('Header', parsed.title)
    self.assertEqual(3, len(parsed.sections))

    # Each section is expected to hold exactly one header, in document order.
    expected_headers = ('First header', 'Second header', 'Third header')
    for section, header in zip(parsed.sections, expected_headers):
      self.assertEqual(1, len(section.structure))
      self.assertEqual(header, section.structure[0].name)
| |
| |
# Run the test suite when this file is executed directly rather than imported.
if __name__ == '__main__':
  unittest.main()