| #!/usr/bin/env python3 |
| """ |
| Utility for parsing HTML5 entity definitions available from: |
| |
| https://html.spec.whatwg.org/entities.json |
| https://html.spec.whatwg.org/multipage/named-characters.html |
| |
| The page now contains the following note: |
| |
| "This list is static and will not be expanded or changed in the future." |
| |
| Written by Ezio Melotti and Iuliia Proskurnia. |
| """ |
| |
| import os |
| import sys |
| import json |
| from urllib.request import urlopen |
| from html.entities import html5 |
| |
# Repository-relative path of this script; quoted in the "Generated by"
# header that write_items() emits.
SCRIPT_NAME = 'Tools/build/parse_html5_entities.py'
# Machine-readable entity definitions: downloaded by the script and cited
# in the generated header.
ENTITIES_URL = 'https://html.spec.whatwg.org/entities.json'
# Human-readable page for the same list; only cited in the generated header.
PAGE_URL = 'https://html.spec.whatwg.org/multipage/named-characters.html'
# First line of the generated section.  write_items() emits it and the
# --patch mode searches for it to locate the section to replace.
HTML5_SECTION_START = '# HTML5 named character references'
| |
def get_json(url):
    """Fetch *url* and return its content parsed as JSON.

    The response body is read in full, decoded as UTF-8 and then handed
    to the JSON parser; the decoded Python object is returned.
    """
    with urlopen(url) as response:
        payload = response.read()
    # The connection is already closed here; only in-memory work remains.
    return json.loads(payload.decode('utf-8'))
| |
def create_dict(entities):
    """Build the html5 mapping from the decoded JSON object.

    Each key of *entities* has its leading '&' stripped and is mapped to
    the 'characters' item of the corresponding value.
    """
    return {
        reference.lstrip('&'): info['characters']
        for reference, info in entities.items()
    }
| |
def compare_dicts(old, new):
    """Print the entities that were added, removed or modified.

    *old* and *new* are name -> characters mappings.  Three optional
    sections are printed, in this order: names only in *new*, names only
    in *old*, and names present in both whose values differ.  A section
    is omitted when it would be empty, and names are listed in sorted
    order within each section.
    """
    old_names = old.keys()
    new_names = new.keys()

    only_in_new = sorted(new_names - old_names)
    if only_in_new:
        print('{} entitie(s) have been added:'.format(len(only_in_new)))
        for entity in only_in_new:
            print(' {!r}: {!r}'.format(entity, new[entity]))

    only_in_old = sorted(old_names - new_names)
    if only_in_old:
        print('{} entitie(s) have been removed:'.format(len(only_in_old)))
        for entity in only_in_old:
            print(' {!r}: {!r}'.format(entity, old[entity]))

    # Names are unique, so sorting the (name, old, new) triples orders
    # them by name.
    modified = sorted(
        (entity, old[entity], new[entity])
        for entity in old_names & new_names
        if old[entity] != new[entity]
    )
    if modified:
        print('{} entitie(s) have been modified:'.format(len(modified)))
        for triple in modified:
            print(' {!r}: {!r} -> {!r}'.format(*triple))
| |
def write_items(entities, file=sys.stdout):
    """Write the generated html5 section for *entities* to *file*.

    The output consists of the section start marker, a comment header
    naming this script and the source URLs, and a ``html5 = {...}``
    dict literal with one item per line.  Values are rendered with
    ascii(), so non-ASCII characters are written as escapes.
    """
    def emit(text):
        print(text, file=file)

    # Keys are ordered case-insensitively, but when two keys differ only
    # in case the uppercase one must come first, giving e.g.
    # ['Aacute', 'aacute', 'Aacute;', 'aacute;', ...].
    # A case-sensitive sort puts uppercase chars first; the following
    # sort on the lowercased key is stable, so that relative order
    # survives between keys that compare equal.
    ordered = sorted(entities.keys())
    ordered.sort(key=str.lower)

    emit(HTML5_SECTION_START)
    emit(f'# Generated by {SCRIPT_NAME}\n'
         f'# from {ENTITIES_URL} and\n'
         f'# {PAGE_URL}.\n'
         f'# Map HTML5 named character references to the '
         f'equivalent Unicode character(s).')
    emit('html5 = {')
    for key in ordered:
        emit(f' {key!r}: {entities[key]!a},')
    emit('}')
| |
| |
if __name__ == '__main__':
    # Usage:
    #   (no args)  print a diff between html.entities.html5 and new_html5
    #   --create   print the new html5 dict
    #   --patch    patch the Lib/html/entities.py file in place
    new_html5 = create_dict(get_json(ENTITIES_URL))
    if '--create' in sys.argv:
        write_items(new_html5)
    elif '--patch' in sys.argv:
        fname = 'Lib/html/entities.py'
        temp_fname = fname + '.temp'
        section_found = False
        try:
            # Explicit encoding so the result does not depend on the
            # locale of whoever runs the script.
            with open(fname, encoding='utf-8') as f1, \
                 open(temp_fname, 'w', encoding='utf-8') as f2:
                skip = False
                for line in f1:
                    if line.startswith(HTML5_SECTION_START):
                        # Emit the regenerated section in place of the
                        # old one, then drop the old lines that follow.
                        write_items(new_html5, file=f2)
                        section_found = True
                        skip = True
                        continue
                    if skip:
                        # skip the old items until the }
                        if line.startswith('}'):
                            skip = False
                        continue
                    f2.write(line)
            if not section_found:
                # Nothing was regenerated; leave the target untouched
                # instead of silently rewriting it unchanged.
                sys.exit('{!r} not found in {}; file left unchanged.'
                         .format(HTML5_SECTION_START, fname))
            # os.replace() swaps the file in a single step, so there is
            # never a moment where fname does not exist.
            os.replace(temp_fname, fname)
        finally:
            # On any failure (including the sys.exit above) do not leave
            # the temporary file behind.  After a successful replace it
            # no longer exists.
            if os.path.exists(temp_fname):
                os.remove(temp_fname)
    else:
        if html5 == new_html5:
            print('The current dictionary is updated.')
        else:
            compare_dicts(html5, new_html5)
            print('Run "./python {0} --patch" to update Lib/html/entities.py '
                  'or "./python {0} --create" to see the generated '
                  'dictionary.'.format(__file__))