| #!/usr/bin/python3 |
| # |
| # Copyright 2020 The Khronos Group Inc. |
| # |
| # SPDX-License-Identifier: Apache-2.0 |
| |
| # check_html_xrefs - simple-minded check for internal xrefs in spec HTML |
| # that don't exist. |
| |
| # Usage: check_html_xrefs file |
| # Just reports bad xrefs, not where they occur |
| |
| import argparse, cProfile, pdb, string, sys, time |
| import io, os, re, string, sys, copy |
| from lxml import etree |
| |
def find_missing_xrefs(ids, hrefs):
    """Return the local link targets that have no matching ID.

    Args:
        ids: iterable of 'id' attribute values present in the document.
            Duplicates are allowed and ignored.
        hrefs: iterable of 'href' attribute values taken from the
            document's <a> elements. Only values starting with '#' are
            treated as internal cross-references; everything else
            (external URLs, relative paths, empty strings) is skipped.

    Returns:
        A sorted list of fragment names (without the leading '#') that
        are referenced but not defined. Each name appears once.
    """
    known_ids = set(ids)
    # startswith() rather than href[0] so an empty href="" is skipped
    # instead of raising IndexError.
    local_targets = {href[1:] for href in hrefs if href.startswith('#')}
    # Sorted so the report is stable from run to run.
    return sorted(local_targets - known_ids)


def main():
    """Parse the command line and report dangling internal xrefs.

    Only the first filename given is checked; if no filename is given
    nothing is done. One line is printed per missing target. Where the
    bad references occur in the document is not reported.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('files', metavar='filename', nargs='*',
                            help='Path to spec HTML file to check')
    args = arg_parser.parse_args()

    if not args.files:
        return

    # Context manager ensures the input file is closed after parsing.
    with open(args.files[0], 'r') as html_file:
        tree = etree.parse(html_file, etree.HTMLParser())

    # Every element carrying an 'id' attribute is a potential link target.
    ids = (elem.get('id') for elem in tree.findall('.//*[@id]'))
    # Every <a> element carrying an 'href' attribute is a potential xref.
    hrefs = (elem.get('href') for elem in tree.findall('.//a[@href]'))

    for target in find_missing_xrefs(ids, hrefs):
        print('Reference not found in HTML: #' + target)


if __name__ == '__main__':
    main()