| import itertools |
| import os |
| import re |
| import sys |
| |
| |
def get_characters():
    """Yield every Unicode character that is valid in a Python `identifier`_
    but is not matched by the regex ``\\w`` group.

    ``\\w`` matches some characters that aren't valid in identifiers, but
    :meth:`str.isidentifier` will catch that later in lexing.

    All start characters are valid continue characters, so we only test for
    continue characters.

    Characters are yielded in ascending code point order, one single-character
    string at a time.

    .. _identifier: https://docs.python.org/3/reference/lexical_analysis.html#identifiers
    """
    # Compile once up front: the loop below visits every code point (over a
    # million iterations), so skip the per-call pattern lookup of ``re.match``.
    match_word = re.compile(r"\w").match

    for cp in range(sys.maxunicode + 1):
        s = chr(cp)

        # Prefix with "a" so ``s`` is checked as a continue character rather
        # than as a start character.
        if ("a" + s).isidentifier() and not match_word(s):
            yield s
| |
| |
def collapse_ranges(data):
    """Group runs of consecutive code points into ``(first, last)`` pairs.

    ``data`` is an iterable of single characters, expected to be sorted and
    unique. A new pair is started whenever a character's code point is not
    exactly one greater than that of the character before it. A run of length
    one is yielded as ``(char, char)``.
    """
    run_start = run_end = None

    for char in data:
        if run_start is None:
            # Very first character opens the first run.
            run_start = run_end = char
        elif ord(char) == ord(run_end) + 1:
            # Directly follows the previous character: extend the run.
            run_end = char
        else:
            # Gap found: emit the finished run and open a new one.
            yield run_start, run_end
            run_start = run_end = char

    if run_start is not None:
        yield run_start, run_end
| |
| |
def _escape_class_char(char):
    """Backslash-escape ``char`` if it is special inside a regex ``[...]`` set."""
    return "\\" + char if char in {"\\", "]", "[", "^", "-"} else char


def build_pattern(ranges):
    """Output the regex pattern for ranges of characters.

    ``ranges`` is an iterable of ``(first, last)`` single-character pairs. The
    returned string is meant to be placed inside a ``[...]`` character class.

    One and two character ranges output the individual characters.

    Characters that are syntax inside a character class (backslash, brackets,
    ``^`` and ``-``) are escaped so they are always matched literally. All
    other characters are emitted unchanged.
    """
    out = []

    for a, b in ranges:
        first = _escape_class_char(a)
        last = _escape_class_char(b)

        if a == b:  # single char
            out.append(first)
        elif ord(b) - ord(a) == 1:  # two chars, range is redundant
            out.append(first)
            out.append(last)
        else:
            out.append(f"{first}-{last}")

    return "".join(out)
| |
| |
def main():
    """Generate the identifier regex and save it as the
    ``jinja2/_identifier.py`` module.

    The target path is resolved relative to this script's directory
    (``../src/jinja2/_identifier.py``) and the file is overwritten.
    """
    script_dir = os.path.dirname(__file__)
    target = os.path.abspath(
        os.path.join(script_dir, "..", "src", "jinja2", "_identifier.py")
    )
    char_class = build_pattern(collapse_ranges(get_characters()))

    # Source lines of the generated module, in output order.
    generated = [
        "import re\n\n",
        "# generated by scripts/generate_identifier_pattern.py\n",
        "pattern = re.compile(\n",
        f' r"[\\w{char_class}]+" # noqa: B950\n',
        ")\n",
    ]

    with open(target, "w", encoding="utf8") as out:
        out.writelines(generated)
| |
| |
# Regenerate the pattern file only when run as a script, not when imported.
if __name__ == "__main__":
    main()