scripts/generate_identifier_pattern.py - platform/external/python/jinja - Git at Google

 import itertools
 import os
 import re
 import sys


 def get_characters():
     """Yield identifier characters that the regex ``\\w`` class misses.

     Walks every code point from 0 through :data:`sys.maxunicode` in
     ascending order and yields each character that is accepted as a
     continuation character of a Python `identifier`_ yet is not matched by
     ``\\w``.

     ``\\w`` also matches some characters that are not valid in identifiers;
     that is fine because :meth:`str.isidentifier` rejects them later, during
     lexing.

     Every start character is also a valid continue character, so only the
     continue position is probed (by appending the candidate to ``"a"``).

     .. _identifier: https://docs.python.org/3/reference/lexical_analysis.html#identifiers
     """
     matches_word = re.compile(r"\w").match

     for code_point in range(sys.maxunicode + 1):
         char = chr(code_point)

         # Already covered by ``\w``: nothing to add to the pattern.
         if matches_word(char):
             continue

         if f"a{char}".isidentifier():
             yield char


 def collapse_ranges(data):
     """Collapse runs of consecutive code points into ``(first, last)`` pairs.

     ``data`` is an iterable of single characters, expected to be sorted and
     unique. Each maximal run in which every character's code point is exactly
     one more than its predecessor's is yielded as a 2-tuple of the run's
     first and last character. A lone character yields ``(char, char)``.

     Source: https://stackoverflow.com/a/4629241/400617
     """
     run_first = run_last = None
     previous_code = None

     for char in data:
         code = ord(char)

         if previous_code is not None and code - previous_code == 1:
             # Still inside the current run; just extend its end.
             run_last = char
         else:
             # Gap found: emit the finished run (if any) and start a new one.
             if previous_code is not None:
                 yield run_first, run_last

             run_first = run_last = char

         previous_code = code

     # Flush the trailing run; skipped entirely for empty input.
     if previous_code is not None:
         yield run_first, run_last


 def build_pattern(ranges):
     """Render ``(first, last)`` character pairs as regex character-class text.

     A pair whose ends are equal contributes that single character. A pair
     spanning exactly two adjacent code points contributes both characters
     (a ``-`` range would not be shorter). Any other pair contributes
     ``first-last``. The pieces are concatenated in input order and returned
     as one string; no escaping is applied.
     """

     def render(first, last):
         if first == last:  # single char
             return first

         if ord(last) - ord(first) == 1:  # two chars, range is redundant
             return first + last

         return f"{first}-{last}"

     return "".join(render(first, last) for first, last in ranges)


 def main():
     """Generate the identifier regex module.

     Builds the character-class pattern from :func:`get_characters`,
     :func:`collapse_ranges` and :func:`build_pattern`, then overwrites
     ``../src/jinja2/_identifier.py`` (relative to this script's directory)
     with a module that compiles ``[\\w<pattern>]+`` into ``pattern``.
     """
     pattern = build_pattern(collapse_ranges(get_characters()))

     script_dir = os.path.dirname(__file__)
     filename = os.path.abspath(
         os.path.join(script_dir, "..", "src", "jinja2", "_identifier.py")
     )

     generated = [
         "import re\n\n",
         "# generated by scripts/generate_identifier_pattern.py\n",
         "pattern = re.compile(\n",
         f'    r"[\\w{pattern}]+"  # noqa: B950\n',
         ")\n",
     ]

     with open(filename, "w", encoding="utf8") as f:
         f.writelines(generated)


 # Regenerate the pattern file only when executed as a script, not on import.
 if __name__ == "__main__":
     main()
	import itertools
	import os
	import re
	import sys


	def get_characters():
	    """Find every Unicode character that is valid in a Python `identifier`_
	    but is not matched by the regex ``\\w`` group.

	    Characters are yielded one at a time in ascending code point order.

	    ``\\w`` matches some characters that aren't valid in identifiers, but
	    :meth:`str.isidentifier` will catch that later in lexing.

	    All start characters are valid continue characters, so we only test for
	    continue characters.

	    .. _identifier: https://docs.python.org/3/reference/lexical_analysis.html#identifiers
	    """
	    for cp in range(sys.maxunicode + 1):
	        s = chr(cp)

	        # Prefix with "a" so ``s`` is tested in the continue position.
	        if ("a" + s).isidentifier() and not re.match(r"\w", s):
	            yield s


	def collapse_ranges(data):
	    """Given a sorted list of unique characters, generate ranges representing
	    sequential code points.

	    Each range is yielded as a ``(first, last)`` tuple of characters; a
	    character with no sequential neighbor yields ``(char, char)``.

	    Source: https://stackoverflow.com/a/4629241/400617
	    """
	    # Within a run of sequential code points, ``ord(char) - index`` stays
	    # constant, so grouping on that difference splits the data into runs.
	    for _, b in itertools.groupby(enumerate(data), lambda x: ord(x[1]) - x[0]):
	        b = list(b)
	        yield b[0][1], b[-1][1]


	def build_pattern(ranges):
	    """Output the regex pattern for ranges of characters.

	    ``ranges`` is an iterable of ``(first, last)`` character pairs. One and
	    two character ranges output the individual characters; longer ranges
	    output ``first-last``. The pieces are joined into a single string.
	    """
	    out = []

	    for a, b in ranges:
	        if a == b:  # single char
	            out.append(a)
	        elif ord(b) - ord(a) == 1:  # two chars, range is redundant
	            out.append(a)
	            out.append(b)
	        else:
	            out.append(f"{a}-{b}")

	    return "".join(out)


	def main():
	    """Build the regex pattern and write it to
	    ``jinja2/_identifier.py``.

	    The target is resolved as ``../src/jinja2/_identifier.py`` relative to
	    the directory containing this script, and is overwritten.
	    """
	    pattern = build_pattern(collapse_ranges(get_characters()))
	    filename = os.path.abspath(
	        os.path.join(os.path.dirname(__file__), "..", "src", "jinja2", "_identifier.py")
	    )

	    with open(filename, "w", encoding="utf8") as f:
	        f.write("import re\n\n")
	        f.write("# generated by scripts/generate_identifier_pattern.py\n")
	        f.write("pattern = re.compile(\n")
	        # The emitted line is indented four spaces inside ``re.compile(``.
	        f.write(f'    r"[\\w{pattern}]+"  # noqa: B950\n')
	        f.write(")\n")


	# Regenerate the pattern file only when executed as a script, not on import.
	if __name__ == "__main__":
	    main()