| #! /usr/bin/python |
| |
| # PCRE2 UNICODE PROPERTY SUPPORT |
| # ------------------------------ |
| # |
# This file auto-generates Unicode property tests and their expected output.
# It is recommended to re-run this generator whenever the Unicode data files
# are updated. The generated files are named `testinput26` and `testoutput26`.
| |
| import re |
| import sys |
| |
| from GenerateCommon import \ |
| script_names, \ |
| script_abbrevs |
| |
def write_both(text):
    """Write text to the test input file and to the expected output file."""
    # Input first, then output: same order as two explicit write calls.
    for stream in (input_file, output_file):
        stream.write(text)
| |
def to_string_char(ch_idx):
    """Return the text used for code point ch_idx in the generated test files.

    Printable ASCII (32..126) is returned as the character itself. Every other
    code point is returned as a lower-case hex escape of the form \\x{hh...},
    zero-padded to at least two digits (so 0x0f becomes \\x{0f}).
    """
    # DEL (127) must be escaped as well: pcre2test only treats 32..126 as
    # printable and echoes everything else as \x{hh}, so a raw DEL in the
    # expected output would never compare equal.
    if 32 <= ch_idx < 127:
        return chr(ch_idx)
    return "\\x{%02x}" % ch_idx
| |
# Prefix prepended to the generated file names; empty means the current
# directory. An optional single command line argument overrides it.
output_directory = ""

if len(sys.argv) > 2:
    print('** Too many arguments: just give a directory name')
    sys.exit(1)
if len(sys.argv) == 2:
    output_directory = sys.argv[1]
    # Only a non-empty directory gets a separator appended: turning an empty
    # argument into "/" would send the output to the filesystem root.
    if output_directory and not output_directory.endswith("/"):
        output_directory += "/"

# Both files stay open for the rest of the script; write_both() and the
# generator functions write to them through these module-level names.
try:
    input_file = open(output_directory + "testinput26", "w")
    output_file = open(output_directory + "testoutput26", "w")
except IOError:
    print ("** Couldn't open output files")
    sys.exit(1)

write_both("# These tests are generated by maint/GenerateTest26.py, do not edit.\n\n")

# ---------------------------------------------------------------------------
# UNICODE SCRIPT EXTENSION TESTS
# ---------------------------------------------------------------------------

write_both("# Unicode Script Extension tests.\n\n")
| |
def gen_script_tests():
    """Write the script and script extension property tests to both files.

    Reads Unicode.tables/Scripts.txt and Unicode.tables/ScriptExtensions.txt
    (relative to the current directory), gathers sample code points for every
    entry of script_names, and writes patterns and subject lines through
    write_both(); the expected results go to output_file only.

    One five-element record is kept per script:
      [0] low end of the first range listed for the script in Scripts.txt
      [1] high end of the last range listed for the script in Scripts.txt
      [2] lowest code point that lists the script in ScriptExtensions.txt
      [3] highest code point that lists the script in ScriptExtensions.txt
      [4] first code point encountered in ScriptExtensions.txt that lists the
          script while Scripts.txt does not assign it to that script
    Items [2] to [4] remain None when no such code point was seen.
    """
    script_data = [None] * len(script_names)
    # Script name given to each code point by Scripts.txt (None if unlisted).
    char_data = [None] * 0x110000

    property_re = re.compile("^([0-9A-F]{4,6})(?:\\.\\.([0-9A-F]{4,6}))? +; ([A-Za-z_ ]+) #")
    prev_name = ""
    script_idx = -1

    with open("Unicode.tables/Scripts.txt") as f:
        for line in f:
            match_obj = property_re.match(line)

            # Skip everything that is not a "range ; name #" data line.
            if match_obj is None:
                continue

            name = match_obj.group(3)
            # Look the index up again only when the script name changes.
            if name != prev_name:
                script_idx = script_names.index(name)
                prev_name = name

            low = int(match_obj.group(1), 16)
            high = low
            char_data[low] = name

            if match_obj.group(2) is not None:
                high = int(match_obj.group(2), 16)
                for ch in range(low + 1, high + 1):
                    char_data[ch] = name

            if script_data[script_idx] is None:
                script_data[script_idx] = [low, None, None, None, None]
            # Overwritten on every line, so it ends up as the last range's end.
            script_data[script_idx][1] = high

    # Abbreviation -> index into script_abbrevs, filled on first occurrence.
    extended_script_indices = {}

    with open("Unicode.tables/ScriptExtensions.txt") as f:
        for line in f:
            match_obj = property_re.match(line)

            if match_obj is None:
                continue

            low = int(match_obj.group(1), 16)
            high = low
            if match_obj.group(2) is not None:
                high = int(match_obj.group(2), 16)

            # The third field is a space separated list of abbreviations.
            for abbrev in match_obj.group(3).split(" "):
                if abbrev not in extended_script_indices:
                    idx = script_abbrevs.index(abbrev)
                    extended_script_indices[abbrev] = idx
                    rec = script_data[idx]
                    rec[2] = low
                    rec[3] = high
                else:
                    idx = extended_script_indices[abbrev]
                    rec = script_data[idx]
                    if rec[2] > low:
                        rec[2] = low
                    if rec[3] < high:
                        rec[3] = high

                if rec[4] is None:
                    name = script_names[idx]
                    # A dedicated loop variable keeps idx (the script index)
                    # intact while scanning the range.
                    for ch in range(low, high + 1):
                        if char_data[ch] != name:
                            rec[4] = ch
                            break

    def write_test(property_spec, ch_idx, expect_match):
        # Emit one pattern, its subject line and the expected result.
        write_both("/^\\p{%s}/utf\n" % property_spec)
        subject = to_string_char(ch_idx)
        write_both(" %s\n" % subject)
        if expect_match:
            output_file.write(" 0: %s\n" % subject)
        else:
            output_file.write("No match\n")
        write_both("\n")

    long_property_name = False

    for idx, rec in enumerate(script_data):
        script_name = script_names[idx]

        if script_name == "Unknown":
            continue

        script_abbrev = script_abbrevs[idx]

        write_both("# Base script check\n")
        write_test("sc=%s" % script_name, rec[0], True)
        write_test("Script=%s" % script_abbrev, rec[1], True)

        if rec[2] is not None:
            # Alternate between the short and the long property name so that
            # both spellings appear in the generated tests.
            property_name = "Script_Extensions" if long_property_name else "scx"
            long_property_name = not long_property_name

            write_both("# Script extension check\n")
            write_test(script_name, rec[2], True)
            write_test("%s=%s" % (property_name, script_abbrev), rec[3], True)

            if rec[4] is not None:
                # Matches through the extension list, but not with sc=.
                write_both("# Script extension only character\n")
                write_test(script_name, rec[4], True)
                write_test("sc=%s" % script_name, rec[4], False)
            else:
                print("External character has not found for %s" % script_name)

        # One past the highest code point recorded for the script, counting
        # both its own ranges and its script extension ranges.
        high = rec[1]
        if rec[3] is not None and rec[3] > rec[1]:
            high = rec[3]
        write_both("# Character not in script\n")
        write_test(script_name, high + 1, False)
| |
| |
# Generate the tests, then close both files explicitly so that buffered output
# reaches the disk even if generation fails part-way, instead of relying on
# interpreter shutdown to flush them.
try:
    gen_script_tests()

    write_both("# End of testinput26\n")
finally:
    input_file.close()
    output_file.close()