| #! /usr/bin/python |
| |
| # PCRE2 UNICODE PROPERTY SUPPORT |
| # ------------------------------ |
| |
# This file is a Python module containing common lists and functions for the
# GenerateXXX scripts that create various .c and .h files from Unicode data
# files. It was created as part of a re-organization of these scripts in
# December 2021.
| |
| |
| import re |
| |
| |
| # --------------------------------------------------------------------------- |
| # DATA LISTS |
| # --------------------------------------------------------------------------- |
| |
| # BIDI classes in the DerivedBidiClass.txt file, with comments. |
| |
# The list is flat: each class abbreviation is immediately followed by its
# descriptive comment, so entries sit at even indexes and their comments at
# odd indexes. It is written here as (abbreviation, comment) pairs and then
# flattened into that alternating layout.
bidi_classes = [
    field
    for entry in (
        ('AL', 'Arabic letter'),
        ('AN', 'Arabic number'),
        ('B', 'Paragraph separator'),
        ('BN', 'Boundary neutral'),
        ('CS', 'Common separator'),
        ('EN', 'European number'),
        ('ES', 'European separator'),
        ('ET', 'European terminator'),
        ('FSI', 'First strong isolate'),
        ('L', 'Left to right'),
        ('LRE', 'Left to right embedding'),
        ('LRI', 'Left to right isolate'),
        ('LRO', 'Left to right override'),
        ('NSM', 'Non-spacing mark'),
        ('ON', 'Other neutral'),
        ('PDF', 'Pop directional format'),
        ('PDI', 'Pop directional isolate'),
        ('R', 'Right to left'),
        ('RLE', 'Right to left embedding'),
        ('RLI', 'Right to left isolate'),
        ('RLO', 'Right to left override'),
        ('S', 'Segment separator'),
        ('WS', 'White space'),
    )
    for field in entry
]
| |
| # Particular category property names, with comments. NOTE: If ever this list |
| # is changed, the table called "catposstab" in the pcre2_auto_possess.c file |
| # must be edited to keep in step. |
| |
# Flat list in which every two-letter category code is followed by its
# descriptive comment (codes at even indexes, comments at odd indexes). The
# (code, comment) pairs below are flattened into that layout; their order is
# significant.
category_names = [
    field
    for entry in (
        ('Cc', 'Control'),
        ('Cf', 'Format'),
        ('Cn', 'Unassigned'),
        ('Co', 'Private use'),
        ('Cs', 'Surrogate'),
        ('Ll', 'Lower case letter'),
        ('Lm', 'Modifier letter'),
        ('Lo', 'Other letter'),
        ('Lt', 'Title case letter'),
        ('Lu', 'Upper case letter'),
        ('Mc', 'Spacing mark'),
        ('Me', 'Enclosing mark'),
        ('Mn', 'Non-spacing mark'),
        ('Nd', 'Decimal number'),
        ('Nl', 'Letter number'),
        ('No', 'Other number'),
        ('Pc', 'Connector punctuation'),
        ('Pd', 'Dash punctuation'),
        ('Pe', 'Close punctuation'),
        ('Pf', 'Final punctuation'),
        ('Pi', 'Initial punctuation'),
        ('Po', 'Other punctuation'),
        ('Ps', 'Open punctuation'),
        ('Sc', 'Currency symbol'),
        ('Sk', 'Modifier symbol'),
        ('Sm', 'Mathematical symbol'),
        ('So', 'Other symbol'),
        ('Zl', 'Line separator'),
        ('Zp', 'Paragraph separator'),
        ('Zs', 'Space separator'),
    )
    for field in entry
]
| |
| # The Extended_Pictographic property is not found in the file where all the |
| # others are (GraphemeBreakProperty.txt). It comes from the emoji-data.txt |
| # file, but we list it here so that the name has the correct index value. |
| |
# Flat list in which each property name is followed by a comment string that
# starts with the property's index value (names at even indexes, comments at
# odd indexes). The (name, comment) pairs below are flattened into that
# layout, so the position of a pair determines the index shown in its comment.
break_properties = [
    field
    for entry in (
        ('CR', ' 0'),
        ('LF', ' 1'),
        ('Control', ' 2'),
        ('Extend', ' 3'),
        ('Prepend', ' 4'),
        ('SpacingMark', ' 5'),
        ('L', ' 6 Hangul syllable type L'),
        ('V', ' 7 Hangul syllable type V'),
        ('T', ' 8 Hangul syllable type T'),
        ('LV', ' 9 Hangul syllable type LV'),
        ('LVT', '10 Hangul syllable type LVT'),
        ('Regional_Indicator', '11'),
        ('Other', '12'),
        ('ZWJ', '13'),
        ('Extended_Pictographic', '14'),
    )
    for field in entry
]
| |
# List of files from which the names of Boolean properties are obtained, along
# with a list of regex patterns for properties to be ignored, and a list of
# extra property names to add.
| |
bool_propsfiles = ['PropList.txt', 'DerivedCoreProperties.txt', 'emoji-data.txt']
bool_propsignore = [r'^Other_', r'^Hyphen$']
bool_propsextras = ['ASCII', 'Bidi_Mirrored']


# ---------------------------------------------------------------------------
# GET BOOLEAN PROPERTY NAMES
# ---------------------------------------------------------------------------

# Get a list of Boolean property names from a number of files.

def getbpropslist():
    """Return the sorted list of Boolean property names.

    Each file named in bool_propsfiles is read from the Unicode.tables
    directory (relative to the current working directory). On every line the
    text from '#' onwards is discarded and the remainder is split on ';'; the
    second field is taken as a property name. A name equal to the one seen
    immediately before it is skipped, as is any name matching one of the
    patterns in bool_propsignore. The names in bool_propsextras are then
    added and the result is sorted.

    Returns:
        A list of property name strings in sorted order.

    Raises:
        SystemExit: with status 1, after printing a message, when one of the
            files cannot be opened.
    """
    ignore_res = [re.compile(pat) for pat in bool_propsignore]
    bplist = []
    bplast = ""

    for filename in bool_propsfiles:
        path = 'Unicode.tables/' + filename
        try:
            # The encoding is given explicitly: emoji-data.txt carries
            # non-ASCII characters in its comments, so relying on the locale
            # default can fail to decode the file.
            file = open(path, 'r', encoding='utf-8')
        except OSError:
            print(f"** Couldn't open {path}\n")
            # This function is called while the module is still being
            # loaded, before the file's "import sys" has been executed, so
            # sys.exit() cannot be used here. Raising SystemExit directly is
            # what sys.exit() does.
            raise SystemExit(1)

        with file:
            for line in file:
                line = re.sub(r'#.*', '', line)
                data = [field.strip() for field in line.split(';')]
                if len(data) <= 1 or data[1] == bplast:
                    continue
                # Remember the name even when it is ignored, so that the
                # following lines for the same property are skipped cheaply.
                bplast = data[1]
                if not any(rx.match(bplast) for rx in ignore_res):
                    bplist.append(bplast)

    bplist.extend(bool_propsextras)
    bplist.sort()
    return bplist
| |
# Build the Boolean property list once, while the module is being loaded.
bool_properties = getbpropslist()

# Number of groups of 32 needed to cover every Boolean property, rounded up
# (presumably the number of 32-bit words in a one-bit-per-property set;
# confirm against the scripts that use this value).
bool_props_list_item_size = (31 + len(bool_properties)) // 32
| |
| |
| |
| # --------------------------------------------------------------------------- |
| # COLLECTING PROPERTY NAMES AND ALIASES |
| # --------------------------------------------------------------------------- |
| |
script_names = ['Unknown']
abbreviations = {}

def collect_property_names():
    """Fill script_names and abbreviations from the Unicode data files.

    Three files under Unicode.tables (relative to the current working
    directory) are read:

    * Scripts.txt: each script name is appended to script_names the first
      time it appears in a run of consecutive lines.
    * PropertyValueAliases.txt: for every "sc" line, abbreviations maps the
      long script name (third field) to a tuple of its other names: empty
      when the short and long names are the same, otherwise the short name
      followed by the optional fourth field.
    * PropertyAliases.txt: for every property whose long name (second field)
      is in bool_properties, abbreviations maps that name to a tuple holding
      the short name followed by the optional third field.

    Both containers are modified in place; nothing is returned.

    Raises:
        OSError: if one of the files cannot be opened.
    """
    names_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_]+) #')

    last_script_name = ""
    with open("Unicode.tables/Scripts.txt", encoding="utf-8") as f:
        for line in f:
            match_obj = names_re.match(line)

            if match_obj is None or match_obj.group(1) == last_script_name:
                continue

            last_script_name = match_obj.group(1)
            script_names.append(last_script_name)

    # Sometimes there is a comment in the line, so splitting around the
    # semicolons is not enough.
    value_alias_re = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_ ]+))?')

    with open("Unicode.tables/PropertyValueAliases.txt", encoding="utf-8") as f:
        for line in f:
            match_obj = value_alias_re.match(line)

            if match_obj is None:
                continue

            prop, short_name, long_name, extra_name = match_obj.groups()
            if prop != "sc":
                continue

            if short_name == long_name:
                abbreviations[long_name] = ()
            elif extra_name is None:
                abbreviations[long_name] = (short_name,)
            else:
                abbreviations[long_name] = (short_name, extra_name)

    # We can also collect Boolean property abbreviations into the same
    # dictionary.

    bin_alias_re = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_]+))?')
    with open("Unicode.tables/PropertyAliases.txt", encoding="utf-8") as f:
        for line in f:
            match_obj = bin_alias_re.match(line)
            if match_obj is None:
                continue

            short_name, long_name, extra_name = match_obj.groups()
            if long_name in bool_properties:
                if extra_name is None:
                    abbreviations[long_name] = (short_name,)
                else:
                    abbreviations[long_name] = (short_name, extra_name)
| |
# Populate script_names and abbreviations now, while the module is being
# loaded, so that the code further down can use them.
collect_property_names()
| |
| |
| |
| # --------------------------------------------------------------------------- |
| # REORDERING SCRIPT NAMES |
| # --------------------------------------------------------------------------- |
| |
script_abbrevs = []

def reorder_scripts():
    """Move the scripts that occur in ScriptExtensions.txt to the front.

    Every name in script_names is paired with its abbreviation: the first
    entry of abbreviations[name], or the name itself when that tuple is
    empty. Unicode.tables/ScriptExtensions.txt (relative to the current
    working directory) is then scanned for the abbreviations it mentions.
    Finally script_names and script_abbrevs are rebound to new lists in
    which the scripts whose abbreviation was mentioned come first, followed
    by all the others; within each group the existing order is kept, and the
    two lists stay index-aligned.

    The abbreviation list is rebuilt from script_names on every call, so
    calling the function more than once is harmless.

    Raises:
        KeyError: if a script name has no entry in abbreviations.
        OSError: if ScriptExtensions.txt cannot be opened.
    """
    global script_names
    global script_abbrevs

    paired = []
    for name in script_names:
        aliases = abbreviations[name]
        paired.append((name, aliases[0] if aliases else name))

    names_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_ ]+) #')

    extended_script_abbrevs = set()
    with open("Unicode.tables/ScriptExtensions.txt", encoding="utf-8") as f:
        for line in f:
            match_obj = names_re.match(line)
            if match_obj is None:
                continue
            # split() without an argument ignores repeated or trailing
            # spaces, so no empty strings end up in the set.
            extended_script_abbrevs.update(match_obj.group(1).split())

    # sorted() is stable and False sorts before True, so this is a stable
    # partition with the "extended" scripts first.
    ordered = sorted(paired, key=lambda pair: pair[1] not in extended_script_abbrevs)

    script_names = [name for name, _ in ordered]
    script_abbrevs = [abbrev for _, abbrev in ordered]
| |
# Put the scripts into their final order while the module is being loaded.
reorder_scripts()

# Position of 'Unknown' in the reordered list, divided by 32 and rounded up.
# Presumably 'Unknown' is the first script that does not occur in
# ScriptExtensions.txt, which would make this the number of groups of 32
# needed to cover the scripts that do; confirm against the callers.
script_list_item_size = (31 + script_names.index('Unknown')) // 32
| |
| |
| # --------------------------------------------------------------------------- |
| # DERIVED LISTS |
| # --------------------------------------------------------------------------- |
| |
| # Create general character property names from the first letters of the |
| # particular categories. |
| |
# category_names alternates codes and comments, so every second item starting
# at index 0 is a two-letter code; its first letter is the general category.
gcn_set = {code[0] for code in category_names[::2]}
general_category_names = sorted(gcn_set)
| |
| |
| # --------------------------------------------------------------------------- |
| # FUNCTIONS |
| # --------------------------------------------------------------------------- |
| |
| import sys |
| |
| # Open an output file, using the command's argument or a default. Write common |
| # preliminary header information. |
| |
def open_output(default):
    """Open the output file and write the standard header comment to it.

    The file name is the command's single argument when one is given,
    otherwise `default`. The header consists of the PCRE banner and copyright
    lines, a note naming the running script (the part of sys.argv[0] after
    the last '/'), and the licence text.

    Args:
        default: file name to use when the command has no argument.

    Returns:
        The file object, open for writing, positioned after the header.

    Raises:
        SystemExit: with status 1, after printing a message, when more than
            one argument is given or the file cannot be opened.
    """
    argc = len(sys.argv)
    if argc > 2:
        print('** Too many arguments: just give a file name')
        sys.exit(1)

    output_name = sys.argv[1] if argc == 2 else default
    try:
        file = open(output_name, "w")
    except IOError:
        print("** Couldn't open %s" % output_name)
        sys.exit(1)

    # Keep only what follows the last '/', or the whole string if there is
    # no '/' in it.
    script_name = sys.argv[0].rsplit('/', 1)[-1]

    banner = """\
/*************************************************
* Perl-Compatible Regular Expressions *
*************************************************/

/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.

Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2022 University of Cambridge

This module is auto-generated from Unicode data files. DO NOT EDIT MANUALLY!
"""

    how_to_regenerate = (
        "Instead, modify the maint/%s script and run it to generate\n"
        "a new version of this code.\n\n" % script_name)

    licence = """\
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
\n"""

    file.writelines((banner, how_to_regenerate, licence))
    return file
| |
| # End of UcpCommon.py |