| import re, unicodedata, sys
|
|
|
# Refuse to run on a narrow (UCS-2) build, where sys.maxunicode is 0xFFFF.
# The helpers in this script convert every code point up to 0x10FFFF into a
# single character.
if sys.maxunicode == 0xFFFF:
    raise RuntimeError("need UCS-4 Python")
|
|
|
def gen_category(cats):
    """Yield each code point whose Unicode general category is in *cats*.

    cats is a container of category names such as ["Cn"] or ["Cc"].
    Code points are yielded as integers in increasing order, scanning
    0 through 0x10FFFF.
    """
    # Python 2 spells the code-point-to-character builtin ``unichr``;
    # where that name does not exist, ``chr`` does the same job.
    try:
        to_char = unichr
    except NameError:
        to_char = chr
    for i in range(0, 0x110000):
        if unicodedata.category(to_char(i)) in cats:
            yield i
|
|
|
def gen_bidirectional(cats):
    """Yield each code point whose bidirectional class is in *cats*.

    cats is a container of bidirectional class names such as ["R", "AL"].
    Code points are yielded as integers in increasing order, scanning
    0 through 0x10FFFF.
    """
    # Python 2 spells the code-point-to-character builtin ``unichr``;
    # where that name does not exist, ``chr`` does the same job.
    try:
        to_char = unichr
    except NameError:
        to_char = chr
    for i in range(0, 0x110000):
        if unicodedata.bidirectional(to_char(i)) in cats:
            yield i
|
|
|
def compact_set(l):
    """Return Python source text for a set holding the integers in *l*.

    Runs of consecutive values are collapsed into ``range(a,b)`` terms and
    the remaining values are listed literally, e.g. ``[1, 2, 3, 4, 10]``
    becomes ``"set([10] + range(1,5))``.  Callers in this script pass the
    values sorted, which is what makes runs detectable.

    An empty input yields ``"set()"``.
    """
    singles = []   # values emitted one by one
    ranges = []    # half-open (start, stop) pairs emitted as range(start,stop)
    run_start = None
    span = 0       # number of values following run_start in the current run
    for e in l:
        if run_start is None:
            run_start = e
            span = 0
            continue
        if run_start + span + 1 == e:
            # e extends the current run
            span += 1
            continue
        # The run is over: only runs of more than three values become a
        # range(); shorter ones are listed individually.
        if span > 2:
            ranges.append((run_start, run_start + span + 1))
        else:
            singles.extend(range(run_start, run_start + span + 1))
        run_start = e
        span = 0

    if run_start is None:
        # Nothing was seen at all; without this guard None would be
        # emitted as a set member.
        return "set()"

    # Flush the final run.  Unlike the in-loop case, any trailing run of two
    # or more values is written as a range(); kept as-is so the generated
    # text does not change.
    if span:
        ranges.append((run_start, run_start + span + 1))
    else:
        singles.append(run_start)

    # NOTE(review): the generated expression adds range(...) results
    # together, which relies on range() returning a list.
    range_expr = " + ".join(["range(%d,%d)" % r for r in ranges])
    if not singles:
        return "set(%s)" % range_expr
    if not range_expr:
        return "set(%s)" % repr(singles)
    return "set(%s + %s)" % (repr(singles), range_expr)
|
|
|
| ############## Read the tables in the RFC #######################
|
|
|
# Load the RFC text; the file is closed as soon as it has been read.
with open("rfc3454.txt") as rfc_file:
    data = rfc_file.readlines()

# Table delimiters look like "----- Start Table C.1.1 -----".  The dots of
# the table name are escaped so that only a literal "." separates its parts.
table_marker = re.compile(r"----- (Start|End) Table ([A-Z](\.[0-9])+) -----")

tables = []      # (name, {code point: value}) pairs, in file order
curname = None   # name of the table currently being read, None outside one
for line in data:
    line = line.strip()
    if not line:
        continue
    # Skip RFC page breaks
    if line.startswith(("Hoffman & Blanchet", "RFC 3454")):
        continue
    # Find start/end lines
    m = table_marker.match(line)
    if m:
        if m.group(1) == "Start":
            if curname:
                raise RuntimeError("Double Start", (curname, line))
            curname = m.group(2)
            table = {}
            tables.append((curname, table))
        else:
            if not curname:
                raise RuntimeError("End without start", line)
            curname = None
        continue
    if not curname:
        continue
    # Now we are in a table
    fields = line.split(";")
    if len(fields) > 1:
        # Drop comment field
        fields = fields[:-1]
    if len(fields) == 1:
        # Set-style entry: one code point, or a "start-end" range.
        # Every member maps to itself.
        bounds = fields[0].split("-")
        if len(bounds) > 2:
            raise RuntimeError("Unpacking problem", line)
        start = int(bounds[0], 16)
        end = int(bounds[-1], 16)
        for i in range(start, end + 1):
            table[i] = i
    else:
        # Mapping-style entry: code point -> list of replacement code points
        code, value = fields
        value = value.strip()
        if value:
            value = [int(v, 16) for v in value.split(" ")]
        else:
            # table B.1
            value = None
        table[int(code, 16)] = value
|
|
|
| ########### Generate compact Python versions of the tables #############
|
|
|
# Emit the fixed preamble of the generated module: the "generated file"
# banner, the module docstring and its import.
print('''# This file is generated by mkstringprep.py. DO NOT EDIT.
"""Library that exposes various tables found in the StringPrep RFC 3454.

There are two kinds of tables: sets, for which a member test is provided,
and mappings, for which a mapping function is provided.
"""

import unicodedata
''')

# Pin the generated module to the Unicode database version that was in use
# when the tables were generated.
print("assert unicodedata.unidata_version == %s"
      % repr(unicodedata.unidata_version))
|
|
|
# A.1 is the table of unassigned characters
# XXX Plane 15 PUA is listed as unassigned in Python.
# Tables are consumed from the front of `tables`; the assert checks that the
# expected one is next.
name, table = tables.pop(0)
assert name == "A.1"
table = set(table)
Cn = set(gen_category(["Cn"]))

# FDD0..FDEF are process internal codes
Cn.difference_update(range(0xFDD0, 0xFDF0))
# not a character
Cn.difference_update(range(0xFFFE, 0x110000, 0x10000))
Cn.difference_update(range(0xFFFF, 0x110000, 0x10000))

# Consistency check left disabled in the original:
# assert table == Cn

print("""
def in_table_a1(code):
    if unicodedata.category(code) != 'Cn': return False
    c = ord(code)
    if 0xFDD0 <= c < 0xFDF0: return False
    return (c & 0xFFFF) not in (0xFFFE, 0xFFFF)
""")
|
|
|
# B.1 cannot easily be derived, so its members are written out literally
# as a compacted set expression.
name, table = tables.pop(0)
assert name == "B.1"
table = sorted(table)
print("""
b1_set = %s
def in_table_b1(code):
    return ord(code) in b1_set
""" % compact_set(table))
|
|
|
# B.2 and B.3 are case folding.
# It takes CaseFolding.txt into account, which is
# not available in the Python database. Since
# B.2 is derived from B.3, we process B.3 first.
# B.3 supposedly *is* CaseFolding-3.2.0.txt.

name, table_b2 = tables.pop(0)
assert name == "B.2"

name, table_b3 = tables.pop(0)
assert name == "B.3"

# B.3 is mostly Python's .lower, except for a number
# of special cases, e.g. considering canonical forms.
# Only the entries where .lower() gives a different result are recorded.
# NOTE(review): the loop reads table_b2 rather than table_b3 -- confirm
# that this is intended.

b3_exceptions = {}

for k, v in table_b2.items():
    lowered = [ord(ch) for ch in unichr(k).lower()]
    if lowered != v:
        b3_exceptions[k] = u"".join([unichr(cp) for cp in v])

b3 = sorted(b3_exceptions.items())
|
|
|
# Emit the b3_exceptions dict literal, four entries per row, followed by
# the map_table_b3() function that consults it.
print("""
b3_exceptions = {""")
emit = sys.stdout.write
for i, (k, v) in enumerate(b3):
    # Entries sharing a row are separated by a single space.
    if i % 4:
        emit(" ")
    emit("0x%x:%s," % (k, repr(v)))
    if i % 4 == 3:
        emit("\n")
# After an unfinished row the closing brace follows on the same line.
emit(" }\n" if len(b3) % 4 else "}\n")

print("""
def map_table_b3(code):
    r = b3_exceptions.get(ord(code))
    if r is not None: return r
    return code.lower()
""")
|
|
|
def map_table_b3(code):
    """Return the B.3 mapping of the single character *code*.

    The module-level b3_exceptions dict is looked up by code point first;
    when it has no (non-None) entry, the result is code.lower().
    """
    override = b3_exceptions.get(ord(code))
    if override is None:
        return code.lower()
    return override
|
|
|
# B.2 is case folding for NFKC. This is the same as B.3,
# except where NormalizeWithKC(Fold(a)) !=
# NormalizeWithKC(Fold(NormalizeWithKC(Fold(a))))

def map_table_b2(a):
    """Return the B.2 mapping of the single character *a*.

    The character is folded with map_table_b3() and NFKC-normalized; that
    result is folded character by character and normalized once more.  If
    the second pass changed anything, its result is returned, otherwise
    the plain map_table_b3() folding of *a*.
    """
    folded = map_table_b3(a)
    first_pass = unicodedata.normalize("NFKC", folded)
    refolded = u"".join([map_table_b3(ch) for ch in first_pass])
    second_pass = unicodedata.normalize("NFKC", refolded)
    if first_pass == second_pass:
        return folded
    return second_pass
|
|
|
# Check the computed B.2 mapping against the table read from the RFC and
# collect every entry on which they disagree.
specials = {}
for k, v in table_b2.items():
    computed = [ord(ch) for ch in map_table_b2(unichr(k))]
    if computed != v:
        specials[k] = v

# B.3 should not add any additional special cases
assert specials == {}

print("""
def map_table_b2(a):
    al = map_table_b3(a)
    b = unicodedata.normalize("NFKC", al)
    bl = u"".join([map_table_b3(ch) for ch in b])
    c = unicodedata.normalize("NFKC", bl)
    if b != c:
        return c
    else:
        return al
""")
|
|
|
# C.1.1 is a table with a single character: U+0020, mapped to itself.
name, table = tables.pop(0)
assert name == "C.1.1"
assert table == {0x20: 0x20}

print("""
def in_table_c11(code):
    return code == u" "
""")
|
|
|
# C.1.2 is the rest of all space characters
name, table = tables.pop(0)
assert name == "C.1.2"

# Consistency check left disabled in the original:
# table = set(table.keys())
# Zs = set(gen_category(["Zs"])) - set([0x20])
# assert Zs == table

print("""
def in_table_c12(code):
    return unicodedata.category(code) == "Zs" and code != u" "

def in_table_c11_c12(code):
    return unicodedata.category(code) == "Zs"
""")
|
|
|
# C.2.1 ASCII control characters
name, table_c21 = tables.pop(0)
assert name == "C.2.1"

# The table must be exactly the category-Cc code points below 128.
Cc = set(gen_category(["Cc"]))
Cc_ascii = Cc.intersection(range(128))
table_c21 = set(table_c21)
assert Cc_ascii == table_c21

print("""
def in_table_c21(code):
    return ord(code) < 128 and unicodedata.category(code) == "Cc"
""")
|
|
|
# C.2.2 Non-ASCII control characters. It also includes
# a number of characters in category Cf.
name, table_c22 = tables.pop(0)
assert name == "C.2.2"

# Every non-ASCII Cc code point has to be in the table ...
Cc_nonascii = Cc.difference(Cc_ascii)
table_c22 = set(table_c22)
assert len(Cc_nonascii.difference(table_c22)) == 0

# ... and whatever else the table lists is emitted as an explicit set.
specials = sorted(table_c22.difference(Cc_nonascii))

print("""c22_specials = %s
def in_table_c22(code):
    c = ord(code)
    if c < 128: return False
    if unicodedata.category(code) == "Cc": return True
    return c in c22_specials

def in_table_c21_c22(code):
    return unicodedata.category(code) == "Cc" or \\
           ord(code) in c22_specials
""" % compact_set(specials))
|
|
|
# C.3 Private use
name, table = tables.pop(0)
assert name == "C.3"

# The table must match general category Co exactly.
Co = set(gen_category(["Co"]))
assert set(table) == Co

print("""
def in_table_c3(code):
    return unicodedata.category(code) == "Co"
""")
|
|
|
# C.4 Non-character code points, xFFFE, xFFFF
# plus process internal codes
name, table = tables.pop(0)
assert name == "C.4"

# FDD0..FDEF, plus the last two code points of every plane.
nonchar = set(range(0xFDD0, 0xFDF0))
nonchar.update(range(0xFFFE, 0x110000, 0x10000))
nonchar.update(range(0xFFFF, 0x110000, 0x10000))
table = set(table)
assert table == nonchar

print("""
def in_table_c4(code):
    c = ord(code)
    if c < 0xFDD0: return False
    if c < 0xFDF0: return True
    return (ord(code) & 0xFFFF) in (0xFFFE, 0xFFFF)
""")
|
|
|
# C.5 Surrogate codes
name, table = tables.pop(0)
assert name == "C.5"

# The table must match general category Cs exactly.
Cs = set(gen_category(["Cs"]))
assert set(table) == Cs

print("""
def in_table_c5(code):
    return unicodedata.category(code) == "Cs"
""")
|
|
|
# C.6 Inappropriate for plain text
# Emitted as an explicit set of code points.
name, table = tables.pop(0)
assert name == "C.6"

table = sorted(table)

print("""
c6_set = %s
def in_table_c6(code):
    return ord(code) in c6_set
""" % compact_set(table))
|
|
|
# C.7 Inappropriate for canonical representation
# Emitted as an explicit set of code points.
name, table = tables.pop(0)
assert name == "C.7"

table = sorted(table)

print("""
c7_set = %s
def in_table_c7(code):
    return ord(code) in c7_set
""" % compact_set(table))
|
|
|
# C.8 Change display properties or are deprecated
# Emitted as an explicit set of code points.
name, table = tables.pop(0)
assert name == "C.8"

table = sorted(table)

print("""
c8_set = %s
def in_table_c8(code):
    return ord(code) in c8_set
""" % compact_set(table))
|
|
|
# C.9 Tagging characters
# Emitted as an explicit set of code points.
name, table = tables.pop(0)
assert name == "C.9"

table = sorted(table)

print("""
c9_set = %s
def in_table_c9(code):
    return ord(code) in c9_set
""" % compact_set(table))
|
|
|
# D.1 Characters with bidirectional property "R" or "AL"
name, table = tables.pop(0)
assert name == "D.1"

# The table must match the database's R and AL code points exactly.
RandAL = set(gen_bidirectional(["R", "AL"]))
assert set(table) == RandAL

print("""
def in_table_d1(code):
    return unicodedata.bidirectional(code) in ("R","AL")
""")
|
|
|
# D.2 Characters with bidirectional property "L"
name, table = tables.pop(0)
assert name == "D.2"

# The table must match the database's L code points exactly.
L = set(gen_bidirectional(["L"]))
assert set(table) == L

print("""
def in_table_d2(code):
    return unicodedata.bidirectional(code) == "L"
""")
|