""" Unicode Mapping Parser and Codec Generator. | |
This script parses Unicode mapping files as available from the Unicode | |
site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec | |
modules from them. The codecs use the standard character mapping codec | |
to actually apply the mapping. | |
Synopsis: gencodec.py dir codec_prefix | |
All files in dir are scanned and those producing non-empty mappings | |
will be written to <codec_prefix><mapname>.py with <mapname> being the | |
first part of the map's filename ('a' in a.b.c.txt) converted to | |
lowercase with hyphens replaced by underscores. | |
The tool also writes marshalled versions of the mapping tables to the | |
same location (with .mapping extension). | |
Written by Marc-Andre Lemburg (mal@lemburg.com). | |
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY. | |
(c) Copyright Guido van Rossum, 2000. | |
Table generation: | |
(c) Copyright Marc-Andre Lemburg, 2005. | |
Licensed to PSF under a Contributor Agreement. | |
"""#" | |
import re, os, marshal, codecs | |
# Maximum allowed size of charmap tables: python_tabledef_code() gives up
# (returns None) when the largest mapped key exceeds this value.
MAX_TABLE_SIZE = 8192

# Standard undefined Unicode code point, used as the table entry for
# codes that have no mapping.
UNI_UNDEFINED = u'\uFFFE'

# Matches one line of a mapping file:
#   group 1 - encoding code(s): one or more '+'-joined hex numbers
#   group 2 - Unicode code(s) and/or <meta> codes, '+'-joined (may be empty)
#   group 3 - optional trailing comment, including the leading '#'
# Raw strings keep the backslashes literal without relying on Python
# passing unknown escape sequences through unchanged.
# NOTE(review): the 'A-Z' range in group 2 looks like a typo for 'A-F';
# it is kept as-is so the set of matched lines does not change.
mapRE = re.compile(r'((?:0x[0-9a-fA-F]+\+?)+)'
                   r'\s+'
                   r'((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)'
                   r'\s*'
                   r'(#.+)?')
def parsecodes(codes, len=len, range=range):

    """ Converts a '+'-joined string of hex codes to a code value.

        Returns None for an empty (or otherwise false) argument, a
        single integer when exactly one code remains, and a tuple of
        integers otherwise.

        When several codes are given, parts that are not valid hex
        numbers (e.g. meta-codes in angular brackets such as <LR> and
        <RL>, or the empty part after a trailing '+') are dropped.  A
        lone code is converted directly, so a malformed one raises
        ValueError.

        The len and range parameters are kept for interface
        compatibility.

    """
    if not codes:
        return None
    parts = codes.split('+')
    if len(parts) == 1:
        # Fast path: no filtering, conversion errors propagate
        return int(parts[0], 16)
    values = []
    for part in parts:
        try:
            values.append(int(part, 16))
        except ValueError:
            # Not a hex number: ignore this part
            continue
    if len(values) == 1:
        return values[0]
    return tuple(values)
def readmap(filename):

    """ Reads the mapping file filename and returns its decoding map.

        The result is a dictionary mapping encoding codes (as returned
        by parsecodes()) to (unicode_code, comment) tuples.  Codes
        0x00 - 0x1F and 0x7F are pre-seeded as identity mappings with
        the comment 'CONTROL CHARACTER'; lines in the file override
        them.

        If at least as many single-byte codes are identity-mapped as
        are left unmapped, every unmapped single-byte code is mapped to
        (None, "") and the special key 'IDENTITY' is set to 256.

    """
    # The context manager closes the file even if reading fails
    with open(filename, 'r') as f:
        lines = f.readlines()

    enc2uni = {}
    identity = []
    # An explicit list is needed because entries are removed below
    unmapped = list(range(256))

    # UTC mapping tables per convention don't include the identity
    # mappings for code points 0x00 - 0x1F and 0x7F, unless these are
    # explicitly mapped to different characters or undefined
    for i in list(range(32)) + [127]:
        identity.append(i)
        unmapped.remove(i)
        enc2uni[i] = (i, 'CONTROL CHARACTER')

    for line in lines:
        line = line.strip()
        if not line or line[0] == '#':
            continue
        m = mapRE.match(line)
        if not m:
            continue
        enc, uni, comment = m.groups()
        enc = parsecodes(enc)
        uni = parsecodes(uni)
        if comment is None:
            comment = ''
        else:
            # Drop the leading '#'
            comment = comment[1:].strip()
        # Only single-byte codes take part in the identity/unmapped
        # bookkeeping; multi-code (tuple) keys are stored as they are
        if not isinstance(enc, tuple) and enc < 256:
            if enc in unmapped:
                unmapped.remove(enc)
            if enc == uni:
                identity.append(enc)
        enc2uni[enc] = (uni, comment)

    # If there are more identity-mapped entries than unmapped entries,
    # it pays to generate an identity dictionary first, and add explicit
    # mappings to None for the rest
    if len(identity) >= len(unmapped):
        for enc in unmapped:
            enc2uni[enc] = (None, "")
        enc2uni['IDENTITY'] = 256

    return enc2uni
def hexrepr(t, precision=4):

    """ Returns the hex literal representation of t as a string.

        None gives 'None'.  A value without a length (a plain integer)
        gives '0x' followed by at least precision upper-case hex
        digits.  Any other value is iterated and rendered as a
        parenthesized, comma-separated list of such literals.

        Raises TypeError (after printing a diagnostic) if an item of a
        sequence cannot be formatted as a hex number.

    """
    if t is None:
        return 'None'
    try:
        len(t)
    except TypeError:
        # No length: treat t as a single code
        return '0x%0*X' % (precision, t)
    try:
        return '(' + ', '.join(['0x%0*X' % (precision, item)
                                for item in t]) + ')'
    except TypeError as why:
        print('* failed to convert %r: %s' % (t, why))
        raise
def python_mapdef_code(varname, map, comments=1, precisions=(2, 4)):

    """ Returns a list of source lines defining the dictionary varname.

        map maps keys to values; either side may be a 2-tuple, in which
        case it is unpacked and its second item is used as the line's
        comment.  Entries whose key is None are skipped.  precisions
        gives the hex digit counts for (keys, values); comments are
        only emitted if comments is true.

        If map contains the key 'IDENTITY', the generated code starts
        from codecs.make_identity_dict() and identity entries below 256
        are left out.  The 'IDENTITY' key is deleted from map, i.e. the
        caller's dictionary is modified; codegen() passes the same map
        on to further processing after this call.

    """
    code = []
    identity = "IDENTITY" in map
    if identity:
        code.append("%s = codecs.make_identity_dict(range(%d))" %
                    (varname, map["IDENTITY"]))
        code.append("%s.update({" % varname)
        # Strip the marker so that only real mappings remain in map
        del map["IDENTITY"]
        splits = 1
    else:
        code.append("%s = {" % varname)
        splits = 0

    entries = sorted(map.items())
    key_precision, value_precision = precisions
    emitted = 0
    for mapkey, mapvalue in entries:
        mapcomment = ''
        if isinstance(mapkey, tuple):
            mapkey, mapcomment = mapkey
        if isinstance(mapvalue, tuple):
            mapvalue, mapcomment = mapvalue
        if mapkey is None:
            continue
        if identity and mapkey == mapvalue and mapkey < 256:
            # No need to include identity mappings, since these
            # are already set for the first 256 code points.
            continue
        key = hexrepr(mapkey, key_precision)
        value = hexrepr(mapvalue, value_precision)
        if mapcomment and comments:
            code.append(' %s: %s,\t# %s' % (key, value, mapcomment))
        else:
            code.append(' %s: %s,' % (key, value))
        emitted += 1
        if emitted == 4096:
            # Split the definition into parts so that the Python
            # parser doesn't dump core
            code.append('}' if splits == 0 else '})')
            code.append('%s.update({' % varname)
            emitted = 0
            splits += 1

    # Close the literal (first part) or the pending update() call
    code.append('}' if splits == 0 else '})')
    return code
def python_tabledef_code(varname, map, comments=1, key_precision=2):

    """ Returns a list of source lines defining the tuple varname.

        The tuple has one character entry per key from 0 up to the
        largest key in map; keys without a mapping, or mapped to None,
        get UNI_UNDEFINED.  Keys or values given as 2-tuples are
        unpacked, the second item being used as the line's comment.
        Comments are only emitted if comments is true; key_precision is
        the number of hex digits used for the key in the comment.

        Returns None if the largest key exceeds MAX_TABLE_SIZE or if
        any value is still a tuple after unpacking (1-n mapping).

        If map contains the key 'IDENTITY', the first 256 entries start
        out as identity mappings and the key is deleted from map, i.e.
        the caller's dictionary is modified.

    """
    l = []
    append = l.append
    append('%s = (' % varname)

    # Analyze map and create table dict
    table = {}
    maxkey = 0
    if 'IDENTITY' in map:
        for key in range(256):
            table[key] = (key, '')
        maxkey = 255
        del map['IDENTITY']
    # The mappings are collected only after the 'IDENTITY' marker has
    # been removed; otherwise the marker itself would be processed as
    # a table key and corrupt maxkey.
    mappings = sorted(map.items())
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey is None:
            continue
        table[mapkey] = (mapvalue, mapcomment)
        if mapkey > maxkey:
            maxkey = mapkey
    if maxkey > MAX_TABLE_SIZE:
        # Table too large
        return None

    # Create table code
    for key in range(maxkey + 1):
        if key not in table:
            mapvalue = None
            mapcomment = 'UNDEFINED'
        else:
            mapvalue, mapcomment = table[key]
        if mapvalue is None:
            mapchar = UNI_UNDEFINED
        elif isinstance(mapvalue, tuple):
            # 1-n mappings not supported
            return None
        else:
            mapchar = unichr(mapvalue)
        if mapcomment and comments:
            append(' %r\t# %s -> %s' % (mapchar,
                                        hexrepr(key, key_precision),
                                        mapcomment))
        else:
            append(' %r' % mapchar)
    append(')')
    return l
def codegen(name, map, encodingname, comments=1):

    """ Returns Python source for the given map.

        name is inserted into the generated module's docstring as the
        source of the mapping, encodingname as the codec's name (with
        underscores replaced by hyphens in the registry entry).

        A decoding table is used if python_tabledef_code() can build
        one; otherwise the module falls back to decoding and encoding
        dictionaries.

        Comments are included in the source, if comments is true (default).

        Note that map is modified: python_mapdef_code() removes an
        'IDENTITY' key from it.

    """
    # Generate code.  The order matters: python_mapdef_code() strips the
    # 'IDENTITY' marker from map, so the later steps only see real
    # mappings.
    decoding_map_code = python_mapdef_code(
        'decoding_map',
        map,
        comments=comments)
    decoding_table_code = python_tabledef_code(
        'decoding_table',
        map,
        comments=comments)
    encoding_map_code = python_mapdef_code(
        'encoding_map',
        codecs.make_encoding_map(map),
        comments=comments,
        precisions=(4, 2))

    if decoding_table_code:
        suffix = 'table'
    else:
        suffix = 'map'

    # The templates below are emitted verbatim, so their bodies carry
    # the indentation the generated module needs in order to parse.
    l = [
        '''\
""" Python Character Mapping Codec %s generated from '%s' with gencodec.py.
"""#"
import codecs
### Codec APIs
class Codec(codecs.Codec):
    def encode(self,input,errors='strict'):
        return codecs.charmap_encode(input,errors,encoding_%s)
    def decode(self,input,errors='strict'):
        return codecs.charmap_decode(input,errors,decoding_%s)
''' % (encodingname, name, suffix, suffix)]
    l.append('''\
class IncrementalEncoder(codecs.IncrementalEncoder):
    def encode(self, input, final=False):
        return codecs.charmap_encode(input,self.errors,encoding_%s)[0]
class IncrementalDecoder(codecs.IncrementalDecoder):
    def decode(self, input, final=False):
        return codecs.charmap_decode(input,self.errors,decoding_%s)[0]''' %
        (suffix, suffix))

    l.append('''
class StreamWriter(Codec,codecs.StreamWriter):
    pass
class StreamReader(Codec,codecs.StreamReader):
    pass
### encodings module API
def getregentry():
    return codecs.CodecInfo(
        name=%r,
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
''' % encodingname.replace('_', '-'))

    # Add decoding table or map (with preference to the table)
    if not decoding_table_code:
        l.append('''
### Decoding Map
''')
        l.extend(decoding_map_code)
    else:
        l.append('''
### Decoding Table
''')
        l.extend(decoding_table_code)

    # Add encoding map
    if decoding_table_code:
        l.append('''
### Encoding table
encoding_table=codecs.charmap_build(decoding_table)
''')
    else:
        l.append('''
### Encoding Map
''')
        l.extend(encoding_map_code)

    # Final new-line
    l.append('')

    return '\n'.join(l).expandtabs()
def pymap(name,map,pyfile,encodingname,comments=1):

    """ Generates the codec source for map and writes it to pyfile.

        name, map, encodingname and comments are passed on to
        codegen(); pyfile is the path of the module to (over)write.

    """
    code = codegen(name,map,encodingname,comments)
    # The context manager closes the file even if the write fails
    with open(pyfile,'w') as f:
        f.write(code)
def marshalmap(name,map,marshalfile):

    """ Writes a marshalled copy of map to the file marshalfile.

        Every value of map must unpack into two items; the copy stores
        them as a 2-tuple per key.  name is not used.

    """
    d = dict((e, (u, c)) for e, (u, c) in map.items())
    # The context manager closes the file even if dumping fails
    with open(marshalfile,'wb') as f:
        marshal.dump(d,f)
def convertdir(dir, dirprefix='', nameprefix='', comments=1):

    """ Converts every mapping file in dir to a codec module.

        For each regular file in dir, the codec name is nameprefix plus
        the part of the file name before the first '.', lower-cased and
        with hyphens replaced by underscores.  The codec source goes to
        dirprefix + name + '.py' and the marshalled map to
        dirprefix + name + '.mapping'.

        A ValueError raised during conversion is reported and then
        re-raised.

    """
    for mapname in os.listdir(dir):
        mappathname = os.path.join(dir, mapname)
        if not os.path.isfile(mappathname):
            continue
        basename = os.path.split(mapname)[1]
        name = nameprefix + basename.replace('-', '_').split('.')[0].lower()
        codefile = name + '.py'
        marshalfile = name + '.mapping'
        print('converting %s to %s and %s' % (mapname,
                                              dirprefix + codefile,
                                              dirprefix + marshalfile))
        try:
            map = readmap(mappathname)
            if map:
                pymap(mappathname, map, dirprefix + codefile, name, comments)
                marshalmap(mappathname, map, dirprefix + marshalfile)
            else:
                print('* map is empty; skipping')
        except ValueError as why:
            print('* conversion failed: %s' % why)
            raise
def rewritepythondir(dir, dirprefix='', comments=1):

    """ Regenerates codec modules from the marshalled maps in dir.

        Every file in dir ending in '.mapping' is loaded with marshal
        and, unless the map is empty, written out as codec source to
        dirprefix + name + '.py', where name is the file name without
        the '.mapping' extension.

        A ValueError raised during conversion is reported and the
        remaining files are still processed.

    """
    mapnames = os.listdir(dir)
    for mapname in mapnames:
        if not mapname.endswith('.mapping'):
            continue
        name = mapname[:-len('.mapping')]
        codefile = name + '.py'
        print('converting %s to %s' % (mapname,
                                       dirprefix + codefile))
        try:
            # The context manager closes the file once the map is loaded
            with open(os.path.join(dir,mapname), 'rb') as f:
                map = marshal.load(f)
            if not map:
                print('* map is empty; skipping')
            else:
                pymap(mapname, map, dirprefix + codefile,name,comments)
        except ValueError as why:
            print('* conversion failed: %s' % why)
if __name__ == '__main__':

    import sys

    # Command line arguments are passed on positionally
    args = sys.argv[1:]

    # Hard-wired switch: convert mapping files (default) or, if
    # changed to 0, regenerate the modules from .mapping files
    if 1:
        convertdir(*args)
    else:
        rewritepythondir(*args)