|  | """ Unicode Mapping Parser and Codec Generator. | 
|  |  | 
|  | This script parses Unicode mapping files as available from the Unicode | 
|  | site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec | 
|  | modules from them. The codecs use the standard character mapping codec | 
|  | to actually apply the mapping. | 
|  |  | 
|  | Synopsis: gencodec.py dir codec_prefix | 
|  |  | 
|  | All files in dir are scanned and those producing non-empty mappings | 
|  | will be written to <codec_prefix><mapname>.py with <mapname> being the | 
|  | first part of the map's filename ('a' in a.b.c.txt) converted to | 
|  | lowercase with hyphens replaced by underscores. | 
|  |  | 
|  | The tool also writes marshalled versions of the mapping tables to the | 
|  | same location (with .mapping extension). | 
|  |  | 
|  | Written by Marc-Andre Lemburg (mal@lemburg.com). | 
|  |  | 
|  | (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. | 
|  | (c) Copyright Guido van Rossum, 2000. | 
|  |  | 
|  | Table generation: | 
|  | (c) Copyright Marc-Andre Lemburg, 2005. | 
|  | Licensed to PSF under a Contributor Agreement. | 
|  |  | 
|  | """#" | 
|  |  | 
|  | import re, os, marshal, codecs | 
|  |  | 
# Maximum allowed size of charmap tables
MAX_TABLE_SIZE = 8192

# Standard undefined Unicode code point (U+FFFE); emitted as the
# placeholder character for undefined entries in decoding tables.
# Written as a literal so that no interpreter-specific builtin is needed.
UNI_UNDEFINED = u'\ufffe'

# Matches one data line of a mapping file:
#   group 1 - one or more '+'-joined hex codes (the encoded form)
#   group 2 - zero or more '+'-joined hex codes and/or <meta> codes
#   group 3 - optional trailing '#' comment
# Raw strings are used so that the regex escapes (\s, \+) are not
# subject to Python's own string escape processing.
mapRE = re.compile(r'((?:0x[0-9a-fA-F]+\+?)+)'
                   r'\s+'
                   r'((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)'
                   r'\s*'
                   r'(#.+)?')
|  |  | 
def parsecodes(codes, len=len, range=range):

    """ Converts a '+'-separated code string to an integer or a
        tuple of integers.

        Each part is parsed as a base-16 number.  When the string has
        several parts, those that are not valid hex numbers (e.g. the
        meta-codes <LR> and <RL>) are dropped; if exactly one number
        remains it is returned as plain integer, otherwise the
        remaining numbers are returned as tuple (which may be empty).

        An empty string (or None) is returned as None.  A string
        consisting of a single part is converted directly, so a
        malformed single code raises ValueError.

    """
    if not codes:
        return None
    parts = codes.split('+')
    if len(parts) == 1:
        # Lone code: no filtering, conversion errors propagate.
        return int(parts[0], 16)
    values = []
    for part in parts:
        try:
            values.append(int(part, 16))
        except ValueError:
            # Not a hex number (meta-code): leave it out.
            continue
    if len(values) == 1:
        return values[0]
    return tuple(values)
|  |  | 
def readmap(filename):

    """ Reads the mapping file filename and returns the decoding map.

        The result is a dictionary mapping each encoded code (an
        integer, or a tuple of integers for multi-code entries) to a
        tuple (uni, comment), where uni is an integer, a tuple of
        integers or None (undefined) and comment is the text following
        '#' on the line.

        If at least as many single-byte codes map to themselves as
        are left unmapped, the unmapped codes are added with a None
        target and the extra entry 'IDENTITY': 256 is stored to mark
        the map as "identity dictionary plus explicit updates".

        Blank lines, comment lines and lines not matching mapRE are
        skipped.

    """
    # The context manager closes the file even if reading fails.
    with open(filename, 'r') as f:
        lines = f.readlines()
    enc2uni = {}
    identity = []
    # Explicit lists: items are removed below and a list is appended.
    unmapped = list(range(256))

    # UTC mapping tables per convention don't include the identity
    # mappings for code points 0x00 - 0x1F and 0x7F, unless these are
    # explicitly mapped to different characters or undefined
    for i in list(range(32)) + [127]:
        identity.append(i)
        unmapped.remove(i)
        enc2uni[i] = (i, 'CONTROL CHARACTER')

    for line in lines:
        line = line.strip()
        if not line or line[0] == '#':
            continue
        m = mapRE.match(line)
        if not m:
            #print('* not matched: %s' % repr(line))
            continue
        enc, uni, comment = m.groups()
        enc = parsecodes(enc)
        uni = parsecodes(uni)
        if comment is None:
            comment = ''
        else:
            # Drop the leading '#'.
            comment = comment[1:].strip()
        # Only single codes below 256 take part in the identity /
        # unmapped statistics; tuple (multi-code) keys are excluded
        # explicitly instead of relying on cross-type ordering.
        if not isinstance(enc, tuple) and enc < 256:
            if enc in unmapped:
                unmapped.remove(enc)
            if enc == uni:
                identity.append(enc)
        enc2uni[enc] = (uni, comment)

    # If there are more identity-mapped entries than unmapped entries,
    # it pays to generate an identity dictionary first, and add explicit
    # mappings to None for the rest
    if len(identity) >= len(unmapped):
        for enc in unmapped:
            enc2uni[enc] = (None, "")
        enc2uni['IDENTITY'] = 256

    return enc2uni
|  |  | 
def hexrepr(t, precision=4):

    """ Returns source text for a code value in upper-case hex.

        t may be None (returned as 'None'), a single integer
        (returned as e.g. '0x0041') or a sequence of integers
        (returned as e.g. '(0x0041, 0x0042)').  precision is the
        minimum number of hex digits, zero-padded.

        If a sequence item cannot be formatted as hex, a diagnostic
        line is printed and the TypeError is re-raised.

    """
    if t is None:
        return 'None'
    try:
        len(t)
    except TypeError:
        # Objects without a length are treated as a single code.
        return '0x%0*X' % (precision, t)
    try:
        return '(' + ', '.join(['0x%0*X' % (precision, item)
                                for item in t]) + ')'
    except TypeError as why:
        print('* failed to convert %r: %s' % (t, why))
        raise
|  |  | 
def python_mapdef_code(varname, map, comments=1, precisions=(2, 4)):

    """ Returns the source lines of a dictionary definition for map.

        varname is the name the dictionary is bound to.  Keys and
        values of map may be plain codes or (code, comment) tuples;
        the comment is appended to the generated line if comments is
        true.  precisions gives the minimum number of hex digits used
        for keys and values.

        If map has an 'IDENTITY' entry, the definition starts from
        codecs.make_identity_dict() and only differing entries are
        written.  Note that the 'IDENTITY' entry is deleted from map
        in place.

        The literal is split into chunks of 4096 entries, each
        applied via <varname>.update().

    """
    lines = []
    emit = lines.append

    has_identity = "IDENTITY" in map
    if has_identity:
        emit("%s = codecs.make_identity_dict(range(%d))" %
             (varname, map["IDENTITY"]))
        emit("%s.update({" % varname)
        # In-place removal: the marker must not show up as a mapping.
        del map["IDENTITY"]
        # The first literal is already wrapped in an update() call.
        parts = 1
    else:
        emit("%s = {" % varname)
        parts = 0

    entries = sorted(map.items())
    key_digits, value_digits = precisions
    pending = 0
    for code, target in entries:
        note = ''
        if isinstance(code, tuple):
            code, note = code
        if isinstance(target, tuple):
            target, note = target
        if code is None:
            continue
        if has_identity and code == target and code < 256:
            # No need to include identity mappings, since these
            # are already set for the first 256 code points.
            continue
        key_text = hexrepr(code, key_digits)
        value_text = hexrepr(target, value_digits)
        if note and comments:
            emit('    %s: %s,\t#  %s' % (key_text, value_text, note))
        else:
            emit('    %s: %s,' % (key_text, value_text))
        pending += 1
        if pending == 4096:
            # Split the definition into parts so that the Python
            # parser doesn't dump core
            if parts:
                emit('})')
            else:
                emit('}')
            emit('%s.update({' % varname)
            pending = 0
            parts += 1

    # Close whichever literal is still open.
    if parts:
        emit('})')
    else:
        emit('}')

    return lines
|  |  | 
def python_tabledef_code(varname, map, comments=1, key_precision=2):

    """ Returns the source lines of a decoding table definition for
        map, or None if no table can be generated.

        The table is written as parenthesized sequence of one
        character literal per line, indexed by code from 0 up to the
        largest key in map.  Codes without a mapping, or mapped to
        None, are written as UNI_UNDEFINED.  If comments is true,
        each line carries the hex code (key_precision digits) and the
        mapping's comment.

        None is returned if the largest key exceeds MAX_TABLE_SIZE or
        if any code maps to a tuple of code points (1-n mapping).

        If map has an 'IDENTITY' entry, codes 0-255 are preset to
        map to themselves and the entry is deleted from map in place.

    """
    l = []
    append = l.append
    append('%s = (' % varname)

    # Analyze map and create table dict
    table = {}
    maxkey = 0
    if 'IDENTITY' in map:
        for key in range(256):
            table[key] = (key, '')
        maxkey = 255
        del map['IDENTITY']

    # Take the sorted snapshot only after the marker has been removed;
    # otherwise the marker itself would be processed as a mapping and
    # make the table look oversized.
    mappings = sorted(map.items())
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey is None:
            continue
        table[mapkey] = (mapvalue, mapcomment)
        if mapkey > maxkey:
            maxkey = mapkey
    if maxkey > MAX_TABLE_SIZE:
        # Table too large
        return None

    # Use unichr() where it exists, chr() otherwise.
    try:
        code_to_char = unichr
    except NameError:
        code_to_char = chr

    # Create table code
    for key in range(maxkey + 1):
        if key not in table:
            mapvalue = None
            mapcomment = 'UNDEFINED'
        else:
            mapvalue, mapcomment = table[key]
        if mapvalue is None:
            mapchar = UNI_UNDEFINED
        elif isinstance(mapvalue, tuple):
            # 1-n mappings not supported
            return None
        else:
            mapchar = code_to_char(mapvalue)
        if mapcomment and comments:
            append('    %r\t#  %s -> %s' % (mapchar,
                                            hexrepr(key, key_precision),
                                            mapcomment))
        else:
            append('    %r' % mapchar)

    append(')')
    return l
|  |  | 
def codegen(name, map, encodingname, comments=1):

    """ Returns Python source for the given map.

        name is inserted into the generated module docstring as the
        origin of the mapping; encodingname is used in the docstring
        and, with underscores replaced by hyphens, as the codec name
        in getregentry().

        The generated module uses a decoding table (plus an encoding
        table built from it via codecs.charmap_build) whenever
        python_tabledef_code() can produce one, and falls back to
        decoding/encoding dictionaries otherwise.

        Comments are included in the source, if comments is true (default).

    """
    # Generate code.  The order of these calls matters:
    # python_mapdef_code() removes the 'IDENTITY' marker from map in
    # place, and codecs.make_encoding_map() must only see real
    # mappings.
    decoding_map_code = python_mapdef_code(
        'decoding_map',
        map,
        comments=comments)
    decoding_table_code = python_tabledef_code(
        'decoding_table',
        map,
        comments=comments)
    encoding_map_code = python_mapdef_code(
        'encoding_map',
        codecs.make_encoding_map(map),
        comments=comments,
        precisions=(4, 2))

    if decoding_table_code:
        suffix = 'table'
    else:
        suffix = 'map'

    # The templates below are the text of the generated module, so
    # class and function bodies carry their own indentation.
    l = [
        '''\
""" Python Character Mapping Codec %s generated from '%s' with gencodec.py.

"""#"

import codecs

### Codec APIs

class Codec(codecs.Codec):

    def encode(self,input,errors='strict'):
        return codecs.charmap_encode(input,errors,encoding_%s)

    def decode(self,input,errors='strict'):
        return codecs.charmap_decode(input,errors,decoding_%s)
''' % (encodingname, name, suffix, suffix)]
    l.append('''\
class IncrementalEncoder(codecs.IncrementalEncoder):
    def encode(self, input, final=False):
        return codecs.charmap_encode(input,self.errors,encoding_%s)[0]

class IncrementalDecoder(codecs.IncrementalDecoder):
    def decode(self, input, final=False):
        return codecs.charmap_decode(input,self.errors,decoding_%s)[0]''' %
        (suffix, suffix))

    l.append('''
class StreamWriter(Codec,codecs.StreamWriter):
    pass

class StreamReader(Codec,codecs.StreamReader):
    pass

### encodings module API

def getregentry():
    return codecs.CodecInfo(
        name=%r,
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
''' % encodingname.replace('_', '-'))

    # Add decoding table or map (with preference to the table)
    if not decoding_table_code:
        l.append('''
### Decoding Map
''')
        l.extend(decoding_map_code)
    else:
        l.append('''
### Decoding Table
''')
        l.extend(decoding_table_code)

    # Add encoding map
    if decoding_table_code:
        l.append('''
### Encoding table
encoding_table=codecs.charmap_build(decoding_table)
''')
    else:
        l.append('''
### Encoding Map
''')
        l.extend(encoding_map_code)

    # Final new-line
    l.append('')

    return '\n'.join(l).expandtabs()
|  |  | 
def pymap(name,map,pyfile,encodingname,comments=1):

    """ Writes the codec module source generated for map to pyfile.

        name, map, encodingname and comments are passed on to
        codegen().  The source is generated before pyfile is opened,
        so a failing code generation does not touch the file.

    """
    code = codegen(name,map,encodingname,comments)
    # The context manager closes the file even if the write fails.
    with open(pyfile,'w') as f:
        f.write(code)
|  |  | 
def marshalmap(name,map,marshalfile):

    """ Writes a marshalled copy of map to marshalfile.

        Every value of map must be a pair (uni, comment); the pairs
        are stored as tuples.  name is not used.

    """
    # Unpacking each value checks that it really is a pair.
    d = dict((e, (u, c)) for e, (u, c) in map.items())
    # The context manager closes the file even if dumping fails.
    with open(marshalfile,'wb') as f:
        marshal.dump(d,f)
|  |  | 
def convertdir(dir, dirprefix='', nameprefix='', comments=1):

    """ Converts all mapping files found in dir.

        For each regular file in dir a codec module <name>.py and a
        marshalled mapping <name>.mapping are written, where <name>
        is nameprefix plus the part of the file name before the first
        '.', lowercased and with hyphens replaced by underscores.
        dirprefix is prepended (as plain string) to both output file
        names.  comments is passed on to the code generator.

        Files yielding an empty map are skipped.  A ValueError during
        conversion is reported and then re-raised.

    """
    mapnames = os.listdir(dir)
    for mapname in mapnames:
        mappathname = os.path.join(dir, mapname)
        if not os.path.isfile(mappathname):
            continue
        name = os.path.split(mapname)[1]
        name = name.replace('-','_')
        name = name.split('.')[0]
        name = name.lower()
        name = nameprefix + name
        codefile = name + '.py'
        marshalfile = name + '.mapping'
        # Single-argument print(...) works as statement and as function.
        print('converting %s to %s and %s' % (mapname,
                                              dirprefix + codefile,
                                              dirprefix + marshalfile))
        try:
            map = readmap(mappathname)
            if not map:
                print('* map is empty; skipping')
            else:
                pymap(mappathname, map, dirprefix + codefile,name,comments)
                marshalmap(mappathname, map, dirprefix + marshalfile)
        except ValueError as why:
            print('* conversion failed: %s' % why)
            raise
|  |  | 
def rewritepythondir(dir, dirprefix='', comments=1):

    """ Regenerates codec modules from the .mapping files in dir.

        For each file <name>.mapping in dir the marshalled map is
        loaded and the codec module is written to dirprefix plus
        <name>.py.  comments is passed on to the code generator.

        Empty maps are skipped.  A ValueError during conversion is
        reported and processing continues with the next file.

    """
    mapnames = os.listdir(dir)
    for mapname in mapnames:
        if not mapname.endswith('.mapping'):
            continue
        name = mapname[:-len('.mapping')]
        codefile = name + '.py'
        # Single-argument print(...) works as statement and as function.
        print('converting %s to %s' % (mapname,
                                       dirprefix + codefile))
        try:
            # NOTE(review): marshal is not secure against erroneous or
            # maliciously constructed data; only run this on .mapping
            # files from a trusted source.
            # The context manager closes the file after loading.
            with open(os.path.join(dir,mapname), 'rb') as f:
                map = marshal.load(f)
            if not map:
                print('* map is empty; skipping')
            else:
                pymap(mapname, map, dirprefix + codefile,name,comments)
        except ValueError as why:
            print('* conversion failed: %s' % why)
|  |  | 
if __name__ == '__main__':

    import sys

    # All command line arguments are handed through positionally.
    cli_args = sys.argv[1:]

    # Manual switch: the first branch converts mapping files; change
    # the condition to regenerate modules from .mapping files instead.
    if 1:
        convertdir(*cli_args)
    else:
        rewritepythondir(*cli_args)