Back to index

python3.2  3.2.2
Functions | Variables
gencodec Namespace Reference

Functions

def parsecodes
def readmap
def hexrepr
def python_mapdef_code
def python_tabledef_code
def codegen
def pymap
def marshalmap
def convertdir
def rewritepythondir

Variables

int MAX_TABLE_SIZE = 8192
tuple UNI_UNDEFINED = chr(0xFFFE)
int MISSING_CODE = 1
tuple mapRE

Detailed Description

Unicode Mapping Parser and Codec Generator.

This script parses Unicode mapping files as available from the Unicode
site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
modules from them. The codecs use the standard character mapping codec
to actually apply the mapping.

Synopsis: gencodec.py dir codec_prefix

All files in dir are scanned and those producing non-empty mappings
will be written to <codec_prefix><mapname>.py with <mapname> being the
first part of the map's filename ('a' in a.b.c.txt) converted to
lowercase with hyphens replaced by underscores.

The tool also writes marshalled versions of the mapping tables to the
same location (with .mapping extension).

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
(c) Copyright Guido van Rossum, 2000.

Table generation:
(c) Copyright Marc-Andre Lemburg, 2005.
Licensed to PSF under a Contributor Agreement.

Function Documentation

def gencodec.codegen (   name,
  map,
  encodingname,
  comments = 1 
)
Returns Python source for the given map.

    Comments are included in the source, if comments is true (default).

Definition at line 251 of file gencodec.py.

00251 
def codegen(name, map, encodingname, comments=1):

    """ Build the source text of a charmap codec module for map.

        name is the origin of the mapping (it is only quoted in the
        generated module docstring); encodingname is the codec name
        placed in the docstring and, with underscores turned into
        hyphens, in the CodecInfo entry.

        Mapping comments are emitted into the generated source when
        comments is true (default).

        Returns the complete module source as one string with tabs
        expanded.

    """
    # The call order below is significant: python_mapdef_code() removes
    # the 'IDENTITY' marker from map as a side effect, and the two calls
    # that follow operate on the map without that marker.
    decoding_map_code = python_mapdef_code(
        'decoding_map', map, comments=comments)
    decoding_table_code = python_tabledef_code(
        'decoding_table', map, comments=comments)
    encoding_map_code = python_mapdef_code(
        'encoding_map', codecs.make_encoding_map(map),
        comments=comments, precisions=(4, 2))

    # The table form is preferred whenever it could be generated.
    suffix = 'table' if decoding_table_code else 'map'

    codec_section = '''\
""" Python Character Mapping Codec %s generated from '%s' with gencodec.py.

"""#"

import codecs

### Codec APIs

class Codec(codecs.Codec):

    def encode(self,input,errors='strict'):
        return codecs.charmap_encode(input,errors,encoding_%s)

    def decode(self,input,errors='strict'):
        return codecs.charmap_decode(input,errors,decoding_%s)
''' % (encodingname, name, suffix, suffix)

    incremental_section = '''\
class IncrementalEncoder(codecs.IncrementalEncoder):
    def encode(self, input, final=False):
        return codecs.charmap_encode(input,self.errors,encoding_%s)[0]

class IncrementalDecoder(codecs.IncrementalDecoder):
    def decode(self, input, final=False):
        return codecs.charmap_decode(input,self.errors,decoding_%s)[0]''' % (
        suffix, suffix)

    registry_section = '''
class StreamWriter(Codec,codecs.StreamWriter):
    pass

class StreamReader(Codec,codecs.StreamReader):
    pass

### encodings module API

def getregentry():
    return codecs.CodecInfo(
        name=%r,
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
''' % encodingname.replace('_', '-')

    chunks = [codec_section, incremental_section, registry_section]

    if decoding_table_code:
        # Decoding table; the encoding table is derived from it at
        # import time of the generated module.
        chunks.append('''
### Decoding Table
''')
        chunks.extend(decoding_table_code)
        chunks.append('''
### Encoding table
encoding_table=codecs.charmap_build(decoding_table)
''')
    else:
        # No table available: fall back to explicit dictionaries.
        chunks.append('''
### Decoding Map
''')
        chunks.extend(decoding_map_code)
        chunks.append('''
### Encoding Map
''')
        chunks.extend(encoding_map_code)

    # Empty last element so the joined text ends with a new-line
    chunks.append('')

    return '\n'.join(chunks).expandtabs()

Here is the call graph for this function:

Here is the caller graph for this function:

def gencodec.convertdir (   dir,
  dirprefix = '',
  nameprefix = '',
  comments = 1 
)

Definition at line 372 of file gencodec.py.

00372 
def convertdir(dir, dirprefix='', nameprefix='', comments=1):

    """ Convert every mapping file found directly in dir.

        For each regular file the codec name is derived from the file
        name: hyphens become underscores, everything from the first '.'
        on is dropped, the rest is lower-cased and nameprefix is
        prepended.  Non-empty mappings are written to
        dirprefix + name + '.py' (via pymap) and
        dirprefix + name + '.mapping' (via marshalmap).

        A ValueError raised during conversion is reported and then
        re-raised.

    """
    for mapname in os.listdir(dir):
        mappathname = os.path.join(dir, mapname)
        if not os.path.isfile(mappathname):
            # Sub-directories and other non-files are ignored
            continue

        stem = os.path.split(mapname)[1].replace('-', '_')
        name = nameprefix + stem.split('.')[0].lower()
        codefile = name + '.py'
        marshalfile = name + '.mapping'
        print('converting %s to %s and %s' % (mapname,
                                              dirprefix + codefile,
                                              dirprefix + marshalfile))
        try:
            map = readmap(os.path.join(dir, mapname))
            if map:
                pymap(mappathname, map, dirprefix + codefile,
                      name, comments)
                marshalmap(mappathname, map, dirprefix + marshalfile)
            else:
                print('* map is empty; skipping')
        except ValueError as why:
            print('* conversion failed: %s' % why)
            raise

Here is the call graph for this function:

Here is the caller graph for this function:

def gencodec.hexrepr (   t,
  precision = 4 
)

Definition at line 124 of file gencodec.py.

00124 
def hexrepr(t, precision=4):

    """ Return a hexadecimal source-code representation of t.

        None is rendered as 'None'.  An object without a length (a
        single code) is rendered as '0x' followed by at least precision
        upper-case hex digits.  Anything that has a length is rendered
        as a parenthesised, comma-separated sequence of such literals.

        Raises TypeError (after printing a diagnostic) if an item of a
        sequence cannot be formatted as a hex number.

    """
    if t is None:
        return 'None'
    try:
        len(t)
    except TypeError:
        # No length: t is a single code, not a sequence of codes.
        # Only TypeError is caught so that unrelated exceptions such as
        # KeyboardInterrupt are no longer swallowed here.
        return '0x%0*X' % (precision, t)
    try:
        return '(' + ', '.join(['0x%0*X' % (precision, item)
                                for item in t]) + ')'
    except TypeError as why:
        print('* failed to convert %r: %s' % (t, why))
        raise

Here is the caller graph for this function:

def gencodec.marshalmap (   name,
  map,
  marshalfile 
)

Definition at line 363 of file gencodec.py.

00363 
def marshalmap(name,map,marshalfile):

    """ Write a marshalled copy of map to the file marshalfile.

        Every value of map must unpack into exactly two items; each is
        stored as a 2-tuple under its original key.  The name argument
        is not used.

    """
    d = {e: (u, c) for e, (u, c) in map.items()}
    # The with-statement closes the file even if marshal.dump() fails.
    with open(marshalfile, 'wb') as f:
        marshal.dump(d, f)

Here is the caller graph for this function:

def gencodec.parsecodes (   codes,
  len = len,
  range = range 
)
Converts code combinations to either a single code integer
    or a tuple of integers.

    Meta-codes (in angle brackets, e.g. <LR> and <RL>) are
    ignored.

    Empty codes are returned as MISSING_CODE, not None.

Definition at line 46 of file gencodec.py.

00046 
def parsecodes(codes, len=len, range=range):

    """ Converts a '+'-separated code combination to either a single
        code integer or a tuple of integers.

        Each part is parsed as a hexadecimal number.  Parts that cannot
        be parsed -- meta-codes in angle brackets (e.g. <LR> and <RL>)
        or the empty part left by a trailing '+' -- are ignored.

        Returns the integer itself if exactly one code remains, a tuple
        of integers if several remain, and MISSING_CODE if codes is
        empty or contains no legal code at all.

        The len and range parameters are kept so that the signature
        stays unchanged; range is no longer used.

    """
    if not codes:
        return MISSING_CODE
    values = []
    for part in codes.split('+'):
        try:
            values.append(int(part, 16))
        except ValueError:
            # Illegal parts are skipped directly instead of being
            # replaced by a sentinel and filtered afterwards, so a real
            # code that happens to equal MISSING_CODE is never dropped.
            continue
    if not values:
        # Nothing legal was found (e.g. a lone meta-code)
        return MISSING_CODE
    if len(values) == 1:
        return values[0]
    return tuple(values)

Here is the caller graph for this function:

def gencodec.pymap (   name,
  map,
  pyfile,
  encodingname,
  comments = 1 
)

Definition at line 356 of file gencodec.py.

00356 
def pymap(name,map,pyfile,encodingname,comments=1):

    """ Generate the codec source for map and write it to pyfile.

        All arguments except pyfile are passed on to codegen(); the
        returned source text is written to pyfile in text mode.

    """
    code = codegen(name,map,encodingname,comments)
    # The with-statement closes the file even if the write fails.
    with open(pyfile,'w') as f:
        f.write(code)

Here is the call graph for this function:

Here is the caller graph for this function:

def gencodec.python_mapdef_code (   varname,
  map,
  comments = 1,
  precisions = (2, 4) 
)

Definition at line 139 of file gencodec.py.

00139 
def python_mapdef_code(varname, map, comments=1, precisions=(2, 4)):

    """ Return the source lines of a dictionary definition for map.

        The result is a list of lines assigning a dict literal to
        varname.  If map contains the key "IDENTITY", the definition
        starts from codecs.make_identity_dict(range(<that value>)), the
        remaining entries are added with update() and the "IDENTITY"
        key is deleted from map.

        Keys and values given as tuples are unpacked as
        (code, comment).  precisions holds the minimum number of hex
        digits for keys and values.  Comments are only emitted if
        comments is true.

    """
    lines = []
    has_identity = "IDENTITY" in map
    if has_identity:
        lines.append("%s = codecs.make_identity_dict(range(%d))" %
                     (varname, map["IDENTITY"]))
        lines.append("%s.update({" % varname)
        del map["IDENTITY"]
        splits = 1
    else:
        lines.append("%s = {" % varname)
        splits = 0

    key_precision, value_precision = precisions
    emitted = 0
    for mapkey, mapvalue in sorted(map.items()):
        mapcomment = ''
        if isinstance(mapkey, tuple):
            mapkey, mapcomment = mapkey
        if isinstance(mapvalue, tuple):
            mapvalue, mapcomment = mapvalue
        if mapkey is None:
            continue
        if has_identity and mapkey == mapvalue and mapkey < 256:
            # Already covered by the identity dictionary for the
            # first 256 code points.
            continue

        entry = '    %s: %s,' % (hexrepr(mapkey, key_precision),
                                 hexrepr(mapvalue, value_precision))
        if mapcomment and comments:
            entry += '\t#  %s' % (mapcomment,)
        lines.append(entry)

        emitted += 1
        if emitted == 4096:
            # Split the definition into parts so that the Python
            # parser doesn't dump core
            lines.append('})' if splits else '}')
            lines.append('%s.update({' % varname)
            emitted = 0
            splits += 1

    lines.append('})' if splits else '}')
    return lines

Here is the call graph for this function:

Here is the caller graph for this function:

def gencodec.python_tabledef_code (   varname,
  map,
  comments = 1,
  key_precision = 2 
)

Definition at line 196 of file gencodec.py.

00196 
def python_tabledef_code(varname, map, comments=1, key_precision=2):

    """ Return the source lines of a decoding table definition for map.

        The result is a list of lines assigning to varname a
        parenthesised run of character literals, one line per key from
        0 up to the largest key in map.  Keys without an entry, and
        entries whose value is MISSING_CODE, are rendered as
        UNI_UNDEFINED.

        If map contains the key 'IDENTITY', keys 0-255 are pre-filled
        with identity mappings and the 'IDENTITY' key is deleted from
        map.

        Returns None if the largest key exceeds MAX_TABLE_SIZE or if
        any value in the table is a tuple (1-n mappings cannot be
        expressed as a table).

    """
    l = []
    append = l.append
    append('%s = (' % varname)

    # Analyze map and create table dict
    table = {}
    maxkey = 0
    if 'IDENTITY' in map:
        for key in range(256):
            table[key] = (key, '')
        maxkey = 255
        del map['IDENTITY']
    # Sort only after the 'IDENTITY' marker is gone: its string key
    # cannot be ordered against the integer keys and must not end up
    # in the table.
    mappings = sorted(map.items())
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey == MISSING_CODE:
            continue
        table[mapkey] = (mapvalue, mapcomment)
        if mapkey > maxkey:
            maxkey = mapkey
    if maxkey > MAX_TABLE_SIZE:
        # Table too large
        return None

    # Create table code
    for key in range(maxkey + 1):
        if key not in table:
            mapvalue = MISSING_CODE
            mapcomment = 'UNDEFINED'
        else:
            mapvalue, mapcomment = table[key]
        if mapvalue == MISSING_CODE:
            mapchar = UNI_UNDEFINED
        elif isinstance(mapvalue, tuple):
            # 1-n mappings not supported
            return None
        else:
            mapchar = chr(mapvalue)
        if mapcomment and comments:
            append('    %a \t#  %s -> %s' % (mapchar,
                                            hexrepr(key, key_precision),
                                            mapcomment))
        else:
            append('    %a' % mapchar)

    append(')')
    return l

Here is the call graph for this function:

Here is the caller graph for this function:

def gencodec.readmap (   filename)

Definition at line 73 of file gencodec.py.

00073 
def readmap(filename):

    """ Parse the mapping file filename into a decoding dictionary.

        Lines that are empty, start with '#' or do not match mapRE are
        skipped.  Each remaining line contributes one entry mapping the
        parsed encoding code to a (unicode code, comment) tuple; codes
        are parsed with parsecodes().

        Code points 0x00-0x1F and 0x7F start out as identity mappings
        with the comment 'CONTROL CHARACTER' and may be overridden by
        the file.

        If at least as many single codes below 256 are identity-mapped
        as are left unmapped, the unmapped ones are set to
        (MISSING_CODE, "") and the extra key 'IDENTITY' is added with
        the value 256.

    """
    # The with-statement closes the file even if reading fails.
    with open(filename,'r') as f:
        lines = f.readlines()
    enc2uni = {}
    identity = []
    unmapped = list(range(256))

    # UTC mapping tables per convention don't include the identity
    # mappings for code points 0x00 - 0x1F and 0x7F, unless these are
    # explicitly mapped to different characters or undefined
    for i in list(range(32)) + [127]:
        identity.append(i)
        unmapped.remove(i)
        enc2uni[i] = (i, 'CONTROL CHARACTER')

    for line in lines:
        line = line.strip()
        if not line or line[0] == '#':
            continue
        m = mapRE.match(line)
        if not m:
            continue
        enc,uni,comment = m.groups()
        enc = parsecodes(enc)
        uni = parsecodes(uni)
        if comment is None:
            comment = ''
        else:
            # Drop the leading '#'
            comment = comment[1:].strip()
        # parsecodes() returns a tuple for multi-code keys; a tuple
        # cannot be compared with an int, so only single codes take
        # part in the identity/unmapped bookkeeping.
        if isinstance(enc, int) and enc < 256:
            if enc in unmapped:
                unmapped.remove(enc)
            if enc == uni:
                identity.append(enc)
        enc2uni[enc] = (uni,comment)

    # If there are at least as many identity-mapped entries as unmapped
    # entries, it pays to generate an identity dictionary first, and
    # add explicit mappings to MISSING_CODE for the rest
    if len(identity) >= len(unmapped):
        for enc in unmapped:
            enc2uni[enc] = (MISSING_CODE, "")
        enc2uni['IDENTITY'] = 256

    return enc2uni

Here is the call graph for this function:

Here is the caller graph for this function:

def gencodec.rewritepythondir (   dir,
  dirprefix = '',
  comments = 1 
)

Definition at line 400 of file gencodec.py.

00400 
def rewritepythondir(dir, dirprefix='', comments=1):

    """ Regenerate codec modules from the .mapping files in dir.

        Every file in dir whose name ends in '.mapping' is loaded with
        marshal; non-empty maps are passed to pymap(), which writes
        dirprefix + <name> + '.py', where <name> is the file name
        without the '.mapping' suffix.

        A ValueError raised during conversion is reported and the loop
        continues with the next file.

    """
    for mapname in os.listdir(dir):
        if not mapname.endswith('.mapping'):
            continue
        name = mapname[:-len('.mapping')]
        codefile = name + '.py'
        print('converting %s to %s' % (mapname,
                                       dirprefix + codefile))
        try:
            # NOTE: marshal offers no protection against malformed or
            # malicious data; only load .mapping files from a trusted
            # source.  The with-statement closes the file right after
            # loading instead of leaving it open.
            with open(os.path.join(dir,mapname), 'rb') as f:
                map = marshal.load(f)
            if not map:
                print('* map is empty; skipping')
            else:
                pymap(mapname, map, dirprefix + codefile,name,comments)
        except ValueError as why:
            print('* conversion failed: %s' % why)

Here is the call graph for this function:


Variable Documentation

Initial value:
# Matches one mapping line: the encoding code(s), whitespace, the
# (possibly empty) Unicode code(s) or <meta> codes, and an optional
# '#' comment.  Raw strings keep '\+' and '\s' from being treated as
# invalid string escapes; the second hex class is [0-9a-fA-F], matching
# the first one.
mapRE = re.compile(r'((?:0x[0-9a-fA-F]+\+?)+)'
                   r'\s+'
                   r'((?:(?:0x[0-9a-fA-F]+|<[A-Za-z]+>)\+?)*)'
                   r'\s*'
                   r'(#.+)?')

Definition at line 40 of file gencodec.py.

int gencodec.MAX_TABLE_SIZE = 8192

Definition at line 32 of file gencodec.py.

int gencodec.MISSING_CODE = 1

Definition at line 38 of file gencodec.py.

tuple gencodec.UNI_UNDEFINED = chr(0xFFFE)

Definition at line 35 of file gencodec.py.