
python3.2  3.2.2
makeunicodedata Namespace Reference

Classes

class  UnicodeData
class  Hash
class  Array

Functions

def maketables
def makeunicodedata
def makeunicodetype
def makeunicodename
def merge_old_version
def open_data
def myhash
def getsize
def splitbins

Variables

string SCRIPT = sys.argv[0]
string VERSION = "3.2"
string UNIDATA_VERSION = "6.0.0"
string UNICODE_DATA = "UnicodeData%s.txt"
string COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
string EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
string UNIHAN = "Unihan%s.zip"
string DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
string DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
string LINE_BREAK = "LineBreak%s.txt"
list old_versions = ["3.2.0"]
list CATEGORY_NAMES
list BIDIRECTIONAL_NAMES
list EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]
list MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]
int ALPHA_MASK = 0x01
int DECIMAL_MASK = 0x02
int DIGIT_MASK = 0x04
int LOWER_MASK = 0x08
int LINEBREAK_MASK = 0x10
int SPACE_MASK = 0x20
int TITLE_MASK = 0x40
int UPPER_MASK = 0x80
int XID_START_MASK = 0x100
int XID_CONTINUE_MASK = 0x200
int PRINTABLE_MASK = 0x400
int NODELTA_MASK = 0x800
int NUMERIC_MASK = 0x1000
list cjk_ranges
list SIZES

Function Documentation

def makeunicodedata.getsize(data)

Definition at line 1093 of file makeunicodedata.py.

01093 
01094 def getsize(data):
01095     # return smallest possible integer size for the given array
01096     maxdata = max(data)
01097     if maxdata < 256:
01098         return 1
01099     elif maxdata < 65536:
01100         return 2
01101     else:
01102         return 4
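
A quick check of the contract, exercising getsize() exactly as defined above: the return value is the element width, in bytes, presumably used when Array.dump() emits the corresponding C array (the Array class itself is not listed on this page).

    # width of the generated C array elements, in bytes
    assert getsize([0, 255]) == 1      # fits an unsigned char
    assert getsize([0, 256]) == 2      # needs an unsigned short
    assert getsize([0, 65536]) == 4    # needs a 32-bit integer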


def makeunicodedata.maketables(trace=0)

Definition at line 82 of file makeunicodedata.py.

00082 
00083 def maketables(trace=0):
00084 
00085     print("--- Reading", UNICODE_DATA % "", "...")
00086 
00087     version = ""
00088     unicode = UnicodeData(UNIDATA_VERSION)
00089 
00090     print(len(list(filter(None, unicode.table))), "characters")
00091 
00092     for version in old_versions:
00093         print("--- Reading", UNICODE_DATA % ("-"+version), "...")
00094         old_unicode = UnicodeData(version, cjk_check=False)
00095         print(len(list(filter(None, old_unicode.table))), "characters")
00096         merge_old_version(version, unicode, old_unicode)
00097 
00098     makeunicodename(unicode, trace)
00099     makeunicodedata(unicode, trace)
00100     makeunicodetype(unicode, trace)
00101 
00102 # --------------------------------------------------------------------
00103 # unicode character properties
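
maketables() is the driver: it loads the current UnicodeData, merges in the deltas for every entry of old_versions, and then writes all three generated headers. The module's entry point is not reproduced on this page; presumably it is an ordinary __main__ guard along these lines:

    # assumed entry point (not shown on this page); trace=1 makes
    # splitbins() print table-size statistics to stderr
    if __name__ == "__main__":
        maketables(1)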


def makeunicodedata.makeunicodedata(unicode, trace)

Definition at line 104 of file makeunicodedata.py.

00104 
00105 def makeunicodedata(unicode, trace):
00106 
00107     dummy = (0, 0, 0, 0, 0, 0)
00108     table = [dummy]
00109     cache = {0: dummy}
00110     index = [0] * len(unicode.chars)
00111 
00112     FILE = "Modules/unicodedata_db.h"
00113 
00114     print("--- Preparing", FILE, "...")
00115 
00116     # 1) database properties
00117 
00118     for char in unicode.chars:
00119         record = unicode.table[char]
00120         if record:
00121             # extract database properties
00122             category = CATEGORY_NAMES.index(record[2])
00123             combining = int(record[3])
00124             bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
00125             mirrored = record[9] == "Y"
00126             eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
00127             normalizationquickcheck = record[17]
00128             item = (
00129                 category, combining, bidirectional, mirrored, eastasianwidth,
00130                 normalizationquickcheck
00131                 )
00132             # add entry to index and item tables
00133             i = cache.get(item)
00134             if i is None:
00135                 cache[item] = i = len(table)
00136                 table.append(item)
00137             index[char] = i
00138 
00139     # 2) decomposition data
00140 
00141     decomp_data = [0]
00142     decomp_prefix = [""]
00143     decomp_index = [0] * len(unicode.chars)
00144     decomp_size = 0
00145 
00146     comp_pairs = []
00147     comp_first = [None] * len(unicode.chars)
00148     comp_last = [None] * len(unicode.chars)
00149 
00150     for char in unicode.chars:
00151         record = unicode.table[char]
00152         if record:
00153             if record[5]:
00154                 decomp = record[5].split()
00155                 if len(decomp) > 19:
00156                     raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
00157                 # prefix
00158                 if decomp[0][0] == "<":
00159                     prefix = decomp.pop(0)
00160                 else:
00161                     prefix = ""
00162                 try:
00163                     i = decomp_prefix.index(prefix)
00164                 except ValueError:
00165                     i = len(decomp_prefix)
00166                     decomp_prefix.append(prefix)
00167                 prefix = i
00168                 assert prefix < 256
00169                 # content
00170                 decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
00171                 # Collect NFC pairs
00172                 if not prefix and len(decomp) == 3 and \
00173                    char not in unicode.exclusions and \
00174                    unicode.table[decomp[1]][3] == "0":
00175                     p, l, r = decomp
00176                     comp_first[l] = 1
00177                     comp_last[r] = 1
00178                     comp_pairs.append((l,r,char))
00179                 try:
00180                     i = decomp_data.index(decomp)
00181                 except ValueError:
00182                     i = len(decomp_data)
00183                     decomp_data.extend(decomp)
00184                     decomp_size = decomp_size + len(decomp) * 2
00185             else:
00186                 i = 0
00187             decomp_index[char] = i
00188 
00189     f = l = 0
00190     comp_first_ranges = []
00191     comp_last_ranges = []
00192     prev_f = prev_l = None
00193     for i in unicode.chars:
00194         if comp_first[i] is not None:
00195             comp_first[i] = f
00196             f += 1
00197             if prev_f is None:
00198                 prev_f = (i,i)
00199             elif prev_f[1]+1 == i:
00200                 prev_f = prev_f[0],i
00201             else:
00202                 comp_first_ranges.append(prev_f)
00203                 prev_f = (i,i)
00204         if comp_last[i] is not None:
00205             comp_last[i] = l
00206             l += 1
00207             if prev_l is None:
00208                 prev_l = (i,i)
00209             elif prev_l[1]+1 == i:
00210                 prev_l = prev_l[0],i
00211             else:
00212                 comp_last_ranges.append(prev_l)
00213                 prev_l = (i,i)
00214     comp_first_ranges.append(prev_f)
00215     comp_last_ranges.append(prev_l)
00216     total_first = f
00217     total_last = l
00218 
00219     comp_data = [0]*(total_first*total_last)
00220     for f,l,char in comp_pairs:
00221         f = comp_first[f]
00222         l = comp_last[l]
00223         comp_data[f*total_last+l] = char
00224 
00225     print(len(table), "unique properties")
00226     print(len(decomp_prefix), "unique decomposition prefixes")
00227     print(len(decomp_data), "unique decomposition entries:", end=' ')
00228     print(decomp_size, "bytes")
00229     print(total_first, "first characters in NFC")
00230     print(total_last, "last characters in NFC")
00231     print(len(comp_pairs), "NFC pairs")
00232 
00233     print("--- Writing", FILE, "...")
00234 
00235     fp = open(FILE, "w")
00236     print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
00237     print(file=fp)
00238     print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
00239     print("/* a list of unique database records */", file=fp)
00240     print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
00241     for item in table:
00242         print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
00243     print("};", file=fp)
00244     print(file=fp)
00245 
00246     print("/* Reindexing of NFC first characters. */", file=fp)
00247     print("#define TOTAL_FIRST",total_first, file=fp)
00248     print("#define TOTAL_LAST",total_last, file=fp)
00249     print("struct reindex{int start;short count,index;};", file=fp)
00250     print("static struct reindex nfc_first[] = {", file=fp)
00251     for start,end in comp_first_ranges:
00252         print("  { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
00253     print("  {0,0,0}", file=fp)
00254     print("};\n", file=fp)
00255     print("static struct reindex nfc_last[] = {", file=fp)
00256     for start,end in comp_last_ranges:
00257         print("  { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
00258     print("  {0,0,0}", file=fp)
00259     print("};\n", file=fp)
00260 
00261     # FIXME: <fl> the following tables could be made static, and
00262     # the support code moved into unicodedatabase.c
00263 
00264     print("/* string literals */", file=fp)
00265     print("const char *_PyUnicode_CategoryNames[] = {", file=fp)
00266     for name in CATEGORY_NAMES:
00267         print("    \"%s\"," % name, file=fp)
00268     print("    NULL", file=fp)
00269     print("};", file=fp)
00270 
00271     print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)
00272     for name in BIDIRECTIONAL_NAMES:
00273         print("    \"%s\"," % name, file=fp)
00274     print("    NULL", file=fp)
00275     print("};", file=fp)
00276 
00277     print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)
00278     for name in EASTASIANWIDTH_NAMES:
00279         print("    \"%s\"," % name, file=fp)
00280     print("    NULL", file=fp)
00281     print("};", file=fp)
00282 
00283     print("static const char *decomp_prefix[] = {", file=fp)
00284     for name in decomp_prefix:
00285         print("    \"%s\"," % name, file=fp)
00286     print("    NULL", file=fp)
00287     print("};", file=fp)
00288 
00289     # split record index table
00290     index1, index2, shift = splitbins(index, trace)
00291 
00292     print("/* index tables for the database records */", file=fp)
00293     print("#define SHIFT", shift, file=fp)
00294     Array("index1", index1).dump(fp, trace)
00295     Array("index2", index2).dump(fp, trace)
00296 
00297     # split decomposition index table
00298     index1, index2, shift = splitbins(decomp_index, trace)
00299 
00300     print("/* decomposition data */", file=fp)
00301     Array("decomp_data", decomp_data).dump(fp, trace)
00302 
00303     print("/* index tables for the decomposition data */", file=fp)
00304     print("#define DECOMP_SHIFT", shift, file=fp)
00305     Array("decomp_index1", index1).dump(fp, trace)
00306     Array("decomp_index2", index2).dump(fp, trace)
00307 
00308     index, index2, shift = splitbins(comp_data, trace)
00309     print("/* NFC pairs */", file=fp)
00310     print("#define COMP_SHIFT", shift, file=fp)
00311     Array("comp_index", index).dump(fp, trace)
00312     Array("comp_data", index2).dump(fp, trace)
00313 
00314     # Generate delta tables for old versions
00315     for version, table, normalization in unicode.changed:
00316         cversion = version.replace(".","_")
00317         records = [table[0]]
00318         cache = {table[0]:0}
00319         index = [0] * len(table)
00320         for i, record in enumerate(table):
00321             try:
00322                 index[i] = cache[record]
00323             except KeyError:
00324                 index[i] = cache[record] = len(records)
00325                 records.append(record)
00326         index1, index2, shift = splitbins(index, trace)
00327         print("static const change_record change_records_%s[] = {" % cversion, file=fp)
00328         for record in records:
00329             print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
00330         print("};", file=fp)
00331         Array("changes_%s_index" % cversion, index1).dump(fp, trace)
00332         Array("changes_%s_data" % cversion, index2).dump(fp, trace)
00333         print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
00334         print("{", file=fp)
00335         print("\tint index;", file=fp)
00336         print("\tif (n >= 0x110000) index = 0;", file=fp)
00337         print("\telse {", file=fp)
00338         print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
00339         print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
00340               (cversion, shift, ((1<<shift)-1)), file=fp)
00341         print("\t}", file=fp)
00342         print("\treturn change_records_%s+index;" % cversion, file=fp)
00343         print("}\n", file=fp)
00344         print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
00345         print("{", file=fp)
00346         print("\tswitch(n) {", file=fp)
00347         for k, v in normalization:
00348             print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
00349         print("\tdefault: return 0;", file=fp)
00350         print("\t}\n}\n", file=fp)
00351 
00352     fp.close()
00353 
00354 # --------------------------------------------------------------------
00355 # unicode character type tables
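
Two packing conventions in makeunicodedata() are worth spelling out. Each decomposition is stored in decomp_data as a header word, prefix index in the low byte and code-point count in the high bits, followed by the code points themselves; and the NFC composition table is a total_first x total_last matrix flattened row-major, so the pair (f, l) lives at comp_data[f*total_last + l]. A sketch of the matching unpacking step (unpack_decomposition is a hypothetical helper; the real consumer is C code in Modules/unicodedata.c):

    def unpack_decomposition(decomp_data, decomp_prefix, offset):
        # header word: low byte indexes decomp_prefix, high bits hold the count
        header = decomp_data[offset]
        prefix = decomp_prefix[header & 0xFF]
        count = header >> 8
        codepoints = decomp_data[offset + 1 : offset + 1 + count]
        return prefix, codepoints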


def makeunicodedata.makeunicodename(unicode, trace)

Definition at line 538 of file makeunicodedata.py.

00538 
00539 def makeunicodename(unicode, trace):
00540 
00541     FILE = "Modules/unicodename_db.h"
00542 
00543     print("--- Preparing", FILE, "...")
00544 
00545     # collect names
00546     names = [None] * len(unicode.chars)
00547 
00548     for char in unicode.chars:
00549         record = unicode.table[char]
00550         if record:
00551             name = record[1].strip()
00552             if name and name[0] != "<":
00553                 names[char] = name + chr(0)
00554 
00555     print(len(list(n for n in names if n is not None)), "distinct names")
00556 
00557     # collect unique words from names (note that we distinguish
00558     # between words inside a sentence and words ending a sentence;
00559     # the latter include the trailing null byte.)
00560 
00561     words = {}
00562     n = b = 0
00563     for char in unicode.chars:
00564         name = names[char]
00565         if name:
00566             w = name.split()
00567             b = b + len(name)
00568             n = n + len(w)
00569             for w in w:
00570                 l = words.get(w)
00571                 if l:
00572                     l.append(None)
00573                 else:
00574                     words[w] = [len(words)]
00575 
00576     print(n, "words in text;", b, "bytes")
00577 
00578     wordlist = list(words.items())
00579 
00580     # sort on falling frequency, then by name
00581     def word_key(a):
00582         aword, alist = a
00583         return -len(alist), aword
00584     wordlist.sort(key=word_key)
00585 
00586     # figure out how many phrasebook escapes we need
00587     escapes = 0
00588     while escapes * 256 < len(wordlist):
00589         escapes = escapes + 1
00590     print(escapes, "escapes")
00591 
00592     short = 256 - escapes
00593 
00594     assert short > 0
00595 
00596     print(short, "short indexes in lexicon")
00597 
00598     # statistics
00599     n = 0
00600     for i in range(short):
00601         n = n + len(wordlist[i][1])
00602     print(n, "short indexes in phrasebook")
00603 
00604     # pick the most commonly used words, and sort the rest on falling
00605     # length (to maximize overlap)
00606 
00607     wordlist, wordtail = wordlist[:short], wordlist[short:]
00608     wordtail.sort(key=lambda a: a[0], reverse=True)
00609     wordlist.extend(wordtail)
00610 
00611     # generate lexicon from words
00612 
00613     lexicon_offset = [0]
00614     lexicon = ""
00615     words = {}
00616 
00617     # build a lexicon string
00618     offset = 0
00619     for w, x in wordlist:
00620         # encoding: bit 7 indicates last character in word (chr(128)
00621         # indicates the last character in an entire string)
00622         ww = w[:-1] + chr(ord(w[-1])+128)
00623         # reuse string tails, when possible
00624         o = lexicon.find(ww)
00625         if o < 0:
00626             o = offset
00627             lexicon = lexicon + ww
00628             offset = offset + len(w)
00629         words[w] = len(lexicon_offset)
00630         lexicon_offset.append(o)
00631 
00632     lexicon = list(map(ord, lexicon))
00633 
00634     # generate phrasebook from names and lexicon
00635     phrasebook = [0]
00636     phrasebook_offset = [0] * len(unicode.chars)
00637     for char in unicode.chars:
00638         name = names[char]
00639         if name:
00640             w = name.split()
00641             phrasebook_offset[char] = len(phrasebook)
00642             for w in w:
00643                 i = words[w]
00644                 if i < short:
00645                     phrasebook.append(i)
00646                 else:
00647                     # store as two bytes
00648                     phrasebook.append((i>>8) + short)
00649                     phrasebook.append(i&255)
00650 
00651     assert getsize(phrasebook) == 1
00652 
00653     #
00654     # unicode name hash table
00655 
00656     # extract names
00657     data = []
00658     for char in unicode.chars:
00659         record = unicode.table[char]
00660         if record:
00661             name = record[1].strip()
00662             if name and name[0] != "<":
00663                 data.append((name, char))
00664 
00665     # the magic number 47 was chosen to minimize the number of
00666     # collisions on the current data set.  if you like, change it
00667     # and see what happens...
00668 
00669     codehash = Hash("code", data, 47)
00670 
00671     print("--- Writing", FILE, "...")
00672 
00673     fp = open(FILE, "w")
00674     print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
00675     print(file=fp)
00676     print("#define NAME_MAXLEN", 256, file=fp)
00677     print(file=fp)
00678     print("/* lexicon */", file=fp)
00679     Array("lexicon", lexicon).dump(fp, trace)
00680     Array("lexicon_offset", lexicon_offset).dump(fp, trace)
00681 
00682     # split decomposition index table
00683     offset1, offset2, shift = splitbins(phrasebook_offset, trace)
00684 
00685     print("/* code->name phrasebook */", file=fp)
00686     print("#define phrasebook_shift", shift, file=fp)
00687     print("#define phrasebook_short", short, file=fp)
00688 
00689     Array("phrasebook", phrasebook).dump(fp, trace)
00690     Array("phrasebook_offset1", offset1).dump(fp, trace)
00691     Array("phrasebook_offset2", offset2).dump(fp, trace)
00692 
00693     print("/* name->code dictionary */", file=fp)
00694     codehash.dump(fp, trace)
00695 
00696     fp.close()
00697 
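
The name table rests on two cooperating encodings: lexicon packs every distinct word, with bit 7 marking a word's final character and the encoded NUL (0x80) ending an entire name, while phrasebook stores per-character word indices, one byte for indices below short and a two-byte escape for the rest. A sketch of the matching decoder (lookup_name is illustrative; the shipped lookup is C code in Modules/unicodedata.c):

    def lookup_name(char, phrasebook, phrasebook_offset,
                    lexicon, lexicon_offset, short):
        pos = phrasebook_offset[char]
        words = []
        while True:
            i = phrasebook[pos]
            pos += 1
            if i >= short:                    # two-byte escape sequence
                i = ((i - short) << 8) + phrasebook[pos]
                pos += 1
            o = lexicon_offset[i]
            chars = []
            while True:
                c = lexicon[o]
                o += 1
                if c == 0x80:                 # encoded NUL: end of whole name
                    words.append("".join(chars))
                    return " ".join(words)
                chars.append(chr(c & 0x7F))
                if c & 0x80:                  # bit 7: last char of this word
                    break
            words.append("".join(chars))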


def makeunicodedata.makeunicodetype(unicode, trace)

Definition at line 356 of file makeunicodedata.py.

00356 
00357 def makeunicodetype(unicode, trace):
00358 
00359     FILE = "Objects/unicodetype_db.h"
00360 
00361     print("--- Preparing", FILE, "...")
00362 
00363     # extract unicode types
00364     dummy = (0, 0, 0, 0, 0, 0)
00365     table = [dummy]
00366     cache = {0: dummy}
00367     index = [0] * len(unicode.chars)
00368     numeric = {}
00369     spaces = []
00370     linebreaks = []
00371 
00372     for char in unicode.chars:
00373         record = unicode.table[char]
00374         if record:
00375             # extract database properties
00376             category = record[2]
00377             bidirectional = record[4]
00378             properties = record[16]
00379             flags = 0
00380             delta = True
00381             if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
00382                 flags |= ALPHA_MASK
00383             if category == "Ll":
00384                 flags |= LOWER_MASK
00385             if 'Line_Break' in properties or bidirectional == "B":
00386                 flags |= LINEBREAK_MASK
00387                 linebreaks.append(char)
00388             if category == "Zs" or bidirectional in ("WS", "B", "S"):
00389                 flags |= SPACE_MASK
00390                 spaces.append(char)
00391             if category == "Lt":
00392                 flags |= TITLE_MASK
00393             if category == "Lu":
00394                 flags |= UPPER_MASK
00395             if char == ord(" ") or category[0] not in ("C", "Z"):
00396                 flags |= PRINTABLE_MASK
00397             if "XID_Start" in properties:
00398                 flags |= XID_START_MASK
00399             if "XID_Continue" in properties:
00400                 flags |= XID_CONTINUE_MASK
00401             # use delta predictor for upper/lower/title if it fits
00402             if record[12]:
00403                 upper = int(record[12], 16)
00404             else:
00405                 upper = char
00406             if record[13]:
00407                 lower = int(record[13], 16)
00408             else:
00409                 lower = char
00410             if record[14]:
00411                 title = int(record[14], 16)
00412             else:
00413                 # UCD.html says that a missing title char means that
00414                 # it defaults to the uppercase character, not to the
00415                 # character itself. Apparently, in the current UCD (5.x)
00416                 # this feature is never used
00417                 title = upper
00418             upper_d = upper - char
00419             lower_d = lower - char
00420             title_d = title - char
00421             if -32768 <= upper_d <= 32767 and \
00422                -32768 <= lower_d <= 32767 and \
00423                -32768 <= title_d <= 32767:
00424                 # use deltas
00425                 upper = upper_d & 0xffff
00426                 lower = lower_d & 0xffff
00427                 title = title_d & 0xffff
00428             else:
00429                 flags |= NODELTA_MASK
00430             # decimal digit, integer digit
00431             decimal = 0
00432             if record[6]:
00433                 flags |= DECIMAL_MASK
00434                 decimal = int(record[6])
00435             digit = 0
00436             if record[7]:
00437                 flags |= DIGIT_MASK
00438                 digit = int(record[7])
00439             if record[8]:
00440                 flags |= NUMERIC_MASK
00441                 numeric.setdefault(record[8], []).append(char)
00442             item = (
00443                 upper, lower, title, decimal, digit, flags
00444                 )
00445             # add entry to index and item tables
00446             i = cache.get(item)
00447             if i is None:
00448                 cache[item] = i = len(table)
00449                 table.append(item)
00450             index[char] = i
00451 
00452     print(len(table), "unique character type entries")
00453     print(sum(map(len, numeric.values())), "numeric code points")
00454     print(len(spaces), "whitespace code points")
00455     print(len(linebreaks), "linebreak code points")
00456 
00457     print("--- Writing", FILE, "...")
00458 
00459     fp = open(FILE, "w")
00460     print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
00461     print(file=fp)
00462     print("/* a list of unique character type descriptors */", file=fp)
00463     print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
00464     for item in table:
00465         print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
00466     print("};", file=fp)
00467     print(file=fp)
00468 
00469     # split decomposition index table
00470     index1, index2, shift = splitbins(index, trace)
00471 
00472     print("/* type indexes */", file=fp)
00473     print("#define SHIFT", shift, file=fp)
00474     Array("index1", index1).dump(fp, trace)
00475     Array("index2", index2).dump(fp, trace)
00476 
00477     # Generate code for _PyUnicode_ToNumeric()
00478     numeric_items = sorted(numeric.items())
00479     print('/* Returns the numeric value as double for Unicode characters', file=fp)
00480     print(' * having this property, -1.0 otherwise.', file=fp)
00481     print(' */', file=fp)
00482     print('double _PyUnicode_ToNumeric(Py_UCS4 ch)', file=fp)
00483     print('{', file=fp)
00484     print('    switch (ch) {', file=fp)
00485     for value, codepoints in numeric_items:
00486         # Turn text into float literals
00487         parts = value.split('/')
00488         parts = [repr(float(part)) for part in parts]
00489         value = '/'.join(parts)
00490 
00491         codepoints.sort()
00492         for codepoint in codepoints:
00493             print('    case 0x%04X:' % (codepoint,), file=fp)
00494         print('        return (double) %s;' % (value,), file=fp)
00495     print('    }', file=fp)
00496     print('    return -1.0;', file=fp)
00497     print('}', file=fp)
00498     print(file=fp)
00499 
00500     # Generate code for _PyUnicode_IsWhitespace()
00501     print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
00502     print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
00503     print(" */", file=fp)
00504     print('int _PyUnicode_IsWhitespace(register const Py_UCS4 ch)', file=fp)
00505     print('{', file=fp)
00506     print('    switch (ch) {', file=fp)
00507 
00508     for codepoint in sorted(spaces):
00509         print('    case 0x%04X:' % (codepoint,), file=fp)
00510     print('        return 1;', file=fp)
00511 
00512     print('    }', file=fp)
00513     print('    return 0;', file=fp)
00514     print('}', file=fp)
00515     print(file=fp)
00516 
00517     # Generate code for _PyUnicode_IsLinebreak()
00518     print("/* Returns 1 for Unicode characters having the line break", file=fp)
00519     print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
00520     print(" * type 'B', 0 otherwise.", file=fp)
00521     print(" */", file=fp)
00522     print('int _PyUnicode_IsLinebreak(register const Py_UCS4 ch)', file=fp)
00523     print('{', file=fp)
00524     print('    switch (ch) {', file=fp)
00525     for codepoint in sorted(linebreaks):
00526         print('    case 0x%04X:' % (codepoint,), file=fp)
00527     print('        return 1;', file=fp)
00528 
00529     print('    }', file=fp)
00530     print('    return 0;', file=fp)
00531     print('}', file=fp)
00532     print(file=fp)
00533 
00534     fp.close()
00535 
00536 # --------------------------------------------------------------------
00537 # unicode name database
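
One detail of makeunicodetype() deserves a note: when all three case mappings fit in a signed 16-bit range, they are stored as deltas from the character itself (masked with 0xffff); otherwise NODELTA_MASK is set and the absolute code points are stored. A sketch of undoing that packing (apply_case is a hypothetical helper; the real logic is C code built on Objects/unicodetype_db.h):

    def apply_case(char, stored, flags):
        # stored: the upper, lower, or title field of a type record
        if flags & NODELTA_MASK:
            return stored                     # absolute code point
        # sign-extend the 16-bit value and apply it as a delta
        delta = stored - 0x10000 if stored >= 0x8000 else stored
        return char + delta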


def makeunicodedata.merge_old_version(version, new, old)

Definition at line 698 of file makeunicodedata.py.

00698 
00699 def merge_old_version(version, new, old):
00700     # Changes to exclusion file not implemented yet
00701     if old.exclusions != new.exclusions:
00702         raise NotImplementedError("exclusions differ")
00703 
00704     # In these change records, 0xFF means "no change"
00705     bidir_changes = [0xFF]*0x110000
00706     category_changes = [0xFF]*0x110000
00707     decimal_changes = [0xFF]*0x110000
00708     mirrored_changes = [0xFF]*0x110000
00709     # In numeric data, 0 means "no change",
00710     # -1 means "did not have a numeric value"
00711     numeric_changes = [0] * 0x110000
00712     # normalization_changes is a list of key-value pairs
00713     normalization_changes = []
00714     for i in range(0x110000):
00715         if new.table[i] is None:
00716             # Characters unassigned in the new version ought to
00717             # be unassigned in the old one
00718             assert old.table[i] is None
00719             continue
00720         # check characters unassigned in the old version
00721         if old.table[i] is None:
00722             # category 0 is "unassigned"
00723             category_changes[i] = 0
00724             continue
00725         # check characters that differ
00726         if old.table[i] != new.table[i]:
00727             for k in range(len(old.table[i])):
00728                 if old.table[i][k] != new.table[i][k]:
00729                     value = old.table[i][k]
00730                     if k == 2:
00731                         #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
00732                         category_changes[i] = CATEGORY_NAMES.index(value)
00733                     elif k == 4:
00734                         #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
00735                         bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
00736                     elif k == 5:
00737                         #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
00738                         # We assume that all normalization changes are in 1:1 mappings
00739                         assert " " not in value
00740                         normalization_changes.append((i, value))
00741                     elif k == 6:
00742                         #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
00743                         # we only support changes where the old value is a single digit
00744                         assert value in "0123456789"
00745                         decimal_changes[i] = int(value)
00746                     elif k == 8:
00747                         # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
00748                         # Since 0 encodes "no change", the old value is better not 0
00749                         if not value:
00750                             numeric_changes[i] = -1
00751                         else:
00752                             numeric_changes[i] = float(value)
00753                             assert numeric_changes[i] not in (0, -1)
00754                     elif k == 9:
00755                         if value == 'Y':
00756                             mirrored_changes[i] = '1'
00757                         else:
00758                             mirrored_changes[i] = '0'
00759                     elif k == 11:
00760                         # change to ISO comment, ignore
00761                         pass
00762                     elif k == 12:
00763                         # change to simple uppercase mapping; ignore
00764                         pass
00765                     elif k == 13:
00766                         # change to simple lowercase mapping; ignore
00767                         pass
00768                     elif k == 14:
00769                         # change to simple titlecase mapping; ignore
00770                         pass
00771                     elif k == 16:
00772                         # derived property changes; not yet
00773                         pass
00774                     elif k == 17:
00775                         # normalization quickchecks are not performed
00776                         # for older versions
00777                         pass
00778                     else:
00779                         class Difference(Exception):pass
00780                         raise Difference(hex(i), k, old.table[i], new.table[i])
00781     new.changed.append((version, list(zip(bidir_changes, category_changes,
00782                                      decimal_changes, mirrored_changes,
00783                                      numeric_changes)),
00784                         normalization_changes))
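
Each entry appended to unicode.changed is a (version, change-record list, normalization pairs) triple, with 0xFF meaning "no change" in the categorical fields. A sketch of consuming one such record (illustrative only; the shipped consumer is the generated get_change_* C function shown under makeunicodedata() above):

    def old_category_index(change_record, current_index):
        bidir, category, decimal, mirrored, numeric = change_record
        # 0xFF means the category did not change between versions
        return current_index if category == 0xFF else category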


def makeunicodedata.myhash(s, magic)

Definition at line 979 of file makeunicodedata.py.

00979 
00980 def myhash(s, magic):
00981     h = 0
00982     for c in map(ord, s.upper()):
00983         h = (h * magic) + c
00984         ix = h & 0xff000000
00985         if ix:
00986             h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
00987     return h
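
myhash() is a multiplicative string hash kept within 24 bits by folding any overflow above bit 24 back into the low bits; makeunicodename() seeds the Hash class with magic 47. A hedged usage sketch (the actual collision handling lives in the Hash class, which is not listed on this page):

    def bucket_for(name, table_size, magic=47):
        # table_size is assumed to be a power of two, so masking works
        return myhash(name, magic) & (table_size - 1)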

def makeunicodedata.open_data(template, version)

Definition at line 785 of file makeunicodedata.py.

00785 
00786 def open_data(template, version):
00787     local = template % ('-'+version,)
00788     if not os.path.exists(local):
00789         import urllib.request
00790         if version == '3.2.0':
00791             # irregular url structure
00792             url = 'http://www.unicode.org/Public/3.2-Update/' + local
00793         else:
00794             url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
00795         urllib.request.urlretrieve(url, filename=local)
00796     if local.endswith('.txt'):
00797         return open(local, encoding='utf-8')
00798     else:
00799         # Unihan.zip
00800         return open(local, 'rb')
00801 
00802 # --------------------------------------------------------------------
00803 # the following support code is taken from the unidb utilities
00804 # Copyright (c) 1999-2000 by Secret Labs AB
00805 
00806 # load a unicode-data file from disk
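
open_data() caches each UCD file beside the script: the local name comes from the template, the download only happens when the file is missing, and .txt files are opened as UTF-8 text while Unihan.zip is opened in binary mode. A usage sketch (the expected fields are an assumption about UnicodeData.txt, not something shown on this page):

    # fetch (or reuse a cached copy of) UnicodeData-6.0.0.txt and peek at it
    f = open_data(UNICODE_DATA, UNIDATA_VERSION)
    first = f.readline().rstrip()
    print(first.split(";")[:3])   # expected: ['0000', '<control>', 'Cc']
    f.close()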


def makeunicodedata.splitbins(t, trace=0)
t, trace=0 -> (t1, t2, shift).  Split a table to save space.

t is a sequence of ints.  This function can be useful to save space if
many of the ints are the same.  t1 and t2 are lists of ints, and shift
is an int, chosen to minimize the combined size of t1 and t2 (in C
code), and where for each i in range(len(t)),
    t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
where mask is a bitmask isolating the last "shift" bits.

If optional arg trace is non-zero (default zero), progress info
is printed to sys.stderr.  The higher the value, the more info
you'll get.

Definition at line 1103 of file makeunicodedata.py.

01103 
01104 def splitbins(t, trace=0):
01105     """t, trace=0 -> (t1, t2, shift).  Split a table to save space.
01106 
01107     t is a sequence of ints.  This function can be useful to save space if
01108     many of the ints are the same.  t1 and t2 are lists of ints, and shift
01109     is an int, chosen to minimize the combined size of t1 and t2 (in C
01110     code), and where for each i in range(len(t)),
01111         t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
01112     where mask is a bitmask isolating the last "shift" bits.
01113 
01114     If optional arg trace is non-zero (default zero), progress info
01115     is printed to sys.stderr.  The higher the value, the more info
01116     you'll get.
01117     """
01118 
01119     if trace:
01120         def dump(t1, t2, shift, bytes):
01121             print("%d+%d bins at shift %d; %d bytes" % (
01122                 len(t1), len(t2), shift, bytes), file=sys.stderr)
01123         print("Size of original table:", len(t)*getsize(t), \
01124                             "bytes", file=sys.stderr)
01125     n = len(t)-1    # last valid index
01126     maxshift = 0    # the most we can shift n and still have something left
01127     if n > 0:
01128         while n >> 1:
01129             n >>= 1
01130             maxshift += 1
01131     del n
01132     bytes = sys.maxsize  # smallest total size so far
01133     t = tuple(t)    # so slices can be dict keys
01134     for shift in range(maxshift + 1):
01135         t1 = []
01136         t2 = []
01137         size = 2**shift
01138         bincache = {}
01139         for i in range(0, len(t), size):
01140             bin = t[i:i+size]
01141             index = bincache.get(bin)
01142             if index is None:
01143                 index = len(t2)
01144                 bincache[bin] = index
01145                 t2.extend(bin)
01146             t1.append(index >> shift)
01147         # determine memory size
01148         b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
01149         if trace > 1:
01150             dump(t1, t2, shift, b)
01151         if b < bytes:
01152             best = t1, t2, shift
01153             bytes = b
01154     t1, t2, shift = best
01155     if trace:
01156         print("Best:", end=' ', file=sys.stderr)
01157         dump(t1, t2, shift, bytes)
01158     if __debug__:
01159         # exhaustively verify that the decomposition is correct
01160         mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
01161         for i in range(len(t)):
01162             assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
01163     return best
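
The docstring's invariant is the whole contract: any entry of the original table can be recovered from the two generated tables with one shift, one mask, and two indexed loads. A small self-contained check, reusing the formula exactly as stated above:

    t = [17] * 1000 + [99] * 24       # long runs of equal values compress well
    t1, t2, shift = splitbins(t)
    mask = (1 << shift) - 1           # low-bit mask of "shift" bits
    for i in (0, 999, 1000, 1023):
        assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]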



Variable Documentation

int makeunicodedata.ALPHA_MASK = 0x01

Definition at line 59 of file makeunicodedata.py.

list makeunicodedata.BIDIRECTIONAL_NAMES

Initial value:
[ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

Definition at line 50 of file makeunicodedata.py.

list makeunicodedata.CATEGORY_NAMES

Initial value:
[ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

Definition at line 45 of file makeunicodedata.py.

list makeunicodedata.cjk_ranges

Initial value:
[
    ('3400', '4DB5'),
    ('4E00', '9FCB'),
    ('20000', '2A6D6'),
    ('2A700', '2B734'),
    ('2B740', '2B81D')
]

Definition at line 74 of file makeunicodedata.py.

string makeunicodedata.COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"

Definition at line 36 of file makeunicodedata.py.

int makeunicodedata.DECIMAL_MASK = 0x02

Definition at line 60 of file makeunicodedata.py.

string makeunicodedata.DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"

Definition at line 39 of file makeunicodedata.py.

string makeunicodedata.DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"

Definition at line 40 of file makeunicodedata.py.

int makeunicodedata.DIGIT_MASK = 0x04

Definition at line 61 of file makeunicodedata.py.

string makeunicodedata.EASTASIAN_WIDTH = "EastAsianWidth%s.txt"

Definition at line 37 of file makeunicodedata.py.

list makeunicodedata.EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

Definition at line 54 of file makeunicodedata.py.

string makeunicodedata.LINE_BREAK = "LineBreak%s.txt"

Definition at line 41 of file makeunicodedata.py.

int makeunicodedata.LINEBREAK_MASK = 0x10

Definition at line 63 of file makeunicodedata.py.

int makeunicodedata.LOWER_MASK = 0x08

Definition at line 62 of file makeunicodedata.py.

list makeunicodedata.MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

Definition at line 56 of file makeunicodedata.py.

int makeunicodedata.NODELTA_MASK = 0x800

Definition at line 70 of file makeunicodedata.py.

int makeunicodedata.NUMERIC_MASK = 0x1000

Definition at line 71 of file makeunicodedata.py.

list makeunicodedata.old_versions = ["3.2.0"]

Definition at line 43 of file makeunicodedata.py.

int makeunicodedata.PRINTABLE_MASK = 0x400

Definition at line 69 of file makeunicodedata.py.

string makeunicodedata.SCRIPT = sys.argv[0]

Definition at line 30 of file makeunicodedata.py.

list makeunicodedata.SIZES

Initial value:
[
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]

Definition at line 988 of file makeunicodedata.py.

int makeunicodedata.SPACE_MASK = 0x20

Definition at line 64 of file makeunicodedata.py.

int makeunicodedata.TITLE_MASK = 0x40

Definition at line 65 of file makeunicodedata.py.

string makeunicodedata.UNICODE_DATA = "UnicodeData%s.txt"

Definition at line 35 of file makeunicodedata.py.

string makeunicodedata.UNIDATA_VERSION = "6.0.0"

Definition at line 34 of file makeunicodedata.py.

string makeunicodedata.UNIHAN = "Unihan%s.zip"

Definition at line 38 of file makeunicodedata.py.

int makeunicodedata.UPPER_MASK = 0x80

Definition at line 66 of file makeunicodedata.py.

string makeunicodedata.VERSION = "3.2"

Definition at line 31 of file makeunicodedata.py.

int makeunicodedata.XID_CONTINUE_MASK = 0x200

Definition at line 68 of file makeunicodedata.py.

int makeunicodedata.XID_START_MASK = 0x100

Definition at line 67 of file makeunicodedata.py.