Back to index

python3.2  3.2.2
Public Member Functions | Public Attributes
makeunicodedata.UnicodeData Class Reference

List of all members.

Public Member Functions

def __init__
def uselatin1

Public Attributes

 changed
 filename
 table
 chars
 exclusions

Detailed Description

Definition at line 807 of file makeunicodedata.py.


Constructor & Destructor Documentation

def makeunicodedata.UnicodeData.__init__ (   self,
  version,
  linebreakprops = False,
  expand = 1,
  cjk_check = True 
)

Definition at line 817 of file makeunicodedata.py.

00817 
00818                  cjk_check=True):
00819         self.changed = []
00820         file = open_data(UNICODE_DATA, version)
00821         table = [None] * 0x110000
00822         while 1:
00823             s = file.readline()
00824             if not s:
00825                 break
00826             s = s.strip().split(";")
00827             char = int(s[0], 16)
00828             table[char] = s
00829 
00830         cjk_ranges_found = []
00831 
00832         # expand first-last ranges
00833         if expand:
00834             field = None
00835             for i in range(0, 0x110000):
00836                 s = table[i]
00837                 if s:
00838                     if s[1][-6:] == "First>":
00839                         s[1] = ""
00840                         field = s
00841                     elif s[1][-5:] == "Last>":
00842                         if s[1].startswith("<CJK Ideograph"):
00843                             cjk_ranges_found.append((field[0],
00844                                                      s[0]))
00845                         s[1] = ""
00846                         field = None
00847                 elif field:
00848                     f2 = field[:]
00849                     f2[0] = "%X" % i
00850                     table[i] = f2
00851             if cjk_check and cjk_ranges != cjk_ranges_found:
00852                 raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
00853 
00854         # public attributes
00855         self.filename = UNICODE_DATA % ''
00856         self.table = table
00857         self.chars = list(range(0x110000)) # unicode 3.2
00858 
00859         file = open_data(COMPOSITION_EXCLUSIONS, version)
00860         self.exclusions = {}
00861         for s in file:
00862             s = s.strip()
00863             if not s:
00864                 continue
00865             if s[0] == '#':
00866                 continue
00867             char = int(s.split()[0],16)
00868             self.exclusions[char] = 1
00869 
00870         widths = [None] * 0x110000
00871         for s in open_data(EASTASIAN_WIDTH, version):
00872             s = s.strip()
00873             if not s:
00874                 continue
00875             if s[0] == '#':
00876                 continue
00877             s = s.split()[0].split(';')
00878             if '..' in s[0]:
00879                 first, last = [int(c, 16) for c in s[0].split('..')]
00880                 chars = list(range(first, last+1))
00881             else:
00882                 chars = [int(s[0], 16)]
00883             for char in chars:
00884                 widths[char] = s[1]
00885         for i in range(0, 0x110000):
00886             if table[i] is not None:
00887                 table[i].append(widths[i])
00888 
00889         for i in range(0, 0x110000):
00890             if table[i] is not None:
00891                 table[i].append(set())
00892         for s in open_data(DERIVED_CORE_PROPERTIES, version):
00893             s = s.split('#', 1)[0].strip()
00894             if not s:
00895                 continue
00896 
00897             r, p = s.split(";")
00898             r = r.strip()
00899             p = p.strip()
00900             if ".." in r:
00901                 first, last = [int(c, 16) for c in r.split('..')]
00902                 chars = list(range(first, last+1))
00903             else:
00904                 chars = [int(r, 16)]
00905             for char in chars:
00906                 if table[char]:
00907                     # Some properties (e.g. Default_Ignorable_Code_Point)
00908                     # apply to unassigned code points; ignore them
00909                     table[char][-1].add(p)
00910 
00911         for s in open_data(LINE_BREAK, version):
00912             s = s.partition('#')[0]
00913             s = [i.strip() for i in s.split(';')]
00914             if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
00915                 continue
00916             if '..' not in s[0]:
00917                 first = last = int(s[0], 16)
00918             else:
00919                 first, last = [int(c, 16) for c in s[0].split('..')]
00920             for char in range(first, last+1):
00921                 table[char][-1].add('Line_Break')
00922 
00923         # We only want the quickcheck properties
00924         # Format: NF?_QC; Y(es)/N(o)/M(aybe)
00925         # Yes is the default, hence only N and M occur
00926         # In 3.2.0, the format was different (NF?_NO)
00927         # The parsing will incorrectly determine these as
00928         # "yes", however, unicodedata.c will not perform quickchecks
00929         # for older versions, and no delta records will be created.
00930         quickchecks = [0] * 0x110000
00931         qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
00932         for s in open_data(DERIVEDNORMALIZATION_PROPS, version):
00933             if '#' in s:
00934                 s = s[:s.index('#')]
00935             s = [i.strip() for i in s.split(';')]
00936             if len(s) < 2 or s[1] not in qc_order:
00937                 continue
00938             quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
00939             quickcheck_shift = qc_order.index(s[1])*2
00940             quickcheck <<= quickcheck_shift
00941             if '..' not in s[0]:
00942                 first = last = int(s[0], 16)
00943             else:
00944                 first, last = [int(c, 16) for c in s[0].split('..')]
00945             for char in range(first, last+1):
00946                 assert not (quickchecks[char]>>quickcheck_shift)&3
00947                 quickchecks[char] |= quickcheck
00948         for i in range(0, 0x110000):
00949             if table[i] is not None:
00950                 table[i].append(quickchecks[i])
00951 
00952         zip = zipfile.ZipFile(open_data(UNIHAN, version))
00953         if version == '3.2.0':
00954             data = zip.open('Unihan-3.2.0.txt').read()
00955         else:
00956             data = zip.open('Unihan_NumericValues.txt').read()
00957         for line in data.decode("utf-8").splitlines():
00958             if not line.startswith('U+'):
00959                 continue
00960             code, tag, value = line.split(None, 3)[:3]
00961             if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
00962                            'kOtherNumeric'):
00963                 continue
00964             value = value.strip().replace(',', '')
00965             i = int(code[2:], 16)
00966             # Patch the numeric field
00967             if table[i] is not None:
00968                 table[i][8] = value


Member Function Documentation

def makeunicodedata.UnicodeData.uselatin1 (   self )

Definition at line 969 of file makeunicodedata.py.

00969 
00970     def uselatin1(self):
00971         # restrict self.chars to the ISO Latin 1 range (code points 0-255)
00972         self.chars = list(range(256))
00973 
00974 # hash table tools
00975 
00976 # this is a straight-forward reimplementation of Python's built-in
00977 # dictionary type, using a static data structure, and a custom string
00978 # hash algorithm.


Member Data Documentation

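makeunicodedata.UnicodeData.changed

Definition at line 818 of file makeunicodedata.py.

makeunicodedata.UnicodeData.chars

Definition at line 856 of file makeunicodedata.py.

makeunicodedata.UnicodeData.exclusions

Definition at line 859 of file makeunicodedata.py.

makeunicodedata.UnicodeData.filename

Definition at line 854 of file makeunicodedata.py.

makeunicodedata.UnicodeData.table

Definition at line 855 of file makeunicodedata.py.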


The documentation for this class was generated from the following file:

makeunicodedata.py