Back to index

python3.2  3.2.2
Functions | Variables
sre_compile Namespace Reference

Functions

def _identityfunction
def _compile
def _compile_charset
def _optimize_charset
def _mk_bitmap
def _optimize_unicode
def _simple
def _compile_info
def isstring
def _code
def compile

Variables

int MAXCODE = 65535
set _LITERAL_CODES = set([LITERAL, NOT_LITERAL])
set _REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT])
set _SUCCESS_CODES = set([SUCCESS, FAILURE])
set _ASSERT_CODES = set([ASSERT, ASSERT_NOT])
 emit = code.append
 
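Commented-out debug code picked up from the original source (Python 2 print syntax; apparently not executed):

if prefix:
    print "*** PREFIX", prefix, prefix_skip
if charset:
    print "*** CHARSET", charset

Source comment attached to this assignment: "add an info block".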

int skip = len(code)
int mask = 0
list prefix = prefix[:MAXCODE]
list table = [-1]

Function Documentation

def sre_compile._code (   p,
  flags 
) [private]

Definition at line 471 of file sre_compile.py.

00471 
00472 def _code(p, flags):
00473 
00474     flags = p.pattern.flags | flags
00475     code = []
00476 
00477     # compile info block
00478     _compile_info(code, p, flags)
00479 
00480     # compile the pattern
00481     _compile(code, p.data, flags)
00482 
00483     code.append(OPCODES[SUCCESS])
00484 
00485     return code

Here is the call graph for this function:

Here is the caller graph for this function:

def sre_compile._compile (   code,
  pattern,
  flags 
) [private]

Definition at line 32 of file sre_compile.py.

00032 
00033 def _compile(code, pattern, flags):
00034     # internal: compile a (sub)pattern
00035     emit = code.append
00036     _len = len
00037     LITERAL_CODES = _LITERAL_CODES
00038     REPEATING_CODES = _REPEATING_CODES
00039     SUCCESS_CODES = _SUCCESS_CODES
00040     ASSERT_CODES = _ASSERT_CODES
00041     for op, av in pattern:
00042         if op in LITERAL_CODES:
00043             if flags & SRE_FLAG_IGNORECASE:
00044                 emit(OPCODES[OP_IGNORE[op]])
00045                 emit(_sre.getlower(av, flags))
00046             else:
00047                 emit(OPCODES[op])
00048                 emit(av)
00049         elif op is IN:
00050             if flags & SRE_FLAG_IGNORECASE:
00051                 emit(OPCODES[OP_IGNORE[op]])
00052                 def fixup(literal, flags=flags):
00053                     return _sre.getlower(literal, flags)
00054             else:
00055                 emit(OPCODES[op])
00056                 fixup = _identityfunction
00057             skip = _len(code); emit(0)
00058             _compile_charset(av, flags, code, fixup)
00059             code[skip] = _len(code) - skip
00060         elif op is ANY:
00061             if flags & SRE_FLAG_DOTALL:
00062                 emit(OPCODES[ANY_ALL])
00063             else:
00064                 emit(OPCODES[ANY])
00065         elif op in REPEATING_CODES:
00066             if flags & SRE_FLAG_TEMPLATE:
00067                 raise error("internal: unsupported template operator")
00068                 emit(OPCODES[REPEAT])
00069                 skip = _len(code); emit(0)
00070                 emit(av[0])
00071                 emit(av[1])
00072                 _compile(code, av[2], flags)
00073                 emit(OPCODES[SUCCESS])
00074                 code[skip] = _len(code) - skip
00075             elif _simple(av) and op is not REPEAT:
00076                 if op is MAX_REPEAT:
00077                     emit(OPCODES[REPEAT_ONE])
00078                 else:
00079                     emit(OPCODES[MIN_REPEAT_ONE])
00080                 skip = _len(code); emit(0)
00081                 emit(av[0])
00082                 emit(av[1])
00083                 _compile(code, av[2], flags)
00084                 emit(OPCODES[SUCCESS])
00085                 code[skip] = _len(code) - skip
00086             else:
00087                 emit(OPCODES[REPEAT])
00088                 skip = _len(code); emit(0)
00089                 emit(av[0])
00090                 emit(av[1])
00091                 _compile(code, av[2], flags)
00092                 code[skip] = _len(code) - skip
00093                 if op is MAX_REPEAT:
00094                     emit(OPCODES[MAX_UNTIL])
00095                 else:
00096                     emit(OPCODES[MIN_UNTIL])
00097         elif op is SUBPATTERN:
00098             if av[0]:
00099                 emit(OPCODES[MARK])
00100                 emit((av[0]-1)*2)
00101             # _compile_info(code, av[1], flags)
00102             _compile(code, av[1], flags)
00103             if av[0]:
00104                 emit(OPCODES[MARK])
00105                 emit((av[0]-1)*2+1)
00106         elif op in SUCCESS_CODES:
00107             emit(OPCODES[op])
00108         elif op in ASSERT_CODES:
00109             emit(OPCODES[op])
00110             skip = _len(code); emit(0)
00111             if av[0] >= 0:
00112                 emit(0) # look ahead
00113             else:
00114                 lo, hi = av[1].getwidth()
00115                 if lo != hi:
00116                     raise error("look-behind requires fixed-width pattern")
00117                 emit(lo) # look behind
00118             _compile(code, av[1], flags)
00119             emit(OPCODES[SUCCESS])
00120             code[skip] = _len(code) - skip
00121         elif op is CALL:
00122             emit(OPCODES[op])
00123             skip = _len(code); emit(0)
00124             _compile(code, av, flags)
00125             emit(OPCODES[SUCCESS])
00126             code[skip] = _len(code) - skip
00127         elif op is AT:
00128             emit(OPCODES[op])
00129             if flags & SRE_FLAG_MULTILINE:
00130                 av = AT_MULTILINE.get(av, av)
00131             if flags & SRE_FLAG_LOCALE:
00132                 av = AT_LOCALE.get(av, av)
00133             elif flags & SRE_FLAG_UNICODE:
00134                 av = AT_UNICODE.get(av, av)
00135             emit(ATCODES[av])
00136         elif op is BRANCH:
00137             emit(OPCODES[op])
00138             tail = []
00139             tailappend = tail.append
00140             for av in av[1]:
00141                 skip = _len(code); emit(0)
00142                 # _compile_info(code, av, flags)
00143                 _compile(code, av, flags)
00144                 emit(OPCODES[JUMP])
00145                 tailappend(_len(code)); emit(0)
00146                 code[skip] = _len(code) - skip
00147             emit(0) # end of branch
00148             for tail in tail:
00149                 code[tail] = _len(code) - tail
00150         elif op is CATEGORY:
00151             emit(OPCODES[op])
00152             if flags & SRE_FLAG_LOCALE:
00153                 av = CH_LOCALE[av]
00154             elif flags & SRE_FLAG_UNICODE:
00155                 av = CH_UNICODE[av]
00156             emit(CHCODES[av])
00157         elif op is GROUPREF:
00158             if flags & SRE_FLAG_IGNORECASE:
00159                 emit(OPCODES[OP_IGNORE[op]])
00160             else:
00161                 emit(OPCODES[op])
00162             emit(av-1)
00163         elif op is GROUPREF_EXISTS:
00164             emit(OPCODES[op])
00165             emit(av[0]-1)
00166             skipyes = _len(code); emit(0)
00167             _compile(code, av[1], flags)
00168             if av[2]:
00169                 emit(OPCODES[JUMP])
00170                 skipno = _len(code); emit(0)
00171                 code[skipyes] = _len(code) - skipyes + 1
00172                 _compile(code, av[2], flags)
00173                 code[skipno] = _len(code) - skipno
00174             else:
00175                 code[skipyes] = _len(code) - skipyes + 1
00176         else:
00177             raise ValueError("unsupported operand type", op)

Here is the call graph for this function:

Here is the caller graph for this function:

def sre_compile._compile_charset (   charset,
  flags,
  code,
  fixup = None 
) [private]

Definition at line 178 of file sre_compile.py.

00178 
00179 def _compile_charset(charset, flags, code, fixup=None):
00180     # compile charset subprogram
00181     emit = code.append
00182     if fixup is None:
00183         fixup = _identityfunction
00184     for op, av in _optimize_charset(charset, fixup):
00185         emit(OPCODES[op])
00186         if op is NEGATE:
00187             pass
00188         elif op is LITERAL:
00189             emit(fixup(av))
00190         elif op is RANGE:
00191             emit(fixup(av[0]))
00192             emit(fixup(av[1]))
00193         elif op is CHARSET:
00194             code.extend(av)
00195         elif op is BIGCHARSET:
00196             code.extend(av)
00197         elif op is CATEGORY:
00198             if flags & SRE_FLAG_LOCALE:
00199                 emit(CHCODES[CH_LOCALE[av]])
00200             elif flags & SRE_FLAG_UNICODE:
00201                 emit(CHCODES[CH_UNICODE[av]])
00202             else:
00203                 emit(CHCODES[av])
00204         else:
00205             raise error("internal: unsupported set operator")
00206     emit(OPCODES[FAILURE])

Here is the call graph for this function:

Here is the caller graph for this function:

def sre_compile._compile_info (   code,
  pattern,
  flags 
) [private]

Definition at line 362 of file sre_compile.py.

00362 
00363 def _compile_info(code, pattern, flags):
00364     # internal: compile an info block.  in the current version,
00365     # this contains min/max pattern width, and an optional literal
00366     # prefix or a character map
00367     lo, hi = pattern.getwidth()
00368     if lo == 0:
00369         return # not worth it
00370     # look for a literal prefix
00371     prefix = []
00372     prefixappend = prefix.append
00373     prefix_skip = 0
00374     charset = [] # not used
00375     charsetappend = charset.append
00376     if not (flags & SRE_FLAG_IGNORECASE):
00377         # look for literal prefix
00378         for op, av in pattern.data:
00379             if op is LITERAL:
00380                 if len(prefix) == prefix_skip:
00381                     prefix_skip = prefix_skip + 1
00382                 prefixappend(av)
00383             elif op is SUBPATTERN and len(av[1]) == 1:
00384                 op, av = av[1][0]
00385                 if op is LITERAL:
00386                     prefixappend(av)
00387                 else:
00388                     break
00389             else:
00390                 break
00391         # if no prefix, look for charset prefix
00392         if not prefix and pattern.data:
00393             op, av = pattern.data[0]
00394             if op is SUBPATTERN and av[1]:
00395                 op, av = av[1][0]
00396                 if op is LITERAL:
00397                     charsetappend((op, av))
00398                 elif op is BRANCH:
00399                     c = []
00400                     cappend = c.append
00401                     for p in av[1]:
00402                         if not p:
00403                             break
00404                         op, av = p[0]
00405                         if op is LITERAL:
00406                             cappend((op, av))
00407                         else:
00408                             break
00409                     else:
00410                         charset = c
00411             elif op is BRANCH:
00412                 c = []
00413                 cappend = c.append
00414                 for p in av[1]:
00415                     if not p:
00416                         break
00417                     op, av = p[0]
00418                     if op is LITERAL:
00419                         cappend((op, av))
00420                     else:
00421                         break
00422                 else:
00423                     charset = c
00424             elif op is IN:
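00425                 charset = av

(The listing is truncated here; the remainder of the _compile_info body is not shown on this page.)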

Here is the caller graph for this function:

def sre_compile._identityfunction (   x) [private]

Definition at line 24 of file sre_compile.py.

00024 
00025 def _identityfunction(x):
00026     return x

def sre_compile._mk_bitmap (   bits) [private]

Definition at line 258 of file sre_compile.py.

00258 
00259 def _mk_bitmap(bits):
00260     data = []
00261     dataappend = data.append
00262     if _sre.CODESIZE == 2:
00263         start = (1, 0)
00264     else:
00265         start = (1, 0)
00266     m, v = start
00267     for c in bits:
00268         if c:
00269             v = v + m
00270         m = m + m
00271         if m > MAXCODE:
00272             dataappend(v)
00273             m, v = start
00274     return data
00275 
00276 # To represent a big charset, first a bitmap of all characters in the
00277 # set is constructed. Then, this bitmap is sliced into chunks of 256
00278 # characters, duplicate chunks are eliminated, and each chunk is
00279 # given a number. In the compiled expression, the charset is
00280 # represented by a 16-bit word sequence, consisting of one word for
00281 # the number of different chunks, a sequence of 256 bytes (128 words)
00282 # of chunk numbers indexed by their original chunk position, and a
00283 # sequence of chunks (16 words each).
00284 
00285 # Compression is normally good: in a typical charset, large ranges of
00286 # Unicode will be either completely excluded (e.g. if only cyrillic
00287 # letters are to be matched), or completely included (e.g. if large
00288 # subranges of Kanji match). These ranges will be represented by
00289 # chunks of all one-bits or all zero-bits.
00290 
00291 # Matching can be also done efficiently: the more significant byte of
00292 # the Unicode character is an index into the chunk number, and the
00293 # less significant byte is a bit index in the chunk (just like the
00294 # CHARSET matching).
00295 
00296 # In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
00297 # of the basic multilingual plane; an efficient representation
00298 # for all of UTF-16 has not yet been developed. This means,
00299 # in particular, that negated charsets cannot be represented as
00300 # bigcharsets.

Here is the caller graph for this function:

def sre_compile._optimize_charset (   charset,
  fixup 
) [private]

Definition at line 207 of file sre_compile.py.

00207 
00208 def _optimize_charset(charset, fixup):
00209     # internal: optimize character set
00210     out = []
00211     outappend = out.append
00212     charmap = [0]*256
00213     try:
00214         for op, av in charset:
00215             if op is NEGATE:
00216                 outappend((op, av))
00217             elif op is LITERAL:
00218                 charmap[fixup(av)] = 1
00219             elif op is RANGE:
00220                 for i in range(fixup(av[0]), fixup(av[1])+1):
00221                     charmap[i] = 1
00222             elif op is CATEGORY:
00223                 # XXX: could append to charmap tail
00224                 return charset # cannot compress
00225     except IndexError:
00226         # character set contains unicode characters
00227         return _optimize_unicode(charset, fixup)
00228     # compress character map
00229     i = p = n = 0
00230     runs = []
00231     runsappend = runs.append
00232     for c in charmap:
00233         if c:
00234             if n == 0:
00235                 p = i
00236             n = n + 1
00237         elif n:
00238             runsappend((p, n))
00239             n = 0
00240         i = i + 1
00241     if n:
00242         runsappend((p, n))
00243     if len(runs) <= 2:
00244         # use literal/range
00245         for p, n in runs:
00246             if n == 1:
00247                 outappend((LITERAL, p))
00248             else:
00249                 outappend((RANGE, (p, p+n-1)))
00250         if len(out) < len(charset):
00251             return out
00252     else:
00253         # use bitmap
00254         data = _mk_bitmap(charmap)
00255         outappend((CHARSET, data))
00256         return out
00257     return charset

Here is the call graph for this function:

Here is the caller graph for this function:

def sre_compile._optimize_unicode (   charset,
  fixup 
) [private]

Definition at line 301 of file sre_compile.py.

00301 
00302 def _optimize_unicode(charset, fixup):
00303     try:
00304         import array
00305     except ImportError:
00306         return charset
00307     charmap = [0]*65536
00308     negate = 0
00309     try:
00310         for op, av in charset:
00311             if op is NEGATE:
00312                 negate = 1
00313             elif op is LITERAL:
00314                 charmap[fixup(av)] = 1
00315             elif op is RANGE:
00316                 for i in range(fixup(av[0]), fixup(av[1])+1):
00317                     charmap[i] = 1
00318             elif op is CATEGORY:
00319                 # XXX: could expand category
00320                 return charset # cannot compress
00321     except IndexError:
00322         # non-BMP characters
00323         return charset
00324     if negate:
00325         if sys.maxunicode != 65535:
00326             # XXX: negation does not work with big charsets
00327             return charset
00328         for i in range(65536):
00329             charmap[i] = not charmap[i]
00330     comps = {}
00331     mapping = [0]*256
00332     block = 0
00333     data = []
00334     for i in range(256):
00335         chunk = tuple(charmap[i*256:(i+1)*256])
00336         new = comps.setdefault(chunk, block)
00337         mapping[i] = new
00338         if new == block:
00339             block = block + 1
00340             data = data + _mk_bitmap(chunk)
00341     header = [block]
00342     if _sre.CODESIZE == 2:
00343         code = 'H'
00344     else:
00345         code = 'I'
00346     # Convert block indices to byte array of 256 bytes
00347     mapping = array.array('b', mapping).tobytes()
00348     # Convert byte array to word array
00349     mapping = array.array(code, mapping)
00350     assert mapping.itemsize == _sre.CODESIZE
00351     assert len(mapping) * mapping.itemsize == 256
00352     header = header + mapping.tolist()
00353     data[0:0] = header
00354     return [(BIGCHARSET, data)]

Here is the call graph for this function:

Here is the caller graph for this function:

def sre_compile._simple (   av) [private]

Definition at line 355 of file sre_compile.py.

00355 
00356 def _simple(av):
00357     # check if av is a "simple" operator
00358     lo, hi = av[2].getwidth()
00359     if lo == 0 and hi == MAXREPEAT:
00360         raise error("nothing to repeat")
00361     return lo == hi == 1 and av[2][0][0] != SUBPATTERN

Here is the caller graph for this function:

def sre_compile.compile (   p,
  flags = 0 
)

Definition at line 486 of file sre_compile.py.

00486 
00487 def compile(p, flags=0):
00488     # internal: convert pattern list to internal format
00489 
00490     if isstring(p):
00491         pattern = p
00492         p = sre_parse.parse(p, flags)
00493     else:
00494         pattern = None
00495 
00496     code = _code(p, flags)
00497 
00498     # print code
00499 
00500     # XXX: <fl> get rid of this limitation!
00501     if p.pattern.groups > 100:
00502         raise AssertionError(
00503             "sorry, but this version only supports 100 named groups"
00504             )
00505 
00506     # map in either direction
00507     groupindex = p.pattern.groupdict
00508     indexgroup = [None] * p.pattern.groups
00509     for k, i in groupindex.items():
00510         indexgroup[i] = k
00511 
00512     return _sre.compile(
00513         pattern, flags | p.pattern.flags, code,
00514         p.pattern.groups-1,
00515         groupindex, indexgroup
00516         )

Here is the call graph for this function:

Here is the caller graph for this function:

def sre_compile.isstring (   obj)

Definition at line 468 of file sre_compile.py.

00468 
00469 def isstring(obj):
00470     return isinstance(obj, (str, bytes))

Here is the caller graph for this function:


Variable Documentation

set sre_compile._ASSERT_CODES = set([ASSERT, ASSERT_NOT])

Definition at line 30 of file sre_compile.py.

set sre_compile._LITERAL_CODES = set([LITERAL, NOT_LITERAL])

Definition at line 27 of file sre_compile.py.

set sre_compile._REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT])

Definition at line 28 of file sre_compile.py.

set sre_compile._SUCCESS_CODES = set([SUCCESS, FAILURE])

Definition at line 29 of file sre_compile.py.

sre_compile.emit = code.append

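Commented-out debug code picked up from the original source (Python 2 print syntax; apparently not executed):

if prefix:
    print "*** PREFIX", prefix, prefix_skip
if charset:
    print "*** CHARSET", charset

Source comment attached to this assignment: "add an info block".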

Definition at line 430 of file sre_compile.py.

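int sre_compile.mask = 0

Definition at line 434 of file sre_compile.py.

int sre_compile.MAXCODE = 65535

Definition at line 20 of file sre_compile.py.

list sre_compile.prefix = prefix[:MAXCODE]

Definition at line 447 of file sre_compile.py.

int sre_compile.skip = len(code)

Definition at line 432 of file sre_compile.py.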

list sre_compile.table = [-1]

Definition at line 458 of file sre_compile.py.