test_tokenize.py (python3.2 3.2.2)

doctests = """
Tests for the tokenize module.

The tests can be really simple. Given a small fragment of source
code, print out a table with tokens. The ENDMARKER is omitted for
brevity.

    >>> dump_tokens("1 + 1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '1'           (1, 0) (1, 1)
    OP         '+'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)

    >>> dump_tokens("if False:\\n"
    ...             "    # NL\\n"
    ...             "    True = False # NEWLINE\\n")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'if'          (1, 0) (1, 2)
    NAME       'False'       (1, 3) (1, 8)
    OP         ':'           (1, 8) (1, 9)
    NEWLINE    '\\n'          (1, 9) (1, 10)
    COMMENT    '# NL'        (2, 4) (2, 8)
    NL         '\\n'          (2, 8) (2, 9)
    INDENT     '    '        (3, 0) (3, 4)
    NAME       'True'        (3, 4) (3, 8)
    OP         '='           (3, 9) (3, 10)
    NAME       'False'       (3, 11) (3, 16)
    COMMENT    '# NEWLINE'   (3, 17) (3, 26)
    NEWLINE    '\\n'          (3, 26) (3, 27)
    DEDENT     ''            (4, 0) (4, 0)

    >>> indent_error_file = \"""
    ... def k(x):
    ...     x += 2
    ...   x += 5
    ... \"""
    >>> readline = BytesIO(indent_error_file.encode('utf-8')).readline
    >>> for tok in tokenize(readline): pass
    Traceback (most recent call last):
        ...
    IndentationError: unindent does not match any outer indentation level

There are some standard formatting practices that are easy to get right.

    >>> roundtrip("if x == 1:\\n"
    ...           "    print(x)\\n")
    True

    >>> roundtrip("# This is a comment\\n# This also")
    True

Some people use different formatting conventions, which makes
untokenize a little trickier. Note that this test involves trailing
whitespace after the colon. Note that we use hex escapes to make the
two trailing blanks apparent in the expected output.

    >>> roundtrip("if x == 1 : \\n"
    ...           "  print(x)\\n")
    True

    >>> f = support.findfile("tokenize_tests.txt")
    >>> roundtrip(open(f, 'rb'))
    True

    >>> roundtrip("if x == 1:\\n"
    ...           "    # A comment by itself.\\n"
    ...           "    print(x) # Comment here, too.\\n"
    ...           "    # Another comment.\\n"
    ...           "after_if = True\\n")
    True

    >>> roundtrip("if (x # The comments need to go in the right place\\n"
    ...           "    == 1):\\n"
    ...           "    print('x==1')\\n")
    True

    >>> roundtrip("class Test: # A comment here\\n"
    ...           "  # A comment with weird indent\\n"
    ...           "  after_com = 5\\n"
    ...           "  def x(m): return m*5 # a one liner\\n"
    ...           "  def y(m): # A whitespace after the colon\\n"
    ...           "     return y*4 # 3-space indent\\n")
    True

Some error-handling code

    >>> roundtrip("try: import somemodule\\n"
    ...           "except ImportError: # comment\\n"
    ...           "    print('Can not import' # comment2\\n)"
    ...           "else:   print('Loaded')\\n")
    True

Balancing continuation

    >>> roundtrip("a = (3,4, \\n"
    ...           "5,6)\\n"
    ...           "y = [3, 4,\\n"
    ...           "5]\\n"
    ...           "z = {'a': 5,\\n"
    ...           "'b':15, 'c':True}\\n"
    ...           "x = len(y) + 5 - a[\\n"
    ...           "3] - a[2]\\n"
    ...           "+ len(z) - z[\\n"
    ...           "'b']\\n")
    True

Ordinary integers and binary operators

    >>> dump_tokens("0xff <= 255")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xff'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    >>> dump_tokens("0b10 <= 255")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0b10'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    >>> dump_tokens("0o123 <= 0O123")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0o123'       (1, 0) (1, 5)
    OP         '<='          (1, 6) (1, 8)
    NUMBER     '0O123'       (1, 9) (1, 14)
    >>> dump_tokens("1234567 > ~0x15")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '1234567'     (1, 0) (1, 7)
    OP         '>'           (1, 8) (1, 9)
    OP         '~'           (1, 10) (1, 11)
    NUMBER     '0x15'        (1, 11) (1, 15)
    >>> dump_tokens("2134568 != 1231515")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '2134568'     (1, 0) (1, 7)
    OP         '!='          (1, 8) (1, 10)
    NUMBER     '1231515'     (1, 11) (1, 18)
    >>> dump_tokens("(-124561-1) & 200000000")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '('           (1, 0) (1, 1)
    OP         '-'           (1, 1) (1, 2)
    NUMBER     '124561'      (1, 2) (1, 8)
    OP         '-'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         ')'           (1, 10) (1, 11)
    OP         '&'           (1, 12) (1, 13)
    NUMBER     '200000000'   (1, 14) (1, 23)
    >>> dump_tokens("0xdeadbeef != -1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xdeadbeef'  (1, 0) (1, 10)
    OP         '!='          (1, 11) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    NUMBER     '1'           (1, 15) (1, 16)
    >>> dump_tokens("0xdeadc0de & 12345")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xdeadc0de'  (1, 0) (1, 10)
    OP         '&'           (1, 11) (1, 12)
    NUMBER     '12345'       (1, 13) (1, 18)
    >>> dump_tokens("0xFF & 0x15 | 1234")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NUMBER     '0xFF'        (1, 0) (1, 4)
    OP         '&'           (1, 5) (1, 6)
    NUMBER     '0x15'        (1, 7) (1, 11)
    OP         '|'           (1, 12) (1, 13)
    NUMBER     '1234'        (1, 14) (1, 18)

Long integers

    >>> dump_tokens("x = 0")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0'           (1, 4) (1, 5)
    >>> dump_tokens("x = 0xfffffffffff")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0xffffffffff (1, 4) (1, 17)
    >>> dump_tokens("x = 123141242151251616110")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '123141242151 (1, 4) (1, 25)
    >>> dump_tokens("x = -15921590215012591")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    OP         '-'           (1, 4) (1, 5)
    NUMBER     '159215902150 (1, 5) (1, 22)

Floating point numbers

    >>> dump_tokens("x = 3.14159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 314159.")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '314159.'     (1, 4) (1, 11)
    >>> dump_tokens("x = .314159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '.314159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 3e14159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3e14159'     (1, 4) (1, 11)
    >>> dump_tokens("x = 3E123")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3E123'       (1, 4) (1, 9)
    >>> dump_tokens("x+y = 3e-1230")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '+'           (1, 1) (1, 2)
    NAME       'y'           (1, 2) (1, 3)
    OP         '='           (1, 4) (1, 5)
    NUMBER     '3e-1230'     (1, 6) (1, 13)
    >>> dump_tokens("x = 3.14e159")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14e159'    (1, 4) (1, 12)

String literals

    >>> dump_tokens("x = ''; y = \\\"\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "''"          (1, 4) (1, 6)
    OP         ';'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '='           (1, 10) (1, 11)
    STRING     '""'          (1, 12) (1, 14)
    >>> dump_tokens("x = '\\\"'; y = \\\"'\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '\\'"\\''       (1, 4) (1, 7)
    OP         ';'           (1, 7) (1, 8)
    NAME       'y'           (1, 9) (1, 10)
    OP         '='           (1, 11) (1, 12)
    STRING     '"\\'"'        (1, 13) (1, 16)
    >>> dump_tokens("x = \\\"doesn't \\\"shrink\\\", does it\\\"")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"doesn\\'t "' (1, 4) (1, 14)
    NAME       'shrink'      (1, 14) (1, 20)
    STRING     '", does it"' (1, 20) (1, 31)
    >>> dump_tokens("x = 'abc' + 'ABC'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "'abc'"       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     "'ABC'"       (1, 12) (1, 17)
    >>> dump_tokens('y = "ABC" + "ABC"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"ABC"'       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     '"ABC"'       (1, 12) (1, 17)
    >>> dump_tokens("x = r'abc' + r'ABC' + R'ABC' + R'ABC'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "r'abc'"      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     "r'ABC'"      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     "R'ABC'"      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     "R'ABC'"      (1, 31) (1, 37)
    >>> dump_tokens('y = r"abc" + r"ABC" + R"ABC" + R"ABC"')
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     'r"abc"'      (1, 4) (1, 10)
    OP         '+'           (1, 11) (1, 12)
    STRING     'r"ABC"'      (1, 13) (1, 19)
    OP         '+'           (1, 20) (1, 21)
    STRING     'R"ABC"'      (1, 22) (1, 28)
    OP         '+'           (1, 29) (1, 30)
    STRING     'R"ABC"'      (1, 31) (1, 37)

Operators

    >>> dump_tokens("def d22(a, b, c=2, d=2, *k): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd22'         (1, 4) (1, 7)
    OP         '('           (1, 7) (1, 8)
    NAME       'a'           (1, 8) (1, 9)
    OP         ','           (1, 9) (1, 10)
    NAME       'b'           (1, 11) (1, 12)
    OP         ','           (1, 12) (1, 13)
    NAME       'c'           (1, 14) (1, 15)
    OP         '='           (1, 15) (1, 16)
    NUMBER     '2'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    NAME       'd'           (1, 19) (1, 20)
    OP         '='           (1, 20) (1, 21)
    NUMBER     '2'           (1, 21) (1, 22)
    OP         ','           (1, 22) (1, 23)
    OP         '*'           (1, 24) (1, 25)
    NAME       'k'           (1, 25) (1, 26)
    OP         ')'           (1, 26) (1, 27)
    OP         ':'           (1, 27) (1, 28)
    NAME       'pass'        (1, 29) (1, 33)
    >>> dump_tokens("def d01v_(a=1, *k, **w): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd01v_'       (1, 4) (1, 9)
    OP         '('           (1, 9) (1, 10)
    NAME       'a'           (1, 10) (1, 11)
    OP         '='           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         ','           (1, 13) (1, 14)
    OP         '*'           (1, 15) (1, 16)
    NAME       'k'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    OP         '**'          (1, 19) (1, 21)
    NAME       'w'           (1, 21) (1, 22)
    OP         ')'           (1, 22) (1, 23)
    OP         ':'           (1, 23) (1, 24)
    NAME       'pass'        (1, 25) (1, 29)

Comparison

    >>> dump_tokens("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != " +
    ...             "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'if'          (1, 0) (1, 2)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '<'           (1, 5) (1, 6)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '>'           (1, 9) (1, 10)
    NUMBER     '1'           (1, 11) (1, 12)
    OP         '=='          (1, 13) (1, 15)
    NUMBER     '1'           (1, 16) (1, 17)
    OP         '>='          (1, 18) (1, 20)
    NUMBER     '5'           (1, 21) (1, 22)
    OP         '<='          (1, 23) (1, 25)
    NUMBER     '0x15'        (1, 26) (1, 30)
    OP         '<='          (1, 31) (1, 33)
    NUMBER     '0x12'        (1, 34) (1, 38)
    OP         '!='          (1, 39) (1, 41)
    NUMBER     '1'           (1, 42) (1, 43)
    NAME       'and'         (1, 44) (1, 47)
    NUMBER     '5'           (1, 48) (1, 49)
    NAME       'in'          (1, 50) (1, 52)
    NUMBER     '1'           (1, 53) (1, 54)
    NAME       'not'         (1, 55) (1, 58)
    NAME       'in'          (1, 59) (1, 61)
    NUMBER     '1'           (1, 62) (1, 63)
    NAME       'is'          (1, 64) (1, 66)
    NUMBER     '1'           (1, 67) (1, 68)
    NAME       'or'          (1, 69) (1, 71)
    NUMBER     '5'           (1, 72) (1, 73)
    NAME       'is'          (1, 74) (1, 76)
    NAME       'not'         (1, 77) (1, 80)
    NUMBER     '1'           (1, 81) (1, 82)
    OP         ':'           (1, 82) (1, 83)
    NAME       'pass'        (1, 84) (1, 88)

Shift

    >>> dump_tokens("x = 1 << 1 >> 5")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '<<'          (1, 6) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '>>'          (1, 11) (1, 13)
    NUMBER     '5'           (1, 14) (1, 15)

Additive

    >>> dump_tokens("x = 1 - y + 15 - 1 + 0x124 + z + a[5]")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '-'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    NUMBER     '15'          (1, 12) (1, 14)
    OP         '-'           (1, 15) (1, 16)
    NUMBER     '1'           (1, 17) (1, 18)
    OP         '+'           (1, 19) (1, 20)
    NUMBER     '0x124'       (1, 21) (1, 26)
    OP         '+'           (1, 27) (1, 28)
    NAME       'z'           (1, 29) (1, 30)
    OP         '+'           (1, 31) (1, 32)
    NAME       'a'           (1, 33) (1, 34)
    OP         '['           (1, 34) (1, 35)
    NUMBER     '5'           (1, 35) (1, 36)
    OP         ']'           (1, 36) (1, 37)

Multiplicative

    >>> dump_tokens("x = 1//1*1/5*12%0x12")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '//'          (1, 5) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '/'           (1, 10) (1, 11)
    NUMBER     '5'           (1, 11) (1, 12)
    OP         '*'           (1, 12) (1, 13)
    NUMBER     '12'          (1, 13) (1, 15)
    OP         '%'           (1, 15) (1, 16)
    NUMBER     '0x12'        (1, 16) (1, 20)

Unary

    >>> dump_tokens("~1 ^ 1 & 1 |1 ^ -1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '~'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '^'           (1, 3) (1, 4)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '&'           (1, 7) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '|'           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '^'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    NUMBER     '1'           (1, 17) (1, 18)
    >>> dump_tokens("-1*1/1+1*1//1 - ---1**1")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '-'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '*'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '/'           (1, 4) (1, 5)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '+'           (1, 6) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '//'          (1, 10) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    OP         '-'           (1, 17) (1, 18)
    OP         '-'           (1, 18) (1, 19)
    NUMBER     '1'           (1, 19) (1, 20)
    OP         '**'          (1, 20) (1, 22)
    NUMBER     '1'           (1, 22) (1, 23)

Selector

    >>> dump_tokens("import sys, time\\nx = sys.modules['time'].time()")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'import'      (1, 0) (1, 6)
    NAME       'sys'         (1, 7) (1, 10)
    OP         ','           (1, 10) (1, 11)
    NAME       'time'        (1, 12) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'x'           (2, 0) (2, 1)
    OP         '='           (2, 2) (2, 3)
    NAME       'sys'         (2, 4) (2, 7)
    OP         '.'           (2, 7) (2, 8)
    NAME       'modules'     (2, 8) (2, 15)
    OP         '['           (2, 15) (2, 16)
    STRING     "'time'"      (2, 16) (2, 22)
    OP         ']'           (2, 22) (2, 23)
    OP         '.'           (2, 23) (2, 24)
    NAME       'time'        (2, 24) (2, 28)
    OP         '('           (2, 28) (2, 29)
    OP         ')'           (2, 29) (2, 30)

Methods

    >>> dump_tokens("@staticmethod\\ndef foo(x,y): pass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    OP         '@'           (1, 0) (1, 1)
    NAME       'staticmethod (1, 1) (1, 13)
    NEWLINE    '\\n'          (1, 13) (1, 14)
    NAME       'def'         (2, 0) (2, 3)
    NAME       'foo'         (2, 4) (2, 7)
    OP         '('           (2, 7) (2, 8)
    NAME       'x'           (2, 8) (2, 9)
    OP         ','           (2, 9) (2, 10)
    NAME       'y'           (2, 10) (2, 11)
    OP         ')'           (2, 11) (2, 12)
    OP         ':'           (2, 12) (2, 13)
    NAME       'pass'        (2, 14) (2, 18)

Backslash means line continuation, except for comments

    >>> roundtrip("x=1+\\\\n"
    ...           "1\\n"
    ...           "# This is a comment\\\\n"
    ...           "# This also\\n")
    True
    >>> roundtrip("# Comment \\\\nx = 0")
    True

Two string literals on the same line

    >>> roundtrip("'' ''")
    True

Test roundtrip on random Python modules.
Pass the '-ucpu' option to process the full directory.

    >>> import random
    >>> tempdir = os.path.dirname(f) or os.curdir
    >>> testfiles = glob.glob(os.path.join(tempdir, "test*.py"))

tokenize is broken on test_pep3131.py because regular expressions are broken on
the obscure unicode identifiers in it. *sigh*
    >>> testfiles.remove(os.path.join(tempdir, "test_pep3131.py"))
    >>> if not support.is_resource_enabled("cpu"):
    ...     testfiles = random.sample(testfiles, 10)
    ...
    >>> for testfile in testfiles:
    ...     if not roundtrip(open(testfile, 'rb')):
    ...         print("Roundtrip failed for file %s" % testfile)
    ...         break
    ... else: True
    True

Evil tabs

    >>> dump_tokens("def f():\\n\\tif x\\n        \\tpass")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'def'         (1, 0) (1, 3)
    NAME       'f'           (1, 4) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)
    OP         ':'           (1, 7) (1, 8)
    NEWLINE    '\\n'          (1, 8) (1, 9)
    INDENT     '\\t'          (2, 0) (2, 1)
    NAME       'if'          (2, 1) (2, 3)
    NAME       'x'           (2, 4) (2, 5)
    NEWLINE    '\\n'          (2, 5) (2, 6)
    INDENT     '        \\t'  (3, 0) (3, 9)
    NAME       'pass'        (3, 9) (3, 13)
    DEDENT     ''            (4, 0) (4, 0)
    DEDENT     ''            (4, 0) (4, 0)

Non-ascii identifiers

    >>> dump_tokens("Örter = 'places'\\ngrün = 'green'")
    ENCODING   'utf-8'       (0, 0) (0, 0)
    NAME       'Örter'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    STRING     "'places'"    (1, 8) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'grün'        (2, 0) (2, 4)
    OP         '='           (2, 5) (2, 6)
    STRING     "'green'"     (2, 7) (2, 14)
"""

from test import support
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                     STRING, ENDMARKER, tok_name, detect_encoding,
                     open as tokenize_open)
from io import BytesIO
from unittest import TestCase
import os, sys, glob

def dump_tokens(s):
    """Print out the tokens in s in a table format.

    The ENDMARKER is omitted.
    """
    f = BytesIO(s.encode('utf-8'))
    for type, token, start, end, line in tokenize(f.readline):
        if type == ENDMARKER:
            break
        type = tok_name[type]
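        # Note: the "%-13.13r" format below truncates long token reprs to 13
        # characters, which is why some expected outputs in the doctests above
        # show cut-off strings such as '0xffffffffff with no closing quote.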
        print("%(type)-10.10s %(token)-13.13r %(start)s %(end)s" % locals())

def roundtrip(f):
    """
    Test roundtrip for `untokenize`. `f` is an open file or a string.
    The source code in f is tokenized, converted back to source code via
    tokenize.untokenize(), and tokenized again from the latter. The test
    fails if the second tokenization doesn't match the first.
    """
    if isinstance(f, str):
        f = BytesIO(f.encode('utf-8'))
    try:
        token_list = list(tokenize(f.readline))
    finally:
        f.close()
    tokens1 = [tok[:2] for tok in token_list]
    new_bytes = untokenize(tokens1)
    readline = (line for line in new_bytes.splitlines(1)).__next__
    tokens2 = [tok[:2] for tok in tokenize(readline)]
    return tokens1 == tokens2
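
# A minimal usage sketch for roundtrip() (illustration only, not part of the
# test suite). Because the lists compared above keep only the (type, string)
# pairs, untokenize() is free to regenerate different spacing, as long as the
# result tokenizes back to the same stream:
#
#     roundtrip("x = 1 + 2\n")                 # expected to return True
#     roundtrip(open(some_source_path, 'rb'))  # some_source_path is hypothetical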

# This is an example from the docs, set up as a doctest.
def decistmt(s):
    """Substitute Decimals for floats in a string of statements.

    >>> from decimal import Decimal
    >>> s = 'print(+21.3e-5*-.1234/81.7)'
    >>> decistmt(s)
    "print (+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7'))"

    The format of the exponent is inherited from the platform C library.
    Known cases are "e-007" (Windows) and "e-07" (not Windows).  Since
    we're only showing 11 digits, and the 12th isn't close to 5, the
    rest of the output should be platform-independent.

    >>> exec(s) #doctest: +ELLIPSIS
    -3.2171603427...e-0...7

    Output from calculations with Decimal should be identical across all
    platforms.

    >>> exec(decistmt(s))
    -3.217160342717258261933904529E-7
    """
    result = []
    g = tokenize(BytesIO(s.encode('utf-8')).readline)   # tokenize the string
    for toknum, tokval, _, _, _  in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result).decode('utf-8')


class TestTokenizerAdheresToPep0263(TestCase):
    """
    Test that the tokenizer adheres to the coding behaviour stipulated in PEP 0263.
    """

    def _testFile(self, filename):
        path = os.path.join(os.path.dirname(__file__), filename)
        return roundtrip(open(path, 'rb'))

    def test_utf8_coding_cookie_and_no_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
        self.assertTrue(self._testFile(f))

    def test_latin1_coding_cookie_and_utf8_bom(self):
        """
        As per PEP 0263, if a file starts with a utf-8 BOM signature, the only
        allowed encoding for the comment is 'utf-8'.  The text file used in
        this test starts with a BOM signature, but specifies latin1 as the
        coding, so verify that a SyntaxError is raised, which matches the
        behaviour of the interpreter when it encounters a similar condition.
        """
        f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt'
        self.assertRaises(SyntaxError, self._testFile, f)
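
    # The same rule can be seen directly with detect_encoding(): a UTF-8 BOM
    # combined with any cookie other than utf-8 makes it raise SyntaxError.
    # A sketch with a hypothetical in-memory line (not one of the data files):
    #
    #     readline = iter([b'\xef\xbb\xbf# coding: latin-1\n', b'']).__next__
    #     detect_encoding(readline)   # raises SyntaxError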

    def test_no_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt'
        self.assertTrue(self._testFile(f))

    def test_utf8_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt'
        self.assertTrue(self._testFile(f))


class Test_Tokenize(TestCase):

    def test__tokenize_decodes_with_specified_encoding(self):
        literal = '"ЉЊЈЁЂ"'
        line = literal.encode('utf-8')
        first = False
        def readline():
            nonlocal first
            if not first:
                first = True
                return line
            else:
                return b''

        # skip the initial encoding token and the end token
        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEqual(tokens, expected_tokens,
                         "bytes not decoded with encoding")

    def test__tokenize_does_not_decode_with_encoding_none(self):
        literal = '"ЉЊЈЁЂ"'
        first = False
        def readline():
            nonlocal first
            if not first:
                first = True
                return literal
            else:
                return b''

        # skip the end token
        tokens = list(_tokenize(readline, encoding=None))[:-1]
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEqual(tokens, expected_tokens,
                         "string not tokenized when encoding is None")


class TestDetectEncoding(TestCase):

    def get_readline(self, lines):
        index = 0
        def readline():
            nonlocal index
            if index == len(lines):
                raise StopIteration
            line = lines[index]
            index += 1
            return line
        return readline

    def test_no_bom_no_encoding_cookie(self):
        lines = (
            b'# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, list(lines[:2]))

    def test_bom_no_cookie(self):
        lines = (
            b'\xef\xbb\xbf# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'# something\n', b'print(something)\n'])

    def test_cookie_first_line_no_bom(self):
        lines = (
            b'# -*- coding: latin-1 -*-\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso-8859-1')
        self.assertEqual(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])

    def test_matched_bom_and_cookie_first_line(self):
        lines = (
            b'\xef\xbb\xbf# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [b'# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_cookie_second_line_no_bom(self):
        lines = (
            b'#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'ascii')
        expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n']
        self.assertEqual(consumed_lines, expected)

    def test_matched_bom_and_cookie_second_line(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'f# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'#! something\n', b'f# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_latin1_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
                     "iso-8859-1-unix", "iso-latin-1-mac")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"print(things)\n",
                         b"do_something += 4\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "iso-8859-1")

    def test_utf8_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"1 + 3\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "utf-8")

    def test_short_files(self):
        readline = self.get_readline((b'print(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        encoding, consumed_lines = detect_encoding(self.get_readline(()))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [])

        readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        readline = self.get_readline((b'\xef\xbb\xbf',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [])

        readline = self.get_readline((b'# coding: bad\n',))
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_open(self):
        filename = support.TESTFN + '.py'
        self.addCleanup(support.unlink, filename)

        # test coding cookie
        for encoding in ('iso-8859-15', 'utf-8'):
            with open(filename, 'w', encoding=encoding) as fp:
                print("# coding: %s" % encoding, file=fp)
                print("print('euro:\u20ac')", file=fp)
            with tokenize_open(filename) as fp:
                self.assertEqual(fp.encoding, encoding)
                self.assertEqual(fp.mode, 'r')

        # test BOM (no coding cookie)
        with open(filename, 'w', encoding='utf-8-sig') as fp:
            print("print('euro:\u20ac')", file=fp)
        with tokenize_open(filename) as fp:
            self.assertEqual(fp.encoding, 'utf-8-sig')
            self.assertEqual(fp.mode, 'r')

class TestTokenize(TestCase):

    def test_tokenize(self):
        import tokenize as tokenize_module
        encoding = object()
        encoding_used = None
        def mock_detect_encoding(readline):
            return encoding, ['first', 'second']

        def mock__tokenize(readline, encoding):
            nonlocal encoding_used
            encoding_used = encoding
            out = []
            while True:
                next_line = readline()
                if next_line:
                    out.append(next_line)
                    continue
                return out

        counter = 0
        def mock_readline():
            nonlocal counter
            counter += 1
            if counter == 5:
                return b''
            return counter

        orig_detect_encoding = tokenize_module.detect_encoding
        orig__tokenize = tokenize_module._tokenize
        tokenize_module.detect_encoding = mock_detect_encoding
        tokenize_module._tokenize = mock__tokenize
        try:
            results = tokenize(mock_readline)
            self.assertEqual(list(results), ['first', 'second', 1, 2, 3, 4])
        finally:
            tokenize_module.detect_encoding = orig_detect_encoding
            tokenize_module._tokenize = orig__tokenize

        self.assertEqual(encoding_used, encoding)


__test__ = {"doctests" : doctests, 'decistmt': decistmt}
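
# support.run_doctest() delegates to doctest.testmod(), which collects the
# entries registered in __test__ (the module-level doctest string and
# decistmt's docstring) in addition to ordinary docstrings in this module.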

def test_main():
    from test import test_tokenize
    support.run_doctest(test_tokenize, True)
    support.run_unittest(TestTokenizerAdheresToPep0263)
    support.run_unittest(Test_Tokenize)
    support.run_unittest(TestDetectEncoding)
    support.run_unittest(TestTokenize)

if __name__ == "__main__":
    test_main()