Back to index

python3.2  3.2.2
shlex.py
Go to the documentation of this file.
00001 """A lexical analyzer class for simple shell-like syntaxes."""
00002 
00003 # Module and documentation by Eric S. Raymond, 21 Dec 1998
00004 # Input stacking and error message cleanup added by ESR, March 2000
00005 # push_source() and pop_source() made explicit by ESR, January 2001.
00006 # Posix compliance, split(), string arguments, and
00007 # iterator interface by Gustavo Niemeyer, April 2003.
00008 
00009 import os.path
00010 import sys
00011 from collections import deque
00012 
00013 from io import StringIO
00014 
00015 __all__ = ["shlex", "split"]
00016 
class shlex:
    """A lexical analyzer class for simple shell-like syntaxes.

    The lexer reads one character at a time from ``instream`` and groups
    characters into tokens according to the public, user-tunable attributes
    set up in ``__init__`` (``commenters``, ``wordchars``, ``whitespace``,
    ``quotes``, ``escape``, ``escapedquotes``, ``whitespace_split``).

    Instances are iterators: iteration yields tokens until ``get_token()``
    returns a value equal to ``self.eof``.
    """

    def __init__(self, instream=None, infile=None, posix=False):
        """Set up the lexer.

        instream -- a string (wrapped in a StringIO) or an object providing
                    read(), readline() and close(); when None, sys.stdin is
                    read and ``infile`` is ignored.
        infile   -- name associated with ``instream``; used by
                    error_leader() and for relative inclusion in
                    sourcehook().
        posix    -- when true, use the POSIX rules implemented in
                    read_token(): quotes are stripped, escapes are honoured
                    and end of input is reported as None instead of ''.
        """
        if isinstance(instream, str):
            instream = StringIO(instream)
        if instream is not None:
            self.instream = instream
            self.infile = infile
        else:
            self.instream = sys.stdin
            self.infile = None
        self.posix = posix
        # Value returned by get_token() at end of input.  POSIX mode needs
        # None because '' is a legitimate token there (an empty quoted
        # string).
        if posix:
            self.eof = None
        else:
            self.eof = ''
        self.commenters = '#'
        self.wordchars = ('abcdfeghijklmnopqrstuvwxyz'
                          'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_')
        if self.posix:
            self.wordchars += ('ßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ'
                               'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ')
        self.whitespace = ' \t\r\n'
        self.whitespace_split = False
        self.quotes = '\'"'
        self.escape = '\\'
        self.escapedquotes = '"'
        # Scanner state, see read_token(): ' ' between tokens, 'a' inside a
        # word, a quote character inside quotes, an escape character right
        # after an escape, None once end of input has been seen.
        self.state = ' '
        self.pushback = deque()
        self.lineno = 1
        self.debug = 0
        self.token = ''
        # Stack of suspended (infile, instream, lineno) triples.
        self.filestack = deque()
        # Token that triggers inclusion of another file; None disables it.
        self.source = None

    def push_token(self, tok):
        "Push a token onto the stack popped by the get_token method"
        if self.debug >= 1:
            print("shlex: pushing token " + repr(tok))
        self.pushback.appendleft(tok)

    def push_source(self, newstream, newfile=None):
        """Push an input source onto the lexer's input source stack.

        newstream -- a string (wrapped in a StringIO) or a stream object.
        newfile   -- name recorded as ``infile`` while the source is active.

        The current (infile, instream, lineno) triple is saved on
        ``filestack`` and line counting restarts at 1 for the new source.
        """
        if isinstance(newstream, str):
            newstream = StringIO(newstream)
        self.filestack.appendleft((self.infile, self.instream, self.lineno))
        self.infile = newfile
        self.instream = newstream
        self.lineno = 1
        if self.debug:
            if newfile is not None:
                print('shlex: pushing to file %s' % (self.infile,))
            else:
                print('shlex: pushing to stream %s' % (self.instream,))

    def pop_source(self):
        """Pop the input source stack.

        Closes the current stream, restores the most recently saved
        (infile, instream, lineno) triple and resets the scanner state to
        "between tokens".
        """
        self.instream.close()
        (self.infile, self.instream, self.lineno) = self.filestack.popleft()
        if self.debug:
            print('shlex: popping to %s, line %d' \
                  % (self.instream, self.lineno))
        self.state = ' '

    def get_token(self):
        """Get a token from the input stream (or from stack if it's nonempty).

        Pushed-back tokens are returned first.  Otherwise a raw token is
        read; if it equals ``self.source`` the following token is passed to
        sourcehook() and the resulting stream is pushed.  When a source is
        exhausted and ``filestack`` is not empty, the previous source is
        resumed.  Returns ``self.eof`` once all input is consumed.
        """
        if self.pushback:
            tok = self.pushback.popleft()
            if self.debug >= 1:
                print("shlex: popping token " + repr(tok))
            return tok
        # No pushback.  Get a token.
        raw = self.read_token()
        # Handle inclusions
        if self.source is not None:
            while raw == self.source:
                spec = self.sourcehook(self.read_token())
                if spec:
                    (newfile, newstream) = spec
                    self.push_source(newstream, newfile)
                raw = self.get_token()
        # Maybe we got EOF instead?
        while raw == self.eof:
            if not self.filestack:
                return self.eof
            else:
                self.pop_source()
                raw = self.get_token()
        # Neither inclusion nor EOF
        if self.debug >= 1:
            if raw != self.eof:
                print("shlex: token=" + repr(raw))
            else:
                print("shlex: token=EOF")
        return raw

    def read_token(self):
        """Read one raw token from the current stream.

        This is a character-driven state machine keyed on ``self.state``
        (see __init__ for the state values).  Pushback, inclusion and the
        source stack are not consulted here; that is get_token()'s job.

        Returns the accumulated token.  In POSIX mode an empty result that
        did not come from a quoted string is returned as None.

        Raises ValueError if input ends inside a quoted string or directly
        after an escape character.
        """
        # True once a quoted section has been seen; lets POSIX mode emit ''
        # for an empty quoted string instead of treating it as "no token".
        quoted = False
        # State to return to after the character following an escape.
        escapedstate = ' '
        while True:
            nextchar = self.instream.read(1)
            if nextchar == '\n':
                self.lineno += 1
            if self.debug >= 3:
                print("shlex: in state", repr(self.state), \
                      "I see character:", repr(nextchar))
            if self.state is None:
                self.token = ''        # past end of file
                break
            elif self.state == ' ':
                if not nextchar:
                    self.state = None  # end of file
                    break
                elif nextchar in self.whitespace:
                    if self.debug >= 2:
                        print("shlex: I see whitespace in whitespace state")
                    if self.token or (self.posix and quoted):
                        break   # emit current token
                    else:
                        continue
                elif nextchar in self.commenters:
                    # Discard the rest of the line, newline included.
                    self.instream.readline()
                    self.lineno += 1
                elif self.posix and nextchar in self.escape:
                    escapedstate = 'a'
                    self.state = nextchar
                elif nextchar in self.wordchars:
                    self.token = nextchar
                    self.state = 'a'
                elif nextchar in self.quotes:
                    # Non-POSIX mode keeps the quote characters in the token.
                    if not self.posix:
                        self.token = nextchar
                    self.state = nextchar
                elif self.whitespace_split:
                    self.token = nextchar
                    self.state = 'a'
                else:
                    # A lone punctuation character is a token by itself.
                    # nextchar is known to be non-empty here, so the token
                    # is always emitted.
                    self.token = nextchar
                    break
            elif self.state in self.quotes:
                quoted = True
                if not nextchar:      # end of file
                    if self.debug >= 2:
                        print("shlex: I see EOF in quotes state")
                    # XXX what error should be raised here?
                    raise ValueError("No closing quotation")
                if nextchar == self.state:
                    if not self.posix:
                        self.token += nextchar
                        self.state = ' '
                        break
                    else:
                        # POSIX: the closing quote does not end the word.
                        self.state = 'a'
                elif self.posix and nextchar in self.escape and \
                     self.state in self.escapedquotes:
                    escapedstate = self.state
                    self.state = nextchar
                else:
                    self.token += nextchar
            elif self.state in self.escape:
                if not nextchar:      # end of file
                    if self.debug >= 2:
                        print("shlex: I see EOF in escape state")
                    # XXX what error should be raised here?
                    raise ValueError("No escaped character")
                # In posix shells, only the quote itself or the escape
                # character may be escaped within quotes.
                if escapedstate in self.quotes and \
                   nextchar != self.state and nextchar != escapedstate:
                    self.token += self.state
                self.token += nextchar
                self.state = escapedstate
            elif self.state == 'a':
                if not nextchar:
                    self.state = None   # end of file
                    break
                elif nextchar in self.whitespace:
                    if self.debug >= 2:
                        print("shlex: I see whitespace in word state")
                    self.state = ' '
                    if self.token or (self.posix and quoted):
                        break   # emit current token
                    else:
                        continue
                elif nextchar in self.commenters:
                    self.instream.readline()
                    self.lineno += 1
                    if self.posix:
                        self.state = ' '
                        if self.token or (self.posix and quoted):
                            break   # emit current token
                        else:
                            continue
                elif self.posix and nextchar in self.quotes:
                    self.state = nextchar
                elif self.posix and nextchar in self.escape:
                    escapedstate = 'a'
                    self.state = nextchar
                elif nextchar in self.wordchars or nextchar in self.quotes \
                    or self.whitespace_split:
                    self.token += nextchar
                else:
                    # Punctuation ends the word; hand the character back so
                    # the next get_token() call returns it as its own token.
                    self.pushback.appendleft(nextchar)
                    if self.debug >= 2:
                        print("shlex: I see punctuation in word state")
                    self.state = ' '
                    # An empty quoted string in POSIX mode is a real token
                    # and must be emitted *before* the pushed-back
                    # punctuation, exactly as the whitespace and comment
                    # branches above do.
                    if self.token or (self.posix and quoted):
                        break   # emit current token
                    else:
                        continue
        result = self.token
        self.token = ''
        if self.posix and not quoted and result == '':
            result = None
        if self.debug > 1:
            if result:
                print("shlex: raw token=" + repr(result))
            else:
                print("shlex: raw token=EOF")
        return result

    def sourcehook(self, newfile):
        """Hook called on a filename to be sourced.

        Strips one pair of surrounding characters when ``newfile`` starts
        with a double quote, resolves a relative name against the directory
        of the current ``infile`` (when that is a string) and returns a
        ``(filename, open_file)`` tuple.  The caller owns the opened file;
        pop_source() closes it when the source is exhausted.
        """
        if newfile[0] == '"':
            newfile = newfile[1:-1]
        # This implements cpp-like semantics for relative-path inclusion.
        if isinstance(self.infile, str) and not os.path.isabs(newfile):
            newfile = os.path.join(os.path.dirname(self.infile), newfile)
        return (newfile, open(newfile, "r"))

    def error_leader(self, infile=None, lineno=None):
        """Emit a C-compiler-like, Emacs-friendly error-message leader.

        ``infile`` and ``lineno`` default to the lexer's current values.
        """
        if infile is None:
            infile = self.infile
        if lineno is None:
            lineno = self.lineno
        return "\"%s\", line %d: " % (infile, lineno)

    def __iter__(self):
        """Return the lexer itself; it is its own iterator."""
        return self

    def __next__(self):
        """Return the next token, raising StopIteration at ``self.eof``."""
        token = self.get_token()
        if token == self.eof:
            raise StopIteration
        return token
00269 
def split(s, comments=False, posix=True):
    """Split the string *s* into a list of tokens using shell-like syntax.

    s        -- the text to tokenize.
    comments -- when false (the default) no character starts a comment;
                when true the lexer's default comment characters apply.
    posix    -- passed through to the lexer to select POSIX rules.
    """
    tokenizer = shlex(s, posix=posix)
    tokenizer.whitespace_split = True
    if comments:
        return list(tokenizer)
    # An empty commenter set disables comment recognition entirely.
    tokenizer.commenters = ''
    return list(tokenizer)
00276 
def _print_tokens(lexer):
    """Print each token from *lexer* until get_token() returns a falsy value."""
    while True:
        tt = lexer.get_token()
        if not tt:
            break
        print("Token: " + repr(tt))


# Demo driver: tokenize standard input, or the file named by the first
# command-line argument, printing one token per line.
if __name__ == '__main__':
    if len(sys.argv) == 1:
        _print_tokens(shlex())
    else:
        fn = sys.argv[1]
        # Close the input file deterministically once lexing is finished.
        with open(fn) as f:
            _print_tokens(shlex(f, fn))