
moin 1.9.0~rc2
lexer.py
00001 # -*- coding: utf-8 -*-
00002 """
00003     pygments.lexer
00004     ~~~~~~~~~~~~~~
00005 
00006     Base lexer classes.
00007 
00008     :copyright: Copyright 2006-2009 by the Pygments team, see AUTHORS.
00009     :license: BSD, see LICENSE for details.
00010 """
00011 import re
00012 
00013 try:
00014     set
00015 except NameError:
00016     from sets import Set as set
00017 
00018 from pygments.filter import apply_filters, Filter
00019 from pygments.filters import get_filter_by_name
00020 from pygments.token import Error, Text, Other, _TokenType
00021 from pygments.util import get_bool_opt, get_int_opt, get_list_opt, \
00022      make_analysator
00023 
00024 
00025 __all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer',
00026            'LexerContext', 'include', 'flags', 'bygroups', 'using', 'this']
00027 
00028 
00029 _default_analyse = staticmethod(lambda x: 0.0)
00030 
00031 
00032 class LexerMeta(type):
00033     """
00034     This metaclass automagically converts ``analyse_text`` methods into
00035     static methods which always return float values.
00036     """
00037 
00038     def __new__(cls, name, bases, d):
00039         if 'analyse_text' in d:
00040             d['analyse_text'] = make_analysator(d['analyse_text'])
00041         return type.__new__(cls, name, bases, d)
00042 
00043 
00044 class Lexer(object):
00045     """
00046     Lexer for a specific language.
00047 
00048     Basic options recognized:
00049     ``stripnl``
00050         Strip leading and trailing newlines from the input (default: True).
00051     ``stripall``
00052         Strip all leading and trailing whitespace from the input
00053         (default: False).
00054     ``tabsize``
00055         If given and greater than 0, expand tabs in the input (default: 0).
00056     ``encoding``
00057         If given, must be an encoding name. This encoding will be used to
00058         convert the input string to Unicode, if it is not already a Unicode
00059         string (default: ``'latin1'``).
00060         Can also be ``'guess'`` to use a simple UTF-8 / Latin1 detection, or
00061         ``'chardet'`` to use the chardet library, if it is installed.
00062     """
00063 
00064     #: Name of the lexer
00065     name = None
00066 
00067     #: Shortcuts for the lexer
00068     aliases = []
00069 
00070     #: Filename patterns (fnmatch globs) matched by this lexer
00071     filenames = []
00072 
00073     #: Filename patterns for which this lexer is only a secondary choice
00074     alias_filenames = []
00075 
00076     #: MIME types
00077     mimetypes = []
00078 
00079     __metaclass__ = LexerMeta
00080 
00081     def __init__(self, **options):
00082         self.options = options
00083         self.stripnl = get_bool_opt(options, 'stripnl', True)
00084         self.stripall = get_bool_opt(options, 'stripall', False)
00085         self.tabsize = get_int_opt(options, 'tabsize', 0)
00086         self.encoding = options.get('encoding', 'latin1')
00087         # self.encoding = options.get('inencoding', None) or self.encoding
00088         self.filters = []
00089         for filter_ in get_list_opt(options, 'filters', ()):
00090             self.add_filter(filter_)
00091 
00092     def __repr__(self):
00093         if self.options:
00094             return '<pygments.lexers.%s with %r>' % (self.__class__.__name__,
00095                                                      self.options)
00096         else:
00097             return '<pygments.lexers.%s>' % self.__class__.__name__
00098 
00099     def add_filter(self, filter_, **options):
00100         """
00101         Add a new stream filter to this lexer.
00102         """
00103         if not isinstance(filter_, Filter):
00104             filter_ = get_filter_by_name(filter_, **options)
00105         self.filters.append(filter_)
00106 
00107     def analyse_text(text):
00108         """
00109         Has to return a float between ``0`` and ``1`` that indicates
00110         whether a lexer wants to highlight this text. Used by ``guess_lexer``.
00111         If this method returns ``0`` the lexer won't highlight the text at
00112         all; if it returns ``1``, highlighting with this lexer is guaranteed.
00113 
00114         The `LexerMeta` metaclass automatically wraps this function so
00115         that it works like a static method (no ``self`` or ``cls``
00116         parameter) and the return value is automatically converted to
00117         `float`. If the return value is an object that is boolean `False`
00118         it's the same as if the return value was ``0.0``.
00119         """
00120 
00121     def get_tokens(self, text, unfiltered=False):
00122         """
00123         Return an iterable of (tokentype, value) pairs generated from
00124         `text`. If `unfiltered` is set to `True`, the filtering mechanism
00125         is bypassed even if filters are defined.
00126 
00127         Also preprocesses the text, i.e. expands tabs and strips it if
00128         wanted, and applies registered filters.
00129         """
00130         if not isinstance(text, unicode):
00131             if self.encoding == 'guess':
00132                 try:
00133                     text = text.decode('utf-8')
00134                     if text.startswith(u'\ufeff'):
00135                         text = text[len(u'\ufeff'):]
00136                 except UnicodeDecodeError:
00137                     text = text.decode('latin1')
00138             elif self.encoding == 'chardet':
00139                 try:
00140                     import chardet
00141                 except ImportError:
00142                     raise ImportError('To enable chardet encoding guessing, '
00143                                       'please install the chardet library '
00144                                       'from http://chardet.feedparser.org/')
00145                 enc = chardet.detect(text)
00146                 text = text.decode(enc['encoding'])
00147             else:
00148                 text = text.decode(self.encoding)
00149         # text now *is* a unicode string
00150         text = text.replace('\r\n', '\n')
00151         text = text.replace('\r', '\n')
00152         if self.stripall:
00153             text = text.strip()
00154         elif self.stripnl:
00155             text = text.strip('\n')
00156         if self.tabsize > 0:
00157             text = text.expandtabs(self.tabsize)
00158         if not text.endswith('\n'):
00159             text += '\n'
00160 
00161         def streamer():
00162             for i, t, v in self.get_tokens_unprocessed(text):
00163                 yield t, v
00164         stream = streamer()
00165         if not unfiltered:
00166             stream = apply_filters(stream, self.filters, self)
00167         return stream
00168 
00169     def get_tokens_unprocessed(self, text):
00170         """
00171         Return an iterable of (index, tokentype, value) tuples.
00172         In subclasses, implement this method as a generator to
00173         maximize effectiveness.
00174         """
00175         raise NotImplementedError
00176 
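The options and hooks documented on ``Lexer`` above can be exercised with a minimal subclass. Everything below is an editorial sketch, not part of this file: the lexer name, the single-token ``get_tokens_unprocessed``, and the use of the stock ``'keywordcase'`` filter are illustrative assumptions.

from pygments.lexer import Lexer
from pygments.token import Text

class WholeTextLexer(Lexer):
    """Hypothetical lexer that emits its whole input as one Text token."""
    name = 'WholeText'
    aliases = ['wholetext']
    filenames = ['*.txt']

    def analyse_text(text):
        # LexerMeta wraps this into a static method returning a float
        return 0.1

    def get_tokens_unprocessed(self, text):
        # (index, tokentype, value) tuples, as get_tokens() expects
        yield 0, Text, text

lx = WholeTextLexer(stripall=True, tabsize=4, encoding='utf-8')
lx.add_filter('keywordcase', case='upper')   # resolved via get_filter_by_name
for tokentype, value in lx.get_tokens('  hello\tworld  '):
    pass   # each item is a (tokentype, unicode value) pair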
00177 
00178 class DelegatingLexer(Lexer):
00179     """
00180     This lexer takes two lexers as arguments: a root lexer and
00181     a language lexer. First, everything is scanned using the language
00182     lexer; afterwards, all ``Other`` tokens are lexed using the root
00183     lexer.
00184 
00185     The lexers from the ``template`` lexer package use this base lexer.
00186     """
00187 
00188     def __init__(self, _root_lexer, _language_lexer, _needle=Other, **options):
00189         self.root_lexer = _root_lexer(**options)
00190         self.language_lexer = _language_lexer(**options)
00191         self.needle = _needle
00192         Lexer.__init__(self, **options)
00193 
00194     def get_tokens_unprocessed(self, text):
00195         buffered = ''
00196         insertions = []
00197         lng_buffer = []
00198         for i, t, v in self.language_lexer.get_tokens_unprocessed(text):
00199             if t is self.needle:
00200                 if lng_buffer:
00201                     insertions.append((len(buffered), lng_buffer))
00202                     lng_buffer = []
00203                 buffered += v
00204             else:
00205                 lng_buffer.append((i, t, v))
00206         if lng_buffer:
00207             insertions.append((len(buffered), lng_buffer))
00208         return do_insertions(insertions,
00209                              self.root_lexer.get_tokens_unprocessed(buffered))
00210 
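A hedged sketch of the delegation pattern described above. ``HtmlLexer`` and ``ErbLexer`` are existing Pygments lexers (importing them through the ``pygments.lexers`` package is assumed to work here); the subclass itself is invented for illustration.

from pygments.lexer import DelegatingLexer
from pygments.lexers import HtmlLexer, ErbLexer

class ErbHtmlExampleLexer(DelegatingLexer):
    """Hypothetical: ERB templates whose non-ERB text is lexed as HTML."""
    name = 'ERB+HTML (example)'
    aliases = ['erb-html-example']

    def __init__(self, **options):
        # the language lexer (ErbLexer) runs first; everything it emits as
        # ``Other`` is buffered and re-lexed by the root lexer (HtmlLexer)
        DelegatingLexer.__init__(self, HtmlLexer, ErbLexer, **options)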
00211 
00212 #-------------------------------------------------------------------------------
00213 # RegexLexer and ExtendedRegexLexer
00214 #
00215 
00216 
00217 class include(str):
00218     """
00219     Indicates that a state should include rules from another state.
00220     """
00221     pass
00222 
00223 
00224 class combined(tuple):
00225     """
00226     Indicates a state combined from multiple states.
00227     """
00228 
00229     def __new__(cls, *args):
00230         return tuple.__new__(cls, args)
00231 
00232     def __init__(self, *args):
00233         # tuple.__init__ doesn't do anything
00234         pass
00235 
00236 
00237 class _PseudoMatch(object):
00238     """
00239     A pseudo match object constructed from a string.
00240     """
00241 
00242     def __init__(self, start, text):
00243         self._text = text
00244         self._start = start
00245 
00246     def start(self, arg=None):
00247         return self._start
00248 
00249     def end(self, arg=None):
00250         return self._start + len(self._text)
00251 
00252     def group(self, arg=None):
00253         if arg:
00254             raise IndexError('No such group')
00255         return self._text
00256 
00257     def groups(self):
00258         return (self._text,)
00259 
00260     def groupdict(self):
00261         return {}
00262 
00263 
00264 def bygroups(*args):
00265     """
00266     Callback that yields multiple actions for each group in the match.
00267     """
00268     def callback(lexer, match, ctx=None):
00269         for i, action in enumerate(args):
00270             if action is None:
00271                 continue
00272             elif type(action) is _TokenType:
00273                 data = match.group(i + 1)
00274                 if data:
00275                     yield match.start(i + 1), action, data
00276             else:
00277                 if ctx:
00278                     ctx.pos = match.start(i + 1)
00279                 for item in action(lexer, _PseudoMatch(match.start(i + 1),
00280                                    match.group(i + 1)), ctx):
00281                     if item:
00282                         yield item
00283         if ctx:
00284             ctx.pos = match.end()
00285     return callback
00286 
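For illustration, here is a rule of the kind ``bygroups`` is meant for, exercised directly. The regular expression and the token types chosen are assumptions for this sketch, not rules taken from a real lexer.

import re
from pygments.lexer import bygroups
from pygments.token import Keyword, Text, Name

# as it would appear inside a RegexLexer ``tokens`` dict:
rule = (r'(def)(\s+)([a-zA-Z_]\w*)', bygroups(Keyword, Text, Name.Function))

# calling the callback by hand (the lexer argument is unused here):
m = re.match(rule[0], 'def parse')
items = list(rule[1](None, m))
# -> [(0, Token.Keyword, 'def'), (3, Token.Text, ' '), (4, Token.Name.Function, 'parse')]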
00287 
00288 class _This(object):
00289     """
00290     Special singleton used for indicating the caller class.
00291     Used by ``using``.
00292     """
00293 this = _This()
00294 
00295 
00296 def using(_other, **kwargs):
00297     """
00298     Callback that processes the match with a different lexer.
00299 
00300     The keyword arguments are forwarded to the lexer, except `state` which
00301     is handled separately.
00302 
00303     `state` specifies the state that the new lexer will start in, and can
00304     be an iterable such as ('root', 'inline', 'string') or a simple
00305     string which is assumed to be on top of the root state.
00306 
00307     Note: For that to work, `_other` must not be an `ExtendedRegexLexer`.
00308     """
00309     gt_kwargs = {}
00310     if 'state' in kwargs:
00311         s = kwargs.pop('state')
00312         if isinstance(s, (list, tuple)):
00313             gt_kwargs['stack'] = s
00314         else:
00315             gt_kwargs['stack'] = ('root', s)
00316 
00317     if _other is this:
00318         def callback(lexer, match, ctx=None):
00319             # if keyword arguments are given the callback
00320             # function has to create a new lexer instance
00321             if kwargs:
00322                 # XXX: cache that somehow
00323                 kwargs.update(lexer.options)
00324                 lx = lexer.__class__(**kwargs)
00325             else:
00326                 lx = lexer
00327             s = match.start()
00328             for i, t, v in lx.get_tokens_unprocessed(match.group(), **gt_kwargs):
00329                 yield i + s, t, v
00330             if ctx:
00331                 ctx.pos = match.end()
00332     else:
00333         def callback(lexer, match, ctx=None):
00334             # XXX: cache that somehow
00335             kwargs.update(lexer.options)
00336             lx = _other(**kwargs)
00337 
00338             s = match.start()
00339             for i, t, v in lx.get_tokens_unprocessed(match.group(), **gt_kwargs):
00340                 yield i + s, t, v
00341             if ctx:
00342                 ctx.pos = match.end()
00343     return callback
00344 
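Two hedged sketches of how ``using`` is typically written into a rule. ``JavascriptLexer`` is an existing Pygments lexer (the import path is assumed); the regular expressions and the ``'nested'`` state name are invented for the example.

from pygments.lexer import bygroups, using, this
from pygments.token import Name
from pygments.lexers import JavascriptLexer

# hand one group of the match to a different lexer:
script_rule = (r'(?s)(<script>)(.*?)(</script>)',
               bygroups(Name.Tag, using(JavascriptLexer), Name.Tag))

# re-enter the *current* lexer, starting on top of its 'root' state:
nested_rule = (r'\{[^}]*\}', using(this, state='nested'))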
00345 
00346 class RegexLexerMeta(LexerMeta):
00347     """
00348     Metaclass for RegexLexer, creates the self._tokens attribute from
00349     self.tokens on the first instantiation.
00350     """
00351 
00352     def _process_state(cls, unprocessed, processed, state):
00353         assert type(state) is str, "wrong state name %r" % state
00354         assert state[0] != '#', "invalid state name %r" % state
00355         if state in processed:
00356             return processed[state]
00357         tokens = processed[state] = []
00358         rflags = cls.flags
00359         for tdef in unprocessed[state]:
00360             if isinstance(tdef, include):
00361                 # it's a state reference
00362                 assert tdef != state, "circular state reference %r" % state
00363                 tokens.extend(cls._process_state(unprocessed, processed, str(tdef)))
00364                 continue
00365 
00366             assert type(tdef) is tuple, "wrong rule def %r" % tdef
00367 
00368             try:
00369                 rex = re.compile(tdef[0], rflags).match
00370             except Exception, err:
00371                 raise ValueError("uncompilable regex %r in state %r of %r: %s" %
00372                                  (tdef[0], state, cls, err))
00373 
00374             assert type(tdef[1]) is _TokenType or callable(tdef[1]), \
00375                    'token type must be simple type or callable, not %r' % (tdef[1],)
00376 
00377             if len(tdef) == 2:
00378                 new_state = None
00379             else:
00380                 tdef2 = tdef[2]
00381                 if isinstance(tdef2, str):
00382                     # an existing state
00383                     if tdef2 == '#pop':
00384                         new_state = -1
00385                     elif tdef2 in unprocessed:
00386                         new_state = (tdef2,)
00387                     elif tdef2 == '#push':
00388                         new_state = tdef2
00389                     elif tdef2[:5] == '#pop:':
00390                         new_state = -int(tdef2[5:])
00391                     else:
00392                         assert False, 'unknown new state %r' % tdef2
00393                 elif isinstance(tdef2, combined):
00394                     # combine a new state from existing ones
00395                     new_state = '_tmp_%d' % cls._tmpname
00396                     cls._tmpname += 1
00397                     itokens = []
00398                     for istate in tdef2:
00399                         assert istate != state, 'circular state ref %r' % istate
00400                         itokens.extend(cls._process_state(unprocessed,
00401                                                           processed, istate))
00402                     processed[new_state] = itokens
00403                     new_state = (new_state,)
00404                 elif isinstance(tdef2, tuple):
00405                     # push more than one state
00406                     for state in tdef2:
00407                         assert (state in unprocessed or
00408                                 state in ('#pop', '#push')), \
00409                                'unknown new state ' + state
00410                     new_state = tdef2
00411                 else:
00412                     assert False, 'unknown new state def %r' % tdef2
00413             tokens.append((rex, tdef[1], new_state))
00414         return tokens
00415 
00416     def process_tokendef(cls, name, tokendefs=None):
00417         processed = cls._all_tokens[name] = {}
00418         tokendefs = tokendefs or cls.tokens[name]
00419         for state in tokendefs.keys():
00420             cls._process_state(tokendefs, processed, state)
00421         return processed
00422 
00423     def __call__(cls, *args, **kwds):
00424         if not hasattr(cls, '_tokens'):
00425             cls._all_tokens = {}
00426             cls._tmpname = 0
00427             if hasattr(cls, 'token_variants') and cls.token_variants:
00428                 # don't process yet
00429                 pass
00430             else:
00431                 cls._tokens = cls.process_tokendef('', cls.tokens)
00432 
00433         return type.__call__(cls, *args, **kwds)
00434 
00435 
00436 class RegexLexer(Lexer):
00437     """
00438     Base for simple stateful regular expression-based lexers.
00439     Simplifies the lexing process so that you need only
00440     provide a list of states and regular expressions.
00441     """
00442     __metaclass__ = RegexLexerMeta
00443 
00444     #: Flags for compiling the regular expressions.
00445     #: Defaults to MULTILINE.
00446     flags = re.MULTILINE
00447 
00448     #: Dict of ``{'state': [(regex, tokentype, new_state), ...], ...}``
00449     #:
00450     #: The initial state is 'root'.
00451     #: ``new_state`` can be omitted to signify no state transition.
00452     #: If it is a string, the state is pushed on the stack and changed.
00453     #: If it is a tuple of strings, all states are pushed on the stack and
00454     #: the current state will be the topmost.
00455     #: It can also be ``combined('state1', 'state2', ...)``
00456     #: to signify a new, anonymous state combined from the rules of two
00457     #: or more existing ones.
00458     #: Furthermore, it can be '#pop' to signify going back one step in
00459     #: the state stack, or '#push' to push the current state on the stack
00460     #: again.
00461     #:
00462     #: The tuple can also be replaced with ``include('state')``, in which
00463     #: case the rules from the state named by the string are included in the
00464     #: current one.
00465     tokens = {}
00466 
00467     def get_tokens_unprocessed(self, text, stack=('root',)):
00468         """
00469         Split ``text`` into (index, tokentype, value) tuples.
00470 
00471         ``stack`` is the initial stack (default: ``['root']``)
00472         """
00473         pos = 0
00474         tokendefs = self._tokens
00475         statestack = list(stack)
00476         statetokens = tokendefs[statestack[-1]]
00477         while 1:
00478             for rexmatch, action, new_state in statetokens:
00479                 m = rexmatch(text, pos)
00480                 if m:
00481                     if type(action) is _TokenType:
00482                         yield pos, action, m.group()
00483                     else:
00484                         for item in action(self, m):
00485                             yield item
00486                     pos = m.end()
00487                     if new_state is not None:
00488                         # state transition
00489                         if isinstance(new_state, tuple):
00490                             for state in new_state:
00491                                 if state == '#pop':
00492                                     statestack.pop()
00493                                 elif state == '#push':
00494                                     statestack.append(statestack[-1])
00495                                 else:
00496                                     statestack.append(state)
00497                         elif isinstance(new_state, int):
00498                             # pop
00499                             del statestack[new_state:]
00500                         elif new_state == '#push':
00501                             statestack.append(statestack[-1])
00502                         else:
00503                             assert False, "wrong state def: %r" % new_state
00504                         statetokens = tokendefs[statestack[-1]]
00505                     break
00506             else:
00507                 try:
00508                     if text[pos] == '\n':
00509                         # at EOL, reset state to "root"
00510                         pos += 1
00511                         statestack = ['root']
00512                         statetokens = tokendefs['root']
00513                         yield pos, Text, u'\n'
00514                         continue
00515                     yield pos, Error, text[pos]
00516                     pos += 1
00517                 except IndexError:
00518                     break
00519 
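To make the ``tokens`` machinery described on ``RegexLexer`` concrete, here is a small self-contained lexer. The language, the state names, and every rule below are invented for this sketch; it exists only to show ``include``, ``bygroups``, state pushing, and ``'#pop'`` working together.

from pygments.lexer import RegexLexer, include, bygroups
from pygments.token import Text, Comment, Keyword, Name, String, Operator, Punctuation

class TinyConfLexer(RegexLexer):
    """Hypothetical lexer for a tiny INI-like configuration language."""
    name = 'TinyConf'
    aliases = ['tinyconf']
    filenames = ['*.tconf']

    tokens = {
        'whitespace': [
            (r'\s+', Text),
            (r'#.*?$', Comment.Single),
        ],
        'root': [
            include('whitespace'),
            (r'(\[)([^\]]+)(\])', bygroups(Punctuation, Keyword.Namespace, Punctuation)),
            (r'([A-Za-z_][\w.]*)(\s*)(=)',
             bygroups(Name.Attribute, Text, Operator), 'value'),   # push 'value'
        ],
        'value': [
            (r'"[^"]*"', String.Double, '#pop'),   # pop back to 'root'
            (r'[^\s#]+', String, '#pop'),
            (r'\s+', Text),
        ],
    }

text = '# demo\n[server]\nhost = "localhost"\nport = 8080\n'
for tokentype, value in TinyConfLexer().get_tokens(text):
    pass   # e.g. (Token.Keyword.Namespace, u'server'), (Token.Name.Attribute, u'host'), ...

# note: RegexLexerMeta compiled these rules into TinyConfLexer._tokens on the
# first instantiation above; later instantiations reuse the processed table.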
00520 
00521 class LexerContext(object):
00522     """
00523     A helper object that holds lexer position data.
00524     """
00525 
00526     def __init__(self, text, pos, stack=None, end=None):
00527         self.text = text
00528         self.pos = pos
00529         self.end = end or len(text) # end=0 not supported ;-)
00530         self.stack = stack or ['root']
00531 
00532     def __repr__(self):
00533         return 'LexerContext(%r, %r, %r)' % (
00534             self.text, self.pos, self.stack)
00535 
00536 
00537 class ExtendedRegexLexer(RegexLexer):
00538     """
00539     A RegexLexer that uses a context object to store its state.
00540     """
00541 
00542     def get_tokens_unprocessed(self, text=None, context=None):
00543         """
00544         Split ``text`` into (index, tokentype, value) tuples.
00545         If ``context`` is given, use this lexer context instead.
00546         """
00547         tokendefs = self._tokens
00548         if not context:
00549             ctx = LexerContext(text, 0)
00550             statetokens = tokendefs['root']
00551         else:
00552             ctx = context
00553             statetokens = tokendefs[ctx.stack[-1]]
00554             text = ctx.text
00555         while 1:
00556             for rexmatch, action, new_state in statetokens:
00557                 m = rexmatch(text, ctx.pos, ctx.end)
00558                 if m:
00559                     if type(action) is _TokenType:
00560                         yield ctx.pos, action, m.group()
00561                         ctx.pos = m.end()
00562                     else:
00563                         for item in action(self, m, ctx):
00564                             yield item
00565                         if not new_state:
00566                             # altered the state stack?
00567                             statetokens = tokendefs[ctx.stack[-1]]
00568                     # CAUTION: callback must set ctx.pos!
00569                     if new_state is not None:
00570                         # state transition
00571                         if isinstance(new_state, tuple):
00572                             ctx.stack.extend(new_state)
00573                         elif isinstance(new_state, int):
00574                             # pop
00575                             del ctx.stack[new_state:]
00576                         elif new_state == '#push':
00577                             ctx.stack.append(ctx.stack[-1])
00578                         else:
00579                             assert False, "wrong state def: %r" % new_state
00580                         statetokens = tokendefs[ctx.stack[-1]]
00581                     break
00582             else:
00583                 try:
00584                     if ctx.pos >= ctx.end:
00585                         break
00586                     if text[ctx.pos] == '\n':
00587                         # at EOL, reset state to "root"
00588                         ctx.pos += 1
00589                         ctx.stack = ['root']
00590                         statetokens = tokendefs['root']
00591                         yield ctx.pos, Text, u'\n'
00592                         continue
00593                     yield ctx.pos, Error, text[ctx.pos]
00594                     ctx.pos += 1
00595                 except IndexError:
00596                     break
00597 
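The difference from ``RegexLexer`` is that callbacks receive the ``LexerContext`` and must advance ``ctx.pos`` themselves. A hedged sketch follows; the callback, the lexer, and its rules are invented for illustration.

from pygments.lexer import ExtendedRegexLexer, LexerContext
from pygments.token import Text, String

def eat_rest_of_line(lexer, match, ctx):
    """Hypothetical callback: consume from the match to the end of the line."""
    start = match.start()
    end = ctx.text.find('\n', start)
    if end < 0:
        end = ctx.end
    yield start, String.Heredoc, ctx.text[start:end]
    ctx.pos = end                  # the callback must move ctx.pos itself

class LineBlobLexer(ExtendedRegexLexer):
    """Hypothetical lexer using the context-aware callback above."""
    name = 'LineBlob (example)'
    tokens = {
        'root': [
            (r'>', eat_rest_of_line),
            (r'[^>\n]+', Text),
        ],
    }

# lexing can be started from (or resumed with) an explicit context:
ctx = LexerContext(u'> blob\nplain\n', 0)
items = list(LineBlobLexer().get_tokens_unprocessed(context=ctx))
# items is a list of (index, tokentype, value) tuples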
00598 
00599 def do_insertions(insertions, tokens):
00600     """
00601     Helper for lexers which must combine the results of several
00602     sublexers.
00603 
00604     ``insertions`` is a list of ``(index, itokens)`` pairs.
00605     Each ``itokens`` iterable should be inserted at position
00606     ``index`` into the token stream given by the ``tokens``
00607     argument.
00608 
00609     The result is a combined token stream.
00610 
00611     TODO: clean up the code here.
00612     """
00613     insertions = iter(insertions)
00614     try:
00615         index, itokens = insertions.next()
00616     except StopIteration:
00617         # no insertions
00618         for item in tokens:
00619             yield item
00620         return
00621 
00622     realpos = None
00623     insleft = True
00624 
00625     # iterate over the token stream where we want to insert
00626     # the tokens from the insertion list.
00627     for i, t, v in tokens:
00628         # first iteration. store the position of the first item
00629         if realpos is None:
00630             realpos = i
00631         oldi = 0
00632         while insleft and i + len(v) >= index:
00633             tmpval = v[oldi:index - i]
00634             yield realpos, t, tmpval
00635             realpos += len(tmpval)
00636             for it_index, it_token, it_value in itokens:
00637                 yield realpos, it_token, it_value
00638                 realpos += len(it_value)
00639             oldi = index - i
00640             try:
00641                 index, itokens = insertions.next()
00642             except StopIteration:
00643                 insleft = False
00644                 break  # not strictly necessary
00645         yield realpos, t, v[oldi:]
00646         realpos += len(v) - oldi
00647 
00648     # leftover tokens
00649     if insleft:
00650         # no normal tokens, set realpos to zero
00651         realpos = realpos or 0
00652         for p, t, v in itokens:
00653             yield realpos, t, v
00654             realpos += len(v)
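A small hedged illustration of ``do_insertions`` in isolation; both token streams below are built by hand rather than produced by real lexers, which is enough to show how the insertion indices interact with the base stream.

from pygments.lexer import do_insertions
from pygments.token import Text, Generic

# base stream: one token covering two lines, starting at index 0
base = [(0, Text, u'line one\nline two\n')]

# insert a prompt-style token at position 9 of the base stream
insertions = [(9, [(0, Generic.Prompt, u'> ')])]

merged = list(do_insertions(insertions, iter(base)))
# -> [(0, Text, u'line one\n'), (9, Generic.Prompt, u'> '), (11, Text, u'line two\n')]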