moin  1.9.0~rc2
special.py
# -*- coding: utf-8 -*-
"""
    pygments.lexers.special
    ~~~~~~~~~~~~~~~~~~~~~~~

    Special lexers.

    :copyright: Copyright 2006-2009 by the Pygments team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""

import re
import cStringIO

from pygments.lexer import Lexer
from pygments.token import Token, Error, Text
from pygments.util import get_choice_opt, b


__all__ = ['TextLexer', 'RawTokenLexer']


class TextLexer(Lexer):
    """
    "Null" lexer, doesn't highlight anything.
    """
    name = 'Text only'
    aliases = ['text']
    filenames = ['*.txt']
    mimetypes = ['text/plain']

    def get_tokens_unprocessed(self, text):
        yield 0, Text, text

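# Usage sketch, doctest style (assuming default Lexer options, under which
# the base class normalises the trailing newline; the lexer itself emits the
# whole input as a single Text token):
#
#     >>> list(TextLexer().get_tokens(u'no markup here'))
#     [(Token.Text, u'no markup here\n')]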

_ttype_cache = {}

line_re = re.compile(b('.*?\n'))

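# Each line of a raw token stream (the format emitted by RawTokenFormatter
# under Python 2 and parsed by RawTokenLexer below) is the dotted token type,
# a tab, and the repr of the value, e.g.
#
#     Token.Keyword\tu'print'\n
#
# line_re splits the stream into such lines; _ttype_cache memoizes the
# mapping from the dotted name back to the actual token type object.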
class RawTokenLexer(Lexer):
    """
    Recreate a token stream formatted with the `RawTokenFormatter`.  This
    lexer raises exceptions during parsing if the token stream in the
    file is malformed.

    Additional options accepted:

    `compress`
        If set to ``"gz"`` or ``"bz2"``, decompress the token stream with
        the given compression algorithm before lexing (default: ``""``).
    """
    name = 'Raw token data'
    aliases = ['raw']
    filenames = []
    mimetypes = ['application/x-pygments-tokens']

    def __init__(self, **options):
        self.compress = get_choice_opt(options, 'compress',
                                       ['', 'none', 'gz', 'bz2'], '')
        Lexer.__init__(self, **options)

    def get_tokens(self, text):
        if isinstance(text, unicode):
            # raw token stream never has any non-ASCII characters
            text = text.encode('ascii')
        if self.compress == 'gz':
            import gzip
            gzipfile = gzip.GzipFile('', 'rb', 9, cStringIO.StringIO(text))
            text = gzipfile.read()
        elif self.compress == 'bz2':
            import bz2
            text = bz2.decompress(text)

        # do not call Lexer.get_tokens() because we do not want Unicode
        # decoding to occur, and stripping is not optional.
        text = text.strip(b('\n')) + b('\n')
        for i, t, v in self.get_tokens_unprocessed(text):
            yield t, v

    def get_tokens_unprocessed(self, text):
        length = 0
        for match in line_re.finditer(text):
            try:
                ttypestr, val = match.group().split(b('\t'), 1)
            except ValueError:
                val = match.group().decode(self.encoding)
                ttype = Error
            else:
                ttype = _ttype_cache.get(ttypestr)
                if not ttype:
                    ttype = Token
                    ttypes = ttypestr.split('.')[1:]
                    for ttype_ in ttypes:
                        if not ttype_ or not ttype_[0].isupper():
                            raise ValueError('malformed token name')
                        ttype = getattr(ttype, ttype_)
                    _ttype_cache[ttypestr] = ttype
                val = val[2:-2].decode('unicode-escape')
            yield length, ttype, val
            length += len(val)
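
A short round-trip sketch (not part of this file; it assumes the standard
PythonLexer and RawTokenFormatter that ship with Pygments): serialize a token
stream with RawTokenFormatter, then recreate it with RawTokenLexer.

from pygments import highlight
from pygments.formatters import RawTokenFormatter
from pygments.lexers import PythonLexer

source = u'print "hello"\n'

# One "Token.Type\t<repr of value>" line per token.
raw = highlight(source, PythonLexer(), RawTokenFormatter())

# compress='' (the default) reads the stream as-is; pass 'gz' or 'bz2'
# if the stream was written compressed.
for ttype, value in RawTokenLexer(compress='').get_tokens(raw):
    print '%-25s %r' % (ttype, value)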