Back to index

moin  1.9.0~rc2
_creole.py
Go to the documentation of this file.
00001 # -*- coding: iso-8859-1 -*-
00002 """
00003     Creole wiki markup parser
00004 
00005     See http://wikicreole.org/ for latest specs.
00006 
00007     Notes:
00008     * No markup allowed in headings.
00009       Creole 1.0 does not require us to support this.
00010     * No markup allowed in table headings.
00011       Creole 1.0 does not require us to support this.
00012     * No (non-bracketed) generic url recognition: this is "mission impossible"
00013       except if you want to risk lots of false positives. Only known protocols
00014       are recognized.
00015     * We do not allow ":" before "//" italic markup to avoid urls with
00016       unrecognized schemes (like wtf://server/path) triggering italic rendering
00017       for the rest of the paragraph.
00018 
00019     @copyright: 2007 MoinMoin:RadomirDopieralski (creole 0.5 implementation),
00020                 2007 MoinMoin:ThomasWaldmann (updates)
00021     @license: GNU GPL, see COPYING for details.
00022     @license: BSD, see COPYING for details.
00023 """
00024 
00025 import re
00026 
# Whether the parser should convert \n into <br>.
# NOTE: the Rules class reads this flag once, while its class body is being
# executed, to choose which ``text`` pattern to define; rebinding the flag
# afterwards does not change Rules.text.
bloglike_lines = False
00029 
class Rules:
    """Hold all the rules for generating regular expressions.

    Every attribute is a pattern *string*, not a compiled expression; Parser
    joins them with '|' and compiles the result.  The patterns are written
    for verbose mode (Parser compiles them with re.X), so unescaped
    whitespace inside them is ignored.  Each alternative is wrapped in a
    named group, and Parser uses the group names to pick the handler method.
    """

    # For the inline elements:

    # Alternation of the url schemes that are recognized in bare urls.
    proto = r'http|https|ftp|nntp|news|mailto|telnet|file|irc'
    # Bare url: has to begin at ^ or right after whitespace or one of
    # .,:;!?()/= and ends at $, before whitespace, or before a punctuation
    # character that is itself followed by whitespace or $.  An optional
    # leading ~ is captured as escaped_url.
    url =  r'''(?P<url>
            (^ | (?<=\s | [.,:;!?()/=]))
            (?P<escaped_url>~)?
            (?P<url_target> (?P<url_proto> %s ):\S+? )
            ($ | (?=\s | [,.:;!?()] (\s | $)))
        )''' % proto
    # Bracketed link: [[target]] or [[target|text]].
    link = r'''(?P<link>
            \[\[
            (?P<link_target>.+?) \s*
            ([|] \s* (?P<link_text>.+?) \s*)?
            ]]
        )'''
    # Image: {{target}} or {{target|text}}.
    image = r'''(?P<image>
            {{
            (?P<image_target>.+?) \s*
            ([|] \s* (?P<image_text>.+?) \s*)?
            }}
        )'''
    # Macro: <<name>>, optionally with (args) and/or |text.
    macro = r'''(?P<macro>
            <<
            (?P<macro_name> \w+)
            (\( (?P<macro_args> .*?) \))? \s*
            ([|] \s* (?P<macro_text> .+?) \s* )?
            >>
        )'''
    # Inline code: {{{ ... }}} (non-greedy, so it stops at the first }}}).
    code = r'(?P<code> {{{ (?P<code_text>.*?) }}} )'
    emph = r'(?P<emph> (?<!:)// )' # there must be no : in front of the //
                                   # avoids italic rendering in urls with
                                   # unknown protocols
    # Bold marker: two asterisks.
    strong = r'(?P<strong> \*\* )'
    # Forced line break: two backslashes.  Note that the group is named
    # "break", not "linebreak".
    linebreak = r'(?P<break> \\\\ )'
    # A ~ followed by one non-whitespace character (captured as escaped_char).
    escape = r'(?P<escape> ~ (?P<escaped_char>\S) )'
    # Any single character; Parser lists this rule last in its alternations,
    # so it only applies where no other rule matched.
    char =  r'(?P<char> . )'

    # For the block elements:
    separator = r'(?P<separator> ^ \s* ---- \s* $ )' # horizontal line
    line = r'(?P<line> ^ \s* $ )' # empty line that separates paragraphs
    # Heading: one or more leading = (head_head), the heading text, and an
    # optional run of trailing = (head_tail).
    head = r'''(?P<head>
            ^ \s*
            (?P<head_head>=+) \s*
            (?P<head_text> .*? ) \s*
            (?P<head_tail>=*) \s*
            $
        )'''
    # A line of running text.  With bloglike_lines set, the pattern also
    # captures (as "break") a line end that is not preceded by a backslash
    # and is not followed by a blank line.
    if bloglike_lines:
        text = r'(?P<text> .+ ) (?P<break> (?<!\\)$\n(?!\s*$) )?'
    else:
        text = r'(?P<text> .+ )'
    list = r'''(?P<list>
            ^ [ \t]* ([*][^*\#]|[\#][^\#*]).* $
            ( \n[ \t]* [*\#]+.* $ )*
        )''' # Matches the whole list, separate items are parsed later. The
             # list *must* start with a single bullet.
    item = r'''(?P<item>
            ^ \s*
            (?P<item_head> [\#*]+) \s*
            (?P<item_text> .*?)
            $
        )''' # Matches single list items
    # Preformatted block: a line holding only {{{, the content (pre_text),
    # and a line holding only }}}.  If the content starts with a "#!word"
    # line, the word is captured as pre_kind.
    pre = r'''(?P<pre>
            ^{{{ \s* $
            (\n)?
            (?P<pre_text>
                ([\#]!(?P<pre_kind>\w*?)(\s+.*)?$)?
                (.|\n)+?
            )
            (\n)?
            ^}}} \s*$
        )'''
    # A line made of optional indentation, a ~ and }}}; Parser._pre_repl uses
    # it to drop the tilde from such lines inside preformatted text.
    pre_escape = r' ^(?P<indent>\s*) ~ (?P<rest> \}\}\} \s*) $'
    # Table row: a line whose first non-blank character is |.
    table = r'''(?P<table>
            ^ \s*
            [|].*? \s*
            [|]? \s*
            $
        )'''

    # For splitting table cells:
    # A cell starts at a |.  A heading cell ("head") begins with = and runs to
    # the next |; a normal cell ("cell") is a run of complete link, macro,
    # image or code constructs and of characters other than |.
    cell = r'''
            \| \s*
            (
                (?P<head> [=][^|]+ ) |
                (?P<cell> (  %s | [^|])+ )
            ) \s*
        ''' % '|'.join([link, macro, image, code])
00120 
class Parser:
    """
    Parse the raw text and create a document object
    that can be converted into output using Emitter.

    Parsing is driven by regular expressions: block_re splits the raw text
    into block elements, and inline_re (or link_re, for link descriptions)
    handles the markup inside a block.  Every match is passed to _replace(),
    which calls the ``_<group>_repl`` method named after a matched group;
    those methods build the DocNode tree below self.root.
    """

    # Node kinds below which block level elements are created.
    _CONTAINER_KINDS = ('document', 'section', 'blockquote')
    # Node kinds that represent a list.
    _LIST_KINDS = ('number_list', 'bullet_list')

    # For pre escaping, in creole 1.0 done with ~:
    pre_escape_re = re.compile(Rules.pre_escape, re.M | re.X)
    # For link descriptions:
    link_re = re.compile('|'.join([Rules.image, Rules.linebreak, Rules.char]),
                         re.X | re.U)
    # For list items:
    item_re = re.compile(Rules.item, re.X | re.U | re.M)
    # For table cells:
    cell_re = re.compile(Rules.cell, re.X | re.U)
    # For block elements:
    block_re = re.compile('|'.join([Rules.line, Rules.head, Rules.separator,
        Rules.pre, Rules.list, Rules.table, Rules.text]), re.X | re.U | re.M)
    # For inline elements:
    inline_re = re.compile('|'.join([Rules.link, Rules.url, Rules.macro,
        Rules.code, Rules.image, Rules.strong, Rules.emph, Rules.linebreak,
        Rules.escape, Rules.char]), re.X | re.U)

    def __init__(self, raw):
        """Remember the raw markup and create an empty document tree."""
        self.raw = raw
        self.root = DocNode('document', None)
        self.cur = self.root        # The most recent document node
        self.text = None            # The node to add inline characters to

    def _upto(self, node, kinds):
        """
        Look up the tree to the first occurrence
        of one of the listed kinds of nodes or root.
        Start at the node node.
        """
        while node.parent is not None and node.kind not in kinds:
            node = node.parent
        return node

    # The _*_repl methods called for matches in regexps. Sometimes the
    # same method needs several names, because of group names in regexps.

    def _url_repl(self, groups):
        """Handle raw urls in text."""

        if not groups.get('escaped_url'):
            # this url is NOT escaped
            target = groups.get('url_target', '')
            node = DocNode('link', self.cur)
            node.content = target
            DocNode('text', node, node.content)
            self.text = None
        else:
            # this url is escaped, we render it as text
            if self.text is None:
                self.text = DocNode('text', self.cur, u'')
            self.text.content += groups.get('url_target')
    _url_target_repl = _url_repl
    _url_proto_repl = _url_repl
    # The dispatcher looks handlers up as '_<group>_repl', so the handler for
    # the "escaped_url" group needs the _repl suffix as well.
    _escaped_url_repl = _url_repl
    _escaped_url = _url_repl    # old (misspelled) name, kept for compatibility

    def _link_repl(self, groups):
        """Handle all kinds of links."""

        target = groups.get('link_target', '')
        text = (groups.get('link_text', '') or '').strip()
        parent = self.cur
        self.cur = DocNode('link', self.cur)
        self.cur.content = target
        self.text = None
        # The link description only supports images, line breaks and text.
        self.link_re.sub(self._replace, text)
        self.cur = parent
        self.text = None
    _link_target_repl = _link_repl
    _link_text_repl = _link_repl

    def _macro_repl(self, groups):
        """Handles macros using the placeholder syntax."""

        name = groups.get('macro_name', '')
        text = (groups.get('macro_text', '') or '').strip()
        node = DocNode('macro', self.cur, name)
        node.args = groups.get('macro_args', '') or ''
        # Without an explicit text the macro name is used as the text child.
        DocNode('text', node, text or name)
        self.text = None
    _macro_name_repl = _macro_repl
    _macro_args_repl = _macro_repl
    _macro_text_repl = _macro_repl

    def _image_repl(self, groups):
        """Handles images and attachments included in the page."""

        target = (groups.get('image_target', '') or '').strip()
        text = (groups.get('image_text', '') or '').strip()
        node = DocNode("image", self.cur, target)
        # Without an explicit text the target is used as the text child.
        DocNode('text', node, text or node.content)
        self.text = None
    _image_target_repl = _image_repl
    _image_text_repl = _image_repl

    def _separator_repl(self, groups):
        """Add a horizontal line to the enclosing container node."""
        self.cur = self._upto(self.cur, self._CONTAINER_KINDS)
        DocNode('separator', self.cur)

    def _item_repl(self, groups):
        """Handle a single list item (called for matches of item_re)."""
        bullet = groups.get('item_head', u'')
        text = groups.get('item_text', u'')
        # The last bullet character decides the kind of this list level.
        if bullet[-1] == '#':
            kind = 'number_list'
        else:
            kind = 'bullet_list'
        level = len(bullet)
        lst = self.cur
        # Find a list of the same kind and level up the tree
        while (lst is not None and
               not (lst.kind in self._LIST_KINDS and lst.level == level) and
               lst.kind not in self._CONTAINER_KINDS):
            lst = lst.parent
        if lst is not None and lst.kind == kind:
            self.cur = lst
        else:
            # Create a new level of list
            self.cur = self._upto(self.cur,
                ('list_item', ) + self._CONTAINER_KINDS)
            self.cur = DocNode(kind, self.cur)
            self.cur.level = level
        self.cur = DocNode('list_item', self.cur)
        self.parse_inline(text)
        self.text = None
    _item_text_repl = _item_repl
    _item_head_repl = _item_repl

    def _list_repl(self, groups):
        """Handle a whole list by parsing its items one by one."""
        text = groups.get('list', u'')
        self.item_re.sub(self._replace, text)

    def _head_repl(self, groups):
        """Add a header node; its level is the number of leading = signs."""
        self.cur = self._upto(self.cur, self._CONTAINER_KINDS)
        node = DocNode('header', self.cur, groups.get('head_text', '').strip())
        node.level = len(groups.get('head_head', ' '))
    _head_head_repl = _head_repl
    _head_text_repl = _head_repl
    # head_tail always takes part in a heading match (it may be empty), so
    # the dispatcher can select it and needs a handler for it.
    _head_tail_repl = _head_repl

    def _text_repl(self, groups):
        """Handle a line of running text, opening a paragraph if needed."""
        text = groups.get('text', '')
        if self.cur.kind in ('table', 'table_row', 'bullet_list',
            'number_list'):
            self.cur = self._upto(self.cur, self._CONTAINER_KINDS)
        if self.cur.kind in self._CONTAINER_KINDS:
            self.cur = DocNode('paragraph', self.cur)
        else:
            # Continuation line: separate it from the previous line.
            text = u' ' + text
        self.parse_inline(text)
        if groups.get('break') and self.cur.kind in ('paragraph',
            'emphasis', 'strong', 'code'):
            DocNode('break', self.cur, '')
        self.text = None

    def _table_repl(self, groups):
        """Handle one table row, creating the table node when necessary."""
        row = groups.get('table', '|').strip()
        self.cur = self._upto(self.cur, ('table', ) + self._CONTAINER_KINDS)
        if self.cur.kind != 'table':
            self.cur = DocNode('table', self.cur)
        tb = self.cur
        tr = DocNode('table_row', tb)

        for m in self.cell_re.finditer(row):
            cell = m.group('cell')
            if cell:
                self.cur = DocNode('table_cell', tr)
                self.text = None
                self.parse_inline(cell)
            else:
                # Heading cells get no inline parsing, only the = stripped.
                cell = m.group('head')
                self.cur = DocNode('table_head', tr)
                self.text = DocNode('text', self.cur, u'')
                self.text.content = cell.strip('=')
        self.cur = tb
        self.text = None

    def _pre_repl(self, groups):
        """Handle a preformatted block."""
        self.cur = self._upto(self.cur, self._CONTAINER_KINDS)
        kind = groups.get('pre_kind', None)
        text = groups.get('pre_text', u'')
        def remove_tilde(m):
            # Turn an escaped "~}}}" line back into "}}}".
            return m.group('indent') + m.group('rest')
        text = self.pre_escape_re.sub(remove_tilde, text)
        node = DocNode('preformatted', self.cur, text)
        node.sect = kind or ''
        self.text = None
    _pre_text_repl = _pre_repl
    _pre_head_repl = _pre_repl
    _pre_kind_repl = _pre_repl

    def _line_repl(self, groups):
        """Handle an empty line: go back up to the enclosing container."""
        self.cur = self._upto(self.cur, self._CONTAINER_KINDS)

    def _code_repl(self, groups):
        """Handle inline code."""
        DocNode('code', self.cur, groups.get('code_text', u'').strip())
        self.text = None
    _code_text_repl = _code_repl
    _code_head_repl = _code_repl

    def _emph_repl(self, groups):
        """Open an emphasis node, or close it if it is the current node."""
        if self.cur.kind != 'emphasis':
            self.cur = DocNode('emphasis', self.cur)
        else:
            self.cur = self._upto(self.cur, ('emphasis', )).parent
        self.text = None

    def _strong_repl(self, groups):
        """Open a strong node, or close it if it is the current node."""
        if self.cur.kind != 'strong':
            self.cur = DocNode('strong', self.cur)
        else:
            self.cur = self._upto(self.cur, ('strong', )).parent
        self.text = None

    def _break_repl(self, groups):
        """
        Handle the "break" group.

        The group name is used by the inline line break rule and by the
        bloglike variant of the block level text rule.  Only the block
        regexp has a "text" group, so a match that carries text is handed
        to _text_repl (which parses the text and adds the break itself);
        otherwise this is an inline forced line break.
        """
        if groups.get('text') is not None:
            self._text_repl(groups)
            return
        DocNode('break', self.cur, None)
        self.text = None

    def _escape_repl(self, groups):
        """Append the escaped character to the current text node."""
        if self.text is None:
            self.text = DocNode('text', self.cur, u'')
        self.text.content += groups.get('escaped_char', u'')
    # "escaped_char" takes part in every escape match, so the dispatcher can
    # select it and needs a handler for it.
    _escaped_char_repl = _escape_repl

    def _char_repl(self, groups):
        """Append a plain character to the current text node."""
        if self.text is None:
            self.text = DocNode('text', self.cur, u'')
        self.text.content += groups.get('char', u'')

    def _replace(self, match):
        """
        Invoke appropriate _*_repl method. Called for every match.

        Exactly one handler is called per match and it receives the dict of
        all named groups.  Always returns None, which re.sub() treats as an
        empty replacement; the substituted string is never used.
        """

        groups = match.groupdict()
        # lastgroup names the named group that was closed last, i.e. the
        # outermost group of the alternative that matched.  Using it keeps
        # the dispatch independent of the dict iteration order.
        name = match.lastgroup
        if name is None:
            # Fall back to the first named group that took part in the match.
            for candidate, text in groups.items():
                if text is not None:
                    name = candidate
                    break
            else:
                return
        getattr(self, '_%s_repl' % name)(groups)

    def parse_inline(self, raw):
        """Recognize inline elements inside blocks."""

        self.inline_re.sub(self._replace, raw)

    def parse_block(self, raw):
        """Recognize block elements."""

        self.block_re.sub(self._replace, raw)

    def parse(self):
        """Parse the text given as self.raw and return DOM tree."""

        self.parse_block(self.raw)
        return self.root
00378 
00379 #################### Helper classes
00380 
00381 ### The document model and emitter follow
00382 
class DocNode:
    """
    One element of the parsed document tree.

    A node knows its kind, its parent, an optional content value and the
    list of its children.  Creating a node with a parent also registers the
    new node as the last child of that parent.
    """

    def __init__(self, kind='', parent=None, content=None):
        self.kind, self.parent, self.content = kind, parent, content
        self.children = []
        if parent is None:
            # A root (or detached) node has nowhere to register itself.
            return
        parent.children.append(self)
00395 
00396