Back to index

moin  1.9.0~rc2
htmlmarkup.py
Go to the documentation of this file.
00001 # -*- coding: utf-8 -*-
00002 # copied from trac.util.html, revision 3609, merged on 2006-08-20
00003 #
00004 # Copyright (C) 2003-2006 Edgewall Software
00005 # Copyright 2006 MoinMoin:AlexanderSchremmer
00006 # All rights reserved.
00007 #
00008 # This software is licensed as described in the file COPYING, which
00009 # you should have received as part of this distribution. The terms
00010 # are also available at http://trac.edgewall.com/license.html.
00011 #
00012 # This software consists of voluntary contributions made by many
00013 # individuals. For exact contribution history, see the revision
00014 # history and logs, available at http://projects.edgewall.com/trac/.
00015 
00016 import htmlentitydefs
00017 from HTMLParser import HTMLParser, HTMLParseError
00018 import re
00019 try:
00020     frozenset
00021 except NameError:
00022     from sets import ImmutableSet as frozenset
00023 from StringIO import StringIO
00024 
# Public names of this module.
__all__ = ['escape', 'unescape', 'html']

# Elements that are written in the self-closing form (e.g. `<br />`) and for
# which no separate end tag is emitted.
_EMPTY_TAGS = frozenset(['br', 'hr', 'img', 'input'])
# Attributes serialized in minimized-boolean fashion by `Element`: omitted when
# the value is false, otherwise rendered with the attribute name as value.
_BOOLEAN_ATTRS = frozenset(['selected', 'checked', 'compact', 'declare',
                            'defer', 'disabled', 'ismap', 'multiple', 'nohref',
                            'noresize', 'noshade', 'nowrap'])
00031 
00032 
class Markup(unicode):
    """Marks a string as being safe for inclusion in XML output without needing
    to be escaped.

    Strings are normally automatically escaped when added to the HDF.
    `Markup`-strings are however an exception. Use with care.

    (since Trac 0.9.3)
    """
    def __new__(cls, text='', *args):
        """Create the markup string.

        If positional `args` are given they are escaped and interpolated
        into `text` using the `%` operator.
        """
        if args:
            text %= tuple([escape(arg) for arg in args])
        return unicode.__new__(cls, text)

    def __add__(self, other):
        """Concatenate, escaping `other` unless it is already markup."""
        return Markup(unicode(self) + Markup.escape(other))

    def __mod__(self, args):
        """Interpolate `args` into the markup, escaping each argument.

        A single non-list, non-tuple argument is treated as a one-element
        argument list.
        """
        if not isinstance(args, (list, tuple)):
            args = [args]
        return Markup(unicode.__mod__(self,
                                      tuple([escape(arg) for arg in args])))

    def __mul__(self, num):
        """Repeat the markup `num` times, keeping the `Markup` type."""
        return Markup(unicode(self) * num)

    def join(self, seq):
        """Join the items of `seq` with this markup as separator, escaping
        every item that is not already markup."""
        return Markup(unicode(self).join([Markup.escape(item) for item in seq]))

    def stripentities(self, keepxmlentities=False):
        """Return a copy of the text with any character or numeric entities
        replaced by the equivalent Unicode characters.

        If the `keepxmlentities` parameter is provided and evaluates to `True`,
        the core XML entities (amp, apos, gt, lt and quot) are left intact.
        Unknown named entities are kept verbatim in that mode and reduced to
        their bare name otherwise.

        Numeric references whose code point cannot be represented are left
        untouched instead of raising an error.

        (Since Trac 0.10)
        """
        def _replace_entity(match):
            ref = match.group(1)
            if ref: # numeric entity
                try:
                    # The pattern accepts both "x" and "X" as hex marker, so
                    # both have to be recognized here as well.
                    if ref[0] in 'xX':
                        codepoint = int(ref[1:], 16)
                    else:
                        codepoint = int(ref, 10)
                    return unichr(codepoint)
                except (ValueError, OverflowError):
                    # Code point outside of what unichr() supports
                    return match.group(0)
            else: # character entity
                ref = match.group(2)
                if keepxmlentities and ref in ('amp', 'apos', 'gt', 'lt', 'quot'):
                    return '&%s;' % ref
                try:
                    codepoint = htmlentitydefs.name2codepoint[ref]
                    return unichr(codepoint)
                except KeyError:
                    if keepxmlentities:
                        return '&%s;' % ref
                    else:
                        return ref
        return Markup(re.sub(r'&(?:#((?:\d+)|(?:[xX][0-9a-fA-F]+));?|(\w+);)',
                             _replace_entity, self))

    def striptags(self):
        """Return a copy of the text with all XML/HTML tags removed."""
        return Markup(re.sub(r'<[^>]*?>', '', self))

    def escape(cls, text, quotes=True):
        """Create a Markup instance from a string and escape special characters
        it may contain (<, >, & and \").

        If the `quotes` parameter is set to `False`, the \" character is left
        as is. Escaping quotes is generally only required for strings that are
        to be used in attribute values.

        Objects that already are markup (instances of this class or of
        `Element`) are returned unchanged.
        """
        if isinstance(text, (cls, Element)):
            return text
        text = unicode(text)
        if not text:
            return cls()
        # The ampersand has to be replaced first, otherwise the entities
        # produced by the following replacements would be escaped again.
        text = text.replace('&', '&amp;') \
                   .replace('<', '&lt;') \
                   .replace('>', '&gt;')
        if quotes:
            text = text.replace('"', '&#34;')
        return cls(text)
    escape = classmethod(escape)

    def unescape(self):
        """Reverse-escapes &, <, > and \" and returns a `unicode` object."""
        if not self:
            return ''
        # The ampersand is restored last so that e.g. "&amp;lt;" ends up as
        # "&lt;" rather than "<".
        return unicode(self).replace('&#34;', '"') \
                            .replace('&gt;', '>') \
                            .replace('&lt;', '<') \
                            .replace('&amp;', '&')

    def plaintext(self, keeplinebreaks=True):
        """Returns the text as a `unicode` with all entities and tags removed.

        If `keeplinebreaks` is false, newlines are replaced by spaces.
        """
        text = unicode(self.striptags().stripentities())
        if not keeplinebreaks:
            text = text.replace('\n', ' ')
        return text

    def sanitize(self):
        """Parse the text as HTML and return a cleaned up XHTML representation.

        This will remove any javascript code or other potentially dangerous
        elements.

        If the HTML cannot be parsed, an `HTMLParseError` will be raised by the
        underlying `HTMLParser` module, which should be handled by the caller of
        this function.
        """
        buf = StringIO()
        sanitizer = HTMLSanitizer(buf)
        # NOTE(review): the parser is fed but never closed; presumably input
        # that HTMLParser still buffers at the end is not written -- confirm.
        sanitizer.feed(self.stripentities(keepxmlentities=True))
        return Markup(buf.getvalue())
00149 
00150 
# Module-level shortcut for the `Markup.escape` classmethod.
escape = Markup.escape
00152 
def unescape(text):
    """Undo the escaping of &, <, > and \" for `Markup` objects.

    A `Markup` instance is converted back to a `unicode` object; any other
    value is handed back unchanged.
    """
    if isinstance(text, Markup):
        return text.unescape()
    return text
00158 
00159 
class Deuglifier(object):
    """Base class for regex-driven rewriting of markup.

    Subclasses have to provide a `rules()` callable on the class that returns
    regular expression sources.  The sources are combined into a single
    alternation; the name of the named group that matched selects the
    replacement (see `replace`).
    """

    def __new__(cls):
        self = object.__new__(cls)
        # Look at the class' own namespace only: hasattr() would also find
        # the pattern cached on a base class, and a subclass with different
        # rules would then silently reuse the rules of its parent.
        if '_compiled_rules' not in cls.__dict__:
            cls._compiled_rules = re.compile('(?:' + '|'.join(cls.rules()) + ')')
        self._compiled_rules = cls._compiled_rules
        return self

    def format(self, indata):
        """Return `indata` with every rule match substituted by `replace`."""
        return self._compiled_rules.sub(self.replace, indata)

    def replace(self, fullmatch):
        """Return the replacement text for a single rule match.

        The groups named `font` and `endfont` become an opening and a closing
        `span` tag; any other matching named group becomes an opening `span`
        with the class `code-<group name>`.
        """
        for mtype, match in fullmatch.groupdict().items():
            if match:
                if mtype == 'font':
                    return '<span>'
                elif mtype == 'endfont':
                    return '</span>'
                return '<span class="code-%s">' % mtype
00180 
00181 
class HTMLSanitizer(HTMLParser):
    """Whitelist-based HTML filter that writes cleaned up XHTML to `out`.

    Only the tags in `safe_tags` and the attributes in `safe_attrs` are
    written.  A start tag that is not whitelisted puts the parser into a
    skipping mode (`waiting_for`) in which everything up to the end tag of
    the same name is dropped.  Tags listed in `ignore_tags` are dropped
    themselves while their content is kept.  URI attributes and `url(...)`
    references in inline styles are only kept if their scheme is listed in
    `safe_schemes`.
    """

    safe_tags = frozenset(['a', 'abbr', 'acronym', 'address', 'area',
        'b', 'big', 'blockquote', 'br', 'button', 'caption', 'center',
        'cite', 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir',
        'div', 'dl', 'dt', 'em', 'fieldset', 'font', 'form', 'h1', 'h2',
        'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins', 'kbd',
        'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup',
        'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small',
        'span', 'strike', 'strong', 'sub', 'sup', 'table', 'tbody',
        'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u', 'ul',
        'var'])
    safe_attrs = frozenset(['abbr', 'accept', 'accept-charset',
        'accesskey', 'action', 'align', 'alt', 'axis', 'border', 'bgcolor',
        'cellpadding', 'cellspacing', 'char', 'charoff', 'charset',
        'checked', 'cite', 'class', 'clear', 'cols', 'colspan', 'color',
        'compact', 'coords', 'datetime', 'dir', 'disabled', 'enctype',
        'for', 'frame', 'headers', 'height', 'href', 'hreflang',
        'hspace', 'id', 'ismap', 'label', 'lang', 'longdesc',
        'maxlength', 'media', 'method', 'multiple', 'name', 'nohref',
        'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev', 'rows',
        'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
        'span', 'src', 'start', 'style', 'summary', 'tabindex',
        'target', 'title', 'type', 'usemap', 'valign', 'value',
        'vspace', 'width'])
    ignore_tags = frozenset(['html', 'body'])

    uri_attrs = frozenset(['action', 'background', 'dynsrc', 'href',
                           'lowsrc', 'src'])
    # `None` stands for "no scheme at all", i.e. relative references
    safe_schemes = frozenset(['file', 'ftp', 'http', 'https', 'mailto',
                              None])

    def __init__(self, out):
        """Set up the parser; `out` is a file-like object whose `write`
        method receives the sanitized output."""
        HTMLParser.__init__(self)
        self.out = out
        # Name of the unsafe tag whose end tag is awaited, or None
        self.waiting_for = None

    def _get_scheme(self, text):
        """Return the lower-cased URI scheme of `text`, or `None` if `text`
        contains no colon.

        Characters that are not alphanumeric are discarded from the scheme,
        so a scheme interspersed with whitespace or control characters is
        still recognized.
        """
        if ':' not in text:
            return None
        chars = [char for char in text.split(':', 1)[0]
                 if char.isalnum()]
        return ''.join(chars).lower()

    def _safe_css_decls(self, style):
        """Return the list of declarations of the inline `style` that
        neither use `expression` nor reference a URL with an unsafe scheme.
        """
        decls = []
        for decl in filter(None, style.split(';')):
            # CSS keywords are case-insensitive, so "EXPRESSION(" and
            # "URL(" have to be caught as well.
            lowered = decl.lower()
            is_evil = 'expression' in lowered
            if not is_evil:
                for m in re.finditer(r'url\s*\(([^)]+)', lowered):
                    if self._get_scheme(m.group(1)) not in self.safe_schemes:
                        is_evil = True
                        break
            if not is_evil:
                decls.append(decl.strip())
        return decls

    def handle_starttag(self, tag, attrs):
        if self.waiting_for:
            return
        if tag in self.ignore_tags:
            return

        if tag not in self.safe_tags:
            # Skip everything up to the corresponding end tag
            self.waiting_for = tag
            return
        self.out.write('<' + tag)

        for attrname, attrval in attrs:
            if attrname not in self.safe_attrs:
                continue
            if attrval is None:
                # HTMLParser passes None for attributes written without a
                # value (e.g. <input checked>).  Render boolean attributes
                # in their XHTML form and anything else as an empty value.
                if attrname in _BOOLEAN_ATTRS:
                    attrval = attrname
                else:
                    attrval = u''
            if attrname in self.uri_attrs:
                # Don't allow URI schemes such as "javascript:"
                if self._get_scheme(attrval) not in self.safe_schemes:
                    continue
            elif attrname == 'style':
                # Remove dangerous CSS declarations from inline styles
                decls = self._safe_css_decls(attrval)
                if not decls:
                    continue
                attrval = '; '.join(decls)
            self.out.write(' ' + attrname + '="' + escape(attrval) + '"')

        if tag in _EMPTY_TAGS:
            self.out.write(' />')
        else:
            self.out.write('>')

    def handle_entityref(self, name):
        if not self.waiting_for:
            self.out.write('&%s;' % name)

    def handle_charref(self, name):
        # Without this handler numeric character references would be
        # dropped from the output.
        if not self.waiting_for:
            self.out.write('&#%s;' % name)

    def handle_data(self, data):
        if not self.waiting_for:
            self.out.write(escape(data, quotes=False))

    def handle_endtag(self, tag):
        if tag in self.ignore_tags:
            return

        if self.waiting_for:
            if self.waiting_for == tag:
                self.waiting_for = None
            return
        # Only close what could have been opened: stray end tags of
        # elements that are not whitelisted must not reach the output.
        if tag in self.safe_tags and tag not in _EMPTY_TAGS:
            self.out.write('</' + tag + '>')
00285 
00286 
class Fragment(object):
    """Ordered collection of child nodes (elements, markup and plain values)
    that serializes to the concatenation of its children."""
    __slots__ = ['children']

    def __init__(self):
        self.children = []

    def append(self, node):
        """Add `node` to the children.

        `None` is ignored, the children of another plain `Fragment` are
        merged in, and any other iterable is flattened item by item.
        """
        if node is None:
            return
        if isinstance(node, (Element, Markup, basestring, int, float, long)):
            # Known/primitive types are stored directly, which saves the
            # more expensive iterability probe below
            self.children.append(node)
            return
        if isinstance(node, Fragment):
            self.children.extend(node.children)
            return
        try:
            for item in node:
                self.append(item)
        except TypeError:
            # Not iterable: keep the object itself as a child
            self.children.append(node)

    def __call__(self, *args):
        """Append every positional argument and return the fragment itself,
        which allows calls to be chained."""
        for node in args:
            self.append(node)
        return self

    def serialize(self):
        """Generator producing the text of each child: nested fragments are
        rendered, everything else is escaped (quotes are left alone)."""
        for node in self.children:
            if not isinstance(node, Fragment):
                yield escape(node, quotes=False)
            else:
                yield unicode(node)

    def __unicode__(self):
        parts = self.serialize()
        return u''.join(parts)

    def __str__(self):
        parts = self.serialize()
        return ''.join(parts)

    def __add__(self, other):
        """Return a new `Fragment` holding both operands."""
        combined = Fragment()
        return combined(self, other)
00329 
00330 
class Element(Fragment):
    """Simple XHTML output generator based on the builder pattern.

    Construct XHTML elements by passing the tag name to the constructor:

    >>> print Element('strong')
    <strong></strong>

    Attributes can be specified using keyword arguments. The values of the
    arguments will be converted to strings and any special XML characters
    escaped:

    >>> print Element('textarea', rows=10, cols=60)
    <textarea rows="10" cols="60"></textarea>
    >>> print Element('span', title='1 < 2')
    <span title="1 &lt; 2"></span>
    >>> print Element('span', title='"baz"')
    <span title="&#34;baz&#34;"></span>

    The " character is escaped using a numerical entity.
    The order in which attributes are rendered is undefined.

    If an attribute value evaluates to `None`, that attribute is not included
    in the output:

    >>> print Element('a', name=None)
    <a></a>

    Attribute names that conflict with Python keywords can be specified by
    appending an underscore:

    >>> print Element('div', class_='warning')
    <div class="warning"></div>

    While the tag names and attributes are not restricted to the XHTML language,
    some HTML characteristics such as boolean (minimized) attributes and empty
    elements get special treatment.

    For compatibility with HTML user agents, some XHTML elements need to be
    closed using a separate closing tag even if they are empty. For this, the
    close tag is only omitted for a small set of elements which are known to
    be safe for use as empty elements:

    >>> print Element('br')
    <br />

    Trying to add nested elements to such an element will cause an
    `AssertionError`:

    >>> Element('br')('Oops')
    Traceback (most recent call last):
        ...
    AssertionError: 'br' elements must not have content

    Furthermore, boolean attributes such as "selected" or "checked" are omitted
    if the value evaluates to `False`. Otherwise, the name of the attribute is
    used for the value:

    >>> print Element('option', value=0, selected=False)
    <option value="0"></option>
    >>> print Element('option', selected='yeah')
    <option selected="selected"></option>


    Nested elements can be added to an element by calling the instance using
    positional arguments. The same technique can also be used for adding
    attributes using keyword arguments, as one would do in the constructor:

    >>> print Element('ul')(Element('li'), Element('li'))
    <ul><li></li><li></li></ul>
    >>> print Element('a')('Label')
    <a>Label</a>
    >>> print Element('a')('Label', href="target")
    <a href="target">Label</a>

    Text nodes can be nested in an element by adding strings instead of
    elements. Any special characters in the strings are escaped automatically:

    >>> print Element('em')('Hello world')
    <em>Hello world</em>
    >>> print Element('em')(42)
    <em>42</em>
    >>> print Element('em')('1 < 2')
    <em>1 &lt; 2</em>

    This technique also allows mixed content:

    >>> print Element('p')('Hello ', Element('b')('world'))
    <p>Hello <b>world</b></p>

    Elements can also be combined with other elements or strings using the
    addition operator, which results in a `Fragment` object that contains the
    operands:

    >>> print Element('br') + 'some text' + Element('br')
    <br />some text<br />
    """
    __slots__ = ['tagname', 'attr']

    def __init__(self, tagname_=None, **attr):
        """Create the element `tagname_` with the attributes given as
        keyword arguments."""
        Fragment.__init__(self)
        if tagname_:
            self.tagname = tagname_
        self.attr = {}
        self(**attr)

    def __call__(self, *args, **attr):
        """Add the keyword arguments as attributes and the positional
        arguments as child nodes; returns the element itself."""
        self.attr.update(attr)
        return Fragment.__call__(self, *args)

    def append(self, node):
        """Append an element or string as child node.

        Raises an `AssertionError` if the element is one of the empty
        elements, which must not have any content.
        """
        # Raised explicitly rather than via the `assert` statement so that
        # the documented check is not stripped when running with -O.
        if self.tagname in _EMPTY_TAGS:
            raise AssertionError("'%s' elements must not have content"
                                 % self.tagname)
        Fragment.append(self, node)

    def serialize(self):
        """Generator that yields tags and text nodes as strings."""
        starttag = ['<', self.tagname]
        for name, value in self.attr.items():
            if value is None:
                continue
            if name in _BOOLEAN_ATTRS:
                if not value:
                    continue
                # Minimized attributes use their own name as value
                value = name
            else:
                # class_ -> class, accept_charset -> accept-charset
                name = name.rstrip('_').replace('_', '-')
            starttag.append(' %s="%s"' % (name.lower(), escape(value)))

        if self.children or self.tagname not in _EMPTY_TAGS:
            starttag.append('>')
            yield Markup(''.join(starttag))
            for part in Fragment.serialize(self):
                yield part
            yield Markup('</%s>', self.tagname)

        else:
            starttag.append(' />')
            yield Markup(''.join(starttag))
00471 
00472 
class Tags(object):
    """Element factory: reading any attribute of an instance returns a new
    `Element` whose tag name is the lower-cased attribute name."""

    def __getattribute__(self, name):
        tagname = name.lower()
        return Element(tagname)
00477 
00478 
# Ready-made `Tags` instance: attribute access such as `html.p` creates a new
# `Element` with that tag name.
html = Tags()