Back to index

plone3  3.1.7
safe_html.py
Go to the documentation of this file.
00001 import logging
00002 from sgmllib import SGMLParser
00003 import re
00004 from cgi import escape
00005 
00006 from Products.PortalTransforms.interfaces import itransform
00007 from Products.PortalTransforms.utils import log
00008 from Products.CMFDefault.utils import bodyfinder
00009 from Products.CMFDefault.utils import IllegalHTML
00010 from Products.CMFDefault.utils import SimpleHTMLParser
00011 from Products.CMFDefault.utils import VALID_TAGS
00012 from Products.CMFDefault.utils import NASTY_TAGS
00013 from Products.PortalTransforms.utils import safeToInt
00014 
# tag mapping: tag -> short or long tag
# Rebind the imported names to copies so the additions below are made on
# this module's own mappings rather than on the objects imported from
# CMFDefault.
VALID_TAGS, NASTY_TAGS = VALID_TAGS.copy(), NASTY_TAGS.copy()

# add some tags to allowed types. This should be fixed in CMFDefault
VALID_TAGS.update({
    'ins': 1,
    'del': 1,
    'q': 1,
    'map': 1,
    'area': 1,
})
00025 
# Template for the message rendered in place of the content when the
# transform fails.  Filled with a (title, message) pair.
msg_pat = """
<div class="system-message">
<p class="system-message-title">System message: %s</p>
%s</div>
"""
00031 
# Lazy run of whitespace that is tolerated in front of every character of a
# keyword, so a keyword broken up by blanks or newlines is still recognised.
_SCRIPT_GAP = r"[\s\n]*?"


def _spaced_keyword(keyword):
    """Return a regex fragment matching keyword with whitespace between its characters."""
    return "".join(_SCRIPT_GAP + char for char in keyword)


# Compiled once at import time.  Group 1 matches "script:" (which also covers
# "javascript:" and "vbscript:"), group 2 matches "expression".
_SCRIPT_PATTERN = re.compile(
    "(%s)|(%s)" % (_spaced_keyword("script:"), _spaced_keyword("expression")),
    re.DOTALL | re.IGNORECASE)


def hasScript(s):
    """Dig out evil Java/VB script inside an HTML attribute.

    Numeric character references in ``s`` are decoded first, then the text
    is searched case-insensitively for "script:" and "expression", allowing
    whitespace between the individual letters.

    Returns the ``findall`` result: a list of ``(script, expression)`` tuples,
    one per hit, which is empty (and therefore false) when nothing suspicious
    was found.
    """
    return _SCRIPT_PATTERN.findall(decode_htmlentities(s))
00039 
# "&#", then an optional code (decimal, or hexadecimal with a leading "x"),
# then an optional ";".  Raw string so that "\w" reaches the regex engine
# untouched; compiled once instead of on every call.
_ENTITY_PATTERN = re.compile(r"&#(?P<htmlentity>x?\w+)?;?")


def decode_htmlentities(s):
    """Return ``s`` with numeric character references decoded.

    XSS code can be hidden with htmlentities, so every match of the
    ``&#...;`` pattern is replaced by whatever ``decode_htmlentity`` returns
    for it.
    """
    return _ENTITY_PATTERN.sub(decode_htmlentity, s)
00046 
def decode_htmlentity(m):
    """Return the replacement text for one numeric character reference match.

    ``m`` is a regex match object exposing a named group ``htmlentity`` that
    holds the code following ``&#`` (decimal, or hexadecimal when it starts
    with ``x``/``X``).

    Returns the decoded character.  When the code cannot be turned into a
    character the bare code text is returned, and when the match carries no
    code at all (e.g. a lone ``&#`` or ``&#;``) the matched text is returned
    unchanged.
    """
    entity_value = m.groupdict()['htmlentity']
    if entity_value is None:
        # The group is optional in the pattern used by decode_htmlentities,
        # so there may be nothing to decode; leave the input as it was
        # instead of failing on None.
        return m.group(0)

    if entity_value.lower().startswith('x'):
        # Prefix a zero so the text reads as a "0x.." literal for base 16.
        digits, base = '0' + entity_value, 16
    else:
        digits, base = entity_value, 10

    try:
        return chr(int(digits, base))
    except (ValueError, OverflowError):
        # Not a number, or a number chr() cannot represent.
        return entity_value
00058 
class StrippingParser(SGMLParser):
    """Pass only allowed tags;  raise exception for known-bad.

    Copied from Products.CMFDefault.utils
    Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.

    The parser collects the cleaned markup piece by piece in ``self.result``;
    call ``getResult()`` after feeding to obtain it as one string.

    Constructor arguments:

    valid -- mapping of allowed tag names; a value for which ``safeToInt``
             is true marks a tag written with a closing tag, any other value
             marks a tag written self-closed (``<tag />``).
    nasty -- mapping of tag names that are dropped together with everything
             up to their end tag, unless they are also listed in ``valid``.
    remove_javascript -- when true, ``on*`` attributes and attribute values
             flagged by ``hasScript`` are rejected.
    raise_error -- when true, rejected content raises ``IllegalHTML``;
             when false it is silently left out.
    """

    from htmlentitydefs import entitydefs # replace entitydefs from sgmllib

    def __init__(self, valid, nasty, remove_javascript, raise_error):
        SGMLParser.__init__(self)
        self.result = []
        self.valid = valid
        self.nasty = nasty
        self.remove_javascript = remove_javascript
        self.raise_error = raise_error
        # True while inside a nasty tag whose content is being discarded.
        self.suppress = False

    def handle_data(self, data):
        """Append text content, escaped, unless output is suppressed."""
        if self.suppress:
            return
        if data:
            self.result.append(escape(data))

    def handle_charref(self, name):
        """Re-emit a numeric character reference unchanged."""
        if self.suppress:
            return
        self.result.append('&#%s;' % name)

    def handle_comment(self, comment):
        """Drop comments from the output."""
        pass

    def handle_decl(self, data):
        """Drop declarations from the output."""
        pass

    def handle_entityref(self, name):
        """Re-emit an entity reference, adding ';' only for known entities."""
        if self.suppress:
            return
        if name in self.entitydefs:
            terminator = ';'
        else:
            # this breaks unstandard entities that end with ';'
            terminator = ''

        self.result.append('&%s%s' % (name, terminator))

    def unknown_starttag(self, tag, attrs):
        """ Delete all tags except for legal ones.

        Valid tags are written out with their attributes; nasty tags switch
        on suppression (and raise ``IllegalHTML`` when ``raise_error`` is
        set); any other tag is omitted while its content is kept.
        """
        if self.suppress:
            return

        if tag in self.valid:
            self.result.append('<' + tag)

            remove_script = getattr(self, 'remove_javascript', True)

            for k, v in attrs:
                if remove_script and k.strip().lower().startswith('on'):
                    if self.raise_error:
                        raise IllegalHTML('Script event "%s" not allowed.' % k)
                    continue
                if remove_script and hasScript(v):
                    if self.raise_error:
                        raise IllegalHTML('Script URI "%s" not allowed.' % v)
                    continue
                # The value is written inside double quotes, so a literal
                # double quote in it would end the attribute early and let
                # the remainder be read as further attributes.  Only the
                # quote is replaced; '&' is left alone so references already
                # present in the value are not escaped a second time.
                self.result.append(' %s="%s"' % (k, v.replace('"', '&quot;')))

            if safeToInt(self.valid.get(tag)):
                self.result.append('>')
            else:
                self.result.append(' />')
        elif tag in self.nasty:
            self.suppress = True
            if self.raise_error:
                raise IllegalHTML('Dynamic tag "%s" not allowed.' % tag)
        # any other tag: omit the tag itself

    def unknown_endtag(self, tag):
        """Close a valid container tag, or end suppression after a nasty tag."""
        if tag in self.nasty and tag not in self.valid:
            self.suppress = False
        if self.suppress:
            return
        if safeToInt(self.valid.get(tag)):
            self.result.append('</%s>' % tag)

    def getResult(self):
        """Return everything collected so far as a single string."""
        return ''.join(self.result)
00146 
def scrubHTML(html, valid=VALID_TAGS, nasty=NASTY_TAGS,
              remove_javascript=True, raise_error=True):
    """ Strip illegal HTML tags from string text.

    Feeds ``html`` through a ``StrippingParser`` built from the given
    settings and returns the text the parser collected.
    """
    settings = {
        'valid': valid,
        'nasty': nasty,
        'remove_javascript': remove_javascript,
        'raise_error': raise_error,
    }
    stripper = StrippingParser(**settings)
    stripper.feed(html)
    stripper.close()
    cleaned = stripper.getResult()
    return cleaned
00158 
class SafeHTML:
    """Simple transform which uses CMFDefault functions to
    clean potentially bad tags.

    Tags must be explicitly allowed in valid_tags to pass. For other
    tags only the tags themselves are removed, not their contents. If
    removed tags are also listed in nasty_tags, they are removed with
    all of their contents.

    Objects will not be transformed again with changed settings.
    You need to clear the cache by e.g.
    1.) restarting your zope or
    2.) empty the zodb-cache via ZMI -> Control_Panel
        -> Database Management -> main || other_used_database
        -> Flush Cache.
    """

    __implements__ = itransform

    __name__ = "safe_html"
    inputs = ('text/html',)
    output = "text/x-html-safe"

    def __init__(self, name=None, **kwargs):
        """Set up the default configuration, then apply ``kwargs`` overrides.

        ``name``, when given and true, replaces the transform's ``__name__``.
        """
        # Description of each option: (type, label, help text[, column names]).
        self.config_metadata = {
            'inputs': (
                'list',
                'Inputs',
                'Input(s) MIME type. Change with care.',
            ),
            'valid_tags': (
                'dict',
                'valid_tags',
                'List of valid html-tags, value is 1 if they '
                'have a closing part (e.g. <p>...</p>) and 0 for empty '
                'tags (like <br />). Be carefull!',
                ('tag', 'value'),
            ),
            'nasty_tags': (
                'dict',
                'nasty_tags',
                'Dynamic Tags that are striped with '
                'everything they contain (like applet, object). '
                'They are only deleted if they are not marked as valid_tags.',
                ('tag', 'value'),
            ),
            'remove_javascript': (
                "int",
                'remove_javascript',
                '1 to remove javascript attributes that begin with on (e.g. onClick) '
                'and attributes where the value starts with "javascript:" '
                '(e.g. <a href="javascript:function()". '
                'This does not effect <script> tags. 0 to leave the attributes.',
            ),
            'disable_transform': (
                "int",
                'disable_transform',
                'If 1, nothing is done.',
            ),
        }

        # Defaults first, caller-supplied settings on top.
        settings = {
            'inputs': self.inputs,
            'output': self.output,
            'valid_tags': VALID_TAGS,
            'nasty_tags': NASTY_TAGS,
            'remove_javascript': 1,
            'disable_transform': 0,
        }
        settings.update(kwargs)
        self.config = settings

        if name:
            self.__name__ = name

    def name(self):
        """Return the name of this transform."""
        return self.__name__

    def __getattr__(self, attr):
        """Serve ``inputs`` and ``output`` from the configuration.

        Any other missing attribute raises ``AttributeError``.
        """
        if attr in ('inputs', 'output'):
            return self.config[attr]
        raise AttributeError(attr)

    def convert(self, orig, data, **kwargs):
        """Store the cleaned version of ``orig`` in ``data`` and return ``data``.

        When the ``disable_transform`` option is set, ``orig`` is stored
        unchanged.  If cleaning raises ``IllegalHTML``, a system message
        built from ``msg_pat`` is stored instead.
        """
        settings = self.config

        # note if we need an upgrade.
        if 'disable_transform' not in settings:
            log(logging.ERROR, 'PortalTransforms safe_html transform needs to be '
                'updated. Please re-install the PortalTransforms product to fix.')

        # if we have a config that we don't want to delete
        # we need a disable option
        if settings.get('disable_transform'):
            data.setData(orig)
            return data

        try:
            safe = scrubHTML(
                bodyfinder(orig),
                valid=settings.get('valid_tags', {}),
                nasty=settings.get('nasty_tags', {}),
                remove_javascript=settings.get('remove_javascript', True),
                raise_error=False)
        except IllegalHTML as inst:
            data.setData(msg_pat % ("Error", str(inst)))
        else:
            data.setData(safe)
        return data
00258 
def register():
    """Return a new SafeHTML transform with its default configuration."""
    transform = SafeHTML()
    return transform