Back to index

plone3  3.1.7
utils.py
Go to the documentation of this file.
00001 ##############################################################################
00002 #
00003 # Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.
00004 #
00005 # This software is subject to the provisions of the Zope Public License,
00006 # Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
00007 # THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
00008 # WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
00009 # WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
00010 # FOR A PARTICULAR PURPOSE.
00011 #
00012 ##############################################################################
00013 """ Utility functions.
00014 
00015 $Id: utils.py 79407 2007-08-31 17:05:54Z jens $
00016 """
00017 
00018 import os
00019 import re
00020 import StringIO
00021 import rfc822
00022 from email.Header import make_header
00023 from email.MIMEText import MIMEText
00024 from sgmllib import SGMLParser
00025 
00026 from AccessControl import ModuleSecurityInfo
00027 from Globals import package_home
00028 from Products.PageTemplates.GlobalTranslationService \
00029         import getGlobalTranslationService
00030 from ZTUtils.Zope import complex_marshal
00031 
00032 from zope import i18n
00033 from zope.component import getUtility
00034 from zope.component import queryUtility
00035 from zope.i18n.interfaces import IUserPreferredCharsets
00036 from zope.i18nmessageid import MessageFactory
00037 
00038 from Products.CMFCore.interfaces import IPropertiesTool
00039 
00040 from Products.CMFDefault.interfaces import IHTMLScrubber
00041 from Products.CMFDefault.exceptions import EmailAddressInvalid
00042 from Products.CMFDefault.exceptions import IllegalHTML
00043 
00044 
# Security declarations for the names exported by this module.
security = ModuleSecurityInfo( 'Products.CMFDefault.utils' )

# Paths of the 'dtml' and 'www' subdirectories of this package's home.
security.declarePrivate('_dtmldir')
_dtmldir = os.path.join( package_home( globals() ), 'dtml' )
_wwwdir = os.path.join( package_home( globals() ), 'www' )
00050 
00051 security.declarePublic('formatRFC822Headers')
def formatRFC822Headers( headers ):

    """ Render (key, value) pairs as RFC822-style header text.

    o 'headers' is an iterable of (key, value) string pairs.

    o Each value is split at every single CR or LF character;  trailing
      blank pieces are dropped and the remaining pieces are joined with
      CRLF plus two spaces, so they read as continuation lines.

    o The formatted headers are joined with CRLF and returned as one
      string (no trailing CRLF).
    """
    newline = re.compile( r'[\n\r]+?' )
    formatted = []

    for key, value in headers:

        pieces = newline.split( value )

        # Find how many leading pieces to keep:  everything up to and
        # including the last piece that is not blank.
        keep = len( pieces )
        while keep and pieces[keep - 1].rstrip() == '':
            keep -= 1

        folded = '\r\n  '.join( pieces[:keep] )
        formatted.append( '%s: %s' % ( key, folded ) )

    return '\r\n'.join( formatted )
00072 
00073 
00074 security.declarePublic('parseHeadersBody')
00075 def parseHeadersBody( body, headers=None, rc=re.compile( r'\n|\r\n' ) ):
00076 
00077     """ Parse any leading 'RFC-822'-ish headers from an uploaded
00078         document, returning a dictionary containing the headers
00079         and the stripped body.
00080 
00081         E.g.::
00082 
00083             Title: Some title
00084             Creator: Tres Seaver
00085             Format: text/plain
00086             X-Text-Format: structured
00087 
00088             Overview
00089 
00090             This document .....
00091 
00092             First Section
00093 
00094             ....
00095 
00096 
00097         would be returned as::
00098 
00099             { 'Title' : 'Some title'
00100             , 'Creator' : 'Tres Seaver'
00101             , 'Format' : 'text/plain'
00102             , 'text_format': 'structured'
00103             }
00104 
00105         as the headers, plus the body, starting with 'Overview' as
00106         the first line (the intervening blank line is a separator).
00107 
00108         Allow passing initial dictionary as headers.
00109     """
00110     buffer = StringIO.StringIO(body)
00111     message = rfc822.Message(buffer)
00112 
00113     headers = headers and headers.copy() or {}
00114 
00115     for key in message.keys():
00116         headers[key.capitalize()] = '\n'.join(message.getheaders(key))
00117 
00118     return headers, buffer.read()
00119 
00120 
00121 security.declarePublic('semi_split')
def semi_split(s):

    """ Return the semicolon-separated pieces of 's', each stripped.
    """
    return [ piece.strip() for piece in s.split( ';' ) ]
00127 
00128 security.declarePublic('comma_split')
def comma_split(s):

    """ Return the comma-separated pieces of 's', each stripped.
    """
    return [ piece.strip() for piece in s.split( ',' ) ]
00134 
00135 security.declarePublic('seq_strip')
def seq_strip(seq, stripper=lambda x: x.strip() ):
    """ Apply 'stripper' to every item of a list or tuple.

    o A list yields a new list, a tuple yields a new tuple.

    o Any other type raises ValueError.
    """
    if isinstance(seq, tuple):
        return tuple( [ stripper( item ) for item in seq ] )

    if isinstance(seq, list):
        return [ stripper( item ) for item in seq ]

    raise ValueError( "%s of unsupported sequencetype %s"
                      % ( seq, type( seq ) ) )
00146 
00147 security.declarePublic('tuplize')
def tuplize( valueName, value, splitter=lambda x: x.split() ):

    """ Coerce 'value' to a tuple of stripped strings.

    o Tuples and lists are converted item by item.

    o Strings are first broken up by 'splitter'.

    o Any other type raises ValueError, naming 'valueName' in the
      message.
    """
    if isinstance(value, tuple):
        candidate = value
    elif isinstance(value, list):
        candidate = tuple( value )
    elif isinstance(value, basestring):
        candidate = tuple( splitter( value ) )
    else:
        raise ValueError( "%s of unsupported type" % valueName )

    return seq_strip( candidate )
00160 
00161 
00162 class SimpleHTMLParser( SGMLParser ):
00163 
00164     #from htmlentitydefs import entitydefs
00165 
00166     def __init__( self, verbose=0 ):
00167 
00168         SGMLParser.__init__( self, verbose )
00169         self.savedata = None
00170         self.title = ''
00171         self.metatags = {}
00172         self.body = ''
00173 
00174     def handle_data( self, data ):
00175 
00176         if self.savedata is not None:
00177             self.savedata = self.savedata + data
00178 
00179     def handle_charref( self, ref ):
00180 
00181         self.handle_data( "&#%s;" % ref )
00182 
00183     def handle_entityref( self, ref ):
00184 
00185         self.handle_data( "&%s;" % ref )
00186 
00187     def save_bgn( self ):
00188 
00189         self.savedata = ''
00190 
00191     def save_end( self ):
00192 
00193         data = self.savedata
00194         self.savedata = None
00195         return data
00196 
00197     def start_title( self, attrs ):
00198 
00199         self.save_bgn()
00200 
00201     def end_title( self ):
00202 
00203         self.title = self.save_end()
00204 
00205     def do_meta( self, attrs ):
00206 
00207         name = ''
00208         content = ''
00209 
00210         for attrname, value in attrs:
00211 
00212             value = value.strip()
00213 
00214             if attrname == "name":
00215                 name = value.capitalize()
00216 
00217             if attrname == "content":
00218                 content = value
00219 
00220         if name:
00221             self.metatags[ name ] = content
00222 
00223     def unknown_startag( self, tag, attrs ):
00224 
00225         self.setliteral()
00226 
00227     def unknown_endtag( self, tag ):
00228 
00229         self.setliteral()
00230 
00231 #
00232 #   HTML cleaning code
00233 #
00234 
# These are the HTML tags that we will leave intact
#
# The value is a flag read by StrippingParser:  a true value means the tag
# is emitted as '<tag ...>' and its '</tag>' end tag is passed through;  a
# false value means the tag is emitted self-closed as '<tag ... />' and its
# end tag is dropped.
VALID_TAGS = { 'a'          : 1
             , 'b'          : 1
             , 'base'       : 0
             , 'big'        : 1
             , 'blockquote' : 1
             , 'body'       : 1
             , 'br'         : 0
             , 'caption'    : 1
             , 'cite'       : 1
             , 'code'       : 1
             , 'dd'         : 1
             , 'div'        : 1
             , 'dl'         : 1
             , 'dt'         : 1
             , 'em'         : 1
             , 'h1'         : 1
             , 'h2'         : 1
             , 'h3'         : 1
             , 'h4'         : 1
             , 'h5'         : 1
             , 'h6'         : 1
             , 'head'       : 1
             , 'hr'         : 0
             , 'html'       : 1
             , 'i'          : 1
             , 'img'        : 0
             , 'kbd'        : 1
             , 'li'         : 1
           # , 'link'       : 1 type="script" hoses us
             , 'meta'       : 0
             , 'ol'         : 1
             , 'p'          : 1
             , 'pre'        : 1
             , 'small'      : 1
             , 'span'       : 1
             , 'strong'     : 1
             , 'sub'        : 1
             , 'sup'        : 1
             , 'table'      : 1
             , 'tbody'      : 1
             , 'td'         : 1
             , 'th'         : 1
             , 'title'      : 1
             , 'tr'         : 1
             , 'tt'         : 1
             , 'u'          : 1
             , 'ul'         : 1
             }
00284 
# Tags for which StrippingParser raises IllegalHTML instead of silently
# omitting them (only entries with a true value trigger the exception).
NASTY_TAGS = { 'script'     : 1
             , 'object'     : 1
             , 'embed'      : 1
             , 'applet'     : 1
             }
00290 
00291 
00292 class StrippingParser( SGMLParser ):
00293 
00294     """ Pass only allowed tags;  raise exception for known-bad.
00295     """
00296 
00297     from htmlentitydefs import entitydefs # replace entitydefs from sgmllib
00298 
00299     def __init__( self, valid_tags=None, nasty_tags=None ):
00300 
00301         SGMLParser.__init__( self )
00302         self.result = ""
00303         self.valid_tags = valid_tags or VALID_TAGS
00304         self.nasty_tags = nasty_tags or NASTY_TAGS
00305 
00306     def handle_data( self, data ):
00307 
00308         if data:
00309             self.result = self.result + data
00310 
00311     def handle_charref( self, name ):
00312 
00313         self.result = "%s&#%s;" % ( self.result, name )
00314 
00315     def handle_entityref(self, name):
00316 
00317         if self.entitydefs.has_key(name):
00318             x = ';'
00319         else:
00320             # this breaks unstandard entities that end with ';'
00321             x = ''
00322 
00323         self.result = "%s&%s%s" % (self.result, name, x)
00324 
00325     def unknown_starttag(self, tag, attrs):
00326         """ Delete all tags except for legal ones.
00327         """
00328         if self.valid_tags.has_key(tag):
00329 
00330             self.result = self.result + '<' + tag
00331 
00332             for k, v in attrs:
00333 
00334                 if k.lower().startswith('on'):
00335                     msg = _(u"JavaScript event '${attribute}' not allowed.",
00336                             mapping={'attribute': k})
00337                     raise IllegalHTML(msg)
00338 
00339                 if v.lower().startswith('javascript:'):
00340                     msg = _(u"JavaScript URI '${value}' not allowed.",
00341                             mapping={'value': v})
00342                     raise IllegalHTML(msg)
00343 
00344                 self.result = '%s %s="%s"' % (self.result, k, v)
00345 
00346             endTag = '</%s>' % tag
00347             if self.valid_tags.get(tag):
00348                 self.result = self.result + '>'
00349             else:
00350                 self.result = self.result + ' />'
00351 
00352         elif self.nasty_tags.get(tag):
00353             msg = _(u"Dynamic tag '${tag}' not allowed.",
00354                     mapping={'tag': tag})
00355             raise IllegalHTML(msg)
00356 
00357         else:
00358             pass    # omit tag
00359 
00360     def unknown_endtag(self, tag):
00361 
00362         if self.valid_tags.get(tag):
00363 
00364             self.result = "%s</%s>" % (self.result, tag)
00365             remTag = '</%s>' % tag
00366 
00367 
00368 security.declarePublic('scrubHTML')
def scrubHTML( html ):

    """ Strip illegal HTML tags from string text.

    o A registered IHTMLScrubber utility takes precedence;  its 'scrub'
      result is returned as is.

    o Otherwise the text is run through StrippingParser and the
      parser's accumulated result is returned.
    """
    scrubber = queryUtility(IHTMLScrubber)

    if scrubber is None:
        # No utility registered:  fall back to the built-in parser.
        stripper = StrippingParser()
        stripper.feed( html )
        stripper.close()
        return stripper.result

    return scrubber.scrub(html)
00384 
00385 security.declarePublic('isHTMLSafe')
def isHTMLSafe( html ):

    """ Would current HTML be permitted to be saved?

    o Returns 1 if scrubHTML() accepts the text, 0 if it raises
      IllegalHTML.
    """
    try:
        scrubHTML( html )
    except IllegalHTML:
        return 0
    return 1
00396 
00397 security.declarePublic('bodyfinder')
def bodyfinder(text):
    """ Return body or unchanged text if no body tags found.

    o The search is case-insensitive.

    o The body is whatever lies between the end of the first '<body'
      tag and the last '</body>' that follows it.

    Always use html_headcheck() first.
    """
    lowered = text.lower()
    tag_at = lowered.find('<body')
    if tag_at >= 0:
        tag_close = lowered.find('>', tag_at)
        if tag_close >= 0:
            start = tag_close + 1
            end = lowered.rfind('</body>', start)
            if end >= 0:
                return text[start:end]
    # Any missing piece means there is no usable body element.
    return text
00414 
# Matches, from the start of the text:  any run of whitespace and/or
# '<...>' tags, then '<html', later a '<body ...>' opening tag and finally
# a '</body>' closing tag.  The pattern is lowercase only;  html_headcheck()
# applies it to lowercased text.
security.declarePrivate('_htfinder')
_htfinder = re.compile(r'(\s|(<[^<>]*?>))*<html.*<body.*?>.*</body>',
                       re.DOTALL)
00418 
00419 security.declarePublic('html_headcheck')
def html_headcheck(html):
    """ Return 'true' if document looks HTML-ish enough.

    o The check is case-insensitive:  the lowercased text must contain
      '<html' and match the '_htfinder' pattern.

    If true bodyfinder() will be able to find the HTML body.
    """
    lowered = html.lower()
    # The cheap substring test short-circuits the regex for non-HTML text.
    if '<html' in lowered and _htfinder.match(lowered):
        return 1
    return 0
00432 
00433 security.declarePublic('html_marshal')
def html_marshal(**kw):
    """ Marshal variables for html forms.

    o The keyword arguments are passed to complex_marshal();  each
      resulting (key, converter, value) triple becomes a
      (key + converter, value) pair.

    o Returns the pairs as a tuple.
    """
    marshalled = complex_marshal( kw.items() )
    return tuple( [ (name + suffix, value)
                    for name, suffix, value in marshalled ] )
00440 
00441 security.declarePublic('toUnicode')
def toUnicode(value, charset=None):
    """ Convert value to unicode.

    o Byte strings are decoded with 'charset' if one is given, otherwise
      with the interpreter's default encoding.

    o Lists and tuples are converted item by item into a new list or
      tuple.

    o Dicts have their values converted in place (keys are left alone)
      and the same dict is returned.

    o Anything else is returned unchanged.
    """
    if isinstance(value, str):
        # An explicit branch rather than 'charset and ... or ...':  the
        # and/or form fell through to the default-encoding decode
        # whenever the charset decode legitimately produced u''.
        if charset:
            return unicode(value, charset)
        return unicode(value)
    elif isinstance(value, list):
        return [ toUnicode(val, charset) for val in value ]
    elif isinstance(value, tuple):
        return tuple( [ toUnicode(val, charset) for val in value ] )
    elif isinstance(value, dict):
        for key, val in value.items():
            value[key] = toUnicode(val, charset)
        return value
    else:
        return value
00457 
00458 security.declarePublic('decode')
def decode(value, context):
    """ Decode value using default_charset.

    o The charset is the 'default_charset' property of the
      IPropertiesTool utility (None if the property is missing).

    o 'context' is accepted but not used by this function.
    """
    charset = getUtility(IPropertiesTool).getProperty('default_charset', None)
    return toUnicode(value, charset)
00465 
00466 security.declarePublic('translate')
def translate(message, context):
    """ Translate i18n message.

    o If 'message' is an exception, its first argument is translated
      instead;  an exception that cannot be indexed is passed on as is.

    o Translation uses the global translation service with the
      'cmf_default' domain.
    """
    if isinstance(message, Exception):
        try:
            message = message[0]
        except (TypeError, IndexError):
            pass

    service = getGlobalTranslationService()
    return service.translate('cmf_default', message, context=context)
00477 
00478 security.declarePublic('getBrowserCharset')
def getBrowserCharset(request):
    """ Get charset preferred by the browser.

    o Returns the first entry reported by the request's
      IUserPreferredCharsets adapter, or 'utf-8' if it reports none.
    """
    preferred = IUserPreferredCharsets(request).getPreferredCharsets()
    if preferred:
        return preferred[0]
    return 'utf-8'
00485 
00486 security.declarePublic('makeEmail')
def makeEmail(mtext, context, headers=None):
    """ Make email message.

    o 'mtext' becomes a plain-text body.  It is encoded with the default
      codec if possible;  otherwise with the 'email_charset' property of
      the IPropertiesTool utility ('utf-8' if unset).

    o 'headers' is an optional mapping of header name to value.  Byte
      string values are decoded via decode(), i18n messages are
      translated via translate(), and every space-separated word is
      encoded with the email charset.

    Returns the complete message as a string.
    """
    # None instead of a shared mutable '{}' default.
    if headers is None:
        headers = {}
    ptool = getUtility(IPropertiesTool)
    email_charset = ptool.getProperty('email_charset', None) or 'utf-8'
    try:
        msg = MIMEText(mtext.encode(), 'plain')
    except UnicodeEncodeError:
        msg = MIMEText(mtext.encode(email_charset), 'plain', email_charset)
    for k, val in headers.items():
        if isinstance(val, str):
            val = decode(val, context)
        if isinstance(val, i18n.Message):
            val = translate(val, context)
        header = make_header([ (w, email_charset) for w in val.split(' ') ])
        msg[k] = str(header)
    return msg.as_string()
00504 
# RFC 2822 local-part: dot-atom or quoted-string
# characters allowed in atom: A-Za-z0-9!#$%&'*+-/=?^_`{|}~
# RFC 2821 domain: max 255 characters
#
# Checks the local-part syntax and that 3 to 255 non-'@' characters follow
# the '@'.
# NOTE(review): the class [^(\|")] in the quoted-string branch excludes the
# literal characters '(', '|', '"' and ')';  presumably "anything but
# backslash or quote" was intended -- confirm before changing.
_LOCAL_RE = re.compile(r'([A-Za-z0-9!#$%&\'*+\-/=?^_`{|}~]+'
                     r'(\.[A-Za-z0-9!#$%&\'*+\-/=?^_`{|}~]+)*|'
                     r'"[^(\|")]*")@[^@]{3,255}$')

# RFC 2821 local-part: max 64 characters
# RFC 2821 domain: sequence of dot-separated labels
# characters allowed in label: A-Za-z0-9-, first is a letter
# Even though the RFC does not allow it all-numeric domains do exist
#
# Checks that 1 to 64 non-'@' characters precede the '@' and that the
# domain consists of at least two dot-separated labels, each starting with
# a letter or digit.
_DOMAIN_RE = re.compile(r'[^@]{1,64}@[A-Za-z0-9][A-Za-z0-9-]*'
                                r'(\.[A-Za-z0-9][A-Za-z0-9-]*)+$')
00518 
00519 security.declarePublic('checkEmailAddress')
def checkEmailAddress(address):
    """ Check email address.

    o Raises EmailAddressInvalid unless 'address' matches both
      '_LOCAL_RE' and '_DOMAIN_RE';  returns None otherwise.

    This should catch most invalid but no valid addresses.
    """
    if not (_LOCAL_RE.match(address) and _DOMAIN_RE.match(address)):
        raise EmailAddressInvalid
00529 
# Message factory for the 'cmf_default' i18n domain.  '_' is the alias
# StrippingParser uses to build its error messages.
security.declarePublic('Message')
Message = _ = MessageFactory('cmf_default')