Back to index

plone3  3.1.7
utils.py
Go to the documentation of this file.
00001 import re
00002 import os
00003 import sys
00004 from sgmllib import SGMLParser
00005 
00006 try:
00007     # Need to be imported before win32api to avoid dll loading
00008     # problems.
00009     import pywintypes
00010     import pythoncom
00011 
00012     import win32api
00013     WIN32 = True
00014 except ImportError:
00015     WIN32 = False
00016 
class MissingBinary(Exception):
    """Raised by bin_search() when a binary is not on the search path."""
00018 
# Existing directories listed in the PATH environment variable.  A missing
# PATH yields an empty search path instead of a KeyError at import time.
envPath = os.environ.get('PATH', '')
bin_search_path = [path for path in envPath.split(os.pathsep)
                   if os.path.isdir(path)]

cygwin = 'c:/cygwin'

# cygwin support
if sys.platform == 'win32' and os.path.isdir(cygwin):
    # The sub-directories must be relative: os.path.join() throws away the
    # cygwin root when a later component starts with a slash.
    for p in ['bin', 'usr/bin', 'usr/local/bin']:
        p = os.path.join(cygwin, p)
        if os.path.isdir(p):
            bin_search_path.append(p)

# File name suffixes bin_search() tries in addition to the bare name.
if sys.platform == 'win32':
    extensions = ('.exe', '.com', '.bat', )
else:
    extensions = ()
00036 
def bin_search(binary):
    """Search bin_search_path for a binary and return its full name.

    Each directory is tried with the bare name first and then with every
    suffix in ``extensions``.  A candidate qualifies only when it is a
    regular file that is both readable and executable.

    Raises MissingBinary when no directory holds a matching file.
    """
    mode = os.R_OK | os.X_OK
    for path in bin_search_path:
        for ext in ('', ) + extensions:
            pathbin = os.path.join(path, binary) + ext
            # os.access() alone also succeeds for a directory, which is
            # never a usable binary, so require a regular file as well.
            if os.path.isfile(pathbin) and os.access(pathbin, mode):
                return pathbin

    raise MissingBinary('Unable to find binary "%s" in %s' %
                        (binary, os.pathsep.join(bin_search_path)))
00049 
def getShortPathName(binary):
    """Return the short form of the path ``binary`` where available.

    When the win32 extensions were imported (WIN32 is true) the path is
    converted with win32api.GetShortPathName().  If that call fails the
    failure is logged and the original path is returned.  Without the
    win32 extensions the path is returned unchanged.
    """
    if WIN32:
        try:
            binary = win32api.GetShortPathName(binary)
        except win32api.error:
            # No ``log`` helper is defined or imported in this module, so
            # report through the standard logging package; calling the
            # undefined name would raise NameError on this error path.
            import logging
            logging.getLogger(__name__).warning(
                "Failed to GetShortPathName for '%s'", binary)
    return binary
00057 
def sansext(path):
    """Return the last component of ``path`` without its final extension."""
    filename = os.path.basename(path)
    stem, _suffix = os.path.splitext(filename)
    return stem
00060 
00061 
00062 ##########################################################################
00063 # The code below is taken from CMFDefault.utils to remove
00064 # dependencies for Python-only installations
00065 ##########################################################################
00066 
def bodyfinder(text):
    """ Return the content between the body tags of an HTML string.

    The tags are matched case-insensitively.  The text is returned
    unchanged when there is no opening body tag, when that tag is never
    closed with '>', or when no '</body>' follows it.

    Always use html_headcheck() first.
    """
    lowered = text.lower()

    tag_at = lowered.find('<body')
    if tag_at < 0:
        return text

    # The content starts just past the '>' that closes the opening tag.
    close_bracket = lowered.find('>', tag_at)
    if close_bracket < 0:
        return text
    content_from = close_bracket + 1

    # Use the last closing tag so everything up to it is kept.
    content_to = lowered.rfind('</body>', content_from)
    if content_to < 0:
        return text

    return text[content_from:content_to]
00083 
00084 
00085 #
00086 #   HTML cleaning code
00087 #
00088 
# These are the HTML tags that we will leave intact.  The value is true for
# tags that StrippingParser writes with a separate closing tag and false for
# empty elements, which it writes in self-closing form.
VALID_TAGS = {
    'a': 1,
    'b': 1,
    'base': 0,
    'blockquote': 1,
    'body': 1,
    'br': 0,
    'caption': 1,
    'cite': 1,
    'code': 1,
    'div': 1,
    'dl': 1,
    'dt': 1,
    'dd': 1,
    'em': 1,
    'h1': 1,
    'h2': 1,
    'h3': 1,
    'h4': 1,
    'h5': 1,
    'h6': 1,
    'head': 1,
    'hr': 0,
    'html': 1,
    'i': 1,
    'img': 0,
    'kbd': 1,
    'li': 1,
    'meta': 0,
    'ol': 1,
    'p': 1,
    'pre': 1,
    'span': 1,
    'strong': 1,
    'strike': 1,
    'table': 1,
    'tbody': 1,
    'thead': 1,
    'td': 1,
    'th': 1,
    'title': 1,
    'tr': 1,
    'tt': 1,
    'u': 1,
    'ul': 1,
}
00135 
# Tags for which StrippingParser raises IllegalHTML instead of dropping them.
NASTY_TAGS = {
    'script': 1,
    'object': 1,
    'embed': 1,
    'applet': 1,
}
00141 
class IllegalHTML(ValueError):
    """Raised by StrippingParser for markup that must not be passed on."""
00144 
class StrippingParser(SGMLParser):
    """ Pass only allowed tags;  raise exception for known-bad.

    The filtered markup accumulates in ``self.result`` while the document
    is fed to the parser.  Tags listed in VALID_TAGS are written back,
    tags listed in NASTY_TAGS raise IllegalHTML, and any other tag is
    dropped while the character data inside it is kept.
    """

    # Replace the entitydefs inherited from SGMLParser with the HTML
    # entity table.  The second import covers interpreters where the
    # table lives in html.entities.
    try:
        from htmlentitydefs import entitydefs
    except ImportError:
        from html.entities import entitydefs

    def __init__(self):
        SGMLParser.__init__(self)
        self.result = ""

    def handle_data(self, data):
        """Copy character data to the result unchanged."""
        if data:
            self.result = self.result + data

    def handle_charref(self, name):
        """Write a numeric character reference back as ``&#name;``."""
        self.result = "%s&#%s;" % (self.result, name)

    def handle_entityref(self, name):
        """Write an entity reference back, terminating known ones with ';'."""
        if name in self.entitydefs:
            x = ';'
        else:
            # this breaks non-standard entities that end with ';'
            x = ''

        self.result = "%s&%s%s" % (self.result, name, x)

    def unknown_starttag(self, tag, attrs):
        """ Delete all tags except for legal ones.

        Raises IllegalHTML for an event-handler attribute (name starting
        with 'on'), for an attribute value that is a javascript: URI, and
        for any tag listed in NASTY_TAGS.
        """
        if tag in VALID_TAGS:
            # Validate every attribute before writing anything, so that a
            # rejected tag leaves no half-written markup in the result.
            pieces = ['<' + tag]

            for k, v in attrs:
                if k.lower().startswith('on'):
                    raise IllegalHTML(
                        'Javascipt event "%s" not allowed.' % k)

                # Strip first: leading whitespace must not hide the scheme.
                if v.strip().lower().startswith('javascript:'):
                    raise IllegalHTML(
                        'Javascipt URI "%s" not allowed.' % v)

                # The value is written between double quotes, so a literal
                # double quote inside it must be escaped; otherwise it
                # would end the value early and could inject attributes.
                pieces.append(' %s="%s"' % (k, v.replace('"', '&quot;')))

            # A false value in VALID_TAGS marks an empty element, which is
            # written in self-closing form.
            if VALID_TAGS.get(tag):
                pieces.append('>')
            else:
                pieces.append(' />')

            self.result = self.result + ''.join(pieces)

        elif NASTY_TAGS.get(tag):
            raise IllegalHTML('Dynamic tag "%s" not allowed.' % tag)

        else:
            pass    # omit tag

    def unknown_endtag(self, tag):
        """Write the closing tag for allowed, non-empty elements only."""
        if VALID_TAGS.get(tag):
            self.result = "%s</%s>" % (self.result, tag)
00210 
00211 
def scrubHTML(html):
    """ Strip illegal HTML tags from string text.

    The text is run through a fresh StrippingParser and the markup it
    collected is returned.  IllegalHTML raised by the parser propagates
    to the caller.
    """
    stripper = StrippingParser()
    stripper.feed(html)
    # close() makes the parser process any input it still has buffered.
    stripper.close()
    return stripper.result