Back to index

plone3  3.1.7
transforms.py
Go to the documentation of this file.
00001 from htmlentitydefs import entitydefs
00002 import re
00003 
00004 
def convertWebIntelligentPlainTextToHtml(orig, tab_width=4):
    """Converts text/x-web-intelligent to text/html

    orig is the plain text: a unicode string, a UTF-8 encoded byte string
    (undecodable bytes are replaced) or None, which is treated as the
    empty string.

    tab_width is the number of &nbsp; entities a leading tab is turned
    into.  It may be anything int() accepts (for example the string '4');
    any other value falls back to 4.

    The text is escaped with html entities, urls and email addresses are
    turned into links, leading whitespace on each line becomes &nbsp; and
    newlines become <br />.  The result is a UTF-8 encoded byte string.
    """
    # Imported locally so that the function runs on Python 2 as well as on
    # Python 3, where the htmlentitydefs module is named html.entities.
    try:
        from htmlentitydefs import codepoint2name
    except ImportError:
        from html.entities import codepoint2name
    try:
        textType = unicode
    except NameError:
        # Python 3: str is the unicode text type
        textType = str

    try:
        # tab_width could be a string like '4'
        tab_width = int(tab_width)
    except (TypeError, ValueError):
        # TypeError is raised for values such as None
        tab_width = 4

    # The ellipsis is spelled as an escape so this source stays pure ASCII
    # and the literal is unicode, like the urls it is joined with.
    def abbreviateUrl(url, maxLength=60, ellipsis=u"[\u2026]"):
        """very long urls are abbreviated to allow nicer layout
        """
        if len(url) < maxLength:
            return url
        protocol = ""
        protocolEnd = url.find("//")
        if protocolEnd != -1:
            protocol = url[:protocolEnd + 2]
            url = url[protocolEnd + 2:]
        parts = url.split("/")
        if len(parts) < 3 or len(parts[0]) + len(parts[-1]) > maxLength:
            # No middle path segments to drop (or host plus last segment
            # are too long already): cut out the middle of the whole url.
            url = protocol + url
            # Floor division: the result is used as a slice index.
            center = (maxLength - 5) // 2
            return url[:center] + ellipsis + url[-center:]

        return protocol + parts[0] + "/" + ellipsis + "/" + parts[-1]

    # Group 1 is the complete url.  The dots between the numbers of an IP
    # address are escaped; a bare "." matches any character, which let the
    # IP branch claim the start of hosts such as 1234567.com.
    urlRegexp = re.compile(
        r'((?:ftp|https?)://'
        r'(localhost'
        r'|([12]?[0-9]{1,2}\.){3}([12]?[0-9]{1,2})'
        r'|(?:[a-z0-9](?:[-a-z0-9]*[a-z0-9])?\.)+'
        r'(?:com|edu|biz|org|gov|int|info|mil|net|name|museum|coop|aero|[a-z][a-z]))'
        r'\b(?::\d+)?'
        r'(?:\/[^"\'<>()\[\]{}\s\x7f-\xff]*'
        r'(?:[.,?]+[^"\'<>()\[\]{}\s\x7f-\xff]+)*)?)',
        re.I | re.S | re.U)
    emailRegexp = re.compile(r'["=]?(\b[A-Z0-9._%-]+@[A-Z0-9._%-]+\.[A-Z]{2,4}\b)', re.I | re.S | re.U)
    indentRegexp = re.compile(r'^(\s+)', re.M | re.U)

    text = orig
    if text is None:
        text = ''
    if not isinstance(text, textType):
        text = textType(text, 'utf-8', 'replace')

    # Make funny characters into html entity defs.  translate() walks the
    # text once, so the & of an inserted entity is never escaped again
    # (< becomes &lt; and not &amp;lt;).  Only code points below 256 get
    # a named entity; other characters are left to the UTF-8 output.
    entityMap = {}
    for codepoint, name in codepoint2name.items():
        if codepoint < 256:
            entityMap[codepoint] = u'&%s;' % name
    text = text.translate(entityMap)

    def replaceURL(match):
        """Replace hyperlinks with clickable <a> tags
        """
        url = match.group(1)
        linktext = abbreviateUrl(url)
        # In urls we need the revert our earlier change to the ampersands.
        # We do not want something like:
        # http://google.com/ask?question=everything&amp;answer=42
        url = url.replace('&amp;', '&')
        # Also with <some link> we should only link to some link, not
        # including the brackets.
        end = ''
        # XXX Probably better to fix the regex above.  Maurits
        if url.endswith('&gt;'):
            url = url[:-len('&gt;')]
            linktext = linktext[:-len('&gt;')]
            end = '&gt;'

        # rel="nofollow" shall avoid spamming
        return '<a href="%s" rel="nofollow">%s</a>%s' % (url, linktext, end)
    text = urlRegexp.sub(replaceURL, text)

    def replaceEmail(match):
        """Replace email strings with mailto: links
        """
        address = match.group(1)
        # following unicode substitutions shall avoid email spam
        # crawlers to pickup email addresses
        address = address.replace('@', '&#0064;')
        return '<a href="&#0109;ailto&#0058;%s">%s</a>' % (address, address)
    text = emailRegexp.sub(replaceEmail, text)

    def indentWhitespace(match):
        """Make leading whitespace on a line into &nbsp; to preserve indents
        """
        indent = match.group(1)
        indent = indent.replace(' ', '&nbsp;')
        return indent.replace('\t', '&nbsp;' * tab_width)
    text = indentRegexp.sub(indentWhitespace, text)

    # Finally, make \n's into br's
    text = text.replace('\n', '<br />')

    return text.encode('utf-8')
00096 
00097 
def convertHtmlToWebIntelligentPlainText(orig):
    """Converts text/html to text/x-web-intelligent.

    orig is the html: a unicode string, a byte string (decoded as UTF-8,
    undecodable bytes are replaced) or None, which is treated as the
    empty string.  The result has the same string type as the input;
    byte strings are returned UTF-8 encoded.

    Whitespace is collapsed, named entities become characters, a handful
    of tags are turned into tabs, newlines and indents, and every other
    tag is removed.  The content of <pre> sections is put back exactly as
    it was found, surrounded by blank lines.
    """
    # Imported locally so that the function runs on Python 2 as well as on
    # Python 3, where the htmlentitydefs module is named html.entities.
    try:
        from htmlentitydefs import name2codepoint
    except ImportError:
        from html.entities import name2codepoint
    try:
        textType, toChar = unicode, unichr
    except NameError:
        # Python 3: str is the unicode text type
        textType, toChar = str, chr

    def compileRegex(pattern, flags=0):
        # Spell \s out as ASCII whitespace: on Python 3 a plain \s would
        # also match characters such as the non-breaking space.
        return re.compile(pattern.replace(r'\s', r'[ \t\n\r\f\v]'), flags)

    preRegex = compileRegex(r'<\s*pre[^>]*>(.*?)<\s*/pre\s*>', re.I | re.S)

    tagWhitespaceRegex = compileRegex(r'\s+((<[^>]+>)\s+)+')
    whitespaceRegex = compileRegex(r'\s+')

    entityRegex = re.compile(r'&([A-Za-z0-9]+);')

    # The tag names below are matched as prefixes on purpose:
    # fixTagWhitespace squeezes the whitespace out of tags, so
    # <td class="x"> may have become <tdclass="x"> by the time these run.
    tdRegex = compileRegex(r'<\s*(td)([^>])*>', re.I)
    breakRegex = compileRegex(r'<\s*(br)\s*/?>', re.I)
    startBlockRegex = compileRegex(r'<\s*(dt)[^>]*>', re.I)
    endBlockRegex = compileRegex(r'<\s*/\s*(p|div|tr|ul|ol|dl)[^>]*>', re.I)
    indentBlockRegex = compileRegex(r'<\s*(blockquote|dd)[^>]*>', re.I)
    listBlockRegex = compileRegex(r'<\s*(li)[^>]*>', re.I)

    tagRegex = re.compile(r'<[^>]+>', re.I | re.M)

    if orig is None:
        orig = ''
    # Work on unicode internally and hand back the type we were given.
    wasBytes = not isinstance(orig, textType)
    if wasBytes:
        orig = textType(orig, 'utf-8', 'replace')

    # Save all <pre> sections and restore after other transforms
    preSections = {}

    def savePres(match):
        marker = '__pre_marker__%d__' % len(preSections)
        preSections[marker] = match.group(1)
        return marker
    text = preRegex.sub(savePres, orig)

    def fixTagWhitespace(match):
        """Make whitespace-tag-whitespace into whitespace-tag.
        Repeat this in case there are directly nested tags.
        """
        # Remove any superfluous whitespace, but preserve one leading space
        return ' ' + whitespaceRegex.sub('', match.group(0))
    text = tagWhitespaceRegex.sub(fixTagWhitespace, text)

    # Make all whitespace into a single space
    text = whitespaceRegex.sub(' ', text)

    # Fix entities
    text = text.replace('&nbsp;', ' ')

    def replaceEntity(match):
        """Turn a known named entity into the character it stands for
        """
        name = match.group(1)
        # Do &lt; and &gt; later, else we may be creating what looks like
        # tags.  &amp; has to be the very last one, else &amp;lt; would
        # first become &lt; and then, wrongly, <.
        if name in ('lt', 'gt', 'amp') or name not in name2codepoint:
            return match.group(0)
        return toChar(name2codepoint[name])
    text = entityRegex.sub(replaceEntity, text)

    # XXX: Remove <head>, <script>, <style> ?

    # Make tabs out of td's
    text = tdRegex.sub('\t', text)

    # Make br's into newlines
    text = breakRegex.sub('\n', text)

    # Make the start of dt's into paragraphs
    text = startBlockRegex.sub('\n\n', text)

    # Make the close of p's, div's, tr's and lists into paragraphs
    text = endBlockRegex.sub('\n\n', text)

    # Make blockquotes and dd blocks indented
    text = indentBlockRegex.sub('\n\n  ', text)

    # Make list items indented and prefixed with -
    text = listBlockRegex.sub('\n\n  - ', text)

    # Remove other tags
    text = tagRegex.sub('', text)

    # Fix < and > entities, and only then the ampersands
    text = text.replace('&lt;', '<')
    text = text.replace('&gt;', '>')
    text = text.replace('&amp;', '&')

    # Restore pres
    for marker, section in preSections.items():
        text = text.replace(marker, '\n\n' + section + '\n\n')

    if wasBytes:
        text = text.encode('utf-8')
    return text