
plone3  3.1.7
plone.intelligenttext.transforms Namespace Reference

Functions

def convertWebIntelligentPlainTextToHtml (orig, tab_width=4)
def convertHtmlToWebIntelligentPlainText (orig)

Function Documentation

def plone.intelligenttext.transforms.convertHtmlToWebIntelligentPlainText (orig)

Converts text/html to text/x-web-intelligent.

Definition at line 98 of file transforms.py.

00098 
00099 def convertHtmlToWebIntelligentPlainText(orig):
00100     """Converts text/html to text/x-web-intelligent.
00101     """
00102     preRegex = re.compile(r'<\s*pre[^>]*>(.*?)<\s*/pre\s*>', re.I | re.S)
00103 
00104     tagWhitespaceRegex = re.compile(r'\s+((<[^>]+>)\s+)+')
00105     whitespaceRegex = re.compile(r'\s+')
00106 
00107     tdRegex = re.compile(r'<\s*(td)([^>])*>', re.I)
00108     breakRegex = re.compile(r'<\s*(br)\s*/?>', re.I)
00109     startBlockRegex = re.compile(r'<\s*(dt)[^>]*>', re.I)
00110     endBlockRegex = re.compile(r'<\s*/\s*(p|div|tr|ul|ol|dl)[^>]*>', re.I)
00111     indentBlockRegex = re.compile(r'<\s*(blockquote|dd)[^>]*>', re.I)
00112     listBlockRegex = re.compile(r'<\s*(li)[^>]*>', re.I)
00113 
00114     tagRegex = re.compile(r'<[^>]+>', re.I | re.M)
00115 
00116     # Save all <pre> sections and restore after other transforms
00117     preSections = {}
00118 
00119     def savePres(match):
00120         marker = '__pre_marker__%d__' % len(preSections)
00121         preSections[marker] = match.group(1)
00122         return marker
00123     if orig is None:
00124         orig = ''
00125     text = preRegex.sub(savePres, orig)
00126 
00127     def fixTagWhitespace(match):
00128         """Make whitespace-tag-whitespace into whitespace-tag.
00129         Repeat this in case there are directly nested tags.
00130         """
00131         # Remove any superfluous whitespace, but preserve one leading space
00132         return ' ' + whitespaceRegex.sub('', match.group(0))
00133     text = tagWhitespaceRegex.sub(fixTagWhitespace, text)
00134 
00135     # Make all whitespace into a single space
00136     text = whitespaceRegex.sub(' ', text)
00137 
00138     # Fix entities
00139     text = text.replace('&nbsp;', ' ')
00140     for entity, letter in entitydefs.items():
00141         # Do &lt; and &gt; later, else we may be creating what looks like
00142         # tags
00143         if entity != 'lt' and entity != 'gt':
00144             text = text.replace('&' + entity + ';', letter)
00145 
00146     # XXX: Remove <head>, <script>, <style> ?
00147 
00148     # Make tabs out of td's
00149     text = tdRegex.sub('\t', text)
00150 
00151     # Make br's and li's into newlines
00152     text = breakRegex.sub('\n', text)
00153 
00154     # Make the start of list blocks into paragraphs
00155     text = startBlockRegex.sub('\n\n', text)
00156 
00157     # Make the close of p's, div's and tr's into paragraphs
00158     text = endBlockRegex.sub('\n\n', text)
00159 
00160     # Make blockquotes and dd blocks indented
00161     text = indentBlockRegex.sub('\n\n  ', text)
00162 
00163     # Make list items indented and prefixed with -
00164     text = listBlockRegex.sub('\n\n  - ', text)
00165 
00166     # Remove other tags
00167     text = tagRegex.sub('', text)
00168 
00169     # Fix < and > entities
00170     text = text.replace('&lt;', '<')
00171     text = text.replace('&gt;', '>')
00172 
00173     # Restore pres
00174     for marker, section in preSections.items():
00175         text = text.replace(marker, '\n\n' + section + '\n\n')
00176 
00177     return text
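
A minimal usage sketch (Python 2, matching the listing above). The import path follows the namespace name, and the commented result is derived from the transform rules in the listing rather than captured from a running Plone site:

    from plone.intelligenttext.transforms import convertHtmlToWebIntelligentPlainText

    html = '<p>See <a href="http://plone.org">Plone</a></p><ul><li>one</li><li>two</li></ul>'
    plain = convertHtmlToWebIntelligentPlainText(html)
    # Closing </p> and </ul> tags become paragraph breaks, each <li> becomes
    # an indented "- " bullet, and all remaining tags are stripped, so plain
    # should be roughly:
    # 'See Plone\n\n\n\n  - one\n\n  - two\n\n'
    print plain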

def plone.intelligenttext.transforms.convertWebIntelligentPlainTextToHtml (orig, tab_width=4)

Converts text/x-web-intelligent to text/html.

Definition at line 5 of file transforms.py.

00005 
00006 def convertWebIntelligentPlainTextToHtml(orig, tab_width=4):
00007     """Converts text/x-web-intelligent to text/html
00008     """
00009     try:
00010         # tab_width could be a string like '4'
00011         tab_width = int(tab_width)
00012     except ValueError:
00013         tab_width=4
00014 
00015     def abbreviateUrl(url, max = 60, ellipsis = "[&hellip;]"):
00016         """very long urls are abbreviated to allow nicer layout
00017         """
00018         if len(url) < max:
00019             return url
00020         protocol = ""
00021         protocolend = url.find("//")
00022         if protocolend != -1:
00023             protocol = url[0:protocolend+2]
00024             url = url[protocolend+2:]
00025         list = url.split("/")
00026         if len(list) < 3 or len(list[0])+len(list[-1])>max:
00027             url = protocol + url
00028             center = (max-5)/2
00029             return url[:center] + ellipsis + url[-center:]
00030 
00031         return protocol + list[0] +"/" +ellipsis + "/" + list[-1]
00032 
00033     urlRegexp = re.compile(r'((?:ftp|https?)://(localhost|([12]?[0-9]{1,2}.){3}([12]?[0-9]{1,2})|(?:[a-z0-9](?:[-a-z0-9]*[a-z0-9])?\.)+(?:com|edu|biz|org|gov|int|info|mil|net|name|museum|coop|aero|[a-z][a-z]))\b(?::\d+)?(?:\/[^"\'<>()\[\]{}\s\x7f-\xff]*(?:[.,?]+[^"\'<>()\[\]{}\s\x7f-\xff]+)*)?)', re.I|re.S|re.U)
00034     emailRegexp = re.compile(r'["=]?(\b[A-Z0-9._%-]+@[A-Z0-9._%-]+\.[A-Z]{2,4}\b)', re.I|re.S|re.U)
00035     indentRegexp = re.compile(r'^(\s+)', re.M|re.U)
00036 
00037     text = orig
00038     if text is None:
00039         text = ''
00040     if not isinstance(text, unicode):
00041         text = unicode(text, 'utf-8', 'replace')
00042 
00043     # Do &amp; separately, else, it may replace an already-inserted & from
00044     # an entity with &amp;, so < becomes &lt; becomes &amp;lt;
00045     text = text.replace('&', '&amp;')
00046     # Make funny characters into html entity defs
00047     for entity, letter in entitydefs.items():
00048         if entity != 'amp':
00049             text = text.replace(letter.decode('latin-1'), '&' + entity + ';')
00050 
00051     def replaceURL(match):
00052         """Replace hyperlinks with clickable <a> tags
00053         """
00054         url = match.groups()[0]
00055         linktext = abbreviateUrl(url)
00056         # In urls we need to revert our earlier change to the ampersands.
00057         # We do not want something like:
00058         # http://google.com/ask?question=everything&amp;answer=42
00059         url = url.replace('&amp;', '&')
00060         # Also with <some link> we should only link to some link, not
00061         # including the brackets.
00062         end = ''
00063         # XXX Probably better to fix the regex above.  Maurits
00064         if url.endswith('&gt;'):
00065             url = url[:-len('&gt;')]
00066             linktext = linktext[:-len('&gt;')]
00067             end = '&gt;'
00068 
00069         # rel="nofollow" should discourage link spamming
00070         return '<a href="%s" rel="nofollow">%s</a>%s' % (url, linktext, end)
00071     text = urlRegexp.subn(replaceURL, text)[0]
00072 
00073     def replaceEmail(match):
00074         """Replace email strings with mailto: links
00075         """
00076         url = match.groups()[0]
00077         # the following character-reference substitutions should keep
00078         # email spam crawlers from picking up the address
00079         url = url.replace('@', '&#0064;')
00080         return '<a href="&#0109;ailto&#0058;%s">%s</a>' % (url, url)
00081     text = emailRegexp.subn(replaceEmail, text)[0]
00082 
00083     def indentWhitespace(match):
00084         """Make leading whitespace on a line into &nbsp; to preserve indents
00085         """
00086         indent = match.groups()[0]
00087         indent = indent.replace(' ', '&nbsp;')
00088         return indent.replace('\t', '&nbsp;' * tab_width)
00089     text = indentRegexp.subn(indentWhitespace, text)[0]
00090 
00091     # Finally, make \n's into br's
00092     text = text.replace('\n', '<br />')
00093 
00094     text = text.encode('utf-8')
00095 
00096     return text
00097 
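
A minimal usage sketch (Python 2, matching the listing above). As before, the import path follows the namespace name and the commented result is derived from the listing, not from a live run:

    from plone.intelligenttext.transforms import convertWebIntelligentPlainTextToHtml

    text = u"Visit http://plone.org\n\tSee you there"
    html = convertWebIntelligentPlainTextToHtml(text, tab_width=4)
    # The URL becomes a clickable rel="nofollow" link, the leading tab turns
    # into four &nbsp; entities, the newline becomes <br />, and the result is
    # returned as a UTF-8 encoded string, roughly:
    # 'Visit <a href="http://plone.org" rel="nofollow">http://plone.org</a><br />&nbsp;&nbsp;&nbsp;&nbsp;See you there'
    print html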
