Back to index

plone3  3.1.7
parsers.py
Go to the documentation of this file.
00001 # -*- coding: ISO-8859-15 -*-
00002 # Copyright (c) 2006-2007
00003 # Authors: KSS Project Contributors (see docs/CREDITS.txt)
00004 #
00005 # This program is free software; you can redistribute it and/or modify
00006 # it under the terms of the GNU General Public License version 2 as published
00007 # by the Free Software Foundation.
00008 #
00009 # This program is distributed in the hope that it will be useful,
00010 # but WITHOUT ANY WARRANTY; without even the implied warranty of
00011 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012 # GNU General Public License for more details.
00013 #
00014 # You should have received a copy of the GNU General Public License
00015 # along with this program; if not, write to the Free Software
00016 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
00017 # 02111-1307, USA.
00018 
00019 '''\
00020 Parser implementations
00021 
00022 These ensure that the output is valid XML or HTML, and either fix the
00023 code or raise an exception.
00024 
00025 The wrapping makes it possible to change the parser transparently
00026 if necessary.
00027 '''
00028 
00029 from unicode_quirks import force_unicode
00030 import re, htmlentitydefs
00031 
class replace_html_named_entities(object):
    '''Replace named HTML entities with numeric character references.

    Calling the class returns the converted string directly, not an
    instance: ``replace_html_named_entities('&nbsp;')`` gives
    ``'&#160;'``.  Entity names that are missing from
    ``htmlentitydefs.name2codepoint`` are left untouched.
    '''

    # An entity name is a letter followed by letters or digits.  The digits
    # are required for names such as ``sup2``, ``frac12`` or ``there4``,
    # which a letters-only pattern would silently skip.
    _entity_regexp = re.compile(r'&([A-Za-z][A-Za-z0-9]*);')

    def _entity_replacer(m):
        '''Return the numeric reference for the entity matched by ``m``.

        The matched text is returned unchanged when the name is unknown.
        '''
        value = htmlentitydefs.name2codepoint.get(m.group(1))
        if value is None:
            return m.group(0)
        return "&#%i;" % value
    _entity_replacer = staticmethod(_entity_replacer)

    def _replace(cls, value):
        '''Return ``value`` with every known named entity converted.'''
        return cls._entity_regexp.sub(cls._entity_replacer, value)
    _replace = classmethod(_replace)

    def __new__(cls, value):
        # Returning a non-instance from __new__ means __init__ is never
        # run, so the class behaves like a plain function call.
        return cls._replace(value)
00049 
class XmlParser(object):
    '''XML parser facade.

    Hides the concrete parser (BeautifulStoneSoup) behind a small
    callable object, so the underlying implementation can be replaced
    without touching the callers.
    '''

    from BeautifulSoup import BeautifulStoneSoup

    def __init__(self, value):
        '''Parse ``value`` after passing it through ``force_unicode``.'''
        self.soup = self.BeautifulStoneSoup(force_unicode(value))

    def __call__(self):
        '''Return the parsed document rendered with ``unicode()``.'''
        serialized = unicode(self.soup)
        return serialized
00064         
class HtmlParser(object):
    '''HTML parser facade.

    Hides the concrete parser (BeautifulSoup) behind a small callable
    object, so the underlying implementation can be replaced without
    touching the callers.
    '''

    from BeautifulSoup import BeautifulSoup

    def __init__(self, value):
        '''Parse ``value`` after passing it through ``force_unicode``.'''
        self.soup = self.BeautifulSoup(force_unicode(value))
        #
        # XXX ree: setting the xhtml namespace on the top-level tags is
        # probably not needed any more. See kukit patches r25865, r25866
        # that IMO fix this on IE.
        #
        #for tag in self.soup.fetch(recursive=False):
        #    tag['xmlns'] = "http://www.w3.org/1999/xhtml"

    def __call__(self):
        '''Return the parsed document as unicode, named entities replaced.'''
        # Named HTML entities are always replaced, for two reasons:
        # 1. It fixes an IE bug.
        # 2. The alternate transport mechanism needs it to work.
        return replace_html_named_entities(unicode(self.soup))
00087         # 1. Fixes an IE bug.
00088         # 2. Needed for the alternate transport mechanism to work.
00089         value = replace_html_named_entities(value)
00090 
00091         return value