
plone3  3.1.7
feedparser.py
#!/usr/bin/env python
"""Universal feed parser

Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds

Visit http://feedparser.org/ for the latest version
Visit http://feedparser.org/docs/ for the latest documentation

Required: Python 2.1 or later
Recommended: Python 2.3 or later
Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
"""

__version__ = "4.1"  # + "$Revision: 1.92 $"[11:15] + "-cvs"
__license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,
  this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE."""
__author__ = "Mark Pilgrim <http://diveintomark.org/>"
__contributors__ = ["Jason Diamond <http://injektilo.org/>",
                    "John Beimler <http://john.beimler.org/>",
                    "Fazal Majid <http://www.majid.info/mylos/weblog/>",
                    "Aaron Swartz <http://aaronsw.com/>",
                    "Kevin Marks <http://epeus.blogspot.com/>"]
_debug = 0

# HTTP "User-Agent" header to send to servers when downloading feeds.
# If you are embedding feedparser in a larger application, you should
# change this to your application name and URL.
USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % __version__

# HTTP "Accept" header to send to servers when downloading feeds.  If you don't
# want to send an Accept header, set this to None.
ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"

# List of preferred XML parsers, by SAX driver name.  These will be tried first,
# but if they're not installed, Python will keep searching through its own list
# of pre-installed parsers until it finds one that supports everything we need.
PREFERRED_XML_PARSERS = ["drv_libxml2"]

# If you want feedparser to automatically run HTML markup through HTML Tidy, set
# this to 1.  Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
# or utidylib <http://utidylib.berlios.de/>.
TIDY_MARKUP = 0

# List of Python interfaces for HTML Tidy, in order of preference.  Only useful
# if TIDY_MARKUP = 1
PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]

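# Illustrative sketch, not part of the original module: an application that
# embeds feedparser would typically adjust the knobs above before parsing.
# The application name and URL below are hypothetical placeholders.
def _example_configure_for_embedding():
    global USER_AGENT, ACCEPT_HEADER, TIDY_MARKUP
    USER_AGENT = 'MyAggregator/1.0 +http://example.com/myaggregator'
    ACCEPT_HEADER = 'application/atom+xml,application/rss+xml;q=0.9,*/*;q=0.1'
    TIDY_MARKUP = 0  # leave HTML Tidy off unless mxTidy/utidylib is installed
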
# ---------- required modules (should come with any Python distribution) ----------
import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
try:
    from cStringIO import StringIO as _StringIO
except ImportError:
    from StringIO import StringIO as _StringIO

# ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------

# gzip is included with most Python distributions, but may not be available if you compiled your own
try:
    import gzip
except ImportError:
    gzip = None
try:
    import zlib
except ImportError:
    zlib = None

# If a real XML parser is available, feedparser will attempt to use it.  feedparser has
# been tested with the built-in SAX parser, PyXML, and libxml2.  On platforms where the
# Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
# versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
try:
    import xml.sax
    xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
    from xml.sax.saxutils import escape as _xmlescape
    _XML_AVAILABLE = 1
except:
    _XML_AVAILABLE = 0
    def _xmlescape(data):
        data = data.replace('&', '&amp;')
        data = data.replace('>', '&gt;')
        data = data.replace('<', '&lt;')
        return data

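# Illustrative sketch, not part of the original source: with or without a real
# XML parser, _xmlescape makes text safe to embed in XML by escaping the three
# significant characters.
def _example_xmlescape():
    assert _xmlescape('a < b & c > d') == 'a &lt; b &amp; c &gt; d'
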
# base64 support for Atom feeds that contain embedded binary data
try:
    import base64, binascii
except ImportError:
    base64 = binascii = None

# cjkcodecs and iconv_codec provide support for more character encodings.
# Both are available from http://cjkpython.i18n.org/
try:
    import cjkcodecs.aliases
except ImportError:
    pass
try:
    import iconv_codec
except ImportError:
    pass

# chardet library auto-detects character encodings
# Download from http://chardet.feedparser.org/
try:
    import chardet
    if _debug:
        import chardet.constants
        chardet.constants._debug = 1
except ImportError:
    chardet = None

# ---------- don't touch these ----------
class ThingsNobodyCaresAboutButMe(Exception): pass
class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass
class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass
class UndeclaredNamespace(Exception): pass

sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
sgmllib.special = re.compile('<!')
sgmllib.charref = re.compile('&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]')

SUPPORTED_VERSIONS = {'': 'unknown',
                      'rss090': 'RSS 0.90',
                      'rss091n': 'RSS 0.91 (Netscape)',
                      'rss091u': 'RSS 0.91 (Userland)',
                      'rss092': 'RSS 0.92',
                      'rss093': 'RSS 0.93',
                      'rss094': 'RSS 0.94',
                      'rss20': 'RSS 2.0',
                      'rss10': 'RSS 1.0',
                      'rss': 'RSS (unknown version)',
                      'atom01': 'Atom 0.1',
                      'atom02': 'Atom 0.2',
                      'atom03': 'Atom 0.3',
                      'atom10': 'Atom 1.0',
                      'atom': 'Atom (unknown version)',
                      'cdf': 'CDF',
                      'hotrss': 'Hot RSS'
                      }

try:
    UserDict = dict
except NameError:
    # Python 2.1 does not have dict
    from UserDict import UserDict
    def dict(aList):
        rc = {}
        for k, v in aList:
            rc[k] = v
        return rc

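# Illustrative sketch, not part of the original source: on Python 2.1 the shim
# above rebuilds the dict() builtin from a list of (key, value) pairs, which is
# the only way feedparser calls it.
def _example_dict_shim():
    assert dict([('a', 1), ('b', 2)]) == {'a': 1, 'b': 2}
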
class FeedParserDict(UserDict):
    keymap = {'channel': 'feed',
              'items': 'entries',
              'guid': 'id',
              'date': 'updated',
              'date_parsed': 'updated_parsed',
              'description': ['subtitle', 'summary'],
              'url': ['href'],
              'modified': 'updated',
              'modified_parsed': 'updated_parsed',
              'issued': 'published',
              'issued_parsed': 'published_parsed',
              'copyright': 'rights',
              'copyright_detail': 'rights_detail',
              'tagline': 'subtitle',
              'tagline_detail': 'subtitle_detail'}
    def __getitem__(self, key):
        if key == 'category':
            return UserDict.__getitem__(self, 'tags')[0]['term']
        if key == 'categories':
            return [(tag['scheme'], tag['term']) for tag in UserDict.__getitem__(self, 'tags')]
        realkey = self.keymap.get(key, key)
        if type(realkey) == types.ListType:
            for k in realkey:
                if UserDict.has_key(self, k):
                    return UserDict.__getitem__(self, k)
        if UserDict.has_key(self, key):
            return UserDict.__getitem__(self, key)
        return UserDict.__getitem__(self, realkey)

    def __setitem__(self, key, value):
        for k in self.keymap.keys():
            if key == k:
                key = self.keymap[k]
                if type(key) == types.ListType:
                    key = key[0]
        return UserDict.__setitem__(self, key, value)

    def get(self, key, default=None):
        if self.has_key(key):
            return self[key]
        else:
            return default

    def setdefault(self, key, value):
        if not self.has_key(key):
            self[key] = value
        return self[key]

    def has_key(self, key):
        try:
            return hasattr(self, key) or UserDict.has_key(self, key)
        except AttributeError:
            return False

    def __getattr__(self, key):
        try:
            return self.__dict__[key]
        except KeyError:
            pass
        try:
            assert not key.startswith('_')
            return self.__getitem__(key)
        except:
            raise AttributeError, "object has no attribute '%s'" % key

    def __setattr__(self, key, value):
        if key.startswith('_') or key == 'data':
            self.__dict__[key] = value
        else:
            return self.__setitem__(key, value)

    def __contains__(self, key):
        return self.has_key(key)

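# Illustrative sketch, not part of the original source: the keymap above lets
# client code use old and new element names interchangeably, and keys double
# as attributes.
def _example_feedparserdict():
    d = FeedParserDict()
    d['description'] = 'a subtitle'      # stored under 'subtitle' via keymap
    assert d['subtitle'] == 'a subtitle'
    assert d.subtitle == 'a subtitle'    # attribute access falls back to keys
    assert d['tagline'] == 'a subtitle'  # 'tagline' is an alias for 'subtitle'
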
def zopeCompatibilityHack():
    global FeedParserDict
    del FeedParserDict
    def FeedParserDict(aDict=None):
        rc = {}
        if aDict:
            rc.update(aDict)
        return rc

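# Illustrative note, not part of the original source: Zope's restricted
# execution environment cannot handle FeedParserDict instances, so a Zope
# product calls the hack once, after which parse results are plain dicts:
#
#   import feedparser
#   feedparser.zopeCompatibilityHack()
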
_ebcdic_to_ascii_map = None
def _ebcdic_to_ascii(s):
    global _ebcdic_to_ascii_map
    if not _ebcdic_to_ascii_map:
        emap = (
            0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
            16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
            128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
            144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
            32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
            38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
            45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
            186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
            195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201,
            202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208,
            209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215,
            216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,
            123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237,
            125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243,
            92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249,
            48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255
            )
        import string
        _ebcdic_to_ascii_map = string.maketrans(
            ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
    return s.translate(_ebcdic_to_ascii_map)

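# Illustrative sketch, not part of the original source: the table above maps
# EBCDIC (cp037) bytes to ASCII, so the EBCDIC bytes for 'hello' translate
# cleanly.
def _example_ebcdic_to_ascii():
    assert _ebcdic_to_ascii('\x88\x85\x93\x93\x96') == 'hello'
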
_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
def _urljoin(base, uri):
    uri = _urifixer.sub(r'\1\3', uri)
    return urlparse.urljoin(base, uri)

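# Illustrative sketch, not part of the original source: _urifixer strips the
# stray extra slashes some feeds put after the scheme before the join.
def _example_urljoin():
    assert _urljoin('http://example.com/feed/', 'http:////example.com/a') == 'http://example.com/a'
    assert _urljoin('http://example.com/feed/', 'entry/1') == 'http://example.com/feed/entry/1'
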
class _FeedParserMixin:
    namespaces = {'': '',
                  'http://backend.userland.com/rss': '',
                  'http://blogs.law.harvard.edu/tech/rss': '',
                  'http://purl.org/rss/1.0/': '',
                  'http://my.netscape.com/rdf/simple/0.9/': '',
                  'http://example.com/newformat#': '',
                  'http://example.com/necho': '',
                  'http://purl.org/echo/': '',
                  'uri/of/echo/namespace#': '',
                  'http://purl.org/pie/': '',
                  'http://purl.org/atom/ns#': '',
                  'http://www.w3.org/2005/Atom': '',
                  'http://purl.org/rss/1.0/modules/rss091#': '',

                  'http://webns.net/mvcb/':                               'admin',
                  'http://purl.org/rss/1.0/modules/aggregation/':         'ag',
                  'http://purl.org/rss/1.0/modules/annotate/':            'annotate',
                  'http://media.tangent.org/rss/1.0/':                    'audio',
                  'http://backend.userland.com/blogChannelModule':        'blogChannel',
                  'http://web.resource.org/cc/':                          'cc',
                  'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons',
                  'http://purl.org/rss/1.0/modules/company':              'co',
                  'http://purl.org/rss/1.0/modules/content/':             'content',
                  'http://my.theinfo.org/changed/1.0/rss/':               'cp',
                  'http://purl.org/dc/elements/1.1/':                     'dc',
                  'http://purl.org/dc/terms/':                            'dcterms',
                  'http://purl.org/rss/1.0/modules/email/':               'email',
                  'http://purl.org/rss/1.0/modules/event/':               'ev',
                  'http://rssnamespace.org/feedburner/ext/1.0':           'feedburner',
                  'http://freshmeat.net/rss/fm/':                         'fm',
                  'http://xmlns.com/foaf/0.1/':                           'foaf',
                  'http://www.w3.org/2003/01/geo/wgs84_pos#':             'geo',
                  'http://postneo.com/icbm/':                             'icbm',
                  'http://purl.org/rss/1.0/modules/image/':               'image',
                  'http://www.itunes.com/DTDs/PodCast-1.0.dtd':           'itunes',
                  'http://example.com/DTDs/PodCast-1.0.dtd':              'itunes',
                  'http://purl.org/rss/1.0/modules/link/':                'l',
                  'http://search.yahoo.com/mrss':                         'media',
                  'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
                  'http://prismstandard.org/namespaces/1.2/basic/':       'prism',
                  'http://www.w3.org/1999/02/22-rdf-syntax-ns#':          'rdf',
                  'http://www.w3.org/2000/01/rdf-schema#':                'rdfs',
                  'http://purl.org/rss/1.0/modules/reference/':           'ref',
                  'http://purl.org/rss/1.0/modules/richequiv/':           'reqv',
                  'http://purl.org/rss/1.0/modules/search/':              'search',
                  'http://purl.org/rss/1.0/modules/slash/':               'slash',
                  'http://schemas.xmlsoap.org/soap/envelope/':            'soap',
                  'http://purl.org/rss/1.0/modules/servicestatus/':       'ss',
                  'http://hacks.benhammersley.com/rss/streaming/':        'str',
                  'http://purl.org/rss/1.0/modules/subscription/':        'sub',
                  'http://purl.org/rss/1.0/modules/syndication/':         'sy',
                  'http://purl.org/rss/1.0/modules/taxonomy/':            'taxo',
                  'http://purl.org/rss/1.0/modules/threading/':           'thr',
                  'http://purl.org/rss/1.0/modules/textinput/':           'ti',
                  'http://madskills.com/public/xml/rss/module/trackback/':'trackback',
                  'http://wellformedweb.org/commentAPI/':                 'wfw',
                  'http://purl.org/rss/1.0/modules/wiki/':                'wiki',
                  'http://www.w3.org/1999/xhtml':                         'xhtml',
                  'http://www.w3.org/XML/1998/namespace':                 'xml',
                  'http://schemas.pocketsoap.com/rss/myDescModule/':      'szf'
                  }
    _matchnamespaces = {}

    can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'license', 'icon', 'logo']
    can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
    can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
    html_types = ['text/html', 'application/xhtml+xml']

    def __init__(self, baseuri=None, baselang=None, encoding='utf-8'):
        if _debug: sys.stderr.write('initializing FeedParser\n')
        if not self._matchnamespaces:
            for k, v in self.namespaces.items():
                self._matchnamespaces[k.lower()] = v
        self.feeddata = FeedParserDict() # feed-level data
        self.encoding = encoding # character encoding
        self.entries = [] # list of entry-level data
        self.version = '' # feed type/version, see SUPPORTED_VERSIONS
        self.namespacesInUse = {} # dictionary of namespaces defined by the feed

        # the following are used internally to track state;
        # this is really out of control and should be refactored
        self.infeed = 0
        self.inentry = 0
        self.incontent = 0
        self.intextinput = 0
        self.inimage = 0
        self.inauthor = 0
        self.incontributor = 0
        self.inpublisher = 0
        self.insource = 0
        self.sourcedata = FeedParserDict()
        self.contentparams = FeedParserDict()
        self._summaryKey = None
        self.namespacemap = {}
        self.elementstack = []
        self.basestack = []
        self.langstack = []
        self.baseuri = baseuri or ''
        self.lang = baselang or None
        if baselang:
            self.feeddata['language'] = baselang

    def unknown_starttag(self, tag, attrs):
        if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs))
        # normalize attrs
        attrs = [(k.lower(), v) for k, v in attrs]
        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]

        # track xml:base and xml:lang
        attrsD = dict(attrs)
        baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
        self.baseuri = _urljoin(self.baseuri, baseuri)
        lang = attrsD.get('xml:lang', attrsD.get('lang'))
        if lang == '':
            # xml:lang could be explicitly set to '', we need to capture that
            lang = None
        elif lang is None:
            # if no xml:lang is specified, use parent lang
            lang = self.lang
        if lang:
            if tag in ('feed', 'rss', 'rdf:RDF'):
                self.feeddata['language'] = lang
        self.lang = lang
        self.basestack.append(self.baseuri)
        self.langstack.append(lang)

        # track namespaces
        for prefix, uri in attrs:
            if prefix.startswith('xmlns:'):
                self.trackNamespace(prefix[6:], uri)
            elif prefix == 'xmlns':
                self.trackNamespace(None, uri)

        # track inline content
        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
            # element declared itself as escaped markup, but it isn't really
            self.contentparams['type'] = 'application/xhtml+xml'
        if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
            # Note: probably shouldn't simply recreate localname here, but
            # our namespace handling isn't actually 100% correct in cases where
            # the feed redefines the default namespace (which is actually
            # the usual case for inline content, thanks Sam), so here we
            # cheat and just reconstruct the element based on localname
            # because that compensates for the bugs in our namespace handling.
            # This will horribly munge inline content with non-empty qnames,
            # but nobody actually does that, so I'm not fixing it.
            tag = tag.split(':')[-1]
            return self.handle_data('<%s%s>' % (tag, ''.join([' %s="%s"' % t for t in attrs])), escape=0)

        # match namespaces
        if tag.find(':') != -1:
            prefix, suffix = tag.split(':', 1)
        else:
            prefix, suffix = '', tag
        prefix = self.namespacemap.get(prefix, prefix)
        if prefix:
            prefix = prefix + '_'

        # special hack for better tracking of empty textinput/image elements in illformed feeds
        if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
            self.intextinput = 0
        if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
            self.inimage = 0

        # call special handler (if defined) or default handler
        methodname = '_start_' + prefix + suffix
        try:
            method = getattr(self, methodname)
            return method(attrsD)
        except AttributeError:
            return self.push(prefix + suffix, 1)
    def unknown_endtag(self, tag):
        if _debug: sys.stderr.write('end %s\n' % tag)
        # match namespaces
        if tag.find(':') != -1:
            prefix, suffix = tag.split(':', 1)
        else:
            prefix, suffix = '', tag
        prefix = self.namespacemap.get(prefix, prefix)
        if prefix:
            prefix = prefix + '_'

        # call special handler (if defined) or default handler
        methodname = '_end_' + prefix + suffix
        try:
            method = getattr(self, methodname)
            method()
        except AttributeError:
            self.pop(prefix + suffix)

        # track inline content
        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
            # element declared itself as escaped markup, but it isn't really
            self.contentparams['type'] = 'application/xhtml+xml'
        if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
            tag = tag.split(':')[-1]
            self.handle_data('</%s>' % tag, escape=0)

        # track xml:base and xml:lang going out of scope
        if self.basestack:
            self.basestack.pop()
            if self.basestack and self.basestack[-1]:
                self.baseuri = self.basestack[-1]
        if self.langstack:
            self.langstack.pop()
            if self.langstack: # and (self.langstack[-1] is not None):
                self.lang = self.langstack[-1]

    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        if not self.elementstack: return
        ref = ref.lower()
        if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
            text = '&#%s;' % ref
        else:
            if ref[0] == 'x':
                c = int(ref[1:], 16)
            else:
                c = int(ref)
            text = unichr(c).encode('utf-8')
        self.elementstack[-1][2].append(text)

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        if not self.elementstack: return
        if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref)
        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
            text = '&%s;' % ref
        else:
            # entity resolution graciously donated by Aaron Swartz
            def name2cp(k):
                import htmlentitydefs
                if hasattr(htmlentitydefs, 'name2codepoint'): # requires Python 2.3
                    return htmlentitydefs.name2codepoint[k]
                k = htmlentitydefs.entitydefs[k]
                if k.startswith('&#') and k.endswith(';'):
                    return int(k[2:-1]) # not in latin-1
                return ord(k)
            try: name2cp(ref)
            except KeyError: text = '&%s;' % ref
            else: text = unichr(name2cp(ref)).encode('utf-8')
        self.elementstack[-1][2].append(text)

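    # Illustrative sketch, not part of the original source: on a freshly
    # constructed parser, numeric and named references resolve to UTF-8,
    # while the XML specials stay escaped so later sanitizing still sees
    # them.  Never called by the parser itself.
    def _example_handle_references(self):
        self.push('title', 1)
        self.handle_charref('160')    # non-breaking space -> '\xc2\xa0'
        self.handle_charref('60')     # '<' is kept escaped
        self.handle_entityref('copy') # copyright sign -> '\xc2\xa9'
        self.handle_entityref('amp')  # '&' is kept escaped
        assert self.elementstack[-1][2] == ['\xc2\xa0', '&#60;', '\xc2\xa9', '&amp;']
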
    def handle_data(self, text, escape=1):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        if not self.elementstack: return
        if escape and self.contentparams.get('type') == 'application/xhtml+xml':
            text = _xmlescape(text)
        self.elementstack[-1][2].append(text)

    def handle_comment(self, text):
        # called for each comment, e.g. <!-- insert message here -->
        pass

    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        pass

    def handle_decl(self, text):
        pass

    def parse_declaration(self, i):
        # override internal declaration handler to handle CDATA blocks
        if _debug: sys.stderr.write('entering parse_declaration\n')
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            if k == -1: k = len(self.rawdata)
            self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
            return k+3
        else:
            k = self.rawdata.find('>', i)
            return k+1

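    # Illustrative sketch, not part of the original source: a CDATA block is
    # consumed whole, its contents are re-escaped, and the parser resumes
    # just past the ']]>' terminator.  Never called by the parser itself.
    def _example_parse_declaration(self):
        self.push('summary', 1)
        self.rawdata = '<![CDATA[a < b]]>rest'
        assert self.parse_declaration(0) == 17  # index just past ']]>'
        assert self.elementstack[-1][2] == ['a &lt; b']
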
    def mapContentType(self, contentType):
        contentType = contentType.lower()
        if contentType == 'text':
            contentType = 'text/plain'
        elif contentType == 'html':
            contentType = 'text/html'
        elif contentType == 'xhtml':
            contentType = 'application/xhtml+xml'
        return contentType

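    # Illustrative sketch, not part of the original source: Atom's short type
    # names normalize to full MIME types; anything else passes through
    # lowercased.
    def _example_mapContentType(self):
        assert self.mapContentType('xhtml') == 'application/xhtml+xml'
        assert self.mapContentType('TEXT/HTML') == 'text/html'
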
    def trackNamespace(self, prefix, uri):
        loweruri = uri.lower()
        if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
            self.version = 'rss090'
        if loweruri == 'http://purl.org/rss/1.0/' and not self.version:
            self.version = 'rss10'
        if loweruri == 'http://www.w3.org/2005/atom' and not self.version:
            self.version = 'atom10'
        if loweruri.find('backend.userland.com/rss') != -1:
            # match any backend.userland.com namespace
            uri = 'http://backend.userland.com/rss'
            loweruri = uri
        if self._matchnamespaces.has_key(loweruri):
            self.namespacemap[prefix] = self._matchnamespaces[loweruri]
            self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
        else:
            self.namespacesInUse[prefix or ''] = uri

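    # Illustrative sketch, not part of the original source: on a freshly
    # constructed parser, declaring a known namespace both pins down the feed
    # version and maps the feed's own prefix to feedparser's canonical one.
    # Never called by the parser itself.
    def _example_trackNamespace(self):
        self.trackNamespace(None, 'http://www.w3.org/2005/Atom')
        assert self.version == 'atom10'
        self.trackNamespace('purl', 'http://purl.org/dc/elements/1.1/')
        assert self.namespacemap['purl'] == 'dc'
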
    def resolveURI(self, uri):
        return _urljoin(self.baseuri or '', uri)

    def decodeEntities(self, element, data):
        return data

    def push(self, element, expectingText):
        self.elementstack.append([element, expectingText, []])

    def pop(self, element, stripWhitespace=1):
        if not self.elementstack: return
        if self.elementstack[-1][0] != element: return

        element, expectingText, pieces = self.elementstack.pop()
        output = ''.join(pieces)
        if stripWhitespace:
            output = output.strip()
        if not expectingText: return output

        # decode base64 content
        if base64 and self.contentparams.get('base64', 0):
            try:
                output = base64.decodestring(output)
            except binascii.Error:
                pass
            except binascii.Incomplete:
                pass

        # resolve relative URIs
        if (element in self.can_be_relative_uri) and output:
            output = self.resolveURI(output)

        # decode entities within embedded markup
        if not self.contentparams.get('base64', 0):
            output = self.decodeEntities(element, output)

        # remove temporary cruft from contentparams
        try:
            del self.contentparams['mode']
        except KeyError:
            pass
        try:
            del self.contentparams['base64']
        except KeyError:
            pass

        # resolve relative URIs within embedded markup
        if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
            if element in self.can_contain_relative_uris:
                output = _resolveRelativeURIs(output, self.baseuri, self.encoding)

        # sanitize embedded markup
        if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
            if element in self.can_contain_dangerous_markup:
                output = _sanitizeHTML(output, self.encoding)

        if self.encoding and type(output) != type(u''):
            try:
                output = unicode(output, self.encoding)
            except:
                pass

        # categories/tags/keywords/whatever are handled in _end_category
        if element == 'category':
            return output

        # store output in appropriate place(s)
        if self.inentry and not self.insource:
            if element == 'content':
                self.entries[-1].setdefault(element, [])
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                self.entries[-1][element].append(contentparams)
            elif element == 'link':
                self.entries[-1][element] = output
                if output:
                    self.entries[-1]['links'][-1]['href'] = output
            else:
                if element == 'description':
                    element = 'summary'
                self.entries[-1][element] = output
                if self.incontent:
                    contentparams = copy.deepcopy(self.contentparams)
                    contentparams['value'] = output
                    self.entries[-1][element + '_detail'] = contentparams
        elif (self.infeed or self.insource) and (not self.intextinput) and (not self.inimage):
            context = self._getContext()
            if element == 'description':
                element = 'subtitle'
            context[element] = output
            if element == 'link':
                context['links'][-1]['href'] = output
            elif self.incontent:
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                context[element + '_detail'] = contentparams
        return output

    def pushContent(self, tag, attrsD, defaultContentType, expectingText):
        self.incontent += 1
        self.contentparams = FeedParserDict({
            'type': self.mapContentType(attrsD.get('type', defaultContentType)),
            'language': self.lang,
            'base': self.baseuri})
        self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams)
        self.push(tag, expectingText)

    def popContent(self, tag):
        value = self.pop(tag)
        self.incontent -= 1
        self.contentparams.clear()
        return value

    def _mapToStandardPrefix(self, name):
        colonpos = name.find(':')
        if colonpos != -1:
            prefix = name[:colonpos]
            suffix = name[colonpos+1:]
            prefix = self.namespacemap.get(prefix, prefix)
            name = prefix + ':' + suffix
        return name

    def _getAttribute(self, attrsD, name):
        return attrsD.get(self._mapToStandardPrefix(name))

    def _isBase64(self, attrsD, contentparams):
        if attrsD.get('mode', '') == 'base64':
            return 1
        if self.contentparams['type'].startswith('text/'):
            return 0
        if self.contentparams['type'].endswith('+xml'):
            return 0
        if self.contentparams['type'].endswith('/xml'):
            return 0
        return 1

    def _itsAnHrefDamnIt(self, attrsD):
        href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None)))
        if href:
            try:
                del attrsD['url']
            except KeyError:
                pass
            try:
                del attrsD['uri']
            except KeyError:
                pass
            attrsD['href'] = href
        return attrsD

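    # Illustrative sketch, not part of the original source: whatever attribute
    # a feed uses to carry the link ('url', 'uri', or 'href'), it comes out
    # normalized under 'href'.
    def _example_itsAnHrefDamnIt(self):
        assert self._itsAnHrefDamnIt({'url': 'http://example.com/'}) == {'href': 'http://example.com/'}
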
    def _save(self, key, value):
        context = self._getContext()
        context.setdefault(key, value)

    def _start_rss(self, attrsD):
        versionmap = {'0.91': 'rss091u',
                      '0.92': 'rss092',
                      '0.93': 'rss093',
                      '0.94': 'rss094'}
        if not self.version:
            attr_version = attrsD.get('version', '')
            version = versionmap.get(attr_version)
            if version:
                self.version = version
            elif attr_version.startswith('2.'):
                self.version = 'rss20'
            else:
                self.version = 'rss'

    def _start_dlhottitles(self, attrsD):
        self.version = 'hotrss'

    def _start_channel(self, attrsD):
        self.infeed = 1
        self._cdf_common(attrsD)
    _start_feedinfo = _start_channel

    def _cdf_common(self, attrsD):
        if attrsD.has_key('lastmod'):
            self._start_modified({})
            self.elementstack[-1][-1] = attrsD['lastmod']
            self._end_modified()
        if attrsD.has_key('href'):
            self._start_link({})
            self.elementstack[-1][-1] = attrsD['href']
            self._end_link()

    def _start_feed(self, attrsD):
        self.infeed = 1
        versionmap = {'0.1': 'atom01',
                      '0.2': 'atom02',
                      '0.3': 'atom03'}
        if not self.version:
            attr_version = attrsD.get('version')
            version = versionmap.get(attr_version)
            if version:
                self.version = version
            else:
                self.version = 'atom'

    def _end_channel(self):
        self.infeed = 0
    _end_feed = _end_channel

    def _start_image(self, attrsD):
        self.inimage = 1
        self.push('image', 0)
        context = self._getContext()
        context.setdefault('image', FeedParserDict())

    def _end_image(self):
        self.pop('image')
        self.inimage = 0

    def _start_textinput(self, attrsD):
        self.intextinput = 1
        self.push('textinput', 0)
        context = self._getContext()
        context.setdefault('textinput', FeedParserDict())
    _start_textInput = _start_textinput

    def _end_textinput(self):
        self.pop('textinput')
        self.intextinput = 0
    _end_textInput = _end_textinput

    def _start_author(self, attrsD):
        self.inauthor = 1
        self.push('author', 1)
    _start_managingeditor = _start_author
    _start_dc_author = _start_author
    _start_dc_creator = _start_author
    _start_itunes_author = _start_author

    def _end_author(self):
        self.pop('author')
        self.inauthor = 0
        self._sync_author_detail()
    _end_managingeditor = _end_author
    _end_dc_author = _end_author
    _end_dc_creator = _end_author
    _end_itunes_author = _end_author

    def _start_itunes_owner(self, attrsD):
        self.inpublisher = 1
        self.push('publisher', 0)

    def _end_itunes_owner(self):
        self.pop('publisher')
        self.inpublisher = 0
        self._sync_author_detail('publisher')

    def _start_contributor(self, attrsD):
        self.incontributor = 1
        context = self._getContext()
        context.setdefault('contributors', [])
        context['contributors'].append(FeedParserDict())
        self.push('contributor', 0)

    def _end_contributor(self):
        self.pop('contributor')
        self.incontributor = 0

    def _start_dc_contributor(self, attrsD):
        self.incontributor = 1
        context = self._getContext()
        context.setdefault('contributors', [])
        context['contributors'].append(FeedParserDict())
        self.push('name', 0)

    def _end_dc_contributor(self):
        self._end_name()
        self.incontributor = 0

    def _start_name(self, attrsD):
        self.push('name', 0)
    _start_itunes_name = _start_name

    def _end_name(self):
        value = self.pop('name')
        if self.inpublisher:
            self._save_author('name', value, 'publisher')
        elif self.inauthor:
            self._save_author('name', value)
        elif self.incontributor:
            self._save_contributor('name', value)
        elif self.intextinput:
            context = self._getContext()
            context['textinput']['name'] = value
    _end_itunes_name = _end_name

    def _start_width(self, attrsD):
        self.push('width', 0)

    def _end_width(self):
        value = self.pop('width')
        try:
            value = int(value)
        except:
            value = 0
        if self.inimage:
            context = self._getContext()
            context['image']['width'] = value

    def _start_height(self, attrsD):
        self.push('height', 0)

    def _end_height(self):
        value = self.pop('height')
        try:
            value = int(value)
        except:
            value = 0
        if self.inimage:
            context = self._getContext()
            context['image']['height'] = value

    def _start_url(self, attrsD):
        self.push('href', 1)
    _start_homepage = _start_url
    _start_uri = _start_url

    def _end_url(self):
        value = self.pop('href')
        if self.inauthor:
            self._save_author('href', value)
        elif self.incontributor:
            self._save_contributor('href', value)
        elif self.inimage:
            context = self._getContext()
            context['image']['href'] = value
        elif self.intextinput:
            context = self._getContext()
            context['textinput']['link'] = value
    _end_homepage = _end_url
    _end_uri = _end_url

    def _start_email(self, attrsD):
        self.push('email', 0)
    _start_itunes_email = _start_email

    def _end_email(self):
        value = self.pop('email')
        if self.inpublisher:
            self._save_author('email', value, 'publisher')
        elif self.inauthor:
            self._save_author('email', value)
        elif self.incontributor:
            self._save_contributor('email', value)
    _end_itunes_email = _end_email

    def _getContext(self):
        if self.insource:
            context = self.sourcedata
        elif self.inentry:
            context = self.entries[-1]
        else:
            context = self.feeddata
        return context

    def _save_author(self, key, value, prefix='author'):
        context = self._getContext()
        context.setdefault(prefix + '_detail', FeedParserDict())
        context[prefix + '_detail'][key] = value
        self._sync_author_detail()

    def _save_contributor(self, key, value):
        context = self._getContext()
        context.setdefault('contributors', [FeedParserDict()])
        context['contributors'][-1][key] = value

    def _sync_author_detail(self, key='author'):
        context = self._getContext()
        detail = context.get('%s_detail' % key)
        if detail:
            name = detail.get('name')
            email = detail.get('email')
            if name and email:
                context[key] = '%s (%s)' % (name, email)
            elif name:
                context[key] = name
            elif email:
                context[key] = email
        else:
            author = context.get(key)
            if not author: return
            emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))''', author)
            if not emailmatch: return
            email = emailmatch.group(0)
            # probably a better way to do the following, but it passes all the tests
            author = author.replace(email, '')
            author = author.replace('()', '')
            author = author.strip()
            if author and (author[0] == '('):
                author = author[1:]
            if author and (author[-1] == ')'):
                author = author[:-1]
            author = author.strip()
            context.setdefault('%s_detail' % key, FeedParserDict())
            context['%s_detail' % key]['name'] = author
            context['%s_detail' % key]['email'] = email

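    # Illustrative sketch, not part of the original source: on a freshly
    # constructed parser, an RSS-style author string is split into the
    # structured *_detail dict.  Never called by the parser itself.
    def _example_sync_author_detail(self):
        self.feeddata['author'] = 'mark@example.com (Mark Pilgrim)'
        self._sync_author_detail()
        assert self.feeddata['author_detail']['name'] == 'Mark Pilgrim'
        assert self.feeddata['author_detail']['email'] == 'mark@example.com'
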
    def _start_subtitle(self, attrsD):
        self.pushContent('subtitle', attrsD, 'text/plain', 1)
    _start_tagline = _start_subtitle
    _start_itunes_subtitle = _start_subtitle

    def _end_subtitle(self):
        self.popContent('subtitle')
    _end_tagline = _end_subtitle
    _end_itunes_subtitle = _end_subtitle

    def _start_rights(self, attrsD):
        self.pushContent('rights', attrsD, 'text/plain', 1)
    _start_dc_rights = _start_rights
    _start_copyright = _start_rights

    def _end_rights(self):
        self.popContent('rights')
    _end_dc_rights = _end_rights
    _end_copyright = _end_rights

    def _start_item(self, attrsD):
        self.entries.append(FeedParserDict())
        self.push('item', 0)
        self.inentry = 1
        self.guidislink = 0
        id = self._getAttribute(attrsD, 'rdf:about')
        if id:
            context = self._getContext()
            context['id'] = id
        self._cdf_common(attrsD)
    _start_entry = _start_item
    _start_product = _start_item

    def _end_item(self):
        self.pop('item')
        self.inentry = 0
    _end_entry = _end_item

    def _start_dc_language(self, attrsD):
        self.push('language', 1)
    _start_language = _start_dc_language

    def _end_dc_language(self):
        self.lang = self.pop('language')
    _end_language = _end_dc_language

    def _start_dc_publisher(self, attrsD):
        self.push('publisher', 1)
    _start_webmaster = _start_dc_publisher

    def _end_dc_publisher(self):
        self.pop('publisher')
        self._sync_author_detail('publisher')
    _end_webmaster = _end_dc_publisher

    def _start_published(self, attrsD):
        self.push('published', 1)
    _start_dcterms_issued = _start_published
    _start_issued = _start_published

    def _end_published(self):
        value = self.pop('published')
        self._save('published_parsed', _parse_date(value))
    _end_dcterms_issued = _end_published
    _end_issued = _end_published

    def _start_updated(self, attrsD):
        self.push('updated', 1)
    _start_modified = _start_updated
    _start_dcterms_modified = _start_updated
    _start_pubdate = _start_updated
    _start_dc_date = _start_updated

    def _end_updated(self):
        value = self.pop('updated')
        parsed_value = _parse_date(value)
        self._save('updated_parsed', parsed_value)
    _end_modified = _end_updated
    _end_dcterms_modified = _end_updated
    _end_pubdate = _end_updated
    _end_dc_date = _end_updated

    def _start_created(self, attrsD):
        self.push('created', 1)
    _start_dcterms_created = _start_created

    def _end_created(self):
        value = self.pop('created')
        self._save('created_parsed', _parse_date(value))
    _end_dcterms_created = _end_created

    def _start_expirationdate(self, attrsD):
        self.push('expired', 1)

    def _end_expirationdate(self):
        self._save('expired_parsed', _parse_date(self.pop('expired')))

    def _start_cc_license(self, attrsD):
        self.push('license', 1)
        value = self._getAttribute(attrsD, 'rdf:resource')
        if value:
            self.elementstack[-1][2].append(value)
        self.pop('license')

    def _start_creativecommons_license(self, attrsD):
        self.push('license', 1)

    def _end_creativecommons_license(self):
        self.pop('license')

    def _addTag(self, term, scheme, label):
        context = self._getContext()
        tags = context.setdefault('tags', [])
        if (not term) and (not scheme) and (not label): return
        value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label})
        if value not in tags:
            tags.append(FeedParserDict({'term': term, 'scheme': scheme, 'label': label}))

    def _start_category(self, attrsD):
        if _debug: sys.stderr.write('entering _start_category with %s\n' % repr(attrsD))
        term = attrsD.get('term')
        scheme = attrsD.get('scheme', attrsD.get('domain'))
        label = attrsD.get('label')
        self._addTag(term, scheme, label)
        self.push('category', 1)
    _start_dc_subject = _start_category
    _start_keywords = _start_category

    def _end_itunes_keywords(self):
        for term in self.pop('itunes_keywords').split():
            self._addTag(term, 'http://www.itunes.com/', None)

    def _start_itunes_category(self, attrsD):
        self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None)
        self.push('category', 1)

    def _end_category(self):
        value = self.pop('category')
        if not value: return
        context = self._getContext()
        tags = context['tags']
        if value and len(tags) and not tags[-1]['term']:
            tags[-1]['term'] = value
        else:
            self._addTag(value, None, None)
    _end_dc_subject = _end_category
    _end_keywords = _end_category
    _end_itunes_category = _end_category

    def _start_cloud(self, attrsD):
        self._getContext()['cloud'] = FeedParserDict(attrsD)

    def _start_link(self, attrsD):
        attrsD.setdefault('rel', 'alternate')
        attrsD.setdefault('type', 'text/html')
        attrsD = self._itsAnHrefDamnIt(attrsD)
        if attrsD.has_key('href'):
            attrsD['href'] = self.resolveURI(attrsD['href'])
        expectingText = self.infeed or self.inentry or self.insource
        context = self._getContext()
        context.setdefault('links', [])
        context['links'].append(FeedParserDict(attrsD))
        if attrsD['rel'] == 'enclosure':
            self._start_enclosure(attrsD)
        if attrsD.has_key('href'):
            expectingText = 0
            if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types):
                context['link'] = attrsD['href']
        else:
            self.push('link', expectingText)
    _start_producturl = _start_link

    def _end_link(self):
        value = self.pop('link')
        context = self._getContext()
        if self.intextinput:
            context['textinput']['link'] = value
        if self.inimage:
            context['image']['link'] = value
    _end_producturl = _end_link

    def _start_guid(self, attrsD):
        self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
        self.push('id', 1)

    def _end_guid(self):
        value = self.pop('id')
        self._save('guidislink', self.guidislink and not self._getContext().has_key('link'))
        if self.guidislink:
            # guid acts as link, but only if 'ispermalink' is not present or is 'true',
            # and only if the item doesn't already have a link element
            self._save('link', value)

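    # Illustrative sketch, not part of the original source: on a freshly
    # constructed parser, a permalink guid doubles as the entry's link when
    # no explicit link element was seen.  Never called by the parser itself.
    def _example_guid_as_link(self):
        self._start_item({})
        self._start_guid({'ispermalink': 'true'})
        self.handle_data('http://example.com/post/1')
        self._end_guid()
        self._end_item()
        assert self.entries[-1]['id'] == 'http://example.com/post/1'
        assert self.entries[-1]['link'] == 'http://example.com/post/1'
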
01187     def _start_title(self, attrsD):
01188         self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
01189     _start_dc_title = _start_title
01190     _start_media_title = _start_title
01191 
01192     def _end_title(self):
01193         value = self.popContent('title')
01194         context = self._getContext()
01195         if self.intextinput:
01196             context['textinput']['title'] = value
01197         elif self.inimage:
01198             context['image']['title'] = value
01199     _end_dc_title = _end_title
01200     _end_media_title = _end_title
01201 
01202     def _start_description(self, attrsD):
01203         context = self._getContext()
01204         if context.has_key('summary'):
01205             self._summaryKey = 'content'
01206             self._start_content(attrsD)
01207         else:
01208             self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource)
01209 
01210     def _start_abstract(self, attrsD):
01211         self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
01212 
01213     def _end_description(self):
01214         if self._summaryKey == 'content':
01215             self._end_content()
01216         else:
01217             value = self.popContent('description')
01218             context = self._getContext()
01219             if self.intextinput:
01220                 context['textinput']['description'] = value
01221             elif self.inimage:
01222                 context['image']['description'] = value
01223         self._summaryKey = None
01224     _end_abstract = _end_description
01225 
01226     def _start_info(self, attrsD):
01227         self.pushContent('info', attrsD, 'text/plain', 1)
01228     _start_feedburner_browserfriendly = _start_info
01229 
01230     def _end_info(self):
01231         self.popContent('info')
01232     _end_feedburner_browserfriendly = _end_info
01233 
01234     def _start_generator(self, attrsD):
01235         if attrsD:
01236             attrsD = self._itsAnHrefDamnIt(attrsD)
01237             if attrsD.has_key('href'):
01238                 attrsD['href'] = self.resolveURI(attrsD['href'])
01239         self._getContext()['generator_detail'] = FeedParserDict(attrsD)
01240         self.push('generator', 1)
01241 
01242     def _end_generator(self):
01243         value = self.pop('generator')
01244         context = self._getContext()
01245         if context.has_key('generator_detail'):
01246             context['generator_detail']['name'] = value
01247             
01248     def _start_admin_generatoragent(self, attrsD):
01249         self.push('generator', 1)
01250         value = self._getAttribute(attrsD, 'rdf:resource')
01251         if value:
01252             self.elementstack[-1][2].append(value)
01253         self.pop('generator')
01254         self._getContext()['generator_detail'] = FeedParserDict({'href': value})
01255 
01256     def _start_admin_errorreportsto(self, attrsD):
01257         self.push('errorreportsto', 1)
01258         value = self._getAttribute(attrsD, 'rdf:resource')
01259         if value:
01260             self.elementstack[-1][2].append(value)
01261         self.pop('errorreportsto')
01262         
01263     def _start_summary(self, attrsD):
01264         context = self._getContext()
01265         if context.has_key('summary'):
01266             self._summaryKey = 'content'
01267             self._start_content(attrsD)
01268         else:
01269             self._summaryKey = 'summary'
01270             self.pushContent(self._summaryKey, attrsD, 'text/plain', 1)
01271     _start_itunes_summary = _start_summary
01272 
01273     def _end_summary(self):
01274         if self._summaryKey == 'content':
01275             self._end_content()
01276         else:
01277             self.popContent(self._summaryKey or 'summary')
01278         self._summaryKey = None
01279     _end_itunes_summary = _end_summary
01280         
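    # Net effect of the rerouting above (a sketch, relying on feedparser's
    # description-to-summary mapping): when an element carries both a
    # summary and a description, the first one parsed is kept as the
    # summary and the second is handled as an extra content object instead
    # of overwriting it:
    #
    #     <summary>short</summary> ... <description>long</description>
    #       -> entry['summary'] == 'short'; 'long' stored under content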
01281     def _start_enclosure(self, attrsD):
01282         attrsD = self._itsAnHrefDamnIt(attrsD)
01283         self._getContext().setdefault('enclosures', []).append(FeedParserDict(attrsD))
01284         href = attrsD.get('href')
01285         if href:
01286             context = self._getContext()
01287             if not context.get('id'):
01288                 context['id'] = href
01289             
01290     def _start_source(self, attrsD):
01291         self.insource = 1
01292 
01293     def _end_source(self):
01294         self.insource = 0
01295         self._getContext()['source'] = copy.deepcopy(self.sourcedata)
01296         self.sourcedata.clear()
01297 
01298     def _start_content(self, attrsD):
01299         self.pushContent('content', attrsD, 'text/plain', 1)
01300         src = attrsD.get('src')
01301         if src:
01302             self.contentparams['src'] = src
01303         self.push('content', 1)
01304 
01305     def _start_prodlink(self, attrsD):
01306         self.pushContent('content', attrsD, 'text/html', 1)
01307 
01308     def _start_body(self, attrsD):
01309         self.pushContent('content', attrsD, 'application/xhtml+xml', 1)
01310     _start_xhtml_body = _start_body
01311 
01312     def _start_content_encoded(self, attrsD):
01313         self.pushContent('content', attrsD, 'text/html', 1)
01314     _start_fullitem = _start_content_encoded
01315 
01316     def _end_content(self):
01317         copyToDescription = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types)
01318         value = self.popContent('content')
01319         if copyToDescription:
01320             self._save('description', value)
01321     _end_body = _end_content
01322     _end_xhtml_body = _end_content
01323     _end_content_encoded = _end_content
01324     _end_fullitem = _end_content
01325     _end_prodlink = _end_content
01326 
01327     def _start_itunes_image(self, attrsD):
01328         self.push('itunes_image', 0)
01329         self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})
01330     _start_itunes_link = _start_itunes_image
01331         
01332     def _end_itunes_block(self):
01333         value = self.pop('itunes_block', 0)
01334         self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0
01335 
01336     def _end_itunes_explicit(self):
01337         value = self.pop('itunes_explicit', 0)
01338         self._getContext()['itunes_explicit'] = (value == 'yes') and 1 or 0
01339 
01340 if _XML_AVAILABLE:
01341     class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
01342         def __init__(self, baseuri, baselang, encoding):
01343             if _debug: sys.stderr.write('trying StrictFeedParser\n')
01344             xml.sax.handler.ContentHandler.__init__(self)
01345             _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
01346             self.bozo = 0
01347             self.exc = None
01348         
01349         def startPrefixMapping(self, prefix, uri):
01350             self.trackNamespace(prefix, uri)
01351         
01352         def startElementNS(self, name, qname, attrs):
01353             namespace, localname = name
01354             lowernamespace = str(namespace or '').lower()
01355             if lowernamespace.find('backend.userland.com/rss') != -1:
01356                 # match any backend.userland.com namespace
01357                 namespace = 'http://backend.userland.com/rss'
01358                 lowernamespace = namespace
01359             if qname and qname.find(':') > 0:
01360                 givenprefix = qname.split(':')[0]
01361             else:
01362                 givenprefix = None
01363             prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
01364             if givenprefix and (prefix is None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix):
01365                 raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix
01366             if prefix:
01367                 localname = prefix + ':' + localname
01368             localname = str(localname).lower()
01369             if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname))
01370 
01371             # qname implementation is horribly broken in Python 2.1 (it
01372             # doesn't report any), and slightly broken in Python 2.2 (it
01373             # doesn't report the xml: namespace). So we match up namespaces
01374             # with a known list first, and then possibly override them with
01375             # the qnames the SAX parser gives us (if indeed it gives us any
01376             # at all).  Thanks to MatejC for helping me test this and
01377             # tirelessly telling me that it didn't work yet.
01378             attrsD = {}
01379             for (namespace, attrlocalname), attrvalue in attrs._attrs.items():
01380                 lowernamespace = (namespace or '').lower()
01381                 prefix = self._matchnamespaces.get(lowernamespace, '')
01382                 if prefix:
01383                     attrlocalname = prefix + ':' + attrlocalname
01384                 attrsD[str(attrlocalname).lower()] = attrvalue
01385             for qname in attrs.getQNames():
01386                 attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
01387             self.unknown_starttag(localname, attrsD.items())
01388 
01389         def characters(self, text):
01390             self.handle_data(text)
01391 
01392         def endElementNS(self, name, qname):
01393             namespace, localname = name
01394             lowernamespace = str(namespace or '').lower()
01395             if qname and qname.find(':') > 0:
01396                 givenprefix = qname.split(':')[0]
01397             else:
01398                 givenprefix = ''
01399             prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
01400             if prefix:
01401                 localname = prefix + ':' + localname
01402             localname = str(localname).lower()
01403             self.unknown_endtag(localname)
01404 
01405         def error(self, exc):
01406             self.bozo = 1
01407             self.exc = exc
01408             
01409         def fatalError(self, exc):
01410             self.error(exc)
01411             raise exc
01412 
01413 class _BaseHTMLProcessor(sgmllib.SGMLParser):
01414     elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
01415       'img', 'input', 'isindex', 'link', 'meta', 'param']
01416     
01417     def __init__(self, encoding):
01418         self.encoding = encoding
01419         if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
01420         sgmllib.SGMLParser.__init__(self)
01421         
01422     def reset(self):
01423         self.pieces = []
01424         sgmllib.SGMLParser.reset(self)
01425 
01426     def _shorttag_replace(self, match):
01427         tag = match.group(1)
01428         if tag in self.elements_no_end_tag:
01429             return '<' + tag + ' />'
01430         else:
01431             return '<' + tag + '></' + tag + '>'
01432         
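    # A doctest-style sketch of the rule above, using the same regular
    # expression that feed() applies below:
    #
    #     >>> p = _BaseHTMLProcessor('utf-8')
    #     >>> re.sub(r'<([^<\s]+?)\s*/>', p._shorttag_replace, '<br/><p/>')
    #     '<br /><p></p>'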
01433     def feed(self, data):
01434         data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
01435         #data = re.sub(r'<(\S+?)\s*?/>', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace
01436         data = re.sub(r'<([^<\s]+?)\s*/>', self._shorttag_replace, data) 
01437         data = data.replace('&#39;', "'")
01438         data = data.replace('&#34;', '"')
01439         if self.encoding and type(data) == type(u''):
01440             data = data.encode(self.encoding)
01441         sgmllib.SGMLParser.feed(self, data)
01442 
01443     def normalize_attrs(self, attrs):
01444         # utility method to be called by descendants
01445         attrs = [(k.lower(), v) for k, v in attrs]
01446         attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
01447         return attrs
01448 
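    # For example (sketch):
    #
    #     >>> p = _BaseHTMLProcessor('utf-8')
    #     >>> p.normalize_attrs([('HREF', '/x'), ('Rel', 'NOFOLLOW')])
    #     [('href', '/x'), ('rel', 'nofollow')]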
01449     def unknown_starttag(self, tag, attrs):
01450         # called for each start tag
01451         # attrs is a list of (attr, value) tuples
01452         # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
01453         if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
01454         uattrs = []
01455         # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
01456         for key, value in attrs:
01457             if type(value) != type(u''):
01458                 value = unicode(value, self.encoding)
01459             uattrs.append((unicode(key, self.encoding), value))
01460         strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs]).encode(self.encoding)
01461         if tag in self.elements_no_end_tag:
01462             self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
01463         else:
01464             self.pieces.append('<%(tag)s%(strattrs)s>' % locals())
01465 
01466     def unknown_endtag(self, tag):
01467         # called for each end tag, e.g. for </pre>, tag will be 'pre'
01468         # Reconstruct the original end tag.
01469         if tag not in self.elements_no_end_tag:
01470             self.pieces.append("</%(tag)s>" % locals())
01471 
01472     def handle_charref(self, ref):
01473         # called for each character reference, e.g. for '&#160;', ref will be '160'
01474         # Reconstruct the original character reference.
01475         self.pieces.append('&#%(ref)s;' % locals())
01476         
01477     def handle_entityref(self, ref):
01478         # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
01479         # Reconstruct the original entity reference.
01480         self.pieces.append('&%(ref)s;' % locals())
01481 
01482     def handle_data(self, text):
01483         # called for each block of plain text, i.e. outside of any tag and
01484         # not containing any character or entity references
01485         # Store the original text verbatim.
01486         if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)
01487         self.pieces.append(text)
01488         
01489     def handle_comment(self, text):
01490         # called for each HTML comment, e.g. <!-- insert Javascript code here -->
01491         # Reconstruct the original comment.
01492         self.pieces.append('<!--%(text)s-->' % locals())
01493         
01494     def handle_pi(self, text):
01495         # called for each processing instruction, e.g. <?instruction>
01496         # Reconstruct original processing instruction.
01497         self.pieces.append('<?%(text)s>' % locals())
01498 
01499     def handle_decl(self, text):
01500         # called for the DOCTYPE, if present, e.g.
01501         # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
01502         #     "http://www.w3.org/TR/html4/loose.dtd">
01503         # Reconstruct original DOCTYPE
01504         self.pieces.append('<!%(text)s>' % locals())
01505         
01506     _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
01507     def _scan_name(self, i, declstartpos):
01508         rawdata = self.rawdata
01509         n = len(rawdata)
01510         if i == n:
01511             return None, -1
01512         m = self._new_declname_match(rawdata, i)
01513         if m:
01514             s = m.group()
01515             name = s.strip()
01516             if (i + len(s)) == n:
01517                 return None, -1  # end of buffer
01518             return name.lower(), m.end()
01519         else:
01520             self.handle_data(rawdata)
01521 #            self.updatepos(declstartpos, i)
01522             return None, -1
01523 
01524     def output(self):
01525         '''Return processed HTML as a single string'''
01526         return ''.join([str(p) for p in self.pieces])
01527 
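# A round-trip sketch of _BaseHTMLProcessor (hypothetical input): attribute
# values are re-quoted and empty elements are re-serialized in XHTML style:
#
#     >>> p = _BaseHTMLProcessor('utf-8')
#     >>> p.feed('<img src=a.jpg><p class=intro>hi')
#     >>> p.output()
#     '<img src="a.jpg" /><p class="intro">hi'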
01528 class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
01529     def __init__(self, baseuri, baselang, encoding):
01530         sgmllib.SGMLParser.__init__(self)
01531         _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
01532 
01533     def decodeEntities(self, element, data):
01534         data = data.replace('&#60;', '&lt;')
01535         data = data.replace('&#x3c;', '&lt;')
01536         data = data.replace('&#62;', '&gt;')
01537         data = data.replace('&#x3e;', '&gt;')
01538         data = data.replace('&#38;', '&amp;')
01539         data = data.replace('&#x26;', '&amp;')
01540         data = data.replace('&#34;', '&quot;')
01541         data = data.replace('&#x22;', '&quot;')
01542         data = data.replace('&#39;', '&apos;')
01543         data = data.replace('&#x27;', '&apos;')
01544         if self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
01545             data = data.replace('&lt;', '<')
01546             data = data.replace('&gt;', '>')
01547             data = data.replace('&amp;', '&')
01548             data = data.replace('&quot;', '"')
01549             data = data.replace('&apos;', "'")
01550         return data
01551         
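# A sketch of decodeEntities (assuming the mixin tolerates empty baseuri and
# baselang arguments): for non-XML content types the entities are decoded
# all the way back to literal characters:
#
#     >>> p = _LooseFeedParser('', '', 'utf-8')
#     >>> p.contentparams = {'type': 'text/html'}
#     >>> p.decodeEntities('summary', '&#60;b&#62;bold&#60;/b&#62;')
#     '<b>bold</b>'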
01552 class _RelativeURIResolver(_BaseHTMLProcessor):
01553     relative_uris = [('a', 'href'),
01554                      ('applet', 'codebase'),
01555                      ('area', 'href'),
01556                      ('blockquote', 'cite'),
01557                      ('body', 'background'),
01558                      ('del', 'cite'),
01559                      ('form', 'action'),
01560                      ('frame', 'longdesc'),
01561                      ('frame', 'src'),
01562                      ('iframe', 'longdesc'),
01563                      ('iframe', 'src'),
01564                      ('head', 'profile'),
01565                      ('img', 'longdesc'),
01566                      ('img', 'src'),
01567                      ('img', 'usemap'),
01568                      ('input', 'src'),
01569                      ('input', 'usemap'),
01570                      ('ins', 'cite'),
01571                      ('link', 'href'),
01572                      ('object', 'classid'),
01573                      ('object', 'codebase'),
01574                      ('object', 'data'),
01575                      ('object', 'usemap'),
01576                      ('q', 'cite'),
01577                      ('script', 'src')]
01578 
01579     def __init__(self, baseuri, encoding):
01580         _BaseHTMLProcessor.__init__(self, encoding)
01581         self.baseuri = baseuri
01582 
01583     def resolveURI(self, uri):
01584         return _urljoin(self.baseuri, uri)
01585     
01586     def unknown_starttag(self, tag, attrs):
01587         attrs = self.normalize_attrs(attrs)
01588         attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
01589         _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
01590         
01591 def _resolveRelativeURIs(htmlSource, baseURI, encoding):
01592     if _debug: sys.stderr.write('entering _resolveRelativeURIs\n')
01593     p = _RelativeURIResolver(baseURI, encoding)
01594     p.feed(htmlSource)
01595     return p.output()
01596 
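# For example ('example.org' is a placeholder):
#
#     >>> _resolveRelativeURIs('<a href="/about">about</a>',
#     ...     'http://example.org/feed', 'utf-8')
#     '<a href="http://example.org/about">about</a>'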
01597 class _HTMLSanitizer(_BaseHTMLProcessor):
01598     acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
01599       'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col',
01600       'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset',
01601       'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input',
01602       'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup',
01603       'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike',
01604       'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th',
01605       'thead', 'tr', 'tt', 'u', 'ul', 'var']
01606 
01607     acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
01608       'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing',
01609       'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols',
01610       'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled',
01611       'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace',
01612       'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
01613       'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
01614       'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
01615       'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type',
01616       'usemap', 'valign', 'value', 'vspace', 'width']
01617 
01618     unacceptable_elements_with_end_tag = ['script', 'applet']
01619 
01620     def reset(self):
01621         _BaseHTMLProcessor.reset(self)
01622         self.unacceptablestack = 0
01623         
01624     def unknown_starttag(self, tag, attrs):
01625         if not tag in self.acceptable_elements:
01626             if tag in self.unacceptable_elements_with_end_tag:
01627                 self.unacceptablestack += 1
01628             return
01629         attrs = self.normalize_attrs(attrs)
01630         attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes]
01631         _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
01632         
01633     def unknown_endtag(self, tag):
01634         if not tag in self.acceptable_elements:
01635             if tag in self.unacceptable_elements_with_end_tag:
01636                 self.unacceptablestack -= 1
01637             return
01638         _BaseHTMLProcessor.unknown_endtag(self, tag)
01639 
01640     def handle_pi(self, text):
01641         pass
01642 
01643     def handle_decl(self, text):
01644         pass
01645 
01646     def handle_data(self, text):
01647         if not self.unacceptablestack:
01648             _BaseHTMLProcessor.handle_data(self, text)
01649 
01650 def _sanitizeHTML(htmlSource, encoding):
01651     p = _HTMLSanitizer(encoding)
01652     p.feed(htmlSource)
01653     data = p.output()
01654     if TIDY_MARKUP:
01655         # loop through list of preferred Tidy interfaces looking for one that's installed,
01656         # then set up a common _tidy function to wrap the interface-specific API.
01657         _tidy = None
01658         for tidy_interface in PREFERRED_TIDY_INTERFACES:
01659             try:
01660                 if tidy_interface == "uTidy":
01661                     from tidy import parseString as _utidy
01662                     def _tidy(data, **kwargs):
01663                         return str(_utidy(data, **kwargs))
01664                     break
01665                 elif tidy_interface == "mxTidy":
01666                     from mx.Tidy import Tidy as _mxtidy
01667                     def _tidy(data, **kwargs):
01668                         nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs)
01669                         return data
01670                     break
01671             except:
01672                 pass
01673         if _tidy:
01674             utf8 = type(data) == type(u'') # remember whether we started with unicode; tidy works on byte strings here
01675             if utf8:
01676                 data = data.encode('utf-8')
01677             data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8")
01678             if utf8:
01679                 data = unicode(data, 'utf-8')
01680             if data.count('<body'):
01681                 data = data.split('<body', 1)[1]
01682                 if data.count('>'):
01683                     data = data.split('>', 1)[1]
01684             if data.count('</body'):
01685                 data = data.split('</body', 1)[0]
01686     data = data.strip().replace('\r\n', '\n')
01687     return data
01688 
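# A sketch of the sanitizer (assuming TIDY_MARKUP is left at 0): unlisted
# attributes are dropped and the content of script/applet is suppressed:
#
#     >>> _sanitizeHTML('<p onclick="x()">hi <script>alert(1)</script><b>there</b></p>', 'utf-8')
#     '<p>hi <b>there</b></p>'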
01689 class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
01690     def http_error_default(self, req, fp, code, msg, headers):
01691         if ((code / 100) == 3) and (code != 304):
01692             return self.http_error_302(req, fp, code, msg, headers)
01693         infourl = urllib.addinfourl(fp, headers, req.get_full_url())
01694         infourl.status = code
01695         return infourl
01696 
01697     def http_error_302(self, req, fp, code, msg, headers):
01698         if headers.dict.has_key('location'):
01699             infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
01700         else:
01701             infourl = urllib.addinfourl(fp, headers, req.get_full_url())
01702         if not hasattr(infourl, 'status'):
01703             infourl.status = code
01704         return infourl
01705 
01706     def http_error_301(self, req, fp, code, msg, headers):
01707         if headers.dict.has_key('location'):
01708             infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers)
01709         else:
01710             infourl = urllib.addinfourl(fp, headers, req.get_full_url())
01711         if not hasattr(infourl, 'status'):
01712             infourl.status = code
01713         return infourl
01714 
01715     http_error_300 = http_error_302
01716     http_error_303 = http_error_302
01717     http_error_307 = http_error_302
01718         
01719     def http_error_401(self, req, fp, code, msg, headers):
01720         # Check if
01721         # - server requires digest auth, AND
01722         # - we tried (unsuccessfully) with basic auth, AND
01723         # - we're using Python 2.3.3 or later (digest auth is irreparably broken in earlier versions)
01724         # If all conditions hold, parse authentication information
01725         # out of the Authorization header we sent the first time
01726         # (for the username and password) and the WWW-Authenticate
01727         # header the server sent back (for the realm) and retry
01728         # the request with the appropriate digest auth headers instead.
01729         # This evil genius hack has been brought to you by Aaron Swartz.
01730         host = urlparse.urlparse(req.get_full_url())[1]
01731         try:
01732             assert sys.version.split()[0] >= '2.3.3'
01733             assert base64 != None
01734             user, passw = base64.decodestring(req.headers['Authorization'].split(' ')[1]).split(':')
01735             realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
01736             self.add_password(realm, host, user, passw)
01737             retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
01738             self.reset_retry_count()
01739             return retry
01740         except:
01741             return self.http_error_default(req, fp, code, msg, headers)
01742 
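# Callers normally exercise this handler chain indirectly by passing extra
# urllib2 handlers to parse(); a sketch (hypothetical URL and credentials):
#
#     import urllib2
#     auth = urllib2.HTTPBasicAuthHandler()
#     auth.add_password('My Realm', 'example.org', 'user', 'secret')
#     d = parse('http://example.org/protected.xml', handlers=[auth])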
01743 def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers):
01744     """URL, filename, or string --> stream
01745 
01746     This function lets you define parsers that take any input source
01747     (URL, pathname to local or network file, or actual data as a string)
01748     and deal with it in a uniform manner.  Returned object is guaranteed
01749     to have all the basic stdio read methods (read, readline, readlines).
01750     Just .close() the object when you're done with it.
01751 
01752     If the etag argument is supplied, it will be used as the value of an
01753     If-None-Match request header.
01754 
01755     If the modified argument is supplied, it must be a tuple of 9 integers
01756     as returned by gmtime() in the standard Python time module. This MUST
01757     be in GMT (Greenwich Mean Time). The formatted date/time will be used
01758     as the value of an If-Modified-Since request header.
01759 
01760     If the agent argument is supplied, it will be used as the value of a
01761     User-Agent request header.
01762 
01763     If the referrer argument is supplied, it will be used as the value of a
01764     Referer[sic] request header.
01765 
01766     If handlers is supplied, it is a list of handlers used to build a
01767     urllib2 opener.
01768     """
01769 
01770     if hasattr(url_file_stream_or_string, 'read'):
01771         return url_file_stream_or_string
01772 
01773     if url_file_stream_or_string == '-':
01774         return sys.stdin
01775 
01776     if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'):
01777         if not agent:
01778             agent = USER_AGENT
01779         # test for inline user:password for basic auth
01780         auth = None
01781         if base64:
01782             urltype, rest = urllib.splittype(url_file_stream_or_string)
01783             realhost, rest = urllib.splithost(rest)
01784             if realhost:
01785                 user_passwd, realhost = urllib.splituser(realhost)
01786                 if user_passwd:
01787                     url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest)
01788                     auth = base64.encodestring(user_passwd).strip()
01789         # try to open with urllib2 (to use optional headers)
01790         request = urllib2.Request(url_file_stream_or_string)
01791         request.add_header('User-Agent', agent)
01792         if etag:
01793             request.add_header('If-None-Match', etag)
01794         if modified:
01795             # format into an RFC 1123-compliant timestamp. We can't use
01796             # time.strftime() since the %a and %b directives can be affected
01797             # by the current locale, but RFC 2616 states that dates must be
01798             # in English.
01799             short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
01800             months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
01801             request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
01802         if referrer:
01803             request.add_header('Referer', referrer)
01804         if gzip and zlib:
01805             request.add_header('Accept-encoding', 'gzip, deflate')
01806         elif gzip:
01807             request.add_header('Accept-encoding', 'gzip')
01808         elif zlib:
01809             request.add_header('Accept-encoding', 'deflate')
01810         else:
01811             request.add_header('Accept-encoding', '')
01812         if auth:
01813             request.add_header('Authorization', 'Basic %s' % auth)
01814         if ACCEPT_HEADER:
01815             request.add_header('Accept', ACCEPT_HEADER)
01816         request.add_header('A-IM', 'feed') # RFC 3229 support
01817         opener = urllib2.build_opener(*([_FeedURLHandler()] + handlers))
01818         opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
01819         try:
01820             return opener.open(request)
01821         finally:
01822             opener.close() # JohnD
01823     
01824     # try to open with native open function (if url_file_stream_or_string is a filename)
01825     try:
01826         return open(url_file_stream_or_string)
01827     except:
01828         pass
01829 
01830     # treat url_file_stream_or_string as string
01831     return _StringIO(str(url_file_stream_or_string))
01832 
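# A conditional-GET sketch built on the etag and modified arguments described
# above (hypothetical URL; requires network access):
#
#     d = parse('http://example.org/feed.xml')
#     d2 = parse('http://example.org/feed.xml',
#                etag=d.get('etag'), modified=d.get('modified'))
#     # if the feed is unchanged, the server can answer 304 with no body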
01833 _date_handlers = []
01834 def registerDateHandler(func):
01835     '''Register a date handler function (takes string, returns 9-tuple date in GMT)'''
01836     _date_handlers.insert(0, func)
01837     
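# Handlers are tried most-recently-registered first.  A sketch of a custom
# handler for Unix timestamps (hypothetical; not registered by default):
#
#     def _parse_date_epoch(dateString):
#         try:
#             return time.gmtime(int(dateString))
#         except ValueError:
#             return None
#     registerDateHandler(_parse_date_epoch)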
01838 # ISO-8601 date parsing routines written by Fazal Majid.
01839 # The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
01840 # parser is beyond the scope of feedparser and would be a worthwhile addition
01841 # to the Python library.
01842 # A single regular expression cannot parse ISO 8601 date formats into groups
01843 # as the standard is highly irregular (for instance is 030104 2003-01-04 or
01844 # 0301-04-01), so we use templates instead.
01845 # Please note the order in templates is significant because we need a
01846 # greedy match.
01847 _iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-MM', 'YYYY-?OOO',
01848                 'YY-?MM-?DD', 'YY-?OOO', 'YYYY', 
01849                 '-YY-?MM', '-OOO', '-YY',
01850                 '--MM-?DD', '--MM',
01851                 '---DD',
01852                 'CC', '']
01853 _iso8601_re = [
01854     tmpl.replace(
01855     'YYYY', r'(?P<year>\d{4})').replace(
01856     'YY', r'(?P<year>\d\d)').replace(
01857     'MM', r'(?P<month>[01]\d)').replace(
01858     'DD', r'(?P<day>[0123]\d)').replace(
01859     'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
01860     'CC', r'(?P<century>\d\d$)')
01861     + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
01862     + r'(:(?P<second>\d{2}))?'
01863     + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
01864     for tmpl in _iso8601_tmpl]
01865 del tmpl
01866 _iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
01867 del regex
01868 def _parse_date_iso8601(dateString):
01869     '''Parse a variety of ISO-8601-compatible formats like 20040105'''
01870     m = None
01871     for _iso8601_match in _iso8601_matches:
01872         m = _iso8601_match(dateString)
01873         if m: break
01874     if not m: return
01875     if m.span() == (0, 0): return
01876     params = m.groupdict()
01877     ordinal = params.get('ordinal', 0)
01878     if ordinal:
01879         ordinal = int(ordinal)
01880     else:
01881         ordinal = 0
01882     year = params.get('year', '--')
01883     if not year or year == '--':
01884         year = time.gmtime()[0]
01885     elif len(year) == 2:
01886         # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
01887         year = 100 * int(time.gmtime()[0] / 100) + int(year)
01888     else:
01889         year = int(year)
01890     month = params.get('month', '-')
01891     if not month or month == '-':
01892         # ordinals are NOT normalized by mktime, we simulate them
01893         # by setting month=1, day=ordinal
01894         if ordinal:
01895             month = 1
01896         else:
01897             month = time.gmtime()[1]
01898     month = int(month)
01899     day = params.get('day', 0)
01900     if not day:
01901         # see above
01902         if ordinal:
01903             day = ordinal
01904         elif params.get('century', 0) or \
01905                  params.get('year', 0) or params.get('month', 0):
01906             day = 1
01907         else:
01908             day = time.gmtime()[2]
01909     else:
01910         day = int(day)
01911     # special case of the century - is the first year of the 21st century
01912     # 2000 or 2001? The debate goes on...
01913     if 'century' in params.keys():
01914         year = (int(params['century']) - 1) * 100 + 1
01915     # in ISO 8601 most fields are optional
01916     for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']:
01917         if not params.get(field, None):
01918             params[field] = 0
01919     hour = int(params.get('hour', 0))
01920     minute = int(params.get('minute', 0))
01921     second = int(params.get('second', 0))
01922     # weekday is normalized by mktime(), we can ignore it
01923     weekday = 0
01924     # daylight savings is complex, but not needed for feedparser's purposes
01925     # as time zones, if specified, include mention of whether it is active
01926     # (e.g. PST vs. PDT, CET). Using -1 is implementation-dependent,
01927     # and most implementations have DST bugs
01928     daylight_savings_flag = 0
01929     tm = [year, month, day, hour, minute, second, weekday,
01930           ordinal, daylight_savings_flag]
01931     # ISO 8601 time zone adjustments
01932     tz = params.get('tz')
01933     if tz and tz != 'Z':
01934         if tz[0] == '-':
01935             tm[3] += int(params.get('tzhour', 0))
01936             tm[4] += int(params.get('tzmin', 0))
01937         elif tz[0] == '+':
01938             tm[3] -= int(params.get('tzhour', 0))
01939             tm[4] -= int(params.get('tzmin', 0))
01940         else:
01941             return None
01942     # Python's time.mktime() is a wrapper around the ANSI C mktime(3c)
01943     # which is guaranteed to normalize d/m/y/h/m/s.
01944     # Many implementations have bugs, but we'll pretend they don't.
01945     return time.localtime(time.mktime(tm))
01946 registerDateHandler(_parse_date_iso8601)
01947     
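# For example (only the date fields are shown; a full 9-tuple is returned):
#
#     >>> _parse_date_iso8601('20040105')[:3]
#     (2004, 1, 5)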
01948 # 8-bit date handling routines written by ytrewq1.
01949 _korean_year  = u'\ub144' # b3e2 in euc-kr
01950 _korean_month = u'\uc6d4' # bff9 in euc-kr
01951 _korean_day   = u'\uc77c' # c0cf in euc-kr
01952 _korean_am    = u'\uc624\uc804' # bfc0 c0fc in euc-kr
01953 _korean_pm    = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr
01954 
01955 _korean_onblog_date_re = \
01956     re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \
01957                (_korean_year, _korean_month, _korean_day))
01958 _korean_nate_date_re = \
01959     re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \
01960                (_korean_am, _korean_pm))
01961 def _parse_date_onblog(dateString):
01962     '''Parse a string according to the OnBlog 8-bit date format'''
01963     m = _korean_onblog_date_re.match(dateString)
01964     if not m: return
01965     w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
01966                 {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
01967                  'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
01968                  'zonediff': '+09:00'}
01969     if _debug: sys.stderr.write('OnBlog date parsed as: %s\n' % w3dtfdate)
01970     return _parse_date_w3dtf(w3dtfdate)
01971 registerDateHandler(_parse_date_onblog)
01972 
01973 def _parse_date_nate(dateString):
01974     '''Parse a string according to the Nate 8-bit date format'''
01975     m = _korean_nate_date_re.match(dateString)
01976     if not m: return
01977     hour = int(m.group(5))
01978     ampm = m.group(4)
01979     if (ampm == _korean_pm):
01980         hour += 12
01981     hour = str(hour)
01982     if len(hour) == 1:
01983         hour = '0' + hour
01984     w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
01985                 {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
01986                  'hour': hour, 'minute': m.group(6), 'second': m.group(7),\
01987                  'zonediff': '+09:00'}
01988     if _debug: sys.stderr.write('Nate date parsed as: %s\n' % w3dtfdate)
01989     return _parse_date_w3dtf(w3dtfdate)
01990 registerDateHandler(_parse_date_nate)
01991 
01992 _mssql_date_re = \
01993     re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?')
01994 def _parse_date_mssql(dateString):
01995     '''Parse a string according to the MS SQL date format'''
01996     m = _mssql_date_re.match(dateString)
01997     if not m: return
01998     w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
01999                 {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
02000                  'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
02001                  'zonediff': '+09:00'} # note: hardcodes a +09:00 offset, matching the Korean handlers above
02002     if _debug: sys.stderr.write('MS SQL date parsed as: %s\n' % w3dtfdate)
02003     return _parse_date_w3dtf(w3dtfdate)
02004 registerDateHandler(_parse_date_mssql)
02005 
02006 # Unicode strings for Greek date strings
02007 _greek_months = \
02008   { \
02009    u'\u0399\u03b1\u03bd': u'Jan',       # c9e1ed in iso-8859-7
02010    u'\u03a6\u03b5\u03b2': u'Feb',       # d6e5e2 in iso-8859-7
02011    u'\u039c\u03ac\u03ce': u'Mar',       # ccdcfe in iso-8859-7
02012    u'\u039c\u03b1\u03ce': u'Mar',       # cce1fe in iso-8859-7
02013    u'\u0391\u03c0\u03c1': u'Apr',       # c1f0f1 in iso-8859-7
02014    u'\u039c\u03ac\u03b9': u'May',       # ccdce9 in iso-8859-7
02015    u'\u039c\u03b1\u03ca': u'May',       # cce1fa in iso-8859-7
02016    u'\u039c\u03b1\u03b9': u'May',       # cce1e9 in iso-8859-7
02017    u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7
02018    u'\u0399\u03bf\u03bd': u'Jun',       # c9efed in iso-8859-7
02019    u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7
02020    u'\u0399\u03bf\u03bb': u'Jul',       # c9f9eb in iso-8859-7
02021    u'\u0391\u03cd\u03b3': u'Aug',       # c1fde3 in iso-8859-7
02022    u'\u0391\u03c5\u03b3': u'Aug',       # c1f5e3 in iso-8859-7
02023    u'\u03a3\u03b5\u03c0': u'Sep',       # d3e5f0 in iso-8859-7
02024    u'\u039f\u03ba\u03c4': u'Oct',       # cfeaf4 in iso-8859-7
02025    u'\u039d\u03bf\u03ad': u'Nov',       # cdefdd in iso-8859-7
02026    u'\u039d\u03bf\u03b5': u'Nov',       # cdefe5 in iso-8859-7
02027    u'\u0394\u03b5\u03ba': u'Dec',       # c4e5ea in iso-8859-7
02028   }
02029 
02030 _greek_wdays = \
02031   { \
02032    u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7
02033    u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7
02034    u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7
02035    u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7
02036    u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7
02037    u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7
02038    u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7   
02039   }
02040 
02041 _greek_date_format_re = \
02042     re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')
02043 
02044 def _parse_date_greek(dateString):
02045     '''Parse a string according to a Greek 8-bit date format.'''
02046     m = _greek_date_format_re.match(dateString)
02047     if not m: return
02048     try:
02049         wday = _greek_wdays[m.group(1)]
02050         month = _greek_months[m.group(3)]
02051     except:
02052         return
02053     rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \
02054                  {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\
02055                   'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\
02056                   'zonediff': m.group(8)}
02057     if _debug: sys.stderr.write('Greek date parsed as: %s\n' % rfc822date)
02058     return _parse_date_rfc822(rfc822date)
02059 registerDateHandler(_parse_date_greek)
02060 
02061 # Unicode strings for Hungarian date strings
02062 _hungarian_months = \
02063   { \
02064     u'janu\u00e1r':   u'01',  # e1 in iso-8859-2
02065     u'febru\u00e1r':  u'02',  # e1 in iso-8859-2
02066     u'm\u00e1rcius':  u'03',  # e1 in iso-8859-2
02067     u'\u00e1prilis':  u'04',  # e1 in iso-8859-2
02068     u'm\u00e1jus':    u'05',  # e1 in iso-8859-2
02069     u'j\u00fanius':   u'06',  # fa in iso-8859-2
02070     u'j\u00falius':   u'07',  # fa in iso-8859-2
02071     u'augusztus':     u'08',
02072     u'szeptember':    u'09',
02073     u'okt\u00f3ber':  u'10',  # f3 in iso-8859-2
02074     u'november':      u'11',
02075     u'december':      u'12',
02076   }
02077 
02078 _hungarian_date_format_re = \
02079   re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')
02080 
02081 def _parse_date_hungarian(dateString):
02082     '''Parse a string according to a Hungarian 8-bit date format.'''
02083     m = _hungarian_date_format_re.match(dateString)
02084     if not m: return
02085     try:
02086         month = _hungarian_months[m.group(2)]
02087         day = m.group(3)
02088         if len(day) == 1:
02089             day = '0' + day
02090         hour = m.group(4)
02091         if len(hour) == 1:
02092             hour = '0' + hour
02093     except:
02094         return
02095     w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \
02096                 {'year': m.group(1), 'month': month, 'day': day,\
02097                  'hour': hour, 'minute': m.group(5),\
02098                  'zonediff': m.group(6)}
02099     if _debug: sys.stderr.write('Hungarian date parsed as: %s\n' % w3dtfdate)
02100     return _parse_date_w3dtf(w3dtfdate)
02101 registerDateHandler(_parse_date_hungarian)
02102 
02103 # W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by
02104 # Drake and licensed under the Python license.  Removed all range checking
02105 # for month, day, hour, minute, and second, since mktime will normalize
02106 # these later
02107 def _parse_date_w3dtf(dateString):
02108     def __extract_date(m):
02109         year = int(m.group('year'))
02110         if year < 100:
02111             year = 100 * int(time.gmtime()[0] / 100) + int(year)
02112         if year < 1000:
02113             return 0, 0, 0
02114         julian = m.group('julian')
02115         if julian:
02116             julian = int(julian)
02117             month = julian / 30 + 1
02118             day = julian % 30 + 1
02119             jday = None
02120             while jday != julian:
02121                 t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0))
02122                 jday = time.gmtime(t)[-2]
02123                 diff = abs(jday - julian)
02124                 if jday > julian:
02125                     if diff < day:
02126                         day = day - diff
02127                     else:
02128                         month = month - 1
02129                         day = 31
02130                 elif jday < julian:
02131                     if day + diff < 28:
02132                         day = day + diff
02133                     else:
02134                         month = month + 1
02135             return year, month, day
02136         month = m.group('month')
02137         day = 1
02138         if month is None:
02139             month = 1
02140         else:
02141             month = int(month)
02142             day = m.group('day')
02143             if day:
02144                 day = int(day)
02145             else:
02146                 day = 1
02147         return year, month, day
02148 
02149     def __extract_time(m):
02150         if not m:
02151             return 0, 0, 0
02152         hours = m.group('hours')
02153         if not hours:
02154             return 0, 0, 0
02155         hours = int(hours)
02156         minutes = int(m.group('minutes'))
02157         seconds = m.group('seconds')
02158         if seconds:
02159             seconds = int(seconds)
02160         else:
02161             seconds = 0
02162         return hours, minutes, seconds
02163 
02164     def __extract_tzd(m):
02165         '''Return the Time Zone Designator as an offset in seconds from UTC.'''
02166         if not m:
02167             return 0
02168         tzd = m.group('tzd')
02169         if not tzd:
02170             return 0
02171         if tzd == 'Z':
02172             return 0
02173         hours = int(m.group('tzdhours'))
02174         minutes = m.group('tzdminutes')
02175         if minutes:
02176             minutes = int(minutes)
02177         else:
02178             minutes = 0
02179         offset = (hours*60 + minutes) * 60
02180         if tzd[0] == '+':
02181             return -offset
02182         return offset
02183 
02184     __date_re = ('(?P<year>\d\d\d\d)'
02185                  '(?:(?P<dsep>-|)'
02186                  '(?:(?P<julian>\d\d\d)'
02187                  '|(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?))?')
02188     __tzd_re = '(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)'
02189     __tzd_rx = re.compile(__tzd_re)
02190     __time_re = ('(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)'
02191                  '(?:(?P=tsep)(?P<seconds>\d\d(?:[.,]\d+)?))?'
02192                  + __tzd_re)
02193     __datetime_re = '%s(?:T%s)?' % (__date_re, __time_re)
02194     __datetime_rx = re.compile(__datetime_re)
02195     m = __datetime_rx.match(dateString)
02196     if (m is None) or (m.group() != dateString): return
02197     gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0)
02198     if gmt[0] == 0: return
02199     return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone)
02200 registerDateHandler(_parse_date_w3dtf)
02201 
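# For example (exact in locales not observing DST on the given date):
#
#     >>> _parse_date_w3dtf('2003-12-31T10:14:55Z')[:6]
#     (2003, 12, 31, 10, 14, 55)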
02202 def _parse_date_rfc822(dateString):
02203     '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date'''
02204     data = dateString.split()
02205     if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames:
02206         del data[0]
02207     if len(data) == 4:
02208         s = data[3]
02209         i = s.find('+')
02210         if i > 0:
02211             data[3:] = [s[:i], s[i+1:]]
02212         else:
02213             data.append('')
02214         dateString = " ".join(data)
02215     if len(data) < 5:
02216         dateString += ' 00:00:00 GMT'
02217     tm = rfc822.parsedate_tz(dateString)
02218     if tm:
02219         return time.gmtime(rfc822.mktime_tz(tm))
02220 # rfc822.py defines several time zones, but we define some extra ones.
02221 # 'ET' is equivalent to 'EST', etc.
02222 _additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800}
02223 rfc822._timezones.update(_additional_timezones)
02224 registerDateHandler(_parse_date_rfc822)    
02225 
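# For example:
#
#     >>> _parse_date_rfc822('Thu, 01 Jan 2004 19:48:21 GMT')[:6]
#     (2004, 1, 1, 19, 48, 21)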
02226 def _parse_date(dateString):
02227     '''Parses a variety of date formats into a 9-tuple in GMT'''
02228     for handler in _date_handlers:
02229         try:
02230             date9tuple = handler(dateString)
02231             if not date9tuple: continue
02232             if len(date9tuple) != 9:
02233                 if _debug: sys.stderr.write('date handler function must return 9-tuple\n')
02234                 raise ValueError
02235             map(int, date9tuple) # raises if any of the nine fields is non-numeric (caught below)
02236             return date9tuple
02237         except Exception, e:
02238             if _debug: sys.stderr.write('%s raised %s\n' % (handler.__name__, repr(e)))
02239             pass
02240     return None
02241 
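# Equivalent dates in different syntaxes normalize to the same 9-tuple
# (a sketch; the W3DTF side can shift by an hour in locales observing DST
# on that date):
#
#     >>> _parse_date('Thu, 01 Jan 2004 19:48:21 GMT') == _parse_date('2004-01-01T19:48:21Z')
#     True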
02242 def _getCharacterEncoding(http_headers, xml_data):
02243     '''Get the character encoding of the XML document
02244 
02245     http_headers is a dictionary
02246     xml_data is a raw string (not Unicode)
02247     
02248     This is so much trickier than it sounds, it's not even funny.
02249     According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
02250     is application/xml, application/*+xml,
02251     application/xml-external-parsed-entity, or application/xml-dtd,
02252     the encoding given in the charset parameter of the HTTP Content-Type
02253     takes precedence over the encoding given in the XML prefix within the
02254     document, and defaults to 'utf-8' if neither are specified.  But, if
02255     the HTTP Content-Type is text/xml, text/*+xml, or
02256     text/xml-external-parsed-entity, the encoding given in the XML prefix
02257     within the document is ALWAYS IGNORED and only the encoding given in
02258     the charset parameter of the HTTP Content-Type header should be
02259     respected, and it defaults to 'us-ascii' if not specified.
02260 
02261     Furthermore, discussion on the atom-syntax mailing list with the
02262     author of RFC 3023 leads me to the conclusion that any document
02263     served with a Content-Type of text/* and no charset parameter
02264     must be treated as us-ascii.  (We now do this.)  And also that it
02265     must always be flagged as non-well-formed.  (We now do this too.)
02266     
02267     If Content-Type is unspecified (input was local file or non-HTTP source)
02268     or unrecognized (server just got it totally wrong), then go by the
02269     encoding given in the XML prefix of the document and default to
02270     'iso-8859-1' as per the HTTP specification (RFC 2616).
02271     
02272     Then, assuming we didn't find a character encoding in the HTTP headers
02273     (and the HTTP Content-type allowed us to look in the body), we need
02274     to sniff the first few bytes of the XML data and try to determine
02275     whether the encoding is ASCII-compatible.  Section F of the XML
02276     specification shows the way here:
02277     http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
02278 
02279     If the sniffed encoding is not ASCII-compatible, we need to make it
02280     ASCII compatible so that we can sniff further into the XML declaration
02281     to find the encoding attribute, which will tell us the true encoding.
02282 
02283     Of course, none of this guarantees that we will be able to parse the
02284     feed in the declared character encoding (assuming it was declared
02285     correctly, which many are not).  CJKCodecs and iconv_codec help a lot;
02286     you should definitely install them if you can.
02287     http://cjkpython.i18n.org/
02288     '''
02289 
02290     def _parseHTTPContentType(content_type):
02291         '''takes HTTP Content-Type header and returns (content type, charset)
02292 
02293         If no charset is specified, returns (content type, '')
02294         If no content type is specified, returns ('', '')
02295         Both return parameters are guaranteed to be lowercase strings
02296         '''
02297         content_type = content_type or ''
02298         content_type, params = cgi.parse_header(content_type)
02299         return content_type, params.get('charset', '').replace("'", '')
02300 
02301     sniffed_xml_encoding = ''
02302     xml_encoding = ''
02303     true_encoding = ''
02304     http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type'))
02305     # Must sniff for non-ASCII-compatible character encodings before
02306     # searching for XML declaration.  This heuristic is defined in
02307     # section F of the XML specification:
02308     # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
02309     try:
02310         if xml_data[:4] == '\x4c\x6f\xa7\x94':
02311             # EBCDIC
02312             xml_data = _ebcdic_to_ascii(xml_data)
02313         elif xml_data[:4] == '\x00\x3c\x00\x3f':
02314             # UTF-16BE
02315             sniffed_xml_encoding = 'utf-16be'
02316             xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
02317         elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'):
02318             # UTF-16BE with BOM
02319             sniffed_xml_encoding = 'utf-16be'
02320             xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
02321         elif xml_data[:4] == '\x3c\x00\x3f\x00':
02322             # UTF-16LE
02323             sniffed_xml_encoding = 'utf-16le'
02324             xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
02325         elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and (xml_data[2:4] != '\x00\x00'):
02326             # UTF-16LE with BOM
02327             sniffed_xml_encoding = 'utf-16le'
02328             xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
02329         elif xml_data[:4] == '\x00\x00\x00\x3c':
02330             # UTF-32BE
02331             sniffed_xml_encoding = 'utf-32be'
02332             xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
02333         elif xml_data[:4] == '\x3c\x00\x00\x00':
02334             # UTF-32LE
02335             sniffed_xml_encoding = 'utf-32le'
02336             xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
02337         elif xml_data[:4] == '\x00\x00\xfe\xff':
02338             # UTF-32BE with BOM
02339             sniffed_xml_encoding = 'utf-32be'
02340             xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
02341         elif xml_data[:4] == '\xff\xfe\x00\x00':
02342             # UTF-32LE with BOM
02343             sniffed_xml_encoding = 'utf-32le'
02344             xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
02345         elif xml_data[:3] == '\xef\xbb\xbf':
02346             # UTF-8 with BOM
02347             sniffed_xml_encoding = 'utf-8'
02348             xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
02349         else:
02350             # ASCII-compatible
02351             pass
02352         xml_encoding_match = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
02353     except:
02354         xml_encoding_match = None
02355     if xml_encoding_match:
02356         xml_encoding = xml_encoding_match.groups()[0].lower()
02357         if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')):
02358             xml_encoding = sniffed_xml_encoding
02359     acceptable_content_type = 0
02360     application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity')
02361     text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
02362     if (http_content_type in application_content_types) or \
02363        (http_content_type.startswith('application/') and http_content_type.endswith('+xml')):
02364         acceptable_content_type = 1
02365         true_encoding = http_encoding or xml_encoding or 'utf-8'
02366     elif (http_content_type in text_content_types) or \
02367          (http_content_type.startswith('text/') and http_content_type.endswith('+xml')):
02368         acceptable_content_type = 1
02369         true_encoding = http_encoding or 'us-ascii'
02370     elif http_content_type.startswith('text/'):
02371         true_encoding = http_encoding or 'us-ascii'
02372     elif http_headers and (not http_headers.has_key('content-type')):
02373         true_encoding = xml_encoding or 'iso-8859-1'
02374     else:
02375         true_encoding = xml_encoding or 'utf-8'
02376     return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
02377     
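# A sketch of the RFC 3023 rules above: for text/xml served without a
# charset parameter, the encoding declared inside the document is ignored:
#
#     >>> _getCharacterEncoding({'content-type': 'text/xml'},
#     ...     '<?xml version="1.0" encoding="utf-8"?><feed/>')[0]
#     'us-ascii'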
02378 def _toUTF8(data, encoding):
02379     '''Changes an XML data stream on the fly to specify a new encoding
02380 
02381     data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
02382     encoding is a string recognized by encodings.aliases
02383     '''
02384     if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding)
02385     # strip Byte Order Mark (if present)
02386     if (len(data) >= 4) and (data[:2] == '\xfe\xff') and (data[2:4] != '\x00\x00'):
02387         if _debug:
02388             sys.stderr.write('stripping BOM\n')
02389             if encoding != 'utf-16be':
02390                 sys.stderr.write('trying utf-16be instead\n')
02391         encoding = 'utf-16be'
02392         data = data[2:]
02393     elif (len(data) >= 4) and (data[:2] == '\xff\xfe') and (data[2:4] != '\x00\x00'):
02394         if _debug:
02395             sys.stderr.write('stripping BOM\n')
02396             if encoding != 'utf-16le':
02397                 sys.stderr.write('trying utf-16le instead\n')
02398         encoding = 'utf-16le'
02399         data = data[2:]
02400     elif data[:3] == '\xef\xbb\xbf':
02401         if _debug:
02402             sys.stderr.write('stripping BOM\n')
02403             if encoding != 'utf-8':
02404                 sys.stderr.write('trying utf-8 instead\n')
02405         encoding = 'utf-8'
02406         data = data[3:]
02407     elif data[:4] == '\x00\x00\xfe\xff':
02408         if _debug:
02409             sys.stderr.write('stripping BOM\n')
02410             if encoding != 'utf-32be':
02411                 sys.stderr.write('trying utf-32be instead\n')
02412         encoding = 'utf-32be'
02413         data = data[4:]
02414     elif data[:4] == '\xff\xfe\x00\x00':
02415         if _debug:
02416             sys.stderr.write('stripping BOM\n')
02417             if encoding != 'utf-32le':
02418                 sys.stderr.write('trying utf-32le instead\n')
02419         encoding = 'utf-32le'
02420         data = data[4:]
02421     newdata = unicode(data, encoding)
02422     if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding)
02423     declmatch = re.compile('^<\?xml[^>]*?>')
02424     newdecl = '''<?xml version='1.0' encoding='utf-8'?>'''
02425     if declmatch.search(newdata):
02426         newdata = declmatch.sub(newdecl, newdata)
02427     else:
02428         newdata = newdecl + u'\n' + newdata
02429     return newdata.encode('utf-8')
02430 
02431 def _stripDoctype(data):
02432     '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
02433 
02434     rss_version may be 'rss091n' or None
02435     stripped_data is the same XML document, minus the DOCTYPE
02436     '''
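      # Illustrative sketch (added): a Netscape RSS 0.91 DOCTYPE flips
      # the version marker and the DOCTYPE itself is removed:
      #
      #   v, d = _stripDoctype('<!DOCTYPE rss PUBLIC '
      #       '"-//Netscape Communications//DTD RSS 0.91//EN">'
      #       '<rss version="0.91"/>')
      #   # v == 'rss091n'; d == '<rss version="0.91"/>'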
02437     entity_pattern = re.compile(r'<!ENTITY([^>]*?)>', re.MULTILINE)
02438     data = entity_pattern.sub('', data)
02439     doctype_pattern = re.compile(r'<!DOCTYPE([^>]*?)>', re.MULTILINE)
02440     doctype_results = doctype_pattern.findall(data)
02441     doctype = doctype_results and doctype_results[0] or ''
02442     if doctype.lower().count('netscape'):
02443         version = 'rss091n'
02444     else:
02445         version = None
02446     data = doctype_pattern.sub('', data)
02447     return version, data
02448     
02449 def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]):
02450     '''Parse a feed from a URL, file, stream, or string'''
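      # Typical usage (sketch; assumes the module is importable as
      # 'feedparser', and the URL is illustrative):
      #
      #   import feedparser
      #   d = feedparser.parse('http://feedparser.org/docs/examples/atom10.xml')
      #   print d.feed.title
      #   print d.entries[0].link
      #   print d.bozo            # 1 if the feed was not well-formed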
02451     result = FeedParserDict()
02452     result['feed'] = FeedParserDict()
02453     result['entries'] = []
02454     if _XML_AVAILABLE:
02455         result['bozo'] = 0
02456     if type(handlers) == types.InstanceType:
02457         handlers = [handlers]
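      # Sketch (added; host is hypothetical): the handlers argument
      # injects arbitrary urllib2 handlers, e.g. a proxy:
      #
      #   proxy = urllib2.ProxyHandler({'http': 'http://proxy.example.com:3128/'})
      #   d = parse(url, handlers=[proxy])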
02458     try:
02459         f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers)
02460         data = f.read()
02461     except Exception, e:
02462         result['bozo'] = 1
02463         result['bozo_exception'] = e
02464         data = ''
02465         f = None
02466 
02467     # if feed is gzip-compressed, decompress it
02468     if f and data and hasattr(f, 'headers'):
02469         if gzip and f.headers.get('content-encoding', '') == 'gzip':
02470             try:
02471                 data = gzip.GzipFile(fileobj=_StringIO(data)).read()
02472             except Exception, e:
02473                 # Some feeds claim to be gzipped but they're not, so
02474                 # we get garbage.  Ideally, we should re-request the
02475                 # feed without the 'Accept-encoding: gzip' header,
02476                 # but we don't.
02477                 result['bozo'] = 1
02478                 result['bozo_exception'] = e
02479                 data = ''
02480         elif zlib and f.headers.get('content-encoding', '') == 'deflate':
02481             try:
02482                 data = zlib.decompress(data, -zlib.MAX_WBITS)
02483             except Exception, e:
02484                 result['bozo'] = 1
02485                 result['bozo_exception'] = e
02486                 data = ''
02487 
02488     # save HTTP headers
02489     if hasattr(f, 'info'):
02490         info = f.info()
02491         result['etag'] = info.getheader('ETag')
02492         last_modified = info.getheader('Last-Modified')
02493         if last_modified:
02494             result['modified'] = _parse_date(last_modified)
02495     if hasattr(f, 'url'):
02496         result['href'] = f.url
02497         result['status'] = 200
02498     if hasattr(f, 'status'):
02499         result['status'] = f.status
02500     if hasattr(f, 'headers'):
02501         result['headers'] = f.headers.dict
02502     if hasattr(f, 'close'):
02503         f.close()
02504 
02505     # there are four encodings to keep track of:
02506     # - http_encoding is the encoding declared in the Content-Type HTTP header
02507     # - xml_encoding is the encoding declared in the <?xml declaration
02508     # - sniffed_encoding is the encoding sniffed from the first 4 bytes of the XML data
02509     # - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
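      # Worked example (comment added for clarity): given
      #   Content-Type: application/atom+xml; charset=iso-8859-1
      # plus <?xml version='1.0' encoding='utf-8'?> in the body, RFC 3023
      # gives the HTTP charset precedence, so result['encoding'] becomes
      # 'iso-8859-1' while xml_encoding still reports 'utf-8'.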
02510     http_headers = result.get('headers', {})
02511     result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type = \
02512         _getCharacterEncoding(http_headers, data)
02513     if http_headers and (not acceptable_content_type):
02514         if http_headers.has_key('content-type'):
02515             bozo_message = '%s is not an XML media type' % http_headers['content-type']
02516         else:
02517             bozo_message = 'no Content-type specified'
02518         result['bozo'] = 1
02519         result['bozo_exception'] = NonXMLContentType(bozo_message)
02520         
02521     result['version'], data = _stripDoctype(data)
02522 
02523     baseuri = http_headers.get('content-location', result.get('href'))
02524     baselang = http_headers.get('content-language', None)
02525 
02526     # if server sent 304, we're done
02527     if result.get('status', 0) == 304:
02528         result['version'] = ''
02529         result['debug_message'] = 'The feed has not changed since you last checked, ' + \
02530             'so the server sent no data.  This is a feature, not a bug!'
02531         return result
02532 
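      # Conditional GET sketch (added; 'url' is a placeholder): callers
      # can replay the validators from an earlier result so the server
      # can answer 304:
      #
      #   d = parse(url)
      #   d2 = parse(url, etag=d.get('etag'), modified=d.get('modified'))
      #   # d2.status == 304 and d2.entries == [] when nothing changed
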
02533     # if there was a problem downloading, we're done
02534     if not data:
02535         return result
02536 
02537     # determine character encoding
02538     use_strict_parser = 0
02539     known_encoding = 0
02540     tried_encodings = []
02541     # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
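      # Full fallback order (summary comment added): the three candidates
      # above, then chardet's guess (if that library is installed), then
      # utf-8, then windows-1252; the first encoding that _toUTF8 can
      # round-trip wins and enables the strict parser.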
02542     for proposed_encoding in (result['encoding'], xml_encoding, sniffed_xml_encoding):
02543         if not proposed_encoding: continue
02544         if proposed_encoding in tried_encodings: continue
02545         tried_encodings.append(proposed_encoding)
02546         try:
02547             data = _toUTF8(data, proposed_encoding)
02548             known_encoding = use_strict_parser = 1
02549             break
02550         except:
02551             pass
02552     # if no luck and we have auto-detection library, try that
02553     if (not known_encoding) and chardet:
02554         try:
02555             proposed_encoding = chardet.detect(data)['encoding']
02556             if proposed_encoding and (proposed_encoding not in tried_encodings):
02557                 tried_encodings.append(proposed_encoding)
02558                 data = _toUTF8(data, proposed_encoding)
02559                 known_encoding = use_strict_parser = 1
02560         except:
02561             pass
02562     # if still no luck and we haven't tried utf-8 yet, try that
02563     if (not known_encoding) and ('utf-8' not in tried_encodings):
02564         try:
02565             proposed_encoding = 'utf-8'
02566             tried_encodings.append(proposed_encoding)
02567             data = _toUTF8(data, proposed_encoding)
02568             known_encoding = use_strict_parser = 1
02569         except:
02570             pass
02571     # if still no luck and we haven't tried windows-1252 yet, try that
02572     if (not known_encoding) and ('windows-1252' not in tried_encodings):
02573         try:
02574             proposed_encoding = 'windows-1252'
02575             tried_encodings.append(proposed_encoding)
02576             data = _toUTF8(data, proposed_encoding)
02577             known_encoding = use_strict_parser = 1
02578         except:
02579             pass
02580     # if still no luck, give up
02581     if not known_encoding:
02582         result['bozo'] = 1
02583         result['bozo_exception'] = CharacterEncodingUnknown( \
02584             'document encoding unknown, I tried ' + \
02585             '%s, %s, utf-8, and windows-1252 but nothing worked' % \
02586             (result['encoding'], xml_encoding))
02587         result['encoding'] = ''
02588     elif proposed_encoding != result['encoding']:
02589         result['bozo'] = 1
02590         result['bozo_exception'] = CharacterEncodingOverride( \
02591             'document declared as %s, but parsed as %s' % \
02592             (result['encoding'], proposed_encoding))
02593         result['encoding'] = proposed_encoding
02594 
02595     if not _XML_AVAILABLE:
02596         use_strict_parser = 0
02597     if use_strict_parser:
02598         # initialize the SAX parser
02599         feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8')
02600         saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
02601         saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
02602         saxparser.setContentHandler(feedparser)
02603         saxparser.setErrorHandler(feedparser)
02604         source = xml.sax.xmlreader.InputSource()
02605         source.setByteStream(_StringIO(data))
02606         if hasattr(saxparser, '_ns_stack'):
02607             # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
02608             # PyXML doesn't have this problem, and it doesn't have _ns_stack either
02609             saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})
02610         try:
02611             saxparser.parse(source)
02612         except Exception, e:
02613             if _debug:
02614                 import traceback
02615                 traceback.print_stack()
02616                 traceback.print_exc()
02617                 sys.stderr.write('xml parsing failed\n')
02618             result['bozo'] = 1
02619             result['bozo_exception'] = feedparser.exc or e
02620             use_strict_parser = 0
02621     if not use_strict_parser:
02622         feedparser = _LooseFeedParser(baseuri, baselang, known_encoding and 'utf-8' or '')
02623         feedparser.feed(data)
02624     result['feed'] = feedparser.feeddata
02625     result['entries'] = feedparser.entries
02626     result['version'] = result['version'] or feedparser.version
02627     result['namespaces'] = feedparser.namespacesInUse
02628     return result
02629 
02630 if __name__ == '__main__':
02631     if not sys.argv[1:]:
02632         print __doc__
02633         sys.exit(0)
02634     else:
02635         urls = sys.argv[1:]
02636     zopeCompatibilityHack()
02637     from pprint import pprint
02638     for url in urls:
02639         print url
02640         print
02641         result = parse(url)
02642         pprint(result)
02643         print
02644 
02645 #REVISION HISTORY
02646 #1.0 - 9/27/2002 - MAP - fixed namespace processing on prefixed RSS 2.0 elements,
02647 #  added Simon Fell's test suite
02648 #1.1 - 9/29/2002 - MAP - fixed infinite loop on incomplete CDATA sections
02649 #2.0 - 10/19/2002
02650 #  JD - use inchannel to watch out for image and textinput elements which can
02651 #  also contain title, link, and description elements
02652 #  JD - check for isPermaLink='false' attribute on guid elements
02653 #  JD - replaced openAnything with open_resource supporting ETag and
02654 #  If-Modified-Since request headers
02655 #  JD - parse now accepts etag, modified, agent, and referrer optional
02656 #  arguments
02657 #  JD - modified parse to return a dictionary instead of a tuple so that any
02658 #  etag or modified information can be returned and cached by the caller
02659 #2.0.1 - 10/21/2002 - MAP - changed parse() so that if we don't get anything
02660 #  because of etag/modified, return the old etag/modified to the caller to
02661 #  indicate why nothing is being returned
02662 #2.0.2 - 10/21/2002 - JB - added the inchannel to the if statement; otherwise it's
02663 #  useless.  Fixes the problem JD was addressing by adding it.
02664 #2.1 - 11/14/2002 - MAP - added gzip support
02665 #2.2 - 1/27/2003 - MAP - added attribute support, admin:generatorAgent.
02666 #  start_admingeneratoragent is an example of how to handle elements with
02667 #  only attributes, no content.
02668 #2.3 - 6/11/2003 - MAP - added USER_AGENT for default (if caller doesn't specify);
02669 #  also, make sure we send the User-Agent even if urllib2 isn't available.
02670 #  Match any variation of backend.userland.com/rss namespace.
02671 #2.3.1 - 6/12/2003 - MAP - if item has both link and guid, return both as-is.
02672 #2.4 - 7/9/2003 - MAP - added preliminary Pie/Atom/Echo support based on Sam Ruby's
02673 #  snapshot of July 1 <http://www.intertwingly.net/blog/1506.html>; changed
02674 #  project name
02675 #2.5 - 7/25/2003 - MAP - changed to Python license (all contributors agree);
02676 #  removed unnecessary urllib code -- urllib2 should always be available anyway;
02677 #  return actual url, status, and full HTTP headers (as result['url'],
02678 #  result['status'], and result['headers']) if parsing a remote feed over HTTP --
02679 #  this should pass all the HTTP tests at <http://diveintomark.org/tests/client/http/>;
02680 #  added the latest namespace-of-the-week for RSS 2.0
02681 #2.5.1 - 7/26/2003 - RMK - clear opener.addheaders so we only send our custom
02682 #  User-Agent (otherwise urllib2 sends two, which confuses some servers)
02683 #2.5.2 - 7/28/2003 - MAP - entity-decode inline xml properly; added support for
02684 #  inline <xhtml:body> and <xhtml:div> as used in some RSS 2.0 feeds
02685 #2.5.3 - 8/6/2003 - TvdV - patch to track whether we're inside an image or
02686 #  textInput, and also to return the character encoding (if specified)
02687 #2.6 - 1/1/2004 - MAP - dc:author support (MarekK); fixed bug tracking
02688 #  nested divs within content (JohnD); fixed missing sys import (JohanS);
02689 #  fixed regular expression to capture XML character encoding (Andrei);
02690 #  added support for Atom 0.3-style links; fixed bug with textInput tracking;
02691 #  added support for cloud (MartijnP); added support for multiple
02692 #  category/dc:subject (MartijnP); normalize content model: 'description' gets
02693 #  description (which can come from description, summary, or full content if no
02694 #  description), 'content' gets dict of base/language/type/value (which can come
02695 #  from content:encoded, xhtml:body, content, or fullitem);
02696 #  fixed bug matching arbitrary Userland namespaces; added xml:base and xml:lang
02697 #  tracking; fixed bug tracking unknown tags; fixed bug tracking content when
02698 #  <content> element is not in default namespace (like Pocketsoap feed);
02699 #  resolve relative URLs in link, guid, docs, url, comments, wfw:comment,
02700 #  wfw:commentRSS; resolve relative URLs within embedded HTML markup in
02701 #  description, xhtml:body, content, content:encoded, title, subtitle,
02702 #  summary, info, tagline, and copyright; added support for pingback and
02703 #  trackback namespaces
02704 #2.7 - 1/5/2004 - MAP - really added support for trackback and pingback
02705 #  namespaces, as opposed to 2.6 when I said I did but didn't really;
02706 #  sanitize HTML markup within some elements; added mxTidy support (if
02707 #  installed) to tidy HTML markup within some elements; fixed indentation
02708 #  bug in _parse_date (FazalM); use socket.setdefaulttimeout if available
02709 #  (FazalM); universal date parsing and normalization (FazalM): 'created', 'modified',
02710 #  'issued' are parsed into 9-tuple date format and stored in 'created_parsed',
02711 #  'modified_parsed', and 'issued_parsed'; 'date' is duplicated in 'modified'
02712 #  and vice-versa; 'date_parsed' is duplicated in 'modified_parsed' and vice-versa
02713 #2.7.1 - 1/9/2004 - MAP - fixed bug handling &quot; and &apos;.  fixed memory
02714 #  leak not closing url opener (JohnD); added dc:publisher support (MarekK);
02715 #  added admin:errorReportsTo support (MarekK); Python 2.1 dict support (MarekK)
02716 #2.7.4 - 1/14/2004 - MAP - added workaround for improperly formed <br/> tags in
02717 #  encoded HTML (skadz); fixed unicode handling in normalize_attrs (ChrisL);
02718 #  fixed relative URI processing for guid (skadz); added ICBM support; added
02719 #  base64 support
02720 #2.7.5 - 1/15/2004 - MAP - added workaround for malformed DOCTYPE (seen on many
02721 #  blogspot.com sites); added _debug variable
02722 #2.7.6 - 1/16/2004 - MAP - fixed bug with StringIO importing
02723 #3.0b3 - 1/23/2004 - MAP - parse entire feed with real XML parser (if available);
02724 #  added several new supported namespaces; fixed bug tracking naked markup in
02725 #  description; added support for enclosure; added support for source; re-added
02726 #  support for cloud which got dropped somehow; added support for expirationDate
02727 #3.0b4 - 1/26/2004 - MAP - fixed xml:lang inheritance; fixed multiple bugs tracking
02728 #  xml:base URI, one for documents that don't define one explicitly and one for
02729 #  documents that define an outer and an inner xml:base that goes out of scope
02730 #  before the end of the document
02731 #3.0b5 - 1/26/2004 - MAP - fixed bug parsing multiple links at feed level
02732 #3.0b6 - 1/27/2004 - MAP - added feed type and version detection, result['version']
02733 #  will be one of SUPPORTED_VERSIONS.keys() or empty string if unrecognized;
02734 #  added support for creativeCommons:license and cc:license; added support for
02735 #  full Atom content model in title, tagline, info, copyright, summary; fixed bug
02736 #  with gzip encoding (not always telling server we support it when we do)
02737 #3.0b7 - 1/28/2004 - MAP - support Atom-style author element in author_detail
02738 #  (dictionary of 'name', 'url', 'email'); map author to author_detail if author
02739 #  contains name + email address
02740 #3.0b8 - 1/28/2004 - MAP - added support for contributor
02741 #3.0b9 - 1/29/2004 - MAP - fixed check for presence of dict function; added
02742 #  support for summary
02743 #3.0b10 - 1/31/2004 - MAP - incorporated ISO-8601 date parsing routines from
02744 #  xml.util.iso8601
02745 #3.0b11 - 2/2/2004 - MAP - added 'rights' to list of elements that can contain
02746 #  dangerous markup; fiddled with decodeEntities (not right); liberalized
02747 #  date parsing even further
02748 #3.0b12 - 2/6/2004 - MAP - fiddled with decodeEntities (still not right);
02749 #  added support to Atom 0.2 subtitle; added support for Atom content model
02750 #  in copyright; better sanitizing of dangerous HTML elements with end tags
02751 #  (script, frameset)
02752 #3.0b13 - 2/8/2004 - MAP - better handling of empty HTML tags (br, hr, img,
02753 #  etc.) in embedded markup, in either HTML or XHTML form (<br>, <br/>, <br />)
02754 #3.0b14 - 2/8/2004 - MAP - fixed CDATA handling in non-wellformed feeds under
02755 #  Python 2.1
02756 #3.0b15 - 2/11/2004 - MAP - fixed bug resolving relative links in wfw:commentRSS;
02757 #  fixed bug capturing author and contributor URL; fixed bug resolving relative
02758 #  links in author and contributor URL; fixed bug resolving relative links in
02759 #  generator URL; added support for recognizing RSS 1.0; passed Simon Fell's
02760 #  namespace tests, and included them permanently in the test suite with his
02761 #  permission; fixed namespace handling under Python 2.1
02762 #3.0b16 - 2/12/2004 - MAP - fixed support for RSS 0.90 (broken in b15)
02763 #3.0b17 - 2/13/2004 - MAP - determine character encoding as per RFC 3023
02764 #3.0b18 - 2/17/2004 - MAP - always map description to summary_detail (Andrei);
02765 #  use libxml2 (if available)
02766 #3.0b19 - 3/15/2004 - MAP - fixed bug exploding author information when author
02767 #  name was in parentheses; removed ultra-problematic mxTidy support; patch to
02768 #  workaround crash in PyXML/expat when encountering invalid entities
02769 #  (MarkMoraes); support for textinput/textInput
02770 #3.0b20 - 4/7/2004 - MAP - added CDF support
02771 #3.0b21 - 4/14/2004 - MAP - added Hot RSS support
02772 #3.0b22 - 4/19/2004 - MAP - changed 'channel' to 'feed', 'item' to 'entries' in
02773 #  results dict; changed results dict to allow getting values with results.key
02774 #  as well as results[key]; work around embedded illformed HTML with half
02775 #  a DOCTYPE; work around malformed Content-Type header; if character encoding
02776 #  is wrong, try several common ones before falling back to regexes (if this
02777 #  works, bozo_exception is set to CharacterEncodingOverride); fixed character
02778 #  encoding issues in BaseHTMLProcessor by tracking encoding and converting
02779 #  from Unicode to raw strings before feeding data to sgmllib.SGMLParser;
02780 #  convert each value in results to Unicode (if possible), even if using
02781 #  regex-based parsing
02782 #3.0b23 - 4/21/2004 - MAP - fixed UnicodeDecodeError for feeds that contain
02783 #  high-bit characters in attributes in embedded HTML in description (thanks
02784 #  Thijs van de Vossen); moved guid, date, and date_parsed to mapped keys in
02785 #  FeedParserDict; tweaked FeedParserDict.has_key to return True if asking
02786 #  about a mapped key
02787 #3.0fc1 - 4/23/2004 - MAP - made results.entries[0].links[0] and
02788 #  results.entries[0].enclosures[0] into FeedParserDict; fixed typo that could
02789 #  cause the same encoding to be tried twice (even if it failed the first time);
02790 #  fixed DOCTYPE stripping when DOCTYPE contained entity declarations;
02791 #  better textinput and image tracking in illformed RSS 1.0 feeds
02792 #3.0fc2 - 5/10/2004 - MAP - added and passed Sam's amp tests; added and passed
02793 #  my blink tag tests
02794 #3.0fc3 - 6/18/2004 - MAP - fixed bug in _changeEncodingDeclaration that
02795 #  failed to parse utf-16 encoded feeds; made source into a FeedParserDict;
02796 #  duplicate admin:generatorAgent/@rdf:resource in generator_detail.url;
02797 #  added support for image; refactored parse() fallback logic to try other
02798 #  encodings if SAX parsing fails (previously it would only try other encodings
02799 #  if re-encoding failed); remove unichr madness in normalize_attrs now that
02800 #  we're properly tracking encoding in and out of BaseHTMLProcessor; set
02801 #  feed.language from root-level xml:lang; set entry.id from rdf:about;
02802 #  send Accept header
02803 #3.0 - 6/21/2004 - MAP - don't try iso-8859-1 (can't distinguish between
02804 #  iso-8859-1 and windows-1252 anyway, and most incorrectly marked feeds are
02805 #  windows-1252); fixed regression that could cause the same encoding to be
02806 #  tried twice (even if it failed the first time)
02807 #3.0.1 - 6/22/2004 - MAP - default to us-ascii for all text/* content types;
02808 #  recover from malformed content-type header parameter with no equals sign
02809 #  ('text/xml; charset:iso-8859-1')
02810 #3.1 - 6/28/2004 - MAP - added and passed tests for converting HTML entities
02811 #  to Unicode equivalents in illformed feeds (aaronsw); added and
02812 #  passed tests for converting character entities to Unicode equivalents
02813 #  in illformed feeds (aaronsw); test for valid parsers when setting
02814 #  XML_AVAILABLE; make version and encoding available when server returns
02815 #  a 304; add handlers parameter to pass arbitrary urllib2 handlers (like
02816 #  digest auth or proxy support); add code to parse username/password
02817 #  out of url and send as basic authentication; expose downloading-related
02818 #  exceptions in bozo_exception (aaronsw); added __contains__ method to
02819 #  FeedParserDict (aaronsw); added publisher_detail (aaronsw)
02820 #3.2 - 7/3/2004 - MAP - use cjkcodecs and iconv_codec if available; always
02821 #  convert feed to UTF-8 before passing to XML parser; completely revamped
02822 #  logic for determining character encoding and attempting XML parsing
02823 #  (much faster); increased default timeout to 20 seconds; test for presence
02824 #  of Location header on redirects; added tests for many alternate character
02825 #  encodings; support various EBCDIC encodings; support UTF-16BE and
02826 #  UTF-16LE with or without a BOM; support UTF-8 with a BOM; support
02827 #  UTF-32BE and UTF-32LE with or without a BOM; fixed crashing bug if no
02828 #  XML parsers are available; added support for 'Content-encoding: deflate';
02829 #  send blank 'Accept-encoding: ' header if neither gzip nor zlib modules
02830 #  are available
02831 #3.3 - 7/15/2004 - MAP - optimize EBCDIC to ASCII conversion; fix obscure
02832 #  problem tracking xml:base and xml:lang if element declares it, child
02833 #  doesn't, first grandchild redeclares it, and second grandchild doesn't;
02834 #  refactored date parsing; defined public registerDateHandler so callers
02835 #  can add support for additional date formats at runtime; added support
02836 #  for OnBlog, Nate, MSSQL, Greek, and Hungarian dates (ytrewq1); added
02837 #  zopeCompatibilityHack() which turns FeedParserDict into a regular
02838 #  dictionary, required for Zope compatibility, and also makes command-
02839 #  line debugging easier because pprint module formats real dictionaries
02840 #  better than dictionary-like objects; added NonXMLContentType exception,
02841 #  which is stored in bozo_exception when a feed is served with a non-XML
02842 #  media type such as 'text/plain'; respect Content-Language as default
02843 #  language if not xml:lang is present; cloud dict is now FeedParserDict;
02844 #  generator dict is now FeedParserDict; better tracking of xml:lang,
02845 #  including support for xml:lang='' to unset the current language;
02846 #  recognize RSS 1.0 feeds even when RSS 1.0 namespace is not the default
02847 #  namespace; don't overwrite final status on redirects (scenarios:
02848 #  redirecting to a URL that returns 304, redirecting to a URL that
02849 #  redirects to another URL with a different type of redirect); add
02850 #  support for HTTP 303 redirects
02851 #4.0 - MAP - support for relative URIs in xml:base attribute; fixed
02852 #  encoding issue with mxTidy (phopkins); preliminary support for RFC 3229;
02853 #  support for Atom 1.0; support for iTunes extensions; new 'tags' for
02854 #  categories/keywords/etc. as array of dict
02855 #  {'term': term, 'scheme': scheme, 'label': label} to match Atom 1.0
02856 #  terminology; parse RFC 822-style dates with no time; lots of other
02857 #  bug fixes
02858 #4.1 - MAP - removed socket timeout; added support for chardet library