Back to index

plone3  3.1.7
BeautifulSoup.py
Go to the documentation of this file.
00001 """Beautiful Soup
00002 Elixir and Tonic
00003 "The Screen-Scraper's Friend"
00004 http://www.crummy.com/software/BeautifulSoup/
00005 
00006 Beautiful Soup parses a (possibly invalid) XML or HTML document into a
00007 tree representation. It provides methods and Pythonic idioms that make
00008 it easy to navigate, search, and modify the tree.
00009 
00010 A well-structured XML/HTML document yields a well-behaved data
00011 structure. An ill-structured XML/HTML document yields a
00012 correspondingly ill-behaved data structure. If your document is only
00013 locally well-structured, you can use this library to find and process
00014 the well-structured part of it.
00015 
00016 Beautiful Soup works with Python 2.2 and up. It has no external
00017 dependencies, but you'll have more success at converting data to UTF-8
00018 if you also install these three packages:
00019 
00020 * chardet, for auto-detecting character encodings
00021   http://chardet.feedparser.org/
00022 * cjkcodecs and iconv_codec, which add more encodings to the ones supported
00023   by stock Python.
00024   http://cjkpython.i18n.org/
00025 
00026 Beautiful Soup defines classes for two main parsing strategies:
00027     
00028  * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
00029    language that kind of looks like XML.
00030 
00031  * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
00032    or invalid. This class has web browser-like heuristics for
00033    obtaining a sensible parse tree in the face of common HTML errors.
00034 
00035 Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
00036 the encoding of an HTML or XML document, and converting it to
00037 Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed
00038 Parser.
00039 
00040 For more than you ever wanted to know about Beautiful Soup, see the
00041 documentation:
00042 http://www.crummy.com/software/BeautifulSoup/documentation.html
00043 """
00044 from __future__ import generators
00045 
00046 __author__ = "Leonard Richardson (crummy.com)"
00047 __contributors__ = ["Sam Ruby (intertwingly.net)",
00048                     "the unwitting Mark Pilgrim (diveintomark.org)",
00049                     "http://www.crummy.com/software/BeautifulSoup/AUTHORS.html"]
00050 __version__ = "3.0.3"
00051 __copyright__ = "Copyright (c) 2004-2006 Leonard Richardson"
00052 __license__ = "PSF"
00053 
00054 from sgmllib import SGMLParser, SGMLParseError
00055 import codecs
00056 import types
00057 import re
00058 import sgmllib
00059 from htmlentitydefs import name2codepoint
00060 
# This RE makes Beautiful Soup able to parse XML with namespaces: the
# replacement tag-name pattern accepts ':' (as well as '-', '_' and '.')
# after the leading letter.
# NOTE: this rebinds an attribute on the sgmllib module object itself,
# so the change is visible to any other code using sgmllib in the same
# process, not just to this module.
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')

# This RE makes Beautiful Soup capable of recognizing numeric character
# references that use hexadecimal (e.g. "&#xE9;") in addition to the
# decimal form (e.g. "&#233;"). Like tagfind above, it is installed on
# the sgmllib module itself.
sgmllib.charref = re.compile('&#(\d+|x[0-9a-fA-F]+);')

# Encoding used as the default 'encoding' argument by the __str__ /
# rendering methods of the markup classes defined below.
DEFAULT_OUTPUT_ENCODING = "utf-8"
00069 
00070 # First, the classes that represent markup elements.
00071 
class PageElement:
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text).

    Every element keeps five links, all of which are None when there is
    nothing to point at:

     * parent          - the element whose 'contents' list holds this one
     * previousSibling - the element just before this one in that list
     * nextSibling     - the element just after this one in that list
     * previous        - the element visited just before this one in
                         document order
     * next            - the element visited just after this one in
                         document order

    The tree-editing methods look elements up by identity ('is'), never
    by equality, because distinct elements can compare equal (tags with
    the same structure, strings with the same text).
    """

    def setup(self, parent=None, previous=None):
        """Sets up the initial relations between this element and
        other elements.

        If 'parent' already has contents, its current last child
        becomes this element's previous sibling. This method does not
        add the element to parent.contents.
        """
        self.parent = parent
        self.previous = previous
        self.next = None
        self.previousSibling = None
        self.nextSibling = None
        if self.parent and self.parent.contents:
            self.previousSibling = self.parent.contents[-1]
            self.previousSibling.nextSibling = self

    def _identityIndex(self, sequence, element):
        """Returns the index of 'element' in 'sequence', comparing by
        identity rather than equality, or None if it is not there."""
        for i in range(len(sequence)):
            if sequence[i] is element:
                return i
        return None

    def replaceWith(self, replaceWith):
        """Removes this element from the tree and puts 'replaceWith' in
        the slot it occupied.

        'replaceWith' may be a plain string, a new element, or an
        element that already lives somewhere in a tree (including a
        sibling of this element); in the latter case it is moved.

        Raises AttributeError if this element has no parent, and
        ValueError if the parent's contents do not include it.
        """
        oldParent = self.parent
        myIndex = self._identityIndex(oldParent.contents, self)
        if myIndex is None:
            raise ValueError("element is not in its parent's contents")
        self.extract()
        # insert() compensates by itself when the replacement is an
        # earlier sibling, so the slot index is passed unadjusted.
        oldParent.insert(myIndex, replaceWith)

    def extract(self):
        """Destructively rips this element out of the tree.

        The element (together with everything beneath it) is removed
        from its parent's contents, and the document-order and sibling
        links around it are joined up across the gap.
        """
        parent = self.parent
        if parent is not None:
            # Best effort, as before: an element whose parent does not
            # list it is still unlinked below.
            index = self._identityIndex(parent.contents, self)
            if index is not None:
                del parent.contents[index]

        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
        #the two.
        lastChild = self._lastRecursiveChild()
        nextElement = lastChild.next

        if self.previous is not None:
            self.previous.next = nextElement
        if nextElement is not None:
            nextElement.previous = self.previous
        self.previous = None
        lastChild.next = None

        self.parent = None
        if self.previousSibling is not None:
            self.previousSibling.nextSibling = self.nextSibling
        if self.nextSibling is not None:
            self.nextSibling.previousSibling = self.previousSibling
        self.previousSibling = self.nextSibling = None

    def _lastRecursiveChild(self):
        "Finds the last element beneath this object to be parsed."
        lastChild = self
        while hasattr(lastChild, 'contents') and lastChild.contents:
            lastChild = lastChild.contents[-1]
        return lastChild

    def insert(self, position, newChild):
        """Inserts 'newChild' into this element's contents so that it
        ends up in front of the child currently at index 'position'.

        A plain string is wrapped in a NavigableString first. A
        position past the end appends. If 'newChild' already belongs to
        a tree it is extracted first; when it is an earlier child of
        this very element, the target index is shifted down by one to
        account for its removal.
        """
        # Anything that is already a page element goes in as it is;
        # only bare strings need wrapping.
        if not isinstance(newChild, PageElement) \
           and isinstance(newChild, basestring):
            newChild = NavigableString(newChild)

        position = min(position, len(self.contents))
        if getattr(newChild, 'parent', None) is not None:
            # We're 'inserting' an element that's already in a tree.
            if newChild.parent is self:
                index = self._identityIndex(self.contents, newChild)
                if index is not None and index < position:
                    # Furthermore we're moving it further down the
                    # list of this object's children. That means that
                    # when we extract this element, our target index
                    # will jump down one.
                    position = position - 1
            newChild.extract()

        newChild.parent = self
        previousChild = None
        if position == 0:
            newChild.previousSibling = None
            newChild.previous = self
        else:
            previousChild = self.contents[position-1]
            newChild.previousSibling = previousChild
            newChild.previousSibling.nextSibling = newChild
            newChild.previous = previousChild._lastRecursiveChild()
        if newChild.previous is not None:
            newChild.previous.next = newChild

        newChildsLastElement = newChild._lastRecursiveChild()

        if position >= len(self.contents):
            newChild.nextSibling = None

            # The new child is the last thing inside this element, so
            # what follows it in document order is the next sibling of
            # the nearest ancestor that has one.
            parent = self
            parentsNextSibling = None
            while parentsNextSibling is None:
                parentsNextSibling = parent.nextSibling
                parent = parent.parent
                if parent is None: # This is the last element in the document.
                    break
            newChildsLastElement.next = parentsNextSibling
        else:
            nextChild = self.contents[position]
            newChild.nextSibling = nextChild
            if nextChild is not None:
                nextChild.previousSibling = newChild
            newChildsLastElement.next = nextChild

        if newChildsLastElement.next is not None:
            newChildsLastElement.next.previous = newChildsLastElement
        self.contents.insert(position, newChild)

    def findNext(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document."""
        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)

    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.nextGenerator,
                             **kwargs)

    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document."""
        return self._findOne(self.findNextSiblings, name, attrs, text,
                             **kwargs)

    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
                         **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.nextSiblingGenerator, **kwargs)
    fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x

    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document."""
        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)

    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
                        **kwargs):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.previousGenerator,
                           **kwargs)
    fetchPrevious = findAllPrevious # Compatibility with pre-3.x

    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document."""
        return self._findOne(self.findPreviousSiblings, name, attrs, text,
                             **kwargs)

    def findPreviousSiblings(self, name=None, attrs={}, text=None,
                             limit=None, **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.previousSiblingGenerator, **kwargs)
    fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x

    def findParent(self, name=None, attrs={}, **kwargs):
        """Returns the closest parent of this Tag that matches the given
        criteria."""
        # NOTE: We can't use _findOne because findParents takes a different
        # set of arguments.
        r = None
        l = self.findParents(name, attrs, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
        """Returns the parents of this Tag that match the given
        criteria."""

        return self._findAll(name, attrs, None, limit, self.parentGenerator,
                             **kwargs)
    fetchParents = findParents # Compatibility with pre-3.x

    #These methods do the real heavy lifting.

    def _findOne(self, method, name, attrs, text, **kwargs):
        """Calls the given find-all style 'method' with a limit of 1 and
        returns its single result, or None if there was no match."""
        r = None
        l = method(name, attrs, text, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
        """Iterates over a generator looking for things that match.

        'name' may be a ready-made SoupStrainer; otherwise one is built
        from name, attrs, text and the keyword arguments. Matches are
        collected in a ResultSet, stopping early once 'limit' (if any)
        results have been found.
        """

        if isinstance(name, SoupStrainer):
            strainer = name
        else:
            # Build a SoupStrainer
            strainer = SoupStrainer(name, attrs, text, **kwargs)
        results = ResultSet(strainer)
        for i in generator():
            # Skips the None that the navigation generators yield last.
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    #These Generators can be used to navigate starting from both
    #NavigableStrings and Tags.
    #
    #Each one follows a single kind of link away from this element and
    #yields every element it reaches, followed by one final None when
    #the links run out. The loops test 'is not None' so that an element
    #that happens to be false (an empty string) does not cut the walk
    #short.
    def nextGenerator(self):
        i = self
        while i is not None:
            i = i.next
            yield i

    def nextSiblingGenerator(self):
        i = self
        while i is not None:
            i = i.nextSibling
            yield i

    def previousGenerator(self):
        i = self
        while i is not None:
            i = i.previous
            yield i

    def previousSiblingGenerator(self):
        i = self
        while i is not None:
            i = i.previousSibling
            yield i

    def parentGenerator(self):
        i = self
        while i is not None:
            i = i.parent
            yield i

    # Utility methods
    def substituteEncoding(self, str, encoding=None):
        """Returns 'str' with every "%SOUP-ENCODING%" placeholder
        replaced by the name of the encoding ("utf-8" if none given)."""
        encoding = encoding or "utf-8"
        return str.replace("%SOUP-ENCODING%", encoding)

    def toEncoding(self, s, encoding=None):
        """Encodes an object to a string in some encoding, or to Unicode.

        With an encoding, a string (or the str() of any other object)
        is returned through its encode() method; without one, the
        result is a unicode object.
        """
        if isinstance(s, unicode):
            if encoding:
                s = s.encode(encoding)
        elif isinstance(s, str):
            if encoding:
                s = s.encode(encoding)
            else:
                s = unicode(s)
        else:
            if encoding:
                s  = self.toEncoding(str(s), encoding)
            else:
                s = unicode(s)
        return s
00354 
class NavigableString(unicode, PageElement):
    """A piece of text found in the document: a unicode string that
    also carries the navigation links of a PageElement."""

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper.

        Any other missing attribute raises AttributeError.
        """
        if attr == 'string':
            return self
        else:
            raise AttributeError("'%s' object has no attribute '%s'"
                                 % (self.__class__.__name__, attr))

    def __unicode__(self):
        """Returns the rendered form of this string as Unicode, i.e.
        the result of __str__ with no encoding."""
        # Called through self so that subclasses which override
        # __str__ are rendered their own way.
        return self.__str__(None)

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Returns this string encoded in 'encoding', or the string
        itself (still Unicode) if 'encoding' is None or empty."""
        if encoding:
            return self.encode(encoding)
        else:
            return self
00374         
class CData(NavigableString):
    """Text that is written back out inside a CDATA section."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Returns the text, rendered by NavigableString.__str__ in the
        given encoding, wrapped in a CDATA section."""
        rendered = NavigableString.__str__(self, encoding)
        return "<![CDATA[%s]]>" % rendered
00379 
class ProcessingInstruction(NavigableString):
    """Text that is written back out as a processing instruction."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Returns the text between "<?" and "?>", converted with
        toEncoding. A "%SOUP-ENCODING%" placeholder in the text is
        first replaced through substituteEncoding."""
        text = self
        needsSubstitution = "%SOUP-ENCODING%" in text
        if needsSubstitution:
            text = self.substituteEncoding(text, encoding)
        rendered = self.toEncoding(text, encoding)
        return "<?%s?>" % rendered
00386 
class Comment(NavigableString):
    """Text that is written back out as a markup comment."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Returns the text, rendered by NavigableString.__str__ in the
        given encoding, between "<!--" and "-->"."""
        rendered = NavigableString.__str__(self, encoding)
        return "<!--%s-->" % rendered
00390 
class Declaration(NavigableString):
    """Text that is written back out as a markup declaration."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Returns the text, rendered by NavigableString.__str__ in the
        given encoding, between "<!" and ">"."""
        rendered = NavigableString.__str__(self, encoding)
        return "<!%s>" % rendered
00394 
class Tag(PageElement):
    """Represents a found HTML tag with its attributes and contents.

    Attributes are kept twice: 'attrs' is the list of (key, value)
    pairs in document order (it may hold the same key more than once),
    and 'attrMap' is a dictionary built from that list on first use by
    _getAttrMap(). Child elements are held in the 'contents' list.
    """

    XML_ENTITIES_TO_CHARS = { 'apos' : "'",
                              "quot" : '"',
                              "amp" : "&",
                              "lt" : "<",
                              "gt" : ">"
                              }
    # An RE for finding ampersands that aren't the start of a numeric
    # or named entity reference.
    BARE_AMPERSAND = re.compile(r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")

    def __init__(self, parser, name, attrs=None, parent=None,
                 previous=None):
        """Basic constructor.

        'parser' is consulted for isSelfClosingTag(name) and for its
        convertHTMLEntities setting; 'attrs' is a list of (key, value)
        pairs; 'parent' and 'previous' are handed to setup().
        """

        # We don't actually store the parser object: that lets extracted
        # chunks be garbage-collected
        self.parserClass = parser.__class__
        self.isSelfClosing = parser.isSelfClosingTag(name)
        self.convertHTMLEntities = parser.convertHTMLEntities
        self.name = name
        if attrs is None:
            attrs = []
        self.attrs = attrs
        self.contents = []
        self.setup(parent, previous)
        self.hidden = False
        self.containsSubstitutions = False

    def get(self, key, default=None):
        """Returns the value of the 'key' attribute for the tag, or
        the value given for 'default' if it doesn't have that
        attribute."""
        return self._getAttrMap().get(key, default)

    def has_key(self, key):
        "Returns whether the tag has a 'key' attribute."
        return key in self._getAttrMap()

    def __getitem__(self, key):
        """tag[key] returns the value of the 'key' attribute for the tag,
        and throws an exception if it's not there."""
        return self._getAttrMap()[key]

    def __iter__(self):
        "Iterating over a tag iterates over its contents."
        return iter(self.contents)

    def __len__(self):
        "The length of a tag is the length of its list of contents."
        return len(self.contents)

    def __contains__(self, x):
        "'x in tag' tests whether x is one of the tag's direct contents."
        return x in self.contents

    def __nonzero__(self):
        "A tag is non-None even if it has no contents."
        return True

    def __setitem__(self, key, value):
        """Setting tag[key] sets the value of the 'key' attribute for the
        tag.

        Every existing (key, ...) pair in 'attrs' is overwritten; if
        there is none, a new pair is appended."""
        self._getAttrMap()[key] = value
        found = False
        for i in range(0, len(self.attrs)):
            if self.attrs[i][0] == key:
                self.attrs[i] = (key, value)
                found = True
        if not found:
            self.attrs.append((key, value))

    def __delitem__(self, key):
        "Deleting tag[key] deletes all 'key' attributes for the tag."
        # Bad HTML can define the same attribute multiple times, so
        # every matching pair has to go. The list is rebuilt (in place)
        # rather than edited while it is being walked, which would skip
        # the pair following each removed one.
        self.attrs[:] = [item for item in self.attrs if item[0] != key]
        attrMap = self._getAttrMap()
        if key in attrMap:
            del attrMap[key]

    def __call__(self, *args, **kwargs):
        """Calling a tag like a function is the same as calling its
        findAll() method. Eg. tag('a') returns a list of all the A tags
        found within this tag."""
        return self.findAll(*args, **kwargs)

    def __getattr__(self, tag):
        """tag.foo and tag.fooTag both return the first 'foo' tag found
        beneath this tag, or None if there is none.

        Names that begin with a double underscore are never treated as
        tag names: they raise AttributeError, so that Python's own
        special-method lookups see a normal 'no such attribute'.
        """
        #print "Getattr %s.%s" % (self.__class__, tag)
        if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
            return self.find(tag[:-3])
        elif tag.find('__') != 0:
            return self.find(tag)
        raise AttributeError("'%s' object has no attribute '%s'"
                             % (self.__class__.__name__, tag))

    def __eq__(self, other):
        """Returns true iff this tag has the same name, the same attributes,
        and the same contents (recursively) as the given tag.

        NOTE: right now this will return false if two tags have the
        same attributes in a different order. Should this be fixed?"""
        if not hasattr(other, 'name') or not hasattr(other, 'attrs') \
           or not hasattr(other, 'contents') or self.name != other.name \
           or self.attrs != other.attrs or len(self) != len(other):
            return False
        for i in range(0, len(self.contents)):
            if self.contents[i] != other.contents[i]:
                return False
        return True

    def __ne__(self, other):
        """Returns true iff this tag is not identical to the other tag,
        as defined in __eq__."""
        return not self == other

    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Renders this tag as a string."""
        return self.__str__(encoding)

    def __unicode__(self):
        "Renders this tag as a Unicode string."
        return self.__str__(None)

    def _convertEntities(self, match):
        """Substitution callback for named entity references in
        attribute values; match.group(1) is the entity name.

        The five XML entities are left as references, so the markup
        characters they stand for (quotes in particular) cannot leak
        into the rendered attribute. Any other known HTML entity
        becomes the character it names, and an unknown name has its
        ampersand escaped."""
        x = match.group(1)
        if x in self.XML_ENTITIES_TO_CHARS:
            return '&%s;' % x
        elif x in name2codepoint:
            return unichr(name2codepoint[x])
        else:
            return '&amp;%s;' % x

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
                prettyPrint=False, indentLevel=0):
        """Returns a string or Unicode representation of this tag and
        its contents. To get Unicode, pass None for encoding.

        With prettyPrint, tags and text go on lines of their own,
        indented according to indentLevel. A hidden tag renders as its
        contents only.

        NOTE: since Python's HTML parser consumes whitespace, this
        method is not certain to reproduce the whitespace present in
        the original string."""

        encodedName = self.toEncoding(self.name, encoding)

        attrs = []
        if self.attrs:
            for key, val in self.attrs:
                fmt = '%s="%s"'
                if isString(val):
                    if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
                        val = self.substituteEncoding(val, encoding)

                    # The attribute value either:
                    #
                    # * Contains no embedded double quotes or single quotes.
                    #   No problem: we enclose it in double quotes.
                    # * Contains embedded single quotes. No problem:
                    #   double quotes work here too.
                    # * Contains embedded double quotes. No problem:
                    #   we enclose it in single quotes.
                    # * Embeds both single _and_ double quotes. This
                    #   can't happen naturally, but it can happen if
                    #   you modify an attribute value after parsing
                    #   the document. Now we have a bit of a
                    #   problem. We solve it by enclosing the
                    #   attribute in double quotes, and escaping any
                    #   embedded double quotes to XML entities.
                    if '"' in val:
                        if "'" in val:
                            val = val.replace('"', "&quot;")
                        else:
                            fmt = "%s='%s'"

                    # Optionally convert any HTML entities
                    if self.convertHTMLEntities:
                        val = re.sub(r"&(\w+);", self._convertEntities, val)

                    # Now we're okay w/r/t quotes. But the attribute
                    # value might also contain angle brackets, or
                    # ampersands that aren't part of entities. We need
                    # to escape those to XML entities too.
                    val = val.replace("<", "&lt;").replace(">", "&gt;")
                    val = self.BARE_AMPERSAND.sub("&amp;", val)

                attrs.append(fmt % (self.toEncoding(key, encoding),
                                    self.toEncoding(val, encoding)))
        close = ''
        closeTag = ''
        if self.isSelfClosing:
            close = ' /'
        else:
            closeTag = '</%s>' % encodedName

        indentTag, indentContents = 0, 0
        if prettyPrint:
            indentTag = indentLevel
            space = (' ' * (indentTag-1))
            indentContents = indentTag + 1
        contents = self.renderContents(encoding, prettyPrint, indentContents)
        if self.hidden:
            s = contents
        else:
            s = []
            attributeString = ''
            if attrs:
                attributeString = ' ' + ' '.join(attrs)
            if prettyPrint:
                s.append(space)
            s.append('<%s%s%s>' % (encodedName, attributeString, close))
            if prettyPrint:
                s.append("\n")
            s.append(contents)
            if prettyPrint and contents and contents[-1] != "\n":
                s.append("\n")
            if prettyPrint and closeTag:
                s.append(space)
            s.append(closeTag)
            if prettyPrint and closeTag and self.nextSibling:
                s.append("\n")
            s = ''.join(s)
        return s

    def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
        "Renders this tag with pretty-printing switched on."
        return self.__str__(encoding, True)

    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                       prettyPrint=False, indentLevel=0):
        """Renders the contents of this tag as a string in the given
        encoding. If encoding is None, returns a Unicode string."""
        s=[]
        for c in self:
            text = None
            if isinstance(c, NavigableString):
                text = c.__str__(encoding)
            elif isinstance(c, Tag):
                s.append(c.__str__(encoding, prettyPrint, indentLevel))
            if text and prettyPrint:
                text = text.strip()
            if text:
                if prettyPrint:
                    s.append(" " * (indentLevel-1))
                s.append(text)
                if prettyPrint:
                    s.append("\n")
        return ''.join(s)

    #Soup methods

    def find(self, name=None, attrs={}, recursive=True, text=None,
             **kwargs):
        """Return only the first child of this Tag matching the given
        criteria."""
        r = None
        l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
        if l:
            r = l[0]
        return r
    findChild = find

    def findAll(self, name=None, attrs={}, recursive=True, text=None,
                limit=None, **kwargs):
        """Extracts a list of Tag objects that match the given
        criteria.  You can specify the name of the Tag and any
        attributes you want the Tag to have.

        The value of a key-value pair in the 'attrs' map can be a
        string, a list of strings, a regular expression object, or a
        callable that takes a string and returns whether or not the
        string matches for some custom definition of 'matches'. The
        same is true of the tag name."""
        generator = self.recursiveChildGenerator
        if not recursive:
            generator = self.childGenerator
        return self._findAll(name, attrs, text, limit, generator, **kwargs)
    findChildren = findAll

    # Pre-3.x compatibility methods
    first = find
    fetch = findAll

    def fetchText(self, text=None, recursive=True, limit=None):
        "Pre-3.x spelling of findAll(text=...)."
        return self.findAll(text=text, recursive=recursive, limit=limit)

    def firstText(self, text=None, recursive=True):
        "Pre-3.x spelling of find(text=...)."
        return self.find(text=text, recursive=recursive)

    #Utility methods

    def append(self, tag):
        """Appends the given tag to the contents of this tag.

        NOTE: this only adds to the 'contents' list. Unlike insert(),
        it leaves the element's parent, sibling and next/previous links
        exactly as they are."""
        self.contents.append(tag)

    #Private methods

    def _getAttrMap(self):
        """Initializes a map representation of this tag's attributes,
        if not already initialized."""
        # Read straight from the instance dictionary: a plain attribute
        # access on a missing 'attrMap' would end up in __getattr__ and
        # be treated as a search for a child tag of that name.
        attrMap = self.__dict__.get('attrMap')
        if not attrMap:
            attrMap = {}
            for (key, value) in self.attrs:
                attrMap[key] = value
            self.attrMap = attrMap
        return attrMap

    #Generator methods
    def childGenerator(self):
        "Yields the direct children of this tag, in order."
        for i in range(0, len(self.contents)):
            yield self.contents[i]

    def recursiveChildGenerator(self):
        """Yields everything beneath this tag in document order: each
        child, followed by that child's own descendants, before moving
        on to the next child."""
        # Each stack entry is (tag, index of the next child to visit),
        # which lets the walk pick a parent up again after finishing
        # one of its children.
        stack = [(self, 0)]
        while stack:
            tag, start = stack.pop()
            if isinstance(tag, Tag):
                for i in range(start, len(tag.contents)):
                    a = tag.contents[i]
                    yield a
                    if isinstance(a, Tag):
                        if i < len(tag.contents) - 1:
                            stack.append((tag, i+1))
                        stack.append((a, 0))
                        break
00720 
00721 # Next, a couple classes to represent queries and their results.
class SoupStrainer:
    """Encapsulates a number of ways of matching a markup element (tag or
    text).

    A strainer holds up to three criteria: a tag name, a map of
    attribute criteria, and a text criterion. Each individual criterion
    may be True (anything that is not None matches), a callable, a
    compiled regular expression, a list or map of acceptable values, or
    a plain value that is compared as a string."""

    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        """Stores the criteria.

        name   -- criterion for the tag name.
        attrs  -- map of attribute name to criterion. A string is
                  shorthand for a criterion on the 'class' attribute.
        text   -- criterion for text nodes.
        kwargs -- further attribute criteria, merged over a copy of
                  'attrs'."""
        self.name = name
        if isString(attrs):
            kwargs['class'] = attrs
            attrs = None
        if kwargs:
            if attrs:
                # Copy first so the caller's map is not modified.
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs
        if attrs is None:
            # searchTag() iterates over self.attrs unconditionally, so
            # "no attribute criteria" has to be an empty map, not None.
            attrs = {}
        self.attrs = attrs
        self.text = text

    def __str__(self):
        """Returns the text criterion if there is one, and
        "name|attrs" otherwise."""
        if self.text:
            if isString(self.text):
                return self.text
            # The text criterion may be a regexp, a callable, a list or
            # True; __str__ must still hand back a string.
            return "%s" % (self.text,)
        else:
            return "%s|%s" % (self.name, self.attrs)

    def searchTag(self, markupName=None, markupAttrs={}):
        """Checks a tag against the name and attribute criteria.

        markupName may be a Tag object, in which case the Tag is also
        used as the source of attribute values, or a tag name that is
        paired with markupAttrs (a map, or a sequence of (key, value)
        pairs). Returns the Tag or the name on a match, else None."""
        found = None
        markup = None
        if isinstance(markupName, Tag):
            markup = markupName
            markupAttrs = markup
        # A callable name criterion is only handed (name, attrs) when
        # there is no Tag object to test; with a Tag it goes through
        # _matches() like any other criterion.
        callFunctionWithTagData = callable(self.name) \
                                and not isinstance(markupName, Tag)

        if (not self.name) \
               or callFunctionWithTagData \
               or (markup and self._matches(markup, self.name)) \
               or (not markup and self._matches(markupName, self.name)):
            if callFunctionWithTagData:
                match = self.name(markupName, markupAttrs)
            else:
                match = True
                markupAttrMap = None
                for attr, matchAgainst in self.attrs.items():
                    if not markupAttrMap:
                        # Build the attribute lookup lazily, the first
                        # time an attribute criterion needs it.
                        if hasattr(markupAttrs, 'get'):
                            markupAttrMap = markupAttrs
                        else:
                            markupAttrMap = {}
                            for k,v in markupAttrs:
                                markupAttrMap[k] = v
                    attrValue = markupAttrMap.get(attr)
                    if not self._matches(attrValue, matchAgainst):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markupName
        return found

    def search(self, markup):
        """Checks an arbitrary piece of markup against this strainer.

        Returns the matching element, or None. Raises Exception when
        'markup' is not a list, a Tag, a NavigableString or a string."""
        #print 'looking for %s in %s' % (self, markup)
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if isList(markup) and not isinstance(markup, Tag):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text:
                found = self.searchTag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                 isString(markup):
            if self._matches(markup, self.text):
                found = markup
        else:
            # Call syntax rather than "raise Exception, msg", which only
            # the Python 2 grammar accepts.
            raise Exception("I don't know how to match against a %s"
                            % markup.__class__)
        return found

    def _matches(self, markup, matchAgainst):
        """Tests one value (a Tag, a string or None) against a single
        criterion and returns a true or false value."""
        #print "Matching %s against %s" % (markup, matchAgainst)
        result = False
        if matchAgainst == True and type(matchAgainst) == types.BooleanType:
            result = markup != None
        elif callable(matchAgainst):
            result = matchAgainst(markup)
        else:
            #Custom match methods take the tag as an argument, but all
            #other ways of matching match the tag name as a string.
            if isinstance(markup, Tag):
                markup = markup.name
            if markup and not isString(markup):
                markup = unicode(markup)
            #Now we know that markup is either a string, or None.
            if hasattr(matchAgainst, 'match'):
                # It's a regexp object.
                result = markup and matchAgainst.search(markup)
            elif isList(matchAgainst):
                result = markup in matchAgainst
            elif hasattr(matchAgainst, 'items'):
                # It's a map: match when markup is one of its keys.
                # (markup is a string or None here, so it cannot be
                # the object that gets asked about keys.)
                result = markup in matchAgainst
            elif matchAgainst and isString(markup):
                # Bring the criterion to the same string type as the
                # markup so the equality test below compares like
                # with like.
                if isinstance(markup, unicode):
                    matchAgainst = unicode(matchAgainst)
                else:
                    matchAgainst = str(matchAgainst)

            if not result:
                result = matchAgainst == markup
        return result
00840 
class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it."""
    def __init__(self, source):
        """Creates an empty result list and records 'source', the
        object that produced the results, as self.source."""
        # Initialize this instance. list.__init__([]) would initialize
        # a throwaway list and leave self untouched.
        list.__init__(self)
        self.source = source
00847 
00848 # Now, some helper functions.
00849 
def isList(l):
    """Tells whether 'l' can be treated as a list: true for anything
    that exposes an __iter__ attribute, and for lists and tuples even
    on interpreters where those do not expose one."""
    if hasattr(l, '__iter__'):
        return True
    return isinstance(l, (list, tuple))
00855 
def isString(s):
    """Tells whether 's' is a string of any kind.

    The 'unicode' and 'basestring' types are tried first; when the
    interpreter does not know one of those names, the answer comes
    from a plain 'str' check instead."""
    try:
        if isinstance(s, unicode):
            return True
        return isinstance(s, basestring)
    except NameError:
        return isinstance(s, str)
00863 
def buildTagMap(default, *args):
    """Folds any mix of maps, lists and single values into one map.

    Maps are merged in as they are; every item of a list, and every
    single value, becomes a key whose value is 'default' (the same
    object each time). Later arguments overwrite earlier ones. Used to
    build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and NESTING_RESET_TAGS
    maps out of lists and partial maps."""
    tagMap = {}
    for piece in args:
        if hasattr(piece, 'items'):
            # A map: copy its entries over unchanged.
            for key, value in piece.items():
                tagMap[key] = value
            continue
        if isList(piece):
            # A list: each item is a key.
            keys = piece
        else:
            # A single value: it is the only key.
            keys = [piece]
        for key in keys:
            tagMap[key] = default
    return tagMap
00882 
00883 # Now, the parser classes.
00884 
class BeautifulStoneSoup(Tag, SGMLParser):

    """This class contains the basic parser and search code. It defines
    a parser that knows nothing about tag behavior except for the
    following:

      You can't close a tag without closing all the tags it encloses.
      That is, "<foo><bar></foo>" actually means
      "<foo><bar></bar></foo>".

    [Another possible explanation is "<foo><bar /></foo>", but since
    this class defines no SELF_CLOSING_TAGS, it will never use that
    explanation.]

    This class is useful for parsing XML or made-up markup languages,
    or when BeautifulSoup makes an assumption counter to what you were
    expecting."""

    # Tag-behavior tables. They are all empty in this class; subclasses
    # replace them to teach the parser about particular tags.
    SELF_CLOSING_TAGS = {}
    NESTABLE_TAGS = {}
    RESET_NESTING_TAGS = {}
    QUOTE_TAGS = {}

    # (compiled regexp, replacement function) pairs applied to the
    # markup, in order, before it is handed to the SGML parser.
    MARKUP_MASSAGE = [(re.compile(r'(<[^<>]*)/>'),
                       lambda x: x.group(1) + ' />'),
                      (re.compile(r'<!\s+([^<>]*)>'),
                       lambda x: '<!' + x.group(1) + '>')
                      ]

    ROOT_TAG_NAME = u'[document]'

    HTML_ENTITIES = "html"
    XML_ENTITIES = "xml"
    ALL_ENTITIES = [HTML_ENTITIES, XML_ENTITIES]

    def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
                 markupMassage=True, smartQuotesTo=XML_ENTITIES,
                 convertEntities=None, selfClosingTags=None):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        sgmllib will process most bad HTML, and the BeautifulSoup
        class has some tricks for dealing with some HTML that kills
        sgmllib, but Beautiful Soup can nonetheless choke or lose data
        if your data uses self-closing tags or declarations
        incorrectly.

        By default, Beautiful Soup uses regexes to sanitize input,
        avoiding the vast majority of these problems. If the problems
        don't apply to you, pass in False for markupMassage, and
        you'll get better performance.

        The default parser massage techniques fix the two most common
        instances of invalid HTML that choke sgmllib:

         <br/> (No space between name of closing tag and tag close)
         <! --Comment--> (Extraneous whitespace in declaration)

        You can pass in a custom list of (RE object, replace method)
        tuples to get Beautiful Soup to scrub your input the way you
        want."""

        self.parseOnlyThese = parseOnlyThese
        self.fromEncoding = fromEncoding
        self.smartQuotesTo = smartQuotesTo

        if convertEntities:
            # It doesn't make sense to convert encoded characters to
            # entities even while you're converting entities to Unicode.
            # Just convert it all to Unicode.
            self.smartQuotesTo = None

        # convertEntities may be a single entity-set name or a list of
        # them; either way it is reduced to two flags.
        if isList(convertEntities):
            self.convertHTMLEntities = self.HTML_ENTITIES in convertEntities
            self.convertXMLEntities = self.XML_ENTITIES in convertEntities
        else:
            self.convertHTMLEntities = self.HTML_ENTITIES == convertEntities
            self.convertXMLEntities = self.XML_ENTITIES == convertEntities

        self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
        SGMLParser.__init__(self)

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        self.markup = markup
        self.markupMassage = markupMassage
        try:
            self._feed()
        except StopParsing:
            # Raised on purpose to abandon a parse that has been
            # superseded by a complete re-parse.
            pass
        self.markup = None                 # The markup can now be GCed

    def _feed(self, inDocumentEncoding=None):
        """Converts self.markup to Unicode, applies the markup massage,
        resets the parser and parses the whole document, closing any
        tags that are still open at the end.

        inDocumentEncoding is an encoding name found inside the
        document itself; it is offered to UnicodeDammit as a candidate
        after self.fromEncoding."""
        # Convert the document to Unicode.
        markup = self.markup
        if isinstance(markup, unicode):
            if not hasattr(self, 'originalEncoding'):
                self.originalEncoding = None
        else:
            dammit = UnicodeDammit\
                     (markup, [self.fromEncoding, inDocumentEncoding],
                      smartQuotesTo=self.smartQuotesTo)
            markup = dammit.unicode
            self.originalEncoding = dammit.originalEncoding
        if markup:
            if self.markupMassage:
                # Any non-list true value means "use the default rules".
                if not isList(self.markupMassage):
                    self.markupMassage = self.MARKUP_MASSAGE
                for fix, m in self.markupMassage:
                    markup = fix.sub(m, markup)
        self.reset()

        SGMLParser.feed(self, markup or "")
        SGMLParser.close(self)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def __getattr__(self, methodName):
        """This method routes method call requests to either the SGMLParser
        superclass or the Tag superclass, depending on the method name."""
        #print "__getattr__ called on %s.%s" % (self.__class__, methodName)

        if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
               or methodName.find('do_') == 0:
            return SGMLParser.__getattr__(self, methodName)
        elif methodName.find('__') != 0:
            return Tag.__getattr__(self, methodName)
        else:
            # Name the missing attribute so the error is traceable.
            raise AttributeError(methodName)

    def isSelfClosingTag(self, name):
        """Returns true iff the given string is the name of a
        self-closing tag according to this parser."""
        return name in self.SELF_CLOSING_TAGS \
               or name in self.instanceSelfClosingTags

    def reset(self):
        """Re-initializes this object as an empty root tag and clears
        all parser state (pending text, tag stack, quote stack)."""
        Tag.__init__(self, self, self.ROOT_TAG_NAME)
        self.hidden = 1
        SGMLParser.reset(self)
        self.currentData = []
        self.currentTag = None
        self.tagStack = []
        self.quoteStack = []
        self.pushTag(self)

    def popTag(self):
        """Removes the innermost open tag from the tag stack and
        returns the tag that is current afterwards."""
        self.tagStack.pop()
        # Tags with just one string-owning child get the child as a
        # 'string' property, so that soup.tag.string is shorthand for
        # soup.tag.contents[0]
        if len(self.currentTag.contents) == 1 and \
           isinstance(self.currentTag.contents[0], NavigableString):
            self.currentTag.string = self.currentTag.contents[0]

        #print "Pop", tag.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Appends 'tag' to the current tag's contents (when there is a
        current tag) and makes it the new current tag."""
        #print "Push", tag.name
        if self.currentTag:
            self.currentTag.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]

    def endData(self, containerClass=NavigableString):
        """Turns the text collected so far into one containerClass
        object and appends it to the current tag.

        Does nothing when no text is pending. Text made up only of
        whitespace is collapsed to a single newline or a single
        space."""
        if self.currentData:
            currentData = ''.join(self.currentData)
            if currentData.endswith('<') and self.convertHTMLEntities:
                currentData = currentData[:-1] + '&lt;'
            if not currentData.strip():
                if '\n' in currentData:
                    currentData = '\n'
                else:
                    currentData = ' '
            self.currentData = []
            # With a parseOnlyThese strainer, top-level text is kept
            # only if the strainer has a text criterion that matches.
            if self.parseOnlyThese and len(self.tagStack) <= 1 and \
                   (not self.parseOnlyThese.text or \
                    not self.parseOnlyThese.search(currentData)):
                return
            o = containerClass(currentData)
            o.setup(self.currentTag, self.previous)
            if self.previous:
                self.previous.next = o
            self.previous = o
            self.currentTag.contents.append(o)


    def _popToTag(self, name, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag.

        Returns the tag that is current after the last pop, or None
        when nothing was popped."""
        #print "Popping to %s" % name
        if name == self.ROOT_TAG_NAME:
            return

        numPops = 0
        mostRecentTag = None
        # Search from the innermost tag outwards; index 0 is the root
        # and is never a candidate.
        for i in range(len(self.tagStack)-1, 0, -1):
            if name == self.tagStack[i].name:
                numPops = len(self.tagStack)-i
                break
        if not inclusivePop:
            numPops = numPops - 1

        for i in range(0, numPops):
            mostRecentTag = self.popTag()
        return mostRecentTag

    def _smartPop(self, name):

        """We need to pop up to the previous tag of this type, unless
        one of this tag's nesting reset triggers comes between this
        tag and the previous tag of this type, OR unless this tag is a
        generic nesting trigger and another generic nesting trigger
        comes between this tag and the previous tag of this type.

        Examples:
         <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
         <p>Foo<table>Bar<p> should pop to 'table', not 'p'.
         <p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'.
         <p>Foo<b>Bar<p> should pop to 'p', not 'b'.

         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
        """

        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
        isNestable = nestingResetTriggers != None
        isResetNesting = name in self.RESET_NESTING_TAGS
        popTo = None
        inclusive = True
        for i in range(len(self.tagStack)-1, 0, -1):
            p = self.tagStack[i]
            if (not p or p.name == name) and not isNestable:
                #Non-nestable tags get popped to the top or to their
                #last occurrence.
                popTo = name
                break
            if (nestingResetTriggers != None
                and p.name in nestingResetTriggers) \
                or (nestingResetTriggers == None and isResetNesting
                    and p.name in self.RESET_NESTING_TAGS):

                #If we encounter one of the nesting reset triggers
                #peculiar to this tag, or we encounter another tag
                #that causes nesting to reset, pop up to but not
                #including that tag.
                popTo = p.name
                inclusive = False
                break
        if popTo:
            self._popToTag(popTo, inclusive)

    def unknown_starttag(self, name, attrs, selfClosing=0):
        """Handles a start tag: creates a Tag object, links it into the
        tree and pushes it on the tag stack.

        Returns the new Tag. Returns None when the tag is really text
        inside a quoted section, or when it is filtered out by
        parseOnlyThese."""
        #print "Start tag %s: %s" % (name, attrs)
        if self.quoteStack:
            #This is not a real tag.
            #print "<%s> is not real!" % name
            # Rebuild the tag's source text and treat it as data.
            attrs = ''.join([' %s="%s"' % (x, y) for (x, y) in attrs])
            self.currentData.append('<%s%s>' % (name, attrs))
            return
        self.endData()

        if not self.isSelfClosingTag(name) and not selfClosing:
            self._smartPop(name)

        if self.parseOnlyThese and len(self.tagStack) <= 1 \
               and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
            return

        tag = Tag(self, name, attrs, self.currentTag, self.previous)
        if self.previous:
            self.previous.next = tag
        self.previous = tag
        self.pushTag(tag)
        if selfClosing or self.isSelfClosingTag(name):
            self.popTag()
        if name in self.QUOTE_TAGS:
            #print "Beginning quote (%s)" % name
            self.quoteStack.append(name)
            self.literal = 1
        return tag

    def unknown_endtag(self, name):
        """Handles an end tag: pops the tag stack down to the matching
        start tag, and leaves quoted mode when the end tag closes the
        innermost quote tag."""
        #print "End tag %s" % name
        if self.quoteStack and self.quoteStack[-1] != name:
            #This is not a real end tag.
            #print "</%s> is not real!" % name
            self.currentData.append('</%s>' % name)
            return
        self.endData()
        self._popToTag(name)
        if self.quoteStack and self.quoteStack[-1] == name:
            self.quoteStack.pop()
            self.literal = (len(self.quoteStack) > 0)

    def handle_data(self, data):
        """Adds a chunk of text to the pending data.

        When HTML entities are being converted, markup characters in
        the chunk are escaped first: a chunk that begins with '&' only
        gets its bare ampersands rewritten, any other chunk has every
        '&', '<' and '>' replaced by the matching entity."""
        if self.convertHTMLEntities:
            # data[:1] rather than data[0]: the chunk can be empty
            # (for instance the text of an empty comment), and
            # indexing an empty string raises IndexError.
            if data[:1] == '&':
                data = self.BARE_AMPERSAND.sub("&amp;",data)
            else:
                data = data.replace('&','&amp;') \
                           .replace('<','&lt;') \
                           .replace('>','&gt;')
        self.currentData.append(data)

    def _toStringSubclass(self, text, subclass):
        """Adds a certain piece of text to the tree as a NavigableString
        subclass."""
        self.endData()
        self.handle_data(text)
        self.endData(subclass)

    def handle_pi(self, text):
        """Handle a processing instruction as a ProcessingInstruction
        object, possibly one with a %SOUP-ENCODING% slot into which an
        encoding will be plugged later."""
        if text[:3] == "xml":
            text = "xml version='1.0' encoding='%SOUP-ENCODING%'"
        self._toStringSubclass(text, ProcessingInstruction)

    def handle_comment(self, text):
        "Handle comments as Comment objects."
        self._toStringSubclass(text, Comment)

    def handle_charref(self, ref):
        "Handle character references as data."
        # A hexadecimal reference may be written with either 'x' or
        # 'X'; without the upper-case check int() is asked to parse
        # the 'X' as a decimal digit and raises ValueError.
        if ref[:1] in ('x', 'X'):
            data = unichr(int(ref[1:],16))
        else:
            data = unichr(int(ref))

        if u'\x80' <= data <= u'\x9F':
            # Code points in this range are handed to UnicodeDammit as
            # Microsoft "smart" characters.
            data = UnicodeDammit.subMSChar(chr(ord(data)), self.smartQuotesTo)
        elif not self.convertHTMLEntities and not self.convertXMLEntities:
            # No conversion requested: keep the reference as written.
            data = '&#%s;' % ref

        self.handle_data(data)

    def handle_entityref(self, ref):
        """Handle entity references as data, possibly converting known
        HTML entity references to the corresponding Unicode
        characters."""
        replaceWithXMLEntity = self.convertXMLEntities and \
                               ref in self.XML_ENTITIES_TO_CHARS
        if self.convertHTMLEntities or replaceWithXMLEntity:
            try:
                data = unichr(name2codepoint[ref])
            except KeyError:
                if replaceWithXMLEntity:
                    data = self.XML_ENTITIES_TO_CHARS.get(ref)
                else:
                    data="&amp;%s" % ref
        else:
            data = '&%s;' % ref
        self.handle_data(data)

    def handle_decl(self, data):
        "Handle DOCTYPEs and the like as Declaration objects."
        self._toStringSubclass(data, Declaration)

    def parse_declaration(self, i):
        """Treat a bogus SGML declaration as raw data. Treat a CDATA
        declaration as a CData object.

        'i' is the offset in self.rawdata where the declaration
        starts; the return value is the offset at which parsing
        should continue."""
        j = None
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            if k == -1:
                # Unterminated section: it runs to the end of the data.
                k = len(self.rawdata)
            data = self.rawdata[i+9:k]
            j = k+3
            self._toStringSubclass(data, CData)
        else:
            try:
                j = SGMLParser.parse_declaration(self, i)
            except SGMLParseError:
                # Not a declaration sgmllib can parse: pass the rest of
                # the document through as plain text.
                toHandle = self.rawdata[i:]
                self.handle_data(toHandle)
                j = i + len(toHandle)
        return j
01274 
class BeautifulSoup(BeautifulStoneSoup):

    """This parser knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (ie. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurrence of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
        should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurrence
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
        should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
        should be transformed into:
       <table><tr>Blah</tr><tr>Blah
        but,
       <tr>Blah<table><tr>Blah
        should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup is not
    treating as nestable a tag your page author treats as nestable,
    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
    BeautifulStoneSoup before writing your own subclass."""

    def __init__(self, *args, **kwargs):
        """Same arguments as BeautifulStoneSoup, except that
        smartQuotesTo defaults to HTML entities when the caller does
        not pass it."""
        if 'smartQuotesTo' not in kwargs:
            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
        BeautifulStoneSoup.__init__(self, *args, **kwargs)

    SELF_CLOSING_TAGS = buildTagMap(None,
                                    ['br' , 'hr', 'input', 'img', 'meta',
                                    'spacer', 'link', 'frame', 'base'])

    QUOTE_TAGS = {'script': None}

    #According to the HTML standard, each of these inline tags can
    #contain another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                            'center']

    #According to the HTML standard, these block tags can contain
    #another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']

    #Lists can contain other lists, but there are restrictions.
    NESTABLE_LIST_TAGS = { 'ol' : [],
                           'ul' : [],
                           'li' : ['ul', 'ol'],
                           'dl' : [],
                           'dd' : ['dl'],
                           'dt' : ['dl'] }

    #Tables can contain other tables, but there are restrictions.
    NESTABLE_TABLE_TAGS = {'table' : [],
                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                           'td' : ['tr'],
                           'th' : ['tr'],
                           'thead' : ['table'],
                           'tbody' : ['table'],
                           'tfoot' : ['table'],
                           }

    NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']

    #If one of these tags is encountered, all tags up to the next tag of
    #this type are popped.
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_LIST_TAGS,
                                     NESTABLE_TABLE_TAGS)

    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)

    # Used to detect the charset in a META tag; see start_meta
    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)")

    def start_meta(self, attrs):
        """Beautiful Soup can detect a charset included in a META tag,
        try to convert the document to that charset, and re-parse the
        document from the beginning.

        'attrs' is the tag's list of (name, value) pairs. When the
        charset is rewritten the 'content' entry of that list is
        replaced in place. Raises StopParsing, after the re-parse has
        finished, to abandon the parse that was in progress."""
        httpEquiv = None
        contentType = None
        contentTypeIndex = None
        tagNeedsEncodingSubstitution = False

        for i in range(0, len(attrs)):
            key, value = attrs[i]
            key = key.lower()
            if key == 'http-equiv':
                httpEquiv = value
            elif key == 'content':
                contentType = value
                contentTypeIndex = i

        if httpEquiv and contentType: # It's an interesting meta tag.
            match = self.CHARSET_RE.search(contentType)
            if match:
                # Read the marker from the instance dictionary: a
                # getattr() without a default depends on a __getattr__
                # fallback to avoid an AttributeError on the first
                # pass, when the attribute has not been set yet.
                if self.__dict__.get('declaredHTMLEncoding') or \
                       (self.originalEncoding == self.fromEncoding):
                    # This is our second pass through the document, or
                    # else an encoding was specified explicitly and it
                    # worked. Rewrite the meta tag.
                    def rewrite(match):
                        return match.group(1) + "%SOUP-ENCODING%"
                    # Substitute in the value of the 'content'
                    # attribute itself. The loop variable 'value' holds
                    # whichever attribute came last in the tag, which
                    # is not necessarily 'content'.
                    newAttr = self.CHARSET_RE.sub(rewrite, contentType)
                    attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                                               newAttr)
                    tagNeedsEncodingSubstitution = True
                else:
                    # This is our first pass through the document.
                    # Go through it again with the new information.
                    newCharset = match.group(3)
                    if newCharset and newCharset != self.originalEncoding:
                        self.declaredHTMLEncoding = newCharset
                        self._feed(self.declaredHTMLEncoding)
                        raise StopParsing
        tag = self.unknown_starttag("meta", attrs)
        if tag and tagNeedsEncodingSubstitution:
            tag.containsSubstitutions = True
01421 
class StopParsing(Exception):
    """Raised to abandon a parse that is in progress; it carries no
    data and adds nothing to Exception."""
01424    
class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """A BeautifulSoup variant that accepts more tags as nestable.

    BeautifulSoup is built to shrug off common HTML mistakes such as
    unclosed tags, and that sometimes leads it astray. Take

     <b>Foo<b>Bar</b></b>

    which is valid, if odd, HTML. BeautifulSoup closes the first 'b'
    as soon as it sees the second one: it assumes the author wrote
    "<b>Foo<b>Bar" and forgot a closing tag, since nobody needs to
    bold text that is already bold. The two '</b>' tags that follow
    then close two more 'b' tags, so three tags get closed where only
    two were opened, and the rest of the document structure can be
    thrown off. The tags listed below all behave this way.

    Forgetting to close a 'b' tag is far more common than nesting 'b'
    tags on purpose, so BeautifulSoup handles the common case. This
    class handles the not-so-common one: the markup is hard to
    believe, but it is valid HTML and BeautifulSoup got it wrong by
    assuming it would not be."""

    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = [
        'em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
        'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
        'big',
        ]

    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']

    # Everything BeautifulSoup already nests, plus the two lists above;
    # each added tag maps to an empty list.
    NESTABLE_TAGS = buildTagMap(
        [],
        BeautifulSoup.NESTABLE_TAGS,
        I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
        I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
01460 
class MinimalSoup(BeautifulSoup):
    """A parser for HTML whose markup is pathologically bad.

    No assumptions are made about how tags nest. The class still
    knows which tags are self-closing, that <script> tags contain
    Javascript and should not be parsed, that META tags may contain
    encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    # No tag is considered nestable.
    NESTABLE_TAGS = {}

    # NOTE(review): buildTagMap receives a single argument here, while
    # the other call visible in this file passes a default value first
    # and the tag collections after it. If the first parameter is that
    # default, this yields an empty map rather than one containing
    # 'noscript' -- TODO confirm against buildTagMap.
    RESET_NESTING_TAGS = buildTagMap('noscript')
01473 
class BeautifulSOAP(BeautifulStoneSoup):
    """Copies single-string child tags into their parent as attributes.

    When a tag whose only child is a string is closed, the parent
    gains an attribute named after the tag, holding that string. For
    example:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        """Mirror the closing tag's lone string child onto its parent
        as an attribute, then pop the tag as BeautifulStoneSoup does.

        An attribute the parent already has is never overwritten."""
        if len(self.tagStack) > 1:
            closing = self.tagStack[-1]
            enclosing = self.tagStack[-2]
            # Make sure enclosing.attrMap exists before consulting it.
            enclosing._getAttrMap()
            if isinstance(closing, Tag) and len(closing.contents) == 1:
                onlyChild = closing.contents[0]
                if (isinstance(onlyChild, NavigableString)
                    and closing.name not in enclosing.attrMap):
                    enclosing[closing.name] = onlyChild
        BeautifulStoneSoup.popTag(self)
01504 
01505 #Enterprise class names! It has come to our attention that some people
01506 #think the names of the Beautiful Soup parser classes are too silly
01507 #and "unprofessional" for use in enterprise screen-scraping. We feel
01508 #your pain! For such-minded folk, the Beautiful Soup Consortium And
01509 #All-Night Kosher Bakery recommends renaming this file to
01510 #"RobustParser.py" (or, in cases of extreme enterprisitude,
01511 #"RobustParserBeanInterface.class") and using the following
01512 #enterprise-friendly class aliases:
class RobustXMLParser(BeautifulStoneSoup):
    """Enterprise-friendly alias for BeautifulStoneSoup; adds nothing."""
class RobustHTMLParser(BeautifulSoup):
    """Enterprise-friendly alias for BeautifulSoup; adds nothing."""
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    """Enterprise-friendly alias for ICantBelieveItsBeautifulSoup;
    adds nothing."""
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    """Enterprise-friendly alias for MinimalSoup; adds nothing."""
class SimplifyingSOAPParser(BeautifulSOAP):
    """Enterprise-friendly alias for BeautifulSOAP; adds nothing."""
01523 
01524 ######################################################
01525 #
01526 # Bonus library: Unicode, Dammit
01527 #
01528 # This class forces XML data into a standard format (usually to UTF-8
01529 # or Unicode).  It is heavily based on code from Mark Pilgrim's
01530 # Universal Feed Parser. It does not rewrite the XML or HTML to
01531 # reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
01532 # (XML) and BeautifulSoup.start_meta (HTML).
01533 
01534 # Autodetects character encodings.
01535 # Download from http://chardet.feedparser.org/
try:
    import chardet
    # To debug detection, additionally run:
    #     import chardet.constants
    #     chardet.constants._debug = 1
except ImportError:
    # chardet is optional. Without it UnicodeDammit skips the
    # auto-detection step and relies on its other candidate encodings.
    chardet = None
# The name is deliberately NOT reset to None here: doing so would
# disable auto-detection even when chardet is installed.
01543 
01544 # cjkcodecs and iconv_codec make Python know about more character encodings.
01545 # Both are available from http://cjkpython.i18n.org/
01546 # They're built in if you use Python 2.4.
01547 try:
01548     import cjkcodecs.aliases
01549 except:
01550     pass
01551 try:
01552     import iconv_codec
01553 except:
01554     pass
01555 
class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents.

    Attributes set by the constructor:

     * unicode -- the converted document, or a false value when no
       candidate encoding could decode it.
     * originalEncoding -- the encoding that produced the conversion,
       or None when nothing worked or the input was already Unicode.
     * triedEncodings -- the codec names that were attempted, in order.
     * markup -- the document in its most recently converted form.
    """

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = { "macintosh" : "mac-roman",
                        "x-sjis" : "shift-jis" }

    # Encodings for which bytes 0x80-0x9f are treated as MS "smart"
    # punctuation. Names are compared lower-cased, because encodings
    # sniffed from an XML declaration arrive lower-cased.
    _SMART_QUOTE_ENCODINGS = ("windows-1252", "iso-8859-1", "iso-8859-2")

    # Matches one character in the 0x80-0x9f range.
    _SMART_QUOTE_RE = re.compile("([\x80-\x9f])")

    # Captures the encoding named in a leading <?xml ... ?> declaration.
    _XML_ENCODING_RE = re.compile(
        r"""^<\?.*encoding=['"](.*?)['"].*\?>""")

    def __init__(self, markup, overrideEncodings=[],
                 smartQuotesTo='xml'):
        """Convert `markup` to Unicode, trying encodings in this order:
        the caller's overrideEncodings, the encoding declared in the
        document, the encoding sniffed from its first bytes, chardet's
        guess (when chardet is available), then utf-8 and windows-1252.

        markup -- the document, as a byte string or a Unicode string.
        overrideEncodings -- encoding names to try first. Only iterated,
          never modified, so the shared default list is safe.
        smartQuotesTo -- 'xml' or 'html' to turn MS smart quotes into
          that kind of entity; a false value leaves them alone.
        """
        self.smartQuotesTo = smartQuotesTo
        self.triedEncodings = []
        if isinstance(markup, unicode):
            # Nothing to detect or decode. (Returning the markup from
            # __init__, as was done before, raises TypeError.)
            self.markup = markup
            self.originalEncoding = None
            self.unicode = markup
            return

        self.markup, documentEncoding, sniffedEncoding = \
                     self._detectEncoding(markup)

        u = None
        for proposedEncoding in overrideEncodings:
            u = self._convertFrom(proposedEncoding)
            if u: break
        if not u:
            for proposedEncoding in (documentEncoding, sniffedEncoding):
                u = self._convertFrom(proposedEncoding)
                if u: break

        # If no luck and we have auto-detection library, try that:
        if not u and chardet and not isinstance(self.markup, unicode):
            u = self._convertFrom(chardet.detect(self.markup)['encoding'])

        # As a last resort, try utf-8 and windows-1252:
        if not u:
            for proposedEncoding in ("utf-8", "windows-1252"):
                u = self._convertFrom(proposedEncoding)
                if u: break
        self.unicode = u
        if not u: self.originalEncoding = None

    def subMSChar(orig, smartQuotesTo):
        """Changes a MS smart quote character to an XML or HTML
        entity.

        orig -- a single character looked up in MS_CHARS.
        smartQuotesTo -- 'xml' gives a numeric character reference,
          'html' gives a named entity, anything else gives the
          Unicode character itself.

        Returns the replacement; MS_CHARS entries that are plain
        strings are returned as they are, and unknown characters give
        None."""
        sub = UnicodeDammit.MS_CHARS.get(orig)
        if isinstance(sub, tuple):
            entityName, hexCodePoint = sub
            if smartQuotesTo == 'xml':
                sub = '&#x%s;' % hexCodePoint
            elif smartQuotesTo == 'html':
                sub = '&%s;' % entityName
            else:
                sub = unichr(int(hexCodePoint, 16))
        return sub
    subMSChar = staticmethod(subMSChar)

    def _replaceSmartQuotes(self, markup, encoding):
        """Return `markup` with smart quotes replaced according to
        self.smartQuotesTo, provided `encoding` is one that might
        contain them; otherwise return `markup` unchanged."""
        if not self.smartQuotesTo or \
               encoding.lower() not in self._SMART_QUOTE_ENCODINGS:
            return markup
        target = self.smartQuotesTo
        return self._SMART_QUOTE_RE.sub(
            lambda match: self.subMSChar(match.group(1), target), markup)

    def _convertFrom(self, proposed):
        """Try to decode self.markup using the `proposed` encoding.

        On success stores the result in self.markup, records the
        encoding in self.originalEncoding and returns the result.
        Returns None if the encoding is empty, was already tried, or
        fails to decode the document."""
        proposed = self.find_codec(proposed)
        if not proposed or proposed in self.triedEncodings:
            return None
        self.triedEncodings.append(proposed)

        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        markup = self._replaceSmartQuotes(self.markup, proposed)

        try:
            u = self._toUnicode(markup, proposed)
        except Exception:
            # Any failure just means this candidate is wrong; the
            # caller moves on to the next one.
            return None
        self.markup = u
        self.originalEncoding = proposed
        return self.markup

    def _toUnicode(self, data, encoding):
        """Decode the byte string `data` into Unicode.

        `encoding` is a codec name, but a byte order mark at the start
        of `data` takes precedence: it selects the encoding and is
        removed before decoding. Decoding errors propagate to the
        caller."""
        # strip Byte Order Mark (if present). The "not followed by two
        # NUL bytes" checks keep a UTF-32 BOM from being taken for a
        # UTF-16 one.
        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
               and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
                 and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == '\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == '\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == '\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        return unicode(data, encoding)

    def _detectEncoding(self, xml_data):
        """Given a document, tries to detect its XML encoding.

        Returns a (data, declared, sniffed) tuple:
          data -- the document; re-encoded as UTF-8 when its leading
            bytes identified UTF-16/UTF-32/UTF-8-with-BOM, translated
            when they identified EBCDIC, otherwise unchanged.
          declared -- the lower-cased encoding named in the XML
            declaration, or None. A generic multi-byte name such as
            'utf-16' is replaced by the sniffed encoding.
          sniffed -- the encoding deduced from the leading bytes
            ('ascii' when nothing matched), or None.
        """
        xml_encoding = sniffed_xml_encoding = None
        try:
            if xml_data[:4] == '\x4c\x6f\xa7\x94':
                # EBCDIC
                xml_data = self._ebcdic_to_ascii(xml_data)
            elif xml_data[:4] == '\x00\x3c\x00\x3f':
                # UTF-16BE
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
                     and (xml_data[2:4] != '\x00\x00'):
                # UTF-16BE with BOM
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x3f\x00':
                # UTF-16LE
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
                     (xml_data[2:4] != '\x00\x00'):
                # UTF-16LE with BOM
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\x00\x3c':
                # UTF-32BE
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x00\x00':
                # UTF-32LE
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\xfe\xff':
                # UTF-32BE with BOM
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\xff\xfe\x00\x00':
                # UTF-32LE with BOM
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
            elif xml_data[:3] == '\xef\xbb\xbf':
                # UTF-8 with BOM
                sniffed_xml_encoding = 'utf-8'
                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
            else:
                sniffed_xml_encoding = 'ascii'
            xml_encoding_match = self._XML_ENCODING_RE.match(xml_data)
        except Exception:
            # Detection is best-effort: on any failure carry on with
            # whatever was established before the error.
            xml_encoding_match = None
        if xml_encoding_match:
            xml_encoding = xml_encoding_match.group(1).lower()
            if sniffed_xml_encoding and \
               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
                                 'utf16', 'u16')):
                # The declaration doesn't say which byte order is in
                # use; the sniffed encoding does.
                xml_encoding = sniffed_xml_encoding
        return xml_data, xml_encoding, sniffed_xml_encoding


    def find_codec(self, charset):
        """Return a codec name Python knows for `charset`.

        Tries, in order: the CHARSET_ALIASES translation, the name
        with hyphens removed, and the name with hyphens turned into
        underscores. If none is a known codec, `charset` itself is
        returned unchanged (so a false value stays false)."""
        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
               or (charset and self._codec(charset.replace("-", ""))) \
               or (charset and self._codec(charset.replace("-", "_"))) \
               or charset

    def _codec(self, charset):
        """Return `charset` if codecs.lookup accepts it and None if
        not. A false `charset` is returned as it is."""
        if not charset: return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except LookupError:
            pass
        return codec

    # Translation table for _ebcdic_to_ascii; built on first use.
    EBCDIC_TO_ASCII_MAP = None
    def _ebcdic_to_ascii(self, s):
        """Return `s` with every character replaced through the EBCDIC
        table below. The table is built once and cached on the class."""
        c = self.__class__
        if not c.EBCDIC_TO_ASCII_MAP:
            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
                    250,251,252,253,254,255)
            # emap lists the replacement for each of the 256 byte
            # values in order, so the joined string already is the
            # translation table; mapping it through maketrans from the
            # identity sequence would return the same string.
            c.EBCDIC_TO_ASCII_MAP = ''.join(map(chr, emap))
        return s.translate(c.EBCDIC_TO_ASCII_MAP)

    # Replacements for characters 0x80-0x9f. A tuple is (HTML entity
    # name, hexadecimal code point); a plain string is used as the
    # replacement text as it is.
    MS_CHARS = { '\x80' : ('euro', '20AC'),
                 '\x81' : ' ',
                 '\x82' : ('sbquo', '201A'),
                 '\x83' : ('fnof', '192'),
                 '\x84' : ('bdquo', '201E'),
                 '\x85' : ('hellip', '2026'),
                 '\x86' : ('dagger', '2020'),
                 '\x87' : ('Dagger', '2021'),
                 '\x88' : ('circ', '2C6'),
                 '\x89' : ('permil', '2030'),
                 '\x8A' : ('Scaron', '160'),
                 '\x8B' : ('lsaquo', '2039'),
                 '\x8C' : ('OElig', '152'),
                 '\x8D' : '?',
                 '\x8E' : ('#x17D', '17D'),
                 '\x8F' : '?',
                 '\x90' : '?',
                 '\x91' : ('lsquo', '2018'),
                 '\x92' : ('rsquo', '2019'),
                 '\x93' : ('ldquo', '201C'),
                 '\x94' : ('rdquo', '201D'),
                 '\x95' : ('bull', '2022'),
                 '\x96' : ('ndash', '2013'),
                 '\x97' : ('mdash', '2014'),
                 '\x98' : ('tilde', '2DC'),
                 '\x99' : ('trade', '2122'),
                 '\x9a' : ('scaron', '161'),
                 '\x9b' : ('rsaquo', '203A'),
                 '\x9c' : ('oelig', '153'),
                 '\x9d' : '?',
                 '\x9e' : ('#x17E', '17E'),
                 '\x9f' : ('Yuml', '178'),}
01804 
01805 #######################################################################
01806 
01807 
01808 #By default, act as an HTML pretty-printer.
if __name__ == '__main__':
    # Run as a script: read an HTML document from standard input and
    # write the pretty-printed parse to standard output.
    import sys
    markup = sys.stdin.read()
    soup = BeautifulSoup(markup)
    # The parentheses only group the expression, so this is still the
    # print statement the rest of this file's Python version expects.
    print(soup.prettify())