Back to index

plone3  3.1.7
parser.py
Go to the documentation of this file.
00001 from HTMLParser import HTMLParser, HTMLParseError
00002 
00003 
class LinkParser(HTMLParser):
    """ a simple html parser for link and image urls

    collects the `href` of every `a` start tag and the `src` of every
    `img` start tag, in the order in which the tags are fed to the parser
    """

    # maps each tag of interest to the attribute holding its url
    _url_attributes = {'a': 'href', 'img': 'src'}

    def __init__(self):
        # the base initializer is called directly (not via `super()`),
        # exactly as the parser has always been set up
        HTMLParser.__init__(self)
        self.links = []

    def getLinks(self):
        """ return all links found during parsing as a tuple """
        return tuple(self.links)

    def handle_starttag(self, tag, attrs):
        """ override the method to remember all links

        `attrs` is the list of (attribute, value) pairs of the tag; only
        the first occurrence of the url attribute is considered
        """
        wanted = self._url_attributes.get(tag)
        if wanted is None:
            # not a tag that carries a link
            return
        for attr, value in attrs:
            if attr != wanted:
                continue
            # an attribute given without a value (e.g. `<a href>`) is
            # reported as `None`, which is not a link and must not be
            # handed on to callers expecting strings
            if value is not None:
                self.links.append(value)
            break
00021 
00022 
def search_attr(name, attrs):
    """ look up the named attribute in a list of (attribute, value) pairs

    returns a one-element list holding the value of the first pair whose
    attribute equals `name`, or an empty list if no pair matches
    """
    found = []
    for key, val in attrs:
        if key != name:
            continue
        # only the first occurrence counts, so stop at the first hit
        found.append(val)
        break
    return found
00029 
00030 
def extractLinks(data):
    """ parse the given html and return all links

    the result is whatever `LinkParser.getLinks()` reports after `data`
    has been fed to a fresh parser; a `HTMLParseError` raised while
    parsing is swallowed, so the links collected up to that point are
    still returned
    """
    collector = LinkParser()
    try:
        collector.feed(data)
        collector.close()
    except HTMLParseError:
        # best effort: keep the links gathered before the parser gave up
        pass
    return collector.getLinks()
00040