Back to index

python-biopython  1.60
Parser.py
Go to the documentation of this file.
00001 # Copyright 2008 by Michiel de Hoon.  All rights reserved.
00002 # This code is part of the Biopython distribution and governed by its
00003 # license.  Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 
00006 """Parser for XML results returned by NCBI's Entrez Utilities. This
00007 parser is used by the read() function in Bio.Entrez, and is not intended
00008 to be used directly.
00009 """
00010 
00011 # The question is how to represent an XML file as Python objects. Some
00012 # XML files returned by NCBI look like lists, others look like dictionaries,
00013 # and others look like a mix of lists and dictionaries.
00014 #
00015 # My approach is to classify each possible element in the XML as a plain
00016 # string, an integer, a list, a dictionary, or a structure. The latter is a
00017 # dictionary where the same key can occur multiple times; in Python, it is
00018 # represented as a dictionary where that key occurs once, pointing to a list
00019 # of values found in the XML file.
00020 #
00021 # The parser then goes through the XML and creates the appropriate Python
00022 # object for each element. The different levels encountered in the XML are
00023 # preserved on the Python side. So a subelement of a subelement of an element
00024 # is a value in a dictionary that is stored in a list which is a value in
00025 # some other dictionary (or a value in a list which itself belongs to a list
00026 # which is a value in a dictionary, and so on). Attributes encountered in 
00027 # the XML are stored as a dictionary in a member .attributes of each element,
00028 # and the tag name is saved in a member .tag.
00029 #
00030 # To decide which kind of Python object corresponds to each element in the
00031 # XML, the parser analyzes the DTD referred to at the top of (almost) every
00032 # XML file returned by the Entrez Utilities. This is preferred over a hand-
00033 # written solution, since the number of DTDs is rather large and their
00034 # contents may change over time. About half the code in this parser deals
00035 # with parsing the DTD, and the other half with the XML itself.
00036 
00037 
00038 import os.path
00039 import urlparse
00040 import urllib
00041 import warnings
00042 from xml.parsers import expat
00043 
00044 # The following five classes are used to add a member .attributes to integers,
00045 # strings, Unicode strings, lists, and dictionaries, respectively.
00046 
class IntegerElement(int):
    """Integer value that may carry an ``attributes`` member.

    The class itself never creates ``attributes``; the member only exists
    once some caller has assigned it to the instance.
    """

    def __repr__(self):
        """Return the plain integer repr, or a tagged form showing the
        attributes when an ``attributes`` member has been set."""
        number = int.__repr__(self)
        if not hasattr(self, "attributes"):
            return number
        return "IntegerElement(%s, attributes=%r)" % (number, self.attributes)
00055 
class StringElement(str):
    """Plain string that may carry an ``attributes`` member.

    The class itself never creates ``attributes``; the member only exists
    once some caller has assigned it to the instance.
    """

    def __repr__(self):
        """Return the plain string repr, or a tagged form showing the
        attributes when an ``attributes`` member has been set."""
        quoted = str.__repr__(self)
        if not hasattr(self, "attributes"):
            return quoted
        return "StringElement(%s, attributes=%r)" % (quoted, self.attributes)
00064 
class UnicodeElement(unicode):
    """Unicode string that may carry an ``attributes`` member.

    The class itself never creates ``attributes``; the member only exists
    once some caller has assigned it to the instance.
    """

    def __repr__(self):
        """Return the plain unicode repr, or a tagged form showing the
        attributes when an ``attributes`` member has been set."""
        quoted = unicode.__repr__(self)
        if not hasattr(self, "attributes"):
            return quoted
        return "UnicodeElement(%s, attributes=%r)" % (quoted, self.attributes)
00073 
class ListElement(list):
    """List that may carry an ``attributes`` member.

    The class itself never creates ``attributes``; the member only exists
    once some caller has assigned it to the instance.
    """

    def __repr__(self):
        """Return the plain list repr, or a tagged form showing the
        attributes when an ``attributes`` member has been set."""
        items = list.__repr__(self)
        if not hasattr(self, "attributes"):
            return items
        return "ListElement(%s, attributes=%r)" % (items, self.attributes)
00082 
class DictionaryElement(dict):
    """Dictionary that may carry an ``attributes`` member.

    The class itself never creates ``attributes``; the member only exists
    once some caller has assigned it to the instance.
    """

    def __repr__(self):
        """Return the plain dict repr, or a ``DictElement(...)`` form showing
        the attributes when an ``attributes`` member has been set."""
        mapping = dict.__repr__(self)
        if not hasattr(self, "attributes"):
            return mapping
        return "DictElement(%s, attributes=%r)" % (mapping, self.attributes)
00091 
00092 # A StructureElement is like a dictionary, but some of its keys can have
00093 # multiple values associated with it. These values are stored in a list
00094 # under each key.
class StructureElement(dict):
    """Dictionary in which selected keys accumulate their values in a list.

    Keys passed to the constructor start out mapped to an empty list, and
    every later item assignment to one of them appends to that list.
    Assignment to any other key behaves like a normal dict assignment.
    """

    def __init__(self, keys):
        """Create the structure; *keys* names the list-valued keys."""
        dict.__init__(self)
        self.listkeys = keys
        for listkey in keys:
            # Bypass our own __setitem__, which would append to a list
            # that does not exist yet.
            dict.__setitem__(self, listkey, [])

    def __setitem__(self, key, value):
        """Append *value* for list-valued keys, plain assignment otherwise."""
        if key not in self.listkeys:
            dict.__setitem__(self, key, value)
            return
        self[key].append(value)

    def __repr__(self):
        """Return the plain dict repr, or a ``DictElement(...)`` form showing
        the attributes when an ``attributes`` member has been set."""
        mapping = dict.__repr__(self)
        if not hasattr(self, "attributes"):
            return mapping
        return "DictElement(%s, attributes=%r)" % (mapping, self.attributes)
00113 
00114 
class NotXMLError(ValueError):
    """Error reported when the input data do not look like XML."""

    def __init__(self, message):
        """Keep *message* (the underlying cause) in ``self.msg``."""
        self.msg = message

    def __str__(self):
        """Return the user-facing explanation with the cause embedded."""
        template = "Failed to parse the XML data (%s). Please make sure that the input data are in XML format."
        return template % self.msg
00120 
00121 
class CorruptedXMLError(ValueError):
    """Error reported when XML input turns out to be damaged."""

    def __init__(self, message):
        """Keep *message* (the underlying cause) in ``self.msg``."""
        self.msg = message

    def __str__(self):
        """Return the user-facing explanation with the cause embedded."""
        template = "Failed to parse the XML data (%s). Please make sure that the input data are not corrupted."
        return template % self.msg
00127 
00128 
class ValidationError(ValueError):
    """Error raised by validating parsers for a tag missing from the DTD.

    Non-validating parsers do not raise this error. The Bio.Entrez.read
    and Bio.Entrez.parse functions use validating parsers by default (see
    those functions for more information).
    """

    def __init__(self, name):
        """Remember the offending tag *name* in ``self.name``."""
        self.name = name

    def __str__(self):
        """Return the user-facing explanation naming the unknown tag."""
        template = "Failed to find tag '%s' in the DTD. To skip all tags that are not represented in the DTD, please call Bio.Entrez.read or Bio.Entrez.parse with validate=False."
        return template % self.name
00135 
00136 
class DataHandler(object):
    """Expat-based handler that turns Entrez XML into Python objects.

    Element declarations found in the DTD (see elementDecl) decide whether
    each XML tag becomes a string, integer, list, dictionary or structure;
    the element handlers then build the corresponding objects on a stack.
    """

    # Per-user directory searched first for DTD files:
    # ~/.biopython/Bio/Entrez/DTDs
    home = os.path.expanduser('~')
    local_dtd_dir = os.path.join(home, '.biopython', 'Bio', 'Entrez', 'DTDs')
    # Drop the temporary name so it does not become a class attribute.
    del home

    # Directory "DTDs" inside the Bio.Entrez package, searched second.
    # NOTE(review): the import lives in the class body, presumably to avoid
    # a circular import with Bio.Entrez at module load time -- confirm.
    from Bio import Entrez
    global_dtd_dir = os.path.join(str(Entrez.__path__[0]), "DTDs")
    # Drop the temporary name so it does not become a class attribute.
    del Entrez
00146 
00147     def __init__(self, validate):
00148         self.stack = []
00149         self.errors = []
00150         self.integers = []
00151         self.strings = []
00152         self.lists = []
00153         self.dictionaries = []
00154         self.structures = {}
00155         self.items = []
00156         self.dtd_urls = []
00157         self.validating = validate
00158         self.parser = expat.ParserCreate(namespace_separator=" ")
00159         self.parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS)
00160         self.parser.XmlDeclHandler = self.xmlDeclHandler
00161 
00162     def read(self, handle):
00163         """Set up the parser and let it parse the XML results"""
00164         if hasattr(handle, "closed") and handle.closed:
00165             #Should avoid a possible Segmentation Fault, see:
00166             #http://bugs.python.org/issue4877
00167             raise IOError("Can't parse a closed handle")
00168         try:
00169             self.parser.ParseFile(handle)
00170         except expat.ExpatError, e:
00171             if self.parser.StartElementHandler:
00172                 # We saw the initial <!xml declaration, so we can be sure that
00173                 # we are parsing XML data. Most likely, the XML file is
00174                 # corrupted.
00175                 raise CorruptedXMLError(e)
00176             else:
00177                 # We have not seen the initial <!xml declaration, so probably
00178                 # the input data is not in XML format.
00179                 raise NotXMLError(e)
00180         try:
00181             return self.object
00182         except AttributeError:
00183             if self.parser.StartElementHandler:
00184                 # We saw the initial <!xml declaration, and expat didn't notice
00185                 # any errors, so self.object should be defined. If not, this is
00186                 # a bug.
00187                 raise RuntimeError("Failed to parse the XML file correctly, possibly due to a bug in Bio.Entrez. Please contact the Biopython developers at biopython-dev@biopython.org for assistance.")
00188             else:
00189                 # We did not see the initial <!xml declaration, so probably
00190                 # the input data is not in XML format.
00191                 raise NotXMLError("XML declaration not found")
00192 
00193     def parse(self, handle):
00194         BLOCK = 1024
00195         while True:
00196             #Read in another block of the file...
00197             text = handle.read(BLOCK)
00198             if not text:
00199                 # We have reached the end of the XML file
00200                 if self.stack:
00201                     # No more XML data, but there is still some unfinished
00202                     # business
00203                     raise CorruptedXMLError
00204                 try:
00205                     for record in self.object:
00206                         yield record
00207                 except AttributeError:
00208                     if self.parser.StartElementHandler:
00209                         # We saw the initial <!xml declaration, and expat
00210                         # didn't notice any errors, so self.object should be
00211                         # defined. If not, this is a bug.
00212                         raise RuntimeError("Failed to parse the XML file correctly, possibly due to a bug in Bio.Entrez. Please contact the Biopython developers at biopython-dev@biopython.org for assistance.")
00213                     else:
00214                         # We did not see the initial <!xml declaration, so
00215                         # probably the input data is not in XML format.
00216                         raise NotXMLError("XML declaration not found")
00217                 self.parser.Parse("", True)
00218                 self.parser = None
00219                 return
00220 
00221             try:
00222                 self.parser.Parse(text, False)        
00223             except expat.ExpatError, e:
00224                 if self.parser.StartElementHandler:
00225                     # We saw the initial <!xml declaration, so we can be sure
00226                     # that we are parsing XML data. Most likely, the XML file
00227                     # is corrupted.
00228                     raise CorruptedXMLError(e)
00229                 else:
00230                     # We have not seen the initial <!xml declaration, so
00231                     # probably the input data is not in XML format.
00232                     raise NotXMLError(e)
00233 
00234             if not self.stack:
00235                 # Haven't read enough from the XML file yet
00236                 continue
00237 
00238             records = self.stack[0]
00239             if not isinstance(records, list):
00240                 raise ValueError("The XML file does not represent a list. Please use Entrez.read instead of Entrez.parse")
00241             while len(records) > 1: # Then the top record is finished
00242                 record = records.pop(0)
00243                 yield record
00244 
00245     def xmlDeclHandler(self, version, encoding, standalone):
00246         # XML declaration found; set the handlers
00247         self.parser.StartElementHandler = self.startElementHandler
00248         self.parser.EndElementHandler = self.endElementHandler
00249         self.parser.CharacterDataHandler = self.characterDataHandler
00250         self.parser.ExternalEntityRefHandler = self.externalEntityRefHandler
00251         self.parser.StartNamespaceDeclHandler = self.startNamespaceDeclHandler
00252 
00253     def startNamespaceDeclHandler(self, prefix, un):
00254         raise NotImplementedError("The Bio.Entrez parser cannot handle XML data that make use of XML namespaces")
00255 
00256     def startElementHandler(self, name, attrs):
00257         self.content = ""
00258         if name in self.lists:
00259             object = ListElement()
00260         elif name in self.dictionaries:
00261             object = DictionaryElement()
00262         elif name in self.structures:
00263             object = StructureElement(self.structures[name])
00264         elif name in self.items: # Only appears in ESummary
00265             name = str(attrs["Name"]) # convert from Unicode
00266             del attrs["Name"]
00267             itemtype = str(attrs["Type"]) # convert from Unicode
00268             del attrs["Type"]
00269             if itemtype=="Structure":
00270                 object = DictionaryElement()
00271             elif name in ("ArticleIds", "History"):
00272                 object = StructureElement(["pubmed", "medline"])
00273             elif itemtype=="List":
00274                 object = ListElement()
00275             else:
00276                 object = StringElement()
00277             object.itemname = name
00278             object.itemtype = itemtype
00279         elif name in self.strings + self.errors + self.integers:
00280             self.attributes = attrs
00281             return
00282         else:
00283             # Element not found in DTD
00284             if self.validating:
00285                 raise ValidationError(name)
00286             else:
00287                 # this will not be stored in the record
00288                 object = ""
00289         if object!="":
00290             object.tag = name
00291             if attrs:
00292                 object.attributes = dict(attrs)
00293             if len(self.stack)!=0:
00294                 current = self.stack[-1]
00295                 try:
00296                     current.append(object)
00297                 except AttributeError:
00298                     current[name] = object
00299         self.stack.append(object)
00300 
00301     def endElementHandler(self, name):
00302         value = self.content
00303         if name in self.errors:
00304             if value=="":
00305                 return
00306             else:
00307                 raise RuntimeError(value)
00308         elif name in self.integers:
00309             value = IntegerElement(value)
00310         elif name in self.strings:
00311             # Convert Unicode strings to plain strings if possible
00312             try:
00313                 value = StringElement(value)
00314             except UnicodeEncodeError:
00315                 value = UnicodeElement(value)
00316         elif name in self.items:
00317             self.object = self.stack.pop()
00318             if self.object.itemtype in ("List", "Structure"):
00319                 return
00320             elif self.object.itemtype=="Integer" and value:
00321                 value = IntegerElement(value)
00322             else:
00323                 # Convert Unicode strings to plain strings if possible
00324                 try:
00325                     value = StringElement(value)
00326                 except UnicodeEncodeError:
00327                     value = UnicodeElement(value)
00328             name = self.object.itemname
00329         else:
00330             self.object = self.stack.pop()
00331             return
00332         value.tag = name
00333         if self.attributes:
00334             value.attributes = dict(self.attributes)
00335             del self.attributes
00336         current = self.stack[-1]
00337         if current!="":
00338             try:
00339                 current.append(value)
00340             except AttributeError:
00341                 current[name] = value
00342 
00343     def characterDataHandler(self, content):
00344         self.content += content
00345 
00346     def elementDecl(self, name, model):
00347         """This callback function is called for each element declaration:
00348         <!ELEMENT       name          (...)>
00349         encountered in a DTD. The purpose of this function is to determine
00350         whether this element should be regarded as a string, integer, list
00351         dictionary, structure, or error."""
00352         if name.upper()=="ERROR":
00353             self.errors.append(name)
00354             return
00355         if name=='Item' and model==(expat.model.XML_CTYPE_MIXED,
00356                                     expat.model.XML_CQUANT_REP,
00357                                     None, ((expat.model.XML_CTYPE_NAME,
00358                                             expat.model.XML_CQUANT_NONE,
00359                                             'Item',
00360                                             ()
00361                                            ),
00362                                           )
00363                                    ):
00364             # Special case. As far as I can tell, this only occurs in the
00365             # eSummary DTD.
00366             self.items.append(name)
00367             return
00368         # First, remove ignorable parentheses around declarations
00369         while (model[0] in (expat.model.XML_CTYPE_SEQ,
00370                             expat.model.XML_CTYPE_CHOICE)
00371           and model[1] in (expat.model.XML_CQUANT_NONE,
00372                            expat.model.XML_CQUANT_OPT)
00373           and len(model[3])==1):
00374             model = model[3][0]
00375         # PCDATA declarations correspond to strings
00376         if model[0] in (expat.model.XML_CTYPE_MIXED,
00377                         expat.model.XML_CTYPE_EMPTY):
00378             self.strings.append(name)
00379             return
00380         # List-type elements
00381         if (model[0] in (expat.model.XML_CTYPE_CHOICE,
00382                          expat.model.XML_CTYPE_SEQ) and
00383             model[1] in (expat.model.XML_CQUANT_PLUS,
00384                          expat.model.XML_CQUANT_REP)):
00385             self.lists.append(name)
00386             return
00387         # This is the tricky case. Check which keys can occur multiple
00388         # times. If only one key is possible, and it can occur multiple
00389         # times, then this is a list. If more than one key is possible,
00390         # but none of them can occur multiple times, then this is a
00391         # dictionary. Otherwise, this is a structure.
00392         # In 'single' and 'multiple', we keep track which keys can occur
00393         # only once, and which can occur multiple times.
00394         single = []
00395         multiple = []
00396         # The 'count' function is called recursively to make sure all the
00397         # children in this model are counted. Error keys are ignored;
00398         # they raise an exception in Python.
00399         def count(model):
00400             quantifier, name, children = model[1:]
00401             if name==None:
00402                 if quantifier in (expat.model.XML_CQUANT_PLUS,
00403                                   expat.model.XML_CQUANT_REP):
00404                     for child in children:
00405                         multiple.append(child[2])
00406                 else:
00407                     for child in children:
00408                         count(child)
00409             elif name.upper()!="ERROR":
00410                 if quantifier in (expat.model.XML_CQUANT_NONE,
00411                                   expat.model.XML_CQUANT_OPT):
00412                     single.append(name)
00413                 elif quantifier in (expat.model.XML_CQUANT_PLUS,
00414                                     expat.model.XML_CQUANT_REP):
00415                     multiple.append(name)
00416         count(model)
00417         if len(single)==0 and len(multiple)==1:
00418             self.lists.append(name)
00419         elif len(multiple)==0:
00420             self.dictionaries.append(name)
00421         else:
00422             self.structures.update({name: multiple})
00423 
00424     def open_dtd_file(self, filename):
00425         path = os.path.join(DataHandler.local_dtd_dir, filename)
00426         try:
00427             handle = open(path, "rb")
00428         except IOError:
00429             pass
00430         else:
00431             return handle
00432         path = os.path.join(DataHandler.global_dtd_dir, filename)
00433         try:
00434             handle = open(path, "rb")
00435         except IOError:
00436             pass
00437         else:
00438             return handle
00439         return None
00440 
00441     def externalEntityRefHandler(self, context, base, systemId, publicId):
00442         """The purpose of this function is to load the DTD locally, instead
00443         of downloading it from the URL specified in the XML. Using the local
00444         DTD results in much faster parsing. If the DTD is not found locally,
00445         we try to download it. If new DTDs become available from NCBI,
00446         putting them in Bio/Entrez/DTDs will allow the parser to see them."""
00447         urlinfo = urlparse.urlparse(systemId)
00448         #Following attribute requires Python 2.5+
00449         #if urlinfo.scheme=='http':
00450         if urlinfo[0]=='http':
00451             # Then this is an absolute path to the DTD.
00452             url = systemId
00453         elif urlinfo[0]=='':
00454             # Then this is a relative path to the DTD.
00455             # Look at the parent URL to find the full path.
00456             try:
00457                 url = self.dtd_urls[-1]
00458             except IndexError:
00459                 # Assume the default URL for DTDs if the top parent
00460                 # does not contain an absolute path
00461                 source = "http://www.ncbi.nlm.nih.gov/dtd/"
00462             else:
00463                 source = os.path.dirname(url)
00464             # urls always have a forward slash, don't use os.path.join
00465             url = source.rstrip("/") + "/" + systemId
00466         self.dtd_urls.append(url)
00467         # First, try to load the local version of the DTD file
00468         location, filename = os.path.split(systemId)
00469         handle = self.open_dtd_file(filename)
00470         if not handle:
00471             # DTD is not available as a local file. Try accessing it through
00472             # the internet instead.
00473             message = """\
00474 Unable to load DTD file %s.
00475 
00476 Bio.Entrez uses NCBI's DTD files to parse XML files returned by NCBI Entrez.
00477 Though most of NCBI's DTD files are included in the Biopython distribution,
00478 sometimes you may find that a particular DTD file is missing. While we can
00479 access the DTD file through the internet, the parser is much faster if the
00480 required DTD files are available locally.
00481 
00482 For this purpose, please download %s from
00483 
00484 %s
00485 
00486 and save it either in directory
00487 
00488 %s
00489 
00490 or in directory
00491 
00492 %s
00493 
00494 in order for Bio.Entrez to find it.
00495 
00496 Alternatively, you can save %s in the directory
00497 Bio/Entrez/DTDs in the Biopython distribution, and reinstall Biopython.
00498 
00499 Please also inform the Biopython developers about this missing DTD, by
00500 reporting a bug on http://bugzilla.open-bio.org/ or sign up to our mailing
00501 list and emailing us, so that we can include it with the next release of
00502 Biopython.
00503 
00504 Proceeding to access the DTD file through the internet...
00505 """ % (filename, filename, url, self.global_dtd_dir, self.local_dtd_dir, filename)
00506             warnings.warn(message)
00507             try:
00508                 handle = urllib.urlopen(url)
00509             except IOError:
00510                 raise RuntimeException("Failed to access %s at %s" % (filename, url))
00511 
00512         parser = self.parser.ExternalEntityParserCreate(context)
00513         parser.ElementDeclHandler = self.elementDecl
00514         parser.ParseFile(handle)
00515         handle.close()
00516         self.dtd_urls.pop()
00517         return 1