Back to index

python-biopython  1.60
SeqXmlIO.py
Go to the documentation of this file.
00001 # Copyright 2010 by Thomas Schmitt.  
00002 # All rights reserved.
00003 #
00004 # This module is for reading and writing SeqXML format files as
00005 # SeqRecord objects, and is expected to be used via the Bio.SeqIO API.
00006 """Bio.SeqIO support for the "seqxml" file format, SeqXML.
00007 
00008 You are expected to use this module via the Bio.SeqIO functions.
00009 
00010 SeqXML is a lightweight XML format which is intended to be an alternative to
00011 FASTA files. For more information see http://www.seqXML.org and Schmitt et al.
00012 (2011), http://dx.doi.org/10.1093/bib/bbr025
00013 """
00014 
00015 from xml.sax.saxutils import XMLGenerator
00016 from xml.sax.xmlreader import AttributesImpl
00017 from xml.dom import pulldom
00018 from xml.sax import SAXParseException
00019 
00020 from Bio import Alphabet
00021 from Bio.Seq import Seq
00022 from Bio.Seq import UnknownSeq
00023 from Bio.SeqRecord import SeqRecord
00024 from Interfaces import SequentialSequenceWriter
00025 
00026 
class XMLRecordIterator:
    """Base class for building iterators for record style XML formats.

    It is assumed that all information for one record can be found within a
    record element or above. Two types of methods are called when the start
    tag of an element is reached. To receive only the attributes of an
    element before its end tag is reached implement _attr_TAGNAME.
    To get an element and its children as a DOM tree implement _elem_TAGNAME.
    Everything that is part of the DOM tree will not trigger any further
    method calls.
    """

    def __init__(self, handle, recordTag, namespace=None):
        """Create the object and initialize the XML pull parser.

        Arguments:
         - handle - passed unchanged to xml.dom.pulldom.parse
         - recordTag - local name of the element that delimits one record
         - namespace - namespace URI an element must have to be processed;
           the default None matches elements without a namespace
        """
        self._recordTag = recordTag
        self._namespace = namespace
        self._events = pulldom.parse(handle)

    def __iter__(self):
        """Iterate over the records in the XML file.

        Yields one record each time the end tag of a record element is
        reached. Handlers are called with the record currently being built,
        which is None until the first record element has been opened.

        A parse error reported at the very start of the input is treated as
        an empty file and ends the iteration silently; any other
        SAXParseException is re-raised.
        """
        record = None
        try:
            for event, node in self._events:

                if event == pulldom.START_ELEMENT:
                    if node.namespaceURI != self._namespace:
                        continue

                    tag = node.localName

                    if tag == self._recordTag:
                        # create an empty SeqRecord
                        record = SeqRecord('', id='')

                    # call matching method with attributes only
                    attr_handler = getattr(self, "_attr_" + tag, None)
                    if attr_handler is not None:
                        attr_handler(self._attributes(node), record)

                    # call matching method with DOM tree
                    elem_handler = getattr(self, "_elem_" + tag, None)
                    if elem_handler is not None:
                        # read the element and all nested elements into a
                        # DOM tree; this consumes their events
                        self._events.expandNode(node)
                        node.normalize()
                        elem_handler(node, record)

                elif event == pulldom.END_ELEMENT \
                and node.namespaceURI == self._namespace \
                and node.localName == self._recordTag:
                    yield record

        except SAXParseException:
            # sys.exc_info() instead of "except X, e" / "except X as e" so
            # the same source is valid on Python 2.5, Jython and Python 3.
            import os
            import sys
            error = sys.exc_info()[1]
            position = (error.getLineNumber(), error.getColumnNumber())

            if position == (1, 0):
                # empty file
                pass
            elif position == (1, 1) and os.name == "java":
                # empty file, see http://bugs.jython.org/issue1774
                pass
            else:
                raise

    def _attributes(self, node):
        """Return the attributes of a DOM node as dictionary."""
        attribute_map = node.attributes
        result = {}
        for index in range(attribute_map.length):
            attribute = attribute_map.item(index)
            result[attribute.name] = attribute.value
        return result
00095     
00096     
00097 
class SeqXmlIterator(XMLRecordIterator):
    """Breaks seqXML file into SeqRecords.

    Assumes valid seqXML please validate beforehand.

    The _attr_* and _elem_* methods are the per-element callbacks looked up
    by name in XMLRecordIterator.__iter__; each receives the record that is
    currently being built.
    """

    def __init__(self, handle):
        """Create the object."""
        XMLRecordIterator.__init__(self, handle, "entry")

        # document wide metadata, filled in by _attr_seqXML
        self._source = None
        self._source_version = None
        self._version = None
        self._speciesName = None
        self._ncbiTaxId = None

    def _attr_seqXML(self, attr_dict, record):
        """Parse the document metadata.

        Only attributes that are present overwrite the stored defaults.
        The record argument is not used.
        """
        if "source" in attr_dict:
            self._source = attr_dict["source"]
        if "sourceVersion" in attr_dict:
            self._source_version = attr_dict["sourceVersion"]
        # The membership test must use the same key that is read; testing
        # for "version" either skipped the value or raised KeyError.
        if "seqXMLversion" in attr_dict:
            self._version = attr_dict["seqXMLversion"]
        if "ncbiTaxID" in attr_dict:
            self._ncbiTaxId = attr_dict["ncbiTaxID"]
        if "speciesName" in attr_dict:
            self._speciesName = attr_dict["speciesName"]

    def _attr_property(self, attr_dict, record):
        """Parse key value pair properties and store them as annotations.

        A property name seen once is stored as a single value (None when the
        value attribute is absent); repeated names are collected in a list.
        Raises ValueError if the name attribute is missing.
        """
        if "name" not in attr_dict:
            raise ValueError("Malformed property element.")

        name = attr_dict["name"]
        value = attr_dict.get("value", None)
        annotations = record.annotations

        if name not in annotations:
            annotations[name] = value
        elif isinstance(annotations[name], list):
            annotations[name].append(value)
        else:
            # second occurrence: promote the single value to a list
            annotations[name] = [annotations[name], value]

    def _attr_species(self, attr_dict, record):
        """Parse the species information.

        Raises ValueError unless both name and ncbiTaxID are given.
        """
        if "name" not in attr_dict or "ncbiTaxID" not in attr_dict:
            raise ValueError("Malformed species element!")

        # the keywords for the species annotation are taken from SwissIO
        record.annotations["organism"] = attr_dict["name"]
        record.annotations["ncbi_taxid"] = attr_dict["ncbiTaxID"]

    def _attr_entry(self, attr_dict, record):
        """New entry set id and the optional entry source.

        Raises ValueError if the id attribute is missing.
        """
        if "id" not in attr_dict:
            raise ValueError("Malformed entry! Identifier is missing.")

        record.id = attr_dict["id"]
        if "source" in attr_dict:
            record.annotations["source"] = attr_dict["source"]
        elif self._source is not None:
            # fall back to the document wide source
            record.annotations["source"] = self._source

        # initialize entry with global species definition
        # the keywords for the species annotation are taken from SwissIO
        if self._ncbiTaxId is not None:
            record.annotations["ncbi_taxid"] = self._ncbiTaxId
        if self._speciesName is not None:
            record.annotations["organism"] = self._speciesName

    def _read_sequence(self, node, record, alphabet):
        """Store the text of a sequence element as record.seq.

        Shared by the DNA, RNA and protein handlers. Raises ValueError if
        the element has no child node or its first child holds no data.
        """
        if not (node.hasChildNodes() and len(node.firstChild.data) > 0):
            raise ValueError("Sequence length should be greater than 0.")

        record.seq = Seq(node.firstChild.data, alphabet)

    def _elem_DNAseq(self, node, record):
        """Parse DNA sequence."""
        self._read_sequence(node, record, Alphabet.generic_dna)

    def _elem_RNAseq(self, node, record):
        """Parse RNA sequence."""
        self._read_sequence(node, record, Alphabet.generic_rna)

    def _elem_AAseq(self, node, record):
        """Parse protein sequence."""
        self._read_sequence(node, record, Alphabet.generic_protein)

    def _elem_description(self, node, record):
        """Parse the description.

        An empty description element leaves record.description untouched.
        """
        if node.hasChildNodes() and len(node.firstChild.data) > 0:
            record.description = node.firstChild.data

    def _attr_DBRef(self, attr_dict, record):
        """Parse a database cross reference.

        The reference is stored as "source:id" in record.dbxrefs; duplicates
        are not added twice. Raises ValueError if source or id is missing.
        """
        if "source" not in attr_dict or "id" not in attr_dict:
            raise ValueError("Invalid DB cross reference.")

        dbxref = "%s:%s" % (attr_dict["source"], attr_dict["id"])
        if dbxref not in record.dbxrefs:
            record.dbxrefs.append(dbxref)
00213 
00214 
00215 
class SeqXmlWriter(SequentialSequenceWriter):
    """Writes SeqRecords into seqXML file.

    SeqXML requires the sequence alphabet be explicitly RNA, DNA or protein,
    i.e. an instance or subclass of Bio.Alphabet.RNAAlphabet,
    Bio.Alphabet.DNAAlphabet or Bio.Alphabet.ProteinAlphabet.
    """

    def __init__(self, handle, source=None, source_version=None, species=None, ncbiTaxId=None):
        """Create Object and start the xml generator.

        Arguments:
         - handle - output handle, also given to the XMLGenerator
         - source, source_version - optional document wide source metadata
         - species, ncbiTaxId - optional document wide species definition
        """
        SequentialSequenceWriter.__init__(self, handle)

        self.xml_generator = XMLGenerator(handle, "utf-8")
        self.xml_generator.startDocument()
        self.source = source
        self.source_version = source_version
        self.species = species
        self.ncbiTaxId = ncbiTaxId

    def _as_text(self, value):
        """Return value unchanged if it is a string, otherwise str(value).

        XMLGenerator needs string attribute values, so numbers have to be
        converted before they are put into an AttributesImpl.
        """
        if isinstance(value, basestring):
            return value
        return str(value)

    def write_header(self):
        """Write root node with document metadata.

        Raises TypeError if species is not a string or ncbiTaxId is neither
        a string nor an int.
        """
        SequentialSequenceWriter.write_header(self)

        attrs = {"xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
                 "xsi:noNamespaceSchemaLocation": "http://www.seqxml.org/0.4/seqxml.xsd",
                 "seqXMLversion": "0.4"}

        if self.source is not None:
            attrs["source"] = self.source
        if self.source_version is not None:
            attrs["sourceVersion"] = self.source_version
        if self.species is not None:
            if not isinstance(self.species, basestring):
                raise TypeError("species should be of type string")
            attrs["speciesName"] = self.species
        if self.ncbiTaxId is not None:
            if not isinstance(self.ncbiTaxId, (basestring, int)):
                raise TypeError("ncbiTaxID should be of type string or int")
            attrs["ncbiTaxID"] = self._as_text(self.ncbiTaxId)

        self.xml_generator.startElement("seqXML", AttributesImpl(attrs))

    def write_record(self, record):
        """Write one record.

        Raises ValueError if the record has no usable identifier and
        TypeError if the identifier or the source annotation is not a string.
        """
        if not record.id or record.id == "<unknown id>":
            raise ValueError("SeqXML requires identifier")

        if not isinstance(record.id, basestring):
            raise TypeError("Identifier should be of type string")

        attrb = {"id": record.id}

        # the entry source is only written if it differs from the global one
        if "source" in record.annotations and self.source != record.annotations["source"]:
            if not isinstance(record.annotations["source"], basestring):
                raise TypeError("source should be of type string")
            attrb["source"] = record.annotations["source"]

        self.xml_generator.startElement("entry", AttributesImpl(attrb))
        self._write_species(record)
        self._write_description(record)
        self._write_seq(record)
        self._write_dbxrefs(record)
        self._write_properties(record)
        self.xml_generator.endElement("entry")

    def write_footer(self):
        """Close the root node and finish the XML document."""
        SequentialSequenceWriter.write_footer(self)

        self.xml_generator.endElement("seqXML")
        self.xml_generator.endDocument()

    def _write_species(self, record):
        """Write the species if given.

        Nothing is written unless both the organism and ncbi_taxid
        annotations are present. Raises TypeError for values of the wrong
        type.
        """
        annotations = record.annotations
        if "organism" not in annotations or "ncbi_taxid" not in annotations:
            return

        organism = annotations["organism"]
        taxid = annotations["ncbi_taxid"]

        if not isinstance(organism, basestring):
            raise TypeError("organism should be of type string")

        if not isinstance(taxid, (basestring, int)):
            raise TypeError("ncbiTaxID should be of type string or int")

        # The local species definition is only written if it differs from
        # the global species definition
        if organism != self.species or taxid != self.ncbiTaxId:
            attr = {"name": organism, "ncbiTaxID": self._as_text(taxid)}
            self.xml_generator.startElement("species", AttributesImpl(attr))
            self.xml_generator.endElement("species")

    def _write_description(self, record):
        """Write the description if given.

        Raises TypeError if the description is not a string.
        """
        if not record.description:
            return

        if not isinstance(record.description, basestring):
            raise TypeError("Description should be of type string")

        description = record.description
        if description == "<unknown description>":
            # NOTE(review): the placeholder is blanked but the element is
            # still written, giving an empty <description> element. This
            # keeps the existing output; possibly the element was meant to
            # be skipped instead.
            description = ""

        self.xml_generator.startElement("description", AttributesImpl({}))
        self.xml_generator.characters(description)
        self.xml_generator.endElement("description")

    def _write_seq(self, record):
        """Write the sequence.

        Note that SeqXML requires a DNA, RNA or protein alphabet.

        Raises TypeError for an UnknownSeq and ValueError for an empty
        sequence or an alphabet that is not DNA, RNA or protein.
        """
        if isinstance(record.seq, UnknownSeq):
            raise TypeError("Sequence type is UnknownSeq but SeqXML requires sequence")

        seq = record.seq.tostring()

        if not len(seq) > 0:
            raise ValueError("The sequence length should be greater than 0")

        # Get the base alphabet (underneath any Gapped or StopCodon encoding)
        alphabet = Alphabet._get_base_alphabet(record.seq.alphabet)
        if isinstance(alphabet, Alphabet.RNAAlphabet):
            seqElem = "RNAseq"
        elif isinstance(alphabet, Alphabet.DNAAlphabet):
            seqElem = "DNAseq"
        elif isinstance(alphabet, Alphabet.ProteinAlphabet):
            seqElem = "AAseq"
        else:
            raise ValueError("Need a DNA, RNA or Protein alphabet")

        self.xml_generator.startElement(seqElem, AttributesImpl({}))
        self.xml_generator.characters(seq)
        self.xml_generator.endElement(seqElem)

    def _write_dbxrefs(self, record):
        """Write all database cross references.

        Each entry of record.dbxrefs must be a string of the form
        "source:id"; it is split at the first colon. Raises TypeError or
        ValueError otherwise.
        """
        if record.dbxrefs is None:
            return

        for dbxref in record.dbxrefs:

            if not isinstance(dbxref, basestring):
                raise TypeError("dbxrefs should be of type list of string")
            # find() < 1 rejects both a missing colon and an empty source
            if dbxref.find(':') < 1:
                raise ValueError("dbxrefs should be in the form ['source:id', 'source:id' ]")

            dbsource, dbid = dbxref.split(':', 1)

            attr = {"source": dbsource, "id": dbid}
            self.xml_generator.startElement("DBRef", AttributesImpl(attr))
            self.xml_generator.endElement("DBRef")

    def _write_property(self, name, value=None):
        """Write a single property element, without value attribute if None."""
        attr = {"name": name}
        if value is not None:
            attr["value"] = self._as_text(value)
        self.xml_generator.startElement("property", AttributesImpl(attr))
        self.xml_generator.endElement("property")

    def _write_properties(self, record):
        """Write all annotations that are key value pairs with values of a primitive type or list of primitive types.

        The organism, ncbi_taxid and source annotations are skipped here
        because they are written as species element and entry attribute.
        Values (or list items) that are not int, float or string are
        silently ignored.
        """
        primitive = (int, float, basestring)

        for key, value in record.annotations.items():

            if key in ("organism", "ncbi_taxid", "source"):
                continue

            if value is None:
                self._write_property(key)

            elif isinstance(value, list):
                for v in value:
                    # test the item, not the list itself, otherwise no
                    # list valued annotation would ever be written
                    if isinstance(v, primitive):
                        self._write_property(key, v)

            elif isinstance(value, primitive):
                self._write_property(key, value)
00402     
00403 if __name__ == "__main__":
00404     print "Running quick self test"
00405     
00406     from Bio import SeqIO
00407     import sys
00408     
00409     fileHandle = open("Tests/SeqXML/protein_example.xml","r")
00410     records = list(SeqIO.parse(fileHandle, "seqxml"))
00411     
00412     from StringIO import StringIO
00413     stringHandle = StringIO()
00414 
00415     SeqIO.write(records,stringHandle,"seqxml")
00416     SeqIO.write(records,sys.stdout,"seqxml")
00417     print
00418     
00419     stringHandle.seek(0)
00420     records = list(SeqIO.parse(stringHandle,"seqxml"))
00421     
00422     SeqIO.write(records,sys.stdout,"seqxml")
00423     print