Back to index

python-biopython  1.60
UniprotIO.py
Go to the documentation of this file.
00001 # Copyright 2010 by Andrea Pierleoni
00002 # Revisions copyright 2010 by Peter Cock
00003 # All rights reserved.
00004 #
00005 # This code is part of the Biopython distribution and governed by its
00006 # license.  Please see the LICENSE file that should have been included
00007 # as part of this package.
00008 
00009 """Bio.SeqIO support for the "uniprot-xml" file format.
00010 
00011 See also:
00012 
00013 http://www.uniprot.org
00014 
00015 The UniProt XML format essentially replaces the old plain text file format
00016 originally introduced by SwissProt ("swiss" format in Bio.SeqIO).
00017 """
00018 import sys
00019 
00020 from Bio import Seq
00021 from Bio import SeqFeature
00022 from Bio import Alphabet
00023 from Bio.SeqRecord import SeqRecord
00024 try:
00025     from cStringIO import StringIO
00026 except ImportError:
00027     from StringIO import StringIO
00028 import warnings
00029 
00030 #For speed try to use cElementTree rather than ElemenTree
00031 try:
00032     if (3,0,0) <= sys.version_info[:3] <= (3,1,3):
00033         #workaround for bug in python 3 to 3.1.3  see http://bugs.python.org/issue9257
00034         from xml.etree import ElementTree as ElementTree
00035     else:
00036         from xml.etree import cElementTree as ElementTree
00037 except ImportError:
00038     from xml.etree import ElementTree as ElementTree
00039 
00040 NS = "{http://uniprot.org/uniprot}"
00041 REFERENCE_JOURNAL = "%(name)s %(volume)s:%(first)s-%(last)s(%(pub_date)s)"
00042 
00043 def UniprotIterator(handle, alphabet=Alphabet.ProteinAlphabet(), return_raw_comments=False):
00044     """Generator function to parse UniProt XML as SeqRecord objects.
00045     
00046     parses an XML entry at a time from any UniProt XML file 
00047     returns a SeqRecord for each iteration
00048     
00049     This generator can be used in Bio.SeqIO
00050     
00051     return_raw_comments = True --> comment fields are returned as complete xml to allow further processing
00052     skip_parsing_errors = True --> if parsing errors are found, skip to next entry
00053     """
00054     if isinstance(alphabet, Alphabet.NucleotideAlphabet):
00055         raise ValueError, "Wrong alphabet %r" % alphabet
00056     if isinstance(alphabet, Alphabet.Gapped):
00057         if isinstance(alphabet.alphabet, Alphabet.NucleotideAlphabet):
00058             raise ValueError, "Wrong alphabet %r" % alphabet
00059 
00060     if not hasattr(handle, "read"):
00061         if type(handle)==type(''):
00062             handle=StringIO(handle)
00063         else:
00064             raise Exception('An XML-containing handler or an XML string must be passed')
00065 
00066     if ElementTree is None:
00067         from Bio import MissingExternalDependencyError
00068         raise MissingExternalDependencyError(
00069                 "No ElementTree module was found. "
00070                 "Use Python 2.5+, lxml or elementtree if you "
00071                 "want to use Bio.SeqIO.UniprotIO.")
00072         
00073     for event, elem in ElementTree.iterparse(handle, events=("start", "end")):
00074         if event=="end" and elem.tag == NS + "entry":
00075             yield Parser(elem, alphabet=alphabet, return_raw_comments=return_raw_comments).parse()
00076             elem.clear()
00077 
00078 class Parser(object):
00079     """Parse a UniProt XML entry to a SeqRecord.
00080     
00081     return_raw_comments=True to get back the complete comment field in XML format
00082     alphabet=Alphabet.ProteinAlphabet()    can be modified if needed, default is protein alphabet.
00083     """
00084     def __init__(self, elem, alphabet=Alphabet.ProteinAlphabet(), return_raw_comments=False):
00085         self.entry=elem
00086         self.alphabet=alphabet
00087         self.return_raw_comments=return_raw_comments
00088     
00089     def parse(self):
00090         """Parse the input."""
00091         assert self.entry.tag == NS + 'entry'
00092         
00093         def append_to_annotations(key, value):
00094             if key not in self.ParsedSeqRecord.annotations:
00095                 self.ParsedSeqRecord.annotations[key]=[]
00096             if value not in self.ParsedSeqRecord.annotations[key]:
00097                 self.ParsedSeqRecord.annotations[key].append(value)
00098             
00099         def _parse_name(element):
00100             self.ParsedSeqRecord.name=element.text
00101             self.ParsedSeqRecord.dbxrefs.append(self.dbname+':'+element.text)
00102         
00103         def _parse_accession(element):
00104             append_to_annotations('accessions', element.text)# to cope with SwissProt plain text parser
00105             self.ParsedSeqRecord.dbxrefs.append(self.dbname+':'+element.text)
00106         
00107         def _parse_protein(element):
00108             """Parse protein names (PRIVATE)."""
00109             descr_set=False
00110             for protein_element in element.getchildren():
00111                 if protein_element.tag in [NS + 'recommendedName', NS + 'alternativeName']:#recommendedName tag are parsed before
00112                     #use protein fields for name and description
00113                     for rec_name in protein_element.getchildren():
00114                         ann_key='%s_%s' % (protein_element.tag.replace(NS,''), rec_name.tag.replace(NS,''))
00115                         append_to_annotations(ann_key, rec_name.text)
00116                         if (rec_name.tag==NS + 'fullName') and not descr_set:
00117                             self.ParsedSeqRecord.description=rec_name.text
00118                             descr_set=True
00119                 elif protein_element.tag==NS + 'component':
00120                     pass #not parsed 
00121                 elif protein_element.tag==NS + 'domain':
00122                     pass #not parsed 
00123         
00124         def _parse_gene(element):
00125             for genename_element in element.getchildren():  
00126                 if 'type' in genename_element.attrib:
00127                     ann_key='gene_%s_%s' % (genename_element.tag.replace(NS,''), genename_element.attrib['type'])
00128                     if genename_element.attrib['type']=='primary':
00129                         self.ParsedSeqRecord.annotations[ann_key]=genename_element.text
00130                     else:
00131                         append_to_annotations(ann_key,genename_element.text)
00132         
00133         def _parse_geneLocation(element):
00134             append_to_annotations('geneLocation', element.attrib['type'])
00135         
00136         def _parse_organism(element):
00137             organism_name = com_name = sci_name = ''
00138             for organism_element in element.getchildren():  
00139                 if organism_element.tag==NS + 'name':
00140                     if organism_element.text:
00141                         if organism_element.attrib['type'] == 'scientific':
00142                             sci_name = organism_element.text
00143                         elif organism_element.attrib['type'] == 'common':
00144                             com_name = organism_element.text
00145                         else:
00146                             #e.g. synonym
00147                             append_to_annotations("organism_name", organism_element.text)
00148                 elif organism_element.tag==NS + 'dbReference':
00149                     self.ParsedSeqRecord.dbxrefs.append(organism_element.attrib['type']+':'+organism_element.attrib['id'])
00150                 elif organism_element.tag==NS + 'lineage':
00151                     for taxon_element in organism_element.getchildren():
00152                         if taxon_element.tag==NS + 'taxon':
00153                             append_to_annotations('taxonomy',taxon_element.text)
00154             if sci_name and com_name:
00155                 organism_name = '%s (%s)' % (sci_name, com_name)
00156             elif sci_name:
00157                 organism_name = sci_name
00158             elif com_name:
00159                 organism_name = com_name
00160             self.ParsedSeqRecord.annotations['organism']=organism_name
00161             
00162         def _parse_organismHost(element):
00163             for organism_element in element.getchildren():  
00164                 if organism_element.tag==NS + 'name': 
00165                     append_to_annotations("organism_host", organism_element.text)
00166                         
00167         def _parse_keyword(element):      
00168             append_to_annotations('keywords',element.text)
00169         
00170         def _parse_comment(element):
00171             """Parse comments (PRIVATE).
00172             
00173             Comment fields are very heterogeneus. each type has his own (frequently mutated) schema.
00174             To store all the contained data, more complex data structures are needed, such as 
00175             annidated dictionaries. This is left to end user, by optionally setting:
00176             
00177             return_raw_comments=True 
00178             
00179             the orginal XMLs is returned in the annotation fields.
00180             
00181             available comment types at december 2009:
00182                 "allergen"
00183                 "alternative products"
00184                 "biotechnology"
00185                 "biophysicochemical properties"
00186                 "catalytic activity"
00187                 "caution"
00188                 "cofactor"
00189                 "developmental stage"
00190                 "disease"
00191                 "domain"
00192                 "disruption phenotype"
00193                 "enzyme regulation"
00194                 "function"
00195                 "induction"
00196                 "miscellaneous"
00197                 "pathway"
00198                 "pharmaceutical"
00199                 "polymorphism"
00200                 "PTM"
00201                 "RNA editing"
00202                 "similarity"
00203                 "subcellular location"
00204                 "sequence caution"
00205                 "subunit"
00206                 "tissue specificity"
00207                 "toxic dose"
00208                 "online information"
00209                 "mass spectrometry"
00210                 "interaction"
00211             """
00212             
00213             simple_comments=["allergen",
00214                             "biotechnology",
00215                             "biophysicochemical properties",
00216                             "catalytic activity",
00217                             "caution",
00218                             "cofactor",
00219                             "developmental stage",
00220                             "disease",
00221                             "domain",
00222                             "disruption phenotype",
00223                             "enzyme regulation",
00224                             "function",
00225                             "induction",
00226                             "miscellaneous",
00227                             "pathway",
00228                             "pharmaceutical",
00229                             "polymorphism",
00230                             "PTM",
00231                             "RNA editing",#positions not parsed
00232                             "similarity",
00233                             "subunit",
00234                             "tissue specificity",
00235                             "toxic dose",
00236                              ]
00237 
00238             if element.attrib['type'] in simple_comments:
00239                 ann_key='comment_%s' % element.attrib['type'].replace(' ','')
00240                 for text_element in element.getiterator(NS + 'text'):
00241                     if text_element.text:
00242                         append_to_annotations(ann_key,text_element.text)
00243             elif element.attrib['type']=='subcellular location':
00244                 for subloc_element in element.getiterator(NS + 'subcellularLocation'):
00245                     for el in subloc_element.getchildren():
00246                         if el.text:
00247                             ann_key='comment_%s_%s' % (element.attrib['type'].replace(' ',''), el.tag.replace(NS,''))
00248                             append_to_annotations(ann_key,el.text)
00249             elif element.attrib['type']=='interaction':
00250                 for interact_element in element.getiterator(NS +'interactant'):
00251                     ann_key='comment_%s_intactId' % element.attrib['type']
00252                     append_to_annotations(ann_key,interact_element.attrib['intactId'])
00253             elif element.attrib['type']=='alternative products':
00254                 for alt_element in element.getiterator(NS +'isoform'):
00255                     ann_key='comment_%s_isoform' % element.attrib['type'].replace(' ','')
00256                     for id_element in alt_element.getiterator(NS +'id'):
00257                         append_to_annotations(ann_key,id_element.text)
00258             elif element.attrib['type']=='mass spectrometry':
00259                 ann_key='comment_%s' % element.attrib['type'].replace(' ','')
00260                 start=end=0
00261                 for loc_element in element.getiterator(NS +'location'):
00262                     pos_els=loc_element.getiterator(NS +'position')
00263                     pos_els=list(pos_els)
00264                     # this try should be avoided, maybe it is safer to skip postion parsing for mass spectrometry
00265                     try:
00266                         if pos_els:
00267                             end=int(pos_els[0].attrib['position'])
00268                             start=end-1
00269                         else:
00270                             start=int(loc_element.getiterator(NS +'begin')[0].attrib['position'])-1
00271                             end=int(loc_element.getiterator(NS +'end')[0].attrib['position'])
00272                     except :#undefined positions or erroneusly mapped
00273                         pass    
00274                 mass=element.attrib['mass']
00275                 method=element.attrib['mass'] #TODO - Check this, looks wrong!
00276                 if start==end==0:  
00277                     append_to_annotations(ann_key,'undefined:%s|%s'%(mass,method))
00278                 else:
00279                     append_to_annotations(ann_key,'%s..%s:%s|%s'%(start,end,mass,method))
00280             elif element.attrib['type']=='sequence caution':
00281                 pass#not parsed: few information, complex structure
00282             elif element.attrib['type']=='online information':
00283                 for link_element in element.getiterator(NS +'link'):
00284                     ann_key='comment_%s' % element.attrib['type'].replace(' ','')
00285                     for id_element in link_element.getiterator(NS +'link'):
00286                         append_to_annotations(ann_key,'%s@%s'%(element.attrib['name'],link_element.attrib['uri']))            
00287             
00288             #return raw XML comments if needed
00289             if self.return_raw_comments:
00290                 ann_key='comment_%s_xml' % element.attrib['type'].replace(' ','')
00291                 append_to_annotations(ann_key,ElementTree.tostring(element))
00292                 
00293         
00294         def _parse_dbReference(element):
00295             self.ParsedSeqRecord.dbxrefs.append(element.attrib['type']+':'+element.attrib['id'])
00296             #e.g.
00297             # <dbReference type="PDB" key="11" id="2GEZ">
00298             #   <property value="X-ray" type="method"/>
00299             #   <property value="2.60 A" type="resolution"/>
00300             #   <property value="A/C/E/G=1-192, B/D/F/H=193-325" type="chains"/>
00301             # </dbReference>
00302             if 'type' in element.attrib:
00303                 if element.attrib['type'] == 'PDB':
00304                         method=""
00305                         resolution=""
00306                         for ref_element in element.getchildren():  
00307                             if ref_element.tag==NS + 'property':
00308                                 dat_type=ref_element.attrib['type']
00309                                 if dat_type=='method':
00310                                     method=ref_element.attrib['value']
00311                                 if dat_type=='resolution':
00312                                     resolution=ref_element.attrib['value']
00313                                 if dat_type=='chains':
00314                                     pairs=ref_element.attrib['value'].split(',')
00315                                     for elem in pairs:
00316                                         pair=elem.strip().split('=')
00317                                         if pair[1]!='-':
00318                                             #TODO - How best to store these, do SeqFeatures make sense?
00319                                             feature=SeqFeature.SeqFeature()
00320                                             feature.type=element.attrib['type']
00321                                             feature.qualifiers['name']=element.attrib['id']
00322                                             feature.qualifiers['method']=method
00323                                             feature.qualifiers['resolution']=resolution
00324                                             feature.qualifiers['chains']=pair[0].split('/')
00325                                             start=int(pair[1].split('-')[0])-1
00326                                             end=int(pair[1].split('-')[1])
00327                                             feature.location=SeqFeature.FeatureLocation(start,end)
00328                                             #self.ParsedSeqRecord.features.append(feature)
00329 
00330             for ref_element in  element.getchildren():  
00331                 if ref_element.tag==NS + 'property':
00332                     pass# this data cannot be fitted in a seqrecord object with a simple list. however at least ensembl and EMBL parsing can be improved to add entries in dbxrefs
00333             
00334         def _parse_reference(element):
00335             reference=SeqFeature.Reference()
00336             authors=[]
00337             scopes=[]
00338             tissues=[]
00339             journal_name=''
00340             pub_type=''
00341             pub_date=''
00342             for ref_element in element.getchildren():
00343                 if ref_element.tag==NS + 'citation':
00344                     pub_type=ref_element.attrib['type']
00345                     if pub_type=='submission':
00346                         pub_type+=' to the '+ref_element.attrib['db']
00347                     if 'name' in ref_element.attrib:
00348                         journal_name=ref_element.attrib['name']
00349                     pub_date=ref_element.attrib.get('date','')
00350                     j_volume=ref_element.attrib.get('volume','')
00351                     j_first=ref_element.attrib.get('first','')
00352                     j_last=ref_element.attrib.get('last','')
00353                     for cit_element in ref_element.getchildren():
00354                         if cit_element.tag==NS + 'title':
00355                             reference.title=cit_element.text
00356                         elif cit_element.tag==NS + 'authorList':
00357                             for person_element in cit_element.getchildren():
00358                                 authors.append(person_element.attrib['name'])
00359                         elif cit_element.tag==NS + 'dbReference':
00360                             self.ParsedSeqRecord.dbxrefs.append(cit_element.attrib['type']+':'+cit_element.attrib['id'])
00361                             if cit_element.attrib['type']=='PubMed':
00362                                 reference.pubmed_id=cit_element.attrib['id']
00363                             elif ref_element.attrib['type']=='MEDLINE':
00364                                 reference.medline_id=cit_element.attrib['id']
00365                 elif ref_element.tag==NS + 'scope':
00366                     scopes.append(ref_element.text)
00367                 elif ref_element.tag==NS + 'source':
00368                     for source_element in ref_element.getchildren():
00369                         if source_element.tag==NS + 'tissue':
00370                             tissues.append(source_element.text)
00371             if scopes:
00372                 scopes_str='Scope: '+', '.join(scopes)
00373             else:
00374                 scopes_str=''
00375             if tissues:
00376                 tissues_str='Tissue: '+', '.join(tissues)
00377             else:
00378                 tissues_str=''
00379             
00380             reference.location = [] #locations cannot be parsed since they are actually written in free text inside scopes so all the references are put in the annotation.
00381             reference.authors = ', '.join(authors) 
00382             if journal_name:
00383                 if pub_date and j_volume and j_first and j_last:
00384                     reference.journal = REFERENCE_JOURNAL % dict(name=journal_name,
00385                         volume=j_volume, first=j_first, last=j_last, pub_date=pub_date)
00386                 else:
00387                     reference.journal = journal_name 
00388             reference.comment = ' | '.join((pub_type,pub_date,scopes_str,tissues_str))
00389             append_to_annotations('references', reference)
00390         
00391         def _parse_position(element, offset=0):
00392             try:
00393                 position=int(element.attrib['position']) + offset
00394             except KeyError, err:
00395                 position=None
00396             status = element.attrib.get('status', '')
00397             if status == 'unknown':
00398                 assert position is None
00399                 return SeqFeature.UnknownPosition()
00400             elif not status:
00401                 return SeqFeature.ExactPosition(position)
00402             elif status == 'greater than':
00403                 return SeqFeature.AfterPosition(position)
00404             elif status == 'less than':
00405                 return SeqFeature.BeforePosition(position)
00406             elif status == 'uncertain':
00407                 return SeqFeature.UncertainPosition(position)
00408             else:
00409                 raise NotImplementedError("Position status %r" % status)
00410 
00411         def _parse_feature(element):
00412             feature=SeqFeature.SeqFeature()
00413             for k,v in element.attrib.items():
00414                 feature.qualifiers[k]=v
00415             feature.type=element.attrib.get('type','')
00416             if 'id' in element.attrib:
00417                 feature.id=element.attrib['id']
00418             for feature_element in element.getchildren():
00419                 if feature_element.tag==NS + 'location':
00420                     position_elements=feature_element.findall(NS + 'position')
00421                     if position_elements:
00422                         element = position_elements[0]
00423                         start_position = _parse_position(element, -1)
00424                         end_position = _parse_position(element)
00425                     else:
00426                         element = feature_element.findall(NS + 'begin')[0]
00427                         start_position=_parse_position(element, -1)
00428                         element = feature_element.findall(NS + 'end')[0]
00429                         end_position=_parse_position(element)
00430                     feature.location=SeqFeature.FeatureLocation(start_position,end_position)
00431                 else:
00432                     try:
00433                         feature.qualifiers[feature_element.tag.replace(NS,'')]=feature_element.text
00434                     except:
00435                         pass#skip unparsable tag
00436             self.ParsedSeqRecord.features.append(feature)
00437             
00438         def _parse_proteinExistence(element):
00439             append_to_annotations('proteinExistence', element.attrib['type'])   
00440             
00441         def _parse_evidence(element):
00442             for k, v in  element.attrib.items():
00443                 ann_key = k
00444                 append_to_annotations(ann_key, v)   
00445         
00446         def  _parse_sequence(element):
00447             for k, v in element.attrib.items():
00448                 if k in ("length", "mass", "version"):
00449                     self.ParsedSeqRecord.annotations['sequence_%s' % k] = int(v)
00450                 else:
00451                     self.ParsedSeqRecord.annotations['sequence_%s' % k] = v
00452             seq=''.join((element.text.split()))
00453             self.ParsedSeqRecord.seq=Seq.Seq(seq,self.alphabet)
00454             
00455         #============================================#
00456         #Initialize SeqRecord
00457         self.ParsedSeqRecord=SeqRecord('', id='') 
00458         
00459         #Entry attribs parsing
00460         #Unknown dataset should not happen!
00461         self.dbname=self.entry.attrib.get('dataset', 'UnknownDataset')
00462         #add attribs to annotations
00463         for k, v in self.entry.attrib.items():
00464             if k in ("version"):
00465                 #original
00466                 #self.ParsedSeqRecord.annotations["entry_%s" % k] = int(v)
00467                 #To cope with swissProt plain text parser. this can cause errors 
00468                 #if the attrib has the same name of an other annotation
00469                 self.ParsedSeqRecord.annotations[k] = int(v)
00470             else:
00471                 #self.ParsedSeqRecord.annotations["entry_%s" % k] = v
00472                 self.ParsedSeqRecord.annotations[k] = v # to cope with swissProt plain text parser
00473 
00474         #Top-to-bottom entry children parsing
00475         for element in self.entry.getchildren():
00476             if element.tag==NS + 'name':
00477                 _parse_name(element)
00478             elif element.tag==NS + 'accession':
00479                 _parse_accession(element)
00480             elif element.tag==NS + 'protein':
00481                 _parse_protein(element)  
00482             elif element.tag==NS + 'gene':
00483                 _parse_gene(element)
00484             elif element.tag==NS + 'geneLocation':
00485                 _parse_geneLocation(element)
00486             elif element.tag==NS + 'organism':
00487                 _parse_organism(element)          
00488             elif element.tag==NS + 'organismHost':
00489                 _parse_organismHost(element)
00490             elif element.tag==NS + 'keyword':
00491                 _parse_keyword(element)
00492             elif element.tag==NS + 'comment':
00493                 _parse_comment(element)
00494             elif element.tag==NS + 'dbReference':
00495                 _parse_dbReference(element)
00496             elif element.tag==NS + 'reference':
00497                 _parse_reference(element)
00498             elif element.tag==NS + 'feature':
00499                 _parse_feature(element)
00500             elif element.tag==NS + 'proteinExistence':
00501                 _parse_proteinExistence(element)
00502             elif element.tag==NS + 'evidence':
00503                 _parse_evidence(element)
00504             elif element.tag==NS + 'sequence':
00505                 _parse_sequence(element)
00506             else:
00507                 pass   
00508             
00509         self.ParsedSeqRecord.dbxrefs=list(set(self.ParsedSeqRecord.dbxrefs))#remove duplicate dbxrefs
00510         self.ParsedSeqRecord.dbxrefs.sort()
00511 
00512         # use first accession as id
00513         if not self.ParsedSeqRecord.id:
00514             self.ParsedSeqRecord.id=self.ParsedSeqRecord.annotations['accessions'][0]
00515         
00516         return self.ParsedSeqRecord
00517