Back to index

python-biopython  1.60
Public Member Functions | Public Attributes
Bio.SeqIO.UniprotIO.Parser Class Reference

List of all members.

Public Member Functions

def __init__
def parse

Public Attributes

 entry
 alphabet
 return_raw_comments
 ParsedSeqRecord
 dbname

Detailed Description

Parse a UniProt XML entry to a SeqRecord.

return_raw_comments=True to get back the complete comment field in XML format
alphabet=Alphabet.ProteinAlphabet()    can be modified if needed, default is protein alphabet.

Definition at line 78 of file UniprotIO.py.


Constructor & Destructor Documentation

def Bio.SeqIO.UniprotIO.Parser.__init__ (   self,
  elem,
  alphabet = Alphabet.ProteinAlphabet(),
  return_raw_comments = False 
)

Definition at line 84 of file UniprotIO.py.

00084 
00085     def __init__(self, elem, alphabet=Alphabet.ProteinAlphabet(), return_raw_comments=False):
00086         self.entry=elem
00087         self.alphabet=alphabet
00088         self.return_raw_comments=return_raw_comments
    

Member Function Documentation

Parse the input.

Definition at line 89 of file UniprotIO.py.

00089 
00090     def parse(self):
00091         """Parse the input."""
00092         assert self.entry.tag == NS + 'entry'
00093         
00094         def append_to_annotations(key, value):
00095             if key not in self.ParsedSeqRecord.annotations:
00096                 self.ParsedSeqRecord.annotations[key]=[]
00097             if value not in self.ParsedSeqRecord.annotations[key]:
00098                 self.ParsedSeqRecord.annotations[key].append(value)
00099             
00100         def _parse_name(element):
00101             self.ParsedSeqRecord.name=element.text
00102             self.ParsedSeqRecord.dbxrefs.append(self.dbname+':'+element.text)
00103         
00104         def _parse_accession(element):
00105             append_to_annotations('accessions', element.text)# to cope with SwissProt plain text parser
00106             self.ParsedSeqRecord.dbxrefs.append(self.dbname+':'+element.text)
00107         
00108         def _parse_protein(element):
00109             """Parse protein names (PRIVATE)."""
00110             descr_set=False
00111             for protein_element in element.getchildren():
00112                 if protein_element.tag in [NS + 'recommendedName', NS + 'alternativeName']:#recommendedName tag are parsed before
00113                     #use protein fields for name and description
00114                     for rec_name in protein_element.getchildren():
00115                         ann_key='%s_%s' % (protein_element.tag.replace(NS,''), rec_name.tag.replace(NS,''))
00116                         append_to_annotations(ann_key, rec_name.text)
00117                         if (rec_name.tag==NS + 'fullName') and not descr_set:
00118                             self.ParsedSeqRecord.description=rec_name.text
00119                             descr_set=True
00120                 elif protein_element.tag==NS + 'component':
00121                     pass #not parsed 
00122                 elif protein_element.tag==NS + 'domain':
00123                     pass #not parsed 
00124         
00125         def _parse_gene(element):
00126             for genename_element in element.getchildren():  
00127                 if 'type' in genename_element.attrib:
00128                     ann_key='gene_%s_%s' % (genename_element.tag.replace(NS,''), genename_element.attrib['type'])
00129                     if genename_element.attrib['type']=='primary':
00130                         self.ParsedSeqRecord.annotations[ann_key]=genename_element.text
00131                     else:
00132                         append_to_annotations(ann_key,genename_element.text)
00133         
00134         def _parse_geneLocation(element):
00135             append_to_annotations('geneLocation', element.attrib['type'])
00136         
00137         def _parse_organism(element):
00138             organism_name = com_name = sci_name = ''
00139             for organism_element in element.getchildren():  
00140                 if organism_element.tag==NS + 'name':
00141                     if organism_element.text:
00142                         if organism_element.attrib['type'] == 'scientific':
00143                             sci_name = organism_element.text
00144                         elif organism_element.attrib['type'] == 'common':
00145                             com_name = organism_element.text
00146                         else:
00147                             #e.g. synonym
00148                             append_to_annotations("organism_name", organism_element.text)
00149                 elif organism_element.tag==NS + 'dbReference':
00150                     self.ParsedSeqRecord.dbxrefs.append(organism_element.attrib['type']+':'+organism_element.attrib['id'])
00151                 elif organism_element.tag==NS + 'lineage':
00152                     for taxon_element in organism_element.getchildren():
00153                         if taxon_element.tag==NS + 'taxon':
00154                             append_to_annotations('taxonomy',taxon_element.text)
00155             if sci_name and com_name:
00156                 organism_name = '%s (%s)' % (sci_name, com_name)
00157             elif sci_name:
00158                 organism_name = sci_name
00159             elif com_name:
00160                 organism_name = com_name
00161             self.ParsedSeqRecord.annotations['organism']=organism_name
00162             
00163         def _parse_organismHost(element):
00164             for organism_element in element.getchildren():  
00165                 if organism_element.tag==NS + 'name': 
00166                     append_to_annotations("organism_host", organism_element.text)
00167                         
00168         def _parse_keyword(element):      
00169             append_to_annotations('keywords',element.text)
00170         
00171         def _parse_comment(element):
00172             """Parse comments (PRIVATE).
00173             
00174             Comment fields are very heterogeneus. each type has his own (frequently mutated) schema.
00175             To store all the contained data, more complex data structures are needed, such as 
00176             annidated dictionaries. This is left to end user, by optionally setting:
00177             
00178             return_raw_comments=True 
00179             
00180             the orginal XMLs is returned in the annotation fields.
00181             
00182             available comment types at december 2009:
00183                 "allergen"
00184                 "alternative products"
00185                 "biotechnology"
00186                 "biophysicochemical properties"
00187                 "catalytic activity"
00188                 "caution"
00189                 "cofactor"
00190                 "developmental stage"
00191                 "disease"
00192                 "domain"
00193                 "disruption phenotype"
00194                 "enzyme regulation"
00195                 "function"
00196                 "induction"
00197                 "miscellaneous"
00198                 "pathway"
00199                 "pharmaceutical"
00200                 "polymorphism"
00201                 "PTM"
00202                 "RNA editing"
00203                 "similarity"
00204                 "subcellular location"
00205                 "sequence caution"
00206                 "subunit"
00207                 "tissue specificity"
00208                 "toxic dose"
00209                 "online information"
00210                 "mass spectrometry"
00211                 "interaction"
00212             """
00213             
00214             simple_comments=["allergen",
00215                             "biotechnology",
00216                             "biophysicochemical properties",
00217                             "catalytic activity",
00218                             "caution",
00219                             "cofactor",
00220                             "developmental stage",
00221                             "disease",
00222                             "domain",
00223                             "disruption phenotype",
00224                             "enzyme regulation",
00225                             "function",
00226                             "induction",
00227                             "miscellaneous",
00228                             "pathway",
00229                             "pharmaceutical",
00230                             "polymorphism",
00231                             "PTM",
00232                             "RNA editing",#positions not parsed
00233                             "similarity",
00234                             "subunit",
00235                             "tissue specificity",
00236                             "toxic dose",
00237                              ]
00238 
00239             if element.attrib['type'] in simple_comments:
00240                 ann_key='comment_%s' % element.attrib['type'].replace(' ','')
00241                 for text_element in element.getiterator(NS + 'text'):
00242                     if text_element.text:
00243                         append_to_annotations(ann_key,text_element.text)
00244             elif element.attrib['type']=='subcellular location':
00245                 for subloc_element in element.getiterator(NS + 'subcellularLocation'):
00246                     for el in subloc_element.getchildren():
00247                         if el.text:
00248                             ann_key='comment_%s_%s' % (element.attrib['type'].replace(' ',''), el.tag.replace(NS,''))
00249                             append_to_annotations(ann_key,el.text)
00250             elif element.attrib['type']=='interaction':
00251                 for interact_element in element.getiterator(NS +'interactant'):
00252                     ann_key='comment_%s_intactId' % element.attrib['type']
00253                     append_to_annotations(ann_key,interact_element.attrib['intactId'])
00254             elif element.attrib['type']=='alternative products':
00255                 for alt_element in element.getiterator(NS +'isoform'):
00256                     ann_key='comment_%s_isoform' % element.attrib['type'].replace(' ','')
00257                     for id_element in alt_element.getiterator(NS +'id'):
00258                         append_to_annotations(ann_key,id_element.text)
00259             elif element.attrib['type']=='mass spectrometry':
00260                 ann_key='comment_%s' % element.attrib['type'].replace(' ','')
00261                 start=end=0
00262                 for loc_element in element.getiterator(NS +'location'):
00263                     pos_els=loc_element.getiterator(NS +'position')
00264                     pos_els=list(pos_els)
00265                     # this try should be avoided, maybe it is safer to skip postion parsing for mass spectrometry
00266                     try:
00267                         if pos_els:
00268                             end=int(pos_els[0].attrib['position'])
00269                             start=end-1
00270                         else:
00271                             start=int(loc_element.getiterator(NS +'begin')[0].attrib['position'])-1
00272                             end=int(loc_element.getiterator(NS +'end')[0].attrib['position'])
00273                     except :#undefined positions or erroneusly mapped
00274                         pass    
00275                 mass=element.attrib['mass']
00276                 method=element.attrib['mass'] #TODO - Check this, looks wrong!
00277                 if start==end==0:  
00278                     append_to_annotations(ann_key,'undefined:%s|%s'%(mass,method))
00279                 else:
00280                     append_to_annotations(ann_key,'%s..%s:%s|%s'%(start,end,mass,method))
00281             elif element.attrib['type']=='sequence caution':
00282                 pass#not parsed: few information, complex structure
00283             elif element.attrib['type']=='online information':
00284                 for link_element in element.getiterator(NS +'link'):
00285                     ann_key='comment_%s' % element.attrib['type'].replace(' ','')
00286                     for id_element in link_element.getiterator(NS +'link'):
00287                         append_to_annotations(ann_key,'%s@%s'%(element.attrib['name'],link_element.attrib['uri']))            
00288             
00289             #return raw XML comments if needed
00290             if self.return_raw_comments:
00291                 ann_key='comment_%s_xml' % element.attrib['type'].replace(' ','')
00292                 append_to_annotations(ann_key,ElementTree.tostring(element))
00293                 
00294         
00295         def _parse_dbReference(element):
00296             self.ParsedSeqRecord.dbxrefs.append(element.attrib['type']+':'+element.attrib['id'])
00297             #e.g.
00298             # <dbReference type="PDB" key="11" id="2GEZ">
00299             #   <property value="X-ray" type="method"/>
00300             #   <property value="2.60 A" type="resolution"/>
00301             #   <property value="A/C/E/G=1-192, B/D/F/H=193-325" type="chains"/>
00302             # </dbReference>
00303             if 'type' in element.attrib:
00304                 if element.attrib['type'] == 'PDB':
00305                         method=""
00306                         resolution=""
00307                         for ref_element in element.getchildren():  
00308                             if ref_element.tag==NS + 'property':
00309                                 dat_type=ref_element.attrib['type']
00310                                 if dat_type=='method':
00311                                     method=ref_element.attrib['value']
00312                                 if dat_type=='resolution':
00313                                     resolution=ref_element.attrib['value']
00314                                 if dat_type=='chains':
00315                                     pairs=ref_element.attrib['value'].split(',')
00316                                     for elem in pairs:
00317                                         pair=elem.strip().split('=')
00318                                         if pair[1]!='-':
00319                                             #TODO - How best to store these, do SeqFeatures make sense?
00320                                             feature=SeqFeature.SeqFeature()
00321                                             feature.type=element.attrib['type']
00322                                             feature.qualifiers['name']=element.attrib['id']
00323                                             feature.qualifiers['method']=method
00324                                             feature.qualifiers['resolution']=resolution
00325                                             feature.qualifiers['chains']=pair[0].split('/')
00326                                             start=int(pair[1].split('-')[0])-1
00327                                             end=int(pair[1].split('-')[1])
00328                                             feature.location=SeqFeature.FeatureLocation(start,end)
00329                                             #self.ParsedSeqRecord.features.append(feature)
00330 
00331             for ref_element in  element.getchildren():  
00332                 if ref_element.tag==NS + 'property':
00333                     pass# this data cannot be fitted in a seqrecord object with a simple list. however at least ensembl and EMBL parsing can be improved to add entries in dbxrefs
00334             
00335         def _parse_reference(element):
00336             reference=SeqFeature.Reference()
00337             authors=[]
00338             scopes=[]
00339             tissues=[]
00340             journal_name=''
00341             pub_type=''
00342             pub_date=''
00343             for ref_element in element.getchildren():
00344                 if ref_element.tag==NS + 'citation':
00345                     pub_type=ref_element.attrib['type']
00346                     if pub_type=='submission':
00347                         pub_type+=' to the '+ref_element.attrib['db']
00348                     if 'name' in ref_element.attrib:
00349                         journal_name=ref_element.attrib['name']
00350                     pub_date=ref_element.attrib.get('date','')
00351                     j_volume=ref_element.attrib.get('volume','')
00352                     j_first=ref_element.attrib.get('first','')
00353                     j_last=ref_element.attrib.get('last','')
00354                     for cit_element in ref_element.getchildren():
00355                         if cit_element.tag==NS + 'title':
00356                             reference.title=cit_element.text
00357                         elif cit_element.tag==NS + 'authorList':
00358                             for person_element in cit_element.getchildren():
00359                                 authors.append(person_element.attrib['name'])
00360                         elif cit_element.tag==NS + 'dbReference':
00361                             self.ParsedSeqRecord.dbxrefs.append(cit_element.attrib['type']+':'+cit_element.attrib['id'])
00362                             if cit_element.attrib['type']=='PubMed':
00363                                 reference.pubmed_id=cit_element.attrib['id']
00364                             elif ref_element.attrib['type']=='MEDLINE':
00365                                 reference.medline_id=cit_element.attrib['id']
00366                 elif ref_element.tag==NS + 'scope':
00367                     scopes.append(ref_element.text)
00368                 elif ref_element.tag==NS + 'source':
00369                     for source_element in ref_element.getchildren():
00370                         if source_element.tag==NS + 'tissue':
00371                             tissues.append(source_element.text)
00372             if scopes:
00373                 scopes_str='Scope: '+', '.join(scopes)
00374             else:
00375                 scopes_str=''
00376             if tissues:
00377                 tissues_str='Tissue: '+', '.join(tissues)
00378             else:
00379                 tissues_str=''
00380             
00381             reference.location = [] #locations cannot be parsed since they are actually written in free text inside scopes so all the references are put in the annotation.
00382             reference.authors = ', '.join(authors) 
00383             if journal_name:
00384                 if pub_date and j_volume and j_first and j_last:
00385                     reference.journal = REFERENCE_JOURNAL % dict(name=journal_name,
00386                         volume=j_volume, first=j_first, last=j_last, pub_date=pub_date)
00387                 else:
00388                     reference.journal = journal_name 
00389             reference.comment = ' | '.join((pub_type,pub_date,scopes_str,tissues_str))
00390             append_to_annotations('references', reference)
00391         
00392         def _parse_position(element, offset=0):
00393             try:
00394                 position=int(element.attrib['position']) + offset
00395             except KeyError, err:
00396                 position=None
00397             status = element.attrib.get('status', '')
00398             if status == 'unknown':
00399                 assert position is None
00400                 return SeqFeature.UnknownPosition()
00401             elif not status:
00402                 return SeqFeature.ExactPosition(position)
00403             elif status == 'greater than':
00404                 return SeqFeature.AfterPosition(position)
00405             elif status == 'less than':
00406                 return SeqFeature.BeforePosition(position)
00407             elif status == 'uncertain':
00408                 return SeqFeature.UncertainPosition(position)
00409             else:
00410                 raise NotImplementedError("Position status %r" % status)
00411 
00412         def _parse_feature(element):
00413             feature=SeqFeature.SeqFeature()
00414             for k,v in element.attrib.items():
00415                 feature.qualifiers[k]=v
00416             feature.type=element.attrib.get('type','')
00417             if 'id' in element.attrib:
00418                 feature.id=element.attrib['id']
00419             for feature_element in element.getchildren():
00420                 if feature_element.tag==NS + 'location':
00421                     position_elements=feature_element.findall(NS + 'position')
00422                     if position_elements:
00423                         element = position_elements[0]
00424                         start_position = _parse_position(element, -1)
00425                         end_position = _parse_position(element)
00426                     else:
00427                         element = feature_element.findall(NS + 'begin')[0]
00428                         start_position=_parse_position(element, -1)
00429                         element = feature_element.findall(NS + 'end')[0]
00430                         end_position=_parse_position(element)
00431                     feature.location=SeqFeature.FeatureLocation(start_position,end_position)
00432                 else:
00433                     try:
00434                         feature.qualifiers[feature_element.tag.replace(NS,'')]=feature_element.text
00435                     except:
00436                         pass#skip unparsable tag
00437             self.ParsedSeqRecord.features.append(feature)
00438             
00439         def _parse_proteinExistence(element):
00440             append_to_annotations('proteinExistence', element.attrib['type'])   
00441             
00442         def _parse_evidence(element):
00443             for k, v in  element.attrib.items():
00444                 ann_key = k
00445                 append_to_annotations(ann_key, v)   
00446         
00447         def  _parse_sequence(element):
00448             for k, v in element.attrib.items():
00449                 if k in ("length", "mass", "version"):
00450                     self.ParsedSeqRecord.annotations['sequence_%s' % k] = int(v)
00451                 else:
00452                     self.ParsedSeqRecord.annotations['sequence_%s' % k] = v
00453             seq=''.join((element.text.split()))
00454             self.ParsedSeqRecord.seq=Seq.Seq(seq,self.alphabet)
00455             
00456         #============================================#
00457         #Initialize SeqRecord
00458         self.ParsedSeqRecord=SeqRecord('', id='') 
00459         
00460         #Entry attribs parsing
00461         #Unknown dataset should not happen!
00462         self.dbname=self.entry.attrib.get('dataset', 'UnknownDataset')
00463         #add attribs to annotations
00464         for k, v in self.entry.attrib.items():
00465             if k in ("version"):
00466                 #original
00467                 #self.ParsedSeqRecord.annotations["entry_%s" % k] = int(v)
00468                 #To cope with swissProt plain text parser. this can cause errors 
00469                 #if the attrib has the same name of an other annotation
00470                 self.ParsedSeqRecord.annotations[k] = int(v)
00471             else:
00472                 #self.ParsedSeqRecord.annotations["entry_%s" % k] = v
00473                 self.ParsedSeqRecord.annotations[k] = v # to cope with swissProt plain text parser
00474 
00475         #Top-to-bottom entry children parsing
00476         for element in self.entry.getchildren():
00477             if element.tag==NS + 'name':
00478                 _parse_name(element)
00479             elif element.tag==NS + 'accession':
00480                 _parse_accession(element)
00481             elif element.tag==NS + 'protein':
00482                 _parse_protein(element)  
00483             elif element.tag==NS + 'gene':
00484                 _parse_gene(element)
00485             elif element.tag==NS + 'geneLocation':
00486                 _parse_geneLocation(element)
00487             elif element.tag==NS + 'organism':
00488                 _parse_organism(element)          
00489             elif element.tag==NS + 'organismHost':
00490                 _parse_organismHost(element)
00491             elif element.tag==NS + 'keyword':
00492                 _parse_keyword(element)
00493             elif element.tag==NS + 'comment':
00494                 _parse_comment(element)
00495             elif element.tag==NS + 'dbReference':
00496                 _parse_dbReference(element)
00497             elif element.tag==NS + 'reference':
00498                 _parse_reference(element)
00499             elif element.tag==NS + 'feature':
00500                 _parse_feature(element)
00501             elif element.tag==NS + 'proteinExistence':
00502                 _parse_proteinExistence(element)
00503             elif element.tag==NS + 'evidence':
00504                 _parse_evidence(element)
00505             elif element.tag==NS + 'sequence':
00506                 _parse_sequence(element)
00507             else:
00508                 pass   
00509             
00510         self.ParsedSeqRecord.dbxrefs=list(set(self.ParsedSeqRecord.dbxrefs))#remove duplicate dbxrefs
00511         self.ParsedSeqRecord.dbxrefs.sort()
00512 
00513         # use first accession as id
00514         if not self.ParsedSeqRecord.id:
00515             self.ParsedSeqRecord.id=self.ParsedSeqRecord.annotations['accessions'][0]
00516         
00517         return self.ParsedSeqRecord
00518         

Member Data Documentation

Definition at line 86 of file UniprotIO.py.

Definition at line 461 of file UniprotIO.py.

Definition at line 85 of file UniprotIO.py.

Definition at line 457 of file UniprotIO.py.

Definition at line 87 of file UniprotIO.py.


The documentation for this class was generated from the following file: