Back to index

python-biopython  1.60
SwissIO.py
Go to the documentation of this file.
00001 # Copyright 2006-2010 by Peter Cock and Michiel de Hoon.
00002 # All rights reserved.
00003 #
00004 # This code is part of the Biopython distribution and governed by its
00005 # license.  Please see the LICENSE file that should have been included
00006 # as part of this package.
00007 
00008 """Bio.SeqIO support for the "swiss" (aka SwissProt/UniProt) file format.
00009 
00010 You are expected to use this module via the Bio.SeqIO functions.
00011 See also the Bio.SwissProt module which offers more than just accessing
00012 the sequences as SeqRecord objects.
00013 
00014 See also Bio.SeqIO.UniprotIO.py which supports the "uniprot-xml" format.
00015 """
00016 
00017 from Bio import Seq
00018 from Bio import SeqRecord
00019 from Bio import Alphabet
00020 from Bio import SeqFeature
00021 from Bio import SwissProt
00022 
00023 def _make_position(location_string, offset=0):
00024     """Turn a Swiss location position into a SeqFeature position object (PRIVATE).
00025 
00026     An offset of -1 is used with a start location to make it pythonic.
00027     """
00028     if location_string=="?":
00029         return SeqFeature.UnknownPosition()
00030     #Hack so that feature from 0 to 0 becomes 0 to 0, not -1 to 0.
00031     try:
00032         return SeqFeature.ExactPosition(max(0, offset+int(location_string)))
00033     except ValueError:
00034         pass
00035     if location_string.startswith("<"):
00036         try:
00037             return SeqFeature.BeforePosition(max(0,offset+int(location_string[1:])))
00038         except ValueError:
00039             pass
00040     elif location_string.startswith(">"): # e.g. ">13"
00041         try:
00042             return SeqFeature.AfterPosition(max(0,offset+int(location_string[1:])))
00043         except ValueError :
00044             pass
00045     elif location_string.startswith("?"): # e.g. "?22"
00046         try:
00047             return SeqFeature.UncertainPosition(max(0,offset+int(location_string[1:])))
00048         except ValueError:
00049             pass
00050     raise NotImplementedError("Cannot parse location '%s'" % location_string)
00051 
00052 def _make_seqfeature(name, from_res, to_res, description, ft_id):
00053     """Construct SeqFeature from feature data from parser (PRIVATE)."""
00054     loc = SeqFeature.FeatureLocation(_make_position(from_res,-1),
00055                                      _make_position(to_res, 0))
00056     if not ft_id:
00057         ft_id = "<unknown id>" #The default in SeqFeature object
00058     return SeqFeature.SeqFeature(loc, type=name, id=ft_id,
00059                                  qualifiers={"description":description})
00060 
00061 #This is a generator function!
00062 def SwissIterator(handle):
00063     """Breaks up a Swiss-Prot/UniProt file into SeqRecord objects.
00064 
00065     Every section from the ID line to the terminating // becomes
00066     a single SeqRecord with associated annotation and features.
00067 
00068     This parser is for the flat file "swiss" format as used by:
00069      * Swiss-Prot aka SwissProt
00070      * TrEMBL
00071      * UniProtKB aka UniProt Knowledgebase
00072 
00073     For consistency with BioPerl and EMBOSS we call this the "swiss"
00074     format. See also the SeqIO support for "uniprot-xml" format.
00075     """
00076     swiss_records = SwissProt.parse(handle)
00077     for swiss_record in swiss_records:
00078         # Convert the SwissProt record to a SeqRecord
00079         seq = Seq.Seq(swiss_record.sequence, Alphabet.generic_protein)
00080         record = SeqRecord.SeqRecord(seq,
00081                                      id=swiss_record.accessions[0],
00082                                      name=swiss_record.entry_name,
00083                                      description=swiss_record.description,
00084                                      features=[_make_seqfeature(*f) for f \
00085                                                in swiss_record.features],
00086                                     )
00087         record.description = swiss_record.description
00088         for cross_reference in swiss_record.cross_references:
00089             if len(cross_reference) < 2:
00090                 continue
00091             database, accession = cross_reference[:2]
00092             dbxref = "%s:%s" % (database, accession)
00093             if not dbxref in record.dbxrefs:
00094                 record.dbxrefs.append(dbxref)
00095         annotations = record.annotations
00096         annotations['accessions'] = swiss_record.accessions
00097         annotations['date'] = swiss_record.created[0]
00098         annotations['date_last_sequence_update'] = swiss_record.sequence_update[0]
00099         if swiss_record.annotation_update:
00100             annotations['date_last_annotation_update'] = swiss_record.annotation_update[0]
00101         if swiss_record.gene_name:
00102             annotations['gene_name'] = swiss_record.gene_name
00103         annotations['organism'] = swiss_record.organism.rstrip(".")
00104         annotations['taxonomy'] = swiss_record.organism_classification
00105         annotations['ncbi_taxid'] = swiss_record.taxonomy_id
00106         if swiss_record.host_organism:
00107             annotations['organism_host'] = swiss_record.host_organism
00108         if swiss_record.host_taxonomy_id:
00109             annotations['host_ncbi_taxid'] = swiss_record.host_taxonomy_id
00110         if swiss_record.comments:
00111             annotations['comment'] = "\n".join(swiss_record.comments)
00112         if swiss_record.references:
00113             annotations['references'] = []
00114             for reference in swiss_record.references:
00115                 feature = SeqFeature.Reference()
00116                 feature.comment = " ".join(["%s=%s;" % (key, value) \
00117                                             for key, value \
00118                                             in reference.comments])
00119                 for key, value in reference.references:
00120                     if key == 'PubMed':
00121                         feature.pubmed_id = value
00122                     elif key == 'MEDLINE':
00123                         feature.medline_id = value
00124                     elif key == 'DOI':
00125                         pass
00126                     elif key == 'AGRICOLA':
00127                         pass
00128                     else:
00129                         raise ValueError(\
00130                             "Unknown key %s found in references" % key)
00131                 feature.authors = reference.authors
00132                 feature.title = reference.title
00133                 feature.journal = reference.location
00134                 annotations['references'].append(feature)
00135         if swiss_record.keywords:
00136             record.annotations['keywords'] = swiss_record.keywords
00137         yield record
00138 
00139 if __name__ == "__main__":
00140     print "Quick self test..."
00141 
00142     example_filename = "../../Tests/SwissProt/sp008"
00143 
00144     import os
00145     if not os.path.isfile(example_filename):
00146         print "Missing test file %s" % example_filename
00147     else:
00148         #Try parsing it!
00149         handle = open(example_filename)
00150         records = SwissIterator(handle)
00151         for record in records:
00152             print record.name
00153             print record.id
00154             print record.annotations['keywords']
00155             print repr(record.annotations['organism'])
00156             print record.seq.tostring()[:20] + "..."
00157             for f in record.features: print f
00158         handle.close()
00159