Back to index

python-biopython  1.60
test_Uniprot.py
Go to the documentation of this file.
00001 #!/usr/bin/env python
00002 """Test for the Uniprot parser on Uniprot XML files.
00003 """
00004 import os
00005 import copy
00006 import unittest
00007 
00008 from Bio import SeqIO
00009 from Bio.SeqRecord import SeqRecord
00010 
# Bio.SeqIO.UniprotIO leaves its ElementTree attribute as None when its own
# import of an ElementTree implementation fails. In that case stop here, at
# import time, instead of letting every test below fail one by one.
if SeqIO.UniprotIO.ElementTree is None:
    from Bio import MissingPythonDependencyError
    _missing_dependency_message = (
        "No ElementTree module was found. "
        "Use Python 2.5+, lxml or elementtree if you "
        "want to use Bio.SeqIO.UniprotIO."
    )
    raise MissingPythonDependencyError(_missing_dependency_message)
00017 
00018 from seq_tests_common import compare_reference, compare_record
00019 
class TestUniprot(unittest.TestCase):
    """Tests for the "uniprot-xml" parser in Bio.SeqIO.

    The tests read example files from the ``SwissProt`` directory (relative
    to the current working directory) and either check individual fields of
    the parsed record or compare the XML record with the same entry parsed
    from SwissProt plain text ("swiss" format).
    """

    def _read_id_list(self, filename):
        """Return the stripped lines of ``filename`` as a list of strings.

        The handle is closed in a ``finally`` clause so it is released even
        if reading fails (``try/finally`` rather than ``with`` keeps the
        module usable on Python 2.5 without a ``__future__`` import).
        """
        handle = open(filename)
        try:
            return [line.strip() for line in handle]
        finally:
            handle.close()

    def test_uni001(self):
        "Parsing Uniprot file uni001"
        filename = 'uni001'
        datafile = os.path.join('SwissProt', filename)

        # Parse the single record; always release the handle, even when the
        # parser raises.
        test_handle = open(datafile)
        try:
            seq_record = SeqIO.read(test_handle, "uniprot-xml")
        finally:
            test_handle.close()

        self.assertTrue(isinstance(seq_record, SeqRecord))

        # test a couple of things on the record -- this is not exhaustive
        self.assertEqual(seq_record.id, "Q91G55")
        self.assertEqual(seq_record.name, "043L_IIV6")
        self.assertEqual(seq_record.description, "Uncharacterized protein 043L")
        self.assertEqual(repr(seq_record.seq), "Seq('MDLINNKLNIEIQKFCLDLEKKYNINYNNLIDLWFNKESTERLIKCEVNLENKI...IPI', ProteinAlphabet())")

        # NOTE: the checks below were already disabled in the original test
        # and are kept for reference only.
        # self.assertEqual(seq_record.accessions, ['Q91G55']) #seq_record.accessions does not exist
        # self.assertEqual(seq_record.organism_classification, ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Mammalia', 'Eutheria', 'Primates', 'Catarrhini', 'Hominidae', 'Homo'])
        # self.assertEqual(record.seqinfo, (348, 39676, '75818910'))

        # Features
        self.assertEqual(len(seq_record.features), 1)
        self.assertEqual(repr(seq_record.features[0]), "SeqFeature(FeatureLocation(ExactPosition(0), ExactPosition(116)), type='chain', id='PRO_0000377969')")

        annotations = seq_record.annotations

        # References
        self.assertEqual(len(annotations['references']), 2)
        first_ref = annotations['references'][0]
        self.assertEqual(first_ref.authors, 'Jakob N.J., Mueller K., Bahr U., Darai G.')
        self.assertEqual(first_ref.title, 'Analysis of the first complete DNA sequence of an invertebrate iridovirus: coding strategy of the genome of Chilo iridescent virus.')
        self.assertEqual(first_ref.journal, 'Virology 286:182-196(2001)')
        self.assertEqual(first_ref.comment, 'journal article | 2001 | Scope: NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA] | ')

        # Database cross references
        self.assertEqual(len(seq_record.dbxrefs), 11)
        self.assertEqual(seq_record.dbxrefs[0], 'DOI:10.1006/viro.2001.0963')

        # Remaining annotations
        self.assertEqual(annotations['sequence_length'], 116)
        self.assertEqual(annotations['sequence_checksum'], '4A29B35FB716523C')
        self.assertEqual(annotations['modified'], '2009-07-07')
        self.assertEqual(annotations['accessions'], ['Q91G55'])
        self.assertEqual(annotations['taxonomy'], ['Viruses', 'dsDNA viruses, no RNA stage', 'Iridoviridae', 'Iridovirus'])
        self.assertEqual(annotations['sequence_mass'], 13673)
        self.assertEqual(annotations['dataset'], 'Swiss-Prot')
        self.assertEqual(annotations['gene_name_ORF'], ['IIV6-043L'])
        self.assertEqual(annotations['version'], 21)
        self.assertEqual(annotations['sequence_modified'], '2001-12-01')
        self.assertEqual(annotations['keywords'], ['Complete proteome', 'Virus reference strain'])
        self.assertEqual(annotations['organism_host'], ['Acheta domesticus', 'House cricket', 'Chilo suppressalis', 'striped riceborer', 'Gryllus bimaculatus', 'Two-spotted cricket', 'Gryllus campestris', 'Spodoptera frugiperda', 'Fall armyworm'])
        self.assertEqual(annotations['created'], '2009-06-16')
        self.assertEqual(annotations['organism_name'], ['Chilo iridescent virus'])
        self.assertEqual(annotations['organism'], 'Invertebrate iridescent virus 6 (IIV-6)')
        self.assertEqual(annotations['recommendedName_fullName'], ['Uncharacterized protein 043L'])
        self.assertEqual(annotations['sequence_version'], 1)
        self.assertEqual(annotations['proteinExistence'], ['Predicted'])

    def compare_txt_xml(self, old, new):
        """Compare a SwissProt plain text record with its UniProt XML version.

        ``old`` is the record parsed from plain text and ``new`` the record
        parsed from XML. The id, name, length and sequence must match, as
        must the number of features and the repr of each feature location.
        Only annotation keys present in both records are compared.

        Side effect: the reference objects of both records are modified in
        place (titles, journal, medline_id and comments are normalised)
        before they are handed to ``compare_reference``.

        Raises TypeError if a shared annotation has different types in the
        two records, and ValueError if its values cannot be reconciled.
        Mismatches found by the ``assertEqual`` calls fail the test in the
        usual unittest way.
        """
        self.assertEqual(old.id, new.id)
        self.assertEqual(old.name, new.name)
        self.assertEqual(len(old), len(new))
        self.assertEqual(str(old.seq), str(new.seq))
        for key in set(old.annotations).intersection(new.annotations):
            old_value = old.annotations[key]
            new_value = new.annotations[key]
            if key == "references":
                self.assertEqual(len(old_value), len(new_value))
                for r1, r2 in zip(old_value, new_value):
                    #Tweak for line breaks in plain text SwissProt
                    r1.title = r1.title.replace("- ", "-")
                    r2.title = r2.title.replace("- ", "-")
                    r1.journal = r1.journal.rstrip(".") #Should parser do this?
                    r1.medline_id = "" #Missing in UniProt XML? TODO - check
                    #Lots of extra comments in UniProt XML
                    r1.comment = ""
                    r2.comment = ""
                    # No journal in the XML reference: blank the plain text
                    # one too so the comparison ignores it.
                    if not r2.journal:
                        r1.journal = ""
                    compare_reference(r1, r2)
            elif old_value == new_value:
                pass
            elif key in ["date"]:
                #TODO - Why is this a list vs str?
                pass
            elif type(old_value) != type(new_value):
                raise TypeError("%s gives %s vs %s" %
                                (key, old_value, new_value))
            elif key in ["organism"]:
                # Equal values were already accepted above, so only the
                # "plain text has extra trailing words" case is left to allow.
                if not old_value.startswith(new_value + " "):
                    raise ValueError(key)
            elif isinstance(old_value, list) \
            and sorted(old_value) == sorted(new_value):
                # Same entries, different order.
                pass
            else:
                raise ValueError("%s gives %s vs %s" %
                                 (key, old_value, new_value))
        self.assertEqual(len(old.features), len(new.features),
                         "Features in %s, %i vs %i" %
                         (old.id, len(old.features), len(new.features)))
        for f1, f2 in zip(old.features, new.features):
            # A separate comparison of location.nofuzzy_start and
            # location.nofuzzy_end was disabled in the original test; the
            # repr of the whole location is compared instead.
            self.assertEqual(repr(f1.location), repr(f2.location),
                             "%s %s vs %s %s" %
                             (f1.location, f1.type, f2.location, f2.type))

    def test_Q13639(self):
        """Compare SwissProt text and uniprot XML versions of Q13639."""
        old = SeqIO.read("SwissProt/Q13639.txt", "swiss")
        new = SeqIO.read("SwissProt/Q13639.xml", "uniprot-xml")
        self.compare_txt_xml(old, new)

    def test_multi_ex(self):
        """Compare SwissProt text and uniprot XML versions of several examples."""
        txt_list = list(SeqIO.parse("SwissProt/multi_ex.txt", "swiss"))
        xml_list = list(SeqIO.parse("SwissProt/multi_ex.xml", "uniprot-xml"))
        fas_list = list(SeqIO.parse("SwissProt/multi_ex.fasta", "fasta"))
        ids = self._read_id_list("SwissProt/multi_ex.list")
        # All four inputs must describe the same number of records before
        # they are walked in lockstep (zip would silently truncate).
        self.assertEqual(len(txt_list), len(ids))
        self.assertEqual(len(txt_list), len(fas_list))
        self.assertEqual(len(txt_list), len(xml_list))
        for txt, xml, fas, expected_id in zip(txt_list, xml_list, fas_list, ids):
            self.assertEqual(txt.id, expected_id)
            self.assertTrue(txt.id in fas.id.split("|"))
            self.assertEqual(str(txt.seq), str(fas.seq))
            self.compare_txt_xml(txt, xml)

    def test_multi_ex_index(self):
        """Index SwissProt text and uniprot XML versions of several examples."""
        txt_list = list(SeqIO.parse("SwissProt/multi_ex.txt", "swiss"))
        xml_list = list(SeqIO.parse("SwissProt/multi_ex.xml", "uniprot-xml"))
        ids = self._read_id_list("SwissProt/multi_ex.list")
        txt_index = SeqIO.index("SwissProt/multi_ex.txt", "swiss")
        xml_index = SeqIO.index("SwissProt/multi_ex.xml", "uniprot-xml")
        self.assertEqual(sorted(txt_index), sorted(ids))
        self.assertEqual(sorted(xml_index), sorted(ids))
        #Check SeqIO.parse() versus SeqIO.index() for plain text "swiss"
        for old in txt_list:
            new = txt_index[old.id]
            compare_record(old, new)
        #Check SeqIO.parse() versus SeqIO.index() for XML "uniprot-xml"
        for old in xml_list:
            new = xml_index[old.id]
            compare_record(old, new)
00171         
if __name__ == "__main__":
    # When executed as a script, run every test in this module through a
    # text runner at verbosity level 2.
    unittest.main(testRunner=unittest.TextTestRunner(verbosity=2))