Back to index

python-biopython  1.60
test_SeqIO_FastaIO.py
Go to the documentation of this file.
00001 # Copyright 2009-2011 by Peter Cock.  All rights reserved.
00002 # Parts copyright 1999 by Jeffrey Chang.  All rights reserved.
00003 # This code is part of the Biopython distribution and governed by its
00004 # license.  Please see the LICENSE file that should have been included
00005 # as part of this package.
00006 
00007 import unittest
00008 from StringIO import StringIO
00009 
00010 from Bio import SeqIO
00011 from Bio.SeqIO.FastaIO import FastaIterator
00012 from Bio.Alphabet import generic_protein, generic_nucleotide, generic_dna
00013 
def title_to_ids(title):
    """Convert a FASTA title line into an (id, name, description) tuple.

    This is just a quick-n-dirty implementation, and is definitely not meant
    to handle every FASTA title line case.

    The text before the first space is the id block, everything after it is
    the description.  For an NCBI style id block such as
    ``gi|5690369|gb|AF158246.1|AF158246`` the fourth field (accession with
    version) is returned as the id and the fifth field (accession without
    version) as the name.  If the block has only four fields the fourth is
    used for both.  Any other id block uses its first "|" separated field
    for both id and name.

    Raises AssertionError (carrying the title) when an id block with four
    or more fields names a database other than gb, emb, dbj or pdb.
    """
    # First split the id information from the description: the first
    # space separated item is the id block, the rest is the description.
    id_info, _, descr = title.partition(" ")

    # Now extract the ids from the id block, e.g.
    # gi|5690369|gb|AF158246.1|AF158246
    id_info_items = id_info.split("|")
    if len(id_info_items) >= 4:
        assert id_info_items[2] in ("gb", "emb", "dbj", "pdb"), title
        identifier = id_info_items[3]  # the id with version info
        if len(id_info_items) > 4:
            name = id_info_items[4]  # the id without version info
        else:
            # Exactly four fields: there is no separate unversioned
            # field, so reuse the identifier rather than index past the end.
            name = identifier
    else:
        # Fallback: not an NCBI style block.
        identifier = name = id_info_items[0]

    return identifier, name, descr
00040 
def read_single_with_titles(filename, alphabet):
    """Parse a FASTA file that must hold exactly one record.

    The file is read with FastaIterator, using title_to_ids to turn each
    title line into the record's id, name and description, and the given
    alphabet for the sequence.

    Returns the single record.  Raises AssertionError if the file does not
    contain exactly one record.
    """
    handle = open(filename)
    try:
        # Exhaust the iterator while the handle is still open; the parser
        # reads from the handle lazily.
        records = list(FastaIterator(handle, alphabet, title_to_ids))
    finally:
        # Close the handle even if the parser raises part way through.
        handle.close()
    assert len(records) == 1, \
        "Expected exactly one record in %s, found %i" % (filename, len(records))
    return records[0]
00051 
def read_title_and_seq(filename):
    """Crude parser that gets the first record from a FASTA file.

    Returns a (title, sequence) tuple.  The title is the first line with
    trailing whitespace and the leading ">" removed.  The sequence is every
    following line, stripped of surrounding whitespace and concatenated,
    up to (not including) the next line starting with ">" or the end of
    the file.

    Raises AssertionError if the first line does not start with ">".
    """
    handle = open(filename)
    try:
        title = handle.readline().rstrip()
        assert title.startswith(">")
        pieces = []
        for line in handle:
            if line.startswith(">"):
                # Start of the second record, we only want the first.
                break
            pieces.append(line.strip())
    finally:
        # Also runs when the assertion above fails, so the handle
        # is never left open.
        handle.close()
    return title[1:], "".join(pieces)
00063 
00064 
class TitleFunctions(unittest.TestCase):
    """Cunning unit test where methods are added at run time.

    simple_check and multi_check are shared workers which take a file name
    and an alphabet; they are not collected as tests themselves.  The
    module level loops after this class attach one test_* method per
    example file, each calling one of these workers.
    """

    def simple_check(self, filename, alphabet):
        """Basic test for parsing single record FASTA files.

        Compares the record from FastaIterator with the title_to_ids
        function, and the record from SeqIO.read with default settings,
        against the title and sequence found by the crude
        read_title_and_seq parser.
        """
        title, seq = read_title_and_seq(filename)  # crude parser
        # First check using Bio.SeqIO.FastaIO directly with title function,
        record = read_single_with_titles(filename, alphabet)
        idn, name, descr = title_to_ids(title)
        self.assertEqual(record.id, idn)
        self.assertEqual(record.name, name)
        self.assertEqual(record.description, descr)
        self.assertEqual(str(record.seq), seq)
        self.assertEqual(record.seq.alphabet, alphabet)
        # Now check using Bio.SeqIO (default settings)
        handle = open(filename)
        try:
            record = SeqIO.read(handle, "fasta", alphabet)
        finally:
            # Do not leave the handle open if parsing fails.
            handle.close()
        # By default the id and name are the first word of the title,
        # and the description is the whole title line.
        first_word = title.split()[0]
        self.assertEqual(record.id, first_word)
        self.assertEqual(record.name, first_word)
        self.assertEqual(record.description, title)
        self.assertEqual(str(record.seq), seq)
        self.assertEqual(record.seq.alphabet, alphabet)
        # Uncomment this for testing the methods are calling the right files:
        # print "{%s done}" % filename,

    def multi_check(self, filename, alphabet):
        """Basic test for parsing multi-record FASTA files.

        Parses the file once with FastaIterator plus title_to_ids and once
        with SeqIO.parse using default settings, then checks the two lists
        of records agree once title_to_ids is applied to the default
        record descriptions.
        """
        handle = open(filename)
        try:
            re_titled = list(FastaIterator(handle, alphabet, title_to_ids))
        finally:
            handle.close()
        handle = open(filename)
        try:
            default = list(SeqIO.parse(handle, "fasta", alphabet))
        finally:
            handle.close()
        self.assertEqual(len(re_titled), len(default))
        for old, new in zip(default, re_titled):
            # The default description is the full title line, so it can
            # be fed through title_to_ids to get the expected values.
            idn, name, descr = title_to_ids(old.description)
            self.assertEqual(new.id, idn)
            self.assertEqual(new.name, name)
            self.assertEqual(new.description, descr)
            self.assertEqual(str(new.seq), str(old.seq))
            self.assertEqual(new.seq.alphabet, old.seq.alphabet)
        # Uncomment this for testing the methods are calling the right files:
        # print "{%s done}" % filename,

    def test_no_name(self):
        """Test FASTA record with no identifier."""
        handle = StringIO(">\nACGT")
        try:
            record = SeqIO.read(handle, "fasta")
        finally:
            handle.close()
        self.assertEqual(str(record.seq), "ACGT")
        self.assertEqual("", record.id)
        self.assertEqual("", record.name)
        self.assertEqual("", record.description)
00112 
00113 
# Example files used to generate the test methods, grouped by the number
# of records in each file and the kind of sequence.  Paths are relative
# to the directory the tests are run from.

# One nucleotide record per file.
single_nucleic_files = [
    'Fasta/lupine.nu',
    'Fasta/elderberry.nu',
    'Fasta/phlox.nu',
    'Fasta/centaurea.nu',
    'Fasta/wisteria.nu',
    'Fasta/sweetpea.nu',
    'Fasta/lavender.nu',
    'Fasta/f001',
]

# Several DNA records per file.
multi_dna_files = [
    'Quality/example.fasta',
]

# One protein record per file.
single_amino_files = [
    'Fasta/aster.pro',
    'Fasta/rosemary.pro',
    'Fasta/rose.pro',
    'Fasta/loveliesbleeding.pro',
]

# Several protein records per file.
multi_amino_files = [
    'Fasta/f002',
    'Fasta/fa01',
]
00125 
# Attach one test_nuc_* method to TitleFunctions per single record
# nucleotide file.
for filename in single_nucleic_files:
    # The method name suffix is the path up to its first ".".
    name = filename.partition(".")[0]

    def funct(path):
        # Going through a function call gives each generated test its own
        # binding of the path, instead of all sharing the loop variable.
        check = lambda case: case.simple_check(path, generic_nucleotide)
        check.__doc__ = "Checking nucleotide file %s" % path
        return check

    setattr(TitleFunctions, "test_nuc_%s" % name, funct(filename))
    del funct
00134 
# Attach one test_mutli_dna_* method to TitleFunctions per multi record
# DNA file (the spelling of the prefix is kept as is).
for filename in multi_dna_files:
    # The method name suffix is the path up to its first ".".
    name = filename.partition(".")[0]

    def funct(path):
        # Going through a function call gives each generated test its own
        # binding of the path, instead of all sharing the loop variable.
        check = lambda case: case.multi_check(path, generic_dna)
        check.__doc__ = "Checking multi DNA file %s" % path
        return check

    setattr(TitleFunctions, "test_mutli_dna_%s" % name, funct(filename))
    del funct
00143 
# Attach one test_pro_* method to TitleFunctions per single record
# protein file.
for filename in single_amino_files:
    # The method name suffix is the path up to its first ".".
    name = filename.split(".")[0]

    def funct(fn):
        # Going through a function call gives each generated test its own
        # binding of fn, instead of all sharing the loop variable.
        # These are protein files, so they are parsed with the protein
        # alphabet rather than a nucleotide one.
        f = lambda x: x.simple_check(fn, generic_protein)
        f.__doc__ = "Checking protein file %s" % fn
        return f

    setattr(TitleFunctions, "test_pro_%s" % name, funct(filename))
    del funct
00152 
# Attach one test_mutli_pro_* method to TitleFunctions per multi record
# protein file (the spelling of the prefix is kept so test names do not
# change).
for filename in multi_amino_files:
    # The method name suffix is the path up to its first "."; these
    # paths have no ".", so the whole path is used.
    name = filename.split(".")[0]

    def funct(fn):
        # Going through a function call gives each generated test its own
        # binding of fn, instead of all sharing the loop variable.
        # These are protein files, so they are parsed with the protein
        # alphabet rather than a DNA one.
        f = lambda x: x.multi_check(fn, generic_protein)
        f.__doc__ = "Checking multi protein file %s" % fn
        return f

    setattr(TitleFunctions, "test_mutli_pro_%s" % name, funct(filename))
    del funct
00161 
if __name__ == "__main__":
    # Run every test in this module with a verbose text runner when the
    # file is executed as a script.
    unittest.main(testRunner=unittest.TextTestRunner(verbosity=2))