Back to index

python-biopython  1.60
test_SeqIO_convert.py
Go to the documentation of this file.
00001 # Copyright 2009 by Peter Cock.  All rights reserved.
00002 # This code is part of the Biopython distribution and governed by its
00003 # license.  Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 
00006 """Unit tests for Bio.SeqIO.convert(...) function."""
00007 import os
00008 import unittest
00009 import warnings
00010 from Bio.Seq import UnknownSeq
00011 from Bio import SeqIO
00012 from Bio.SeqIO import QualityIO
00013 from Bio.SeqIO._convert import _converter as converter_dict
00014 from StringIO import StringIO
00015 from Bio.Alphabet import generic_protein, generic_nucleotide, generic_dna
00016 
00017 #TODO - share this with the QualityIO tests...
00018 def truncation_expected(format):
00019     if format in ["fastq-solexa", "fastq-illumina"]:
00020         return 62
00021     elif format in ["fastq", "fastq-sanger"]:
00022         return 93
00023     else:
00024         return None
00025 
00026 #Top level function as this makes it easier to use for debugging:
00027 def check_convert(in_filename, in_format, out_format, alphabet=None):
00028     records = list(SeqIO.parse(in_filename,in_format, alphabet))
00029     #Write it out...
00030     handle = StringIO()
00031     qual_truncate = truncation_expected(out_format)
00032     if qual_truncate:
00033         warnings.simplefilter('ignore', UserWarning)
00034     SeqIO.write(records, handle, out_format)
00035     if qual_truncate:
00036         warnings.filters.pop()
00037     handle.seek(0)
00038     #Now load it back and check it agrees,
00039     records2 = list(SeqIO.parse(handle, out_format, alphabet))
00040     compare_records(records, records2, qual_truncate)
00041     #Finally, use the convert fuction, and check that agrees:
00042     handle2 = StringIO()
00043     if qual_truncate:
00044         warnings.simplefilter('ignore', UserWarning)
00045     SeqIO.convert(in_filename, in_format, handle2, out_format, alphabet)
00046     if qual_truncate:
00047         warnings.filters.pop()
00048     #We could re-parse this, but it is simpler and stricter:
00049     assert handle.getvalue() == handle2.getvalue()
00050 
00051 def check_convert_fails(in_filename, in_format, out_format, alphabet=None):
00052     qual_truncate = truncation_expected(out_format)
00053     #We want the SAME error message from parse/write as convert!
00054     err1 = None
00055     try:
00056         records = list(SeqIO.parse(in_filename,in_format, alphabet))
00057         handle = StringIO()
00058         if qual_truncate:
00059             warnings.simplefilter('ignore', UserWarning)
00060         SeqIO.write(records, handle, out_format)
00061         if qual_truncate:
00062             warnings.filters.pop()
00063         handle.seek(0)
00064         assert False, "Parse or write should have failed!"
00065     except ValueError, err:
00066         err1 = err
00067     #Now do the conversion...
00068     try:
00069         handle2 = StringIO()
00070         if qual_truncate:
00071             warnings.simplefilter('ignore', UserWarning)
00072         SeqIO.convert(in_filename, in_format, handle2, out_format, alphabet)
00073         if qual_truncate:
00074             warnings.filters.pop()
00075         assert False, "Convert should have failed!"
00076     except ValueError, err2:
00077         assert str(err1) == str(err2), \
00078                "Different failures, parse/write:\n%s\nconvert:\n%s" \
00079                % (err1, err2)
00080     #print err
00081     
00082 #TODO - move this to a shared test module...
00083 def compare_record(old, new, truncate=None):
00084     """Quality aware SeqRecord comparision.
00085 
00086     This will check the mapping between Solexa and PHRED scores.
00087     It knows to ignore UnknownSeq objects for string matching (i.e. QUAL files).
00088     """
00089     if old.id != new.id:
00090         raise ValueError("'%s' vs '%s' " % (old.id, new.id))
00091     if old.description != new.description \
00092     and (old.id+" "+old.description).strip() != new.description \
00093     and new.description != "<unknown description>" \
00094     and new.description != "" : #e.g. tab format
00095         raise ValueError("'%s' vs '%s' " % (old.description, new.description))
00096     if len(old.seq) != len(new.seq):
00097         raise ValueError("%i vs %i" % (len(old.seq), len(new.seq)))
00098     if isinstance(old.seq, UnknownSeq) or isinstance(new.seq, UnknownSeq):
00099         pass
00100     elif str(old.seq) != str(new.seq):
00101         if len(old.seq) < 200:
00102             raise ValueError("'%s' vs '%s'" % (old.seq, new.seq))
00103         else:
00104             raise ValueError("'%s...' vs '%s...'" % (old.seq[:100], new.seq[:100]))
00105     if "phred_quality" in old.letter_annotations \
00106     and "phred_quality" in new.letter_annotations \
00107     and old.letter_annotations["phred_quality"] != new.letter_annotations["phred_quality"]:
00108         if truncate and [min(q,truncate) for q in old.letter_annotations["phred_quality"]] == \
00109                         [min(q,truncate) for q in new.letter_annotations["phred_quality"]]:
00110             pass
00111         else:
00112             raise ValuerError("Mismatch in phred_quality")
00113     if "solexa_quality" in old.letter_annotations \
00114     and "solexa_quality" in new.letter_annotations \
00115     and old.letter_annotations["solexa_quality"] != new.letter_annotations["solexa_quality"]:
00116         if truncate and [min(q,truncate) for q in old.letter_annotations["solexa_quality"]] == \
00117                         [min(q,truncate) for q in new.letter_annotations["solexa_quality"]]:
00118             pass
00119         else:
00120             raise ValueError("Mismatch in phred_quality")
00121     if "phred_quality" in old.letter_annotations \
00122     and "solexa_quality" in new.letter_annotations:
00123         #Mapping from Solexa to PHRED is lossy, but so is PHRED to Solexa.
00124         #Assume "old" is the original, and "new" has been converted.
00125         converted = [round(QualityIO.solexa_quality_from_phred(q)) \
00126                      for q in old.letter_annotations["phred_quality"]]
00127         if truncate:
00128             converted = [min(q,truncate) for q in converted]
00129         if converted != new.letter_annotations["solexa_quality"]:
00130             print
00131             print old.letter_annotations["phred_quality"]
00132             print converted
00133             print new.letter_annotations["solexa_quality"]
00134             raise ValueError("Mismatch in phred_quality vs solexa_quality")
00135     if "solexa_quality" in old.letter_annotations \
00136     and "phred_quality" in new.letter_annotations:
00137         #Mapping from Solexa to PHRED is lossy, but so is PHRED to Solexa.
00138         #Assume "old" is the original, and "new" has been converted.
00139         converted = [round(QualityIO.phred_quality_from_solexa(q)) \
00140                      for q in old.letter_annotations["solexa_quality"]]
00141         if truncate:
00142             converted = [min(q,truncate) for q in converted]
00143         if converted != new.letter_annotations["phred_quality"]:
00144             print old.letter_annotations["solexa_quality"]
00145             print converted
00146             print new.letter_annotations["phred_quality"]
00147             raise ValueError("Mismatch in solexa_quality vs phred_quality")
00148     return True
00149 
00150 def compare_records(old_list, new_list, truncate_qual=None):
00151     """Check two lists of SeqRecords agree, raises a ValueError if mismatch."""
00152     if len(old_list) != len(new_list):
00153         raise ValueError("%i vs %i records" % (len(old_list), len(new_list)))
00154     for old, new in zip(old_list, new_list):
00155         if not compare_record(old,new,truncate_qual):
00156             return False
00157     return True
00158 
00159 class ConvertTests(unittest.TestCase):
00160     """Cunning unit test where methods are added at run time."""
00161     def simple_check(self, filename, in_format, out_format, alphabet):
00162         check_convert(filename, in_format, out_format, alphabet)
00163     def failure_check(self, filename, in_format, out_format, alphabet):
00164         check_convert_fails(filename, in_format, out_format, alphabet)
00165 
00166 tests = [
00167     ("Quality/example.fastq", "fastq", None),
00168     ("Quality/example.fastq", "fastq-sanger", generic_dna),
00169     ("Quality/tricky.fastq", "fastq", generic_nucleotide),
00170     ("Quality/sanger_93.fastq", "fastq-sanger", None),
00171     ("Quality/sanger_faked.fastq", "fastq-sanger", generic_dna),
00172     ("Quality/solexa_faked.fastq", "fastq-solexa", generic_dna),
00173     ("Quality/illumina_faked.fastq", "fastq-illumina", generic_dna),
00174     ("EMBL/U87107.embl", "embl", None),
00175     ("EMBL/TRBG361.embl", "embl", None),
00176     ("GenBank/NC_005816.gb", "gb", None),
00177     ("GenBank/cor6_6.gb", "genbank", None),
00178     ]
00179 for filename, format, alphabet in tests:
00180     for (in_format, out_format) in converter_dict:
00181         if in_format != format : continue
00182         def funct(fn,fmt1, fmt2, alpha):
00183             f = lambda x : x.simple_check(fn, fmt1, fmt2, alpha)
00184             f.__doc__ = "Convert %s from %s to %s" % (fn, fmt1, fmt2)
00185             return f
00186         setattr(ConvertTests, "test_%s_%s_to_%s" \
00187                 % (filename.replace("/","_").replace(".","_"), in_format, out_format),
00188                 funct(filename, in_format, out_format, alphabet))
00189         del funct
00190 
00191 #Fail tests:
00192 tests = [
00193     ("Quality/error_diff_ids.fastq", "fastq", None),
00194     ("Quality/error_long_qual.fastq", "fastq", None),
00195     ("Quality/error_no_qual.fastq", "fastq", None),
00196     ("Quality/error_qual_del.fastq", "fastq", None),
00197     ("Quality/error_qual_escape.fastq", "fastq", None),
00198     ("Quality/error_qual_null.fastq", "fastq", None),
00199     ("Quality/error_qual_space.fastq", "fastq", None),
00200     ("Quality/error_qual_tab.fastq", "fastq", None),
00201     ("Quality/error_qual_unit_sep.fastq", "fastq", None),
00202     ("Quality/error_qual_vtab.fastq", "fastq", None),
00203     ("Quality/error_short_qual.fastq", "fastq", None),
00204     ("Quality/error_spaces.fastq", "fastq", None),
00205     ("Quality/error_tabs.fastq", "fastq", None),
00206     ("Quality/error_trunc_at_plus.fastq", "fastq", None),
00207     ("Quality/error_trunc_at_qual.fastq", "fastq", None),
00208     ("Quality/error_trunc_at_seq.fastq", "fastq", None),
00209     ("Quality/error_trunc_in_title.fastq", "fastq", generic_dna),
00210     ("Quality/error_trunc_in_seq.fastq", "fastq", generic_nucleotide),
00211     ("Quality/error_trunc_in_plus.fastq", "fastq", None),
00212     ("Quality/error_trunc_in_qual.fastq", "fastq", generic_dna),
00213     ("Quality/error_double_seq.fastq", "fastq", generic_dna),
00214     ("Quality/error_double_qual.fastq", "fastq", generic_dna),
00215     ]
00216 for filename, format, alphabet in tests:
00217     for (in_format, out_format) in converter_dict:
00218         if in_format != format : continue
00219         if in_format in ["fastq", "fastq-sanger", "fastq-solexa", "fastq-illumina"] \
00220         and out_format in ["fasta", "tab"] and filename.startswith("Quality/error_qual_"):
00221             #TODO? These conversions don't check for bad characters in the quality,
00222             #and in order to pass this strict test they should.
00223             continue
00224         def funct(fn,fmt1, fmt2, alpha):
00225             f = lambda x : x.failure_check(fn, fmt1, fmt2, alpha)
00226             f.__doc__ = "Convert %s from %s to %s" % (fn, fmt1, fmt2)
00227             return f
00228         setattr(ConvertTests, "test_%s_%s_to_%s" \
00229                 % (filename.replace("/","_").replace(".","_"), in_format, out_format),
00230                 funct(filename, in_format, out_format, alphabet))
00231     del funct
00232 
00233 
00234 if __name__ == "__main__":
00235     runner = unittest.TextTestRunner(verbosity = 2)
00236     unittest.main(testRunner=runner)