Back to index

python-biopython  1.60
test_SeqIO.py
Go to the documentation of this file.
00001 # Copyright 2007-2010 by Peter Cock.  All rights reserved.
00002 # This code is part of the Biopython distribution and governed by its
00003 # license.  Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 
00006 import os
00007 
00008 from Bio import SeqIO
00009 from Bio import AlignIO
00010 from Bio.SeqRecord import SeqRecord
00011 from Bio.Seq import Seq, UnknownSeq
00012 from StringIO import StringIO
00013 from Bio import Alphabet
00014 from Bio.Align import MultipleSeqAlignment
00015 
00016 try:
00017     #This is in Python 2.6+, but we need it on Python 3
00018     from io import BytesIO
00019 except ImportError:
00020     BytesIO = StringIO
00021 
00022 import warnings
00023 def send_warnings_to_stdout(message, category, filename, lineno,
00024                                 file=None, line=None):
00025     #TODO - Have Biopython DataLossWarning?
00026     if category in [UserWarning]:
00027         print "%s - %s" % (category.__name__, message)
00028 warnings.showwarning = send_warnings_to_stdout
00029 
00030 
00031 protein_alphas = [Alphabet.generic_protein]
00032 dna_alphas = [Alphabet.generic_dna]
00033 rna_alphas = [Alphabet.generic_rna]
00034 nucleotide_alphas = [Alphabet.generic_nucleotide,
00035                      Alphabet.Gapped(Alphabet.generic_nucleotide)]
00036 no_alpha_formats = ["fasta","clustal","phylip","phylip-relaxed",
00037                     "phylip-sequential","tab","ig",
00038                     "stockholm","emboss", "fastq","fastq-solexa",
00039                     "fastq-illumina","qual"]
00040 possible_unknown_seq_formats = ["qual", "genbank", "gb", "embl", "imgt"]
00041 
00042 #List of formats including alignment only file formats we can read AND write.
00043 #The list is initially hard coded to preserve the original order of the unit
00044 #test output, with any new formats added since appended to the end.
00045 test_write_read_alignment_formats = ["fasta","clustal","phylip","stockholm",
00046                                      "phylip-relaxed"]
00047 for format in sorted(SeqIO._FormatToWriter):
00048     if format not in test_write_read_alignment_formats:
00049         test_write_read_alignment_formats.append(format)
00050 for format in sorted(AlignIO._FormatToWriter):
00051     if format not in test_write_read_alignment_formats:
00052         test_write_read_alignment_formats.append(format)
00053 test_write_read_alignment_formats.remove("gb") #an alias for genbank
00054 test_write_read_alignment_formats.remove("fastq-sanger") #an alias for fastq
00055 
00056 # test_files is a list of tuples containing:
00057 # - string:  file format
00058 # - boolean: alignment (requires all seqs be same length)
00059 # - string:  relative filename
00060 # - integer: number of sequences
00061 
00062 test_files = [ \
00063     ("sff",    False, 'Roche/E3MFGYR02_random_10_reads.sff', 10),
00064 #Following examples are also used in test_Clustalw.py
00065     ("clustal",True,  'Clustalw/cw02.aln', 2),
00066     ("clustal",True,  'Clustalw/opuntia.aln', 7),
00067     ("clustal",True,  'Clustalw/hedgehog.aln', 5),
00068     ("clustal",True,  'Clustalw/odd_consensus.aln', 2),
00069 #Following nucleic examples are also used in test_SeqIO_FastaIO.py
00070     ("fasta",  False, 'Fasta/lupine.nu', 1),
00071     ("fasta",  False, 'Fasta/elderberry.nu', 1),
00072     ("fasta",  False, 'Fasta/phlox.nu', 1),
00073     ("fasta",  False, 'Fasta/centaurea.nu', 1),
00074     ("fasta",  False, 'Fasta/wisteria.nu', 1),
00075     ("fasta",  False, 'Fasta/sweetpea.nu', 1),
00076     ("fasta",  False, 'Fasta/lavender.nu', 1),
00077 #Following protein examples are also used in test_SeqIO_FastaIO.py
00078     ("fasta",  False, 'Fasta/aster.pro', 1),
00079     ("fasta",  False, 'Fasta/loveliesbleeding.pro', 1),
00080     ("fasta",  False, 'Fasta/rose.pro', 1),
00081     ("fasta",  False, 'Fasta/rosemary.pro', 1),
00082 #Following examples are also used in test_BioSQL_SeqIO.py
00083     ("fasta",  False, 'Fasta/f001', 1), #Protein
00084     ("fasta",  False, 'Fasta/f002', 3), #DNA
00085     #("fasta", False, 'Fasta/f003', 2), #Protein with comments
00086     ("fasta",  False, 'Fasta/fa01', 2), #Protein with gaps
00087 #Following are also used in test_SeqIO_features.py, see also NC_005816.gb
00088     ("fasta",  False, 'GenBank/NC_005816.fna', 1),
00089     ("fasta",  False, 'GenBank/NC_005816.ffn', 10),
00090     ("fasta",  False, 'GenBank/NC_005816.faa', 10),
00091     ("fasta",  False, 'GenBank/NC_000932.faa', 85),
00092     ("tab",  False, 'GenBank/NC_005816.tsv', 10), # FASTA -> Tabbed
00093 #Following examples are also used in test_GFF.py
00094     ("fasta",  False, 'GFF/NC_001802.fna', 1), #upper case
00095     ("fasta",  False, 'GFF/NC_001802lc.fna', 1), #lower case
00096     ("fasta",  True,  'GFF/multi.fna', 3), #Trivial nucleotide alignment
00097 #Following example is also used in test_registry.py
00098     ("fasta",  False, 'Registry/seqs.fasta', 2), #contains blank line
00099 #Following example is also used in test_Nexus.py
00100     ("nexus",  True,  'Nexus/test_Nexus_input.nex', 9),
00101 #Following examples are also used in test_SwissProt.py
00102     ("swiss",  False, 'SwissProt/sp001', 1),
00103     ("swiss",  False, 'SwissProt/sp002', 1),
00104     ("swiss",  False, 'SwissProt/sp003', 1),
00105     ("swiss",  False, 'SwissProt/sp004', 1),
00106     ("swiss",  False, 'SwissProt/sp005', 1),
00107     ("swiss",  False, 'SwissProt/sp006', 1),
00108     ("swiss",  False, 'SwissProt/sp007', 1),
00109     ("swiss",  False, 'SwissProt/sp008', 1),
00110     ("swiss",  False, 'SwissProt/sp009', 1),
00111     ("swiss",  False, 'SwissProt/sp010', 1),
00112     ("swiss",  False, 'SwissProt/sp011', 1),
00113     ("swiss",  False, 'SwissProt/sp012', 1),
00114     ("swiss",  False, 'SwissProt/sp013', 1),
00115     ("swiss",  False, 'SwissProt/sp014', 1),
00116     ("swiss",  False, 'SwissProt/sp015', 1),
00117     ("swiss",  False, 'SwissProt/sp016', 1),
00118 #Following example is also used in test_registry.py
00119     ("swiss",  False, 'Registry/EDD_RAT.dat', 1),
00120 #Following examples are also used in test_Uniprot.py
00121     ("uniprot-xml",  False, 'SwissProt/uni001', 1),
00122     ("uniprot-xml",  False, 'SwissProt/uni002', 3),
00123     ("uniprot-xml",  False, 'SwissProt/Q13639.xml', 1),
00124     ("swiss",    False, 'SwissProt/Q13639.txt', 1),
00125 #Following examples are also used in test_GenBank.py
00126     ("genbank",False, 'GenBank/noref.gb', 1),
00127     ("genbank",False, 'GenBank/cor6_6.gb', 6),
00128     ("genbank",False, 'GenBank/iro.gb', 1),
00129     ("genbank",False, 'GenBank/pri1.gb', 1),
00130     ("genbank",False, 'GenBank/arab1.gb', 1),
00131     ("genbank",False, 'GenBank/protein_refseq.gb', 1), #Old version
00132     ("genbank",False, 'GenBank/protein_refseq2.gb', 1), #Revised version
00133     ("genbank",False, 'GenBank/extra_keywords.gb', 1),
00134     ("genbank",False, 'GenBank/one_of.gb', 1),
00135     ("genbank",False, 'GenBank/NT_019265.gb', 1), #contig, no sequence
00136     ("genbank",False, 'GenBank/origin_line.gb', 1),
00137     ("genbank",False, 'GenBank/blank_seq.gb', 1),
00138     ("genbank",False, 'GenBank/dbsource_wrap.gb', 1),
00139     ("genbank",False, 'GenBank/NC_005816.gb', 1), #See also AE017046.embl
00140     ("genbank",False, 'GenBank/NC_000932.gb', 1),
00141     ("genbank",False, 'GenBank/pBAD30.gb', 1), #Odd LOCUS line from Vector NTI
00142 # The next example is a truncated copy of gbvrl1.seq from
00143 # ftp://ftp.ncbi.nih.gov/genbank/gbvrl1.seq.gz
00144 # This includes an NCBI header, and the first three records:
00145     ("genbank",False, 'GenBank/gbvrl1_start.seq', 3),
00146 #Following files are also used in test_GFF.py
00147     ("genbank",False, 'GFF/NC_001422.gbk', 1),
00148 #Following files are currently only used here or in test_SeqIO_index.py:
00149     ("embl",   False, 'EMBL/epo_prt_selection.embl', 9), #proteins
00150     ("embl",   False, 'EMBL/TRBG361.embl', 1),
00151     ("embl",   False, 'EMBL/DD231055_edited.embl', 1),
00152     ("embl",   False, 'EMBL/SC10H5.embl', 1), # Pre 2006 style ID line
00153     ("embl",   False, 'EMBL/U87107.embl', 1), # Old ID line with SV line
00154     ("embl",   False, 'EMBL/AAA03323.embl', 1), # 2008, PA line but no AC
00155     ("embl",   False, 'EMBL/AE017046.embl', 1), #See also NC_005816.gb
00156     ("embl",   False, 'EMBL/Human_contigs.embl', 2), #contigs, no sequences
00157     ("embl",   False, 'EMBL/location_wrap.embl', 1), #wrapped locations and unspecified type
00158     ("embl",   False, 'EMBL/A04195.imgt', 1), # features over indented for EMBL
00159     ("imgt",   False, 'EMBL/A04195.imgt', 1), # features over indented for EMBL
00160     ("stockholm", True,  'Stockholm/simple.sth', 2),
00161     ("stockholm", True,  'Stockholm/funny.sth', 6),
00162 #Following PHYLIP files are currently only used here and in test_AlignIO.py,
00163 #and are mostly from Joseph Felsenstein's PHYLIP v3.6 documentation:
00164     ("phylip", True,  'Phylip/reference_dna.phy', 6),
00165     ("phylip", True,  'Phylip/reference_dna2.phy', 6),
00166     ("phylip", True,  'Phylip/hennigian.phy', 10),
00167     ("phylip", True,  'Phylip/horses.phy', 10),
00168     ("phylip", True,  'Phylip/random.phy', 10),
00169     ("phylip", True,  'Phylip/interlaced.phy', 3),
00170     ("phylip", True,  'Phylip/interlaced2.phy', 4),
00171 #Following are EMBOSS simple or pairs format alignments
00172     ("emboss", True,  'Emboss/alignret.txt', 4),
00173     ("emboss", False, 'Emboss/needle.txt', 10),
00174     ("emboss", True,  'Emboss/water.txt', 2),
00175 #Following PHD (PHRAP) sequencing files are also used in test_Phd.py
00176     ("phd", False, 'Phd/phd1', 3),
00177     ("phd", False, 'Phd/phd2', 1),
00178     ("phd", False, 'Phd/phd_solexa', 2),
00179     ("phd", False, 'Phd/phd_454', 1),
00180 #Following ACE assembly files are also used in test_Ace.py
00181     ("ace", False, 'Ace/contig1.ace', 2),
00182     ("ace", False, 'Ace/consed_sample.ace', 1),
00183     ("ace", False, 'Ace/seq.cap.ace', 1),
00184 #Following IntelliGenetics / MASE files are also used in test_intelligenetics.py
00185     ("ig",  False, 'IntelliGenetics/TAT_mase_nuc.txt', 17),
00186     ("ig",  True,  'IntelliGenetics/VIF_mase-pro.txt', 16),
00187     #This next file is a MASE alignment but sequence O_ANT70 is shorter than
00188     #the others (so as an alignment will fail).  Perhaps MASE doesn't
00189     #write trailing gaps?
00190     ("ig",  False,  'IntelliGenetics/vpu_nucaligned.txt', 9),
00191 #Following NBRD-PIR files are used in test_nbrf.py
00192     ("pir", False, 'NBRF/B_nuc.pir', 444),
00193     ("pir", False, 'NBRF/Cw_prot.pir', 111),
00194     ("pir", False, 'NBRF/DMA_nuc.pir', 4),
00195     ("pir", False, 'NBRF/DMB_prot.pir', 6),
00196     ("pir", True,  'NBRF/clustalw.pir', 2),
00197 #Following quality files are also used in the Bio.SeqIO.QualityIO doctests:
00198     ("fasta", True, 'Quality/example.fasta', 3),
00199     ("qual",  False,'Quality/example.qual',  3),
00200     ("fastq", True, 'Quality/example.fastq', 3),
00201     ("fastq", True, 'Quality/tricky.fastq', 4),
00202     ("fastq", False,'Quality/sanger_faked.fastq', 1),
00203     ("fastq", False,'Quality/sanger_93.fastq', 1),
00204     ("fastq-illumina", False,'Quality/illumina_faked.fastq', 1),
00205     ("fastq-solexa", False, 'Quality/solexa_faked.fastq', 1),
00206     ("fastq-solexa", True, 'Quality/solexa_example.fastq', 5),
00207 #Following examples are also used in test_SeqXML.py
00208     ("seqxml", False, 'SeqXML/dna_example.xml', 4),
00209     ("seqxml", False, 'SeqXML/rna_example.xml', 5),
00210     ("seqxml", False, 'SeqXML/protein_example.xml', 5),
00211 #Following examples are also used in test_SeqIO_AbiIO.py
00212     ("abi", False, 'Abi/310.ab1', 1),
00213     ("abi", False, 'Abi/3100.ab1', 1),
00214     ("abi", False, 'Abi/3730.ab1', 1),
00215     ]
00216 
00217 class ForwardOnlyHandle(object):
00218     """Mimic a network handle without seek and tell methods etc."""
00219     def __init__(self, handle):
00220         self._handle = handle
00221 
00222     def __iter__(self):
00223         return iter(self._handle)
00224 
00225     def read(self, length=None):
00226         if length is None:
00227             return self._handle.read()
00228         else:
00229             return self._handle.read(length)
00230 
00231     def readline(self):
00232         return self._handle.readline()
00233 
00234     def close(self):
00235         return self._handle.close()
00236 
00237 def compare_record(record_one, record_two):
00238     """This is meant to be a strict comparison for exact agreement..."""
00239     assert isinstance(record_one, SeqRecord)
00240     assert isinstance(record_two, SeqRecord)
00241     assert record_one.seq is not None
00242     assert record_two.seq is not None
00243     if record_one.id != record_two.id:
00244         return False
00245     if record_one.name != record_two.name:
00246         return False
00247     if record_one.description != record_two.description:
00248         return False
00249     if len(record_one) != len(record_two):
00250         return False
00251     if isinstance(record_one.seq, UnknownSeq) \
00252     and isinstance(record_two.seq, UnknownSeq):
00253         #Jython didn't like us comparing the string of very long UnknownSeq
00254         #object (out of heap memory error)
00255         if record_one.seq._character != record_two.seq._character:
00256             return False
00257     elif record_one.seq.tostring() != record_two.seq.tostring():
00258         return False
00259     #TODO - check features and annotation (see code for BioSQL tests)
00260     for key in set(record_one.letter_annotations).intersection( \
00261                    record_two.letter_annotations):
00262         if record_one.letter_annotations[key] != \
00263            record_two.letter_annotations[key]:
00264             return False
00265     return True
00266 
00267 def record_summary(record, indent=" "):
00268     """Returns a concise summary of a SeqRecord object as a string"""
00269     if record.id == record.name:
00270         answer = "%sID and Name='%s',\n%sSeq='" % (indent, record.id, indent)
00271     else:
00272         answer = "%sID = '%s', Name='%s',\n%sSeq='" % (indent, record.id, record.name, indent)
00273     if record.seq is None:
00274         answer += "None"
00275     else:
00276         if len(record.seq) > 50:
00277             answer += record.seq[:40].tostring() + "..." + record.seq[-7:].tostring()
00278         else:
00279             answer += record.seq.tostring()
00280         answer += "', length=%i" % (len(record.seq))
00281     return answer
00282 
00283 def col_summary(col_text):
00284     if len(col_text) < 65:
00285         return col_text
00286     else:
00287         return col_text[:60] + "..." + col_text[-5:]
00288 
00289 def alignment_summary(alignment, index=" "):
00290     """Returns a concise summary of an Alignment object as a string"""
00291     answer = []
00292     alignment_len = alignment.get_alignment_length()
00293     rec_count = len(alignment)
00294     for i in range(min(5,alignment_len)):
00295         answer.append(index + col_summary(alignment.get_column(i)) \
00296                             + " alignment column %i" % i)
00297     if alignment_len > 5:
00298         i = alignment_len - 1
00299         answer.append(index + col_summary("|" * rec_count) \
00300                             + " ...")
00301         answer.append(index + col_summary(alignment.get_column(i)) \
00302                             + " alignment column %i" % i)
00303     return "\n".join(answer)
00304 
00305 
00306 def check_simple_write_read(records, indent=" "):
00307     #print indent+"Checking we can write and then read back these records"
00308     for format in test_write_read_alignment_formats:
00309         if format not in possible_unknown_seq_formats \
00310         and isinstance(records[0].seq, UnknownSeq) \
00311         and len(records[0].seq) > 100:
00312            #Skipping for speed.  Some of the unknown sequences are
00313            #rather long, and it seems a bit pointless to record them.
00314            continue
00315         print indent+"Checking can write/read as '%s' format" % format
00316 
00317         #Going to write to a handle...
00318         if format in SeqIO._BinaryFormats:
00319             handle = BytesIO()
00320         else:
00321             handle = StringIO()
00322 
00323         try:
00324             c = SeqIO.write(sequences=records, handle=handle, format=format)
00325             assert c == len(records)
00326         except (TypeError, ValueError), e:
00327             #This is often expected to happen, for example when we try and
00328             #write sequences of different lengths to an alignment file.
00329             if "len()" in str(e):
00330                 #Python 2.4.3,
00331                 #>>> len(None)
00332                 #...
00333                 #TypeError: len() of unsized object
00334                 #
00335                 #Python 2.5.2,
00336                 #>>> len(None)
00337                 #...
00338                 #TypeError: object of type 'NoneType' has no len()
00339                 print "Failed: Probably len() of None"
00340             else:
00341                 print indent+"Failed: %s" % str(e)
00342             if records[0].seq.alphabet.letters is not None:
00343                 assert format != t_format, \
00344                        "Should be able to re-write in the original format!"
00345             #Carry on to the next format:
00346             continue
00347 
00348         handle.flush()
00349         handle.seek(0)
00350         #Now ready to read back from the handle...
00351         try:
00352             records2 = list(SeqIO.parse(handle=handle, format=format))
00353         except ValueError, e:
00354             #This is BAD.  We can't read our own output.
00355             #I want to see the output when called from the test harness,
00356             #run_tests.py (which can be funny about new lines on Windows)
00357             handle.seek(0)
00358             raise ValueError("%s\n\n%s\n\n%s" \
00359                               % (str(e), repr(handle.read()), repr(records)))
00360 
00361         assert len(records2) == t_count
00362         for r1, r2 in zip(records, records2):
00363             #Check the bare minimum (ID and sequence) as
00364             #many formats can't store more than that.
00365             assert len(r1) == len(r2)
00366 
00367             #Check the sequence
00368             if format in ["gb", "genbank", "embl", "imgt"]:
00369                 #The GenBank/EMBL parsers will convert to upper case.
00370                 if isinstance(r1.seq, UnknownSeq) \
00371                 and isinstance(r2.seq, UnknownSeq):
00372                     #Jython didn't like us comparing the string of very long
00373                     #UnknownSeq object (out of heap memory error)
00374                     assert r1.seq._character.upper() == r2.seq._character
00375                 else:
00376                     assert r1.seq.tostring().upper() == r2.seq.tostring()
00377             elif format == "qual":
00378                 assert isinstance(r2.seq, UnknownSeq)
00379                 assert len(r2) == len(r1)
00380             else:
00381                 assert r1.seq.tostring() == r2.seq.tostring()
00382             #Beware of different quirks and limitations in the
00383             #valid character sets and the identifier lengths!
00384             if format in ["phylip", "phylip-sequential"]:
00385                 assert r1.id.replace("[","").replace("]","")[:10] == r2.id, \
00386                        "'%s' vs '%s'" % (r1.id, r2.id)
00387             elif format=="phylip-relaxed":
00388                 assert r1.id.replace(" ", "").replace(':', '|') == r2.id, \
00389                         "'%s' vs '%s'" % (r1.id, r2.id)
00390             elif format=="clustal":
00391                 assert r1.id.replace(" ","_")[:30] == r2.id, \
00392                        "'%s' vs '%s'" % (r1.id, r2.id)
00393             elif format=="stockholm":
00394                 assert r1.id.replace(" ","_") == r2.id, \
00395                        "'%s' vs '%s'" % (r1.id, r2.id)
00396             elif format=="fasta":
00397                 assert r1.id.split()[0] == r2.id
00398             else:
00399                 assert r1.id == r2.id, \
00400                        "'%s' vs '%s'" % (r1.id, r2.id)
00401 
00402         if len(records)>1:
00403             #Try writing just one record (passing a SeqRecord, not a list)
00404             if format in SeqIO._BinaryFormats:
00405                 handle = BytesIO()
00406             else:
00407                 handle = StringIO()
00408             SeqIO.write(records[0], handle, format)
00409             assert handle.getvalue() == records[0].format(format)
00410 
00411 
00412 #Check parsers can cope with an empty file
00413 for t_format in SeqIO._FormatToIterator:
00414     if t_format in SeqIO._BinaryFormats or t_format=="uniprot-xml":
00415         #Not allowed empty SFF files.
00416         continue
00417     handle = StringIO()
00418     records = list(SeqIO.parse(handle, t_format))
00419     assert len(records) == 0
00420 
00421 for (t_format, t_alignment, t_filename, t_count) in test_files:
00422     if t_format in SeqIO._BinaryFormats:
00423         mode = "rb"
00424     else:
00425         mode = "r"
00426 
00427     print "Testing reading %s format file %s" % (t_format, t_filename)
00428     assert os.path.isfile(t_filename), t_filename
00429 
00430     #Try as an iterator using handle
00431     h = open(t_filename,mode)
00432     records  = list(SeqIO.parse(handle=h, format=t_format))
00433     h.close()
00434     assert len(records)  == t_count, \
00435          "Found %i records but expected %i" % (len(records), t_count)
00436 
00437     #Try using the iterator with a for loop, and a filename not handle
00438     records2 = []
00439     for record in SeqIO.parse(t_filename, format=t_format):
00440         records2.append(record)
00441     assert len(records2) == t_count
00442 
00443     #Try using the iterator with the next() method
00444     records3 = []
00445     h = open(t_filename,mode)
00446     seq_iterator = SeqIO.parse(handle=h, format=t_format)
00447     while True:
00448         try:
00449             record = seq_iterator.next()
00450         except StopIteration:
00451             break
00452         assert record is not None, "Should raise StopIteration not return None"
00453         records3.append(record)
00454     h.close()
00455 
00456     #Try a mixture of next() and list (a torture test!)
00457     h = open(t_filename,mode)
00458     seq_iterator = SeqIO.parse(handle=h, format=t_format)
00459     try:
00460         record = seq_iterator.next()
00461     except StopIteration:
00462         record = None
00463     if record is not None:
00464         records4 = [record]
00465         records4.extend(list(seq_iterator))
00466     else:
00467         records4 = []
00468     assert len(records4) == t_count
00469     h.close()
00470 
00471     #Try a mixture of next() and for loop (a torture test!)
00472     #with a forward-only-handle
00473     if t_format == "abi":
00474         #Temp hack
00475         h = open(t_filename, mode)
00476     else:
00477         h = ForwardOnlyHandle(open(t_filename, mode))
00478     seq_iterator = SeqIO.parse(h, format=t_format)
00479     try:
00480         record = seq_iterator.next()
00481     except StopIteration:
00482         record = None
00483     if record is not None:
00484         records5 = [record]
00485         for record in seq_iterator:
00486             records5.append(record)
00487     else:
00488         records5 = []
00489     assert len(records5) == t_count
00490     h.close()
00491 
00492     for i in range(t_count):
00493         record = records[i]
00494 
00495         #Check returned expected object type
00496         assert isinstance(record, SeqRecord)
00497         if t_format in possible_unknown_seq_formats:
00498             assert isinstance(record.seq, Seq) or \
00499                    isinstance(record.seq, UnknownSeq)
00500         else:
00501             assert isinstance(record.seq, Seq)
00502         assert isinstance(record.id, basestring)
00503         assert isinstance(record.name, basestring)
00504         assert isinstance(record.description, basestring)
00505         assert record.id != ""
00506 
00507         if "accessions" in record.annotations:
00508             accs = record.annotations["accessions"]
00509             #Check for blanks, or entries with leading/trailing spaces
00510             for acc in accs:
00511                 assert acc and acc == acc.strip(), \
00512                     "Bad accession in annotations: %s" % repr(acc)
00513             assert len(set(accs)) == len(accs), \
00514                    "Repeated accession in annotations: %s" % repr(accs)
00515         for ref in record.dbxrefs:
00516             assert ref and ref == ref.strip(), \
00517                 "Bad cross reference in dbxrefs: %s" % repr(ref)
00518         assert len(record.dbxrefs) == len(record.dbxrefs), \
00519                "Repeated cross reference in dbxrefs: %s" % repr(record.dbxrefs)
00520 
00521 
00522         #Check the lists obtained by the different methods agree
00523         assert compare_record(record, records2[i])
00524         assert compare_record(record, records3[i])
00525         assert compare_record(record, records4[i])
00526         assert compare_record(record, records5[i])
00527 
00528         if i < 3:
00529             print record_summary(record)
00530     # Only printed the only first three records: 0,1,2
00531     if t_count > 4:
00532         print " ..."
00533     if t_count > 3:
00534         print record_summary(records[-1])
00535 
00536     # Check Bio.SeqIO.read(...)
00537     if t_count == 1:
00538         record = SeqIO.read(t_filename, format=t_format)
00539         assert isinstance(record, SeqRecord)
00540     else:
00541         try:
00542             record = SeqIO.read(t_filename, t_format)
00543             assert False, "Bio.SeqIO.read(...) should have failed"
00544         except ValueError:
00545             #Expected to fail
00546             pass
00547 
00548     # Check alphabets
00549     for record in records:
00550         base_alpha = Alphabet._get_base_alphabet(record.seq.alphabet)
00551         if isinstance(base_alpha, Alphabet.SingleLetterAlphabet):
00552             if t_format in no_alpha_formats:
00553                 assert base_alpha == Alphabet.single_letter_alphabet # Too harsh?
00554         else:
00555             base_alpha = None
00556     if base_alpha is None:
00557         good = []
00558         bad =[]
00559         given_alpha=None
00560     elif isinstance(base_alpha, Alphabet.ProteinAlphabet):
00561         good = protein_alphas
00562         bad = dna_alphas + rna_alphas + nucleotide_alphas
00563     elif isinstance(base_alpha, Alphabet.RNAAlphabet):
00564         good = nucleotide_alphas + rna_alphas
00565         bad = protein_alphas + dna_alphas
00566     elif isinstance(base_alpha, Alphabet.DNAAlphabet):
00567         good = nucleotide_alphas + dna_alphas
00568         bad = protein_alphas + rna_alphas
00569     elif isinstance(base_alpha, Alphabet.NucleotideAlphabet):
00570         good = nucleotide_alphas
00571         bad = protein_alphas
00572     else:
00573         assert t_format in no_alpha_formats, "Got %s from %s file" \
00574                % (repr(base_alpha), t_format)
00575         good = protein_alphas + dna_alphas + rna_alphas + nucleotide_alphas
00576         bad = []
00577     for given_alpha in good:
00578         #These should all work...
00579         given_base = Alphabet._get_base_alphabet(given_alpha)
00580         for record in SeqIO.parse(t_filename,t_format,given_alpha):
00581             base_alpha = Alphabet._get_base_alphabet(record.seq.alphabet)
00582             assert isinstance(base_alpha, given_base.__class__)
00583             assert base_alpha == given_base
00584         if t_count == 1:
00585             h = open(t_filename,mode)
00586             record = SeqIO.read(h,t_format,given_alpha)
00587             h.close()
00588             assert isinstance(base_alpha, given_base.__class__)
00589             assert base_alpha == given_base
00590     for given_alpha in bad:
00591         #These should all fail...
00592         h = open(t_filename,mode)
00593         try:
00594             print SeqIO.parse(h,t_format,given_alpha).next()
00595             h.close()
00596             assert False, "Forcing wrong alphabet, %s, should fail (%s)" \
00597                    % (repr(given_alpha), t_filename)
00598         except ValueError:
00599             #Good - should fail
00600             pass
00601         h.close()
00602     del good, bad, given_alpha, base_alpha
00603 
00604     if t_alignment:
00605         print "Testing reading %s format file %s as an alignment" \
00606               % (t_format, t_filename)
00607 
00608         alignment = MultipleSeqAlignment(SeqIO.parse( \
00609                     handle=t_filename, format=t_format))
00610         assert len(alignment) == t_count
00611 
00612         alignment_len = alignment.get_alignment_length()
00613 
00614         #Check the record order agrees, and double check the
00615         #sequence lengths all agree too.
00616         for i in range(t_count):
00617             assert compare_record(records[i], alignment[i])
00618             assert len(records[i].seq) == alignment_len
00619 
00620         print alignment_summary(alignment)
00621 
00622     #Some alignment file formats have magic characters which mean
00623     #use the letter in this position in the first sequence.
00624     #They should all have been converted by the parser, but if
00625     #not reversing the record order might expose an error.  Maybe.
00626     records.reverse()
00627     check_simple_write_read(records)
00628 
00629 print "Finished tested reading files"