Back to index

python-biopython  1.60
test_AlignIO.py
Go to the documentation of this file.
00001 # Copyright 2008 by Peter Cock.  All rights reserved.
00002 # This code is part of the Biopython distribution and governed by its
00003 # license.  Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 
00006 import os
00007 from StringIO import StringIO
00008 from Bio import SeqIO
00009 from Bio import AlignIO
00010 from Bio.Align.Generic import Alignment
00011 from Bio.Align import AlignInfo, MultipleSeqAlignment
00012 from Bio.Seq import Seq
00013 from Bio.SeqRecord import SeqRecord
00014 
00015 test_write_read_alignment_formats = sorted(AlignIO._FormatToWriter)
00016 test_write_read_align_with_seq_count = test_write_read_alignment_formats \
00017                                      + ["fasta", "tab"]
00018 
00019 # test_files is a list of tuples containing:
00020 # - string:  file format
00021 # - integer: number of sequences per alignment
00022 # - integer: number of alignments
00023 # - string:  relative filename
00024 #
00025 # Most of the input files are also used by test_SeqIO.py,
00026 # and by other additional tests as noted below.
00027 test_files = [
00028 #Following examples are also used in test_Clustalw.py
00029     ("clustal", 2, 1, 'Clustalw/cw02.aln'),
00030     ("clustal", 7, 1, 'Clustalw/opuntia.aln'),
00031     ("clustal", 5, 1, 'Clustalw/hedgehog.aln'),
00032     ("clustal", 2, 1, 'Clustalw/odd_consensus.aln'),
00033     ("clustal",20, 1, 'Clustalw/protein.aln'), #Used in the tutorial
00034     ("clustal",20, 1, 'Clustalw/promals3d.aln'), #Nonstandard header
00035 #Following examples are also used in test_GFF.py
00036     ("fasta", 3, 1, 'GFF/multi.fna'), #Trivial nucleotide alignment
00037 #Following example is also used in test_Nexus.py
00038     ("nexus", 9, 1, 'Nexus/test_Nexus_input.nex'),
00039     ("nexus", 2, 1, 'Nexus/codonposset.nex'),
00040     ("stockholm", 2, 1, 'Stockholm/simple.sth'),
00041     ("stockholm", 6, 1, 'Stockholm/funny.sth'),
00042     ("phylip", 6, 1, 'Phylip/reference_dna.phy'),
00043     ("phylip", 6, 1, 'Phylip/reference_dna2.phy'),
00044     ("phylip",10, 1, 'Phylip/hennigian.phy'),
00045     ("phylip",10, 1, 'Phylip/horses.phy'),
00046     ("phylip",10, 1, 'Phylip/random.phy'),
00047     ("phylip", 3, 1, 'Phylip/interlaced.phy'),
00048     ("phylip", 4, 1, 'Phylip/interlaced2.phy'),
00049     ("phylip-relaxed", 12, 1, 'ExtendedPhylip/primates.phyx'),
00050     ("phylip-sequential", 3, 1, 'Phylip/sequential.phy'),
00051     ("phylip-sequential", 4, 1, 'Phylip/sequential2.phy'),
00052     ("emboss", 4, 1, 'Emboss/alignret.txt'),
00053     ("emboss", 2, 5, 'Emboss/needle.txt'),
00054     ("emboss", 2, 1, 'Emboss/needle_asis.txt'),
00055     ("emboss", 2, 1, 'Emboss/water.txt'),
00056     ("emboss", 2, 1, 'Emboss/water2.txt'),
00057     ("emboss", 2, 1, 'Emboss/matcher_simple.txt'),
00058     ("emboss", 2, 5, 'Emboss/matcher_pair.txt'),
00059     ("fasta-m10", 2, 4, 'Fasta/output001.m10'),
00060     ("fasta-m10", 2, 6, 'Fasta/output002.m10'),
00061     ("fasta-m10", 2, 3, 'Fasta/output003.m10'),
00062     ("fasta-m10", 2, 1, 'Fasta/output004.m10'),
00063     ("fasta-m10", 2, 1, 'Fasta/output005.m10'),
00064     ("fasta-m10", 2, 1, 'Fasta/output006.m10'),
00065     ("fasta-m10", 2, 9, 'Fasta/output007.m10'),
00066     ("fasta-m10", 2, 12,'Fasta/output008.m10'),
00067     ("ig", 16, 1, 'IntelliGenetics/VIF_mase-pro.txt'),
00068     ("pir", 2, 1,  'NBRF/clustalw.pir'),
00069     ]
00070 
00071 def str_summary(text, max_len=40):
00072     if len(text) <= max_len:
00073         return text
00074     else:
00075         return text[:max_len-4] + "..." + text[-3:]
00076 
00077 def alignment_summary(alignment, index="  ", vertical_threshold=5):
00078     """Returns a concise summary of an Alignment object as a string."""
00079     answer = []
00080     alignment_len = alignment.get_alignment_length()
00081     rec_count = len(alignment)
00082     if rec_count < vertical_threshold:
00083         #Show each sequence row horizontally
00084         for record in alignment:
00085             answer.append("%s%s %s" \
00086             % (index,str_summary(record.seq.tostring()),record.id))
00087     else:
00088         #Show each sequence row vertically
00089         for i in range(min(5,alignment_len)):
00090             answer.append(index + str_summary(alignment[:,i]) \
00091                                 + " alignment column %i" % i)
00092         if alignment_len > 5:
00093             i = alignment_len - 1
00094             answer.append(index + str_summary("|" * rec_count) \
00095                                 + " ...")
00096             answer.append(index + str_summary(alignment[:,i]) \
00097                                 + " alignment column %i" % i)
00098     return "\n".join(answer)
00099 
00100 
00101 def check_simple_write_read(alignments, indent=" "):
00102     #print indent+"Checking we can write and then read back these alignments"
00103     for format in test_write_read_align_with_seq_count:
00104         records_per_alignment = len(alignments[0])
00105         for a in alignments:
00106             if records_per_alignment != len(a):
00107                 records_per_alignment = None
00108         #Can we expect this format to work?
00109         if not records_per_alignment \
00110         and format not in test_write_read_alignment_formats:
00111             continue
00112 
00113         print indent+"Checking can write/read as '%s' format" % format
00114 
00115         #Going to write to a handle...
00116         handle = StringIO()
00117 
00118         try:
00119             c = AlignIO.write(alignments, handle=handle, format=format)
00120             assert c == len(alignments)
00121         except ValueError, e:
00122             #This is often expected to happen, for example when we try and
00123             #write sequences of different lengths to an alignment file.
00124             print indent+"Failed: %s" % str(e)
00125             #Carry on to the next format:
00126             continue
00127 
00128         #First, try with the seq_count
00129         if records_per_alignment:
00130             handle.flush()
00131             handle.seek(0)
00132             try:
00133                 alignments2 = list(AlignIO.parse(handle=handle, format=format, \
00134                                                  seq_count=records_per_alignment))
00135             except ValueError, e:
00136                 #This is BAD.  We can't read our own output.
00137                 #I want to see the output when called from the test harness,
00138                 #run_tests.py (which can be funny about new lines on Windows)
00139                 handle.seek(0)
00140                 raise ValueError("%s\n\n%s\n\n%s" \
00141                                   % (str(e), repr(handle.read()), repr(alignments2)))
00142             simple_alignment_comparison(alignments, alignments2, format)
00143 
00144         if format in test_write_read_alignment_formats:
00145             #Don't need the seq_count
00146             handle.flush()
00147             handle.seek(0)
00148             try:
00149                 alignments2 = list(AlignIO.parse(handle=handle, format=format))
00150             except ValueError, e:
00151                 #This is BAD.  We can't read our own output.
00152                 #I want to see the output when called from the test harness,
00153                 #run_tests.py (which can be funny about new lines on Windows)
00154                 handle.seek(0)
00155                 raise ValueError("%s\n\n%s\n\n%s" \
00156                                   % (str(e), repr(handle.read()), repr(alignments2)))
00157             simple_alignment_comparison(alignments, alignments2, format)
00158 
00159         if len(alignments)>1:
00160             #Try writing just one Alignment (not a list)
00161             handle = StringIO()
00162             SeqIO.write(alignments[0], handle, format)
00163             assert handle.getvalue() == alignments[0].format(format)
00164 
00165 def simple_alignment_comparison(alignments, alignments2, format):
00166     assert len(alignments) == len(alignments2)
00167     for a1, a2 in zip(alignments, alignments2):
00168         assert a1.get_alignment_length() == a2.get_alignment_length()
00169         assert len(a1) == len(a2)
00170         for r1, r2 in zip(a1,a2):
00171             #Check the bare minimum (ID and sequence) as
00172             #many formats can't store more than that.
00173 
00174             #Check the sequence
00175             assert r1.seq.tostring() == r2.seq.tostring()
00176 
00177             #Beware of different quirks and limitations in the
00178             #valid character sets and the identifier lengths!
00179             if format in ["phylip", "phylip-sequential"]:
00180                 assert r1.id.replace("[","").replace("]","")[:10] == r2.id, \
00181                        "'%s' vs '%s'" % (r1.id, r2.id)
00182             elif format=="phylip-relaxed":
00183                 assert r1.id.replace(" ", "").replace(':', '|') == r2.id, \
00184                         "'%s' vs '%s'" % (r1.id, r2.id)
00185             elif format=="clustal":
00186                 assert r1.id.replace(" ","_")[:30] == r2.id, \
00187                        "'%s' vs '%s'" % (r1.id, r2.id)
00188             elif format=="stockholm":
00189                 assert r1.id.replace(" ","_") == r2.id, \
00190                        "'%s' vs '%s'" % (r1.id, r2.id)
00191             elif format=="fasta":
00192                 assert r1.id.split()[0] == r2.id
00193             else:
00194                 assert r1.id == r2.id, \
00195                        "'%s' vs '%s'" % (r1.id, r2.id)
00196     return True
00197 
00198 #Check Phylip files reject duplicate identifiers.
00199 def check_phylip_reject_duplicate():
00200     """
00201     Ensure that attempting to write sequences with duplicate IDs after
00202     truncation fails for Phylip format.
00203     """
00204     handle = StringIO()
00205     sequences = [SeqRecord(Seq('AAAA'), id='longsequencename1'),
00206                  SeqRecord(Seq('AAAA'), id='longsequencename2'),
00207                  SeqRecord(Seq('AAAA'), id='other_sequence'),]
00208     alignment = MultipleSeqAlignment(sequences)
00209     try:
00210         # This should raise a ValueError
00211         AlignIO.write(alignment, handle, 'phylip')
00212         assert False, "Duplicate IDs after truncation are not allowed."
00213     except ValueError, e:
00214         # Expected - check the error
00215         assert "Repeated name 'longsequen'" in str(e)
00216 
00217 check_phylip_reject_duplicate()
00218 
00219 
00220 #Check parsers can cope with an empty file
00221 for t_format in AlignIO._FormatToIterator:
00222      handle = StringIO()
00223      alignments = list(AlignIO.parse(handle, t_format))
00224      assert len(alignments) == 0
00225 
00226 #Check writers can cope with no alignments
00227 for t_format in list(AlignIO._FormatToWriter)+list(SeqIO._FormatToWriter):
00228      handle = StringIO()
00229      assert 0 == AlignIO.write([], handle, t_format), \
00230             "Writing no alignments to %s format should work!" \
00231             % t_format
00232 
00233 #Check writers reject non-alignments
00234 list_of_records = list(AlignIO.read(open("Clustalw/opuntia.aln"),"clustal"))
00235 for t_format in list(AlignIO._FormatToWriter)+list(SeqIO._FormatToWriter):
00236     handle = StringIO()
00237     try:
00238         AlignIO.write([list_of_records], handle, t_format)
00239         assert False, "Writing non-alignment to %s format should fail!" \
00240             % t_format
00241     except (TypeError, AttributeError, ValueError):
00242         pass
00243     del handle
00244 del list_of_records, t_format
00245 
00246 #Main tests...
00247 for (t_format, t_per, t_count, t_filename) in test_files:
00248     print "Testing reading %s format file %s with %i alignments" \
00249           % (t_format, t_filename, t_count)
00250     assert os.path.isfile(t_filename), t_filename
00251 
00252     #Try as an iterator using handle
00253     alignments  = list(AlignIO.parse(handle=open(t_filename,"r"), format=t_format))
00254     assert len(alignments)  == t_count, \
00255          "Found %i alignments but expected %i" % (len(alignments), t_count)
00256     for alignment in alignments:
00257         assert len(alignment) == t_per, \
00258             "Expected %i records per alignment, got %i" \
00259             % (t_per, len(alignment))
00260 
00261     #Try using the iterator with a for loop and a filename not handle
00262     alignments2 = []
00263     for record in AlignIO.parse(t_filename, format=t_format):
00264         alignments2.append(record)
00265     assert len(alignments2) == t_count
00266 
00267     #Try using the iterator with the next() method
00268     alignments3 = []
00269     seq_iterator = AlignIO.parse(handle=open(t_filename,"r"), format=t_format)
00270     while True:
00271         try:
00272             record = seq_iterator.next()
00273         except StopIteration:
00274             break
00275         assert record is not None, "Should raise StopIteration not return None"
00276         alignments3.append(record)
00277 
00278     #Try a mixture of next() and list (a torture test!)
00279     seq_iterator = AlignIO.parse(handle=open(t_filename,"r"), format=t_format)
00280     try:
00281         record = seq_iterator.next()
00282     except StopIteration:
00283         record = None
00284     if record is not None:
00285         alignments4 = [record]
00286         alignments4.extend(list(seq_iterator))
00287     else:
00288         alignments4 = []
00289     assert len(alignments4) == t_count
00290 
00291     #Try a mixture of next() and for loop (a torture test!)
00292     seq_iterator = AlignIO.parse(handle=open(t_filename,"r"), format=t_format)
00293     try:
00294         record = seq_iterator.next()
00295     except StopIteration:
00296         record = None
00297     if record is not None:
00298         alignments5 = [record]
00299         for record in seq_iterator:
00300             alignments5.append(record)
00301     else:
00302         alignments5 = []
00303     assert len(alignments5) == t_count
00304 
00305     # Check Bio.AlignIO.read(...)
00306     if t_count == 1:
00307         alignment = AlignIO.read(handle=open(t_filename), format=t_format)
00308         assert isinstance(alignment, Alignment)
00309     else:
00310         try:
00311             alignment = AlignIO.read(open(t_filename), t_format)
00312             assert False, "Bio.AlignIO.read(...) should have failed"
00313         except ValueError:
00314             #Expected to fail
00315             pass
00316 
00317     #Print the alignment
00318     for i,alignment in enumerate(alignments):
00319         if i < 3 or i+1 == t_count:
00320             print " Alignment %i, with %i sequences of length %i" \
00321                   % (i,
00322                      len(alignment),
00323                      alignment.get_alignment_length())
00324             print alignment_summary(alignment)
00325         elif i==3:
00326             print " ..."
00327 
00328     #Check AlignInfo.SummaryInfo likes the alignment
00329     summary = AlignInfo.SummaryInfo(alignment)
00330     dumb_consensus = summary.dumb_consensus()
00331     #gap_consensus = summary.gap_consensus()
00332     if t_format != "nexus":
00333         #Hack for bug 2535
00334         pssm = summary.pos_specific_score_matrix()
00335         rep_dict = summary.replacement_dictionary()
00336         try:
00337             info_content = summary.information_content()
00338         except ValueError, e:
00339             if str(e) != "Error in alphabet: not Nucleotide or Protein, supply expected frequencies":
00340                 raise e
00341             pass
00342 
00343     if t_count==1 and t_format not in ["nexus","emboss","fasta-m10"]:
00344         #print " Trying to read a triple concatenation of the input file"
00345         data = open(t_filename,"r").read()
00346         handle = StringIO()
00347         handle.write(data + "\n\n" + data + "\n\n" + data)
00348         handle.seek(0)
00349         assert 3 == len(list(AlignIO.parse(handle=handle, format=t_format, seq_count=t_per)))
00350 
00351     #Some alignment file formats have magic characters which mean
00352     #use the letter in this position in the first sequence.
00353     #They should all have been converted by the parser, but if
00354     #not reversing the record order might expose an error.  Maybe.
00355     alignments.reverse()
00356     check_simple_write_read(alignments)
00357 
00358 print "Finished tested reading files"