Back to index

python-biopython  1.60
test_ClustalOmega_tool.py
Go to the documentation of this file.
00001 # Copyright 2008-2011 by Peter Cock.  All rights reserved.
00002 # This code is part of the Biopython distribution and governed by its
00003 # license.  Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 
00006 from Bio import MissingExternalDependencyError
00007 
00008 import sys
00009 import os
00010 from Bio import SeqIO
00011 from Bio import AlignIO
00012 from Bio.Align.Applications import ClustalOmegaCommandline
00013 from Bio.Application import ApplicationError
00014 
00015 #################################################################
00016 
00017 #Try to avoid problems when the OS is in another language
00018 os.environ['LANG'] = 'C'
00019 
00020 clustalo_exe = None
00021 if sys.platform=="win32":
00022     #TODO
00023     raise MissingExternalDependencyError("Testing this on Windows not implemented yet")
00024 else:
00025     import commands
00026     output = commands.getoutput("clustalo --help")
00027     if output.startswith("Clustal Omega"):
00028         clustalo_exe = "clustalo"
00029 
00030 if not clustalo_exe:
00031     raise MissingExternalDependencyError(\
00032         "Install clustalo if you want to use Clustal Omega from Biopython.")
00033 
00034 #################################################################
00035 
00036 print "Checking error conditions"
00037 print "========================="
00038 
00039 print "Empty file"
00040 input_file = "does_not_exist.fasta"
00041 assert not os.path.isfile(input_file)
00042 cline = ClustalOmegaCommandline(clustalo_exe, infile=input_file)
00043 try:
00044     stdout, stderr = cline()
00045     assert False, "Should have failed, returned:\n%s\n%s" % (stdout, stderr)
00046 except ApplicationError, err:
00047     print "Failed (good)"
00048     #Python 2.3 on Windows gave (0, 'Error')
00049     #Python 2.5 on Windows gives [Errno 0] Error
00050     assert "Cannot open sequence file" in str(err) or \
00051            "Cannot open input file" in str(err) or \
00052            "non-zero exit status" in str(err), str(err)
00053 
00054 print
00055 print "Single sequence"
00056 input_file = "Fasta/f001"
00057 assert os.path.isfile(input_file)
00058 assert len(list(SeqIO.parse(input_file,"fasta")))==1
00059 cline = ClustalOmegaCommandline(clustalo_exe, infile=input_file)
00060 try:
00061     stdout, stderr = cline()
00062     assert False, "Should have failed, returned:\n%s\n%s" % (stdout, stderr)
00063 except ApplicationError, err:
00064     print "Failed (good)"
00065     assert "contains 1 sequence, nothing to align" in str(err), str(err)
00066 
00067 print
00068 print "Invalid sequence"
00069 input_file = "Medline/pubmed_result1.txt"
00070 assert os.path.isfile(input_file)
00071 cline = ClustalOmegaCommandline(clustalo_exe, infile=input_file)
00072 try:
00073     stdout, stderr = cline()
00074     assert False, "Should have failed, returned:\n%s\n%s" % (stdout, stderr)
00075 except ApplicationError, err:
00076     print "Failed (good)"
00077     #Ideally we'd catch the return code and raise the specific
00078     #error for "invalid format", rather than just notice there
00079     #is not output file.
00080     #Note:
00081     #Python 2.3 on Windows gave (0, 'Error')
00082     #Python 2.5 on Windows gives [Errno 0] Error
00083     assert "Can't determine format of sequence file" in str(err), str(err)
00084 
00085 #################################################################
00086 print
00087 print "Checking normal situations"
00088 print "=========================="
00089 
00090 #Create a temp fasta file with a space in the name
00091 temp_filename_with_spaces = "Clustalw/temp horses.fasta"
00092 handle = open(temp_filename_with_spaces, "w")
00093 SeqIO.write(SeqIO.parse("Phylip/hennigian.phy","phylip"),handle, "fasta")
00094 handle.close()
00095 
00096 #Create a large input file by converting another example file
00097 #(See Bug 2804, this will produce so much output on stdout that
00098 #subprocess could suffer a deadlock and hang).  Using all the
00099 #records should show the deadlock but is very slow - just thirty
00100 #seems to lockup on Mac OS X, even 20 on Linux (without the fix).
00101 temp_large_fasta_file = "temp_cw_prot.fasta"
00102 handle = open(temp_large_fasta_file, "w")
00103 records = list(SeqIO.parse("NBRF/Cw_prot.pir", "pir"))[:40]
00104 SeqIO.write(records, handle, "fasta")
00105 handle.close()
00106 del handle, records
00107 
00108 for input_file, output_file, newtree_file in [
00109     ("Registry/seqs.fasta", "temp with space.aln", None),
00110     ("Registry/seqs.fasta", "temp_test.aln", None),
00111     ("Registry/seqs.fasta", "temp_test.aln", "temp_test.dnd"),
00112     ("Registry/seqs.fasta", "temp_test.aln", "temp with space.dnd"),
00113     (temp_filename_with_spaces, "temp_test.aln", None),
00114     (temp_filename_with_spaces, "temp with space.aln", None),
00115     (temp_large_fasta_file, "temp_cw_prot.aln", None),
00116     ]:
00117     #Note that ClustalW will map ":" to "_" in it's output file
00118     input_records = SeqIO.to_dict(SeqIO.parse(input_file,"fasta"),
00119                                   lambda rec : rec.id.replace(":","_"))
00120     if os.path.isfile(output_file):
00121         os.remove(output_file)
00122     print "Calling clustalw on %s (with %i records)" \
00123           % (repr(input_file), len(input_records))
00124     print "using output file %s" % repr(output_file)
00125     if newtree_file is not None:
00126         print "requesting output guide tree file %s" % repr(newtree_file)
00127 
00128     #Any filesnames with spaces should get escaped with quotes automatically.
00129     #Using keyword arguments here, force=True to over-write the output file.
00130     cline = ClustalOmegaCommandline(clustalo_exe,
00131                                     infile=input_file,
00132                                     outfile=output_file,
00133                                     outfmt="clustal",
00134                                     force=True)
00135     assert str(eval(repr(cline)))==str(cline)
00136     if newtree_file is not None:
00137         #Test using a property:
00138         cline.guidetree_out = newtree_file
00139         assert str(eval(repr(cline)))==str(cline)
00140     #print cline
00141     output, error = cline()
00142     #assert output, "No output from: %s\n%s" % (cline, error)
00143     assert not output or output.strip().startswith("CLUSTAL"), output
00144     assert error.strip() == "" or \
00145            error.startswith("WARNING: Sequence type is DNA."), error
00146     #Check the output...
00147     align = AlignIO.read(output_file, "clustal")
00148     #The length of the alignment will depend on the version of clustalw
00149     #(clustalw 2.0.10 and clustalw 1.83 are certainly different).
00150     print "Got an alignment, %i sequences" % (len(align))
00151     output_records = SeqIO.to_dict(SeqIO.parse(output_file,"clustal"))
00152     assert len(set(input_records.keys())) == len(set(output_records.keys())), \
00153         "%r vs %r" %(sorted(input_records.keys()), sorted(output_records.keys()))
00154     #TODO - Investigate Clustal Omega underscore/semi-colon mapping
00155     #for record in align:
00156     #    assert str(record.seq) == str(output_records[record.id].seq)
00157     #    assert str(record.seq).replace("-","") == \
00158     #           str(input_records[record.id].seq)
00159 
00160     #Clean up...
00161     os.remove(output_file)
00162 
00163     #TODO Check the DND file was created.
00164     #TODO - Try and parse this with Bio.Nexus?
00165     if newtree_file is not None \
00166     and os.path.isfile(newtree_file):
00167         os.remove(newtree_file)
00168 
00169 #Clean up any stray temp files..
00170 if os.path.isfile(temp_filename_with_spaces):
00171     os.remove(temp_filename_with_spaces)
00172 if os.path.isfile(temp_large_fasta_file):
00173     os.remove(temp_large_fasta_file)
00174 
00175 print "Done"