Back to index

python-biopython  1.60
test_Clustalw_tool.py
Go to the documentation of this file.
00001 # Copyright 2008-2011 by Peter Cock.  All rights reserved.
00002 # This code is part of the Biopython distribution and governed by its
00003 # license.  Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 
00006 #TODO - Clean up the extra files created by clustalw?  e.g. *.dnd
00007 #and *.aln where we have not requested an explicit name?
00008 from Bio import MissingExternalDependencyError
00009 
00010 import sys
00011 import os
00012 from Bio import SeqIO
00013 from Bio import AlignIO
00014 from Bio.Align.Applications import ClustalwCommandline
00015 from Bio.Application import ApplicationError
00016 
00017 #################################################################
00018 
00019 #Try to avoid problems when the OS is in another language
00020 os.environ['LANG'] = 'C'
00021 
00022 clustalw_exe = None
00023 if sys.platform=="win32":
00024     #TODO - Check the path?
00025     try:
00026         #This can vary depending on the Windows language.
00027         prog_files = os.environ["PROGRAMFILES"]
00028     except KeyError:
00029         prog_files = r"C:\Program Files"
00030 
00031     #Note that EBI's clustalw2 installer, e.g. clustalw-2.0.10-win.msi
00032     #uses C:\Program Files\ClustalW2\clustalw2.exe so we should check
00033     #for that.
00034     #
00035     #Some users doing a manual install have reported using
00036     #C:\Program Files\clustalw.exe
00037     #
00038     #Older installers might use something like this,
00039     #C:\Program Files\Clustalw\clustalw.exe
00040     #
00041     #One particular case is www.tc.cornell.edu currently provide a
00042     #clustalw1.83 installer which uses the following long location:
00043     #C:\Program Files\CTCBioApps\clustalw\v1.83\clustalw1.83.exe
00044     likely_dirs = ["ClustalW2", "",
00045                    "Clustal","Clustalw","Clustalw183","Clustalw1.83",
00046                    r"CTCBioApps\clustalw\v1.83"]
00047     likely_exes = ["clustalw2.exe",
00048                    "clustalw.exe", "clustalw1.83.exe"]
00049     for folder in likely_dirs:
00050         if os.path.isdir(os.path.join(prog_files, folder)):
00051             for filename in likely_exes:
00052                 if os.path.isfile(os.path.join(prog_files, folder, filename)):
00053                     clustalw_exe = os.path.join(prog_files, folder, filename)
00054                     break
00055             if clustalw_exe : break
00056 else:
00057     import commands
00058     #Note that clustalw 1.83 and clustalw 2.0.10 don't obey the --version
00059     #command, but this does cause them to quit cleanly.  Otherwise they prompt
00060     #the user for input (causing a lock up).
00061     output = commands.getoutput("clustalw2 --version")
00062     #Since "not found" may be in another language, try and be sure this is
00063     #really the clustalw tool's output
00064     if "not found" not in output and "CLUSTAL" in output \
00065     and "Multiple Sequence Alignments" in output:
00066         clustalw_exe = "clustalw2"
00067     if not clustalw_exe:
00068         output = commands.getoutput("clustalw --version")
00069         if "not found" not in output and "CLUSTAL" in output \
00070         and "Multiple Sequence Alignments" in output:
00071             clustalw_exe = "clustalw"
00072 
00073 if not clustalw_exe:
00074     raise MissingExternalDependencyError(\
00075         "Install clustalw or clustalw2 if you want to use it from Biopython.")
00076 
00077 #################################################################
00078 
00079 print "Checking error conditions"
00080 print "========================="
00081 
00082 print "Empty file"
00083 input_file = "does_not_exist.fasta"
00084 assert not os.path.isfile(input_file)
00085 cline = ClustalwCommandline(clustalw_exe, infile=input_file)
00086 try:
00087     stdout, stderr = cline()
00088     assert False, "Should have failed, returned:\n%s\n%s" % (stdout, stderr)
00089 except ApplicationError, err:
00090     print "Failed (good)"
00091     #Python 2.3 on Windows gave (0, 'Error')
00092     #Python 2.5 on Windows gives [Errno 0] Error
00093     assert "Cannot open sequence file" in str(err) or \
00094            "Cannot open input file" in str(err) or \
00095            "non-zero exit status" in str(err), str(err)
00096 
00097 print
00098 print "Single sequence"
00099 input_file = "Fasta/f001"
00100 assert os.path.isfile(input_file)
00101 assert len(list(SeqIO.parse(input_file,"fasta")))==1
00102 cline = ClustalwCommandline(clustalw_exe, infile=input_file)
00103 try:
00104     stdout, stderr = cline()
00105     if "cannot do multiple alignment" in (stdout + stderr):
00106         #Zero return code is a possible bug in clustal?
00107         print "Failed (good)"
00108     else:
00109         assert False, "Should have failed, returned:\n%s\n%s" % (stdout, stderr)
00110 except ApplicationError, err:
00111     print "Failed (good)"
00112     assert str(err) == "No records found in handle", str(err)
00113 
00114 print
00115 print "Invalid sequence"
00116 input_file = "Medline/pubmed_result1.txt"
00117 assert os.path.isfile(input_file)
00118 cline = ClustalwCommandline(clustalw_exe, infile=input_file)
00119 try:
00120     stdout, stderr = cline()
00121     assert False, "Should have failed, returned:\n%s\n%s" % (stdout, stderr)
00122 except ApplicationError, err:
00123     print "Failed (good)"
00124     #Ideally we'd catch the return code and raise the specific
00125     #error for "invalid format", rather than just notice there
00126     #is not output file.
00127     #Note:
00128     #Python 2.3 on Windows gave (0, 'Error')
00129     #Python 2.5 on Windows gives [Errno 0] Error
00130     assert "invalid format" in str(err) \
00131            or "not produced" in str(err) \
00132            or "No sequences in file" in str(err) \
00133            or "non-zero exit status " in str(err), str(err)
00134 
00135 #################################################################
00136 print
00137 print "Checking normal situations"
00138 print "=========================="
00139 
00140 #Create a temp fasta file with a space in the name
00141 temp_filename_with_spaces = "Clustalw/temp horses.fasta"
00142 handle = open(temp_filename_with_spaces, "w")
00143 SeqIO.write(SeqIO.parse("Phylip/hennigian.phy","phylip"),handle, "fasta")
00144 handle.close()
00145 
00146 #Create a large input file by converting another example file
00147 #(See Bug 2804, this will produce so much output on stdout that
00148 #subprocess could suffer a deadlock and hang).  Using all the
00149 #records should show the deadlock but is very slow - just thirty
00150 #seems to lockup on Mac OS X, even 20 on Linux (without the fix).
00151 temp_large_fasta_file = "temp_cw_prot.fasta"
00152 handle = open(temp_large_fasta_file, "w")
00153 records = list(SeqIO.parse("NBRF/Cw_prot.pir", "pir"))[:40]
00154 SeqIO.write(records, handle, "fasta")
00155 handle.close()
00156 del handle, records
00157 
00158 for input_file, output_file, newtree_file in [
00159     ("Fasta/f002", "temp_test.aln", None),
00160     ("GFF/multi.fna", "temp with space.aln", None),
00161     ("Registry/seqs.fasta", "temp_test.aln", None),
00162     ("Registry/seqs.fasta", "temp_test.aln", "temp_test.dnd"),
00163     ("Registry/seqs.fasta", "temp_test.aln", "temp with space.dnd"),
00164     (temp_filename_with_spaces, "temp_test.aln", None),
00165     (temp_filename_with_spaces, "temp with space.aln", None),
00166     (temp_large_fasta_file, "temp_cw_prot.aln", None),
00167     ]:
00168     #Note that ClustalW will map ":" to "_" in it's output file
00169     input_records = SeqIO.to_dict(SeqIO.parse(input_file,"fasta"),
00170                                   lambda rec : rec.id.replace(":","_"))
00171     if os.path.isfile(output_file):
00172         os.remove(output_file)
00173     print "Calling clustalw on %s (with %i records)" \
00174           % (repr(input_file), len(input_records))
00175     print "using output file %s" % repr(output_file)
00176     if newtree_file is not None:
00177         print "requesting output guide tree file %s" % repr(newtree_file)
00178 
00179     #Any filesnames with spaces should get escaped with quotes automatically.
00180     #Using keyword arguments here.
00181     cline = ClustalwCommandline(clustalw_exe,
00182                                 infile=input_file,
00183                                 outfile=output_file)
00184     assert str(eval(repr(cline)))==str(cline)
00185     if newtree_file is not None:
00186         #Test using a property:
00187         cline.newtree = newtree_file
00188         #I don't just want the tree, also want the alignment:
00189         cline.align = True
00190         assert str(eval(repr(cline)))==str(cline)
00191     #print cline
00192     output, error = cline()
00193     assert output.strip().startswith("CLUSTAL")
00194     assert error.strip() == ""
00195     #Check the output...
00196     align = AlignIO.read(output_file, "clustal")
00197     #The length of the alignment will depend on the version of clustalw
00198     #(clustalw 2.0.10 and clustalw 1.83 are certainly different).
00199     print "Got an alignment, %i sequences" % (len(align))
00200     output_records = SeqIO.to_dict(SeqIO.parse(output_file,"clustal"))
00201     assert set(input_records.keys()) == set(output_records.keys())
00202     for record in align:
00203         assert str(record.seq) == str(output_records[record.id].seq)
00204         assert str(record.seq).replace("-","") == \
00205                str(input_records[record.id].seq)
00206 
00207     #Clean up...
00208     os.remove(output_file)
00209 
00210     #Check the DND file was created.
00211     #TODO - Try and parse this with Bio.Nexus?
00212     if newtree_file is not None:
00213         tree_file = newtree_file
00214     else:
00215         #Clustalw will name it based on the input file
00216         tree_file = os.path.splitext(input_file)[0] + ".dnd"
00217     assert os.path.isfile(tree_file), \
00218            "Did not find tree file %s" % tree_file
00219     os.remove(tree_file)
00220 
00221 #Clean up any stray temp files..
00222 if os.path.isfile("Fasta/f001.aln"):
00223     os.remove("Fasta/f001.aln")
00224 if os.path.isfile("Medline/pubmed_result1.aln"):
00225     os.remove("Medline/pubmed_result1.aln")
00226 if os.path.isfile(temp_filename_with_spaces):
00227     os.remove(temp_filename_with_spaces)
00228 if os.path.isfile(temp_large_fasta_file):
00229     os.remove(temp_large_fasta_file)
00230 
00231 print "Done"