Back to index

python-biopython  1.60
_Mafft.py
Go to the documentation of this file.
00001 # Copyright 2009 by Cymon J. Cox.  All rights reserved.
00002 # This code is part of the Biopython distribution and governed by its
00003 # license.  Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 """Command line wrapper for the multiple alignment programme MAFFT.
00006 """
00007 
00008 __docformat__ = "epytext en" #Don't just use plain text in epydoc API pages!
00009 
00010 #TODO - Remove file exist checks? Other wrappers don't do this, and
00011 #it prevent things like preparing commands to run on a cluster.
00012 
00013 import os
00014 from Bio.Application import _Option, _Switch, _Argument, AbstractCommandline
00015 
00016 class MafftCommandline(AbstractCommandline):
00017     """Command line wrapper for the multiple alignment program MAFFT.
00018 
00019     http://align.bmr.kyushu-u.ac.jp/mafft/software/
00020 
00021     Example:
00022 
00023     >>> from Bio.Align.Applications import MafftCommandline
00024     >>> mafft_exe = "/opt/local/mafft"
00025     >>> in_file = "../Doc/examples/opuntia.fasta"
00026     >>> mafft_cline = MafftCommandline(mafft_exe, input=in_file)
00027     >>> print mafft_cline
00028     /opt/local/mafft ../Doc/examples/opuntia.fasta
00029 
00030     If the mafft binary is on the path (typically the case on a Unix style
00031     operating system) then you don't need to supply the executable location:
00032 
00033     >>> from Bio.Align.Applications import MafftCommandline
00034     >>> in_file = "../Doc/examples/opuntia.fasta"
00035     >>> mafft_cline = MafftCommandline(input=in_file)
00036     >>> print mafft_cline
00037     mafft ../Doc/examples/opuntia.fasta
00038 
00039     You would typically run the command line with mafft_cline() or via
00040     the Python subprocess module, as described in the Biopython tutorial.
00041     Note that MAFFT will write the alignment to stdout, which you may
00042     want to save to a file and then parse, e.g.
00043 
00044     stdout, stderr = mafft_cline()
00045     handle = open("aligned.fasta", "w")
00046     handle.write(stdout)
00047     handle.close()
00048     from Bio import AlignIO
00049     align = AlignIO.read("aligned.fasta", "fasta")
00050 
00051     Alternatively, to parse the output with AlignIO directly you can
00052     use StringIO to turn the string into a handle:
00053 
00054     stdout, stderr = mafft_cline()
00055     from StringIO import StringIO
00056     from Bio import AlignIO
00057     align = AlignIO.read(StringIO(stdout), "fasta")
00058 
00059     Citations:
00060 
00061     Katoh, Toh (BMC Bioinformatics 9:212, 2008) Improved accuracy of
00062     multiple ncRNA alignment by incorporating structural information into
00063     a MAFFT-based framework (describes RNA structural alignment methods)
00064 
00065     Katoh, Toh (Briefings in Bioinformatics 9:286-298, 2008) Recent
00066     developments in the MAFFT multiple sequence alignment program
00067     (outlines version 6)
00068 
00069     Katoh, Toh (Bioinformatics 23:372-374, 2007)  Errata PartTree: an
00070     algorithm to build an approximate tree from a large number of
00071     unaligned sequences (describes the PartTree algorithm)
00072 
00073     Katoh, Kuma, Toh, Miyata (Nucleic Acids Res. 33:511-518, 2005) MAFFT
00074     version 5: improvement in accuracy of multiple sequence alignment
00075     (describes [ancestral versions of] the G-INS-i, L-INS-i and E-INS-i
00076     strategies)
00077 
00078     Katoh, Misawa, Kuma, Miyata (Nucleic Acids Res. 30:3059-3066, 2002)
00079 
00080     Last checked against version: MAFFT v6.717b (2009/12/03)
00081     """
00082     def __init__(self, cmd="mafft", **kwargs):
00083         BLOSUM_MATRICES = ["30","45","62","80"]
00084         self.parameters = \
00085             [
00086             #**** Algorithm ****
00087             #Automatically selects an appropriate strategy from L-INS-i, FFT-NS-
00088             #i and FFT-NS-2, according to data size. Default: off (always FFT-NS-2)
00089             _Switch(["--auto", "auto"],
00090                     "Automatically select strategy. Default off."),
00091             #Distance is calculated based on the number of shared 6mers. Default: on
00092             _Switch(["--6merpair", "6merpair", "sixmerpair"],
00093                     "Distance is calculated based on the number of shared "
00094                     "6mers. Default: on"),
00095             #All pairwise alignments are computed with the Needleman-Wunsch
00096             #algorithm. More accurate but slower than --6merpair. Suitable for a
00097             #set of globally alignable sequences. Applicable to up to ~200
00098             #sequences. A combination with --maxiterate 1000 is recommended (G-
00099             #INS-i). Default: off (6mer distance is used)
00100             _Switch(["--globalpair", "globalpair"],
00101                     "All pairwise alignments are computed with the "
00102                     "Needleman-Wunsch algorithm. Default: off"),
00103             #All pairwise alignments are computed with the Smith-Waterman
00104             #algorithm. More accurate but slower than --6merpair. Suitable for a
00105             #set of locally alignable sequences. Applicable to up to ~200
00106             #sequences. A combination with --maxiterate 1000 is recommended (L-
00107             #INS-i). Default: off (6mer distance is used)
00108             _Switch(["--localpair", "localpair"],
00109                     "All pairwise alignments are computed with the "
00110                     "Smith-Waterman algorithm. Default: off"),
00111             #All pairwise alignments are computed with a local algorithm with
00112             #the generalized affine gap cost (Altschul 1998). More accurate but
00113             #slower than --6merpair. Suitable when large internal gaps are
00114             #expected. Applicable to up to ~200 sequences. A combination with --
00115             #maxiterate 1000 is recommended (E-INS-i). Default: off (6mer
00116             #distance is used)
00117             _Switch(["--genafpair", "genafpair"],
00118                     "All pairwise alignments are computed with a local "
00119                     "algorithm with the generalized affine gap cost "
00120                     "(Altschul 1998). Default: off"),
00121             #All pairwise alignments are computed with FASTA (Pearson and Lipman
00122             #1988). FASTA is required. Default: off (6mer distance is used)
00123             _Switch(["--fastapair", "fastapair"],
00124                     "All pairwise alignments are computed with FASTA "
00125                     "(Pearson and Lipman 1988). Default: off"),
00126             #Weighting factor for the consistency term calculated from pairwise
00127             #alignments. Valid when either of --blobalpair, --localpair, --
00128             #genafpair, --fastapair or --blastpair is selected. Default: 2.7
00129             _Option(["--weighti", "weighti"],
00130                     "Weighting factor for the consistency term calculated "
00131                     "from pairwise alignments. Default: 2.7",
00132                     checker_function=lambda x: isinstance(x, float),
00133                     equate=False),
00134             #Guide tree is built number times in the progressive stage. Valid
00135             #with 6mer distance. Default: 2
00136             _Option(["--retree", "retree"],
00137                     "Guide tree is built number times in the progressive "
00138                     "stage. Valid with 6mer distance. Default: 2",
00139                     checker_function=lambda x: isinstance(x, int),
00140                     equate=False),
00141             #Number cycles of iterative refinement are performed. Default: 0
00142             _Option(["--maxiterate", "maxiterate"],
00143                     "Number cycles of iterative refinement are performed. "
00144                     "Default: 0",
00145                     checker_function=lambda x: isinstance(x, int),
00146                     equate=False),
00147             #Use FFT approximation in group-to-group alignment. Default: on
00148             _Switch(["--fft", "fft"],
00149                     "Use FFT approximation in group-to-group alignment. "
00150                     "Default: on"),
00151             #Do not use FFT approximation in group-to-group alignment. Default:
00152             #off
00153             _Switch(["--nofft", "nofft"],
00154                     "Do not use FFT approximation in group-to-group "
00155                     "alignment. Default: off"),
00156             #Alignment score is not checked in the iterative refinement stage.
00157             #Default: off (score is checked)
00158             _Switch(["--noscore", "noscore"],
00159                     "Alignment score is not checked in the iterative "
00160                     "refinement stage. Default: off (score is checked)"),
00161             #Use the Myers-Miller (1988) algorithm. Default: automatically
00162             #turned on when the alignment length exceeds 10,000 (aa/nt).
00163             _Switch(["--memsave", "memsave"],
00164                     "Use the Myers-Miller (1988) algorithm. Default: "
00165                     "automatically turned on when the alignment length "
00166                     "exceeds 10,000 (aa/nt)."),
00167             #Use a fast tree-building method (PartTree, Katoh and Toh 2007) with
00168             #the 6mer distance. Recommended for a large number (> ~10,000) of
00169             #sequences are input. Default: off
00170             _Switch(["--parttree", "parttree"],
00171                     "Use a fast tree-building method with the 6mer "
00172                     "distance. Default: off"),
00173             #The PartTree algorithm is used with distances based on DP. Slightly
00174             #more accurate and slower than --parttree. Recommended for a large
00175             #number (> ~10,000) of sequences are input. Default: off
00176             _Switch(["--dpparttree", "dpparttree"],
00177                     "The PartTree algorithm is used with distances "
00178                     "based on DP. Default: off"),
00179             #The PartTree algorithm is used with distances based on FASTA.
00180             #Slightly more accurate and slower than --parttree. Recommended for
00181             #a large number (> ~10,000) of sequences are input. FASTA is
00182             #required. Default: off
00183             _Switch(["--fastaparttree", "fastaparttree"],
00184                     "The PartTree algorithm is used with distances based "
00185                     "on FASTA. Default: off"),
00186             #The number of partitions in the PartTree algorithm. Default: 50
00187             _Option(["--partsize", "partsize"],
00188                     "The number of partitions in the PartTree algorithm. "
00189                     "Default: 50",
00190                     checker_function=lambda x: isinstance(x, int),
00191                     equate=False),
00192             #Do not make alignment larger than number sequences. Valid only with
00193             #the --*parttree options. Default: the number of input sequences
00194             _Switch(["--groupsize", "groupsize"],
00195                     "Do not make alignment larger than number sequences. "
00196                     "Default: the number of input sequences"),
00197             #**** Parameter ****
00198             #Gap opening penalty at group-to-group alignment. Default: 1.53
00199             _Option(["--op", "op"],
00200                     "Gap opening penalty at group-to-group alignment. "
00201                     "Default: 1.53",
00202                     checker_function=lambda x: isinstance(x, float),
00203                     equate=False),
00204             #Offset value, which works like gap extension penalty, for group-to-
00205             #group alignment. Deafult: 0.123
00206             _Option(["--ep", "ep"],
00207                     "Offset value, which works like gap extension penalty, "
00208                     "for group-to- group alignment. Default: 0.123",
00209                     checker_function=lambda x: isinstance(x, float),
00210                     equate=False),
00211             #Gap opening penalty at local pairwise alignment. Valid when the --
00212             #localpair or --genafpair option is selected. Default: -2.00
00213             _Option(["--lop", "lop"],
00214                     "Gap opening penalty at local pairwise alignment. "
00215                     "Default: 0.123",
00216                     checker_function=lambda x: isinstance(x, float),
00217                     equate=False),
00218             #Offset value at local pairwise alignment. Valid when the --
00219             #localpair or --genafpair option is selected. Default: 0.1
00220             _Option(["--lep", "lep"],
00221                     "Offset value at local pairwise alignment. "
00222                     "Default: 0.1",
00223                     checker_function=lambda x: isinstance(x, float),
00224                     equate=False),
00225             #Gap extension penalty at local pairwise alignment. Valid when the -
00226             #-localpair or --genafpair option is selected. Default: -0.1
00227             _Option(["--lexp", "lexp"],
00228                     "Gap extension penalty at local pairwise alignment. "
00229                     "Default: -0.1",
00230                     checker_function=lambda x: isinstance(x, float),
00231                     equate=False),
00232             #Gap opening penalty to skip the alignment. Valid when the --
00233             #genafpair option is selected. Default: -6.00
00234             _Option(["--LOP", "LOP"],
00235                     "Gap opening penalty to skip the alignment. "
00236                     "Default: -6.00",
00237                     checker_function=lambda x: isinstance(x, float),
00238                     equate=False),
00239             #Gap extension penalty to skip the alignment. Valid when the --
00240             #genafpair option is selected. Default: 0.00
00241             _Option(["--LEXP", "LEXP"],
00242                     "Gap extension penalty to skip the alignment. "
00243                     "Default: 0.00",
00244                     checker_function=lambda x: isinstance(x, float),
00245                     equate=False),
00246 
00247             #BLOSUM number matrix (Henikoff and Henikoff 1992) is used.
00248             #number=30, 45, 62 or 80. Default: 62
00249             _Option(["--bl", "bl"],
00250                     "BLOSUM number matrix is used. Default: 62",
00251                     checker_function=lambda x: x in BLOSUM_MATRICES,
00252                     equate=False),
00253             #JTT PAM number (Jones et al. 1992) matrix is used. number>0.
00254             #Default: BLOSUM62
00255             _Option(["--jtt", "jtt"],
00256                     "JTT PAM number (Jones et al. 1992) matrix is used. "
00257                     "number>0. Default: BLOSUM62",
00258                     equate=False),
00259             #Transmembrane PAM number (Jones et al. 1994) matrix is used.
00260             #number>0. Default: BLOSUM62
00261             _Option(["--tm", "tm"],
00262                     "Transmembrane PAM number (Jones et al. 1994) "
00263                     "matrix is used. number>0. Default: BLOSUM62",
00264                     checker_function=os.path.exists,
00265                     filename=True,
00266                     equate=False),
00267             #Use a user-defined AA scoring matrix. The format of matrixfile is
00268             #the same to that of BLAST. Ignored when nucleotide sequences are
00269             #input. Default: BLOSUM62
00270             _Option(["--aamatrix", "aamatrix"],
00271                     "Use a user-defined AA scoring matrix. "
00272                     "Default: BLOSUM62",
00273                     checker_function=os.path.exists,
00274                     filename=True,
00275                     equate=False),
00276             #Incorporate the AA/nuc composition information into the scoring
00277             #matrix. Default: off
00278             _Switch(["--fmodel", "fmodel"],
00279                     "Incorporate the AA/nuc composition information into "
00280                     "the scoring matrix (True) or not (False, default)"),
00281             #**** Output ****
00282             #Output format: clustal format. Default: off (fasta format)
00283             _Switch(["--clustalout", "clustalout"],
00284                     "Output format: clustal (True) or fasta (False, default)"),
00285             #Output order: same as input. Default: on
00286             _Switch(["--inputorder", "inputorder"],
00287                     "Output order: same as input (True, default) or alignment "
00288                     "based (False)"),
00289             #Output order: aligned. Default: off (inputorder)
00290             _Switch(["--reorder", "reorder"],
00291                     "Output order: aligned (True) or in input order (False, "
00292                     "default)"),
00293             #Guide tree is output to the input.tree file. Default: off
00294             _Switch(["--treeout", "treeout"],
00295                     "Guide tree is output to the input.tree file (True) or "
00296                     "not (False, default)"),
00297             #Do not report progress. Default: off
00298             _Switch(["--quiet", "quiet"],
00299                     "Do not report progress (True) or not (False, default)."),
00300             #**** Input ****
00301             #Assume the sequences are nucleotide. Deafult: auto
00302             _Switch(["--nuc", "nuc"],
00303                     "Assume the sequences are nucleotide (True/False). "
00304                     "Default: auto"),
00305             #Assume the sequences are amino acid. Deafult: auto
00306             _Switch(["--amino", "amino"],
00307                     "Assume the sequences are amino acid (True/False). "
00308                     "Default: auto"),
00309             ###################### SEEDS #####################################
00310             # MAFFT has multiple --seed commands where the unaligned input is
00311             # aligned to the seed alignment. There can be multiple seeds in the
00312             # form: "mafft --seed align1 --seed align2 [etc] input"
00313             # Effectively for n number of seed alignments. Here we're going to
00314             # assume 6 extra are enough
00315             _Option(["--seed", "seed"],
00316                     "Seed alignments given in alignment_n (fasta format) "
00317                     "are aligned with sequences in input.",
00318                     checker_function=os.path.exists,
00319                     filename=True,
00320                     equate=False),
00321             #The old solution of also defining extra parameters with
00322             #["--seed", "seed1"] etc worked, but clashes with the recent
00323             #code in the base class to look for duplicate paramters and raise
00324             #an error.  Perhaps that check should be ignored here, or maybe
00325             #we can handle this more elegantly...
00326             #TODO - Create an _OptionList parameter which allows a list to be
00327             #assigned to the value?
00328             ####################### END SEEDS  ################################
00329             #The input (must be FASTA format)
00330             _Argument(["input"],
00331                       "Input file name",
00332                       checker_function=os.path.exists,
00333                       filename=True,
00334                       is_required=True),
00335             ###################################################################
00336             #mafft-profile takes a second alignment input as an argument:
00337             #mafft-profile align1 align2
00338             _Argument(["input1"],
00339                       "Second input file name for the mafft-profile command",
00340                       checker_function=os.path.exists,
00341                       filename=True),
00342             ]
00343         AbstractCommandline.__init__(self, cmd, **kwargs)
00344 
00345 def _test():
00346     """Run the module's doctests (PRIVATE).
00347 
00348     This will try and locate the unit tests directory, and run the doctests
00349     from there in order that the relative paths used in the examples work.
00350     """
00351     #TODO - Remove os.path checks on input filenames?
00352     import doctest
00353     import os
00354     if os.path.isdir(os.path.join("..","Tests")):
00355         print "Runing doctests..."
00356         cur_dir = os.path.abspath(os.curdir)
00357         os.chdir(os.path.join("..","Tests"))
00358         doctest.testmod()
00359         os.chdir(cur_dir)
00360         del cur_dir
00361         print "Done"
00362     elif os.path.isdir(os.path.join("Tests")) :
00363         print "Runing doctests..."
00364         cur_dir = os.path.abspath(os.curdir)
00365         os.chdir(os.path.join("Tests"))
00366         doctest.testmod()
00367         os.chdir(cur_dir)
00368         del cur_dir
00369         print "Done"
00370 
00371 if __name__ == "__main__":
00372     _test()