Back to index

python-biopython  1.60
_Muscle.py
Go to the documentation of this file.
00001 # Copyright 2009 by Cymon J. Cox.  All rights reserved.
00002 # This code is part of the Biopython distribution and governed by its
00003 # license.  Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 """Command line wrapper for the multiple alignment program MUSCLE.
00006 """
00007 
00008 __docformat__ = "epytext en" #Don't just use plain text in epydoc API pages!
00009 
00010 from Bio.Application import _Option, _Switch, AbstractCommandline
00011 
00012 class MuscleCommandline(AbstractCommandline):
00013     r"""Command line wrapper for the multiple alignment program MUSCLE.
00014 
00015     http://www.drive5.com/muscle/
00016 
00017     Example:
00018 
00019     >>> from Bio.Align.Applications import MuscleCommandline
00020     >>> muscle_exe = r"C:\Program Files\Aligments\muscle3.8.31_i86win32.exe"
00021     >>> in_file = r"C:\My Documents\unaligned.fasta"
00022     >>> out_file = r"C:\My Documents\aligned.fasta"
00023     >>> muscle_cline = MuscleCommandline(muscle_exe, input=in_file, out=out_file)
00024     >>> print muscle_cline
00025     C:\Program Files\Aligments\muscle3.8.31_i86win32.exe -in "C:\My Documents\unaligned.fasta" -out "C:\My Documents\aligned.fasta"
00026 
00027     You would typically run the command line with muscle_cline() or via
00028     the Python subprocess module, as described in the Biopython tutorial.
00029 
00030     Citations:
00031 
00032     Edgar, Robert C. (2004), MUSCLE: multiple sequence alignment with high
00033     accuracy and high throughput, Nucleic Acids Research 32(5), 1792-97.
00034 
00035     Edgar, R.C. (2004) MUSCLE: a multiple sequence alignment method with
00036     reduced time and space complexity. BMC Bioinformatics 5(1): 113.
00037 
00038     Last checked against version: 3.7, briefly against 3.8
00039     """
00040     def __init__(self, cmd="muscle", **kwargs):
00041         CLUSTERING_ALGORITHMS   = ["upgma", "upgmb", "neighborjoining"]
00042         DISTANCE_MEASURES_ITER1 = ["kmer6_6", "kmer20_3", "kmer20_4", "kbit20_3",
00043                                    "kmer4_6"]
00044         DISTANCE_MEASURES_ITER2 = DISTANCE_MEASURES_ITER1 + \
00045                                   ["pctid_kimura", "pctid_log"]
00046         OBJECTIVE_SCORES        = ["sp", "ps", "dp", "xp", "spf", "spm"]
00047         TREE_ROOT_METHODS       = ["pseudo", "midlongestspan", "minavgleafdist"]
00048         SEQUENCE_TYPES          = ["protein", "nucleo", "auto"]
00049         WEIGHTING_SCHEMES       = ["none", "clustalw", "henikoff", "henikoffpb",
00050                                    "gsc", "threeway"]
00051         self.parameters = \
00052            [
00053             #Can't use "in" as the final alias as this is a reserved word in python:
00054             _Option(["-in", "in", "input"],
00055                     "Input filename",
00056                     filename=True,
00057                     equate=False),
00058             _Option(["-out", "out"],
00059                     "Output filename",
00060                     filename=True,
00061                     equate=False),
00062             _Switch(["-diags", "diags"],
00063                     "Find diagonals (faster for similar sequences)"),
00064             _Switch(["-profile", "profile"],
00065                     "Perform a profile alignment"),
00066             _Option(["-in1", "in1"],
00067                     "First input filename for profile alignment",
00068                     filename=True,
00069                     equate=False),
00070             _Option(["-in2", "in2"],
00071                     "Second input filename for a profile alignment",
00072                     filename=True,
00073                     equate=False),
00074             #anchorspacing   Integer              32                 Minimum spacing between
00075             _Option(["-anchorspacing", "anchorspacing"],
00076                     "Minimum spacing between anchor columns",
00077                     checker_function=lambda x: isinstance(x, int),
00078                     equate=False),
00079             #center          Floating point       [1]                Center parameter.
00080             #                                                        Should be negative.
00081             _Option(["-center", "center"],
00082                     "Center parameter - should be negative",
00083                     checker_function=lambda x: isinstance(x, float),
00084                     equate=False),
00085             #cluster1        upgma                upgmb              Clustering method.
00086             _Option(["-cluster1", "cluster1"],
00087                     "Clustering method used in iteration 1",
00088                     checker_function=lambda x: x in CLUSTERING_ALGORITHMS,
00089                     equate=False),
00090             #cluster2        upgmb                                   cluster1 is used in
00091             #                neighborjoining                         iteration 1 and 2,
00092             #                                                        cluster2 in later
00093             #                                                        iterations.
00094             _Option(["-cluster2", "cluster2"],
00095                     "Clustering method used in iteration 2",
00096                     checker_function=lambda x: x in CLUSTERING_ALGORITHMS,
00097                     equate=False),
00098             #diaglength      Integer              24                 Minimum length of
00099             #                                                        diagonal.
00100             _Option(["-diaglength", "diaglength"],
00101                     "Minimum length of diagonal",
00102                     checker_function=lambda x: isinstance(x, int),
00103                     equate=True),
00104             #diagmargin      Integer              5                  Discard this many
00105             #                                                        positions at ends of
00106             #                                                        diagonal.
00107             _Option(["-diagmargin", "diagmargin"],
00108                     "Discard this many positions at ends of diagonal",
00109                     checker_function=lambda x: isinstance(x, int),
00110                     equate=False),
00111             #distance1       kmer6_6              Kmer6_6 (amino) or Distance measure for
00112             #                kmer20_3             Kmer4_6 (nucleo)   iteration 1.
00113             #                kmer20_4
00114             #                kbit20_3
00115             #                kmer4_6
00116             _Option(["-distance1", "distance1"],
00117                     "Distance measure for iteration 1",
00118                     checker_function=lambda x: x in DISTANCE_MEASURES_ITER1,
00119                     equate=False),
00120             #distance2       kmer6_6              pctid_kimura       Distance measure for
00121             #                kmer20_3                                iterations 2, 3 ...
00122             #                kmer20_4
00123             #                kbit20_3
00124             #                pctid_kimura
00125             #                pctid_log
00126             _Option(["-distance2", "distance2"],
00127                     "Distance measure for iteration 2",
00128                     checker_function=lambda x: x in DISTANCE_MEASURES_ITER2,
00129                     equate=False),
00130             #gapopen         Floating point       [1]                The gap open score.
00131             #                                                        Must be negative.
00132             _Option(["-gapopen", "gapopen"],
00133                     "Gap open score - negative number",
00134                     checker_function=lambda x: isinstance(x, float),
00135                     equate=False),
00136             #hydro           Integer              5                  Window size for
00137             #                                                        determining whether a
00138             #                                                        region is hydrophobic.
00139             _Option(["-hydro", "hydro"],
00140                     "Window size for hydrophobic region",
00141                     checker_function=lambda x: isinstance(x, int),
00142                     equate=False),
00143             #hydrofactor     Floating point       1.2                Multiplier for gap
00144             #                                                        open/close penalties in
00145             #                                                        hydrophobic regions.
00146             _Option(["-hydrofactor", "hydrofactor"],
00147                     "Multiplier for gap penalties in hydrophobic regions",
00148                     checker_function=lambda x: isinstance(x, float),
00149                     equate=False),
00150             #log             File name            None.              Log file name (delete
00151             #                                                        existing file).
00152             _Option(["-log", "log"],
00153                     "Log file name",
00154                     filename=True,
00155                     equate=False),
00156             #loga            File name            None.              Log file name (append
00157             #                                                        to existing file).
00158             _Option(["-loga", "loga"],
00159                     "Log file name (append to existing file)",
00160                     filename=True,
00161                     equate=False),
00162             #maxdiagbreak    Integer              1                  Maximum distance
00163             #                                                        between two diagonals
00164             #                                                        that allows them to
00165             #                                                        merge into one
00166             #                                                        diagonal.
00167             _Option(["-maxdiagbreak", "maxdiagbreak"],
00168                     "Maximum distance between two diagonals that allows "
00169                     "them to merge into one diagonal",
00170                     checker_function=lambda x: isinstance(x, int),
00171                     equate=False),
00172             #maxhours        Floating point       None.              Maximum time to run in
00173             #                                                        hours. The actual time
00174             #                                                        may exceed the
00175             #                                                        requested limit by a
00176             #                                                        few minutes. Decimals
00177             #                                                        are allowed, so 1.5
00178             #                                                        means one hour and 30
00179             #                                                        minutes.
00180             _Option(["-maxhours", "maxhours"],
00181                     "Maximum time to run in hours",
00182                     checker_function=lambda x: isinstance(x, float),
00183                     equate=False),
00184             #maxiters        Integer 1, 2 ...     16                 Maximum number of
00185             #                                                        iterations.
00186             _Option(["-maxiters", "maxiters"],
00187                     "Maximum number of iterations",
00188                     checker_function=lambda x: isinstance(x, int),
00189                     equate=False),
00190             #maxtrees        Integer              1                  Maximum number of new
00191             #                                                        trees to build in
00192             #                                                        iteration 2.
00193             _Option(["-maxtrees", "maxtrees"],
00194                     "Maximum number of trees to build in iteration 2",
00195                     checker_function=lambda x: isinstance(x, int),
00196                     equate=False),
00197             #minbestcolscore Floating point       [1]                Minimum score a column
00198             #                                                        must have to be an
00199             #                                                        anchor.
00200             _Option(["-minbestcolscore", "minbestcolscore"],
00201                     "Minimum score a column must have to be an anchor",
00202                     checker_function=lambda x: isinstance(x, float),
00203                     equate=False),
00204             #minsmoothscore  Floating point       [1]                Minimum smoothed score
00205             #                                                        a column must have to
00206             #                                                        be an anchor.
00207             _Option(["-minsmoothscore", "minsmoothscore"],
00208                     "Minimum smoothed score a column must have to "
00209                     "be an anchor",
00210                     checker_function=lambda x: isinstance(x, float),
00211                     equate=False),
00212             #objscore        sp                   spm                Objective score used by
00213             #                ps                                      tree dependent
00214             #                dp                                      refinement.
00215             #                xp                                      sp=sum-of-pairs score.
00216             #                spf                                     spf=sum-of-pairs score
00217             #                spm                                     (dimer approximation)
00218             #                                                        spm=sp for < 100 seqs,
00219             #                                                        otherwise spf
00220             #                                                        dp=dynamic programming
00221             #                                                        score.
00222             #                                                        ps=average profile-
00223             #                                                        sequence score.
00224             #                                                        xp=cross profile score.
00225             _Option(["-objscore", "objscore"],
00226                     "Objective score used by tree dependent refinement",
00227                     checker_function=lambda x: x in OBJECTIVE_SCORES,
00228                     equate=False),
00229             #root1           pseudo               psuedo             Method used to root
00230             _Option(["-root1", "root1"],
00231                     "Method used to root tree in iteration 1",
00232                     checker_function=lambda x: x in TREE_ROOT_METHODS,
00233                     equate=False),
00234             #root2           midlongestspan                          tree; root1 is used in
00235             #                minavgleafdist                          iteration 1 and 2,
00236             #                                                        root2 in later
00237             #                                                        iterations.
00238             _Option(["-root2", "root2"],
00239                     "Method used to root tree in iteration 2",
00240                     checker_function=lambda x: x in TREE_ROOT_METHODS,
00241                     equate=False),
00242             #seqtype         protein              auto               Sequence type.
00243             #                nucleo
00244             #                auto
00245             _Option(["-seqtype", "seqtype"],
00246                     "Sequence type",
00247                     checker_function=lambda x: x in SEQUENCE_TYPES,
00248                     equate=False),
00249             #smoothscoreceil Floating point       [1]                Maximum value of column
00250             #                                                        score for smoothing
00251             #                                                        purposes.
00252             _Option(["-smoothscoreceil", "smoothscoreceil"],
00253                     "Maximum value of column score for smoothing",
00254                     checker_function=lambda x: isinstance(x, float),
00255                     equate=False),
00256             #smoothwindow    Integer              7                  Window used for anchor
00257             #                                                        column smoothing.
00258             _Option(["-smoothwindow", "smoothwindow"],
00259                     "Window used for anchor column smoothing",
00260                     checker_function=lambda x: isinstance(x, int),
00261                     equate=False),
00262             #SUEFF           Floating point value 0.1                Constant used in UPGMB
00263             #                between 0 and 1.                        clustering. Determines
00264             #                                                        the relative fraction
00265             #                                                        of average linkage
00266             #                                                        (SUEFF) vs. nearest-
00267             #                                                        neighbor linkage (1
00268             #                                                        SUEFF).
00269             _Option(["-sueff", "sueff"],
00270                     "Constant used in UPGMB clustering",
00271                     checker_function=lambda x: isinstance(x, float),
00272                     equate=False),
00273             #tree1           File name            None               Save tree produced in
00274             _Option(["-tree1", "tree1"],
00275                     "Save Newick tree from iteration 1",
00276                     equate=False),
00277             #tree2                                                   first or second
00278             #                                                        iteration to given file
00279             #                                                        in Newick (Phylip-
00280             #                                                        compatible) format.
00281             _Option(["-tree2", "tree2"],
00282                     "Save Newick tree from iteration 2",
00283                     equate=False),
00284             #weight1         none                 clustalw           Sequence weighting
00285             _Option(["-weight1", "weight1"],
00286                     "Weighting scheme used in iteration 1",
00287                     checker_function=lambda x: x in WEIGHTING_SCHEMES,
00288                     equate=False),
00289             #weight2         henikoff                                scheme.
00290             #                henikoffpb                              weight1 is used in
00291             #                gsc                                     iterations 1 and 2.
00292             #                clustalw                                weight2 is used for
00293             #                threeway                                tree-dependent
00294             #                                                        refinement.
00295             #                                                        none=all sequences have
00296             #                                                        equal weight.
00297             #                                                        henikoff=Henikoff &
00298             #                                                        Henikoff weighting
00299             #                                                        scheme.
00300             #                                                        henikoffpb=Modified
00301             #                                                        Henikoff scheme as used
00302             #                                                        in PSI-BLAST.
00303             #                                                        clustalw=CLUSTALW
00304             #                                                        method.
00305             #                                                        threeway=Gotoh three-
00306             #                                                        way method.
00307             _Option(["-weight2", "weight2"],
00308                     "Weighting scheme used in iteration 2",
00309                     checker_function=lambda x: x in WEIGHTING_SCHEMES,
00310                     equate=False),
00311             #################### FORMATS #######################################
00312             # Multiple formats can be specified on the command line
00313             # If -msf appears it will be used regardless of other formats
00314             # specified. If -clw appears (and not -msf), clustalw format will be
00315             # used regardless of other formats specified. If both -clw and
00316             # -clwstrict are specified -clwstrict will be used regardless of
00317             # other formats specified. If -fasta is specified and not -msf,
00318             # -clw, or clwstrict, fasta will be used. If -fasta and -html are
00319             # specified -fasta will be used. Only if -html is specified alone
00320             # will html be used. I kid ye not.
00321             #clw                no              Write output in CLUSTALW format (default is
00322             #                                   FASTA).
00323             _Switch(["-clw", "clw"],
00324                     "Write output in CLUSTALW format (with a MUSCLE header)"),
00325             #clwstrict          no              Write output in CLUSTALW format with the
00326             #                                   "CLUSTAL W (1.81)" header rather than the
00327             #                                   MUSCLE version. This is useful when a post-
00328             #                                   processing step is picky about the file
00329             #                                   header.
00330             _Switch(["-clwstrict", "clwstrict"],
00331                     "Write output in CLUSTALW format with version 1.81 header"),
00332             #fasta              yes             Write output in FASTA format. Alternatives
00333             #                                   include clw,
00334             #                                   clwstrict, msf and html.
00335             _Switch(["-fasta", "fasta"],
00336                     "Write output in FASTA format"),
00337             #html               no              Write output in HTML format (default is
00338             #                                   FASTA).
00339             _Switch(["-html", "html"],
00340                     "Write output in HTML format"),
00341             #msf                no              Write output in MSF format (default is
00342             #                                   FASTA).
00343             _Switch(["-msf", "msf"],
00344                     "Write output in MSF format"),
00345             #Phylip interleaved - undocumented as of 3.7
00346             _Switch(["-phyi", "phyi"],
00347                     "Write output in PHYLIP interleaved format"),
00348             #Phylip sequential - undocumented as of 3.7
00349             _Switch(["-phys", "phys"],
00350                     "Write output in PHYLIP sequential format"),
00351             ################## Additional specified output files #########
00352             _Option(["-phyiout", "phyiout"],
00353                     "Write PHYLIP interleaved output to specified filename",
00354                     filename=True,
00355                     equate=False),
00356             _Option(["-physout", "physout"],"Write PHYLIP sequential format to specified filename",
00357                     filename=True,
00358                     equate=False),
00359             _Option(["-htmlout", "htmlout"],"Write HTML output to specified filename",
00360                     filename=True,
00361                     equate=False),
00362             _Option(["-clwout", "clwout"],
00363                     "Write CLUSTALW output (with MUSCLE header) to specified "
00364                     "filename",
00365                     filename=True,
00366                     equate=False),
00367             _Option(["-clwstrictout", "clwstrictout"],
00368                     "Write CLUSTALW output (with version 1.81 header) to "
00369                     "specified filename",
00370                     filename=True,
00371                     equate=False),
00372             _Option(["-msfout", "msfout"],
00373                     "Write MSF format output to specified filename",
00374                     filename=True,
00375                     equate=False),
00376             _Option(["-fastaout", "fastaout"],
00377                     "Write FASTA format output to specified filename",
00378                     filename=True,
00379                     equate=False),
00380             ############## END FORMATS ###################################
00381             #anchors            yes             Use anchor optimization in tree dependent
00382             #                                   refinement iterations.
00383             _Switch(["-anchors", "anchors"],
00384                     "Use anchor optimisation in tree dependent "
00385                     "refinement iterations"),
00386             #noanchors          no              Disable anchor optimization. Default is
00387             #                                   anchors.
00388             _Switch(["-noanchors", "noanchors"],
00389                     "Do not use anchor optimisation in tree dependent "
00390                     "refinement iterations"),
00391             #group              yes             Group similar sequences together in the
00392             #                                   output. This is the default. See also
00393             #                                   stable.
00394             _Switch(["-group", "group"],
00395                     "Group similar sequences in output"),
00396             #stable             no              Preserve input order of sequences in output
00397             #                                   file. Default is to group sequences by
00398             #                                   similarity (group).
00399             _Switch(["-stable", "stable"],
00400                     "Do not group similar sequences in output (not supported in v3.8)"),
00401             ############## log-expectation profile score ######################
00402             # One of either -le, -sp, or -sv
00403             #
00404             # According to the doc, spn is default and the only option for
00405             # nucleotides: this doesnt appear to be true. -le, -sp, and -sv can
00406             # be used and produce numerically different logs (what is going on?)
00407             #
00408             #spn fails on proteins
00409             #le                 maybe           Use log-expectation profile score (VTML240).
00410             #                                    Alternatives are to use sp or sv. This is
00411             #                                    the default for amino acid sequences.
00412             _Switch(["-le", "le"],
00413                     "Use log-expectation profile score (VTML240)"),
00414             #sv                 no              Use sum-of-pairs profile score (VTML240).
00415             #                                   Default is le.
00416             _Switch(["-sv", "sv"],
00417                     "Use sum-of-pairs profile score (VTML240)"),
00418             #sp                 no              Use sum-of-pairs protein profile score
00419             #                                   (PAM200). Default is le.
00420             _Switch(["-sp", "sp"],
00421                     "Use sum-of-pairs protein profile score (PAM200)"),
00422             #spn                maybe           Use sum-of-pairs nucleotide profile score
00423             #                                   (BLASTZ parameters). This is the only option
00424             #                                   for nucleotides, and is therefore the
00425             #                                   default.
00426             _Switch(["-spn", "spn"],
00427                     "Use sum-of-pairs protein nucleotide profile score"),
00428             ############## END log-expectation profile score ######################
00429             #quiet              no              Do not display progress messages.
00430             _Switch(["-quiet", "quiet"],
00431                     "Use sum-of-pairs protein nucleotide profile score"),
00432             #refine             no              Input file is already aligned, skip first
00433             #                                   two iterations and begin tree dependent
00434             #                                   refinement.
00435             _Switch(["-refine", "refine"],
00436                     "Only do tree dependent refinement"),
00437             #core               yes in muscle,  Do not catch exceptions.
00438             #                   no in muscled.
00439             _Switch(["-core", "core"],
00440                     "Catch exceptions"),
00441             #nocore             no in muscle,   Catch exceptions and give an error message
00442             #                   yes in muscled. if possible.
00443             _Switch(["-nocore", "nocore"],
00444                     "Do not catch exceptions"),
00445             #termgapsfull       no              Terminal gaps penalized with full penalty.
00446             #                                   [1] Not fully supported in this version.
00447             #
00448             #termgapshalf       yes             Terminal gaps penalized with half penalty.
00449             #                                   [1] Not fully supported in this version.
00450             #
00451             #termgapshalflonger no              Terminal gaps penalized with half penalty if
00452             #                                   gap relative to
00453             #                                   longer sequence, otherwise with full
00454             #                                   penalty.
00455             #                                   [1] Not fully supported in this version.
00456             #verbose            no              Write parameter settings and progress
00457             #                                   messages to log file.
00458             _Switch(["-verbose", "verbose"],
00459                     "Write parameter settings and progress"),
00460             #version            no              Write version string to stdout and exit.
00461             _Switch(["-version", "version"],
00462                     "Write version string to stdout and exit"),
00463            ]
00464         AbstractCommandline.__init__(self, cmd, **kwargs)
00465 
00466 def _test():
00467     """Run the module's doctests (PRIVATE)."""
00468     print "Runing MUSCLE doctests..."
00469     import doctest
00470     doctest.testmod()
00471     print "Done"
00472 
00473 if __name__ == "__main__":
00474     _test()