Back to index

python-biopython  1.60
_Clustalw.py
Go to the documentation of this file.
00001 # Copyright 2009 by Cymon J. Cox.  All rights reserved.
00002 # This code is part of the Biopython distribution and governed by its
00003 # license.  Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 """Command line wrapper for the multiple alignment program Clustal W.
00006 """
00007 
00008 __docformat__ = "epytext en" #Don't just use plain text in epydoc API pages!
00009 
00010 import os
00011 from Bio.Application import _Option, _Switch, AbstractCommandline
00012 
class ClustalwCommandline(AbstractCommandline):
    """Command line wrapper for clustalw (version one or two).

    http://www.clustal.org/

    Example:

    >>> from Bio.Align.Applications import ClustalwCommandline
    >>> in_file = "unaligned.fasta"
    >>> clustalw_cline = ClustalwCommandline("clustalw2", infile=in_file)
    >>> print(clustalw_cline)
    clustalw2 -infile=unaligned.fasta

    You would typically run the command line with clustalw_cline() or via
    the Python subprocess module, as described in the Biopython tutorial.

    Citation:

    Larkin MA, Blackshields G, Brown NP, Chenna R, McGettigan PA,
    McWilliam H, Valentin F, Wallace IM, Wilm A, Lopez R, Thompson JD,
    Gibson TJ, Higgins DG. (2007). Clustal W and Clustal X version 2.0.
    Bioinformatics, 23, 2947-2948.

    Last checked against versions: 1.83 and 2.0.10
    """
    #TODO - Should we default to cmd="clustalw2" now?
    def __init__(self, cmd="clustalw", **kwargs):
        """Declare the supported ClustalW arguments and initialise the wrapper.

        The list of _Option and _Switch objects is stored in self.parameters
        and then the base class initialiser is called with cmd and kwargs.

        Keyword style values are accepted either all upper case or all
        lower case (e.g. "DNA" or "dna"), mixed case is rejected by the
        checker.  The matrix options also accept the name of an existing
        file, and the usetree options require an existing file.

        @param cmd: Name of (or path to) the ClustalW executable.
        @param kwargs: Initial argument values, passed to the base class.
        """
        def _is_int(value):
            #Whole numbers only (counts, seeds, lengths)
            return isinstance(value, int)

        def _is_number(value):
            #Integer or floating point (penalties, weights, ...)
            return isinstance(value, (int, float))

        def _one_of(*keywords):
            #Build a checker accepting each keyword in upper or lower case
            allowed = keywords + tuple([word.lower() for word in keywords])
            return lambda value: value in allowed

        def _one_of_or_file(*keywords):
            #As _one_of, but an existing file name is also acceptable
            is_keyword = _one_of(*keywords)
            return lambda value: is_keyword(value) or os.path.exists(value)

        self.parameters = \
            [
            _Option(["-infile", "-INFILE", "INFILE", "infile"],
                    "Input sequences.",
                    filename=True),
            _Option(["-profile1", "-PROFILE1", "PROFILE1", "profile1"],
                    "Profiles (old alignment).",
                    filename=True),
            _Option(["-profile2", "-PROFILE2", "PROFILE2", "profile2"],
                    "Profiles (old alignment).",
                    filename=True),
            ################## VERBS (do things) #############################
            _Switch(["-options", "-OPTIONS", "OPTIONS", "options"],
                    "List the command line parameters"),
            _Switch(["-help", "-HELP", "HELP", "help"],
                    "Outline the command line params."),
            _Switch(["-check", "-CHECK", "CHECK", "check"],
                    "Outline the command line params."),
            _Switch(["-fullhelp", "-FULLHELP", "FULLHELP", "fullhelp"],
                    "Output full help content."),
            _Switch(["-align", "-ALIGN", "ALIGN", "align"],
                    "Do full multiple alignment."),
            _Switch(["-tree", "-TREE", "TREE", "tree"],
                    "Calculate NJ tree."),
            _Option(["-bootstrap", "-BOOTSTRAP", "BOOTSTRAP", "bootstrap"],
                    "Bootstrap a NJ tree (n= number of bootstraps; def. = 1000).",
                    checker_function=_is_int),
            _Switch(["-convert", "-CONVERT", "CONVERT", "convert"],
                    "Output the input sequences in a different file format."),
            ##################### PARAMETERS (set things) #########################
            # ***General settings:****
            # The -interactive option ("read command line, then enter normal
            # interactive menus") is deliberately not wrapped, it makes no
            # sense in biopython.
            _Switch(["-quicktree", "-QUICKTREE", "QUICKTREE", "quicktree"],
                    "Use FAST algorithm for the alignment guide tree"),
            _Option(["-type", "-TYPE", "TYPE", "type"],
                    "PROTEIN or DNA sequences",
                    checker_function=_one_of("PROTEIN", "DNA")),
            _Switch(["-negative", "-NEGATIVE", "NEGATIVE", "negative"],
                    "Protein alignment with negative values in matrix"),
            _Option(["-outfile", "-OUTFILE", "OUTFILE", "outfile"],
                    "Output sequence alignment file name",
                    filename=True),
            _Option(["-output", "-OUTPUT", "OUTPUT", "output"],
                    "Output format: GCG, GDE, PHYLIP, PIR or NEXUS",
                    checker_function=_one_of("GCG", "GDE", "PHYLIP",
                                             "PIR", "NEXUS")),
            _Option(["-outorder", "-OUTORDER", "OUTORDER", "outorder"],
                    "Output taxon order: INPUT or ALIGNED",
                    checker_function=_one_of("INPUT", "ALIGNED")),
            _Option(["-case", "-CASE", "CASE", "case"],
                    "LOWER or UPPER (for GDE output only)",
                    checker_function=_one_of("UPPER", "LOWER")),
            _Option(["-seqnos", "-SEQNOS", "SEQNOS", "seqnos"],
                    "OFF or ON (for Clustal output only)",
                    checker_function=_one_of("ON", "OFF")),
            _Option(["-seqno_range", "-SEQNO_RANGE", "SEQNO_RANGE", "seqno_range"],
                    "OFF or ON (NEW- for all output formats)",
                    checker_function=_one_of("ON", "OFF")),
            _Option(["-range", "-RANGE", "RANGE", "range"],
                    "Sequence range to write starting m to m+n. "
                    "Input as string eg. '24,200'"),
            _Option(["-maxseqlen", "-MAXSEQLEN", "MAXSEQLEN", "maxseqlen"],
                    "Maximum allowed input sequence length",
                    checker_function=_is_int),
            _Switch(["-quiet", "-QUIET", "QUIET", "quiet"],
                    "Reduce console output to minimum"),
            # NOTE(review): ClustalW's own help appears to list this as
            # "-STATS=" taking a file name - confirm whether a switch is
            # really the right wrapper before changing it.
            _Switch(["-stats", "-STATS", "STATS", "stats"],
                    "Log some alignments statistics to file"),
            # ***Fast Pairwise Alignments:***
            _Option(["-ktuple", "-KTUPLE", "KTUPLE", "ktuple"],
                    "Word size",
                    checker_function=_is_number),
            _Option(["-topdiags", "-TOPDIAGS", "TOPDIAGS", "topdiags"],
                    "Number of best diags.",
                    checker_function=_is_number),
            _Option(["-window", "-WINDOW", "WINDOW", "window"],
                    "Window around best diags.",
                    checker_function=_is_number),
            _Option(["-pairgap", "-PAIRGAP", "PAIRGAP", "pairgap"],
                    "Gap penalty",
                    checker_function=_is_number),
            _Option(["-score", "-SCORE", "SCORE", "score"],
                    "Either: PERCENT or ABSOLUTE",
                    checker_function=_one_of("PERCENT", "ABSOLUTE")),
            # ***Slow Pairwise Alignments:***
            _Option(["-pwmatrix", "-PWMATRIX", "PWMATRIX", "pwmatrix"],
                    "Protein weight matrix=BLOSUM, PAM, GONNET, ID or filename",
                    checker_function=_one_of_or_file("BLOSUM", "PAM",
                                                     "GONNET", "ID"),
                    filename=True),
            _Option(["-pwdnamatrix", "-PWDNAMATRIX", "PWDNAMATRIX", "pwdnamatrix"],
                    "DNA weight matrix=IUB, CLUSTALW or filename",
                    checker_function=_one_of_or_file("IUB", "CLUSTALW"),
                    filename=True),
            _Option(["-pwgapopen", "-PWGAPOPEN", "PWGAPOPEN", "pwgapopen"],
                    "Gap opening penalty",
                    checker_function=_is_number),
            _Option(["-pwgapext", "-PWGAPEXT", "PWGAPEXT", "pwgapext"],
                    "Gap extension penalty",
                    checker_function=_is_number),
            # ***Multiple Alignments:***
            _Option(["-newtree", "-NEWTREE", "NEWTREE", "newtree"],
                    "Output file name for newly created guide tree",
                    filename=True),
            # The guide tree is an input, so the file must already exist.
            # (Pass os.path.exists itself; wrapping it in a lambda that
            # merely returns the function would accept any value.)
            _Option(["-usetree", "-USETREE", "USETREE", "usetree"],
                    "File name of guide tree",
                    checker_function=os.path.exists,
                    filename=True),
            _Option(["-matrix", "-MATRIX", "MATRIX", "matrix"],
                    "Protein weight matrix=BLOSUM, PAM, GONNET, ID or filename",
                    checker_function=_one_of_or_file("BLOSUM", "PAM",
                                                     "GONNET", "ID"),
                    filename=True),
            _Option(["-dnamatrix", "-DNAMATRIX", "DNAMATRIX", "dnamatrix"],
                    "DNA weight matrix=IUB, CLUSTALW or filename",
                    checker_function=_one_of_or_file("IUB", "CLUSTALW"),
                    filename=True),
            _Option(["-gapopen", "-GAPOPEN", "GAPOPEN", "gapopen"],
                    "Gap opening penalty",
                    checker_function=_is_number),
            _Option(["-gapext", "-GAPEXT", "GAPEXT", "gapext"],
                    "Gap extension penalty",
                    checker_function=_is_number),
            _Switch(["-endgaps", "-ENDGAPS", "ENDGAPS", "endgaps"],
                    "No end gap separation pen."),
            _Option(["-gapdist", "-GAPDIST", "GAPDIST", "gapdist"],
                    "Gap separation pen. range",
                    checker_function=_is_number),
            _Switch(["-nopgap", "-NOPGAP", "NOPGAP", "nopgap"],
                    "Residue-specific gaps off"),
            _Switch(["-nohgap", "-NOHGAP", "NOHGAP", "nohgap"],
                    "Hydrophilic gaps off"),
            _Switch(["-hgapresidues", "-HGAPRESIDUES", "HGAPRESIDUES", "hgapresidues"],
                    "List hydrophilic res."),
            _Option(["-maxdiv", "-MAXDIV", "MAXDIV", "maxdiv"],
                    "% ident. for delay",
                    checker_function=_is_number),
            _Option(["-transweight", "-TRANSWEIGHT", "TRANSWEIGHT", "transweight"],
                    "Transitions weighting",
                    checker_function=_is_number),
            _Option(["-iteration", "-ITERATION", "ITERATION", "iteration"],
                    "NONE or TREE or ALIGNMENT",
                    checker_function=_one_of("NONE", "TREE", "ALIGNMENT")),
            _Option(["-numiter", "-NUMITER", "NUMITER", "numiter"],
                    "maximum number of iterations to perform",
                    checker_function=_is_int),
            _Switch(["-noweights", "-NOWEIGHTS", "NOWEIGHTS", "noweights"],
                    "Disable sequence weighting"),
            # ***Profile Alignments:***
            _Switch(["-profile", "-PROFILE", "PROFILE", "profile"],
                    "Merge two alignments by profile alignment"),
            _Option(["-newtree1", "-NEWTREE1", "NEWTREE1", "newtree1"],
                    "Output file name for new guide tree of profile1",
                    filename=True),
            _Option(["-newtree2", "-NEWTREE2", "NEWTREE2", "newtree2"],
                    "Output file for new guide tree of profile2",
                    filename=True),
            _Option(["-usetree1", "-USETREE1", "USETREE1", "usetree1"],
                    "File name of guide tree for profile1",
                    checker_function=os.path.exists,
                    filename=True),
            _Option(["-usetree2", "-USETREE2", "USETREE2", "usetree2"],
                    "File name of guide tree for profile2",
                    checker_function=os.path.exists,
                    filename=True),
            # ***Sequence to Profile Alignments:***
            _Switch(["-sequences", "-SEQUENCES", "SEQUENCES", "sequences"],
                    "Sequentially add profile2 sequences to profile1 alignment"),
            _Switch(["-nosecstr1", "-NOSECSTR1", "NOSECSTR1", "nosecstr1"],
                    "Do not use secondary structure-gap penalty mask for profile 1"),
            _Switch(["-nosecstr2", "-NOSECSTR2", "NOSECSTR2", "nosecstr2"],
                    "Do not use secondary structure-gap penalty mask for profile 2"),
            # ***Structure Alignments:***
            _Option(["-secstrout", "-SECSTROUT", "SECSTROUT", "secstrout"],
                    "STRUCTURE or MASK or BOTH or NONE output in alignment file",
                    checker_function=_one_of("STRUCTURE", "MASK",
                                             "BOTH", "NONE")),
            _Option(["-helixgap", "-HELIXGAP", "HELIXGAP", "helixgap"],
                    "Gap penalty for helix core residues",
                    checker_function=_is_number),
            _Option(["-strandgap", "-STRANDGAP", "STRANDGAP", "strandgap"],
                    "gap penalty for strand core residues",
                    checker_function=_is_number),
            _Option(["-loopgap", "-LOOPGAP", "LOOPGAP", "loopgap"],
                    "Gap penalty for loop regions",
                    checker_function=_is_number),
            _Option(["-terminalgap", "-TERMINALGAP", "TERMINALGAP", "terminalgap"],
                    "Gap penalty for structure termini",
                    checker_function=_is_number),
            _Option(["-helixendin", "-HELIXENDIN", "HELIXENDIN", "helixendin"],
                    "Number of residues inside helix to be treated as terminal",
                    checker_function=_is_int),
            _Option(["-helixendout", "-HELIXENDOUT", "HELIXENDOUT", "helixendout"],
                    "Number of residues outside helix to be treated as terminal",
                    checker_function=_is_int),
            _Option(["-strandendin", "-STRANDENDIN", "STRANDENDIN", "strandendin"],
                    "Number of residues inside strand to be treated as terminal",
                    checker_function=_is_int),
            _Option(["-strandendout", "-STRANDENDOUT", "STRANDENDOUT", "strandendout"],
                    "number of residues outside strand to be treated as terminal",
                    checker_function=_is_int),
            # ***Trees:***
            _Option(["-outputtree", "-OUTPUTTREE", "OUTPUTTREE", "outputtree"],
                    "nj OR phylip OR dist OR nexus",
                    checker_function=_one_of("NJ", "PHYLIP",
                                             "DIST", "NEXUS")),
            _Option(["-seed", "-SEED", "SEED", "seed"],
                    "Seed number for bootstraps.",
                    checker_function=_is_int),
            _Switch(["-kimura", "-KIMURA", "KIMURA", "kimura"],
                    "Use Kimura's correction."),
            _Switch(["-tossgaps", "-TOSSGAPS", "TOSSGAPS", "tossgaps"],
                    "Ignore positions with gaps."),
            _Option(["-bootlabels", "-BOOTLABELS", "BOOTLABELS", "bootlabels"],
                    "Node OR branch position of bootstrap values in tree display",
                    checker_function=_one_of("NODE", "BRANCH")),
            _Option(["-clustering", "-CLUSTERING", "CLUSTERING", "clustering"],
                    "NJ or UPGMA",
                    checker_function=_one_of("NJ", "UPGMA"))
            ]
        AbstractCommandline.__init__(self, cmd, **kwargs)
00308 
def _test():
    """Run the module's doctests (PRIVATE).

    Prints a short status line before and after calling doctest.testmod().
    """
    import doctest
    # Single-argument print(...) gives the same output under Python 2
    # (parenthesised expression) and also parses under Python 3.
    print("Running ClustalW doctests...")
    doctest.testmod()
    print("Done")

if __name__ == "__main__":
    # Running this file as a script executes the doctests.
    _test()