Back to index

python-biopython  1.60
_Phyml.py
Go to the documentation of this file.
00001 # Copyright 2011 by Eric Talevich.  All rights reserved.
00002 # This code is part of the Biopython distribution and governed by its license.
00003 # Please see the LICENSE file that should have been included as part of this
00004 # package.
00005 """Command-line wrapper for the tree inference program PhyML."""
00006 __docformat__ = "restructuredtext en"
00007 
00008 from Bio.Application import _Option, _Switch, AbstractCommandline
00009 
00010 
class PhymlCommandline(AbstractCommandline):
    """Command-line wrapper for the tree inference program PhyML.

    Homepage: http://www.atgc-montpellier.fr/phyml

    Citations:

    Guindon S, Gascuel O.
    A simple, fast, and accurate algorithm to estimate large phylogenies by maximum
    likelihood.
    Systematic Biology, 2003 Oct;52(5):696-704.
    PubMed PMID: 14530136.

    Guindon S, Dufayard JF, Lefort V, Anisimova M, Hordijk W, Gascuel O.
    New Algorithms and Methods to Estimate Maximum-Likelihood Phylogenies: Assessing
    the Performance of PhyML 3.0.
    Systematic Biology, 2010 59(3):307-21.

    """

    def __init__(self, cmd='phyml', **kwargs):
        """Declare the supported PhyML options, then initialise the base class.

        Arguments:
         - cmd: name (or path) of the PhyML executable; defaults to 'phyml'.
         - kwargs: passed through unchanged to AbstractCommandline.__init__
           (presumably option values keyed by the last name in each option's
           name list -- confirm against Bio.Application).
        """
        # ``basestring`` exists only on Python 2.  Resolve the string type once
        # so the checkers below do not raise NameError on Python 3.
        try:
            string_types = basestring
        except NameError:
            string_types = str

        named_models = (
            # Nucleotide models:
            'HKY85', 'JC69', 'K80', 'F81', 'F84', 'TN93', 'GTR',
            # Amino acid models:
            'LG', 'WAG', 'JTT', 'MtREV', 'Dayhoff', 'DCMut',
            'RtREV', 'CpREV', 'VT', 'Blosum62', 'MtMam', 'MtArt',
            'HIVw', 'HIVb')

        def is_valid_model(x):
            """Return True for a named model, an int, or a six-digit code."""
            # Integers are still accepted for backward compatibility.
            if x in named_models or isinstance(x, int):
                return True
            # Custom nucleotide models are documented as a string of six
            # digits (e.g. '012345' for GTR).  They have to be accepted as
            # strings because an integer cannot keep a leading zero.
            return (isinstance(x, string_types)
                    and len(x) == 6 and x.isdigit())

        # The option list must be in place before the base class initialiser
        # is called at the end of this method.
        self.parameters = [
            _Option(['-i', '--input', 'input'],
                """Name of the nucleotide or amino-acid sequence file in PHYLIP
                format.""",
                filename=True,
                is_required=True,
                equate=False,
                ),

            _Option(['-d', '--datatype', 'datatype'],
                """Data type is 'nt' for nucleotide (default) and 'aa' for
                amino-acid sequences.""",
                checker_function=lambda x: x in ('nt', 'aa'),
                equate=False,
                ),

            _Switch(['-q', '--sequential', 'sequential'],
                "Changes interleaved format (default) to sequential format."
                ),

            _Option(['-n', '--multiple', 'multiple'],
                "Number of data sets to analyse (integer).",
                checker_function=(lambda x:
                    isinstance(x, int) or x.isdigit()),
                equate=False,
                ),

            _Switch(['-p', '--pars', 'pars'],
                """Use a minimum parsimony starting tree.
                
                This option is taken into account when the '-u' option is absent
                and when tree topology modifications are to be done.
                """
                ),

            _Option(['-b', '--bootstrap', 'bootstrap'],
                """Number of bootstrap replicates, if value is > 0.

                Otherwise: 

                 0: neither approximate likelihood ratio test nor bootstrap
                    values are computed.
                -1: approximate likelihood ratio test returning aLRT statistics.
                -2: approximate likelihood ratio test returning Chi2-based
                    parametric branch supports.
                -4: SH-like branch supports alone.
                """,
                equate=False,
                ),

            _Option(['-m', '--model', 'model'],
                """Substitution model name.

                Nucleotide-based models:

                HKY85 (default) | JC69 | K80 | F81 | F84 | TN93 | GTR | custom

                For the custom option, a string of six digits identifies the
                model. For instance, 000000 corresponds to F81 (or JC69,
                provided the distribution of nucleotide frequencies is uniform).
                012345 corresponds to GTR. This option can be used for encoding
                any model that is a nested within GTR.

                Amino-acid based models:

                LG (default) | WAG | JTT | MtREV | Dayhoff | DCMut | RtREV |
                CpREV | VT | Blosum62 | MtMam | MtArt | HIVw | HIVb | custom
                """,
                checker_function=is_valid_model,
                equate=False,
                ),

            _Option(['-f', 'frequencies'],
                """Character frequencies.

                -f e, m, or "fA fC fG fT"

                e : Empirical frequencies, determined as follows : 

                    - Nucleotide sequences: (Empirical) the equilibrium base
                      frequencies are estimated by counting the occurence of the
                      different bases in the alignment.
                    - Amino-acid sequences: (Empirical) the equilibrium
                      amino-acid frequencies are estimated by counting the
                      occurence of the different amino-acids in the alignment.

                m : ML/model-based frequencies, determined as follows : 

                    - Nucleotide sequences: (ML) the equilibrium base
                      frequencies are estimated using maximum likelihood 
                    - Amino-acid sequences: (Model) the equilibrium amino-acid
                      frequencies are estimated using the frequencies defined by
                      the substitution model.

                "fA fC fG fT" : only valid for nucleotide-based models.
                    fA, fC, fG and fT are floating-point numbers that correspond
                    to the frequencies of A, C, G and T, respectively.
                """,
                filename=True, # ensure ".25 .25 .25 .25" stays quoted
                equate=False,
                ),

            _Option(['-t', '--ts/tv', 'ts_tv_ratio'],
                """Transition/transversion ratio. (DNA sequences only.)

                Can be a fixed positive value (ex:4.0) or e to get the
                maximum-likelihood estimate.
                """,
                equate=False,
                ),

            _Option(['-v', '--pinv', 'prop_invar'],
                """Proportion of invariable sites.

                Can be a fixed value in the range [0,1], or 'e' to get the
                maximum-likelihood estimate.
                """,
                equate=False,
                ),

            _Option(['-c', '--nclasses', 'nclasses'],
                """Number of relative substitution rate categories.

                Default 1. Must be a positive integer.
                """,
                equate=False,
                ),

            _Option(['-a', '--alpha', 'alpha'],
                """Distribution of the gamma distribution shape parameter.

                Can be a fixed positive value, or 'e' to get the
                maximum-likelihood estimate.
                """,
                equate=False,
                ),

            _Option(['-s', '--search', 'search'],
                """Tree topology search operation option.

                Can be one of:

                    NNI : default, fast
                    SPR : a bit slower than NNI
                    BEST : best of NNI and SPR search
                """,
                checker_function=lambda x: x in ('NNI', 'SPR', 'BEST'),
                equate=False,
                ),

            # alt name: user_tree_file
            _Option(['-u', '--inputtree', 'input_tree'],
                "Starting tree filename. The tree must be in Newick format.",
                filename=True,
                equate=False,
                ),

            _Option(['-o', 'optimize'],
                """Specific parameter optimisation.

                tlr : tree topology (t), branch length (l) and
                      rate parameters (r) are optimised.
                tl  : tree topology and branch length are optimised.
                lr  : branch length and rate parameters are optimised. 
                l   : branch length are optimised.
                r   : rate parameters are optimised.
                n   : no parameter is optimised.
                """,
                equate=False,
                ),

            _Switch(['--rand_start', 'rand_start'],
                """Sets the initial tree to random.

                Only valid if SPR searches are to be performed.
                """,
                ),

            _Option(['--n_rand_starts', 'n_rand_starts'],
                """Number of initial random trees to be used.

                Only valid if SPR searches are to be performed.
                """,
                equate=False,
                ),

            _Option(['--r_seed', 'r_seed'],
                """Seed used to initiate the random number generator.

                Must be an integer.
                """,
                equate=False,
                ),

            _Switch(['--print_site_lnl', 'print_site_lnl'],
                "Print the likelihood for each site in file *_phyml_lk.txt."
                ),

            _Switch(['--print_trace', 'print_trace'],
                """Print each phylogeny explored during the tree search process
                in file *_phyml_trace.txt."""
                ),

            _Option(['--run_id', 'run_id'],
                """Append the given string at the end of each PhyML output file.

                This option may be useful when running simulations involving
                PhyML.
                """,
                checker_function=lambda x: isinstance(x, string_types),
                equate=False,
                ),

            # XXX should this always be set to True?
            _Switch(['--quiet', 'quiet'],
                "No interactive questions (for running in batch mode)."
                ),
                ]
        AbstractCommandline.__init__(self, cmd, **kwargs)
00258