Back to index

python-biopython  1.60
_parse_baseml.py
Go to the documentation of this file.
00001 # Copyright (C) 2011 by Brandon Invergo (b.invergo@gmail.com)
00002 # This code is part of the Biopython distribution and governed by its
00003 # license. Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 
00006 import re
00007 
00008 line_floats_re = re.compile("-*\d+\.\d+")
00009 
00010 def parse_basics(lines, results):
00011     """Parse the basics that should be present in most baseml results files.
00012     """
00013     version_re = re.compile("BASEML \(in paml version (\d+\.\d+[a-z]*).*")
00014     np_re = re.compile("lnL\(ntime:\s+\d+\s+np:\s+(\d+)\)")
00015     num_params = -1
00016     for line in lines:
00017         # Find all floating point numbers in this line
00018         line_floats_res = line_floats_re.findall(line)
00019         line_floats = [float(val) for val in line_floats_res]
00020         # Find the version number
00021         # Example match: 
00022         # "BASEML (in paml version 4.3, August 2009)  alignment.phylip"
00023         version_res = version_re.match(line)
00024         if version_res is not None:
00025             results["version"] = version_res.group(1)
00026         # Find max lnL
00027         # Example match:
00028         # ln Lmax (unconstrained) = -316.049385
00029         if "ln Lmax" in line and len(line_floats) == 1:
00030             results["lnL max"] = line_floats[0]
00031         # Find lnL values.
00032         # Example match (lnL = -2021.348300):
00033         # "lnL(ntime: 19  np: 22):  -2021.348300      +0.000000"
00034         elif "lnL(ntime:" in line and len(line_floats) > 0:
00035             results["lnL"] = line_floats[0]
00036             np_res = np_re.match(line)
00037             if np_res is not None:
00038                 num_params = int(np_res.group(1))
00039         # Find tree lengths.
00040         # Example match: "tree length =   1.71931"
00041         elif "tree length" in line and len(line_floats) == 1:
00042             results["tree length"] = line_floats[0]
00043         # Find the estimated tree, only taking the tree if it has
00044         # branch lengths
00045         elif re.match("\(+", line) is not None:
00046             if ":" in line:
00047                 results["tree"] = line.strip()
00048     return (results, num_params)
00049 
00050 def parse_parameters(lines, results, num_params): 
00051     """Parse the various parameters from the file.
00052     """
00053     parameters = {}
00054     parameters = parse_parameter_list(lines, parameters, num_params)
00055     parameters = parse_kappas(lines, parameters)
00056     parameters = parse_rates(lines, parameters)
00057     parameters = parse_freqs(lines, parameters)
00058     results["parameters"] = parameters
00059     return results
00060 
00061 def parse_parameter_list(lines, parameters, num_params):
00062     """ Parse the parameters list, which is just an unlabeled list of numeric values.
00063     """
00064     for line_num in range(len(lines)):
00065         line = lines[line_num]
00066          # Find all floating point numbers in this line
00067         line_floats_res = line_floats_re.findall(line)
00068         line_floats = [float(val) for val in line_floats_res]
00069         # Get parameter list. This can be useful for specifying starting
00070         # parameters in another run by copying the list of parameters
00071         # to a file called in.baseml. Since the parameters must be in
00072         # a fixed order and format, copying and pasting to the file is
00073         # best. For this reason, they are grabbed here just as a long
00074         # string and not as individual numbers.
00075         if len(line_floats) == num_params:
00076            parameters["parameter list"] = line.strip()
00077         # Find SEs. The same format as parameters above is maintained
00078         # since there is a correspondance between the SE format and
00079         # the parameter format.
00080         # Example match:
00081         # "SEs for parameters:
00082         # -1.00000 -1.00000 -1.00000 801727.63247 730462.67590 -1.00000 
00083            if "SEs for parameters:" in lines[line_num + 1]:
00084                 SEs_line = lines[line_num + 2]
00085                 parameters["SEs"] = SEs_line.strip()
00086            break
00087     return parameters
00088 
00089 def parse_kappas(lines, parameters):
00090     """Parse out the kappa parameters.
00091     """
00092     kappa_found = False
00093     for line in lines:
00094         # Find all floating point numbers in this line
00095         line_floats_res = line_floats_re.findall(line)
00096         line_floats = [float(val) for val in line_floats_res]
00097         # Find kappa parameter (F84, HKY85, T92 model)
00098         # Example match:
00099         # "Parameters (kappa) in the rate matrix (F84) (Yang 1994 J Mol Evol 39:105-111):
00100         #    3.00749"
00101         if "Parameters (kappa)" in line:
00102             kappa_found = True
00103         elif kappa_found and len(line_floats) > 0:
00104             branch_res = re.match("\s(\d+\.\.\d+)", line)
00105             if branch_res is None:
00106                 if len(line_floats) == 1:
00107                     parameters["kappa"] = line_floats[0]
00108                 else:
00109                     parameters["kappa"] = line_floats
00110                 kappa_found = False
00111             else:
00112                 if parameters.get("branches") is None:
00113                     parameters["branches"] = {}
00114                 branch = branch_res.group(1)
00115                 if len(line_floats) > 0:
00116                     parameters["branches"][branch] = \
00117                         {"t":line_floats[0], "kappa":line_floats[1],
00118                         "TS":line_floats[2], "TV":line_floats[3]}
00119         # Find kappa under REV
00120         # Example match:
00121         # kappa under REV: 999.00000 145.76453  0.00001  0.00001  0.00001
00122         elif "kappa under" in line and len(line_floats) > 0:
00123             if len(line_floats) == 1:
00124                 parameters["kappa"] = line_floats[0]
00125             else:
00126                 parameters["kappa"] = line_floats
00127     return parameters
00128 
00129 def parse_rates(lines, parameters):
00130     """Parse the rate parameters.
00131     """
00132     Q_mat_found = False
00133     trans_probs_found = False
00134     for line in lines:
00135         # Find all floating point numbers in this line
00136         line_floats_res = line_floats_re.findall(line)
00137         line_floats = [float(val) for val in line_floats_res]
00138         # Find rate parameters
00139         # Example match: 
00140         # "Rate parameters:   999.00000 145.59775  0.00001  0.00001  0.00001"
00141         if "Rate parameters:" in line and len(line_floats) > 0:
00142             parameters["rate parameters"] = line_floats
00143         # Find rates
00144         # Example match: 
00145         # "rate:   0.90121  0.96051  0.99831  1.03711  1.10287"
00146         elif "rate: " in line and len(line_floats) > 0:
00147             parameters["rates"] = line_floats
00148         # Find Rate matrix Q & average kappa (REV model)
00149         # Example match:
00150         # Rate matrix Q, Average Ts/Tv =   3.0308
00151         #  -2.483179    1.865730    0.617449    0.000000
00152         #   2.298662   -2.298662    0.000000    0.000000
00153         #   0.335015    0.000000   -0.338059    0.003044
00154         #   0.000000    0.000000    0.004241   -0.004241
00155         elif "matrix Q" in line:
00156             parameters["Q matrix"] = {"matrix":[]}
00157             if len(line_floats) > 0:
00158                 parameters["Q matrix"]["average Ts/Tv"] = \
00159                     line_floats[0]
00160             Q_mat_found = True
00161         elif Q_mat_found and len(line_floats) > 0:
00162             parameters["Q matrix"]["matrix"].append(line_floats)
00163             if len(parameters["Q matrix"]["matrix"]) == 4:
00164                 Q_mat_found = False
00165         # Find alpha (gamma shape parameter for variable rates)
00166         # Example match: "alpha (gamma, K=5) = 192.47918"
00167         elif "alpha" in line and len(line_floats) > 0:
00168             parameters["alpha"] = line_floats[0]
00169         # Find rho for auto-discrete-gamma model
00170         elif "rho" in line and len(line_floats) > 0:
00171             parameters["rho"] = line_floats[0]
00172         elif "transition probabilities" in line:
00173             parameters["transition probs."] = []
00174             trans_probs_found = True
00175         elif trans_probs_found and len(line_floats) > 0:
00176             parameters["transition probs."].append(line_floats)
00177             if len(parameters["transition probs."]) == len(parameters["rates"]):
00178                 trans_probs_found = False
00179     return parameters
00180 
00181 def parse_freqs(lines, parameters):
00182     """Parse the basepair frequencies.
00183     """
00184     root_re = re.compile("Note: node (\d+) is root.")
00185     branch_freqs_found = False
00186     base_freqs_found = False
00187     for line in lines:
00188         # Find all floating point numbers in this line
00189         line_floats_res = line_floats_re.findall(line)
00190         line_floats = [float(val) for val in line_floats_res]
00191         # Find base frequencies from baseml 4.3
00192         # Example match:
00193         # "Base frequencies:   0.20090  0.16306  0.37027  0.26577"  
00194         if "Base frequencies" in line and len(line_floats) > 0:
00195             base_frequencies = {}
00196             base_frequencies["T"] = line_floats[0]
00197             base_frequencies["C"] = line_floats[1]
00198             base_frequencies["A"] = line_floats[2]
00199             base_frequencies["G"] = line_floats[3]
00200             parameters["base frequencies"] = base_frequencies
00201         # Find base frequencies from baseml 4.1:
00202         # Example match:
00203         # "base frequency parameters
00204         # "  0.20317  0.16768  0.36813  0.26102"
00205         elif "base frequency parameters" in line:
00206             base_freqs_found = True
00207         # baseml 4.4 returns to having the base frequencies on the next line
00208         # but the heading changed
00209         elif "Base frequencies" in line and len(line_floats) == 0:
00210             base_freqs_found = True
00211         elif base_freqs_found and len(line_floats) > 0:
00212             base_frequencies = {}
00213             base_frequencies["T"] = line_floats[0]
00214             base_frequencies["C"] = line_floats[1]
00215             base_frequencies["A"] = line_floats[2]
00216             base_frequencies["G"] = line_floats[3]
00217             parameters["base frequencies"] = base_frequencies
00218             base_freqs_found = False            
00219         # Find frequencies
00220         # Example match: 
00221         # "freq:   0.90121  0.96051  0.99831  1.03711  1.10287"
00222         elif "freq: " in line and len(line_floats) > 0:
00223             parameters["rate frequencies"] = line_floats
00224         # Find branch-specific frequency parameters
00225         # Example match (note: I think it's possible to have 4 more
00226         # values per line, enclosed in brackets, so I'll account for 
00227         # this):
00228         # (frequency parameters for branches)  [frequencies at nodes] (see Yang & Roberts 1995 fig 1)
00229         #
00230         # Node #1  ( 0.25824  0.24176  0.25824  0.24176 )
00231         # Node #2  ( 0.00000  0.50000  0.00000  0.50000 )
00232         elif "(frequency parameters for branches)" in line:
00233             parameters["nodes"] = {}
00234             branch_freqs_found = True
00235         elif branch_freqs_found is True:
00236             if len(line_floats) > 0:
00237                 node_res = re.match("Node \#(\d+)", line)
00238                 node_num = int(node_res.group(1))
00239                 node = {"root":False}
00240                 node["frequency parameters"] = line_floats[:4]
00241                 if len(line_floats) > 4:
00242                     node["base frequencies"] = {"T":line_floats[4],
00243                                                 "C":line_floats[5],
00244                                                 "A":line_floats[6],
00245                                                 "G":line_floats[7]}
00246                 parameters["nodes"][node_num] = node
00247             else:
00248                 root_res = root_re.match(line)
00249                 if root_res is not None:
00250                     root_node = int(root_res.group(1))
00251                     parameters["nodes"][root_node]["root"] =\
00252                         True
00253                     branch_freqs_found = False
00254     return parameters