Back to index

python-biopython  1.60
Functions | Variables
Bio.Phylo.PAML._parse_codeml Namespace Reference

Functions

def _nan_float
def parse_basics
def parse_nssites
def parse_model
def parse_siteclass_proportions
def parse_siteclass_omegas
def parse_clademodelc
def parse_branch_site_a
def parse_pairwise
def parse_distances

Variables

tuple line_floats_re = re.compile("-*\d+\.\d+")
 _nan_float = float

Function Documentation

def Bio.Phylo.PAML._parse_codeml._nan_float (   text) [private]

Definition at line 15 of file _parse_codeml.py.

00015 
00016     def _nan_float(text):
00017         try:
00018             return float(text)
00019         except ValueError:
00020             if text.lower()=="nan":
00021                 import struct 
00022                 return struct.unpack('d', struct.pack('Q', 0xfff8000000000000))[0]
00023             else:
00024                 raise

def Bio.Phylo.PAML._parse_codeml.parse_basics (   lines,
  results 
)
Parse the basic information that should be present in most codeml output files.

Definition at line 25 of file _parse_codeml.py.

00025 
00026 def parse_basics(lines, results):
00027     """Parse the basic information that should be present in most codeml output files.
00028     """
00029     # multi_models is used to designate there being results for more than one
00030     # model in the file
00031     multi_models = False
00032     multi_genes = False
00033     version_re = re.compile(".+ \(in paml version (\d+\.\d+[a-z]*).*")
00034     model_re = re.compile("Model:\s+(.+)")
00035     num_genes_re = re.compile("\(([0-9]+) genes: separate data\)")
00036     # In codeml 4.1, the codon substitution model is headed by:
00037     # "Codon frequencies:"
00038     # In codeml 4.3+ it is headed by:
00039     # "Codon frequency model:"
00040     codon_freq_re = re.compile("Codon frequenc[a-z\s]{3,7}:\s+(.+)")
00041     siteclass_re = re.compile("Site-class models:\s*([^\s]*)")
00042     for line in lines:
00043         # Find all floating point numbers in this line
00044         line_floats_res = line_floats_re.findall(line)
00045         line_floats = [_nan_float(val) for val in line_floats_res]
00046         # Get the program version number
00047         version_res = version_re.match(line)
00048         if version_res is not None:
00049             results["version"] = version_res.group(1)
00050             continue
00051         # Find the model description at the beginning of the file.
00052         model_res = model_re.match(line)
00053         if model_res is not None:
00054             results["model"] = model_res.group(1)
00055         # Find out if more than one genes are analyzed
00056         num_genes_res = num_genes_re.search(line)
00057         if num_genes_res is not None:
00058             results["genes"] = []
00059             num_genes = int(num_genes_res.group(1))
00060             for n in range(num_genes):
00061                 results["genes"].append({})
00062             multi_genes = True
00063             continue
00064         # Get the codon substitution model
00065         codon_freq_res = codon_freq_re.match(line)
00066         if codon_freq_res is not None:
00067             results["codon model"] = codon_freq_res.group(1)
00068             continue
00069         # Find the site-class model name at the beginning of the file.
00070         # This exists only if a NSsites class other than 0 is used.
00071         # If there's no site class model listed, then there are multiple
00072         # models in the results file
00073         # Example match: "Site-class models:  PositiveSelection"
00074         siteclass_res = siteclass_re.match(line)
00075         if siteclass_res is not None:
00076             siteclass_model = siteclass_res.group(1)
00077             if siteclass_model != "":
00078                 results["site-class model"] = siteclass_model
00079                 multi_models = False
00080             else:
00081                 multi_models = True
00082         # Get the maximum log-likelihood
00083         if "ln Lmax" in line and len(line_floats) > 0:
00084             results["lnL max"] = line_floats[0]
00085     return (results, multi_models, multi_genes)
 
def Bio.Phylo.PAML._parse_codeml.parse_branch_site_a (   foreground,
  line_floats,
  site_classes 
)
Parse results specific to the branch site A model.

Definition at line 377 of file _parse_codeml.py.

00377 
00378 def parse_branch_site_a(foreground, line_floats, site_classes):
00379     """Parse results specific to the branch site A model.
00380     """
00381     if not site_classes or len(line_floats) == 0:
00382         return
00383     for n in range(len(line_floats)):
00384         if site_classes[n].get("branch types") is None:
00385             site_classes[n]["branch types"] = {}
00386         if foreground:
00387             site_classes[n]["branch types"]["foreground"] =\
00388                     line_floats[n]
00389         else:
00390             site_classes[n]["branch types"]["background"] =\
00391                     line_floats[n]
00392     return site_classes

Here is the caller graph for this function:

def Bio.Phylo.PAML._parse_codeml.parse_clademodelc (   branch_type_no,
  line_floats,
  site_classes 
)
Parse results specific to the clade model C.

Definition at line 366 of file _parse_codeml.py.

00366 
00367 def parse_clademodelc(branch_type_no, line_floats, site_classes):
00368     """Parse results specific to the clade model C.
00369     """
00370     if not site_classes or len(line_floats) == 0:
00371         return
00372     for n in range(len(line_floats)):
00373         if site_classes[n].get("branch types") is None:
00374             site_classes[n]["branch types"] = {}
00375         site_classes[n]["branch types"][branch_type_no] = line_floats[n]
00376     return site_classes

Here is the caller graph for this function:

def Bio.Phylo.PAML._parse_codeml.parse_distances (   lines,
  results 
)
Parse amino acid sequence distance results.

Definition at line 432 of file _parse_codeml.py.

00432 
00433 def parse_distances(lines, results):
00434     """Parse amino acid sequence distance results.
00435     """
00436     distances = {}
00437     sequences = []
00438     raw_aa_distances_flag = False
00439     ml_aa_distances_flag = False
00440     matrix_row_re = re.compile("(.+)\s{5,15}")
00441     for line in lines:
00442         # Find all floating point numbers in this line
00443         line_floats_res = line_floats_re.findall(line)
00444         line_floats = [_nan_float(val) for val in line_floats_res]
00445         if "AA distances" in line:
00446             raw_aa_distances_flag = True
00447             # In current versions, the raw distances always come
00448             # first but I don't trust this to always be true
00449             ml_aa_distances_flag = False
00450         elif "ML distances of aa seqs." in line:
00451             ml_aa_distances_flag = True
00452             raw_aa_distances_flag = False
00453         # Parse AA distances (raw or ML), in a lower diagonal matrix
00454         matrix_row_res = matrix_row_re.match(line)
00455         if matrix_row_res and (raw_aa_distances_flag or \
00456                 ml_aa_distances_flag):
00457             seq_name = matrix_row_res.group(1).strip()
00458             if seq_name not in sequences:
00459                 sequences.append(seq_name)
00460             if raw_aa_distances_flag:
00461                 if distances.get("raw") is None:
00462                     distances["raw"] = {}
00463                 distances["raw"][seq_name] = {}
00464                 for i in range(0, len(line_floats)):
00465                     distances["raw"][seq_name][sequences[i]] = line_floats[i]
00466                     distances["raw"][sequences[i]][seq_name] = line_floats[i]
00467             else:
00468                 if distances.get("ml") is None:
00469                     distances["ml"] = {}
00470                 distances["ml"][seq_name] = {}
00471                 for i in range(0, len(line_floats)):
00472                     distances["ml"][seq_name][sequences[i]] = line_floats[i]
00473                     distances["ml"][sequences[i]][seq_name] = line_floats[i]
00474     if len(distances) > 0:
00475         results["distances"] = distances
00476     return results
def Bio.Phylo.PAML._parse_codeml.parse_model (   lines,
  results 
)
Parse an individual NSsites model's results.

Definition at line 162 of file _parse_codeml.py.

00162 
00163 def parse_model(lines, results):
00164     """Parse an individual NSsites model's results.
00165     """
00166     parameters = {}
00167     SEs_flag = False
00168     dS_tree_flag = False
00169     dN_tree_flag = False
00170     w_tree_flag = False
00171     num_params = None
00172     tree_re = re.compile("\(\(+")
00173     branch_re = re.compile("\s+(\d+\.\.\d+)[\s+\d+\.\d+]+")
00174     model_params_re = re.compile("([a-z]\d?)=\s+(\d+\.\d+)")
00175     for line in lines:
00176         # Find all floating point numbers in this line
00177         line_floats_res = line_floats_re.findall(line)
00178         line_floats = [_nan_float(val) for val in line_floats_res]
00179         # Check if branch-specific results are in the line
00180         branch_res = branch_re.match(line)
00181         # Check if additional model parameters are in the line
00182         model_params = model_params_re.findall(line)
00183         # Find lnL values.
00184         # Example match (lnL = -2021.348300):
00185         # "lnL(ntime: 19  np: 22):  -2021.348300      +0.000000"
00186         if "lnL(ntime:" in line and len(line_floats) > 0:
00187             results["lnL"] = line_floats[0]
00188             np_res = re.match("lnL\(ntime:\s+\d+\s+np:\s+(\d+)\)",line)
00189             if np_res is not None:
00190                 num_params = int(np_res.group(1))
00191         # Get parameter list. This can be useful for specifying starting
00192         # parameters in another run by copying the list of parameters
00193         # to a file called in.codeml. Since the parameters must be in
00194         # a fixed order and format, copying and pasting to the file is
00195         # best. For this reason, they are grabbed here just as a long
00196         # string and not as individual numbers.
00197         elif len(line_floats) == num_params and not SEs_flag:
00198             parameters["parameter list"] = line.strip()
00199         # Find SEs. The same format as parameters above is maintained
00200         # since there is a correspondance between the SE format and
00201         # the parameter format.
00202         # Example match:
00203         # "SEs for parameters:
00204         # -1.00000 -1.00000 -1.00000 801727.63247 730462.67590 -1.00000 
00205         elif "SEs for parameters:" in line:
00206             SEs_flag = True
00207         elif SEs_flag and len(line_floats) == num_params:
00208             parameters["SEs"] = line.strip()
00209             SEs_flag = False
00210         # Find tree lengths.
00211         # Example match: "tree length =   1.71931"
00212         elif "tree length =" in line and len(line_floats) > 0:
00213             results["tree length"] = line_floats[0]
00214         # Find the estimated trees only taking the tree if it has
00215         # lengths or rate estimates on the branches
00216         elif tree_re.match(line) is not None:
00217             if ":" in line or "#" in line:
00218                 if dS_tree_flag:
00219                     results["dS tree"] = line.strip()
00220                     dS_tree_flag = False
00221                 elif dN_tree_flag:
00222                     results["dN tree"] = line.strip()
00223                     dN_tree_flag = False
00224                 elif w_tree_flag:
00225                     results["omega tree"] = line.strip()
00226                     w_tree_flag = False
00227                 else:
00228                     results["tree"] = line.strip()
00229         elif "dS tree:" in line:
00230             dS_tree_flag = True
00231         elif "dN tree:" in line:
00232             dN_tree_flag = True
00233         elif "w ratios as labels for TreeView:" in line:
00234                 w_tree_flag = True
00235         # Find rates for multiple genes
00236         # Example match: "rates for 2 genes:     1  2.75551"
00237         elif "rates for" in line and len(line_floats) > 0:
00238             line_floats.insert(0, 1.0)
00239             parameters["rates"] = line_floats
00240         # Find kappa values.
00241         # Example match: "kappa (ts/tv) =  2.77541"
00242         elif "kappa (ts/tv)" in line and len(line_floats) > 0:
00243             parameters["kappa"] = line_floats[0]
00244         # Find omega values.
00245         # Example match: "omega (dN/dS) =  0.25122"
00246         elif "omega (dN/dS)" in line and len(line_floats) > 0:
00247             parameters["omega"] = line_floats[0]
00248         elif "w (dN/dS)" in line and len(line_floats) > 0:
00249             parameters["omega"] = line_floats
00250         # Find omega and kappa values for multi-gene files
00251         # Example match: "gene # 1: kappa =   1.72615 omega =   0.39333"
00252         elif "gene # " in line:
00253             gene_num = int(re.match("gene # (\d+)", line).group(1))
00254             if parameters.get("genes") is None:
00255                 parameters["genes"] = {}
00256             parameters["genes"][gene_num] = {"kappa":line_floats[0],
00257                                             "omega":line_floats[1]}
00258         # Find dN values.
00259         # Example match: "tree length for dN:       0.2990"
00260         elif "tree length for dN" in line and len(line_floats) > 0:
00261             parameters["dN"] = line_floats[0]
00262         # Find dS values
00263         # Example match: "tree length for dS:       1.1901"
00264         elif "tree length for dS" in line and len(line_floats) > 0:
00265             parameters["dS"] = line_floats[0]
00266         # Find site class distributions.
00267         # Example match 1 (normal model, 2 site classes):
00268         # "p:   0.77615  0.22385"
00269         # Example match 2 (branch site A model, 4 site classes):
00270         # "proportion       0.00000  0.00000  0.73921  0.26079"
00271         elif line[0:2] == "p:" or line[0:10] == "proportion":
00272             site_classes = parse_siteclass_proportions(line_floats)
00273             parameters["site classes"] = site_classes
00274         # Find the omega value corresponding to each site class
00275         # Example match (2 site classes): "w:   0.10224  1.00000"
00276         elif line[0:2] == "w:":
00277             site_classes = parameters.get("site classes")
00278             site_classes = parse_siteclass_omegas(line, site_classes)
00279             parameters["site classes"] = site_classes
00280         # Find the omega values corresponding to a branch type from  
00281         # the clade model C for each site class
00282         # Example match:
00283         # "branch type 0:    0.31022   1.00000   0.00000"
00284         elif "branch type " in line:
00285             branch_type = re.match("branch type (\d)", line)
00286             if branch_type:
00287                 site_classes = parameters.get("site classes")
00288                 branch_type_no = int(branch_type.group(1))
00289                 site_classes = parse_clademodelc(branch_type_no, line_floats, 
00290                         site_classes)
00291                 parameters["site classes"] = site_classes
00292         # Find the omega values of the foreground branch for each site
00293         # class in the branch site A model
00294         # Example match:
00295         # "foreground w     0.07992  1.00000 134.54218 134.54218"
00296         elif line[0:12] == "foreground w":
00297             site_classes = parameters.get("site classes")
00298             site_classes = parse_branch_site_a(True, line_floats, site_classes)
00299             parameters["site classes"] = site_classes
00300         # Find the omega values of the background for each site
00301         # class in the branch site A model
00302         # Example match:
00303         # "background w     0.07992  1.00000  0.07992  1.00000"
00304         elif line[0:12] == "background w":
00305             site_classes = parameters.get("site classes")
00306             site_classes = parse_branch_site_a(False, line_floats, site_classes)
00307             parameters["site classes"] = site_classes
00308         # Find dN & dS for each branch, which is organized in a table
00309         # The possibility of NaNs forces me to not use the line_floats
00310         # method.
00311         # Example row (some spaces removed to make it smaller...).
00312         # " 6..7   0.000  167.7  54.3  0.0000  0.0000  0.0000  0.0  0.0"
00313         elif branch_res is not None and len(line_floats) > 0:
00314             branch = branch_res.group(1)
00315             if parameters.get("branches") is None:
00316                 parameters["branches"] = {}
00317             #Hack for Jython http://bugs.jython.org/issue1762 float("-nan")
00318             line = line.replace(" -nan", " nan")
00319             params = line.strip().split()[1:]
00320             parameters["branches"][branch]= {
00321                 "t" : _nan_float(params[0].strip()),
00322                 "N" : _nan_float(params[1].strip()),
00323                 "S" : _nan_float(params[2].strip()),
00324                 "omega" :_nan_float(params[3].strip()),
00325                 "dN" : _nan_float(params[4].strip()),
00326                 "dS" : _nan_float(params[5].strip()),
00327                 "N*dN" : _nan_float(params[6].strip()),
00328                 "S*dS" : _nan_float(params[7].strip())}
00329         # Find model parameters, which can be spread across multiple
00330         # lines.
00331         # Example matches:
00332         # "  p0=  0.99043  p=  0.36657 q=  1.04445
00333         #"  (p1=  0.00957) w=  3.25530"
00334         elif len(model_params) > 0:
00335             float_model_params = []
00336             for param in model_params:
00337                 float_model_params.append((param[0], _nan_float(param[1])))
00338             parameters = dict(parameters.items() + float_model_params)
00339     if len(parameters) > 0:
00340         results["parameters"] = parameters
00341     return results

Here is the call graph for this function:

Here is the caller graph for this function:

def Bio.Phylo.PAML._parse_codeml.parse_nssites (   lines,
  results,
  multi_models,
  multi_genes 
)
Determine which NSsites models are present and parse them.

Definition at line 86 of file _parse_codeml.py.

00086 
00087 def parse_nssites(lines, results, multi_models, multi_genes):
00088     """Determine which NSsites models are present and parse them.
00089     """
00090 
00091     ns_sites = {}
00092     model_re = re.compile("Model (\d+):\s+(.+)")
00093     gene_re = re.compile("Gene\s+([0-9]+)\s+.+")
00094     siteclass_model = results.get("site-class model")
00095     if not multi_models:
00096     # If there's only one model in the results, find out
00097     # which one it is and then parse it. 
00098         if siteclass_model is None:
00099             siteclass_model = "one-ratio"
00100         current_model = {"one-ratio" : 0,
00101                         "NearlyNeutral" : 1,
00102                         "PositiveSelection" : 2,
00103                         "discrete" : 3,
00104                         "beta" : 7,
00105                         "beta&w>1" : 8}[siteclass_model]
00106         if multi_genes:
00107             genes = results["genes"]
00108             current_gene = None
00109             gene_start = None
00110             for line_num, line in enumerate(lines):
00111                 gene_res = gene_re.match(line)
00112                 if gene_res:
00113                     if current_gene is not None:
00114                         parse_model(lines[gene_start:line_num], model_results)
00115                         genes[current_gene-1] = model_results
00116                     gene_start = line_num
00117                     current_gene = int(gene_res.group(1))
00118                     model_results = {"description": siteclass_model}
00119             if len(genes[current_gene-1]) == 0:
00120                 model_results = parse_model(lines[gene_start:], model_results)
00121                 genes[current_gene-1] = model_results
00122         else:
00123             model_results = {"description" : siteclass_model}
00124             model_results = parse_model(lines, model_results)
00125             ns_sites[current_model] = model_results
00126     else:
00127     # If there are multiple models in the results, scan through
00128     # the file and send each model's text to be parsed individually.
00129         current_model = None
00130         model_start = None
00131         for line_num, line in enumerate(lines):
00132             # Find model names. If this is found on a line,
00133             # all of the following lines until the next time this matches
00134             # contain results for Model X.
00135             # Example match: "Model 1: NearlyNeutral (2 categories)"
00136             model_res = model_re.match(line)
00137             if model_res:
00138                 if current_model is not None:
00139                 # We've already been tracking a model, so it's time
00140                 # to send those lines off for parsing before beginning
00141                 # a new one.
00142                     parse_model(lines[model_start:line_num], model_results)
00143                     ns_sites[current_model] = model_results
00144                 model_start = line_num
00145                 current_model = int(model_res.group(1))
00146                 model_results = {"description":model_res.group(2)}
00147         if ns_sites.get(current_model) is None:
00148         # When we reach the end of the file, we'll still have one more
00149         # model to parse.
00150             model_results = parse_model(lines[model_start:], model_results)
00151             ns_sites[current_model] = model_results
00152     # Only add the ns_sites dict to the results if we really have results. 
00153     # Model M0 is added by default in some cases, so if it exists, make sure 
00154     # it's not empty
00155     if len(ns_sites) == 1:
00156         m0 = ns_sites.get(0)
00157         if not m0 or len(m0) > 1:
00158             results["NSsites"] = ns_sites
00159     elif len(ns_sites) > 1:
00160         results["NSsites"] = ns_sites
00161     return results

Here is the call graph for this function:

def Bio.Phylo.PAML._parse_codeml.parse_pairwise (   lines,
  results 
)
Parse results from pairwise comparisons.

Definition at line 393 of file _parse_codeml.py.

00393 
00394 def parse_pairwise(lines, results):
00395     """Parse results from pairwise comparisons.
00396     """
00397     # Find pairwise comparisons
00398     # Example:
00399     # 2 (Pan_troglo) ... 1 (Homo_sapie)
00400     # lnL = -291.465693
00401     #  0.01262 999.00000  0.00100
00402     #
00403     # t= 0.0126  S=    81.4  N=   140.6  dN/dS= 0.0010  dN= 0.0000  dS= 0.0115
00404     pair_re = re.compile("\d+ \((.+)\) ... \d+ \((.+)\)")
00405     pairwise = {}
00406     for line in lines:
00407         # Find all floating point numbers in this line
00408         line_floats_res = line_floats_re.findall(line)
00409         line_floats = [_nan_float(val) for val in line_floats_res]
00410         pair_res = pair_re.match(line)
00411         if pair_res:
00412             seq1 = pair_res.group(1)
00413             seq2 = pair_res.group(2)
00414             if pairwise.get(seq1) is None:
00415                 pairwise[seq1] = {}
00416             if pairwise.get(seq2) is None:
00417                 pairwise[seq2] = {}
00418             if len(line_floats) == 1:
00419                 pairwise[seq1][seq2] = {"lnL" : line_floats[0]}
00420                 pairwise[seq2][seq1] = pairwise[seq1][seq2]
00421             elif len(line_floats) == 6:
00422                 pairwise[seq1][seq2] = {"t" : line_floats[0],
00423                         "S" : line_floats[1],
00424                         "N" : line_floats[2],
00425                         "omega" : line_floats[3],
00426                         "dN" : line_floats[4],
00427                         "dS" : line_floats[5]}
00428                 pairwise[seq2][seq1] = pairwise[seq1][seq2]
00429     if len(pairwise) > 0:
00430         results["pairwise"] = pairwise
00431     return results

def Bio.Phylo.PAML._parse_codeml.parse_siteclass_omegas (   line,
  site_classes 
)
For models which have multiple site classes, find the omega estimated for each class.

Definition at line 351 of file _parse_codeml.py.

00351 
00352 def parse_siteclass_omegas(line, site_classes):
00353     """For models which have multiple site classes, find the omega estimated for each class.
00354     """
00355     # The omega results are tabular with strictly 9 characters per column
00356     # (1 to 3 digits before the  decimal point and 5 after). This causes
00357     # numbers to sometimes run into each other, so we must use a different
00358     # regular expression to account for this. i.e.:
00359     # w:   0.00012  1.00000109.87121
00360     line_floats = re.findall("\d{1,3}\.\d{5}", line)
00361     if not site_classes or len(line_floats) == 0:
00362         return
00363     for n in range(len(line_floats)):
00364         site_classes[n]["omega"] = line_floats[n]
00365     return site_classes

Here is the caller graph for this function:

For models which have multiple site classes, find the proportion of the alignment assigned to each class.

Definition at line 342 of file _parse_codeml.py.

00342 
00343 def parse_siteclass_proportions(line_floats):
00344     """For models which have multiple site classes, find the proportion of the alignment assigned to each class.
00345     """
00346     site_classes = {}
00347     if len(line_floats) > 0:
00348         for n in range(len(line_floats)):
00349             site_classes[n] = {"proportion" : line_floats[n]}
00350     return site_classes

Here is the caller graph for this function:


Variable Documentation

Definition at line 12 of file _parse_codeml.py.

tuple Bio.Phylo.PAML._parse_codeml.line_floats_re = re.compile("-*\d+\.\d+")

Definition at line 8 of file _parse_codeml.py.