Back to index

python-biopython  1.60
_parse_codeml.py
Go to the documentation of this file.
00001 # Copyright (C) 2011 by Brandon Invergo (b.invergo@gmail.com)
00002 # This code is part of the Biopython distribution and governed by its
00003 # license. Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 
00006 import re
00007 
00008 line_floats_re = re.compile("-*\d+\.\d+")
00009 
00010 try:
00011     float("nan")
00012     _nan_float = float
00013 except ValueError:
00014     #Happens prior to Python 2.6 depending on C library, e.g. breaks on WinXP
00015     def _nan_float(text):
00016         try:
00017             return float(text)
00018         except ValueError:
00019             if text.lower()=="nan":
00020                 import struct 
00021                 return struct.unpack('d', struct.pack('Q', 0xfff8000000000000))[0]
00022             else:
00023                 raise
00024 
00025 def parse_basics(lines, results):
00026     """Parse the basic information that should be present in most codeml output files.
00027     """
00028     # multi_models is used to designate there being results for more than one
00029     # model in the file
00030     multi_models = False
00031     multi_genes = False
00032     version_re = re.compile(".+ \(in paml version (\d+\.\d+[a-z]*).*")
00033     model_re = re.compile("Model:\s+(.+)")
00034     num_genes_re = re.compile("\(([0-9]+) genes: separate data\)")
00035     # In codeml 4.1, the codon substitution model is headed by:
00036     # "Codon frequencies:"
00037     # In codeml 4.3+ it is headed by:
00038     # "Codon frequency model:"
00039     codon_freq_re = re.compile("Codon frequenc[a-z\s]{3,7}:\s+(.+)")
00040     siteclass_re = re.compile("Site-class models:\s*([^\s]*)")
00041     for line in lines:
00042         # Find all floating point numbers in this line
00043         line_floats_res = line_floats_re.findall(line)
00044         line_floats = [_nan_float(val) for val in line_floats_res]
00045         # Get the program version number
00046         version_res = version_re.match(line)
00047         if version_res is not None:
00048             results["version"] = version_res.group(1)
00049             continue
00050         # Find the model description at the beginning of the file.
00051         model_res = model_re.match(line)
00052         if model_res is not None:
00053             results["model"] = model_res.group(1)
00054         # Find out if more than one genes are analyzed
00055         num_genes_res = num_genes_re.search(line)
00056         if num_genes_res is not None:
00057             results["genes"] = []
00058             num_genes = int(num_genes_res.group(1))
00059             for n in range(num_genes):
00060                 results["genes"].append({})
00061             multi_genes = True
00062             continue
00063         # Get the codon substitution model
00064         codon_freq_res = codon_freq_re.match(line)
00065         if codon_freq_res is not None:
00066             results["codon model"] = codon_freq_res.group(1)
00067             continue
00068         # Find the site-class model name at the beginning of the file.
00069         # This exists only if a NSsites class other than 0 is used.
00070         # If there's no site class model listed, then there are multiple
00071         # models in the results file
00072         # Example match: "Site-class models:  PositiveSelection"
00073         siteclass_res = siteclass_re.match(line)
00074         if siteclass_res is not None:
00075             siteclass_model = siteclass_res.group(1)
00076             if siteclass_model != "":
00077                 results["site-class model"] = siteclass_model
00078                 multi_models = False
00079             else:
00080                 multi_models = True
00081         # Get the maximum log-likelihood
00082         if "ln Lmax" in line and len(line_floats) > 0:
00083             results["lnL max"] = line_floats[0]
00084     return (results, multi_models, multi_genes)
00085  
00086 def parse_nssites(lines, results, multi_models, multi_genes):
00087     """Determine which NSsites models are present and parse them.
00088     """
00089 
00090     ns_sites = {}
00091     model_re = re.compile("Model (\d+):\s+(.+)")
00092     gene_re = re.compile("Gene\s+([0-9]+)\s+.+")
00093     siteclass_model = results.get("site-class model")
00094     if not multi_models:
00095     # If there's only one model in the results, find out
00096     # which one it is and then parse it. 
00097         if siteclass_model is None:
00098             siteclass_model = "one-ratio"
00099         current_model = {"one-ratio" : 0,
00100                         "NearlyNeutral" : 1,
00101                         "PositiveSelection" : 2,
00102                         "discrete" : 3,
00103                         "beta" : 7,
00104                         "beta&w>1" : 8}[siteclass_model]
00105         if multi_genes:
00106             genes = results["genes"]
00107             current_gene = None
00108             gene_start = None
00109             for line_num, line in enumerate(lines):
00110                 gene_res = gene_re.match(line)
00111                 if gene_res:
00112                     if current_gene is not None:
00113                         parse_model(lines[gene_start:line_num], model_results)
00114                         genes[current_gene-1] = model_results
00115                     gene_start = line_num
00116                     current_gene = int(gene_res.group(1))
00117                     model_results = {"description": siteclass_model}
00118             if len(genes[current_gene-1]) == 0:
00119                 model_results = parse_model(lines[gene_start:], model_results)
00120                 genes[current_gene-1] = model_results
00121         else:
00122             model_results = {"description" : siteclass_model}
00123             model_results = parse_model(lines, model_results)
00124             ns_sites[current_model] = model_results
00125     else:
00126     # If there are multiple models in the results, scan through
00127     # the file and send each model's text to be parsed individually.
00128         current_model = None
00129         model_start = None
00130         for line_num, line in enumerate(lines):
00131             # Find model names. If this is found on a line,
00132             # all of the following lines until the next time this matches
00133             # contain results for Model X.
00134             # Example match: "Model 1: NearlyNeutral (2 categories)"
00135             model_res = model_re.match(line)
00136             if model_res:
00137                 if current_model is not None:
00138                 # We've already been tracking a model, so it's time
00139                 # to send those lines off for parsing before beginning
00140                 # a new one.
00141                     parse_model(lines[model_start:line_num], model_results)
00142                     ns_sites[current_model] = model_results
00143                 model_start = line_num
00144                 current_model = int(model_res.group(1))
00145                 model_results = {"description":model_res.group(2)}
00146         if ns_sites.get(current_model) is None:
00147         # When we reach the end of the file, we'll still have one more
00148         # model to parse.
00149             model_results = parse_model(lines[model_start:], model_results)
00150             ns_sites[current_model] = model_results
00151     # Only add the ns_sites dict to the results if we really have results. 
00152     # Model M0 is added by default in some cases, so if it exists, make sure 
00153     # it's not empty
00154     if len(ns_sites) == 1:
00155         m0 = ns_sites.get(0)
00156         if not m0 or len(m0) > 1:
00157             results["NSsites"] = ns_sites
00158     elif len(ns_sites) > 1:
00159         results["NSsites"] = ns_sites
00160     return results
00161 
00162 def parse_model(lines, results):
00163     """Parse an individual NSsites model's results.
00164     """
00165     parameters = {}
00166     SEs_flag = False
00167     dS_tree_flag = False
00168     dN_tree_flag = False
00169     w_tree_flag = False
00170     num_params = None
00171     tree_re = re.compile("\(\(+")
00172     branch_re = re.compile("\s+(\d+\.\.\d+)[\s+\d+\.\d+]+")
00173     model_params_re = re.compile("([a-z]\d?)=\s+(\d+\.\d+)")
00174     for line in lines:
00175         # Find all floating point numbers in this line
00176         line_floats_res = line_floats_re.findall(line)
00177         line_floats = [_nan_float(val) for val in line_floats_res]
00178         # Check if branch-specific results are in the line
00179         branch_res = branch_re.match(line)
00180         # Check if additional model parameters are in the line
00181         model_params = model_params_re.findall(line)
00182         # Find lnL values.
00183         # Example match (lnL = -2021.348300):
00184         # "lnL(ntime: 19  np: 22):  -2021.348300      +0.000000"
00185         if "lnL(ntime:" in line and len(line_floats) > 0:
00186             results["lnL"] = line_floats[0]
00187             np_res = re.match("lnL\(ntime:\s+\d+\s+np:\s+(\d+)\)",line)
00188             if np_res is not None:
00189                 num_params = int(np_res.group(1))
00190         # Get parameter list. This can be useful for specifying starting
00191         # parameters in another run by copying the list of parameters
00192         # to a file called in.codeml. Since the parameters must be in
00193         # a fixed order and format, copying and pasting to the file is
00194         # best. For this reason, they are grabbed here just as a long
00195         # string and not as individual numbers.
00196         elif len(line_floats) == num_params and not SEs_flag:
00197             parameters["parameter list"] = line.strip()
00198         # Find SEs. The same format as parameters above is maintained
00199         # since there is a correspondance between the SE format and
00200         # the parameter format.
00201         # Example match:
00202         # "SEs for parameters:
00203         # -1.00000 -1.00000 -1.00000 801727.63247 730462.67590 -1.00000 
00204         elif "SEs for parameters:" in line:
00205             SEs_flag = True
00206         elif SEs_flag and len(line_floats) == num_params:
00207             parameters["SEs"] = line.strip()
00208             SEs_flag = False
00209         # Find tree lengths.
00210         # Example match: "tree length =   1.71931"
00211         elif "tree length =" in line and len(line_floats) > 0:
00212             results["tree length"] = line_floats[0]
00213         # Find the estimated trees only taking the tree if it has
00214         # lengths or rate estimates on the branches
00215         elif tree_re.match(line) is not None:
00216             if ":" in line or "#" in line:
00217                 if dS_tree_flag:
00218                     results["dS tree"] = line.strip()
00219                     dS_tree_flag = False
00220                 elif dN_tree_flag:
00221                     results["dN tree"] = line.strip()
00222                     dN_tree_flag = False
00223                 elif w_tree_flag:
00224                     results["omega tree"] = line.strip()
00225                     w_tree_flag = False
00226                 else:
00227                     results["tree"] = line.strip()
00228         elif "dS tree:" in line:
00229             dS_tree_flag = True
00230         elif "dN tree:" in line:
00231             dN_tree_flag = True
00232         elif "w ratios as labels for TreeView:" in line:
00233                 w_tree_flag = True
00234         # Find rates for multiple genes
00235         # Example match: "rates for 2 genes:     1  2.75551"
00236         elif "rates for" in line and len(line_floats) > 0:
00237             line_floats.insert(0, 1.0)
00238             parameters["rates"] = line_floats
00239         # Find kappa values.
00240         # Example match: "kappa (ts/tv) =  2.77541"
00241         elif "kappa (ts/tv)" in line and len(line_floats) > 0:
00242             parameters["kappa"] = line_floats[0]
00243         # Find omega values.
00244         # Example match: "omega (dN/dS) =  0.25122"
00245         elif "omega (dN/dS)" in line and len(line_floats) > 0:
00246             parameters["omega"] = line_floats[0]
00247         elif "w (dN/dS)" in line and len(line_floats) > 0:
00248             parameters["omega"] = line_floats
00249         # Find omega and kappa values for multi-gene files
00250         # Example match: "gene # 1: kappa =   1.72615 omega =   0.39333"
00251         elif "gene # " in line:
00252             gene_num = int(re.match("gene # (\d+)", line).group(1))
00253             if parameters.get("genes") is None:
00254                 parameters["genes"] = {}
00255             parameters["genes"][gene_num] = {"kappa":line_floats[0],
00256                                             "omega":line_floats[1]}
00257         # Find dN values.
00258         # Example match: "tree length for dN:       0.2990"
00259         elif "tree length for dN" in line and len(line_floats) > 0:
00260             parameters["dN"] = line_floats[0]
00261         # Find dS values
00262         # Example match: "tree length for dS:       1.1901"
00263         elif "tree length for dS" in line and len(line_floats) > 0:
00264             parameters["dS"] = line_floats[0]
00265         # Find site class distributions.
00266         # Example match 1 (normal model, 2 site classes):
00267         # "p:   0.77615  0.22385"
00268         # Example match 2 (branch site A model, 4 site classes):
00269         # "proportion       0.00000  0.00000  0.73921  0.26079"
00270         elif line[0:2] == "p:" or line[0:10] == "proportion":
00271             site_classes = parse_siteclass_proportions(line_floats)
00272             parameters["site classes"] = site_classes
00273         # Find the omega value corresponding to each site class
00274         # Example match (2 site classes): "w:   0.10224  1.00000"
00275         elif line[0:2] == "w:":
00276             site_classes = parameters.get("site classes")
00277             site_classes = parse_siteclass_omegas(line, site_classes)
00278             parameters["site classes"] = site_classes
00279         # Find the omega values corresponding to a branch type from  
00280         # the clade model C for each site class
00281         # Example match:
00282         # "branch type 0:    0.31022   1.00000   0.00000"
00283         elif "branch type " in line:
00284             branch_type = re.match("branch type (\d)", line)
00285             if branch_type:
00286                 site_classes = parameters.get("site classes")
00287                 branch_type_no = int(branch_type.group(1))
00288                 site_classes = parse_clademodelc(branch_type_no, line_floats, 
00289                         site_classes)
00290                 parameters["site classes"] = site_classes
00291         # Find the omega values of the foreground branch for each site
00292         # class in the branch site A model
00293         # Example match:
00294         # "foreground w     0.07992  1.00000 134.54218 134.54218"
00295         elif line[0:12] == "foreground w":
00296             site_classes = parameters.get("site classes")
00297             site_classes = parse_branch_site_a(True, line_floats, site_classes)
00298             parameters["site classes"] = site_classes
00299         # Find the omega values of the background for each site
00300         # class in the branch site A model
00301         # Example match:
00302         # "background w     0.07992  1.00000  0.07992  1.00000"
00303         elif line[0:12] == "background w":
00304             site_classes = parameters.get("site classes")
00305             site_classes = parse_branch_site_a(False, line_floats, site_classes)
00306             parameters["site classes"] = site_classes
00307         # Find dN & dS for each branch, which is organized in a table
00308         # The possibility of NaNs forces me to not use the line_floats
00309         # method.
00310         # Example row (some spaces removed to make it smaller...).
00311         # " 6..7   0.000  167.7  54.3  0.0000  0.0000  0.0000  0.0  0.0"
00312         elif branch_res is not None and len(line_floats) > 0:
00313             branch = branch_res.group(1)
00314             if parameters.get("branches") is None:
00315                 parameters["branches"] = {}
00316             #Hack for Jython http://bugs.jython.org/issue1762 float("-nan")
00317             line = line.replace(" -nan", " nan")
00318             params = line.strip().split()[1:]
00319             parameters["branches"][branch]= {
00320                 "t" : _nan_float(params[0].strip()),
00321                 "N" : _nan_float(params[1].strip()),
00322                 "S" : _nan_float(params[2].strip()),
00323                 "omega" :_nan_float(params[3].strip()),
00324                 "dN" : _nan_float(params[4].strip()),
00325                 "dS" : _nan_float(params[5].strip()),
00326                 "N*dN" : _nan_float(params[6].strip()),
00327                 "S*dS" : _nan_float(params[7].strip())}
00328         # Find model parameters, which can be spread across multiple
00329         # lines.
00330         # Example matches:
00331         # "  p0=  0.99043  p=  0.36657 q=  1.04445
00332         #"  (p1=  0.00957) w=  3.25530"
00333         elif len(model_params) > 0:
00334             float_model_params = []
00335             for param in model_params:
00336                 float_model_params.append((param[0], _nan_float(param[1])))
00337             parameters = dict(parameters.items() + float_model_params)
00338     if len(parameters) > 0:
00339         results["parameters"] = parameters
00340     return results
00341 
00342 def parse_siteclass_proportions(line_floats):
00343     """For models which have multiple site classes, find the proportion of the alignment assigned to each class.
00344     """
00345     site_classes = {}
00346     if len(line_floats) > 0:
00347         for n in range(len(line_floats)):
00348             site_classes[n] = {"proportion" : line_floats[n]}
00349     return site_classes
00350 
00351 def parse_siteclass_omegas(line, site_classes):
00352     """For models which have multiple site classes, find the omega estimated for each class.
00353     """
00354     # The omega results are tabular with strictly 9 characters per column
00355     # (1 to 3 digits before the  decimal point and 5 after). This causes
00356     # numbers to sometimes run into each other, so we must use a different
00357     # regular expression to account for this. i.e.:
00358     # w:   0.00012  1.00000109.87121
00359     line_floats = re.findall("\d{1,3}\.\d{5}", line)
00360     if not site_classes or len(line_floats) == 0:
00361         return
00362     for n in range(len(line_floats)):
00363         site_classes[n]["omega"] = line_floats[n]
00364     return site_classes
00365 
00366 def parse_clademodelc(branch_type_no, line_floats, site_classes):
00367     """Parse results specific to the clade model C.
00368     """
00369     if not site_classes or len(line_floats) == 0:
00370         return
00371     for n in range(len(line_floats)):
00372         if site_classes[n].get("branch types") is None:
00373             site_classes[n]["branch types"] = {}
00374         site_classes[n]["branch types"][branch_type_no] = line_floats[n]
00375     return site_classes
00376 
00377 def parse_branch_site_a(foreground, line_floats, site_classes):
00378     """Parse results specific to the branch site A model.
00379     """
00380     if not site_classes or len(line_floats) == 0:
00381         return
00382     for n in range(len(line_floats)):
00383         if site_classes[n].get("branch types") is None:
00384             site_classes[n]["branch types"] = {}
00385         if foreground:
00386             site_classes[n]["branch types"]["foreground"] =\
00387                     line_floats[n]
00388         else:
00389             site_classes[n]["branch types"]["background"] =\
00390                     line_floats[n]
00391     return site_classes
00392 
00393 def parse_pairwise(lines, results):
00394     """Parse results from pairwise comparisons.
00395     """
00396     # Find pairwise comparisons
00397     # Example:
00398     # 2 (Pan_troglo) ... 1 (Homo_sapie)
00399     # lnL = -291.465693
00400     #  0.01262 999.00000  0.00100
00401     #
00402     # t= 0.0126  S=    81.4  N=   140.6  dN/dS= 0.0010  dN= 0.0000  dS= 0.0115
00403     pair_re = re.compile("\d+ \((.+)\) ... \d+ \((.+)\)")
00404     pairwise = {}
00405     for line in lines:
00406         # Find all floating point numbers in this line
00407         line_floats_res = line_floats_re.findall(line)
00408         line_floats = [_nan_float(val) for val in line_floats_res]
00409         pair_res = pair_re.match(line)
00410         if pair_res:
00411             seq1 = pair_res.group(1)
00412             seq2 = pair_res.group(2)
00413             if pairwise.get(seq1) is None:
00414                 pairwise[seq1] = {}
00415             if pairwise.get(seq2) is None:
00416                 pairwise[seq2] = {}
00417             if len(line_floats) == 1:
00418                 pairwise[seq1][seq2] = {"lnL" : line_floats[0]}
00419                 pairwise[seq2][seq1] = pairwise[seq1][seq2]
00420             elif len(line_floats) == 6:
00421                 pairwise[seq1][seq2] = {"t" : line_floats[0],
00422                         "S" : line_floats[1],
00423                         "N" : line_floats[2],
00424                         "omega" : line_floats[3],
00425                         "dN" : line_floats[4],
00426                         "dS" : line_floats[5]}
00427                 pairwise[seq2][seq1] = pairwise[seq1][seq2]
00428     if len(pairwise) > 0:
00429         results["pairwise"] = pairwise
00430     return results
00431 
00432 def parse_distances(lines, results):
00433     """Parse amino acid sequence distance results.
00434     """
00435     distances = {}
00436     sequences = []
00437     raw_aa_distances_flag = False
00438     ml_aa_distances_flag = False
00439     matrix_row_re = re.compile("(.+)\s{5,15}")
00440     for line in lines:
00441         # Find all floating point numbers in this line
00442         line_floats_res = line_floats_re.findall(line)
00443         line_floats = [_nan_float(val) for val in line_floats_res]
00444         if "AA distances" in line:
00445             raw_aa_distances_flag = True
00446             # In current versions, the raw distances always come
00447             # first but I don't trust this to always be true
00448             ml_aa_distances_flag = False
00449         elif "ML distances of aa seqs." in line:
00450             ml_aa_distances_flag = True
00451             raw_aa_distances_flag = False
00452         # Parse AA distances (raw or ML), in a lower diagonal matrix
00453         matrix_row_res = matrix_row_re.match(line)
00454         if matrix_row_res and (raw_aa_distances_flag or \
00455                 ml_aa_distances_flag):
00456             seq_name = matrix_row_res.group(1).strip()
00457             if seq_name not in sequences:
00458                 sequences.append(seq_name)
00459             if raw_aa_distances_flag:
00460                 if distances.get("raw") is None:
00461                     distances["raw"] = {}
00462                 distances["raw"][seq_name] = {}
00463                 for i in range(0, len(line_floats)):
00464                     distances["raw"][seq_name][sequences[i]] = line_floats[i]
00465                     distances["raw"][sequences[i]][seq_name] = line_floats[i]
00466             else:
00467                 if distances.get("ml") is None:
00468                     distances["ml"] = {}
00469                 distances["ml"][seq_name] = {}
00470                 for i in range(0, len(line_floats)):
00471                     distances["ml"][seq_name][sequences[i]] = line_floats[i]
00472                     distances["ml"][sequences[i]][seq_name] = line_floats[i]
00473     if len(distances) > 0:
00474         results["distances"] = distances
00475     return results