Back to index

python-biopython  1.60
_parse_yn00.py
Go to the documentation of this file.
00001 # Copyright (C) 2011 by Brandon Invergo (b.invergo@gmail.com)
00002 # This code is part of the Biopython distribution and governed by its
00003 # license. Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 
00006 import re
00007 
def parse_ng86(lines, results):
    """Parse the Nei & Gojobori (1986) section of the results.

    Nei-Gojobori results are organized in a lower triangular matrix,
    with the sequence names labeling the rows and one group of
    statistics per column, in the format:

        w (dN dS)

    Example row (2 columns):

        0.0000 (0.0000 0.0207) 0.0000 (0.0000 0.0421)

    Arguments:
     - lines: iterable of text lines to scan for matrix rows.
     - results: dictionary that is updated in place; after parsing,
       results[row_name][column_name]["NG86"] holds a dictionary with
       the keys "omega", "dN" and "dS" (stored in both directions).

    Returns the tuple (results, sequences), where sequences lists the
    row names in the order in which they were found.
    """
    sequences = []
    for line in lines:
        # A matrix row is some text (the sequence name) followed by a
        # run of at least five whitespace characters.
        matrix_row_res = re.match(r"(.+)\s{5,15}", line)
        if matrix_row_res is None:
            continue
        seq_name = matrix_row_res.group(1).strip()
        sequences.append(seq_name)
        results[seq_name] = {}
        # Only look for numbers *after* the name part of the row.
        # Scanning the whole line would pick up float-like fragments of
        # the name itself (e.g. "gene1.2") and shift every triple.
        row_values = line[matrix_row_res.end(1):]
        line_floats = [float(val)
                       for val in re.findall(r"-*\d+\.\d+", row_values)]
        # Each column is a triple: omega, dN, dS. Column k belongs to
        # the k-th sequence seen so far.
        for i in range(0, len(line_floats), 3):
            NG86 = {}
            NG86["omega"] = line_floats[i]
            NG86["dN"] = line_floats[i + 1]
            NG86["dS"] = line_floats[i + 2]
            other_name = sequences[i // 3]
            results[seq_name][other_name] = {"NG86": NG86}
            results[other_name][seq_name] = {"NG86": NG86}
    return (results, sequences)
00034  
def parse_yn00(lines, results, sequences):
    """Parse the Yang & Nielsen (2000) part of the results.

    Yang & Nielsen results are organized in a table with each row
    comprising one pairwise species comparison. Rows are labeled by
    sequence number rather than by sequence name.

    Arguments:
     - lines: iterable of text lines to scan for table rows.
     - results: nested dictionary, updated in place; the entries
       results[name1][name2] and results[name2][name1] must already
       exist and receive a "YN00" dictionary.
     - sequences: list of sequence names; the 1-based sequence numbers
       found in the table are used as indices into this list.

    Returns the updated results dictionary. A table row with fewer
    than nine floating point values raises IndexError.
    """

    # Example (header row and first table row):
    # seq. seq.     S       N        t   kappa   omega     dN +- SE    dS +- SE
    # 2    1    67.3   154.7   0.0136  3.6564  0.0000 -0.0000 +- 0.0000  0.0150 +- 0.0151
    stat_names = ("S", "N", "t", "kappa", "omega",
                  "dN", "dN SE", "dS", "dS SE")
    for line in lines:
        # A table row starts with whitespace and two integer sequence
        # numbers; everything else (header, blank lines) is skipped.
        row_res = re.match(r"\s+(\d+)\s+(\d+)", line)
        if row_res is None:
            continue
        seq_name1 = sequences[int(row_res.group(1)) - 1]
        seq_name2 = sequences[int(row_res.group(2)) - 1]
        # Find all floating point numbers in this row. The two integer
        # sequence numbers have no decimal point and are not matched.
        line_floats = [float(val)
                       for val in re.findall(r"-*\d+\.\d+", line)]
        YN00 = {}
        for index, stat_name in enumerate(stat_names):
            YN00[stat_name] = line_floats[index]
        results[seq_name1][seq_name2]["YN00"] = YN00
        results[seq_name2][seq_name1]["YN00"] = YN00
    return results
00071 
def parse_others(lines, results, sequences):
    """Parse the results from the other methods.

    The remaining methods (LWL85, LWL85m and LPB93) are grouped
    together. Statistics for all three are listed for each of the
    pairwise species comparisons, with each method's results on its
    own line.

    The stats in this section are read as "name = value" pairs rather
    than by collecting plain floating point numbers, because the
    values may be NaN. Values that cannot be converted to a float
    (for example the "-1.#IND" style NaNs seen on Windows) are stored
    as None.

    Arguments:
     - lines: iterable of text lines to scan.
     - results: nested dictionary, updated in place; the entries
       results[name1][name2] and results[name2][name1] must already
       exist for every comparison that is reported.
     - sequences: not used by this function; kept for a uniform
       signature.

    Returns the updated results dictionary.
    """
    # Example:
    # 2 (Pan_troglo) vs. 1 (Homo_sapie)

    # L(i):      143.0      51.0      28.0  sum=    222.0
    # Ns(i):    0.0000    1.0000    0.0000  sum=   1.0000
    # Nv(i):    0.0000    0.0000    0.0000  sum=   0.0000
    # A(i):     0.0000    0.0200    0.0000
    # B(i):    -0.0000   -0.0000   -0.0000
    # LWL85:  dS =  0.0227 dN =  0.0000 w = 0.0000 S =   45.0 N =  177.0
    # LWL85m: dS =    -nan dN =    -nan w =   -nan S =   -nan N =   -nan (rho = -nan)
    # LPB93:  dS =  0.0129 dN =  0.0000 w = 0.0000

    # (substring looked for in the line, key stored in results), in the
    # order in which they are tested.
    method_labels = (("LWL85:", "LWL85"),
                     ("LWL85m", "LWL85m"),
                     ("LPB93", "LPB93"))
    seq_name1 = None
    seq_name2 = None
    for line in lines:
        comp_res = re.match(r"\d+ \((.+)\) vs. \d+ \((.+)\)", line)
        if comp_res is not None:
            # Header of a new pairwise comparison.
            seq_name1 = comp_res.group(1)
            seq_name2 = comp_res.group(2)
            continue
        if seq_name1 is None or seq_name2 is None:
            # No comparison header seen yet.
            continue
        if "dS =" not in line:
            continue
        line_stats = line.split(":")[1].strip()
        # Read each "name = value" pair as a whole whitespace-delimited
        # token. A fixed-width capture cuts the last digit off dS and
        # dN (their fields are one character wider than the others)
        # and cannot read the parenthesised rho value; the closing
        # parenthesis is excluded from the value here.
        stat_pairs = re.findall(r"\b(dS|dN|w|S|N|rho)\s*=\s*([^\s)]+)",
                                line_stats)
        stats = {}
        for stat, value in stat_pairs:
            try:
                stats[stat] = float(value)
            except ValueError:
                # Not a number Python can read (e.g. "-1.#IND").
                stats[stat] = None
        for label, method in method_labels:
            if label in line:
                # The same dictionary is stored for both directions.
                results[seq_name1][seq_name2][method] = stats
                results[seq_name2][seq_name1][method] = stats
                break
    return results