Back to index

python-biopython  1.60
Functions
Bio.Phylo.PAML._parse_yn00 Namespace Reference

Functions

def parse_ng86
def parse_yn00
def parse_others

Function Documentation

def Bio.Phylo.PAML._parse_yn00.parse_ng86 (   lines,
  results 
)
Parse the Nei & Gojobori (1986) section of the results.
Nei_Gojobori results are organized in a lower 
triangular matrix, with the sequence names labeling
the rows and statistics in the format:
w (dN dS) per column
Example row (2 columns):
0.0000 (0.0000 0.0207) 0.0000 (0.0000 0.0421)

Definition at line 8 of file _parse_yn00.py.

00008 
def parse_ng86(lines, results):
    """Parse the Nei & Gojobori (1986) section of the results.

    Nei & Gojobori results are organized in a lower triangular
    matrix, with the sequence names labeling the rows and statistics
    in the format:
    w (dN dS) per column
    Example row (2 columns):
    0.0000 (0.0000 0.0207) 0.0000 (0.0000 0.0421)

    Arguments:
     - lines - iterable of text lines to scan for matrix rows.
     - results - dict, updated in place. Every matrix row adds (or
       resets) results[row_name], and every w/dN/dS triplet is stored
       under the "NG86" key for both orderings of the sequence pair.

    Returns a (results, sequences) tuple, where sequences is the list
    of row names in the order they were found.

    Raises IndexError if a row holds a number of values that is not a
    multiple of three, or more triplets than there are rows so far.
    """
    sequences = []
    for line in lines:
        # A matrix row starts with the sequence name, which is set apart
        # from whatever follows by a run of 5-15 whitespace characters.
        matrix_row_res = re.match(r"(.+)\s{5,15}", line)
        if matrix_row_res is None:
            continue
        seq_name = matrix_row_res.group(1).strip()
        # Only look for numbers *after* the name: names such as
        # "NM_0005.5" contain text that looks like a float and would
        # otherwise be mistaken for the first statistic of the row.
        row_values = line[matrix_row_res.end(1):]
        line_floats = [float(val)
                       for val in re.findall(r"-*\d+\.\d+", row_values)]
        sequences.append(seq_name)
        results[seq_name] = {}
        # Values come in (omega, dN, dS) triplets; the n-th triplet of a
        # row compares this sequence against the n-th sequence seen.
        for i in range(0, len(line_floats), 3):
            NG86 = {
                "omega": line_floats[i],
                "dN": line_floats[i + 1],
                "dS": line_floats[i + 2],
            }
            other_name = sequences[i // 3]
            results[seq_name][other_name] = {"NG86": NG86}
            results[other_name][seq_name] = {"NG86": NG86}
    return (results, sequences)
 
def Bio.Phylo.PAML._parse_yn00.parse_others (   lines,
  results,
  sequences 
)
Parse the results from the other methods.

The remaining methods are grouped together. Statistics
for all three are listed for each of the pairwise 
species comparisons, with each method's results on its
own line.
The stats in this section must be handled differently
due to the possible presence of NaN values, which won't
get caught by my typical "line_floats" method used above.

Definition at line 72 of file _parse_yn00.py.

00072 
def parse_others(lines, results, sequences):
    """Parse the results from the other methods.

    The remaining methods (LWL85, LWL85m and LPB93) are grouped
    together. Statistics for all three are listed for each of the
    pairwise species comparisons, with each method's results on its
    own line.
    The stats in this section must be handled differently due to the
    possible presence of NaN values, which a plain "find every float"
    regular expression would not catch.

    Arguments:
     - lines - iterable of text lines to scan.
     - results - nested dict, updated in place;
       results[name1][name2] must already exist for every pair that is
       reported (KeyError otherwise).
     - sequences - not used by this function; kept so the signature
       stays parallel to parse_yn00.

    Each statistic is stored as a float. Values that float() accepts
    as NaN (for example "-nan") are stored as float NaN; values that
    cannot be converted at all (for example "-1.#IND") are stored as
    None.

    Returns the results dict.
    """
    # Example:
    # 2 (Pan_troglo) vs. 1 (Homo_sapie)

    # L(i):      143.0      51.0      28.0  sum=    222.0
    # Ns(i):    0.0000    1.0000    0.0000  sum=   1.0000
    # Nv(i):    0.0000    0.0000    0.0000  sum=   0.0000
    # A(i):     0.0000    0.0200    0.0000
    # B(i):    -0.0000   -0.0000   -0.0000
    # LWL85:  dS =  0.0227 dN =  0.0000 w = 0.0000 S =   45.0 N =  177.0
    # LWL85m: dS =    -nan dN =    -nan w =   -nan S =   -nan N =   -nan (rho = -nan)
    # LPB93:  dS =  0.0129 dN =  0.0000 w = 0.0000

    # (text that identifies the line, key used in results); "LWL85:" keeps
    # its colon so that it does not also match "LWL85m:" lines.
    methods = (("LWL85:", "LWL85"), ("LWL85m", "LWL85m"), ("LPB93", "LPB93"))
    seq_name1 = None
    seq_name2 = None
    for line in lines:
        comp_res = re.match(r"\d+ \((.+)\) vs\. \d+ \((.+)\)", line)
        if comp_res is not None:
            seq_name1 = comp_res.group(1)
            seq_name2 = comp_res.group(2)
            continue
        # Statistics only make sense once a "x (A) vs. y (B)" header
        # has told us which pair they belong to.
        if seq_name1 is None or seq_name2 is None or "dS =" not in line:
            continue
        # Everything after the method label, e.g. "dS =  0.0227 dN = ..."
        line_stats = line.partition(":")[2].strip()
        # Grab each "name = value" pair as a whitespace-delimited token
        # instead of a fixed-width slice: the columns are 7 or 8
        # characters wide depending on the statistic, and a fixed-width
        # slice cuts the last digit off the wider ones. The ")" is
        # excluded so that "(rho = 0.123)" yields a clean value.
        stats = {}
        for stat, value in re.findall(r"([dSNwrho]{1,3}) =\s*([^\s)]+)",
                                      line_stats):
            try:
                stats[stat] = float(value)
            except ValueError:
                # e.g. Windows-style NaNs such as "-1.#IND"
                stats[stat] = None
        for marker, method in methods:
            if marker in line:
                results[seq_name1][seq_name2][method] = stats
                results[seq_name2][seq_name1][method] = stats
                break
    return results
def Bio.Phylo.PAML._parse_yn00.parse_yn00 (   lines,
  results,
  sequences 
)
Parse the Yang & Nielsen (2000) part of the results.
Yang & Nielsen results are organized in a table with
each row comprising one pairwise species comparison.
Rows are labeled by sequence number rather than by
sequence name.

Definition at line 35 of file _parse_yn00.py.

00035 
def parse_yn00(lines, results, sequences):
    """Parse the Yang & Nielsen (2000) part of the results.

    Yang & Nielsen results are organized in a table with
    each row comprising one pairwise species comparison.
    Rows are labeled by sequence number rather than by
    sequence name.

    Arguments:
     - lines - iterable of text lines to scan for table rows.
     - results - nested dict, updated in place;
       results[name1][name2] must already exist for every pair that is
       reported (KeyError otherwise).
     - sequences - list of sequence names; the 1-based sequence
       numbers in the table are looked up in it.

    Returns the results dict, with the row's statistics stored under
    the "YN00" key for both orderings of each sequence pair.

    Raises IndexError if a table row holds fewer than nine
    floating point values.
    """
    # Example (header row and first table row):
    # seq. seq.     S       N        t   kappa   omega     dN +- SE    dS +- SE
    # 2    1    67.3   154.7   0.0136  3.6564  0.0000 -0.0000 +- 0.0000  0.0150
    # +- 0.0151

    # Column order of the floating point values in a table row.
    stat_names = ("S", "N", "t", "kappa", "omega",
                  "dN", "dN SE", "dS", "dS SE")
    for line in lines:
        # Table rows start with whitespace and two sequence numbers.
        row_res = re.match(r"\s+(\d+)\s+(\d+)", line)
        if row_res is None:
            continue
        # Find all floating point numbers in this line; the two integer
        # sequence numbers have no decimal point and are not picked up.
        line_floats = [float(val)
                       for val in re.findall(r"-*\d+\.\d+", line)]
        seq_name1 = sequences[int(row_res.group(1)) - 1]
        seq_name2 = sequences[int(row_res.group(2)) - 1]
        YN00 = {}
        for column, stat in enumerate(stat_names):
            # Index rather than zip() so a short row fails loudly.
            YN00[stat] = line_floats[column]
        results[seq_name1][seq_name2]["YN00"] = YN00
        results[seq_name2][seq_name1]["YN00"] = YN00
    return results