Back to index

python-biopython  1.60
Namespaces | Classes | Functions | Variables
Bio.SubsMat Namespace Reference

Namespaces

namespace  FreqTable
namespace  MatrixInfo

Classes

class  SeqMat
class  AcceptedReplacementsMatrix
class  ObservedFrequencyMatrix
class  ExpectedFrequencyMatrix
class  SubstitutionMatrix
class  LogOddsMatrix

Functions

def _build_obs_freq_mat
def _exp_freq_table_from_obs_freq
def _build_exp_freq_mat
def _build_subs_mat
def _build_log_odds_mat
def make_log_odds_matrix
def observed_frequency_to_substitution_matrix
def read_text_matrix
def two_mat_relative_entropy
def two_mat_correlation
 Gives the linear correlation coefficient between two matrices.
def two_mat_DJS

Variables

 log = math.log
int NOTYPE = 0
int ACCREP = 1
int OBSFREQ = 2
int SUBS = 3
int EXPFREQ = 4
int LO = 5
float EPSILON = 0.00000000000001
int diagNO = 1
int diagONLY = 2
int diagALL = 3

Function Documentation

def Bio.SubsMat._build_exp_freq_mat (   exp_freq_table) [private]
Build an expected frequency matrix
exp_freq_table: should be a FreqTable instance

Definition at line 421 of file __init__.py.

00421 
00422 def _build_exp_freq_mat(exp_freq_table):
00423    """Build an expected frequency matrix
00424    exp_freq_table: should be a FreqTable instance
00425    """
00426    exp_freq_mat = ExpectedFrequencyMatrix(alphabet=exp_freq_table.alphabet,
00427                                           build_later=1)
00428    for i in exp_freq_mat:
00429       if i[0] == i[1]:
00430          exp_freq_mat[i] = exp_freq_table[i[0]]**2
00431       else:
00432          exp_freq_mat[i] = 2.0*exp_freq_table[i[0]]*exp_freq_table[i[1]]
00433    return exp_freq_mat
00434 #
00435 # Build the substitution matrix
#

Here is the caller graph for this function:

def Bio.SubsMat._build_log_odds_mat (   subs_mat,
  logbase = 2,
  factor = 10.0,
  round_digit = 0,
  keep_nd = 0 
) [private]
_build_log_odds_mat(subs_mat,logbase=2,factor=10.0,round_digit=0,keep_nd=0):
Build a log-odds matrix
logbase=2: base of logarithm used to build (default 2)
factor=10.: a factor by which each matrix entry is multiplied
round_digit: roundoff place after decimal point
keep_nd: if true, keeps the -999 value for non-determined values (for which there
are no substitutions in the frequency substitutions matrix). If false, plants the
minimum log-odds value of the matrix in entries containing -999

Definition at line 448 of file __init__.py.

00448 
00449 def _build_log_odds_mat(subs_mat,logbase=2,factor=10.0,round_digit=0,keep_nd=0):
00450    """_build_log_odds_mat(subs_mat,logbase=10,factor=10.0,round_digit=1):
00451    Build a log-odds matrix
00452    logbase=2: base of logarithm used to build (default 2)
00453    factor=10.: a factor by which each matrix entry is multiplied
00454    round_digit: roundoff place after decimal point
00455    keep_nd: if true, keeps the -999 value for non-determined values (for which there
00456    are no substitutions in the frequency substitutions matrix). If false, plants the
00457    minimum log-odds value of the matrix in entries containing -999
00458    """
00459    lo_mat = LogOddsMatrix(subs_mat)
00460    for key, value in subs_mat.iteritems():
00461       if value < EPSILON:
00462          lo_mat[key] = -999
00463       else:
00464          lo_mat[key] = round(factor*log(value)/log(logbase),round_digit)
00465    mat_min = min(lo_mat.values())
00466    if not keep_nd:
00467       for i in lo_mat:
00468          if lo_mat[i] <= -999:
00469             lo_mat[i] = mat_min
00470    return lo_mat
00471 
00472 #
00473 # External function. User provides an accepted replacement matrix, and,
00474 # optionally the following: expected frequency table, log base, mult. factor,
00475 # and rounding factor. Generates a log-odds matrix, calling internal SubsMat
00476 # functions.
#

Here is the call graph for this function:

Here is the caller graph for this function:

def Bio.SubsMat._build_obs_freq_mat (   acc_rep_mat) [private]
_build_obs_freq_mat(acc_rep_mat):
Build the observed frequency matrix, from an accepted replacements matrix
The acc_rep_mat matrix should be generated by the user.

Definition at line 395 of file __init__.py.

00395 
00396 def _build_obs_freq_mat(acc_rep_mat):
00397    """
00398    build_obs_freq_mat(acc_rep_mat):
00399    Build the observed frequency matrix, from an accepted replacements matrix
00400    The acc_rep_mat matrix should be generated by the user.
00401    """
00402    # Note: acc_rep_mat should already be a half_matrix!!
00403    total = float(sum(acc_rep_mat.values()))
00404    obs_freq_mat = ObservedFrequencyMatrix(alphabet=acc_rep_mat.alphabet,
00405                                           build_later=1)
00406    for i in acc_rep_mat:
00407       obs_freq_mat[i] = acc_rep_mat[i]/total
00408    return obs_freq_mat

Here is the caller graph for this function:

def Bio.SubsMat._build_subs_mat (   obs_freq_mat,
  exp_freq_mat 
) [private]
Build the substitution matrix 

Definition at line 436 of file __init__.py.

00436 
00437 def _build_subs_mat(obs_freq_mat,exp_freq_mat):
00438    """ Build the substitution matrix """
00439    if obs_freq_mat.ab_list != exp_freq_mat.ab_list:
00440       raise ValueError("Alphabet mismatch in passed matrices")
00441    subs_mat = SubstitutionMatrix(obs_freq_mat)
00442    for i in obs_freq_mat:
00443       subs_mat[i] = obs_freq_mat[i]/exp_freq_mat[i]
00444    return subs_mat
00445 
00446 #
00447 # Build a log-odds matrix
#

Here is the caller graph for this function:

def Bio.SubsMat._exp_freq_table_from_obs_freq (   obs_freq_mat) [private]

Definition at line 409 of file __init__.py.

00409 
00410 def _exp_freq_table_from_obs_freq(obs_freq_mat):
00411    exp_freq_table = {}
00412    for i in obs_freq_mat.alphabet.letters:
00413       exp_freq_table[i] = 0.
00414    for i in obs_freq_mat:
00415       if i[0] == i[1]:
00416          exp_freq_table[i[0]] += obs_freq_mat[i]
00417       else:
00418          exp_freq_table[i[0]] += obs_freq_mat[i] / 2.
00419          exp_freq_table[i[1]] += obs_freq_mat[i] / 2.
00420    return FreqTable.FreqTable(exp_freq_table,FreqTable.FREQ)

Here is the caller graph for this function:

def Bio.SubsMat.make_log_odds_matrix (   acc_rep_mat,
  exp_freq_table = None,
  logbase = 2,
  factor = 1.,
  round_digit = 9,
  keep_nd = 0 
)

Definition at line 478 of file __init__.py.

00478 def make_log_odds_matrix(acc_rep_mat,exp_freq_table=None,logbase=2,
00479                     factor=1.,round_digit=9,keep_nd=0):
00480    obs_freq_mat = _build_obs_freq_mat(acc_rep_mat)
00481    if not exp_freq_table:
00482       exp_freq_table = _exp_freq_table_from_obs_freq(obs_freq_mat)
00483    exp_freq_mat = _build_exp_freq_mat(exp_freq_table)
00484    subs_mat = _build_subs_mat(obs_freq_mat, exp_freq_mat)
00485    lo_mat = _build_log_odds_mat(subs_mat,logbase,factor,round_digit,keep_nd)
00486    return lo_mat

Here is the call graph for this function:

Here is the caller graph for this function:

def Bio.SubsMat.observed_frequency_to_substitution_matrix (   obs_freq_mat)

Definition at line 487 of file __init__.py.

00487 
00488 def observed_frequency_to_substitution_matrix(obs_freq_mat):
00489    exp_freq_table = _exp_freq_table_from_obs_freq(obs_freq_mat)
00490    exp_freq_mat = _build_exp_freq_mat(exp_freq_table)
00491    subs_mat = _build_subs_mat(obs_freq_mat, exp_freq_mat)
00492    return subs_mat

Here is the call graph for this function:

def Bio.SubsMat.read_text_matrix (   data_file)

Definition at line 493 of file __init__.py.

00493 
00494 def read_text_matrix(data_file):
00495    matrix = {}
00496    tmp = data_file.read().split("\n")
00497    table=[]
00498    for i in tmp: 
00499       table.append(i.split())
00500    # remove records beginning with ``#''
00501    for rec in table[:]:
00502       if (rec.count('#') > 0):
00503          table.remove(rec)
00504 
00505    # remove null lists
00506    while (table.count([]) > 0):
00507       table.remove([])
00508    # build a dictionary
00509    alphabet = table[0]
00510    j = 0
00511    for rec in table[1:]:
00512       # print j
00513       row = alphabet[j]
00514       # row = rec[0]
00515       if re.compile('[A-z\*]').match(rec[0]):
00516          first_col = 1
00517       else:
00518          first_col = 0
00519       i = 0
00520       for field in rec[first_col:]:
00521          col = alphabet[i]
00522          matrix[(row,col)] = float(field)
00523          i += 1
00524       j += 1
00525    # delete entries with an asterisk
00526    for i in matrix.keys():
00527       if '*' in i: del(matrix[i])
00528    ret_mat = SeqMat(matrix)
00529    return ret_mat

Here is the call graph for this function:

def Bio.SubsMat.two_mat_correlation (   mat_1,
  mat_2 
)

Gives the linear correlation coefficient between two matrices.

Definition at line 569 of file __init__.py.

00569 
00570 def two_mat_correlation(mat_1, mat_2):
00571     try:
00572         import numpy
00573     except ImportError:
00574         raise ImportError, "Please install Numerical Python (numpy) if you want to use this function"
00575     values = []
00576     assert mat_1.ab_list == mat_2.ab_list
00577     for ab_pair in mat_1:
00578        try:
00579           values.append((mat_1[ab_pair], mat_2[ab_pair]))
00580        except KeyError:
00581           raise ValueError, "%s is not a common key" % ab_pair
00582     correlation_matrix = numpy.corrcoef(values, rowvar=0)
00583     correlation = correlation_matrix[0,1]
00584     return correlation
00585 
00586 # Jensen-Shannon Distance
# Need to input observed frequency matrices
def Bio.SubsMat.two_mat_DJS (   mat_1,
  mat_2,
  pi_1 = 0.5,
  pi_2 = 0.5 
)

Definition at line 587 of file __init__.py.

00587 
00588 def two_mat_DJS(mat_1,mat_2,pi_1=0.5,pi_2=0.5):
00589    assert mat_1.ab_list == mat_2.ab_list
00590    assert pi_1 > 0 and pi_2 > 0 and pi_1< 1 and pi_2 <1
00591    assert not (pi_1 + pi_2 - 1.0 > EPSILON)
00592    sum_mat = SeqMat(build_later=1)
00593    sum_mat.ab_list = mat_1.ab_list
00594    for i in mat_1:
00595       sum_mat[i] = pi_1 * mat_1[i] + pi_2 * mat_2[i]
00596    sum_mat.make_entropy()
00597    mat_1.make_entropy()
00598    mat_2.make_entropy()
00599    # print mat_1.entropy, mat_2.entropy
00600    dJS = sum_mat.entropy - pi_1 * mat_1.entropy - pi_2 *mat_2.entropy
00601    return dJS
00602       
00603 """
00604 This isn't working yet. Boo hoo!
00605 def two_mat_print(mat_1, mat_2, f=None,alphabet=None,factor_1=1, factor_2=1,
00606                   format="%4d",bottomformat="%4s",topformat="%4s",
00607                   topindent=7*" ", bottomindent=1*" "):
00608    f = f or sys.stdout
00609    if not alphabet:
00610       assert mat_1.ab_list == mat_2.ab_list
00611       alphabet = mat_1.ab_list
00612    len_alphabet = len(alphabet)
00613    print_mat = {}
00614    topline = topindent
00615    bottomline = bottomindent
00616    for i in alphabet:
00617       bottomline += bottomformat % i
00618       topline += topformat % alphabet[len_alphabet-alphabet.index(i)-1]
00619    topline += '\n'
00620    bottomline += '\n'
00621    f.write(topline)
00622    for i in alphabet:
00623       for j in alphabet:
00624          print_mat[i,j] = -999
00625    diag_1 = {}; diag_2 = {}
00626    for i in alphabet:
00627       for j in alphabet[:alphabet.index(i)+1]:
00628          if i == j:
00629             diag_1[i] = mat_1[(i,i)] 
00630             diag_2[i] = mat_2[(alphabet[len_alphabet-alphabet.index(i)-1],
00631                    alphabet[len_alphabet-alphabet.index(i)-1])]
00632          else:
00633             if i > j:
00634                key = (j,i)
00635             else:
00636                key = (i,j)
00637             mat_2_key = [alphabet[len_alphabet-alphabet.index(key[0])-1],
00638                    alphabet[len_alphabet-alphabet.index(key[1])-1]]
00639             # print mat_2_key
00640             mat_2_key.sort(); mat_2_key = tuple(mat_2_key)
00641             # print key ,"||",  mat_2_key
00642             print_mat[key] = mat_2[mat_2_key] 
00643             print_mat[(key[1],key[0])] = mat_1[key]
00644    for i in alphabet:
00645       outline = i
00646       for j in alphabet:
00647          if i == j:
00648             if diag_1[i] == -999:
00649                val_1 = ' ND'
00650             else:
00651                val_1 = format % (diag_1[i]*factor_1)
00652             if diag_2[i] == -999:
00653                val_2 = ' ND'
00654             else:
00655                val_2 = format % (diag_2[i]*factor_2)
00656             cur_str = val_1 + "  " + val_2
00657          else:
00658             if print_mat[(i,j)] == -999:
00659                val = ' ND'
00660             elif alphabet.index(i) > alphabet.index(j):
00661                val = format % (print_mat[(i,j)]*factor_1)
00662             else:
00663                val = format % (print_mat[(i,j)]*factor_2)
00664             cur_str = val
00665          outline += cur_str
00666       outline += bottomformat % (alphabet[len_alphabet-alphabet.index(i)-1] +
00667                                  '\n')
00668       f.write(outline)
00669    f.write(bottomline)
00670 """
def Bio.SubsMat.two_mat_relative_entropy (   mat_1,
  mat_2,
  logbase = 2,
  diag = diagALL 
)

Definition at line 534 of file __init__.py.

00534 
00535 def two_mat_relative_entropy(mat_1,mat_2,logbase=2,diag=diagALL):
00536    rel_ent = 0.
00537    key_list_1 = sorted(mat_1)
00538    key_list_2 = sorted(mat_2)
00539    key_list = []
00540    sum_ent_1 = 0.; sum_ent_2 = 0.
00541    for i in key_list_1:
00542       if i in key_list_2:
00543          key_list.append(i)
00544    if len(key_list_1) != len(key_list_2):
00545       sys.stderr.write("Warning:first matrix has more entries than the second\n")
00546    if key_list_1 != key_list_2:
00547       sys.stderr.write("Warning: indices not the same between matrices\n")
00548    for key in key_list:
00549       if diag == diagNO and key[0] == key[1]:
00550          continue
00551       if diag == diagONLY and key[0] != key[1]:
00552          continue
00553       if mat_1[key] > EPSILON and mat_2[key] > EPSILON:
00554          sum_ent_1 += mat_1[key]
00555          sum_ent_2 += mat_2[key]
00556          
00557    for key in key_list:
00558       if diag == diagNO and key[0] == key[1]:
00559          continue
00560       if diag == diagONLY and key[0] != key[1]:
00561          continue
00562       if mat_1[key] > EPSILON and mat_2[key] > EPSILON:
00563          val_1 = mat_1[key] / sum_ent_1
00564          val_2 = mat_2[key] / sum_ent_2
00565 #         rel_ent += mat_1[key] * log(mat_1[key]/mat_2[key])/log(logbase)
00566          rel_ent += val_1 * log(val_1/val_2)/log(logbase)
00567    return rel_ent


Variable Documentation

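int Bio.SubsMat.ACCREP = 1

Definition at line 126 of file __init__.py.

int Bio.SubsMat.diagALL = 3

Definition at line 532 of file __init__.py.

int Bio.SubsMat.diagNO = 1

Definition at line 530 of file __init__.py.

int Bio.SubsMat.diagONLY = 2

Definition at line 531 of file __init__.py.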

float Bio.SubsMat.EPSILON = 0.00000000000001

Definition at line 131 of file __init__.py.

int Bio.SubsMat.EXPFREQ = 4

Definition at line 129 of file __init__.py.

int Bio.SubsMat.LO = 5

Definition at line 130 of file __init__.py.

Bio.SubsMat.log = math.log

Definition at line 123 of file __init__.py.

int Bio.SubsMat.NOTYPE = 0

Definition at line 125 of file __init__.py.

int Bio.SubsMat.OBSFREQ = 2

Definition at line 127 of file __init__.py.

int Bio.SubsMat.SUBS = 3

Definition at line 128 of file __init__.py.