python-biopython 1.60
DynamicProgramming.py
00001 """Dynamic Programming algorithms for general usage.
00002 
00003 This module contains classes which implement Dynamic Programming
00004 algorithms that can be used generally.
00005 """
00006 
00007 class AbstractDPAlgorithms(object):
00008     """An abstract class to calculate forward and backward probabiliies.
00009 
00010     This class should not be instantiated directly, but should be used
00011     through a derived class which implements proper scaling of variables.
00012 
00013     This class is just meant to encapsulate the basic foward and backward
00014     algorithms, and allow derived classes to deal with the problems of
00015     multiplying probabilities.
00016 
00017     Derived class of this must implement:
00018 
00019     o _forward_recursion -- Calculate the forward values in the recursion
00020     using some kind of technique for preventing underflow errors.
00021 
00022     o _backward_recursion -- Calculate the backward values in the recursion
00023     step using some technique to prevent underflow errors.
00024     """
00025     def __init__(self, markov_model, sequence):
00026         """Initialize to calculate foward and backward probabilities.
00027 
00028         Arguments:
00029 
00030         o markov_model -- The current Markov model we are working with.
00031 
00032         o sequence -- A training sequence containing a set of emissions.
00033         """
00034         self._mm = markov_model
00035         self._seq = sequence
00036 
00037     def _foward_recursion(self, cur_state, sequence_pos, forward_vars):
00038         """Calculate the forward recursion value.
00039         """
00040         raise NotImplementedError("Subclasses must implement")
00041 
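    # NOTE: In the notation of Durbin et al., a concrete _forward_recursion
    # evaluates, with some protection against underflow:
    #
    #     f_{l}(i) = e_{l}(x_{i}) * sum_{k} f_{k}(i - 1) * a_{kl}
    #
    # See ScaledDPAlgorithms below for one such implementation.
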
    def forward_algorithm(self):
        """Calculate sequence probability using the forward algorithm.

        This implements the forward algorithm, as described on p57-58 of
        Durbin et al.

        Returns:

        o A dictionary containing the forward variables. This has keys of the
        form (state letter, position in the training sequence), and values
        containing the calculated forward variable.

        o The calculated probability of the sequence.
        """
        # all of the different letters that the state path can be in
        state_letters = self._seq.states.alphabet.letters

        # -- initialize the algorithm
        #
        # NOTE: My index numbers are one less than what is given in Durbin
        # et al, since we are indexing the sequence going from 0 to
        # (Length - 1) not 1 to Length, like in Durbin et al.
        #
        forward_var = {}
        # f_{0}(0) = 1
        forward_var[(state_letters[0], -1)] = 1
        # f_{k}(0) = 0, for k > 0
        for k in range(1, len(state_letters)):
            forward_var[(state_letters[k], -1)] = 0

        # -- now do the recursion step
        # loop over the training sequence
        # Recursion step: (i = 1 .. L)
        for i in range(len(self._seq.emissions)):
            # now loop over the letters in the state path
            for main_state in state_letters:
                # calculate the forward value using the appropriate
                # method to prevent underflow errors
                forward_value = self._forward_recursion(main_state, i,
                                                        forward_var)

                if forward_value is not None:
                    forward_var[(main_state, i)] = forward_value

        # -- termination step - calculate the probability of the sequence
        first_state = state_letters[0]
        seq_prob = 0

        for state_item in state_letters:
            # f_{k}(L)
            forward_value = forward_var[(state_item,
                                         len(self._seq.emissions) - 1)]
            # a_{k0}
            transition_value = self._mm.transition_prob[(state_item,
                                                         first_state)]

            seq_prob += forward_value * transition_value

        return forward_var, seq_prob

    def _backward_recursion(self, cur_state, sequence_pos, backward_vars):
        """Calculate the backward recursion value."""
        raise NotImplementedError("Subclasses must implement")

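    # NOTE: In the notation of Durbin et al., a concrete _backward_recursion
    # evaluates, again with some protection against underflow:
    #
    #     b_{k}(i) = sum_{l} a_{kl} * e_{l}(x_{i + 1}) * b_{l}(i + 1)
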
    def backward_algorithm(self):
        """Calculate sequence probability using the backward algorithm.

        This implements the backward algorithm, as described on p58-59 of
        Durbin et al.

        Returns:

        o A dictionary containing the backward variables. This has keys
        of the form (state letter, position in the training sequence),
        and values containing the calculated backward variable.
        """
        # all of the different letters that the state path can be in
        state_letters = self._seq.states.alphabet.letters

        # -- initialize the algorithm
        #
        # NOTE: My index numbers are one less than what is given in Durbin
        # et al, since we are indexing the sequence going from 0 to
        # (Length - 1) not 1 to Length, like in Durbin et al.
        #
        backward_var = {}

        first_letter = state_letters[0]
        # b_{k}(L) = a_{k0} for all k
        for state in state_letters:
            backward_var[(state, len(self._seq.emissions) - 1)] = \
              self._mm.transition_prob[(state, first_letter)]

        # -- recursion
        # first loop over the training sequence backwards
        # Recursion step: (i = L - 1 ... 1)
        for i in reversed(range(len(self._seq.emissions) - 1)):
            # now loop over the letters in the state path
            for main_state in state_letters:
                # calculate the backward value using the appropriate
                # method to prevent underflow errors
                backward_value = self._backward_recursion(main_state, i,
                                                          backward_var)

                if backward_value is not None:
                    backward_var[(main_state, i)] = backward_value

        # skip the termination step to avoid recalculations -- you should
        # get sequence probabilities using the forward algorithm

        return backward_var

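    # NOTE: The forward and backward variables together give the posterior
    # probability of being in state k at position i, as described in
    # Durbin et al.:
    #
    #     P(pi_{i} = k | x) = f_{k}(i) * b_{k}(i) / P(x)
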
class ScaledDPAlgorithms(AbstractDPAlgorithms):
    """Implement forward and backward algorithms using a rescaling approach.

    This scales the f and b variables, so that they remain within a
    manageable numerical interval during calculations. This approach is
    described in Durbin et al. on p 78.

    This approach is a little more straightforward than log transformation
    but may still give underflow errors for some types of models. In these
    cases, the LogDPAlgorithms class should be used.
    """
    def __init__(self, markov_model, sequence):
        """Initialize the scaled approach to calculating probabilities.

        Arguments:

        o markov_model -- The current Markov model we are working with.

        o sequence -- A TrainingSequence object that must have a
        set of emissions to work with.
        """
        AbstractDPAlgorithms.__init__(self, markov_model, sequence)

        self._s_values = {}

    def _calculate_s_value(self, seq_pos, previous_vars):
        """Calculate the next scaling variable for a sequence position.

        This utilizes the approach of choosing s values such that the
        sum of all of the scaled f values is equal to 1.

        Arguments:

        o seq_pos -- The current position we are at in the sequence.

        o previous_vars -- All of the forward or backward variables
        calculated so far.

        Returns:

        o The calculated scaling variable for the sequence item.
        """
        # all of the different letters the state can have
        state_letters = self._seq.states.alphabet.letters

        # loop over all of the possible states
        s_value = 0
        for main_state in state_letters:
            emission = self._mm.emission_prob[(main_state,
                                               self._seq.emissions[seq_pos])]

            # now sum over all of the previous vars and transitions
            trans_and_var_sum = 0
            for second_state in self._mm.transitions_from(main_state):
                # the value of the previous f or b value
                var_value = previous_vars[(second_state, seq_pos - 1)]

                # the transition probability
                trans_value = self._mm.transition_prob[(second_state,
                                                        main_state)]

                trans_and_var_sum += (var_value * trans_value)

            s_value += (emission * trans_and_var_sum)

        return s_value

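    # NOTE: In the notation of Durbin et al., the scaling value computed
    # above is:
    #
    #     s_{i} = sum_{l} e_{l}(x_{i}) * sum_{k} f_{k}(i - 1) * a_{kl}
    #
    # which makes the scaled forward variables at each position sum to 1.
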
    def _forward_recursion(self, cur_state, sequence_pos, forward_vars):
        """Calculate the value of the forward recursion.

        Arguments:

        o cur_state -- The letter of the state we are calculating the
        forward variable for.

        o sequence_pos -- The position we are at in the training seq.

        o forward_vars -- The current set of forward variables
        """
        # calculate the s value, if we haven't done so already (i.e. during
        # a previous forward or backward recursion)
        if sequence_pos not in self._s_values:
            self._s_values[sequence_pos] = \
              self._calculate_s_value(sequence_pos, forward_vars)

        # e_{l}(x_{i})
        seq_letter = self._seq.emissions[sequence_pos]
        cur_emission_prob = self._mm.emission_prob[(cur_state, seq_letter)]
        # divide by the scaling value
        scale_emission_prob = (float(cur_emission_prob) /
                               float(self._s_values[sequence_pos]))

        # loop over all of the possible states at the position
        state_pos_sum = 0
        have_transition = False
        for second_state in self._mm.transitions_from(cur_state):
            have_transition = True

            # get the previous forward_var values
            # f_{k}(i - 1)
            prev_forward = forward_vars[(second_state, sequence_pos - 1)]

            # a_{kl}
            cur_trans_prob = self._mm.transition_prob[(second_state,
                                                       cur_state)]
            state_pos_sum += prev_forward * cur_trans_prob

        # if there is a possible transition into this state,
        # return the recursion value
        if have_transition:
            return (scale_emission_prob * state_pos_sum)
        else:
            return None

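    # NOTE: With the scaling above, the recursion just computed is:
    #
    #     f~_{l}(i) = (e_{l}(x_{i}) / s_{i}) * sum_{k} f~_{k}(i - 1) * a_{kl}
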
    def _backward_recursion(self, cur_state, sequence_pos, backward_vars):
        """Calculate the value of the backward recursion.

        Arguments:

        o cur_state -- The letter of the state we are calculating the
        backward variable for.

        o sequence_pos -- The position we are at in the training seq.

        o backward_vars -- The current set of backward variables
        """
        # calculate the s value, if we haven't done so already (i.e. during
        # a previous forward or backward recursion)
        if sequence_pos not in self._s_values:
            self._s_values[sequence_pos] = \
              self._calculate_s_value(sequence_pos, backward_vars)

        # loop over all of the possible states at the position
        state_pos_sum = 0
        have_transition = False
        for second_state in self._mm.transitions_from(cur_state):
            have_transition = True
            # e_{l}(x_{i + 1}) -- the emission comes from the next state
            # (second_state), per the backward recursion in Durbin et al.
            seq_letter = self._seq.emissions[sequence_pos + 1]
            cur_emission_prob = self._mm.emission_prob[(second_state,
                                                        seq_letter)]

            # get the previous backward_var value
            # b_{l}(i + 1)
            prev_backward = backward_vars[(second_state, sequence_pos + 1)]

            # the transition probability -- a_{kl}
            cur_transition_prob = self._mm.transition_prob[(cur_state,
                                                            second_state)]

            state_pos_sum += (cur_emission_prob * prev_backward *
                              cur_transition_prob)

        # if we have a probability for a transition, return it
        if have_transition:
            return (state_pos_sum / float(self._s_values[sequence_pos]))
        # otherwise we have no probability (i.e. we can't do this transition)
        # and return None
        else:
            return None

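# Example usage (an illustrative sketch, not from this file -- ``mm`` and
# ``seq`` stand for a Markov model and a TrainingSequence built elsewhere,
# e.g. with the Bio.HMM modules):
#
#     dp = ScaledDPAlgorithms(mm, seq)
#     forward_vars, seq_prob = dp.forward_algorithm()
#     backward_vars = dp.backward_algorithm()
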
class LogDPAlgorithms(AbstractDPAlgorithms):
    """Implement forward and backward algorithms using a log approach.

    This uses the approach of calculating the sum of log probabilities
    using a lookup table for common values.

    XXX This is not implemented yet!
    """
    def __init__(self, markov_model, sequence):
        raise NotImplementedError("Haven't coded this yet...")

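# --- Illustrative sketch (not part of the original module) ------------------
# Summing probabilities in log space needs the log-sum-exp trick to avoid
# underflow; a minimal helper might look like the following. The name
# ``_example_log_sum`` is made up for this sketch and is not a Biopython API.

import math

def _example_log_sum(log_values):
    """Return log(sum(exp(v) for v in log_values)) without underflow."""
    # factor out the largest value so the exponentials stay near 1
    top = max(log_values)
    return top + math.log(sum(math.exp(v - top) for v in log_values))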