python-biopython 1.60
Trainer.py
"""Provide trainers which estimate parameters based on training sequences.

These should be used to 'train' a Markov Model prior to actually using
it to decode state paths. When supplied training sequences and a model
to work from, these classes will estimate parameters of the model.

This aims to estimate two sets of parameters:

* a_{kl} -- the transition probability from state k to state l, estimated
from the number of k to l transitions in the training data.

* e_{k}(b) -- the probability of state k emitting the letter b, estimated
from the number of times state k emits b in the training data.
"""
# standard modules
import math

# local stuff
from DynamicProgramming import ScaledDPAlgorithms

class TrainingSequence(object):
    """Hold a training sequence with emissions and, optionally, a state path.
    """
    def __init__(self, emissions, state_path):
        """Initialize a training sequence.

        Arguments:

        o emissions -- A Seq object containing the sequence of emissions in
        the training sequence, and the alphabet of the sequence.

        o state_path -- A Seq object containing the sequence of states and
        the alphabet of the states. If there is no known state path, then
        the sequence of states should be an empty string.
        """
        if len(state_path) > 0:
            assert len(emissions) == len(state_path), \
                   "State path does not match associated emissions."
        self.emissions = emissions
        self.states = state_path

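# A minimal sketch of constructing TrainingSequence objects. The alphabet
# classes below are illustrative subclasses of the old Bio.Alphabet.Alphabet
# and are not part of this module:
#
#     from Bio.Seq import Seq
#     from Bio.Alphabet import Alphabet
#
#     class DiceRollAlphabet(Alphabet):
#         letters = ['1', '2', '3', '4', '5', '6']
#
#     class DiceTypeAlphabet(Alphabet):
#         letters = ['F', 'L']
#
#     rolls = Seq("532651", DiceRollAlphabet())
#     dice_types = Seq("FFFFLL", DiceTypeAlphabet())
#     known_seq = TrainingSequence(rolls, dice_types)
#
#     # with no known state path, pass an empty Seq of states
#     unknown_seq = TrainingSequence(rolls, Seq("", DiceTypeAlphabet()))
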
class AbstractTrainer(object):
    """Provide generic functionality needed in all trainers.
    """
    def __init__(self, markov_model):
        self._markov_model = markov_model

    def log_likelihood(self, probabilities):
        """Calculate the log likelihood of the training sequences.

        Arguments:

        o probabilities -- A list of the probabilities of each training
        sequence under the current parameters, calculated using the forward
        algorithm.
        """
        total_likelihood = 0
        for probability in probabilities:
            total_likelihood += math.log(probability)

        return total_likelihood

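    # For example, for a trainer instance (any AbstractTrainer subclass),
    # two training sequences with forward probabilities 0.05 and 0.01 give
    # a log likelihood of log(0.05) + log(0.01), roughly -7.6:
    #
    #     trainer.log_likelihood([0.05, 0.01])
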
    def estimate_params(self, transition_counts, emission_counts):
        """Get a maximum likelihood estimation of transitions and emissions.

        Arguments:

        o transition_counts -- A dictionary with the total number of counts
        of transitions between two states.

        o emission_counts -- A dictionary with the total number of counts
        of emissions of a particular emission letter by a state.

        This returns the maximum likelihood estimators for the
        transitions and emissions, estimated using formula 3.18 in
        Durbin et al:

        a_{kl} = A_{kl} / sum(A_{kl'})
        e_{k}(b) = E_{k}(b) / sum(E_{k}(b'))

        Returns:
        Transition and emission dictionaries containing the maximum
        likelihood estimators.
        """
        # now calculate the information
        ml_transitions = self.ml_estimator(transition_counts)
        ml_emissions = self.ml_estimator(emission_counts)

        return ml_transitions, ml_emissions

    def ml_estimator(self, counts):
        """Calculate the maximum likelihood estimator.

        This can calculate maximum likelihoods for both transitions
        and emissions.

        Arguments:

        o counts -- A dictionary of the counts for each item.

        See estimate_params for a description of the formula used for
        calculation.
        """
        # get an ordered list of all items
        all_ordered = sorted(counts)

        ml_estimation = {}

        # the total counts for the current letter we are on
        cur_letter = None
        cur_letter_counts = 0

        for cur_item in all_ordered:
            # if we are on a new letter (ie. the first letter of the tuple)
            if cur_item[0] != cur_letter:
                # set the new letter we are working with
                cur_letter = cur_item[0]

                # count up the total counts for this letter
                cur_letter_counts = counts[cur_item]

                # add counts for all other items with the same first letter
                cur_position = all_ordered.index(cur_item) + 1

                # keep adding while we have the same first letter or until
                # we get to the end of the ordered list
                while (cur_position < len(all_ordered) and
                       all_ordered[cur_position][0] == cur_item[0]):
                    cur_letter_counts += counts[all_ordered[cur_position]]
                    cur_position += 1
            # otherwise we've already got the total counts for this letter
            else:
                pass

            # now calculate the ml and add it to the estimation
            cur_ml = float(counts[cur_item]) / float(cur_letter_counts)
            ml_estimation[cur_item] = cur_ml

        return ml_estimation

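# A small worked example of ml_estimator. The counts dictionary is made up
# for illustration, and some_model stands in for any Markov Model object:
#
#     trainer = AbstractTrainer(some_model)
#     counts = {('F', 'F'): 3, ('F', 'L'): 1, ('L', 'F'): 2, ('L', 'L'): 2}
#     trainer.ml_estimator(counts)
#     # -> {('F', 'F'): 0.75, ('F', 'L'): 0.25, ('L', 'F'): 0.5, ('L', 'L'): 0.5}
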
class BaumWelchTrainer(AbstractTrainer):
    """Trainer that uses the Baum-Welch algorithm to estimate parameters.

    This should be used when the training sequences for an HMM have unknown
    paths for the actual states, and you need to make an estimation of the
    model parameters from the observed emissions.

    This uses the Baum-Welch algorithm, first described in
    Baum, L.E. 1972. Inequalities 3:1-8.
    This is based on the description in 'Biological Sequence Analysis' by
    Durbin et al. in section 3.3.

    The algorithm is guaranteed to converge to a local maximum, but not
    necessarily to the global maximum, so use with care!
    """
    def __init__(self, markov_model):
        """Initialize the trainer.

        Arguments:

        o markov_model -- The model we are going to estimate parameters for.
        This should have the parameters set up with some initial estimates
        that we can build from.
        """
        AbstractTrainer.__init__(self, markov_model)

    def train(self, training_seqs, stopping_criteria,
              dp_method=ScaledDPAlgorithms):
        """Estimate the parameters using training sequences.

        The algorithm for this is taken from Durbin et al. p64, so this
        is a good place to go for a reference on what is going on.

        Arguments:

        o training_seqs -- A list of TrainingSequence objects to be used
        for estimating the parameters.

        o stopping_criteria -- A function that, when passed the change
        in log likelihood and the number of iterations, will indicate if we
        should stop the estimation iterations.

        o dp_method -- A class specifying the dynamic programming
        implementation we should use to calculate the forward and
        backward variables. By default, we use the scaling method.
        """
        prev_log_likelihood = None
        num_iterations = 1

        while True:
            transition_count = self._markov_model.get_blank_transitions()
            emission_count = self._markov_model.get_blank_emissions()

            # remember all of the sequence probabilities
            all_probabilities = []

            for training_seq in training_seqs:
                # calculate the forward and backward variables
                DP = dp_method(self._markov_model, training_seq)
                forward_var, seq_prob = DP.forward_algorithm()
                backward_var = DP.backward_algorithm()

                all_probabilities.append(seq_prob)

                # update the counts for transitions and emissions
                transition_count = self.update_transitions(transition_count,
                                                           training_seq,
                                                           forward_var,
                                                           backward_var,
                                                           seq_prob)
                emission_count = self.update_emissions(emission_count,
                                                       training_seq,
                                                       forward_var,
                                                       backward_var,
                                                       seq_prob)

            # update the markov model with the new probabilities
            ml_transitions, ml_emissions = \
                self.estimate_params(transition_count, emission_count)
            self._markov_model.transition_prob = ml_transitions
            self._markov_model.emission_prob = ml_emissions

            cur_log_likelihood = self.log_likelihood(all_probabilities)

            # if we have previously calculated the log likelihood (ie.
            # not the first round), see if we can finish
            if prev_log_likelihood is not None:
                # the log likelihoods are negative, so take the absolute
                # value of the difference to get the size of the change
                # between iterations
                log_likelihood_change = abs(cur_log_likelihood -
                                            prev_log_likelihood)

                # check whether we have completed enough iterations to have
                # a good estimation
                if stopping_criteria(log_likelihood_change, num_iterations):
                    break

            # set up for another round of iterations
            prev_log_likelihood = cur_log_likelihood
            num_iterations += 1

        return self._markov_model

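    # A hedged usage sketch. The stopping function below is illustrative,
    # and initial_model stands in for a Markov Model with starting estimates
    # (e.g. one built with Bio.HMM.MarkovModel.MarkovModelBuilder):
    #
    #     def stop_training(log_likelihood_change, num_iterations):
    #         # stop when the change is small or we have iterated enough
    #         return log_likelihood_change < 0.01 or num_iterations >= 10
    #
    #     trainer = BaumWelchTrainer(initial_model)
    #     trained_model = trainer.train(training_seqs, stop_training)
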
    def update_transitions(self, transition_counts, training_seq,
                           forward_vars, backward_vars, training_seq_prob):
        """Add the contribution of a new training sequence to the transitions.

        Arguments:

        o transition_counts -- A dictionary of the current counts for the
        transitions.

        o training_seq -- The training sequence we are working with.

        o forward_vars -- Probabilities calculated using the forward
        algorithm.

        o backward_vars -- Probabilities calculated using the backward
        algorithm.

        o training_seq_prob -- The probability of the current sequence.

        This calculates A_{kl} (the estimated transition counts from state
        k to state l) using formula 3.20 in Durbin et al.
        """
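        # The loop below implements the per-sequence contribution of
        # formula 3.20 in Durbin et al., where f and b are the forward and
        # backward variables and P(x) is the probability of the sequence x:
        #
        #     A_{kl} += (1 / P(x)) * sum_i f_{k}(i) a_{kl} e_{l}(x_{i+1}) b_{l}(i+1)
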
        # set up the transition and emission probabilities we are using
        transitions = self._markov_model.transition_prob
        emissions = self._markov_model.emission_prob

        # loop over the possible combinations of state path letters
        for k in training_seq.states.alphabet.letters:
            for l in self._markov_model.transitions_from(k):
                estimated_counts = 0
                # now loop over the entire training sequence
                for i in range(len(training_seq.emissions) - 1):
                    # the forward value of k at the current position
                    forward_value = forward_vars[(k, i)]

                    # the backward value of l in the next position
                    backward_value = backward_vars[(l, i + 1)]

                    # the probability of a transition from k to l
                    trans_value = transitions[(k, l)]

                    # the probability of getting the emission at the next pos
                    emm_value = emissions[(l, training_seq.emissions[i + 1])]

                    estimated_counts += (forward_value * trans_value *
                                         emm_value * backward_value)

                # update the transition approximation
                transition_counts[(k, l)] += (float(estimated_counts) /
                                              training_seq_prob)

        return transition_counts

    def update_emissions(self, emission_counts, training_seq,
                         forward_vars, backward_vars, training_seq_prob):
        """Add the contribution of a new training sequence to the emissions.

        Arguments:

        o emission_counts -- A dictionary of the current counts for the
        emissions.

        o training_seq -- The training sequence we are working with.

        o forward_vars -- Probabilities calculated using the forward
        algorithm.

        o backward_vars -- Probabilities calculated using the backward
        algorithm.

        o training_seq_prob -- The probability of the current sequence.

        This calculates E_{k}(b) (the estimated emission probability for
        emission letter b from state k) using formula 3.21 in Durbin et al.
        """
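        # The loop below implements the per-sequence contribution of
        # formula 3.21 in Durbin et al., summing over the positions i where
        # the emitted letter x_i equals b:
        #
        #     E_{k}(b) += (1 / P(x)) * sum_{i : x_i = b} f_{k}(i) b_{k}(i)
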
        # loop over the possible combinations of state path letters
        for k in training_seq.states.alphabet.letters:
            # now loop over all of the possible emissions
            for b in training_seq.emissions.alphabet.letters:
                expected_times = 0
                # finally loop over the entire training sequence
                for i in range(len(training_seq.emissions)):
                    # only count the forward and backward probability if the
                    # emission at the position is the same as b
                    if training_seq.emissions[i] == b:
                        # f_{k}(i) b_{k}(i)
                        expected_times += (forward_vars[(k, i)] *
                                           backward_vars[(k, i)])

                # add to E_{k}(b)
                emission_counts[(k, b)] += (float(expected_times) /
                                            training_seq_prob)

        return emission_counts

class KnownStateTrainer(AbstractTrainer):
    """Estimate probabilities with known state sequences.

    This should be used for direct estimation of emission and transition
    probabilities when both the state path and emission sequence are
    known for the training examples.
    """
    def __init__(self, markov_model):
        AbstractTrainer.__init__(self, markov_model)

    def train(self, training_seqs):
        """Estimate the Markov Model parameters with known state paths.

        This trainer requires that both the state path and the emissions
        are known for all of the training sequences in the list of
        TrainingSequence objects.
        This training will then count all of the transitions and emissions,
        and use these counts to estimate the parameters of the model.
        """
        # count up all of the transitions and emissions
        transition_counts = self._markov_model.get_blank_transitions()
        emission_counts = self._markov_model.get_blank_emissions()

        for training_seq in training_seqs:
            emission_counts = self._count_emissions(training_seq,
                                                    emission_counts)
            transition_counts = self._count_transitions(training_seq.states,
                                                        transition_counts)

        # update the markov model from the counts
        ml_transitions, ml_emissions = \
                        self.estimate_params(transition_counts,
                                             emission_counts)
        self._markov_model.transition_prob = ml_transitions
        self._markov_model.emission_prob = ml_emissions

        return self._markov_model

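    # A hedged usage sketch, assuming known_seqs is a list of TrainingSequence
    # objects whose state paths are filled in and initial_model is a Markov
    # Model set up with the matching state and emission alphabets:
    #
    #     trainer = KnownStateTrainer(initial_model)
    #     trained_model = trainer.train(known_seqs)
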
    def _count_emissions(self, training_seq, emission_counts):
        """Add emissions from the training sequence to the current counts.

        Arguments:

        o training_seq -- A TrainingSequence with states and emissions
        to get the counts from.

        o emission_counts -- The current emission counts to add to.
        """
        for index in range(len(training_seq.emissions)):
            cur_state = training_seq.states[index]
            cur_emission = training_seq.emissions[index]

            try:
                emission_counts[(cur_state, cur_emission)] += 1
            except KeyError:
                raise KeyError("Unexpected emission (%s, %s)"
                               % (cur_state, cur_emission))
        return emission_counts

    def _count_transitions(self, state_seq, transition_counts):
        """Add transitions from the training sequence to the current counts.

        Arguments:

        o state_seq -- A Seq object with the states of the current training
        sequence.

        o transition_counts -- The current transition counts to add to.
        """
        for cur_pos in range(len(state_seq) - 1):
            cur_state = state_seq[cur_pos]
            next_state = state_seq[cur_pos + 1]

            try:
                transition_counts[(cur_state, next_state)] += 1
            except KeyError:
                raise KeyError("Unexpected transition (%s, %s)" %
                               (cur_state, next_state))

        return transition_counts
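
    # For example, a state path of "FFLL" contributes one count each to the
    # (F, F), (F, L) and (L, L) transitions, leaving all other transition
    # counts unchanged.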