Back to index

python-biopython  1.60
test_HMMGeneral.py
Go to the documentation of this file.
00001 #!/usr/bin/env python
00002 """Test the HMM.MarkovModel and HMM.DynamicProgramming modules.
00003 
00004 Also tests Training methods.
00005 """
00006 # standard modules
00007 import unittest
00008 import math
00009 
00010 # biopython
00011 from Bio import Alphabet
00012 from Bio.Seq import Seq
00013 
00014 
00015 # stuff we are testing
00016 from Bio.HMM import MarkovModel
00017 from Bio.HMM import DynamicProgramming
00018 from Bio.HMM import Trainer
00019 
00020 # create some simple alphabets
00021 class NumberAlphabet(Alphabet.Alphabet):
00022     """Numbers as the states of the model.
00023     """
00024     letters = ['1', '2']
00025 
00026 class LetterAlphabet(Alphabet.Alphabet):
00027     """Letters as the emissions of the model.
00028     """
00029     letters = ['A', 'B']
00030 
00031 # -- helper functions
00032 def test_assertion(name, result, expected):
00033     """Helper function to test an assertion and print out a reasonable error.
00034     """
00035     assert result == expected, "Expected %s, got %s for %s" \
00036            % (expected, result, name)
00037     
00038 class MarkovModelBuilderTest(unittest.TestCase):
00039     def setUp(self):
00040         self.mm_builder = MarkovModel.MarkovModelBuilder(NumberAlphabet(),
00041                                                          LetterAlphabet())
00042 
00043     def test_test_initialize(self):
00044         """Making sure MarkovModelBuilder is initialized correctly.
00045         """
00046         expected_transition_prob = {}
00047         expected_transition_pseudo = {}
00048 
00049         expected_emission_prob = {('2', 'A'): 0, ('1', 'A'): 0,
00050                                   ('1', 'B'): 0, ('2', 'B'): 0}
00051         expected_emission_pseudo = {('2', 'A'): 1, ('1', 'A'): 1,
00052                                     ('1', 'B'): 1, ('2', 'B'): 1}
00053 
00054         assertions = []
00055         test_assertion("Transition prob", self.mm_builder.transition_prob,
00056                           expected_transition_prob)
00057         test_assertion("Transition pseudo",
00058                           self.mm_builder.transition_pseudo,
00059                           expected_transition_pseudo)
00060         test_assertion("Emission prob", self.mm_builder.emission_prob,
00061                            expected_emission_prob)
00062         test_assertion("Emission pseudo", self.mm_builder.emission_pseudo,
00063                            expected_emission_pseudo)
00064 
00065 
00066     def test_allow_all_transitions(self):
00067         """Testing allow_all_transitions.
00068         """
00069         self.mm_builder.allow_all_transitions()
00070 
00071         expected_prob = {('2', '1'): 0, ('1', '1'): 0,
00072                          ('1', '2'): 0, ('2', '2'): 0}
00073 
00074         expected_pseudo = {('2', '1'): 1, ('1', '1'): 1,
00075                            ('1', '2'): 1, ('2', '2'): 1}
00076 
00077         test_assertion("Probabilities", self.mm_builder.transition_prob,
00078                        expected_prob)
00079         
00080         test_assertion("Pseudo counts",  self.mm_builder.transition_pseudo,
00081                        expected_pseudo)
00082 
00083 
00084     def test_set_initial_probabilities(self):
00085         self.mm_builder.set_initial_probabilities({})
00086         test_assertion("Equal initial probabilities by default",
00087                        self.mm_builder.initial_prob, {'1': 0.5, '2': 0.5})
00088 
00089         # initial probability sum > 1, should raise an exception
00090         self.assertRaises(
00091             Exception,
00092             self.mm_builder.set_initial_probabilities,
00093             {'1': 0.6, '2': 0.5})
00094 
00095         # referencing invalid states should raise an exception
00096         self.assertRaises(
00097             Exception,
00098             self.mm_builder.set_initial_probabilities,
00099             {'666': 0.1})
00100 
00101         self.mm_builder.set_initial_probabilities({'1': 0.2})
00102         test_assertion("One default initial probability",
00103                        self.mm_builder.initial_prob, {'1': 0.2, '2': 0.8})
00104 
00105         self.mm_builder.set_initial_probabilities({'1': 0.9, '2': 0.1})
00106         test_assertion("Set initial probabilities",
00107                        self.mm_builder.initial_prob, {'1': 0.9, '2': 0.1})
00108 
00109     def test_set_equal_probabilities(self):
00110         self.mm_builder.allow_transition('1', '2', 0.05)
00111         self.mm_builder.allow_transition('2', '1', 0.95)
00112         self.mm_builder.set_equal_probabilities()
00113 
00114         test_assertion("Equal initial probabilities",
00115                        self.mm_builder.initial_prob,
00116                        {'1': 0.5, '2': 0.5})
00117         test_assertion("Equal transition probabilities",
00118                        self.mm_builder.transition_prob,
00119                        {('1', '2'): 0.5, ('2', '1'): 0.5})
00120         test_assertion("Equal emission probabilities",
00121                        self.mm_builder.emission_prob,
00122                        {('2', 'A'): 0.25, ('1', 'B'): 0.25,
00123                         ('1', 'A'): 0.25, ('2', 'B'): 0.25})
00124 
00125     def test_set_random_probabilities(self):
00126         self.mm_builder.allow_transition('1', '2', 0.05)
00127         self.mm_builder.allow_transition('2', '1', 0.95)
00128         self.mm_builder.set_random_probabilities()
00129 
00130         test_assertion("Number of initial probabilities",
00131                        len(self.mm_builder.initial_prob),
00132                        len(self.mm_builder._state_alphabet.letters))
00133         # To test this more thoroughly, perhaps mock random.random() and
00134         # verify that it's being called as expected?
00135 
00136 class HiddenMarkovModelTest(unittest.TestCase):
00137     def setUp(self):
00138         self.mm_builder = MarkovModel.MarkovModelBuilder(NumberAlphabet(),
00139                                                     LetterAlphabet())
00140 
00141     def test_transitions_from(self):
00142         """Testing the calculation of transitions_from
00143         """
00144         self.mm_builder.allow_transition('1', '2', 1.0)
00145         self.mm_builder.allow_transition('2', '1', 0.5)
00146         self.mm_builder.allow_transition('2', '2', 0.5)
00147         self.mm_builder.set_initial_probabilities({})
00148         self.mm = self.mm_builder.get_markov_model()
00149 
00150         state_1 = self.mm.transitions_from("1")
00151         expected_state_1 = ["2"]
00152         state_1.sort()
00153         expected_state_1.sort()
00154         test_assertion("States reached by transitions from state 1",
00155                        state_1, expected_state_1)
00156 
00157         state_2 = self.mm.transitions_from("2")
00158         expected_state_2 = ["1", "2"]
00159         state_2.sort()
00160         expected_state_2.sort()
00161         test_assertion("States reached by transitions from state 2",
00162                        state_2, expected_state_2)
00163 
00164         fake_state = self.mm.transitions_from("Fake")
00165         expected_fake_state = []
00166         test_assertion("States reached by transitions from a fake transition",
00167                        fake_state, expected_fake_state)
00168 
00169     def test_transitions_to(self):
00170         """Testing the calculation of transitions_to
00171         """
00172         self.mm_builder.allow_transition('1', '1', 0.5)
00173         self.mm_builder.allow_transition('1', '2', 0.5)
00174         self.mm_builder.allow_transition('2', '1', 1.0)
00175         self.mm_builder.set_initial_probabilities({})
00176         self.mm = self.mm_builder.get_markov_model()
00177 
00178         state_1 = self.mm.transitions_to("1")
00179         expected_state_1 = ["1", "2"]
00180         state_1.sort()
00181         expected_state_1.sort()
00182         test_assertion("States with transitions to state 1",
00183                        state_1, expected_state_1)
00184 
00185         state_2 = self.mm.transitions_to("2")
00186         expected_state_2 = ["1"]
00187         state_2.sort()
00188         expected_state_2.sort()
00189         test_assertion("States with transitions to state 2",
00190                        state_2, expected_state_2)
00191 
00192         fake_state = self.mm.transitions_to("Fake")
00193         expected_fake_state = []
00194         test_assertion("States with transitions to a fake transition",
00195                        fake_state, expected_fake_state)
00196 
00197     def test_allow_transition(self):
00198         """Testing allow_transition
00199         """
00200         self.mm_builder.allow_transition('1', '2', 1.0)
00201         self.mm_builder.set_initial_probabilities({})
00202         self.mm = self.mm_builder.get_markov_model()
00203 
00204         state_1 = self.mm.transitions_from("1")
00205         expected_state_1 = ["2"]
00206         state_1.sort()
00207         expected_state_1.sort()
00208         test_assertion("States reached by transitions from state 1",
00209                        state_1, expected_state_1)
00210 
00211         state_2 = self.mm.transitions_from("2")
00212         expected_state_2 = []
00213         state_2.sort()
00214         expected_state_2.sort()
00215         test_assertion("States reached by transitions from state 2",
00216                        state_2, expected_state_2)
00217 
00218         state_1 = self.mm.transitions_to("1")
00219         expected_state_1 = []
00220         state_1.sort()
00221         expected_state_1.sort()
00222         test_assertion("States with transitions to state 1",
00223                        state_1, expected_state_1)
00224 
00225         state_2 = self.mm.transitions_to("2")
00226         expected_state_2 = ["1"]
00227         state_2.sort()
00228         expected_state_2.sort()
00229         test_assertion("States with transitions to state 2",
00230                        state_2, expected_state_2)
00231 
00232     def test_simple_hmm(self):
00233         """Test a simple model with 2 states and 2 symbols.
00234         """
00235 
00236         # set initial probabilities
00237         prob_initial = [0.4, 0.6]
00238         self.mm_builder.set_initial_probabilities(
00239                 {'1': prob_initial[0], '2': prob_initial[1]})
00240         
00241         # set transition probabilities
00242         prob_transition = [[0.35, 0.65], [0.45, 0.55]]
00243         self.mm_builder.allow_transition('1', '1', prob_transition[0][0])
00244         self.mm_builder.allow_transition('1', '2', prob_transition[0][1])
00245         self.mm_builder.allow_transition('2', '1', prob_transition[1][0])
00246         self.mm_builder.allow_transition('2', '2', prob_transition[1][1])
00247 
00248         # set emission probabilities
00249         prob_emission = [[0.45, 0.55], [0.75, 0.25]]
00250         self.mm_builder.set_emission_score('1', 'A', prob_emission[0][0])
00251         self.mm_builder.set_emission_score('1', 'B', prob_emission[0][1])
00252         self.mm_builder.set_emission_score('2', 'A', prob_emission[1][0])
00253         self.mm_builder.set_emission_score('2', 'B', prob_emission[1][1])
00254 
00255         # Check all two letter sequences using a brute force calculation
00256         model = self.mm_builder.get_markov_model()
00257         for first_letter in LetterAlphabet.letters:
00258             for second_letter in LetterAlphabet.letters:
00259                 observed_emissions = [first_letter, second_letter]
00260                 viterbi = model.viterbi(observed_emissions, NumberAlphabet)
00261                 self._checkSimpleHmm(prob_initial, prob_transition,
00262                                  prob_emission, viterbi, observed_emissions)
00263 
00264     def _checkSimpleHmm(self, prob_initial, prob_transition, prob_emission,
00265                          viterbi, observed_emissions):
00266         max_prob = 0
00267 
00268         # expected first and second states in the sequence, calculated below
00269         seq_first_state = None
00270         seq_second_state = None
00271 
00272         # convert the observed letters 'A' or 'B' into 0 or 1
00273         letter1 = ord(observed_emissions[0]) - ord('A')
00274         letter2 = ord(observed_emissions[1]) - ord('A')
00275 
00276         for first_state in NumberAlphabet.letters:
00277             for second_state in NumberAlphabet.letters:
00278                 # compute the probability of the state sequence first_state,
00279                 # second_state emitting the observed_emissions
00280                 state1 = ord(first_state) - ord('1')
00281                 state2 = ord(second_state) - ord('1')
00282                 prob = prob_initial[state1] * prob_emission[state1][letter1] *\
00283                     prob_transition[state1][state2] *\
00284                     prob_emission[state2][letter2]
00285                 if prob > max_prob:
00286                     seq_first_state = first_state
00287                     seq_second_state = second_state
00288                     max_prob = prob
00289 
00290         max_prob = math.log(max_prob)
00291         seq = viterbi[0]
00292         prob = viterbi[1]
00293         test_assertion("state sequence",
00294                        str(seq),
00295                        seq_first_state + seq_second_state)
00296         test_assertion("log probability", round(prob, 11), round(max_prob, 11))
00297 
00298     def test_non_ergodic(self):
00299         """Test a non-ergodic model (meaning that some transitions are not
00300         allowed).
00301         """
00302 
00303         # make state '1' the initial state
00304         prob_1_initial = 1.0
00305         self.mm_builder.set_initial_probabilities(
00306                 {'1': prob_1_initial})
00307 
00308         # probabilities of transitioning from state 1 to 1, and 1 to 2
00309         prob_1_to_1 = 0.5
00310         prob_1_to_2 = 0.5
00311 
00312         # set up allowed transitions
00313         self.mm_builder.allow_transition('1', '1', prob_1_to_1)
00314         self.mm_builder.allow_transition('1', '2', prob_1_to_2)
00315 
00316         # Emission probabilities
00317         # In state 1 the most likely emission is A, in state 2 the most
00318         # likely emission is B. (Would be simpler just to use 1.0 and 0.0
00319         # emission probabilities here, but the algorithm blows up on zero
00320         # probabilities because of the conversion to log space.)
00321         prob_1_A = 0.95
00322         prob_1_B = 0.05
00323         prob_2_A = 0.05
00324         prob_2_B = 0.95
00325 
00326         # set emission probabilities
00327         self.mm_builder.set_emission_score('1', 'A', prob_1_A)
00328         self.mm_builder.set_emission_score('1', 'B', prob_1_B)
00329         self.mm_builder.set_emission_score('2', 'A', prob_2_A)
00330         self.mm_builder.set_emission_score('2', 'B', prob_2_B)
00331 
00332         # run the Viterbi algorithm to find the most probable state path
00333         model = self.mm_builder.get_markov_model()
00334         observed_emissions = ['A', 'B']
00335         viterbi = model.viterbi(observed_emissions, NumberAlphabet)
00336         seq = viterbi[0]
00337         prob = viterbi[1]
00338 
00339         # the most probable path must be from state 1 to state 2
00340         test_assertion("most probable path", str(seq), '12')
00341 
00342         # The probability of that path is the probability of starting in 
00343         # state 1, then emitting an A, then transitioning 1 -> 2, then 
00344         # emitting a B.
00345         # Note that probabilities are converted into log space.
00346         expected_prob = math.log(prob_1_initial)\
00347         + math.log(prob_1_A)\
00348         + math.log(prob_1_to_2)\
00349         + math.log(prob_2_B)
00350         test_assertion("log probability of most probable path",
00351                        prob, expected_prob)
00352 
00353 class ScaledDPAlgorithmsTest(unittest.TestCase):
00354     def setUp(self):
00355         # set up our Markov Model
00356         mm_builder = MarkovModel.MarkovModelBuilder(NumberAlphabet(),
00357                                                     LetterAlphabet())
00358         mm_builder.allow_all_transitions()
00359         mm_builder.set_equal_probabilities()
00360 
00361         mm = mm_builder.get_markov_model()
00362 
00363         # now set up a test sequence
00364         emission_seq = Seq("ABB", LetterAlphabet())
00365         state_seq = Seq("", NumberAlphabet())
00366         training_seq = Trainer.TrainingSequence(emission_seq, state_seq)
00367 
00368         # finally set up the DP
00369         self.dp = DynamicProgramming.ScaledDPAlgorithms(mm, training_seq)
00370         
00371     def test_calculate_s_value(self):
00372         """Testing the calculation of s values.
00373         """
00374         previous_vars = {('1', 0) : .5,
00375                          ('2', 0) : .7}
00376         s_value = self.dp._calculate_s_value(1, previous_vars)
00377 
00378         # print s_value
00379 
00380 class AbstractTrainerTest(unittest.TestCase):
00381     def setUp(self):
00382         # set up a bogus HMM and our trainer
00383         hmm = MarkovModel.HiddenMarkovModel({}, {}, {}, {}, {})
00384         self.test_trainer = Trainer.AbstractTrainer(hmm)
00385     
00386     def test_ml_estimator(self):
00387         """Test the maximum likelihood estimator for simple cases.
00388         """
00389         # set up a simple dictionary
00390         counts = {('A', 'A') : 10,
00391                   ('A', 'B') : 20,
00392                   ('A', 'C') : 15,
00393                   ('B', 'B') : 5,
00394                   ('C', 'A') : 15,
00395                   ('C', 'C') : 10}
00396 
00397         results = self.test_trainer.ml_estimator(counts)
00398 
00399         # now make sure we are getting back the right thing
00400         result_tests = []
00401         result_tests.append([('A', 'A'), float(10) / float(45)])
00402         result_tests.append([('A', 'B'), float(20) / float(45)])
00403         result_tests.append([('A', 'C'), float(15) / float(45)])
00404         result_tests.append([('B', 'B'), float(5) / float(5)])
00405         result_tests.append([('C', 'A'), float(15) / float(25)])
00406         result_tests.append([('C', 'C'), float(10) / float(25)])
00407 
00408         for test_result in result_tests:
00409             assert results[test_result[0]] == test_result[1], \
00410                    "Got %f, expected %f for %s" % (results[test_result[0]],
00411                                                    test_result[1],
00412                                                    test_result[0])
00413 
00414     def test_log_likelihood(self):
00415         """Calculate log likelihood.
00416         """
00417         probs = [.25, .13, .12, .17]
00418 
00419         log_prob = self.test_trainer.log_likelihood(probs)
00420         expected_log_prob = -7.31873556778
00421         assert abs(expected_log_prob - log_prob) < 0.1, \
00422           "Bad probability calculated: %s" % log_prob
00423 
00424 # run the tests
00425 if __name__ == "__main__":
00426     runner = unittest.TextTestRunner(verbosity = 2)
00427     unittest.main(testRunner=runner)