
python-biopython  1.60
Bio.MarkovModel Namespace Reference

Classes

class  MarkovModel

Functions

def logaddexp
def itemindex
def _readline_and_check_start
def load
def save
def train_bw
def _baum_welch
def _baum_welch_one
def _forward
def _backward
def train_visible
def _mle
def _argmaxes
def find_states
def _viterbi
def _normalize
def _uniform_norm
def _random_norm
def _copy_and_check
def _logsum
def _logvecadd
def _exp_logsum

Variables

 logaddexp = numpy.logaddexp
float VERY_SMALL_NUMBER = 1e-300
float LOG0 = numpy.log(VERY_SMALL_NUMBER)
int MAX_ITERATIONS = 1000

Detailed Description

This is an implementation of a state-emitting MarkovModel.  I am using
terminology similar to Manning and Schütze.



Functions:
train_bw        Train a markov model using the Baum-Welch algorithm.
train_visible   Train a visible markov model using MLE.
find_states     Find a state sequence that explains some observations.

load            Load a MarkovModel.
save            Save a MarkovModel.

Classes:
MarkovModel     Holds the description of a markov model
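
A minimal end-to-end sketch (hypothetical state and symbol names, not from
the module itself): train a small hidden model with Baum-Welch, then decode
a new observation sequence with Viterbi.

    from Bio import MarkovModel

    # Hypothetical two-state weather model over three observable activities.
    states = ["rainy", "sunny"]
    alphabet = ["walk", "shop", "clean"]
    training_data = [["walk", "shop", "clean", "clean"],
                     ["walk", "walk", "shop", "walk"]]

    mm = MarkovModel.train_bw(states, alphabet, training_data)
    results = MarkovModel.find_states(mm, ["walk", "clean", "shop"])
    for state_path, score in results:
        print(state_path)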

Function Documentation

def Bio.MarkovModel._argmaxes (   vector,
  allowance = None 
) [private]

Definition at line 463 of file MarkovModel.py.

def _argmaxes(vector, allowance=None):
    # NOTE: despite the allowance parameter, only the single best
    # index is returned, so ties are not reported.
    return [numpy.argmax(vector)]


def Bio.MarkovModel._backward (   N,
  T,
  lp_transition,
  lp_emission,
  outputs 
) [private]

Definition at line 356 of file MarkovModel.py.

def _backward(N, T, lp_transition, lp_emission, outputs):
    matrix = numpy.zeros((N, T+1))
    for t in range(T-1, -1, -1):
        k = outputs[t]
        for i in range(N):
            # The probability of the state is the sum of the
            # transitions from all the states from time t+1.
            lprob = LOG0
            for j in range(N):
                lp = matrix[j][t+1] + \
                     lp_transition[i][j] + \
                     lp_emission[i][k]
                lprob = logaddexp(lprob, lp)
            matrix[i][t] = lprob
    return matrix
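
The doubly nested loop over states is a log-space matrix-vector product.
A vectorized sketch of the same recurrence (an editorial reimplementation,
not the module's code), assuming a numpy that ships the logaddexp ufunc:

    import numpy

    def backward_vectorized(N, T, lp_transition, lp_emission, outputs):
        # Same recurrence as _backward, with the sum over successor
        # states j done by broadcasting instead of a Python loop.
        matrix = numpy.zeros((N, T+1))
        for t in range(T-1, -1, -1):
            k = outputs[t]
            # lp[i, j] = matrix[j, t+1] + lp_transition[i, j] + lp_emission[i, k]
            lp = matrix[:, t+1][numpy.newaxis, :] + lp_transition \
                 + lp_emission[:, k][:, numpy.newaxis]
            matrix[:, t] = numpy.logaddexp.reduce(lp, axis=1)
        return matrix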


def Bio.MarkovModel._baum_welch (   N,
  M,
  training_outputs,
  p_initial = None,
  p_transition = None,
  p_emission = None,
  pseudo_initial = None,
  pseudo_transition = None,
  pseudo_emission = None,
  update_fn = None 
) [private]

Definition at line 195 of file MarkovModel.py.

def _baum_welch(N, M, training_outputs,
                p_initial=None, p_transition=None, p_emission=None,
                pseudo_initial=None, pseudo_transition=None,
                pseudo_emission=None, update_fn=None):
    # Returns (p_initial, p_transition, p_emission)
    if p_initial is None:
        p_initial = _random_norm(N)
    else:
        p_initial = _copy_and_check(p_initial, (N,))
    if p_transition is None:
        p_transition = _random_norm((N,N))
    else:
        p_transition = _copy_and_check(p_transition, (N,N))
    if p_emission is None:
        p_emission = _random_norm((N,M))
    else:
        p_emission = _copy_and_check(p_emission, (N,M))

    # Do all the calculations in log space to avoid underflows.
    lp_initial, lp_transition, lp_emission = map(
        numpy.log, (p_initial, p_transition, p_emission))
    if pseudo_initial is not None:
        lpseudo_initial = numpy.log(pseudo_initial)
    else:
        lpseudo_initial = None
    if pseudo_transition is not None:
        lpseudo_transition = numpy.log(pseudo_transition)
    else:
        lpseudo_transition = None
    if pseudo_emission is not None:
        lpseudo_emission = numpy.log(pseudo_emission)
    else:
        lpseudo_emission = None

    # Iterate through each sequence of output, updating the parameters
    # of the HMM.  Stop when the log likelihoods of the sequences
    # stop varying.
    prev_llik = None
    for i in range(MAX_ITERATIONS):
        llik = LOG0
        for outputs in training_outputs:
            x = _baum_welch_one(
                N, M, outputs,
                lp_initial, lp_transition, lp_emission,
                lpseudo_initial, lpseudo_transition, lpseudo_emission)
            llik += x
        if update_fn is not None:
            update_fn(i, llik)
        if prev_llik is not None and numpy.fabs(prev_llik-llik) < 0.1:
            break
        prev_llik = llik
    else:
        # The for loop's else clause runs only if we never hit break,
        # i.e. the training failed to converge.
        raise RuntimeError("HMM did not converge in %d iterations"
                           % MAX_ITERATIONS)

    # Return everything back in normal space.
    return map(numpy.exp, (lp_initial, lp_transition, lp_emission))


def Bio.MarkovModel._baum_welch_one (   N,
  M,
  outputs,
  lp_initial,
  lp_transition,
  lp_emission,
  lpseudo_initial,
  lpseudo_transition,
  lpseudo_emission 
) [private]

Definition at line 253 of file MarkovModel.py.

def _baum_welch_one(N, M, outputs,
                    lp_initial, lp_transition, lp_emission,
                    lpseudo_initial, lpseudo_transition, lpseudo_emission):
    # Do one iteration of Baum-Welch based on a sequence of output.
    # NOTE: This will change the values of lp_initial, lp_transition,
    # and lp_emission in place.
    T = len(outputs)
    fmat = _forward(N, T, lp_initial, lp_transition, lp_emission, outputs)
    bmat = _backward(N, T, lp_transition, lp_emission, outputs)

    # Calculate the probability of traversing each arc for any given
    # transition.
    lp_arc = numpy.zeros((N, N, T))
    for t in range(T):
        k = outputs[t]
        lp_traverse = numpy.zeros((N, N))   # P of going over one arc.
        for i in range(N):
            for j in range(N):
                # P(getting to this arc)
                # P(making this transition)
                # P(emitting this character)
                # P(going to the end)
                lp = fmat[i][t] + \
                     lp_transition[i][j] + \
                     lp_emission[i][k] + \
                     bmat[j][t+1]
                lp_traverse[i][j] = lp
        # Normalize the probability for this time step.
        lp_arc[:,:,t] = lp_traverse - _logsum(lp_traverse)

    # Sum of all the transitions out of state i at time t.
    lp_arcout_t = numpy.zeros((N, T))
    for t in range(T):
        for i in range(N):
            lp_arcout_t[i][t] = _logsum(lp_arc[i,:,t])

    # Sum of all the transitions out of state i.
    lp_arcout = numpy.zeros(N)
    for i in range(N):
        lp_arcout[i] = _logsum(lp_arcout_t[i,:])

    # UPDATE P_INITIAL.  Slice-assign so the caller's array really is
    # updated in place (plain rebinding of the name would be lost).
    lp_initial[:] = lp_arcout_t[:,0]
    if lpseudo_initial is not None:
        lp_initial[:] = _logvecadd(lp_initial, lpseudo_initial)
        lp_initial[:] = lp_initial - _logsum(lp_initial)

    # UPDATE P_TRANSITION.  p_transition[i][j] is the sum of all the
    # transitions from i to j, normalized by the sum of the
    # transitions out of i.
    for i in range(N):
        for j in range(N):
            lp_transition[i][j] = _logsum(lp_arc[i,j,:]) - lp_arcout[i]
        if lpseudo_transition is not None:
            lp_transition[i] = _logvecadd(lp_transition[i], lpseudo_transition)
            lp_transition[i] = lp_transition[i] - _logsum(lp_transition[i])

    # UPDATE P_EMISSION.  lp_emission[i][k] is the sum of all the
    # transitions out of i when k is observed, divided by the sum of
    # the transitions out of i.
    for i in range(N):
        ksum = numpy.zeros(M)+LOG0    # ksum[k] is the sum of all i with k.
        for t in range(T):
            k = outputs[t]
            for j in range(N):
                ksum[k] = logaddexp(ksum[k], lp_arc[i,j,t])
        ksum = ksum - _logsum(ksum)      # Normalize.
        if lpseudo_emission is not None:
            ksum = _logvecadd(ksum, lpseudo_emission[i])
            ksum = ksum - _logsum(ksum)  # Renormalize.
        lp_emission[i,:] = ksum

    # Calculate the log likelihood of the output based on the forward
    # matrix.  Since the parameters of the HMM have changed, the log
    # likelihoods are going to be a step behind, and we might be doing
    # one extra iteration of training.  The alternative is to rerun
    # the _forward algorithm and calculate from the clean one, but
    # that may be more expensive than overshooting the training by one
    # step.
    return _logsum(fmat[:,T])
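
In conventional HMM notation (an editorial gloss, not from the module),
lp_arc[i, j, t] is the log of the normalized arc posterior

    \xi_t(i,j) = \frac{\alpha_t(i)\, a_{ij}\, b_i(o_t)\, \beta_{t+1}(j)}
                      {\sum_{i'}\sum_{j'} \alpha_t(i')\, a_{i'j'}\, b_{i'}(o_t)\, \beta_{t+1}(j')}

with \alpha from _forward and \beta from _backward, a the transition and
b the emission probabilities.  As elsewhere in this module, the emission
probability is attached to the source state of the arc.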


def Bio.MarkovModel._copy_and_check (   matrix,
  desired_shape 
) [private]

Definition at line 553 of file MarkovModel.py.

def _copy_and_check(matrix, desired_shape):
    # Copy the matrix.
    matrix = numpy.array(matrix, copy=1)
    # Check the dimensions.
    if matrix.shape != desired_shape:
        raise ValueError("Incorrect dimension")
    # Make sure it's normalized.
    if len(matrix.shape) == 1:
        if numpy.fabs(sum(matrix)-1.0) > 0.01:
            raise ValueError("matrix not normalized to 1.0")
    elif len(matrix.shape) == 2:
        for i in range(len(matrix)):
            if numpy.fabs(sum(matrix[i])-1.0) > 0.01:
                raise ValueError("matrix %d not normalized to 1.0" % i)
    else:
        raise ValueError("I don't handle matrices > 2 dimensions")
    return matrix


def Bio.MarkovModel._exp_logsum (   numbers) [private]

Definition at line 588 of file MarkovModel.py.

def _exp_logsum(numbers):
    sum = _logsum(numbers)
    return numpy.exp(sum)


def Bio.MarkovModel._forward (   N,
  T,
  lp_initial,
  lp_transition,
  lp_emission,
  outputs 
) [private]

Definition at line 333 of file MarkovModel.py.

def _forward(N, T, lp_initial, lp_transition, lp_emission, outputs):
    # Implement the forward algorithm.  This actually calculates an
    # N x (T+1) matrix, where the last column is the total probability
    # of the output.

    matrix = numpy.zeros((N, T+1))

    # Initialize the first column to be the initial values.
    matrix[:,0] = lp_initial
    for t in range(1, T+1):
        k = outputs[t-1]
        for j in range(N):
            # The probability of the state is the sum of the
            # transitions from all the states from time t-1.
            lprob = LOG0
            for i in range(N):
                lp = matrix[i][t-1] + \
                     lp_transition[i][j] + \
                     lp_emission[i][k]
                lprob = logaddexp(lprob, lp)
            matrix[j][t] = lprob
    return matrix
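
In conventional notation (an editorial gloss on the code above), the
recurrence is

    \alpha_0(i) = \pi_i, \qquad
    \alpha_t(j) = \sum_{i=1}^{N} \alpha_{t-1}(i)\, a_{ij}\, b_i(o_t),
    \qquad t = 1, \ldots, T,

evaluated entirely in log space with logaddexp.  The extra column is the
point of the exercise: the total probability of the output is
\sum_j \alpha_T(j), which _baum_welch_one reads off as _logsum(fmat[:,T]).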


def Bio.MarkovModel._logsum (   matrix) [private]

Definition at line 571 of file MarkovModel.py.

def _logsum(matrix):
    if len(matrix.shape) > 1:
        vec = numpy.reshape(matrix, (numpy.product(matrix.shape),))
    else:
        vec = matrix
    sum = LOG0
    for num in vec:
        sum = logaddexp(sum, num)
    return sum
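
On a numpy recent enough to ship the logaddexp ufunc, the explicit fold is
a one-line reduction (a sketch; the LOG0 seed above only matters for empty
input):

    import numpy

    def logsum_reduce(matrix):
        # Reduce logaddexp over all elements, like _logsum above.
        return numpy.logaddexp.reduce(matrix.ravel())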


def Bio.MarkovModel._logvecadd (   logvec1,
  logvec2 
) [private]

Definition at line 581 of file MarkovModel.py.

def _logvecadd(logvec1, logvec2):
    assert len(logvec1) == len(logvec2), "vectors aren't the same length"
    sumvec = numpy.zeros(len(logvec1))
    for i in range(len(logvec1)):
        sumvec[i] = logaddexp(logvec1[i], logvec2[i])
    return sumvec
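
Likewise, since logaddexp is a binary ufunc, the elementwise loop collapses
to a broadcast call (a sketch, not the module's code):

    import numpy

    def logvecadd_vectorized(logvec1, logvec2):
        # Elementwise log-space addition by broadcasting.
        return numpy.logaddexp(logvec1, logvec2)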


def Bio.MarkovModel._mle (   N,
  M,
  training_outputs,
  training_states,
  pseudo_initial,
  pseudo_transition,
  pseudo_emission 
) [private]

Definition at line 427 of file MarkovModel.py.

def _mle(N, M, training_outputs, training_states, pseudo_initial,
         pseudo_transition, pseudo_emission):
    # p_initial is the probability that a sequence of states starts
    # off with a particular one.
    p_initial = numpy.zeros(N)
    if pseudo_initial is not None:
        p_initial = p_initial + pseudo_initial
    for states in training_states:
        p_initial[states[0]] += 1
    p_initial = _normalize(p_initial)

    # p_transition is the probability that a state leads to the next
    # one.  C(i,j)/C(i) where i and j are states.
    p_transition = numpy.zeros((N,N))
    if pseudo_transition is not None:
        p_transition = p_transition + pseudo_transition
    for states in training_states:
        for n in range(len(states)-1):
            i, j = states[n], states[n+1]
            p_transition[i, j] += 1
    for i in range(len(p_transition)):
        p_transition[i,:] = p_transition[i,:] / sum(p_transition[i,:])

    # p_emission is the probability of an output given a state.
    # C(s,o)/C(s) where o is an output and s is a state.  Start from
    # ones (add-one smoothing) so that no emission gets probability
    # zero, then add any pseudo-counts.
    p_emission = numpy.ones((N,M))
    if pseudo_emission is not None:
        p_emission = p_emission + pseudo_emission
    for outputs, states in zip(training_outputs, training_states):
        for o, s in zip(outputs, states):
            p_emission[s, o] += 1
    for i in range(len(p_emission)):
        p_emission[i,:] = p_emission[i,:] / sum(p_emission[i,:])

    return p_initial, p_transition, p_emission
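
The estimates are relative frequencies (editorial notation; the counts
C include any pseudo-counts added before normalization):

    \hat{\pi}_i = \frac{C_{\mathrm{start}}(i)}{\sum_{i'} C_{\mathrm{start}}(i')}, \qquad
    \hat{a}_{ij} = \frac{C(i \to j)}{\sum_{j'} C(i \to j')}, \qquad
    \hat{b}_i(k) = \frac{C(i,k)}{\sum_{k'} C(i,k')}

where C_start(i) counts sequences beginning in state i, C(i -> j) counts
observed i-to-j transitions, and C(i,k) counts emissions of symbol k from
state i.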
          


def Bio.MarkovModel._normalize (   matrix) [private]

Definition at line 533 of file MarkovModel.py.

def _normalize(matrix):
    # Make sure numbers add up to 1.0.
    if len(matrix.shape) == 1:
        matrix = matrix / float(sum(matrix))
    elif len(matrix.shape) == 2:
        # Normalize by rows.
        for i in range(len(matrix)):
            matrix[i,:] = matrix[i,:] / sum(matrix[i,:])
    else:
        raise ValueError("I cannot handle matrices of that shape")
    return matrix
    


def Bio.MarkovModel._random_norm (   shape) [private]

Definition at line 549 of file MarkovModel.py.

def _random_norm(shape):
    matrix = numpy.random.random(shape)
    return _normalize(matrix)


def Bio.MarkovModel._readline_and_check_start (   handle,
  start 
) [private]

Definition at line 67 of file MarkovModel.py.

def _readline_and_check_start(handle, start):
    line = handle.readline()
    if not line.startswith(start):
        raise ValueError("I expected %r but got %r" % (start, line))
    return line


def Bio.MarkovModel._uniform_norm (   shape) [private]

Definition at line 545 of file MarkovModel.py.

def _uniform_norm(shape):
    matrix = numpy.ones(shape)
    return _normalize(matrix)


def Bio.MarkovModel._viterbi (   N,
  lp_initial,
  lp_transition,
  lp_emission,
  output 
) [private]

Definition at line 489 of file MarkovModel.py.

def _viterbi(N, lp_initial, lp_transition, lp_emission, output):
    # The Viterbi algorithm finds the most likely set of states for a
    # given output.  Returns a list of states.

    T = len(output)
    # Store the backtrace in an N x T matrix.
    backtrace = []    # list of indexes of states in previous timestep.
    for i in range(N):
        backtrace.append([None] * T)

    # Store the best scores.
    scores = numpy.zeros((N, T))
    scores[:,0] = lp_initial + lp_emission[:,output[0]]
    for t in range(1, T):
        k = output[t]
        for j in range(N):
            # Find the most likely place it came from.
            i_scores = scores[:,t-1] + \
                       lp_transition[:,j] + \
                       lp_emission[j,k]
            indexes = _argmaxes(i_scores)
            scores[j,t] = i_scores[indexes[0]]
            backtrace[j][t] = indexes

    # Do the backtrace.  First, find a good place to start.  Then,
    # we'll follow the backtrace matrix to find the list of states.
    # In the event of ties, there may be multiple paths back through
    # the matrix, which implies a recursive solution.  We'll simulate
    # it by keeping our own stack.
    in_process = []    # list of (t, states, score)
    results = []       # return values.  list of (states, score)
    indexes = _argmaxes(scores[:,T-1])      # pick the first place
    for i in indexes:
        in_process.append((T-1, [i], scores[i][T-1]))
    while in_process:
        t, states, score = in_process.pop()
        if t == 0:
            results.append((states, score))
        else:
            indexes = backtrace[states[0]][t]
            for i in indexes:
                in_process.append((t-1, [i]+states, score))
    return results
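
In editorial notation, the recurrence implemented here is

    \delta_0(i) = \log \pi_i + \log b_i(o_0), \qquad
    \delta_t(j) = \max_{i}\left[\delta_{t-1}(i) + \log a_{ij}\right] + \log b_j(o_t).

Note that the emission term uses the destination state j, the conventional
choice, whereas _forward and _backward above attach it to the source state.
backtrace[j][t] holds a list of argmax indexes so that ties could yield
multiple paths, although _argmaxes currently returns only a single index.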


def Bio.MarkovModel.find_states (   markov_model,
  output 
)
find_states(markov_model, output) -> list of (states, score)

Definition at line 466 of file MarkovModel.py.

def find_states(markov_model, output):
    """find_states(markov_model, output) -> list of (states, score)"""
    mm = markov_model
    N = len(mm.states)

    # _viterbi does calculations in log space.  Add a tiny bit to the
    # matrices so that the logs will not break.
    x = mm.p_initial + VERY_SMALL_NUMBER
    y = mm.p_transition + VERY_SMALL_NUMBER
    z = mm.p_emission + VERY_SMALL_NUMBER
    lp_initial, lp_transition, lp_emission = map(numpy.log, (x, y, z))
    # Change output into a list of indexes into the alphabet.
    indexes = itemindex(mm.alphabet)
    output = [indexes[x] for x in output]

    # Run the viterbi algorithm.
    results = _viterbi(N, lp_initial, lp_transition, lp_emission, output)

    for i in range(len(results)):
        states, score = results[i]
        results[i] = [mm.states[x] for x in states], numpy.exp(score)
    return results


def Bio.MarkovModel.itemindex (   values)

Definition at line 40 of file MarkovModel.py.

def itemindex(values):
    d = {}
    entries = enumerate(values[::-1])
    n = len(values)-1
    for index, key in entries:
        d[key] = n-index
    return d
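
Because the iteration runs over the reversed list, later assignments
overwrite earlier ones, so each value ends up mapped to the index of its
first occurrence.  An equivalent, more direct sketch (editorial, with a
hypothetical name):

    def itemindex_first(values):
        # Map each value to the index of its first occurrence.
        d = {}
        for i, key in enumerate(values):
            if key not in d:
                d[key] = i
        return d

    # e.g. itemindex(["a", "b", "a"]) == {"a": 0, "b": 1}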


def Bio.MarkovModel.load (   handle)
load(handle) -> MarkovModel()

Definition at line 73 of file MarkovModel.py.

def load(handle):
    """load(handle) -> MarkovModel()"""
    # Load the states.
    line = _readline_and_check_start(handle, "STATES:")
    states = line.split()[1:]

    # Load the alphabet.
    line = _readline_and_check_start(handle, "ALPHABET:")
    alphabet = line.split()[1:]

    mm = MarkovModel(states, alphabet)
    N, M = len(states), len(alphabet)

    # Load the initial probabilities.
    mm.p_initial = numpy.zeros(N)
    line = _readline_and_check_start(handle, "INITIAL:")
    for i in range(len(states)):
        line = _readline_and_check_start(handle, "  %s:" % states[i])
        mm.p_initial[i] = float(line.split()[-1])

    # Load the transition matrix.
    mm.p_transition = numpy.zeros((N, N))
    line = _readline_and_check_start(handle, "TRANSITION:")
    for i in range(len(states)):
        line = _readline_and_check_start(handle, "  %s:" % states[i])
        mm.p_transition[i,:] = map(float, line.split()[1:])

    # Load the emission matrix.
    mm.p_emission = numpy.zeros((N, M))
    line = _readline_and_check_start(handle, "EMISSION:")
    for i in range(len(states)):
        line = _readline_and_check_start(handle, "  %s:" % states[i])
        mm.p_emission[i,:] = map(float, line.split()[1:])

    return mm
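
The text format load expects, matching what save below writes (state and
symbol names here are hypothetical):

    STATES: S1 S2
    ALPHABET: a b
    INITIAL:
      S1: 0.6
      S2: 0.4
    TRANSITION:
      S1: 0.9 0.1
      S2: 0.2 0.8
    EMISSION:
      S1: 0.7 0.3
      S2: 0.4 0.6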
        


def Bio.MarkovModel.logaddexp (   logx,
  logy 
)

Definition at line 30 of file MarkovModel.py.

    # Fallback used when numpy itself does not provide logaddexp; the
    # module otherwise binds logaddexp = numpy.logaddexp (see the
    # Variables section).  The early returns guard against overflow in
    # exp() when the two arguments are far apart.
    def logaddexp(logx, logy):
        if logy - logx > 100:
            return logy
        elif logx - logy > 100:
            return logx
        minxy = min(logx, logy)
        return minxy + numpy.log(numpy.exp(logx-minxy) + numpy.exp(logy-minxy))
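
A quick illustration of why the log-space form matters (editorial example;
numpy.logaddexp behaves the same way): exp() underflows to 0.0 below about
-745 in double precision, so the naive formulation loses everything.

    import numpy

    print(numpy.log(numpy.exp(-800) + numpy.exp(-801)))   # -inf: both terms underflow
    print(numpy.logaddexp(-800, -801))                    # approx -799.69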

def Bio.MarkovModel.save (   mm,
  handle 
)
save(mm, handle)

Definition at line 109 of file MarkovModel.py.

def save(mm, handle):
    """save(mm, handle)"""
    # This will fail if there are spaces in the states or alphabet.
    w = handle.write
    w("STATES: %s\n" % ' '.join(mm.states))
    w("ALPHABET: %s\n" % ' '.join(mm.alphabet))
    w("INITIAL:\n")
    for i in range(len(mm.p_initial)):
        w("  %s: %g\n" % (mm.states[i], mm.p_initial[i]))
    w("TRANSITION:\n")
    for i in range(len(mm.p_transition)):
        x = map(str, mm.p_transition[i])
        w("  %s: %s\n" % (mm.states[i], ' '.join(x)))
    w("EMISSION:\n")
    for i in range(len(mm.p_emission)):
        x = map(str, mm.p_emission[i])
        w("  %s: %s\n" % (mm.states[i], ' '.join(x)))

# XXX allow them to specify starting points
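
A round-trip sketch (editorial; StringIO is the Python 2 idiom of this era,
io.StringIO on Python 3):

    from StringIO import StringIO
    from Bio import MarkovModel

    mm = MarkovModel.train_visible(["S1", "S2"], ["a", "b"],
                                   [(["a", "b", "a"], ["S1", "S2", "S1"])])
    handle = StringIO()
    MarkovModel.save(mm, handle)
    handle.seek(0)
    mm2 = MarkovModel.load(handle)   # states, alphabet, and matrices restored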


def Bio.MarkovModel.train_bw (   states,
  alphabet,
  training_data,
  pseudo_initial = None,
  pseudo_transition = None,
  pseudo_emission = None,
  update_fn = None 
)
train_bw(states, alphabet, training_data[, pseudo_initial]
[, pseudo_transition][, pseudo_emission][, update_fn]) -> MarkovModel

Train a MarkovModel using the Baum-Welch algorithm.  states is a list
of strings that describe the names of each state.  alphabet is a
list of objects that indicate the allowed outputs.  training_data
is a list of observations.  Each observation is a list of objects
from the alphabet.

pseudo_initial, pseudo_transition, and pseudo_emission are
optional parameters that you can use to assign pseudo-counts to
different matrices.  They should be matrices of the appropriate
size that contain numbers to add to each parameter matrix, before
normalization.

update_fn is an optional callback that takes parameters
(iteration, log_likelihood).  It is called once per iteration.

Definition at line 131 of file MarkovModel.py.

def train_bw(states, alphabet, training_data,
             pseudo_initial=None, pseudo_transition=None,
             pseudo_emission=None, update_fn=None):
    """train_bw(states, alphabet, training_data[, pseudo_initial]
    [, pseudo_transition][, pseudo_emission][, update_fn]) -> MarkovModel

    Train a MarkovModel using the Baum-Welch algorithm.  states is a list
    of strings that describe the names of each state.  alphabet is a
    list of objects that indicate the allowed outputs.  training_data
    is a list of observations.  Each observation is a list of objects
    from the alphabet.

    pseudo_initial, pseudo_transition, and pseudo_emission are
    optional parameters that you can use to assign pseudo-counts to
    different matrices.  They should be matrices of the appropriate
    size that contain numbers to add to each parameter matrix, before
    normalization.

    update_fn is an optional callback that takes parameters
    (iteration, log_likelihood).  It is called once per iteration.

    """
    N, M = len(states), len(alphabet)
    if not training_data:
        raise ValueError("No training data given.")
    if pseudo_initial is not None:
        pseudo_initial = numpy.asarray(pseudo_initial)
        if pseudo_initial.shape != (N,):
            raise ValueError("pseudo_initial not shape len(states)")
    if pseudo_transition is not None:
        pseudo_transition = numpy.asarray(pseudo_transition)
        if pseudo_transition.shape != (N,N):
            raise ValueError("pseudo_transition not shape " +
                             "len(states) X len(states)")
    if pseudo_emission is not None:
        pseudo_emission = numpy.asarray(pseudo_emission)
        if pseudo_emission.shape != (N,M):
            raise ValueError("pseudo_emission not shape " +
                             "len(states) X len(alphabet)")

    # Training data is given as a list of members of the alphabet.
    # Replace those with indexes into the alphabet list for easier
    # computation.
    training_outputs = []
    indexes = itemindex(alphabet)
    for outputs in training_data:
        training_outputs.append([indexes[x] for x in outputs])

    # Do some sanity checking on the outputs.
    lengths = map(len, training_outputs)
    if min(lengths) == 0:
        raise ValueError("I got training data with outputs of length 0")

    # Do the training with baum welch.
    x = _baum_welch(N, M, training_outputs,
                    pseudo_initial=pseudo_initial,
                    pseudo_transition=pseudo_transition,
                    pseudo_emission=pseudo_emission,
                    update_fn=update_fn)
    p_initial, p_transition, p_emission = x
    return MarkovModel(states, alphabet, p_initial, p_transition, p_emission)
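
A sketch of an update_fn callback for watching convergence (hypothetical
model and data; the (iteration, log_likelihood) signature is as documented
above):

    from Bio import MarkovModel

    def report(iteration, log_likelihood):
        # Called once per Baum-Welch iteration with the current
        # total log likelihood of the training data.
        print("iteration %d: log likelihood %f" % (iteration, log_likelihood))

    mm = MarkovModel.train_bw(["S1", "S2"], ["a", "b"],
                              [["a", "a", "b", "b", "a", "b"]],
                              update_fn=report)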


def Bio.MarkovModel.train_visible (   states,
  alphabet,
  training_data,
  pseudo_initial = None,
  pseudo_transition = None,
  pseudo_emission = None 
)
train_visible(states, alphabet, training_data[, pseudo_initial]
[, pseudo_transition][, pseudo_emission]) -> MarkovModel

Train a visible MarkovModel using maximum likelihood estimates
for each of the parameters.  states is a list of strings that
describe the names of each state.  alphabet is a list of objects
that indicate the allowed outputs.  training_data is a list of
(outputs, observed states) where outputs is a list of emissions
from the alphabet, and observed states is a list of states from
states.

pseudo_initial, pseudo_transition, and pseudo_emission are
optional parameters that you can use to assign pseudo-counts to
different matrices.  They should be matrices of the appropriate
size that contain numbers to add to each parameter matrix

Definition at line 374 of file MarkovModel.py.

def train_visible(states, alphabet, training_data,
                  pseudo_initial=None, pseudo_transition=None,
                  pseudo_emission=None):
    """train_visible(states, alphabet, training_data[, pseudo_initial]
    [, pseudo_transition][, pseudo_emission]) -> MarkovModel

    Train a visible MarkovModel using maximum likelihood estimates
    for each of the parameters.  states is a list of strings that
    describe the names of each state.  alphabet is a list of objects
    that indicate the allowed outputs.  training_data is a list of
    (outputs, observed states) where outputs is a list of emissions
    from the alphabet, and observed states is a list of states from
    states.

    pseudo_initial, pseudo_transition, and pseudo_emission are
    optional parameters that you can use to assign pseudo-counts to
    different matrices.  They should be matrices of the appropriate
    size that contain numbers to add to each parameter matrix.

    """
    N, M = len(states), len(alphabet)
    if pseudo_initial is not None:
        pseudo_initial = numpy.asarray(pseudo_initial)
        if pseudo_initial.shape != (N,):
            raise ValueError("pseudo_initial not shape len(states)")
    if pseudo_transition is not None:
        pseudo_transition = numpy.asarray(pseudo_transition)
        if pseudo_transition.shape != (N,N):
            raise ValueError("pseudo_transition not shape " +
                             "len(states) X len(states)")
    if pseudo_emission is not None:
        pseudo_emission = numpy.asarray(pseudo_emission)
        if pseudo_emission.shape != (N,M):
            raise ValueError("pseudo_emission not shape " +
                             "len(states) X len(alphabet)")

    # Training data is given as a list of members of the alphabet.
    # Replace those with indexes into the alphabet list for easier
    # computation.
    training_states, training_outputs = [], []
    states_indexes = itemindex(states)
    outputs_indexes = itemindex(alphabet)
    for toutputs, tstates in training_data:
        if len(tstates) != len(toutputs):
            raise ValueError("states and outputs not aligned")
        training_states.append([states_indexes[x] for x in tstates])
        training_outputs.append([outputs_indexes[x] for x in toutputs])

    x = _mle(N, M, training_outputs, training_states,
             pseudo_initial, pseudo_transition, pseudo_emission)
    p_initial, p_transition, p_emission = x

    return MarkovModel(states, alphabet, p_initial, p_transition, p_emission)
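
A sketch with pseudo-counts (hypothetical values), which keep transitions
and emissions that never occur in the training data from getting zero
probability:

    import numpy
    from Bio import MarkovModel

    states = ["S1", "S2"]
    alphabet = ["a", "b"]
    training_data = [(["a", "b", "a"], ["S1", "S2", "S1"]),
                     (["b", "b", "a"], ["S2", "S2", "S1"])]

    # Uniform add-one pseudo-counts on every parameter matrix.
    mm = MarkovModel.train_visible(
        states, alphabet, training_data,
        pseudo_initial=numpy.ones(2),
        pseudo_transition=numpy.ones((2, 2)),
        pseudo_emission=numpy.ones((2, 2)))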



Variable Documentation

Bio.MarkovModel.logaddexp = numpy.logaddexp

Definition at line 22 of file MarkovModel.py.

float Bio.MarkovModel.VERY_SMALL_NUMBER = 1e-300

Definition at line 49 of file MarkovModel.py.

float Bio.MarkovModel.LOG0 = numpy.log(VERY_SMALL_NUMBER)

Definition at line 50 of file MarkovModel.py.

int Bio.MarkovModel.MAX_ITERATIONS = 1000

Definition at line 191 of file MarkovModel.py.