Back to index

python-biopython  1.60
Classes | Functions | Variables
Bio.MaxEntropy Namespace Reference

Classes

class  MaxEntropy

Functions

def calculate
def classify
def _eval_feature_fn
def _calc_empirical_expects
def _calc_model_expects
def _calc_p_class_given_x
def _calc_f_sharp
def _iis_solve_delta
def _train_iis
def train
def udf1
def udf2
def udf3

Variables

list xcar
list ycar
tuple xe = train(xcar, ycar, user_functions)
tuple xc = classify(xe, xv)

Function Documentation

def Bio.MaxEntropy._calc_empirical_expects (   xs,
  ys,
  classes,
  features 
) [private]
_calc_empirical_expects(xs, ys, classes, features) -> list of expectations

Calculate the expectation of each function from the data.  This is
the constraint for the maximum entropy distribution.  Return a
list of expectations, parallel to the list of features.

Definition at line 81 of file MaxEntropy.py.

00081 
00082 def _calc_empirical_expects(xs, ys, classes, features):
00083     """_calc_empirical_expects(xs, ys, classes, features) -> list of expectations
00084 
00085     Calculate the expectation of each function from the data.  This is
00086     the constraint for the maximum entropy distribution.  Return a
00087     list of expectations, parallel to the list of features.
00088 
00089     """
00090     # E[f_i] = SUM_x,y P(x, y) f(x, y)
00091     #        = 1/N f(x, y)
00092     class2index = {}
00093     for index, key in enumerate(classes):
00094         class2index[key] = index
00095     ys_i = [class2index[y] for y in ys]
00096     
00097     expect = []
00098     N = len(xs)
00099     for feature in features:
00100         s = 0
00101         for i in range(N):
00102             s += feature.get((i, ys_i[i]), 0)
00103         expect.append(float(s) / N)
00104     return expect

Here is the caller graph for this function:

def Bio.MaxEntropy._calc_f_sharp (   N,
  nclasses,
  features 
) [private]
_calc_f_sharp(N, nclasses, features) -> matrix of f sharp values.

Definition at line 161 of file MaxEntropy.py.

00161 
00162 def _calc_f_sharp(N, nclasses, features):
00163     """_calc_f_sharp(N, nclasses, features) -> matrix of f sharp values."""
00164     # f#(x, y) = SUM_i feature(x, y)
00165     f_sharp = numpy.zeros((N, nclasses))
00166     for feature in features:
00167         for (i, j), f in feature.iteritems():
00168             f_sharp[i][j] += f
00169     return f_sharp

Here is the call graph for this function:

Here is the caller graph for this function:

def Bio.MaxEntropy._calc_model_expects (   xs,
  classes,
  features,
  alphas 
) [private]
_calc_model_expects(xs, classes, features, alphas) -> list of expectations.

Calculate the expectation of each feature from the model.  This is
not used in maximum entropy training, but provides a good function
for debugging.

Definition at line 105 of file MaxEntropy.py.

00105 
def _calc_model_expects(xs, classes, features, alphas):
    """_calc_model_expects(xs, classes, features, alphas) -> list of expectations.

    Calculate the expectation of each feature from the model.  This is
    not used in maximum entropy training, but provides a good function
    for debugging.

    """
    # SUM_X P(x) SUM_Y P(Y|X) F(X, Y)
    # = 1/N SUM_X SUM_Y P(Y|X) F(X, Y)
    p_yx = _calc_p_class_given_x(xs, classes, features, alphas)

    expects = []
    for feature in features:
        # 'total' rather than shadowing the builtin sum();
        # dict.items() instead of the Python 2-only iteritems().
        total = 0.0
        for (i, j), f in feature.items():
            total += p_yx[i][j] * f
        expects.append(total / len(xs))
    return expects

Here is the call graph for this function:

def Bio.MaxEntropy._calc_p_class_given_x (   xs,
  classes,
  features,
  alphas 
) [private]
_calc_p_class_given_x(xs, classes, features, alphas) -> matrix

Calculate P(y|x), where y is the class and x is an instance from
the training set.  Return a XSxCLASSES matrix of probabilities.

Definition at line 125 of file MaxEntropy.py.

00125 
00126 def _calc_p_class_given_x(xs, classes, features, alphas):
00127     """_calc_p_class_given_x(xs, classes, features, alphas) -> matrix
00128 
00129     Calculate P(y|x), where y is the class and x is an instance from
00130     the training set.  Return a XSxCLASSES matrix of probabilities.
00131 
00132     """
00133     prob_yx = numpy.zeros((len(xs), len(classes)))
00134 
00135     # Calculate log P(y, x).
00136     assert len(features) == len(alphas)
00137     for feature, alpha in zip(features, alphas):
00138         for (x, y), f in feature.iteritems():
00139             prob_yx[x][y] += alpha * f
00140     # Take an exponent to get P(y, x)
00141     prob_yx = numpy.exp(prob_yx)
00142     # Divide out the probability over each class, so we get P(y|x).
00143     for i in range(len(xs)):
00144         z = sum(prob_yx[i])
00145         prob_yx[i] = prob_yx[i] / z
00146 
00147     #prob_yx = []
00148     #for i in range(len(xs)):
00149     #    z = 0.0   # Normalization factor for this x, over all classes.
00150     #    probs = [0.0] * len(classes)
00151     #    for j in range(len(classes)):
00152     #        log_p = 0.0   # log of the probability of f(x, y)
00153     #        for k in range(len(features)):
00154     #            log_p += alphas[k] * features[k].get((i, j), 0.0)
00155     #        probs[j] = numpy.exp(log_p)
00156     #        z += probs[j]
00157     #    # Normalize the probabilities for this x.
00158     #    probs = map(lambda x, z=z: x/z, probs)
00159     #    prob_yx.append(probs)
00160     return prob_yx

Here is the caller graph for this function:

def Bio.MaxEntropy._eval_feature_fn (   fn,
  xs,
  classes 
) [private]
_eval_feature_fn(fn, xs, classes) -> dict of values

Evaluate a feature function on every instance of the training set
and class.  fn is a callback function that takes two parameters: a
training instance and a class.  Return a dictionary of (training
set index, class index) -> non-zero value.  Values of 0 are not
stored in the dictionary.

Definition at line 63 of file MaxEntropy.py.

00063 
00064 def _eval_feature_fn(fn, xs, classes):
00065     """_eval_feature_fn(fn, xs, classes) -> dict of values
00066 
00067     Evaluate a feature function on every instance of the training set
00068     and class.  fn is a callback function that takes two parameters: a
00069     training instance and a class.  Return a dictionary of (training
00070     set index, class index) -> non-zero value.  Values of 0 are not
00071     stored in the dictionary.
00072 
00073     """
00074     values = {}
00075     for i in range(len(xs)):
00076         for j in range(len(classes)):
00077             f = fn(xs[i], classes[j])
00078             if f != 0:
00079                 values[(i, j)] = f
00080     return values

Here is the caller graph for this function:

def Bio.MaxEntropy._iis_solve_delta (   N,
  feature,
  f_sharp,
  empirical,
  prob_yx,
  max_newton_iterations,
  newton_converge 
) [private]

Definition at line 171 of file MaxEntropy.py.

00171 
00172                      max_newton_iterations, newton_converge):
00173     # Solve delta using Newton's method for:
00174     # SUM_x P(x) * SUM_c P(c|x) f_i(x, c) e^[delta_i * f#(x, c)] = 0
00175     delta = 0.0
00176     iters = 0
00177     while iters < max_newton_iterations: # iterate for Newton's method
00178         f_newton = df_newton = 0.0       # evaluate the function and derivative
00179         for (i, j), f in feature.iteritems():
00180             prod = prob_yx[i][j] * f * numpy.exp(delta * f_sharp[i][j])
00181             f_newton += prod
00182             df_newton += prod * f_sharp[i][j]
00183         f_newton, df_newton = empirical - f_newton / N, -df_newton / N
00184 
00185         ratio = f_newton / df_newton
00186         delta -= ratio
00187         if numpy.fabs(ratio) < newton_converge:  # converged
00188             break
00189         iters = iters + 1
00190     else:
00191         raise RuntimeError("Newton's method did not converge")
00192     return delta

Here is the call graph for this function:

Here is the caller graph for this function:

def Bio.MaxEntropy._train_iis (   xs,
  classes,
  features,
  f_sharp,
  alphas,
  e_empirical,
  max_newton_iterations,
  newton_converge 
) [private]
Do one iteration of hill climbing to find better alphas (PRIVATE).

Definition at line 194 of file MaxEntropy.py.

00194 
def _train_iis(xs, classes, features, f_sharp, alphas, e_empirical,
               max_newton_iterations, newton_converge):
    """Do one iteration of hill climbing to find better alphas (PRIVATE).

    Return a new list of alphas (the input list is not mutated); each
    alpha_i is updated by the delta solved via Newton's method.

    NOTE: the doc dump truncated the original 'def' line; the signature
    here is reconstructed from the documented parameter list.
    """
    # This is a good function to parallelize: each feature's delta is
    # independent of the others.

    # Pre-calculate P(y|x)
    p_yx = _calc_p_class_given_x(xs, classes, features, alphas)

    N = len(xs)
    newalphas = alphas[:]
    for i in range(len(alphas)):
        delta = _iis_solve_delta(N, features[i], f_sharp, e_empirical[i], p_yx,
                                 max_newton_iterations, newton_converge)
        newalphas[i] += delta
    return newalphas
00209 

Here is the call graph for this function:

Here is the caller graph for this function:

def Bio.MaxEntropy.calculate (   me,
  observation 
)
calculate(me, observation) -> list of log probs

Calculate the log of the probability for each class.  me is a
MaxEntropy object that has been trained.  observation is a vector
representing the observed data.  The return value is a list of
unnormalized log probabilities for each class.

Definition at line 32 of file MaxEntropy.py.

00032 
def calculate(me, observation):
    """calculate(me, observation) -> list of log probs

    Calculate the log of the probability for each class.  me is a
    MaxEntropy object that has been trained.  observation is a vector
    representing the observed data.  The return value is a list of
    unnormalized log probabilities for each class.

    """
    assert len(me.feature_fns) == len(me.alphas)
    scores = []
    for klass in me.classes:
        # Weighted sum of every feature evaluated on this observation.
        lprob = sum((fn(observation, klass) * alpha
                     for fn, alpha in zip(me.feature_fns, me.alphas)), 0.0)
        scores.append(lprob)
    return scores

Here is the caller graph for this function:

def Bio.MaxEntropy.classify (   me,
  observation 
)
classify(me, observation) -> class

Classify an observation into a class.

Definition at line 50 of file MaxEntropy.py.

00050 
def classify(me, observation):
    """classify(me, observation) -> class

    Classify an observation into a class.

    """
    # Pick the class with the highest score; ties keep the earliest
    # class, matching a strict > comparison.
    scores = calculate(me, observation)
    best = 0
    for i in range(1, len(scores)):
        if scores[i] > scores[best]:
            best = i
    return me.classes[best]

Here is the call graph for this function:

def Bio.MaxEntropy.train (   training_set,
  results,
  feature_fns,
  update_fn = None,
  max_iis_iterations = 10000,
  iis_converge = 1.0e-5,
  max_newton_iterations = 100,
  newton_converge = 1.0e-10 
)
Train a maximum entropy classifier, returns MaxEntropy object.

Train a maximum entropy classifier on a training set.
training_set is a list of observations.  results is a list of the
class assignments for each observation.  feature_fns is a list of
the features.  These are callback functions that take an
observation and class and return a 1 or 0.  update_fn is a
callback function that is called at each training iteration.  It is
passed a MaxEntropy object that encapsulates the current state of
the training.

The maximum number of iterations and the convergence criterion for IIS
are given by max_iis_iterations and iis_converge, respectively, while
max_newton_iterations and newton_converge are the maximum number
of iterations and the convergence criterion for Newton's method.

Definition at line 212 of file MaxEntropy.py.

00212 
def train(training_set, results, feature_fns, update_fn=None,
          max_iis_iterations=10000, iis_converge=1.0e-5,
          max_newton_iterations=100, newton_converge=1.0e-10):
    """Train a maximum entropy classifier, returns MaxEntropy object.

    Train a maximum entropy classifier on a training set.
    training_set is a list of observations.  results is a list of the
    class assignments for each observation.  feature_fns is a list of
    the features.  These are callback functions that take an
    observation and class and return a 1 or 0.  update_fn is a
    callback function that is called at each training iteration.  It is
    passed a MaxEntropy object that encapsulates the current state of
    the training.

    The maximum number of iterations and the convergence criterion for IIS
    are given by max_iis_iterations and iis_converge, respectively, while
    max_newton_iterations and newton_converge are the maximum number
    of iterations and the convergence criterion for Newton's method.
    """
    if not training_set:
        raise ValueError("No data in the training set.")
    if len(training_set) != len(results):
        raise ValueError("training_set and results should be parallel lists.")

    # Rename variables for convenience.
    xs, ys = training_set, results

    # Get a list of all the classes that need to be trained.
    classes = sorted(set(results))

    # Cache values for all features.
    features = [_eval_feature_fn(fn, training_set, classes)
                for fn in feature_fns]
    # Cache values for f#.
    f_sharp = _calc_f_sharp(len(training_set), len(classes), features)

    # Pre-calculate the empirical expectations of the features.
    e_empirical = _calc_empirical_expects(xs, ys, classes, features)

    # Now train the alpha parameters to weigh each feature.
    alphas = [0.0] * len(features)
    iters = 0
    while iters < max_iis_iterations:
        iters += 1  # BUGFIX: the counter was never incremented, so
                    # max_iis_iterations was never enforced and the
                    # "IIS did not converge" branch was unreachable.
        nalphas = _train_iis(xs, classes, features, f_sharp,
                             alphas, e_empirical,
                             max_newton_iterations, newton_converge)
        # Total absolute change in the alphas.  Replaces the Python 2
        # map/reduce pair (reduce is not a builtin in Python 3).
        diff = sum(numpy.fabs(x - y) for x, y in zip(alphas, nalphas))
        alphas = nalphas

        me = MaxEntropy()
        me.alphas, me.classes, me.feature_fns = alphas, classes, feature_fns
        if update_fn is not None:
            update_fn(me)

        if diff < iis_converge:   # converged
            break
    else:
        raise RuntimeError("IIS did not converge")

    return me
00272 

Here is the call graph for this function:

Here is the caller graph for this function:

def Bio.MaxEntropy.udf1 (   ts,
  cl 
)

Definition at line 304 of file MaxEntropy.py.

00304 
def udf1(ts, cl):
    """Feature: 0 when the vehicle color (ts[0]) is 'Red', else 1."""
    return 0 if ts[0] == 'Red' else 1

def Bio.MaxEntropy.udf2 (   ts,
  cl 
)

Definition at line 310 of file MaxEntropy.py.

00310 
def udf2(ts, cl):
    """Feature: 0 when the vehicle type (ts[1]) is 'Sports', else 1."""
    return 0 if ts[1] == 'Sports' else 1

def Bio.MaxEntropy.udf3 (   ts,
  cl 
)

Definition at line 316 of file MaxEntropy.py.

00316 
def udf3(ts, cl):
    """Feature: 0 when the vehicle origin (ts[2]) is 'Domestic', else 1."""
    return 0 if ts[2] == 'Domestic' else 1


Variable Documentation

Definition at line 326 of file MaxEntropy.py.

Initial value:
00001 [
00002         ['Red',    'Sports', 'Domestic'],
00003         ['Red',    'Sports', 'Domestic'],
00004         ['Red',    'Sports', 'Domestic'],
00005         ['Yellow', 'Sports', 'Domestic'],
00006         ['Yellow', 'Sports', 'Imported'],
00007         ['Yellow', 'SUV',    'Imported'],
00008         ['Yellow', 'SUV',    'Imported'],
00009         ['Yellow', 'SUV',    'Domestic'],
00010         ['Red',    'SUV',    'Imported'],
00011         ['Red',    'Sports', 'Imported']
00012     ]

Definition at line 277 of file MaxEntropy.py.

tuple Bio.MaxEntropy.xe = train(xcar, ycar, user_functions)

Definition at line 324 of file MaxEntropy.py.

Initial value:
00001 [
00002         'Yes',
00003         'No',
00004         'Yes',
00005         'No',
00006         'Yes',
00007         'No',
00008         'Yes',
00009         'No',
00010         'No',
00011         'Yes'
00012     ]

Definition at line 290 of file MaxEntropy.py.