python-biopython 1.60
MaxEntropy.py
# Copyright 2001 by Jeffrey Chang.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""
Maximum Entropy code.

Uses Improved Iterative Scaling:
XXX ref

# XXX need to define terminology

"""

import numpy

class MaxEntropy(object):
    """Holds information for a Maximum Entropy classifier.

    Members:
    classes      List of the possible classes of data.
    alphas       List of the weights for each feature.
    feature_fns  List of the feature functions.

    """
    def __init__(self):
        self.classes = []
        self.alphas = []
        self.feature_fns = []

def calculate(me, observation):
    """calculate(me, observation) -> list of log probs

    Calculate the log of the probability for each class.  me is a
    MaxEntropy object that has been trained.  observation is a vector
    representing the observed data.  The return value is a list of
    unnormalized log probabilities for each class.

    """
    scores = []
    assert len(me.feature_fns) == len(me.alphas)
    for klass in me.classes:
        lprob = 0.0
        for fn, alpha in zip(me.feature_fns, me.alphas):
            lprob += fn(observation, klass) * alpha
        scores.append(lprob)
    return scores
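
# Illustrative sketch (not part of the original module): the scores returned by
# calculate() are unnormalized log probabilities, so a softmax over them
# recovers the posterior P(class|observation) for a trained MaxEntropy object.
def _example_posterior(me, observation):
    """Return a dict mapping each class to its normalized probability (sketch)."""
    scores = numpy.array(calculate(me, observation))
    scores -= scores.max()   # subtract the max for numerical stability
    probs = numpy.exp(scores)
    probs /= probs.sum()     # normalize so the probabilities sum to 1
    return dict(zip(me.classes, probs))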

def classify(me, observation):
    """classify(me, observation) -> class

    Classify an observation into a class.

    """
    scores = calculate(me, observation)
    max_score, klass = scores[0], me.classes[0]
    for i in range(1, len(scores)):
        if scores[i] > max_score:
            max_score, klass = scores[i], me.classes[i]
    return klass

def _eval_feature_fn(fn, xs, classes):
    """_eval_feature_fn(fn, xs, classes) -> dict of values

    Evaluate a feature function on every instance of the training set
    and class.  fn is a callback function that takes two parameters: a
    training instance and a class.  Return a dictionary of (training
    set index, class index) -> non-zero value.  Values of 0 are not
    stored in the dictionary.

    """
    values = {}
    for i in range(len(xs)):
        for j in range(len(classes)):
            f = fn(xs[i], classes[j])
            if f != 0:
                values[(i, j)] = f
    return values
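
# Example (illustrative): features are stored sparsely, e.g.
#
#     xs = [['Red'], ['Yellow']]
#     classes = ['No', 'Yes']
#     fn = lambda x, c: 1 if x[0] == 'Yellow' and c == 'Yes' else 0
#     _eval_feature_fn(fn, xs, classes)   # -> {(1, 1): 1}
#
# Zero values are omitted, so each feature is kept as a sparse
# (instance index, class index) -> value dictionary.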

def _calc_empirical_expects(xs, ys, classes, features):
    """_calc_empirical_expects(xs, ys, classes, features) -> list of expectations

    Calculate the expectation of each function from the data.  This is
    the constraint for the maximum entropy distribution.  Return a
    list of expectations, parallel to the list of features.

    """
    # E[f_i] = SUM_x,y P~(x, y) f_i(x, y)
    #        = 1/N SUM_j f_i(x_j, y_j)   (average over the N training pairs)
    class2index = {}
    for index, key in enumerate(classes):
        class2index[key] = index
    ys_i = [class2index[y] for y in ys]

    expect = []
    N = len(xs)
    for feature in features:
        s = 0
        for i in range(N):
            s += feature.get((i, ys_i[i]), 0)
        expect.append(float(s) / N)
    return expect
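
# Worked example (illustrative): with N = 4 training pairs and a feature whose
# sparse dict is {(0, 1): 1, (2, 0): 1}, only entries whose class index matches
# the observed class count.  With observed class indices ys_i = [1, 0, 1, 1],
# instance 0 contributes 1 (its observed class index is 1) while instance 2
# contributes nothing (its observed class index is 1, not 0), so the empirical
# expectation is (1 + 0 + 0 + 0) / 4 = 0.25.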

def _calc_model_expects(xs, classes, features, alphas):
    """_calc_model_expects(xs, classes, features, alphas) -> list of expectations.

    Calculate the expectation of each feature from the model.  This is
    not used in maximum entropy training, but provides a good function
    for debugging.

    """
    # E[f_i] = SUM_x P~(x) SUM_y P(y|x) f_i(x, y)
    #        = 1/N SUM_x SUM_y P(y|x) f_i(x, y)
    p_yx = _calc_p_class_given_x(xs, classes, features, alphas)

    expects = []
    for feature in features:
        total = 0.0
        for (i, j), f in feature.items():
            total += p_yx[i][j] * f
        expects.append(total / len(xs))
    return expects

def _calc_p_class_given_x(xs, classes, features, alphas):
    """_calc_p_class_given_x(xs, classes, features, alphas) -> matrix

    Calculate P(y|x), where y is the class and x is an instance from
    the training set.  Return a len(xs) x len(classes) matrix of
    probabilities.

    """
    prob_yx = numpy.zeros((len(xs), len(classes)))

    # Calculate log P(y, x).
    assert len(features) == len(alphas)
    for feature, alpha in zip(features, alphas):
        for (x, y), f in feature.items():
            prob_yx[x][y] += alpha * f
    # Take an exponent to get P(y, x)
    prob_yx = numpy.exp(prob_yx)
    # Divide out the probability over each class, so we get P(y|x).
    for i in range(len(xs)):
        z = sum(prob_yx[i])
        prob_yx[i] = prob_yx[i] / z

    #prob_yx = []
    #for i in range(len(xs)):
    #    z = 0.0   # Normalization factor for this x, over all classes.
    #    probs = [0.0] * len(classes)
    #    for j in range(len(classes)):
    #        log_p = 0.0   # log of the probability of f(x, y)
    #        for k in range(len(features)):
    #            log_p += alphas[k] * features[k].get((i, j), 0.0)
    #        probs[j] = numpy.exp(log_p)
    #        z += probs[j]
    #    # Normalize the probabilities for this x.
    #    probs = map(lambda x, z=z: x/z, probs)
    #    prob_yx.append(probs)
    return prob_yx
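
# Sanity check (illustrative): every row of the matrix returned by
# _calc_p_class_given_x is a probability distribution over the classes, e.g.
#
#     p_yx = _calc_p_class_given_x(xs, classes, features, alphas)
#     assert numpy.allclose(p_yx.sum(axis=1), 1.0)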

def _calc_f_sharp(N, nclasses, features):
    """_calc_f_sharp(N, nclasses, features) -> matrix of f sharp values."""
    # f#(x, y) = SUM_i f_i(x, y)
    f_sharp = numpy.zeros((N, nclasses))
    for feature in features:
        for (i, j), f in feature.items():
            f_sharp[i][j] += f
    return f_sharp

def _iis_solve_delta(N, feature, f_sharp, empirical, prob_yx,
                     max_newton_iterations, newton_converge):
    # Solve for delta using Newton's method on the IIS equation:
    #   SUM_x P~(x) SUM_c P(c|x) f_i(x, c) e^[delta_i * f#(x, c)] = E~[f_i]
    # i.e. find the root of
    #   g(delta) = empirical - 1/N SUM_x SUM_c P(c|x) f_i(x, c) e^[delta * f#(x, c)]
    delta = 0.0
    iters = 0
    while iters < max_newton_iterations:  # iterate for Newton's method
        f_newton = df_newton = 0.0        # evaluate g(delta) and its derivative
        for (i, j), f in feature.items():
            prod = prob_yx[i][j] * f * numpy.exp(delta * f_sharp[i][j])
            f_newton += prod
            df_newton += prod * f_sharp[i][j]
        f_newton, df_newton = empirical - f_newton / N, -df_newton / N

        ratio = f_newton / df_newton
        delta -= ratio
        if numpy.fabs(ratio) < newton_converge:  # converged
            break
        iters = iters + 1
    else:
        raise RuntimeError("Newton's method did not converge")
    return delta
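
# Newton iteration used above (illustrative restatement): with g(delta) as
# defined in the comment of _iis_solve_delta and
#   g'(delta) = -1/N SUM_(x,c) P(c|x) f_i(x, c) f#(x, c) e^[delta * f#(x, c)]
# each step performs delta <- delta - g(delta) / g'(delta) and stops once the
# step size |g(delta) / g'(delta)| falls below newton_converge.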

def _train_iis(xs, classes, features, f_sharp, alphas, e_empirical,
               max_newton_iterations, newton_converge):
    """Do one iteration of hill climbing to find better alphas (PRIVATE)."""
    # This is a good function to parallelize.

    # Pre-calculate P(y|x)
    p_yx = _calc_p_class_given_x(xs, classes, features, alphas)

    N = len(xs)
    newalphas = alphas[:]
    for i in range(len(alphas)):
        delta = _iis_solve_delta(N, features[i], f_sharp, e_empirical[i], p_yx,
                                 max_newton_iterations, newton_converge)
        newalphas[i] += delta
    return newalphas


def train(training_set, results, feature_fns, update_fn=None,
          max_iis_iterations=10000, iis_converge=1.0e-5,
          max_newton_iterations=100, newton_converge=1.0e-10):
    """Train a maximum entropy classifier and return a MaxEntropy object.

    Train a maximum entropy classifier on a training set.
    training_set is a list of observations.  results is a list of the
    class assignments for each observation.  feature_fns is a list of
    the features.  These are callback functions that take an
    observation and class and return a 1 or 0.  update_fn is a
    callback function that is called at each training iteration.  It is
    passed a MaxEntropy object that encapsulates the current state of
    the training.

    The maximum number of iterations and the convergence criterion for IIS
    are given by max_iis_iterations and iis_converge, respectively, while
    max_newton_iterations and newton_converge are the maximum number
    of iterations and the convergence criterion for Newton's method.
    """
    if not training_set:
        raise ValueError("No data in the training set.")
    if len(training_set) != len(results):
        raise ValueError("training_set and results should be parallel lists.")

    # Rename variables for convenience.
    xs, ys = training_set, results

    # Get a list of all the classes that need to be trained.
    classes = sorted(set(results))

    # Cache values for all features.
    features = [_eval_feature_fn(fn, training_set, classes)
                for fn in feature_fns]
    # Cache values for f#.
    f_sharp = _calc_f_sharp(len(training_set), len(classes), features)

    # Pre-calculate the empirical expectations of the features.
    e_empirical = _calc_empirical_expects(xs, ys, classes, features)

    # Now train the alpha parameters to weigh each feature.
    alphas = [0.0] * len(features)
    iters = 0
    while iters < max_iis_iterations:
        nalphas = _train_iis(xs, classes, features, f_sharp,
                             alphas, e_empirical,
                             max_newton_iterations, newton_converge)
        # Total absolute change in the alphas on this iteration.
        diff = sum(numpy.fabs(x - y) for x, y in zip(alphas, nalphas))
        alphas = nalphas

        me = MaxEntropy()
        me.alphas, me.classes, me.feature_fns = alphas, classes, feature_fns
        if update_fn is not None:
            update_fn(me)

        if diff < iis_converge:   # converged
            break
        iters += 1
    else:
        raise RuntimeError("IIS did not converge")

    return me


if __name__ == "__main__":
    # Car data from the Naive Bayes classifier example by Eric Meisner,
    # November 22, 2003:
    # http://www.inf.u-szeged.hu/~ormandi/teaching/mi2/02-naiveBayes-example.pdf

    xcar = [
        ['Red',    'Sports', 'Domestic'],
        ['Red',    'Sports', 'Domestic'],
        ['Red',    'Sports', 'Domestic'],
        ['Yellow', 'Sports', 'Domestic'],
        ['Yellow', 'Sports', 'Imported'],
        ['Yellow', 'SUV',    'Imported'],
        ['Yellow', 'SUV',    'Imported'],
        ['Yellow', 'SUV',    'Domestic'],
        ['Red',    'SUV',    'Imported'],
        ['Red',    'Sports', 'Imported']
    ]

    ycar = [
        'Yes',
        'No',
        'Yes',
        'No',
        'Yes',
        'No',
        'Yes',
        'No',
        'No',
        'Yes'
    ]

    # Requires some rules or features
    def udf1(ts, cl):
        if ts[0] == 'Red':
            return 0
        else:
            return 1

    def udf2(ts, cl):
        if ts[1] == 'Sports':
            return 0
        else:
            return 1

    def udf3(ts, cl):
        if ts[2] == 'Domestic':
            return 0
        else:
            return 1

    user_functions = [udf1, udf2, udf3]  # must be an iterable type

    xe = train(xcar, ycar, user_functions)
    for xv, yv in zip(xcar, ycar):
        xc = classify(xe, xv)
        print('Pred:', xv, 'gives', xc, 'y is', yv)
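
    # Possible follow-up (illustrative, not in the original script): compare
    # the predictions against the labels to get a training-set accuracy.
    #
    #     correct = sum(classify(xe, xv) == yv for xv, yv in zip(xcar, ycar))
    #     print('Training accuracy:', correct / len(ycar))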