
python-biopython  1.60
NaiveBayes.py
# Copyright 2000 by Jeffrey Chang.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

00006 """This provides code for a general Naive Bayes learner.
00007 
00008 Naive Bayes is a supervised classification algorithm that uses Bayes
00009 rule to compute the fit between a new observation and some previously
00010 observed data.  The observations are discrete feature vectors, with
00011 the Bayes assumption that the features are independent.  Although this
00012 is hardly ever true, the classifier works well enough in practice.
00013 
00014 Glossary:
00015 observation    A feature vector of discrete data.
00016 class          A possible classification for an observation.
00017 
00018 
00019 Classes:
00020 NaiveBayes     Holds information for a naive Bayes classifier.
00021 
00022 Functions:
00023 train          Train a new naive Bayes classifier.
00024 calculate      Calculate the probabilities of each class, given an observation.
00025 classify       Classify an observation into a class.
00026 
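Example (a minimal, illustrative sketch; the data here are made up):

    >>> observations = [['Red', 'Sports'], ['Red', 'SUV'], ['Yellow', 'SUV']]
    >>> classes = ['Yes', 'No', 'No']
    >>> model = train(observations, classes)
    >>> classify(model, ['Red', 'Sports'])
    'Yes'
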
00027 """
00028 
00029 import numpy
00030 
def _contents(items):
    """Return a dictionary mapping each item to its relative frequency.

    For example, _contents(['a', 'a', 'b']) returns approximately
    {'a': 0.667, 'b': 0.333}; the values always sum to 1.0.
    """
    term = 1.0 / len(items)
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + term
    return counts

class NaiveBayes(object):
    """Holds information for a NaiveBayes classifier.

    Members:
    classes         List of the possible classes of data.
    p_conditional   CLASS x DIM array of dicts of value -> P(value|class,dim)
    p_prior         List of the prior probabilities for every class.
    dimensionality  Dimensionality of the data.

    """
    def __init__(self):
        self.classes = []
        self.p_conditional = None
        self.p_prior = []
        self.dimensionality = None

def calculate(nb, observation, scale=False):
    """calculate(nb, observation[, scale]) -> probability dict

    Calculate log P(class|observation) for each class.  nb is a NaiveBayes
    classifier that has been trained.  observation is a list representing
    the observed data.  scale is whether the probability should be
    scaled by P(observation).  By default, no scaling is done.  The return
    value is a dictionary where the keys are the classes and the values
    are the log probabilities of each class.
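
    Example (illustrative, for a hypothetical trained two-class model):

        calculate(nb, observation)
        # -> {'No': -3.0, 'Yes': -2.3}   (log probabilities; the least
        # negative value marks the most likely class)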
    """
    # P(class|observation) = P(observation|class)*P(class)/P(observation)
    # Taking the log:
    # lP(class|observation) = lP(observation|class)+lP(class)-lP(observation)

    # Make sure the observation has the right dimensionality.
    if len(observation) != nb.dimensionality:
        raise ValueError("observation in %d dimensions, but classifier in %d"
                         % (len(observation), nb.dimensionality))
    # Calculate log P(observation|class) for every class.
    n = len(nb.classes)
    lp_observation_class = numpy.zeros(n)   # array of log P(observation|class)
    for i in range(n):
        # log P(observation|class) = SUM_i log P(observation_i|class)
        probs = [None] * len(observation)
        for j in range(len(observation)):
            probs[j] = nb.p_conditional[i][j].get(observation[j], 0)
        # Clip to avoid taking the log of 0 for unseen feature values.
        lprobs = numpy.log(numpy.clip(probs, 1.e-300, 1.e+300))
        lp_observation_class[i] = sum(lprobs)

    # Calculate log P(class).
    lp_prior = numpy.log(nb.p_prior)

    # Calculate log P(observation).
    lp_observation = 0.0          # P(observation)
    if scale:   # Only calculate this if requested.
        # log P(observation) = log SUM_i P(observation|class_i)P(class_i)
        # Clip the exponents to avoid overflow/underflow in exp.
        obs = numpy.exp(numpy.clip(lp_prior + lp_observation_class, -700, +700))
        lp_observation = numpy.log(sum(obs))

    # Calculate log P(class|observation).
    lp_class_observation = {}      # Dict of class : log P(class|observation)
    for i in range(len(nb.classes)):
        lp_class_observation[nb.classes[i]] = (
            lp_observation_class[i] + lp_prior[i] - lp_observation)

    return lp_class_observation

def classify(nb, observation):
    """classify(nb, observation) -> class

    Classify an observation into a class.  The class returned is the one
    with the highest calculated probability.
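
    For example, with the carmodel trained from the car data in the
    __main__ block below:

        classify(carmodel, ['Red', 'Sports', 'Domestic'])   # -> 'Yes'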
    """
    # The class is the one with the highest probability.
    probs = calculate(nb, observation, scale=False)
    max_prob = max_class = None
    for klass in nb.classes:
        if max_prob is None or probs[klass] > max_prob:
            max_prob, max_class = probs[klass], klass
    return max_class

def train(training_set, results, priors=None, typecode=None):
    """train(training_set, results[, priors]) -> NaiveBayes

    Train a naive Bayes classifier on a training set.  training_set is a
    list of observations.  results is a list of the class assignments
    for each observation.  Thus, training_set and results must be the same
    length.  priors is an optional dictionary specifying the prior
    probabilities for each type of result.  If not specified, the priors
    will be estimated from the training results.  typecode is an optional
    NumPy dtype for the arrays that hold the observations.
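    Example (using the car data from the __main__ block below):

        carmodel = train(xcar, ycar)
        carmodel.classes    # ['No', 'Yes']
        carmodel.p_prior    # array([ 0.5,  0.5])
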
    """
    if not len(training_set):
        raise ValueError("No data in the training set.")
    if len(training_set) != len(results):
        raise ValueError("training_set and results should be parallel lists.")

    # If no typecode is specified, try to pick a reasonable one.  If
    # training_set is a NumPy array, then use that typecode.
    # Otherwise, choose a reasonable default.
    # XXX NOT IMPLEMENTED

    # Check to make sure each vector in the training set has the same
    # dimensionality.
    dimensions = [len(x) for x in training_set]
    if min(dimensions) != max(dimensions):
        raise ValueError("observations have different dimensionality")

    nb = NaiveBayes()
    nb.dimensionality = dimensions[0]

    # Get a list of all the classes, and
    # estimate the prior probabilities for the classes.
    if priors is not None:
        percs = priors
        nb.classes = sorted(set(results))   # keep it tidy
    else:
        class_freq = _contents(results)
        nb.classes = sorted(class_freq)     # keep it tidy
        percs = class_freq

    nb.p_prior = numpy.zeros(len(nb.classes))
    for i in range(len(nb.classes)):
        nb.p_prior[i] = percs[nb.classes[i]]

    # Collect all the observations in class.  For each class, make a
    # matrix of training instances versus dimensions.  I might be able
    # to optimize this with NumPy, if the training_set parameter
    # were guaranteed to be a matrix.  However, this may not be the
    # case, because the client may be hacking up a sparse matrix or
    # something.
    c2i = {}      # class to index of class
    for index, key in enumerate(nb.classes):
        c2i[key] = index
    observations = [[] for c in nb.classes]  # separate observations by class
    for i in range(len(results)):
        klass, obs = results[i], training_set[i]
        observations[c2i[klass]].append(obs)
    # Now make the observations NumPy matrices.
    for i in range(len(observations)):
        # XXX typecode must be specified!
        observations[i] = numpy.asarray(observations[i], typecode)

    # Calculate P(value|class,dim) for every class.
    # This is a good loop to optimize.
    nb.p_conditional = []
    for i in range(len(nb.classes)):
        class_observations = observations[i]   # observations for this class
        nb.p_conditional.append([None] * nb.dimensionality)
        for j in range(nb.dimensionality):
            # Collect all the values in this dimension.
            values = class_observations[:, j]

            # Add pseudocounts here.  This needs to be parameterized.
            #values = list(values) + range(len(nb.classes))  # XXX add 1

            # Estimate P(value|class,dim)
            nb.p_conditional[i][j] = _contents(values)
    return nb

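# A possible realization of the pseudocount idea flagged in train() above
# (a sketch only; _contents_smoothed is a hypothetical helper that is not
# called anywhere in this module).  Each value receives `alpha` pseudo-
# observations, which keeps unseen values from getting zero probability.
def _contents_smoothed(items, alpha=1.0, vocabulary=None):
    vocabulary = set(items if vocabulary is None else vocabulary)
    counts = dict.fromkeys(vocabulary, float(alpha))
    for item in items:
        counts[item] = counts.get(item, float(alpha)) + 1.0
    total = sum(counts.values())
    # Laplace-smoothed estimate: P(v) = (count(v) + alpha) / (N + alpha*|V|)
    return dict((key, value / total) for key, value in counts.items())
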
if __name__ == "__main__":
    # Car data from example 'Naive Bayes Classifier example' by Eric Meisner November 22, 2003
    # http://www.inf.u-szeged.hu/~ormandi/teaching/mi2/02-naiveBayes-example.pdf
    xcar = [
        ['Red',    'Sports', 'Domestic'],
        ['Red',    'Sports', 'Domestic'],
        ['Red',    'Sports', 'Domestic'],
        ['Yellow', 'Sports', 'Domestic'],
        ['Yellow', 'Sports', 'Imported'],
        ['Yellow', 'SUV',    'Imported'],
        ['Yellow', 'SUV',    'Imported'],
        ['Yellow', 'SUV',    'Domestic'],
        ['Red',    'SUV',    'Imported'],
        ['Red',    'Sports', 'Imported']
    ]

    ycar = [
        'Yes',
        'No',
        'Yes',
        'No',
        'Yes',
        'No',
        'Yes',
        'No',
        'No',
        'Yes'
    ]

    carmodel = train(xcar, ycar)
    carresult = classify(carmodel, ['Red', 'Sports', 'Domestic'])
    print('Is Yes?', carresult)
    carresult = classify(carmodel, ['Red', 'SUV', 'Domestic'])
    print('Is No?', carresult)