Back to index

python-biopython  1.60
Classes | Functions | Variables
Bio.NaiveBayes Namespace Reference

Classes

class  NaiveBayes

Functions

def _contents
def calculate
def classify
def train

Variables

list xcar
list ycar
tuple carmodel = train(xcar, ycar)
tuple carresult = classify(carmodel, ['Red', 'Sports', 'Domestic'])

Function Documentation

def Bio.NaiveBayes._contents (   items) [private]

Definition at line 31 of file NaiveBayes.py.

00031 
00032 def _contents(items):
00033     term = 1.0/len(items)
00034     counts = {}
00035     for item in items:
00036         counts[item] = counts.get(item,0) + term
00037     return counts

Here is the caller graph for this function:

def Bio.NaiveBayes.calculate (   nb,
  observation,
  scale = 0 
)
calculate(nb, observation[, scale]) -> probability dict

Calculate log P(class|observation) for each class.  nb is a NaiveBayes
classifier that has been trained.  observation is a list representing
the observed data.  scale is whether the probability should be
scaled by P(observation).  By default, no scaling is done.  The return
value is a dictionary where each key is a class and the value is the
log probability of that class.

Definition at line 54 of file NaiveBayes.py.

def calculate(nb, observation, scale=0):
    """calculate(nb, observation[, scale]) -> probability dict

    Calculate log P(class|observation) for each class of the trained
    NaiveBayes classifier nb.  observation is a list of observed values,
    one per dimension of the classifier.  If scale is true, the
    probability is scaled by P(observation); by default no scaling is
    done.  The return value is a dictionary mapping each class to its
    log probability.
    """
    # P(class|observation) = P(observation|class)*P(class)/P(observation)
    # In log space:
    # lP(class|observation) = lP(observation|class)+lP(class)-lP(observation)

    # The observation must match the classifier's dimensionality.
    if len(observation) != nb.dimensionality:
        raise ValueError("observation in %d dimension, but classifier in %d" \
                         % (len(observation), nb.dimensionality))

    # log P(observation|class) for every class, assuming the dimensions
    # are conditionally independent:
    # log P(observation|class) = SUM_i log P(observation_i|class)
    n_classes = len(nb.classes)
    lp_obs_given_class = numpy.zeros(n_classes)
    for idx in range(n_classes):
        conditional = nb.p_conditional[idx]
        # Unseen values get probability 0, clipped below so the log is
        # finite.
        dim_probs = [conditional[dim].get(value, 0)
                     for dim, value in enumerate(observation)]
        log_probs = numpy.log(numpy.clip(dim_probs, 1.e-300, 1.e+300))
        lp_obs_given_class[idx] = sum(log_probs)

    # log P(class)
    lp_prior = numpy.log(nb.p_prior)

    # log P(observation); only computed when scaling is requested.
    lp_observation = 0.0
    if scale:
        # log P(observation) = log SUM_i P(observation|class_i)P(class_i)
        # Clip the exponent so exp() neither overflows nor underflows.
        joint = numpy.exp(numpy.clip(lp_prior + lp_obs_given_class, -700, +700))
        lp_observation = numpy.log(sum(joint))

    # Assemble the dictionary of class : log P(class|observation).
    lp_class_given_obs = {}
    for idx, klass in enumerate(nb.classes):
        lp_class_given_obs[klass] = \
            lp_obs_given_class[idx] + lp_prior[idx] - lp_observation

    return lp_class_given_obs

Here is the caller graph for this function:

def Bio.NaiveBayes.classify (   nb,
  observation 
)
classify(nb, observation) -> class

Classify an observation into a class.

Definition at line 103 of file NaiveBayes.py.

def classify(nb, observation):
    """classify(nb, observation) -> class

    Classify an observation into a class.
    """
    # The chosen class is the one with the highest posterior log
    # probability; ties go to the first class encountered, as before.
    probs = calculate(nb, observation, scale=0)
    if not nb.classes:
        return None
    return max(nb.classes, key=probs.__getitem__)

Here is the call graph for this function:

def Bio.NaiveBayes.train (   training_set,
  results,
  priors = None,
  typecode = None 
)
train(training_set, results[, priors]) -> NaiveBayes

Train a naive bayes classifier on a training set.  training_set is a
list of observations.  results is a list of the class assignments
for each observation.  Thus, training_set and results must be the same
length.  priors is an optional dictionary specifying the prior
probabilities for each type of result.  If not specified, the priors
will be estimated from the training results.

Definition at line 117 of file NaiveBayes.py.

def train(training_set, results, priors=None, typecode=None):
    """train(training_set, results[, priors]) -> NaiveBayes

    Train a naive bayes classifier on a training set.  training_set is a
    list of observations.  results is a list of the class assignments
    for each observation.  Thus, training_set and results must be the same
    length.  priors is an optional dictionary specifying the prior
    probabilities for each type of result.  If not specified, the priors
    will be estimated from the training results.

    Raises ValueError if the training set is empty, if training_set and
    results have different lengths, or if the observations do not all
    have the same dimensionality.
    """
    if not len(training_set):
        raise ValueError("No data in the training set.")
    if len(training_set) != len(results):
        raise ValueError("training_set and results should be parallel lists.")

    # If no typecode is specified, try to pick a reasonable one.  If
    # training_set is a Numeric array, then use that typecode.
    # Otherwise, choose a reasonable default.
    # XXX NOT IMPLEMENTED

    # Check to make sure each vector in the training set has the same
    # dimensionality.
    dimensions = [len(x) for x in training_set]
    if min(dimensions) != max(dimensions):
        raise ValueError("observations have different dimensionality")

    nb = NaiveBayes()
    nb.dimensionality = dimensions[0]

    # Get a sorted list of all the classes, and
    # estimate the prior probabilities for the classes.
    # sorted() is used instead of list.sort() so this also works when
    # dict.keys() returns a view rather than a list (Python 3).
    if priors is not None:
        percs = priors
        nb.classes = sorted(set(results))
    else:
        class_freq = _contents(results)
        nb.classes = sorted(class_freq)
        percs = class_freq

    nb.p_prior = numpy.zeros(len(nb.classes))
    for i in range(len(nb.classes)):
        nb.p_prior[i] = percs[nb.classes[i]]

    # Collect all the observations in class.  For each class, make a
    # matrix of training instances versus dimensions.  I might be able
    # to optimize this with Numeric, if the training_set parameter
    # were guaranteed to be a matrix.  However, this may not be the
    # case, because the client may be hacking up a sparse matrix or
    # something.
    c2i = {}      # class to index of class
    for index, key in enumerate(nb.classes):
        c2i[key] = index
    observations = [[] for c in nb.classes]  # separate observations by class
    for klass, obs in zip(results, training_set):
        observations[c2i[klass]].append(obs)
    # Now make the observations NumPy matrices.
    for i in range(len(observations)):
        # XXX typecode must be specified!
        observations[i] = numpy.asarray(observations[i], typecode)

    # Calculate P(value|class,dim) for every class.
    # This is a good loop to optimize.
    nb.p_conditional = []
    for i in range(len(nb.classes)):
        class_observations = observations[i]   # observations for this class
        nb.p_conditional.append([None] * nb.dimensionality)
        for j in range(nb.dimensionality):
            # Collect all the values in this dimension.
            values = class_observations[:, j]

            # Add pseudocounts here.  This needs to be parameterized.
            #values = list(values) + range(len(nb.classes))  # XXX add 1

            # Estimate P(value|class,dim)
            nb.p_conditional[i][j] = _contents(values)
    return nb

Here is the call graph for this function:


Variable Documentation

Definition at line 226 of file NaiveBayes.py.

tuple Bio.NaiveBayes.carresult = classify(carmodel, ['Red', 'Sports', 'Domestic'])

Definition at line 227 of file NaiveBayes.py.

Initial value:
00001 [
00002         ['Red',    'Sports', 'Domestic'],
00003         ['Red',    'Sports', 'Domestic'],
00004         ['Red',    'Sports', 'Domestic'],
00005         ['Yellow', 'Sports', 'Domestic'],
00006         ['Yellow', 'Sports', 'Imported'],
00007         ['Yellow', 'SUV',    'Imported'],
00008         ['Yellow', 'SUV',    'Imported'],
00009         ['Yellow', 'SUV',    'Domestic'],
00010         ['Red',    'SUV',    'Imported'],
00011         ['Red',    'Sports', 'Imported']
00012     ]

Definition at line 200 of file NaiveBayes.py.

Initial value:
00001 [
00002         'Yes',
00003         'No',
00004         'Yes',
00005         'No',
00006         'Yes',
00007         'No',
00008         'Yes',
00009         'No',
00010         'No',
00011         'Yes'
00012     ]

Definition at line 213 of file NaiveBayes.py.