Back to index

python-biopython  1.60
kNN.py
Go to the documentation of this file.
00001 #!/usr/bin/env python
00002 # This code is part of the Biopython distribution and governed by its
00003 # license.  Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 """
00006 This module provides code for doing k-nearest-neighbors classification.
00007 
00008 k Nearest Neighbors is a supervised learning algorithm that classifies
00009 a new observation based the classes in its surrounding neighborhood.
00010 
00011 Glossary:
00012 distance   The distance between two points in the feature space.
00013 weight     The importance given to each point for classification. 
00014 
00015 
00016 Classes:
00017 kNN           Holds information for a nearest neighbors classifier.
00018 
00019 
00020 Functions:
00021 train        Train a new kNN classifier.
00022 calculate    Calculate the probabilities of each class, given an observation.
00023 classify     Classify an observation into a class.
00024 
00025     Weighting Functions:
00026 equal_weight    Every example is given a weight of 1.
00027 
00028 """
00029 
00030 import numpy
00031 
00032 class kNN(object):
00033     """Holds information necessary to do nearest neighbors classification.
00034 
00035     Members:
00036     classes  Set of the possible classes.
00037     xs       List of the neighbors.
00038     ys       List of the classes that the neighbors belong to.
00039     k        Number of neighbors to look at.
00040 
00041     """
00042     def __init__(self):
00043         """kNN()"""
00044         self.classes = set()
00045         self.xs = []
00046         self.ys = []
00047         self.k = None
00048 
00049 def equal_weight(x, y):
00050     """equal_weight(x, y) -> 1"""
00051     # everything gets 1 vote
00052     return 1
00053 
00054 def train(xs, ys, k, typecode=None):
00055     """train(xs, ys, k) -> kNN
00056     
00057     Train a k nearest neighbors classifier on a training set.  xs is a
00058     list of observations and ys is a list of the class assignments.
00059     Thus, xs and ys should contain the same number of elements.  k is
00060     the number of neighbors that should be examined when doing the
00061     classification.
00062     
00063     """
00064     knn = kNN()
00065     knn.classes = set(ys)
00066     knn.xs = numpy.asarray(xs, typecode)
00067     knn.ys = ys
00068     knn.k = k
00069     return knn
00070 
00071 def calculate(knn, x, weight_fn=equal_weight, distance_fn=None):
00072     """calculate(knn, x[, weight_fn][, distance_fn]) -> weight dict
00073 
00074     Calculate the probability for each class.  knn is a kNN object.  x
00075     is the observed data.  weight_fn is an optional function that
00076     takes x and a training example, and returns a weight.  distance_fn
00077     is an optional function that takes two points and returns the
00078     distance between them.  If distance_fn is None (the default), the
00079     Euclidean distance is used.  Returns a dictionary of the class to
00080     the weight given to the class.
00081     
00082     """
00083     x = numpy.asarray(x)
00084 
00085     order = []  # list of (distance, index)
00086     if distance_fn:
00087         for i in range(len(knn.xs)):
00088             dist = distance_fn(x, knn.xs[i])
00089             order.append((dist, i))
00090     else:
00091         # Default: Use a fast implementation of the Euclidean distance
00092         temp = numpy.zeros(len(x))
00093         # Predefining temp allows reuse of this array, making this
00094         # function about twice as fast.
00095         for i in range(len(knn.xs)):
00096             temp[:] = x - knn.xs[i]
00097             dist = numpy.sqrt(numpy.dot(temp,temp))
00098             order.append((dist, i))
00099     order.sort()
00100 
00101     # first 'k' are the ones I want.
00102     weights = {}  # class -> number of votes
00103     for k in knn.classes:
00104         weights[k] = 0.0
00105     for dist, i in order[:knn.k]:
00106         klass = knn.ys[i]
00107         weights[klass] = weights[klass] + weight_fn(x, knn.xs[i])
00108 
00109     return weights
00110 
00111 def classify(knn, x, weight_fn=equal_weight, distance_fn=None):
00112     """classify(knn, x[, weight_fn][, distance_fn]) -> class
00113 
00114     Classify an observation into a class.  If not specified, weight_fn will
00115     give all neighbors equal weight.  distance_fn is an optional function
00116     that takes two points and returns the distance between them.  If
00117     distance_fn is None (the default), the Euclidean distance is used.
00118     """
00119     weights = calculate(
00120         knn, x, weight_fn=weight_fn, distance_fn=distance_fn)
00121 
00122     most_class = None
00123     most_weight = None
00124     for klass, weight in weights.items():
00125         if most_class is None or weight > most_weight:
00126             most_class = klass
00127             most_weight = weight
00128     return most_class