pythonbiopython
1.60

00001 #!/usr/bin/env python 00002 # This code is part of the Biopython distribution and governed by its 00003 # license. Please see the LICENSE file that should have been included 00004 # as part of this package. 00005 """ 00006 This module provides code for doing knearestneighbors classification. 00007 00008 k Nearest Neighbors is a supervised learning algorithm that classifies 00009 a new observation based the classes in its surrounding neighborhood. 00010 00011 Glossary: 00012 distance The distance between two points in the feature space. 00013 weight The importance given to each point for classification. 00014 00015 00016 Classes: 00017 kNN Holds information for a nearest neighbors classifier. 00018 00019 00020 Functions: 00021 train Train a new kNN classifier. 00022 calculate Calculate the probabilities of each class, given an observation. 00023 classify Classify an observation into a class. 00024 00025 Weighting Functions: 00026 equal_weight Every example is given a weight of 1. 00027 00028 """ 00029 00030 import numpy 00031 00032 class kNN(object): 00033 """Holds information necessary to do nearest neighbors classification. 00034 00035 Members: 00036 classes Set of the possible classes. 00037 xs List of the neighbors. 00038 ys List of the classes that the neighbors belong to. 00039 k Number of neighbors to look at. 00040 00041 """ 00042 def __init__(self): 00043 """kNN()""" 00044 self.classes = set() 00045 self.xs = [] 00046 self.ys = [] 00047 self.k = None 00048 00049 def equal_weight(x, y): 00050 """equal_weight(x, y) > 1""" 00051 # everything gets 1 vote 00052 return 1 00053 00054 def train(xs, ys, k, typecode=None): 00055 """train(xs, ys, k) > kNN 00056 00057 Train a k nearest neighbors classifier on a training set. xs is a 00058 list of observations and ys is a list of the class assignments. 00059 Thus, xs and ys should contain the same number of elements. k is 00060 the number of neighbors that should be examined when doing the 00061 classification. 00062 00063 """ 00064 knn = kNN() 00065 knn.classes = set(ys) 00066 knn.xs = numpy.asarray(xs, typecode) 00067 knn.ys = ys 00068 knn.k = k 00069 return knn 00070 00071 def calculate(knn, x, weight_fn=equal_weight, distance_fn=None): 00072 """calculate(knn, x[, weight_fn][, distance_fn]) > weight dict 00073 00074 Calculate the probability for each class. knn is a kNN object. x 00075 is the observed data. weight_fn is an optional function that 00076 takes x and a training example, and returns a weight. distance_fn 00077 is an optional function that takes two points and returns the 00078 distance between them. If distance_fn is None (the default), the 00079 Euclidean distance is used. Returns a dictionary of the class to 00080 the weight given to the class. 00081 00082 """ 00083 x = numpy.asarray(x) 00084 00085 order = [] # list of (distance, index) 00086 if distance_fn: 00087 for i in range(len(knn.xs)): 00088 dist = distance_fn(x, knn.xs[i]) 00089 order.append((dist, i)) 00090 else: 00091 # Default: Use a fast implementation of the Euclidean distance 00092 temp = numpy.zeros(len(x)) 00093 # Predefining temp allows reuse of this array, making this 00094 # function about twice as fast. 00095 for i in range(len(knn.xs)): 00096 temp[:] = x  knn.xs[i] 00097 dist = numpy.sqrt(numpy.dot(temp,temp)) 00098 order.append((dist, i)) 00099 order.sort() 00100 00101 # first 'k' are the ones I want. 00102 weights = {} # class > number of votes 00103 for k in knn.classes: 00104 weights[k] = 0.0 00105 for dist, i in order[:knn.k]: 00106 klass = knn.ys[i] 00107 weights[klass] = weights[klass] + weight_fn(x, knn.xs[i]) 00108 00109 return weights 00110 00111 def classify(knn, x, weight_fn=equal_weight, distance_fn=None): 00112 """classify(knn, x[, weight_fn][, distance_fn]) > class 00113 00114 Classify an observation into a class. If not specified, weight_fn will 00115 give all neighbors equal weight. distance_fn is an optional function 00116 that takes two points and returns the distance between them. If 00117 distance_fn is None (the default), the Euclidean distance is used. 00118 """ 00119 weights = calculate( 00120 knn, x, weight_fn=weight_fn, distance_fn=distance_fn) 00121 00122 most_class = None 00123 most_weight = None 00124 for klass, weight in weights.items(): 00125 if most_class is None or weight > most_weight: 00126 most_class = klass 00127 most_weight = weight 00128 return most_class