
python-biopython  1.60
LogisticRegression.py
#!/usr/bin/env python
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.
00005 """
00006 This module provides code for doing logistic regressions.
00007 
00008 
00009 Classes:
00010 LogisticRegression    Holds information for a LogisticRegression classifier.
00011 
00012 
00013 Functions:
00014 train        Train a new classifier.
00015 calculate    Calculate the probabilities of each class, given an observation.
00016 classify     Classify an observation into a class.
00017 """

import numpy
import numpy.linalg

class LogisticRegression(object):
    """Holds information necessary to do logistic regression
    classification.

    Members:
    beta    List of the weights for each dimension.

    """
    def __init__(self):
        """Initialize the classifier with an empty list of weights."""
        self.beta = []

def train(xs, ys, update_fn=None, typecode=None):
    """train(xs, ys[, update_fn][, typecode]) -> LogisticRegression

    Train a logistic regression classifier on a training set.  xs is a
    list of observations and ys is a list of the class assignments,
    which should be 0 or 1.  xs and ys should contain the same number
    of elements.  update_fn is an optional callback function that
    takes as parameters the iteration number and log likelihood.
    typecode is the numpy typecode for the internal arrays (default 'd').

    """
    if len(xs) != len(ys):
        raise ValueError("xs and ys should be the same length.")
    classes = set(ys)
    if classes != set([0, 1]):
        raise ValueError("Classes should be 0's and 1's")
    if typecode is None:
        typecode = 'd'

    # Dimensionality of the data is the dimensionality of the
    # observations plus a constant dimension.
    N, ndims = len(xs), len(xs[0]) + 1
    if N == 0 or ndims == 1:
        raise ValueError("No observations or observation of 0 dimension.")

    # Make an X array, with a constant first dimension.
    X = numpy.ones((N, ndims), typecode)
    X[:, 1:] = xs
    Xt = numpy.transpose(X)
    y = numpy.asarray(ys, typecode)

    # Initialize the beta parameter to 0.
    beta = numpy.zeros(ndims, typecode)

    MAX_ITERATIONS = 500
    CONVERGE_THRESHOLD = 0.01
    stepsize = 1.0
    # Now iterate using Newton-Raphson until the log-likelihoods
    # converge.
    i = 0
    old_beta = old_llik = None
    while i < MAX_ITERATIONS:
        # Calculate the probabilities.  p = e^(beta X) / (1+e^(beta X))
        ebetaX = numpy.exp(numpy.dot(beta, Xt))
        p = ebetaX / (1 + ebetaX)

        # Find the log likelihood score and see if I've converged.
        logp = y * numpy.log(p) + (1 - y) * numpy.log(1 - p)
        llik = sum(logp)
        if update_fn is not None:
            update_fn(i, llik)
        if old_llik is not None:
            # Check to see if the likelihood decreased.  If it did, then
            # restore the old beta parameters and halve the step size.
            if llik < old_llik:
                stepsize = stepsize / 2.0
                beta = old_beta
            # If I've converged, then stop.
            if numpy.fabs(llik - old_llik) <= CONVERGE_THRESHOLD:
                break
        old_llik, old_beta = llik, beta
        i += 1

        # Take one Newton-Raphson-style step: solve (Xt W X) delta = Xt (y-p)
        # for the update to beta, where W is a diagonal weight matrix.
        W = numpy.identity(N) * p
        Xtyp = numpy.dot(Xt, y - p)             # Calculate the first derivative.
        XtWX = numpy.dot(numpy.dot(Xt, W), X)   # Calculate the second derivative.
        delta = numpy.linalg.solve(XtWX, Xtyp)
        if numpy.fabs(stepsize - 1.0) > 0.001:
            delta = delta * stepsize
        beta = beta + delta                 # Update beta.
    else:
        # The while loop ran out of iterations without hitting break.
        raise RuntimeError("Didn't converge.")

    lr = LogisticRegression()
    lr.beta = [float(b) for b in beta]   # Convert back to a plain Python list.
    return lr

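# A minimal usage sketch for train() (the toy data and the show_progress
# callback below are invented for illustration; the classes overlap so the
# fitted weights stay finite):
#
#     xs = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
#     ys = [0, 0, 1, 0, 1, 1]
#     def show_progress(iteration, loglik):
#         print("Iteration %d, log-likelihood %f" % (iteration, loglik))
#     model = train(xs, ys, update_fn=show_progress)
#     print(model.beta)   # [intercept, weight for the single dimension]
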
def calculate(lr, x):
    """calculate(lr, x) -> list of probabilities

    Calculate the probability for each class.  lr is a
    LogisticRegression object.  x is the observed data.  Returns a
    list of the probabilities that the observation belongs to class 0
    and to class 1, respectively.

    """
    # Insert a constant term for x.
    x = numpy.asarray([1.0] + x)
    # Calculate the probability.  p = e^(beta X) / (1+e^(beta X))
    ebetaX = numpy.exp(numpy.dot(lr.beta, x))
    p = ebetaX / (1 + ebetaX)
    return [1 - p, p]

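# For example (a hedged sketch; `model` is assumed to be the result of a
# successful call to train() on one-dimensional observations):
#
#     q0, q1 = calculate(model, [3.5])
#     # q0 is the probability of class 0, q1 of class 1; q0 + q1 == 1.0
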
def classify(lr, x):
    """classify(lr, x) -> 1 or 0

    Classify an observation into a class.  Returns the class with the
    higher probability (class 1 on a tie).

    """
    probs = calculate(lr, x)
    if probs[0] > probs[1]:
        return 0
    return 1
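
# The guarded block below is a hypothetical end-to-end sketch, not part of the
# original module: it trains on invented toy data, then calls calculate() and
# classify() on an unseen observation.
if __name__ == "__main__":
    xs = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]   # made-up observations
    ys = [0, 0, 1, 0, 1, 1]                           # overlapping class labels
    model = train(xs, ys)
    print("beta: %s" % model.beta)
    print("P(class 0), P(class 1) for [3.5]: %s" % calculate(model, [3.5]))
    print("predicted class for [3.5]: %d" % classify(model, [3.5]))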