Back to index

python-biopython  1.60
lowess.py
Go to the documentation of this file.
00001 # Copyright 2004-2008 by M de Hoon.
00002 # All rights reserved.
00003 # This code is part of the Biopython distribution and governed by its
00004 # license.  Please see the LICENSE file that should have been included
00005 # as part of this package.
00006 """
00007 This module implements the Lowess function for nonparametric regression.
00008 
00009 Functions:
00010 lowess        Fit a smooth nonparametric regression curve to a scatterplot.
00011 
00012 For more information, see
00013 
00014 William S. Cleveland: "Robust locally weighted regression and smoothing
00015 scatterplots", Journal of the American Statistical Association, December 1979,
00016 volume 74, number 368, pp. 829-836.
00017 
00018 William S. Cleveland and Susan J. Devlin: "Locally weighted regression: An
00019 approach to regression analysis by local fitting", Journal of the American
00020 Statistical Association, September 1988, volume 83, number 403, pp. 596-610.
00021 """
00022 
00023 import numpy
00024 
00025 try:
00026     from Bio.Cluster import median
00027     # The function median in Bio.Cluster is faster than the function median
00028     # in NumPy, as it does not require a full sort.
00029 except ImportError, x:
00030     # Use the median function in NumPy if Bio.Cluster is not available
00031     from numpy import median
00032 
00033 def lowess(x, y, f=2./3., iter=3):
00034     """lowess(x, y, f=2./3., iter=3) -> yest
00035 
00036     Lowess smoother: Robust locally weighted regression.
00037     The lowess function fits a nonparametric regression curve to a scatterplot.
00038     The arrays x and y contain an equal number of elements; each pair
00039     (x[i], y[i]) defines a data point in the scatterplot. The function returns
00040     the estimated (smooth) values of y.
00041 
00042     The smoothing span is given by f. A larger value for f will result in a
00043     smoother curve. The number of robustifying iterations is given by iter. The
00044     function will run faster with a smaller number of iterations.
00045 
00046     x and y should be numpy float arrays of equal length.  The return value is
00047     also a numpy float array of that length.
00048 
00049     e.g.
00050     >>> import numpy
00051     >>> x = numpy.array([4,  4,  7,  7,  8,  9, 10, 10, 10, 11, 11, 12, 12, 12,
00052     ...                 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 16, 16,
00053     ...                 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 20, 20, 20, 20,
00054     ...                 20, 22, 23, 24, 24, 24, 24, 25], numpy.float)
00055     >>> y = numpy.array([2, 10,  4, 22, 16, 10, 18, 26, 34, 17, 28, 14, 20, 24,
00056     ...                 28, 26, 34, 34, 46, 26, 36, 60, 80, 20, 26, 54, 32, 40,
00057     ...                 32, 40, 50, 42, 56, 76, 84, 36, 46, 68, 32, 48, 52, 56,
00058     ...                 64, 66, 54, 70, 92, 93, 120, 85], numpy.float)
00059     >>> result = lowess(x, y)
00060     >>> len(result)
00061     50
00062     >>> print "[%0.2f, ..., %0.2f]" % (result[0], result[-1])
00063     [4.85, ..., 84.98]
00064     """
00065     n = len(x)
00066     r = int(numpy.ceil(f*n))
00067     h = [numpy.sort(abs(x-x[i]))[r] for i in range(n)]
00068     w = numpy.clip(abs(([x]-numpy.transpose([x]))/h),0.0,1.0)
00069     w = 1-w*w*w
00070     w = w*w*w
00071     yest = numpy.zeros(n)
00072     delta = numpy.ones(n)
00073     for iteration in range(iter):
00074         for i in xrange(n):
00075             weights = delta * w[:,i]
00076             weights_mul_x = weights * x
00077             b1 = numpy.dot(weights,y)
00078             b2 = numpy.dot(weights_mul_x,y)
00079             A11 = sum(weights)
00080             A12 = sum(weights_mul_x)
00081             A21 = A12
00082             A22 = numpy.dot(weights_mul_x,x)
00083             determinant = A11*A22 - A12*A21
00084             beta1 = (A22*b1-A12*b2) / determinant
00085             beta2 = (A11*b2-A21*b1) / determinant
00086             yest[i] = beta1 + beta2*x[i]
00087         residuals = y-yest
00088         s = median(abs(residuals))
00089         delta[:] = numpy.clip(residuals/(6*s),-1,1)
00090         delta[:] = 1-delta*delta
00091         delta[:] = delta*delta
00092     return yest
00093 
00094 def _test():
00095     """Run the Bio.Statistics.lowess module's doctests."""
00096     print "Running doctests..."
00097     import doctest
00098     doctest.testmod()
00099     print "Done"
00100 
00101 if __name__ == "__main__":
00102     _test()