Back to index

python-biopython  1.60
Thresholds.py
Go to the documentation of this file.
00001 # Copyright 2008 by Norbert Dojer.  All rights reserved.
00002 # Adapted by Bartek Wilczynski.
00003 # This code is part of the Biopython distribution and governed by its
00004 # license.  Please see the LICENSE file that should have been included
00005 # as part of this package.
00006 """Approximate calculation of appropriate thresholds for motif finding
00007 """
00008 import math,random
00009 
00010 class ScoreDistribution(object):
00011     """ Class representing approximate score distribution for a given motif.
00012 
00013     Utilizes a dynamic programming approch to calculate the distribution of
00014     scores with a predefined precision. Provides a number of methods for calculating
00015     thresholds for motif occurences.
00016     """
00017     def __init__(self,motif,precision=10**3):
00018         self.min_score=min(0.0,motif.min_score())
00019         self.interval=max(0.0,motif.max_score())-self.min_score
00020         self.n_points=precision*motif.length
00021         self.step=self.interval/(self.n_points-1)
00022         self.mo_density=[0.0]*self.n_points
00023         self.mo_density[-self._index_diff(self.min_score)]=1.0
00024         self.bg_density=[0.0]*self.n_points
00025         self.bg_density[-self._index_diff(self.min_score)]=1.0
00026         self.ic=motif.ic()
00027         for lo,mo in zip(motif.log_odds(),motif.pwm()):
00028             self.modify(lo,mo,motif.background)
00029         
00030     def _index_diff(self,x,y=0.0):
00031         return int((x-y+0.5*self.step)//self.step)
00032         
00033     def _add(self,i,j):
00034         return max(0,min(self.n_points-1,i+j))
00035         
00036     def modify(self,scores,mo_probs,bg_probs):
00037         mo_new=[0.0]*self.n_points
00038         bg_new=[0.0]*self.n_points
00039         for k, v in scores.iteritems():
00040             d=self._index_diff(v)
00041             for i in range(self.n_points):
00042                 mo_new[self._add(i,d)]+=self.mo_density[i]*mo_probs[k]
00043                 bg_new[self._add(i,d)]+=self.bg_density[i]*bg_probs[k]
00044         self.mo_density=mo_new
00045         self.bg_density=bg_new
00046         
00047     def threshold_fpr(self,fpr):
00048         """
00049         Approximate the log-odds threshold which makes the type I error (false positive rate).
00050         """
00051         i=self.n_points
00052         prob=0.0
00053         while prob<fpr:
00054             i-=1
00055             prob+=self.bg_density[i]
00056         return self.min_score+i*self.step
00057             
00058     def threshold_fnr(self,fnr):
00059         """
00060         Approximate the log-odds threshold which makes the type II error (false negative rate).
00061         """
00062         i=-1
00063         prob=0.0
00064         while prob<fnr:
00065             i+=1
00066             prob+=self.mo_density[i]
00067         return self.min_score+i*self.step
00068             
00069     def threshold_balanced(self,rate_proportion=1.0,return_rate=False):
00070         """
00071         Approximate the log-odds threshold which makes FNR equal to FPR times rate_proportion
00072         """
00073         i=self.n_points
00074         fpr=0.0
00075         fnr=1.0
00076         while fpr*rate_proportion<fnr:
00077             i-=1
00078             fpr+=self.bg_density[i]
00079             fnr-=self.mo_density[i]
00080         if return_rate:
00081             return self.min_score+i*self.step,fpr
00082         else:
00083             return self.min_score+i*self.step
00084 
00085     def threshold_patser(self):
00086         """Threshold selection mimicking the behaviour of patser (Hertz, Stormo 1999) software.
00087         
00088         It selects such a threshold that the log(fpr)=-ic(M)
00089         note: the actual patser software uses natural logarithms instead of log_2, so the numbers
00090         are not directly comparable.
00091         """
00092         return self.threshold_fpr(fpr=2**-self.ic)