Back to index

python-biopython  1.60
lcc.py
Go to the documentation of this file.
00001 # Copyright 2003, 2007 by Sebastian Bassi. sbassi@genesdigitales.com
00002 # All rights reserved.  This code is part of the Biopython 
00003 # distribution and governed by its license.
00004 # Please see the LICENSE file that should have been included as part
00005 # of this package.
00006 
00007 import math
00008 
00009 def lcc_mult(seq,wsize):
00010     """Local Composition Complexity (LCC) values over sliding window.
00011 
00012     Returns a list of floats, the LCC values for a sliding window over
00013     the sequence.
00014 
00015     seq - an unambiguous DNA sequence (a string or Seq object)
00016     wsize - window size, integer
00017 
00018     The result is the same as applying lcc_simp multiple times, but this
00019     version is optimized for speed. The optimization works by using the
00020     value of previous window as a base to compute the next one."""
00021     l2 = math.log(2)
00022     tamseq = len(seq)
00023     try:
00024         #Assume its a string
00025         upper = seq.upper()
00026     except AttributeError:
00027         #Should be a Seq object then
00028         upper = seq.tostring().upper()
00029     compone = [0]
00030     lccsal = [0]
00031     for i in range(wsize):
00032         compone.append(((i+1)/float(wsize))*
00033                        ((math.log((i+1)/float(wsize)))/l2))
00034     window = seq[0:wsize]
00035     cant_a = window.count('A')
00036     cant_c = window.count('C')
00037     cant_t = window.count('T')
00038     cant_g = window.count('G')
00039     term_a = compone[cant_a]
00040     term_c = compone[cant_c]
00041     term_t = compone[cant_t]
00042     term_g = compone[cant_g]
00043     lccsal.append(-(term_a+term_c+term_t+term_g))
00044     tail = seq[0]
00045     for x in range (tamseq-wsize):
00046         window = upper[x+1:wsize+x+1]
00047         if tail==window[-1]:
00048             lccsal.append(lccsal[-1])
00049         elif tail=='A':
00050             cant_a -= 1
00051             if window.endswith('C'):
00052                 cant_c += 1
00053                 term_a = compone[cant_a]
00054                 term_c = compone[cant_c]
00055                 lccsal.append(-(term_a+term_c+term_t+term_g))
00056             elif window.endswith('T'):
00057                 cant_t += 1
00058                 term_a = compone[cant_a]
00059                 term_t = compone[cant_t]
00060                 lccsal.append(-(term_a+term_c+term_t+term_g))
00061             elif window.endswith('G'):
00062                 cant_g += 1
00063                 term_a = compone[cant_a]
00064                 term_g = compone[cant_g]
00065                 lccsal.append(-(term_a+term_c+term_t+term_g))
00066         elif tail=='C':
00067             cant_c -= 1
00068             if window.endswith('A'):
00069                 cant_a += 1
00070                 term_a = compone[cant_a]
00071                 term_c = compone[cant_c]
00072                 lccsal.append(-(term_a+term_c+term_t+term_g))
00073             elif window.endswith('T'):
00074                 cant_t += 1
00075                 term_c = compone[cant_c]
00076                 term_t = compone[cant_t]
00077                 lccsal.append(-(term_a+term_c+term_t+term_g))
00078             elif window.endswith('G'):
00079                 cant_g += 1
00080                 term_c = compone[cant_c]
00081                 term_g = compone[cant_g]
00082                 lccsal.append(-(term_a+term_c+term_t+term_g))
00083         elif tail=='T':
00084             cant_t -= 1
00085             if window.endswith('A'):
00086                 cant_a += 1
00087                 term_a = compone[cant_a]
00088                 term_t = compone[cant_t]
00089                 lccsal.append(-(term_a+term_c+term_t+term_g))
00090             elif window.endswith('C'):
00091                 cant_c += 1
00092                 term_c = compone[cant_c]
00093                 term_t = compone[cant_t]
00094                 lccsal.append(-(term_a+term_c+term_t+term_g))
00095             elif window.endswith('G'):
00096                 cant_g += 1
00097                 term_t = compone[cant_t]
00098                 term_g = compone[cant_g]
00099                 lccsal.append(-(term_a+term_c+term_t+term_g))
00100         elif tail=='G':
00101             cant_g -= 1
00102             if window.endswith('A'):
00103                 cant_a += 1
00104                 term_a = compone[cant_a]
00105                 term_g = compone[cant_g]
00106                 lccsal.append(-(term_a+term_c+term_t+term_g))
00107             elif window.endswith('C'):
00108                 cant_c += 1
00109                 term_c = compone[cant_c]
00110                 term_g = compone[cant_g]
00111                 lccsal.append(-(term_a+term_c+term_t+term_g))
00112             elif window.endswith('T'):
00113                 cant_t += 1
00114                 term_t = compone[cant_t]
00115                 term_g = compone[cant_g]
00116                 lccsal.append(-(term_a+term_c+term_t+term_g))
00117         tail = window[0]
00118     return lccsal
00119 
00120 def lcc_simp(seq):
00121     """Local Composition Complexity (LCC) for a sequence.
00122 
00123     seq - an unambiguous DNA sequence (a string or Seq object)
00124     
00125     Returns the Local Composition Complexity (LCC) value for the entire
00126     sequence (as a float).
00127 
00128     Reference:
00129     Andrzej K Konopka (2005) Sequence Complexity and Composition
00130     DOI: 10.1038/npg.els.0005260
00131     """
00132     wsize = len(seq)
00133     try:
00134         #Assume its a string
00135         upper = seq.upper()
00136     except AttributeError:
00137         #Should be a Seq object then
00138         upper = seq.tostring().upper()
00139     l2 = math.log(2)
00140     if 'A' not in seq:
00141         term_a = 0
00142         # Check to avoid calculating the log of 0.
00143     else:
00144         term_a = ((upper.count('A'))/float(wsize))*((math.log((upper.count('A'))
00145                                                           /float(wsize)))/l2)
00146     if 'C' not in seq:
00147         term_c = 0
00148     else:
00149         term_c = ((upper.count('C'))/float(wsize))*((math.log((upper.count('C'))
00150                                                           /float(wsize)))/l2)
00151     if 'T' not in seq:
00152         term_t = 0
00153     else:
00154         term_t = ((upper.count('T'))/float(wsize))*((math.log((upper.count('T'))
00155                                                           /float(wsize)))/l2)
00156     if 'G' not in seq:
00157         term_g = 0
00158     else:
00159         term_g = ((upper.count('G'))/float(wsize))*((math.log((upper.count('G'))
00160                                                           /float(wsize)))/l2)
00161     return -(term_a+term_c+term_t+term_g)