Back to index

python-biopython  1.60
_Motif.py
Go to the documentation of this file.
00001 # Copyright 2003-2009 by Bartek Wilczynski.  All rights reserved.
00002 # This code is part of the Biopython distribution and governed by its
00003 # license.  Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 """Implementation of sequence motifs (PRIVATE).
00006 """
00007 from Bio.Seq import Seq
00008 from Bio.SubsMat import FreqTable
00009 from Bio.Alphabet import IUPAC
00010 import math,random
00011 
00012 class Motif(object):
00013     """
00014     A class representing sequence motifs.
00015     """
00016     def __init__(self,alphabet=IUPAC.unambiguous_dna):
00017         self.instances = []
00018         self.has_instances=False
00019         self.counts = {}
00020         self.has_counts=False
00021         self.mask = []
00022         self._pwm_is_current = False
00023         self._pwm = []
00024         self._log_odds_is_current = False
00025         self._log_odds = []
00026         self.alphabet=alphabet
00027         self.length=None
00028         self.background=dict((n, 1.0/len(self.alphabet.letters)) \
00029                              for n in self.alphabet.letters)
00030         self.beta=1.0
00031         self.info=None
00032         self.name=""
00033 
00034     def _check_length(self, len):
00035         if self.length==None:
00036             self.length = len
00037         elif self.length != len:
00038             print "len",self.length,self.instances, len
00039             raise ValueError("You can't change the length of the motif")
00040 
00041     def _check_alphabet(self,alphabet):
00042         if self.alphabet==None:
00043             self.alphabet=alphabet
00044         elif self.alphabet != alphabet:
00045                 raise ValueError("Wrong Alphabet")
00046         
00047     def add_instance(self,instance):
00048         """
00049         adds new instance to the motif
00050         """
00051         self._check_alphabet(instance.alphabet)
00052         self._check_length(len(instance))
00053         if self.has_counts:
00054             for i in range(self.length):
00055                 let=instance[i]
00056                 self.counts[let][i]+=1
00057 
00058         if self.has_instances or not self.has_counts:
00059             self.instances.append(instance)
00060             self.has_instances=True
00061             
00062         self._pwm_is_current = False
00063         self._log_odds_is_current = False
00064 
00065  
00066     def set_mask(self,mask):
00067         """
00068         sets the mask for the motif
00069 
00070         The mask should be a string containing asterisks in the position of significant columns and spaces in other columns
00071         """
00072         self._check_length(len(mask))
00073         self.mask=[]
00074         for char in mask:
00075             if char=="*":
00076                 self.mask.append(1)
00077             elif char==" ":
00078                 self.mask.append(0)
00079             else:
00080                 raise ValueError("Mask should contain only '*' or ' ' and not a '%s'"%char)
00081     
00082     def pwm(self,laplace=True):
00083         """
00084         returns the PWM computed for the set of instances
00085 
00086         if laplace=True (default), pseudocounts equal to self.background multiplied by self.beta are added to all positions.
00087         """
00088         
00089         if self._pwm_is_current:
00090             return self._pwm
00091         #we need to compute new pwm
00092         self._pwm = []
00093         for i in xrange(self.length):
00094             dict = {}
00095             #filling the dict with 0's
00096             for letter in self.alphabet.letters:
00097                 if laplace:
00098                     dict[letter]=self.beta*self.background[letter]
00099                 else:
00100                     dict[letter]=0.0
00101             if self.has_counts:
00102                 #taking the raw counts
00103                 for letter in self.alphabet.letters:
00104                     dict[letter]+=self.counts[letter][i]
00105             elif self.has_instances:
00106                 #counting the occurences of letters in instances
00107                 for seq in self.instances:
00108                     #dict[seq[i]]=dict[seq[i]]+1
00109                     try:
00110                         dict[seq[i]]+=1
00111                     except KeyError: #we need to ignore non-alphabet letters
00112                         pass
00113             self._pwm.append(FreqTable.FreqTable(dict,FreqTable.COUNT,self.alphabet)) 
00114         self._pwm_is_current=1
00115         return self._pwm
00116 
00117     def log_odds(self,laplace=True):
00118         """
00119         returns the logg odds matrix computed for the set of instances
00120         """
00121         
00122         if self._log_odds_is_current:
00123             return self._log_odds
00124         #we need to compute new pwm
00125         self._log_odds = []
00126         pwm=self.pwm(laplace)
00127         for i in xrange(self.length):
00128             d = {}
00129             for a in self.alphabet.letters:
00130                     d[a]=math.log(pwm[i][a]/self.background[a],2)
00131             self._log_odds.append(d)
00132         self._log_odds_is_current=1
00133         return self._log_odds
00134 
00135     def ic(self):
00136         """Method returning the information content of a motif.
00137         """
00138         res=0
00139         pwm=self.pwm()
00140         for i in range(self.length):
00141             res+=2
00142             for a in self.alphabet.letters:
00143                 if pwm[i][a]!=0:
00144                     res+=pwm[i][a]*math.log(pwm[i][a],2)
00145         return res
00146 
00147     def exp_score(self,st_dev=False):
00148         """
00149         Computes expected score of motif's instance and its standard deviation
00150         """
00151         exs=0.0
00152         var=0.0
00153         pwm=self.pwm()
00154         for i in range(self.length):
00155             ex1=0.0
00156             ex2=0.0
00157             for a in self.alphabet.letters:
00158                 if pwm[i][a]!=0:
00159                     ex1+=pwm[i][a]*(math.log(pwm[i][a],2)-math.log(self.background[a],2))
00160                     ex2+=pwm[i][a]*(math.log(pwm[i][a],2)-math.log(self.background[a],2))**2
00161             exs+=ex1
00162             var+=ex2-ex1**2
00163         if st_dev:
00164             return exs,math.sqrt(var)
00165         else:
00166             return exs
00167 
00168     def search_instances(self,sequence):
00169         """
00170         a generator function, returning found positions of instances of the motif in a given sequence
00171         """
00172         if not self.has_instances:
00173             raise ValueError ("This motif has no instances")
00174         for pos in xrange(0,len(sequence)-self.length+1):
00175             for instance in self.instances:
00176                 if instance.tostring()==sequence[pos:pos+self.length].tostring():
00177                     yield(pos,instance)
00178                     break # no other instance will fit (we don't want to return multiple hits)
00179 
00180     def score_hit(self,sequence,position,normalized=0,masked=0):
00181         """
00182         give the pwm score for a given position
00183         """
00184         lo=self.log_odds()
00185         score = 0.0
00186         for pos in xrange(self.length):
00187             a = sequence[position+pos]
00188             if not masked or self.mask[pos]:
00189                 try:
00190                     score += lo[pos][a]
00191                 except:
00192                     pass
00193         if normalized:
00194             if not masked:
00195                 score/=self.length
00196             else:
00197                 score/=len([x for x in self.mask if x])
00198         return score
00199     
00200     def search_pwm(self,sequence,normalized=0,masked=0,threshold=0.0,both=True):
00201         """
00202         a generator function, returning found hits in a given sequence with the pwm score higher than the threshold
00203         """
00204         if both:
00205             rc = self.reverse_complement()
00206             
00207         sequence=sequence.tostring().upper()
00208         for pos in xrange(0,len(sequence)-self.length+1):
00209             score = self.score_hit(sequence,pos,normalized,masked)
00210             if score > threshold:
00211                 yield (pos,score)
00212             if both:
00213                 rev_score = rc.score_hit(sequence,pos,normalized,masked)
00214                 if rev_score > threshold:
00215                     yield (-pos,rev_score)
00216 
00217     def dist_pearson(self, motif, masked = 0):
00218         """
00219         return the similarity score based on pearson correlation for the given motif against self.
00220 
00221         We use the Pearson's correlation of the respective probabilities.
00222         """
00223 
00224         if self.alphabet != motif.alphabet:
00225             raise ValueError("Cannot compare motifs with different alphabets")
00226 
00227         max_p=-2
00228         for offset in range(-self.length+1,motif.length):
00229             if offset<0:
00230                 p = self.dist_pearson_at(motif,-offset)
00231             else: #offset>=0
00232                 p = motif.dist_pearson_at(self,offset)
00233             
00234             if max_p<p:
00235                 max_p=p
00236                 max_o=-offset
00237         return 1-max_p,max_o
00238 
00239     def dist_pearson_at(self,motif,offset):
00240         sxx = 0 # \sum x^2
00241         sxy = 0 # \sum x \cdot y
00242         sx = 0  # \sum x
00243         sy = 0  # \sum y
00244         syy = 0 # \sum x^2
00245         norm=max(self.length,offset+motif.length)
00246         
00247         for pos in range(max(self.length,offset+motif.length)):
00248             for l in self.alphabet.letters:
00249                 xi = self[pos][l]
00250                 yi = motif[pos-offset][l]
00251                 sx = sx + xi
00252                 sy = sy + yi
00253                 sxx = sxx + xi * xi
00254                 syy = syy + yi * yi
00255                 sxy = sxy + xi * yi
00256 
00257         norm *= len(self.alphabet.letters)
00258         s1 = (sxy - sx*sy*1.0/norm)
00259         s2 = (norm*sxx - sx*sx*1.0)*(norm*syy- sy*sy*1.0)
00260         return s1/math.sqrt(s2)
00261 
00262     def dist_product(self,other):
00263         """
00264         A similarity measure taking into account a product probability of generating overlaping instances of two motifs
00265         """
00266         max_p=0.0
00267         for offset in range(-self.length+1,other.length):
00268             if offset<0:
00269                 p = self.dist_product_at(other,-offset)
00270             else: #offset>=0
00271                 p = other.dist_product_at(self,offset)
00272             if max_p<p:
00273                 max_p=p
00274                 max_o=-offset
00275         return 1-max_p/self.dist_product_at(self,0),max_o
00276             
00277     def dist_product_at(self,other,offset):
00278         s=0
00279         for i in range(max(self.length,offset+other.length)):
00280             f1=self[i]
00281             f2=other[i-offset]
00282             for n,b in self.background.iteritems():
00283                 s+=b*f1[n]*f2[n]
00284         return s/i
00285 
00286     def dist_dpq(self,other):
00287         r"""Calculates the DPQ distance measure between motifs.
00288 
00289         It is calculated as a maximal value of DPQ formula (shown using LaTeX
00290         markup, familiar to mathematicians):
00291         
00292         \sqrt{\sum_{i=1}^{alignment.len()} \sum_{k=1}^alphabet.len() \
00293         \{ m1[i].freq(alphabet[k])*log_2(m1[i].freq(alphabet[k])/m2[i].freq(alphabet[k])) +
00294            m2[i].freq(alphabet[k])*log_2(m2[i].freq(alphabet[k])/m1[i].freq(alphabet[k]))
00295         }
00296         
00297         over possible non-spaced alignemts of two motifs.  See this reference:
00298 
00299         D. M Endres and J. E Schindelin, "A new metric for probability
00300         distributions", IEEE transactions on Information Theory 49, no. 7
00301         (July 2003): 1858-1860.
00302         """
00303         
00304         min_d=float("inf")
00305         min_o=-1
00306         d_s=[]
00307         for offset in range(-self.length+1,other.length):
00308             #print "%2.3d"%offset,
00309             if offset<0:
00310                 d = self.dist_dpq_at(other,-offset)
00311                 overlap = self.length+offset
00312             else: #offset>=0
00313                 d = other.dist_dpq_at(self,offset)
00314                 overlap = other.length-offset
00315             overlap = min(self.length,other.length,overlap)
00316             out = self.length+other.length-2*overlap
00317             #print d,1.0*(overlap+out)/overlap,d*(overlap+out)/overlap
00318             #d = d/(2*overlap)
00319             d = (d/(out+overlap))*(2*overlap+out)/(2*overlap)
00320             #print d
00321             d_s.append((offset,d))
00322             if min_d> d:
00323                 min_d=d
00324                 min_o=-offset
00325         return min_d,min_o#,d_s
00326             
00327     def dist_dpq_at(self,other,offset):
00328         """
00329         calculates the dist_dpq measure with a given offset.
00330 
00331         offset should satisfy 0<=offset<=len(self)
00332         """
00333         def dpq (f1,f2,alpha):
00334             s=0
00335             for n in alpha.letters:
00336                 avg=(f1[n]+f2[n])/2
00337                 s+=f1[n]*math.log(f1[n]/avg,2)+f2[n]*math.log(f2[n]/avg,2)
00338             return math.sqrt(s)
00339                 
00340         s=0        
00341         for i in range(max(self.length,offset+other.length)):
00342             f1=self[i]
00343             f2=other[i-offset]
00344             s+=dpq(f1,f2,self.alphabet)
00345         return s
00346             
00347     def _read(self,stream):
00348         """Reads the motif from the stream (in AlignAce format).
00349 
00350         the self.alphabet variable must be set beforehand.
00351         If the last line contains asterisks it is used for setting mask
00352         """
00353         
00354         while 1:
00355             ln = stream.readline()
00356             if "*" in ln:
00357                 self.set_mask(ln.strip("\n\c"))
00358                 break
00359             self.add_instance(Seq(ln.strip(),self.alphabet))
00360         
00361     def __str__(self,masked=False):
00362         """ string representation of a motif.
00363         """
00364         str = ""
00365         for inst in self.instances:
00366             str = str + inst.tostring() + "\n"
00367 
00368         if masked:
00369             for i in xrange(self.length):
00370                 if self.mask[i]:
00371                     str = str + "*"
00372                 else:
00373                     str = str + " "
00374             str = str + "\n"
00375         return str
00376 
00377     def __len__(self):
00378         """return the length of a motif
00379 
00380         Please use this method (i.e. invoke len(m)) instead of refering to the m.length directly.
00381         """
00382         if self.length==None:
00383             return 0
00384         else:
00385             return self.length
00386         
00387     def _write(self,stream):
00388         """
00389         writes the motif to the stream
00390         """
00391 
00392         stream.write(self.__str__())
00393             
00394             
00395 
00396     def _to_fasta(self):
00397         """
00398         FASTA representation of motif
00399         """
00400         if not self.has_instances:
00401             self.make_instances_from_counts()
00402         str = ""
00403         for i,inst in enumerate(self.instances):
00404             str = str + ">instance%d\n"%i + inst.tostring() + "\n"
00405             
00406         return str       
00407 
00408     def reverse_complement(self):
00409         """
00410         Gives the reverse complement of the motif
00411         """
00412         res = Motif()
00413         if self.has_instances:
00414             for i in self.instances:
00415                 res.add_instance(i.reverse_complement())
00416         else: # has counts
00417             res.has_counts=True
00418             res.counts["A"]=self.counts["T"][:]
00419             res.counts["T"]=self.counts["A"][:]
00420             res.counts["G"]=self.counts["C"][:]
00421             res.counts["C"]=self.counts["G"][:]
00422             res.counts["A"].reverse()
00423             res.counts["C"].reverse()
00424             res.counts["G"].reverse()
00425             res.counts["T"].reverse()
00426             res.length=self.length
00427         res.mask = self.mask
00428         return res
00429 
00430         
00431     def _from_jaspar_pfm(self,stream,make_instances=False):
00432         """
00433         reads the motif from Jaspar .pfm file
00434 
00435         The instances are fake, but the pwm is accurate.
00436         """
00437         return self._from_horiz_matrix(stream,letters="ACGT",make_instances=make_instances)
00438 
00439     def _from_vert_matrix(self,stream,letters=None,make_instances=False):
00440         """reads a vertical count matrix from stream and fill in the counts.
00441         """
00442 
00443         self.counts = {}
00444         self.has_counts=True
00445         if letters==None:
00446             letters=self.alphabet.letters
00447         self.length=0
00448         for i in letters:
00449             self.counts[i]=[]
00450         for ln in stream.readlines():
00451             rec=map(float,ln.strip().split())
00452             for k,v in zip(letters,rec):
00453                 self.counts[k].append(v)
00454             self.length+=1
00455         self.set_mask("*"*self.length)
00456         if make_instances==True:
00457             self.make_instances_from_counts()
00458         return self
00459         
00460     def _from_horiz_matrix(self,stream,letters=None,make_instances=False):
00461         """reads a horizontal count matrix from stream and fill in the counts.
00462         """
00463         if letters==None:
00464             letters=self.alphabet.letters
00465         self.counts = {}
00466         self.has_counts=True
00467 
00468         for i in letters:
00469             ln = stream.readline().strip().split()
00470             #if there is a letter in the beginning, ignore it
00471             if ln[0]==i:
00472                 ln=ln[1:]
00473             #print ln
00474             try:
00475                 self.counts[i]=map(int,ln)
00476             except ValueError: #not integers
00477                 self.counts[i]=map(float,ln) #map(lambda s: int(100*float(s)),ln)
00478             #print counts[i]
00479         
00480         s = sum(self.counts[nuc][0] for nuc in letters)
00481         l = len(self.counts[letters[0]])
00482         self.length=l
00483         self.set_mask("*"*l)
00484         if make_instances==True:
00485             self.make_instances_from_counts()
00486         return self
00487         
00488 
00489     def make_instances_from_counts(self):
00490         """Creates "fake" instances for a motif created from a count matrix.
00491 
00492         In case the sums of counts are different for different columnes, the
00493         shorter columns are padded with background.
00494         """
00495         alpha="".join(self.alphabet.letters)
00496         #col[i] is a column taken from aligned motif instances
00497         col=[]
00498         self.has_instances=True
00499         self.instances=[]
00500         s = sum(map(lambda nuc: self.counts[nuc][0],self.alphabet.letters))
00501         for i in range(self.length):
00502             col.append("")
00503             for n in self.alphabet.letters:
00504                 col[i] = col[i]+ (n*(self.counts[n][i]))
00505             if len(col[i])<s:
00506                 print "WARNING, column too short",len(col[i]),s
00507                 col[i]+=(alpha*s)[:(s-len(col[i]))]
00508             #print i,col[i]
00509         #iterate over instances
00510         for i in range(s): 
00511             inst="" #start with empty seq
00512             for j in range(self.length): #iterate over positions
00513                 inst+=col[j][i]
00514             #print i,inst
00515             inst=Seq(inst,self.alphabet)                
00516             self.add_instance(inst)
00517         return self.instances
00518 
00519     def make_counts_from_instances(self):
00520         """Creates the count matrix for a motif with instances.
00521 
00522         """
00523         #make strings for "columns" of motifs
00524         #col[i] is a column taken from aligned motif instances
00525         counts={}
00526         for a in self.alphabet.letters:
00527             counts[a]=[]
00528         self.has_counts=True
00529         s = len(self.instances)
00530         for i in range(self.length):
00531             ci = dict((a,0) for a in self.alphabet.letters)
00532             for inst in self.instances:
00533                 ci[inst[i]]+=1
00534             for a in self.alphabet.letters:
00535                 counts[a].append(ci[a])
00536         self.counts=counts
00537         return counts
00538 
00539     def _from_jaspar_sites(self,stream):
00540         """
00541         reads the motif from Jaspar .sites file
00542 
00543         The instances and pwm are OK.
00544         """
00545         
00546         while True:
00547             ln = stream.readline()# read the header "$>...."
00548             if ln=="" or ln[0]!=">":
00549                 break
00550             
00551             ln=stream.readline().strip()#read the actual sequence
00552             i=0
00553             while ln[i]==ln[i].lower():
00554                 i+=1
00555             inst=""
00556             while i<len(ln) and ln[i]==ln[i].upper():
00557                 inst+=ln[i]
00558                 i+=1
00559             inst=Seq(inst,self.alphabet)                
00560             self.add_instance(inst)
00561 
00562         self.set_mask("*"*len(inst))
00563         return self
00564 
00565 
00566     def __getitem__(self,index):
00567         """Returns the probability distribution over symbols at a given position, padding with background.
00568 
00569         If the requested index is out of bounds, the returned distribution comes from background.
00570         """
00571         if index in range(self.length):
00572             return self.pwm()[index]
00573         else:
00574             return self.background
00575 
00576     def consensus(self):
00577         """Returns the consensus sequence of a motif.
00578         """
00579         res=""
00580         for i in range(self.length):
00581             max_f=0
00582             max_n="X"
00583             for n in sorted(self[i]):
00584                 if self[i][n]>max_f:
00585                     max_f=self[i][n]
00586                     max_n=n
00587             res+=max_n
00588         return Seq(res,self.alphabet)
00589 
00590     def anticonsensus(self):
00591         """returns the least probable pattern to be generated from this motif.
00592         """
00593         res=""
00594         for i in range(self.length):
00595             min_f=10.0
00596             min_n="X"
00597             for n in sorted(self[i]):
00598                 if self[i][n]<min_f:
00599                     min_f=self[i][n]
00600                     min_n=n
00601             res+=min_n
00602         return Seq(res,self.alphabet)
00603 
00604     def max_score(self):
00605         """Maximal possible score for this motif.
00606 
00607         returns the score computed for the consensus sequence.
00608         """
00609         return self.score_hit(self.consensus(),0)
00610     
00611     def min_score(self):
00612         """Minimal possible score for this motif.
00613 
00614         returns the score computed for the anticonsensus sequence.
00615         """
00616         return self.score_hit(self.anticonsensus(),0)
00617 
00618     def weblogo(self,fname,format="PNG",**kwds):
00619         """
00620         uses the Berkeley weblogo service to download and save a weblogo of itself
00621         
00622         requires an internet connection.
00623         The parameters from **kwds are passed directly to the weblogo server.
00624         """
00625         import urllib
00626         import urllib2
00627         al= self._to_fasta()
00628         url = 'http://weblogo.berkeley.edu/logo.cgi'
00629         values = {'sequence' : al,
00630                   'format' : format,
00631                   'logowidth' : '18',
00632                   'logoheight' : '5',
00633                   'logounits' : 'cm',
00634                   'kind' : 'AUTO',
00635                   'firstnum' : "1",
00636                   'command' : 'Create Logo',
00637                   'smallsamplecorrection' : "on",
00638                   'symbolsperline' : 32,
00639                   'res' : '96',
00640                   'res_units' : 'ppi',
00641                   'antialias' : 'on',
00642                   'title' : '',
00643                   'barbits' : '',
00644                   'xaxis': 'on',
00645                   'xaxis_label'  : '',
00646                   'yaxis': 'on',
00647                   'yaxis_label' : '',
00648                   'showends' : 'on',
00649                   'shrink' : '0.5',
00650                   'fineprint' : 'on',
00651                   'ticbits' : '1',
00652                   'colorscheme' : 'DEFAULT',
00653                   'color1' : 'green',
00654                   'color2' : 'blue',
00655                   'color3' : 'red',
00656                   'color4' : 'black',
00657                   'color5' : 'purple',
00658                   'color6' : 'orange',
00659                   'color1' : 'black',
00660                   }
00661         for k,v in kwds.iteritems():
00662             values[k]=str(v)
00663             
00664         data = urllib.urlencode(values)
00665         req = urllib2.Request(url, data)
00666         response = urllib2.urlopen(req)
00667         f=open(fname,"w")
00668         im=response.read()
00669         
00670         f.write(im)
00671         f.close()
00672   
00673 
00674     def _to_transfac(self):
00675         """Write the representation of a motif in TRANSFAC format
00676         """
00677         res="XX\nTY Motif\n" #header
00678         try:
00679             res+="ID %s\n"%self.name
00680         except:
00681             pass
00682         res+="BF undef\nP0"
00683         for a in self.alphabet.letters:
00684             res+=" %s"%a
00685         res+="\n"
00686         if not self.has_counts:
00687             self.make_counts_from_instances()
00688         for i in range(self.length):
00689             if i<9:
00690                 res+="0%d"%(i+1)
00691             else:
00692                 res+="%d"%(i+1)
00693             for a in self.alphabet.letters:
00694                 res+=" %d"%self.counts[a][i]
00695             res+="\n"
00696         res+="XX\n"
00697         return res
00698 
00699     def _to_vertical_matrix(self,letters=None):
00700         """Return string representation of the motif as  a matrix.
00701         
00702         """
00703         if letters==None:
00704             letters=self.alphabet.letters
00705         self._pwm_is_current=False
00706         pwm=self.pwm(laplace=False)
00707         res=""
00708         for i in range(self.length):
00709             res+="\t".join([str(pwm[i][a]) for a in letters])
00710             res+="\n"
00711         return res
00712     
00713     def _to_horizontal_matrix(self,letters=None,normalized=True):
00714         """Return string representation of the motif as  a matrix.
00715         
00716         """
00717         if letters==None:
00718             letters=self.alphabet.letters
00719         res=""
00720         if normalized: #output PWM
00721             self._pwm_is_current=False
00722             mat=self.pwm(laplace=False)
00723             for a in letters:
00724                 res+="\t".join([str(mat[i][a]) for i in range(self.length)])
00725                 res+="\n"
00726         else: #output counts
00727             if not self.has_counts:
00728                 self.make_counts_from_instances()
00729             mat=self.counts
00730             for a in letters:
00731                 res+="\t".join([str(mat[a][i]) for i in range(self.length)])
00732                 res+="\n"
00733         return res
00734 
00735     def _to_jaspar_pfm(self):
00736         """Returns the pfm representation of the motif
00737         """
00738         return self._to_horizontal_matrix(normalized=False,letters="ACGT")
00739 
00740     def format(self,format):
00741         """Returns a string representation of the Motif in a given format
00742 
00743         Currently supported fromats:
00744          - jaspar-pfm : JASPAR Position Frequency Matrix
00745          - transfac : TRANSFAC like files
00746          - fasta : FASTA file with instances
00747         """
00748 
00749         formatters={
00750             "jaspar-pfm":   self._to_jaspar_pfm,
00751             "transfac":     self._to_transfac,
00752             "fasta" :       self._to_fasta,
00753             }
00754 
00755         try:
00756             return formatters[format]()
00757         except KeyError:
00758             raise ValueError("Wrong format type")
00759 
00760     def scanPWM(self,seq):
00761         """Matrix of log-odds scores for a nucleotide sequence.
00762  
00763         scans a nucleotide sequence and returns the matrix of log-odds
00764         scores for all positions.
00765 
00766         - the result is a one-dimensional list or numpy array
00767         - the sequence can only be a DNA sequence
00768         - the search is performed only on one strand
00769         """
00770         if self.alphabet!=IUPAC.unambiguous_dna:
00771             raise ValueError("Wrong alphabet! Use only with DNA motifs")
00772         if seq.alphabet!=IUPAC.unambiguous_dna:
00773             raise ValueError("Wrong alphabet! Use only with DNA sequences")
00774 
00775         seq = seq.tostring()
00776 
00777         # check if the fast C code can be used
00778         try:
00779             import _pwm
00780         except ImportError:
00781             # use the slower Python code otherwise
00782             return self._pwm_calculate(seq)
00783         
00784         # get the log-odds matrix into a proper shape
00785         # (each row contains sorted (ACGT) log-odds values)
00786         logodds=[[y[1] for y in sorted(x.items())] for x in self.log_odds()]
00787         return _pwm.calculate(seq, logodds)
00788 
00789     def _pwm_calculate(self, sequence):
00790         logodds = self.log_odds()
00791         m = len(logodds)
00792         s = len(sequence)
00793         n = s - m + 1
00794         result = [None] * n
00795         for i in xrange(n):
00796             score = 0.0
00797             for j in xrange(m):
00798                 c = sequence[i+j]
00799                 temp = logodds[j].get(c)
00800                 if temp==None:
00801                     break
00802                 score += temp
00803             else:
00804                 result[i] = score
00805         return result