Back to index

python-biopython  1.60
MEME.py
Go to the documentation of this file.
00001 # Copyright 2008 by Bartek Wilczynski
00002 # Adapted from  Bio.MEME.Parser by Jason A. Hackney.  All rights reserved.
00003 # This code is part of the Biopython distribution and governed by its
00004 # license.  Please see the LICENSE file that should have been included
00005 # as part of this package.
00006 
00007 from Bio.Alphabet import IUPAC
00008 from Bio import Seq
00009 import re
00010 from math import sqrt
00011 import sys
00012 from Bio.Motif import Motif
00013 
00014 
00015 
00016 def read(handle):
00017     """Parses the text output of the MEME program into MEME.Record object.
00018     
00019     Example:
00020     
00021     >>> f = open("meme.output.txt")
00022     >>> from Bio.Motif.Parsers import MEME
00023     >>> record = MEME.read(f)
00024     >>> for motif in record.motifs:
00025     ...     for instance in motif.instances:
00026     ...         print instance.motif_name, instance.sequence_name, instance.strand, instance.pvalue
00027     
00028     """
00029     record = MEMERecord()
00030     __read_version(record, handle)
00031     __read_datafile(record, handle)
00032     __read_alphabet(record, handle)
00033     __read_sequence_names(record, handle)
00034     __read_command(record, handle)
00035     for line in handle:
00036         if line.startswith('MOTIF  1'):
00037             break
00038     else:
00039         raise ValueError('Unexpected end of stream')
00040     while True:
00041         motif = __create_motif(line)
00042         motif.alphabet = record.alphabet
00043         record.motifs.append(motif)
00044         __read_motif_name(motif, handle)
00045         __read_motif_sequences(motif, handle, 'revcomp' in record.command)
00046         __skip_unused_lines(handle)
00047         try:
00048             line = handle.next()
00049         except StopIteration:
00050             raise ValueError('Unexpected end of stream: Expected to find new motif, or the summary of motifs')
00051         if line.startswith("SUMMARY OF MOTIFS"):
00052             break
00053         if not line.startswith('MOTIF'):
00054             raise ValueError("Line does not start with 'MOTIF':\n%s" % line)
00055     return record
00056 
00057 
00058 class MEMEMotif (Motif):
00059     """A subclass of Motif used in parsing MEME (and MAST) output.
00060     
00061     This sublcass defines functions and data specific to MEME motifs. 
00062     This includes the evalue for a motif and the PSSM of the motif.
00063     
00064     Methods:
00065     add_instance_from_values (name = 'default', pvalue = 1, sequence = 'ATA', start = 0, strand = +): create a new instance of the motif with the specified values.
00066     add_to_pssm (position): add a new position to the pssm. The position should be a list of nucleotide/amino acid frequencies
00067     add_to_logodds (position): add a new position to the log odds matrix. The position should be a tuple of log odds values for the nucleotide/amino acid at that position.
00068     compare_motifs (other_motif): returns the maximum correlation between this motif and other_motif
00069     """
00070     def __init__ (self):
00071         Motif.__init__(self)
00072         self.evalue = 0.0
00073     
00074     def _numoccurrences (self, number):
00075         if type(number) == int:
00076             self.num_occurrences = number
00077         else:
00078             number = int(number)
00079             self.num_occurrences = number
00080 
00081     def get_instance_by_name (self,name):
00082         for i in self.instances:
00083             if i.sequence_name == name:
00084                 return i
00085         return None
00086 
00087     def add_instance_from_values (self, name = 'default', pvalue = 1, sequence = 'ATA', start = 0, strand = '+'):
00088         inst = MEMEInstance(sequence,self.alphabet)
00089         inst._pvalue(pvalue)
00090         inst._seqname(name)
00091         inst._start(start)
00092         inst._strand(strand)
00093         if self.length:
00094             inst._length(self.length)
00095         else:
00096             inst._length(len(sequence))
00097         if self.name:
00098             inst._motifname(self.name)
00099         self.add_instance(inst)
00100     
00101     def _evalue (self, evalue):
00102         if type(evalue) == float:
00103             self.evalue = evalue
00104         else:
00105             evalue = float(evalue)
00106             self.evalue = evalue
00107     
00108 
00109 class MEMEInstance(Seq.Seq):
00110     """A class describing the instances of a MEME motif, and the data thereof. 
00111     """
00112     def __init__ (self,*args,**kwds):
00113         Seq.Seq.__init__(self,*args,**kwds)
00114         self.sequence_name = ""
00115         self.start = 0
00116         self.pvalue = 1.0
00117         self.strand = 0
00118         self.length = 0
00119         self.motif_name = ""
00120         
00121     
00122     def _seqname (self, name):
00123         self.sequence_name = name
00124         
00125     def _motifname (self, name):
00126         self.motif_name = name
00127     
00128     def _start (self,start):
00129         start = int(start)
00130         self.start = start
00131     
00132     def _pvalue (self,pval):
00133         pval = float(pval)
00134         self.pvalue = pval
00135     
00136     def _score (self, score):
00137         score = float(score)
00138         self.score = score
00139     
00140     def _strand (self, strand):
00141         self.strand = strand
00142     
00143     def _length (self, length):
00144         self.length = length
00145     
00146 
00147 class MEMERecord(object):
00148     """A class for holding the results of a MEME run.
00149     
00150     A MEMERecord is an object that holds the results from running
00151     MEME. It implements no methods of its own.
00152         
00153     """
00154     def __init__ (self):
00155         """__init__ (self)"""
00156         self.motifs = []
00157         self.version = ""
00158         self.datafile = ""
00159         self.command = ""
00160         self.alphabet = None
00161         self.sequence_names = []
00162         
00163     def get_motif_by_name (self, name):
00164         for m in self.motifs:
00165             if m.name == name:
00166                 return m
00167 
00168 
00169 # Everything below is private
00170 
00171 
00172 def __read_version(record, handle):
00173     for line in handle:
00174         if line.startswith('MEME version'):
00175             break
00176     else:
00177         raise ValueError("Improper input file. File should contain a line starting MEME version.")
00178     line = line.strip()
00179     ls = line.split()
00180     record.version = ls[2]
00181 
00182 
00183 def __read_datafile(record, handle):
00184     for line in handle:
00185         if line.startswith('TRAINING SET'):
00186             break
00187     else:
00188         raise ValueError("Unexpected end of stream: 'TRAINING SET' not found.")
00189     try:
00190         line = handle.next()
00191     except StopIteration:
00192         raise ValueError("Unexpected end of stream: Expected to find line starting with '****'")
00193     if not line.startswith('****'):
00194         raise ValueError("Line does not start with '****':\n%s" % line)
00195     try:
00196         line = handle.next()
00197     except StopIteration:
00198         raise ValueError("Unexpected end of stream: Expected to find line starting with 'DATAFILE'")
00199     if not line.startswith('DATAFILE'):
00200         raise ValueError("Line does not start with 'DATAFILE':\n%s" % line)
00201     line = line.strip()
00202     line = line.replace('DATAFILE= ','')
00203     record.datafile = line
00204 
00205 
00206 def __read_alphabet(record, handle):
00207     try:
00208         line = handle.next()
00209     except StopIteration:
00210         raise ValueError("Unexpected end of stream: Expected to find line starting with 'ALPHABET'")
00211     if not line.startswith('ALPHABET'):
00212         raise ValueError("Line does not start with 'ALPHABET':\n%s" % line)
00213     line = line.strip()
00214     line = line.replace('ALPHABET= ','')
00215     if line == 'ACGT':
00216         al = IUPAC.unambiguous_dna
00217     else:
00218         al = IUPAC.protein
00219     record.alphabet = al
00220 
00221 
00222 def __read_sequence_names(record, handle):
00223     try:
00224         line = handle.next()
00225     except StopIteration:
00226         raise ValueError("Unexpected end of stream: Expected to find line starting with 'Sequence name'")
00227     if not line.startswith('Sequence name'):
00228         raise ValueError("Line does not start with 'Sequence name':\n%s" % line)
00229     try:
00230         line = handle.next()
00231     except StopIteration:
00232         raise ValueError("Unexpected end of stream: Expected to find line starting with '----'")
00233     if not line.startswith('----'):
00234         raise ValueError("Line does not start with '----':\n%s" % line)
00235     for line in handle:
00236         if line.startswith('***'):
00237             break
00238         line = line.strip()
00239         ls = line.split()
00240         record.sequence_names.append(ls[0])
00241         if len(ls) == 6:
00242             record.sequence_names.append(ls[3])
00243     else:
00244         raise ValueError("Unexpected end of stream: Expected to find line starting with '***'")
00245 
00246 
00247 def __read_command(record, handle):
00248     for line in handle:
00249         if line.startswith('command:'):
00250             break
00251     else:
00252         raise ValueError("Unexpected end of stream: Expected to find line starting with 'command'")
00253     line = line.strip()
00254     line = line.replace('command: ','')
00255     record.command = line
00256 
00257 
00258 def __create_motif(line):
00259     line = line[5:].strip()
00260     ls = line.split()
00261     motif = MEMEMotif()
00262     motif.length = int(ls[3])
00263     motif._numoccurrences(ls[6])
00264     motif._evalue(ls[12])
00265     return motif
00266 
00267 
00268 def __read_motif_name(motif, handle):
00269     for line in handle:
00270         if 'sorted by position p-value' in line:
00271             break
00272     else:
00273         raise ValueError('Unexpected end of stream: Failed to find motif name')
00274     line = line.strip()
00275     ls = line.split()
00276     name = " ".join(ls[0:2])
00277     motif.name=name
00278 
00279 
00280 def __read_motif_sequences(motif, handle, rv):
00281     try:
00282         line = handle.next()
00283     except StopIteration:
00284         raise ValueError('Unexpected end of stream: Failed to find motif sequences')
00285     if not line.startswith('---'):
00286         raise ValueError("Line does not start with '---':\n%s" % line)
00287     try:
00288         line = handle.next()
00289     except StopIteration:
00290         raise ValueError("Unexpected end of stream: Expected to find line starting with 'Sequence name'")
00291     if not line.startswith('Sequence name'):
00292         raise ValueError("Line does not start with 'Sequence name':\n%s" % line)
00293     try:
00294         line = handle.next()
00295     except StopIteration:
00296         raise ValueError('Unexpected end of stream: Failed to find motif sequences')
00297     if not line.startswith('---'):
00298         raise ValueError("Line does not start with '---':\n%s" % line)
00299     for line in handle:
00300         if line.startswith('---'):
00301             break
00302         line = line.strip()
00303         ls = line.split()
00304         if rv:
00305             #seq = Seq.Seq(ls[5], record.alphabet)
00306             motif.add_instance_from_values(name = ls[0], sequence = ls[5], start = ls[2], pvalue = ls[3], strand = ls[1])
00307         else:
00308             #seq = Seq.Seq(ls[4], record.alphabet)
00309             motif.add_instance_from_values(name = ls[0], sequence = ls[4], start = ls[1], pvalue = ls[2])
00310     else:
00311         raise ValueError('Unexpected end of stream')
00312 
00313 
00314 def __skip_unused_lines(handle):
00315     for line in handle:
00316         if line.startswith('log-odds matrix'):
00317             break
00318     else:
00319         raise ValueError("Unexpected end of stream: Expected to find line starting with 'log-odds matrix'")
00320     for line in handle:
00321         if line.startswith('---'):
00322             break
00323     else:
00324         raise ValueError("Unexpected end of stream: Expected to find line starting with '---'")
00325     for line in handle:
00326         if line.startswith('letter-probability matrix'):
00327             break
00328     else:
00329         raise ValueError("Unexpected end of stream: Expected to find line starting with 'letter-probability matrix'")
00330     for line in handle:
00331         if line.startswith('---'):
00332             break
00333     else:
00334         raise ValueError("Unexpected end of stream: Expected to find line starting with '---'")
00335     for line in handle:
00336         if line.startswith('Time'):
00337             break
00338     else:
00339         raise ValueError("Unexpected end of stream: Expected to find line starting with 'Time'")
00340     try:
00341         line = handle.next()
00342     except StopIteration:
00343         raise ValueError('Unexpected end of stream: Expected to find blank line')
00344     if line.strip():
00345         raise ValueError("Expected blank line, but got:\n%s" % line)
00346     try:
00347         line = handle.next()
00348     except StopIteration:
00349         raise ValueError("Unexpected end of stream: Expected to find line starting with '***'")
00350     if not line.startswith('***'):
00351         raise ValueError("Line does not start with '***':\n%s" % line)
00352     for line in handle:
00353         if line.strip():
00354             break
00355     else:
00356         raise ValueError("Unexpected end of stream: Expected to find line starting with '***'")
00357     if not line.startswith('***'):
00358         raise ValueError("Line does not start with '***':\n%s" % line)
00359