Back to index

python-biopython  1.60
Motif.py
Go to the documentation of this file.
00001 """Find and deal with motifs in biological sequence data.
00002 
00003 Representing DNA (or RNA or proteins) in a neural network can be difficult
00004 since input sequences can have different lengths. One way to get around
00005 this problem is to deal with sequences by finding common motifs, and counting
00006 the number of times those motifs occur in a sequence. This information can
00007 then be used for creating the neural networks, with occurances of motifs
00008 going into the network instead of raw sequence data.
00009 """
00010 # biopython
00011 from Bio.Alphabet import _verify_alphabet
00012 from Bio.Seq import Seq
00013 
00014 # local modules
00015 from Pattern import PatternRepository
00016 
00017 class MotifFinder(object):
00018     """Find motifs in a set of Sequence Records.
00019     """
00020     def __init__(self, alphabet_strict = 1):
00021         """Initialize a finder to get motifs.
00022 
00023         Arguments:
00024 
00025         o alphabet_strict - Whether or not motifs should be
00026         restricted to having all of there elements within the alphabet
00027         of the sequences. This requires that the Sequences have a real
00028         alphabet, and that all sequences have the same alphabet.
00029         """
00030         self.alphabet_strict = alphabet_strict
00031 
00032     def find(self, seq_records, motif_size):
00033         """Find all motifs of the given size in the passed SeqRecords.
00034 
00035         Arguments:
00036 
00037         o seq_records - A list of SeqRecord objects which the motifs
00038         will be found from.
00039 
00040         o motif_size - The size of the motifs we want to look for.
00041 
00042         Returns:
00043         A PatternRepository object that contains all of the motifs (and their
00044         counts) found in the training sequences).
00045         """
00046         motif_info = self._get_motif_dict(seq_records, motif_size)
00047 
00048         return PatternRepository(motif_info)
00049 
00050     def _get_motif_dict(self, seq_records, motif_size):
00051         """Return a dictionary with information on motifs.
00052 
00053         This internal function essentially does all of the hard work for
00054         finding motifs, and returns a dictionary containing the found motifs
00055         and their counts. This is internal so it can be reused by
00056         find_motif_differences.
00057         """
00058         if self.alphabet_strict:
00059             alphabet = seq_records[0].seq.alphabet
00060         else:
00061             alphabet = None
00062 
00063         # loop through all records to find the motifs in the sequences
00064         all_motifs = {}
00065         for seq_record in seq_records:
00066             # if we are working with alphabets, make sure we are consistent
00067             if alphabet is not None:
00068                 assert seq_record.seq.alphabet == alphabet, \
00069                        "Working with alphabet %s and got %s" % \
00070                        (alphabet, seq_record.seq.alphabet)
00071 
00072             # now start finding motifs in the sequence
00073             for start in range(len(seq_record.seq) - (motif_size - 1)):
00074                 motif = seq_record.seq[start:start + motif_size].tostring()
00075 
00076                 # if we are being alphabet strict, make sure the motif
00077                 # falls within the specified alphabet
00078                 if alphabet is not None:
00079                     motif_seq = Seq(motif, alphabet)
00080                     if _verify_alphabet(motif_seq):
00081                         all_motifs = self._add_motif(all_motifs, motif)
00082 
00083                 # if we are not being strict, just add the motif
00084                 else:
00085                     all_motifs = self._add_motif(all_motifs, motif)
00086 
00087         return all_motifs
00088 
00089     def find_differences(self, first_records, second_records, motif_size):
00090         """Find motifs in two sets of records and return the differences.
00091 
00092         This is used for finding motifs, but instead of just counting up all
00093         of the motifs in a set of records, this returns the differences
00094         between two listings of seq_records.
00095 
00096         o first_records, second_records - Two listings of SeqRecord objects
00097         to have their motifs compared.
00098 
00099         o motif_size - The size of the motifs we are looking for.
00100 
00101         Returns:
00102         A PatternRepository object that has motifs, but instead of their
00103         raw counts, this has the counts in the first set of records
00104         subtracted from the counts in the second set.
00105         """
00106         first_motifs = self._get_motif_dict(first_records, motif_size)
00107         second_motifs = self._get_motif_dict(second_records, motif_size)
00108 
00109         motif_diffs = {}
00110 
00111         # first deal with all of the keys from the first motif
00112         for cur_key in first_motifs:
00113             if cur_key in second_motifs:
00114                 motif_diffs[cur_key] = first_motifs[cur_key] - \
00115                                        second_motifs[cur_key]
00116             else:
00117                 motif_diffs[cur_key] = first_motifs[cur_key]
00118 
00119         # now see if there are any keys from the second motif
00120         # that we haven't got yet.
00121         missing_motifs = list(second_motifs)
00122 
00123         # remove all of the motifs we've already added
00124         for added_motif in motif_diffs:
00125             if added_motif in missing_motifs:
00126                 missing_motifs.remove(added_motif)
00127 
00128         # now put in all of the motifs we didn't get
00129         for cur_key in missing_motifs:
00130             motif_diffs[cur_key] = 0 - second_motifs[cur_key]
00131 
00132         return PatternRepository(motif_diffs)
00133                 
00134     def _add_motif(self, motif_dict, motif_to_add):
00135         """Add a motif to the given dictionary.
00136         """
00137         # incrememt the count of the motif if it is already present
00138         if motif_to_add in motif_dict:
00139             motif_dict[motif_to_add] += 1
00140         # otherwise add it to the dictionary
00141         else:
00142             motif_dict[motif_to_add] = 1
00143 
00144         return motif_dict
00145     
00146 class MotifCoder(object):
00147     """Convert motifs and a sequence into neural network representations.
00148 
00149     This is designed to convert a sequence into a representation that
00150     can be fed as an input into a neural network. It does this by
00151     representing a sequence based the motifs present.
00152     """
00153     def __init__(self, motifs):
00154         """Initialize an input producer with motifs to look for.
00155 
00156         Arguments:
00157 
00158         o motifs - A complete list of motifs, in order, that are to be
00159         searched for in a sequence.
00160         """
00161         self._motifs = motifs
00162 
00163         # check to be sure the motifs make sense (all the same size)
00164         self._motif_size = len(self._motifs[0])
00165         for motif in self._motifs:
00166             if len(motif) != self._motif_size:
00167                 raise ValueError("Motif %s given, expected motif size %s"
00168                                  % (motif, self._motif_size))
00169 
00170     def representation(self, sequence):
00171         """Represent a sequence as a set of motifs.
00172 
00173         Arguments:
00174 
00175         o sequence - A Bio.Seq object to represent as a motif.
00176 
00177         This converts a sequence into a representation based on the motifs.
00178         The representation is returned as a list of the relative amount of
00179         each motif (number of times a motif occured divided by the total
00180         number of motifs in the sequence). The values in the list correspond
00181         to the input order of the motifs specified in the initializer.
00182         """
00183         # initialize a dictionary to hold the motifs in this sequence
00184         seq_motifs = {}
00185         for motif in self._motifs:
00186             seq_motifs[motif] = 0
00187         
00188         # count all of the motifs we are looking for in the sequence
00189         for start in range(len(sequence) - (self._motif_size - 1)):
00190             motif = sequence[start:start + self._motif_size].tostring()
00191 
00192             if motif in seq_motifs:
00193                 seq_motifs[motif] += 1
00194 
00195         # normalize the motifs to go between zero and one
00196         min_count = min(seq_motifs.values())
00197         max_count = max(seq_motifs.values())
00198 
00199         # as long as we have some motifs present, normalize them
00200         # otherwise we'll just return 0 for everything 
00201         if max_count > 0:
00202             for motif in seq_motifs.keys():
00203                 seq_motifs[motif] = (float(seq_motifs[motif] - min_count)
00204                                      / float(max_count))
00205 
00206         # return the relative motif counts in the specified order
00207         motif_amounts = []
00208         for motif in self._motifs:
00209             motif_amounts.append(seq_motifs[motif])
00210 
00211         return motif_amounts
00212