Back to index

python-biopython  1.60
Pattern.py
Go to the documentation of this file.
00001 """Generic functionality useful for all gene representations.
00002 
00003 This module contains classes which can be used for all the different
00004 types of patterns available for representing gene information (ie. motifs,
00005 signatures and schemas). These are the general classes which should
00006 handle any of the different specific patterns.
00007 """
00008 # standard library
00009 import random
00010 
00011 # biopython
00012 from Bio.Alphabet import _verify_alphabet
00013 from Bio.Seq import Seq, MutableSeq
00014 
class PatternIO(object):
    """Allow reading and writing of patterns to files.

    This just defines a simple persistence class for patterns, making
    it easy to write them to a file and read them back.

    Each pattern occupies one line. A signature (a list or tuple of
    items) is written as its items joined with the separator, and any
    line containing the separator is read back as a tuple.
    """
    def __init__(self, alphabet=None):
        """Initialize the reader and writer class.

        Arguments:

        o alphabet - An optional argument specifying the alphabet
        which patterns should follow. If an alphabet is set it'll be used
        to verify that all patterns follow it.

        Attributes:
        o separator - A character to use in separating items in a signature
        when it is written to a file and read back. This character should
        not be in the possible alphabet of the sequences, or there will
        be trouble.
        """
        self._alphabet = alphabet

        self.separator = ";"

    def write(self, pattern_list, output_handle):
        """Write a list of patterns to the given handle.

        Arguments:

        o pattern_list - The patterns to write. Lists and tuples
        (including subclasses such as namedtuples) are treated as
        signatures and joined with the separator; anything else is
        written using its string representation.

        o output_handle - An object with a write() method; one line is
        written to it per pattern.
        """
        for pattern in pattern_list:
            # deal with signatures, concatenate them with the separator.
            # isinstance is used so list/tuple subclasses are joined too,
            # rather than being passed to the % operator as format args.
            if isinstance(pattern, (list, tuple)):
                string_pattern = self.separator.join(pattern)
            # deal with the normal cases
            else:
                string_pattern = pattern

            output_handle.write("%s\n" % (string_pattern,))

    def write_seq(self, seq_pattern_list, output_handle):
        """Convenience function to write Seq objects to a file.

        This can take Seqs and MutableSeqs, and write them to a file
        as strings.

        Raises ValueError if an item is neither a Seq nor a MutableSeq.
        """
        # convert the seq patterns into just string patterns
        all_patterns = []

        for seq_pattern in seq_pattern_list:
            if isinstance(seq_pattern, (MutableSeq, Seq)):
                # str() gives the full sequence for both classes and
                # avoids the deprecated tostring()/toseq() methods.
                all_patterns.append(str(seq_pattern))
            else:
                # wrap in a one-item tuple so that a tuple argument is
                # reported via ValueError instead of breaking the format.
                raise ValueError("Unexpected pattern type %r"
                                 % (seq_pattern,))

        self.write(all_patterns, output_handle)

    def read(self, input_handle):
        """Read patterns from the specified handle.

        Lines are read with readline() until an empty result is
        returned. Trailing whitespace is stripped from each line; lines
        containing the separator become tuples (signatures), all other
        lines are kept as plain strings.

        Returns a list of the patterns in the order they were read.

        Raises ValueError if an alphabet was given and any pattern (or
        any item of a signature) does not match it.
        """
        all_patterns = []

        while True:
            cur_line = input_handle.readline()

            if not cur_line:
                break

            cur_pattern = cur_line.rstrip()
            # split up signatures
            if self.separator in cur_pattern:
                cur_pattern = tuple(cur_pattern.split(self.separator))

            if self._alphabet is not None:
                # make single patterns (not signatures) into lists, so we
                # can check signatures and single patterns the same
                if isinstance(cur_pattern, tuple):
                    test_pattern = cur_pattern
                else:
                    test_pattern = [cur_pattern]
                for pattern_item in test_pattern:
                    pattern_seq = Seq(pattern_item, self._alphabet)
                    if not _verify_alphabet(pattern_seq):
                        raise ValueError("Pattern %s not matching alphabet %s"
                                         % (cur_pattern, self._alphabet))

            all_patterns.append(cur_pattern)

        return all_patterns
00106 
class PatternRepository(object):
    """This holds a list of specific patterns found in sequences.

    This is designed to be a general holder for a set of patterns and
    should be subclassed for specific implementations (ie. holding Motifs
    or Signatures).
    """
    def __init__(self, pattern_info):
        """Initialize a repository with patterns,

        Arguments:

        o pattern_info - A representation of all of the patterns found in
        a *Finder search. This should be a dictionary, where the keys
        are patterns, and the values are the number of times a pattern is
        found.

        The patterns are represented internally as a list of two
        tuples, where the first element is the number of times a pattern
        occurs, and the second is the pattern itself. This makes it easy
        to sort the list and return the top N patterns.
        """
        self._pattern_dict = pattern_info

        # create the list representation, most frequent pattern first
        # (ties on the count are broken by the pattern itself)
        self._pattern_list = sorted(
            ((self._pattern_dict[pattern_name], pattern_name)
             for pattern_name in self._pattern_dict),
            reverse=True)

    def get_all(self):
        """Retrieve all of the patterns in the repository.

        Returns a new list of the patterns, most frequent first.
        """
        return [pattern_info[1] for pattern_info in self._pattern_list]

    def get_random(self, num_patterns):
        """Retrieve the specified number of patterns randomly.

        Randomly selects distinct patterns from the list and returns them.

        Arguments:

        o num_patterns - The total number of patterns to return.

        Raises ValueError if more patterns are requested than the
        repository currently holds (previously this never returned).
        """
        # nothing requested; also keeps this safe for an empty repository
        if num_patterns <= 0:
            return []

        available = self.get_all()
        if num_patterns > len(available):
            raise ValueError("Cannot select %s random patterns from a "
                             "repository of %s patterns"
                             % (num_patterns, len(available)))

        # sample() picks without replacement, so no pattern is repeated
        return random.sample(available, num_patterns)

    def get_top_percentage(self, percent):
        """Return a percentage of the patterns.

        This returns the top 'percent' percentage of the patterns in the
        repository.

        Arguments:

        o percent - The fraction of the patterns to return; it is
        multiplied by the number of patterns and truncated to an integer
        (so .5 returns the top half).
        """
        all_patterns = self.get_all()

        num_to_return = int(len(all_patterns) * percent)

        return all_patterns[:num_to_return]

    def get_top(self, num_patterns):
        """Return the specified number of most frequently occurring patterns

        Arguments:

        o num_patterns - The number of patterns to return.
        """
        return [pattern_info[1]
                for pattern_info in self._pattern_list[:num_patterns]]

    def get_differing(self, top_num, bottom_num):
        """Retrieve patterns that are at the extreme ranges.

        This returns both patterns at the top of the list (ie. the same as
        returned by get_top) and at the bottom of the list. This
        is especially useful for patterns that are the differences between
        two sets of patterns.

        If top_num and bottom_num together exceed the number of patterns,
        the two ranges overlap and the shared patterns appear twice.

        Arguments:

        o top_num - The number of patterns to take from the top of the list.

        o bottom_num - The number of patterns to take from the bottom of
        the list.
        """
        # first get from the top of the list
        all_patterns = [pattern_info[1]
                        for pattern_info in self._pattern_list[:top_num]]

        # then from the bottom; a plain [-bottom_num:] slice would return
        # the whole list when bottom_num is 0, so guard against that
        if bottom_num > 0:
            all_patterns.extend(
                pattern_info[1]
                for pattern_info in self._pattern_list[-bottom_num:])

        return all_patterns

    def remove_polyA(self, at_percentage=.9):
        """Remove patterns which are likely due to polyA tails from the lists.

        This is just a helper function to remove patterns which are likely
        just due to polyA tails, and thus are not really great motifs.
        This will also get rid of stuff like ATATAT, which might be a
        useful motif, so use at your own discretion.

        Only the ordered list used by the get_* methods is changed; the
        counts reported by count() are left untouched.

        XXX Could we write a more general function, based on info content
        or something like that?

        Arguments:

        o at_percentage - The fraction of A and T residues in a pattern
        above which it is removed.
        """
        kept = []
        for pattern_info in self._pattern_list:
            pattern = pattern_info[1]
            # an empty pattern has no AT content to measure (and would
            # divide by zero), so it is never treated as AT rich
            if len(pattern) > 0:
                pattern_at = (float(pattern.count('A') + pattern.count('T'))
                              / len(pattern))
                if pattern_at > at_percentage:
                    continue
            kept.append(pattern_info)

        # update the master list in place in a single pass
        self._pattern_list[:] = kept

    def count(self, pattern):
        """Return the number of times the specified pattern is found.

        Returns 0 for a pattern that is not in the repository.
        """
        try:
            return self._pattern_dict[pattern]
        except KeyError:
            return 0