Back to index

python-biopython  1.60
Signature.py
Go to the documentation of this file.
00001 """Find and deal with signatures in biological sequence data.
00002 
00003 In addition to representing sequences according to motifs (see Motif.py
00004 for more information), we can also use Signatures, which are conserved
00005 regions that are not necessarily consecutive. This may be useful in
00006 the case of very diverged sequences, where signatures may pick out
00007 important conservation that can't be found by motifs (hopefully!).
00008 """
00009 # biopython
00010 from Bio.Alphabet import _verify_alphabet
00011 from Bio.Seq import Seq
00012 
00013 # local stuff
00014 from Pattern import PatternRepository
00015 
00016 class SignatureFinder(object):
00017     """Find Signatures in a group of sequence records.
00018 
00019     In this simple implementation, signatures are just defined as a
00020     two motifs separated by a gap. We need something a lot smarter than
00021     this to find more complicated signatures.
00022     """
00023     def __init__(self, alphabet_strict = 1):
00024         """Initialize a finder to get signatures.
00025 
00026         Arguments:
00027 
00028         o alphabet_strict - Specify whether signatures should be required
00029         to have all letters in the signature be consistent with the
00030         alphabet of the original sequence. This requires that all Seqs
00031         used have a consistent alphabet. This helps protect against getting
00032         useless signatures full of ambiguity signals.
00033         """
00034         self._alphabet_strict = alphabet_strict
00035 
00036     def find(self, seq_records, signature_size, max_gap):
00037         """Find all signatures in a group of sequences.
00038 
00039         Arguments:
00040 
00041         o seq_records - A list of SeqRecord objects we'll use the sequences
00042         from to find signatures.
00043 
00044         o signature_size - The size of each half of a signature (ie. if this
00045         is set at 3, then the signature could be AGC-----GAC)
00046 
00047         o max_gap - The maximum gap size between two parts of a signature.
00048         """
00049         sig_info = self._get_signature_dict(seq_records, signature_size,
00050                                             max_gap)
00051 
00052         return PatternRepository(sig_info)
00053 
00054     def _get_signature_dict(self, seq_records, sig_size, max_gap):
00055         """Return a dictionary with all signatures and their counts.
00056 
00057         This internal function does all of the hard work for the
00058         find_signatures function.
00059         """
00060         if self._alphabet_strict:
00061             alphabet = seq_records[0].seq.alphabet
00062         else:
00063             alphabet = None
00064 
00065         # loop through all records to find signatures
00066         all_sigs = {}
00067         for seq_record in seq_records:
00068             # if we are working with alphabets, make sure we are consistent
00069             if alphabet is not None:
00070                 assert seq_record.seq.alphabet == alphabet, \
00071                        "Working with alphabet %s and got %s" % \
00072                        (alphabet, seq_record.seq.alphabet)
00073 
00074             # now start finding signatures in the sequence
00075             largest_sig_size = sig_size * 2 + max_gap
00076             for start in range(len(seq_record.seq) - (largest_sig_size - 1)):
00077                 # find the first part of the signature
00078                 first_sig = seq_record.seq[start:start + sig_size].tostring()
00079 
00080                 # now find all of the second parts of the signature
00081                 for second in range(start + 1, (start + 1) + max_gap):
00082                     second_sig = seq_record.seq[second: second + sig_size].tostring()
00083 
00084                     # if we are being alphabet strict, make sure both parts
00085                     # of the sig fall within the specified alphabet
00086                     if alphabet is not None:
00087                         first_seq = Seq(first_sig, alphabet)
00088                         second_seq = Seq(second_sig, alphabet)
00089                         if _verify_alphabet(first_seq) \
00090                         and _verify_alphabet(second_seq):
00091                             all_sigs = self._add_sig(all_sigs,
00092                                                      (first_sig, second_sig))
00093 
00094                     # if we are not being strict, just add the motif
00095                     else:
00096                         all_sigs = self._add_sig(all_sigs,
00097                                                  (first_sig, second_sig))
00098 
00099         return all_sigs
00100 
00101     def _add_sig(self, sig_dict, sig_to_add):
00102         """Add a signature to the given dictionary.
00103         """
00104         # incrememt the count of the signature if it is already present
00105         if sig_to_add in sig_dict:
00106             sig_dict[sig_to_add] += 1
00107         # otherwise add it to the dictionary
00108         else:
00109             sig_dict[sig_to_add] = 1
00110 
00111         return sig_dict
00112 
00113 class SignatureCoder(object):
00114     """Convert a Sequence into its signature representatives.
00115 
00116     This takes a sequence and a set of signatures, and converts the
00117     sequence into a list of numbers representing the relative amounts
00118     each signature is seen in the sequence. This allows a sequence to
00119     serve as input into a neural network.
00120     """
00121     def __init__(self, signatures, max_gap):
00122         """Initialize with the signatures to look for.
00123 
00124         Arguments:
00125 
00126         o signatures - A complete list of signatures, in order, that
00127         are to be searched for in the sequences. The signatures should
00128         be represented as a tuple of (first part of the signature,
00129         second_part of the signature) -- ('GATC', 'GATC').
00130 
00131         o max_gap - The maximum gap we can have between the two
00132         elements of the signature.
00133         """
00134         self._signatures = signatures
00135         self._max_gap = max_gap
00136 
00137         # check to be sure the signatures are all the same size
00138         # only do this if we actually have signatures
00139         if len(self._signatures) > 0:
00140             first_sig_size = len(self._signatures[0][0])
00141             second_sig_size = len(self._signatures[0][1])
00142 
00143             assert first_sig_size == second_sig_size, \
00144                    "Ends of the signature do not match: %s" \
00145                    % self._signatures[0]
00146 
00147             for sig in self._signatures:
00148                 assert len(sig[0]) == first_sig_size, \
00149                        "Got first part of signature %s, expected size %s" % \
00150                        (sig[0], first_sig_size)
00151                 assert len(sig[1]) == second_sig_size, \
00152                        "Got second part of signature %s, expected size %s" % \
00153                        (sig[1], second_sig_size)
00154 
00155     def representation(self, sequence):
00156         """Convert a sequence into a representation of its signatures.
00157 
00158         Arguments:
00159 
00160         o sequence - A Seq object we are going to convert into a set of
00161         signatures.
00162 
00163         Returns:
00164         A list of relative signature representations. Each item in the
00165         list corresponds to the signature passed in to the initializer and
00166         is the number of times that the signature was found, divided by the
00167         total number of signatures found in the sequence.
00168         """
00169         # check to be sure we have signatures to deal with,
00170         # otherwise just return an empty list
00171         if len(self._signatures) == 0:
00172             return []
00173         
00174         # initialize a dictionary to hold the signature counts
00175         sequence_sigs = {}
00176         for sig in self._signatures:
00177             sequence_sigs[sig] = 0
00178 
00179         # get a list of all of the first parts of the signatures
00180         all_first_sigs = []
00181         for sig_start, sig_end in self._signatures:
00182             all_first_sigs.append(sig_start)
00183         
00184         # count all of the signatures we are looking for in the sequence
00185         sig_size = len(self._signatures[0][0])
00186         smallest_sig_size = sig_size * 2
00187 
00188         for start in range(len(sequence) - (smallest_sig_size - 1)):
00189             # if the first part matches any of the signatures we are looking
00190             # for, then expand out to look for the second part
00191             first_sig = sequence[start:start + sig_size].tostring()
00192             if first_sig in all_first_sigs:
00193                 for second in range(start + sig_size,
00194                                     (start + sig_size + 1) + self._max_gap):
00195                     second_sig = sequence[second:second + sig_size].tostring()
00196 
00197                     # if we find the motif, increase the counts for it
00198                     if (first_sig, second_sig) in sequence_sigs:
00199                         sequence_sigs[(first_sig, second_sig)] += 1
00200 
00201         # -- normalize the signature info to go between zero and one
00202         min_count = min(sequence_sigs.values())
00203         max_count = max(sequence_sigs.values())
00204 
00205         # as long as we have some signatures present, normalize them
00206         # otherwise we'll just return 0 for everything 
00207         if max_count > 0:
00208             for sig in sequence_sigs:
00209                 sequence_sigs[sig] = (float(sequence_sigs[sig] - min_count)
00210                                       / float(max_count))
00211 
00212         # return the relative signature info in the specified order
00213         sig_amounts = []
00214         for sig in self._signatures:
00215             sig_amounts.append(sequence_sigs[sig])
00216 
00217         return sig_amounts
00218