Back to index

python-biopython  1.60
AlignInfo.py
Go to the documentation of this file.
00001 """Extract information from alignment objects.
00002 
00003 In order to try and avoid huge alignment objects with tons of functions,
00004 functions which return summary type information about alignments should
00005 be put into classes in this module.
00006 
00007 classes:
00008 o SummaryInfo
00009 o PSSM
00010 """
00011 
00012 # standard library
00013 import math
00014 import sys
00015 
00016 # biopython modules
00017 from Bio import Alphabet
00018 from Bio.Alphabet import IUPAC
00019 from Bio.Seq import Seq
00020 from Bio.SubsMat import FreqTable
00021 
00022 # Expected random distributions for 20-letter protein, and
00023 # for 4-letter nucleotide alphabets
00024 Protein20Random = 0.05
00025 Nucleotide4Random = 0.25
00026 class SummaryInfo(object):
00027     """Calculate summary info about the alignment.
00028 
00029     This class should be used to caclculate information summarizing the
00030     results of an alignment. This may either be straight consensus info
00031     or more complicated things.
00032     """
00033     def __init__(self, alignment):
00034         """Initialize with the alignment to calculate information on.
00035            ic_vector attribute. A dictionary. Keys: column numbers. Values:
00036         """
00037         self.alignment = alignment
00038         self.ic_vector = {}
00039 
00040     def dumb_consensus(self, threshold = .7, ambiguous = "X",
00041                        consensus_alpha = None, require_multiple = 0):
00042         """Output a fast consensus sequence of the alignment.
00043 
00044         This doesn't do anything fancy at all. It will just go through the
00045         sequence residue by residue and count up the number of each type
00046         of residue (ie. A or G or T or C for DNA) in all sequences in the
00047         alignment. If the percentage of the most common residue type is
00048         greater then the passed threshold, then we will add that residue type,
00049         otherwise an ambiguous character will be added.
00050 
00051         This could be made a lot fancier (ie. to take a substitution matrix
00052         into account), but it just meant for a quick and dirty consensus.
00053 
00054         Arguments:
00055         o threshold - The threshold value that is required to add a particular
00056         atom.
00057         o ambiguous - The ambiguous character to be added when the threshold is
00058         not reached.
00059         o consensus_alpha - The alphabet to return for the consensus sequence.
00060         If this is None, then we will try to guess the alphabet.
00061         o require_multiple - If set as 1, this will require that more than
00062         1 sequence be part of an alignment to put it in the consensus (ie.
00063         not just 1 sequence and gaps).
00064         """
00065         # Iddo Friedberg, 1-JUL-2004: changed ambiguous default to "X"
00066         consensus = ''
00067 
00068         # find the length of the consensus we are creating
00069         con_len = self.alignment.get_alignment_length()
00070 
00071         # go through each seq item
00072         for n in range(con_len):
00073             # keep track of the counts of the different atoms we get
00074             atom_dict = {}
00075             num_atoms = 0
00076 
00077             for record in self.alignment._records:
00078                 # make sure we haven't run past the end of any sequences
00079                 # if they are of different lengths
00080                 if n < len(record.seq):
00081                     if record.seq[n] != '-' and record.seq[n] != '.':
00082                         if record.seq[n] not in atom_dict:
00083                             atom_dict[record.seq[n]] = 1
00084                         else:
00085                             atom_dict[record.seq[n]] += 1
00086 
00087                         num_atoms = num_atoms + 1
00088 
00089             max_atoms = []
00090             max_size = 0
00091 
00092             for atom in atom_dict:
00093                 if atom_dict[atom] > max_size:
00094                     max_atoms = [atom]
00095                     max_size = atom_dict[atom]
00096                 elif atom_dict[atom] == max_size:
00097                     max_atoms.append(atom)
00098 
00099             if require_multiple and num_atoms == 1:
00100                 consensus += ambiguous
00101             elif (len(max_atoms) == 1) and ((float(max_size)/float(num_atoms))
00102                                          >= threshold):
00103                 consensus += max_atoms[0]
00104             else:
00105                 consensus += ambiguous
00106 
00107         # we need to guess a consensus alphabet if one isn't specified
00108         if consensus_alpha is None:
00109             consensus_alpha = self._guess_consensus_alphabet(ambiguous)
00110 
00111         return Seq(consensus, consensus_alpha)
00112 
00113     def gap_consensus(self, threshold = .7, ambiguous = "X",
00114                        consensus_alpha = None, require_multiple = 0):
00115         """Same as dumb_consensus(), but allows gap on the output.
00116 
00117         Things to do: Let the user define that with only one gap, the result
00118         character in consensus is gap. Let the user select gap character, now
00119         it takes the same is input.
00120         """
00121         # Iddo Friedberg, 1-JUL-2004: changed ambiguous default to "X"
00122         consensus = ''
00123 
00124         # find the length of the consensus we are creating
00125         con_len = self.alignment.get_alignment_length()
00126 
00127         # go through each seq item
00128         for n in range(con_len):
00129             # keep track of the counts of the different atoms we get
00130             atom_dict = {}
00131             num_atoms = 0
00132 
00133             for record in self.alignment._records:
00134                 # make sure we haven't run past the end of any sequences
00135                 # if they are of different lengths
00136                 if n < len(record.seq):
00137                     if record.seq[n] not in atom_dict:
00138                         atom_dict[record.seq[n]] = 1
00139                     else:
00140                         atom_dict[record.seq[n]] += 1
00141 
00142                     num_atoms += 1
00143 
00144             max_atoms = []
00145             max_size = 0
00146 
00147             for atom in atom_dict:
00148                 if atom_dict[atom] > max_size:
00149                     max_atoms = [atom]
00150                     max_size = atom_dict[atom]
00151                 elif atom_dict[atom] == max_size:
00152                     max_atoms.append(atom)
00153 
00154             if require_multiple and num_atoms == 1:
00155                 consensus += ambiguous
00156             elif (len(max_atoms) == 1) and ((float(max_size)/float(num_atoms))
00157                                          >= threshold):
00158                 consensus += max_atoms[0]
00159             else:
00160                 consensus += ambiguous
00161 
00162         # we need to guess a consensus alphabet if one isn't specified
00163         if consensus_alpha is None:
00164             #TODO - Should we make this into a Gapped alphabet?
00165             consensus_alpha = self._guess_consensus_alphabet(ambiguous)
00166 
00167         return Seq(consensus, consensus_alpha)
00168           
00169     def _guess_consensus_alphabet(self, ambiguous):
00170         """Pick an (ungapped) alphabet for an alignment consesus sequence.
00171 
00172         This just looks at the sequences we have, checks their type, and
00173         returns as appropriate type which seems to make sense with the
00174         sequences we've got.
00175         """
00176         #Start with the (un-gapped version of) the alignment alphabet
00177         a = Alphabet._get_base_alphabet(self.alignment._alphabet)
00178 
00179         #Now check its compatible with all the rest of the sequences
00180         for record in self.alignment:
00181             #Get the (un-gapped version of) the sequence's alphabet
00182             alt =  Alphabet._get_base_alphabet(record.seq.alphabet)
00183             if not isinstance(alt, a.__class__):
00184                 raise ValueError \
00185                 ("Alignment contains a sequence with an incompatible alphabet.")
00186 
00187         #Check the ambiguous character we are going to use in the consensus
00188         #is in the alphabet's list of valid letters (if defined).
00189         if hasattr(a, "letters") and a.letters is not None \
00190         and ambiguous not in a.letters:
00191             #We'll need to pick a more generic alphabet...
00192             if isinstance(a, IUPAC.IUPACUnambiguousDNA):
00193                 if ambiguous in IUPAC.IUPACUnambiguousDNA().letters:
00194                     a = IUPAC.IUPACUnambiguousDNA()
00195                 else:
00196                     a = Alphabet.generic_dna
00197             elif isinstance(a, IUPAC.IUPACUnambiguousRNA):
00198                 if ambiguous in IUPAC.IUPACUnambiguousRNA().letters:
00199                     a = IUPAC.IUPACUnambiguousRNA()
00200                 else:
00201                     a = Alphabet.generic_rna
00202             elif isinstance(a, IUPAC.IUPACProtein):
00203                 if ambiguous in IUPAC.ExtendedIUPACProtein().letters:
00204                     a = IUPAC.ExtendedIUPACProtein()
00205                 else:
00206                     a = Alphabet.generic_protein
00207             else:
00208                 a = Alphabet.single_letter_alphabet
00209         return a
00210 
00211     def replacement_dictionary(self, skip_chars = []):
00212         """Generate a replacement dictionary to plug into a substitution matrix
00213         
00214         This should look at an alignment, and be able to generate the number
00215         of substitutions of different residues for each other in the
00216         aligned object.
00217 
00218         Will then return a dictionary with this information:
00219         {('A', 'C') : 10, ('C', 'A') : 12, ('G', 'C') : 15 ....}
00220 
00221         This also treats weighted sequences. The following example shows how
00222         we calculate the replacement dictionary. Given the following
00223         multiple sequence alignments:
00224 
00225         GTATC  0.5
00226         AT--C  0.8
00227         CTGTC  1.0
00228 
00229         For the first column we have:
00230         ('A', 'G') : 0.5 * 0.8 = 0.4
00231         ('C', 'G') : 0.5 * 1.0 = 0.5
00232         ('A', 'C') : 0.8 * 1.0 = 0.8
00233 
00234         We then continue this for all of the columns in the alignment, summing
00235         the information for each substitution in each column, until we end
00236         up with the replacement dictionary.
00237 
00238         Arguments:
00239         o skip_chars - A list of characters to skip when creating the dictionary.
00240         For instance, you might have Xs (screened stuff) or Ns, and not want
00241         to include the ambiguity characters in the dictionary.
00242         """
00243         # get a starting dictionary based on the alphabet of the alignment
00244         rep_dict, skip_items = self._get_base_replacements(skip_chars)
00245 
00246         # iterate through each record
00247         for rec_num1 in range(len(self.alignment._records)):
00248             # iterate through each record from one beyond the current record
00249             # to the end of the list of records
00250             for rec_num2 in range(rec_num1 + 1, len(self.alignment._records)):
00251                 # for each pair of records, compare the sequences and add
00252                 # the pertinent info to the dictionary
00253                 rep_dict = self._pair_replacement(
00254                     self.alignment._records[rec_num1].seq,
00255                     self.alignment._records[rec_num2].seq,
00256                     self.alignment._records[rec_num1].annotations.get('weight',1.0),
00257                     self.alignment._records[rec_num2].annotations.get('weight',1.0),
00258                     rep_dict, skip_items)
00259 
00260         return rep_dict
00261 
00262     def _pair_replacement(self, seq1, seq2, weight1, weight2,
00263                           start_dict, ignore_chars):
00264         """Compare two sequences and generate info on the replacements seen.
00265 
00266         Arguments:
00267         o seq1, seq2 - The two sequences to compare.
00268         o weight1, weight2 - The relative weights of seq1 and seq2.
00269         o start_dict - The dictionary containing the starting replacement
00270         info that we will modify.
00271         o ignore_chars - A list of characters to ignore when calculating
00272         replacements (ie. '-').
00273 
00274         Returns:
00275         o A replacment dictionary which is modified from initial_dict with
00276         the information from the sequence comparison.
00277         """
00278         # loop through each residue in the sequences
00279         for residue_num in range(len(seq1)):
00280             residue1 = seq1[residue_num]
00281             try:
00282                 residue2 = seq2[residue_num]
00283             # if seq2 is shorter, then we just stop looking at replacements
00284             # and return the information
00285             except IndexError:
00286                 return start_dict
00287 
00288             # if the two residues are characters we want to count
00289             if (residue1 not in ignore_chars) and (residue2 not in ignore_chars):
00290                 try:
00291                     # add info about the replacement to the dictionary,
00292                     # modified by the sequence weights
00293                     start_dict[(residue1, residue2)] += weight1 * weight2
00294                                          
00295                 # if we get a key error, then we've got a problem with alphabets
00296                 except KeyError:
00297                     raise ValueError("Residues %s, %s not found in alphabet %s"
00298                                      % (residue1, residue2,
00299                                         self.alignment._alphabet))
00300 
00301         return start_dict
00302 
00303 
00304     def _get_all_letters(self):
00305         """Returns a string containing the expected letters in the alignment."""
00306         all_letters = self.alignment._alphabet.letters
00307         if all_letters is None \
00308         or (isinstance(self.alignment._alphabet, Alphabet.Gapped) \
00309         and all_letters == self.alignment._alphabet.gap_char):
00310             #We are dealing with a generic alphabet class where the
00311             #letters are not defined!  We must build a list of the
00312             #letters used...
00313             set_letters = set()
00314             for record in self.alignment:
00315                 #Note the built in set does not have a union_update
00316                 #which was provided by the sets module's Set
00317                 set_letters = set_letters.union(record.seq)
00318             list_letters = list(set_letters)
00319             list_letters.sort()
00320             all_letters = "".join(list_letters)
00321         return all_letters
00322 
00323     def _get_base_replacements(self, skip_items = []):
00324         """Get a zeroed dictonary of all possible letter combinations.
00325 
00326         This looks at the type of alphabet and gets the letters for it.
00327         It then creates a dictionary with all possible combinations of these
00328         letters as keys (ie. ('A', 'G')) and sets the values as zero.
00329 
00330         Returns:
00331         o The base dictionary created
00332         o A list of alphabet items to skip when filling the dictionary.Right
00333         now the only thing I can imagine in this list is gap characters, but
00334         maybe X's or something else might be useful later. This will also
00335         include any characters that are specified to be skipped.
00336         """
00337         base_dictionary = {}
00338         all_letters = self._get_all_letters()
00339 
00340         # if we have a gapped alphabet we need to find the gap character
00341         # and drop it out
00342         if isinstance(self.alignment._alphabet, Alphabet.Gapped):
00343             skip_items.append(self.alignment._alphabet.gap_char)
00344             all_letters = all_letters.replace(self.alignment._alphabet.gap_char,'')
00345 
00346         # now create the dictionary
00347         for first_letter in all_letters:
00348             for second_letter in all_letters:
00349                 if (first_letter not in skip_items and
00350                     second_letter not in skip_items):
00351                     base_dictionary[(first_letter, second_letter)] = 0
00352 
00353         return base_dictionary, skip_items
00354 
00355 
00356     def pos_specific_score_matrix(self, axis_seq = None,
00357                                   chars_to_ignore = []):
00358         """Create a position specific score matrix object for the alignment.
00359 
00360         This creates a position specific score matrix (pssm) which is an
00361         alternative method to look at a consensus sequence.
00362 
00363         Arguments:
00364         o chars_to_ignore - A listing of all characters not to include in
00365         the pssm.  If the alignment alphabet declares a gap character,
00366         then it will be excluded automatically.
00367         o axis_seq - An optional argument specifying the sequence to
00368         put on the axis of the PSSM. This should be a Seq object. If nothing
00369         is specified, the consensus sequence, calculated with default
00370         parameters, will be used.
00371 
00372         Returns:
00373         o A PSSM (position specific score matrix) object.
00374         """
00375         # determine all of the letters we have to deal with
00376         all_letters = self._get_all_letters()
00377         assert all_letters
00378 
00379         if not isinstance(chars_to_ignore, list):
00380             raise TypeError("chars_to_ignore should be a list.")
00381 
00382         # if we have a gap char, add it to stuff to ignore
00383         if isinstance(self.alignment._alphabet, Alphabet.Gapped):
00384             chars_to_ignore.append(self.alignment._alphabet.gap_char)
00385         
00386         for char in chars_to_ignore:
00387             all_letters = all_letters.replace(char, '')
00388 
00389         if axis_seq:
00390             left_seq = axis_seq
00391             assert len(axis_seq) == self.alignment.get_alignment_length()
00392         else:
00393             left_seq = self.dumb_consensus()
00394 
00395         pssm_info = []
00396         # now start looping through all of the sequences and getting info
00397         for residue_num in range(len(left_seq)):
00398             score_dict = self._get_base_letters(all_letters)
00399             for record in self.alignment._records:
00400                 try:
00401                     this_residue = record.seq[residue_num]
00402                 # if we hit an index error we've run out of sequence and
00403                 # should not add new residues
00404                 except IndexError:
00405                     this_residue = None
00406                     
00407                 if this_residue and this_residue not in chars_to_ignore:
00408                     weight = record.annotations.get('weight', 1.0)
00409                     try:
00410                         score_dict[this_residue] += weight
00411                     # if we get a KeyError then we have an alphabet problem
00412                     except KeyError:
00413                         raise ValueError("Residue %s not found in alphabet %s"
00414                                      % (this_residue,
00415                                         self.alignment._alphabet))
00416 
00417             pssm_info.append((left_seq[residue_num],
00418                               score_dict))
00419 
00420 
00421         return PSSM(pssm_info)
00422                     
00423     def _get_base_letters(self, letters):
00424         """Create a zeroed dictionary with all of the specified letters.
00425         """
00426         base_info = {}
00427         for letter in letters:
00428             base_info[letter] = 0
00429 
00430         return base_info
00431 
00432     def information_content(self, start = 0,
00433                             end = None,
00434                             e_freq_table = None, log_base = 2,
00435                             chars_to_ignore = []):
00436         """Calculate the information content for each residue along an alignment.
00437 
00438         Arguments:
00439         o start, end - The starting an ending points to calculate the
00440         information content. These points should be relative to the first
00441         sequence in the alignment, starting at zero (ie. even if the 'real'
00442         first position in the seq is 203 in the initial sequence, for
00443         the info content, we need to use zero). This defaults to the entire
00444         length of the first sequence.
00445         o e_freq_table - A FreqTable object specifying the expected frequencies
00446         for each letter in the alphabet we are using (e.g. {'G' : 0.4,
00447         'C' : 0.4, 'T' : 0.1, 'A' : 0.1}). Gap characters should not be
00448         included, since these should not have expected frequencies.
00449         o log_base - The base of the logathrim to use in calculating the
00450         information content. This defaults to 2 so the info is in bits.
00451         o chars_to_ignore - A listing of characterw which should be ignored
00452         in calculating the info content.
00453 
00454         Returns:
00455         o A number representing the info content for the specified region.
00456 
00457         Please see the Biopython manual for more information on how information
00458         content is calculated.
00459         """
00460         # if no end was specified, then we default to the end of the sequence
00461         if end is None:
00462             end = len(self.alignment._records[0].seq)
00463         
00464         if start < 0 or end > len(self.alignment._records[0].seq):
00465             raise ValueError \
00466                   ("Start (%s) and end (%s) are not in the range %s to %s"
00467                    % (start, end, 0, len(self.alignment._records[0].seq)))
00468         # determine random expected frequencies, if necessary
00469         random_expected = None
00470         if not e_freq_table:
00471             #TODO - What about ambiguous alphabets?
00472             base_alpha = Alphabet._get_base_alphabet(self.alignment._alphabet)
00473             if isinstance(base_alpha, Alphabet.ProteinAlphabet):
00474                 random_expected = Protein20Random
00475             elif isinstance(base_alpha, Alphabet.NucleotideAlphabet):
00476                 random_expected = Nucleotide4Random
00477             else:
00478                 errstr = "Error in alphabet: not Nucleotide or Protein, "
00479                 errstr += "supply expected frequencies"
00480                 raise ValueError(errstr)
00481             del base_alpha
00482         elif not isinstance(e_freq_table, FreqTable.FreqTable):
00483             raise ValueError("e_freq_table should be a FreqTable object")
00484             
00485 
00486         # determine all of the letters we have to deal with
00487         all_letters = self._get_all_letters()
00488         for char in chars_to_ignore:
00489             all_letters = all_letters.replace(char, '')
00490 
00491         info_content = {}
00492         for residue_num in range(start, end):
00493             freq_dict = self._get_letter_freqs(residue_num,
00494                                                self.alignment._records,
00495                                                all_letters, chars_to_ignore)
00496             # print freq_dict,
00497             column_score = self._get_column_info_content(freq_dict,
00498                                                          e_freq_table,
00499                                                          log_base,
00500                                                          random_expected)
00501 
00502             info_content[residue_num] = column_score
00503         # sum up the score
00504         total_info = sum(info_content.itervalues())
00505         # fill in the ic_vector member: holds IC for each column
00506         for i in info_content:
00507             self.ic_vector[i] = info_content[i]
00508         return total_info
00509 
00510     def _get_letter_freqs(self, residue_num, all_records, letters, to_ignore):
00511         """Determine the frequency of specific letters in the alignment.
00512 
00513         Arguments:
00514         o residue_num - The number of the column we are getting frequencies
00515         from.
00516         o all_records - All of the SeqRecords in the alignment.
00517         o letters - The letters we are interested in getting the frequency
00518         for.
00519         o to_ignore - Letters we are specifically supposed to ignore.
00520 
00521         This will calculate the frequencies of each of the specified letters
00522         in the alignment at the given frequency, and return this as a
00523         dictionary where the keys are the letters and the values are the
00524         frequencies.
00525         """
00526         freq_info = self._get_base_letters(letters)
00527 
00528         total_count = 0
00529         # collect the count info into the dictionary for all the records
00530         for record in all_records:
00531             try:
00532                 if record.seq[residue_num] not in to_ignore:
00533                     weight = record.annotations.get('weight',1.0)
00534                     freq_info[record.seq[residue_num]] += weight
00535                     total_count += weight
00536             # getting a key error means we've got a problem with the alphabet 
00537             except KeyError:
00538                 raise ValueError("Residue %s not found in alphabet %s"
00539                                  % (record.seq[residue_num],
00540                                     self.alignment._alphabet))
00541 
00542         if total_count == 0:
00543             # This column must be entirely ignored characters
00544             for letter in freq_info:
00545                 assert freq_info[letter] == 0
00546                 #TODO - Map this to NA or NaN?
00547         else:
00548             # now convert the counts into frequencies
00549             for letter in freq_info:
00550                 freq_info[letter] = freq_info[letter] / total_count
00551 
00552         return freq_info
00553             
00554     def _get_column_info_content(self, obs_freq, e_freq_table, log_base,
00555                                  random_expected):
00556         """Calculate the information content for a column.
00557 
00558         Arguments:
00559         o obs_freq - The frequencies observed for each letter in the column.
00560         o e_freq_table - An optional argument specifying the expected
00561         frequencies for each letter. This is a SubsMat.FreqTable instance.
00562         o log_base - The base of the logathrim to use in calculating the
00563         info content.
00564         """
00565         try:
00566             gap_char = self.alignment._alphabet.gap_char
00567         except AttributeError:
00568             #The alphabet doesn't declare a gap - there could be none
00569             #in the sequence... or just a vague alphabet.
00570             gap_char = "-" #Safe?
00571             
00572         if e_freq_table:
00573             if not isinstance(e_freq_table, FreqTable.FreqTable):
00574                 raise ValueError("e_freq_table should be a FreqTable object")
00575             # check the expected freq information to make sure it is good
00576             for key in obs_freq:
00577                 if (key != gap_char and key not in e_freq_table):
00578                     raise ValueError("Expected frequency letters %s "
00579                                      "do not match observed %s" \
00580                                      % (e_freq_table.keys(),
00581                                         obs_freq.keys() - [gap_char]))
00582         
00583         total_info = 0.0
00584 
00585         for letter in obs_freq:
00586             inner_log = 0.0
00587             # if we have expected frequencies, modify the log value by them
00588             # gap characters do not have expected frequencies, so they
00589             # should just be the observed frequency.
00590             if letter != gap_char:
00591                 if e_freq_table:
00592                     inner_log = obs_freq[letter] / e_freq_table[letter]
00593                 else:
00594                     inner_log = obs_freq[letter] / random_expected
00595             # if the observed frequency is zero, we don't add any info to the
00596             # total information content
00597             if inner_log > 0:
00598                 letter_info = (obs_freq[letter] * 
00599                                math.log(inner_log) / math.log(log_base))
00600                 total_info += letter_info
00601         return total_info 
00602 
00603     def get_column(self,col):
00604         return self.alignment.get_column(col)
00605 
00606 class PSSM(object):
00607     """Represent a position specific score matrix.
00608 
00609     This class is meant to make it easy to access the info within a PSSM
00610     and also make it easy to print out the information in a nice table.
00611 
00612     Let's say you had an alignment like this:
00613     GTATC
00614     AT--C
00615     CTGTC
00616 
00617     The position specific score matrix (when printed) looks like:
00618 
00619       G A T C
00620     G 1 1 0 1
00621     T 0 0 3 0
00622     A 1 1 0 0
00623     T 0 0 2 0
00624     C 0 0 0 3
00625 
00626     You can access a single element of the PSSM using the following:
00627 
00628     your_pssm[sequence_number][residue_count_name]
00629 
00630     For instance, to get the 'T' residue for the second element in the
00631     above alignment you would need to do:
00632 
00633     your_pssm[1]['T']
00634     """
00635     def __init__(self, pssm):
00636         """Initialize with pssm data to represent.
00637 
00638         The pssm passed should be a list with the following structure:
00639 
00640         list[0] - The letter of the residue being represented (for instance,
00641         from the example above, the first few list[0]s would be GTAT...
00642         list[1] - A dictionary with the letter substitutions and counts.
00643         """
00644         self.pssm = pssm
00645 
00646     def __getitem__(self, pos):
00647         return self.pssm[pos][1]
00648 
00649     def __str__(self):
00650         out = " "
00651         all_residues = self.pssm[0][1].keys()
00652         all_residues.sort()
00653         
00654         # first print out the top header
00655         for res in all_residues:
00656             out += "   %s" % res
00657         out += "\n"
00658 
00659         # for each item, write out the substitutions
00660         for item in self.pssm:
00661             out += "%s " % item[0]
00662             for res in all_residues:
00663                 out += " %.1f" % item[1][res]
00664 
00665             out += "\n"
00666         return out
00667 
00668     def get_residue(self, pos):
00669         """Return the residue letter at the specified position.
00670         """
00671         return self.pssm[pos][0]
00672 
00673 
00674 def print_info_content(summary_info,fout=None,rep_record=0):
00675     """ Three column output: position, aa in representative sequence,
00676         ic_vector value"""
00677     fout = fout or sys.stdout
00678     if not summary_info.ic_vector:
00679         summary_info.information_content()
00680     rep_sequence = summary_info.alignment._records[rep_record].seq
00681     positions = summary_info.ic_vector.keys()
00682     positions.sort()
00683     for pos in positions:
00684         fout.write("%d %s %.3f\n" % (pos, rep_sequence[pos],
00685                    summary_info.ic_vector[pos]))
00686 
00687 if __name__ == "__main__":
00688     print "Quick test"
00689     from Bio import AlignIO
00690     from Bio.Align.Generic import Alignment
00691 
00692     filename = "../../Tests/GFF/multi.fna"
00693     format = "fasta"
00694     expected = FreqTable.FreqTable({"A":0.25,"G":0.25,"T":0.25,"C":0.25},
00695                                    FreqTable.FREQ,
00696                                    IUPAC.unambiguous_dna)
00697 
00698     alignment = AlignIO.read(open(filename), format)
00699     for record in alignment:
00700         print record.seq.tostring()
00701     print "="*alignment.get_alignment_length()
00702     
00703     summary = SummaryInfo(alignment)
00704     consensus = summary.dumb_consensus(ambiguous="N")
00705     print consensus
00706     consensus = summary.gap_consensus(ambiguous="N")
00707     print consensus
00708     print
00709     print summary.pos_specific_score_matrix(chars_to_ignore=['-'],
00710                                             axis_seq=consensus)
00711     print
00712     #Have a generic alphabet, without a declared gap char, so must tell
00713     #provide the frequencies and chars to ignore explicitly.
00714     print summary.information_content(e_freq_table=expected,
00715                                       chars_to_ignore=['-'])
00716     print
00717     print "Trying a protein sequence with gaps and stops"
00718 
00719     alpha = Alphabet.HasStopCodon(Alphabet.Gapped(Alphabet.generic_protein, "-"), "*")
00720     a = Alignment(alpha)
00721     a.add_sequence("ID001", "MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-")
00722     a.add_sequence("ID002", "MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*")
00723     a.add_sequence("ID003", "MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*")
00724     print a
00725     print "="*a.get_alignment_length()
00726 
00727     s = SummaryInfo(a)
00728     c = s.dumb_consensus(ambiguous="X")
00729     print c
00730     c = s.gap_consensus(ambiguous="X")
00731     print c
00732     print
00733     print s.pos_specific_score_matrix(chars_to_ignore=['-', '*'], axis_seq=c)
00734 
00735     print s.information_content(chars_to_ignore=['-', '*'])
00736 
00737     
00738     print "Done"