Back to index

python-biopython  1.60
Generic.py
Go to the documentation of this file.
00001 # Copyright 2000-2004 Brad Chapman.
00002 # Copyright 2001 Iddo Friedberg.
00003 # Copyright 2007-2010 by Peter Cock.
00004 # All rights reserved.
00005 # This code is part of the Biopython distribution and governed by its
00006 # license.  Please see the LICENSE file that should have been included
00007 # as part of this package.
00008 """
00009 Contains classes to deal with generic sequence alignment stuff not
00010 specific to a particular program or format.
00011 
00012 Classes:
00013  - Alignment
00014 """
00015 __docformat__ = "epytext en" #Don't just use plain text in epydoc API pages!
00016 
00017 # biopython
00018 from Bio.Seq import Seq
00019 from Bio.SeqRecord import SeqRecord
00020 from Bio import Alphabet
00021 
00022 class Alignment(object):
00023     """Represent a set of alignments (DEPRECATED).
00024 
00025     This is a base class to represent alignments, which can be subclassed
00026     to deal with an alignment in a specific format.
00027 
00028     With the introduction of the MultipleSeqAlignment class in Bio.Align,
00029     this base class is deprecated and is likely to be removed in future
00030     releases of Biopython.
00031     """
00032     def __init__(self, alphabet):
00033         """Initialize a new Alignment object.
00034 
00035         Arguments:
00036          - alphabet - The alphabet to use for the sequence objects that are
00037                       created. This alphabet must be a gapped type.
00038 
00039         e.g.
00040 
00041         >>> from Bio.Alphabet import IUPAC, Gapped
00042         >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
00043         >>> align.add_sequence("Alpha", "ACTGCTAGCTAG")
00044         >>> align.add_sequence("Beta",  "ACT-CTAGCTAG")
00045         >>> align.add_sequence("Gamma", "ACTGCTAGATAG")
00046         >>> print align
00047         Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns
00048         ACTGCTAGCTAG Alpha
00049         ACT-CTAGCTAG Beta
00050         ACTGCTAGATAG Gamma
00051         """
00052         import warnings
00053         import Bio
00054         warnings.warn("With the introduction of the MultipleSeqAlignment class in Bio.Align, this base class is deprecated and is likely to be removed in a future release of Biopython.", Bio.BiopythonDeprecationWarning)
00055         if not (isinstance(alphabet, Alphabet.Alphabet) \
00056         or isinstance(alphabet, Alphabet.AlphabetEncoder)):
00057             raise ValueError("Invalid alphabet argument")
00058         self._alphabet = alphabet
00059         # hold everything at a list of SeqRecord objects
00060         self._records = []
00061 
00062     def _str_line(self, record):
00063         """Returns a truncated string representation of a SeqRecord (PRIVATE).
00064 
00065         This is a PRIVATE function used by the __str__ method.
00066         """
00067         if len(record.seq) <= 50:
00068             return "%s %s" % (record.seq, record.id)
00069         else:
00070             return "%s...%s %s" \
00071                    % (record.seq[:44], record.seq[-3:], record.id)
00072 
00073     def __str__(self):
00074         """Returns a multi-line string summary of the alignment.
00075 
00076         This output is intended to be readable, but large alignments are
00077         shown truncated.  A maximum of 20 rows (sequences) and 50 columns
00078         are shown, with the record identifiers.  This should fit nicely on a
00079         single screen.  e.g.
00080 
00081         >>> from Bio.Alphabet import IUPAC, Gapped
00082         >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
00083         >>> align.add_sequence("Alpha", "ACTGCTAGCTAG")
00084         >>> align.add_sequence("Beta",  "ACT-CTAGCTAG")
00085         >>> align.add_sequence("Gamma", "ACTGCTAGATAG")
00086         >>> print align
00087         Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns
00088         ACTGCTAGCTAG Alpha
00089         ACT-CTAGCTAG Beta
00090         ACTGCTAGATAG Gamma
00091 
00092         See also the alignment's format method.
00093         """
00094         rows = len(self._records)
00095         lines = ["%s alignment with %i rows and %i columns" \
00096                  % (str(self._alphabet), rows, self.get_alignment_length())]
00097         if rows <= 20:
00098             lines.extend([self._str_line(rec) for rec in self._records])
00099         else:
00100             lines.extend([self._str_line(rec) for rec in self._records[:18]])
00101             lines.append("...")
00102             lines.append(self._str_line(self._records[-1]))
00103         return "\n".join(lines)
00104 
00105     def __repr__(self):
00106         """Returns a representation of the object for debugging.
00107 
00108         The representation cannot be used with eval() to recreate the object,
00109         which is usually possible with simple python ojects.  For example:
00110 
00111         <Bio.Align.Generic.Alignment instance (2 records of length 14,
00112         SingleLetterAlphabet()) at a3c184c>
00113 
00114         The hex string is the memory address of the object, see help(id).
00115         This provides a simple way to visually distinguish alignments of
00116         the same size.
00117         """
00118         #A doctest for __repr__ would be nice, but __class__ comes out differently
00119         #if run via the __main__ trick.
00120         return "<%s instance (%i records of length %i, %s) at %x>" % \
00121                (self.__class__, len(self._records),
00122                 self.get_alignment_length(), repr(self._alphabet), id(self))
00123         #This version is useful for doing eval(repr(alignment)),
00124         #but it can be VERY long:
00125         #return "%s(%s, %s)" \
00126         #       % (self.__class__, repr(self._records), repr(self._alphabet))
00127 
00128     def format(self, format):
00129         """Returns the alignment as a string in the specified file format.
00130 
00131         The format should be a lower case string supported as an output
00132         format by Bio.AlignIO (such as "fasta", "clustal", "phylip",
00133         "stockholm", etc), which is used to turn the alignment into a
00134         string.
00135 
00136         e.g.
00137 
00138         >>> from Bio.Alphabet import IUPAC, Gapped
00139         >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
00140         >>> align.add_sequence("Alpha", "ACTGCTAGCTAG")
00141         >>> align.add_sequence("Beta",  "ACT-CTAGCTAG")
00142         >>> align.add_sequence("Gamma", "ACTGCTAGATAG")
00143         >>> print align.format("fasta")
00144         >Alpha
00145         ACTGCTAGCTAG
00146         >Beta
00147         ACT-CTAGCTAG
00148         >Gamma
00149         ACTGCTAGATAG
00150         <BLANKLINE>
00151         >>> print align.format("phylip")
00152          3 12
00153         Alpha      ACTGCTAGCT AG
00154         Beta       ACT-CTAGCT AG
00155         Gamma      ACTGCTAGAT AG
00156         <BLANKLINE>
00157 
00158         For Python 2.6, 3.0 or later see also the built in format() function.
00159         """
00160         #See also the __format__ added for Python 2.6 / 3.0, PEP 3101
00161         #See also the SeqRecord class and its format() method using Bio.SeqIO
00162         return self.__format__(format)
00163 
00164 
00165     def __format__(self, format_spec):
00166         """Returns the alignment as a string in the specified file format.
00167 
00168         This method supports the python format() function added in
00169         Python 2.6/3.0.  The format_spec should be a lower case
00170         string supported by Bio.AlignIO as an output file format.
00171         See also the alignment's format() method."""
00172         if format_spec:
00173             from StringIO import StringIO
00174             from Bio import AlignIO
00175             handle = StringIO()
00176             AlignIO.write([self], handle, format_spec)
00177             return handle.getvalue()
00178         else:
00179             #Follow python convention and default to using __str__
00180             return str(self)    
00181 
00182     def get_all_seqs(self):
00183         """Return all of the sequences involved in the alignment (DEPRECATED).
00184 
00185         The return value is a list of SeqRecord objects.
00186 
00187         This method is deprecated, as the Alignment object itself now offers
00188         much of the functionality of a list of SeqRecord objects (e.g.
00189         iteration or slicing to create a sub-alignment). Instead use the
00190         Python builtin function list, i.e. my_list = list(my_align)
00191         """
00192         import warnings
00193         import Bio
00194         warnings.warn("This method is deprecated, since the alignment object"
00195                       "now acts more like a list. Instead of calling "
00196                       "align.get_all_seqs() you can use list(align)",
00197                       Bio.BiopythonDeprecationWarning)
00198         return self._records
00199 
00200     def __iter__(self):
00201         """Iterate over alignment rows as SeqRecord objects.
00202 
00203         e.g.
00204 
00205         >>> from Bio.Alphabet import IUPAC, Gapped
00206         >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
00207         >>> align.add_sequence("Alpha", "ACTGCTAGCTAG")
00208         >>> align.add_sequence("Beta",  "ACT-CTAGCTAG")
00209         >>> align.add_sequence("Gamma", "ACTGCTAGATAG")
00210         >>> for record in align:
00211         ...    print record.id
00212         ...    print record.seq
00213         Alpha
00214         ACTGCTAGCTAG
00215         Beta
00216         ACT-CTAGCTAG
00217         Gamma
00218         ACTGCTAGATAG
00219         """
00220         return iter(self._records) 
00221 
00222     def get_seq_by_num(self, number):
00223         """Retrieve a sequence by row number (DEPRECATED).
00224 
00225         Returns:
00226          - A Seq object for the requested sequence.
00227 
00228         Raises:
00229          - IndexError - If the specified number is out of range.
00230 
00231         NOTE: This is a legacy method.  In new code where you need to access
00232         the rows of the alignment (i.e. the sequences) consider iterating
00233         over them or accessing them as SeqRecord objects.
00234         """
00235         import warnings
00236         import Bio
00237         warnings.warn("This is a legacy method and is likely to be removed in a future release of Biopython. In new code where you need to access the rows of the alignment (i.e. the sequences) consider iterating over them or accessing them as SeqRecord objects.", Bio.BiopythonDeprecationWarning)
00238         return self._records[number].seq
00239 
00240     def __len__(self):
00241         """Returns the number of sequences in the alignment.
00242 
00243         Use len(alignment) to get the number of sequences (i.e. the number of
00244         rows), and alignment.get_alignment_length() to get the length of the
00245         longest sequence (i.e. the number of columns).
00246 
00247         This is easy to remember if you think of the alignment as being like a
00248         list of SeqRecord objects.
00249         """
00250         return len(self._records)
00251     
00252     def get_alignment_length(self):
00253         """Return the maximum length of the alignment.
00254 
00255         All objects in the alignment should (hopefully) have the same
00256         length. This function will go through and find this length
00257         by finding the maximum length of sequences in the alignment.
00258 
00259         >>> from Bio.Alphabet import IUPAC, Gapped
00260         >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
00261         >>> align.add_sequence("Alpha", "ACTGCTAGCTAG")
00262         >>> align.add_sequence("Beta",  "ACT-CTAGCTAG")
00263         >>> align.add_sequence("Gamma", "ACTGCTAGATAG")
00264         >>> align.get_alignment_length()
00265         12
00266 
00267         If you want to know the number of sequences in the alignment,
00268         use len(align) instead:
00269 
00270         >>> len(align)
00271         3
00272         
00273         """
00274         max_length = 0
00275 
00276         for record in self._records:
00277             if len(record.seq) > max_length:
00278                 max_length = len(record.seq)
00279 
00280         return max_length
00281 
00282     def add_sequence(self, descriptor, sequence, start = None, end = None,
00283                      weight = 1.0):
00284         """Add a sequence to the alignment.
00285 
00286         This doesn't do any kind of alignment, it just adds in the sequence
00287         object, which is assumed to be prealigned with the existing
00288         sequences.
00289 
00290         Arguments:
00291          - descriptor - The descriptive id of the sequence being added.
00292                        This will be used as the resulting SeqRecord's
00293                        .id property (and, for historical compatibility,
00294                        also the .description property)
00295          - sequence - A string with sequence info.
00296          - start - You can explicitly set the start point of the sequence.
00297                    This is useful (at least) for BLAST alignments, which can
00298                    just be partial alignments of sequences.
00299          - end - Specify the end of the sequence, which is important
00300                  for the same reason as the start.
00301          - weight - The weight to place on the sequence in the alignment.
00302                     By default, all sequences have the same weight. (0.0 =>
00303                     no weight, 1.0 => highest weight)
00304         """
00305         new_seq = Seq(sequence, self._alphabet)
00306 
00307         #We are now effectively using the SeqRecord's .id as
00308         #the primary identifier (e.g. in Bio.SeqIO) so we should
00309         #populate it with the descriptor.
00310         #For backwards compatibility, also store this in the
00311         #SeqRecord's description property.
00312         new_record = SeqRecord(new_seq,
00313                                id = descriptor,
00314                                description = descriptor)
00315 
00316         # hack! We really need to work out how to deal with annotations
00317         # and features in biopython. Right now, I'll just use the
00318         # generic annotations dictionary we've got to store the start
00319         # and end, but we should think up something better. I don't know
00320         # if I'm really a big fan of the LocatableSeq thing they've got
00321         # in BioPerl, but I'm not positive what the best thing to do on
00322         # this is...
00323         if start:
00324             new_record.annotations['start'] = start
00325         if end:
00326             new_record.annotations['end'] = end
00327 
00328         # another hack to add weight information to the sequence
00329         new_record.annotations['weight'] = weight
00330 
00331         self._records.append(new_record)
00332         
00333     def get_column(self,col):
00334         """Returns a string containing a given column.
00335 
00336         e.g.
00337 
00338         >>> from Bio.Alphabet import IUPAC, Gapped
00339         >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
00340         >>> align.add_sequence("Alpha", "ACTGCTAGCTAG")
00341         >>> align.add_sequence("Beta",  "ACT-CTAGCTAG")
00342         >>> align.add_sequence("Gamma", "ACTGCTAGATAG")
00343         >>> align.get_column(0)
00344         'AAA'
00345         >>> align.get_column(3)
00346         'G-G'
00347         """
00348         #TODO - Support negative indices?
00349         col_str = ''
00350         assert col >= 0 and col <= self.get_alignment_length()
00351         for rec in self._records:
00352             col_str += rec.seq[col]
00353         return col_str
00354 
00355     def __getitem__(self, index):
00356         """Access part of the alignment.
00357 
00358         We'll use the following example alignment here for illustration:
00359 
00360         >>> from Bio.Alphabet import IUPAC, Gapped
00361         >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
00362         >>> align.add_sequence("Alpha",  "ACTGCTAGCTAG")
00363         >>> align.add_sequence("Beta",   "ACT-CTAGCTAG")
00364         >>> align.add_sequence("Gamma",  "ACTGCTAGATAG")
00365         >>> align.add_sequence("Delta",  "ACTGCTTGCTAG")
00366         >>> align.add_sequence("Epsilon","ACTGCTTGATAG")
00367         
00368         You can access a row of the alignment as a SeqRecord using an integer
00369         index (think of the alignment as a list of SeqRecord objects here):
00370 
00371         >>> first_record = align[0]
00372         >>> print first_record.id, first_record.seq
00373         Alpha ACTGCTAGCTAG
00374         >>> last_record = align[-1]
00375         >>> print last_record.id, last_record.seq
00376         Epsilon ACTGCTTGATAG
00377 
00378         You can also access use python's slice notation to create a sub-alignment
00379         containing only some of the SeqRecord objects:
00380 
00381         >>> sub_alignment = align[2:5]
00382         >>> print sub_alignment
00383         Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns
00384         ACTGCTAGATAG Gamma
00385         ACTGCTTGCTAG Delta
00386         ACTGCTTGATAG Epsilon
00387 
00388         This includes support for a step, i.e. align[start:end:step], which
00389         can be used to select every second sequence:
00390 
00391         >>> sub_alignment = align[::2]
00392         >>> print sub_alignment
00393         Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns
00394         ACTGCTAGCTAG Alpha
00395         ACTGCTAGATAG Gamma
00396         ACTGCTTGATAG Epsilon
00397 
00398         Or to get a copy of the alignment with the rows in reverse order:
00399 
00400         >>> rev_alignment = align[::-1]
00401         >>> print rev_alignment
00402         Gapped(IUPACUnambiguousDNA(), '-') alignment with 5 rows and 12 columns
00403         ACTGCTTGATAG Epsilon
00404         ACTGCTTGCTAG Delta
00405         ACTGCTAGATAG Gamma
00406         ACT-CTAGCTAG Beta
00407         ACTGCTAGCTAG Alpha
00408 
00409         Right now, these are the ONLY indexing operations supported.  The use of
00410         a second column based index is under discussion for a future update.
00411         """
00412         if isinstance(index, int):
00413             #e.g. result = align[x]
00414             #Return a SeqRecord
00415             return self._records[index]
00416         elif isinstance(index, slice):
00417             #e.g. sub_aling = align[i:j:k]
00418             #Return a new Alignment using only the specified records.
00419             #TODO - See Bug 2554 for changing the __init__ method
00420             #to allow us to do this more cleanly.
00421             sub_align = Alignment(self._alphabet)
00422             sub_align._records = self._records[index]
00423             return sub_align
00424         elif len(index)==2:
00425             raise TypeError("Row and Column indexing is not currently supported,"\
00426                             +"but may be in future.")
00427         else:
00428             raise TypeError("Invalid index type.")
00429 
00430 def _test():
00431     """Run the Bio.Align.Generic module's doctests."""
00432     print "Running doctests..."
00433     import doctest
00434     doctest.testmod()
00435     print "Done"
00436 
00437 if __name__ == "__main__":
00438     _test()