Back to index

python-biopython  1.60
__init__.py
Go to the documentation of this file.
00001 # Copyright 2008-2011 by Peter Cock.
00002 # All rights reserved.
00003 # This code is part of the Biopython distribution and governed by its
00004 # license.  Please see the LICENSE file that should have been included
00005 # as part of this package.
00006 """Code for dealing with sequence alignments.
00007 
00008 One of the most important things in this module is the MultipleSeqAlignment
00009 class, used in the Bio.AlignIO module.
00010 
00011 """
00012 __docformat__ = "epytext en" #Don't just use plain text in epydoc API pages!
00013 
00014 from Bio.Seq import Seq
00015 from Bio.SeqRecord import SeqRecord
00016 from Bio import Alphabet
00017 
00018 #We only import this and subclass it for some limited backward compatibilty.
00019 from Bio.Align.Generic import Alignment as _Alignment
00020 class MultipleSeqAlignment(_Alignment):
00021     """Represents a classical multiple sequence alignment (MSA).
00022 
00023     By this we mean a collection of sequences (usually shown as rows) which
00024     are all the same length (usually with gap characters for insertions or
00025     padding). The data can then be regarded as a matrix of letters, with well
00026     defined columns.
00027 
00028     You would typically create an MSA by loading an alignment file with the
00029     AlignIO module:
00030 
00031     >>> from Bio import AlignIO
00032     >>> align = AlignIO.read("Clustalw/opuntia.aln", "clustal")
00033     >>> print align
00034     SingleLetterAlphabet() alignment with 7 rows and 156 columns
00035     TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273285|gb|AF191659.1|AF191
00036     TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273284|gb|AF191658.1|AF191
00037     TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273287|gb|AF191661.1|AF191
00038     TATACATAAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273286|gb|AF191660.1|AF191
00039     TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273290|gb|AF191664.1|AF191
00040     TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273289|gb|AF191663.1|AF191
00041     TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273291|gb|AF191665.1|AF191
00042 
00043     In some respects you can treat these objects as lists of SeqRecord objects,
00044     each representing a row of the alignment. Iterating over an alignment gives
00045     the SeqRecord object for each row:
00046 
00047     >>> len(align)
00048     7
00049     >>> for record in align:
00050     ...     print record.id, len(record)
00051     gi|6273285|gb|AF191659.1|AF191 156
00052     gi|6273284|gb|AF191658.1|AF191 156
00053     gi|6273287|gb|AF191661.1|AF191 156
00054     gi|6273286|gb|AF191660.1|AF191 156
00055     gi|6273290|gb|AF191664.1|AF191 156
00056     gi|6273289|gb|AF191663.1|AF191 156
00057     gi|6273291|gb|AF191665.1|AF191 156
00058 
00059     You can also access individual rows as SeqRecord objects via their index:
00060 
00061     >>> print align[0].id
00062     gi|6273285|gb|AF191659.1|AF191
00063     >>> print align[-1].id
00064     gi|6273291|gb|AF191665.1|AF191
00065 
00066     And extract columns as strings:
00067 
00068     >>> print align[:,1]
00069     AAAAAAA
00070 
00071     Or, take just the first ten columns as a sub-alignment:
00072 
00073     >>> print align[:,:10]
00074     SingleLetterAlphabet() alignment with 7 rows and 10 columns
00075     TATACATTAA gi|6273285|gb|AF191659.1|AF191
00076     TATACATTAA gi|6273284|gb|AF191658.1|AF191
00077     TATACATTAA gi|6273287|gb|AF191661.1|AF191
00078     TATACATAAA gi|6273286|gb|AF191660.1|AF191
00079     TATACATTAA gi|6273290|gb|AF191664.1|AF191
00080     TATACATTAA gi|6273289|gb|AF191663.1|AF191
00081     TATACATTAA gi|6273291|gb|AF191665.1|AF191
00082     
00083     Combining this alignment slicing with alignment addition allows you to
00084     remove a section of the alignment. For example, taking just the first
00085     and last ten columns:
00086 
00087     >>> print align[:,:10] + align[:,-10:]
00088     SingleLetterAlphabet() alignment with 7 rows and 20 columns
00089     TATACATTAAGTGTACCAGA gi|6273285|gb|AF191659.1|AF191
00090     TATACATTAAGTGTACCAGA gi|6273284|gb|AF191658.1|AF191
00091     TATACATTAAGTGTACCAGA gi|6273287|gb|AF191661.1|AF191
00092     TATACATAAAGTGTACCAGA gi|6273286|gb|AF191660.1|AF191
00093     TATACATTAAGTGTACCAGA gi|6273290|gb|AF191664.1|AF191
00094     TATACATTAAGTATACCAGA gi|6273289|gb|AF191663.1|AF191
00095     TATACATTAAGTGTACCAGA gi|6273291|gb|AF191665.1|AF191
00096     
00097     Note - This object is intended to replace the existing Alignment object
00098     defined in module Bio.Align.Generic but is not fully backwards compatible
00099     with it.
00100 
00101     Note - This object does NOT attempt to model the kind of alignments used
00102     in next generation sequencing with multiple sequencing reads which are
00103     much shorter than the alignment, and where there is usually a consensus or
00104     reference sequence with special status.
00105     """
00106 
00107     def __init__(self, records, alphabet=None):
00108         """Initialize a new MultipleSeqAlignment object.
00109 
00110         Arguments:
00111          - records - A list (or iterator) of SeqRecord objects, whose
00112                      sequences are all the same length.  This may be an be an
00113                      empty list.
00114          - alphabet - The alphabet for the whole alignment, typically a gapped
00115                       alphabet, which should be a super-set of the individual
00116                       record alphabets.  If omitted, a consensus alphabet is
00117                       used.
00118 
00119         You would normally load a MSA from a file using Bio.AlignIO, but you
00120         can do this from a list of SeqRecord objects too:
00121 
00122         >>> from Bio.Alphabet import generic_dna
00123         >>> from Bio.Seq import Seq
00124         >>> from Bio.SeqRecord import SeqRecord
00125         >>> a = SeqRecord(Seq("AAAACGT", generic_dna), id="Alpha")
00126         >>> b = SeqRecord(Seq("AAA-CGT", generic_dna), id="Beta")
00127         >>> c = SeqRecord(Seq("AAAAGGT", generic_dna), id="Gamma")
00128         >>> align = MultipleSeqAlignment([a, b, c])
00129         >>> print align
00130         DNAAlphabet() alignment with 3 rows and 7 columns
00131         AAAACGT Alpha
00132         AAA-CGT Beta
00133         AAAAGGT Gamma
00134 
00135         NOTE - The older Bio.Align.Generic.Alignment class only accepted a
00136         single argument, an alphabet.  This is still supported via a backwards
00137         compatible "hack" so as not to disrupt existing scripts and users, but
00138         is deprecated and will be removed in a future release.
00139         """
00140         if isinstance(records, Alphabet.Alphabet) \
00141         or isinstance(records, Alphabet.AlphabetEncoder):
00142             if alphabet is None:
00143                 #TODO - Remove this backwards compatible mode!                
00144                 alphabet = records
00145                 records = []
00146                 import warnings
00147                 warnings.warn("Invalid records argument: While the old "
00148                               "Bio.Align.Generic.Alignment class only "
00149                               "accepted a single argument (the alphabet), the "
00150                               "newer Bio.Align.MultipleSeqAlignment class "
00151                               "expects a list/iterator of SeqRecord objects "
00152                               "(which can be an empty list) and an optional "
00153                               "alphabet argument")
00154             else :
00155                 raise ValueError("Invalid records argument")
00156         if alphabet is not None :
00157             if not (isinstance(alphabet, Alphabet.Alphabet) \
00158             or isinstance(alphabet, Alphabet.AlphabetEncoder)):
00159                 raise ValueError("Invalid alphabet argument")
00160             self._alphabet = alphabet
00161         else :
00162             #Default while we add sequences, will take a consensus later
00163             self._alphabet = Alphabet.single_letter_alphabet
00164 
00165         self._records = []
00166         if records:
00167             self.extend(records)
00168             if alphabet is None:
00169                 #No alphabet was given, take a consensus alphabet
00170                 self._alphabet = Alphabet._consensus_alphabet(rec.seq.alphabet for \
00171                                                               rec in self._records \
00172                                                               if rec.seq is not None)
00173 
00174     def extend(self, records):
00175         """Add more SeqRecord objects to the alignment as rows.
00176 
00177         They must all have the same length as the original alignment, and have
00178         alphabets compatible with the alignment's alphabet. For example,
00179 
00180         >>> from Bio.Alphabet import generic_dna
00181         >>> from Bio.Seq import Seq
00182         >>> from Bio.SeqRecord import SeqRecord
00183         >>> from Bio.Align import MultipleSeqAlignment
00184         >>> a = SeqRecord(Seq("AAAACGT", generic_dna), id="Alpha")
00185         >>> b = SeqRecord(Seq("AAA-CGT", generic_dna), id="Beta")
00186         >>> c = SeqRecord(Seq("AAAAGGT", generic_dna), id="Gamma")
00187         >>> d = SeqRecord(Seq("AAAACGT", generic_dna), id="Delta")
00188         >>> e = SeqRecord(Seq("AAA-GGT", generic_dna), id="Epsilon")
00189 
00190         First we create a small alignment (three rows):
00191 
00192         >>> align = MultipleSeqAlignment([a, b, c])
00193         >>> print align
00194         DNAAlphabet() alignment with 3 rows and 7 columns
00195         AAAACGT Alpha
00196         AAA-CGT Beta
00197         AAAAGGT Gamma
00198 
00199         Now we can extend this alignment with another two rows:
00200 
00201         >>> align.extend([d, e])
00202         >>> print align
00203         DNAAlphabet() alignment with 5 rows and 7 columns
00204         AAAACGT Alpha
00205         AAA-CGT Beta
00206         AAAAGGT Gamma
00207         AAAACGT Delta
00208         AAA-GGT Epsilon
00209 
00210         Because the alignment object allows iteration over the rows as
00211         SeqRecords, you can use the extend method with a second alignment
00212         (provided its sequences have the same length as the original alignment).
00213         """
00214         if len(self):
00215             #Use the standard method to get the length
00216             expected_length = self.get_alignment_length()
00217         else:
00218             #Take the first record's length
00219             records = iter(records) #records arg could be list or iterator
00220             try:
00221                 rec = records.next()
00222             except StopIteration:
00223                 #Special case, no records
00224                 return
00225             expected_length = len(rec)
00226             self._append(rec, expected_length)
00227             #Now continue to the rest of the records as usual
00228             
00229         for rec in records:
00230             self._append(rec, expected_length)
00231             
00232     def append(self, record):
00233         """Add one more SeqRecord object to the alignment as a new row.
00234 
00235         This must have the same length as the original alignment (unless this is
00236         the first record), and have an alphabet compatible with the alignment's
00237         alphabet.
00238 
00239         >>> from Bio import AlignIO
00240         >>> align = AlignIO.read("Clustalw/opuntia.aln", "clustal")
00241         >>> print align
00242         SingleLetterAlphabet() alignment with 7 rows and 156 columns
00243         TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273285|gb|AF191659.1|AF191
00244         TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273284|gb|AF191658.1|AF191
00245         TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273287|gb|AF191661.1|AF191
00246         TATACATAAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273286|gb|AF191660.1|AF191
00247         TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273290|gb|AF191664.1|AF191
00248         TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273289|gb|AF191663.1|AF191
00249         TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273291|gb|AF191665.1|AF191
00250         >>> len(align)
00251         7
00252 
00253         We'll now construct a dummy record to append as an example:
00254 
00255         >>> from Bio.Seq import Seq
00256         >>> from Bio.SeqRecord import SeqRecord
00257         >>> dummy = SeqRecord(Seq("N"*156), id="dummy")
00258 
00259         Now append this to the alignment,
00260 
00261         >>> align.append(dummy)
00262         >>> print align
00263         SingleLetterAlphabet() alignment with 8 rows and 156 columns
00264         TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273285|gb|AF191659.1|AF191
00265         TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273284|gb|AF191658.1|AF191
00266         TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273287|gb|AF191661.1|AF191
00267         TATACATAAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273286|gb|AF191660.1|AF191
00268         TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273290|gb|AF191664.1|AF191
00269         TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273289|gb|AF191663.1|AF191
00270         TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273291|gb|AF191665.1|AF191
00271         NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...NNN dummy
00272         >>> len(align)
00273         8
00274 
00275         """
00276         if self._records:
00277             self._append(record, self.get_alignment_length())
00278         else:
00279             self._append(record)
00280     
00281     def _append(self, record, expected_length=None):
00282         """Helper function (PRIVATE)."""
00283         if not isinstance(record, SeqRecord):
00284             raise TypeError("New sequence is not a SeqRecord object")
00285 
00286         #Currently the get_alignment_length() call is expensive, so we need
00287         #to avoid calling it repeatedly for __init__ and extend, hence this
00288         #private _append method
00289         if expected_length is not None and len(record) != expected_length:
00290             #TODO - Use the following more helpful error, but update unit tests
00291             #raise ValueError("New sequence is not of length %i" \
00292             #                 % self.get_alignment_length())
00293             raise ValueError("Sequences must all be the same length")
00294             
00295         #Using not self.alphabet.contains(record.seq.alphabet) needs fixing
00296         #for AlphabetEncoders (e.g. gapped versus ungapped).
00297         if not Alphabet._check_type_compatible([self._alphabet, record.seq.alphabet]):
00298             raise ValueError("New sequence's alphabet is incompatible")
00299         self._records.append(record)
00300 
00301     def __add__(self, other):
00302         """Combines to alignments with the same number of rows by adding them.
00303 
00304         If you have two multiple sequence alignments (MSAs), there are two ways to think
00305         about adding them - by row or by column. Using the extend method adds by row.
00306         Using the addition operator adds by column. For example,
00307 
00308         >>> from Bio.Alphabet import generic_dna
00309         >>> from Bio.Seq import Seq
00310         >>> from Bio.SeqRecord import SeqRecord
00311         >>> from Bio.Align import MultipleSeqAlignment
00312         >>> a1 = SeqRecord(Seq("AAAAC", generic_dna), id="Alpha")
00313         >>> b1 = SeqRecord(Seq("AAA-C", generic_dna), id="Beta")
00314         >>> c1 = SeqRecord(Seq("AAAAG", generic_dna), id="Gamma")
00315         >>> a2 = SeqRecord(Seq("GT", generic_dna), id="Alpha")
00316         >>> b2 = SeqRecord(Seq("GT", generic_dna), id="Beta")
00317         >>> c2 = SeqRecord(Seq("GT", generic_dna), id="Gamma")
00318         >>> left = MultipleSeqAlignment([a1, b1, c1])
00319         >>> right = MultipleSeqAlignment([a2, b2, c2])
00320 
00321         Now, let's look at these two alignments:
00322 
00323         >>> print left
00324         DNAAlphabet() alignment with 3 rows and 5 columns
00325         AAAAC Alpha
00326         AAA-C Beta
00327         AAAAG Gamma
00328         >>> print right
00329         DNAAlphabet() alignment with 3 rows and 2 columns
00330         GT Alpha
00331         GT Beta
00332         GT Gamma
00333 
00334         And add them:
00335 
00336         >>> print left + right
00337         DNAAlphabet() alignment with 3 rows and 7 columns
00338         AAAACGT Alpha
00339         AAA-CGT Beta
00340         AAAAGGT Gamma
00341 
00342         For this to work, both alignments must have the same number of records (here
00343         they both have 3 rows):
00344 
00345         >>> len(left)
00346         3
00347         >>> len(right)
00348         3
00349 
00350         The individual rows are SeqRecord objects, and these can be added together. Refer
00351         to the SeqRecord documentation for details of how the annotation is handled. This
00352         example is a special case in that both original alignments shared the same names,
00353         meaning when the rows are added they also get the same name.
00354         """
00355         if not isinstance(other, MultipleSeqAlignment):
00356             raise NotImplementedError
00357         if len(self) != len(other):
00358             raise ValueError("When adding two alignments they must have the same length"
00359                              " (i.e. same number or rows)")
00360         alpha = Alphabet._consensus_alphabet([self._alphabet, other._alphabet])
00361         merged = (left+right for left,right in zip(self, other))
00362         return MultipleSeqAlignment(merged, alpha)
00363 
00364     def __getitem__(self, index):
00365         """Access part of the alignment.
00366 
00367         Depending on the indices, you can get a SeqRecord object
00368         (representing a single row), a Seq object (for a single columns),
00369         a string (for a single characters) or another alignment
00370         (representing some part or all of the alignment).
00371 
00372         align[r,c] gives a single character as a string
00373         align[r] gives a row as a SeqRecord
00374         align[r,:] gives a row as a SeqRecord
00375         align[:,c] gives a column as a Seq (using the alignment's alphabet)
00376 
00377         align[:] and align[:,:] give a copy of the alignment
00378 
00379         Anything else gives a sub alignment, e.g.
00380         align[0:2] or align[0:2,:] uses only row 0 and 1
00381         align[:,1:3] uses only columns 1 and 2
00382         align[0:2,1:3] uses only rows 0 & 1 and only cols 1 & 2
00383 
00384         We'll use the following example alignment here for illustration:
00385 
00386         >>> from Bio.Alphabet import generic_dna
00387         >>> from Bio.Seq import Seq
00388         >>> from Bio.SeqRecord import SeqRecord
00389         >>> from Bio.Align import MultipleSeqAlignment
00390         >>> a = SeqRecord(Seq("AAAACGT", generic_dna), id="Alpha")
00391         >>> b = SeqRecord(Seq("AAA-CGT", generic_dna), id="Beta")
00392         >>> c = SeqRecord(Seq("AAAAGGT", generic_dna), id="Gamma")
00393         >>> d = SeqRecord(Seq("AAAACGT", generic_dna), id="Delta")
00394         >>> e = SeqRecord(Seq("AAA-GGT", generic_dna), id="Epsilon")
00395         >>> align = MultipleSeqAlignment([a, b, c, d, e], generic_dna)
00396         
00397         You can access a row of the alignment as a SeqRecord using an integer
00398         index (think of the alignment as a list of SeqRecord objects here):
00399 
00400         >>> first_record = align[0]
00401         >>> print first_record.id, first_record.seq
00402         Alpha AAAACGT
00403         >>> last_record = align[-1]
00404         >>> print last_record.id, last_record.seq
00405         Epsilon AAA-GGT
00406 
00407         You can also access use python's slice notation to create a sub-alignment
00408         containing only some of the SeqRecord objects:
00409 
00410         >>> sub_alignment = align[2:5]
00411         >>> print sub_alignment
00412         DNAAlphabet() alignment with 3 rows and 7 columns
00413         AAAAGGT Gamma
00414         AAAACGT Delta
00415         AAA-GGT Epsilon
00416 
00417         This includes support for a step, i.e. align[start:end:step], which
00418         can be used to select every second sequence:
00419 
00420         >>> sub_alignment = align[::2]
00421         >>> print sub_alignment
00422         DNAAlphabet() alignment with 3 rows and 7 columns
00423         AAAACGT Alpha
00424         AAAAGGT Gamma
00425         AAA-GGT Epsilon
00426 
00427         Or to get a copy of the alignment with the rows in reverse order:
00428 
00429         >>> rev_alignment = align[::-1]
00430         >>> print rev_alignment
00431         DNAAlphabet() alignment with 5 rows and 7 columns
00432         AAA-GGT Epsilon
00433         AAAACGT Delta
00434         AAAAGGT Gamma
00435         AAA-CGT Beta
00436         AAAACGT Alpha
00437     
00438         You can also use two indices to specify both rows and columns. Using simple
00439         integers gives you the entry as a single character string. e.g.
00440 
00441         >>> align[3,4]
00442         'C'
00443 
00444         This is equivalent to:
00445 
00446         >>> align[3][4]
00447         'C'
00448 
00449         or:
00450 
00451         >>> align[3].seq[4]
00452         'C'
00453 
00454         To get a single column (as a string) use this syntax:
00455 
00456         >>> align[:,4]
00457         'CCGCG'
00458 
00459         Or, to get part of a column,
00460 
00461         >>> align[1:3,4]
00462         'CG'
00463 
00464         However, in general you get a sub-alignment,
00465 
00466         >>> print align[1:5,3:6]
00467         DNAAlphabet() alignment with 4 rows and 3 columns
00468         -CG Beta
00469         AGG Gamma
00470         ACG Delta
00471         -GG Epsilon
00472 
00473         This should all seem familiar to anyone who has used the NumPy
00474         array or matrix objects.
00475         """
00476         if isinstance(index, int):
00477             #e.g. result = align[x]
00478             #Return a SeqRecord
00479             return self._records[index]
00480         elif isinstance(index, slice):
00481             #e.g. sub_align = align[i:j:k]
00482             return MultipleSeqAlignment(self._records[index], self._alphabet)
00483         elif len(index)!=2:
00484             raise TypeError("Invalid index type.")
00485 
00486         #Handle double indexing
00487         row_index, col_index = index
00488         if isinstance(row_index, int):
00489             #e.g. row_or_part_row = align[6, 1:4], gives a SeqRecord
00490             return self._records[row_index][col_index]
00491         elif isinstance(col_index, int):
00492             #e.g. col_or_part_col = align[1:5, 6], gives a string
00493             return "".join(rec[col_index] for rec in self._records[row_index])
00494         else:
00495             #e.g. sub_align = align[1:4, 5:7], gives another alignment
00496             return MultipleSeqAlignment((rec[col_index] for rec in self._records[row_index]),
00497                                         self._alphabet)
00498 
00499     def sort(self, key=None, reverse=False):
00500         """Sort the rows (SeqRecord objects) of the alignment in place.
00501 
00502         This sorts the rows alphabetically using the SeqRecord object id by
00503         default. The sorting can be controlled by supplying a key function
00504         which must map each SeqRecord to a sort value.
00505 
00506         This is useful if you want to add two alignments which use the same
00507         record identifiers, but in a different order. For example,
00508 
00509         >>> from Bio.Alphabet import generic_dna
00510         >>> from Bio.Seq import Seq
00511         >>> from Bio.SeqRecord import SeqRecord
00512         >>> from Bio.Align import MultipleSeqAlignment
00513         >>> align1 = MultipleSeqAlignment([
00514         ...              SeqRecord(Seq("ACGT", generic_dna), id="Human"),
00515         ...              SeqRecord(Seq("ACGG", generic_dna), id="Mouse"),
00516         ...              SeqRecord(Seq("ACGC", generic_dna), id="Chicken"),
00517         ...          ])
00518         >>> align2 = MultipleSeqAlignment([
00519         ...              SeqRecord(Seq("CGGT", generic_dna), id="Mouse"),
00520         ...              SeqRecord(Seq("CGTT", generic_dna), id="Human"),
00521         ...              SeqRecord(Seq("CGCT", generic_dna), id="Chicken"),
00522         ...          ])
00523 
00524         If you simple try and add these without sorting, you get this:
00525 
00526         >>> print align1 + align2
00527         DNAAlphabet() alignment with 3 rows and 8 columns
00528         ACGTCGGT <unknown id>
00529         ACGGCGTT <unknown id>
00530         ACGCCGCT Chicken
00531 
00532         Consult the SeqRecord documentation which explains why you get a
00533         default value when annotation like the identifier doesn't match up.
00534         However, if we sort the alignments first, then add them we get the
00535         desired result:
00536 
00537         >>> align1.sort()
00538         >>> align2.sort()
00539         >>> print align1 + align2
00540         DNAAlphabet() alignment with 3 rows and 8 columns
00541         ACGCCGCT Chicken
00542         ACGTCGTT Human
00543         ACGGCGGT Mouse
00544 
00545         As an example using a different sort order, you could sort on the
00546         GC content of each sequence.
00547 
00548         >>> from Bio.SeqUtils import GC
00549         >>> print align1
00550         DNAAlphabet() alignment with 3 rows and 4 columns
00551         ACGC Chicken
00552         ACGT Human
00553         ACGG Mouse
00554         >>> align1.sort(key = lambda record: GC(record.seq))
00555         >>> print align1
00556         DNAAlphabet() alignment with 3 rows and 4 columns
00557         ACGT Human
00558         ACGC Chicken
00559         ACGG Mouse
00560 
00561         There is also a reverse argument, so if you wanted to sort by ID
00562         but backwards:
00563 
00564         >>> align1.sort(reverse=True)
00565         >>> print align1
00566         DNAAlphabet() alignment with 3 rows and 4 columns
00567         ACGG Mouse
00568         ACGT Human
00569         ACGC Chicken
00570 
00571         """
00572         if key is None:
00573             self._records.sort(key = lambda r: r.id, reverse = reverse)
00574         else:
00575             self._records.sort(key = key, reverse = reverse)
00576 
00577     def get_column(self, col):
00578         """Returns a string containing a given column (DEPRECATED).
00579 
00580         This is a method provided for backwards compatibility with the old
00581         Bio.Align.Generic.Alignment object. Please use the slice notation
00582         instead, since get_column is likely to be removed in a future release
00583         of Biopython..
00584         """
00585         import warnings
00586         import Bio
00587         warnings.warn("This method is deprecated and is provided for backwards compatibility with the old Bio.Align.Generic.Alignment object. Please use the slice notation instead, as get_column is likely to be removed in a future release of Biopython.", Bio.BiopythonDeprecationWarning)
00588         return _Alignment.get_column(self, col)
00589 
00590     def add_sequence(self, descriptor, sequence, start = None, end = None,
00591                      weight = 1.0):
00592         """Add a sequence to the alignment (DEPRECATED).
00593 
00594         The start, end, and weight arguments are not supported! This method
00595         only provides limited backwards compatibility with the old
00596         Bio.Align.Generic.Alignment object. Please use the append method with
00597         a SeqRecord instead, since add_sequence is likely to be removed in a
00598         future release of Biopython.
00599         """
00600         import warnings
00601         import Bio
00602         warnings.warn("The start, end, and weight arguments are not supported! This method only provides limited backwards compatibility with the old Bio.Align.Generic.Alignment object. Please use the append method with a SeqRecord instead, as the add_sequence method is likely to be removed in a future release of Biopython.", Bio.BiopythonDeprecationWarning)
00603         #Should we handle start/end/strand information somehow? What for?
00604         #TODO - Should we handle weights somehow? See also AlignInfo code...
00605         if start is not None or end is not None or weight != 1.0:
00606             raise ValueError("The add_Sequence method is obsolete, and only "
00607                              "provides limited backwards compatibily. The"
00608                              "start, end and weight arguments are not "
00609                              "supported.")
00610         self.append(SeqRecord(Seq(sequence, self._alphabet),
00611                               id = descriptor, description = descriptor))
00612 
00613 
00614 def _test():
00615     """Run the Bio.Align module's doctests.
00616 
00617     This will try and locate the unit tests directory, and run the doctests
00618     from there in order that the relative paths used in the examples work.
00619     """
00620     import doctest
00621     import os
00622     if os.path.isdir(os.path.join("..", "..", "Tests", "Clustalw")):
00623         print "Runing doctests..."
00624         cur_dir = os.path.abspath(os.curdir)
00625         os.chdir(os.path.join("..", "..", "Tests"))
00626         doctest.testmod()
00627         os.chdir(cur_dir)
00628         del cur_dir
00629         print "Done"
00630     elif os.path.isdir(os.path.join("Tests", "Clustalw")):
00631         print "Runing doctests..."
00632         cur_dir = os.path.abspath(os.curdir)
00633         os.chdir(os.path.join("Tests"))
00634         doctest.testmod()
00635         os.chdir(cur_dir)
00636         del cur_dir
00637         print "Done"
00638 
00639 if __name__ == "__main__":
00640     #Run the doctests
00641     _test()