Back to index

python-biopython  1.60
Seq.py
Go to the documentation of this file.
00001 # Copyright 2000-2002 Brad Chapman.
00002 # Copyright 2004-2005 by M de Hoon.
00003 # Copyright 2007-2010 by Peter Cock.
00004 # All rights reserved.
00005 # This code is part of the Biopython distribution and governed by its
00006 # license.  Please see the LICENSE file that should have been included
00007 # as part of this package.
00008 """Provides objects to represent biological sequences with alphabets.
00009 
00010 See also U{http://biopython.org/wiki/Seq} and the chapter in our tutorial:
00011  - U{http://biopython.org/DIST/docs/tutorial/Tutorial.html}
00012  - U{http://biopython.org/DIST/docs/tutorial/Tutorial.pdf}
00013 """
00014 __docformat__ ="epytext en" #Don't just use plain text in epydoc API pages!
00015 
00016 import string #for maketrans only
00017 import array
00018 import sys
00019 
00020 from Bio import Alphabet
00021 from Bio.Alphabet import IUPAC
00022 from Bio.Data.IUPACData import ambiguous_dna_complement, ambiguous_rna_complement
00023 from Bio.Data import CodonTable
00024 
00025 def _maketrans(complement_mapping):
00026     """Makes a python string translation table (PRIVATE).
00027 
00028     Arguments:
00029      - complement_mapping - a dictionary such as ambiguous_dna_complement
00030        and ambiguous_rna_complement from Data.IUPACData.
00031 
00032     Returns a translation table (a string of length 256) for use with the
00033     python string's translate method to use in a (reverse) complement.
00034     
00035     Compatible with lower case and upper case sequences.
00036 
00037     For internal use only.
00038     """
00039     before = ''.join(complement_mapping.keys())
00040     after  = ''.join(complement_mapping.values())
00041     before = before + before.lower()
00042     after  = after + after.lower()
00043     if sys.version_info[0] == 3 :
00044         return str.maketrans(before, after)
00045     else:
00046         return string.maketrans(before, after)
00047 
00048 _dna_complement_table = _maketrans(ambiguous_dna_complement)
00049 _rna_complement_table = _maketrans(ambiguous_rna_complement)
00050 
00051 class Seq(object):
00052     """A read-only sequence object (essentially a string with an alphabet).
00053 
00054     Like normal python strings, our basic sequence object is immutable.
00055     This prevents you from doing my_seq[5] = "A" for example, but does allow
00056     Seq objects to be used as dictionary keys.
00057 
00058     The Seq object provides a number of string like methods (such as count,
00059     find, split and strip), which are alphabet aware where appropriate.
00060 
00061     In addition to the string like sequence, the Seq object has an alphabet
00062     property. This is an instance of an Alphabet class from Bio.Alphabet,
00063     for example generic DNA, or IUPAC DNA. This describes the type of molecule
00064     (e.g. RNA, DNA, protein) and may also indicate the expected symbols
00065     (letters).
00066 
00067     The Seq object also provides some biological methods, such as complement,
00068     reverse_complement, transcribe, back_transcribe and translate (which are
00069     not applicable to sequences with a protein alphabet).
00070     """
00071     def __init__(self, data, alphabet = Alphabet.generic_alphabet):
00072         """Create a Seq object.
00073 
00074         Arguments:
00075          - seq      - Sequence, required (string)
00076          - alphabet - Optional argument, an Alphabet object from Bio.Alphabet
00077         
00078         You will typically use Bio.SeqIO to read in sequences from files as
00079         SeqRecord objects, whose sequence will be exposed as a Seq object via
00080         the seq property.
00081 
00082         However, will often want to create your own Seq objects directly:
00083 
00084         >>> from Bio.Seq import Seq
00085         >>> from Bio.Alphabet import IUPAC
00086         >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
00087         ...              IUPAC.protein)
00088         >>> my_seq
00089         Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
00090         >>> print my_seq
00091         MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
00092         >>> my_seq.alphabet
00093         IUPACProtein()
00094 
00095         """
00096         # Enforce string storage
00097         if not isinstance(data, basestring):
00098             raise TypeError("The sequence data given to a Seq object should "
00099                             "be a string (not another Seq object etc)")
00100         self._data = data
00101         self.alphabet = alphabet  # Seq API requirement
00102  
00103     # A data property is/was a Seq API requirement
00104     # Note this is read only since the Seq object is meant to be imutable
00105     @property
00106     def data(self) :
00107         """Sequence as a string (DEPRECATED).
00108 
00109         This is a read only property provided for backwards compatility with
00110         older versions of Biopython (as is the tostring() method). We now
00111         encourage you to use str(my_seq) instead of my_seq.data or the method
00112         my_seq.tostring().
00113 
00114         In recent releases of Biopython it was possible to change a Seq object
00115         by updating its data property, but this triggered a deprecation warning.
00116         Now the data property is read only, since Seq objects are meant to be
00117         immutable:
00118 
00119         >>> from Bio.Seq import Seq
00120         >>> from Bio.Alphabet import generic_dna
00121         >>> my_seq = Seq("ACGT", generic_dna)
00122         >>> str(my_seq) == my_seq.tostring() == "ACGT"
00123         True
00124         >>> my_seq.data = "AAAA"
00125         Traceback (most recent call last):
00126            ...
00127         AttributeError: can't set attribute
00128         """
00129         import warnings
00130         import Bio
00131         warnings.warn("Accessing the .data attribute is deprecated. Please "
00132                       "use str(my_seq) or my_seq.tostring() instead of "
00133                       "my_seq.data.", Bio.BiopythonDeprecationWarning)
00134         return str(self)
00135 
00136     def __repr__(self):
00137         """Returns a (truncated) representation of the sequence for debugging."""
00138         if len(self) > 60:
00139             #Shows the last three letters as it is often useful to see if there
00140             #is a stop codon at the end of a sequence.
00141             #Note total length is 54+3+3=60
00142             return "%s('%s...%s', %s)" % (self.__class__.__name__,
00143                                    str(self)[:54], str(self)[-3:],
00144                                    repr(self.alphabet))
00145         else:
00146             return "%s(%s, %s)" % (self.__class__.__name__,
00147                                   repr(self._data),
00148                                    repr(self.alphabet))
00149     def __str__(self):
00150         """Returns the full sequence as a python string, use str(my_seq).
00151 
00152         Note that Biopython 1.44 and earlier would give a truncated
00153         version of repr(my_seq) for str(my_seq).  If you are writing code
00154         which need to be backwards compatible with old Biopython, you
00155         should continue to use my_seq.tostring() rather than str(my_seq).
00156         """
00157         return self._data
00158 
00159     def __hash__(self):
00160         """Hash for comparison.
00161 
00162         See the __cmp__ documentation - we plan to change this!
00163         """
00164         return id(self) #Currently use object identity for equality testing
00165     
00166     def __cmp__(self, other):
00167         """Compare the sequence to another sequence or a string (README).
00168 
00169         Historically comparing Seq objects has done Python object comparison.
00170         After considerable discussion (keeping in mind constraints of the
00171         Python language, hashes and dictionary support) a future release of
00172         Biopython will change this to use simple string comparison. The plan is
00173         that comparing incompatible alphabets (e.g. DNA to RNA) will trigger a
00174         warning.
00175 
00176         This version of Biopython still does Python object comparison, but with
00177         a warning about this future change. During this transition period,
00178         please just do explicit comparisons:
00179 
00180         >>> seq1 = Seq("ACGT")
00181         >>> seq2 = Seq("ACGT")
00182         >>> id(seq1) == id(seq2)
00183         False
00184         >>> str(seq1) == str(seq2)
00185         True
00186 
00187         Note - This method indirectly supports ==, < , etc.
00188         """
00189         if hasattr(other, "alphabet"):
00190             #other should be a Seq or a MutableSeq
00191             import warnings
00192             warnings.warn("In future comparing Seq objects will use string "
00193                           "comparison (not object comparison). Incompatible "
00194                           "alphabets will trigger a warning (not an exception). "
00195                           "In the interim please use id(seq1)==id(seq2) or "
00196                           "str(seq1)==str(seq2) to make your code explicit "
00197                           "and to avoid this warning.", FutureWarning)
00198         return cmp(id(self), id(other))
00199 
00200     def __len__(self):
00201         """Returns the length of the sequence, use len(my_seq)."""
00202         return len(self._data)       # Seq API requirement
00203 
00204     def __getitem__(self, index) :                 # Seq API requirement
00205         """Returns a subsequence of single letter, use my_seq[index]."""
00206         #Note since Python 2.0, __getslice__ is deprecated
00207         #and __getitem__ is used instead.
00208         #See http://docs.python.org/ref/sequence-methods.html
00209         if isinstance(index, int):
00210             #Return a single letter as a string
00211             return self._data[index]
00212         else:
00213             #Return the (sub)sequence as another Seq object
00214             return Seq(self._data[index], self.alphabet)
00215 
00216     def __add__(self, other):
00217         """Add another sequence or string to this sequence.
00218 
00219         If adding a string to a Seq, the alphabet is preserved:
00220 
00221         >>> from Bio.Seq import Seq
00222         >>> from Bio.Alphabet import generic_protein
00223         >>> Seq("MELKI", generic_protein) + "LV"
00224         Seq('MELKILV', ProteinAlphabet())
00225 
00226         When adding two Seq (like) objects, the alphabets are important.
00227         Consider this example:
00228 
00229         >>> from Bio.Seq import Seq
00230         >>> from Bio.Alphabet.IUPAC import unambiguous_dna, ambiguous_dna
00231         >>> unamb_dna_seq = Seq("ACGT", unambiguous_dna)
00232         >>> ambig_dna_seq = Seq("ACRGT", ambiguous_dna)
00233         >>> unamb_dna_seq
00234         Seq('ACGT', IUPACUnambiguousDNA())
00235         >>> ambig_dna_seq
00236         Seq('ACRGT', IUPACAmbiguousDNA())
00237 
00238         If we add the ambiguous and unambiguous IUPAC DNA alphabets, we get
00239         the more general ambiguous IUPAC DNA alphabet:
00240         
00241         >>> unamb_dna_seq + ambig_dna_seq
00242         Seq('ACGTACRGT', IUPACAmbiguousDNA())
00243 
00244         However, if the default generic alphabet is included, the result is
00245         a generic alphabet:
00246 
00247         >>> Seq("") + ambig_dna_seq
00248         Seq('ACRGT', Alphabet())
00249 
00250         You can't add RNA and DNA sequences:
00251         
00252         >>> from Bio.Alphabet import generic_dna, generic_rna
00253         >>> Seq("ACGT", generic_dna) + Seq("ACGU", generic_rna)
00254         Traceback (most recent call last):
00255            ...
00256         TypeError: Incompatible alphabets DNAAlphabet() and RNAAlphabet()
00257 
00258         You can't add nucleotide and protein sequences:
00259 
00260         >>> from Bio.Alphabet import generic_dna, generic_protein
00261         >>> Seq("ACGT", generic_dna) + Seq("MELKI", generic_protein)
00262         Traceback (most recent call last):
00263            ...
00264         TypeError: Incompatible alphabets DNAAlphabet() and ProteinAlphabet()
00265         """
00266         if hasattr(other, "alphabet"):
00267             #other should be a Seq or a MutableSeq
00268             if not Alphabet._check_type_compatible([self.alphabet,
00269                                                     other.alphabet]):
00270                 raise TypeError("Incompatible alphabets %s and %s" \
00271                                 % (repr(self.alphabet), repr(other.alphabet)))
00272             #They should be the same sequence type (or one of them is generic)
00273             a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet])
00274             return self.__class__(str(self) + str(other), a)
00275         elif isinstance(other, basestring):
00276             #other is a plain string - use the current alphabet
00277             return self.__class__(str(self) + other, self.alphabet)
00278         from Bio.SeqRecord import SeqRecord #Lazy to avoid circular imports
00279         if isinstance(other, SeqRecord):
00280             #Get the SeqRecord's __radd__ to handle this
00281             return NotImplemented
00282         else :
00283             raise TypeError
00284 
00285     def __radd__(self, other):
00286         """Adding a sequence on the left.
00287 
00288         If adding a string to a Seq, the alphabet is preserved:
00289 
00290         >>> from Bio.Seq import Seq
00291         >>> from Bio.Alphabet import generic_protein
00292         >>> "LV" + Seq("MELKI", generic_protein)
00293         Seq('LVMELKI', ProteinAlphabet())
00294 
00295         Adding two Seq (like) objects is handled via the __add__ method.
00296         """
00297         if hasattr(other, "alphabet"):
00298             #other should be a Seq or a MutableSeq
00299             if not Alphabet._check_type_compatible([self.alphabet,
00300                                                     other.alphabet]):
00301                 raise TypeError("Incompatable alphabets %s and %s" \
00302                                 % (repr(self.alphabet), repr(other.alphabet)))
00303             #They should be the same sequence type (or one of them is generic)
00304             a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet])
00305             return self.__class__(str(other) + str(self), a)
00306         elif isinstance(other, basestring):
00307             #other is a plain string - use the current alphabet
00308             return self.__class__(other + str(self), self.alphabet)
00309         else:
00310             raise TypeError
00311 
00312     def tostring(self):                            # Seq API requirement
00313         """Returns the full sequence as a python string (semi-obsolete).
00314 
00315         Although not formally deprecated, you are now encouraged to use
00316         str(my_seq) instead of my_seq.tostring()."""
00317         #TODO - Fix all places elsewhere in Biopython using this method,
00318         #then start deprecation process?
00319         #import warnings
00320         #warnings.warn("This method is obsolete; please use str(my_seq) "
00321         #              "instead of my_seq.tostring().",
00322         #              PendingDeprecationWarning)
00323         return str(self)
00324     
00325     def tomutable(self):   # Needed?  Or use a function?
00326         """Returns the full sequence as a MutableSeq object.
00327 
00328         >>> from Bio.Seq import Seq
00329         >>> from Bio.Alphabet import IUPAC
00330         >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAAL",
00331         ...              IUPAC.protein)
00332         >>> my_seq
00333         Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
00334         >>> my_seq.tomutable()
00335         MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
00336 
00337         Note that the alphabet is preserved.
00338         """
00339         return MutableSeq(str(self), self.alphabet)
00340 
00341     def _get_seq_str_and_check_alphabet(self, other_sequence):
00342         """string/Seq/MutableSeq to string, checking alphabet (PRIVATE).
00343 
00344         For a string argument, returns the string.
00345 
00346         For a Seq or MutableSeq, it checks the alphabet is compatible
00347         (raising an exception if it isn't), and then returns a string.
00348         """
00349         try:
00350             other_alpha = other_sequence.alphabet
00351         except AttributeError:
00352             #Assume other_sequence is a string
00353             return other_sequence
00354 
00355         #Other should be a Seq or a MutableSeq
00356         if not Alphabet._check_type_compatible([self.alphabet, other_alpha]):
00357             raise TypeError("Incompatable alphabets %s and %s" \
00358                             % (repr(self.alphabet), repr(other_alpha)))
00359         #Return as a string
00360         return str(other_sequence)
00361     
00362     def count(self, sub, start=0, end=sys.maxint):
00363         """Non-overlapping count method, like that of a python string.
00364 
00365         This behaves like the python string method of the same name,
00366         which does a non-overlapping count!
00367 
00368         Returns an integer, the number of occurrences of substring
00369         argument sub in the (sub)sequence given by [start:end].
00370         Optional arguments start and end are interpreted as in slice
00371         notation.
00372     
00373         Arguments:
00374          - sub - a string or another Seq object to look for
00375          - start - optional integer, slice start
00376          - end - optional integer, slice end
00377 
00378         e.g.
00379 
00380         >>> from Bio.Seq import Seq
00381         >>> my_seq = Seq("AAAATGA")
00382         >>> print my_seq.count("A")
00383         5
00384         >>> print my_seq.count("ATG")
00385         1
00386         >>> print my_seq.count(Seq("AT"))
00387         1
00388         >>> print my_seq.count("AT", 2, -1)
00389         1
00390 
00391         HOWEVER, please note because python strings and Seq objects (and
00392         MutableSeq objects) do a non-overlapping search, this may not give
00393         the answer you expect:
00394 
00395         >>> "AAAA".count("AA")
00396         2
00397         >>> print Seq("AAAA").count("AA")
00398         2
00399 
00400         A non-overlapping search would give the answer as three!
00401         """
00402         #If it has one, check the alphabet:
00403         sub_str = self._get_seq_str_and_check_alphabet(sub)
00404         return str(self).count(sub_str, start, end)
00405 
00406     def __contains__(self, char):
00407         """Implements the 'in' keyword, like a python string.
00408 
00409         e.g.
00410 
00411         >>> from Bio.Seq import Seq
00412         >>> from Bio.Alphabet import generic_dna, generic_rna, generic_protein
00413         >>> my_dna = Seq("ATATGAAATTTGAAAA", generic_dna)
00414         >>> "AAA" in my_dna
00415         True
00416         >>> Seq("AAA") in my_dna
00417         True
00418         >>> Seq("AAA", generic_dna) in my_dna
00419         True
00420 
00421         Like other Seq methods, this will raise a type error if another Seq
00422         (or Seq like) object with an incompatible alphabet is used:
00423 
00424         >>> Seq("AAA", generic_rna) in my_dna
00425         Traceback (most recent call last):
00426            ...
00427         TypeError: Incompatable alphabets DNAAlphabet() and RNAAlphabet()
00428         >>> Seq("AAA", generic_protein) in my_dna
00429         Traceback (most recent call last):
00430            ...
00431         TypeError: Incompatable alphabets DNAAlphabet() and ProteinAlphabet()
00432         """
00433         #If it has one, check the alphabet:
00434         sub_str = self._get_seq_str_and_check_alphabet(char)
00435         return sub_str in str(self)
00436 
00437     def find(self, sub, start=0, end=sys.maxint):
00438         """Find method, like that of a python string.
00439 
00440         This behaves like the python string method of the same name.
00441 
00442         Returns an integer, the index of the first occurrence of substring
00443         argument sub in the (sub)sequence given by [start:end].
00444 
00445         Arguments:
00446          - sub - a string or another Seq object to look for
00447          - start - optional integer, slice start
00448          - end - optional integer, slice end
00449 
00450         Returns -1 if the subsequence is NOT found.
00451         
00452         e.g. Locating the first typical start codon, AUG, in an RNA sequence:
00453 
00454         >>> from Bio.Seq import Seq
00455         >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
00456         >>> my_rna.find("AUG")
00457         3
00458         """
00459         #If it has one, check the alphabet:
00460         sub_str = self._get_seq_str_and_check_alphabet(sub)
00461         return str(self).find(sub_str, start, end)
00462 
00463     def rfind(self, sub, start=0, end=sys.maxint):
00464         """Find from right method, like that of a python string.
00465 
00466         This behaves like the python string method of the same name.
00467 
00468         Returns an integer, the index of the last (right most) occurrence of
00469         substring argument sub in the (sub)sequence given by [start:end].
00470 
00471         Arguments:
00472          - sub - a string or another Seq object to look for
00473          - start - optional integer, slice start
00474          - end - optional integer, slice end
00475 
00476         Returns -1 if the subsequence is NOT found.
00477 
00478         e.g. Locating the last typical start codon, AUG, in an RNA sequence:
00479 
00480         >>> from Bio.Seq import Seq
00481         >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
00482         >>> my_rna.rfind("AUG")
00483         15
00484         """
00485         #If it has one, check the alphabet:
00486         sub_str = self._get_seq_str_and_check_alphabet(sub)
00487         return str(self).rfind(sub_str, start, end)
00488 
00489     def startswith(self, prefix, start=0, end=sys.maxint):
00490         """Does the Seq start with the given prefix?  Returns True/False.
00491 
00492         This behaves like the python string method of the same name.
00493 
00494         Return True if the sequence starts with the specified prefix
00495         (a string or another Seq object), False otherwise.
00496         With optional start, test sequence beginning at that position.
00497         With optional end, stop comparing sequence at that position.
00498         prefix can also be a tuple of strings to try.  e.g.
00499         
00500         >>> from Bio.Seq import Seq
00501         >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
00502         >>> my_rna.startswith("GUC")
00503         True
00504         >>> my_rna.startswith("AUG")
00505         False
00506         >>> my_rna.startswith("AUG", 3)
00507         True
00508         >>> my_rna.startswith(("UCC","UCA","UCG"),1)
00509         True
00510         """
00511         #If it has one, check the alphabet:
00512         if isinstance(prefix, tuple):
00513             #TODO - Once we drop support for Python 2.4, instead of this
00514             #loop offload to the string method (requires Python 2.5+).
00515             #Check all the alphabets first...
00516             prefix_strings = [self._get_seq_str_and_check_alphabet(p) \
00517                               for p in prefix]
00518             for prefix_str in prefix_strings:
00519                 if str(self).startswith(prefix_str, start, end):
00520                     return True
00521             return False
00522         else:
00523             prefix_str = self._get_seq_str_and_check_alphabet(prefix)
00524             return str(self).startswith(prefix_str, start, end)
00525 
00526     def endswith(self, suffix, start=0, end=sys.maxint):
00527         """Does the Seq end with the given suffix?  Returns True/False.
00528 
00529         This behaves like the python string method of the same name.
00530 
00531         Return True if the sequence ends with the specified suffix
00532         (a string or another Seq object), False otherwise.
00533         With optional start, test sequence beginning at that position.
00534         With optional end, stop comparing sequence at that position.
00535         suffix can also be a tuple of strings to try.  e.g.
00536 
00537         >>> from Bio.Seq import Seq
00538         >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
00539         >>> my_rna.endswith("UUG")
00540         True
00541         >>> my_rna.endswith("AUG")
00542         False
00543         >>> my_rna.endswith("AUG", 0, 18)
00544         True
00545         >>> my_rna.endswith(("UCC","UCA","UUG"))
00546         True
00547         """        
00548         #If it has one, check the alphabet:
00549         if isinstance(suffix, tuple):
00550             #TODO - Once we drop support for Python 2.4, instead of this
00551             #loop offload to the string method (requires Python 2.5+).
00552             #Check all the alphabets first...
00553             suffix_strings = [self._get_seq_str_and_check_alphabet(p) \
00554                               for p in suffix]
00555             for suffix_str in suffix_strings:
00556                 if str(self).endswith(suffix_str, start, end):
00557                     return True
00558             return False
00559         else:
00560             suffix_str = self._get_seq_str_and_check_alphabet(suffix)
00561             return str(self).endswith(suffix_str, start, end)
00562 
00563 
00564     def split(self, sep=None, maxsplit=-1):
00565         """Split method, like that of a python string.
00566 
00567         This behaves like the python string method of the same name.
00568 
00569         Return a list of the 'words' in the string (as Seq objects),
00570         using sep as the delimiter string.  If maxsplit is given, at
00571         most maxsplit splits are done.  If maxsplit is ommited, all
00572         splits are made.
00573 
00574         Following the python string method, sep will by default be any
00575         white space (tabs, spaces, newlines) but this is unlikely to
00576         apply to biological sequences.
00577         
00578         e.g.
00579 
00580         >>> from Bio.Seq import Seq
00581         >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
00582         >>> my_aa = my_rna.translate()
00583         >>> my_aa
00584         Seq('VMAIVMGR*KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*'))
00585         >>> my_aa.split("*")
00586         [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))]
00587         >>> my_aa.split("*",1)
00588         [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*'))]
00589 
00590         See also the rsplit method:
00591 
00592         >>> my_aa.rsplit("*",1)
00593         [Seq('VMAIVMGR*KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))]
00594         """
00595         #If it has one, check the alphabet:
00596         sep_str = self._get_seq_str_and_check_alphabet(sep)
00597         #TODO - If the sep is the defined stop symbol, or gap char,
00598         #should we adjust the alphabet?
00599         return [Seq(part, self.alphabet) \
00600                 for part in str(self).split(sep_str, maxsplit)]
00601 
00602     def rsplit(self, sep=None, maxsplit=-1):
00603         """Right split method, like that of a python string.
00604 
00605         This behaves like the python string method of the same name.
00606 
00607         Return a list of the 'words' in the string (as Seq objects),
00608         using sep as the delimiter string.  If maxsplit is given, at
00609         most maxsplit splits are done COUNTING FROM THE RIGHT.
00610         If maxsplit is ommited, all splits are made.
00611 
00612         Following the python string method, sep will by default be any
00613         white space (tabs, spaces, newlines) but this is unlikely to
00614         apply to biological sequences.
00615         
00616         e.g. print my_seq.rsplit("*",1)
00617 
00618         See also the split method.
00619         """
00620         #If it has one, check the alphabet:
00621         sep_str = self._get_seq_str_and_check_alphabet(sep)
00622         return [Seq(part, self.alphabet) \
00623                 for part in str(self).rsplit(sep_str, maxsplit)]
00624 
00625     def strip(self, chars=None):
00626         """Returns a new Seq object with leading and trailing ends stripped.
00627 
00628         This behaves like the python string method of the same name.
00629 
00630         Optional argument chars defines which characters to remove.  If
00631         ommitted or None (default) then as for the python string method,
00632         this defaults to removing any white space.
00633         
00634         e.g. print my_seq.strip("-")
00635 
00636         See also the lstrip and rstrip methods.
00637         """
00638         #If it has one, check the alphabet:
00639         strip_str = self._get_seq_str_and_check_alphabet(chars)
00640         return Seq(str(self).strip(strip_str), self.alphabet)
00641 
00642     def lstrip(self, chars=None):
00643         """Returns a new Seq object with leading (left) end stripped.
00644 
00645         This behaves like the python string method of the same name.
00646 
00647         Optional argument chars defines which characters to remove.  If
00648         ommitted or None (default) then as for the python string method,
00649         this defaults to removing any white space.
00650         
00651         e.g. print my_seq.lstrip("-")
00652 
00653         See also the strip and rstrip methods.
00654         """
00655         #If it has one, check the alphabet:
00656         strip_str = self._get_seq_str_and_check_alphabet(chars)
00657         return Seq(str(self).lstrip(strip_str), self.alphabet)
00658 
00659     def rstrip(self, chars=None):
00660         """Returns a new Seq object with trailing (right) end stripped.
00661 
00662         This behaves like the python string method of the same name.
00663 
00664         Optional argument chars defines which characters to remove.  If
00665         ommitted or None (default) then as for the python string method,
00666         this defaults to removing any white space.
00667         
00668         e.g. Removing a nucleotide sequence's polyadenylation (poly-A tail):
00669 
00670         >>> from Bio.Alphabet import IUPAC
00671         >>> from Bio.Seq import Seq
00672         >>> my_seq = Seq("CGGTACGCTTATGTCACGTAGAAAAAA", IUPAC.unambiguous_dna)
00673         >>> my_seq
00674         Seq('CGGTACGCTTATGTCACGTAGAAAAAA', IUPACUnambiguousDNA())
00675         >>> my_seq.rstrip("A")
00676         Seq('CGGTACGCTTATGTCACGTAG', IUPACUnambiguousDNA())
00677 
00678         See also the strip and lstrip methods.
00679         """
00680         #If it has one, check the alphabet:
00681         strip_str = self._get_seq_str_and_check_alphabet(chars)
00682         return Seq(str(self).rstrip(strip_str), self.alphabet)
00683 
00684     def upper(self):
00685         """Returns an upper case copy of the sequence.
00686 
00687         >>> from Bio.Alphabet import HasStopCodon, generic_protein
00688         >>> from Bio.Seq import Seq
00689         >>> my_seq = Seq("VHLTPeeK*", HasStopCodon(generic_protein))
00690         >>> my_seq
00691         Seq('VHLTPeeK*', HasStopCodon(ProteinAlphabet(), '*'))
00692         >>> my_seq.lower()
00693         Seq('vhltpeek*', HasStopCodon(ProteinAlphabet(), '*'))
00694         >>> my_seq.upper()
00695         Seq('VHLTPEEK*', HasStopCodon(ProteinAlphabet(), '*'))
00696 
00697         This will adjust the alphabet if required. See also the lower method.
00698         """
00699         return Seq(str(self).upper(), self.alphabet._upper())
00700 
00701     def lower(self):
00702         """Returns a lower case copy of the sequence.
00703 
00704         This will adjust the alphabet if required. Note that the IUPAC alphabets
00705         are upper case only, and thus a generic alphabet must be substituted.
00706 
00707         >>> from Bio.Alphabet import Gapped, generic_dna
00708         >>> from Bio.Alphabet import IUPAC
00709         >>> from Bio.Seq import Seq
00710         >>> my_seq = Seq("CGGTACGCTTATGTCACGTAG*AAAAAA", Gapped(IUPAC.unambiguous_dna, "*"))
00711         >>> my_seq
00712         Seq('CGGTACGCTTATGTCACGTAG*AAAAAA', Gapped(IUPACUnambiguousDNA(), '*'))
00713         >>> my_seq.lower()
00714         Seq('cggtacgcttatgtcacgtag*aaaaaa', Gapped(DNAAlphabet(), '*'))
00715 
00716         See also the upper method.
00717         """
00718         return Seq(str(self).lower(), self.alphabet._lower())
00719 
00720     def complement(self):
00721         """Returns the complement sequence. New Seq object.
00722 
00723         >>> from Bio.Seq import Seq
00724         >>> from Bio.Alphabet import IUPAC
00725         >>> my_dna = Seq("CCCCCGATAG", IUPAC.unambiguous_dna)
00726         >>> my_dna
00727         Seq('CCCCCGATAG', IUPACUnambiguousDNA())
00728         >>> my_dna.complement()
00729         Seq('GGGGGCTATC', IUPACUnambiguousDNA())
00730 
00731         You can of course used mixed case sequences,
00732 
00733         >>> from Bio.Seq import Seq
00734         >>> from Bio.Alphabet import generic_dna
00735         >>> my_dna = Seq("CCCCCgatA-GD", generic_dna)
00736         >>> my_dna
00737         Seq('CCCCCgatA-GD', DNAAlphabet())
00738         >>> my_dna.complement()
00739         Seq('GGGGGctaT-CH', DNAAlphabet())
00740 
00741         Note in the above example, ambiguous character D denotes
00742         G, A or T so its complement is H (for C, T or A).
00743         
00744         Trying to complement a protein sequence raises an exception.
00745 
00746         >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
00747         >>> my_protein.complement()
00748         Traceback (most recent call last):
00749            ...
00750         ValueError: Proteins do not have complements!
00751         """
00752         base = Alphabet._get_base_alphabet(self.alphabet)
00753         if isinstance(base, Alphabet.ProteinAlphabet):
00754             raise ValueError("Proteins do not have complements!")
00755         if isinstance(base, Alphabet.DNAAlphabet):
00756             ttable = _dna_complement_table
00757         elif isinstance(base, Alphabet.RNAAlphabet):
00758             ttable = _rna_complement_table
00759         elif ('U' in self._data or 'u' in self._data) \
00760         and ('T' in self._data or 't' in self._data):
00761             #TODO - Handle this cleanly?
00762             raise ValueError("Mixed RNA/DNA found")
00763         elif 'U' in self._data or 'u' in self._data:
00764             ttable = _rna_complement_table
00765         else:
00766             ttable = _dna_complement_table
00767         #Much faster on really long sequences than the previous loop based one.
00768         #thx to Michael Palmer, University of Waterloo
00769         return Seq(str(self).translate(ttable), self.alphabet)
00770 
00771     def reverse_complement(self):
00772         """Returns the reverse complement sequence. New Seq object.
00773 
00774         >>> from Bio.Seq import Seq
00775         >>> from Bio.Alphabet import IUPAC
00776         >>> my_dna = Seq("CCCCCGATAGNR", IUPAC.ambiguous_dna)
00777         >>> my_dna
00778         Seq('CCCCCGATAGNR', IUPACAmbiguousDNA())
00779         >>> my_dna.reverse_complement()
00780         Seq('YNCTATCGGGGG', IUPACAmbiguousDNA())
00781 
00782         Note in the above example, since R = G or A, its complement
00783         is Y (which denotes C or T).
00784 
00785         You can of course used mixed case sequences,
00786 
00787         >>> from Bio.Seq import Seq
00788         >>> from Bio.Alphabet import generic_dna
00789         >>> my_dna = Seq("CCCCCgatA-G", generic_dna)
00790         >>> my_dna
00791         Seq('CCCCCgatA-G', DNAAlphabet())
00792         >>> my_dna.reverse_complement()
00793         Seq('C-TatcGGGGG', DNAAlphabet())
00794 
00795         Trying to complement a protein sequence raises an exception:
00796 
00797         >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
00798         >>> my_protein.reverse_complement()
00799         Traceback (most recent call last):
00800            ...
00801         ValueError: Proteins do not have complements!
00802         """
00803         #Use -1 stride/step to reverse the complement
00804         return self.complement()[::-1]
00805 
00806     def transcribe(self):
00807         """Returns the RNA sequence from a DNA sequence. New Seq object.
00808 
00809         >>> from Bio.Seq import Seq
00810         >>> from Bio.Alphabet import IUPAC
00811         >>> coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG",
00812         ...                  IUPAC.unambiguous_dna)
00813         >>> coding_dna
00814         Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA())
00815         >>> coding_dna.transcribe()
00816         Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA())
00817 
00818         Trying to transcribe a protein or RNA sequence raises an exception:
00819 
00820         >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
00821         >>> my_protein.transcribe()
00822         Traceback (most recent call last):
00823            ...
00824         ValueError: Proteins cannot be transcribed!
00825         """
00826         base = Alphabet._get_base_alphabet(self.alphabet)
00827         if isinstance(base, Alphabet.ProteinAlphabet):
00828             raise ValueError("Proteins cannot be transcribed!")
00829         if isinstance(base, Alphabet.RNAAlphabet):
00830             raise ValueError("RNA cannot be transcribed!")
00831 
00832         if self.alphabet==IUPAC.unambiguous_dna:
00833             alphabet = IUPAC.unambiguous_rna
00834         elif self.alphabet==IUPAC.ambiguous_dna:
00835             alphabet = IUPAC.ambiguous_rna
00836         else:
00837             alphabet = Alphabet.generic_rna
00838         return Seq(str(self).replace('T','U').replace('t','u'), alphabet)
00839     
00840     def back_transcribe(self):
00841         """Returns the DNA sequence from an RNA sequence. New Seq object.
00842 
00843         >>> from Bio.Seq import Seq
00844         >>> from Bio.Alphabet import IUPAC
00845         >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG",
00846         ...                     IUPAC.unambiguous_rna)
00847         >>> messenger_rna
00848         Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA())
00849         >>> messenger_rna.back_transcribe()
00850         Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA())
00851 
00852         Trying to back-transcribe a protein or DNA sequence raises an
00853         exception:
00854 
00855         >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
00856         >>> my_protein.back_transcribe()
00857         Traceback (most recent call last):
00858            ...
00859         ValueError: Proteins cannot be back transcribed!
00860         """
00861         base = Alphabet._get_base_alphabet(self.alphabet)
00862         if isinstance(base, Alphabet.ProteinAlphabet):
00863             raise ValueError("Proteins cannot be back transcribed!")
00864         if isinstance(base, Alphabet.DNAAlphabet):
00865             raise ValueError("DNA cannot be back transcribed!")
00866 
00867         if self.alphabet==IUPAC.unambiguous_rna:
00868             alphabet = IUPAC.unambiguous_dna
00869         elif self.alphabet==IUPAC.ambiguous_rna:
00870             alphabet = IUPAC.ambiguous_dna
00871         else:
00872             alphabet = Alphabet.generic_dna
00873         return Seq(str(self).replace("U", "T").replace("u", "t"), alphabet)
00874 
00875     def translate(self, table="Standard", stop_symbol="*", to_stop=False,
00876                   cds=False):
00877         """Turns a nucleotide sequence into a protein sequence. New Seq object.
00878 
00879         This method will translate DNA or RNA sequences, and those with a
00880         nucleotide or generic alphabet.  Trying to translate a protein
00881         sequence raises an exception.
00882 
00883         Arguments:
00884          - table - Which codon table to use?  This can be either a name
00885                    (string), an NCBI identifier (integer), or a CodonTable
00886                    object (useful for non-standard genetic codes).  This
00887                    defaults to the "Standard" table.
00888          - stop_symbol - Single character string, what to use for terminators.
00889                          This defaults to the asterisk, "*".
00890          - to_stop - Boolean, defaults to False meaning do a full translation
00891                      continuing on past any stop codons (translated as the
00892                      specified stop_symbol).  If True, translation is
00893                      terminated at the first in frame stop codon (and the
00894                      stop_symbol is not appended to the returned protein
00895                      sequence).
00896          - cds - Boolean, indicates this is a complete CDS.  If True,
00897                  this checks the sequence starts with a valid alternative start
00898                  codon (which will be translated as methionine, M), that the
00899                  sequence length is a multiple of three, and that there is a
00900                  single in frame stop codon at the end (this will be excluded
00901                  from the protein sequence, regardless of the to_stop option).
00902                  If these tests fail, an exception is raised.
00903         
00904         e.g. Using the standard table:
00905 
00906         >>> coding_dna = Seq("GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
00907         >>> coding_dna.translate()
00908         Seq('VAIVMGR*KGAR*', HasStopCodon(ExtendedIUPACProtein(), '*'))
00909         >>> coding_dna.translate(stop_symbol="@")
00910         Seq('VAIVMGR@KGAR@', HasStopCodon(ExtendedIUPACProtein(), '@'))
00911         >>> coding_dna.translate(to_stop=True)
00912         Seq('VAIVMGR', ExtendedIUPACProtein())
00913 
00914         Now using NCBI table 2, where TGA is not a stop codon:
00915 
00916         >>> coding_dna.translate(table=2)
00917         Seq('VAIVMGRWKGAR*', HasStopCodon(ExtendedIUPACProtein(), '*'))
00918         >>> coding_dna.translate(table=2, to_stop=True)
00919         Seq('VAIVMGRWKGAR', ExtendedIUPACProtein())
00920 
00921         In fact, GTG is an alternative start codon under NCBI table 2, meaning
00922         this sequence could be a complete CDS:
00923 
00924         >>> coding_dna.translate(table=2, cds=True)
00925         Seq('MAIVMGRWKGAR', ExtendedIUPACProtein())
00926 
00927         It isn't a valid CDS under NCBI table 1, due to both the start codon and
00928         also the in frame stop codons:
00929         
00930         >>> coding_dna.translate(table=1, cds=True)
00931         Traceback (most recent call last):
00932             ...
00933         TranslationError: First codon 'GTG' is not a start codon
00934 
00935         If the sequence has no in-frame stop codon, then the to_stop argument
00936         has no effect:
00937 
00938         >>> coding_dna2 = Seq("TTGGCCATTGTAATGGGCCGC")
00939         >>> coding_dna2.translate()
00940         Seq('LAIVMGR', ExtendedIUPACProtein())
00941         >>> coding_dna2.translate(to_stop=True)
00942         Seq('LAIVMGR', ExtendedIUPACProtein())
00943 
00944         NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
00945         or a stop codon.  These are translated as "X".  Any invalid codon
00946         (e.g. "TA?" or "T-A") will throw a TranslationError.
00947 
00948         NOTE - Does NOT support gapped sequences.
00949 
00950         NOTE - This does NOT behave like the python string's translate
00951         method.  For that use str(my_seq).translate(...) instead.
00952         """
00953         if isinstance(table, str) and len(table)==256:
00954             raise ValueError("The Seq object translate method DOES NOT take " \
00955                              + "a 256 character string mapping table like " \
00956                              + "the python string object's translate method. " \
00957                              + "Use str(my_seq).translate(...) instead.")
00958         if isinstance(Alphabet._get_base_alphabet(self.alphabet),
00959                       Alphabet.ProteinAlphabet):
00960             raise ValueError("Proteins cannot be translated!")
00961         try:
00962             table_id = int(table)
00963         except ValueError:
00964             #Assume its a table name
00965             if self.alphabet==IUPAC.unambiguous_dna:
00966                 #Will use standard IUPAC protein alphabet, no need for X
00967                 codon_table = CodonTable.unambiguous_dna_by_name[table]
00968             elif self.alphabet==IUPAC.unambiguous_rna:
00969                 #Will use standard IUPAC protein alphabet, no need for X
00970                 codon_table = CodonTable.unambiguous_rna_by_name[table]
00971             else:
00972                 #This will use the extended IUPAC protein alphabet with X etc.
00973                 #The same table can be used for RNA or DNA (we use this for
00974                 #translating strings).
00975                 codon_table = CodonTable.ambiguous_generic_by_name[table]
00976         except (AttributeError, TypeError):
00977             #Assume its a CodonTable object
00978             if isinstance(table, CodonTable.CodonTable):
00979                 codon_table = table
00980             else:
00981                 raise ValueError('Bad table argument')
00982         else:
00983             #Assume its a table ID
00984             if self.alphabet==IUPAC.unambiguous_dna:
00985                 #Will use standard IUPAC protein alphabet, no need for X
00986                 codon_table = CodonTable.unambiguous_dna_by_id[table_id]
00987             elif self.alphabet==IUPAC.unambiguous_rna:
00988                 #Will use standard IUPAC protein alphabet, no need for X
00989                 codon_table = CodonTable.unambiguous_rna_by_id[table_id]
00990             else:
00991                 #This will use the extended IUPAC protein alphabet with X etc.
00992                 #The same table can be used for RNA or DNA (we use this for
00993                 #translating strings).
00994                 codon_table = CodonTable.ambiguous_generic_by_id[table_id]
00995         protein = _translate_str(str(self), codon_table, \
00996                                  stop_symbol, to_stop, cds)
00997         if stop_symbol in protein:
00998             alphabet = Alphabet.HasStopCodon(codon_table.protein_alphabet,
00999                                              stop_symbol = stop_symbol)
01000         else:
01001             alphabet = codon_table.protein_alphabet
01002         return Seq(protein, alphabet)
01003 
01004     def ungap(self, gap=None):
01005         """Return a copy of the sequence without the gap character(s).
01006 
01007         The gap character can be specified in two ways - either as an explicit
01008         argument, or via the sequence's alphabet. For example:
01009 
01010         >>> from Bio.Seq import Seq
01011         >>> from Bio.Alphabet import generic_dna
01012         >>> my_dna = Seq("-ATA--TGAAAT-TTGAAAA", generic_dna)
01013         >>> my_dna
01014         Seq('-ATA--TGAAAT-TTGAAAA', DNAAlphabet())
01015         >>> my_dna.ungap("-")
01016         Seq('ATATGAAATTTGAAAA', DNAAlphabet())
01017 
01018         If the gap character is not given as an argument, it will be taken from
01019         the sequence's alphabet (if defined). Notice that the returned sequence's
01020         alphabet is adjusted since it no longer requires a gapped alphabet:
01021 
01022         >>> from Bio.Seq import Seq
01023         >>> from Bio.Alphabet import IUPAC, Gapped, HasStopCodon
01024         >>> my_pro = Seq("MVVLE=AD*", HasStopCodon(Gapped(IUPAC.protein, "=")))
01025         >>> my_pro
01026         Seq('MVVLE=AD*', HasStopCodon(Gapped(IUPACProtein(), '='), '*'))
01027         >>> my_pro.ungap()
01028         Seq('MVVLEAD*', HasStopCodon(IUPACProtein(), '*'))
01029 
01030         Or, with a simpler gapped DNA example:
01031 
01032         >>> from Bio.Seq import Seq
01033         >>> from Bio.Alphabet import IUPAC, Gapped
01034         >>> my_seq = Seq("CGGGTAG=AAAAAA", Gapped(IUPAC.unambiguous_dna, "="))
01035         >>> my_seq
01036         Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '='))
01037         >>> my_seq.ungap()
01038         Seq('CGGGTAGAAAAAA', IUPACUnambiguousDNA())
01039 
01040         As long as it is consistent with the alphabet, although it is redundant,
01041         you can still supply the gap character as an argument to this method:
01042 
01043         >>> my_seq
01044         Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '='))
01045         >>> my_seq.ungap("=")
01046         Seq('CGGGTAGAAAAAA', IUPACUnambiguousDNA())
01047         
01048         However, if the gap character given as the argument disagrees with that
01049         declared in the alphabet, an exception is raised:
01050 
01051         >>> my_seq
01052         Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '='))
01053         >>> my_seq.ungap("-")
01054         Traceback (most recent call last):
01055            ...
01056         ValueError: Gap '-' does not match '=' from alphabet
01057 
01058         Finally, if a gap character is not supplied, and the alphabet does not
01059         define one, an exception is raised:
01060 
01061         >>> from Bio.Seq import Seq
01062         >>> from Bio.Alphabet import generic_dna
01063         >>> my_dna = Seq("ATA--TGAAAT-TTGAAAA", generic_dna)
01064         >>> my_dna
01065         Seq('ATA--TGAAAT-TTGAAAA', DNAAlphabet())
01066         >>> my_dna.ungap()
01067         Traceback (most recent call last):
01068            ...
01069         ValueError: Gap character not given and not defined in alphabet
01070 
01071         """
01072         if hasattr(self.alphabet, "gap_char"):
01073             if not gap:
01074                 gap = self.alphabet.gap_char
01075             elif gap != self.alphabet.gap_char:
01076                 raise ValueError("Gap %s does not match %s from alphabet" \
01077                                  % (repr(gap), repr(self.alphabet.gap_char)))
01078             alpha = Alphabet._ungap(self.alphabet)
01079         elif not gap:
01080             raise ValueError("Gap character not given and not defined in alphabet")
01081         else:
01082             alpha = self.alphabet #modify!
01083         if len(gap)!=1 or not isinstance(gap, str):
01084             raise ValueError("Unexpected gap character, %s" % repr(gap))
01085         return Seq(str(self).replace(gap, ""), alpha)
01086 
01087 class UnknownSeq(Seq):
01088     """A read-only sequence object of known length but unknown contents.
01089 
01090     If you have an unknown sequence, you can represent this with a normal
01091     Seq object, for example:
01092 
01093     >>> my_seq = Seq("N"*5)
01094     >>> my_seq
01095     Seq('NNNNN', Alphabet())
01096     >>> len(my_seq)
01097     5
01098     >>> print my_seq
01099     NNNNN
01100 
01101     However, this is rather wasteful of memory (especially for large
01102     sequences), which is where this class is most usefull:
01103 
01104     >>> unk_five = UnknownSeq(5)
01105     >>> unk_five
01106     UnknownSeq(5, alphabet = Alphabet(), character = '?')
01107     >>> len(unk_five)
01108     5
01109     >>> print(unk_five)
01110     ?????
01111 
01112     You can add unknown sequence together, provided their alphabets and
01113     characters are compatible, and get another memory saving UnknownSeq:
01114 
01115     >>> unk_four = UnknownSeq(4)
01116     >>> unk_four
01117     UnknownSeq(4, alphabet = Alphabet(), character = '?')
01118     >>> unk_four + unk_five
01119     UnknownSeq(9, alphabet = Alphabet(), character = '?')
01120 
01121     If the alphabet or characters don't match up, the addition gives an
01122     ordinary Seq object:
01123     
01124     >>> unk_nnnn = UnknownSeq(4, character = "N")
01125     >>> unk_nnnn
01126     UnknownSeq(4, alphabet = Alphabet(), character = 'N')
01127     >>> unk_nnnn + unk_four
01128     Seq('NNNN????', Alphabet())
01129 
01130     Combining with a real Seq gives a new Seq object:
01131 
01132     >>> known_seq = Seq("ACGT")
01133     >>> unk_four + known_seq
01134     Seq('????ACGT', Alphabet())
01135     >>> known_seq + unk_four
01136     Seq('ACGT????', Alphabet())
01137     """
01138     def __init__(self, length, alphabet = Alphabet.generic_alphabet, character = None):
01139         """Create a new UnknownSeq object.
01140 
01141         If character is ommited, it is determed from the alphabet, "N" for
01142         nucleotides, "X" for proteins, and "?" otherwise.
01143         """
01144         self._length = int(length)
01145         if self._length < 0:
01146             #TODO - Block zero length UnknownSeq?  You can just use a Seq!
01147             raise ValueError("Length must not be negative.")
01148         self.alphabet = alphabet
01149         if character:
01150             if len(character) != 1:
01151                 raise ValueError("character argument should be a single letter string.")
01152             self._character = character
01153         else:
01154             base = Alphabet._get_base_alphabet(alphabet)
01155             #TODO? Check the case of the letters in the alphabet?
01156             #We may have to use "n" instead of "N" etc.
01157             if isinstance(base, Alphabet.NucleotideAlphabet):
01158                 self._character = "N"
01159             elif isinstance(base, Alphabet.ProteinAlphabet):
01160                 self._character = "X"
01161             else:
01162                 self._character = "?"
01163 
01164     def __len__(self):
01165         """Returns the stated length of the unknown sequence."""
01166         return self._length
01167     
01168     def __str__(self):
01169         """Returns the unknown sequence as full string of the given length."""
01170         return self._character * self._length
01171 
01172     def __repr__(self):
01173         return "UnknownSeq(%i, alphabet = %s, character = %s)" \
01174                % (self._length, repr(self.alphabet), repr(self._character))
01175 
01176     def __add__(self, other):
01177         """Add another sequence or string to this sequence.
01178 
01179         Adding two UnknownSeq objects returns another UnknownSeq object
01180         provided the character is the same and the alphabets are compatible.
01181 
01182         >>> from Bio.Seq import UnknownSeq
01183         >>> from Bio.Alphabet import generic_protein
01184         >>> UnknownSeq(10, generic_protein) + UnknownSeq(5, generic_protein)
01185         UnknownSeq(15, alphabet = ProteinAlphabet(), character = 'X')
01186 
01187         If the characters differ, an UnknownSeq object cannot be used, so a
01188         Seq object is returned:
01189 
01190         >>> from Bio.Seq import UnknownSeq
01191         >>> from Bio.Alphabet import generic_protein
01192         >>> UnknownSeq(10, generic_protein) + UnknownSeq(5, generic_protein,
01193         ...                                              character="x")
01194         Seq('XXXXXXXXXXxxxxx', ProteinAlphabet())
01195 
01196         If adding a string to an UnknownSeq, a new Seq is returned with the
01197         same alphabet:
01198         
01199         >>> from Bio.Seq import UnknownSeq
01200         >>> from Bio.Alphabet import generic_protein
01201         >>> UnknownSeq(5, generic_protein) + "LV"
01202         Seq('XXXXXLV', ProteinAlphabet())
01203         """
01204         if isinstance(other, UnknownSeq) \
01205         and other._character == self._character:
01206             #TODO - Check the alphabets match
01207             return UnknownSeq(len(self)+len(other),
01208                               self.alphabet, self._character)
01209         #Offload to the base class...
01210         return Seq(str(self), self.alphabet) + other
01211 
01212     def __radd__(self, other):
01213         #If other is an UnknownSeq, then __add__ would be called.
01214         #Offload to the base class...
01215         return other + Seq(str(self), self.alphabet)
01216 
01217     def __getitem__(self, index):
01218         """Get a subsequence from the UnknownSeq object.
01219         
01220         >>> unk = UnknownSeq(8, character="N")
01221         >>> print unk[:]
01222         NNNNNNNN
01223         >>> print unk[5:3]
01224         <BLANKLINE>
01225         >>> print unk[1:-1]
01226         NNNNNN
01227         >>> print unk[1:-1:2]
01228         NNN
01229         """
01230         if isinstance(index, int):
01231             #TODO - Check the bounds without wasting memory
01232             return str(self)[index]
01233         old_length = self._length
01234         step = index.step
01235         if step is None or step == 1:
01236             #This calculates the length you'd get from ("N"*old_length)[index]
01237             start = index.start
01238             end = index.stop
01239             if start is None:
01240                 start = 0
01241             elif start < 0:
01242                 start = max(0, old_length + start)
01243             elif start > old_length:
01244                 start = old_length
01245             if end is None:
01246                 end = old_length
01247             elif end < 0:
01248                 end = max(0, old_length + end)
01249             elif end > old_length:
01250                 end = old_length
01251             new_length = max(0, end-start)
01252         elif step == 0:
01253             raise ValueError("slice step cannot be zero")
01254         else:
01255             #TODO - handle step efficiently
01256             new_length = len(("X"*old_length)[index])
01257         #assert new_length == len(("X"*old_length)[index]), \
01258         #       (index, start, end, step, old_length,
01259         #        new_length, len(("X"*old_length)[index]))
01260         return UnknownSeq(new_length, self.alphabet, self._character)
01261 
01262     def count(self, sub, start=0, end=sys.maxint):
01263         """Non-overlapping count method, like that of a python string.
01264 
01265         This behaves like the python string (and Seq object) method of the
01266         same name, which does a non-overlapping count!
01267 
01268         Returns an integer, the number of occurrences of substring
01269         argument sub in the (sub)sequence given by [start:end].
01270         Optional arguments start and end are interpreted as in slice
01271         notation.
01272     
01273         Arguments:
01274          - sub - a string or another Seq object to look for
01275          - start - optional integer, slice start
01276          - end - optional integer, slice end
01277 
01278         >>> "NNNN".count("N")
01279         4
01280         >>> Seq("NNNN").count("N")
01281         4
01282         >>> UnknownSeq(4, character="N").count("N")
01283         4
01284         >>> UnknownSeq(4, character="N").count("A")
01285         0
01286         >>> UnknownSeq(4, character="N").count("AA")
01287         0
01288 
01289         HOWEVER, please note because that python strings and Seq objects (and
01290         MutableSeq objects) do a non-overlapping search, this may not give
01291         the answer you expect:
01292 
01293         >>> UnknownSeq(4, character="N").count("NN")
01294         2
01295         >>> UnknownSeq(4, character="N").count("NNN")
01296         1
01297         """
01298         sub_str = self._get_seq_str_and_check_alphabet(sub)
01299         if len(sub_str) == 1:
01300             if str(sub_str) == self._character:
01301                 if start==0 and end >= self._length:
01302                     return self._length
01303                 else:
01304                     #This could be done more cleverly...
01305                     return str(self).count(sub_str, start, end)
01306             else:
01307                 return 0
01308         else:
01309             if set(sub_str) == set(self._character):
01310                 if start==0 and end >= self._length:
01311                     return self._length // len(sub_str)
01312                 else:
01313                     #This could be done more cleverly...
01314                     return str(self).count(sub_str, start, end)
01315             else:
01316                 return 0
01317 
01318     def complement(self):
01319         """The complement of an unknown nucleotide equals itself.
01320 
01321         >>> my_nuc = UnknownSeq(8)
01322         >>> my_nuc
01323         UnknownSeq(8, alphabet = Alphabet(), character = '?')
01324         >>> print my_nuc
01325         ????????
01326         >>> my_nuc.complement()
01327         UnknownSeq(8, alphabet = Alphabet(), character = '?')
01328         >>> print my_nuc.complement()
01329         ????????
01330         """
01331         if isinstance(Alphabet._get_base_alphabet(self.alphabet),
01332                       Alphabet.ProteinAlphabet):
01333             raise ValueError("Proteins do not have complements!")
01334         return self
01335 
01336     def reverse_complement(self):
01337         """The reverse complement of an unknown nucleotide equals itself.
01338 
01339         >>> my_nuc = UnknownSeq(10)
01340         >>> my_nuc
01341         UnknownSeq(10, alphabet = Alphabet(), character = '?')
01342         >>> print my_nuc
01343         ??????????
01344         >>> my_nuc.reverse_complement()
01345         UnknownSeq(10, alphabet = Alphabet(), character = '?')
01346         >>> print my_nuc.reverse_complement()
01347         ??????????
01348         """
01349         if isinstance(Alphabet._get_base_alphabet(self.alphabet),
01350                       Alphabet.ProteinAlphabet):
01351             raise ValueError("Proteins do not have complements!")
01352         return self
01353 
01354     def transcribe(self):
01355         """Returns unknown RNA sequence from an unknown DNA sequence.
01356 
01357         >>> my_dna = UnknownSeq(10, character="N")
01358         >>> my_dna
01359         UnknownSeq(10, alphabet = Alphabet(), character = 'N')
01360         >>> print my_dna
01361         NNNNNNNNNN
01362         >>> my_rna = my_dna.transcribe()
01363         >>> my_rna
01364         UnknownSeq(10, alphabet = RNAAlphabet(), character = 'N')
01365         >>> print my_rna
01366         NNNNNNNNNN
01367         """
01368         #Offload the alphabet stuff
01369         s = Seq(self._character, self.alphabet).transcribe()
01370         return UnknownSeq(self._length, s.alphabet, self._character)
01371 
01372     def back_transcribe(self):
01373         """Returns unknown DNA sequence from an unknown RNA sequence.
01374 
01375         >>> my_rna = UnknownSeq(20, character="N")
01376         >>> my_rna
01377         UnknownSeq(20, alphabet = Alphabet(), character = 'N')
01378         >>> print my_rna
01379         NNNNNNNNNNNNNNNNNNNN
01380         >>> my_dna = my_rna.back_transcribe()
01381         >>> my_dna
01382         UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N')
01383         >>> print my_dna
01384         NNNNNNNNNNNNNNNNNNNN
01385         """
01386         #Offload the alphabet stuff
01387         s = Seq(self._character, self.alphabet).back_transcribe()
01388         return UnknownSeq(self._length, s.alphabet, self._character)
01389 
01390     def upper(self):
01391         """Returns an upper case copy of the sequence.
01392 
01393         >>> from Bio.Alphabet import generic_dna
01394         >>> from Bio.Seq import UnknownSeq
01395         >>> my_seq = UnknownSeq(20, generic_dna, character="n")
01396         >>> my_seq
01397         UnknownSeq(20, alphabet = DNAAlphabet(), character = 'n')
01398         >>> print my_seq
01399         nnnnnnnnnnnnnnnnnnnn
01400         >>> my_seq.upper()
01401         UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N')
01402         >>> print my_seq.upper()
01403         NNNNNNNNNNNNNNNNNNNN
01404 
01405         This will adjust the alphabet if required. See also the lower method.
01406         """
01407         return UnknownSeq(self._length, self.alphabet._upper(), self._character.upper())
01408 
01409     def lower(self):
01410         """Returns a lower case copy of the sequence.
01411 
01412         This will adjust the alphabet if required:
01413 
01414         >>> from Bio.Alphabet import IUPAC
01415         >>> from Bio.Seq import UnknownSeq
01416         >>> my_seq = UnknownSeq(20, IUPAC.extended_protein)
01417         >>> my_seq
01418         UnknownSeq(20, alphabet = ExtendedIUPACProtein(), character = 'X')
01419         >>> print my_seq
01420         XXXXXXXXXXXXXXXXXXXX
01421         >>> my_seq.lower()
01422         UnknownSeq(20, alphabet = ProteinAlphabet(), character = 'x')
01423         >>> print my_seq.lower()
01424         xxxxxxxxxxxxxxxxxxxx
01425 
01426         See also the upper method.
01427         """
01428         return UnknownSeq(self._length, self.alphabet._lower(), self._character.lower())
01429 
01430     def translate(self, **kwargs):
01431         """Translate an unknown nucleotide sequence into an unknown protein.
01432 
01433         e.g.
01434 
01435         >>> my_seq = UnknownSeq(11, character="N")
01436         >>> print my_seq
01437         NNNNNNNNNNN
01438         >>> my_protein = my_seq.translate()
01439         >>> my_protein
01440         UnknownSeq(3, alphabet = ProteinAlphabet(), character = 'X')
01441         >>> print my_protein
01442         XXX
01443 
01444         In comparison, using a normal Seq object:
01445 
01446         >>> my_seq = Seq("NNNNNNNNNNN")
01447         >>> print my_seq
01448         NNNNNNNNNNN
01449         >>> my_protein = my_seq.translate()
01450         >>> my_protein
01451         Seq('XXX', ExtendedIUPACProtein())
01452         >>> print my_protein
01453         XXX
01454 
01455         """
01456         if isinstance(Alphabet._get_base_alphabet(self.alphabet),
01457                       Alphabet.ProteinAlphabet):
01458             raise ValueError("Proteins cannot be translated!")
01459         return UnknownSeq(self._length//3, Alphabet.generic_protein, "X")
01460 
01461     def ungap(self, gap=None):
01462         """Return a copy of the sequence without the gap character(s).
01463 
01464         The gap character can be specified in two ways - either as an explicit
01465         argument, or via the sequence's alphabet. For example:
01466 
01467         >>> from Bio.Seq import UnknownSeq
01468         >>> from Bio.Alphabet import Gapped, generic_dna
01469         >>> my_dna = UnknownSeq(20, Gapped(generic_dna,"-"))
01470         >>> my_dna
01471         UnknownSeq(20, alphabet = Gapped(DNAAlphabet(), '-'), character = 'N')
01472         >>> my_dna.ungap()
01473         UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N')
01474         >>> my_dna.ungap("-")
01475         UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N')
01476 
01477         If the UnknownSeq is using the gap character, then an empty Seq is
01478         returned:
01479 
01480         >>> my_gap = UnknownSeq(20, Gapped(generic_dna,"-"), character="-")
01481         >>> my_gap
01482         UnknownSeq(20, alphabet = Gapped(DNAAlphabet(), '-'), character = '-')
01483         >>> my_gap.ungap()
01484         Seq('', DNAAlphabet())
01485         >>> my_gap.ungap("-")
01486         Seq('', DNAAlphabet())
01487 
01488         Notice that the returned sequence's alphabet is adjusted to remove any
01489         explicit gap character declaration.
01490         """
01491         #Offload the alphabet stuff
01492         s = Seq(self._character, self.alphabet).ungap()
01493         if s :
01494             return UnknownSeq(self._length, s.alphabet, self._character)
01495         else :
01496             return Seq("", s.alphabet)
01497 
01498 class MutableSeq(object):
01499     """An editable sequence object (with an alphabet).
01500 
01501     Unlike normal python strings and our basic sequence object (the Seq class)
01502     which are immuatable, the MutableSeq lets you edit the sequence in place.
01503     However, this means you cannot use a MutableSeq object as a dictionary key.
01504 
01505     >>> from Bio.Seq import MutableSeq
01506     >>> from Bio.Alphabet import generic_dna
01507     >>> my_seq = MutableSeq("ACTCGTCGTCG", generic_dna)
01508     >>> my_seq
01509     MutableSeq('ACTCGTCGTCG', DNAAlphabet())
01510     >>> my_seq[5]
01511     'T'
01512     >>> my_seq[5] = "A"
01513     >>> my_seq
01514     MutableSeq('ACTCGACGTCG', DNAAlphabet())
01515     >>> my_seq[5]
01516     'A'
01517     >>> my_seq[5:8] = "NNN"
01518     >>> my_seq
01519     MutableSeq('ACTCGNNNTCG', DNAAlphabet())
01520     >>> len(my_seq)
01521     11
01522 
01523     Note that the MutableSeq object does not support as many string-like
01524     or biological methods as the Seq object.
01525     """
01526     def __init__(self, data, alphabet = Alphabet.generic_alphabet):
01527         if sys.version_info[0] == 3:
01528             self.array_indicator = "u"
01529         else:
01530             self.array_indicator = "c"
01531         if isinstance(data, str): #TODO - What about unicode?
01532             self.data = array.array(self.array_indicator, data)
01533         else:
01534             self.data = data   # assumes the input is an array
01535         self.alphabet = alphabet
01536     
01537     def __repr__(self):
01538         """Returns a (truncated) representation of the sequence for debugging."""
01539         if len(self) > 60:
01540             #Shows the last three letters as it is often useful to see if there
01541             #is a stop codon at the end of a sequence.
01542             #Note total length is 54+3+3=60
01543             return "%s('%s...%s', %s)" % (self.__class__.__name__,
01544                                    str(self[:54]), str(self[-3:]),
01545                                    repr(self.alphabet))
01546         else:
01547             return "%s('%s', %s)" % (self.__class__.__name__,
01548                                    str(self),
01549                                    repr(self.alphabet))
01550 
01551     def __str__(self):
01552         """Returns the full sequence as a python string.
01553 
01554         Note that Biopython 1.44 and earlier would give a truncated
01555         version of repr(my_seq) for str(my_seq).  If you are writing code
01556         which needs to be backwards compatible with old Biopython, you
01557         should continue to use my_seq.tostring() rather than str(my_seq).
01558         """
01559         #See test_GAQueens.py for an historic usage of a non-string alphabet!
01560         return "".join(self.data)
01561 
01562     def __cmp__(self, other):
01563         """Compare the sequence to another sequence or a string (README).
01564 
01565         Currently if compared to another sequence the alphabets must be
01566         compatible. Comparing DNA to RNA, or Nucleotide to Protein will raise
01567         an exception. Otherwise only the sequence itself is compared, not the
01568         precise alphabet.
01569 
01570         A future release of Biopython will change this (and the Seq object etc)
01571         to use simple string comparison. The plan is that comparing sequences
01572         with incompatible alphabets (e.g. DNA to RNA) will trigger a warning
01573         but not an exception.
01574 
01575         During this transition period, please just do explicit comparisons:
01576 
01577         >>> seq1 = MutableSeq("ACGT")
01578         >>> seq2 = MutableSeq("ACGT")
01579         >>> id(seq1) == id(seq2)
01580         False
01581         >>> str(seq1) == str(seq2)
01582         True
01583 
01584         This method indirectly supports ==, < , etc.
01585         """
01586         if hasattr(other, "alphabet"):
01587             #other should be a Seq or a MutableSeq
01588             import warnings
01589             warnings.warn("In future comparing incompatible alphabets will "
01590                           "only trigger a warning (not an exception). In " 
01591                           "the interim please use id(seq1)==id(seq2) or "
01592                           "str(seq1)==str(seq2) to make your code explicit "
01593                           "and to avoid this warning.", FutureWarning)
01594             if not Alphabet._check_type_compatible([self.alphabet,
01595                                                     other.alphabet]):
01596                 raise TypeError("Incompatable alphabets %s and %s" \
01597                                 % (repr(self.alphabet), repr(other.alphabet)))
01598             #They should be the same sequence type (or one of them is generic)
01599             if isinstance(other, MutableSeq):
01600                 #See test_GAQueens.py for an historic usage of a non-string
01601                 #alphabet!  Comparing the arrays supports this.
01602                 return cmp(self.data, other.data)
01603             else:
01604                 return cmp(str(self), str(other))
01605         elif isinstance(other, basestring):
01606             return cmp(str(self), other)
01607         else:
01608             raise TypeError
01609 
01610     def __len__(self): return len(self.data)
01611 
01612     def __getitem__(self, index):
01613         #Note since Python 2.0, __getslice__ is deprecated
01614         #and __getitem__ is used instead.
01615         #See http://docs.python.org/ref/sequence-methods.html
01616         if isinstance(index, int):
01617             #Return a single letter as a string
01618             return self.data[index]
01619         else:
01620             #Return the (sub)sequence as another Seq object
01621             return MutableSeq(self.data[index], self.alphabet)
01622 
01623     def __setitem__(self, index, value):
01624         #Note since Python 2.0, __setslice__ is deprecated
01625         #and __setitem__ is used instead.
01626         #See http://docs.python.org/ref/sequence-methods.html
01627         if isinstance(index, int):
01628             #Replacing a single letter with a new string
01629             self.data[index] = value
01630         else:
01631             #Replacing a sub-sequence
01632             if isinstance(value, MutableSeq):
01633                 self.data[index] = value.data
01634             elif isinstance(value, type(self.data)):
01635                 self.data[index] = value
01636             else:
01637                 self.data[index] = array.array(self.array_indicator,
01638                                                str(value))
01639 
01640     def __delitem__(self, index):
01641         #Note since Python 2.0, __delslice__ is deprecated
01642         #and __delitem__ is used instead.
01643         #See http://docs.python.org/ref/sequence-methods.html
01644         
01645         #Could be deleting a single letter, or a slice
01646         del self.data[index]
01647     
01648     def __add__(self, other):
01649         """Add another sequence or string to this sequence.
01650 
01651         Returns a new MutableSeq object."""
01652         if hasattr(other, "alphabet"):
01653             #other should be a Seq or a MutableSeq
01654             if not Alphabet._check_type_compatible([self.alphabet,
01655                                                     other.alphabet]):
01656                 raise TypeError("Incompatable alphabets %s and %s" \
01657                                 % (repr(self.alphabet), repr(other.alphabet)))
01658             #They should be the same sequence type (or one of them is generic)
01659             a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet])
01660             if isinstance(other, MutableSeq):
01661                 #See test_GAQueens.py for an historic usage of a non-string
01662                 #alphabet!  Adding the arrays should support this.
01663                 return self.__class__(self.data + other.data, a)
01664             else:
01665                 return self.__class__(str(self) + str(other), a)
01666         elif isinstance(other, basestring):
01667             #other is a plain string - use the current alphabet
01668             return self.__class__(str(self) + str(other), self.alphabet)
01669         else:
01670             raise TypeError
01671 
01672     def __radd__(self, other):
01673         if hasattr(other, "alphabet"):
01674             #other should be a Seq or a MutableSeq
01675             if not Alphabet._check_type_compatible([self.alphabet,
01676                                                     other.alphabet]):
01677                 raise TypeError("Incompatable alphabets %s and %s" \
01678                                 % (repr(self.alphabet), repr(other.alphabet)))
01679             #They should be the same sequence type (or one of them is generic)
01680             a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet])
01681             if isinstance(other, MutableSeq):
01682                 #See test_GAQueens.py for an historic usage of a non-string
01683                 #alphabet!  Adding the arrays should support this.
01684                 return self.__class__(other.data + self.data, a)
01685             else:
01686                 return self.__class__(str(other) + str(self), a)
01687         elif isinstance(other, basestring):
01688             #other is a plain string - use the current alphabet
01689             return self.__class__(str(other) + str(self), self.alphabet)
01690         else:
01691             raise TypeError
01692 
01693     def append(self, c):
01694         self.data.append(c)
01695 
01696     def insert(self, i, c):
01697         self.data.insert(i, c)
01698 
01699     def pop(self, i = (-1)):
01700         c = self.data[i]
01701         del self.data[i]
01702         return c
01703 
01704     def remove(self, item):
01705         for i in range(len(self.data)):
01706             if self.data[i] == item:
01707                 del self.data[i]
01708                 return
01709         raise ValueError("MutableSeq.remove(x): x not in list")
01710 
01711     def count(self, sub, start=0, end=sys.maxint):
01712         """Non-overlapping count method, like that of a python string.
01713 
01714         This behaves like the python string method of the same name,
01715         which does a non-overlapping count!
01716 
01717         Returns an integer, the number of occurrences of substring
01718         argument sub in the (sub)sequence given by [start:end].
01719         Optional arguments start and end are interpreted as in slice
01720         notation.
01721     
01722         Arguments:
01723          - sub - a string or another Seq object to look for
01724          - start - optional integer, slice start
01725          - end - optional integer, slice end
01726 
01727         e.g.
01728         
01729         >>> from Bio.Seq import MutableSeq
01730         >>> my_mseq = MutableSeq("AAAATGA")
01731         >>> print my_mseq.count("A")
01732         5
01733         >>> print my_mseq.count("ATG")
01734         1
01735         >>> print my_mseq.count(Seq("AT"))
01736         1
01737         >>> print my_mseq.count("AT", 2, -1)
01738         1
01739         
01740         HOWEVER, please note because that python strings, Seq objects and
01741         MutableSeq objects do a non-overlapping search, this may not give
01742         the answer you expect:
01743 
01744         >>> "AAAA".count("AA")
01745         2
01746         >>> print MutableSeq("AAAA").count("AA")
01747         2
01748 
01749         A non-overlapping search would give the answer as three!
01750         """
01751         try:
01752             #TODO - Should we check the alphabet?
01753             search = sub.tostring()
01754         except AttributeError:
01755             search = sub
01756 
01757         if not isinstance(search, basestring):
01758             raise TypeError("expected a string, Seq or MutableSeq")
01759 
01760         if len(search) == 1:
01761             #Try and be efficient and work directly from the array.
01762             count = 0
01763             for c in self.data[start:end]:
01764                 if c == search: count += 1
01765             return count
01766         else:
01767             #TODO - Can we do this more efficiently?
01768             return self.tostring().count(search, start, end)
01769 
01770     def index(self, item):
01771         for i in range(len(self.data)):
01772             if self.data[i] == item:
01773                 return i
01774         raise ValueError("MutableSeq.index(x): x not in list")
01775 
01776     def reverse(self):
01777         """Modify the mutable sequence to reverse itself.
01778 
01779         No return value.
01780         """
01781         self.data.reverse()
01782 
01783     def complement(self):
01784         """Modify the mutable sequence to take on its complement.
01785 
01786         Trying to complement a protein sequence raises an exception.
01787 
01788         No return value.
01789         """
01790         if isinstance(Alphabet._get_base_alphabet(self.alphabet),
01791                       Alphabet.ProteinAlphabet):
01792             raise ValueError("Proteins do not have complements!")
01793         if self.alphabet in (IUPAC.ambiguous_dna, IUPAC.unambiguous_dna):
01794             d = ambiguous_dna_complement
01795         elif self.alphabet in (IUPAC.ambiguous_rna, IUPAC.unambiguous_rna):
01796             d = ambiguous_rna_complement
01797         elif 'U' in self.data and 'T' in self.data:
01798             #TODO - Handle this cleanly?
01799             raise ValueError("Mixed RNA/DNA found")
01800         elif 'U' in self.data:
01801             d = ambiguous_rna_complement
01802         else:
01803             d = ambiguous_dna_complement
01804         c = dict([(x.lower(), y.lower()) for x,y in d.iteritems()])
01805         d.update(c)
01806         self.data = map(lambda c: d[c], self.data)
01807         self.data = array.array(self.array_indicator, self.data)
01808         
01809     def reverse_complement(self):
01810         """Modify the mutable sequence to take on its reverse complement.
01811 
01812         Trying to reverse complement a protein sequence raises an exception.
01813 
01814         No return value.
01815         """
01816         self.complement()
01817         self.data.reverse()
01818 
01819     ## Sorting a sequence makes no sense.
01820     # def sort(self, *args): self.data.sort(*args)
01821     
01822     def extend(self, other):
01823         if isinstance(other, MutableSeq):
01824             for c in other.data:
01825                 self.data.append(c)
01826         else:
01827             for c in other:
01828                 self.data.append(c)
01829 
01830     def tostring(self):
01831         """Returns the full sequence as a python string (semi-obsolete).
01832 
01833         Although not formally deprecated, you are now encouraged to use
01834         str(my_seq) instead of my_seq.tostring().
01835 
01836         Because str(my_seq) will give you the full sequence as a python string,
01837         there is often no need to make an explicit conversion.  For example,
01838         
01839         print "ID={%s}, sequence={%s}" % (my_name, my_seq)
01840 
01841         On Biopython 1.44 or older you would have to have done this:
01842 
01843         print "ID={%s}, sequence={%s}" % (my_name, my_seq.tostring())
01844         """
01845         return "".join(self.data)
01846 
01847     def toseq(self):
01848         """Returns the full sequence as a new immutable Seq object.
01849 
01850         >>> from Bio.Seq import Seq
01851         >>> from Bio.Alphabet import IUPAC
01852         >>> my_mseq = MutableSeq("MKQHKAMIVALIVICITAVVAAL", 
01853         ...                      IUPAC.protein)
01854         >>> my_mseq
01855         MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
01856         >>> my_mseq.toseq()
01857         Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
01858 
01859         Note that the alphabet is preserved.
01860         """
01861         return Seq("".join(self.data), self.alphabet)
01862 
01863 # The transcribe, backward_transcribe, and translate functions are
01864 # user-friendly versions of the corresponding functions in Bio.Transcribe
01865 # and Bio.Translate. The functions work both on Seq objects, and on strings.
01866 
01867 def transcribe(dna):
01868     """Transcribes a DNA sequence into RNA.
01869 
01870     If given a string, returns a new string object.
01871 
01872     Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet.
01873 
01874     Trying to transcribe a protein or RNA sequence raises an exception.
01875 
01876     e.g.
01877     
01878     >>> transcribe("ACTGN")
01879     'ACUGN'
01880     """
01881     if isinstance(dna, Seq):
01882         return dna.transcribe()
01883     elif isinstance(dna, MutableSeq):
01884         return dna.toseq().transcribe()
01885     else:
01886         return dna.replace('T','U').replace('t','u')
01887 
01888 def back_transcribe(rna):
01889     """Back-transcribes an RNA sequence into DNA.
01890 
01891     If given a string, returns a new string object.
01892     
01893     Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet.
01894 
01895     Trying to transcribe a protein or DNA sequence raises an exception.
01896 
01897     e.g.
01898 
01899     >>> back_transcribe("ACUGN")
01900     'ACTGN'
01901     """
01902     if isinstance(rna, Seq):
01903         return rna.back_transcribe()
01904     elif isinstance(rna, MutableSeq):
01905         return rna.toseq().back_transcribe()
01906     else:
01907         return rna.replace('U','T').replace('u','t')
01908     
01909 def _translate_str(sequence, table, stop_symbol="*", to_stop=False,
01910                    cds=False, pos_stop="X"):
01911     """Helper function to translate a nucleotide string (PRIVATE).
01912 
01913     Arguments:
01914      - sequence    - a string
01915      - table       - a CodonTable object (NOT a table name or id number)
01916      - stop_symbol - a single character string, what to use for terminators.
01917      - to_stop     - boolean, should translation terminate at the first
01918                      in frame stop codon?  If there is no in-frame stop codon
01919                      then translation continues to the end.
01920      - pos_stop    - a single character string for a possible stop codon
01921                      (e.g. TAN or NNN)
01922      - cds - Boolean, indicates this is a complete CDS.  If True, this
01923              checks the sequence starts with a valid alternative start
01924              codon (which will be translated as methionine, M), that the
01925              sequence length is a multiple of three, and that there is a
01926              single in frame stop codon at the end (this will be excluded
01927              from the protein sequence, regardless of the to_stop option).
01928              If these tests fail, an exception is raised.
01929 
01930     Returns a string.
01931 
01932     e.g.
01933 
01934     >>> from Bio.Data import CodonTable
01935     >>> table = CodonTable.ambiguous_dna_by_id[1]
01936     >>> _translate_str("AAA", table)
01937     'K'
01938     >>> _translate_str("TAR", table)
01939     '*'
01940     >>> _translate_str("TAN", table)
01941     'X'
01942     >>> _translate_str("TAN", table, pos_stop="@")
01943     '@'
01944     >>> _translate_str("TA?", table)
01945     Traceback (most recent call last):
01946        ...
01947     TranslationError: Codon 'TA?' is invalid
01948     >>> _translate_str("ATGCCCTAG", table, cds=True)
01949     'MP'
01950     >>> _translate_str("AAACCCTAG", table, cds=True)
01951     Traceback (most recent call last):
01952        ...
01953     TranslationError: First codon 'AAA' is not a start codon
01954     >>> _translate_str("ATGCCCTAGCCCTAG", table, cds=True)
01955     Traceback (most recent call last):
01956        ...
01957     TranslationError: Extra in frame stop codon found.
01958     """
01959     sequence = sequence.upper()
01960     amino_acids = []
01961     forward_table = table.forward_table
01962     stop_codons = table.stop_codons
01963     if table.nucleotide_alphabet.letters is not None:
01964         valid_letters = set(table.nucleotide_alphabet.letters.upper())
01965     else:
01966         #Assume the worst case, ambiguous DNA or RNA:
01967         valid_letters = set(IUPAC.ambiguous_dna.letters.upper() + \
01968                             IUPAC.ambiguous_rna.letters.upper())
01969     if cds:
01970         if str(sequence[:3]).upper() not in table.start_codons:
01971             raise CodonTable.TranslationError(\
01972                 "First codon '%s' is not a start codon" % sequence[:3])
01973         if len(sequence) % 3 != 0:
01974             raise CodonTable.TranslationError(\
01975                 "Sequence length %i is not a multiple of three" % len(sequence))
01976         if str(sequence[-3:]).upper() not in stop_codons:
01977             raise CodonTable.TranslationError(\
01978                 "Final codon '%s' is not a stop codon" % sequence[-3:])
01979         #Don't translate the stop symbol, and manually translate the M
01980         sequence = sequence[3:-3]
01981         amino_acids = ["M"]
01982     n = len(sequence)
01983     for i in xrange(0,n-n%3,3):
01984         codon = sequence[i:i+3]
01985         try:
01986             amino_acids.append(forward_table[codon])
01987         except (KeyError, CodonTable.TranslationError):
01988             #Todo? Treat "---" as a special case (gapped translation)
01989             if codon in table.stop_codons:
01990                 if cds:
01991                     raise CodonTable.TranslationError(\
01992                         "Extra in frame stop codon found.")
01993                 if to_stop : break
01994                 amino_acids.append(stop_symbol)
01995             elif valid_letters.issuperset(set(codon)):
01996                 #Possible stop codon (e.g. NNN or TAN)
01997                 amino_acids.append(pos_stop)
01998             else:
01999                 raise CodonTable.TranslationError(\
02000                     "Codon '%s' is invalid" % codon)
02001     return "".join(amino_acids)
02002 
02003 def translate(sequence, table="Standard", stop_symbol="*", to_stop=False,
02004               cds=False):
02005     """Translate a nucleotide sequence into amino acids.
02006 
02007     If given a string, returns a new string object. Given a Seq or
02008     MutableSeq, returns a Seq object with a protein alphabet.
02009 
02010     Arguments:
02011      - table - Which codon table to use?  This can be either a name (string),
02012                an NCBI identifier (integer), or a CodonTable object (useful
02013                for non-standard genetic codes).  Defaults to the "Standard"
02014                table.
02015      - stop_symbol - Single character string, what to use for any
02016                      terminators, defaults to the asterisk, "*".
02017      - to_stop - Boolean, defaults to False meaning do a full
02018                  translation continuing on past any stop codons
02019                  (translated as the specified stop_symbol).  If
02020                  True, translation is terminated at the first in
02021                  frame stop codon (and the stop_symbol is not
02022                  appended to the returned protein sequence).
02023      - cds - Boolean, indicates this is a complete CDS.  If True, this
02024                  checks the sequence starts with a valid alternative start
02025                  codon (which will be translated as methionine, M), that the
02026                  sequence length is a multiple of three, and that there is a
02027                  single in frame stop codon at the end (this will be excluded
02028                  from the protein sequence, regardless of the to_stop option).
02029                  If these tests fail, an exception is raised.
02030     
02031     A simple string example using the default (standard) genetic code:
02032     
02033     >>> coding_dna = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG"
02034     >>> translate(coding_dna)
02035     'VAIVMGR*KGAR*'
02036     >>> translate(coding_dna, stop_symbol="@")
02037     'VAIVMGR@KGAR@'
02038     >>> translate(coding_dna, to_stop=True)
02039     'VAIVMGR'
02040      
02041     Now using NCBI table 2, where TGA is not a stop codon:
02042 
02043     >>> translate(coding_dna, table=2)
02044     'VAIVMGRWKGAR*'
02045     >>> translate(coding_dna, table=2, to_stop=True)
02046     'VAIVMGRWKGAR'
02047 
02048     In fact this example uses an alternative start codon valid under NCBI table 2,
02049     GTG, which means this example is a complete valid CDS which when translated
02050     should really start with methionine (not valine):
02051     
02052     >>> translate(coding_dna, table=2, cds=True)
02053     'MAIVMGRWKGAR'
02054 
02055     Note that if the sequence has no in-frame stop codon, then the to_stop
02056     argument has no effect:
02057 
02058     >>> coding_dna2 = "GTGGCCATTGTAATGGGCCGC"
02059     >>> translate(coding_dna2)
02060     'VAIVMGR'
02061     >>> translate(coding_dna2, to_stop=True)
02062     'VAIVMGR'
02063     
02064     NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
02065     or a stop codon.  These are translated as "X".  Any invalid codon
02066     (e.g. "TA?" or "T-A") will throw a TranslationError.
02067 
02068     NOTE - Does NOT support gapped sequences.
02069     
02070     It will however translate either DNA or RNA.
02071     """
02072     if isinstance(sequence, Seq):
02073         return sequence.translate(table, stop_symbol, to_stop, cds)
02074     elif isinstance(sequence, MutableSeq):
02075         #Return a Seq object
02076         return sequence.toseq().translate(table, stop_symbol, to_stop, cds)
02077     else:
02078         #Assume its a string, return a string
02079         try:
02080             codon_table = CodonTable.ambiguous_generic_by_id[int(table)]
02081         except ValueError:
02082             codon_table = CodonTable.ambiguous_generic_by_name[table]
02083         except (AttributeError, TypeError):
02084             if isinstance(table, CodonTable.CodonTable):
02085                 codon_table = table
02086             else:
02087                 raise ValueError('Bad table argument')
02088         return _translate_str(sequence, codon_table, stop_symbol, to_stop, cds)
02089       
02090 def reverse_complement(sequence):
02091     """Returns the reverse complement sequence of a nucleotide string.
02092 
02093     If given a string, returns a new string object.
02094     Given a Seq or a MutableSeq, returns a new Seq object with the same alphabet.
02095 
02096     Supports unambiguous and ambiguous nucleotide sequences.
02097 
02098     e.g.
02099 
02100     >>> reverse_complement("ACTG-NH")
02101     'DN-CAGT'
02102     """
02103     if isinstance(sequence, Seq):
02104         #Return a Seq
02105         return sequence.reverse_complement()
02106     elif isinstance(sequence, MutableSeq):
02107         #Return a Seq
02108         #Don't use the MutableSeq reverse_complement method as it is 'in place'.
02109         return sequence.toseq().reverse_complement()
02110 
02111     #Assume its a string.
02112     #In order to avoid some code duplication, the old code would turn the string
02113     #into a Seq, use the reverse_complement method, and convert back to a string.
02114     #This worked, but is over five times slower on short sequences!
02115     if ('U' in sequence or 'u' in sequence) \
02116     and ('T' in sequence or 't' in sequence):
02117         raise ValueError("Mixed RNA/DNA found")
02118     elif 'U' in sequence or 'u' in sequence:
02119         ttable = _rna_complement_table
02120     else:
02121         ttable = _dna_complement_table
02122     return sequence.translate(ttable)[::-1]
02123 
02124 def _test():
02125     """Run the Bio.Seq module's doctests (PRIVATE)."""
02126     if sys.version_info[0:2] == (3,1):
02127         print "Not running Bio.Seq doctest on Python 3.1"
02128         print "See http://bugs.python.org/issue7490"
02129     else:
02130         print "Runing doctests..."
02131         import doctest
02132         doctest.testmod(optionflags=doctest.IGNORE_EXCEPTION_DETAIL)
02133         print "Done"
02134 
02135 if __name__ == "__main__":
02136     _test()