Back to index

python-biopython  1.60
__init__.py
Go to the documentation of this file.
00001 # Copyright 2000-2002 by Andrew Dalke.
00002 # Revisions copyright 2007-2010 by Peter Cock.
00003 # All rights reserved.
00004 # This code is part of the Biopython distribution and governed by its
00005 # license.  Please see the LICENSE file that should have been included
00006 # as part of this package.
00007 
00008 """Alphabets used in Seq objects etc to declare sequence type and letters.
00009 
00010 This is used by sequences which contain a finite number of similar words.
00011 """
00012 
class Alphabet(object):
    """Base class for all alphabets.

    The base class makes no claim about word size or permitted letters;
    subclasses narrow the sequence type through the class hierarchy and
    may override ``size`` and ``letters``.
    """

    size = None     # default to no fixed size for words
    letters = None  # default to no fixed alphabet
                    # In general, a list-like object. However,
                    # assuming letters are single characters, use a
                    # string. This is expected for use with Seq like
                    # objects.

    def __repr__(self):
        """Return the class name followed by ``()``, e.g. ``DNAAlphabet()``."""
        return self.__class__.__name__ + "()"

    def contains(self, other):
        """Does this alphabet 'contain' the other (OBSOLETE?).

        Returns a boolean.  This relies on the Alphabet subclassing
        hierarchy only, and does not check the letters property.
        This isn't ideal, and doesn't seem to work as intended
        with the AlphabetEncoder classes."""
        return isinstance(other, self.__class__)

    def _case_less(self):
        """Return a case-less variant of the current alphabet (PRIVATE).

        The result is the generic singleton for the most specific known
        sequence type of this alphabet.  DNA and RNA must be tested before
        the generic nucleotide class because both are subclasses of it.
        """
        #TODO - remove this method by dealing with things in subclasses?
        if isinstance(self, ProteinAlphabet):
            return generic_protein
        elif isinstance(self, DNAAlphabet):
            return generic_dna
        elif isinstance(self, RNAAlphabet):
            # This branch previously tested NucleotideAlphabet, which sent
            # every non-DNA nucleotide alphabet to generic_rna and left the
            # generic_nucleotide branch below unreachable.
            return generic_rna
        elif isinstance(self, NucleotideAlphabet):
            return generic_nucleotide
        elif isinstance(self, SingleLetterAlphabet):
            return single_letter_alphabet
        else:
            return generic_alphabet

    def _upper(self):
        """Return an upper case variant of the current alphabet (PRIVATE).

        Returns ``self`` when there are no letters or they are already upper
        case; otherwise falls back to the generic case-less alphabet.
        """
        if not self.letters or self.letters == self.letters.upper():
            #Easy case, no letters or already upper case!
            return self
        else:
            #TODO - Raise NotImplementedError and handle via subclass?
            return self._case_less()

    def _lower(self):
        """Return a lower case variant of the current alphabet (PRIVATE).

        Returns ``self`` when there are no letters or they are already lower
        case; otherwise falls back to the generic case-less alphabet.
        """
        if not self.letters or self.letters == self.letters.lower():
            #Easy case, no letters or already lower case!
            return self
        else:
            #TODO - Raise NotImplementedError and handle via subclass?
            return self._case_less()

generic_alphabet = Alphabet()

class SingleLetterAlphabet(Alphabet):
    """Alphabet whose words are exactly one character long."""

    size = 1
    letters = None   # string of all letters in the alphabet

single_letter_alphabet = SingleLetterAlphabet()

########### Protein

class ProteinAlphabet(SingleLetterAlphabet):
    """Single-letter alphabet marking a sequence as protein."""

generic_protein = ProteinAlphabet()

########### Nucleotide / DNA

class NucleotideAlphabet(SingleLetterAlphabet):
    """Single-letter alphabet marking a sequence as nucleotide (DNA or RNA)."""

generic_nucleotide = NucleotideAlphabet()

class DNAAlphabet(NucleotideAlphabet):
    """Nucleotide alphabet marking a sequence as DNA."""

generic_dna = DNAAlphabet()


########### RNA

class RNAAlphabet(NucleotideAlphabet):
    """Nucleotide alphabet marking a sequence as RNA."""

generic_rna = RNAAlphabet()
00100 
00101 
00102 
00103 ########### Other per-sequence encodings
00104 
class SecondaryStructure(SingleLetterAlphabet):
    """Single-letter alphabet restricted to the four letters H, S, T and C."""

    # NOTE(review): presumably helix / strand / turn / coil codes; the
    # meaning of each letter is not stated in this file.
    letters = "HSTC"
00107 
class ThreeLetterProtein(Alphabet):
    """Alphabet whose words are three-character amino acid codes.

    ``letters`` is a list (not a string) because each word is longer than
    one character.
    """

    size = 3
    letters = [
        "Ala", "Asx", "Cys", "Asp", "Glu", "Phe",
        "Gly", "His", "Ile", "Lys", "Leu", "Met",
        "Asn", "Pro", "Gln", "Arg", "Ser", "Thr",
        "Sec", "Val", "Trp", "Xaa", "Tyr", "Glx",
    ]
00115         
00116 ###### Non per-sequence modifications
00117 
00118 # (These are Decorator classes)
00119 
class AlphabetEncoder(object):
    """Decorator that wraps an alphabet and adds extra letters to it.

    Attributes:
     - alphabet    - the wrapped alphabet (or another encoder).
     - new_letters - the letters added by this encoder.
     - letters     - the wrapped alphabet's letters followed by
                     ``new_letters``, or None if the wrapped alphabet
                     defines no letters.

    Any other (non-dunder) attribute that is not found on the encoder
    itself is looked up on the wrapped alphabet.
    """

    def __init__(self, alphabet, new_letters):
        """Wrap ``alphabet`` and extend its letters with ``new_letters``."""
        self.alphabet = alphabet
        self.new_letters = new_letters
        if alphabet.letters is not None:
            self.letters = alphabet.letters + new_letters
        else:
            self.letters = None

    def __getattr__(self, key):
        """Delegate missing attributes to the wrapped alphabet.

        Only invoked when normal attribute lookup fails.  Raises
        AttributeError for special (``__x__``) names, and for ``alphabet``
        itself when it has not been set yet.
        """
        if key[:2] == "__" and key[-2:] == "__":
            raise AttributeError(key)
        if key == "alphabet":
            # Reaching here means self.alphabet is not set (e.g. an instance
            # created without running __init__).  Delegating would look up
            # self.alphabet again and recurse without bound.
            raise AttributeError(key)
        return getattr(self.alphabet, key)

    def __repr__(self):
        """Return e.g. ``AlphabetEncoder(ProteinAlphabet(), 'X')``."""
        return "%s(%r, %r)" % (self.__class__.__name__, self.alphabet,
                               self.new_letters)

    def contains(self, other):
        """Does this alphabet 'contain' the other (OBSOLETE?).

        This isn't implemented for the base AlphabetEncoder,
        which will always return False."""
        return False

    def _upper(self):
        """Return an upper case variant of the current alphabet (PRIVATE)."""
        return AlphabetEncoder(self.alphabet._upper(), self.new_letters.upper())

    def _lower(self):
        """Return a lower case variant of the current alphabet (PRIVATE)."""
        return AlphabetEncoder(self.alphabet._lower(), self.new_letters.lower())
00151 
00152     
class Gapped(AlphabetEncoder):
    """Encoder that adds a single gap character to the wrapped alphabet."""

    def __init__(self, alphabet, gap_char="-"):
        """Wrap ``alphabet`` and register ``gap_char`` as its gap letter."""
        super(Gapped, self).__init__(alphabet, gap_char)
        self.gap_char = gap_char

    def contains(self, other):
        """Does this alphabet 'contain' the other (OBSOLETE?).

        Returns a boolean.  The gap characters must match and the wrapped
        alphabet must contain the other's wrapped alphabet (which relies on
        the Alphabet subclassing hierarchy).  This fails with an
        AttributeError if the other alphabet has no gap character!
        """
        if other.gap_char != self.gap_char:
            return False
        return self.alphabet.contains(other.alphabet)

    def _upper(self):
        """Return an upper case variant of the current alphabet (PRIVATE)."""
        wrapped = self.alphabet._upper()
        return Gapped(wrapped, self.gap_char.upper())

    def _lower(self):
        """Return a lower case variant of the current alphabet (PRIVATE)."""
        wrapped = self.alphabet._lower()
        return Gapped(wrapped, self.gap_char.lower())
00175 
00176             
class HasStopCodon(AlphabetEncoder):
    """Encoder that adds a single stop symbol to the wrapped alphabet."""

    def __init__(self, alphabet, stop_symbol="*"):
        """Wrap ``alphabet`` and register ``stop_symbol`` as its stop letter."""
        super(HasStopCodon, self).__init__(alphabet, stop_symbol)
        self.stop_symbol = stop_symbol

    def __cmp__(self, other):
        """Order by wrapped alphabet first, then by stop symbol.

        NOTE: ``__cmp__`` and the ``cmp`` builtin are Python 2 features;
        Python 3 never calls this hook.
        """
        by_alphabet = cmp(self.alphabet, other.alphabet)
        if by_alphabet != 0:
            return by_alphabet
        return cmp(self.stop_symbol, other.stop_symbol)

    def contains(self, other):
        """Does this alphabet 'contain' the other (OBSOLETE?).

        Returns a boolean.  The stop symbols must match and the wrapped
        alphabet must contain the other's wrapped alphabet (which relies on
        the Alphabet subclassing hierarchy).  This fails with an
        AttributeError if the other alphabet has no stop symbol!
        """
        if other.stop_symbol != self.stop_symbol:
            return False
        return self.alphabet.contains(other.alphabet)

    def _upper(self):
        """Return an upper case variant of the current alphabet (PRIVATE)."""
        wrapped = self.alphabet._upper()
        return HasStopCodon(wrapped, self.stop_symbol.upper())

    def _lower(self):
        """Return a lower case variant of the current alphabet (PRIVATE)."""
        wrapped = self.alphabet._lower()
        return HasStopCodon(wrapped, self.stop_symbol.lower())
00205 
00206 
def _get_base_alphabet(alphabet):
    """Returns the non-gapped non-stop-codon Alphabet object (PRIVATE).

    Peels off every AlphabetEncoder layer (Gapped, HasStopCodon, ...) and
    returns the innermost object, asserting that it is an Alphabet.
    """
    innermost = alphabet
    while isinstance(innermost, AlphabetEncoder):
        # Each encoder keeps the object it decorates in ``.alphabet``.
        innermost = innermost.alphabet
    assert isinstance(innermost, Alphabet), \
           "Invalid alphabet found, %s" % repr(innermost)
    return innermost
00215 
def _ungap(alphabet):
    """Returns the alphabet without any gap encoder (PRIVATE).

    Alphabets exposing no ``gap_char`` are returned unchanged.  Otherwise
    the Gapped layer is removed and any other encoder layers around it are
    rebuilt on top of the ungapped alphabet.

    Raises NotImplementedError if the object exposes ``gap_char`` but is
    not one of the encoder classes handled here.
    """
    #TODO - Handle via method of the objects?
    if not hasattr(alphabet, "gap_char"):
        return alphabet
    if isinstance(alphabet, Gapped):
        return alphabet.alphabet
    if isinstance(alphabet, HasStopCodon):
        return HasStopCodon(_ungap(alphabet.alphabet),
                            stop_symbol=alphabet.stop_symbol)
    if isinstance(alphabet, AlphabetEncoder):
        # AlphabetEncoder.__init__ takes ``new_letters`` (the letters this
        # layer adds).  The old call used a non-existent ``letters=``
        # keyword, which always raised TypeError.
        return AlphabetEncoder(_ungap(alphabet.alphabet), alphabet.new_letters)
    raise NotImplementedError
00229     
def _consensus_base_alphabet(alphabets):
    """Returns a common but often generic base alphabet object (PRIVATE).

    Any AlphabetEncoder information (e.g. Gapped alphabets) is discarded
    before comparing.

    Mismatches are resolved by generalising, never by raising: DNA+RNA gives
    the generic nucleotide alphabet, Nucleotide+Protein gives the generic
    single letter alphabet, and anything else (or no input at all) gives the
    generic alphabet."""
    consensus = None
    for entry in alphabets:
        base = _get_base_alphabet(entry)
        if consensus is None:
            # First alphabet seen becomes the starting point.
            consensus = base
            continue
        if consensus == base or isinstance(base, consensus.__class__):
            # Same alphabet, or a more specific one: consensus still holds.
            continue
        if isinstance(consensus, base.__class__):
            # The new alphabet is the more general of the two.
            consensus = base
            continue
        if isinstance(base, NucleotideAlphabet) \
        and isinstance(consensus, NucleotideAlphabet):
            #e.g. Give a mix of RNA and DNA alphabets
            consensus = generic_nucleotide
            continue
        if isinstance(base, SingleLetterAlphabet) \
        and isinstance(consensus, SingleLetterAlphabet):
            #This is a pretty big mis-match!
            consensus = single_letter_alphabet
            continue
        #We have a major mis-match... take the easy way out!
        return generic_alphabet
    if consensus is None:
        #Given NO alphabets!
        return generic_alphabet
    return consensus
00263 
def _consensus_alphabet(alphabets):
    """Returns a common but often generic alphabet object (PRIVATE).

    >>> from Bio.Alphabet import IUPAC
    >>> _consensus_alphabet([IUPAC.extended_protein, IUPAC.protein])
    ExtendedIUPACProtein()
    >>> _consensus_alphabet([generic_protein, IUPAC.protein])
    ProteinAlphabet()

    Note that DNA+RNA -> Nucleotide, and Nucleotide+Protein-> generic single
    letter.  These DO NOT raise an exception!

    >>> _consensus_alphabet([generic_dna, generic_nucleotide])
    NucleotideAlphabet()
    >>> _consensus_alphabet([generic_dna, generic_rna])
    NucleotideAlphabet()
    >>> _consensus_alphabet([generic_dna, generic_protein])
    SingleLetterAlphabet()
    >>> _consensus_alphabet([single_letter_alphabet, generic_protein])
    SingleLetterAlphabet()
    
    This is aware of Gapped and HasStopCodon and new letters added by
    other AlphabetEncoders.  This WILL raise an exception if more than
    one gap character or stop symbol is present.

    >>> from Bio.Alphabet import IUPAC
    >>> _consensus_alphabet([Gapped(IUPAC.extended_protein), HasStopCodon(IUPAC.protein)])
    HasStopCodon(Gapped(ExtendedIUPACProtein(), '-'), '*')
    >>> _consensus_alphabet([Gapped(IUPAC.protein, "-"), Gapped(IUPAC.protein, "=")])
    Traceback (most recent call last):
        ...
    ValueError: More than one gap character present
    >>> _consensus_alphabet([HasStopCodon(IUPAC.protein, "*"), HasStopCodon(IUPAC.protein, "+")])
    Traceback (most recent call last):
        ...
    ValueError: More than one stop symbol present

    The argument may be any iterable, including a generator; it is
    materialised once because it has to be scanned twice.
    """
    # Two passes are made over the input (one for the base alphabet, one
    # for the encoder details), so a single-pass iterable must be stored
    # first or the second pass would see nothing.
    alphabets = list(alphabets)
    base = _consensus_base_alphabet(alphabets)
    gap = None
    stop = None
    new_letters = ""
    for alpha in alphabets:
        #Gaps...
        if hasattr(alpha, "gap_char"):
            if gap is None:
                gap = alpha.gap_char
            elif gap != alpha.gap_char:
                raise ValueError("More than one gap character present")
        #Stops...
        if hasattr(alpha, "stop_symbol"):
            if stop is None:
                stop = alpha.stop_symbol
            elif stop != alpha.stop_symbol:
                raise ValueError("More than one stop symbol present")
        #New letters...
        if hasattr(alpha, "new_letters"):
            for letter in alpha.new_letters:
                # The gap and stop characters are re-added by their own
                # encoders below, so keep them out of the extra letters.
                if letter not in new_letters \
                and letter != gap and letter != stop:
                    new_letters += letter

    # Rebuild the encoder stack: extra letters innermost, then gap, then stop.
    alpha = base
    if new_letters:
        alpha = AlphabetEncoder(alpha, new_letters)
    if gap:
        alpha = Gapped(alpha, gap_char=gap)
    if stop:
        alpha = HasStopCodon(alpha, stop_symbol=stop)
    return alpha
00339 
def _check_type_compatible(alphabets):
    """Returns True except for DNA+RNA or Nucleotide+Protein (PRIVATE).

    >>> _check_type_compatible([generic_dna, generic_nucleotide])
    True
    >>> _check_type_compatible([generic_dna, generic_rna])
    False
    >>> _check_type_compatible([generic_dna, generic_protein])
    False
    >>> _check_type_compatible([single_letter_alphabet, generic_protein])
    True

    Only the base alphabet of each entry is inspected, using the Alphabet
    subclassing hierarchy.  Gap characters and stop symbols are not
    checked."""
    seen_dna = False
    seen_rna = False
    seen_nucleotide = False
    seen_protein = False
    for entry in alphabets:
        base = _get_base_alphabet(entry)
        # DNA and RNA are tested before the generic nucleotide class
        # because both are subclasses of it.
        if isinstance(base, DNAAlphabet):
            seen_dna = True
            seen_nucleotide = True
            if seen_rna or seen_protein:
                return False
        elif isinstance(base, RNAAlphabet):
            seen_rna = True
            seen_nucleotide = True
            if seen_dna or seen_protein:
                return False
        elif isinstance(base, NucleotideAlphabet):
            seen_nucleotide = True
            if seen_protein:
                return False
        elif isinstance(base, ProteinAlphabet):
            seen_protein = True
            if seen_nucleotide:
                return False
    return True
00372 
00373 def _verify_alphabet(sequence):
00374     """Check all letters in sequence are in the alphabet (PRIVATE).
00375 
00376     >>> from Bio.Seq import Seq
00377     >>> from Bio.Alphabet import IUPAC
00378     >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
00379     ...              IUPAC.protein)
00380     >>> _verify_alphabet(my_seq)
00381     True
00382 
00383     This example has an X, which is not in the IUPAC protein alphabet
00384     (you should be using the IUPAC extended protein alphabet):
00385 
00386     >>> bad_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVFX",
00387     ...                IUPAC.protein)
00388     >>> _verify_alphabet(bad_seq)
00389     False
00390 
00391     This replaces Bio.utils.verify_alphabet() since we are deprecating
00392     that. Potentially this could be added to the Alphabet object, and
00393     I would like it to be an option when creating a Seq object... but
00394     that might slow things down.
00395     """
00396     letters = sequence.alphabet.letters
00397     if not letters:
00398         raise ValueError("Alphabet does not define letters.")
00399     for letter in sequence:
00400         if letter not in letters:
00401             return False
00402     return True
00403