Back to index

python-biopython  1.60
Restriction.py
Go to the documentation of this file.
00001 #!/usr/bin/env python
00002 #
00003 #      Restriction Analysis Libraries.
00004 #      Copyright (C) 2004. Frederic Sohm.
00005 #
00006 # This code is part of the Biopython distribution and governed by its
00007 # license.  Please see the LICENSE file that should have been included
00008 # as part of this package.
00009 #
00010 
00011 """ Notes about the diverses class of the restriction enzyme implementation.
00012 
00013         RestrictionType is the type of all restriction enzymes.
00014     ----------------------------------------------------------------------------
00015         AbstractCut implements some methods that are common to all enzymes.
00016     ----------------------------------------------------------------------------
00017         NoCut, OneCut,TwoCuts   represent the number of double strand cuts
00018                                 produced by the enzyme.
00019                                 they correspond to the 4th field of the rebase
00020                                 record emboss_e.NNN.
00021                 0->NoCut    : the enzyme is not characterised.
00022                 2->OneCut   : the enzyme produce one double strand cut.
00023                 4->TwoCuts  : two double strand cuts.
00024     ----------------------------------------------------------------------------
00025         Meth_Dep, Meth_Undep    represent the methylation susceptibility to
00026                                 the enzyme.
00027                                 Not implemented yet.
00028     ----------------------------------------------------------------------------
00029         Palindromic,            if the site is palindromic or not.
00030         NotPalindromic          allow some optimisations of the code.
00031                                 No need to check the reverse strand
00032                                 with palindromic sites.
00033     ----------------------------------------------------------------------------                                    
00034         Unknown, Blunt,         represent the overhang.
00035         Ov5, Ov3                Unknown is here for symetry reasons and
00036                                 correspond to enzymes that are not characterised
00037                                 in rebase.
00038     ----------------------------------------------------------------------------
00039         Defined, Ambiguous,     represent the sequence of the overhang.
00040         NotDefined             
00041                                 NotDefined is for enzymes not characterised in
00042                                 rebase.
00043                                 
00044                                 Defined correspond to enzymes that display a
00045                                 constant overhang whatever the sequence.
00046                                 ex : EcoRI. G^AATTC -> overhang :AATT
00047                                             CTTAA^G
00048 
00049                                 Ambiguous : the overhang varies with the
00050                                 sequence restricted.
00051                                 Typically enzymes which cut outside their
00052                                 restriction site or (but not always)
00053                                 inside an ambiguous site.
00054                                 ex:
00055                                 AcuI CTGAAG(22/20)  -> overhang : NN
00056                                 AasI GACNNN^NNNGTC  -> overhang : NN
00057                                      CTGN^NNNNNCAG
00058 
00059             note : these 3 classes refers to the overhang not the site.
00060                So the enzyme ApoI (RAATTY) is defined even if its restriction
00061                site is ambiguous.
00062                                 
00063                     ApoI R^AATTY -> overhang : AATT -> Defined
00064                          YTTAA^R
00065                Accordingly, blunt enzymes are always Defined even
00066                when they cut outside their restriction site.
00067     ----------------------------------------------------------------------------
00068         Not_available,          as found in rebase file emboss_r.NNN files.
00069         Commercially_available
00070                                 allow the selection of the enzymes according to
00071                                 their suppliers to reduce the quantity
00072                                 of results.
00073                                 Also will allow the implementation of buffer
00074                                 compatibility tables. Not implemented yet.
00075 
00076                                 the list of suppliers is extracted from
00077                                 emboss_s.NNN
00078     ----------------------------------------------------------------------------
00079         """
00080 
00081 import re
00082 import itertools
00083 
00084 from Bio.Seq import Seq, MutableSeq
00085 from Bio.Alphabet import IUPAC
00086 
00087 from Bio.Restriction.Restriction_Dictionary import rest_dict as enzymedict
00088 from Bio.Restriction.Restriction_Dictionary import typedict
00089 from Bio.Restriction.Restriction_Dictionary import suppliers as suppliers_dict
00090 from Bio.Restriction.RanaConfig import *
00091 from Bio.Restriction.PrintFormat import PrintFormat
00092 
00093 #Used to use Bio.Restriction.DNAUtils.check_bases (and expose it under this
00094 #namespace), but have deprecated that module.
00095 def _check_bases(seq_string):
00096     """Check characters in a string (PRIVATE).
00097 
00098     Remove digits and white space present in string. Allows any valid ambiguous
00099     IUPAC DNA single letters codes (ABCDGHKMNRSTVWY, lower case are converted).
00100     
00101     Other characters (e.g. symbols) trigger a TypeError.
00102     
00103     Returns the string WITH A LEADING SPACE (!). This is for backwards
00104     compatibility, and may in part be explained by the fact that
00105     Bio.Restriction doesn't use zero based counting.
00106     """
00107     #Remove white space and make upper case:
00108     seq_string = "".join(seq_string.split()).upper()
00109     #Remove digits
00110     for c in "0123456789" : seq_string = seq_string.replace(c,"")
00111     #Check only allowed IUPAC letters
00112     if not set(seq_string).issubset(set("ABCDGHKMNRSTVWY")) :
00113         raise TypeError("Invalid character found in %s" % repr(seq_string))
00114     return " " + seq_string
00115 
00116 
00117 matching = {'A' : 'ARWMHVDN', 'C' : 'CYSMHBVN', 'G' : 'GRSKBVDN',
00118             'T' : 'TYWKHBDN', 'R' : 'ABDGHKMNSRWV', 'Y' : 'CBDHKMNSTWVY',
00119             'W' : 'ABDHKMNRTWVY', 'S' : 'CBDGHKMNSRVY', 'M' : 'ACBDHMNSRWVY',
00120             'K' : 'BDGHKNSRTWVY', 'H' : 'ACBDHKMNSRTWVY',
00121             'B' : 'CBDGHKMNSRTWVY', 'V' : 'ACBDGHKMNSRWVY',
00122             'D' : 'ABDGHKMNSRTWVY', 'N' : 'ACBDGHKMNSRTWVY'}
00123 
00124 DNA = Seq
00125     
00126 class FormattedSeq(object):
00127     """FormattedSeq(seq, [linear=True])-> new FormattedSeq.
00128 
00129     Translate a Bio.Seq into a formatted sequence to be used with Restriction.
00130 
00131     Roughly:
00132         remove anything which is not IUPAC alphabet and then add a space
00133         in front of the sequence to get a biological index instead of a
00134         python index (i.e. index of the first base is 1 not 0).
00135 
00136         Retains information about the shape of the molecule linear (default)
00137         or circular. Restriction sites are search over the edges of circular
00138         sequence."""
00139 
00140     def __init__(self, seq, linear = True):
00141         """FormattedSeq(seq, [linear=True])-> new FormattedSeq.
00142 
00143         seq is either a Bio.Seq, Bio.MutableSeq or a FormattedSeq.
00144         if seq is a FormattedSeq, linear will have no effect on the
00145         shape of the sequence."""
00146         if isinstance(seq, Seq) or isinstance(seq, MutableSeq):
00147             stringy       = seq.tostring()
00148             self.lower    = stringy.islower()
00149             #Note this adds a leading space to the sequence (!)
00150             self.data     = _check_bases(stringy)
00151             self.linear   = linear
00152             self.klass    = seq.__class__
00153             self.alphabet = seq.alphabet
00154         elif isinstance(seq, FormattedSeq):
00155             self.lower    = seq.lower
00156             self.data     = seq.data
00157             self.linear   = seq.linear
00158             self.alphabet = seq.alphabet
00159             self.klass    = seq.klass   
00160         else:
00161             raise TypeError('expected Seq or MutableSeq, got %s' % type(seq))
00162 
00163     def __len__(self):
00164         return len(self.data) - 1
00165 
00166     def __repr__(self):
00167         return 'FormattedSeq(%s, linear=%s)' %(repr(self[1:]), repr(self.linear))
00168 
00169     def __eq__(self, other):
00170         if isinstance(other, FormattedSeq):
00171             if repr(self) == repr(other):
00172                 return True
00173             else:
00174                 return False
00175         return False
00176     
00177     def circularise(self):
00178         """FS.circularise() -> circularise FS"""
00179         self.linear = False
00180         return
00181 
00182     def linearise(self):
00183         """FS.linearise() -> linearise FS"""
00184         self.linear = True
00185         return
00186 
00187     def to_linear(self):
00188         """FS.to_linear() -> new linear FS instance"""
00189         new = self.__class__(self)
00190         new.linear = True
00191         return new
00192 
00193     def to_circular(self):
00194         """FS.to_circular() -> new circular FS instance"""
00195         new = self.__class__(self)
00196         new.linear = False
00197         return new
00198 
00199     def is_linear(self):
00200         """FS.is_linear() -> bool.
00201 
00202         True if the sequence will analysed as a linear sequence."""
00203         return self.linear
00204 
00205     def finditer(self, pattern, size):
00206         """FS.finditer(pattern, size) -> list.
00207 
00208         return a list of pattern into the sequence.
00209         the list is made of tuple (location, pattern.group).
00210         the latter is used with non palindromic sites.
00211         pattern is the regular expression pattern corresponding to the
00212         enzyme restriction site.
00213         size is the size of the restriction enzyme recognition-site size."""
00214         if self.is_linear():
00215             data = self.data
00216         else:
00217             data = self.data + self.data[1:size]
00218         return [(i.start(), i.group) for i in re.finditer(pattern, data)]
00219 
00220     def __getitem__(self, i):
00221         if self.lower:
00222             return self.klass((self.data[i]).lower(), self.alphabet)
00223         return self.klass(self.data[i], self.alphabet)
00224     
00225 
00226 class RestrictionType(type):
00227     """RestrictionType. Type from which derives all enzyme classes.
00228 
00229     Implement the operator methods."""
00230    
00231     def __init__(cls, name='', bases=(), dct={}):
00232         """RE(name, bases, dct) -> RestrictionType instance.
00233 
00234         Not intended to be used in normal operation. The enzymes are
00235         instantiated when importing the module.
00236         
00237         see below."""
00238         if "-" in name :
00239             raise ValueError("Problem with hyphen in %s as enzyme name" \
00240                              % repr(name))
00241         # 2011/11/26 - Nobody knows what this call was supposed to accomplish,
00242         # but all unit tests seem to pass without it. 
00243         # super(RestrictionType, cls).__init__(cls, name, bases, dct)
00244         try :
00245             cls.compsite = re.compile(cls.compsite)
00246         except Exception, err :
00247             raise ValueError("Problem with regular expression, re.compiled(%s)" \
00248                              % repr(cls.compsite))
00249         
00250     def __add__(cls, other):
00251         """RE.__add__(other) -> RestrictionBatch().
00252 
00253         if other is an enzyme returns a batch of the two enzymes.
00254         if other is already a RestrictionBatch add enzyme to it."""
00255         if isinstance(other, RestrictionType):
00256             return RestrictionBatch([cls, other])    
00257         elif isinstance(other, RestrictionBatch):
00258             return other.add_nocheck(cls)
00259         else:
00260             raise TypeError
00261         
00262     def __div__(cls, other):
00263         """RE.__div__(other) -> list.
00264 
00265         RE/other
00266         returns RE.search(other)."""
00267         return cls.search(other)
00268     
00269     def __rdiv__(cls, other):
00270         """RE.__rdiv__(other) -> list.
00271 
00272         other/RE
00273         returns RE.search(other)."""
00274         return cls.search(other)
00275     
00276     def __truediv__(cls, other):
00277         """RE.__truediv__(other) -> list.
00278 
00279         RE/other
00280         returns RE.search(other)."""
00281         return cls.search(other)
00282     
00283     def __rtruediv__(cls, other):
00284         """RE.__rtruediv__(other) -> list.
00285 
00286         other/RE
00287         returns RE.search(other)."""
00288         return cls.search(other)
00289     
00290     def __floordiv__(cls, other):
00291         """RE.__floordiv__(other) -> list.
00292 
00293         RE//other
00294         returns RE.catalyse(other)."""
00295         return cls.catalyse(other)
00296     
00297     def __rfloordiv__(cls, other):
00298         """RE.__rfloordiv__(other) -> list.
00299 
00300         other//RE
00301         returns RE.catalyse(other)."""
00302         return cls.catalyse(other)
00303     
00304     def __str__(cls):
00305         """RE.__str__() -> str.
00306 
00307         return the name of the enzyme."""
00308         return cls.__name__
00309     
00310     def __repr__(cls):
00311         """RE.__repr__() -> str.
00312 
00313         used with eval or exec will instantiate the enzyme."""
00314         return "%s" % cls.__name__
00315     
00316     def __len__(cls):
00317         """RE.__len__() -> int.
00318 
00319         length of the recognition site."""
00320         return cls.size
00321     
00322     def __hash__(cls):
00323         #Python default is to use id(...)
00324         #This is consistent with the __eq__ implementation
00325         return id(cls)
00326     
00327     def __eq__(cls, other):
00328         """RE == other -> bool
00329 
00330         True if RE and other are the same enzyme.
00331         
00332         Specifically this checks they are the same Python object.
00333         """
00334         #assert (id(cls)==id(other)) == (other is cls) == (cls is other)
00335         return id(cls)==id(other)
00336 
00337     def __ne__(cls, other):
00338         """RE != other -> bool.
00339         isoschizomer strict, same recognition site, same restriction -> False
00340         all the other-> True
00341         
00342         WARNING - This is not the inverse of the __eq__ method.
00343         """
00344         if not isinstance(other, RestrictionType):
00345             return True
00346         elif cls.charac == other.charac:
00347             return False
00348         else:
00349             return True
00350 
00351     def __rshift__(cls, other):
00352         """RE >> other -> bool.
00353         
00354         neoschizomer : same recognition site, different restriction. -> True
00355         all the others :                                             -> False"""
00356         if not isinstance(other, RestrictionType):
00357             return False
00358         elif cls.site == other.site and cls.charac != other.charac:
00359             return True
00360         else:
00361             return False
00362 
00363     def __mod__(cls, other):
00364         """a % b -> bool.
00365 
00366         Test compatibility of the overhang of a and b.
00367         True if a and b have compatible overhang."""
00368         if not isinstance(other, RestrictionType):
00369             raise TypeError( \
00370                   'expected RestrictionType, got %s instead' % type(other))
00371         return cls._mod1(other)
00372         
00373     def __ge__(cls, other):
00374         """a >= b -> bool.
00375 
00376         a is greater or equal than b if the a site is longer than b site.
00377         if their site have the same length sort by alphabetical order of their
00378         names."""
00379         if not isinstance(other, RestrictionType):
00380             raise NotImplementedError
00381         if len(cls) > len(other):
00382             return True
00383         elif cls.size == len(other) and cls.__name__ >= other.__name__:
00384             return True
00385         else:
00386             return False
00387 
00388     def __gt__(cls, other):
00389         """a > b -> bool.
00390 
00391         sorting order:
00392                     1. size of the recognition site.
00393                     2. if equal size, alphabetical order of the names."""
00394         if not isinstance(other, RestrictionType):
00395             raise NotImplementedError
00396         if len(cls) > len(other):
00397             return True
00398         elif cls.size == len(other) and cls.__name__ > other.__name__:
00399             return True
00400         else:
00401             return False
00402 
00403     def __le__(cls, other):
00404         """a <= b -> bool.
00405 
00406         sorting order:
00407                     1. size of the recognition site.
00408                     2. if equal size, alphabetical order of the names."""
00409         if not isinstance(other, RestrictionType):
00410             raise NotImplementedError
00411         elif len(cls) < len(other):
00412             return True
00413         elif len(cls) == len(other) and cls.__name__ <= other.__name__:
00414             return True
00415         else:
00416             return False
00417 
00418     def __lt__(cls, other):
00419         """a < b -> bool.
00420 
00421         sorting order:
00422                     1. size of the recognition site.
00423                     2. if equal size, alphabetical order of the names."""
00424         if not isinstance(other, RestrictionType):
00425             raise NotImplementedError
00426         elif len(cls) < len(other):
00427             return True
00428         elif len(cls) == len(other) and cls.__name__ < other.__name__:
00429             return True
00430         else:
00431             return False
00432     
00433     
00434 class AbstractCut(RestrictionType):
00435     """Implement the methods that are common to all restriction enzymes.
00436 
00437     All the methods are classmethod.
00438 
00439     For internal use only. Not meant to be instantiate."""
00440 
00441     @classmethod
00442     def search(cls, dna, linear=True):
00443         """RE.search(dna, linear=True) -> list.
00444 
00445         return a list of all the site of RE in dna. Compensate for circular
00446         sequences and so on.
00447 
00448         dna must be a Bio.Seq.Seq instance or a Bio.Seq.MutableSeq instance.
00449         
00450         if linear is False, the restriction sites than span over the boundaries
00451         will be included.
00452 
00453         The positions are the first base of the 3' fragment,
00454         i.e. the first base after the position the enzyme will cut. """
00455         #
00456         #   Separating search from _search allow a (very limited) optimisation
00457         #   of the search when using a batch of restriction enzymes.
00458         #   in this case the DNA is tested once by the class which implements
00459         #   the batch instead of being tested by each enzyme single.
00460         #   see RestrictionBatch.search() for example.
00461         #
00462         if isinstance(dna, FormattedSeq):
00463             cls.dna = dna
00464             return cls._search()
00465         else :  
00466             cls.dna = FormattedSeq(dna, linear)
00467             return cls._search()
00468 
00469     @classmethod
00470     def all_suppliers(self):
00471         """RE.all_suppliers -> print all the suppliers of R"""
00472         supply = [x[0] for x in suppliers_dict.itervalues()]
00473         supply.sort()
00474         print ",\n".join(supply)
00475         return
00476 
00477     @classmethod
00478     def is_equischizomer(self, other):
00479         """RE.is_equischizomers(other) -> bool.
00480 
00481         True if other is an isoschizomer of RE.
00482         False else.
00483 
00484         equischizomer <=> same site, same position of restriction."""
00485         return not self != other
00486 
00487     @classmethod
00488     def is_neoschizomer(self, other):
00489         """RE.is_neoschizomers(other) -> bool.
00490 
00491         True if other is an isoschizomer of RE.
00492         False else.
00493 
00494         neoschizomer <=> same site, different position of restriction."""
00495         return self >> other
00496 
00497     @classmethod
00498     def is_isoschizomer(self, other):
00499         """RE.is_isoschizomers(other) -> bool.
00500 
00501         True if other is an isoschizomer of RE.
00502         False else.
00503 
00504         isoschizomer <=> same site."""
00505         return (not self != other) or self >> other
00506 
00507     @classmethod
00508     def equischizomers(self, batch=None):
00509         """RE.equischizomers([batch]) -> list.
00510 
00511         return a tuple of all the isoschizomers of RE.
00512         if batch is supplied it is used instead of the default AllEnzymes.
00513 
00514         equischizomer <=> same site, same position of restriction."""
00515         if not batch : batch = AllEnzymes
00516         r = [x for x in batch if not self != x]
00517         i = r.index(self)
00518         del r[i]
00519         r.sort()
00520         return r
00521 
00522     @classmethod
00523     def neoschizomers(self, batch=None):
00524         """RE.neoschizomers([batch]) -> list.
00525 
00526         return a tuple of all the neoschizomers of RE.
00527         if batch is supplied it is used instead of the default AllEnzymes.
00528 
00529         neoschizomer <=> same site, different position of restriction."""
00530         if not batch : batch = AllEnzymes              
00531         r = [x for x in batch if self >> x]
00532         r.sort()
00533         return r
00534 
00535     @classmethod
00536     def isoschizomers(self, batch=None):
00537         """RE.isoschizomers([batch]) -> list.
00538 
00539         return a tuple of all the equischizomers and neoschizomers of RE.
00540         if batch is supplied it is used instead of the default AllEnzymes."""
00541         if not batch : batch = AllEnzymes 
00542         r = [x for x in batch if (self >> x) or (not self != x)]
00543         i = r.index(self)
00544         del r[i]
00545         r.sort()
00546         return r
00547 
00548     @classmethod
00549     def frequency(self):
00550         """RE.frequency() -> int.
00551 
00552         frequency of the site."""
00553         return self.freq
00554 
00555 
00556 class NoCut(AbstractCut):
00557     """Implement the methods specific to the enzymes that do not cut.
00558 
00559     These enzymes are generally enzymes that have been only partially
00560     characterised and the way they cut the DNA is unknow or enzymes for
00561     which the pattern of cut is to complex to be recorded in Rebase
00562     (ncuts values of 0 in emboss_e.###).
00563 
00564     When using search() with these enzymes the values returned are at the start of
00565     the restriction site.
00566 
00567     Their catalyse() method returns a TypeError.
00568 
00569     Unknown and NotDefined are also part of the base classes of these enzymes.
00570 
00571     Internal use only. Not meant to be instantiated."""
00572 
00573     @classmethod
00574     def cut_once(self):
00575         """RE.cut_once() -> bool.
00576 
00577         True if the enzyme cut the sequence one time on each strand."""
00578         return False
00579 
00580     @classmethod
00581     def cut_twice(self):
00582         """RE.cut_twice() -> bool.
00583 
00584         True if the enzyme cut the sequence twice on each strand."""
00585         return False
00586 
00587     @classmethod
00588     def _modify(self, location):
00589         """RE._modify(location) -> int.
00590 
00591         for internal use only.
00592         
00593         location is an integer corresponding to the location of the match for
00594         the enzyme pattern in the sequence.
00595         _modify returns the real place where the enzyme will cut.
00596 
00597         example:
00598             EcoRI pattern : GAATTC
00599             EcoRI will cut after the G.
00600             so in the sequence:
00601                      ______
00602             GAATACACGGAATTCGA
00603                      |
00604                      10
00605             dna.finditer(GAATTC, 6) will return 10 as G is the 10th base
00606             EcoRI cut after the G so:
00607             EcoRI._modify(10) -> 11.
00608 
00609         if the enzyme cut twice _modify will returns two integer corresponding
00610         to each cutting site.
00611         """
00612         yield location
00613 
00614     @classmethod
00615     def _rev_modify(self, location):
00616         """RE._rev_modify(location) -> generator of int.
00617 
00618         for internal use only.
00619         
00620         as _modify for site situated on the antiparallel strand when the
00621         enzyme is not palindromic
00622         """
00623         yield location
00624 
00625     @classmethod
00626     def characteristic(self):
00627         """RE.characteristic() -> tuple.
00628 
00629         the tuple contains the attributes:
00630             fst5 -> first 5' cut ((current strand) or None
00631             fst3 -> first 3' cut (complementary strand) or None
00632             scd5 -> second 5' cut (current strand) or None
00633             scd5 -> second 3' cut (complementary strand) or None
00634             site -> recognition site."""
00635         return None, None, None, None, self.site
00636 
00637 
00638 class OneCut(AbstractCut):
00639     """Implement the methods specific to the enzymes that cut the DNA only once
00640 
00641     Correspond to ncuts values of 2 in emboss_e.###
00642 
00643     Internal use only. Not meant to be instantiated."""
00644 
00645     @classmethod
00646     def cut_once(self):
00647         """RE.cut_once() -> bool.
00648 
00649         True if the enzyme cut the sequence one time on each strand."""
00650         return True
00651 
00652     @classmethod
00653     def cut_twice(self):
00654         """RE.cut_twice() -> bool.
00655 
00656         True if the enzyme cut the sequence twice on each strand."""
00657         return False
00658 
00659     @classmethod
00660     def _modify(self, location):
00661         """RE._modify(location) -> int.
00662 
00663         for internal use only.
00664         
00665         location is an integer corresponding to the location of the match for
00666         the enzyme pattern in the sequence.
00667         _modify returns the real place where the enzyme will cut.
00668 
00669         example:
00670             EcoRI pattern : GAATTC
00671             EcoRI will cut after the G.
00672             so in the sequence:
00673                      ______
00674             GAATACACGGAATTCGA
00675                      |
00676                      10
00677             dna.finditer(GAATTC, 6) will return 10 as G is the 10th base
00678             EcoRI cut after the G so:
00679             EcoRI._modify(10) -> 11.
00680 
00681         if the enzyme cut twice _modify will returns two integer corresponding
00682         to each cutting site.
00683         """
00684         yield location + self.fst5
00685 
00686     @classmethod
00687     def _rev_modify(self, location):
00688         """RE._rev_modify(location) -> generator of int.
00689 
00690         for internal use only.
00691         
00692         as _modify for site situated on the antiparallel strand when the
00693         enzyme is not palindromic
00694         """
00695         yield location - self.fst3   
00696 
00697     @classmethod
00698     def characteristic(self):
00699         """RE.characteristic() -> tuple.
00700 
00701         the tuple contains the attributes:
00702             fst5 -> first 5' cut ((current strand) or None
00703             fst3 -> first 3' cut (complementary strand) or None
00704             scd5 -> second 5' cut (current strand) or None
00705             scd5 -> second 3' cut (complementary strand) or None
00706             site -> recognition site."""
00707         return self.fst5, self.fst3, None, None, self.site
00708 
00709 
00710 class TwoCuts(AbstractCut):
00711     """Implement the methods specific to the enzymes that cut the DNA twice
00712 
00713     Correspond to ncuts values of 4 in emboss_e.###
00714 
00715     Internal use only. Not meant to be instantiated."""
00716 
00717     @classmethod
00718     def cut_once(self):
00719         """RE.cut_once() -> bool.
00720 
00721         True if the enzyme cut the sequence one time on each strand."""
00722         return False
00723 
00724     @classmethod
00725     def cut_twice(self):
00726         """RE.cut_twice() -> bool.
00727 
00728         True if the enzyme cut the sequence twice on each strand."""
00729         return True
00730 
00731     @classmethod
00732     def _modify(self, location):
00733         """RE._modify(location) -> int.
00734 
00735         for internal use only.
00736         
00737         location is an integer corresponding to the location of the match for
00738         the enzyme pattern in the sequence.
00739         _modify returns the real place where the enzyme will cut.
00740 
00741         example:
00742             EcoRI pattern : GAATTC
00743             EcoRI will cut after the G.
00744             so in the sequence:
00745                      ______
00746             GAATACACGGAATTCGA
00747                      |
00748                      10
00749             dna.finditer(GAATTC, 6) will return 10 as G is the 10th base
00750             EcoRI cut after the G so:
00751             EcoRI._modify(10) -> 11.
00752 
00753         if the enzyme cut twice _modify will returns two integer corresponding
00754         to each cutting site.
00755         """
00756         yield location + self.fst5
00757         yield location + self.scd5
00758 
00759     @classmethod
00760     def _rev_modify(self, location):
00761         """RE._rev_modify(location) -> generator of int.
00762 
00763         for internal use only.
00764         
00765         as _modify for site situated on the antiparallel strand when the
00766         enzyme is not palindromic
00767         """
00768         yield location - self.fst3 
00769         yield location - self.scd3 
00770 
00771     @classmethod
00772     def characteristic(self):
00773         """RE.characteristic() -> tuple.
00774 
00775         the tuple contains the attributes:
00776             fst5 -> first 5' cut ((current strand) or None
00777             fst3 -> first 3' cut (complementary strand) or None
00778             scd5 -> second 5' cut (current strand) or None
00779             scd5 -> second 3' cut (complementary strand) or None
00780             site -> recognition site."""
00781         return self.fst5, self.fst3, self.scd5, self.scd3, self.site
00782 
00783 
00784 class Meth_Dep(AbstractCut):
00785     """Implement the information about methylation.
00786 
00787     Enzymes of this class possess a site which is methylable."""
00788 
00789     @classmethod
00790     def is_methylable(self):
00791         """RE.is_methylable() -> bool.
00792 
00793         True if the recognition site is a methylable."""
00794         return True
00795 
00796 
00797 class Meth_Undep(AbstractCut):
00798     """Implement informations about methylation sensitibility.
00799 
00800     Enzymes of this class are not sensible to methylation."""
00801 
00802     @classmethod
00803     def is_methylable(self):
00804         """RE.is_methylable() -> bool.
00805 
00806         True if the recognition site is a methylable."""
00807         return False
00808 
00809 
00810 class Palindromic(AbstractCut):
00811     """Implement the methods specific to the enzymes which are palindromic
00812 
00813     palindromic means : the recognition site and its reverse complement are
00814                         identical.
00815     Remarks     : an enzyme with a site CGNNCG is palindromic even if some
00816                   of the sites that it will recognise are not.
00817                   for example here : CGAACG
00818 
00819     Internal use only. Not meant to be instantiated."""
00820 
00821     @classmethod
00822     def _search(self):
00823         """RE._search() -> list.
00824 
00825         for internal use only.
00826 
00827         implement the search method for palindromic and non palindromic enzyme.
00828         """
00829         siteloc = self.dna.finditer(self.compsite,self.size)
00830         self.results = [r for s,g in siteloc for r in self._modify(s)]
00831         if self.results : self._drop()
00832         return self.results
00833 
00834     @classmethod
00835     def is_palindromic(self):
00836         """RE.is_palindromic() -> bool.
00837 
00838         True if the recognition site is a palindrom."""
00839         return True
00840 
00841 
00842 class NonPalindromic(AbstractCut):
00843     """Implement the methods specific to the enzymes which are not palindromic
00844 
00845     palindromic means : the recognition site and its reverse complement are
00846                         identical.
00847 
00848     Internal use only. Not meant to be instantiated."""
00849 
00850     @classmethod
00851     def _search(self):
00852         """RE._search() -> list.
00853 
00854         for internal use only.
00855 
00856         implement the search method for palindromic and non palindromic enzyme.
00857         """
00858         iterator = self.dna.finditer(self.compsite, self.size)
00859         self.results = []
00860         modif = self._modify
00861         revmodif = self._rev_modify
00862         s = str(self)
00863         self.on_minus = []
00864         for start, group in iterator:
00865             if group(s):
00866                 self.results += [r for r in modif(start)]
00867             else:
00868                 self.on_minus += [r for r in revmodif(start)]
00869         self.results += self.on_minus   
00870         if self.results:
00871             self.results.sort()
00872             self._drop()
00873         return self.results
00874 
00875     @classmethod
00876     def is_palindromic(self):
00877         """RE.is_palindromic() -> bool.
00878 
00879         True if the recognition site is a palindrom."""
00880         return False
00881 
00882 
00883 class Unknown(AbstractCut):
00884     """Implement the methods specific to the enzymes for which the overhang
00885     is unknown.
00886 
00887     These enzymes are also NotDefined and NoCut.
00888 
00889     Internal use only. Not meant to be instantiated."""
00890 
00891     @classmethod
00892     def catalyse(self, dna, linear=True):
00893         """RE.catalyse(dna, linear=True) -> tuple of DNA.
00894         RE.catalyze(dna, linear=True) -> tuple of DNA.
00895 
00896         return a tuple of dna as will be produced by using RE to restrict the
00897         dna.
00898         
00899         dna must be a Bio.Seq.Seq instance or a Bio.Seq.MutableSeq instance.
00900         
00901         if linear is False, the sequence is considered to be circular and the
00902         output will be modified accordingly."""
00903         raise NotImplementedError('%s restriction is unknown.' \
00904                                   % self.__name__)
00905     catalyze = catalyse
00906 
00907     @classmethod
00908     def is_blunt(self):
00909         """RE.is_blunt() -> bool.
00910 
00911         True if the enzyme produces blunt end.
00912 
00913         see also:
00914             RE.is_3overhang()
00915             RE.is_5overhang()
00916             RE.is_unknown()"""
00917         return False
00918 
00919     @classmethod
00920     def is_5overhang(self):
00921         """RE.is_5overhang() -> bool.
00922 
00923         True if the enzyme produces 5' overhang sticky end.
00924 
00925         see also:
00926             RE.is_3overhang()
00927             RE.is_blunt()
00928             RE.is_unknown()"""
00929         return False
00930 
00931     @classmethod
00932     def is_3overhang(self):
00933         """RE.is_3overhang() -> bool.
00934 
00935         True if the enzyme produces 3' overhang sticky end.
00936 
00937         see also:
00938             RE.is_5overhang()
00939             RE.is_blunt()
00940             RE.is_unknown()"""
00941         return False
00942 
00943     @classmethod
00944     def overhang(self):
00945         """RE.overhang() -> str. type of overhang of the enzyme.,
00946 
00947         can be "3' overhang", "5' overhang", "blunt", "unknown"   """
00948         return 'unknown'
00949 
00950     @classmethod
00951     def compatible_end(self):
00952         """RE.compatible_end() -> list.
00953 
00954         list of all the enzymes that share compatible end with RE."""
00955         return []
00956 
00957     @classmethod
00958     def _mod1(self, other):
00959         """RE._mod1(other) -> bool.
00960 
00961         for internal use only
00962         
00963         test for the compatibility of restriction ending of RE and other."""
00964         return False
00965 
00966 
00967 class Blunt(AbstractCut):
00968     """Implement the methods specific to the enzymes for which the overhang
00969     is blunt.
00970 
00971     The enzyme cuts the + strand and the - strand of the DNA at the same
00972     place.
00973     
00974     Internal use only. Not meant to be instantiated."""
00975 
00976     @classmethod
00977     def catalyse(self, dna, linear=True):
00978         """RE.catalyse(dna, linear=True) -> tuple of DNA.
00979         RE.catalyze(dna, linear=True) -> tuple of DNA.
00980 
00981         return a tuple of dna as will be produced by using RE to restrict the
00982         dna.
00983         
00984         dna must be a Bio.Seq.Seq instance or a Bio.Seq.MutableSeq instance.
00985         
00986         if linear is False, the sequence is considered to be circular and the
00987         output will be modified accordingly."""
00988         r = self.search(dna, linear)
00989         d = self.dna
00990         if not r : return d[1:],
00991         fragments = []
00992         length = len(r)-1
00993         if d.is_linear():
00994             #
00995             #   START of the sequence to FIRST site.
00996             #
00997             fragments.append(d[1:r[0]])                
00998             if length:
00999                 #
01000                 #   if more than one site add them.
01001                 #
01002                 fragments += [d[r[x]:r[x+1]] for x in xrange(length)]
01003             #
01004             #   LAST site to END of the sequence.
01005             #
01006             fragments.append(d[r[-1]:])                 
01007         else:
01008             #
01009             #   circular : bridge LAST site to FIRST site.
01010             #
01011             fragments.append(d[r[-1]:]+d[1:r[0]])
01012             if not length:
01013                 #
01014                 #   one site we finish here.
01015                 #
01016                 return tuple(fragments)
01017             #
01018             #   add the others.
01019             #
01020             fragments += [d[r[x]:r[x+1]] for x in xrange(length)]
01021         return tuple(fragments)
01022     catalyze = catalyse
01023 
01024     @classmethod
01025     def is_blunt(self):
01026         """RE.is_blunt() -> bool.
01027 
01028         True if the enzyme produces blunt end.
01029 
01030         see also:
01031             RE.is_3overhang()
01032             RE.is_5overhang()
01033             RE.is_unknown()"""
01034         return True
01035 
01036     @classmethod
01037     def is_5overhang(self):
01038         """RE.is_5overhang() -> bool.
01039 
01040         True if the enzyme produces 5' overhang sticky end.
01041 
01042         see also:
01043             RE.is_3overhang()
01044             RE.is_blunt()
01045             RE.is_unknown()"""
01046         return False
01047 
01048     @classmethod
01049     def is_3overhang(self):
01050         """RE.is_3overhang() -> bool.
01051 
01052         True if the enzyme produces 3' overhang sticky end.
01053 
01054         see also:
01055             RE.is_5overhang()
01056             RE.is_blunt()
01057             RE.is_unknown()"""
01058         return False
01059 
01060     @classmethod
01061     def overhang(self):
01062         """RE.overhang() -> str. type of overhang of the enzyme.,
01063 
01064         can be "3' overhang", "5' overhang", "blunt", "unknown"   """
01065         return 'blunt'
01066 
01067     @classmethod
01068     def compatible_end(self, batch=None):
01069         """RE.compatible_end() -> list.
01070 
01071         list of all the enzymes that share compatible end with RE."""
01072         if not batch : batch = AllEnzymes
01073         r = [x for x in iter(AllEnzymes) if x.is_blunt()]
01074         r.sort()
01075         return r 
01076 
01077     @staticmethod
01078     def _mod1(other):
01079         """RE._mod1(other) -> bool.
01080 
01081         for internal use only
01082         
01083         test for the compatibility of restriction ending of RE and other."""
01084         return issubclass(other, Blunt)
01085 
01086 
01087 class Ov5(AbstractCut):
01088     """Implement the methods specific to the enzymes for which the overhang
01089     is recessed in 3'.
01090 
01091     The enzyme cuts the + strand after the - strand of the DNA.
01092     
01093     Internal use only. Not meant to be instantiated."""
01094 
01095     @classmethod
01096     def catalyse(self, dna, linear=True):
01097         """RE.catalyse(dna, linear=True) -> tuple of DNA.
01098         RE.catalyze(dna, linear=True) -> tuple of DNA.
01099 
01100         return a tuple of dna as will be produced by using RE to restrict the
01101         dna.
01102         
01103         dna must be a Bio.Seq.Seq instance or a Bio.Seq.MutableSeq instance.
01104         
01105         if linear is False, the sequence is considered to be circular and the
01106         output will be modified accordingly."""
01107         r = self.search(dna, linear)
01108         d = self.dna
01109         if not r : return d[1:],
01110         length = len(r)-1
01111         fragments = []
01112         if d.is_linear():
01113             #
01114             #   START of the sequence to FIRST site.
01115             #
01116             fragments.append(d[1:r[0]])
01117             if length:
01118                 #
01119                 #   if more than one site add them.
01120                 #
01121                 fragments += [d[r[x]:r[x+1]] for x in xrange(length)]
01122             #
01123             #   LAST site to END of the sequence.
01124             #
01125             fragments.append(d[r[-1]:])             
01126         else:
01127             #
01128             #   circular : bridge LAST site to FIRST site.
01129             #
01130             fragments.append(d[r[-1]:]+d[1:r[0]])
01131             if not length:
01132                 #
01133                 #   one site we finish here.
01134                 #
01135                 return tuple(fragments)
01136             #
01137             #   add the others.
01138             #
01139             fragments += [d[r[x]:r[x+1]] for x in xrange(length)]
01140         return tuple(fragments)
01141     catalyze = catalyse
01142 
01143     @classmethod
01144     def is_blunt(self):
01145         """RE.is_blunt() -> bool.
01146 
01147         True if the enzyme produces blunt end.
01148 
01149         see also:
01150             RE.is_3overhang()
01151             RE.is_5overhang()
01152             RE.is_unknown()"""
01153         return False
01154 
01155     @classmethod
01156     def is_5overhang(self):
01157         """RE.is_5overhang() -> bool.
01158 
01159         True if the enzyme produces 5' overhang sticky end.
01160 
01161         see also:
01162             RE.is_3overhang()
01163             RE.is_blunt()
01164             RE.is_unknown()"""
01165         return True
01166 
01167     @classmethod
01168     def is_3overhang(self):
01169         """RE.is_3overhang() -> bool.
01170 
01171         True if the enzyme produces 3' overhang sticky end.
01172 
01173         see also:
01174             RE.is_5overhang()
01175             RE.is_blunt()
01176             RE.is_unknown()"""
01177         return False
01178 
01179     @classmethod
01180     def overhang(self):
01181         """RE.overhang() -> str. type of overhang of the enzyme.,
01182 
01183         can be "3' overhang", "5' overhang", "blunt", "unknown"   """
01184         return "5' overhang"
01185 
01186     @classmethod
01187     def compatible_end(self, batch=None):
01188         """RE.compatible_end() -> list.
01189 
01190         list of all the enzymes that share compatible end with RE."""
01191         if not batch : batch = AllEnzymes
01192         r = [x for x in iter(AllEnzymes) if x.is_5overhang() and x % self]
01193         r.sort()
01194         return r 
01195 
01196     @classmethod
01197     def _mod1(self, other):
01198         """RE._mod1(other) -> bool.
01199 
01200         for internal use only
01201         
01202         test for the compatibility of restriction ending of RE and other."""
01203         if issubclass(other, Ov5) : return self._mod2(other)
01204         else : return False
01205 
01206 
01207 class Ov3(AbstractCut):
01208     """Implement the methods specific to the enzymes for which the overhang
01209     is recessed in 5'.
01210 
01211     The enzyme cuts the - strand after the + strand of the DNA.
01212     
01213     Internal use only. Not meant to be instantiated."""
01214 
01215     @classmethod
01216     def catalyse(self, dna, linear=True):
01217         """RE.catalyse(dna, linear=True) -> tuple of DNA.
01218         RE.catalyze(dna, linear=True) -> tuple of DNA.
01219 
01220         return a tuple of dna as will be produced by using RE to restrict the
01221         dna.
01222         
01223         dna must be a Bio.Seq.Seq instance or a Bio.Seq.MutableSeq instance.
01224         
01225         if linear is False, the sequence is considered to be circular and the
01226         output will be modified accordingly."""
01227         r = self.search(dna, linear)
01228         d = self.dna
01229         if not r : return d[1:],    
01230         fragments = []
01231         length = len(r)-1
01232         if d.is_linear():
01233             #
01234             #   START of the sequence to FIRST site.
01235             #
01236             fragments.append(d[1:r[0]])
01237             if length:
01238                 #
01239                 #   if more than one site add them.
01240                 #
01241                 fragments += [d[r[x]:r[x+1]] for x in xrange(length)]
01242             #
01243             #   LAST site to END of the sequence.
01244             #
01245             fragments.append(d[r[-1]:])             
01246         else:
01247             #
01248             #   circular : bridge LAST site to FIRST site.
01249             #
01250             fragments.append(d[r[-1]:]+d[1:r[0]])
01251             if not length:
01252                 #
01253                 #   one site we finish here.
01254                 #
01255                 return tuple(fragments)
01256             #
01257             #   add the others.
01258             #
01259             fragments += [d[r[x]:r[x+1]] for x in xrange(length)]
01260         return tuple(fragments)
01261     catalyze = catalyse
01262 
01263     @classmethod
01264     def is_blunt(self):
01265         """RE.is_blunt() -> bool.
01266 
01267         True if the enzyme produces blunt end.
01268 
01269         see also:
01270             RE.is_3overhang()
01271             RE.is_5overhang()
01272             RE.is_unknown()"""
01273         return False
01274 
01275     @classmethod
01276     def is_5overhang(self):
01277         """RE.is_5overhang() -> bool.
01278 
01279         True if the enzyme produces 5' overhang sticky end.
01280 
01281         see also:
01282             RE.is_3overhang()
01283             RE.is_blunt()
01284             RE.is_unknown()"""
01285         return False
01286 
01287     @classmethod
01288     def is_3overhang(self):
01289         """RE.is_3overhang() -> bool.
01290 
01291         True if the enzyme produces 3' overhang sticky end.
01292 
01293         see also:
01294             RE.is_5overhang()
01295             RE.is_blunt()
01296             RE.is_unknown()"""
01297         return True
01298 
01299     @classmethod
01300     def overhang(self):
01301         """RE.overhang() -> str. type of overhang of the enzyme.,
01302 
01303         can be "3' overhang", "5' overhang", "blunt", "unknown"   """
01304         return "3' overhang"
01305 
01306     @classmethod
01307     def compatible_end(self, batch=None):
01308         """RE.compatible_end() -> list.
01309 
01310         list of all the enzymes that share compatible end with RE."""
01311         if not batch : batch = AllEnzymes
01312         r = [x for x in iter(AllEnzymes) if x.is_3overhang() and x % self]
01313         r.sort()
01314         return r 
01315 
01316     @classmethod
01317     def _mod1(self, other):
01318         """RE._mod1(other) -> bool.
01319 
01320         for internal use only
01321         
01322         test for the compatibility of restriction ending of RE and other."""
01323         #
01324         #   called by RE._mod1(other) when the one of the enzyme is ambiguous
01325         #
01326         if issubclass(other, Ov3) : return self._mod2(other)
01327         else : return False
01328 
01329     
01330 class Defined(AbstractCut):
01331     """Implement the methods specific to the enzymes for which the overhang
01332     and the cut are not variable.
01333 
01334     Typical example : EcoRI -> G^AATT_C
01335                       The overhang will always be AATT
01336     Notes:
01337         Blunt enzymes are always defined. even if there site is GGATCCNNN^_N
01338         There overhang is always the same : blunt!
01339     
01340     Internal use only. Not meant to be instantiated."""
01341 
01342     @classmethod
01343     def _drop(self):
01344         """RE._drop() -> list.
01345 
01346         for internal use only.
01347 
01348         drop the site that are situated outside the sequence in linear sequence.
01349         modify the index for site in circular sequences."""
01350         #
01351         #   remove or modify the results that are outside the sequence.
01352         #   This is necessary since after finding the site we add the distance
01353         #   from the site to the cut with the _modify and _rev_modify methods.
01354         #   For linear we will remove these sites altogether.
01355         #   For circular sequence, we modify the result rather than _drop it
01356         #   since the site is in the sequence.
01357         # 
01358         length = len(self.dna)
01359         drop = itertools.dropwhile
01360         take = itertools.takewhile
01361         if self.dna.is_linear():
01362             self.results = [x for x in drop(lambda x:x<1, self.results)]
01363             self.results = [x for x in take(lambda x:x<length, self.results)]
01364         else:
01365             for index, location in enumerate(self.results):
01366                 if location < 1:
01367                     self.results[index] += length
01368                 else:
01369                     break
01370             for index, location in enumerate(self.results[::-1]):
01371                 if location > length:
01372                     self.results[-(index+1)] -= length
01373                 else:
01374                     break
01375         return
01376 
01377     @classmethod
01378     def is_defined(self):
01379         """RE.is_defined() -> bool.
01380 
01381         True if the sequence recognised and cut is constant,
01382         i.e. the recognition site is not degenerated AND the enzyme cut inside
01383         the site.
01384 
01385         see also:
01386             RE.is_ambiguous()
01387             RE.is_unknown()"""
01388         return True
01389 
01390     @classmethod
01391     def is_ambiguous(self):
01392         """RE.is_ambiguous() -> bool.
01393 
01394         True if the sequence recognised and cut is ambiguous,
01395         i.e. the recognition site is degenerated AND/OR the enzyme cut outside
01396         the site.
01397 
01398         see also:
01399             RE.is_defined()
01400             RE.is_unknown()"""
01401         return False
01402 
01403     @classmethod
01404     def is_unknown(self):
01405         """RE.is_unknown() -> bool.
01406 
01407         True if the sequence is unknown,
01408         i.e. the recognition site has not been characterised yet.
01409 
01410         see also:
01411             RE.is_defined()
01412             RE.is_ambiguous()"""
01413         return False
01414 
01415     @classmethod
01416     def elucidate(self):
01417         """RE.elucidate() -> str
01418 
01419         return a representation of the site with the cut on the (+) strand
01420         represented as '^' and the cut on the (-) strand as '_'.
01421         ie:
01422         >>> EcoRI.elucidate()   # 5' overhang
01423         'G^AATT_C'
01424         >>> KpnI.elucidate()    # 3' overhang
01425         'G_GTAC^C'
01426         >>> EcoRV.elucidate()   # blunt
01427         'GAT^_ATC'
01428         >>> SnaI.elucidate()     # NotDefined, cut profile unknown.
01429         '? GTATAC ?'
01430         >>>
01431         """
01432         f5 = self.fst5
01433         f3 = self.fst3
01434         site = self.site
01435         if self.cut_twice() : re =  'cut twice, not yet implemented sorry.'
01436         elif self.is_5overhang():
01437             if f5 == f3 == 0 : re = 'N^'+ self.site + '_N'
01438             elif f3 == 0 : re = site[:f5] + '^' + site[f5:] + '_N'
01439             else : re = site[:f5] + '^' + site[f5:f3] + '_' + site[f3:]
01440         elif self.is_blunt():
01441             re =  site[:f5] + '^_' + site[f5:]
01442         else:
01443             if f5 == f3 == 0 : re = 'N_'+  site + '^N'
01444             else : re = site[:f3] + '_' + site[f3:f5] +'^'+ site[f5:]
01445         return re
01446 
01447     @classmethod
01448     def _mod2(self, other):
01449         """RE._mod2(other) -> bool.
01450 
01451         for internal use only
01452         
01453         test for the compatibility of restriction ending of RE and other."""
01454         #
01455         #   called by RE._mod1(other) when the one of the enzyme is ambiguous
01456         #
01457         if other.ovhgseq == self.ovhgseq:
01458             return True
01459         elif issubclass(other, Ambiguous):
01460             return other._mod2(self)
01461         else:
01462             return False
01463 
01464     
01465 class Ambiguous(AbstractCut):
01466     """Implement the methods specific to the enzymes for which the overhang
01467     is variable.
01468 
01469     Typical example : BstXI -> CCAN_NNNN^NTGG
01470                       The overhang can be any sequence of 4 bases.
01471     Notes:
01472         Blunt enzymes are always defined. even if there site is GGATCCNNN^_N
01473         There overhang is always the same : blunt!
01474     
01475     Internal use only. Not meant to be instantiated."""
01476 
01477     @classmethod
01478     def _drop(self):
01479         """RE._drop() -> list.
01480 
01481         for internal use only.
01482 
01483         drop the site that are situated outside the sequence in linear sequence.
01484         modify the index for site in circular sequences."""
01485         length = len(self.dna)
01486         drop = itertools.dropwhile
01487         take = itertools.takewhile
01488         if self.dna.is_linear():
01489             self.results = [x for x in drop(lambda x : x < 1, self.results)]
01490             self.results = [x for x in take(lambda x : x <length, self.results)]
01491         else:
01492             for index, location in enumerate(self.results):
01493                 if location < 1:
01494                     self.results[index] += length
01495                 else:
01496                     break
01497             for index, location in enumerate(self.results[::-1]):
01498                 if location > length:
01499                     self.results[-(index+1)] -= length
01500                 else:
01501                     break
01502         return 
01503 
01504     @classmethod
01505     def is_defined(self):
01506         """RE.is_defined() -> bool.
01507 
01508         True if the sequence recognised and cut is constant,
01509         i.e. the recognition site is not degenerated AND the enzyme cut inside
01510         the site.
01511 
01512         see also:
01513             RE.is_ambiguous()
01514             RE.is_unknown()"""
01515         return False
01516 
01517     @classmethod
01518     def is_ambiguous(self):
01519         """RE.is_ambiguous() -> bool.
01520 
01521         True if the sequence recognised and cut is ambiguous,
01522         i.e. the recognition site is degenerated AND/OR the enzyme cut outside
01523         the site.
01524 
01525         
01526         see also:
01527             RE.is_defined()
01528             RE.is_unknown()"""
01529         return True
01530 
01531     @classmethod
01532     def is_unknown(self):
01533         """RE.is_unknown() -> bool.
01534 
01535         True if the sequence is unknown,
01536         i.e. the recognition site has not been characterised yet.
01537 
01538         see also:
01539             RE.is_defined()
01540             RE.is_ambiguous()"""
01541         return False
01542 
01543     @classmethod
01544     def _mod2(self, other):
01545         """RE._mod2(other) -> bool.
01546 
01547         for internal use only
01548         
01549         test for the compatibility of restriction ending of RE and other."""
01550         #
01551         #   called by RE._mod1(other) when the one of the enzyme is ambiguous
01552         #
01553         if len(self.ovhgseq) != len(other.ovhgseq):
01554             return False
01555         else:
01556             se = self.ovhgseq
01557             for base in se:
01558                 if base in 'ATCG':
01559                     pass
01560                 if base in 'N':
01561                     se = '.'.join(se.split('N'))
01562                 if base in 'RYWMSKHDBV':
01563                     expand = '['+ matching[base] + ']'
01564                     se = expand.join(se.split(base))
01565             if re.match(se, other.ovhgseq):
01566                 return True
01567             else:
01568                 return False         
01569 
01570     @classmethod
01571     def elucidate(self):
01572         """RE.elucidate() -> str
01573 
01574         return a representation of the site with the cut on the (+) strand
01575         represented as '^' and the cut on the (-) strand as '_'.
01576         ie:
01577         >>> EcoRI.elucidate()   # 5' overhang
01578         'G^AATT_C'
01579         >>> KpnI.elucidate()    # 3' overhang
01580         'G_GTAC^C'
01581         >>> EcoRV.elucidate()   # blunt
01582         'GAT^_ATC'
01583         >>> SnaI.elucidate()     # NotDefined, cut profile unknown.
01584         '? GTATAC ?'
01585         >>>
01586         """
01587         f5 = self.fst5
01588         f3 = self.fst3
01589         length = len(self)
01590         site = self.site
01591         if self.cut_twice() : re = 'cut twice, not yet implemented sorry.'
01592         elif self.is_5overhang():
01593             if f3 == f5 == 0:
01594                 re = 'N^' + site +'_N'
01595             elif 0 <= f5 <= length and 0 <= f3+length <= length:
01596                 re = site[:f5] + '^' + site[f5:f3] + '_' + site[f3:] 
01597             elif 0 <= f5 <= length:
01598                 re = site[:f5] + '^' + site[f5:] + f3*'N' + '_N'
01599             elif 0 <= f3+length <= length:
01600                 re = 'N^' + abs(f5) * 'N' + site[:f3] + '_' + site[f3:]
01601             elif f3+length < 0:
01602                 re = 'N^'*abs(f5)*'N' + '_' + abs(length+f3)*'N' + site
01603             elif f5 > length:
01604                 re = site + (f5-length)*'N'+'^'+(length+f3-f5)*'N'+'_N'
01605             else:
01606                 re = 'N^' + abs(f5) * 'N' + site + f3*'N' + '_N'
01607         elif self.is_blunt():
01608             if f5 < 0:
01609                 re = 'N^_' + abs(f5)*'N' + site
01610             elif f5 > length:
01611                 re = site + (f5-length)*'N' + '^_N'
01612             else:
01613                 raise ValueError('%s.easyrepr() : error f5=%i' \
01614                                  % (self.name,f5))
01615         else:
01616             if f3 == 0:
01617                 if f5 == 0 : re = 'N_' + site + '^N'
01618                 else : re = site + '_' + (f5-length)*'N' + '^N'
01619             elif 0 < f3+length <= length and 0 <= f5 <= length:
01620                 re = site[:f3] + '_' + site[f3:f5] + '^' + site[f5:]
01621             elif 0 < f3+length <= length:
01622                 re = site[:f3] + '_' + site[f3:] + (f5-length)*'N' + '^N'
01623             elif 0 <= f5 <= length:
01624                 re = 'N_' +'N'*(f3+length) + site[:f5] + '^' + site[f5:]
01625             elif f3 > 0:
01626                 re = site + f3*'N' + '_' + (f5-f3-length)*'N' + '^N'
01627             elif f5 < 0:
01628                 re = 'N_' + abs(f3-f5+length)*'N' + '^' + abs(f5)*'N' + site
01629             else:
01630                 re = 'N_' + abs(f3+length)*'N' + site + (f5-length)*'N' + '^N'
01631         return re
01632 
01633 
01634 class NotDefined(AbstractCut):
01635     """Implement the methods specific to the enzymes for which the overhang
01636     is not characterised.
01637 
01638     Correspond to NoCut and Unknown.
01639     
01640     Internal use only. Not meant to be instantiated."""
01641 
01642     @classmethod
01643     def _drop(self):
01644         """RE._drop() -> list.
01645 
01646         for internal use only.
01647 
01648         drop the site that are situated outside the sequence in linear sequence.
01649         modify the index for site in circular sequences."""
01650         if self.dna.is_linear():
01651             return
01652         else:
01653             length = len(self.dna)
01654             for index, location in enumerate(self.results):
01655                 if location < 1:
01656                     self.results[index] += length
01657                 else:
01658                     break
01659             for index, location in enumerate(self.results[:-1]):
01660                 if location > length:
01661                     self.results[-(index+1)] -= length
01662                 else:
01663                     break
01664         return        
01665 
01666     @classmethod
01667     def is_defined(self):
01668         """RE.is_defined() -> bool.
01669 
01670         True if the sequence recognised and cut is constant,
01671         i.e. the recognition site is not degenerated AND the enzyme cut inside
01672         the site.
01673 
01674         see also:
01675             RE.is_ambiguous()
01676             RE.is_unknown()"""
01677         return False
01678 
01679     @classmethod
01680     def is_ambiguous(self):
01681         """RE.is_ambiguous() -> bool.
01682 
01683         True if the sequence recognised and cut is ambiguous,
01684         i.e. the recognition site is degenerated AND/OR the enzyme cut outside
01685         the site.
01686 
01687         
01688         see also:
01689             RE.is_defined()
01690             RE.is_unknown()"""
01691         return False
01692 
01693     @classmethod
01694     def is_unknown(self):
01695         """RE.is_unknown() -> bool.
01696 
01697         True if the sequence is unknown,
01698         i.e. the recognition site has not been characterised yet.
01699 
01700         see also:
01701             RE.is_defined()
01702             RE.is_ambiguous()"""
01703         return True
01704 
01705     @classmethod
01706     def _mod2(self, other):
01707         """RE._mod2(other) -> bool.
01708 
01709         for internal use only
01710         
01711         test for the compatibility of restriction ending of RE and other."""
01712         #
01713         #   Normally we should not arrive here. But well better safe than sorry.
01714         #   the overhang is not defined we are compatible with nobody.
01715         #   could raise an Error may be rather than return quietly.
01716         #
01717         #return False
01718         raise ValueError("%s.mod2(%s), %s : NotDefined. pas glop pas glop!" \
01719                          % (str(self), str(other), str(self)))
01720 
01721     @classmethod
01722     def elucidate(self):
01723         """RE.elucidate() -> str
01724 
01725         return a representation of the site with the cut on the (+) strand
01726         represented as '^' and the cut on the (-) strand as '_'.
01727         ie:
01728         >>> EcoRI.elucidate()   # 5' overhang
01729         'G^AATT_C'
01730         >>> KpnI.elucidate()    # 3' overhang
01731         'G_GTAC^C'
01732         >>> EcoRV.elucidate()   # blunt
01733         'GAT^_ATC'
01734         >>> SnaI.elucidate()     # NotDefined, cut profile unknown.
01735         '? GTATAC ?'
01736         >>>
01737         """
01738         return '? %s ?' % self.site
01739 
01740     
01741 class Commercially_available(AbstractCut):
01742     #
01743     #   Recent addition to Rebase make this naming convention uncertain.
01744     #   May be better to says enzymes which have a supplier.
01745     #
01746     """Implement the methods specific to the enzymes which are commercially
01747     available.
01748     
01749     Internal use only. Not meant to be instantiated."""
01750 
01751     @classmethod
01752     def suppliers(self):
01753         """RE.suppliers() -> print the suppliers of RE."""
01754         supply = suppliers_dict.items()
01755         for k,v in supply:
01756             if k in self.suppl:
01757                 print v[0]+','
01758         return
01759 
01760     @classmethod
01761     def supplier_list(self):
01762         """RE.supplier_list() -> list.
01763 
01764         list of the supplier names for RE."""
01765         return [v[0] for k,v in suppliers_dict.items() if k in self.suppl]
01766 
01767     @classmethod
01768     def buffers(self, supplier):
01769         """RE.buffers(supplier) -> string.
01770 
01771         not implemented yet."""
01772         return
01773 
01774     @classmethod
01775     def is_comm(self):
01776         """RE.iscomm() -> bool.
01777 
01778         True if RE has suppliers."""
01779         return True
01780 
01781 
01782 class Not_available(AbstractCut):
01783     """Implement the methods specific to the enzymes which are not commercially
01784     available.
01785     
01786     Internal use only. Not meant to be instantiated."""
01787 
01788     @staticmethod
01789     def suppliers():
01790         """RE.suppliers() -> print the suppliers of RE."""
01791         return None
01792 
01793     @classmethod
01794     def supplier_list(self):
01795         """RE.supplier_list() -> list.
01796 
01797         list of the supplier names for RE."""
01798         return []
01799 
01800     @classmethod
01801     def buffers(self, supplier):
01802         """RE.buffers(supplier) -> string.
01803 
01804         not implemented yet."""
01805         raise TypeError("Enzyme not commercially available.")
01806 
01807     @classmethod
01808     def is_comm(self):
01809         """RE.iscomm() -> bool.
01810 
01811         True if RE has suppliers."""
01812         return False
01813 
01814     
01815 ###############################################################################  
01816 #                                                                             #
01817 #                       Restriction Batch                                     #
01818 #                                                                             #
01819 ###############################################################################
01820 
01821 
01822 class RestrictionBatch(set):
01823 
01824     def __init__(self, first=[], suppliers=[]):
01825         """RestrictionBatch([sequence]) -> new RestrictionBatch."""
01826         first = [self.format(x) for x in first]
01827         first += [eval(x) for n in suppliers for x in suppliers_dict[n][1]]
01828         set.__init__(self, first)
01829         self.mapping = dict.fromkeys(self)
01830         self.already_mapped = None
01831             
01832     def __str__(self):
01833         if len(self) < 5:
01834             return '+'.join(self.elements())
01835         else:
01836             return '...'.join(('+'.join(self.elements()[:2]),\
01837                                '+'.join(self.elements()[-2:])))
01838 
01839     def __repr__(self):
01840         return 'RestrictionBatch(%s)' % self.elements()
01841     
01842     def __contains__(self, other):
01843         try:
01844             other = self.format(other)
01845         except ValueError : # other is not a restriction enzyme
01846             return False
01847         return set.__contains__(self, other)
01848     
01849     def __div__(self, other):
01850         return self.search(other)
01851     
01852     def __rdiv__(self, other):
01853         return self.search(other)
01854 
01855     def get(self, enzyme, add=False):
01856         """B.get(enzyme[, add]) -> enzyme class.
01857 
01858         if add is True and enzyme is not in B add enzyme to B.
01859         if add is False (which is the default) only return enzyme.
01860         if enzyme is not a RestrictionType or can not be evaluated to
01861         a RestrictionType, raise a ValueError."""
01862         e = self.format(enzyme)
01863         if e in self:
01864             return e
01865         elif add:
01866             self.add(e)
01867             return e
01868         else:
01869             raise ValueError('enzyme %s is not in RestrictionBatch' \
01870                              % e.__name__)
01871 
01872     def lambdasplit(self, func):
01873         """B.lambdasplit(func) -> RestrictionBatch .
01874 
01875         the new batch will contains only the enzymes for which
01876         func return True."""
01877         d = [x for x in itertools.ifilter(func, self)]
01878         new = RestrictionBatch()
01879         new._data = dict(zip(d, [True]*len(d)))
01880         return new
01881 
01882     def add_supplier(self, letter):
01883         """B.add_supplier(letter) -> add a new set of enzyme to B.
01884 
01885         letter represents the suppliers as defined in the dictionary
01886         RestrictionDictionary.suppliers
01887         return None.
01888         raise a KeyError if letter is not a supplier code."""
01889         supplier = suppliers_dict[letter]
01890         self.suppliers.append(letter)
01891         for x in supplier[1]:
01892             self.add_nocheck(eval(x))
01893         return
01894 
01895     def current_suppliers(self):
01896         """B.current_suppliers() -> add a new set of enzyme to B.
01897 
01898         return a sorted list of the suppliers which have been used to
01899         create the batch."""
01900         suppl_list = [suppliers_dict[x][0] for x in self.suppliers]
01901         suppl_list.sort()
01902         return suppl_list
01903 
01904     def __iadd__(self, other):
01905         """ b += other -> add other to b, check the type of other."""
01906         self.add(other)
01907         return self
01908 
01909     def __add__(self, other):
01910         """ b + other -> new RestrictionBatch."""
01911         new = self.__class__(self)
01912         new.add(other)
01913         return new
01914 
01915     def remove(self, other):
01916         """B.remove(other) -> remove other from B if other is a RestrictionType.
01917 
01918         Safe set.remove method. Verify that other is a RestrictionType or can be
01919         evaluated to a RestrictionType.
01920         raise a ValueError if other can not be evaluated to a RestrictionType.
01921         raise a KeyError if other is not in B."""
01922         return set.remove(self, self.format(other))
01923 
01924     def add(self, other):
01925         """B.add(other) -> add other to B if other is a RestrictionType.
01926 
01927         Safe set.add method. Verify that other is a RestrictionType or can be
01928         evaluated to a RestrictionType.
01929         raise a ValueError if other can not be evaluated to a RestrictionType.
01930         """
01931         return set.add(self, self.format(other))
01932 
01933     def add_nocheck(self, other):
01934         """B.add_nocheck(other) -> add other to B. don't check type of other.
01935         """
01936         return set.add(self, other)
01937         
01938     def format(self, y):
01939         """B.format(y) -> RestrictionType or raise ValueError.
01940 
01941         if y is a RestrictionType return y
01942         if y can be evaluated to a RestrictionType return eval(y)
01943         raise a Value Error in all other case."""
01944         try:
01945             if isinstance(y, RestrictionType):
01946                 return y
01947             elif isinstance(eval(str(y)), RestrictionType):
01948                 return eval(y)
01949             
01950             else:
01951                 pass
01952         except (NameError, SyntaxError):
01953             pass
01954         raise ValueError('%s is not a RestrictionType' % y.__class__)
01955         
01956 
01957     def is_restriction(self, y):
01958         """B.is_restriction(y) -> bool.
01959 
01960         True is y or eval(y) is a RestrictionType.""" 
01961         return isinstance(y, RestrictionType) or \
01962                isinstance(eval(str(y)), RestrictionType)
01963     
01964     def split(self, *classes, **bool):
01965         """B.split(class, [class.__name__ = True]) -> new RestrictionBatch.
01966 
01967         it works but it is slow, so it has really an interest when splitting
01968         over multiple conditions."""
01969         def splittest(element):
01970             for klass in classes:
01971                 b = bool.get(klass.__name__, True)
01972                 if issubclass(element, klass):
01973                     if b:
01974                         continue
01975                     else:
01976                         return False
01977                 elif b:
01978                     return False
01979                 else:
01980                     continue
01981             return True
01982         d = [k for k in itertools.ifilter(splittest, self)]
01983         new = RestrictionBatch()
01984         new._data = dict(zip(d, [True]*len(d)))
01985         return new
01986       
01987     def elements(self):
01988         """B.elements() -> tuple.
01989 
01990         give all the names of the enzymes in B sorted alphabetically."""
01991         l = [str(e) for e in self]
01992         l.sort()
01993         return l
01994 
01995     def as_string(self):
01996         """B.as_string() -> list.
01997 
01998         return a list of the name of the elements of B."""
01999         return [str(e) for e in self]
02000 
02001     @classmethod
02002     def suppl_codes(self):
02003         """B.suppl_codes() -> dict
02004 
02005         letter code for the suppliers"""
02006         supply = dict([(k,v[0]) for k,v in suppliers_dict.iteritems()]) 
02007         return supply
02008 
02009     @classmethod
02010     def show_codes(self):
02011         "B.show_codes() -> letter codes for the suppliers"""
02012         supply = [' = '.join(i) for i in self.suppl_codes().iteritems()]
02013         print '\n'.join(supply)
02014         return
02015 
02016     def search(self, dna, linear=True):
02017         """B.search(dna) -> dict."""
02018         #
02019         #   here we replace the search method of the individual enzymes
02020         #   with one unique testing method.
02021         #
02022         if not hasattr(self, "already_mapped") :
02023             #TODO - Why does this happen!
02024             #Try the "doctest" at the start of PrintFormat.py
02025             self.already_mapped = None
02026         if isinstance(dna, DNA):
02027             # For the searching, we just care about the sequence as a string,
02028             # if that is the same we can use the cached search results.
02029             # At the time of writing, Seq == method isn't implemented,
02030             # and therefore does object identity which is stricter.
02031             if (str(dna), linear) == self.already_mapped:
02032                 return self.mapping
02033             else:
02034                 self.already_mapped = str(dna), linear
02035                 fseq = FormattedSeq(dna, linear)
02036                 self.mapping = dict([(x, x.search(fseq)) for x in self])
02037                 return self.mapping
02038         elif isinstance(dna, FormattedSeq):
02039             if (str(dna), dna.linear) == self.already_mapped:
02040                 return self.mapping
02041             else:
02042                 self.already_mapped = str(dna), dna.linear
02043                 self.mapping = dict([(x, x.search(dna)) for x in self])
02044                 return self.mapping
02045         raise TypeError("Expected Seq or MutableSeq instance, got %s instead"\
02046                         %type(dna))
02047 
02048 ###############################################################################  
02049 #                                                                             #
02050 #                       Restriction Analysis                                  #
02051 #                                                                             #
02052 ###############################################################################
02053 
02054 class Analysis(RestrictionBatch, PrintFormat):
02055 
02056     def __init__(self, restrictionbatch=RestrictionBatch(),sequence=DNA(''),
02057                  linear=True):
02058         """Analysis([restrictionbatch [, sequence] linear=True]) -> New Analysis class.
02059 
02060         For most of the method of this class if a dictionary is given it will
02061         be used as the base to calculate the results. 
02062         If no dictionary is given a new analysis using the Restriction Batch
02063         which has been given when the Analysis class has been instantiated."""
02064         RestrictionBatch.__init__(self, restrictionbatch)
02065         self.rb = restrictionbatch
02066         self.sequence = sequence
02067         self.linear = linear
02068         if self.sequence:
02069             self.search(self.sequence, self.linear)
02070 
02071     def __repr__(self):
02072         return 'Analysis(%s,%s,%s)'%\
02073                (repr(self.rb),repr(self.sequence),self.linear)
02074 
02075     def _sub_set(self, wanted):
02076         """A._sub_set(other_set) -> dict.
02077 
02078         Internal use only.
02079         
02080         screen the results through wanted set.
02081         Keep only the results for which the enzymes is in wanted set.
02082         """
02083         return dict([(k,v) for k,v in self.mapping.iteritems() if k in wanted])
02084     
02085     def _boundaries(self, start, end):
02086         """A._boundaries(start, end) -> tuple.
02087 
02088         Format the boundaries for use with the methods that limit the
02089         search to only part of the sequence given to analyse.
02090         """
02091         if not isinstance(start, int):
02092             raise TypeError('expected int, got %s instead' % type(start))
02093         if not isinstance(end, int):
02094             raise TypeError('expected int, got %s instead' % type(end))
02095         if start < 1:
02096             start += len(self.sequence)
02097         if end < 1:
02098             end += len(self.sequence)
02099         if start < end:
02100             pass
02101         else:
02102             start, end == end, start
02103         if start < 1:
02104             start == 1
02105         if start < end:
02106             return start, end, self._test_normal
02107         else:
02108             return start, end, self._test_reverse
02109 
02110     def _test_normal(self, start, end, site):
02111         """A._test_normal(start, end, site) -> bool.
02112 
02113         Internal use only
02114         Test if site is in between start and end.
02115         """
02116         return start <= site < end 
02117 
02118     def _test_reverse(self, start, end, site):
02119         """A._test_reverse(start, end, site) -> bool.
02120 
02121         Internal use only
02122         Test if site is in between end and start (for circular sequences).
02123         """
02124         return start <= site <= len(self.sequence) or 1 <= site < end
02125 
02126     def print_that(self, dct=None, title='', s1=''):
02127         """A.print_that([dct[, title[, s1]]]) -> print the results from dct.
02128 
02129         If dct is not given the full dictionary is used.
02130         """
02131         if not dct:
02132             dct = self.mapping
02133         print
02134         return PrintFormat.print_that(self, dct, title, s1)
02135         
02136     def change(self, **what):
02137         """A.change(**attribute_name) -> Change attribute of Analysis.
02138 
02139         It is possible to change the width of the shell by setting
02140         self.ConsoleWidth to what you want.
02141         self.NameWidth refer to the maximal length of the enzyme name.
02142 
02143         Changing one of these parameters here might not give the results
02144         you expect. In which case, you can settle back to a 80 columns shell
02145         or try to change self.Cmodulo and self.PrefWidth in PrintFormat until
02146         you get it right."""
02147         for k,v in what.iteritems():
02148             if k in ('NameWidth', 'ConsoleWidth'):
02149                 setattr(self, k, v)
02150                 self.Cmodulo    = self.ConsoleWidth % self.NameWidth
02151                 self.PrefWidth  = self.ConsoleWidth - self.Cmodulo
02152             elif k is 'sequence':
02153                 setattr(self, 'sequence', v) 
02154                 self.search(self.sequence, self.linear)
02155             elif k is 'rb':
02156                 self = Analysis.__init__(self, v, self.sequence, self.linear)
02157             elif k is 'linear':
02158                 setattr(self, 'linear', v)
02159                 self.search(self.sequence, v)
02160             elif k in ('Indent', 'Maxsize'):
02161                 setattr(self, k, v)
02162             elif k in ('Cmodulo', 'PrefWidth'):
02163                 raise AttributeError( \
02164                     'To change %s, change NameWidth and/or ConsoleWidth' \
02165                     % name)
02166             else:
02167                 raise AttributeError( \
02168                     'Analysis has no attribute %s' % name)
02169         return
02170 
02171     def full(self, linear=True):
02172         """A.full() -> dict.
02173         
02174         Full Restriction Map of the sequence."""
02175         return self.mapping
02176 
02177     def blunt(self, dct = None):
02178         """A.blunt([dct]) -> dict.
02179         
02180         Only the enzymes which have a 3'overhang restriction site."""
02181         if not dct:
02182             dct = self.mapping
02183         return dict([(k,v) for k,v in dct.iteritems() if k.is_blunt()])
02184         
02185     def overhang5(self, dct=None):
02186         """A.overhang5([dct]) -> dict.
02187         
02188         Only the enzymes which have a 5' overhang restriction site."""
02189         if not dct:
02190             dct = self.mapping
02191         return dict([(k,v) for k,v in dct.iteritems() if k.is_5overhang()])
02192         
02193 
02194     def overhang3(self, dct=None):
02195         """A.Overhang3([dct]) -> dict.
02196         
02197         Only the enzymes which have a 3'overhang restriction site."""
02198         if not dct:
02199             dct = self.mapping
02200         return dict([(k,v) for k,v in dct.iteritems() if k.is_3overhang()])
02201         
02202         
02203     def defined(self, dct=None):
02204         """A.defined([dct]) -> dict.
02205         
02206         Only the enzymes that have a defined restriction site in Rebase."""
02207         if not dct:
02208             dct = self.mapping
02209         return dict([(k,v) for k,v in dct.iteritems() if k.is_defined()])
02210         
02211     def with_sites(self, dct=None):
02212         """A.with_sites([dct]) -> dict.
02213         
02214         Enzymes which have at least one site in the sequence."""
02215         if not dct:
02216             dct = self.mapping
02217         return dict([(k,v) for k,v in dct.iteritems() if v])
02218 
02219     def without_site(self, dct=None):
02220         """A.without_site([dct]) -> dict.
02221         
02222         Enzymes which have no site in the sequence."""
02223         if not dct:
02224             dct = self.mapping
02225         return dict([(k,v) for k,v in dct.iteritems() if not v])
02226 
02227     def with_N_sites(self, N, dct=None):
02228         """A.With_N_Sites(N [, dct]) -> dict.
02229         
02230         Enzymes which cut N times the sequence."""
02231         if not dct:
02232             dct = self.mapping
02233         return dict([(k,v) for k,v in dct.iteritems()if len(v) == N])
02234 
02235     def with_number_list(self, list, dct= None):
02236         if not dct:
02237             dct = self.mapping
02238         return dict([(k,v) for k,v in dct.iteritems() if len(v) in list])
02239                              
02240     def with_name(self, names, dct=None):
02241         """A.with_name(list_of_names [, dct]) ->
02242         
02243          Limit the search to the enzymes named in list_of_names."""
02244         for i, enzyme in enumerate(names):
02245             if not enzyme in AllEnzymes:
02246                 print "no datas for the enzyme:", str(name)
02247                 del names[i]       
02248         if not dct:
02249             return RestrictionBatch(names).search(self.sequence)
02250         return dict([(n, dct[n]) for n in names if n in dct])
02251 
02252     def with_site_size(self, site_size, dct=None):
02253         """A.with_site_size(site_size [, dct]) ->
02254         
02255          Limit the search to the enzymes whose site is of size <site_size>."""
02256         sites = [name for name in self if name.size == site_size]
02257         if not dct:
02258             return RestrictionBatch(sites).search(self.sequence)
02259         return dict([(k,v) for k,v in dct.iteritems() if k in site_size])  
02260     
02261     def only_between(self, start, end, dct=None):
02262         """A.only_between(start, end[, dct]) -> dict.
02263 
02264         Enzymes that cut the sequence only in between start and end."""
02265         start, end, test = self._boundaries(start, end)
02266         if not dct:
02267             dct = self.mapping
02268         d = dict(dct)
02269         for key, sites in dct.iteritems():
02270             if not sites:
02271                 del d[key]
02272                 continue
02273             for site in sites:
02274                 if test(start, end, site):
02275                     continue
02276                 else:
02277                     del d[key]
02278                     break
02279         return d
02280         
02281     def between(self, start, end, dct=None):
02282         """A.between(start, end [, dct]) -> dict.
02283 
02284         Enzymes that cut the sequence at least in between start and end.
02285         They may cut outside as well."""
02286         start, end, test = self._boundaries(start, end)
02287         d = {}
02288         if not dct:
02289             dct = self.mapping
02290         for key, sites in dct.iteritems():
02291             for site in sites:
02292                 if test(start, end, site):
02293                     d[key] = sites
02294                     break
02295                 continue
02296         return d
02297     
02298     def show_only_between(self, start, end, dct=None):
02299         """A.show_only_between(start, end [, dct]) -> dict.
02300 
02301         Enzymes that cut the sequence outside of the region
02302         in between start and end but do not cut inside."""
02303         d = []
02304         if start <= end:
02305             d = [(k, [vv for vv in v if start<=vv<=end])
02306                  for v in self.between(start, end, dct)]
02307         else:
02308             d = [(k, [vv for vv in v if start<=vv or vv <= end])
02309                  for v in self.between(start, end, dct)]
02310         return dict(d)
02311         
02312     def only_outside(self, start, end, dct = None):
02313         """A.only_outside(start, end [, dct]) -> dict.
02314 
02315         Enzymes that cut the sequence outside of the region
02316         in between start and end but do not cut inside.""" 
02317         start, end, test = self._boundaries(start, end)
02318         if not dct : dct = self.mapping
02319         d = dict(dct)
02320         for key, sites in dct.iteritems():
02321             if not sites:
02322                 del d[key]
02323                 continue
02324             for site in sites:
02325                 if test(start, end, site):
02326                     del d[key]
02327                     break
02328                 else:
02329                     continue
02330         return d
02331 
02332     def outside(self, start, end, dct=None):
02333         """A.outside((start, end [, dct]) -> dict.
02334 
02335         Enzymes that cut outside the region in between start and end.
02336         No test is made to know if they cut or not inside this region."""
02337         start, end, test = self._boundaries(start, end)
02338         if not dct:
02339             dct = self.mapping
02340         d = {}
02341         for key, sites in dct.iteritems():
02342             for site in sites:
02343                 if test(start, end, site):
02344                     continue
02345                 else:
02346                     d[key] = sites 
02347                     break      
02348         return d
02349    
02350 
02351     def do_not_cut(self, start, end, dct = None):
02352         """A.do_not_cut(start, end [, dct]) -> dict.
02353 
02354         Enzymes that do not cut the region in between start and end."""
02355         if not dct:
02356             dct = self.mapping
02357         d = self.without_site()
02358         d.update(self.only_outside(start, end, dct))   
02359         return d
02360     
02361 #
02362 #   The restriction enzyme classes are created dynamically when the module is
02363 #   imported. Here is the magic which allow the creation of the
02364 #   restriction-enzyme classes.
02365 #
02366 #   The reason for the two dictionaries in Restriction_Dictionary
02367 #   one for the types (which will be called pseudo-type as they really
02368 #   correspond to the values that instances of RestrictionType can take)
02369 #   and one for the enzymes is efficiency as the bases are evaluated
02370 #   once per pseudo-type.
02371 #
02372 #   However Restriction is still a very inefficient module at import. But
02373 #   remember that around 660 classes (which is more or less the size of Rebase)
02374 #   have to be created dynamically. However, this processing take place only
02375 #   once.
02376 #   This inefficiency is however largely compensated by the use of metaclass
02377 #   which provide a very efficient layout for the class themselves mostly
02378 #   alleviating the need of if/else loops in the class methods.
02379 #
02380 #   It is essential to run Restriction with doc string optimisation (-OO switch)
02381 #   as the doc string of 660 classes take a lot of processing.
02382 #
02383 CommOnly    = RestrictionBatch()    # commercial enzymes
02384 NonComm     = RestrictionBatch()    # not available commercially
02385 for TYPE, (bases, enzymes) in typedict.iteritems():
02386     #
02387     #   The keys are the pseudo-types TYPE (stored as type1, type2...)
02388     #   The names are not important and are only present to differentiate
02389     #   the keys in the dict. All the pseudo-types are in fact RestrictionType.
02390     #   These names will not be used after and the pseudo-types are not
02391     #   kept in the locals() dictionary. It is therefore impossible to
02392     #   import them.
02393     #   Now, if you have look at the dictionary, you will see that not all the
02394     #   types are present as those without corresponding enzymes have been
02395     #   removed by Dictionary_Builder().
02396     #
02397     #   The values are tuples which contain
02398     #   as first element a tuple of bases (as string) and
02399     #   as second element the names of the enzymes.
02400     #
02401     #   First eval the bases.
02402     #
02403     bases = tuple([eval(x) for x in bases])
02404     #
02405     #   now create the particular value of RestrictionType for the classes
02406     #   in enzymes.
02407     #
02408     T = type.__new__(RestrictionType, 'RestrictionType', bases, {})
02409     for k in enzymes:
02410         #
02411         #   Now, we go through all the enzymes and assign them their type.
02412         #   enzymedict[k] contains the values of the attributes for this
02413         #   particular class (self.site, self.ovhg,....).
02414         #
02415         newenz = T(k, bases, enzymedict[k])
02416         #
02417         #   we add the enzymes to the corresponding batch.
02418         #
02419         #   No need to verify the enzyme is a RestrictionType -> add_nocheck
02420         #
02421         if newenz.is_comm() : CommOnly.add_nocheck(newenz)
02422         else : NonComm.add_nocheck(newenz)
02423 #
02424 #   AllEnzymes is a RestrictionBatch with all the enzymes from Rebase.
02425 #
02426 AllEnzymes = CommOnly | NonComm
02427 #
02428 #   Now, place the enzymes in locals so they can be imported.
02429 #
02430 names = [str(x) for x in AllEnzymes]
02431 try:
02432     del x
02433 except NameError:
02434     #Scoping changed in Python 3, the variable isn't leaked
02435     pass
02436 locals().update(dict(zip(names, AllEnzymes)))
02437 __all__=['FormattedSeq', 'Analysis', 'RestrictionBatch','AllEnzymes','CommOnly','NonComm']+names
02438 del k, enzymes, TYPE, bases, names