Back to index

python-biopython  1.60
RestrictionCompiler.py
Go to the documentation of this file.
00001 #!/usr/bin/env python
00002 #
00003 #      Restriction Analysis Libraries.
00004 #      Copyright (C) 2004. Frederic Sohm.
00005 #
00006 # This code is part of the Biopython distribution and governed by its
00007 # license.  Please see the LICENSE file that should have been included
00008 # as part of this package.
00009 #
00010 #   this script is used to produce the dictionary which will contains the data
00011 #   about the restriction enzymes from the Emboss/Rebase data files
00012 #   namely
00013 #   emboss_e.### (description of the sites),
00014 #   emboss_r.### (origin, methylation, references)
00015 #   emboss_s.### (suppliers)
00016 #   where ### is a number of three digits : 1 for the year two for the month
00017 #
00018 #   very dirty implementation but it does the job, so...
00019 #   Not very quick either but you are not supposed to use it frequently.
00020 #
00021 #   The results are stored in
00022 #   path/to/site-packages/Bio/Restriction/Restriction_Dictionary.py
00023 #   the file contains two dictionary:
00024 #   'rest_dict' which contains the data for the enzymes
00025 #   and
00026 #   'suppliers' which map the name of the suppliers to their abbreviation.
00027 #
00028 
00029 """Convert a serie of Rebase files into a Restriction_Dictionary.py module.
00030 
00031 The Rebase files are in the emboss format:
00032 
00033     emboss_e.###    -> contains informations about the restriction sites.
00034     emboss_r.###    -> contains general informations about the enzymes.
00035     emboss_s.###    -> contains informations about the suppliers.
00036     
00037 ### is a 3 digit number. The first digit is the year and the two last the month.
00038 """
00039 
00040 import sre
00041 import os
00042 import itertools
00043 import time
00044 import sys
00045 import site
00046 import shutil
00047 
00048 from Bio.Seq import Seq
00049 from Bio.Alphabet import IUPAC
00050 
00051 import Bio.Restriction.Restriction
00052 from Bio.Restriction.Restriction import AbstractCut, RestrictionType, NoCut, OneCut,\
00053 TwoCuts, Meth_Dep, Meth_Undep, Palindromic, NonPalindromic, Unknown, Blunt,\
00054 Ov5, Ov3, NotDefined, Defined, Ambiguous, Commercially_available, Not_available 
00055 
00056 import Bio.Restriction.RanaConfig as config
00057 from Bio.Restriction._Update.Update import RebaseUpdate
00058 from Bio.Restriction.Restriction import *
00059 from Bio.Restriction.DNAUtils import antiparallel
00060 
00061 DNA=Seq
00062 dna_alphabet = {'A':'A', 'C':'C', 'G':'G', 'T':'T',
00063                 'R':'AG', 'Y':'CT', 'W':'AT', 'S':'CG', 'M':'AC', 'K':'GT',
00064                 'H':'ACT', 'B':'CGT', 'V':'ACG', 'D':'AGT',
00065                 'N':'ACGT',
00066                 'a': 'a', 'c': 'c', 'g': 'g', 't': 't',
00067                 'r':'ag', 'y':'ct', 'w':'at', 's':'cg', 'm':'ac', 'k':'gt',
00068                 'h':'act', 'b':'cgt', 'v':'acg', 'd':'agt',
00069                 'n':'acgt'}
00070 
00071 
00072 complement_alphabet = {'A':'T', 'T':'A', 'C':'G', 'G':'C','R':'Y', 'Y':'R',
00073                        'W':'W', 'S':'S', 'M':'K', 'K':'M', 'H':'D', 'D':'H',
00074                        'B':'V', 'V':'B', 'N':'N','a':'t', 'c':'g', 'g':'c',
00075                        't':'a', 'r':'y', 'y':'r', 'w':'w', 's':'s','m':'k',
00076                        'k':'m', 'h':'d', 'd':'h', 'b':'v', 'v':'b', 'n':'n'}
00077 enzymedict = {}
00078 suppliersdict = {}
00079 classdict = {}
00080 typedict = {}
00081 
00082 
00083 class OverhangError(ValueError):
00084     """Exception for dealing with overhang."""
00085     pass
00086           
00087 def BaseExpand(base):
00088     """BaseExpand(base) -> string.
00089 
00090     given a degenerated base, returns its meaning in IUPAC alphabet.
00091 
00092     i.e:
00093         b= 'A' -> 'A'
00094         b= 'N' -> 'ACGT'
00095         etc..."""
00096     base = base.upper()
00097     return dna_alphabet[base]
00098 
00099 def regex(site):
00100     """regex(site) -> string.
00101 
00102     Construct a regular expression from a DNA sequence.
00103     i.e.:
00104         site = 'ABCGN'   -> 'A[CGT]CG.'"""
00105     reg_ex = site
00106     for base in reg_ex:
00107         if base in ('A', 'T', 'C', 'G', 'a', 'c', 'g', 't'):
00108             pass
00109         if base in ('N', 'n'):
00110             reg_ex = '.'.join(reg_ex.split('N'))
00111             reg_ex = '.'.join(reg_ex.split('n'))
00112         if base in ('R', 'Y', 'W', 'M', 'S', 'K', 'H', 'D', 'B', 'V'):
00113             expand = '['+ str(BaseExpand(base))+']'
00114             reg_ex = expand.join(reg_ex.split(base))
00115     return reg_ex
00116 
00117 def Antiparallel(sequence):
00118     """Antiparallel(sequence) -> string.
00119 
00120     returns a string which represents the reverse complementary strand of
00121     a DNA sequence."""
00122     return antiparallel(sequence.tostring())
00123 
00124 def is_palindrom(sequence):
00125     """is_palindrom(sequence) -> bool.
00126 
00127     True is the sequence is a palindrom.
00128     sequence is a DNA object."""
00129     return sequence == DNA(Antiparallel(sequence))
00130 
00131 def LocalTime():
00132     """LocalTime() -> string.
00133 
00134     LocalTime calculate the extension for emboss file for the current year and
00135     month."""
00136     t = time.gmtime()
00137     year = str(t.tm_year)[-1]
00138     month = str(t.tm_mon)
00139     if len(month) == 1 : month = '0'+month
00140     return year+month
00141                 
00142 
00143 class newenzyme(object):
00144     """construct the attributes of the enzyme corresponding to 'name'."""
00145     def __init__(cls, name):
00146         cls.opt_temp = 37
00147         cls.inact_temp = 65
00148         cls.substrat = 'DNA'
00149         target = enzymedict[name]
00150         cls.site = target[0]
00151         cls.size = target[1]
00152         cls.suppl = tuple(target[9])
00153         cls.freq = target[11]
00154         cls.ovhg = target[13]
00155         cls.ovhgseq = target[14]
00156         cls.bases = ()
00157         #
00158         #   Is the site palindromic?
00159         #   Important for the way the DNA is search for the site.
00160         #   Palindromic sites needs to be looked for only over 1 strand.
00161         #   Non Palindromic needs to be search for on the reverse complement
00162         #   as well.
00163         #
00164         if target[10] : cls.bases += ('Palindromic',)
00165         else : cls.bases += ('NonPalindromic',)
00166         #
00167         #   Number of cut the enzyme produce.
00168         #   0 => unknown, the enzyme has not been fully characterised.
00169         #   2 => 1 cut, (because one cut is realised by cutting 2 strands
00170         #   4 => 2 cuts, same logic.
00171         #   A little bit confusing but it is the way EMBOSS/Rebase works.
00172         #
00173         if not target[2]:
00174             #
00175             #   => undefined enzymes, nothing to be done.
00176             #
00177             cls.bases += ('NoCut','Unknown', 'NotDefined')
00178             cls.fst5 = None
00179             cls.fst3 = None
00180             cls.scd5 = None
00181             cls.scd3 = None
00182             cls.ovhg = None
00183             cls.ovhgseq = None
00184         else:
00185             #
00186             #   we will need to calculate the overhang.
00187             #
00188             if target[2] == 2:
00189                 cls.bases += ('OneCut',)
00190                 cls.fst5 = target[4]
00191                 cls.fst3 = target[5]
00192                 cls.scd5 = None
00193                 cls.scd3 = None
00194             else:
00195                 cls.bases += ('TwoCuts',)
00196                 cls.fst5 = target[4]
00197                 cls.fst3 = target[5]
00198                 cls.scd5 = target[6]
00199                 cls.scd3 = target[7]
00200             #
00201             #   Now, prepare the overhangs which will be added to the DNA 
00202             #   after the cut.
00203             #   Undefined enzymes will not be allowed to catalyse,
00204             #   they are not available commercially anyway.
00205             #   I assumed that if an enzyme cut twice the overhang will be of
00206             #   the same kind. The only exception is HaeIV. I do not deal
00207             #   with that at the moment (ie I don't include it,
00208             #   need to be fixed).
00209             #   They generally cut outside their recognition site and
00210             #   therefore the overhang is undetermined and dependent of
00211             #   the DNA sequence upon which the enzyme act.
00212             #
00213             if target[3]:
00214                 #
00215                 #   rebase field for blunt: blunt == 1, other == 0.
00216                 #   The enzyme is blunt. No overhang.
00217                 #
00218                 cls.bases += ('Blunt', 'Defined')
00219                 cls.ovhg = 0
00220             elif isinstance(cls.ovhg, int):
00221                 #
00222                 #   => overhang is sequence dependent
00223                 #
00224                 if cls.ovhg > 0:
00225                     #
00226                     #   3' overhang, ambiguous site (outside recognition site
00227                     #   or site containing ambiguous bases (N, W, R,...)
00228                     #
00229                     cls.bases += ('Ov3', 'Ambiguous')
00230                 elif cls.ovhg < 0:
00231                     #
00232                     #   5' overhang, ambiguous site (outside recognition site
00233                     #   or site containing ambiguous bases (N, W, R,...)
00234                     #
00235                     cls.bases += ('Ov5', 'Ambiguous')
00236             else:
00237                 #
00238                 #   cls.ovhg is a string => overhang is constant
00239                 #
00240                 if cls.fst5 - (cls.fst3 + cls.size) < 0:
00241                     cls.bases += ('Ov5', 'Defined')
00242                     cls.ovhg = - len(cls.ovhg)
00243                 else:
00244                     cls.bases += ('Ov3', 'Defined')
00245                     cls.ovhg = + len(cls.ovhg)
00246         #
00247         #   Next class : sensibility to methylation.
00248         #   Set by EmbossMixer from emboss_r.txt file
00249         #   Not really methylation dependent at the moment, stands rather for
00250         #   'is the site methylable?'.
00251         #   Proper methylation sensibility has yet to be implemented.
00252         #   But the class is there for further development.
00253         #
00254         if target[8]:
00255             cls.bases += ('Meth_Dep', )
00256             cls.compsite = target[12]
00257         else:
00258             cls.bases += ('Meth_Undep',)
00259             cls.compsite = target[12]
00260         #
00261         #   Next class will allow to select enzymes in function of their
00262         #   suppliers. Not essential but can be useful.
00263         #
00264         if cls.suppl:
00265             cls.bases += ('Commercially_available', )
00266         else:
00267             cls.bases += ('Not_available', )
00268         cls.bases += ('AbstractCut', 'RestrictionType')
00269         cls.__name__ = name
00270         cls.results = None
00271         cls.dna = None
00272         cls.__bases__ = cls.bases
00273         cls.charac = (cls.fst5, cls.fst3, cls.scd5, cls.scd3, cls.site)
00274         if not target[2] and cls.suppl:
00275             supp = ', '.join([suppliersdict[s][0] for s in cls.suppl])
00276             print 'WARNING : It seems that %s is both commercially available\
00277             \n\tand its characteristics are unknown. \
00278             \n\tThis seems counter-intuitive.\
00279             \n\tThere is certainly an error either in ranacompiler or\
00280             \n\tin this REBASE release.\
00281             \n\tThe supplier is : %s.' % (name, supp)
00282         return
00283 
00284 
00285 class TypeCompiler(object):
00286     
00287     """Build the different types possible for Restriction Enzymes"""
00288 
00289     def __init__(self):
00290         """TypeCompiler() -> new TypeCompiler instance."""
00291         pass
00292 
00293     def buildtype(self):
00294         """TC.buildtype() -> generator.
00295 
00296         build the new types that will be needed for constructing the
00297         restriction enzymes."""
00298         baT = (AbstractCut, RestrictionType)
00299         cuT = (NoCut, OneCut, TwoCuts)
00300         meT = (Meth_Dep, Meth_Undep)
00301         paT = (Palindromic, NonPalindromic)
00302         ovT = (Unknown, Blunt, Ov5, Ov3)
00303         deT = (NotDefined, Defined, Ambiguous)
00304         coT = (Commercially_available, Not_available)
00305         All = (baT, cuT, meT, paT, ovT, deT, coT)
00306         #
00307         #   Now build the types. Only the most obvious are left out.
00308         #   Modified even the most obvious are not so obvious.
00309         #   emboss_*.403 AspCNI is unknown and commercially available.
00310         #   So now do not remove the most obvious.
00311         #
00312         types = [(p,c,o,d,m,co,baT[0],baT[1])
00313                  for p in paT for c in cuT for o in ovT
00314                  for d in deT for m in meT for co in coT]
00315         n= 1
00316         for ty in types:
00317             dct = {}
00318             for t in ty:
00319                 dct.update(t.__dict__)
00320                 #
00321                 #   here we need to customize the dictionary.
00322                 #   i.e. types deriving from OneCut have always scd5 and scd3
00323                 #   equal to None. No need therefore to store that in a specific
00324                 #   enzyme of this type. but it then need to be in the type.
00325                 #
00326                 dct['results'] = []
00327                 dct['substrat'] = 'DNA'
00328                 dct['dna'] = None
00329                 if t == NoCut:
00330                     dct.update({'fst5':None,'fst3':None,
00331                                 'scd5':None,'scd3':None,
00332                                 'ovhg':None,'ovhgseq':None})
00333                 elif t == OneCut:
00334                     dct.update({'scd5':None, 'scd3':None})
00335             class klass(type):
00336                 def __new__(cls):
00337                     return type.__new__(cls, 'type%i'%n,ty,dct)
00338                 def __init__(cls):
00339                     super(klass, cls).__init__('type%i'%n,ty,dct)
00340             yield klass()
00341             n+=1
00342 
00343 start = '\n\
00344 #!/usr/bin/env python\n\
00345 #\n\
00346 #      Restriction Analysis Libraries.\n\
00347 #      Copyright (C) 2004. Frederic Sohm.\n\
00348 #\n\
00349 # This code is part of the Biopython distribution and governed by its\n\
00350 # license.  Please see the LICENSE file that should have been included\n\
00351 # as part of this package.\n\
00352 #\n\
00353 # This file is automatically generated - do not edit it by hand! Instead,\n\
00354 # use the tool Scripts/Restriction/ranacompiler.py which in turn uses\n\
00355 # Bio/Restriction/_Update/RestrictionCompiler.py\n\
00356 #\n\
00357 # The following dictionaries used to be defined in one go, but that does\n\
00358 # not work on Jython due to JVM limitations. Therefore we break this up\n\
00359 # into steps, using temporary functions to avoid the JVM limits.\n\
00360 \n\n'
00361 
00362 class DictionaryBuilder(object):
00363 
00364     def __init__(self, e_mail='', ftp_proxy=''):
00365         """DictionaryBuilder([e_mail[, ftp_proxy]) -> DictionaryBuilder instance.
00366 
00367         If the emboss files used for the construction need to be updated this
00368         class will download them if the ftp connection is correctly set.
00369         either in RanaConfig.py or given at run time.
00370         
00371         e_mail is the e-mail address used as password for the anonymous
00372         ftp connection.
00373 
00374         proxy is the ftp_proxy to use if any."""
00375         self.rebase_pass = e_mail or config.Rebase_password
00376         self.proxy = ftp_proxy or config.ftp_proxy
00377     
00378     def build_dict(self):
00379         """DB.build_dict() -> None.
00380 
00381         Construct the dictionary and build the files containing the new
00382         dictionaries."""
00383         #
00384         #   first parse the emboss files.
00385         #
00386         emboss_e, emboss_r, emboss_s = self.lastrebasefile()
00387         #
00388         #   the results will be stored into enzymedict.
00389         #
00390         self.information_mixer(emboss_r, emboss_e, emboss_s)
00391         emboss_r.close()
00392         emboss_e.close()
00393         emboss_s.close()
00394         #
00395         #   we build all the possible type 
00396         #
00397         tdct = {}
00398         for klass in TypeCompiler().buildtype():
00399             exec klass.__name__ +'= klass'
00400             exec "tdct['"+klass.__name__+"'] = klass"
00401 
00402         #
00403         #   Now we build the enzymes from enzymedict
00404         #   and store them in a dictionary.
00405         #   The type we will need will also be stored.
00406         #
00407 
00408         for name in enzymedict:
00409             #
00410             #   the class attributes first:
00411             #
00412             cls = newenzyme(name)
00413             #
00414             #   Now select the right type for the enzyme.
00415             #
00416             bases = cls.bases
00417             clsbases = tuple([eval(x) for x in bases])
00418             typestuff = ''
00419             for n, t in tdct.iteritems():
00420                 #
00421                 #   if the bases are the same. it is the right type.
00422                 #   create the enzyme and remember the type
00423                 #
00424                 if t.__bases__ == clsbases:
00425                     typestuff = t
00426                     typename = t.__name__
00427                 continue
00428             #
00429             #   now we build the dictionaries.
00430             #
00431             dct = dict(cls.__dict__)
00432             del dct['bases']
00433             del dct['__bases__']
00434             del dct['__name__']# no need to keep that, it's already in the type.
00435             classdict[name] = dct
00436            
00437             commonattr = ['fst5', 'fst3', 'scd5', 'scd3', 'substrat',
00438                           'ovhg', 'ovhgseq','results', 'dna']
00439             if typename in typedict:
00440                 typedict[typename][1].append(name)
00441             else:
00442                 enzlst= []
00443                 tydct = dict(typestuff.__dict__)
00444                 tydct = dict([(k,v) for k,v in tydct.iteritems() if k in commonattr])
00445                 enzlst.append(name)
00446                 typedict[typename] = (bases, enzlst)
00447             for letter in cls.__dict__['suppl']:
00448                 supplier = suppliersdict[letter]
00449                 suppliersdict[letter][1].append(name)
00450         if not classdict or not suppliersdict or not typedict:
00451             print 'One of the new dictionaries is empty.'
00452             print 'Check the integrity of the emboss file before continuing.'
00453             print 'Update aborted.'
00454             sys.exit()
00455         #
00456         #   How many enzymes this time?
00457         #
00458         print '\nThe new database contains %i enzymes.\n' % len(classdict)
00459         #
00460         #   the dictionaries are done. Build the file 
00461         #
00462         #update = config.updatefolder
00463         
00464         update = os.getcwd()
00465         results = open(os.path.join(update, 'Restriction_Dictionary.py'), 'w')
00466         print 'Writing the dictionary containing the new Restriction classes.\t',
00467         results.write(start)
00468         results.write('rest_dict = {}\n')
00469         for name in sorted(classdict) :
00470             results.write("def _temp():\n")
00471             results.write("    return {\n")
00472             for key, value in classdict[name].iteritems() :
00473                 results.write("        %s : %s,\n" % (repr(key), repr(value)))
00474             results.write("    }\n")
00475             results.write("rest_dict[%s] = _temp()\n" % repr(name))
00476             results.write("\n")
00477         print 'OK.\n'
00478         print 'Writing the dictionary containing the suppliers datas.\t\t',
00479         results.write('suppliers = {}\n')
00480         for name in sorted(suppliersdict) :
00481             results.write("def _temp():\n")
00482             results.write("    return (\n")
00483             for value in suppliersdict[name] :
00484                 results.write("        %s,\n" % repr(value))
00485             results.write("    )\n")
00486             results.write("suppliers[%s] = _temp()\n" % repr(name))
00487             results.write("\n")
00488         print 'OK.\n'
00489         print 'Writing the dictionary containing the Restriction types.\t',
00490         results.write('typedict = {}\n')
00491         for name in sorted(typedict) :
00492             results.write("def _temp():\n")
00493             results.write("    return (\n")
00494             for value in typedict[name] :
00495                 results.write("        %s,\n" % repr(value))
00496             results.write("    )\n")
00497             results.write("typedict[%s] = _temp()\n" % repr(name))
00498             results.write("\n")
00499         #I had wanted to do "del _temp" at each stage (just for clarity), but
00500         #that pushed the code size just over the Jython JVM limit. We include
00501         #one the final "del _temp" to clean up the namespace.
00502         results.write("del _temp\n")
00503         results.write("\n")
00504         print 'OK.\n'
00505         results.close()
00506         return
00507 
00508     def install_dict(self):
00509         """DB.install_dict() -> None.
00510 
00511         Install the newly created dictionary in the site-packages folder.
00512 
00513         May need super user privilege on some architectures."""
00514         print '\n ' +'*'*78 + ' \n'
00515         print '\n\t\tInstalling Restriction_Dictionary.py'
00516         try:
00517             import Bio.Restriction.Restriction_Dictionary as rd
00518         except ImportError:
00519             print '\
00520             \n Unable to locate the previous Restriction_Dictionary.py module\
00521             \n Aborting installation.'
00522             sys.exit()
00523         #
00524         #   first save the old file in Updates
00525         #
00526         old = os.path.join(os.path.split(rd.__file__)[0],
00527                            'Restriction_Dictionary.py')
00528         #update_folder = config.updatefolder
00529         update_folder = os.getcwd()
00530         shutil.copyfile(old, os.path.join(update_folder,
00531                                           'Restriction_Dictionary.old'))
00532         #
00533         #   Now test and install.
00534         #
00535         new = os.path.join(update_folder, 'Restriction_Dictionary.py')
00536         try:
00537             execfile(new)
00538             print '\
00539             \n\tThe new file seems ok. Proceeding with the installation.'   
00540         except SyntaxError:
00541             print '\
00542             \n The new dictionary file is corrupted. Aborting the installation.'
00543             return
00544         try:
00545             shutil.copyfile(new, old)
00546             print'\n\t Everything ok. If you need it a version of the old\
00547             \n\t dictionary have been saved in the Updates folder under\
00548             \n\t the name Restriction_Dictionary.old.'
00549             print '\n ' +'*'*78 + ' \n'
00550         except IOError:
00551             print '\n ' +'*'*78 + ' \n'
00552             print '\
00553             \n\t WARNING : Impossible to install the new dictionary.\
00554             \n\t Are you sure you have write permission to the folder :\n\
00555             \n\t %s ?\n\n' % os.path.split(old)[0]
00556             return self.no_install()
00557         return
00558 
00559     def no_install(self):
00560         """BD.no_install() -> None.
00561 
00562         build the new dictionary but do not install the dictionary."""
00563         print '\n ' +'*'*78 + '\n'
00564         #update = config.updatefolder
00565         try:
00566             import Bio.Restriction.Restriction_Dictionary as rd
00567         except ImportError:
00568             print '\
00569             \n Unable to locate the previous Restriction_Dictionary.py module\
00570             \n Aborting installation.'
00571             sys.exit()
00572         #
00573         #   first save the old file in Updates
00574         #
00575         old = os.path.join(os.path.split(rd.__file__)[0],
00576                            'Restriction_Dictionary.py')
00577         update = os.getcwd()
00578         shutil.copyfile(old, os.path.join(update, 'Restriction_Dictionary.old'))
00579         places = update, os.path.split(Bio.Restriction.Restriction.__file__)[0]
00580         print "\t\tCompilation of the new dictionary : OK.\
00581         \n\t\tInstallation : No.\n\
00582         \n You will find the newly created 'Restriction_Dictionary.py' file\
00583         \n in the folder : \n\
00584         \n\t%s\n\
00585         \n Make a copy of 'Restriction_Dictionary.py' and place it with \
00586         \n the other Restriction libraries.\n\
00587         \n note : \
00588         \n This folder should be :\n\
00589         \n\t%s\n" % places
00590         print '\n ' +'*'*78 + '\n'
00591         return
00592         
00593 
00594     def lastrebasefile(self):
00595         """BD.lastrebasefile() -> None.
00596 
00597         Check the emboss files are up to date and download them if they are not.
00598         """
00599         embossnames = ('emboss_e', 'emboss_r', 'emboss_s')
00600         #
00601         #   first check if we have the last update:
00602         #
00603         emboss_now = ['.'.join((x,LocalTime())) for x in embossnames]
00604         update_needed = False
00605         #dircontent = os.listdir(config.Rebase) #    local database content
00606         dircontent = os.listdir(os.getcwd())
00607         base = os.getcwd() # added for biopython current directory
00608         for name in emboss_now:
00609             if name in dircontent:
00610                 pass
00611             else:
00612                 update_needed = True
00613                 
00614         if not update_needed:
00615             #
00616             #   nothing to be done
00617             #
00618             print '\n Using the files : %s'% ', '.join(emboss_now)
00619             return tuple([open(os.path.join(base, n)) for n in emboss_now])
00620         else:
00621             #
00622             #   may be download the files.
00623             #
00624             print '\n The rebase files are more than one month old.\
00625             \n Would you like to update them before proceeding?(y/n)'
00626             r = raw_input(' update [n] >>> ')
00627             if r in ['y', 'yes', 'Y', 'Yes']:
00628                 updt = RebaseUpdate(self.rebase_pass, self.proxy)
00629                 updt.openRebase()
00630                 updt.getfiles()
00631                 updt.close()
00632                 print '\n Update complete. Creating the dictionaries.\n'
00633                 print '\n Using the files : %s'% ', '.join(emboss_now)
00634                 return tuple([open(os.path.join(base, n)) for n in emboss_now])
00635             else:
00636                 #
00637                 #   we will use the last files found without updating.
00638                 #   But first we check we have some file to use.
00639                 #
00640                 class NotFoundError(Exception):
00641                     pass
00642                 for name in embossnames:
00643                     try:
00644                         for file in dircontent:
00645                             if file.startswith(name):
00646                                 break
00647                         else:
00648                             pass
00649                         raise NotFoundError
00650                     except NotFoundError:
00651                         print "\nNo %s file found. Upgrade is impossible.\n"%name
00652                         sys.exit()
00653                     continue
00654                 pass
00655         #
00656         #   now find the last file.
00657         #
00658         last = [0]
00659         for file in dircontent:
00660             fs = file.split('.')
00661             try:
00662                 if fs[0] in embossnames and int(fs[1]) > int(last[-1]):
00663                     if last[0] : last.append(fs[1])
00664                     else : last[0] = fs[1]
00665                 else:
00666                     continue
00667             except ValueError:
00668                 continue
00669         last.sort()
00670         last = last[::-1]
00671         if int(last[-1]) < 100 : last[0], last[-1] = last[-1], last[0]
00672         for number in last:
00673             files = [(name, name+'.%s'%number) for name in embossnames]
00674             strmess = '\nLast EMBOSS files found are :\n'
00675             try:
00676                 for name,file in files:
00677                     if os.path.isfile(os.path.join(base, file)):
00678                         strmess += '\t%s.\n'%file
00679                     else:
00680                         raise ValueError
00681                 print strmess
00682                 emboss_e = open(os.path.join(base, 'emboss_e.%s'%number),'r')
00683                 emboss_r = open(os.path.join(base, 'emboss_r.%s'%number),'r')
00684                 emboss_s = open(os.path.join(base, 'emboss_s.%s'%number),'r')
00685                 return emboss_e, emboss_r, emboss_s
00686             except ValueError:
00687                 continue
00688 
00689     def parseline(self, line):
00690         line = [line[0]]+[line[1].upper()]+[int(i) for i in line[2:9]]+line[9:]
00691         name = line[0].replace("-","_")
00692         site = line[1]          #   sequence of the recognition site
00693         dna = DNA(site)  
00694         size = line[2]          #   size of the recognition site
00695         #
00696         #   Calculate the overhang.
00697         #
00698         fst5 = line[5]  #   first site sense strand 
00699         fst3 = line[6]  #   first site antisense strand
00700         scd5 = line[7]  #   second site sense strand
00701         scd3 = line[8]  #   second site antisense strand
00702         
00703         #
00704         #   the overhang is the difference between the two cut
00705         #
00706         ovhg1 = fst5 - fst3
00707         ovhg2 = scd5 - scd3
00708         
00709         #
00710         #   0 has the meaning 'do not cut' in rebase. So we get short of 1
00711         #   for the negative numbers so we add 1 to negative sites for now.
00712         #   We will deal with the record later.
00713         #
00714         
00715         if fst5 < 0 : fst5 += 1
00716         if fst3 < 0 : fst3 += 1
00717         if scd5 < 0 : scd5 += 1
00718         if scd3 < 0 : scd3 += 1
00719 
00720         if ovhg2 != 0 and ovhg1 != ovhg2:
00721             #
00722             #   different length of the overhang of the first and second cut
00723             #   it's a pain to deal with and at the moment it concerns only
00724             #   one enzyme which is not commercially available (HaeIV).
00725             #   So we don't deal with it but we check the progression
00726             #   of the affair.
00727             #   Should HaeIV become commercially available or other similar
00728             #   new enzymes be added, this might be modified.
00729             #
00730             print '\
00731             \nWARNING : %s cut twice with different overhang length each time.\
00732             \n\tUnable to deal with this behaviour. \
00733             \n\tThis enzyme will not be included in the database. Sorry.' %name
00734             print '\tChecking :',
00735             raise OverhangError
00736         if 0 <= fst5 <= size and 0 <= fst3 <= size:
00737             #
00738             # cut inside recognition site
00739             #
00740             if fst5 < fst3:
00741                 #
00742                 #  5' overhang
00743                 #
00744                 ovhg1 = ovhgseq = site[fst5:fst3]       
00745             elif fst5 > fst3:
00746                 #
00747                 #  3' overhang
00748                 #
00749                 ovhg1 = ovhgseq = site[fst3:fst5]  
00750             else:
00751                 #
00752                 #  blunt
00753                 #
00754                 ovhg1 = ovhgseq = ''            
00755             for base in 'NRYWMSKHDBV':
00756                 if base in ovhg1:
00757                     #
00758                     #   site and overhang degenerated
00759                     #
00760                     ovhgseq = ovhg1
00761                     if fst5 < fst3 :  ovhg1 = - len(ovhg1)
00762                     else : ovhg1 = len(ovhg1)
00763                     break
00764                 else:
00765                     continue
00766         elif 0 <= fst5 <= size:
00767             #
00768             #   5' cut inside the site 3' outside
00769             #
00770             if fst5 < fst3:
00771                 #
00772                 #   3' cut after the site
00773                 #
00774                 ovhgseq = site[fst5:] + (fst3 - size) * 'N' 
00775             elif fst5 > fst3:
00776                 #
00777                 #   3' cut before the site
00778                 #
00779                 ovhgseq = abs(fst3) * 'N' + site[:fst5] 
00780             else:
00781                 #
00782                 #   blunt outside
00783                 #
00784                 ovhg1 = ovhgseq = '' 
00785         elif 0 <= fst3 <= size:
00786             #
00787             #   3' cut inside the site, 5' outside
00788             #
00789             if fst5 < fst3:
00790                 #
00791                 #   5' cut before the site
00792                 #
00793                 ovhgseq = abs(fst5) * 'N' + site[:fst3]
00794             elif fst5 > fst3:
00795                 #
00796                 #   5' cut after the site
00797                 #
00798                 ovhgseq = site[fst3:] + (fst5 - size) * 'N'
00799             else:
00800                 #
00801                 #   should not happend
00802                 #
00803                 raise ValueError('Error in #1')
00804         elif fst3 < 0 and size < fst5:
00805             #
00806             #   3' overhang. site is included.
00807             #
00808             ovhgseq = abs(fst3)*'N' + site + (fst5-size)*'N'
00809         elif fst5 < 0 and size <fst3:
00810             #
00811             #   5' overhang. site is included.
00812             #
00813             ovhgseq = abs(fst5)*'N' + site + (fst3-size)*'N'
00814         else:
00815             #
00816             #   5' and  3' outside of the site
00817             #
00818             ovhgseq = 'N' * abs(ovhg1)
00819         #
00820         #   Now line[5] to [8] are the location of the cut but we have to
00821         #   deal with the weird mathematics of biologists.
00822         #
00823         #   EMBOSS sequence numbering give:
00824         #                 DNA = 'a c g t A C G T'
00825         #                             -1 1 2 3 4
00826         #
00827         #   Biologists do not know about 0. Too much use of latin certainly.
00828         #
00829         #   To compensate, we add 1 to the positions if they are negative.
00830         #   No need to modify 0 as it means no cut and will not been used.
00831         #   Positive numbers should be ok since our sequence starts 1.
00832         #
00833         #   Moreover line[6] and line[8] represent cut on the reverse strand.
00834         #   They will be used for non palindromic sites and sre.finditer
00835         #   will detect the site in inverse orientation so we need to add the
00836         #   length of the site to compensate (+1 if they are negative).
00837         #
00838         for x in (5, 7):
00839             if line[x] < 0 : line[x] += 1
00840         for x in (6, 8):
00841             if line[x] > 0 : line[x] -= size
00842             elif line[x] < 0 : line[x] = line[x] - size + 1
00843         #
00844         #   now is the site palindromic?
00845         #   produce the regular expression which correspond to the site.
00846         #   tag of the regex will be the name of the enzyme for palindromic
00847         #   enzymesband two tags for the other, the name for the sense sequence
00848         #   and the name with '_as' at the end for the antisense sequence.
00849         #
00850         rg = ''
00851         if is_palindrom(dna):
00852             line.append(True)
00853             rg = ''.join(['(?P<', name, '>', regex(site.upper()), ')'])
00854         else:
00855             line.append(False)
00856             sense = ''.join(['(?P<', name, '>', regex(site.upper()), ')'])
00857             antisense = ''.join(['(?P<', name, '_as>',
00858                                  regex(Antiparallel(dna)), ')'])
00859             rg = sense + '|' + antisense
00860         #
00861         #   exact frequency of the site. (ie freq(N) == 1, ...)
00862         #
00863         f = [4/len(dna_alphabet[l]) for l in site.upper()]
00864         freq = reduce(lambda x, y : x*y, f)
00865         line.append(freq)
00866         #
00867         #   append regex and ovhg1, they have not been appended before not to
00868         #   break the factory class. simply to leazy to make the changes there.
00869         #
00870         line.append(rg)
00871         line.append(ovhg1)
00872         line.append(ovhgseq)
00873         return line
00874 
00875     def removestart(self, file):
00876         #
00877         #   remove the heading of the file.
00878         #
00879         return [l for l in itertools.dropwhile(lambda l:l.startswith('#'),file)]
00880 
00881     def getblock(self, file, index):
00882         #
00883         #   emboss_r.txt, separation between blocks is //
00884         #
00885         take = itertools.takewhile
00886         block = [l for l in take(lambda l :not l.startswith('//'),file[index:])]
00887         index += len(block)+1
00888         return block, index
00889 
00890     def get(self, block):
00891         #
00892         #   take what we want from the block.
00893         #   Each block correspond to one enzyme.
00894         #   block[0] => enzyme name
00895         #   block[3] => methylation (position and type)
00896         #   block[5] => suppliers (as a string of single letter)
00897         #
00898         bl3 = block[3].strip() 
00899         if not bl3 : bl3 = False #  site is not methylable
00900         return (block[0].strip(), bl3, block[5].strip())
00901 
00902     def information_mixer(self, file1, file2, file3):
00903         #
00904         #   Mix all the information from the 3 files and produce a coherent
00905         #   restriction record.
00906         #     
00907         methfile = self.removestart(file1)
00908         sitefile = self.removestart(file2)
00909         supplier = self.removestart(file3)
00910         
00911         i1, i2= 0, 0
00912         try:
00913             while True:
00914                 block, i1 = self.getblock(methfile, i1)
00915                 bl = self.get(block)
00916                 line = (sitefile[i2].strip()).split()
00917                 name = line[0]
00918                 if name == bl[0]:
00919                     line.append(bl[1])  #   -> methylation
00920                     line.append(bl[2])  #   -> suppliers
00921                 else:
00922                     bl = self.get(oldblock)
00923                     if line[0] == bl[0]:
00924                         line.append(bl[1])
00925                         line.append(bl[2])
00926                         i2 += 1
00927                     else:
00928                         raise TypeError  
00929                 oldblock = block
00930                 i2 += 1
00931                 try:
00932                     line = self.parseline(line)
00933                 except OverhangError :          #   overhang error 
00934                     n = name                    #   do not include the enzyme
00935                     if not bl[2]:
00936                         print 'Anyway, %s is not commercially available.\n' %n
00937                     else:
00938                         print 'Unfortunately, %s is commercially available.\n'%n
00939 
00940                     continue 
00941                 #Hyphens can't be used as a Python name, nor as a
00942                 #group name in a regular expression.
00943                 name = name.replace("-","_")
00944                 if name in enzymedict:
00945                     #
00946                     #   deal with TaqII and its two sites.
00947                     #
00948                     print '\nWARNING :',
00949                     print name, 'has two different sites.\n'
00950                     other = line[0].replace("-","_")
00951                     dna = DNA(line[1])
00952                     sense1 = regex(dna.tostring())
00953                     antisense1 = regex(Antiparallel(dna))
00954                     dna = DNA(enzymedict[other][0])
00955                     sense2 = regex(dna.tostring())
00956                     antisense2 = regex(Antiparallel(dna))
00957                     sense = '(?P<'+other+'>'+sense1+'|'+sense2+')'
00958                     antisense = '(?P<'+other+'_as>'+antisense1+'|'+antisense2 + ')'
00959                     reg = sense + '|' + antisense 
00960                     line[1] = line[1] + '|' + enzymedict[other][0]
00961                     line[-1] = reg
00962                 #
00963                 #   the data to produce the enzyme class are then stored in
00964                 #   enzymedict.
00965                 #
00966                 enzymedict[name] = line[1:] #element zero was the name
00967         except IndexError:
00968             pass
00969         for i in supplier:
00970             #
00971             #   construction of the list of suppliers.
00972             #
00973             t = i.strip().split(' ', 1)
00974             suppliersdict[t[0]] = (t[1], [])
00975         return
00976 
00977