Back to index

python-biopython  1.60
getgene.py
Go to the documentation of this file.
00001 #!/usr/bin/env python
00002 # Created: Sun Oct 15 16:16:20 2000
00003 # Last changed: Time-stamp: <01/02/15 09:01:27 thomas>
00004 # thomas@cbs.dtu.dk, http://www.cbs.dtu.dk/thomas
00005 # File: getgene.py
00006 
00007 """ Example code to index a non-reduntant protein database of
00008     SwissProt + TrEMBL for fast lookup and retrieval.
00009     
00010     To build the database and index it:
00011         cd /opt/bio/data/
00012         wget -N -nd -r -l1 -A'.dat.Z' ftp://expasy.cbr.nrc.ca/databases/sp_tr_nrdb/
00013         zcat *.dat.Z > nr.dat
00014         ./getgene.py --index nr.dat
00015         setenv PYPHY '/opt/bio/data'
00016         
00017     To retrieve entries from the command line:
00018     ./getgene.py EFTU_ECOLI
00019 
00020     To use from a python script:
00021     from getgene import DB_Index
00022 
00023     db_index = DB_Index()
00024 
00025     # retrieve a complete entry:
00026     db_index.Get('EFTU_ECOLI')
00027 
00028     # get organism, lineage and gene
00029     db_index.Get_OS_OC_GN('EFTU_ECOLI')
00030 """    
00031     
00032 
00033 
00034 import string, re
00035 import os, sys
00036 import gdbm
00037 
00038 class DB_Index:
00039     def __init__(self, open = 1):
00040         if open: self.Open()
00041             
00042     def Create(self, infile, outfile):
00043         db = gdbm.open(outfile, 'n')
00044         fid = open(infile)
00045 
00046         db['datafile'] = os.path.abspath(infile)
00047         
00048         while 1:
00049             line = fid.readline()
00050             if not line or not len(line): break
00051 
00052             if line[:3] == 'ID ':
00053                 id = string.split(line)[1]
00054                 start = fid.tell() - len(line)
00055 
00056             elif line[:3] == 'AC ':
00057                 acc = string.split(line)[1]
00058                 if acc[-1] ==';': acc = acc[:-1]
00059 
00060             elif line[:2] =='//':
00061                 stop = fid.tell()
00062                 try:
00063                     value = '%d %d' % (start, stop)
00064                     db[id] = value
00065                     db[acc] = value
00066                     id, acc, start, stop = None, None, None, None
00067                 except:
00068                     print 'AARRGGGG', start, stop, type(start), type(stop)
00069                     print id, acc
00070                     
00071         db.close()
00072         fid.close()
00073 
00074     def Open(self, indexfile = None):
00075         if not indexfile:
00076             indexfile = os.path.join(os.environ['PYPHY'],'nr.dat.indexed')
00077 
00078         self.db = gdbm.open(indexfile)
00079         self.datafile = self.db['datafile']
00080         self.fid = open(self.datafile)
00081 
00082     def Close(self):
00083         self.db.close()
00084         
00085     def Get(self, id):
00086         try:
00087             values = self.db[id]
00088         except:
00089             return None
00090         start, stop= map(int,string.split(values))
00091         self.fid.seek(start)
00092         txt = self.fid.read(stop - start)
00093         return txt
00094         
00095     def Get_Organism(self, id):
00096         entry = self.Get(id)
00097         if not entry: return None
00098         for line in string.split(entry, '\n'):
00099             if line[0:5] == 'OS   ':
00100                 OS = string.strip(line[5:])
00101                 if OS[-1] ==".": OS = OS[0:-1]
00102                 return OS
00103             if line[0:2] =="//": break
00104         return OS
00105 
00106     def FixOS(self, os):
00107         os = string.split(os,',')[0]
00108         os = string.split(os,'(')[0]
00109         return string.strip(os)
00110     
00111     def Get_Taxonomy(self, id):
00112         entry = self.Get(id)
00113         if not entry: return None
00114         OC = ""
00115         for line in string.split(entry, '\n'):
00116             if line[0:5] == 'OC   ':
00117                 OC = OC + string.strip(line[5:])
00118                 if OC[-1] ==".": OC = OC[0:-1]
00119             if line[0:2] =="//": break
00120         return OC
00121     
00122     def Get_Kingdom(self, id):
00123         res = self.Get_Taxonomy(id)
00124         #print id, res
00125         if not res: return "U"
00126         kd = string.strip(string.split(res,";")[0])
00127         if kd == "Eubacteria" or kd == "Prokaryota" or kd == "Bacteria": return "B"
00128         elif kd == "Eukaryota" or kd =="Eukaryotae": return "E"
00129         elif kd == "Archaebacteria" or kd == "Archaea": return "A"
00130         elif kd == "Viridae" or kd == "Viruses": return "V"
00131         else:
00132             print kd, "UNKNOWN"
00133             return "U"
00134         
00135     def Get_Gene(self, id):
00136         entry = self.Get(id)
00137         if not entry: return None
00138         GN = ''
00139         for line in string.split(entry, '\n'):
00140             if line[0:5] == 'GN   ':
00141                 GN = string.strip(line[5:])
00142                 if GN[-1] ==".": GN = GN[0:-1]
00143                 return GN
00144             if line[0:2] =="//": break
00145         return GN
00146 
00147 
00148     def Get_OS_OC_GN(self, id):
00149         entry = self.Get(id)
00150         if not entry: return None, None, None
00151         OS, OC, GN = "","",""
00152         for line in string.split(entry, '\n'):
00153             if line[0:5] == 'OS   ':
00154                 OS = string.strip(line[5:])
00155                 if OS[-1] ==".": OS = OS[0:-1]
00156             if line[0:5] == 'OC   ':
00157                 OC = OC + string.strip(line[5:])
00158                 if OC[-1] ==".": OC = OC[0:-1]
00159             if line[0:5] == 'GN   ':
00160                 GN = string.strip(line[5:])
00161                 if GN[-1] ==".": GN = GN[0:-1]
00162             if line[0:2] =="//": break
00163         return OS, OC, GN
00164     
00165     def Get_OS_OC_OG(self, id):
00166         entry = self.Get(id)
00167         if not entry: return None, None, None
00168         OS, OC, OG = "","",""
00169         for line in string.split(entry, '\n'):
00170             if line[0:5] == 'OS   ':
00171                 OS = string.strip(line[5:])
00172                 if OS[-1] ==".": OS = OS[0:-1]
00173             if line[0:5] == 'OC   ':
00174                 OC = OC + string.strip(line[5:])
00175                 if OC[-1] ==".": OC = OC[0:-1]
00176             if line[0:5] == 'OG   ':
00177                 OG = string.strip(line[5:])
00178                 if OG[-1] ==".": OG = OG[0:-1]
00179             if line[0:2] =="//": break
00180         return OS, OC, OG
00181 
00182     def Get_SQ(self, id, fasta = 1):
00183         entry = self.Get(id)
00184         if not entry: return ""
00185         SQ = ""
00186         record = 0
00187         for line in string.split(entry, '\n'):
00188             if record: SQ = SQ + string.strip(line[5:])
00189             if line[0:5] == 'SQ   ': record = 1
00190             if line[0:2] =="//": break
00191 
00192         SQ = re.sub('[ \n]','',SQ)
00193         if fasta: SQ = '>%s\n%s' % (id, re.sub('(.{60})','\\1\n',SQ))
00194         return SQ
00195 
00196     def Get_XX(self, id, xx):
00197         entry = self.Get(id)
00198         if not entry: return ""
00199         XX = ""
00200         for line in string.split(entry, '\n'):
00201             if line[0:5] == '%s   ' % xx:
00202                 XX = XX + string.strip(line[5:])
00203                 if XX[-1] ==".": XX = XX[0:-1]
00204             if line[0:2] =="//": break
00205         return XX
00206         
00207     def Get_Keywords(self, id):
00208         entry = self.Get(id)
00209         if not entry: return []
00210         keywords = []
00211         for line in string.split(entry, '\n'):
00212             if line[0:5] == 'KW   ':
00213                 for i in string.split(string.strip(line[5:]),';'):
00214                     kw = string.strip(i)
00215                     if len(kw) < 2: continue
00216                     if kw[-1] == '.': kw = kw[:-1]
00217                     keywords.append(kw)
00218             if line[0:2] =="//": break
00219         return keywords
00220 
00221 
00222 
00223 def help(exit = 0):
00224     name = os.path.basename(sys.argv[0])
00225     print 'Usage: %s <db> <gene ID>' % name
00226     print '  or   %s --index <db.dat>' % name
00227     if exit: sys.exit(0)
00228 
00229 if __name__ == '__main__':
00230     pyphy_home = os.environ.get('PYPHY', None)
00231 
00232     if len(sys.argv) == 1: help(exit = 1)
00233     db_index = DB_Index(open = 0)
00234     func = db_index.Get
00235     for arg in sys.argv[1:]:
00236         if arg == '--index':
00237             sys.argv.remove(arg)
00238             infile = sys.argv[1]
00239             outfile = os.path.basename(infile) + '.indexed'
00240             db_index.Create(infile, outfile)
00241             sys.exit(0)
00242 
00243         elif arg[:4] == '-Get':
00244             sys.argv.remove(arg)
00245             func = getattr(db_index, arg[1:])
00246 
00247         elif arg == '-h' or arg == '--help': help(exit = 1)
00248 
00249     db = 'nr.dat'
00250     if len(sys.argv) == 2:
00251         # shortcut, mostly we use nr.dat so dont bother to name it
00252         ids = sys.argv[1:]
00253     else:
00254         try:
00255              db = sys.argv[1]
00256              ids = sys.argv[2:]
00257         except:
00258             help(exit = 1)
00259         
00260     dbfile = os.path.join(pyphy_home, db + '.indexed')
00261     db_index.Open(dbfile)
00262     for id in ids:
00263         #print db_index.Get(id)
00264         print func(id)
00265         
00266 
00267