Back to index

python-biopython  1.60
scop_pdb.py
Go to the documentation of this file.
00001 #!/usr/bin/env python
00002 
00003 # Copyright 2001 by Gavin E. Crooks.  All rights reserved.
00004 # This code is part of the Biopython distribution and governed by its
00005 # license.  Please see the LICENSE file that should have been included
00006 # as part of this package.
00007 
00008 
00009 import getopt
00010 import sys
00011 import types
00012 import urllib
00013 
00014 from Bio.SCOP import *
00015 
00016 def usage() :
00017     print \
00018 """Extract a SCOP domain's ATOM and HETATOM records from the relevant PDB file.
00019 
00020 For example:
00021   scop_pdb.py astral-rapid-access-1.55.raf dir.cla.scop.txt_1.55 d3hbib_
00022 
00023 A result file, d3hbib_.ent, will be generated in the working directory.
00024 
00025 The required RAF file can be found at [http://astral.stanford.edu/raf.html],
00026 and the SCOP CLA file at [http://scop.berkeley.edu/parse/index.html].
00027 
00028 Note: Errors will occur if the PDB file has been altered since the creation
00029 of the SCOP CLA and ASTRAL RAF files.
00030  
00031 Usage: scop_pdb [-h] [-i file] [-o file] [-p pdb_url_prefix]
00032                  raf_url cla_url [sid] [sid] [sid] ...
00033 
00034  -h        -- Print this help message.
00035 
00036  -i file   -- Input file name. Each line should start with an sid (Scop domain
00037               identifier). Blank lines, and lines starting with '#' are
00038               ignored. If file is '-' then data is read from stdin. If not
00039               given then sids are taken from the command line. 
00040 
00041  -o file   -- Output file name. If '-' then data is written to stdout. If not
00042               given then data is written to files named sid+'.ent'.
00043 
00044  -p pdb_url-- A URL for PDB files. The token '%s' will be replaced with the
00045               4 character PDB ID. If the pdb_url is not given then the latest
00046               PDB file is retrieved directly from rcsb.org.
00047 
00048 
00049   raf_url  -- The URL or filename of an ASTRAL Rapid Access File sequence map.
00050               See [http://astral.stanford.edu/raf.html]
00051 
00052   cla_url  -- The URL or filename of a SCOP parsable CLA file.
00053               See [http://scop.berkeley.edu/parse/index.html]
00054 
00055   sid      -- A SCOP domain identifier. e.g. d3hbib_ 
00056 """
00057 
00058 
00059 
00060 default_pdb_url = "http://www.rcsb.org/pdb/cgi/export.cgi/somefile.pdb?" \
00061                       "format=PDB&pdbId=%s&compression=None"
00062 #default_pdb_url = "file://usr/local/db/pdb/data/010331/snapshot/all/pdb%s.ent"
00063 
00064 def open_pdb(pdbid, pdb_url=None) :
00065     if pdb_url ==None: pdb_url = default_pdb_url
00066     url = pdb_url % pdbid
00067     fn, header = urllib.urlretrieve(url)
00068     return open(fn)
00069 
00070 
00071 def main():
00072     try:
00073         opts, args = getopt.getopt(sys.argv[1:], "hp:o:i:",
00074              ["help", "usage","pdb=","output=","input="])
00075     except getopt.GetoptError:
00076         # print help information and exit:
00077         usage()
00078         sys.exit(2)
00079 
00080     input= None
00081     in_handle = None
00082     output = None
00083     pdb_url = None
00084     cla_url = None
00085     raf_url = None
00086     
00087     for o, a in opts:
00088         if o in ("-h", "--help","--usage"):
00089             usage()
00090             sys.exit()
00091         elif o in ("-o", "--output"):
00092             output = a
00093         elif o in ("-i", "--input"):
00094             input = a
00095         elif o in ("-p", "--pdb"):
00096             pdb_url = a
00097 
00098     if len(args) <2 :
00099         print >>sys.stderr, \
00100              "Not enough arguments. Try --help for more details."
00101         sys.exit(2)
00102 
00103     raf_url = args[0]
00104     cla_url = args[1]
00105     
00106     (raf_filename, headers) = urllib.urlretrieve(raf_url)
00107     seqMapIndex = Raf.SeqMapIndex(raf_filename)
00108 
00109     (cla_filename, headers) = urllib.urlretrieve(cla_url)
00110     claIndex = Cla.Index(cla_filename)
00111 
00112     if input == None :
00113         sids = args[2:]
00114     elif input == '-' :
00115         sids = sys.stdin.xreadlines()
00116     else :
00117         in_handle = open(input)
00118         sids = in_handle.xreadlines()
00119 
00120     try:
00121         for sid in sids :
00122             if not sid or sid[0:1]=='#': continue
00123             id = sid[0:7]
00124             pdbid=id[1:5]
00125             s = pdbid[0:1]
00126             if s=='0' or s=='s' :
00127                 print >>sys.stderr,"No coordinates for domain "+id
00128                 continue
00129 
00130             if output == None :
00131                 filename = id+".ent"
00132                 out_handle = open(filename, "w+")
00133             elif output == '-' :
00134                 out_handle = sys.stdout
00135             else :
00136                 out_handle = open(output, "w+")
00137                 
00138             try:
00139                 try:
00140                     claRec = claIndex[id]
00141                     residues = claRec.residues
00142                     seqMap = seqMapIndex.getSeqMap(residues)
00143                     pdbid = residues.pdbid
00144 
00145                     f = open_pdb(pdbid, pdb_url) 
00146                     try:
00147                         seqMap.getAtoms(f, out_handle)
00148                     finally :
00149                         f.close()
00150                 except (IOError, KeyError, RuntimeError), e:
00151                     print >>sys.stderr, "I cannot do SCOP domain ",id,":",e
00152             finally :
00153                 out_handle.close()
00154     finally :
00155         if in_handle != None :
00156             in_handle.close()
00157                 
00158                 
00159 if __name__ == "__main__":
00160     main()
00161 
00162 
00163 
00164 
00165 
00166 
00167 
00168 
00169 
00170 
00171 
00172 
00173 
00174 
00175 
00176 
00177 
00178