Back to index

python-biopython  1.60
PDBList.py
Go to the documentation of this file.
00001 #!/usr/bin/env python
00002 #
00003 # PDBList.py
00004 #
00005 # A tool for tracking changes in the PDB Protein Structure Database.
00006 #
00007 # Version 2.0
00008 #
00009 # (c) 2003 Kristian Rother
00010 # This work was supported by the German Ministry of Education
00011 # and Research (BMBF). Project http://www.bcbio.de
00012 # 
00013 # Contact the author
00014 #    homepage : http://www.rubor.de/bioinf
00015 #    email    : krother@genesilico.pl
00016 #
00017 #
00018 # This Code is released under the conditions of the Biopython license.
00019 # It may be distributed freely with respect to the original author.
00020 # Any maintainer of the BioPython code may change this notice
00021 # when appropriate.
00022 
00023 """Access the PDB over the internet (for example to download structures)."""
00024 
00025 import gzip
00026 import os
00027 import shutil
00028 from urllib2 import urlopen as _urlopen
00029 import warnings
00030 
00031 from Bio import BiopythonDeprecationWarning
00032 
00033 
class PDBList(object):
    """Quick access to the structure lists on the PDB server or its mirrors.

    The structure lists contain four-letter PDB codes, indicating that
    structures are new, have been modified or are obsolete. The lists are
    released on a weekly basis.

    The class also provides methods to retrieve PDB files from the server.
    To use it properly, prepare a directory /pdb or the like, where PDB
    files are stored.

    If you want to use this module from behind a proxy, add the proxy
    variable to your environment, e.g. in Unix
    export HTTP_PROXY='http://realproxy.charite.de:888'
    (This can also be added to ~/.bashrc)
    """

    PDB_REF = """
    The Protein Data Bank: a computer-based archival file for macromolecular structures.
    F.C.Bernstein, T.F.Koetzle, G.J.B.Williams, E.F.Meyer Jr, M.D.Brice, J.R.Rodgers, O.Kennard, T.Shimanouchi, M.Tasumi
    J. Mol. Biol. 112 pp. 535-542 (1977)
    http://www.pdb.org/.
    """

    alternative_download_url = "http://www.rcsb.org/pdb/files/"
    # just append PDB code to this, and then it works.

    def __init__(self, server='ftp://ftp.wwpdb.org', pdb=None,
                 obsolete_pdb=None):
        """Initialize the class with the default server or a custom one.

        @param server: base URL of the PDB server
        @type server: string

        @param pdb: local directory for PDB files (default: the working
                    directory at the time of the call)
        @type pdb: string

        @param obsolete_pdb: local directory for obsolete PDB files
                             (default: <pdb>/obsolete, created if missing)
        @type obsolete_pdb: string
        """
        # remote pdb server
        self.pdb_server = server

        # local pdb file tree; os.getcwd() is looked up here and not in the
        # signature, where it would be frozen once at import time.
        if pdb is None:
            pdb = os.getcwd()
        self.local_pdb = pdb

        # local file tree for obsolete pdb files
        if obsolete_pdb:
            self.obsolete_pdb = obsolete_pdb
        else:
            self.obsolete_pdb = os.path.join(self.local_pdb, 'obsolete')
            if not os.access(self.obsolete_pdb, os.F_OK):
                os.makedirs(self.obsolete_pdb)

        # variables for command-line options
        self.overwrite = 0
        self.flat_tree = 0

    def get_status_list(self, url):
        """Retrieve the list of pdb codes in a weekly pdb status file.

        Used by get_recent_changes. The list files parsed by this method
        contain one PDB name per line; blank lines are ignored.

        @param url: location of the status file
        @type url: string

        @return: the PDB codes, in file order
        @rtype: list
        """
        handle = _urlopen(url)
        answer = []
        try:
            for line in handle:
                pdb = line.strip()
                if not pdb:
                    continue
                # Sanity check against mis-reading the data.
                assert len(pdb) == 4, \
                    "Unexpected entry %r in %s" % (pdb, url)
                answer.append(pdb)
        finally:
            handle.close()
        return answer

    def get_recent_changes(self):
        """Return three lists of the newest weekly entries (added,mod,obsolete).

        Reads the directory with changed entries from the PDB server and
        returns a list of three lists of PDB codes: the new, modified and
        obsolete entries of the most recent week. The directory with the
        largest numerical name is used. Raises IndexError if the status
        directory contains no numerically named entry.

        Contents of the data/status dir (20031013 would be used);
        drwxrwxr-x   2 1002     sysadmin     512 Oct  6 18:28 20031006
        drwxrwxr-x   2 1002     sysadmin     512 Oct 14 02:14 20031013
        -rw-r--r--   1 1002     sysadmin    1327 Mar 12  2001 README

        @return: [added, modified, obsolete]
        @rtype: list
        """
        handle = _urlopen(self.pdb_server + '/pub/pdb/data/status/')
        try:
            listing = handle.readlines()
        finally:
            handle.close()

        # The entry name is the last column of each listing line.
        releases = []
        for line in listing:
            fields = line.split()
            if fields and fields[-1].isdigit():
                releases.append(fields[-1])
        if not releases:
            raise IndexError("No weekly status directory found on %s"
                             % self.pdb_server)
        recent = max(releases, key=int)

        path = self.pdb_server + '/pub/pdb/data/status/%s/' % (recent)
        # Retrieve the lists
        added = self.get_status_list(path + 'added.pdb')
        modified = self.get_status_list(path + 'modified.pdb')
        obsolete = self.get_status_list(path + 'obsolete.pdb')
        return [added, modified, obsolete]

    def get_all_entries(self):
        """Retrieve a big file containing all the PDB entries and some
        annotation to them.

        @return: the PDB codes in the index file
        @rtype: list
        """
        print("retrieving index file. Takes about 5 MB.")
        handle = _urlopen(self.pdb_server +
                          '/pub/pdb/derived_data/index/entries.idx')
        try:
            lines = handle.readlines()
        finally:
            handle.close()
        # The first two lines are skipped; the code is the first four
        # characters of every remaining non-trivial line.
        return [line[:4] for line in lines[2:] if len(line) > 4]

    def get_all_obsolete(self):
        """Return a list of all obsolete entries ever in the PDB.

        Gets and parses the file from the PDB server in the format
        (the first pdb_code column is the one used). The file looks
        like this:

         LIST OF OBSOLETE COORDINATE ENTRIES AND SUCCESSORS
        OBSLTE    31-JUL-94 116L     216L
        ...
        OBSLTE    29-JAN-96 1HFT     2HFT
        OBSLTE    21-SEP-06 1HFV     2J5X
        OBSLTE    21-NOV-03 1HG6
        OBSLTE    18-JUL-84 1HHB     2HHB 3HHB
        OBSLTE    08-NOV-96 1HID     2HID
        OBSLTE    01-APR-97 1HIU     2HIU
        OBSLTE    14-JAN-04 1HKE     1UUZ
        ...

        @return: the obsolete PDB codes, in file order
        @rtype: list
        """
        handle = _urlopen(self.pdb_server +
                          '/pub/pdb/data/status/obsolete.dat')
        # Extract pdb codes. Could use a list comprehension, but I want
        # to include an assert to check for mis-reading the data.
        obsolete = []
        try:
            for line in handle:
                if not line.startswith("OBSLTE "):
                    continue
                pdb = line.split()[2]
                assert len(pdb) == 4, \
                    "Unexpected entry %r in obsolete.dat" % (pdb,)
                obsolete.append(pdb)
        finally:
            handle.close()
        return obsolete

    def _fetch_to_file(self, url, filename):
        """Download url into filename, closing both handles even on error."""
        handle = _urlopen(url)
        try:
            archive = open(filename, 'wb')
            try:
                shutil.copyfileobj(handle, archive)
            finally:
                archive.close()
        finally:
            handle.close()

    def _gunzip(self, archive, target):
        """Decompress the gzip file archive into the file target."""
        gz = gzip.open(archive, 'rb')
        try:
            out = open(target, 'wb')
            try:
                shutil.copyfileobj(gz, out)
            finally:
                out.close()
        finally:
            gz.close()

    def retrieve_pdb_file(self, pdb_code, obsolete=0, compression=None,
                          uncompress=None, pdir=None):
        """Retrieve a PDB structure file from the PDB server and
        store it, uncompressed, in a local file tree.

        If the file is already present and self.overwrite is not set,
        nothing is downloaded. If obsolete==1, the file will be fetched
        from the obsolete tree of the server and saved in the special
        local tree for obsolete files.
        compression and uncompress are deprecated and do nothing: all
        archives are in .gz format and are unpacked with the gzip module.

        @param pdb_code: four-letter PDB code (case-insensitive)
        @type pdb_code: string

        @param pdir: put the file in this directory (default: create a PDB-style directory tree)
        @type pdir: string

        @return: filename
        @rtype: string
        """
        # Alert the user about deprecated parameters
        if compression is not None:
            warnings.warn("PDB file servers now only host .gz archives: "
                          "the compression parameter will not do anything",
                          BiopythonDeprecationWarning)
        if uncompress is not None:
            warnings.warn("Decompression is handled with the gzip module: "
                          "the uncompression parameter will not do anything",
                          BiopythonDeprecationWarning)

        # Get the structure
        code = pdb_code.lower()
        if obsolete:
            remote_tree = 'obsolete'
        else:
            remote_tree = 'divided'
        url = (self.pdb_server +
               '/pub/pdb/data/structures/%s/pdb/%s/pdb%s.ent.gz'
               % (remote_tree, code[1:3], code))

        # In which dir to put the pdb file?
        if pdir is not None:
            # Put in specified directory
            path = pdir
        else:
            if obsolete:
                path = self.obsolete_pdb
            else:
                path = self.local_pdb
            if not self.flat_tree:
                # Put in PDB-style directory tree
                path = os.path.join(path, code[1:3])

        if not os.access(path, os.F_OK):
            os.makedirs(path)

        # the downloaded archive
        filename = os.path.join(path, "pdb%s.ent.gz" % code)
        # the final uncompressed file
        final_file = os.path.join(path, "pdb%s.ent" % code)

        # Skip download if the file already exists
        if not self.overwrite:
            if os.path.exists(final_file):
                print("Structure exists: '%s' " % final_file)
                return final_file

        # Retrieve and uncompress the file
        print("Downloading PDB structure '%s'..." % pdb_code)
        try:
            self._fetch_to_file(url, filename)
            try:
                self._gunzip(filename, final_file)
            except Exception:
                # A truncated .ent file would be taken for a complete
                # structure by the "already exists" check on later runs.
                if os.path.exists(final_file):
                    os.remove(final_file)
                raise
        finally:
            # The archive is only an intermediate; never leave it behind.
            if os.path.exists(filename):
                os.remove(filename)

        return final_file

    def update_pdb(self):
        """
        I guess this is the 'most wanted' function from this module.
        It gets the weekly lists of new and modified pdb entries and
        automatically downloads the according PDB files. Local files of
        entries that became obsolete are moved to the obsolete tree.
        You can call this module as a weekly cronjob.
        """
        assert os.path.isdir(self.local_pdb), \
            "%s is not a directory" % self.local_pdb
        assert os.path.isdir(self.obsolete_pdb), \
            "%s is not a directory" % self.obsolete_pdb

        new, modified, obsolete = self.get_recent_changes()

        for pdb_code in new + modified:
            try:
                self.retrieve_pdb_file(pdb_code)
            except Exception:
                # Best effort: one failed entry must not stop the update.
                print('error %s\n' % pdb_code)
                # you can insert here some more log notes that
                # something has gone wrong.

        # Move the obsolete files to a special folder
        for pdb_code in obsolete:
            # Local files are stored under the lower-cased code
            # (see retrieve_pdb_file).
            code = pdb_code.lower()
            basename = 'pdb%s.ent' % code
            if self.flat_tree:
                old_file = os.path.join(self.local_pdb, basename)
                new_dir = self.obsolete_pdb
            else:
                old_file = os.path.join(self.local_pdb, code[1:3], basename)
                new_dir = os.path.join(self.obsolete_pdb, code[1:3])
            new_file = os.path.join(new_dir, basename)
            if os.path.isfile(old_file):
                if not os.path.isdir(new_dir):
                    os.mkdir(new_dir)
                try:
                    shutil.move(old_file, new_file)
                except Exception:
                    print("Could not move %s to obsolete folder" % old_file)
            elif os.path.isfile(new_file):
                print("Obsolete file %s already moved" % old_file)
            else:
                print("Obsolete file %s is missing" % old_file)

    def _write_listfile(self, entries, listfile):
        """Write the PDB codes in entries to listfile, one per line."""
        outfile = open(listfile, 'w')
        try:
            outfile.writelines((x + '\n' for x in entries))
        finally:
            outfile.close()

    def download_entire_pdb(self, listfile=None):
        """Retrieve all PDB entries not present in the local PDB copy.

        Writes a list file containing all PDB codes (optional, if listfile is
        given).

        @param listfile: name of the list file to write
        @type listfile: string
        """
        entries = self.get_all_entries()
        for pdb_code in entries:
            self.retrieve_pdb_file(pdb_code)
        # Write the list
        if listfile:
            self._write_listfile(entries, listfile)

    def download_obsolete_entries(self, listfile=None):
        """Retrieve all obsolete PDB entries not present in the local obsolete
        PDB copy.

        Writes a list file containing all PDB codes (optional, if listfile is
        given).

        @param listfile: name of the list file to write
        @type listfile: string
        """
        entries = self.get_all_obsolete()
        for pdb_code in entries:
            self.retrieve_pdb_file(pdb_code, obsolete=1)

        # Write the list
        if listfile:
            self._write_listfile(entries, listfile)

    def get_seqres_file(self, savefile='pdb_seqres.txt'):
        """Retrieve a (big) file containing all the sequences of PDB entries
        and write it to a file.

        @param savefile: name of the local file to write
        @type savefile: string
        """
        print("retrieving sequence file. Takes about 15 MB.")
        handle = _urlopen(self.pdb_server +
                          '/pub/pdb/derived_data/pdb_seqres.txt')
        # Download completely before opening savefile, so that a failed
        # transfer never leaves a truncated file behind.
        try:
            lines = handle.readlines()
        finally:
            handle.close()
        outfile = open(savefile, 'w')
        try:
            outfile.writelines(lines)
        finally:
            outfile.close()
00344 
if __name__ == '__main__':
    # Command-line front end: the usage text is always shown, then the
    # command given as first argument (if any) is carried out.

    import sys

    doc = """PDBList.py
    (c) Kristian Rother 2003, Contributed to BioPython

    Usage:
    PDBList.py update <pdb_path> [options]   - write weekly PDB updates to
                                               local pdb tree.
    PDBList.py all    <pdb_path> [options]   - write all PDB entries to
                                               local pdb tree.
    PDBList.py obsol  <pdb_path> [options]   - write all obsolete PDB
                                               entries to local pdb tree.
    PDBList.py <PDB-ID> <pdb_path> [options] - retrieve single structure

    Options:
       -d   A single directory will be used as <pdb_path>, not a tree.
       -o   Overwrite existing structure files.
    """
    print(doc)

    if len(sys.argv) > 2:
        pdb_path = sys.argv[2]
        pl = PDBList(pdb=pdb_path)
        # Everything after <pdb_path> is an option; unknown ones are ignored.
        for option in sys.argv[3:]:
            if option == '-d':
                pl.flat_tree = 1
            elif option == '-o':
                pl.overwrite = 1

    else:
        # No <pdb_path> given: work flat in the current directory.
        pdb_path = os.getcwd()
        pl = PDBList()
        pl.flat_tree = 1

    if len(sys.argv) > 1:
        command = sys.argv[1]
        if command == 'update':
            # update PDB
            print("updating local PDB at " + pdb_path)
            pl.update_pdb()

        elif command == 'all':
            # get the entire PDB
            pl.download_entire_pdb()

        elif command == 'obsol':
            # get all obsolete entries. The only argument of
            # download_obsolete_entries is an optional list file, so
            # pdb_path (a directory) must not be passed to it.
            pl.download_obsolete_entries()

        elif len(command) == 4 and command[0].isdigit():
            # get single PDB entry
            pl.retrieve_pdb_file(command, pdir=pdb_path)