Back to index

python-biopython  1.60
generate_three_to_one_dict.py
Go to the documentation of this file.
00001 # File download/unzip written 2012 by Lenna X. Peterson (arklenna@gmail.com)
00002 # Dictionary extraction written 2011 by Hongbo Zhu
00003 #
00004 # This code is part of the Biopython distribution and governed by its
00005 # license.  Please see the LICENSE file that should have been included
00006 # as part of this package.
00007 
00008 """
00009 Download and parse PDB Chemical Component Dictionary,
00010 then write out dict for to_one_letter_code. 
00011 """
00012 
00013 import gzip
00014 import inspect
00015 import os
00016 import urllib
00017 import warnings
00018 
# Location of the gzipped PDB Chemical Component Dictionary.
url = "ftp://ftp.wwpdb.org/pub/pdb/data/monomers/components.cif.gz"

# Local file name for the download: the last path component of the URL.
gzname = os.path.basename(url)
# Name of the uncompressed file: the archive name with everything from its
# final extension separator onwards removed.
cifname = gzname.rpartition(os.extsep)[0]
00025 
# Stream the archive to disk chunk by chunk so the whole download is never
# held in memory at once.
# NOTE(review): urllib.urlopen is the Python 2 spelling; on Python 3 the
# equivalent lives at urllib.request.urlopen -- confirm the target interpreter.
url_handle = urllib.urlopen(url)
try:
    with open(gzname, 'wb') as gzh:
        # Parenthesised single string: prints the same text whether this is
        # parsed as a Python 2 print statement or a Python 3 function call.
        print("Downloading file... (approx. 29 MB)")
        while True:
            data = url_handle.read(64 * 1024)
            if not data:
                # An empty read marks the end of the stream.
                break
            gzh.write(data)
finally:
    # Release the connection even if reading or writing fails part-way.
    url_handle.close()
00035 
# Compare the downloaded archive with the size it had on 13 April 2012 and
# warn (without aborting) when it is smaller than that.
_downloaded_size = os.path.getsize(gzname)
if not _downloaded_size >= 29944258:
    warnings.warn("ERROR: Downloaded file is too small", RuntimeWarning)
00040 
# Dictionary extraction originally written by Hongbo Zhu.
#
# Two records of the PDB Chemical Component Dictionary are parsed to
# generate the dictionary:
#   _chem_comp.one_letter_code
#   _chem_comp.three_letter_code
#
# The archive is read directly in its compressed form; extracting it to
# `cifname` on disk first is not necessary.


def _iter_code_pairs(lines):
    """Yield (three_letter_code, one_letter_code) tuples found in lines.

    A value is taken as the last whitespace-separated token of a line that
    starts with the corresponding ``_chem_comp.*`` tag.  A pair is emitted as
    soon as both tags have been seen since the previous pair; the order in
    which the two tags appear does not matter.  The three-letter code is
    left-justified and space-padded to a width of three characters.
    """
    one = None
    three = None
    for line in lines:
        if line.startswith('_chem_comp.one_letter_code'):
            one = line.strip().split()[-1]
        elif line.startswith('_chem_comp.three_letter_code'):
            three = '%-3s' % (line.strip().split()[-1],)

        if one is not None and three is not None:
            yield three, one
            one = None
            three = None


def _format_dict_lines(pairs, per_line=5):
    """Return the source lines of a ``to_one_letter_code = {...}`` literal.

    pairs    -- sequence of (three_letter_code, one_letter_code) tuples
    per_line -- number of dictionary entries placed on each output line

    Every line except the last ends with a comma and a newline; the last
    line closes the dictionary and carries no trailing newline.  An empty
    ``pairs`` yields a valid empty dictionary literal.
    """
    lines = ['to_one_letter_code = {\n']
    entries = ["'%s':'%s'" % (three, one) for three, one in pairs]
    if not entries:
        lines.append('}')
        return lines

    rows = [','.join(entries[start:start + per_line])
            for start in range(0, len(entries), per_line)]
    for row in rows[:-1]:
        lines.append('    %s,\n' % (row,))
    # No comma after the final entry; close the literal on the same line.
    lines.append('    %s }' % (rows[-1],))
    return lines


# NOTE(review): mode 'rb' yields str lines on Python 2 but bytes on Python 3,
# where the str prefixes used for matching would not apply -- confirm the
# target interpreter.
fh = gzip.open(gzname, 'rb')
try:
    _code_pairs = list(_iter_code_pairs(fh))
finally:
    # Always release the archive handle, even if parsing fails.
    fh.close()

# all three-letter codes
three_to_one_buf = _format_dict_lines(_code_pairs)
# only those with non-'?' one-letter codes
three_to_one_buf_noq = _format_dict_lines(
    [(three, one) for three, one in _code_pairs if one != '?'])
00111 
# Directory that contains this script.
_scriptPath = os.path.abspath(
    os.path.dirname(inspect.getfile(inspect.currentframe())))
# Bio/SCOP directory, located two levels above the script directory.
_rafPath = os.path.normpath(
    os.path.join(_scriptPath, os.pardir, os.pardir, "Bio", "SCOP"))
_threeAllPath = os.path.join(_rafPath, 'three_to_one_all.py')
_threePath = os.path.join(_rafPath, 'three_to_one_dict.py')

# Only the dictionary without '?' one-letter codes is written out; the
# writer for the complete dictionary is commented out:
#with open(_threeAllPath, 'w') as fh:
    #fh.writelines(three_to_one_buf)
with open(_threePath, 'w') as fh:
    fh.write(''.join(three_to_one_buf_noq))