Back to index

python-biopython  1.60
DSSP.py
Go to the documentation of this file.
00001 # Copyright (C) 2002, Thomas Hamelryck (thamelry@binf.ku.dk)
00002 # This code is part of the Biopython distribution and governed by its
00003 # license.  Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 
00006 """Use the DSSP program to calculate secondary structure and accessibility.
00007 
00008 You need to have a working version of DSSP (and a license, free for academic
00009 use) in order to use this. For DSSP, see U{http://www.cmbi.kun.nl/gv/dssp/}.
00010 
00011 The DSSP codes for secondary structure used here are:
00012 
00013     - H        Alpha helix (4-12)
00014     - B        Isolated beta-bridge residue
00015     - E        Strand
00016     - G        3-10 helix
00017     - I        pi helix
00018     - T        Turn
00019     - S        Bend
00020     - -        None
00021 """
00022 
00023 import os
00024 import re
00025 import tempfile
00026 
00027 from Bio.SCOP.Raf import to_one_letter_code
00028 
00029 from Bio.PDB.AbstractPropertyMap import AbstractResiduePropertyMap
00030 from Bio.PDB.PDBExceptions import PDBException
00031 from Bio.PDB.PDBParser import PDBParser
00032 
00033 
00034 # Match C in DSSP
00035 _dssp_cys=re.compile('[a-z]')
00036 
00037 # Maximal ASA of amino acids
00038 # Values from Sander & Rost, (1994), Proteins, 20:216-226
00039 # Used for relative accessibility
00040 MAX_ACC={}
00041 MAX_ACC["ALA"]=106.0
00042 MAX_ACC["CYS"]=135.0
00043 MAX_ACC["ASP"]=163.0
00044 MAX_ACC["GLU"]=194.0
00045 MAX_ACC["PHE"]=197.0
00046 MAX_ACC["GLY"]=84.0
00047 MAX_ACC["HIS"]=184.0
00048 MAX_ACC["ILE"]=169.0
00049 MAX_ACC["LYS"]=205.0
00050 MAX_ACC["LEU"]=164.0
00051 MAX_ACC["MET"]=188.0
00052 MAX_ACC["ASN"]=157.0
00053 MAX_ACC["PRO"]=136.0
00054 MAX_ACC["GLN"]=198.0
00055 MAX_ACC["ARG"]=248.0
00056 MAX_ACC["SER"]=130.0
00057 MAX_ACC["THR"]=142.0
00058 MAX_ACC["VAL"]=142.0
00059 MAX_ACC["TRP"]=227.0
00060 MAX_ACC["TYR"]=222.0
00061 
00062 
00063 def ss_to_index(ss):
00064     """
00065     Secondary structure symbol to index.
00066     H=0
00067     E=1
00068     C=2
00069     """
00070     if ss=='H':
00071         return 0
00072     if ss=='E':
00073         return 1
00074     if ss=='C':
00075         return 2
00076     assert 0
00077 
00078 
00079 def dssp_dict_from_pdb_file(in_file, DSSP="dssp"):
00080     """
00081     Create a DSSP dictionary from a PDB file.
00082 
00083     Example:
00084         >>> dssp_dict=dssp_dict_from_pdb_file("1fat.pdb")
00085         >>> aa, ss, acc=dssp_dict[('A', 1)]
00086 
00087     @param in_file: pdb file
00088     @type in_file: string
00089 
00090     @param DSSP: DSSP executable (argument to os.system)
00091     @type DSSP: string
00092 
00093     @return: a dictionary that maps (chainid, resid) to 
00094         amino acid type, secondary structure code and 
00095         accessibility.
00096     @rtype: {}
00097     """
00098     out_file = tempfile.NamedTemporaryFile(suffix='.dssp')
00099     out_file.flush()    # needed?
00100     os.system("%s %s > %s" % (DSSP, in_file, out_file.name))
00101     out_dict, keys = make_dssp_dict(out_file.name)
00102     out_file.close()
00103     return out_dict, keys
00104 
00105 
00106 def make_dssp_dict(filename):
00107     """
00108     Return a DSSP dictionary that maps (chainid, resid) to
00109     aa, ss and accessibility, from a DSSP file.
00110 
00111     @param filename: the DSSP output file
00112     @type filename: string
00113     """
00114     dssp = {}
00115     handle = open(filename, "r")
00116     try:
00117         start = 0
00118         keys = []
00119         for l in handle.readlines():
00120             sl = l.split()
00121             if sl[1] == "RESIDUE":
00122                 # Start parsing from here
00123                 start = 1
00124                 continue
00125             if not start:
00126                 continue
00127             if l[9] == " ":
00128                 # Skip -- missing residue
00129                 continue
00130             resseq = int(l[5:10])
00131             icode = l[10]
00132             chainid = l[11]
00133             aa = l[13]
00134             ss = l[16]
00135             if ss == " ":
00136                 ss = "-"
00137             try:
00138                 acc = int(l[34:38]) 
00139                 phi = float(l[103:109])
00140                 psi = float(l[109:115])
00141             except ValueError, exc:
00142                 # DSSP output breaks its own format when there are >9999
00143                 # residues, since only 4 digits are allocated to the seq num
00144                 # field.  See 3kic chain T res 321, 1vsy chain T res 6077.
00145                 # Here, look for whitespace to figure out the number of extra
00146                 # digits, and shift parsing the rest of the line by that amount.
00147                 if l[34] != ' ':
00148                     shift = l[34:].find(' ')
00149                     acc = int((l[34+shift:38+shift]))
00150                     phi = float(l[103+shift:109+shift])
00151                     psi = float(l[109+shift:115+shift])
00152                 else:
00153                     raise ValueError, exc
00154             res_id = (" ", resseq, icode)
00155             dssp[(chainid, res_id)] = (aa, ss, acc, phi, psi)
00156             keys.append((chainid, res_id))
00157     finally:
00158         handle.close()
00159     return dssp, keys
00160 
00161 
00162 class DSSP(AbstractResiduePropertyMap):
00163     """
00164     Run DSSP on a pdb file, and provide a handle to the 
00165     DSSP secondary structure and accessibility.
00166 
00167     Note that DSSP can only handle one model.
00168 
00169     Example:
00170 
00171         >>> p = PDBParser()
00172         >>> structure = p.get_structure("1MOT", "1MOT.pdb")
00173         >>> model = structure[0]
00174         >>> dssp = DSSP(model, "1MOT.pdb")
00175         >>> # DSSP data is accessed by a tuple (chain_id, res_id)
00176         >>> a_key = dssp.keys()[2]
00177         >>> # residue object, secondary structure, solvent accessibility,
00178         >>> # relative accessiblity, phi, psi
00179         >>> dssp[a_key]
00180         (<Residue ALA het=  resseq=251 icode= >,
00181         'H',
00182         72,
00183         0.67924528301886788,
00184         -61.200000000000003,
00185         -42.399999999999999)
00186     """
00187 
00188     def __init__(self, model, pdb_file, dssp="dssp"):
00189         """
00190         @param model: the first model of the structure
00191         @type model: L{Model}
00192 
00193         @param pdb_file: a PDB file
00194         @type pdb_file: string
00195 
00196         @param dssp: the dssp executable (ie. the argument to os.system)
00197         @type dssp: string
00198         """
00199         # create DSSP dictionary
00200         dssp_dict, dssp_keys = dssp_dict_from_pdb_file(pdb_file, dssp)
00201         dssp_map = {}
00202         dssp_list = []
00203 
00204         def resid2code(res_id):
00205             """Serialize a residue's resseq and icode for easy comparison."""
00206             return '%s%s' % (res_id[1], res_id[2])
00207 
00208         # Now create a dictionary that maps Residue objects to 
00209         # secondary structure and accessibility, and a list of 
00210         # (residue, (secondary structure, accessibility)) tuples
00211         for key in dssp_keys:
00212             chain_id, res_id = key
00213             chain = model[chain_id]
00214             try:
00215                 res = chain[res_id]
00216             except KeyError:
00217                 # In DSSP, HET field is not considered in residue identifier.
00218                 # Thus HETATM records may cause unnecessary exceptions.
00219                 # (See 3jui chain A res 593.)
00220                 # Try the lookup again with all HETATM other than water
00221                 res_seq_icode = resid2code(res_id)
00222                 for r in chain:
00223                     if r.id[0] not in (' ', 'W'):
00224                         # Compare resseq + icode
00225                         if resid2code(r.id) == res_seq_icode:
00226                             # Found a matching residue
00227                             res = r
00228                             break
00229                 else:
00230                     raise KeyError(res_id)
00231 
00232             # For disordered residues of point mutations, BioPython uses the
00233             # last one as default, But DSSP takes the first one (alternative
00234             # location is blank, A or 1). See 1h9h chain E resi 22.
00235             # Here we select the res in which all atoms have altloc blank, A or
00236             # 1. If no such residues are found, simply use the first one appears
00237             # (as DSSP does).
00238             if res.is_disordered() == 2:
00239                 for rk in res.disordered_get_id_list():
00240                     # All atoms in the disordered residue should have the same
00241                     # altloc, so it suffices to check the altloc of the first
00242                     # atom.
00243                     altloc = res.child_dict[rk].get_list()[0].get_altloc()
00244                     if altloc in tuple('A1 '):
00245                         res.disordered_select(rk)
00246                         break
00247                 else:
00248                     # Simply select the first one
00249                     res.disordered_select(res.disordered_get_id_list()[0])
00250 
00251             # Sometimes point mutations are put into HETATM and ATOM with altloc
00252             # 'A' and 'B'.
00253             # See 3piu chain A residue 273:
00254             #   <Residue LLP het=H_LLP resseq=273 icode= >
00255             #   <Residue LYS het=  resseq=273 icode= >
00256             # DSSP uses the HETATM LLP as it has altloc 'A'
00257             # We check the altloc code here.
00258             elif res.is_disordered() == 1:
00259                 # Check altloc of all atoms in the DisorderedResidue. If it
00260                 # contains blank, A or 1, then use it.  Otherwise, look for HET
00261                 # residues of the same seq+icode.  If not such HET residues are
00262                 # found, just accept the current one.
00263                 altlocs = set(a.get_altloc() for a in res.get_unpacked_list())
00264                 if altlocs.isdisjoint('A1 '):
00265                     # Try again with all HETATM other than water
00266                     res_seq_icode = resid2code(res_id)
00267                     for r in chain:
00268                         if r.id[0] not in (' ', 'W'):
00269                             if (resid2code(r.id) == res_seq_icode and
00270                                 r.get_list()[0].get_altloc() in tuple('A1 ')):
00271                                 res = r
00272                                 break
00273 
00274             aa, ss, acc, phi, psi = dssp_dict[key]
00275             res.xtra["SS_DSSP"] = ss
00276             res.xtra["EXP_DSSP_ASA"] = acc
00277             res.xtra["PHI_DSSP"] = phi
00278             res.xtra["PSI_DSSP"] = psi
00279             # Relative accessibility
00280             resname = res.get_resname()
00281             try:
00282                 rel_acc = acc/MAX_ACC[resname]
00283             except KeyError:
00284                 # Invalid value for resname
00285                 rel_acc = 'NA'
00286             else:
00287                 if rel_acc > 1.0:
00288                     rel_acc = 1.0
00289             res.xtra["EXP_DSSP_RASA"] = rel_acc
00290             # Verify if AA in DSSP == AA in Structure
00291             # Something went wrong if this is not true!
00292             # NB: DSSP uses X often
00293             resname = to_one_letter_code.get(resname, 'X')
00294             if resname == "C":
00295                 # DSSP renames C in C-bridges to a,b,c,d,...
00296                 # - we rename it back to 'C'
00297                 if _dssp_cys.match(aa):
00298                     aa = 'C'
00299             # Take care of HETATM again
00300             if (resname != aa) and (res.id[0] == ' ' or aa != 'X'):
00301                 raise PDBException("Structure/DSSP mismatch at %s" % res)
00302             dssp_map[key] = ((res, ss, acc, rel_acc, phi, psi))
00303             dssp_list.append((res, ss, acc, rel_acc, phi, psi))
00304 
00305         AbstractResiduePropertyMap.__init__(self, dssp_map, dssp_keys,
00306                 dssp_list)
00307 
00308 
00309 if __name__ == "__main__":
00310     import sys
00311 
00312     p = PDBParser()
00313     s = p.get_structure('X', sys.argv[1])
00314     model = s[0]
00315     d = DSSP(model, sys.argv[1])
00316 
00317     for r in d:
00318         print r
00319     print "Handled", len(d), "residues"
00320     print d.keys()
00321     if ('A', 1) in d:
00322         print d[('A', 1)]
00323         print s[0]['A'][1].xtra
00324     # Secondary structure
00325     print ''.join(d[key][1] for key in d.keys())
00326