Back to index

python-biopython  1.60
Raf.py
Go to the documentation of this file.
00001 # Copyright 2001 by Gavin E. Crooks.  All rights reserved.
00002 # This code is part of the Biopython distribution and governed by its
00003 # license.  Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 
00006 # Gavin E. Crooks 2001-10-10
00007 
00008 """ASTRAL RAF (Rapid Access Format) Sequence Maps.
00009 
00010 The ASTRAL RAF Sequence Maps record the relationship between the PDB SEQRES
00011 records (representing the sequence of the molecule used in an experiment) to 
00012 the ATOM records (representing the atoms experimentally observed). 
00013 
00014 This data is derived from the Protein Data Bank CIF files. Known errors in the
00015 CIF files are corrected manually, with the original PDB file serving as the
00016 final arbiter in case of discrepancies. 
00017 
00018 Residues are referenced by residue ID. This consists of a the PDB residue
00019 sequence number (upto 4 digits) and an optional PDB  insertion code (an
00020 ascii alphabetic character, a-z, A-Z). e.g. "1", "10A", "1010b", "-1"
00021 
00022 See "ASTRAL RAF Sequence Maps":http://astral.stanford.edu/raf.html
00023 
00024 to_one_letter_code -- A mapping from the 3-letter amino acid codes found
00025                         in PDB files to 1-letter codes.  The 3-letter codes
00026                         include chemically modified residues.
00027 """
00028 
00029 from copy import copy 
00030 
00031 from Bio.SCOP.Residues import Residues
00032 
00033 from three_to_one_dict import to_one_letter_code
00034 
00035 def normalize_letters(one_letter_code):
00036     """Convert RAF one-letter amino acid codes into IUPAC standard codes.
00037     
00038     Letters are uppercased, and "." ("Unknown") is converted to "X".
00039     """
00040     if one_letter_code == '.':
00041         return 'X'
00042     else:
00043         return one_letter_code.upper()
00044 
00045 class SeqMapIndex(dict):
00046     """An RAF file index.
00047 
00048     The RAF file itself is about 50 MB. This index provides rapid, random
00049     access of RAF records without having to load the entire file into memory.
00050 
00051     The index key is a concatenation of the  PDB ID and chain ID. e.g
00052     "2drcA", "155c_". RAF uses an underscore to indicate blank
00053     chain IDs.    
00054     """
00055 
00056     def __init__(self, filename):
00057         """
00058         Arguments:
00059         
00060           filename  -- The file to index
00061         """
00062         dict.__init__(self)
00063         self.filename = filename
00064 
00065         f = open(self.filename, "rU")
00066         try:
00067             position = 0
00068             while True:
00069                 line = f.readline()
00070                 if not line: break
00071                 key = line[0:5]
00072                 if key != None:
00073                     self[key]=position
00074                 position = f.tell()
00075         finally:
00076             f.close()
00077 
00078     def __getitem__(self, key):
00079         """ Return an item from the indexed file. """
00080         position = dict.__getitem__(self,key)
00081 
00082         f = open(self.filename, "rU")
00083         try:
00084             f.seek(position)
00085             line = f.readline()
00086             record = SeqMap(line)
00087         finally:
00088             f.close()
00089         return record
00090 
00091 
00092     def getSeqMap(self, residues):
00093         """Get the sequence map for a collection of residues.
00094 
00095         residues -- A Residues instance, or a string that can be converted into
00096                     a Residues instance.
00097         """
00098         if isinstance(residues, basestring):
00099             residues = Residues(residues)
00100 
00101         pdbid  = residues.pdbid
00102         frags = residues.fragments
00103         if not frags: frags =(('_','',''),) # All residues of unnamed chain
00104 
00105         seqMap = None
00106         for frag in frags:
00107             chainid = frag[0]
00108             if chainid=='' or chainid=='-' or chainid==' ' or chainid=='_':
00109                 chainid = '_'
00110             id = pdbid + chainid
00111             
00112             
00113             sm = self[id]
00114             
00115             #Cut out fragment of interest
00116             start = 0
00117             end = len(sm.res)
00118             if frag[1] : start = int(sm.index(frag[1], chainid))
00119             if frag[2] : end = int(sm.index(frag[2], chainid)+1)
00120             
00121             sm = sm[start:end]
00122 
00123             if seqMap == None:
00124                 seqMap = sm
00125             else:
00126                 seqMap += sm
00127                             
00128         return seqMap
00129 
00130 
00131 class SeqMap(object):
00132     """An ASTRAL RAF (Rapid Access Format) Sequence Map.
00133     
00134     This is a list like object; You can find the location of particular residues
00135     with index(), slice this SeqMap into fragments, and glue fragments back
00136     together with extend().
00137 
00138     pdbid -- The PDB 4 character ID
00139 
00140     pdb_datestamp -- From the PDB file
00141 
00142     version -- The RAF format version. e.g. 0.01
00143 
00144     flags -- RAF flags. (See release notes for more information.)
00145 
00146     res -- A list of Res objects, one for each residue in this sequence map
00147     """
00148 
00149     def __init__(self, line=None):
00150         self.pdbid = ''
00151         self.pdb_datestamp = ''
00152         self.version = ''
00153         self.flags = ''
00154         self.res = []
00155         if line:
00156             self._process(line)
00157         
00158 
00159     def _process(self, line):
00160         """Parses a RAF record into a SeqMap object.
00161         """
00162         header_len = 38
00163  
00164         line = line.rstrip()  # no trailing whitespace        
00165 
00166         if len(line)<header_len: 
00167             raise ValueError("Incomplete header: "+line)
00168 
00169         self.pdbid = line[0:4]
00170         chainid = line[4:5]
00171         
00172         self.version = line[6:10]
00173 
00174         #Raf format versions 0.01 and 0.02 are identical for practical purposes
00175         if(self.version != "0.01" and  self.version !="0.02"):
00176             raise ValueError("Incompatible RAF version: "+self.version)
00177 
00178         self.pdb_datestamp = line[14:20]
00179         self.flags = line[21:27]
00180 
00181         for i in range(header_len, len(line), 7):
00182             f = line[i : i+7]
00183             if len(f)!=7:
00184                 raise ValueError("Corrupt Field: ("+f+")")
00185             r = Res()
00186             r.chainid = chainid
00187             r.resid =  f[0:5].strip()
00188             r.atom = normalize_letters(f[5:6])
00189             r.seqres = normalize_letters(f[6:7])
00190 
00191             self.res.append(r)
00192 
00193 
00194     def index(self, resid, chainid="_"):
00195         for i in range(0, len(self.res)):
00196             if self.res[i].resid == resid and self.res[i].chainid == chainid:
00197                 return i
00198         raise KeyError("No such residue "+chainid+resid)
00199 
00200     def __getitem__(self, index):
00201         if not isinstance(index, slice):
00202             raise NotImplementedError
00203         s = copy(self)
00204         s.res = s.res[index]
00205         return s
00206 
00207     def append(self, res):
00208         """Append another Res object onto the list of residue mappings."""
00209         self.res.append(res)
00210 
00211     def extend(self, other):
00212         """Append another SeqMap onto the end of self.
00213 
00214         Both SeqMaps must have the same PDB ID, PDB datestamp and
00215         RAF version.  The RAF flags are erased if they are inconsistent. This
00216         may happen when fragments are taken from different chains.
00217         """
00218         if not isinstance(other, SeqMap):
00219             raise TypeError("Can only extend a SeqMap with a SeqMap.")
00220         if self.pdbid != other.pdbid:
00221             raise TypeError("Cannot add fragments from different proteins")
00222         if self.version != other.version:
00223             raise TypeError("Incompatible rafs")
00224         if self.pdb_datestamp != other.pdb_datestamp:
00225             raise TypeError("Different pdb dates!")
00226         if self.flags != other.flags:
00227             self.flags = ''
00228         self.res += other.res
00229 
00230     def __iadd__(self, other):
00231         self.extend(other)
00232         return self
00233 
00234     def __add__(self, other):
00235         s = copy(self)
00236         s.extend(other)
00237         return s
00238 
00239     def getAtoms(self, pdb_handle, out_handle):
00240         """Extract all relevant ATOM and HETATOM records from a PDB file.
00241 
00242         The PDB file is scanned for ATOM and HETATOM records. If the
00243         chain ID, residue ID (seqNum and iCode), and residue type match
00244         a residue in this sequence map, then the record is echoed to the
00245         output handle.
00246 
00247         This is typically used to find the coordinates of a domain, or other
00248         residue subset.
00249 
00250         pdb_handle -- A handle to the relevant PDB file.
00251         
00252         out_handle -- All output is written to this file like object.
00253         """
00254         #This code should be refactored when (if?) biopython gets a PDB parser 
00255 
00256         #The set of residues that I have to find records for. 
00257         resSet = {}
00258         for r in self.res:
00259             if r.atom=='X' : #Unknown residue type
00260                 continue
00261             chainid = r.chainid
00262             if chainid == '_':
00263                 chainid = ' '
00264             resid = r.resid
00265             resSet[(chainid,resid)] = r
00266 
00267         resFound = {}
00268         for line in pdb_handle.xreadlines():
00269             if line.startswith("ATOM  ") or line.startswith("HETATM"):
00270                 chainid = line[21:22]
00271                 resid = line[22:27].strip()
00272                 key = (chainid, resid)
00273                 if key in resSet:
00274                     res = resSet[key]
00275                     atom_aa = res.atom
00276                     resName = line[17:20]
00277                     if resName in to_one_letter_code:
00278                         if to_one_letter_code[resName] == atom_aa:
00279                             out_handle.write(line)
00280                             resFound[key] = res
00281 
00282         if len(resSet) != len(resFound):
00283             #for k in resFound.keys():
00284             #    del resSet[k]
00285             #print resSet
00286                                      
00287             raise RuntimeError('I could not find at least one ATOM or HETATM' \
00288                    +' record for each and every residue in this sequence map.')
00289 
00290 
00291 class Res(object):
00292     """ A single residue mapping from a RAF record.
00293 
00294     chainid -- A single character chain ID.
00295 
00296     resid   -- The residue ID. 
00297 
00298     atom    -- amino acid one-letter code from ATOM records. 
00299 
00300     seqres  -- amino acid one-letter code from SEQRES records.
00301     """
00302     def __init__(self):
00303         self.chainid = ''
00304         self.resid = ''
00305         self.atom = ''
00306         self.seqres = ''
00307 
00308 
00309 def parse(handle):
00310     """Iterates over a RAF file, returning a SeqMap object for each line
00311     in the file.
00312 
00313     Arguments:
00314         
00315         handle -- file-like object.
00316     """ 
00317     for line in handle:
00318         yield SeqMap(line)