Back to index

python-biopython  1.60
PDBParser.py
Go to the documentation of this file.
00001 # Copyright (C) 2002, Thomas Hamelryck (thamelry@binf.ku.dk)
00002 # This code is part of the Biopython distribution and governed by its
00003 # license.  Please see the LICENSE file that should have been included
00004 # as part of this package.  
00005 
00006 """Parser for PDB files."""
00007 
00008 # For using with statement in Python 2.5 or Jython
00009 from __future__ import with_statement
00010 
00011 import warnings
00012 
00013 import numpy
00014 
00015 from Bio.File import as_handle
00016 
00017 from Bio.PDB.PDBExceptions import \
00018         PDBConstructionException, PDBConstructionWarning
00019 from Bio.PDB.StructureBuilder import StructureBuilder
00020 from Bio.PDB.parse_pdb_header import _parse_pdb_header_list
00021 
00022 
00023 # If PDB spec says "COLUMNS 18-20" this means line[17:20]
00024 
00025 
00026 class PDBParser(object):
00027     """
00028     Parse a PDB file and return a Structure object.
00029     """
00030 
00031     def __init__(self, PERMISSIVE=True, get_header=False,
00032                  structure_builder=None, QUIET=False):
00033         """
00034         The PDB parser call a number of standard methods in an aggregated
00035         StructureBuilder object. Normally this object is instanciated by the
00036         PDBParser object itself, but if the user provides his own StructureBuilder
00037         object, the latter is used instead.
00038 
00039         Arguments:
00040         
00041         o PERMISSIVE - Evaluated as a Boolean. If false, exceptions in
00042         constructing the SMCRA data structure are fatal. If true (DEFAULT),
00043         the exceptions are caught, but some residues or atoms will be missing.
00044         THESE EXCEPTIONS ARE DUE TO PROBLEMS IN THE PDB FILE!.
00045 
00046         o structure_builder - an optional user implemented StructureBuilder class. 
00047 
00048         o QUIET - Evaluated as a Boolean. If true, warnings issued in constructing
00049         the SMCRA data will be supressed. If false (DEFAULT), they will be shown.
00050         These warnings might be indicative of problems in the PDB file!        
00051         """
00052         if structure_builder!=None:
00053             self.structure_builder=structure_builder
00054         else:
00055             self.structure_builder=StructureBuilder()
00056         self.header=None
00057         self.trailer=None
00058         self.line_counter=0
00059         self.PERMISSIVE=bool(PERMISSIVE)
00060         self.QUIET=bool(QUIET)
00061 
00062     # Public methods
00063 
00064     def get_structure(self, id, file):
00065         """Return the structure.
00066 
00067         Arguments:
00068         o id - string, the id that will be used for the structure
00069         o file - name of the PDB file OR an open filehandle
00070         """
00071 
00072         if self.QUIET:
00073             warning_list = warnings.filters[:]
00074             warnings.filterwarnings('ignore', category=PDBConstructionWarning)
00075             
00076         self.header=None
00077         self.trailer=None
00078         # Make a StructureBuilder instance (pass id of structure as parameter)
00079         self.structure_builder.init_structure(id)
00080 
00081         with as_handle(file) as handle:
00082             self._parse(handle.readlines())
00083 
00084         self.structure_builder.set_header(self.header)
00085         # Return the Structure instance
00086         structure = self.structure_builder.get_structure()
00087         
00088         if self.QUIET:
00089             warnings.filters = warning_list
00090         
00091         return structure
00092 
00093     def get_header(self):
00094         "Return the header."
00095         return self.header
00096 
00097     def get_trailer(self):
00098         "Return the trailer."
00099         return self.trailer
00100 
00101     # Private methods
00102     
00103     def _parse(self, header_coords_trailer):
00104         "Parse the PDB file."
00105         # Extract the header; return the rest of the file
00106         self.header, coords_trailer=self._get_header(header_coords_trailer)
00107         # Parse the atomic data; return the PDB file trailer
00108         self.trailer=self._parse_coordinates(coords_trailer)
00109     
00110     def _get_header(self, header_coords_trailer):
00111         "Get the header of the PDB file, return the rest."
00112         structure_builder=self.structure_builder
00113         i = 0
00114         for i in range(0, len(header_coords_trailer)):
00115             structure_builder.set_line_counter(i+1)
00116             line=header_coords_trailer[i]
00117             record_type=line[0:6] 
00118             if(record_type=='ATOM  ' or record_type=='HETATM' or record_type=='MODEL '):
00119                 break
00120         header=header_coords_trailer[0:i]
00121         # Return the rest of the coords+trailer for further processing
00122         self.line_counter=i
00123         coords_trailer=header_coords_trailer[i:]
00124         header_dict=_parse_pdb_header_list(header)
00125         return header_dict, coords_trailer
00126     
00127     def _parse_coordinates(self, coords_trailer):
00128         "Parse the atomic data in the PDB file."
00129         local_line_counter=0
00130         structure_builder=self.structure_builder
00131         current_model_id=0
00132         # Flag we have an open model
00133         model_open=0
00134         current_chain_id=None
00135         current_segid=None
00136         current_residue_id=None
00137         current_resname=None
00138         for i in range(0, len(coords_trailer)):
00139             line=coords_trailer[i]
00140             record_type=line[0:6]
00141             global_line_counter=self.line_counter+local_line_counter+1
00142             structure_builder.set_line_counter(global_line_counter)
00143             if(record_type=='ATOM  ' or record_type=='HETATM'):
00144                 # Initialize the Model - there was no explicit MODEL record
00145                 if not model_open:
00146                     structure_builder.init_model(current_model_id)
00147                     current_model_id+=1
00148                     model_open=1
00149                 fullname=line[12:16]
00150                 # get rid of whitespace in atom names
00151                 split_list=fullname.split()
00152                 if len(split_list)!=1:
00153                     # atom name has internal spaces, e.g. " N B ", so
00154                     # we do not strip spaces
00155                     name=fullname
00156                 else:
00157                     # atom name is like " CA ", so we can strip spaces
00158                     name=split_list[0]
00159                 altloc=line[16:17]
00160                 resname=line[17:20]
00161                 chainid=line[21:22]
00162                 try:
00163                     serial_number=int(line[6:11])
00164                 except:
00165                     serial_number=0
00166                 resseq=int(line[22:26].split()[0])   # sequence identifier   
00167                 icode=line[26:27]           # insertion code
00168                 if record_type=='HETATM':       # hetero atom flag
00169                     if resname=="HOH" or resname=="WAT":
00170                         hetero_flag="W"
00171                     else:
00172                         hetero_flag="H"
00173                 else:
00174                     hetero_flag=" "
00175                 residue_id=(hetero_flag, resseq, icode)
00176                 # atomic coordinates
00177                 try:
00178                     x=float(line[30:38]) 
00179                     y=float(line[38:46]) 
00180                     z=float(line[46:54])
00181                 except:
00182                     #Should we allow parsing to continue in permissive mode?
00183                     #If so what coordindates should we default to?  Easier to abort!
00184                     raise PDBConstructionException(\
00185                         "Invalid or missing coordinate(s) at line %i." \
00186                         % global_line_counter)
00187                 coord=numpy.array((x, y, z), 'f')
00188                 # occupancy & B factor
00189                 try:
00190                     occupancy=float(line[54:60])
00191                 except:
00192                     self._handle_PDB_exception("Invalid or missing occupancy",
00193                                                global_line_counter)
00194                     occupancy = 0.0 #Is one or zero a good default?
00195                 try:
00196                     bfactor=float(line[60:66])
00197                 except:
00198                     self._handle_PDB_exception("Invalid or missing B factor",
00199                                                global_line_counter)
00200                     bfactor = 0.0 #The PDB use a default of zero if the data is missing
00201                 segid=line[72:76]
00202                 element=line[76:78].strip()
00203                 if current_segid!=segid:
00204                     current_segid=segid
00205                     structure_builder.init_seg(current_segid)
00206                 if current_chain_id!=chainid:
00207                     current_chain_id=chainid
00208                     structure_builder.init_chain(current_chain_id)
00209                     current_residue_id=residue_id
00210                     current_resname=resname
00211                     try:
00212                         structure_builder.init_residue(resname, hetero_flag, resseq, icode)
00213                     except PDBConstructionException, message:
00214                         self._handle_PDB_exception(message, global_line_counter)
00215                 elif current_residue_id!=residue_id or current_resname!=resname:
00216                     current_residue_id=residue_id
00217                     current_resname=resname
00218                     try:
00219                         structure_builder.init_residue(resname, hetero_flag, resseq, icode)
00220                     except PDBConstructionException, message:
00221                         self._handle_PDB_exception(message, global_line_counter) 
00222                 # init atom
00223                 try:
00224                     structure_builder.init_atom(name, coord, bfactor, occupancy, altloc,
00225                                                 fullname, serial_number, element)
00226                 except PDBConstructionException, message:
00227                     self._handle_PDB_exception(message, global_line_counter)
00228             elif(record_type=='ANISOU'):
00229                 anisou=map(float, (line[28:35], line[35:42], line[43:49], line[49:56], line[56:63], line[63:70]))
00230                 # U's are scaled by 10^4 
00231                 anisou_array=(numpy.array(anisou, 'f')/10000.0).astype('f')
00232                 structure_builder.set_anisou(anisou_array)
00233             elif(record_type=='MODEL '):
00234                 try:
00235                     serial_num=int(line[10:14])
00236                 except:
00237                     self._handle_PDB_exception("Invalid or missing model serial number",
00238                                                global_line_counter)
00239                     serial_num=0
00240                 structure_builder.init_model(current_model_id,serial_num)
00241                 current_model_id+=1
00242                 model_open=1
00243                 current_chain_id=None
00244                 current_residue_id=None
00245             elif(record_type=='END   ' or record_type=='CONECT'):
00246                 # End of atomic data, return the trailer
00247                 self.line_counter=self.line_counter+local_line_counter
00248                 return coords_trailer[local_line_counter:]
00249             elif(record_type=='ENDMDL'):
00250                 model_open=0
00251                 current_chain_id=None
00252                 current_residue_id=None
00253             elif(record_type=='SIGUIJ'):
00254                 # standard deviation of anisotropic B factor
00255                 siguij=map(float, (line[28:35], line[35:42], line[42:49], line[49:56], line[56:63], line[63:70]))
00256                 # U sigma's are scaled by 10^4
00257                 siguij_array=(numpy.array(siguij, 'f')/10000.0).astype('f')   
00258                 structure_builder.set_siguij(siguij_array)
00259             elif(record_type=='SIGATM'):
00260                 # standard deviation of atomic positions
00261                 sigatm=map(float, (line[30:38], line[38:45], line[46:54], line[54:60], line[60:66]))
00262                 sigatm_array=numpy.array(sigatm, 'f')
00263                 structure_builder.set_sigatm(sigatm_array)
00264             local_line_counter=local_line_counter+1
00265         # EOF (does not end in END or CONECT)
00266         self.line_counter=self.line_counter+local_line_counter
00267         return []
00268 
00269     def _handle_PDB_exception(self, message, line_counter):
00270         """
00271         This method catches an exception that occurs in the StructureBuilder
00272         object (if PERMISSIVE), or raises it again, this time adding the 
00273         PDB line number to the error message.
00274         """
00275         message="%s at line %i." % (message, line_counter)
00276         if self.PERMISSIVE:
00277             # just print a warning - some residues/atoms may be missing
00278             warnings.warn("PDBConstructionException: %s\n"
00279                           "Exception ignored.\n"
00280                           "Some atoms or residues may be missing in the data structure."
00281                           % message, PDBConstructionWarning)
00282         else:
00283             # exceptions are fatal - raise again with new message (including line nr)
00284             raise PDBConstructionException(message)
00285 
00286 
00287 if __name__=="__main__":
00288 
00289     import sys
00290 
00291     p=PDBParser(PERMISSIVE=True)
00292 
00293     filename = sys.argv[1]
00294     s=p.get_structure("scr", filename)
00295 
00296     for m in s:
00297         p=m.get_parent()
00298         assert(p is s)
00299         for c in m:
00300             p=c.get_parent()
00301             assert(p is m)
00302             for r in c:
00303                 print r
00304                 p=r.get_parent()
00305                 assert(p is c)
00306                 for a in r:
00307                     p=a.get_parent()
00308                     if not p is r:
00309                         print p, r
00310                     
00311                 
00312                 
00313