Back to index

python-biopython  1.60
parse_pdb_header.py
Go to the documentation of this file.
00001 #!/usr/bin/env python
00002 #
00003 # parse_pdb_header.py
00004 # parses header of PDB files into a python dictionary.
00005 # emerged from the Columba database project www.columba-db.de.
00006 # 
00007 # author: Kristian Rother
00008 # 
00009 # license: same as BioPython, read LICENSE.TXT from current BioPython release.
00010 # 
00011 # last modified: 9.2.2004
00012 #
00013 # Added some small changes: the whole PDB file is not read in anymore, but just
00014 # until the first ATOM record (faster). I also split parse_pdb_header into 
00015 # parse_pdb_header and parse_pdb_header_list, because parse_pdb_header_list
00016 # can be more easily reused in PDBParser.
00017 #
00018 # Thomas, 19/03/04
00019 #
00020 # Renamed some clearly private functions to _something (ie. parse_pdb_header_list
00021 # is now _parse_pdb_header_list)
00022 # Thomas 9/05/04
00023 
00024 """Parse the header of a PDB file."""
00025 
00026 # For 'with' on Python 2.5/Jython 2.5
00027 from __future__ import with_statement
00028 import re
00029 
00030 from Bio import File
00031 
00032 def _get_journal(inl):
00033     # JRNL        AUTH   L.CHEN,M.DOI,F.S.MATHEWS,A.Y.CHISTOSERDOV,           2BBK   7
00034     journal=""
00035     for l in inl:
00036         if re.search("\AJRNL",l):
00037             journal+=l[19:72].lower()
00038     journal=re.sub("\s\s+"," ",journal)
00039     return journal
00040 
00041 def _get_references(inl):
00042     # REMARK   1 REFERENCE 1                                                  1CSE  11
00043     # REMARK   1  AUTH   W.BODE,E.PAPAMOKOS,D.MUSIL                           1CSE  12
00044     references=[]
00045     actref=""
00046     for l in inl:        
00047         if re.search("\AREMARK   1",l):
00048             if re.search("\AREMARK   1 REFERENCE",l):
00049                 if actref!="":
00050                     actref=re.sub("\s\s+"," ",actref)
00051                     if actref!=" ":
00052                         references.append(actref)
00053                     actref=""
00054             else:
00055                 actref+=l[19:72].lower()
00056 
00057     if actref!="":
00058         actref=re.sub("\s\s+"," ",actref)
00059         if actref!=" ":
00060             references.append(actref)
00061     return references
00062     
00063       
00064 # bring dates to format: 1909-01-08
00065 def _format_date(pdb_date):
00066     """Converts dates from DD-Mon-YY to YYYY-MM-DD format."""
00067     date=""
00068     year=int(pdb_date[7:])
00069     if year<50:
00070         century=2000
00071     else:
00072         century=1900            
00073     date=str(century+year)+"-"
00074     all_months=['xxx','Jan','Feb','Mar','Apr','May','Jun','Jul',\
00075     'Aug','Sep','Oct','Nov','Dec']        
00076     month=str(all_months.index(pdb_date[3:6]))
00077     if len(month)==1:
00078         month = '0'+month
00079     date = date+month+'-'+pdb_date[:2]
00080     return date
00081 
00082 
00083 def _chop_end_codes(line):
00084     """Chops lines ending with  '     1CSA  14' and the like."""
00085     return re.sub("\s\s\s\s+[\w]{4}.\s+\d*\Z","",line)
00086 
00087 def _chop_end_misc(line):
00088     """Chops lines ending with  '     14-JUL-97  1CSA' and the like."""
00089     return re.sub("\s\s\s\s+.*\Z","",line)
00090 
00091 def _nice_case(line):
00092     """Makes A Lowercase String With Capitals."""
00093     l=line.lower()
00094     s=""
00095     i=0
00096     nextCap=1
00097     while i<len(l):
00098         c=l[i]
00099         if c>='a' and c<='z' and nextCap:
00100             c=c.upper()
00101             nextCap=0
00102         elif c==' ' or c=='.' or c==',' or c==';' or c==':' or c=='\t' or\
00103             c=='-' or c=='_':
00104             nextCap=1            
00105         s+=c
00106         i+=1
00107     return s
00108 
def parse_pdb_header(infile):
    """
    Returns the header lines of a pdb file as a dictionary.

    infile is handed to File.as_handle, opened in mode 'r'. Lines are
    collected until the first one whose record type (its first six
    characters) is 'ATOM  ', 'HETATM' or 'MODEL '; that line and the rest
    of the file are not read. The collected lines are passed to
    _parse_pdb_header_list, whose result is returned.

    Dictionary keys are: name, head, deposition_date, release_date,
    structure_method, resolution, structure_reference, journal_reference,
    author, compound and source; keywords and journal are added when the
    corresponding records are present.
    """
    # Record types that mark the end of the header section.
    stop_records = ('ATOM  ', 'HETATM', 'MODEL ')
    header_lines = []
    with File.as_handle(infile, 'r') as handle:
        for line in handle:
            if line[0:6] in stop_records:
                break
            header_lines.append(line)
    return _parse_pdb_header_list(header_lines)
00127 
def _store_mol_field(section, text, molid, last_key):
    """Store one COMPND/SOURCE specification in section (private helper).

    section is the 'compound' or 'source' sub-dictionary, text the
    cleaned, lower-cased record text, molid the molecule id currently
    being filled and last_key the key written most recently for it.

    Returns the (molid, last_key) pair to use for the next record.
    """
    tok = text.split(":")
    if len(tok) >= 2:
        ckey = tok[0]
        cval = re.sub(r"\A\s*", "", tok[1])
        if ckey == 'mol_id':
            # A new molecule starts: open a fresh entry for it.
            section[cval] = {'misc': ''}
            return cval, "misc"
        section[molid][ckey] = cval
        return molid, ckey
    # No "key: value" pair on this line: it continues the previous value.
    section[molid][last_key] += tok[0] + " "
    return molid, last_key


def _parse_pdb_header_list(header):
    """Parse a list of PDB header lines into a dictionary.

    header is a list of lines (strings). The returned dictionary always
    has the keys name, head, deposition_date, release_date,
    structure_method, resolution, structure_reference, journal_reference,
    author, compound and source; keywords and journal are added when
    KEYWDS / JRNL records are present.

    compound and source map a molecule id (a string, "1" by default) to a
    dictionary of the specifications given for that molecule.
    resolution is a float, or None if the value in the
    "REMARK   2 RESOLUTION." line could not be converted.
    """
    # database fields
    pdbh_dict = {'name': "",
                 'head': '',
                 'deposition_date': "1909-01-08",
                 'release_date': "1909-01-08",
                 'structure_method': "unknown",
                 'resolution': 0.0,
                 'structure_reference': "unknown",
                 'journal_reference': "unknown",
                 'author': "",
                 'compound': {'1': {'misc': ''}},
                 'source': {'1': {'misc': ''}}}

    pdbh_dict['structure_reference'] = _get_references(header)
    pdbh_dict['journal_reference'] = _get_journal(header)
    # COMPND and SOURCE are tracked independently of each other.
    comp_molid = "1"
    src_molid = "1"
    last_comp_key = "misc"
    last_src_key = "misc"

    for hh in header:
        h = re.sub(r"[\s\n\r]*\Z", "", hh)  # chop linebreaks off
        key = h[:6].strip()
        tail = h[10:].strip()

        # From here, all the keys from the header are being parsed
        if key == "TITLE":
            name = _chop_end_codes(tail).lower()
            # NOTE: 'name' starts out as "", so the result begins with a
            # space. Kept as is, because callers may rely on it.
            pdbh_dict['name'] += " " + name
        elif key == "HEADER":
            rr = re.search(r"\d\d-\w\w\w-\d\d", tail)
            if rr is not None:
                pdbh_dict['deposition_date'] = \
                    _format_date(_nice_case(rr.group()))
            pdbh_dict['head'] = _chop_end_misc(tail).lower()
        elif key == "COMPND":
            tt = re.sub(r"\;\s*\Z", "", _chop_end_codes(tail)).lower()
            # look for E.C. numbers in COMPND lines
            rec = re.search(r'\d+\.\d+\.\d+\.\d+', tt)
            if rec:
                pdbh_dict['compound'][comp_molid]['ec_number'] = rec.group()
                tt = re.sub(r"\((e\.c\.)*\d+\.\d+\.\d+\.\d+\)", "", tt)
            comp_molid, last_comp_key = _store_mol_field(
                pdbh_dict['compound'], tt, comp_molid, last_comp_key)
        elif key == "SOURCE":
            tt = re.sub(r"\;\s*\Z", "", _chop_end_codes(tail)).lower()
            # Use the SOURCE molecule id here, not the COMPND one: the
            # COMPND id need not exist in the 'source' dictionary.
            src_molid, last_src_key = _store_mol_field(
                pdbh_dict['source'], tt, src_molid, last_src_key)
        elif key == "KEYWDS":
            kwd = _chop_end_codes(tail).lower()
            if 'keywords' in pdbh_dict:
                pdbh_dict['keywords'] += " " + kwd
            else:
                pdbh_dict['keywords'] = kwd
        elif key == "EXPDTA":
            expd = _chop_end_codes(tail)
            # chop junk at end of lines for some structures
            expd = re.sub(r'\s\s\s\s\s\s\s.*\Z', '', expd)
            pdbh_dict['structure_method'] = expd.lower()
        elif key == "CAVEAT":
            # make Annotation entries out of these!!!
            pass
        elif key == "REVDAT":
            rr = re.search(r"\d\d-\w\w\w-\d\d", tail)
            if rr is not None:
                pdbh_dict['release_date'] = \
                    _format_date(_nice_case(rr.group()))
        elif key == "JRNL":
            if 'journal' in pdbh_dict:
                pdbh_dict['journal'] += tail
            else:
                pdbh_dict['journal'] = tail
        elif key == "AUTHOR":
            pdbh_dict['author'] += _nice_case(_chop_end_codes(tail))
        elif key == "REMARK":
            if re.search("REMARK   2 RESOLUTION.", hh):
                r = _chop_end_codes(re.sub("REMARK   2 RESOLUTION.", '', hh))
                r = re.sub(r"\s+ANGSTROM.*", "", r)
                try:
                    pdbh_dict['resolution'] = float(r)
                except ValueError:
                    # nonstandard resolution, e.g. "NOT APPLICABLE"
                    pdbh_dict['resolution'] = None
        # all other record types are ignored

    if pdbh_dict['structure_method'] == 'unknown':
        res = pdbh_dict['resolution']
        # res may be None; comparing None with a float fails on Python 3.
        if res is not None and res > 0.0:
            pdbh_dict['structure_method'] = 'x-ray diffraction'
    return pdbh_dict
00252 
if __name__ == '__main__':
    # Reads a PDB file passed as argument, parses its header, extracts
    # some data and prints it as a dictionary.
    import sys

    if len(sys.argv) < 2:
        sys.exit("Usage: parse_pdb_header.py <pdb file>")
    filename = sys.argv[1]
    # 'with' closes the file even if parsing raises.
    with open(filename, 'r') as handle:
        data_dict = parse_pdb_header(handle)

    # print the dictionary
    # print(...) with one argument and items() work on Python 2 and 3.
    for k, y in data_dict.items():
        print("-" * 40)
        print(k)
        print(y)