Back to index

python-biopython  1.60
__init__.py
Go to the documentation of this file.
00001 """Parser for FSSP files, used in a database of protein fold classifications.
00002 
00003 This is a module to handle FSSP files. For now it parses only the header,
00004 summary and alignment sections.
00005 
00006 See: Holm and Sander (1996) The FSSP database: fold classification based on
00007 structure-structure alignment of proteins.
00008 
00009 functions: read_fssp(file_handle): reads an FSSP file into records. Returns a
00010 tuple of three objects: the header, the summary records and the alignment records.
00011 mult_align: returns a Biopython alignment object
00012 """
00013 import re
00014 import fssp_rec
00015 from Bio.Align import Generic
00016 from Bio import Alphabet
00017 fff_rec = fssp_rec.fff_rec
# Patterns recognising each kind of header line. The keys double as the
# names of the FSSPHeader attributes that the matching line fills in.
header_records = {
   'database': re.compile(r'^DATABASE'),
   'pdbid': re.compile(r'^PDBID'),
   'header': re.compile(r'^HEADER'),
   'compnd': re.compile(r'^COMPND'),
   'author': re.compile(r'^AUTHOR'),
   'source': re.compile(r'^SOURCE'),
   'seqlength': re.compile(r'^SEQLENGTH'),
   'nalign': re.compile(r'^NALIGN'),
}

# Title lines that open the three sections of the file.
summary_title = re.compile(r'## +SUMMARY')
alignments_title = re.compile(r'## +ALIGNMENTS')
equiv_title = re.compile(r'## +EQUIVALENCES')

# Data lines inside the SUMMARY section: a record number followed by a
# colon and a four-character identifier that starts with a non-zero digit.
summary_rec = re.compile(r' *[0-9]+: +[1-9][0-9a-z]{3,3}')

# Data lines inside an ALIGNMENTS section: two integers, the second of
# which may be negative.
alignments_rec = re.compile(r' *[0-9]+ +-{0,1}[0-9]+')
00034 
class FSSPHeader(object):
   """Container for the values found on the header lines of an FSSP file.

   A fresh instance holds empty defaults; fill_header() overwrites one
   attribute for every header line it recognises.
   """
   def __init__(self):
      # Numeric fields.
      self.database = None
      self.seqlength = 0
      self.nalign = 0
      # Text fields.
      self.pdbid = ''
      self.header = ''
      self.compnd = ''
      self.source = ''
      self.author = []

   def fill_header(self, inline):
      """Store the content of one header line in the matching attribute.

      inline is a single line of text. It is tested against every pattern
      in header_records; a line that matches none of them is ignored.

      Raises IndexError when a keyword that needs a value is followed by
      nothing, and ValueError when a numeric field is not an integer.
      """
      for name, pattern in header_records.items():
         if not pattern.match(inline):
            continue
         if name in ('database', 'seqlength', 'nalign'):
            # The first word after the keyword, as an integer.
            value = int(inline.split()[1])
         elif name in ('compnd', 'author'):
            # Every word after the keyword, kept as a list.
            value = inline.split()[1:]
         elif name in ('source', 'header'):
            # Everything after the first blank, as one stripped string.
            value = inline[inline.find(' ') + 1:].strip()
         else:
            # The first word after the keyword, as a string.
            value = inline.split()[1]
         setattr(self, name, value)
00057 
class PosAlign(object):
   """One cell of an alignment row: an amino acid plus a structure code.

   Attributes:
     aa  - one-letter residue code, or '-' for a gap
     ss  - upper-cased second character of the cell, or '0' when the cell
           has no second character or is a gap
     gap - 1 when the cell is the gap marker '..', otherwise 0
   """
   def __init__(self, inStr):
      """Parse a one- or two-character alignment cell.

      Surrounding whitespace is ignored. Raises ValueError when the
      stripped cell is not exactly one or two characters long.
      """
      inStr = inStr.strip()
      if len(inStr) not in (1, 2):
         raise ValueError('PosAlign: expected 1 or 2 characters, got %r'
                          % inStr)
      if inStr == '..':
         self.aa = '-'
         self.gap = 1
         # Placeholder so that every instance has an ss attribute; '0' is
         # the same "no information" value used for one-character cells.
         self.ss = '0'
      else:
         self.gap = 0
         self.aa = inStr[0]
         # A residue that equals its own lower-case form is recorded as
         # 'C'. NOTE(review): presumably the convention of writing bridged
         # cysteines as lower-case letters - confirm against the format
         # specification.
         if self.aa == self.aa.lower():
            self.aa = 'C'
         if len(inStr) == 2:
            self.ss = inStr[1].upper()
         else:
            self.ss = '0'

   def __repr__(self):
      """Return '..' for a gap, else the residue plus lower-cased ss."""
      if self.gap:
         return '..'
      return self.aa + self.ss.lower()

   __str__ = __repr__
00084 
00085 
00086 
00087 
class FSSPSumRec(object):
   """One record of the SUMMARY section of an FSSP file.

   The whitespace-separated columns of the line are stored, in order, as
   nr, pdb1/chain1, pdb2/chain2, zscore, rmsd, lali, lseq2, pID, revers,
   permut, nfrag and topo; any remaining words are joined into doc. The
   untouched input line is kept in raw.
   """

   @staticmethod
   def _split_id(ident, error_text):
      """Return (first four characters, chain) of a 4- or 5-character id.

      A four-character id has no chain and yields '0' in its place. Any
      other length raises ValueError(error_text).
      """
      if len(ident) == 4:
         return ident[:4], '0'
      if len(ident) == 5:
         return ident[:4], ident[4]
      raise ValueError(error_text)

   def __init__(self, in_str):
      """Parse one summary line.

      Raises ValueError for an identifier of the wrong length or a column
      that is not numeric, and IndexError when columns are missing.
      """
      self.raw = in_str
      columns = in_str.strip().split()
      # The record number is written with a trailing colon.
      self.nr = int(columns[0][:-1])
      self.pdb1, self.chain1 = self._split_id(columns[1], 'Bad PDB ID 1')
      self.pdb2, self.chain2 = self._split_id(columns[2], 'Bad PDB ID 2')
      # Real-valued columns.
      self.zscore = float(columns[3])
      self.rmsd = float(columns[4])
      self.lali = float(columns[5])
      self.lseq2 = float(columns[6])
      self.pID = float(columns[7])
      # Integer columns.
      self.revers = int(columns[8])
      self.permut = int(columns[9])
      self.nfrag = int(columns[10])
      self.topo = columns[11]
      # Free text: remaining words separated by single blanks, newline
      # terminated (just a newline when there is no text).
      self.doc = ' '.join(columns[12:]) + '\n'

   def __repr__(self):
      """Return the original input line unchanged."""
      return self.raw

   __str__ = __repr__
00126 
class FSSPAlignRec(object):
   """One residue row of an ALIGNMENTS section.

   The fixed fields of the row are read from in_fff_rec using the
   positions defined in fssp_rec.align. The aligned cells are added
   afterwards with add_align_list() and numbered from 1 by
   pos_align_list2dict().
   """
   def __init__(self, in_fff_rec):
      """Extract the fixed fields from in_fff_rec.

      in_fff_rec must support indexing with the members of fssp_rec.align.
      """
      where = fssp_rec.align
      self.abs_res_num = int(in_fff_rec[where.abs_res_num])
      self.pdb_res_num = in_fff_rec[where.pdb_res_num].strip()
      chain = in_fff_rec[where.chain_id]
      # A blank chain field is stored as '0'.
      if chain == ' ':
         chain = '0'
      self.chain_id = chain
      residue = in_fff_rec[where.res_name]
      # A residue that equals its own lower-case form is stored as 'C'.
      if residue == residue.lower():
         residue = 'C'
      self.res_name = residue
      self.ss1 = in_fff_rec[where.ss1]
      self.turn3 = in_fff_rec[where.turn3]
      self.turn4 = in_fff_rec[where.turn4]
      self.turn5 = in_fff_rec[where.turn5]
      self.pos_align_dict = {}
      self.PosAlignList = []

   def add_align_list(self, align_list):
      """Append a PosAlign for every cell string in align_list."""
      for cell in align_list:
         self.PosAlignList.append(PosAlign(cell))

   def pos_align_list2dict(self):
      """Copy PosAlignList into pos_align_dict under the keys 1, 2, 3..."""
      for offset, cell in enumerate(self.PosAlignList):
         self.pos_align_dict[offset + 1] = cell
00152 
00153 
class FSSPAlignDict(dict):
   """Dictionary of alignment records with lookup by residue number.

   The values are record objects that provide abs_res_num, pdb_res_num and
   pos_align_dict. After the dictionary has been filled,
   build_resnum_list() must be called once to create the two indexes used
   by abs(), pdb(), sequence() and fasta_mult_align().
   """
   def __init__(self):
      dict.__init__(self)
      # Indexes pointing back into self:
      #   pdb_res_dict maps PDB residue number      -> key in self
      #   abs_res_dict maps absolute residue number -> key in self
      self.pdb_res_dict = {}
      self.abs_res_dict = {}
      # Not used by this class; kept so existing callers that touch the
      # attribute keep working.
      self.data = {}

   def build_resnum_list(self):
      """(Re)build both residue-number indexes from the current content."""
      for key in self:
         record = self[key]
         self.abs_res_dict[record.abs_res_num] = key
         self.pdb_res_dict[record.pdb_res_num] = key

   def abs(self, num):
      """Return the record with absolute residue number num.

      Raises KeyError when the number is not in the index.
      """
      return self[self.abs_res_dict[num]]

   def pdb(self, num):
      """Return the record with PDB residue number num.

      Raises KeyError when the number is not in the index.
      """
      return self[self.pdb_res_dict[num]]

   def sequence(self, num):
      """Return the residues of aligned sequence num as one string.

      The residues are taken in order of absolute residue number.
      """
      return ''.join([self.abs(pos).pos_align_dict[num].aa
                      for pos in sorted(self.abs_res_dict)])

   def fasta_mult_align(self):
      """Return every aligned sequence as FASTA-style text.

      Each sequence gets a '> N' title line, N being its alignment number,
      followed by its residues in order of absolute residue number.
      """
      # The alignment numbers are taken from the record of residue 1.
      residues = {}
      for seq_num in self.abs(1).pos_align_dict:
         residues[seq_num] = []
      # Walk the records in residue order: plain dictionary iteration does
      # not guarantee it and would scramble the sequences.
      ordered = sorted(self.values(), key=lambda record: record.abs_res_num)
      for record in ordered:
         for seq_num in record.pos_align_dict:
            residues[seq_num].append(record.pos_align_dict[seq_num].aa)
      lines = []
      for seq_num in sorted(residues):
         seq = ''.join(residues[seq_num])
         lines.append('> %d' % seq_num)
         # Historical layout, kept for compatibility: the first line holds
         # 71 residues and every following line holds 72.
         lines.append(seq[:71])
         lines.extend([seq[start:start + 72]
                       for start in range(71, len(seq), 72)])
      return ''.join([line + '\n' for line in lines])
00204 
class FSSPSumDict(dict):
   """Plain dictionary subclass used to hold the summary records."""
00207 
def read_fssp(fssp_handle):
   """Parse an FSSP file into its header, summary and alignment records.

   fssp_handle is an open file-like object; only its readline() method is
   used, and reading stops at the EQUIVALENCES title line.

   Returns a 3-tuple (header, sum_dict, align_dict):
     header     - FSSPHeader filled from the lines before the SUMMARY title
     sum_dict   - FSSPSumDict mapping record number to FSSPSumRec
     align_dict - FSSPAlignDict mapping chain id + residue name + PDB
                  residue number to FSSPAlignRec, with its residue-number
                  indexes already built

   Raises ValueError when the input ends before the SUMMARY title or
   before the EQUIVALENCES title, and EOFError when it ends directly after
   an alignment record.
   """
   header = FSSPHeader()
   sum_dict = FSSPSumDict()
   align_dict = FSSPAlignDict()

   # Header: every line up to the SUMMARY title.
   curline = fssp_handle.readline()
   while not summary_title.match(curline):
      if not curline:
         # readline() returns '' at end of file; without this check the
         # loop would never terminate on a file with no SUMMARY section.
         raise ValueError('Bad FSSP file: no summary record found')
      header.fill_header(curline)
      curline = fssp_handle.readline()

   # Summary: skip the line that follows the title, then read records.
   curline = fssp_handle.readline()
   curline = fssp_handle.readline()
   while summary_rec.match(curline):
      cur_sum_rec = FSSPSumRec(curline)
      sum_dict[cur_sum_rec.nr] = cur_sum_rec
      curline = fssp_handle.readline()

   # Alignments: one or more blocks, ended by the EQUIVALENCES title.
   seen_alignments = False
   while True:
      # Skip forward to the next section title.
      while not (alignments_title.match(curline) or
                 equiv_title.match(curline)):
         if not curline:
            if seen_alignments:
               raise ValueError(
                  'Bad FSSP file: no equivalences title record found')
            raise ValueError(
               'Bad FSSP file: no alignments title record found')
         curline = fssp_handle.readline()
      if equiv_title.match(curline):
         break
      seen_alignments = True
      # Skip the line that follows the title, then read records.
      curline = fssp_handle.readline()
      curline = fssp_handle.readline()
      while alignments_rec.match(curline):
         align_rec = FSSPAlignRec(fff_rec(curline))
         key = (align_rec.chain_id + align_rec.res_name +
                str(align_rec.pdb_res_num))
         align_list = curline[fssp_rec.align.start_aa_list:].strip().split()
         # A residue met again in a later block extends the record that
         # already exists instead of replacing it.
         if key not in align_dict:
            align_dict[key] = align_rec
         align_dict[key].add_align_list(align_list)
         curline = fssp_handle.readline()
         if not curline:
            raise EOFError('Bad FSSP file: unexpected end of file in the '
                           'alignment records')

   # Number the collected cells of every record and drop the working list.
   for align_rec in align_dict.values():
      align_rec.pos_align_list2dict()
      del align_rec.PosAlignList
   align_dict.build_resnum_list()
   return (header, sum_dict, align_dict)
00267