Back to index

python-biopython  1.60
Polypeptide.py
Go to the documentation of this file.
00001 # Copyright (C) 2002, Thomas Hamelryck (thamelry@binf.ku.dk)
00002 # This code is part of the Biopython distribution and governed by its
00003 # license.  Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 
00006 """Polypeptide-related classes (construction and representation).
00007 
00008 Simple example with multiple chains,
00009 
00010     >>> from Bio.PDB.PDBParser import PDBParser
00011     >>> from Bio.PDB.Polypeptide import PPBuilder
00012     >>> structure = PDBParser().get_structure('2BEG', 'PDB/2BEG.pdb')
00013     >>> ppb=PPBuilder()
00014     >>> for pp in ppb.build_peptides(structure):
00015     ...     print pp.get_sequence()
00016     LVFFAEDVGSNKGAIIGLMVGGVVIA
00017     LVFFAEDVGSNKGAIIGLMVGGVVIA
00018     LVFFAEDVGSNKGAIIGLMVGGVVIA
00019     LVFFAEDVGSNKGAIIGLMVGGVVIA
00020     LVFFAEDVGSNKGAIIGLMVGGVVIA
00021 
00022 Example with non-standard amino acids using HETATM lines in the PDB file,
00023 in this case selenomethionine (MSE):
00024 
00025     >>> from Bio.PDB.PDBParser import PDBParser
00026     >>> from Bio.PDB.Polypeptide import PPBuilder
00027     >>> structure = PDBParser().get_structure('1A8O', 'PDB/1A8O.pdb')
00028     >>> ppb=PPBuilder()
00029     >>> for pp in ppb.build_peptides(structure):
00030     ...     print pp.get_sequence()
00031     DIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNW
00032     TETLLVQNANPDCKTILKALGPGATLEE
00033     TACQG
00034 
00035 If you want to, you can include non-standard amino acids in the peptides:
00036 
00037     >>> for pp in ppb.build_peptides(structure, aa_only=False):
00038     ...     print pp.get_sequence()
00039     ...     print pp.get_sequence()[0], pp[0].get_resname()
00040     ...     print pp.get_sequence()[-7], pp[-7].get_resname()
00041     ...     print pp.get_sequence()[-6], pp[-6].get_resname()
00042     MDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQNANPDCKTILKALGPGATLEEMMTACQG
00043     M MSE
00044     M MSE
00045     M MSE
00046 
00047 In this case the selenomethionines (the first and also seventh and sixth from
00048 last residues) have been shown as M (methionine) by the get_sequence method.
00049 """
00050 
00051 import warnings
00052 
00053 from Bio.Alphabet import generic_protein
00054 from Bio.Seq import Seq
00055 from Bio.SCOP.Raf import to_one_letter_code
00056 from Bio.PDB.PDBExceptions import PDBException
00057 from Bio.PDB.Residue import Residue, DisorderedResidue
00058 from Bio.PDB.Vector import calc_dihedral, calc_angle
00059 
00060 
00061 standard_aa_names=["ALA", "CYS", "ASP", "GLU", "PHE", "GLY", "HIS", "ILE", "LYS", 
00062                    "LEU", "MET", "ASN", "PRO", "GLN", "ARG", "SER", "THR", "VAL",
00063                    "TRP", "TYR"]
00064 
00065 
00066 aa1="ACDEFGHIKLMNPQRSTVWY"
00067 aa3=standard_aa_names
00068 
00069 d1_to_index={}
00070 dindex_to_1={}
00071 d3_to_index={}
00072 dindex_to_3={}
00073 
00074 # Create some lookup tables
00075 for i in range(0, 20):
00076     n1=aa1[i]
00077     n3=aa3[i]
00078     d1_to_index[n1]=i
00079     dindex_to_1[i]=n1
00080     d3_to_index[n3]=i
00081     dindex_to_3[i]=n3
00082 
00083 def index_to_one(index):
00084     """Index to corresponding one letter amino acid name.
00085     
00086     >>> index_to_one(0)
00087     'A'
00088     >>> index_to_one(19)
00089     'Y'
00090     """
00091     return dindex_to_1[index]
00092 
00093 def one_to_index(s):
00094     """One letter code to index.
00095     
00096     >>> one_to_index('A')
00097     0
00098     >>> one_to_index('Y')
00099     19
00100     """
00101     return d1_to_index[s]
00102 
00103 def index_to_three(i):
00104     """Index to corresponding three letter amino acid name.
00105     
00106     >>> index_to_three(0)
00107     'ALA'
00108     >>> index_to_three(19)
00109     'TYR'
00110     """
00111     return dindex_to_3[i]
00112 
00113 def three_to_index(s):
00114     """Three letter code to index.
00115     
00116     >>> three_to_index('ALA')
00117     0
00118     >>> three_to_index('TYR')
00119     19
00120     """
00121     return d3_to_index[s]
00122 
00123 def three_to_one(s):
00124     """Three letter code to one letter code.
00125     
00126     >>> three_to_one('ALA')
00127     'A'
00128     >>> three_to_one('TYR')
00129     'Y'
00130 
00131     For non-standard amino acids, you get a KeyError:
00132 
00133     >>> three_to_one('MSE')
00134     Traceback (most recent call last):
00135        ...
00136     KeyError: 'MSE'
00137     """
00138     i=d3_to_index[s]
00139     return dindex_to_1[i]
00140 
00141 def one_to_three(s):
00142     """One letter code to three letter code.
00143     
00144     >>> one_to_three('A')
00145     'ALA'
00146     >>> one_to_three('Y')
00147     'TYR'
00148     """
00149     i=d1_to_index[s]
00150     return dindex_to_3[i]
00151 
00152 def is_aa(residue, standard=False):
00153     """Return True if residue object/string is an amino acid.
00154 
00155     @param residue: a L{Residue} object OR a three letter amino acid code
00156     @type residue: L{Residue} or string
00157 
00158     @param standard: flag to check for the 20 AA (default false) 
00159     @type standard: boolean
00160 
00161     >>> is_aa('ALA')
00162     True
00163 
00164     Known three letter codes for modified amino acids are supported,
00165 
00166     >>> is_aa('FME')
00167     True
00168     >>> is_aa('FME', standard=True)
00169     False
00170     """
00171     #TODO - What about special cases like XXX, can they appear in PDB files?
00172     if not isinstance(residue, basestring):
00173         residue=residue.get_resname()
00174     residue=residue.upper()
00175     if standard:
00176         return residue in d3_to_index
00177     else:
00178         return residue in to_one_letter_code
00179 
00180 
00181 class Polypeptide(list):
00182     """A polypeptide is simply a list of L{Residue} objects."""
00183     def get_ca_list(self):
00184         """Get list of C-alpha atoms in the polypeptide.
00185         
00186         @return: the list of C-alpha atoms
00187         @rtype: [L{Atom}, L{Atom}, ...]
00188         """
00189         ca_list=[]
00190         for res in self:
00191             ca=res["CA"]
00192             ca_list.append(ca)
00193         return ca_list
00194 
00195     def get_phi_psi_list(self):
00196         """Return the list of phi/psi dihedral angles."""
00197         ppl=[]
00198         lng=len(self)
00199         for i in range(0, lng):
00200             res=self[i]
00201             try:
00202                 n=res['N'].get_vector()
00203                 ca=res['CA'].get_vector()
00204                 c=res['C'].get_vector()
00205             except:
00206                 # Some atoms are missing
00207                 # Phi/Psi cannot be calculated for this residue
00208                 ppl.append((None, None))
00209                 res.xtra["PHI"]=None
00210                 res.xtra["PSI"]=None
00211                 continue
00212             # Phi
00213             if i>0:
00214                 rp=self[i-1]
00215                 try:
00216                     cp=rp['C'].get_vector()
00217                     phi=calc_dihedral(cp, n, ca, c)
00218                 except:
00219                     phi=None
00220             else:
00221                 # No phi for residue 0!
00222                 phi=None
00223             # Psi
00224             if i<(lng-1):
00225                 rn=self[i+1]
00226                 try:
00227                     nn=rn['N'].get_vector()
00228                     psi=calc_dihedral(n, ca, c, nn)
00229                 except:
00230                     psi=None
00231             else:
00232                 # No psi for last residue!
00233                 psi=None
00234             ppl.append((phi, psi))
00235             # Add Phi/Psi to xtra dict of residue
00236             res.xtra["PHI"]=phi
00237             res.xtra["PSI"]=psi
00238         return ppl
00239 
00240     def get_tau_list(self):
00241         """List of tau torsions angles for all 4 consecutive Calpha atoms."""
00242         ca_list=self.get_ca_list()
00243         tau_list=[]
00244         for i in range(0, len(ca_list)-3):
00245             atom_list = (ca_list[i], ca_list[i+1], ca_list[i+2], ca_list[i+3])
00246             v1, v2, v3, v4 = [a.get_vector() for a in atom_list]
00247             tau=calc_dihedral(v1, v2, v3, v4)
00248             tau_list.append(tau)
00249             # Put tau in xtra dict of residue
00250             res=ca_list[i+2].get_parent()
00251             res.xtra["TAU"]=tau
00252         return tau_list
00253 
00254     def get_theta_list(self):
00255         """List of theta angles for all 3 consecutive Calpha atoms."""
00256         theta_list=[]
00257         ca_list=self.get_ca_list()
00258         for i in range(0, len(ca_list)-2):
00259             atom_list = (ca_list[i], ca_list[i+1], ca_list[i+2])
00260             v1, v2, v3 = [a.get_vector() for a in atom_list]
00261             theta=calc_angle(v1, v2, v3)
00262             theta_list.append(theta)
00263             # Put tau in xtra dict of residue
00264             res=ca_list[i+1].get_parent()
00265             res.xtra["THETA"]=theta
00266         return theta_list
00267 
00268     def get_sequence(self):
00269         """Return the AA sequence as a Seq object.
00270 
00271         @return: polypeptide sequence 
00272         @rtype: L{Seq}
00273         """
00274         s=""
00275         for res in self:
00276             s += to_one_letter_code.get(res.get_resname(), 'X')
00277         seq=Seq(s, generic_protein)
00278         return seq
00279 
00280     def __repr__(self):
00281         """Return string representation of the polypeptide.
00282         
00283         Return <Polypeptide start=START end=END>, where START
00284         and END are sequence identifiers of the outer residues.
00285         """
00286         start=self[0].get_id()[1]
00287         end=self[-1].get_id()[1]
00288         s="<Polypeptide start=%s end=%s>" % (start, end)
00289         return s
00290 
00291 class _PPBuilder:
00292     """Base class to extract polypeptides.
00293     
00294     It checks if two consecutive residues in a chain are connected.
00295     The connectivity test is implemented by a subclass.
00296     
00297     This assumes you want both standard and non-standard amino acids.
00298     """
00299     def __init__(self, radius):
00300         """
00301         @param radius: distance
00302         @type radius: float
00303         """
00304         self.radius=radius
00305 
00306     def _accept(self, residue, standard_aa_only):
00307         """Check if the residue is an amino acid (PRIVATE)."""
00308         if is_aa(residue, standard=standard_aa_only):
00309             return True
00310         elif not standard_aa_only and "CA" in residue.child_dict:
00311             #It has an alpha carbon...
00312             #We probably need to update the hard coded list of
00313             #non-standard residues, see function is_aa for details.
00314             warnings.warn("Assuming residue %s is an unknown modified "
00315                           "amino acid" % residue.get_resname())
00316             return True
00317         else:
00318             # not a standard AA so skip
00319             return False
00320     
00321     def build_peptides(self, entity, aa_only=1):
00322         """Build and return a list of Polypeptide objects.
00323 
00324         @param entity: polypeptides are searched for in this object
00325         @type entity: L{Structure}, L{Model} or L{Chain}
00326 
00327         @param aa_only: if 1, the residue needs to be a standard AA
00328         @type aa_only: int
00329         """
00330         is_connected=self._is_connected
00331         accept=self._accept
00332         level=entity.get_level()
00333         # Decide wich entity we are dealing with
00334         if level=="S":
00335             model=entity[0]
00336             chain_list=model.get_list()
00337         elif level=="M":
00338             chain_list=entity.get_list()
00339         elif level=="C":
00340             chain_list=[entity]
00341         else:
00342             raise PDBException("Entity should be Structure, Model or Chain.")
00343         pp_list=[]
00344         for chain in chain_list:
00345             chain_it=iter(chain)
00346             try:
00347                 prev_res = chain_it.next()
00348                 while not accept(prev_res, aa_only):
00349                     prev_res = chain_it.next()
00350             except StopIteration:
00351                 #No interesting residues at all in this chain
00352                 continue
00353             pp=None
00354             for next_res in chain_it:
00355                 if accept(prev_res, aa_only) \
00356                 and accept(next_res, aa_only) \
00357                 and is_connected(prev_res, next_res):
00358                     if pp is None:
00359                         pp=Polypeptide()
00360                         pp.append(prev_res)
00361                         pp_list.append(pp)
00362                     pp.append(next_res)
00363                 else:
00364                     #Either too far apart, or one of the residues is unwanted.
00365                     #End the current peptide
00366                     pp=None
00367                 prev_res=next_res
00368         return pp_list
00369 
00370 
00371 class CaPPBuilder(_PPBuilder):
00372     """Use CA--CA distance to find polypeptides."""
00373     def __init__(self, radius=4.3):
00374         _PPBuilder.__init__(self, radius)
00375 
00376     def _is_connected(self, prev_res, next_res):
00377         for r in [prev_res, next_res]:
00378             if not r.has_id("CA"):
00379                 return False
00380         n=next_res["CA"]
00381         p=prev_res["CA"]
00382         # Unpack disordered
00383         if n.is_disordered():
00384             nlist=n.disordered_get_list()
00385         else:
00386             nlist=[n]
00387         if p.is_disordered():
00388             plist=p.disordered_get_list()
00389         else:
00390             plist=[p]
00391         for nn in nlist:
00392             for pp in plist:
00393                 if (nn-pp)<self.radius:
00394                     return True
00395         return False
00396 
00397 
00398 class PPBuilder(_PPBuilder):
00399     """Use C--N distance to find polypeptides."""
00400     def __init__(self, radius=1.8):
00401         _PPBuilder.__init__(self, radius)
00402 
00403     def _is_connected(self, prev_res, next_res):
00404         if not prev_res.has_id("C"):
00405             return False
00406         if not next_res.has_id("N"):
00407             return False
00408         test_dist=self._test_dist
00409         c=prev_res["C"]
00410         n=next_res["N"]
00411         # Test all disordered atom positions!
00412         if c.is_disordered():
00413             clist=c.disordered_get_list()
00414         else:
00415             clist=[c]
00416         if n.is_disordered():
00417             nlist=n.disordered_get_list()
00418         else:
00419             nlist=[n]
00420         for nn in nlist:
00421             for cc in clist:
00422                 # To form a peptide bond, N and C must be 
00423                 # within radius and have the same altloc
00424                 # identifier or one altloc blank
00425                 n_altloc=nn.get_altloc()
00426                 c_altloc=cc.get_altloc()
00427                 if n_altloc==c_altloc or n_altloc==" " or c_altloc==" ": 
00428                     if test_dist(nn, cc):
00429                         # Select the disordered atoms that
00430                         # are indeed bonded
00431                         if c.is_disordered():
00432                             c.disordered_select(c_altloc)
00433                         if n.is_disordered():
00434                             n.disordered_select(n_altloc)
00435                         return True
00436         return False
00437 
00438     def _test_dist(self, c, n):
00439         """Return 1 if distance between atoms<radius (PRIVATE)."""
00440         if (c-n)<self.radius:
00441             return 1
00442         else:
00443             return 0
00444     
00445 
00446 if __name__=="__main__":
00447     import sys
00448     from Bio.PDB.PDBParser import PDBParser
00449 
00450     p=PDBParser(PERMISSIVE=True)
00451 
00452     s=p.get_structure("scr", sys.argv[1])
00453 
00454     ppb=PPBuilder()
00455 
00456     print "C-N"
00457     for pp in ppb.build_peptides(s):
00458         print pp.get_sequence()
00459     for pp in ppb.build_peptides(s[0]):
00460         print pp.get_sequence()
00461     for pp in ppb.build_peptides(s[0]["A"]):
00462         print pp.get_sequence()
00463 
00464     for pp in ppb.build_peptides(s):
00465         for phi, psi in pp.get_phi_psi_list():
00466             print phi, psi
00467 
00468     ppb=CaPPBuilder()
00469 
00470     print "CA-CA"
00471     for pp in ppb.build_peptides(s):
00472         print pp.get_sequence()
00473     for pp in ppb.build_peptides(s[0]):
00474         print pp.get_sequence()
00475     for pp in ppb.build_peptides(s[0]["A"]):
00476         print pp.get_sequence()
00477