Back to index

python-biopython  1.60
ProtParam.py
Go to the documentation of this file.
00001 # Copyright Yair Benita Y.Benita@pharm.uu.nl
00002 # Biopython (http://biopython.org) license applies
00003 
00004 """Simple protein analysis.
00005 
00006 Example,
00007 
00008 X = ProteinAnalysis("MAEGEITTFTALTEKFNLPPGNYKKPKLLYCSNGGHFLRILPDGTVDGTRDRSDQHIQLQLSAESVGEVYIKSTETGQYLAMDTSGLLYGSQTPSEECLFLERLEENHYNTYTSKKHAEKNWFVGLKKNGSCKRGPRTHYGQKAILFLPLPV")
00009 print X.count_amino_acids()
00010 print X.get_amino_acids_percent()
00011 print X.molecular_weight()
00012 print X.aromaticity()
00013 print X.instability_index()
00014 print X.flexibility()
00015 print X.isoelectric_point()
00016 print X.secondary_structure_fraction()
00017 print X.protein_scale(ProtParamData.kd, 9, 0.4)
00018 """
00019 
00020 import sys
00021 import ProtParamData, IsoelectricPoint
00022 from Bio.Seq import Seq
00023 from Bio.Alphabet import IUPAC
00024 from Bio.Data import IUPACData
00025 
00026 
00027 class ProteinAnalysis(object):
00028     """Class containing methods for protein analysis.
00029 
00030     The constructor takes one argument: the protein sequence as a
00031     string and builds a sequence object using the Bio.Seq module. This is done
00032     just to make sure the sequence is a protein sequence and not anything else.
00033     
00034     """
00035     def __init__(self, prot_sequence):
00036         if prot_sequence.islower():
00037             self.sequence = Seq(prot_sequence.upper(), IUPAC.protein)
00038         else:
00039             self.sequence = Seq(prot_sequence, IUPAC.protein)
00040         self.amino_acids_content = None
00041         self.amino_acids_percent = None
00042         self.length = len(self.sequence)
00043         
00044     def count_amino_acids(self):
00045         """Count standard amino acids, returns a dict.
00046             
00047         Counts the number times each amino acid is in the protein
00048         sequence. Returns a dictionary {AminoAcid:Number}.
00049         
00050         The return value is cached in self.amino_acids_content.
00051         It is not recalculated upon subsequent calls.
00052         """
00053         if self.amino_acids_content is None:
00054             prot_dic = dict([(k, 0) for k in IUPACData.protein_letters])
00055             for aa in prot_dic:
00056                 prot_dic[aa] = self.sequence.count(aa)
00057             
00058             self.amino_acids_content = prot_dic
00059             
00060         return self.amino_acids_content
00061     
00062     def get_amino_acids_percent(self):
00063         """Calculate the amino acid content in percentages.
00064 
00065         The same as count_amino_acids only returns the Number in percentage of
00066         entire sequence. Returns a dictionary of {AminoAcid:percentage}.
00067         
00068         The return value is cached in self.amino_acids_percent.
00069         
00070         input is the dictionary self.amino_acids_content.
00071         output is a dictionary with amino acids as keys.
00072         """
00073         if self.amino_acids_percent is None:
00074             aa_counts = self.count_amino_acids()
00075                 
00076             percentages = {}
00077             for aa in aa_counts:
00078                 percentages[aa] = aa_counts[aa] / float(self.length)
00079                 
00080             self.amino_acids_percent = percentages
00081 
00082         return self.amino_acids_percent
00083 
00084     def molecular_weight (self):
00085         """Calculate MW from Protein sequence"""
00086         # make local dictionary for speed
00087         aa_weights = {}
00088         for i in IUPACData.protein_weights:
00089             # remove a molecule of water from the amino acid weight
00090             aa_weights[i] = IUPACData.protein_weights[i] - 18.02
00091 
00092         total_weight = 18.02 # add just one water molecule for the whole sequence
00093         for aa in self.sequence:
00094             total_weight += aa_weights[aa]
00095 
00096         return total_weight
00097 
00098     def aromaticity(self):
00099         """Calculate the aromaticity according to Lobry, 1994.
00100 
00101         Calculates the aromaticity value of a protein according to Lobry, 1994.
00102         It is simply the relative frequency of Phe+Trp+Tyr.
00103         """
00104         aromatic_aas = 'YWF'
00105         aa_percentages = self.get_amino_acids_percent()
00106         
00107         aromaticity = sum([aa_percentages[aa] for aa in aromatic_aas])
00108 
00109         return aromaticity
00110 
00111     def instability_index(self):
00112         """Calculate the instability index according to Guruprasad et al 1990.
00113 
00114         Implementation of the method of Guruprasad et al. 1990 to test a
00115         protein for stability. Any value above 40 means the protein is unstable
00116         (has a short half life). 
00117         
00118         See: Guruprasad K., Reddy B.V.B., Pandit M.W.
00119         Protein Engineering 4:155-161(1990).
00120         """
00121         index = ProtParamData.DIWV
00122         score = 0.0
00123         
00124         for i in range(self.length - 1):
00125             this, next = self.sequence[i:i+2]
00126             dipeptide_value = index[this][next]
00127             score += dipeptide_value
00128 
00129         return (10.0 / self.length) * score
00130 
00131     def flexibility(self):
00132         """Calculate the flexibility according to Vihinen, 1994.
00133         
00134         No argument to change window size because parameters are specific for a
00135         window=9. The parameters used are optimized for determining the flexibility.
00136         """
00137         flexibilities = ProtParamData.Flex
00138         window_size = 9
00139         weights = [0.25, 0.4375, 0.625, 0.8125, 1]
00140         scores = []
00141 
00142         for i in range(self.length - window_size):
00143             subsequence = self.sequence[i:i+window_size]
00144             score = 0.0
00145 
00146             for j in range(window_size // 2):
00147                 front = subsequence[j]
00148                 back = subsequence[window_size - j - 1]
00149                 score += (flexibilities[front] + flexibilities[back]) * weights[j]
00150 
00151             middle = subsequence[window_size // 2 + 1]
00152             score += flexibilities[middle]
00153             
00154             scores.append(score / 5.25)
00155 
00156         return scores
00157 
00158     def gravy(self):
00159         """Calculate the gravy according to Kyte and Doolittle."""
00160         total_gravy = sum(ProtParamData.kd[aa] for aa in self.sequence)
00161             
00162         return total_gravy / self.length
00163 
00164 
00165     def _weight_list(self, window, edge):
00166         """Makes a list of relative weight of the
00167         window edges compared to the window center. The weights are linear.
00168         it actually generates half a list. For a window of size 9 and edge 0.4
00169         you get a list of [0.4, 0.55, 0.7, 0.85]. 
00170         """
00171         unit = 2 * (1.0 - edge) / (window - 1)
00172         weights = [0.0] * (window // 2)
00173         
00174         for i in range(window // 2):
00175             weights[i] = edge + unit * i
00176 
00177         return weights
00178     
00179     def protein_scale(self, param_dict, window, edge=1.0):
00180         """Compute a profile by any amino acid scale.
00181         
00182         An amino acid scale is defined by a numerical value assigned to each type of
00183         amino acid. The most frequently used scales are the hydrophobicity or
00184         hydrophilicity scales and the secondary structure conformational parameters
00185         scales, but many other scales exist which are based on different chemical and
00186         physical properties of the amino acids.  You can set several parameters that
00187         control the computation  of a scale profile, such as the window size and the
00188         window edge relative weight value.  
00189         
00190         WindowSize: The window size is the length
00191         of the interval to use for the profile computation. For a window size n, we
00192         use the i-(n-1)/2 neighboring residues on each side to compute
00193         the score for residue i. The score for residue i is the sum of the scaled values
00194         for these amino acids, optionally weighted according to their position in the
00195         window.  
00196         
00197         Edge: The central amino acid of the window always has a weight of 1.
00198         By default, the amino acids at the remaining window positions have the same
00199         weight, but you can make the residue at the center of the window  have a
00200         larger weight than the others by setting the edge value for the  residues at
00201         the beginning and end of the interval to a value between 0 and 1. For
00202         instance, for Edge=0.4 and a window size of 5 the weights will be: 0.4, 0.7,
00203         1.0, 0.7, 0.4.  
00204         
00205         The method returns a list of values which can be plotted to
00206         view the change along a protein sequence.  Many scales exist. Just add your
00207         favorites to the ProtParamData modules.
00208 
00209         Similar to expasy's ProtScale: http://www.expasy.org/cgi-bin/protscale.pl
00210         """
00211         # generate the weights
00212         #   _weight_list returns only one tail. If the list should be [0.4,0.7,1.0,0.7,0.4]
00213         #   what you actually get from _weights_list is [0.4,0.7]. The correct calculation is done
00214         #   in the loop.
00215         weights = self._weight_list(window, edge)
00216         scores = []
00217         
00218         # the score in each Window is divided by the sum of weights
00219         # (* 2 + 1) since the weight list is one sided:
00220         sum_of_weights = sum(weights) * 2 + 1
00221         
00222         for i in range(self.length - window + 1):
00223             subsequence = self.sequence[i:i+window]
00224             score = 0.0
00225             
00226             for j in range(window // 2):
00227                 # walk from the outside of the Window towards the middle.
00228                 # Iddo: try/except clauses added to avoid raising an exception on a non-standard amino acid
00229                 try:
00230                     front = param_dict[subsequence[j]]
00231                     back = param_dict[subsequence[window - j - 1]]
00232                     score += weights[j] * front + weights[j] * back
00233                 except KeyError:
00234                     sys.stderr.write('warning: %s or %s is not a standard amino acid.\n' %
00235                              (subsequence[j], subsequence[window - j - 1]))
00236 
00237             # Now add the middle value, which always has a weight of 1.
00238             middle = subsequence[window // 2]
00239             if middle in param_dict:
00240                 score += param_dict[middle]
00241             else:
00242                 sys.stderr.write('warning: %s  is not a standard amino acid.\n' % (middle))
00243         
00244             scores.append(score / sum_of_weights)
00245             
00246         return scores
00247 
00248     def isoelectric_point(self):
00249         """Calculate the isoelectric point.
00250         
00251         Uses the module IsoelectricPoint to calculate the pI of a protein.
00252         """
00253         aa_content = self.count_amino_acids()
00254             
00255         ie_point = IsoelectricPoint.IsoelectricPoint(self.sequence, aa_content)
00256         return ie_point.pi()
00257         
00258     def secondary_structure_fraction (self):
00259         """Calculate fraction of helix, turn and sheet.
00260         
00261         Returns a list of the fraction of amino acids which tend
00262         to be in Helix, Turn or Sheet.
00263         
00264         Amino acids in helix: V, I, Y, F, W, L.
00265         Amino acids in Turn: N, P, G, S.
00266         Amino acids in sheet: E, M, A, L.
00267         
00268         Returns a tuple of three integers (Helix, Turn, Sheet).
00269         """
00270         aa_percentages = self.get_amino_acids_percent()
00271             
00272         helix = sum([aa_percentages[r] for r in 'VIYFWL'])
00273         turn  = sum([aa_percentages[r] for r in 'NPGS'])
00274         sheet = sum([aa_percentages[r] for r in 'EMAL'])
00275 
00276         return helix, turn, sheet
00277