Back to index

python-biopython  1.60
CodonUsage.py
Go to the documentation of this file.
00001 import math
00002 from CodonUsageIndices import SharpEcoliIndex
00003 from Bio import SeqIO # To parse a FASTA file
00004 
00005 
00006 CodonsDict = {'TTT':0, 'TTC':0, 'TTA':0, 'TTG':0, 'CTT':0, 
00007 'CTC':0, 'CTA':0, 'CTG':0, 'ATT':0, 'ATC':0, 
00008 'ATA':0, 'ATG':0, 'GTT':0, 'GTC':0, 'GTA':0, 
00009 'GTG':0, 'TAT':0, 'TAC':0, 'TAA':0, 'TAG':0, 
00010 'CAT':0, 'CAC':0, 'CAA':0, 'CAG':0, 'AAT':0, 
00011 'AAC':0, 'AAA':0, 'AAG':0, 'GAT':0, 'GAC':0, 
00012 'GAA':0, 'GAG':0, 'TCT':0, 'TCC':0, 'TCA':0, 
00013 'TCG':0, 'CCT':0, 'CCC':0, 'CCA':0, 'CCG':0, 
00014 'ACT':0, 'ACC':0, 'ACA':0, 'ACG':0, 'GCT':0, 
00015 'GCC':0, 'GCA':0, 'GCG':0, 'TGT':0, 'TGC':0, 
00016 'TGA':0, 'TGG':0, 'CGT':0, 'CGC':0, 'CGA':0, 
00017 'CGG':0, 'AGT':0, 'AGC':0, 'AGA':0, 'AGG':0, 
00018 'GGT':0, 'GGC':0, 'GGA':0, 'GGG':0}
00019 
00020 
00021 # this dictionary shows which codons encode the same AA
00022 SynonymousCodons = {
00023     'CYS': ['TGT', 'TGC'], 
00024     'ASP': ['GAT', 'GAC'],
00025     'SER': ['TCT', 'TCG', 'TCA', 'TCC', 'AGC', 'AGT'],
00026     'GLN': ['CAA', 'CAG'], 
00027     'MET': ['ATG'], 
00028     'ASN': ['AAC', 'AAT'],
00029     'PRO': ['CCT', 'CCG', 'CCA', 'CCC'], 
00030     'LYS': ['AAG', 'AAA'],
00031     'STOP': ['TAG', 'TGA', 'TAA'], 
00032     'THR': ['ACC', 'ACA', 'ACG', 'ACT'],
00033     'PHE': ['TTT', 'TTC'], 
00034     'ALA': ['GCA', 'GCC', 'GCG', 'GCT'],
00035     'GLY': ['GGT', 'GGG', 'GGA', 'GGC'], 
00036     'ILE': ['ATC', 'ATA', 'ATT'],
00037     'LEU': ['TTA', 'TTG', 'CTC', 'CTT', 'CTG', 'CTA'], 
00038     'HIS': ['CAT', 'CAC'],
00039     'ARG': ['CGA', 'CGC', 'CGG', 'CGT', 'AGG', 'AGA'], 
00040     'TRP': ['TGG'],
00041     'VAL': ['GTA', 'GTC', 'GTG', 'GTT'], 
00042     'GLU': ['GAG', 'GAA'], 
00043     'TYR': ['TAT', 'TAC']
00044 }
00045 
00046 
00047 class CodonAdaptationIndex(object):
00048     """A codon adaptation index (CAI) implementation.
00049     
00050     Implements the codon adaptation index (CAI) described by Sharp and
00051     Li (Nucleic Acids Res. 1987 Feb 11;15(3):1281-95).
00052 
00053     NOTE - This implementation does not currently cope with alternative genetic
00054     codes: only the synonymous codons in the standard table are considered.
00055     """
00056     
00057     def __init__(self):
00058         self.index = {}
00059         self.codon_count = {}
00060     
00061     # use this method with predefined CAI index
00062     def set_cai_index(self, index):
00063         """Sets up an index to be used when calculating CAI for a gene.
00064         Just pass a dictionary similar to the SharpEcoliIndex in the
00065         CodonUsageIndices module.
00066         """
00067         self.index = index  
00068     
00069     def generate_index(self, fasta_file):
00070         """Generate a codon usage index from a FASTA file of CDS sequences.
00071         
00072         Takes a location of a Fasta file containing CDS sequences
00073         (which must all have a whole number of codons) and generates a codon
00074         usage index.
00075         """
00076         
00077         # first make sure we're not overwriting an existing index:
00078         if self.index != {} or self.codon_count != {}:
00079             raise ValueError("an index has already been set or a codon count has been done. cannot overwrite either.")
00080             
00081         # count codon occurrences in the file.
00082         self._count_codons(fasta_file)   
00083     
00084         # now to calculate the index we first need to sum the number of times
00085         # synonymous codons were used all together.
00086         for aa in SynonymousCodons:
00087             total = 0.0
00088             rcsu = [] # RCSU values are CodonCount/((1/num of synonymous codons) * sum of all synonymous codons)
00089             codons = SynonymousCodons[aa]
00090             
00091             for codon in codons:
00092                 total += self.codon_count[codon]
00093                 
00094             # calculate the RSCU value for each of the codons
00095             for codon in codons:
00096                 denominator = float(total) / len(codons)
00097                 rcsu.append(self.codon_count[codon] / denominator)
00098             
00099             # now generate the index W=RCSUi/RCSUmax:
00100             rcsu_max = max(rcsu)
00101             for i in range(len(codons)):
00102                 self.index[codons[i]] = rcsu[i] / rcsu_max
00103         
00104         
00105     def cai_for_gene(self, dna_sequence):
00106         """Calculate the CAI (float) for the provided DNA sequence (string).
00107         
00108         This method uses the Index (either the one you set or the one you generated)
00109         and returns the CAI for the DNA sequence.
00110         """
00111         cai_value, cai_length = 0, 0
00112 
00113         # if no index is set or generated, the default SharpEcoliIndex will be used.
00114         if self.index == {}:
00115             self.set_cai_index(SharpEcoliIndex)
00116             
00117         if dna_sequence.islower():
00118             dna_sequence = dna_sequence.upper()
00119 
00120         for i in range (0, len(dna_sequence), 3):
00121             codon = dna_sequence[i:i+3]
00122             if codon in self.index:
00123                 if codon not in ['ATG', 'TGG']: # these two codons are always one, exclude them
00124                     cai_value += math.log(self.index[codon])
00125                     cai_length += 1
00126             elif codon not in ['TGA', 'TAA', 'TAG']: # some indices may not include stop codons
00127                 raise TypeError("illegal codon in sequence: %s.\n%s" % (codon, self.index))
00128 
00129         return math.exp(cai_value / (cai_length - 1.0))
00130         
00131             
00132     def _count_codons(self, fasta_file):
00133         handle = open(fasta_file, 'r')
00134         
00135         # make the codon dictionary local
00136         self.codon_count = CodonsDict.copy()
00137 
00138         # iterate over sequence and count all the codons in the FastaFile.
00139         for cur_record in SeqIO.parse(handle, "fasta"):
00140             # make sure the sequence is lower case
00141             if str(cur_record.seq).islower():
00142                 dna_sequence = str(cur_record.seq).upper()
00143             else:
00144                 dna_sequence = str(cur_record.seq)
00145             for i in range(0, len(dna_sequence), 3):
00146                 codon = dna_sequence[i:i+3]
00147                 if codon in self.codon_count:
00148                     self.codon_count[codon] += 1
00149                 else:
00150                     raise TypeError("illegal codon %s in gene: %s" % (codon, cur_record.id))
00151         handle.close()
00152     
00153     # this just gives the index when the objects is printed.
00154     def print_index(self):
00155         """Prints out the index used."""
00156         for i in sorted(self.index):
00157             print "%s\t%.3f" % (i, self.index[i])
00158