Back to index

python-biopython  1.60
__init__.py
Go to the documentation of this file.
00001 #!/usr/bin/env python
00002 # Created: Wed May 29 08:07:18 2002
00003 # thomas@cbs.dtu.dk, Cecilia.Alsmark@ebc.uu.se
00004 # Copyright 2001 by Thomas Sicheritz-Ponten and Cecilia Alsmark.
00005 # All rights reserved.
00006 # This code is part of the Biopython distribution and governed by its
00007 # license.  Please see the LICENSE file that should have been included
00008 # as part of this package.
00009 
00010 """Miscellaneous functions for dealing with sequences."""
00011 
00012 import re, time
00013 from Bio import SeqIO
00014 from Bio.Seq import Seq
00015 from Bio import Alphabet
00016 from Bio.Alphabet import IUPAC
00017 from Bio.Data import IUPACData, CodonTable
00018 
00019 
00020 ######################################
00021 # DNA
00022 ######################
00023 # {{{ 
00024 
00025 
00026 def GC(seq):
00027     """Calculates G+C content, returns the percentage (float between 0 and 100).
00028 
00029     Copes mixed case sequences, and with the ambiguous nucleotide S (G or C)
00030     when counting the G and C content.  The percentage is calculated against
00031     the full length, e.g.: 
00032 
00033     >>> from Bio.SeqUtils import GC
00034     >>> GC("ACTGN")
00035     40.0
00036 
00037     Note that this will return zero for an empty sequence.
00038     """
00039     try:
00040         gc = sum(map(seq.count,['G','C','g','c','S','s']))
00041         return gc*100.0/len(seq)
00042     except ZeroDivisionError:
00043         return 0.0
00044         
00045     
00046 def GC123(seq):
00047     """Calculates total G+C content plus first, second and third positions.
00048 
00049     Returns a tuple of four floats (percentages between 0 and 100) for the
00050     entire sequence, and the three codon positions.  e.g.
00051 
00052     >>> from Bio.SeqUtils import GC123
00053     >>> GC123("ACTGTN")
00054     (40.0, 50.0, 50.0, 0.0)
00055 
00056     Copes with mixed case sequences, but does NOT deal with ambiguous
00057     nucleotides.
00058     """
00059     d= {}
00060     for nt in ['A','T','G','C']:
00061        d[nt] = [0,0,0]
00062 
00063     for i in range(0,len(seq),3):
00064         codon = seq[i:i+3]
00065         if len(codon) <3: codon += '  '
00066         for pos in range(0,3):
00067             for nt in ['A','T','G','C']:
00068                 if codon[pos] == nt or codon[pos] == nt.lower():
00069                     d[nt][pos] += 1
00070     gc = {}
00071     gcall = 0
00072     nall = 0
00073     for i in range(0,3):
00074         try:
00075             n = d['G'][i] + d['C'][i] +d['T'][i] + d['A'][i]
00076             gc[i] = (d['G'][i] + d['C'][i])*100.0/n
00077         except:
00078             gc[i] = 0
00079 
00080         gcall = gcall + d['G'][i] + d['C'][i]
00081         nall = nall + n
00082 
00083     gcall = 100.0*gcall/nall
00084     return gcall, gc[0], gc[1], gc[2]
00085 
00086 def GC_skew(seq, window = 100):
00087     """Calculates GC skew (G-C)/(G+C) for multuple windows along the sequence.
00088 
00089     Returns a list of ratios (floats), controlled by the length of the sequence
00090     and the size of the window.
00091 
00092     Does NOT look at any ambiguous nucleotides.
00093     """
00094     # 8/19/03: Iddo: added lowercase 
00095     values = []
00096     for i in range(0, len(seq), window):
00097         s = seq[i: i + window]
00098         g = s.count('G') + s.count('g')
00099         c = s.count('C') + s.count('c')
00100         skew = (g-c)/float(g+c)
00101         values.append(skew)
00102     return values
00103 
00104 from math import pi, sin, cos, log
00105 def xGC_skew(seq, window = 1000, zoom = 100,
00106                          r = 300, px = 100, py = 100):
00107     """Calculates and plots normal and accumulated GC skew (GRAPHICS !!!)."""
00108     from Tkinter import Scrollbar, Canvas, BOTTOM, BOTH, ALL, \
00109                         VERTICAL, HORIZONTAL, RIGHT, LEFT, X, Y
00110     yscroll = Scrollbar(orient = VERTICAL)
00111     xscroll = Scrollbar(orient = HORIZONTAL)
00112     canvas = Canvas(yscrollcommand = yscroll.set,
00113                     xscrollcommand = xscroll.set, background = 'white')
00114     win = canvas.winfo_toplevel()
00115     win.geometry('700x700')
00116    
00117     yscroll.config(command = canvas.yview)
00118     xscroll.config(command = canvas.xview)
00119     yscroll.pack(side = RIGHT, fill = Y)
00120     xscroll.pack(side = BOTTOM, fill = X)
00121     canvas.pack(fill=BOTH, side = LEFT, expand = 1)
00122     canvas.update()
00123 
00124     X0, Y0  = r + px, r + py
00125     x1, x2, y1, y2 = X0 - r, X0 + r, Y0 -r, Y0 + r
00126    
00127     ty = Y0
00128     canvas.create_text(X0, ty, text = '%s...%s (%d nt)' % (seq[:7], seq[-7:], len(seq)))
00129     ty +=20
00130     canvas.create_text(X0, ty, text = 'GC %3.2f%%' % (GC(seq)))
00131     ty +=20
00132     canvas.create_text(X0, ty, text = 'GC Skew', fill = 'blue')
00133     ty +=20
00134     canvas.create_text(X0, ty, text = 'Accumulated GC Skew', fill = 'magenta')
00135     ty +=20
00136     canvas.create_oval(x1,y1, x2, y2)
00137 
00138     acc = 0
00139     start = 0
00140     for gc in GC_skew(seq, window):
00141         r1 = r
00142         acc+=gc
00143         # GC skew
00144         alpha = pi - (2*pi*start)/len(seq)
00145         r2 = r1 - gc*zoom
00146         x1 = X0 + r1 * sin(alpha)
00147         y1 = Y0 + r1 * cos(alpha)
00148         x2 = X0 + r2 * sin(alpha)
00149         y2 = Y0 + r2 * cos(alpha)
00150         canvas.create_line(x1,y1,x2,y2, fill = 'blue')
00151         # accumulated GC skew
00152         r1 = r - 50
00153         r2 = r1 - acc
00154         x1 = X0 + r1 * sin(alpha)
00155         y1 = Y0 + r1 * cos(alpha)
00156         x2 = X0 + r2 * sin(alpha)
00157         y2 = Y0 + r2 * cos(alpha)
00158         canvas.create_line(x1,y1,x2,y2, fill = 'magenta')
00159 
00160         canvas.update()
00161         start += window
00162 
00163     canvas.configure(scrollregion = canvas.bbox(ALL))
00164 
00165 def molecular_weight(seq):
00166     """Calculate the molecular weight of a DNA sequence."""
00167     if type(seq) == type(''): seq = Seq(seq, IUPAC.unambiguous_dna)
00168     weight_table = IUPACData.unambiguous_dna_weights
00169     return sum(weight_table[x] for x in seq)
00170 
00171 def nt_search(seq, subseq):
00172     """Search for a DNA subseq in sequence.
00173 
00174     use ambiguous values (like N = A or T or C or G, R = A or G etc.)
00175     searches only on forward strand
00176     """
00177     pattern = ''
00178     for nt in subseq:
00179         value = IUPACData.ambiguous_dna_values[nt]
00180         if len(value) == 1:
00181             pattern += value
00182         else:
00183             pattern += '[%s]' % value
00184 
00185     pos = -1
00186     result = [pattern]
00187     l = len(seq)
00188     while True:
00189         pos+=1
00190         s = seq[pos:]
00191         m = re.search(pattern, s)
00192         if not m: break
00193         pos += int(m.start(0))
00194         result.append(pos)
00195     return result
00196 
00197 # }}}
00198    
00199 ######################################
00200 # Protein
00201 ######################
00202 # {{{ 
00203 
00204 
00205 def seq3(seq):
00206     """Turn a one letter code protein sequence into one with three letter codes.
00207 
00208     The single input argument 'seq' should be a protein sequence using single
00209     letter codes, either as a python string or as a Seq or MutableSeq object.
00210 
00211     This function returns the amino acid sequence as a string using the three
00212     letter amino acid codes. Output follows the IUPAC standard (including
00213     ambiguous characters B for "Asx", J for "Xle" and X for "Xaa", and also U
00214     for "Sel" and O for "Pyl") plus "Ter" for a terminator given as an asterisk.
00215     Any unknown character (including possible gap characters), is changed into
00216     'Xaa'.
00217 
00218     e.g.
00219     >>> from Bio.SeqUtils import seq3
00220     >>> seq3("MAIVMGRWKGAR*")
00221     'MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer'
00222 
00223     This function was inspired by BioPerl's seq3.
00224     """
00225     threecode = {'A':'Ala', 'B':'Asx', 'C':'Cys', 'D':'Asp',
00226                  'E':'Glu', 'F':'Phe', 'G':'Gly', 'H':'His',
00227                  'I':'Ile', 'K':'Lys', 'L':'Leu', 'M':'Met',
00228                  'N':'Asn', 'P':'Pro', 'Q':'Gln', 'R':'Arg',
00229                  'S':'Ser', 'T':'Thr', 'V':'Val', 'W':'Trp',
00230                  'Y':'Tyr', 'Z':'Glx', 'X':'Xaa', '*':'Ter',
00231                  'U':'Sel', 'O':'Pyl', 'J':'Xle',
00232                  }
00233     #We use a default of 'Xaa' for undefined letters
00234     #Note this will map '-' to 'Xaa' which may be undesirable!
00235     return ''.join([threecode.get(aa,'Xaa') for aa in seq])
00236 
00237 
00238 # }}}
00239 
00240 ######################################
00241 # Mixed ??? 
00242 ######################
00243 # {{{ 
00244 
00245 
00246 def six_frame_translations(seq, genetic_code = 1):
00247     """Formatted string showing the 6 frame translations and GC content.
00248 
00249     nice looking 6 frame translation with GC content - code from xbbtools
00250     similar to DNA Striders six-frame translation
00251 
00252     e.g.
00253     from Bio.SeqUtils import six_frame_translations
00254     print six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA")
00255     """
00256     from Bio.Seq import reverse_complement, translate
00257     anti = reverse_complement(seq)
00258     comp = anti[::-1]
00259     length = len(seq)
00260     frames = {}
00261     for i in range(0,3):
00262         frames[i+1]  = translate(seq[i:], genetic_code)
00263         frames[-(i+1)] = reverse(translate(anti[i:], genetic_code))
00264 
00265     # create header
00266     if length > 20:
00267         short = '%s ... %s' % (seq[:10], seq[-10:])
00268     else:
00269         short = seq
00270     #TODO? Remove the date as this would spoil any unit test...
00271     date = time.strftime('%y %b %d, %X', time.localtime(time.time()))
00272     header = 'GC_Frame: %s, ' % date
00273     for nt in ['a','t','g','c']:
00274         header += '%s:%d ' % (nt, seq.count(nt.upper()))
00275       
00276     header += '\nSequence: %s, %d nt, %0.2f %%GC\n\n\n' % (short.lower(),length, GC(seq))       
00277     res = header
00278    
00279     for i in range(0,length,60):
00280         subseq = seq[i:i+60]
00281         csubseq = comp[i:i+60]
00282         p = i/3
00283         res = res + '%d/%d\n' % (i+1, i/3+1)
00284         res = res + '  ' + '  '.join(map(None,frames[3][p:p+20])) + '\n'
00285         res = res + ' ' + '  '.join(map(None,frames[2][p:p+20])) + '\n'
00286         res = res + '  '.join(map(None,frames[1][p:p+20])) + '\n'
00287         # seq
00288         res = res + subseq.lower() + '%5d %%\n' % int(GC(subseq))
00289         res = res + csubseq.lower() + '\n'
00290         # - frames
00291         res = res + '  '.join(map(None,frames[-2][p:p+20]))  +' \n'
00292         res = res + ' ' + '  '.join(map(None,frames[-1][p:p+20])) + '\n'
00293         res = res + '  ' + '  '.join(map(None,frames[-3][p:p+20])) + '\n\n'
00294     return res
00295 
00296 # }}}
00297 
00298 ######################################
00299 # FASTA file utilities
00300 ######################
00301 # {{{ 
00302 
00303 
00304 def quick_FASTA_reader(file):
00305     """Simple FASTA reader, returning a list of string tuples.
00306 
00307     The single argument 'file' should be the filename of a FASTA format file.
00308     This function will open and read in the entire file, constructing a list
00309     of all the records, each held as a tuple of strings (the sequence name or
00310     title, and its sequence).
00311 
00312     This function was originally intended for use on large files, where its
00313     low overhead makes it very fast.  However, because it returns the data as
00314     a single in memory list, this can require a lot of RAM on large files.
00315    
00316     You are generally encouraged to use Bio.SeqIO.parse(handle, "fasta") which
00317     allows you to iterate over the records one by one (avoiding having all the
00318     records in memory at once).  Using Bio.SeqIO also makes it easy to switch
00319     between different input file formats.  However, please note that rather
00320     than simple strings, Bio.SeqIO uses SeqRecord objects for each record.
00321     """
00322     #Want to split on "\n>" not just ">" in case there are any extra ">"
00323     #in the name/description.  So, in order to make sure we also split on
00324     #the first entry, prepend a "\n" to the start of the file.
00325     handle = open(file)
00326     txt = "\n" + handle.read()
00327     handle.close()
00328     entries = []
00329     for entry in txt.split('\n>')[1:]:
00330         name,seq= entry.split('\n',1)
00331         seq = seq.replace('\n','').replace(' ','').upper()
00332         entries.append((name, seq))
00333     return entries
00334 
00335 
00336 # }}}
00337 
00338 
00339 def _test():
00340     """Run the Bio.SeqUtils module's doctests (PRIVATE)."""
00341     print "Runing doctests..."
00342     import doctest
00343     doctest.testmod()
00344     print "Done"
00345 
00346 if __name__ == "__main__":
00347     _test()