Back to index

python-biopython  1.60
Namespaces | Functions
Bio.SeqUtils Namespace Reference

Namespaces

namespace  CheckSum
namespace  CodonUsage
namespace  CodonUsageIndices
namespace  IsoelectricPoint
namespace  lcc
namespace  MeltingTemp
namespace  ProtParam
namespace  ProtParamData

Functions

def GC
 DNA.
def GC123
def GC_skew
def xGC_skew
def molecular_weight
def nt_search
def seq3
 Protein.
def six_frame_translations
 Mixed ???
def quick_FASTA_reader
 FASTA file utilities.
def _test

Function Documentation

def Bio.SeqUtils._test ( ) [private]
Run the Bio.SeqUtils module's doctests (PRIVATE).

Definition at line 339 of file __init__.py.

00339 
00340 def _test():
00341     """Run the Bio.SeqUtils module's doctests (PRIVATE)."""
00342     print "Runing doctests..."
00343     import doctest
00344     doctest.testmod()
00345     print "Done"

def Bio.SeqUtils.GC (   seq)

DNA.

{{{

Calculates G+C content, returns the percentage (float between 0 and 100).

Copes with mixed case sequences, and with the ambiguous nucleotide S (G or C)
when counting the G and C content.  The percentage is calculated against
the full length, e.g.: 

>>> from Bio.SeqUtils import GC
>>> GC("ACTGN")
40.0

Note that this will return zero for an empty sequence.

Definition at line 26 of file __init__.py.

00026 
00027 def GC(seq):
00028     """Calculates G+C content, returns the percentage (float between 0 and 100).
00029 
00030     Copes mixed case sequences, and with the ambiguous nucleotide S (G or C)
00031     when counting the G and C content.  The percentage is calculated against
00032     the full length, e.g.: 
00033 
00034     >>> from Bio.SeqUtils import GC
00035     >>> GC("ACTGN")
00036     40.0
00037 
00038     Note that this will return zero for an empty sequence.
00039     """
00040     try:
00041         gc = sum(map(seq.count,['G','C','g','c','S','s']))
00042         return gc*100.0/len(seq)
00043     except ZeroDivisionError:
00044         return 0.0
00045         
    

Here is the caller graph for this function:

def Bio.SeqUtils.GC123 (   seq)
Calculates total G+C content plus first, second and third positions.

Returns a tuple of four floats (percentages between 0 and 100) for the
entire sequence, and the three codon positions.  e.g.

>>> from Bio.SeqUtils import GC123
>>> GC123("ACTGTN")
(40.0, 50.0, 50.0, 0.0)

Copes with mixed case sequences, but does NOT deal with ambiguous
nucleotides.

Definition at line 46 of file __init__.py.

00046 
00047 def GC123(seq):
00048     """Calculates total G+C content plus first, second and third positions.
00049 
00050     Returns a tuple of four floats (percentages between 0 and 100) for the
00051     entire sequence, and the three codon positions.  e.g.
00052 
00053     >>> from Bio.SeqUtils import GC123
00054     >>> GC123("ACTGTN")
00055     (40.0, 50.0, 50.0, 0.0)
00056 
00057     Copes with mixed case sequences, but does NOT deal with ambiguous
00058     nucleotides.
00059     """
00060     d= {}
00061     for nt in ['A','T','G','C']:
00062        d[nt] = [0,0,0]
00063 
00064     for i in range(0,len(seq),3):
00065         codon = seq[i:i+3]
00066         if len(codon) <3: codon += '  '
00067         for pos in range(0,3):
00068             for nt in ['A','T','G','C']:
00069                 if codon[pos] == nt or codon[pos] == nt.lower():
00070                     d[nt][pos] += 1
00071     gc = {}
00072     gcall = 0
00073     nall = 0
00074     for i in range(0,3):
00075         try:
00076             n = d['G'][i] + d['C'][i] +d['T'][i] + d['A'][i]
00077             gc[i] = (d['G'][i] + d['C'][i])*100.0/n
00078         except:
00079             gc[i] = 0
00080 
00081         gcall = gcall + d['G'][i] + d['C'][i]
00082         nall = nall + n
00083 
00084     gcall = 100.0*gcall/nall
00085     return gcall, gc[0], gc[1], gc[2]

def Bio.SeqUtils.GC_skew (   seq,
  window = 100 
)
Calculates GC skew (G-C)/(G+C) for multiple windows along the sequence.

Returns a list of ratios (floats), controlled by the length of the sequence
and the size of the window.

Does NOT look at any ambiguous nucleotides.

Definition at line 86 of file __init__.py.

00086 
00087 def GC_skew(seq, window = 100):
00088     """Calculates GC skew (G-C)/(G+C) for multuple windows along the sequence.
00089 
00090     Returns a list of ratios (floats), controlled by the length of the sequence
00091     and the size of the window.
00092 
00093     Does NOT look at any ambiguous nucleotides.
00094     """
00095     # 8/19/03: Iddo: added lowercase 
00096     values = []
00097     for i in range(0, len(seq), window):
00098         s = seq[i: i + window]
00099         g = s.count('G') + s.count('g')
00100         c = s.count('C') + s.count('c')
00101         skew = (g-c)/float(g+c)
00102         values.append(skew)
00103     return values

Here is the call graph for this function:

Here is the caller graph for this function:

def Bio.SeqUtils.molecular_weight (   seq)
Calculate the molecular weight of a DNA sequence.

Definition at line 165 of file __init__.py.

00165 
00166 def molecular_weight(seq):
00167     """Calculate the molecular weight of a DNA sequence."""
00168     if type(seq) == type(''): seq = Seq(seq, IUPAC.unambiguous_dna)
00169     weight_table = IUPACData.unambiguous_dna_weights
00170     return sum(weight_table[x] for x in seq)

def Bio.SeqUtils.nt_search (   seq,
  subseq 
)
Search for a DNA subseq in sequence.

Uses ambiguous values (like N = A or T or C or G, R = A or G, etc.).
Searches only on the forward strand.

Definition at line 171 of file __init__.py.

00171 
00172 def nt_search(seq, subseq):
00173     """Search for a DNA subseq in sequence.
00174 
00175     use ambiguous values (like N = A or T or C or G, R = A or G etc.)
00176     searches only on forward strand
00177     """
00178     pattern = ''
00179     for nt in subseq:
00180         value = IUPACData.ambiguous_dna_values[nt]
00181         if len(value) == 1:
00182             pattern += value
00183         else:
00184             pattern += '[%s]' % value
00185 
00186     pos = -1
00187     result = [pattern]
00188     l = len(seq)
00189     while True:
00190         pos+=1
00191         s = seq[pos:]
00192         m = re.search(pattern, s)
00193         if not m: break
00194         pos += int(m.start(0))
00195         result.append(pos)
00196     return result
00197 
00198 # }}}
   

def Bio.SeqUtils.quick_FASTA_reader (   file)

FASTA file utilities.

{{{

Simple FASTA reader, returning a list of string tuples.

The single argument 'file' should be the filename of a FASTA format file.
This function will open and read in the entire file, constructing a list
of all the records, each held as a tuple of strings (the sequence name or
title, and its sequence).

This function was originally intended for use on large files, where its
low overhead makes it very fast.  However, because it returns the data as
a single in-memory list, this can require a lot of RAM on large files.
   
You are generally encouraged to use Bio.SeqIO.parse(handle, "fasta") which
allows you to iterate over the records one by one (avoiding having all the
records in memory at once).  Using Bio.SeqIO also makes it easy to switch
between different input file formats.  However, please note that rather
than simple strings, Bio.SeqIO uses SeqRecord objects for each record.

Definition at line 304 of file __init__.py.

00304 
00305 def quick_FASTA_reader(file):
00306     """Simple FASTA reader, returning a list of string tuples.
00307 
00308     The single argument 'file' should be the filename of a FASTA format file.
00309     This function will open and read in the entire file, constructing a list
00310     of all the records, each held as a tuple of strings (the sequence name or
00311     title, and its sequence).
00312 
00313     This function was originally intended for use on large files, where its
00314     low overhead makes it very fast.  However, because it returns the data as
00315     a single in memory list, this can require a lot of RAM on large files.
00316    
00317     You are generally encouraged to use Bio.SeqIO.parse(handle, "fasta") which
00318     allows you to iterate over the records one by one (avoiding having all the
00319     records in memory at once).  Using Bio.SeqIO also makes it easy to switch
00320     between different input file formats.  However, please note that rather
00321     than simple strings, Bio.SeqIO uses SeqRecord objects for each record.
00322     """
00323     #Want to split on "\n>" not just ">" in case there are any extra ">"
00324     #in the name/description.  So, in order to make sure we also split on
00325     #the first entry, prepend a "\n" to the start of the file.
00326     handle = open(file)
00327     txt = "\n" + handle.read()
00328     handle.close()
00329     entries = []
00330     for entry in txt.split('\n>')[1:]:
00331         name,seq= entry.split('\n',1)
00332         seq = seq.replace('\n','').replace(' ','').upper()
00333         entries.append((name, seq))
00334     return entries
00335 
00336 
00337 # }}}
00338 

Here is the call graph for this function:

Here is the caller graph for this function:

def Bio.SeqUtils.seq3 (   seq)

Protein.

{{{

Turn a one letter code protein sequence into one with three letter codes.

The single input argument 'seq' should be a protein sequence using single
letter codes, either as a python string or as a Seq or MutableSeq object.

This function returns the amino acid sequence as a string using the three
letter amino acid codes. Output follows the IUPAC standard (including
ambiguous characters B for "Asx", J for "Xle" and X for "Xaa", and also U
for "Sel" and O for "Pyl") plus "Ter" for a terminator given as an asterisk.
Any unknown character (including possible gap characters) is changed into
'Xaa'.

e.g.
>>> from Bio.SeqUtils import seq3
>>> seq3("MAIVMGRWKGAR*")
'MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer'

This function was inspired by BioPerl's seq3.

Definition at line 205 of file __init__.py.

00205 
00206 def seq3(seq):
00207     """Turn a one letter code protein sequence into one with three letter codes.
00208 
00209     The single input argument 'seq' should be a protein sequence using single
00210     letter codes, either as a python string or as a Seq or MutableSeq object.
00211 
00212     This function returns the amino acid sequence as a string using the three
00213     letter amino acid codes. Output follows the IUPAC standard (including
00214     ambiguous characters B for "Asx", J for "Xle" and X for "Xaa", and also U
00215     for "Sel" and O for "Pyl") plus "Ter" for a terminator given as an asterisk.
00216     Any unknown character (including possible gap characters), is changed into
00217     'Xaa'.
00218 
00219     e.g.
00220     >>> from Bio.SeqUtils import seq3
00221     >>> seq3("MAIVMGRWKGAR*")
00222     'MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer'
00223 
00224     This function was inspired by BioPerl's seq3.
00225     """
00226     threecode = {'A':'Ala', 'B':'Asx', 'C':'Cys', 'D':'Asp',
00227                  'E':'Glu', 'F':'Phe', 'G':'Gly', 'H':'His',
00228                  'I':'Ile', 'K':'Lys', 'L':'Leu', 'M':'Met',
00229                  'N':'Asn', 'P':'Pro', 'Q':'Gln', 'R':'Arg',
00230                  'S':'Ser', 'T':'Thr', 'V':'Val', 'W':'Trp',
00231                  'Y':'Tyr', 'Z':'Glx', 'X':'Xaa', '*':'Ter',
00232                  'U':'Sel', 'O':'Pyl', 'J':'Xle',
00233                  }
00234     #We use a default of 'Xaa' for undefined letters
00235     #Note this will map '-' to 'Xaa' which may be undesirable!
00236     return ''.join([threecode.get(aa,'Xaa') for aa in seq])
00237 
00238 
00239 # }}}

def Bio.SeqUtils.six_frame_translations (   seq,
  genetic_code = 1 
)

Mixed ???

{{{

Formatted string showing the 6 frame translations and GC content.

Nice-looking six-frame translation with GC content (code from xbbtools),
similar to DNA Strider's six-frame translation.

e.g.
from Bio.SeqUtils import six_frame_translations
print six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA")

Definition at line 246 of file __init__.py.

00246 
00247 def six_frame_translations(seq, genetic_code = 1):
00248     """Formatted string showing the 6 frame translations and GC content.
00249 
00250     nice looking 6 frame translation with GC content - code from xbbtools
00251     similar to DNA Striders six-frame translation
00252 
00253     e.g.
00254     from Bio.SeqUtils import six_frame_translations
00255     print six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA")
00256     """
00257     from Bio.Seq import reverse_complement, translate
00258     anti = reverse_complement(seq)
00259     comp = anti[::-1]
00260     length = len(seq)
00261     frames = {}
00262     for i in range(0,3):
00263         frames[i+1]  = translate(seq[i:], genetic_code)
00264         frames[-(i+1)] = reverse(translate(anti[i:], genetic_code))
00265 
00266     # create header
00267     if length > 20:
00268         short = '%s ... %s' % (seq[:10], seq[-10:])
00269     else:
00270         short = seq
00271     #TODO? Remove the date as this would spoil any unit test...
00272     date = time.strftime('%y %b %d, %X', time.localtime(time.time()))
00273     header = 'GC_Frame: %s, ' % date
00274     for nt in ['a','t','g','c']:
00275         header += '%s:%d ' % (nt, seq.count(nt.upper()))
00276       
00277     header += '\nSequence: %s, %d nt, %0.2f %%GC\n\n\n' % (short.lower(),length, GC(seq))       
00278     res = header
00279    
00280     for i in range(0,length,60):
00281         subseq = seq[i:i+60]
00282         csubseq = comp[i:i+60]
00283         p = i/3
00284         res = res + '%d/%d\n' % (i+1, i/3+1)
00285         res = res + '  ' + '  '.join(map(None,frames[3][p:p+20])) + '\n'
00286         res = res + ' ' + '  '.join(map(None,frames[2][p:p+20])) + '\n'
00287         res = res + '  '.join(map(None,frames[1][p:p+20])) + '\n'
00288         # seq
00289         res = res + subseq.lower() + '%5d %%\n' % int(GC(subseq))
00290         res = res + csubseq.lower() + '\n'
00291         # - frames
00292         res = res + '  '.join(map(None,frames[-2][p:p+20]))  +' \n'
00293         res = res + ' ' + '  '.join(map(None,frames[-1][p:p+20])) + '\n'
00294         res = res + '  ' + '  '.join(map(None,frames[-3][p:p+20])) + '\n\n'
00295     return res
00296 
00297 # }}}

Here is the call graph for this function:

def Bio.SeqUtils.xGC_skew (   seq,
  window = 1000,
  zoom = 100,
  r = 300,
  px = 100,
  py = 100 
)
Calculates and plots normal and accumulated GC skew (GRAPHICS !!!).

Definition at line 106 of file __init__.py.

00106 
00107                          r = 300, px = 100, py = 100):
00108     """Calculates and plots normal and accumulated GC skew (GRAPHICS !!!)."""
00109     from Tkinter import Scrollbar, Canvas, BOTTOM, BOTH, ALL, \
00110                         VERTICAL, HORIZONTAL, RIGHT, LEFT, X, Y
00111     yscroll = Scrollbar(orient = VERTICAL)
00112     xscroll = Scrollbar(orient = HORIZONTAL)
00113     canvas = Canvas(yscrollcommand = yscroll.set,
00114                     xscrollcommand = xscroll.set, background = 'white')
00115     win = canvas.winfo_toplevel()
00116     win.geometry('700x700')
00117    
00118     yscroll.config(command = canvas.yview)
00119     xscroll.config(command = canvas.xview)
00120     yscroll.pack(side = RIGHT, fill = Y)
00121     xscroll.pack(side = BOTTOM, fill = X)
00122     canvas.pack(fill=BOTH, side = LEFT, expand = 1)
00123     canvas.update()
00124 
00125     X0, Y0  = r + px, r + py
00126     x1, x2, y1, y2 = X0 - r, X0 + r, Y0 -r, Y0 + r
00127    
00128     ty = Y0
00129     canvas.create_text(X0, ty, text = '%s...%s (%d nt)' % (seq[:7], seq[-7:], len(seq)))
00130     ty +=20
00131     canvas.create_text(X0, ty, text = 'GC %3.2f%%' % (GC(seq)))
00132     ty +=20
00133     canvas.create_text(X0, ty, text = 'GC Skew', fill = 'blue')
00134     ty +=20
00135     canvas.create_text(X0, ty, text = 'Accumulated GC Skew', fill = 'magenta')
00136     ty +=20
00137     canvas.create_oval(x1,y1, x2, y2)
00138 
00139     acc = 0
00140     start = 0
00141     for gc in GC_skew(seq, window):
00142         r1 = r
00143         acc+=gc
00144         # GC skew
00145         alpha = pi - (2*pi*start)/len(seq)
00146         r2 = r1 - gc*zoom
00147         x1 = X0 + r1 * sin(alpha)
00148         y1 = Y0 + r1 * cos(alpha)
00149         x2 = X0 + r2 * sin(alpha)
00150         y2 = Y0 + r2 * cos(alpha)
00151         canvas.create_line(x1,y1,x2,y2, fill = 'blue')
00152         # accumulated GC skew
00153         r1 = r - 50
00154         r2 = r1 - acc
00155         x1 = X0 + r1 * sin(alpha)
00156         y1 = Y0 + r1 * cos(alpha)
00157         x2 = X0 + r2 * sin(alpha)
00158         y2 = Y0 + r2 * cos(alpha)
00159         canvas.create_line(x1,y1,x2,y2, fill = 'magenta')
00160 
00161         canvas.update()
00162         start += window
00163 
00164     canvas.configure(scrollregion = canvas.bbox(ALL))

Here is the call graph for this function:

Here is the caller graph for this function: