Back to index

python-biopython  1.60
dnal.py
Go to the documentation of this file.
00001 #!/usr/bin/env python
00002 # This code is part of the Biopython distribution and governed by its
00003 # license.  Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 #
00006 # Bio.Wise contains modules for running and processing the output of
00007 # some of the models in the Wise2 package by Ewan Birney available from:
00008 # ftp://ftp.ebi.ac.uk/pub/software/unix/wise2/
00009 # http://www.ebi.ac.uk/Wise2/
00010 # 
00011 # Bio.Wise.psw is for protein Smith-Waterman alignments
00012 # Bio.Wise.dnal is for Smith-Waterman DNA alignments
00013 
00014 __version__ = "$Revision: 1.12 $"
00015 
00016 import commands
00017 import itertools
00018 import os
00019 import re
00020 
00021 from Bio import Wise
00022 
00023 _SCORE_MATCH = 4
00024 _SCORE_MISMATCH = -1
00025 _SCORE_GAP_START = -5
00026 _SCORE_GAP_EXTENSION = -1
00027 
00028 _CMDLINE_DNAL = ["dnal", "-alb", "-nopretty"]
00029 
00030 def _build_dnal_cmdline(match, mismatch, gap, extension):
00031     res = _CMDLINE_DNAL[:]
00032     res.extend(["-match", str(match)])
00033     res.extend(["-mis", str(mismatch)])
00034     res.extend(["-gap", str(-gap)]) # negative: convert score to penalty
00035     res.extend(["-ext", str(-extension)])  # negative: convert score to penalty
00036 
00037     return res
00038 
00039 _CMDLINE_FGREP_COUNT = "fgrep -c '%s' %s"
00040 def _fgrep_count(pattern, file):
00041     return int(commands.getoutput(_CMDLINE_FGREP_COUNT % (pattern, file)))
00042 
00043 _re_alb_line2coords = re.compile(r"^\[([^:]+):[^\[]+\[([^:]+):")
00044 def _alb_line2coords(line):
00045     return tuple([int(coord)+1 # one-based -> zero-based
00046                   for coord
00047                   in _re_alb_line2coords.match(line).groups()])
00048 
00049 def _get_coords(filename):
00050     alb = file(filename)
00051 
00052     start_line = None
00053     end_line = None
00054 
00055     for line in alb:
00056         if line.startswith("["):
00057             if not start_line:
00058                 start_line = line # rstrip not needed
00059             else:
00060                 end_line = line
00061 
00062     if end_line is None: # sequence is too short
00063         return [(0, 0), (0, 0)]
00064         
00065     return zip(*map(_alb_line2coords, [start_line, end_line])) # returns [(start0, end0), (start1, end1)]
00066 
00067 def _any(seq, pred=bool):
00068     "Returns True if pred(x) is True at least one element in the iterable"
00069     return True in itertools.imap(pred, seq)
00070 
00071 class Statistics(object):
00072     """
00073     Calculate statistics from an ALB report
00074     """
00075     def __init__(self, filename, match, mismatch, gap, extension):
00076         self.matches = _fgrep_count('"SEQUENCE" %s' % match, filename)
00077         self.mismatches = _fgrep_count('"SEQUENCE" %s' % mismatch, filename)
00078         self.gaps = _fgrep_count('"INSERT" %s' % gap, filename)
00079 
00080         if gap == extension:
00081             self.extensions = 0
00082         else:
00083             self.extensions = _fgrep_count('"INSERT" %s' % extension, filename)
00084             
00085         self.score = (match*self.matches +
00086                       mismatch*self.mismatches +
00087                       gap*self.gaps +
00088                       extension*self.extensions)
00089 
00090         if _any([self.matches, self.mismatches, self.gaps, self.extensions]):
00091             self.coords = _get_coords(filename)
00092         else:
00093             self.coords = [(0, 0), (0,0)]
00094 
00095     def identity_fraction(self):
00096         return self.matches/(self.matches+self.mismatches)
00097 
00098     header = "identity_fraction\tmatches\tmismatches\tgaps\textensions"
00099 
00100     def __str__(self):
00101         return "\t".join([str(x) for x in (self.identity_fraction(), self.matches, self.mismatches, self.gaps, self.extensions)])
00102 
00103 def align(pair, match=_SCORE_MATCH, mismatch=_SCORE_MISMATCH, gap=_SCORE_GAP_START, extension=_SCORE_GAP_EXTENSION, **keywds):
00104     cmdline = _build_dnal_cmdline(match, mismatch, gap, extension)
00105     temp_file = Wise.align(cmdline, pair, **keywds)
00106     try:
00107         return Statistics(temp_file.name, match, mismatch, gap, extension)
00108     except AttributeError:
00109         try:
00110             keywds['dry_run']
00111             return None
00112         except KeyError:
00113             raise
00114 
00115 def main():
00116     import sys
00117     stats = align(sys.argv[1:3])
00118     print "\n".join(["%s: %s" % (attr, getattr(stats, attr))
00119                      for attr in
00120                      ("matches", "mismatches", "gaps", "extensions")])
00121     print "identity_fraction: %s" % stats.identity_fraction()
00122     print "coords: %s" % stats.coords
00123 
00124 def _test(*args, **keywds):
00125     import doctest, sys
00126     doctest.testmod(sys.modules[__name__], *args, **keywds)
00127 
00128 if __name__ == "__main__":
00129     if __debug__:
00130         _test()
00131     main()