Back to index

python-biopython  1.60
pairwise2.py
Go to the documentation of this file.
00001 # Copyright 2002 by Jeffrey Chang.  All rights reserved.
00002 # This code is part of the Biopython distribution and governed by its
00003 # license.  Please see the LICENSE file that should have been included
00004 # as part of this package.
00005 
00006 """This package implements pairwise sequence alignment using a dynamic
00007 programming algorithm.
00008 
00009 This provides functions to get global and local alignments between two
00010 sequences.  A global alignment finds the best concordance between all
00011 characters in two sequences.  A local alignment finds just the
00012 subsequences that align the best.
00013 
00014 When doing alignments, you can specify the match score and gap
00015 penalties.  The match score indicates the compatibility between an
00016 alignment of two characters in the sequences.  Highly compatible
00017 characters should be given positive scores, and incompatible ones
00018 should be given negative scores or 0.  The gap penalties should be
00019 negative.
00020 
00021 The names of the alignment functions in this module follow the
00022 convention
00023 <alignment type>XX
00024 where <alignment type> is either "global" or "local" and XX is a 2
00025 character code indicating the parameters it takes.  The first
00026 character indicates the parameters for matches (and mismatches), and
00027 the second indicates the parameters for gap penalties.
00028 
00029 The match parameters are
00030 CODE  DESCRIPTION
00031 x     No parameters.  Identical characters have score of 1, otherwise 0.
00032 m     A match score is the score of identical chars, otherwise mismatch score.
00033 d     A dictionary returns the score of any pair of characters.
00034 c     A callback function returns scores.
00035 
00036 The gap penalty parameters are
00037 CODE  DESCRIPTION
00038 x     No gap penalties.
00039 s     Same open and extend gap penalties for both sequences.
00040 d     The sequences have different open and extend gap penalties.
00041 c     A callback function returns the gap penalties.
00042 
00043 All the different alignment functions are contained in an object
00044 "align".  For example:
00045 
00046     >>> from Bio import pairwise2
00047     >>> alignments = pairwise2.align.globalxx("ACCGT", "ACG")
00048 
00049 will return a list of the alignments between the two strings.  The
00050 parameters of the alignment function depends on the function called.
00051 Some examples:
00052 
00053     # Find the best global alignment between the two sequences.
00054     # Identical characters are given 1 point.  No points are deducted
00055     # for mismatches or gaps.
00056     >>> for a in pairwise2.align.globalxx("ACCGT", "ACG"):
00057     ...     print format_alignment(*a)
00058     ACCGT
00059     |||||
00060     AC-G-
00061       Score=3
00062     <BLANKLINE>
00063     ACCGT
00064     |||||
00065     A-CG-
00066       Score=3
00067     <BLANKLINE>
00068 
00069     
00070     # Same thing as before, but with a local alignment.
00071     >>> for a in pairwise2.align.localxx("ACCGT", "ACG"):
00072     ...     print format_alignment(*a)
00073     ACCGT
00074     ||||
00075     AC-G-
00076       Score=3
00077     <BLANKLINE>
00078     ACCGT
00079     ||||
00080     A-CG-
00081       Score=3
00082     <BLANKLINE>
00083     
00084     # Do a global alignment.  Identical characters are given 2 points,
00085     # 1 point is deducted for each non-identical character.
00086     >>> for a in pairwise2.align.globalmx("ACCGT", "ACG", 2, -1):
00087     ...     print format_alignment(*a)
00088     ACCGT
00089     |||||
00090     AC-G-
00091       Score=6
00092     <BLANKLINE>
00093     ACCGT
00094     |||||
00095     A-CG-
00096       Score=6
00097     <BLANKLINE>
00098 
00099     # Same as above, except now 0.5 points are deducted when opening a
00100     # gap, and 0.1 points are deducted when extending it.
00101     >>> for a in pairwise2.align.globalms("ACCGT", "ACG", 2, -1, -.5, -.1):
00102     ...     print format_alignment(*a)
00103     ACCGT
00104     |||||
00105     AC-G-
00106       Score=5
00107     <BLANKLINE>
00108     ACCGT
00109     |||||
00110     A-CG-
00111       Score=5
00112     <BLANKLINE>
00113 
00114 The alignment function can also use known matrices already included in 
00115 Biopython ( Bio.SubsMat -> MatrixInfo ).
00116 
00117     >>> from Bio.SubsMat import MatrixInfo as matlist
00118     >>> matrix = matlist.blosum62
00119     >>> for a in pairwise2.align.globaldx("KEVLA", "EVL", matrix):
00120     ...     print format_alignment(*a)
00121     KEVLA
00122     |||||
00123     -EVL-
00124       Score=13
00125     <BLANKLINE>
00126 
00127 To see a description of the parameters for a function, please look at
00128 the docstring for the function via the help function, e.g.
00129 type help(pairwise2.align.localds) at the Python prompt.
00130 """
00131 # The alignment functions take some undocumented keyword parameters:
00132 # - penalize_extend_when_opening: boolean
00133 #   Whether to count an extension penalty when opening a gap.  If
00134 #   false, a gap of 1 is only penalize an "open" penalty, otherwise it
00135 #   is penalized "open+extend".
00136 # - penalize_end_gaps: boolean
00137 #   Whether to count the gaps at the ends of an alignment.  By
00138 #   default, they are counted for global alignments but not for local
00139 #   ones.
00140 # - gap_char: string
00141 #   Which character to use as a gap character in the alignment
00142 #   returned.  By default, uses '-'.
00143 # - force_generic: boolean
00144 #   Always use the generic, non-cached, dynamic programming function.
00145 #   For debugging.
00146 # - score_only: boolean
00147 #   Only get the best score, don't recover any alignments.  The return
00148 #   value of the function is the score.
00149 # - one_alignment_only: boolean
00150 #   Only recover one alignment.
00151 
00152 MAX_ALIGNMENTS = 1000   # maximum alignments recovered in traceback
00153 
00154 class align(object):
00155     """This class provides functions that do alignments."""
00156     
00157     class alignment_function:
00158         """This class is callable impersonates an alignment function.
00159         The constructor takes the name of the function.  This class
00160         will decode the name of the function to figure out how to
00161         interpret the parameters.
00162 
00163         """
00164         # match code -> tuple of (parameters, docstring)
00165         match2args = {
00166             'x' : ([], ''),
00167             'm' : (['match', 'mismatch'],
00168 """match is the score to given to identical characters.  mismatch is
00169 the score given to non-identical ones."""),
00170             'd' : (['match_dict'],
00171 """match_dict is a dictionary where the keys are tuples of pairs of
00172 characters and the values are the scores, e.g. ("A", "C") : 2.5."""),
00173             'c' : (['match_fn'],
00174 """match_fn is a callback function that takes two characters and
00175 returns the score between them."""),
00176             }
00177         # penalty code -> tuple of (parameters, docstring)
00178         penalty2args = {
00179             'x' : ([], ''),
00180             's' : (['open', 'extend'],
00181 """open and extend are the gap penalties when a gap is opened and
00182 extended.  They should be negative."""),
00183             'd' : (['openA', 'extendA', 'openB', 'extendB'],
00184 """openA and extendA are the gap penalties for sequenceA, and openB
00185 and extendB for sequeneB.  The penalties should be negative."""),
00186             'c' : (['gap_A_fn', 'gap_B_fn'],
00187 """gap_A_fn and gap_B_fn are callback functions that takes 1) the
00188 index where the gap is opened, and 2) the length of the gap.  They
00189 should return a gap penalty."""),
00190             }
00191 
00192         def __init__(self, name):
00193             # Check to make sure the name of the function is
00194             # reasonable.
00195             if name.startswith("global"):
00196                 if len(name) != 8:
00197                     raise AttributeError("function should be globalXX")
00198             elif name.startswith("local"):
00199                 if len(name) != 7:
00200                     raise AttributeError("function should be localXX")
00201             else:
00202                 raise AttributeError(name)
00203             align_type, match_type, penalty_type = \
00204                         name[:-2], name[-2], name[-1]
00205             try:
00206                 match_args, match_doc = self.match2args[match_type]
00207             except KeyError, x:
00208                 raise AttributeError("unknown match type %r" % match_type)
00209             try:
00210                 penalty_args, penalty_doc = self.penalty2args[penalty_type]
00211             except KeyError, x:
00212                 raise AttributeError("unknown penalty type %r" % penalty_type)
00213 
00214             # Now get the names of the parameters to this function.
00215             param_names = ['sequenceA', 'sequenceB']
00216             param_names.extend(match_args)
00217             param_names.extend(penalty_args)
00218             self.function_name = name
00219             self.align_type = align_type
00220             self.param_names = param_names
00221 
00222             self.__name__ = self.function_name
00223             # Set the doc string.
00224             doc = "%s(%s) -> alignments\n" % (
00225                 self.__name__, ', '.join(self.param_names))
00226             if match_doc:
00227                 doc += "\n%s\n" % match_doc
00228             if penalty_doc:
00229                 doc += "\n%s\n" % penalty_doc
00230             doc += (
00231 """\nalignments is a list of tuples (seqA, seqB, score, begin, end).
00232 seqA and seqB are strings showing the alignment between the
00233 sequences.  score is the score of the alignment.  begin and end
00234 are indexes into seqA and seqB that indicate the where the
00235 alignment occurs.
00236 """)
00237             self.__doc__ = doc
00238 
00239         def decode(self, *args, **keywds):
00240             # Decode the arguments for the _align function.  keywds
00241             # will get passed to it, so translate the arguments to
00242             # this function into forms appropriate for _align.
00243             keywds = keywds.copy()
00244             if len(args) != len(self.param_names):
00245                 raise TypeError("%s takes exactly %d argument (%d given)" \
00246                     % (self.function_name, len(self.param_names), len(args)))
00247             i = 0
00248             while i < len(self.param_names):
00249                 if self.param_names[i] in [
00250                     'sequenceA', 'sequenceB',
00251                     'gap_A_fn', 'gap_B_fn', 'match_fn']:
00252                     keywds[self.param_names[i]] = args[i]
00253                     i += 1
00254                 elif self.param_names[i] == 'match':
00255                     assert self.param_names[i+1] == 'mismatch'
00256                     match, mismatch = args[i], args[i+1]
00257                     keywds['match_fn'] = identity_match(match, mismatch)
00258                     i += 2
00259                 elif self.param_names[i] == 'match_dict':
00260                     keywds['match_fn'] = dictionary_match(args[i])
00261                     i += 1
00262                 elif self.param_names[i] == 'open':
00263                     assert self.param_names[i+1] == 'extend'
00264                     open, extend = args[i], args[i+1]
00265                     pe = keywds.get('penalize_extend_when_opening', 0)
00266                     keywds['gap_A_fn'] = affine_penalty(open, extend, pe)
00267                     keywds['gap_B_fn'] = affine_penalty(open, extend, pe)
00268                     i += 2
00269                 elif self.param_names[i] == 'openA':
00270                     assert self.param_names[i+3] == 'extendB'
00271                     openA, extendA, openB, extendB = args[i:i+4]
00272                     pe = keywds.get('penalize_extend_when_opening', 0)
00273                     keywds['gap_A_fn'] = affine_penalty(openA, extendA, pe)
00274                     keywds['gap_B_fn'] = affine_penalty(openB, extendB, pe)
00275                     i += 4
00276                 else:
00277                     raise ValueError("unknown parameter %r" \
00278                                      % self.param_names[i])
00279 
00280             # Here are the default parameters for _align.  Assign
00281             # these to keywds, unless already specified.
00282             pe = keywds.get('penalize_extend_when_opening', 0)
00283             default_params = [
00284                 ('match_fn', identity_match(1, 0)),
00285                 ('gap_A_fn', affine_penalty(0, 0, pe)),
00286                 ('gap_B_fn', affine_penalty(0, 0, pe)),
00287                 ('penalize_extend_when_opening', 0),
00288                 ('penalize_end_gaps', self.align_type == 'global'),
00289                 ('align_globally', self.align_type == 'global'),
00290                 ('gap_char', '-'),
00291                 ('force_generic', 0),
00292                 ('score_only', 0),
00293                 ('one_alignment_only', 0)
00294                 ]
00295             for name, default in default_params:
00296                 keywds[name] = keywds.get(name, default)
00297             return keywds
00298             
00299         def __call__(self, *args, **keywds):
00300             keywds = self.decode(*args, **keywds)
00301             return _align(**keywds)
00302         
00303     def __getattr__(self, attr):
00304         return self.alignment_function(attr)
00305 align = align()
00306 
00307 
00308 def _align(sequenceA, sequenceB, match_fn, gap_A_fn, gap_B_fn,
00309            penalize_extend_when_opening, penalize_end_gaps,
00310            align_globally, gap_char, force_generic, score_only,
00311            one_alignment_only):
00312     if not sequenceA or not sequenceB:
00313         return []
00314 
00315     if (not force_generic) and isinstance(gap_A_fn, affine_penalty) \
00316     and isinstance(gap_B_fn, affine_penalty):
00317         open_A, extend_A = gap_A_fn.open, gap_A_fn.extend
00318         open_B, extend_B = gap_B_fn.open, gap_B_fn.extend
00319         x = _make_score_matrix_fast(
00320             sequenceA, sequenceB, match_fn, open_A, extend_A, open_B, extend_B,
00321             penalize_extend_when_opening, penalize_end_gaps, align_globally,
00322             score_only)
00323     else:
00324         x = _make_score_matrix_generic(
00325             sequenceA, sequenceB, match_fn, gap_A_fn, gap_B_fn,
00326             penalize_extend_when_opening, penalize_end_gaps, align_globally,
00327             score_only)
00328     score_matrix, trace_matrix = x
00329 
00330     #print "SCORE"; print_matrix(score_matrix)
00331     #print "TRACEBACK"; print_matrix(trace_matrix)
00332          
00333     # Look for the proper starting point.  Get a list of all possible
00334     # starting points.
00335     starts = _find_start(
00336         score_matrix, sequenceA, sequenceB,
00337         gap_A_fn, gap_B_fn, penalize_end_gaps, align_globally)
00338     # Find the highest score.
00339     best_score = max([x[0] for x in starts])
00340 
00341     # If they only want the score, then return it.
00342     if score_only:
00343         return best_score
00344     
00345     tolerance = 0  # XXX do anything with this?
00346     # Now find all the positions within some tolerance of the best
00347     # score.
00348     i = 0
00349     while i < len(starts):
00350         score, pos = starts[i]
00351         if rint(abs(score-best_score)) > rint(tolerance):
00352             del starts[i]
00353         else:
00354             i += 1
00355     
00356     # Recover the alignments and return them.
00357     x = _recover_alignments(
00358         sequenceA, sequenceB, starts, score_matrix, trace_matrix,
00359         align_globally, penalize_end_gaps, gap_char, one_alignment_only)
00360     return x
00361 
00362 def _make_score_matrix_generic(
00363     sequenceA, sequenceB, match_fn, gap_A_fn, gap_B_fn, 
00364     penalize_extend_when_opening, penalize_end_gaps, align_globally,
00365     score_only):
00366     # This is an implementation of the Needleman-Wunsch dynamic
00367     # programming algorithm for aligning sequences.
00368     
00369     # Create the score and traceback matrices.  These should be in the
00370     # shape:
00371     # sequenceA (down) x sequenceB (across)
00372     lenA, lenB = len(sequenceA), len(sequenceB)
00373     score_matrix, trace_matrix = [], []
00374     for i in range(lenA):
00375         score_matrix.append([None] * lenB)
00376         trace_matrix.append([[None]] * lenB)
00377 
00378     # The top and left borders of the matrices are special cases
00379     # because there are no previously aligned characters.  To simplify
00380     # the main loop, handle these separately.
00381     for i in range(lenA):
00382         # Align the first residue in sequenceB to the ith residue in
00383         # sequence A.  This is like opening up i gaps at the beginning
00384         # of sequence B.
00385         score = match_fn(sequenceA[i], sequenceB[0])
00386         if penalize_end_gaps:
00387             score += gap_B_fn(0, i)
00388         score_matrix[i][0] = score
00389     for i in range(1, lenB):
00390         score = match_fn(sequenceA[0], sequenceB[i])
00391         if penalize_end_gaps:
00392             score += gap_A_fn(0, i)
00393         score_matrix[0][i] = score
00394 
00395     # Fill in the score matrix.  Each position in the matrix
00396     # represents an alignment between a character from sequenceA to
00397     # one in sequence B.  As I iterate through the matrix, find the
00398     # alignment by choose the best of:
00399     #    1) extending a previous alignment without gaps
00400     #    2) adding a gap in sequenceA
00401     #    3) adding a gap in sequenceB
00402     for row in range(1, lenA):
00403         for col in range(1, lenB):
00404             # First, calculate the score that would occur by extending
00405             # the alignment without gaps.
00406             best_score = score_matrix[row-1][col-1]
00407             best_score_rint = rint(best_score)
00408             best_indexes = [(row-1, col-1)]
00409 
00410             # Try to find a better score by opening gaps in sequenceA.
00411             # Do this by checking alignments from each column in the
00412             # previous row.  Each column represents a different
00413             # character to align from, and thus a different length
00414             # gap.
00415             for i in range(0, col-1):
00416                 score = score_matrix[row-1][i] + gap_A_fn(i, col-1-i)
00417                 score_rint = rint(score)
00418                 if score_rint == best_score_rint:
00419                     best_score, best_score_rint = score, score_rint
00420                     best_indexes.append((row-1, i))
00421                 elif score_rint > best_score_rint:
00422                     best_score, best_score_rint = score, score_rint
00423                     best_indexes = [(row-1, i)]
00424             
00425             # Try to find a better score by opening gaps in sequenceB.
00426             for i in range(0, row-1):
00427                 score = score_matrix[i][col-1] + gap_B_fn(i, row-1-i)
00428                 score_rint = rint(score)
00429                 if score_rint == best_score_rint:
00430                     best_score, best_score_rint = score, score_rint
00431                     best_indexes.append((i, col-1))
00432                 elif score_rint > best_score_rint:
00433                     best_score, best_score_rint = score, score_rint
00434                     best_indexes = [(i, col-1)]
00435 
00436             score_matrix[row][col] = best_score + \
00437                                      match_fn(sequenceA[row], sequenceB[col])
00438             if not align_globally and score_matrix[row][col] < 0:
00439                 score_matrix[row][col] = 0
00440             trace_matrix[row][col] = best_indexes
00441     return score_matrix, trace_matrix
00442 
00443 def _make_score_matrix_fast(
00444     sequenceA, sequenceB, match_fn, open_A, extend_A, open_B, extend_B,
00445     penalize_extend_when_opening, penalize_end_gaps,
00446     align_globally, score_only):
00447     first_A_gap = calc_affine_penalty(1, open_A, extend_A,
00448                                       penalize_extend_when_opening)
00449     first_B_gap = calc_affine_penalty(1, open_B, extend_B,
00450                                       penalize_extend_when_opening)
00451 
00452     # Create the score and traceback matrices.  These should be in the
00453     # shape:
00454     # sequenceA (down) x sequenceB (across)
00455     lenA, lenB = len(sequenceA), len(sequenceB)
00456     score_matrix, trace_matrix = [], []
00457     for i in range(lenA):
00458         score_matrix.append([None] * lenB)
00459         trace_matrix.append([[None]] * lenB)
00460 
00461     # The top and left borders of the matrices are special cases
00462     # because there are no previously aligned characters.  To simplify
00463     # the main loop, handle these separately.
00464     for i in range(lenA):
00465         # Align the first residue in sequenceB to the ith residue in
00466         # sequence A.  This is like opening up i gaps at the beginning
00467         # of sequence B.
00468         score = match_fn(sequenceA[i], sequenceB[0])
00469         if penalize_end_gaps:
00470             score += calc_affine_penalty(
00471                 i, open_B, extend_B, penalize_extend_when_opening)
00472         score_matrix[i][0] = score
00473     for i in range(1, lenB):
00474         score = match_fn(sequenceA[0], sequenceB[i])
00475         if penalize_end_gaps:
00476             score += calc_affine_penalty(
00477                 i, open_A, extend_A, penalize_extend_when_opening)
00478         score_matrix[0][i] = score
00479 
00480     # In the generic algorithm, at each row and column in the score
00481     # matrix, we had to scan all previous rows and columns to see
00482     # whether opening a gap might yield a higher score.  Here, since
00483     # we know the penalties are affine, we can cache just the best
00484     # score in the previous rows and columns.  Instead of scanning
00485     # through all the previous rows and cols, we can just look at the
00486     # cache for the best one.  Whenever the row or col increments, the
00487     # best cached score just decreases by extending the gap longer.
00488 
00489     # The best score and indexes for each row (goes down all columns).
00490     # I don't need to store the last row because it's the end of the
00491     # sequence.
00492     row_cache_score, row_cache_index = [None]*(lenA-1), [None]*(lenA-1)
00493     # The best score and indexes for each column (goes across rows).
00494     col_cache_score, col_cache_index = [None]*(lenB-1), [None]*(lenB-1)
00495 
00496     for i in range(lenA-1):
00497         # Initialize each row to be the alignment of sequenceA[i] to
00498         # sequenceB[0], plus opening a gap in sequenceA.
00499         row_cache_score[i] = score_matrix[i][0] + first_A_gap
00500         row_cache_index[i] = [(i, 0)]
00501     for i in range(lenB-1):
00502         col_cache_score[i] = score_matrix[0][i] + first_B_gap
00503         col_cache_index[i] = [(0, i)]
00504         
00505     # Fill in the score_matrix.
00506     for row in range(1, lenA):
00507         for col in range(1, lenB):
00508             # Calculate the score that would occur by extending the
00509             # alignment without gaps.
00510             nogap_score = score_matrix[row-1][col-1]
00511             
00512             # Check the score that would occur if there were a gap in
00513             # sequence A.
00514             if col > 1:
00515                 row_score = row_cache_score[row-1]
00516             else:
00517                 row_score = nogap_score - 1   # Make sure it's not the best.
00518             # Check the score that would occur if there were a gap in
00519             # sequence B.  
00520             if row > 1:
00521                 col_score = col_cache_score[col-1]
00522             else:
00523                 col_score = nogap_score - 1
00524 
00525             best_score = max(nogap_score, row_score, col_score)
00526             best_score_rint = rint(best_score)
00527             best_index = []
00528             if best_score_rint == rint(nogap_score):
00529                 best_index.append((row-1, col-1))
00530             if best_score_rint == rint(row_score):
00531                 best_index.extend(row_cache_index[row-1])
00532             if best_score_rint == rint(col_score):
00533                 best_index.extend(col_cache_index[col-1])
00534 
00535             # Set the score and traceback matrices.
00536             score = best_score + match_fn(sequenceA[row], sequenceB[col])
00537             if not align_globally and score < 0:
00538                 score_matrix[row][col] = 0
00539             else:
00540                 score_matrix[row][col] = score
00541             trace_matrix[row][col] = best_index
00542 
00543             # Update the cached column scores.  The best score for
00544             # this can come from either extending the gap in the
00545             # previous cached score, or opening a new gap from the
00546             # most previously seen character.  Compare the two scores
00547             # and keep the best one.
00548             open_score = score_matrix[row-1][col-1] + first_B_gap
00549             extend_score = col_cache_score[col-1] + extend_B
00550             open_score_rint, extend_score_rint = \
00551                              rint(open_score), rint(extend_score)
00552             if open_score_rint > extend_score_rint:
00553                 col_cache_score[col-1] = open_score
00554                 col_cache_index[col-1] = [(row-1, col-1)]
00555             elif extend_score_rint > open_score_rint:
00556                 col_cache_score[col-1] = extend_score
00557             else:
00558                 col_cache_score[col-1] = open_score
00559                 if (row-1, col-1) not in col_cache_index[col-1]:
00560                     col_cache_index[col-1] = col_cache_index[col-1] + \
00561                                              [(row-1, col-1)]
00562 
00563             # Update the cached row scores.
00564             open_score = score_matrix[row-1][col-1] + first_A_gap
00565             extend_score = row_cache_score[row-1] + extend_A
00566             open_score_rint, extend_score_rint = \
00567                              rint(open_score), rint(extend_score)
00568             if open_score_rint > extend_score_rint:
00569                 row_cache_score[row-1] = open_score
00570                 row_cache_index[row-1] = [(row-1, col-1)]
00571             elif extend_score_rint > open_score_rint:
00572                 row_cache_score[row-1] = extend_score
00573             else:
00574                 row_cache_score[row-1] = open_score
00575                 if (row-1, col-1) not in row_cache_index[row-1]:
00576                     row_cache_index[row-1] = row_cache_index[row-1] + \
00577                                              [(row-1, col-1)]
00578                     
00579     return score_matrix, trace_matrix
00580     
00581 def _recover_alignments(sequenceA, sequenceB, starts,
00582                         score_matrix, trace_matrix, align_globally,
00583                         penalize_end_gaps, gap_char, one_alignment_only):
00584     # Recover the alignments by following the traceback matrix.  This
00585     # is a recursive procedure, but it's implemented here iteratively
00586     # with a stack.
00587     lenA, lenB = len(sequenceA), len(sequenceB)
00588     tracebacks = [] # list of (seq1, seq2, score, begin, end)
00589     in_process = [] # list of ([same as tracebacks], prev_pos, next_pos)
00590 
00591     # sequenceA and sequenceB may be sequences, including strings,
00592     # lists, or list-like objects.  In order to preserve the type of
00593     # the object, we need to use slices on the sequences instead of
00594     # indexes.  For example, sequenceA[row] may return a type that's
00595     # not compatible with sequenceA, e.g. if sequenceA is a list and
00596     # sequenceA[row] is a string.  Thus, avoid using indexes and use
00597     # slices, e.g. sequenceA[row:row+1].  Assume that client-defined
00598     # sequence classes preserve these semantics.
00599 
00600     # Initialize the in_process stack
00601     for score, (row, col) in starts:
00602         if align_globally:
00603             begin, end = None, None
00604         else:
00605             begin, end = None, -max(lenA-row, lenB-col)+1
00606             if not end:
00607                 end = None
00608         # Initialize the in_process list with empty sequences of the
00609         # same type as sequenceA.  To do this, take empty slices of
00610         # the sequences.
00611         in_process.append(
00612             (sequenceA[0:0], sequenceB[0:0], score, begin, end,
00613              (lenA, lenB), (row, col)))
00614         if one_alignment_only:
00615             break
00616     while in_process and len(tracebacks) < MAX_ALIGNMENTS:
00617         seqA, seqB, score, begin, end, prev_pos, next_pos = in_process.pop()
00618         prevA, prevB = prev_pos
00619         if next_pos is None:
00620             prevlen = len(seqA)
00621             # add the rest of the sequences
00622             seqA = sequenceA[:prevA] + seqA
00623             seqB = sequenceB[:prevB] + seqB
00624             # add the rest of the gaps
00625             seqA, seqB = _lpad_until_equal(seqA, seqB, gap_char)
00626             
00627             # Now make sure begin is set.
00628             if begin is None:
00629                 if align_globally:
00630                     begin = 0
00631                 else:
00632                     begin = len(seqA) - prevlen
00633             tracebacks.append((seqA, seqB, score, begin, end))
00634         else:
00635             nextA, nextB = next_pos
00636             nseqA, nseqB = prevA-nextA, prevB-nextB
00637             maxseq = max(nseqA, nseqB)
00638             ngapA, ngapB = maxseq-nseqA, maxseq-nseqB
00639             seqA = sequenceA[nextA:nextA+nseqA] + gap_char*ngapA + seqA
00640             seqB = sequenceB[nextB:nextB+nseqB] + gap_char*ngapB + seqB
00641             prev_pos = next_pos
00642             # local alignment stops early if score falls < 0
00643             if not align_globally and score_matrix[nextA][nextB] <= 0:
00644                 begin = max(prevA, prevB)
00645                 in_process.append(
00646                     (seqA, seqB, score, begin, end, prev_pos, None))
00647             else:
00648                 for next_pos in trace_matrix[nextA][nextB]:
00649                     in_process.append(
00650                         (seqA, seqB, score, begin, end, prev_pos, next_pos))
00651                     if one_alignment_only:
00652                         break
00653                     
00654     return _clean_alignments(tracebacks)
00655 
00656 def _find_start(score_matrix, sequenceA, sequenceB, gap_A_fn, gap_B_fn,
00657                 penalize_end_gaps, align_globally):
00658     # Return a list of (score, (row, col)) indicating every possible
00659     # place to start the tracebacks.
00660     if align_globally:
00661         if penalize_end_gaps:
00662             starts = _find_global_start(
00663                 sequenceA, sequenceB, score_matrix, gap_A_fn, gap_B_fn, 1)
00664         else:
00665             starts = _find_global_start(
00666                 sequenceA, sequenceB, score_matrix, None, None, 0)
00667     else:
00668         starts = _find_local_start(score_matrix)
00669     return starts
00670 
00671 def _find_global_start(sequenceA, sequenceB,
00672                        score_matrix, gap_A_fn, gap_B_fn, penalize_end_gaps):
00673     # The whole sequence should be aligned, so return the positions at
00674     # the end of either one of the sequences.
00675     nrows, ncols = len(score_matrix), len(score_matrix[0])
00676     positions = []
00677     # Search all rows in the last column.
00678     for row in range(nrows):
00679         # Find the score, penalizing end gaps if necessary.
00680         score = score_matrix[row][ncols-1]
00681         if penalize_end_gaps:
00682             score += gap_B_fn(ncols, nrows-row-1)
00683         positions.append((score, (row, ncols-1)))
00684     # Search all columns in the last row.
00685     for col in range(ncols-1):
00686         score = score_matrix[nrows-1][col]
00687         if penalize_end_gaps:
00688             score += gap_A_fn(nrows, ncols-col-1)
00689         positions.append((score, (nrows-1, col)))
00690     return positions
00691 
00692 def _find_local_start(score_matrix):
00693     # Return every position in the matrix.
00694     positions = []
00695     nrows, ncols = len(score_matrix), len(score_matrix[0])
00696     for row in range(nrows):
00697         for col in range(ncols):
00698             score = score_matrix[row][col]
00699             positions.append((score, (row, col)))
00700     return positions
00701 
00702 def _clean_alignments(alignments):
00703     # Take a list of alignments and return a cleaned version.  Remove
00704     # duplicates, make sure begin and end are set correctly, remove
00705     # empty alignments.
00706     unique_alignments = []
00707     for align in alignments :
00708         if align not in unique_alignments :
00709             unique_alignments.append(align)
00710     i = 0
00711     while i < len(unique_alignments):
00712         seqA, seqB, score, begin, end = unique_alignments[i]
00713         # Make sure end is set reasonably.
00714         if end is None:   # global alignment
00715             end = len(seqA)
00716         elif end < 0:
00717             end = end + len(seqA)
00718         # If there's no alignment here, get rid of it.
00719         if begin >= end:
00720             del unique_alignments[i]
00721             continue
00722         unique_alignments[i] = seqA, seqB, score, begin, end
00723         i += 1
00724     return unique_alignments
00725 
00726 def _pad_until_equal(s1, s2, char):
00727     # Add char to the end of s1 or s2 until they are equal length.
00728     ls1, ls2 = len(s1), len(s2)
00729     if ls1 < ls2:
00730         s1 = _pad(s1, char, ls2-ls1)
00731     elif ls2 < ls1:
00732         s2 = _pad(s2, char, ls1-ls2)
00733     return s1, s2
00734 
00735 def _lpad_until_equal(s1, s2, char):
00736     # Add char to the beginning of s1 or s2 until they are equal
00737     # length.
00738     ls1, ls2 = len(s1), len(s2)
00739     if ls1 < ls2:
00740         s1 = _lpad(s1, char, ls2-ls1)
00741     elif ls2 < ls1:
00742         s2 = _lpad(s2, char, ls1-ls2)
00743     return s1, s2
00744 
00745 def _pad(s, char, n):
00746     # Append n chars to the end of s.
00747     return s + char*n
00748 
00749 def _lpad(s, char, n):
00750     # Prepend n chars to the beginning of s.
00751     return char*n + s
00752 
00753 _PRECISION = 1000
00754 def rint(x, precision=_PRECISION):
00755     return int(x * precision + 0.5)
00756 
00757 class identity_match:
00758     """identity_match([match][, mismatch]) -> match_fn
00759 
00760     Create a match function for use in an alignment.  match and
00761     mismatch are the scores to give when two residues are equal or
00762     unequal.  By default, match is 1 and mismatch is 0.
00763 
00764     """
00765     def __init__(self, match=1, mismatch=0):
00766         self.match = match
00767         self.mismatch = mismatch
00768     def __call__(self, charA, charB):
00769         if charA == charB:
00770             return self.match
00771         return self.mismatch
00772 
00773 class dictionary_match:
00774     """dictionary_match(score_dict[, symmetric]) -> match_fn
00775 
00776     Create a match function for use in an alignment.  score_dict is a
00777     dictionary where the keys are tuples (residue 1, residue 2) and
00778     the values are the match scores between those residues.  symmetric
00779     is a flag that indicates whether the scores are symmetric.  If
00780     true, then if (res 1, res 2) doesn't exist, I will use the score
00781     at (res 2, res 1).
00782 
00783     """
00784     def __init__(self, score_dict, symmetric=1):
00785         self.score_dict = score_dict
00786         self.symmetric = symmetric
00787     def __call__(self, charA, charB):
00788         if self.symmetric and (charA, charB) not in self.score_dict:
00789             # If the score dictionary is symmetric, then look up the
00790             # score both ways.
00791             charB, charA = charA, charB
00792         return self.score_dict[(charA, charB)]
00793 
00794 class affine_penalty:
00795     """affine_penalty(open, extend[, penalize_extend_when_opening]) -> gap_fn
00796 
00797     Create a gap function for use in an alignment.
00798 
00799     """
00800     def __init__(self, open, extend, penalize_extend_when_opening=0):
00801         if open > 0 or extend > 0:
00802             raise ValueError("Gap penalties should be non-positive.")
00803         self.open, self.extend = open, extend
00804         self.penalize_extend_when_opening = penalize_extend_when_opening
00805     def __call__(self, index, length):
00806         return calc_affine_penalty(
00807             length, self.open, self.extend, self.penalize_extend_when_opening)
00808 
00809 def calc_affine_penalty(length, open, extend, penalize_extend_when_opening):
00810     if length <= 0:
00811         return 0
00812     penalty = open + extend * length
00813     if not penalize_extend_when_opening:
00814         penalty -= extend
00815     return penalty
00816 
00817 def print_matrix(matrix):
00818     """print_matrix(matrix)
00819 
00820     Print out a matrix.  For debugging purposes.
00821 
00822     """
00823     # Transpose the matrix and get the length of the values in each column.
00824     matrixT = [[] for x in range(len(matrix[0]))]
00825     for i in range(len(matrix)):
00826         for j in range(len(matrix[i])):
00827             matrixT[j].append(len(str(matrix[i][j])))
00828     ndigits = map(max, matrixT)
00829     for i in range(len(matrix)):
00830         #Using string formatting trick to add leading spaces,
00831         print " ".join("%*s " % (ndigits[j], matrix[i][j]) \
00832                        for j in range(len(matrix[i])))
00833 
00834 def format_alignment(align1, align2, score, begin, end):
00835     """format_alignment(align1, align2, score, begin, end) -> string
00836 
00837     Format the alignment prettily into a string.
00838 
00839     """
00840     s = []
00841     s.append("%s\n" % align1)
00842     s.append("%s%s\n" % (" "*begin, "|"*(end-begin)))
00843     s.append("%s\n" % align2)
00844     s.append("  Score=%g\n" % score)
00845     return ''.join(s)
00846 
00847 
00848 # Try and load C implementations of functions.  If I can't,
00849 # then just ignore and use the pure python implementations.
00850 try:
00851     from cpairwise2 import rint, _make_score_matrix_fast
00852 except ImportError:
00853     pass
00854 
00855 
00856 def _test():
00857     """Run the module's doctests (PRIVATE)."""
00858     print "Runing doctests..."
00859     import doctest
00860     doctest.testmod(optionflags=doctest.IGNORE_EXCEPTION_DETAIL)
00861     print "Done"
00862 
00863 if __name__ == "__main__":
00864     _test()