moin  1.9.0~rc2
difflib.py
#! /usr/bin/env python
# Python 2.4.3 (maybe other versions, too) has a broken difflib, sometimes
# raising a "maximum recursion depth exceeded in cmp" exception.
# This is taken from python.org SVN repo revision 54230 with patches
# 36160 and 34415 reversed for python2.3 compatibility.
# Also, startswith(tuple) [2.5] was changed to multiple startswith(elem).

"""
Module difflib -- helpers for computing deltas between objects.

Function get_close_matches(word, possibilities, n=3, cutoff=0.6):
    Use SequenceMatcher to return list of the best "good enough" matches.

Function context_diff(a, b):
    For two lists of strings, return a delta in context diff format.

Function ndiff(a, b):
    Return a delta: the difference between `a` and `b` (lists of strings).

Function restore(delta, which):
    Return one of the two sequences that generated an ndiff delta.

Function unified_diff(a, b):
    For two lists of strings, return a delta in unified diff format.

Class SequenceMatcher:
    A flexible class for comparing pairs of sequences of any type.

Class Differ:
    For producing human-readable deltas from sequences of lines of text.

Class HtmlDiff:
    For producing HTML side by side comparison with change highlights.
"""

__all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher',
           'Differ', 'IS_CHARACTER_JUNK', 'IS_LINE_JUNK', 'context_diff',
           'unified_diff', 'HtmlDiff']

def _calculate_ratio(matches, length):
    if length:
        return 2.0 * matches / length
    return 1.0

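def _ratio_formula_demo():
    # Editorial sketch, not part of the original module: "abcd" and "bcde"
    # share the three elements "bcd" out of eight in total, so the
    # 2.0 * M / T formula used by ratio() below scores them
    # 2.0 * 3 / 8 == 0.75.
    print(_calculate_ratio(3, 8))    # 0.75
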
class SequenceMatcher:

    """
    SequenceMatcher is a flexible class for comparing pairs of sequences of
    any type, so long as the sequence elements are hashable.  The basic
    algorithm predates, and is a little fancier than, an algorithm
    published in the late 1980's by Ratcliff and Obershelp under the
    hyperbolic name "gestalt pattern matching".  The basic idea is to find
    the longest contiguous matching subsequence that contains no "junk"
    elements (R-O doesn't address junk).  The same idea is then applied
    recursively to the pieces of the sequences to the left and to the right
    of the matching subsequence.  This does not yield minimal edit
    sequences, but does tend to yield matches that "look right" to people.

    SequenceMatcher tries to compute a "human-friendly diff" between two
    sequences.  Unlike e.g. UNIX(tm) diff, the fundamental notion is the
    longest *contiguous* & junk-free matching subsequence.  That's what
    catches people's eyes.  The Windows(tm) windiff has another interesting
    notion, pairing up elements that appear uniquely in each sequence.
    That, and the method here, appear to yield more intuitive difference
    reports than does diff.  This method appears to be the least vulnerable
    to synching up on blocks of "junk lines", though (like blank lines in
    ordinary text files, or maybe "<P>" lines in HTML files).  That may be
    because this is the only method of the 3 that has a *concept* of
    "junk" <wink>.

    Example, comparing two strings, and considering blanks to be "junk":

    >>> s = SequenceMatcher(lambda x: x == " ",
    ...                     "private Thread currentThread;",
    ...                     "private volatile Thread currentThread;")
    >>>

    .ratio() returns a float in [0, 1], measuring the "similarity" of the
    sequences.  As a rule of thumb, a .ratio() value over 0.6 means the
    sequences are close matches:

    >>> print round(s.ratio(), 3)
    0.866
    >>>

    If you're only interested in where the sequences match,
    .get_matching_blocks() is handy:

    >>> for block in s.get_matching_blocks():
    ...     print "a[%d] and b[%d] match for %d elements" % block
    a[0] and b[0] match for 8 elements
    a[8] and b[17] match for 21 elements
    a[29] and b[38] match for 0 elements

    Note that the last tuple returned by .get_matching_blocks() is always a
    dummy, (len(a), len(b), 0), and this is the only case in which the last
    tuple element (number of elements matched) is 0.

    If you want to know how to change the first sequence into the second,
    use .get_opcodes():

    >>> for opcode in s.get_opcodes():
    ...     print "%6s a[%d:%d] b[%d:%d]" % opcode
     equal a[0:8] b[0:8]
    insert a[8:8] b[8:17]
     equal a[8:29] b[17:38]

    See the Differ class for a fancy human-friendly file differencer, which
    uses SequenceMatcher both to compare sequences of lines, and to compare
    sequences of characters within similar (near-matching) lines.

    See also function get_close_matches() in this module, which shows how
    simple code building on SequenceMatcher can be used to do useful work.

    Timing:  Basic R-O is cubic time worst case and quadratic time expected
    case.  SequenceMatcher is quadratic time for the worst case and has
    expected-case behavior dependent in a complicated way on how many
    elements the sequences have in common; best case time is linear.

    Methods:

    __init__(isjunk=None, a='', b='')
        Construct a SequenceMatcher.

    set_seqs(a, b)
        Set the two sequences to be compared.

    set_seq1(a)
        Set the first sequence to be compared.

    set_seq2(b)
        Set the second sequence to be compared.

    find_longest_match(alo, ahi, blo, bhi)
        Find longest matching block in a[alo:ahi] and b[blo:bhi].

    get_matching_blocks()
        Return list of triples describing matching subsequences.

    get_opcodes()
        Return list of 5-tuples describing how to turn a into b.

    ratio()
        Return a measure of the sequences' similarity (float in [0,1]).

    quick_ratio()
        Return an upper bound on .ratio() relatively quickly.

    real_quick_ratio()
        Return an upper bound on ratio() very quickly.
    """

    def __init__(self, isjunk=None, a='', b=''):
        """Construct a SequenceMatcher.

        Optional arg isjunk is None (the default), or a one-argument
        function that takes a sequence element and returns true iff the
        element is junk.  None is equivalent to passing "lambda x: 0", i.e.
        no elements are considered to be junk.  For example, pass
            lambda x: x in " \\t"
        if you're comparing lines as sequences of characters, and don't
        want to synch up on blanks or hard tabs.

        Optional arg a is the first of two sequences to be compared.  By
        default, an empty string.  The elements of a must be hashable.  See
        also .set_seqs() and .set_seq1().

        Optional arg b is the second of two sequences to be compared.  By
        default, an empty string.  The elements of b must be hashable. See
        also .set_seqs() and .set_seq2().
        """

        # Members:
        # a
        #      first sequence
        # b
        #      second sequence; differences are computed as "what do
        #      we need to do to 'a' to change it into 'b'?"
        # b2j
        #      for x in b, b2j[x] is a list of the indices (into b)
        #      at which x appears; junk elements do not appear
        # fullbcount
        #      for x in b, fullbcount[x] == the number of times x
        #      appears in b; only materialized if really needed (used
        #      only for computing quick_ratio())
        # matching_blocks
        #      a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k];
        #      ascending & non-overlapping in i and in j; terminated by
        #      a dummy (len(a), len(b), 0) sentinel
        # opcodes
        #      a list of (tag, i1, i2, j1, j2) tuples, where tag is
        #      one of
        #          'replace'   a[i1:i2] should be replaced by b[j1:j2]
        #          'delete'    a[i1:i2] should be deleted
        #          'insert'    b[j1:j2] should be inserted
        #          'equal'     a[i1:i2] == b[j1:j2]
        # isjunk
        #      a user-supplied function taking a sequence element and
        #      returning true iff the element is "junk" -- this has
        #      subtle but helpful effects on the algorithm, which I'll
        #      get around to writing up someday <0.9 wink>.
        #      DON'T USE!  Only __chain_b uses this.  Use isbjunk.
        # isbjunk
        #      for x in b, isbjunk(x) == isjunk(x) but much faster;
        #      it's really the has_key method of a hidden dict.
        #      DOES NOT WORK for x in a!
        # isbpopular
        #      for x in b, isbpopular(x) is true iff b is reasonably long
        #      (at least 200 elements) and x accounts for more than 1% of
        #      its elements.  DOES NOT WORK for x in a!

        self.isjunk = isjunk
        self.a = self.b = None
        self.set_seqs(a, b)

    def set_seqs(self, a, b):
        """Set the two sequences to be compared.

        >>> s = SequenceMatcher()
        >>> s.set_seqs("abcd", "bcde")
        >>> s.ratio()
        0.75
        """

        self.set_seq1(a)
        self.set_seq2(b)

    def set_seq1(self, a):
        """Set the first sequence to be compared.

        The second sequence to be compared is not changed.

        >>> s = SequenceMatcher(None, "abcd", "bcde")
        >>> s.ratio()
        0.75
        >>> s.set_seq1("bcde")
        >>> s.ratio()
        1.0
        >>>

        SequenceMatcher computes and caches detailed information about the
        second sequence, so if you want to compare one sequence S against
        many sequences, use .set_seq2(S) once and call .set_seq1(x)
        repeatedly for each of the other sequences.

        See also set_seqs() and set_seq2().
        """

        if a is self.a:
            return
        self.a = a
        self.matching_blocks = self.opcodes = None

    def set_seq2(self, b):
        """Set the second sequence to be compared.

        The first sequence to be compared is not changed.

        >>> s = SequenceMatcher(None, "abcd", "bcde")
        >>> s.ratio()
        0.75
        >>> s.set_seq2("abcd")
        >>> s.ratio()
        1.0
        >>>

        SequenceMatcher computes and caches detailed information about the
        second sequence, so if you want to compare one sequence S against
        many sequences, use .set_seq2(S) once and call .set_seq1(x)
        repeatedly for each of the other sequences.

        See also set_seqs() and set_seq1().
        """

        if b is self.b:
            return
        self.b = b
        self.matching_blocks = self.opcodes = None
        self.fullbcount = None
        self.__chain_b()

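    # Editorial example of the caching idiom described above: analyze one
    # target with set_seq2() once, then swap candidates in cheaply with
    # set_seq1().
    #
    #     >>> s = SequenceMatcher()
    #     >>> s.set_seq2("apple")
    #     >>> for word in "apple", "ample", "maple":
    #     ...     s.set_seq1(word)
    #     ...     print round(s.ratio(), 2)
    #     1.0
    #     0.8
    #     0.8
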
    # For each element x in b, set b2j[x] to a list of the indices in
    # b where x appears; the indices are in increasing order; note that
    # the number of times x appears in b is len(b2j[x]) ...
    # when self.isjunk is defined, junk elements don't show up in this
    # map at all, which stops the central find_longest_match method
    # from starting any matching block at a junk element ...
    # also creates the fast isbjunk function ...
    # b2j also does not contain entries for "popular" elements, meaning
    # elements that account for more than 1% of the total elements, and
    # when the sequence is reasonably large (>= 200 elements); this can
    # be viewed as an adaptive notion of semi-junk, and yields an enormous
    # speedup when, e.g., comparing program files with hundreds of
    # instances of "return NULL;" ...
    # note that this is only called when b changes; so for cross-product
    # kinds of matches, it's best to call set_seq2 once, then set_seq1
    # repeatedly

    def __chain_b(self):
        # Because isjunk is a user-defined (not C) function, and we test
        # for junk a LOT, it's important to minimize the number of calls.
        # Before the tricks described here, __chain_b was by far the most
        # time-consuming routine in the whole module!  If anyone sees
        # Jim Roskind, thank him again for profile.py -- I never would
        # have guessed that.
        # The first trick is to build b2j ignoring the possibility
        # of junk.  I.e., we don't call isjunk at all yet.  Throwing
        # out the junk later is much cheaper than building b2j "right"
        # from the start.
        b = self.b
        n = len(b)
        self.b2j = b2j = {}
        populardict = {}
        for i, elt in enumerate(b):
            if elt in b2j:
                indices = b2j[elt]
                if n >= 200 and len(indices) * 100 > n:
                    populardict[elt] = 1
                    del indices[:]
                else:
                    indices.append(i)
            else:
                b2j[elt] = [i]

        # Purge leftover indices for popular elements.
        for elt in populardict:
            del b2j[elt]

        # Now b2j.keys() contains elements uniquely, and especially when
        # the sequence is a string, that's usually a good deal smaller
        # than len(string).  The difference is the number of isjunk calls
        # saved.
        isjunk = self.isjunk
        junkdict = {}
        if isjunk:
            for d in populardict, b2j:
                for elt in d.keys():
                    if isjunk(elt):
                        junkdict[elt] = 1
                        del d[elt]

        # Now for x in b, isjunk(x) == x in junkdict, but the
        # latter is much faster.  Note too that while there may be a
        # lot of junk in the sequence, the number of *unique* junk
        # elements is probably small.  So the memory burden of keeping
        # this dict alive is likely trivial compared to the size of b2j.
        self.isbjunk = junkdict.has_key
        self.isbpopular = populardict.has_key

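    # Editorial example of the index the comments above describe (b2j and
    # isbjunk are implementation details, so this is illustration only):
    #
    #     >>> s = SequenceMatcher(None, "", "abcab")
    #     >>> s.b2j == {'a': [0, 3], 'b': [1, 4], 'c': [2]}
    #     True
    #     >>> s.isbjunk('a')
    #     False
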
    def find_longest_match(self, alo, ahi, blo, bhi):
        """Find longest matching block in a[alo:ahi] and b[blo:bhi].

        If isjunk is not defined:

        Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
            alo <= i <= i+k <= ahi
            blo <= j <= j+k <= bhi
        and for all (i',j',k') meeting those conditions,
            k >= k'
            i <= i'
            and if i == i', j <= j'

        In other words, of all maximal matching blocks, return one that
        starts earliest in a, and of all those maximal matching blocks that
        start earliest in a, return the one that starts earliest in b.

        >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (0, 4, 5)

        If isjunk is defined, first the longest matching block is
        determined as above, but with the additional restriction that no
        junk element appears in the block.  Then that block is extended as
        far as possible by matching (only) junk elements on both sides.  So
        the resulting block never matches on junk except as identical junk
        happens to be adjacent to an "interesting" match.

        Here's the same example as before, but considering blanks to be
        junk.  That prevents " abcd" from matching the " abcd" at the tail
        end of the second sequence directly.  Instead only the "abcd" can
        match, and matches the leftmost "abcd" in the second sequence:

        >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (1, 0, 4)

        If no blocks match, return (alo, blo, 0).

        >>> s = SequenceMatcher(None, "ab", "c")
        >>> s.find_longest_match(0, 2, 0, 1)
        (0, 0, 0)
        """

        # CAUTION:  stripping common prefix or suffix would be incorrect.
        # E.g.,
        #    ab
        #    acab
        # Longest matching block is "ab", but if common prefix is
        # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
        # strip, so ends up claiming that ab is changed to acab by
        # inserting "ca" in the middle.  That's minimal but unintuitive:
        # "it's obvious" that someone inserted "ac" at the front.
        # Windiff ends up at the same place as diff, but by pairing up
        # the unique 'b's and then matching the first two 'a's.

        a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
        besti, bestj, bestsize = alo, blo, 0
        # find longest junk-free match
        # during an iteration of the loop, j2len[j] = length of longest
        # junk-free match ending with a[i-1] and b[j]
        j2len = {}
        nothing = []
        for i in xrange(alo, ahi):
            # look at all instances of a[i] in b; note that because
            # b2j has no junk keys, the loop is skipped if a[i] is junk
            j2lenget = j2len.get
            newj2len = {}
            for j in b2j.get(a[i], nothing):
                # a[i] matches b[j]
                if j < blo:
                    continue
                if j >= bhi:
                    break
                k = newj2len[j] = j2lenget(j-1, 0) + 1
                if k > bestsize:
                    besti, bestj, bestsize = i-k+1, j-k+1, k
            j2len = newj2len

        # Extend the best by non-junk elements on each end.  In particular,
        # "popular" non-junk elements aren't in b2j, which greatly speeds
        # the inner loop above, but also means "the best" match so far
        # doesn't contain any junk *or* popular non-junk elements.
        while besti > alo and bestj > blo and \
              not isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              not isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize += 1

        # Now that we have a wholly interesting match (albeit possibly
        # empty!), we may as well suck up the matching junk on each
        # side of it too.  Can't think of a good reason not to, and it
        # saves post-processing the (possibly considerable) expense of
        # figuring out what to do with it.  In the case of an empty
        # interesting match, this is clearly the right thing to do,
        # because no other kind of match is possible in the regions.
        while besti > alo and bestj > blo and \
              isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

        return besti, bestj, bestsize

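    # Editorial check of the "ab" vs "acab" case from the CAUTION comment
    # above: the longest match is the whole "ab" at the tail of "acab",
    # which is exactly why stripping a common prefix would be wrong.
    #
    #     >>> s = SequenceMatcher(None, "ab", "acab")
    #     >>> s.find_longest_match(0, 2, 0, 4)
    #     (0, 2, 2)
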
    def get_matching_blocks(self):
        """Return list of triples describing matching subsequences.

        Each triple is of the form (i, j, n), and means that
        a[i:i+n] == b[j:j+n].  The triples are monotonically increasing in
        i and in j.  New in Python 2.5, it's also guaranteed that if
        (i, j, n) and (i', j', n') are adjacent triples in the list, and
        the second is not the last triple in the list, then i+n != i' or
        j+n != j'.  IOW, adjacent triples never describe adjacent equal
        blocks.

        The last triple is a dummy, (len(a), len(b), 0), and is the only
        triple with n==0.

        >>> s = SequenceMatcher(None, "abxcd", "abcd")
        >>> s.get_matching_blocks()
        [(0, 0, 2), (3, 2, 2), (5, 4, 0)]
        """

        if self.matching_blocks is not None:
            return self.matching_blocks
        la, lb = len(self.a), len(self.b)

        # This is most naturally expressed as a recursive algorithm, but
        # at least one user bumped into extreme use cases that exceeded
        # the recursion limit on their box.  So, now we maintain a list
        # (`queue`) of blocks we still need to look at, and append partial
        # results to `matching_blocks` in a loop; the matches are sorted
        # at the end.
        queue = [(0, la, 0, lb)]
        matching_blocks = []
        while queue:
            alo, ahi, blo, bhi = queue.pop()
            i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)
            # a[alo:i] vs b[blo:j] unknown
            # a[i:i+k] same as b[j:j+k]
            # a[i+k:ahi] vs b[j+k:bhi] unknown
            if k:   # if k is 0, there was no matching block
                matching_blocks.append(x)
                if alo < i and blo < j:
                    queue.append((alo, i, blo, j))
                if i+k < ahi and j+k < bhi:
                    queue.append((i+k, ahi, j+k, bhi))
        matching_blocks.sort()

        # It's possible that we have adjacent equal blocks in the
        # matching_blocks list now.  Starting with 2.5, this code was added
        # to collapse them.
        i1 = j1 = k1 = 0
        non_adjacent = []
        for i2, j2, k2 in matching_blocks:
            # Is this block adjacent to i1, j1, k1?
            if i1 + k1 == i2 and j1 + k1 == j2:
                # Yes, so collapse them -- this just increases the length of
                # the first block by the length of the second, and the first
                # block so lengthened remains the block to compare against.
                k1 += k2
            else:
                # Not adjacent.  Remember the first block (k1==0 means it's
                # the dummy we started with), and make the second block the
                # new block to compare against.
                if k1:
                    non_adjacent.append((i1, j1, k1))
                i1, j1, k1 = i2, j2, k2
        if k1:
            non_adjacent.append((i1, j1, k1))

        non_adjacent.append( (la, lb, 0) )
        self.matching_blocks = non_adjacent
        return self.matching_blocks

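    # Editorial example: reading the triples back as the matched substrings
    # themselves.
    #
    #     >>> a, b = "abxcd", "abcd"
    #     >>> s = SequenceMatcher(None, a, b)
    #     >>> [a[i:i+n] for i, j, n in s.get_matching_blocks()]
    #     ['ab', 'cd', '']
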
    def get_opcodes(self):
        """Return list of 5-tuples describing how to turn a into b.

        Each tuple is of the form (tag, i1, i2, j1, j2).  The first tuple
        has i1 == j1 == 0, and remaining tuples have i1 == the i2 from the
        tuple preceding it, and likewise for j1 == the previous j2.

        The tags are strings, with these meanings:

        'replace':  a[i1:i2] should be replaced by b[j1:j2]
        'delete':   a[i1:i2] should be deleted.
                    Note that j1==j2 in this case.
        'insert':   b[j1:j2] should be inserted at a[i1:i1].
                    Note that i1==i2 in this case.
        'equal':    a[i1:i2] == b[j1:j2]

        >>> a = "qabxcd"
        >>> b = "abycdf"
        >>> s = SequenceMatcher(None, a, b)
        >>> for tag, i1, i2, j1, j2 in s.get_opcodes():
        ...    print ("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
        ...           (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2]))
         delete a[0:1] (q) b[0:0] ()
          equal a[1:3] (ab) b[0:2] (ab)
        replace a[3:4] (x) b[2:3] (y)
          equal a[4:6] (cd) b[3:5] (cd)
         insert a[6:6] () b[5:6] (f)
        """

        if self.opcodes is not None:
            return self.opcodes
        i = j = 0
        self.opcodes = answer = []
        for ai, bj, size in self.get_matching_blocks():
            # invariant:  we've pumped out correct diffs to change
            # a[:i] into b[:j], and the next matching block is
            # a[ai:ai+size] == b[bj:bj+size].  So we need to pump
            # out a diff to change a[i:ai] into b[j:bj], pump out
            # the matching block, and move (i,j) beyond the match
            tag = ''
            if i < ai and j < bj:
                tag = 'replace'
            elif i < ai:
                tag = 'delete'
            elif j < bj:
                tag = 'insert'
            if tag:
                answer.append( (tag, i, ai, j, bj) )
            i, j = ai+size, bj+size
            # the list of matching blocks is terminated by a
            # sentinel with size 0
            if size:
                answer.append( ('equal', ai, i, bj, j) )
        return answer

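    # Editorial example: replaying the opcodes really does turn a into b.
    # Keep material from b for every tag except 'delete' ('equal' runs
    # satisfy a[i1:i2] == b[j1:j2], so either side works for them).
    #
    #     >>> a, b = "qabxcd", "abycdf"
    #     >>> s = SequenceMatcher(None, a, b)
    #     >>> pieces = []
    #     >>> for tag, i1, i2, j1, j2 in s.get_opcodes():
    #     ...     if tag != 'delete':
    #     ...         pieces.append(b[j1:j2])
    #     >>> ''.join(pieces) == b
    #     True
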
    def get_grouped_opcodes(self, n=3):
        """ Isolate change clusters by eliminating ranges with no changes.

        Return a generator of groups with up to n lines of context.
        Each group is in the same format as returned by get_opcodes().

        >>> from pprint import pprint
        >>> a = map(str, range(1,40))
        >>> b = a[:]
        >>> b[8:8] = ['i']     # Make an insertion
        >>> b[20] += 'x'       # Make a replacement
        >>> b[23:28] = []      # Make a deletion
        >>> b[30] += 'y'       # Make another replacement
        >>> pprint(list(SequenceMatcher(None,a,b).get_grouped_opcodes()))
        [[('equal', 5, 8, 5, 8), ('insert', 8, 8, 8, 9), ('equal', 8, 11, 9, 12)],
         [('equal', 16, 19, 17, 20),
          ('replace', 19, 20, 20, 21),
          ('equal', 20, 22, 21, 23),
          ('delete', 22, 27, 23, 23),
          ('equal', 27, 30, 23, 26)],
         [('equal', 31, 34, 27, 30),
          ('replace', 34, 35, 30, 31),
          ('equal', 35, 38, 31, 34)]]
        """

        codes = self.get_opcodes()
        if not codes:
            codes = [("equal", 0, 1, 0, 1)]
        # Fixup leading and trailing groups if they show no changes.
        if codes[0][0] == 'equal':
            tag, i1, i2, j1, j2 = codes[0]
            codes[0] = tag, max(i1, i2-n), i2, max(j1, j2-n), j2
        if codes[-1][0] == 'equal':
            tag, i1, i2, j1, j2 = codes[-1]
            codes[-1] = tag, i1, min(i2, i1+n), j1, min(j2, j1+n)

        nn = n + n
        group = []
        for tag, i1, i2, j1, j2 in codes:
            # End the current group and start a new one whenever
            # there is a large range with no changes.
            if tag == 'equal' and i2-i1 > nn:
                group.append((tag, i1, min(i2, i1+n), j1, min(j2, j1+n)))
                yield group
                group = []
                i1, j1 = max(i1, i2-n), max(j1, j2-n)
            group.append((tag, i1, i2, j1, j2))
        if group and not (len(group)==1 and group[0][0] == 'equal'):
            yield group

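    # Editorial example of the grouping: two change clusters separated by
    # more than 2*n unchanged lines come out as two separate groups.
    #
    #     >>> a = map(str, range(1, 40))
    #     >>> b = a[:]
    #     >>> b[8:8] = ['i']     # insert a line
    #     >>> del b[20:25]       # delete five lines further down
    #     >>> len(list(SequenceMatcher(None, a, b).get_grouped_opcodes(3)))
    #     2
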
    def ratio(self):
        """Return a measure of the sequences' similarity (float in [0,1]).

        Where T is the total number of elements in both sequences, and
        M is the number of matches, this is 2.0*M / T.
        Note that this is 1 if the sequences are identical, and 0 if
        they have nothing in common.

        .ratio() is expensive to compute if you haven't already computed
        .get_matching_blocks() or .get_opcodes(), in which case you may
        want to try .quick_ratio() or .real_quick_ratio() first to get an
        upper bound.

        >>> s = SequenceMatcher(None, "abcd", "bcde")
        >>> s.ratio()
        0.75
        >>> s.quick_ratio()
        0.75
        >>> s.real_quick_ratio()
        1.0
        """

        matches = reduce(lambda sum, triple: sum + triple[-1],
                         self.get_matching_blocks(), 0)
        return _calculate_ratio(matches, len(self.a) + len(self.b))

    def quick_ratio(self):
        """Return an upper bound on ratio() relatively quickly.

        This isn't defined beyond that it is an upper bound on .ratio(), and
        is faster to compute.
        """

        # viewing a and b as multisets, set matches to the cardinality
        # of their intersection; this counts the number of matches
        # without regard to order, so is clearly an upper bound
        if self.fullbcount is None:
            self.fullbcount = fullbcount = {}
            for elt in self.b:
                fullbcount[elt] = fullbcount.get(elt, 0) + 1
        fullbcount = self.fullbcount
        # avail[x] is the number of times x appears in 'b' less the
        # number of times we've seen it in 'a' so far ... kinda
        avail = {}
        availhas, matches = avail.has_key, 0
        for elt in self.a:
            if availhas(elt):
                numb = avail[elt]
            else:
                numb = fullbcount.get(elt, 0)
            avail[elt] = numb - 1
            if numb > 0:
                matches = matches + 1
        return _calculate_ratio(matches, len(self.a) + len(self.b))

    def real_quick_ratio(self):
        """Return an upper bound on ratio() very quickly.

        This isn't defined beyond that it is an upper bound on .ratio(), and
        is faster to compute than either .ratio() or .quick_ratio().
        """

        la, lb = len(self.a), len(self.b)
        # can't have more matches than the number of elements in the
        # shorter sequence
        return _calculate_ratio(min(la, lb), la + lb)

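def _ratio_bounds_demo():
    # Editorial sketch of the cheap-to-expensive cascade the docstrings
    # above recommend: real_quick_ratio() and quick_ratio() are upper
    # bounds on ratio(), so a candidate can be rejected before the exact
    # (expensive) computation.  get_close_matches() below uses exactly
    # this ordering.
    s = SequenceMatcher(None, "abcd", "bcde")
    cutoff = 0.6
    keep = (s.real_quick_ratio() >= cutoff and
            s.quick_ratio() >= cutoff and
            s.ratio() >= cutoff)
    print(keep)    # True: the bounds are 1.0 and 0.75, the exact ratio 0.75
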
def get_close_matches(word, possibilities, n=3, cutoff=0.6):
    """Use SequenceMatcher to return list of the best "good enough" matches.

    word is a sequence for which close matches are desired (typically a
    string).

    possibilities is a list of sequences against which to match word
    (typically a list of strings).

    Optional arg n (default 3) is the maximum number of close matches to
    return.  n must be > 0.

    Optional arg cutoff (default 0.6) is a float in [0, 1].  Possibilities
    that don't score at least that similar to word are ignored.

    The best (no more than n) matches among the possibilities are returned
    in a list, sorted by similarity score, most similar first.

    >>> get_close_matches("appel", ["ape", "apple", "peach", "puppy"])
    ['apple', 'ape']
    >>> import keyword as _keyword
    >>> get_close_matches("wheel", _keyword.kwlist)
    ['while']
    >>> get_close_matches("apple", _keyword.kwlist)
    []
    >>> get_close_matches("accept", _keyword.kwlist)
    ['except']
    """

    if not n > 0:
        raise ValueError("n must be > 0: %r" % (n,))
    if not 0.0 <= cutoff <= 1.0:
        raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))
    result = []
    s = SequenceMatcher()
    s.set_seq2(word)
    for x in possibilities:
        s.set_seq1(x)
        if s.real_quick_ratio() >= cutoff and \
           s.quick_ratio() >= cutoff and \
           s.ratio() >= cutoff:
            result.append((s.ratio(), x))

    # Sort by score.
    result.sort()
    # Retain only the best n.
    result = result[-n:]
    # Move best-scorer to head of list.
    result.reverse()
    # Strip scores.
    return [x for score, x in result]

def _count_leading(line, ch):
    """
    Return number of `ch` characters at the start of `line`.

    Example:

    >>> _count_leading('   abc', ' ')
    3
    """

    i, n = 0, len(line)
    while i < n and line[i] == ch:
        i += 1
    return i

class Differ:
    r"""
    Differ is a class for comparing sequences of lines of text, and
    producing human-readable differences or deltas.  Differ uses
    SequenceMatcher both to compare sequences of lines, and to compare
    sequences of characters within similar (near-matching) lines.

    Each line of a Differ delta begins with a two-letter code:

        '- '    line unique to sequence 1
        '+ '    line unique to sequence 2
        '  '    line common to both sequences
        '? '    line not present in either input sequence

    Lines beginning with '? ' attempt to guide the eye to intraline
    differences, and were not present in either input sequence.  These lines
    can be confusing if the sequences contain tab characters.

    Note that Differ makes no claim to produce a *minimal* diff.  To the
    contrary, minimal diffs are often counter-intuitive, because they synch
    up anywhere possible, sometimes accidental matches 100 pages apart.
    Restricting synch points to contiguous matches preserves some notion of
    locality, at the occasional cost of producing a longer diff.

    Example: Comparing two texts.

    First we set up the texts, sequences of individual single-line strings
    ending with newlines (such sequences can also be obtained from the
    `readlines()` method of file-like objects):

    >>> text1 = '''  1. Beautiful is better than ugly.
    ...   2. Explicit is better than implicit.
    ...   3. Simple is better than complex.
    ...   4. Complex is better than complicated.
    ... '''.splitlines(1)
    >>> len(text1)
    4
    >>> text1[0][-1]
    '\n'
    >>> text2 = '''  1. Beautiful is better than ugly.
    ...   3.   Simple is better than complex.
    ...   4. Complicated is better than complex.
    ...   5. Flat is better than nested.
    ... '''.splitlines(1)

    Next we instantiate a Differ object:

    >>> d = Differ()

    Note that when instantiating a Differ object we may pass functions to
    filter out line and character 'junk'.  See Differ.__init__ for details.

    Finally, we compare the two:

    >>> result = list(d.compare(text1, text2))

    'result' is a list of strings, so let's pretty-print it:

    >>> from pprint import pprint as _pprint
    >>> _pprint(result)
    ['    1. Beautiful is better than ugly.\n',
     '-   2. Explicit is better than implicit.\n',
     '-   3. Simple is better than complex.\n',
     '+   3.   Simple is better than complex.\n',
     '?     ++\n',
     '-   4. Complex is better than complicated.\n',
     '?            ^                     ---- ^\n',
     '+   4. Complicated is better than complex.\n',
     '?           ++++ ^                      ^\n',
     '+   5. Flat is better than nested.\n']

    As a single multi-line string it looks like this:

    >>> print ''.join(result),
        1. Beautiful is better than ugly.
    -   2. Explicit is better than implicit.
    -   3. Simple is better than complex.
    +   3.   Simple is better than complex.
    ?     ++
    -   4. Complex is better than complicated.
    ?            ^                     ---- ^
    +   4. Complicated is better than complex.
    ?           ++++ ^                      ^
    +   5. Flat is better than nested.

    Methods:

    __init__(linejunk=None, charjunk=None)
        Construct a text differencer, with optional filters.

    compare(a, b)
        Compare two sequences of lines; generate the resulting delta.
    """

    def __init__(self, linejunk=None, charjunk=None):
        """
        Construct a text differencer, with optional filters.

        The two optional keyword parameters are for filter functions:

        - `linejunk`: A function that should accept a single string argument,
          and return true iff the string is junk. The module-level function
          `IS_LINE_JUNK` may be used to filter out lines without visible
          characters, except for at most one splat ('#').  It is recommended
          to leave linejunk None; as of Python 2.3, the underlying
          SequenceMatcher class has grown an adaptive notion of "noise" lines
          that's better than any static definition the author has ever been
          able to craft.

        - `charjunk`: A function that should accept a string of length 1. The
          module-level function `IS_CHARACTER_JUNK` may be used to filter out
          whitespace characters (a blank or tab; **note**: bad idea to include
          newline in this!).  Use of IS_CHARACTER_JUNK is recommended.
        """

        self.linejunk = linejunk
        self.charjunk = charjunk

    def compare(self, a, b):
        r"""
        Compare two sequences of lines; generate the resulting delta.

        Each sequence must contain individual single-line strings ending with
        newlines. Such sequences can be obtained from the `readlines()` method
        of file-like objects.  The delta generated also consists of newline-
        terminated strings, ready to be printed as-is via the writelines()
        method of a file-like object.

        Example:

        >>> print ''.join(Differ().compare('one\ntwo\nthree\n'.splitlines(1),
        ...                                'ore\ntree\nemu\n'.splitlines(1))),
        - one
        ?  ^
        + ore
        ?  ^
        - two
        - three
        ?  -
        + tree
        + emu
        """

        cruncher = SequenceMatcher(self.linejunk, a, b)
        for tag, alo, ahi, blo, bhi in cruncher.get_opcodes():
            if tag == 'replace':
                g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
            elif tag == 'delete':
                g = self._dump('-', a, alo, ahi)
            elif tag == 'insert':
                g = self._dump('+', b, blo, bhi)
            elif tag == 'equal':
                g = self._dump(' ', a, alo, ahi)
            else:
                raise ValueError, 'unknown tag %r' % (tag,)

            for line in g:
                yield line

    def _dump(self, tag, x, lo, hi):
        """Generate comparison results for a same-tagged range."""
        for i in xrange(lo, hi):
            yield '%s %s' % (tag, x[i])

    def _plain_replace(self, a, alo, ahi, b, blo, bhi):
        assert alo < ahi and blo < bhi
        # dump the shorter block first -- reduces the burden on short-term
        # memory if the blocks are of very different sizes
        if bhi - blo < ahi - alo:
            first  = self._dump('+', b, blo, bhi)
            second = self._dump('-', a, alo, ahi)
        else:
            first  = self._dump('-', a, alo, ahi)
            second = self._dump('+', b, blo, bhi)

        for g in first, second:
            for line in g:
                yield line

    def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
        r"""
        When replacing one block of lines with another, search the blocks
        for *similar* lines; the best-matching pair (if any) is used as a
        synch point, and intraline difference marking is done on the
        similar pair. Lots of work, but often worth it.

        Example:

        >>> d = Differ()
        >>> results = d._fancy_replace(['abcDefghiJkl\n'], 0, 1,
        ...                            ['abcdefGhijkl\n'], 0, 1)
        >>> print ''.join(results),
        - abcDefghiJkl
        ?    ^  ^  ^
        + abcdefGhijkl
        ?    ^  ^  ^
        """

        # don't synch up unless the lines have a similarity score of at
        # least cutoff; best_ratio tracks the best score seen so far
        best_ratio, cutoff = 0.74, 0.75
        cruncher = SequenceMatcher(self.charjunk)
        eqi, eqj = None, None   # 1st indices of equal lines (if any)

        # search for the pair that matches best without being identical
        # (identical lines must be junk lines, & we don't want to synch up
        # on junk -- unless we have to)
        for j in xrange(blo, bhi):
            bj = b[j]
            cruncher.set_seq2(bj)
            for i in xrange(alo, ahi):
                ai = a[i]
                if ai == bj:
                    if eqi is None:
                        eqi, eqj = i, j
                    continue
                cruncher.set_seq1(ai)
                # computing similarity is expensive, so use the quick
                # upper bounds first -- have seen this speed up messy
                # compares by a factor of 3.
                # note that ratio() is only expensive to compute the first
                # time it's called on a sequence pair; the expensive part
                # of the computation is cached by cruncher
                if cruncher.real_quick_ratio() > best_ratio and \
                      cruncher.quick_ratio() > best_ratio and \
                      cruncher.ratio() > best_ratio:
                    best_ratio, best_i, best_j = cruncher.ratio(), i, j
        if best_ratio < cutoff:
            # no non-identical "pretty close" pair
            if eqi is None:
                # no identical pair either -- treat it as a straight replace
                for line in self._plain_replace(a, alo, ahi, b, blo, bhi):
                    yield line
                return
            # no close pair, but an identical pair -- synch up on that
            best_i, best_j, best_ratio = eqi, eqj, 1.0
        else:
            # there's a close pair, so forget the identical pair (if any)
            eqi = None

        # a[best_i] very similar to b[best_j]; eqi is None iff they're not
        # identical

        # pump out diffs from before the synch point
        for line in self._fancy_helper(a, alo, best_i, b, blo, best_j):
            yield line

        # do intraline marking on the synch pair
        aelt, belt = a[best_i], b[best_j]
        if eqi is None:
            # pump out a '-', '?', '+', '?' quad for the synched lines
            atags = btags = ""
            cruncher.set_seqs(aelt, belt)
            for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
                la, lb = ai2 - ai1, bj2 - bj1
                if tag == 'replace':
                    atags += '^' * la
                    btags += '^' * lb
                elif tag == 'delete':
                    atags += '-' * la
                elif tag == 'insert':
                    btags += '+' * lb
                elif tag == 'equal':
                    atags += ' ' * la
                    btags += ' ' * lb
                else:
                    raise ValueError, 'unknown tag %r' % (tag,)
            for line in self._qformat(aelt, belt, atags, btags):
                yield line
        else:
            # the synch pair is identical
            yield '  ' + aelt

        # pump out diffs from after the synch point
        for line in self._fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi):
            yield line

    def _fancy_helper(self, a, alo, ahi, b, blo, bhi):
        g = []
        if alo < ahi:
            if blo < bhi:
                g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
            else:
                g = self._dump('-', a, alo, ahi)
        elif blo < bhi:
            g = self._dump('+', b, blo, bhi)

        for line in g:
            yield line

    def _qformat(self, aline, bline, atags, btags):
        r"""
        Format "?" output and deal with leading tabs.

        Example:

        >>> d = Differ()
        >>> results = d._qformat('\tabcDefghiJkl\n', '\t\tabcdefGhijkl\n',
        ...                      '  ^ ^  ^      ', '+  ^ ^  ^      ')
        >>> for line in results: print repr(line)
        ...
        '- \tabcDefghiJkl\n'
        '? \t ^ ^  ^\n'
        '+ \t\tabcdefGhijkl\n'
        '? \t  ^ ^  ^\n'
        """

        # Can hurt, but will probably help most of the time.
        common = min(_count_leading(aline, "\t"),
                     _count_leading(bline, "\t"))
        common = min(common, _count_leading(atags[:common], " "))
        atags = atags[common:].rstrip()
        btags = btags[common:].rstrip()

        yield "- " + aline
        if atags:
            yield "? %s%s\n" % ("\t" * common, atags)

        yield "+ " + bline
        if btags:
            yield "? %s%s\n" % ("\t" * common, btags)

# With respect to junk, an earlier version of ndiff simply refused to
# *start* a match with a junk element.  The result was cases like this:
#     before: private Thread currentThread;
#     after:  private volatile Thread currentThread;
# If you consider whitespace to be junk, the longest contiguous match
# not starting with junk is "e Thread currentThread".  So ndiff reported
# that "e volatil" was inserted between the 't' and the 'e' in "private".
# While an accurate view, to people that's absurd.  The current version
# looks for matching blocks that are entirely junk-free, then extends the
# longest one of those as far as possible but only with matching junk.
# So now "currentThread" is matched, then extended to suck up the
# preceding blank; then "private" is matched, and extended to suck up the
# following blank; then "Thread" is matched; and finally ndiff reports
# that "volatile " was inserted before "Thread".  The only quibble
# remaining is that perhaps it was really the case that " volatile"
# was inserted after "private".  I can live with that <wink>.

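def _junk_extension_demo():
    # Editorial sketch of the behaviour described above, using ndiff()
    # (defined later in this file): with blanks treated as junk, the
    # delta reports "volatile " as the insertion rather than "e volatil".
    before = ['private Thread currentThread;\n']
    after = ['private volatile Thread currentThread;\n']
    print(''.join(ndiff(before, after)))
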
import re

def IS_LINE_JUNK(line, pat=re.compile(r"\s*#?\s*$").match):
    r"""
    Return 1 for ignorable line: iff `line` is blank or contains a single '#'.

    Examples:

    >>> IS_LINE_JUNK('\n')
    True
    >>> IS_LINE_JUNK('  #   \n')
    True
    >>> IS_LINE_JUNK('hello\n')
    False
    """

    return pat(line) is not None

def IS_CHARACTER_JUNK(ch, ws=" \t"):
    r"""
    Return 1 for ignorable character: iff `ch` is a space or tab.

    Examples:

    >>> IS_CHARACTER_JUNK(' ')
    True
    >>> IS_CHARACTER_JUNK('\t')
    True
    >>> IS_CHARACTER_JUNK('\n')
    False
    >>> IS_CHARACTER_JUNK('x')
    False
    """

    return ch in ws


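def _junk_filter_demo():
    # Editorial sketch: the two predicates above are the filters meant to
    # be handed to Differ (IS_CHARACTER_JUNK is already ndiff's default
    # charjunk; linejunk is best left None, per Differ.__init__ above).
    d = Differ(linejunk=None, charjunk=IS_CHARACTER_JUNK)
    print(''.join(d.compare(['one  two\n'], ['one two\n'])))
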
def unified_diff(a, b, fromfile='', tofile='', fromfiledate='',
                 tofiledate='', n=3, lineterm='\n'):
    r"""
    Compare two sequences of lines; generate the delta as a unified diff.

    Unified diffs are a compact way of showing line changes and a few
    lines of context.  The number of context lines is set by 'n' which
    defaults to three.

    By default, the diff control lines (those with ---, +++, or @@) are
    created with a trailing newline.  This is helpful so that inputs
    created from file.readlines() result in diffs that are suitable for
    file.writelines() since both the inputs and outputs have trailing
    newlines.

    For inputs that do not have trailing newlines, set the lineterm
    argument to "" so that the output will be uniformly newline free.

    The unidiff format normally has a header for filenames and modification
    times.  Any or all of these may be specified using strings for
    'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.  The modification
    times are normally expressed in the format returned by time.ctime().

    Example:

    >>> for line in unified_diff('one two three four'.split(),
    ...             'zero one tree four'.split(), 'Original', 'Current',
    ...             'Sat Jan 26 23:30:50 1991', 'Fri Jun 06 10:20:52 2003',
    ...             lineterm=''):
    ...     print line
    --- Original Sat Jan 26 23:30:50 1991
    +++ Current Fri Jun 06 10:20:52 2003
    @@ -1,4 +1,4 @@
    +zero
     one
    -two
    -three
    +tree
     four
    """

    started = False
    for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
        if not started:
            yield '--- %s %s%s' % (fromfile, fromfiledate, lineterm)
            yield '+++ %s %s%s' % (tofile, tofiledate, lineterm)
            started = True
        i1, i2, j1, j2 = group[0][1], group[-1][2], group[0][3], group[-1][4]
        yield "@@ -%d,%d +%d,%d @@%s" % (i1+1, i2-i1, j1+1, j2-j1, lineterm)
        for tag, i1, i2, j1, j2 in group:
            if tag == 'equal':
                for line in a[i1:i2]:
                    yield ' ' + line
                continue
            if tag == 'replace' or tag == 'delete':
                for line in a[i1:i2]:
                    yield '-' + line
            if tag == 'replace' or tag == 'insert':
                for line in b[j1:j2]:
                    yield '+' + line

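def _unified_diff_demo():
    # Editorial sketch of the readlines()/writelines() round trip the
    # docstring above describes: the inputs carry their own newlines, so
    # the default lineterm keeps the output writelines()-ready.
    import sys
    a = 'one\ntwo\nthree\n'.splitlines(1)
    b = 'one\ntoo\nthree\n'.splitlines(1)
    sys.stdout.writelines(unified_diff(a, b, 'before', 'after'))
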
01207 # See http://www.unix.org/single_unix_specification/
01208 def context_diff(a, b, fromfile='', tofile='',
01209                  fromfiledate='', tofiledate='', n=3, lineterm='\n'):
01210     r"""
01211     Compare two sequences of lines; generate the delta as a context diff.
01212 
01213     Context diffs are a compact way of showing line changes and a few
01214     lines of context.  The number of context lines is set by 'n' which
01215     defaults to three.
01216 
01217     By default, the diff control lines (those with *** or ---) are
01218     created with a trailing newline.  This is helpful so that inputs
01219     created from file.readlines() result in diffs that are suitable for
01220     file.writelines() since both the inputs and outputs have trailing
01221     newlines.
01222 
01223     For inputs that do not have trailing newlines, set the lineterm
01224     argument to "" so that the output will be uniformly newline free.
01225 
01226     The context diff format normally has a header for filenames and
01227     modification times.  Any or all of these may be specified using
01228     strings for 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.
01229     The modification times are normally expressed in the format returned
01230     by time.ctime().  If not specified, the strings default to blanks.
01231 
01232     Example:
01233 
01234     >>> print ''.join(context_diff('one\ntwo\nthree\nfour\n'.splitlines(1),
01235     ...       'zero\none\ntree\nfour\n'.splitlines(1), 'Original', 'Current',
01236     ...       'Sat Jan 26 23:30:50 1991', 'Fri Jun 06 10:22:46 2003')),
01237     *** Original Sat Jan 26 23:30:50 1991
01238     --- Current Fri Jun 06 10:22:46 2003
01239     ***************
01240     *** 1,4 ****
01241       one
01242     ! two
01243     ! three
01244       four
01245     --- 1,4 ----
01246     + zero
01247       one
01248     ! tree
01249       four
01250     """
01251 
01252     started = False
01253     prefixmap = {'insert':'+ ', 'delete':'- ', 'replace':'! ', 'equal':'  '}
01254     for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
01255         if not started:
01256             yield '*** %s %s%s' % (fromfile, fromfiledate, lineterm)
01257             yield '--- %s %s%s' % (tofile, tofiledate, lineterm)
01258             started = True
01259 
01260         yield '***************%s' % (lineterm,)
01261         if group[-1][2] - group[0][1] >= 2:
01262             yield '*** %d,%d ****%s' % (group[0][1]+1, group[-1][2], lineterm)
01263         else:
01264             yield '*** %d ****%s' % (group[-1][2], lineterm)
01265         visiblechanges = [e for e in group if e[0] in ('replace', 'delete')]
01266         if visiblechanges:
01267             for tag, i1, i2, _, _ in group:
01268                 if tag != 'insert':
01269                     for line in a[i1:i2]:
01270                         yield prefixmap[tag] + line
01271 
01272         if group[-1][4] - group[0][3] >= 2:
01273             yield '--- %d,%d ----%s' % (group[0][3]+1, group[-1][4], lineterm)
01274         else:
01275             yield '--- %d ----%s' % (group[-1][4], lineterm)
01276         visiblechanges = [e for e in group if e[0] in ('replace', 'insert')]
01277         if visiblechanges:
01278             for tag, _, _, j1, j2 in group:
01279                 if tag != 'delete':
01280                     for line in b[j1:j2]:
01281                         yield prefixmap[tag] + line
01282 
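# A small sketch of the lineterm behaviour documented above: for inputs
# without trailing newlines, passing lineterm='' keeps the control lines
# newline free as well, so every yielded line prints uniformly:
#
#     a = 'one two three four'.split()
#     b = 'zero one tree four'.split()
#     for line in context_diff(a, b, 'Original', 'Current', lineterm=''):
#         print line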
01283 def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK):
01284     r"""
01285     Compare `a` and `b` (lists of strings); return a `Differ`-style delta.
01286 
01287     Optional keyword parameters `linejunk` and `charjunk` are for filter
01288     functions (or None):
01289 
01290     - linejunk: A function that should accept a single string argument, and
01291       return true iff the string is junk.  The default is None, and is
01292       recommended; as of Python 2.3, an adaptive notion of "noise" lines is
01293       used that does a good job on its own.
01294 
01295     - charjunk: A function that should accept a string of length 1. The
01296       default is module-level function IS_CHARACTER_JUNK, which filters out
01297       whitespace characters (a blank or tab; note: bad idea to include newline
01298       in this!).
01299 
01300     Tools/scripts/ndiff.py is a command-line front-end to this function.
01301 
01302     Example:
01303 
01304     >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
01305     ...              'ore\ntree\nemu\n'.splitlines(1))
01306     >>> print ''.join(diff),
01307     - one
01308     ?  ^
01309     + ore
01310     ?  ^
01311     - two
01312     - three
01313     ?  -
01314     + tree
01315     + emu
01316     """
01317     return Differ(linejunk, charjunk).compare(a, b)
01318 
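# A sketch of supplying the optional filters; IS_LINE_JUNK is this module's
# stock line filter, and the lambda is an illustrative charjunk that treats
# only blanks as junk:
#
#     diff = ndiff('a\n\nb  c\n'.splitlines(1), 'a\n\nb c\n'.splitlines(1),
#                  linejunk=IS_LINE_JUNK, charjunk=lambda ch: ch == ' ')
#     print ''.join(diff),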
01319 def _mdiff(fromlines, tolines, context=None, linejunk=None,
01320            charjunk=IS_CHARACTER_JUNK):
01321     r"""Returns a generator yielding marked up from/to side by side differences.
01322 
01323     Arguments:
01324     fromlines -- list of text lines to be compared to tolines
01325     tolines -- list of text lines to be compared to fromlines
01326     context -- number of context lines to display on each side of difference,
01327                if None, all from/to text lines will be generated.
01328     linejunk -- passed on to ndiff (see ndiff documentation)
01329     charjunk -- passed on to ndiff (see ndiff documentation)
01330 
01331     This function returns an iterator which yields a tuple:
01332     (from line tuple, to line tuple, boolean flag)
01333 
01334     from/to line tuple -- (line num, line text)
01335         line num -- integer or None (to indicate a context separation)
01336         line text -- original line text with following markers inserted:
01337             '\0+' -- marks start of added text
01338             '\0-' -- marks start of deleted text
01339             '\0^' -- marks start of changed text
01340             '\1' -- marks end of added/deleted/changed text
01341 
01342     boolean flag -- None indicates context separation, True indicates
01343         either "from" or "to" line contains a change, otherwise False.
01344 
01345     This function/iterator was originally developed to generate side by side
01346     file differences for making HTML pages (see HtmlDiff class for example
01347     usage).
01348 
01349     Note, this function utilizes the ndiff function to generate the side by
01350     side difference markup.  Optional ndiff arguments may be passed to this
01351     function and they in turn will be passed to ndiff.
01352     """
01353     import re
01354 
01355     # regular expression for finding intraline change indices
01356     change_re = re.compile(r'(\++|\-+|\^+)')
01357 
01358     # create the difference iterator to generate the differences
01359     diff_lines_iterator = ndiff(fromlines,tolines,linejunk,charjunk)
01360 
01361     def _make_line(lines, format_key, side, num_lines=[0,0]):
01362         """Returns line of text with user's change markup and line formatting.
01363 
01364         lines -- list of lines from the ndiff generator to produce a line of
01365                  text from.  When producing the line of text to return, the
01366                  lines used are removed from this list.
01367         format_key -- '+' return first line in list with "add" markup around
01368                           the entire line.
01369                       '-' return first line in list with "delete" markup around
01370                           the entire line.
01371                       '?' return first line in list with add/delete/change
01372                           intraline markup (indices obtained from second line)
01373                       None return first line in list with no markup
01374         side -- index into the num_lines list (0=from,1=to)
01375         num_lines -- from/to current line number.  This is NOT intended to be a
01376                      passed parameter.  It is present as a keyword argument to
01377                      maintain memory of the current line numbers between calls
01378                      of this function.
01379 
01380         Note, this function is purposefully not defined at the module scope so
01381         that data it needs from its parent function (within whose context it
01382         is defined) does not need to be of module scope.
01383         """
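        # A worked sketch of the '?' branch below: the ndiff pair
        # ['- two\n', '?  ^\n'] becomes (num, 't\0^w\1o\n') -- the column
        # flagged by '^' is wrapped in '\0^' ... '\1' and the two-character
        # ndiff prefix is dropped.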
01384         num_lines[side] += 1
01385         # Handle case where no user markup is to be added, just return line of
01386         # text with user's line format to allow for usage of the line number.
01387         if format_key is None:
01388             return (num_lines[side],lines.pop(0)[2:])
01389         # Handle case of intraline changes
01390         if format_key == '?':
01391             text, markers = lines.pop(0), lines.pop(0)
01392             # find intraline changes (store change type and indices in tuples)
01393             sub_info = []
01394             def record_sub_info(match_object,sub_info=sub_info):
01395                 sub_info.append([match_object.group(1)[0],match_object.span()])
01396                 return match_object.group(1)
01397             change_re.sub(record_sub_info,markers)
01398             # process each tuple inserting our special marks that won't be
01399             # noticed by an xml/html escaper.
01400             for key,(begin,end) in sub_info[::-1]:
01401                 text = text[0:begin]+'\0'+key+text[begin:end]+'\1'+text[end:]
01402             text = text[2:]
01403         # Handle case of add/delete entire line
01404         else:
01405             text = lines.pop(0)[2:]
01406             # if line of text is just a newline, insert a space so there is
01407             # something for the user to highlight and see.
01408             if not text:
01409                 text = ' '
01410             # insert marks that won't be noticed by an xml/html escaper.
01411             text = '\0' + format_key + text + '\1'
01412         # Return line of text; first allow user's line formatter to do its
01413         # thing (such as adding the line number), then replace the special
01414         # marks with the user's change markup.
01415         return (num_lines[side],text)
01416 
01417     def _line_iterator():
01418         """Yields from/to lines of text with a change indication.
01419 
01420         This function is an iterator.  It itself pulls lines from a
01421         differencing iterator, processes them and yields them.  When it can
01422         it yields both a "from" and a "to" line, otherwise it will yield one
01423         or the other.  In addition to yielding the lines of from/to text, a
01424         boolean flag is yielded to indicate if the text line(s) have
01425         differences in them.
01426 
01427         Note, this function is purposefully not defined at the module scope so
01428         that data it needs from its parent function (within whose context it
01429         is defined) does not need to be of module scope.
01430         """
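        # A sketch of the lookahead string s built below: for the ndiff
        # output ['- one\n', '?  ^\n', '+ ore\n', '?  ^\n'] the first
        # characters concatenate to '-?+?', which is handled as a simple
        # intraline change.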
01431         lines = []
01432         num_blanks_pending, num_blanks_to_yield = 0, 0
01433         while True:
01434             # Load up next 4 lines so we can look ahead, create a string
01435             # which is a concatenation of the first character of each of
01436             # the 4 lines so we can do some very readable comparisons.
01437             while len(lines) < 4:
01438                 try:
01439                     lines.append(diff_lines_iterator.next())
01440                 except StopIteration:
01441                     lines.append('X')
01442             s = ''.join([line[0] for line in lines])
01443             if s.startswith('X'):
01444                 # When no more lines, pump out any remaining blank lines so the
01445                 # corresponding add/delete lines get a matching blank line so
01446                 # all line pairs get yielded at the next level.
01447                 num_blanks_to_yield = num_blanks_pending
01448             elif s.startswith('-?+?'):
01449                 # simple intraline change
01450                 yield _make_line(lines,'?',0), _make_line(lines,'?',1), True
01451                 continue
01452             elif s.startswith('--++'):
01453                 # in delete block, add block coming: we do NOT want to get
01454                 # caught up on blank lines yet, just process the delete line
01455                 num_blanks_pending -= 1
01456                 yield _make_line(lines,'-',0), None, True
01457                 continue
01458             elif s.startswith('--?+') or \
01459                  s.startswith('--+') or \
01460                  s.startswith('- '):
01461                 # in delete block and see an intraline change or unchanged line
01462                 # coming: yield the delete line and then blanks
01463                 from_line,to_line = _make_line(lines,'-',0), None
01464                 num_blanks_to_yield,num_blanks_pending = num_blanks_pending-1,0
01465             elif s.startswith('-+?'):
01466                 # intraline change
01467                 yield _make_line(lines,None,0), _make_line(lines,'?',1), True
01468                 continue
01469             elif s.startswith('-?+'):
01470                 # intraline change
01471                 yield _make_line(lines,'?',0), _make_line(lines,None,1), True
01472                 continue
01473             elif s.startswith('-'):
01474                 # delete FROM line
01475                 num_blanks_pending -= 1
01476                 yield _make_line(lines,'-',0), None, True
01477                 continue
01478             elif s.startswith('+--'):
01479                 # in add block, delete block coming: we do NOT want to get
01480                 # caught up on blank lines yet, just process the add line
01481                 num_blanks_pending += 1
01482                 yield None, _make_line(lines,'+',1), True
01483                 continue
01484             elif s.startswith('+ ') or \
01485                  s.startswith('+-'):
01486                 # will be leaving an add block: yield blanks then add line
01487                 from_line, to_line = None, _make_line(lines,'+',1)
01488                 num_blanks_to_yield,num_blanks_pending = num_blanks_pending+1,0
01489             elif s.startswith('+'):
01490                 # inside an add block, yield the add line
01491                 num_blanks_pending += 1
01492                 yield None, _make_line(lines,'+',1), True
01493                 continue
01494             elif s.startswith(' '):
01495                 # unchanged text, yield it to both sides
01496                 yield _make_line(lines[:],None,0),_make_line(lines,None,1),False
01497                 continue
01498             # Catch up on the blank lines so when we yield the next from/to
01499             # pair, they are lined up.
01500             while(num_blanks_to_yield < 0):
01501                 num_blanks_to_yield += 1
01502                 yield None,('','\n'),True
01503             while(num_blanks_to_yield > 0):
01504                 num_blanks_to_yield -= 1
01505                 yield ('','\n'),None,True
01506             if s.startswith('X'):
01507                 raise StopIteration
01508             else:
01509                 yield from_line,to_line,True
01510 
01511     def _line_pair_iterator():
01512         """Yields from/to lines of text with a change indication.
01513 
01514         This function is an iterator.  It itself pulls lines from the line
01515         iterator.  Its difference from that iterator is that this function
01516         always yields a pair of from/to text lines (with the change
01517         indication).  If necessary it will collect single from/to lines
01518         until it has a matching from/to pair to yield.
01519 
01520         Note, this function is purposefully not defined at the module scope so
01521         that data it needs from its parent function (within whose context it
01522         is defined) does not need to be of module scope.
01523         """
01524         line_iterator = _line_iterator()
01525         fromlines,tolines=[],[]
01526         while True:
01527             # Collecting lines of text until we have a from/to pair
01528             while (len(fromlines)==0 or len(tolines)==0):
01529                 from_line, to_line, found_diff = line_iterator.next()
01530                 if from_line is not None:
01531                     fromlines.append((from_line,found_diff))
01532                 if to_line is not None:
01533                     tolines.append((to_line,found_diff))
01534             # Once we have a pair, remove them from the collection and yield it
01535             from_line, from_diff = fromlines.pop(0)
01536             to_line, to_diff = tolines.pop(0)
01537             yield (from_line,to_line,from_diff or to_diff)
01538 
01539     # Handle case where user does not want context differencing; just yield
01540     # the line pairs without doing anything else with them.
01541     line_pair_iterator = _line_pair_iterator()
01542     if context is None:
01543         while True:
01544             yield line_pair_iterator.next()
01545     # Handle case where user wants context differencing.  We must do some
01546     # storage of lines until we know for sure that they are to be yielded.
01547     else:
01548         context += 1
01549         lines_to_write = 0
01550         while True:
01551             # Store lines up until we find a difference, note use of a
01552             # circular queue because we only need to keep around what
01553             # we need for context.
01554             index, contextLines = 0, [None]*(context)
01555             found_diff = False
01556             while(found_diff is False):
01557                 from_line, to_line, found_diff = line_pair_iterator.next()
01558                 i = index % context
01559                 contextLines[i] = (from_line, to_line, found_diff)
01560                 index += 1
01561             # Yield lines that we have collected so far, but first yield
01562             # the user's separator.
01563             if index > context:
01564                 yield None, None, None
01565                 lines_to_write = context
01566             else:
01567                 lines_to_write = index
01568                 index = 0
01569             while(lines_to_write):
01570                 i = index % context
01571                 index += 1
01572                 yield contextLines[i]
01573                 lines_to_write -= 1
01574             # Now yield the context lines after the change
01575             lines_to_write = context-1
01576             while(lines_to_write):
01577                 from_line, to_line, found_diff = line_pair_iterator.next()
01578                 # If another change within the context, extend the context
01579                 if found_diff:
01580                     lines_to_write = context-1
01581                 else:
01582                     lines_to_write -= 1
01583                 yield from_line, to_line, found_diff
01584 
01585 
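# A minimal sketch of consuming _mdiff directly; each item is
# ((from_num, from_text), (to_num, to_text), has_change) with the '\0'/'\1'
# markers embedded in the texts as documented above:
#
#     for fromdata, todata, changed in _mdiff(['one\n', 'two\n'],
#                                             ['one\n', 'too\n']):
#         print fromdata, todata, changed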
01586 _file_template = """
01587 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
01588           "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
01589 
01590 <html>
01591 
01592 <head>
01593     <meta http-equiv="Content-Type"
01594           content="text/html; charset=ISO-8859-1" />
01595     <title></title>
01596     <style type="text/css">%(styles)s
01597     </style>
01598 </head>
01599 
01600 <body>
01601     %(table)s%(legend)s
01602 </body>
01603 
01604 </html>"""
01605 
01606 _styles = """
01607         table.diff {font-family:Courier; border:medium;}
01608         .diff_header {background-color:#e0e0e0}
01609         td.diff_header {text-align:right}
01610         .diff_next {background-color:#c0c0c0}
01611         .diff_add {background-color:#aaffaa}
01612         .diff_chg {background-color:#ffff77}
01613         .diff_sub {background-color:#ffaaaa}"""
01614 
01615 _table_template = """
01616     <table class="diff" id="difflib_chg_%(prefix)s_top"
01617            cellspacing="0" cellpadding="0" rules="groups" >
01618         <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup>
01619         <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup>
01620         %(header_row)s
01621         <tbody>
01622 %(data_rows)s        </tbody>
01623     </table>"""
01624 
01625 _legend = """
01626     <table class="diff" summary="Legends">
01627         <tr> <th colspan="2"> Legends </th> </tr>
01628         <tr> <td> <table border="" summary="Colors">
01629                       <tr><th> Colors </th> </tr>
01630                       <tr><td class="diff_add">&nbsp;Added&nbsp;</td></tr>
01631                       <tr><td class="diff_chg">Changed</td> </tr>
01632                       <tr><td class="diff_sub">Deleted</td> </tr>
01633                   </table></td>
01634              <td> <table border="" summary="Links">
01635                       <tr><th colspan="2"> Links </th> </tr>
01636                       <tr><td>(f)irst change</td> </tr>
01637                       <tr><td>(n)ext change</td> </tr>
01638                       <tr><td>(t)op</td> </tr>
01639                   </table></td> </tr>
01640     </table>"""
01641 
01642 class HtmlDiff(object):
01643     """For producing HTML side by side comparison with change highlights.
01644 
01645     This class can be used to create an HTML table (or a complete HTML file
01646     containing the table) showing a side by side, line by line comparison
01647     of text with inter-line and intra-line change highlights.  The table can
01648     be generated in either full or contextual difference mode.
01649 
01650     The following methods are provided for HTML generation:
01651 
01652     make_table -- generates HTML for a single side by side table
01653     make_file -- generates complete HTML file with a single side by side table
01654 
01655     See tools/scripts/diff.py for an example usage of this class.
01656     """
01657 
01658     _file_template = _file_template
01659     _styles = _styles
01660     _table_template = _table_template
01661     _legend = _legend
01662     _default_prefix = 0
01663 
01664     def __init__(self,tabsize=8,wrapcolumn=None,linejunk=None,
01665                  charjunk=IS_CHARACTER_JUNK):
01666         """HtmlDiff instance initializer
01667 
01668         Arguments:
01669         tabsize -- tab stop spacing, defaults to 8.
01670         wrapcolumn -- column number where lines are broken and wrapped,
01671             defaults to None where lines are not wrapped.
01672         linejunk,charjunk -- keyword arguments passed into ndiff() (used by
01673             HtmlDiff() to generate the side by side HTML differences).  See
01674             ndiff() documentation for argument default values and descriptions.
01675         """
01676         self._tabsize = tabsize
01677         self._wrapcolumn = wrapcolumn
01678         self._linejunk = linejunk
01679         self._charjunk = charjunk
01680 
01681     def make_file(self,fromlines,tolines,fromdesc='',todesc='',context=False,
01682                   numlines=5):
01683         """Returns HTML file of side by side comparison with change highlights
01684 
01685         Arguments:
01686         fromlines -- list of "from" lines
01687         tolines -- list of "to" lines
01688         fromdesc -- "from" file column header string
01689         todesc -- "to" file column header string
01690         context -- set to True for contextual differences (defaults to False
01691             which shows full differences).
01692         numlines -- number of context lines.  When context is set True,
01693             controls number of lines displayed before and after the change.
01694             When context is False, controls the number of lines to place
01695             the "next" link anchors before the next change (so click of
01696             "next" link jumps to just before the change).
01697         """
01698 
01699         return self._file_template % dict(
01700             styles = self._styles,
01701             legend = self._legend,
01702             table = self.make_table(fromlines,tolines,fromdesc,todesc,
01703                                     context=context,numlines=numlines))
01704 
01705     def _tab_newline_replace(self,fromlines,tolines):
01706         """Returns from/to line lists with tabs expanded and newlines removed.
01707 
01708         Instead of tab characters being replaced by the number of spaces
01709         needed to fill in to the next tab stop, this function will fill
01710         the space with tab characters.  This is done so that the difference
01711         algorithms can identify changes in a file when tabs are replaced by
01712         spaces and vice versa.  At the end of the HTML generation, the tab
01713         characters will be replaced with a nonbreakable space.
01714         """
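        # A sketch of the round trip for tabsize=8:  ' a\tb' -> '\0a\tb'
        # (space hidden) -> '\0a      b' (tab expanded) -> '\0a\t\t\t\t\t\tb'
        # (expanded spaces become tabs) -> ' a\t\t\t\t\t\tb' (real space
        # restored, trailing newline stripped).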
01715         def expand_tabs(line):
01716             # hide real spaces
01717             line = line.replace(' ','\0')
01718             # expand tabs into spaces
01719             line = line.expandtabs(self._tabsize)
01720             # replace spaces from expanded tabs back into tab characters
01721             # (we'll replace them with markup after we do differencing)
01722             line = line.replace(' ','\t')
01723             return line.replace('\0',' ').rstrip('\n')
01724         fromlines = [expand_tabs(line) for line in fromlines]
01725         tolines = [expand_tabs(line) for line in tolines]
01726         return fromlines,tolines
01727 
01728     def _split_line(self,data_list,line_num,text):
01729         """Builds list of text lines by splitting text lines at wrap point
01730 
01731         This function will determine if the input text line needs to be
01732         wrapped (split) into separate lines.  If so, the first wrap point
01733         will be determined and the first line appended to the output
01734         text line list.  This function is used recursively to handle
01735         the second part of the split line to further split it.
01736         """
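        # A sketch with a hypothetical wrapcolumn of 4: the marked-up line
        # 'ab\0+cdef\1' splits into (line_num, 'ab\0+cd\1') plus a recursive
        # call on '\0+ef\1', whose line number is the '>' continuation marker.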
01737         # if blank line or context separator, just add it to the output list
01738         if not line_num:
01739             data_list.append((line_num,text))
01740             return
01741 
01742         # if line text doesn't need wrapping, just add it to the output list
01743         size = len(text)
01744         max = self._wrapcolumn
01745         if (size <= max) or ((size -(text.count('\0')*3)) <= max):
01746             data_list.append((line_num,text))
01747             return
01748 
01749         # scan text looking for the wrap point, keeping track if the wrap
01750         # point is inside markers
01751         i = 0
01752         n = 0
01753         mark = ''
01754         while n < max and i < size:
01755             if text[i] == '\0':
01756                 i += 1
01757                 mark = text[i]
01758                 i += 1
01759             elif text[i] == '\1':
01760                 i += 1
01761                 mark = ''
01762             else:
01763                 i += 1
01764                 n += 1
01765 
01766         # wrap point is inside text, break it up into separate lines
01767         line1 = text[:i]
01768         line2 = text[i:]
01769 
01770         # if wrap point is inside markers, place end marker at end of first
01771         # line and start marker at beginning of second line because each
01772         # line will have its own table tag markup around it.
01773         if mark:
01774             line1 = line1 + '\1'
01775             line2 = '\0' + mark + line2
01776 
01777         # tack the first line onto the output list
01778         data_list.append((line_num,line1))
01779 
01780         # use this routine again to wrap the remaining text
01781         self._split_line(data_list,'>',line2)
01782 
01783     def _line_wrapper(self,diffs):
01784         """Returns iterator that splits (wraps) mdiff text lines"""
01785 
01786         # pull from/to data and flags from mdiff iterator
01787         for fromdata,todata,flag in diffs:
01788             # check for context separators and pass them through
01789             if flag is None:
01790                 yield fromdata,todata,flag
01791                 continue
01792             (fromline,fromtext),(toline,totext) = fromdata,todata
01793             # for each from/to line split it at the wrap column to form
01794             # list of text lines.
01795             fromlist,tolist = [],[]
01796             self._split_line(fromlist,fromline,fromtext)
01797             self._split_line(tolist,toline,totext)
01798             # yield from/to line in pairs inserting blank lines as
01799             # necessary when one side has more wrapped lines
01800             while fromlist or tolist:
01801                 if fromlist:
01802                     fromdata = fromlist.pop(0)
01803                 else:
01804                     fromdata = ('',' ')
01805                 if tolist:
01806                     todata = tolist.pop(0)
01807                 else:
01808                     todata = ('',' ')
01809                 yield fromdata,todata,flag
01810 
01811     def _collect_lines(self,diffs):
01812         """Collects mdiff output into separate lists
01813 
01814         Before storing the mdiff from/to data into a list, it is converted
01815         into a single line of text with HTML markup.
01816         """
01817 
01818         fromlist,tolist,flaglist = [],[],[]
01819         # pull from/to data and flags from mdiff style iterator
01820         for fromdata,todata,flag in diffs:
01821             try:
01822                 # store HTML markup of the lines into the lists
01823                 fromlist.append(self._format_line(0,flag,*fromdata))
01824                 tolist.append(self._format_line(1,flag,*todata))
01825             except TypeError:
01826                 # exceptions occur for lines where context separators go
01827                 fromlist.append(None)
01828                 tolist.append(None)
01829             flaglist.append(flag)
01830         return fromlist,tolist,flaglist
01831 
01832     def _format_line(self,side,flag,linenum,text):
01833         """Returns HTML markup of "from" / "to" text lines
01834 
01835         side -- 0 or 1 indicating "from" or "to" text
01836         flag -- indicates if difference on line
01837         linenum -- line number (used for line number column)
01838         text -- line text to be marked up
01839         """
01840         try:
01841             linenum = '%d' % linenum
01842             id = ' id="%s%s"' % (self._prefix[side],linenum)
01843         except TypeError:
01844             # handle blank lines where linenum is '>' or ''
01845             id = ''
01846         # replace those things that would get confused with HTML symbols
01847         text=text.replace("&","&amp;").replace(">","&gt;").replace("<","&lt;")
01848 
01849         # make spaces non-breakable so they don't get compressed or line wrapped
01850         text = text.replace(' ','&nbsp;').rstrip()
01851 
01852         return '<td class="diff_header"%s>%s</td><td nowrap="nowrap">%s</td>' \
01853                % (id,linenum,text)
01854 
01855     def _make_prefix(self):
01856         """Create unique anchor prefixes"""
01857 
01858         # Generate a unique anchor prefix so multiple tables
01859         # can exist on the same HTML page without conflicts.
01860         fromprefix = "from%d_" % HtmlDiff._default_prefix
01861         toprefix = "to%d_" % HtmlDiff._default_prefix
01862         HtmlDiff._default_prefix += 1
01863         # store prefixes so line format method has access
01864         self._prefix = [fromprefix,toprefix]
01865 
01866     def _convert_flags(self,fromlist,tolist,flaglist,context,numlines):
01867         """Makes list of "next" links"""
01868 
01869         # all anchor names will be generated using the unique "to" prefix
01870         toprefix = self._prefix[1]
01871 
01872         # process change flags, generating middle column of next anchors/links
01873         next_id = ['']*len(flaglist)
01874         next_href = ['']*len(flaglist)
01875         num_chg, in_change = 0, False
01876         last = 0
01877         for i,flag in enumerate(flaglist):
01878             if flag:
01879                 if not in_change:
01880                     in_change = True
01881                     last = i
01882                     # at the beginning of a change, drop an anchor a few lines
01883                     # (the context lines) before the change for the previous
01884                     # link
01885                     i = max(0, i-numlines)
01886                     next_id[i] = ' id="difflib_chg_%s_%d"' % (toprefix,num_chg)
01887                     # at the beginning of a change, drop a link to the next
01888                     # change
01889                     num_chg += 1
01890                     next_href[last] = '<a href="#difflib_chg_%s_%d">n</a>' % (
01891                          toprefix,num_chg)
01892             else:
01893                 in_change = False
01894         # check for cases where there is no content to avoid exceptions
01895         if not flaglist:
01896             flaglist = [False]
01897             next_id = ['']
01898             next_href = ['']
01899             last = 0
01900             if context:
01901                 fromlist = ['<td></td><td>&nbsp;No Differences Found&nbsp;</td>']
01902                 tolist = fromlist
01903             else:
01904                 fromlist = tolist = ['<td></td><td>&nbsp;Empty File&nbsp;</td>']
01905         # if not a change on first line, drop a link
01906         if not flaglist[0]:
01907             next_href[0] = '<a href="#difflib_chg_%s_0">f</a>' % toprefix
01908         # redo the last link to link to the top
01909         next_href[last] = '<a href="#difflib_chg_%s_top">t</a>' % (toprefix)
01910 
01911         return fromlist,tolist,flaglist,next_href,next_id
01912 
01913     def make_table(self,fromlines,tolines,fromdesc='',todesc='',context=False,
01914                    numlines=5):
01915         """Returns HTML table of side by side comparison with change highlights
01916 
01917         Arguments:
01918         fromlines -- list of "from" lines
01919         tolines -- list of "to" lines
01920         fromdesc -- "from" file column header string
01921         todesc -- "to" file column header string
01922         context -- set to True for contextual differences (defaults to False
01923             which shows full differences).
01924         numlines -- number of context lines.  When context is set True,
01925             controls number of lines displayed before and after the change.
01926             When context is False, controls the number of lines to place
01927             the "next" link anchors before the next change (so click of
01928             "next" link jumps to just before the change).
01929         """
01930 
01931         # make unique anchor prefixes so that multiple tables may exist
01932         # on the same page without conflict.
01933         self._make_prefix()
01934 
01935         # change tabs to spaces before it gets more difficult after we insert
01936         # markup
01937         fromlines,tolines = self._tab_newline_replace(fromlines,tolines)
01938 
01939         # create diffs iterator which generates side by side from/to data
01940         if context:
01941             context_lines = numlines
01942         else:
01943             context_lines = None
01944         diffs = _mdiff(fromlines,tolines,context_lines,linejunk=self._linejunk,
01945                       charjunk=self._charjunk)
01946 
01947         # set up iterator to wrap lines that exceed desired width
01948         if self._wrapcolumn:
01949             diffs = self._line_wrapper(diffs)
01950 
01951         # collect up from/to lines and flags into lists (also format the lines)
01952         fromlist,tolist,flaglist = self._collect_lines(diffs)
01953 
01954         # process change flags, generating middle column of next anchors/links
01955         fromlist,tolist,flaglist,next_href,next_id = self._convert_flags(
01956             fromlist,tolist,flaglist,context,numlines)
01957 
01958         s = []
01959         fmt = '            <tr><td class="diff_next"%s>%s</td>%s' + \
01960               '<td class="diff_next">%s</td>%s</tr>\n'
01961         for i in range(len(flaglist)):
01962             if flaglist[i] is None:
01963                 # mdiff yields None on separator lines; skip the bogus ones
01964                 # generated for the first line
01965                 if i > 0:
01966                     s.append('        </tbody>        \n        <tbody>\n')
01967             else:
01968                 s.append( fmt % (next_id[i],next_href[i],fromlist[i],
01969                                            next_href[i],tolist[i]))
01970         if fromdesc or todesc:
01971             header_row = '<thead><tr>%s%s%s%s</tr></thead>' % (
01972                 '<th class="diff_next"><br /></th>',
01973                 '<th colspan="2" class="diff_header">%s</th>' % fromdesc,
01974                 '<th class="diff_next"><br /></th>',
01975                 '<th colspan="2" class="diff_header">%s</th>' % todesc)
01976         else:
01977             header_row = ''
01978 
01979         table = self._table_template % dict(
01980             data_rows=''.join(s),
01981             header_row=header_row,
01982             prefix=self._prefix[1])
01983 
01984         return table.replace('\0+','<span class="diff_add">'). \
01985                      replace('\0-','<span class="diff_sub">'). \
01986                      replace('\0^','<span class="diff_chg">'). \
01987                      replace('\1','</span>'). \
01988                      replace('\t','&nbsp;')
01989 
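# A minimal sketch of driving HtmlDiff; the description strings are
# placeholder column headers and make_file returns a complete XHTML page:
#
#     d = HtmlDiff(wrapcolumn=70)
#     page = d.make_file('one\ntwo\n'.splitlines(1),
#                        'one\ntoo\n'.splitlines(1),
#                        'before.txt', 'after.txt',
#                        context=True, numlines=2)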
01990 del re
01991 
01992 def restore(delta, which):
01993     r"""
01994     Generate one of the two sequences that generated a delta.
01995 
01996     Given a `delta` produced by `Differ.compare()` or `ndiff()`, extract
01997     lines originating from file 1 or 2 (parameter `which`), stripping off line
01998     prefixes.
01999 
02000     Examples:
02001 
02002     >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
02003     ...              'ore\ntree\nemu\n'.splitlines(1))
02004     >>> diff = list(diff)
02005     >>> print ''.join(restore(diff, 1)),
02006     one
02007     two
02008     three
02009     >>> print ''.join(restore(diff, 2)),
02010     ore
02011     tree
02012     emu
02013     """
02014     try:
02015         tag = {1: "- ", 2: "+ "}[int(which)]
02016     except KeyError:
02017         raise ValueError, ('unknown delta choice (must be 1 or 2): %r'
02018                            % which)
02019     prefixes = ("  ", tag)
02020     for line in delta:
02021         if line[:2] in prefixes:
02022             yield line[2:]
02023 
02024 def _test():
02025     import doctest, difflib
02026     return doctest.testmod(difflib)
02027 
02028 if __name__ == "__main__":
02029     _test()