Back to index

plone3  3.1.7
htmldiff.py
Go to the documentation of this file.
00001 """
00002 htmldiff.py
00003 (C) Ian Bicking <ianb@colorstudy.com>
00004 
00005 Finds the differences between two HTML files.  *Not* line-by-line
00006 comparison (more word-by-word).
00007 
00008 Command-line usage:
00009   ./htmldiff.py test1.html test2.html
00010 
00011 Better results if you use mxTidy first.  The output is HTML.
00012 """
00013 
00014 from difflib import SequenceMatcher
00015 import re
00016 from StringIO import StringIO
00017 import cgi
00018 
def htmlEncode(s, esc=None):
    """
    Escape ``s`` for safe inclusion in HTML, double quotes included.

    ``esc`` is an optional callable taking ``(text, quote)``; it is always
    called with a true ``quote`` flag.  When it is omitted, ``cgi.escape``
    is used if the running interpreter still provides it, otherwise
    ``html.escape`` (which additionally escapes single quotes).

    Returns whatever ``esc`` returns.
    """
    if esc is None:
        # Resolved lazily rather than as a default argument: a default is
        # evaluated when the function is defined, so a missing
        # ``cgi.escape`` would break the whole module on newer interpreters.
        try:
            from cgi import escape as esc
        except ImportError:
            from html import escape as esc
    return esc(s, 1)
00021 
# Patterns used to tokenise HTML.  All of them use re.S so that a comment
# or tag spanning several lines is still matched as a single unit, and they
# are written as raw strings so that backslash escapes reach the regex
# engine untouched.

# An HTML comment, matched non-greedily up to the first closing marker.
commentRE = re.compile(r'<!--.*?-->', re.S)
# Anything from '<' up to the next '>'.
tagRE = re.compile(r'<.*?>', re.S)
# An attribute-less opening <head> tag, case-insensitive, optional spaces.
headRE = re.compile(r'<\s*head\s*>', re.S | re.I)
00025 
class HTMLMatcher(SequenceMatcher):
    """
    Word-level differ for HTML source.

    Each input string is tokenised by ``splitHTML`` into a flat list where
    every tag is one item and every whitespace-separated word of text is
    one item; the inherited ``SequenceMatcher`` machinery compares those
    lists.  ``htmlDiff`` renders the comparison as HTML.  The ``start*``,
    ``end*`` and ``format*`` methods are the markup hooks that subclasses
    override.
    """

    def __init__(self, source1, source2):
        # No junk predicate (None).  The two sources are stored through
        # set_seq1/set_seq2, overridden below to tokenise the raw strings.
        SequenceMatcher.__init__(self, None, source1, source2)

    def set_seq1(self, a):
        """Tokenise ``a`` and store it as the first sequence."""
        tokens = self.splitHTML(a)
        SequenceMatcher.set_seq1(self, tokens)

    def set_seq2(self, b):
        """Tokenise ``b`` and store it as the second sequence."""
        tokens = self.splitHTML(b)
        SequenceMatcher.set_seq2(self, tokens)

    def splitTags(self, t):
        """
        Split ``t`` into alternating text and tag pieces.

        The returned list starts and ends with a (possibly empty) text
        piece; every ``tagRE`` match sits between two text pieces.
        """
        pieces = []
        cursor = 0
        for found in tagRE.finditer(t):
            pieces.append(t[cursor:found.start()])
            pieces.append(found.group(0))
            cursor = found.end()
        # Whatever follows the last tag (the whole string if no tag matched).
        pieces.append(t[cursor:])
        return pieces

    def splitWords(self, t):
        """Return the whitespace-separated words of ``t``."""
        trimmed = t.strip()
        return trimmed.split()

    def splitHTML(self, t):
        """
        Tokenise HTML: comments are removed, tags are kept whole and the
        text between them is broken into words.
        """
        tokens = []
        for piece in self.splitTags(commentRE.sub('', t)):
            if not piece.startswith('<'):
                tokens.extend(self.splitWords(piece))
                continue
            tokens.append(piece)
        return tokens

    def htmlDiff(self, addStylesheet=False):
        """
        Render the difference between the two sequences as an HTML string.

        Unchanged items are copied through, each followed by a space;
        removed and added runs are handed to ``textDelete`` and
        ``textInsert``.  When ``addStylesheet`` is true, the CSS from
        ``stylesheet()`` is embedded into the result.
        """
        old, new = self.a, self.b
        buf = StringIO()
        for op, oldLo, oldHi, newLo, newHi in self.get_opcodes():
            if op == 'equal':
                for token in old[oldLo:oldHi]:
                    buf.write(token + ' ')
                continue
            # A 'replace' is rendered as a deletion followed by an insertion.
            if op in ('delete', 'replace'):
                self.textDelete(old[oldLo:oldHi], buf)
            if op in ('insert', 'replace'):
                self.textInsert(new[newLo:newHi], buf)
        rendered = buf.getvalue()
        buf.close()
        if not addStylesheet:
            return rendered
        return self.addStylesheet(rendered, self.stylesheet())

    def textDelete(self, lst, out):
        """
        Write the removed items ``lst`` to ``out``.

        Consecutive words are wrapped in one delete span.  A removed tag
        closes any open span and is written only as its
        ``formatDeleteTag`` marker; the tag itself is not emitted.
        """
        spanOpen = False
        for token in lst:
            if token.startswith('<'):
                if spanOpen:
                    out.write(self.endDeleteText())
                    spanOpen = False
                out.write(self.formatDeleteTag(token))
                continue
            if not spanOpen:
                out.write(self.startDeleteText())
                spanOpen = True
            out.write(token)
            out.write(' ')
        if spanOpen:
            out.write(self.endDeleteText())

    def textInsert(self, lst, out):
        """
        Write the added items ``lst`` to ``out``.

        Consecutive words are wrapped in one insert span.  An added tag
        closes any open span, is announced with its ``formatInsertTag``
        marker and is then emitted itself, so it takes effect in the
        output.
        """
        spanOpen = False
        for token in lst:
            if not token.startswith('<'):
                if not spanOpen:
                    out.write(self.startInsertText())
                    spanOpen = True
            else:
                if spanOpen:
                    out.write(self.endInsertText())
                    spanOpen = False
                out.write(self.formatInsertTag(token))
            # Words and tags alike are echoed, followed by a space.
            out.write(token)
            out.write(' ')
        if spanOpen:
            out.write(self.endInsertText())

    def stylesheet(self):
        """Return the CSS rules for the classes used in the diff markup."""
        return '''
.insert { background-color: #aaffaa }
.delete { background-color: #ff8888 }
.tagInsert { background-color: #007700; color: #ffffff }
.tagDelete { background-color: #770000; color: #ffffff }
'''

    def addStylesheet(self, html, ss):
        """
        Return ``html`` with the CSS text ``ss`` embedded in a <style>
        element placed right after the first ``headRE`` match, or at the
        very start when there is no match.
        """
        found = headRE.search(html)
        pos = 0
        if found:
            pos = found.end()
        before, after = html[:pos], html[pos:]
        return ('%s<style type="text/css"><!--\n%s\n--></style>%s'
                % (before, ss, after))

    # -- markup hooks, overridden by subclasses -------------------------

    def startInsertText(self):
        """Return the markup that opens a run of inserted words."""
        return '<span class="insert">'

    def endInsertText(self):
        """Return the markup that closes a run of inserted words."""
        return '</span> '

    def startDeleteText(self):
        """Return the markup that opens a run of deleted words."""
        return '<span class="delete">'

    def endDeleteText(self):
        """Return the markup that closes a run of deleted words."""
        return '</span> '

    def formatInsertTag(self, tag):
        """Return a visible, HTML-escaped marker for an inserted tag."""
        escaped = htmlEncode(tag)
        return '<span class="tagInsert">insert: <tt>%s</tt></span> ' % escaped

    def formatDeleteTag(self, tag):
        """Return a visible, HTML-escaped marker for a deleted tag."""
        escaped = htmlEncode(tag)
        return '<span class="tagDelete">delete: <tt>%s</tt></span> ' % escaped
00150 
class NoTagHTMLMatcher(HTMLMatcher):
    """
    HTMLMatcher variant whose markers for inserted and deleted tags are
    empty strings.
    """

    def formatInsertTag(self, tag):
        """Return an empty marker for an inserted tag."""
        return ""

    def formatDeleteTag(self, tag):
        """Return an empty marker for a deleted tag."""
        return ""
00156 
def htmldiff(source1, source2, addStylesheet=False):
    """
    Compare two HTML strings with HTMLMatcher and return the diff as HTML.

    ``addStylesheet`` is passed through to ``HTMLMatcher.htmlDiff``.

        >>> htmldiff('test1', 'test2')
        '<span class="delete">test1 </span> <span class="insert">test2 </span> '
        >>> htmldiff('test1', 'test1')
        'test1 '
        >>> htmldiff('<b>test1</b>', '<i>test1</i>')
        '<span class="tagDelete">delete: <tt>&lt;b&gt;</tt></span> <span class="tagInsert">insert: <tt>&lt;i&gt;</tt></span> <i> test1 <span class="tagDelete">delete: <tt>&lt;/b&gt;</tt></span> <span class="tagInsert">insert: <tt>&lt;/i&gt;</tt></span> </i> '
    """
    matcher = HTMLMatcher(source1, source2)
    rendered = matcher.htmlDiff(addStylesheet)
    return rendered
00170 
def diffFiles(f1, f2):
    """
    Read the files at paths ``f1`` and ``f2`` and return their HTML diff,
    with the stylesheet embedded.

    Errors raised by ``open`` or ``read`` propagate to the caller.
    """
    sources = []
    for path in (f1, f2):
        handle = open(path)
        # Close each file explicitly instead of leaving it to garbage
        # collection; try/finally keeps this usable on old interpreters.
        try:
            sources.append(handle.read())
        finally:
            handle.close()
    return htmldiff(sources[0], sources[1], True)
00175 
class SimpleHTMLMatcher(HTMLMatcher):
    """
    HTMLMatcher variant with a compact plain-text notation: removed
    material is written as ``-[...]`` and added material as ``+[...]``.
    """

    def startInsertText(self):
        """Return the opener for a run of inserted words."""
        return '+['

    def endInsertText(self):
        """Return the closer for a run of inserted words."""
        return ']'

    def startDeleteText(self):
        """Return the opener for a run of deleted words."""
        return '-['

    def endDeleteText(self):
        """Return the closer for a run of deleted words."""
        return ']'

    def formatInsertTag(self, tag):
        """Return the marker for an inserted tag (the tag is not escaped)."""
        return '+[%s]' % (tag,)

    def formatDeleteTag(self, tag):
        """Return the marker for a deleted tag (the tag is not escaped)."""
        return '-[%s]' % (tag,)
00192 
def simplehtmldiff(source1, source2):
    """
    Compare two HTML strings with SimpleHTMLMatcher and return the diff in
    its compact ``-[...]`` / ``+[...]`` notation; mostly useful for tests:

        >>> simplehtmldiff('test1', 'test2')
        '-[test1 ]+[test2 ]'
        >>> simplehtmldiff('<b>Hello world!</b>', '<i>Hello you!</i>')
        '-[<b>]+[<i>]<i> Hello -[world! ]-[</b>]+[you! ]+[</i>]</i> '
    """
    matcher = SimpleHTMLMatcher(source1, source2)
    rendered = matcher.htmlDiff()
    return rendered
00204 
class TextMatcher(HTMLMatcher):
    """
    Line-by-line differ for plain text, rendered as HTML.

    The inputs are split on newlines instead of being tokenised as HTML,
    and every line is HTML-escaped when it is written out.
    """

    def set_seq1(self, a):
        """Store the lines of ``a`` as the first sequence."""
        lines = a.split('\n')
        SequenceMatcher.set_seq1(self, lines)

    def set_seq2(self, b):
        """Store the lines of ``b`` as the second sequence."""
        lines = b.split('\n')
        SequenceMatcher.set_seq2(self, lines)

    def htmlDiff(self, addStylesheet=False):
        """
        Render the line diff as an HTML string.

        Unchanged lines are written as they are; removed and added runs of
        lines are wrapped in the delete / insert markup.  When
        ``addStylesheet`` is true, the CSS from ``stylesheet()`` is
        embedded into the result.
        """
        old, new = self.a, self.b
        buf = StringIO()
        for op, oldLo, oldHi, newLo, newHi in self.get_opcodes():
            if op == 'equal':
                self.writeLines(old[oldLo:oldHi], buf)
                continue
            # A 'replace' is rendered as a deletion followed by an insertion.
            if op in ('delete', 'replace'):
                buf.write(self.startDeleteText())
                self.writeLines(old[oldLo:oldHi], buf)
                buf.write(self.endDeleteText())
            if op in ('insert', 'replace'):
                buf.write(self.startInsertText())
                self.writeLines(new[newLo:newHi], buf)
                buf.write(self.endInsertText())
        rendered = buf.getvalue()
        buf.close()
        if not addStylesheet:
            return rendered
        return self.addStylesheet(rendered, self.stylesheet())

    def writeLines(self, lines, out):
        """
        Write each line to ``out`` as ``<tt>...</tt><br>``, HTML-escaped,
        with non-breaking spaces substituted so that indentation and runs
        of spaces stay visible in a browser.
        """
        for raw in lines:
            text = htmlEncode(raw).replace('  ', '&nbsp; ')
            text = text.replace('\t', '&nbsp; &nbsp; &nbsp; &nbsp; ')
            # A single leading space would otherwise be collapsed.
            if text[:1] == ' ':
                text = '&nbsp;' + text[1:]
            out.write('<tt>%s</tt><br>\n' % text)
00244 
# Command-line entry point:
#   htmldiff.py file1 file2   print the HTML diff of the two files
#   htmldiff.py test          run the module's doctests
if __name__ == '__main__':
    import sys
    args = sys.argv[1:]
    if args == ['test']:
        import doctest
        doctest.testmod()
    elif len(args) < 2:
        # Covers both "no arguments" and a single file name; the latter
        # used to fall through and fail with an IndexError on sys.argv[2].
        # Each print takes one parenthesised argument so the statement
        # means the same on Python 2 and Python 3.
        print("Usage: %s file1 file2" % sys.argv[0])
        print("or to test: %s test" % sys.argv[0])
    else:
        print(diffFiles(args[0], args[1]))
00255