| #!/usr/bin/python |
| # -*- coding: utf-8 -*- |
| """ $Id: htmldiff,v 1.62 2016/10/06 10:46:19 dom Exp $ |
| """ |
| |
| import atexit |
| import cgi |
| import http_auth |
| import httplib |
| import os |
| import re |
| import surbl |
| import sys |
| import tempfile |
| import tidy |
| import urlparse |
| |
| from subprocess import Popen, PIPE |
| |
| CONTENT_TYPE = "text/html;charset=utf-8" |
| |
| Page = """ |
| <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> |
| <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en-US"> |
| <head><title>HTML Diff service</title> |
| <link rel="stylesheet" href="http://www.w3.org/StyleSheets/base" /> |
| </head> |
| <body> |
| |
| <p><a href="http://www.w3.org/"><img src="http://www.w3.org/Icons/w3c_home" alt="W3C"/></a> <a href="http://www.w3.org/2003/Editors">W3C Editors homepage</a></p> |
| |
| <h1>Create Diff between HTML pages</h1> |
| """ |
| Page2 = """ |
<form method="get">
| <p>Address of reference document: <input name="doc1" type="url" value="%s" style="width:100%%"/></p> |
<p>Address of new document: <input name="doc2" type="url" value="%s" style="width:100%%"/></p>
| <p><input type="submit" value="get Diff"/></p> |
| </form> |
| |
<p><strong>Tip</strong>: if the document follows the W3C convention for linking to its previous version, you can specify only the address of the new document; the link to the previous version will be detected automatically.</p>
| <h2>Diff markings</h2> |
<p>This service relies on <a href="https://www.gnu.org/software/diffutils/">GNU diff</a>. The differences found are marked roughly as follows:</p>
<ul>
<li>deleted text is shown in pink with down arrows (as styled for a &lt;del&gt; element),</li>
<li>replaced text is shown in green with bi-directional arrows,</li>
<li>newly inserted text is shown in yellow with up arrows (as styled for an &lt;ins&gt; element).</li>
</ul>
| <address> |
| script $Revision: 1.62 $ of $Date: 2016/10/06 10:46:19 $<br /> |
by <a href="http://www.w3.org/People/Dom/">Dominique Hazaël-Massieux</a><br />based on <a href="https://dev.w3.org/cvsweb/2009/htmldiff/htmldiff.pl">Shane McCarron’s Perl script</a> wrapped in a <a href="http://dev.w3.org/cvsweb/2009/htmldiff/">Python CGI</a>
| </address> |
| </body> |
| </html> |
| """ |
| |
| def checkInputUrl(url): |
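    """Refuse file: URLs, URLs without a proper scheme, and URLs listed
    in the SURBL blacklist; responds with a 403 and exits on rejection."""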
    checker = surbl.SurblChecker(
        '/usr/local/share/surbl/two-level-tlds',
        '/afs/w3.org/pub/WWW/Systems/Server/debian/generic/usr/local/etc/surbl.whitelist')
| |
    # reject file: URLs and URLs whose scheme is missing or malformed
    if url.startswith('file:') or len(urlparse.urlparse(url)[0]) < 2:
        print "Status: 403"
        print "Content-Type: text/plain; charset=utf-8"
| print |
| print "sorry, I decline to handle file: addresses" |
| sys.exit() |
| elif checker.isMarkedAsSpam(url): |
| print "Status: 403" |
| print "Content-Type: text/plain; charset=utf-8" |
| print |
| print "sorry, this URL matches a record known in SURBL. See http://www.surbl.org/" |
| sys.exit() |
| |
| def copyHeader(copy_func, source, key, header_name=None): |
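    """Copy the header `key` from `source` via `copy_func`, optionally
    renaming it to `header_name`. Returns True if the header was present."""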
| value = source.get(key) |
| if not value: |
| return False |
| elif header_name is None: |
| header_name = key |
| copy_func(header_name, value) |
| return True |
| |
| def setupRequest(source_headers): |
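    """Build a URL opener that forwards the client's If-Modified-Since
    header and IP address along with the outgoing request."""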
| opener = http_auth.ProxyAuthURLopener() |
| copyHeader(opener.addheader, source_headers, 'If-Modified-Since') |
| copyHeader(opener.addheader, os.environ, 'REMOTE_ADDR', 'X_Forward_IP_Addr') |
| return opener |
| |
| def tidyFile(file): |
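    """Run HTML Tidy over an open file and return a tuple of (tidied
    temporary file, list of tidy errors). Documents with an HTML5
    doctype, and documents that fail a plain parse, are parsed with
    extra options declaring the HTML5/MathML/SVG vocabulary to Tidy."""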
    # default options for HTML Tidy
| options = dict(tidy_mark=0,show_warnings=0,quiet=1,char_encoding='utf8') |
    # sniff the first 4 KB for an HTML5 doctype
    html5 = re.search(r"<!doctype\s+html\s*>", file.read(4096),
                      re.IGNORECASE)
    file.seek(0)
| html5_options = {"add_xml_space": "no", |
| "output_xhtml": "no", |
| "tidy_mark": "no", |
| "new_blocklevel_tags": 'article,aside,canvas,dialog,details,figcaption,figure,footer,header,hgroup,menu,nav,section,main,summary,math,semantics,mrow,mfenced,mtable,mtr,mtd,mi,mn,msub,mo,mfrac,munderover,mtext,svg,g,image,rect,text,desc,line,path,polygon,ellipse,tspan,defs,feoffset,fecolormatrix,filter,fegaussianblur,feblend,marker,circle', |
| "new_inline_tags": 'video,audio,canvas,ruby,rt,rp,time,meter,progress,track,source,emu-val,emu-nt,emu-t,mark', |
| "break_before_br": "no", |
| "vertical_space": "no", |
| "enclose_text": "no", |
| "numeric_entities": "yes", |
| "wrap": "1000", |
| "wrap_attributes": "no", |
| "drop_empty_paras": "no" |
| } |
| if html5: |
| options.update(html5_options) |
| newtidy = tidy.parseString(file.read(), **options) |
    if len(newtidy.errors) > 0:
        # the plain parse failed; retry once with the HTML5 options
        if not html5:
| file.seek(0) |
| options.update(html5_options) |
| newtidy = tidy.parseString(file.read(), **options) |
| file.close() |
| file = tempfile.NamedTemporaryFile( |
| mode='w+', prefix='htmldiff-', suffix='.html') |
| atexit.register(file.close) |
| file.write(str(newtidy)) |
| file.flush() |
| file.seek(0) |
| return (file, newtidy.errors) |
| |
| def matchPredecessorRel(rel): |
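    """Filter for BeautifulSoup: true if a rel attribute contains the
    predecessor-version link relation."""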
    # rel is a whitespace-separated list of link relation tokens
    return rel and "predecessor-version" in rel.lower().split()
| |
| def mirrorURL(url, opener): |
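    """Fetch `url` into a local temporary file, transparently decompress
    gzip responses, and tidy the result. Returns (file, headers) on
    success, or (None, {}) with the failure recorded in opener.error."""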
| try: |
| filename, headers = opener.retrieve(url) |
| except IOError, error: |
| opener.error = "I/O error: %s %s" % (error.errno, error.strerror) |
| except httplib.InvalidURL: |
| opener.error = "Invalid URL submitted" |
| except AttributeError: # ProxyAuthURLopener returned None. |
| pass # There's already an error set. |
| else: |
| atexit.register(os.unlink, filename) |
| file = open(filename) |
        if headers.get("content-encoding") == "gzip":
| import gzip |
| from StringIO import StringIO |
| data = StringIO(file.read()) |
| file.close() |
| file = gzip.GzipFile(fileobj=data) |
        file, errors = tidyFile(file)
| if len(errors) == 0: |
| return (file, headers) |
| else: |
| opener.error = "Tidy errors: %s" % (str(errors)) |
| return (None, {}) |
| |
| def showPage(url1='', url2='', error_html='', **headers): |
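    """Print the HTTP headers given as keyword arguments (underscores
    become dashes), then the form page with any error message, and exit."""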
| for name, value in headers.items(): |
| print "%s: %s" % (name.replace('_', '-'), value) |
| print |
| print Page |
| print error_html |
| print Page2 % (url1, url2) |
| sys.exit() |
| |
| def serveRequest(): |
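    """Main CGI entry point: fetch both documents (locating the previous
    version automatically when doc1 is omitted), run htmldiff over the
    tidied copies, and stream the result to the client."""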
| fields = cgi.FieldStorage() |
| |
| if (not fields.has_key('doc2')): |
| showPage(Content_Type=CONTENT_TYPE) |
| # if doc1 is not specified, we load doc2 to check if it has a previous version link |
| doc2 = fields['doc2'].value |
| checkInputUrl(doc2) |
| url_opener2 = setupRequest(fields.headers) |
| newdoc, newheaders = mirrorURL(doc2, url_opener2) |
| if fields.has_key('doc1'): |
| doc1 = fields['doc1'].value |
| elif newdoc is not None: |
| from BeautifulSoup import BeautifulSoup |
| |
| soup = BeautifulSoup(newdoc.read()) |
| newdoc.seek(0) |
        # look for a link following a "Previous Version" label, then fall
        # back on an explicit rel="predecessor-version" link
        try:
            doc1 = soup.find(text=re.compile("Previous Version", re.IGNORECASE)).findNext(name="a", attrs={"href": True})["href"]
        except (AttributeError, TypeError, KeyError):
            try:
                doc1 = soup.find(name=["a", "link"], attrs={"href": True, "rel": matchPredecessorRel})["href"]
            except (AttributeError, TypeError, KeyError):
                doc1 = None
| else: |
| doc1 = None |
| if (not doc1): |
| showPage(Content_Type=CONTENT_TYPE) |
| |
| checkInputUrl(doc1) |
| esc1 = cgi.escape(doc1, True) |
| esc2 = cgi.escape(doc2, True) |
| urlcomponents1 = urlparse.urlparse(doc1) |
| urlcomponents2 = urlparse.urlparse(doc2) |
| # if same domain, we can use the same urlopener |
| # otherwise, we create a separate one |
| if urlcomponents2[1] == urlcomponents1[1]: |
| url_opener = url_opener2 |
| else: |
| url_opener = setupRequest(fields.headers) |
| |
| refdoc, refheaders = mirrorURL(doc1, url_opener) |
| if not (refdoc and newdoc): |
| http_error = "" |
| url = "" |
| if not refdoc: |
| http_error = url_opener.error |
| url = esc1 |
| else: |
| http_error = url_opener2.error |
| url = esc2 |
| if re.match("^[1234][0-9][0-9] ", http_error): |
| print "Status: %s" %(http_error) |
| error="<p style='color:#FF0000'>An error (%s) occured trying to get <a href='%s'>%s</a>.</p>" % (cgi.escape(http_error), url, url) |
| showPage(esc1, esc2, error, Content_Type=CONTENT_TYPE) |
| |
| print "Content-Type: text/html" |
| if newheaders.has_key('Content-Type'): |
| contentType = cgi.parse_header(newheaders["Content-Type"]) |
| if contentType[1].has_key('charset'): |
| charset = contentType[1]['charset'].lower() |
| #if charset == "iso-8859-1": |
| # options["char_encoding"]='latin1' |
| |
    # forward caching headers from the new document to the client
    for proxy_header in ('Last-Modified', 'Expires'):
        if copyHeader(lambda header, value: sys.stdout.write("%s: %s" % (header, value)), newheaders, proxy_header):
            print
| print |
    p = Popen(["/usr/local/bin/htmldiff", refdoc.name, newdoc.name],
              stdin=PIPE, stdout=PIPE, stderr=PIPE)
    sys.stdout.flush()
    sys.stderr.flush()
    # communicate() waits for the process and closes the pipes for us
    (out, err) = p.communicate()
| if err: |
| error = "<p style='color:#FF0000'>An error occured when running <code>htmldiff</code> on the documents:</p><pre>%s</pre>" % (cgi.escape(err),) |
| showPage(esc1, esc2, error) |
| else: |
| print out |
| if __name__ == '__main__': |
| if os.environ.has_key('SCRIPT_NAME'): |
| serveRequest() |