#!/usr/bin/python
# -*- coding: utf-8 -*-
""" $Id: htmldiff,v 1.62 2016/10/06 10:46:19 dom Exp $
"""
import atexit
import cgi
import http_auth
import httplib
import os
import re
import surbl
import sys
import tempfile
import tidy
import urlparse
from subprocess import Popen, PIPE
CONTENT_TYPE = "text/html;charset=utf-8"
Page = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en-US">
<head><title>HTML Diff service</title>
<link rel="stylesheet" href="http://www.w3.org/StyleSheets/base" />
</head>
<body>
<p><a href="http://www.w3.org/"><img src="http://www.w3.org/Icons/w3c_home" alt="W3C"/></a> <a href="http://www.w3.org/2003/Editors">W3C Editors homepage</a></p>
<h1>Create Diff between HTML pages</h1>
"""
Page2 = """
<form method="GET">
<p>Address of reference document: <input name="doc1" type="url" value="%s" style="width:100%%"/></p>
<p>Address of new document: <input name="doc2" type="url" value="%s" style="width:100%%"/></p>
<p><input type="submit" value="Get diff"/></p>
</form>
<p><strong>Tip</strong>: if the document follows the W3C convention of linking to its previous version, you only need to give the address of the new document; the link to the previous version is detected automatically.</p>
<h2>Diff markings</h2>
<p>This service relies on <a href="https://www.gnu.org/software/diffutils/">GNU diff</a>. The differences found are roughly marked as follows:</p>
<ul>
<li>deleted text is shown in pink with down arrows (as styled for a &lt;del&gt; element),</li>
<li>replaced text is shown in green with bidirectional arrows,</li>
<li>newly inserted text is shown in yellow with up arrows (as for an &lt;ins&gt; element).</li>
</ul>
<address>
script $Revision: 1.62 $ of $Date: 2016/10/06 10:46:19 $<br />
by <a href="http://www.w3.org/People/Dom/">Dominique Hazaël-Massieux</a><br />based on <a href="https://dev.w3.org/cvsweb/2009/htmldiff/htmldiff.pl">Shane McCarron’s Perl script</a> wrapped in a <a href="http://dev.w3.org/cvsweb/2009/htmldiff/">Python CGI</a>
</address>
</body>
</html>
"""
def checkInputUrl(url):
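    """Reject URLs we refuse to fetch: file: URLs, URLs without a
    plausible scheme, and URLs listed in SURBL (a spam URI blocklist);
    responds with a 403 and exits in either case."""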
checker = surbl.SurblChecker('/usr/local/share/surbl/two-level-tlds','/afs/w3.org/pub/WWW/Systems/Server/debian/generic/usr/local/etc/surbl.whitelist')
if url[:5] == 'file:' or len(urlparse.urlparse(url)[0])<2:
print "Status: 403"
print "Content-Type: text/plain"
print
print "sorry, I decline to handle file: addresses"
sys.exit()
elif checker.isMarkedAsSpam(url):
print "Status: 403"
print "Content-Type: text/plain; charset=utf-8"
print
print "sorry, this URL matches a record known in SURBL. See http://www.surbl.org/"
sys.exit()
def copyHeader(copy_func, source, key, header_name=None):
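    """Copy the header `key` from `source` via `copy_func`, renaming it
    to `header_name` if given; returns True if the header was present."""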
value = source.get(key)
if not value:
return False
elif header_name is None:
header_name = key
copy_func(header_name, value)
return True
def setupRequest(source_headers):
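    """Create a URL opener that forwards the client's If-Modified-Since
    header and IP address (as X_Forward_IP_Addr) with outgoing requests."""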
opener = http_auth.ProxyAuthURLopener()
copyHeader(opener.addheader, source_headers, 'If-Modified-Since')
copyHeader(opener.addheader, os.environ, 'REMOTE_ADDR', 'X_Forward_IP_Addr')
return opener
def tidyFile(file):
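    """Tidy the HTML in `file` into a fresh temporary file.

    The first 4K are sniffed for an HTML5 doctype, in which case tidy is
    given extra options so it accepts the newer elements; if the first
    pass fails, the HTML5 options are retried once. Returns a tuple of
    (tidied file, tidy errors)."""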
    # default options for tidy
    options = dict(tidy_mark=0, show_warnings=0, quiet=1, char_encoding='utf8')
html5 = re.search(r"<!doctype\s+html\s*>", file.read(4096),
re.IGNORECASE)
file.seek(0)
html5_options = {"add_xml_space": "no",
"output_xhtml": "no",
"tidy_mark": "no",
"new_blocklevel_tags": 'article,aside,canvas,dialog,details,figcaption,figure,footer,header,hgroup,menu,nav,section,main,summary,math,semantics,mrow,mfenced,mtable,mtr,mtd,mi,mn,msub,mo,mfrac,munderover,mtext,svg,g,image,rect,text,desc,line,path,polygon,ellipse,tspan,defs,feoffset,fecolormatrix,filter,fegaussianblur,feblend,marker,circle',
"new_inline_tags": 'video,audio,canvas,ruby,rt,rp,time,meter,progress,track,source,emu-val,emu-nt,emu-t,mark',
"break_before_br": "no",
"vertical_space": "no",
"enclose_text": "no",
"numeric_entities": "yes",
"wrap": "1000",
"wrap_attributes": "no",
"drop_empty_paras": "no"
}
if html5:
options.update(html5_options)
newtidy = tidy.parseString(file.read(), **options)
if len(newtidy.errors) > 0:
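        # tidy reported errors; if the HTML5 options were not already
        # applied, retry once with them before giving up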
if not html5:
file.seek(0)
options.update(html5_options)
newtidy = tidy.parseString(file.read(), **options)
file.close()
file = tempfile.NamedTemporaryFile(
mode='w+', prefix='htmldiff-', suffix='.html')
atexit.register(file.close)
file.write(str(newtidy))
file.flush()
file.seek(0)
return (file, newtidy.errors)
def matchPredecessorRel(rel):
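    """Tell whether a rel attribute value contains the
    predecessor-version link relation."""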
    return rel and "predecessor-version" in rel.lower().split()
def mirrorURL(url, opener):
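    """Retrieve `url` into a local temporary file, transparently
    gunzipping and tidying it; returns (file, headers) on success, or
    (None, {}) with the failure described in opener.error."""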
try:
filename, headers = opener.retrieve(url)
except IOError, error:
opener.error = "I/O error: %s %s" % (error.errno, error.strerror)
except httplib.InvalidURL:
opener.error = "Invalid URL submitted"
except AttributeError: # ProxyAuthURLopener returned None.
pass # There's already an error set.
else:
atexit.register(os.unlink, filename)
file = open(filename)
        if headers.get("content-encoding") == "gzip":
import gzip
from StringIO import StringIO
data = StringIO(file.read())
file.close()
file = gzip.GzipFile(fileobj=data)
file,errors = tidyFile(file)
if len(errors) == 0:
return (file, headers)
else:
opener.error = "Tidy errors: %s" % (str(errors))
return (None, {})
def showPage(url1='', url2='', error_html='', **headers):
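    """Print the given HTTP headers (underscores in the keyword names
    become dashes), then the input form and an optional error, and exit."""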
for name, value in headers.items():
print "%s: %s" % (name.replace('_', '-'), value)
print
print Page
print error_html
print Page2 % (url1, url2)
sys.exit()
def serveRequest():
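    """Process one CGI request: fetch both documents, discovering the
    previous version from doc2 when doc1 is omitted, then run htmldiff
    over the tidied copies and emit the result."""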
fields = cgi.FieldStorage()
    if not fields.has_key('doc2'):
showPage(Content_Type=CONTENT_TYPE)
# if doc1 is not specified, we load doc2 to check if it has a previous version link
doc2 = fields['doc2'].value
checkInputUrl(doc2)
url_opener2 = setupRequest(fields.headers)
newdoc, newheaders = mirrorURL(doc2, url_opener2)
if fields.has_key('doc1'):
doc1 = fields['doc1'].value
elif newdoc is not None:
from BeautifulSoup import BeautifulSoup
soup = BeautifulSoup(newdoc.read())
newdoc.seek(0)
        try:
            doc1 = soup.find(text=re.compile("Previous Version", re.IGNORECASE)).findNext(name="a", attrs={"href": True})["href"]
        except (AttributeError, TypeError, KeyError):
            # no "Previous Version" text found; fall back to an explicit
            # rel="predecessor-version" link
            try:
                doc1 = soup.find(name=["a", "link"], attrs={"href": True, "rel": matchPredecessorRel})["href"]
            except (AttributeError, TypeError, KeyError):
                doc1 = None
else:
doc1 = None
    if not doc1:
showPage(Content_Type=CONTENT_TYPE)
checkInputUrl(doc1)
esc1 = cgi.escape(doc1, True)
esc2 = cgi.escape(doc2, True)
urlcomponents1 = urlparse.urlparse(doc1)
urlcomponents2 = urlparse.urlparse(doc2)
# if same domain, we can use the same urlopener
# otherwise, we create a separate one
if urlcomponents2[1] == urlcomponents1[1]:
url_opener = url_opener2
else:
url_opener = setupRequest(fields.headers)
refdoc, refheaders = mirrorURL(doc1, url_opener)
if not (refdoc and newdoc):
http_error = ""
url = ""
if not refdoc:
http_error = url_opener.error
url = esc1
else:
http_error = url_opener2.error
url = esc2
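        # when the opener's error looks like an HTTP status line
        # ("404 Not Found", ...), relay its code as our own status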
if re.match("^[1234][0-9][0-9] ", http_error):
print "Status: %s" %(http_error)
error="<p style='color:#FF0000'>An error (%s) occured trying to get <a href='%s'>%s</a>.</p>" % (cgi.escape(http_error), url, url)
showPage(esc1, esc2, error, Content_Type=CONTENT_TYPE)
print "Content-Type: text/html"
if newheaders.has_key('Content-Type'):
contentType = cgi.parse_header(newheaders["Content-Type"])
if contentType[1].has_key('charset'):
charset = contentType[1]['charset'].lower()
#if charset == "iso-8859-1":
# options["char_encoding"]='latin1'
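    # pass the origin's caching headers through to the client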
for proxy_header in ('Last-Modified', 'Expires'):
if copyHeader(lambda header, value: sys.stdout.write("%s: %s" %(header, value)), newheaders, proxy_header):
print
print
p = Popen(["/usr/local/bin/htmldiff", refdoc.name, newdoc.name],
stdin=PIPE, stdout=PIPE, stderr=PIPE)
sys.stdout.flush()
sys.stderr.flush()
    (out, err) = p.communicate()
if err:
error = "<p style='color:#FF0000'>An error occured when running <code>htmldiff</code> on the documents:</p><pre>%s</pre>" % (cgi.escape(err),)
showPage(esc1, esc2, error)
else:
print out
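# Only act when invoked as a CGI script: the web server sets SCRIPT_NAME
# in the environment.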
if __name__ == '__main__':
if os.environ.has_key('SCRIPT_NAME'):
serveRequest()
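# Illustrative local test (hypothetical path and URLs; assumes a Python 2
# environment with the W3C-specific modules -- http_auth, surbl, tidy --
# and the SURBL data files in place):
#
#   SCRIPT_NAME=/htmldiff REQUEST_METHOD=GET \
#   QUERY_STRING='doc1=http://example.org/old&doc2=http://example.org/new' \
#   python htmldiff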