# blob: 9220702c24ee433d60dd104d5c494e7518a930af [file] [log] [blame]
from xml.dom import minidom, Node
from urlparse import urlparse, urlunparse
from xml.parsers.expat import ExpatError
from htmlentitydefs import name2codepoint
import re
# select and apply an xml:base for this entry
class relativize:
    """Pick a common ``xml:base`` for an element tree and rewrite its links.

    Instantiating the class does all the work: every ``http`` URI found in a
    ``cite``, ``href`` or ``src`` attribute at or below *parent* is tallied,
    the highest-scoring URI prefix is chosen as the base and -- unless
    *parent* already carries an ``xml:base`` attribute -- matching attribute
    values are made relative to it and the base is recorded on *parent*.

    Attributes:
        score: maps a URI prefix, held as a tuple ``(host, segment, ...)``,
            to its accumulated score.
        links: prefixes that are the immediate directory of a tallied URI;
            their score is reset to zero before a base is selected.
        base: the selected base URI (ending in ``/``), or ``None`` when no
            prefix qualified.
    """

    # attributes whose values are treated as URI references
    URI_ATTRIBUTES = ('cite', 'href', 'src')

    def __init__(self, parent):
        """Tally the URIs under *parent* and, if worthwhile, rebase them."""
        self.score = {}
        self.links = []
        self.collect_and_tally(parent)
        self.base = self.select_optimal_base()
        # an existing xml:base is respected: nothing is rewritten then
        if self.base and not parent.hasAttribute('xml:base'):
            self.rebase(parent)
            parent.setAttribute('xml:base', self.base)

    def _tallied_uri(self, element):
        """Return the one URI of *element* that takes part in the tally.

        When several of the URI attributes are present the last one in
        ``URI_ATTRIBUTES`` order wins; ``None`` means none is present.
        """
        uri = None
        for name in self.URI_ATTRIBUTES:
            if element.hasAttribute(name):
                uri = element.getAttribute(name)
        return uri

    # collect and tally cite, href and src attributes
    def collect_and_tally(self, parent):
        """Recursively score every prefix of the ``http`` URIs under *parent*.

        Each proper prefix ``(host, seg1, ..., segN)`` of a URI gains its own
        length in points, so longer shared prefixes outscore shorter ones.
        """
        uri = self._tallied_uri(parent)
        if uri:
            parsed = urlparse(uri)
            if parsed[0].lower() == 'http':
                # host followed by the path segments; the final segment (the
                # document itself) is never part of a prefix
                segments = (parsed[1] + parsed[2]).split('/')
                base = None
                for depth in range(1, len(segments)):
                    base = tuple(segments[:depth])
                    self.score[base] = self.score.get(base, 0) + len(base)
                # remember the longest prefix, i.e. the URI's own directory
                if base and base not in self.links:
                    self.links.append(base)

        for node in parent.childNodes:
            if node.nodeType == Node.ELEMENT_NODE:
                self.collect_and_tally(node)

    # select the xml:base with the highest score
    def select_optimal_base(self):
        """Return the best-scoring prefix as a base URI, or ``None``.

        ``None`` is returned when nothing was tallied, when every remaining
        score is zero, or when the winning prefix was contributed by a single
        URI only (its score then equals its own length).
        """
        if not self.score:
            return None
        for link in self.links:
            self.score[link] = 0
        winner = max(self.score.values())
        if not winner:
            return None
        for key in self.score:
            if self.score[key] == winner:
                if winner == len(key):
                    return None
                path = '/'.join(key[1:])
                return urlunparse(('http', key[0], path, '', '', '')) + '/'
        return None

    def _relative_reference(self, uri):
        """Return *uri*, which must start with ``self.base``, relative to it."""
        relative = uri[len(self.base):]
        if not relative:
            # the URI is the base itself
            return '.'
        # first path segment, without any query or fragment
        first_segment = relative
        for delimiter in '/?#':
            first_segment = first_segment.split(delimiter, 1)[0]
        if relative.startswith('/') or ':' in first_segment:
            # a leading slash would be read as an absolute path and a colon
            # in the first segment as a scheme; "./" keeps both relative
            return './' + relative
        return relative

    # rewrite cite, href and src attributes using this base
    def rebase(self, parent):
        """Make the URI attributes at or below *parent* relative to the base.

        Every attribute is handled on its own, so an element carrying more
        than one of them keeps each value distinct.  Values that do not start
        with the base are left alone.  Descendants that declare their own
        ``xml:base`` are skipped together with their subtree, because that
        declaration, not the one being added, governs how their references
        resolve.
        """
        for name in self.URI_ATTRIBUTES:
            if parent.hasAttribute(name):
                uri = parent.getAttribute(name)
                if uri.startswith(self.base):
                    parent.setAttribute(name, self._relative_reference(uri))

        for node in parent.childNodes:
            if node.nodeType != Node.ELEMENT_NODE:
                continue
            if node.hasAttribute('xml:base'):
                continue
            self.rebase(node)
# convert type="html" to type="plain" or type="xhtml" as appropriate
def retype(parent):
    """Downgrade ``type="html"`` content below *parent* where possible.

    Each element child of *parent* whose ``type`` attribute is ``html`` is
    converted in place: to plain text (the attribute is removed) when it is
    empty or contains no markup, or to ``type="xhtml"`` when its text is a
    well-formed fragment longer than 80 characters.  Anything else is left
    as html.  Element children of any other type are processed recursively.
    Finally, when *parent* itself is an ``entry`` element, ``relativize`` is
    applied to it.
    """
    for node in parent.childNodes:
        if node.nodeType != Node.ELEMENT_NODE:
            continue
        if node.hasAttribute('type') and node.getAttribute('type') == 'html':
            _retype_html_node(node)
        else:
            # recurse
            retype(node)

    if parent.nodeName == 'entry':
        relativize(parent)


# entity names that XML itself predefines; they have to stay references so
# that the text remains a parseable fragment
_XML_ENTITIES = ('amp', 'lt', 'gt', 'apos', 'quot')


def _expand_html_entities(text):
    """Return *text* with known HTML named entities replaced by characters."""
    # with one capture group, re.split puts entity names at the odd indexes
    chunks = re.split(r'&(\w+);', text)
    for i in range(1, len(chunks), 2):
        name = chunks[i]
        if name not in _XML_ENTITIES and name in name2codepoint:
            chunks[i] = unichr(name2codepoint[name])
        else:
            # XML entity or unknown name: restore the reference verbatim
            chunks[i] = '&' + name + ';'
    return u"".join(chunks)


def _retype_html_node(node):
    """Convert a single ``type="html"`` element in place, if it qualifies."""
    children = node.childNodes
    if len(children) == 0:
        node.removeAttribute('type')
        return
    if len(children) != 1:
        return
    child = children[0]
    if child.nodeType not in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
        # not character data: nodeValue would be None, so there is no html
        # text to examine
        return

    # replace html entity defs with utf-8
    text = _expand_html_entities(child.nodeValue)

    # see if the resulting text is a well-formed XML fragment
    div = '<div xmlns="http://www.w3.org/1999/xhtml">%s</div>'
    try:
        data = minidom.parseString((div % text).encode('utf-8'))
    except ExpatError:
        # leave as html
        return

    if '<' not in text:
        # plain text: take the value from the parsed fragment so that the
        # remaining entity references are resolved
        node.removeAttribute('type')
        parsed = data.documentElement.childNodes
        if parsed:
            text = parsed[0].nodeValue
        child.replaceWholeText(text)
    elif len(text) > 80:
        # xhtml: swap the escaped text for the parsed <div>
        node.setAttribute('type', 'xhtml')
        node.removeChild(child)
        node.appendChild(data.documentElement)
if __name__ == '__main__':
    # run styler on each file mentioned on the command line, rewriting the
    # file in place
    import sys
    for feed in sys.argv[1:]:
        doc = minidom.parse(feed)
        doc.normalize()
        retype(doc.documentElement)
        # serialize before reopening the file: opening it for writing
        # truncates it, so a failure in toxml() must not destroy the input
        xml = doc.toxml('utf-8')
        # toxml() with an encoding yields encoded bytes, hence binary mode;
        # the with-block closes the file even if the write fails
        with open(feed, 'wb') as output:
            output.write(xml)