## Another case of deja vu.
## This time we want the Slashdot style (what Yahoo said to do): only allow
## certain tags. We'll make it an option.
## We'll have to tie this in some way to our HTML body displayer.
##
## OK, there are basically four types of tags:
## 1) safe - e.g., <b>, <i>, etc.
## 2) render problems - <table><form><body><frame> - these we either strip,
##    or we have to ensure they match
## 3) definitely evil independent tags that we always strip
## 4) definitely evil tags which denote a region; we strip the entire region
| |
| from PassSGMLParser import PassSGMLParser |
| from urllib import basejoin |
| import string, sys |
| import neo_cgi |
| |
| try: |
| from cStringIO import StringIO |
| except: |
| from StringIO import StringIO |
| |
class SafeHtml(PassSGMLParser):
    """Pass-through SGML filter that whitelists tags and scrubs attributes.

    Tag names are compared in upper case and fall into four tables:

    * ``_safeTags``  - start and end tags are emitted (attributes cleaned).
    * ``_matchTags`` - emitted like safe tags, but the number of open tags is
      tracked in ``_matchDict`` so stray end tags are dropped and tags still
      open at ``close()`` get an end tag written for them.
    * ``_skipTags``  - the start and end tags themselves are dropped.
    * ``_stripTags`` - a start tag opens a "stripping" region in which
      ``write()`` discards everything until the matching end tag.

    A tag found in none of the tables is dropped when ``extra_safe`` is true
    and emitted (attributes cleaned) otherwise.
    """

    _safeTags = {"P": 1, "LI": 1, "DD": 1, "DT": 1, "EM": 1, "BR": 1,
                 "CITE": 1, "DFN": 1, "Q": 1, "STRONG": 1, "IMG": 1, "HR": 1,
                 "TR": 1, "TD": 1, "TH": 1, "CAPTION": 1, "THEAD": 1,
                 "TFOOT": 1, "TBODY": 1}
    _matchTags = {"TABLE": 1, "OL": 1, "UL": 1, "DL": 1, "CENTER": 1,
                  "DIV": 1, "PRE": 1, "SUB": 1, "SUP": 1, "BIG": 1,
                  "SMALL": 1, "CODE": 1, "B": 1, "I": 1, "A": 1, "TT": 1,
                  "BLOCKQUOTE": 1, "U": 1, "H1": 1, "H2": 1, "H3": 1,
                  "H4": 1, "H5": 1, "H6": 1, "FONT": 1}
    _skipTags = {"FORM": 1, "HTML": 1, "BODY": 1, "EMBED": 1, "AREA": 1,
                 "MAP": 1, "FRAME": 1, "FRAMESET": 1, "IFRAME": 1, "META": 1}
    _stripTags = {"HEAD": 1, "JAVA": 1, "APPLET": 1, "OBJECT": 1,
                  "JAVASCRIPT": 1, "LAYER": 1, "STYLE": 1, "SCRIPT": 1}

    # Attributes whose value is a URL and therefore gets mapped / rewritten.
    _urlAttrs = ("action", "href", "src", "lowsrc", "background")

    def __init__(self, fp, extra_safe=1, base=None, map_urls=None,
                 new_window=1):
        """Set up the filter.

        fp         -- output object handed to PassSGMLParser.
        extra_safe -- when true, tags not listed in any table are dropped;
                      also forwarded to PassSGMLParser.
        base       -- optional base URL joined onto URL attribute values.
        map_urls   -- optional mapping; a URL attribute ``cid:KEY`` is
                      replaced by ``map_urls[KEY]`` when KEY is present.
        new_window -- when true, every <a> gets ``target="_blank"`` and any
                      target it carried is discarded.
        """
        self._extra_safe = extra_safe
        PassSGMLParser.__init__(self, fp, extra_safe)
        self._matchDict = {}    # match-tag name -> number currently open
        self._stripping = 0     # nesting depth of strip-tag regions
        self._base = base
        self._map_urls = map_urls
        self._new_window = new_window

    def safe_start_strip(self):
        """Enter (or nest one level deeper into) a stripped region."""
        if self._stripping == 0:
            # Flush before the counter changes; presumably this pushes text
            # seen before the region out while write() still passes it on.
            self.flush()
        self._stripping = self._stripping + 1

    def safe_end_strip(self):
        """Leave one level of stripped region; never drops below zero."""
        # Flush while still stripping so pending text is seen by write()
        # with the counter non-zero and is discarded.
        self.flush()
        self._stripping = max(self._stripping - 1, 0)

    def write(self, data):
        """Forward data to the base writer unless inside a stripped region."""
        if self._stripping == 0:
            PassSGMLParser.write(self, data)

    def _is_script_url(self, value):
        """Return true if value starts with a script URL scheme.

        Control characters (tab, CR, LF, NUL, ...) are removed and leading
        whitespace is skipped before the comparison, because browsers ignore
        them when parsing a URL scheme; without this, values such as
        " javascript:..." or "java\\tscript:..." would slip through.
        """
        compact = "".join([ch for ch in value if ord(ch) >= 32])
        compact = compact.lstrip().lower()
        return compact[:11] == "javascript:" or compact[:9] == "vbscript:"

    def cleanup_attrs(self, tag, attrs):
        """Return a sanitized copy of attrs for tag.

        attrs is a sequence of (name, value) pairs. The result is a new list
        of (lower-cased name, value) pairs in which:

        * ``on*`` event-handler attributes are removed;
        * attributes whose value is a script URL are removed;
        * URL attributes (see ``_urlAttrs``) are either resolved through
          ``map_urls`` (``cid:`` values) or joined with ``base`` and wrapped
          in the Google redirector URL;
        * for <a>, when ``new_window`` is set, ``target`` is forced to
          ``_blank``.
        """
        new_attrs = []
        tag = tag.lower()
        force_blank = self._new_window and tag == "a"
        if force_blank:
            new_attrs.append(('target', '_blank'))
        for name, value in attrs:
            name = name.lower()
            if name[:2] == "on":
                continue  # skip any javascript events
            if self._is_script_url(value):
                continue
            if name in self._urlAttrs:
                if self._map_urls and value[:4] == 'cid:':
                    # Unknown content ids are left untouched on purpose.
                    try:
                        value = self._map_urls[value[4:]]
                    except KeyError:
                        pass
                else:
                    if self._base:
                        value = basejoin(self._base, value)
                    value = 'http://www.google.com/url?sa=D&q=%s' % (
                        neo_cgi.urlEscape(value))
            if force_blank and name == "target":
                continue  # replaced by the forced target added above
            new_attrs.append((name, value))
        return new_attrs

    def unknown_starttag(self, tag, attrs):
        """Handle a start tag according to the table it belongs to."""
        tag = tag.upper()
        if tag in self._stripTags:
            self.safe_start_strip()
            return
        # Inside a stripped region nothing is emitted, so nothing may be
        # counted either: unknown_endtag ignores end tags there, and counting
        # here would make close() emit end tags for tags never written.
        if self._stripping or tag in self._skipTags:
            return
        if tag in self._matchTags:
            self._matchDict[tag] = self._matchDict.get(tag, 0) + 1
        elif tag not in self._safeTags and self._extra_safe:
            return
        self.write_starttag(tag, self.cleanup_attrs(tag, attrs))

    def unknown_endtag(self, tag):
        """Handle an end tag according to the table it belongs to."""
        tag = tag.upper()
        if tag in self._stripTags:
            self.safe_end_strip()
            return
        if self._stripping or tag in self._skipTags:
            return
        if tag in self._matchTags:
            open_count = self._matchDict.get(tag, 0)
            if open_count <= 0:
                # Stray end tag with no emitted start tag: drop it so it
                # cannot close an element of the surrounding page.
                return
            self._matchDict[tag] = open_count - 1
        elif tag not in self._safeTags and self._extra_safe:
            return
        self.write_endtag(tag)

    def close(self):
        """Close every still-open match tag, then close the base parser."""
        if self._stripping:
            # NOTE(review): presumably flush() routes pending text through
            # write(); doing it while still stripping discards the text of an
            # unterminated stripped region instead of leaking it. Confirm
            # against PassSGMLParser.
            self.flush()
        self._stripping = 0
        for tag in list(self._matchDict.keys()):
            for _ in range(self._matchDict[tag]):
                self.write_endtag(tag)
            # Reset so a second close() does not repeat the end tags.
            self._matchDict[tag] = 0
        PassSGMLParser.close(self)
| |
def SafeHtmlString(s, really_safe=1, map_urls=None, base=None, new_window=1):
    """Run the string s through SafeHtml and return the filtered output.

    really_safe -- passed as SafeHtml's ``extra_safe`` argument.
    map_urls    -- passed through to SafeHtml (``cid:`` URL mapping).
    base        -- passed through to SafeHtml (base URL for URL attributes).
    new_window  -- passed through to SafeHtml (force ``target="_blank"``).

    The defaults for ``base`` and ``new_window`` equal SafeHtml's own, so
    existing callers see the same result as before.
    """
    out = StringIO()
    parser = SafeHtml(out, really_safe, base=base, map_urls=map_urls,
                      new_window=new_window)
    parser.feed(s)
    parser.close()
    return out.getvalue()
| |