// blob: 3c310bd167c9e32bdce41dad85b2d4528378da18 [file] [log] [blame]
// Copyright (c) 2011, Mike Samuel
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// Neither the name of the OWASP nor the names of its contributors may
// be used to endorse or promote products derived from this software
// without specific prior written permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
package org.owasp.html;
import junit.framework.TestCase;
import javax.annotation.Nullable;
import org.junit.Test;
public class HtmlSanitizerTest extends TestCase {
/** Sanitizing the empty string or {@code null} must yield the empty string. */
@Test
public static final void testEmpty() throws Exception {
  final String fromEmpty = sanitize("");
  assertEquals("", fromEmpty);

  final String fromNull = sanitize(null);
  assertEquals("", fromNull);
}
/** Plain text that contains no markup must come back unchanged. */
@Test
public static final void testSimpleText() throws Exception {
  final String plainText = "hello world";
  assertEquals(plainText, sanitize(plainText));
}
/**
 * Angle brackets written as character references must come out of the
 * sanitizer still written as character references, not as markup.
 */
@Test
public static final void testEntities1() throws Exception {
  // Both sides spell the brackets as entities.  A literal "<hello world>"
  // is tag syntax for an element the policy in sanitize() does not allow
  // (compare testUnknownTagsRemoved), and testCruftEscaped expects a '<' in
  // text to be emitted as "&lt;", so a raw "<hello world>" could never be
  // the expected output of this test.
  final String escaped = "&lt;hello world&gt;";
  assertEquals(escaped, sanitize(escaped));
}
/**
 * Allowed bold and italic markup must pass through unchanged.
 *
 * NOTE(review): despite the method name, the input below contains no
 * character reference -- confirm whether one was intended here.
 */
@Test
public static final void testEntities2() throws Exception {
  final String markup = "<b>hello <i>world</i></b>";
  assertEquals(markup, sanitize(markup));
}
/**
 * An element the test policy does not allow ({@code bogus}) must be dropped
 * while the allowed markup around it is kept.
 */
@Test
public static final void testUnknownTagsRemoved() throws Exception {
  final String input = "<b>hello <bogus></bogus><i>world</i></b>";
  final String expected = "<b>hello <i>world</i></b>";
  assertEquals(expected, sanitize(input));
}
/** A {@code script} element nested in allowed markup must be removed. */
@Test
public static final void testUnsafeTagsRemoved() throws Exception {
  final String input =
      "<b>hello <i>world</i>"
      + "<script src=foo.js></script></b>";
  final String expected = "<b>hello <i>world</i></b>";
  assertEquals(expected, sanitize(input));
}
/** An {@code onclick} handler must be stripped from an otherwise allowed tag. */
@Test
public static final void testUnsafeAttributesRemoved() throws Exception {
  final String input =
      "<b>hello <i onclick=\"takeOverWorld(this)\">world</i></b>";
  final String expected = "<b>hello <i>world</i></b>";
  assertEquals(expected, sanitize(input));
}
/**
 * A stray {@code <} and a bare {@code &} in text must be emitted as
 * {@code &lt;} and {@code &amp;} respectively.
 */
@Test
public static final void testCruftEscaped() throws Exception {
  final String input =
      "<b>hello <i>world<</i></b> & tomorrow the universe";
  final String expected =
      "<b>hello <i>world&lt;</i></b> &amp; tomorrow the universe";
  assertEquals(expected, sanitize(input));
}
/**
 * Junk between the last attribute and the end of a start tag
 * ({@code / --}) must be discarded; the id attribute is kept, prefixed.
 */
@Test
public static final void testTagCruftRemoved() throws Exception {
  final String input = "<b id=\"foo\" / -->hello <i>world<</i></b>";
  final String expected = "<b id=\"p-foo\">hello <i>world&lt;</i></b>";
  assertEquals(expected, sanitize(input));
}
/**
 * The id value and every name in the class value must be prefixed with
 * {@code p-} by the attribute policy installed in {@code sanitize}.
 */
@Test
public static final void testIdsAndClassesPrefixed() throws Exception {
  final String input =
      "<b id=\"foo\" class=\"boo bar baz\">hello <i>world<</i></b>";
  final String expected =
      "<b id=\"p-foo\" class=\"p-boo p-bar p-baz\">"
      + "hello <i>world&lt;</i></b>";
  assertEquals(expected, sanitize(input));
}
/**
 * The characters {@code <}, {@code >} and {@code &} inside a quoted
 * attribute value must be emitted as character references.
 */
@Test
public static final void testSpecialCharsInAttributes() throws Exception {
  final String input = "<b title=\"a<b && c>b\">bar</b>";
  final String expected = "<b title=\"a&lt;b &amp;&amp; c&gt;b\">bar</b>";
  assertEquals(expected, sanitize(input));
}
/**
 * A {@code div} left open at end of input must be closed in the output, and
 * {@code <br>} must be rendered as {@code <br />}.
 */
@Test
public static final void testUnclosedTags() throws Exception {
  final String input = "<div id=\"foo\">Bar<br>Baz";
  final String expected = "<div id=\"p-foo\">Bar<br />Baz</div>";
  assertEquals(expected, sanitize(input));
}
/**
 * End tags with no matching start tag (a second {@code </b>} and the
 * {@code </select>} tags) must be dropped from the output.
 */
@Test
public static final void testUnopenedTags() throws Exception {
  final String input = "Foo<b></select>Bar</b></b>Baz</select>";
  final String expected = "Foo<b>Bar</b>Baz";
  assertEquals(expected, sanitize(input));
}
/**
 * An end tag for {@code meta} that carries attributes must produce no
 * output at all.
 */
@Test
public static final void testUnsafeEndTags() throws Exception {
  final String input =
      "</meta http-equiv=\"refesh\""
      + " content=\"1;URL=http://evilgadget.com\">";
  final String expected = "";
  assertEquals(expected, sanitize(input));
}
/**
 * An explicit {@code </input>} end tag must be folded away so that only
 * {@code <input />} is rendered.
 */
@Test
public static final void testEmptyEndTags() throws Exception {
  final String input = "<input></input>";
  final String expected = "<input />";
  assertEquals(expected, sanitize(input));
}
/**
 * An upper-case {@code ONLOAD} handler must be stripped.  The {@code src}
 * attribute is not in the test policy either, so a bare {@code <img />} is
 * all that remains.
 */
@Test
public static final void testOnLoadStripped() throws Exception {
  final String input = "<img src=http://foo.com/bar ONLOAD=alert(1)>";
  final String expected = "<img />";
  assertEquals(expected, sanitize(input));
}
/**
 * Attributes written on an end tag must never reach the output; the
 * unmatched {@code </b>} itself is dropped and the open {@code p} is closed.
 */
@Test
public static final void testClosingTagParameters() throws Exception {
  final String input =
      "<p>Hello world</b style=\"width:expression(alert(1))\">";
  final String expected = "<p>Hello world</p>";
  assertEquals(expected, sanitize(input));
}
/**
 * List items whose end tags were omitted must be closed before the next
 * item starts, so that the items end up as siblings.
 */
@Test
public static final void testOptionalEndTags() throws Exception {
  // The output must NOT be
  //   "<ol> <li>A</li> <li>B<li>C </li></li></ol>"
  // Whitespace matters as well: the first item is just 'A' with no space
  // after it, whereas the third item is 'C' followed by a space.
  final String input = "<ol> <li>A</li> <li>B<li>C </ol>";
  final String expected = "<ol><li>A</li><li>B</li><li>C </li></ol>";
  assertEquals(expected, sanitize(input));
}
/**
 * The {@code html}, {@code head}, {@code title} and {@code body} wrappers
 * must not appear in the output, while allowed content found inside them is
 * kept.
 */
@Test
public static final void testFoldingOfHtmlAndBodyTags() throws Exception {
  // A complete document: only the paragraph survives; the title text is gone.
  final String fullDocument =
      "<html><head><title>Foo</title></head>"
      + "<body><p>P 1</p></body></html>";
  assertEquals("<p>P 1</p>", sanitize(fullDocument));

  // A lone body element with an attribute: only its text survives.
  final String bodyOnly = "<body bgcolor=\"blue\">Hello</body>";
  assertEquals("Hello", sanitize(bodyOnly));

  // A paragraph inside head, plus a paragraph in body that is never closed.
  final String messyDocument =
      "<html>"
      + "<head>"
      + "<title>Blah</title>"
      + "<p>Foo</p>"
      + "</head>"
      + "<body>"
      + "<p>One"
      + "<p>Two</p>"
      + "Three"
      + "<p>Four</p>"
      + "</body>"
      + "</html>";
  assertEquals(
      "<p>Foo</p><p>One</p><p>Two</p>Three<p>Four</p>",
      sanitize(messyDocument));
}
/**
 * A valueless attribute ({@code checked}) must be rendered with its own
 * name as the value, and empty attribute values must be kept as empty
 * quoted strings.
 */
@Test
public static final void testEmptyAndValuelessAttributes() throws Exception {
  final String input = "<input checked type=checkbox id=\"\" class=>";
  final String expected =
      "<input checked=\"checked\" type=\"checkbox\" id=\"\" class=\"\" />";
  assertEquals(expected, sanitize(input));
}
/**
 * Pins down the output for inputs that resemble SGML short tags.  The
 * results are not meant to be a faithful interpretation, only safe.
 */
@Test
public static final void testSgmlShortTags() throws Exception {
  // No attempt is made to handle SGML short tags correctly: they are not
  // implemented consistently across browsers and have been removed from
  // HTML 5.
  //
  // Quoting http://www.w3.org/QA/2007/10/shorttags.html
  //   Shorttags - the odd side of HTML 4.01
  //   ...
  //   It uses an ill-known feature of SGML called shorthand markup, which
  //   was authorized in HTML up to HTML 4.01. But what used to be a "cool"
  //   feature for SGML experts becomes a liability in HTML, where the
  //   construct is more likely to appear as a typo than as a conscious
  //   choice.
  //
  //   All could be fine if this form typo-that-happens-to-be-legal was
  //   properly implemented in contemporary HTML user-agents. It is not.
  final String emptyParagraph = "<p></p>";
  assertEquals(emptyParagraph, sanitize("<p/b/"));  // Short-tag discarded.
  assertEquals(emptyParagraph, sanitize("<p<b>"));  // Discard <b attribute

  // This behavior for short tags is not ideal, but it is safe.
  final String input = "<p<a href=\"/\">first part of the text</> second part";
  final String expected =
      "<p href=\"/\">first part of the text&lt;/&gt; second part</p>";
  assertEquals(expected, sanitize(input));
}
/**
 * A NUL character inside a quoted attribute value must be dropped without
 * ending the value: the text after it stays inside the title attribute,
 * with each {@code =} written as {@code &#61;}.
 */
@Test
public static final void testNul() throws Exception {
  final String input =
      "<A TITLE="
      + "\"harmless\0 SCRIPT=javascript:alert(1) ignored=ignored\">";
  final String expected =
      "<a title="
      + "\"harmless SCRIPT&#61;javascript:alert(1) ignored&#61;ignored\">"
      + "</a>";
  assertEquals(expected, sanitize(input));
}
/**
 * An attribute whose name ends in a digit ({@code style1}) must be removed
 * like any other attribute the policy does not allow.
 */
@Test
public static final void testDigitsInAttrNames() throws Exception {
  // See bug 614 for details.
  final String input =
      "<div style1=\"expression(\'alert(1)\")\">Hello</div>";
  final String expected = "<div>Hello</div>";
  assertEquals(expected, sanitize(input));
}
/**
 * Three spellings of the supplementary code point U+2F81A -- one hex
 * reference, a raw surrogate pair, and a pair of surrogate references --
 * must each be rendered as the single reference {@code &#x2f81a;}.
 */
@Test
public static final void testSupplementaryCodepointEncoding()
    throws Exception {
  // &#xd87e;&#xdc1a; is not appropriate.
  // &#x2f81a; is appropriate as is the unencoded form.
  final String input = "&#x2F81A; | \ud87e\udc1a | &#xd87e;&#xdc1a;";
  final String expected = "&#x2f81a; | &#x2f81a; | &#x2f81a;";
  assertEquals(expected, sanitize(input));
}
/**
 * Feeds 20000 unclosed {@code <div>} tags to the sanitizer and checks that
 * the output is n opening tags followed by n closing tags, where n must lie
 * between 50 and 1000 inclusive.
 */
@Test
public static final void testDeeplyNestedTagsDoS() throws Exception {
  final String openTag = "<div>";
  final String closeTag = "</div>";

  String sanitized = sanitize(stringRepeatedTimes(openTag, 20000));

  // Number of open/close pairs in the output, given that the output is
  // expected to consist of nothing but such pairs.
  int n = sanitized.length() / (openTag.length() + closeTag.length());
  assertTrue("" + n, 50 <= n && n <= 1000);

  // Index at which the run of opening tags should give way to closing tags.
  int middle = n * openTag.length();

  // The expected value goes first so that a failure message labels the
  // expected and actual sides correctly.
  assertEquals(stringRepeatedTimes(openTag, n),
               sanitized.substring(0, middle));
  assertEquals(stringRepeatedTimes(closeTag, n),
               sanitized.substring(middle));
}
/**
 * An attribute value containing backticks must be rendered with the
 * backticks and {@code =} as character references and with a trailing
 * space appended to the value.
 */
@Test
public static final void testInnerHTMLIE8() throws Exception {
  // Apparently, in quirks mode, IE8 does a poor job producing innerHTML
  // values.  Given
  //     <div attr="``foo=bar">
  // we encode &#96; but if JavaScript does:
  //     nodeA.innerHTML = nodeB.innerHTML;
  // and nodeB contains the DIV above, then IE8 will produce
  //     <div attr=``foo=bar>
  // as the value of nodeB.innerHTML and assign it to nodeA.
  // IE8's HTML parser treats `` as a blank attribute value, and foo=bar
  // becomes a separate attribute.
  // Adding a space at the end of the attribute prevents this by forcing
  // IE8 to put double quotes around the attribute when computing
  // nodeB.innerHTML.
  final String input = "<div title=\"``onmouseover=alert(1337)\">";
  final String expected =
      "<div title=\"&#96;&#96;onmouseover&#61;alert(1337) \"></div>";
  assertEquals(expected, sanitize(input));
}
/**
 * Checks the handling of the "no..." elements ({@code noscript},
 * {@code noframes}, {@code noembed}, {@code noxss}): their content is
 * sanitized like ordinary markup, and a trailing disallowed {@code evil}
 * element is dropped.  The last case wraps the markup in {@code xmp}.
 */
@Test
public static final void testNabobsOfNegativism() throws Exception {
  // Treating <noscript> as raw-text gains us nothing security-wise.
  // Each row is { expected output, input }.
  final String[][] cases = {
    {
      "<noscript></noscript>",
      "<noscript><evil></noscript>",
    },
    {
      "<noscript>I <b>&lt;3</b> Ponies</noscript>",
      "<noscript>I <b><3</b> Ponies</noscript>",
    },
    {
      "<noscript>I <b>&lt;3</b> Ponies</noscript>",
      "<NOSCRIPT>I <b><3</b> Ponies</noscript><evil>",
    },
    {
      "<noframes>I <b>&lt;3</b> Ponies</noframes>",
      "<noframes>I <b><3</b> Ponies</noframes><evil>",
    },
    {
      "<noembed>I <b>&lt;3</b> Ponies</noembed>",
      "<noembed>I <b><3</b> Ponies</noembed><evil>",
    },
    {
      "<noxss>I <b>&lt;3</b> Ponies</noxss>",
      "<noxss>I <b><3</b> Ponies</noxss><evil>",
    },
    {
      "&lt;noscript&gt;I &lt;b&gt;&lt;3&lt;/b&gt; Ponies&lt;/noscript&gt;",
      "<xmp><noscript>I <b><3</b> Ponies</noscript></xmp>",
    },
  };
  for (String[] expectedAndInput : cases) {
    assertEquals(expectedAndInput[0], sanitize(expectedAndInput[1]));
  }
}
/**
 * NUL characters in text, whether literal or written as the reference
 * {@code &#0;}, must not appear in the output.
 */
@Test
public static final void testNULs() throws Exception {
  // Each row is { expected output, input }.
  final String[][] cases = {
    { "<b>Hello, </b>", "<b>Hello, \u0000</b>" },
    { "<b>Hello, </b>", "<b>Hello, \u0000" },
    { "", "\u0000" },
    { "<b>Hello, </b>", "<b>Hello, &#0;</b>" },
    { "", "&#0;" },
  };
  for (String[] expectedAndInput : cases) {
    assertEquals(expectedAndInput[0], sanitize(expectedAndInput[1]));
  }
}
/**
 * Constructs that start with {@code <?} (an XML prologue, a processing
 * instruction, an Outlook namespace declaration) must be dropped without
 * disturbing the text and markup around them.
 */
@Test
public static final void testQMarkMeta() throws Exception {
  // An XML Prologue.
  // HTML5 treats it as ignorable content via the bogus comment state.
  final String xmlPrologue = "<?xml version=\"1\" ?>";

  // An XML Processing instruction.
  // HTML5 treats it as ignorable content via the bogus comment state.
  final String processingInstruction = "<?processing instruction?>";

  // Appears in HTML copied from Outlook.
  final String outlookNamespace =
      "<?xml:namespace prefix = o ns = "
      + "\"urn:schemas-microsoft-com:office:office\" />";

  final String input =
      ""
      + xmlPrologue
      + "Hello, "
      + processingInstruction
      + "<b>World"
      + outlookNamespace
      + "</b>!";
  final String expected = "Hello, <b>World</b>!";
  assertEquals(expected, sanitize(input));
}
/**
 * Everything between the iframe start and end tags -- whitespace and a
 * script element -- must be dropped, leaving an empty iframe.
 */
@Test
public static final void testScriptInIframe() throws Exception {
  final String input =
      "<iframe>\n"
      + " <script>alert(Hi)</script>\n"
      + "</iframe>";
  final String expected = "<iframe></iframe>";
  assertEquals(expected, sanitize(input));
}
/**
 * Runs {@code html} through the policy shared by every test in this class
 * and returns whatever the renderer wrote.
 *
 * The policy allows a fixed list of elements and attributes, rewrites id
 * and class values so that each name carries a {@code p-} prefix, and
 * keeps {@code img} and {@code input} elements even when they end up with
 * no attributes.  Any message the renderer reports to its handler fails the
 * running test.
 *
 * @param html the markup to sanitize; may be {@code null}
 * @return the sanitized markup accumulated by the renderer
 * @throws Exception declared for the benefit of the calling tests
 */
private static String sanitize(@Nullable String html) throws Exception {
  StringBuilder out = new StringBuilder();

  // A message from the renderer is treated as a test failure.
  Handler<String> failOnBadHtml = new Handler<String>() {
    public void handle(String errorMessage) {
      fail(errorMessage);
    }
  };
  HtmlStreamRenderer renderer = HtmlStreamRenderer.create(out, failOnBadHtml);

  // Cleanup IDs and CLASSes and prefix them with p- to move to a separate
  // name-space.
  AttributePolicy prefixNames = new AttributePolicy() {
    public String apply(
        String elementName, String attributeName, String value) {
      // Put " p-" in front of every letter that starts the value or
      // follows a whitespace character ...
      String prefixed = value.replaceAll("(?:^|\\s)([a-zA-Z])", " p-$1");
      // ... then squeeze whitespace runs to one space and trim the ends.
      String collapsed = prefixed.replaceAll("\\s+", " ");
      return collapsed.trim();
    }
  };

  HtmlSanitizer.Policy policy = new HtmlPolicyBuilder()
      // Allow these tags.
      .allowElements(
          "a", "b", "br", "div", "i", "iframe", "img", "input", "li",
          "ol", "p", "span", "ul", "noscript", "noframes", "noembed", "noxss")
      // And these attributes.
      .allowAttributes(
          "dir", "checked", "class", "href", "id", "target", "title", "type")
      .globally()
      // id and class additionally go through the prefixing policy.
      .allowAttributes("id", "class")
      .matching(prefixNames)
      .globally()
      // Don't throw out useless <img> and <input> elements to ease debugging.
      .allowWithoutAttributes("img", "input")
      .build(renderer);

  HtmlSanitizer.sanitize(html, policy);
  return out.toString();
}
/**
 * Builds a string made of {@code n} back-to-back copies of {@code s}.
 *
 * @param s the text to repeat
 * @param n how many copies to append; zero yields the empty string
 * @return the concatenation of {@code n} copies of {@code s}
 */
private static final String stringRepeatedTimes(String s, int n) {
  // Size the buffer up front for the full result.
  StringBuilder repeated = new StringBuilder(s.length() * n);
  for (int remaining = n; remaining > 0; --remaining) {
    repeated.append(s);
  }
  return repeated.toString();
}
}