| // Copyright (c) 2011, Mike Samuel |
| // All rights reserved. |
| // |
| // Redistribution and use in source and binary forms, with or without |
| // modification, are permitted provided that the following conditions |
| // are met: |
| // |
| // Redistributions of source code must retain the above copyright |
| // notice, this list of conditions and the following disclaimer. |
| // Redistributions in binary form must reproduce the above copyright |
| // notice, this list of conditions and the following disclaimer in the |
| // documentation and/or other materials provided with the distribution. |
| // Neither the name of the OWASP nor the names of its contributors may |
| // be used to endorse or promote products derived from this software |
| // without specific prior written permission. |
| // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS |
| // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE |
| // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, |
| // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, |
| // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
| // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN |
| // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| // POSSIBILITY OF SUCH DAMAGE. |
| |
| package org.owasp.html; |
| |
| import junit.framework.TestCase; |
| |
| import javax.annotation.Nullable; |
| |
| import org.junit.Test; |
| |
| |
| public class HtmlSanitizerTest extends TestCase { |
| |
| @Test |
| public static final void testEmpty() throws Exception { |
| assertEquals("", sanitize("")); |
| assertEquals("", sanitize(null)); |
| } |
| |
| @Test |
| public static final void testSimpleText() throws Exception { |
| assertEquals("hello world", sanitize("hello world")); |
| } |
| |
| @Test |
| public static final void testEntities1() throws Exception { |
| assertEquals("<hello world>", sanitize("<hello world>")); |
| } |
| |
| @Test |
| public static final void testEntities2() throws Exception { |
| assertEquals("<b>hello <i>world</i></b>", |
| sanitize("<b>hello <i>world</i></b>")); |
| } |
| |
| @Test |
| public static final void testUnknownTagsRemoved() throws Exception { |
| assertEquals("<b>hello <i>world</i></b>", |
| sanitize("<b>hello <bogus></bogus><i>world</i></b>")); |
| } |
| |
| @Test |
| public static final void testUnsafeTagsRemoved() throws Exception { |
| assertEquals("<b>hello <i>world</i></b>", |
| sanitize("<b>hello <i>world</i>" |
| + "<script src=foo.js></script></b>")); |
| } |
| |
| @Test |
| public static final void testUnsafeAttributesRemoved() throws Exception { |
| assertEquals( |
| "<b>hello <i>world</i></b>", |
| sanitize("<b>hello <i onclick=\"takeOverWorld(this)\">world</i></b>")); |
| } |
| |
| @Test |
| public static final void testCruftEscaped() throws Exception { |
| assertEquals("<b>hello <i>world<</i></b> & tomorrow the universe", |
| sanitize( |
| "<b>hello <i>world<</i></b> & tomorrow the universe")); |
| } |
| |
| @Test |
| public static final void testTagCruftRemoved() throws Exception { |
| assertEquals("<b id=\"p-foo\">hello <i>world<</i></b>", |
| sanitize("<b id=\"foo\" / -->hello <i>world<</i></b>")); |
| } |
| |
| @Test |
| public static final void testIdsAndClassesPrefixed() throws Exception { |
| assertEquals( |
| "<b id=\"p-foo\" class=\"p-boo p-bar p-baz\">" |
| + "hello <i>world<</i></b>", |
| sanitize( |
| "<b id=\"foo\" class=\"boo bar baz\">hello <i>world<</i></b>")); |
| } |
| |
| @Test |
| public static final void testSpecialCharsInAttributes() throws Exception { |
| assertEquals( |
| "<b title=\"a<b && c>b\">bar</b>", |
| sanitize("<b title=\"a<b && c>b\">bar</b>")); |
| } |
| |
| @Test |
| public static final void testUnclosedTags() throws Exception { |
| assertEquals("<div id=\"p-foo\">Bar<br />Baz</div>", |
| sanitize("<div id=\"foo\">Bar<br>Baz")); |
| } |
| |
| @Test |
| public static final void testUnopenedTags() throws Exception { |
| assertEquals("Foo<b>Bar</b>Baz", |
| sanitize("Foo<b></select>Bar</b></b>Baz</select>")); |
| } |
| |
| @Test |
| public static final void testUnsafeEndTags() throws Exception { |
| assertEquals( |
| "", |
| sanitize( |
| "</meta http-equiv=\"refesh\"" |
| + " content=\"1;URL=http://evilgadget.com\">")); |
| } |
| |
| @Test |
| public static final void testEmptyEndTags() throws Exception { |
| assertEquals("<input />", sanitize("<input></input>")); |
| } |
| |
| @Test |
| public static final void testOnLoadStripped() throws Exception { |
| assertEquals( |
| "<img />", |
| sanitize("<img src=http://foo.com/bar ONLOAD=alert(1)>")); |
| } |
| |
| @Test |
| public static final void testClosingTagParameters() throws Exception { |
| assertEquals( |
| "<p>Hello world</p>", |
| sanitize("<p>Hello world</b style=\"width:expression(alert(1))\">")); |
| } |
| |
| @Test |
| public static final void testOptionalEndTags() throws Exception { |
| // Should not be |
| // "<ol> <li>A</li> <li>B<li>C </li></li></ol>" |
| // The difference is significant because in the first, the item contains no |
| // space after 'A", but in the third, the item contains 'C' and a space. |
| assertEquals( |
| "<ol><li>A</li><li>B</li><li>C </li></ol>", |
| sanitize("<ol> <li>A</li> <li>B<li>C </ol>")); |
| } |
| |
| @Test |
| public static final void testFoldingOfHtmlAndBodyTags() throws Exception { |
| assertEquals( |
| "<p>P 1</p>", |
| sanitize("<html><head><title>Foo</title></head>" |
| + "<body><p>P 1</p></body></html>")); |
| assertEquals( |
| "Hello", |
| sanitize("<body bgcolor=\"blue\">Hello</body>")); |
| assertEquals( |
| "<p>Foo</p><p>One</p><p>Two</p>Three<p>Four</p>", |
| sanitize( |
| "<html>" |
| + "<head>" |
| + "<title>Blah</title>" |
| + "<p>Foo</p>" |
| + "</head>" |
| + "<body>" |
| + "<p>One" |
| + "<p>Two</p>" |
| + "Three" |
| + "<p>Four</p>" |
| + "</body>" |
| + "</html>")); |
| } |
| |
| @Test |
| public static final void testEmptyAndValuelessAttributes() throws Exception { |
| assertEquals( |
| "<input checked=\"checked\" type=\"checkbox\" id=\"\" class=\"\" />", |
| sanitize("<input checked type=checkbox id=\"\" class=>")); |
| } |
| |
| @Test |
| public static final void testSgmlShortTags() throws Exception { |
| // We make no attempt to correctly handle SGML short tags since they are |
| // not implemented consistently across browsers, and have been removed from |
| // HTML 5. |
| // |
| // According to http://www.w3.org/QA/2007/10/shorttags.html |
| // Shorttags - the odd side of HTML 4.01 |
| // ... |
| // It uses an ill-known feature of SGML called shorthand markup, which |
| // was authorized in HTML up to HTML 4.01. But what used to be a "cool" |
| // feature for SGML experts becomes a liability in HTML, where the |
| // construct is more likely to appear as a typo than as a conscious |
| // choice. |
| // |
| // All could be fine if this form typo-that-happens-to-be-legal was |
| // properly implemented in contemporary HTML user-agents. It is not. |
| assertEquals("<p></p>", sanitize("<p/b/")); // Short-tag discarded. |
| assertEquals("<p></p>", sanitize("<p<b>")); // Discard <b attribute |
| assertEquals( |
| // This behavior for short tags is not ideal, but it is safe. |
| "<p href=\"/\">first part of the text</> second part</p>", |
| sanitize("<p<a href=\"/\">first part of the text</> second part")); |
| } |
| |
| @Test |
| public static final void testNul() throws Exception { |
| assertEquals( |
| "<a title=" |
| + "\"harmless SCRIPT=javascript:alert(1) ignored=ignored\">" |
| + "</a>", |
| sanitize( |
| "<A TITLE=" |
| + "\"harmless\0 SCRIPT=javascript:alert(1) ignored=ignored\">" |
| )); |
| } |
| |
| @Test |
| public static final void testDigitsInAttrNames() throws Exception { |
| // See bug 614 for details. |
| assertEquals( |
| "<div>Hello</div>", |
| sanitize( |
| "<div style1=\"expression(\'alert(1)\")\">Hello</div>" |
| )); |
| } |
| |
| @Test |
| public static final void testSupplementaryCodepointEncoding() |
| throws Exception { |
| // �� is not appropriate. |
| // 冬 is appropriate as is the unencoded form. |
| assertEquals( |
| "冬 | 冬 | 冬", |
| sanitize("冬 | \ud87e\udc1a | ��")); |
| } |
| |
| @Test |
| public static final void testDeeplyNestedTagsDoS() throws Exception { |
| String sanitized = sanitize(stringRepeatedTimes("<div>", 20000)); |
| int n = sanitized.length() / "<div></div>".length(); |
| assertTrue("" + n, 50 <= n && n <= 1000); |
| int middle = n * "<div>".length(); |
| assertEquals(sanitized.substring(0, middle), |
| stringRepeatedTimes("<div>", n)); |
| assertEquals(sanitized.substring(middle), |
| stringRepeatedTimes("</div>", n)); |
| } |
| |
| @Test |
| public static final void testInnerHTMLIE8() throws Exception { |
| // Apparently, in quirks mode, IE8 does a poor job producing innerHTML |
| // values. Given |
| // <div attr="``foo=bar"> |
| // we encode ` but if JavaScript does: |
| // nodeA.innerHTML = nodeB.innerHTML; |
| // and nodeB contains the DIV above, then IE8 will produce |
| // <div attr=``foo=bar> |
| // as the value of nodeB.innerHTML and assign it to nodeA. |
| // IE8's HTML parser treats `` as a blank attribute value and foo=bar |
| // becomes a separate attribute. |
| // Adding a space at the end of the attribute prevents this by forcing |
| // IE8 to put double quotes around the attribute when computing |
| // nodeB.innerHTML. |
| assertEquals( |
| "<div title=\"``onmouseover=alert(1337) \"></div>", |
| sanitize("<div title=\"``onmouseover=alert(1337)\">")); |
| } |
| |
| @Test |
| public static final void testNabobsOfNegativism() throws Exception { |
| // Treating <noscript> as raw-text gains us nothing security-wise. |
| assertEquals("<noscript></noscript>", |
| sanitize("<noscript><evil></noscript>")); |
| assertEquals("<noscript>I <b><3</b> Ponies</noscript>", |
| sanitize("<noscript>I <b><3</b> Ponies</noscript>")); |
| assertEquals("<noscript>I <b><3</b> Ponies</noscript>", |
| sanitize("<NOSCRIPT>I <b><3</b> Ponies</noscript><evil>")); |
| assertEquals("<noframes>I <b><3</b> Ponies</noframes>", |
| sanitize("<noframes>I <b><3</b> Ponies</noframes><evil>")); |
| assertEquals("<noembed>I <b><3</b> Ponies</noembed>", |
| sanitize("<noembed>I <b><3</b> Ponies</noembed><evil>")); |
| assertEquals("<noxss>I <b><3</b> Ponies</noxss>", |
| sanitize("<noxss>I <b><3</b> Ponies</noxss><evil>")); |
| assertEquals( |
| "<noscript>I <b><3</b> Ponies</noscript>", |
| sanitize("<xmp><noscript>I <b><3</b> Ponies</noscript></xmp>")); |
| } |
| |
| @Test |
| public static final void testNULs() throws Exception { |
| assertEquals("<b>Hello, </b>", sanitize("<b>Hello, \u0000</b>")); |
| assertEquals("<b>Hello, </b>", sanitize("<b>Hello, \u0000")); |
| assertEquals("", sanitize("\u0000")); |
| assertEquals("<b>Hello, </b>", sanitize("<b>Hello, �</b>")); |
| assertEquals("", sanitize("�")); |
| } |
| |
| @Test |
| public static final void testQMarkMeta() throws Exception { |
| assertEquals( |
| "Hello, <b>World</b>!", |
| sanitize( |
| "" |
| // An XML Prologue. |
| // HTML5 treats it as ignorable content via the bogus comment state. |
| + "<?xml version=\"1\" ?>" |
| + "Hello, " |
| // An XML Processing instruction. |
| // HTML5 treats it as ignorable content via the bogus comment state. |
| + "<?processing instruction?>" |
| + "<b>World" |
| // Appears in HTML copied from outlook. |
| + "<?xml:namespace prefix = o ns = " |
| + "\"urn:schemas-microsoft-com:office:office\" />" |
| + "</b>!")); |
| } |
| |
| @Test |
| public static final void testScriptInIframe() throws Exception { |
| assertEquals( |
| "<iframe></iframe>", |
| sanitize( |
| "<iframe>\n" |
| + " <script>alert(Hi)</script>\n" |
| + "</iframe>")); |
| } |
| |
| private static String sanitize(@Nullable String html) throws Exception { |
| StringBuilder sb = new StringBuilder(); |
| HtmlStreamRenderer renderer = HtmlStreamRenderer.create( |
| sb, |
| new Handler<String>() { |
| public void handle(String errorMessage) { |
| fail(errorMessage); |
| } |
| }); |
| |
| HtmlSanitizer.Policy policy = new HtmlPolicyBuilder() |
| // Allow these tags. |
| .allowElements( |
| "a", "b", "br", "div", "i", "iframe", "img", "input", "li", |
| "ol", "p", "span", "ul", "noscript", "noframes", "noembed", "noxss") |
| // And these attributes. |
| .allowAttributes( |
| "dir", "checked", "class", "href", "id", "target", "title", "type") |
| .globally() |
| // Cleanup IDs and CLASSes and prefix them with p- to move to a separate |
| // name-space. |
| .allowAttributes("id", "class") |
| .matching( |
| new AttributePolicy() { |
| public String apply( |
| String elementName, String attributeName, String value) { |
| return value.replaceAll("(?:^|\\s)([a-zA-Z])", " p-$1") |
| .replaceAll("\\s+", " ") |
| .trim(); |
| } |
| }) |
| .globally() |
| // Don't throw out useless <img> and <input> elements to ease debugging. |
| .allowWithoutAttributes("img", "input") |
| .build(renderer); |
| |
| HtmlSanitizer.sanitize(html, policy); |
| |
| return sb.toString(); |
| } |
| |
| private static final String stringRepeatedTimes(String s, int n) { |
| StringBuilder sb = new StringBuilder(s.length() * n); |
| while (--n >= 0) { |
| sb.append(s); |
| } |
| return sb.toString(); |
| } |
| } |