// Copyright (c) 2011, Mike Samuel
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// Neither the name of the OWASP nor the names of its contributors may
// be used to endorse or promote products derived from this software
// without specific prior written permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.

package org.owasp.html;

import java.util.Arrays;
import java.util.List;

import org.junit.Test;

import com.google.common.base.Charsets;
import com.google.common.collect.Lists;
import com.google.common.io.Resources;

import junit.framework.TestCase;
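
/**
 * Tests for {@code HtmlLexer}: runs markup through the lexer and compares the
 * resulting token stream against expected values, either inline or from a
 * golden file.
 */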
public class HtmlLexerTest extends TestCase {

  @Test
  public final void testHtmlLexer() throws Exception {
    // Do the lexing.
    String input = Resources.toString(
        Resources.getResource(getClass(), "htmllexerinput1.html"),
        Charsets.UTF_8);
    StringBuilder actual = new StringBuilder();
    lex(input, actual);

    // Get the golden.
    String golden = Resources.toString(
        Resources.getResource(getClass(), "htmllexergolden1.txt"),
        Charsets.UTF_8);

    // Compare.
    assertEquals(golden, actual.toString());
  }

  @Test
  public static final void testEofInTag() throws Exception {
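    // A tag truncated by end-of-input should still yield a TAGBEGIN token for
    // the partial open or close tag; trailing whitespace is not part of it.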
assertTokens("<div", "TAGBEGIN: <div");
assertTokens("</div", "TAGBEGIN: </div");
assertTokens("<div\n", "TAGBEGIN: <div");
assertTokens("</div\n", "TAGBEGIN: </div");
assertTokens("<div", "TAGBEGIN: <div");
assertTokens("</div", "TAGBEGIN: </div");
assertTokens("<div\n", "TAGBEGIN: <div");
assertTokens("</div\n", "TAGBEGIN: </div");
}

  @Test
  public static final void testPartialTagInCData() throws Exception {
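    // Inside a <script> element, text that merely looks like a partial close
    // tag ("</b") does not end the element; the body is emitted as a single
    // UNESCAPED token and only the "</script" close tag ends it.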
    assertTokens(
        "<script>w('</b')</script>",
        "TAGBEGIN: <script",
        "TAGEND: >",
        "UNESCAPED: w('</b')",
        "TAGBEGIN: </script",
        "TAGEND: >");
  }

  @Test
  public static final void testUrlEndingInSlashOutsideQuotes()
      throws Exception {
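    // In an unquoted attribute value, a trailing "/" belongs to the value
    // itself rather than being treated as the start of a self-closing "/>".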
    assertTokens(
        "<a href=http://foo.com/>Clicky</a>",
        "TAGBEGIN: <a",
        "ATTRNAME: href",
        "ATTRVALUE: http://foo.com/",
        "TAGEND: >",
        "TEXT: Clicky",
        "TAGBEGIN: </a",
        "TAGEND: >");
  }

  @Test
  public static final void testShortTags() throws Exception {
    // See comments in html-sanitizer-test.js as to why we don't bother with
    // short tags. In short, they are not in HTML5 and not implemented properly
    // in existing HTML4 clients.
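    // As the expected tokens below show, a stray "<" inside a tag does not
    // start a new tag; the lexer folds it into an attribute name of the
    // enclosing tag.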
    assertTokens(
        "<p<a href=\"/\">first part of the text</> second part",
        "TAGBEGIN: <p",
        "ATTRNAME: <a",
        "ATTRNAME: href",
        "ATTRVALUE: \"/\"",
        "TAGEND: >",
        "TEXT: first part of the text</> second part");
    assertTokens(
        "<p/b/",
        "TAGBEGIN: <p",
        "ATTRNAME: /",
        "ATTRNAME: b/");
    assertTokens(
        "<p<b>",
        "TAGBEGIN: <p",
        "ATTRNAME: <b",
        "TAGEND: >");
  }
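
  /**
   * Lexes {@code input} and writes one line per token to {@code out} in the
   * form {@code TYPE [escaped text] : start-end}, with type names padded to a
   * common width. This is the format of the golden file used by
   * {@link #testHtmlLexer}.
   */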
  private static void lex(String input, Appendable out) throws Exception {
    HtmlLexer lexer = new HtmlLexer(input);
    int maxTypeLength = 0;
    for (HtmlTokenType t : HtmlTokenType.values()) {
      maxTypeLength = Math.max(maxTypeLength, t.name().length());
    }
    while (lexer.hasNext()) {
      HtmlToken t = lexer.next();
      // Do C style escaping of the token text so that each token in the golden
      // file can fit on one line.
      String escaped = input.substring(t.start, t.end)
          .replace("\\", "\\\\").replace("\n", "\\n");
      String type = t.type.toString();
      int nPadding = maxTypeLength - type.length();
      out.append(type);
      while (--nPadding >= 0) { out.append(' '); }
      out.append(" [").append(escaped).append("] : ")
          .append(String.valueOf(t.start)).append('-')
          .append(String.valueOf(t.end))
          .append("\n");
    }
  }
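
  /**
   * Lexes {@code markup} and asserts that the resulting tokens, rendered as
   * {@code TYPE: text}, match {@code golden} exactly and in order.
   */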
  private static void assertTokens(String markup, String... golden) {
    HtmlLexer lexer = new HtmlLexer(markup);
    List<String> actual = Lists.newArrayList();
    while (lexer.hasNext()) {
      HtmlToken t = lexer.next();
      actual.add(t.type + ": " + markup.substring(t.start, t.end));
    }
    assertEquals(Arrays.asList(golden), actual);
  }
}