// src/tests/org/owasp/html/HtmlLexerTest.java - platform/external/owasp/sanitizer - Git at Google

 // Copyright (c) 2011, Mike Samuel
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
 // are met:
 //
 // Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
 // Redistributions in binary form must reproduce the above copyright
 // notice, this list of conditions and the following disclaimer in the
 // documentation and/or other materials provided with the distribution.
 // Neither the name of the OWASP nor the names of its contributors may
 // be used to endorse or promote products derived from this software
 // without specific prior written permission.
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 // POSSIBILITY OF SUCH DAMAGE.

 package org.owasp.html;

 import junit.framework.TestCase;

 import java.util.Arrays;
 import java.util.List;

 import org.junit.Test;

 import com.google.common.base.Charsets;
 import com.google.common.collect.Lists;
 import com.google.common.io.Resources;

 /**
  * Tests for {@link HtmlLexer}: one golden-file comparison plus several small
  * inline cases that pin the exact token stream produced for awkward markup.
  *
  * <p>NOTE(review): the class extends {@code TestCase} and also annotates its
  * methods with {@code @Test}, several of which are static.  Presumably the
  * JUnit 3 name-prefix discovery is what actually runs them -- confirm before
  * moving this class to a JUnit 4 runner.
  */
 public class HtmlLexerTest extends TestCase {

   /**
    * Lexes the {@code htmllexerinput1.html} resource, dumps the tokens with
    * {@code lex}, and compares the dump to {@code htmllexergolden1.txt}.
    */
   @Test
   public final void testHtmlLexer() throws Exception {
     // Do the lexing.
     String input = Resources.toString(
         Resources.getResource(getClass(), "htmllexerinput1.html"),
         Charsets.UTF_8);
     StringBuilder actual = new StringBuilder();
     lex(input, actual);

     // Get the golden.
     String golden = Resources.toString(
         Resources.getResource(getClass(), "htmllexergolden1.txt"),
         Charsets.UTF_8);

     // Compare.
     assertEquals(golden, actual.toString());
   }

   /**
    * A tag that is cut off by the end of input, with or without a trailing
    * newline, is expected to yield a single TAGBEGIN token and nothing else.
    */
   @Test
   public static final void testEofInTag() throws Exception {
     assertTokens("<div", "TAGBEGIN: <div");
     assertTokens("</div", "TAGBEGIN: </div");
     assertTokens("<div\n", "TAGBEGIN: <div");
     assertTokens("</div\n", "TAGBEGIN: </div");
   }

   /**
    * A partial close tag ({@code </b}) inside a script body is expected to stay
    * inside the UNESCAPED token instead of starting a new tag.
    */
   @Test
   public static final void testPartialTagInCData() throws Exception {
     assertTokens(
         "<script>w('</b')</script>",
         "TAGBEGIN: <script",
         "TAGEND: >",
         "UNESCAPED: w('</b')",
         "TAGBEGIN: </script",
         "TAGEND: >");
   }

   /**
    * The trailing slash of an unquoted attribute value is expected to be part
    * of the ATTRVALUE token, not a separate token before the TAGEND.
    */
   @Test
   public static final void testUrlEndingInSlashOutsideQuotes()
       throws Exception {
     assertTokens(
         "<a href=http://foo.com/>Clicky</a>",
         "TAGBEGIN: <a",
         "ATTRNAME: href",
         "ATTRVALUE: http://foo.com/",
         "TAGEND: >",
         "TEXT: Clicky",
         "TAGBEGIN: </a",
         "TAGEND: >");
   }

   /**
    * SGML short-tag constructs are not given any special treatment: the
    * expected token streams below treat them as attribute names or text.
    */
   @Test
   public static final void testShortTags() throws Exception {
     // See comments in html-sanitizer-test.js as to why we don't bother with
     // short tags.  In short, they are not in HTML5 and not implemented properly
     // in existing HTML4 clients.
     assertTokens(
         "<p<a href=\"/\">first part of the text</> second part",
         "TAGBEGIN: <p",
         "ATTRNAME: <a",
         "ATTRNAME: href",
         "ATTRVALUE: \"/\"",
         "TAGEND: >",
         "TEXT: first part of the text</> second part");
     assertTokens(
         "<p/b/",
         "TAGBEGIN: <p",
         "ATTRNAME: /",
         "ATTRNAME: b/");
     assertTokens(
         "<p<b>",
         "TAGBEGIN: <p",
         "ATTRNAME: <b",
         "TAGEND: >");
   }

   /**
    * Lexes {@code input} and appends one line per token to {@code out}.
    *
    * <p>Each line holds the token type, padded with spaces to the length of the
    * longest {@link HtmlTokenType} name, then {@code " [text]  :  start-end"}
    * and a newline.  Backslashes and newlines in the token text are escaped.
    *
    * @param input the markup to lex
    * @param out receives the token dump
    * @throws Exception if appending to {@code out} fails
    */
   private static void lex(String input, Appendable out) throws Exception {
     HtmlLexer lexer = new HtmlLexer(input);
     // The type column is as wide as the longest token type name.
     int maxTypeLength = 0;
     for (HtmlTokenType t : HtmlTokenType.values()) {
       maxTypeLength = Math.max(maxTypeLength, t.name().length());
     }

     while (lexer.hasNext()) {
       HtmlToken t = lexer.next();
       // Do C style escaping of the token text so that each token in the golden
       // file can fit on one line.
       String escaped = input.substring(t.start, t.end)
           .replace("\\", "\\\\").replace("\n", "\\n");
       String type = t.type.toString();
       int nPadding = maxTypeLength - type.length();
       out.append(type);
       while (--nPadding >= 0) {
         out.append(' ');
       }
       out.append(" [").append(escaped).append("]  :  ")
           .append(String.valueOf(t.start)).append('-')
           .append(String.valueOf(t.end))
           .append("\n");
     }
   }

   /**
    * Lexes {@code markup} and asserts that its tokens, each rendered as
    * {@code "TYPE: text"}, equal {@code golden} in order.
    *
    * @param markup the markup to lex
    * @param golden the expected token renderings, in order
    */
   private static void assertTokens(String markup, String... golden) {
     HtmlLexer lexer = new HtmlLexer(markup);
     List<String> actual = Lists.newArrayList();
     while (lexer.hasNext()) {
       HtmlToken t = lexer.next();
       actual.add(t.type + ": " + markup.substring(t.start, t.end));
     }
     assertEquals(Arrays.asList(golden), actual);
   }
 }
	// Copyright (c) 2011, Mike Samuel
	// All rights reserved.
	//
	// Redistribution and use in source and binary forms, with or without
	// modification, are permitted provided that the following conditions
	// are met:
	//
	// Redistributions of source code must retain the above copyright
	// notice, this list of conditions and the following disclaimer.
	// Redistributions in binary form must reproduce the above copyright
	// notice, this list of conditions and the following disclaimer in the
	// documentation and/or other materials provided with the distribution.
	// Neither the name of the OWASP nor the names of its contributors may
	// be used to endorse or promote products derived from this software
	// without specific prior written permission.
	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
	// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
	// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
	// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	// POSSIBILITY OF SUCH DAMAGE.

	package org.owasp.html;

	import junit.framework.TestCase;

	import java.util.Arrays;
	import java.util.List;

	import org.junit.Test;

	import com.google.common.base.Charsets;
	import com.google.common.collect.Lists;
	import com.google.common.io.Resources;

	/**
	 * Tests for {@link HtmlLexer}: one golden-file comparison plus several small
	 * inline cases that pin the exact token stream produced for awkward markup.
	 *
	 * <p>NOTE(review): the class extends {@code TestCase} and also annotates its
	 * methods with {@code @Test}, several of which are static.  Presumably the
	 * JUnit 3 name-prefix discovery is what actually runs them -- confirm before
	 * moving this class to a JUnit 4 runner.
	 */
	public class HtmlLexerTest extends TestCase {

	  /**
	   * Lexes the {@code htmllexerinput1.html} resource, dumps the tokens with
	   * {@code lex}, and compares the dump to {@code htmllexergolden1.txt}.
	   */
	  @Test
	  public final void testHtmlLexer() throws Exception {
	    // Do the lexing.
	    String input = Resources.toString(
	        Resources.getResource(getClass(), "htmllexerinput1.html"),
	        Charsets.UTF_8);
	    StringBuilder actual = new StringBuilder();
	    lex(input, actual);

	    // Get the golden.
	    String golden = Resources.toString(
	        Resources.getResource(getClass(), "htmllexergolden1.txt"),
	        Charsets.UTF_8);

	    // Compare.
	    assertEquals(golden, actual.toString());
	  }

	  /**
	   * A tag that is cut off by the end of input, with or without a trailing
	   * newline, is expected to yield a single TAGBEGIN token and nothing else.
	   */
	  @Test
	  public static final void testEofInTag() throws Exception {
	    assertTokens("<div", "TAGBEGIN: <div");
	    assertTokens("</div", "TAGBEGIN: </div");
	    assertTokens("<div\n", "TAGBEGIN: <div");
	    assertTokens("</div\n", "TAGBEGIN: </div");
	  }

	  /**
	   * A partial close tag ({@code </b}) inside a script body is expected to stay
	   * inside the UNESCAPED token instead of starting a new tag.
	   */
	  @Test
	  public static final void testPartialTagInCData() throws Exception {
	    assertTokens(
	        "<script>w('</b')</script>",
	        "TAGBEGIN: <script",
	        "TAGEND: >",
	        "UNESCAPED: w('</b')",
	        "TAGBEGIN: </script",
	        "TAGEND: >");
	  }

	  /**
	   * The trailing slash of an unquoted attribute value is expected to be part
	   * of the ATTRVALUE token, not a separate token before the TAGEND.
	   */
	  @Test
	  public static final void testUrlEndingInSlashOutsideQuotes()
	      throws Exception {
	    assertTokens(
	        "<a href=http://foo.com/>Clicky</a>",
	        "TAGBEGIN: <a",
	        "ATTRNAME: href",
	        "ATTRVALUE: http://foo.com/",
	        "TAGEND: >",
	        "TEXT: Clicky",
	        "TAGBEGIN: </a",
	        "TAGEND: >");
	  }

	  /**
	   * SGML short-tag constructs are not given any special treatment: the
	   * expected token streams below treat them as attribute names or text.
	   */
	  @Test
	  public static final void testShortTags() throws Exception {
	    // See comments in html-sanitizer-test.js as to why we don't bother with
	    // short tags.  In short, they are not in HTML5 and not implemented properly
	    // in existing HTML4 clients.
	    assertTokens(
	        "<p<a href=\"/\">first part of the text</> second part",
	        "TAGBEGIN: <p",
	        "ATTRNAME: <a",
	        "ATTRNAME: href",
	        "ATTRVALUE: \"/\"",
	        "TAGEND: >",
	        "TEXT: first part of the text</> second part");
	    assertTokens(
	        "<p/b/",
	        "TAGBEGIN: <p",
	        "ATTRNAME: /",
	        "ATTRNAME: b/");
	    assertTokens(
	        "<p<b>",
	        "TAGBEGIN: <p",
	        "ATTRNAME: <b",
	        "TAGEND: >");
	  }

	  /**
	   * Lexes {@code input} and appends one line per token to {@code out}.
	   *
	   * <p>Each line holds the token type, padded with spaces to the length of the
	   * longest {@link HtmlTokenType} name, then {@code " [text]  :  start-end"}
	   * and a newline.  Backslashes and newlines in the token text are escaped.
	   *
	   * @param input the markup to lex
	   * @param out receives the token dump
	   * @throws Exception if appending to {@code out} fails
	   */
	  private static void lex(String input, Appendable out) throws Exception {
	    HtmlLexer lexer = new HtmlLexer(input);
	    // The type column is as wide as the longest token type name.
	    int maxTypeLength = 0;
	    for (HtmlTokenType t : HtmlTokenType.values()) {
	      maxTypeLength = Math.max(maxTypeLength, t.name().length());
	    }

	    while (lexer.hasNext()) {
	      HtmlToken t = lexer.next();
	      // Do C style escaping of the token text so that each token in the golden
	      // file can fit on one line.
	      String escaped = input.substring(t.start, t.end)
	          .replace("\\", "\\\\").replace("\n", "\\n");
	      String type = t.type.toString();
	      int nPadding = maxTypeLength - type.length();
	      out.append(type);
	      while (--nPadding >= 0) {
	        out.append(' ');
	      }
	      // The separator has two spaces on each side of the colon; the dump is
	      // compared verbatim against the golden file, so spacing matters.
	      out.append(" [").append(escaped).append("]  :  ")
	          .append(String.valueOf(t.start)).append('-')
	          .append(String.valueOf(t.end))
	          .append("\n");
	    }
	  }

	  /**
	   * Lexes {@code markup} and asserts that its tokens, each rendered as
	   * {@code "TYPE: text"}, equal {@code golden} in order.
	   *
	   * @param markup the markup to lex
	   * @param golden the expected token renderings, in order
	   */
	  private static void assertTokens(String markup, String... golden) {
	    HtmlLexer lexer = new HtmlLexer(markup);
	    List<String> actual = Lists.newArrayList();
	    while (lexer.hasNext()) {
	      HtmlToken t = lexer.next();
	      actual.add(t.type + ": " + markup.substring(t.start, t.end));
	    }
	    assertEquals(Arrays.asList(golden), actual);
	  }
	}