src/main/org/owasp/html/HtmlSanitizer.java - platform/external/owasp/sanitizer - Git at Google

 // Copyright (c) 2011, Mike Samuel
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
 // are met:
 //
 // Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
 // Redistributions in binary form must reproduce the above copyright
 // notice, this list of conditions and the following disclaimer in the
 // documentation and/or other materials provided with the distribution.
 // Neither the name of the OWASP nor the names of its contributors may
 // be used to endorse or promote products derived from this software
 // without specific prior written permission.
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 // POSSIBILITY OF SUCH DAMAGE.

 package org.owasp.html;

 import java.util.LinkedList;
 import java.util.List;
 import javax.annotation.Nullable;

 import com.google.common.collect.Lists;

 /**
  * Consumes an HTML stream, and dispatches events to a policy object which
  * decides which elements and attributes to allow.
  */
 public final class HtmlSanitizer {

   /**
    * Receives events based on the HTML stream, and applies a policy to decide
    * what HTML constructs to allow.
    * Typically, implementations use an {@link HtmlStreamRenderer} to produce
    * the sanitized output.
    *
    * <p>
    * <b>Implementations of this interface are in the TCB.</b></p>
    */
   @TCB
   public interface Policy extends HtmlStreamEventReceiver {
     /**
      * Called when an HTML tag like {@code <foo bar=baz>} is seen in the input.
      *
      * @param elementName a normalized (lower-case for non-namespaced names)
      *     element name.
      * @param attrs a list of alternating attribute names and values, i.e.
      *     {@code [name0, value0, name1, value1, ...]}.
      *     For efficiency, this list may be mutated by the implementation
      *     during this method call, but ownership reverts to the caller on
      *     method exit, so implementations should not retain a reference to
      *     it afterwards.
      *     The values are raw -- HTML entities have been decoded.
      *     Specifically, implementations are allowed to use a list iterator
      *     and remove all disallowed attributes, add necessary attributes, and
      *     then pass the list to an {@link HtmlStreamRenderer}.
      */
     void openTag(String elementName, List<String> attrs);

     /**
      * Called when an HTML end tag like {@code </foo>} is seen in the input.
      *
      * @param elementName a normalized (lower-case for non-namespaced names)
      *     element name.
      */
     void closeTag(String elementName);

     /**
      * Called when textual content is seen.
      *
      * @param textChunk raw content -- HTML entities have been decoded.
      */
     void text(String textChunk);
   }

   /**
    * Sanitizes the given HTML by applying the given policy to it.
    *
    * <p>
    * The input is tokenized with an {@link HtmlLexer}.  Text, open-tag and
    * close-tag events are sent through a
    * {@link TagBalancingHtmlStreamEventReceiver} that wraps {@code policy};
    * tokens of any other kind (comments, XML prologues, processing
    * instructions, ...) are ignored.
    *
    * <p>
    * This method is not in the TCB.
    *
    * <p>
    * Nothing is returned: policies are assumed to render the things they
    * accept and to do nothing on the things they reject.
    * Use {@link HtmlStreamRenderer} to render content to an output buffer.
    *
    * @param html A snippet of HTML to sanitize.  {@code null} is treated as the
    *     empty string and will not result in a {@code NullPointerException}.
    * @param policy The Policy that will receive events based on the tokens in
    *     HTML.  Typically, this policy ends up routing the events to an
    *     {@link HtmlStreamRenderer} after filtering.
    *     {@link HtmlPolicyBuilder} provides an easy way to create policies.
    */
   public static void sanitize(@Nullable String html, final Policy policy) {
     final String markup = (html == null) ? "" : html;

     TagBalancingHtmlStreamEventReceiver tagBalancer
         = new TagBalancingHtmlStreamEventReceiver(policy);

     // How the nesting limit was chosen:
     //
     // Lower bound.  According to Opera the deepest table nesting seen in the
     // wild is 795, but 99.99% of documents nest tables fewer than 22 deep.
     // Each table contributes a depth of 4 (incl. TBODY), which gives a
     // document depth of 90 (incl. HTML & BODY).  Table nesting depth is
     // obviously not whole-document depth, but it is the best proxy available.
     // Original data:
     // http://devfiles.myopera.com/articles/590/maxtabledepth-url.htm
     //
     // Upper bound.  Webkit caps the HTML parser tree depth at 512:
     // http://trac.webkit.org/browser/trunk/Source/WebCore/page/Settings.h#L408
     // static const unsigned defaultMaximumHTMLParserDOMTreeDepth = 512;
     //
     // 256 sits substantially above the lower bound (90) while staying well
     // clear of the upper bound (512), which we do not want to bump against.
     tagBalancer.setNestingLimit(256);

     tagBalancer.openDocument();

     HtmlLexer tokens = new HtmlLexer(markup);
     // A linked list lets policies call Iterator.remove() in O(1).
     LinkedList<String> attrList = Lists.newLinkedList();
     while (tokens.hasNext()) {
       HtmlToken tok = tokens.next();
       switch (tok.type) {
         case TEXT: {
           String encodedText = markup.substring(tok.start, tok.end);
           tagBalancer.text(Encoding.decodeHtml(encodedText));
           break;
         }
         case UNESCAPED: {
           String unescapedText = markup.substring(tok.start, tok.end);
           tagBalancer.text(Encoding.stripBannedCodeunits(unescapedText));
           break;
         }
         case TAGBEGIN: {
           if (markup.charAt(tok.start + 1) != '/') {  // An open tag.
             attrList.clear();

             // True while the most recently added attribute name has not
             // been paired with a value yet.
             boolean nameAwaitsValue = false;
             boolean insideTag = true;
             while (insideTag && tokens.hasNext()) {
               HtmlToken part = tokens.next();
               switch (part.type) {
                 case ATTRNAME: {
                   if (nameAwaitsValue) {
                     // The previous attribute was valueless, so its own
                     // name doubles as its value.
                     attrList.add(attrList.getLast());
                   }
                   nameAwaitsValue = true;
                   String rawName = markup.substring(part.start, part.end);
                   attrList.add(HtmlLexer.canonicalName(rawName));
                   break;
                 }
                 case ATTRVALUE: {
                   String quotedValue = markup.substring(part.start, part.end);
                   attrList.add(Encoding.decodeHtml(stripQuotes(quotedValue)));
                   nameAwaitsValue = false;
                   break;
                 }
                 case TAGEND:
                   // The ">" ends the tag body.
                   insideTag = false;
                   break;
                 default:
                   // Anything unrecognized is simply dropped.
                   break;
               }
             }
             if (nameAwaitsValue) {
               // The final attribute in the tag was valueless.
               attrList.add(attrList.getLast());
             }
             String openedName = HtmlLexer.canonicalName(
                 markup.substring(tok.start + 1, tok.end));
             tagBalancer.openTag(openedName, attrList);
           } else {  // A close tag.
             String closedName = HtmlLexer.canonicalName(
                 markup.substring(tok.start + 2, tok.end));
             tagBalancer.closeTag(closedName);
             // Discard everything up to and including the ">".
             boolean reachedTagEnd = false;
             while (!reachedTagEnd && tokens.hasNext()) {
               reachedTagEnd = tokens.next().type == HtmlTokenType.TAGEND;
             }
           }
           break;
         }
         default:
           // Comments, XML prologues, processing instructions, and other
           // stuff that shouldn't show up in the output are ignored.
           break;
       }
     }

     tagBalancer.closeDocument();
   }

   /**
    * Removes the quote characters that delimit a still-encoded attribute
    * value.
    *
    * <p>
    * A value that ends in {@code "} or {@code '} loses that trailing quote.
    * The first character is dropped as well only when it is the same quote
    * character.  A value that does not end in a quote is returned unchanged,
    * even if it starts with one.
    *
    * @param encodedAttributeValue the attribute value as it appears in the
    *     markup, including any surrounding quotes.
    * @return the value without its delimiting quotes.
    */
   private static String stripQuotes(String encodedAttributeValue) {
     final int length = encodedAttributeValue.length();
     if (length == 0) {
       return encodedAttributeValue;
     }

     final int lastIndex = length - 1;
     final char closer = encodedAttributeValue.charAt(lastIndex);
     if (closer != '"' && closer != '\'') {
       return encodedAttributeValue;
     }

     // Browsers deal with missing left quotes : <img src=foo.png">
     // but generally do not deal with missing right : <img src="foo.png>
     // so the leading character only goes when it matches the closing quote.
     final boolean hasMatchingOpener =
         lastIndex > 0 && encodedAttributeValue.charAt(0) == closer;
     return encodedAttributeValue.substring(
         hasMatchingOpener ? 1 : 0, lastIndex);
   }

 }
	// Copyright (c) 2011, Mike Samuel
	// All rights reserved.
	//
	// Redistribution and use in source and binary forms, with or without
	// modification, are permitted provided that the following conditions
	// are met:
	//
	// Redistributions of source code must retain the above copyright
	// notice, this list of conditions and the following disclaimer.
	// Redistributions in binary form must reproduce the above copyright
	// notice, this list of conditions and the following disclaimer in the
	// documentation and/or other materials provided with the distribution.
	// Neither the name of the OWASP nor the names of its contributors may
	// be used to endorse or promote products derived from this software
	// without specific prior written permission.
	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
	// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
	// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
	// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	// POSSIBILITY OF SUCH DAMAGE.

	package org.owasp.html;

	import java.util.LinkedList;
	import java.util.List;
	import javax.annotation.Nullable;

	import com.google.common.collect.Lists;

	/**
	* Consumes an HTML stream, and dispatches events to a policy object which
	* decides which elements and attributes to allow.
	*/
	public final class HtmlSanitizer {

	/**
	* Receives events based on the HTML stream, and applies a policy to decide
	* what HTML constructs to allow.
	* Typically, implementations use an {@link HtmlStreamRenderer} to produce
	* the sanitized output.
	*
	* <p>
	* <b>Implementations of this interface are in the TCB.</b></p>
	*/
	@TCB
	public interface Policy extends HtmlStreamEventReceiver {
	/**
	* Called when an HTML tag like {@code <foo bar=baz>} is seen in the input.
	*
	* @param elementName a normalized (lower-case for non-namespaced names)
	* element name.
	* @param attrs a list of alternating attribute names and values, i.e.
	* {@code [name0, value0, name1, value1, ...]}.
	* For efficiency, this list may be mutated by the implementation
	* during this method call, but ownership reverts to the caller on
	* method exit, so implementations should not retain a reference to
	* it afterwards.
	* The values are raw -- HTML entities have been decoded.
	* Specifically, implementations are allowed to use a list iterator
	* and remove all disallowed attributes, add necessary attributes, and
	* then pass the list to an {@link HtmlStreamRenderer}.
	*/
	void openTag(String elementName, List<String> attrs);

	/**
	* Called when an HTML end tag like {@code </foo>} is seen in the input.
	*
	* @param elementName a normalized (lower-case for non-namespaced names)
	* element name.
	*/
	void closeTag(String elementName);

	/**
	* Called when textual content is seen.
	*
	* @param textChunk raw content -- HTML entities have been decoded.
	*/
	void text(String textChunk);
	}

	/**
	 * Sanitizes the given HTML by applying the given policy to it.
	 *
	 * <p>
	 * The input is tokenized with an {@link HtmlLexer}. Text, open-tag and
	 * close-tag events are sent through a
	 * {@link TagBalancingHtmlStreamEventReceiver} that wraps {@code policy};
	 * tokens of any other kind (comments, XML prologues, processing
	 * instructions, ...) are ignored.
	 *
	 * <p>
	 * This method is not in the TCB.
	 *
	 * <p>
	 * Nothing is returned: policies are assumed to render the things they
	 * accept and to do nothing on the things they reject.
	 * Use {@link HtmlStreamRenderer} to render content to an output buffer.
	 *
	 * @param html A snippet of HTML to sanitize. {@code null} is treated as the
	 *     empty string and will not result in a {@code NullPointerException}.
	 * @param policy The Policy that will receive events based on the tokens in
	 *     HTML. Typically, this policy ends up routing the events to an
	 *     {@link HtmlStreamRenderer} after filtering.
	 *     {@link HtmlPolicyBuilder} provides an easy way to create policies.
	 */
	public static void sanitize(@Nullable String html, final Policy policy) {
	  final String markup = (html == null) ? "" : html;

	  TagBalancingHtmlStreamEventReceiver tagBalancer
	      = new TagBalancingHtmlStreamEventReceiver(policy);

	  // How the nesting limit was chosen:
	  //
	  // Lower bound. According to Opera the deepest table nesting seen in the
	  // wild is 795, but 99.99% of documents nest tables fewer than 22 deep.
	  // Each table contributes a depth of 4 (incl. TBODY), which gives a
	  // document depth of 90 (incl. HTML & BODY). Table nesting depth is
	  // obviously not whole-document depth, but it is the best proxy available.
	  // Original data:
	  // http://devfiles.myopera.com/articles/590/maxtabledepth-url.htm
	  //
	  // Upper bound. Webkit caps the HTML parser tree depth at 512:
	  // http://trac.webkit.org/browser/trunk/Source/WebCore/page/Settings.h#L408
	  // static const unsigned defaultMaximumHTMLParserDOMTreeDepth = 512;
	  //
	  // 256 sits substantially above the lower bound (90) while staying well
	  // clear of the upper bound (512), which we do not want to bump against.
	  tagBalancer.setNestingLimit(256);

	  tagBalancer.openDocument();

	  HtmlLexer tokens = new HtmlLexer(markup);
	  // A linked list lets policies call Iterator.remove() in O(1).
	  LinkedList<String> attrList = Lists.newLinkedList();
	  while (tokens.hasNext()) {
	    HtmlToken tok = tokens.next();
	    switch (tok.type) {
	      case TEXT: {
	        String encodedText = markup.substring(tok.start, tok.end);
	        tagBalancer.text(Encoding.decodeHtml(encodedText));
	        break;
	      }
	      case UNESCAPED: {
	        String unescapedText = markup.substring(tok.start, tok.end);
	        tagBalancer.text(Encoding.stripBannedCodeunits(unescapedText));
	        break;
	      }
	      case TAGBEGIN: {
	        if (markup.charAt(tok.start + 1) != '/') { // An open tag.
	          attrList.clear();

	          // True while the most recently added attribute name has not
	          // been paired with a value yet.
	          boolean nameAwaitsValue = false;
	          boolean insideTag = true;
	          while (insideTag && tokens.hasNext()) {
	            HtmlToken part = tokens.next();
	            switch (part.type) {
	              case ATTRNAME: {
	                if (nameAwaitsValue) {
	                  // The previous attribute was valueless, so its own
	                  // name doubles as its value.
	                  attrList.add(attrList.getLast());
	                }
	                nameAwaitsValue = true;
	                String rawName = markup.substring(part.start, part.end);
	                attrList.add(HtmlLexer.canonicalName(rawName));
	                break;
	              }
	              case ATTRVALUE: {
	                String quotedValue = markup.substring(part.start, part.end);
	                attrList.add(Encoding.decodeHtml(stripQuotes(quotedValue)));
	                nameAwaitsValue = false;
	                break;
	              }
	              case TAGEND:
	                // The ">" ends the tag body.
	                insideTag = false;
	                break;
	              default:
	                // Anything unrecognized is simply dropped.
	                break;
	            }
	          }
	          if (nameAwaitsValue) {
	            // The final attribute in the tag was valueless.
	            attrList.add(attrList.getLast());
	          }
	          String openedName = HtmlLexer.canonicalName(
	              markup.substring(tok.start + 1, tok.end));
	          tagBalancer.openTag(openedName, attrList);
	        } else { // A close tag.
	          String closedName = HtmlLexer.canonicalName(
	              markup.substring(tok.start + 2, tok.end));
	          tagBalancer.closeTag(closedName);
	          // Discard everything up to and including the ">".
	          boolean reachedTagEnd = false;
	          while (!reachedTagEnd && tokens.hasNext()) {
	            reachedTagEnd = tokens.next().type == HtmlTokenType.TAGEND;
	          }
	        }
	        break;
	      }
	      default:
	        // Comments, XML prologues, processing instructions, and other
	        // stuff that shouldn't show up in the output are ignored.
	        break;
	    }
	  }

	  tagBalancer.closeDocument();
	}

	private static String stripQuotes(String encodedAttributeValue) {
	int n = encodedAttributeValue.length();
	if (n > 0) {
	char last = encodedAttributeValue.charAt(n - 1);
	if (last == '"' \|\| last == '\'') {
	int start = 0;
	if (n != 1 && last == encodedAttributeValue.charAt(0)) {
	start = 1;
	} else {
	// Browsers deal with missing left quotes : <img src=foo.png">
	// but generally do not deal with missing right : <img src="foo.png>
	}
	return encodedAttributeValue.substring(start, n - 1);
	}
	}
	return encodedAttributeValue;
	}

	}