src/main/org/owasp/html/HtmlLexer.java - platform/external/owasp/sanitizer - Git at Google

 // Copyright (c) 2011, Mike Samuel
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
 // are met:
 //
 // Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
 // Redistributions in binary form must reproduce the above copyright
 // notice, this list of conditions and the following disclaimer in the
 // documentation and/or other materials provided with the distribution.
 // Neither the name of the OWASP nor the names of its contributors may
 // be used to endorse or promote products derived from this software
 // without specific prior written permission.
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 // POSSIBILITY OF SUCH DAMAGE.

 package org.owasp.html;

 import com.google.common.collect.ImmutableSet;
 import com.google.common.collect.Lists;
 import java.util.LinkedList;
 import java.util.NoSuchElementException;
 import java.util.Set;

 import javax.annotation.concurrent.NotThreadSafe;

 /**
  * A flexible lexer for HTML.
  * This is hairy code, but it is outside the TCB for the HTML sanitizer.
  *
  * @author Mike Samuel <mikesamuel@gmail.com>
  */
 @NotThreadSafe
 final class HtmlLexer extends AbstractTokenStream {
   private final String input;
   private final HtmlInputSplitter splitter;
   private State state = State.OUTSIDE_TAG;

   public HtmlLexer(String input) {
     this.input = input;
     this.splitter = new HtmlInputSplitter(input);
   }

   /**
    * Normalize case of names that are not name-spaced.  This lower-cases HTML
    * element and attribute names, but not ones for embedded SVG or MATHML.
    */
   static String canonicalName(String elementOrAttribName) {
     return elementOrAttribName.indexOf(':') >= 0
         ? elementOrAttribName : Strings.toLowerCase(elementOrAttribName);
   }

   /**
    * An FSM that lets us reclassify text tokens inside tags as attribute
    * names/values
    */
   private static enum State {
     OUTSIDE_TAG,
     IN_TAG,
     SAW_NAME,
     SAW_EQ,
     ;
   }

   /**
    * Makes sure that this.token contains a token if one is available.
    * This may require fetching and combining multiple tokens from the underlying
    * splitter.
    */
   @Override
   protected HtmlToken produce() {
     HtmlToken token = readToken();
     if (token == null) { return null; }

     switch (token.type) {

       // Keep track of whether we're inside a tag or not.
       case TAGBEGIN:
         state = State.IN_TAG;
         break;
       case TAGEND:
         if (state == State.SAW_EQ && HtmlTokenType.TAGEND == token.type) {
           // Distinguish <input type=checkbox checked=> from
           // <input type=checkbox checked>
           pushbackToken(token);
           state = State.IN_TAG;
           return HtmlToken.instance(
               token.start, token.start, HtmlTokenType.ATTRVALUE);
         }

         state = State.OUTSIDE_TAG;
         break;

       // Drop ignorable tokens by zeroing out the one received and recursing
       case IGNORABLE:
         return produce();

       // collapse adjacent text nodes if we're outside a tag, or otherwise,
       // Recognize attribute names and values.
       default:
         switch (state) {
           case OUTSIDE_TAG:
             if (HtmlTokenType.TEXT == token.type
                 || HtmlTokenType.UNESCAPED == token.type) {
               token = collapseSubsequent(token);
             }
             break;
           case IN_TAG:
             if (HtmlTokenType.TEXT == token.type
                 && !token.tokenInContextMatches(input, "=")) {
               // Reclassify as attribute name
               token = HtmlInputSplitter.reclassify(
                   token, HtmlTokenType.ATTRNAME);
               state = State.SAW_NAME;
             }
             break;
           case SAW_NAME:
             if (HtmlTokenType.TEXT == token.type) {
               if (token.tokenInContextMatches(input, "=")) {
                 state = State.SAW_EQ;
                 // Skip the '=' token
                 return produce();
               } else {
                 // Reclassify as attribute name
                 token = HtmlInputSplitter.reclassify(
                     token, HtmlTokenType.ATTRNAME);
               }
             } else {
               state = State.IN_TAG;
             }
             break;
           case SAW_EQ:
             if (HtmlTokenType.TEXT == token.type
                 || HtmlTokenType.QSTRING == token.type) {
               if (HtmlTokenType.TEXT == token.type) {
                 // Collapse adjacent text nodes to properly handle
                 //   <a onclick=this.clicked=true>
                 //   <a title=foo bar>
                 token = collapseAttributeName(token);
               }
               // Reclassify as value
               token = HtmlInputSplitter.reclassify(
                   token, HtmlTokenType.ATTRVALUE);
               state = State.IN_TAG;
             }
             break;
         }
         break;
     }

     return token;
   }

   /**
    * Collapses all the following tokens of the same type into this.token.
    */
   private HtmlToken collapseSubsequent(HtmlToken token) {
     HtmlToken collapsed = token;
     for (HtmlToken next;
          (next= peekToken(0)) != null && next.type == token.type;
          readToken()) {
       collapsed = join(collapsed, next);
     }
     return collapsed;
   }

   private HtmlToken collapseAttributeName(HtmlToken token) {
     // We want to collapse tokens into the value that are not parts of an
     // attribute value.  We should include any space or text adjacent to the
     // value, but should stop at any of the following constructions:
     //   space end-of-file              e.g. name=foo_
     //   space valueless-attrib-name    e.g. name=foo checked
     //   space tag-end                  e.g. name=foo />
     //   space text space? '='          e.g. name=foo bar=
     int nToMerge = 0;
     for (HtmlToken t; (t = peekToken(nToMerge)) != null;) {
       if (t.type == HtmlTokenType.IGNORABLE) {
         HtmlToken tok = peekToken(nToMerge + 1);
         if (tok == null) { break; }
         if (tok.type != HtmlTokenType.TEXT) { break; }
         if (isValuelessAttribute(input.substring(tok.start, tok.end))) {
           break;
         }
         HtmlToken eq = peekToken(nToMerge + 2);
         if (eq != null && eq.type == HtmlTokenType.IGNORABLE) {
           eq = peekToken(nToMerge + 3);
         }
         if (eq == null || eq.tokenInContextMatches(input, "=")) {
           break;
         }
       } else if (t.type != HtmlTokenType.TEXT) {
         break;
       }
       ++nToMerge;
     }
     if (nToMerge == 0) { return token; }

     int end = token.end;
     do {
       end = readToken().end;
     } while (--nToMerge > 0);

     return HtmlToken.instance(token.start, end, HtmlTokenType.TEXT);
   }

   private static HtmlToken join(HtmlToken a, HtmlToken b) {
     return HtmlToken.instance(a.start, b.end, a.type);
   }

   private final LinkedList<HtmlToken> lookahead = Lists.newLinkedList();
   private HtmlToken readToken() {
     if (!lookahead.isEmpty()) {
       return lookahead.remove();
     } else if (splitter.hasNext()) {
       return splitter.next();
     } else {
       return null;
     }
   }

   private HtmlToken peekToken(int i) {
     while (lookahead.size() <= i && splitter.hasNext()) {
       lookahead.add(splitter.next());
     }
     return lookahead.size() > i ? lookahead.get(i) : null;
   }

   private void pushbackToken(HtmlToken token) {
     lookahead.addFirst(token);
   }

   /** Can the attribute appear in HTML without a value. */
   private static boolean isValuelessAttribute(String attribName) {
     boolean valueless = VALUELESS_ATTRIB_NAMES.contains(
         Strings.toLowerCase(attribName));
     return valueless;
   }

   // From http://issues.apache.org/jira/browse/XALANC-519
   private static final Set<String> VALUELESS_ATTRIB_NAMES = ImmutableSet.of(
       "checked", "compact", "declare", "defer", "disabled",
       "ismap", "multiple", "nohref", "noresize", "noshade",
       "nowrap", "readonly", "selected");
 }

 /**
  * A token stream that breaks a character stream into <tt>
  * HtmlTokenType.{TEXT,TAGBEGIN,TAGEND,DIRECTIVE,COMMENT,CDATA,DIRECTIVE}</tt>
  * tokens.  The matching of attribute names and values is done in a later step.
  */
 final class HtmlInputSplitter extends AbstractTokenStream {
   /** The source of HTML character data. */
   private final String input;
   /** An offset into input. */
   private int offset;
   /** True iff the current character is inside a tag. */
   private boolean inTag;
   /**
    * True if inside a script, xmp, listing, or similar tag whose content does
    * not follow the normal escaping rules.
    */
   private boolean inEscapeExemptBlock;

   /**
    * Null or the name of the close tag required to end the current escape exempt
    * block.
    * Preformatted tags include &lt;script&gt;, &lt;xmp&gt;, etc. that may
    * contain unescaped HTML input.
    */
   private String escapeExemptTagName = null;

   private HtmlTextEscapingMode textEscapingMode;

   public HtmlInputSplitter(String input) {
     this.input = input;
   }

   /**
    * Make sure that there is a token ready to yield in this.token.
    */
   @Override
   protected HtmlToken produce() {
     HtmlToken token = parseToken();
     if (null == token) { return null; }

     // Handle escape-exempt blocks.
     // The parse() method is only dimly aware of escape-excempt blocks, so
     // here we detect the beginning and ends of escape exempt blocks, and
     // reclassify as UNESCAPED, any tokens that appear in the middle.
     if (inEscapeExemptBlock) {
       if (token.type != HtmlTokenType.SERVERCODE) {
         // classify RCDATA as text since it can contain entities
         token = reclassify(
             token, (this.textEscapingMode == HtmlTextEscapingMode.RCDATA
                     ? HtmlTokenType.TEXT
                     : HtmlTokenType.UNESCAPED));
       }
     } else {
       switch (token.type) {
         case TAGBEGIN:
           {
             String canonTagName = canonicalName(
                 token.start + 1, token.end);
             if (HtmlTextEscapingMode.isTagFollowedByLiteralContent(
                     canonTagName)) {
               this.escapeExemptTagName = canonTagName;
               this.textEscapingMode = HtmlTextEscapingMode.getModeForTag(
                   canonTagName);
             }
             break;
           }
         case TAGEND:
           this.inEscapeExemptBlock = null != this.escapeExemptTagName;
           break;
         default:
           break;
       }
     }
     return token;
   }

   /**
    * States for a state machine for optimistically identifying tags and other
    * html/xml/phpish structures.
    */
   private static enum State {
     TAGNAME,
     SLASH,
     BANG,
     BANG_DASH,
     COMMENT,
     COMMENT_DASH,
     COMMENT_DASH_DASH,
     DIRECTIVE,
     DONE,
     BOGUS_COMMENT,
     SERVER_CODE,
     SERVER_CODE_PCT,

     // From HTML 5 section 8.1.2.6

     // The text in CDATA and RCDATA elements must not contain any
     // occurrences of the string "</" followed by characters that
     // case-insensitively match the tag name of the element followed
     // by one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF),
     // U+000B LINE TABULATION, U+000C FORM FEED (FF), U+0020 SPACE,
     // U+003E GREATER-THAN SIGN (>), or U+002F SOLIDUS (/), unless
     // that string is part of an escaping text span.

     // An escaping text span is a span of text (in CDATA and RCDATA
     // elements) and character entity references (in RCDATA elements)
     // that starts with an escaping text span start that is not itself
     // in an escaping text span, and ends at the next escaping text
     // span end.

     // An escaping text span start is a part of text that consists of
     // the four character sequence "<!--".

     // An escaping text span end is a part of text that consists of
     // the three character sequence "-->".

     // An escaping text span start may share its U+002D HYPHEN-MINUS characters
     // with its corresponding escaping text span end.
     UNESCAPED_LT_BANG,             // <!
     UNESCAPED_LT_BANG_DASH,        // <!-
     ESCAPING_TEXT_SPAN,            // Inside an escaping text span
     ESCAPING_TEXT_SPAN_DASH,       // Seen - inside an escaping text span
     ESCAPING_TEXT_SPAN_DASH_DASH,  // Seen -- inside an escaping text span
     ;
   }

   private HtmlToken lastNonIgnorable = null;
   /**
    * Breaks the character stream into tokens.
    * This method returns a stream of tokens such that each token starts where
    * the last token ended.
    *
    * <p>This property is useful as it allows fetch to collapse and reclassify
    * ranges of tokens based on state that is easy to maintain there.
    *
    * <p>Later passes are responsible for throwing away useless tokens.
    */
   private HtmlToken parseToken() {
     int start = offset;
     int limit = input.length();
     if (start == limit) { return null; }

     int end = start + 1;
     HtmlTokenType type;

     char ch = input.charAt(start);
     if (inTag) {
       if ('>' == ch) {
         type = HtmlTokenType.TAGEND;
         inTag = false;
       } else if ('/' == ch) {
         if (end != limit && '>' == input.charAt(end)) {
           type = HtmlTokenType.TAGEND;
           inTag = false;
           ++end;
         } else {
           type = HtmlTokenType.TEXT;
         }
       } else if ('=' == ch) {
         type = HtmlTokenType.TEXT;
       } else if ('"' == ch || '\'' == ch) {
         type = HtmlTokenType.QSTRING;
         int delim = ch;
         for (; end < limit; ++end) {
           if (input.charAt(end) == delim) {
             ++end;
             break;
           }
         }
       } else if (!Character.isWhitespace(ch)) {
         type = HtmlTokenType.TEXT;
         for (; end < limit; ++end) {
           ch = input.charAt(end);
           // End a text chunk before />
           if ((lastNonIgnorable == null
                || !lastNonIgnorable.tokenInContextMatches(input, "="))
               && '/' == ch && end + 1 < limit
               && '>' == input.charAt(end + 1)) {
             break;
           } else if ('>' == ch || '=' == ch
                      || Character.isWhitespace(ch)) {
             break;
           } else if ('"' == ch || '\'' == ch) {
             if (end + 1 < limit) {
               char ch2 = input.charAt(end + 1);
               if (ch2 >= 0 && Character.isWhitespace(ch2)
                   || ch2 == '>' || ch2 == '/') {
                 ++end;
                 break;
               }
             }
           }
         }
       } else {
         // We skip whitespace tokens inside tag bodies.
         type = HtmlTokenType.IGNORABLE;
         while (end < limit && Character.isWhitespace(input.charAt(end))) {
           ++end;
         }
       }
     } else {
       if (ch == '<') {
         if (end == limit) {
           type = HtmlTokenType.TEXT;
         } else {
           ch = input.charAt(end);
           type = null;
           State state = null;
           switch (ch) {
             case '/':  // close tag?
               state = State.SLASH;
               ++end;
               break;
             case '!':  // Comment or declaration
               if (!this.inEscapeExemptBlock) {
                 state = State.BANG;
               } else if (HtmlTextEscapingMode.allowsEscapingTextSpan(
                              escapeExemptTagName)) {
                 // Directives, and cdata suppressed in escape
                 // exempt mode as they could obscure the close of the
                 // escape exempty block, but comments are similar to escaping
                 // text spans, and are significant in all CDATA and RCDATA
                 // blocks except those inside <xmp> tags.
                 // See "Escaping text spans" in section 8.1.2.6 of HTML5.
                 // http://www.w3.org/html/wg/html5/#cdata-rcdata-restrictions
                 state = State.UNESCAPED_LT_BANG;
               }
               ++end;
               break;
             case '?':
               if (!this.inEscapeExemptBlock) {
                 state = State.BOGUS_COMMENT;
               }
               ++end;
               break;
             case '%':
               state = State.SERVER_CODE;
               ++end;
               break;
             default:
               if (isIdentStart(ch) && !this.inEscapeExemptBlock) {
                 state = State.TAGNAME;
                 ++end;
               } else if ('<' == ch) {
                 type = HtmlTokenType.TEXT;
               } else {
                 ++end;
               }
               break;
           }
           if (null != state) {
             charloop:
             while (end < limit) {
               ch = input.charAt(end);
               switch (state) {
                 case TAGNAME:
                   if (Character.isWhitespace(ch)
                       || '>' == ch || '/' == ch || '<' == ch) {
                     // End processing of an escape exempt block when we see
                     // a corresponding end tag.
                     if (this.inEscapeExemptBlock
                         && '/' == input.charAt(start + 1)
                         && textEscapingMode != HtmlTextEscapingMode.PLAIN_TEXT
                         && canonicalName(start + 2, end)
                             .equals(escapeExemptTagName)) {
                       this.inEscapeExemptBlock = false;
                       this.escapeExemptTagName = null;
                       this.textEscapingMode = null;
                     }
                     type = HtmlTokenType.TAGBEGIN;
                     // Don't process content as attributes if we're inside
                     // an escape exempt block.
                     inTag = !this.inEscapeExemptBlock;
                     state = State.DONE;
                     break charloop;
                   }
                   break;
                 case SLASH:
                   if (Character.isLetter(ch)) {
                     state = State.TAGNAME;
                   } else {
                     if ('<' == ch) {
                       type = HtmlTokenType.TEXT;
                     } else {
                       ++end;
                     }
                     break charloop;
                   }
                   break;
                 case BANG:
                   if ('-' == ch) {
                     state = State.BANG_DASH;
                   } else {
                     state = State.DIRECTIVE;
                   }
                   break;
                 case BANG_DASH:
                   if ('-' == ch) {
                     state = State.COMMENT;
                   } else {
                     state = State.DIRECTIVE;
                   }
                   break;
                 case COMMENT:
                   if ('-' == ch) {
                     state = State.COMMENT_DASH;
                   }
                   break;
                 case COMMENT_DASH:
                   state = ('-' == ch)
                       ? State.COMMENT_DASH_DASH
                       : State.COMMENT_DASH;
                   break;
                 case COMMENT_DASH_DASH:
                   if ('>' == ch) {
                     state = State.DONE;
                     type = HtmlTokenType.COMMENT;
                   } else if ('-' == ch) {
                     state = State.COMMENT_DASH_DASH;
                   } else {
                     state = State.COMMENT_DASH;
                   }
                   break;
                 case DIRECTIVE:
                   if ('>' == ch) {
                     type = HtmlTokenType.DIRECTIVE;
                     state = State.DONE;
                   }
                   break;
                 case BOGUS_COMMENT:
                   if ('>' == ch) {
                     type = HtmlTokenType.QMARKMETA;
                     state = State.DONE;
                   }
                   break;
                 case SERVER_CODE:
                   if ('%' == ch) {
                     state = State.SERVER_CODE_PCT;
                   }
                   break;
                 case SERVER_CODE_PCT:
                   if ('>' == ch) {
                     type = HtmlTokenType.SERVERCODE;
                     state = State.DONE;
                   } else if ('%' != ch) {
                     state = State.SERVER_CODE;
                   }
                   break;
                 case UNESCAPED_LT_BANG:
                   if ('-' == ch) {
                     state = State.UNESCAPED_LT_BANG_DASH;
                   } else {
                     type = HtmlTokenType.TEXT;
                     state = State.DONE;
                   }
                   break;
                 case UNESCAPED_LT_BANG_DASH:
                   if ('-' == ch) {
                     // According to HTML 5 section 8.1.2.6

                     // An escaping text span start may share its
                     // U+002D HYPHEN-MINUS characters with its
                     // corresponding escaping text span end.
                     state = State.ESCAPING_TEXT_SPAN_DASH_DASH;
                   } else {
                     type = HtmlTokenType.TEXT;
                     state = State.DONE;
                   }
                   break;
                 case ESCAPING_TEXT_SPAN:
                   if ('-' == ch) {
                     state = State.ESCAPING_TEXT_SPAN_DASH;
                   }
                   break;
                 case ESCAPING_TEXT_SPAN_DASH:
                   if ('-' == ch) {
                     state = State.ESCAPING_TEXT_SPAN_DASH_DASH;
                   } else {
                     state = State.ESCAPING_TEXT_SPAN;
                   }
                   break;
                 case ESCAPING_TEXT_SPAN_DASH_DASH:
                   if ('>' == ch) {
                     type = HtmlTokenType.TEXT;
                     state = State.DONE;
                   } else if ('-' != ch) {
                     state = State.ESCAPING_TEXT_SPAN;
                   }
                   break;
                 case DONE:
                   throw new AssertionError(
                       "Unexpectedly DONE while lexing HTML token stream");
               }
               ++end;
               if (State.DONE == state) { break; }
             }
             if (end == limit) {
               switch (state) {
                 case DONE:
                   break;
                 case BOGUS_COMMENT:
                   type = HtmlTokenType.QMARKMETA;
                   break;
                 case COMMENT:
                 case COMMENT_DASH:
                 case COMMENT_DASH_DASH:
                   type = HtmlTokenType.COMMENT;
                   break;
                 case DIRECTIVE:
                 case SERVER_CODE:
                 case SERVER_CODE_PCT:
                   type = HtmlTokenType.SERVERCODE;
                   break;
                 case TAGNAME:
                   type = HtmlTokenType.TAGBEGIN;
                   break;
                 default:
                   type = HtmlTokenType.TEXT;
                   break;
               }
             }
           }
         }
       } else {
         type = null;
       }
     }
     if (null == type) {
       while (end < limit && '<' != input.charAt(end)) { ++end; }
       type = HtmlTokenType.TEXT;
     }

     offset = end;
     HtmlToken result = HtmlToken.instance(start, end, type);
     if (type != HtmlTokenType.IGNORABLE) { lastNonIgnorable = result; }
     return result;
   }

   private String canonicalName(int start, int end) {
     return HtmlLexer.canonicalName(input.substring(start, end));
   }

   private static boolean isIdentStart(char ch) {
     return ch >= 'A' && ch <= 'z' && (ch <= 'Z' || ch >= 'a');
   }

   static HtmlToken reclassify(HtmlToken token, HtmlTokenType type) {
     return HtmlToken.instance(token.start, token.end, type);
   }
 }


 /**
  * A TokenStream that lazily fetches one token at a time.
  *
  * @author Mike Samuel <mikesamuel@gmail.com>
  */
 abstract class AbstractTokenStream implements TokenStream {
   private HtmlToken tok;

   public final boolean hasNext() {
     if (tok == null) { tok = produce(); }
     return tok != null;
   }

   public HtmlToken next() {
     if (this.tok == null) { this.tok = produce(); }
     HtmlToken t = this.tok;
     if (t == null) { throw new NoSuchElementException(); }
     this.tok = null;
     return t;
   }

   protected abstract HtmlToken produce();
 }
	// Copyright (c) 2011, Mike Samuel
	// All rights reserved.
	//
	// Redistribution and use in source and binary forms, with or without
	// modification, are permitted provided that the following conditions
	// are met:
	//
	// Redistributions of source code must retain the above copyright
	// notice, this list of conditions and the following disclaimer.
	// Redistributions in binary form must reproduce the above copyright
	// notice, this list of conditions and the following disclaimer in the
	// documentation and/or other materials provided with the distribution.
	// Neither the name of the OWASP nor the names of its contributors may
	// be used to endorse or promote products derived from this software
	// without specific prior written permission.
	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
	// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
	// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
	// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	// POSSIBILITY OF SUCH DAMAGE.

	package org.owasp.html;

	import com.google.common.collect.ImmutableSet;
	import com.google.common.collect.Lists;
	import java.util.LinkedList;
	import java.util.NoSuchElementException;
	import java.util.Set;

	import javax.annotation.concurrent.NotThreadSafe;

	/**
	* A flexible lexer for HTML.
	* This is hairy code, but it is outside the TCB for the HTML sanitizer.
	*
	* @author Mike Samuel <mikesamuel@gmail.com>
	*/
	@NotThreadSafe
	final class HtmlLexer extends AbstractTokenStream {
	private final String input;
	private final HtmlInputSplitter splitter;
	private State state = State.OUTSIDE_TAG;

	public HtmlLexer(String input) {
	this.input = input;
	this.splitter = new HtmlInputSplitter(input);
	}

	/**
	* Normalize case of names that are not name-spaced. This lower-cases HTML
	* element and attribute names, but not ones for embedded SVG or MATHML.
	*/
	static String canonicalName(String elementOrAttribName) {
	return elementOrAttribName.indexOf(':') >= 0
	? elementOrAttribName : Strings.toLowerCase(elementOrAttribName);
	}

	/**
	* An FSM that lets us reclassify text tokens inside tags as attribute
	* names/values
	*/
	private static enum State {
	OUTSIDE_TAG,
	IN_TAG,
	SAW_NAME,
	SAW_EQ,
	;
	}

	/**
	* Makes sure that this.token contains a token if one is available.
	* This may require fetching and combining multiple tokens from the underlying
	* splitter.
	*/
	@Override
	protected HtmlToken produce() {
	HtmlToken token = readToken();
	if (token == null) { return null; }

	switch (token.type) {

	// Keep track of whether we're inside a tag or not.
	case TAGBEGIN:
	state = State.IN_TAG;
	break;
	case TAGEND:
	if (state == State.SAW_EQ && HtmlTokenType.TAGEND == token.type) {
	// Distinguish <input type=checkbox checked=> from
	// <input type=checkbox checked>
	pushbackToken(token);
	state = State.IN_TAG;
	return HtmlToken.instance(
	token.start, token.start, HtmlTokenType.ATTRVALUE);
	}

	state = State.OUTSIDE_TAG;
	break;

	// Drop ignorable tokens by zeroing out the one received and recursing
	case IGNORABLE:
	return produce();

	// collapse adjacent text nodes if we're outside a tag, or otherwise,
	// Recognize attribute names and values.
	default:
	switch (state) {
	case OUTSIDE_TAG:
	if (HtmlTokenType.TEXT == token.type
	\|\| HtmlTokenType.UNESCAPED == token.type) {
	token = collapseSubsequent(token);
	}
	break;
	case IN_TAG:
	if (HtmlTokenType.TEXT == token.type
	&& !token.tokenInContextMatches(input, "=")) {
	// Reclassify as attribute name
	token = HtmlInputSplitter.reclassify(
	token, HtmlTokenType.ATTRNAME);
	state = State.SAW_NAME;
	}
	break;
	case SAW_NAME:
	if (HtmlTokenType.TEXT == token.type) {
	if (token.tokenInContextMatches(input, "=")) {
	state = State.SAW_EQ;
	// Skip the '=' token
	return produce();
	} else {
	// Reclassify as attribute name
	token = HtmlInputSplitter.reclassify(
	token, HtmlTokenType.ATTRNAME);
	}
	} else {
	state = State.IN_TAG;
	}
	break;
	case SAW_EQ:
	if (HtmlTokenType.TEXT == token.type
	\|\| HtmlTokenType.QSTRING == token.type) {
	if (HtmlTokenType.TEXT == token.type) {
	// Collapse adjacent text nodes to properly handle
	// <a onclick=this.clicked=true>
	// <a title=foo bar>
	token = collapseAttributeName(token);
	}
	// Reclassify as value
	token = HtmlInputSplitter.reclassify(
	token, HtmlTokenType.ATTRVALUE);
	state = State.IN_TAG;
	}
	break;
	}
	break;
	}

	return token;
	}

	/**
	* Collapses all the following tokens of the same type into this.token.
	*/
	private HtmlToken collapseSubsequent(HtmlToken token) {
	HtmlToken collapsed = token;
	for (HtmlToken next;
	(next= peekToken(0)) != null && next.type == token.type;
	readToken()) {
	collapsed = join(collapsed, next);
	}
	return collapsed;
	}

	private HtmlToken collapseAttributeName(HtmlToken token) {
	// We want to collapse tokens into the value that are not parts of an
	// attribute value. We should include any space or text adjacent to the
	// value, but should stop at any of the following constructions:
	// space end-of-file e.g. name=foo_
	// space valueless-attrib-name e.g. name=foo checked
	// space tag-end e.g. name=foo />
	// space text space? '=' e.g. name=foo bar=
	int nToMerge = 0;
	for (HtmlToken t; (t = peekToken(nToMerge)) != null;) {
	if (t.type == HtmlTokenType.IGNORABLE) {
	HtmlToken tok = peekToken(nToMerge + 1);
	if (tok == null) { break; }
	if (tok.type != HtmlTokenType.TEXT) { break; }
	if (isValuelessAttribute(input.substring(tok.start, tok.end))) {
	break;
	}
	HtmlToken eq = peekToken(nToMerge + 2);
	if (eq != null && eq.type == HtmlTokenType.IGNORABLE) {
	eq = peekToken(nToMerge + 3);
	}
	if (eq == null \|\| eq.tokenInContextMatches(input, "=")) {
	break;
	}
	} else if (t.type != HtmlTokenType.TEXT) {
	break;
	}
	++nToMerge;
	}
	if (nToMerge == 0) { return token; }

	int end = token.end;
	do {
	end = readToken().end;
	} while (--nToMerge > 0);

	return HtmlToken.instance(token.start, end, HtmlTokenType.TEXT);
	}

	private static HtmlToken join(HtmlToken a, HtmlToken b) {
	return HtmlToken.instance(a.start, b.end, a.type);
	}

	private final LinkedList<HtmlToken> lookahead = Lists.newLinkedList();
	private HtmlToken readToken() {
	if (!lookahead.isEmpty()) {
	return lookahead.remove();
	} else if (splitter.hasNext()) {
	return splitter.next();
	} else {
	return null;
	}
	}

	private HtmlToken peekToken(int i) {
	while (lookahead.size() <= i && splitter.hasNext()) {
	lookahead.add(splitter.next());
	}
	return lookahead.size() > i ? lookahead.get(i) : null;
	}

	private void pushbackToken(HtmlToken token) {
	lookahead.addFirst(token);
	}

	/** Can the attribute appear in HTML without a value. */
	private static boolean isValuelessAttribute(String attribName) {
	boolean valueless = VALUELESS_ATTRIB_NAMES.contains(
	Strings.toLowerCase(attribName));
	return valueless;
	}

	// From http://issues.apache.org/jira/browse/XALANC-519
	private static final Set<String> VALUELESS_ATTRIB_NAMES = ImmutableSet.of(
	"checked", "compact", "declare", "defer", "disabled",
	"ismap", "multiple", "nohref", "noresize", "noshade",
	"nowrap", "readonly", "selected");
	}

	/**
	* A token stream that breaks a character stream into <tt>
	* HtmlTokenType.{TEXT,TAGBEGIN,TAGEND,DIRECTIVE,COMMENT,CDATA,DIRECTIVE}</tt>
	* tokens. The matching of attribute names and values is done in a later step.
	*/
	final class HtmlInputSplitter extends AbstractTokenStream {
	/** The source of HTML character data. */
	private final String input;
	/** An offset into input. */
	private int offset;
	/** True iff the current character is inside a tag. */
	private boolean inTag;
	/**
	* True if inside a script, xmp, listing, or similar tag whose content does
	* not follow the normal escaping rules.
	*/
	private boolean inEscapeExemptBlock;

	/**
	* Null or the name of the close tag required to end the current escape exempt
	* block.
	* Preformatted tags include <script>, <xmp>, etc. that may
	* contain unescaped HTML input.
	*/
	private String escapeExemptTagName = null;

	private HtmlTextEscapingMode textEscapingMode;

	public HtmlInputSplitter(String input) {
	this.input = input;
	}

	/**
	* Make sure that there is a token ready to yield in this.token.
	*/
	@Override
	protected HtmlToken produce() {
	HtmlToken token = parseToken();
	if (null == token) { return null; }

	// Handle escape-exempt blocks.
	// The parse() method is only dimly aware of escape-excempt blocks, so
	// here we detect the beginning and ends of escape exempt blocks, and
	// reclassify as UNESCAPED, any tokens that appear in the middle.
	if (inEscapeExemptBlock) {
	if (token.type != HtmlTokenType.SERVERCODE) {
	// classify RCDATA as text since it can contain entities
	token = reclassify(
	token, (this.textEscapingMode == HtmlTextEscapingMode.RCDATA
	? HtmlTokenType.TEXT
	: HtmlTokenType.UNESCAPED));
	}
	} else {
	switch (token.type) {
	case TAGBEGIN:
	{
	String canonTagName = canonicalName(
	token.start + 1, token.end);
	if (HtmlTextEscapingMode.isTagFollowedByLiteralContent(
	canonTagName)) {
	this.escapeExemptTagName = canonTagName;
	this.textEscapingMode = HtmlTextEscapingMode.getModeForTag(
	canonTagName);
	}
	break;
	}
	case TAGEND:
	this.inEscapeExemptBlock = null != this.escapeExemptTagName;
	break;
	default:
	break;
	}
	}
	return token;
	}

	/**
	* States for a state machine for optimistically identifying tags and other
	* html/xml/phpish structures.
	*/
	private static enum State {
	TAGNAME,
	SLASH,
	BANG,
	BANG_DASH,
	COMMENT,
	COMMENT_DASH,
	COMMENT_DASH_DASH,
	DIRECTIVE,
	DONE,
	BOGUS_COMMENT,
	SERVER_CODE,
	SERVER_CODE_PCT,

	// From HTML 5 section 8.1.2.6

	// The text in CDATA and RCDATA elements must not contain any
	// occurrences of the string "</" followed by characters that
	// case-insensitively match the tag name of the element followed
	// by one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF),
	// U+000B LINE TABULATION, U+000C FORM FEED (FF), U+0020 SPACE,
	// U+003E GREATER-THAN SIGN (>), or U+002F SOLIDUS (/), unless
	// that string is part of an escaping text span.

	// An escaping text span is a span of text (in CDATA and RCDATA
	// elements) and character entity references (in RCDATA elements)
	// that starts with an escaping text span start that is not itself
	// in an escaping text span, and ends at the next escaping text
	// span end.

	// An escaping text span start is a part of text that consists of
	// the four character sequence "<!--".

	// An escaping text span end is a part of text that consists of
	// the three character sequence "-->".

	// An escaping text span start may share its U+002D HYPHEN-MINUS characters
	// with its corresponding escaping text span end.
	UNESCAPED_LT_BANG, // <!
	UNESCAPED_LT_BANG_DASH, // <!-
	ESCAPING_TEXT_SPAN, // Inside an escaping text span
	ESCAPING_TEXT_SPAN_DASH, // Seen - inside an escaping text span
	ESCAPING_TEXT_SPAN_DASH_DASH, // Seen -- inside an escaping text span
	;
	}

	private HtmlToken lastNonIgnorable = null;
	/**
	* Breaks the character stream into tokens.
	* This method returns a stream of tokens such that each token starts where
	* the last token ended.
	*
	* <p>This property is useful as it allows fetch to collapse and reclassify
	* ranges of tokens based on state that is easy to maintain there.
	*
	* <p>Later passes are responsible for throwing away useless tokens.
	*/
	private HtmlToken parseToken() {
	int start = offset;
	int limit = input.length();
	if (start == limit) { return null; }

	int end = start + 1;
	HtmlTokenType type;

	char ch = input.charAt(start);
	if (inTag) {
	if ('>' == ch) {
	type = HtmlTokenType.TAGEND;
	inTag = false;
	} else if ('/' == ch) {
	if (end != limit && '>' == input.charAt(end)) {
	type = HtmlTokenType.TAGEND;
	inTag = false;
	++end;
	} else {
	type = HtmlTokenType.TEXT;
	}
	} else if ('=' == ch) {
	type = HtmlTokenType.TEXT;
	} else if ('"' == ch \|\| '\'' == ch) {
	type = HtmlTokenType.QSTRING;
	int delim = ch;
	for (; end < limit; ++end) {
	if (input.charAt(end) == delim) {
	++end;
	break;
	}
	}
	} else if (!Character.isWhitespace(ch)) {
	type = HtmlTokenType.TEXT;
	for (; end < limit; ++end) {
	ch = input.charAt(end);
	// End a text chunk before />
	if ((lastNonIgnorable == null
	\|\| !lastNonIgnorable.tokenInContextMatches(input, "="))
	&& '/' == ch && end + 1 < limit
	&& '>' == input.charAt(end + 1)) {
	break;
	} else if ('>' == ch \|\| '=' == ch
	\|\| Character.isWhitespace(ch)) {
	break;
	} else if ('"' == ch \|\| '\'' == ch) {
	if (end + 1 < limit) {
	char ch2 = input.charAt(end + 1);
	if (ch2 >= 0 && Character.isWhitespace(ch2)
	\|\| ch2 == '>' \|\| ch2 == '/') {
	++end;
	break;
	}
	}
	}
	}
	} else {
	// We skip whitespace tokens inside tag bodies.
	type = HtmlTokenType.IGNORABLE;
	while (end < limit && Character.isWhitespace(input.charAt(end))) {
	++end;
	}
	}
	} else {
	if (ch == '<') {
	if (end == limit) {
	type = HtmlTokenType.TEXT;
	} else {
	ch = input.charAt(end);
	type = null;
	State state = null;
	switch (ch) {
	case '/': // close tag?
	state = State.SLASH;
	++end;
	break;
	case '!': // Comment or declaration
	if (!this.inEscapeExemptBlock) {
	state = State.BANG;
	} else if (HtmlTextEscapingMode.allowsEscapingTextSpan(
	escapeExemptTagName)) {
	// Directives, and cdata suppressed in escape
	// exempt mode as they could obscure the close of the
	// escape exempty block, but comments are similar to escaping
	// text spans, and are significant in all CDATA and RCDATA
	// blocks except those inside <xmp> tags.
	// See "Escaping text spans" in section 8.1.2.6 of HTML5.
	// http://www.w3.org/html/wg/html5/#cdata-rcdata-restrictions
	state = State.UNESCAPED_LT_BANG;
	}
	++end;
	break;
	case '?':
	if (!this.inEscapeExemptBlock) {
	state = State.BOGUS_COMMENT;
	}
	++end;
	break;
	case '%':
	state = State.SERVER_CODE;
	++end;
	break;
	default:
	if (isIdentStart(ch) && !this.inEscapeExemptBlock) {
	state = State.TAGNAME;
	++end;
	} else if ('<' == ch) {
	type = HtmlTokenType.TEXT;
	} else {
	++end;
	}
	break;
	}
	if (null != state) {
	charloop:
	while (end < limit) {
	ch = input.charAt(end);
	switch (state) {
	case TAGNAME:
	if (Character.isWhitespace(ch)
	\|\| '>' == ch \|\| '/' == ch \|\| '<' == ch) {
	// End processing of an escape exempt block when we see
	// a corresponding end tag.
	if (this.inEscapeExemptBlock
	&& '/' == input.charAt(start + 1)
	&& textEscapingMode != HtmlTextEscapingMode.PLAIN_TEXT
	&& canonicalName(start + 2, end)
	.equals(escapeExemptTagName)) {
	this.inEscapeExemptBlock = false;
	this.escapeExemptTagName = null;
	this.textEscapingMode = null;
	}
	type = HtmlTokenType.TAGBEGIN;
	// Don't process content as attributes if we're inside
	// an escape exempt block.
	inTag = !this.inEscapeExemptBlock;
	state = State.DONE;
	break charloop;
	}
	break;
	case SLASH:
	if (Character.isLetter(ch)) {
	state = State.TAGNAME;
	} else {
	if ('<' == ch) {
	type = HtmlTokenType.TEXT;
	} else {
	++end;
	}
	break charloop;
	}
	break;
	case BANG:
	if ('-' == ch) {
	state = State.BANG_DASH;
	} else {
	state = State.DIRECTIVE;
	}
	break;
	case BANG_DASH:
	if ('-' == ch) {
	state = State.COMMENT;
	} else {
	state = State.DIRECTIVE;
	}
	break;
	case COMMENT:
	if ('-' == ch) {
	state = State.COMMENT_DASH;
	}
	break;
	case COMMENT_DASH:
	state = ('-' == ch)
	? State.COMMENT_DASH_DASH
	: State.COMMENT_DASH;
	break;
	case COMMENT_DASH_DASH:
	if ('>' == ch) {
	state = State.DONE;
	type = HtmlTokenType.COMMENT;
	} else if ('-' == ch) {
	state = State.COMMENT_DASH_DASH;
	} else {
	state = State.COMMENT_DASH;
	}
	break;
	case DIRECTIVE:
	if ('>' == ch) {
	type = HtmlTokenType.DIRECTIVE;
	state = State.DONE;
	}
	break;
	case BOGUS_COMMENT:
	if ('>' == ch) {
	type = HtmlTokenType.QMARKMETA;
	state = State.DONE;
	}
	break;
	case SERVER_CODE:
	if ('%' == ch) {
	state = State.SERVER_CODE_PCT;
	}
	break;
	case SERVER_CODE_PCT:
	if ('>' == ch) {
	type = HtmlTokenType.SERVERCODE;
	state = State.DONE;
	} else if ('%' != ch) {
	state = State.SERVER_CODE;
	}
	break;
	case UNESCAPED_LT_BANG:
	if ('-' == ch) {
	state = State.UNESCAPED_LT_BANG_DASH;
	} else {
	type = HtmlTokenType.TEXT;
	state = State.DONE;
	}
	break;
	case UNESCAPED_LT_BANG_DASH:
	if ('-' == ch) {
	// According to HTML 5 section 8.1.2.6

	// An escaping text span start may share its
	// U+002D HYPHEN-MINUS characters with its
	// corresponding escaping text span end.
	state = State.ESCAPING_TEXT_SPAN_DASH_DASH;
	} else {
	type = HtmlTokenType.TEXT;
	state = State.DONE;
	}
	break;
	case ESCAPING_TEXT_SPAN:
	if ('-' == ch) {
	state = State.ESCAPING_TEXT_SPAN_DASH;
	}
	break;
	case ESCAPING_TEXT_SPAN_DASH:
	if ('-' == ch) {
	state = State.ESCAPING_TEXT_SPAN_DASH_DASH;
	} else {
	state = State.ESCAPING_TEXT_SPAN;
	}
	break;
	case ESCAPING_TEXT_SPAN_DASH_DASH:
	if ('>' == ch) {
	type = HtmlTokenType.TEXT;
	state = State.DONE;
	} else if ('-' != ch) {
	state = State.ESCAPING_TEXT_SPAN;
	}
	break;
	case DONE:
	throw new AssertionError(
	"Unexpectedly DONE while lexing HTML token stream");
	}
	++end;
	if (State.DONE == state) { break; }
	}
	if (end == limit) {
	switch (state) {
	case DONE:
	break;
	case BOGUS_COMMENT:
	type = HtmlTokenType.QMARKMETA;
	break;
	case COMMENT:
	case COMMENT_DASH:
	case COMMENT_DASH_DASH:
	type = HtmlTokenType.COMMENT;
	break;
	case DIRECTIVE:
	case SERVER_CODE:
	case SERVER_CODE_PCT:
	type = HtmlTokenType.SERVERCODE;
	break;
	case TAGNAME:
	type = HtmlTokenType.TAGBEGIN;
	break;
	default:
	type = HtmlTokenType.TEXT;
	break;
	}
	}
	}
	}
	} else {
	type = null;
	}
	}
	if (null == type) {
	while (end < limit && '<' != input.charAt(end)) { ++end; }
	type = HtmlTokenType.TEXT;
	}

	offset = end;
	HtmlToken result = HtmlToken.instance(start, end, type);
	if (type != HtmlTokenType.IGNORABLE) { lastNonIgnorable = result; }
	return result;
	}

	private String canonicalName(int start, int end) {
	return HtmlLexer.canonicalName(input.substring(start, end));
	}

	private static boolean isIdentStart(char ch) {
	return ch >= 'A' && ch <= 'z' && (ch <= 'Z' \|\| ch >= 'a');
	}

	static HtmlToken reclassify(HtmlToken token, HtmlTokenType type) {
	return HtmlToken.instance(token.start, token.end, type);
	}
	}


	/**
	* A TokenStream that lazily fetches one token at a time.
	*
	* @author Mike Samuel <mikesamuel@gmail.com>
	*/
	abstract class AbstractTokenStream implements TokenStream {
	private HtmlToken tok;

	public final boolean hasNext() {
	if (tok == null) { tok = produce(); }
	return tok != null;
	}

	public HtmlToken next() {
	if (this.tok == null) { this.tok = produce(); }
	HtmlToken t = this.tok;
	if (t == null) { throw new NoSuchElementException(); }
	this.tok = null;
	return t;
	}

	protected abstract HtmlToken produce();
	}