| /* |
| * Copyright (C) 2010 Google Inc. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package com.google.streamhtmlparser.impl; |
| |
import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
import com.google.streamhtmlparser.ExternalState;
import com.google.streamhtmlparser.HtmlParser;
import com.google.streamhtmlparser.ParseException;
import com.google.streamhtmlparser.util.CharacterRecorder;
import com.google.streamhtmlparser.util.EntityResolver;
import com.google.streamhtmlparser.util.HtmlUtils;

import java.util.Locale;
import java.util.Map;
| |
| /** |
| * A custom specialized parser - ported from the main C++ version - used to |
| * implement context-aware escaping of run-time data in web-application |
| * templates. |
| * |
| * <p>This is the main class in the package. It implements the |
| * {@code HtmlParser} interface. |
| * |
 * <p>This class is not thread-safe; in particular, you cannot invoke any
 * state-changing operations (such as {@code parse}) from multiple threads
 * on the same object.
| * |
| * <p>If you are looking at this class, chances are very high you are |
| * implementing Auto-Escaping for a new template system. Please see the |
| * landing page including a design document at |
| * <a href="http://go/autoescape">Auto-Escape Landing Page</a>. |
| */ |
| public class HtmlParserImpl extends GenericParser implements HtmlParser { |
| |
| /* |
| * Internal representation of the parser state, which is at a |
| * finer-granularity than the external state as given to callers. |
| * The relationship between <code>InternalState</code> and |
| * <code>ExternalState</code> is a many-to-one relationship. |
| */ |
| private static final InternalState TEXT; |
| private static final InternalState TAG_START; |
| private static final InternalState TAG_NAME; |
| private static final InternalState DECL_START; |
| private static final InternalState DECL_BODY; |
| private static final InternalState COM_OPEN; |
| private static final InternalState COM_BODY; |
| private static final InternalState COM_DASH; |
| private static final InternalState COM_DASH_DASH; |
| private static final InternalState PI; |
| private static final InternalState PI_MAY_END; |
| private static final InternalState TAG_SPACE; |
| private static final InternalState TAG_CLOSE; |
| private static final InternalState ATTR; |
| private static final InternalState ATTR_SPACE; |
| private static final InternalState VALUE; |
| private static final InternalState VALUE_TEXT; |
| private static final InternalState VALUE_Q_START; |
| private static final InternalState VALUE_Q; |
| private static final InternalState VALUE_DQ_START; |
| private static final InternalState VALUE_DQ; |
| private static final InternalState CDATA_COM_START; |
| private static final InternalState CDATA_COM_START_DASH; |
| private static final InternalState CDATA_COM_BODY; |
| private static final InternalState CDATA_COM_DASH; |
| private static final InternalState CDATA_COM_DASH_DASH; |
| private static final InternalState CDATA_TEXT; |
| private static final InternalState CDATA_LT; |
| private static final InternalState CDATA_MAY_CLOSE; |
| private static final InternalState JS_FILE; |
| private static final InternalState CSS_FILE; |
| |
| static { |
| TEXT = InternalState.getInstanceHtml("TEXT"); |
| TAG_START = InternalState.getInstanceHtml("TAG_START"); |
| TAG_NAME = InternalState.getInstanceHtml("TAG_NAME"); |
| DECL_START = InternalState.getInstanceHtml("DECL_START"); |
| DECL_BODY = InternalState.getInstanceHtml("DECL_BODY"); |
| COM_OPEN = InternalState.getInstanceHtml("COM_OPEN"); |
| COM_BODY = InternalState.getInstanceHtml("COM_BODY"); |
| COM_DASH = InternalState.getInstanceHtml("COM_DASH"); |
| COM_DASH_DASH = InternalState.getInstanceHtml("COM_DASH_DASH"); |
| PI =InternalState.getInstanceHtml("PI"); |
| PI_MAY_END = InternalState.getInstanceHtml("PI_MAY_END"); |
| TAG_SPACE = InternalState.getInstanceHtml("TAG_SPACE"); |
| TAG_CLOSE = InternalState.getInstanceHtml("TAG_CLOSE"); |
| ATTR = InternalState.getInstanceHtml("ATTR"); |
| ATTR_SPACE = InternalState.getInstanceHtml("ATTR_SPACE"); |
| VALUE = InternalState.getInstanceHtml("VALUE"); |
| VALUE_TEXT = InternalState.getInstanceHtml("VALUE_TEXT"); |
| VALUE_Q_START = InternalState.getInstanceHtml("VALUE_Q_START"); |
| VALUE_Q = InternalState.getInstanceHtml("VALUE_Q"); |
| VALUE_DQ_START = InternalState.getInstanceHtml("VALUE_DQ_START"); |
| VALUE_DQ = InternalState.getInstanceHtml("VALUE_DQ"); |
| CDATA_COM_START = InternalState.getInstanceHtml("CDATA_COM_START"); |
| CDATA_COM_START_DASH = |
| InternalState.getInstanceHtml("CDATA_COM_START_DASH"); |
| CDATA_COM_BODY = InternalState.getInstanceHtml("CDATA_COM_BODY"); |
| CDATA_COM_DASH = InternalState.getInstanceHtml("CDATA_COM_DASH"); |
| CDATA_COM_DASH_DASH = InternalState.getInstanceHtml("CDATA_COM_DASH_DASH"); |
| CDATA_TEXT = InternalState.getInstanceHtml("CDATA_TEXT"); |
| CDATA_LT = InternalState.getInstanceHtml("CDATA_LT"); |
| CDATA_MAY_CLOSE = InternalState.getInstanceHtml("CDATA_MAY_CLOSE"); |
| JS_FILE = InternalState.getInstanceHtml("JS_FILE"); |
| CSS_FILE = InternalState.getInstanceHtml("CSS_FILE"); |
| } |
| |
| private static final Map<InternalState, ExternalState> STATE_MAPPING = |
| Maps.newHashMap(); |
| static { |
| initializeStateMapping(); |
| } |
| |
| private static final ParserStateTable STATE_TABLE = new ParserStateTable(); |
| static { |
| initializeParserStateTable(); |
| } |
| |
| private final CharacterRecorder tag; |
| private final CharacterRecorder attr; |
| private final CharacterRecorder value; |
| private final CharacterRecorder cdataCloseTag; |
| private final EntityResolver entityResolver; |
| private final JavascriptParserImpl jsParser; |
| private boolean insideJavascript; |
| private int valueIndex; |
| // True iff InsertText() was called at the start of a URL attribute value. |
| private boolean textInsideUrlValue; |
| |
| /** |
| * Creates an {@code HtmlParserImpl} object. |
| * |
| * <p>Both for performance reasons and to leverage code a state-flow machine |
| * that is automatically generated from Python for multiple target |
| * languages, this object uses a static {@code ParserStateTable} that |
| * is read-only and obtained from the generated code in {@code HtmlParserFsm}. |
| * That code also maintains the mapping from internal states |
| * ({@code InternalState}) to external states ({@code ExternalState}). |
| */ |
| public HtmlParserImpl() { |
| super(STATE_TABLE, STATE_MAPPING, TEXT); |
| tag = new CharacterRecorder(); |
| attr = new CharacterRecorder(); |
| value = new CharacterRecorder(); |
| cdataCloseTag = new CharacterRecorder(); |
| entityResolver = new EntityResolver(); |
| jsParser = new JavascriptParserImpl(); |
| insideJavascript = false; |
| valueIndex = 0; |
| textInsideUrlValue = false; |
| } |
| |
| /** |
| * Creates an {@code HtmlParserImpl} that is a copy of the one provided. |
| * |
| * @param aHtmlParserImpl the {@code HtmlParserImpl} object to copy |
| */ |
| public HtmlParserImpl(HtmlParserImpl aHtmlParserImpl) { |
| super(aHtmlParserImpl); |
| tag = new CharacterRecorder(aHtmlParserImpl.tag); |
| attr = new CharacterRecorder(aHtmlParserImpl.attr); |
| value = new CharacterRecorder(aHtmlParserImpl.value); |
| cdataCloseTag = new CharacterRecorder(aHtmlParserImpl.cdataCloseTag); |
| entityResolver = new EntityResolver(aHtmlParserImpl.entityResolver); |
| jsParser = new JavascriptParserImpl(aHtmlParserImpl.jsParser); |
| insideJavascript = aHtmlParserImpl.insideJavascript; |
| valueIndex = aHtmlParserImpl.valueIndex; |
| textInsideUrlValue = aHtmlParserImpl.textInsideUrlValue; |
| } |
| |
| @Override |
| public boolean inJavascript() { |
| return (insideJavascript |
| && ( (getState() == STATE_VALUE) |
| || (currentState == CDATA_TEXT) |
| || (currentState == CDATA_COM_START) |
| || (currentState == CDATA_COM_START_DASH) |
| || (currentState == CDATA_COM_BODY) |
| || (currentState == CDATA_COM_DASH) |
| || (currentState == CDATA_COM_DASH_DASH) |
| || (currentState == CDATA_LT) |
| || (currentState == CDATA_MAY_CLOSE) |
| || (currentState == JS_FILE) )); |
| } |
| |
| @Override |
| public boolean isJavascriptQuoted() { |
| if (inJavascript()) { |
| ExternalState jsParserState = jsParser.getState(); |
| return (jsParserState == JavascriptParserImpl.STATE_Q |
| || jsParserState == JavascriptParserImpl.STATE_DQ); |
| } |
| return false; |
| } |
| |
| @Override |
| public boolean inAttribute() { |
| ExternalState extState = getState(); |
| return (extState != null && (extState == STATE_ATTR |
| || extState == STATE_VALUE)); |
| } |
| |
| /** |
| * Returns {@code true} if and only if the parser is currently within |
| * a CSS context. A CSS context is one of the below: |
| * <ul> |
| * <li>Inside a STYLE tag. |
| * <li>Inside a STYLE attribute. |
| * <li>Inside a CSS file when the parser was reset in the CSS mode. |
| * </ul> |
| * |
| * @return {@code true} if and only if the parser is inside CSS |
| */ |
| @Override |
| public boolean inCss() { |
| return (currentState == CSS_FILE |
| || (getState() == STATE_VALUE |
| && (getAttributeType() == ATTR_TYPE.STYLE)) |
| || ("style".equals(getTag()))); |
| } |
| |
| @Override |
| public ATTR_TYPE getAttributeType() { |
| String attribute = getAttribute(); |
| if (!inAttribute()) { |
| return ATTR_TYPE.NONE; |
| } |
| if (HtmlUtils.isAttributeJavascript(attribute)) { |
| return ATTR_TYPE.JS; |
| } |
| if (HtmlUtils.isAttributeUri(attribute)) { |
| return ATTR_TYPE.URI; |
| } |
| if (HtmlUtils.isAttributeStyle(attribute)) { |
| return ATTR_TYPE.STYLE; |
| } |
| |
| // Special logic to handle the "content" attribute of the "meta" tag. |
| if ("meta".equals(getTag()) && "content".equals(getAttribute())) { |
| HtmlUtils.META_REDIRECT_TYPE redirectType = |
| HtmlUtils.parseContentAttributeForUrl(getValue()); |
| if (redirectType == HtmlUtils.META_REDIRECT_TYPE.URL_START || |
| redirectType == HtmlUtils.META_REDIRECT_TYPE.URL) |
| return ATTR_TYPE.URI; |
| } |
| |
| return ATTR_TYPE.REGULAR; |
| } |
| |
| @Override |
| public ExternalState getJavascriptState() { |
| return jsParser.getState(); |
| } |
| |
| @Override |
| public boolean isAttributeQuoted() { |
| return (currentState == VALUE_Q_START |
| || currentState == VALUE_Q |
| || currentState == VALUE_DQ_START |
| || currentState == VALUE_DQ); |
| } |
| |
| @Override |
| public String getTag() { |
| return tag.getContent().toLowerCase(); |
| } |
| |
| @Override |
| public String getAttribute() { |
| return inAttribute() ? attr.getContent().toLowerCase() : ""; |
| } |
| |
| @Override |
| public String getValue() { |
| return (getState() == STATE_VALUE) ? value.getContent() : ""; |
| } |
| |
| @Override |
| public int getValueIndex() { |
| if (getState() != STATE_VALUE) { |
| return 0; |
| } |
| return valueIndex; |
| } |
| |
| @Override |
| public boolean isUrlStart() { |
| // False when not inside an HTML attribute value |
| if (getState() != STATE_VALUE) { |
| return false; |
| } |
| |
| // Or when the HTML attribute is not of URI type. |
| if (getAttributeType() != ATTR_TYPE.URI) { |
| return false; |
| } |
| |
| // Or when we received an InsertText() directive at the start of a URL. |
| if (textInsideUrlValue) { |
| return false; |
| } |
| |
| if ("meta".equals(getTag())) { |
| // At this point, we know we are in the "content" attribute |
| // or we would not have the URI attribute type. |
| return (HtmlUtils.parseContentAttributeForUrl(getValue()) == |
| HtmlUtils.META_REDIRECT_TYPE.URL_START); |
| } |
| |
| // For all other URI attributes, check if we are at index 0. |
| return (getValueIndex() == 0); |
| } |
| |
| /** |
| * {@inheritDoc} |
| * |
| * Resets the state of the parser to a state consistent with the |
| * {@code Mode} provided. This will reset finer-grained state |
| * information back to a default value, hence use only when |
| * you want to parse text from a very clean slate. |
| * |
| * <p>See the {@link HtmlParser.Mode} enum for information on all |
| * the valid modes. |
| * |
| * @param mode is an enum representing the high-level state of the parser |
| */ |
| @Override |
| public void resetMode(Mode mode) { |
| insideJavascript = false; |
| tag.reset(); |
| attr.reset(); |
| value.reset(); |
| cdataCloseTag.reset(); |
| valueIndex = 0; |
| textInsideUrlValue = false; |
| jsParser.reset(); |
| |
| switch (mode) { |
| case HTML: |
| currentState = TEXT; |
| break; |
| case JS: |
| currentState = JS_FILE; |
| insideJavascript = true; |
| break; |
| case CSS: |
| currentState = CSS_FILE; |
| break; |
| case HTML_IN_TAG: |
| currentState = TAG_SPACE; |
| break; |
| default: |
| throw new IllegalArgumentException("Did not recognize Mode: " + |
| mode.toString()); |
| } |
| } |
| |
| /** |
| * Resets the state of the parser to the initial state of parsing HTML. |
| */ |
| public void reset() { |
| super.reset(); |
| resetMode(Mode.HTML); |
| } |
| |
| /** |
| * A specialized directive to tell the parser there is some content |
| * that will be inserted here but that it will not get to parse. Used |
| * by the template system that may not be able to give some content |
| * to the parser but wants it to know there typically will be content |
| * inserted at that point. This is a hint used in corner cases within |
| * parsing of HTML attribute names and values where content we do not |
| * get to see could affect our parsing and alter our current state. |
| * |
| * <p>The two cases where {@code #insertText()} affects our parsing are: |
| * <ul> |
| * <li>We are at the start of the value of a URL-accepting HTML attribute. In |
| * that case, we change internal state to no longer be considered at the |
| * start of the URL. This may affect what escaping template systems may want |
| * to perform on the HTML attribute value. We avoid injecting fake data and |
| * hence not modify the current index of the value as determined by |
| * {@link #getValueIndex()}</li> |
| * <li>We just transitioned from an attribute name to an attribute value |
| * (by parsing the separating {@code '='} character). In that case, we |
| * change internal state to be now inside a non-quoted HTML attribute |
| * value.</li> |
| * </ul> |
| * |
| * @throws ParseException if an unrecoverable error occurred during parsing |
| */ |
| @Override |
| public void insertText() throws ParseException { |
| // Case: Inside URL attribute value. |
| if (getState() == STATE_VALUE |
| && getAttributeType() == ATTR_TYPE.URI |
| && isUrlStart()) { |
| textInsideUrlValue = true; |
| } |
| // Case: Before parsing any attribute value. |
| if (currentState == VALUE) { |
| setNextState(VALUE_TEXT); |
| } |
| } |
| |
| @Override |
| protected InternalState handleEnterState(InternalState currentState, |
| InternalState expectedNextState, |
| char input) { |
| InternalState nextState = expectedNextState; |
| if (currentState == TAG_NAME) { |
| enterTagName(); |
| } else if (currentState == ATTR) { |
| enterAttribute(); |
| } else if (currentState == TAG_CLOSE) { |
| nextState = tagClose(currentState); |
| } else if (currentState == CDATA_MAY_CLOSE) { |
| enterStateCdataMayClose(); |
| } else if (currentState == VALUE) { |
| enterValue(); |
| } else |
| if (currentState == VALUE_TEXT || currentState == VALUE_Q |
| || currentState == VALUE_DQ) { |
| enterValueContent(); |
| } |
| return nextState; |
| } |
| |
| @Override |
| protected InternalState handleExitState(InternalState currentState, |
| InternalState expectedNextState, |
| char input) { |
| InternalState nextState = expectedNextState; |
| if (currentState == TAG_NAME) { |
| exitTagName(); |
| } else if (currentState == ATTR) { |
| exitAttribute(); |
| } else if (currentState == CDATA_MAY_CLOSE) { |
| nextState = exitStateCdataMayClose(nextState, input); |
| } else |
| if ((currentState == VALUE_TEXT) || (currentState == VALUE_Q) |
| || (currentState == VALUE_DQ)) { |
| exitValueContent(); |
| } |
| return nextState; |
| } |
| |
| @Override |
| protected InternalState handleInState(InternalState currentState, |
| char input) throws ParseException { |
| if ((currentState == CDATA_TEXT) |
| || (currentState == CDATA_COM_START) |
| || (currentState == CDATA_COM_START_DASH) |
| || (currentState == CDATA_COM_BODY) |
| || (currentState == CDATA_COM_DASH) |
| || (currentState == CDATA_COM_DASH_DASH) |
| || (currentState == CDATA_LT) |
| || (currentState == CDATA_MAY_CLOSE) |
| || (currentState == JS_FILE)) { |
| inStateCdata(input); |
| } else if ((currentState == VALUE_TEXT) |
| || (currentState == VALUE_Q) |
| || (currentState == VALUE_DQ)) { |
| inStateValue(input); |
| } |
| return currentState; |
| } |
| |
| /** |
| * Invokes recording on all CharacterRecorder objects. Currently we do |
| * not check that one and only one of them is recording. I did a fair |
| * bit of testing on the C++ parser and was not convinced there is |
| * such a guarantee. |
| */ |
| @Override |
| protected void record(char input) { |
| attr.maybeRecord(input); |
| tag.maybeRecord(input); |
| value.maybeRecord(input); |
| cdataCloseTag.maybeRecord(input); |
| } |
| |
| /** |
| * Starts recording the name of the HTML tag. Called when the parser |
| * enters a new tag. |
| */ |
| private void enterTagName() { |
| tag.startRecording(); |
| } |
| |
| private void exitTagName() { |
| tag.stopRecording(); |
| String tagString = tag.getContent(); |
| if (!tagString.isEmpty() && tagString.charAt(0) == '/') { |
| tag.reset(); |
| } |
| } |
| |
| /** |
| * Starts recording the name of the HTML attribute. Called when the parser |
| * enters a new HTML attribute. |
| */ |
| private void enterAttribute() { |
| attr.startRecording(); |
| } |
| |
| private void exitAttribute() { |
| attr.stopRecording(); |
| } |
| |
| /** |
| * Tracks the index within the HTML attribute value and initializes |
| * the javascript parser for attributes that take javascript. |
| * |
| * Called when the parser enters a new HTML attribute value. |
| */ |
| private void enterValue() { |
| valueIndex = 0; |
| textInsideUrlValue = false; |
| if (HtmlUtils.isAttributeJavascript(getAttribute())) { |
| entityResolver.reset(); |
| jsParser.reset(); |
| insideJavascript = true; |
| } else { |
| insideJavascript = false; |
| } |
| } |
| |
| /** |
| * Starts recordning the contents of the attribute value. |
| * |
| * Called when entering an attribute value. |
| */ |
| private void enterValueContent() { |
| value.startRecording(); |
| } |
| |
| /** |
| * Stops the recording of the attribute value and exits javascript |
| * (in case we were inside it). |
| */ |
| private void exitValueContent() { |
| value.stopRecording(); |
| insideJavascript = false; |
| } |
| |
| /** |
| * Processes javascript after performing entity resolution and updates |
| * the position within the attribute value. |
| * If the status of the entity resolution is <code>IN_PROGRESS</code>, |
| * we don't invoke the javascript parser. |
| * |
| * <p>Called for every character inside an attribute value. |
| * |
| * @param input character read |
| * @throws ParseException if an unrecoverable error occurred during parsing |
| */ |
| private void inStateValue(char input) throws ParseException { |
| valueIndex++; |
| if (insideJavascript) { |
| EntityResolver.Status status = entityResolver.processChar(input); |
| if (status == EntityResolver.Status.COMPLETED) { |
| jsParser.parse(entityResolver.getEntity()); |
| entityResolver.reset(); |
| } else if (status == EntityResolver.Status.NOT_STARTED) { |
| jsParser.parse(input); |
| } |
| } |
| } |
| |
| /** |
| * Handles the tag it finished reading. |
| * |
| * <p>For a script tag, it initializes the javascript parser. For all |
| * tags that are recognized to have CDATA values |
| * (including the script tag), it switches the CDATA state to handle them |
| * properly. For code simplification, CDATA and RCDATA sections are |
| * treated the same. |
| * |
| * <p>Called when the parser leaves a tag definition. |
| * |
| * @param state current state |
| * @return state next state, could be the same as current state |
| */ |
| private InternalState tagClose(InternalState state) { |
| InternalState nextState = state; |
| String tagName = getTag(); |
| if ("script".equals(tagName)) { |
| nextState = CDATA_TEXT; |
| jsParser.reset(); |
| insideJavascript = true; |
| } else if ("style".equals(tagName) |
| || "title".equals(tagName) |
| || "textarea".equals(tagName)) { |
| nextState = CDATA_TEXT; |
| insideJavascript = false; |
| } |
| return nextState; |
| } |
| |
| /** |
| * Feeds the character to the javascript parser for processing. |
| * |
| * <p>Called inside CDATA blocks to parse javascript. |
| * |
| * @param input character read |
| * @throws ParseException if an unrecoverable error occurred during parsing |
| */ |
| private void inStateCdata(char input) throws ParseException { |
| if (insideJavascript) { |
| jsParser.parse(input); |
| } |
| } |
| |
| /** |
| * Starts recording. This is so we find the closing tag name in order to |
| * know if the tag is going to be closed or not. |
| * |
| * <p>Called when encountering a '<' character in a CDATA section. |
| */ |
| private void enterStateCdataMayClose() { |
| cdataCloseTag.startRecording(); |
| } |
| |
| /** |
| * Determines whether to close the tag element, It closes it if it finds |
| * the corresponding end tag. Called when reading what could be a |
| * closing CDATA tag. |
| * |
| * @param input the character read |
| * @param expectedNextState the expected state to go to next |
| * unless we want to change it here |
| * @return the next state to go to |
| */ |
| private InternalState exitStateCdataMayClose( |
| InternalState expectedNextState, |
| char input) { |
| InternalState nextState = expectedNextState; |
| cdataCloseTag.stopRecording(); |
| String cdataCloseTagString = cdataCloseTag.getContent(); |
| Preconditions.checkState(!cdataCloseTagString.isEmpty() |
| && cdataCloseTagString.charAt(0) == '/'); // Developer error. |
| |
| if (cdataCloseTagString.substring(1).equalsIgnoreCase(getTag()) |
| && (input == '>' || HtmlUtils.isHtmlSpace(input))) { |
| tag.clear(); |
| insideJavascript = false; |
| } else { |
| nextState = CDATA_TEXT; |
| } |
| return nextState; |
| } |
| |
| |
| // ======================================================= // |
| // SECTION BELOW WILL ALL BE AUTO-GENERATED IN FUTURE. // |
| // ======================================================= // |
| |
| private static void registerMapping(InternalState internalState, |
| ExternalState externalState) { |
| STATE_MAPPING.put(internalState, externalState); |
| } |
| |
| private static void initializeStateMapping() { |
| // Each parser implementation must map the error state appropriately. |
| registerMapping(InternalState.INTERNAL_ERROR_STATE, HtmlParser.STATE_ERROR); |
| |
| registerMapping(TEXT, HtmlParser.STATE_TEXT); |
| registerMapping(TAG_START, HtmlParser.STATE_TAG); |
| registerMapping(TAG_NAME, HtmlParser.STATE_TAG); |
| registerMapping(DECL_START, HtmlParser.STATE_TEXT); |
| registerMapping(DECL_BODY, HtmlParser.STATE_TEXT); |
| registerMapping(COM_OPEN, HtmlParser.STATE_TEXT); |
| registerMapping(COM_BODY, HtmlParser.STATE_COMMENT); |
| registerMapping(COM_DASH, HtmlParser.STATE_COMMENT); |
| registerMapping(COM_DASH_DASH, HtmlParser.STATE_COMMENT); |
| registerMapping(PI, HtmlParser.STATE_TEXT); |
| registerMapping(PI_MAY_END, HtmlParser.STATE_TEXT); |
| registerMapping(TAG_SPACE, HtmlParser.STATE_TAG); |
| registerMapping(TAG_CLOSE, HtmlParser.STATE_TEXT); |
| registerMapping(ATTR, HtmlParser.STATE_ATTR); |
| registerMapping(ATTR_SPACE, HtmlParser.STATE_ATTR); |
| registerMapping(VALUE, HtmlParser.STATE_VALUE); |
| registerMapping(VALUE_TEXT, HtmlParser.STATE_VALUE); |
| registerMapping(VALUE_Q_START, HtmlParser.STATE_VALUE); |
| registerMapping(VALUE_Q, HtmlParser.STATE_VALUE); |
| registerMapping(VALUE_DQ_START, HtmlParser.STATE_VALUE); |
| registerMapping(VALUE_DQ, HtmlParser.STATE_VALUE); |
| registerMapping(CDATA_COM_START, HtmlParser.STATE_TEXT); |
| registerMapping(CDATA_COM_START_DASH, HtmlParser.STATE_TEXT); |
| registerMapping(CDATA_COM_BODY, HtmlParser.STATE_TEXT); |
| registerMapping(CDATA_COM_DASH, HtmlParser.STATE_TEXT); |
| registerMapping(CDATA_COM_DASH_DASH, HtmlParser.STATE_TEXT); |
| registerMapping(CDATA_TEXT, HtmlParser.STATE_TEXT); |
| registerMapping(CDATA_LT, HtmlParser.STATE_TEXT); |
| registerMapping(CDATA_MAY_CLOSE, HtmlParser.STATE_TEXT); |
| registerMapping(JS_FILE, HtmlParser.STATE_JS_FILE); |
| registerMapping(CSS_FILE, HtmlParser.STATE_CSS_FILE); |
| } |
| |
| private static void registerTransition(String expression, |
| InternalState source, |
| InternalState to) { |
| // It seems to silly to go through a StateTableTransition here |
| // but it adds extra data checking. |
| StateTableTransition stt = new StateTableTransition(expression, |
| source, to); |
| STATE_TABLE.setExpression(stt.getExpression(), stt.getFrom(), |
| stt.getTo()); |
| } |
| |
| // NOTE: The "[:default:]" transition should be registered before any |
| // other transitions for a given state or it will over-write them. |
| private static void initializeParserStateTable() { |
| registerTransition("[:default:]", CSS_FILE, CSS_FILE); |
| registerTransition("[:default:]", JS_FILE, JS_FILE); |
| registerTransition("[:default:]", CDATA_MAY_CLOSE, CDATA_TEXT); |
| registerTransition(" \t\n\r", CDATA_MAY_CLOSE, TAG_SPACE); |
| registerTransition(">", CDATA_MAY_CLOSE, TEXT); |
| registerTransition("A-Za-z0-9/_:-", CDATA_MAY_CLOSE, CDATA_MAY_CLOSE); |
| registerTransition("[:default:]", CDATA_LT, CDATA_TEXT); |
| registerTransition("!", CDATA_LT, CDATA_COM_START); |
| registerTransition("/", CDATA_LT, CDATA_MAY_CLOSE); |
| registerTransition("[:default:]", CDATA_TEXT, CDATA_TEXT); |
| registerTransition("<", CDATA_TEXT, CDATA_LT); |
| registerTransition("[:default:]", CDATA_COM_DASH_DASH, CDATA_COM_BODY); |
| registerTransition(">", CDATA_COM_DASH_DASH, CDATA_TEXT); |
| registerTransition("-", CDATA_COM_DASH_DASH, CDATA_COM_DASH_DASH); |
| registerTransition("[:default:]", CDATA_COM_DASH, CDATA_COM_BODY); |
| registerTransition("-", CDATA_COM_DASH, CDATA_COM_DASH_DASH); |
| registerTransition("[:default:]", CDATA_COM_BODY, CDATA_COM_BODY); |
| registerTransition("-", CDATA_COM_BODY, CDATA_COM_DASH); |
| registerTransition("[:default:]", CDATA_COM_START_DASH, CDATA_TEXT); |
| registerTransition("-", CDATA_COM_START_DASH, CDATA_COM_BODY); |
| registerTransition("[:default:]", CDATA_COM_START, CDATA_TEXT); |
| registerTransition("-", CDATA_COM_START, CDATA_COM_START_DASH); |
| registerTransition("[:default:]", VALUE_DQ, VALUE_DQ); |
| registerTransition("\"", VALUE_DQ, TAG_SPACE); |
| registerTransition("[:default:]", VALUE_DQ_START, VALUE_DQ); |
| registerTransition("\"", VALUE_DQ_START, TAG_SPACE); |
| registerTransition("[:default:]", VALUE_Q, VALUE_Q); |
| registerTransition("\'", VALUE_Q, TAG_SPACE); |
| registerTransition("[:default:]", VALUE_Q_START, VALUE_Q); |
| registerTransition("\'", VALUE_Q_START, TAG_SPACE); |
| registerTransition("[:default:]", VALUE_TEXT, VALUE_TEXT); |
| registerTransition(" \t\n\r", VALUE_TEXT, TAG_SPACE); |
| registerTransition(">", VALUE_TEXT, TAG_CLOSE); |
| registerTransition("[:default:]", VALUE, VALUE_TEXT); |
| registerTransition(">", VALUE, TAG_CLOSE); |
| registerTransition(" \t\n\r", VALUE, VALUE); |
| registerTransition("\"", VALUE, VALUE_DQ_START); |
| registerTransition("\'", VALUE, VALUE_Q_START); |
| registerTransition("=", ATTR_SPACE, VALUE); |
| registerTransition("/", ATTR_SPACE, TAG_SPACE); |
| registerTransition("A-Za-z0-9_:-", ATTR_SPACE, ATTR); |
| registerTransition(" \t\n\r", ATTR_SPACE, ATTR_SPACE); |
| registerTransition(">", ATTR_SPACE, TAG_CLOSE); |
| registerTransition(" \t\n\r", ATTR, ATTR_SPACE); |
| registerTransition("=", ATTR, VALUE); |
| registerTransition("/", ATTR, TAG_SPACE); |
| registerTransition(">", ATTR, TAG_CLOSE); |
| registerTransition("A-Za-z0-9_:.-", ATTR, ATTR); |
| registerTransition("[:default:]", TAG_CLOSE, TEXT); |
| registerTransition("<", TAG_CLOSE, TAG_START); |
| registerTransition("/", TAG_SPACE, TAG_SPACE); |
| registerTransition("A-Za-z0-9_:-", TAG_SPACE, ATTR); |
| registerTransition(" \t\n\r", TAG_SPACE, TAG_SPACE); |
| registerTransition(">", TAG_SPACE, TAG_CLOSE); |
| registerTransition("[:default:]", PI_MAY_END, PI); |
| registerTransition(">", PI_MAY_END, TEXT); |
| registerTransition("[:default:]", PI, PI); |
| registerTransition("?", PI, PI_MAY_END); |
| registerTransition("[:default:]", COM_DASH_DASH, COM_BODY); |
| registerTransition(">", COM_DASH_DASH, TEXT); |
| registerTransition("-", COM_DASH_DASH, COM_DASH_DASH); |
| registerTransition("[:default:]", COM_DASH, COM_BODY); |
| registerTransition("-", COM_DASH, COM_DASH_DASH); |
| registerTransition("[:default:]", COM_BODY, COM_BODY); |
| registerTransition("-", COM_BODY, COM_DASH); |
| registerTransition("[:default:]", COM_OPEN, TEXT); |
| registerTransition("-", COM_OPEN, COM_BODY); |
| registerTransition("[:default:]", DECL_BODY, DECL_BODY); |
| registerTransition(">", DECL_BODY, TEXT); |
| registerTransition("[:default:]", DECL_START, DECL_BODY); |
| registerTransition(">", DECL_START, TEXT); |
| registerTransition("-", DECL_START, COM_OPEN); |
| registerTransition(">", TAG_NAME, TAG_CLOSE); |
| registerTransition(" \t\n\r", TAG_NAME, TAG_SPACE); |
| registerTransition("A-Za-z0-9/_:-", TAG_NAME, TAG_NAME); |
| |
| // Manual change to remain in-sync with CL 10597850 in C HtmlParser. |
| registerTransition("[:default:]", TAG_START, TEXT); |
| registerTransition("<", TAG_START, TAG_START); |
| // End of manual change. |
| |
| registerTransition("!", TAG_START, DECL_START); |
| registerTransition("?", TAG_START, PI); |
| registerTransition("A-Za-z0-9/_:-", TAG_START, TAG_NAME); |
| registerTransition("[:default:]", TEXT, TEXT); |
| registerTransition("<", TEXT, TAG_START); |
| } |
| } |