blob: a79a640db96215ad211a645c7b656eede88219c9 [file] [log] [blame]
/*
* Copyright (C) 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.streamhtmlparser.impl;
import com.google.common.base.Preconditions;
/**
* Holds a state table which is defined as the set of all
* recognized state transitions and the set of characters that
* trigger them.
*
* <p>The logic of what character causes what state transition is derived from
* a base definition written as a Python configuration file in the original
* C++ parser.
*
* <p>This class provides methods to initially build the state table and then
* methods at parsing time to determine the transitions to subsequent states.
*
* <p>Note on characters outside the extended ASCII range: Currently, all state
* transitions in the Python configuration file trigger only on extended
* ASCII characters, that is characters in the Unicode space of [U+0000 to
* U+00FF]. We use that property to design a more efficient state transition
* representation. When receiving characters outside that ASCII range, we
* simply apply the DEFAULT transition for the given state - as we do for any
* character that is not a hot character for that state. If no default
* transition exists, we switch to the Internal Error state.
*
* <p>Technical note: In Java, a {@code char} is a code unit and in some cases
* may not represent a complete Unicode code point. However, when that happens,
* the code units that follow for that code point are all in the surrogate area
* [U+D800 - U+DFFF] and hence outside of the ASCII range and will not trigger
* any incorrect state transitions.
*
* <p>This class is storage-inefficient but it is static at least
* and not generated for each Parser instance.
*/
class ParserStateTable {
/**
* A limit on how many different states we can have in one state table.
* Can be increased should it no longer be sufficient.
*/
private static final int MAX_STATES = 256;
/**
* We only check transitions for (extended) ASCII characters, hence
* characters in the range 0 to MAX_CHARS -1.
*/
private static final int MAX_CHARS = 256;
/**
* Records all state transitions except those identified as DEFAULT
* transitions. It is two dimensional: Stores a target {@code InternalState}
* given a source state (referenced by its numeric ID) and the current
* character.
*/
private final InternalState[][] stateTable;
/**
* Records all DEFAULT state transitions. These are transitions provided
* using the {@code "[:default:]"} syntax in the Python configuration file.
* There can be only one such transition for any given source state, hence
* the array is one dimensional.
*/
private final InternalState[] defaultStateTable;
public ParserStateTable() {
stateTable = new InternalState[MAX_STATES][MAX_CHARS];
defaultStateTable = new InternalState[MAX_STATES];
}
/**
* Returns the state to go to when receiving the current {@code char}
* in the {@code from} state.
* Returns {@code InternalState.INTERNAL_ERROR_STATE} if there is no
* state transition for that character and no default state transition
* for that state.
*
* <p>For ASCII characters, first look-up an explicit state transition for
* the current character. If none is found, look-up a default transition. For
* non-ASCII characters, look-up a default transition only.
*
* @param from the source state
* @param currentChar the character received
* @return the state to move to or {@code InternalState.INTERNAL_ERROR_STATE}
*/
InternalState getNextState(InternalState from, int currentChar) {
// TODO: Consider throwing run-time error here.
if (from == null || currentChar < 0)
return InternalState.INTERNAL_ERROR_STATE;
int id = from.getId();
if (id < 0 || id >= MAX_STATES) {
return InternalState.INTERNAL_ERROR_STATE;
}
InternalState result = null;
if (currentChar < MAX_CHARS) {
result = stateTable[id][currentChar];
}
if (result == null) {
result = defaultStateTable[from.getId()];
}
return result != null ? result : InternalState.INTERNAL_ERROR_STATE;
}
void setExpression(String expr, InternalState from, InternalState to) {
if ((expr == null) || (from == null) || (to == null)) {
return;
}
// This special string indicates a default state transition.
if ("[:default:]".equals(expr)) {
setDefaultDestination(from, to);
return;
}
int i = 0;
while (i < expr.length()) {
// If next char is a '-' which is not the last character of the expr
if (i < (expr.length() - 2) && expr.charAt(i + 1) == '-') {
setRange(from, expr.charAt(i), expr.charAt(i + 2), to);
i += 2;
} else {
setDestination(from, expr.charAt(i), to);
i++;
}
}
}
private void fill(InternalState from, InternalState to) {
char c;
for (c = 0; c < MAX_CHARS; c++) {
setDestination(from, c, to);
}
}
private void setDefaultDestination(InternalState from, InternalState to) {
Preconditions.checkNotNull(from); // Developer error if it triggers
Preconditions.checkNotNull(to); // Developer error if it triggers
int id = from.getId();
if ((id < 0) || (id >= MAX_STATES)) {
return;
}
// TODO: Consider asserting if there was a state transition defined.
defaultStateTable[from.getId()] = to;
}
private void setDestination(InternalState from, char chr, InternalState to) {
Preconditions.checkNotNull(from); // Developer error if it triggers
Preconditions.checkNotNull(to); // Developer error if it triggers
Preconditions.checkArgument(chr >= 0 && chr < MAX_CHARS,
"char must be in ASCII set: %c", chr);
int id = from.getId();
if ((id < 0) || (id >= MAX_STATES)) {
return;
}
stateTable[from.getId()][chr] = to;
}
private void setRange(InternalState from, char start, char end,
InternalState to) {
// Developer error if either trigger.
Preconditions.checkArgument(start >= 0 && start < MAX_CHARS,
"char must be in ASCII set: %c", start);
Preconditions.checkArgument(end >= 0 && end < MAX_CHARS,
"char must be in ASCII set: %c", end);
char c;
for (c = start; c <= end; c++) {
setDestination(from, c, to);
}
}
}