blob: 06af6dcf070ee6c08ff8a6a4def8a135c38df14f [file] [log] [blame]
// Copyright (c) 2011, Mike Samuel
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// Neither the name of the OWASP nor the names of its contributors may
// be used to endorse or promote products derived from this software
// without specific prior written permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
package org.owasp.html;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
/**
* A trie used to separate punctuation tokens in a run of non-whitespace
* characters by preferring the longest punctuation string possible in a
* greedy left-to-right scan.
*
* @author Mike Samuel <mikesamuel@gmail.com>
*/
final class Trie {
private final char[] childMap;
private final Trie[] children;
private final boolean terminal;
private final int value;
/**
* @param elements not empty, non null.
*/
public Trie(Map<String, Integer> elements) {
this(sortedUniqEntries(elements), 0);
}
private Trie(List<Map.Entry<String, Integer>> elements, int depth) {
this(elements, depth, 0, elements.size());
}
/**
* @param elements not empty, non null. Not modified.
* @param depth the depth in the tree.
* @param start an index into punctuationStrings of the first string in this
* subtree.
* @param end an index into punctuationStrings past the last string in this
* subtree.
*/
private Trie(
List<Map.Entry<String, Integer>> elements, int depth,
int start, int end) {
this.terminal = depth == elements.get(start).getKey().length();
if (this.terminal) {
this.value = elements.get(start).getValue();
if (start + 1 == end) { // base case
this.childMap = ZERO_CHARS;
this.children = ZERO_TRIES;
return;
} else {
++start;
}
} else {
this.value = Integer.MAX_VALUE;
}
int childCount = 0;
{
int last = -1;
for (int i = start; i < end; ++i) {
char ch = elements.get(i).getKey().charAt(depth);
if (ch != last) {
++childCount;
last = ch;
}
}
}
this.childMap = new char[childCount];
this.children = new Trie[childCount];
int childStart = start;
int childIndex = 0;
char lastCh = elements.get(start).getKey().charAt(depth);
for (int i = start + 1; i < end; ++i) {
char ch = elements.get(i).getKey().charAt(depth);
if (ch != lastCh) {
childMap[childIndex] = lastCh;
children[childIndex++] = new Trie(
elements, depth + 1, childStart, i);
childStart = i;
lastCh = ch;
}
}
childMap[childIndex] = lastCh;
children[childIndex++] = new Trie(elements, depth + 1, childStart, end);
}
/** Does this node correspond to a complete string in the input set. */
public boolean isTerminal() { return terminal; }
public int getValue() { return value; }
/**
* The child corresponding to the given character.
* @return null if no such trie.
*/
public Trie lookup(char ch) {
int i = Arrays.binarySearch(childMap, ch);
return i >= 0 ? children[i] : null;
}
/**
* The descendant of this trie corresponding to the string for this trie
* appended with s.
* @param s non null.
* @return null if no such trie.
*/
public Trie lookup(CharSequence s) {
Trie t = this;
for (int i = 0, n = s.length(); i < n; ++i) {
t = t.lookup(s.charAt(i));
if (null == t) { break; }
}
return t;
}
public boolean contains(char ch) {
return Arrays.binarySearch(childMap, ch) >= 0;
}
private static <T> List<Map.Entry<String, T>> sortedUniqEntries(
Map<String, T> m) {
return new ArrayList<Map.Entry<String, T>>(
new TreeMap<String, T>(m).entrySet());
}
private static final char[] ZERO_CHARS = new char[0];
private static final Trie[] ZERO_TRIES = new Trie[0];
/**
* Append all strings s such that {@code this.lookup(s).isTerminal()} to the
* given list in lexical order.
*/
public void toStringList(List<String> strings) {
toStringList("", strings);
}
private void toStringList(String prefix, List<String> strings) {
if (terminal) { strings.add(prefix); }
for (int i = 0, n = childMap.length; i < n; ++i) {
children[i].toStringList(prefix + childMap[i], strings);
}
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
toStringBuilder(0, sb);
return sb.toString();
}
private void toStringBuilder(int depth, StringBuilder sb) {
sb.append(terminal ? "terminal" : "nonterminal");
++depth;
for (int i = 0; i < childMap.length; ++i) {
sb.append('\n');
for (int d = 0; d < depth; ++d) {
sb.append('\t');
}
sb.append('\'').append(childMap[i]).append("' ");
children[i].toStringBuilder(depth, sb);
}
}
}