blob: f6f301a3162c248fa83ce8fd67b8e1544e81e500 [file] [log] [blame]
// Copyright (c) 2013, Mike Samuel
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// Neither the name of the OWASP nor the names of its contributors may
// be used to endorse or promote products derived from this software
// without specific prior written permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
package org.owasp.html;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import javax.annotation.Nullable;
import com.google.common.collect.ImmutableMap;
/**
* Given a string of CSS, produces a string of normalized CSS with certain
* useful properties detailed below.
* <ul>
* <li>All runs of white-space and comment tokens (including CDO and CDC)
* have been replaced with a single space character.</li>
* <li>All strings are quoted and escapes are escaped according to the
* following scheme:
* <table>
* <tr><td>NUL</td> <td><code>\0</code></tr>
* <tr><td>line feed</td> <td><code>\a</code></tr>
* <tr><td>form feed</td> <td><code>\c</code></tr>
* <tr><td>carriage return</td><td><code>\d</code></tr>
* <tr><td>double quote</td> <td><code>\22</code></tr>
* <tr><td>ampersand &amp;</td><td><code>\26</code></tr>
* <tr><td>single quote</td> <td><code>\27</code></tr>
* <tr><td>left-angle &lt;</td><td><code>\3c</code></tr>
* <tr><td>rt-angle &gt;</td> <td><code>\3e</code></tr>
* <tr><td>back slash</td> <td><code>\\</code></tr>
* <tr><td>all others</td> <td>raw</td></tr>
* </table>
* </li>
* <li>All <code>url(&hellip;)</code> tokens are quoted.
* <li>All keywords, identifiers, and hex literals are lower-case and have
* embedded escape sequences decoded.</li>
* <li>All brackets nest properly.</li>
* <li>Does not contain any case-insensitive variant of the sequences
* {@code <!--}, {@code -->}, {@code <![CDATA[}, {@code ]]>}, or
* {@code </style}.</li>
* <li>All delimiters that can start longer tokens are followed by a space.
* </ul>
*/
final class CssTokens implements Iterable<String> {
/** The normalized CSS text; every token is a substring of this string. */
public final String normalizedCss;
/** Maps bracket tokens to the token index of their partner bracket. */
public final Brackets brackets;
/**
* Offsets into {@link #normalizedCss}. Token {@code i} spans the characters
* from {@code tokenBreaks[i]} (inclusive) to {@code tokenBreaks[i+1]}
* (exclusive).
*/
private final int[] tokenBreaks;
/** The type of each token, indexed by token index. */
private final TokenType[] tokenTypes;
/** Returns a cursor positioned on the first token and bounded by the last. */
public TokenIterator start() {
  final int tokenCount = tokenTypes.length;
  return new TokenIterator(tokenCount);
}
/** Equivalent to {@link #start()}; allows use in a for-each loop. */
public TokenIterator iterator() {
  return start();
}
/**
 * Tokenizes the given CSS and returns its normalized form.
 *
 * @param css the CSS source text to tokenize.
 * @return the tokens of the normalized CSS.
 */
public static CssTokens lex(String css) {
  final Lexer tokenizer = new Lexer(css);
  tokenizer.lex();
  final CssTokens tokens = tokenizer.build();
  return tokens;
}
/**
 * A cursor into a list of tokens.
 * <p>
 * Besides the {@link Iterator} operations, the cursor exposes the current
 * token without consuming it ({@link #token}, {@link #type}) and can be
 * moved explicitly ({@link #advance}, {@link #backup}, {@link #seek}).
 */
public final class TokenIterator implements Iterator<String> {
  /** Index of the current token. */
  private int tokenIndex = 0;
  /** Exclusive upper bound on the token indices this cursor visits. */
  private final int limit;

  TokenIterator(int limit) {
    this.limit = limit;
  }

  public boolean hasNext() {
    return hasToken();
  }

  /**
   * Returns the current token and advances past it.
   *
   * @throws NoSuchElementException if the cursor is at its limit.
   */
  public String next() {
    // Check before reading: reading the token at the limit would index
    // past the end of tokenBreaks instead of failing per the Iterator
    // contract.
    if (!hasToken()) { throw new NoSuchElementException(); }
    String token = token();
    advance();
    return token;
  }

  /**
   * If the current token has a partner bracket, returns a cursor over the
   * tokens strictly between the current token and its partner, and moves
   * this cursor to just past the partner.
   *
   * @return null if the current token has no partner, in which case this
   *     cursor is not moved.
   * @throws NoSuchElementException if the cursor is at its limit.
   */
  public @Nullable TokenIterator spliceToEnd() {
    if (!hasNext()) { throw new NoSuchElementException(); }
    int end = brackets.partner(tokenIndex);
    if (end < 0) {
      return null;
    }
    TokenIterator between = new TokenIterator(end);
    between.tokenIndex = tokenIndex + 1;
    tokenIndex = end + 1;
    return between;
  }

  /** The index of the current token. */
  public int tokenIndex() {
    return tokenIndex;
  }

  /** Offset in the normalized CSS at which the current token starts. */
  public int startOffset() {
    return tokenBreaks[tokenIndex];
  }

  /** Offset in the normalized CSS just past the end of the current token. */
  public int endOffset() {
    return tokenBreaks[tokenIndex + 1];
  }

  /** The text of the current token. */
  public String token() {
    return normalizedCss.substring(startOffset(), endOffset());
  }

  /** True if the cursor has not reached its limit. */
  public boolean hasToken() {
    return tokenIndex < limit;
  }

  /**
   * Advances over any white-space tokens.
   *
   * @return true if the cursor now rests on a non white-space token; false
   *     if the limit was reached.
   */
  public boolean hasTokenAfterSpace() {
    while (hasToken()) {
      if (type() != TokenType.WHITESPACE) { return true; }
      advance();
    }
    return false;
  }

  /** The type of the current token. */
  public TokenType type() {
    return tokenTypes[tokenIndex];
  }

  /** Moves the cursor to the given token index without bounds checking. */
  public void seek(int tokenIndex) {
    this.tokenIndex = tokenIndex;
  }

  /**
   * Moves to the next token.
   *
   * @throws NoSuchElementException if the cursor is at its limit.
   */
  public void advance() {
    if (!hasToken()) { throw new NoSuchElementException(); }
    ++tokenIndex;
  }

  /**
   * Moves to the previous token.
   *
   * @throws NoSuchElementException if the cursor is on the first token.
   */
  public void backup() {
    if (tokenIndex == 0) { throw new NoSuchElementException(); }
    --tokenIndex;
  }

  /** Unsupported: the token list is immutable. */
  public void remove() throws UnsupportedOperationException {
    throw new UnsupportedOperationException();
  }
}
/** Instances are produced by the lexer; use {@link #lex} to obtain one. */
private CssTokens(
    String normalizedCss, Brackets brackets, int[] tokenBreaks,
    TokenType[] tokenTypes) {
  this.tokenTypes = tokenTypes;
  this.tokenBreaks = tokenBreaks;
  this.brackets = brackets;
  this.normalizedCss = normalizedCss;
}
/** The kinds of token that the lexer assigns to spans of normalized CSS. */
public enum TokenType {
/** An identifier. */
IDENT,
/** An identifier prefixed with a period. */
DOT_IDENT,
/** A function name and opening bracket. */
FUNCTION,
/** An {@code @<identifier>} directive token. */
AT,
/** A hash token that contains non-hex characters. */
HASH_ID,
/** A hash token made up only of hex digits, so it could be a color literal. */
HASH_UNRESTRICTED,
/** A quoted string. */
STRING,
/** A URL of the form <code>url("...")</code>. */
URL,
/** A single character. */
DELIM,
/** A scalar numeric value. */
NUMBER,
/** A percentage. */
PERCENTAGE,
/** A numeric value with a unit suffix. */
DIMENSION,
/** A numeric value with an unknown unit suffix. */
BAD_DIMENSION,
/** {@code U+<hex-or-qmark>} */
UNICODE_RANGE,
/**
* include-match, dash-match, prefix-match, suffix-match, substring-match
*/
MATCH,
/** {@code ||} */
COLUMN,
/** A run of white-space, comment, CDO, and CDC tokens. */
WHITESPACE,
/** {@code :} */
COLON,
/** {@code ;} */
SEMICOLON,
/** {@code ,} */
COMMA,
/** {@code [} */
LEFT_SQUARE,
/** {@code ]} */
RIGHT_SQUARE,
/** {@code (} */
LEFT_PAREN,
/** {@code )} */
RIGHT_PAREN,
/** <code>{</code> */
LEFT_CURLY,
/** <code>}</code> */
RIGHT_CURLY,
;
}
/**
 * Maps tokens to their partners.  An open bracket token like {@code (} may
 * have a partner token like {@code )} if properly nested, and vice-versa.
 */
static final class Brackets {
  /**
   * Pairs of ints: a bracket's token index followed by the token index of
   * its partner.  Lookups binary search on the first int of each pair.
   */
  private final int[] brackets;

  private Brackets(int[] brackets) {
    this.brackets = brackets;
  }

  /** The index of the partner token or -1 if none. */
  int partner(int tokenIndex) {
    final int pairIndex = bracketIndexForToken(tokenIndex);
    return pairIndex >= 0 ? brackets[2 * pairIndex + 1] : -1;
  }

  /**
   * Finds the pair whose first element is {@code target}.
   *
   * @return the index of the pair (not of the int) or -1 if not found.
   */
  int bracketIndexForToken(int target) {
    int lo = 0;
    int hi = brackets.length / 2;
    while (lo < hi) {
      final int mid = lo + (hi - lo) / 2;
      final int candidate = brackets[2 * mid];
      if (candidate < target) {
        lo = mid + 1;
      } else if (candidate > target) {
        hi = mid;
      } else {
        return mid;
      }
    }
    return -1;
  }
}
// Shared zero-length arrays so that empty results need no allocation.
private static final int[] ZERO_INTS = new int[0];
private static final TokenType[] ZERO_TYPES = new TokenType[0];
/** A bracket map with no entries. */
private static final Brackets EMPTY_BRACKETS = new Brackets(ZERO_INTS);
/** The token list returned when the lexer finds no content. */
private static final CssTokens EMPTY = new CssTokens(
"", EMPTY_BRACKETS, ZERO_INTS, ZERO_TYPES);
/**
* Tokenizes according to section 4 of http://dev.w3.org/csswg/css-syntax/
*/
private static final class Lexer {
/** The CSS being tokenized. */
private final String css;
/** Buffer onto which the normalized CSS is written. */
private final StringBuilder sb;
/** Index into {@link #css} of the next character to consume. */
private int pos = 0;
/** The length of {@link #css}. */
private final int cssLimit;
/**
* The type of each emitted token. Stays null until {@code lex()} finds
* content to tokenize.
*/
private List<TokenType> tokenTypes = null;
/** For each emitted token, the offset in {@link #sb} at which it starts. */
private int[] tokenBreaks = new int[128];
/** The number of elements in {@link #tokenBreaks} that are valid. */
private int tokenBreaksLimit = 0;
/**
* For each bracket, 2 ints: the token index of the bracket, and the token
* index of its partner.
* The array is sorted by the first int.
* The second int is -1 when the bracket has not yet been closed.
*/
private int[] brackets = ZERO_INTS;
/**
* The number of elements in {@link #brackets} that are valid.
* {@code brackets[bracketsLimit:]} is zeroed space that the list can grow
* into.
*/
private int bracketsLimit = 0;
/**
* For each bracket that has not been closed, 2 ints:
* its index in {@link #brackets} and the character of its close bracket
* as an int.
* This is a bracket stack so the array is sorted by the first int.
*/
private int[] open = ZERO_INTS;
/**
* The number of elements in {@link #open} that are valid.
* {@code open[openLimit:]} is garbage space that the stack can grow into.
*/
private int openLimit = 0;
/** Creates a lexer positioned at the start of {@code css}. */
Lexer(String css) {
  this.css = css;
  this.cssLimit = css.length();
  this.sb = new StringBuilder();
}
/**
 * Writes an open bracket to the output, records it in the bracket list with
 * no partner yet, and pushes it onto the stack of unclosed brackets.
 *
 * @param bracketChar one of {@code (}, {@code [}, or <code>{</code>.
 * @return the token type corresponding to the bracket.
 */
TokenType openBracket(char bracketChar) {
  final char closer;
  final TokenType kind;
  if (bracketChar == '(') {
    closer = ')';
    kind = TokenType.LEFT_PAREN;
  } else if (bracketChar == '[') {
    closer = ']';
    kind = TokenType.LEFT_SQUARE;
  } else if (bracketChar == '{') {
    closer = '}';
    kind = TokenType.LEFT_CURLY;
  } else {
    throw new AssertionError("Invalid open bracket " + bracketChar);
  }
  brackets = expandIfNecessary(brackets, bracketsLimit, 2);
  open = expandIfNecessary(open, openLimit, 2);
  // Push (index into brackets, expected close character) onto the stack.
  open[openLimit] = bracketsLimit;
  open[openLimit + 1] = closer;
  openLimit += 2;
  // Record (token index, no partner yet).
  brackets[bracketsLimit] = tokenBreaksLimit;
  brackets[bracketsLimit + 1] = -1;
  bracketsLimit += 2;
  sb.append(bracketChar);
  return kind;
}
/**
 * Handles a close bracket in the input.  If an unclosed open bracket on the
 * stack expects {@code bracketChar}, closes it along with everything opened
 * after it; otherwise drops the orphaned bracket, leaving only a break in
 * the output.
 */
void closeBracket(char bracketChar) {
  // Walk the stack from the innermost open bracket outwards.
  for (int depth = openLimit - 2; depth >= 0; depth -= 2) {
    if (open[depth + 1] == bracketChar) {
      closeBrackets(depth);
      return;
    }
  }
  // Drop an orphaned close bracket.
  breakOutput();
}
/**
 * Pops the open-bracket stack down to {@code openLimitAfterClose}, writing
 * each popped bracket's close character to the output and linking each
 * open/close pair in the bracket list.
 */
private void closeBrackets(int openLimitAfterClose) {
  // Each stack entry (2 ints) produces one bracket entry (2 ints).
  final int pending = openLimit - openLimitAfterClose;
  brackets = expandIfNecessary(brackets, bracketsLimit, pending);
  for (int closeTokenIndex = tokenBreaksLimit;
       openLimit > openLimitAfterClose;
       ++closeTokenIndex) {
    final int closeChar = open[--openLimit];
    final int openSlot = open[--openLimit];
    final int openTokenIndex = brackets[openSlot];
    // Point the open bracket at its new partner.
    brackets[openSlot + 1] = closeTokenIndex;
    // Record the close bracket pointing back at the open bracket.
    brackets[bracketsLimit++] = closeTokenIndex;
    brackets[bracketsLimit++] = openTokenIndex;
    sb.appendCodePoint(closeChar);
  }
}
/**
 * Finishes lexing: closes any brackets left open, drops a trailing space,
 * and packages the accumulated output into a {@link CssTokens}.
 */
CssTokens build() {
  // Close any still open brackets and emit a token for each.
  final int closersStart = sb.length();
  closeBrackets(0);
  emitMergedTokens(closersStart, sb.length());

  if (tokenTypes == null) {
    return EMPTY;
  }

  final int[] bracketPairs = truncateOrShare(brackets, bracketsLimit);

  // A trailing space may only be there because of a breakAfter call, so
  // strip it along with its token.
  int outputEnd = sb.length();
  final boolean endsWithSpace =
      outputEnd > 0 && sb.charAt(outputEnd - 1) == ' ';
  if (endsWithSpace) {
    outputEnd -= 1;
    tokenBreaksLimit -= 1;
    tokenTypes.remove(tokenBreaksLimit);
  }
  final String normalized = sb.substring(0, outputEnd);

  // Append the end of the output as a sentinel so that the end of token i
  // is always tokenBreaks[i + 1].
  tokenBreaks = expandIfNecessary(tokenBreaks, tokenBreaksLimit, 1);
  tokenBreaks[tokenBreaksLimit] = normalized.length();
  tokenBreaksLimit += 1;

  final int[] breaks = truncateOrShare(tokenBreaks, tokenBreaksLimit);
  final TokenType[] types = tokenTypes.toArray(ZERO_TYPES);
  return new CssTokens(
      normalized, new Brackets(bracketPairs), breaks, types);
}
/**
 * Consumes the whole input, writing normalized CSS onto the output buffer
 * and recording the start offset and type of each output token.
 * <p>
 * Each iteration dispatches on the first character of the next input token,
 * delegates to a {@code consume*} helper that advances {@link #pos} and
 * appends to {@link #sb}, and then emits token boundaries for whatever was
 * appended.  Helpers may write a leading or trailing space to prevent
 * adjacent output tokens from merging; those spaces are emitted as separate
 * white-space tokens.
 */
void lex() {
  // Fast-track no content.
  consumeIgnorable();
  sb.setLength(0);
  if (pos == cssLimit) { return; }
  tokenTypes = new ArrayList<TokenType>();
  String css = this.css;
  int cssLimit = this.cssLimit;
  while (pos < cssLimit) {
    assert this.tokenBreaksLimit == this.tokenTypes.size()
        : "token and types out of sync at " + tokenBreaksLimit
        + " in `" + css + "`";
    // SPEC: 4. Tokenization
    // The output of the tokenization step is a stream of zero
    // or more of the following tokens: <ident>, <function>,
    // <at-keyword>, <hash>, <string>, <bad-string>, <url>,
    // <bad-url>, <delim>, <number>, <percentage>,
    // <dimension>, <unicode-range>, <include-match>,
    // <dash-match>, <prefix-match>, <suffix-match>,
    // <substring-match>, <column>, <whitespace>, <CDO>,
    // <CDC>, <colon>, <semicolon>, <comma>, <[>, <]>,
    // <(>, <)>, <{>, and <}>.
    // IMPLEMENTS: 4.3 Consume a token
    char ch = css.charAt(pos);
    int startOfToken = pos;
    int startOfOutputToken = sb.length();
    final TokenType type;
    switch (ch) {
      case '\t': case '\n': case '\f': case '\r': case ' ': case '\ufeff':
        consumeIgnorable();
        type = TokenType.WHITESPACE;
        break;
      case '/': {
        char lookahead = pos + 1 < cssLimit ? css.charAt(pos + 1) : 0;
        if (lookahead == '/' || lookahead == '*') {
          consumeIgnorable();
          type = TokenType.WHITESPACE;
        } else {
          consumeDelim(ch);
          type = TokenType.DELIM;
        }
        break;
      }
      case '<':
        if (consumeIgnorable()) {  // <!--
          type = TokenType.WHITESPACE;
        } else {
          consumeDelim('<');
          type = TokenType.DELIM;
        }
        break;
      case '>':
        breakOutput();
        sb.append('>');
        type = TokenType.DELIM;
        ++pos;
        break;
      case '@':
        if (consumeAtKeyword()) {
          type = TokenType.AT;
        } else {
          consumeDelim(ch);
          type = TokenType.DELIM;
        }
        break;
      case '#': {
        sb.append('#');
        TokenType hashType = consumeHash();
        if (hashType != null) {
          type = hashType;
        } else {
          // A lone '#'.  Follow it with a space so it cannot merge with
          // whatever comes next.
          ++pos;
          sb.append(' ');
          type = TokenType.DELIM;
        }
        break;
      }
      case '"':
      case '\'':
        type = consumeString();
        break;
      case 'U': case 'u':
        // SPEC handle URL under "ident like token".
        if (consumeUnicodeRange()) {
          type = TokenType.UNICODE_RANGE;
        } else {
          type = consumeIdentOrUrlOrFunction();
        }
        break;
      case '0': case '1': case '2': case '3': case '4':
      case '5': case '6': case '7': case '8': case '9':
        type = consumeNumberOrPercentageOrDimension();
        break;
      case '+': case '-': case '.': {
        char lookahead = pos + 1 < cssLimit ? css.charAt(pos + 1) : 0;
        if (isDecimal(lookahead)
            // A sign can be followed by a fraction as in "+.5" or "-.5".
            // A dot followed by another dot never starts a number: in
            // "..5" the first dot is a delimiter and ".5" is the number.
            || (ch != '.' && lookahead == '.' && pos + 2 < cssLimit
                && isDecimal(css.charAt(pos + 2)))) {
          type = consumeNumberOrPercentageOrDimension();
        } else if (ch == '+') {
          consumeDelim(ch);
          type = TokenType.DELIM;
        } else if (ch == '-') {
          if (consumeIgnorable()) {  // -->
            type = TokenType.WHITESPACE;
          } else {
            type = consumeIdentOrUrlOrFunction();
          }
        } else if (isIdentPart(lookahead)) {
          // treat ".<IDENT>" as one token.
          sb.append('.');
          ++pos;
          consumeIdent(false);
          if (pos != startOfToken + 1) {
            type = TokenType.DOT_IDENT;
            if (pos < cssLimit) {
              char next = css.charAt(pos);
              if ('(' == next) {
                // A dotted identifier followed by a parenthesis is
                // ambiguously a function.
                sb.append(' ');
              }
            }
          } else {
            // No identifier was consumed after the dot.
            type = TokenType.DELIM;
            sb.append(' ');
          }
        } else {
          consumeDelim('.');
          type = TokenType.DELIM;
        }
        break;
      }
      case ':': consumeDelim(ch); type = TokenType.COLON; break;
      case ';': consumeDelim(ch); type = TokenType.SEMICOLON; break;
      case ',': consumeDelim(ch); type = TokenType.COMMA; break;
      case '[': case '(': case '{':
        type = openBracket(ch);
        ++pos;
        break;
      case '}': case ')': case ']':
        closeBracket(ch);
        ++pos;
        // Use DELIM so that a later loop will split output into multiple
        // tokens since we may have inserted missing close brackets for
        // unclosed open brackets already on the stack.
        type = TokenType.DELIM;
        break;
      case '~': case '|': case '^': case '$': case '*': {
        char lookahead = pos + 1 < cssLimit ? css.charAt(pos + 1) : 0;
        if (lookahead == '=') {
          consumeMatch(ch);
          type = TokenType.MATCH;
        } else if (ch == '|' && lookahead == '|') {
          consumeColumn();
          type = TokenType.COLUMN;
        } else {
          consumeDelim(ch);
          type = TokenType.DELIM;
        }
        break;
      }
      case '_':
        type = consumeIdentOrUrlOrFunction();
        break;
      case '\\': {
        // Optimistically parse as an ident.
        TokenType identType = consumeIdentOrUrlOrFunction();
        if (identType == null) {
          ++pos;  // drop
          breakOutput();
          type = TokenType.WHITESPACE;
        } else {
          type = identType;
        }
        // TODO: handle case where "url" is encoded.
        break;
      }
      default:
        int chlower = ch | 32;
        if ('a' <= chlower && chlower <= 'z' || ch >= 0x80) {
          TokenType identType = consumeIdentOrUrlOrFunction();
          if (identType != null) {
            type = identType;
          } else {  // Occurs on undefined-codepoints.
            ++pos;
            breakOutput();
            type = TokenType.WHITESPACE;
          }
        } else if (ch > 0x20) {
          consumeDelim(ch);
          type = TokenType.DELIM;
        } else {  // Ignore.
          consumeIgnorable();
          type = TokenType.WHITESPACE;
        }
    }
    assert pos > startOfToken
        : "empty token at " + pos + ", ch0=" + css.charAt(startOfToken)
        + ":U+" + Integer.toHexString(css.charAt(startOfToken));
    int endOfOutputToken = sb.length();
    if (endOfOutputToken > startOfOutputToken) {
      if (type == TokenType.DELIM) {
        // Every output character is its own token.
        emitMergedTokens(startOfOutputToken, endOfOutputToken);
      } else {
        // Token emitters can emit a space before a token to avoid a merge
        // with the preceding token.
        if (type != TokenType.WHITESPACE
            && sb.charAt(startOfOutputToken) == ' ') {
          emitToken(TokenType.WHITESPACE, startOfOutputToken);
          ++startOfOutputToken;
          assert startOfOutputToken != endOfOutputToken;
        }
        emitToken(type, startOfOutputToken);
        // Token emitters can emit a space after a token to avoid possible
        // merges with following tokens
        if (type != TokenType.WHITESPACE) {
          int sbLen = sb.length();
          if (startOfOutputToken + 1 < sbLen
              && sb.charAt(sbLen - 1) == ' ') {
            emitToken(TokenType.WHITESPACE, sbLen - 1);
          }
        }
      }
    }
  }
}
/**
 * Emits one token per output character in {@code [start, end)}, typing each
 * as white-space, a specific close bracket, or a generic delimiter.
 */
private void emitMergedTokens(int start, int end) {
  int offset = start;
  while (offset < end) {
    final char c = sb.charAt(offset);
    final TokenType kind;
    if (c == ' ') {
      kind = TokenType.WHITESPACE;
    } else if (c == '}') {
      kind = TokenType.RIGHT_CURLY;
    } else if (c == ')') {
      kind = TokenType.RIGHT_PAREN;
    } else if (c == ']') {
      kind = TokenType.RIGHT_SQUARE;
    } else {
      kind = TokenType.DELIM;
    }
    emitToken(kind, offset);
    offset++;
  }
}
/**
 * Records a token of the given type starting at the given output offset,
 * unless the most recently recorded token already starts there.
 */
private void emitToken(TokenType type, int startOfOutputToken) {
  final boolean alreadyRecorded = tokenBreaksLimit != 0
      && tokenBreaks[tokenBreaksLimit - 1] == startOfOutputToken;
  if (alreadyRecorded) {
    return;
  }
  tokenBreaks = expandIfNecessary(tokenBreaks, tokenBreaksLimit, 1);
  tokenBreaks[tokenBreaksLimit] = startOfOutputToken;
  tokenBreaksLimit++;
  tokenTypes.add(type);
}
/**
 * Copies a single delimiter character to the output and advances past it.
 * Delimiters that could merge with a following token get a trailing space.
 */
private void consumeDelim(char ch) {
  sb.append(ch);
  // Prevent token merging.
  final boolean needsSpaceAfter = "~|^$\\.+-@/<".indexOf(ch) >= 0;
  if (needsSpaceAfter) {
    sb.append(' ');
  }
  ++pos;
}
/**
* Advances {@code pos} over a run of ignorable input: characters at or below
* U+20, byte order marks, block comments, line comments, and the
* {@code <!--} and {@code -->} sequences.
*
* @return true if any input was consumed, in which case a break is
*     also placed on the output; false if {@code pos} did not move.
*/
private boolean consumeIgnorable() {
String css = this.css;
int cssLimit = this.cssLimit;
int posBefore = pos;
while (pos < cssLimit) {
char ch = css.charAt(pos);
if (ch <= 0x20
// Treat a BOM as white-space so that it is ignored at the beginning
// of a file.
|| ch == '\ufeff') {
++pos;
} else if (pos + 1 == cssLimit) {
// Every remaining ignorable construct is at least two characters long,
// so a lone final character cannot start one.
break;
} else if (ch == '/') {
char next = css.charAt(pos + 1);
if (next == '*') {
pos += 2;
// Scan for the closing "*/".
while (pos < cssLimit) {
int ast = css.indexOf('*', pos);
if (ast < 0) {
pos = cssLimit; // Unclosed /* comment */
break;
} else {
// Advance over a run of '*'s.
pos = ast + 1;
while (pos < cssLimit && css.charAt(pos) == '*') {
++pos;
}
if (pos < cssLimit && css.charAt(pos) == '/') {
++pos;
break;
}
}
}
} else if (next == '/') { // Non-standard but widely supported
// Skip to the line terminator, which is left for the next iteration.
while (++pos < cssLimit) {
if (isLineTerminator(css.charAt(pos))) { break; }
}
} else {
break;
}
} else if (ch == '<') {
// Only "<!--" is ignorable.
if (pos + 3 < cssLimit
&& '!' == css.charAt(pos + 1)
&& '-' == css.charAt(pos + 2)
&& '-' == css.charAt(pos + 3)) {
pos += 4;
} else {
break;
}
} else if (ch == '-') {
// Only "-->" is ignorable.
if (pos + 2 < cssLimit
&& '-' == css.charAt(pos + 1)
&& '>' == css.charAt(pos + 2)) {
pos += 3;
} else {
break;
}
} else {
break;
}
}
if (pos == posBefore) {
return false;
} else {
breakOutput();
return true;
}
}
/**
 * Ensures that non-empty output ends with a space so that the next token
 * written cannot merge with the previous one.
 */
private void breakOutput() {
  final int length = sb.length();
  if (length == 0) {
    return;
  }
  if (sb.charAt(length - 1) == ' ') {
    return;
  }
  sb.append(' ');
}
/** Copies a {@code ||} column token to the output and advances past it. */
private void consumeColumn() {
  sb.append("||");
  pos += 2;
}
/**
 * Writes the two character match operator that starts with {@code ch} and
 * ends with {@code =} to the output and advances past it.
 */
private void consumeMatch(char ch) {
  sb.append(ch);
  sb.append('=');
  pos += 2;
}
/**
 * Consumes identifier characters, decoding escape sequences, and appends
 * the decoded code-points to the output.  Stops before the first code-point
 * that is not an identifier part.  If nothing is consumed, {@code pos} is
 * left where it started.
 *
 * @param allowFirstDigit when false, an identifier whose first code-point
 *     is a decimal digit, or whose second is a digit following a leading
 *     {@code -}, is rejected entirely: both {@code pos} and the output are
 *     restored.
 */
private void consumeIdent(boolean allowFirstDigit) {
  final int limit = this.cssLimit;
  final int outputMark = sb.length();
  final int inputMark = pos;
  int previous = -1;
  int count = 0;
  while (pos < limit) {
    final int rewindTo = pos;
    int codepoint = readCodepoint();
    if (codepoint == '\\') {
      codepoint = consumeAndDecodeEscapeSequence();
    } else {
      ++pos;
    }
    if (codepoint < 0 || !isIdentPart(codepoint)) {
      // Not part of the identifier; leave it for the caller.
      pos = rewindTo;
      return;
    }
    final boolean isDigit = '0' <= codepoint && codepoint <= '9';
    if (isDigit && !allowFirstDigit && count < 2
        && (previous == '-' || previous == -1)) {
      // Don't allow encoded identifiers that look like numeric tokens
      // like \-1 or ones that start with an encoded decimal digit.
      pos = inputMark;
      sb.setLength(outputMark);
      return;
    }
    sb.appendCodePoint(codepoint);
    previous = codepoint;
    ++count;
  }
}
/**
 * Tries to consume an {@code @} followed by an identifier.
 *
 * @return true if an at-keyword was written to the output; false if no
 *     identifier follows the {@code @}, in which case neither {@code pos}
 *     nor the output is changed.
 */
private boolean consumeAtKeyword() {
  assert css.charAt(pos) == '@';
  final int outputMark = sb.length();
  sb.append('@');
  ++pos;
  final int identStart = pos;
  consumeIdent(false);
  if (pos != identStart) {
    return true;
  }
  // Nothing followed the '@': back up over it and unwrite it.
  pos = identStart - 1;
  sb.setLength(outputMark);
  return false;
}
/**
* Decodes the escape sequence that starts with the backslash at {@code pos}
* and advances {@code pos} past it.
*
* @return the decoded value, or -1 if the backslash is the last character
*     of the input or is followed by a line terminator, in which case
*     {@code pos} is not moved.
*/
private int consumeAndDecodeEscapeSequence() {
String css = this.css;
int cssLimit = this.cssLimit;
assert css.charAt(pos) == '\\';
if (pos + 1 >= cssLimit) { return -1; }
char esc = css.charAt(pos + 1);
if (isLineTerminator(esc)) { return -1; }
int escLower = esc | 32;
if (('0' <= esc && esc <= '9')
|| ('a' <= escLower && escLower <= 'f')) {
// A hex escape: a backslash followed by between 1 and 6 hex digits.
int hexValue = 0;
int hexStart = pos + 1;
int hexLimit = Math.min(pos + 7, cssLimit);
int hexEnd = hexStart;
do {
hexValue = (hexValue << 4)
| (esc <= '9' ? esc - '0' : escLower - ('a' - 10));
++hexEnd;
if (hexEnd == hexLimit) { break; }
esc = css.charAt(hexEnd);
escLower = esc | 32;
} while (('0' <= esc && esc <= '9')
|| ('a' <= escLower && escLower <= 'f'));
// Substitute the replacement character for undefined code-points.
if (!Character.isDefined(hexValue)) {
hexValue = 0xfffd;
}
pos = hexEnd;
if (pos < cssLimit) {
// A sequence of hex digits can be followed by a single space character
// that is consumed as part of the escape, so that code-point U+A
// followed by the letter 'b' can be rendered as "\a b" since "\ab"
// specifies the single code-point U+AB.
char next = css.charAt(pos);
if (next == ' ' || next == '\t' || isLineTerminator(next)) {
++pos;
}
}
return hexValue;
}
// Any other escaped character stands for itself.
pos += 2;
return esc;
}
/**
* A bit set over code-points: bit {@code i} is set when code-point {@code i}
* is among those listed here: NUL, the line terminators, both quotes,
* ampersand, and the angle brackets.
*/
private static final long HEX_ENCODED_BITMASK =
(1L << 0) | LINE_TERMINATOR_BITMASK
| (1L << '"') | (1L << '\'') | (1L << '&') | (1L << '<') | (1L << '>');
/** True if the given code-point's bit is set in HEX_ENCODED_BITMASK. */
private static boolean isHexEncoded(int codepoint) {
  if (codepoint < 0 || codepoint >= 63) {
    return false;
  }
  return ((1L << codepoint) & HEX_ENCODED_BITMASK) != 0;
}
/**
 * Appends a code-point from the body of a string to the output, escaping
 * it if necessary.
 *
 * @param codepoint the code-point to write.
 * @param last the code-point written before this one, or -1 if none.  If
 *     it was written as a hex escape, a space is inserted before a
 *     following hex digit, space, or tab.
 */
private void encodeCharOntoOutput(int codepoint, int last) {
  // The set of escapes below that end with a hex digit must appear in
  // HEX_ENCODED_BITMASK.
  final String fixedEncoding;
  switch (codepoint) {
    case '\\': fixedEncoding = "\\\\"; break;
    case '\0': fixedEncoding = "\\0";  break;
    case '\n': fixedEncoding = "\\a";  break;
    case '\f': fixedEncoding = "\\c";  break;
    case '\r': fixedEncoding = "\\d";  break;
    case '\"': fixedEncoding = "\\22"; break;
    case '&':  fixedEncoding = "\\26"; break;
    case '\'': fixedEncoding = "\\27"; break;
    case '<':  fixedEncoding = "\\3c"; break;
    case '>':  fixedEncoding = "\\3e"; break;
    case '-':  fixedEncoding = "-";    break;
    default:   fixedEncoding = null;   break;
  }
  if (fixedEncoding != null) {
    sb.append(fixedEncoding);
    return;
  }
  // We need to put a space after a trailing hex digit if the next encoded
  // character on the output would be another hex digit or a space
  // character.  The other space characters are handled above.
  final int lower = codepoint | 32;
  final boolean wouldExtendEscape = codepoint == ' ' || codepoint == '\t'
      || ('0' <= codepoint && codepoint <= '9')
      || ('a' <= lower && lower <= 'f');
  if (isHexEncoded(last) && wouldExtendEscape) {
    sb.append(' ');
  }
  sb.appendCodePoint(codepoint);
}
/**
* Consumes a numeric token starting at {@code pos} and writes a normalized
* form of it to the output: a positive sign is dropped, leading zeroes of
* the integer and exponent and trailing zeroes of the fraction are
* stripped, a zero value is written as {@code 0}, and the letters A-Z in
* any unit are lower-cased.
*
* @return {@code PERCENTAGE}, {@code NUMBER}, {@code DIMENSION}, or
*     {@code BAD_DIMENSION} depending on what follows the number.
*/
private TokenType consumeNumberOrPercentageOrDimension() {
String css = this.css;
int cssLimit = this.cssLimit;
// True while every digit seen in the integer and fraction is '0'.
boolean isZero = true;
int intStart = pos;
if (intStart < cssLimit) {
char ch = css.charAt(intStart);
if (ch == '-' || ch == '+') {
++intStart;
}
}
// Find the integer part after any sign.
int intEnd = intStart;
for (; intEnd < cssLimit; ++intEnd) {
char ch = css.charAt(intEnd);
if (!('0' <= ch && ch <= '9')) { break; }
if (ch != '0') { isZero = false; }
}
// Find a fraction like ".5" or ".".
int fractionStart = intEnd;
int fractionEnd = fractionStart;
if (fractionEnd < cssLimit && '.' == css.charAt(fractionEnd)) {
++fractionEnd;
for (; fractionEnd < cssLimit; ++fractionEnd) {
char ch = css.charAt(fractionEnd);
if (!('0' <= ch && ch <= '9')) { break; }
if (ch != '0') { isZero = false; }
}
}
int exponentStart = fractionEnd;
int exponentIntStart = exponentStart;
int exponentEnd = exponentStart;
boolean isExponentZero = true;
if (exponentStart < cssLimit && 'e' == (css.charAt(exponentStart) | 32)) {
// Optimistically parse an exponent: 'e' or 'E', an optional sign, and
// then decimal digits.
exponentEnd = exponentStart + 1;
if (exponentEnd < cssLimit) {
char ch = css.charAt(exponentEnd);
if (ch == '+' || ch == '-') { ++exponentEnd; }
}
exponentIntStart = exponentEnd;
for (; exponentEnd < cssLimit; ++exponentEnd) {
char ch = css.charAt(exponentEnd);
if (!('0' <= ch && ch <= '9')) { break; }
if (ch != '0') { isExponentZero = false; }
}
// Since
// dimension := <number> <ident>
// the below are technically valid dimensions even though they appear
// to have incomplete exponents:
// 5e
// 5ex
// 5e-
if (exponentEnd == exponentIntStart) { // Incomplete exponent.
exponentIntStart = exponentEnd = exponentStart;
isExponentZero = true;
}
}
int unitStart = exponentEnd;
// Skip over space between number and unit.
// Many user-agents allow "5 ex" instead of "5ex".
while (unitStart < cssLimit) {
char ch = css.charAt(unitStart);
if (ch == ' ' || isLineTerminator(ch)) {
++unitStart;
} else {
break;
}
}
// Keep the number from merging with an identifier character that is
// already at the end of the output.
if (sb.length() != 0 && isIdentPart(sb.charAt(sb.length() - 1))) {
sb.append(' ');
}
// Normalize the number onto the buffer.
// We will normalize the unit later.
// Skip the sign if it is positive.
if (intStart != pos && '-' == css.charAt(pos) && !isZero) {
sb.append('-');
}
if (isZero) {
sb.append('0');
} else {
// Strip leading zeroes from the integer and exponent and trailing
// zeroes from the fraction.
while (intStart < intEnd && css.charAt(intStart) == '0') { ++intStart; }
while (fractionEnd > fractionStart
&& css.charAt(fractionEnd - 1) == '0') {
--fractionEnd;
}
if (intStart == intEnd) {
sb.append('0'); // .5 -> 0.5
} else {
sb.append(css, intStart, intEnd);
}
if (fractionEnd > fractionStart + 1) { // 5. -> 5; 5.0 -> 5
sb.append(css, fractionStart, fractionEnd);
}
if (!isExponentZero) {
sb.append('e');
// 1e+1 -> 1e1
if ('-' == css.charAt(exponentIntStart - 1)) { sb.append('-'); }
while (exponentIntStart < exponentEnd
&& css.charAt(exponentIntStart) == '0') {
++exponentIntStart;
}
sb.append(css, exponentIntStart, exponentEnd);
}
}
int unitEnd;
TokenType type;
if (unitStart < cssLimit && '%' == css.charAt(unitStart)) {
unitEnd = unitStart + 1;
type = TokenType.PERCENTAGE;
sb.append('%');
} else {
// The grammar says that any identifier following a number is a unit.
int bufferBeforeUnit = sb.length();
pos = unitStart;
consumeIdent(false);
int bufferAfterUnit = sb.length();
boolean knownUnit = isWellKnownUnit(
sb, bufferBeforeUnit, bufferAfterUnit);
if (unitStart == exponentEnd // No intervening space
|| knownUnit) {
unitEnd = pos;
// 3IN -> 3in
for (int i = bufferBeforeUnit; i < bufferAfterUnit; ++i) {
char ch = sb.charAt(i);
if ('A' <= ch && ch <= 'Z') { sb.setCharAt(i, (char) (ch | 32)); }
}
} else {
// An unknown identifier separated from the number by space is not a
// unit. Unwrite it and leave it, and the space, for the next token.
unitEnd = unitStart = exponentEnd;
sb.setLength(bufferBeforeUnit);
}
type = unitStart == unitEnd
? TokenType.NUMBER
: knownUnit
? TokenType.DIMENSION
: TokenType.BAD_DIMENSION;
}
pos = unitEnd;
// Keep a following '.' from merging with the number or its unit.
if (type != TokenType.PERCENTAGE
&& pos < cssLimit && css.charAt(pos) == '.') {
sb.append(' ');
}
return type;
}
/**
 * Consumes a quoted string starting at {@code pos} and writes it to the
 * output as a single-quoted string with its content re-encoded.
 *
 * @return {@code STRING} if the string was properly closed.  Otherwise the
 *     partial output is discarded, a break is placed on the output, and
 *     {@code WHITESPACE} is returned.
 */
private TokenType consumeString() {
  final String input = this.css;
  final int limit = this.cssLimit;
  final char quote = input.charAt(pos);
  assert quote == '"' || quote == '\'';
  ++pos;
  final int outputMark = sb.length();
  sb.append('\'');
  int previous = -1;
  while (pos < limit) {
    final char ch = input.charAt(pos);
    if (ch == quote) {
      // Properly closed.
      ++pos;
      sb.append('\'');
      return TokenType.STRING;
    }
    if (isLineTerminator(ch)) {
      break;
    }
    final int decoded;
    if (ch != '\\') {
      decoded = ch;
      ++pos;
    } else if (pos + 1 < limit && isLineTerminator(input.charAt(pos + 1))) {
      // An escaped line terminator is consumed but generates no output.
      // Lookahead to treat a \r\n sequence as one line-terminator.
      final boolean isCrLf = pos + 2 < limit
          && input.charAt(pos + 1) == '\r'
          && input.charAt(pos + 2) == '\n';
      pos += isCrLf ? 3 : 2;
      continue;
    } else {
      decoded = consumeAndDecodeEscapeSequence();
      if (decoded < 0) {
        break;
      }
    }
    encodeCharOntoOutput(decoded, previous);
    previous = decoded;
  }
  // Drop <bad-string>s
  sb.setLength(outputMark);
  breakOutput();
  return TokenType.WHITESPACE;
}
/**
 * Consumes the name that follows a {@code #}, writing it to the output.
 *
 * @return {@code HASH_UNRESTRICTED} if every consumed input character is a
 *     hex digit, {@code HASH_ID} if not, or null if no name follows the
 *     {@code #}, in which case {@code pos} is left on the {@code #}.
 */
private @Nullable TokenType consumeHash() {
  assert css.charAt(pos) == '#';
  final int hashPos = pos;
  final int nameStart = hashPos + 1;
  pos = nameStart;
  consumeIdent(true);
  if (pos == nameStart) {
    pos = hashPos;
    return null;
  }
  for (int i = nameStart; i < pos; ++i) {
    final char lower = (char) (css.charAt(i) | 32);
    final boolean isDigit = '0' <= lower && lower <= '9';
    final boolean isHexLetter = 'a' <= lower && lower <= 'f';
    if (!isDigit && !isHexLetter) {
      return TokenType.HASH_ID;
    }
  }
  return TokenType.HASH_UNRESTRICTED;
}
/**
 * Tries to consume a unicode range like {@code U+ff}, {@code U+4??}, or
 * {@code U+0-7f} starting at {@code pos}, writing it to the output with an
 * upper-case {@code U} and lower-case hex digits.
 *
 * @return true if a range was consumed.  On false, neither {@code pos} nor
 *     the output is changed.
 */
private boolean consumeUnicodeRange() {
  final String css = this.css;
  final int cssLimit = this.cssLimit;
  assert pos < cssLimit && (css.charAt(pos) | 32) == 'u';
  final int start = pos;
  final int startOfOutput = sb.length();
  ++pos;
  boolean ok = false;
  parse:
  try {
    if (pos == cssLimit || css.charAt(pos) != '+') {
      break parse;
    }
    ++pos;
    sb.append("U+");
    // Up to 6 hex digits ...
    int numStartDigits = 0;
    while (pos < cssLimit && numStartDigits < 6) {
      char chLower = (char) (css.charAt(pos) | 32);
      if (('0' <= chLower && chLower <= '9')
          || ('a' <= chLower && chLower <= 'f')) {
        sb.append(chLower);
        ++numStartDigits;
        ++pos;
      } else {
        break;
      }
    }
    if (numStartDigits == 0) {
      break parse;
    }
    // ... padded out to at most 6 characters with '?' wildcards.
    boolean hasQmark = false;
    while (pos < cssLimit && numStartDigits < 6 && css.charAt(pos) == '?') {
      hasQmark = true;
      sb.append('?');
      ++numStartDigits;
      ++pos;
    }
    if (pos < cssLimit && css.charAt(pos) == '-') {
      if (!hasQmark) {
        // Look for end of range.
        ++pos;
        sb.append('-');
        int numEndDigits = 0;
        while (pos < cssLimit && numEndDigits < 6) {
          char chLower = (char) (css.charAt(pos) | 32);
          if (('0' <= chLower && chLower <= '9')
              || ('a' <= chLower && chLower <= 'f')) {
            ++numEndDigits;
            ++pos;
            sb.append(chLower);
          } else {
            break;
          }
        }
        if (numEndDigits == 0) {
          // The '-' does not start the end of a range.  Back up over it
          // so that it is lexed as part of the next token, and replace the
          // '-' already written to the output with a space so that the
          // dash appears only once and cannot merge with the range.
          --pos;
          sb.setCharAt(sb.length() - 1, ' ');
        }
      } else {
        // A range with wildcards cannot have an end.  Separate it from
        // the '-' that follows.
        sb.append(' ');
      }
    }
    ok = true;
  } finally {
    if (!ok) {
      pos = start;
      sb.setLength(startOfOutput);
    }
  }
  return ok;
}
/**
 * Consumes an identifier and, depending on what it spells and what follows
 * it, a URL value or the open parenthesis of a function call.
 *
 * @return null if no identifier starts at {@code pos}; {@code URL} for a
 *     well-formed {@code url(...)}; {@code WHITESPACE} if the identifier
 *     spells "url" but is not followed by a well-formed URL value, in which
 *     case its output is dropped; {@code FUNCTION} if an open parenthesis
 *     follows any other identifier; {@code IDENT} otherwise.
 */
private @Nullable TokenType consumeIdentOrUrlOrFunction() {
  final int outputMark = sb.length();
  final int inputMark = pos;
  consumeIdent(false);
  if (pos == inputMark) {
    return null;
  }
  final boolean followedByParen = pos < cssLimit && css.charAt(pos) == '(';
  final boolean spellsUrl = sb.length() - outputMark == 3
      && (sb.charAt(outputMark) | 32) == 'u'
      && (sb.charAt(outputMark + 1) | 32) == 'r'
      && (sb.charAt(outputMark + 2) | 32) == 'l';

  if (spellsUrl) {
    if (!followedByParen || !consumeUrlValue()) {
      sb.setLength(outputMark);
      breakOutput();
      return TokenType.WHITESPACE;
    }
    // Normalize the case of the keyword.
    sb.setCharAt(outputMark, 'u');
    sb.setCharAt(outputMark + 1, 'r');
    sb.setCharAt(outputMark + 2, 'l');
    return TokenType.URL;
  }

  if (followedByParen) {
    openBracket('(');
    ++pos;
    return TokenType.FUNCTION;
  }

  if (pos + 1 < cssLimit && css.charAt(pos) == '.') {
    // Prevent merging of ident and number as in
    //     border:solid.1cm black
    // when .1 is rewritten to 0.1 becoming
    //     border:solid0.1cm black
    final char afterDot = css.charAt(pos + 1);
    if ('0' <= afterDot && afterDot <= '9') {
      sb.append(' ');
    }
  }
  return TokenType.IDENT;
}
/**
 * Consumes the parenthesized argument of a {@code url(...)} token, starting
 * at the '(' at {@code pos}, and appends a normalized form
 * {@code ('...')} to the output buffer.
 *
 * <p>The value may be quoted with {@code "} or {@code '}, or unquoted.  An
 * unquoted value ends at the first code unit that is {@code <= ' '} or is
 * ')'.  CSS escape sequences are decoded, and every decoded code-point that
 * is not flagged in {@code URL_SAFE} is emitted as the %-encoded octets of
 * its UTF-8 form.
 *
 * <p>A value that is not followed by ')' (a broken url) is still closed in
 * the output and reported as success.
 *
 * @return true if a value was consumed and emitted.  False if {@code pos} is
 *     not at '(', if only padding follows the '(', or if an escape sequence
 *     could not be decoded; in the latter two cases {@code pos} has already
 *     advanced, and in the last case partial output has been appended, so
 *     the caller is expected to discard the output it does not want.
 */
private boolean consumeUrlValue() {
  String css = this.css;
  int cssLimit = this.cssLimit;
  if (pos == cssLimit || css.charAt(pos) != '(') { return false; }
  ++pos;
  skipUrlPadding();
  if (pos >= cssLimit) { return false; }

  // Find the value.  delim is the closing quote, or 0 for an unquoted value.
  int delim = '\0';
  char first = css.charAt(pos);
  if (first == '"' || first == '\'') {
    delim = first;
    ++pos;
  }

  sb.append("('");
  while (pos < cssLimit) {
    // Leaves pos on the last code unit of the code-point just read.
    int decoded = readCodepoint();
    if (delim != 0) {
      if (decoded == delim) {
        ++pos;
        break;
      }
    } else if (decoded <= ' ' || decoded == ')') {
      break;
    }
    if (decoded == '\\') {
      decoded = consumeAndDecodeEscapeSequence();
      if (decoded < 0) {
        return false;
      }
    } else {
      ++pos;
    }
    // Any character not in the RFC 3986 safe set is %-encoded.
    if (decoded < URL_SAFE.length && URL_SAFE[decoded]) {
      sb.appendCodePoint(decoded);
    } else {
      appendUrlEscapedUtf8(decoded);
    }
  }

  skipUrlPadding();
  // A missing ')' means a broken url; the output is closed regardless.
  if (pos < cssLimit && css.charAt(pos) == ')') {
    ++pos;
  }
  sb.append("')");
  return true;
}

/**
 * Advances pos past any run of spaces and line terminators.
 * NOTE(review): tab is not skipped here, so a tab directly after '(' ends an
 * unquoted value immediately -- confirm whether that is intended.
 */
private void skipUrlPadding() {
  for (; pos < cssLimit; ++pos) {
    char ch = css.charAt(pos);
    if (ch != ' ' && !isLineTerminator(ch)) { break; }
  }
}

/** Appends the UTF-8 encoding of codepoint with every octet %-encoded. */
private void appendUrlEscapedUtf8(int codepoint) {
  if (codepoint < 0x80) {
    appendUrlEscapedOctet(codepoint);
  } else if (codepoint < 0x800) {
    appendUrlEscapedOctet(0xc0 | ((codepoint >>> 6) & 0x1f));
    appendUrlEscapedOctet(0x80 | (codepoint & 0x3f));
  } else if (codepoint < 0x10000) {
    appendUrlEscapedOctet(0xe0 | ((codepoint >>> 12) & 0xf));
    appendUrlEscapedOctet(0x80 | ((codepoint >>> 6) & 0x3f));
    appendUrlEscapedOctet(0x80 | (codepoint & 0x3f));
  } else {
    appendUrlEscapedOctet(0xf0 | ((codepoint >>> 18) & 0x7));
    appendUrlEscapedOctet(0x80 | ((codepoint >>> 12) & 0x3f));
    appendUrlEscapedOctet(0x80 | ((codepoint >>> 6) & 0x3f));
    appendUrlEscapedOctet(0x80 | (codepoint & 0x3f));
  }
}

/** Appends '%' followed by the two lower-case hex digits of octet. */
private void appendUrlEscapedOctet(int octet) {
  sb.append('%')
      .append(HEX_DIGITS[(octet >>> 4) & 0xf])
      .append(HEX_DIGITS[octet & 0xf]);
}
/**
 * Decodes the code-point that starts at pos.
 *
 * <p>When pos is at a high surrogate immediately followed by a low
 * surrogate, the pair is combined into a supplementary code-point and pos is
 * advanced by one so that it rests on the low surrogate.  In every other case
 * the single code unit at pos is returned and pos is left unchanged.  Either
 * way pos ends on the last code unit of what was read.
 */
private int readCodepoint() {
  final char lead = this.css.charAt(pos);
  if (!Character.isHighSurrogate(lead) || pos + 1 >= cssLimit) {
    return lead;
  }
  final char trail = this.css.charAt(pos + 1);
  if (!Character.isLowSurrogate(trail)) {
    // Orphaned high surrogate: hand it back as-is.
    return lead;
  }
  ++pos;
  return Character.toCodePoint(lead, trail);
}
}
/**
 * True if cp may appear inside an identifier: for code-points below 0x80
 * this is a lookup in IDENT_PART_ASCII; above that, any defined code-point
 * other than U+FEFF qualifies.
 */
private static final boolean isIdentPart(int cp) {
  if (cp < 0x80) {
    return IDENT_PART_ASCII[cp];
  }
  return cp != '\ufeff' && Character.isDefined(cp);
}
/** True if ch is one of the ASCII digits '0' through '9'. */
private static final boolean isDecimal(char ch) {
  int digitValue = ch - '0';
  return digitValue >= 0 && digitValue <= 9;
}
/**
 * Indexed by ASCII code unit: true for letters, digits, '_' and '-'.
 */
private static final boolean[] IDENT_PART_ASCII = new boolean[128];
static {
  for (int c = 0; c < IDENT_PART_ASCII.length; ++c) {
    boolean digit = '0' <= c && c <= '9';
    boolean upper = 'A' <= c && c <= 'Z';
    boolean lower = 'a' <= c && c <= 'z';
    IDENT_PART_ASCII[c] = digit || upper || lower || c == '_' || c == '-';
  }
}
/** Bit n is set when the code unit with value n is '\n', '\r' or '\f'. */
private static final int LINE_TERMINATOR_BITMASK =
    (1 << '\f') | (1 << '\r') | (1 << '\n');

/** True if ch is a line feed, carriage return, or form feed. */
private static boolean isLineTerminator(char ch) {
  if (ch >= 0x20) {
    // Outside the range the 32-bit mask can describe.
    return false;
  }
  return ((LINE_TERMINATOR_BITMASK >>> ch) & 1) != 0;
}
/**
 * Ensures that arr has room for needed more elements after the first limit.
 *
 * @param arr the current backing array.
 * @param limit the number of leading elements of arr that are in use.
 * @param needed the number of additional slots required.
 * @return arr itself when it is already long enough; otherwise a new array
 *     holding the first limit elements of arr, whose length is the largest of
 *     16, limit + needed, and twice the old length.
 */
private static int[] expandIfNecessary(int[] arr, int limit, int needed) {
  final int required = limit + needed;
  final int current = arr.length;
  if (required <= current) {
    return arr;
  }
  int capacity = Math.max(required, current * 2);
  if (capacity < 16) {
    capacity = 16;
  }
  final int[] grown = new int[capacity];
  System.arraycopy(arr, 0, grown, 0, limit);
  return grown;
}
/**
 * Returns an array of exactly limit elements holding the first limit
 * elements of arr, avoiding a copy where possible: the shared ZERO_INTS
 * array when limit is zero, and arr itself when it is already the right
 * length.
 */
private static int[] truncateOrShare(int[] arr, int limit) {
  if (limit == 0) {
    return ZERO_INTS;
  }
  if (limit != arr.length) {
    final int[] exact = new int[limit];
    System.arraycopy(arr, 0, exact, 0, limit);
    return exact;
  }
  return arr;
}
// Unit categories.  These are the values that UNIT_TRIE associates with
// each unit suffix.
/** Category of length units such as "em", "px" and "cm". */
private static final int LENGTH_UNIT_TYPE = 0;
/** Category of angle units such as "deg" and "rad". */
private static final int ANGLE_UNIT_TYPE = 1;
/** Category of time units: "s" and "ms". */
private static final int TIME_UNIT_TYPE = 2;
/** Category of frequency units: "hz" and "khz". */
private static final int FREQUENCY_UNIT_TYPE = 3;
/** Category of resolution units such as "dpi". */
private static final int RESOLUTION_UNIT_TYPE = 4;
/**
 * Maps each well-known unit suffix, spelled in lower-case, to one of the
 * {@code *_UNIT_TYPE} category constants.  {@code isWellKnownUnit} walks
 * this trie one character at a time, lower-casing ASCII upper-case letters
 * before each lookup.
 *
 * See http://dev.w3.org/csswg/css-values/#lengths and
 * http://dev.w3.org/csswg/css-values/#other-units
 */
private static final Trie UNIT_TRIE = new Trie(
ImmutableMap.<String, Integer>builder()
// Lengths.
.put("em", LENGTH_UNIT_TYPE)
.put("ex", LENGTH_UNIT_TYPE)
.put("ch", LENGTH_UNIT_TYPE) // Width of zero character
.put("rem", LENGTH_UNIT_TYPE) // Root element font-size
.put("vh", LENGTH_UNIT_TYPE)
.put("vw", LENGTH_UNIT_TYPE)
.put("vmin", LENGTH_UNIT_TYPE)
.put("vmax", LENGTH_UNIT_TYPE)
.put("px", LENGTH_UNIT_TYPE)
.put("mm", LENGTH_UNIT_TYPE)
.put("cm", LENGTH_UNIT_TYPE)
.put("in", LENGTH_UNIT_TYPE)
.put("pt", LENGTH_UNIT_TYPE)
.put("pc", LENGTH_UNIT_TYPE)
// Angles.
.put("deg", ANGLE_UNIT_TYPE)
.put("rad", ANGLE_UNIT_TYPE)
.put("grad", ANGLE_UNIT_TYPE)
.put("turn", ANGLE_UNIT_TYPE)
// Times.
.put("s", TIME_UNIT_TYPE)
.put("ms", TIME_UNIT_TYPE)
// Frequencies.
.put("hz", FREQUENCY_UNIT_TYPE)
.put("khz", FREQUENCY_UNIT_TYPE)
// Resolutions.
.put("dpi", RESOLUTION_UNIT_TYPE)
.put("dpcm", RESOLUTION_UNIT_TYPE)
.put("dppx", RESOLUTION_UNIT_TYPE)
.build());
/**
 * True if the characters of s in [start, end) spell one of the units in
 * UNIT_TRIE, comparing ASCII letters case-insensitively.
 *
 * @param s the text containing the candidate unit.
 * @param start index of the first character of the candidate, inclusive.
 * @param end index just past the last character of the candidate.
 * @return false for an empty range or an unrecognized unit.
 */
static boolean isWellKnownUnit(CharSequence s, int start, int end) {
  if (start == end) {
    return false;
  }
  Trie node = UNIT_TRIE;
  int index = start;
  while (index < end) {
    char c = s.charAt(index);
    if ('A' <= c && c <= 'Z') {
      // Fold ASCII upper-case onto lower-case; the trie keys are lower-case.
      c = (char) (c | 32);
    }
    node = node.lookup(c);
    if (node == null) {
      return false;
    }
    ++index;
  }
  return node.isTerminal();
}
/**
 * True if the whole of s spells one of the units in UNIT_TRIE, comparing
 * ASCII letters case-insensitively.
 */
static boolean isWellKnownUnit(CharSequence s) {
  final int length = s.length();
  return isWellKnownUnit(s, 0, length);
}
/**
 * Indexed by ASCII code unit: true for the characters that are copied into
 * a normalized URL verbatim.  Everything else is %-encoded.
 */
private static final boolean[] URL_SAFE = new boolean[128];
static {
  // From RFC 3986
  // unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
  for (int upper = 'A', lower = 'a'; upper <= 'Z'; ++upper, ++lower) {
    URL_SAFE[upper] = true;
    URL_SAFE[lower] = true;
  }
  for (int digit = '0'; digit <= '9'; ++digit) {
    URL_SAFE[digit] = true;
  }
  final String unreservedPunctuation = "-._~";
  // gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
  final String genDelims = ":/?#[]@";
  // sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
  //               / "*" / "+" / "," / ";" / "="
  // The sub-delims "'", "(", ")" and "*" are deliberately left out: they are
  // only used in the obsolete mark rule and are special in unquoted URLs or
  // comment delimiters.
  final String allowedSubDelims = "!$&+,;=";
  // % is used to encode unsafe octets.
  final String escapePrefix = "%";

  final String safePunctuation =
      unreservedPunctuation + genDelims + allowedSubDelims + escapePrefix;
  for (int i = 0; i < safePunctuation.length(); ++i) {
    URL_SAFE[safePunctuation.charAt(i)] = true;
  }
}
/** The sixteen lower-case hexadecimal digits, indexed by nibble value. */
private static final char[] HEX_DIGITS = "0123456789abcdef".toCharArray();
}