/* | |
* Copyright (C) 2008 Apple Inc. All rights reserved. | |
* | |
* Redistribution and use in source and binary forms, with or without | |
* modification, are permitted provided that the following conditions | |
* are met: | |
* 1. Redistributions of source code must retain the above copyright | |
* notice, this list of conditions and the following disclaimer. | |
* 2. Redistributions in binary form must reproduce the above copyright | |
* notice, this list of conditions and the following disclaimer in the | |
* documentation and/or other materials provided with the distribution. | |
* | |
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY | |
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | |
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR | |
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | |
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
*/ | |
#include "config.h" | |
#include "WRECParser.h" | |
#if ENABLE(WREC) | |
#include "CharacterClassConstructor.h" | |
#include "WRECFunctors.h" | |
using namespace WTF; | |
namespace JSC { namespace WREC { | |
// These error messages match the error messages used by PCRE. | |
const char* Parser::QuantifierOutOfOrder = "numbers out of order in {} quantifier"; | |
const char* Parser::QuantifierWithoutAtom = "nothing to repeat"; | |
const char* Parser::ParenthesesUnmatched = "unmatched parentheses"; | |
const char* Parser::ParenthesesTypeInvalid = "unrecognized character after (?"; | |
const char* Parser::ParenthesesNotSupported = ""; // Not a user-visible syntax error -- just signals a syntax that WREC doesn't support yet. | |
const char* Parser::CharacterClassUnmatched = "missing terminating ] for character class"; | |
const char* Parser::CharacterClassOutOfOrder = "range out of order in character class"; | |
const char* Parser::EscapeUnterminated = "\\ at end of pattern"; | |
class PatternCharacterSequence { | |
typedef Generator::JumpList JumpList; | |
public: | |
PatternCharacterSequence(Generator& generator, JumpList& failures) | |
: m_generator(generator) | |
, m_failures(failures) | |
{ | |
} | |
size_t size() { return m_sequence.size(); } | |
void append(int ch) | |
{ | |
m_sequence.append(ch); | |
} | |
void flush() | |
{ | |
if (!m_sequence.size()) | |
return; | |
m_generator.generatePatternCharacterSequence(m_failures, m_sequence.begin(), m_sequence.size()); | |
m_sequence.clear(); | |
} | |
void flush(const Quantifier& quantifier) | |
{ | |
if (!m_sequence.size()) | |
return; | |
m_generator.generatePatternCharacterSequence(m_failures, m_sequence.begin(), m_sequence.size() - 1); | |
switch (quantifier.type) { | |
case Quantifier::None: | |
case Quantifier::Error: | |
ASSERT_NOT_REACHED(); | |
break; | |
case Quantifier::Greedy: { | |
GeneratePatternCharacterFunctor functor(m_sequence.last()); | |
m_generator.generateGreedyQuantifier(m_failures, functor, quantifier.min, quantifier.max); | |
break; | |
} | |
case Quantifier::NonGreedy: { | |
GeneratePatternCharacterFunctor functor(m_sequence.last()); | |
m_generator.generateNonGreedyQuantifier(m_failures, functor, quantifier.min, quantifier.max); | |
break; | |
} | |
} | |
m_sequence.clear(); | |
} | |
private: | |
Generator& m_generator; | |
JumpList& m_failures; | |
Vector<int, 8> m_sequence; | |
}; | |
ALWAYS_INLINE Quantifier Parser::consumeGreedyQuantifier() | |
{ | |
switch (peek()) { | |
case '?': | |
consume(); | |
return Quantifier(Quantifier::Greedy, 0, 1); | |
case '*': | |
consume(); | |
return Quantifier(Quantifier::Greedy, 0); | |
case '+': | |
consume(); | |
return Quantifier(Quantifier::Greedy, 1); | |
case '{': { | |
SavedState state(*this); | |
consume(); | |
// Accept: {n}, {n,}, {n,m}. | |
// Reject: {n,m} where n > m. | |
// Ignore: Anything else, such as {n, m}. | |
if (!peekIsDigit()) { | |
state.restore(); | |
return Quantifier(); | |
} | |
unsigned min = consumeNumber(); | |
unsigned max = min; | |
if (peek() == ',') { | |
consume(); | |
max = peekIsDigit() ? consumeNumber() : Quantifier::Infinity; | |
} | |
if (peek() != '}') { | |
state.restore(); | |
return Quantifier(); | |
} | |
consume(); | |
if (min > max) { | |
setError(QuantifierOutOfOrder); | |
return Quantifier(Quantifier::Error); | |
} | |
return Quantifier(Quantifier::Greedy, min, max); | |
} | |
default: | |
return Quantifier(); // No quantifier. | |
} | |
} | |
Quantifier Parser::consumeQuantifier() | |
{ | |
Quantifier q = consumeGreedyQuantifier(); | |
if ((q.type == Quantifier::Greedy) && (peek() == '?')) { | |
consume(); | |
q.type = Quantifier::NonGreedy; | |
} | |
return q; | |
} | |
bool Parser::parseCharacterClassQuantifier(JumpList& failures, const CharacterClass& charClass, bool invert) | |
{ | |
Quantifier q = consumeQuantifier(); | |
switch (q.type) { | |
case Quantifier::None: { | |
m_generator.generateCharacterClass(failures, charClass, invert); | |
break; | |
} | |
case Quantifier::Greedy: { | |
GenerateCharacterClassFunctor functor(&charClass, invert); | |
m_generator.generateGreedyQuantifier(failures, functor, q.min, q.max); | |
break; | |
} | |
case Quantifier::NonGreedy: { | |
GenerateCharacterClassFunctor functor(&charClass, invert); | |
m_generator.generateNonGreedyQuantifier(failures, functor, q.min, q.max); | |
break; | |
} | |
case Quantifier::Error: | |
return false; | |
} | |
return true; | |
} | |
bool Parser::parseBackreferenceQuantifier(JumpList& failures, unsigned subpatternId) | |
{ | |
Quantifier q = consumeQuantifier(); | |
switch (q.type) { | |
case Quantifier::None: { | |
m_generator.generateBackreference(failures, subpatternId); | |
break; | |
} | |
case Quantifier::Greedy: | |
case Quantifier::NonGreedy: | |
m_generator.generateBackreferenceQuantifier(failures, q.type, subpatternId, q.min, q.max); | |
return true; | |
case Quantifier::Error: | |
return false; | |
} | |
return true; | |
} | |
bool Parser::parseParentheses(JumpList& failures) | |
{ | |
ParenthesesType type = consumeParenthesesType(); | |
// FIXME: WREC originally failed to backtrack correctly in cases such as | |
// "c".match(/(.*)c/). Now, most parentheses handling is disabled. For | |
// unsupported parentheses, we fall back on PCRE. | |
switch (type) { | |
case Generator::Assertion: { | |
m_generator.generateParenthesesAssertion(failures); | |
if (consume() != ')') { | |
setError(ParenthesesUnmatched); | |
return false; | |
} | |
Quantifier quantifier = consumeQuantifier(); | |
if (quantifier.type != Quantifier::None && quantifier.min == 0) { | |
setError(ParenthesesNotSupported); | |
return false; | |
} | |
return true; | |
} | |
case Generator::InvertedAssertion: { | |
m_generator.generateParenthesesInvertedAssertion(failures); | |
if (consume() != ')') { | |
setError(ParenthesesUnmatched); | |
return false; | |
} | |
Quantifier quantifier = consumeQuantifier(); | |
if (quantifier.type != Quantifier::None && quantifier.min == 0) { | |
setError(ParenthesesNotSupported); | |
return false; | |
} | |
return true; | |
} | |
default: | |
setError(ParenthesesNotSupported); | |
return false; | |
} | |
} | |
bool Parser::parseCharacterClass(JumpList& failures) | |
{ | |
bool invert = false; | |
if (peek() == '^') { | |
consume(); | |
invert = true; | |
} | |
CharacterClassConstructor constructor(m_ignoreCase); | |
int ch; | |
while ((ch = peek()) != ']') { | |
switch (ch) { | |
case EndOfPattern: | |
setError(CharacterClassUnmatched); | |
return false; | |
case '\\': { | |
consume(); | |
Escape escape = consumeEscape(true); | |
switch (escape.type()) { | |
case Escape::PatternCharacter: { | |
int character = PatternCharacterEscape::cast(escape).character(); | |
if (character == '-') | |
constructor.flushBeforeEscapedHyphen(); | |
constructor.put(character); | |
break; | |
} | |
case Escape::CharacterClass: { | |
const CharacterClassEscape& characterClassEscape = CharacterClassEscape::cast(escape); | |
ASSERT(!characterClassEscape.invert()); | |
constructor.append(characterClassEscape.characterClass()); | |
break; | |
} | |
case Escape::Error: | |
return false; | |
case Escape::Backreference: | |
case Escape::WordBoundaryAssertion: { | |
ASSERT_NOT_REACHED(); | |
break; | |
} | |
} | |
break; | |
} | |
default: | |
consume(); | |
constructor.put(ch); | |
} | |
} | |
consume(); | |
// lazily catch reversed ranges ([z-a])in character classes | |
if (constructor.isUpsideDown()) { | |
setError(CharacterClassOutOfOrder); | |
return false; | |
} | |
constructor.flush(); | |
CharacterClass charClass = constructor.charClass(); | |
return parseCharacterClassQuantifier(failures, charClass, invert); | |
} | |
bool Parser::parseNonCharacterEscape(JumpList& failures, const Escape& escape) | |
{ | |
switch (escape.type()) { | |
case Escape::PatternCharacter: | |
ASSERT_NOT_REACHED(); | |
return false; | |
case Escape::CharacterClass: | |
return parseCharacterClassQuantifier(failures, CharacterClassEscape::cast(escape).characterClass(), CharacterClassEscape::cast(escape).invert()); | |
case Escape::Backreference: | |
return parseBackreferenceQuantifier(failures, BackreferenceEscape::cast(escape).subpatternId()); | |
case Escape::WordBoundaryAssertion: | |
m_generator.generateAssertionWordBoundary(failures, WordBoundaryAssertionEscape::cast(escape).invert()); | |
return true; | |
case Escape::Error: | |
return false; | |
} | |
ASSERT_NOT_REACHED(); | |
return false; | |
} | |
Escape Parser::consumeEscape(bool inCharacterClass) | |
{ | |
switch (peek()) { | |
case EndOfPattern: | |
setError(EscapeUnterminated); | |
return Escape(Escape::Error); | |
// Assertions | |
case 'b': | |
consume(); | |
if (inCharacterClass) | |
return PatternCharacterEscape('\b'); | |
return WordBoundaryAssertionEscape(false); // do not invert | |
case 'B': | |
consume(); | |
if (inCharacterClass) | |
return PatternCharacterEscape('B'); | |
return WordBoundaryAssertionEscape(true); // invert | |
// CharacterClassEscape | |
case 'd': | |
consume(); | |
return CharacterClassEscape(CharacterClass::digits(), false); | |
case 's': | |
consume(); | |
return CharacterClassEscape(CharacterClass::spaces(), false); | |
case 'w': | |
consume(); | |
return CharacterClassEscape(CharacterClass::wordchar(), false); | |
case 'D': | |
consume(); | |
return inCharacterClass | |
? CharacterClassEscape(CharacterClass::nondigits(), false) | |
: CharacterClassEscape(CharacterClass::digits(), true); | |
case 'S': | |
consume(); | |
return inCharacterClass | |
? CharacterClassEscape(CharacterClass::nonspaces(), false) | |
: CharacterClassEscape(CharacterClass::spaces(), true); | |
case 'W': | |
consume(); | |
return inCharacterClass | |
? CharacterClassEscape(CharacterClass::nonwordchar(), false) | |
: CharacterClassEscape(CharacterClass::wordchar(), true); | |
// DecimalEscape | |
case '1': | |
case '2': | |
case '3': | |
case '4': | |
case '5': | |
case '6': | |
case '7': | |
case '8': | |
case '9': { | |
if (peekDigit() > m_numSubpatterns || inCharacterClass) { | |
// To match Firefox, we parse an invalid backreference in the range [1-7] | |
// as an octal escape. | |
return peekDigit() > 7 ? PatternCharacterEscape('\\') : PatternCharacterEscape(consumeOctal()); | |
} | |
int value = 0; | |
do { | |
unsigned newValue = value * 10 + peekDigit(); | |
if (newValue > m_numSubpatterns) | |
break; | |
value = newValue; | |
consume(); | |
} while (peekIsDigit()); | |
return BackreferenceEscape(value); | |
} | |
// Octal escape | |
case '0': | |
consume(); | |
return PatternCharacterEscape(consumeOctal()); | |
// ControlEscape | |
case 'f': | |
consume(); | |
return PatternCharacterEscape('\f'); | |
case 'n': | |
consume(); | |
return PatternCharacterEscape('\n'); | |
case 'r': | |
consume(); | |
return PatternCharacterEscape('\r'); | |
case 't': | |
consume(); | |
return PatternCharacterEscape('\t'); | |
case 'v': | |
consume(); | |
return PatternCharacterEscape('\v'); | |
// ControlLetter | |
case 'c': { | |
SavedState state(*this); | |
consume(); | |
int control = consume(); | |
// To match Firefox, inside a character class, we also accept numbers | |
// and '_' as control characters. | |
if ((!inCharacterClass && !isASCIIAlpha(control)) || (!isASCIIAlphanumeric(control) && control != '_')) { | |
state.restore(); | |
return PatternCharacterEscape('\\'); | |
} | |
return PatternCharacterEscape(control & 31); | |
} | |
// HexEscape | |
case 'x': { | |
consume(); | |
SavedState state(*this); | |
int x = consumeHex(2); | |
if (x == -1) { | |
state.restore(); | |
return PatternCharacterEscape('x'); | |
} | |
return PatternCharacterEscape(x); | |
} | |
// UnicodeEscape | |
case 'u': { | |
consume(); | |
SavedState state(*this); | |
int x = consumeHex(4); | |
if (x == -1) { | |
state.restore(); | |
return PatternCharacterEscape('u'); | |
} | |
return PatternCharacterEscape(x); | |
} | |
// IdentityEscape | |
default: | |
return PatternCharacterEscape(consume()); | |
} | |
} | |
void Parser::parseAlternative(JumpList& failures) | |
{ | |
PatternCharacterSequence sequence(m_generator, failures); | |
while (1) { | |
switch (peek()) { | |
case EndOfPattern: | |
case '|': | |
case ')': | |
sequence.flush(); | |
return; | |
case '*': | |
case '+': | |
case '?': | |
case '{': { | |
Quantifier q = consumeQuantifier(); | |
if (q.type == Quantifier::None) { | |
sequence.append(consume()); | |
continue; | |
} | |
if (q.type == Quantifier::Error) | |
return; | |
if (!sequence.size()) { | |
setError(QuantifierWithoutAtom); | |
return; | |
} | |
sequence.flush(q); | |
continue; | |
} | |
case '^': | |
consume(); | |
sequence.flush(); | |
m_generator.generateAssertionBOL(failures); | |
continue; | |
case '$': | |
consume(); | |
sequence.flush(); | |
m_generator.generateAssertionEOL(failures); | |
continue; | |
case '.': | |
consume(); | |
sequence.flush(); | |
if (!parseCharacterClassQuantifier(failures, CharacterClass::newline(), true)) | |
return; | |
continue; | |
case '[': | |
consume(); | |
sequence.flush(); | |
if (!parseCharacterClass(failures)) | |
return; | |
continue; | |
case '(': | |
consume(); | |
sequence.flush(); | |
if (!parseParentheses(failures)) | |
return; | |
continue; | |
case '\\': { | |
consume(); | |
Escape escape = consumeEscape(false); | |
if (escape.type() == Escape::PatternCharacter) { | |
sequence.append(PatternCharacterEscape::cast(escape).character()); | |
continue; | |
} | |
sequence.flush(); | |
if (!parseNonCharacterEscape(failures, escape)) | |
return; | |
continue; | |
} | |
default: | |
sequence.append(consume()); | |
continue; | |
} | |
} | |
} | |
/* | |
TOS holds index. | |
*/ | |
void Parser::parseDisjunction(JumpList& failures) | |
{ | |
parseAlternative(failures); | |
if (peek() != '|') | |
return; | |
JumpList successes; | |
do { | |
consume(); | |
m_generator.terminateAlternative(successes, failures); | |
parseAlternative(failures); | |
} while (peek() == '|'); | |
m_generator.terminateDisjunction(successes); | |
} | |
Generator::ParenthesesType Parser::consumeParenthesesType() | |
{ | |
if (peek() != '?') | |
return Generator::Capturing; | |
consume(); | |
switch (consume()) { | |
case ':': | |
return Generator::NonCapturing; | |
case '=': | |
return Generator::Assertion; | |
case '!': | |
return Generator::InvertedAssertion; | |
default: | |
setError(ParenthesesTypeInvalid); | |
return Generator::Error; | |
} | |
} | |
} } // namespace JSC::WREC | |
#endif // ENABLE(WREC) |