/* | |
* Copyright (C) 2009 Apple Inc. All rights reserved. | |
* | |
* Redistribution and use in source and binary forms, with or without | |
* modification, are permitted provided that the following conditions | |
* are met: | |
* 1. Redistributions of source code must retain the above copyright | |
* notice, this list of conditions and the following disclaimer. | |
* 2. Redistributions in binary form must reproduce the above copyright | |
* notice, this list of conditions and the following disclaimer in the | |
* documentation and/or other materials provided with the distribution. | |
* | |
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY | |
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | |
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR | |
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | |
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
*/ | |
#include "config.h" | |
#include "RegexInterpreter.h" | |
#include "RegexCompiler.h" | |
#include "RegexPattern.h" | |
#ifndef NDEBUG | |
#include <stdio.h> | |
#endif | |
#if ENABLE(YARR) | |
using namespace WTF; | |
namespace JSC { namespace Yarr { | |
class Interpreter { | |
public: | |
struct ParenthesesDisjunctionContext; | |
struct BackTrackInfoPatternCharacter { | |
uintptr_t matchAmount; | |
}; | |
struct BackTrackInfoCharacterClass { | |
uintptr_t matchAmount; | |
}; | |
struct BackTrackInfoBackReference { | |
uintptr_t begin; // Not really needed for greedy quantifiers. | |
uintptr_t matchAmount; // Not really needed for fixed quantifiers. | |
}; | |
struct BackTrackInfoAlternative { | |
uintptr_t offset; | |
}; | |
struct BackTrackInfoParentheticalAssertion { | |
uintptr_t begin; | |
}; | |
struct BackTrackInfoParenthesesOnce { | |
uintptr_t inParentheses; | |
}; | |
struct BackTrackInfoParentheses { | |
uintptr_t matchAmount; | |
ParenthesesDisjunctionContext* lastContext; | |
uintptr_t prevBegin; | |
uintptr_t prevEnd; | |
}; | |
static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context) | |
{ | |
context->next = backTrack->lastContext; | |
backTrack->lastContext = context; | |
++backTrack->matchAmount; | |
} | |
static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack) | |
{ | |
ASSERT(backTrack->matchAmount); | |
ASSERT(backTrack->lastContext); | |
backTrack->lastContext = backTrack->lastContext->next; | |
--backTrack->matchAmount; | |
} | |
struct DisjunctionContext | |
{ | |
DisjunctionContext() | |
: term(0) | |
{ | |
} | |
void* operator new(size_t, void* where) | |
{ | |
return where; | |
} | |
int term; | |
unsigned matchBegin; | |
unsigned matchEnd; | |
uintptr_t frame[1]; | |
}; | |
DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction) | |
{ | |
return new(malloc(sizeof(DisjunctionContext) + (disjunction->m_frameSize - 1) * sizeof(uintptr_t))) DisjunctionContext(); | |
} | |
void freeDisjunctionContext(DisjunctionContext* context) | |
{ | |
free(context); | |
} | |
struct ParenthesesDisjunctionContext | |
{ | |
ParenthesesDisjunctionContext(int* output, ByteTerm& term) | |
: next(0) | |
{ | |
unsigned firstSubpatternId = term.atom.subpatternId; | |
unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns; | |
for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) { | |
subpatternBackup[i] = output[(firstSubpatternId << 1) + i]; | |
output[(firstSubpatternId << 1) + i] = -1; | |
} | |
new(getDisjunctionContext(term)) DisjunctionContext(); | |
} | |
void* operator new(size_t, void* where) | |
{ | |
return where; | |
} | |
void restoreOutput(int* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns) | |
{ | |
for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) | |
output[(firstSubpatternId << 1) + i] = subpatternBackup[i]; | |
} | |
DisjunctionContext* getDisjunctionContext(ByteTerm& term) | |
{ | |
return reinterpret_cast<DisjunctionContext*>(&(subpatternBackup[term.atom.parenthesesDisjunction->m_numSubpatterns << 1])); | |
} | |
ParenthesesDisjunctionContext* next; | |
int subpatternBackup[1]; | |
}; | |
ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, int* output, ByteTerm& term) | |
{ | |
return new(malloc(sizeof(ParenthesesDisjunctionContext) + (((term.atom.parenthesesDisjunction->m_numSubpatterns << 1) - 1) * sizeof(int)) + sizeof(DisjunctionContext) + (disjunction->m_frameSize - 1) * sizeof(uintptr_t))) ParenthesesDisjunctionContext(output, term); | |
} | |
void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context) | |
{ | |
free(context); | |
} | |
class InputStream { | |
public: | |
InputStream(const UChar* input, unsigned start, unsigned length) | |
: input(input) | |
, pos(start) | |
, length(length) | |
{ | |
} | |
void next() | |
{ | |
++pos; | |
} | |
void rewind(unsigned amount) | |
{ | |
ASSERT(pos >= amount); | |
pos -= amount; | |
} | |
int read() | |
{ | |
ASSERT(pos < length); | |
if (pos < length) | |
return input[pos]; | |
return -1; | |
} | |
int readChecked(int position) | |
{ | |
ASSERT(position < 0); | |
ASSERT((unsigned)-position <= pos); | |
unsigned p = pos + position; | |
ASSERT(p < length); | |
return input[p]; | |
} | |
int reread(unsigned from) | |
{ | |
ASSERT(from < length); | |
return input[from]; | |
} | |
int prev() | |
{ | |
ASSERT(!(pos > length)); | |
if (pos && length) | |
return input[pos - 1]; | |
return -1; | |
} | |
unsigned getPos() | |
{ | |
return pos; | |
} | |
void setPos(unsigned p) | |
{ | |
pos = p; | |
} | |
bool atStart() | |
{ | |
return pos == 0; | |
} | |
bool atEnd() | |
{ | |
return pos == length; | |
} | |
bool checkInput(int count) | |
{ | |
if ((pos + count) <= length) { | |
pos += count; | |
return true; | |
} else | |
return false; | |
} | |
void uncheckInput(int count) | |
{ | |
pos -= count; | |
} | |
bool atStart(int position) | |
{ | |
return (pos + position) == 0; | |
} | |
bool atEnd(int position) | |
{ | |
return (pos + position) == length; | |
} | |
private: | |
const UChar* input; | |
unsigned pos; | |
unsigned length; | |
}; | |
bool testCharacterClass(CharacterClass* characterClass, int ch) | |
{ | |
if (ch & 0xFF80) { | |
for (unsigned i = 0; i < characterClass->m_matchesUnicode.size(); ++i) | |
if (ch == characterClass->m_matchesUnicode[i]) | |
return true; | |
for (unsigned i = 0; i < characterClass->m_rangesUnicode.size(); ++i) | |
if ((ch >= characterClass->m_rangesUnicode[i].begin) && (ch <= characterClass->m_rangesUnicode[i].end)) | |
return true; | |
} else { | |
for (unsigned i = 0; i < characterClass->m_matches.size(); ++i) | |
if (ch == characterClass->m_matches[i]) | |
return true; | |
for (unsigned i = 0; i < characterClass->m_ranges.size(); ++i) | |
if ((ch >= characterClass->m_ranges[i].begin) && (ch <= characterClass->m_ranges[i].end)) | |
return true; | |
} | |
return false; | |
} | |
bool tryConsumeCharacter(int testChar) | |
{ | |
if (input.atEnd()) | |
return false; | |
int ch = input.read(); | |
if (pattern->m_ignoreCase ? ((Unicode::toLower(testChar) == ch) || (Unicode::toUpper(testChar) == ch)) : (testChar == ch)) { | |
input.next(); | |
return true; | |
} | |
return false; | |
} | |
bool checkCharacter(int testChar, int inputPosition) | |
{ | |
return testChar == input.readChecked(inputPosition); | |
} | |
bool checkCasedCharacter(int loChar, int hiChar, int inputPosition) | |
{ | |
int ch = input.readChecked(inputPosition); | |
return (loChar == ch) || (hiChar == ch); | |
} | |
bool tryConsumeCharacterClass(CharacterClass* characterClass, bool invert) | |
{ | |
if (input.atEnd()) | |
return false; | |
bool match = testCharacterClass(characterClass, input.read()); | |
if (invert) | |
match = !match; | |
if (match) { | |
input.next(); | |
return true; | |
} | |
return false; | |
} | |
bool checkCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition) | |
{ | |
bool match = testCharacterClass(characterClass, input.readChecked(inputPosition)); | |
return invert ? !match : match; | |
} | |
bool tryConsumeBackReference(int matchBegin, int matchEnd, int inputOffset) | |
{ | |
int matchSize = matchEnd - matchBegin; | |
if (!input.checkInput(matchSize)) | |
return false; | |
for (int i = 0; i < matchSize; ++i) { | |
if (!checkCharacter(input.reread(matchBegin + i), inputOffset - matchSize + i)) { | |
input.uncheckInput(matchSize); | |
return false; | |
} | |
} | |
return true; | |
} | |
bool matchAssertionBOL(ByteTerm& term) | |
{ | |
return (input.atStart(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition - 1))); | |
} | |
bool matchAssertionEOL(ByteTerm& term) | |
{ | |
if (term.inputPosition) | |
return (input.atEnd(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition))); | |
else | |
return (input.atEnd()) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.read())); | |
} | |
bool matchAssertionWordBoundary(ByteTerm& term) | |
{ | |
bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition - 1)); | |
bool readIsWordchar; | |
if (term.inputPosition) | |
readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition)); | |
else | |
readIsWordchar = !input.atEnd() && testCharacterClass(pattern->wordcharCharacterClass, input.read()); | |
bool wordBoundary = prevIsWordchar != readIsWordchar; | |
return term.invert() ? !wordBoundary : wordBoundary; | |
} | |
bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context) | |
{ | |
BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); | |
switch (term.atom.quantityType) { | |
case QuantifierFixedCount: | |
break; | |
case QuantifierGreedy: | |
if (backTrack->matchAmount) { | |
--backTrack->matchAmount; | |
input.uncheckInput(1); | |
return true; | |
} | |
break; | |
case QuantifierNonGreedy: | |
if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { | |
++backTrack->matchAmount; | |
if (checkCharacter(term.atom.patternCharacter, term.inputPosition - 1)) | |
return true; | |
} | |
input.uncheckInput(backTrack->matchAmount); | |
break; | |
} | |
return false; | |
} | |
bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context) | |
{ | |
BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); | |
switch (term.atom.quantityType) { | |
case QuantifierFixedCount: | |
break; | |
case QuantifierGreedy: | |
if (backTrack->matchAmount) { | |
--backTrack->matchAmount; | |
input.uncheckInput(1); | |
return true; | |
} | |
break; | |
case QuantifierNonGreedy: | |
if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { | |
++backTrack->matchAmount; | |
if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition - 1)) | |
return true; | |
} | |
input.uncheckInput(backTrack->matchAmount); | |
break; | |
} | |
return false; | |
} | |
bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context) | |
{ | |
ASSERT(term.type == ByteTerm::TypeCharacterClass); | |
BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); | |
switch (term.atom.quantityType) { | |
case QuantifierFixedCount: { | |
for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) { | |
if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + matchAmount)) | |
return false; | |
} | |
return true; | |
} | |
case QuantifierGreedy: { | |
unsigned matchAmount = 0; | |
while ((matchAmount < term.atom.quantityCount) && input.checkInput(1)) { | |
if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1)) { | |
input.uncheckInput(1); | |
break; | |
} | |
++matchAmount; | |
} | |
backTrack->matchAmount = matchAmount; | |
return true; | |
} | |
case QuantifierNonGreedy: | |
backTrack->matchAmount = 0; | |
return true; | |
} | |
ASSERT_NOT_REACHED(); | |
return false; | |
} | |
bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context) | |
{ | |
ASSERT(term.type == ByteTerm::TypeCharacterClass); | |
BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); | |
switch (term.atom.quantityType) { | |
case QuantifierFixedCount: | |
break; | |
case QuantifierGreedy: | |
if (backTrack->matchAmount) { | |
--backTrack->matchAmount; | |
input.uncheckInput(1); | |
return true; | |
} | |
break; | |
case QuantifierNonGreedy: | |
if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) { | |
++backTrack->matchAmount; | |
if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1)) | |
return true; | |
} | |
input.uncheckInput(backTrack->matchAmount); | |
break; | |
} | |
return false; | |
} | |
bool matchBackReference(ByteTerm& term, DisjunctionContext* context) | |
{ | |
ASSERT(term.type == ByteTerm::TypeBackReference); | |
BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation); | |
int matchBegin = output[(term.atom.subpatternId << 1)]; | |
int matchEnd = output[(term.atom.subpatternId << 1) + 1]; | |
ASSERT((matchBegin == -1) == (matchEnd == -1)); | |
ASSERT(matchBegin <= matchEnd); | |
if (matchBegin == matchEnd) | |
return true; | |
switch (term.atom.quantityType) { | |
case QuantifierFixedCount: { | |
backTrack->begin = input.getPos(); | |
for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) { | |
if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) { | |
input.setPos(backTrack->begin); | |
return false; | |
} | |
} | |
return true; | |
} | |
case QuantifierGreedy: { | |
unsigned matchAmount = 0; | |
while ((matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) | |
++matchAmount; | |
backTrack->matchAmount = matchAmount; | |
return true; | |
} | |
case QuantifierNonGreedy: | |
backTrack->begin = input.getPos(); | |
backTrack->matchAmount = 0; | |
return true; | |
} | |
ASSERT_NOT_REACHED(); | |
return false; | |
} | |
bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context) | |
{ | |
ASSERT(term.type == ByteTerm::TypeBackReference); | |
BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation); | |
int matchBegin = output[(term.atom.subpatternId << 1)]; | |
int matchEnd = output[(term.atom.subpatternId << 1) + 1]; | |
ASSERT((matchBegin == -1) == (matchEnd == -1)); | |
ASSERT(matchBegin <= matchEnd); | |
if (matchBegin == matchEnd) | |
return false; | |
switch (term.atom.quantityType) { | |
case QuantifierFixedCount: | |
// for quantityCount == 1, could rewind. | |
input.setPos(backTrack->begin); | |
break; | |
case QuantifierGreedy: | |
if (backTrack->matchAmount) { | |
--backTrack->matchAmount; | |
input.rewind(matchEnd - matchBegin); | |
return true; | |
} | |
break; | |
case QuantifierNonGreedy: | |
if ((backTrack->matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) { | |
++backTrack->matchAmount; | |
return true; | |
} else | |
input.setPos(backTrack->begin); | |
break; | |
} | |
return false; | |
} | |
void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context) | |
{ | |
if (term.capture()) { | |
unsigned subpatternId = term.atom.subpatternId; | |
output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin + term.inputPosition; | |
output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd + term.inputPosition; | |
} | |
} | |
void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context) | |
{ | |
unsigned firstSubpatternId = term.atom.subpatternId; | |
unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns; | |
context->restoreOutput(output, firstSubpatternId, count); | |
} | |
void resetAssertionMatches(ByteTerm& term) | |
{ | |
unsigned firstSubpatternId = term.atom.subpatternId; | |
unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns; | |
for (unsigned i = 0; i < (count << 1); ++i) | |
output[(firstSubpatternId << 1) + i] = -1; | |
} | |
bool parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack) | |
{ | |
while (backTrack->matchAmount) { | |
ParenthesesDisjunctionContext* context = backTrack->lastContext; | |
if (matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true)) | |
return true; | |
resetMatches(term, context); | |
popParenthesesDisjunctionContext(backTrack); | |
freeParenthesesDisjunctionContext(context); | |
} | |
return false; | |
} | |
bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) | |
{ | |
ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); | |
ASSERT(term.atom.quantityCount == 1); | |
BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); | |
switch (term.atom.quantityType) { | |
case QuantifierGreedy: { | |
// set this speculatively; if we get to the parens end this will be true. | |
backTrack->inParentheses = 1; | |
break; | |
} | |
case QuantifierNonGreedy: { | |
backTrack->inParentheses = 0; | |
context->term += term.atom.parenthesesWidth; | |
return true; | |
} | |
case QuantifierFixedCount: | |
break; | |
} | |
if (term.capture()) { | |
unsigned subpatternId = term.atom.subpatternId; | |
output[(subpatternId << 1)] = input.getPos() + term.inputPosition; | |
} | |
return true; | |
} | |
bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext*) | |
{ | |
ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd); | |
ASSERT(term.atom.quantityCount == 1); | |
if (term.capture()) { | |
unsigned subpatternId = term.atom.subpatternId; | |
output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition; | |
} | |
return true; | |
} | |
bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) | |
{ | |
ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); | |
ASSERT(term.atom.quantityCount == 1); | |
BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); | |
if (term.capture()) { | |
unsigned subpatternId = term.atom.subpatternId; | |
output[(subpatternId << 1)] = -1; | |
output[(subpatternId << 1) + 1] = -1; | |
} | |
switch (term.atom.quantityType) { | |
case QuantifierGreedy: | |
// if we backtrack to this point, there is another chance - try matching nothing. | |
ASSERT(backTrack->inParentheses); | |
backTrack->inParentheses = 0; | |
context->term += term.atom.parenthesesWidth; | |
return true; | |
case QuantifierNonGreedy: | |
ASSERT(backTrack->inParentheses); | |
case QuantifierFixedCount: | |
break; | |
} | |
return false; | |
} | |
bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context) | |
{ | |
ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd); | |
ASSERT(term.atom.quantityCount == 1); | |
BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); | |
switch (term.atom.quantityType) { | |
case QuantifierGreedy: | |
if (!backTrack->inParentheses) { | |
context->term -= term.atom.parenthesesWidth; | |
return false; | |
} | |
case QuantifierNonGreedy: | |
if (!backTrack->inParentheses) { | |
// now try to match the parens; set this speculatively. | |
backTrack->inParentheses = 1; | |
if (term.capture()) { | |
unsigned subpatternId = term.atom.subpatternId; | |
output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition; | |
} | |
context->term -= term.atom.parenthesesWidth; | |
return true; | |
} | |
case QuantifierFixedCount: | |
break; | |
} | |
return false; | |
} | |
bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context) | |
{ | |
ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin); | |
ASSERT(term.atom.quantityCount == 1); | |
BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); | |
backTrack->begin = input.getPos(); | |
return true; | |
} | |
bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context) | |
{ | |
ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd); | |
ASSERT(term.atom.quantityCount == 1); | |
BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); | |
input.setPos(backTrack->begin); | |
// We've reached the end of the parens; if they are inverted, this is failure. | |
if (term.invert()) { | |
context->term -= term.atom.parenthesesWidth; | |
return false; | |
} | |
return true; | |
} | |
bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context) | |
{ | |
ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin); | |
ASSERT(term.atom.quantityCount == 1); | |
// We've failed to match parens; if they are inverted, this is win! | |
if (term.invert()) { | |
context->term += term.atom.parenthesesWidth; | |
return true; | |
} | |
return false; | |
} | |
bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context) | |
{ | |
ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd); | |
ASSERT(term.atom.quantityCount == 1); | |
BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); | |
input.setPos(backTrack->begin); | |
context->term -= term.atom.parenthesesWidth; | |
return false; | |
} | |
bool matchParentheses(ByteTerm& term, DisjunctionContext* context) | |
{ | |
ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern); | |
BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation); | |
unsigned subpatternId = term.atom.subpatternId; | |
ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction; | |
backTrack->prevBegin = output[(subpatternId << 1)]; | |
backTrack->prevEnd = output[(subpatternId << 1) + 1]; | |
backTrack->matchAmount = 0; | |
backTrack->lastContext = 0; | |
switch (term.atom.quantityType) { | |
case QuantifierFixedCount: { | |
// While we haven't yet reached our fixed limit, | |
while (backTrack->matchAmount < term.atom.quantityCount) { | |
// Try to do a match, and it it succeeds, add it to the list. | |
ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); | |
if (matchDisjunction(disjunctionBody, context->getDisjunctionContext(term))) | |
appendParenthesesDisjunctionContext(backTrack, context); | |
else { | |
// The match failed; try to find an alternate point to carry on from. | |
resetMatches(term, context); | |
freeParenthesesDisjunctionContext(context); | |
if (!parenthesesDoBacktrack(term, backTrack)) | |
return false; | |
} | |
} | |
ASSERT(backTrack->matchAmount == term.atom.quantityCount); | |
ParenthesesDisjunctionContext* context = backTrack->lastContext; | |
recordParenthesesMatch(term, context); | |
return true; | |
} | |
case QuantifierGreedy: { | |
while (backTrack->matchAmount < term.atom.quantityCount) { | |
ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); | |
if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term))) | |
appendParenthesesDisjunctionContext(backTrack, context); | |
else { | |
resetMatches(term, context); | |
freeParenthesesDisjunctionContext(context); | |
break; | |
} | |
} | |
if (backTrack->matchAmount) { | |
ParenthesesDisjunctionContext* context = backTrack->lastContext; | |
recordParenthesesMatch(term, context); | |
} | |
return true; | |
} | |
case QuantifierNonGreedy: | |
return true; | |
} | |
ASSERT_NOT_REACHED(); | |
return false; | |
} | |
// Rules for backtracking differ depending on whether this is greedy or non-greedy. | |
// | |
// Greedy matches never should try just adding more - you should already have done | |
// the 'more' cases. Always backtrack, at least a leetle bit. However cases where | |
// you backtrack an item off the list needs checking, since we'll never have matched | |
// the one less case. Tracking forwards, still add as much as possible. | |
// | |
// Non-greedy, we've already done the one less case, so don't match on popping. | |
// We haven't done the one more case, so always try to add that. | |
// | |
bool backtrackParentheses(ByteTerm& term, DisjunctionContext* context) | |
{ | |
ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern); | |
BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation); | |
if (term.capture()) { | |
unsigned subpatternId = term.atom.subpatternId; | |
output[(subpatternId << 1)] = backTrack->prevBegin; | |
output[(subpatternId << 1) + 1] = backTrack->prevEnd; | |
} | |
ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction; | |
switch (term.atom.quantityType) { | |
case QuantifierFixedCount: { | |
ASSERT(backTrack->matchAmount == term.atom.quantityCount); | |
ParenthesesDisjunctionContext* context = 0; | |
if (!parenthesesDoBacktrack(term, backTrack)) | |
return false; | |
// While we haven't yet reached our fixed limit, | |
while (backTrack->matchAmount < term.atom.quantityCount) { | |
// Try to do a match, and it it succeeds, add it to the list. | |
context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); | |
if (matchDisjunction(disjunctionBody, context->getDisjunctionContext(term))) | |
appendParenthesesDisjunctionContext(backTrack, context); | |
else { | |
// The match failed; try to find an alternate point to carry on from. | |
resetMatches(term, context); | |
freeParenthesesDisjunctionContext(context); | |
if (!parenthesesDoBacktrack(term, backTrack)) | |
return false; | |
} | |
} | |
ASSERT(backTrack->matchAmount == term.atom.quantityCount); | |
context = backTrack->lastContext; | |
recordParenthesesMatch(term, context); | |
return true; | |
} | |
case QuantifierGreedy: { | |
if (!backTrack->matchAmount) | |
return false; | |
ParenthesesDisjunctionContext* context = backTrack->lastContext; | |
if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true)) { | |
while (backTrack->matchAmount < term.atom.quantityCount) { | |
ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); | |
if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term))) | |
appendParenthesesDisjunctionContext(backTrack, context); | |
else { | |
resetMatches(term, context); | |
freeParenthesesDisjunctionContext(context); | |
break; | |
} | |
} | |
} else { | |
resetMatches(term, context); | |
popParenthesesDisjunctionContext(backTrack); | |
freeParenthesesDisjunctionContext(context); | |
} | |
if (backTrack->matchAmount) { | |
ParenthesesDisjunctionContext* context = backTrack->lastContext; | |
recordParenthesesMatch(term, context); | |
} | |
return true; | |
} | |
case QuantifierNonGreedy: { | |
// If we've not reached the limit, try to add one more match. | |
if (backTrack->matchAmount < term.atom.quantityCount) { | |
ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); | |
if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term))) { | |
appendParenthesesDisjunctionContext(backTrack, context); | |
recordParenthesesMatch(term, context); | |
return true; | |
} else { | |
resetMatches(term, context); | |
freeParenthesesDisjunctionContext(context); | |
} | |
} | |
// Nope - okay backtrack looking for an alternative. | |
while (backTrack->matchAmount) { | |
ParenthesesDisjunctionContext* context = backTrack->lastContext; | |
if (matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true)) { | |
// successful backtrack! we're back in the game! | |
if (backTrack->matchAmount) { | |
context = backTrack->lastContext; | |
recordParenthesesMatch(term, context); | |
} | |
return true; | |
} | |
// pop a match off the stack | |
resetMatches(term, context); | |
popParenthesesDisjunctionContext(backTrack); | |
freeParenthesesDisjunctionContext(context); | |
} | |
return false; | |
} | |
} | |
ASSERT_NOT_REACHED(); | |
return false; | |
} | |
#define MATCH_NEXT() { ++context->term; goto matchAgain; } | |
#define BACKTRACK() { --context->term; goto backtrack; } | |
#define currentTerm() (disjunction->terms[context->term]) | |
bool matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false) | |
{ | |
if (btrack) | |
BACKTRACK(); | |
context->matchBegin = input.getPos(); | |
context->term = 0; | |
matchAgain: | |
ASSERT(context->term < static_cast<int>(disjunction->terms.size())); | |
switch (currentTerm().type) { | |
case ByteTerm::TypeSubpatternBegin: | |
MATCH_NEXT(); | |
case ByteTerm::TypeSubpatternEnd: | |
context->matchEnd = input.getPos(); | |
return true; | |
case ByteTerm::TypeBodyAlternativeBegin: | |
MATCH_NEXT(); | |
case ByteTerm::TypeBodyAlternativeDisjunction: | |
case ByteTerm::TypeBodyAlternativeEnd: | |
context->matchEnd = input.getPos(); | |
return true; | |
case ByteTerm::TypeAlternativeBegin: | |
MATCH_NEXT(); | |
case ByteTerm::TypeAlternativeDisjunction: | |
case ByteTerm::TypeAlternativeEnd: { | |
int offset = currentTerm().alternative.end; | |
BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation); | |
backTrack->offset = offset; | |
context->term += offset; | |
MATCH_NEXT(); | |
} | |
case ByteTerm::TypeAssertionBOL: | |
if (matchAssertionBOL(currentTerm())) | |
MATCH_NEXT(); | |
BACKTRACK(); | |
case ByteTerm::TypeAssertionEOL: | |
if (matchAssertionEOL(currentTerm())) | |
MATCH_NEXT(); | |
BACKTRACK(); | |
case ByteTerm::TypeAssertionWordBoundary: | |
if (matchAssertionWordBoundary(currentTerm())) | |
MATCH_NEXT(); | |
BACKTRACK(); | |
case ByteTerm::TypePatternCharacterOnce: | |
case ByteTerm::TypePatternCharacterFixed: { | |
for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) { | |
if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + matchAmount)) | |
BACKTRACK(); | |
} | |
MATCH_NEXT(); | |
} | |
case ByteTerm::TypePatternCharacterGreedy: { | |
BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); | |
unsigned matchAmount = 0; | |
while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) { | |
if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - 1)) { | |
input.uncheckInput(1); | |
break; | |
} | |
++matchAmount; | |
} | |
backTrack->matchAmount = matchAmount; | |
MATCH_NEXT(); | |
} | |
case ByteTerm::TypePatternCharacterNonGreedy: { | |
BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); | |
backTrack->matchAmount = 0; | |
MATCH_NEXT(); | |
} | |
case ByteTerm::TypePatternCasedCharacterOnce: | |
case ByteTerm::TypePatternCasedCharacterFixed: { | |
for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) { | |
if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + matchAmount)) | |
BACKTRACK(); | |
} | |
MATCH_NEXT(); | |
} | |
case ByteTerm::TypePatternCasedCharacterGreedy: { | |
BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); | |
unsigned matchAmount = 0; | |
while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) { | |
if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - 1)) { | |
input.uncheckInput(1); | |
break; | |
} | |
++matchAmount; | |
} | |
backTrack->matchAmount = matchAmount; | |
MATCH_NEXT(); | |
} | |
case ByteTerm::TypePatternCasedCharacterNonGreedy: { | |
BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); | |
backTrack->matchAmount = 0; | |
MATCH_NEXT(); | |
} | |
case ByteTerm::TypeCharacterClass: | |
if (matchCharacterClass(currentTerm(), context)) | |
MATCH_NEXT(); | |
BACKTRACK(); | |
case ByteTerm::TypeBackReference: | |
if (matchBackReference(currentTerm(), context)) | |
MATCH_NEXT(); | |
BACKTRACK(); | |
case ByteTerm::TypeParenthesesSubpattern: | |
if (matchParentheses(currentTerm(), context)) | |
MATCH_NEXT(); | |
BACKTRACK(); | |
case ByteTerm::TypeParenthesesSubpatternOnceBegin: | |
if (matchParenthesesOnceBegin(currentTerm(), context)) | |
MATCH_NEXT(); | |
BACKTRACK(); | |
case ByteTerm::TypeParenthesesSubpatternOnceEnd: | |
if (matchParenthesesOnceEnd(currentTerm(), context)) | |
MATCH_NEXT(); | |
BACKTRACK(); | |
case ByteTerm::TypeParentheticalAssertionBegin: | |
if (matchParentheticalAssertionBegin(currentTerm(), context)) | |
MATCH_NEXT(); | |
BACKTRACK(); | |
case ByteTerm::TypeParentheticalAssertionEnd: | |
if (matchParentheticalAssertionEnd(currentTerm(), context)) | |
MATCH_NEXT(); | |
BACKTRACK(); | |
case ByteTerm::TypeCheckInput: | |
if (input.checkInput(currentTerm().checkInputCount)) | |
MATCH_NEXT(); | |
BACKTRACK(); | |
} | |
// We should never fall-through to here. | |
ASSERT_NOT_REACHED(); | |
backtrack: | |
ASSERT(context->term < static_cast<int>(disjunction->terms.size())); | |
switch (currentTerm().type) { | |
case ByteTerm::TypeSubpatternBegin: | |
return false; | |
case ByteTerm::TypeSubpatternEnd: | |
ASSERT_NOT_REACHED(); | |
case ByteTerm::TypeBodyAlternativeBegin: | |
case ByteTerm::TypeBodyAlternativeDisjunction: { | |
int offset = currentTerm().alternative.next; | |
context->term += offset; | |
if (offset > 0) | |
MATCH_NEXT(); | |
if (input.atEnd()) | |
return false; | |
input.next(); | |
context->matchBegin = input.getPos(); | |
MATCH_NEXT(); | |
} | |
case ByteTerm::TypeBodyAlternativeEnd: | |
ASSERT_NOT_REACHED(); | |
case ByteTerm::TypeAlternativeBegin: | |
case ByteTerm::TypeAlternativeDisjunction: { | |
int offset = currentTerm().alternative.next; | |
context->term += offset; | |
if (offset > 0) | |
MATCH_NEXT(); | |
BACKTRACK(); | |
} | |
case ByteTerm::TypeAlternativeEnd: { | |
// We should never backtrack back into an alternative of the main body of the regex. | |
BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation); | |
unsigned offset = backTrack->offset; | |
context->term -= offset; | |
BACKTRACK(); | |
} | |
case ByteTerm::TypeAssertionBOL: | |
case ByteTerm::TypeAssertionEOL: | |
case ByteTerm::TypeAssertionWordBoundary: | |
BACKTRACK(); | |
case ByteTerm::TypePatternCharacterOnce: | |
case ByteTerm::TypePatternCharacterFixed: | |
case ByteTerm::TypePatternCharacterGreedy: | |
case ByteTerm::TypePatternCharacterNonGreedy: | |
if (backtrackPatternCharacter(currentTerm(), context)) | |
MATCH_NEXT(); | |
BACKTRACK(); | |
case ByteTerm::TypePatternCasedCharacterOnce: | |
case ByteTerm::TypePatternCasedCharacterFixed: | |
case ByteTerm::TypePatternCasedCharacterGreedy: | |
case ByteTerm::TypePatternCasedCharacterNonGreedy: | |
if (backtrackPatternCasedCharacter(currentTerm(), context)) | |
MATCH_NEXT(); | |
BACKTRACK(); | |
case ByteTerm::TypeCharacterClass: | |
if (backtrackCharacterClass(currentTerm(), context)) | |
MATCH_NEXT(); | |
BACKTRACK(); | |
case ByteTerm::TypeBackReference: | |
if (backtrackBackReference(currentTerm(), context)) | |
MATCH_NEXT(); | |
BACKTRACK(); | |
case ByteTerm::TypeParenthesesSubpattern: | |
if (backtrackParentheses(currentTerm(), context)) | |
MATCH_NEXT(); | |
BACKTRACK(); | |
case ByteTerm::TypeParenthesesSubpatternOnceBegin: | |
if (backtrackParenthesesOnceBegin(currentTerm(), context)) | |
MATCH_NEXT(); | |
BACKTRACK(); | |
case ByteTerm::TypeParenthesesSubpatternOnceEnd: | |
if (backtrackParenthesesOnceEnd(currentTerm(), context)) | |
MATCH_NEXT(); | |
BACKTRACK(); | |
case ByteTerm::TypeParentheticalAssertionBegin: | |
if (backtrackParentheticalAssertionBegin(currentTerm(), context)) | |
MATCH_NEXT(); | |
BACKTRACK(); | |
case ByteTerm::TypeParentheticalAssertionEnd: | |
if (backtrackParentheticalAssertionEnd(currentTerm(), context)) | |
MATCH_NEXT(); | |
BACKTRACK(); | |
case ByteTerm::TypeCheckInput: | |
input.uncheckInput(currentTerm().checkInputCount); | |
BACKTRACK(); | |
} | |
ASSERT_NOT_REACHED(); | |
return false; | |
} | |
bool matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false) | |
{ | |
if (matchDisjunction(disjunction, context, btrack)) { | |
while (context->matchBegin == context->matchEnd) { | |
if (!matchDisjunction(disjunction, context, true)) | |
return false; | |
} | |
return true; | |
} | |
return false; | |
} | |
int interpret() | |
{ | |
for (unsigned i = 0; i < ((pattern->m_body->m_numSubpatterns + 1) << 1); ++i) | |
output[i] = -1; | |
DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get()); | |
if (matchDisjunction(pattern->m_body.get(), context)) { | |
output[0] = context->matchBegin; | |
output[1] = context->matchEnd; | |
} | |
freeDisjunctionContext(context); | |
return output[0]; | |
} | |
Interpreter(BytecodePattern* pattern, int* output, const UChar* inputChar, unsigned start, unsigned length) | |
: pattern(pattern) | |
, output(output) | |
, input(inputChar, start, length) | |
{ | |
} | |
private: | |
BytecodePattern *pattern; | |
int* output; | |
InputStream input; | |
}; | |
class ByteCompiler { | |
struct ParenthesesStackEntry { | |
unsigned beginTerm; | |
unsigned savedAlternativeIndex; | |
ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/*, unsigned subpatternId, bool capture = false*/) | |
: beginTerm(beginTerm) | |
, savedAlternativeIndex(savedAlternativeIndex) | |
{ | |
} | |
}; | |
public: | |
ByteCompiler(RegexPattern& pattern) | |
: m_pattern(pattern) | |
{ | |
m_bodyDisjunction = 0; | |
m_currentAlternativeIndex = 0; | |
} | |
BytecodePattern* compile() | |
{ | |
regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize); | |
emitDisjunction(m_pattern.m_body); | |
regexEnd(); | |
return new BytecodePattern(m_bodyDisjunction, m_allParenthesesInfo, m_pattern); | |
} | |
void checkInput(unsigned count) | |
{ | |
m_bodyDisjunction->terms.append(ByteTerm::CheckInput(count)); | |
} | |
void assertionBOL(int inputPosition) | |
{ | |
m_bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition)); | |
} | |
void assertionEOL(int inputPosition) | |
{ | |
m_bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition)); | |
} | |
void assertionWordBoundary(bool invert, int inputPosition) | |
{ | |
m_bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition)); | |
} | |
void atomPatternCharacter(UChar ch, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) | |
{ | |
if (m_pattern.m_ignoreCase) { | |
UChar lo = Unicode::toLower(ch); | |
UChar hi = Unicode::toUpper(ch); | |
if (lo != hi) { | |
m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityCount, quantityType)); | |
return; | |
} | |
} | |
m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityCount, quantityType)); | |
} | |
void atomCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) | |
{ | |
m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition)); | |
m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount; | |
m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType; | |
m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; | |
} | |
void atomBackReference(unsigned subpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) | |
{ | |
ASSERT(subpatternId); | |
m_bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition)); | |
m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount; | |
m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType; | |
m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; | |
} | |
void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) | |
{ | |
int beginTerm = m_bodyDisjunction->terms.size(); | |
m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, inputPosition)); | |
m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; | |
m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); | |
m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; | |
m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); | |
m_currentAlternativeIndex = beginTerm + 1; | |
} | |
void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation) | |
{ | |
int beginTerm = m_bodyDisjunction->terms.size(); | |
m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, invert, 0)); | |
m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; | |
m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); | |
m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; | |
m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); | |
m_currentAlternativeIndex = beginTerm + 1; | |
} | |
unsigned popParenthesesStack() | |
{ | |
ASSERT(m_parenthesesStack.size()); | |
int stackEnd = m_parenthesesStack.size() - 1; | |
unsigned beginTerm = m_parenthesesStack[stackEnd].beginTerm; | |
m_currentAlternativeIndex = m_parenthesesStack[stackEnd].savedAlternativeIndex; | |
m_parenthesesStack.shrink(stackEnd); | |
ASSERT(beginTerm < m_bodyDisjunction->terms.size()); | |
ASSERT(m_currentAlternativeIndex < m_bodyDisjunction->terms.size()); | |
return beginTerm; | |
} | |
#ifndef NDEBUG | |
void dumpDisjunction(ByteDisjunction* disjunction) | |
{ | |
printf("ByteDisjunction(%p):\n\t", disjunction); | |
for (unsigned i = 0; i < disjunction->terms.size(); ++i) | |
printf("{ %d } ", disjunction->terms[i].type); | |
printf("\n"); | |
} | |
#endif | |
void closeAlternative(int beginTerm) | |
{ | |
int origBeginTerm = beginTerm; | |
ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin); | |
int endIndex = m_bodyDisjunction->terms.size(); | |
unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation; | |
if (!m_bodyDisjunction->terms[beginTerm].alternative.next) | |
m_bodyDisjunction->terms.remove(beginTerm); | |
else { | |
while (m_bodyDisjunction->terms[beginTerm].alternative.next) { | |
beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next; | |
ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction); | |
m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm; | |
m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; | |
} | |
m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm; | |
m_bodyDisjunction->terms.append(ByteTerm::AlternativeEnd()); | |
m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation; | |
} | |
} | |
void closeBodyAlternative() | |
{ | |
int beginTerm = 0; | |
int origBeginTerm = 0; | |
ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin); | |
int endIndex = m_bodyDisjunction->terms.size(); | |
unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation; | |
while (m_bodyDisjunction->terms[beginTerm].alternative.next) { | |
beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next; | |
ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction); | |
m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm; | |
m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; | |
} | |
m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm; | |
m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeEnd()); | |
m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation; | |
} | |
void atomParenthesesEnd(bool doInline, unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0) | |
{ | |
unsigned beginTerm = popParenthesesStack(); | |
closeAlternative(beginTerm + 1); | |
unsigned endTerm = m_bodyDisjunction->terms.size(); | |
bool isAssertion = m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin; | |
bool invertOrCapture = m_bodyDisjunction->terms[beginTerm].invertOrCapture; | |
unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId; | |
m_bodyDisjunction->terms.append(ByteTerm(isAssertion ? ByteTerm::TypeParentheticalAssertionEnd : ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, invertOrCapture, inputPosition)); | |
m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; | |
m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; | |
m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; | |
if (doInline) { | |
m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; | |
m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; | |
m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount; | |
m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; | |
} else { | |
ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm]; | |
ASSERT(parenthesesBegin.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); | |
bool invertOrCapture = parenthesesBegin.invertOrCapture; | |
unsigned subpatternId = parenthesesBegin.atom.subpatternId; | |
unsigned numSubpatterns = lastSubpatternId - subpatternId + 1; | |
ByteDisjunction* parenthesesDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize); | |
parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin()); | |
for (unsigned termInParentheses = beginTerm + 1; termInParentheses < endTerm; ++termInParentheses) | |
parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]); | |
parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd()); | |
m_bodyDisjunction->terms.shrink(beginTerm); | |
m_allParenthesesInfo.append(parenthesesDisjunction); | |
m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, invertOrCapture, inputPosition)); | |
m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; | |
m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; | |
m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; | |
} | |
} | |
void regexBegin(unsigned numSubpatterns, unsigned callFrameSize) | |
{ | |
m_bodyDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize); | |
m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin()); | |
m_bodyDisjunction->terms[0].frameLocation = 0; | |
m_currentAlternativeIndex = 0; | |
} | |
void regexEnd() | |
{ | |
closeBodyAlternative(); | |
} | |
void alternativeBodyDisjunction() | |
{ | |
int newAlternativeIndex = m_bodyDisjunction->terms.size(); | |
m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex; | |
m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction()); | |
m_currentAlternativeIndex = newAlternativeIndex; | |
} | |
void alternativeDisjunction() | |
{ | |
int newAlternativeIndex = m_bodyDisjunction->terms.size(); | |
m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex; | |
m_bodyDisjunction->terms.append(ByteTerm::AlternativeDisjunction()); | |
m_currentAlternativeIndex = newAlternativeIndex; | |
} | |
void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0) | |
{ | |
for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { | |
unsigned currentCountAlreadyChecked = inputCountAlreadyChecked; | |
if (alt) { | |
if (disjunction == m_pattern.m_body) | |
alternativeBodyDisjunction(); | |
else | |
alternativeDisjunction(); | |
} | |
PatternAlternative* alternative = disjunction->m_alternatives[alt]; | |
unsigned minimumSize = alternative->m_minimumSize; | |
ASSERT(minimumSize >= parenthesesInputCountAlreadyChecked); | |
unsigned countToCheck = minimumSize - parenthesesInputCountAlreadyChecked; | |
if (countToCheck) | |
checkInput(countToCheck); | |
currentCountAlreadyChecked += countToCheck; | |
for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { | |
PatternTerm& term = alternative->m_terms[i]; | |
switch (term.type) { | |
case PatternTerm::TypeAssertionBOL: | |
assertionBOL(term.inputPosition - currentCountAlreadyChecked); | |
break; | |
case PatternTerm::TypeAssertionEOL: | |
assertionEOL(term.inputPosition - currentCountAlreadyChecked); | |
break; | |
case PatternTerm::TypeAssertionWordBoundary: | |
assertionWordBoundary(term.invertOrCapture, term.inputPosition - currentCountAlreadyChecked); | |
break; | |
case PatternTerm::TypePatternCharacter: | |
atomPatternCharacter(term.patternCharacter, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); | |
break; | |
case PatternTerm::TypeCharacterClass: | |
atomCharacterClass(term.characterClass, term.invertOrCapture, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); | |
break; | |
case PatternTerm::TypeBackReference: | |
atomBackReference(term.subpatternId, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType); | |
break; | |
case PatternTerm::TypeForwardReference: | |
break; | |
case PatternTerm::TypeParenthesesSubpattern: { | |
unsigned disjunctionAlreadyCheckedCount = 0; | |
if ((term.quantityCount == 1) && !term.parentheses.isCopy) { | |
if (term.quantityType == QuantifierFixedCount) { | |
disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize; | |
unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; | |
atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation); | |
emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, term.parentheses.disjunction->m_minimumSize); | |
atomParenthesesEnd(true, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); | |
} else { | |
unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; | |
atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation + RegexStackSpaceForBackTrackInfoParenthesesOnce); | |
emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0); | |
atomParenthesesEnd(true, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); | |
} | |
} else { | |
unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; | |
atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.invertOrCapture, delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, 0); | |
emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0); | |
atomParenthesesEnd(false, term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); | |
} | |
break; | |
} | |
case PatternTerm::TypeParentheticalAssertion: { | |
unsigned alternativeFrameLocation = term.inputPosition + RegexStackSpaceForBackTrackInfoParentheticalAssertion; | |
atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invertOrCapture, term.frameLocation, alternativeFrameLocation); | |
emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0); | |
atomParenthesesEnd(true, term.parentheses.lastSubpatternId, 0, term.frameLocation, term.quantityCount, term.quantityType); | |
break; | |
} | |
} | |
} | |
} | |
} | |
private: | |
RegexPattern& m_pattern; | |
ByteDisjunction* m_bodyDisjunction; | |
unsigned m_currentAlternativeIndex; | |
Vector<ParenthesesStackEntry> m_parenthesesStack; | |
Vector<ByteDisjunction*> m_allParenthesesInfo; | |
}; | |
BytecodePattern* byteCompileRegex(const UString& patternString, unsigned& numSubpatterns, const char*& error, bool ignoreCase, bool multiline) | |
{ | |
RegexPattern pattern(ignoreCase, multiline); | |
if ((error = compileRegex(patternString, pattern))) | |
return 0; | |
numSubpatterns = pattern.m_numSubpatterns; | |
return ByteCompiler(pattern).compile(); | |
} | |
int interpretRegex(BytecodePattern* regex, const UChar* input, unsigned start, unsigned length, int* output) | |
{ | |
return Interpreter(regex, output, input, start, length).interpret(); | |
} | |
COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoPatternCharacter) == (RegexStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoPatternCharacter); | |
COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoCharacterClass) == (RegexStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoCharacterClass); | |
COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoBackReference) == (RegexStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoBackReference); | |
COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoAlternative) == (RegexStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoAlternative); | |
COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheticalAssertion) == (RegexStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheticalAssertion); | |
COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParenthesesOnce) == (RegexStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParenthesesOnce); | |
COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheses) == (RegexStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheses); | |
} } | |
#endif |