blob: b62ad1fff816c416ae2018e3843147f30f0c0f06 [file] [log] [blame]
// Copyright 2012 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "src/regexp/regexp.h"
#include "src/codegen/compilation-cache.h"
#include "src/diagnostics/code-tracer.h"
#include "src/heap/heap-inl.h"
#include "src/objects/js-regexp-inl.h"
#include "src/regexp/experimental/experimental.h"
#include "src/regexp/regexp-bytecode-generator.h"
#include "src/regexp/regexp-bytecodes.h"
#include "src/regexp/regexp-compiler.h"
#include "src/regexp/regexp-dotprinter.h"
#include "src/regexp/regexp-interpreter.h"
#include "src/regexp/regexp-macro-assembler-arch.h"
#include "src/regexp/regexp-macro-assembler-tracer.h"
#include "src/regexp/regexp-parser.h"
#include "src/regexp/regexp-utils.h"
#include "src/strings/string-search.h"
#include "src/utils/ostreams.h"
namespace v8 {
namespace internal {
using namespace regexp_compiler_constants; // NOLINT(build/namespaces)
class RegExpImpl final : public AllStatic {
public:
// Returns a string representation of a regular expression.
// Implements RegExp.prototype.toString, see ECMA-262 section 15.10.6.4.
// This function calls the garbage collector if necessary.
static Handle<String> ToString(Handle<Object> value);
// Prepares a JSRegExp object with Irregexp-specific data.
static void IrregexpInitialize(Isolate* isolate, Handle<JSRegExp> re,
Handle<String> pattern, JSRegExp::Flags flags,
int capture_count, uint32_t backtrack_limit);
// Prepare a RegExp for being executed one or more times (using
// IrregexpExecOnce) on the subject.
// This ensures that the regexp is compiled for the subject, and that
// the subject is flat.
// Returns the number of integer spaces required by IrregexpExecOnce
// as its "registers" argument. If the regexp cannot be compiled,
// an exception is set as pending, and this function returns negative.
static int IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp,
Handle<String> subject);
static void AtomCompile(Isolate* isolate, Handle<JSRegExp> re,
Handle<String> pattern, JSRegExp::Flags flags,
Handle<String> match_pattern);
static int AtomExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
Handle<String> subject, int index, int32_t* output,
int output_size);
static Handle<Object> AtomExec(Isolate* isolate, Handle<JSRegExp> regexp,
Handle<String> subject, int index,
Handle<RegExpMatchInfo> last_match_info);
// Execute a regular expression on the subject, starting from index.
// If matching succeeds, return the number of matches. This can be larger
// than one in the case of global regular expressions.
// The captures and subcaptures are stored into the registers vector.
// If matching fails, returns RE_FAILURE.
// If execution fails, sets a pending exception and returns RE_EXCEPTION.
static int IrregexpExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
Handle<String> subject, int index, int32_t* output,
int output_size);
// Execute an Irregexp bytecode pattern.
// On a successful match, the result is a JSArray containing
// captured positions. On a failure, the result is the null value.
// Returns an empty handle in case of an exception.
V8_WARN_UNUSED_RESULT static MaybeHandle<Object> IrregexpExec(
Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
int index, Handle<RegExpMatchInfo> last_match_info);
static bool CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re,
Handle<String> sample_subject, bool is_one_byte);
static inline bool EnsureCompiledIrregexp(Isolate* isolate,
Handle<JSRegExp> re,
Handle<String> sample_subject,
bool is_one_byte);
// Returns true on success, false on failure.
static bool Compile(Isolate* isolate, Zone* zone, RegExpCompileData* input,
JSRegExp::Flags flags, Handle<String> pattern,
Handle<String> sample_subject, bool is_one_byte,
uint32_t& backtrack_limit);
// For acting on the JSRegExp data FixedArray.
static int IrregexpMaxRegisterCount(FixedArray re);
static void SetIrregexpMaxRegisterCount(FixedArray re, int value);
static int IrregexpNumberOfCaptures(FixedArray re);
static ByteArray IrregexpByteCode(FixedArray re, bool is_one_byte);
static Code IrregexpNativeCode(FixedArray re, bool is_one_byte);
};
MaybeHandle<Object> RegExp::ThrowRegExpException(Isolate* isolate,
Handle<JSRegExp> re,
Handle<String> pattern,
RegExpError error) {
Vector<const char> error_data = CStrVector(RegExpErrorString(error));
Handle<String> error_text =
isolate->factory()
->NewStringFromOneByte(Vector<const uint8_t>::cast(error_data))
.ToHandleChecked();
THROW_NEW_ERROR(
isolate,
NewSyntaxError(MessageTemplate::kMalformedRegExp, pattern, error_text),
Object);
}
void RegExp::ThrowRegExpException(Isolate* isolate, Handle<JSRegExp> re,
RegExpError error_text) {
USE(ThrowRegExpException(isolate, re, Handle<String>(re->Pattern(), isolate),
error_text));
}
bool RegExp::IsUnmodifiedRegExp(Isolate* isolate, Handle<JSRegExp> regexp) {
return RegExpUtils::IsUnmodifiedRegExp(isolate, regexp);
}
// Identifies the sort of regexps where the regexp engine is faster
// than the code used for atom matches.
static bool HasFewDifferentCharacters(Handle<String> pattern) {
int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());
if (length <= kPatternTooShortForBoyerMoore) return false;
const int kMod = 128;
bool character_found[kMod];
int different = 0;
memset(&character_found[0], 0, sizeof(character_found));
for (int i = 0; i < length; i++) {
int ch = (pattern->Get(i) & (kMod - 1));
if (!character_found[ch]) {
character_found[ch] = true;
different++;
// We declare a regexp low-alphabet if it has at least 3 times as many
// characters as it has different characters.
if (different * 3 > length) return false;
}
}
return true;
}
// Generic RegExp methods. Dispatches to implementation specific methods.
// static
MaybeHandle<Object> RegExp::Compile(Isolate* isolate, Handle<JSRegExp> re,
Handle<String> pattern,
JSRegExp::Flags flags,
uint32_t backtrack_limit) {
DCHECK(pattern->IsFlat());
// Caching is based only on the pattern and flags, but code also differs when
// a backtrack limit is set. A present backtrack limit is very much *not* the
// common case, so just skip the cache for these.
const bool is_compilation_cache_enabled =
(backtrack_limit == JSRegExp::kNoBacktrackLimit);
Zone zone(isolate->allocator(), ZONE_NAME);
CompilationCache* compilation_cache = nullptr;
if (is_compilation_cache_enabled) {
compilation_cache = isolate->compilation_cache();
MaybeHandle<FixedArray> maybe_cached =
compilation_cache->LookupRegExp(pattern, flags);
Handle<FixedArray> cached;
if (maybe_cached.ToHandle(&cached)) {
re->set_data(*cached);
return re;
}
}
PostponeInterruptsScope postpone(isolate);
RegExpCompileData parse_result;
FlatStringReader reader(isolate, pattern);
DCHECK(!isolate->has_pending_exception());
if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
&parse_result)) {
// Throw an exception if we fail to parse the pattern.
return RegExp::ThrowRegExpException(isolate, re, pattern,
parse_result.error);
}
bool has_been_compiled = false;
if (FLAG_default_to_experimental_regexp_engine &&
ExperimentalRegExp::CanBeHandled(parse_result.tree, flags,
parse_result.capture_count)) {
DCHECK(FLAG_enable_experimental_regexp_engine);
ExperimentalRegExp::Initialize(isolate, re, pattern, flags,
parse_result.capture_count);
has_been_compiled = true;
} else if (flags & JSRegExp::kLinear) {
DCHECK(FLAG_enable_experimental_regexp_engine);
if (!ExperimentalRegExp::CanBeHandled(parse_result.tree, flags,
parse_result.capture_count)) {
// TODO(mbid): The error could provide a reason for why the regexp can't
// be executed in linear time (e.g. due to back references).
return RegExp::ThrowRegExpException(isolate, re, pattern,
RegExpError::kNotLinear);
}
ExperimentalRegExp::Initialize(isolate, re, pattern, flags,
parse_result.capture_count);
has_been_compiled = true;
} else if (parse_result.simple && !IgnoreCase(flags) && !IsSticky(flags) &&
!HasFewDifferentCharacters(pattern)) {
// Parse-tree is a single atom that is equal to the pattern.
RegExpImpl::AtomCompile(isolate, re, pattern, flags, pattern);
has_been_compiled = true;
} else if (parse_result.tree->IsAtom() && !IsSticky(flags) &&
parse_result.capture_count == 0) {
RegExpAtom* atom = parse_result.tree->AsAtom();
// The pattern source might (?) contain escape sequences, but they're
// resolved in atom_string.
Vector<const uc16> atom_pattern = atom->data();
Handle<String> atom_string;
ASSIGN_RETURN_ON_EXCEPTION(
isolate, atom_string,
isolate->factory()->NewStringFromTwoByte(atom_pattern), Object);
if (!IgnoreCase(atom->flags()) && !HasFewDifferentCharacters(atom_string)) {
RegExpImpl::AtomCompile(isolate, re, pattern, flags, atom_string);
has_been_compiled = true;
}
}
if (!has_been_compiled) {
RegExpImpl::IrregexpInitialize(isolate, re, pattern, flags,
parse_result.capture_count, backtrack_limit);
}
DCHECK(re->data().IsFixedArray());
// Compilation succeeded so the data is set on the regexp
// and we can store it in the cache.
Handle<FixedArray> data(FixedArray::cast(re->data()), isolate);
if (is_compilation_cache_enabled) {
compilation_cache->PutRegExp(pattern, flags, data);
}
return re;
}
// static
bool RegExp::EnsureFullyCompiled(Isolate* isolate, Handle<JSRegExp> re,
Handle<String> subject) {
switch (re->TypeTag()) {
case JSRegExp::NOT_COMPILED:
UNREACHABLE();
case JSRegExp::ATOM:
return true;
case JSRegExp::IRREGEXP:
if (RegExpImpl::IrregexpPrepare(isolate, re, subject) == -1) {
DCHECK(isolate->has_pending_exception());
return false;
}
return true;
case JSRegExp::EXPERIMENTAL:
if (!ExperimentalRegExp::IsCompiled(re, isolate) &&
!ExperimentalRegExp::Compile(isolate, re)) {
DCHECK(isolate->has_pending_exception());
return false;
}
return true;
}
}
// static
MaybeHandle<Object> RegExp::ExperimentalOneshotExec(
Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
int index, Handle<RegExpMatchInfo> last_match_info) {
return ExperimentalRegExp::OneshotExec(isolate, regexp, subject, index,
last_match_info);
}
// static
MaybeHandle<Object> RegExp::Exec(Isolate* isolate, Handle<JSRegExp> regexp,
Handle<String> subject, int index,
Handle<RegExpMatchInfo> last_match_info) {
switch (regexp->TypeTag()) {
case JSRegExp::NOT_COMPILED:
UNREACHABLE();
case JSRegExp::ATOM:
return RegExpImpl::AtomExec(isolate, regexp, subject, index,
last_match_info);
case JSRegExp::IRREGEXP:
return RegExpImpl::IrregexpExec(isolate, regexp, subject, index,
last_match_info);
case JSRegExp::EXPERIMENTAL:
return ExperimentalRegExp::Exec(isolate, regexp, subject, index,
last_match_info);
}
}
// RegExp Atom implementation: Simple string search using indexOf.
void RegExpImpl::AtomCompile(Isolate* isolate, Handle<JSRegExp> re,
Handle<String> pattern, JSRegExp::Flags flags,
Handle<String> match_pattern) {
isolate->factory()->SetRegExpAtomData(re, pattern, flags, match_pattern);
}
static void SetAtomLastCapture(Isolate* isolate,
Handle<RegExpMatchInfo> last_match_info,
String subject, int from, int to) {
SealHandleScope shs(isolate);
last_match_info->SetNumberOfCaptureRegisters(2);
last_match_info->SetLastSubject(subject);
last_match_info->SetLastInput(subject);
last_match_info->SetCapture(0, from);
last_match_info->SetCapture(1, to);
}
int RegExpImpl::AtomExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
Handle<String> subject, int index, int32_t* output,
int output_size) {
DCHECK_LE(0, index);
DCHECK_LE(index, subject->length());
subject = String::Flatten(isolate, subject);
DisallowHeapAllocation no_gc; // ensure vectors stay valid
String needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex));
int needle_len = needle.length();
DCHECK(needle.IsFlat());
DCHECK_LT(0, needle_len);
if (index + needle_len > subject->length()) {
return RegExp::RE_FAILURE;
}
for (int i = 0; i < output_size; i += 2) {
String::FlatContent needle_content = needle.GetFlatContent(no_gc);
String::FlatContent subject_content = subject->GetFlatContent(no_gc);
DCHECK(needle_content.IsFlat());
DCHECK(subject_content.IsFlat());
// dispatch on type of strings
index =
(needle_content.IsOneByte()
? (subject_content.IsOneByte()
? SearchString(isolate, subject_content.ToOneByteVector(),
needle_content.ToOneByteVector(), index)
: SearchString(isolate, subject_content.ToUC16Vector(),
needle_content.ToOneByteVector(), index))
: (subject_content.IsOneByte()
? SearchString(isolate, subject_content.ToOneByteVector(),
needle_content.ToUC16Vector(), index)
: SearchString(isolate, subject_content.ToUC16Vector(),
needle_content.ToUC16Vector(), index)));
if (index == -1) {
return i / 2; // Return number of matches.
} else {
output[i] = index;
output[i + 1] = index + needle_len;
index += needle_len;
}
}
return output_size / 2;
}
Handle<Object> RegExpImpl::AtomExec(Isolate* isolate, Handle<JSRegExp> re,
Handle<String> subject, int index,
Handle<RegExpMatchInfo> last_match_info) {
static const int kNumRegisters = 2;
STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);
int32_t* output_registers = isolate->jsregexp_static_offsets_vector();
int res =
AtomExecRaw(isolate, re, subject, index, output_registers, kNumRegisters);
if (res == RegExp::RE_FAILURE) return isolate->factory()->null_value();
DCHECK_EQ(res, RegExp::RE_SUCCESS);
SealHandleScope shs(isolate);
SetAtomLastCapture(isolate, last_match_info, *subject, output_registers[0],
output_registers[1]);
return last_match_info;
}
// Irregexp implementation.
// Ensures that the regexp object contains a compiled version of the
// source for either one-byte or two-byte subject strings.
// If the compiled version doesn't already exist, it is compiled
// from the source pattern.
// If compilation fails, an exception is thrown and this function
// returns false.
bool RegExpImpl::EnsureCompiledIrregexp(Isolate* isolate, Handle<JSRegExp> re,
Handle<String> sample_subject,
bool is_one_byte) {
Object compiled_code = re->Code(is_one_byte);
Object bytecode = re->Bytecode(is_one_byte);
bool needs_initial_compilation =
compiled_code == Smi::FromInt(JSRegExp::kUninitializedValue);
// Recompile is needed when we're dealing with the first execution of the
// regexp after the decision to tier up has been made. If the tiering up
// strategy is not in use, this value is always false.
bool needs_tier_up_compilation =
re->MarkedForTierUp() && bytecode.IsByteArray();
if (FLAG_trace_regexp_tier_up && needs_tier_up_compilation) {
PrintF("JSRegExp object %p needs tier-up compilation\n",
reinterpret_cast<void*>(re->ptr()));
}
if (!needs_initial_compilation && !needs_tier_up_compilation) {
DCHECK(compiled_code.IsCode());
DCHECK_IMPLIES(FLAG_regexp_interpret_all, bytecode.IsByteArray());
return true;
}
DCHECK_IMPLIES(needs_tier_up_compilation, bytecode.IsByteArray());
return CompileIrregexp(isolate, re, sample_subject, is_one_byte);
}
#ifdef DEBUG
namespace {
bool RegExpCodeIsValidForPreCompilation(Handle<JSRegExp> re, bool is_one_byte) {
Object entry = re->Code(is_one_byte);
Object bytecode = re->Bytecode(is_one_byte);
// If we're not using the tier-up strategy, entry can only be a smi
// representing an uncompiled regexp here. If we're using the tier-up
// strategy, entry can still be a smi representing an uncompiled regexp, when
// compiling the regexp before the tier-up, or it can contain a trampoline to
// the regexp interpreter, in which case the bytecode field contains compiled
// bytecode, when recompiling the regexp after the tier-up. If the
// tier-up was forced, which happens for global replaces, entry is a smi
// representing an uncompiled regexp, even though we're "recompiling" after
// the tier-up.
if (re->ShouldProduceBytecode()) {
DCHECK(entry.IsSmi());
DCHECK(bytecode.IsSmi());
int entry_value = Smi::ToInt(entry);
int bytecode_value = Smi::ToInt(bytecode);
DCHECK_EQ(JSRegExp::kUninitializedValue, entry_value);
DCHECK_EQ(JSRegExp::kUninitializedValue, bytecode_value);
} else {
DCHECK(entry.IsSmi() || (entry.IsCode() && bytecode.IsByteArray()));
}
return true;
}
} // namespace
#endif
bool RegExpImpl::CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re,
Handle<String> sample_subject,
bool is_one_byte) {
// Compile the RegExp.
Zone zone(isolate->allocator(), ZONE_NAME);
PostponeInterruptsScope postpone(isolate);
DCHECK(RegExpCodeIsValidForPreCompilation(re, is_one_byte));
JSRegExp::Flags flags = re->GetFlags();
Handle<String> pattern(re->Pattern(), isolate);
pattern = String::Flatten(isolate, pattern);
RegExpCompileData compile_data;
FlatStringReader reader(isolate, pattern);
if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
&compile_data)) {
// Throw an exception if we fail to parse the pattern.
// THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
USE(RegExp::ThrowRegExpException(isolate, re, pattern, compile_data.error));
return false;
}
// The compilation target is a kBytecode if we're interpreting all regexp
// objects, or if we're using the tier-up strategy but the tier-up hasn't
// happened yet. The compilation target is a kNative if we're using the
// tier-up strategy and we need to recompile to tier-up, or if we're producing
// native code for all regexp objects.
compile_data.compilation_target = re->ShouldProduceBytecode()
? RegExpCompilationTarget::kBytecode
: RegExpCompilationTarget::kNative;
uint32_t backtrack_limit = re->BacktrackLimit();
const bool compilation_succeeded =
Compile(isolate, &zone, &compile_data, flags, pattern, sample_subject,
is_one_byte, backtrack_limit);
if (!compilation_succeeded) {
DCHECK(compile_data.error != RegExpError::kNone);
RegExp::ThrowRegExpException(isolate, re, compile_data.error);
return false;
}
Handle<FixedArray> data =
Handle<FixedArray>(FixedArray::cast(re->data()), isolate);
if (compile_data.compilation_target == RegExpCompilationTarget::kNative) {
data->set(JSRegExp::code_index(is_one_byte), *compile_data.code);
// Reset bytecode to uninitialized. In case we use tier-up we know that
// tier-up has happened this way.
data->set(JSRegExp::bytecode_index(is_one_byte),
Smi::FromInt(JSRegExp::kUninitializedValue));
} else {
DCHECK_EQ(compile_data.compilation_target,
RegExpCompilationTarget::kBytecode);
// Store code generated by compiler in bytecode and trampoline to
// interpreter in code.
data->set(JSRegExp::bytecode_index(is_one_byte), *compile_data.code);
Handle<Code> trampoline =
BUILTIN_CODE(isolate, RegExpInterpreterTrampoline);
data->set(JSRegExp::code_index(is_one_byte), *trampoline);
}
re->SetCaptureNameMap(compile_data.capture_name_map);
int register_max = IrregexpMaxRegisterCount(*data);
if (compile_data.register_count > register_max) {
SetIrregexpMaxRegisterCount(*data, compile_data.register_count);
}
data->set(JSRegExp::kIrregexpBacktrackLimit, Smi::FromInt(backtrack_limit));
if (FLAG_trace_regexp_tier_up) {
PrintF("JSRegExp object %p %s size: %d\n",
reinterpret_cast<void*>(re->ptr()),
re->ShouldProduceBytecode() ? "bytecode" : "native code",
re->ShouldProduceBytecode()
? IrregexpByteCode(*data, is_one_byte).Size()
: IrregexpNativeCode(*data, is_one_byte).Size());
}
return true;
}
int RegExpImpl::IrregexpMaxRegisterCount(FixedArray re) {
return Smi::ToInt(re.get(JSRegExp::kIrregexpMaxRegisterCountIndex));
}
void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray re, int value) {
re.set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
}
int RegExpImpl::IrregexpNumberOfCaptures(FixedArray re) {
return Smi::ToInt(re.get(JSRegExp::kIrregexpCaptureCountIndex));
}
ByteArray RegExpImpl::IrregexpByteCode(FixedArray re, bool is_one_byte) {
return ByteArray::cast(re.get(JSRegExp::bytecode_index(is_one_byte)));
}
Code RegExpImpl::IrregexpNativeCode(FixedArray re, bool is_one_byte) {
return Code::cast(re.get(JSRegExp::code_index(is_one_byte)));
}
void RegExpImpl::IrregexpInitialize(Isolate* isolate, Handle<JSRegExp> re,
Handle<String> pattern,
JSRegExp::Flags flags, int capture_count,
uint32_t backtrack_limit) {
// Initialize compiled code entries to null.
isolate->factory()->SetRegExpIrregexpData(re, pattern, flags, capture_count,
backtrack_limit);
}
// static
int RegExpImpl::IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp,
Handle<String> subject) {
DCHECK(subject->IsFlat());
// Check representation of the underlying storage.
bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
if (!RegExpImpl::EnsureCompiledIrregexp(isolate, regexp, subject,
is_one_byte)) {
return -1;
}
// Only reserve room for output captures. Internal registers are allocated by
// the engine.
return JSRegExp::RegistersForCaptureCount(regexp->CaptureCount());
}
int RegExpImpl::IrregexpExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
Handle<String> subject, int index,
int32_t* output, int output_size) {
DCHECK_LE(0, index);
DCHECK_LE(index, subject->length());
DCHECK(subject->IsFlat());
DCHECK_GE(output_size,
JSRegExp::RegistersForCaptureCount(regexp->CaptureCount()));
bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
if (!regexp->ShouldProduceBytecode()) {
do {
EnsureCompiledIrregexp(isolate, regexp, subject, is_one_byte);
// The stack is used to allocate registers for the compiled regexp code.
// This means that in case of failure, the output registers array is left
// untouched and contains the capture results from the previous successful
// match. We can use that to set the last match info lazily.
int res = NativeRegExpMacroAssembler::Match(regexp, subject, output,
output_size, index, isolate);
if (res != NativeRegExpMacroAssembler::RETRY) {
DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION ||
isolate->has_pending_exception());
STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) ==
RegExp::RE_SUCCESS);
STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::FAILURE) ==
RegExp::RE_FAILURE);
STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION) ==
RegExp::RE_EXCEPTION);
return res;
}
// If result is RETRY, the string has changed representation, and we
// must restart from scratch.
// In this case, it means we must make sure we are prepared to handle
// the, potentially, different subject (the string can switch between
// being internal and external, and even between being Latin1 and UC16,
// but the characters are always the same).
is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
} while (true);
UNREACHABLE();
} else {
DCHECK(regexp->ShouldProduceBytecode());
do {
IrregexpInterpreter::Result result =
IrregexpInterpreter::MatchForCallFromRuntime(
isolate, regexp, subject, output, output_size, index);
DCHECK_IMPLIES(result == IrregexpInterpreter::EXCEPTION,
isolate->has_pending_exception());
switch (result) {
case IrregexpInterpreter::SUCCESS:
case IrregexpInterpreter::EXCEPTION:
case IrregexpInterpreter::FAILURE:
case IrregexpInterpreter::FALLBACK_TO_EXPERIMENTAL:
return result;
case IrregexpInterpreter::RETRY:
// The string has changed representation, and we must restart the
// match.
// We need to reset the tier up to start over with compilation.
if (FLAG_regexp_tier_up) regexp->ResetLastTierUpTick();
is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
EnsureCompiledIrregexp(isolate, regexp, subject, is_one_byte);
break;
}
} while (true);
UNREACHABLE();
}
}
MaybeHandle<Object> RegExpImpl::IrregexpExec(
Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
int previous_index, Handle<RegExpMatchInfo> last_match_info) {
DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
subject = String::Flatten(isolate, subject);
#ifdef DEBUG
if (FLAG_trace_regexp_bytecodes && regexp->ShouldProduceBytecode()) {
String pattern = regexp->Pattern();
PrintF("\n\nRegexp match: /%s/\n\n", pattern.ToCString().get());
PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get());
}
#endif
// For very long subject strings, the regexp interpreter is currently much
// slower than the jitted code execution. If the tier-up strategy is turned
// on, we want to avoid this performance penalty so we eagerly tier-up if the
// subject string length is equal or greater than the given heuristic value.
if (FLAG_regexp_tier_up &&
subject->length() >= JSRegExp::kTierUpForSubjectLengthValue) {
regexp->MarkTierUpForNextExec();
if (FLAG_trace_regexp_tier_up) {
PrintF(
"Forcing tier-up for very long strings in "
"RegExpImpl::IrregexpExec\n");
}
}
// Prepare space for the return values.
int required_registers =
RegExpImpl::IrregexpPrepare(isolate, regexp, subject);
if (required_registers < 0) {
// Compiling failed with an exception.
DCHECK(isolate->has_pending_exception());
return MaybeHandle<Object>();
}
int32_t* output_registers = nullptr;
if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
output_registers = NewArray<int32_t>(required_registers);
}
std::unique_ptr<int32_t[]> auto_release(output_registers);
if (output_registers == nullptr) {
output_registers = isolate->jsregexp_static_offsets_vector();
}
int res =
RegExpImpl::IrregexpExecRaw(isolate, regexp, subject, previous_index,
output_registers, required_registers);
if (res == RegExp::RE_SUCCESS) {
int capture_count = regexp->CaptureCount();
return RegExp::SetLastMatchInfo(isolate, last_match_info, subject,
capture_count, output_registers);
} else if (res == RegExp::RE_FALLBACK_TO_EXPERIMENTAL) {
return ExperimentalRegExp::OneshotExec(isolate, regexp, subject,
previous_index, last_match_info);
} else if (res == RegExp::RE_EXCEPTION) {
DCHECK(isolate->has_pending_exception());
return MaybeHandle<Object>();
} else {
DCHECK(res == RegExp::RE_FAILURE);
return isolate->factory()->null_value();
}
}
// static
Handle<RegExpMatchInfo> RegExp::SetLastMatchInfo(
Isolate* isolate, Handle<RegExpMatchInfo> last_match_info,
Handle<String> subject, int capture_count, int32_t* match) {
// This is the only place where match infos can grow. If, after executing the
// regexp, RegExpExecStub finds that the match info is too small, it restarts
// execution in RegExpImpl::Exec, which finally grows the match info right
// here.
Handle<RegExpMatchInfo> result =
RegExpMatchInfo::ReserveCaptures(isolate, last_match_info, capture_count);
if (*result != *last_match_info) {
if (*last_match_info == *isolate->regexp_last_match_info()) {
// This inner condition is only needed for special situations like the
// regexp fuzzer, where we pass our own custom RegExpMatchInfo to
// RegExpImpl::Exec; there actually want to bypass the Isolate's match
// info and execute the regexp without side effects.
isolate->native_context()->set_regexp_last_match_info(*result);
}
}
int capture_register_count =
JSRegExp::RegistersForCaptureCount(capture_count);
DisallowHeapAllocation no_allocation;
if (match != nullptr) {
for (int i = 0; i < capture_register_count; i += 2) {
result->SetCapture(i, match[i]);
result->SetCapture(i + 1, match[i + 1]);
}
}
result->SetLastSubject(*subject);
result->SetLastInput(*subject);
return result;
}
// static
void RegExp::DotPrintForTesting(const char* label, RegExpNode* node) {
DotPrinter::DotPrint(label, node);
}
namespace {
// Returns true if we've either generated too much irregex code within this
// isolate, or the pattern string is too long.
bool TooMuchRegExpCode(Isolate* isolate, Handle<String> pattern) {
// Limit the space regexps take up on the heap. In order to limit this we
// would like to keep track of the amount of regexp code on the heap. This
// is not tracked, however. As a conservative approximation we track the
// total regexp code compiled including code that has subsequently been freed
// and the total executable memory at any point.
static constexpr size_t kRegExpExecutableMemoryLimit = 16 * MB;
static constexpr size_t kRegExpCompiledLimit = 1 * MB;
Heap* heap = isolate->heap();
if (pattern->length() > RegExp::kRegExpTooLargeToOptimize) return true;
return (isolate->total_regexp_code_generated() > kRegExpCompiledLimit &&
heap->CommittedMemoryExecutable() > kRegExpExecutableMemoryLimit);
}
} // namespace
// static
bool RegExp::CompileForTesting(Isolate* isolate, Zone* zone,
RegExpCompileData* data, JSRegExp::Flags flags,
Handle<String> pattern,
Handle<String> sample_subject,
bool is_one_byte) {
uint32_t backtrack_limit = JSRegExp::kNoBacktrackLimit;
return RegExpImpl::Compile(isolate, zone, data, flags, pattern,
sample_subject, is_one_byte, backtrack_limit);
}
bool RegExpImpl::Compile(Isolate* isolate, Zone* zone, RegExpCompileData* data,
JSRegExp::Flags flags, Handle<String> pattern,
Handle<String> sample_subject, bool is_one_byte,
uint32_t& backtrack_limit) {
if (JSRegExp::RegistersForCaptureCount(data->capture_count) >
RegExpMacroAssembler::kMaxRegisterCount) {
data->error = RegExpError::kTooLarge;
return false;
}
RegExpCompiler compiler(isolate, zone, data->capture_count, is_one_byte);
if (compiler.optimize()) {
compiler.set_optimize(!TooMuchRegExpCode(isolate, pattern));
}
// Sample some characters from the middle of the string.
static const int kSampleSize = 128;
sample_subject = String::Flatten(isolate, sample_subject);
int chars_sampled = 0;
int half_way = (sample_subject->length() - kSampleSize) / 2;
for (int i = Max(0, half_way);
i < sample_subject->length() && chars_sampled < kSampleSize;
i++, chars_sampled++) {
compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
}
data->node = compiler.PreprocessRegExp(data, flags, is_one_byte);
data->error = AnalyzeRegExp(isolate, is_one_byte, data->node);
if (data->error != RegExpError::kNone) {
return false;
}
// Create the correct assembler for the architecture.
std::unique_ptr<RegExpMacroAssembler> macro_assembler;
if (data->compilation_target == RegExpCompilationTarget::kNative) {
// Native regexp implementation.
DCHECK(!FLAG_jitless);
NativeRegExpMacroAssembler::Mode mode =
is_one_byte ? NativeRegExpMacroAssembler::LATIN1
: NativeRegExpMacroAssembler::UC16;
const int output_register_count =
JSRegExp::RegistersForCaptureCount(data->capture_count);
#if V8_TARGET_ARCH_IA32
macro_assembler.reset(new RegExpMacroAssemblerIA32(isolate, zone, mode,
output_register_count));
#elif V8_TARGET_ARCH_X64
macro_assembler.reset(new RegExpMacroAssemblerX64(isolate, zone, mode,
output_register_count));
#elif V8_TARGET_ARCH_ARM
macro_assembler.reset(new RegExpMacroAssemblerARM(isolate, zone, mode,
output_register_count));
#elif V8_TARGET_ARCH_ARM64
macro_assembler.reset(new RegExpMacroAssemblerARM64(isolate, zone, mode,
output_register_count));
#elif V8_TARGET_ARCH_S390
macro_assembler.reset(new RegExpMacroAssemblerS390(isolate, zone, mode,
output_register_count));
#elif V8_TARGET_ARCH_PPC || V8_TARGET_ARCH_PPC64
macro_assembler.reset(new RegExpMacroAssemblerPPC(isolate, zone, mode,
output_register_count));
#elif V8_TARGET_ARCH_MIPS
macro_assembler.reset(new RegExpMacroAssemblerMIPS(isolate, zone, mode,
output_register_count));
#elif V8_TARGET_ARCH_MIPS64
macro_assembler.reset(new RegExpMacroAssemblerMIPS(isolate, zone, mode,
output_register_count));
#else
#error "Unsupported architecture"
#endif
} else {
DCHECK_EQ(data->compilation_target, RegExpCompilationTarget::kBytecode);
// Interpreted regexp implementation.
macro_assembler.reset(new RegExpBytecodeGenerator(isolate, zone));
}
macro_assembler->set_slow_safe(TooMuchRegExpCode(isolate, pattern));
if (FLAG_enable_experimental_regexp_engine_on_excessive_backtracks &&
ExperimentalRegExp::CanBeHandled(data->tree, flags,
data->capture_count)) {
if (backtrack_limit == JSRegExp::kNoBacktrackLimit) {
backtrack_limit = FLAG_regexp_backtracks_before_fallback;
} else {
backtrack_limit =
std::min(backtrack_limit, FLAG_regexp_backtracks_before_fallback);
}
macro_assembler->set_backtrack_limit(backtrack_limit);
macro_assembler->set_can_fallback(true);
} else {
macro_assembler->set_backtrack_limit(backtrack_limit);
macro_assembler->set_can_fallback(false);
}
// Inserted here, instead of in Assembler, because it depends on information
// in the AST that isn't replicated in the Node structure.
bool is_end_anchored = data->tree->IsAnchoredAtEnd();
bool is_start_anchored = data->tree->IsAnchoredAtStart();
int max_length = data->tree->max_match();
static const int kMaxBacksearchLimit = 1024;
if (is_end_anchored && !is_start_anchored && !IsSticky(flags) &&
max_length < kMaxBacksearchLimit) {
macro_assembler->SetCurrentPositionFromEnd(max_length);
}
if (IsGlobal(flags)) {
RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
if (data->tree->min_match() > 0) {
mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
} else if (IsUnicode(flags)) {
mode = RegExpMacroAssembler::GLOBAL_UNICODE;
}
macro_assembler->set_global_mode(mode);
}
RegExpMacroAssembler* macro_assembler_ptr = macro_assembler.get();
#ifdef DEBUG
std::unique_ptr<RegExpMacroAssembler> tracer_macro_assembler;
if (FLAG_trace_regexp_assembler) {
tracer_macro_assembler.reset(
new RegExpMacroAssemblerTracer(isolate, macro_assembler_ptr));
macro_assembler_ptr = tracer_macro_assembler.get();
}
#endif
RegExpCompiler::CompilationResult result = compiler.Assemble(
isolate, macro_assembler_ptr, data->node, data->capture_count, pattern);
// Code / bytecode printing.
{
#ifdef ENABLE_DISASSEMBLER
if (FLAG_print_regexp_code &&
data->compilation_target == RegExpCompilationTarget::kNative) {
CodeTracer::Scope trace_scope(isolate->GetCodeTracer());
OFStream os(trace_scope.file());
Handle<Code> c = Handle<Code>::cast(result.code);
auto pattern_cstring = pattern->ToCString();
c->Disassemble(pattern_cstring.get(), os, isolate);
}
#endif
if (FLAG_print_regexp_bytecode &&
data->compilation_target == RegExpCompilationTarget::kBytecode) {
Handle<ByteArray> bytecode = Handle<ByteArray>::cast(result.code);
auto pattern_cstring = pattern->ToCString();
RegExpBytecodeDisassemble(bytecode->GetDataStartAddress(),
bytecode->length(), pattern_cstring.get());
}
}
if (result.error != RegExpError::kNone) {
if (FLAG_correctness_fuzzer_suppressions &&
result.error == RegExpError::kStackOverflow) {
FATAL("Aborting on stack overflow");
}
data->error = result.error;
}
data->code = result.code;
data->register_count = result.num_registers;
return result.Succeeded();
}
RegExpGlobalCache::RegExpGlobalCache(Handle<JSRegExp> regexp,
Handle<String> subject, Isolate* isolate)
: register_array_(nullptr),
register_array_size_(0),
regexp_(regexp),
subject_(subject),
isolate_(isolate) {
DCHECK(IsGlobal(regexp->GetFlags()));
switch (regexp_->TypeTag()) {
case JSRegExp::NOT_COMPILED:
UNREACHABLE();
case JSRegExp::ATOM: {
// ATOM regexps do not have a global loop, so we search for one match at
// a time.
static const int kAtomRegistersPerMatch = 2;
registers_per_match_ = kAtomRegistersPerMatch;
register_array_size_ = registers_per_match_;
break;
}
case JSRegExp::IRREGEXP: {
registers_per_match_ =
RegExpImpl::IrregexpPrepare(isolate_, regexp_, subject_);
if (registers_per_match_ < 0) {
num_matches_ = -1; // Signal exception.
return;
}
if (regexp->ShouldProduceBytecode()) {
// Global loop in interpreted regexp is not implemented. We choose the
// size of the offsets vector so that it can only store one match.
register_array_size_ = registers_per_match_;
max_matches_ = 1;
} else {
register_array_size_ = Max(registers_per_match_,
Isolate::kJSRegexpStaticOffsetsVectorSize);
}
break;
}
case JSRegExp::EXPERIMENTAL: {
if (!ExperimentalRegExp::IsCompiled(regexp, isolate_) &&
!ExperimentalRegExp::Compile(isolate_, regexp)) {
DCHECK(isolate->has_pending_exception());
num_matches_ = -1; // Signal exception.
return;
}
registers_per_match_ =
JSRegExp::RegistersForCaptureCount(regexp->CaptureCount());
register_array_size_ =
Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
break;
}
}
max_matches_ = register_array_size_ / registers_per_match_;
if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
register_array_ = NewArray<int32_t>(register_array_size_);
} else {
register_array_ = isolate->jsregexp_static_offsets_vector();
}
// Set state so that fetching the results the first time triggers a call
// to the compiled regexp.
current_match_index_ = max_matches_ - 1;
num_matches_ = max_matches_;
DCHECK_LE(2, registers_per_match_); // Each match has at least one capture.
DCHECK_GE(register_array_size_, registers_per_match_);
int32_t* last_match =
&register_array_[current_match_index_ * registers_per_match_];
last_match[0] = -1;
last_match[1] = 0;
}
RegExpGlobalCache::~RegExpGlobalCache() {
// Deallocate the register array if we allocated it in the constructor
// (as opposed to using the existing jsregexp_static_offsets_vector).
if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
DeleteArray(register_array_);
}
}
int RegExpGlobalCache::AdvanceZeroLength(int last_index) {
if (IsUnicode(regexp_->GetFlags()) && last_index + 1 < subject_->length() &&
unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) &&
unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) {
// Advance over the surrogate pair.
return last_index + 2;
}
return last_index + 1;
}
int32_t* RegExpGlobalCache::FetchNext() {
current_match_index_++;
if (current_match_index_ >= num_matches_) {
// Current batch of results exhausted.
// Fail if last batch was not even fully filled.
if (num_matches_ < max_matches_) {
num_matches_ = 0; // Signal failed match.
return nullptr;
}
int32_t* last_match =
&register_array_[(current_match_index_ - 1) * registers_per_match_];
int last_end_index = last_match[1];
switch (regexp_->TypeTag()) {
case JSRegExp::NOT_COMPILED:
UNREACHABLE();
case JSRegExp::ATOM:
num_matches_ =
RegExpImpl::AtomExecRaw(isolate_, regexp_, subject_, last_end_index,
register_array_, register_array_size_);
break;
case JSRegExp::EXPERIMENTAL: {
DCHECK(ExperimentalRegExp::IsCompiled(regexp_, isolate_));
DisallowHeapAllocation no_gc;
num_matches_ = ExperimentalRegExp::ExecRaw(
isolate_, RegExp::kFromRuntime, *regexp_, *subject_,
register_array_, register_array_size_, last_end_index);
break;
}
case JSRegExp::IRREGEXP: {
int last_start_index = last_match[0];
if (last_start_index == last_end_index) {
// Zero-length match. Advance by one code point.
last_end_index = AdvanceZeroLength(last_end_index);
}
if (last_end_index > subject_->length()) {
num_matches_ = 0; // Signal failed match.
return nullptr;
}
num_matches_ = RegExpImpl::IrregexpExecRaw(
isolate_, regexp_, subject_, last_end_index, register_array_,
register_array_size_);
break;
}
}
// Fall back to experimental engine if needed and possible.
if (num_matches_ == RegExp::kInternalRegExpFallbackToExperimental) {
num_matches_ = ExperimentalRegExp::OneshotExecRaw(
isolate_, regexp_, subject_, register_array_, register_array_size_,
last_end_index);
}
if (num_matches_ <= 0) {
return nullptr;
}
current_match_index_ = 0;
return register_array_;
} else {
return &register_array_[current_match_index_ * registers_per_match_];
}
}
int32_t* RegExpGlobalCache::LastSuccessfulMatch() {
int index = current_match_index_ * registers_per_match_;
if (num_matches_ == 0) {
// After a failed match we shift back by one result.
index -= registers_per_match_;
}
return &register_array_[index];
}
Object RegExpResultsCache::Lookup(Heap* heap, String key_string,
Object key_pattern,
FixedArray* last_match_cache,
ResultsCacheType type) {
FixedArray cache;
if (!key_string.IsInternalizedString()) return Smi::zero();
if (type == STRING_SPLIT_SUBSTRINGS) {
DCHECK(key_pattern.IsString());
if (!key_pattern.IsInternalizedString()) return Smi::zero();
cache = heap->string_split_cache();
} else {
DCHECK(type == REGEXP_MULTIPLE_INDICES);
DCHECK(key_pattern.IsFixedArray());
cache = heap->regexp_multiple_cache();
}
uint32_t hash = key_string.Hash();
uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
~(kArrayEntriesPerCacheEntry - 1));
if (cache.get(index + kStringOffset) != key_string ||
cache.get(index + kPatternOffset) != key_pattern) {
index =
((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
if (cache.get(index + kStringOffset) != key_string ||
cache.get(index + kPatternOffset) != key_pattern) {
return Smi::zero();
}
}
*last_match_cache = FixedArray::cast(cache.get(index + kLastMatchOffset));
return cache.get(index + kArrayOffset);
}
void RegExpResultsCache::Enter(Isolate* isolate, Handle<String> key_string,
Handle<Object> key_pattern,
Handle<FixedArray> value_array,
Handle<FixedArray> last_match_cache,
ResultsCacheType type) {
Factory* factory = isolate->factory();
Handle<FixedArray> cache;
if (!key_string->IsInternalizedString()) return;
if (type == STRING_SPLIT_SUBSTRINGS) {
DCHECK(key_pattern->IsString());
if (!key_pattern->IsInternalizedString()) return;
cache = factory->string_split_cache();
} else {
DCHECK(type == REGEXP_MULTIPLE_INDICES);
DCHECK(key_pattern->IsFixedArray());
cache = factory->regexp_multiple_cache();
}
uint32_t hash = key_string->Hash();
uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
~(kArrayEntriesPerCacheEntry - 1));
if (cache->get(index + kStringOffset) == Smi::zero()) {
cache->set(index + kStringOffset, *key_string);
cache->set(index + kPatternOffset, *key_pattern);
cache->set(index + kArrayOffset, *value_array);
cache->set(index + kLastMatchOffset, *last_match_cache);
} else {
uint32_t index2 =
((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
if (cache->get(index2 + kStringOffset) == Smi::zero()) {
cache->set(index2 + kStringOffset, *key_string);
cache->set(index2 + kPatternOffset, *key_pattern);
cache->set(index2 + kArrayOffset, *value_array);
cache->set(index2 + kLastMatchOffset, *last_match_cache);
} else {
cache->set(index2 + kStringOffset, Smi::zero());
cache->set(index2 + kPatternOffset, Smi::zero());
cache->set(index2 + kArrayOffset, Smi::zero());
cache->set(index2 + kLastMatchOffset, Smi::zero());
cache->set(index + kStringOffset, *key_string);
cache->set(index + kPatternOffset, *key_pattern);
cache->set(index + kArrayOffset, *value_array);
cache->set(index + kLastMatchOffset, *last_match_cache);
}
}
// If the array is a reasonably short list of substrings, convert it into a
// list of internalized strings.
if (type == STRING_SPLIT_SUBSTRINGS && value_array->length() < 100) {
for (int i = 0; i < value_array->length(); i++) {
Handle<String> str(String::cast(value_array->get(i)), isolate);
Handle<String> internalized_str = factory->InternalizeString(str);
value_array->set(i, *internalized_str);
}
}
// Convert backing store to a copy-on-write array.
value_array->set_map_no_write_barrier(
ReadOnlyRoots(isolate).fixed_cow_array_map());
}
void RegExpResultsCache::Clear(FixedArray cache) {
for (int i = 0; i < kRegExpResultsCacheSize; i++) {
cache.set(i, Smi::zero());
}
}
} // namespace internal
} // namespace v8