blob: a7cca83902a4d874ff3296f6ce5ee1ccd5d14c27 [file]
/*
* Copyright (C) 2022 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "RegEx.h"
#include <android-base/logging.h>
#include <pcre2.h>
// PCRE2 is used for regex implementation because it provides a good balance between
// search efficiency and binary size. In a local experiment searching process maps:
// std::regex: 5986 ms
// re2: 40 ms
// pcre2: 211 ms
// pcre2 (with preallocated match data): 96 ms
//
// Binary size comparison (simpleperf):
// with std::regex: 3,191,856 bytes
// with re2: 3,640,936 bytes
// with pcre2 (static): 3,452,896 bytes
// with pcre2 (shared): 3,140,656 bytes
//
// PCRE2 with preallocated match data is significantly faster than std::regex while
// keeping a smaller binary size than re2.
namespace simpleperf {
// Out-of-line definition of the RegExMatch destructor; the base class has
// nothing of its own to release here.
RegExMatch::~RegExMatch() = default;
class RegExMatchImpl : public RegExMatch {
public:
RegExMatchImpl(std::string_view s, pcre2_code* re) : s_(s), re_(re) {
match_data_ = pcre2_match_data_create_from_pattern(re_, nullptr);
current_offset_ = 0;
MoveToNextMatch();
}
~RegExMatchImpl() override { pcre2_match_data_free(match_data_); }
bool IsValid() const override { return is_valid_; }
std::string GetField(size_t index) const override {
if (!is_valid_) return "";
uint32_t count = pcre2_get_ovector_count(match_data_);
if (index >= count) return "";
PCRE2_SIZE* ovector = pcre2_get_ovector_pointer(match_data_);
if (ovector[2 * index] == PCRE2_UNSET) return "";
return std::string(s_.substr(ovector[2 * index], ovector[2 * index + 1] - ovector[2 * index]));
}
void MoveToNextMatch() override {
int rc = pcre2_match(re_, reinterpret_cast<PCRE2_SPTR>(s_.data()), s_.size(), current_offset_,
0, match_data_, nullptr);
if (rc >= 0) {
is_valid_ = true;
PCRE2_SIZE* ovector = pcre2_get_ovector_pointer(match_data_);
PCRE2_SIZE match_end = ovector[1];
if (match_end == current_offset_) {
current_offset_++;
} else {
current_offset_ = match_end;
}
} else {
is_valid_ = false;
}
}
private:
std::string_view s_;
pcre2_code* re_;
pcre2_match_data* match_data_;
PCRE2_SIZE current_offset_;
bool is_valid_ = false;
};
class RegExImpl : public RegEx {
public:
RegExImpl(std::string_view pattern, pcre2_code* re) : RegEx(pattern), re_(re) {
match_data_ = pcre2_match_data_create_from_pattern(re_, nullptr);
}
~RegExImpl() override {
pcre2_match_data_free(match_data_);
pcre2_code_free(re_);
}
bool Match(std::string_view s) const override {
pcre2_match_data* match_data = pcre2_match_data_create_from_pattern(re_, nullptr);
int rc = pcre2_match(re_, reinterpret_cast<PCRE2_SPTR>(s.data()), s.size(), 0,
PCRE2_ANCHORED | PCRE2_ENDANCHORED, match_data, nullptr);
pcre2_match_data_free(match_data);
return rc >= 0;
}
bool Search(std::string_view s) const override {
pcre2_match_data* match_data = pcre2_match_data_create_from_pattern(re_, nullptr);
int rc = pcre2_match(re_, reinterpret_cast<PCRE2_SPTR>(s.data()), s.size(), 0, 0, match_data,
nullptr);
pcre2_match_data_free(match_data);
return rc >= 0;
}
bool ThreadUnsafeMatch(std::string_view s) const override {
int rc = pcre2_match(re_, reinterpret_cast<PCRE2_SPTR>(s.data()), s.size(), 0,
PCRE2_ANCHORED | PCRE2_ENDANCHORED, match_data_, nullptr);
return rc >= 0;
}
bool ThreadUnsafeSearch(std::string_view s) const override {
int rc = pcre2_match(re_, reinterpret_cast<PCRE2_SPTR>(s.data()), s.size(), 0, 0, match_data_,
nullptr);
return rc >= 0;
}
std::unique_ptr<RegExMatch> SearchAll(std::string_view s) const override {
return std::make_unique<RegExMatchImpl>(s, re_);
}
std::optional<std::string> Replace(const std::string& s,
const std::string& format) const override {
PCRE2_SIZE outlen = s.size() * 2 + 128;
std::string result(outlen, '\0');
int rc = pcre2_substitute(re_, reinterpret_cast<PCRE2_SPTR>(s.data()), s.size(), 0,
PCRE2_SUBSTITUTE_GLOBAL | PCRE2_SUBSTITUTE_EXTENDED, nullptr, nullptr,
reinterpret_cast<PCRE2_SPTR>(format.data()), format.size(),
reinterpret_cast<PCRE2_UCHAR*>(result.data()), &outlen);
if (rc == PCRE2_ERROR_NOMEMORY) {
result.resize(outlen);
rc = pcre2_substitute(re_, reinterpret_cast<PCRE2_SPTR>(s.data()), s.size(), 0,
PCRE2_SUBSTITUTE_GLOBAL | PCRE2_SUBSTITUTE_EXTENDED, nullptr, nullptr,
reinterpret_cast<PCRE2_SPTR>(format.data()), format.size(),
reinterpret_cast<PCRE2_UCHAR*>(result.data()), &outlen);
}
if (rc >= 0) {
result.resize(outlen);
return result;
}
return std::nullopt;
}
private:
pcre2_code* re_;
pcre2_match_data* match_data_;
};
std::unique_ptr<RegEx> RegEx::Create(std::string_view pattern) {
int errornumber;
PCRE2_SIZE erroroffset;
pcre2_code* re = pcre2_compile(reinterpret_cast<PCRE2_SPTR>(pattern.data()), pattern.size(), 0,
&errornumber, &erroroffset, nullptr);
if (re == nullptr) {
PCRE2_UCHAR buffer[256];
pcre2_get_error_message(errornumber, buffer, sizeof(buffer));
LOG(ERROR) << "regex error: " << buffer << " at offset " << erroroffset << ", pattern "
<< pattern;
return nullptr;
}
return std::make_unique<RegExImpl>(pattern, re);
}
bool SearchInRegs(std::string_view s, const std::vector<std::unique_ptr<RegEx>>& regs) {
for (auto& reg : regs) {
if (reg->Search(s)) {
return true;
}
}
return false;
}
} // namespace simpleperf