blob: 8e7e9ba5e08de7eacc9e590497ec040ef26ffea3 [file] [log] [blame]
/*
* Copyright (C) 2018 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "utils/sentencepiece/sorted_strings_table.h"
#include <algorithm>
#include "utils/base/logging.h"
namespace libtextclassifier3 {
// Invokes `update_fn` once for every piece in the table that is a prefix of
// `input`. Each reported TrieMatch carries the index of the piece as its id
// and the length of the piece as its match length.
//
// Pieces are stored 0-terminated in `pieces_`, and `offsets_[i]` is the start
// of piece `i` inside `pieces_`. The search relies on the pieces being sorted:
// it narrows the candidate range [`left`, `right`) one input character at a
// time with binary searches, and falls back to a linear scan once the range is
// no larger than `use_linear_scan_threshold_`.
void SortedStringsTable::GatherPrefixMatches(
    StringPiece input, const std::function<void(TrieMatch)>& update_fn) const {
  int left = 0;
  int right = num_pieces_;
  int span_size = right - left;
  int match_length = 0;

  // Loop invariant:
  // at the ith iteration, all strings from `left` ... `right` match the input
  // on the first `match_length` characters.
  while (span_size > use_linear_scan_threshold_) {
    // The whole input has been consumed; any remaining piece is longer than
    // the input and therefore cannot be a prefix of it.
    if (match_length >= input.length()) {
      return;
    }

    // We find the possible range of pieces in `left` ... `right` matching the
    // `match_length` + 1 character with two binary searches:
    //     `lower_bound` to find the start of the range of matching pieces.
    //     `upper_bound` to find the non-inclusive end of the range.
    // Characters are compared as unsigned char on both sides so the ordering
    // does not depend on the signedness of plain `char`.
    left = (std::lower_bound(
                offsets_ + left, offsets_ + right,
                static_cast<unsigned char>(input[match_length]),
                [this, match_length](uint32 piece_offset, uint32 c) -> bool {
                  return static_cast<unsigned char>(
                             pieces_[piece_offset + match_length]) < c;
                }) -
            offsets_);
    right = (std::upper_bound(
                 offsets_ + left, offsets_ + right,
                 static_cast<unsigned char>(input[match_length]),
                 [this, match_length](uint32 c, uint32 piece_offset) -> bool {
                   return c < static_cast<unsigned char>(
                                  pieces_[piece_offset + match_length]);
                 }) -
             offsets_);
    span_size = right - left;
    if (span_size <= 0) {
      return;
    }
    ++match_length;

    // Due to the loop invariant and the fact that the strings are sorted, there
    // can only be one piece matching completely now, namely at left.
    if (pieces_[offsets_[left] + match_length] == 0) {
      update_fn(TrieMatch(/*id=*/left,
                          /*match_length=*/match_length));
      left++;
    }
  }

  // Use linear scan for small problem instances.
  // By the loop invariant, all pieces in `left`...`right` agree with the input
  // on their first `match_length` characters, so each comparison can start at
  // that position.
  for (int i = left; i < right; i++) {
    bool matches = true;
    int piece_match_length = match_length;
    for (int k = offsets_[i] + piece_match_length; pieces_[k] != 0; k++) {
      // The bounds check must use the per-piece cursor `piece_match_length`
      // (the index actually used to read `input`), not the fixed
      // `match_length`; otherwise a piece longer than the input would cause a
      // read past the end of `input`.
      if (piece_match_length >= input.size() ||
          input[piece_match_length] != pieces_[k]) {
        matches = false;
        break;
      }
      piece_match_length++;
    }
    if (matches) {
      update_fn(TrieMatch(/*id=*/i,
                          /*match_length=*/piece_match_length));
    }
  }
}
// Collects all pieces that are prefixes of `input`.
//
// Every match reported by GatherPrefixMatches is appended to `*matches`; the
// vector is not cleared first, so existing entries are kept. Always returns
// true.
bool SortedStringsTable::FindAllPrefixMatches(
    StringPiece input, std::vector<TrieMatch>* matches) const {
  const auto append_match = [matches](const TrieMatch found) {
    matches->push_back(found);
  };
  GatherPrefixMatches(input, append_match);
  return true;
}
// Stores in `*longest_match` the last match that GatherPrefixMatches reports
// for `input`.
//
// `*longest_match` is first reset to a default-constructed TrieMatch, which is
// what the caller observes when no piece matches. Each reported match then
// overwrites the previous one, so the final value is the last one reported.
// NOTE(review): this is the longest match only if matches are reported in
// order of increasing length - confirm against GatherPrefixMatches.
// Always returns true.
bool SortedStringsTable::LongestPrefixMatch(StringPiece input,
                                            TrieMatch* longest_match) const {
  *longest_match = TrieMatch();
  const auto keep_latest = [longest_match](const TrieMatch candidate) {
    *longest_match = candidate;
  };
  GatherPrefixMatches(input, keep_latest);
  return true;
}
} // namespace libtextclassifier3