| /* |
| * Copyright (C) 2018 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #include "utils/sentencepiece/sorted_strings_table.h" |
| |
| #include <algorithm> |
| #include "utils/base/logging.h" |
| |
| namespace libtextclassifier3 { |
| |
| void SortedStringsTable::GatherPrefixMatches( |
| StringPiece input, const std::function<void(TrieMatch)>& update_fn) const { |
| int left = 0; |
| int right = num_pieces_; |
| int span_size = right - left; |
| int match_length = 0; |
| |
| // Loop invariant: |
| // at the ith iteration, all strings from `left` ... `right` match the input |
| // on the first `match_length` characters. |
| while (span_size > use_linear_scan_threshold_) { |
| if (match_length >= input.length()) { |
| return; |
| } |
| |
| // We find the possible range of pieces in `left` ... `right` matching the |
| // `match_length` + 1 character with two binary searches: |
| // `lower_bound` to find the start of the range of matching pieces. |
| // `upper_bound` to find the non-inclusive end of the range. |
| left = (std::lower_bound( |
| offsets_ + left, offsets_ + right, input[match_length], |
| [this, match_length](int piece_offset, int c) -> bool { |
| return pieces_[piece_offset + match_length] < c; |
| }) - |
| offsets_); |
| right = (std::upper_bound( |
| offsets_ + left, offsets_ + right, input[match_length], |
| [this, match_length](int c, int piece_offset) -> bool { |
| return c < pieces_[piece_offset + match_length]; |
| }) - |
| offsets_); |
| span_size = right - left; |
| if (span_size <= 0) { |
| return; |
| } |
| ++match_length; |
| |
| // Due to the loop invariant and the fact that the strings are sorted, there |
| // can only be one piece matching completely now, namely at left. |
| if (pieces_[offsets_[left] + match_length] == 0) { |
| update_fn(TrieMatch(/*id=*/left, |
| /*match_length=*/match_length)); |
| left++; |
| } |
| } |
| |
| // Use linear scan for small problem instances. |
| // By the loop invariant characters 0...`match_length` of all pieces in |
| // in `left`...`right` match the input on 0...`match_length`. |
| for (int i = left; i < right; i++) { |
| bool matches = true; |
| int piece_match_length = match_length; |
| for (int k = offsets_[i] + piece_match_length; pieces_[k] != 0; k++) { |
| if (match_length >= input.size() || |
| input[piece_match_length] != pieces_[k]) { |
| matches = false; |
| break; |
| } |
| piece_match_length++; |
| } |
| if (matches) { |
| update_fn(TrieMatch(/*id=*/i, |
| /*match_length=*/piece_match_length)); |
| } |
| } |
| } |
| |
| std::vector<TrieMatch> SortedStringsTable::FindAllPrefixMatches( |
| StringPiece input) const { |
| std::vector<TrieMatch> result; |
| GatherPrefixMatches( |
| input, [&result](const TrieMatch match) { result.push_back(match); }); |
| return result; |
| } |
| |
| TrieMatch SortedStringsTable::LongestPrefixMatch(StringPiece input) const { |
| TrieMatch longest_match; |
| GatherPrefixMatches(input, [&longest_match](const TrieMatch match) { |
| longest_match = match; |
| }); |
| return longest_match; |
| } |
| |
| } // namespace libtextclassifier3 |