| /* |
| * Copyright (C) 2018 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #ifndef LIBTEXTCLASSIFIER_UTILS_SENTENCEPIECE_NORMALIZER_H_ |
| #define LIBTEXTCLASSIFIER_UTILS_SENTENCEPIECE_NORMALIZER_H_ |
| |
| #include <memory> |
| #include <string> |
| |
| #include "utils/sentencepiece/double_array_trie.h" |
| #include "utils/strings/stringpiece.h" |
| |
| namespace libtextclassifier3 { |
| |
| // SentencePieceNormalizer is a simple text normalizer driven by user-defined |
| // string-to-string replacement rules, applied with leftmost-longest matching. |
| class SentencePieceNormalizer { |
|  public: |
|   // Builds a normalizer from a replacement-rule table and three switches. |
|   // |
|   // Rule table: |
|   //   charsmap_trie:       a match in this trie for a string yields an offset |
|   //                        into `charsmap_normalized`. |
|   //   charsmap_normalized: holds the replacement strings; the offset returned |
|   //                        by the trie locates the replacement to use. |
|   // |
|   // Switches (each defaults to true): |
|   //   add_dummy_prefix:         add dummy whitespace at the beginning of the |
|   //                             text, so that "world" is treated uniformly in |
|   //                             "world" and "hello world". |
|   //   remove_extra_whitespaces: remove leading, trailing and duplicate |
|   //                             internal whitespace. |
|   //   escape_whitespaces:       replace whitespace with a meta symbol. |
|   // |
|   // `charsmap_trie` is copy-constructed into a member, whereas |
|   // `charsmap_normalized` is kept as a StringPiece. |
|   // NOTE(review): presumably the buffer behind `charsmap_normalized` has to |
|   // outlive this object - confirm against StringPiece's ownership semantics. |
|   SentencePieceNormalizer(const DoubleArrayTrie& charsmap_trie, |
|                           StringPiece charsmap_normalized, |
|                           bool add_dummy_prefix = true, |
|                           bool remove_extra_whitespaces = true, |
|                           bool escape_whitespaces = true) |
|       : charsmap_trie_(charsmap_trie), |
|         charsmap_normalized_(charsmap_normalized), |
|         add_dummy_prefix_(add_dummy_prefix), |
|         remove_extra_whitespaces_(remove_extra_whitespaces), |
|         escape_whitespaces_(escape_whitespaces) {} |
| |
|   // Converts the plain UTF-8 string `input` into the internal representation |
|   // used by the Sentencepiece model; `normalized_input` is the output string. |
|   // NOTE(review): the meaning of the bool result is not visible in this |
|   // header - presumably true on success; confirm against the definition. |
|   bool Normalize(StringPiece input, std::string* normalized_input) const; |
| |
|  private: |
|   // Result slot of NormalizePrefix: the normalized prefix, paired with the |
|   // length of the prefix of the input that was processed to produce it. |
|   // NOTE(review): std::pair comes from <utility>, which this header does not |
|   // include directly. |
|   using NormalizedPrefix = std::pair<StringPiece, int>; |
| |
|   // Normalizes the prefix of `input`. Through `prefix` it hands back the |
|   // normalized prefix together with the length of the prefix of `input` that |
|   // was processed in the normalization. |
|   // NOTE(review): the meaning of the bool result is not visible here. |
|   bool NormalizePrefix(StringPiece input, NormalizedPrefix* prefix) const; |
| |
|   // Trie used internally for efficient longest-prefix string matching. |
|   DoubleArrayTrie charsmap_trie_; |
| |
|   // Concatenation of the normalized strings, delimited by "\0". The values |
|   // stored in `charsmap_trie_` are offsets into this string. |
|   StringPiece charsmap_normalized_; |
| |
|   // Behaviour switches fixed at construction; see the constructor comment. |
|   const bool add_dummy_prefix_; |
|   const bool remove_extra_whitespaces_; |
|   const bool escape_whitespaces_; |
| }; |
| |
| } // namespace libtextclassifier3 |
| |
| #endif // LIBTEXTCLASSIFIER_UTILS_SENTENCEPIECE_NORMALIZER_H_ |