// blob: 1d3aeb5276e5a5c5ca56de2e91afc7a10160b73e [file] [log] [blame]
/*
* Copyright (C) 2018 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef LIBTEXTCLASSIFIER_UTILS_SENTENCEPIECE_NORMALIZER_H_
#define LIBTEXTCLASSIFIER_UTILS_SENTENCEPIECE_NORMALIZER_H_
#include <memory>
#include <string>
#include "utils/sentencepiece/double_array_trie.h"
#include "utils/strings/stringpiece.h"
namespace libtextclassifier3 {
// SentencePieceNormalizer implements a simple text normalizer with
// user-defined string-to-string rules and leftmost-longest matching.
class SentencePieceNormalizer {
 public:
  // Builds a normalizer from a set of string-to-string replacement rules.
  //
  // The rules are supplied in two parts:
  //   charsmap_trie        Trie over the strings to be replaced. A match in
  //                        the trie yields an offset into
  //                        `charsmap_normalized`.
  //   charsmap_normalized  Buffer holding the replacement strings, addressed
  //                        by the offsets returned from the trie.
  //
  // `charsmap_trie` is copied into this object. `charsmap_normalized` is kept
  // as a StringPiece.
  // NOTE(review): StringPiece is presumably a non-owning view, in which case
  // the underlying buffer must outlive this object -- confirm.
  //
  // Behaviour switches (each defaults to true):
  //   add_dummy_prefix          Add dummy whitespace at the beginning of the
  //                             text so that "world" in "world" and in
  //                             "hello world" is treated uniformly.
  //   remove_extra_whitespaces  Remove leading, trailing and duplicate
  //                             internal whitespace.
  //   escape_whitespaces        Replace whitespace with a meta symbol.
  SentencePieceNormalizer(const DoubleArrayTrie& charsmap_trie,
                          StringPiece charsmap_normalized,
                          bool add_dummy_prefix = true,
                          bool remove_extra_whitespaces = true,
                          bool escape_whitespaces = true)
      : charsmap_trie_(charsmap_trie),
        charsmap_normalized_(charsmap_normalized),
        add_dummy_prefix_(add_dummy_prefix),
        remove_extra_whitespaces_(remove_extra_whitespaces),
        escape_whitespaces_(escape_whitespaces) {}

  // Converts the plain UTF-8 text `input` into the internal representation
  // expected by the Sentencepiece model.
  // NOTE(review): the result is presumably written to `*normalized_input` and
  // the returned bool presumably signals success -- confirm against the
  // implementation.
  bool Normalize(StringPiece input, std::string* normalized_input) const;

 private:
  // Normalizes a prefix of `input`. The output pair `*prefix` carries the
  // normalized prefix together with the length of the prefix of `input` that
  // was processed by the normalization.
  // NOTE(review): the returned bool presumably signals success -- confirm
  // against the implementation.
  bool NormalizePrefix(StringPiece input,
                       std::pair<StringPiece, int>* prefix) const;

  // Trie used for efficient longest-prefix string matching.
  DoubleArrayTrie charsmap_trie_;

  // "\0"-delimited concatenation of the normalized (replacement) strings.
  // The values stored in `charsmap_trie_` are offsets into this string.
  StringPiece charsmap_normalized_;

  // Configuration captured at construction time; see the constructor comment
  // for the meaning of each switch.
  const bool add_dummy_prefix_;
  const bool remove_extra_whitespaces_;
  const bool escape_whitespaces_;
};
} // namespace libtextclassifier3
#endif // LIBTEXTCLASSIFIER_UTILS_SENTENCEPIECE_NORMALIZER_H_