utils/sentencepiece/normalizer.h - platform/external/libtextclassifier - Git at Google

 /*
  * Copyright (C) 2018 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #ifndef LIBTEXTCLASSIFIER_UTILS_SENTENCEPIECE_NORMALIZER_H_
 #define LIBTEXTCLASSIFIER_UTILS_SENTENCEPIECE_NORMALIZER_H_

 #include <memory>
 #include <string>
 #include <utility>

 #include "utils/sentencepiece/double_array_trie.h"
 #include "utils/strings/stringpiece.h"

 namespace libtextclassifier3 {

 // SentencePieceNormalizer rewrites text according to caller-supplied
 // string-to-string replacement rules, which are applied using leftmost
 // longest matching.
 class SentencePieceNormalizer {
  public:
   // Builds a normalizer from a rule set and three behavior flags.
   //
   // Rules: `charsmap_trie` and `charsmap_normalized` together encode the
   // replacements. A string that matches in the trie yields an offset into
   // `charsmap_normalized`; the replacement string is located at that offset.
   // The trie argument is copied into this object, whereas for
   // `charsmap_normalized` only the StringPiece itself is stored.
   // NOTE(review): StringPiece is presumably a non-owning view, in which case
   // the underlying buffer must outlive this object -- confirm.
   //
   // Flags (all default to true):
   //   add_dummy_prefix: add dummy whitespace at the beginning of the text so
   //     that "world" is treated the same in "world" and in "hello world".
   //   remove_extra_whitespaces: remove leading, trailing and duplicate
   //     internal whitespace.
   //   escape_whitespaces: replace whitespace with a meta symbol.
   SentencePieceNormalizer(
       const DoubleArrayTrie& charsmap_trie, StringPiece charsmap_normalized,
       bool add_dummy_prefix = true, bool remove_extra_whitespaces = true,
       bool escape_whitespaces = true)
       : charsmap_trie_(charsmap_trie),
         charsmap_normalized_(charsmap_normalized),
         add_dummy_prefix_(add_dummy_prefix),
         remove_extra_whitespaces_(remove_extra_whitespaces),
         escape_whitespaces_(escape_whitespaces) {}

   // Converts the plain utf8 string `input` into the internal representation
   // used by the Sentencepiece model, writing it to `*normalized_input`.
   // NOTE(review): the bool result presumably signals success -- confirm
   // against the implementation.
   bool Normalize(StringPiece input, std::string* normalized_input) const;

  private:
   // Normalizes a prefix of `input`. On return, `prefix->first` is the
   // normalized prefix and `prefix->second` is the length of the prefix of
   // `input` that was processed by the normalization.
   bool NormalizePrefix(
       StringPiece input, std::pair<StringPiece, int>* prefix) const;

   // Trie used for efficient longest-prefix string matching. Its values are
   // offsets into `charsmap_normalized_`.
   DoubleArrayTrie charsmap_trie_;

   // All normalized (replacement) strings, concatenated and delimited by "\0".
   StringPiece charsmap_normalized_;

   // Behavior flags, fixed at construction; see the constructor comment.
   const bool add_dummy_prefix_;
   const bool remove_extra_whitespaces_;
   const bool escape_whitespaces_;
 };

 }  // namespace libtextclassifier3

 #endif  // LIBTEXTCLASSIFIER_UTILS_SENTENCEPIECE_NORMALIZER_H_
	/*
	* Copyright (C) 2018 The Android Open Source Project
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	#ifndef LIBTEXTCLASSIFIER_UTILS_SENTENCEPIECE_NORMALIZER_H_
	#define LIBTEXTCLASSIFIER_UTILS_SENTENCEPIECE_NORMALIZER_H_

	#include <memory>
	#include <string>

	#include "utils/sentencepiece/double_array_trie.h"
	#include "utils/strings/stringpiece.h"

	namespace libtextclassifier3 {

	// SentencePieceNormalizer rewrites text according to caller-supplied
	// string-to-string replacement rules, which are applied using leftmost
	// longest matching.
	class SentencePieceNormalizer {
	 public:
	  // Builds a normalizer from a rule set and three behavior flags.
	  //
	  // Rules: `charsmap_trie` and `charsmap_normalized` together encode the
	  // replacements. A string that matches in the trie yields an offset into
	  // `charsmap_normalized`; the replacement string is located at that offset.
	  // The trie argument is copied into this object, whereas for
	  // `charsmap_normalized` only the StringPiece itself is stored.
	  // NOTE(review): StringPiece is presumably a non-owning view, in which case
	  // the underlying buffer must outlive this object -- confirm.
	  //
	  // Flags (all default to true):
	  //   add_dummy_prefix: add dummy whitespace at the beginning of the text so
	  //     that "world" is treated the same in "world" and in "hello world".
	  //   remove_extra_whitespaces: remove leading, trailing and duplicate
	  //     internal whitespace.
	  //   escape_whitespaces: replace whitespace with a meta symbol.
	  SentencePieceNormalizer(
	      const DoubleArrayTrie& charsmap_trie, StringPiece charsmap_normalized,
	      bool add_dummy_prefix = true, bool remove_extra_whitespaces = true,
	      bool escape_whitespaces = true)
	      : charsmap_trie_(charsmap_trie),
	        charsmap_normalized_(charsmap_normalized),
	        add_dummy_prefix_(add_dummy_prefix),
	        remove_extra_whitespaces_(remove_extra_whitespaces),
	        escape_whitespaces_(escape_whitespaces) {}

	  // Converts the plain utf8 string `input` into the internal representation
	  // used by the Sentencepiece model, writing it to `*normalized_input`.
	  // NOTE(review): the bool result presumably signals success -- confirm
	  // against the implementation.
	  bool Normalize(StringPiece input, std::string* normalized_input) const;

	 private:
	  // Normalizes a prefix of `input`. On return, `prefix->first` is the
	  // normalized prefix and `prefix->second` is the length of the prefix of
	  // `input` that was processed by the normalization.
	  bool NormalizePrefix(
	      StringPiece input, std::pair<StringPiece, int>* prefix) const;

	  // Trie used for efficient longest-prefix string matching. Its values are
	  // offsets into `charsmap_normalized_`.
	  DoubleArrayTrie charsmap_trie_;

	  // All normalized (replacement) strings, concatenated and delimited by "\0".
	  StringPiece charsmap_normalized_;

	  // Behavior flags, fixed at construction; see the constructor comment.
	  const bool add_dummy_prefix_;
	  const bool remove_extra_whitespaces_;
	  const bool escape_whitespaces_;
	};

	} // namespace libtextclassifier3

	#endif // LIBTEXTCLASSIFIER_UTILS_SENTENCEPIECE_NORMALIZER_H_