/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef LIBTEXTCLASSIFIER_UTILS_BERT_TOKENIZER_H_
#define LIBTEXTCLASSIFIER_UTILS_BERT_TOKENIZER_H_

#include <fstream>
#include <string>
#include <vector>

#include "annotator/types.h"
#include "utils/wordpiece_tokenizer.h"

#include "absl/container/flat_hash_map.h"
#include "tensorflow_lite_support/cc/text/tokenizers/tokenizer.h"
#include "tensorflow_lite_support/cc/utils/common_utils.h"

namespace libtextclassifier3 {

using ::tflite::support::text::tokenizer::TokenizerResult;
using ::tflite::support::utils::LoadVocabFromBuffer;
using ::tflite::support::utils::LoadVocabFromFile;

constexpr int kDefaultMaxBytesPerToken = 100;
constexpr int kDefaultMaxCharsPerSubToken = 100;
constexpr char kDefaultSuffixIndicator[] = "##";
constexpr bool kDefaultUseUnknownToken = true;
constexpr char kDefaultUnknownToken[] = "[UNK]";
constexpr bool kDefaultSplitUnknownChars = false;

// Result of wordpiece tokenization including subwords and offsets.
// Example:
//   input:           tokenize me please
//   subwords:        token ##ize me plea ##se
//   wp_begin_offset: [0, 5, 9, 12, 16]
//   wp_end_offset:   [5, 8, 11, 16, 18]
//   row_lengths:     [2, 1, 1]
struct WordpieceTokenizerResult
    : tflite::support::text::tokenizer::TokenizerResult {
  std::vector<int> wp_begin_offset;
  std::vector<int> wp_end_offset;
  std::vector<int> row_lengths;
};
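
// Illustrative sketch (not part of this header): for pure-ASCII input, where
// byte and codepoint offsets coincide, each subword's source span can be
// recovered from the offsets above, e.g.
//   absl::string_view span(
//       input.data() + result.wp_begin_offset[i],
//       result.wp_end_offset[i] - result.wp_begin_offset[i]);
// For "##ize" in the example, this yields "ize" (offsets 5..8).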

// Options to create a BertTokenizer.
struct BertTokenizerOptions {
  int max_bytes_per_token = kDefaultMaxBytesPerToken;
  int max_chars_per_subtoken = kDefaultMaxCharsPerSubToken;
  std::string suffix_indicator = kDefaultSuffixIndicator;
  bool use_unknown_token = kDefaultUseUnknownToken;
  std::string unknown_token = kDefaultUnknownToken;
  bool split_unknown_chars = kDefaultSplitUnknownChars;
};
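
// Illustrative sketch (the vocab path is a stand-in, not shipped with this
// library): overriding a default before constructing a tokenizer.
//   BertTokenizerOptions options;
//   options.use_unknown_token = false;
//   BertTokenizer tokenizer("/path/to/vocab.txt", options);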

// A flat-hash-map based implementation of WordpieceVocab, used by
// BertTokenizer to invoke tensorflow::text::WordpieceTokenize.
class FlatHashMapBackedWordpiece : public WordpieceVocab {
 public:
  explicit FlatHashMapBackedWordpiece(const std::vector<std::string>& vocab);

  LookupStatus Contains(absl::string_view key, bool* value) const override;
  bool LookupId(absl::string_view key, int* result) const;
  bool LookupWord(int vocab_id, absl::string_view* result) const;
  int VocabularySize() const { return vocab_.size(); }

 private:
  // All words, indexed by their position in the vocabulary file.
  std::vector<std::string> vocab_;
  absl::flat_hash_map<absl::string_view, int> index_map_;
};
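
// Illustrative sketch (hypothetical three-word vocab): ids are positions in
// the vocabulary, so LookupId and LookupWord invert each other.
//   FlatHashMapBackedWordpiece vocab({"[UNK]", "token", "##ize"});
//   int id;
//   vocab.LookupId("##ize", &id);  // id == 2
//   absl::string_view word;
//   vocab.LookupWord(2, &word);    // word == "##ize"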

// Wordpiece tokenizer for BERT models. Initialized with a vocab file or
// vector.
//
// The full tokenization involves two steps: splitting the input into tokens
// (pretokenization) and splitting the tokens into subwords.
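//
// Example usage (illustrative; the vocab path is a stand-in):
//   BertTokenizer tokenizer("/path/to/vocab.txt");
//   WordpieceTokenizerResult result =
//       tokenizer.TokenizeIntoWordpieces("tokenize me please");
//   // result.subwords: {"token", "##ize", "me", "plea", "##se"}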
class BertTokenizer : public tflite::support::text::tokenizer::Tokenizer {
 public:
  // Initializes the tokenizer from a vocab vector and tokenizer configs.
  explicit BertTokenizer(const std::vector<std::string>& vocab,
                         const BertTokenizerOptions& options = {})
      : vocab_{FlatHashMapBackedWordpiece(vocab)}, options_{options} {}

  // Initializes the tokenizer from a file path to the vocab and tokenizer
  // configs.
  explicit BertTokenizer(const std::string& path_to_vocab,
                         const BertTokenizerOptions& options = {})
      : BertTokenizer(LoadVocabFromFile(path_to_vocab), options) {}

  // Initializes the tokenizer from a vocab buffer and its size, plus tokenizer
  // configs.
  BertTokenizer(const char* vocab_buffer_data, size_t vocab_buffer_size,
                const BertTokenizerOptions& options = {})
      : BertTokenizer(LoadVocabFromBuffer(vocab_buffer_data, vocab_buffer_size),
                      options) {}

  // Performs tokenization: first pretokenizes the input, then finds the
  // subwords. Returns tokenized results containing the subwords.
  TokenizerResult Tokenize(const std::string& input) override;

  // Performs tokenization: first pretokenizes the input, then finds the
  // subwords. Returns tokenized results containing the subwords and codepoint
  // indices.
  WordpieceTokenizerResult TokenizeIntoWordpieces(const std::string& input);

  // Performs tokenization on a single token. Returns tokenized results
  // containing the subwords and codepoint indices.
  WordpieceTokenizerResult TokenizeSingleToken(const std::string& token);

  // Performs tokenization on a list of pretokenized tokens. Returns tokenized
  // results containing the subwords and codepoint indices.
  WordpieceTokenizerResult TokenizeIntoWordpieces(
      const std::vector<Token>& tokens);
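
  // Illustrative sketch (assumes Token's (value, start, end) constructor from
  // annotator/types.h):
  //   std::vector<Token> tokens = {Token("tokenize", 0, 8),
  //                                Token("me", 9, 11)};
  //   WordpieceTokenizerResult result =
  //       tokenizer.TokenizeIntoWordpieces(tokens);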

  // Check if a certain key is included in the vocab.
  LookupStatus Contains(const absl::string_view key, bool* value) const {
    return vocab_.Contains(key, value);
  }

  // Find the id of a wordpiece.
  bool LookupId(absl::string_view key, int* result) const override {
    return vocab_.LookupId(key, result);
  }

  // Find the wordpiece from an id.
  bool LookupWord(int vocab_id, absl::string_view* result) const override {
    return vocab_.LookupWord(vocab_id, result);
  }

  int VocabularySize() const { return vocab_.VocabularySize(); }
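
  // Pretokenizes the input, e.g. splitting on whitespace and punctuation (a
  // brief description; see the implementation for the exact rules).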
  static std::vector<std::string> PreTokenize(const absl::string_view input);

 private:
  FlatHashMapBackedWordpiece vocab_;
  BertTokenizerOptions options_;
};

}  // namespace libtextclassifier3

#endif  // LIBTEXTCLASSIFIER_UTILS_BERT_TOKENIZER_H_