blob: 666b7c7d985dbaaf61f5586f846826552f3eb931 [file] [log] [blame]
/*
* Copyright (C) 2018 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "annotator/pod_ner/pod-ner-impl.h"
#include <algorithm>
#include <cstdint>
#include <ctime>
#include <iostream>
#include <memory>
#include <ostream>
#include <unordered_set>
#include <vector>
#include "annotator/model_generated.h"
#include "annotator/pod_ner/utils.h"
#include "annotator/types.h"
#include "utils/base/logging.h"
#include "utils/bert_tokenizer.h"
#include "utils/tflite-model-executor.h"
#include "utils/tokenizer-utils.h"
#include "utils/utf8/unicodetext.h"
#include "absl/strings/ascii.h"
#include "tensorflow/lite/kernels/builtin_op_kernels.h"
#include "tensorflow/lite/mutable_op_resolver.h"
#include "tensorflow_lite_support/cc/text/tokenizers/tokenizer.h"
#include "tensorflow_models/seq_flow_lite/tflite_ops/layer_norm.h"
#include "tensorflow_models/seq_flow_lite/tflite_ops/quantization_util.h"
namespace libtextclassifier3 {
using PodNerModel_::CollectionT;
using PodNerModel_::LabelT;
using ::tflite::support::text::tokenizer::TokenizerResult;
namespace {
using PodNerModel_::Label_::BoiseType;
using PodNerModel_::Label_::BoiseType_BEGIN;
using PodNerModel_::Label_::BoiseType_END;
using PodNerModel_::Label_::BoiseType_INTERMEDIATE;
using PodNerModel_::Label_::BoiseType_O;
using PodNerModel_::Label_::BoiseType_SINGLE;
using PodNerModel_::Label_::MentionType;
using PodNerModel_::Label_::MentionType_NAM;
using PodNerModel_::Label_::MentionType_NOM;
using PodNerModel_::Label_::MentionType_UNDEFINED;
// Appends one label with the given BOISE/mention/collection attributes to
// |labels|.
void EmplaceToLabelVector(BoiseType boise_type, MentionType mention_type,
                          int collection_id, std::vector<LabelT> *labels) {
  LabelT label;
  label.boise_type = boise_type;
  label.mention_type = mention_type;
  label.collection_id = collection_id;
  labels->push_back(label);
}
void FillDefaultLabelsAndCollections(float default_priority,
std::vector<LabelT> *labels,
std::vector<CollectionT> *collections) {
std::vector<std::string> collection_names = {
"art", "consumer_good", "event", "location",
"organization", "ner_entity", "person", "undefined"};
collections->clear();
for (const std::string &collection_name : collection_names) {
collections->emplace_back();
collections->back().name = collection_name;
collections->back().single_token_priority_score = default_priority;
collections->back().multi_token_priority_score = default_priority;
}
labels->clear();
for (auto boise_type :
{BoiseType_BEGIN, BoiseType_END, BoiseType_INTERMEDIATE}) {
for (auto mention_type : {MentionType_NAM, MentionType_NOM}) {
for (int i = 0; i < collections->size() - 1; ++i) { // skip undefined
EmplaceToLabelVector(boise_type, mention_type, i, labels);
}
}
}
EmplaceToLabelVector(BoiseType_O, MentionType_UNDEFINED, 7, labels);
for (auto mention_type : {MentionType_NAM, MentionType_NOM}) {
for (int i = 0; i < collections->size() - 1; ++i) { // skip undefined
EmplaceToLabelVector(BoiseType_SINGLE, mention_type, i, labels);
}
}
}
std::unique_ptr<tflite::Interpreter> CreateInterpreter(
const PodNerModel *model) {
TC3_CHECK(model != nullptr);
if (model->tflite_model() == nullptr) {
TC3_LOG(ERROR) << "Unable to create tf.lite interpreter, model is null.";
return nullptr;
}
const tflite::Model *tflite_model =
tflite::GetModel(model->tflite_model()->Data());
if (tflite_model == nullptr) {
TC3_LOG(ERROR) << "Unable to create tf.lite interpreter, model is null.";
return nullptr;
}
std::unique_ptr<tflite::OpResolver> resolver =
BuildOpResolver([](tflite::MutableOpResolver *mutable_resolver) {
mutable_resolver->AddBuiltin(::tflite::BuiltinOperator_SHAPE,
::tflite::ops::builtin::Register_SHAPE());
mutable_resolver->AddBuiltin(::tflite::BuiltinOperator_RANGE,
::tflite::ops::builtin::Register_RANGE());
mutable_resolver->AddBuiltin(
::tflite::BuiltinOperator_ARG_MAX,
::tflite::ops::builtin::Register_ARG_MAX());
mutable_resolver->AddBuiltin(
::tflite::BuiltinOperator_EXPAND_DIMS,
::tflite::ops::builtin::Register_EXPAND_DIMS());
mutable_resolver->AddCustom(
"LayerNorm", ::seq_flow_lite::ops::custom::Register_LAYER_NORM());
});
std::unique_ptr<tflite::Interpreter> tflite_interpreter;
tflite::InterpreterBuilder(tflite_model, *resolver,
nullptr)(&tflite_interpreter);
if (tflite_interpreter == nullptr) {
TC3_LOG(ERROR) << "Unable to create tf.lite interpreter.";
return nullptr;
}
return tflite_interpreter;
}
bool FindSpecialWordpieceIds(const std::unique_ptr<BertTokenizer> &tokenizer,
int *cls_id, int *sep_id, int *period_id,
int *unknown_id) {
if (!tokenizer->LookupId("[CLS]", cls_id)) {
TC3_LOG(ERROR) << "Couldn't find [CLS] wordpiece.";
return false;
}
if (!tokenizer->LookupId("[SEP]", sep_id)) {
TC3_LOG(ERROR) << "Couldn't find [SEP] wordpiece.";
return false;
}
if (!tokenizer->LookupId(".", period_id)) {
TC3_LOG(ERROR) << "Couldn't find [.] wordpiece.";
return false;
}
if (!tokenizer->LookupId("[UNK]", unknown_id)) {
TC3_LOG(ERROR) << "Couldn't find [UNK] wordpiece.";
return false;
}
return true;
}
// WARNING: This tokenizer is not exactly the one the model was trained with
// so there might be nuances.
//
// Builds a BertTokenizer from the wordpiece vocabulary embedded in |model|.
// Returns nullptr (with an error log) if the vocabulary is missing.
std::unique_ptr<BertTokenizer> CreateTokenizer(const PodNerModel *model) {
  TC3_CHECK(model != nullptr);
  if (model->word_piece_vocab() == nullptr) {
    TC3_LOG(ERROR)
        << "Unable to create tokenizer, model or word_pieces is null.";
    return nullptr;
  }
  // The vocab blob is passed as raw bytes + size; prefer make_unique over a
  // bare new for exception safety.
  return std::make_unique<BertTokenizer>(
      reinterpret_cast<const char *>(model->word_piece_vocab()->Data()),
      model->word_piece_vocab()->size());
}
} // namespace
// Factory: builds a PodNerAnnotator from the flatbuffer |model|.
// Returns nullptr if the model, its wordpiece vocabulary, or any of the
// special wordpieces ([CLS], [SEP], ".", [UNK]) cannot be resolved.
// The annotator keeps a raw pointer to |model|, so |model| must outlive it.
std::unique_ptr<PodNerAnnotator> PodNerAnnotator::Create(
    const PodNerModel *model, const UniLib &unilib) {
  if (model == nullptr) {
    TC3_LOG(ERROR) << "Create received null model.";
    return nullptr;
  }
  std::unique_ptr<BertTokenizer> tokenizer = CreateTokenizer(model);
  if (tokenizer == nullptr) {
    return nullptr;
  }
  int cls_id, sep_id, period_id, unknown_wordpiece_id;
  if (!FindSpecialWordpieceIds(tokenizer, &cls_id, &sep_id, &period_id,
                               &unknown_wordpiece_id)) {
    return nullptr;
  }
  std::unique_ptr<PodNerAnnotator> annotator(new PodNerAnnotator(unilib));
  annotator->tokenizer_ = std::move(tokenizer);
  annotator->lowercase_input_ = model->lowercase_input();
  annotator->logits_index_in_output_tensor_ =
      model->logits_index_in_output_tensor();
  annotator->append_final_period_ = model->append_final_period();
  // Prefer labels/collections embedded in the model; fall back to the
  // default BOISE label layout when either list is absent or empty.
  if (model->labels() && model->labels()->size() > 0 && model->collections() &&
      model->collections()->size() > 0) {
    annotator->labels_.clear();
    for (const PodNerModel_::Label *label : *model->labels()) {
      annotator->labels_.emplace_back();
      annotator->labels_.back().boise_type = label->boise_type();
      annotator->labels_.back().mention_type = label->mention_type();
      annotator->labels_.back().collection_id = label->collection_id();
    }
    for (const PodNerModel_::Collection *collection : *model->collections()) {
      annotator->collections_.emplace_back();
      annotator->collections_.back().name = collection->name()->str();
      annotator->collections_.back().single_token_priority_score =
          collection->single_token_priority_score();
      annotator->collections_.back().multi_token_priority_score =
          collection->multi_token_priority_score();
    }
  } else {
    FillDefaultLabelsAndCollections(
        model->priority_score(), &annotator->labels_, &annotator->collections_);
  }
  // Budget for the surrounding CLS and SEP wordpieces, plus the optional
  // appended final period.
  int max_num_surrounding_wordpieces = model->append_final_period() ? 3 : 2;
  annotator->max_num_effective_wordpieces_ =
      model->max_num_wordpieces() - max_num_surrounding_wordpieces;
  annotator->sliding_window_num_wordpieces_overlap_ =
      model->sliding_window_num_wordpieces_overlap();
  annotator->max_ratio_unknown_wordpieces_ =
      model->max_ratio_unknown_wordpieces();
  annotator->min_number_of_tokens_ = model->min_number_of_tokens();
  annotator->min_number_of_wordpieces_ = model->min_number_of_wordpieces();
  annotator->cls_wordpiece_id_ = cls_id;
  annotator->sep_wordpiece_id_ = sep_id;
  annotator->period_wordpiece_id_ = period_id;
  annotator->unknown_wordpiece_id_ = unknown_wordpiece_id;
  annotator->model_ = model;
  return annotator;
}
// Reads the model's output tensor (shape [1, num_steps, num_labels]) and
// returns, per step, the label with the highest dequantized probability.
std::vector<LabelT> PodNerAnnotator::ReadResultsFromInterpreter(
    tflite::Interpreter &interpreter) const {
  TfLiteTensor *output =
      interpreter.tensor(interpreter.outputs()[logits_index_in_output_tensor_]);
  TC3_CHECK_EQ(output->dims->size, 3);
  TC3_CHECK_EQ(output->dims->data[0], 1);
  TC3_CHECK_EQ(output->dims->data[2], labels_.size());
  std::vector<LabelT> return_value(output->dims->data[1]);
  // |index| walks the flattened [step, label] output linearly.
  for (int step = 0, index = 0; step < output->dims->data[1]; ++step) {
    float max_prob = 0.0f;
    int max_index = 0;
    for (int cindex = 0; cindex < output->dims->data[2]; ++cindex) {
      const float probability =
          ::seq_flow_lite::PodDequantize(*output, index++);
      if (probability > max_prob) {
        max_prob = probability;
        max_index = cindex;
      }
    }
    // The per-step max probability itself is not needed, only the argmax
    // label (a previously-kept |probs| vector was dead code and was removed).
    return_value[step] = labels_[max_index];
  }
  return return_value;
}
// Runs one inference over a window of wordpieces/tokens and returns the
// per-token labels. Returns an empty vector on any input-size violation or
// interpreter failure. The input is surrounded by CLS/SEP (and optionally a
// final period) wordpieces before being fed to the model.
std::vector<LabelT> PodNerAnnotator::ExecuteModel(
    const VectorSpan<int> &wordpiece_indices,
    const VectorSpan<int32_t> &token_starts,
    const VectorSpan<Token> &tokens) const {
  // Check that there are not more input indices than supported.
  if (wordpiece_indices.size() > max_num_effective_wordpieces_) {
    TC3_LOG(ERROR) << "More than " << max_num_effective_wordpieces_
                   << " indices passed to POD NER model.";
    return {};
  }
  if (wordpiece_indices.size() <= 0 || token_starts.size() <= 0 ||
      tokens.size() <= 0) {
    TC3_LOG(ERROR) << "ExecuteModel received illegal input, #wordpiece_indices="
                   << wordpiece_indices.size()
                   << " #token_starts=" << token_starts.size()
                   << " #tokens=" << tokens.size();
    return {};
  }
  // For the CLS (at the beginning) and SEP (at the end) wordpieces.
  int num_additional_wordpieces = 2;
  bool should_append_final_period = false;
  // Optionally add a final period wordpiece if the final token is not
  // already punctuation. This can improve performance for models trained on
  // data mostly ending in sentence-final punctuation.
  const std::string &last_token = (tokens.end() - 1)->value;
  if (append_final_period_ &&
      (last_token.size() != 1 || !unilib_.IsPunctuation(last_token.at(0)))) {
    should_append_final_period = true;
    num_additional_wordpieces++;
  }
  // Interpreter needs to be created for each inference call separately,
  // otherwise the class is not thread-safe.
  std::unique_ptr<tflite::Interpreter> interpreter = CreateInterpreter(model_);
  if (interpreter == nullptr) {
    TC3_LOG(ERROR) << "Couldn't create Interpreter.";
    return {};
  }
  // Input 0: wordpiece ids (with surrounding specials); input 1: per-token
  // offsets into that sequence.
  TfLiteStatus status;
  status = interpreter->ResizeInputTensor(
      interpreter->inputs()[0],
      {1, wordpiece_indices.size() + num_additional_wordpieces});
  TC3_CHECK_EQ(status, kTfLiteOk);
  status = interpreter->ResizeInputTensor(interpreter->inputs()[1],
                                          {1, token_starts.size()});
  TC3_CHECK_EQ(status, kTfLiteOk);
  status = interpreter->AllocateTensors();
  TC3_CHECK_EQ(status, kTfLiteOk);
  TfLiteTensor *tensor = interpreter->tensor(interpreter->inputs()[0]);
  int wordpiece_tensor_index = 0;
  tensor->data.i32[wordpiece_tensor_index++] = cls_wordpiece_id_;
  for (int wordpiece_index : wordpiece_indices) {
    tensor->data.i32[wordpiece_tensor_index++] = wordpiece_index;
  }
  if (should_append_final_period) {
    tensor->data.i32[wordpiece_tensor_index++] = period_wordpiece_id_;
  }
  tensor->data.i32[wordpiece_tensor_index++] = sep_wordpiece_id_;
  tensor = interpreter->tensor(interpreter->inputs()[1]);
  for (int i = 0; i < token_starts.size(); ++i) {
    // Need to add one because of the starting CLS wordpiece and reduce the
    // offset from the first wordpiece.
    tensor->data.i32[i] = token_starts[i] + 1 - token_starts[0];
  }
  status = interpreter->Invoke();
  TC3_CHECK_EQ(status, kTfLiteOk);
  return ReadResultsFromInterpreter(*interpreter);
}
// Tokenizes |text_unicode| and wordpiece-tokenizes every token, filling
// |wordpiece_indices| (vocab ids for all wordpieces, concatenated),
// |token_starts| (index into |wordpiece_indices| of each token's first
// wordpiece) and |tokens|. Returns false if any wordpiece cannot be resolved
// or a token yields no wordpieces.
bool PodNerAnnotator::PrepareText(const UnicodeText &text_unicode,
                                  std::vector<int32_t> *wordpiece_indices,
                                  std::vector<int32_t> *token_starts,
                                  std::vector<Token> *tokens) const {
  *tokens = TokenizeOnWhiteSpacePunctuationAndChineseLetter(
      text_unicode.ToUTF8String());
  // Drop empty tokens (start == end) produced by the tokenizer.
  tokens->erase(std::remove_if(tokens->begin(), tokens->end(),
                               [](const Token &token) {
                                 return token.start == token.end;
                               }),
                tokens->end());
  for (const Token &token : *tokens) {
    // Optionally lowercase before wordpiece lookup, matching the casing the
    // model was trained with.
    const std::string token_text =
        lowercase_input_ ? unilib_
                               .ToLowerText(UTF8ToUnicodeText(
                                   token.value, /*do_copy=*/false))
                               .ToUTF8String()
                         : token.value;
    const TokenizerResult wordpiece_tokenization =
        tokenizer_->TokenizeSingleToken(token_text);
    std::vector<int> wordpiece_ids;
    for (const std::string &wordpiece : wordpiece_tokenization.subwords) {
      if (!tokenizer_->LookupId(wordpiece, &(wordpiece_ids.emplace_back()))) {
        TC3_LOG(ERROR) << "Couldn't find wordpiece " << wordpiece;
        return false;
      }
    }
    if (wordpiece_ids.empty()) {
      TC3_LOG(ERROR) << "wordpiece_ids.empty()";
      return false;
    }
    token_starts->push_back(wordpiece_indices->size());
    // Bulk-append instead of the previous element-wise loop (which also
    // needlessly widened each int id through int64).
    wordpiece_indices->insert(wordpiece_indices->end(), wordpiece_ids.begin(),
                              wordpiece_ids.end());
  }
  return true;
}
bool PodNerAnnotator::Annotate(const UnicodeText &context,
std::vector<AnnotatedSpan> *results) const {
return AnnotateAroundSpanOfInterest(context, {0, context.size_codepoints()},
results);
}
// Runs NER over sliding windows of |context| covering |span_of_interest| and
// converts the merged per-token labels into annotated spans in |results|.
// Returns false on tokenization/model/merge failure; returns true with no
// results when the input is too small or too unknown-heavy to annotate.
bool PodNerAnnotator::AnnotateAroundSpanOfInterest(
    const UnicodeText &context, const CodepointSpan &span_of_interest,
    std::vector<AnnotatedSpan> *results) const {
  TC3_CHECK(results != nullptr);
  std::vector<int32_t> wordpiece_indices;
  std::vector<int32_t> token_starts;
  std::vector<Token> tokens;
  if (!PrepareText(context, &wordpiece_indices, &token_starts, &tokens)) {
    TC3_LOG(ERROR) << "PodNerAnnotator PrepareText(...) failed.";
    return false;
  }
  const int unknown_wordpieces_count =
      std::count(wordpiece_indices.begin(), wordpiece_indices.end(),
                 unknown_wordpiece_id_);
  // Skip (successfully, with no results) inputs that are too short or whose
  // unknown-wordpiece ratio exceeds the model's configured threshold.
  if (tokens.empty() || tokens.size() < min_number_of_tokens_ ||
      wordpiece_indices.size() < min_number_of_wordpieces_ ||
      (static_cast<float>(unknown_wordpieces_count) /
       wordpiece_indices.size()) > max_ratio_unknown_wordpieces_) {
    return true;
  }
  std::vector<LabelT> labels;
  int first_token_index_entire_window = 0;
  // Windows overlap by sliding_window_num_wordpieces_overlap_ wordpieces;
  // labels from successive windows are merged left-to-right.
  WindowGenerator window_generator(
      wordpiece_indices, token_starts, tokens, max_num_effective_wordpieces_,
      sliding_window_num_wordpieces_overlap_, span_of_interest);
  while (!window_generator.Done()) {
    VectorSpan<int32_t> cur_wordpiece_indices;
    VectorSpan<int32_t> cur_token_starts;
    VectorSpan<Token> cur_tokens;
    if (!window_generator.Next(&cur_wordpiece_indices, &cur_token_starts,
                               &cur_tokens) ||
        cur_tokens.size() <= 0 || cur_token_starts.size() <= 0 ||
        cur_wordpiece_indices.size() <= 0) {
      return false;
    }
    std::vector<LabelT> new_labels =
        ExecuteModel(cur_wordpiece_indices, cur_token_starts, cur_tokens);
    if (labels.empty()) {  // First loop.
      // Remember where the first window starts so later offsets (and the
      // final token span below) are relative to it.
      first_token_index_entire_window = cur_tokens.begin() - tokens.begin();
    }
    if (!MergeLabelsIntoLeftSequence(
            /*labels_right=*/new_labels,
            /*index_first_right_tag_in_left=*/cur_tokens.begin() -
                tokens.begin() - first_token_index_entire_window,
            /*labels_left=*/&labels)) {
      return false;
    }
  }
  if (labels.empty()) {
    return false;
  }
  // Only named (NAM) mentions are surfaced; strict BOISE matching.
  ConvertTagsToAnnotatedSpans(
      VectorSpan<Token>(tokens.begin() + first_token_index_entire_window,
                        tokens.end()),
      labels, collections_, {PodNerModel_::Label_::MentionType_NAM},
      /*relaxed_inside_label_matching=*/false,
      /*relaxed_mention_type_matching=*/false, results);
  return true;
}
// Suggests the first annotated span that fully contains |click|; clears
// |result| and returns false when annotation fails or nothing matches.
bool PodNerAnnotator::SuggestSelection(const UnicodeText &context,
                                       CodepointSpan click,
                                       AnnotatedSpan *result) const {
  TC3_VLOG(INFO) << "POD NER SuggestSelection " << click;
  std::vector<AnnotatedSpan> annotations;
  if (!AnnotateAroundSpanOfInterest(context, click, &annotations)) {
    TC3_VLOG(INFO) << "POD NER SuggestSelection: Annotate error. Returning: "
                   << click;
    *result = {};
    return false;
  }
  for (const AnnotatedSpan &candidate : annotations) {
    TC3_VLOG(INFO) << "POD NER SuggestSelection: " << candidate;
    const bool covers_click = candidate.span.first <= click.first &&
                              candidate.span.second >= click.second;
    if (!covers_click) {
      continue;
    }
    TC3_VLOG(INFO) << "POD NER SuggestSelection: Accepted.";
    *result = candidate;
    return true;
  }
  TC3_VLOG(INFO)
      << "POD NER SuggestSelection: No annotation matched click. Returning: "
      << click;
  *result = {};
  return false;
}
// Classifies the text at |click|: returns the top classification of the first
// annotated span that fully contains the click, or false if none does.
bool PodNerAnnotator::ClassifyText(const UnicodeText &context,
                                   CodepointSpan click,
                                   ClassificationResult *result) const {
  TC3_VLOG(INFO) << "POD NER ClassifyText " << click;
  std::vector<AnnotatedSpan> annotations;
  if (!AnnotateAroundSpanOfInterest(context, click, &annotations)) {
    return false;
  }
  for (const AnnotatedSpan &candidate : annotations) {
    const bool covers_click = candidate.span.first <= click.first &&
                              candidate.span.second >= click.second;
    if (!covers_click) {
      continue;
    }
    if (candidate.classification.empty()) {
      return false;
    }
    *result = candidate.classification[0];
    return true;
  }
  return false;
}
std::vector<std::string> PodNerAnnotator::GetSupportedCollections() const {
std::vector<std::string> result;
for (const PodNerModel_::CollectionT &collection : collections_) {
result.push_back(collection.name);
}
return result;
}
} // namespace libtextclassifier3