blob: e0d424180e591113eac860eb8326a5901ce152b1 [file] [log] [blame]
/*
* Copyright (C) 2018 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "annotator/annotator.h"
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iterator>
#include <limits>
#include <numeric>
#include <string>
#include <unordered_map>
#include <vector>
#include "annotator/collections.h"
#include "annotator/datetime/grammar-parser.h"
#include "annotator/datetime/regex-parser.h"
#include "annotator/flatbuffer-utils.h"
#include "annotator/knowledge/knowledge-engine-types.h"
#include "annotator/model_generated.h"
#include "annotator/types.h"
#include "utils/base/logging.h"
#include "utils/base/status.h"
#include "utils/base/statusor.h"
#include "utils/calendar/calendar.h"
#include "utils/checksum.h"
#include "utils/grammar/analyzer.h"
#include "utils/i18n/locale-list.h"
#include "utils/i18n/locale.h"
#include "utils/math/softmax.h"
#include "utils/normalization.h"
#include "utils/optional.h"
#include "utils/regex-match.h"
#include "utils/strings/append.h"
#include "utils/strings/numbers.h"
#include "utils/strings/split.h"
#include "utils/utf8/unicodetext.h"
#include "utils/utf8/unilib-common.h"
#include "utils/zlib/zlib_regex.h"
namespace libtextclassifier3 {
// Set of ints whose ordering is supplied at runtime through a std::function
// comparator rather than fixed at compile time.
using SortedIntSet = std::set<int, std::function<bool(int, int)>>;
// Definitions of the collection-name constants declared on Annotator. Each
// reference is bound to a std::string that an immediately-invoked lambda
// allocates with `new`; nothing in this file deletes those strings.
const std::string& Annotator::kPhoneCollection =
*[]() { return new std::string("phone"); }();
const std::string& Annotator::kAddressCollection =
*[]() { return new std::string("address"); }();
const std::string& Annotator::kDateCollection =
*[]() { return new std::string("date"); }();
const std::string& Annotator::kUrlCollection =
*[]() { return new std::string("url"); }();
const std::string& Annotator::kEmailCollection =
*[]() { return new std::string("email"); }();
namespace {
// Runs the flatbuffer verifier over the `size` bytes starting at `addr`.
// Returns the Model root of the buffer when verification succeeds and nullptr
// otherwise.
const Model* LoadAndVerifyModel(const void* addr, int size) {
  flatbuffers::Verifier verifier(reinterpret_cast<const uint8_t*>(addr), size);
  return VerifyModelBuffer(verifier) ? GetModel(addr) : nullptr;
}
// Runs the flatbuffer verifier over the `size` bytes starting at `addr`.
// Returns the PersonNameModel root of the buffer when verification succeeds
// and nullptr otherwise.
const PersonNameModel* LoadAndVerifyPersonNameModel(const void* addr,
                                                    int size) {
  flatbuffers::Verifier verifier(reinterpret_cast<const uint8_t*>(addr), size);
  return VerifyPersonNameModelBuffer(verifier) ? GetPersonNameModel(addr)
                                               : nullptr;
}
// Returns `lib` unchanged when it is non-null. Otherwise allocates a new
// UniLib, hands ownership of it to `owned_lib`, and returns a pointer to that
// new instance.
const UniLib* MaybeCreateUnilib(const UniLib* lib,
                                std::unique_ptr<UniLib>* owned_lib) {
  if (lib != nullptr) {
    return lib;
  }
  owned_lib->reset(new UniLib);
  return owned_lib->get();
}
// Returns `lib` unchanged when it is non-null. Otherwise allocates a new
// CalendarLib, hands ownership of it to `owned_lib`, and returns a pointer to
// that new instance.
const CalendarLib* MaybeCreateCalendarlib(
    const CalendarLib* lib, std::unique_ptr<CalendarLib>* owned_lib) {
  if (lib != nullptr) {
    return lib;
  }
  owned_lib->reset(new CalendarLib);
  return owned_lib->get();
}
// Checks that `span` is usable as a selection inside `context`: it must start
// at a non-negative index, be non-empty (first < second) and end no later than
// the number of codepoints in `context`.
bool IsValidSpanInput(const UnicodeText& context, const CodepointSpan& span) {
  if (span.first < 0) {
    return false;
  }
  if (span.first >= span.second) {
    return false;
  }
  return span.second <= context.size_codepoints();
}
// Copies every element of the flatbuffer int vector into an unordered set,
// casting each value to char32. A null vector yields an empty set.
std::unordered_set<char32> FlatbuffersIntVectorToChar32UnorderedSet(
    const flatbuffers::Vector<int32_t>* ints) {
  std::unordered_set<char32> codepoints;
  if (ints != nullptr) {
    for (const int32_t raw_value : *ints) {
      codepoints.insert(static_cast<char32>(raw_value));
    }
  }
  return codepoints;
}
} // namespace
// Returns the selection interpreter, creating it from the selection executor
// on first use. If creation fails an error is logged and nullptr is returned;
// the next call will attempt creation again.
tflite::Interpreter* InterpreterManager::SelectionInterpreter() {
  if (selection_interpreter_) {
    return selection_interpreter_.get();
  }
  TC3_CHECK(selection_executor_);
  selection_interpreter_ = selection_executor_->CreateInterpreter();
  if (!selection_interpreter_) {
    TC3_LOG(ERROR) << "Could not build TFLite interpreter.";
  }
  return selection_interpreter_.get();
}
// Returns the classification interpreter, creating it from the classification
// executor on first use. If creation fails an error is logged and nullptr is
// returned; the next call will attempt creation again.
tflite::Interpreter* InterpreterManager::ClassificationInterpreter() {
  if (classification_interpreter_) {
    return classification_interpreter_.get();
  }
  TC3_CHECK(classification_executor_);
  classification_interpreter_ = classification_executor_->CreateInterpreter();
  if (!classification_interpreter_) {
    TC3_LOG(ERROR) << "Could not build TFLite interpreter.";
  }
  return classification_interpreter_.get();
}
std::unique_ptr<Annotator> Annotator::FromUnownedBuffer(
const char* buffer, int size, const UniLib* unilib,
const CalendarLib* calendarlib) {
const Model* model = LoadAndVerifyModel(buffer, size);
if (model == nullptr) {
return nullptr;
}
auto classifier = std::unique_ptr<Annotator>(new Annotator());
unilib = MaybeCreateUnilib(unilib, &classifier->owned_unilib_);
calendarlib =
MaybeCreateCalendarlib(calendarlib, &classifier->owned_calendarlib_);
classifier->ValidateAndInitialize(model, unilib, calendarlib);
if (!classifier->IsInitialized()) {
return nullptr;
}
return classifier;
}
std::unique_ptr<Annotator> Annotator::FromString(
const std::string& buffer, const UniLib* unilib,
const CalendarLib* calendarlib) {
auto classifier = std::unique_ptr<Annotator>(new Annotator());
classifier->owned_buffer_ = buffer;
const Model* model = LoadAndVerifyModel(classifier->owned_buffer_.data(),
classifier->owned_buffer_.size());
if (model == nullptr) {
return nullptr;
}
unilib = MaybeCreateUnilib(unilib, &classifier->owned_unilib_);
calendarlib =
MaybeCreateCalendarlib(calendarlib, &classifier->owned_calendarlib_);
classifier->ValidateAndInitialize(model, unilib, calendarlib);
if (!classifier->IsInitialized()) {
return nullptr;
}
return classifier;
}
std::unique_ptr<Annotator> Annotator::FromScopedMmap(
std::unique_ptr<ScopedMmap>* mmap, const UniLib* unilib,
const CalendarLib* calendarlib) {
if (!(*mmap)->handle().ok()) {
TC3_VLOG(1) << "Mmap failed.";
return nullptr;
}
const Model* model = LoadAndVerifyModel((*mmap)->handle().start(),
(*mmap)->handle().num_bytes());
if (!model) {
TC3_LOG(ERROR) << "Model verification failed.";
return nullptr;
}
auto classifier = std::unique_ptr<Annotator>(new Annotator());
classifier->mmap_ = std::move(*mmap);
unilib = MaybeCreateUnilib(unilib, &classifier->owned_unilib_);
calendarlib =
MaybeCreateCalendarlib(calendarlib, &classifier->owned_calendarlib_);
classifier->ValidateAndInitialize(model, unilib, calendarlib);
if (!classifier->IsInitialized()) {
return nullptr;
}
return classifier;
}
std::unique_ptr<Annotator> Annotator::FromScopedMmap(
std::unique_ptr<ScopedMmap>* mmap, std::unique_ptr<UniLib> unilib,
std::unique_ptr<CalendarLib> calendarlib) {
if (!(*mmap)->handle().ok()) {
TC3_VLOG(1) << "Mmap failed.";
return nullptr;
}
const Model* model = LoadAndVerifyModel((*mmap)->handle().start(),
(*mmap)->handle().num_bytes());
if (model == nullptr) {
TC3_LOG(ERROR) << "Model verification failed.";
return nullptr;
}
auto classifier = std::unique_ptr<Annotator>(new Annotator());
classifier->mmap_ = std::move(*mmap);
classifier->owned_unilib_ = std::move(unilib);
classifier->owned_calendarlib_ = std::move(calendarlib);
classifier->ValidateAndInitialize(model, classifier->owned_unilib_.get(),
classifier->owned_calendarlib_.get());
if (!classifier->IsInitialized()) {
return nullptr;
}
return classifier;
}
std::unique_ptr<Annotator> Annotator::FromFileDescriptor(
int fd, int offset, int size, const UniLib* unilib,
const CalendarLib* calendarlib) {
std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(fd, offset, size));
return FromScopedMmap(&mmap, unilib, calendarlib);
}
std::unique_ptr<Annotator> Annotator::FromFileDescriptor(
int fd, int offset, int size, std::unique_ptr<UniLib> unilib,
std::unique_ptr<CalendarLib> calendarlib) {
std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(fd, offset, size));
return FromScopedMmap(&mmap, std::move(unilib), std::move(calendarlib));
}
std::unique_ptr<Annotator> Annotator::FromFileDescriptor(
int fd, const UniLib* unilib, const CalendarLib* calendarlib) {
std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(fd));
return FromScopedMmap(&mmap, unilib, calendarlib);
}
std::unique_ptr<Annotator> Annotator::FromFileDescriptor(
int fd, std::unique_ptr<UniLib> unilib,
std::unique_ptr<CalendarLib> calendarlib) {
std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(fd));
return FromScopedMmap(&mmap, std::move(unilib), std::move(calendarlib));
}
std::unique_ptr<Annotator> Annotator::FromPath(const std::string& path,
const UniLib* unilib,
const CalendarLib* calendarlib) {
std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(path));
return FromScopedMmap(&mmap, unilib, calendarlib);
}
std::unique_ptr<Annotator> Annotator::FromPath(
const std::string& path, std::unique_ptr<UniLib> unilib,
std::unique_ptr<CalendarLib> calendarlib) {
std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(path));
return FromScopedMmap(&mmap, std::move(unilib), std::move(calendarlib));
}
// Validates `model` and constructs every component it configures.
//
// `model`, `unilib` and `calendarlib` are stored as raw pointers and passed on
// to the components created here. On any validation or construction failure
// an error is logged and the function returns early, leaving `initialized_`
// false; `initialized_` is set to true only when the end of the function is
// reached.
void Annotator::ValidateAndInitialize(const Model* model, const UniLib* unilib,
                                      const CalendarLib* calendarlib) {
  model_ = model;
  unilib_ = unilib;
  calendarlib_ = calendarlib;
  initialized_ = false;

  if (model_ == nullptr) {
    TC3_LOG(ERROR) << "No model specified.";
    return;
  }

  const bool model_enabled_for_annotation =
      (model_->triggering_options() != nullptr &&
       (model_->triggering_options()->enabled_modes() & ModeFlag_ANNOTATION));
  const bool model_enabled_for_classification =
      (model_->triggering_options() != nullptr &&
       (model_->triggering_options()->enabled_modes() &
        ModeFlag_CLASSIFICATION));
  const bool model_enabled_for_selection =
      (model_->triggering_options() != nullptr &&
       (model_->triggering_options()->enabled_modes() & ModeFlag_SELECTION));

  // Annotation requires the selection model.
  if (model_enabled_for_annotation || model_enabled_for_selection) {
    if (!model_->selection_options()) {
      TC3_LOG(ERROR) << "No selection options.";
      return;
    }
    if (!model_->selection_feature_options()) {
      TC3_LOG(ERROR) << "No selection feature options.";
      return;
    }
    if (!model_->selection_feature_options()->bounds_sensitive_features()) {
      TC3_LOG(ERROR) << "No selection bounds sensitive feature options.";
      return;
    }
    if (!model_->selection_model()) {
      TC3_LOG(ERROR) << "No selection model.";
      return;
    }
    selection_executor_ = ModelExecutor::FromBuffer(model_->selection_model());
    if (!selection_executor_) {
      TC3_LOG(ERROR) << "Could not initialize selection executor.";
      return;
    }
  }

  // Even if the annotation mode is not enabled (for the neural network model),
  // the selection feature processor is needed to tokenize the text for other
  // models.
  if (model_->selection_feature_options()) {
    selection_feature_processor_.reset(
        new FeatureProcessor(model_->selection_feature_options(), unilib_));
  }

  // Annotation requires the classification model for conflict resolution and
  // scoring.
  // Selection requires the classification model for conflict resolution.
  if (model_enabled_for_annotation || model_enabled_for_classification ||
      model_enabled_for_selection) {
    if (!model_->classification_options()) {
      TC3_LOG(ERROR) << "No classification options.";
      return;
    }
    if (!model_->classification_feature_options()) {
      TC3_LOG(ERROR) << "No classification feature options.";
      return;
    }
    if (!model_->classification_feature_options()
             ->bounds_sensitive_features()) {
      TC3_LOG(ERROR) << "No classification bounds sensitive feature options.";
      return;
    }
    if (!model_->classification_model()) {
      TC3_LOG(ERROR) << "No clf model.";
      return;
    }
    classification_executor_ =
        ModelExecutor::FromBuffer(model_->classification_model());
    if (!classification_executor_) {
      TC3_LOG(ERROR) << "Could not initialize classification executor.";
      return;
    }
    classification_feature_processor_.reset(new FeatureProcessor(
        model_->classification_feature_options(), unilib_));
  }

  // The embeddings need to be specified if the model is to be used for
  // classification or selection.
  if (model_enabled_for_annotation || model_enabled_for_classification ||
      model_enabled_for_selection) {
    if (!model_->embedding_model()) {
      TC3_LOG(ERROR) << "No embedding model.";
      return;
    }
    // Check that the embedding size of the selection and classification model
    // matches, as they are using the same embeddings.
    if (model_enabled_for_selection &&
        (model_->selection_feature_options()->embedding_size() !=
             model_->classification_feature_options()->embedding_size() ||
         model_->selection_feature_options()->embedding_quantization_bits() !=
             model_->classification_feature_options()
                 ->embedding_quantization_bits())) {
      TC3_LOG(ERROR) << "Mismatching embedding size/quantization.";
      return;
    }
    embedding_executor_ = TFLiteEmbeddingExecutor::FromBuffer(
        model_->embedding_model(),
        model_->classification_feature_options()->embedding_size(),
        model_->classification_feature_options()->embedding_quantization_bits(),
        model_->embedding_pruning_mask());
    if (!embedding_executor_) {
      TC3_LOG(ERROR) << "Could not initialize embedding executor.";
      return;
    }
  }

  std::unique_ptr<ZlibDecompressor> decompressor = ZlibDecompressor::Instance();
  if (model_->regex_model()) {
    if (!InitializeRegexModel(decompressor.get())) {
      TC3_LOG(ERROR) << "Could not initialize regex model.";
      return;
    }
  }

  // The grammar-based datetime parser takes precedence over the regex-based
  // one when the model carries a datetime grammar.
  if (model_->datetime_grammar_model()) {
    if (model_->datetime_grammar_model()->rules()) {
      analyzer_ = std::make_unique<grammar::Analyzer>(
          unilib_, model_->datetime_grammar_model()->rules());
      datetime_grounder_ = std::make_unique<DatetimeGrounder>(calendarlib_);
      datetime_parser_ = std::make_unique<GrammarDatetimeParser>(
          *analyzer_, *datetime_grounder_,
          /*target_classification_score=*/1.0,
          /*priority_score=*/1.0);
    }
  } else if (model_->datetime_model()) {
    datetime_parser_ = RegexDatetimeParser::Instance(
        model_->datetime_model(), unilib_, calendarlib_, decompressor.get());
    if (!datetime_parser_) {
      TC3_LOG(ERROR) << "Could not initialize datetime parser.";
      return;
    }
  }

  if (model_->output_options()) {
    if (model_->output_options()->filtered_collections_annotation()) {
      for (const auto collection :
           *model_->output_options()->filtered_collections_annotation()) {
        filtered_collections_annotation_.insert(collection->str());
      }
    }
    if (model_->output_options()->filtered_collections_classification()) {
      for (const auto collection :
           *model_->output_options()->filtered_collections_classification()) {
        filtered_collections_classification_.insert(collection->str());
      }
    }
    if (model_->output_options()->filtered_collections_selection()) {
      for (const auto collection :
           *model_->output_options()->filtered_collections_selection()) {
        filtered_collections_selection_.insert(collection->str());
      }
    }
  }

  if (model_->number_annotator_options() &&
      model_->number_annotator_options()->enabled()) {
    number_annotator_.reset(
        new NumberAnnotator(model_->number_annotator_options(), unilib_));
  }

  if (model_->money_parsing_options()) {
    money_separators_ = FlatbuffersIntVectorToChar32UnorderedSet(
        model_->money_parsing_options()->separators());
  }

  if (model_->duration_annotator_options() &&
      model_->duration_annotator_options()->enabled()) {
    duration_annotator_.reset(
        new DurationAnnotator(model_->duration_annotator_options(),
                              selection_feature_processor_.get(), unilib_));
  }

  // The entity data builder must exist before any component that captures a
  // pointer to it is constructed; the grammar annotator below is given
  // entity_data_builder_.get() at construction time.
  if (model_->entity_data_schema()) {
    entity_data_schema_ = LoadAndVerifyFlatbuffer<reflection::Schema>(
        model_->entity_data_schema()->Data(),
        model_->entity_data_schema()->size());
    if (entity_data_schema_ == nullptr) {
      TC3_LOG(ERROR) << "Could not load entity data schema data.";
      return;
    }
    entity_data_builder_.reset(
        new MutableFlatbufferBuilder(entity_data_schema_));
  } else {
    entity_data_schema_ = nullptr;
    entity_data_builder_ = nullptr;
  }

  if (model_->grammar_model()) {
    grammar_annotator_.reset(new GrammarAnnotator(
        unilib_, model_->grammar_model(), entity_data_builder_.get()));
  }

  // The following #ifdef is here to aid quality evaluation of a situation, when
  // a POD NER kill switch in AiAi is invoked, when a model that has POD NER in
  // it.
#if !defined(TC3_DISABLE_POD_NER)
  if (model_->pod_ner_model()) {
    pod_ner_annotator_ =
        PodNerAnnotator::Create(model_->pod_ner_model(), *unilib_);
  }
#endif

  if (model_->vocab_model()) {
    // The vocab annotator takes the feature processor by reference, and the
    // processor only exists when the model has selection feature options.
    if (!selection_feature_processor_) {
      TC3_LOG(ERROR)
          << "Vocab model requires selection feature options, none found.";
      return;
    }
    vocab_annotator_ = VocabAnnotator::Create(
        model_->vocab_model(), *selection_feature_processor_, *unilib_);
  }

  if (model_->triggering_locales() &&
      !ParseLocales(model_->triggering_locales()->c_str(),
                    &model_triggering_locales_)) {
    TC3_LOG(ERROR) << "Could not parse model supported locales.";
    return;
  }

  if (model_->triggering_options() != nullptr &&
      model_->triggering_options()->locales() != nullptr &&
      !ParseLocales(model_->triggering_options()->locales()->c_str(),
                    &ml_model_triggering_locales_)) {
    TC3_LOG(ERROR) << "Could not parse supported ML model locales.";
    return;
  }

  if (model_->triggering_options() != nullptr &&
      model_->triggering_options()->dictionary_locales() != nullptr &&
      !ParseLocales(model_->triggering_options()->dictionary_locales()->c_str(),
                    &dictionary_locales_)) {
    TC3_LOG(ERROR) << "Could not parse dictionary supported locales.";
    return;
  }

  if (model_->conflict_resolution_options() != nullptr) {
    prioritize_longest_annotation_ =
        model_->conflict_resolution_options()->prioritize_longest_annotation();
    do_conflict_resolution_in_raw_mode_ =
        model_->conflict_resolution_options()
            ->do_conflict_resolution_in_raw_mode();
  }

#ifdef TC3_EXPERIMENTAL
  TC3_LOG(WARNING) << "Enabling experimental annotators.";
  InitializeExperimentalAnnotators();
#endif

  initialized_ = true;
}
// Compiles every pattern of the model's regex model and registers it.
//
// Each successfully compiled pattern is appended to `regex_patterns_`, and its
// index (counted from 0 within this call) is appended to the annotation /
// classification / selection index lists according to its enabled modes.
// Returns true when the model has no patterns or all of them compiled; returns
// false as soon as one pattern fails to compile.
bool Annotator::InitializeRegexModel(ZlibDecompressor* decompressor) {
  const auto* patterns = model_->regex_model()->patterns();
  if (patterns == nullptr) {
    return true;
  }

  // Initialize pattern recognizers.
  const int num_patterns = patterns->size();
  for (int pattern_id = 0; pattern_id < num_patterns; ++pattern_id) {
    const auto* pattern_spec = patterns->Get(pattern_id);
    std::unique_ptr<UniLib::RegexPattern> compiled =
        UncompressMakeRegexPattern(
            *unilib_, pattern_spec->pattern(),
            pattern_spec->compressed_pattern(),
            model_->regex_model()->lazy_regex_compilation(), decompressor);
    if (compiled == nullptr) {
      TC3_LOG(INFO) << "Failed to load regex pattern";
      return false;
    }

    const auto modes = pattern_spec->enabled_modes();
    if (modes & ModeFlag_ANNOTATION) {
      annotation_regex_patterns_.push_back(pattern_id);
    }
    if (modes & ModeFlag_CLASSIFICATION) {
      classification_regex_patterns_.push_back(pattern_id);
    }
    if (modes & ModeFlag_SELECTION) {
      selection_regex_patterns_.push_back(pattern_id);
    }

    regex_patterns_.push_back({
        pattern_spec,
        std::move(compiled),
    });
  }
  return true;
}
bool Annotator::InitializeKnowledgeEngine(
const std::string& serialized_config) {
std::unique_ptr<KnowledgeEngine> knowledge_engine(new KnowledgeEngine());
if (!knowledge_engine->Initialize(serialized_config, unilib_)) {
TC3_LOG(ERROR) << "Failed to initialize the knowledge engine.";
return false;
}
if (model_->triggering_options() != nullptr) {
knowledge_engine->SetPriorityScore(
model_->triggering_options()->knowledge_priority_score());
}
knowledge_engine_ = std::move(knowledge_engine);
return true;
}
bool Annotator::InitializeContactEngine(const std::string& serialized_config) {
std::unique_ptr<ContactEngine> contact_engine(
new ContactEngine(selection_feature_processor_.get(), unilib_,
model_->contact_annotator_options()));
if (!contact_engine->Initialize(serialized_config)) {
TC3_LOG(ERROR) << "Failed to initialize the contact engine.";
return false;
}
contact_engine_ = std::move(contact_engine);
return true;
}
bool Annotator::InitializeInstalledAppEngine(
const std::string& serialized_config) {
std::unique_ptr<InstalledAppEngine> installed_app_engine(
new InstalledAppEngine(selection_feature_processor_.get(), unilib_));
if (!installed_app_engine->Initialize(serialized_config)) {
TC3_LOG(ERROR) << "Failed to initialize the installed app engine.";
return false;
}
installed_app_engine_ = std::move(installed_app_engine);
return true;
}
// Stores `lang_id` on the annotator and (re)builds the translate annotator.
// A null `lang_id` is rejected with false and changes nothing. Otherwise the
// translate annotator is created when the model has enabled translate
// annotator options, and cleared when it does not.
bool Annotator::SetLangId(const libtextclassifier3::mobile::lang_id::LangId* lang_id) {
  if (lang_id == nullptr) {
    return false;
  }
  lang_id_ = lang_id;

  const auto* translate_options = model_->translate_annotator_options();
  const bool translate_enabled =
      translate_options != nullptr && translate_options->enabled();
  if (!translate_enabled) {
    translate_annotator_.reset(nullptr);
    return true;
  }
  translate_annotator_.reset(
      new TranslateAnnotator(translate_options, lang_id_, unilib_));
  return true;
}
bool Annotator::InitializePersonNameEngineFromUnownedBuffer(const void* buffer,
int size) {
const PersonNameModel* person_name_model =
LoadAndVerifyPersonNameModel(buffer, size);
if (person_name_model == nullptr) {
TC3_LOG(ERROR) << "Person name model verification failed.";
return false;
}
if (!person_name_model->enabled()) {
return true;
}
std::unique_ptr<PersonNameEngine> person_name_engine(
new PersonNameEngine(selection_feature_processor_.get(), unilib_));
if (!person_name_engine->Initialize(person_name_model)) {
TC3_LOG(ERROR) << "Failed to initialize the person name engine.";
return false;
}
person_name_engine_ = std::move(person_name_engine);
return true;
}
// Initializes the person name engine from a memory-mapped model. Logs an
// error and returns false when the mapping is not valid; otherwise forwards
// the mapped bytes to InitializePersonNameEngineFromUnownedBuffer.
bool Annotator::InitializePersonNameEngineFromScopedMmap(
    const ScopedMmap& mmap) {
  const auto& mapping = mmap.handle();
  if (mapping.ok()) {
    return InitializePersonNameEngineFromUnownedBuffer(mapping.start(),
                                                       mapping.num_bytes());
  }
  TC3_LOG(ERROR) << "Mmap for person name model failed.";
  return false;
}
bool Annotator::InitializePersonNameEngineFromPath(const std::string& path) {
std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(path));
return InitializePersonNameEngineFromScopedMmap(*mmap);
}
bool Annotator::InitializePersonNameEngineFromFileDescriptor(int fd, int offset,
int size) {
std::unique_ptr<ScopedMmap> mmap(new ScopedMmap(fd, offset, size));
return InitializePersonNameEngineFromScopedMmap(*mmap);
}
// Creates the experimental annotator when ExperimentalAnnotator::IsEnabled()
// reports true. Returns whether the annotator was created.
bool Annotator::InitializeExperimentalAnnotators() {
  if (!ExperimentalAnnotator::IsEnabled()) {
    return false;
  }
  experimental_annotator_.reset(new ExperimentalAnnotator(
      model_->experimental_model(), *selection_feature_processor_, *unilib_));
  return true;
}
namespace internal {
// If 'span' covers only whitespace, tries to replace it with a
// single-codepoint span on the closest non-whitespace codepoint to its left.
// Returns 'span' unchanged when it contains any non-whitespace codepoint, or
// when walking left reaches the start of the text without finding a
// non-whitespace codepoint.
CodepointSpan SnapLeftIfWhitespaceSelection(const CodepointSpan& span,
                                            const UnicodeText& context_unicode,
                                            const UniLib& unilib) {
  TC3_CHECK(span.IsValid() && !span.IsEmpty());

  UnicodeText::const_iterator span_start = context_unicode.begin();
  std::advance(span_start, span.first);

  // Selections that contain something other than whitespace are left alone.
  UnicodeText::const_iterator cursor = span_start;
  const int span_length = span.second - span.first;
  for (int offset = 0; offset < span_length; ++offset, ++cursor) {
    if (!unilib.IsWhitespace(*cursor)) {
      return span;
    }
  }

  // Walk left from the start of the span until a non-whitespace codepoint or
  // the beginning of the text is reached.
  CodepointSpan snapped = span;
  cursor = span_start;
  while (cursor != context_unicode.begin() && unilib.IsWhitespace(*cursor)) {
    --snapped.first;
    --cursor;
  }
  snapped.second = snapped.first + 1;

  // Only use the snapped span if the walk actually ended on non-whitespace;
  // otherwise fall back to the original span.
  if (unilib.IsWhitespace(*cursor)) {
    return span;
  }
  return snapped;
}
}  // namespace internal
// Returns true when the collection of the span's first classification result
// is in the annotation filter set. Spans without any classification are never
// filtered.
bool Annotator::FilteredForAnnotation(const AnnotatedSpan& span) const {
  if (span.classification.empty()) {
    return false;
  }
  const auto& top_collection = span.classification[0].collection;
  return filtered_collections_annotation_.count(top_collection) > 0;
}
// Returns true when the collection of `classification` is in the
// classification filter set.
bool Annotator::FilteredForClassification(
    const ClassificationResult& classification) const {
  return filtered_collections_classification_.count(
             classification.collection) > 0;
}
// Returns true when the collection of the span's first classification result
// is in the selection filter set. Spans without any classification are never
// filtered.
bool Annotator::FilteredForSelection(const AnnotatedSpan& span) const {
  if (span.classification.empty()) {
    return false;
  }
  const auto& top_collection = span.classification[0].collection;
  return filtered_collections_selection_.count(top_collection) > 0;
}
namespace {
// Returns true when `classification` is non-empty and its first result belongs
// to the "other" collection.
inline bool ClassifiedAsOther(
    const std::vector<ClassificationResult>& classification) {
  if (classification.empty()) {
    return false;
  }
  return classification[0].collection == Collections::Other();
}
}  // namespace
// Returns the priority score used to rank a candidate.
// A non-empty classification whose first result is not "other" yields that
// result's priority score. Otherwise the model's "other" collection priority
// score is used, or -1000 when the model has no triggering options.
float Annotator::GetPriorityScore(
    const std::vector<ClassificationResult>& classification) const {
  const bool has_specific_collection =
      !classification.empty() && !ClassifiedAsOther(classification);
  if (has_specific_collection) {
    return classification[0].priority_score;
  }

  const auto* triggering_options = model_->triggering_options();
  if (triggering_options == nullptr) {
    return -1000.0;
  }
  return triggering_options->other_collection_priority_score();
}
// Applies the optional post-match verification configured for a regex pattern.
//
// With no verification options the match is accepted. Otherwise the match must
// pass the Luhn checksum when that check is requested, and, when a Lua
// verifier index (>= 0) is configured, the corresponding Lua verifier from the
// regex model. An index that does not refer to an existing verifier is logged
// and rejects the match.
bool Annotator::VerifyRegexMatchCandidate(
    const std::string& context, const VerificationOptions* verification_options,
    const std::string& match, const UniLib::RegexMatcher* matcher) const {
  if (verification_options == nullptr) {
    return true;
  }

  const bool luhn_failed = verification_options->verify_luhn_checksum() &&
                           !VerifyLuhnChecksum(match);
  if (luhn_failed) {
    return false;
  }

  // A negative index means that no Lua verifier is configured.
  const int verifier_index = verification_options->lua_verifier();
  if (verifier_index < 0) {
    return true;
  }

  const auto* verifiers = model_->regex_model()->lua_verifier();
  if (verifiers == nullptr || verifier_index >= verifiers->size()) {
    TC3_LOG(ERROR) << "Invalid lua verifier specified: " << verifier_index;
    return false;
  }
  return VerifyMatch(context, matcher, verifiers->Get(verifier_index)->str());
}
CodepointSpan Annotator::SuggestSelection(
const std::string& context, CodepointSpan click_indices,
const SelectionOptions& options) const {
if (context.size() > std::numeric_limits<int>::max()) {
TC3_LOG(ERROR) << "Rejecting too long input: " << context.size();
return {};
}
CodepointSpan original_click_indices = click_indices;
if (!initialized_) {
TC3_LOG(ERROR) << "Not initialized";
return original_click_indices;
}
if (options.annotation_usecase !=
AnnotationUsecase_ANNOTATION_USECASE_SMART) {
TC3_LOG(WARNING)
<< "Invoking SuggestSelection, which is not supported in RAW mode.";
return original_click_indices;
}
if (!(model_->enabled_modes() & ModeFlag_SELECTION)) {
return original_click_indices;
}
std::vector<Locale> detected_text_language_tags;
if (!ParseLocales(options.detected_text_language_tags,
&detected_text_language_tags)) {
TC3_LOG(WARNING)
<< "Failed to parse the detected_text_language_tags in options: "
<< options.detected_text_language_tags;
}
if (!Locale::IsAnyLocaleSupported(detected_text_language_tags,
model_triggering_locales_,
/*default_value=*/true)) {
return original_click_indices;
}
const UnicodeText context_unicode = UTF8ToUnicodeText(context,
/*do_copy=*/false);
if (!unilib_->IsValidUtf8(context_unicode)) {
TC3_LOG(ERROR) << "Rejecting input, invalid UTF8.";
return original_click_indices;
}
if (!IsValidSpanInput(context_unicode, click_indices)) {
TC3_VLOG(1)
<< "Trying to run SuggestSelection with invalid input, indices: "
<< click_indices.first << " " << click_indices.second;
return original_click_indices;
}
if (model_->snap_whitespace_selections()) {
// We want to expand a purely white-space selection to a multi-selection it
// would've been part of. But with this feature disabled we would do a no-
// op, because no token is found. Therefore, we need to modify the
// 'click_indices' a bit to include a part of the token, so that the click-
// finding logic finds the clicked token correctly. This modification is
// done by the following function. Note, that it's enough to check the left
// side of the current selection, because if the white-space is a part of a
// multi-selection, necessarily both tokens - on the left and the right
// sides need to be selected. Thus snapping only to the left is sufficient
// (there's a check at the bottom that makes sure that if we snap to the
// left token but the result does not contain the initial white-space,
// returns the original indices).
click_indices = internal::SnapLeftIfWhitespaceSelection(
click_indices, context_unicode, *unilib_);
}
Annotations candidates;
// As we process a single string of context, the candidates will only
// contain one vector of AnnotatedSpan.
candidates.annotated_spans.resize(1);
InterpreterManager interpreter_manager(selection_executor_.get(),
classification_executor_.get());
std::vector<Token> tokens;
if (!ModelSuggestSelection(context_unicode, click_indices,
detected_text_language_tags, &interpreter_manager,
&tokens, &candidates.annotated_spans[0])) {
TC3_LOG(ERROR) << "Model suggest selection failed.";
return original_click_indices;
}
const std::unordered_set<std::string> set;
const EnabledEntityTypes is_entity_type_enabled(set);
if (!RegexChunk(context_unicode, selection_regex_patterns_,
/*is_serialized_entity_data_enabled=*/false,
is_entity_type_enabled, options.annotation_usecase,
&candidates.annotated_spans[0])) {
TC3_LOG(ERROR) << "Regex suggest selection failed.";
return original_click_indices;
}
if (!DatetimeChunk(UTF8ToUnicodeText(context, /*do_copy=*/false),
/*reference_time_ms_utc=*/0, /*reference_timezone=*/"",
options.locales, ModeFlag_SELECTION,
options.annotation_usecase,
/*is_serialized_entity_data_enabled=*/false,
&candidates.annotated_spans[0])) {
TC3_LOG(ERROR) << "Datetime suggest selection failed.";
return original_click_indices;
}
if (knowledge_engine_ != nullptr &&
!knowledge_engine_
->Chunk(context, options.annotation_usecase,
options.location_context, Permissions(),
AnnotateMode::kEntityAnnotation, &candidates)
.ok()) {
TC3_LOG(ERROR) << "Knowledge suggest selection failed.";
return original_click_indices;
}
if (contact_engine_ != nullptr &&
!contact_engine_->Chunk(context_unicode, tokens,
&candidates.annotated_spans[0])) {
TC3_LOG(ERROR) << "Contact suggest selection failed.";
return original_click_indices;
}
if (installed_app_engine_ != nullptr &&
!installed_app_engine_->Chunk(context_unicode, tokens,
&candidates.annotated_spans[0])) {
TC3_LOG(ERROR) << "Installed app suggest selection failed.";
return original_click_indices;
}
if (number_annotator_ != nullptr &&
!number_annotator_->FindAll(context_unicode, options.annotation_usecase,
&candidates.annotated_spans[0])) {
TC3_LOG(ERROR) << "Number annotator failed in suggest selection.";
return original_click_indices;
}
if (duration_annotator_ != nullptr &&
!duration_annotator_->FindAll(context_unicode, tokens,
options.annotation_usecase,
&candidates.annotated_spans[0])) {
TC3_LOG(ERROR) << "Duration annotator failed in suggest selection.";
return original_click_indices;
}
if (person_name_engine_ != nullptr &&
!person_name_engine_->Chunk(context_unicode, tokens,
&candidates.annotated_spans[0])) {
TC3_LOG(ERROR) << "Person name suggest selection failed.";
return original_click_indices;
}
AnnotatedSpan grammar_suggested_span;
if (grammar_annotator_ != nullptr &&
grammar_annotator_->SuggestSelection(detected_text_language_tags,
context_unicode, click_indices,
&grammar_suggested_span)) {
candidates.annotated_spans[0].push_back(grammar_suggested_span);
}
AnnotatedSpan pod_ner_suggested_span;
if (pod_ner_annotator_ != nullptr && options.use_pod_ner &&
pod_ner_annotator_->SuggestSelection(context_unicode, click_indices,
&pod_ner_suggested_span)) {
candidates.annotated_spans[0].push_back(pod_ner_suggested_span);
}
if (experimental_annotator_ != nullptr) {
candidates.annotated_spans[0].push_back(
experimental_annotator_->SuggestSelection(context_unicode,
click_indices));
}
// Sort candidates according to their position in the input, so that the next
// code can assume that any connected component of overlapping spans forms a
// contiguous block.
std::stable_sort(candidates.annotated_spans[0].begin(),
candidates.annotated_spans[0].end(),
[](const AnnotatedSpan& a, const AnnotatedSpan& b) {
return a.span.first < b.span.first;
});
std::vector<int> candidate_indices;
if (!ResolveConflicts(candidates.annotated_spans[0], context, tokens,
detected_text_language_tags, options,
&interpreter_manager, &candidate_indices)) {
TC3_LOG(ERROR) << "Couldn't resolve conflicts.";
return original_click_indices;
}
std::stable_sort(
candidate_indices.begin(), candidate_indices.end(),
[this, &candidates](int a, int b) {
return GetPriorityScore(
candidates.annotated_spans[0][a].classification) >
GetPriorityScore(
candidates.annotated_spans[0][b].classification);
});
for (const int i : candidate_indices) {
if (SpansOverlap(candidates.annotated_spans[0][i].span, click_indices) &&
SpansOverlap(candidates.annotated_spans[0][i].span,
original_click_indices)) {
// Run model classification if not present but requested and there's a
// classification collection filter specified.
if (candidates.annotated_spans[0][i].classification.empty() &&
model_->selection_options()->always_classify_suggested_selection() &&
!filtered_collections_selection_.empty()) {
if (!ModelClassifyText(context, /*cached_tokens=*/{},
detected_text_language_tags,
candidates.annotated_spans[0][i].span, options,
&interpreter_manager,
/*embedding_cache=*/nullptr,
&candidates.annotated_spans[0][i].classification,
/*tokens=*/nullptr)) {
return original_click_indices;
}
}
// Ignore if span classification is filtered.
if (FilteredForSelection(candidates.annotated_spans[0][i])) {
return original_click_indices;
}
// We only return a suggested span if it contains the original span.
// This compensates for "select all" selections that may come from
// other apps. See http://b/179890518.
if (SpanContains(candidates.annotated_spans[0][i].span,
original_click_indices)) {
return candidates.annotated_spans[0][i].span;
}
}
}
return original_click_indices;
}
namespace {
// Helper function that returns the index of the first candidate that
// transitively does not overlap with the candidate on 'start_index'. If the end
// of 'candidates' is reached, it returns the index that points right behind the
// array.
int FirstNonOverlappingSpanIndex(const std::vector<AnnotatedSpan>& candidates,
int start_index) {
int first_non_overlapping = start_index + 1;
CodepointSpan conflicting_span = candidates[start_index].span;
while (
first_non_overlapping < candidates.size() &&
SpansOverlap(conflicting_span, candidates[first_non_overlapping].span)) {
// Grow the span to include the current one.
conflicting_span.second = std::max(
conflicting_span.second, candidates[first_non_overlapping].span.second);
++first_non_overlapping;
}
return first_non_overlapping;
}
} // namespace
// Walks over 'candidates' group by group, where a group is a maximal run of
// transitively overlapping spans as determined by
// FirstNonOverlappingSpanIndex. A group of one is accepted as is; larger
// groups are delegated to ResolveConflict. The indices of the surviving
// candidates are written to 'result' (which is cleared first).
//
// Returns false as soon as ResolveConflict fails for any group.
bool Annotator::ResolveConflicts(
    const std::vector<AnnotatedSpan>& candidates, const std::string& context,
    const std::vector<Token>& cached_tokens,
    const std::vector<Locale>& detected_text_language_tags,
    const BaseOptions& options, InterpreterManager* interpreter_manager,
    std::vector<int>* result) const {
  result->clear();
  result->reserve(candidates.size());

  int group_begin = 0;
  while (group_begin < candidates.size()) {
    const int group_end =
        FirstNonOverlappingSpanIndex(candidates, /*start_index=*/group_begin);

    if (group_end == group_begin + 1) {
      // Nothing overlaps with this candidate, keep it.
      result->push_back(group_begin);
    } else {
      std::vector<int> group_winners;
      const bool resolved = ResolveConflict(
          context, cached_tokens, candidates, detected_text_language_tags,
          group_begin, group_end, options, interpreter_manager,
          &group_winners);
      if (!resolved) {
        return false;
      }
      result->insert(result->end(), group_winners.begin(),
                     group_winners.end());
    }

    // Continue right after the group that was just handled.
    group_begin = group_end;
  }
  return true;
}
namespace {
// Returns true if annotations coming from the two given sources conflict in
// the given annotation usecase.
// - In SMART usecase, all sources do conflict, because there's only 1 possible
//   annotation for a given span.
// - In RAW usecase, certain annotations are allowed to overlap (e.g. datetime
//   and duration), while others not (e.g. duration and number).
// - Any usecase value not handled by the switch is treated conservatively as
//   conflicting, so that the function never runs off its end without
//   returning a value.
bool DoSourcesConflict(AnnotationUsecase annotation_usecase,
                       const AnnotatedSpan::Source source1,
                       const AnnotatedSpan::Source source2) {
  // One bit per source. The shift is done on an unsigned value so the whole
  // mask computation stays unsigned.
  const auto source_bit = [](const AnnotatedSpan::Source source) {
    return static_cast<uint32>(1) << static_cast<int>(source);
  };
  const uint32 source_mask = source_bit(source1) | source_bit(source2);
  const auto involves = [source_mask,
                         &source_bit](const AnnotatedSpan::Source source) {
    return (source_mask & source_bit(source)) != 0;
  };

  switch (annotation_usecase) {
    case AnnotationUsecase_ANNOTATION_USECASE_SMART:
      // In the SMART mode, all annotations conflict.
      return true;
    case AnnotationUsecase_ANNOTATION_USECASE_RAW:
      // DURATION and DATETIME do not conflict. E.g. "let's meet in 3 hours",
      // can have two non-conflicting annotations: "in 3 hours" (datetime), "3
      // hours" (duration).
      if (involves(AnnotatedSpan::Source::DURATION) &&
          involves(AnnotatedSpan::Source::DATETIME)) {
        return false;
      }
      // A KNOWLEDGE entity does not conflict with anything.
      if (involves(AnnotatedSpan::Source::KNOWLEDGE)) {
        return false;
      }
      // A PERSON_NAME entity does not conflict with anything.
      if (involves(AnnotatedSpan::Source::PERSON_NAME)) {
        return false;
      }
      // Entities from other sources can conflict.
      return true;
  }
  // Unknown usecase: be conservative and report a conflict.
  return true;
}
} // namespace
bool Annotator::ResolveConflict(
const std::string& context, const std::vector<Token>& cached_tokens,
const std::vector<AnnotatedSpan>& candidates,
const std::vector<Locale>& detected_text_language_tags, int start_index,
int end_index, const BaseOptions& options,
InterpreterManager* interpreter_manager,
std::vector<int>* chosen_indices) const {
std::vector<int> conflicting_indices;
std::unordered_map<int, std::pair<float, int>> scores_lengths;
for (int i = start_index; i < end_index; ++i) {
conflicting_indices.push_back(i);
if (!candidates[i].classification.empty()) {
scores_lengths[i] = {
GetPriorityScore(candidates[i].classification),
candidates[i].span.second - candidates[i].span.first};
continue;
}
// OPTIMIZATION: So that we don't have to classify all the ML model
// spans apriori, we wait until we get here, when they conflict with
// something and we need the actual classification scores. So if the
// candidate conflicts and comes from the model, we need to run a
// classification to determine its priority:
std::vector<ClassificationResult> classification;
if (!ModelClassifyText(context, cached_tokens, detected_text_language_tags,
candidates[i].span, options, interpreter_manager,
/*embedding_cache=*/nullptr, &classification,
/*tokens=*/nullptr)) {
return false;
}
if (!classification.empty()) {
scores_lengths[i] = {
GetPriorityScore(classification),
candidates[i].span.second - candidates[i].span.first};
}
}
std::stable_sort(
conflicting_indices.begin(), conflicting_indices.end(),
[this, &scores_lengths, candidates, conflicting_indices](int i, int j) {
if (scores_lengths[i].first == scores_lengths[j].first &&
prioritize_longest_annotation_) {
return scores_lengths[i].second > scores_lengths[j].second;
}
return scores_lengths[i].first > scores_lengths[j].first;
});
// Here we keep a set of indices that were chosen, per-source, to enable
// effective computation.
std::unordered_map<AnnotatedSpan::Source, SortedIntSet>
chosen_indices_for_source_map;
// Greedily place the candidates if they don't conflict with the already
// placed ones.
for (int i = 0; i < conflicting_indices.size(); ++i) {
const int considered_candidate = conflicting_indices[i];
// See if there is a conflict between the candidate and all already placed
// candidates.
bool conflict = false;
SortedIntSet* chosen_indices_for_source_ptr = nullptr;
for (auto& source_set_pair : chosen_indices_for_source_map) {
if (source_set_pair.first == candidates[considered_candidate].source) {
chosen_indices_for_source_ptr = &source_set_pair.second;
}
const bool needs_conflict_resolution =
options.annotation_usecase ==
AnnotationUsecase_ANNOTATION_USECASE_SMART ||
(options.annotation_usecase ==
AnnotationUsecase_ANNOTATION_USECASE_RAW &&
do_conflict_resolution_in_raw_mode_);
if (needs_conflict_resolution &&
DoSourcesConflict(options.annotation_usecase, source_set_pair.first,
candidates[considered_candidate].source) &&
DoesCandidateConflict(considered_candidate, candidates,
source_set_pair.second)) {
conflict = true;
break;
}
}
// Skip the candidate if a conflict was found.
if (conflict) {
continue;
}
// If the set of indices for the current source doesn't exist yet,
// initialize it.
if (chosen_indices_for_source_ptr == nullptr) {
SortedIntSet new_set([&candidates](int a, int b) {
return candidates[a].span.first < candidates[b].span.first;
});
chosen_indices_for_source_map[candidates[considered_candidate].source] =
std::move(new_set);
chosen_indices_for_source_ptr =
&chosen_indices_for_source_map[candidates[considered_candidate]
.source];
}
// Place the candidate to the output and to the per-source conflict set.
chosen_indices->push_back(considered_candidate);
chosen_indices_for_source_ptr->insert(considered_candidate);
}
std::stable_sort(chosen_indices->begin(), chosen_indices->end());
return true;
}
// Runs the selection model around the clicked span and appends the produced
// selection candidates to 'result'. 'tokens' receives the tokenization of the
// context used by the model.
//
// Returns true without producing candidates when the selection mode is not
// enabled in the model's triggering options, when none of the detected
// locales is supported by the ML model, or when the extraction span does not
// have enough supported codepoints. Returns false if the click position
// cannot be determined, or if feature extraction or chunking fails.
bool Annotator::ModelSuggestSelection(
    const UnicodeText& context_unicode, const CodepointSpan& click_indices,
    const std::vector<Locale>& detected_text_language_tags,
    InterpreterManager* interpreter_manager, std::vector<Token>* tokens,
    std::vector<AnnotatedSpan>* result) const {
  if (model_->triggering_options() == nullptr ||
      !(model_->triggering_options()->enabled_modes() & ModeFlag_SELECTION)) {
    return true;
  }

  if (!Locale::IsAnyLocaleSupported(detected_text_language_tags,
                                    ml_model_triggering_locales_,
                                    /*default_value=*/true)) {
    return true;
  }

  // Start from the invalid marker so that the check below never reads an
  // indeterminate value, whatever RetokenizeAndFindClick does with the
  // out-parameter.
  int click_pos = kInvalidIndex;
  *tokens = selection_feature_processor_->Tokenize(context_unicode);
  const auto [click_begin, click_end] =
      CodepointSpanToUnicodeTextRange(context_unicode, click_indices);
  selection_feature_processor_->RetokenizeAndFindClick(
      context_unicode, click_begin, click_end, click_indices,
      selection_feature_processor_->GetOptions()->only_use_line_with_click(),
      tokens, &click_pos);
  if (click_pos == kInvalidIndex) {
    TC3_VLOG(1) << "Could not calculate the click position.";
    return false;
  }

  const int symmetry_context_size =
      model_->selection_options()->symmetry_context_size();
  const FeatureProcessorOptions_::BoundsSensitiveFeatures*
      bounds_sensitive_features = selection_feature_processor_->GetOptions()
                                      ->bounds_sensitive_features();

  // The symmetry context span is the clicked token with symmetry_context_size
  // tokens on either side.
  const TokenSpan symmetry_context_span =
      IntersectTokenSpans(TokenSpan(click_pos).Expand(
                              /*num_tokens_left=*/symmetry_context_size,
                              /*num_tokens_right=*/symmetry_context_size),
                          AllOf(*tokens));

  // Compute the extraction span based on the model type.
  TokenSpan extraction_span;
  if (bounds_sensitive_features && bounds_sensitive_features->enabled()) {
    // The extraction span is the symmetry context span expanded to include
    // max_selection_span tokens on either side, which is how far a selection
    // can stretch from the click, plus a relevant number of tokens outside of
    // the bounds of the selection.
    const int max_selection_span =
        selection_feature_processor_->GetOptions()->max_selection_span();
    extraction_span = symmetry_context_span.Expand(
        /*num_tokens_left=*/max_selection_span +
            bounds_sensitive_features->num_tokens_before(),
        /*num_tokens_right=*/max_selection_span +
            bounds_sensitive_features->num_tokens_after());
  } else {
    // The extraction span is the symmetry context span expanded to include
    // context_size tokens on either side.
    const int context_size =
        selection_feature_processor_->GetOptions()->context_size();
    extraction_span = symmetry_context_span.Expand(
        /*num_tokens_left=*/context_size,
        /*num_tokens_right=*/context_size);
  }
  // Clamp the extraction span to the tokens that actually exist.
  extraction_span = IntersectTokenSpans(extraction_span, AllOf(*tokens));

  if (!selection_feature_processor_->HasEnoughSupportedCodepoints(
          *tokens, extraction_span)) {
    return true;
  }

  std::unique_ptr<CachedFeatures> cached_features;
  if (!selection_feature_processor_->ExtractFeatures(
          *tokens, extraction_span,
          /*selection_span_for_feature=*/{kInvalidIndex, kInvalidIndex},
          embedding_executor_.get(),
          /*embedding_cache=*/nullptr,
          selection_feature_processor_->EmbeddingSize() +
              selection_feature_processor_->DenseFeaturesCount(),
          &cached_features)) {
    TC3_LOG(ERROR) << "Could not extract features.";
    return false;
  }

  // Produce selection model candidates.
  std::vector<TokenSpan> chunks;
  if (!ModelChunk(tokens->size(), /*span_of_interest=*/symmetry_context_span,
                  interpreter_manager->SelectionInterpreter(), *cached_features,
                  &chunks)) {
    TC3_LOG(ERROR) << "Could not chunk.";
    return false;
  }

  // Convert every chunk to a codepoint span, trimming boundary codepoints and,
  // if configured, unpaired brackets.
  for (const TokenSpan& chunk : chunks) {
    AnnotatedSpan candidate;
    candidate.span = selection_feature_processor_->StripBoundaryCodepoints(
        context_unicode, TokenSpanToCodepointSpan(*tokens, chunk));
    if (model_->selection_options()->strip_unpaired_brackets()) {
      candidate.span =
          StripUnpairedBrackets(context_unicode, candidate.span, *unilib_);
    }

    // Only output non-empty spans.
    if (candidate.span.first != candidate.span.second) {
      result->push_back(candidate);
    }
  }

  return true;
}
namespace internal {
// Returns a copy of the cached tokens that cover the selection, extended by
// tokens_around_selection_to_copy.first tokens to the left and
// tokens_around_selection_to_copy.second tokens to the right, clamped to the
// bounds of 'cached_tokens'.
//
// The binary searches rely on 'cached_tokens' being ordered by position.
// Returns an empty vector if the resulting token range is empty or inverted
// (e.g. for an inverted selection), instead of requesting a negative
// capacity.
std::vector<Token> CopyCachedTokens(const std::vector<Token>& cached_tokens,
                                    const CodepointSpan& selection_indices,
                                    TokenSpan tokens_around_selection_to_copy) {
  // First token that ends after the selection start.
  const auto first_selection_token = std::upper_bound(
      cached_tokens.begin(), cached_tokens.end(), selection_indices.first,
      [](int selection_start, const Token& token) {
        return selection_start < token.end;
      });
  // First token that starts at or after the selection end.
  const auto last_selection_token = std::lower_bound(
      cached_tokens.begin(), cached_tokens.end(), selection_indices.second,
      [](const Token& token, int selection_end) {
        return token.start < selection_end;
      });

  const int64 first_token = std::max(
      static_cast<int64>(0),
      static_cast<int64>((first_selection_token - cached_tokens.begin()) -
                         tokens_around_selection_to_copy.first));
  const int64 last_token = std::min(
      static_cast<int64>(cached_tokens.size()),
      static_cast<int64>((last_selection_token - cached_tokens.begin()) +
                         tokens_around_selection_to_copy.second));

  // Both bounds are within [0, size] once this check passes, so the iterator
  // arithmetic below is valid.
  if (last_token <= first_token) {
    return {};
  }

  return std::vector<Token>(cached_tokens.begin() + first_token,
                            cached_tokens.begin() + last_token);
}
} // namespace internal
// Returns how many tokens to the left (first) and to the right (second) of a
// selection the classification model may need, mirroring how
// ModelClassifyText builds its extraction span.
//
// All values are read from the classification feature processor, which is the
// one ModelClassifyText uses for feature extraction.
TokenSpan Annotator::ClassifyTextUpperBoundNeededTokens() const {
  const auto* classification_options =
      classification_feature_processor_->GetOptions();
  const FeatureProcessorOptions_::BoundsSensitiveFeatures*
      bounds_sensitive_features =
          classification_options->bounds_sensitive_features();
  if (bounds_sensitive_features && bounds_sensitive_features->enabled()) {
    // The extraction span is the selection span expanded to include a relevant
    // number of tokens outside of the bounds of the selection.
    return {bounds_sensitive_features->num_tokens_before(),
            bounds_sensitive_features->num_tokens_after()};
  } else {
    // The extraction span is the clicked token with context_size tokens on
    // either side.
    const int context_size = classification_options->context_size();
    return {context_size, context_size};
  }
}
namespace {
// Orders the classification results by descending score. Results with equal
// scores keep their relative order.
void SortClassificationResults(
    std::vector<ClassificationResult>* classification_results) {
  const auto has_higher_score = [](const ClassificationResult& lhs,
                                   const ClassificationResult& rhs) {
    return rhs.score < lhs.score;
  };
  std::stable_sort(classification_results->begin(),
                   classification_results->end(), has_higher_score);
}
} // namespace
bool Annotator::ModelClassifyText(
const std::string& context, const std::vector<Token>& cached_tokens,
const std::vector<Locale>& detected_text_language_tags,
const CodepointSpan& selection_indices, const BaseOptions& options,
InterpreterManager* interpreter_manager,
FeatureProcessor::EmbeddingCache* embedding_cache,
std::vector<ClassificationResult>* classification_results,
std::vector<Token>* tokens) const {
const UnicodeText context_unicode =
UTF8ToUnicodeText(context, /*do_copy=*/false);
const auto [span_begin, span_end] =
CodepointSpanToUnicodeTextRange(context_unicode, selection_indices);
return ModelClassifyText(context_unicode, cached_tokens,
detected_text_language_tags, span_begin, span_end,
/*line=*/nullptr, selection_indices, options,
interpreter_manager, embedding_cache,
classification_results, tokens);
}
// Classifies the selected span with the ML classification model and writes
// the single top result to 'classification_results'.
//
// 'cached_tokens' may hold an existing tokenization of the context; when it is
// empty the context is tokenized from scratch. 'tokens', if not null,
// receives the tokens that were used.
//
// Returns true and leaves 'classification_results' untouched when the
// classification mode is not enabled in the triggering options or none of the
// detected locales is supported by the ML model. Returns true with a single
// "other" result when the selection fails one of the sanity checks below.
// Returns false when the span or click position cannot be determined, or when
// feature extraction or inference fails.
bool Annotator::ModelClassifyText(
    const UnicodeText& context_unicode, const std::vector<Token>& cached_tokens,
    const std::vector<Locale>& detected_text_language_tags,
    const UnicodeText::const_iterator& span_begin,
    const UnicodeText::const_iterator& span_end, const UnicodeTextRange* line,
    const CodepointSpan& selection_indices, const BaseOptions& options,
    InterpreterManager* interpreter_manager,
    FeatureProcessor::EmbeddingCache* embedding_cache,
    std::vector<ClassificationResult>* classification_results,
    std::vector<Token>* tokens) const {
  if (model_->triggering_options() == nullptr ||
      !(model_->triggering_options()->enabled_modes() &
        ModeFlag_CLASSIFICATION)) {
    return true;
  }

  if (!Locale::IsAnyLocaleSupported(detected_text_language_tags,
                                    ml_model_triggering_locales_,
                                    /*default_value=*/true)) {
    return true;
  }

  // Callers that are not interested in the tokens may pass nullptr.
  std::vector<Token> local_tokens;
  if (tokens == nullptr) {
    tokens = &local_tokens;
  }

  if (cached_tokens.empty()) {
    *tokens = classification_feature_processor_->Tokenize(context_unicode);
  } else {
    *tokens = internal::CopyCachedTokens(cached_tokens, selection_indices,
                                         ClassifyTextUpperBoundNeededTokens());
  }

  // Start from the invalid marker so that the checks below never read an
  // indeterminate value, whatever RetokenizeAndFindClick does with the
  // out-parameter.
  int click_pos = kInvalidIndex;
  classification_feature_processor_->RetokenizeAndFindClick(
      context_unicode, span_begin, span_end, selection_indices,
      classification_feature_processor_->GetOptions()
          ->only_use_line_with_click(),
      tokens, &click_pos);
  const TokenSpan selection_token_span =
      CodepointSpanToTokenSpan(*tokens, selection_indices);
  const int selection_num_tokens = selection_token_span.Size();

  // Selections longer than the configured maximum are not classified.
  if (model_->classification_options()->max_num_tokens() > 0 &&
      model_->classification_options()->max_num_tokens() <
          selection_num_tokens) {
    *classification_results = {{Collections::Other(), 1.0}};
    return true;
  }

  const FeatureProcessorOptions_::BoundsSensitiveFeatures*
      bounds_sensitive_features =
          classification_feature_processor_->GetOptions()
              ->bounds_sensitive_features();
  if (selection_token_span.first == kInvalidIndex ||
      selection_token_span.second == kInvalidIndex) {
    TC3_LOG(ERROR) << "Could not determine span.";
    return false;
  }

  // Compute the extraction span based on the model type.
  TokenSpan extraction_span;
  if (bounds_sensitive_features && bounds_sensitive_features->enabled()) {
    // The extraction span is the selection span expanded to include a relevant
    // number of tokens outside of the bounds of the selection.
    extraction_span = selection_token_span.Expand(
        /*num_tokens_left=*/bounds_sensitive_features->num_tokens_before(),
        /*num_tokens_right=*/bounds_sensitive_features->num_tokens_after());
  } else {
    if (click_pos == kInvalidIndex) {
      TC3_LOG(ERROR) << "Couldn't choose a click position.";
      return false;
    }
    // The extraction span is the clicked token with context_size tokens on
    // either side.
    const int context_size =
        classification_feature_processor_->GetOptions()->context_size();
    extraction_span = TokenSpan(click_pos).Expand(
        /*num_tokens_left=*/context_size,
        /*num_tokens_right=*/context_size);
  }
  // Clamp the extraction span to the tokens that actually exist.
  extraction_span = IntersectTokenSpans(extraction_span, AllOf(*tokens));

  if (!classification_feature_processor_->HasEnoughSupportedCodepoints(
          *tokens, extraction_span)) {
    *classification_results = {{Collections::Other(), 1.0}};
    return true;
  }

  std::unique_ptr<CachedFeatures> cached_features;
  if (!classification_feature_processor_->ExtractFeatures(
          *tokens, extraction_span, selection_indices,
          embedding_executor_.get(), embedding_cache,
          classification_feature_processor_->EmbeddingSize() +
              classification_feature_processor_->DenseFeaturesCount(),
          &cached_features)) {
    TC3_LOG(ERROR) << "Could not extract features.";
    return false;
  }

  // Assemble the model input for the selection.
  std::vector<float> features;
  features.reserve(cached_features->OutputFeaturesSize());
  if (bounds_sensitive_features && bounds_sensitive_features->enabled()) {
    cached_features->AppendBoundsSensitiveFeaturesForSpan(selection_token_span,
                                                          &features);
  } else {
    cached_features->AppendClickContextFeaturesForClick(click_pos, &features);
  }

  TensorView<float> logits = classification_executor_->ComputeLogits(
      TensorView<float>(features.data(),
                        {1, static_cast<int>(features.size())}),
      interpreter_manager->ClassificationInterpreter());
  if (!logits.is_valid()) {
    TC3_LOG(ERROR) << "Couldn't compute logits.";
    return false;
  }

  // Expect exactly one row with one logit per collection.
  if (logits.dims() != 2 || logits.dim(0) != 1 ||
      logits.dim(1) != classification_feature_processor_->NumCollections()) {
    TC3_LOG(ERROR) << "Mismatching output";
    return false;
  }

  const std::vector<float> scores =
      ComputeSoftmax(logits.data(), logits.dim(1));

  if (scores.empty()) {
    *classification_results = {{Collections::Other(), 1.0}};
    return true;
  }

  const int best_score_index =
      std::max_element(scores.begin(), scores.end()) - scores.begin();
  const std::string top_collection =
      classification_feature_processor_->LabelToCollection(best_score_index);

  // Sanity checks.
  if (top_collection == Collections::Phone()) {
    // A phone number must have a plausible number of digits.
    const int digit_count = std::count_if(span_begin, span_end, IsDigit);
    if (digit_count <
            model_->classification_options()->phone_min_num_digits() ||
        digit_count >
            model_->classification_options()->phone_max_num_digits()) {
      *classification_results = {{Collections::Other(), 1.0}};
      return true;
    }
  } else if (top_collection == Collections::Address()) {
    // An address must span a minimum number of tokens.
    if (selection_num_tokens <
        model_->classification_options()->address_min_num_tokens()) {
      *classification_results = {{Collections::Other(), 1.0}};
      return true;
    }
  } else if (top_collection == Collections::Dictionary()) {
    // Dictionary results are dropped when the vocab annotator is in use or the
    // detected locales are not dictionary locales.
    if ((options.use_vocab_annotator && vocab_annotator_) ||
        !Locale::IsAnyLocaleSupported(detected_text_language_tags,
                                      dictionary_locales_,
                                      /*default_value=*/false)) {
      *classification_results = {{Collections::Other(), 1.0}};
      return true;
    }
  }

  *classification_results = {{top_collection, /*arg_score=*/1.0,
                              /*arg_priority_score=*/scores[best_score_index]}};

  // For some entities, we might want to clamp the priority score, for better
  // conflict resolution between entities.
  if (model_->triggering_options() != nullptr &&
      model_->triggering_options()->collection_to_priority() != nullptr) {
    if (auto entry =
            model_->triggering_options()->collection_to_priority()->LookupByKey(
                top_collection.c_str())) {
      (*classification_results)[0].priority_score *= entry->value();
    }
  }
  return true;
}
bool Annotator::RegexClassifyText(
const std::string& context, const CodepointSpan& selection_indices,
std::vector<ClassificationResult>* classification_result) const {
const std::string selection_text =
UTF8ToUnicodeText(context, /*do_copy=*/false)
.UTF8Substring(selection_indices.first, selection_indices.second);
const UnicodeText selection_text_unicode(
UTF8ToUnicodeText(selection_text, /*do_copy=*/false));
// Check whether any of the regular expressions match.
for (const int pattern_id : classification_regex_patterns_) {
const CompiledRegexPattern& regex_pattern = regex_patterns_[pattern_id];
const std::unique_ptr<UniLib::RegexMatcher> matcher =
regex_pattern.pattern->Matcher(selection_text_unicode);
int status = UniLib::RegexMatcher::kNoError;
bool matches;
if (regex_pattern.config->use_approximate_matching()) {
matches = matcher->ApproximatelyMatches(&status);
} else {
matches = matcher->Matches(&status);
}
if (status != UniLib::RegexMatcher::kNoError) {
return false;
}
if (matches && VerifyRegexMatchCandidate(
context, regex_pattern.config->verification_options(),
selection_text, matcher.get())) {
classification_result->push_back(
{regex_pattern.config->collection_name()->str(),
regex_pattern.config->target_classification_score(),
regex_pattern.config->priority_score()});
if (!SerializedEntityDataFromRegexMatch(
regex_pattern.config, matcher.get(),
&classification_result->back().serialized_entity_data)) {
TC3_LOG(ERROR) << "Could not get entity data.";
return false;
}
}
}
return true;
}
namespace {
// Chooses the collection for a datetime parse result: results with hour,
// minute or second granularity map to the datetime collection, everything
// else maps to the date collection.
std::string PickCollectionForDatetime(
    const DatetimeParseResult& datetime_parse_result) {
  const auto granularity = datetime_parse_result.granularity;
  if (granularity == GRANULARITY_HOUR || granularity == GRANULARITY_MINUTE ||
      granularity == GRANULARITY_SECOND) {
    return Collections::DateTime();
  }
  return Collections::Date();
}
} // namespace
bool Annotator::DatetimeClassifyText(
const std::string& context, const CodepointSpan& selection_indices,
const ClassificationOptions& options,
std::vector<ClassificationResult>* classification_results) const {
if (!datetime_parser_) {
return true;
}
const std::string selection_text =
UTF8ToUnicodeText(context, /*do_copy=*/false)
.UTF8Substring(selection_indices.first, selection_indices.second);
LocaleList locale_list = LocaleList::ParseFrom(options.locales);
StatusOr<std::vector<DatetimeParseResultSpan>> result_status =
datetime_parser_->Parse(selection_text, options.reference_time_ms_utc,
options.reference_timezone, locale_list,
ModeFlag_CLASSIFICATION,
options.annotation_usecase,
/*anchor_start_end=*/true);
if (!result_status.ok()) {
TC3_LOG(ERROR) << "Error during parsing datetime.";
return false;
}
for (const DatetimeParseResultSpan& datetime_span :
result_status.ValueOrDie()) {
// Only consider the result valid if the selection and extracted datetime
// spans exactly match.
if (CodepointSpan(datetime_span.span.first + selection_indices.first,
datetime_span.span.second + selection_indices.first) ==
selection_indices) {
for (const DatetimeParseResult& parse_result : datetime_span.data) {
classification_results->emplace_back(
PickCollectionForDatetime(parse_result),
datetime_span.target_classification_score);
classification_results->back().datetime_parse_result = parse_result;
classification_results->back().serialized_entity_data =
CreateDatetimeSerializedEntityData(parse_result);
classification_results->back().priority_score =
datetime_span.priority_score;
}
return true;
}
}
return true;
}
// Classifies the span 'selection_indices' of 'context'.
//
// Every available annotator is asked for a classification of the exact
// selection; the answers are collected as candidates, conflicts between them
// are resolved, results filtered for classification are dropped, and the
// remaining results are returned sorted from high score to low score.
//
// Returns an empty vector when the input is rejected (too long, invalid
// UTF-8, invalid span), the annotator is not initialized, the usecase is not
// SMART, classification is not enabled, the detected locales are not
// supported, or one of the regex / datetime / ML / conflict-resolution steps
// fails. Returns a single "other" result when no candidate survives.
std::vector<ClassificationResult> Annotator::ClassifyText(
    const std::string& context, const CodepointSpan& selection_indices,
    const ClassificationOptions& options) const {
  if (context.size() > std::numeric_limits<int>::max()) {
    TC3_LOG(ERROR) << "Rejecting too long input: " << context.size();
    return {};
  }
  if (!initialized_) {
    TC3_LOG(ERROR) << "Not initialized";
    return {};
  }
  if (options.annotation_usecase !=
      AnnotationUsecase_ANNOTATION_USECASE_SMART) {
    TC3_LOG(WARNING)
        << "Invoking ClassifyText, which is not supported in RAW mode.";
    return {};
  }
  if (!(model_->enabled_modes() & ModeFlag_CLASSIFICATION)) {
    return {};
  }

  // A parse failure is only logged; classification continues with whatever
  // ParseLocales left in detected_text_language_tags.
  std::vector<Locale> detected_text_language_tags;
  if (!ParseLocales(options.detected_text_language_tags,
                    &detected_text_language_tags)) {
    TC3_LOG(WARNING)
        << "Failed to parse the detected_text_language_tags in options: "
        << options.detected_text_language_tags;
  }
  if (!Locale::IsAnyLocaleSupported(detected_text_language_tags,
                                    model_triggering_locales_,
                                    /*default_value=*/true)) {
    return {};
  }

  const UnicodeText context_unicode =
      UTF8ToUnicodeText(context, /*do_copy=*/false);

  if (!unilib_->IsValidUtf8(context_unicode)) {
    TC3_LOG(ERROR) << "Rejecting input, invalid UTF8.";
    return {};
  }

  if (!IsValidSpanInput(context_unicode, selection_indices)) {
    TC3_VLOG(1) << "Trying to run ClassifyText with invalid input: "
                << selection_indices.first << " " << selection_indices.second;
    return {};
  }

  // We'll accumulate a list of candidates, and pick the best candidate in the
  // end.
  std::vector<AnnotatedSpan> candidates;

  // Try the knowledge engine.
  // TODO(b/126579108): Propagate error status.
  ClassificationResult knowledge_result;
  if (knowledge_engine_ &&
      knowledge_engine_
          ->ClassifyText(context, selection_indices, options.annotation_usecase,
                         options.location_context, Permissions(),
                         &knowledge_result)
          .ok()) {
    candidates.push_back({selection_indices, {knowledge_result}});
    candidates.back().source = AnnotatedSpan::Source::KNOWLEDGE;
  }

  AddContactMetadataToKnowledgeClassificationResults(&candidates);

  // Try the contact engine.
  // TODO(b/126579108): Propagate error status.
  ClassificationResult contact_result;
  if (contact_engine_ && contact_engine_->ClassifyText(
                             context, selection_indices, &contact_result)) {
    candidates.push_back({selection_indices, {contact_result}});
  }

  // Try the person name engine.
  ClassificationResult person_name_result;
  if (person_name_engine_ &&
      person_name_engine_->ClassifyText(context, selection_indices,
                                        &person_name_result)) {
    candidates.push_back({selection_indices, {person_name_result}});
    candidates.back().source = AnnotatedSpan::Source::PERSON_NAME;
  }

  // Try the installed app engine.
  // TODO(b/126579108): Propagate error status.
  ClassificationResult installed_app_result;
  if (installed_app_engine_ &&
      installed_app_engine_->ClassifyText(context, selection_indices,
                                          &installed_app_result)) {
    candidates.push_back({selection_indices, {installed_app_result}});
  }

  // Try the regular expression models.
  std::vector<ClassificationResult> regex_results;
  if (!RegexClassifyText(context, selection_indices, &regex_results)) {
    return {};
  }
  for (const ClassificationResult& result : regex_results) {
    candidates.push_back({selection_indices, {result}});
  }

  // Try the date model.
  //
  // DatetimeClassifyText only returns the first result, which can however have
  // more interpretations. They are inserted in the candidates as a single
  // AnnotatedSpan, so that they get treated together by the conflict resolution
  // algorithm.
  std::vector<ClassificationResult> datetime_results;
  if (!DatetimeClassifyText(context, selection_indices, options,
                            &datetime_results)) {
    return {};
  }
  if (!datetime_results.empty()) {
    candidates.push_back({selection_indices, std::move(datetime_results)});
    candidates.back().source = AnnotatedSpan::Source::DATETIME;
  }

  // Try the number annotator.
  // TODO(b/126579108): Propagate error status.
  ClassificationResult number_annotator_result;
  if (number_annotator_ &&
      number_annotator_->ClassifyText(context_unicode, selection_indices,
                                      options.annotation_usecase,
                                      &number_annotator_result)) {
    candidates.push_back({selection_indices, {number_annotator_result}});
  }

  // Try the duration annotator.
  ClassificationResult duration_annotator_result;
  if (duration_annotator_ &&
      duration_annotator_->ClassifyText(context_unicode, selection_indices,
                                        options.annotation_usecase,
                                        &duration_annotator_result)) {
    candidates.push_back({selection_indices, {duration_annotator_result}});
    candidates.back().source = AnnotatedSpan::Source::DURATION;
  }

  // Try the translate annotator.
  ClassificationResult translate_annotator_result;
  if (translate_annotator_ &&
      translate_annotator_->ClassifyText(context_unicode, selection_indices,
                                         options.user_familiar_language_tags,
                                         &translate_annotator_result)) {
    candidates.push_back({selection_indices, {translate_annotator_result}});
  }

  // Try the grammar model.
  ClassificationResult grammar_annotator_result;
  if (grammar_annotator_ && grammar_annotator_->ClassifyText(
                                detected_text_language_tags, context_unicode,
                                selection_indices, &grammar_annotator_result)) {
    candidates.push_back({selection_indices, {grammar_annotator_result}});
  }

  // Try the POD NER annotator, if requested in the options.
  ClassificationResult pod_ner_annotator_result;
  if (pod_ner_annotator_ && options.use_pod_ner &&
      pod_ner_annotator_->ClassifyText(context_unicode, selection_indices,
                                       &pod_ner_annotator_result)) {
    candidates.push_back({selection_indices, {pod_ner_annotator_result}});
  }

  // Try the vocab annotator, if requested in the options.
  ClassificationResult vocab_annotator_result;
  if (vocab_annotator_ && options.use_vocab_annotator &&
      vocab_annotator_->ClassifyText(
          context_unicode, selection_indices, detected_text_language_tags,
          options.trigger_dictionary_on_beginner_words,
          &vocab_annotator_result)) {
    candidates.push_back({selection_indices, {vocab_annotator_result}});
  }

  // Hand the candidates to the experimental annotator, if there is one.
  if (experimental_annotator_) {
    experimental_annotator_->ClassifyText(context_unicode, selection_indices,
                                          candidates);
  }

  // Try the ML model.
  //
  // The output of the model is considered as an exclusive 1-of-N choice. That's
  // why it's inserted as only 1 AnnotatedSpan into candidates, as opposed to 1
  // span for each candidate, like e.g. the regex model.
  InterpreterManager interpreter_manager(selection_executor_.get(),
                                         classification_executor_.get());
  std::vector<ClassificationResult> model_results;
  std::vector<Token> tokens;
  if (!ModelClassifyText(
          context, /*cached_tokens=*/{}, detected_text_language_tags,
          selection_indices, options, &interpreter_manager,
          /*embedding_cache=*/nullptr, &model_results, &tokens)) {
    return {};
  }
  if (!model_results.empty()) {
    candidates.push_back({selection_indices, std::move(model_results)});
  }

  std::vector<int> candidate_indices;
  if (!ResolveConflicts(candidates, context, tokens,
                        detected_text_language_tags, options,
                        &interpreter_manager, &candidate_indices)) {
    TC3_LOG(ERROR) << "Couldn't resolve conflicts.";
    return {};
  }

  // Collect the classifications of the surviving candidates, dropping the
  // ones that are filtered for classification.
  std::vector<ClassificationResult> results;
  for (const int i : candidate_indices) {
    for (const ClassificationResult& result : candidates[i].classification) {
      if (!FilteredForClassification(result)) {
        results.push_back(result);
      }
    }
  }

  // Sort results according to score, using the shared file-local helper.
  SortClassificationResults(&results);

  if (results.empty()) {
    results = {{Collections::Other(), 1.0}};
  }
  return results;
}
// Tokenizes `context` and, when the ML model is allowed to trigger, runs the
// selection model over it to propose annotated spans.
//
// The tokens of every processed line are appended to `tokens`, even when model
// annotation is skipped (triggering disabled for ANNOTATION mode, or none of
// the detected locales is supported by the ML model). Spans proposed by the
// model, classified and not classified as "other", are appended to `result`
// with codepoint indices relative to the whole `context`.
//
// Returns false if feature extraction, chunking or classification fails.
// Returns true without producing tokens or spans when there is no selection
// feature processor.
bool Annotator::ModelAnnotate(
    const std::string& context,
    const std::vector<Locale>& detected_text_language_tags,
    const AnnotationOptions& options, InterpreterManager* interpreter_manager,
    std::vector<Token>* tokens, std::vector<AnnotatedSpan>* result) const {
  // Tokenization, feature extraction and boundary stripping below all go
  // through the selection feature processor. Without it there is nothing this
  // function can do, so bail out instead of dereferencing a null pointer.
  if (selection_feature_processor_ == nullptr) {
    return true;
  }

  bool skip_model_annotation = false;
  if (model_->triggering_options() == nullptr ||
      !(model_->triggering_options()->enabled_modes() & ModeFlag_ANNOTATION)) {
    skip_model_annotation = true;
  }
  if (!Locale::IsAnyLocaleSupported(detected_text_language_tags,
                                    ml_model_triggering_locales_,
                                    /*default_value=*/true)) {
    skip_model_annotation = true;
  }

  const UnicodeText context_unicode = UTF8ToUnicodeText(context,
                                                        /*do_copy=*/false);
  std::vector<UnicodeTextRange> lines;
  if (!selection_feature_processor_->GetOptions()->only_use_line_with_click()) {
    lines.push_back({context_unicode.begin(), context_unicode.end()});
  } else {
    lines = selection_feature_processor_->SplitContext(
        context_unicode, selection_feature_processor_->GetOptions()
                             ->use_pipe_character_for_newline());
  }

  const float min_annotate_confidence =
      (model_->triggering_options() != nullptr
           ? model_->triggering_options()->min_annotate_confidence()
           : 0.f);

  for (const UnicodeTextRange& line : lines) {
    const std::string line_str =
        UnicodeText::UTF8Substring(line.first, line.second);

    std::vector<Token> line_tokens;
    line_tokens = selection_feature_processor_->Tokenize(line_str);
    selection_feature_processor_->RetokenizeAndFindClick(
        line_str, {0, std::distance(line.first, line.second)},
        selection_feature_processor_->GetOptions()->only_use_line_with_click(),
        &line_tokens,
        /*click_pos=*/nullptr);
    const TokenSpan full_line_span = {
        0, static_cast<TokenIndex>(line_tokens.size())};

    tokens->insert(tokens->end(), line_tokens.begin(), line_tokens.end());

    if (skip_model_annotation) {
      // We do not annotate, we only output the tokens.
      continue;
    }

    // TODO(zilka): Add support for greater granularity of this check.
    if (!selection_feature_processor_->HasEnoughSupportedCodepoints(
            line_tokens, full_line_span)) {
      continue;
    }

    std::unique_ptr<CachedFeatures> cached_features;
    if (!selection_feature_processor_->ExtractFeatures(
            line_tokens, full_line_span,
            /*selection_span_for_feature=*/{kInvalidIndex, kInvalidIndex},
            embedding_executor_.get(),
            /*embedding_cache=*/nullptr,
            selection_feature_processor_->EmbeddingSize() +
                selection_feature_processor_->DenseFeaturesCount(),
            &cached_features)) {
      TC3_LOG(ERROR) << "Could not extract features.";
      return false;
    }

    std::vector<TokenSpan> local_chunks;
    if (!ModelChunk(line_tokens.size(), /*span_of_interest=*/full_line_span,
                    interpreter_manager->SelectionInterpreter(),
                    *cached_features, &local_chunks)) {
      TC3_LOG(ERROR) << "Could not chunk.";
      return false;
    }

    // Codepoint offset of this line within the whole context; used to convert
    // line-relative spans into context-relative ones.
    const int offset = std::distance(context_unicode.begin(), line.first);

    if (local_chunks.empty()) {
      continue;
    }

    const UnicodeText line_unicode =
        UTF8ToUnicodeText(line_str, /*do_copy=*/false);
    // One iterator per codepoint plus the end iterator, so that both ends of a
    // codepoint span can be looked up by index.
    std::vector<UnicodeText::const_iterator> line_codepoints =
        line_unicode.Codepoints();
    line_codepoints.push_back(line_unicode.end());

    FeatureProcessor::EmbeddingCache embedding_cache;
    for (const TokenSpan& chunk : local_chunks) {
      CodepointSpan codepoint_span =
          TokenSpanToCodepointSpan(line_tokens, chunk);
      if (!codepoint_span.IsValid() ||
          codepoint_span.second > line_codepoints.size()) {
        continue;
      }
      codepoint_span = selection_feature_processor_->StripBoundaryCodepoints(
          /*span_begin=*/line_codepoints[codepoint_span.first],
          /*span_end=*/line_codepoints[codepoint_span.second], codepoint_span);
      if (model_->selection_options()->strip_unpaired_brackets()) {
        codepoint_span = StripUnpairedBrackets(
            /*span_begin=*/line_codepoints[codepoint_span.first],
            /*span_end=*/line_codepoints[codepoint_span.second], codepoint_span,
            *unilib_);
      }

      // Skip empty spans.
      if (codepoint_span.first != codepoint_span.second) {
        std::vector<ClassificationResult> classification;
        if (!ModelClassifyText(
                line_unicode, line_tokens, detected_text_language_tags,
                /*span_begin=*/line_codepoints[codepoint_span.first],
                /*span_end=*/line_codepoints[codepoint_span.second], &line,
                codepoint_span, options, interpreter_manager, &embedding_cache,
                &classification, /*tokens=*/nullptr)) {
          TC3_LOG(ERROR) << "Could not classify text: "
                         << (codepoint_span.first + offset) << " "
                         << (codepoint_span.second + offset);
          return false;
        }

        // Do not include the span if it's classified as "other".
        if (!classification.empty() && !ClassifiedAsOther(classification) &&
            classification[0].score >= min_annotate_confidence) {
          AnnotatedSpan result_span;
          result_span.span = {codepoint_span.first + offset,
                              codepoint_span.second + offset};
          result_span.classification = std::move(classification);
          result->push_back(std::move(result_span));
        }
      }
    }
  }
  return true;
}
// Test-only accessor: returns the raw pointer held by
// `selection_feature_processor_`.
const FeatureProcessor* Annotator::SelectionFeatureProcessorForTests() const {
  const FeatureProcessor* const processor = selection_feature_processor_.get();
  return processor;
}
// Test-only accessor: returns the raw pointer held by
// `classification_feature_processor_`.
const FeatureProcessor* Annotator::ClassificationFeatureProcessorForTests()
    const {
  const FeatureProcessor* const processor =
      classification_feature_processor_.get();
  return processor;
}
// Test-only accessor: returns the raw pointer held by `datetime_parser_`.
const DatetimeParser* Annotator::DatetimeParserForTests() const {
  const DatetimeParser* const parser = datetime_parser_.get();
  return parser;
}
void Annotator::RemoveNotEnabledEntityTypes(
const EnabledEntityTypes& is_entity_type_enabled,
std::vector<AnnotatedSpan>* annotated_spans) const {
for (AnnotatedSpan& annotated_span : *annotated_spans) {
std::vector<ClassificationResult>& classifications =
annotated_span.classification;
classifications.erase(
std::remove_if(classifications.begin(), classifications.end(),
[&is_entity_type_enabled](
const ClassificationResult& classification_result) {
return !is_entity_type_enabled(
classification_result.collection);
}),
classifications.end());
}
annotated_spans->erase(
std::remove_if(annotated_spans->begin(), annotated_spans->end(),
[](const AnnotatedSpan& annotated_span) {
return annotated_span.classification.empty();
}),
annotated_spans->end());
}
void Annotator::AddContactMetadataToKnowledgeClassificationResults(
std::vector<AnnotatedSpan>* candidates) const {
if (candidates == nullptr || contact_engine_ == nullptr) {
return;
}
for (auto& candidate : *candidates) {
for (auto& classification_result : candidate.classification) {
contact_engine_->AddContactMetadataToKnowledgeClassificationResult(
&classification_result);
}
}
}
// Annotates a single piece of text with all available annotators.
//
// On entry `candidates` may already contain spans (e.g. produced by the
// knowledge engine); every enabled annotator appends its own candidates to it.
// The candidates are then sorted, conflicts between overlapping spans are
// resolved, exact duplicates are removed, disabled entity types are filtered
// out, and `candidates` is overwritten with the final result.
//
// Returns:
//   - UNAVAILABLE if annotation is not enabled for the model or none of the
//     detected languages is supported,
//   - INTERNAL if any of the annotators or the conflict resolution fails,
//   - OK otherwise.
Status Annotator::AnnotateSingleInput(
    const std::string& context, const AnnotationOptions& options,
    std::vector<AnnotatedSpan>* candidates) const {
  if (!(model_->enabled_modes() & ModeFlag_ANNOTATION)) {
    return Status(StatusCode::UNAVAILABLE, "Model annotation was not enabled.");
  }

  const UnicodeText context_unicode =
      UTF8ToUnicodeText(context, /*do_copy=*/false);

  std::vector<Locale> detected_text_language_tags;
  if (!ParseLocales(options.detected_text_language_tags,
                    &detected_text_language_tags)) {
    TC3_LOG(WARNING)
        << "Failed to parse the detected_text_language_tags in options: "
        << options.detected_text_language_tags;
  }
  if (!Locale::IsAnyLocaleSupported(detected_text_language_tags,
                                    model_triggering_locales_,
                                    /*default_value=*/true)) {
    return Status(
        StatusCode::UNAVAILABLE,
        "The detected language tags are not in the supported locales.");
  }

  InterpreterManager interpreter_manager(selection_executor_.get(),
                                         classification_executor_.get());

  const EnabledEntityTypes is_entity_type_enabled(options.entity_types);
  const bool is_raw_usecase =
      options.annotation_usecase == AnnotationUsecase_ANNOTATION_USECASE_RAW;

  // Annotate with the selection model.
  const bool model_annotations_enabled =
      !is_raw_usecase || IsAnyModelEntityTypeEnabled(is_entity_type_enabled);
  std::vector<Token> tokens;
  if (model_annotations_enabled &&
      !ModelAnnotate(context, detected_text_language_tags, options,
                     &interpreter_manager, &tokens, candidates)) {
    return Status(StatusCode::INTERNAL, "Couldn't run ModelAnnotate.");
  } else if (!model_annotations_enabled) {
    // If the ML model didn't run, we need to tokenize to support the other
    // annotators that depend on the tokens.
    // Optimization could be made to only do this when an annotator that uses
    // the tokens is enabled, but it's unclear if the added complexity is worth
    // it.
    if (selection_feature_processor_ != nullptr) {
      tokens = selection_feature_processor_->Tokenize(context_unicode);
    }
  }

  // Annotate with the regular expression models.
  const bool regex_annotations_enabled =
      !is_raw_usecase || IsAnyRegexEntityTypeEnabled(is_entity_type_enabled);
  if (regex_annotations_enabled &&
      !RegexChunk(
          context_unicode, annotation_regex_patterns_,
          options.is_serialized_entity_data_enabled, is_entity_type_enabled,
          options.annotation_usecase, candidates)) {
    return Status(StatusCode::INTERNAL, "Couldn't run RegexChunk.");
  }

  // Annotate with the datetime model.
  // NOTE: Datetime can be disabled even in the SMART usecase, because it's been
  // relatively slow for some clients.
  if ((is_entity_type_enabled(Collections::Date()) ||
       is_entity_type_enabled(Collections::DateTime())) &&
      !DatetimeChunk(context_unicode, options.reference_time_ms_utc,
                     options.reference_timezone, options.locales,
                     ModeFlag_ANNOTATION, options.annotation_usecase,
                     options.is_serialized_entity_data_enabled, candidates)) {
    return Status(StatusCode::INTERNAL, "Couldn't run DatetimeChunk.");
  }

  // Annotate with the contact engine.
  const bool contact_annotations_enabled =
      !is_raw_usecase || is_entity_type_enabled(Collections::Contact());
  if (contact_annotations_enabled && contact_engine_ &&
      !contact_engine_->Chunk(context_unicode, tokens, candidates)) {
    return Status(StatusCode::INTERNAL, "Couldn't run contact engine Chunk.");
  }

  // Annotate with the installed app engine.
  const bool app_annotations_enabled =
      !is_raw_usecase || is_entity_type_enabled(Collections::App());
  if (app_annotations_enabled && installed_app_engine_ &&
      !installed_app_engine_->Chunk(context_unicode, tokens, candidates)) {
    return Status(StatusCode::INTERNAL,
                  "Couldn't run installed app engine Chunk.");
  }

  // Annotate with the number annotator.
  const bool number_annotations_enabled =
      !is_raw_usecase || (is_entity_type_enabled(Collections::Number()) ||
                          is_entity_type_enabled(Collections::Percentage()));
  if (number_annotations_enabled && number_annotator_ != nullptr &&
      !number_annotator_->FindAll(context_unicode, options.annotation_usecase,
                                  candidates)) {
    return Status(StatusCode::INTERNAL,
                  "Couldn't run number annotator FindAll.");
  }

  // Annotate with the duration annotator.
  const bool duration_annotations_enabled =
      !is_raw_usecase || is_entity_type_enabled(Collections::Duration());
  if (duration_annotations_enabled && duration_annotator_ != nullptr &&
      !duration_annotator_->FindAll(context_unicode, tokens,
                                    options.annotation_usecase, candidates)) {
    return Status(StatusCode::INTERNAL,
                  "Couldn't run duration annotator FindAll.");
  }

  // Annotate with the person name engine.
  const bool person_annotations_enabled =
      !is_raw_usecase || is_entity_type_enabled(Collections::PersonName());
  if (person_annotations_enabled && person_name_engine_ &&
      !person_name_engine_->Chunk(context_unicode, tokens, candidates)) {
    return Status(StatusCode::INTERNAL,
                  "Couldn't run person name engine Chunk.");
  }

  // Annotate with the grammar annotators.
  if (grammar_annotator_ != nullptr &&
      !grammar_annotator_->Annotate(detected_text_language_tags,
                                    context_unicode, candidates)) {
    return Status(StatusCode::INTERNAL, "Couldn't run grammar annotators.");
  }

  // Annotate with the POD NER annotator.
  const bool pod_ner_annotations_enabled =
      !is_raw_usecase || IsAnyPodNerEntityTypeEnabled(is_entity_type_enabled);
  if (pod_ner_annotations_enabled && pod_ner_annotator_ != nullptr &&
      options.use_pod_ner &&
      !pod_ner_annotator_->Annotate(context_unicode, candidates)) {
    return Status(StatusCode::INTERNAL, "Couldn't run POD NER annotator.");
  }

  // Annotate with the vocab annotator.
  const bool vocab_annotations_enabled =
      !is_raw_usecase || is_entity_type_enabled(Collections::Dictionary());
  if (vocab_annotations_enabled && vocab_annotator_ != nullptr &&
      options.use_vocab_annotator &&
      !vocab_annotator_->Annotate(context_unicode, detected_text_language_tags,
                                  options.trigger_dictionary_on_beginner_words,
                                  candidates)) {
    return Status(StatusCode::INTERNAL, "Couldn't run vocab annotator.");
  }

  // Annotate with the experimental annotator.
  if (experimental_annotator_ != nullptr &&
      !experimental_annotator_->Annotate(context_unicode, candidates)) {
    return Status(StatusCode::INTERNAL, "Couldn't run experimental annotator.");
  }

  // Sort candidates according to their position in the input, so that the next
  // code can assume that any connected component of overlapping spans forms a
  // contiguous block.
  // Also sort them according to the end position and collection, so that the
  // deduplication code below can assume that same spans and classifications
  // form contiguous blocks.
  // Candidates without any classification are tolerated (they are dropped
  // further below); among identical spans they are ordered first so that the
  // comparator never reads classification[0] of an empty vector.
  std::stable_sort(candidates->begin(), candidates->end(),
                   [](const AnnotatedSpan& a, const AnnotatedSpan& b) {
                     if (a.span.first != b.span.first) {
                       return a.span.first < b.span.first;
                     }
                     if (a.span.second != b.span.second) {
                       return a.span.second < b.span.second;
                     }
                     const bool a_unclassified = a.classification.empty();
                     const bool b_unclassified = b.classification.empty();
                     if (a_unclassified || b_unclassified) {
                       return a_unclassified && !b_unclassified;
                     }
                     return a.classification[0].collection <
                            b.classification[0].collection;
                   });

  std::vector<int> candidate_indices;
  if (!ResolveConflicts(*candidates, context, tokens,
                        detected_text_language_tags, options,
                        &interpreter_manager, &candidate_indices)) {
    return Status(StatusCode::INTERNAL, "Couldn't resolve conflicts.");
  }

  // Remove candidates that overlap exactly and have the same collection.
  // This can e.g. happen for phone coming from both ML model and regex.
  candidate_indices.erase(
      std::unique(candidate_indices.begin(), candidate_indices.end(),
                  [&candidates](const int a_index, const int b_index) {
                    const AnnotatedSpan& a = (*candidates)[a_index];
                    const AnnotatedSpan& b = (*candidates)[b_index];
                    if (!(a.span == b.span)) {
                      return false;
                    }
                    // Unclassified candidates only duplicate each other.
                    if (a.classification.empty() ||
                        b.classification.empty()) {
                      return a.classification.empty() &&
                             b.classification.empty();
                    }
                    return a.classification[0].collection ==
                           b.classification[0].collection;
                  }),
      candidate_indices.end());

  std::vector<AnnotatedSpan> result;
  result.reserve(candidate_indices.size());
  for (const int i : candidate_indices) {
    if ((*candidates)[i].classification.empty() ||
        ClassifiedAsOther((*candidates)[i].classification) ||
        FilteredForAnnotation((*candidates)[i])) {
      continue;
    }
    // Indices are unique at this point, so each candidate is moved at most
    // once.
    result.push_back(std::move((*candidates)[i]));
  }

  // We generate all candidates and remove them later (with the exception of
  // date/time/duration entities) because there are complex interdependencies
  // between the entity types. E.g., the TLD of an email can be interpreted as a
  // URL, but most likely a user of the API does not want such annotations if
  // "url" is enabled and "email" is not.
  RemoveNotEnabledEntityTypes(is_entity_type_enabled, &result);

  for (AnnotatedSpan& annotated_span : result) {
    SortClassificationResults(&annotated_span.classification);
  }

  // `result` is not used afterwards; hand over its storage instead of copying.
  *candidates = std::move(result);
  return Status::OK;
}
// Annotates several text fragments in one call.
//
// The knowledge engine (if present) is run once over all fragments; every
// other annotator is run per fragment through AnnotateSingleInput(). The
// returned Annotations hold one list of annotated spans per input fragment, in
// input order. Any failure is reported as a non-OK status.
StatusOr<Annotations> Annotator::AnnotateStructuredInput(
    const std::vector<InputFragment>& string_fragments,
    const AnnotationOptions& options) const {
  const size_t num_fragments = string_fragments.size();

  Annotations annotation_candidates;
  annotation_candidates.annotated_spans.resize(num_fragments);

  // Split the fragments into the parallel text/metadata lists expected by the
  // knowledge engine.
  std::vector<std::string> texts;
  std::vector<FragmentMetadata> metadata;
  texts.reserve(num_fragments);
  metadata.reserve(num_fragments);
  for (const InputFragment& fragment : string_fragments) {
    texts.push_back(fragment.text);
    metadata.push_back(
        {.relative_bounding_box_top = fragment.bounding_box_top,
         .relative_bounding_box_height = fragment.bounding_box_height});
  }

  // KnowledgeEngine is special, because it supports annotation of multiple
  // fragments at once.
  if (knowledge_engine_) {
    const bool knowledge_ok =
        knowledge_engine_
            ->ChunkMultipleSpans(texts, metadata, options.annotation_usecase,
                                 options.location_context, options.permissions,
                                 options.annotate_mode, &annotation_candidates)
            .ok();
    if (!knowledge_ok) {
      return Status(StatusCode::INTERNAL,
                    "Couldn't run knowledge engine Chunk.");
    }
  }

  // The annotator engines shouldn't change the number of annotation vectors.
  const size_t num_span_lists = annotation_candidates.annotated_spans.size();
  if (num_span_lists != texts.size()) {
    TC3_LOG(ERROR) << "Received " << texts.size()
                   << " texts to annotate but generated a different number of "
                      "lists of annotations:"
                   << annotation_candidates.annotated_spans.size();
    return Status(StatusCode::INTERNAL,
                  "Number of annotation candidates differs from "
                  "number of texts to annotate.");
  }

  // As an optimization, if the only annotated type is Entity, we skip all the
  // other annotators than the KnowledgeEngine. This only happens in the raw
  // mode, to make sure it does not affect the result.
  const bool only_entity_requested =
      options.annotation_usecase == ANNOTATION_USECASE_RAW &&
      options.entity_types.size() == 1 &&
      *options.entity_types.begin() == Collections::Entity();
  if (only_entity_requested) {
    return annotation_candidates;
  }

  // Other annotators run on each fragment independently.
  for (size_t index = 0; index < texts.size(); ++index) {
    const InputFragment& fragment = string_fragments[index];
    auto& fragment_spans = annotation_candidates.annotated_spans[index];

    // A fragment may carry its own reference time/timezone, which then
    // overrides the one given in the shared options.
    AnnotationOptions fragment_options = options;
    if (fragment.datetime_options.has_value()) {
      const DatetimeOptions fragment_datetime =
          fragment.datetime_options.value();
      fragment_options.reference_time_ms_utc =
          fragment_datetime.reference_time_ms_utc;
      fragment_options.reference_timezone =
          fragment_datetime.reference_timezone;
    }

    AddContactMetadataToKnowledgeClassificationResults(&fragment_spans);

    Status fragment_status =
        AnnotateSingleInput(texts[index], fragment_options, &fragment_spans);
    if (!fragment_status.ok()) {
      return fragment_status;
    }
  }
  return annotation_candidates;
}
std::vector<AnnotatedSpan> Annotator::Annotate(
const std::string& context, const AnnotationOptions& options) const {
if (context.size() > std::numeric_limits<int>::max()) {
TC3_LOG(ERROR) << "Rejecting too long input.";
return {};
}
const UnicodeText context_unicode =
UTF8ToUnicodeText(context, /*do_copy=*/false);
if (!unilib_->IsValidUtf8(context_unicode)) {
TC3_LOG(ERROR) << "Rejecting input, invalid UTF8.";
return {};
}
std::vector<InputFragment> string_fragments;
string_fragments.push_back({.text = context});
StatusOr<Annotations> annotations =
AnnotateStructuredInput(string_fragments, options);
if (!annotations.ok()) {
TC3_LOG(ERROR) << "Returned error when calling AnnotateStructuredInput: "
<< annotations.status().error_message();
return {};
}
return annotations.ValueOrDie().annotated_spans[0];
}
// Derives the selection span from a regex match.
//
// Without capturing group configuration, the span of the first capturing group
// is used. Otherwise the result is the smallest span covering every matched
// group that is configured with `extend_selection`. Returns
// {kInvalidIndex, kInvalidIndex} if the matcher reports an error, or if no
// configured group contributed.
CodepointSpan Annotator::ComputeSelectionBoundaries(
    const UniLib::RegexMatcher* match,
    const RegexModel_::Pattern* config) const {
  const auto* groups = config->capturing_group();

  if (groups == nullptr) {
    // Use first capturing group to specify the selection.
    int status = UniLib::RegexMatcher::kNoError;
    const int first_group_start = match->Start(1, &status);
    const int first_group_end = match->End(1, &status);
    if (status != UniLib::RegexMatcher::kNoError) {
      return {kInvalidIndex, kInvalidIndex};
    }
    const CodepointSpan first_group_span = {first_group_start,
                                            first_group_end};
    return first_group_span;
  }

  CodepointSpan selection = {kInvalidIndex, kInvalidIndex};
  const int group_count = groups->size();
  for (int group_id = 0; group_id < group_count; ++group_id) {
    if (!groups->Get(group_id)->extend_selection()) {
      continue;
    }

    // Check match and adjust bounds.
    int status = UniLib::RegexMatcher::kNoError;
    const int group_start = match->Start(group_id, &status);
    const int group_end = match->End(group_id, &status);
    if (status != UniLib::RegexMatcher::kNoError) {
      return {kInvalidIndex, kInvalidIndex};
    }

    // The group did not participate in the match.
    if (group_start == kInvalidIndex || group_end == kInvalidIndex) {
      continue;
    }

    if (selection.first == kInvalidIndex) {
      // First contributing group: it defines the initial span.
      selection = {group_start, group_end};
      continue;
    }
    selection.first = std::min(selection.first, group_start);
    selection.second = std::max(selection.second, group_end);
  }
  return selection;
}
// Tells whether `pattern` carries any entity data: either fixed data on the
// pattern itself, or, on any of its capturing groups, an entity field path or
// fixed data.
bool Annotator::HasEntityData(const RegexModel_::Pattern* pattern) const {
  const bool pattern_has_fixed_data =
      pattern->serialized_entity_data() != nullptr ||
      pattern->entity_data() != nullptr;
  if (pattern_has_fixed_data) {
    return true;
  }

  if (pattern->capturing_group() == nullptr) {
    return false;
  }

  for (const CapturingGroup* capturing_group : *pattern->capturing_group()) {
    const bool group_has_data =
        capturing_group->entity_field_path() != nullptr ||
        capturing_group->serialized_entity_data() != nullptr ||
        capturing_group->entity_data() != nullptr;
    if (group_has_data) {
      return true;
    }
  }
  return false;
}
// Builds the serialized entity data for a regex match.
//
// The data is assembled from, in this order:
//   1. the fixed entity data attached to the pattern,
//   2. for every capturing group that matched: the fixed entity data attached
//      to the group, and the (optionally normalized) matched text written to
//      the group's entity field path.
//
// If the pattern carries no entity data at all, `serialized_entity_data` is
// cleared and true is returned. Returns false if a capturing group text could
// not be set on its entity field.
bool Annotator::SerializedEntityDataFromRegexMatch(
    const RegexModel_::Pattern* pattern, UniLib::RegexMatcher* matcher,
    std::string* serialized_entity_data) const {
  if (!HasEntityData(pattern)) {
    serialized_entity_data->clear();
    return true;
  }
  TC3_CHECK(entity_data_builder_ != nullptr);

  std::unique_ptr<MutableFlatbuffer> entity_data =
      entity_data_builder_->NewRoot();

  TC3_CHECK(entity_data != nullptr);

  // Set fixed entity data.
  if (pattern->serialized_entity_data() != nullptr) {
    entity_data->MergeFromSerializedFlatbuffer(
        StringPiece(pattern->serialized_entity_data()->c_str(),
                    pattern->serialized_entity_data()->size()));
  }
  if (pattern->entity_data() != nullptr) {
    entity_data->MergeFrom(
        reinterpret_cast<const flatbuffers::Table*>(pattern->entity_data()));
  }

  // Add entity data from rule capturing groups.
  if (pattern->capturing_group() != nullptr) {
    const int num_groups = pattern->capturing_group()->size();
    for (int i = 0; i < num_groups; i++) {
      const CapturingGroup* group = pattern->capturing_group()->Get(i);

      // Check whether the group matched.
      Optional<std::string> group_match_text =
          GetCapturingGroupText(matcher, /*group_id=*/i);
      if (!group_match_text.has_value()) {
        continue;
      }

      // Set fixed entity data from capturing group match.
      if (group->serialized_entity_data() != nullptr) {
        entity_data->MergeFromSerializedFlatbuffer(
            StringPiece(group->serialized_entity_data()->c_str(),
                        group->serialized_entity_data()->size()));
      }
      if (group->entity_data() != nullptr) {
        // Merge the data of the *group*; the pattern-level data was already
        // merged above and may not even be present here.
        entity_data->MergeFrom(reinterpret_cast<const flatbuffers::Table*>(
            group->entity_data()));
      }

      // Set entity field from capturing group text.
      if (group->entity_field_path() != nullptr) {
        // `group_match_text` outlives this view, so no copy is needed.
        UnicodeText normalized_group_match_text =
            UTF8ToUnicodeText(group_match_text.value(), /*do_copy=*/false);

        // Apply normalization if specified.
        if (group->normalization_options() != nullptr) {
          normalized_group_match_text =
              NormalizeText(*unilib_, group->normalization_options(),
                            normalized_group_match_text);
        }

        if (!entity_data->ParseAndSet(
                group->entity_field_path(),
                normalized_group_match_text.ToUTF8String())) {
          TC3_LOG(ERROR)
              << "Could not set entity data from rule capturing group.";
          return false;
        }
      }
    }
  }

  *serialized_entity_data = entity_data->Serialize();
  return true;
}
// Returns the codepoints of `amount` that precede `it_decimal_separator` (or
// all of them if that iterator is `amount.end()`), with every codepoint
// contained in `decimal_separators` removed.
UnicodeText RemoveMoneySeparators(
    const std::unordered_set<char32>& decimal_separators,
    const UnicodeText& amount,
    UnicodeText::const_iterator it_decimal_separator) {
  UnicodeText whole_amount;
  for (auto it = amount.begin();
       it != amount.end() && it != it_decimal_separator; ++it) {
    // Use the set's own lookup rather than a linear std::find over it.
    if (decimal_separators.count(static_cast<char32>(*it)) == 0) {
      whole_amount.push_back(*it);
    }
  }
  return whole_amount;
}
// Looks for a money quantity word among the capturing groups of a regex match.
//
// The lowercased text of each matched group is looked up in the model's
// `quantities_name_to_exponent` table. On the first hit, `quantity` holds that
// text and `exponent` the associated value. If nothing is found, `exponent` is
// set to 0; `quantity` then holds the text of the last group that was examined
// (or is left untouched if none was) and should be ignored by the caller.
void Annotator::GetMoneyQuantityFromCapturingGroup(
    const UniLib::RegexMatcher* match, const RegexModel_::Pattern* config,
    const UnicodeText& context_unicode, std::string* quantity,
    int* exponent) const {
  *exponent = 0;
  if (config->capturing_group() == nullptr) {
    return;
  }

  // The lookup table does not change while iterating; fetch it once and bail
  // out if the model does not provide one.
  if (model_->money_parsing_options() == nullptr) {
    return;
  }
  const auto* quantities_name_to_exponent =
      model_->money_parsing_options()->quantities_name_to_exponent();
  if (quantities_name_to_exponent == nullptr) {
    return;
  }

  const int num_groups = config->capturing_group()->size();
  for (int i = 0; i < num_groups; i++) {
    int status = UniLib::RegexMatcher::kNoError;
    const int group_start = match->Start(i, &status);
    const int group_end = match->End(i, &status);
    // Do not trust the indices if the matcher reported an error.
    if (status != UniLib::RegexMatcher::kNoError) {
      continue;
    }
    // The group did not participate in the match.
    if (group_start == kInvalidIndex || group_end == kInvalidIndex) {
      continue;
    }

    *quantity =
        unilib_
            ->ToLowerText(UnicodeText::Substring(context_unicode, group_start,
                                                 group_end, /*do_copy=*/false))
            .ToUTF8String();

    if (auto entry =
            quantities_name_to_exponent->LookupByKey((*quantity).c_str())) {
      *exponent = entry->value();
      return;
    }
  }
}
// Parses the unnormalized money amount stored in `serialized_entity_data` and
// writes the whole part, decimal part and nanos back into it.
//
// The last separator of the amount is taken as the decimal separator, unless
// it is followed by exactly 3 codepoints, in which case it is considered a
// thousands separator and the amount an integer. If the model provides a
// quantity table and the match contains a quantity word (looked up via
// GetMoneyQuantityFromCapturingGroup), the amount is scaled by the
// corresponding power of ten and the word is appended to the unnormalized
// amount.
//
// Returns false, leaving `serialized_entity_data` untouched, if the entity data
// cannot be loaded, has no money information, or if the amount cannot be
// parsed.
bool Annotator::ParseAndFillInMoneyAmount(
    std::string* serialized_entity_data, const UniLib::RegexMatcher* match,
    const RegexModel_::Pattern* config,
    const UnicodeText& context_unicode) const {
  std::unique_ptr<EntityDataT> data =
      LoadAndVerifyMutableFlatbuffer<libtextclassifier3::EntityData>(
          *serialized_entity_data);
  if (data == nullptr) {
    if (model_->version() >= 706) {
      // This way of parsing money entity data is enabled for models newer than
      // v706, consequently logging errors only for them (b/156634162).
      TC3_LOG(ERROR)
          << "Data field is null when trying to parse Money Entity Data";
    }
    return false;
  }
  // The entity data may not contain a money table at all; everything below
  // dereferences it.
  if (data->money == nullptr) {
    if (model_->version() >= 706) {
      TC3_LOG(ERROR)
          << "Data money field is null when trying to parse Money Entity Data";
    }
    return false;
  }
  if (data->money->unnormalized_amount.empty()) {
    if (model_->version() >= 706) {
      // This way of parsing money entity data is enabled for models newer than
      // v706, consequently logging errors only for them (b/156634162).
      TC3_LOG(ERROR)
          << "Data unnormalized_amount is empty when trying to parse "
             "Money Entity Data";
    }
    return false;
  }

  UnicodeText amount =
      UTF8ToUnicodeText(data->money->unnormalized_amount, /*do_copy=*/false);

  // Walk backwards from the last codepoint to the last separator, counting how
  // many codepoints follow it. The first codepoint itself is never examined.
  int separator_back_index = 0;
  auto it_decimal_separator = --amount.end();
  for (; it_decimal_separator != amount.begin();
       --it_decimal_separator, ++separator_back_index) {
    if (money_separators_.count(static_cast<char32>(*it_decimal_separator)) !=
        0) {
      break;
    }
  }

  // If there are 3 digits after the last separator, we consider that a
  // thousands separator => the number is an int (e.g. 1.234 is considered int).
  // If there is no separator in number, also that number is an int.
  if (separator_back_index == 3 || it_decimal_separator == amount.begin()) {
    it_decimal_separator = amount.end();
  }

  if (!unilib_->ParseInt32(RemoveMoneySeparators(money_separators_, amount,
                                                 it_decimal_separator),
                           &data->money->amount_whole_part)) {
    TC3_LOG(ERROR) << "Could not parse the money whole part as int32 from the "
                      "amount: "
                   << data->money->unnormalized_amount;
    return false;
  }

  if (it_decimal_separator == amount.end()) {
    data->money->amount_decimal_part = 0;
    data->money->nanos = 0;
  } else {
    const int amount_codepoints_size = amount.size_codepoints();
    const UnicodeText decimal_part = UnicodeText::Substring(
        amount, amount_codepoints_size - separator_back_index,
        amount_codepoints_size, /*do_copy=*/false);
    if (!unilib_->ParseInt32(decimal_part, &data->money->amount_decimal_part)) {
      TC3_LOG(ERROR) << "Could not parse the money decimal part as int32 from "
                        "the amount: "
                     << data->money->unnormalized_amount;
      return false;
    }
    // Scale the decimal part to nanos (1e-9 units).
    data->money->nanos = data->money->amount_decimal_part *
                         pow(10, 9 - decimal_part.size_codepoints());
  }

  if (model_->money_parsing_options() != nullptr &&
      model_->money_parsing_options()->quantities_name_to_exponent() !=
          nullptr) {
    int quantity_exponent = 0;
    std::string quantity;
    GetMoneyQuantityFromCapturingGroup(match, config, context_unicode,
                                       &quantity, &quantity_exponent);
    if (quantity_exponent > 0 && quantity_exponent <= 9) {
      const double amount_whole_part =
          data->money->amount_whole_part * pow(10, quantity_exponent) +
          data->money->nanos / pow(10, 9 - quantity_exponent);
      // TODO(jacekj): Change type of `data->money->amount_whole_part` to int64
      // (and `std::numeric_limits<int>::max()` to
      // `std::numeric_limits<int64>::max()`).
      if (amount_whole_part < std::numeric_limits<int>::max()) {
        data->money->amount_whole_part = amount_whole_part;
        data->money->nanos = data->money->nanos %
                             static_cast<int>(pow(10, 9 - quantity_exponent)) *
                             pow(10, quantity_exponent);
      }
    }
    if (quantity_exponent > 0) {
      data->money->unnormalized_amount = strings::JoinStrings(
          " ", {data->money->unnormalized_amount, quantity});
    }
  }

  *serialized_entity_data =
      PackFlatbuffer<libtextclassifier3::EntityData>(data.get());
  return true;
}
bool