blob: 1339223d9de90424888440f18c131d3663e21bcc [file] [log] [blame]
/*
* Copyright (C) 2018 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "lang_id/lang-id.h"
#include <stdio.h>
#include <algorithm>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "lang_id/common/embedding-feature-interface.h"
#include "lang_id/common/embedding-network-params.h"
#include "lang_id/common/embedding-network.h"
#include "lang_id/common/fel/feature-extractor.h"
#include "lang_id/common/lite_base/logging.h"
#include "lang_id/common/lite_strings/numbers.h"
#include "lang_id/common/lite_strings/str-split.h"
#include "lang_id/common/lite_strings/stringpiece.h"
#include "lang_id/common/math/algorithm.h"
#include "lang_id/common/math/softmax.h"
#include "lang_id/custom-tokenizer.h"
#include "lang_id/features/light-sentence-features.h"
#include "lang_id/light-sentence.h"
namespace libtextclassifier3 {
namespace mobile {
namespace lang_id {
namespace {
// Default value for the confidence threshold. If the confidence of the top
// prediction is below this threshold, then FindLanguage() returns
// LangId::kUnknownLanguageCode. Note: this is just a default value; if the
// TaskSpec from the model specifies a "reliability_thresh" parameter, then we
// use that value instead. Note: for legacy reasons, our code and comments use
// the terms "confidence", "probability" and "reliability" equivalently.
static const float kDefaultConfidenceThreshold = 0.50f;
} // namespace
// Class that performs all work behind LangId.
class LangIdImpl {
public:
explicit LangIdImpl(std::unique_ptr<ModelProvider> model_provider)
: model_provider_(std::move(model_provider)),
lang_id_brain_interface_("language_identifier") {
// Note: in the code below, we set valid_ to true only if all initialization
// steps completed successfully. Otherwise, we return early, leaving valid_
// to its default value false.
if (!model_provider_ || !model_provider_->is_valid()) {
SAFTM_LOG(ERROR) << "Invalid model provider";
return;
}
auto *nn_params = model_provider_->GetNnParams();
if (!nn_params) {
SAFTM_LOG(ERROR) << "No NN params";
return;
}
network_.reset(new EmbeddingNetwork(nn_params));
languages_ = model_provider_->GetLanguages();
if (languages_.empty()) {
SAFTM_LOG(ERROR) << "No known languages";
return;
}
TaskContext context = *model_provider_->GetTaskContext();
if (!Setup(&context)) {
SAFTM_LOG(ERROR) << "Unable to Setup() LangId";
return;
}
if (!Init(&context)) {
SAFTM_LOG(ERROR) << "Unable to Init() LangId";
return;
}
valid_ = true;
}
string FindLanguage(StringPiece text) const {
// NOTE: it would be wasteful to implement this method in terms of
// FindLanguages(). We just need the most likely language and its
// probability; no need to compute (and allocate) a vector of pairs for all
// languages, nor to compute probabilities for all non-top languages.
if (!is_valid()) {
return LangId::kUnknownLanguageCode;
}
// Create a Sentence storing the input text.
LightSentence sentence;
tokenizer_.Tokenize(text, &sentence);
// Test input size here, after pre-processing removed irrelevant chars.
if (IsTooShort(sentence)) {
return LangId::kUnknownLanguageCode;
}
std::vector<float> scores;
ComputeScores(&sentence, &scores);
int prediction_id = GetArgMax(scores);
const string language = GetLanguageForSoftmaxLabel(prediction_id);
float probability = ComputeSoftmaxProbability(scores, prediction_id);
SAFTM_DLOG(INFO) << "Predicted " << language
<< " with prob: " << probability << " for \"" << text
<< "\"";
// Find confidence threshold for language.
float threshold = default_threshold_;
auto it = per_lang_thresholds_.find(language);
if (it != per_lang_thresholds_.end()) {
threshold = it->second;
}
if (probability < threshold) {
SAFTM_DLOG(INFO) << " below threshold => "
<< LangId::kUnknownLanguageCode;
return LangId::kUnknownLanguageCode;
}
return language;
}
void FindLanguages(StringPiece text, LangIdResult *result) const {
if (result == nullptr) return;
result->predictions.clear();
if (!is_valid()) {
result->predictions.emplace_back(LangId::kUnknownLanguageCode, 1);
return;
}
// Create a Sentence storing the input text.
LightSentence sentence;
tokenizer_.Tokenize(text, &sentence);
// Test input size here, after pre-processing removed irrelevant chars.
if (IsTooShort(sentence)) {
result->predictions.emplace_back(LangId::kUnknownLanguageCode, 1);
return;
}
std::vector<float> scores;
ComputeScores(&sentence, &scores);
// Compute and sort softmax in descending order by probability and convert
// IDs to language code strings. When probabilities are equal, we sort by
// language code string in ascending order.
std::vector<float> softmax = ComputeSoftmax(scores);
for (int i = 0; i < softmax.size(); ++i) {
result->predictions.emplace_back(GetLanguageForSoftmaxLabel(i),
softmax[i]);
}
// Sort the resulting language predictions by probability in descending
// order.
std::sort(result->predictions.begin(), result->predictions.end(),
[](const std::pair<string, float> &a,
const std::pair<string, float> &b) {
if (a.second == b.second) {
return a.first.compare(b.first) < 0;
} else {
return a.second > b.second;
}
});
}
bool is_valid() const { return valid_; }
int GetModelVersion() const { return model_version_; }
// Returns a property stored in the model file.
template <typename T, typename R>
R GetProperty(const string &property, T default_value) const {
return model_provider_->GetTaskContext()->Get(property, default_value);
}
private:
bool Setup(TaskContext *context) {
tokenizer_.Setup(context);
if (!lang_id_brain_interface_.SetupForProcessing(context)) return false;
min_text_size_in_bytes_ = context->Get("min_text_size_in_bytes", 0);
default_threshold_ =
context->Get("reliability_thresh", kDefaultConfidenceThreshold);
// Parse task parameter "per_lang_reliability_thresholds", fill
// per_lang_thresholds_.
const string thresholds_str =
context->Get("per_lang_reliability_thresholds", "");
std::vector<StringPiece> tokens = LiteStrSplit(thresholds_str, ',');
for (const auto &token : tokens) {
if (token.empty()) continue;
std::vector<StringPiece> parts = LiteStrSplit(token, '=');
float threshold = 0.0f;
if ((parts.size() == 2) && LiteAtof(parts[1], &threshold)) {
per_lang_thresholds_[string(parts[0])] = threshold;
} else {
SAFTM_LOG(ERROR) << "Broken token: \"" << token << "\"";
}
}
model_version_ = context->Get("model_version", model_version_);
return true;
}
bool Init(TaskContext *context) {
return lang_id_brain_interface_.InitForProcessing(context);
}
// Extracts features for |text|, runs them through the feed-forward neural
// network, and computes the output scores (activations from the last layer).
// These scores can be used to compute the softmax probabilities for our
// labels (in this case, the languages).
void ComputeScores(LightSentence* sentence, std::vector<float> *scores) const {
std::vector<FeatureVector> features =
lang_id_brain_interface_.GetFeaturesNoCaching(sentence);
// Run feed-forward neural network to compute scores.
network_->ComputeFinalScores(features, scores);
}
// Returns language code for a softmax label. See comments for languages_
// field. If label is out of range, returns LangId::kUnknownLanguageCode.
string GetLanguageForSoftmaxLabel(int label) const {
if ((label >= 0) && (label < languages_.size())) {
return languages_[label];
} else {
SAFTM_LOG(ERROR) << "Softmax label " << label << " outside range [0, "
<< languages_.size() << ")";
return LangId::kUnknownLanguageCode;
}
}
bool IsTooShort(const LightSentence &sentence) const {
int text_size = 0;
for (const std::string &token : sentence) {
// Each token has the form ^...$: we subtract 2 because we want to count
// only the real text, not the chars added by us.
text_size += token.size() - 2;
}
return text_size < min_text_size_in_bytes_;
}
std::unique_ptr<ModelProvider> model_provider_;
TokenizerForLangId tokenizer_;
EmbeddingFeatureInterface<LightSentenceExtractor, LightSentence>
lang_id_brain_interface_;
// Neural network to use for scoring.
std::unique_ptr<EmbeddingNetwork> network_;
// True if this object is ready to perform language predictions.
bool valid_ = false;
// The model returns LangId::kUnknownLanguageCode for input text that has
// fewer than min_text_size_in_bytes_ bytes (excluding ASCII whitespaces,
// digits, and punctuation).
int min_text_size_in_bytes_ = 0;
// Only predictions with a probability (confidence) above this threshold are
// reported. Otherwise, we report LangId::kUnknownLanguageCode.
float default_threshold_ = kDefaultConfidenceThreshold;
std::unordered_map<string, float> per_lang_thresholds_;
// Recognized languages: softmax label i means languages_[i] (something like
// "en", "fr", "ru", etc).
std::vector<string> languages_;
// Version of the model used by this LangIdImpl object. Zero means that the
// model version could not be determined.
int model_version_ = 0;
};
const char LangId::kUnknownLanguageCode[] = "und";
LangId::LangId(std::unique_ptr<ModelProvider> model_provider)
: pimpl_(new LangIdImpl(std::move(model_provider))) {}
LangId::~LangId() = default;
string LangId::FindLanguage(const char *data, size_t num_bytes) const {
StringPiece text(data, num_bytes);
return pimpl_->FindLanguage(text);
}
void LangId::FindLanguages(const char *data, size_t num_bytes,
LangIdResult *result) const {
SAFTM_DCHECK(result) << "LangIdResult must not be null.";
StringPiece text(data, num_bytes);
pimpl_->FindLanguages(text, result);
}
bool LangId::is_valid() const { return pimpl_->is_valid(); }
int LangId::GetModelVersion() const { return pimpl_->GetModelVersion(); }
float LangId::GetFloatProperty(const string &property,
float default_value) const {
return pimpl_->GetProperty<float, float>(property, default_value);
}
} // namespace lang_id
} // namespace mobile
} // namespace nlp_saft