/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "lang_id/features/char-ngram-feature.h"
#include <string>
#include <utility>
#include <vector>
#include "lang_id/common/fel/feature-types.h"
#include "lang_id/common/fel/task-context.h"
#include "lang_id/common/lite_base/logging.h"
#include "lang_id/common/math/hash.h"
#include "lang_id/common/utf8.h"
namespace libtextclassifier3 {
namespace mobile {
namespace lang_id {
bool ContinuousBagOfNgramsFunction::Setup(TaskContext *context) {
  // Parameters in the feature function descriptor.
  bool include_terminators = GetBoolParameter("include_terminators", false);
  if (!include_terminators) {
    SAFTM_LOG(ERROR) << "No support for include_terminators=false";
    return false;
  }

  bool include_spaces = GetBoolParameter("include_spaces", false);
  if (include_spaces) {
    SAFTM_LOG(ERROR) << "No support for include_spaces=true";
    return false;
  }

  bool use_equal_ngram_weight = GetBoolParameter("use_equal_weight", false);
  if (use_equal_ngram_weight) {
    SAFTM_LOG(ERROR) << "No support for use_equal_weight=true";
    return false;
  }
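  // Dimension of the ngram id space (ngram ids are hashes modulo this value)
  // and the ngram length, in UTF8 characters.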
  ngram_id_dimension_ = GetIntParameter("id_dim", 10000);
  ngram_size_ = GetIntParameter("size", 3);
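  // One count per possible ngram id.  Evaluate() resets the entries it
  // touches, so the vector stays all-zero between calls.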
  counts_.assign(ngram_id_dimension_, 0);
  return true;
}
bool ContinuousBagOfNgramsFunction::Init(TaskContext *context) {
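  // Ngram ids lie in [0, ngram_id_dimension_): see ComputeNgramCounts().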
  set_feature_type(new NumericFeatureType(name(), ngram_id_dimension_));
  return true;
}
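// Counts all character ngrams of ngram_size_ UTF8 characters in sentence.
// On return, counts_[id] holds the count for ngram id |id|, the ids with
// non-zero counts are listed in non_zero_count_indices_, and the total number
// of ngram occurrences is the return value.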
int ContinuousBagOfNgramsFunction::ComputeNgramCounts(
    const LightSentence &sentence) const {
  SAFTM_CHECK_EQ(counts_.size(), ngram_id_dimension_);
  SAFTM_CHECK_EQ(non_zero_count_indices_.size(), 0);

  int total_count = 0;

  for (const std::string &word : sentence) {
    const char *const word_end = word.data() + word.size();

    // Set ngram_start at the start of the current token (word).
    const char *ngram_start = word.data();

    // Set ngram_end ngram_size UTF8 characters after ngram_start.  Note: each
    // UTF8 character contains between 1 and 4 bytes.
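    // (E.g., ASCII characters take 1 byte, while many accented Latin letters
    // take 2 bytes and most CJK characters take 3.)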
    const char *ngram_end = ngram_start;
    int num_utf8_chars = 0;
    do {
      ngram_end += utils::OneCharLen(ngram_end);
      num_utf8_chars++;
    } while ((num_utf8_chars < ngram_size_) && (ngram_end < word_end));

    if (num_utf8_chars < ngram_size_) {
      // Current token is so small, it does not contain a single ngram of
      // ngram_size UTF8 characters.  Not much we can do in this case ...
      continue;
    }

    // At this point, [ngram_start, ngram_end) is the first ngram of
    // ngram_size UTF8 characters from current token.
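    // E.g., if the current token is "abc" and ngram_size_ is 2, the loop
    // below visits the ngrams "ab" and "bc".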
    while (true) {
      // Compute ngram id: hash(ngram) % ngram_id_dimension
      int ngram_id = (
          utils::Hash32WithDefaultSeed(ngram_start, ngram_end - ngram_start)
          % ngram_id_dimension_);
      // Use a reference to the actual count, such that we can both test
      // whether the count was 0 and increment it without performing two
      // lookups.
      int &ref_to_count_for_ngram = counts_[ngram_id];
      if (ref_to_count_for_ngram == 0) {
        non_zero_count_indices_.push_back(ngram_id);
      }
      ref_to_count_for_ngram++;
      total_count++;

      if (ngram_end >= word_end) {
        break;
      }

      // Advance both ngram_start and ngram_end by one UTF8 character. This
      // way, the number of UTF8 characters between them remains constant
      // (ngram_size).
      ngram_start += utils::OneCharLen(ngram_start);
      ngram_end += utils::OneCharLen(ngram_end);
    }
  }  // end of loop over tokens.

  return total_count;
}
void ContinuousBagOfNgramsFunction::Evaluate(const WorkspaceSet &workspaces,
                                             const LightSentence &sentence,
                                             FeatureVector *result) const {
  // NOTE: we use std::* constructs (instead of absl::Mutex & co) to simplify
  // porting to Android and to avoid pulling in absl (which increases our code
  // size).
  std::lock_guard<std::mutex> mlock(state_mutex_);

  // Find the char ngram counts.
  int total_count = ComputeNgramCounts(sentence);

  // Populate the feature vector.
  const float norm = static_cast<float>(total_count);

  // TODO(salcianu): explore treating dense vectors (i.e., many non-zero
  // elements) separately.
  for (int ngram_id : non_zero_count_indices_) {
    const float weight = counts_[ngram_id] / norm;
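    // Pack the ngram id and its normalized count (weight) into a single
    // feature value.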
    FloatFeatureValue value(ngram_id, weight);
    result->add(feature_type(), value.discrete_value);

    // Clear up counts_, for the next invocation of Evaluate().
    counts_[ngram_id] = 0;
  }

  // Clear up non_zero_count_indices_, for the next invocation of Evaluate().
  non_zero_count_indices_.clear();
}
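// Registers this feature function with the static feature function registry.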
SAFTM_STATIC_REGISTRATION(ContinuousBagOfNgramsFunction);
} // namespace lang_id
} // namespace mobile
} // namespace libtextclassifier3