lang_id/relevant-script-feature.cc - platform/external/libtextclassifier - Git at Google

 /*
  * Copyright (C) 2017 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #include "lang_id/relevant-script-feature.h"

 #include <string>

 #include "common/feature-extractor.h"
 #include "common/feature-types.h"
 #include "common/task-context.h"
 #include "common/workspace.h"
 #include "lang_id/script-detector.h"
 #include "util/base/logging.h"
 #include "util/strings/utf8.h"

 namespace libtextclassifier {
 namespace nlp_core {
 namespace lang_id {

 // Setup hook for this feature.  Performs no work, does not touch |context|,
 // and always returns true.
 bool RelevantScriptFeature::Setup(TaskContext *context) {
   return true;
 }

 // Init hook for this feature.  Creates a NumericFeatureType from name() and
 // kNumRelevantScripts and hands it to set_feature_type().  Does not touch
 // |context| and always returns true.
 bool RelevantScriptFeature::Init(TaskContext *context) {
   NumericFeatureType *const script_feature_type =
       new NumericFeatureType(name(), kNumRelevantScripts);
   set_feature_type(script_feature_type);
   return true;
 }

 void RelevantScriptFeature::Evaluate(const WorkspaceSet &workspaces,
                                      const LightSentence &sentence,
                                      FeatureVector *result) const {
   // We expect kNumRelevantScripts to be small, so we stack-allocate the array
   // of counts.  Still, if that changes, we want to find out.
   static_assert(
       kNumRelevantScripts < 25,
       "switch counts to vector<int>: too big for stack-allocated int[]");

   // counts[s] is the number of characters with script s.
   // Note: {} "value-initializes" the array to zero.
   int counts[kNumRelevantScripts]{};
   int total_count = 0;
   for (int i = 0; i < sentence.num_words(); ++i) {
     const std::string &word = sentence.word(i);
     const char *const word_end = word.data() + word.size();
     const char *curr = word.data();

     // Skip over token start '^'.
     TC_DCHECK_EQ(*curr, '^');
     curr += GetNumBytesForNonZeroUTF8Char(curr);
     while (true) {
       const int num_bytes = GetNumBytesForNonZeroUTF8Char(curr);
       Script script = GetScript(curr, num_bytes);

       // We do this update and the if (...) break below *before* incrementing
       // counts[script] in order to skip the token end '$'.
       curr += num_bytes;
       if (curr >= word_end) {
         TC_DCHECK_EQ(*(curr - num_bytes), '$');
         break;
       }
       TC_DCHECK_GE(script, 0);
       TC_DCHECK_LT(script, kNumRelevantScripts);
       counts[script]++;
       total_count++;
     }
   }

   for (int script_id = 0; script_id < kNumRelevantScripts; ++script_id) {
     int count = counts[script_id];
     if (count > 0) {
       const float weight = static_cast<float>(count) / total_count;
       FloatFeatureValue value(script_id, weight);
       result->add(feature_type(), value.discrete_value);
     }
   }
 }

 }  // namespace lang_id
 }  // namespace nlp_core
 }  // namespace libtextclassifier
	/*
	* Copyright (C) 2017 The Android Open Source Project
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	#include "lang_id/relevant-script-feature.h"

	#include <string>

	#include "common/feature-extractor.h"
	#include "common/feature-types.h"
	#include "common/task-context.h"
	#include "common/workspace.h"
	#include "lang_id/script-detector.h"
	#include "util/base/logging.h"
	#include "util/strings/utf8.h"

	namespace libtextclassifier {
	namespace nlp_core {
	namespace lang_id {

	// Setup hook for this feature.  Performs no work, does not touch |context|,
	// and always returns true.
	bool RelevantScriptFeature::Setup(TaskContext *context) {
	  return true;
	}

	// Init hook for this feature.  Creates a NumericFeatureType from name() and
	// kNumRelevantScripts and hands it to set_feature_type().  Does not touch
	// |context| and always returns true.
	bool RelevantScriptFeature::Init(TaskContext *context) {
	  NumericFeatureType *const script_feature_type =
	      new NumericFeatureType(name(), kNumRelevantScripts);
	  set_feature_type(script_feature_type);
	  return true;
	}

	void RelevantScriptFeature::Evaluate(const WorkspaceSet &workspaces,
	const LightSentence &sentence,
	FeatureVector *result) const {
	// We expect kNumRelevantScripts to be small, so we stack-allocate the array
	// of counts. Still, if that changes, we want to find out.
	static_assert(
	kNumRelevantScripts < 25,
	"switch counts to vector<int>: too big for stack-allocated int[]");

	// counts[s] is the number of characters with script s.
	// Note: {} "value-initializes" the array to zero.
	int counts[kNumRelevantScripts]{};
	int total_count = 0;
	for (int i = 0; i < sentence.num_words(); ++i) {
	const std::string &word = sentence.word(i);
	const char *const word_end = word.data() + word.size();
	const char *curr = word.data();

	// Skip over token start '^'.
	TC_DCHECK_EQ(*curr, '^');
	curr += GetNumBytesForNonZeroUTF8Char(curr);
	while (true) {
	const int num_bytes = GetNumBytesForNonZeroUTF8Char(curr);
	Script script = GetScript(curr, num_bytes);

	// We do this update and the if (...) break below before incrementing
	// counts[script] in order to skip the token end '$'.
	curr += num_bytes;
	if (curr >= word_end) {
	TC_DCHECK_EQ(*(curr - num_bytes), '$');
	break;
	}
	TC_DCHECK_GE(script, 0);
	TC_DCHECK_LT(script, kNumRelevantScripts);
	counts[script]++;
	total_count++;
	}
	}

	for (int script_id = 0; script_id < kNumRelevantScripts; ++script_id) {
	int count = counts[script_id];
	if (count > 0) {
	const float weight = static_cast<float>(count) / total_count;
	FloatFeatureValue value(script_id, weight);
	result->add(feature_type(), value.discrete_value);
	}
	}
	}

	} // namespace lang_id
	} // namespace nlp_core
	} // namespace libtextclassifier