blob: 6afd9518af4e8ec788b03ee69f29bcdfc48d8a60 [file] [log] [blame]
/*
* Copyright (C) 2017 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "smartselect/token-feature-extractor.h"
#include <cctype>
#include <string>
#include "util/base/logging.h"
#include "util/hash/farmhash.h"
#include "util/strings/stringpiece.h"
#include "util/utf8/unicodetext.h"
#ifndef LIBTEXTCLASSIFIER_DISABLE_ICU_SUPPORT
#include "unicode/regex.h"
#include "unicode/uchar.h"
#endif
namespace libtextclassifier {
namespace {
// Returns a copy of |token| with ASCII digits remapped to '0' (when
// options.remap_digits is set) and/or ASCII characters lowercased (when
// options.lowercase_tokens is set). Byte-level variant; see RemapTokenUnicode
// for the codepoint-aware version.
std::string RemapTokenAscii(const std::string& token,
                            const TokenFeatureExtractorOptions& options) {
  if (!options.remap_digits && !options.lowercase_tokens) {
    return token;
  }

  std::string copy = token;
  for (char& c : copy) {
    // Cast through unsigned char: calling isdigit/tolower with a negative
    // char value (e.g. a UTF-8 continuation byte where char is signed) is
    // undefined behavior.
    if (options.remap_digits && isdigit(static_cast<unsigned char>(c))) {
      c = '0';
    }
    if (options.lowercase_tokens) {
      c = static_cast<char>(tolower(static_cast<unsigned char>(c)));
    }
  }
  return copy;
}
#ifndef LIBTEXTCLASSIFIER_DISABLE_ICU_SUPPORT
void RemapTokenUnicode(const std::string& token,
const TokenFeatureExtractorOptions& options,
UnicodeText* remapped) {
if (!options.remap_digits && !options.lowercase_tokens) {
// Leave remapped untouched.
return;
}
UnicodeText word = UTF8ToUnicodeText(token, /*do_copy=*/false);
icu::UnicodeString icu_string;
for (auto it = word.begin(); it != word.end(); ++it) {
if (options.remap_digits && u_isdigit(*it)) {
icu_string.append('0');
} else if (options.lowercase_tokens) {
icu_string.append(u_tolower(*it));
} else {
icu_string.append(*it);
}
}
std::string utf8_str;
icu_string.toUTF8String(utf8_str);
remapped->CopyUTF8(utf8_str.data(), utf8_str.length());
}
#endif
} // namespace
// Constructs the extractor and compiles the configured regexp features.
// With ICU available, each pattern in options.regexp_features is compiled to
// an icu::RegexPattern; a compilation failure is logged but still pushes the
// (null) entry so that feature indices stay aligned with
// options.regexp_features (null entries are handled at extraction time).
// Without ICU, only the all-caps pattern "^[^a-z]*$" is supported; any other
// pattern is ignored with a warning.
TokenFeatureExtractor::TokenFeatureExtractor(
    const TokenFeatureExtractorOptions& options)
    : options_(options) {
#ifndef LIBTEXTCLASSIFIER_DISABLE_ICU_SUPPORT
  UErrorCode status;
  for (const std::string& pattern : options.regexp_features) {
    status = U_ZERO_ERROR;
    regex_patterns_.push_back(
        std::unique_ptr<icu::RegexPattern>(icu::RegexPattern::compile(
            icu::UnicodeString(pattern.c_str(), pattern.size(), "utf-8"), 0,
            status)));
    if (U_FAILURE(status)) {
      TC_LOG(WARNING) << "Failed to load pattern" << pattern;
    }
  }
#else
  bool found_unsupported_regexp_features = false;
  for (const std::string& pattern : options.regexp_features) {
    // A temporary solution to support this specific regexp pattern without
    // adding too much binary size.
    if (pattern == "^[^a-z]*$") {
      enable_all_caps_feature_ = true;
    } else {
      found_unsupported_regexp_features = true;
    }
  }
  if (found_unsupported_regexp_features) {
    TC_LOG(WARNING) << "ICU not supported regexp features ignored.";
  }
#endif
}
// Hashes a token (or charactergram) into one of num_buckets embedding rows.
// When a chargram whitelist is configured, two buckets are reserved: bucket 0
// for out-of-vocabulary chargrams and bucket 1 for the padding token, so that
// these special tokens never share an embedding row with ordinary
// charactergrams.
// TODO(zilka): Experimentally verify.
int TokenFeatureExtractor::HashToken(StringPiece token) const {
  if (options_.allowed_chargrams.empty()) {
    // No whitelist: plain hashing over the full bucket range.
    return tcfarmhash::Fingerprint64(token) % options_.num_buckets;
  }

  const int kNumExtraBuckets = 2;
  const std::string token_string = token.ToString();
  if (token_string == "<PAD>") {
    return 1;
  }
  const bool in_vocabulary =
      options_.allowed_chargrams.find(token_string) !=
      options_.allowed_chargrams.end();
  if (!in_vocabulary) {
    return 0;  // Out-of-vocabulary.
  }
  return (tcfarmhash::Fingerprint64(token) %
          (options_.num_buckets - kNumExtraBuckets)) +
         kNumExtraBuckets;
}
// Dispatches charactergram extraction to the unicode-aware or ASCII variant,
// depending on the configured options.
std::vector<int> TokenFeatureExtractor::ExtractCharactergramFeatures(
    const Token& token) const {
  return options_.unicode_aware_features
             ? ExtractCharactergramFeaturesUnicode(token)
             : ExtractCharactergramFeaturesAscii(token);
}
// Extracts hashed charactergram feature ids for |token|, treating the value
// as raw bytes (no UTF-8 decoding). The produced hashes feed the embedding
// lookup, so the exact byte windows hashed here must match training.
std::vector<int> TokenFeatureExtractor::ExtractCharactergramFeaturesAscii(
    const Token& token) const {
  std::vector<int> result;
  if (token.is_padding || token.value.empty()) {
    // Padding and empty tokens map to the dedicated "<PAD>" bucket.
    result.push_back(HashToken("<PAD>"));
  } else {
    const std::string word = RemapTokenAscii(token.value, options_);

    // Trim words that are over max_word_length characters.
    const int max_word_length = options_.max_word_length;
    std::string feature_word;
    if (word.size() > max_word_length) {
      // Keep the first and last max_word_length/2 bytes, joined with a '\1'
      // marker so trimmed words stay distinguishable from untrimmed ones.
      feature_word =
          "^" + word.substr(0, max_word_length / 2) + "\1" +
          word.substr(word.size() - max_word_length / 2, max_word_length / 2) +
          "$";
    } else {
      // Add a prefix and suffix to the word.
      feature_word = "^" + word + "$";
    }

    // Upper-bound the number of charactergram extracted to avoid resizing.
    result.reserve(options_.chargram_orders.size() * feature_word.size());
    if (options_.chargram_orders.empty()) {
      // No orders configured: hash the whole decorated word as one feature.
      result.push_back(HashToken(feature_word));
    } else {
      // Generate the character-grams.
      for (int chargram_order : options_.chargram_orders) {
        if (chargram_order == 1) {
          // Unigrams skip the '^'/'$' boundary markers.
          for (int i = 1; i < feature_word.size() - 1; ++i) {
            result.push_back(
                HashToken(StringPiece(feature_word, /*offset=*/i, /*len=*/1)));
          }
        } else {
          // Higher orders slide a window over the decorated word, so the
          // boundary markers participate in the grams.
          for (int i = 0;
               i < static_cast<int>(feature_word.size()) - chargram_order + 1;
               ++i) {
            result.push_back(HashToken(StringPiece(feature_word, /*offset=*/i,
                                                   /*len=*/chargram_order)));
          }
        }
      }
    }
  }
  return result;
}
// Extracts hashed charactergram feature ids for |token| over Unicode
// codepoints. Requires ICU; with ICU disabled it logs a warning and returns
// no features.
std::vector<int> TokenFeatureExtractor::ExtractCharactergramFeaturesUnicode(
    const Token& token) const {
#ifndef LIBTEXTCLASSIFIER_DISABLE_ICU_SUPPORT
  std::vector<int> result;
  if (token.is_padding || token.value.empty()) {
    // Padding and empty tokens map to the dedicated "<PAD>" bucket.
    result.push_back(HashToken("<PAD>"));
  } else {
    UnicodeText word = UTF8ToUnicodeText(token.value, /*do_copy=*/false);
    RemapTokenUnicode(token.value, options_, &word);

    // Trim the word if needed by finding a left-cut point and right-cut point.
    // Each iteration advances one codepoint from each side (while the cuts
    // have not met), so at most max_word_length/2 codepoints are kept per
    // side; the cuts meeting means the word fits untrimmed.
    auto left_cut = word.begin();
    auto right_cut = word.end();
    for (int i = 0; i < options_.max_word_length / 2; i++) {
      if (left_cut < right_cut) {
        ++left_cut;
      }
      if (left_cut < right_cut) {
        --right_cut;
      }
    }

    std::string feature_word;
    if (left_cut == right_cut) {
      // Word fits within max_word_length: just add the boundary markers.
      feature_word = "^" + word.UTF8Substring(word.begin(), word.end()) + "$";
    } else {
      // Word was trimmed: keep the outer parts, joined with a '\1' marker so
      // trimmed words stay distinguishable from untrimmed ones.
      // clang-format off
      feature_word = "^" +
                     word.UTF8Substring(word.begin(), left_cut) +
                     "\1" +
                     word.UTF8Substring(right_cut, word.end()) +
                     "$";
      // clang-format on
    }

    const UnicodeText feature_word_unicode =
        UTF8ToUnicodeText(feature_word, /*do_copy=*/false);

    // Upper-bound the number of charactergram extracted to avoid resizing.
    result.reserve(options_.chargram_orders.size() * feature_word.size());
    if (options_.chargram_orders.empty()) {
      // No orders configured: hash the whole decorated word as one feature.
      result.push_back(HashToken(feature_word));
    } else {
      // Generate the character-grams.
      for (int chargram_order : options_.chargram_orders) {
        UnicodeText::const_iterator it_start = feature_word_unicode.begin();
        UnicodeText::const_iterator it_end = feature_word_unicode.end();
        if (chargram_order == 1) {
          // Unigrams skip the '^'/'$' boundary markers.
          ++it_start;
          --it_end;
        }

        // Advance the window end by chargram_order codepoints; if the word
        // is shorter than the order, emit no grams for this order.
        UnicodeText::const_iterator it_chargram_start = it_start;
        UnicodeText::const_iterator it_chargram_end = it_start;
        bool chargram_is_complete = true;
        for (int i = 0; i < chargram_order; ++i) {
          if (it_chargram_end == it_end) {
            chargram_is_complete = false;
            break;
          }
          ++it_chargram_end;
        }
        if (!chargram_is_complete) {
          continue;
        }

        // Slide the [start, end) codepoint window and hash the underlying
        // UTF-8 bytes of each gram.
        for (; it_chargram_end <= it_end;
             ++it_chargram_start, ++it_chargram_end) {
          const int length_bytes =
              it_chargram_end.utf8_data() - it_chargram_start.utf8_data();
          result.push_back(HashToken(
              StringPiece(it_chargram_start.utf8_data(), length_bytes)));
        }
      }
    }
  }
  return result;
#else
  TC_LOG(WARNING) << "ICU not supported. No feature extracted.";
  return {};
#endif
}
bool TokenFeatureExtractor::Extract(const Token& token, bool is_in_span,
std::vector<int>* sparse_features,
std::vector<float>* dense_features) const {
if (sparse_features == nullptr || dense_features == nullptr) {
return false;
}
*sparse_features = ExtractCharactergramFeatures(token);
if (options_.extract_case_feature) {
if (options_.unicode_aware_features) {
UnicodeText token_unicode =
UTF8ToUnicodeText(token.value, /*do_copy=*/false);
bool is_upper;
#ifndef LIBTEXTCLASSIFIER_DISABLE_ICU_SUPPORT
is_upper = u_isupper(*token_unicode.begin());
#else
TC_LOG(WARNING) << "Using non-unicode isupper because ICU is disabled.";
is_upper = isupper(*token_unicode.begin());
#endif
if (!token.value.empty() && is_upper) {
dense_features->push_back(1.0);
} else {
dense_features->push_back(-1.0);
}
} else {
if (!token.value.empty() && isupper(*token.value.begin())) {
dense_features->push_back(1.0);
} else {
dense_features->push_back(-1.0);
}
}
}
if (options_.extract_selection_mask_feature) {
if (is_in_span) {
dense_features->push_back(1.0);
} else {
if (options_.unicode_aware_features) {
dense_features->push_back(-1.0);
} else {
dense_features->push_back(0.0);
}
}
}
#ifndef LIBTEXTCLASSIFIER_DISABLE_ICU_SUPPORT
// Add regexp features.
if (!regex_patterns_.empty()) {
icu::UnicodeString unicode_str(token.value.c_str(), token.value.size(),
"utf-8");
for (int i = 0; i < regex_patterns_.size(); ++i) {
if (!regex_patterns_[i].get()) {
dense_features->push_back(-1.0);
continue;
}
// Check for match.
UErrorCode status = U_ZERO_ERROR;
std::unique_ptr<icu::RegexMatcher> matcher(
regex_patterns_[i]->matcher(unicode_str, status));
if (matcher->find()) {
dense_features->push_back(1.0);
} else {
dense_features->push_back(-1.0);
}
}
}
#else
if (enable_all_caps_feature_) {
bool is_all_caps = true;
for (const char character_byte : token.value) {
if (islower(character_byte)) {
is_all_caps = false;
break;
}
}
if (is_all_caps) {
dense_features->push_back(1.0);
} else {
dense_features->push_back(-1.0);
}
}
#endif
return true;
}
} // namespace libtextclassifier