// tests/feature-processor_test.cc - platform/external/libtextclassifier - Git at Google

 /*
  * Copyright (C) 2017 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #include "smartselect/feature-processor.h"

 #include "gmock/gmock.h"
 #include "gtest/gtest.h"

 namespace libtextclassifier {
 namespace {

 using testing::ElementsAreArray;
 using testing::FloatEq;

 // Selection {9, 12} lies strictly inside the middle token, so that token is
 // expected to be cut at both selection boundaries.
 TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesMiddle) {
   // clang-format off
   const std::vector<Token> expected_tokens = {
       Token("Hělló", 0, 5, false),
       Token("fěě", 6, 9, false),
       Token("bař", 9, 12, false),
       Token("@google.com", 12, 23, false),
       Token("heře!", 24, 29, false)};
   std::vector<Token> actual_tokens = {
       Token("Hělló", 0, 5, false),
       Token("fěěbař@google.com", 6, 23, false),
       Token("heře!", 24, 29, false)};
   // clang-format on

   internal::SplitTokensOnSelectionBoundaries({9, 12}, &actual_tokens);

   EXPECT_THAT(actual_tokens, ElementsAreArray(expected_tokens));
 }

 // Selection {6, 12} starts exactly where the middle token starts, so only
 // one cut (at codepoint 12) is expected.
 TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesBegin) {
   // clang-format off
   const std::vector<Token> expected_tokens = {
       Token("Hělló", 0, 5, false),
       Token("fěěbař", 6, 12, false),
       Token("@google.com", 12, 23, false),
       Token("heře!", 24, 29, false)};
   std::vector<Token> actual_tokens = {
       Token("Hělló", 0, 5, false),
       Token("fěěbař@google.com", 6, 23, false),
       Token("heře!", 24, 29, false)};
   // clang-format on

   internal::SplitTokensOnSelectionBoundaries({6, 12}, &actual_tokens);

   EXPECT_THAT(actual_tokens, ElementsAreArray(expected_tokens));
 }

 // Selection {9, 23} ends exactly where the middle token ends, so only one
 // cut (at codepoint 9) is expected.
 TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesEnd) {
   // clang-format off
   const std::vector<Token> expected_tokens = {
       Token("Hělló", 0, 5, false),
       Token("fěě", 6, 9, false),
       Token("bař@google.com", 9, 23, false),
       Token("heře!", 24, 29, false)};
   std::vector<Token> actual_tokens = {
       Token("Hělló", 0, 5, false),
       Token("fěěbař@google.com", 6, 23, false),
       Token("heře!", 24, 29, false)};
   // clang-format on

   internal::SplitTokensOnSelectionBoundaries({9, 23}, &actual_tokens);

   EXPECT_THAT(actual_tokens, ElementsAreArray(expected_tokens));
 }

 // Selection {6, 23} coincides with the whole middle token, so the token list
 // is expected to come back unchanged.
 TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesWhole) {
   // clang-format off
   const std::vector<Token> untouched_tokens = {
       Token("Hělló", 0, 5, false),
       Token("fěěbař@google.com", 6, 23, false),
       Token("heře!", 24, 29, false)};
   // clang-format on
   std::vector<Token> actual_tokens = untouched_tokens;

   internal::SplitTokensOnSelectionBoundaries({6, 23}, &actual_tokens);

   EXPECT_THAT(actual_tokens, ElementsAreArray(untouched_tokens));
 }

 // Selection {2, 9} begins inside the first token and ends inside the second,
 // so each of those two tokens is expected to be cut once.
 TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesCrossToken) {
   // clang-format off
   const std::vector<Token> expected_tokens = {
       Token("Hě", 0, 2, false),
       Token("lló", 2, 5, false),
       Token("fěě", 6, 9, false),
       Token("bař@google.com", 9, 23, false),
       Token("heře!", 24, 29, false)};
   std::vector<Token> actual_tokens = {
       Token("Hělló", 0, 5, false),
       Token("fěěbař@google.com", 6, 23, false),
       Token("heře!", 24, 29, false)};
   // clang-format on

   internal::SplitTokensOnSelectionBoundaries({2, 9}, &actual_tokens);

   EXPECT_THAT(actual_tokens, ElementsAreArray(expected_tokens));
 }

 // A click on the first line ({0, 5}) is expected to leave only the tokens of
 // that line.
 TEST(FeatureProcessorTest, KeepLineWithClickFirst) {
   const std::string text = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
   const CodepointSpan click = {0, 5};
   // clang-format off
   std::vector<Token> line_tokens = {
       Token("Fiřst", 0, 5),
       Token("Lině", 6, 10),
       Token("Sěcond", 11, 17),
       Token("Lině", 18, 22),
       Token("Thiřd", 23, 28),
       Token("Lině", 29, 33)};
   const std::vector<Token> first_line = {
       Token("Fiřst", 0, 5),
       Token("Lině", 6, 10)};
   // clang-format on

   internal::StripTokensFromOtherLines(text, click, &line_tokens);

   EXPECT_THAT(line_tokens, ElementsAreArray(first_line));
 }

 // A click on the second line ({18, 22}) is expected to leave only the tokens
 // of that line.
 TEST(FeatureProcessorTest, KeepLineWithClickSecond) {
   const std::string text = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
   const CodepointSpan click = {18, 22};
   // clang-format off
   std::vector<Token> line_tokens = {
       Token("Fiřst", 0, 5),
       Token("Lině", 6, 10),
       Token("Sěcond", 11, 17),
       Token("Lině", 18, 22),
       Token("Thiřd", 23, 28),
       Token("Lině", 29, 33)};
   const std::vector<Token> second_line = {
       Token("Sěcond", 11, 17),
       Token("Lině", 18, 22)};
   // clang-format on

   internal::StripTokensFromOtherLines(text, click, &line_tokens);

   EXPECT_THAT(line_tokens, ElementsAreArray(second_line));
 }

 // A click on the third line ({24, 33}) is expected to leave only the tokens
 // of that line.
 TEST(FeatureProcessorTest, KeepLineWithClickThird) {
   const std::string text = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
   const CodepointSpan click = {24, 33};
   // clang-format off
   std::vector<Token> line_tokens = {
       Token("Fiřst", 0, 5),
       Token("Lině", 6, 10),
       Token("Sěcond", 11, 17),
       Token("Lině", 18, 22),
       Token("Thiřd", 23, 28),
       Token("Lině", 29, 33)};
   const std::vector<Token> third_line = {
       Token("Thiřd", 23, 28),
       Token("Lině", 29, 33)};
   // clang-format on

   internal::StripTokensFromOtherLines(text, click, &line_tokens);

   EXPECT_THAT(line_tokens, ElementsAreArray(third_line));
 }

 // Same as KeepLineWithClickSecond, but the first and second "lines" are
 // separated by '|' instead of '\n'. The expectation is unchanged, i.e. the
 // test expects '|' to delimit lines as well.
 TEST(FeatureProcessorTest, KeepLineWithClickSecondWithPipe) {
   const std::string text = "Fiřst Lině|Sěcond Lině\nThiřd Lině";
   const CodepointSpan click = {18, 22};
   // clang-format off
   std::vector<Token> line_tokens = {
       Token("Fiřst", 0, 5),
       Token("Lině", 6, 10),
       Token("Sěcond", 11, 17),
       Token("Lině", 18, 22),
       Token("Thiřd", 23, 28),
       Token("Lině", 29, 33)};
   const std::vector<Token> second_line = {
       Token("Sěcond", 11, 17),
       Token("Lině", 18, 22)};
   // clang-format on

   internal::StripTokensFromOtherLines(text, click, &line_tokens);

   EXPECT_THAT(line_tokens, ElementsAreArray(second_line));
 }

 // The click span {5, 23} extends over more than one line of the context; in
 // that case the token list is expected to be left exactly as it was.
 TEST(FeatureProcessorTest, KeepLineWithCrosslineClick) {
   const std::string text = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
   const CodepointSpan click = {5, 23};
   // clang-format off
   const std::vector<Token> all_tokens = {
       Token("Fiřst", 0, 5),
       Token("Lině", 6, 10),
       Token("Sěcond", 18, 23),
       Token("Lině", 19, 23),
       Token("Thiřd", 23, 28),
       Token("Lině", 29, 33)};
   // clang-format on
   std::vector<Token> line_tokens = all_tokens;

   internal::StripTokensFromOtherLines(text, click, &line_tokens);

   // Nothing is stripped.
   EXPECT_THAT(line_tokens, ElementsAreArray(all_tokens));
 }

 // Test-only subclass of FeatureProcessor. It inherits the base constructors
 // and re-declares a few base members in its public section so that the tests
 // can call them directly (presumably they are not public in the base class).
 class TestingFeatureProcessor : public FeatureProcessor {
  public:
   // Inherit every FeatureProcessor constructor.
   using FeatureProcessor::FeatureProcessor;

   // Members the tests need access to.
   using FeatureProcessor::IsCodepointSupported;
   using FeatureProcessor::SpanToLabel;
   using FeatureProcessor::SupportedCodepointsRatio;
 };

 // Exercises SpanToLabel with and without snapping of span boundaries to the
 // containing tokens, and decodes the resulting labels via LabelToTokenSpan.
 TEST(FeatureProcessorTest, SpanToLabel) {
   FeatureProcessorOptions options;
   options.set_context_size(1);
   options.set_max_selection_span(1);
   options.set_tokenize_on_space(true);
   options.set_snap_label_span_boundaries_to_containing_tokens(false);

   // Codepoint 32 (ASCII space) is configured as the whitespace separator.
   TokenizationCodepointRange* space_range =
       options.add_tokenization_codepoint_config();
   space_range->set_start(32);
   space_range->set_end(33);
   space_range->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);

   // --- Snapping disabled -------------------------------------------------
   TestingFeatureProcessor exact_processor(options);
   std::vector<Token> tokens = exact_processor.Tokenize("one, two, three");
   ASSERT_EQ(3, tokens.size());

   int base_label;
   // {5, 8} covers "two" but not the comma attached to it: no valid label.
   ASSERT_TRUE(exact_processor.SpanToLabel({5, 8}, tokens, &base_label));
   EXPECT_EQ(kInvalidLabel, base_label);
   // {5, 9} covers "two," completely: a valid label is expected.
   ASSERT_TRUE(exact_processor.SpanToLabel({5, 9}, tokens, &base_label));
   EXPECT_NE(kInvalidLabel, base_label);

   TokenSpan token_span;
   exact_processor.LabelToTokenSpan(base_label, &token_span);
   EXPECT_EQ(0, token_span.first);
   EXPECT_EQ(0, token_span.second);

   // --- Snapping enabled --------------------------------------------------
   options.set_snap_label_span_boundaries_to_containing_tokens(true);
   TestingFeatureProcessor snapping_processor(options);

   // Spans inside the same token are all expected to map to base_label.
   int probe_label;
   ASSERT_TRUE(snapping_processor.SpanToLabel({5, 8}, tokens, &probe_label));
   EXPECT_EQ(base_label, probe_label);
   ASSERT_TRUE(snapping_processor.SpanToLabel({6, 9}, tokens, &probe_label));
   EXPECT_EQ(base_label, probe_label);
   ASSERT_TRUE(snapping_processor.SpanToLabel({5, 9}, tokens, &probe_label));
   EXPECT_EQ(base_label, probe_label);

   // Spans that reach across a token boundary are expected to be invalid.
   ASSERT_TRUE(snapping_processor.SpanToLabel({4, 9}, tokens, &probe_label));
   EXPECT_EQ(kInvalidLabel, probe_label);
   ASSERT_TRUE(snapping_processor.SpanToLabel({5, 10}, tokens, &probe_label));
   EXPECT_EQ(kInvalidLabel, probe_label);

   // --- Selections covering several tokens --------------------------------
   options.set_context_size(2);
   options.set_max_selection_span(2);
   TestingFeatureProcessor wide_processor(options);
   tokens = wide_processor.Tokenize("zero, one, two, three, four");
   ASSERT_TRUE(wide_processor.SpanToLabel({6, 15}, tokens, &probe_label));
   EXPECT_NE(kInvalidLabel, probe_label);
   wide_processor.LabelToTokenSpan(probe_label, &token_span);
   EXPECT_EQ(1, token_span.first);
   EXPECT_EQ(0, token_span.second);

   // Slightly narrower spans are expected to snap to the same label.
   int snapped_label;
   ASSERT_TRUE(wide_processor.SpanToLabel({6, 14}, tokens, &snapped_label));
   EXPECT_EQ(probe_label, snapped_label);
   ASSERT_TRUE(wide_processor.SpanToLabel({6, 13}, tokens, &snapped_label));
   EXPECT_EQ(probe_label, snapped_label);
   ASSERT_TRUE(wide_processor.SpanToLabel({7, 13}, tokens, &snapped_label));
   EXPECT_EQ(probe_label, snapped_label);
 }

 // With variable context dropout enabled, two consecutive feature extractions
 // over identical input are expected to produce different features.
 TEST(FeatureProcessorTest, GetFeaturesWithContextDropout) {
   static const char kContext[] = "1 2 3 c o n t e x t X c o n t e x t 1 2 3";

   FeatureProcessorOptions options;
   options.set_num_buckets(10);
   options.set_context_size(7);
   options.set_max_selection_span(7);
   options.add_chargram_orders(1);
   options.set_tokenize_on_space(true);
   options.set_context_dropout_probability(0.5);
   options.set_use_variable_context_dropout(true);

   // Codepoint 32 (ASCII space) is configured as the whitespace separator.
   TokenizationCodepointRange* space_range =
       options.add_tokenization_codepoint_config();
   space_range->set_start(32);
   space_range->set_end(33);
   space_range->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);

   FeatureProcessor processor(options);

   // A default-constructed std::mt19937 starts from the engine's fixed default
   // seed, so the random sequence is the same on every run of this test.
   // NOTE(review): raw `new` - presumably SetRandom takes ownership; confirm.
   processor.SetRandom(new std::mt19937);

   std::vector<std::vector<std::pair<int, float>>> first_run;
   std::vector<std::vector<std::pair<int, float>>> second_run;
   std::vector<float> extra_features;
   std::vector<CodepointSpan> label_spans;
   int selection_label;
   CodepointSpan selection_codepoint_label;
   int classification_label;

   EXPECT_TRUE(processor.GetFeaturesAndLabels(
       kContext, {20, 21}, {20, 21}, "", &first_run, &extra_features,
       &label_spans, &selection_label, &selection_codepoint_label,
       &classification_label));
   EXPECT_TRUE(processor.GetFeaturesAndLabels(
       kContext, {20, 21}, {20, 21}, "", &second_run, &extra_features,
       &label_spans, &selection_label, &selection_codepoint_label,
       &classification_label));

   EXPECT_NE(first_run, second_run);
 }

 // Checks that the number of per-token feature vectors is 19 both for an
 // input with plenty of context and for a single-token input.
 TEST(FeatureProcessorTest, GetFeaturesWithLongerContext) {
   FeatureProcessorOptions options;
   options.set_num_buckets(10);
   options.set_context_size(9);
   options.set_max_selection_span(7);
   options.add_chargram_orders(1);
   options.set_tokenize_on_space(true);

   // Codepoint 32 (ASCII space) is configured as the whitespace separator.
   TokenizationCodepointRange* space_range =
       options.add_tokenization_codepoint_config();
   space_range->set_start(32);
   space_range->set_end(33);
   space_range->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);

   FeatureProcessor processor(options);

   std::vector<std::vector<std::pair<int, float>>> token_features;
   std::vector<float> extra_features;
   std::vector<CodepointSpan> label_spans;
   int selection_label;
   CodepointSpan selection_codepoint_label;
   int classification_label;

   // 19 == 2 * context_size + 1 with context_size == 9 (presumably the click
   // token plus the context on either side - confirm).
   EXPECT_TRUE(processor.GetFeaturesAndLabels(
       "1 2 3 c o n t e x t X c o n t e x t 1 2 3", {20, 21}, {20, 21}, "",
       &token_features, &extra_features, &label_spans, &selection_label,
       &selection_codepoint_label, &classification_label));
   EXPECT_EQ(19, token_features.size());

   // A one-token input is expected to be padded up to the same size.
   EXPECT_TRUE(processor.GetFeaturesAndLabels(
       "X", {0, 1}, {0, 1}, "", &token_features, &extra_features,
       &label_spans, &selection_label, &selection_codepoint_label,
       &classification_label));
   EXPECT_EQ(19, token_features.size());
 }

 // CenterTokenFromClick is expected to return the index of the token that
 // contains the click span, or kInvalidIndex when the span covers two tokens.
 TEST(FeatureProcessorTest, CenterTokenFromClick) {
   const std::vector<Token> tokens = {Token("Hělló", 0, 5, false),
                                      Token("world", 6, 11, false),
                                      Token("heře!", 12, 17, false)};

   // Click span matches the second token exactly.
   EXPECT_EQ(internal::CenterTokenFromClick({6, 11}, tokens), 1);

   // Click span lies inside the third token.
   EXPECT_EQ(internal::CenterTokenFromClick({13, 17}, tokens), 2);

   // Click span stretches over the second and third token.
   EXPECT_EQ(internal::CenterTokenFromClick({6, 17}, tokens), kInvalidIndex);
 }

 // Checks which token index CenterTokenFromMiddleOfSelection reports for
 // token-aligned selections, sub-token selections and an empty token list.
 TEST(FeatureProcessorTest, CenterTokenFromMiddleOfSelection) {
   // clang-format off
   const std::vector<Token> tokens = {
       Token("Token1", 0, 6, false),
       Token("Token2", 7, 13, false),
       Token("Token3", 14, 20, false),
       Token("Token4", 21, 27, false),
       Token("Token5", 28, 34, false)};
   // clang-format on

   // Selection of length 3. Exactly aligned indices.
   EXPECT_EQ(internal::CenterTokenFromMiddleOfSelection({7, 27}, tokens), 2);

   // Selection of length 1 token. Exactly aligned indices.
   EXPECT_EQ(internal::CenterTokenFromMiddleOfSelection({21, 27}, tokens), 3);

   // Selection marks a sub-token range, with no tokens in it.
   EXPECT_EQ(internal::CenterTokenFromMiddleOfSelection({29, 33}, tokens),
             kInvalidIndex);

   // Selection of length 2. Sub-token indices.
   EXPECT_EQ(internal::CenterTokenFromMiddleOfSelection({3, 25}, tokens), 1);

   // Selection of length 1. Sub-token indices.
   EXPECT_EQ(internal::CenterTokenFromMiddleOfSelection({22, 34}, tokens), 4);

   // No tokens at all.
   EXPECT_EQ(internal::CenterTokenFromMiddleOfSelection({7, 27}, {}), -1);
 }

 // Runs feature extraction with the middle-of-selection center token method,
 // line stripping, token splitting on selection boundaries and the selection
 // mask feature all switched on, and checks the number of feature vectors.
 TEST(FeatureProcessorTest, GetFeaturesForSharing) {
   FeatureProcessorOptions options;
   options.set_num_buckets(10);
   options.set_context_size(9);
   options.set_max_selection_span(7);
   options.add_chargram_orders(1);
   options.set_tokenize_on_space(true);
   options.set_center_token_selection_method(
       FeatureProcessorOptions::CENTER_TOKEN_MIDDLE_OF_SELECTION);
   options.set_only_use_line_with_click(true);
   options.set_split_tokens_on_selection_boundaries(true);
   options.set_extract_selection_mask_feature(true);

   // Codepoint 32 (ASCII space) is a whitespace separator ...
   TokenizationCodepointRange* separator =
       options.add_tokenization_codepoint_config();
   separator->set_start(32);
   separator->set_end(33);
   separator->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
   // ... and so is codepoint 10 (ASCII line feed).
   separator = options.add_tokenization_codepoint_config();
   separator->set_start(10);
   separator->set_end(11);
   separator->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);

   FeatureProcessor processor(options);

   std::vector<std::vector<std::pair<int, float>>> token_features;
   std::vector<float> extra_features;
   std::vector<CodepointSpan> label_spans;
   int selection_label;
   CodepointSpan selection_codepoint_label;
   int classification_label;

   // The selection {13, 24} covers "some entity" on the third line.
   EXPECT_TRUE(processor.GetFeaturesAndLabels(
       "line 1\nline2\nsome entity\n line 4", {13, 24}, {13, 24}, "",
       &token_features, &extra_features, &label_spans, &selection_label,
       &selection_codepoint_label, &classification_label));
   EXPECT_EQ(19, token_features.size());
 }

 // Covers three things: the ratio reported by SupportedCodepointsRatio, the
 // per-codepoint answer of IsCodepointSupported, and the effect of
 // min_supported_codepoint_ratio on GetFeatures.
 TEST(FeatureProcessorTest, SupportedCodepointsRatio) {
   static const char kMostlyUnsupportedText[] = "ěěě řřř eee";

   FeatureProcessorOptions options;
   options.set_context_size(2);
   options.set_max_selection_span(2);
   options.set_snap_label_span_boundaries_to_containing_tokens(false);

   // Codepoint 32 (ASCII space) is configured as the whitespace separator.
   TokenizationCodepointRange* space_range =
       options.add_tokenization_codepoint_config();
   space_range->set_start(32);
   space_range->set_end(33);
   space_range->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);

   // Registers one supported codepoint range. The IsCodepointSupported
   // assertions below expect `end` to be exclusive.
   const auto add_supported_range = [&options](int start, int end) {
     FeatureProcessorOptions::CodepointRange* range =
         options.add_supported_codepoint_ranges();
     range->set_start(start);
     range->set_end(end);
   };
   add_supported_range(0, 128);
   add_supported_range(10000, 10001);
   add_supported_range(20000, 30000);

   TestingFeatureProcessor feature_processor(options);

   // Ratio of supported tokens for texts with 3, 2 and 0 ASCII-only tokens.
   EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
                   1, feature_processor.Tokenize("aaa bbb ccc")),
               FloatEq(1.0));
   EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
                   1, feature_processor.Tokenize("aaa bbb ěěě")),
               FloatEq(2.0 / 3));
   EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
                   1, feature_processor.Tokenize("ěěě řřř ěěě")),
               FloatEq(0.0));

   // Boundaries of the configured ranges.
   EXPECT_FALSE(feature_processor.IsCodepointSupported(-1));
   EXPECT_TRUE(feature_processor.IsCodepointSupported(0));
   EXPECT_TRUE(feature_processor.IsCodepointSupported(10));
   EXPECT_TRUE(feature_processor.IsCodepointSupported(127));
   EXPECT_FALSE(feature_processor.IsCodepointSupported(128));
   EXPECT_FALSE(feature_processor.IsCodepointSupported(9999));
   EXPECT_TRUE(feature_processor.IsCodepointSupported(10000));
   EXPECT_FALSE(feature_processor.IsCodepointSupported(10001));
   EXPECT_TRUE(feature_processor.IsCodepointSupported(25000));

   std::vector<nlp_core::FeatureVector> features;
   std::vector<float> extra_features;

   // Thresholds 0.0 and 0.2 are expected to let the text through ...
   options.set_min_supported_codepoint_ratio(0.0);
   feature_processor = TestingFeatureProcessor(options);
   EXPECT_TRUE(feature_processor.GetFeatures(
       kMostlyUnsupportedText, {4, 7}, &features, &extra_features,
       /*selection_label_spans=*/nullptr));

   options.set_min_supported_codepoint_ratio(0.2);
   feature_processor = TestingFeatureProcessor(options);
   EXPECT_TRUE(feature_processor.GetFeatures(
       kMostlyUnsupportedText, {4, 7}, &features, &extra_features,
       /*selection_label_spans=*/nullptr));

   // ... while a threshold of 0.5 is expected to make GetFeatures fail.
   options.set_min_supported_codepoint_ratio(0.5);
   feature_processor = TestingFeatureProcessor(options);
   EXPECT_FALSE(feature_processor.GetFeatures(
       kMostlyUnsupportedText, {4, 7}, &features, &extra_features,
       /*selection_label_spans=*/nullptr));
 }

 }  // namespace
 }  // namespace libtextclassifier
	/*
	* Copyright (C) 2017 The Android Open Source Project
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	#include "smartselect/feature-processor.h"

	#include "gmock/gmock.h"
	#include "gtest/gtest.h"

	namespace libtextclassifier {
	namespace {

	using testing::ElementsAreArray;
	using testing::FloatEq;

	// Selection {9, 12} lies strictly inside the middle token, so that token is
	// expected to be cut at both selection boundaries.
	TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesMiddle) {
	  // clang-format off
	  const std::vector<Token> expected_tokens = {
	      Token("Hělló", 0, 5, false),
	      Token("fěě", 6, 9, false),
	      Token("bař", 9, 12, false),
	      Token("@google.com", 12, 23, false),
	      Token("heře!", 24, 29, false)};
	  std::vector<Token> actual_tokens = {
	      Token("Hělló", 0, 5, false),
	      Token("fěěbař@google.com", 6, 23, false),
	      Token("heře!", 24, 29, false)};
	  // clang-format on

	  internal::SplitTokensOnSelectionBoundaries({9, 12}, &actual_tokens);

	  EXPECT_THAT(actual_tokens, ElementsAreArray(expected_tokens));
	}

	// Selection {6, 12} starts exactly where the middle token starts, so only
	// one cut (at codepoint 12) is expected.
	TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesBegin) {
	  // clang-format off
	  const std::vector<Token> expected_tokens = {
	      Token("Hělló", 0, 5, false),
	      Token("fěěbař", 6, 12, false),
	      Token("@google.com", 12, 23, false),
	      Token("heře!", 24, 29, false)};
	  std::vector<Token> actual_tokens = {
	      Token("Hělló", 0, 5, false),
	      Token("fěěbař@google.com", 6, 23, false),
	      Token("heře!", 24, 29, false)};
	  // clang-format on

	  internal::SplitTokensOnSelectionBoundaries({6, 12}, &actual_tokens);

	  EXPECT_THAT(actual_tokens, ElementsAreArray(expected_tokens));
	}

	// Selection {9, 23} ends exactly where the middle token ends, so only one
	// cut (at codepoint 9) is expected.
	TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesEnd) {
	  // clang-format off
	  const std::vector<Token> expected_tokens = {
	      Token("Hělló", 0, 5, false),
	      Token("fěě", 6, 9, false),
	      Token("bař@google.com", 9, 23, false),
	      Token("heře!", 24, 29, false)};
	  std::vector<Token> actual_tokens = {
	      Token("Hělló", 0, 5, false),
	      Token("fěěbař@google.com", 6, 23, false),
	      Token("heře!", 24, 29, false)};
	  // clang-format on

	  internal::SplitTokensOnSelectionBoundaries({9, 23}, &actual_tokens);

	  EXPECT_THAT(actual_tokens, ElementsAreArray(expected_tokens));
	}

	// Selection {6, 23} coincides with the whole middle token, so the token list
	// is expected to come back unchanged.
	TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesWhole) {
	  // clang-format off
	  const std::vector<Token> untouched_tokens = {
	      Token("Hělló", 0, 5, false),
	      Token("fěěbař@google.com", 6, 23, false),
	      Token("heře!", 24, 29, false)};
	  // clang-format on
	  std::vector<Token> actual_tokens = untouched_tokens;

	  internal::SplitTokensOnSelectionBoundaries({6, 23}, &actual_tokens);

	  EXPECT_THAT(actual_tokens, ElementsAreArray(untouched_tokens));
	}

	// Selection {2, 9} begins inside the first token and ends inside the second,
	// so each of those two tokens is expected to be cut once.
	TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesCrossToken) {
	  // clang-format off
	  const std::vector<Token> expected_tokens = {
	      Token("Hě", 0, 2, false),
	      Token("lló", 2, 5, false),
	      Token("fěě", 6, 9, false),
	      Token("bař@google.com", 9, 23, false),
	      Token("heře!", 24, 29, false)};
	  std::vector<Token> actual_tokens = {
	      Token("Hělló", 0, 5, false),
	      Token("fěěbař@google.com", 6, 23, false),
	      Token("heře!", 24, 29, false)};
	  // clang-format on

	  internal::SplitTokensOnSelectionBoundaries({2, 9}, &actual_tokens);

	  EXPECT_THAT(actual_tokens, ElementsAreArray(expected_tokens));
	}

	// A click on the first line ({0, 5}) is expected to leave only the tokens of
	// that line.
	TEST(FeatureProcessorTest, KeepLineWithClickFirst) {
	  const std::string text = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
	  const CodepointSpan click = {0, 5};
	  // clang-format off
	  std::vector<Token> line_tokens = {
	      Token("Fiřst", 0, 5),
	      Token("Lině", 6, 10),
	      Token("Sěcond", 11, 17),
	      Token("Lině", 18, 22),
	      Token("Thiřd", 23, 28),
	      Token("Lině", 29, 33)};
	  const std::vector<Token> first_line = {
	      Token("Fiřst", 0, 5),
	      Token("Lině", 6, 10)};
	  // clang-format on

	  internal::StripTokensFromOtherLines(text, click, &line_tokens);

	  EXPECT_THAT(line_tokens, ElementsAreArray(first_line));
	}

	// A click on the second line ({18, 22}) is expected to leave only the tokens
	// of that line.
	TEST(FeatureProcessorTest, KeepLineWithClickSecond) {
	  const std::string text = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
	  const CodepointSpan click = {18, 22};
	  // clang-format off
	  std::vector<Token> line_tokens = {
	      Token("Fiřst", 0, 5),
	      Token("Lině", 6, 10),
	      Token("Sěcond", 11, 17),
	      Token("Lině", 18, 22),
	      Token("Thiřd", 23, 28),
	      Token("Lině", 29, 33)};
	  const std::vector<Token> second_line = {
	      Token("Sěcond", 11, 17),
	      Token("Lině", 18, 22)};
	  // clang-format on

	  internal::StripTokensFromOtherLines(text, click, &line_tokens);

	  EXPECT_THAT(line_tokens, ElementsAreArray(second_line));
	}

	// A click on the third line ({24, 33}) is expected to leave only the tokens
	// of that line.
	TEST(FeatureProcessorTest, KeepLineWithClickThird) {
	  const std::string text = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
	  const CodepointSpan click = {24, 33};
	  // clang-format off
	  std::vector<Token> line_tokens = {
	      Token("Fiřst", 0, 5),
	      Token("Lině", 6, 10),
	      Token("Sěcond", 11, 17),
	      Token("Lině", 18, 22),
	      Token("Thiřd", 23, 28),
	      Token("Lině", 29, 33)};
	  const std::vector<Token> third_line = {
	      Token("Thiřd", 23, 28),
	      Token("Lině", 29, 33)};
	  // clang-format on

	  internal::StripTokensFromOtherLines(text, click, &line_tokens);

	  EXPECT_THAT(line_tokens, ElementsAreArray(third_line));
	}

	// The first and second "lines" of the context are separated by '|' instead
	// of '\n'. A click on the second line ({18, 22}) is still expected to leave
	// only the tokens of that line, i.e. the test expects '|' to delimit lines.
	TEST(FeatureProcessorTest, KeepLineWithClickSecondWithPipe) {
	  // The separator must be a plain '|'. "\|" is not a valid C++ escape
	  // sequence (compilers at best warn about an unknown escape).
	  const std::string context = "Fiřst Lině|Sěcond Lině\nThiřd Lině";
	  const CodepointSpan span = {18, 22};
	  // clang-format off
	  std::vector<Token> tokens = {Token("Fiřst", 0, 5),
	                               Token("Lině", 6, 10),
	                               Token("Sěcond", 11, 17),
	                               Token("Lině", 18, 22),
	                               Token("Thiřd", 23, 28),
	                               Token("Lině", 29, 33)};
	  // clang-format on

	  // Keeps the second line.
	  internal::StripTokensFromOtherLines(context, span, &tokens);
	  EXPECT_THAT(tokens, ElementsAreArray(
	                          {Token("Sěcond", 11, 17), Token("Lině", 18, 22)}));
	}

	// The click span {5, 23} extends over more than one line of the context; in
	// that case the token list is expected to be left exactly as it was.
	TEST(FeatureProcessorTest, KeepLineWithCrosslineClick) {
	  const std::string text = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
	  const CodepointSpan click = {5, 23};
	  // clang-format off
	  const std::vector<Token> all_tokens = {
	      Token("Fiřst", 0, 5),
	      Token("Lině", 6, 10),
	      Token("Sěcond", 18, 23),
	      Token("Lině", 19, 23),
	      Token("Thiřd", 23, 28),
	      Token("Lině", 29, 33)};
	  // clang-format on
	  std::vector<Token> line_tokens = all_tokens;

	  internal::StripTokensFromOtherLines(text, click, &line_tokens);

	  // Nothing is stripped.
	  EXPECT_THAT(line_tokens, ElementsAreArray(all_tokens));
	}

	// Test-only subclass of FeatureProcessor. It inherits the base constructors
	// and re-declares a few base members in its public section so that the tests
	// can call them directly (presumably they are not public in the base class).
	class TestingFeatureProcessor : public FeatureProcessor {
	 public:
	  // Inherit every FeatureProcessor constructor.
	  using FeatureProcessor::FeatureProcessor;

	  // Members the tests need access to.
	  using FeatureProcessor::IsCodepointSupported;
	  using FeatureProcessor::SpanToLabel;
	  using FeatureProcessor::SupportedCodepointsRatio;
	};

	// Exercises SpanToLabel with and without snapping of span boundaries to the
	// containing tokens, and decodes the resulting labels via LabelToTokenSpan.
	TEST(FeatureProcessorTest, SpanToLabel) {
	  FeatureProcessorOptions options;
	  options.set_context_size(1);
	  options.set_max_selection_span(1);
	  options.set_tokenize_on_space(true);
	  options.set_snap_label_span_boundaries_to_containing_tokens(false);

	  // Codepoint 32 (ASCII space) is configured as the whitespace separator.
	  TokenizationCodepointRange* space_range =
	      options.add_tokenization_codepoint_config();
	  space_range->set_start(32);
	  space_range->set_end(33);
	  space_range->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);

	  // --- Snapping disabled -------------------------------------------------
	  TestingFeatureProcessor exact_processor(options);
	  std::vector<Token> tokens = exact_processor.Tokenize("one, two, three");
	  ASSERT_EQ(3, tokens.size());

	  int base_label;
	  // {5, 8} covers "two" but not the comma attached to it: no valid label.
	  ASSERT_TRUE(exact_processor.SpanToLabel({5, 8}, tokens, &base_label));
	  EXPECT_EQ(kInvalidLabel, base_label);
	  // {5, 9} covers "two," completely: a valid label is expected.
	  ASSERT_TRUE(exact_processor.SpanToLabel({5, 9}, tokens, &base_label));
	  EXPECT_NE(kInvalidLabel, base_label);

	  TokenSpan token_span;
	  exact_processor.LabelToTokenSpan(base_label, &token_span);
	  EXPECT_EQ(0, token_span.first);
	  EXPECT_EQ(0, token_span.second);

	  // --- Snapping enabled --------------------------------------------------
	  options.set_snap_label_span_boundaries_to_containing_tokens(true);
	  TestingFeatureProcessor snapping_processor(options);

	  // Spans inside the same token are all expected to map to base_label.
	  int probe_label;
	  ASSERT_TRUE(snapping_processor.SpanToLabel({5, 8}, tokens, &probe_label));
	  EXPECT_EQ(base_label, probe_label);
	  ASSERT_TRUE(snapping_processor.SpanToLabel({6, 9}, tokens, &probe_label));
	  EXPECT_EQ(base_label, probe_label);
	  ASSERT_TRUE(snapping_processor.SpanToLabel({5, 9}, tokens, &probe_label));
	  EXPECT_EQ(base_label, probe_label);

	  // Spans that reach across a token boundary are expected to be invalid.
	  ASSERT_TRUE(snapping_processor.SpanToLabel({4, 9}, tokens, &probe_label));
	  EXPECT_EQ(kInvalidLabel, probe_label);
	  ASSERT_TRUE(snapping_processor.SpanToLabel({5, 10}, tokens, &probe_label));
	  EXPECT_EQ(kInvalidLabel, probe_label);

	  // --- Selections covering several tokens --------------------------------
	  options.set_context_size(2);
	  options.set_max_selection_span(2);
	  TestingFeatureProcessor wide_processor(options);
	  tokens = wide_processor.Tokenize("zero, one, two, three, four");
	  ASSERT_TRUE(wide_processor.SpanToLabel({6, 15}, tokens, &probe_label));
	  EXPECT_NE(kInvalidLabel, probe_label);
	  wide_processor.LabelToTokenSpan(probe_label, &token_span);
	  EXPECT_EQ(1, token_span.first);
	  EXPECT_EQ(0, token_span.second);

	  // Slightly narrower spans are expected to snap to the same label.
	  int snapped_label;
	  ASSERT_TRUE(wide_processor.SpanToLabel({6, 14}, tokens, &snapped_label));
	  EXPECT_EQ(probe_label, snapped_label);
	  ASSERT_TRUE(wide_processor.SpanToLabel({6, 13}, tokens, &snapped_label));
	  EXPECT_EQ(probe_label, snapped_label);
	  ASSERT_TRUE(wide_processor.SpanToLabel({7, 13}, tokens, &snapped_label));
	  EXPECT_EQ(probe_label, snapped_label);
	}

	// With variable context dropout enabled, two consecutive feature extractions
	// over identical input are expected to produce different features.
	TEST(FeatureProcessorTest, GetFeaturesWithContextDropout) {
	  static const char kContext[] = "1 2 3 c o n t e x t X c o n t e x t 1 2 3";

	  FeatureProcessorOptions options;
	  options.set_num_buckets(10);
	  options.set_context_size(7);
	  options.set_max_selection_span(7);
	  options.add_chargram_orders(1);
	  options.set_tokenize_on_space(true);
	  options.set_context_dropout_probability(0.5);
	  options.set_use_variable_context_dropout(true);

	  // Codepoint 32 (ASCII space) is configured as the whitespace separator.
	  TokenizationCodepointRange* space_range =
	      options.add_tokenization_codepoint_config();
	  space_range->set_start(32);
	  space_range->set_end(33);
	  space_range->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);

	  FeatureProcessor processor(options);

	  // A default-constructed std::mt19937 starts from the engine's fixed default
	  // seed, so the random sequence is the same on every run of this test.
	  // NOTE(review): raw `new` - presumably SetRandom takes ownership; confirm.
	  processor.SetRandom(new std::mt19937);

	  std::vector<std::vector<std::pair<int, float>>> first_run;
	  std::vector<std::vector<std::pair<int, float>>> second_run;
	  std::vector<float> extra_features;
	  std::vector<CodepointSpan> label_spans;
	  int selection_label;
	  CodepointSpan selection_codepoint_label;
	  int classification_label;

	  EXPECT_TRUE(processor.GetFeaturesAndLabels(
	      kContext, {20, 21}, {20, 21}, "", &first_run, &extra_features,
	      &label_spans, &selection_label, &selection_codepoint_label,
	      &classification_label));
	  EXPECT_TRUE(processor.GetFeaturesAndLabels(
	      kContext, {20, 21}, {20, 21}, "", &second_run, &extra_features,
	      &label_spans, &selection_label, &selection_codepoint_label,
	      &classification_label));

	  EXPECT_NE(first_run, second_run);
	}

	// Checks that the number of per-token feature vectors is 19 both for an
	// input with plenty of context and for a single-token input.
	TEST(FeatureProcessorTest, GetFeaturesWithLongerContext) {
	  FeatureProcessorOptions options;
	  options.set_num_buckets(10);
	  options.set_context_size(9);
	  options.set_max_selection_span(7);
	  options.add_chargram_orders(1);
	  options.set_tokenize_on_space(true);

	  // Codepoint 32 (ASCII space) is configured as the whitespace separator.
	  TokenizationCodepointRange* space_range =
	      options.add_tokenization_codepoint_config();
	  space_range->set_start(32);
	  space_range->set_end(33);
	  space_range->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);

	  FeatureProcessor processor(options);

	  std::vector<std::vector<std::pair<int, float>>> token_features;
	  std::vector<float> extra_features;
	  std::vector<CodepointSpan> label_spans;
	  int selection_label;
	  CodepointSpan selection_codepoint_label;
	  int classification_label;

	  // 19 == 2 * context_size + 1 with context_size == 9 (presumably the click
	  // token plus the context on either side - confirm).
	  EXPECT_TRUE(processor.GetFeaturesAndLabels(
	      "1 2 3 c o n t e x t X c o n t e x t 1 2 3", {20, 21}, {20, 21}, "",
	      &token_features, &extra_features, &label_spans, &selection_label,
	      &selection_codepoint_label, &classification_label));
	  EXPECT_EQ(19, token_features.size());

	  // A one-token input is expected to be padded up to the same size.
	  EXPECT_TRUE(processor.GetFeaturesAndLabels(
	      "X", {0, 1}, {0, 1}, "", &token_features, &extra_features,
	      &label_spans, &selection_label, &selection_codepoint_label,
	      &classification_label));
	  EXPECT_EQ(19, token_features.size());
	}

	// Exercises internal::CenterTokenFromClick against a fixed three-token
	// sentence with click spans that align with, fall inside, or straddle tokens.
	TEST(FeatureProcessorTest, CenterTokenFromClick) {
	  // Every case below is evaluated against the same token list.
	  const std::vector<Token> tokens = {Token("Hělló", 0, 5, false),
	                                     Token("world", 6, 11, false),
	                                     Token("heře!", 12, 17, false)};

	  // Exactly aligned indices.
	  EXPECT_EQ(internal::CenterTokenFromClick({6, 11}, tokens), 1);

	  // Click is contained in a token.
	  EXPECT_EQ(internal::CenterTokenFromClick({13, 17}, tokens), 2);

	  // Click spans two tokens.
	  EXPECT_EQ(internal::CenterTokenFromClick({6, 17}, tokens), kInvalidIndex);
	}

	// Exercises internal::CenterTokenFromMiddleOfSelection against a fixed
	// five-token list ("Token1" .. "Token5", each 6 codepoints long and one
	// codepoint apart) plus an empty token list.
	TEST(FeatureProcessorTest, CenterTokenFromMiddleOfSelection) {
	  // Shared fixture; previously this list was spelled out for every call.
	  const std::vector<Token> tokens = {
	      Token("Token1", 0, 6, false), Token("Token2", 7, 13, false),
	      Token("Token3", 14, 20, false), Token("Token4", 21, 27, false),
	      Token("Token5", 28, 34, false)};

	  // Selection of length 3. Exactly aligned indices.
	  EXPECT_EQ(internal::CenterTokenFromMiddleOfSelection({7, 27}, tokens), 2);

	  // Selection of length 1 token. Exactly aligned indices.
	  EXPECT_EQ(internal::CenterTokenFromMiddleOfSelection({21, 27}, tokens), 3);

	  // Selection marks sub-token range, with no tokens in it.
	  EXPECT_EQ(internal::CenterTokenFromMiddleOfSelection({29, 33}, tokens),
	            kInvalidIndex);

	  // Selection of length 2. Sub-token indices.
	  EXPECT_EQ(internal::CenterTokenFromMiddleOfSelection({3, 25}, tokens), 1);

	  // Selection of length 1. Sub-token indices.
	  EXPECT_EQ(internal::CenterTokenFromMiddleOfSelection({22, 34}, tokens), 4);

	  // Some invalid ones: with no tokens at all there is no center token.
	  // The named sentinel is used instead of a bare -1 so this expectation
	  // matches the sub-token case above.
	  EXPECT_EQ(internal::CenterTokenFromMiddleOfSelection({7, 27}, {}),
	            kInvalidIndex);
	}

	// Runs feature extraction on a multi-line context with the middle-of-
	// selection center token method, line stripping, token splitting on selection
	// boundaries and the selection mask feature all enabled, and checks that 19
	// feature vectors come back.
	TEST(FeatureProcessorTest, GetFeaturesForSharing) {
	  using SparseFeatures = std::vector<std::vector<std::pair<int, float>>>;

	  FeatureProcessorOptions options;
	  options.set_num_buckets(10);
	  options.add_chargram_orders(1);
	  options.set_context_size(9);
	  options.set_max_selection_span(7);
	  options.set_tokenize_on_space(true);
	  options.set_only_use_line_with_click(true);
	  options.set_split_tokens_on_selection_boundaries(true);
	  options.set_extract_selection_mask_feature(true);
	  options.set_center_token_selection_method(
	      FeatureProcessorOptions::CENTER_TOKEN_MIDDLE_OF_SELECTION);

	  // Two whitespace separator ranges: one starting at codepoint 32 (ASCII
	  // space) and one starting at codepoint 10 (line feed).
	  TokenizationCodepointRange* space_range =
	      options.add_tokenization_codepoint_config();
	  space_range->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
	  space_range->set_start(32);
	  space_range->set_end(33);

	  TokenizationCodepointRange* newline_range =
	      options.add_tokenization_codepoint_config();
	  newline_range->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
	  newline_range->set_start(10);
	  newline_range->set_end(11);

	  FeatureProcessor feature_processor(options);

	  // Codepoints [13, 24) of the context are "some entity" on the third line.
	  const char* const context = "line 1\nline2\nsome entity\n line 4";
	  const CodepointSpan entity_span = {13, 24};

	  SparseFeatures features;
	  std::vector<float> extra_features;
	  std::vector<CodepointSpan> selection_label_spans;
	  int selection_label;
	  CodepointSpan selection_codepoint_label;
	  int classification_label;
	  EXPECT_TRUE(feature_processor.GetFeaturesAndLabels(
	      context, entity_span, entity_span, "", &features, &extra_features,
	      &selection_label_spans, &selection_label, &selection_codepoint_label,
	      &classification_label));
	  EXPECT_EQ(19, features.size());
	}

	// Covers the supported-codepoint machinery in three steps:
	//   1. SupportedCodepointsRatio over tokenized text,
	//   2. IsCodepointSupported at the boundaries of the configured ranges,
	//   3. GetFeatures accepting or rejecting input depending on
	//      min_supported_codepoint_ratio.
	TEST(FeatureProcessorTest, SupportedCodepointsRatio) {
	  FeatureProcessorOptions options;
	  options.set_context_size(2);
	  options.set_max_selection_span(2);
	  options.set_snap_label_span_boundaries_to_containing_tokens(false);

	  // Register the codepoint range starting at 32 (ASCII space) as a
	  // whitespace separator.
	  TokenizationCodepointRange* config =
	      options.add_tokenization_codepoint_config();
	  config->set_start(32);
	  config->set_end(33);
	  config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);

	  // Three supported ranges. The IsCodepointSupported checks below show that
	  // the start is inclusive and the end is exclusive.
	  FeatureProcessorOptions::CodepointRange* range;
	  range = options.add_supported_codepoint_ranges();
	  range->set_start(0);
	  range->set_end(128);

	  range = options.add_supported_codepoint_ranges();
	  range->set_start(10000);
	  range->set_end(10001);

	  range = options.add_supported_codepoint_ranges();
	  range->set_start(20000);
	  range->set_end(30000);

	  TestingFeatureProcessor feature_processor(options);

	  // Three ASCII tokens, then two ASCII tokens plus one non-ASCII token, then
	  // no ASCII tokens at all.
	  EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
	                  1, feature_processor.Tokenize("aaa bbb ccc")),
	              FloatEq(1.0));
	  EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
	                  1, feature_processor.Tokenize("aaa bbb ěěě")),
	              FloatEq(2.0 / 3));
	  EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
	                  1, feature_processor.Tokenize("ěěě řřř ěěě")),
	              FloatEq(0.0));

	  // Boundary checks around each configured range.
	  EXPECT_FALSE(feature_processor.IsCodepointSupported(-1));
	  EXPECT_TRUE(feature_processor.IsCodepointSupported(0));
	  EXPECT_TRUE(feature_processor.IsCodepointSupported(10));
	  EXPECT_TRUE(feature_processor.IsCodepointSupported(127));
	  EXPECT_FALSE(feature_processor.IsCodepointSupported(128));
	  EXPECT_FALSE(feature_processor.IsCodepointSupported(9999));
	  EXPECT_TRUE(feature_processor.IsCodepointSupported(10000));
	  EXPECT_FALSE(feature_processor.IsCodepointSupported(10001));
	  EXPECT_TRUE(feature_processor.IsCodepointSupported(25000));

	  std::vector<nlp_core::FeatureVector> features;
	  std::vector<float> extra_features;

	  // The same text is accepted with thresholds 0.0 and 0.2 and rejected with
	  // 0.5.
	  // NOTE(review): presumably its supported ratio is 1/3 (one ASCII token out
	  // of three) - confirm.
	  options.set_min_supported_codepoint_ratio(0.0);
	  feature_processor = TestingFeatureProcessor(options);
	  EXPECT_TRUE(feature_processor.GetFeatures("ěěě řřř eee", {4, 7}, &features,
	                                            &extra_features,
	                                            /*selection_label_spans=*/nullptr));

	  options.set_min_supported_codepoint_ratio(0.2);
	  feature_processor = TestingFeatureProcessor(options);
	  EXPECT_TRUE(feature_processor.GetFeatures("ěěě řřř eee", {4, 7}, &features,
	                                            &extra_features,
	                                            /*selection_label_spans=*/nullptr));

	  options.set_min_supported_codepoint_ratio(0.5);
	  feature_processor = TestingFeatureProcessor(options);
	  EXPECT_FALSE(feature_processor.GetFeatures(
	      "ěěě řřř eee", {4, 7}, &features, &extra_features,
	      /*selection_label_spans=*/nullptr));
	}

	} // namespace
	} // namespace libtextclassifier