| /* |
| * Copyright (C) 2018 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #include "annotator/number/number.h" |
| |
| #include <string> |
| #include <vector> |
| |
| #include "annotator/collections.h" |
| #include "annotator/model_generated.h" |
| #include "annotator/types-test-util.h" |
| #include "annotator/types.h" |
| #include "utils/test-utils.h" |
| #include "utils/utf8/unicodetext.h" |
| #include "utils/utf8/unilib.h" |
| #include "gmock/gmock.h" |
| #include "gtest/gtest.h" |
| |
| namespace libtextclassifier3 { |
| namespace { |
| |
| using testing::AllOf; |
| using testing::ElementsAre; |
| using testing::Field; |
| |
| const NumberAnnotatorOptions* TestingNumberAnnotatorOptions() { |
| static const flatbuffers::DetachedBuffer* options_data = []() { |
| NumberAnnotatorOptionsT options; |
| options.enabled = true; |
| options.allowed_prefix_codepoints.push_back('$'); |
| options.allowed_suffix_codepoints.push_back('%'); |
| |
| flatbuffers::FlatBufferBuilder builder; |
| builder.Finish(NumberAnnotatorOptions::Pack(builder, &options)); |
| return new flatbuffers::DetachedBuffer(builder.Release()); |
| }(); |
| |
| return flatbuffers::GetRoot<NumberAnnotatorOptions>(options_data->data()); |
| } |
| |
| FeatureProcessor BuildFeatureProcessor(const UniLib* unilib) { |
| static const flatbuffers::DetachedBuffer* options_data = []() { |
| FeatureProcessorOptionsT options; |
| options.context_size = 1; |
| options.max_selection_span = 1; |
| options.snap_label_span_boundaries_to_containing_tokens = false; |
| options.ignored_span_boundary_codepoints.push_back(','); |
| |
| options.tokenization_codepoint_config.emplace_back( |
| new TokenizationCodepointRangeT()); |
| auto& config = options.tokenization_codepoint_config.back(); |
| config->start = 32; |
| config->end = 33; |
| config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR; |
| |
| flatbuffers::FlatBufferBuilder builder; |
| builder.Finish(FeatureProcessorOptions::Pack(builder, &options)); |
| return new flatbuffers::DetachedBuffer(builder.Release()); |
| }(); |
| |
| const FeatureProcessorOptions* feature_processor_options = |
| flatbuffers::GetRoot<FeatureProcessorOptions>(options_data->data()); |
| |
| return FeatureProcessor(feature_processor_options, unilib); |
| } |
| |
| class NumberAnnotatorTest : public ::testing::Test { |
| protected: |
| NumberAnnotatorTest() |
| : INIT_UNILIB_FOR_TESTING(unilib_), |
| feature_processor_(BuildFeatureProcessor(&unilib_)), |
| number_annotator_(TestingNumberAnnotatorOptions(), |
| &feature_processor_) {} |
| |
| UniLib unilib_; |
| FeatureProcessor feature_processor_; |
| NumberAnnotator number_annotator_; |
| }; |
| |
| TEST_F(NumberAnnotatorTest, ClassifiesAndParsesNumberCorrectly) { |
| ClassificationResult classification_result; |
| EXPECT_TRUE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("... 12345 ..."), {4, 9}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| |
| EXPECT_EQ(classification_result.collection, "number"); |
| EXPECT_EQ(classification_result.numeric_value, 12345); |
| } |
| |
| TEST_F(NumberAnnotatorTest, ClassifiesNonNumberCorrectly) { |
| ClassificationResult classification_result; |
| EXPECT_FALSE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("... 123a45 ..."), {4, 10}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| } |
| |
| TEST_F(NumberAnnotatorTest, FindsAllNumbersInText) { |
| std::vector<AnnotatedSpan> result; |
| EXPECT_TRUE(number_annotator_.FindAll( |
| UTF8ToUnicodeText("... 12345 ... 9 is my number and I paid $99 and " |
| "sometimes 27% but not 68# nor #68"), |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &result)); |
| |
| ASSERT_EQ(result.size(), 4); |
| ASSERT_EQ(result[0].classification.size(), 1); |
| EXPECT_EQ(result[0].classification[0].collection, "number"); |
| EXPECT_EQ(result[0].classification[0].numeric_value, 12345); |
| ASSERT_EQ(result[1].classification.size(), 1); |
| EXPECT_EQ(result[1].classification[0].collection, "number"); |
| EXPECT_EQ(result[1].classification[0].numeric_value, 9); |
| ASSERT_EQ(result[2].classification.size(), 1); |
| EXPECT_EQ(result[2].classification[0].collection, "number"); |
| EXPECT_EQ(result[2].classification[0].numeric_value, 99); |
| ASSERT_EQ(result[3].classification.size(), 1); |
| EXPECT_EQ(result[3].classification[0].collection, "number"); |
| EXPECT_EQ(result[3].classification[0].numeric_value, 27); |
| } |
| |
| TEST_F(NumberAnnotatorTest, FindsNumberWithPunctuation) { |
| std::vector<AnnotatedSpan> result; |
| EXPECT_TRUE(number_annotator_.FindAll( |
| UTF8ToUnicodeText("Come at 9, ok?"), |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &result)); |
| |
| EXPECT_THAT( |
| result, |
| ElementsAre( |
| AllOf(Field(&AnnotatedSpan::span, CodepointSpan(8, 9)), |
| Field(&AnnotatedSpan::classification, |
| ElementsAre(AllOf( |
| Field(&ClassificationResult::collection, "number"), |
| Field(&ClassificationResult::numeric_value, 9))))))); |
| } |
| |
| TEST_F(NumberAnnotatorTest, HandlesNumbersAtBeginning) { |
| std::vector<AnnotatedSpan> result; |
| EXPECT_TRUE(number_annotator_.FindAll( |
| UTF8ToUnicodeText("-5"), AnnotationUsecase_ANNOTATION_USECASE_RAW, |
| &result)); |
| |
| EXPECT_THAT( |
| result, |
| ElementsAre( |
| AllOf(Field(&AnnotatedSpan::span, CodepointSpan(0, 2)), |
| Field(&AnnotatedSpan::classification, |
| ElementsAre(AllOf( |
| Field(&ClassificationResult::collection, "number"), |
| Field(&ClassificationResult::numeric_value, -5))))))); |
| } |
| |
| TEST_F(NumberAnnotatorTest, WhenLowestSupportedNumberParsesIt) { |
| ClassificationResult classification_result; |
| EXPECT_TRUE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("-999999999999999999"), {0, 19}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| |
| EXPECT_THAT( |
| classification_result, |
| AllOf(Field(&ClassificationResult::collection, "number"), |
| Field(&ClassificationResult::numeric_value, -999999999999999999L))); |
| } |
| |
| TEST_F(NumberAnnotatorTest, WhenLargestSupportedNumberParsesIt) { |
| ClassificationResult classification_result; |
| EXPECT_TRUE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("999999999999999999"), {0, 18}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| |
| EXPECT_THAT( |
| classification_result, |
| AllOf(Field(&ClassificationResult::collection, "number"), |
| Field(&ClassificationResult::numeric_value, 999999999999999999L))); |
| } |
| |
| TEST_F(NumberAnnotatorTest, WhenFirstLowestNonSupportedNumberDoesNotParseIt) { |
| ClassificationResult classification_result; |
| EXPECT_FALSE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("-10000000000000000000"), {0, 21}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| } |
| |
| TEST_F(NumberAnnotatorTest, WhenFirstLargestNonSupportedNumberDoesNotParseIt) { |
| ClassificationResult classification_result; |
| EXPECT_FALSE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("10000000000000000000"), {0, 20}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| } |
| |
| TEST_F(NumberAnnotatorTest, WhenLargeNumberDoesNotParseIt) { |
| ClassificationResult classification_result; |
| EXPECT_FALSE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("1234567890123456789012345678901234567890"), {0, 40}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| } |
| |
| TEST_F(NumberAnnotatorTest, WhenMultipleMinusSignsDoesNotParseIt) { |
| ClassificationResult classification_result; |
| EXPECT_FALSE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("--10"), {0, 4}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| } |
| |
| TEST_F(NumberAnnotatorTest, WhenMinusSignSuffixDoesNotParseIt) { |
| ClassificationResult classification_result; |
| EXPECT_FALSE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("10-"), {0, 3}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| } |
| |
| TEST_F(NumberAnnotatorTest, WhenMinusInTheMiddleDoesNotParseIt) { |
| ClassificationResult classification_result; |
| EXPECT_FALSE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("2016-2017"), {0, 9}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| } |
| |
| TEST_F(NumberAnnotatorTest, WhenSuffixWithoutNumberDoesNotParseIt) { |
| std::vector<AnnotatedSpan> result; |
| EXPECT_TRUE(number_annotator_.FindAll( |
| UTF8ToUnicodeText("... % ..."), AnnotationUsecase_ANNOTATION_USECASE_RAW, |
| &result)); |
| |
| ASSERT_EQ(result.size(), 0); |
| } |
| |
| TEST_F(NumberAnnotatorTest, WhenPrefixWithoutNumberDoesNotParseIt) { |
| std::vector<AnnotatedSpan> result; |
| EXPECT_TRUE(number_annotator_.FindAll( |
| UTF8ToUnicodeText("... $ ..."), AnnotationUsecase_ANNOTATION_USECASE_RAW, |
| &result)); |
| |
| ASSERT_EQ(result.size(), 0); |
| } |
| |
| TEST_F(NumberAnnotatorTest, WhenPrefixAndSuffixWithoutNumberDoesNotParseIt) { |
| std::vector<AnnotatedSpan> result; |
| EXPECT_TRUE(number_annotator_.FindAll( |
| UTF8ToUnicodeText("... $% ..."), AnnotationUsecase_ANNOTATION_USECASE_RAW, |
| &result)); |
| |
| ASSERT_EQ(result.size(), 0); |
| } |
| |
| } // namespace |
| } // namespace libtextclassifier3 |