| /* |
| * Copyright (C) 2018 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #include "annotator/number/number_test-include.h" |
| |
| #include <string> |
| #include <vector> |
| |
| #include "annotator/collections.h" |
| #include "annotator/model_generated.h" |
| #include "annotator/types-test-util.h" |
| #include "annotator/types.h" |
| #include "utils/tokenizer-utils.h" |
| #include "utils/utf8/unicodetext.h" |
| #include "gmock/gmock.h" |
| #include "gtest/gtest.h" |
| |
| namespace libtextclassifier3 { |
| namespace test_internal { |
| |
| using ::testing::AllOf; |
| using ::testing::ElementsAre; |
| using ::testing::Field; |
| using ::testing::Matcher; |
| using ::testing::UnorderedElementsAre; |
| |
| const NumberAnnotatorOptions* |
| NumberAnnotatorTest::TestingNumberAnnotatorOptions() { |
| static const flatbuffers::DetachedBuffer* options_data = []() { |
| NumberAnnotatorOptionsT options; |
| options.enabled = true; |
| options.priority_score = -10.0; |
| options.float_number_priority_score = 1.0; |
| options.enabled_annotation_usecases = |
| 1 << AnnotationUsecase_ANNOTATION_USECASE_RAW; |
| options.max_number_of_digits = 20; |
| |
| options.percentage_priority_score = 1.0; |
| options.percentage_annotation_usecases = |
| (1 << AnnotationUsecase_ANNOTATION_USECASE_RAW) + |
| (1 << AnnotationUsecase_ANNOTATION_USECASE_SMART); |
| std::set<std::string> percent_suffixes({"パーセント", "percent", "pércént", |
| "pc", "pct", "%", "٪", "﹪", "%"}); |
| for (const std::string& string_value : percent_suffixes) { |
| options.percentage_pieces_string.append(string_value); |
| options.percentage_pieces_string.push_back('\0'); |
| } |
| |
| flatbuffers::FlatBufferBuilder builder; |
| builder.Finish(NumberAnnotatorOptions::Pack(builder, &options)); |
| return new flatbuffers::DetachedBuffer(builder.Release()); |
| }(); |
| |
| return flatbuffers::GetRoot<NumberAnnotatorOptions>(options_data->data()); |
| } |
| |
| MATCHER_P(IsCorrectCollection, collection, "collection is " + collection) { |
| return arg.collection == collection; |
| } |
| |
| MATCHER_P(IsCorrectNumericValue, numeric_value, |
| "numeric value is " + std::to_string(numeric_value)) { |
| return arg.numeric_value == numeric_value; |
| } |
| |
| MATCHER_P(IsCorrectNumericDoubleValue, numeric_double_value, |
| "numeric double value is " + std::to_string(numeric_double_value)) { |
| return arg.numeric_double_value == numeric_double_value; |
| } |
| |
| MATCHER_P(IsCorrectScore, score, "score is " + std::to_string(score)) { |
| return arg.score == score; |
| } |
| |
| MATCHER_P(IsCorrectPriortyScore, priority_score, |
| "priority score is " + std::to_string(priority_score)) { |
| return arg.priority_score == priority_score; |
| } |
| |
| MATCHER_P(IsCorrectSpan, span, |
| "span is (" + std::to_string(span.first) + "," + |
| std::to_string(span.second) + ")") { |
| return arg.span == span; |
| } |
| |
| MATCHER_P(Classification, inner, "") { |
| return testing::ExplainMatchResult(inner, arg.classification, |
| result_listener); |
| } |
| |
| static Matcher<AnnotatedSpan> IsAnnotatedSpan( |
| const CodepointSpan& codepoint_span, const std::string& collection, |
| const int int_value, const double double_value, |
| const float priority_score = -10, const float score = 1) { |
| return AllOf( |
| IsCorrectSpan(codepoint_span), |
| Classification(ElementsAre(AllOf( |
| IsCorrectCollection(collection), IsCorrectNumericValue(int_value), |
| IsCorrectNumericDoubleValue(double_value), IsCorrectScore(score), |
| IsCorrectPriortyScore(priority_score))))); |
| } |
| |
| TEST_F(NumberAnnotatorTest, ClassifiesAndParsesNumberCorrectly) { |
| ClassificationResult classification_result; |
| EXPECT_TRUE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("... 12345 ..."), {4, 9}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| |
| EXPECT_EQ(classification_result.collection, "number"); |
| EXPECT_EQ(classification_result.numeric_value, 12345); |
| EXPECT_FLOAT_EQ(classification_result.numeric_double_value, 12345); |
| } |
| |
| TEST_F(NumberAnnotatorTest, ClassifiesAndParsesNumberAsFloatCorrectly) { |
| ClassificationResult classification_result; |
| EXPECT_TRUE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("... 12345.12345 ..."), {4, 15}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| |
| EXPECT_EQ(classification_result.collection, "number"); |
| EXPECT_EQ(classification_result.numeric_value, 12345); |
| EXPECT_FLOAT_EQ(classification_result.numeric_double_value, 12345.12345); |
| } |
| |
| TEST_F(NumberAnnotatorTest, |
| ClassifiesAndParsesNumberAsFloatCorrectlyWithoutDecimals) { |
| ClassificationResult classification_result; |
| // The dot after a number is considered punctuation, not part of a floating |
| // number. |
| EXPECT_TRUE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("... 12345. ..."), {4, 9}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| EXPECT_FALSE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("... 12345. ..."), {4, 10}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| |
| EXPECT_EQ(classification_result.collection, "number"); |
| EXPECT_EQ(classification_result.numeric_value, 12345); |
| EXPECT_FLOAT_EQ(classification_result.numeric_double_value, 12345); |
| |
| EXPECT_TRUE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("... 12345. ..."), {4, 9}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| EXPECT_EQ(classification_result.collection, "number"); |
| EXPECT_EQ(classification_result.numeric_value, 12345); |
| EXPECT_FLOAT_EQ(classification_result.numeric_double_value, 12345); |
| } |
| |
| TEST_F(NumberAnnotatorTest, FindsAllIntegerAndFloatNumbersInText) { |
| std::vector<AnnotatedSpan> result; |
| // In the context "68.9#" -> 68.9 is a number because # is punctuation. |
| // In the context "68.9#?" -> 68.9 is not a number because is followed by two |
| // punctuation signs. |
| EXPECT_TRUE(number_annotator_.FindAll( |
| UTF8ToUnicodeText("how much is 2 plus 5 divided by 7% minus 3.14 " |
| "what about 68.9# or 68.9#?"), |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &result)); |
| |
| EXPECT_THAT(result, |
| UnorderedElementsAre( |
| IsAnnotatedSpan(CodepointSpan(12, 13), "number", |
| /*int_value=*/2, /*double_value=*/2.0), |
| IsAnnotatedSpan(CodepointSpan(19, 20), "number", |
| /*int_value=*/5, /*double_value=*/5.0), |
| IsAnnotatedSpan(CodepointSpan(32, 33), "number", |
| /*int_value=*/7, /*double_value=*/7.0), |
| IsAnnotatedSpan(CodepointSpan(32, 34), "percentage", |
| /*int_value=*/7, /*double_value=*/7.0, |
| /*priority_score=*/1), |
| IsAnnotatedSpan(CodepointSpan(41, 45), "number", |
| /*int_value=*/3, /*double_value=*/3.14, |
| /*priority_score=*/1), |
| IsAnnotatedSpan(CodepointSpan(57, 61), "number", |
| /*int_value=*/68, /*double_value=*/68.9, |
| /*priority_score=*/1))); |
| } |
| |
| TEST_F(NumberAnnotatorTest, ClassifiesNonNumberCorrectly) { |
| ClassificationResult classification_result; |
| EXPECT_FALSE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("... 123a45 ..."), {4, 10}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| EXPECT_FALSE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("... 12345..12345 ..."), {4, 16}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| EXPECT_FALSE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("... 12345a ..."), {4, 11}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| } |
| |
| TEST_F(NumberAnnotatorTest, ClassifiesNumberSelectionCorrectly) { |
| ClassificationResult classification_result; |
| // Punctuation after a number is not part of the number. |
| EXPECT_TRUE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("... 14, ..."), {4, 6}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| EXPECT_EQ(classification_result.collection, "number"); |
| EXPECT_EQ(classification_result.numeric_value, 14); |
| EXPECT_EQ(classification_result.numeric_double_value, 14); |
| |
| EXPECT_FALSE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("... 14, ..."), {4, 7}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| } |
| |
| TEST_F(NumberAnnotatorTest, ClassifiesPercentageSignCorrectly) { |
| ClassificationResult classification_result; |
| EXPECT_TRUE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("... 99% ..."), {4, 7}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| |
| EXPECT_EQ(classification_result.collection, "percentage"); |
| EXPECT_EQ(classification_result.numeric_value, 99); |
| EXPECT_EQ(classification_result.numeric_double_value, 99); |
| } |
| |
| TEST_F(NumberAnnotatorTest, ClassifiesPercentageWordCorrectly) { |
| ClassificationResult classification_result; |
| EXPECT_TRUE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("... 15 percent ..."), {4, 14}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| |
| EXPECT_EQ(classification_result.collection, "percentage"); |
| EXPECT_EQ(classification_result.numeric_value, 15); |
| EXPECT_EQ(classification_result.numeric_double_value, 15); |
| } |
| |
| TEST_F(NumberAnnotatorTest, ClassifiesNonAsciiPercentageIncorrectSuffix) { |
| ClassificationResult classification_result; |
| EXPECT_FALSE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("15 café"), {0, 7}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| } |
| |
| TEST_F(NumberAnnotatorTest, ClassifiesNonAsciiFrPercentageCorrectSuffix) { |
| ClassificationResult classification_result; |
| EXPECT_TRUE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("25 pércént"), {0, 10}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| |
| EXPECT_EQ(classification_result.collection, "percentage"); |
| EXPECT_EQ(classification_result.numeric_value, 25); |
| EXPECT_EQ(classification_result.numeric_double_value, 25); |
| } |
| |
| TEST_F(NumberAnnotatorTest, ClassifiesNonAsciiJaPercentageCorrectSuffix) { |
| ClassificationResult classification_result; |
| EXPECT_TRUE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("10パーセント"), {0, 7}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| EXPECT_EQ(classification_result.collection, "percentage"); |
| EXPECT_EQ(classification_result.numeric_value, 10); |
| EXPECT_EQ(classification_result.numeric_double_value, 10); |
| |
| std::vector<AnnotatedSpan> result; |
| EXPECT_TRUE(number_annotator_.FindAll( |
| UTF8ToUnicodeText("明日の降水確率は10パーセント 音量を12にセット"), |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &result)); |
| EXPECT_THAT(result, |
| UnorderedElementsAre( |
| IsAnnotatedSpan(CodepointSpan(8, 10), "number", |
| /*int_value=*/10, /*double_value=*/10.0), |
| IsAnnotatedSpan(CodepointSpan(8, 15), "percentage", |
| /*int_value=*/10, /*double_value=*/10.0, |
| /*priority_score=*/1), |
| IsAnnotatedSpan(CodepointSpan(20, 22), "number", |
| /*int_value=*/12, /*double_value=*/12.0))); |
| } |
| |
| TEST_F(NumberAnnotatorTest, FindsAllNumbersInText) { |
| std::vector<AnnotatedSpan> result; |
| EXPECT_TRUE(number_annotator_.FindAll( |
| UTF8ToUnicodeText("... 12345 ... 9 is my number and 27% or 68# #38 #39 " |
| "but not $99."), |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &result)); |
| |
| EXPECT_THAT( |
| result, |
| UnorderedElementsAre( |
| IsAnnotatedSpan(CodepointSpan(4, 9), "number", |
| /*int_value=*/12345, /*double_value=*/12345.0), |
| IsAnnotatedSpan(CodepointSpan(14, 15), "number", |
| /*int_value=*/9, /*double_value=*/9.0), |
| IsAnnotatedSpan(CodepointSpan(33, 35), "number", |
| /*int_value=*/27, /*double_value=*/27.0), |
| IsAnnotatedSpan(CodepointSpan(33, 36), "percentage", |
| /*int_value=*/27, /*double_value=*/27.0, |
| /*priority_score=*/1), |
| IsAnnotatedSpan(CodepointSpan(40, 42), "number", |
| /*int_value=*/68, /*double_value=*/68.0), |
| IsAnnotatedSpan(CodepointSpan(45, 47), "number", |
| /*int_value=*/38, /*double_value=*/38.0), |
| IsAnnotatedSpan(CodepointSpan(49, 51), "number", |
| /*int_value=*/39, /*double_value=*/39.0))); |
| } |
| |
| TEST_F(NumberAnnotatorTest, FindsNoNumberInText) { |
| std::vector<AnnotatedSpan> result; |
| EXPECT_TRUE(number_annotator_.FindAll( |
| UTF8ToUnicodeText("... 12345a ... 12345..12345 and 123a45 are not valid. " |
| "And -#5% is also bad."), |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &result)); |
| ASSERT_EQ(result.size(), 0); |
| } |
| |
| TEST_F(NumberAnnotatorTest, FindsNumberWithPunctuation) { |
| std::vector<AnnotatedSpan> result; |
| // A number should be followed by only one punctuation signs => 15 is not a |
| // number. |
| EXPECT_TRUE(number_annotator_.FindAll( |
| UTF8ToUnicodeText( |
| "It's 12, 13, 14! Or 15??? For sure 16: 17; 18. and -19"), |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &result)); |
| |
| EXPECT_THAT(result, |
| UnorderedElementsAre( |
| IsAnnotatedSpan(CodepointSpan(5, 7), "number", |
| /*int_value=*/12, /*double_value=*/12.0), |
| IsAnnotatedSpan(CodepointSpan(9, 11), "number", |
| /*int_value=*/13, /*double_value=*/13.0), |
| IsAnnotatedSpan(CodepointSpan(13, 15), "number", |
| /*int_value=*/14, /*double_value=*/14.0), |
| IsAnnotatedSpan(CodepointSpan(35, 37), "number", |
| /*int_value=*/16, /*double_value=*/16.0), |
| IsAnnotatedSpan(CodepointSpan(39, 41), "number", |
| /*int_value=*/17, /*double_value=*/17.0), |
| IsAnnotatedSpan(CodepointSpan(43, 45), "number", |
| /*int_value=*/18, /*double_value=*/18.0), |
| IsAnnotatedSpan(CodepointSpan(51, 54), "number", |
| /*int_value=*/-19, /*double_value=*/-19.0))); |
| } |
| |
| TEST_F(NumberAnnotatorTest, FindsFloatNumberWithPunctuation) { |
| std::vector<AnnotatedSpan> result; |
| EXPECT_TRUE(number_annotator_.FindAll( |
| UTF8ToUnicodeText("It's 12.123, 13.45, 14.54321! Or 15.1? Maybe 16.33: " |
| "17.21; but for sure 18.90."), |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &result)); |
| |
| EXPECT_THAT(result, |
| UnorderedElementsAre( |
| IsAnnotatedSpan(CodepointSpan(5, 11), "number", |
| /*int_value=*/12, /*double_value=*/12.123, |
| /*priority_score=*/1), |
| IsAnnotatedSpan(CodepointSpan(13, 18), "number", |
| /*int_value=*/13, /*double_value=*/13.45, |
| /*priority_score=*/1), |
| IsAnnotatedSpan(CodepointSpan(20, 28), "number", |
| /*int_value=*/14, /*double_value=*/14.54321, |
| /*priority_score=*/1), |
| IsAnnotatedSpan(CodepointSpan(33, 37), "number", |
| /*int_value=*/15, /*double_value=*/15.1, |
| /*priority_score=*/1), |
| IsAnnotatedSpan(CodepointSpan(45, 50), "number", |
| /*int_value=*/16, /*double_value=*/16.33, |
| /*priority_score=*/1), |
| IsAnnotatedSpan(CodepointSpan(52, 57), "number", |
| /*int_value=*/17, /*double_value=*/17.21, |
| /*priority_score=*/1), |
| IsAnnotatedSpan(CodepointSpan(72, 77), "number", |
| /*int_value=*/18, /*double_value=*/18.9, |
| /*priority_score=*/1))); |
| } |
| |
| TEST_F(NumberAnnotatorTest, HandlesNumbersAtBeginning) { |
| std::vector<AnnotatedSpan> result; |
| EXPECT_TRUE(number_annotator_.FindAll( |
| UTF8ToUnicodeText("-5"), AnnotationUsecase_ANNOTATION_USECASE_RAW, |
| &result)); |
| |
| EXPECT_THAT(result, UnorderedElementsAre(IsAnnotatedSpan( |
| CodepointSpan(0, 2), "number", |
| /*int_value=*/-5, /*double_value=*/-5))); |
| } |
| |
| TEST_F(NumberAnnotatorTest, HandlesNegativeNumbers) { |
| std::vector<AnnotatedSpan> result; |
| EXPECT_TRUE(number_annotator_.FindAll( |
| UTF8ToUnicodeText("Number -5 and -5% and not number --5%"), |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &result)); |
| |
| EXPECT_THAT(result, |
| UnorderedElementsAre( |
| IsAnnotatedSpan(CodepointSpan(7, 9), "number", |
| /*int_value=*/-5, /*double_value=*/-5), |
| IsAnnotatedSpan(CodepointSpan(14, 16), "number", |
| /*int_value=*/-5, /*double_value=*/-5), |
| IsAnnotatedSpan(CodepointSpan(14, 17), "percentage", |
| /*int_value=*/-5, /*double_value=*/-5, |
| /*priority_score=*/1))); |
| } |
| |
| TEST_F(NumberAnnotatorTest, FindGoodPercentageContexts) { |
| std::vector<AnnotatedSpan> result; |
| EXPECT_TRUE(number_annotator_.FindAll( |
| UTF8ToUnicodeText( |
| "5 percent, 10 pct, 25 pc and 17%, -5 percent, 10% are percentages"), |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &result)); |
| |
| EXPECT_THAT(result, |
| UnorderedElementsAre( |
| IsAnnotatedSpan(CodepointSpan(0, 1), "number", |
| /*int_value=*/5, /*double_value=*/5), |
| IsAnnotatedSpan(CodepointSpan(0, 9), "percentage", |
| /*int_value=*/5, /*double_value=*/5, |
| /*priority_score=*/1), |
| IsAnnotatedSpan(CodepointSpan(11, 13), "number", |
| /*int_value=*/10, /*double_value=*/10), |
| IsAnnotatedSpan(CodepointSpan(11, 17), "percentage", |
| /*int_value=*/10, /*double_value=*/10, |
| /*priority_score=*/1), |
| IsAnnotatedSpan(CodepointSpan(19, 21), "number", |
| /*int_value=*/25, /*double_value=*/25), |
| IsAnnotatedSpan(CodepointSpan(19, 24), "percentage", |
| /*int_value=*/25, /*double_value=*/25, |
| /*priority_score=*/1), |
| IsAnnotatedSpan(CodepointSpan(29, 31), "number", |
| /*int_value=*/17, /*double_value=*/17), |
| IsAnnotatedSpan(CodepointSpan(29, 32), "percentage", |
| /*int_value=*/17, /*double_value=*/17, |
| /*priority_score=*/1), |
| IsAnnotatedSpan(CodepointSpan(34, 36), "number", |
| /*int_value=*/-5, /*double_value=*/-5), |
| IsAnnotatedSpan(CodepointSpan(34, 44), "percentage", |
| /*int_value=*/-5, /*double_value=*/-5, |
| /*priority_score=*/1), |
| IsAnnotatedSpan(CodepointSpan(46, 48), "number", |
| /*int_value=*/10, /*double_value=*/10), |
| IsAnnotatedSpan(CodepointSpan(46, 49), "percentage", |
| /*int_value=*/10, /*double_value=*/10, |
| /*priority_score=*/1))); |
| } |
| |
| TEST_F(NumberAnnotatorTest, FindSinglePercentageInContext) { |
| std::vector<AnnotatedSpan> result; |
| EXPECT_TRUE(number_annotator_.FindAll( |
| UTF8ToUnicodeText("5%"), AnnotationUsecase_ANNOTATION_USECASE_RAW, |
| &result)); |
| |
| EXPECT_THAT(result, UnorderedElementsAre( |
| IsAnnotatedSpan(CodepointSpan(0, 1), "number", |
| /*int_value=*/5, /*double_value=*/5), |
| IsAnnotatedSpan(CodepointSpan(0, 2), "percentage", |
| /*int_value=*/5, /*double_value=*/5, |
| /*priority_score=*/1))); |
| } |
| |
| TEST_F(NumberAnnotatorTest, IgnoreBadPercentageContexts) { |
| std::vector<AnnotatedSpan> result; |
| // A valid number is followed by only one punctuation element. |
| EXPECT_TRUE(number_annotator_.FindAll( |
| UTF8ToUnicodeText("10, pct, 25 prc, 5#: percentage are not percentages"), |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &result)); |
| |
| EXPECT_THAT(result, |
| UnorderedElementsAre( |
| IsAnnotatedSpan(CodepointSpan(0, 2), "number", |
| /*int_value=*/10, /*double_value=*/10), |
| IsAnnotatedSpan(CodepointSpan(9, 11), "number", |
| /*int_value=*/25, /*double_value=*/25))); |
| } |
| |
| TEST_F(NumberAnnotatorTest, IgnoreBadPercentagePunctuationContexts) { |
| std::vector<AnnotatedSpan> result; |
| EXPECT_TRUE(number_annotator_.FindAll( |
| UTF8ToUnicodeText( |
| "#!24% or :?33 percent are not valid percentages, nor numbers."), |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &result)); |
| |
| EXPECT_TRUE(result.empty()); |
| } |
| |
| TEST_F(NumberAnnotatorTest, FindPercentageInNonAsciiContext) { |
| std::vector<AnnotatedSpan> result; |
| EXPECT_TRUE(number_annotator_.FindAll( |
| UTF8ToUnicodeText( |
| "At the café 10% or 25 percent of people are nice. Only 10%!"), |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &result)); |
| |
| EXPECT_THAT(result, |
| UnorderedElementsAre( |
| IsAnnotatedSpan(CodepointSpan(12, 14), "number", |
| /*int_value=*/10, /*double_value=*/10), |
| IsAnnotatedSpan(CodepointSpan(12, 15), "percentage", |
| /*int_value=*/10, /*double_value=*/10, |
| /*priority_score=*/1), |
| IsAnnotatedSpan(CodepointSpan(19, 21), "number", |
| /*int_value=*/25, /*double_value=*/25), |
| IsAnnotatedSpan(CodepointSpan(19, 29), "percentage", |
| /*int_value=*/25, /*double_value=*/25, |
| /*priority_score=*/1), |
| IsAnnotatedSpan(CodepointSpan(55, 57), "number", |
| /*int_value=*/10, /*double_value=*/10), |
| IsAnnotatedSpan(CodepointSpan(55, 58), "percentage", |
| /*int_value=*/10, /*double_value=*/10, |
| /*priority_score=*/1))); |
| } |
| |
| TEST_F(NumberAnnotatorTest, |
| WhenPercentSuffixWithAdditionalIgnoredCharactersDoesNotParseIt) { |
| ClassificationResult classification_result; |
| EXPECT_FALSE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("23#!? percent"), {0, 13}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| } |
| |
| TEST_F(NumberAnnotatorTest, |
| WhenPercentSuffixWithAdditionalRandomTokensDoesNotParseIt) { |
| ClassificationResult classification_result; |
| EXPECT_FALSE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("23 asdf 3.14 pct asdf"), {0, 21}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| } |
| |
| TEST_F(NumberAnnotatorTest, |
| WhenPercentSuffixWithAdditionalRandomPrefixSuffixDoesNotParseIt) { |
| ClassificationResult classification_result; |
| EXPECT_FALSE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("abdf23 percentabdf"), {0, 18}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| } |
| |
| TEST_F(NumberAnnotatorTest, |
| WhenPercentSuffixWithAdditionalRandomStringsDoesNotParsesIt) { |
| ClassificationResult classification_result; |
| EXPECT_FALSE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("#?!23 percent#!?"), {0, 16}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| } |
| |
| TEST_F(NumberAnnotatorTest, WhenBothPercentSymbolAndSuffixDoesNotParseIt) { |
| ClassificationResult classification_result; |
| EXPECT_FALSE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("23% percent"), {0, 11}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| } |
| |
| TEST_F(NumberAnnotatorTest, |
| WhenPercentSymbolWithAdditionalPrefixCharactersDoesNotParsesIt) { |
| ClassificationResult classification_result; |
| EXPECT_FALSE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("#?23%"), {0, 5}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| } |
| |
| TEST_F(NumberAnnotatorTest, WhenNumberWithAdditionalCharactersDoesNotParsesIt) { |
| ClassificationResult classification_result; |
| EXPECT_FALSE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("23#!?"), {0, 5}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| } |
| |
| TEST_F(NumberAnnotatorTest, |
| WhenPercentSymbolWithAdditionalCharactersDoesNotParsesIt) { |
| ClassificationResult classification_result; |
| // ! does not belong to the percentage annotation |
| EXPECT_TRUE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("23%!"), {0, 3}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| EXPECT_EQ(classification_result.collection, "percentage"); |
| EXPECT_EQ(classification_result.numeric_value, 23); |
| EXPECT_EQ(classification_result.numeric_double_value, 23); |
| |
| EXPECT_FALSE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("23%!"), {0, 4}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| } |
| |
| TEST_F(NumberAnnotatorTest, |
| WhenAdditionalCharactersWithMisplacedPercentSymbolDoesNotParsesIt) { |
| ClassificationResult classification_result; |
| EXPECT_FALSE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("23.:;%"), {0, 6}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| } |
| |
| TEST_F(NumberAnnotatorTest, WhenMultipleMinusSignsDoesNotParsesIt) { |
| ClassificationResult classification_result; |
| EXPECT_TRUE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("--11"), {1, 4}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| EXPECT_THAT(classification_result, |
| AllOf(Field(&ClassificationResult::collection, "number"), |
| Field(&ClassificationResult::numeric_value, -11), |
| Field(&ClassificationResult::numeric_double_value, -11))); |
| |
| EXPECT_FALSE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("--11"), {0, 4}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| } |
| |
| TEST_F(NumberAnnotatorTest, WhenMultipleMinusSignsPercentSignDoesNotParsesIt) { |
| ClassificationResult classification_result; |
| EXPECT_TRUE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("--11%"), {1, 5}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| EXPECT_THAT(classification_result, |
| AllOf(Field(&ClassificationResult::collection, "percentage"), |
| Field(&ClassificationResult::numeric_value, -11), |
| Field(&ClassificationResult::numeric_double_value, -11))); |
| |
| EXPECT_FALSE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("--11%"), {0, 5}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| } |
| |
| TEST_F(NumberAnnotatorTest, WhenPlusMinusSignsDoesNotParsesIt) { |
| ClassificationResult classification_result; |
| EXPECT_TRUE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("+-11"), {1, 4}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| EXPECT_THAT(classification_result, |
| AllOf(Field(&ClassificationResult::collection, "number"), |
| Field(&ClassificationResult::numeric_value, -11), |
| Field(&ClassificationResult::numeric_double_value, -11))); |
| |
| EXPECT_FALSE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("+-11"), {0, 4}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| } |
| |
| TEST_F(NumberAnnotatorTest, WhenMinusPlusSignsDoesNotParsesIt) { |
| ClassificationResult classification_result; |
| // + right before a number is not included in the number annotation |
| EXPECT_FALSE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("-+11"), {1, 4}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| EXPECT_FALSE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("-+11"), {0, 4}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| } |
| |
| TEST_F(NumberAnnotatorTest, WhenMinusSignSuffixDoesNotParsesIt) { |
| ClassificationResult classification_result; |
| EXPECT_FALSE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("10-"), {0, 3}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| } |
| |
| TEST_F(NumberAnnotatorTest, WhenMultipleCharSuffixDoesNotParsesIt) { |
| ClassificationResult classification_result; |
| EXPECT_TRUE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("10**"), {0, 2}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| EXPECT_THAT(classification_result, |
| AllOf(Field(&ClassificationResult::collection, "number"), |
| Field(&ClassificationResult::numeric_value, 10), |
| Field(&ClassificationResult::numeric_double_value, 10))); |
| |
| EXPECT_FALSE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("10**"), {0, 3}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| EXPECT_FALSE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("10**"), {0, 4}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| } |
| |
| TEST_F(NumberAnnotatorTest, WhenMultipleCharPrefixDoesNotParsesIt) { |
| ClassificationResult classification_result; |
| EXPECT_FALSE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("**10"), {1, 4}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| EXPECT_FALSE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("**10"), {0, 4}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| } |
| |
| TEST_F(NumberAnnotatorTest, WhenLowestSupportedNumberParsesIt) { |
| ClassificationResult classification_result; |
| EXPECT_TRUE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("-1000000000"), {0, 11}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| |
| EXPECT_THAT( |
| classification_result, |
| AllOf(Field(&ClassificationResult::collection, "number"), |
| Field(&ClassificationResult::numeric_value, -1000000000), |
| Field(&ClassificationResult::numeric_double_value, -1000000000))); |
| } |
| |
| TEST_F(NumberAnnotatorTest, WhenLargestSupportedNumberParsesIt) { |
| ClassificationResult classification_result; |
| EXPECT_TRUE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("1000000000"), {0, 10}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| |
| EXPECT_THAT( |
| classification_result, |
| AllOf(Field(&ClassificationResult::collection, "number"), |
| Field(&ClassificationResult::numeric_value, 1000000000), |
| Field(&ClassificationResult::numeric_double_value, 1000000000))); |
| } |
| |
| TEST_F(NumberAnnotatorTest, WhenLowestSupportedFloatNumberParsesIt) { |
| ClassificationResult classification_result; |
| EXPECT_TRUE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("-999999999.999999999"), {0, 20}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| |
| EXPECT_THAT(classification_result, |
| AllOf(Field(&ClassificationResult::collection, "number"), |
| Field(&ClassificationResult::numeric_value, -1000000000), |
| Field(&ClassificationResult::numeric_double_value, |
| -999999999.999999999))); |
| } |
| |
| TEST_F(NumberAnnotatorTest, WhenLargestFloatSupportedNumberParsesIt) { |
| ClassificationResult classification_result; |
| EXPECT_TRUE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("999999999.999999999"), {0, 19}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| |
| EXPECT_THAT(classification_result, |
| AllOf(Field(&ClassificationResult::collection, "number"), |
| Field(&ClassificationResult::numeric_value, 1000000000), |
| Field(&ClassificationResult::numeric_double_value, |
| 999999999.999999999))); |
| } |
| |
| TEST_F(NumberAnnotatorTest, WhenLargeNumberDoesNotParseIt) { |
| ClassificationResult classification_result; |
| EXPECT_FALSE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("1234567890123456789012345678901234567890"), {0, 40}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| } |
| |
| TEST_F(NumberAnnotatorTest, WhenMinusInTheMiddleDoesNotParseIt) { |
| ClassificationResult classification_result; |
| EXPECT_FALSE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("2016-2017"), {0, 9}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| } |
| |
| TEST_F(NumberAnnotatorTest, WhenSuffixWithoutNumberDoesNotParseIt) { |
| std::vector<AnnotatedSpan> result; |
| EXPECT_TRUE(number_annotator_.FindAll( |
| UTF8ToUnicodeText("... % ..."), AnnotationUsecase_ANNOTATION_USECASE_RAW, |
| &result)); |
| |
| ASSERT_EQ(result.size(), 0); |
| } |
| |
| TEST_F(NumberAnnotatorTest, WhenPrefixWithoutNumberDoesNotParseIt) { |
| std::vector<AnnotatedSpan> result; |
| EXPECT_TRUE(number_annotator_.FindAll( |
| UTF8ToUnicodeText("... $ ..."), AnnotationUsecase_ANNOTATION_USECASE_RAW, |
| &result)); |
| |
| ASSERT_EQ(result.size(), 0); |
| } |
| |
| TEST_F(NumberAnnotatorTest, WhenPrefixAndSuffixWithoutNumberDoesNotParseIt) { |
| std::vector<AnnotatedSpan> result; |
| EXPECT_TRUE(number_annotator_.FindAll( |
| UTF8ToUnicodeText("... $% ..."), AnnotationUsecase_ANNOTATION_USECASE_RAW, |
| &result)); |
| |
| ASSERT_EQ(result.size(), 0); |
| } |
| |
| TEST_F(NumberAnnotatorTest, ForNumberAnnotationsSetsScoreAndPriorityScore) { |
| ClassificationResult classification_result; |
| EXPECT_TRUE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("... 12345 ..."), {4, 9}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| |
| EXPECT_EQ(classification_result.collection, "number"); |
| EXPECT_EQ(classification_result.numeric_value, 12345); |
| EXPECT_EQ(classification_result.numeric_double_value, 12345); |
| EXPECT_EQ(classification_result.score, 1); |
| EXPECT_EQ(classification_result.priority_score, -10); |
| |
| std::vector<AnnotatedSpan> result; |
| EXPECT_TRUE(number_annotator_.FindAll( |
| UTF8ToUnicodeText("Come at 9 or 10 ok?"), |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &result)); |
| |
| EXPECT_THAT(result, |
| UnorderedElementsAre( |
| IsAnnotatedSpan(CodepointSpan(8, 9), "number", |
| /*int_value=*/9, /*double_value=*/9), |
| IsAnnotatedSpan(CodepointSpan(13, 15), "number", |
| /*int_value=*/10, /*double_value=*/10))); |
| } |
| |
| TEST_F(NumberAnnotatorTest, |
| ForFloatNumberAnnotationsSetsScoreAndPriorityScore) { |
| ClassificationResult classification_result; |
| EXPECT_TRUE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("... 12345.12345 ..."), {4, 15}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| EXPECT_EQ(classification_result.collection, "number"); |
| EXPECT_EQ(classification_result.numeric_value, 12345); |
| EXPECT_EQ(classification_result.numeric_double_value, 12345.12345); |
| EXPECT_EQ(classification_result.score, 1); |
| EXPECT_EQ(classification_result.priority_score, 1); |
| |
| std::vector<AnnotatedSpan> result; |
| EXPECT_TRUE(number_annotator_.FindAll( |
| UTF8ToUnicodeText("Results are between 12.5 and 13.5, right?"), |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &result)); |
| EXPECT_THAT(result, |
| UnorderedElementsAre( |
| IsAnnotatedSpan(CodepointSpan(20, 24), "number", |
| /*int_value=*/12, /*double_value=*/12.5, |
| /*priority_score=*/1), |
| IsAnnotatedSpan(CodepointSpan(29, 33), "number", |
| /*int_value=*/13, /*double_value=*/13.5, |
| /*priority_score=*/1))); |
| } |
| |
| TEST_F(NumberAnnotatorTest, ForPercentageAnnotationsSetsScoreAndPriorityScore) { |
| ClassificationResult classification_result; |
| EXPECT_TRUE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("... 12345% ..."), {4, 10}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| EXPECT_EQ(classification_result.collection, "percentage"); |
| EXPECT_EQ(classification_result.numeric_value, 12345); |
| EXPECT_EQ(classification_result.numeric_double_value, 12345); |
| EXPECT_EQ(classification_result.score, 1); |
| EXPECT_EQ(classification_result.priority_score, 1); |
| |
| EXPECT_TRUE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("... 12345 percent ..."), {4, 17}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| EXPECT_EQ(classification_result.collection, "percentage"); |
| EXPECT_EQ(classification_result.numeric_value, 12345); |
| EXPECT_EQ(classification_result.numeric_double_value, 12345); |
| EXPECT_EQ(classification_result.score, 1); |
| EXPECT_EQ(classification_result.priority_score, 1); |
| |
| std::vector<AnnotatedSpan> result; |
| EXPECT_TRUE(number_annotator_.FindAll( |
| UTF8ToUnicodeText("Results are between 9% and 10 percent."), |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &result)); |
| EXPECT_THAT(result, |
| UnorderedElementsAre( |
| IsAnnotatedSpan(CodepointSpan(20, 21), "number", |
| /*int_value=*/9, /*double_value=*/9), |
| IsAnnotatedSpan(CodepointSpan(20, 22), "percentage", |
| /*int_value=*/9, /*double_value=*/9, |
| /*priority_score=*/1), |
| IsAnnotatedSpan(CodepointSpan(27, 29), "number", |
| /*int_value=*/10, /*double_value=*/10), |
| IsAnnotatedSpan(CodepointSpan(27, 37), "percentage", |
| /*int_value=*/10, /*double_value=*/10, |
| /*priority_score=*/1))); |
| } |
| |
| TEST_F(NumberAnnotatorTest, NumberDisabledPercentageEnabledForSmartUsecase) { |
| ClassificationResult classification_result; |
| EXPECT_FALSE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("... 12345 ..."), {4, 9}, |
| AnnotationUsecase_ANNOTATION_USECASE_SMART, &classification_result)); |
| |
| EXPECT_TRUE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("... 12345% ..."), {4, 10}, |
| AnnotationUsecase_ANNOTATION_USECASE_SMART, &classification_result)); |
| EXPECT_EQ(classification_result.collection, "percentage"); |
| EXPECT_EQ(classification_result.numeric_value, 12345); |
| EXPECT_EQ(classification_result.numeric_double_value, 12345.0); |
| EXPECT_EQ(classification_result.score, 1); |
| EXPECT_EQ(classification_result.priority_score, 1); |
| |
| EXPECT_TRUE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("... 12345percent ..."), {4, 16}, |
| AnnotationUsecase_ANNOTATION_USECASE_SMART, &classification_result)); |
| EXPECT_EQ(classification_result.collection, "percentage"); |
| EXPECT_EQ(classification_result.numeric_value, 12345); |
| EXPECT_EQ(classification_result.numeric_double_value, 12345); |
| EXPECT_EQ(classification_result.score, 1); |
| EXPECT_EQ(classification_result.priority_score, 1); |
| |
| std::vector<AnnotatedSpan> result; |
| EXPECT_TRUE(number_annotator_.FindAll( |
| UTF8ToUnicodeText("Accuracy for experiment 3 is 9%."), |
| AnnotationUsecase_ANNOTATION_USECASE_SMART, &result)); |
| EXPECT_THAT(result, UnorderedElementsAre( |
| IsAnnotatedSpan(CodepointSpan(29, 31), "percentage", |
| /*int_value=*/9, /*double_value=*/9.0, |
| /*priority_score=*/1))); |
| } |
| |
| TEST_F(NumberAnnotatorTest, MathOperatorsNotAnnotatedAsNumbersFindAll) { |
| std::vector<AnnotatedSpan> result; |
| EXPECT_TRUE(number_annotator_.FindAll( |
| UTF8ToUnicodeText("how much is 2 + 2 or 5 - 96 * 89"), |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &result)); |
| |
| EXPECT_THAT(result, |
| UnorderedElementsAre( |
| IsAnnotatedSpan(CodepointSpan(12, 13), "number", |
| /*int_value=*/2, /*double_value=*/2), |
| IsAnnotatedSpan(CodepointSpan(16, 17), "number", |
| /*int_value=*/2, /*double_value=*/2), |
| IsAnnotatedSpan(CodepointSpan(21, 22), "number", |
| /*int_value=*/5, /*double_value=*/5), |
| IsAnnotatedSpan(CodepointSpan(25, 27), "number", |
| /*int_value=*/96, /*double_value=*/96), |
| IsAnnotatedSpan(CodepointSpan(30, 32), "number", |
| /*int_value=*/89, /*double_value=*/89))); |
| } |
| |
| TEST_F(NumberAnnotatorTest, MathOperatorsNotAnnotatedAsNumbersClassifyText) { |
| ClassificationResult classification_result; |
| EXPECT_FALSE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("2 + 2"), {2, 3}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| EXPECT_FALSE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("2 - 96 * 89"), {2, 3}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| } |
| |
| TEST_F(NumberAnnotatorTest, SlashSeparatesTwoNumbersFindAll) { |
| std::vector<AnnotatedSpan> result; |
| EXPECT_TRUE(number_annotator_.FindAll( |
| UTF8ToUnicodeText("what's 1 + 2/3 * 4/5 * 6 / 7"), |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &result)); |
| |
| EXPECT_THAT(result, |
| UnorderedElementsAre( |
| IsAnnotatedSpan(CodepointSpan(7, 8), "number", |
| /*int_value=*/1, /*double_value=*/1), |
| IsAnnotatedSpan(CodepointSpan(11, 12), "number", |
| /*int_value=*/2, /*double_value=*/2), |
| IsAnnotatedSpan(CodepointSpan(13, 14), "number", |
| /*int_value=*/3, /*double_value=*/3), |
| IsAnnotatedSpan(CodepointSpan(17, 18), "number", |
| /*int_value=*/4, /*double_value=*/4), |
| IsAnnotatedSpan(CodepointSpan(19, 20), "number", |
| /*int_value=*/5, /*double_value=*/5), |
| IsAnnotatedSpan(CodepointSpan(23, 24), "number", |
| /*int_value=*/6, /*double_value=*/6), |
| IsAnnotatedSpan(CodepointSpan(27, 28), "number", |
| /*int_value=*/7, /*double_value=*/7))); |
| } |
| |
| TEST_F(NumberAnnotatorTest, SlashSeparatesTwoNumbersClassifyText) { |
| ClassificationResult classification_result; |
| EXPECT_TRUE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("what's 1 + 2/3 * 4"), {11, 12}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| EXPECT_EQ(classification_result.collection, "number"); |
| EXPECT_EQ(classification_result.numeric_value, 2); |
| EXPECT_EQ(classification_result.numeric_double_value, 2); |
| EXPECT_EQ(classification_result.score, 1); |
| |
| EXPECT_TRUE(number_annotator_.ClassifyText( |
| UTF8ToUnicodeText("what's 1 + 2/3 * 4"), {13, 14}, |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result)); |
| EXPECT_EQ(classification_result.collection, "number"); |
| EXPECT_EQ(classification_result.numeric_value, 3); |
| EXPECT_EQ(classification_result.numeric_double_value, 3); |
| EXPECT_EQ(classification_result.score, 1); |
| } |
| |
| TEST_F(NumberAnnotatorTest, SlashDoesNotSeparatesTwoNumbersFindAll) { |
| std::vector<AnnotatedSpan> result; |
| // 2 in the "2/" context is a number because / is punctuation |
| EXPECT_TRUE(number_annotator_.FindAll( |
| UTF8ToUnicodeText("what's 2a2/3 or 2/s4 or 2/ or /3 or //3 or 2//"), |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &result)); |
| |
| EXPECT_THAT(result, UnorderedElementsAre(IsAnnotatedSpan( |
| CodepointSpan(24, 25), "number", |
| /*int_value=*/2, /*double_value=*/2))); |
| } |
| |
| TEST_F(NumberAnnotatorTest, BracketsContextAnnotatedFindAll) { |
| std::vector<AnnotatedSpan> result; |
| EXPECT_TRUE(number_annotator_.FindAll( |
| UTF8ToUnicodeText("The interval is: (12, 13) or [-12, -4.5)"), |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &result)); |
| |
| EXPECT_THAT(result, |
| UnorderedElementsAre( |
| IsAnnotatedSpan(CodepointSpan(18, 20), "number", |
| /*int_value=*/12, /*double_value=*/12), |
| IsAnnotatedSpan(CodepointSpan(22, 24), "number", |
| /*int_value=*/13, /*double_value=*/13), |
| IsAnnotatedSpan(CodepointSpan(30, 33), "number", |
| /*int_value=*/-12, /*double_value=*/-12), |
| IsAnnotatedSpan(CodepointSpan(35, 39), "number", |
| /*int_value=*/-4, /*double_value=*/-4.5, |
| /*priority_score=*/1))); |
| } |
| |
| TEST_F(NumberAnnotatorTest, BracketsContextNotAnnotatedFindAll) { |
| std::vector<AnnotatedSpan> result; |
| EXPECT_TRUE(number_annotator_.FindAll( |
| UTF8ToUnicodeText("The interval is: -(12, 138*)"), |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &result)); |
| |
| EXPECT_TRUE(result.empty()); |
| } |
| |
| TEST_F(NumberAnnotatorTest, FractionalNumberDotsFindAll) { |
| std::vector<AnnotatedSpan> result; |
| // Dots source: https://unicode-search.net/unicode-namesearch.pl?term=period |
| EXPECT_TRUE(number_annotator_.FindAll( |
| UTF8ToUnicodeText("3.1 3﹒2 3.3"), |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &result)); |
| |
| EXPECT_THAT(result, UnorderedElementsAre( |
| IsAnnotatedSpan(CodepointSpan(0, 3), "number", |
| /*int_value=*/3, /*double_value=*/3.1, |
| /*priority_score=*/1), |
| IsAnnotatedSpan(CodepointSpan(4, 7), "number", |
| /*int_value=*/3, /*double_value=*/3.2, |
| /*priority_score=*/1), |
| IsAnnotatedSpan(CodepointSpan(8, 11), "number", |
| /*int_value=*/3, /*double_value=*/3.3, |
| /*priority_score=*/1))); |
| } |
| |
| TEST_F(NumberAnnotatorTest, NonAsciiDigitsFindAll) { |
| std::vector<AnnotatedSpan> result; |
| // Dots source: https://unicode-search.net/unicode-namesearch.pl?term=period |
| // Digits source: https://unicode-search.net/unicode-namesearch.pl?term=digit |
| EXPECT_TRUE(number_annotator_.FindAll( |
| UTF8ToUnicodeText("3 3﹒2 3.3%"), |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &result)); |
| |
| EXPECT_THAT(result, UnorderedElementsAre( |
| IsAnnotatedSpan(CodepointSpan(0, 1), "number", |
| /*int_value=*/3, /*double_value=*/3), |
| IsAnnotatedSpan(CodepointSpan(2, 5), "number", |
| /*int_value=*/3, /*double_value=*/3.2, |
| /*priority_score=*/1), |
| IsAnnotatedSpan(CodepointSpan(6, 9), "number", |
| /*int_value=*/3, /*double_value=*/3.3, |
| /*priority_score=*/1), |
| IsAnnotatedSpan(CodepointSpan(6, 10), "percentage", |
| /*int_value=*/3, /*double_value=*/3.3, |
| /*priority_score=*/1))); |
| } |
| |
| TEST_F(NumberAnnotatorTest, AnnotatedZeroPrecededNumbersFindAll) { |
| std::vector<AnnotatedSpan> result; |
| EXPECT_TRUE(number_annotator_.FindAll( |
| UTF8ToUnicodeText("Numbers: 0.9 or 09 or 09.9 or 032310"), |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &result)); |
| |
| EXPECT_THAT(result, UnorderedElementsAre( |
| IsAnnotatedSpan(CodepointSpan(9, 12), "number", |
| /*int_value=*/0, /*double_value=*/0.9, |
| /*priority_score=*/1), |
| IsAnnotatedSpan(CodepointSpan(16, 18), "number", |
| /*int_value=*/9, /*double_value=*/9), |
| IsAnnotatedSpan(CodepointSpan(22, 26), "number", |
| /*int_value=*/9, /*double_value=*/9.9, |
| /*priority_score=*/1), |
| IsAnnotatedSpan(CodepointSpan(30, 36), "number", |
| /*int_value=*/32310, |
| /*double_value=*/32310))); |
| } |
| |
| TEST_F(NumberAnnotatorTest, ZeroAfterDotFindAll) { |
| std::vector<AnnotatedSpan> result; |
| EXPECT_TRUE(number_annotator_.FindAll( |
| UTF8ToUnicodeText("15.0 16.00"), AnnotationUsecase_ANNOTATION_USECASE_RAW, |
| &result)); |
| |
| EXPECT_THAT(result, |
| UnorderedElementsAre( |
| IsAnnotatedSpan(CodepointSpan(0, 4), "number", |
| /*int_value=*/15, /*double_value=*/15), |
| IsAnnotatedSpan(CodepointSpan(5, 10), "number", |
| /*int_value=*/16, /*double_value=*/16))); |
| } |
| |
| TEST_F(NumberAnnotatorTest, NineDotNineFindAll) { |
| std::vector<AnnotatedSpan> result; |
| EXPECT_TRUE(number_annotator_.FindAll( |
| UTF8ToUnicodeText("9.9 9.99 99.99 99.999 99.9999"), |
| AnnotationUsecase_ANNOTATION_USECASE_RAW, &result)); |
| |
| EXPECT_THAT(result, |
| UnorderedElementsAre( |
| IsAnnotatedSpan(CodepointSpan(0, 3), "number", |
| /*int_value=*/9, /*double_value=*/9.9, |
| /*priority_score=*/1), |
| IsAnnotatedSpan(CodepointSpan(4, 8), "number", |
| /*int_value=*/9, /*double_value=*/9.99, |
| /*priority_score=*/1), |
| IsAnnotatedSpan(CodepointSpan(9, 14), "number", |
| /*int_value=*/99, /*double_value=*/99.99, |
| /*priority_score=*/1), |
| IsAnnotatedSpan(CodepointSpan(15, 21), "number", |
| /*int_value=*/99, /*double_value=*/99.999, |
| /*priority_score=*/1), |
| IsAnnotatedSpan(CodepointSpan(22, 29), "number", |
| /*int_value=*/99, /*double_value=*/99.9999, |
| /*priority_score=*/1))); |
| } |
| |
| } // namespace test_internal |
| } // namespace libtextclassifier3 |