blob: 98140f4415b9dc1b2ac44302fb373bdd25ac99ff [file] [log] [blame]
/*
* Copyright (C) 2018 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "annotator/number/number_test-include.h"
#include <set>
#include <string>
#include <vector>
#include "annotator/collections.h"
#include "annotator/model_generated.h"
#include "annotator/types-test-util.h"
#include "annotator/types.h"
#include "utils/tokenizer-utils.h"
#include "utils/utf8/unicodetext.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
namespace libtextclassifier3 {
namespace test_internal {
using ::testing::AllOf;
using ::testing::ElementsAre;
using ::testing::Field;
using ::testing::IsEmpty;
using ::testing::Matcher;
using ::testing::UnorderedElementsAre;
namespace {
// Builds a serialized NumberAnnotatorOptions flatbuffer for use in tests.
//
// `enabled_modes` is copied into the options' `enabled_modes` field; every
// other setting is fixed. The returned buffer is allocated with `new` and is
// not freed by this function, so the caller takes ownership of it.
const flatbuffers::DetachedBuffer* CreateOptionsData(ModeFlag enabled_modes) {
  const auto raw_usecase_bit = 1 << AnnotationUsecase_ANNOTATION_USECASE_RAW;
  const auto smart_usecase_bit = 1
                                 << AnnotationUsecase_ANNOTATION_USECASE_SMART;

  NumberAnnotatorOptionsT options;
  options.enabled = true;
  options.enabled_modes = enabled_modes;
  options.max_number_of_digits = 20;

  // Scores for plain numbers, float numbers and percentages respectively.
  options.priority_score = -10.0;
  options.float_number_priority_score = 1.0;
  options.percentage_priority_score = 1.0;

  options.enabled_annotation_usecases = raw_usecase_bit;
  options.percentage_annotation_usecases = raw_usecase_bit + smart_usecase_bit;

  // The std::set iterates in sorted order and drops duplicate entries.
  // NOTE(review): "%" is listed twice here; the set collapses the duplicate.
  // Possibly one entry was meant to be another percent-sign variant - confirm.
  const std::set<std::string> percent_suffixes = {
      "パーセント", "percent", "pércént", "pc", "pct", "%", "٪", "﹪", "%"};

  // Each suffix is stored followed by a NUL byte in one flat string.
  for (const std::string& suffix : percent_suffixes) {
    options.percentage_pieces_string += suffix;
    options.percentage_pieces_string += '\0';
  }

  flatbuffers::FlatBufferBuilder builder;
  builder.Finish(NumberAnnotatorOptions::Pack(builder, &options));
  return new flatbuffers::DetachedBuffer(builder.Release());
}
} // namespace
// Returns the root NumberAnnotatorOptions of a lazily created options buffer.
//
// Three buffers (selection only, annotation + classification, all modes) are
// built on the first call and kept in function-local statics. ModeFlag_SELECTION
// and ModeFlag_ANNOTATION_AND_CLASSIFICATION select their dedicated buffers;
// any other value of `enabled_modes` gets the all-modes buffer.
const NumberAnnotatorOptions*
NumberAnnotatorTest::TestingNumberAnnotatorOptions(ModeFlag enabled_modes) {
  static const flatbuffers::DetachedBuffer* selection_buffer =
      CreateOptionsData(ModeFlag_SELECTION);
  static const flatbuffers::DetachedBuffer* no_selection_buffer =
      CreateOptionsData(ModeFlag_ANNOTATION_AND_CLASSIFICATION);
  static const flatbuffers::DetachedBuffer* all_modes_buffer =
      CreateOptionsData(ModeFlag_ALL);

  // Default to the all-modes buffer, then narrow for the two special cases.
  const flatbuffers::DetachedBuffer* chosen_buffer = all_modes_buffer;
  if (enabled_modes == ModeFlag_SELECTION) {
    chosen_buffer = selection_buffer;
  } else if (enabled_modes == ModeFlag_ANNOTATION_AND_CLASSIFICATION) {
    chosen_buffer = no_selection_buffer;
  }
  return flatbuffers::GetRoot<NumberAnnotatorOptions>(chosen_buffer->data());
}
// Matches an object whose `collection` member equals `collection`.
MATCHER_P(IsCorrectCollection, collection, "collection is " + collection) {
  const auto& actual_collection = arg.collection;
  return actual_collection == collection;
}
// Matches an object whose `numeric_value` member equals `numeric_value`.
MATCHER_P(IsCorrectNumericValue, numeric_value,
          "numeric value is " + std::to_string(numeric_value)) {
  const auto& actual_value = arg.numeric_value;
  return actual_value == numeric_value;
}
// Matches an object whose `numeric_double_value` member compares exactly equal
// (operator==) to `numeric_double_value`.
MATCHER_P(IsCorrectNumericDoubleValue, numeric_double_value,
          "numeric double value is " + std::to_string(numeric_double_value)) {
  const auto& actual_value = arg.numeric_double_value;
  return actual_value == numeric_double_value;
}
// Matches an object whose `score` member compares exactly equal to `score`.
MATCHER_P(IsCorrectScore, score, "score is " + std::to_string(score)) {
  const auto& actual_score = arg.score;
  return actual_score == score;
}
// Matches an object whose `priority_score` member compares exactly equal to
// `priority_score`.
MATCHER_P(IsCorrectPriortyScore, priority_score,
          "priority score is " + std::to_string(priority_score)) {
  const auto& actual_priority = arg.priority_score;
  return actual_priority == priority_score;
}
// Matches an object whose `span` member equals `span`. The description prints
// the expected span as "(first,second)".
MATCHER_P(IsCorrectSpan, span,
          "span is (" + std::to_string(span.first) + "," +
              std::to_string(span.second) + ")") {
  const auto& actual_span = arg.span;
  return actual_span == span;
}
// Applies the matcher `inner` to the `classification` member of the matched
// object, forwarding any explanation to the result listener.
MATCHER_P(Classification, inner, "") {
  const auto& classifications = arg.classification;
  return testing::ExplainMatchResult(inner, classifications, result_listener);
}
// Builds a matcher for an AnnotatedSpan that covers `codepoint_span` and
// carries exactly one classification with the given collection, integer
// value, double value, score and priority score.
//
// `priority_score` defaults to -10 and `score` defaults to 1.
static Matcher<AnnotatedSpan> IsAnnotatedSpan(
    const CodepointSpan& codepoint_span, const std::string& collection,
    const int int_value, const double double_value,
    const float priority_score = -10, const float score = 1) {
  // Expectations on the single classification attached to the span.
  const auto expected_classification = AllOf(
      IsCorrectCollection(collection), IsCorrectNumericValue(int_value),
      IsCorrectNumericDoubleValue(double_value), IsCorrectScore(score),
      IsCorrectPriortyScore(priority_score));
  const auto has_single_classification =
      Classification(ElementsAre(expected_classification));

  return AllOf(IsCorrectSpan(codepoint_span), has_single_classification);
}
// Classifying the span that covers "12345" succeeds and yields a "number"
// whose integer and double values are both 12345.
TEST_F(NumberAnnotatorTest, ClassifiesAndParsesNumberCorrectly) {
  const auto text = UTF8ToUnicodeText("... 12345 ...");
  ClassificationResult parsed;

  const bool classified = number_annotator_.ClassifyText(
      text, {4, 9}, AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed);

  EXPECT_TRUE(classified);
  EXPECT_EQ(parsed.collection, "number");
  EXPECT_EQ(parsed.numeric_value, 12345);
  EXPECT_FLOAT_EQ(parsed.numeric_double_value, 12345);
}
// With the NumberAnnotatorForSelectionTest fixture, ClassifyText rejects a
// span that is otherwise a valid number.
// NOTE(review): presumably this fixture enables selection mode only - confirm
// against the fixture definition.
TEST_F(NumberAnnotatorForSelectionTest,
       ClassifyTextDisabledClassificationReturnsFalse) {
  const auto text = UTF8ToUnicodeText("... 12345 ...");
  ClassificationResult ignored_result;

  const bool classified = number_annotator_.ClassifyText(
      text, {4, 9}, AnnotationUsecase_ANNOTATION_USECASE_RAW, &ignored_result);

  EXPECT_FALSE(classified);
}
// A decimal number is classified as "number"; the integer value is 12345 and
// the double value keeps the fractional part.
TEST_F(NumberAnnotatorTest, ClassifiesAndParsesNumberAsFloatCorrectly) {
  const auto text = UTF8ToUnicodeText("... 12345.12345 ...");
  ClassificationResult parsed;

  const bool classified = number_annotator_.ClassifyText(
      text, {4, 15}, AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed);

  EXPECT_TRUE(classified);
  EXPECT_EQ(parsed.collection, "number");
  EXPECT_EQ(parsed.numeric_value, 12345);
  EXPECT_FLOAT_EQ(parsed.numeric_double_value, 12345.12345);
}
// A number followed by a bare dot: the span without the dot classifies as the
// integer 12345, while the span that includes the dot is rejected.
TEST_F(NumberAnnotatorTest,
       ClassifiesAndParsesNumberAsFloatCorrectlyWithoutDecimals) {
  // The dot after a number is considered punctuation, not part of a floating
  // number.
  const auto text = UTF8ToUnicodeText("... 12345. ...");
  ClassificationResult parsed;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      text, {4, 9}, AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));

  // Span including the trailing dot. The field checks that follow run after
  // this rejected call, against whatever `parsed` holds at that point.
  EXPECT_FALSE(number_annotator_.ClassifyText(
      text, {4, 10}, AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));
  EXPECT_EQ(parsed.collection, "number");
  EXPECT_EQ(parsed.numeric_value, 12345);
  EXPECT_FLOAT_EQ(parsed.numeric_double_value, 12345);

  // Classifying the number-only span again still succeeds.
  EXPECT_TRUE(number_annotator_.ClassifyText(
      text, {4, 9}, AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));
  EXPECT_EQ(parsed.collection, "number");
  EXPECT_EQ(parsed.numeric_value, 12345);
  EXPECT_FLOAT_EQ(parsed.numeric_double_value, 12345);
}
// FindAll in annotation mode reports the integers, the floats and the
// percentage in the sentence. Floats and the percentage are expected with
// priority score 1; integers use the IsAnnotatedSpan default.
TEST_F(NumberAnnotatorTest, FindsAllIntegerAndFloatNumbersInText) {
  // In the context "68.9#" -> 68.9 is a number because # is punctuation.
  // In the context "68.9#?" -> 68.9 is not a number because it is followed by
  // two punctuation signs.
  const auto text = UTF8ToUnicodeText(
      "how much is 2 plus 5 divided by 7% minus 3.14 "
      "what about 68.9# or 68.9#?");
  std::vector<AnnotatedSpan> annotations;

  const bool found = number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &annotations);

  EXPECT_TRUE(found);
  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(12, 13), "number",
                                  /*int_value=*/2, /*double_value=*/2.0),
                  IsAnnotatedSpan(CodepointSpan(19, 20), "number",
                                  /*int_value=*/5, /*double_value=*/5.0),
                  IsAnnotatedSpan(CodepointSpan(32, 33), "number",
                                  /*int_value=*/7, /*double_value=*/7.0),
                  IsAnnotatedSpan(CodepointSpan(32, 34), "percentage",
                                  /*int_value=*/7, /*double_value=*/7.0,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(41, 45), "number",
                                  /*int_value=*/3, /*double_value=*/3.14,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(57, 61), "number",
                                  /*int_value=*/68, /*double_value=*/68.9,
                                  /*priority_score=*/1)));
}
// Spans containing embedded letters, a double dot, or a trailing letter are
// not classified as numbers.
TEST_F(NumberAnnotatorTest, ClassifiesNonNumberCorrectly) {
  const auto letter_inside = UTF8ToUnicodeText("... 123a45 ...");
  const auto double_dot = UTF8ToUnicodeText("... 12345..12345 ...");
  const auto letter_suffix = UTF8ToUnicodeText("... 12345a ...");
  ClassificationResult ignored_result;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      letter_inside, {4, 10}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &ignored_result));
  EXPECT_FALSE(number_annotator_.ClassifyText(
      double_dot, {4, 16}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &ignored_result));
  EXPECT_FALSE(number_annotator_.ClassifyText(
      letter_suffix, {4, 11}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &ignored_result));
}
// The span covering just "14" classifies as a number; extending the span over
// the following comma makes classification fail.
TEST_F(NumberAnnotatorTest, ClassifiesNumberSelectionCorrectly) {
  const auto text = UTF8ToUnicodeText("... 14, ...");
  ClassificationResult parsed;

  // Punctuation after a number is not part of the number.
  const bool number_only = number_annotator_.ClassifyText(
      text, {4, 6}, AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed);
  EXPECT_TRUE(number_only);
  EXPECT_EQ(parsed.collection, "number");
  EXPECT_EQ(parsed.numeric_value, 14);
  EXPECT_EQ(parsed.numeric_double_value, 14);

  const bool with_comma = number_annotator_.ClassifyText(
      text, {4, 7}, AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed);
  EXPECT_FALSE(with_comma);
}
// "99%" selected as a whole classifies as a "percentage" with value 99.
TEST_F(NumberAnnotatorTest, ClassifiesPercentageSignCorrectly) {
  const auto text = UTF8ToUnicodeText("... 99% ...");
  ClassificationResult parsed;

  const bool classified = number_annotator_.ClassifyText(
      text, {4, 7}, AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed);

  EXPECT_TRUE(classified);
  EXPECT_EQ(parsed.collection, "percentage");
  EXPECT_EQ(parsed.numeric_value, 99);
  EXPECT_EQ(parsed.numeric_double_value, 99);
}
// "15 percent" selected as a whole classifies as a "percentage" with value 15.
TEST_F(NumberAnnotatorTest, ClassifiesPercentageWordCorrectly) {
  const auto text = UTF8ToUnicodeText("... 15 percent ...");
  ClassificationResult parsed;

  const bool classified = number_annotator_.ClassifyText(
      text, {4, 14}, AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed);

  EXPECT_TRUE(classified);
  EXPECT_EQ(parsed.collection, "percentage");
  EXPECT_EQ(parsed.numeric_value, 15);
  EXPECT_EQ(parsed.numeric_double_value, 15);
}
// A number followed by a non-ASCII word that is not a configured percent
// suffix is rejected when the whole text is selected.
TEST_F(NumberAnnotatorTest, ClassifiesNonAsciiPercentageIncorrectSuffix) {
  const auto text = UTF8ToUnicodeText("15 café");
  ClassificationResult ignored_result;

  const bool classified = number_annotator_.ClassifyText(
      text, {0, 7}, AnnotationUsecase_ANNOTATION_USECASE_RAW, &ignored_result);

  EXPECT_FALSE(classified);
}
// A number followed by the accented suffix "pércént" classifies as a
// "percentage" with value 25.
TEST_F(NumberAnnotatorTest, ClassifiesNonAsciiFrPercentageCorrectSuffix) {
  const auto text = UTF8ToUnicodeText("25 pércént");
  ClassificationResult parsed;

  const bool classified = number_annotator_.ClassifyText(
      text, {0, 10}, AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed);

  EXPECT_TRUE(classified);
  EXPECT_EQ(parsed.collection, "percentage");
  EXPECT_EQ(parsed.numeric_value, 25);
  EXPECT_EQ(parsed.numeric_double_value, 25);
}
// Japanese percent suffix: ClassifyText recognises "10パーセント" as a
// percentage, and FindAll reports both the number and the percentage span in
// a longer Japanese sentence, plus the standalone number 12.
TEST_F(NumberAnnotatorTest, ClassifiesNonAsciiJaPercentageCorrectSuffix) {
  const auto short_text = UTF8ToUnicodeText("10パーセント");
  ClassificationResult parsed;

  const bool classified = number_annotator_.ClassifyText(
      short_text, {0, 7}, AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed);
  EXPECT_TRUE(classified);
  EXPECT_EQ(parsed.collection, "percentage");
  EXPECT_EQ(parsed.numeric_value, 10);
  EXPECT_EQ(parsed.numeric_double_value, 10);

  const auto sentence =
      UTF8ToUnicodeText("明日の降水確率は10パーセント 音量を12にセット");
  std::vector<AnnotatedSpan> annotations;
  const bool found = number_annotator_.FindAll(
      sentence, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      ModeFlag_CLASSIFICATION, &annotations);
  EXPECT_TRUE(found);
  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(8, 10), "number",
                                  /*int_value=*/10, /*double_value=*/10.0),
                  IsAnnotatedSpan(CodepointSpan(8, 15), "percentage",
                                  /*int_value=*/10, /*double_value=*/10.0,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(20, 22), "number",
                                  /*int_value=*/12, /*double_value=*/12.0)));
}
// FindAll in annotation mode reports the plain numbers, the percentage and the
// '#'-adjacent numbers; "$99." is not expected among the results.
TEST_F(NumberAnnotatorTest, FindsAllNumbersInText) {
  const auto text = UTF8ToUnicodeText(
      "... 12345 ... 9 is my number and 27% or 68# #38 #39 "
      "but not $99.");
  std::vector<AnnotatedSpan> annotations;

  const bool found = number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &annotations);

  EXPECT_TRUE(found);
  EXPECT_THAT(
      annotations,
      UnorderedElementsAre(
          IsAnnotatedSpan(CodepointSpan(4, 9), "number",
                          /*int_value=*/12345, /*double_value=*/12345.0),
          IsAnnotatedSpan(CodepointSpan(14, 15), "number",
                          /*int_value=*/9, /*double_value=*/9.0),
          IsAnnotatedSpan(CodepointSpan(33, 35), "number",
                          /*int_value=*/27, /*double_value=*/27.0),
          IsAnnotatedSpan(CodepointSpan(33, 36), "percentage",
                          /*int_value=*/27, /*double_value=*/27.0,
                          /*priority_score=*/1),
          IsAnnotatedSpan(CodepointSpan(40, 42), "number",
                          /*int_value=*/68, /*double_value=*/68.0),
          IsAnnotatedSpan(CodepointSpan(45, 47), "number",
                          /*int_value=*/38, /*double_value=*/38.0),
          IsAnnotatedSpan(CodepointSpan(49, 51), "number",
                          /*int_value=*/39, /*double_value=*/39.0)));
}
// With the NumberAnnotatorForAnnotationAndClassificationTest fixture, FindAll
// called with ModeFlag_SELECTION still returns true but produces no spans.
TEST_F(NumberAnnotatorForAnnotationAndClassificationTest,
       FindsAllDisabledModeReturnsNoResults) {
  const auto text = UTF8ToUnicodeText(
      "... 12345 ... 9 is my number and 27% or 68# #38 #39 "
      "but not $99.");
  std::vector<AnnotatedSpan> annotations;

  const bool call_succeeded = number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_SELECTION,
      &annotations);

  EXPECT_TRUE(call_succeeded);
  EXPECT_THAT(annotations, IsEmpty());
}
// None of the malformed tokens in the text (letters mixed into digits, a
// double dot, "-#5%") may be reported by FindAll; the call itself succeeds.
TEST_F(NumberAnnotatorTest, FindsNoNumberInText) {
  std::vector<AnnotatedSpan> result;
  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText("... 12345a ... 12345..12345 and 123a45 are not valid. "
                        "And -#5% is also bad."),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_SELECTION, &result));
  // IsEmpty() avoids comparing size_t against a signed literal and prints the
  // unexpected elements on failure.
  EXPECT_THAT(result, IsEmpty());
}
// Numbers followed by a single punctuation mark are found, as is the negative
// number at the end of the text.
TEST_F(NumberAnnotatorTest, FindsNumberWithPunctuation) {
  // A number should be followed by at most one punctuation sign => 15 is not
  // a number.
  const auto text = UTF8ToUnicodeText(
      "It's 12, 13, 14! Or 15??? For sure 16: 17; 18. and -19");
  std::vector<AnnotatedSpan> annotations;

  const bool found = number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_CLASSIFICATION,
      &annotations);

  EXPECT_TRUE(found);
  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(5, 7), "number",
                                  /*int_value=*/12, /*double_value=*/12.0),
                  IsAnnotatedSpan(CodepointSpan(9, 11), "number",
                                  /*int_value=*/13, /*double_value=*/13.0),
                  IsAnnotatedSpan(CodepointSpan(13, 15), "number",
                                  /*int_value=*/14, /*double_value=*/14.0),
                  IsAnnotatedSpan(CodepointSpan(35, 37), "number",
                                  /*int_value=*/16, /*double_value=*/16.0),
                  IsAnnotatedSpan(CodepointSpan(39, 41), "number",
                                  /*int_value=*/17, /*double_value=*/17.0),
                  IsAnnotatedSpan(CodepointSpan(43, 45), "number",
                                  /*int_value=*/18, /*double_value=*/18.0),
                  IsAnnotatedSpan(CodepointSpan(51, 54), "number",
                                  /*int_value=*/-19, /*double_value=*/-19.0)));
}
// Float numbers followed by a single punctuation mark are all found, each with
// priority score 1 and an integer value equal to the truncated float.
TEST_F(NumberAnnotatorTest, FindsFloatNumberWithPunctuation) {
  const auto text = UTF8ToUnicodeText(
      "It's 12.123, 13.45, 14.54321! Or 15.1? Maybe 16.33: "
      "17.21; but for sure 18.90.");
  std::vector<AnnotatedSpan> annotations;

  const bool found = number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &annotations);

  EXPECT_TRUE(found);
  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(5, 11), "number",
                                  /*int_value=*/12, /*double_value=*/12.123,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(13, 18), "number",
                                  /*int_value=*/13, /*double_value=*/13.45,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(20, 28), "number",
                                  /*int_value=*/14, /*double_value=*/14.54321,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(33, 37), "number",
                                  /*int_value=*/15, /*double_value=*/15.1,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(45, 50), "number",
                                  /*int_value=*/16, /*double_value=*/16.33,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(52, 57), "number",
                                  /*int_value=*/17, /*double_value=*/17.21,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(72, 77), "number",
                                  /*int_value=*/18, /*double_value=*/18.9,
                                  /*priority_score=*/1)));
}
// A negative number that makes up the whole text is found as one span that
// includes the minus sign.
TEST_F(NumberAnnotatorTest, HandlesNumbersAtBeginning) {
  const auto text = UTF8ToUnicodeText("-5");
  std::vector<AnnotatedSpan> annotations;

  const bool found = number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_SELECTION,
      &annotations);

  EXPECT_TRUE(found);
  EXPECT_THAT(annotations,
              UnorderedElementsAre(IsAnnotatedSpan(
                  CodepointSpan(0, 2), "number",
                  /*int_value=*/-5, /*double_value=*/-5)));
}
// "-5" and "-5%" are found (the latter as both number and percentage), while
// "--5%" with its double minus sign produces no result.
TEST_F(NumberAnnotatorTest, HandlesNegativeNumbers) {
  const auto text = UTF8ToUnicodeText("Number -5 and -5% and not number --5%");
  std::vector<AnnotatedSpan> annotations;

  const bool found = number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &annotations);

  EXPECT_TRUE(found);
  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(7, 9), "number",
                                  /*int_value=*/-5, /*double_value=*/-5),
                  IsAnnotatedSpan(CodepointSpan(14, 16), "number",
                                  /*int_value=*/-5, /*double_value=*/-5),
                  IsAnnotatedSpan(CodepointSpan(14, 17), "percentage",
                                  /*int_value=*/-5, /*double_value=*/-5,
                                  /*priority_score=*/1)));
}
// Every percentage form in the text ("percent", "pct", "pc", "%") yields two
// spans: the bare number and the longer percentage span with priority score 1.
TEST_F(NumberAnnotatorTest, FindGoodPercentageContexts) {
  const auto text = UTF8ToUnicodeText(
      "5 percent, 10 pct, 25 pc and 17%, -5 percent, 10% are percentages");
  std::vector<AnnotatedSpan> annotations;

  const bool found = number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_SELECTION,
      &annotations);

  EXPECT_TRUE(found);
  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(0, 1), "number",
                                  /*int_value=*/5, /*double_value=*/5),
                  IsAnnotatedSpan(CodepointSpan(0, 9), "percentage",
                                  /*int_value=*/5, /*double_value=*/5,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(11, 13), "number",
                                  /*int_value=*/10, /*double_value=*/10),
                  IsAnnotatedSpan(CodepointSpan(11, 17), "percentage",
                                  /*int_value=*/10, /*double_value=*/10,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(19, 21), "number",
                                  /*int_value=*/25, /*double_value=*/25),
                  IsAnnotatedSpan(CodepointSpan(19, 24), "percentage",
                                  /*int_value=*/25, /*double_value=*/25,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(29, 31), "number",
                                  /*int_value=*/17, /*double_value=*/17),
                  IsAnnotatedSpan(CodepointSpan(29, 32), "percentage",
                                  /*int_value=*/17, /*double_value=*/17,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(34, 36), "number",
                                  /*int_value=*/-5, /*double_value=*/-5),
                  IsAnnotatedSpan(CodepointSpan(34, 44), "percentage",
                                  /*int_value=*/-5, /*double_value=*/-5,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(46, 48), "number",
                                  /*int_value=*/10, /*double_value=*/10),
                  IsAnnotatedSpan(CodepointSpan(46, 49), "percentage",
                                  /*int_value=*/10, /*double_value=*/10,
                                  /*priority_score=*/1)));
}
// A text consisting only of "5%" yields the number span and the percentage
// span.
TEST_F(NumberAnnotatorTest, FindSinglePercentageInContext) {
  const auto text = UTF8ToUnicodeText("5%");
  std::vector<AnnotatedSpan> annotations;

  const bool found = number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &annotations);

  EXPECT_TRUE(found);
  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(0, 1), "number",
                                  /*int_value=*/5, /*double_value=*/5),
                  IsAnnotatedSpan(CodepointSpan(0, 2), "percentage",
                                  /*int_value=*/5, /*double_value=*/5,
                                  /*priority_score=*/1)));
}
// Only the plain numbers 10 and 25 are reported: no percentage is produced
// when punctuation separates the number from the suffix or when the suffix is
// not a configured one, and "5#:" yields nothing.
TEST_F(NumberAnnotatorTest, IgnoreBadPercentageContexts) {
  // A valid number is followed by only one punctuation element.
  const auto text = UTF8ToUnicodeText(
      "10, pct, 25 prc, 5#: percentage are not percentages");
  std::vector<AnnotatedSpan> annotations;

  const bool found = number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &annotations);

  EXPECT_TRUE(found);
  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(0, 2), "number",
                                  /*int_value=*/10, /*double_value=*/10),
                  IsAnnotatedSpan(CodepointSpan(9, 11), "number",
                                  /*int_value=*/25, /*double_value=*/25)));
}
// Numbers preceded by two punctuation characters yield neither a number nor a
// percentage span.
TEST_F(NumberAnnotatorTest, IgnoreBadPercentagePunctuationContexts) {
  const auto text = UTF8ToUnicodeText(
      "#!24% or :?33 percent are not valid percentages, nor numbers.");
  std::vector<AnnotatedSpan> annotations;

  const bool call_succeeded = number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &annotations);

  EXPECT_TRUE(call_succeeded);
  EXPECT_THAT(annotations, IsEmpty());
}
// Span offsets are counted in codepoints: the non-ASCII "é" earlier in the
// text is followed by numbers and percentages at the expected positions.
TEST_F(NumberAnnotatorTest, FindPercentageInNonAsciiContext) {
  const auto text = UTF8ToUnicodeText(
      "At the café 10% or 25 percent of people are nice. Only 10%!");
  std::vector<AnnotatedSpan> annotations;

  const bool found = number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &annotations);

  EXPECT_TRUE(found);
  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(12, 14), "number",
                                  /*int_value=*/10, /*double_value=*/10),
                  IsAnnotatedSpan(CodepointSpan(12, 15), "percentage",
                                  /*int_value=*/10, /*double_value=*/10,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(19, 21), "number",
                                  /*int_value=*/25, /*double_value=*/25),
                  IsAnnotatedSpan(CodepointSpan(19, 29), "percentage",
                                  /*int_value=*/25, /*double_value=*/25,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(55, 57), "number",
                                  /*int_value=*/10, /*double_value=*/10),
                  IsAnnotatedSpan(CodepointSpan(55, 58), "percentage",
                                  /*int_value=*/10, /*double_value=*/10,
                                  /*priority_score=*/1)));
}
// Several punctuation characters between the number and "percent" make the
// whole-text selection unclassifiable.
TEST_F(NumberAnnotatorTest,
       WhenPercentSuffixWithAdditionalIgnoredCharactersDoesNotParseIt) {
  const auto text = UTF8ToUnicodeText("23#!? percent");
  ClassificationResult ignored_result;

  const bool classified = number_annotator_.ClassifyText(
      text, {0, 13}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &ignored_result);

  EXPECT_FALSE(classified);
}
// A selection spanning several unrelated tokens around a number and a percent
// suffix is not classified.
TEST_F(NumberAnnotatorTest,
       WhenPercentSuffixWithAdditionalRandomTokensDoesNotParseIt) {
  const auto text = UTF8ToUnicodeText("23 asdf 3.14 pct asdf");
  ClassificationResult ignored_result;

  const bool classified = number_annotator_.ClassifyText(
      text, {0, 21}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &ignored_result);

  EXPECT_FALSE(classified);
}
// Letters glued to the front of the number and to the end of the percent
// suffix prevent classification.
TEST_F(NumberAnnotatorTest,
       WhenPercentSuffixWithAdditionalRandomPrefixSuffixDoesNotParseIt) {
  const auto text = UTF8ToUnicodeText("abdf23 percentabdf");
  ClassificationResult ignored_result;

  const bool classified = number_annotator_.ClassifyText(
      text, {0, 18}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &ignored_result);

  EXPECT_FALSE(classified);
}
// Runs of punctuation on both sides of "23 percent" prevent classification of
// the whole-text selection.
TEST_F(NumberAnnotatorTest,
       WhenPercentSuffixWithAdditionalRandomStringsDoesNotParsesIt) {
  const auto text = UTF8ToUnicodeText("#?!23 percent#!?");
  ClassificationResult ignored_result;

  const bool classified = number_annotator_.ClassifyText(
      text, {0, 16}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &ignored_result);

  EXPECT_FALSE(classified);
}
// A number followed by both the percent sign and the word "percent" is not
// classified when the whole text is selected.
TEST_F(NumberAnnotatorTest, WhenBothPercentSymbolAndSuffixDoesNotParseIt) {
  const auto text = UTF8ToUnicodeText("23% percent");
  ClassificationResult ignored_result;

  const bool classified = number_annotator_.ClassifyText(
      text, {0, 11}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &ignored_result);

  EXPECT_FALSE(classified);
}
// Two punctuation characters in front of "23%" prevent classification of the
// whole-text selection.
TEST_F(NumberAnnotatorTest,
       WhenPercentSymbolWithAdditionalPrefixCharactersDoesNotParsesIt) {
  const auto text = UTF8ToUnicodeText("#?23%");
  ClassificationResult ignored_result;

  const bool classified = number_annotator_.ClassifyText(
      text, {0, 5}, AnnotationUsecase_ANNOTATION_USECASE_RAW, &ignored_result);

  EXPECT_FALSE(classified);
}
// A number followed by several punctuation characters is not classified when
// the selection includes those characters.
TEST_F(NumberAnnotatorTest, WhenNumberWithAdditionalCharactersDoesNotParsesIt) {
  const auto text = UTF8ToUnicodeText("23#!?");
  ClassificationResult ignored_result;

  const bool classified = number_annotator_.ClassifyText(
      text, {0, 5}, AnnotationUsecase_ANNOTATION_USECASE_RAW, &ignored_result);

  EXPECT_FALSE(classified);
}
// "23%" inside "23%!" classifies as a percentage; the selection that also
// covers the trailing "!" does not.
TEST_F(NumberAnnotatorTest,
       WhenPercentSymbolWithAdditionalCharactersDoesNotParsesIt) {
  const auto text = UTF8ToUnicodeText("23%!");
  ClassificationResult parsed;

  // ! does not belong to the percentage annotation
  const bool percent_only = number_annotator_.ClassifyText(
      text, {0, 3}, AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed);
  EXPECT_TRUE(percent_only);
  EXPECT_EQ(parsed.collection, "percentage");
  EXPECT_EQ(parsed.numeric_value, 23);
  EXPECT_EQ(parsed.numeric_double_value, 23);

  const bool with_exclamation = number_annotator_.ClassifyText(
      text, {0, 4}, AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed);
  EXPECT_FALSE(with_exclamation);
}
// Punctuation between the number and the percent sign prevents classification
// of the whole-text selection.
TEST_F(NumberAnnotatorTest,
       WhenAdditionalCharactersWithMisplacedPercentSymbolDoesNotParsesIt) {
  const auto text = UTF8ToUnicodeText("23.:;%");
  ClassificationResult ignored_result;

  const bool classified = number_annotator_.ClassifyText(
      text, {0, 6}, AnnotationUsecase_ANNOTATION_USECASE_RAW, &ignored_result);

  EXPECT_FALSE(classified);
}
// In "--11" the selection starting at the second minus classifies as the
// number -11; the selection that includes both minus signs is rejected.
TEST_F(NumberAnnotatorTest, WhenMultipleMinusSignsDoesNotParsesIt) {
  const auto text = UTF8ToUnicodeText("--11");
  ClassificationResult parsed;

  const auto is_number_minus_eleven =
      AllOf(Field(&ClassificationResult::collection, "number"),
            Field(&ClassificationResult::numeric_value, -11),
            Field(&ClassificationResult::numeric_double_value, -11));

  EXPECT_TRUE(number_annotator_.ClassifyText(
      text, {1, 4}, AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));
  EXPECT_THAT(parsed, is_number_minus_eleven);

  EXPECT_FALSE(number_annotator_.ClassifyText(
      text, {0, 4}, AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));
}
// In "--11%" the selection starting at the second minus classifies as the
// percentage -11; the selection that includes both minus signs is rejected.
TEST_F(NumberAnnotatorTest, WhenMultipleMinusSignsPercentSignDoesNotParsesIt) {
  const auto text = UTF8ToUnicodeText("--11%");
  ClassificationResult parsed;

  const auto is_percentage_minus_eleven =
      AllOf(Field(&ClassificationResult::collection, "percentage"),
            Field(&ClassificationResult::numeric_value, -11),
            Field(&ClassificationResult::numeric_double_value, -11));

  EXPECT_TRUE(number_annotator_.ClassifyText(
      text, {1, 5}, AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));
  EXPECT_THAT(parsed, is_percentage_minus_eleven);

  EXPECT_FALSE(number_annotator_.ClassifyText(
      text, {0, 5}, AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));
}
// In "+-11" the selection "-11" classifies as the number -11; the selection
// that also includes the leading plus sign is rejected.
TEST_F(NumberAnnotatorTest, WhenPlusMinusSignsDoesNotParsesIt) {
  const auto text = UTF8ToUnicodeText("+-11");
  ClassificationResult parsed;

  const auto is_number_minus_eleven =
      AllOf(Field(&ClassificationResult::collection, "number"),
            Field(&ClassificationResult::numeric_value, -11),
            Field(&ClassificationResult::numeric_double_value, -11));

  EXPECT_TRUE(number_annotator_.ClassifyText(
      text, {1, 4}, AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));
  EXPECT_THAT(parsed, is_number_minus_eleven);

  EXPECT_FALSE(number_annotator_.ClassifyText(
      text, {0, 4}, AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));
}
// In "-+11" neither the selection "+11" nor the whole text classifies as a
// number.
TEST_F(NumberAnnotatorTest, WhenMinusPlusSignsDoesNotParsesIt) {
  const auto text = UTF8ToUnicodeText("-+11");
  ClassificationResult ignored_result;

  // + right before a number is not included in the number annotation
  const bool plus_prefixed = number_annotator_.ClassifyText(
      text, {1, 4}, AnnotationUsecase_ANNOTATION_USECASE_RAW, &ignored_result);
  EXPECT_FALSE(plus_prefixed);

  const bool whole_text = number_annotator_.ClassifyText(
      text, {0, 4}, AnnotationUsecase_ANNOTATION_USECASE_RAW, &ignored_result);
  EXPECT_FALSE(whole_text);
}
// "10-": a selection covering the number together with a trailing '-' is
// expected to be rejected.
TEST_F(NumberAnnotatorTest, WhenMinusSignSuffixDoesNotParsesIt) {
  const auto text = UTF8ToUnicodeText("10-");
  ClassificationResult parsed;
  const bool classified = number_annotator_.ClassifyText(
      text, CodepointSpan(0, 3), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &parsed);
  EXPECT_FALSE(classified);
}
// "10**": the bare digits (span {0, 2}) are expected to classify as the number
// 10; selections that also include one or both '*' characters are expected to
// be rejected.
TEST_F(NumberAnnotatorTest, WhenMultipleCharSuffixDoesNotParsesIt) {
  const auto text = UTF8ToUnicodeText("10**");
  const auto usecase = AnnotationUsecase_ANNOTATION_USECASE_RAW;
  ClassificationResult parsed;

  EXPECT_TRUE(number_annotator_.ClassifyText(text, CodepointSpan(0, 2),
                                             usecase, &parsed));
  EXPECT_THAT(parsed,
              AllOf(Field(&ClassificationResult::collection, "number"),
                    Field(&ClassificationResult::numeric_value, 10),
                    Field(&ClassificationResult::numeric_double_value, 10)));

  // Span ends 3 and 4 pull in the first and then both '*' characters.
  for (const int span_end : {3, 4}) {
    EXPECT_FALSE(number_annotator_.ClassifyText(
        text, CodepointSpan(0, span_end), usecase, &parsed));
  }
}
// "**10": selections that include one '*' (span {1, 4}) or both '*'
// characters (span {0, 4}) in front of the digits are expected to be rejected.
TEST_F(NumberAnnotatorTest, WhenMultipleCharPrefixDoesNotParsesIt) {
  const auto text = UTF8ToUnicodeText("**10");
  ClassificationResult parsed;

  for (const int span_begin : {1, 0}) {
    EXPECT_FALSE(number_annotator_.ClassifyText(
        text, CodepointSpan(span_begin, 4),
        AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));
  }
}
// "-1000000000" selected in full is expected to classify as a number with
// matching integer and double values.
TEST_F(NumberAnnotatorTest, WhenLowestSupportedNumberParsesIt) {
  const auto text = UTF8ToUnicodeText("-1000000000");
  ClassificationResult parsed;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      text, CodepointSpan(0, 11), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &parsed));

  EXPECT_THAT(
      parsed,
      AllOf(Field(&ClassificationResult::collection, "number"),
            Field(&ClassificationResult::numeric_value, -1000000000),
            Field(&ClassificationResult::numeric_double_value, -1000000000)));
}
// "1000000000" selected in full is expected to classify as a number with
// matching integer and double values.
TEST_F(NumberAnnotatorTest, WhenLargestSupportedNumberParsesIt) {
  const auto text = UTF8ToUnicodeText("1000000000");
  ClassificationResult parsed;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      text, CodepointSpan(0, 10), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &parsed));

  EXPECT_THAT(
      parsed,
      AllOf(Field(&ClassificationResult::collection, "number"),
            Field(&ClassificationResult::numeric_value, 1000000000),
            Field(&ClassificationResult::numeric_double_value, 1000000000)));
}
// "-999999999.999999999" selected in full is expected to classify as a number.
// NOTE(review): the literal -999999999.999999999 has more significant digits
// than a double can hold and rounds to -1e9; presumably that is why the
// expected integer value is -1000000000 -- confirm against the annotator.
TEST_F(NumberAnnotatorTest, WhenLowestSupportedFloatNumberParsesIt) {
  const auto text = UTF8ToUnicodeText("-999999999.999999999");
  ClassificationResult parsed;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      text, CodepointSpan(0, 20), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &parsed));

  EXPECT_THAT(parsed,
              AllOf(Field(&ClassificationResult::collection, "number"),
                    Field(&ClassificationResult::numeric_value, -1000000000),
                    Field(&ClassificationResult::numeric_double_value,
                          -999999999.999999999)));
}
// "999999999.999999999" selected in full is expected to classify as a number.
// NOTE(review): the literal 999999999.999999999 has more significant digits
// than a double can hold and rounds to 1e9; presumably that is why the
// expected integer value is 1000000000 -- confirm against the annotator.
TEST_F(NumberAnnotatorTest, WhenLargestFloatSupportedNumberParsesIt) {
  const auto text = UTF8ToUnicodeText("999999999.999999999");
  ClassificationResult parsed;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      text, CodepointSpan(0, 19), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &parsed));

  EXPECT_THAT(parsed,
              AllOf(Field(&ClassificationResult::collection, "number"),
                    Field(&ClassificationResult::numeric_value, 1000000000),
                    Field(&ClassificationResult::numeric_double_value,
                          999999999.999999999)));
}
// A 40-digit string selected in full is expected to be rejected. (The options
// built in CreateOptionsData set max_number_of_digits to 20.)
TEST_F(NumberAnnotatorTest, WhenLargeNumberDoesNotParseIt) {
  const auto text =
      UTF8ToUnicodeText("1234567890123456789012345678901234567890");
  ClassificationResult parsed;
  const bool classified = number_annotator_.ClassifyText(
      text, CodepointSpan(0, 40), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &parsed);
  EXPECT_FALSE(classified);
}
// "2016-2017" selected in full is expected to be rejected: a '-' between two
// digit groups does not make a single number.
TEST_F(NumberAnnotatorTest, WhenMinusInTheMiddleDoesNotParseIt) {
  const auto text = UTF8ToUnicodeText("2016-2017");
  ClassificationResult parsed;
  const bool classified = number_annotator_.ClassifyText(
      text, CodepointSpan(0, 9), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &parsed);
  EXPECT_FALSE(classified);
}
// A lone "%" with no number attached must not produce any annotation: FindAll
// is expected to succeed and leave `result` empty.
TEST_F(NumberAnnotatorTest, WhenSuffixWithoutNumberDoesNotParseIt) {
  std::vector<AnnotatedSpan> result;
  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText("... % ..."), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      ModeFlag_ANNOTATION, &result));
  // IsEmpty() avoids comparing the unsigned size() against a signed literal
  // and reports the container itself when the expectation fails.
  ASSERT_THAT(result, IsEmpty());
}
// A lone "$" with no number attached must not produce any annotation: FindAll
// is expected to succeed and leave `result` empty.
TEST_F(NumberAnnotatorTest, WhenPrefixWithoutNumberDoesNotParseIt) {
  std::vector<AnnotatedSpan> result;
  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText("... $ ..."), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      ModeFlag_ANNOTATION, &result));
  // IsEmpty() avoids comparing the unsigned size() against a signed literal
  // and reports the container itself when the expectation fails.
  ASSERT_THAT(result, IsEmpty());
}
// "$%" with no number in between must not produce any annotation: FindAll is
// expected to succeed and leave `result` empty.
TEST_F(NumberAnnotatorTest, WhenPrefixAndSuffixWithoutNumberDoesNotParseIt) {
  std::vector<AnnotatedSpan> result;
  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText("... $% ..."), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      ModeFlag_ANNOTATION, &result));
  // IsEmpty() avoids comparing the unsigned size() against a signed literal
  // and reports the container itself when the expectation fails.
  ASSERT_THAT(result, IsEmpty());
}
// Integer numbers: ClassifyText is expected to report collection "number",
// score 1 and priority score -10 (the priority_score configured in
// CreateOptionsData); FindAll is expected to annotate every integer in a
// sentence.
TEST_F(NumberAnnotatorTest, ForNumberAnnotationsSetsScoreAndPriorityScore) {
  const auto usecase = AnnotationUsecase_ANNOTATION_USECASE_RAW;

  // Single selection covering "12345".
  const auto selection_text = UTF8ToUnicodeText("... 12345 ...");
  ClassificationResult classification;
  EXPECT_TRUE(number_annotator_.ClassifyText(
      selection_text, CodepointSpan(4, 9), usecase, &classification));
  EXPECT_EQ(classification.collection, "number");
  EXPECT_EQ(classification.numeric_value, 12345);
  EXPECT_EQ(classification.numeric_double_value, 12345);
  EXPECT_EQ(classification.score, 1);
  EXPECT_EQ(classification.priority_score, -10);

  // Whole-sentence annotation.
  const auto sentence = UTF8ToUnicodeText("Come at 9 or 10 ok?");
  std::vector<AnnotatedSpan> annotations;
  EXPECT_TRUE(number_annotator_.FindAll(sentence, usecase, ModeFlag_ANNOTATION,
                                        &annotations));
  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(8, 9), "number",
                                  /*int_value=*/9, /*double_value=*/9),
                  IsAnnotatedSpan(CodepointSpan(13, 15), "number",
                                  /*int_value=*/10, /*double_value=*/10)));
}
// Fractional numbers: ClassifyText is expected to report collection "number",
// the truncated integer value, the full double value, score 1 and priority
// score 1 (the float_number_priority_score configured in CreateOptionsData);
// FindAll is expected to annotate every fractional number in a sentence.
TEST_F(NumberAnnotatorTest,
       ForFloatNumberAnnotationsSetsScoreAndPriorityScore) {
  ClassificationResult classification_result;
  EXPECT_TRUE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText("... 12345.12345 ..."), {4, 15},
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result));
  EXPECT_EQ(classification_result.collection, "number");
  EXPECT_EQ(classification_result.numeric_value, 12345);
  // 12345.12345 is not exactly representable in binary floating point, so
  // compare with the ULP-tolerant assertion rather than exact equality.
  EXPECT_DOUBLE_EQ(classification_result.numeric_double_value, 12345.12345);
  EXPECT_EQ(classification_result.score, 1);
  EXPECT_EQ(classification_result.priority_score, 1);

  std::vector<AnnotatedSpan> result;
  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText("Results are between 12.5 and 13.5, right?"),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION, &result));
  EXPECT_THAT(result,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(20, 24), "number",
                                  /*int_value=*/12, /*double_value=*/12.5,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(29, 33), "number",
                                  /*int_value=*/13, /*double_value=*/13.5,
                                  /*priority_score=*/1)));
}
// Percentages in the RAW usecase: both the "%" and the " percent" suffix forms
// are expected to classify as "percentage" with score 1 and priority score 1.
// FindAll is expected to return both the plain number and the enclosing
// percentage for each occurrence.
TEST_F(NumberAnnotatorTest, ForPercentageAnnotationsSetsScoreAndPriorityScore) {
  const auto usecase = AnnotationUsecase_ANNOTATION_USECASE_RAW;
  ClassificationResult classification;

  // Symbol suffix attached directly to the number.
  const auto symbol_text = UTF8ToUnicodeText("... 12345% ...");
  EXPECT_TRUE(number_annotator_.ClassifyText(symbol_text, CodepointSpan(4, 10),
                                             usecase, &classification));
  EXPECT_EQ(classification.collection, "percentage");
  EXPECT_EQ(classification.numeric_value, 12345);
  EXPECT_EQ(classification.numeric_double_value, 12345);
  EXPECT_EQ(classification.score, 1);
  EXPECT_EQ(classification.priority_score, 1);

  // Word suffix separated from the number by a space.
  const auto word_text = UTF8ToUnicodeText("... 12345 percent ...");
  EXPECT_TRUE(number_annotator_.ClassifyText(word_text, CodepointSpan(4, 17),
                                             usecase, &classification));
  EXPECT_EQ(classification.collection, "percentage");
  EXPECT_EQ(classification.numeric_value, 12345);
  EXPECT_EQ(classification.numeric_double_value, 12345);
  EXPECT_EQ(classification.score, 1);
  EXPECT_EQ(classification.priority_score, 1);

  // Whole-sentence annotation.
  const auto sentence =
      UTF8ToUnicodeText("Results are between 9% and 10 percent.");
  std::vector<AnnotatedSpan> annotations;
  EXPECT_TRUE(number_annotator_.FindAll(sentence, usecase, ModeFlag_ANNOTATION,
                                        &annotations));
  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(20, 21), "number",
                                  /*int_value=*/9, /*double_value=*/9),
                  IsAnnotatedSpan(CodepointSpan(20, 22), "percentage",
                                  /*int_value=*/9, /*double_value=*/9,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(27, 29), "number",
                                  /*int_value=*/10, /*double_value=*/10),
                  IsAnnotatedSpan(CodepointSpan(27, 37), "percentage",
                                  /*int_value=*/10, /*double_value=*/10,
                                  /*priority_score=*/1)));
}
// SMART usecase: plain numbers are expected to be rejected while percentages
// are still classified and annotated. (CreateOptionsData enables numbers for
// RAW only and percentages for both RAW and SMART.)
TEST_F(NumberAnnotatorTest, NumberDisabledPercentageEnabledForSmartUsecase) {
  const auto usecase = AnnotationUsecase_ANNOTATION_USECASE_SMART;
  ClassificationResult classification;

  // A bare number is not classified.
  const auto number_text = UTF8ToUnicodeText("... 12345 ...");
  EXPECT_FALSE(number_annotator_.ClassifyText(number_text, CodepointSpan(4, 9),
                                              usecase, &classification));

  // Number followed by the "%" symbol.
  const auto symbol_text = UTF8ToUnicodeText("... 12345% ...");
  EXPECT_TRUE(number_annotator_.ClassifyText(symbol_text, CodepointSpan(4, 10),
                                             usecase, &classification));
  EXPECT_EQ(classification.collection, "percentage");
  EXPECT_EQ(classification.numeric_value, 12345);
  EXPECT_EQ(classification.numeric_double_value, 12345.0);
  EXPECT_EQ(classification.score, 1);
  EXPECT_EQ(classification.priority_score, 1);

  // Number followed directly by the word "percent" (no space).
  const auto word_text = UTF8ToUnicodeText("... 12345percent ...");
  EXPECT_TRUE(number_annotator_.ClassifyText(word_text, CodepointSpan(4, 16),
                                             usecase, &classification));
  EXPECT_EQ(classification.collection, "percentage");
  EXPECT_EQ(classification.numeric_value, 12345);
  EXPECT_EQ(classification.numeric_double_value, 12345);
  EXPECT_EQ(classification.score, 1);
  EXPECT_EQ(classification.priority_score, 1);

  // In a sentence only the percentage is expected; the bare "3" is not.
  const auto sentence = UTF8ToUnicodeText("Accuracy for experiment 3 is 9%.");
  std::vector<AnnotatedSpan> annotations;
  EXPECT_TRUE(number_annotator_.FindAll(sentence, usecase, ModeFlag_ANNOTATION,
                                        &annotations));
  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(29, 31), "percentage",
                                  /*int_value=*/9, /*double_value=*/9.0,
                                  /*priority_score=*/1)));
}
// In an arithmetic expression with space-separated operators, only the
// operands are expected to be annotated; the "+", "-" and "*" tokens are not,
// and the "-" is not folded into the following number.
TEST_F(NumberAnnotatorTest, MathOperatorsNotAnnotatedAsNumbersFindAll) {
  const auto expression =
      UTF8ToUnicodeText("how much is 2 + 2 or 5 - 96 * 89");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      expression, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      ModeFlag_ANNOTATION, &annotations));

  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(12, 13), "number",
                                  /*int_value=*/2, /*double_value=*/2),
                  IsAnnotatedSpan(CodepointSpan(16, 17), "number",
                                  /*int_value=*/2, /*double_value=*/2),
                  IsAnnotatedSpan(CodepointSpan(21, 22), "number",
                                  /*int_value=*/5, /*double_value=*/5),
                  IsAnnotatedSpan(CodepointSpan(25, 27), "number",
                                  /*int_value=*/96, /*double_value=*/96),
                  IsAnnotatedSpan(CodepointSpan(30, 32), "number",
                                  /*int_value=*/89, /*double_value=*/89)));
}
// Selecting only the operator character (span {2, 3}) in an arithmetic
// expression is expected to be rejected, for both "+" and "-".
TEST_F(NumberAnnotatorTest, MathOperatorsNotAnnotatedAsNumbersClassifyText) {
  const auto usecase = AnnotationUsecase_ANNOTATION_USECASE_RAW;
  const CodepointSpan operator_span(2, 3);
  ClassificationResult classification;

  const auto plus_expression = UTF8ToUnicodeText("2 + 2");
  EXPECT_FALSE(number_annotator_.ClassifyText(plus_expression, operator_span,
                                              usecase, &classification));

  const auto minus_expression = UTF8ToUnicodeText("2 - 96 * 89");
  EXPECT_FALSE(number_annotator_.ClassifyText(minus_expression, operator_span,
                                              usecase, &classification));
}
// Numbers on either side of a '/' are expected to be annotated individually,
// whether or not the slash is surrounded by spaces.
TEST_F(NumberAnnotatorTest, SlashSeparatesTwoNumbersFindAll) {
  const auto expression = UTF8ToUnicodeText("what's 1 + 2/3 * 4/5 * 6 / 7");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      expression, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      ModeFlag_ANNOTATION, &annotations));

  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(7, 8), "number",
                                  /*int_value=*/1, /*double_value=*/1),
                  IsAnnotatedSpan(CodepointSpan(11, 12), "number",
                                  /*int_value=*/2, /*double_value=*/2),
                  IsAnnotatedSpan(CodepointSpan(13, 14), "number",
                                  /*int_value=*/3, /*double_value=*/3),
                  IsAnnotatedSpan(CodepointSpan(17, 18), "number",
                                  /*int_value=*/4, /*double_value=*/4),
                  IsAnnotatedSpan(CodepointSpan(19, 20), "number",
                                  /*int_value=*/5, /*double_value=*/5),
                  IsAnnotatedSpan(CodepointSpan(23, 24), "number",
                                  /*int_value=*/6, /*double_value=*/6),
                  IsAnnotatedSpan(CodepointSpan(27, 28), "number",
                                  /*int_value=*/7, /*double_value=*/7)));
}
// In "2/3", each side of the slash selected on its own is expected to classify
// as a number with score 1.
TEST_F(NumberAnnotatorTest, SlashSeparatesTwoNumbersClassifyText) {
  const auto expression = UTF8ToUnicodeText("what's 1 + 2/3 * 4");
  const auto usecase = AnnotationUsecase_ANNOTATION_USECASE_RAW;
  ClassificationResult classification;

  // Numerator: the "2" before the slash.
  EXPECT_TRUE(number_annotator_.ClassifyText(
      expression, CodepointSpan(11, 12), usecase, &classification));
  EXPECT_EQ(classification.collection, "number");
  EXPECT_EQ(classification.numeric_value, 2);
  EXPECT_EQ(classification.numeric_double_value, 2);
  EXPECT_EQ(classification.score, 1);

  // Denominator: the "3" after the slash.
  EXPECT_TRUE(number_annotator_.ClassifyText(
      expression, CodepointSpan(13, 14), usecase, &classification));
  EXPECT_EQ(classification.collection, "number");
  EXPECT_EQ(classification.numeric_value, 3);
  EXPECT_EQ(classification.numeric_double_value, 3);
  EXPECT_EQ(classification.score, 1);
}
// Malformed slash contexts (letters glued to digits, leading or doubled
// slashes) are expected to yield no annotation, with one exception noted
// below.
TEST_F(NumberAnnotatorTest, SlashDoesNotSeparatesTwoNumbersFindAll) {
  const auto text =
      UTF8ToUnicodeText("what's 2a2/3 or 2/s4 or 2/ or /3 or //3 or 2//");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &annotations));

  // The 2 in "2/" still counts as a number because '/' is punctuation.
  EXPECT_THAT(annotations, UnorderedElementsAre(IsAnnotatedSpan(
                               CodepointSpan(24, 25), "number",
                               /*int_value=*/2, /*double_value=*/2)));
}
// Numbers written inside interval notation, next to round or square brackets
// and commas, are expected to be annotated, including negative and fractional
// values.
TEST_F(NumberAnnotatorTest, BracketsContextAnnotatedFindAll) {
  const auto text =
      UTF8ToUnicodeText("The interval is: (12, 13) or [-12, -4.5)");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &annotations));

  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(18, 20), "number",
                                  /*int_value=*/12, /*double_value=*/12),
                  IsAnnotatedSpan(CodepointSpan(22, 24), "number",
                                  /*int_value=*/13, /*double_value=*/13),
                  IsAnnotatedSpan(CodepointSpan(30, 33), "number",
                                  /*int_value=*/-12, /*double_value=*/-12),
                  IsAnnotatedSpan(CodepointSpan(35, 39), "number",
                                  /*int_value=*/-4, /*double_value=*/-4.5,
                                  /*priority_score=*/1)));
}
// With a '-' in front of the opening bracket and a '*' before the closing one,
// none of the numbers in the interval is expected to be annotated.
TEST_F(NumberAnnotatorTest, BracketsContextNotAnnotatedFindAll) {
  const auto text = UTF8ToUnicodeText("The interval is: -(12, 138*)");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &annotations));

  EXPECT_THAT(annotations, IsEmpty());
}
// Fractional numbers written with different decimal-separator characters are
// all expected to be annotated as fractional numbers.
TEST_F(NumberAnnotatorTest, FractionalNumberDotsFindAll) {
  // Dots source: https://unicode-search.net/unicode-namesearch.pl?term=period
  const auto text = UTF8ToUnicodeText("3.1 3﹒2 3.3");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &annotations));

  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(0, 3), "number",
                                  /*int_value=*/3, /*double_value=*/3.1,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(4, 7), "number",
                                  /*int_value=*/3, /*double_value=*/3.2,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(8, 11), "number",
                                  /*int_value=*/3, /*double_value=*/3.3,
                                  /*priority_score=*/1)));
}
// Digit, separator and percent characters taken from the Unicode lists linked
// below are expected to be annotated as numbers, with the last one also
// annotated as a percentage.
TEST_F(NumberAnnotatorTest, NonAsciiDigitsFindAll) {
  // Dots source: https://unicode-search.net/unicode-namesearch.pl?term=period
  // Digits source: https://unicode-search.net/unicode-namesearch.pl?term=digit
  const auto text = UTF8ToUnicodeText("3 3﹒2 3.3%");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &annotations));

  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(0, 1), "number",
                                  /*int_value=*/3, /*double_value=*/3),
                  IsAnnotatedSpan(CodepointSpan(2, 5), "number",
                                  /*int_value=*/3, /*double_value=*/3.2,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(6, 9), "number",
                                  /*int_value=*/3, /*double_value=*/3.3,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(6, 10), "percentage",
                                  /*int_value=*/3, /*double_value=*/3.3,
                                  /*priority_score=*/1)));
}
// Numbers with leading zeros are expected to be annotated; the expected spans
// include the leading zeros while the expected values drop them.
TEST_F(NumberAnnotatorTest, AnnotatedZeroPrecededNumbersFindAll) {
  const auto text = UTF8ToUnicodeText("Numbers: 0.9 or 09 or 09.9 or 032310");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &annotations));

  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(9, 12), "number",
                                  /*int_value=*/0, /*double_value=*/0.9,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(16, 18), "number",
                                  /*int_value=*/9, /*double_value=*/9),
                  IsAnnotatedSpan(CodepointSpan(22, 26), "number",
                                  /*int_value=*/9, /*double_value=*/9.9,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(30, 36), "number",
                                  /*int_value=*/32310,
                                  /*double_value=*/32310)));
}
// Numbers whose fractional part is all zeros ("15.0", "16.00") are expected to
// be annotated over their full span with integral int and double values.
TEST_F(NumberAnnotatorTest, ZeroAfterDotFindAll) {
  const auto text = UTF8ToUnicodeText("15.0 16.00");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &annotations));

  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(0, 4), "number",
                                  /*int_value=*/15, /*double_value=*/15),
                  IsAnnotatedSpan(CodepointSpan(5, 10), "number",
                                  /*int_value=*/16, /*double_value=*/16)));
}
// Fractional values made of repeated nines with one to four decimals are
// expected to keep their exact double value, with the integer value being the
// truncated whole part.
TEST_F(NumberAnnotatorTest, NineDotNineFindAll) {
  const auto text = UTF8ToUnicodeText("9.9 9.99 99.99 99.999 99.9999");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, ModeFlag_ANNOTATION,
      &annotations));

  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(0, 3), "number",
                                  /*int_value=*/9, /*double_value=*/9.9,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(4, 8), "number",
                                  /*int_value=*/9, /*double_value=*/9.99,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(9, 14), "number",
                                  /*int_value=*/99, /*double_value=*/99.99,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(15, 21), "number",
                                  /*int_value=*/99, /*double_value=*/99.999,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(22, 29), "number",
                                  /*int_value=*/99, /*double_value=*/99.9999,
                                  /*priority_score=*/1)));
}
} // namespace test_internal
} // namespace libtextclassifier3