// blob: d3b2e8cdb8b22cb5f47fd9106fb00f5aff65205e [file] [log] [blame]
/*
* Copyright (C) 2018 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "annotator/number/number.h"
#include <string>
#include <vector>
#include "annotator/collections.h"
#include "annotator/model_generated.h"
#include "annotator/types-test-util.h"
#include "annotator/types.h"
#include "utils/test-utils.h"
#include "utils/utf8/unicodetext.h"
#include "utils/utf8/unilib.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
namespace libtextclassifier3 {
namespace {
using testing::AllOf;
using testing::ElementsAre;
using testing::Field;
// Returns the NumberAnnotatorOptions used by the tests in this file: the
// annotator is enabled, '$' is registered as an allowed prefix codepoint and
// '%' as an allowed suffix codepoint.
//
// The options are serialized once, on the first call, into a heap-allocated
// buffer that this function never deletes; the returned root pointer refers
// into that buffer.
const NumberAnnotatorOptions* TestingNumberAnnotatorOptions() {
  static const flatbuffers::DetachedBuffer* serialized_options = [] {
    NumberAnnotatorOptionsT unpacked;
    unpacked.enabled = true;
    unpacked.allowed_prefix_codepoints.push_back('$');
    unpacked.allowed_suffix_codepoints.push_back('%');

    flatbuffers::FlatBufferBuilder fbb;
    const auto root_offset = NumberAnnotatorOptions::Pack(fbb, &unpacked);
    fbb.Finish(root_offset);
    return new flatbuffers::DetachedBuffer(fbb.Release());
  }();

  const auto* root =
      flatbuffers::GetRoot<NumberAnnotatorOptions>(serialized_options->data());
  return root;
}
// Builds the FeatureProcessor handed to the NumberAnnotator under test.
//
// The backing FeatureProcessorOptions are serialized once, on the first call,
// into a heap-allocated buffer that this function never deletes. `unilib` is
// forwarded unchanged to the FeatureProcessor constructor.
FeatureProcessor BuildFeatureProcessor(const UniLib* unilib) {
  static const flatbuffers::DetachedBuffer* serialized_options = [] {
    FeatureProcessorOptionsT unpacked;
    unpacked.context_size = 1;
    unpacked.max_selection_span = 1;
    unpacked.snap_label_span_boundaries_to_containing_tokens = false;
    unpacked.ignored_span_boundary_codepoints.push_back(',');

    // One tokenization rule: the codepoint range starting at 32 (the ASCII
    // space) with end 33 is given the whitespace-separator role.
    unpacked.tokenization_codepoint_config.emplace_back(
        new TokenizationCodepointRangeT());
    auto& space_range = unpacked.tokenization_codepoint_config.back();
    space_range->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
    space_range->start = 32;
    space_range->end = 33;

    flatbuffers::FlatBufferBuilder fbb;
    const auto root_offset = FeatureProcessorOptions::Pack(fbb, &unpacked);
    fbb.Finish(root_offset);
    return new flatbuffers::DetachedBuffer(fbb.Release());
  }();

  return FeatureProcessor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(serialized_options->data()),
      unilib);
}
// Test fixture that owns a NumberAnnotator together with the objects it is
// constructed from: a UniLib and a FeatureProcessor built by
// BuildFeatureProcessor().
class NumberAnnotatorTest : public ::testing::Test {
protected:
// Wires the members together: the feature processor receives a pointer to
// unilib_, and the annotator receives the shared testing options plus a
// pointer to feature_processor_.
NumberAnnotatorTest()
// INIT_UNILIB_FOR_TESTING is a macro defined outside this file; presumably
// it expands to the initializer for unilib_ -- confirm in the test utilities.
: INIT_UNILIB_FOR_TESTING(unilib_),
feature_processor_(BuildFeatureProcessor(&unilib_)),
number_annotator_(TestingNumberAnnotatorOptions(),
&feature_processor_) {}
// Declaration order matters: members are initialized in the order declared
// here, and each later member's initializer takes the address of an earlier
// one.
UniLib unilib_;
FeatureProcessor feature_processor_;
NumberAnnotator number_annotator_;
};
// Selecting a plain run of digits inside surrounding text yields the "number"
// collection and the parsed integer value.
TEST_F(NumberAnnotatorTest, ClassifiesAndParsesNumberCorrectly) {
  // The digits "12345" occupy codepoints 4 through 8; the span passed is
  // {4, 9}.
  constexpr char kContext[] = "... 12345 ...";
  ClassificationResult parsed;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText(kContext), {4, 9},
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));

  EXPECT_EQ(parsed.collection, "number");
  EXPECT_EQ(parsed.numeric_value, 12345);
}
// A selection that mixes digits with a letter must be rejected.
TEST_F(NumberAnnotatorTest, ClassifiesNonNumberCorrectly) {
  // The selected span {4, 10} covers "123a45".
  constexpr char kContext[] = "... 123a45 ...";
  ClassificationResult unused_result;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText(kContext), {4, 10},
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &unused_result));
}
// FindAll reports every number in a longer text, including ones carrying the
// configured '$' prefix or '%' suffix, and reports nothing for the tokens
// decorated with '#'.
TEST_F(NumberAnnotatorTest, FindsAllNumbersInText) {
  constexpr char kText[] =
      "... 12345 ... 9 is my number and I paid $99 and "
      "sometimes 27% but not 68# nor #68";
  // Expected numeric values, in the order the spans are reported.
  const int kExpectedValues[] = {12345, 9, 99, 27};

  std::vector<AnnotatedSpan> annotations;
  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText(kText), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &annotations));

  // Fatal on mismatch so the indexing below never runs out of bounds.
  ASSERT_EQ(annotations.size(), 4);
  for (int i = 0; i < 4; ++i) {
    const auto& candidates = annotations[i].classification;
    ASSERT_EQ(candidates.size(), 1);
    EXPECT_EQ(candidates[0].collection, "number");
    EXPECT_EQ(candidates[0].numeric_value, kExpectedValues[i]);
  }
}
// A digit immediately followed by a comma is still found, and the reported
// span covers only the digit.
TEST_F(NumberAnnotatorTest, FindsNumberWithPunctuation) {
  constexpr char kText[] = "Come at 9, ok?";
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText(kText), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &annotations));

  // The '9' sits at codepoint 8; the expected span is (8, 9).
  const auto covers_the_digit =
      Field(&AnnotatedSpan::span, CodepointSpan(8, 9));
  const auto is_number_nine =
      AllOf(Field(&ClassificationResult::collection, "number"),
            Field(&ClassificationResult::numeric_value, 9));
  const auto has_single_number_classification =
      Field(&AnnotatedSpan::classification, ElementsAre(is_number_nine));

  EXPECT_THAT(annotations,
              ElementsAre(AllOf(covers_the_digit,
                                has_single_number_classification)));
}
// A negative number that makes up the whole text is found, with the minus
// sign included in the reported span.
TEST_F(NumberAnnotatorTest, HandlesNumbersAtBeginning) {
  constexpr char kText[] = "-5";
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText(kText), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &annotations));

  const auto covers_whole_text =
      Field(&AnnotatedSpan::span, CodepointSpan(0, 2));
  const auto is_minus_five =
      AllOf(Field(&ClassificationResult::collection, "number"),
            Field(&ClassificationResult::numeric_value, -5));
  const auto has_single_number_classification =
      Field(&AnnotatedSpan::classification, ElementsAre(is_minus_five));

  EXPECT_THAT(annotations,
              ElementsAre(AllOf(covers_whole_text,
                                has_single_number_classification)));
}
// The negative number made of eighteen nines is accepted and parsed exactly.
TEST_F(NumberAnnotatorTest, WhenLowestSupportedNumberParsesIt) {
  // 19 codepoints: the minus sign followed by eighteen '9' digits.
  constexpr char kLowest[] = "-999999999999999999";
  ClassificationResult parsed;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText(kLowest), {0, 19},
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));

  const auto is_lowest_number = AllOf(
      Field(&ClassificationResult::collection, "number"),
      Field(&ClassificationResult::numeric_value, -999999999999999999L));
  EXPECT_THAT(parsed, is_lowest_number);
}
// The positive number made of eighteen nines is accepted and parsed exactly.
TEST_F(NumberAnnotatorTest, WhenLargestSupportedNumberParsesIt) {
  // 18 codepoints, all '9' digits.
  constexpr char kLargest[] = "999999999999999999";
  ClassificationResult parsed;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText(kLargest), {0, 18},
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &parsed));

  const auto is_largest_number = AllOf(
      Field(&ClassificationResult::collection, "number"),
      Field(&ClassificationResult::numeric_value, 999999999999999999L));
  EXPECT_THAT(parsed, is_largest_number);
}
// A negative number with twenty digits must be rejected.
//
// NOTE(review): the input is -10^19, while the lowest value exercised as
// supported elsewhere in this file is -(10^18 - 1). The test name says
// "first" unsupported number -- confirm against the implementation whether
// -10^18 should be covered here instead or in addition.
TEST_F(NumberAnnotatorTest, WhenFirstLowestNonSupportedNumberDoesNotParseIt) {
  // 21 codepoints: the minus sign, a '1', and nineteen '0' digits.
  constexpr char kTooLow[] = "-10000000000000000000";
  ClassificationResult unused_result;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText(kTooLow), {0, 21},
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &unused_result));
}
// A positive number with twenty digits must be rejected.
//
// NOTE(review): the input is 10^19, while the largest value exercised as
// supported elsewhere in this file is 10^18 - 1. The test name says "first"
// unsupported number -- confirm against the implementation whether 10^18
// should be covered here instead or in addition.
TEST_F(NumberAnnotatorTest, WhenFirstLargestNonSupportedNumberDoesNotParseIt) {
  // 20 codepoints: a '1' followed by nineteen '0' digits.
  constexpr char kTooLarge[] = "10000000000000000000";
  ClassificationResult unused_result;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText(kTooLarge), {0, 20},
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &unused_result));
}
// A forty-digit number must be rejected.
TEST_F(NumberAnnotatorTest, WhenLargeNumberDoesNotParseIt) {
  // "1234567890" repeated four times: 40 codepoints.
  constexpr char kHuge[] = "1234567890123456789012345678901234567890";
  ClassificationResult unused_result;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText(kHuge), {0, 40},
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &unused_result));
}
// Two leading minus signs do not form a valid number.
TEST_F(NumberAnnotatorTest, WhenMultipleMinusSignsDoesNotParseIt) {
  constexpr char kDoubleMinus[] = "--10";
  ClassificationResult unused_result;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText(kDoubleMinus), {0, 4},
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &unused_result));
}
// A minus sign trailing the digits does not form a valid number.
TEST_F(NumberAnnotatorTest, WhenMinusSignSuffixDoesNotParseIt) {
  constexpr char kTrailingMinus[] = "10-";
  ClassificationResult unused_result;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText(kTrailingMinus), {0, 3},
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &unused_result));
}
// Digits joined by a minus sign (such as a year range) are not classified as
// a single number when selected as a whole.
TEST_F(NumberAnnotatorTest, WhenMinusInTheMiddleDoesNotParseIt) {
  constexpr char kYearRange[] = "2016-2017";
  ClassificationResult unused_result;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText(kYearRange), {0, 9},
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &unused_result));
}
// An allowed suffix codepoint standing alone, with no digits attached,
// produces no annotations even though FindAll itself succeeds.
TEST_F(NumberAnnotatorTest, WhenSuffixWithoutNumberDoesNotParseIt) {
  constexpr char kText[] = "... % ...";
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText(kText), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &annotations));

  ASSERT_TRUE(annotations.empty());
}
// An allowed prefix codepoint standing alone, with no digits attached,
// produces no annotations even though FindAll itself succeeds.
TEST_F(NumberAnnotatorTest, WhenPrefixWithoutNumberDoesNotParseIt) {
  constexpr char kText[] = "... $ ...";
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText(kText), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &annotations));

  ASSERT_TRUE(annotations.empty());
}
// An allowed prefix directly followed by an allowed suffix, with no digits
// between them, produces no annotations even though FindAll itself succeeds.
TEST_F(NumberAnnotatorTest, WhenPrefixAndSuffixWithoutNumberDoesNotParseIt) {
  constexpr char kText[] = "... $% ...";
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText(kText), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &annotations));

  ASSERT_TRUE(annotations.empty());
}
} // namespace
} // namespace libtextclassifier3