/*
* Copyright (C) 2018 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
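
// Tests for the SentencePieceNormalizer used by libtextclassifier3: each test
// reads a serialized normalization config and checks how the add_dummy_prefix,
// remove_extra_whitespaces and escape_whitespaces options affect the output of
// Normalize().
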
#include <fstream>
#include <string>

#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "utils/sentencepiece/double_array_trie.h"
#include "utils/sentencepiece/normalizer.h"
#include "utils/sentencepiece/test_utils.h"
#include "utils/strings/stringpiece.h"

namespace libtextclassifier3 {
namespace {
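
// Path of the serialized normalization config that each test reads into
// `config` and passes to NormalizerFromSpec().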
std::string GetTestConfigPath() {
  return "";
}
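
// With all three options enabled, the output should match the reference
// SentencePiece normalizer: whitespace is escaped to "▁", a dummy "▁" prefix
// is prepended, redundant whitespace is collapsed, and NFKC normalization is
// applied.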
TEST(NormalizerTest, NormalizesAsReferenceNormalizer) {
  std::ifstream test_config_stream(GetTestConfigPath());
  std::string config((std::istreambuf_iterator<char>(test_config_stream)),
                     (std::istreambuf_iterator<char>()));
  SentencePieceNormalizer normalizer =
      NormalizerFromSpec(config, /*add_dummy_prefix=*/true,
                         /*remove_extra_whitespaces=*/true,
                         /*escape_whitespaces=*/true);
  EXPECT_EQ(normalizer.Normalize("hello there"), "▁hello▁there");

  // Redundant whitespace.
  EXPECT_EQ(normalizer.Normalize("when is  the  world cup?"),
            "▁when▁is▁the▁world▁cup?");

  // Different whitespace.
  EXPECT_EQ(normalizer.Normalize("general\tkenobi"), "▁general▁kenobi");

  // NFKC char to multi-char normalization.
  EXPECT_EQ(normalizer.Normalize("㍿"), "▁株式会社");

  // Half width katakana, character composition happens.
  EXPECT_EQ(normalizer.Normalize(" ｸﾞｰｸﾞﾙ "), "▁グーグル");

  // NFKC char to char normalization.
  EXPECT_EQ(normalizer.Normalize("①②③"), "▁123");
}
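
// With add_dummy_prefix=false, the same inputs normalize identically except
// that no leading "▁" is prepended to the output.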
TEST(NormalizerTest, NoDummyPrefix) {
  std::ifstream test_config_stream(GetTestConfigPath());
  std::string config((std::istreambuf_iterator<char>(test_config_stream)),
                     (std::istreambuf_iterator<char>()));
  SentencePieceNormalizer normalizer =
      NormalizerFromSpec(config, /*add_dummy_prefix=*/false,
                         /*remove_extra_whitespaces=*/true,
                         /*escape_whitespaces=*/true);
  EXPECT_EQ(normalizer.Normalize("hello there"), "hello▁there");

  // Redundant whitespace.
  EXPECT_EQ(normalizer.Normalize("when is  the  world cup?"),
            "when▁is▁the▁world▁cup?");

  // Different whitespace.
  EXPECT_EQ(normalizer.Normalize("general\tkenobi"), "general▁kenobi");

  // NFKC char to multi-char normalization.
  EXPECT_EQ(normalizer.Normalize("㍿"), "株式会社");

  // Half width katakana, character composition happens.
  EXPECT_EQ(normalizer.Normalize(" ｸﾞｰｸﾞﾙ "), "グーグル");

  // NFKC char to char normalization.
  EXPECT_EQ(normalizer.Normalize("①②③"), "123");
}
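
// With remove_extra_whitespaces=false, consecutive whitespace characters are
// preserved and each one is escaped to its own "▁".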
TEST(NormalizerTest, NoRemoveExtraWhitespace) {
  std::ifstream test_config_stream(GetTestConfigPath());
  std::string config((std::istreambuf_iterator<char>(test_config_stream)),
                     (std::istreambuf_iterator<char>()));
  SentencePieceNormalizer normalizer =
      NormalizerFromSpec(config, /*add_dummy_prefix=*/false,
                         /*remove_extra_whitespaces=*/false,
                         /*escape_whitespaces=*/true);
  EXPECT_EQ(normalizer.Normalize("hello there"), "hello▁there");

  // Redundant whitespace.
  EXPECT_EQ(normalizer.Normalize("when is  the  world cup?"),
            "when▁is▁▁the▁▁world▁cup?");

  // Different whitespace.
  EXPECT_EQ(normalizer.Normalize("general\tkenobi"), "general▁kenobi");
}
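
// With escape_whitespaces=false, whitespace is kept as plain spaces instead of
// being replaced by "▁"; tabs are still normalized to spaces.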
TEST(NormalizerTest, NoEscapeWhitespaces) {
  std::ifstream test_config_stream(GetTestConfigPath());
  std::string config((std::istreambuf_iterator<char>(test_config_stream)),
                     (std::istreambuf_iterator<char>()));
  SentencePieceNormalizer normalizer =
      NormalizerFromSpec(config, /*add_dummy_prefix=*/false,
                         /*remove_extra_whitespaces=*/false,
                         /*escape_whitespaces=*/false);
  EXPECT_EQ(normalizer.Normalize("hello there"), "hello there");

  // Redundant whitespace.
  EXPECT_EQ(normalizer.Normalize("when is  the  world cup?"),
            "when is  the  world cup?");

  // Different whitespace.
  EXPECT_EQ(normalizer.Normalize("general\tkenobi"), "general kenobi");
}

}  // namespace
}  // namespace libtextclassifier3