| /* |
| * Copyright (C) 2018 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #include "utils/tokenizer.h" |
| |
| #include <vector> |
| |
| #include "gmock/gmock.h" |
| #include "gtest/gtest.h" |
| |
| namespace libtextclassifier3 { |
| namespace { |
| |
| using testing::ElementsAreArray; |
| |
| class TestingTokenizer : public Tokenizer { |
| public: |
| TestingTokenizer( |
| const TokenizationType type, const UniLib* unilib, |
| const std::vector<const TokenizationCodepointRange*>& codepoint_ranges, |
| const std::vector<const CodepointRange*>& |
| internal_tokenizer_codepoint_ranges, |
| const bool split_on_script_change, |
| const bool icu_preserve_whitespace_tokens, |
| const bool preserve_floating_numbers) |
| : Tokenizer(type, unilib, codepoint_ranges, |
| internal_tokenizer_codepoint_ranges, split_on_script_change, |
| icu_preserve_whitespace_tokens, preserve_floating_numbers) {} |
| |
| using Tokenizer::FindTokenizationRange; |
| }; |
| |
| class TestingTokenizerProxy { |
| public: |
| TestingTokenizerProxy( |
| TokenizationType type, |
| const std::vector<TokenizationCodepointRangeT>& codepoint_range_configs, |
| const std::vector<CodepointRangeT>& internal_codepoint_range_configs, |
| const bool split_on_script_change, |
| const bool icu_preserve_whitespace_tokens, |
| const bool preserve_floating_numbers) |
| : INIT_UNILIB_FOR_TESTING(unilib_) { |
| const int num_configs = codepoint_range_configs.size(); |
| std::vector<const TokenizationCodepointRange*> configs_fb; |
| configs_fb.reserve(num_configs); |
| const int num_internal_configs = internal_codepoint_range_configs.size(); |
| std::vector<const CodepointRange*> internal_configs_fb; |
| internal_configs_fb.reserve(num_internal_configs); |
| buffers_.reserve(num_configs + num_internal_configs); |
| for (int i = 0; i < num_configs; i++) { |
| flatbuffers::FlatBufferBuilder builder; |
| builder.Finish(CreateTokenizationCodepointRange( |
| builder, &codepoint_range_configs[i])); |
| buffers_.push_back(builder.Release()); |
| configs_fb.push_back(flatbuffers::GetRoot<TokenizationCodepointRange>( |
| buffers_.back().data())); |
| } |
| for (int i = 0; i < num_internal_configs; i++) { |
| flatbuffers::FlatBufferBuilder builder; |
| builder.Finish( |
| CreateCodepointRange(builder, &internal_codepoint_range_configs[i])); |
| buffers_.push_back(builder.Release()); |
| internal_configs_fb.push_back( |
| flatbuffers::GetRoot<CodepointRange>(buffers_.back().data())); |
| } |
| tokenizer_ = std::unique_ptr<TestingTokenizer>(new TestingTokenizer( |
| type, &unilib_, configs_fb, internal_configs_fb, split_on_script_change, |
| icu_preserve_whitespace_tokens, preserve_floating_numbers)); |
| } |
| |
| TokenizationCodepointRange_::Role TestFindTokenizationRole(int c) const { |
| const TokenizationCodepointRangeT* range = |
| tokenizer_->FindTokenizationRange(c); |
| if (range != nullptr) { |
| return range->role; |
| } else { |
| return TokenizationCodepointRange_::Role_DEFAULT_ROLE; |
| } |
| } |
| |
| std::vector<Token> Tokenize(const std::string& utf8_text) const { |
| return tokenizer_->Tokenize(utf8_text); |
| } |
| |
| private: |
| UniLib unilib_; |
| std::vector<flatbuffers::DetachedBuffer> buffers_; |
| std::unique_ptr<TestingTokenizer> tokenizer_; |
| }; |
| |
| TEST(TokenizerTest, FindTokenizationRange) { |
| std::vector<TokenizationCodepointRangeT> configs; |
| TokenizationCodepointRangeT* config; |
| |
| configs.emplace_back(); |
| config = &configs.back(); |
| config->start = 0; |
| config->end = 10; |
| config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR; |
| |
| configs.emplace_back(); |
| config = &configs.back(); |
| config->start = 32; |
| config->end = 33; |
| config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR; |
| |
| configs.emplace_back(); |
| config = &configs.back(); |
| config->start = 1234; |
| config->end = 12345; |
| config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR; |
| |
| TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER, configs, |
| {}, /*split_on_script_change=*/false, |
| /*icu_preserve_whitespace_tokens=*/false, |
| /*preserve_floating_numbers=*/false); |
| |
| // Test hits to the first group. |
| EXPECT_EQ(tokenizer.TestFindTokenizationRole(0), |
| TokenizationCodepointRange_::Role_TOKEN_SEPARATOR); |
| EXPECT_EQ(tokenizer.TestFindTokenizationRole(5), |
| TokenizationCodepointRange_::Role_TOKEN_SEPARATOR); |
| EXPECT_EQ(tokenizer.TestFindTokenizationRole(10), |
| TokenizationCodepointRange_::Role_DEFAULT_ROLE); |
| |
| // Test a hit to the second group. |
| EXPECT_EQ(tokenizer.TestFindTokenizationRole(31), |
| TokenizationCodepointRange_::Role_DEFAULT_ROLE); |
| EXPECT_EQ(tokenizer.TestFindTokenizationRole(32), |
| TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR); |
| EXPECT_EQ(tokenizer.TestFindTokenizationRole(33), |
| TokenizationCodepointRange_::Role_DEFAULT_ROLE); |
| |
| // Test hits to the third group. |
| EXPECT_EQ(tokenizer.TestFindTokenizationRole(1233), |
| TokenizationCodepointRange_::Role_DEFAULT_ROLE); |
| EXPECT_EQ(tokenizer.TestFindTokenizationRole(1234), |
| TokenizationCodepointRange_::Role_TOKEN_SEPARATOR); |
| EXPECT_EQ(tokenizer.TestFindTokenizationRole(12344), |
| TokenizationCodepointRange_::Role_TOKEN_SEPARATOR); |
| EXPECT_EQ(tokenizer.TestFindTokenizationRole(12345), |
| TokenizationCodepointRange_::Role_DEFAULT_ROLE); |
| |
| // Test a hit outside. |
| EXPECT_EQ(tokenizer.TestFindTokenizationRole(99), |
| TokenizationCodepointRange_::Role_DEFAULT_ROLE); |
| } |
| |
| TEST(TokenizerTest, TokenizeOnSpace) { |
| std::vector<TokenizationCodepointRangeT> configs; |
| TokenizationCodepointRangeT* config; |
| |
| configs.emplace_back(); |
| config = &configs.back(); |
| // Space character. |
| config->start = 32; |
| config->end = 33; |
| config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR; |
| |
| TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER, configs, |
| {}, |
| /*split_on_script_change=*/false, |
| /*icu_preserve_whitespace_tokens=*/false, |
| /*preserve_floating_numbers=*/false); |
| std::vector<Token> tokens = tokenizer.Tokenize("Hello world!"); |
| |
| EXPECT_THAT(tokens, |
| ElementsAreArray({Token("Hello", 0, 5), Token("world!", 6, 12)})); |
| } |
| |
// With split_on_script_change enabled, the expected tokens below break both
// at spaces and at points where adjacent codepoints belong to different
// scripts (Hangul vs. Latin digits/punctuation in the test string).
TEST(TokenizerTest, TokenizeOnSpaceAndScriptChange) {
  std::vector<TokenizationCodepointRangeT> configs;
  TokenizationCodepointRangeT* config;

  // Latin.
  configs.emplace_back();
  config = &configs.back();
  config->start = 0;
  config->end = 32;
  config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
  config->script_id = 1;
  configs.emplace_back();
  config = &configs.back();
  config->start = 32;
  config->end = 33;
  config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
  config->script_id = 1;
  configs.emplace_back();
  config = &configs.back();
  config->start = 33;
  config->end = 0x77F + 1;
  config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
  config->script_id = 1;

  TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER, configs,
                                  {},
                                  /*split_on_script_change=*/true,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/false);
  EXPECT_THAT(tokenizer.Tokenize("앨라배마 주 전화(123) 456-789웹사이트"),
              std::vector<Token>({Token("앨라배마", 0, 4), Token("주", 5, 6),
                                  Token("전화", 7, 10), Token("(123)", 10, 15),
                                  Token("456-789", 16, 23),
                                  Token("웹사이트", 23, 28)}));
}
| |
| TEST(TokenizerTest, TokenizeComplex) { |
| std::vector<TokenizationCodepointRangeT> configs; |
| TokenizationCodepointRangeT* config; |
| |
| // Source: http://www.unicode.org/Public/10.0.0/ucd/Blocks-10.0.0d1.txt |
| // Latin - cyrilic. |
| // 0000..007F; Basic Latin |
| // 0080..00FF; Latin-1 Supplement |
| // 0100..017F; Latin Extended-A |
| // 0180..024F; Latin Extended-B |
| // 0250..02AF; IPA Extensions |
| // 02B0..02FF; Spacing Modifier Letters |
| // 0300..036F; Combining Diacritical Marks |
| // 0370..03FF; Greek and Coptic |
| // 0400..04FF; Cyrillic |
| // 0500..052F; Cyrillic Supplement |
| // 0530..058F; Armenian |
| // 0590..05FF; Hebrew |
| // 0600..06FF; Arabic |
| // 0700..074F; Syriac |
| // 0750..077F; Arabic Supplement |
| configs.emplace_back(); |
| config = &configs.back(); |
| config->start = 0; |
| config->end = 32; |
| config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE; |
| configs.emplace_back(); |
| config = &configs.back(); |
| config->start = 32; |
| config->end = 33; |
| config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR; |
| configs.emplace_back(); |
| config = &configs.back(); |
| config->start = 33; |
| config->end = 0x77F + 1; |
| config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE; |
| |
| // CJK |
| // 2E80..2EFF; CJK Radicals Supplement |
| // 3000..303F; CJK Symbols and Punctuation |
| // 3040..309F; Hiragana |
| // 30A0..30FF; Katakana |
| // 3100..312F; Bopomofo |
| // 3130..318F; Hangul Compatibility Jamo |
| // 3190..319F; Kanbun |
| // 31A0..31BF; Bopomofo Extended |
| // 31C0..31EF; CJK Strokes |
| // 31F0..31FF; Katakana Phonetic Extensions |
| // 3200..32FF; Enclosed CJK Letters and Months |
| // 3300..33FF; CJK Compatibility |
| // 3400..4DBF; CJK Unified Ideographs Extension A |
| // 4DC0..4DFF; Yijing Hexagram Symbols |
| // 4E00..9FFF; CJK Unified Ideographs |
| // A000..A48F; Yi Syllables |
| // A490..A4CF; Yi Radicals |
| // A4D0..A4FF; Lisu |
| // A500..A63F; Vai |
| // F900..FAFF; CJK Compatibility Ideographs |
| // FE30..FE4F; CJK Compatibility Forms |
| // 20000..2A6DF; CJK Unified Ideographs Extension B |
| // 2A700..2B73F; CJK Unified Ideographs Extension C |
| // 2B740..2B81F; CJK Unified Ideographs Extension D |
| // 2B820..2CEAF; CJK Unified Ideographs Extension E |
| // 2CEB0..2EBEF; CJK Unified Ideographs Extension F |
| // 2F800..2FA1F; CJK Compatibility Ideographs Supplement |
| configs.emplace_back(); |
| config = &configs.back(); |
| config->start = 0x2E80; |
| config->end = 0x2EFF + 1; |
| config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR; |
| configs.emplace_back(); |
| config = &configs.back(); |
| config->start = 0x3000; |
| config->end = 0xA63F + 1; |
| config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR; |
| configs.emplace_back(); |
| config = &configs.back(); |
| config->start = 0xF900; |
| config->end = 0xFAFF + 1; |
| config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR; |
| configs.emplace_back(); |
| config = &configs.back(); |
| config->start = 0xFE30; |
| config->end = 0xFE4F + 1; |
| config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR; |
| configs.emplace_back(); |
| config = &configs.back(); |
| config->start = 0x20000; |
| config->end = 0x2A6DF + 1; |
| config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR; |
| configs.emplace_back(); |
| config = &configs.back(); |
| config->start = 0x2A700; |
| config->end = 0x2B73F + 1; |
| config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR; |
| configs.emplace_back(); |
| config = &configs.back(); |
| config->start = 0x2B740; |
| config->end = 0x2B81F + 1; |
| config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR; |
| configs.emplace_back(); |
| config = &configs.back(); |
| config->start = 0x2B820; |
| config->end = 0x2CEAF + 1; |
| config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR; |
| configs.emplace_back(); |
| config = &configs.back(); |
| config->start = 0x2CEB0; |
| config->end = 0x2EBEF + 1; |
| config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR; |
| configs.emplace_back(); |
| config = &configs.back(); |
| config->start = 0x2F800; |
| config->end = 0x2FA1F + 1; |
| config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR; |
| |
| // Thai. |
| // 0E00..0E7F; Thai |
| configs.emplace_back(); |
| config = &configs.back(); |
| config->start = 0x0E00; |
| config->end = 0x0E7F + 1; |
| config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR; |
| |
| TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER, configs, |
| {}, |
| /*split_on_script_change=*/false, |
| /*icu_preserve_whitespace_tokens=*/false, |
| /*preserve_floating_numbers=*/false); |
| std::vector<Token> tokens; |
| |
| tokens = tokenizer.Tokenize( |
| "問少目木輸走猶術権自京門録球変。細開括省用掲情結傍走愛明氷。"); |
| EXPECT_EQ(tokens.size(), 30); |
| |
| tokens = tokenizer.Tokenize("問少目 hello 木輸ยามきゃ"); |
| // clang-format off |
| EXPECT_THAT( |
| tokens, |
| ElementsAreArray({Token("問", 0, 1), |
| Token("少", 1, 2), |
| Token("目", 2, 3), |
| Token("hello", 4, 9), |
| Token("木", 10, 11), |
| Token("輸", 11, 12), |
| Token("ย", 12, 13), |
| Token("า", 13, 14), |
| Token("ม", 14, 15), |
| Token("き", 15, 16), |
| Token("ゃ", 16, 17)})); |
| // clang-format on |
| } |
| |
| #if defined(TC3_TEST_ICU) || defined(__APPLE__) |
| TEST(TokenizerTest, ICUTokenizeWithWhitespaces) { |
| TestingTokenizerProxy tokenizer(TokenizationType_ICU, {}, {}, |
| /*split_on_script_change=*/false, |
| /*icu_preserve_whitespace_tokens=*/true, |
| /*preserve_floating_numbers=*/false); |
| std::vector<Token> tokens = tokenizer.Tokenize("พระบาท สมเด็จ พระ ปร มิ"); |
| // clang-format off |
| ASSERT_EQ(tokens, |
| std::vector<Token>({Token("พระบาท", 0, 6), |
| Token(" ", 6, 7), |
| Token("สมเด็จ", 7, 13), |
| Token(" ", 13, 14), |
| Token("พระ", 14, 17), |
| Token(" ", 17, 18), |
| Token("ปร", 18, 20), |
| Token(" ", 20, 21), |
| Token("มิ", 21, 23)})); |
| // clang-format on |
| } |
| |
| TEST(TokenizerTest, ICUTokenizePunctuation) { |
| TestingTokenizerProxy tokenizer(TokenizationType_ICU, {}, {}, |
| /*split_on_script_change=*/false, |
| /*icu_preserve_whitespace_tokens=*/true, |
| /*preserve_floating_numbers=*/false); |
| std::vector<Token> tokens = |
| tokenizer.Tokenize("The interval is: -(12, 138*)"); |
| // clang-format off |
| ASSERT_EQ( |
| tokens, |
| std::vector<Token>({Token("The", 0, 3), |
| Token(" ", 3, 4), |
| Token("interval", 4, 12), |
| Token(" ", 12, 13), |
| Token("is", 13, 15), |
| Token(":", 15, 16), |
| Token(" ", 16, 17), |
| Token("-", 17, 18), |
| Token("(", 18, 19), |
| Token("12", 19, 21), |
| Token(",", 21, 22), |
| Token(" ", 22, 23), |
| Token("138", 23, 26), |
| Token("*", 26, 27), |
| Token(")", 27, 28)})); |
| // clang-format on |
| } |
| |
| TEST(TokenizerTest, ICUTokenizeWithNumbers) { |
| TestingTokenizerProxy tokenizer(TokenizationType_ICU, {}, {}, |
| /*split_on_script_change=*/false, |
| /*icu_preserve_whitespace_tokens=*/true, |
| /*preserve_floating_numbers=*/false); |
| std::vector<Token> tokens = tokenizer.Tokenize("3.1 3﹒2 3.3"); |
| // clang-format off |
| ASSERT_EQ(tokens, |
| std::vector<Token>({Token("3.1", 0, 3), |
| Token(" ", 3, 4), |
| Token("3﹒2", 4, 7), |
| Token(" ", 7, 8), |
| Token("3.3", 8, 11)})); |
| // clang-format on |
| } |
| #endif |
| |
| #if defined(TC3_TEST_ICU) |
| TEST(TokenizerTest, ICUTokenize) { |
| TestingTokenizerProxy tokenizer(TokenizationType_ICU, {}, {}, |
| /*split_on_script_change=*/false, |
| /*icu_preserve_whitespace_tokens=*/false, |
| /*preserve_floating_numbers=*/false); |
| std::vector<Token> tokens = tokenizer.Tokenize("พระบาทสมเด็จพระปรมิ"); |
| // clang-format off |
| ASSERT_EQ(tokens, |
| std::vector<Token>({Token("พระบาท", 0, 6), |
| Token("สมเด็จ", 6, 12), |
| Token("พระ", 12, 15), |
| Token("ปร", 15, 17), |
| Token("มิ", 17, 19)})); |
| // clang-format on |
| } |
| |
| TEST(TokenizerTest, MixedTokenize) { |
| std::vector<TokenizationCodepointRangeT> configs; |
| TokenizationCodepointRangeT* config; |
| |
| configs.emplace_back(); |
| config = &configs.back(); |
| config->start = 32; |
| config->end = 33; |
| config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR; |
| |
| std::vector<CodepointRangeT> internal_configs; |
| CodepointRangeT* interal_config; |
| |
| internal_configs.emplace_back(); |
| interal_config = &internal_configs.back(); |
| interal_config->start = 0; |
| interal_config->end = 128; |
| |
| internal_configs.emplace_back(); |
| interal_config = &internal_configs.back(); |
| interal_config->start = 128; |
| interal_config->end = 256; |
| |
| internal_configs.emplace_back(); |
| interal_config = &internal_configs.back(); |
| interal_config->start = 256; |
| interal_config->end = 384; |
| |
| internal_configs.emplace_back(); |
| interal_config = &internal_configs.back(); |
| interal_config->start = 384; |
| interal_config->end = 592; |
| |
| TestingTokenizerProxy tokenizer(TokenizationType_MIXED, configs, |
| internal_configs, |
| /*split_on_script_change=*/false, |
| /*icu_preserve_whitespace_tokens=*/false, |
| /*preserve_floating_numbers=*/false); |
| |
| std::vector<Token> tokens = tokenizer.Tokenize( |
| "こんにちはJapanese-ląnguagę text 你好世界 http://www.google.com/"); |
| ASSERT_EQ( |
| tokens, |
| // clang-format off |
| std::vector<Token>({Token("こんにちは", 0, 5), |
| Token("Japanese-ląnguagę", 5, 22), |
| Token("text", 23, 27), |
| Token("你好", 28, 30), |
| Token("世界", 30, 32), |
| Token("http://www.google.com/", 33, 55)})); |
| // clang-format on |
| } |
| |
| TEST(TokenizerTest, InternalTokenizeOnScriptChange) { |
| std::vector<TokenizationCodepointRangeT> configs; |
| TokenizationCodepointRangeT* config; |
| |
| configs.emplace_back(); |
| config = &configs.back(); |
| config->start = 0; |
| config->end = 256; |
| config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE; |
| |
| { |
| TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER, |
| configs, {}, |
| /*split_on_script_change=*/false, |
| /*icu_preserve_whitespace_tokens=*/false, |
| /*preserve_floating_numbers=*/false); |
| |
| EXPECT_EQ(tokenizer.Tokenize("앨라배마123웹사이트"), |
| std::vector<Token>({Token("앨라배마123웹사이트", 0, 11)})); |
| } |
| |
| { |
| TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER, |
| configs, {}, |
| /*split_on_script_change=*/true, |
| /*icu_preserve_whitespace_tokens=*/false, |
| /*preserve_floating_numbers=*/false); |
| EXPECT_EQ(tokenizer.Tokenize("앨라배마123웹사이트"), |
| std::vector<Token>({Token("앨라배마", 0, 4), Token("123", 4, 7), |
| Token("웹사이트", 7, 11)})); |
| } |
| } |
| #endif |
| |
| TEST(TokenizerTest, LetterDigitTokenize) { |
| TestingTokenizerProxy tokenizer(TokenizationType_LETTER_DIGIT, {}, {}, |
| /*split_on_script_change=*/false, |
| /*icu_preserve_whitespace_tokens=*/false, |
| /*preserve_floating_numbers=*/true); |
| std::vector<Token> tokens = tokenizer.Tokenize("7% -3.14 68.9#? 7% $99 .18."); |
| ASSERT_EQ(tokens, |
| std::vector<Token>( |
| {Token("7", 0, 1), Token("%", 1, 2), Token(" ", 2, 3), |
| Token("-", 3, 4), Token("3.14", 4, 8), Token(" ", 8, 9), |
| Token("68.9", 9, 13), Token("#", 13, 14), Token("?", 14, 15), |
| Token(" ", 15, 16), Token("7", 16, 17), Token("%", 17, 18), |
| Token(" ", 18, 19), Token("$", 19, 20), Token("99", 20, 22), |
| Token(" ", 22, 23), Token(".", 23, 24), Token("18", 24, 26), |
| Token(".", 26, 27)})); |
| } |
| |
| TEST(TokenizerTest, LetterDigitTokenizeUnicode) { |
| TestingTokenizerProxy tokenizer(TokenizationType_LETTER_DIGIT, {}, {}, |
| /*split_on_script_change=*/false, |
| /*icu_preserve_whitespace_tokens=*/false, |
| /*preserve_floating_numbers=*/true); |
| std::vector<Token> tokens = tokenizer.Tokenize("2 pércént 3パーセント"); |
| ASSERT_EQ(tokens, std::vector<Token>({Token("2", 0, 1), Token(" ", 1, 2), |
| Token("pércént", 2, 9), |
| Token(" ", 9, 10), Token("3", 10, 11), |
| Token("パーセント", 11, 16)})); |
| } |
| |
| TEST(TokenizerTest, LetterDigitTokenizeWithDots) { |
| TestingTokenizerProxy tokenizer(TokenizationType_LETTER_DIGIT, {}, {}, |
| /*split_on_script_change=*/false, |
| /*icu_preserve_whitespace_tokens=*/false, |
| /*preserve_floating_numbers=*/true); |
| std::vector<Token> tokens = tokenizer.Tokenize("3 3﹒2 3.3%"); |
| ASSERT_EQ(tokens, |
| std::vector<Token>({Token("3", 0, 1), Token(" ", 1, 2), |
| Token("3﹒2", 2, 5), Token(" ", 5, 6), |
| Token("3.3", 6, 9), Token("%", 9, 10)})); |
| } |
| |
| TEST(TokenizerTest, LetterDigitTokenizeDoNotPreserveFloatingNumbers) { |
| TestingTokenizerProxy tokenizer(TokenizationType_LETTER_DIGIT, {}, {}, |
| /*split_on_script_change=*/false, |
| /*icu_preserve_whitespace_tokens=*/false, |
| /*preserve_floating_numbers=*/false); |
| std::vector<Token> tokens = tokenizer.Tokenize("15.12.2019 january's 3.2"); |
| ASSERT_EQ(tokens, |
| std::vector<Token>( |
| {Token("15", 0, 2), Token(".", 2, 3), Token("12", 3, 5), |
| Token(".", 5, 6), Token("2019", 6, 10), Token(" ", 10, 11), |
| Token("january", 11, 18), Token("'", 18, 19), |
| Token("s", 19, 20), Token(" ", 20, 21), Token("3", 21, 22), |
| Token(".", 22, 23), Token("2", 23, 24)})); |
| } |
| |
| TEST(TokenizerTest, LetterDigitTokenizeStrangeStringFloatingNumbers) { |
| TestingTokenizerProxy tokenizer(TokenizationType_LETTER_DIGIT, {}, {}, |
| /*split_on_script_change=*/false, |
| /*icu_preserve_whitespace_tokens=*/false, |
| /*preserve_floating_numbers=*/false); |
| std::vector<Token> tokens = tokenizer.Tokenize("The+2345++the +íí+"); |
| ASSERT_EQ(tokens, |
| std::vector<Token>({Token("The", 0, 3), Token("+", 3, 4), |
| Token("2345", 4, 8), Token("+", 8, 9), |
| Token("+", 9, 10), Token("the", 10, 13), |
| Token(" ", 13, 14), Token("+", 14, 15), |
| Token("íí", 15, 17), Token("+", 17, 18)})); |
| } |
| |
| TEST(TokenizerTest, LetterDigitTokenizeWhitespcesInSameToken) { |
| TestingTokenizerProxy tokenizer(TokenizationType_LETTER_DIGIT, {}, {}, |
| /*split_on_script_change=*/false, |
| /*icu_preserve_whitespace_tokens=*/false, |
| /*preserve_floating_numbers=*/false); |
| std::vector<Token> tokens = tokenizer.Tokenize("2 3 4 5"); |
| ASSERT_EQ(tokens, std::vector<Token>({Token("2", 0, 1), Token(" ", 1, 2), |
| Token("3", 2, 3), Token(" ", 3, 5), |
| Token("4", 5, 6), Token(" ", 6, 9), |
| Token("5", 9, 10)})); |
| } |
| |
| } // namespace |
| } // namespace libtextclassifier3 |