| /* |
| * Copyright (C) 2015 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #include "WordBreaker.h" |
| |
| #include <cstdio> |
| |
| #include <gtest/gtest.h> |
| #include <unicode/uclean.h> |
| #include <unicode/udata.h> |
| |
| #include "UnicodeUtils.h" |
| |
#ifndef NELEM
// Number of elements in the statically sized array `x`. `x` must be a real array (not a pointer),
// otherwise the division yields a meaningless value.
#define NELEM(x) (sizeof(x) / sizeof((x)[0]))
#endif

// Expands to two comma-separated array initializers: U16_LEAD(codepoint), U16_TRAIL(codepoint).
// The argument is expanded twice, so it should be a plain constant without side effects.
#define UTF16(codepoint) U16_LEAD(codepoint), U16_TRAIL(codepoint)
| |
| namespace minikin { |
| |
| TEST(WordBreakerTest, basic) { |
| uint16_t buf[] = {'h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd'}; |
| WordBreaker breaker; |
| breaker.setText(buf, NELEM(buf)); |
| EXPECT_EQ(0, breaker.current()); |
| EXPECT_EQ(6, breaker.followingWithLocale(Locale("en-US"), LineBreakStyle::None, |
| LineBreakWordStyle::None, 0)); // after "hello " |
| EXPECT_EQ(0, breaker.wordStart()); // "hello" |
| EXPECT_EQ(5, breaker.wordEnd()); |
| EXPECT_EQ(0, breaker.breakBadness()); |
| EXPECT_EQ(6, breaker.current()); |
| EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end |
| EXPECT_EQ(6, breaker.wordStart()); // "world" |
| EXPECT_EQ(11, breaker.wordEnd()); |
| EXPECT_EQ(0, breaker.breakBadness()); |
| EXPECT_EQ(11, breaker.current()); |
| } |
| |
| TEST(WordBreakerTest, softHyphen) { |
| uint16_t buf[] = {'h', 'e', 'l', 0x00AD, 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd'}; |
| auto lbStyle = LineBreakStyle::None; |
| auto lbWordStyle = LineBreakWordStyle::None; |
| WordBreaker breaker; |
| breaker.setText(buf, NELEM(buf)); |
| EXPECT_EQ(0, breaker.current()); |
| // after "hel{SOFT HYPHEN}lo " |
| EXPECT_EQ(7, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, 0)); |
| EXPECT_EQ(0, breaker.wordStart()); // "hel{SOFT HYPHEN}lo" |
| EXPECT_EQ(6, breaker.wordEnd()); |
| EXPECT_EQ(0, breaker.breakBadness()); |
| EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end |
| EXPECT_EQ(7, breaker.wordStart()); // "world" |
| EXPECT_EQ(12, breaker.wordEnd()); |
| EXPECT_EQ(0, breaker.breakBadness()); |
| } |
| |
| TEST(WordBreakerTest, hardHyphen) { |
| // Hyphens should not allow breaks anymore. |
| uint16_t buf[] = {'s', 'u', 'g', 'a', 'r', '-', 'f', 'r', 'e', 'e'}; |
| auto lbStyle = LineBreakStyle::None; |
| auto lbWordStyle = LineBreakWordStyle::None; |
| WordBreaker breaker; |
| breaker.setText(buf, NELEM(buf)); |
| EXPECT_EQ(0, breaker.current()); |
| EXPECT_EQ((ssize_t)NELEM(buf), |
| breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, 0)); |
| EXPECT_EQ(0, breaker.wordStart()); |
| EXPECT_EQ((ssize_t)NELEM(buf), breaker.wordEnd()); |
| EXPECT_EQ(0, breaker.breakBadness()); |
| } |
| |
| TEST(WordBreakerTest, postfixAndPrefix) { |
| uint16_t buf[] = {'U', 'S', 0x00A2, ' ', 'J', 'P', 0x00A5}; // US¢ JP¥ |
| auto lbStyle = LineBreakStyle::None; |
| auto lbWordStyle = LineBreakWordStyle::None; |
| WordBreaker breaker; |
| breaker.setText(buf, NELEM(buf)); |
| EXPECT_EQ(0, breaker.current()); |
| |
| EXPECT_EQ(4, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, |
| 0)); // after CENT SIGN |
| EXPECT_EQ(0, breaker.wordStart()); // "US¢" |
| EXPECT_EQ(3, breaker.wordEnd()); |
| |
| EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end of string |
| EXPECT_EQ(4, breaker.wordStart()); // "JP¥" |
| EXPECT_EQ((ssize_t)NELEM(buf), breaker.wordEnd()); |
| } |
| |
| TEST(WordBreakerTest, myanmarKinzi) { |
| uint16_t buf[] = {0x1004, 0x103A, 0x1039, 0x1000, 0x102C}; // NGA, ASAT, VIRAMA, KA, UU |
| auto lbStyle = LineBreakStyle::None; |
| auto lbWordStyle = LineBreakWordStyle::None; |
| WordBreaker breaker; |
| breaker.setText(buf, NELEM(buf)); |
| EXPECT_EQ(0, breaker.current()); |
| |
| // end of string |
| EXPECT_EQ((ssize_t)NELEM(buf), |
| breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, 0)); |
| EXPECT_EQ(0, breaker.wordStart()); |
| EXPECT_EQ((ssize_t)NELEM(buf), breaker.wordEnd()); |
| } |
| |
| TEST(WordBreakerTest, zwjEmojiSequences) { |
| uint16_t buf[] = { |
| // man + zwj + heart + zwj + man |
| UTF16(0x1F468), 0x200D, 0x2764, 0x200D, UTF16(0x1F468), |
| // woman + zwj + heart + zwj + kiss mark + zwj + woman |
| UTF16(0x1F469), 0x200D, 0x2764, 0x200D, UTF16(0x1F48B), 0x200D, UTF16(0x1F469), |
| // eye + zwj + left speech bubble |
| UTF16(0x1F441), 0x200D, UTF16(0x1F5E8), |
| // CAT FACE + zwj + BUST IN SILHOUETTE |
| UTF16(0x1F431), 0x200D, UTF16(0x1F464), |
| }; |
| auto lbStyle = LineBreakStyle::None; |
| auto lbWordStyle = LineBreakWordStyle::None; |
| WordBreaker breaker; |
| breaker.setText(buf, NELEM(buf)); |
| EXPECT_EQ(0, breaker.current()); |
| // after man + zwj + heart + zwj + man |
| EXPECT_EQ(7, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, 0)); |
| EXPECT_EQ(0, breaker.wordStart()); |
| EXPECT_EQ(7, breaker.wordEnd()); |
| EXPECT_EQ(17, breaker.next()); // after woman + zwj + heart + zwj + woman |
| EXPECT_EQ(7, breaker.wordStart()); |
| EXPECT_EQ(17, breaker.wordEnd()); |
| EXPECT_EQ(22, breaker.next()); // after eye + zwj + left speech bubble |
| EXPECT_EQ(17, breaker.wordStart()); |
| EXPECT_EQ(22, breaker.wordEnd()); |
| EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end |
| EXPECT_EQ(22, breaker.wordStart()); |
| EXPECT_EQ(27, breaker.wordEnd()); |
| } |
| |
| TEST(WordBreakerTest, emojiWithModifier) { |
| uint16_t buf[] = { |
| UTF16(0x1F466), UTF16(0x1F3FB), // boy + type 1-2 fitzpatrick modifier |
| 0x270C, 0xFE0F, |
| UTF16(0x1F3FF) // victory hand + emoji style + type 6 fitzpatrick modifier |
| }; |
| auto lbStyle = LineBreakStyle::None; |
| auto lbWordStyle = LineBreakWordStyle::None; |
| WordBreaker breaker; |
| breaker.setText(buf, NELEM(buf)); |
| EXPECT_EQ(0, breaker.current()); |
| // after boy + type 1-2 fitzpatrick modifier |
| EXPECT_EQ(4, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, 0)); |
| EXPECT_EQ(0, breaker.wordStart()); |
| EXPECT_EQ(4, breaker.wordEnd()); |
| EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end |
| EXPECT_EQ(4, breaker.wordStart()); |
| EXPECT_EQ(8, breaker.wordEnd()); |
| } |
| |
| TEST(WordBreakerTest, unicode10Emoji) { |
| // Should break between emojis. |
| uint16_t buf[] = { |
| // SLED + SLED |
| UTF16(0x1F6F7), UTF16(0x1F6F7), |
| // SLED + VS15 + SLED |
| UTF16(0x1F6F7), 0xFE0E, UTF16(0x1F6F7), |
| // WHITE SMILING FACE + SLED |
| 0x263A, UTF16(0x1F6F7), |
| // WHITE SMILING FACE + VS16 + SLED |
| 0x263A, 0xFE0F, UTF16(0x1F6F7), |
| }; |
| auto lbStyle = LineBreakStyle::None; |
| auto lbWordStyle = LineBreakWordStyle::None; |
| WordBreaker breaker; |
| breaker.setText(buf, NELEM(buf)); |
| EXPECT_EQ(0, breaker.current()); |
| EXPECT_EQ(2, breaker.followingWithLocale(Locale("en"), lbStyle, lbWordStyle, 0)); |
| EXPECT_EQ(0, breaker.wordStart()); |
| EXPECT_EQ(2, breaker.wordEnd()); |
| |
| EXPECT_EQ(4, breaker.next()); |
| EXPECT_EQ(2, breaker.wordStart()); |
| EXPECT_EQ(4, breaker.wordEnd()); |
| |
| EXPECT_EQ(7, breaker.next()); |
| EXPECT_EQ(4, breaker.wordStart()); |
| EXPECT_EQ(7, breaker.wordEnd()); |
| |
| EXPECT_EQ(9, breaker.next()); |
| EXPECT_EQ(7, breaker.wordStart()); |
| EXPECT_EQ(9, breaker.wordEnd()); |
| |
| EXPECT_EQ(10, breaker.next()); |
| EXPECT_EQ(9, breaker.wordStart()); |
| EXPECT_EQ(10, breaker.wordEnd()); |
| |
| EXPECT_EQ(12, breaker.next()); |
| EXPECT_EQ(10, breaker.wordStart()); |
| EXPECT_EQ(12, breaker.wordEnd()); |
| |
| EXPECT_EQ(14, breaker.next()); |
| EXPECT_EQ(12, breaker.wordStart()); |
| EXPECT_EQ(14, breaker.wordEnd()); |
| |
| EXPECT_EQ(16, breaker.next()); |
| EXPECT_EQ(14, breaker.wordStart()); |
| EXPECT_EQ(16, breaker.wordEnd()); |
| } |
| |
| TEST(WordBreakerTest, flagsSequenceSingleFlag) { |
| const std::string kFlag = "U+1F3F4"; |
| const std::string flags = kFlag + " " + kFlag; |
| |
| const int kFlagLength = 2; |
| const size_t BUF_SIZE = kFlagLength * 2; |
| |
| uint16_t buf[BUF_SIZE]; |
| size_t size; |
| ParseUnicode(buf, BUF_SIZE, flags.c_str(), &size, nullptr); |
| auto lbStyle = LineBreakStyle::None; |
| auto lbWordStyle = LineBreakWordStyle::None; |
| |
| WordBreaker breaker; |
| breaker.setText(buf, size); |
| EXPECT_EQ(0, breaker.current()); |
| // end of the first flag |
| EXPECT_EQ(kFlagLength, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, 0)); |
| EXPECT_EQ(0, breaker.wordStart()); |
| EXPECT_EQ(kFlagLength, breaker.wordEnd()); |
| EXPECT_EQ(static_cast<ssize_t>(size), breaker.next()); |
| EXPECT_EQ(kFlagLength, breaker.wordStart()); |
| EXPECT_EQ(kFlagLength * 2, breaker.wordEnd()); |
| } |
| |
| TEST(WordBreakerTest, flagsSequence) { |
| // U+1F3F4 U+E0067 U+E0062 U+E0073 U+E0063 U+E0074 U+E007F is emoji tag sequence for the flag |
| // of Scotland. |
| const std::string kFlagSequence = "U+1F3F4 U+E0067 U+E0062 U+E0073 U+E0063 U+E0074 U+E007F"; |
| const std::string flagSequence = kFlagSequence + " " + kFlagSequence; |
| |
| const int kFlagLength = 14; |
| const size_t BUF_SIZE = kFlagLength * 2; |
| |
| uint16_t buf[BUF_SIZE]; |
| size_t size; |
| ParseUnicode(buf, BUF_SIZE, flagSequence.c_str(), &size, nullptr); |
| auto lbStyle = LineBreakStyle::None; |
| auto lbWordStyle = LineBreakWordStyle::None; |
| |
| WordBreaker breaker; |
| breaker.setText(buf, size); |
| EXPECT_EQ(0, breaker.current()); |
| // end of the first flag sequence |
| EXPECT_EQ(kFlagLength, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, 0)); |
| EXPECT_EQ(0, breaker.wordStart()); |
| EXPECT_EQ(kFlagLength, breaker.wordEnd()); |
| EXPECT_EQ(static_cast<ssize_t>(size), breaker.next()); |
| EXPECT_EQ(kFlagLength, breaker.wordStart()); |
| EXPECT_EQ(kFlagLength * 2, breaker.wordEnd()); |
| } |
| |
| TEST(WordBreakerTest, punct) { |
| uint16_t buf[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l', 'o', ',', |
| ' ', 'w', 'o', 'r', 'l', 'd', '!', '!'}; |
| auto lbStyle = LineBreakStyle::None; |
| auto lbWordStyle = LineBreakWordStyle::None; |
| WordBreaker breaker; |
| breaker.setText(buf, NELEM(buf)); |
| EXPECT_EQ(0, breaker.current()); |
| EXPECT_EQ(9, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, |
| 0)); // after "¡¡hello, " |
| EXPECT_EQ(2, breaker.wordStart()); // "hello" |
| EXPECT_EQ(7, breaker.wordEnd()); |
| EXPECT_EQ(0, breaker.breakBadness()); |
| EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end |
| EXPECT_EQ(9, breaker.wordStart()); // "world" |
| EXPECT_EQ(14, breaker.wordEnd()); |
| EXPECT_EQ(0, breaker.breakBadness()); |
| } |
| |
| TEST(WordBreakerTest, email) { |
| uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', |
| 'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'}; |
| auto lbStyle = LineBreakStyle::None; |
| auto lbWordStyle = LineBreakWordStyle::None; |
| WordBreaker breaker; |
| breaker.setText(buf, NELEM(buf)); |
| EXPECT_EQ(0, breaker.current()); |
| EXPECT_EQ(11, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, |
| 0)); // after "foo@example" |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| EXPECT_EQ(1, breaker.breakBadness()); |
| EXPECT_EQ(16, breaker.next()); // after ".com " |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| EXPECT_EQ(0, breaker.breakBadness()); |
| EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end |
| EXPECT_EQ(16, breaker.wordStart()); // "x" |
| EXPECT_EQ(17, breaker.wordEnd()); |
| EXPECT_EQ(0, breaker.breakBadness()); |
| } |
| |
| TEST(WordBreakerTest, mailto) { |
| uint16_t buf[] = {'m', 'a', 'i', 'l', 't', 'o', ':', 'f', 'o', 'o', '@', 'e', |
| 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'}; |
| auto lbStyle = LineBreakStyle::None; |
| auto lbWordStyle = LineBreakWordStyle::None; |
| WordBreaker breaker; |
| breaker.setText(buf, NELEM(buf)); |
| EXPECT_EQ(0, breaker.current()); |
| EXPECT_EQ(7, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, |
| 0)); // after "mailto:" |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| EXPECT_EQ(1, breaker.breakBadness()); |
| EXPECT_EQ(18, breaker.next()); // after "foo@example" |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| EXPECT_EQ(1, breaker.breakBadness()); |
| EXPECT_EQ(23, breaker.next()); // after ".com " |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| EXPECT_EQ(0, breaker.breakBadness()); |
| EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end |
| EXPECT_EQ(23, breaker.wordStart()); // "x" |
| EXPECT_EQ(24, breaker.wordEnd()); |
| EXPECT_EQ(0, breaker.breakBadness()); |
| } |
| |
| // The current logic always places a line break after a detected email address or URL |
| // and an immediately following non-ASCII character. |
| TEST(WordBreakerTest, emailNonAscii) { |
| uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', |
| 'p', 'l', 'e', '.', 'c', 'o', 'm', 0x4E00}; |
| auto lbStyle = LineBreakStyle::None; |
| auto lbWordStyle = LineBreakWordStyle::None; |
| WordBreaker breaker; |
| breaker.setText(buf, NELEM(buf)); |
| EXPECT_EQ(0, breaker.current()); |
| EXPECT_EQ(11, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, |
| 0)); // after "foo@example" |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| EXPECT_EQ(1, breaker.breakBadness()); |
| EXPECT_EQ(15, breaker.next()); // after ".com" |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| EXPECT_EQ(0, breaker.breakBadness()); |
| EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end |
| EXPECT_EQ(15, breaker.wordStart()); // "一" |
| EXPECT_EQ(16, breaker.wordEnd()); |
| EXPECT_EQ(0, breaker.breakBadness()); |
| } |
| |
| TEST(WordBreakerTest, emailCombining) { |
| uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', |
| 'l', 'e', '.', 'c', 'o', 'm', 0x0303, ' ', 'x'}; |
| auto lbStyle = LineBreakStyle::None; |
| auto lbWordStyle = LineBreakWordStyle::None; |
| WordBreaker breaker; |
| breaker.setText(buf, NELEM(buf)); |
| EXPECT_EQ(0, breaker.current()); |
| EXPECT_EQ(11, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, |
| 0)); // after "foo@example" |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| EXPECT_EQ(1, breaker.breakBadness()); |
| EXPECT_EQ(17, breaker.next()); // after ".com̃ " |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| EXPECT_EQ(0, breaker.breakBadness()); |
| EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end |
| EXPECT_EQ(17, breaker.wordStart()); // "x" |
| EXPECT_EQ(18, breaker.wordEnd()); |
| EXPECT_EQ(0, breaker.breakBadness()); |
| } |
| |
| TEST(WordBreakerTest, lonelyAt) { |
| uint16_t buf[] = {'a', ' ', '@', ' ', 'b'}; |
| auto lbStyle = LineBreakStyle::None; |
| auto lbWordStyle = LineBreakWordStyle::None; |
| WordBreaker breaker; |
| breaker.setText(buf, NELEM(buf)); |
| EXPECT_EQ(0, breaker.current()); |
| EXPECT_EQ(2, |
| breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, 0)); // after "a " |
| EXPECT_EQ(0, breaker.wordStart()); // "a" |
| EXPECT_EQ(1, breaker.wordEnd()); |
| EXPECT_EQ(0, breaker.breakBadness()); |
| EXPECT_EQ(4, breaker.next()); // after "@ " |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| EXPECT_EQ(0, breaker.breakBadness()); |
| EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end |
| EXPECT_EQ(4, breaker.wordStart()); // "b" |
| EXPECT_EQ(5, breaker.wordEnd()); |
| EXPECT_EQ(0, breaker.breakBadness()); |
| } |
| |
| TEST(WordBreakerTest, url) { |
| uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'e', 'x', 'a', |
| 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'}; |
| auto lbStyle = LineBreakStyle::None; |
| auto lbWordStyle = LineBreakWordStyle::None; |
| WordBreaker breaker; |
| breaker.setText(buf, NELEM(buf)); |
| EXPECT_EQ(0, breaker.current()); |
| EXPECT_EQ(5, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, |
| 0)); // after "http:" |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| EXPECT_EQ(1, breaker.breakBadness()); |
| EXPECT_EQ(7, breaker.next()); // after "//" |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| EXPECT_EQ(1, breaker.breakBadness()); |
| EXPECT_EQ(14, breaker.next()); // after "example" |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| EXPECT_EQ(1, breaker.breakBadness()); |
| EXPECT_EQ(19, breaker.next()); // after ".com " |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| EXPECT_EQ(0, breaker.breakBadness()); |
| EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end |
| EXPECT_EQ(19, breaker.wordStart()); // "x" |
| EXPECT_EQ(20, breaker.wordEnd()); |
| EXPECT_EQ(0, breaker.breakBadness()); |
| } |
| |
| // Breaks according to section 14.12 of Chicago Manual of Style, *URLs or DOIs and line breaks* |
| TEST(WordBreakerTest, urlBreakChars) { |
| uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '.', 'b', '/', |
| '~', 'c', ',', 'd', '-', 'e', '?', 'f', '=', 'g', '&', |
| 'h', '#', 'i', '%', 'j', '_', 'k', '/', 'l'}; |
| auto lbStyle = LineBreakStyle::None; |
| auto lbWordStyle = LineBreakWordStyle::None; |
| WordBreaker breaker; |
| breaker.setText(buf, NELEM(buf)); |
| EXPECT_EQ(0, breaker.current()); |
| EXPECT_EQ(5, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, |
| 0)); // after "http:" |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| EXPECT_EQ(1, breaker.breakBadness()); |
| EXPECT_EQ(7, breaker.next()); // after "//" |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| EXPECT_EQ(1, breaker.breakBadness()); |
| EXPECT_EQ(8, breaker.next()); // after "a" |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| EXPECT_EQ(1, breaker.breakBadness()); |
| EXPECT_EQ(10, breaker.next()); // after ".b" |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| EXPECT_EQ(1, breaker.breakBadness()); |
| EXPECT_EQ(11, breaker.next()); // after "/" |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| EXPECT_EQ(1, breaker.breakBadness()); |
| EXPECT_EQ(13, breaker.next()); // after "~c" |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| EXPECT_EQ(1, breaker.breakBadness()); |
| EXPECT_EQ(15, breaker.next()); // after ",d" |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| EXPECT_EQ(1, breaker.breakBadness()); |
| EXPECT_EQ(17, breaker.next()); // after "-e" |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| EXPECT_EQ(1, breaker.breakBadness()); |
| EXPECT_EQ(19, breaker.next()); // after "?f" |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| EXPECT_EQ(1, breaker.breakBadness()); |
| EXPECT_EQ(20, breaker.next()); // after "=" |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| EXPECT_EQ(1, breaker.breakBadness()); |
| EXPECT_EQ(21, breaker.next()); // after "g" |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| EXPECT_EQ(1, breaker.breakBadness()); |
| EXPECT_EQ(22, breaker.next()); // after "&" |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| EXPECT_EQ(1, breaker.breakBadness()); |
| EXPECT_EQ(23, breaker.next()); // after "h" |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| EXPECT_EQ(1, breaker.breakBadness()); |
| EXPECT_EQ(25, breaker.next()); // after "#i" |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| EXPECT_EQ(1, breaker.breakBadness()); |
| EXPECT_EQ(27, breaker.next()); // after "%j" |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| EXPECT_EQ(1, breaker.breakBadness()); |
| EXPECT_EQ(29, breaker.next()); // after "_k" |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| EXPECT_EQ(1, breaker.breakBadness()); |
| EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| EXPECT_EQ(0, breaker.breakBadness()); |
| } |
| |
| TEST(WordBreakerTest, urlNoHyphenBreak) { |
| uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '-', '/', 'b'}; |
| auto lbStyle = LineBreakStyle::None; |
| auto lbWordStyle = LineBreakWordStyle::None; |
| WordBreaker breaker; |
| breaker.setText(buf, NELEM(buf)); |
| EXPECT_EQ(0, breaker.current()); |
| EXPECT_EQ(5, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, |
| 0)); // after "http:" |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| EXPECT_EQ(7, breaker.next()); // after "//" |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| EXPECT_EQ(8, breaker.next()); // after "a" |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| } |
| |
| TEST(WordBreakerTest, urlEndsWithSlash) { |
| uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '/'}; |
| auto lbStyle = LineBreakStyle::None; |
| auto lbWordStyle = LineBreakWordStyle::None; |
| WordBreaker breaker; |
| breaker.setText(buf, NELEM(buf)); |
| EXPECT_EQ(0, breaker.current()); |
| EXPECT_EQ(5, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, |
| 0)); // after "http:" |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| EXPECT_EQ(7, breaker.next()); // after "//" |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| EXPECT_EQ(8, breaker.next()); // after "a" |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| } |
| |
| TEST(WordBreakerTest, emailStartsWithSlash) { |
| uint16_t buf[] = {'/', 'a', '@', 'b'}; |
| auto lbStyle = LineBreakStyle::None; |
| auto lbWordStyle = LineBreakWordStyle::None; |
| WordBreaker breaker; |
| breaker.setText(buf, NELEM(buf)); |
| EXPECT_EQ(0, breaker.current()); |
| EXPECT_EQ((ssize_t)NELEM(buf), |
| breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, 0)); // end |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| } |
| |
| TEST(WordBreakerTest, setLocaleInsideUrl) { |
| std::vector<uint16_t> buf = utf8ToUtf16("Hello http://abc/d.html World"); |
| auto lbStyle = LineBreakStyle::None; |
| auto lbWordStyle = LineBreakWordStyle::None; |
| WordBreaker breaker; |
| breaker.setText(buf.data(), buf.size()); |
| EXPECT_EQ(0, breaker.current()); |
| EXPECT_EQ(6, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, |
| 0)); // after "Hello " |
| EXPECT_EQ(0, breaker.wordStart()); |
| EXPECT_EQ(5, breaker.wordEnd()); |
| |
| EXPECT_EQ(6, breaker.current()); |
| EXPECT_EQ(11, breaker.next()); // after "http:" |
| |
| // Restart from middle point of the URL. It should return the same previous break point. |
| EXPECT_EQ(11, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, |
| 6)); // after "http:" |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| |
| EXPECT_EQ(13, breaker.next()); // after "//" |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| |
| // Restart from middle point of the URL. It should return the same previous break point. |
| EXPECT_EQ(13, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, |
| 12)); // after "//" |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| EXPECT_EQ(16, breaker.next()); // after "abc" |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| EXPECT_EQ(18, breaker.next()); // after "/d" |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| EXPECT_EQ(24, breaker.next()); // after ".html" |
| EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); |
| |
| EXPECT_EQ(29, breaker.next()); // after "World" |
| EXPECT_EQ(24, breaker.wordStart()); |
| EXPECT_EQ(29, breaker.wordEnd()); |
| } |
| |
| // b/68669534 |
| TEST(WordBreakerTest, spaceAfterSpace) { |
| const std::vector<uint16_t> SPACES = { |
| '\t', // TAB |
| 0x1680, // OGHAM SPACE MARK |
| 0x3000, // IDEOGRAPHIC SPACE |
| }; |
| |
| constexpr uint16_t CHAR_SPACE = 0x0020; |
| auto lbStyle = LineBreakStyle::None; |
| auto lbWordStyle = LineBreakWordStyle::None; |
| |
| for (uint16_t sp : SPACES) { |
| char msg[64] = {}; |
| snprintf(msg, sizeof(msg), "Test Space: U+%04X", sp); |
| SCOPED_TRACE(msg); |
| |
| std::vector<uint16_t> buf = {'a', CHAR_SPACE, sp, 'b'}; |
| WordBreaker breaker; |
| breaker.setText(buf.data(), buf.size()); |
| |
| EXPECT_EQ(0, breaker.current()); |
| EXPECT_EQ(2, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, |
| 0)); // after "a " |
| EXPECT_EQ(0, breaker.wordStart()); |
| EXPECT_EQ(1, breaker.wordEnd()); |
| |
| EXPECT_EQ(2, breaker.current()); |
| EXPECT_EQ(3, breaker.next()); // after CHAR_SPACE character. |
| EXPECT_EQ(2, breaker.wordStart()); |
| EXPECT_EQ(2, breaker.wordEnd()); |
| |
| EXPECT_EQ(3, breaker.current()); |
| EXPECT_EQ(4, breaker.next()); // after sp character. |
| EXPECT_EQ(3, breaker.wordStart()); |
| EXPECT_EQ(4, breaker.wordEnd()); |
| } |
| } |
| |
| class TestableICULineBreakerPoolImpl : public ICULineBreakerPoolImpl { |
| public: |
| TestableICULineBreakerPoolImpl() : ICULineBreakerPoolImpl() {} |
| |
| using ICULineBreakerPoolImpl::getPoolSize; |
| using ICULineBreakerPoolImpl::MAX_POOL_SIZE; |
| }; |
| |
| TEST(WordBreakerTest, LineBreakerPool_acquire_without_release) { |
| TestableICULineBreakerPoolImpl pool; |
| |
| const Locale enUS("en-Latn-US"); |
| const Locale frFR("fr-Latn-FR"); |
| |
| // All following three breakers must be the different instances. |
| ICULineBreakerPool::Slot enUSBreaker = |
| pool.acquire(enUS, LineBreakStyle::Loose, LineBreakWordStyle::None); |
| ICULineBreakerPool::Slot enUSBreaker2 = |
| pool.acquire(enUS, LineBreakStyle::Loose, LineBreakWordStyle::None); |
| ICULineBreakerPool::Slot enUSBreaker3 = |
| pool.acquire(enUS, LineBreakStyle::Strict, LineBreakWordStyle::None); |
| ICULineBreakerPool::Slot frFRBreaker = |
| pool.acquire(frFR, LineBreakStyle::None, LineBreakWordStyle::None); |
| ICULineBreakerPool::Slot frFRBreaker2 = |
| pool.acquire(frFR, LineBreakStyle::None, LineBreakWordStyle::Phrase); |
| |
| EXPECT_NE(nullptr, enUSBreaker.breaker.get()); |
| EXPECT_NE(nullptr, enUSBreaker2.breaker.get()); |
| EXPECT_NE(nullptr, enUSBreaker3.breaker.get()); |
| EXPECT_NE(nullptr, frFRBreaker.breaker.get()); |
| EXPECT_NE(nullptr, frFRBreaker2.breaker.get()); |
| |
| EXPECT_NE(enUSBreaker.breaker.get(), enUSBreaker2.breaker.get()); |
| EXPECT_NE(enUSBreaker.breaker.get(), enUSBreaker3.breaker.get()); |
| EXPECT_NE(enUSBreaker.breaker.get(), frFRBreaker.breaker.get()); |
| EXPECT_NE(enUSBreaker2.breaker.get(), frFRBreaker.breaker.get()); |
| EXPECT_NE(enUSBreaker2.breaker.get(), frFRBreaker2.breaker.get()); |
| EXPECT_NE(enUSBreaker2.breaker.get(), enUSBreaker3.breaker.get()); |
| |
| EXPECT_EQ(enUSBreaker.localeId, enUSBreaker2.localeId); |
| EXPECT_EQ(enUSBreaker.localeId, enUSBreaker3.localeId); |
| EXPECT_NE(enUSBreaker.localeId, frFRBreaker.localeId); |
| EXPECT_NE(enUSBreaker.localeId, frFRBreaker2.localeId); |
| EXPECT_NE(enUSBreaker2.localeId, frFRBreaker.localeId); |
| EXPECT_NE(enUSBreaker2.localeId, frFRBreaker2.localeId); |
| EXPECT_EQ(frFRBreaker.localeId, frFRBreaker2.localeId); |
| } |
| |
| TEST(WordBreakerTest, LineBreakerPool_acquire_with_release) { |
| TestableICULineBreakerPoolImpl pool; |
| |
| const Locale enUS("en-Latn-US"); |
| const Locale frFR("fr-Latn-FR"); |
| |
| // All following three breakers must be the different instances. |
| ICULineBreakerPool::Slot enUSBreaker = |
| pool.acquire(enUS, LineBreakStyle::Loose, LineBreakWordStyle::None); |
| |
| uint64_t enUSBreakerLocaleId = enUSBreaker.localeId; |
| UBreakIterator* enUSBreakerPtr = enUSBreaker.breaker.get(); |
| |
| pool.release(std::move(enUSBreaker)); |
| EXPECT_EQ(nullptr, enUSBreaker.breaker.get()); |
| |
| // acquire must return a different instance if the locale is different. |
| ICULineBreakerPool::Slot frFRBreaker = |
| pool.acquire(frFR, LineBreakStyle::Loose, LineBreakWordStyle::None); |
| EXPECT_NE(enUSBreakerPtr, frFRBreaker.breaker.get()); |
| EXPECT_NE(enUSBreakerLocaleId, frFRBreaker.localeId); |
| |
| // acquire must return the same instance as released before if the locale is the same. |
| ICULineBreakerPool::Slot enUSBreaker2 = |
| pool.acquire(enUS, LineBreakStyle::Loose, LineBreakWordStyle::None); |
| EXPECT_EQ(enUSBreakerPtr, enUSBreaker2.breaker.get()); |
| EXPECT_EQ(enUSBreakerLocaleId, enUSBreaker2.localeId); |
| |
| // acquire must return a different instance if the line break is different. |
| ICULineBreakerPool::Slot frFRBreaker2 = |
| pool.acquire(frFR, LineBreakStyle::Normal, LineBreakWordStyle::None); |
| ICULineBreakerPool::Slot frFRBreaker3 = |
| pool.acquire(frFR, LineBreakStyle::Normal, LineBreakWordStyle::Phrase); |
| EXPECT_NE(frFRBreaker.breaker.get(), frFRBreaker2.breaker.get()); |
| EXPECT_NE(frFRBreaker.breaker.get(), frFRBreaker3.breaker.get()); |
| EXPECT_NE(frFRBreaker2.breaker.get(), frFRBreaker3.breaker.get()); |
| EXPECT_EQ(frFRBreaker.localeId, frFRBreaker2.localeId); |
| EXPECT_EQ(frFRBreaker.localeId, frFRBreaker3.localeId); |
| EXPECT_EQ(frFRBreaker2.localeId, frFRBreaker3.localeId); |
| } |
| |
| TEST(WordBreakerTest, LineBreakerPool_exceeds_pool_size) { |
| const size_t MAX_POOL_SIZE = TestableICULineBreakerPoolImpl::MAX_POOL_SIZE; |
| TestableICULineBreakerPoolImpl pool; |
| |
| const Locale enUS("en-Latn-US"); |
| |
| ICULineBreakerPool::Slot slots[MAX_POOL_SIZE * 2]; |
| |
| // Make pool full. |
| for (size_t i = 0; i < MAX_POOL_SIZE * 2; i++) { |
| slots[i] = pool.acquire(enUS, LineBreakStyle::None, LineBreakWordStyle::None); |
| EXPECT_EQ(0U, pool.getPoolSize()); |
| } |
| |
| for (size_t i = 0; i < MAX_POOL_SIZE; i++) { |
| pool.release(std::move(slots[i])); |
| EXPECT_EQ(i + 1, pool.getPoolSize()); |
| } |
| |
| for (size_t i = MAX_POOL_SIZE; i < MAX_POOL_SIZE * 2; i++) { |
| pool.release(std::move(slots[i])); |
| EXPECT_EQ(MAX_POOL_SIZE, pool.getPoolSize()); |
| } |
| } |
| |
| } // namespace minikin |