Snap for 10120310 from aea5e709cdd5a418f4b39fdf803523f050061f31 to udc-qpr1-release

Change-Id: I9c30645a12259a16da03cdc008fc22f4f59df126

commit: d84043f4a548a14b60ca737f15daa6e8048720d0 [log] [tgz]
author: Android Build Coastguard Worker <android-build-coastguard-worker@google.com> Fri May 12 03:22:44 2023 +0000
committer: Android Build Coastguard Worker <android-build-coastguard-worker@google.com> Fri May 12 03:22:44 2023 +0000
tree: c2bbaabcdf91cd5359247e70a9fd8af5c355507d
parent: f0d4adc904504931a5ceee0d7d09116516ad0730 [diff]
parent: aea5e709cdd5a418f4b39fdf803523f050061f31 [diff]
diff --git a/include/minikin/Hyphenator.h b/include/minikin/Hyphenator.h
index 47c1648..da265f0 100644
--- a/include/minikin/Hyphenator.h
+++ b/include/minikin/Hyphenator.h

@@ -23,7 +23,6 @@
 
 #include <string>
 #include <vector>
-#include <unicode/uscript.h>
 
 #include "minikin/Characters.h"
 #include "minikin/U16StringPiece.h"
@@ -64,12 +63,7 @@
     // Break the line, insert a ZWJ and hyphen at the first line, and a ZWJ at the second line.
     // This is used in Arabic script, mostly for writing systems of Central Asia. It's our default
     // behavior when a soft hyphen is used in Arabic script.
-    BREAK_AND_INSERT_HYPHEN_AND_ZWJ = 8,
-    // Break the line, and insert hyphen at current line and the beginning of the next line. It's
-    // very similar to BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE. BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE
-    // is for the words which included hyphenator but this is used to actively hyphenate words
-    // based on the pattern.
-    BREAK_AND_INSERT_HYPHEN_AT_CURRENT_AND_NEXT_LINE = 9
+    BREAK_AND_INSERT_HYPHEN_AND_ZWJ = 8
 };
 
 // The hyphen edit represents an edit to the string when a word is hyphenated.
@@ -236,11 +230,6 @@
     void hyphenateFromCodes(const uint16_t* codes, size_t len, HyphenationType hyphenValue,
                             HyphenationType* out) const;
 
-    HyphenationType hyphenationTypeBasedOnScriptAndLocale(uint32_t codePoint) const;
-
-    bool isRepeatHyphen(UScriptCode script,
-                        HyphenationLocale locale) const;
-
     // See also LONGEST_HYPHENATED_WORD in LineBreaker.cpp. Here the constant is used so
     // that temporary buffers can be stack-allocated without waste, which is a slightly
     // different use case. It measures UTF-16 code units.

diff --git a/libs/minikin/Hyphenator.cpp b/libs/minikin/Hyphenator.cpp
index 098e6e8..c755a87 100644
--- a/libs/minikin/Hyphenator.cpp
+++ b/libs/minikin/Hyphenator.cpp

@@ -158,7 +158,6 @@
 EndHyphenEdit editForThisLine(HyphenationType type) {
     switch (type) {
         case HyphenationType::BREAK_AND_INSERT_HYPHEN:
-        case HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_CURRENT_AND_NEXT_LINE:
             return EndHyphenEdit::INSERT_HYPHEN;
         case HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN:
             return EndHyphenEdit::INSERT_ARMENIAN_HYPHEN;
@@ -179,7 +178,6 @@
 StartHyphenEdit editForNextLine(HyphenationType type) {
     switch (type) {
         case HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE:
-        case HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_CURRENT_AND_NEXT_LINE:
             return StartHyphenEdit::INSERT_HYPHEN;
         case HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ:
             return StartHyphenEdit::INSERT_ZWJ;
@@ -199,6 +197,25 @@
     }
 }
 
+static HyphenationType hyphenationTypeBasedOnScript(uint32_t codePoint) {
+    // Note: It's not clear what the best hyphen for Hebrew is. While maqaf is the "correct" hyphen
+    // for Hebrew, modern practice may have shifted towards Western hyphens. We use normal hyphens
+    // for now to be safe.  BREAK_AND_INSERT_MAQAF is already implemented, so if we want to switch
+    // to maqaf for Hebrew, we can simply add a condition here.
+    const UScriptCode script = getScript(codePoint);
+    if (script == USCRIPT_KANNADA || script == USCRIPT_MALAYALAM || script == USCRIPT_TAMIL ||
+        script == USCRIPT_TELUGU) {
+        // Grantha is not included, since we don't support non-BMP hyphenation yet.
+        return HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN;
+    } else if (script == USCRIPT_ARMENIAN) {
+        return HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN;
+    } else if (script == USCRIPT_CANADIAN_ABORIGINAL) {
+        return HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN;
+    } else {
+        return HyphenationType::BREAK_AND_INSERT_HYPHEN;
+    }
+}
+
 static inline int32_t getJoiningType(UChar32 codepoint) {
     return u_getIntPropertyValue(codepoint, UCHAR_JOINING_TYPE);
 }
@@ -227,27 +244,6 @@
     return HyphenationType::BREAK_AND_INSERT_HYPHEN;
 }
 
-HyphenationType Hyphenator::hyphenationTypeBasedOnScriptAndLocale(uint32_t codePoint) const {
-    // Note: It's not clear what the best hyphen for Hebrew is. While maqaf is the "correct" hyphen
-    // for Hebrew, modern practice may have shifted towards Western hyphens. We use normal hyphens
-    // for now to be safe.  BREAK_AND_INSERT_MAQAF is already implemented, so if we want to switch
-    // to maqaf for Hebrew, we can simply add a condition here.
-    const UScriptCode script = getScript(codePoint);
-    if (script == USCRIPT_KANNADA || script == USCRIPT_MALAYALAM || script == USCRIPT_TAMIL ||
-        script == USCRIPT_TELUGU) {
-        // Grantha is not included, since we don't support non-BMP hyphenation yet.
-        return HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN;
-    } else if (script == USCRIPT_ARMENIAN) {
-        return HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN;
-    } else if (script == USCRIPT_CANADIAN_ABORIGINAL) {
-        return HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN;
-    } else if (isRepeatHyphen(script, mHyphenationLocale)) {
-        return HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_CURRENT_AND_NEXT_LINE;
-    } else {
-        return HyphenationType::BREAK_AND_INSERT_HYPHEN;
-    }
-}
-
 // Use various recommendations of UAX #14 Unicode Line Breaking Algorithm for hyphenating words
 // that didn't match patterns, especially words that contain hyphens or soft hyphens (See sections
 // 5.3, Use of Hyphen, and 5.4, Use of Soft Hyphen).
@@ -259,7 +255,9 @@
             // Break after hyphens, but only if they don't start the word.
 
             if ((prevChar == CHAR_HYPHEN_MINUS || prevChar == CHAR_HYPHEN) &&
-                isRepeatHyphen(getScript(word[i]), mHyphenationLocale)) {
+                (mHyphenationLocale == HyphenationLocale::POLISH ||
+                 mHyphenationLocale == HyphenationLocale::SLOVENIAN) &&
+                getScript(word[i]) == USCRIPT_LATIN) {
                 // In Polish and Slovenian, hyphens get repeated at the next line. To be safe,
                 // we will do this only if the next character is Latin.
                 out[i] = HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE;
@@ -275,7 +273,7 @@
                 // actually join. If they don't, we'll just insert a normal hyphen.
                 out[i] = getHyphTypeForArabic(word, i);
             } else {
-                out[i] = hyphenationTypeBasedOnScriptAndLocale(word[i]);
+                out[i] = hyphenationTypeBasedOnScript(word[i]);
             }
         } else if (prevChar == CHAR_MIDDLE_DOT && mMinPrefix < i && i <= word.size() - mMinSuffix &&
                    ((word[i - 2] == 'l' && word[i] == 'l') ||
@@ -311,7 +309,7 @@
                 return HyphenationType::DONT_BREAK;
             }
             if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) {
-                result = hyphenationTypeBasedOnScriptAndLocale(c);
+                result = hyphenationTypeBasedOnScript(c);
             }
             alpha_codes[i + 1] = code;
         }
@@ -334,7 +332,7 @@
                 return HyphenationType::DONT_BREAK;
             }
             if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) {
-                result = hyphenationTypeBasedOnScriptAndLocale(c);
+                result = hyphenationTypeBasedOnScript(c);
             }
             alpha_codes[i + 1] = AlphabetTable1::value(entry);
         }
@@ -400,10 +398,4 @@
     }
 }
 
-bool Hyphenator::isRepeatHyphen(UScriptCode script,
-                                HyphenationLocale locale) const {
-    return script == USCRIPT_LATIN &&
-            (locale == HyphenationLocale::POLISH ||locale == HyphenationLocale::SLOVENIAN);
-}
-
 }  // namespace minikin

diff --git a/tests/unittest/HyphenatorTest.cpp b/tests/unittest/HyphenatorTest.cpp
index 5936146..14c3ea3 100644
--- a/tests/unittest/HyphenatorTest.cpp
+++ b/tests/unittest/HyphenatorTest.cpp

@@ -121,18 +121,6 @@
     EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]);
 }
 
-// If there is a soft hyphen in Polish, the hyphen should be repeated on the next line.
-TEST(HyphenatorTest, polishSoftHyphen) {
-    Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "pl");
-    const uint16_t word[] = {'x', SOFT_HYPHEN, 'y'};
-    std::vector<HyphenationType> result;
-    hyphenator->hyphenate(word, &result);
-    EXPECT_EQ((size_t)3, result.size());
-    EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
-    EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
-    EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_CURRENT_AND_NEXT_LINE, result[2]);
-}
-
 // If we break on a hyphen in Slovenian, the hyphen should be repeated on the next line. (Same as
 // Polish.)
 TEST(HyphenatorTest, slovenianHyphen) {
commit	d84043f4a548a14b60ca737f15daa6e8048720d0	[log] [tgz]
author	Android Build Coastguard Worker <android-build-coastguard-worker@google.com>	Fri May 12 03:22:44 2023 +0000
committer	Android Build Coastguard Worker <android-build-coastguard-worker@google.com>	Fri May 12 03:22:44 2023 +0000
tree	c2bbaabcdf91cd5359247e70a9fd8af5c355507d
parent	f0d4adc904504931a5ceee0d7d09116516ad0730 [diff]
parent	aea5e709cdd5a418f4b39fdf803523f050061f31 [diff]