Update ICU with patch to fix Japanese alphabetic index.

Patch from http://bugs.icu-project.org/trac/ticket/10423.

Bug: 10809397

(cherry picked from commit 260c3da8a8e46f15a7a433f0ad009bd805b804d4)

Change-Id: If514618784c1528d1072d2e3f8792bf60d6283a8

commit: 0a61a367aa48577edf1c9ba57c501b8f5e7555d5 [log] [tgz]
author: Craig Cornelius <ccornelius@google.com> Thu Oct 10 15:18:49 2013 -0700
committer: Elliott Hughes <enh@google.com> Fri Oct 11 14:34:32 2013 -0700
tree: 711bf9961b01c8e853eb7ebe51822664a1a7a7ee
parent: 8f65a294aa83d78c7f04a69227ac61f070a381f4 [diff]
diff --git a/i18n/alphaindex.cpp b/i18n/alphaindex.cpp
index 88dcaab..e80efc7 100644
--- a/i18n/alphaindex.cpp
+++ b/i18n/alphaindex.cpp

@@ -245,7 +245,7 @@
 
 
 AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) {
-    addIndexExemplars(locale, status);
+    addIndexExemplars(&locale, status);
     clearBuckets();
     return *this;
 }
@@ -709,12 +709,13 @@
 }
 
 
-void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status) {
+void AlphabeticIndex::addIndexExemplars(const Locale *locale, UErrorCode &status) {
     if (U_FAILURE(status)) { return; }
     // Chinese index characters, which are specific to each of the several Chinese tailorings,
     // take precedence over the single locale data exemplar set per language.
-    const char *language = locale.getLanguage();
-    if (uprv_strcmp(language, "zh") == 0 || uprv_strcmp(language, "ja") == 0 ||
+    const char *language = locale == NULL ? NULL : locale->getLanguage();
+    if (language == NULL ||
+            uprv_strcmp(language, "zh") == 0 || uprv_strcmp(language, "ja") == 0 ||
             uprv_strcmp(language, "ko") == 0) {
         // TODO: This should be done regardless of the language, but it's expensive.
         // We should add a Collator function (can be @internal)
@@ -723,8 +724,9 @@
             return;
         }
     }
+    if (locale == NULL) { return; }
 
-    LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status));
+    LocalULocaleDataPointer uld(ulocdata_open(locale->getName(), &status));
     if (U_FAILURE(status)) {
         return;
     }
@@ -777,7 +779,7 @@
     while (it.next()) {
         const UnicodeString &exemplarC = it.getString();
         upperC = exemplarC;
-        upperC.toUpper(locale);
+        upperC.toUpper(*locale);
         initialLabels_->add(upperC);
     }
 }
@@ -963,22 +965,38 @@
     firstCharsInScripts_ = firstStringsInScript(status);
     if (U_FAILURE(status)) { return; }
     firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimaryOnly_, status);
+
+    // Add index exemplar characters before checking the script boundaries,
+    // since this might modify them.
+    addIndexExemplars(locale, status);
+
     UnicodeString _4E00((UChar)0x4E00);
-    UnicodeString _1100((UChar)0x1100);
-    UnicodeString _1112((UChar)0x1112);
-    if (collatorPrimaryOnly_->compare(_4E00, _1112, status) <= 0 &&
-            collatorPrimaryOnly_->compare(_1100, _4E00, status) <= 0) {
-        // The standard Korean tailoring sorts Hanja (Han characters)
-        // as secondary differences from Hangul syllables.
-        // This makes U+4E00 not useful as a Han-script boundary.
+    int32_t hanIndex = binarySearch(
+            *firstCharsInScripts_, _4E00, *collatorPrimaryOnly_);
+    if (hanIndex >= 0) {
+        // Adjust the Han script boundary if necessary.
         // TODO: This becomes obsolete when the root collator gets
         // reliable script-first-primary mappings.
-        int32_t hanIndex = binarySearch(
-                *firstCharsInScripts_, _4E00, *collatorPrimaryOnly_);
-        if (hanIndex >= 0) {
+        UnicodeString _1100((UChar)0x1100);
+        UnicodeString _1112((UChar)0x1112);
+        UnicodeString _4E9C((UChar)0x4E9C);
+        if (collatorPrimaryOnly_->compare(_4E00, _1112, status) <= 0 &&
+                collatorPrimaryOnly_->compare(_1100, _4E00, status) <= 0) {
+            // The standard Korean tailoring sorts Hanja (Han characters)
+            // as secondary differences from Hangul syllables.
+            // This makes U+4E00 not useful as a Han-script boundary.
             firstCharsInScripts_->removeElementAt(hanIndex);
+        } else if (collatorPrimaryOnly_->compare(_4E9C, _4E00, status) < 0) {
+            // The standard Japanese tailoring sorts U+4E9C first among Kanji.
+            UnicodeString *fh = new UnicodeString(_4E9C);
+            if (fh == NULL) {
+                status = U_MEMORY_ALLOCATION_ERROR;
+                return;
+            }
+            firstCharsInScripts_->setElementAt(fh, hanIndex);
         }
     }
+
     // Guard against a degenerate collator where
     // some script boundary strings are primary ignorable.
     for (;;) {
@@ -996,10 +1014,6 @@
             break;
         }
     }
-
-    if (locale != NULL) {
-        addIndexExemplars(*locale, status);
-    }
 }
 
 

diff --git a/i18n/unicode/alphaindex.h b/i18n/unicode/alphaindex.h
index 64e2f54..6f47ea1 100644
--- a/i18n/unicode/alphaindex.h
+++ b/i18n/unicode/alphaindex.h

@@ -675,7 +675,7 @@
      * This method is called to get the index exemplars. Normally these come from the locale directly,
      * but if they aren't available, we have to synthesize them.
      */
-    void addIndexExemplars(const Locale &locale, UErrorCode &status);
+    void addIndexExemplars(const Locale *locale, UErrorCode &status);
     /**
      * Add Chinese index characters from the tailoring.
      */

diff --git a/test/intltest/alphaindextst.cpp b/test/intltest/alphaindextst.cpp
index 5bef31b..ea4eeaf 100644
--- a/test/intltest/alphaindextst.cpp
+++ b/test/intltest/alphaindextst.cpp

@@ -63,6 +63,7 @@
     // BEGIN android-remove - test to be added in 51.1
     // TESTCASE_AUTO(TestChineseZhuyin);
     // END android-remove
+    TESTCASE_AUTO(TestJapaneseKanji);
     TESTCASE_AUTO_END;
 }
 
@@ -93,7 +94,8 @@
     // Constructor from a Collator
     //
     status = U_ZERO_ERROR;
-    RuleBasedCollator *coll = dynamic_cast<RuleBasedCollator *>(Collator::createInstance(Locale::getChinese(), status));
+    RuleBasedCollator *coll = dynamic_cast<RuleBasedCollator *>(
+        Collator::createInstance(Locale::getGerman(), status));
     TEST_CHECK_STATUS;
     TEST_ASSERT(coll != NULL);
     index = new AlphabeticIndex(coll, status);
@@ -586,7 +588,6 @@
     TEST_CHECK_STATUS; 
     AlphabeticIndex index(coll.orphan(), status);
     TEST_CHECK_STATUS; 
-    assertEquals("getBucketCount()", 1, index.getBucketCount(status));   // ... (underflow only)
     index.addLabels(Locale::getChinese(), status);
     assertEquals("getBucketCount()", 28, index.getBucketCount(status));  // ... A-Z ...
     int bucketIndex = index.getBucketIndex(UnicodeString((UChar)0x897f), status);
@@ -676,4 +677,21 @@
     assertEquals("label 5", UnicodeString((UChar)0x3109), immIndex->getBucket(5)->getLabel());
 }
 
+void AlphabeticIndexTest::TestJapaneseKanji() {
+    UErrorCode status = U_ZERO_ERROR;
+    AlphabeticIndex index(Locale::getJapanese(), status);
+    LocalPointer<AlphabeticIndex::ImmutableIndex> immIndex(index.buildImmutableIndex(status));
+    TEST_CHECK_STATUS;
+    // There are no index characters for Kanji in the Japanese standard collator.
+    // They should all go into the overflow bucket.
+    static const UChar32 kanji[] = { 0x4E9C, 0x95C7, 0x4E00, 0x58F1 };
+    int32_t overflowIndex = immIndex->getBucketCount() - 1;
+    for(int32_t i = 0; i < LENGTHOF(kanji); ++i) {
+        char msg[40];
+        sprintf(msg, "kanji[%d]=U+%04lX in overflow bucket", (int)i, (long)kanji[i]);
+        assertEquals(msg, overflowIndex, immIndex->getBucketIndex(UnicodeString(kanji[i]), status));
+        TEST_CHECK_STATUS;
+    }
+}
+
 #endif

diff --git a/test/intltest/alphaindextst.h b/test/intltest/alphaindextst.h
index 2f86471..1aa0075 100644
--- a/test/intltest/alphaindextst.h
+++ b/test/intltest/alphaindextst.h

@@ -45,6 +45,7 @@
      * Test with the Bopomofo-phonetic tailoring.
      */
     void TestChineseZhuyin();
+    void TestJapaneseKanji();
 };
 
 #endif
commit	0a61a367aa48577edf1c9ba57c501b8f5e7555d5	[log] [tgz]
author	Craig Cornelius <ccornelius@google.com>	Thu Oct 10 15:18:49 2013 -0700
committer	Elliott Hughes <enh@google.com>	Fri Oct 11 14:34:32 2013 -0700
tree	711bf9961b01c8e853eb7ebe51822664a1a7a7ee
parent	8f65a294aa83d78c7f04a69227ac61f070a381f4 [diff]