Introducing a new SQLite extension function: GET_PHONEBOOK_INDEX

This function will produce a normalized upper case first letter
from a given string.

Bug: 2407129
Change-Id: Idfafca04342d43ef43cfdff0e431e0a6a8cf5c68
diff --git a/android/Android.mk b/android/Android.mk
index 0f776b0..9016f31 100644
--- a/android/Android.mk
+++ b/android/Android.mk
@@ -5,6 +5,7 @@
 	PhoneNumberUtils.cpp \
 	PhoneticStringUtils.cpp \
 	OldPhoneNumberUtils.cpp \
+	PhonebookIndex.cpp \
 	sqlite3_android.cpp
 
 LOCAL_C_INCLUDES := \
diff --git a/android/PhonebookIndex.cpp b/android/PhonebookIndex.cpp
new file mode 100644
index 0000000..f82c9d2
--- /dev/null
+++ b/android/PhonebookIndex.cpp
@@ -0,0 +1,162 @@
+/*
+ * Copyright 2010, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ctype.h>
+#include <string.h>
+
+#include <unicode/ucol.h>
+#include <unicode/uiter.h>
+#include <unicode/ustring.h>
+#include <unicode/utypes.h>
+
+#include "PhonebookIndex.h"
+#include "PhoneticStringUtils.h"
+
+#define SMALL_BUFFER_SIZE 10
+
+namespace android {
+
+// IMPORTANT!  (code, index) pairs; keep SORTED by code: map_character() binary-searches them
+static UChar DEFAULT_CHAR_MAP[] = {
+    0x00C6,    'A',       // AE
+    0x00DF,    'S',       // Eszett
+    0x1100, 0x3131,       // HANGUL LETTER KIYEOK
+    0x1101, 0x3132,       // HANGUL LETTER SSANGKIYEOK
+    0x1102, 0x3134,       // HANGUL LETTER NIEUN
+    0x1103, 0x3137,       // HANGUL LETTER TIKEUT
+    0x1104, 0x3138,       // HANGUL LETTER SSANGTIKEUT
+    0x1105, 0x3139,       // HANGUL LETTER RIEUL
+    0x1106, 0x3141,       // HANGUL LETTER MIEUM
+    0x1107, 0x3142,       // HANGUL LETTER PIEUP
+    0x1108, 0x3143,       // HANGUL LETTER SSANGPIEUP
+    0x1109, 0x3145,       // HANGUL LETTER SIOS
+    0x110A, 0x3146,       // HANGUL LETTER SSANGSIOS
+    0x110B, 0x3147,       // HANGUL LETTER IEUNG
+    0x110C, 0x3148,       // HANGUL LETTER CIEUC
+    0x110D, 0x3149,       // HANGUL LETTER SSANGCIEUC
+    0x110E, 0x314A,       // HANGUL LETTER CHIEUCH
+    0x110F, 0x314B,       // HANGUL LETTER KHIEUKH
+    0x1110, 0x314C,       // HANGUL LETTER THIEUTH
+    0x1111, 0x314D,       // HANGUL LETTER PHIEUPH
+    0x1112, 0x314E,       // HANGUL LETTER HIEUH
+    0x111A, 0x3140,       // HANGUL LETTER RIEUL-HIEUH
+    0x1121, 0x3144,       // HANGUL LETTER PIEUP-SIOS
+    0x1161, 0x314F,       // HANGUL LETTER A
+    0x1162, 0x3150,       // HANGUL LETTER AE
+    0x1163, 0x3151,       // HANGUL LETTER YA
+    0x1164, 0x3152,       // HANGUL LETTER YAE
+    0x1165, 0x3153,       // HANGUL LETTER EO
+    0x1166, 0x3154,       // HANGUL LETTER E
+    0x1167, 0x3155,       // HANGUL LETTER YEO
+    0x1168, 0x3156,       // HANGUL LETTER YE
+    0x1169, 0x3157,       // HANGUL LETTER O
+    0x116A, 0x3158,       // HANGUL LETTER WA
+    0x116B, 0x3159,       // HANGUL LETTER WAE
+    0x116C, 0x315A,       // HANGUL LETTER OE
+    0x116D, 0x315B,       // HANGUL LETTER YO
+    0x116E, 0x315C,       // HANGUL LETTER U
+    0x116F, 0x315D,       // HANGUL LETTER WEO
+    0x1170, 0x315E,       // HANGUL LETTER WE
+    0x1171, 0x315F,       // HANGUL LETTER WI
+    0x1172, 0x3160,       // HANGUL LETTER YU
+    0x1173, 0x3161,       // HANGUL LETTER EU
+    0x1174, 0x3162,       // HANGUL LETTER YI
+    0x1175, 0x3163,       // HANGUL LETTER I
+    0x11AA, 0x3133,       // HANGUL LETTER KIYEOK-SIOS
+    0x11AC, 0x3135,       // HANGUL LETTER NIEUN-CIEUC
+    0x11AD, 0x3136,       // HANGUL LETTER NIEUN-HIEUH
+    0x11B0, 0x313A,       // HANGUL LETTER RIEUL-KIYEOK
+    0x11B1, 0x313B,       // HANGUL LETTER RIEUL-MIEUM
+    0x11B3, 0x313D,       // HANGUL LETTER RIEUL-SIOS
+    0x11B4, 0x313E,       // HANGUL LETTER RIEUL-THIEUTH
+    0x11B5, 0x313F,       // HANGUL LETTER RIEUL-PHIEUPH
+};
+
+/**
+ * Binary search over a flat array of (code, index) pairs sorted by code: even positions hold
+ * the codes, odd positions the matching index. Returns the index paired with c, or 0 if absent.
+ */
+static UChar map_character(UChar c, const UChar * char_map, int32_t length) {
+  int32_t lo = 0, hi = length & ~0x1;       // Ignore an unpaired trailing element, if any
+  while (lo < hi) {
+    int32_t mid = ((lo + hi) >> 1) & ~0x1;  // Only consider even positions
+    if (char_map[mid] == c) {
+      return char_map[mid + 1];
+    } else if (char_map[mid] < c) {
+      lo = mid + 2;
+    } else {
+      hi = mid;
+    }
+  }
+  return 0;
+}
+
+/**
+ * Tells whether the character falls inside one of the CJK (Hanzi) unicode blocks listed below
+ */
+static bool is_CJK(UChar c) {
+  return
+       (c >= 0x2e80 && c <= 0x2eff)     // CJK_RADICALS_SUPPLEMENT
+    || (c >= 0x3000 && c <= 0x303f)     // CJK_SYMBOLS_AND_PUNCTUATION
+    || (c >= 0x3300 && c <= 0x33ff)     // CJK_COMPATIBILITY
+    || (c >= 0x3400 && c <= 0x4dbf)     // CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
+    || (c >= 0x4e00 && c <= 0x9fff)     // CJK_UNIFIED_IDEOGRAPHS
+    || (c >= 0xf900 && c <= 0xfaff)     // CJK_COMPATIBILITY_IDEOGRAPHS
+    || (c >= 0xfe30 && c <= 0xfe4f);    // CJK_COMPATIBILITY_FORMS
+}
+
+/**
+ * Returns the phonebook index for the first character read from iter, or 0 when there is none:
+ * normalization failed or produced nothing, a non-letter, or a CJK character outside "ja" locales.
+ */
+UChar GetPhonebookIndex(UCharIterator * iter, const char * locale) {
+    UChar dest[SMALL_BUFFER_SIZE];
+    // Normalize the first character to remove accents using the NFD normalization.
+    // unorm_next() takes the capacity in UChars, not bytes, so pass the element count.
+    UErrorCode errorCode = U_ZERO_ERROR;
+    int32_t len = unorm_next(iter, dest, SMALL_BUFFER_SIZE, UNORM_NFD,
+            0 /* options */, TRUE /* normalize */, NULL, &errorCode);
+    if (U_FAILURE(errorCode) || len == 0) {
+      return 0;
+    }
+
+    // We are only interested in letters
+    UChar c = dest[0];
+    if (!u_isalpha(c)) {
+      return 0;
+    }
+    c = u_toupper(c);
+
+    // Check for explicitly mapped characters
+    UChar c_mapped = map_character(c, DEFAULT_CHAR_MAP, sizeof(DEFAULT_CHAR_MAP) / sizeof(UChar));
+    if (c_mapped != 0) {
+      return c_mapped;
+    }
+
+    // Convert Kanas to Hiragana.
+    // NOTE(review): with `len > 2`, next stays 0 when len == 2 -- confirm this is intended.
+    UChar next = len > 2 ? dest[1] : 0;
+    c = android::GetNormalizedCodePoint(c, next, NULL);
+
+    if (is_CJK(c)) {
+      // 0x8A18: Kanji character used as a heading in letters, notices and other documents
+      return (locale != NULL && strncmp(locale, "ja", 2) == 0) ? 0x8A18 : 0;
+    }
+
+    return c;
+}
+
+}  // namespace android
diff --git a/android/PhonebookIndex.h b/android/PhonebookIndex.h
new file mode 100644
index 0000000..f2bb289
--- /dev/null
+++ b/android/PhonebookIndex.h
@@ -0,0 +1,36 @@
+/*
+**
+** Copyright 2010, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+**
+**     http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
+
+#ifndef _ANDROID_PHONEBOOK_INDEX_H
+#define _ANDROID_PHONEBOOK_INDEX_H
+
+#include <unicode/uiter.h>
+#include <unicode/utypes.h>
+
+namespace android {
+
+/**
+ * Reads the first UNICODE character from iter and returns its phonebook index
+ * for the specified locale, or 0 if it has none. For example, "a" becomes "A"
+ * and so does A with accents. Conversion rules differ from locale to
+ * locale, which is why this function takes locale as an argument.
+ */
+UChar GetPhonebookIndex(UCharIterator * iter, const char * locale);
+
+}  // namespace android
+
+#endif
diff --git a/android/sqlite3_android.cpp b/android/sqlite3_android.cpp
index 2d4c3e6..6b253c1 100644
--- a/android/sqlite3_android.cpp
+++ b/android/sqlite3_android.cpp
@@ -22,14 +22,18 @@
 #include <unistd.h>
 
 #include <unicode/ucol.h>
+#include <unicode/uiter.h>
 #include <unicode/ustring.h>
+#include <unicode/utypes.h>
 #include <cutils/log.h>
 
 #include "sqlite3_android.h"
 #include "PhoneNumberUtils.h"
+#include "PhonebookIndex.h"
 #include "PhoneticStringUtils.h"
 
 #define ENABLE_ANDROID_LOG 0
+#define SMALL_BUFFER_SIZE 10
 
 static int collate16(void *p, int n1, const void *v1, int n2, const void *v2)
 {
@@ -70,6 +74,45 @@
     }
 }
 
+/**
+ * SQL function GET_PHONEBOOK_INDEX(text, locale): yields the phonebook index of the
+ * text's first character as UTF-8, or NULL whenever no index can be produced.
+ */
+static void get_phonebook_index(
+    sqlite3_context * context, int argc, sqlite3_value ** argv)
+{
+    UBool encodeFailed = FALSE;
+    uint32_t utf8Len = 0;
+    uint8_t utf8[SMALL_BUFFER_SIZE];
+    // Both the source text and the locale name must be supplied
+    if (argc != 2) {
+      sqlite3_result_null(context);
+      return;
+    }
+
+    char const * text = (char const *)sqlite3_value_text(argv[0]);
+    char const * localeName = (char const *)sqlite3_value_text(argv[1]);
+    if (text == NULL || text[0] == 0 || localeName == NULL) {
+      sqlite3_result_null(context);
+      return;
+    }
+
+    // Walk the NUL-terminated UTF-8 input (length -1) through an ICU iterator
+    UCharIterator utf8Iter;
+    uiter_setUTF8(&utf8Iter, text, -1);
+    UChar indexChar = android::GetPhonebookIndex(&utf8Iter, localeName);
+
+    // Encode the index as UTF-8; a zero index or an encoding failure yields NULL
+    if (indexChar != 0) {
+      U8_APPEND(utf8, utf8Len, SMALL_BUFFER_SIZE * sizeof(uint8_t), indexChar, encodeFailed);
+    }
+    if (indexChar == 0 || encodeFailed || utf8Len == 0) {
+      sqlite3_result_null(context);
+      return;
+    }
+    sqlite3_result_text(context, (const char*)utf8, utf8Len, SQLITE_TRANSIENT);
+}
+
 static void get_phonetically_sortable_string(
     sqlite3_context * context, int argc, sqlite3_value ** argv)
 {
@@ -183,10 +226,10 @@
         sqlite3_result_null(context);
         return;
     }
-    
+
     if (strncmp("/sdcard/", path, 8) != 0) {
         sqlite3_result_null(context);
-        return;        
+        return;
     }
     if (strstr(path, "/../") != NULL) {
         sqlite3_result_null(context);
@@ -335,7 +378,7 @@
     if (origData == NULL) {
         sqlite3_result_null(context);
         return;
-    }    
+    }
 
     // Get the raw bytes for the delimiter
     const UChar * delim = (const UChar *)sqlite3_value_text16(argv[3]);
@@ -344,15 +387,15 @@
         sqlite3_result_null(context);
         return;
     }
- 
+
     UChar * token = NULL;
     UChar *state;
-    int numTokens = 0;    
-    
+    int numTokens = 0;
+
     do {
         if (numTokens == 0) {
             token = origData;
-        } 
+        }
 
         // Reset the program so we can use it to perform the insert
         sqlite3_reset(statement);
@@ -367,9 +410,9 @@
         uint32_t keysize = result-1;
         uint32_t base16Size = keysize*2;
         char *base16buf = (char*)malloc(base16Size);
-        base16Encode(base16buf, keybuf, keysize);      
+        base16Encode(base16buf, keybuf, keysize);
         err = sqlite3_bind_text(statement, 1, base16buf, base16Size, SQLITE_STATIC);
-        
+
         if (err != SQLITE_OK) {
             LOGE(" sqlite3_bind_text16 error %d", err);
             free(base16buf);
@@ -418,7 +461,7 @@
     if (U_FAILURE(status)) {
         return -1;
     }
-    
+
     ucol_setAttribute(collator, UCOL_STRENGTH, UCOL_PRIMARY, &status);
     if (U_FAILURE(status)) {
         return -1;
@@ -438,7 +481,7 @@
     if (err != SQLITE_OK) {
         return err;
     }
-    
+
     // Register the _TOKENIZE function
     err = sqlite3_create_function(handle, "_TOKENIZE", 4, SQLITE_UTF16, collator, tokenize, NULL, NULL);
     if (err != SQLITE_OK) {
@@ -465,7 +508,7 @@
     UCollator * collator = ucol_open(NULL, &status);
     if (U_FAILURE(status)) {
         return -1;
-    }    
+    }
 
     if (utf16Storage) {
         // Note that text should be stored as UTF-16
@@ -536,5 +579,15 @@
         return err;
     }
 
+    // Register the GET_PHONEBOOK_INDEX function
+    err = sqlite3_create_function(handle,
+        "GET_PHONEBOOK_INDEX",
+        2, SQLITE_UTF8, NULL,
+        get_phonebook_index,
+        NULL, NULL);
+    if (err != SQLITE_OK) {
+        return err;
+    }
+
     return SQLITE_OK;
 }