i18n/alphaindex.cpp - platform/external/icu4c - Git at Google

 /*
 *******************************************************************************
 * Copyright (C) 2009-2014, International Business Machines Corporation and
 * others. All Rights Reserved.
 *******************************************************************************
 */

 #include "unicode/utypes.h"

 #if !UCONFIG_NO_COLLATION

 #include "unicode/alphaindex.h"
 #include "unicode/coll.h"
 #include "unicode/localpointer.h"
 #include "unicode/normalizer2.h"
 #include "unicode/tblcoll.h"
 #include "unicode/uchar.h"
 #include "unicode/ulocdata.h"
 #include "unicode/uniset.h"
 #include "unicode/uobject.h"
 #include "unicode/usetiter.h"
 #include "unicode/utf16.h"

 #include "cmemory.h"
 #include "cstring.h"
 #include "uassert.h"
 #include "uvector.h"
 #include "uvectr64.h"

 //#include <string>
 //#include <iostream>

 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))

 U_NAMESPACE_BEGIN

 namespace {

 /**
  * Prefix string for Chinese index buckets.
  * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes
  */
 const UChar BASE[1] = { 0xFDD0 };
 const int32_t BASE_LENGTH = 1;

 UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer,
                                 const UnicodeString &one, const UnicodeString &other);

 }  // namespace

 static int32_t U_CALLCONV
 collatorComparator(const void *context, const void *left, const void *right);

 static int32_t U_CALLCONV
 recordCompareFn(const void *context, const void *left, const void *right);

 //  UVector<Record *> support function, delete a Record.
 static void U_CALLCONV
 alphaIndex_deleteRecord(void *obj) {
     delete static_cast<AlphabeticIndex::Record *>(obj);
 }

 namespace {

 UnicodeString *ownedString(const UnicodeString &s, LocalPointer<UnicodeString> &owned,
                            UErrorCode &errorCode) {
     if (U_FAILURE(errorCode)) { return NULL; }
     if (owned.isValid()) {
         return owned.orphan();
     }
     UnicodeString *p = new UnicodeString(s);
     if (p == NULL) {
         errorCode = U_MEMORY_ALLOCATION_ERROR;
     }
     return p;
 }

 inline UnicodeString *getString(const UVector &list, int32_t i) {
     return static_cast<UnicodeString *>(list[i]);
 }

 inline AlphabeticIndex::Bucket *getBucket(const UVector &list, int32_t i) {
     return static_cast<AlphabeticIndex::Bucket *>(list[i]);
 }

 inline AlphabeticIndex::Record *getRecord(const UVector &list, int32_t i) {
     return static_cast<AlphabeticIndex::Record *>(list[i]);
 }

 /**
  * Like Java Collections.binarySearch(List, String, Comparator).
  *
  * @return the index>=0 where the item was found,
  *         or the index<0 for inserting the string at ~index in sorted order
  */
 int32_t binarySearch(const UVector &list, const UnicodeString &s, const Collator &coll) {
     if (list.size() == 0) { return ~0; }
     int32_t start = 0;
     int32_t limit = list.size();
     for (;;) {
         int32_t i = (start + limit) / 2;
         const UnicodeString *si = static_cast<UnicodeString *>(list.elementAt(i));
         UErrorCode errorCode = U_ZERO_ERROR;
         UCollationResult cmp = coll.compare(s, *si, errorCode);
         if (cmp == UCOL_EQUAL) {
             return i;
         } else if (cmp < 0) {
             if (i == start) {
                 return ~start;  // insert s before *si
             }
             limit = i;
         } else {
             if (i == start) {
                 return ~(start + 1);  // insert s after *si
             }
             start = i;
         }
     }
 }

 }  // namespace

 // The BucketList is not in the anonymous namespace because only Clang
 // seems to support its use in other classes from there.
 // However, we also don't need U_I18N_API because it is not used from outside the i18n library.
 class BucketList : public UObject {
 public:
     BucketList(UVector *bucketList, UVector *publicBucketList)
             : bucketList_(bucketList), immutableVisibleList_(publicBucketList) {
         int32_t displayIndex = 0;
         for (int32_t i = 0; i < publicBucketList->size(); ++i) {
             getBucket(*publicBucketList, i)->displayIndex_ = displayIndex++;
         }
     }

     // The virtual destructor must not be inline.
     // See ticket #8454 for details.
     virtual ~BucketList();

     int32_t getBucketCount() const {
         return immutableVisibleList_->size();
     }

     int32_t getBucketIndex(const UnicodeString &name, const Collator &collatorPrimaryOnly,
                            UErrorCode &errorCode) {
         // binary search
         int32_t start = 0;
         int32_t limit = bucketList_->size();
         while ((start + 1) < limit) {
             int32_t i = (start + limit) / 2;
             const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, i);
             UCollationResult nameVsBucket =
                 collatorPrimaryOnly.compare(name, bucket->lowerBoundary_, errorCode);
             if (nameVsBucket < 0) {
                 limit = i;
             } else {
                 start = i;
             }
         }
         const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, start);
         if (bucket->displayBucket_ != NULL) {
             bucket = bucket->displayBucket_;
         }
         return bucket->displayIndex_;
     }

     /** All of the buckets, visible and invisible. */
     UVector *bucketList_;
     /** Just the visible buckets. */
     UVector *immutableVisibleList_;
 };

 BucketList::~BucketList() {
     delete bucketList_;
     if (immutableVisibleList_ != bucketList_) {
         delete immutableVisibleList_;
     }
 }

 AlphabeticIndex::ImmutableIndex::~ImmutableIndex() {
     delete buckets_;
     delete collatorPrimaryOnly_;
 }

 int32_t
 AlphabeticIndex::ImmutableIndex::getBucketCount() const {
     return buckets_->getBucketCount();
 }

 int32_t
 AlphabeticIndex::ImmutableIndex::getBucketIndex(
         const UnicodeString &name, UErrorCode &errorCode) const {
     return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, errorCode);
 }

 const AlphabeticIndex::Bucket *
 AlphabeticIndex::ImmutableIndex::getBucket(int32_t index) const {
     if (0 <= index && index < buckets_->getBucketCount()) {
         return icu::getBucket(*buckets_->immutableVisibleList_, index);
     } else {
         return NULL;
     }
 }

 AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status)
         : inputList_(NULL),
           labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL),
           maxLabelCount_(99),
           initialLabels_(NULL), firstCharsInScripts_(NULL),
           collator_(NULL), collatorPrimaryOnly_(NULL),
           buckets_(NULL) {
     init(&locale, status);
 }


 AlphabeticIndex::AlphabeticIndex(RuleBasedCollator *collator, UErrorCode &status)
         : inputList_(NULL),
           labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL),
           maxLabelCount_(99),
           initialLabels_(NULL), firstCharsInScripts_(NULL),
           collator_(collator), collatorPrimaryOnly_(NULL),
           buckets_(NULL) {
     init(NULL, status);
 }


 AlphabeticIndex::~AlphabeticIndex() {
     delete collator_;
     delete collatorPrimaryOnly_;
     delete firstCharsInScripts_;
     delete buckets_;
     delete inputList_;
     delete initialLabels_;
 }


 AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorCode &status) {
     if (U_FAILURE(status)) {
         return *this;
     }
     initialLabels_->addAll(additions);
     clearBuckets();
     return *this;
 }


 AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) {
     addIndexExemplars(locale, status);
     clearBuckets();
     return *this;
 }


 AlphabeticIndex::ImmutableIndex *AlphabeticIndex::buildImmutableIndex(UErrorCode &errorCode) {
     if (U_FAILURE(errorCode)) { return NULL; }
     // In C++, the ImmutableIndex must own its copy of the BucketList,
     // even if it contains no records, for proper memory management.
     // We could clone the buckets_ if they are not NULL,
     // but that would be worth it only if this method is called multiple times,
     // or called after using the old-style bucket iterator API.
     LocalPointer<BucketList> immutableBucketList(createBucketList(errorCode));
     LocalPointer<RuleBasedCollator> coll(
         static_cast<RuleBasedCollator *>(collatorPrimaryOnly_->clone()));
     if (immutableBucketList.isNull() || coll.isNull()) {
         errorCode = U_MEMORY_ALLOCATION_ERROR;
         return NULL;
     }
     ImmutableIndex *immIndex = new ImmutableIndex(immutableBucketList.getAlias(), coll.getAlias());
     if (immIndex == NULL) {
         errorCode = U_MEMORY_ALLOCATION_ERROR;
         return NULL;
     }
     // The ImmutableIndex adopted its parameter objects.
     immutableBucketList.orphan();
     coll.orphan();
     return immIndex;
 }

 int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) {
     initBuckets(status);
     if (U_FAILURE(status)) {
         return 0;
     }
     return buckets_->getBucketCount();
 }


 int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) {
     if (U_FAILURE(status) || inputList_ == NULL) {
         return 0;
     }
     return inputList_->size();
 }

 void AlphabeticIndex::initLabels(UVector &indexCharacters, UErrorCode &errorCode) const {
     const Normalizer2 *nfkdNormalizer = Normalizer2::getNFKDInstance(errorCode);
     if (U_FAILURE(errorCode)) { return; }

     const UnicodeString &firstScriptBoundary = *getString(*firstCharsInScripts_, 0);
     const UnicodeString &overflowBoundary =
         *getString(*firstCharsInScripts_, firstCharsInScripts_->size() - 1);

     // We make a sorted array of elements.
     // Some of the input may be redundant.
     // That is, we might have c, ch, d, where "ch" sorts just like "c", "h".
     // We filter out those cases.
     UnicodeSetIterator iter(*initialLabels_);
     while (iter.next()) {
         const UnicodeString *item = &iter.getString();
         LocalPointer<UnicodeString> ownedItem;
         UBool checkDistinct;
         int32_t itemLength = item->length();
         if (!item->hasMoreChar32Than(0, itemLength, 1)) {
             checkDistinct = FALSE;
         } else if(item->charAt(itemLength - 1) == 0x2a &&  // '*'
                 item->charAt(itemLength - 2) != 0x2a) {
             // Use a label if it is marked with one trailing star,
             // even if the label string sorts the same when all contractions are suppressed.
             ownedItem.adoptInstead(new UnicodeString(*item, 0, itemLength - 1));
             item = ownedItem.getAlias();
             if (item == NULL) {
                 errorCode = U_MEMORY_ALLOCATION_ERROR;
                 return;
             }
             checkDistinct = FALSE;
         } else {
             checkDistinct = TRUE;
         }
         if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode) < 0) {
             // Ignore a primary-ignorable or non-alphabetic index character.
         } else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorCode) >= 0) {
             // Ignore an index character that will land in the overflow bucket.
         } else if (checkDistinct &&
                 collatorPrimaryOnly_->compare(*item, separated(*item), errorCode) == 0) {
             // Ignore a multi-code point index character that does not sort distinctly
             // from the sequence of its separate characters.
         } else {
             int32_t insertionPoint = binarySearch(indexCharacters, *item, *collatorPrimaryOnly_);
             if (insertionPoint < 0) {
                 indexCharacters.insertElementAt(
                     ownedString(*item, ownedItem, errorCode), ~insertionPoint, errorCode);
             } else {
                 const UnicodeString &itemAlreadyIn = *getString(indexCharacters, insertionPoint);
                 if (isOneLabelBetterThanOther(*nfkdNormalizer, *item, itemAlreadyIn)) {
                     indexCharacters.setElementAt(
                         ownedString(*item, ownedItem, errorCode), insertionPoint);
                 }
             }
         }
     }
     if (U_FAILURE(errorCode)) { return; }

     // if the result is still too large, cut down to maxCount elements, by removing every nth element

     int32_t size = indexCharacters.size() - 1;
     if (size > maxLabelCount_) {
         int32_t count = 0;
         int32_t old = -1;
         for (int32_t i = 0; i < indexCharacters.size();) {
             ++count;
             int32_t bump = count * maxLabelCount_ / size;
             if (bump == old) {
                 indexCharacters.removeElementAt(i);
             } else {
                 old = bump;
                 ++i;
             }
         }
     }
 }

 namespace {

 const UnicodeString &fixLabel(const UnicodeString &current, UnicodeString &temp) {
     if (!current.startsWith(BASE, BASE_LENGTH)) {
         return current;
     }
     UChar rest = current.charAt(BASE_LENGTH);
     if (0x2800 < rest && rest <= 0x28FF) { // stroke count
         int32_t count = rest-0x2800;
         temp.setTo((UChar)(0x30 + count % 10));
         if (count >= 10) {
             count /= 10;
             temp.insert(0, (UChar)(0x30 + count % 10));
             if (count >= 10) {
                 count /= 10;
                 temp.insert(0, (UChar)(0x30 + count));
             }
         }
         return temp.append((UChar)0x5283);
     }
     return temp.setTo(current, BASE_LENGTH);
 }

 UBool hasMultiplePrimaryWeights(
         const RuleBasedCollator &coll, uint32_t variableTop,
         const UnicodeString &s, UVector64 &ces, UErrorCode &errorCode) {
     ces.removeAllElements();
     coll.internalGetCEs(s, ces, errorCode);
     if (U_FAILURE(errorCode)) { return FALSE; }
     UBool seenPrimary = FALSE;
     for (int32_t i = 0; i < ces.size(); ++i) {
         int64_t ce = ces.elementAti(i);
         uint32_t p = (uint32_t)(ce >> 32);
         if (p > variableTop) {
             // not primary ignorable
             if (seenPrimary) {
                 return TRUE;
             }
             seenPrimary = TRUE;
         }
     }
     return FALSE;
 }

 }  // namespace

 BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const {
     // Initialize indexCharacters.
     UVector indexCharacters(errorCode);
     indexCharacters.setDeleter(uprv_deleteUObject);
     initLabels(indexCharacters, errorCode);
     if (U_FAILURE(errorCode)) { return NULL; }

     // Variables for hasMultiplePrimaryWeights().
     UVector64 ces(errorCode);
     uint32_t variableTop;
     if (collatorPrimaryOnly_->getAttribute(UCOL_ALTERNATE_HANDLING, errorCode) == UCOL_SHIFTED) {
         variableTop = collatorPrimaryOnly_->getVariableTop(errorCode);
     } else {
         variableTop = 0;
     }
     UBool hasInvisibleBuckets = FALSE;

     // Helper arrays for Chinese Pinyin collation.
     Bucket *asciiBuckets[26] = {
         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
     };
     Bucket *pinyinBuckets[26] = {
         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
     };
     UBool hasPinyin = FALSE;

     LocalPointer<UVector> bucketList(new UVector(errorCode));
     if (bucketList.isNull()) {
         errorCode = U_MEMORY_ALLOCATION_ERROR;
         return NULL;
     }
     bucketList->setDeleter(uprv_deleteUObject);

     // underflow bucket
     Bucket *bucket = new Bucket(getUnderflowLabel(), emptyString_, U_ALPHAINDEX_UNDERFLOW);
     if (bucket == NULL) {
         errorCode = U_MEMORY_ALLOCATION_ERROR;
         return NULL;
     }
     bucketList->addElement(bucket, errorCode);
     if (U_FAILURE(errorCode)) { return NULL; }

     UnicodeString temp;

     // fix up the list, adding underflow, additions, overflow
     // Insert inflow labels as needed.
     int32_t scriptIndex = -1;
     const UnicodeString *scriptUpperBoundary = &emptyString_;
     for (int32_t i = 0; i < indexCharacters.size(); ++i) {
         UnicodeString &current = *getString(indexCharacters, i);
         if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) >= 0) {
             // We crossed the script boundary into a new script.
             const UnicodeString &inflowBoundary = *scriptUpperBoundary;
             UBool skippedScript = FALSE;
             for (;;) {
                 scriptUpperBoundary = getString(*firstCharsInScripts_, ++scriptIndex);
                 if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) < 0) {
                     break;
                 }
                 skippedScript = TRUE;
             }
             if (skippedScript && bucketList->size() > 1) {
                 // We are skipping one or more scripts,
                 // and we are not just getting out of the underflow label.
                 bucket = new Bucket(getInflowLabel(), inflowBoundary, U_ALPHAINDEX_INFLOW);
                 if (bucket == NULL) {
                     errorCode = U_MEMORY_ALLOCATION_ERROR;
                     return NULL;
                 }
                 bucketList->addElement(bucket, errorCode);
             }
         }
         // Add a bucket with the current label.
         bucket = new Bucket(fixLabel(current, temp), current, U_ALPHAINDEX_NORMAL);
         if (bucket == NULL) {
             errorCode = U_MEMORY_ALLOCATION_ERROR;
             return NULL;
         }
         bucketList->addElement(bucket, errorCode);
         // Remember ASCII and Pinyin buckets for Pinyin redirects.
         UChar c;
         if (current.length() == 1 && 0x41 <= (c = current.charAt(0)) && c <= 0x5A) {  // A-Z
             asciiBuckets[c - 0x41] = bucket;
         } else if (current.length() == BASE_LENGTH + 1 && current.startsWith(BASE, BASE_LENGTH) &&
                 0x41 <= (c = current.charAt(BASE_LENGTH)) && c <= 0x5A) {
             pinyinBuckets[c - 0x41] = bucket;
             hasPinyin = TRUE;
         }
         // Check for multiple primary weights.
         if (!current.startsWith(BASE, BASE_LENGTH) &&
                 hasMultiplePrimaryWeights(*collatorPrimaryOnly_, variableTop, current,
                                           ces, errorCode) &&
                 current.charAt(current.length() - 1) != 0xFFFF /* !current.endsWith("\uffff") */) {
             // "AE-ligature" or "Sch" etc.
             for (int32_t i = bucketList->size() - 2;; --i) {
                 Bucket *singleBucket = getBucket(*bucketList, i);
                 if (singleBucket->labelType_ != U_ALPHAINDEX_NORMAL) {
                     // There is no single-character bucket since the last
                     // underflow or inflow label.
                     break;
                 }
                 if (singleBucket->displayBucket_ == NULL &&
                         !hasMultiplePrimaryWeights(*collatorPrimaryOnly_, variableTop,
                                                    singleBucket->lowerBoundary_,
                                                    ces, errorCode)) {
                     // Add an invisible bucket that redirects strings greater than the expansion
                     // to the previous single-character bucket.
                     // For example, after ... Q R S Sch we add Sch\uFFFF->S
                     // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S.
                     bucket = new Bucket(emptyString_,
                         UnicodeString(current).append((UChar)0xFFFF),
                         U_ALPHAINDEX_NORMAL);
                     if (bucket == NULL) {
                         errorCode = U_MEMORY_ALLOCATION_ERROR;
                         return NULL;
                     }
                     bucket->displayBucket_ = singleBucket;
                     bucketList->addElement(bucket, errorCode);
                     hasInvisibleBuckets = TRUE;
                     break;
                 }
             }
         }
     }
     if (U_FAILURE(errorCode)) { return NULL; }
     if (bucketList->size() == 1) {
         // No real labels, show only the underflow label.
         BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias());
         if (bl == NULL) {
             errorCode = U_MEMORY_ALLOCATION_ERROR;
             return NULL;
         }
         bucketList.orphan();
         return bl;
     }
     // overflow bucket
     bucket = new Bucket(getOverflowLabel(), *scriptUpperBoundary, U_ALPHAINDEX_OVERFLOW);
     if (bucket == NULL) {
         errorCode = U_MEMORY_ALLOCATION_ERROR;
         return NULL;
     }
     bucketList->addElement(bucket, errorCode); // final

     if (hasPinyin) {
         // Redirect Pinyin buckets.
         Bucket *asciiBucket = NULL;
         for (int32_t i = 0; i < 26; ++i) {
             if (asciiBuckets[i] != NULL) {
                 asciiBucket = asciiBuckets[i];
             }
             if (pinyinBuckets[i] != NULL && asciiBucket != NULL) {
                 pinyinBuckets[i]->displayBucket_ = asciiBucket;
                 hasInvisibleBuckets = TRUE;
             }
         }
     }

     if (U_FAILURE(errorCode)) { return NULL; }
     if (!hasInvisibleBuckets) {
         BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias());
         if (bl == NULL) {
             errorCode = U_MEMORY_ALLOCATION_ERROR;
             return NULL;
         }
         bucketList.orphan();
         return bl;
     }
     // Merge inflow buckets that are visually adjacent.
     // Iterate backwards: Merge inflow into overflow rather than the other way around.
     int32_t i = bucketList->size() - 1;
     Bucket *nextBucket = getBucket(*bucketList, i);
     while (--i > 0) {
         bucket = getBucket(*bucketList, i);
         if (bucket->displayBucket_ != NULL) {
             continue;  // skip invisible buckets
         }
         if (bucket->labelType_ == U_ALPHAINDEX_INFLOW) {
             if (nextBucket->labelType_ != U_ALPHAINDEX_NORMAL) {
                 bucket->displayBucket_ = nextBucket;
                 continue;
             }
         }
         nextBucket = bucket;
     }

     LocalPointer<UVector> publicBucketList(new UVector(errorCode));
     if (bucketList.isNull()) {
         errorCode = U_MEMORY_ALLOCATION_ERROR;
         return NULL;
     }
     // Do not call publicBucketList->setDeleter():
     // This vector shares its objects with the bucketList.
     for (int32_t i = 0; i < bucketList->size(); ++i) {
         bucket = getBucket(*bucketList, i);
         if (bucket->displayBucket_ == NULL) {
             publicBucketList->addElement(bucket, errorCode);
         }
     }
     if (U_FAILURE(errorCode)) { return NULL; }
     BucketList *bl = new BucketList(bucketList.getAlias(), publicBucketList.getAlias());
     if (bl == NULL) {
         errorCode = U_MEMORY_ALLOCATION_ERROR;
         return NULL;
     }
     bucketList.orphan();
     publicBucketList.orphan();
     return bl;
 }

 /**
  * Creates an index, and buckets and sorts the list of records into the index.
  */
 void AlphabeticIndex::initBuckets(UErrorCode &errorCode) {
     if (U_FAILURE(errorCode) || buckets_ != NULL) {
         return;
     }
     buckets_ = createBucketList(errorCode);
     if (U_FAILURE(errorCode) || inputList_ == NULL || inputList_->isEmpty()) {
         return;
     }

     // Sort the records by name.
     // Stable sort preserves input order of collation duplicates.
     inputList_->sortWithUComparator(recordCompareFn, collator_, errorCode);

     // Now, we traverse all of the input, which is now sorted.
     // If the item doesn't go in the current bucket, we find the next bucket that contains it.
     // This makes the process order n*log(n), since we just sort the list and then do a linear process.
     // However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so
     // we need to improve it for that case.

     Bucket *currentBucket = getBucket(*buckets_->bucketList_, 0);
     int32_t bucketIndex = 1;
     Bucket *nextBucket;
     const UnicodeString *upperBoundary;
     if (bucketIndex < buckets_->bucketList_->size()) {
         nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++);
         upperBoundary = &nextBucket->lowerBoundary_;
     } else {
         nextBucket = NULL;
         upperBoundary = NULL;
     }
     for (int32_t i = 0; i < inputList_->size(); ++i) {
         Record *r = getRecord(*inputList_, i);
         // if the current bucket isn't the right one, find the one that is
         // We have a special flag for the last bucket so that we don't look any further
         while (upperBoundary != NULL &&
                 collatorPrimaryOnly_->compare(r->name_, *upperBoundary, errorCode) >= 0) {
             currentBucket = nextBucket;
             // now reset the boundary that we compare against
             if (bucketIndex < buckets_->bucketList_->size()) {
                 nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++);
                 upperBoundary = &nextBucket->lowerBoundary_;
             } else {
                 upperBoundary = NULL;
             }
         }
         // now put the record into the bucket.
         Bucket *bucket = currentBucket;
         if (bucket->displayBucket_ != NULL) {
             bucket = bucket->displayBucket_;
         }
         if (bucket->records_ == NULL) {
             bucket->records_ = new UVector(errorCode);
             if (bucket->records_ == NULL) {
                 errorCode = U_MEMORY_ALLOCATION_ERROR;
                 return;
             }
         }
         bucket->records_->addElement(r, errorCode);
     }
 }

 void AlphabeticIndex::clearBuckets() {
     if (buckets_ != NULL) {
         delete buckets_;
         buckets_ = NULL;
         internalResetBucketIterator();
     }
 }

 void AlphabeticIndex::internalResetBucketIterator() {
     labelsIterIndex_ = -1;
     currentBucket_ = NULL;
 }


 void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status) {
     LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status));
     if (U_FAILURE(status)) {
         return;
     }

     UnicodeSet exemplars;
     ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status);
     if (U_SUCCESS(status)) {
         initialLabels_->addAll(exemplars);
         return;
     }
     status = U_ZERO_ERROR;  // Clear out U_MISSING_RESOURCE_ERROR

     // The locale data did not include explicit Index characters.
     // Synthesize a set of them from the locale's standard exemplar characters.
     ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status);
     if (U_FAILURE(status)) {
         return;
     }

     // question: should we add auxiliary exemplars?
     if (exemplars.containsSome(0x61, 0x7A) /* a-z */ || exemplars.size() == 0) {
         exemplars.add(0x61, 0x7A);
     }
     if (exemplars.containsSome(0xAC00, 0xD7A3)) {  // Hangul syllables
         // cut down to small list
         exemplars.remove(0xAC00, 0xD7A3).
             add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).
             add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544).
             add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).
             add(0xD30C).add(0xD558);
     }
     if (exemplars.containsSome(0x1200, 0x137F)) {  // Ethiopic block
         // cut down to small list
         // make use of the fact that Ethiopic is allocated in 8's, where
         // the base is 0 mod 8.
         UnicodeSet ethiopic(
             UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]"), status);
         UnicodeSetIterator it(ethiopic);
         while (it.next() && !it.isString()) {
             if ((it.getCodepoint() & 0x7) != 0) {
                 exemplars.remove(it.getCodepoint());
             }
         }
     }

     // Upper-case any that aren't already so.
     //   (We only do this for synthesized index characters.)
     UnicodeSetIterator it(exemplars);
     UnicodeString upperC;
     while (it.next()) {
         const UnicodeString &exemplarC = it.getString();
         upperC = exemplarC;
         upperC.toUpper(locale);
         initialLabels_->add(upperC);
     }
 }

 UBool AlphabeticIndex::addChineseIndexCharacters(UErrorCode &errorCode) {
     UnicodeSet contractions;
     collatorPrimaryOnly_->internalAddContractions(BASE[0], contractions, errorCode);
     if (U_FAILURE(errorCode) || contractions.isEmpty()) { return FALSE; }
     initialLabels_->addAll(contractions);
     UnicodeSetIterator iter(contractions);
     while (iter.next()) {
         const UnicodeString &s = iter.getString();
         U_ASSERT (s.startsWith(BASE, BASE_LENGTH));
         UChar c = s.charAt(s.length() - 1);
         if (0x41 <= c && c <= 0x5A) {  // A-Z
             // There are Pinyin labels, add ASCII A-Z labels as well.
             initialLabels_->add(0x41, 0x5A);  // A-Z
             break;
         }
     }
     return TRUE;
 }


 /*
  * Return the string with interspersed CGJs. Input must have more than 2 codepoints.
  */
 static const UChar CGJ = 0x034F;
 UnicodeString AlphabeticIndex::separated(const UnicodeString &item) {
     UnicodeString result;
     if (item.length() == 0) {
         return result;
     }
     int32_t i = 0;
     for (;;) {
         UChar32  cp = item.char32At(i);
         result.append(cp);
         i = item.moveIndex32(i, 1);
         if (i >= item.length()) {
             break;
         }
         result.append(CGJ);
     }
     return result;
 }


 UBool AlphabeticIndex::operator==(const AlphabeticIndex& /* other */) const {
     return FALSE;
 }


 UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const {
     return FALSE;
 }


 const RuleBasedCollator &AlphabeticIndex::getCollator() const {
     return *collator_;
 }


 const UnicodeString &AlphabeticIndex::getInflowLabel() const {
     return inflowLabel_;
 }

 const UnicodeString &AlphabeticIndex::getOverflowLabel() const {
     return overflowLabel_;
 }


 const UnicodeString &AlphabeticIndex::getUnderflowLabel() const {
     return underflowLabel_;
 }


 AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
     inflowLabel_ = label;
     clearBuckets();
     return *this;
 }


 AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
     overflowLabel_ = label;
     clearBuckets();
     return *this;
 }


 AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
     underflowLabel_ = label;
     clearBuckets();
     return *this;
 }


 int32_t AlphabeticIndex::getMaxLabelCount() const {
     return maxLabelCount_;
 }


 AlphabeticIndex &AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status) {
     if (U_FAILURE(status)) {
         return *this;
     }
     if (maxLabelCount <= 0) {
         status = U_ILLEGAL_ARGUMENT_ERROR;
         return *this;
     }
     maxLabelCount_ = maxLabelCount;
     clearBuckets();
     return *this;
 }


 //
 //  init() - Common code for constructors.
 //

 void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) {
     if (U_FAILURE(status)) { return; }
     if (locale == NULL && collator_ == NULL) {
         status = U_ILLEGAL_ARGUMENT_ERROR;
         return;
     }

     initialLabels_         = new UnicodeSet();
     if (initialLabels_ == NULL) {
         status = U_MEMORY_ALLOCATION_ERROR;
         return;
     }

     inflowLabel_.setTo((UChar)0x2026);    // Ellipsis
     overflowLabel_ = inflowLabel_;
     underflowLabel_ = inflowLabel_;

     if (collator_ == NULL) {
         Collator *coll = Collator::createInstance(*locale, status);
         if (U_FAILURE(status)) {
             delete coll;
             return;
         }
         if (coll == NULL) {
             status = U_MEMORY_ALLOCATION_ERROR;
             return;
         }
         collator_ = dynamic_cast<RuleBasedCollator *>(coll);
         if (collator_ == NULL) {
             delete coll;
             status = U_UNSUPPORTED_ERROR;
             return;
         }
     }
     collatorPrimaryOnly_ = static_cast<RuleBasedCollator *>(collator_->clone());
     if (collatorPrimaryOnly_ == NULL) {
         status = U_MEMORY_ALLOCATION_ERROR;
         return;
     }
     collatorPrimaryOnly_->setAttribute(UCOL_STRENGTH, UCOL_PRIMARY, status);
     firstCharsInScripts_ = firstStringsInScript(status);
     if (U_FAILURE(status)) { return; }
     firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimaryOnly_, status);
     // Guard against a degenerate collator where
     // some script boundary strings are primary ignorable.
     for (;;) {
         if (U_FAILURE(status)) { return; }
         if (firstCharsInScripts_->isEmpty()) {
             // AlphabeticIndex requires some non-ignorable script boundary strings.
             status = U_ILLEGAL_ARGUMENT_ERROR;
             return;
         }
         if (collatorPrimaryOnly_->compare(
                 *static_cast<UnicodeString *>(firstCharsInScripts_->elementAt(0)),
                 emptyString_, status) == UCOL_EQUAL) {
             firstCharsInScripts_->removeElementAt(0);
         } else {
             break;
         }
     }

     // Chinese index characters, which are specific to each of the several Chinese tailorings,
     // take precedence over the single locale data exemplar set per language.
     if (!addChineseIndexCharacters(status) && locale != NULL) {
         addIndexExemplars(*locale, status);
     }
 }


 //
 //  Comparison function for UVector<UnicodeString *> sorting with a collator.
 //
 static int32_t U_CALLCONV
 collatorComparator(const void *context, const void *left, const void *right) {
     const UElement *leftElement = static_cast<const UElement *>(left);
     const UElement *rightElement = static_cast<const UElement *>(right);
     const UnicodeString *leftString  = static_cast<const UnicodeString *>(leftElement->pointer);
     const UnicodeString *rightString = static_cast<const UnicodeString *>(rightElement->pointer);

     if (leftString == rightString) {
         // Catches case where both are NULL
         return 0;
     }
     if (leftString == NULL) {
         return 1;
     };
     if (rightString == NULL) {
         return -1;
     }
     const Collator *col = static_cast<const Collator *>(context);
     UErrorCode errorCode = U_ZERO_ERROR;
     return col->compare(*leftString, *rightString, errorCode);
 }

 //
 //  Comparison function for UVector<Record *> sorting with a collator.
 //
 static int32_t U_CALLCONV
 recordCompareFn(const void *context, const void *left, const void *right) {
     const UElement *leftElement = static_cast<const UElement *>(left);
     const UElement *rightElement = static_cast<const UElement *>(right);
     const AlphabeticIndex::Record *leftRec  = static_cast<const AlphabeticIndex::Record *>(leftElement->pointer);
     const AlphabeticIndex::Record *rightRec = static_cast<const AlphabeticIndex::Record *>(rightElement->pointer);
     const Collator *col = static_cast<const Collator *>(context);
     UErrorCode errorCode = U_ZERO_ERROR;
     return col->compare(leftRec->name_, rightRec->name_, errorCode);
 }

 UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) {
     if (U_FAILURE(status)) {
         return NULL;
     }
     LocalPointer<UVector> dest(new UVector(status));
     if (dest.isNull()) {
         status = U_MEMORY_ALLOCATION_ERROR;
         return NULL;
     }
     dest->setDeleter(uprv_deleteUObject);
     // Fetch the script-first-primary contractions which are defined in the root collator.
     // They all start with U+FDD1.
     UnicodeSet set;
     collatorPrimaryOnly_->internalAddContractions(0xFDD1, set, status);
     if (U_FAILURE(status)) {
         return NULL;
     }
     if (set.isEmpty()) {
         status = U_UNSUPPORTED_ERROR;
         return NULL;
     }
     UnicodeSetIterator iter(set);
     while (iter.next()) {
         const UnicodeString &boundary = iter.getString();
         uint32_t gcMask = U_GET_GC_MASK(boundary.char32At(1));
         if ((gcMask & (U_GC_L_MASK | U_GC_CN_MASK)) == 0) {
             // Ignore boundaries for the special reordering groups.
             // Take only those for "real scripts" (where the sample character is a Letter,
             // and the one for unassigned implicit weights (Cn).
             continue;
         }
         UnicodeString *s = new UnicodeString(boundary);
         if (s == NULL) {
             status = U_MEMORY_ALLOCATION_ERROR;
             return NULL;
         }
         dest->addElement(s, status);
     }
     return dest.orphan();
 }


 namespace {

 /**
  * Returns true if one index character string is "better" than the other.
  * Shorter NFKD is better, and otherwise NFKD-binary-less-than is
  * better, and otherwise binary-less-than is better.
  */
 UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer,
                                 const UnicodeString &one, const UnicodeString &other) {
     // This is called with primary-equal strings, but never with one.equals(other).
     UErrorCode status = U_ZERO_ERROR;
     UnicodeString n1 = nfkdNormalizer.normalize(one, status);
     UnicodeString n2 = nfkdNormalizer.normalize(other, status);
     if (U_FAILURE(status)) { return FALSE; }
     int32_t result = n1.countChar32() - n2.countChar32();
     if (result != 0) {
         return result < 0;
     }
     result = n1.compareCodePointOrder(n2);
     if (result != 0) {
         return result < 0;
     }
     return one.compareCodePointOrder(other) < 0;
 }

 }  // namespace

 //
 //  Constructor & Destructor for AlphabeticIndex::Record
 //
 //     Records are internal only, instances are not directly surfaced in the public API.
 //     This class is mostly struct-like, with all public fields.

 AlphabeticIndex::Record::Record(const UnicodeString &name, const void *data)
         : name_(name), data_(data) {}

 AlphabeticIndex::Record::~Record() {
 }


 AlphabeticIndex & AlphabeticIndex::addRecord(const UnicodeString &name, const void *data, UErrorCode &status) {
     if (U_FAILURE(status)) {
         return *this;
     }
     if (inputList_ == NULL) {
         inputList_ = new UVector(status);
         if (inputList_ == NULL) {
             status = U_MEMORY_ALLOCATION_ERROR;
             return *this;
         }
         inputList_->setDeleter(alphaIndex_deleteRecord);
     }
     Record *r = new Record(name, data);
     if (r == NULL) {
         status = U_MEMORY_ALLOCATION_ERROR;
         return *this;
     }
     inputList_->addElement(r, status);
     clearBuckets();
     //std::string ss;
     //std::string ss2;
     //std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" <<
     //             "   sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl;
     return *this;
 }


 AlphabeticIndex &AlphabeticIndex::clearRecords(UErrorCode &status) {
     if (U_SUCCESS(status) && inputList_ != NULL && !inputList_->isEmpty()) {
         inputList_->removeAllElements();
         clearBuckets();
     }
     return *this;
 }

 int32_t AlphabeticIndex::getBucketIndex(const UnicodeString &name, UErrorCode &status) {
     initBuckets(status);
     if (U_FAILURE(status)) {
         return 0;
     }
     return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, status);
 }


 int32_t AlphabeticIndex::getBucketIndex() const {
     return labelsIterIndex_;
 }


 UBool AlphabeticIndex::nextBucket(UErrorCode &status) {
     if (U_FAILURE(status)) {
         return FALSE;
     }
     if (buckets_ == NULL && currentBucket_ != NULL) {
         status = U_ENUM_OUT_OF_SYNC_ERROR;
         return FALSE;
     }
     initBuckets(status);
     if (U_FAILURE(status)) {
         return FALSE;
     }
     ++labelsIterIndex_;
     if (labelsIterIndex_ >= buckets_->getBucketCount()) {
         labelsIterIndex_ = buckets_->getBucketCount();
         return FALSE;
     }
     currentBucket_ = getBucket(*buckets_->immutableVisibleList_, labelsIterIndex_);
     resetRecordIterator();
     return TRUE;
 }

 const UnicodeString &AlphabeticIndex::getBucketLabel() const {
     if (currentBucket_ != NULL) {
         return currentBucket_->label_;
     } else {
         return emptyString_;
     }
 }


 UAlphabeticIndexLabelType AlphabeticIndex::getBucketLabelType() const {
     if (currentBucket_ != NULL) {
         return currentBucket_->labelType_;
     } else {
         return U_ALPHAINDEX_NORMAL;
     }
 }


 int32_t AlphabeticIndex::getBucketRecordCount() const {
     if (currentBucket_ != NULL && currentBucket_->records_ != NULL) {
         return currentBucket_->records_->size();
     } else {
         return 0;
     }
 }


 AlphabeticIndex &AlphabeticIndex::resetBucketIterator(UErrorCode &status) {
     if (U_FAILURE(status)) {
         return *this;
     }
     internalResetBucketIterator();
     return *this;
 }


 UBool AlphabeticIndex::nextRecord(UErrorCode &status) {
     if (U_FAILURE(status)) {
         return FALSE;
     }
     if (currentBucket_ == NULL) {
         // We are trying to iterate over the items in a bucket, but there is no
         // current bucket from the enumeration of buckets.
         status = U_INVALID_STATE_ERROR;
         return FALSE;
     }
     if (buckets_ == NULL) {
         status = U_ENUM_OUT_OF_SYNC_ERROR;
         return FALSE;
     }
     if (currentBucket_->records_ == NULL) {
         return FALSE;
     }
     ++itemsIterIndex_;
     if (itemsIterIndex_ >= currentBucket_->records_->size()) {
         itemsIterIndex_  = currentBucket_->records_->size();
         return FALSE;
     }
     return TRUE;
 }


 const UnicodeString &AlphabeticIndex::getRecordName() const {
     const UnicodeString *retStr = &emptyString_;
     if (currentBucket_ != NULL && currentBucket_->records_ != NULL &&
         itemsIterIndex_ >= 0 &&
         itemsIterIndex_ < currentBucket_->records_->size()) {
             Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
             retStr = &item->name_;
     }
     return *retStr;
 }

 const void *AlphabeticIndex::getRecordData() const {
     const void *retPtr = NULL;
     if (currentBucket_ != NULL && currentBucket_->records_ != NULL &&
         itemsIterIndex_ >= 0 &&
         itemsIterIndex_ < currentBucket_->records_->size()) {
             Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
             retPtr = item->data_;
     }
     return retPtr;
 }


 AlphabeticIndex & AlphabeticIndex::resetRecordIterator() {
     itemsIterIndex_ = -1;
     return *this;
 }


 AlphabeticIndex::Bucket::Bucket(const UnicodeString &label,
                                 const UnicodeString &lowerBoundary,
                                 UAlphabeticIndexLabelType type)
         : label_(label), lowerBoundary_(lowerBoundary), labelType_(type),
           displayBucket_(NULL), displayIndex_(-1),
           records_(NULL) {
 }


 AlphabeticIndex::Bucket::~Bucket() {
     delete records_;
 }

 U_NAMESPACE_END

 #endif  // !UCONFIG_NO_COLLATION