common/triedict.cpp - platform/external/icu4c.git - Git at Google

 /**
  *******************************************************************************
  * Copyright (C) 2006-2008, International Business Machines Corporation        *
  * and others. All Rights Reserved.                                            *
  *******************************************************************************
  */

 #include "unicode/utypes.h"

 #if !UCONFIG_NO_BREAK_ITERATION

 #include "triedict.h"
 #include "unicode/chariter.h"
 #include "unicode/uchriter.h"
 #include "unicode/strenum.h"
 #include "unicode/uenum.h"
 #include "unicode/udata.h"
 #include "cmemory.h"
 #include "udataswp.h"
 #include "uvector.h"
 #include "uvectr32.h"
 #include "uarrsort.h"

 //#define DEBUG_TRIE_DICT 1

 #ifdef DEBUG_TRIE_DICT
 #include <sys/times.h>
 #include <limits.h>
 #include <stdio.h>
 #endif

 U_NAMESPACE_BEGIN

 /*******************************************************************
  * TrieWordDictionary
  */

 TrieWordDictionary::TrieWordDictionary() {
 }

 TrieWordDictionary::~TrieWordDictionary() {
 }

 /*******************************************************************
  * MutableTrieDictionary
  */

 // Node structure for the ternary, uncompressed trie
 struct TernaryNode : public UMemory {
     UChar       ch;         // UTF-16 code unit
     uint16_t    flags;      // Flag word
     TernaryNode *low;       // Less-than link
     TernaryNode *equal;     // Equal link
     TernaryNode *high;      // Greater-than link

     TernaryNode(UChar uc);
     ~TernaryNode();
 };

 enum MutableTrieNodeFlags {
     kEndsWord = 0x0001      // This node marks the end of a valid word
 };

 inline
 TernaryNode::TernaryNode(UChar uc) {
     ch = uc;
     flags = 0;
     low = NULL;
     equal = NULL;
     high = NULL;
 }

 // Not inline since it's recursive
 TernaryNode::~TernaryNode() {
     delete low;
     delete equal;
     delete high;
 }

 MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status ) {
     // Start the trie off with something. Having the root node already present
     // cuts a special case out of the search/insertion functions.
     // Making it a median character cuts the worse case for searches from
     // 4x a balanced trie to 2x a balanced trie. It's best to choose something
     // that starts a word that is midway in the list.
     fTrie = new TernaryNode(median);
     if (fTrie == NULL) {
         status = U_MEMORY_ALLOCATION_ERROR;
     }
     fIter = utext_openUChars(NULL, NULL, 0, &status);
     if (U_SUCCESS(status) && fIter == NULL) {
         status = U_MEMORY_ALLOCATION_ERROR;
     }
 }

 MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status ) {
     fTrie = NULL;
     fIter = utext_openUChars(NULL, NULL, 0, &status);
     if (U_SUCCESS(status) && fIter == NULL) {
         status = U_MEMORY_ALLOCATION_ERROR;
     }
 }

 MutableTrieDictionary::~MutableTrieDictionary() {
     delete fTrie;
     utext_close(fIter);
 }

 int32_t
 MutableTrieDictionary::search( UText *text,
                                    int32_t maxLength,
                                    int32_t *lengths,
                                    int &count,
                                    int limit,
                                    TernaryNode *&parent,
                                    UBool &pMatched ) const {
     // TODO: current implementation works in UTF-16 space
     const TernaryNode *up = NULL;
     const TernaryNode *p = fTrie;
     int mycount = 0;
     pMatched = TRUE;
     int i;

     UChar uc = utext_current32(text);
     for (i = 0; i < maxLength && p != NULL; ++i) {
         while (p != NULL) {
             if (uc < p->ch) {
                 up = p;
                 p = p->low;
             }
             else if (uc == p->ch) {
                 break;
             }
             else {
                 up = p;
                 p = p->high;
             }
         }
         if (p == NULL) {
             pMatched = FALSE;
             break;
         }
         // Must be equal to get here
         if (limit > 0 && (p->flags & kEndsWord)) {
             lengths[mycount++] = i+1;
             --limit;
         }
         up = p;
         p = p->equal;
         uc = utext_next32(text);
         uc = utext_current32(text);
     }

     // Note that there is no way to reach here with up == 0 unless
     // maxLength is 0 coming in.
     parent = (TernaryNode *)up;
     count = mycount;
     return i;
 }

 void
 MutableTrieDictionary::addWord( const UChar *word,
                                 int32_t length,
                                 UErrorCode &status ) {
 #if 0
     if (length <= 0) {
         status = U_ILLEGAL_ARGUMENT_ERROR;
         return;
     }
 #endif
     TernaryNode *parent;
     UBool pMatched;
     int count;
     fIter = utext_openUChars(fIter, word, length, &status);

     int matched;
     matched = search(fIter, length, NULL, count, 0, parent, pMatched);

     while (matched++ < length) {
         UChar32 uc = utext_next32(fIter);  // TODO:  supplemetary support?
         U_ASSERT(uc != U_SENTINEL);
         TernaryNode *newNode = new TernaryNode(uc);
         if (newNode == NULL) {
             status = U_MEMORY_ALLOCATION_ERROR;
             return;
         }
         if (pMatched) {
             parent->equal = newNode;
         }
         else {
             pMatched = TRUE;
             if (uc < parent->ch) {
                 parent->low = newNode;
             }
             else {
                 parent->high = newNode;
             }
         }
         parent = newNode;
     }

     parent->flags |= kEndsWord;
 }

 #if 0
 void
 MutableTrieDictionary::addWords( UEnumeration *words,
                                   UErrorCode &status ) {
     int32_t length;
     const UChar *word;
     while ((word = uenum_unext(words, &length, &status)) && U_SUCCESS(status)) {
         addWord(word, length, status);
     }
 }
 #endif

 int32_t
 MutableTrieDictionary::matches( UText *text,
                                 int32_t maxLength,
                                 int32_t *lengths,
                                 int &count,
                                 int limit ) const {
     TernaryNode *parent;
     UBool pMatched;
     return search(text, maxLength, lengths, count, limit, parent, pMatched);
 }

 // Implementation of iteration for MutableTrieDictionary
 class MutableTrieEnumeration : public StringEnumeration {
 private:
     UStack      fNodeStack;     // Stack of nodes to process
     UVector32   fBranchStack;   // Stack of which branch we are working on
     TernaryNode *fRoot;         // Root node
     enum StackBranch {
         kLessThan,
         kEqual,
         kGreaterThan,
         kDone
     };

 public:
     static UClassID U_EXPORT2 getStaticClassID(void);
     virtual UClassID getDynamicClassID(void) const;
 public:
     MutableTrieEnumeration(TernaryNode *root, UErrorCode &status)
         : fNodeStack(status), fBranchStack(status) {
         fRoot = root;
         fNodeStack.push(root, status);
         fBranchStack.push(kLessThan, status);
         unistr.remove();
     }

     virtual ~MutableTrieEnumeration() {
     }

     virtual StringEnumeration *clone() const {
         UErrorCode status = U_ZERO_ERROR;
         return new MutableTrieEnumeration(fRoot, status);
     }

     virtual const UnicodeString *snext(UErrorCode &status) {
         if (fNodeStack.empty() || U_FAILURE(status)) {
             return NULL;
         }
         TernaryNode *node = (TernaryNode *) fNodeStack.peek();
         StackBranch where = (StackBranch) fBranchStack.peeki();
         while (!fNodeStack.empty() && U_SUCCESS(status)) {
             UBool emit;
             UBool equal;

             switch (where) {
             case kLessThan:
                 if (node->low != NULL) {
                     fBranchStack.setElementAt(kEqual, fBranchStack.size()-1);
                     node = (TernaryNode *) fNodeStack.push(node->low, status);
                     where = (StackBranch) fBranchStack.push(kLessThan, status);
                     break;
                 }
             case kEqual:
                 emit = (node->flags & kEndsWord) != 0;
                 equal = (node->equal != NULL);
                 // If this node should be part of the next emitted string, append
                 // the UChar to the string, and make sure we pop it when we come
                 // back to this node. The character should only be in the string
                 // for as long as we're traversing the equal subtree of this node
                 if (equal || emit) {
                     unistr.append(node->ch);
                     fBranchStack.setElementAt(kGreaterThan, fBranchStack.size()-1);
                 }
                 if (equal) {
                     node = (TernaryNode *) fNodeStack.push(node->equal, status);
                     where = (StackBranch) fBranchStack.push(kLessThan, status);
                 }
                 if (emit) {
                     return &unistr;
                 }
                 if (equal) {
                     break;
                 }
             case kGreaterThan:
                 // If this node's character is in the string, remove it.
                 if (node->equal != NULL || (node->flags & kEndsWord)) {
                     unistr.truncate(unistr.length()-1);
                 }
                 if (node->high != NULL) {
                     fBranchStack.setElementAt(kDone, fBranchStack.size()-1);
                     node = (TernaryNode *) fNodeStack.push(node->high, status);
                     where = (StackBranch) fBranchStack.push(kLessThan, status);
                     break;
                 }
             case kDone:
                 fNodeStack.pop();
                 fBranchStack.popi();
                 node = (TernaryNode *) fNodeStack.peek();
                 where = (StackBranch) fBranchStack.peeki();
                 break;
             default:
                 return NULL;
             }
         }
         return NULL;
     }

     // Very expensive, but this should never be used.
     virtual int32_t count(UErrorCode &status) const {
         MutableTrieEnumeration counter(fRoot, status);
         int32_t result = 0;
         while (counter.snext(status) != NULL && U_SUCCESS(status)) {
             ++result;
         }
         return result;
     }

     virtual void reset(UErrorCode &status) {
         fNodeStack.removeAllElements();
         fBranchStack.removeAllElements();
         fNodeStack.push(fRoot, status);
         fBranchStack.push(kLessThan, status);
         unistr.remove();
     }
 };

 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(MutableTrieEnumeration)

 StringEnumeration *
 MutableTrieDictionary::openWords( UErrorCode &status ) const {
     if (U_FAILURE(status)) {
         return NULL;
     }
     return new MutableTrieEnumeration(fTrie, status);
 }

 /*******************************************************************
  * CompactTrieDictionary
  */

 struct CompactTrieHeader {
     uint32_t        size;           // Size of the data in bytes
     uint32_t        magic;          // Magic number (including version)
     uint16_t        nodeCount;      // Number of entries in offsets[]
     uint16_t        root;           // Node number of the root node
     uint32_t        offsets[1];      // Offsets to nodes from start of data
 };

 // Note that to avoid platform-specific alignment issues, all members of the node
 // structures should be the same size, or should contain explicit padding to
 // natural alignment boundaries.

 // We can't use a bitfield for the flags+count field, because the layout of those
 // is not portable. 12 bits of count allows for up to 4096 entries in a node.
 struct CompactTrieNode {
     uint16_t        flagscount;     // Count of sub-entries, plus flags
 };

 enum CompactTrieNodeFlags {
     kVerticalNode   = 0x1000,       // This is a vertical node
     kParentEndsWord = 0x2000,       // The node whose equal link points to this ends a word
     kReservedFlag1  = 0x4000,
     kReservedFlag2  = 0x8000,
     kCountMask      = 0x0FFF,       // The count portion of flagscount
     kFlagMask       = 0xF000        // The flags portion of flagscount
 };

 // The two node types are distinguished by the kVerticalNode flag.

 struct CompactTrieHorizontalEntry {
     uint16_t        ch;             // UChar
     uint16_t        equal;          // Equal link node index
 };

 // We don't use inheritance here because C++ does not guarantee that the
 // base class comes first in memory!!

 struct CompactTrieHorizontalNode {
     uint16_t        flagscount;     // Count of sub-entries, plus flags
     CompactTrieHorizontalEntry      entries[1];
 };

 struct CompactTrieVerticalNode {
     uint16_t        flagscount;     // Count of sub-entries, plus flags
     uint16_t        equal;          // Equal link node index
     uint16_t        chars[1];       // Code units
 };

 // {'Dic', 1}, version 1
 #define COMPACT_TRIE_MAGIC_1 0x44696301

 CompactTrieDictionary::CompactTrieDictionary(UDataMemory *dataObj,
                                                 UErrorCode &status )
 : fUData(dataObj)
 {
     fData = (const CompactTrieHeader *) udata_getMemory(dataObj);
     fOwnData = FALSE;
     if (fData->magic != COMPACT_TRIE_MAGIC_1) {
         status = U_ILLEGAL_ARGUMENT_ERROR;
         fData = NULL;
     }
 }
 CompactTrieDictionary::CompactTrieDictionary( const void *data,
                                                 UErrorCode &status )
 : fUData(NULL)
 {
     fData = (const CompactTrieHeader *) data;
     fOwnData = FALSE;
     if (fData->magic != COMPACT_TRIE_MAGIC_1) {
         status = U_ILLEGAL_ARGUMENT_ERROR;
         fData = NULL;
     }
 }

 CompactTrieDictionary::CompactTrieDictionary( const MutableTrieDictionary &dict,
                                                 UErrorCode &status )
 : fUData(NULL)
 {
     fData = compactMutableTrieDictionary(dict, status);
     fOwnData = !U_FAILURE(status);
 }

 CompactTrieDictionary::~CompactTrieDictionary() {
     if (fOwnData) {
         uprv_free((void *)fData);
     }
     if (fUData) {
         udata_close(fUData);
     }
 }

 uint32_t
 CompactTrieDictionary::dataSize() const {
     return fData->size;
 }

 const void *
 CompactTrieDictionary::data() const {
     return fData;
 }

 // This function finds the address of a node for us, given its node ID
 static inline const CompactTrieNode *
 getCompactNode(const CompactTrieHeader *header, uint16_t node) {
     return (const CompactTrieNode *)((const uint8_t *)header + header->offsets[node]);
 }

 int32_t
 CompactTrieDictionary::matches( UText *text,
                                 int32_t maxLength,
                                 int32_t *lengths,
                                 int &count,
                                 int limit ) const {
     // TODO: current implementation works in UTF-16 space
     const CompactTrieNode *node = getCompactNode(fData, fData->root);
     int mycount = 0;

     UChar uc = utext_current32(text);
     int i = 0;

     while (node != NULL) {
         // Check if the node we just exited ends a word
         if (limit > 0 && (node->flagscount & kParentEndsWord)) {
             lengths[mycount++] = i;
             --limit;
         }
         // Check that we haven't exceeded the maximum number of input characters.
         // We have to do that here rather than in the while condition so that
         // we can check for ending a word, above.
         if (i >= maxLength) {
             break;
         }

         int nodeCount = (node->flagscount & kCountMask);
         if (nodeCount == 0) {
             // Special terminal node; return now
             break;
         }
         if (node->flagscount & kVerticalNode) {
             // Vertical node; check all the characters in it
             const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNode *)node;
             for (int j = 0; j < nodeCount && i < maxLength; ++j) {
                 if (uc != vnode->chars[j]) {
                     // We hit a non-equal character; return
                     goto exit;
                 }
                 utext_next32(text);
                 uc = utext_current32(text);
                 ++i;
             }
             // To get here we must have come through the whole list successfully;
             // go on to the next node. Note that a word cannot end in the middle
             // of a vertical node.
             node = getCompactNode(fData, vnode->equal);
         }
         else {
             // Horizontal node; do binary search
             const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)node;
             int low = 0;
             int high = nodeCount-1;
             int middle;
             node = NULL;    // If we don't find a match, we'll fall out of the loop
             while (high >= low) {
                 middle = (high+low)/2;
                 if (uc == hnode->entries[middle].ch) {
                     // We hit a match; get the next node and next character
                     node = getCompactNode(fData, hnode->entries[middle].equal);
                     utext_next32(text);
                     uc = utext_current32(text);
                     ++i;
                     break;
                 }
                 else if (uc < hnode->entries[middle].ch) {
                     high = middle-1;
                 }
                 else {
                     low = middle+1;
                 }
             }
         }
     }
 exit:
     count = mycount;
     return i;
 }

 // Implementation of iteration for CompactTrieDictionary
 class CompactTrieEnumeration : public StringEnumeration {
 private:
     UVector32               fNodeStack;     // Stack of nodes to process
     UVector32               fIndexStack;    // Stack of where in node we are
     const CompactTrieHeader *fHeader;       // Trie data

 public:
     static UClassID U_EXPORT2 getStaticClassID(void);
     virtual UClassID getDynamicClassID(void) const;
 public:
     CompactTrieEnumeration(const CompactTrieHeader *header, UErrorCode &status)
         : fNodeStack(status), fIndexStack(status) {
         fHeader = header;
         fNodeStack.push(header->root, status);
         fIndexStack.push(0, status);
         unistr.remove();
     }

     virtual ~CompactTrieEnumeration() {
     }

     virtual StringEnumeration *clone() const {
         UErrorCode status = U_ZERO_ERROR;
         return new CompactTrieEnumeration(fHeader, status);
     }

     virtual const UnicodeString * snext(UErrorCode &status);

     // Very expensive, but this should never be used.
     virtual int32_t count(UErrorCode &status) const {
         CompactTrieEnumeration counter(fHeader, status);
         int32_t result = 0;
         while (counter.snext(status) != NULL && U_SUCCESS(status)) {
             ++result;
         }
         return result;
     }

     virtual void reset(UErrorCode &status) {
         fNodeStack.removeAllElements();
         fIndexStack.removeAllElements();
         fNodeStack.push(fHeader->root, status);
         fIndexStack.push(0, status);
         unistr.remove();
     }
 };

 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CompactTrieEnumeration)

 const UnicodeString *
 CompactTrieEnumeration::snext(UErrorCode &status) {
     if (fNodeStack.empty() || U_FAILURE(status)) {
         return NULL;
     }
     const CompactTrieNode *node = getCompactNode(fHeader, fNodeStack.peeki());
     int where = fIndexStack.peeki();
     while (!fNodeStack.empty() && U_SUCCESS(status)) {
         int nodeCount = (node->flagscount & kCountMask);
         UBool goingDown = FALSE;
         if (nodeCount == 0) {
             // Terminal node; go up immediately
             fNodeStack.popi();
             fIndexStack.popi();
             node = getCompactNode(fHeader, fNodeStack.peeki());
             where = fIndexStack.peeki();
         }
         else if (node->flagscount & kVerticalNode) {
             // Vertical node
             const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNode *)node;
             if (where == 0) {
                 // Going down
                 unistr.append((const UChar *)vnode->chars, (int32_t) nodeCount);
                 fIndexStack.setElementAt(1, fIndexStack.size()-1);
                 node = getCompactNode(fHeader, fNodeStack.push(vnode->equal, status));
                 where = fIndexStack.push(0, status);
                 goingDown = TRUE;
             }
             else {
                 // Going up
                 unistr.truncate(unistr.length()-nodeCount);
                 fNodeStack.popi();
                 fIndexStack.popi();
                 node = getCompactNode(fHeader, fNodeStack.peeki());
                 where = fIndexStack.peeki();
             }
         }
         else {
             // Horizontal node
             const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)node;
             if (where > 0) {
                 // Pop previous char
                 unistr.truncate(unistr.length()-1);
             }
             if (where < nodeCount) {
                 // Push on next node
                 unistr.append((UChar)hnode->entries[where].ch);
                 fIndexStack.setElementAt(where+1, fIndexStack.size()-1);
                 node = getCompactNode(fHeader, fNodeStack.push(hnode->entries[where].equal, status));
                 where = fIndexStack.push(0, status);
                 goingDown = TRUE;
             }
             else {
                 // Going up
                 fNodeStack.popi();
                 fIndexStack.popi();
                 node = getCompactNode(fHeader, fNodeStack.peeki());
                 where = fIndexStack.peeki();
             }
         }
         // Check if the parent of the node we've just gone down to ends a
         // word. If so, return it.
         if (goingDown && (node->flagscount & kParentEndsWord)) {
             return &unistr;
         }
     }
     return NULL;
 }

 StringEnumeration *
 CompactTrieDictionary::openWords( UErrorCode &status ) const {
     if (U_FAILURE(status)) {
         return NULL;
     }
     return new CompactTrieEnumeration(fData, status);
 }

 //
 // Below here is all code related to converting a ternary trie to a compact trie
 // and back again
 //

 // Helper classes to construct the compact trie
 class BuildCompactTrieNode: public UMemory {
  public:
     UBool           fParentEndsWord;
     UBool           fVertical;
     UBool           fHasDuplicate;
     int32_t         fNodeID;
     UnicodeString   fChars;

  public:
     BuildCompactTrieNode(UBool parentEndsWord, UBool vertical, UStack &nodes, UErrorCode &status) {
         fParentEndsWord = parentEndsWord;
         fHasDuplicate = FALSE;
         fVertical = vertical;
         fNodeID = nodes.size();
         nodes.push(this, status);
     }

     virtual ~BuildCompactTrieNode() {
     }

     virtual uint32_t size() {
         return sizeof(uint16_t);
     }

     virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &/*translate*/) {
         // Write flag/count
         *((uint16_t *)(bytes+offset)) = (fChars.length() & kCountMask)
             | (fVertical ? kVerticalNode : 0) | (fParentEndsWord ? kParentEndsWord : 0 );
         offset += sizeof(uint16_t);
     }
 };

 class BuildCompactTrieHorizontalNode: public BuildCompactTrieNode {
  public:
     UStack          fLinks;

  public:
     BuildCompactTrieHorizontalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status)
         : BuildCompactTrieNode(parentEndsWord, FALSE, nodes, status), fLinks(status) {
     }

     virtual ~BuildCompactTrieHorizontalNode() {
     }

     virtual uint32_t size() {
         return offsetof(CompactTrieHorizontalNode,entries) +
                 (fChars.length()*sizeof(CompactTrieHorizontalEntry));
     }

     virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &translate) {
         BuildCompactTrieNode::write(bytes, offset, translate);
         int32_t count = fChars.length();
         for (int32_t i = 0; i < count; ++i) {
             CompactTrieHorizontalEntry *entry = (CompactTrieHorizontalEntry *)(bytes+offset);
             entry->ch = fChars[i];
             entry->equal = translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeID);
 #ifdef DEBUG_TRIE_DICT
             if (entry->equal == 0) {
                 fprintf(stderr, "ERROR: horizontal link %d, logical node %d maps to physical node zero\n",
                         i, ((BuildCompactTrieNode *)fLinks[i])->fNodeID);
             }
 #endif
             offset += sizeof(CompactTrieHorizontalEntry);
         }
     }

     void addNode(UChar ch, BuildCompactTrieNode *link, UErrorCode &status) {
         fChars.append(ch);
         fLinks.push(link, status);
     }
 };

 class BuildCompactTrieVerticalNode: public BuildCompactTrieNode {
  public:
     BuildCompactTrieNode    *fEqual;

  public:
     BuildCompactTrieVerticalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status)
         : BuildCompactTrieNode(parentEndsWord, TRUE, nodes, status) {
         fEqual = NULL;
     }

     virtual ~BuildCompactTrieVerticalNode() {
     }

     virtual uint32_t size() {
         return offsetof(CompactTrieVerticalNode,chars) + (fChars.length()*sizeof(uint16_t));
     }

     virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &translate) {
         CompactTrieVerticalNode *node = (CompactTrieVerticalNode *)(bytes+offset);
         BuildCompactTrieNode::write(bytes, offset, translate);
         node->equal = translate.elementAti(fEqual->fNodeID);
         offset += sizeof(node->equal);
 #ifdef DEBUG_TRIE_DICT
         if (node->equal == 0) {
             fprintf(stderr, "ERROR: vertical link, logical node %d maps to physical node zero\n",
                     fEqual->fNodeID);
         }
 #endif
         fChars.extract(0, fChars.length(), (UChar *)node->chars);
         offset += sizeof(uint16_t)*fChars.length();
     }

     void addChar(UChar ch) {
         fChars.append(ch);
     }

     void setLink(BuildCompactTrieNode *node) {
         fEqual = node;
     }
 };

 // Forward declaration
 static void walkHorizontal(const TernaryNode *node,
                             BuildCompactTrieHorizontalNode *building,
                             UStack &nodes,
                             UErrorCode &status);

 // Convert one node. Uses recursion.

 static BuildCompactTrieNode *
 compactOneNode(const TernaryNode *node, UBool parentEndsWord, UStack &nodes, UErrorCode &status) {
     if (U_FAILURE(status)) {
         return NULL;
     }
     BuildCompactTrieNode *result = NULL;
     UBool horizontal = (node->low != NULL || node->high != NULL);
     if (horizontal) {
         BuildCompactTrieHorizontalNode *hResult =
                 new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status);
         if (hResult == NULL) {
             status = U_MEMORY_ALLOCATION_ERROR;
             return NULL;
         }
         if (U_SUCCESS(status)) {
             walkHorizontal(node, hResult, nodes, status);
             result = hResult;
         }
     }
     else {
         BuildCompactTrieVerticalNode *vResult =
                 new BuildCompactTrieVerticalNode(parentEndsWord, nodes, status);
         if (vResult == NULL) {
             status = U_MEMORY_ALLOCATION_ERROR;
         }
         else if (U_SUCCESS(status)) {
             UBool   endsWord = FALSE;
             // Take up nodes until we end a word, or hit a node with < or > links
             do {
                 vResult->addChar(node->ch);
                 endsWord = (node->flags & kEndsWord) != 0;
                 node = node->equal;
             }
             while(node != NULL && !endsWord && node->low == NULL && node->high == NULL);
             if (node == NULL) {
                 if (!endsWord) {
                     status = U_ILLEGAL_ARGUMENT_ERROR;  // Corrupt input trie
                 }
                 else {
                     vResult->setLink((BuildCompactTrieNode *)nodes[1]);
                 }
             }
             else {
                 vResult->setLink(compactOneNode(node, endsWord, nodes, status));
             }
             result = vResult;
         }
     }
     return result;
 }

 // Walk the set of peers at the same level, to build a horizontal node.
 // Uses recursion.

 static void walkHorizontal(const TernaryNode *node,
                             BuildCompactTrieHorizontalNode *building,
                             UStack &nodes,
                             UErrorCode &status) {
     while (U_SUCCESS(status) && node != NULL) {
         if (node->low != NULL) {
             walkHorizontal(node->low, building, nodes, status);
         }
         BuildCompactTrieNode *link = NULL;
         if (node->equal != NULL) {
             link = compactOneNode(node->equal, (node->flags & kEndsWord) != 0, nodes, status);
         }
         else if (node->flags & kEndsWord) {
             link = (BuildCompactTrieNode *)nodes[1];
         }
         if (U_SUCCESS(status) && link != NULL) {
             building->addNode(node->ch, link, status);
         }
         // Tail recurse manually instead of leaving it to the compiler.
         //if (node->high != NULL) {
         //    walkHorizontal(node->high, building, nodes, status);
         //}
         node = node->high;
     }
 }

 U_NAMESPACE_END
 U_NAMESPACE_USE
 U_CDECL_BEGIN
 static int32_t U_CALLCONV
 _sortBuildNodes(const void * /*context*/, const void *voidl, const void *voidr) {
     BuildCompactTrieNode *left = *(BuildCompactTrieNode **)voidl;
     BuildCompactTrieNode *right = *(BuildCompactTrieNode **)voidr;
     // Check for comparing a node to itself, to avoid spurious duplicates
     if (left == right) {
         return 0;
     }
     // Most significant is type of node. Can never coalesce.
     if (left->fVertical != right->fVertical) {
         return left->fVertical - right->fVertical;
     }
     // Next, the "parent ends word" flag. If that differs, we cannot coalesce.
     if (left->fParentEndsWord != right->fParentEndsWord) {
         return left->fParentEndsWord - right->fParentEndsWord;
     }
     // Next, the string. If that differs, we can never coalesce.
     int32_t result = left->fChars.compare(right->fChars);
     if (result != 0) {
         return result;
     }
     // We know they're both the same node type, so branch for the two cases.
     if (left->fVertical) {
         result = ((BuildCompactTrieVerticalNode *)left)->fEqual->fNodeID
                             - ((BuildCompactTrieVerticalNode *)right)->fEqual->fNodeID;
     }
     else {
         // We need to compare the links vectors. They should be the
         // same size because the strings were equal.
         // We compare the node IDs instead of the pointers, to handle
         // coalesced nodes.
         BuildCompactTrieHorizontalNode *hleft, *hright;
         hleft = (BuildCompactTrieHorizontalNode *)left;
         hright = (BuildCompactTrieHorizontalNode *)right;
         int32_t count = hleft->fLinks.size();
         for (int32_t i = 0; i < count && result == 0; ++i) {
             result = ((BuildCompactTrieNode *)(hleft->fLinks[i]))->fNodeID -
                      ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID;
         }
     }
     // If they are equal to each other, mark them (speeds coalescing)
     if (result == 0) {
         left->fHasDuplicate = TRUE;
         right->fHasDuplicate = TRUE;
     }
     return result;
 }
 U_CDECL_END
 U_NAMESPACE_BEGIN

 static void coalesceDuplicates(UStack &nodes, UErrorCode &status) {
     // We sort the array of nodes to place duplicates next to each other
     if (U_FAILURE(status)) {
         return;
     }
     int32_t size = nodes.size();
     void **array = (void **)uprv_malloc(sizeof(void *)*size);
     if (array == NULL) {
         status = U_MEMORY_ALLOCATION_ERROR;
         return;
     }
     (void) nodes.toArray(array);

     // Now repeatedly identify duplicates until there are no more
     int32_t dupes = 0;
     long    passCount = 0;
 #ifdef DEBUG_TRIE_DICT
     long    totalDupes = 0;
 #endif
     do {
         BuildCompactTrieNode *node;
         BuildCompactTrieNode *first = NULL;
         BuildCompactTrieNode **p;
         BuildCompactTrieNode **pFirst = NULL;
         int32_t counter = size - 2;
         // Sort the array, skipping nodes 0 and 1. Use quicksort for the first
         // pass for speed. For the second and subsequent passes, we use stable
         // (insertion) sort for two reasons:
         // 1. The array is already mostly ordered, so we get better performance.
         // 2. The way we find one and only one instance of a set of duplicates is to
         //    check that the node ID equals the array index. If we used an unstable
         //    sort for the second or later passes, it's possible that none of the
         //    duplicates would wind up with a node ID equal to its array index.
         //    The sort stability guarantees that, because as we coalesce more and
         //    more groups, the first element of the resultant group will be one of
         //    the first elements of the groups being coalesced.
         // To use quicksort for the second and subsequent passes, we would have to
         // find the minimum of the node numbers in a group, and set all the nodes
         // in the group to that node number.
         uprv_sortArray(array+2, counter, sizeof(void *), _sortBuildNodes, NULL, (passCount > 0), &status);
         dupes = 0;
         for (p = (BuildCompactTrieNode **)array + 2; counter > 0; --counter, ++p) {
             node = *p;
             if (node->fHasDuplicate) {
                 if (first == NULL) {
                     first = node;
                     pFirst = p;
                 }
                 else if (_sortBuildNodes(NULL, pFirst, p) != 0) {
                     // Starting a new run of dupes
                     first = node;
                     pFirst = p;
                 }
                 else if (node->fNodeID != first->fNodeID) {
                     // Slave one to the other, note duplicate
                     node->fNodeID = first->fNodeID;
                     dupes += 1;
                 }
             }
             else {
                 // This node has no dupes
                 first = NULL;
                 pFirst = NULL;
             }
         }
         passCount += 1;
 #ifdef DEBUG_TRIE_DICT
         totalDupes += dupes;
         fprintf(stderr, "Trie node dupe removal, pass %d: %d nodes tagged\n", passCount, dupes);
 #endif
     }
     while (dupes > 0);
 #ifdef DEBUG_TRIE_DICT
     fprintf(stderr, "Trie node dupe removal complete: %d tagged in %d passes\n", totalDupes, passCount);
 #endif

     // We no longer need the temporary array, as the nodes have all been marked appropriately.
     uprv_free(array);
 }

 U_NAMESPACE_END
 U_CDECL_BEGIN
 static void U_CALLCONV _deleteBuildNode(void *obj) {
     delete (BuildCompactTrieNode *) obj;
 }
 U_CDECL_END
 U_NAMESPACE_BEGIN

 CompactTrieHeader *
 CompactTrieDictionary::compactMutableTrieDictionary( const MutableTrieDictionary &dict,
                                 UErrorCode &status ) {
     if (U_FAILURE(status)) {
         return NULL;
     }
 #ifdef DEBUG_TRIE_DICT
     struct tms timing;
     struct tms previous;
     (void) ::times(&previous);
 #endif
     UStack nodes(_deleteBuildNode, NULL, status);      // Index of nodes

     // Add node 0, used as the NULL pointer/sentinel.
     nodes.addElement((int32_t)0, status);

     // Start by creating the special empty node we use to indicate that the parent
     // terminates a word. This must be node 1, because the builder assumes
     // that.
     if (U_FAILURE(status)) {
         return NULL;
     }
     BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, FALSE, nodes, status);
     if (terminal == NULL) {
         status = U_MEMORY_ALLOCATION_ERROR;
     }

     // This call does all the work of building the new trie structure. The root
     // will be node 2.
     BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, status);
 #ifdef DEBUG_TRIE_DICT
     (void) ::times(&timing);
     fprintf(stderr, "Compact trie built, %d nodes, time user %f system %f\n",
         nodes.size(), (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK,
         (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK);
     previous = timing;
 #endif

     // Now coalesce all duplicate nodes.
     coalesceDuplicates(nodes, status);
 #ifdef DEBUG_TRIE_DICT
     (void) ::times(&timing);
     fprintf(stderr, "Duplicates coalesced, time user %f system %f\n",
         (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK,
         (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK);
     previous = timing;
 #endif

     // Next, build the output trie.
     // First we compute all the sizes and build the node ID translation table.
     uint32_t totalSize = offsetof(CompactTrieHeader,offsets);
     int32_t count = nodes.size();
     int32_t nodeCount = 1;              // The sentinel node we already have
     BuildCompactTrieNode *node;
     int32_t i;
     UVector32 translate(count, status); // Should be no growth needed after this
     translate.push(0, status);          // The sentinel node

     if (U_FAILURE(status)) {
         return NULL;
     }

     for (i = 1; i < count; ++i) {
         node = (BuildCompactTrieNode *)nodes[i];
         if (node->fNodeID == i) {
             // Only one node out of each duplicate set is used
             if (i >= translate.size()) {
                 // Logically extend the mapping table
                 translate.setSize(i+1);
             }
             translate.setElementAt(nodeCount++, i);
             totalSize += node->size();
         }
     }

     // Check for overflowing 16 bits worth of nodes.
     if (nodeCount > 0x10000) {
         status = U_ILLEGAL_ARGUMENT_ERROR;
         return NULL;
     }

     // Add enough room for the offsets.
     totalSize += nodeCount*sizeof(uint32_t);
 #ifdef DEBUG_TRIE_DICT
     (void) ::times(&timing);
     fprintf(stderr, "Sizes/mapping done, time user %f system %f\n",
         (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK,
         (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK);
     previous = timing;
     fprintf(stderr, "%d nodes, %d unique, %d bytes\n", nodes.size(), nodeCount, totalSize);
 #endif
     uint8_t *bytes = (uint8_t *)uprv_malloc(totalSize);
     if (bytes == NULL) {
         status = U_MEMORY_ALLOCATION_ERROR;
         return NULL;
     }

     CompactTrieHeader *header = (CompactTrieHeader *)bytes;
     header->size = totalSize;
     header->nodeCount = nodeCount;
     header->offsets[0] = 0;                     // Sentinel
     header->root = translate.elementAti(root->fNodeID);
 #ifdef DEBUG_TRIE_DICT
     if (header->root == 0) {
         fprintf(stderr, "ERROR: root node %d translate to physical zero\n", root->fNodeID);
     }
 #endif
     uint32_t offset = offsetof(CompactTrieHeader,offsets)+(nodeCount*sizeof(uint32_t));
     nodeCount = 1;
     // Now write the data
     for (i = 1; i < count; ++i) {
         node = (BuildCompactTrieNode *)nodes[i];
         if (node->fNodeID == i) {
             header->offsets[nodeCount++] = offset;
             node->write(bytes, offset, translate);
         }
     }
 #ifdef DEBUG_TRIE_DICT
     (void) ::times(&timing);
     fprintf(stderr, "Trie built, time user %f system %f\n",
         (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK,
         (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK);
     previous = timing;
     fprintf(stderr, "Final offset is %d\n", offset);

     // Collect statistics on node types and sizes
     int hCount = 0;
     int vCount = 0;
     size_t hSize = 0;
     size_t vSize = 0;
     size_t hItemCount = 0;
     size_t vItemCount = 0;
     uint32_t previousOff = offset;
     for (uint16_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) {
         const CompactTrieNode *node = getCompactNode(header, nodeIdx);
         if (node->flagscount & kVerticalNode) {
             vCount += 1;
             vItemCount += (node->flagscount & kCountMask);
             vSize += previousOff-header->offsets[nodeIdx];
         }
         else {
             hCount += 1;
             hItemCount += (node->flagscount & kCountMask);
             hSize += previousOff-header->offsets[nodeIdx];
         }
         previousOff = header->offsets[nodeIdx];
     }
     fprintf(stderr, "Horizontal nodes: %d total, average %f bytes with %f items\n", hCount,
                 (double)hSize/hCount, (double)hItemCount/hCount);
     fprintf(stderr, "Vertical nodes: %d total, average %f bytes with %f items\n", vCount,
                 (double)vSize/vCount, (double)vItemCount/vCount);
 #endif

     if (U_FAILURE(status)) {
         uprv_free(bytes);
         header = NULL;
     }
     else {
         header->magic = COMPACT_TRIE_MAGIC_1;
     }
     return header;
 }

 // Forward declaration
 static TernaryNode *
 unpackOneNode( const CompactTrieHeader *header, const CompactTrieNode *node, UErrorCode &status );


 // Convert a horizontal node (or subarray thereof) into a ternary subtrie
 static TernaryNode *
 unpackHorizontalArray( const CompactTrieHeader *header, const CompactTrieHorizontalEntry *array,
                             int low, int high, UErrorCode &status ) {
     if (U_FAILURE(status) || low > high) {
         return NULL;
     }
     int middle = (low+high)/2;
     TernaryNode *result = new TernaryNode(array[middle].ch);
     if (result == NULL) {
         status = U_MEMORY_ALLOCATION_ERROR;
         return NULL;
     }
     const CompactTrieNode *equal = getCompactNode(header, array[middle].equal);
     if (equal->flagscount & kParentEndsWord) {
         result->flags |= kEndsWord;
     }
     result->low = unpackHorizontalArray(header, array, low, middle-1, status);
     result->high = unpackHorizontalArray(header, array, middle+1, high, status);
     result->equal = unpackOneNode(header, equal, status);
     return result;
 }

 // Convert one compact trie node into a ternary subtrie
 static TernaryNode *
 unpackOneNode( const CompactTrieHeader *header, const CompactTrieNode *node, UErrorCode &status ) {
     int nodeCount = (node->flagscount & kCountMask);
     if (nodeCount == 0 || U_FAILURE(status)) {
         // Failure, or terminal node
         return NULL;
     }
     if (node->flagscount & kVerticalNode) {
         const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNode *)node;
         TernaryNode *head = NULL;
         TernaryNode *previous = NULL;
         TernaryNode *latest = NULL;
         for (int i = 0; i < nodeCount; ++i) {
             latest = new TernaryNode(vnode->chars[i]);
             if (latest == NULL) {
                 status = U_MEMORY_ALLOCATION_ERROR;
                 break;
             }
             if (head == NULL) {
                 head = latest;
             }
             if (previous != NULL) {
                 previous->equal = latest;
             }
             previous = latest;
         }
         if (latest != NULL) {
             const CompactTrieNode *equal = getCompactNode(header, vnode->equal);
             if (equal->flagscount & kParentEndsWord) {
                 latest->flags |= kEndsWord;
             }
             latest->equal = unpackOneNode(header, equal, status);
         }
         return head;
     }
     else {
         // Horizontal node
         const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)node;
         return unpackHorizontalArray(header, &hnode->entries[0], 0, nodeCount-1, status);
     }
 }

 MutableTrieDictionary *
 CompactTrieDictionary::cloneMutable( UErrorCode &status ) const {
     MutableTrieDictionary *result = new MutableTrieDictionary( status );
     if (result == NULL) {
         status = U_MEMORY_ALLOCATION_ERROR;
         return NULL;
     }
     TernaryNode *root = unpackOneNode(fData, getCompactNode(fData, fData->root), status);
     if (U_FAILURE(status)) {
         delete root;    // Clean up
         delete result;
         return NULL;
     }
     result->fTrie = root;
     return result;
 }

 U_NAMESPACE_END

 U_CAPI int32_t U_EXPORT2
 triedict_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
            UErrorCode *status) {

     if (status == NULL || U_FAILURE(*status)) {
         return 0;
     }
     if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) {
         *status=U_ILLEGAL_ARGUMENT_ERROR;
         return 0;
     }

     //
     //  Check that the data header is for for dictionary data.
     //    (Header contents are defined in genxxx.cpp)
     //
     const UDataInfo *pInfo = (const UDataInfo *)((const uint8_t *)inData+4);
     if(!(  pInfo->dataFormat[0]==0x54 &&   /* dataFormat="TrDc" */
            pInfo->dataFormat[1]==0x72 &&
            pInfo->dataFormat[2]==0x44 &&
            pInfo->dataFormat[3]==0x63 &&
            pInfo->formatVersion[0]==1  )) {
         udata_printError(ds, "triedict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n",
                          pInfo->dataFormat[0], pInfo->dataFormat[1],
                          pInfo->dataFormat[2], pInfo->dataFormat[3],
                          pInfo->formatVersion[0]);
         *status=U_UNSUPPORTED_ERROR;
         return 0;
     }

     //
     // Swap the data header.  (This is the generic ICU Data Header, not the
     //                         CompactTrieHeader).  This swap also conveniently gets us
     //                         the size of the ICU d.h., which lets us locate the start
     //                         of the RBBI specific data.
     //
     int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);

     //
     // Get the CompactTrieHeader, and check that it appears to be OK.
     //
     const uint8_t  *inBytes =(const uint8_t *)inData+headerSize;
     const CompactTrieHeader *header = (const CompactTrieHeader *)inBytes;
     if (ds->readUInt32(header->magic) != COMPACT_TRIE_MAGIC_1
             || ds->readUInt32(header->size) < sizeof(CompactTrieHeader))
     {
         udata_printError(ds, "triedict_swap(): CompactTrieHeader is invalid.\n");
         *status=U_UNSUPPORTED_ERROR;
         return 0;
     }

     //
     // Prefight operation?  Just return the size
     //
     uint32_t totalSize = ds->readUInt32(header->size);
     int32_t sizeWithUData = (int32_t)totalSize + headerSize;
     if (length < 0) {
         return sizeWithUData;
     }

     //
     // Check that length passed in is consistent with length from RBBI data header.
     //
     if (length < sizeWithUData) {
         udata_printError(ds, "triedict_swap(): too few bytes (%d after ICU Data header) for trie data.\n",
                             totalSize);
         *status=U_INDEX_OUTOFBOUNDS_ERROR;
         return 0;
         }

     //
     // Swap the Data.  Do the data itself first, then the CompactTrieHeader, because
     //                 we need to reference the header to locate the data, and an
     //                 inplace swap of the header leaves it unusable.
     //
     uint8_t             *outBytes = (uint8_t *)outData + headerSize;
     CompactTrieHeader   *outputHeader = (CompactTrieHeader *)outBytes;

 #if 0
     //
     // If not swapping in place, zero out the output buffer before starting.
     //
     if (inBytes != outBytes) {
         uprv_memset(outBytes, 0, totalSize);
     }

     // We need to loop through all the nodes in the offset table, and swap each one.
     uint16_t nodeCount = ds->readUInt16(header->nodeCount);
     // Skip node 0, which should always be 0.
     for (int i = 1; i < nodeCount; ++i) {
         uint32_t nodeOff = ds->readUInt32(header->offsets[i]);
         const CompactTrieNode *inNode = (const CompactTrieNode *)(inBytes + nodeOff);
         CompactTrieNode *outNode = (CompactTrieNode *)(outBytes + nodeOff);
         uint16_t flagscount = ds->readUInt16(inNode->flagscount);
         uint16_t itemCount = flagscount & kCountMask;
         ds->writeUInt16(&outNode->flagscount, flagscount);
         if (itemCount > 0) {
             if (flagscount & kVerticalNode) {
                 ds->swapArray16(ds, inBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars),
                                     itemCount*sizeof(uint16_t),
                                     outBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars), status);
                 uint16_t equal = ds->readUInt16(inBytes+nodeOff+offsetof(CompactTrieVerticalNode,equal);
                 ds->writeUInt16(outBytes+nodeOff+offsetof(CompactTrieVerticalNode,equal));
             }
             else {
                 const CompactTrieHorizontalNode *inHNode = (const CompactTrieHorizontalNode *)inNode;
                 CompactTrieHorizontalNode *outHNode = (CompactTrieHorizontalNode *)outNode;
                 for (int j = 0; j < itemCount; ++j) {
                     uint16_t word = ds->readUInt16(inHNode->entries[j].ch);
                     ds->writeUInt16(&outHNode->entries[j].ch, word);
                     word = ds->readUInt16(inHNode->entries[j].equal);
                     ds->writeUInt16(&outHNode->entries[j].equal, word);
                 }
             }
         }
     }
 #endif

     // All the data in all the nodes consist of 16 bit items. Swap them all at once.
     uint16_t nodeCount = ds->readUInt16(header->nodeCount);
     uint32_t nodesOff = offsetof(CompactTrieHeader,offsets)+((uint32_t)nodeCount*sizeof(uint32_t));
     ds->swapArray16(ds, inBytes+nodesOff, totalSize-nodesOff, outBytes+nodesOff, status);

     // Swap the header
     ds->writeUInt32(&outputHeader->size, totalSize);
     uint32_t magic = ds->readUInt32(header->magic);
     ds->writeUInt32(&outputHeader->magic, magic);
     ds->writeUInt16(&outputHeader->nodeCount, nodeCount);
     uint16_t root = ds->readUInt16(header->root);
     ds->writeUInt16(&outputHeader->root, root);
     ds->swapArray32(ds, inBytes+offsetof(CompactTrieHeader,offsets),
             sizeof(uint32_t)*(int32_t)nodeCount,
             outBytes+offsetof(CompactTrieHeader,offsets), status);

     return sizeWithUData;
 }

 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */