jni/share/userdict.cpp - platform/packages/inputmethods/PinyinIME - Git at Google

 /*
  * Copyright (C) 2009 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #include "../include/userdict.h"
 #include "../include/splparser.h"
 #include "../include/ngram.h"
 #include <stdio.h>
 #include <string.h>
 #include <stdlib.h>
 #include <cutils/log.h>
 #include <unistd.h>
 #include <fcntl.h>
 #include <sys/stat.h>
 #include <assert.h>
 #include <ctype.h>
 #include <sys/types.h>
 #include <sys/time.h>
 #include <time.h>
 #include <pthread.h>
 #include <math.h>

 namespace ime_pinyin {

 #ifdef ___DEBUG_PERF___
 // Performance-debugging helpers, compiled in only when ___DEBUG_PERF___ is
 // defined. The timestamps below are file-static and shared by every use of
 // the macros, so only one BEGIN/END interval is tracked at a time.
 // _ellapse_ holds the microseconds measured by the most recent
 // DEBUG_PERF_BEGIN / DEBUG_PERF_END pair.
 static uint64 _ellapse_ = 0;
 static struct timeval _tv_start_, _tv_end_;
 // Records the start timestamp of the interval.
 #define DEBUG_PERF_BEGIN \
     do { \
       gettimeofday(&_tv_start_, NULL); \
     } while(0)
 // Records the end timestamp and stores the elapsed microseconds in _ellapse_.
 #define DEBUG_PERF_END \
     do { \
       gettimeofday(&_tv_end_, NULL); \
       _ellapse_ = (_tv_end_.tv_sec - _tv_start_.tv_sec) * 1000000 + \
                   (_tv_end_.tv_usec - _tv_start_.tv_usec); \
     } while(0)
 // Logs the last measured interval, tagged with `message`. Note that the
 // expansion already ends with a semicolon.
 #define LOGD_PERF(message) \
     LOGD("PERFORMANCE[%s] %llu usec.", message, _ellapse_);
 #else
 // Without ___DEBUG_PERF___ all three macros expand to nothing.
 #define DEBUG_PERF_BEGIN
 #define DEBUG_PERF_END
 #define LOGD_PERF(message)
 #endif

 // XXX File load and write-back are serialized by g_mutex_
 // (taken in load() and around write_back() in close_dict()).
 static pthread_mutex_t g_mutex_ = PTHREAD_MUTEX_INITIALIZER;
 // Time of the most recent write-back done by close_dict(). It is compared
 // against each instance's load_time_ to detect that the in-memory copy is
 // older than the file on disk.
 static struct timeval g_last_update_ = {0, 0};

 // Computes the on-disk size, in bytes, that a dictionary described by `info`
 // must have. validate() compares this against the real file size.
 inline uint32 UserDict::get_dict_file_size(UserDictInfo * info) {
   // Trailer: the UserDictInfo record itself.
   size_t total = sizeof(*info);
   // Header: the 4-byte version word.
   total += 4;
   // Lemma blob.
   total += info->lemma_size;
   // Offsets table + scores table, 4 bytes per lemma each.
   total += (info->lemma_count << 3);
 #ifdef ___PREDICT_ENABLED___
   // Predicts table, 4 bytes per lemma.
   total += (info->lemma_count << 2);
 #endif
 #ifdef ___SYNC_ENABLED___
   // Sync table, 4 bytes per pending sync item.
   total += (info->sync_count << 2);
 #endif
   return total;
 }

 // Converts a packed raw score into the log-domain score used for ranking.
 //
 // raw_score carries the user frequency in its low 16 bits and the
 // last-modified-time (LMT) index in its high 16 bits. The frequency is
 // multiplied by an expand-factor that shrinks as the LMT index falls behind
 // the current time index (capped at 4 steps), normalized by the total
 // frequency, and scaled by NGram::kLogValueAmplifier.
 inline LmaScoreType UserDict::translate_score(int raw_score) {
   // 1) ori_freq: original user frequency
   uint32 ori_freq = extract_score_freq(raw_score);
   // 2) lmt_off: lmt index (week offset for example)
   uint64 lmt_off = ((raw_score & 0xffff0000) >> 16);
   if (kUserDictLMTBitWidth < 16) {
     // Keep only the low kUserDictLMTBitWidth bits. The previous mask,
     // ~(1 << width), cleared a single bit instead of the unused high bits.
     uint64 mask = (((uint64)1) << kUserDictLMTBitWidth) - 1;
     lmt_off &= mask;
   }
   // 3) now_off: current time index (current week offset for example)
   // assuming load_time_ is around current time
   uint64 now_off = load_time_.tv_sec;
   now_off = (now_off - kUserDictLMTSince) / kUserDictLMTGranularity;
   // Truncate to kUserDictLMTBitWidth bits, the same way build_score() does.
   now_off = (now_off << (64 - kUserDictLMTBitWidth));
   now_off = (now_off >> (64 - kUserDictLMTBitWidth));
   // 4) factor: decide expand-factor
   int delta = now_off - lmt_off;
   if (delta > 4)
     delta = 4;
   int factor = 80 - (delta << 4);

   double tf = (double)(dict_info_.total_nfreq + total_other_nfreq_);
   return (LmaScoreType)(log((double)factor * (double)ori_freq / tf)
                         * NGram::kLogValueAmplifier);
 }

 // Returns the user frequency, which occupies the lowest 16 bits of a raw
 // score.
 inline int UserDict::extract_score_freq(int raw_score) {
   return raw_score & 0x0000ffff;
 }

 // Returns the last-modified time encoded in a raw score, converted back
 // from an LMT index (high 16 bits of raw_score) to the same unit as
 // kUserDictLMTSince.
 inline uint64 UserDict::extract_score_lmt(int raw_score) {
   uint64 lmt = ((raw_score & 0xffff0000) >> 16);
   if (kUserDictLMTBitWidth < 16) {
     // Keep only the low kUserDictLMTBitWidth bits. The previous mask,
     // ~(1 << width), cleared a single bit instead of the unused high bits.
     uint64 mask = (((uint64)1) << kUserDictLMTBitWidth) - 1;
     lmt &= mask;
   }
   lmt = lmt * kUserDictLMTGranularity + kUserDictLMTSince;
   return lmt;
 }

 // Packs a last-modified time and a frequency into one raw score:
 // the LMT index goes to the high 16 bits, the frequency to the low 16 bits.
 inline int UserDict::build_score(uint64 lmt, int freq) {
   // Convert the absolute time to an index and truncate it to
   // kUserDictLMTBitWidth bits.
   lmt = (lmt - kUserDictLMTSince) / kUserDictLMTGranularity;
   lmt = (lmt << (64 - kUserDictLMTBitWidth));
   lmt = (lmt >> (64 - kUserDictLMTBitWidth));
   // Combine in unsigned arithmetic: shifting a promoted (signed) uint16 by 16
   // would overflow into the sign bit whenever the top LMT bit is set.
   uint32 lmt_bits = (uint32)((uint16)lmt);
   uint32 freq_bits = ((uint32)freq) & 0x0000ffff;
   return (int)((lmt_bits << 16) | freq_bits);
 }

 // Parses a decimal integer from a UTF-16 buffer of `len` code units.
 //
 // An optional leading '+' or '-' is accepted; parsing stops at the first
 // non-digit or at the end of the buffer. Returns 0 when the buffer is NULL,
 // empty, or holds no digits. Overflow is not detected.
 inline int64 UserDict::utf16le_atoll(uint16 *s, int len) {
   int64 ret = 0;
   if (!s || len <= 0)
     return ret;

   int flag = 1;
   const uint16 * endp = s + len;
   if (*s == '-') {
     flag = -1;
     s++;
   } else if (*s == '+') {
     s++;
   }

   // Check the bound before dereferencing so we never read past the buffer.
   while (s < endp && *s >= '0' && *s <= '9') {
     // Plain assignment: "ret += ret * 10 + digit" would add the old value
     // once more on every step.
     ret = ret * 10 + ((*s) - '0');
     s++;
   }
   return ret * flag;
 }

 // Writes the decimal representation of `v` into `s` as UTF-16 code units.
 //
 // `size` is the capacity of `s` in code units; no terminator is written.
 // Returns the number of code units written, or 0 if `s` is NULL, `size` is
 // not positive, or the buffer is too small (the buffer content is then
 // unspecified). Zero is written as the single digit "0".
 inline int UserDict::utf16le_lltoa(int64 v, uint16 *s, int size) {
   if (!s || size <= 0)
     return 0;
   uint16 *endp = s + size;
   int ret_len = 0;

   // Work on the magnitude in unsigned arithmetic so that the most negative
   // value does not overflow when negated.
   uint64 mag = (uint64)v;
   if (v < 0) {
     *(s++) = '-';
     ++ret_len;
     mag = (uint64)0 - mag;
   }

   // Emit digits least-significant first. do/while guarantees at least one
   // digit, so zero yields "0" rather than an empty string.
   uint16 *b = s;
   do {
     if (s >= endp)
       return 0;
     *(s++) = '0' + (uint16)(mag % 10);
     mag = mag / 10;
     ++ret_len;
   } while (mag != 0);

   // Reverse the digits in place. This must swap: copying *s into *b alone
   // loses the leading half of the digits.
   --s;
   while (b < s) {
     uint16 tmp = *b;
     *b = *s;
     *s = tmp;
     ++b, --s;
   }

   return ret_len;
 }

 // ORs `flag` into the flag byte (first byte) of the lemma stored at `offset`.
 // Flag bits carried in `offset` itself are masked off first.
 inline void UserDict::set_lemma_flag(uint32 offset, uint8 flag) {
   const uint32 pos = offset & kUserDictOffsetMask;
   lemmas_[pos] = lemmas_[pos] | flag;
 }

 // Returns the flag byte (first byte) of the lemma stored at `offset`.
 inline char UserDict::get_lemma_flag(uint32 offset) {
   const uint32 pos = offset & kUserDictOffsetMask;
   return (char)(lemmas_[pos]);
 }

 // Returns the character count (second byte) of the lemma stored at `offset`.
 inline char UserDict::get_lemma_nchar(uint32 offset) {
   const uint32 pos = offset & kUserDictOffsetMask;
   return (char)(lemmas_[pos + 1]);
 }

 // Returns a pointer to the spelling ids of the lemma stored at `offset`.
 // They start right after the 2-byte (flag, nchar) header.
 inline uint16 * UserDict::get_lemma_spell_ids(uint32 offset) {
   const uint32 pos = offset & kUserDictOffsetMask;
   uint8 * entry = lemmas_ + pos;
   return (uint16 *)(entry + 2);
 }

 // Returns a pointer to the characters of the lemma stored at `offset`.
 // They follow the 2-byte header and the nchar 16-bit spelling ids.
 inline uint16 * UserDict::get_lemma_word(uint32 offset) {
   const uint32 pos = offset & kUserDictOffsetMask;
   const uint8 len = get_lemma_nchar(pos);
   uint8 * entry = lemmas_ + pos;
   return (uint16 *)(entry + 2 + (len << 1));
 }

 // Returns the largest lemma id currently handed out by this dictionary.
 inline LemmaIdType UserDict::get_max_lemma_id() {
   // When a lemma is deleted, we do not claim its id back, for
   // simplicity and performance. Ids therefore span
   // [start_id_, start_id_ + lemma_count - 1].
   return start_id_ + dict_info_.lemma_count - 1;
 }

 // Tells whether `id` lies in the range [start_id_, get_max_lemma_id()].
 inline bool UserDict::is_valid_lemma_id(LemmaIdType id) {
   return id >= start_id_ && id <= get_max_lemma_id();
 }

 // A dictionary is usable in every state except USER_DICT_NONE.
 inline bool UserDict::is_valid_state() {
   return state_ != USER_DICT_NONE;
 }

 // Constructs an empty, unloaded dictionary: every table pointer is NULL,
 // all counters are zero and the state is USER_DICT_NONE. load_dict() must
 // succeed before the object can serve queries.
 UserDict::UserDict()
     : start_id_(0),
       version_(0),
       lemmas_(NULL),
       offsets_(NULL),
       scores_(NULL),
       ids_(NULL),
 #ifdef ___PREDICT_ENABLED___
       predicts_(NULL),
 #endif
 #ifdef ___SYNC_ENABLED___
       syncs_(NULL),
       sync_count_size_(0),
 #endif
       offsets_by_id_(NULL),
       lemma_count_left_(0),
       lemma_size_left_(0),
       dict_file_(NULL),
       state_(USER_DICT_NONE) {
   memset(&dict_info_, 0, sizeof(dict_info_));
   memset(&load_time_, 0, sizeof(load_time_));
 #ifdef ___CACHE_ENABLED___
   cache_init();
 #endif
 }

 // Closes the dictionary: close_dict() writes pending changes back when it
 // is allowed to and releases all tables.
 UserDict::~UserDict() {
   close_dict();
 }

 // Loads the user dictionary stored in `file_name`.
 //
 // If the file does not pass validate(), it is recreated empty with reset()
 // before loading. Lemma ids are assigned starting from `start_id`; `end_id`
 // is currently unused. On success the state becomes USER_DICT_SYNC and
 // load_time_ is set to the current time.
 // Returns false on failure, leaving dict_file_ NULL and start_id_ 0.
 bool UserDict::load_dict(const char *file_name, LemmaIdType start_id,
                          LemmaIdType end_id) {
 #ifdef ___DEBUG_PERF___
   DEBUG_PERF_BEGIN;
 #endif
   dict_file_ = strdup(file_name);
   if (!dict_file_)
     return false;

   start_id_ = start_id;

   // reset() is only attempted when the existing file is missing or invalid.
   if (false == validate(file_name) && false == reset(file_name)) {
     goto error;
   }
   if (false == load(file_name, start_id)) {
     goto error;
   }

   state_ = USER_DICT_SYNC;

   gettimeofday(&load_time_, NULL);

 #ifdef ___DEBUG_PERF___
   DEBUG_PERF_END;
   LOGD_PERF("load_dict");
 #endif
   return true;
  error:
   free((void*)dict_file_);
   // Do not keep a dangling pointer around: flush_cache() duplicates
   // dict_file_ and close_dict() frees it.
   dict_file_ = NULL;
   start_id_ = 0;
   return false;
 }

 // Closes the dictionary and releases every table it owns.
 //
 // If there are unsaved changes, they are written back only when this
 // instance was loaded after the last write-back recorded in g_last_update_;
 // otherwise the changes are discarded. Always returns true; afterwards the
 // state is USER_DICT_NONE.
 bool UserDict::close_dict() {
   if (state_ == USER_DICT_NONE)
     return true;
   if (state_ == USER_DICT_SYNC)
     goto out;

   // If dictionary is written back by others,
   // we can not simply write back here
   // To do a safe flush, we have to discard all newly added
   // lemmas and try to reload dict file.
   pthread_mutex_lock(&g_mutex_);
   if (load_time_.tv_sec > g_last_update_.tv_sec ||
     (load_time_.tv_sec == g_last_update_.tv_sec &&
      load_time_.tv_usec > g_last_update_.tv_usec)) {
     write_back();
     gettimeofday(&g_last_update_, NULL);
   }
   pthread_mutex_unlock(&g_mutex_);

  out:
   free((void*)dict_file_);
   free(lemmas_);
   free(offsets_);
   free(offsets_by_id_);
   free(scores_);
   free(ids_);
 #ifdef ___PREDICT_ENABLED___
   free(predicts_);
 #endif
 #ifdef ___SYNC_ENABLED___
   // syncs_ is allocated in load(); it used to be set to NULL below without
   // being freed, leaking on every close.
   free(syncs_);
 #endif

   version_ = 0;
   dict_file_ = NULL;
   lemmas_ = NULL;
 #ifdef ___SYNC_ENABLED___
   syncs_ = NULL;
   sync_count_size_ = 0;
 #endif
   offsets_ = NULL;
   offsets_by_id_ = NULL;
   scores_ = NULL;
   ids_ = NULL;
 #ifdef ___PREDICT_ENABLED___
   predicts_ = NULL;
 #endif

   memset(&dict_info_, 0, sizeof(dict_info_));
   lemma_count_left_ = 0;
   lemma_size_left_ = 0;
   state_ = USER_DICT_NONE;

   return true;
 }

 // Returns the lemma count recorded in the dictionary info.
 size_t UserDict::number_of_lemmas() {
   const size_t total = dict_info_.lemma_count;
   return total;
 }

 // The user dictionary keeps no milestones, so there is nothing to reset.
 void UserDict::reset_milestones(uint16 from_step, MileStoneHandle from_handle) {
 }

 // Looks up the spelling ids in `dep` (the first splids_extended + 1 of them)
 // and fills `lpi_items` with up to `lpi_max` matching lemmas; the number
 // written is stored in *lpi_num.
 // Returns 1 when something matched or the spelling can still be extended,
 // otherwise 0. When the dictionary is not loaded, returns 0 without touching
 // *lpi_num.
 MileStoneHandle UserDict::extend_dict(MileStoneHandle from_handle,
                                       const DictExtPara *dep,
                                       LmaPsbItem *lpi_items,
                                       size_t lpi_max, size_t *lpi_num) {
   if (!is_valid_state())
     return 0;

   bool can_extend = false;

 #ifdef ___DEBUG_PERF___
   DEBUG_PERF_BEGIN;
 #endif
   const size_t found = _get_lpis(dep->splids, dep->splids_extended + 1,
                                  lpi_items, lpi_max, &can_extend);
   *lpi_num = found;
 #ifdef ___DEBUG_PERF___
   DEBUG_PERF_END;
   LOGD_PERF("extend_dict");
 #endif
   if (found > 0 || can_extend)
     return 1;
   return 0;
 }

 int UserDict::is_fuzzy_prefix_spell_id(
     const uint16 * id1, uint16 len1, const UserDictSearchable *searchable) {
   if (len1 < searchable->splids_len)
     return 0;

   SpellingTrie &spl_trie = SpellingTrie::get_instance();
   uint32 i = 0;
   for (i = 0; i < searchable->splids_len; i++) {
     const char py1 = *spl_trie.get_spelling_str(id1[i]);
     uint16 off = 8 * (i % 4);
     const char py2 = ((searchable->signature[i/4] & (0xff << off)) >> off);
     if (py1 == py2)
       continue;
     return 0;
   }
   return 1;
 }

 int UserDict::fuzzy_compare_spell_id(
     const uint16 * id1, uint16 len1, const UserDictSearchable *searchable) {
   if (len1 < searchable->splids_len)
     return -1;
   if (len1 > searchable->splids_len)
     return 1;

   SpellingTrie &spl_trie = SpellingTrie::get_instance();
   uint32 i = 0;
   for (i = 0; i < len1; i++) {
     const char py1 = *spl_trie.get_spelling_str(id1[i]);
     uint16 off = 8 * (i % 4);
     const char py2 = ((searchable->signature[i/4] & (0xff << off)) >> off);
     if (py1 == py2)
       continue;
     if (py1 > py2)
       return 1;
     return -1;
   }
   return 0;
 }

 bool UserDict::is_prefix_spell_id(
     const uint16 * fullids, uint16 fulllen,
     const UserDictSearchable *searchable) {
   if (fulllen < searchable->splids_len)
     return false;

   uint32 i = 0;
   for (; i < searchable->splids_len; i++) {
     uint16 start_id = searchable->splid_start[i];
     uint16 count = searchable->splid_count[i];
     if (fullids[i] >= start_id && fullids[i] < start_id + count)
       continue;
     else
       return false;
   }
   return true;
 }

 bool UserDict::equal_spell_id(
     const uint16 * fullids, uint16 fulllen,
     const UserDictSearchable *searchable) {
   if (fulllen != searchable->splids_len)
     return false;

   uint32 i = 0;
   for (; i < fulllen; i++) {
     uint16 start_id = searchable->splid_start[i];
     uint16 count = searchable->splid_count[i];
     if (fullids[i] >= start_id && fullids[i] < start_id + count)
       continue;
     else
       return false;
   }
   return true;
 }

 // Binary-searches offsets_ (ordered by fuzzy_compare_spell_id) for the
 // query described by `searchable`.
 // Returns the index of the last probed entry for which the query is a fuzzy
 // prefix, or -1 when no probed entry qualifies. Because the search keeps
 // moving left after such a hit, this is the left-most qualifying entry seen.
 int32 UserDict::locate_first_in_offsets(const UserDictSearchable * searchable) {
   int32 begin = 0;
   int32 end = dict_info_.lemma_count - 1;
   int32 first_prefix = -1;

   while (begin <= end) {
     const int32 middle = (begin + end) >> 1;
     uint32 offset = offsets_[middle];
     uint8 nchar = get_lemma_nchar(offset);
     const uint16 * splids = get_lemma_spell_ids(offset);
     int cmp = fuzzy_compare_spell_id(splids, nchar, searchable);
     int pre = is_fuzzy_prefix_spell_id(splids, nchar, searchable);

     if (pre)
       first_prefix = middle;

     if (cmp < 0) {
       begin = middle + 1;
     } else {
       // Equal or greater: keep looking to the left for an earlier entry.
       end = middle - 1;
     }
   }

   return first_prefix;
 }

 // Fills `searchable` from a spelling-id string so it can be used by the
 // comparison helpers:
 //  - splid_start/splid_count: the id range accepted at each position (a half
 //    id expands to its range of full ids, a full id accepts only itself);
 //  - signature: the first letter of each spelling, packed four per word.
 void UserDict::prepare_locate(UserDictSearchable *searchable,
                              const uint16 *splid_str,
                              uint16 splid_str_len) {
   searchable->splids_len = splid_str_len;
   memset(searchable->signature, 0, sizeof(searchable->signature));

   SpellingTrie &trie = SpellingTrie::get_instance();
   for (uint32 pos = 0; pos < splid_str_len; ++pos) {
     if (!trie.is_half_id(splid_str[pos])) {
       searchable->splid_count[pos] = 1;
       searchable->splid_start[pos] = splid_str[pos];
     } else {
       searchable->splid_count[pos] =
           trie.half_to_full(splid_str[pos],
                             &(searchable->splid_start[pos]));
     }
     const unsigned char initial = *trie.get_spelling_str(splid_str[pos]);
     searchable->signature[pos>>2] |= (initial << (8 * (pos % 4)));
   }
 }

 // Public lookup entry point: same as _get_lpis() for callers that do not
 // need the "can be extended" flag.
 size_t UserDict::get_lpis(const uint16 *splid_str, uint16 splid_str_len,
                           LmaPsbItem *lpi_items, size_t lpi_max) {
   bool * const no_extend_flag = NULL;
   return _get_lpis(splid_str, splid_str_len, lpi_items, lpi_max,
                    no_extend_flag);
 }

 // Collects the lemmas whose spelling matches `splid_str` exactly.
 //
 //  splid_str / splid_str_len: the query spelling ids.
 //  lpi_items / lpi_max: output array and its capacity.
 //  need_extend: optional out flag (may be NULL). It is set to true when some
 //    visited lemma has the query as an exact prefix of its spelling, i.e. a
 //    longer query could still match.
 // Returns the number of items written to lpi_items; 0 if the dictionary is
 // not loaded or lpi_max is 0.
 size_t UserDict::_get_lpis(const uint16 *splid_str,
                            uint16 splid_str_len, LmaPsbItem *lpi_items,
                            size_t lpi_max, bool * need_extend) {
   // Let the rest of the function write through need_extend unconditionally.
   bool tmp_extend;
   if (!need_extend)
     need_extend = &tmp_extend;

   *need_extend = false;

   if (is_valid_state() == false)
     return 0;
   if (lpi_max <= 0)
     return 0;

   // Opportunistic staleness check: if the mutex is free and a write-back
   // happened after this instance was loaded, reload from disk. If the mutex
   // is busy, the check is simply skipped for this call.
   if (0 == pthread_mutex_trylock(&g_mutex_)) {
     if (load_time_.tv_sec < g_last_update_.tv_sec ||
       (load_time_.tv_sec == g_last_update_.tv_sec &&
        load_time_.tv_usec < g_last_update_.tv_usec)) {
       // Others updated disk file, have to reload
       pthread_mutex_unlock(&g_mutex_);
       flush_cache();
     } else {
       pthread_mutex_unlock(&g_mutex_);
     }
   } else {
   }

   UserDictSearchable searchable;
   prepare_locate(&searchable, splid_str, splid_str_len);

   // max_off is the exclusive upper bound of the scan over offsets_.
   uint32 max_off = dict_info_.lemma_count;
 #ifdef ___CACHE_ENABLED___
   // On a cache hit the scan is limited to the cached [start, start + count)
   // window; otherwise the start is found by binary search.
   int32 middle;
   uint32 start, count;
   bool cached = cache_hit(&searchable, &start, &count);
   if (cached) {
     middle = start;
     max_off = start + count;
   } else {
     middle = locate_first_in_offsets(&searchable);
     start = middle;
   }
 #else
   int32 middle = locate_first_in_offsets(&searchable);
 #endif

   if (middle == -1) {
 #ifdef ___CACHE_ENABLED___
     // Remember the miss so the next identical query can skip the search.
     if (!cached)
       cache_push(USER_DICT_MISS_CACHE, &searchable, 0, 0);
 #endif
     return 0;
   }

   size_t lpi_current = 0;

   // Both flags end the scan after the current entry has been processed.
   bool fuzzy_break = false;
   bool prefix_break = false;
   while ((size_t)middle < max_off && !fuzzy_break && !prefix_break) {
     if (lpi_current >= lpi_max)
       break;
     uint32 offset = offsets_[middle];
     // Ignore deleted lemmas
     if (offset & kUserDictOffsetFlagRemove) {
       middle++;
       continue;
     }
     uint8 nchar = get_lemma_nchar(offset);
     uint16 * splids = get_lemma_spell_ids(offset);
 #ifdef ___CACHE_ENABLED___
     if (!cached && 0 != fuzzy_compare_spell_id(splids, nchar, &searchable)) {
 #else
     if (0 != fuzzy_compare_spell_id(splids, nchar, &searchable)) {
 #endif
       fuzzy_break = true;
     }

     if (prefix_break == false) {
       if (is_fuzzy_prefix_spell_id(splids, nchar, &searchable)) {
         if (*need_extend == false &&
             is_prefix_spell_id(splids, nchar, &searchable)) {
           *need_extend = true;
         }
       } else {
         prefix_break = true;
       }
     }

     // Only exact spelling matches are reported to the caller.
     if (equal_spell_id(splids, nchar, &searchable) == true) {
       lpi_items[lpi_current].psb = translate_score(scores_[middle]);
       lpi_items[lpi_current].id = ids_[middle];
       lpi_items[lpi_current].lma_len = nchar;
       lpi_current++;
     }
     middle++;
   }

 #ifdef ___CACHE_ENABLED___
   // Cache the window that was actually scanned for this query.
   if (!cached) {
     count = middle - start;
     cache_push(USER_DICT_CACHE, &searchable, start, count);
   }
 #endif

   return lpi_current;
 }

 // Copies the characters of lemma `id_lemma` into `str_buf` and terminates
 // them with 0.
 //
 // `str_max` is the capacity of str_buf in characters, including the
 // terminator; longer lemmas are truncated to str_max - 1 characters.
 // Returns the number of characters copied (terminator excluded), or 0 if the
 // dictionary is not loaded, the id is invalid, or the buffer is NULL or has
 // no room for the terminator.
 uint16 UserDict::get_lemma_str(LemmaIdType id_lemma, char16* str_buf,
                                uint16 str_max) {
   if (is_valid_state() == false)
     return 0;
   if (is_valid_lemma_id(id_lemma) == false)
     return 0;
   // With str_max == 0, "str_max - 1" below would wrap to 65535 once stored
   // in a uint16 and the copy loop would overrun the buffer.
   if (!str_buf || str_max == 0)
     return 0;
   uint32 offset = offsets_by_id_[id_lemma - start_id_];
   uint8 nchar = get_lemma_nchar(offset);
   char16 * str = get_lemma_word(offset);
   uint16 m = nchar < str_max - 1 ? nchar : str_max - 1;
   int i = 0;
   for (; i < m; i++) {
     str_buf[i] = str[i];
   }
   str_buf[i] = 0;
   return m;
 }

 // Copies the spelling ids of lemma `id_lemma` into `splids`.
 //
 // At most `splids_max` ids are copied. `arg_valid` is unused here.
 // Returns the number of ids copied, or 0 if the dictionary is not loaded or
 // the id is invalid.
 uint16 UserDict::get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
                                   uint16 splids_max, bool arg_valid) {
   // Same guard as get_lemma_str(): without it an unloaded dictionary
   // (offsets_by_id_ == NULL) could be dereferenced when the id range check
   // alone happens to pass.
   if (is_valid_state() == false)
     return 0;
   if (is_valid_lemma_id(id_lemma) == false)
     return 0;
   uint32 offset = offsets_by_id_[id_lemma - start_id_];
   uint8 nchar = get_lemma_nchar(offset);
   const uint16 * ids = get_lemma_spell_ids(offset);
   int i = 0;
   for (; i < nchar && i < splids_max; i++)
     splids[i] = ids[i];
   return i;
 }

 // Predicts continuations of the history `last_hzs` (hzs_len characters).
 //
 // For each non-deleted lemma in predicts_ that is longer than the history
 // and begins with it, the remaining characters are stored in npre_items
 // (at most npre_max entries). `b4_used` is not used by this implementation.
 // Returns the number of items written; always 0 when ___PREDICT_ENABLED___
 // is not defined.
 size_t UserDict::predict(const char16 last_hzs[], uint16 hzs_len,
                          NPredictItem *npre_items, size_t npre_max,
                          size_t b4_used) {
   uint32 new_added = 0;
 #ifdef ___PREDICT_ENABLED___
   int32 end = dict_info_.lemma_count - 1;
   int j = locate_first_in_predicts((const uint16*)last_hzs, hzs_len);
   if (j == -1)
     return 0;

   while (j <= end) {
     uint32 offset = predicts_[j];
     // Ignore deleted lemmas
     if (offset & kUserDictOffsetFlagRemove) {
       j++;
       continue;
     }
     uint32 nchar = get_lemma_nchar(offset);
     uint16 * words = get_lemma_word(offset);
     uint16 * splids = get_lemma_spell_ids(offset);

     // A lemma no longer than the history has nothing left to predict.
     if (nchar <= hzs_len) {
       j++;
       continue;
     }

     // Compare hzs_len 16-bit characters (hence the << 1 for bytes).
     if (memcmp(words, last_hzs, hzs_len << 1) == 0) {
       if (new_added >= npre_max) {
         return new_added;
       }
       // Bytes to copy: the lemma (capped at kMaxPredictSize characters)
       // minus the history part.
       // NOTE(review): if hzs_len can exceed kMaxPredictSize this unsigned
       // subtraction underflows — confirm callers bound hzs_len.
       uint32 cpy_len =
           (nchar < kMaxPredictSize ? (nchar << 1) : (kMaxPredictSize << 1))
           - (hzs_len << 1);
       npre_items[new_added].his_len = hzs_len;
       npre_items[new_added].psb = get_lemma_score(words, splids, nchar);
       memcpy(npre_items[new_added].pre_hzs, words + hzs_len, cpy_len);
       // Terminate only when there is room left in pre_hzs.
       if ((cpy_len >> 1) < kMaxPredictSize) {
         npre_items[new_added].pre_hzs[cpy_len >> 1] = 0;
       }
       new_added++;
     } else {
       // First entry that no longer starts with the history ends the scan.
       break;
     }

     j++;
   }
 #endif
   return new_added;
 }

 // Finds the entry of offsets_ that holds exactly the given lemma.
 //
 //  lemma_str / splid_str: characters and spelling ids, lemma_len of each.
 // Returns the index into offsets_ (also valid for scores_ and ids_), or -1
 // when the lemma is not present. Deleted entries are skipped.
 int32 UserDict::locate_in_offsets(char16 lemma_str[], uint16 splid_str[],
                                   uint16 lemma_len) {
   // Exclusive upper bound of the scan over offsets_.
   int32 max_off = dict_info_.lemma_count;

   UserDictSearchable searchable;
   prepare_locate(&searchable, splid_str, lemma_len);
 #ifdef ___CACHE_ENABLED___
   // On a cache hit the scan is limited to the cached [start, start + count)
   // window; otherwise the start is found by binary search.
   int32 off;
   uint32 start, count;
   bool cached = load_cache(&searchable, &start, &count);
   if (cached) {
     off = start;
     max_off = start + count;
   } else {
     off = locate_first_in_offsets(&searchable);
     start = off;
   }
 #else
   int32 off = locate_first_in_offsets(&searchable);
 #endif

   if (off == -1) {
     return off;
   }

   while (off < max_off) {
     uint32 offset = offsets_[off];
     // Skip lemmas flagged as removed.
     if (offset & kUserDictOffsetFlagRemove) {
       off++;
       continue;
     }
     uint16 * splids = get_lemma_spell_ids(offset);
     // NOTE(review): the stored spelling is compared using lemma_len rather
     // than the entry's own nchar, so only its first lemma_len ids take part.
 #ifdef ___CACHE_ENABLED___
     if (!cached && 0 != fuzzy_compare_spell_id(splids, lemma_len, &searchable))
       break;
 #else
     if (0 != fuzzy_compare_spell_id(splids, lemma_len, &searchable))
       break;
 #endif
     if (equal_spell_id(splids, lemma_len, &searchable) == true) {
       // Spelling matches; now require the characters to match too.
       uint16 * str = get_lemma_word(offset);
       uint32 i = 0;
       for (i = 0; i < lemma_len; i++) {
         if (str[i] == lemma_str[i])
           continue;
         break;
       }
       if (i < lemma_len) {
         off++;
         continue;
       }
 #ifdef ___CACHE_ENABLED___
       // No need to save_cache here, since current function is invoked by
       // put_lemma. It's rarely possible for a user input same lemma twice.
       // That means first time user type a new lemma, it is newly added into
       // user dictionary, then it's possible that user type the same lemma
       // again.
       // Another reason save_cache can not be invoked here is this function
       // aborts when lemma is found, and it never knows the count.
 #endif
       return off;
     }
     off++;
   }

   return -1;
 }

 #ifdef ___PREDICT_ENABLED___
 // Binary-searches the word-ordered predicts_ table for the position related
 // to `words` (lemma_len characters).
 // Returns the index of the last probed entry that compared less than or
 // equal to `words`; if no probe qualifies, the initial value
 // (lemma_count - 1 converted to uint32) is returned.
 uint32 UserDict::locate_where_to_insert_in_predicts(
     const uint16 * words, int lemma_len) {
   int32 begin = 0;
   int32 end = dict_info_.lemma_count - 1;
   int32 middle = end;

   uint32 last_matched = middle;

   while (begin <= end) {
     middle = (begin + end) >> 1;
     // Search predicts_, the table ordered by word. offsets_ is ordered by
     // spelling, so a binary search by word over it is not meaningful.
     uint32 offset = predicts_[middle];
     uint8 nchar = get_lemma_nchar(offset);
     const uint16 * ws = get_lemma_word(offset);

     uint32 minl = nchar < lemma_len ? nchar : lemma_len;
     uint32 k = 0;
     int cmp = 0;

     // Compare character by character over the common length...
     for (; k < minl; k++) {
       if (ws[k] < words[k]) {
         cmp = -1;
         break;
       } else if (ws[k] > words[k]) {
         cmp = 1;
         break;
       }
     }
     // ...then let the shorter word order first.
     if (cmp == 0) {
       if (nchar < lemma_len)
         cmp = -1;
       else if (nchar > lemma_len)
         cmp = 1;
     }

     if (cmp < 0) {
       begin = middle + 1;
       last_matched = middle;
     } else if (cmp > 0) {
       end = middle - 1;
     } else {
       end = middle - 1;
       last_matched = middle;
     }
   }

   return last_matched;
 }

 // Binary-searches the word-ordered predicts_ table for entries that start
 // with `words` (lemma_len characters).
 // Returns the index into predicts_ of the last probed entry that begins
 // with `words` (the search keeps moving left after such a hit), or -1 when
 // no probed entry qualifies.
 int32 UserDict::locate_first_in_predicts(const uint16 * words, int lemma_len) {
   int32 begin = 0;
   int32 end = dict_info_.lemma_count - 1;
   int32 middle = -1;

   int32 last_matched = middle;

   while (begin <= end) {
     middle = (begin + end) >> 1;
     // The result is used by predict() to index predicts_, so the search must
     // run over predicts_ (ordered by word), not offsets_ (ordered by
     // spelling).
     uint32 offset = predicts_[middle];
     uint8 nchar = get_lemma_nchar(offset);
     const uint16 * ws = get_lemma_word(offset);

     uint32 minl = nchar < lemma_len ? nchar : lemma_len;
     uint32 k = 0;
     int cmp = 0;

     // Compare character by character over the common length.
     for (; k < minl; k++) {
       if (ws[k] < words[k]) {
         cmp = -1;
         break;
       } else if (ws[k] > words[k]) {
         cmp = 1;
         break;
       }
     }
     if (cmp == 0) {
       // All common characters agree: an entry at least as long as the query
       // starts with it.
       if (nchar >= lemma_len)
         last_matched = middle;
       if (nchar < lemma_len)
         cmp = -1;
       else if (nchar > lemma_len)
         cmp = 1;
     }

     if (cmp < 0) {
       begin = middle + 1;
     } else {
       // Equal or greater: keep looking to the left for an earlier entry.
       end = middle - 1;
     }
   }

   return last_matched;
 }

 #endif

 // Returns the id of the lemma with the given characters and spelling ids,
 // or 0 when the dictionary does not contain it.
 LemmaIdType UserDict::get_lemma_id(char16 lemma_str[], uint16 splids[],
                                    uint16 lemma_len) {
   const int32 index = locate_in_offsets(lemma_str, splids, lemma_len);
   if (index != -1)
     return ids_[index];
   return 0;
 }

 // Returns the translated (log-domain) score of lemma `lemma_id`, or 0 when
 // the dictionary is not loaded or the id is out of range.
 LmaScoreType UserDict::get_lemma_score(LemmaIdType lemma_id) {
   if (!is_valid_state())
     return 0;
   if (!is_valid_lemma_id(lemma_id))
     return 0;

   const int raw = _get_lemma_score(lemma_id);
   return translate_score(raw);
 }

 // Returns the translated (log-domain) score of the lemma with the given
 // characters and spelling ids, or 0 when the dictionary is not loaded.
 LmaScoreType UserDict::get_lemma_score(char16 lemma_str[], uint16 splids[],
                                 uint16 lemma_len) {
   if (!is_valid_state())
     return 0;
   const int raw = _get_lemma_score(lemma_str, splids, lemma_len);
   return translate_score(raw);
 }

 // Returns the raw (packed) score of lemma `lemma_id`.
 // The id is resolved to its stored lemma, which is then located in offsets_
 // to find the matching scores_ slot. Returns 0 when the dictionary is not
 // loaded, the id is out of range, or the lemma cannot be located.
 int UserDict::_get_lemma_score(LemmaIdType lemma_id) {
   if (!is_valid_state() || !is_valid_lemma_id(lemma_id))
     return 0;

   const uint32 lemma_offset = offsets_by_id_[lemma_id - start_id_];

   uint32 length = get_lemma_nchar(lemma_offset);
   uint16 * spelling = get_lemma_spell_ids(lemma_offset);
   uint16 * word = get_lemma_word(lemma_offset);

   const int32 index = locate_in_offsets(word, spelling, length);
   if (index != -1)
     return scores_[index];
   return 0;
 }

 // Returns the raw (packed) score of the lemma with the given characters and
 // spelling ids, or 0 when the dictionary is not loaded or does not contain
 // the lemma.
 int UserDict::_get_lemma_score(char16 lemma_str[], uint16 splids[],
                                 uint16 lemma_len) {
   if (!is_valid_state())
     return 0;

   const int32 index = locate_in_offsets(lemma_str, splids, lemma_len);
   if (index != -1)
     return scores_[index];
   return 0;
 }

 #ifdef ___SYNC_ENABLED___
 // Drops the first sync entry that refers to the lemma at `offset`, if any.
 // The hole is filled with the last entry, so the order of syncs_ is not
 // preserved.
 void UserDict::remove_lemma_from_sync_list(uint32 offset) {
   const uint32 target = offset & kUserDictOffsetMask;
   for (uint32 pos = 0; pos < dict_info_.sync_count; ++pos) {
     unsigned int candidate = (syncs_[pos] & kUserDictOffsetMask);
     if (candidate != target)
       continue;
     syncs_[pos] = syncs_[dict_info_.sync_count - 1];
     dict_info_.sync_count--;
     return;
   }
 }
 #endif

 #ifdef ___PREDICT_ENABLED___
 // Flags the first predicts_ entry that refers to the lemma at `offset` as
 // removed. The entry itself stays in place.
 void UserDict::remove_lemma_from_predict_list(uint32 offset) {
   const uint32 target = offset & kUserDictOffsetMask;
   for (uint32 pos = 0; pos < dict_info_.lemma_count; ++pos) {
     unsigned int candidate = (predicts_[pos] & kUserDictOffsetMask);
     if (candidate != target)
       continue;
     predicts_[pos] |= kUserDictOffsetFlagRemove;
     return;
   }
 }
 #endif

 bool UserDict::remove_lemma_by_offset_index(int offset_index) {
   if (is_valid_state() == false)
     return 0;

   int32 off = offset_index;
   if (off == -1) {
     return false;
   }

   uint32 offset = offsets_[off];
   uint32 nchar = get_lemma_nchar(offset);

   offsets_[off] |= kUserDictOffsetFlagRemove;

 #ifdef ___SYNC_ENABLED___
   // Remove corresponding sync item
   remove_lemma_from_sync_list(offset);
 #endif

 #ifdef ___PREDICT_ENABLED___
   remove_lemma_from_predict_list(offset);
 #endif
   dict_info_.free_count++;
   dict_info_.free_size += (2 + (nchar << 2));

   if (state_ < USER_DICT_OFFSET_DIRTY)
     state_ = USER_DICT_OFFSET_DIRTY;
   return true;
 }

 // Removes the lemma identified by `lemma_id`.
 // The id is resolved to its stored lemma, which is located in offsets_ and
 // then flagged as removed. Returns false when the dictionary is not loaded,
 // the id is out of range, or the lemma cannot be located.
 bool UserDict::remove_lemma(LemmaIdType lemma_id) {
   if (!is_valid_state() || !is_valid_lemma_id(lemma_id))
     return false;

   const uint32 lemma_offset = offsets_by_id_[lemma_id - start_id_];

   uint32 length = get_lemma_nchar(lemma_offset);
   uint16 * spelling = get_lemma_spell_ids(lemma_offset);
   uint16 * word = get_lemma_word(lemma_offset);

   const int32 index = locate_in_offsets(word, spelling, length);
   return remove_lemma_by_offset_index(index);
 }

 void UserDict::flush_cache() {
   LemmaIdType start_id = start_id_;
   const char * file = strdup(dict_file_);
   if (!file)
     return;
   close_dict();
   load_dict(file, start_id, kUserDictIdEnd);
   free((void*)file);
 #ifdef ___CACHE_ENABLED___
   cache_init();
 #endif
   return;
 }

 bool UserDict::reset(const char *file) {
   FILE *fp = fopen(file, "w+");
   if (!fp) {
     return false;
   }
   uint32 version = kUserDictVersion;
   size_t wred = fwrite(&version, 1, 4, fp);
   UserDictInfo info;
   memset(&info, 0, sizeof(info));
   // By default, no limitation for lemma count and size
   // thereby, reclaim_ratio is never used
   wred += fwrite(&info, 1, sizeof(info), fp);
   if (wred != sizeof(info) + sizeof(version)) {
     fclose(fp);
     unlink(file);
     return false;
   }
   fclose(fp);
   return true;
 }

 // Checks that `file` looks like a user dictionary this code can load:
 // it must be large enough for the version word and the info trailer, carry
 // kUserDictVersion, and have exactly the size implied by its trailer
 // (see get_dict_file_size()).
 // Returns true only if every check passes.
 bool UserDict::validate(const char *file) {
   // b is ignored in POSIX compatible os including Linux
   // while b is important flag for Windows to specify binary mode
   FILE *fp = fopen(file, "rb");
   if (!fp) {
     return false;
   }

   long end_pos;
   size_t size;
   size_t readed;
   uint32 version;
   UserDictInfo dict_info;

   // validate
   int err = fseek(fp, 0, SEEK_END);
   if (err) {
     goto error;
   }

   // ftell() reports failure as -1; check it before converting to size_t,
   // where it would turn into a huge "size".
   end_pos = ftell(fp);
   if (end_pos < 0) {
     goto error;
   }
   size = (size_t)end_pos;
   if (size < 4 + sizeof(dict_info)) {
     goto error;
   }

   err = fseek(fp, 0, SEEK_SET);
   if (err) {
     goto error;
   }

   readed = fread(&version, 1, sizeof(version), fp);
   if (readed < sizeof(version)) {
     goto error;
   }
   if (version != kUserDictVersion) {
     goto error;
   }

   // The info record is the last thing in the file. Negate as a signed value
   // instead of relying on unsigned wrap-around of "-1 * sizeof(...)".
   err = fseek(fp, -(long)sizeof(dict_info), SEEK_END);
   if (err) {
     goto error;
   }

   readed = fread(&dict_info, 1, sizeof(dict_info), fp);
   if (readed != sizeof(dict_info)) {
     goto error;
   }

   if (size != get_dict_file_size(&dict_info)) {
     goto error;
   }

   fclose(fp);
   return true;

  error:
   fclose(fp);
   return false;
 }

 // Reads the dictionary file into freshly allocated tables and installs them
 // in this object.
 //
 // File layout as read here: 4-byte version, lemma blob (lemma_size bytes),
 // offsets table, [predicts table], scores table, [sync table], and the
 // UserDictInfo record at the very end. Tables hold 4 bytes per entry.
 // Every table is allocated with kUserDictPreAlloc spare entries so lemmas
 // can be added without reallocating immediately.
 // Returns false, leaving the object untouched, if g_mutex_ is busy, the file
 // cannot be opened, an allocation fails, or the file is shorter than its
 // info record claims.
 bool UserDict::load(const char *file, LemmaIdType start_id) {
   // Do not block: a concurrent load or write-back makes this call fail.
   if (0 != pthread_mutex_trylock(&g_mutex_)) {
     return false;
   }
   // b is ignored in POSIX compatible os including Linux
   // while b is important flag for Windows to specify binary mode
   FILE *fp = fopen(file, "rb");
   if (!fp) {
     pthread_mutex_unlock(&g_mutex_);
     return false;
   }

   size_t readed, toread;
   UserDictInfo dict_info;
   uint8 *lemmas = NULL;
   uint32 *offsets = NULL;
 #ifdef ___SYNC_ENABLED___
   uint32 *syncs = NULL;
 #endif
   uint32 *scores = NULL;
   uint32 *ids = NULL;
   uint32 *offsets_by_id = NULL;
 #ifdef ___PREDICT_ENABLED___
   uint32 *predicts = NULL;
 #endif
   size_t i;
   int err;

   // The info record sits at the end of the file; read it first because it
   // gives the sizes of everything else.
   err = fseek(fp, -1 * sizeof(dict_info), SEEK_END);
   if (err) goto error;

   readed = fread(&dict_info, 1, sizeof(dict_info), fp);
   if (readed != sizeof(dict_info)) goto error;

   // Lemma blob plus room for kUserDictPreAlloc lemmas of average length
   // (2 header bytes + 4 bytes per character).
   lemmas = (uint8 *)malloc(
       dict_info.lemma_size +
       (kUserDictPreAlloc * (2 + (kUserDictAverageNchar << 2))));

   if (!lemmas) goto error;

   offsets = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
   if (!offsets) goto error;

 #ifdef ___PREDICT_ENABLED___
   predicts = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
   if (!predicts) goto error;
 #endif

 #ifdef ___SYNC_ENABLED___
   syncs = (uint32 *)malloc((dict_info.sync_count + kUserDictPreAlloc) << 2);
   if (!syncs) goto error;
 #endif

   scores = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
   if (!scores) goto error;

   ids = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
   if (!ids) goto error;

   offsets_by_id = (uint32 *)malloc(
       (dict_info.lemma_count + kUserDictPreAlloc) << 2);
   if (!offsets_by_id) goto error;

   // Skip the 4-byte version; the tables follow back to back.
   err = fseek(fp, 4, SEEK_SET);
   if (err) goto error;

   // Each table is read in a loop because fread() may return short counts.
   readed = 0;
   while (readed < dict_info.lemma_size && !ferror(fp) && !feof(fp)) {
     readed += fread(lemmas + readed, 1, dict_info.lemma_size - readed, fp);
   }
   if (readed < dict_info.lemma_size)
     goto error;

   toread = (dict_info.lemma_count << 2);
   readed = 0;
   while (readed < toread && !ferror(fp) && !feof(fp)) {
     readed += fread((((uint8*)offsets) + readed), 1, toread - readed, fp);
   }
   if (readed < toread)
     goto error;

 #ifdef ___PREDICT_ENABLED___
   toread = (dict_info.lemma_count << 2);
   readed = 0;
   while (readed < toread && !ferror(fp) && !feof(fp)) {
     readed += fread((((uint8*)predicts) + readed), 1, toread - readed, fp);
   }
   if (readed < toread)
     goto error;
 #endif

   // Scores table: toread still equals lemma_count << 2 at this point.
   readed = 0;
   while (readed < toread && !ferror(fp) && !feof(fp)) {
     readed += fread((((uint8*)scores) + readed), 1, toread - readed, fp);
   }
   if (readed < toread)
     goto error;

 #ifdef ___SYNC_ENABLED___
   toread = (dict_info.sync_count << 2);
   readed = 0;
   while (readed < toread && !ferror(fp) && !feof(fp)) {
     readed += fread((((uint8*)syncs) + readed), 1, toread - readed, fp);
   }
   if (readed < toread)
     goto error;
 #endif

   // Ids are not stored in the file: entry i gets id start_id + i, and
   // offsets_by_id keeps the offsets in that id order.
   for (i = 0; i < dict_info.lemma_count; i++) {
     ids[i] = start_id + i;
     offsets_by_id[i] = offsets[i];
   }

   // Everything was read successfully; publish the new tables.
   lemmas_ = lemmas;
   offsets_ = offsets;
 #ifdef ___SYNC_ENABLED___
   syncs_ = syncs;
   sync_count_size_ = dict_info.sync_count + kUserDictPreAlloc;
 #endif
   offsets_by_id_ = offsets_by_id;
   scores_ = scores;
   ids_ = ids;
 #ifdef ___PREDICT_ENABLED___
   predicts_ = predicts;
 #endif
   // Spare capacity left from the pre-allocation above.
   lemma_count_left_ = kUserDictPreAlloc;
   lemma_size_left_ = kUserDictPreAlloc * (2 + (kUserDictAverageNchar << 2));
   memcpy(&dict_info_, &dict_info, sizeof(dict_info));
   state_ = USER_DICT_SYNC;

   fclose(fp);

   pthread_mutex_unlock(&g_mutex_);
   return true;

  error:
   // Release whatever was allocated before the failure.
   if (lemmas) free(lemmas);
   if (offsets) free(offsets);
 #ifdef ___SYNC_ENABLED___
   if (syncs) free(syncs);
 #endif
   if (scores) free(scores);
   if (ids) free(ids);
   if (offsets_by_id) free(offsets_by_id);
 #ifdef ___PREDICT_ENABLED___
   if (predicts) free(predicts);
 #endif
   fclose(fp);
   pthread_mutex_unlock(&g_mutex_);
   return false;
 }

 // Flushes the in-memory dictionary to dict_file_ using the writer that
 // matches the current dirty state, then marks the dictionary as
 // USER_DICT_SYNC.
 //
 // Does nothing when state_ is USER_DICT_NONE or USER_DICT_SYNC, or when the
 // file cannot be opened for writing.
 void UserDict::write_back() {
   // XXX write back is only allowed from close_dict due to thread-safe sake
   if (state_ == USER_DICT_NONE || state_ == USER_DICT_SYNC)
     return;
   int fd = open(dict_file_, O_WRONLY);
   if (fd == -1)
     return;
   switch (state_) {
     case USER_DICT_DEFRAGMENTED:
       write_back_all(fd);
       break;
     case USER_DICT_LEMMA_DIRTY:
       write_back_lemma(fd);
       break;
     case USER_DICT_OFFSET_DIRTY:
       write_back_offset(fd);
       break;
     case USER_DICT_SCORE_DIRTY:
       write_back_score(fd);
       break;
 #ifdef ___SYNC_ENABLED___
     case USER_DICT_SYNC_DIRTY:
       write_back_sync(fd);
       break;
 #endif
     default:
       break;
   }
   // It seems truncate is not needed on Linux and Windows, only on Mac.
   // It is done here anyway for safety, but only when the file offset has
   // actually advanced: if no writer ran, or a writer bailed out on its
   // first seek, the offset is still 0 and truncating there would wipe the
   // whole dictionary file. A failed lseek() (-1) is not a usable length
   // either.
   off_t cur = lseek(fd, 0, SEEK_CUR);
   if (cur > 0)
     ftruncate(fd, cur);
   close(fd);
   state_ = USER_DICT_SYNC;
 }

 #ifdef ___SYNC_ENABLED___
 // Rewrites the sync queue and the trailing dict info record.
 // The sync block sits after the 4-byte header, the lemma bytes, the
 // offsets and scores arrays (4 bytes per lemma each) and, when enabled,
 // the predicts array.
 void UserDict::write_back_sync(int fd) {
   int seek_rc = lseek(fd, 4 + dict_info_.lemma_size
                       + (dict_info_.lemma_count << 3)
 #ifdef ___PREDICT_ENABLED___
                       + (dict_info_.lemma_count << 2)
 #endif
                       , SEEK_SET);
   if (seek_rc != -1) {
     write(fd, syncs_, dict_info_.sync_count << 2);
     write(fd, &dict_info_, sizeof(dict_info_));
   }
 }
 #endif

 // Rewrites everything stored after the lemma bytes: offsets, predicts
 // (when enabled), scores, the sync queue (when enabled) and finally the
 // dict info record.
 void UserDict::write_back_offset(int fd) {
   int seek_rc = lseek(fd, 4 + dict_info_.lemma_size, SEEK_SET);
   if (seek_rc != -1) {
     // Each per-lemma array stores one 4-byte entry per lemma.
     write(fd, offsets_, dict_info_.lemma_count << 2);
 #ifdef ___PREDICT_ENABLED___
     write(fd, predicts_, dict_info_.lemma_count << 2);
 #endif
     write(fd, scores_, dict_info_.lemma_count << 2);
 #ifdef ___SYNC_ENABLED___
     write(fd, syncs_, dict_info_.sync_count << 2);
 #endif
     write(fd, &dict_info_, sizeof(dict_info_));
   }
 }

 // Rewrites the scores array and every block stored after it (the sync
 // queue when enabled, then the dict info record). The seek skips the
 // header, the lemma bytes, the offsets array and, when enabled, the
 // predicts array.
 void UserDict::write_back_score(int fd) {
   int seek_rc = lseek(fd, 4 + dict_info_.lemma_size
                       + (dict_info_.lemma_count << 2)
 #ifdef ___PREDICT_ENABLED___
                       + (dict_info_.lemma_count << 2)
 #endif
                       , SEEK_SET);
   if (seek_rc != -1) {
     write(fd, scores_, dict_info_.lemma_count << 2);
 #ifdef ___SYNC_ENABLED___
     write(fd, syncs_, dict_info_.sync_count << 2);
 #endif
     write(fd, &dict_info_, sizeof(dict_info_));
   }
 }

 // Writes the lemma bytes that were taken out of the pre-allocated lemma
 // space, followed by every block stored after the lemma bytes.
 void UserDict::write_back_lemma(int fd) {
   int rc = lseek(fd, 4, SEEK_SET);
   if (rc == -1)
     return;
   // New lemmas are always appended, so only the tail of the lemma block
   // (the part of the pre-allocated space already used) has to be written.
   size_t pending = kUserDictPreAlloc *
       (2 + (kUserDictAverageNchar << 2)) - lemma_size_left_;
   rc = lseek(fd, dict_info_.lemma_size - pending, SEEK_CUR);
   if (rc == -1)
     return;
   write(fd, lemmas_ + dict_info_.lemma_size - pending, pending);

   // The per-lemma arrays, sync queue and info record are written in full.
   write(fd, offsets_,  dict_info_.lemma_count << 2);
 #ifdef ___PREDICT_ENABLED___
   write(fd, predicts_,  dict_info_.lemma_count << 2);
 #endif
   write(fd, scores_, dict_info_.lemma_count << 2);
 #ifdef ___SYNC_ENABLED___
   write(fd, syncs_, dict_info_.sync_count << 2);
 #endif
   write(fd, &dict_info_, sizeof(dict_info_));
 }

 // Writes the complete dictionary body after the 4-byte header: lemma
 // bytes, offsets, predicts (when enabled), scores, sync queue (when
 // enabled) and the dict info record.
 void UserDict::write_back_all(int fd) {
   // XXX lemma_size is handled differently in writeall
   // and writelemma. I update lemma_size and lemma_count in different
   // places for these two cases. Should fix it to make it consistent.
   int seek_rc = lseek(fd, 4, SEEK_SET);
   if (seek_rc != -1) {
     write(fd, lemmas_, dict_info_.lemma_size);
     write(fd, offsets_, dict_info_.lemma_count << 2);
 #ifdef ___PREDICT_ENABLED___
     write(fd, predicts_, dict_info_.lemma_count << 2);
 #endif
     write(fd, scores_, dict_info_.lemma_count << 2);
 #ifdef ___SYNC_ENABLED___
     write(fd, syncs_, dict_info_.sync_count << 2);
 #endif
     write(fd, &dict_info_, sizeof(dict_info_));
   }
 }

 #ifdef ___CACHE_ENABLED___
 // Looks up a previously cached search result for `searchable`.
 // The cache for a given spelling-id length is a ring delimited by head
 // and tail; entries are matched by comparing signature words.
 // On a match, *offset and *length receive the cached values and true is
 // returned; otherwise false is returned and the outputs are untouched.
 bool UserDict::load_cache(UserDictSearchable *searchable,
                           uint32 *offset, uint32 *length) {
   UserDictCache *ring = &caches_[searchable->splids_len - 1];
   const uint16 sig_words = kMaxLemmaSize / 4;

   uint16 slot = ring->head;
   while (slot != ring->tail) {
     bool same = true;
     for (uint16 k = 0; k < sig_words; k++) {
       if (ring->signatures[slot][k] != searchable->signature[k]) {
         same = false;
         break;
       }
     }
     if (same) {
       *offset = ring->offsets[slot];
       *length = ring->lengths[slot];
       return true;
     }
     // Advance to the next slot, wrapping around the ring.
     if (++slot >= kUserDictCacheSize)
       slot -= kUserDictCacheSize;
   }
   return false;
 }

 // Stores a search result (offset, length) for `searchable` in the tail
 // slot of the ring cache for its spelling-id length. When the ring
 // becomes full, the head is advanced so the oldest entry is dropped.
 void UserDict::save_cache(UserDictSearchable *searchable,
                           uint32 offset, uint32 length) {
   UserDictCache *ring = &caches_[searchable->splids_len - 1];
   const uint16 slot = ring->tail;
   const uint16 sig_words = kMaxLemmaSize / 4;

   for (uint16 k = 0; k < sig_words; k++) {
     ring->signatures[slot][k] = searchable->signature[k];
   }
   ring->offsets[slot] = offset;
   ring->lengths[slot] = length;

   uint16 new_tail = slot;
   if (++new_tail >= kUserDictCacheSize) {
     new_tail -= kUserDictCacheSize;
   }
   if (new_tail == ring->head) {
     // Tail caught up with head: evict the oldest entry.
     ring->head++;
     if (ring->head >= kUserDictCacheSize) {
       ring->head -= kUserDictCacheSize;
     }
   }
   ring->tail = new_tail;
 }

 // Zero-fills the hit caches; head and tail of every ring become equal,
 // which load_cache() treats as empty.
 void UserDict::reset_cache() {
   memset(&caches_[0], 0, sizeof(caches_));
 }

 // Returns true when `searchable` has a matching signature in the miss
 // cache ring for its spelling-id length, false otherwise.
 bool UserDict::load_miss_cache(UserDictSearchable *searchable) {
   UserDictMissCache *ring = &miss_caches_[searchable->splids_len - 1];
   const uint16 sig_words = kMaxLemmaSize / 4;

   uint16 slot = ring->head;
   while (slot != ring->tail) {
     bool same = true;
     for (uint16 k = 0; k < sig_words; k++) {
       if (ring->signatures[slot][k] != searchable->signature[k]) {
         same = false;
         break;
       }
     }
     if (same)
       return true;
     // Advance to the next slot, wrapping around the ring.
     if (++slot >= kUserDictMissCacheSize)
       slot -= kUserDictMissCacheSize;
   }
   return false;
 }

 // Records the signature of `searchable` in the tail slot of the miss
 // cache ring for its spelling-id length. When the ring becomes full, the
 // head is advanced so the oldest entry is dropped.
 void UserDict::save_miss_cache(UserDictSearchable *searchable) {
   UserDictMissCache *ring = &miss_caches_[searchable->splids_len - 1];
   const uint16 slot = ring->tail;
   const uint16 sig_words = kMaxLemmaSize / 4;

   for (uint16 k = 0; k < sig_words; k++) {
     ring->signatures[slot][k] = searchable->signature[k];
   }

   uint16 new_tail = slot;
   if (++new_tail >= kUserDictMissCacheSize) {
     new_tail -= kUserDictMissCacheSize;
   }
   if (new_tail == ring->head) {
     // Tail caught up with head: evict the oldest entry.
     ring->head++;
     if (ring->head >= kUserDictMissCacheSize) {
       ring->head -= kUserDictMissCacheSize;
     }
   }
   ring->tail = new_tail;
 }

 // Zero-fills the miss caches; head and tail of every ring become equal,
 // which load_miss_cache() treats as empty.
 void UserDict::reset_miss_cache() {
   memset(&miss_caches_[0], 0, sizeof(miss_caches_));
 }

 // Clears both the hit cache and the miss cache.
 void UserDict::cache_init() {
   // The two resets touch separate arrays.
   reset_cache();
   reset_miss_cache();
 }

 // Consults the miss cache first, then the hit cache.
 // A miss-cache match reports an empty range (*offset = 0, *length = 0)
 // and returns true; a hit-cache match fills the outputs with the cached
 // range and returns true. Returns false when neither cache matches.
 bool UserDict::cache_hit(UserDictSearchable *searchable,
                          uint32 *offset, uint32 *length) {
   if (load_miss_cache(searchable)) {
     *offset = 0;
     *length = 0;
     return true;
   }
   return load_cache(searchable, offset, length);
 }

 // Stores `searchable` in the cache selected by `type`. offset and length
 // are only used for USER_DICT_CACHE; any other type value is ignored.
 void UserDict::cache_push(UserDictCacheType type,
                          UserDictSearchable *searchable,
                          uint32 offset, uint32 length) {
   if (type == USER_DICT_MISS_CACHE) {
     save_miss_cache(searchable);
   } else if (type == USER_DICT_CACHE) {
     save_cache(searchable, offset, length);
   }
 }

 #endif

 // Compacts the in-memory dictionary after lemmas have been flagged as
 // removed.
 //
 // Steps:
 //  1. Move entries whose offset carries kUserDictOffsetFlagRemove to the
 //     end of offsets_/scores_/ids_ (and predicts_ when enabled), copying
 //     the remove flag into the lemma's own flag byte on the way.
 //  2. Shrink dict_info_.lemma_count to the number of entries still in use.
 //  3. Slide the surviving lemma records in lemmas_ down over the removed
 //     ones and fix up every offset that pointed into a moved range.
 //  4. Reassign ids_ / offsets_by_id_ sequentially and set state_ to
 //     USER_DICT_DEFRAGMENTED.
 //
 // Returns early (state_ untouched) when the dictionary is not in a valid
 // state or when no lemma record carries the remove flag.
 void UserDict::defragment(void) {
 #ifdef ___DEBUG_PERF___
   DEBUG_PERF_BEGIN;
 #endif
   if (is_valid_state() == false)
     return;
   // Fixup offsets_, set REMOVE flag to lemma's flag if needed
   size_t first_freed = 0;
   size_t first_inuse = 0;
   while (first_freed < dict_info_.lemma_count) {
     // Find first freed offset.
     // The index is bounds-checked BEFORE the array is read, so the scan
     // never touches offsets_[lemma_count].
     while (first_freed < dict_info_.lemma_count &&
            (offsets_[first_freed] & kUserDictOffsetFlagRemove) == 0) {
       first_freed++;
     }
     if (first_freed < dict_info_.lemma_count) {
       // Save REMOVE flag to lemma flag
       int off = offsets_[first_freed];
       set_lemma_flag(off, kUserDictLemmaFlagRemove);
     } else {
       break;
     }
     // Find first in-use offset after first_freed (bounds check first).
     first_inuse = first_freed + 1;
     while ((first_inuse < dict_info_.lemma_count) &&
            (offsets_[first_inuse] & kUserDictOffsetFlagRemove)) {
       // Save REMOVE flag to lemma flag
       int off = offsets_[first_inuse];
       set_lemma_flag(off, kUserDictLemmaFlagRemove);
       first_inuse++;
     }
     if (first_inuse >= dict_info_.lemma_count) {
       break;
     }
     // Swap offsets_
     int tmp = offsets_[first_inuse];
     offsets_[first_inuse] = offsets_[first_freed];
     offsets_[first_freed] = tmp;
     // Swap scores_
     tmp = scores_[first_inuse];
     scores_[first_inuse] = scores_[first_freed];
     scores_[first_freed] = tmp;
     // Swap ids_
     LemmaIdType tmpid = ids_[first_inuse];
     ids_[first_inuse] = ids_[first_freed];
     ids_[first_freed] = tmpid;
     // Go on
     first_freed++;
   }
 #ifdef ___PREDICT_ENABLED___
   // Fixup predicts_
   first_freed = 0;
   first_inuse = 0;
   while (first_freed < dict_info_.lemma_count) {
     // Find first freed offset (bounds check first)
     while (first_freed < dict_info_.lemma_count &&
            (predicts_[first_freed] & kUserDictOffsetFlagRemove) == 0) {
       first_freed++;
     }
     if (first_freed >= dict_info_.lemma_count)
       break;
     // Find first in-use offset after first_freed (bounds check first)
     first_inuse = first_freed + 1;
     while ((first_inuse < dict_info_.lemma_count)
            && (predicts_[first_inuse] & kUserDictOffsetFlagRemove)) {
       first_inuse++;
     }
     if (first_inuse >= dict_info_.lemma_count) {
       break;
     }
     // Swap predicts_
     int tmp = predicts_[first_inuse];
     predicts_[first_inuse] = predicts_[first_freed];
     predicts_[first_freed] = tmp;
     // Go on
     first_freed++;
   }
 #endif
   // first_freed is now the number of entries still in use.
   dict_info_.lemma_count = first_freed;
   // Fixup lemmas_
   size_t begin = 0;
   size_t end = 0;
   size_t dst = 0;
   int total_size = dict_info_.lemma_size + lemma_size_left_;
   int total_count = dict_info_.lemma_count + lemma_count_left_;
   size_t real_size = total_size - lemma_size_left_;
   // Skip the leading run of live records; dst ends on the first removed
   // record. Each record occupies 2 + nchar * 4 bytes.
   while (dst < real_size) {
     unsigned char flag = get_lemma_flag(dst);
     unsigned char nchr = get_lemma_nchar(dst);
     if ((flag & kUserDictLemmaFlagRemove) == 0) {
       dst += nchr * 4 + 2;
       continue;
     }
     break;
   }
   if (dst >= real_size)
     return;

   end = dst;
   while (end < real_size) {
     // `end` points at a removed record; step over it.
     begin = end + get_lemma_nchar(end) * 4 + 2;
  repeat:
     // not used any more
     if (begin >= real_size)
       break;
     unsigned char flag = get_lemma_flag(begin);
     unsigned char nchr = get_lemma_nchar(begin);
     if (flag & kUserDictLemmaFlagRemove) {
       begin += nchr * 4 + 2;
       goto repeat;
     }
     // [begin, end) becomes the next run of consecutive live records.
     end = begin + nchr * 4 + 2;
     while (end < real_size) {
       unsigned char eflag = get_lemma_flag(end);
       unsigned char enchr = get_lemma_nchar(end);
       if ((eflag & kUserDictLemmaFlagRemove) == 0) {
         end += enchr * 4 + 2;
         continue;
       }
       break;
     }
     // Slide the run down to dst and rebase every offset that pointed
     // into it.
     memmove(lemmas_ + dst, lemmas_ + begin, end - begin);
     for (size_t j = 0; j < dict_info_.lemma_count; j++) {
       if (offsets_[j] >= begin && offsets_[j] < end) {
         offsets_[j] -= (begin - dst);
         offsets_by_id_[ids_[j] - start_id_] = offsets_[j];
       }
 #ifdef ___PREDICT_ENABLED___
       if (predicts_[j] >= begin && predicts_[j] < end) {
         predicts_[j] -= (begin - dst);
       }
 #endif
     }
 #ifdef ___SYNC_ENABLED___
     for (size_t j = 0; j < dict_info_.sync_count; j++) {
       if (syncs_[j] >= begin && syncs_[j] < end) {
         syncs_[j] -= (begin - dst);
       }
     }
 #endif
     dst += (end - begin);
   }

   dict_info_.free_count = 0;
   dict_info_.free_size = 0;
   dict_info_.lemma_size = dst;
   lemma_size_left_ = total_size - dict_info_.lemma_size;
   lemma_count_left_ = total_count - dict_info_.lemma_count;

   // XXX Without following code,
   // offsets_by_id_ is not reordered.
   // That's to say, all removed lemmas' ids are not collected back.
   // There may not be room for addition of new lemmas due to
   // offsets_by_id_ reason, although lemma_size_left_ is fixed.
   // By default, we do want defrag as fast as possible, because
   // during defrag procedure, other peers can not write new lemmas
   // to user dictionary file.
   // XXX If write-back is invoked immediately after
   // this defragment, no need to fix up following in-mem data.
   for (uint32 i = 0; i < dict_info_.lemma_count; i++) {
     ids_[i] = start_id_ + i;
     offsets_by_id_[i] = offsets_[i];
   }

   state_ = USER_DICT_DEFRAGMENTED;

 #ifdef ___DEBUG_PERF___
   DEBUG_PERF_END;
   LOGD_PERF("defragment");
 #endif
 }

 #ifdef ___SYNC_ENABLED___
 // Removes the entries [start, end) from the sync queue by moving the
 // entries after `end` down to `start`, and marks the dictionary at least
 // USER_DICT_SYNC_DIRTY.
 //
 // `end` is clamped to the current queue length. A window whose start lies
 // beyond its (clamped) end is ignored.
 void UserDict::clear_sync_lemmas(unsigned int start, unsigned int end) {
   if (is_valid_state() == false)
     return;
   if (end > dict_info_.sync_count)
     end = dict_info_.sync_count;
   // With start > end the unsigned subtraction below would wrap around and
   // grow sync_count instead of shrinking it, and the memmove would copy
   // entries forward past the queue.
   if (start > end)
     return;
   memmove(syncs_ + start, syncs_ + end, (dict_info_.sync_count - end) << 2);
   dict_info_.sync_count -= (end - start);
   if (state_ < USER_DICT_SYNC_DIRTY)
     state_ = USER_DICT_SYNC_DIRTY;
 }

 // Returns the number of entries in the sync queue, or 0 when the
 // dictionary is not in a valid state.
 int UserDict::get_sync_count() {
   if (!is_valid_state())
     return 0;
   return dict_info_.sync_count;
 }

 // Adds or updates a lemma without queueing it for sync.
 // syncs_ is hidden (set to NULL) for the duration of _put_lemma() so that
 // the sync-queueing branch there is skipped. If the first attempt fails
 // because a configured count/size limit is reached, the dictionary is
 // reclaimed, defragmented and flushed, and the insertion is retried once.
 // Returns the lemma id, or 0 on failure.
 LemmaIdType UserDict::put_lemma_no_sync(char16 lemma_str[], uint16 splids[],
                         uint16 lemma_len, uint16 count, uint64 lmt) {
   LemmaIdType id = 0;
   for (int attempt = 0; attempt < 2; attempt++) {
     uint32 * saved_syncs = syncs_;
     syncs_ = NULL;
     id = _put_lemma(lemma_str, splids, lemma_len, count, lmt);
     syncs_ = saved_syncs;

     if (id != 0 || attempt != 0)
       break;

     const bool count_limit_hit =
         dict_info_.limit_lemma_count > 0 &&
         dict_info_.lemma_count >= dict_info_.limit_lemma_count;
     const bool size_limit_hit =
         dict_info_.limit_lemma_size > 0 &&
         dict_info_.lemma_size + (2 + (lemma_len << 2))
         > dict_info_.limit_lemma_size;
     if (!count_limit_hit && !size_limit_hit)
       break;

     // XXX Always reclaim and defrag in sync code path
     //     sync thread is background thread and ok with heavy work
     reclaim();
     defragment();
     flush_cache();
   }
   return id;
 }

 // Parses a UTF-16LE buffer of records and adds each one to the dictionary
 // without queueing it for sync.
 //
 // Record format, as parsed below:
 //   <pinyin syllables separated by ' '>,<phrase>,<frequency>,<last-mod>;
 // (',' is 0x2c, ' ' is 0x20, ';' is 0x3b).
 //
 // Parameters:
 //   lemmas - buffer holding the records.
 //   len    - number of char16 units in `lemmas`.
 // Returns the number of records handed to put_lemma_no_sync(). Parsing
 // stops at the first record whose pinyin cannot be converted or whose
 // phrase length does not match its syllable count.
 int UserDict::put_lemmas_no_sync_from_utf16le_string(char16 * lemmas, int len) {
   int newly_added = 0;

   SpellingParser * spl_parser = new SpellingParser();
   if (!spl_parser) {
     return 0;
   }
 #ifdef ___DEBUG_PERF___
   DEBUG_PERF_BEGIN;
 #endif
   char16 *ptr = lemmas;

   // Extract pinyin,words,frequence,last_mod_time
   char16 * p = ptr, * py16 = ptr;
   char16 * hz16 = NULL;
   int py16_len = 0;
   uint16 splid[kMaxLemmaSize];
   int splid_len = 0;
   int hz16_len = 0;
   char16 * fr16 = NULL;
   int fr16_len = 0;

   // NOTE: every scan loop checks the position against `len` BEFORE
   // dereferencing p, so the buffer is never read at or past lemmas + len.
   while (p - ptr < len) {
     // Pinyin
     py16 = p;
     splid_len = 0;
     while ((p - ptr) < len && *p != 0x2c) {
       if (*p == 0x20)
         splid_len++;
       p++;
     }
     splid_len++;
     if (p - ptr == len)
       break;
     py16_len = p - py16;
     if (kMaxLemmaSize < splid_len) {
       break;
     }
     bool is_pre;
     int splidl = spl_parser->splstr16_to_idxs_f(
         py16, py16_len, splid, NULL, kMaxLemmaSize, is_pre);
     if (splidl != splid_len)
       break;
     // Phrase
     hz16 = ++p;
     while ((p - ptr) < len && *p != 0x2c) {
       p++;
     }
     hz16_len = p - hz16;
     if (hz16_len != splid_len)
       break;
     // Frequency
     fr16 = ++p;
     fr16_len = 0;
     while ((p - ptr) < len && *p != 0x2c) {
       p++;
     }
     fr16_len = p - fr16;
     uint32 intf = (uint32)utf16le_atoll(fr16, fr16_len);
     // Last modified time
     fr16 = ++p;
     fr16_len = 0;
     while ((p - ptr) < len && *p != 0x3b) {
       p++;
     }
     fr16_len = p - fr16;
     uint64 last_mod = utf16le_atoll(fr16, fr16_len);

     put_lemma_no_sync(hz16, splid, splid_len, intf, last_mod);
     newly_added++;

     p++;
   }

   // Every exit from the loop above lands here, so the parser is released
   // exactly once.
   delete spl_parser;

 #ifdef ___DEBUG_PERF___
   DEBUG_PERF_END;
   LOGD_PERF("put_lemmas_no_sync_from_utf16le_string");
 #endif
   return newly_added;
 }

 // Serializes lemmas from the sync queue, starting at its first entry,
 // into `str` as UTF-16 records of the form
 //   <pinyin syllables separated by ' '>,<phrase>,<frequency>,<last-mod>;
 //
 // Parameters:
 //   str   - output buffer.
 //   size  - capacity of `str` in char16 units.
 //   count - receives the number of records written.
 // Returns the number of char16 units written. Records that cannot be
 // formatted into the scratch buffer are skipped; serialization stops at
 // the first record that no longer fits into `str`. When anything was
 // written, state_ is raised to at least USER_DICT_SYNC_DIRTY.
 int UserDict::get_sync_lemmas_in_utf16le_string_from_beginning(
     char16 * str, int size, int * count) {
   int len = 0;
   *count = 0;

   int left_len = size;

   if (is_valid_state() == false)
     return len;

   SpellingTrie * spl_trie = &SpellingTrie::get_instance();
   if (!spl_trie) {
     return 0;
   }

   // Scratch buffer for one record. All limits are expressed in char16
   // ELEMENTS: sizeof(temp) is a byte count and must not be added to a
   // char16 pointer directly, or the bounds checks would allow writes far
   // past the end of the array.
   char16 temp[256];
   char16 * const temp_end = temp + sizeof(temp) / sizeof(temp[0]);

   uint32 i;
   for (i = 0; i < dict_info_.sync_count; i++) {
     int offset = syncs_[i];
     uint32 nchar = get_lemma_nchar(offset);
     uint16 *spl = get_lemma_spell_ids(offset);
     uint16 *wrd = get_lemma_word(offset);
     int score = _get_lemma_score(wrd, spl, nchar);

     // A record without characters has no pinyin to emit; skipping it also
     // keeps the separator fix-up below from stepping before the buffer.
     if (nchar == 0)
       continue;

     char16 *ptemp = temp;

     uint32 j;
     // Add pinyin
     for (j = 0; j < nchar; j++) {
       int ret_len = spl_trie->get_spelling_str16(
           spl[j], ptemp, temp_end - ptemp);
       if (ret_len <= 0)
         break;
       ptemp += ret_len;
       if (ptemp < temp_end - 1) {
         *(ptemp++) = ' ';
       } else {
         j = 0;
         break;
       }
     }
     if (j < nchar) {
       continue;
     }
     // Replace the trailing space with the field separator.
     ptemp--;
     if (ptemp < temp_end - 1) {
       *(ptemp++) = ',';
     } else {
       continue;
     }
     // Add phrase
     for (j = 0; j < nchar; j++) {
       if (ptemp < temp_end - 1) {
         *(ptemp++) = wrd[j];
       } else {
         break;
       }
     }
     if (j < nchar) {
       continue;
     }
     if (ptemp < temp_end - 1) {
       *(ptemp++) = ',';
     } else {
       continue;
     }
     // Add frequency
     uint32 intf = extract_score_freq(score);
     int ret_len = utf16le_lltoa(intf, ptemp, temp_end - ptemp);
     if (ret_len <= 0)
       continue;
     ptemp += ret_len;
     if (ptemp < temp_end - 1) {
       *(ptemp++) = ',';
     } else {
       continue;
     }
     // Add last modified time
     uint64 last_mod = extract_score_lmt(score);
     ret_len = utf16le_lltoa(last_mod, ptemp, temp_end - ptemp);
     if (ret_len <= 0)
       continue;
     ptemp += ret_len;
     if (ptemp < temp_end - 1) {
       *(ptemp++) = ';';
     } else {
       continue;
     }

     // Write to string
     int need_len = ptemp - temp;
     if (need_len > left_len)
       break;
     memcpy(str + len, temp, need_len * 2);
     left_len -= need_len;

     len += need_len;
     (*count)++;
   }

   if (len > 0) {
     if (state_ < USER_DICT_SYNC_DIRTY)
       state_ = USER_DICT_SYNC_DIRTY;
   }
   return len;
 }

 #endif

 // Fills `stat` with a snapshot of the dictionary's bookkeeping data.
 // Returns false when the dictionary is not in a valid state or `stat`
 // is NULL, true otherwise.
 bool UserDict::state(UserDictStat * stat) {
   if (!is_valid_state() || !stat)
     return false;

   stat->version = version_;
   stat->file_name = dict_file_;
   stat->load_time.tv_sec = load_time_.tv_sec;
   stat->load_time.tv_usec = load_time_.tv_usec;

   // g_last_update_ is read while holding g_mutex_.
   pthread_mutex_lock(&g_mutex_);
   stat->last_update.tv_sec = g_last_update_.tv_sec;
   stat->last_update.tv_usec = g_last_update_.tv_usec;
   pthread_mutex_unlock(&g_mutex_);

   stat->disk_size = get_dict_file_size(&dict_info_);
   stat->lemma_count = dict_info_.lemma_count;
   stat->lemma_size = dict_info_.lemma_size;
   stat->delete_count = dict_info_.free_count;
   stat->delete_size = dict_info_.free_size;
 #ifdef ___SYNC_ENABLED___
   stat->sync_count = dict_info_.sync_count;
 #endif
   stat->limit_lemma_count = dict_info_.limit_lemma_count;
   stat->limit_lemma_size = dict_info_.limit_lemma_size;
   stat->reclaim_ratio = dict_info_.reclaim_ratio;
   return true;
 }

 // Stores the lemma count limit, the lemma size limit and the reclaim
 // ratio in dict_info_. A reclaim ratio above 100 is stored as 100.
 void UserDict::set_limit(uint32 max_lemma_count,
                          uint32 max_lemma_size, uint32 reclaim_ratio) {
   dict_info_.limit_lemma_count = max_lemma_count;
   dict_info_.limit_lemma_size = max_lemma_size;
   dict_info_.reclaim_ratio = (reclaim_ratio > 100) ? 100 : reclaim_ratio;
 }

 void UserDict::reclaim() {
   if (is_valid_state() == false)
     return;

   switch (dict_info_.reclaim_ratio) {
     case 0:
       return;
     case 100:
       // TODO: CLEAR to be implemented
       assert(false);
       return;
     default:
       break;
   }

   // XXX Reclaim is only based on count, not size
   uint32 count = dict_info_.lemma_count;
   int rc = count * dict_info_.reclaim_ratio / 100;

   UserDictScoreOffsetPair * score_offset_pairs = NULL;
   score_offset_pairs = (UserDictScoreOffsetPair *)malloc(
       sizeof(UserDictScoreOffsetPair) * rc);
   if (score_offset_pairs == NULL) {
     return;
   }

   for (int i = 0; i < rc; i++) {
     int s = scores_[i];
     score_offset_pairs[i].score = s;
     score_offset_pairs[i].offset_index = i;
   }

   for (int i = (rc + 1) / 2; i >= 0; i--)
     shift_down(score_offset_pairs, i, rc);

   for (uint32 i = rc; i < dict_info_.lemma_count; i++) {
     int s = scores_[i];
     if (s < score_offset_pairs[0].score) {
       score_offset_pairs[0].score = s;
       score_offset_pairs[0].offset_index = i;
       shift_down(score_offset_pairs, 0, rc);
     }
   }

   for (int i = 0; i < rc; i++) {
     int off = score_offset_pairs[i].offset_index;
     remove_lemma_by_offset_index(off);
   }
   if (rc > 0) {
     if (state_ < USER_DICT_OFFSET_DIRTY)
       state_ = USER_DICT_OFFSET_DIRTY;
   }

   free(score_offset_pairs);
 }

 // Exchanges the score and offset_index fields of sop[i] and sop[j].
 inline void UserDict::swap(UserDictScoreOffsetPair * sop, int i, int j) {
   int saved_score = sop[i].score;
   int saved_index = sop[i].offset_index;

   sop[i].score = sop[j].score;
   sop[i].offset_index = sop[j].offset_index;

   sop[j].score = saved_score;
   sop[j].offset_index = saved_index;
 }

 // Restores the max-heap property (keyed on score) for the subtree rooted
 // at index i of the n-element heap `sop`, by repeatedly swapping the node
 // with its larger child until neither child has a higher score.
 //
 // When both children have the same score and that score is higher than
 // the parent's, the left child is chosen. (Requiring one child to be
 // strictly greater than the other would leave such a parent in place and
 // break the heap.)
 void UserDict::shift_down(UserDictScoreOffsetPair * sop, int i, int n) {
   int par = i;
   while (par < n) {
     int left = par * 2 + 1;
     int right = left + 1;
     // Pick the highest score among the parent and its existing children.
     int largest = par;
     if (left < n && sop[left].score > sop[largest].score)
       largest = left;
     if (right < n && sop[right].score > sop[largest].score)
       largest = right;
     if (largest == par)
       break;
     swap(sop, largest, par);
     par = largest;
   }
 }

 // Adds or updates a lemma, stamping it with the current time as its
 // last-modified time. Returns whatever _put_lemma() returns.
 LemmaIdType UserDict::put_lemma(char16 lemma_str[], uint16 splids[],
                                 uint16 lemma_len, uint16 count) {
   const uint64 now = time(NULL);
   return _put_lemma(lemma_str, splids, lemma_len, count, now);
 }

 // Adds a lemma, or updates its score when it is already present.
 //
 // Parameters:
 //   lemma_str - the phrase characters.
 //   splids    - spelling ids, one per character.
 //   lemma_len - number of characters.
 //   count     - frequency to store.
 //   lmt       - last-modified time to store.
 // Returns the lemma id, or 0 when the dictionary is not in a valid state,
 // a configured count/size limit is reached, or appending fails.
 LemmaIdType UserDict::_put_lemma(char16 lemma_str[], uint16 splids[],
                                  uint16 lemma_len, uint16 count, uint64 lmt) {
 #ifdef ___DEBUG_PERF___
   DEBUG_PERF_BEGIN;
 #endif
   if (is_valid_state() == false)
     return 0;
   int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
   if (off != -1) {
     // scores_ holds values packed by build_score(lmt, freq), so the old
     // frequency has to be extracted before it is compared with `count`;
     // subtracting the packed value would corrupt total_nfreq.
     int delta_score = count - extract_score_freq(scores_[off]);
     dict_info_.total_nfreq += delta_score;
     scores_[off] = build_score(lmt, count);
     if (state_ < USER_DICT_SCORE_DIRTY)
       state_ = USER_DICT_SCORE_DIRTY;
 #ifdef ___DEBUG_PERF___
     DEBUG_PERF_END;
     LOGD_PERF("_put_lemma(update)");
 #endif
     return ids_[off];
   } else {
     if ((dict_info_.limit_lemma_count > 0 &&
         dict_info_.lemma_count >= dict_info_.limit_lemma_count)
         || (dict_info_.limit_lemma_size > 0 &&
             dict_info_.lemma_size + (2 + (lemma_len << 2))
             > dict_info_.limit_lemma_size)) {
       // XXX Don't defragment here, it's too time-consuming.
       return 0;
     }
     int flushed = 0;
     if (lemma_count_left_ == 0 ||
         lemma_size_left_ < (size_t)(2 + (lemma_len << 2))) {

       // XXX When there is no space for new lemma, we flush to disk
       // flush_cache() may be called by upper user
       // and better place should be found instead of here
       flush_cache();
       flushed = 1;
       // Or simply return and do nothing
       // return 0;
     }
 #ifdef ___DEBUG_PERF___
     DEBUG_PERF_END;
     LOGD_PERF(flushed ? "_put_lemma(flush+add)" : "_put_lemma(add)");
 #endif
     LemmaIdType id = append_a_lemma(lemma_str, splids, lemma_len, count, lmt);
 #ifdef ___SYNC_ENABLED___
     // syncs_ is NULL while put_lemma_no_sync() is running.
     if (syncs_ && id != 0) {
       queue_lemma_for_sync(id);
     }
 #endif
     return id;
   }
   return 0;
 }

 #ifdef ___SYNC_ENABLED___
 // Appends the lemma's offset to the sync queue, growing the queue by
 // kUserDictPreAlloc entries first when it is full. If growing fails the
 // lemma is not queued.
 void UserDict::queue_lemma_for_sync(LemmaIdType id) {
   if (dict_info_.sync_count >= sync_count_size_) {
     uint32 * grown = (uint32*)realloc(
         syncs_, (sync_count_size_ + kUserDictPreAlloc) << 2);
     if (!grown)
       return;
     sync_count_size_ += kUserDictPreAlloc;
     syncs_ = grown;
   }
   syncs_[dict_info_.sync_count++] = offsets_by_id_[id - start_id_];
 }
 #endif

 // Adjusts the frequency of an existing lemma by delta_count and, when
 // `selected` is true, refreshes its last-modified time to now.
 // Returns the lemma id, or 0 when the dictionary state or the id is
 // invalid, or the lemma cannot be located in offsets_.
 LemmaIdType UserDict::update_lemma(LemmaIdType lemma_id, int16 delta_count,
                                    bool selected) {
 #ifdef ___DEBUG_PERF___
   DEBUG_PERF_BEGIN;
 #endif
   if (!is_valid_state())
     return 0;
   if (!is_valid_lemma_id(lemma_id))
     return 0;

   uint32 lemma_off = offsets_by_id_[lemma_id - start_id_];
   uint8 nchar = get_lemma_nchar(lemma_off);
   char16 * word = get_lemma_word(lemma_off);
   uint16 * spl = get_lemma_spell_ids(lemma_off);

   int32 idx = locate_in_offsets(word, spl, nchar);
   if (idx == -1)
     return 0;

   int packed = scores_[idx];
   int freq = extract_score_freq(packed);
   uint64 lmt = extract_score_lmt(packed);
   // Clamp so the frequency never exceeds kUserDictMaxFrequency.
   // NOTE(review): the second test is also true for any negative
   // delta_count, which then raises the frequency to the maximum --
   // confirm this is intended.
   if (freq + delta_count > kUserDictMaxFrequency ||
       freq + delta_count < freq) {
     delta_count = kUserDictMaxFrequency - freq;
   }
   freq += delta_count;
   dict_info_.total_nfreq += delta_count;
   if (selected) {
     lmt = time(NULL);
   }
   scores_[idx] = build_score(lmt, freq);
   if (state_ < USER_DICT_SCORE_DIRTY)
     state_ = USER_DICT_SCORE_DIRTY;
 #ifdef ___DEBUG_PERF___
   DEBUG_PERF_END;
   LOGD_PERF("update_lemma");
 #endif
 #ifdef ___SYNC_ENABLED___
   queue_lemma_for_sync(ids_[idx]);
 #endif
   return ids_[idx];
 }

 // Returns dict_info_.total_nfreq, the running frequency total maintained
 // by the put/update/append paths.
 size_t UserDict::get_total_lemma_count() {
   const size_t total = dict_info_.total_nfreq;
   return total;
 }

 void UserDict::set_total_lemma_count_of_others(size_t count) {
   total_other_nfreq_ = count;
 }

 // Appends a new lemma record to lemmas_ and inserts its bookkeeping entries
 // into offsets_/scores_/ids_ (and predicts_ when enabled).
 //
 // Record layout written at the end of lemmas_:
 //   byte 0            : flag (0)
 //   byte 1            : number of characters
 //   next 2*lemma_len  : spelling ids
 //   next 2*lemma_len  : characters
 //
 // Parameters:
 //   lemma_str - the phrase characters.
 //   splids    - spelling ids, one per character.
 //   lemma_len - number of characters.
 //   count     - frequency, packed into the score together with lmt.
 //   lmt       - last-modified time.
 // Returns the new lemma id, or 0 when the current lemma size exceeds
 // kUserDictOffsetMask.
 //
 // NOTE(review): no capacity check is done here; presumably the caller
 // guarantees that lemma_count_left_ / lemma_size_left_ are sufficient --
 // verify against _put_lemma().
 LemmaIdType UserDict::append_a_lemma(char16 lemma_str[], uint16 splids[],
                                    uint16 lemma_len, uint16 count, uint64 lmt) {
   LemmaIdType id = get_max_lemma_id() + 1;
   size_t offset = dict_info_.lemma_size;
   if (offset > kUserDictOffsetMask)
     return 0;

   // Write the record: flag byte, length byte, then spelling ids followed
   // by the characters.
   lemmas_[offset] = 0;
   lemmas_[offset + 1] = (uint8)lemma_len;
   for (size_t i = 0; i < lemma_len; i++) {
     *((uint16*)&lemmas_[offset + 2 + (i << 1)]) = splids[i];
     *((char16*)&lemmas_[offset + 2 + (lemma_len << 1) + (i << 1)])
         = lemma_str[i];
   }
   // Park the new entry at the end of the per-lemma arrays; it is moved to
   // its sorted position below.
   uint32 off = dict_info_.lemma_count;
   offsets_[off] = offset;
   scores_[off] = build_score(lmt, count);
   ids_[off] = id;
 #ifdef ___PREDICT_ENABLED___
   predicts_[off] = offset;
 #endif

   offsets_by_id_[id - start_id_] = offset;

   // Each record takes 2 header bytes plus 4 bytes per character.
   dict_info_.lemma_count++;
   dict_info_.lemma_size += (2 + (lemma_len << 2));
   lemma_count_left_--;
   lemma_size_left_ -= (2 + (lemma_len << 2));

   // Sort

   UserDictSearchable searchable;
   prepare_locate(&searchable, splids, lemma_len);

   // Linear scan for the first existing entry for which
   // fuzzy_compare_spell_id() returns a non-negative value; the new entry
   // is inserted in front of it.
   size_t i = 0;
   while (i < off) {
     offset = offsets_[i];
     uint32 nchar = get_lemma_nchar(offset);
     uint16 * spl = get_lemma_spell_ids(offset);

     if (0 <= fuzzy_compare_spell_id(spl, nchar, &searchable))
       break;
     i++;
   }
   if (i != off) {
     // Shift [i, off) up by one slot in each array and drop the new entry
     // into slot i.
     uint32 temp = offsets_[off];
     memmove(offsets_ + i + 1, offsets_ + i, (off - i) << 2);
     offsets_[i] = temp;

     temp = scores_[off];
     memmove(scores_ + i + 1, scores_ + i, (off - i) << 2);
     scores_[i] = temp;

     temp = ids_[off];
     memmove(ids_ + i + 1, ids_ + i, (off - i) << 2);
     ids_[i] = temp;
   }

 #ifdef ___PREDICT_ENABLED___
   // predicts_ uses its own insertion position, computed from the lemma's
   // characters by locate_where_to_insert_in_predicts().
   uint32 j = 0;
   uint16 * words_new = get_lemma_word(predicts_[off]);
   j = locate_where_to_insert_in_predicts(words_new, lemma_len);
   if (j != off) {
     uint32 temp = predicts_[off];
     memmove(predicts_ + j + 1, predicts_ + j, (off - j) << 2);
     predicts_[j] = temp;
   }
 #endif

   if (state_ < USER_DICT_LEMMA_DIRTY)
     state_ = USER_DICT_LEMMA_DIRTY;

 #ifdef ___CACHE_ENABLED___
   // Reset both search caches after the arrays were reordered.
   cache_init();
 #endif

   dict_info_.total_nfreq += count;
   return id;
 }
 }