| /* |
| * Copyright (C) 2009 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #include <assert.h> |
| #include <math.h> |
| #include <stdio.h> |
| #include <string.h> |
| #include <time.h> |
| #include "../include/mystdlib.h" |
| #include "../include/ngram.h" |
| |
| namespace ime_pinyin { |
| |
| #define ADD_COUNT 0.3 |
| |
| int comp_double(const void *p1, const void *p2) { |
| if (*static_cast<const double*>(p1) < *static_cast<const double*>(p2)) |
| return -1; |
| if (*static_cast<const double*>(p1) > *static_cast<const double*>(p2)) |
| return 1; |
| return 0; |
| } |
| |
| inline double distance(double freq, double code) { |
| // return fabs(freq - code); |
| return freq * fabs(log(freq) - log(code)); |
| } |
| |
| // Find the index of the code value which is nearest to the given freq |
| int qsearch_nearest(double code_book[], double freq, int start, int end) { |
| if (start == end) |
| return start; |
| |
| if (start + 1 == end) { |
| if (distance(freq, code_book[end]) > distance(freq, code_book[start])) |
| return start; |
| return end; |
| } |
| |
| int mid = (start + end) / 2; |
| |
| if (code_book[mid] > freq) |
| return qsearch_nearest(code_book, freq, start, mid); |
| else |
| return qsearch_nearest(code_book, freq, mid, end); |
| } |
| |
| size_t update_code_idx(double freqs[], size_t num, double code_book[], |
| CODEBOOK_TYPE *code_idx) { |
| size_t changed = 0; |
| for (size_t pos = 0; pos < num; pos++) { |
| CODEBOOK_TYPE idx; |
| idx = qsearch_nearest(code_book, freqs[pos], 0, kCodeBookSize - 1); |
| if (idx != code_idx[pos]) |
| changed++; |
| code_idx[pos] = idx; |
| } |
| return changed; |
| } |
| |
| double recalculate_kernel(double freqs[], size_t num, double code_book[], |
| CODEBOOK_TYPE *code_idx) { |
| double ret = 0; |
| |
| size_t *item_num = new size_t[kCodeBookSize]; |
| assert(item_num); |
| memset(item_num, 0, sizeof(size_t) * kCodeBookSize); |
| |
| double *cb_new = new double[kCodeBookSize]; |
| assert(cb_new); |
| memset(cb_new, 0, sizeof(double) * kCodeBookSize); |
| |
| for (size_t pos = 0; pos < num; pos++) { |
| ret += distance(freqs[pos], code_book[code_idx[pos]]); |
| |
| cb_new[code_idx[pos]] += freqs[pos]; |
| item_num[code_idx[pos]] += 1; |
| } |
| |
| for (size_t code = 0; code < kCodeBookSize; code++) { |
| assert(item_num[code] > 0); |
| code_book[code] = cb_new[code] / item_num[code]; |
| } |
| |
| delete [] item_num; |
| delete [] cb_new; |
| |
| return ret; |
| } |
| |
| void iterate_codes(double freqs[], size_t num, double code_book[], |
| CODEBOOK_TYPE *code_idx) { |
| size_t iter_num = 0; |
| double delta_last = 0; |
| do { |
| size_t changed = update_code_idx(freqs, num, code_book, code_idx); |
| |
| double delta = recalculate_kernel(freqs, num, code_book, code_idx); |
| |
| if (kPrintDebug0) { |
| printf("---Unigram codebook iteration: %d : %d, %.9f\n", |
| iter_num, changed, delta); |
| } |
| iter_num++; |
| |
| if (iter_num > 1 && |
| (delta == 0 || fabs(delta_last - delta)/fabs(delta) < 0.000000001)) |
| break; |
| delta_last = delta; |
| } while (true); |
| } |
| |
| |
| NGram* NGram::instance_ = NULL; |
| |
| NGram::NGram() { |
| initialized_ = false; |
| idx_num_ = 0; |
| lma_freq_idx_ = NULL; |
| sys_score_compensation_ = 0; |
| |
| #ifdef ___BUILD_MODEL___ |
| freq_codes_df_ = NULL; |
| #endif |
| freq_codes_ = NULL; |
| } |
| |
| NGram::~NGram() { |
| if (NULL != lma_freq_idx_) |
| free(lma_freq_idx_); |
| |
| #ifdef ___BUILD_MODEL___ |
| if (NULL != freq_codes_df_) |
| free(freq_codes_df_); |
| #endif |
| |
| if (NULL != freq_codes_) |
| free(freq_codes_); |
| } |
| |
| NGram& NGram::get_instance() { |
| if (NULL == instance_) |
| instance_ = new NGram(); |
| return *instance_; |
| } |
| |
| bool NGram::save_ngram(FILE *fp) { |
| if (!initialized_ || NULL == fp) |
| return false; |
| |
| if (0 == idx_num_ || NULL == freq_codes_ || NULL == lma_freq_idx_) |
| return false; |
| |
| if (fwrite(&idx_num_, sizeof(size_t), 1, fp) != 1) |
| return false; |
| |
| if (fwrite(freq_codes_, sizeof(LmaScoreType), kCodeBookSize, fp) != |
| kCodeBookSize) |
| return false; |
| |
| if (fwrite(lma_freq_idx_, sizeof(CODEBOOK_TYPE), idx_num_, fp) != idx_num_) |
| return false; |
| |
| return true; |
| } |
| |
| bool NGram::load_ngram(FILE *fp) { |
| if (NULL == fp) |
| return false; |
| |
| initialized_ = false; |
| |
| if (fread(&idx_num_, sizeof(size_t), 1, fp) != 1 ) |
| return false; |
| |
| if (NULL != lma_freq_idx_) |
| free(lma_freq_idx_); |
| |
| if (NULL != freq_codes_) |
| free(freq_codes_); |
| |
| lma_freq_idx_ = static_cast<CODEBOOK_TYPE*> |
| (malloc(idx_num_ * sizeof(CODEBOOK_TYPE))); |
| freq_codes_ = static_cast<LmaScoreType*> |
| (malloc(kCodeBookSize * sizeof(LmaScoreType))); |
| |
| if (NULL == lma_freq_idx_ || NULL == freq_codes_) |
| return false; |
| |
| if (fread(freq_codes_, sizeof(LmaScoreType), kCodeBookSize, fp) != |
| kCodeBookSize) |
| return false; |
| |
| if (fread(lma_freq_idx_, sizeof(CODEBOOK_TYPE), idx_num_, fp) != idx_num_) |
| return false; |
| |
| initialized_ = true; |
| |
| total_freq_none_sys_ = 0; |
| return true; |
| } |
| |
| void NGram::set_total_freq_none_sys(size_t freq_none_sys) { |
| total_freq_none_sys_ = freq_none_sys; |
| if (0 == total_freq_none_sys_) { |
| sys_score_compensation_ = 0; |
| } else { |
| double factor = static_cast<double>(kSysDictTotalFreq) / ( |
| kSysDictTotalFreq + total_freq_none_sys_); |
| sys_score_compensation_ = static_cast<float>( |
| log(factor) * kLogValueAmplifier); |
| } |
| } |
| |
| // The caller makes sure this oject is initialized. |
| float NGram::get_uni_psb(LemmaIdType lma_id) { |
| return static_cast<float>(freq_codes_[lma_freq_idx_[lma_id]]) + |
| sys_score_compensation_; |
| } |
| |
| float NGram::convert_psb_to_score(double psb) { |
| float score = static_cast<float>( |
| log(psb) * static_cast<double>(kLogValueAmplifier)); |
| if (score > static_cast<float>(kMaxScore)) { |
| score = static_cast<float>(kMaxScore); |
| } |
| return score; |
| } |
| |
| #ifdef ___BUILD_MODEL___ |
| bool NGram::build_unigram(LemmaEntry *lemma_arr, size_t lemma_num, |
| LemmaIdType next_idx_unused) { |
| if (NULL == lemma_arr || 0 == lemma_num || next_idx_unused <= 1) |
| return false; |
| |
| double total_freq = 0; |
| double *freqs = new double[next_idx_unused]; |
| if (NULL == freqs) |
| return false; |
| |
| freqs[0] = ADD_COUNT; |
| total_freq += freqs[0]; |
| LemmaIdType idx_now = 0; |
| for (size_t pos = 0; pos < lemma_num; pos++) { |
| if (lemma_arr[pos].idx_by_hz == idx_now) |
| continue; |
| idx_now++; |
| |
| assert(lemma_arr[pos].idx_by_hz == idx_now); |
| |
| freqs[idx_now] = lemma_arr[pos].freq; |
| if (freqs[idx_now] <= 0) |
| freqs[idx_now] = 0.3; |
| |
| total_freq += freqs[idx_now]; |
| } |
| |
| double max_freq = 0; |
| idx_num_ = idx_now + 1; |
| assert(idx_now + 1 == next_idx_unused); |
| |
| for (size_t pos = 0; pos < idx_num_; pos++) { |
| freqs[pos] = freqs[pos] / total_freq; |
| assert(freqs[pos] > 0); |
| if (freqs[pos] > max_freq) |
| max_freq = freqs[pos]; |
| } |
| |
| // calculate the code book |
| if (NULL == freq_codes_df_) |
| freq_codes_df_ = new double[kCodeBookSize]; |
| assert(freq_codes_df_); |
| memset(freq_codes_df_, 0, sizeof(double) * kCodeBookSize); |
| |
| if (NULL == freq_codes_) |
| freq_codes_ = new LmaScoreType[kCodeBookSize]; |
| assert(freq_codes_); |
| memset(freq_codes_, 0, sizeof(LmaScoreType) * kCodeBookSize); |
| |
| size_t freq_pos = 0; |
| for (size_t code_pos = 0; code_pos < kCodeBookSize; code_pos++) { |
| bool found = true; |
| |
| while (found) { |
| found = false; |
| double cand = freqs[freq_pos]; |
| for (size_t i = 0; i < code_pos; i++) |
| if (freq_codes_df_[i] == cand) { |
| found = true; |
| break; |
| } |
| if (found) |
| freq_pos++; |
| } |
| |
| freq_codes_df_[code_pos] = freqs[freq_pos]; |
| freq_pos++; |
| } |
| |
| myqsort(freq_codes_df_, kCodeBookSize, sizeof(double), comp_double); |
| |
| if (NULL == lma_freq_idx_) |
| lma_freq_idx_ = new CODEBOOK_TYPE[idx_num_]; |
| assert(lma_freq_idx_); |
| |
| iterate_codes(freqs, idx_num_, freq_codes_df_, lma_freq_idx_); |
| |
| delete [] freqs; |
| |
| if (kPrintDebug0) { |
| printf("\n------Language Model Unigram Codebook------\n"); |
| } |
| |
| for (size_t code_pos = 0; code_pos < kCodeBookSize; code_pos++) { |
| double log_score = log(freq_codes_df_[code_pos]); |
| float final_score = convert_psb_to_score(freq_codes_df_[code_pos]); |
| if (kPrintDebug0) { |
| printf("code:%d, probability:%.9f, log score:%.3f, final score: %.3f\n", |
| code_pos, freq_codes_df_[code_pos], log_score, final_score); |
| } |
| freq_codes_[code_pos] = static_cast<LmaScoreType>(final_score); |
| } |
| |
| initialized_ = true; |
| return true; |
| } |
| #endif |
| |
| } // namespace ime_pinyin |