blob: 6005e20d2aef49a538db3d0a3b2cd27464c088b8 [file] [log] [blame]
/*
* Copyright (C) 2009 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include "../include/spellingtable.h"
namespace ime_pinyin {
#ifdef ___BUILD_MODEL___
const char SpellingTable::
kNotSupportList[kNotSupportNum][kMaxSpellingSize + 1] = {"HM", "HNG", "NG"};
// "" is the biggest, so that all empty strings will be moved to the end
// _eb mean empty is biggest
int compare_raw_spl_eb(const void* p1, const void* p2) {
if ('\0' == (static_cast<const RawSpelling*>(p1))->str[0])
return 1;
if ('\0' == (static_cast<const RawSpelling*>(p2))->str[0])
return -1;
return strcmp((static_cast<const RawSpelling*>(p1))->str,
(static_cast<const RawSpelling*>(p2))->str);
}
size_t get_odd_next(size_t value) {
size_t v_next = value;
while (true) {
size_t v_next_sqrt = (size_t)sqrt(v_next);
bool is_odd = true;
for (size_t v_dv = 2; v_dv < v_next_sqrt + 1; v_dv++) {
if (v_next % v_dv == 0) {
is_odd = false;
break;
}
}
if (is_odd)
return v_next;
v_next++;
}
// never reach here
return 0;
}
SpellingTable::SpellingTable() {
need_score_ = false;
raw_spellings_ = NULL;
spelling_buf_ = NULL;
spelling_num_ = 0;
total_freq_ = 0;
frozen_ = true;
}
SpellingTable::~SpellingTable() {
free_resource();
}
size_t SpellingTable::get_hash_pos(const char* spelling_str) {
size_t hash_pos = 0;
for (size_t pos = 0; pos < spelling_size_; pos++) {
if ('\0' == spelling_str[pos])
break;
hash_pos += (size_t)spelling_str[pos];
}
hash_pos = hash_pos % spelling_max_num_;
return hash_pos;
}
size_t SpellingTable::hash_pos_next(size_t hash_pos) {
hash_pos += 123;
hash_pos = hash_pos % spelling_max_num_;
return hash_pos;
}
void SpellingTable::free_resource() {
if (NULL != raw_spellings_)
delete [] raw_spellings_;
raw_spellings_ = NULL;
if (NULL != spelling_buf_)
delete [] spelling_buf_;
spelling_buf_ = NULL;
}
bool SpellingTable::init_table(size_t pure_spl_size, size_t spl_max_num,
bool need_score) {
if (pure_spl_size == 0 || spl_max_num ==0)
return false;
need_score_ = need_score;
free_resource();
spelling_size_ = pure_spl_size + 1;
if (need_score)
spelling_size_ += 1;
spelling_max_num_ = get_odd_next(spl_max_num);
spelling_num_ = 0;
raw_spellings_ = new RawSpelling[spelling_max_num_];
spelling_buf_ = new char[spelling_max_num_ * (spelling_size_)];
if (NULL == raw_spellings_ || NULL == spelling_buf_) {
free_resource();
return false;
}
memset(raw_spellings_, 0, spelling_max_num_ * sizeof(RawSpelling));
memset(spelling_buf_, 0, spelling_max_num_ * (spelling_size_));
frozen_ = false;
total_freq_ = 0;
return true;
}
bool SpellingTable::put_spelling(const char* spelling_str, double freq) {
if (frozen_ || NULL == spelling_str)
return false;
for (size_t pos = 0; pos < kNotSupportNum; pos++) {
if (strcmp(spelling_str, kNotSupportList[pos]) == 0) {
return false;
}
}
total_freq_ += freq;
size_t hash_pos = get_hash_pos(spelling_str);
raw_spellings_[hash_pos].str[spelling_size_ - 1] = '\0';
if (strncmp(raw_spellings_[hash_pos].str, spelling_str,
spelling_size_ - 1) == 0) {
raw_spellings_[hash_pos].freq += freq;
return true;
}
size_t hash_pos_ori = hash_pos;
while (true) {
if (strncmp(raw_spellings_[hash_pos].str,
spelling_str, spelling_size_ - 1) == 0) {
raw_spellings_[hash_pos].freq += freq;
return true;
}
if ('\0' == raw_spellings_[hash_pos].str[0]) {
raw_spellings_[hash_pos].freq += freq;
strncpy(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1);
raw_spellings_[hash_pos].str[spelling_size_ - 1] = '\0';
spelling_num_++;
return true;
}
hash_pos = hash_pos_next(hash_pos);
if (hash_pos_ori == hash_pos)
return false;
}
// never reach here
return false;
}
bool SpellingTable::contain(const char* spelling_str) {
if (NULL == spelling_str || NULL == spelling_buf_ || frozen_)
return false;
size_t hash_pos = get_hash_pos(spelling_str);
if ('\0' == raw_spellings_[hash_pos].str[0])
return false;
if (strncmp(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1)
== 0)
return true;
size_t hash_pos_ori = hash_pos;
while (true) {
hash_pos = hash_pos_next(hash_pos);
if (hash_pos_ori == hash_pos)
return false;
if ('\0' == raw_spellings_[hash_pos].str[0])
return false;
if (strncmp(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1)
== 0)
return true;
}
// never reach here
return false;
}
const char* SpellingTable::arrange(size_t *item_size, size_t *spl_num) {
if (NULL == raw_spellings_ || NULL == spelling_buf_ ||
NULL == item_size || NULL == spl_num)
return NULL;
qsort(raw_spellings_, spelling_max_num_, sizeof(RawSpelling),
compare_raw_spl_eb);
// After sorting, only the first spelling_num_ items are valid.
// Copy them to the destination buffer.
for (size_t pos = 0; pos < spelling_num_; pos++) {
strncpy(spelling_buf_ + pos * spelling_size_, raw_spellings_[pos].str,
spelling_size_);
}
if (need_score_) {
if (kPrintDebug0)
printf("------------Spelling Possiblities--------------\n");
double max_score = 0;
double min_score = 0;
// After sorting, only the first spelling_num_ items are valid.
for (size_t pos = 0; pos < spelling_num_; pos++) {
raw_spellings_[pos].freq /= total_freq_;
if (need_score_) {
if (0 == pos) {
max_score = raw_spellings_[0].freq;
min_score = max_score;
} else {
if (raw_spellings_[pos].freq > max_score)
max_score = raw_spellings_[pos].freq;
if (raw_spellings_[pos].freq < min_score)
min_score = raw_spellings_[pos].freq;
}
}
}
if (kPrintDebug0)
printf("-----max psb: %f, min psb: %f\n", max_score, min_score);
max_score = log(max_score);
min_score = log(min_score);
if (kPrintDebug0)
printf("-----max log value: %f, min log value: %f\n",
max_score, min_score);
// The absolute value of min_score is bigger than that of max_score because
// both of them are negative after log function.
score_amplifier_ = 1.0 * 255 / min_score;
double average_score = 0;
for (size_t pos = 0; pos < spelling_num_; pos++) {
double score = log(raw_spellings_[pos].freq) * score_amplifier_;
assert(score >= 0);
average_score += score;
// Because of calculation precision issue, score might be a little bigger
// than 255 after being amplified.
if (score > 255)
score = 255;
char *this_spl_buf = spelling_buf_ + pos * spelling_size_;
this_spl_buf[spelling_size_ - 1] =
static_cast<char>((unsigned char)score);
if (kPrintDebug0) {
printf("---pos:%d, %s, psb:%d\n", pos, this_spl_buf,
(unsigned char)this_spl_buf[spelling_size_ -1]);
}
}
average_score /= spelling_num_;
assert(average_score <= 255);
average_score_ = static_cast<uint8>(average_score);
if (kPrintDebug0)
printf("\n----Score Amplifier: %f, Average Score: %d\n", score_amplifier_,
average_score_);
}
*item_size = spelling_size_;
*spl_num = spelling_num_;
frozen_ = true;
return spelling_buf_;
}
float SpellingTable::get_score_amplifier() {
return static_cast<float>(score_amplifier_);
}
unsigned char SpellingTable::get_average_score() {
return average_score_;
}
#endif // ___BUILD_MODEL___
} // namespace ime_pinyin