blob: 3f248b5d0623fe24f5aada4f2eab6007b165c47d [file] [log] [blame]
/*
* Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* @file picoklex.c
*
* Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
* All rights reserved.
*
* History:
* - 2009-04-20 -- initial version
*
*/
#include "picoos.h"
#include "picodbg.h"
#include "picodata.h"
#include "picoknow.h"
#include "picoklex.h"
#ifdef __cplusplus
extern "C" {
#endif
#if 0
}
#endif
/* ************************************************************/
/* lexicon */
/* ************************************************************/
/**
* @addtogroup picolex
*
overview:
- lex consists of optional searchindex and a non-empty list of lexblocks
- lexblocks are fixed size, at the start of a block there is also the
start of an entry
- using the searchindex an unambiguous lexblock can be determined which
contains the entry (or there is no entry)
- one lex entry has POS GRAPH PHON, all mandatory, but
- PHON can be empty string -> no pronunciation in the resulting TTS output
- PHON can be :G2P -> use G2P later to add pronunciation
- (POS,GRAPH) is a uniq key (only one entry allowed)
- (GRAPH) is almost a uniq key (2-4 entries with the same GRAPH, and
differing POS and differing PHON possible)
- for one graph we can have two or three solutions from the lex
which all need to be passed on to the next PU
- in this case GRAPH, POS, and PHON all must be available in lex
sizing:
- 3 bytes entry index -> 16MB addressable
- 2 bytes searchindex nr -> 64K blocks possible
- 5 bytes per searchindex entry
- 3 bytes for graph-prefix
- 2 bytes blockadr in searchindex -> 64K blocks possible
- lexblock size 512B:
- 32M possible
- with ~20 bytes per entry
-> max. average of ~26 entries to be searched per lookup
- overhead of ~10 bytes per block to sync with
block boundaries
- examples:
- 500KB lex -> 1000 blocks,
1000 entries in searchindex, ~25.6K lex-entries,
- ~5KB searchindex
~10KB overhead for block sync
- 100KB lex -> 200 blocks,
200 entries in searchindex, ~5.1K lex-entries,
- ~1KB searchindex
~2KB overhead for block sync
pil-file: lexicon knowledge base in binary form
lex-kb = content
content = searchindex {lexblock}1:NRBLOCKS2
lexblock = {lexentry}1: (lexblock size is fixed 512Bytes)
searchindex = NRBLOCKS2 {GRAPH1 GRAPH1 GRAPH1 LEXBLOCKIND2}=NRBLOCKS2
lexentry = LENGRAPH1 {GRAPH1}=LENGRAPH1-1
LENPOSPHON1 POS1 {PHON1}=LENPOSPHON1-2
- special cases:
- PHON is empty string (no pronunciation in the resulting TTS output):
lexentry = LENGRAPH1 {GRAPH1}=LENGRAPH1-1 2 POS1
- PHON can be :G2P -> use G2P later to add pronunciation:
lexentry = LENGRAPH1 {GRAPH1}=LENGRAPH1-1 3 POS1 <reserved-phon-val=5>
- multi-byte values always little endian
*/
/* ************************************************************/
/* lexicon data defines */
/* may not be changed with current implementation */
/* ************************************************************/
/* nr bytes of nrblocks info (the 2-byte block count at the start of the
   lexicon knowledge base) */
#define PICOKLEX_LEX_NRBLOCKS_SIZE 2
/* search index entry: - nr graphs (bytes of graph-prefix)
- nr bytes of block index
- nr bytes per entry, NRGRAPHS+INDSIZE */
#define PICOKLEX_LEX_SIE_NRGRAPHS 3
#define PICOKLEX_LEX_SIE_INDSIZE 2
#define PICOKLEX_LEX_SIE_SIZE 5
/* nr of bytes per lexblock */
#define PICOKLEX_LEXBLOCK_SIZE 512
/* reserved values in klex to indicate :G2P needed for a lexentry
   (compared against the first PHON byte of an entry) */
#define PICOKLEX_NEEDS_G2P 5
/* ************************************************************/
/* lexicon type and loading */
/* ************************************************************/
/** object : LexKnowledgeBase
* shortcut : klex
* derived from : picoknow_KnowledgeBase
*/
/* handle type: pointer to the lexicon sub-object of a knowledge base */
typedef struct klex_subobj *klex_SubObj;
/* lexicon sub-object: locates the search index and the lexblocks inside
   the knowledge base memory; the fields are filled in by klexInitialize */
typedef struct klex_subobj
{
picoos_uint16 nrblocks; /* nr lexblocks = nr eles in searchind */
picoos_uint8 *searchind; /* start of the search index, NULL if nrblocks is 0 */
picoos_uint8 *lexblocks; /* start of the lexblocks, located after the
                            nrblocks field and the search index entries */
} klex_subobj_t;
/**
 * Sets up the lexicon sub-object from the raw knowledge base memory.
 *
 * Reads the number of lexblocks from the start of the knowledge base and
 * derives from it where the search index and the lexblocks start.
 *
 * @param this   knowledge base; its subObj must already be allocated
 * @param common provides the exception manager used to report errors
 * @return PICO_OK on success, otherwise the status returned by
 *         picoos_emRaiseException (PICO_EXC_KB_MISSING is raised if 'this'
 *         or its sub-object is NULL, PICO_EXC_FILE_CORRUPT if the block
 *         count cannot be read)
 */
static pico_status_t klexInitialize(register picoknow_KnowledgeBase this,
                                    picoos_Common common)
{
    picoos_uint32 curpos = 0;
    klex_subobj_t *klex;

    PICODBG_DEBUG(("start"));

    /* check whether (this->size != 0) done before calling this function */
    if (NULL == this || NULL == this->subObj) {
        return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
                                       NULL, NULL);
    }
    klex = (klex_subobj_t *) this->subObj;

    /* The sub-object comes straight from the allocator; start from a
       well-defined empty lexicon so that a failed initialization never
       leaves stray pointers or a bogus block count behind. */
    klex->nrblocks = 0;
    klex->searchind = NULL;
    klex->lexblocks = NULL;

    if (PICO_OK != picoos_read_mem_pi_uint16(this->base, &curpos,
                                             &(klex->nrblocks))) {
        /* reset in case the failed read touched the value */
        klex->nrblocks = 0;
        return picoos_emRaiseException(common->em, PICO_EXC_FILE_CORRUPT,
                                       NULL, NULL);
    }

    if (klex->nrblocks > 0) {
        PICODBG_DEBUG(("nr blocks: %i, curpos: %i", klex->nrblocks,curpos));
        /* search index follows directly after the block count */
        klex->searchind = this->base + curpos;
    }
    /* lexblocks follow the block count and all search index entries */
    klex->lexblocks = this->base + PICOKLEX_LEX_NRBLOCKS_SIZE +
                      (klex->nrblocks * (PICOKLEX_LEX_SIE_SIZE));
    return PICO_OK;
}
/**
 * Releases the lexicon sub-object of a knowledge base.
 *
 * @param this knowledge base whose subObj is to be freed; NULL is ignored
 * @param mm   memory manager passed on to picoos_deallocate
 * @return always PICO_OK
 */
static pico_status_t klexSubObjDeallocate(register picoknow_KnowledgeBase this,
                                          picoos_MemoryManager mm)
{
    if (NULL == this) {
        /* nothing to release */
        return PICO_OK;
    }
    picoos_deallocate(mm, (void *) &this->subObj);
    return PICO_OK;
}
/* we don't offer a specialized constructor for a LexKnowledgeBase but
* instead a "specializer" of an already existing generic
* picoknow_KnowledgeBase */
/**
 * Turns a generic knowledge base into a lexicon knowledge base.
 *
 * For a non-empty knowledge base the lexicon sub-object is allocated,
 * the matching deallocator is registered and the sub-object is
 * initialized; an empty knowledge base is left untouched.
 *
 * @param this   knowledge base to specialize
 * @param common provides the memory manager and the exception manager
 * @return PICO_OK for an empty knowledge base, the result of
 *         klexInitialize for a non-empty one, or the status returned by
 *         picoos_emRaiseException if 'this' is NULL or allocation fails
 */
pico_status_t picoklex_specializeLexKnowledgeBase(picoknow_KnowledgeBase this,
                                                  picoos_Common common)
{
    if (NULL == this) {
        return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
                                       NULL, NULL);
    }

    if (!(this->size > 0)) {
        /* some dummy klex */
        return PICO_OK;
    }

    this->subDeallocate = klexSubObjDeallocate;
    this->subObj = picoos_allocate(common->mm, sizeof(klex_subobj_t));
    if (NULL != this->subObj) {
        return klexInitialize(this, common);
    }
    return picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM,
                                   NULL, NULL);
}
/* for now we don't need to do anything special for the main lex */
/*
pico_status_t picoklex_specializeMainLexKnowledgeBase(
picoknow_KnowledgeBase this,
picoos_Common common)
{
return picoklex_specializeLexKnowledgeBase(this,common);
}
*/
/* ************************************************************/
/* lexicon getLex */
/* ************************************************************/
/**
 * Returns the lexicon view of a knowledge base.
 *
 * @param this knowledge base, may be NULL
 * @return the sub-object of 'this' cast to picoklex_Lex, or NULL if
 *         'this' is NULL
 */
picoklex_Lex picoklex_getLex(picoknow_KnowledgeBase this)
{
    return (NULL == this) ? NULL : (picoklex_Lex) this->subObj;
}
/* ************************************************************/
/* functions on searchindex */
/* ************************************************************/
/* Returns the graph-prefix of search index entry 'index' as a number:
   the first PICOKLEX_LEX_SIE_NRGRAPHS bytes of the entry are combined
   with the first byte as most significant one, so that numeric order
   equals byte-wise lexicographic order. No range check is done on
   'index'. */
static picoos_uint32 klex_getSearchIndexVal(const klex_SubObj this,
                                            picoos_uint16 index)
{
    const picoos_uint8 *entry;
    picoos_uint32 prefix;
    picoos_uint8 k;

    entry = this->searchind + ((picoos_uint32) index * PICOKLEX_LEX_SIE_SIZE);
    prefix = 0;
    for (k = 0; k < PICOKLEX_LEX_SIE_NRGRAPHS; k++) {
        prefix = (prefix << 8) | entry[k];
    }
    return prefix;
}
/* Determine first lexblock containing entries for specified
   grapheme.

   'graphsi' is the graph-prefix to look for and must be of length
   PICOKLEX_LEX_SIE_NRGRAPHS. The search index must not be empty
   (this->nrblocks > 0); the caller checks this before calling.

   Returns the number of the lexblock at which the lookup has to start.
   The result is always a valid block number: if the prefix sorts before
   the first search index entry, block 0 is returned. */
static picoos_uint16 klex_getLexblockNr(const klex_SubObj this,
                                        const picoos_uint8 *graphsi) {
    picoos_int32 low, mid, high;
    picoos_uint32 searchval, indval;

    /* convert graph-prefix to number with 'lexicographic' ordering */
    searchval = graphsi[0];
    searchval = (searchval << 8) + graphsi[1];
    searchval = (searchval << 8) + graphsi[2];

    low = 0;
    high = this->nrblocks;

    /* do binary search */
    while (low < high) {
        mid = (low + high) / 2;
        indval = klex_getSearchIndexVal(this, mid);
        if (indval < searchval) {
            low = mid + 1;
        } else {
            high = mid;
        }
    }
    PICODBG_ASSERT(high == low);

    /* low points to the first entry greater than or equal to searchval */
    if (low < this->nrblocks) {
        indval = klex_getSearchIndexVal(this, low);
        /* Step back to the preceding block only if there is one; with
           low == 0 the prefix sorts before the whole index and block 0
           is the only sensible place to (unsuccessfully) look. */
        if ((indval > searchval) && (low > 0)) {
            low--;
            /* if there are identical elements in the search index we have
               to move to the first one; re-check low on every step so
               that we never read in front of the search index */
            indval = klex_getSearchIndexVal(this, low);
            while ((low > 0) &&
                   (indval == klex_getSearchIndexVal(this, low - 1))) {
                low--;
            }
        }
    } else {
        /* all index entries are smaller: start in the last block */
        low = this->nrblocks - 1;
    }

#if defined(PICO_DEBUG)
    {
        picoos_uint32 pos = low * PICOKLEX_LEX_SIE_SIZE;
        PICODBG_DEBUG(("binary search result is %c%c%c (%d)",
                       this->searchind[pos], this->searchind[pos + 1],
                       this->searchind[pos + 2], low));
    }
#endif

    return (picoos_uint16) low;
}
/* Determine number of adjacent lexblocks containing entries for
   the same grapheme search prefix (identified by search index).

   'index' must be a valid block number (index < this->nrblocks) and
   should point to the first block with its prefix. Returns the number of
   consecutive blocks, starting at 'index', whose search index value
   equals the one of 'index'; the result is at least 1 and never reaches
   beyond the last block. */
static picoos_uint16 klex_getLexblockRange(const klex_SubObj this,
                                           picoos_uint16 index)
{
    picoos_uint16 count;
    picoos_uint32 sval1;

    sval1 = klex_getSearchIndexVal(this, index);

#if defined(PICO_DEBUG)
    /* 'index' must point to first lexblock of its kind */
    if (index > 0) {
        picoos_uint32 sval0 = klex_getSearchIndexVal(this, index - 1);
        PICODBG_ASSERT(sval1 != sval0);
    }
#endif

    count = 1;
    /* Only compare against entries that really exist: stop at the end of
       the search index instead of reading the bytes behind it. */
    while (((picoos_uint32) index + count < this->nrblocks) &&
           (sval1 == klex_getSearchIndexVal(this,
                                            (picoos_uint16) (index + count)))) {
        count++;
    }

    return count;
}
/* ************************************************************/
/* functions on single lexblock */
/* ************************************************************/
/* Compares the graph of a lexentry with the graph looked for.

   'lexentry' points to the LENGRAPH byte of an entry; the entry's graph
   has LENGRAPH-1 bytes and starts right after it. The comparison is
   byte-wise; if one graph is a prefix of the other, the shorter one
   sorts first.

   Returns -1 if the entry sorts before 'graph', 1 if it sorts after it
   and 0 if both are identical. */
static picoos_int8 klex_lexMatch(picoos_uint8 *lexentry,
                                 const picoos_uint8 *graph,
                                 const picoos_uint16 graphlen) {
    const picoos_uint8 entrylen = lexentry[0] - 1;
    const picoos_uint8 *entrygraph = lexentry + 1;
    picoos_uint8 k = 0;

    /* compare the common part */
    while ((k < graphlen) && (k < entrylen)) {
        PICODBG_TRACE(("%d|%d graph|lex: %c|%c", graphlen, entrylen,
                       graph[k], entrygraph[k]));
        if (entrygraph[k] != graph[k]) {
            return (entrygraph[k] < graph[k]) ? -1 : 1;
        }
        k++;
    }

    /* common part is equal, the length decides */
    if (entrylen == graphlen) {
        return 0;
    }
    return (entrylen < graphlen) ? -1 : 1;
}
/* Stores the lexentry at 'lexentry' (position 'lexpos' in the lexblocks)
   in the lookup result 'lexres'.

   - entry marked as :G2P (more than two POS/PHON bytes and first PHON
     byte equal to PICOKLEX_NEEDS_G2P): the result is reset to exactly one
     POS without index and phonfound is set to FALSE
   - any other entry: POS and the three index bytes of 'lexpos' (least
     significant byte first) are appended as next result and phonfound is
     set to TRUE

   No check is done whether 'lexres' has room for another result. */
static void klex_setLexResult(const picoos_uint8 *lexentry,
                              const picoos_uint32 lexpos,
                              picoklex_lexl_result_t *lexres) {
    /* LENPOSPHON byte, followed by POS and PHON */
    const picoos_uint8 *posphon = lexentry + lexentry[0];
    picoos_uint8 at;

    if ((posphon[0] > 2) && (PICOKLEX_NEEDS_G2P == posphon[2])) {
        /* :G2P entry, only the POS is of interest */
        lexres->posind[0] = posphon[1];
        lexres->posindlen = 1;
        lexres->nrres = 1;
        lexres->phonfound = FALSE;
        PICODBG_DEBUG(("result %d :G2P", lexres->nrres));
        return;
    }

    /* regular entry, append POS + IND behind the results found so far */
    at = lexres->nrres * (PICOKLEX_POSIND_SIZE);
    lexres->posind[at++] = posphon[1];
    /* PICOKLEX_IND_SIZE bytes of index */
    lexres->posind[at++] = lexpos & 0x000000ff;
    lexres->posind[at++] = (lexpos >> 8) & 0x000000ff;
    lexres->posind[at] = (lexpos >> 16) & 0x000000ff;

    lexres->posindlen += PICOKLEX_POSIND_SIZE;
    lexres->phonfound = TRUE;
    lexres->nrres++;
    PICODBG_DEBUG(("result %d", lexres->nrres));
}
/* Returns the position of the lexentry following the one at 'lexpos'.

   The graph part and the POS/PHON part of the current entry are skipped
   using their length bytes; zero bytes padding the end of a lexblock are
   skipped as well. A result >= lexposEnd means that there is no further
   entry. No byte at or behind 'lexposEnd' is read. */
static picoos_uint32 klex_nextLexentry(const klex_SubObj this,
                                       picoos_uint32 lexpos,
                                       const picoos_uint32 lexposEnd)
{
    /* skip LENGRAPH and GRAPH */
    lexpos += this->lexblocks[lexpos];
    if (lexpos < lexposEnd) {
        /* skip LENPOSPHON, POS and PHON */
        lexpos += this->lexblocks[lexpos];
    }
    /* if there are no more entries in this block, advance
       to next block by skipping all zeros; the range is tested first so
       that the byte behind the last block is never touched */
    while ((lexpos < lexposEnd) && (0 == this->lexblocks[lexpos])) {
        lexpos++;
    }
    return lexpos;
}

/* Searches the lexblocks in the byte range lexposStart..lexposEnd-1 for
   entries with graph 'graph' of length 'graphlen'.

   Entries are visited in order until one matches or one sorts after
   'graph'. On a match the entry is stored in 'lexres'; if it has a
   pronunciation, directly following entries with the same graph are
   added too, up to PICOKLEX_MAX_NRRES results. lexres->nrres is reset to
   0 at the start, so it is 0 afterwards if nothing was found. */
static void klex_lexblockLookup(klex_SubObj this,
                                const picoos_uint32 lexposStart,
                                const picoos_uint32 lexposEnd,
                                const picoos_uint8 *graph,
                                const picoos_uint16 graphlen,
                                picoklex_lexl_result_t *lexres) {
    picoos_uint32 lexpos;
    picoos_int8 rv;

    lexres->nrres = 0;

    lexpos = lexposStart;
    rv = -1;
    while ((rv < 0) && (lexpos < lexposEnd)) {

        rv = klex_lexMatch(&(this->lexblocks[lexpos]), graph, graphlen);

        if (rv == 0) { /* found */
            klex_setLexResult(&(this->lexblocks[lexpos]), lexpos, lexres);
            if (lexres->phonfound) {
                /* look for more results, up to MAX_NRRES, don't even
                   check if more results would be available */
                while ((lexres->nrres < PICOKLEX_MAX_NRRES) &&
                       (lexpos < lexposEnd)) {
                    lexpos = klex_nextLexentry(this, lexpos, lexposEnd);
                    if (lexpos < lexposEnd) {
                        if (klex_lexMatch(&(this->lexblocks[lexpos]), graph,
                                          graphlen) == 0) {
                            klex_setLexResult(&(this->lexblocks[lexpos]),
                                              lexpos, lexres);
                        } else {
                            /* no more results, quit loop */
                            lexpos = lexposEnd;
                        }
                    }
                }
            } else {
                /* :G2P mark, a single result is all we need */
            }
        } else if (rv < 0) {
            /* not found, goto next entry */
            lexpos = klex_nextLexentry(this, lexpos, lexposEnd);
        } else {
            /* rv > 0, not found, won't show up later in block */
        }
    }
}
/* ************************************************************/
/* lexicon lookup functions */
/* ************************************************************/
/**
 * Looks up a graph in the lexicon.
 *
 * The first PICOKLEX_LEX_SIE_NRGRAPHS bytes of the graph (padded with
 * zero bytes if the graph is shorter) are used to select the lexblocks to
 * be searched via the search index; these blocks are then scanned for
 * entries with exactly this graph.
 *
 * @param this     lexicon, may be NULL
 * @param graph    graph to look for
 * @param graphlen number of bytes in 'graph'
 * @param lexres   receives the result; reset at the start of the lookup
 * @return TRUE if at least one entry was found, FALSE otherwise (also if
 *         no lexicon is given or the lexicon has no blocks)
 */
picoos_uint8 picoklex_lexLookup(const picoklex_Lex this,
                                const picoos_uint8 *graph,
                                const picoos_uint16 graphlen,
                                picoklex_lexl_result_t *lexres) {
    picoos_uint16 lbnr, lbc;
    picoos_uint32 lexposStart, lexposEnd;
    picoos_uint8 i;
    picoos_uint8 tgraph[PICOKLEX_LEX_SIE_NRGRAPHS];
    klex_SubObj klex = (klex_SubObj) this;

    if (NULL == klex) {
        PICODBG_ERROR(("no lexicon loaded"));
        /* no exception here needed, already checked at initialization */
        return FALSE;
    }

    lexres->nrres = 0;
    lexres->posindlen = 0;
    lexres->phonfound = FALSE;

    /* build the (zero padded) graph-prefix for the search index */
    for (i = 0; i<PICOKLEX_LEX_SIE_NRGRAPHS; i++) {
        if (i < graphlen) {
            tgraph[i] = graph[i];
        } else {
            tgraph[i] = '\0';
        }
    }
    PICODBG_DEBUG(("tgraph: %c%c%c", tgraph[0],tgraph[1],tgraph[2]));

    if ((klex->nrblocks) == 0) {
        /* no searchindex, no lexblock */
        PICODBG_WARN(("no searchindex, no lexblock"));
        return FALSE;
    }

    lbnr = klex_getLexblockNr(klex, tgraph);
    PICODBG_ASSERT(lbnr < klex->nrblocks);
    /* The assertion is only active in debug builds; never search
       outside of the lexblocks in any build. */
    if (lbnr >= klex->nrblocks) {
        return FALSE;
    }

    lbc = klex_getLexblockRange(klex, lbnr);
    PICODBG_ASSERT((lbc >= 1) && (lbc <= klex->nrblocks));
    /* same here: limit the range to the blocks that really exist */
    if (lbc > (klex->nrblocks - lbnr)) {
        lbc = klex->nrblocks - lbnr;
    }
    PICODBG_DEBUG(("lexblock nr: %d (#%d)", lbnr, lbc));

    lexposStart = lbnr * PICOKLEX_LEXBLOCK_SIZE;
    lexposEnd = lexposStart + lbc * PICOKLEX_LEXBLOCK_SIZE;

    PICODBG_DEBUG(("lookup start, lexpos range %d..%d", lexposStart,lexposEnd));
    klex_lexblockLookup(klex, lexposStart, lexposEnd, graph, graphlen, lexres);
    PICODBG_DEBUG(("lookup done, %d found", lexres->nrres));

    return (lexres->nrres > 0);
}
/**
 * Fetches POS and pronunciation of the lexentry identified by an index.
 *
 * The index is the little endian byte position of the entry within the
 * lexblocks, as stored in a lookup result by picoklex_lexLookup.
 *
 * @param this    lexicon, may be NULL
 * @param ind     index bytes
 * @param indlen  number of bytes in 'ind', must be PICOKLEX_IND_SIZE
 * @param pos     receives the POS of the entry
 * @param phon    receives a pointer to the PHON bytes inside the lexicon
 *                (not copied)
 * @param phonlen receives the number of PHON bytes
 * @return TRUE if the entry was read; FALSE if no lexicon is given, the
 *         index has the wrong length or lies outside of the lexblocks,
 *         or the entry found there is malformed. The output parameters
 *         are not written if FALSE is returned.
 */
picoos_uint8 picoklex_lexIndLookup(const picoklex_Lex this,
                                   const picoos_uint8 *ind,
                                   const picoos_uint8 indlen,
                                   picoos_uint8 *pos,
                                   picoos_uint8 **phon,
                                   picoos_uint8 *phonlen) {
    picoos_uint32 pentry;
    picoos_uint32 lexsize;
    picoos_uint8 lenposphon;
    klex_SubObj klex = (klex_SubObj) this;

    /* same guard as in picoklex_lexLookup */
    if (NULL == klex) {
        PICODBG_ERROR(("no lexicon loaded"));
        return FALSE;
    }

    /* check indlen */
    if (indlen != PICOKLEX_IND_SIZE) {
        return FALSE;
    }

    /* PICOKLEX_IND_SIZE */
    pentry = 0x000000ff & (ind[0]);
    pentry |= ((picoos_uint32)(ind[1]) <<  8);
    pentry |= ((picoos_uint32)(ind[2]) << 16);

    /* check ind if it is within lexblocks byte stream, if not, return FALSE */
    lexsize = (picoos_uint32) klex->nrblocks * PICOKLEX_LEXBLOCK_SIZE;
    if (pentry >= lexsize) {
        return FALSE;
    }

    /* skip LENGRAPH and GRAPH; the length byte is data, so check again
       that we are still within the lexblocks */
    pentry += (klex->lexblocks[pentry]);
    if (pentry >= lexsize) {
        return FALSE;
    }

    /* LENPOSPHON counts itself and POS, so it can't be less than 2 in a
       well-formed entry; smaller values would make phonlen wrap around.
       The complete POS/PHON part must lie within the lexblocks too. */
    lenposphon = klex->lexblocks[pentry];
    if ((lenposphon < 2) || ((pentry + lenposphon) > lexsize)) {
        return FALSE;
    }

    *phonlen = lenposphon - 2;
    pentry++;
    *pos = klex->lexblocks[pentry++];
    *phon = &(klex->lexblocks[pentry]);

    PICODBG_DEBUG(("pentry: %d, phonlen: %d", pentry, *phonlen));
    return TRUE;
}
#ifdef __cplusplus
}
#endif
/* end */