blob: ca8b4708cba2f43acf423de1bd5859de346901a0 [file] [log] [blame]
/*
* Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* @file picoktab.c
*
* symbol tables needed at runtime
*
* Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
* All rights reserved.
*
* History:
* - 2009-04-20 -- initial version
*
*/
#include "picoos.h"
#include "picodbg.h"
#include "picoknow.h"
#include "picobase.h"
#include "picoktab.h"
#include "picodata.h"
#ifdef __cplusplus
extern "C" {
#endif
#if 0
}
#endif
/** @todo : the following would be better part of a knowledge base.
* Make sure it is consistent with the phoneme symbol table used in the lingware */
/* PLANE_PHONEMES */
/* PLANE_POS */
/* PLANE_PB_STRENGTHS */
/* PLANE_ACCENTS */
/* PLANE_INTERN */
/* hard-coded id that ktabIdsInitialize stores as the fixed ids' phonStartId */
#define PICOKTAB_TMPID_PHONSTART '\x26' /* 38 '&' */
/* hard-coded id that ktabIdsInitialize stores as the fixed ids' phonTermId */
#define PICOKTAB_TMPID_PHONTERM '\x23' /* 35 '#' */
/* ************************************************************/
/* fixed ids */
/* ************************************************************/
/**
 * Fills the fixed-ids sub-object of a knowledge base with the hard-coded
 * phoneme start and terminator ids.
 * @param this    knowledge base whose subObj has already been allocated
 * @param common  supplies the exception manager used for error reporting
 * @return PICO_OK on success; otherwise the status produced by raising
 *         PICO_EXC_KB_MISSING (kb or its sub-object is NULL)
 */
static pico_status_t ktabIdsInitialize(register picoknow_KnowledgeBase this,
                                       picoos_Common common)
{
    picoktab_FixedIds fixedIds;

    PICODBG_DEBUG(("start"));

    if ((NULL != this) && (NULL != this->subObj)) {
        fixedIds = (picoktab_FixedIds) this->subObj;
        fixedIds->phonStartId = PICOKTAB_TMPID_PHONSTART;
        fixedIds->phonTermId = PICOKTAB_TMPID_PHONTERM;
        return PICO_OK;
    }

    return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
                                   NULL, NULL);
}
/**
 * Releases the fixed-ids sub-object of a knowledge base.
 * @param this  knowledge base; a NULL kb is silently accepted
 * @param mm    memory manager handed to picoos_deallocate
 * @return always PICO_OK
 */
static pico_status_t ktabIdsSubObjDeallocate(register picoknow_KnowledgeBase this,
                                             picoos_MemoryManager mm)
{
    if (NULL == this) {
        return PICO_OK;
    }
    /* picoos_deallocate receives the address of the pointer, not the pointer */
    picoos_deallocate(mm, (void *) &this->subObj);
    return PICO_OK;
}
/**
 * Turns a generic knowledge base into a fixed-ids knowledge base: installs
 * the sub-object deallocator, allocates the sub-object and initializes it.
 * @param this    knowledge base to specialize
 * @param common  supplies memory manager and exception manager
 * @return PICO_OK on success; otherwise the status produced by raising
 *         PICO_EXC_KB_MISSING (this is NULL) or PICO_EXC_OUT_OF_MEM
 */
pico_status_t picoktab_specializeIdsKnowledgeBase(picoknow_KnowledgeBase this,
                                                  picoos_Common common)
{
    pico_status_t status;

    if (NULL == this) {
        status = picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
                                         NULL, NULL);
    } else {
        this->subDeallocate = ktabIdsSubObjDeallocate;
        this->subObj = picoos_allocate(common->mm, sizeof(picoktab_fixed_ids_t));
        if (NULL == this->subObj) {
            status = picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM,
                                             NULL, NULL);
        } else {
            status = ktabIdsInitialize(this, common);
        }
    }
    return status;
}
/**
 * Returns the fixed-ids sub-object of a knowledge base.
 * @param this  knowledge base, may be NULL
 * @return the kb's subObj viewed as picoktab_FixedIds, or NULL if this is NULL
 */
picoktab_FixedIds picoktab_getFixedIds(picoknow_KnowledgeBase this)
{
    if (NULL == this) {
        return NULL;
    }
    return (picoktab_FixedIds) this->subObj;
}
/**
 * Allocates a fixed-ids object.
 * @param mm  memory manager used for the allocation
 * @return the result of picoos_allocate (NULL is passed through unchanged);
 *         the fields of the new object are not set here
 */
picoktab_FixedIds picoktab_newFixedIds(picoos_MemoryManager mm)
{
    picoktab_FixedIds this;

    this = (picoktab_FixedIds) picoos_allocate(mm, sizeof(*this));
    /* there is currently nothing to initialize in a freshly created object */
    return this;
}
/**
 * Releases a fixed-ids object created with picoktab_newFixedIds.
 * @param mm    memory manager handed to picoos_deallocate
 * @param this  address of the object handle; both a NULL address and a NULL
 *              handle are accepted and leave everything untouched
 */
void picoktab_disposeFixedIds(picoos_MemoryManager mm, picoktab_FixedIds * this)
{
    /* guard the handle address as well, it is dereferenced right below */
    if ((NULL != this) && (NULL != (*this))) {
        /* terminate */
        picoos_deallocate(mm, (void *) this);
    }
}
/* ************************************************************/
/* Graphs */
/* ************************************************************/
/* overview binary file format for graphs kb:
graphs-kb = NROFSENTRIES SIZEOFSENTRY ofstable graphs
NROFSENTRIES : 2 bytes, number of entries in offset table
SIZEOFSENTRY : 1 byte, size of one entry in offset table
ofstable = {OFFSET}=NROFSENTRIES (contains NROFSENTRIES entries of OFFSET)
OFFSET: SIZEOFSENTRY bytes, offset to baseaddress of graphs-kb to entry in graphs
graphs = {graph}=NROFSENTRIES (contains NROFSENTRIES entries of graph)
graph = PROPSET FROM [TO] [TOKENTYPE] [TOKENSUBTYPE] [VALUE] [LOWERCASE] [GRAPHSUBS1] [GRAPHSUBS2] [PUNC]
FROM : 1..4 unsigned bytes, UTF8 character without terminating 0
TO : 1..4 unsigned bytes, UTF8 character without terminating 0
PROPSET : 1 unsigned byte, least significant bit : has TO field
next bit : has TOKENTYPE
next bit : has TOKENSUBTYPE
next bit : has VALUE
next bit : has LOWERCASE
next bit : has GRAPHSUBS1
next bit : has GRAPHSUBS2
next bit : has PUNC
TOKENTYPE : 1 unsigned byte
TOKENSUBTYPE : 1 unsigned byte
VALUE : 1 unsigned byte
LOWERCASE : 1..4 unsigned bytes, UTF8 character without terminating 0
GRAPHSUBS1 : 1..4 unsigned bytes, UTF8 character without terminating 0
GRAPHSUBS2 : 1..4 unsigned bytes, UTF8 character without terminating 0
PUNC : 1 unsigned byte
*/
static picoos_uint32 ktab_propOffset (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uint32 prop);
/* byte positions of the header fields, counted from the kb base address */
#define KTAB_START_GRAPHS_NR_OFFSET 0
#define KTAB_START_GRAPHS_SIZE_OFFSET 2
#define KTAB_START_GRAPHS_OFFSET_TABLE 3
/* the graph table is addressed from the kb base (index 0), so the values
   read from the offset table are used directly as indices into it */
#define KTAB_START_GRAPHS_GRAPH_TABLE 0
/* bitmasks to extract the grapheme properties info from the property set */
/* note: a hex escape consumes all following hex digits, so the three-digit
   forms below ('\x010' ... '\x080') denote 0x10, 0x20, 0x40 and 0x80 */
#define KTAB_GRAPH_PROPSET_TO ((picoos_uint8)'\x01')
#define KTAB_GRAPH_PROPSET_TOKENTYPE ((picoos_uint8)'\x02')
#define KTAB_GRAPH_PROPSET_TOKENSUBTYPE ((picoos_uint8)'\x04')
#define KTAB_GRAPH_PROPSET_VALUE ((picoos_uint8)'\x08')
#define KTAB_GRAPH_PROPSET_LOWERCASE ((picoos_uint8)'\x010')
#define KTAB_GRAPH_PROPSET_GRAPHSUBS1 ((picoos_uint8)'\x020')
#define KTAB_GRAPH_PROPSET_GRAPHSUBS2 ((picoos_uint8)'\x040')
#define KTAB_GRAPH_PROPSET_PUNCT ((picoos_uint8)'\x080')
/* run-time view of a graphs kb; the fields are set by ktabGraphsInitialize */
typedef struct ktabgraphs_subobj *ktabgraphs_SubObj;
typedef struct ktabgraphs_subobj {
picoos_uint16 nrOffset; /* number of entries in the offset table */
picoos_uint16 sizeOffset; /* size in bytes of one offset table entry */
picoos_uint8 * offsetTable; /* first entry of the offset table inside the kb */
picoos_uint8 * graphTable; /* kb base address; graph offsets index from here */
} ktabgraphs_subobj_t;
/**
 * Decodes the header of a graphs kb into its sub-object: entry count
 * (2 bytes, low byte first), offset entry size (1 byte) and the start
 * addresses of offset table and graph table.
 * @param this    knowledge base whose subObj has already been allocated
 * @param common  supplies the exception manager used for error reporting
 * @return PICO_OK on success; otherwise the status produced by raising
 *         PICO_EXC_KB_MISSING (kb or its sub-object is NULL)
 */
static pico_status_t ktabGraphsInitialize(register picoknow_KnowledgeBase this,
                                          picoos_Common common) {
    ktabgraphs_subobj_t * graphs;
    int countLow;
    int countHigh;

    PICODBG_DEBUG(("start"));

    if ((NULL == this) || (NULL == this->subObj)) {
        return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
                                       NULL, NULL);
    }

    graphs = (ktabgraphs_subobj_t *) this->subObj;

    countLow = (int)(this->base[KTAB_START_GRAPHS_NR_OFFSET]);
    countHigh = (int)(this->base[KTAB_START_GRAPHS_NR_OFFSET+1]);
    graphs->nrOffset = countLow + 256*countHigh;

    graphs->sizeOffset = (int)(this->base[KTAB_START_GRAPHS_SIZE_OFFSET]);
    graphs->offsetTable = &(this->base[KTAB_START_GRAPHS_OFFSET_TABLE]);
    graphs->graphTable = &(this->base[KTAB_START_GRAPHS_GRAPH_TABLE]);

    return PICO_OK;
}
/**
 * Releases the graphs sub-object of a knowledge base.
 * @param this  knowledge base; a NULL kb is silently accepted
 * @param mm    memory manager handed to picoos_deallocate
 * @return always PICO_OK
 */
static pico_status_t ktabGraphsSubObjDeallocate(register picoknow_KnowledgeBase this,
                                                picoos_MemoryManager mm) {
    if (NULL == this) {
        return PICO_OK;
    }
    /* picoos_deallocate receives the address of the pointer, not the pointer */
    picoos_deallocate(mm, (void *) &this->subObj);
    return PICO_OK;
}
/**
 * Turns a generic knowledge base into a graphs knowledge base: installs the
 * sub-object deallocator, allocates the sub-object and initializes it from
 * the kb header.
 * @param this    knowledge base to specialize
 * @param common  supplies memory manager and exception manager
 * @return PICO_OK on success; otherwise the status produced by raising
 *         PICO_EXC_KB_MISSING (this is NULL) or PICO_EXC_OUT_OF_MEM
 */
pico_status_t picoktab_specializeGraphsKnowledgeBase(picoknow_KnowledgeBase this,
                                                     picoos_Common common) {
    pico_status_t status;

    if (NULL == this) {
        status = picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
                                         NULL, NULL);
    } else {
        this->subDeallocate = ktabGraphsSubObjDeallocate;
        this->subObj = picoos_allocate(common->mm, sizeof(ktabgraphs_subobj_t));
        if (NULL == this->subObj) {
            status = picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM,
                                             NULL, NULL);
        } else {
            status = ktabGraphsInitialize(this, common);
        }
    }
    return status;
}
/**
 * Returns the graphs sub-object of a knowledge base.
 * @param this  knowledge base, may be NULL
 * @return the kb's subObj viewed as picoktab_Graphs, or NULL if this is NULL
 */
picoktab_Graphs picoktab_getGraphs(picoknow_KnowledgeBase this) {
    return (NULL == this) ? NULL : (picoktab_Graphs) this->subObj;
}
/* Graphs methods */
/**
 * Tells whether a grapheme is listed in the graphs table with token type
 * PICODATA_ITEMINFO1_TOKTYPE_LETTERV.
 * @param this         graphs table
 * @param graph        grapheme to look up (passed on to picoktab_graphOffset)
 * @param graphlenmax  not used
 * @return non-zero if the grapheme is found, carries a TOKENTYPE property and
 *         that property equals PICODATA_ITEMINFO1_TOKTYPE_LETTERV; 0 otherwise
 */
picoos_uint8 picoktab_hasVowellikeProp(const picoktab_Graphs this,
                                       const picoos_uint8 *graph,
                                       const picoos_uint8 graphlenmax) {
    ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this;
    picoos_uint32 graphsOffset;
    picoos_uint32 propOffset;

    (void) graphlenmax; /* parameter intentionally unused */

    graphsOffset = picoktab_graphOffset (this, (picoos_uchar *)graph);
    if (0 == graphsOffset) {
        /* grapheme not in table; offset 0 addresses the kb header, not a graph */
        return 0;
    }

    propOffset = ktab_propOffset (this, graphsOffset, KTAB_GRAPH_PROPSET_TOKENTYPE);
    if (0 == propOffset) {
        /* no TOKENTYPE field; offset 0 would address the PROPSET byte instead */
        return 0;
    }

    return g->graphTable[graphsOffset + propOffset] == PICODATA_ITEMINFO1_TOKTYPE_LETTERV;
}
/**
 * Copies one UTF-8 character property out of a graph entry.
 * The length is derived from the first byte via picobase_det_utf8_length.
 * @param this          graphs table
 * @param graphsOffset  offset of the graph entry inside the graph table
 * @param propOffset    offset of the property inside the graph entry
 * @param str           receives the character followed by a terminating 0;
 *                      must have room for the character plus terminator
 */
static void ktab_getStrProp (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uint32 propOffset, picoos_uchar * str)
{
    ktabgraphs_subobj_t * table = (ktabgraphs_SubObj)this;
    const picoos_uint8 * src = &(table->graphTable[graphsOffset+propOffset]);
    picoos_uint32 len = picobase_det_utf8_length(src[0]);
    picoos_uint32 k;

    for (k = 0; k < len; k++) {
        str[k] = src[k];
    }
    str[len] = 0;
}
/**
 * Returns the offset of property 'prop' inside the graph entry located at
 * 'graphsOffset' in the graph table.
 * The entry is walked field by field in storage order (PROPSET, FROM, TO,
 * TOKENTYPE, TOKENSUBTYPE, VALUE, LOWERCASE, GRAPHSUBS1, GRAPHSUBS2, PUNCT);
 * every field that is present and precedes 'prop' is skipped.
 * @param this          graphs table
 * @param graphsOffset  offset of the graph entry inside the graph table
 * @param prop          one of the KTAB_GRAPH_PROPSET_* bitmasks
 * @return a value > 0 if the property is present in the entry, otherwise 0
 */
static picoos_uint32 ktab_propOffset(const picoktab_Graphs this,
        picoos_uint32 graphsOffset, picoos_uint32 prop)
{
    ktabgraphs_subobj_t * g = (ktabgraphs_SubObj) this;
    const picoos_uint8 * entry = &(g->graphTable[graphsOffset]);
    const picoos_uint8 propset = entry[0];
    picoos_uint32 n;

    if ((propset & prop) != prop) {
        return 0; /* property not present in this entry */
    }

    n = 1;                                          /* skip PROPSET field */
    n = n + picobase_det_utf8_length(entry[n]);     /* skip FROM field */
    if (prop <= KTAB_GRAPH_PROPSET_TO) {
        return n;
    }

    if ((propset & KTAB_GRAPH_PROPSET_TO) != 0) {
        n = n + picobase_det_utf8_length(entry[n]); /* skip TO field */
    }
    if (prop <= KTAB_GRAPH_PROPSET_TOKENTYPE) {
        return n;
    }

    if ((propset & KTAB_GRAPH_PROPSET_TOKENTYPE) != 0) {
        n = n + 1;                                  /* skip TOKENTYPE field */
    }
    if (prop <= KTAB_GRAPH_PROPSET_TOKENSUBTYPE) {
        return n;
    }

    if ((propset & KTAB_GRAPH_PROPSET_TOKENSUBTYPE) != 0) {
        n = n + 1;                                  /* skip TOKENSUBTYPE field */
    }
    if (prop <= KTAB_GRAPH_PROPSET_VALUE) {
        return n;
    }

    if ((propset & KTAB_GRAPH_PROPSET_VALUE) != 0) {
        n = n + 1;                                  /* skip VALUE field */
    }
    if (prop <= KTAB_GRAPH_PROPSET_LOWERCASE) {
        return n;
    }

    if ((propset & KTAB_GRAPH_PROPSET_LOWERCASE) != 0) {
        n = n + picobase_det_utf8_length(entry[n]); /* skip LOWERCASE field */
    }
    if (prop <= KTAB_GRAPH_PROPSET_GRAPHSUBS1) {
        return n;
    }

    if ((propset & KTAB_GRAPH_PROPSET_GRAPHSUBS1) != 0) {
        n = n + picobase_det_utf8_length(entry[n]); /* skip GRAPHSUBS1 field */
    }
    if (prop <= KTAB_GRAPH_PROPSET_GRAPHSUBS2) {
        return n;
    }

    if ((propset & KTAB_GRAPH_PROPSET_GRAPHSUBS2) != 0) {
        n = n + picobase_det_utf8_length(entry[n]); /* skip GRAPHSUBS2 field */
    }
    if (prop <= KTAB_GRAPH_PROPSET_PUNCT) {
        return n;
    }

    /* only reached for a 'prop' value above the PUNCT mask */
    if ((propset & KTAB_GRAPH_PROPSET_PUNCT) != 0) {
        n = n + 1;                                  /* skip PUNCT field */
    }
    return n;
}
/**
 * Looks up a grapheme in the graphs table by binary search over the offset
 * table. An entry matches when the grapheme lies in its FROM..TO range
 * (TO defaults to FROM when the entry has no TO field), compared with
 * picoos_strcmp.
 * @param this       graphs table
 * @param utf8graph  0-terminated UTF-8 grapheme to search for
 * @return offset of the matching graph entry inside the graph table,
 *         or 0 if no entry matches
 */
picoos_uint32 picoktab_graphOffset (const picoktab_Graphs this, picoos_uchar * utf8graph)
{
    ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this;
    picoos_int32 lo, hi, mid;
    picoos_uint32 entryOffset;
    picoos_uint32 toOffset;
    picobase_utf8char from;
    picobase_utf8char to;

    lo = 0;
    hi = g->nrOffset-1;     /* -1 for an empty table, so the loop is skipped */
    while (lo <= hi) {
        mid = (lo+hi) / 2;

        /* offset of graph[mid]: one byte, or two bytes with low byte first */
        entryOffset = g->offsetTable[g->sizeOffset*mid];
        if (g->sizeOffset != 1) {
            entryOffset += 256*g->offsetTable[g->sizeOffset*mid + 1];
        }

        /* FROM directly follows the PROPSET byte; TO is optional */
        ktab_getStrProp(this, entryOffset, 1, from);
        toOffset = ktab_propOffset(this, entryOffset, KTAB_GRAPH_PROPSET_TO);
        if (toOffset > 0) {
            ktab_getStrProp(this, entryOffset, toOffset, to);
        } else {
            picoos_strcpy((picoos_char *)to, (picoos_char *)from);
        }

        if (picoos_strcmp((picoos_char *)utf8graph, (picoos_char *)from) < 0) {
            hi = mid-1;     /* below the range: continue in the lower half */
        } else if (picoos_strcmp((picoos_char *)utf8graph, (picoos_char *)to) > 0) {
            lo = mid+1;     /* above the range: continue in the upper half */
        } else {
            return entryOffset;
        }
    }
    PICODBG_DEBUG(("picoktab_graphOffset: utf char '%s' not found", utf8graph));
    return 0;
}
/**
 * Reads the TOKENTYPE property of a graph entry.
 * @param this          graphs table
 * @param graphsOffset  offset of the graph entry inside the graph table
 * @param stokenType    receives the property byte; untouched if absent
 * @return TRUE if the entry has a TOKENTYPE property, FALSE otherwise
 */
picoos_bool picoktab_getIntPropTokenType (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uint8 * stokenType)
{
    ktabgraphs_subobj_t * table = (ktabgraphs_SubObj)this;
    picoos_uint32 fieldOffset;

    fieldOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_TOKENTYPE);
    if (0 == fieldOffset) {
        return FALSE;
    }
    *stokenType = (picoos_uint8)(table->graphTable[graphsOffset+fieldOffset]);
    return TRUE;
}
/**
 * Reads the TOKENSUBTYPE property of a graph entry.
 * @param this           graphs table
 * @param graphsOffset   offset of the graph entry inside the graph table
 * @param stokenSubType  receives the property byte converted to picoos_int8;
 *                       untouched if the property is absent
 * @return TRUE if the entry has a TOKENSUBTYPE property, FALSE otherwise
 */
picoos_bool picoktab_getIntPropTokenSubType (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_int8 * stokenSubType)
{
    ktabgraphs_subobj_t * table = (ktabgraphs_SubObj)this;
    picoos_uint32 fieldOffset;

    fieldOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_TOKENSUBTYPE);
    if (0 == fieldOffset) {
        return FALSE;
    }
    *stokenSubType = (picoos_int8)(table->graphTable[graphsOffset+fieldOffset]);
    return TRUE;
}
/**
 * Reads the VALUE property of a graph entry.
 * @param this          graphs table
 * @param graphsOffset  offset of the graph entry inside the graph table
 * @param value         receives the property byte widened to picoos_uint32;
 *                      untouched if the property is absent
 * @return TRUE if the entry has a VALUE property, FALSE otherwise
 */
picoos_bool picoktab_getIntPropValue (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uint32 * value)
{
    ktabgraphs_subobj_t * table = (ktabgraphs_SubObj)this;
    picoos_uint32 fieldOffset;

    fieldOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_VALUE);
    if (0 == fieldOffset) {
        return FALSE;
    }
    *value = (picoos_uint32)(table->graphTable[graphsOffset+fieldOffset]);
    return TRUE;
}
/**
 * Derives punctuation item info from a graph entry carrying a PUNCT property.
 * @param this          graphs table
 * @param graphsOffset  offset of the graph entry inside the graph table
 * @param info1         receives PICODATA_ITEMINFO1_PUNC_SENTEND if the PUNCT
 *                      byte is 2, else PICODATA_ITEMINFO1_PUNC_PHRASEEND
 * @param info2         receives the sentence type chosen from the first byte
 *                      of the FROM field ('.', '?', '!'), else
 *                      PICODATA_ITEMINFO2_PUNC_PHRASE
 * @return TRUE if the entry has a PUNCT property (outputs set), FALSE
 *         otherwise (outputs untouched)
 */
picoos_bool picoktab_getIntPropPunct (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uint8 * info1, picoos_uint8 * info2)
{
    ktabgraphs_subobj_t * table = (ktabgraphs_SubObj)this;
    picoos_uint32 fieldOffset;

    fieldOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_PUNCT);
    if (0 == fieldOffset) {
        return FALSE;
    }

    *info1 = (table->graphTable[graphsOffset+fieldOffset] == 2)
                 ? PICODATA_ITEMINFO1_PUNC_SENTEND
                 : PICODATA_ITEMINFO1_PUNC_PHRASEEND;

    /* the byte right after PROPSET is the first byte of the FROM character */
    switch (table->graphTable[graphsOffset+1]) {
        case '.':
            *info2 = PICODATA_ITEMINFO2_PUNC_SENT_T;
            break;
        case '?':
            *info2 = PICODATA_ITEMINFO2_PUNC_SENT_Q;
            break;
        case '!':
            *info2 = PICODATA_ITEMINFO2_PUNC_SENT_E;
            break;
        default:
            *info2 = PICODATA_ITEMINFO2_PUNC_PHRASE;
            break;
    }
    return TRUE;
}
/**
 * Reads the LOWERCASE property (one UTF-8 character) of a graph entry.
 * @param this          graphs table
 * @param graphsOffset  offset of the graph entry inside the graph table
 * @param lowercase     receives the 0-terminated character; untouched if
 *                      the property is absent
 * @return TRUE if the entry has a LOWERCASE property, FALSE otherwise
 */
picoos_bool picoktab_getStrPropLowercase (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uchar * lowercase)
{
    picoos_uint32 fieldOffset;

    fieldOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_LOWERCASE);
    if (0 == fieldOffset) {
        return FALSE;
    }
    ktab_getStrProp(this, graphsOffset, fieldOffset, lowercase);
    return TRUE;
}
/**
 * Reads the GRAPHSUBS1 property (one UTF-8 character) of a graph entry.
 * @param this          graphs table
 * @param graphsOffset  offset of the graph entry inside the graph table
 * @param graphsubs1    receives the 0-terminated character; untouched if
 *                      the property is absent
 * @return TRUE if the entry has a GRAPHSUBS1 property, FALSE otherwise
 */
picoos_bool picoktab_getStrPropGraphsubs1 (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uchar * graphsubs1)
{
    picoos_uint32 fieldOffset;

    fieldOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_GRAPHSUBS1);
    if (0 == fieldOffset) {
        return FALSE;
    }
    ktab_getStrProp(this, graphsOffset, fieldOffset, graphsubs1);
    return TRUE;
}
/**
 * Reads the GRAPHSUBS2 property (one UTF-8 character) of a graph entry.
 * @param this          graphs table
 * @param graphsOffset  offset of the graph entry inside the graph table
 * @param graphsubs2    receives the 0-terminated character; untouched if
 *                      the property is absent
 * @return TRUE if the entry has a GRAPHSUBS2 property, FALSE otherwise
 */
picoos_bool picoktab_getStrPropGraphsubs2 (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uchar * graphsubs2)
{
    picoos_uint32 fieldOffset;

    fieldOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_GRAPHSUBS2);
    if (0 == fieldOffset) {
        return FALSE;
    }
    ktab_getStrProp(this, graphsOffset, fieldOffset, graphsubs2);
    return TRUE;
}
/* *****************************************************************/
/* used for tools */
/**
 * Copies one UTF-8 character from a read cursor and advances the cursor
 * past it. The length is derived from the first byte via
 * picobase_det_utf8_length.
 * @param pos  address of the read cursor; advanced by the bytes copied
 * @param to   receives the character followed by a terminating 0
 */
static void ktab_getUtf8 (picoos_uchar ** pos, picoos_uchar * to)
{
    picoos_uint32 remaining;

    for (remaining = picobase_det_utf8_length(**pos); remaining > 0; remaining--) {
        *to = **pos;
        to++;
        (*pos)++;
    }
    *to = 0;
}
picoos_uint16 picoktab_graphsGetNumEntries(const picoktab_Graphs this)
{
ktabgraphs_subobj_t * g = (ktabgraphs_SubObj) this;
return g->nrOffset;
}
/**
 * Decodes all fields of the graph entry with index 'graphIndex' (tool use).
 * The entry is read sequentially in storage order; 'propset' decides which
 * optional fields are present.
 * @param this           graphs table
 * @param graphIndex     index into the offset table (not range-checked here)
 * @param from           receives FROM as 0-terminated UTF-8 character
 * @param to             receives TO, or a copy of FROM if TO is absent
 * @param propset        receives the PROPSET byte
 * @param stokenType     receives TOKENTYPE, or -1 (converted to
 *                       picoos_uint8) if absent
 * @param stokenSubType  receives TOKENSUBTYPE, or -1 (converted) if absent
 * @param value          receives VALUE, or -1 (converted) if absent
 * @param lowercase      receives LOWERCASE, or an empty string if absent
 * @param graphsubs1     receives GRAPHSUBS1, or an empty string if absent
 * @param graphsubs2     receives GRAPHSUBS2, or an empty string if absent
 * @param punct          receives PUNCT, or -1 (converted) if absent
 */
void picoktab_graphsGetGraphInfo(const picoktab_Graphs this,
picoos_uint16 graphIndex, picoos_uchar * from, picoos_uchar * to,
picoos_uint8 * propset,
picoos_uint8 * stokenType, picoos_uint8 * stokenSubType,
picoos_uint8 * value, picoos_uchar * lowercase,
picoos_uchar * graphsubs1, picoos_uchar * graphsubs2,
picoos_uint8 * punct) {
ktabgraphs_subobj_t * g = (ktabgraphs_SubObj) this;
picoos_uint32 graphsOffset;
picoos_uint8 * pos;
/* calculate offset of graph[graphIndex] */
/* offset entries are one byte wide, or two bytes with the low byte first */
if (g->sizeOffset == 1) {
graphsOffset = g->offsetTable[graphIndex];
} else {
graphsOffset = g->offsetTable[2 * graphIndex]
+ (g->offsetTable[2 * graphIndex + 1] << 8);
}
/* pos is the read cursor; each present field advances it */
pos = &(g->graphTable[graphsOffset]);
*propset = *pos;
pos++; /* advance to FROM */
ktab_getUtf8(&pos, from); /* get FROM and advance */
if ((*propset) & KTAB_GRAPH_PROPSET_TO) {
ktab_getUtf8(&pos, to); /* get TO and advance */
} else {
/* single-character entry: the range end equals its start */
picoos_strcpy((picoos_char *)to, (picoos_char *)from);
}
if ((*propset) & KTAB_GRAPH_PROPSET_TOKENTYPE) {
(*stokenType) = *(pos++); /* get TOKENTYPE and advance */
} else {
(*stokenType) = -1;
}
if ((*propset) & KTAB_GRAPH_PROPSET_TOKENSUBTYPE) {
(*stokenSubType) = *(pos++); /* get TOKENSUBTYPE and advance */
} else {
(*stokenSubType) = -1;
}
if ((*propset) & KTAB_GRAPH_PROPSET_VALUE) {
(*value) = *(pos++); /* get VALUE and advance */
} else {
(*value) = -1;
}
if ((*propset) & KTAB_GRAPH_PROPSET_LOWERCASE) {
ktab_getUtf8(&pos, lowercase); /* get LOWERCASE and advance */
} else {
lowercase[0] = NULLC;
}
if ((*propset) & KTAB_GRAPH_PROPSET_GRAPHSUBS1) {
ktab_getUtf8(&pos, graphsubs1); /* get GRAPHSUBS1 and advance */
} else {
graphsubs1[0] = NULLC;
}
if ((*propset) & KTAB_GRAPH_PROPSET_GRAPHSUBS2) {
ktab_getUtf8(&pos, graphsubs2); /* get GRAPHSUBS2 and advance */
} else {
graphsubs2[0] = NULLC;
}
if ((*propset) & KTAB_GRAPH_PROPSET_PUNCT) {
(*punct) = *(pos++); /* get PUNCT and advance */
} else {
(*punct) = -1;
}
}
/* ************************************************************/
/* Phones */
/* ************************************************************/
/* overview binary file format for phones kb:
phones-kb = specids propertytable
specids = PRIMSTRESSID1 SECSTRESSID1 SYLLBOUNDID1 PAUSEID1 WORDBOUNDID1
RESERVE1 RESERVE1 RESERVE1
propertytable = {PHONEPROP2}=256
PRIMSTRESSID1: one byte, ID of primary stress
SECSTRESSID1: one byte, ID of secondary stress
SYLLBOUNDID1: one byte, ID of syllable boundary
PAUSEID1: one byte, ID of pause
RESERVE1: reserved for future use
PHONEPROP2: one byte, max. of 256 phones directly access this table
to check a property for a phone; binary properties
encoded (1 bit per prop)
least significant bit: vowel
next bit: diphth
next bit: glott
next bit: nonsyllvowel
next bit: syllcons
3 bits spare
*/
/* byte position of the special-id block, counted from the kb base address */
#define KTAB_START_SPECIDS 0
/* indices of the individual ids inside the special-id block */
#define KTAB_IND_PRIMSTRESS 0
#define KTAB_IND_SECSTRESS 1
#define KTAB_IND_SYLLBOUND 2
#define KTAB_IND_PAUSE 3
#define KTAB_IND_WORDBOUND 4
/* byte position of the property table, counted from the kb base address */
#define KTAB_START_PROPS 8
/* run-time view of a phones kb; the fields are set by ktabPhonesInitialize */
typedef struct ktabphones_subobj *ktabphones_SubObj;
typedef struct ktabphones_subobj {
picoos_uint8 *specids; /* special-id block, indexed with KTAB_IND_* */
picoos_uint8 *props; /* property table, indexed with the phone id */
} ktabphones_subobj_t;
/* bitmasks to extract the property info from props */
#define KTAB_PPROP_VOWEL '\x01'
#define KTAB_PPROP_DIPHTH '\x02'
#define KTAB_PPROP_GLOTT '\x04'
#define KTAB_PPROP_NONSYLLVOWEL '\x08'
#define KTAB_PPROP_SYLLCONS '\x10'
/**
 * Points the phones sub-object at the special-id block and the property
 * table inside the kb data.
 * @param this    knowledge base whose subObj has already been allocated
 * @param common  supplies the exception manager used for error reporting
 * @return PICO_OK on success; otherwise the status produced by raising
 *         PICO_EXC_KB_MISSING (kb or its sub-object is NULL)
 */
static pico_status_t ktabPhonesInitialize(register picoknow_KnowledgeBase this,
                                          picoos_Common common) {
    ktabphones_subobj_t * phones;

    PICODBG_DEBUG(("start"));

    if ((NULL != this) && (NULL != this->subObj)) {
        phones = (ktabphones_subobj_t *) this->subObj;
        phones->specids = &(this->base[KTAB_START_SPECIDS]);
        phones->props = &(this->base[KTAB_START_PROPS]);
        return PICO_OK;
    }

    return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
                                   NULL, NULL);
}
/**
 * Releases the phones sub-object of a knowledge base.
 * @param this  knowledge base; a NULL kb is silently accepted
 * @param mm    memory manager handed to picoos_deallocate
 * @return always PICO_OK
 */
static pico_status_t ktabPhonesSubObjDeallocate(register picoknow_KnowledgeBase this,
                                                picoos_MemoryManager mm) {
    if (NULL == this) {
        return PICO_OK;
    }
    /* picoos_deallocate receives the address of the pointer, not the pointer */
    picoos_deallocate(mm, (void *) &this->subObj);
    return PICO_OK;
}
/**
 * Turns a generic knowledge base into a phones knowledge base: installs the
 * sub-object deallocator, allocates the sub-object and initializes it.
 * @param this    knowledge base to specialize
 * @param common  supplies memory manager and exception manager
 * @return PICO_OK on success; otherwise the status produced by raising
 *         PICO_EXC_KB_MISSING (this is NULL) or PICO_EXC_OUT_OF_MEM
 */
pico_status_t picoktab_specializePhonesKnowledgeBase(picoknow_KnowledgeBase this,
                                                     picoos_Common common) {
    pico_status_t status;

    if (NULL == this) {
        status = picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
                                         NULL, NULL);
    } else {
        this->subDeallocate = ktabPhonesSubObjDeallocate;
        this->subObj = picoos_allocate(common->mm, sizeof(ktabphones_subobj_t));
        if (NULL == this->subObj) {
            status = picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM,
                                             NULL, NULL);
        } else {
            status = ktabPhonesInitialize(this, common);
        }
    }
    return status;
}
/**
 * Returns the phones sub-object of a knowledge base.
 * @param this  knowledge base, may be NULL
 * @return the kb's subObj viewed as picoktab_Phones, or NULL if this is NULL
 */
picoktab_Phones picoktab_getPhones(picoknow_KnowledgeBase this) {
    return (NULL == this) ? NULL : (picoktab_Phones) this->subObj;
}
/* Phones methods */
picoos_uint8 picoktab_hasVowelProp(const picoktab_Phones this,
const picoos_uint8 ch) {
return (KTAB_PPROP_VOWEL & ((ktabphones_SubObj)this)->props[ch]);
}
picoos_uint8 picoktab_hasDiphthProp(const picoktab_Phones this,
const picoos_uint8 ch) {
return (KTAB_PPROP_DIPHTH & ((ktabphones_SubObj)this)->props[ch]);
}
picoos_uint8 picoktab_hasGlottProp(const picoktab_Phones this,
const picoos_uint8 ch) {
return (KTAB_PPROP_GLOTT & ((ktabphones_SubObj)this)->props[ch]);
}
picoos_uint8 picoktab_hasNonsyllvowelProp(const picoktab_Phones this,
const picoos_uint8 ch) {
return (KTAB_PPROP_NONSYLLVOWEL & ((ktabphones_SubObj)this)->props[ch]);
}
picoos_uint8 picoktab_hasSyllconsProp(const picoktab_Phones this,
const picoos_uint8 ch) {
return (KTAB_PPROP_SYLLCONS & ((ktabphones_SubObj)this)->props[ch]);
}
/**
 * Tells whether a phone can carry a syllable: it is either a vowel that is
 * not flagged non-syllabic, or it is flagged as syllabic consonant.
 * @param this  phones table
 * @param ch    phone id, used as index into the property table
 * @return 1 if the phone is a syllable carrier, 0 otherwise
 */
picoos_bool picoktab_isSyllCarrier(const picoktab_Phones this,
                                   const picoos_uint8 ch) {
    const picoos_uint8 flags = ((ktabphones_SubObj)this)->props[ch];
    const int syllabicVowel = (KTAB_PPROP_VOWEL & flags)
                              && !(KTAB_PPROP_NONSYLLVOWEL & flags);
    const int syllabicCons = (KTAB_PPROP_SYLLCONS & flags) != 0;

    return (syllabicVowel || syllabicCons);
}
picoos_bool picoktab_isPrimstress(const picoktab_Phones this,
const picoos_uint8 ch) {
return (ch == ((ktabphones_SubObj)this)->specids[KTAB_IND_PRIMSTRESS]);
}
picoos_bool picoktab_isSecstress(const picoktab_Phones this,
const picoos_uint8 ch) {
return (ch == ((ktabphones_SubObj)this)->specids[KTAB_IND_SECSTRESS]);
}
picoos_bool picoktab_isSyllbound(const picoktab_Phones this,
const picoos_uint8 ch) {
return (ch == ((ktabphones_SubObj)this)->specids[KTAB_IND_SYLLBOUND]);
}
picoos_bool picoktab_isWordbound(const picoktab_Phones this,
const picoos_uint8 ch) {
return (ch == ((ktabphones_SubObj)this)->specids[KTAB_IND_WORDBOUND]);
}
picoos_bool picoktab_isPause(const picoktab_Phones this,
const picoos_uint8 ch) {
return (ch == ((ktabphones_SubObj)this)->specids[KTAB_IND_PAUSE]);
}
picoos_uint8 picoktab_getPrimstressID(const picoktab_Phones this) {
return ((ktabphones_SubObj)this)->specids[KTAB_IND_PRIMSTRESS];
}
picoos_uint8 picoktab_getSecstressID(const picoktab_Phones this) {
return ((ktabphones_SubObj)this)->specids[KTAB_IND_SECSTRESS];
}
picoos_uint8 picoktab_getSyllboundID(const picoktab_Phones this) {
return ((ktabphones_SubObj)this)->specids[KTAB_IND_SYLLBOUND];
}
picoos_uint8 picoktab_getWordboundID(const picoktab_Phones this) {
return ((ktabphones_SubObj)this)->specids[KTAB_IND_WORDBOUND];
}
picoos_uint8 picoktab_getPauseID(const picoktab_Phones this) {
return ((ktabphones_SubObj)this)->specids[KTAB_IND_PAUSE];
}
/* ************************************************************/
/* Pos */
/* ************************************************************/
/* overview binary file format for pos kb:
pos-kb = header posids
header = {COUNT2 OFFS2}=8
posids = {POSID1 {PARTID1}0:8}1:
where POSID1 is the value of the (combined) part-of-speech symbol,
and {PARTID1} are the symbol values of its components (empty if it
is not a combined symbol). The {PARTID1} list is sorted.
Part-of-speech symbols with equal number of components are grouped
together.
The header contains information about these groups:
COUNT2 specifies the number of elements in the group, and OFFS2
specifies the offset (relative to the beginning of the kb) where
the group data starts, i.e.:
25 32 -> 25 not-combined elements, starting at offset 32
44 57 -> 44 elements composed of 2 symbols, starting at offset 57
23 189 -> 23 elements composed of 3 symbols, starting at offset 189
...
Currently, each symbol may be composed of up to 8 other symbols.
Therefore, the header has 8 entries, too. The header starts with
the unique POS list, and then in increasing order, 2 symbols, 3
symbols,...
For illustration, a printf-ed dump of a pos kb:
25 32
44 57
23 189
12 281
4 341
1 365
0 0
0 0
33 |
34 |
35 |
60 |
etc.
36 | 35 60
50 | 35 95
51 | 35 97
58 | 35 120
59 | 35 131
61 | 60 75
63 | 60 95
64 | 60 97
etc.
42 | 35 60 117
44 | 35 60 131
45 | 35 73 97
48 | 35 84 97
54 | 35 97 131
56 | 35 113 120
57 | 35 117 120
62 | 60 84 122
etc.
*/
/* run-time view of a pos kb; the fields are set by ktabPosInitialize.
   Index i refers to the group of symbols composed of i+1 components
   (index 0: the not-combined symbols) */
typedef struct ktabpos_subobj *ktabpos_SubObj;
typedef struct ktabpos_subobj {
picoos_uint16 nrcomb[PICOKTAB_MAXNRPOS_IN_COMB]; /* number of entries per group */
picoos_uint8 *nrcombstart[PICOKTAB_MAXNRPOS_IN_COMB]; /* first entry per group, NULL for an empty group */
} ktabpos_subobj_t;
/**
 * Decodes the header of a pos kb (PICOKTAB_MAXNRPOS_IN_COMB records of
 * COUNT2 OFFS2, each value two bytes with the low byte first) into the
 * sub-object.
 * A kb is rejected as corrupt if the first group is empty or if the offsets
 * of the non-empty groups are not strictly increasing.
 * @param this    knowledge base whose subObj has already been allocated
 * @param common  supplies the exception manager used for error reporting
 * @return PICO_OK on success; otherwise the status produced by raising
 *         PICO_EXC_KB_MISSING or PICO_EXC_FILE_CORRUPT
 */
static pico_status_t ktabPosInitialize(register picoknow_KnowledgeBase this,
                                       picoos_Common common) {
    ktabpos_subobj_t *ktabpos;
    picoos_uint16 osprev;
    picoos_uint16 os, pos;
    picoos_uint8 i;

    PICODBG_DEBUG(("start"));

    if ((NULL == this) || (NULL == this->subObj)) {
        return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
                                       NULL, NULL);
    }
    ktabpos = (ktabpos_subobj_t *)this->subObj;

    os = 0;
    for (i = 0; i < PICOKTAB_MAXNRPOS_IN_COMB; i++) {
        pos = 4 * i;    /* each header record occupies four bytes */
        ktabpos->nrcomb[i] = ((picoos_uint16)(this->base[pos+1])) << 8 |
            this->base[pos];

        if (0 == ktabpos->nrcomb[i]) {
            if (0 == i) {
                /* cannot be, in a valid kb */
                return picoos_emRaiseException(common->em,
                                               PICO_EXC_FILE_CORRUPT,
                                               NULL, NULL);
            }
            ktabpos->nrcombstart[i] = NULL;
            continue;
        }

        osprev = os;
        os = ((picoos_uint16)(this->base[pos+3])) << 8 | this->base[pos+2];
        ktabpos->nrcombstart[i] = &(this->base[os]);
        PICODBG_TRACE(("i %d, pos %d, nr %d, osprev %d, os %d", i, pos,
                       ktabpos->nrcomb[i], osprev, os));
        if (osprev >= os) {
            /* cannot be, in a valid kb */
            return picoos_emRaiseException(common->em,
                                           PICO_EXC_FILE_CORRUPT,
                                           NULL, NULL);
        }
    }
    return PICO_OK;
}
/**
 * Releases the pos sub-object of a knowledge base.
 * @param this  knowledge base; a NULL kb is silently accepted
 * @param mm    memory manager handed to picoos_deallocate
 * @return always PICO_OK
 */
static pico_status_t ktabPosSubObjDeallocate(register picoknow_KnowledgeBase this,
                                             picoos_MemoryManager mm) {
    if (NULL == this) {
        return PICO_OK;
    }
    /* picoos_deallocate receives the address of the pointer, not the pointer */
    picoos_deallocate(mm, (void *) &this->subObj);
    return PICO_OK;
}
/**
 * Turns a generic knowledge base into a pos knowledge base: installs the
 * sub-object deallocator, allocates the sub-object and initializes it from
 * the kb header.
 * @param this    knowledge base to specialize
 * @param common  supplies memory manager and exception manager
 * @return PICO_OK on success; otherwise the status produced by raising
 *         PICO_EXC_KB_MISSING (this is NULL) or PICO_EXC_OUT_OF_MEM, or the
 *         status returned by ktabPosInitialize
 */
pico_status_t picoktab_specializePosKnowledgeBase(picoknow_KnowledgeBase this,
                                                  picoos_Common common) {
    pico_status_t status;

    if (NULL == this) {
        status = picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
                                         NULL, NULL);
    } else {
        this->subDeallocate = ktabPosSubObjDeallocate;
        this->subObj = picoos_allocate(common->mm, sizeof(ktabpos_subobj_t));
        if (NULL == this->subObj) {
            status = picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM,
                                             NULL, NULL);
        } else {
            status = ktabPosInitialize(this, common);
        }
    }
    return status;
}
/**
 * Returns the pos sub-object of a knowledge base.
 * @param this  knowledge base, may be NULL
 * @return the kb's subObj viewed as picoktab_Pos, or NULL if this is NULL
 */
picoktab_Pos picoktab_getPos(picoknow_KnowledgeBase this) {
    return (NULL == this) ? NULL : (picoktab_Pos) this->subObj;
}
/* Pos methods */
/**
 * Checks that every element of grp1 also occurs somewhere in grp2; the
 * order of the elements does not matter.
 * (If both groups were sorted in ascending order, a strcmp-like comparison
 * could be used instead.)
 * @param grp1  first group, len elements
 * @param grp2  second group, len elements
 * @param len   number of elements in each group
 * @return 1 if all elements of grp1 are found in grp2 (also for len 0),
 *         0 otherwise
 */
static picoos_int16 ktab_isEqualPosGroup(const picoos_uint8 *grp1,
                                         const picoos_uint8 *grp2,
                                         picoos_uint8 len)
{
    picoos_uint16 i, j;

    for (i = 0; i < len; i++) {
        /* search grp1[i] in grp2 */
        for (j = 0; (j < len) && (grp1[i] != grp2[j]); j++) {
            /* scanning only */
        }
        if (j >= len) {
            return 0;
        }
    }
    return 1;
}
/**
 * Tells whether 'pos' is listed among the not-combined part-of-speech
 * symbols (group 0 of the pos kb).
 * The scan stops at the first table value that is not smaller than 'pos',
 * i.e. it relies on group 0 being stored in ascending order.
 * @param this  pos table
 * @param pos   part-of-speech symbol value to look for
 * @return non-zero if 'pos' is found in group 0, 0 otherwise
 */
picoos_bool picoktab_isUniquePos(const picoktab_Pos this,
                                 const picoos_uint8 pos) {
    ktabpos_subobj_t *ktabpos;
    picoos_uint16 i;

    /* speed-up possible with e.g. binary search */
    ktabpos = (ktabpos_subobj_t *)this;
    PICODBG_TRACE(("pos %d, nrcombinations %d", pos, ktabpos->nrcomb[0]));
    i = 0;
    while ((i < ktabpos->nrcomb[0]) && (pos > ktabpos->nrcombstart[0][i])) {
        /* one argument per conversion: the table value, then its index */
        PICODBG_TRACE(("compare with pos %d at position %d",
                       ktabpos->nrcombstart[0][i], i));
        i++;
    }
    return ((i < ktabpos->nrcomb[0]) && (pos == ktabpos->nrcombstart[0][i]));
}
/**
 * Tells whether 'pos' equals 'posgroup' or is one of the components of the
 * combined symbol 'posgroup'.
 * The combined groups (2 and more components) are searched linearly for an
 * entry whose id is 'posgroup'; the first match is used.
 * @param this      pos table
 * @param pos       part-of-speech symbol value to look for
 * @param posgroup  (possibly combined) part-of-speech symbol value
 * @return TRUE if pos == posgroup or pos is a component of posgroup,
 *         FALSE otherwise
 */
picoos_bool picoktab_isPartOfPosGroup(const picoktab_Pos this,
                                      const picoos_uint8 pos,
                                      const picoos_uint8 posgroup)
{
    ktabpos_subobj_t *ktabpos = (ktabpos_subobj_t *) this;
    picoos_uint8 *components = NULL;
    picoos_uint8 *entry;
    picoos_uint16 grpIdx, k, entrySize;
    picoos_uint16 nrComponents = 0;
    picoos_uint8 found;

    /* a symbol is trivially part of itself */
    found = (pos == posgroup) ? TRUE : FALSE;

    /* currently, a linear search is required to find 'posgroup'; the
       knowledge base should be extended to allow for a faster search */
    for (grpIdx = 1;
         (components == NULL) && (grpIdx < PICOKTAB_MAXNRPOS_IN_COMB);
         grpIdx++) {
        entry = ktabpos->nrcombstart[grpIdx];   /* ptr to first entry */
        entrySize = grpIdx + 2;                 /* id byte + components */
        for (k = 0; k < ktabpos->nrcomb[grpIdx]; k++, entry += entrySize) {
            if (posgroup == entry[0]) {
                components = entry + 1;
                nrComponents = entrySize - 1;
                break;
            }
        }
    }

    /* test if 'pos' is contained in the components of 'posgroup' */
    if (components != NULL) {
        for (k = 0; !found && (k < nrComponents); k++) {
            if (pos == components[k]) {
                found = TRUE;
            }
        }
    }
    return found;
}
/**
 * Maps a list of part-of-speech symbols to the id of the combined symbol
 * that has exactly these components.
 * @param this        pos table
 * @param poslist     component symbols
 * @param poslistlen  number of symbols in poslist
 * @return 0 if poslistlen is 0 or exceeds PICOKTAB_MAXNRPOS_IN_COMB;
 *         poslist[0] if the list has one element or no matching combination
 *         is found; otherwise the id of the matching combined symbol
 */
picoos_uint8 picoktab_getPosGroup(const picoktab_Pos this,
                                  const picoos_uint8 *poslist,
                                  const picoos_uint8 poslistlen)
{
    ktabpos_subobj_t *ktabpos = (ktabpos_subobj_t *) this;
    picoos_uint8 combined;
    picoos_uint8 *entry;
    picoos_uint16 grpIdx, k, count, entrySize;

    if ((poslistlen == 0) || (poslistlen > PICOKTAB_MAXNRPOS_IN_COMB)) {
        return 0;
    }

    grpIdx = poslistlen - 1;
    if (grpIdx == 0) {
        /* not a composed POS */
        return poslist[0];
    }

    count = ktabpos->nrcomb[grpIdx];        /* number of entries */
    entry = ktabpos->nrcombstart[grpIdx];   /* ptr to first entry */
    entrySize = grpIdx + 2;                 /* size of an entry in bytes */

    combined = 0;
    for (k = 0; (0 == combined) && (k < count); k++, entry += entrySize) {
        if (ktab_isEqualPosGroup(poslist, entry + 1, poslistlen)) {
            combined = *entry;
        }
    }

    if (0 == combined) {
        /* combination not found; shouldn't occur if lingware OK! */
        /* contingency solution: take first */
        PICODBG_WARN(("dynamically created POS combination not found in table; taking first (%i)",poslist[0]));
        combined = poslist[0];
    }
    return combined;
}
#ifdef __cplusplus
}
#endif
/* end */