blob: f68e22139ee12d59e404b27c79d7de3a8a7e3290 [file] [log] [blame]
/*---------------------------------------------------------------------------*
* test_g2g.c *
* *
* Copyright 2007, 2008 Nuance Communciations, Inc. *
* *
* Licensed under the Apache License, Version 2.0 (the 'License'); *
* you may not use this file except in compliance with the License. *
* *
* You may obtain a copy of the License at *
* http://www.apache.org/licenses/LICENSE-2.0 *
* *
* Unless required by applicable law or agreed to in writing, software *
* distributed under the License is distributed on an 'AS IS' BASIS, *
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
* See the License for the specific language governing permissions and *
* limitations under the License. *
* *
*---------------------------------------------------------------------------*/
#include "pstdio.h"
#include "pmemory.h"
#include "plog.h"
#include "HashMap.h"
#include "SR_Grammar.h"
#include "SR_Vocabulary.h"
#include "SR_SemanticResult.h"
#include "ESR_Session.h"
#include "ESR_Locale.h"
#include "ESR_CommandLine.h"
#include "LCHAR.h"
#include "PFileSystem.h"
#include "PANSIFileSystem.h"
#include "SR_GrammarImpl.h"
#include "simapi.h"
#include "srec_context.h"
#include "srec_arb.h"
/**
* Accumulator filled in by traverse_tree() while it walks one decision tree.
*
* Only nnodes and the four low/high fields are written by the code visible in
* this file; the remaining fields are not referenced here.
*/
typedef struct
{
unsigned short nnodes; /* incremented once per node visited by traverse_tree() */
unsigned long size; /* not referenced in the visible part of this file */
long phoneme; /* not referenced in the visible part of this file */
unsigned short node_pos; /* not referenced in the visible part of this file */
unsigned long node_off; /* not referenced in the visible part of this file */
short low_genone_no; /* traverse_tree() stores the same value here as in low_pel_no */
short high_genone_no; /* traverse_tree() stores the same value here as in high_pel_no */
short low_pel_no; /* smallest term.pelid seen on a terminal node; callers seed it before the walk */
short high_pel_no; /* largest term.pelid seen on a terminal node; callers seed it before the walk */
}
tree_head;
/**
 * Prints the command-line synopsis to PSTDOUT.
 *
 * Lists every option that main() parses, including -out and -log.
 *
 * @param exename Program name shown in the first line (main passes argv[0]).
 * @return Always 1.
 */
int usage(LCHAR* exename)
{
  pfprintf(PSTDOUT,"usage: %s -base <basefilename> \n",exename);
  pfprintf(PSTDOUT,"<basefilename> can be a file.g2g or @g2gfilelist\n");
  pfprintf(PSTDOUT,"[-checkword id] .. also checks word id in the file\n");
  pfprintf(PSTDOUT,"[-swiarb esr/config/lang/models/generic.swiarb] ... enables word check\n");
  pfprintf(PSTDOUT,"[-out outbase] .. dumps the loaded syntax to outbase\n");
  pfprintf(PSTDOUT,"[-log] .. raises the log level passed to PLogInit\n");
  return 1;
}
/* Forward declarations for functions defined further down in this file. */
/* NOTE(review): the last parameter is named num_hmms here but num_ihmms in the definition. */
ESR_ReturnCode find_phonemes_for_ihmms( CA_Arbdata* ca_arbdata, modelID* ihmms, int num_hmms);
ESR_ReturnCode Parse(SR_Grammar* grammar, LCHAR* trans, PFile* fout);
int CheckG2G(CA_Arbdata* arbdata, int* p4pTable, const char* base, int wordid, char* outbase);
void load_filelist(char* filelist, char*** pfiles, int *pnum_files);
int *phonemecode_for_pel_table(CA_Arbdata* arbdata);
/* When non-zero, the tree-walking helpers print each phoneme's pel range with printf. */
int debug = 0;
/* NOTE(review): MAX_LINE_LENGTH is not referenced in the visible part of this file. */
#define MAX_LINE_LENGTH 256
/* Size of the value buffer used by display_results(). */
#define MAX_STR_LENGTH 80
/* Number of SR_SemanticResult holders that Parse() creates. */
#define MAX_SEM_RESULTS 3
/* Capacity of the key array used by display_results(). */
#define MAX_KEYS 30
/* main */
int main (int argc, char **argv)
{
ESR_ReturnCode rc;
LCHAR base[P_PATH_MAX] = L("");
int i;
CA_Arbdata* ca_arbdata;
char* arbfile = NULL;
char** g2glist;
int g2glist_len;
char* outbase = NULL;
int *p4pTable;
int wordid = 0;
int log_level = 0;
/*
* Initialize portable library.
*/
CHKLOG(rc, PMemInit());
/* CHKLOG(rc, PFileSystemCreate());
CHKLOG(rc, PANSIFileSystemCreate());
CHKLOG(rc, PANSIFileSystemAddPath(L("/dev/ansi"), L("/")));*/
/* Set ANSI file-system as default file-system */
/* CHKLOG(rc, PANSIFileSystemSetDefault(ESR_TRUE));*/
/* Set virtual current working directory to native current working directory */
/* len = P_PATH_MAX;
CHKLOG(rc, PANSIFileSystemGetcwd(cwd, &len));
CHKLOG(rc, PFileSystemChdir(cwd));*/
if( argc <= 1)
{
usage(argv[0]);
exit(EXIT_FAILURE);
}
for (i = 1; i < argc; ++i)
{
if(!LSTRCMP(argv[i], L("-base")))
{
++i;
LSTRCPY(base, argv[i]);
}
else if(!LSTRCMP(argv[i],L("-out")))
{
outbase = argv[++i];
}
else if(!LSTRCMP(argv[i],L("-swiarb")))
{
arbfile = argv[++i];
}
else if(!LSTRCMP(argv[i],L("-checkword")))
{
wordid = atoi(argv[++i]);
}
else if(!LSTRCMP(argv[i],L("-log")))
{
log_level = 10;
}
else
{
printf("unrecog'd argument %s\n", argv[i]);
exit(1);
}
}
CHK(rc, PLogInit(NULL, log_level));
if(arbfile) {
ca_arbdata = CA_LoadArbdata(arbfile);
if(!ca_arbdata) {
pfprintf(PSTDOUT, "Error: loading arbfile %s\n", arbfile);
goto CLEANUP;
}
pfprintf(PSTDOUT, "arbdata done\n");
p4pTable = phonemecode_for_pel_table(ca_arbdata);
pfprintf(PSTDOUT, "p4pTable done\n");
} else {
ca_arbdata = 0;
p4pTable = 0;
}
if(base[0] == '@') {
load_filelist(base+1, &g2glist, &g2glist_len);
pfprintf(PSTDOUT, "g2glist %s .. %d entries\n", g2glist_len);
for(i=0; i<g2glist_len; i++)
CheckG2G( ca_arbdata, p4pTable, g2glist[i], wordid, outbase);
}
else {
CheckG2G( ca_arbdata, p4pTable, base, wordid, outbase);
}
CLEANUP:
PLogShutdown();
/* PANSIFileSystemDestroy();
PFileSystemDestroy();*/
PMemSetLogFile(PSTDOUT);
PMemDumpLogFile();
PMemShutdown();
return rc;
}
/**
 * Loads one grammar file, prints its size statistics and, when requested,
 * verifies a single word of that grammar.
 *
 * The word check runs only when wordid is non-zero and ca_arbdata is
 * non-NULL. For the first arc whose olabel equals wordid it:
 *  - walks backwards through non-self-loop arcs until the start node or a
 *    WORD_BOUNDARY ilabel, collecting the preceding ilabels;
 *  - walks forwards the same way until the end node or a WORD_BOUNDARY;
 *  - subtracts fst->hmm_ilabel_offset from every collected ilabel;
 *  - converts the resulting ids to phoneme codes, via p4pTable when one is
 *    supplied and via find_phonemes_for_ihmms() otherwise;
 *  - runs Parse() on the word's text.
 *
 * @param ca_arbdata Arbdata used for the phoneme lookup; may be NULL.
 * @param p4pTable   Optional table indexed by hmm id giving a phoneme code;
 *                   a negative entry is treated as "no phoneme".
 *                   NOTE(review): its length is not known here, so the index
 *                   is not range-checked.
 * @param base       Path of the grammar file to load.
 * @param wordid     Word id to verify, 0 to skip the word check.
 * @param outbase    If non-NULL, passed to CA_DumpSyntax() as output name.
 * @return 0 when the arc scan runs to completion (including the "PASSED"
 *         case); 1 on any failure and also when the word check is skipped.
 */
int CheckG2G(CA_Arbdata* ca_arbdata, int* p4pTable, const char* base, int wordid, char* outbase)
{
  enum { MAX_CONTEXT_ILABELS = 64 };
  ESR_ReturnCode rc;
  SR_GrammarImpl *grammarImpl;
  SR_Grammar* grammar = NULL;
  srec_context* fst;
  CA_Syntax* syntax;
  modelID ilabels_preceding[MAX_CONTEXT_ILABELS];
  modelID ilabels_following[MAX_CONTEXT_ILABELS];
  modelID ilabels[2 * MAX_CONTEXT_ILABELS];
  int num_ilabels_preceding;
  int num_ilabels_following;
  int num_ilabels;
  int status = 1;
  int i,j;
  unsigned long g2gsize = 0;
  FILE* fp;

  /* File size is informational only; it stays 0 if the file cannot be measured. */
  fp = fopen(base, "rb");
  if (fp) {
    if (fseek(fp, 0, SEEK_END) == 0) {
      long pos = ftell(fp);
      if (pos > 0)
        g2gsize = (unsigned long)pos;
    }
    fclose(fp);
  }

  rc = SR_GrammarLoad(base, &grammar);
  if (rc != ESR_SUCCESS) {
    pfprintf(PSTDOUT, "%s failed at load\n", base);
    goto CLEANUP;
  }
  grammarImpl = (SR_GrammarImpl*)grammar;
  syntax = grammarImpl->syntax;
  if (outbase) {
    CA_DumpSyntax( syntax, outbase);
  }
  fst = syntax->synx;

  pfprintf(PSTDOUT, "%s %lu arcs %d/%d/%d nodes %d/%d/%d words %d/%d chars %d/%d modelver %d\n",
           base, g2gsize,
           fst->num_arcs, fst->num_base_arcs, fst->FSMarc_list_len,
           fst->num_nodes, fst->num_base_nodes, fst->FSMnode_list_len,
           fst->olabels->num_words, fst->olabels->max_words,
           (int)(fst->olabels->next_chars-fst->olabels->chars),
           fst->olabels->max_chars,
#ifdef IMAGE_FORMAT_V2
           fst->modelid
#else
           -1
#endif
          );

  /* No word check requested (or no arbdata to check against): keep returning 1 as before. */
  if (wordid == 0 || ca_arbdata == 0)
    goto CLEANUP;
  if (wordid >= fst->olabels->num_words) {
    pfprintf(PSTDOUT, "%s failed 'cuz numwords(%d) < %d\n", base,
             fst->olabels->num_words, wordid);
    goto CLEANUP;
  }

  for (i=0; i<fst->num_arcs; i++) {
    if (fst->FSMarc_list[i].olabel == wordid) {
      FSMnode* node;
      FSMarc* arc = &fst->FSMarc_list[i];
      nodeID fr_node = arc->fr_node;
      arcID iarc;

      ilabels_following[0] = arc->ilabel;
      num_ilabels_following = 1;
      num_ilabels_preceding = 0;

      /* Walk backwards from the word arc towards the start node. */
      for ( ; fr_node!=fst->start_node; fr_node=arc->fr_node) {
        node = &fst->FSMnode_list[fr_node];
        iarc = node->first_prev_arc;
        /* Take the first incoming arc that is not a self loop. */
        for ( ; iarc!=MAXarcID; iarc=arc->linkl_prev_arc) {
          arc = &fst->FSMarc_list[iarc];
          if (arc->fr_node != fr_node) break;
        }
        if (iarc == MAXarcID) {
          pfprintf(PSTDOUT, "%s failed at 11\n", base);
          goto CLEANUP;
        }
        if (arc->ilabel == WORD_BOUNDARY) break;
        if (num_ilabels_preceding >= MAX_CONTEXT_ILABELS) {
          pfprintf(PSTDOUT, "%s failed at 13 (more than %d preceding labels)\n",
                   base, (int)MAX_CONTEXT_ILABELS);
          goto CLEANUP;
        }
        ilabels_preceding[num_ilabels_preceding++] = arc->ilabel;
      }

      /* Walk forwards from the word arc towards the end node. */
      arc = &fst->FSMarc_list[i];
      fr_node = arc->to_node;
      for ( ; fr_node!=fst->end_node; fr_node=arc->to_node) {
        node = &fst->FSMnode_list[fr_node];
        iarc = node->un_ptr.first_next_arc;
        /* Take the first outgoing arc that is not a self loop. */
        for ( ; iarc!=MAXarcID; iarc=arc->linkl_next_arc) {
          arc = &fst->FSMarc_list[iarc];
          if (arc->to_node != fr_node) break;
        }
        if (iarc == MAXarcID) {
          pfprintf(PSTDOUT, "%s failed at 12\n", base);
          goto CLEANUP;
        }
        if (num_ilabels_following >= MAX_CONTEXT_ILABELS) {
          pfprintf(PSTDOUT, "%s failed at 14 (more than %d following labels)\n",
                   base, (int)MAX_CONTEXT_ILABELS);
          goto CLEANUP;
        }
        ilabels_following[num_ilabels_following++] = arc->ilabel;
        if (arc->ilabel == WORD_BOUNDARY) break;
      }

      /* Preceding labels were collected back-to-front; reverse them while merging. */
      num_ilabels = 0;
      for (j=0; j<num_ilabels_preceding; j++)
        ilabels[num_ilabels++] = ilabels_preceding[num_ilabels_preceding-1-j];
      for (j=0; j<num_ilabels_following; j++)
        ilabels[num_ilabels++] = ilabels_following[j];
      if (ilabels[num_ilabels-1] == WORD_BOUNDARY)
        num_ilabels--;

      for (j=0; j<num_ilabels; j++) {
        if (ilabels[j]<fst->hmm_ilabel_offset) {
          pfprintf(PSTDOUT, "%s failed at 15\n", base);
          goto CLEANUP;
        } else
          ilabels[j] = ilabels[j] - (labelID)fst->hmm_ilabel_offset;
      }

      pfprintf(PSTDOUT, "%s (W%d) ihmms ", fst->olabels->words[wordid], wordid);
      for (j=0;j<num_ilabels;j++)
        pfprintf(PSTDOUT, " %d", ilabels[j]);
      pfprintf(PSTDOUT, "\n");
      if (num_ilabels < 2) {
        pfprintf(PSTDOUT, "%s failed at 1\n", base);
        goto CLEANUP;
      }

      /*
       * Use the lookup table when there is one and fall back to the tree
       * search only when there is not. The test used to be inverted, which
       * made the table branch run exactly when the table pointer was NULL.
       */
      if (!p4pTable)
        rc = find_phonemes_for_ihmms( ca_arbdata, ilabels, num_ilabels);
      else {
        rc = ESR_SUCCESS;
        for (j=0; j<num_ilabels; j++) {
          if (p4pTable[ ilabels[j]]<0) {
            rc = ESR_NO_MATCH_ERROR;
            ilabels[j] = MAXmodelID;
          } else {
            ilabels[j] = (modelID)p4pTable[ ilabels[j]];
          }
        }
      }
      if (rc) {
        pfprintf(PSTDOUT, "%s failed at 2\n", base);
        goto CLEANUP;
      }

      pfprintf(PSTDOUT, "%s ", fst->olabels->words[wordid]);
      for (j=0;j<num_ilabels;j++) pfprintf(PSTDOUT, "%c", ilabels[j]);
      pfprintf(PSTDOUT, "\n");

      rc = Parse( grammar, fst->olabels->words[wordid], PSTDOUT);
      if (rc) {
        pfprintf(PSTDOUT, "%s failed at 3\n", base);
        goto CLEANUP;
      }
      pfprintf(PSTDOUT, "%s PASSED (on %s)\n", base, fst->olabels->words[wordid]);
      break;
    }
  }
  status = 0;

CLEANUP:
  /* Release the grammar on every path, including the successful one. */
  if (grammar) SR_GrammarDestroy(grammar);
  return status;
}
/**
 * Recursively walks a decision tree and accumulates statistics in tree_topo.
 *
 * Every visited node increments tree_topo->nnodes. A node whose
 * node.quest_index is negative is treated as terminal: it is counted in
 * *num_terminal_nodes (if that pointer is non-NULL) and its term.pelid
 * widens the [low_pel_no, high_pel_no] range (the genone fields receive the
 * same values). Any other node recurses into its fail and pass children.
 *
 * The caller must seed low_pel_no / high_pel_no (and nnodes, if it is read
 * afterwards) before the first call.
 *
 * @param node               Root of the (sub)tree; NULL is ignored.
 * @param tree_topo          Accumulator to update; must not be NULL.
 * @param num_terminal_nodes Optional terminal-node counter.
 * @return Always 0.
 */
int traverse_tree(tree_node* node, tree_head *tree_topo, int *num_terminal_nodes)
{
  /* The NULL test used to guard only the counter, so a NULL child was still dereferenced. */
  if (!node)
    return 0;

  tree_topo->nnodes++;
  if (node->node.quest_index < 0) {
    if (num_terminal_nodes)
      (*num_terminal_nodes)++;
    if ( node->term.pelid < tree_topo->low_pel_no)
      tree_topo->low_pel_no = tree_topo->low_genone_no = node->term.pelid;
    if ( node->term.pelid > tree_topo->high_pel_no)
      tree_topo->high_pel_no = tree_topo->high_genone_no = node->term.pelid;
  } else {
    traverse_tree( (tree_node*)node->node.fail, tree_topo, num_terminal_nodes);
    traverse_tree( (tree_node*)node->node.pass, tree_topo, num_terminal_nodes);
  }
  return 0;
}
/**
 * Counts the nodes of a decision tree using traverse_tree().
 *
 * @param node               Root of the tree.
 * @param num_terminal_nodes If non-NULL, receives the number of terminal
 *                           nodes (reset to 0 before the walk).
 * @return Total number of nodes visited.
 */
int num_nodes_in_tree(tree_node* node, int *num_terminal_nodes)
{
  /*
   * Zero the whole accumulator: traverse_tree() compares pel ids against the
   * low/high fields, which were previously left uninitialised here.
   */
  tree_head topo = {0};

  if (num_terminal_nodes)
    *num_terminal_nodes = 0;
  traverse_tree(node, &topo, num_terminal_nodes);
  return topo.nnodes;
}
/**
 * Replaces each id in ihmms with the code of the phoneme whose tree covers it.
 *
 * For every entry the phoneme trees are walked in order; the first phoneme
 * whose terminal-node pel range [low, high] contains the id wins. Ids that
 * match no phoneme are mapped to phoneme 0 when they are <= 5 and are
 * reported through PLogError otherwise.
 *
 * The array is rewritten in place in two passes (id -> phoneme index, then
 * phoneme index -> pdata[].code). When the function fails, the array is left
 * with a mix of phoneme indices and unresolved ids.
 *
 * @param ca_arbdata Arbdata handle; cast to srec_arbdata* internally.
 * @param ihmms      In/out array of ids.
 * @param num_ihmms  Number of entries in ihmms.
 * @return ESR_SUCCESS when every entry was resolved, ESR_INVALID_ARGUMENT
 *         otherwise.
 */
ESR_ReturnCode find_phonemes_for_ihmms( CA_Arbdata* ca_arbdata, modelID* ihmms, int num_ihmms)
{
  int ii, i;
  int num_hmms_in_phoneme;
  tree_head topo = {0};
  srec_arbdata* a = (srec_arbdata*)ca_arbdata;
  int num_phonemes_for_ihmms = 0;

  for (ii=0; ii<num_ihmms; ii++) {
    for (i=0; i<a->num_phonemes; i++) {
      /* Reset the accumulator so the range reflects this phoneme only. */
      num_hmms_in_phoneme = 0;
      topo.nnodes = 0;
      /* NOTE(review): 32567 is the seed used throughout this file; possibly meant to be 32767. */
      topo.low_pel_no = 32567;
      topo.high_pel_no = 0;
      traverse_tree(a->pdata[i].model_nodes, &topo, &num_hmms_in_phoneme);
      if (debug)printf("phoneme %d num_hmms %d (%d-%d)\n", i, num_hmms_in_phoneme,
                       topo.low_pel_no, topo.high_pel_no);
      if (ihmms[ii] >= topo.low_pel_no && ihmms[ii]<= topo.high_pel_no) {
        ihmms[ii] = (modelID)i;
        num_phonemes_for_ihmms++;
        break;
      }
    }
    /* Inner loop ran to the end: no phoneme covers this id. */
    if ( i==a->num_phonemes) {
      if (ihmms[ii]<=5) {
        ihmms[ii] = 0;
        num_phonemes_for_ihmms++;
      } else {
        PLogError("error: could not find hmm%d under any phoneme! ",ihmms[ii]);
      }
    }
  }

  if (num_phonemes_for_ihmms != num_ihmms)
    return ESR_INVALID_ARGUMENT;

  /* Second pass: phoneme index -> phoneme code. */
  for (ii=0; ii<num_ihmms; ii++)
    ihmms[ii] = a->pdata[ ihmms[ii]].code;
  return ESR_SUCCESS;
}
/**
 * Writes every key/value pair of a semantic result to fout.
 *
 * Each pair is printed as "{key : value}". If the key list cannot be fetched
 * a single error line is printed; if one value cannot be fetched an error
 * line is printed for that key and the remaining keys are still processed.
 *
 * @param result Semantic result to dump.
 * @param fout   Destination stream.
 */
void display_results(SR_SemanticResult *result, PFile* fout)
{
  LCHAR* keys[MAX_KEYS]; /* filled in by getKeyList */
  LCHAR value[MAX_STR_LENGTH];
  size_t num_keys = MAX_KEYS;
  size_t k;
  ESR_ReturnCode rc;

  rc = result->getKeyList(result, (LCHAR**) &keys, &num_keys);
  if (rc != ESR_SUCCESS)
  {
    pfprintf(fout,"Error: %s\n",ESR_rc2str(rc));
    return;
  }

  for (k = 0; k < num_keys; ++k)
  {
    /* getValue takes the buffer capacity in and may overwrite it, so reset per key. */
    size_t value_len = MAX_STR_LENGTH;

    rc = result->getValue(result, keys[k], value, &value_len);
    if (rc == ESR_SUCCESS)
      pfprintf(fout,"{%s : %s}\n",keys[k],value);
    else
      pfprintf(fout,"Error: %s\n",ESR_rc2str(rc));
  }
}
/**
 * Runs grammar->checkParse() on a transcription and prints the outcome.
 *
 * MAX_SEM_RESULTS result holders are created, the transcription is trimmed
 * in place with lstrtrim(), and the parse is attempted. On success the number
 * of results and each result's key/value pairs are written to fout; when the
 * parse yields no result "no parse" is written instead.
 *
 * All result holders that were created are destroyed on every exit path.
 *
 * @param grammar Grammar to parse against.
 * @param trans   Transcription; modified in place by lstrtrim().
 * @param fout    Destination stream for the report.
 * @return ESR_SUCCESS on a successful parse, ESR_NO_MATCH_ERROR when there
 *         are no results, otherwise the first error returned by result
 *         creation, checkParse() or result destruction.
 */
ESR_ReturnCode Parse(SR_Grammar* grammar, LCHAR* trans, PFile* fout)
{
  ESR_ReturnCode rc;
  ESR_ReturnCode destroy_rc;
  size_t i;
  size_t num_created = 0;
  /*
   * checkParse() takes a size_t*; passing the address of an int through a
   * cast overwrote adjacent stack on platforms where size_t is wider than int.
   */
  size_t result_count = MAX_SEM_RESULTS; /* in: capacity, out: number of results */
  SR_SemanticResult* semanticResults[MAX_SEM_RESULTS];

  for (i = 0; i < MAX_SEM_RESULTS; i++)
  {
    rc = SR_SemanticResultCreate(&semanticResults[i]); /* create the result holders */
    if (rc != ESR_SUCCESS)
      goto CLEANUP;
    num_created++;
  }

  lstrtrim(trans);
  rc = grammar->checkParse(grammar, trans, semanticResults, &result_count);
  if (rc != ESR_SUCCESS)
    goto CLEANUP;

  if (result_count < 1)
  {
    pfprintf(fout,"no parse\n\n");
    rc = ESR_NO_MATCH_ERROR;
    goto CLEANUP;
  }

  pfprintf(fout,"parse ok (%d results)\n", (int)result_count);
  for (i = 0; i < result_count && i < MAX_SEM_RESULTS; i++)
    display_results(semanticResults[i],fout);
  rc = ESR_SUCCESS;

CLEANUP:
  /* Destroy every holder that was created; keep the first error as the result. */
  for (i = 0; i < num_created; i++)
  {
    destroy_rc = semanticResults[i]->destroy(semanticResults[i]);
    if (rc == ESR_SUCCESS)
      rc = destroy_rc;
  }
  return rc;
}
/**
 * Decides whether one line of a file list names a file.
 *
 * Lines starting with '#' are comments. Otherwise the line is cut at the
 * first newline, carriage return or tab; what remains is the file name.
 *
 * @return Non-zero if the (now trimmed) line holds a non-empty file name.
 */
static int filelist_entry(char* line)
{
  if (line[0] == '#')
    return 0;
  line[strcspn(line, "\n\r\t")] = '\0';
  return line[0] != '\0';
}

/**
 * Reads a text file that lists one file name per line.
 *
 * The file is scanned twice: once to count the entries and once to copy
 * them. Comment lines ('#' in the first column) and blank lines are skipped.
 * The array and every string in it are allocated with CALLOC; this function
 * does not free them.
 *
 * @param filelist   Path of the list file.
 * @param pfiles     Receives the array of file names, or NULL when the list
 *                   cannot be read or holds no entries.
 * @param pnum_files Receives the number of valid entries in *pfiles.
 */
void load_filelist(char* filelist, char*** pfiles, int *pnum_files)
{
  FILE* fp;
  char line[512];
  char **files = 0;
  int capacity = 0;
  int num_files = 0;

  /* Pass 1: count the entries so the array can be sized. */
  fp = fopen(filelist, "r");
  if (!fp) {
    pfprintf(PSTDOUT, "failed to open %s\n", filelist);
    goto DONE;
  }
  while ( fgets(line, sizeof(line), fp)) {
    if (filelist_entry(line))
      capacity++;
  }
  fclose(fp);
  if (capacity == 0)
    goto DONE;

  /* Assign the array itself; the old code stored through the still-NULL pointer. */
  files = CALLOC( capacity, sizeof(char*), __FILE__);
  if (!files) {
    pfprintf(PSTDOUT, "out of memory reading %s\n", filelist);
    goto DONE;
  }

  /* Pass 2: copy the names. num_files advances once per stored entry only. */
  fp = fopen(filelist, "r");
  if (!fp) {
    pfprintf(PSTDOUT, "failed to open %s\n", filelist);
    goto DONE;
  }
  while (num_files < capacity && fgets(line, sizeof(line), fp)) {
    char* file;

    if (!filelist_entry(line))
      continue;
    file = CALLOC(strlen(line)+1,sizeof(char),__FILE__);
    if (!file) {
      pfprintf(PSTDOUT, "out of memory reading %s\n", filelist);
      break;
    }
    strcpy( file, line);
    files[num_files++] = file;
  }
  fclose(fp);

DONE:
  *pfiles = files;
  *pnum_files = num_files;
}
/**
 * Builds a table that maps a pel number to the code of the phoneme whose
 * tree contains it.
 *
 * For each phoneme the tree is walked with traverse_tree() and every index
 * in the resulting [low_pel_no, high_pel_no] range is set to that phoneme's
 * code. Ranges are clipped to the table so that an unexpectedly large or
 * negative pel number cannot write outside it; a warning is printed when
 * clipping happens. Entries not covered by any phoneme stay 0.
 * NOTE(review): CheckG2G() tests table entries for "< 0" to detect a missing
 * phoneme, which a 0 entry never satisfies — confirm the intended marker.
 *
 * @param ca_arbdata Arbdata handle; cast to srec_arbdata* internally.
 * @return Pointer to a function-static table of 2048 ints. The same storage
 *         is reused (and overwritten) by every call.
 */
int* phonemecode_for_pel_table(CA_Arbdata* ca_arbdata)
{
  static int table[2048];
  const int table_len = (int)(sizeof(table)/sizeof(table[0]));
  int i,j;
  int first, last;
  tree_head topo = {0};
  srec_arbdata* a = (srec_arbdata*)ca_arbdata;
  int num_hmms_in_phoneme;

  for (j=0; j<table_len; j++)
    table[j] = 0;

  for (i=0; i<a->num_phonemes; i++) {
    /* Reset the accumulator so the range reflects this phoneme only. */
    num_hmms_in_phoneme = 0;
    topo.nnodes = 0;
    topo.low_pel_no = 32567;
    topo.high_pel_no = 0;
    traverse_tree(a->pdata[i].model_nodes, &topo, &num_hmms_in_phoneme);
    if (debug)printf("phoneme %d num_hmms %d (%d-%d)\n", i, num_hmms_in_phoneme,
                     topo.low_pel_no, topo.high_pel_no);

    /* An untouched seed (low > high) means no terminal node was seen: nothing to fill. */
    if (topo.low_pel_no > topo.high_pel_no)
      continue;

    first = topo.low_pel_no;
    last = topo.high_pel_no;
    if (first < 0 || last >= table_len) {
      printf("warning: phoneme %d pel range %d-%d exceeds table size %d\n",
             i, first, last, table_len);
      if (first < 0) first = 0;
      if (last >= table_len) last = table_len - 1;
    }
    for (j=first; j<=last; j++)
      table[j] = a->pdata[i].code;
  }
  return &table[0];
}