/* blob: b367b38601c84705daacb295c70eba5cad6b1813 [file] [log] [blame] */
/*---------------------------------------------------------------------------*
* text_parser.c *
* *
* Copyright 2007, 2008 Nuance Communciations, Inc. *
* *
* Licensed under the Apache License, Version 2.0 (the 'License'); *
* you may not use this file except in compliance with the License. *
* *
* You may obtain a copy of the License at *
* http://www.apache.org/licenses/LICENSE-2.0 *
* *
* Unless required by applicable law or agreed to in writing, software *
* distributed under the License is distributed on an 'AS IS' BASIS, *
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
* See the License for the specific language governing permissions and *
* limitations under the License. *
* *
*---------------------------------------------------------------------------*/
#include"pstdio.h"
#include"srec_context.h"
#include"astar.h"
#include "passert.h"
#include "portable.h"
/* Size, in bytes, of the local char buffers this file uses to hold a copy of
 * a transcription or of a single word (terminator included). */
#define MAX_LOCAL_LEN 256
/* Result codes returned by the path-checking functions in this file. */
#define PARSE_PASS 0
#define PARSE_FAIL 1
/*
 * Match the words of a transcription against the arc-token graph, taking the
 * words from the LAST one back to the first.
 *
 *   context        supplies the word map (olabels), the arc-token list and
 *                  the beg_silence_word passed on to the arc lookups
 *   atok           arc token the matching starts from
 *   transcription  space-separated words
 *   tlen           number of leading characters of transcription to consider
 *
 * Returns PARSE_PASS when every word found an arc and the arc found for the
 * first word of the transcription has a successor whose ilabel is MAXwordID.
 * Returns PARSE_FAIL otherwise, including when the transcription or one of
 * its words is too long for the local buffer.
 */
static int check_word_path(srec_context* context, arc_token* atok,
                           const char* transcription, int tlen)
{
  if (strlen(transcription) >= MAX_LOCAL_LEN - 1)
  {
    PLogError("Transcription too long [%s]\n", transcription);
    return PARSE_FAIL;
  }

  for (;;)
  {
    char word[MAX_LOCAL_LEN];
    const char* word_start = transcription;
    arc_token* matched;
    arc_token* succ;
    wordID id;
    size_t n;
    int reaches_end = 0;

    /* find the first character of the last word in transcription[0..tlen) */
    if (tlen > 0)
    {
      word_start = transcription + tlen - 1;
      while (word_start > transcription)
      {
        if (*word_start == ' ')
        {
          word_start++;
          break;
        }
        word_start--;
      }
    }

    /* isolate that word, NUL-terminated, in the local buffer */
    n = 0;
    for (;;)
    {
      if (n >= MAX_LOCAL_LEN)
      {
        PLogError("Word too long in transcription [%s]\n", transcription);
        return PARSE_FAIL;
      }
      if (word_start[n] == ' ' || word_start[n] == '\0')
      {
        word[n] = '\0';
        break;
      }
      word[n] = word_start[n];
      n++;
    }

    /* Known words are looked up by id.  Unknown ones go to the lookup that
     * works without slot annotation; note that it is handed the pointer into
     * the transcription, not the isolated copy. */
    id = wordmap_find_index(context->olabels, word);
    if (id >= MAXwordID)
    {
      matched = get_arc_for_word_without_slot_annotation(atok, word_start, context, context->beg_silence_word);
    }
    else
    {
      matched = get_arc_for_word(atok, id, context, context->beg_silence_word);
    }
    if (!matched) return PARSE_FAIL;

    /* does any successor of the matched arc carry the MAXwordID label? */
    succ = ARC_TOKEN_PTR(context->arc_token_list, matched->first_next_arc);
    while (succ != NULL)
    {
      if (succ->ilabel == MAXwordID) reaches_end = 1;
      succ = ARC_TOKEN_PTR(context->arc_token_list, succ->next_token_index);
    }

    /* the first word of the transcription was just handled: decide */
    if (word_start == transcription)
      return reaches_end ? PARSE_PASS : PARSE_FAIL;

    /* move tlen back to the space in front of the word just handled */
    for (tlen--; transcription[tlen] != ' ' && tlen > 0; tlen--)
    {
    }
    atok = matched;
  }
}
/*
 * Check one transcription against the context's arc-token graph.
 *
 * Trailing spaces are left out of the length handed to check_word_path();
 * matching starts from the first entry of context->arc_token_list.
 *
 * Returns whatever check_word_path() returns (PARSE_PASS or PARSE_FAIL).
 */
int FST_CheckPath_Simple(srec_context* context, const char* transcription)
{
  int len = strlen(transcription);

  /* ignore trailing blanks */
  while (len > 0 && transcription[len - 1] == ' ')
    len--;

  return check_word_path(context, &context->arc_token_list[0], transcription, len);
}
int FST_CheckPath_Complex(srec_context* context, const char* transcription,
char* literal, size_t max_literal_len)
{
int i, j, rc;
int num_spaces;
char copy_of_transcription[MAX_LOCAL_LEN];
char* spaces[24], *p; /* can't go too high here!! */
ASSERT(strlen(transcription) < MAX_LOCAL_LEN);
strcpy(copy_of_transcription, transcription);
for (num_spaces = 0, p = copy_of_transcription; *p; p++)
{
if (*p == ' ')
{
if ((size_t)num_spaces >= sizeof(spaces) / sizeof(char*))
{
PLogError("FST_CheckPath_Complex() failed on too many words\n");
return PARSE_FAIL;
}
spaces[num_spaces++] = p;
}
}
if (num_spaces == 0)
{
rc = FST_CheckPath_Simple(context, transcription);
if (rc == PARSE_PASS)
{
ASSERT(strlen(copy_of_transcription) < max_literal_len);
strcpy(literal, copy_of_transcription);
}
return rc;
}
for (i = 0; i < (1 << num_spaces); i++)
{
/* find the space pointers */
for (j = 0; j < num_spaces; j++)
*spaces[j] = i & (1 << j) ? '_' : ' ';
/* check each word, potentially within a rule! */
for (p = strtok(copy_of_transcription, " "); p; p = strtok(NULL, " "))
{
wordID k, wdid = wordmap_find_index(context->olabels, p);
if (wdid < MAXwordID) continue;
for (k = 1; k < context->olabels->num_slots; k++)
{
wdid = wordmap_find_index_in_rule(context->olabels, p, k);
if (wdid < MAXwordID) break;
}
if (wdid == MAXwordID)
goto next_i;
}
/* fix the nulls back */
for (j = 0; j < num_spaces; j++)
*spaces[j] = i & (1 << j) ? '_' : ' ';
rc = FST_CheckPath_Simple(context, copy_of_transcription);
if (rc == PARSE_PASS)
{
ASSERT(strlen(copy_of_transcription) < max_literal_len);
strcpy(literal, copy_of_transcription);
return rc;
}
next_i:
continue;
}
return PARSE_FAIL;
}
static void clean_up_sentence(char* s);
int FST_CheckPath(srec_context* context, const char* transcription,
char* literal, size_t max_literal_len)
{
char mytranscription[256];
passert(strlen(transcription) < sizeof(mytranscription));
strcpy(mytranscription, transcription);
clean_up_sentence(mytranscription);
if (!context->arc_token_list)
return 2;
else
return FST_CheckPath_Complex(context, mytranscription, literal, max_literal_len);
}
/*
 * Normalise a sentence in place:
 *   - every "[...]" speech code, brackets included, is treated as blanks
 *     (an unmatched '[' swallows the rest of the string, a stray ']' counts
 *     as one blank);
 *   - leading and trailing spaces are removed;
 *   - each run of spaces between words is reduced to a single space.
 * Only the space character ' ' is treated as a blank.
 *
 * The result is never longer than the input, and nothing at or beyond the
 * original terminator is read past or written.
 */
static void clean_up_sentence(char* s)
{
  const char* src;
  char* dst = s;
  int in_code = 0;       /* currently between '[' and its ']' */
  int pending_space = 0; /* blank(s) seen since the last character written */

  for (src = s; *src; src++)
  {
    char c = *src;

    /* change speech codes to spaces */
    if (in_code)
    {
      if (c == ']') in_code = 0;
      c = ' ';
    }
    else if (c == '[')
    {
      in_code = 1;
      c = ' ';
    }
    else if (c == ']')
    {
      c = ' ';
    }

    if (c == ' ')
    {
      pending_space = 1;
      continue;
    }

    /* emit one separator between words, none before the first word */
    if (pending_space && dst != s) *dst++ = ' ';
    pending_space = 0;
    *dst++ = c;
  }

  /* blanks still pending here are trailing ones: drop them */
  *dst = '\0';
}