srec/crec/text_parser.c - platform/external/srec - Git at Google

 /*---------------------------------------------------------------------------*
  *  text_parser.c  *
  *                                                                           *
  *  Copyright 2007, 2008 Nuance Communciations, Inc.                               *
  *                                                                           *
  *  Licensed under the Apache License, Version 2.0 (the 'License');          *
  *  you may not use this file except in compliance with the License.         *
  *                                                                           *
  *  You may obtain a copy of the License at                                  *
  *      http://www.apache.org/licenses/LICENSE-2.0                           *
  *                                                                           *
  *  Unless required by applicable law or agreed to in writing, software      *
  *  distributed under the License is distributed on an 'AS IS' BASIS,        *
  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
  *  See the License for the specific language governing permissions and      *
  *  limitations under the License.                                           *
  *                                                                           *
  *---------------------------------------------------------------------------*/

 #include"pstdio.h"
 #include"srec_context.h"
 #include"astar.h"

 #include "passert.h"
 #include "portable.h"


 /* Size, in bytes, of the fixed local character buffers declared in this file. */
 #define MAX_LOCAL_LEN 256
 /* Result codes returned by the path-checking functions in this file. */
 #define PARSE_PASS 0
 #define PARSE_FAIL 1


 /*
  * Matches the words of transcription[0 .. tlen) against the arc-token graph,
  * consuming them from the LAST word back to the FIRST.
  *
  * For every word, the arc leaving the current token is looked up either by
  * word ID (when wordmap_find_index() knows the word) or through
  * get_arc_for_word_without_slot_annotation() otherwise.
  *
  * context        recognition context supplying olabels, arc_token_list and
  *                beg_silence_word
  * atok           arc token the walk starts from
  * transcription  blank-separated words; must be shorter than
  *                MAX_LOCAL_LEN - 1 characters
  * tlen           number of leading characters of transcription to consider
  *
  * Returns PARSE_PASS when every word matched and, after the first word of
  * the transcription was consumed, one of the successors of the matched arc
  * has ilabel == MAXwordID; returns PARSE_FAIL otherwise.
  */
 static int check_word_path(srec_context* context, arc_token* atok,
                            const char* transcription, int tlen)
 {
   if (strlen(transcription) >= MAX_LOCAL_LEN - 1)
   {
     PLogError("Transcription too long [%s]\n", transcription);
     return PARSE_FAIL;
   }

   for (;;)
   {
     char        word_buf[MAX_LOCAL_LEN];
     const char* word_start = transcription;
     arc_token*  matched;
     arc_token*  succ;
     wordID      word_id;
     size_t      n;
     int         reaches_final = 0;

     /* locate the first character of the last word inside [0, tlen) */
     if (tlen > 0)
     {
       word_start = transcription + tlen - 1;
       while (word_start > transcription)
       {
         if (*word_start == ' ')
         {
           word_start++;
           break;
         }
         word_start--;
       }
     }

     /* copy that word, stopping at the next blank or at the terminator */
     for (n = 0; ; n++)
     {
       if (n >= MAX_LOCAL_LEN)
       {
         PLogError("Word too long in transcription [%s]\n", transcription);
         return PARSE_FAIL;
       }
       if (word_start[n] == ' ' || word_start[n] == '\0')
       {
         word_buf[n] = '\0';
         break;
       }
       word_buf[n] = word_start[n];
     }

     /* follow the arc for this word */
     word_id = wordmap_find_index(context->olabels, word_buf);
     if (word_id >= MAXwordID)
     {
       /* unknown as a plain word: the un-copied text is handed to the helper */
       matched = get_arc_for_word_without_slot_annotation(atok, word_start, context, context->beg_silence_word);
     }
     else
     {
       matched = get_arc_for_word(atok, word_id, context, context->beg_silence_word);
     }
     if (matched == NULL)
       return PARSE_FAIL;

     /* does any successor of the matched arc carry ilabel == MAXwordID? */
     succ = ARC_TOKEN_PTR(context->arc_token_list, matched->first_next_arc);
     while (succ != NULL)
     {
       if (succ->ilabel == MAXwordID)
         reaches_final = 1;
       succ = ARC_TOKEN_PTR(context->arc_token_list, succ->next_token_index);
     }

     /* the first word of the transcription was just consumed: decide */
     if (word_start == transcription)
       return reaches_final ? PARSE_PASS : PARSE_FAIL;

     /* shrink the window so it ends on the blank preceding this word */
     tlen--;
     while (transcription[tlen] != ' ' && tlen > 0)
       tlen--;

     atok = matched;
   }
 }

 /*
  * Checks transcription against the grammar exactly as given.
  *
  * Trailing blanks are excluded from the length handed to check_word_path(),
  * and the walk starts at the first element of context->arc_token_list.
  *
  * Returns whatever check_word_path() returns (PARSE_PASS or PARSE_FAIL).
  */
 int FST_CheckPath_Simple(srec_context* context, const char* transcription)
 {
   int len = strlen(transcription);

   /* ignore trailing blanks */
   while (len > 0 && transcription[len - 1] == ' ')
     len--;

   return check_word_path(context, &context->arc_token_list[0], transcription, len);
 }

 int FST_CheckPath_Complex(srec_context* context, const char* transcription,
                           char* literal, size_t max_literal_len)
 {
   int i, j, rc;
   int num_spaces;
   char copy_of_transcription[MAX_LOCAL_LEN];
   char* spaces[24], *p; /* can't go too high here!! */
   ASSERT(strlen(transcription) < MAX_LOCAL_LEN);

   strcpy(copy_of_transcription, transcription);
   for (num_spaces = 0, p = copy_of_transcription; *p; p++)
   {
     if (*p == ' ')
     {
       if ((size_t)num_spaces >= sizeof(spaces) / sizeof(char*))
       {
         PLogError("FST_CheckPath_Complex() failed on too many words\n");
         return PARSE_FAIL;
       }
       spaces[num_spaces++] = p;
     }
   }

   if (num_spaces == 0)
   {
     rc = FST_CheckPath_Simple(context, transcription);
     if (rc == PARSE_PASS)
     {
       ASSERT(strlen(copy_of_transcription) < max_literal_len);
       strcpy(literal, copy_of_transcription);
     }
     return rc;
   }

   for (i = 0; i < (1 << num_spaces); i++)
   {
     /* find the space pointers */
     for (j = 0; j < num_spaces; j++)
       *spaces[j] = i & (1 << j) ? '_' : ' ';
     /* check each word, potentially within a rule! */
     for (p = strtok(copy_of_transcription, " "); p; p = strtok(NULL, " "))
     {
       wordID k, wdid = wordmap_find_index(context->olabels, p);
       if (wdid < MAXwordID) continue;
       for (k = 1; k < context->olabels->num_slots; k++)
       {
         wdid = wordmap_find_index_in_rule(context->olabels, p, k);
         if (wdid < MAXwordID) break;
       }
       if (wdid == MAXwordID)
         goto next_i;
     }
     /* fix the nulls back */
     for (j = 0; j < num_spaces; j++)
       *spaces[j] = i & (1 << j) ? '_' : ' ';
     rc = FST_CheckPath_Simple(context, copy_of_transcription);
     if (rc == PARSE_PASS)
     {
       ASSERT(strlen(copy_of_transcription) < max_literal_len);
       strcpy(literal, copy_of_transcription);
       return rc;
     }
 next_i:
     continue;
   }
   return PARSE_FAIL;
 }

 /* Forward declaration: defined later in this file, called by FST_CheckPath(). */
 static void clean_up_sentence(char* s);

 /*
  * Entry point: normalizes a transcription and checks it against the grammar.
  *
  * The transcription is copied to a 256-byte local buffer (passert() requires
  * it to be shorter than that) and normalized with clean_up_sentence().
  *
  * Returns 2 when context->arc_token_list is null; otherwise the result of
  * FST_CheckPath_Complex() on the normalized text, which also fills literal.
  */
 int FST_CheckPath(srec_context* context, const char* transcription,
                   char* literal, size_t max_literal_len)
 {
   char cleaned[256];

   passert(strlen(transcription) < sizeof(cleaned));
   strcpy(cleaned, transcription);
   clean_up_sentence(cleaned);

   if (context->arc_token_list)
     return FST_CheckPath_Complex(context, cleaned, literal, max_literal_len);

   /* no arc-token list to check against */
   return 2;
 }

 /*
  * Normalizes a sentence in place:
  *   - every "[...]" speech code, brackets included, becomes blank; a stray
  *     ']' becomes blank as well;
  *   - leading and trailing blanks are removed;
  *   - runs of blanks are collapsed to a single blank.
  *
  * An unterminated '[' blanks everything up to the end of the string. The
  * previous implementation stepped past the terminating NUL in that case and
  * read (and could overwrite) memory beyond the string.
  *
  * s  NUL-terminated, writable string; the result is never longer than the
  *    input.
  */
 static void clean_up_sentence(char* s)
 {
   const char* src;
   char* dst = s;
   int in_code = 0;        /* inside a "[...]" speech code */
   int pending_blank = 0;  /* a blank is owed before the next kept character */

   for (src = s; *src; src++)
   {
     char c = *src;

     if (in_code)
     {
       if (c == ']') in_code = 0;
       c = ' ';
     }
     else if (c == '[')
     {
       in_code = 1;
       c = ' ';
     }
     else if (c == ']')
     {
       c = ' ';
     }

     if (c == ' ')
     {
       /* blanks before the first kept character are dropped altogether */
       if (dst != s) pending_blank = 1;
       continue;
     }

     /* dst never overtakes src, so compacting in place is safe */
     if (pending_blank)
     {
       *dst++ = ' ';
       pending_blank = 0;
     }
     *dst++ = c;
   }

   /* a blank still pending here was trailing: drop it */
   *dst = '\0';
 }
	/*---------------------------------------------------------------------------*
	 *  text_parser.c  *
	 *                                                                           *
	 *  Copyright 2007, 2008 Nuance Communciations, Inc.                               *
	 *                                                                           *
	 *  Licensed under the Apache License, Version 2.0 (the 'License');          *
	 *  you may not use this file except in compliance with the License.         *
	 *                                                                           *
	 *  You may obtain a copy of the License at                                  *
	 *      http://www.apache.org/licenses/LICENSE-2.0                           *
	 *                                                                           *
	 *  Unless required by applicable law or agreed to in writing, software      *
	 *  distributed under the License is distributed on an 'AS IS' BASIS,        *
	 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
	 *  See the License for the specific language governing permissions and      *
	 *  limitations under the License.                                           *
	 *                                                                           *
	 *---------------------------------------------------------------------------*/

	#include"pstdio.h"
	#include"srec_context.h"
	#include"astar.h"

	#include "passert.h"
	#include "portable.h"


	/* Size, in bytes, of the fixed local character buffers declared in this file. */
	#define MAX_LOCAL_LEN 256
	/* Result codes returned by the path-checking functions in this file. */
	#define PARSE_PASS 0
	#define PARSE_FAIL 1


	/*
	 * Matches the words of transcription[0 .. tlen) against the arc-token graph,
	 * consuming them from the LAST word back to the FIRST.
	 *
	 * For every word, the arc leaving the current token is looked up either by
	 * word ID (when wordmap_find_index() knows the word) or through
	 * get_arc_for_word_without_slot_annotation() otherwise.
	 *
	 * context        recognition context supplying olabels, arc_token_list and
	 *                beg_silence_word
	 * atok           arc token the walk starts from
	 * transcription  blank-separated words; must be shorter than
	 *                MAX_LOCAL_LEN - 1 characters
	 * tlen           number of leading characters of transcription to consider
	 *
	 * Returns PARSE_PASS when every word matched and, after the first word of
	 * the transcription was consumed, one of the successors of the matched arc
	 * has ilabel == MAXwordID; returns PARSE_FAIL otherwise.
	 *
	 * This copy of the function had lost its pointer operators and had its
	 * "||" operators escaped; they are restored here.
	 */
	static int check_word_path(srec_context* context, arc_token* atok,
	                           const char* transcription, int tlen)
	{
	  if (strlen(transcription) >= MAX_LOCAL_LEN - 1)
	  {
	    PLogError("Transcription too long [%s]\n", transcription);
	    return PARSE_FAIL;
	  }

	  for (;;)
	  {
	    char        word_buf[MAX_LOCAL_LEN];
	    const char* word_start = transcription;
	    arc_token*  matched;
	    arc_token*  succ;
	    wordID      word_id;
	    size_t      n;
	    int         reaches_final = 0;

	    /* locate the first character of the last word inside [0, tlen) */
	    if (tlen > 0)
	    {
	      word_start = transcription + tlen - 1;
	      while (word_start > transcription)
	      {
	        if (*word_start == ' ')
	        {
	          word_start++;
	          break;
	        }
	        word_start--;
	      }
	    }

	    /* copy that word, stopping at the next blank or at the terminator */
	    for (n = 0; ; n++)
	    {
	      if (n >= MAX_LOCAL_LEN)
	      {
	        PLogError("Word too long in transcription [%s]\n", transcription);
	        return PARSE_FAIL;
	      }
	      if (word_start[n] == ' ' || word_start[n] == '\0')
	      {
	        word_buf[n] = '\0';
	        break;
	      }
	      word_buf[n] = word_start[n];
	    }

	    /* follow the arc for this word */
	    word_id = wordmap_find_index(context->olabels, word_buf);
	    if (word_id >= MAXwordID)
	    {
	      /* unknown as a plain word: the un-copied text is handed to the helper */
	      matched = get_arc_for_word_without_slot_annotation(atok, word_start, context, context->beg_silence_word);
	    }
	    else
	    {
	      matched = get_arc_for_word(atok, word_id, context, context->beg_silence_word);
	    }
	    if (matched == NULL)
	      return PARSE_FAIL;

	    /* does any successor of the matched arc carry ilabel == MAXwordID? */
	    succ = ARC_TOKEN_PTR(context->arc_token_list, matched->first_next_arc);
	    while (succ != NULL)
	    {
	      if (succ->ilabel == MAXwordID)
	        reaches_final = 1;
	      succ = ARC_TOKEN_PTR(context->arc_token_list, succ->next_token_index);
	    }

	    /* the first word of the transcription was just consumed: decide */
	    if (word_start == transcription)
	      return reaches_final ? PARSE_PASS : PARSE_FAIL;

	    /* shrink the window so it ends on the blank preceding this word */
	    tlen--;
	    while (transcription[tlen] != ' ' && tlen > 0)
	      tlen--;

	    atok = matched;
	  }
	}

	/*
	 * Checks transcription against the grammar exactly as given.
	 *
	 * Trailing blanks are excluded from the length handed to check_word_path(),
	 * and the walk starts at the first element of context->arc_token_list.
	 *
	 * Returns whatever check_word_path() returns (PARSE_PASS or PARSE_FAIL).
	 */
	int FST_CheckPath_Simple(srec_context* context, const char* transcription)
	{
	  int len = strlen(transcription);

	  /* ignore trailing blanks */
	  while (len > 0 && transcription[len - 1] == ' ')
	    len--;

	  return check_word_path(context, &context->arc_token_list[0], transcription, len);
	}

	int FST_CheckPath_Complex(srec_context* context, const char* transcription,
	char* literal, size_t max_literal_len)
	{
	int i, j, rc;
	int num_spaces;
	char copy_of_transcription[MAX_LOCAL_LEN];
	char* spaces[24], p; / can't go too high here!! */
	ASSERT(strlen(transcription) < MAX_LOCAL_LEN);

	strcpy(copy_of_transcription, transcription);
	for (num_spaces = 0, p = copy_of_transcription; *p; p++)
	{
	if (*p == ' ')
	{
	if ((size_t)num_spaces >= sizeof(spaces) / sizeof(char*))
	{
	PLogError("FST_CheckPath_Complex() failed on too many words\n");
	return PARSE_FAIL;
	}
	spaces[num_spaces++] = p;
	}
	}

	if (num_spaces == 0)
	{
	rc = FST_CheckPath_Simple(context, transcription);
	if (rc == PARSE_PASS)
	{
	ASSERT(strlen(copy_of_transcription) < max_literal_len);
	strcpy(literal, copy_of_transcription);
	}
	return rc;
	}

	for (i = 0; i < (1 << num_spaces); i++)
	{
	/* find the space pointers */
	for (j = 0; j < num_spaces; j++)
	*spaces[j] = i & (1 << j) ? '_' : ' ';
	/* check each word, potentially within a rule! */
	for (p = strtok(copy_of_transcription, " "); p; p = strtok(NULL, " "))
	{
	wordID k, wdid = wordmap_find_index(context->olabels, p);
	if (wdid < MAXwordID) continue;
	for (k = 1; k < context->olabels->num_slots; k++)
	{
	wdid = wordmap_find_index_in_rule(context->olabels, p, k);
	if (wdid < MAXwordID) break;
	}
	if (wdid == MAXwordID)
	goto next_i;
	}
	/* fix the nulls back */
	for (j = 0; j < num_spaces; j++)
	*spaces[j] = i & (1 << j) ? '_' : ' ';
	rc = FST_CheckPath_Simple(context, copy_of_transcription);
	if (rc == PARSE_PASS)
	{
	ASSERT(strlen(copy_of_transcription) < max_literal_len);
	strcpy(literal, copy_of_transcription);
	return rc;
	}
	next_i:
	continue;
	}
	return PARSE_FAIL;
	}

	/* Forward declaration: defined later in this file, called by FST_CheckPath(). */
	static void clean_up_sentence(char* s);

	/*
	 * Entry point: normalizes a transcription and checks it against the grammar.
	 *
	 * The transcription is copied to a 256-byte local buffer (passert() requires
	 * it to be shorter than that) and normalized with clean_up_sentence().
	 *
	 * Returns 2 when context->arc_token_list is null; otherwise the result of
	 * FST_CheckPath_Complex() on the normalized text, which also fills literal.
	 */
	int FST_CheckPath(srec_context* context, const char* transcription,
	                  char* literal, size_t max_literal_len)
	{
	  char cleaned[256];

	  passert(strlen(transcription) < sizeof(cleaned));
	  strcpy(cleaned, transcription);
	  clean_up_sentence(cleaned);

	  if (context->arc_token_list)
	    return FST_CheckPath_Complex(context, cleaned, literal, max_literal_len);

	  /* no arc-token list to check against */
	  return 2;
	}

	/*
	 * Normalizes a sentence in place:
	 *   - every "[...]" speech code, brackets included, becomes blank; a stray
	 *     ']' becomes blank as well;
	 *   - leading and trailing blanks are removed;
	 *   - runs of blanks are collapsed to a single blank.
	 *
	 * An unterminated '[' blanks everything up to the end of the string and
	 * never steps past the terminating NUL.
	 *
	 * This copy of the function had lost its pointer dereferences (for example
	 * "p != ']'" and "q = *(q + 1)"); it is rewritten here as one in-place pass.
	 *
	 * s  NUL-terminated, writable string; the result is never longer than the
	 *    input.
	 */
	static void clean_up_sentence(char* s)
	{
	  const char* src;
	  char* dst = s;
	  int in_code = 0;        /* inside a "[...]" speech code */
	  int pending_blank = 0;  /* a blank is owed before the next kept character */

	  for (src = s; *src; src++)
	  {
	    char c = *src;

	    if (in_code)
	    {
	      if (c == ']') in_code = 0;
	      c = ' ';
	    }
	    else if (c == '[')
	    {
	      in_code = 1;
	      c = ' ';
	    }
	    else if (c == ']')
	    {
	      c = ' ';
	    }

	    if (c == ' ')
	    {
	      /* blanks before the first kept character are dropped altogether */
	      if (dst != s) pending_blank = 1;
	      continue;
	    }

	    /* dst never overtakes src, so compacting in place is safe */
	    if (pending_blank)
	    {
	      *dst++ = ' ';
	      pending_blank = 0;
	    }
	    *dst++ = c;
	  }

	  /* a blank still pending here was trailing: drop it */
	  *dst = '\0';
	}