/*---------------------------------------------------------------------------*
* priority_q.c *
* *
* Copyright 2007, 2008 Nuance Communications, Inc. *
* *
* Licensed under the Apache License, Version 2.0 (the 'License'); *
* you may not use this file except in compliance with the License. *
* *
* You may obtain a copy of the License at *
* http://www.apache.org/licenses/LICENSE-2.0 *
* *
* Unless required by applicable law or agreed to in writing, software *
* distributed under the License is distributed on an 'AS IS' BASIS, *
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
* See the License for the specific language governing permissions and *
* limitations under the License. *
* *
*---------------------------------------------------------------------------*/
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "passert.h"
#include "portable.h"
#include "hmm_desc.h"
#include "utteranc.h"
#include "hmmlib.h"
#include "srec_sizes.h"
#include "search_network.h"
#include "srec.h"
#include "word_lattice.h"
#define PRINT_SEARCH_DETAILS 0
/*this is just implemented as a list so far - FIX this!!*/
/*allocates a priority_q to handle max_n entries*/
priority_q* allocate_priority_q(int max_n)
{
priority_q *pq;
pq = (priority_q*) CALLOC(1, sizeof(priority_q), "search.srec.priority_q");
pq->max_cost_in_q = MAXcostdata;
  pq->word_token_list = MAXwtokenID;
pq->max_in_q = (miscdata)max_n;
pq->num_in_q = 0;
return pq;
}
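/* A minimal lifecycle sketch (illustrative only; the helper below is
   hypothetical and not a caller quoted from this codebase): allocate once
   per search, clear between uses, free at shutdown. */
#if 0
static void example_priority_q_lifecycle(void)
{
  priority_q* pq = allocate_priority_q(10); /* keep at most 10 word tokens */
  clear_priority_q(pq);                     /* empty it before reuse */
  free_priority_q(pq);                      /* release the queue itself */
}
#endif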
void free_priority_q(priority_q* pq)
{
FREE(pq);
}
/*empties out priority_q*/
void clear_priority_q(priority_q *pq)
{
pq->max_cost_in_q = MAXcostdata;
  pq->word_token_list = MAXwtokenID;
pq->num_in_q = 0;
/* Jean: what about the list of free tokens? */
}
/* returns the head of a linked list of all words in the priority_q.
   Returns MAXwtokenID if the list is empty */
wtokenID get_word_token_list(priority_q *pq, word_token *word_token_array)
{
return pq->word_token_list;
}
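/* A traversal sketch for the returned list (the counting helper is
   hypothetical, shown only to illustrate the linked-list convention:
   indices chain through next_token_index and terminate at MAXwtokenID): */
#if 0
static int example_count_words_in_q(priority_q* pq, word_token* word_token_array)
{
  int n = 0;
  wtokenID token_index = get_word_token_list(pq, word_token_array);
  while (token_index != MAXwtokenID)
  {
    n++;
    token_index = word_token_array[token_index].next_token_index;
  }
  return n;
}
#endif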
/* removes from the priority_q every word token that does not end at
   end_node, freeing each one, and recomputes num_in_q and max_cost_in_q
   along the way */
void remove_non_end_word_from_q(srec *rec, priority_q *pq, word_token *word_token_array, nodeID end_node)
{
word_token *token;
wtokenID *ptoken_index;
wtokenID old_token_index;
pq->max_cost_in_q = MAXcostdata;
pq->num_in_q = 0;
ptoken_index = &(pq->word_token_list);
while (*ptoken_index != MAXwtokenID)
{
token = &(word_token_array[*ptoken_index]);
if (token->end_node != end_node)
{
old_token_index = *ptoken_index;
*ptoken_index = token->next_token_index;
free_word_token(rec, old_token_index);
pq->max_cost_in_q = MAXcostdata; /* fix: sep9 */
}
else
{
pq->num_in_q++;
if ((pq->max_cost_in_q == MAXcostdata) || (token->cost > pq->max_cost_in_q))
{
pq->max_cost_in_q = token->cost;
}
ptoken_index = &(token->next_token_index);
}
}
}
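/* The loop above walks the list with a pointer-to-pointer cursor so that
   unlinking works identically at the head and in the interior. A toy sketch
   of the same idiom (hypothetical node type, for illustration only; nodes
   are assumed to be owned and freed elsewhere): */
#if 0
struct toy_node { int keep; struct toy_node* next; };
static void toy_filter(struct toy_node** phead)
{
  struct toy_node** pp = phead;
  while (*pp != NULL)
  {
    if (!(*pp)->keep)
      *pp = (*pp)->next;   /* unlink the node by rewriting the cursor */
    else
      pp = &(*pp)->next;   /* keep it and advance the cursor */
  }
}
#endif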
/* returns 0 if the two word tokens carry the same word with the same
   (packed) history, and 1 otherwise */
int compare_histories(word_token* token1, word_token* token2,
                      word_token* word_token_array)
{
int history_for_token1 = 0;
int history_for_token2 = 0;
  /* compare_histories() was an attempt to be smart about the priority_q:
     we don't need to store two word_tokens when they are the same word
     (necessarily ending at the same frame) with the same word history.
     This happens for a digit that has multiple end nodes due to
     context-dependency. If "history_for_token" ignored the end_node, we
     would be all clear to keep just 1 word_token, but we would also have
     to continue propagating all paths from the other end nodes; that bit
     of "continue propagating" is not done. THE OTHER PROBLEM is that the
     two nodes may NOT simply be different CD end models; they may differ
     because of digit shifting! We're screwed if we drop the path, unless
     we compare all the way back to the start of the utterance. */
if (token1->word != token2->word)
return 1;
if (token1->end_node != token2->end_node)
return 1;
  if (token1->backtrace != MAXwtokenID)
{
history_for_token1 += token1->end_node * 1000000;
history_for_token1 += word_token_array[token1->backtrace].word * 10000;
history_for_token1 += word_token_array[token1->backtrace].end_time;
}
  if (token2->backtrace != MAXwtokenID)
{
history_for_token2 += token2->end_node * 1000000;
history_for_token2 += word_token_array[token2->backtrace].word * 10000;
history_for_token2 += word_token_array[token2->backtrace].end_time;
}
#if PRINT_SEARCH_DETAILS
printf("comparing history_for_token1 %d history_for_token2 %d\n",
history_for_token1, history_for_token2);
#endif
if (history_for_token1 == history_for_token2)
{
return 0;
}
else
{
return 1;
}
}
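/* Worked example of the packed history key above (numbers hypothetical):
   a token whose end_node is 3, whose backtrace word is 7, and whose
   backtrace end_time is 42 gets 3*1000000 + 7*10000 + 42 = 3070042.
   Distinct histories can collide once a word id reaches 100 or an end_time
   reaches 10000, so the key is a heuristic, not an exact history
   comparison. */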
#if PRINT_SEARCH_DETAILS
void sanity_check_priority_q(priority_q* pq, word_token *word_token_array)
{
int n = 0;
wtokenID token_index;
word_token* token;
n = 0;
token_index = pq->word_token_list;
  while (token_index != MAXwtokenID)
{
token = &(word_token_array[token_index]);
token_index = token->next_token_index;
n++;
}
ASSERT(n == pq->num_in_q);
  if (pq->num_in_q == pq->max_in_q)
  {
    /* the list is kept sorted with the worst (highest) cost at its head,
       so a full queue's max_cost_in_q must equal the head token's cost */
    token = &(word_token_array[pq->word_token_list]);
    ASSERT(pq->max_cost_in_q == token->cost);
  }
}
#endif
/* adds a word token to the priority_q and returns the index of the word
   token to remove.
   If nothing needs to be removed, returns MAXwtokenID.
   If there is no room on the priority_q, returns the token being put on */
wtokenID add_word_token_to_priority_q(priority_q *pq, wtokenID token_index_to_add, word_token *word_token_array)
{
word_token *token;
word_token *token_to_add;
wtokenID token_index, return_token_index;
wordID word_to_add;
costdata cost_to_add;
wtokenID *ptoken_index;
wtokenID *pplace_to_add;
wtokenID *pdelete_index;
word_token *token_to_delete;
token_to_add = &(word_token_array[token_index_to_add]);
cost_to_add = token_to_add->cost;
#if PRINT_SEARCH_DETAILS
printf("WORDADD PQ tokenid %d cost %d\n", token_index_to_add, cost_to_add);
token_index = pq->word_token_list;
  while (token_index != MAXwtokenID)
{
token = &(word_token_array[token_index]);
printf("WORDADD PQ token %d word %d cost %d\n", token_index, token->word, token->cost);
token_index = token->next_token_index;
}
#endif
if (cost_to_add >= pq->max_cost_in_q && pq->num_in_q >= pq->max_in_q)
{
#if PRINT_SEARCH_DETAILS
printf("WORDADD PQ - rejecting because cost too high cost_to_add(%d) max_cost_in_q(%d) num_in_q(%d)\n",
cost_to_add, pq->max_cost_in_q, pq->num_in_q);
#endif
#if PRINT_SEARCH_DETAILS
printf("WORDADD PQ (D) returning %d\n", token_index_to_add);
sanity_check_priority_q(pq, word_token_array);
#endif
return token_index_to_add;
}
word_to_add = token_to_add->word;
/* search for duplicate words first */
ptoken_index = &(pq->word_token_list);
pplace_to_add = NULL;
pdelete_index = NULL;
  while ((*ptoken_index) != MAXwtokenID)
{
token = &word_token_array[(*ptoken_index)];
if (token->word == token_to_add->word
&& !compare_histories(token, token_to_add, word_token_array))
{
if (token->cost < cost_to_add)
{
        /* don't bother adding: there's another one like it on the list,
           with a better score! */
#if PRINT_SEARCH_DETAILS
printf("WORDADD PQ - rejecting because another like it is on the list\n");
#endif
      /* TODO: when returning on the basis that something else is better,
         we should let the caller know what to use instead, i.e. make the
         distinction between no-space and something-else-better */
token = &word_token_array[ token_index_to_add];
token->next_token_index = (*ptoken_index);
return token_index_to_add;
}
else
{
/* ok, replace the one on the list with this better scoring one! */
pdelete_index = ptoken_index;
}
}
if (token->cost < cost_to_add && pplace_to_add == NULL)
{
pplace_to_add = ptoken_index;
      /* do not break: we're still searching for possible duplicates */
}
ptoken_index = &(token->next_token_index);
}
if (!pplace_to_add)
pplace_to_add = ptoken_index;
/* add the token by inserting in the linked list */
token_index = *pplace_to_add;
*pplace_to_add = token_index_to_add;
token_to_add->next_token_index = token_index;
pq->num_in_q++;
if (pplace_to_add == &pq->word_token_list && pq->num_in_q >= pq->max_in_q)
pq->max_cost_in_q = cost_to_add;
/* now delete any duplicate that was found */
if (pdelete_index)
{
token_index = *pdelete_index;
token_to_delete = &word_token_array[ token_index];
*pdelete_index = token_to_delete->next_token_index;
pq->num_in_q--;
#if PRINT_SEARCH_DETAILS
printf("WORDADD PQ (B) returning %d\n", token_index);
#endif
return_token_index = token_index;
}
/* now check for max length in the queue */
if (pq->num_in_q > pq->max_in_q)
{ /* really expecting just 1 over */
token_index = pq->word_token_list;
token = &(word_token_array[ token_index]);
pq->num_in_q--;
pq->word_token_list = token->next_token_index;
#if PRINT_SEARCH_DETAILS
printf("WORDADD PQ (C) returning %d\n", token_index);
#endif
return_token_index = token_index;
}
else
{
return_token_index = MAXwtokenID;
}
if (pq->num_in_q >= pq->max_in_q)
{
token_index = pq->word_token_list;
token = &(word_token_array[token_index]);
pq->max_cost_in_q = token->cost;
}
else
{ /* pq->num_in_q < pq->max_in_q, fixed sep9 */
pq->max_cost_in_q = MAXcostdata;
}
#if PRINT_SEARCH_DETAILS
printf("WORDADD PQ (A) returning %d\n", token_index);
sanity_check_priority_q(pq, word_token_array);
#endif
return return_token_index;
}
/* returns the cost threshold for the end of the priority queue;
   if a word has a cost greater than this, there is no need to try to
   put it on the queue */
costdata get_priority_q_threshold(priority_q *pq, word_token *word_token_array)
{
return pq->max_cost_in_q;
}
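/* Typical call pattern for the two functions above (a sketch of an assumed
   caller; example_add_word and its arguments are hypothetical, and real
   callers in srec may differ): check the threshold first, then add, then
   free whatever index the queue hands back. */
#if 0
static void example_add_word(srec* rec, priority_q* pq,
                             word_token* word_token_array,
                             wtokenID new_index)
{
  wtokenID evicted;
  costdata new_cost = word_token_array[new_index].cost;
  /* cheap pre-check: a full queue cannot accept a cost at or above the
     current worst cost */
  if (new_cost >= get_priority_q_threshold(pq, word_token_array))
  {
    free_word_token(rec, new_index);
    return;
  }
  evicted = add_word_token_to_priority_q(pq, new_index, word_token_array);
  if (evicted != MAXwtokenID)
    free_word_token(rec, evicted); /* may be new_index itself if rejected */
}
#endif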