/* srec/clib/voc_read.c - platform/external/srec - Git at Google */

 /*---------------------------------------------------------------------------*
  *  voc_read.c  *
  *                                                                           *
  *  Copyright 2007, 2008 Nuance Communications, Inc.                               *
  *                                                                           *
  *  Licensed under the Apache License, Version 2.0 (the 'License');          *
  *  you may not use this file except in compliance with the License.         *
  *                                                                           *
  *  You may obtain a copy of the License at                                  *
  *      http://www.apache.org/licenses/LICENSE-2.0                           *
  *                                                                           *
  *  Unless required by applicable law or agreed to in writing, software      *
  *  distributed under the License is distributed on an 'AS IS' BASIS,        *
  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
  *  See the License for the specific language governing permissions and      *
  *  limitations under the License.                                           *
  *                                                                           *
  *---------------------------------------------------------------------------*/


 #ifndef _RTT
 #include <stdio.h>
 #endif
 #include <stdlib.h>
 #include <math.h>
 #include <assert.h>

 #if defined(__cplusplus) && defined(_MSC_VER)
 extern "C"
 {
 #include <string.h>
 }
 #else
 #include <string.h>
 #endif

 #include <sys/types.h>
 #include <sys/stat.h>
 #ifdef _WIN32
 #define stat _stat
 #else
 #include <unistd.h>
 #endif


 #include <fcntl.h>
 #include <sys/mman.h>

 #include <zipfile/zipfile.h>


 #include "hmmlib.h"
 #include "duk_io.h"
 #include "LCHAR.h"
 #include "portable.h"

 #include "memmove.h"

 /* Revision-control identification string for this file ($Id$ keyword). */
 static const char voc_read[] = "$Id: voc_read.c,v 1.14.6.18 2008/03/05 21:18:44 dahan Exp $";


 /* Non-zero when 'ch' is a line feed or a carriage return.
  * NOTE: 'ch' may be evaluated twice, so pass an expression without side effects. */
 #define cr_or_nl(ch) ((ch) == '\n' || (ch) == '\r')


 #ifndef _RTT

 /**
  *  Read word models and their phoneme transcriptions from a .ok or .voc file.
  *
  *  The file named by 'basename' is mapped with mmap_zip() and stays mapped in
  *  'voc' on success; release it with delete_word_transcription().  The first
  *  line must be a "#LANG=<locale>" header whose value is converted with
  *  ESR_str2locale() into '*locale'.  voc->first_entry and voc->last_entry are
  *  set to the start of the first and of the last line after the header.
  *  voc->hasUpper stays 1 unless a line starting with a character above 'Z' is
  *  reached before any line starting with 'A'..'Z'.
  *
  *  @param basename  path of the file to read; must be a non-empty string
  *  @param voc       vocabulary to fill in; must not be NULL
  *  @param locale    receives the locale parsed from the header
  *  @return 0 on success, -1 on error (the error is logged and 'voc' is reset
  *          through delete_word_transcription())
  */
 int read_word_transcription(const LCHAR* basename, vocab_info* voc, ESR_Locale* locale)
 {
   const char *data;
   const char *ok;
   const char *end;
   const char *nl;
   ESR_ReturnCode rc;
   int result;
   size_t i;
   char token[256];

   ASSERT(voc);

   if (basename == NULL || strlen(basename) == 0) {
     PLogError("Error: invalid arg to read_word_transcription()\n");
     goto CLEANUP;
   }

   if (mmap_zip(basename, (void**)&voc->ok_file_data, (size_t*)&voc->ok_file_data_length)) {
     PLogError("read_word_transcription: mmap_zip failed for %s\n", basename);
     goto CLEANUP;
   }

   /* an empty file has no last character to inspect */
   if (voc->ok_file_data_length == 0) {
     PLogError(L("read_word_transcription: %s is empty\n"), basename);
     goto CLEANUP;
   }

   /* requiring a trailing line break simplifies bounds checking when parsing */
   if (!cr_or_nl(voc->ok_file_data[voc->ok_file_data_length - 1])) {
     PLogError(L("read_word_transcription: last character in %s not newline\n"), basename);
     goto CLEANUP;
   }

   /* set up pointers to walk the data */
   data = voc->ok_file_data;
   ok = data;
   end = data + voc->ok_file_data_length;

   /* verify the header key; the trailing line break bounds this scan */
   i = 0;
   while (*ok != '=') {
     if (cr_or_nl(*ok)) {
       PLogError(L("%s was missing '=' in #LANG=en-us header"), basename);
       goto CLEANUP;
     }
     /* never write past 'token' (one byte is kept for the terminator) */
     if (i >= sizeof(token) - 1) {
       PLogError(L("%s has an over-long key in #LANG=en-us header"), basename);
       goto CLEANUP;
     }
     token[i++] = *ok++;
   }
   token[i] = 0;
   ok++;
   CHKLOG(rc, lstrcasecmp(token, L("#lang"), &result));
   if (result != 0)
   {
     PLogError(L("%s was missing #LANG=en-us header"), basename);
     goto CLEANUP;
   }

   /* copy the locale text that follows '=' up to the end of the line */
   i = 0;
   while (!cr_or_nl(*ok)) {
     if (i >= sizeof(token) - 1) {
       PLogError(L("%s has an over-long locale in #LANG=en-us header"), basename);
       goto CLEANUP;
     }
     token[i++] = *ok++;
   }
   token[i] = 0;
   ok++;
   CHKLOG(rc, ESR_str2locale(token, locale));

   /* The mapped data is not NUL terminated, so search for the end of the
    * header with an explicit length.  At least one byte must follow it,
    * otherwise there is no entry and the backward scan below would leave
    * the buffer. */
   nl = (const char*)memchr(data, '\n', voc->ok_file_data_length);
   if (nl == NULL || nl + 1 >= end) {
     PLogError(L("read_word_transcription: %s has no entries after the header\n"), basename);
     goto CLEANUP;
   }

   /* set up first and last entries */
   voc->first_entry = voc->ok_file_data + (nl + 1 - data);
   voc->last_entry = voc->ok_file_data + voc->ok_file_data_length - 2;
   while (*voc->last_entry != '\n') voc->last_entry--; /* header forces termination */
   voc->last_entry++;

   /* determine if there are any upper case entries */
   voc->hasUpper = 1;
   while (ok < end) {
     int ch = *ok;
     if ('A' <= ch && ch <= 'Z') {
       voc->hasUpper = 1;
       break;
     }
     else if ('Z' < ch) {
       voc->hasUpper = 0;
       break;
     }
     /* scan to the next entry without running past the end of the data */
     while (ok < end && *ok++ != '\n') ;
   }

   return 0;

 CLEANUP:
   delete_word_transcription(voc);

   /* 'basename' may be NULL when the argument check above failed */
   PLogError(L("read_word_transcription: failed to read '%s'"), basename ? basename : L("(null)"));

   return -1;
 }
 #endif

 /**
  *  Compare a NUL-terminated 'label' with a dictionary 'entry' whose label part
  *  is terminated with ' '.
  *
  *  @return 0 when 'label' ends exactly where 'entry' has its ' ';
  *          otherwise the difference between the first differing characters,
  *          with the end of 'label' treated as ' '.
  */
 static int kompare(const char* label, const char* entry) {
   /* stop at the label's terminator so a NUL byte in 'entry' can never
    * carry the scan past the end of 'label' */
   while (*label && *label == *entry) {
     label++;
     entry++;
   }
   return (*label ? *label : ' ') - *entry;
 }

 /**
  *  Collect every pronunciation stored for 'label'.
  *
  *  The entries between voc->first_entry and voc->last_entry are searched with
  *  a binary search (NOTE(review): this presumes the entries are sorted in the
  *  order kompare() defines - confirm against the dictionary format).  All
  *  entries whose label equals 'label' are used.  For each one the text after
  *  the label and its following spaces, up to the end of the line, is copied
  *  to 'prons' followed by a NUL; one extra NUL ends the list.
  *
  *  @param voc        vocabulary filled in by read_word_transcription()
  *  @param label      NUL-terminated word to look up
  *  @param prons      destination buffer
  *  @param prons_len  size of 'prons' in bytes
  *  @return the number of pronunciations copied, 0 when 'label' is not found,
  *          or -1 when 'prons' is too small ('prons' may be partly written)
  */
 int get_prons(const vocab_info* voc, const char* label, char* prons, int prons_len) {
   int num_prons;
   int diff;
   const char* low;
   const char* middle;
   const char* high;

   /* nothing can match when no vocabulary is loaded */
   if (voc->first_entry == NULL || voc->last_entry == NULL) return 0;

   /* dictionaries are usually lower case, so do this for speed */
   if (!voc->hasUpper && 'A' <= *label && *label <= 'Z') return 0;

   /* binary search to find matching entry */
   low = voc->first_entry;
   high = voc->last_entry;
   while (1) {
     /* pick a point in the middle and align to next entry */
     middle = low + ((high - low) >> 1) - 1;
     while (*middle++ != '\n') ;

     /* compare 'label' to 'middle' */
     diff = kompare(label, middle);
     if (diff == 0) break;

     /* nothing found */
     if (low == high) return 0;

     /* 'middle' aligned to 'high', so move 'high' down */
     if (middle == high) {
       high -= 2;
       while (*high != '\n') high--;
       high++;
       continue;
     }

     if (diff > 0) low = middle;
     else high = middle;
   }

   /* back up to find the first entry equal to 'label' */
   low = middle;
   while (voc->first_entry < low) {
     const char* lo;
     for (lo = low - 2; *lo != '\n'; lo--) ;
     lo++;
     if (kompare(label, lo)) break;
     low = lo;
   }

   /* move forward to the last entry equal to 'label' */
   high = middle;
   while (high < voc->last_entry) {
     const char* hi;
     for (hi = high; *hi != '\n'; hi++) ;
     hi++;
     if (kompare(label, hi)) break;
     high = hi;
   }

   /* loop over all the entries; 'prons_len' tracks the bytes still free */
   num_prons = 0;
   while (low <= high) {
     /* scan over the label */
     while (*low++ != ' ') ;

     /* skip the whitespace */
     while (*low == ' ') low++;

     /* copy the pron, keeping room for its NUL and for the final NUL */
     while (*low != '\n') {
       if (prons_len < 3) return -1;
       prons_len--;
       *prons++ = *low++;
     }

     /* the terminator takes a byte too; one more must remain for the end */
     if (prons_len < 2) return -1;
     prons_len--;
     *prons++ = 0;
     low++;
     num_prons++;
   }
   *prons++ = 0;

   return num_prons;
 }

 /**
  *  Release the dictionary data held by 'voc' and reset its entry pointers.
  *  The data is unmapped with munmap_zip() when present.
  */
 void delete_word_transcription(vocab_info* voc)
 {
   ASSERT(voc);

   /* forget the entry range first, it points into the mapping */
   voc->last_entry = 0;
   voc->first_entry = 0;

   if (voc->ok_file_data) {
     munmap_zip(voc->ok_file_data, voc->ok_file_data_length);
   }

   voc->ok_file_data_length = 0;
   voc->ok_file_data = NULL;
 }


 /**************************************************/
 /* may want to move these functions to 'portable' */
 /**************************************************/

 /* Returns 1 when 'string' ends with the text 'end', 0 otherwise. */
 static int endeql(const char* string, const char* end) {
   const size_t string_len = strlen(string);
   const size_t end_len = strlen(end);

   if (end_len > string_len) return 0;
   return !strcmp(string + (string_len - end_len), end);
 }

 /* decompress_entry requires an oversize destination buffer, so pad the
  * requested size by one byte per 1000 bytes plus one more byte. */
 static size_t inflateSize(size_t size) {
   const size_t slack = size / 1000 + 1;
   return size + slack;
 }

 /**
  *  Map a data file into memory, unzipping it first when its name ends in ".zip".
  *
  *  A file whose name does not end in ".zip" is mapped with MAP_PRIVATE and
  *  returned as is.  For a ".zip" name, the archive entry named like the file
  *  without its directory and without the ".zip" suffix is decompressed into an
  *  anonymous mapping, the file mapping is released, and the anonymous mapping
  *  is returned.
  *
  *  Mappings are created with length inflateSize(size); release the result with
  *  munmap_zip(*buf, *size).
  *
  *  @param fname  path of the file to map
  *  @param buf    receives the address of the mapping (0 on failure)
  *  @param size   receives the size of the data in bytes (0 on failure)
  *  @return 0 on success, -1 on failure
  */
 int mmap_zip(const char* fname, void** buf, size_t* size) {
     int fd = -1;
     struct stat statbuf;
     zipfile_t zf = 0;
     zipentry_t ze = 0;
     char entryname[FILENAME_MAX];
     const char* slash;
     const char* base;
     size_t baselen;
     /* Only mappings created here are tracked, so the failure path never
      * unmaps whatever the caller happened to have in *buf on entry. */
     void* filebuf = MAP_FAILED;
     size_t filesize = 0;
     void* buf2 = MAP_FAILED;
     size_t size2 = 0;

     if (fname == NULL || buf == NULL || size == NULL) return -1;

     /* open data file, determine size, map it, and close fd */
     fd = open(fname, O_RDONLY);
     if (fd < 0) goto FAILED;

     /* determine length */
     if (fstat(fd, &statbuf) < 0) goto FAILED;

     /* mmap it */
     filesize = statbuf.st_size;
     filebuf = mmap(0, inflateSize(filesize), PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
     if (filebuf == MAP_FAILED) goto FAILED;

     /* close fd, since we can */
     close(fd);
     fd = -1;

     /* if not a zip file, we are done! */
     if (!endeql(fname, ".zip")) {
         *buf = filebuf;
         *size = filesize;
         return 0;
     }

     /* set up zipfiler */
     zf = init_zipfile(filebuf, filesize);
     if (!zf) goto FAILED;

     /* get entry: file name without directory and without ".zip";
      * endeql() above guarantees the suffix is present */
     slash = strrchr(fname, '/');
     base = slash ? slash + 1 : fname;
     baselen = strlen(base) - strlen(".zip");
     if (baselen >= sizeof(entryname)) goto FAILED; /* name would not fit */
     memcpy(entryname, base, baselen);
     entryname[baselen] = 0;
     ze = lookup_zipentry(zf, entryname);
     if (!ze) goto FAILED;

     /* mmap anon memory to hold unzipped entry */
     size2 = get_zipentry_size(ze);
     buf2 = mmap(0, inflateSize(size2), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0);
     if (buf2 == MAP_FAILED) goto FAILED;

     /* unzip entry */
     if (decompress_zipentry(ze, buf2, size2)) goto FAILED;

     /* release unzipper */
     release_zipfile(zf);
     zf = 0;

     /* release mmapped file */
     munmap(filebuf, inflateSize(filesize));

     /* set return values */
     *buf = buf2;
     *size = size2;

     return 0;

 FAILED:
     if (fd != -1) close(fd);
     if (zf) release_zipfile(zf);
     if (buf2 != MAP_FAILED) munmap(buf2, inflateSize(size2));
     if (filebuf != MAP_FAILED) munmap(filebuf, inflateSize(filesize));
     *buf = 0;
     *size = 0;
     return -1;
 }

 /* Unmap a buffer obtained from mmap_zip(); 'size' is the size mmap_zip()
  * reported.  Returns the result of munmap(). */
 int munmap_zip(void* buf, size_t size) {
     const size_t mapped_length = inflateSize(size);

     return munmap(buf, mapped_length);
 }
	/*---------------------------------------------------------------------------*
	 *  voc_read.c                                                               *
	 *                                                                           *
	 *  Copyright 2007, 2008 Nuance Communications, Inc.                         *
	 *                                                                           *
	 *  Licensed under the Apache License, Version 2.0 (the 'License');          *
	 *  you may not use this file except in compliance with the License.         *
	 *                                                                           *
	 *  You may obtain a copy of the License at                                  *
	 *      http://www.apache.org/licenses/LICENSE-2.0                           *
	 *                                                                           *
	 *  Unless required by applicable law or agreed to in writing, software      *
	 *  distributed under the License is distributed on an 'AS IS' BASIS,        *
	 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
	 *  See the License for the specific language governing permissions and      *
	 *  limitations under the License.                                           *
	 *                                                                           *
	 *---------------------------------------------------------------------------*/


	#ifndef _RTT
	#include <stdio.h>
	#endif
	#include <stdlib.h>
	#include <math.h>
	#include <assert.h>

	#if defined(__cplusplus) && defined(_MSC_VER)
	extern "C"
	{
	#include <string.h>
	}
	#else
	#include <string.h>
	#endif

	#include <sys/types.h>
	#include <sys/stat.h>
	#ifdef _WIN32
	#define stat _stat
	#else
	#include <unistd.h>
	#endif


	#include <fcntl.h>
	#include <sys/mman.h>

	#include <zipfile/zipfile.h>


	#include "hmmlib.h"
	#include "duk_io.h"
	#include "LCHAR.h"
	#include "portable.h"

	#include "memmove.h"

	/* Revision-control identification string for this file ($Id$ keyword). */
	static const char voc_read[] = "$Id: voc_read.c,v 1.14.6.18 2008/03/05 21:18:44 dahan Exp $";


	/* Non-zero when 'ch' is a line feed or a carriage return.
	 * NOTE: 'ch' may be evaluated twice, so pass an expression without side effects. */
	#define cr_or_nl(ch) ((ch) == '\n' || (ch) == '\r')


	#ifndef _RTT

	/**
	 *  Read word models and their phoneme transcriptions from a .ok or .voc file.
	 *
	 *  The file named by 'basename' is mapped with mmap_zip() and stays mapped in
	 *  'voc' on success; release it with delete_word_transcription().  The first
	 *  line must be a "#LANG=<locale>" header whose value is converted with
	 *  ESR_str2locale() into '*locale'.  voc->first_entry and voc->last_entry are
	 *  set to the start of the first and of the last line after the header.
	 *  voc->hasUpper stays 1 unless a line starting with a character above 'Z' is
	 *  reached before any line starting with 'A'..'Z'.
	 *
	 *  @param basename  path of the file to read; must be a non-empty string
	 *  @param voc       vocabulary to fill in; must not be NULL
	 *  @param locale    receives the locale parsed from the header
	 *  @return 0 on success, -1 on error (the error is logged and 'voc' is reset
	 *          through delete_word_transcription())
	 */
	int read_word_transcription(const LCHAR* basename, vocab_info* voc, ESR_Locale* locale)
	{
	  const char *data;
	  const char *ok;
	  const char *end;
	  const char *nl;
	  ESR_ReturnCode rc;
	  int result;
	  size_t i;
	  char token[256];

	  ASSERT(voc);

	  if (basename == NULL || strlen(basename) == 0) {
	    PLogError("Error: invalid arg to read_word_transcription()\n");
	    goto CLEANUP;
	  }

	  if (mmap_zip(basename, (void**)&voc->ok_file_data, (size_t*)&voc->ok_file_data_length)) {
	    PLogError("read_word_transcription: mmap_zip failed for %s\n", basename);
	    goto CLEANUP;
	  }

	  /* an empty file has no last character to inspect */
	  if (voc->ok_file_data_length == 0) {
	    PLogError(L("read_word_transcription: %s is empty\n"), basename);
	    goto CLEANUP;
	  }

	  /* requiring a trailing line break simplifies bounds checking when parsing */
	  if (!cr_or_nl(voc->ok_file_data[voc->ok_file_data_length - 1])) {
	    PLogError(L("read_word_transcription: last character in %s not newline\n"), basename);
	    goto CLEANUP;
	  }

	  /* set up pointers to walk the data */
	  data = voc->ok_file_data;
	  ok = data;
	  end = data + voc->ok_file_data_length;

	  /* verify the header key; the trailing line break bounds this scan */
	  i = 0;
	  while (*ok != '=') {
	    if (cr_or_nl(*ok)) {
	      PLogError(L("%s was missing '=' in #LANG=en-us header"), basename);
	      goto CLEANUP;
	    }
	    /* never write past 'token' (one byte is kept for the terminator) */
	    if (i >= sizeof(token) - 1) {
	      PLogError(L("%s has an over-long key in #LANG=en-us header"), basename);
	      goto CLEANUP;
	    }
	    token[i++] = *ok++;
	  }
	  token[i] = 0;
	  ok++;
	  CHKLOG(rc, lstrcasecmp(token, L("#lang"), &result));
	  if (result != 0)
	  {
	    PLogError(L("%s was missing #LANG=en-us header"), basename);
	    goto CLEANUP;
	  }

	  /* copy the locale text that follows '=' up to the end of the line */
	  i = 0;
	  while (!cr_or_nl(*ok)) {
	    if (i >= sizeof(token) - 1) {
	      PLogError(L("%s has an over-long locale in #LANG=en-us header"), basename);
	      goto CLEANUP;
	    }
	    token[i++] = *ok++;
	  }
	  token[i] = 0;
	  ok++;
	  CHKLOG(rc, ESR_str2locale(token, locale));

	  /* The mapped data is not NUL terminated, so search for the end of the
	   * header with an explicit length.  At least one byte must follow it,
	   * otherwise there is no entry and the backward scan below would leave
	   * the buffer. */
	  nl = (const char*)memchr(data, '\n', voc->ok_file_data_length);
	  if (nl == NULL || nl + 1 >= end) {
	    PLogError(L("read_word_transcription: %s has no entries after the header\n"), basename);
	    goto CLEANUP;
	  }

	  /* set up first and last entries */
	  voc->first_entry = voc->ok_file_data + (nl + 1 - data);
	  voc->last_entry = voc->ok_file_data + voc->ok_file_data_length - 2;
	  while (*voc->last_entry != '\n') voc->last_entry--; /* header forces termination */
	  voc->last_entry++;

	  /* determine if there are any upper case entries */
	  voc->hasUpper = 1;
	  while (ok < end) {
	    int ch = *ok;
	    if ('A' <= ch && ch <= 'Z') {
	      voc->hasUpper = 1;
	      break;
	    }
	    else if ('Z' < ch) {
	      voc->hasUpper = 0;
	      break;
	    }
	    /* scan to the next entry without running past the end of the data */
	    while (ok < end && *ok++ != '\n') ;
	  }

	  return 0;

	CLEANUP:
	  delete_word_transcription(voc);

	  /* 'basename' may be NULL when the argument check above failed */
	  PLogError(L("read_word_transcription: failed to read '%s'"), basename ? basename : L("(null)"));

	  return -1;
	}
	#endif

	/**
	 *  Compare a NUL-terminated 'label' with a dictionary 'entry' whose label part
	 *  is terminated with ' '.
	 *
	 *  @return 0 when 'label' ends exactly where 'entry' has its ' ';
	 *          otherwise the difference between the first differing characters,
	 *          with the end of 'label' treated as ' '.
	 */
	static int kompare(const char* label, const char* entry) {
	  /* compare characters, not pointers; stop at the label's terminator so a
	   * NUL byte in 'entry' can never carry the scan past the end of 'label' */
	  while (*label && *label == *entry) {
	    label++;
	    entry++;
	  }
	  return (*label ? *label : ' ') - *entry;
	}

	/**
	 *  Collect every pronunciation stored for 'label'.
	 *
	 *  The entries between voc->first_entry and voc->last_entry are searched with
	 *  a binary search (NOTE(review): this presumes the entries are sorted in the
	 *  order kompare() defines - confirm against the dictionary format).  All
	 *  entries whose label equals 'label' are used.  For each one the text after
	 *  the label and its following spaces, up to the end of the line, is copied
	 *  to 'prons' followed by a NUL; one extra NUL ends the list.
	 *
	 *  @param voc        vocabulary filled in by read_word_transcription()
	 *  @param label      NUL-terminated word to look up
	 *  @param prons      destination buffer
	 *  @param prons_len  size of 'prons' in bytes
	 *  @return the number of pronunciations copied, 0 when 'label' is not found,
	 *          or -1 when 'prons' is too small ('prons' may be partly written)
	 */
	int get_prons(const vocab_info* voc, const char* label, char* prons, int prons_len) {
	  int num_prons;
	  int diff;
	  const char* low;
	  const char* middle;
	  const char* high;

	  /* nothing can match when no vocabulary is loaded */
	  if (voc->first_entry == NULL || voc->last_entry == NULL) return 0;

	  /* dictionaries are usually lower case, so do this for speed */
	  if (!voc->hasUpper && 'A' <= *label && *label <= 'Z') return 0;

	  /* binary search to find matching entry */
	  low = voc->first_entry;
	  high = voc->last_entry;
	  while (1) {
	    /* pick a point in the middle and align to next entry */
	    middle = low + ((high - low) >> 1) - 1;
	    while (*middle++ != '\n') ;

	    /* compare 'label' to 'middle' */
	    diff = kompare(label, middle);
	    if (diff == 0) break;

	    /* nothing found */
	    if (low == high) return 0;

	    /* 'middle' aligned to 'high', so move 'high' down */
	    if (middle == high) {
	      high -= 2;
	      while (*high != '\n') high--;
	      high++;
	      continue;
	    }

	    if (diff > 0) low = middle;
	    else high = middle;
	  }

	  /* back up to find the first entry equal to 'label' */
	  low = middle;
	  while (voc->first_entry < low) {
	    const char* lo;
	    for (lo = low - 2; *lo != '\n'; lo--) ;
	    lo++;
	    if (kompare(label, lo)) break;
	    low = lo;
	  }

	  /* move forward to the last entry equal to 'label' */
	  high = middle;
	  while (high < voc->last_entry) {
	    const char* hi;
	    for (hi = high; *hi != '\n'; hi++) ;
	    hi++;
	    if (kompare(label, hi)) break;
	    high = hi;
	  }

	  /* loop over all the entries; 'prons_len' tracks the bytes still free */
	  num_prons = 0;
	  while (low <= high) {
	    /* scan over the label */
	    while (*low++ != ' ') ;

	    /* skip the whitespace */
	    while (*low == ' ') low++;

	    /* copy the pron, keeping room for its NUL and for the final NUL */
	    while (*low != '\n') {
	      if (prons_len < 3) return -1;
	      prons_len--;
	      *prons++ = *low++;
	    }

	    /* the terminator takes a byte too; one more must remain for the end */
	    if (prons_len < 2) return -1;
	    prons_len--;
	    *prons++ = 0;
	    low++;
	    num_prons++;
	  }
	  *prons++ = 0;

	  return num_prons;
	}

	/**
	 *  Release the dictionary data held by 'voc' and reset its entry pointers.
	 *  The data is unmapped with munmap_zip() when present.
	 */
	void delete_word_transcription(vocab_info* voc)
	{
	  ASSERT(voc);

	  /* forget the entry range first, it points into the mapping */
	  voc->last_entry = 0;
	  voc->first_entry = 0;

	  if (voc->ok_file_data) {
	    munmap_zip(voc->ok_file_data, voc->ok_file_data_length);
	  }

	  voc->ok_file_data_length = 0;
	  voc->ok_file_data = NULL;
	}


	/**************************************************/
	/* may want to move these functions to 'portable' */
	/**************************************************/

	/* Returns 1 when 'string' ends with the text 'end', 0 otherwise. */
	static int endeql(const char* string, const char* end) {
	  const size_t string_len = strlen(string);
	  const size_t end_len = strlen(end);

	  if (end_len > string_len) return 0;
	  return !strcmp(string + (string_len - end_len), end);
	}

	/* decompress_entry requires an oversize destination buffer, so pad the
	 * requested size by one byte per 1000 bytes plus one more byte. */
	static size_t inflateSize(size_t size) {
	  const size_t slack = size / 1000 + 1;
	  return size + slack;
	}

	/**
	 *  Map a data file into memory, unzipping it first when its name ends in ".zip".
	 *
	 *  A file whose name does not end in ".zip" is mapped with MAP_PRIVATE and
	 *  returned as is.  For a ".zip" name, the archive entry named like the file
	 *  without its directory and without the ".zip" suffix is decompressed into an
	 *  anonymous mapping, the file mapping is released, and the anonymous mapping
	 *  is returned.
	 *
	 *  Mappings are created with length inflateSize(size); release the result with
	 *  munmap_zip(*buf, *size).
	 *
	 *  @param fname  path of the file to map
	 *  @param buf    receives the address of the mapping (0 on failure)
	 *  @param size   receives the size of the data in bytes (0 on failure)
	 *  @return 0 on success, -1 on failure
	 */
	int mmap_zip(const char* fname, void** buf, size_t* size) {
	    int fd = -1;
	    struct stat statbuf;
	    zipfile_t zf = 0;
	    zipentry_t ze = 0;
	    char entryname[FILENAME_MAX];
	    const char* slash;
	    const char* base;
	    size_t baselen;
	    /* Only mappings created here are tracked, so the failure path never
	     * unmaps whatever the caller happened to have in *buf on entry. */
	    void* filebuf = MAP_FAILED;
	    size_t filesize = 0;
	    void* buf2 = MAP_FAILED;
	    size_t size2 = 0;

	    if (fname == NULL || buf == NULL || size == NULL) return -1;

	    /* open data file, determine size, map it, and close fd */
	    fd = open(fname, O_RDONLY);
	    if (fd < 0) goto FAILED;

	    /* determine length */
	    if (fstat(fd, &statbuf) < 0) goto FAILED;

	    /* mmap it */
	    filesize = statbuf.st_size;
	    filebuf = mmap(0, inflateSize(filesize), PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
	    if (filebuf == MAP_FAILED) goto FAILED;

	    /* close fd, since we can */
	    close(fd);
	    fd = -1;

	    /* if not a zip file, we are done! */
	    if (!endeql(fname, ".zip")) {
	        *buf = filebuf;
	        *size = filesize;
	        return 0;
	    }

	    /* set up zipfiler */
	    zf = init_zipfile(filebuf, filesize);
	    if (!zf) goto FAILED;

	    /* get entry: file name without directory and without ".zip";
	     * endeql() above guarantees the suffix is present */
	    slash = strrchr(fname, '/');
	    base = slash ? slash + 1 : fname;
	    baselen = strlen(base) - strlen(".zip");
	    if (baselen >= sizeof(entryname)) goto FAILED; /* name would not fit */
	    memcpy(entryname, base, baselen);
	    entryname[baselen] = 0;
	    ze = lookup_zipentry(zf, entryname);
	    if (!ze) goto FAILED;

	    /* mmap anon memory to hold unzipped entry */
	    size2 = get_zipentry_size(ze);
	    buf2 = mmap(0, inflateSize(size2), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0);
	    if (buf2 == MAP_FAILED) goto FAILED;

	    /* unzip entry */
	    if (decompress_zipentry(ze, buf2, size2)) goto FAILED;

	    /* release unzipper */
	    release_zipfile(zf);
	    zf = 0;

	    /* release mmapped file */
	    munmap(filebuf, inflateSize(filesize));

	    /* set return values */
	    *buf = buf2;
	    *size = size2;

	    return 0;

	FAILED:
	    if (fd != -1) close(fd);
	    if (zf) release_zipfile(zf);
	    if (buf2 != MAP_FAILED) munmap(buf2, inflateSize(size2));
	    if (filebuf != MAP_FAILED) munmap(filebuf, inflateSize(filesize));
	    *buf = 0;
	    *size = 0;
	    return -1;
	}

	/* Unmap a buffer obtained from mmap_zip(); 'size' is the size mmap_zip()
	 * reported.  Returns the result of munmap(). */
	int munmap_zip(void* buf, size_t size) {
	    const size_t mapped_length = inflateSize(size);

	    return munmap(buf, mapped_length);
	}