blob: 70ddff311fbc54c259e9ee6656e141efda5602c3 [file] [log] [blame]
/*---------------------------------------------------------------------------*
* voc_read.c *
* *
* Copyright 2007, 2008 Nuance Communciations, Inc. *
* *
* Licensed under the Apache License, Version 2.0 (the 'License'); *
* you may not use this file except in compliance with the License. *
* *
* You may obtain a copy of the License at *
* http://www.apache.org/licenses/LICENSE-2.0 *
* *
* Unless required by applicable law or agreed to in writing, software *
* distributed under the License is distributed on an 'AS IS' BASIS, *
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
* See the License for the specific language governing permissions and *
* limitations under the License. *
* *
*---------------------------------------------------------------------------*/
#ifndef _RTT
#include <stdio.h>
#endif
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#if defined(__cplusplus) && defined(_MSC_VER)
extern "C"
{
#include <string.h>
}
#else
#include <string.h>
#endif
#include <sys/types.h>
#include <sys/stat.h>
#ifdef _WIN32
#define stat _stat
#else
#include <unistd.h>
#endif
#include <fcntl.h>
#include <sys/mman.h>
#include <zipfile/zipfile.h>
#include "hmmlib.h"
#include "duk_io.h"
#include "LCHAR.h"
#include "portable.h"
#include "memmove.h"
/* version-control identification string ($Id$ keyword) for this file */
static const char voc_read[] = "$Id: voc_read.c,v 1.14.6.18 2008/03/05 21:18:44 dahan Exp $";
/* non-zero when 'ch' is a line feed or a carriage return; 'ch' may be
 * evaluated twice, so the argument must be free of side effects */
#define cr_or_nl(ch) ((ch) == '\n' || (ch) == '\r')
#ifndef _RTT
/**
 * Read word models and their phoneme transcriptions from .ok or .voc files.
 *
 * The file is mapped with mmap_zip(). It must start with a "#LANG=<locale>"
 * header line, end with '\n' or '\r', and contain at least one entry after
 * the header. On success the mapping stays owned by 'voc' (release it with
 * delete_word_transcription()), voc->first_entry / voc->last_entry point at
 * the first and last entry lines, and voc->hasUpper tells whether the first
 * alphabetic entry starts with an upper case ASCII letter.
 *
 * @param basename path of the dictionary file (must be non-empty)
 * @param voc      vocabulary descriptor to fill in
 * @param locale   receives the locale named in the header
 * @return 0 on success, -1 on error (voc is reset via
 *         delete_word_transcription())
 */
int read_word_transcription(const LCHAR* basename, vocab_info* voc, ESR_Locale* locale)
{
  const char *ok;
  const char *end;
  void *data = NULL;
  size_t length = 0;
  ESR_ReturnCode rc;
  int result;
  int i;
  char token[256];

  ASSERT(voc);

  if (basename == NULL || strlen(basename) == 0) {
    PLogError("Error: invalid arg to read_word_transcription()\n");
    goto CLEANUP;
  }

  /* Map through locals of exactly the types mmap_zip() expects instead of
   * casting the addresses of the voc fields; this stays correct even if a
   * field is not declared as void* / size_t. */
  if (mmap_zip(basename, &data, &length)) {
    voc->ok_file_data = NULL;
    voc->ok_file_data_length = 0;
    PLogError("read_word_transcription: mmap_zip failed for %s\n", basename);
    goto CLEANUP;
  }
  voc->ok_file_data = (char*) data;
  voc->ok_file_data_length = length;

  /* an empty mapping has neither a header nor a last character to inspect */
  if (length == 0) {
    PLogError(L("read_word_transcription: %s is empty\n"), basename);
    goto CLEANUP;
  }

  /* set up pointers to walk the data */
  ok = voc->ok_file_data;
  end = ok + length;

  /* this assumption simplifies bounds checking when parsing */
  if (!cr_or_nl(end[-1])) {
    PLogError(L("read_word_transcription: last character in %s not newline\n"), basename);
    goto CLEANUP;
  }

  /* verify the header: copy the key up to '=' (the trailing line terminator
   * checked above bounds the scan, 'token' bounds the copy) */
  i = 0;
  while (*ok != '=') {
    if (cr_or_nl(*ok)) {
      PLogError(L("%s was missing '=' in #LANG=en-us header"), basename);
      goto CLEANUP;
    }
    if (i >= (int) sizeof(token) - 1) {
      PLogError(L("%s has an overlong #LANG=en-us header"), basename);
      goto CLEANUP;
    }
    token[i++] = *ok++;
  }
  token[i] = 0;
  ok++;
  CHKLOG(rc, lstrcasecmp(token, L("#lang"), &result));
  if (result != 0)
  {
    PLogError(L("%s was missing #LANG=en-us header"), basename);
    goto CLEANUP;
  }

  /* copy the locale name, which runs to the end of the header line */
  i = 0;
  while (!cr_or_nl(*ok)) {
    if (i >= (int) sizeof(token) - 1) {
      PLogError(L("%s has an overlong #LANG=en-us header"), basename);
      goto CLEANUP;
    }
    token[i++] = *ok++;
  }
  token[i] = 0;
  ok++;
  CHKLOG(rc, ESR_str2locale(token, locale));

  /* set up first entry: it starts right after the first '\n'; the mapping is
   * not NUL terminated, so search by length rather than with strchr() */
  voc->first_entry = (char*) memchr(voc->ok_file_data, '\n', length);
  if (voc->first_entry == NULL) {
    PLogError(L("%s has no newline after the #LANG=en-us header"), basename);
    goto CLEANUP;
  }
  voc->first_entry++;
  if (voc->first_entry >= end) {
    /* header only: the backward scan below would run off the front */
    PLogError(L("%s contains no entries"), basename);
    goto CLEANUP;
  }

  /* set up last entry: walk back from the byte before the final terminator;
   * the header's '\n' sits at first_entry - 1 <= end - 2 and stops the scan */
  voc->last_entry = end - 2;
  while (*voc->last_entry != '\n') voc->last_entry--;
  voc->last_entry++;

  /* determine if there are any upper case entries: the first line starting
   * with 'A'..'Z' means yes, the first line starting above 'Z' means no */
  voc->hasUpper = 1;
  while (ok < end) {
    int ch = *ok;
    if ('A' <= ch && ch <= 'Z') {
      voc->hasUpper = 1;
      break;
    }
    else if ('Z' < ch) {
      voc->hasUpper = 0;
      break;
    }
    /* scan to the next entry; stay inside the mapping even when the file
     * ends with '\r' instead of '\n' */
    while (ok < end && *ok++ != '\n') ;
  }
  return 0;

CLEANUP:
  delete_word_transcription(voc);
  PLogError(L("read_word_transcription: failed to read '%s'"),
            basename != NULL ? basename : L("(null)"));
  return -1;
}
#endif
/*
 * Compare a search label with the word at the start of a dictionary entry.
 * The label is terminated with 0 and the entry terminated with ' '.
 *
 * Returns 0 when the label equals the entry's word, otherwise the difference
 * between the first mismatching characters (the label's terminator counts
 * as ' '), so the sign gives the ordering of label relative to entry.
 */
static int kompare(const char* label, const char* entry) {
  /* Stop at the label's terminator: without this test an entry holding a
   * NUL byte at the same position would carry the scan past the end of the
   * label. */
  while (*label != 0 && *label == *entry) {
    label++;
    entry++;
  }
  return (*label ? *label : ' ') - *entry;
}
/*
 * Look up every pronunciation of 'label' in the sorted dictionary held by
 * 'voc'.
 *
 * Entries are lines of the form "<word><spaces><pron>\n". The matching
 * pronunciations are written back to back into 'prons', each one terminated
 * with 0, and the whole list is terminated with one extra 0.
 *
 * voc       dictionary set up by read_word_transcription()
 * label     0-terminated word to look up
 * prons     destination buffer
 * prons_len size of 'prons' in bytes
 *
 * Returns the number of pronunciations found, 0 when the word is not in the
 * dictionary (nothing is written), or -1 when 'prons' is too small to hold
 * all pronunciations plus their terminators (the buffer may then hold
 * partial data).
 */
int get_prons(const vocab_info* voc, const char* label, char* prons, int prons_len) {
  int num_prons;
  const char* low;
  const char* middle;
  const char* high;

  /* dictionaries are usually lower case, so do this for speed */
  if (!voc->hasUpper && 'A' <= *label && *label <= 'Z') return 0;

  /* binary search to find matching entry */
  low = voc->first_entry;
  high = voc->last_entry;
  while (1) {
    int diff;

    /* pick a point in the middle and align to next entry; starting one byte
     * early means a midpoint that already sits on an entry start (preceded
     * by '\n') selects that entry */
    middle = low + ((high - low) >> 1) - 1;
    while (*middle++ != '\n') ;

    /* compare 'label' to 'middle' */
    diff = kompare(label, middle);
    if (diff == 0) break;

    /* nothing found */
    if (low == high) return 0;

    /* 'middle' aligned to 'high', so move 'high' down one entry */
    if (middle == high) {
      high -= 2;
      while (*high != '\n') high--;
      high++;
      continue;
    }

    if (diff > 0) low = middle;
    else high = middle;
  }

  /* back up to find the first entry equal to 'label' */
  low = middle;
  while (voc->first_entry < low) {
    const char* lo;
    for (lo = low - 2; *lo != '\n'; lo--) ;
    lo++;
    if (kompare(label, lo)) break;
    low = lo;
  }

  /* move forward to the last entry equal to 'label' */
  high = middle;
  while (high < voc->last_entry) {
    const char* hi;
    for (hi = high; *hi != '\n'; hi++) ;
    hi++;
    if (kompare(label, hi)) break;
    high = hi;
  }

  /* loop over all the entries; 'prons_len' tracks the bytes still free in
   * 'prons', counting terminators as well as pronunciation characters */
  num_prons = 0;
  while (low <= high) {
    /* scan over the label */
    while (*low++ != ' ') ;
    /* skip the whitespace */
    while (*low == ' ') low++;
    /* copy the pron, keeping room for its terminator and the list terminator */
    while (*low != '\n') {
      if (prons_len < 3) return -1;
      *prons++ = *low++;
      prons_len--;
    }
    /* terminate this pron, still keeping room for the list terminator */
    if (prons_len < 2) return -1;
    *prons++ = 0;
    prons_len--;
    low++;
    num_prons++;
  }
  /* the empty string marks the end of the list */
  *prons++ = 0;
  return num_prons;
}
/*
 * Release the dictionary mapping held by 'voc' (if any) and reset the
 * fields that describe it, so 'voc' no longer refers to unmapped memory.
 */
void delete_word_transcription(vocab_info* voc)
{
  ASSERT(voc);

  /* unmap first, while the pointer and length are still available */
  if (voc->ok_file_data != NULL) {
    munmap_zip(voc->ok_file_data, voc->ok_file_data_length);
    voc->ok_file_data = NULL;
  }
  voc->ok_file_data_length = 0;

  /* the entry pointers pointed into the mapping that was just released */
  voc->last_entry = NULL;
  voc->first_entry = NULL;
}
/**************************************************/
/* may want to move these functions to 'portable' */
/**************************************************/
/* Return 1 when 'string' ends with the suffix 'end', 0 otherwise. */
static int endeql(const char* string, const char* end) {
  const size_t string_len = strlen(string);
  const size_t suffix_len = strlen(end);

  /* a suffix longer than the string can never match */
  if (suffix_len > string_len) {
    return 0;
  }
  return strcmp(string + (string_len - suffix_len), end) == 0;
}
/*
 * decompress_entry requires an oversize destination buffer, so every
 * mapping in this file is sized with this helper: the payload size plus
 * one byte per 1000 payload bytes (rounded down) plus one more byte.
 */
static size_t inflateSize(size_t size) {
  const size_t slack = size / 1000 + 1;
  return size + slack;
}
/*
 * Map the contents of 'fname' into memory.
 *
 * A plain file is mapped privately, read/write. When 'fname' ends in ".zip"
 * the archive is mapped only temporarily: the entry named like the file
 * (directory and ".zip" suffix removed) is unzipped into anonymous memory
 * and that memory is returned instead.
 *
 * fname path of the file to map
 * buf   receives the start of the mapping
 * size  receives the payload size in bytes; the mapping itself is
 *       inflateSize(*size) bytes long, so release it with munmap_zip()
 *
 * Returns 0 on success; -1 on failure, with *buf and *size set to 0.
 */
int mmap_zip(const char* fname, void** buf, size_t* size) {
  int fd = -1;
  struct stat statbuf;
  zipfile_t zf = 0;
  zipentry_t ze = 0;
  char entryname[FILENAME_MAX];
  const char* leaf;
  size_t leaf_len;
  size_t size2 = 0;
  void* buf2 = MAP_FAILED;

  /* Start from a known state: the FAILED path unmaps *buf, and it must
   * never act on whatever value the caller happened to leave there. */
  *buf = 0;
  *size = 0;

  /* open data file, determine size, map it, and close fd */
  fd = open(fname, O_RDONLY);
  if (fd < 0) goto FAILED;

  /* determine length */
  if (fstat(fd, &statbuf) < 0) goto FAILED;

  /* mmap it */
  *size = statbuf.st_size;
  *buf = mmap(0, inflateSize(statbuf.st_size), PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
  if (*buf == MAP_FAILED) goto FAILED;

  /* close fd, since we can */
  close(fd);
  fd = -1;

  /* if not a zip file, we are done! */
  if (!endeql(fname, ".zip")) return 0;

  /* set up zipfiler */
  zf = init_zipfile(*buf, *size);
  if (!zf) goto FAILED;

  /* get entry: file name without directory and without ".zip"; endeql()
   * above guarantees the suffix is present, and the length check keeps the
   * copy inside 'entryname' */
  leaf = strrchr(fname, '/');
  leaf = leaf ? leaf + 1 : fname;
  leaf_len = strlen(leaf) - strlen(".zip");
  if (leaf_len >= sizeof(entryname)) goto FAILED;
  memcpy(entryname, leaf, leaf_len);
  entryname[leaf_len] = 0;
  ze = lookup_zipentry(zf, entryname);
  if (!ze) goto FAILED;

  /* mmap anon memory to hold unzipped entry */
  size2 = get_zipentry_size(ze);
  buf2 = mmap(0, inflateSize(size2), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0);
  if (buf2 == MAP_FAILED) goto FAILED;

  /* unzip entry */
  if (decompress_zipentry(ze, buf2, size2)) goto FAILED;

  /* release unzipper */
  release_zipfile(zf);
  zf = 0;

  /* release mmapped file */
  munmap(*buf, inflateSize(*size));

  /* set return values */
  *buf = buf2;
  *size = size2;
  return 0;

FAILED:
  if (fd != -1) close(fd);
  if (zf) release_zipfile(zf);
  /* only unmap regions that were actually mapped */
  if (buf2 != MAP_FAILED) munmap(buf2, inflateSize(size2));
  if (*buf && *buf != MAP_FAILED) munmap(*buf, inflateSize(*size));
  *buf = 0;
  *size = 0;
  return -1;
}
/*
 * Release a mapping obtained from mmap_zip(). 'size' is the payload size
 * mmap_zip() reported; the mapped length is derived from it with
 * inflateSize(), the same way mmap_zip() sized the mapping.
 * Returns the result of munmap().
 */
int munmap_zip(void* buf, size_t size) {
  const size_t mapped_len = inflateSize(size);
  return munmap(buf, mapped_len);
}