blob: a6dd53928aa474f603b72612405c647145e2d70b [file] [log] [blame]
/*---------------------------------------------------------------------------*
* utteranc.h *
* *
* Copyright 2007, 2008 Nuance Communications, Inc. *
* *
* Licensed under the Apache License, Version 2.0 (the 'License'); *
* you may not use this file except in compliance with the License. *
* *
* You may obtain a copy of the License at *
* http://www.apache.org/licenses/LICENSE-2.0 *
* *
* Unless required by applicable law or agreed to in writing, software *
* distributed under the License is distributed on an 'AS IS' BASIS, *
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
* See the License for the specific language governing permissions and *
* limitations under the License. *
* *
*---------------------------------------------------------------------------*/
#ifndef _h_utteranc_
#define _h_utteranc_
/*
** When SET_RCSID is defined, every translation unit that includes this
** header gets its own file-local (static) copy of the header's RCS "$Id$"
** string. Without SET_RCSID nothing is defined here.
*/
#ifdef SET_RCSID
static const char utteranc_h[] = "$Id: utteranc.h,v 1.3.6.7 2007/08/31 17:44:53 dahan Exp $";
#endif
#include "all_defs.h"
#include "hmm_type.h"
#include "fpi_tgt.h"
#include "voicing.h"
#include "specnorm.h"
#include "channorm.h"
#include "swicms.h"
#ifndef _RTT
#include "duk_io.h"
#endif
/*
** Frame-buffer sizing constants, both expressed in frames.
** NOTE(review): presumably the usual values for the buffer_size and
** keep_frames arguments of init_utterance() -- confirm against callers.
*/
#define DEFAULT_BUFFER_SIZE 100 /* in frames */
#define KEEP_FRAMES 40 /* in frames, past frames kept */
/* Functions supported are
** new, delete (by source)
** open file/device, close file/device
** attach and detach sink
** read/store samples - including the header
*/
/**
 * One label record (the original comment calls this a "label structure").
 *
 * In this header it is the element type of the tag tables: the utb_table
 * member of utt_file_info and the tag/tag_base/table arguments of the
 * read_tcp, read_lst, read_utb_table, save_tcp, init_utb_file and
 * position_utb_file declarations.
 *
 * NOTE(review): the units of begin/end, the meaning of flag, and who owns
 * the two strings are not visible in this header -- confirm against the
 * code that fills these records.
 */
typedef struct {
    char *label;        /**< label text */
    long begin;         /**< start of the labelled span */
    long end;           /**< end of the labelled span */
    char *extra;        /**< additional text attached to the label */
    unsigned char flag; /**< per-label flag byte */
} annotate;
/**
 * Members shared by every utterance flavour.
 *
 * live_utterance_info and (when _RTT is not defined) file_utterance_info
 * start with these same members in the same order, and all of them are
 * overlaid in the utterance_info union, so any change here has to be
 * repeated in those structs.
 *
 * NOTE(review): field semantics are set by code outside this header; the
 * per-member notes below either restate the original comments or are
 * limited to what the declarations show.
 */
typedef struct {
    int utt_type;                            /**< utterance kind tag (same leading member as the union's utt_type) */
    int dim;                                 /**< presumably the feature dimension -- TODO confirm */
    fepFramePkt *frame;                      /**< frame packet used by this utterance */
    int num_chan;
    int do_channorm;
    spect_dist_info **spchchan;              /**< Mirrored from the Wave object */
    norm_info *channorm;                     /**< Mirrored from the Wave object */
    swicms_norm_info *swicms;                /**< copy of wave obj pointer */
    spect_dist_info *backchan[MAX_CHAN_DIM]; /**< one pointer per channel dimension slot */
    featdata *last_push;
    int voice_duration;                      /**< same names as the set_voicing_durations() arguments */
    int quiet_duration;
    int unsure_duration;
    int start_windback;
} utt_generic_info;
#ifndef _RTT
/**
 * Description of the file behind a file-based utterance.
 *
 * Only declared when _RTT is not defined. Embedded as the trailing "file"
 * member of file_utterance_info.
 *
 * NOTE(review): the unit of len and the ownership of file/utb_table are
 * not visible in this header -- confirm against the file I/O code.
 */
typedef struct {
    char typ;             /**< s (16 bit), c (8 bit), u (newton .utb) */
    int endian;           /**< 0 is little 1 is big */
    int do_skip;          /**< skip every other frame */
    unsigned long len;    /**< length of file/utterance */
    PFile *file;          /**< pointer to file */
    char name[MAX_LABEL]; /**< file name */
    /* int op; read or write (member disabled in the original) */
    int num_utts;         /**< no. of utterances in utb file */
    annotate *utb_table;  /**< utb file header information */
} utt_file_info;
/**
 * Utterance backed by a file (only declared when _RTT is not defined).
 *
 * The members up to and including start_windback repeat utt_generic_info
 * member for member; the file description is appended after them. Because
 * this struct is overlaid with utt_generic_info and live_utterance_info in
 * the utterance_info union, the shared leading members must stay identical
 * in type and order.
 *
 * NOTE(review): field semantics are set by code outside this header.
 */
typedef struct {
    /* --- leading members shared with utt_generic_info --- */
    int utt_type;
    int dim;
    fepFramePkt *frame;
    int num_chan;
    int do_channorm;
    spect_dist_info **spchchan;              /**< Mirrored from the Wave object */
    norm_info *channorm;                     /**< Mirrored from the Wave object */
    swicms_norm_info *swicms;                /**< copy of wave obj pointer */
    spect_dist_info *backchan[MAX_CHAN_DIM];
    featdata *last_push;
    int voice_duration;
    int quiet_duration;
    int unsure_duration;
    int start_windback;
    /* voicing_info voice; (member disabled in the original) */

    /* --- file-specific part --- */
    utt_file_info file;                      /**< description of the underlying file */
} file_utterance_info;
#endif
/**
 * Utterance flavour selected through the live_utt member of utterance_info.
 *
 * As declared here it has exactly the same members, in the same order, as
 * utt_generic_info; the two must be kept in step because they are overlaid
 * in the utterance_info union.
 *
 * NOTE(review): field semantics are set by code outside this header.
 */
typedef struct {
    int utt_type;
    int dim;
    fepFramePkt *frame;
    int num_chan;
    int do_channorm;
    spect_dist_info **spchchan;              /**< Mirrored from the Wave object */
    norm_info *channorm;                     /**< Mirrored from the Wave object */
    swicms_norm_info *swicms;                /**< copy of wave obj pointer */
    spect_dist_info *backchan[MAX_CHAN_DIM];
    featdata *last_push;
    int voice_duration;
    int quiet_duration;
    int unsure_duration;
    int start_windback;
} live_utterance_info;
/**
 * Storage for one utterance, viewable as any of the flavours below.
 *
 * Each struct member begins with an int utt_type, so the bare utt_type
 * member overlays that leading field of every view.
 *
 * The file_utt view exists only when _RTT is not defined, so the set of
 * members (and possibly the size) of this union depends on that macro;
 * code sharing utterance_info objects should be built with the same
 * setting.
 *
 * NOTE(review): the values utt_type may take are not defined in this
 * header.
 */
typedef union {
    int utt_type;                 /**< live or from file */
    utt_generic_info gen_utt;     /**< generic one */
#ifndef _RTT
    file_utterance_info file_utt; /**< file-backed view */
#endif
    live_utterance_info live_utt; /**< live view */
} utterance_info;
/*
** Size of the utb file headers and details
*/
#ifndef _RTT
/*
** Format version and header sizes for utt/utb files.
** NOTE(review): UTT_VERSION is 2 here, while this header also declares
** init_utt_v5_header() and documents UttHeader.checksum as "not used in
** version 5" -- confirm which version number is actually written.
** NOTE(review): the sizes are presumably in bytes (UttHeader.headerSize is
** documented in bytes) -- confirm.
*/
#define UTT_VERSION 2
#define UTT_HEADER_SIZE 16 /*Size on disk*/
#define UTB_HEADER_SIZE 32 /*Size on disk*/
#define UTB_HEADER_USED 16 /*Size on disk*/ /* SAL */
/**
 * UTB file header.
 *
 * This is the in-memory form. sizeof(UttHeader) depends on the platform's
 * unsigned long width and on padding, so it must not be assumed to match
 * any on-disk header size.
 *
 * NOTE(review): how the fields are serialised is decided by read_utt_head()
 * and write_utt_head(), whose definitions are not visible here.
 */
typedef struct _UttHeader {
    unsigned short headerSize;           /**< The size of the header in bytes. */
    unsigned short version;              /**< The version of the file format. */
    unsigned long nBytes;                /**< The size of the payload in bytes. */
    unsigned short nParametersPerFrame;  /**< The number of parameters per frame. */
    unsigned short channelNormalization; /**< 0=unknown, 1=none, 2=amp-based, 3=harmonicity-based, 4=mrec style */
    unsigned short speakerNormalization; /**< 0=unknown, 1=no, 2=yes */
    unsigned short imeldaization;        /**< 0=unknown, 1=no, 2=yes */
    unsigned short nOriginalParameters;  /**< Before imelda truncation. */
    unsigned short samplesPerFrame;      /**< The number of samples per frame. */
    unsigned long sampleRate;            /**< The audio sample rate. */
    unsigned long checksum;              /**< not used in version 5. */
} UttHeader;
/*
** Header helpers for file utterances.
** NOTE(review): only the prototypes are visible in this header; the
** return-value convention of update_utb_header() and the units of
** samplerate/framerate must be confirmed against the definitions.
*/
/* Takes a file utterance plus a frame count, sample rate and frame rate. */
int update_utb_header(file_utterance_info *utt, int frames, int samplerate,
int framerate);
/* Takes a caller-supplied UttHeader plus dim, samplerate and framerate;
** returns nothing. Presumably fills in *uhead -- confirm. */
void init_utt_v5_header(UttHeader *uhead, int dim, int samplerate, int framerate);
/*
** Data-file / data-stream operations on a file utterance.
** NOTE(review): definitions are not visible here, so the meaning of the int
** return values is unconfirmed. The typ, endian and do_skip parameters
** share their names with members of utt_file_info and presumably use the
** same encodings (see the member comments there) -- confirm.
** filename is declared as non-const char *; whether it is modified cannot
** be told from this header.
*/
int init_data_file(char *filename, file_utterance_info *utt, int dimen,
char typ, int endian, int do_skip);
int new_data_file(char *filename, file_utterance_info *utt, int dimen,
char typ, int endian);
/* begin, f_begin and f_end are long values; presumably frame positions --
** confirm. */
int set_data_frame(file_utterance_info *utt, long begin);
int buffer_data_frames(file_utterance_info *utt, long f_begin, long f_end);
/* Returns nothing, so any failure is not reported through the return value. */
void more_data_frames(file_utterance_info *utt);
int save_data_frames(file_utterance_info *utt);
/* Returns nothing, so any failure is not reported through the return value. */
void close_data_stream(file_utterance_info *utt);
/*
** utb-file access: table set-up and positioning, frame load/save, and
** UttHeader read/write.
** NOTE(review): definitions are not visible here; the meaning of the int
** return values and the ownership of *table are unconfirmed.
*/
/* table is an annotate ** here (an annotate * can be handed back through
** it) but a plain annotate * in position_utb_file(). */
int init_utb_file(file_utterance_info *utt, annotate **table);
int position_utb_file(file_utterance_info *utt, long position, annotate *table);
/* num_frames is a frame count; do_skip presumably matches
** utt_file_info.do_skip ("skip every other frame") -- confirm. */
int load_utb_data(file_utterance_info *utt, int num_frames, int do_skip);
int load_short_data(file_utterance_info *utt, int num_frames, int do_skip);
int save_utb_data(file_utterance_info *utt, int num_frames);
int save_short_data(file_utterance_info *utt, int num_frames);
/* Both take the in-memory UttHeader and a PFile handle. */
int read_utt_head(UttHeader *head, PFile* datafile);
int write_utt_head(UttHeader *head, PFile* datafile);
/* Takes a file name; presumably reports whether the file is a utb file --
** confirm the return convention. */
int check_for_utb(char* filename);
/* TCP reading routines
**
** These declarations work on tables of annotate records.
** NOTE(review): what "TCP" stands for, the meaning of the int return values
** (possibly a tag count -- confirm), and who frees *tag_base are not
** visible in this header.
*/
/* tag_base is an annotate ** : a table pointer can be handed back through it. */
int read_tcp(char *filename, annotate **tag_base);
/* Unlike read_tcp(), takes an existing table (annotate *) and a count ntags. */
int read_lst(char *filename, annotate *tag_base, int ntags);
int read_utb_table(char *filename, annotate **tag_base);
/* Takes a file name, a table of tags and the number of tags; returns nothing. */
void save_tcp(char *tcpnam, annotate *tag, int ntags);
/* Takes two char buffers and no sizes. Presumably writes into tcpname, whose
** required capacity is not stated here -- confirm before calling. */
void compose_tcp_name_of_utt(char* uttname , char* tcpname);
#endif
/*
** Operations on utterance_info that are declared regardless of _RTT.
** NOTE(review): definitions are not visible here; behaviour notes below are
** limited to what the prototypes show.
*/
/* Takes the utterance plus its type, dimension, buffer size, number of kept
** frames, channel count and a voicing switch. Returns nothing.
** DEFAULT_BUFFER_SIZE and KEEP_FRAMES in this header are presumably the
** usual buffer_size / keep_frames values -- confirm. */
void init_utterance(utterance_info *utt, int utt_type, int dimen,
int buffer_size, int keep_frames, int num_chan, int do_voicing);
/* The four int arguments carry the same names as the voice_duration,
** quiet_duration, unsure_duration and start_windback members of the
** utterance structs. Units are not stated in this header. */
void set_voicing_durations(utterance_info *utt, int voice_duration,
int quiet_duration, int unsure_duration,
int start_windback);
void free_utterance(utterance_info *utt);
/* Both return int; presumably a boolean-style status -- confirm. */
int utterance_started(utterance_info *utt);
int utterance_ended(utterance_info *utt);
/* pUttFrame points at raw bytes (unsigned char *); no length is passed, so
** the expected frame size must be known to the implementation. */
int load_utterance_frame(utterance_info *utt, unsigned char* pUttFrame, int voicing);
/* Argument order is (destination, source) judging by the names oututt and
** inutt. */
int copy_utterance_frame(utterance_info *oututt, utterance_info *inutt);
#endif /* _h_utteranc_ */