| /*---------------------------------------------------------------------------* |
| * swicms.c * |
| * * |
| * Copyright 2007, 2008 Nuance Communciations, Inc. * |
| * * |
| * Licensed under the Apache License, Version 2.0 (the 'License'); * |
| * you may not use this file except in compliance with the License. * |
| * * |
| * You may obtain a copy of the License at * |
| * http://www.apache.org/licenses/LICENSE-2.0 * |
| * * |
| * Unless required by applicable law or agreed to in writing, software * |
| * distributed under the License is distributed on an 'AS IS' BASIS, * |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * |
| * See the License for the specific language governing permissions and * |
| * limitations under the License. * |
| * * |
| *---------------------------------------------------------------------------*/ |
| |
#include <stdio.h>
#include <string.h>
#include"swicms.h"
#include"srec_sizes.h"
#include"prelib.h"

#include "passert.h"
#include "ESR_Session.h"
#include "ESR_SessionType.h"
#include "IntArrayList.h"
#include "portable.h"
| |
/*
 * Debug helper: logs one line made of HEAD, the address of PTR and the first
 * NN elements of PTR, each formatted with FMT.
 *
 *   HEAD  label text, printed with "%s" (never interpreted as a format)
 *   FMT   printf-style format for a single element, e.g. " %d"
 *   PTR   array or pointer to the elements
 *   NN    number of elements to print
 *
 * All writes are bounded by the size of the local buffer; output that does
 * not fit is truncated instead of overrunning the stack.  The address is
 * printed with "%p" because casting a pointer to int loses bits on targets
 * where pointers are wider than int.  Locals carry a pv_ prefix so they
 * cannot capture identifiers used in the PTR or NN expressions.
 */
#define printf_vector(HEAD, FMT, PTR, NN)                                     \
  do {                                                                        \
    LCHAR pv_buf[256];                                                        \
    const size_t pv_cap = sizeof(pv_buf) / sizeof(pv_buf[0]);                 \
    size_t pv_len = 0;                                                        \
    int pv_idx;                                                               \
    int pv_n;                                                                 \
    pv_buf[0] = 0;                                                            \
    pv_n = snprintf(pv_buf, pv_cap, "%s %p", (HEAD), (const void*)(PTR));     \
    if (pv_n > 0)                                                             \
      pv_len = ((size_t)pv_n < pv_cap) ? (size_t)pv_n : pv_cap - 1;           \
    for (pv_idx = 0; pv_idx < (NN) && pv_len + 1 < pv_cap; ++pv_idx) {        \
      pv_n = snprintf(pv_buf + pv_len, pv_cap - pv_len, FMT, (PTR)[pv_idx]);  \
      if (pv_n < 0)                                                           \
        break;                                                                \
      pv_len += (size_t)pv_n;                                                 \
      if (pv_len >= pv_cap)                                                   \
        pv_len = pv_cap - 1; /* element was truncated: buffer is full */      \
    }                                                                         \
    PLogMessage("%s", pv_buf);                                                \
  } while (0)
| |
| /* Cross-utterance CMN calculation: |
| We try to normalize the speech frames before they get to the recognizer. |
| The speech frames are LDA-processed mfcc-with-dynamic feature vectors. |
| We collect these speech frames during recognition. At the end of |
| recognition we exclude the silence frames from the collected data, and |
| generate a new channel average based on the previous average and the new |
| data, using an exponential decay formula. |
| |
| In-utterance CMN calculation: |
| A new short-term average mechanism was introduced, with faster update, |
| to improve recognition on the very first recognition after init or reset. |
| We wait for a minimum number of new data frames to apply this. We also |
| disable the fast updater after some frames, because we assume the |
| cross-utterance estimator to be more reliable, particularly in its |
| ability to exclude silence frames from the calculation. |
| */ |
| |
/* ---- cross-utterance CMS defaults --------------------------------------- */

/* forget factor, expressed as the effective number of frames of history */
#define SWICMS_FORGET_FACTOR_DEFAULT          400
/* speech/background mix in percent; 100 means speech frames only */
#define SWICMS_SBINDEX_DEFAULT                100
/* SWICMS_CACHE_RESOLUTION_DEFAULT and SWICMS_CACHE_SIZE_DEFAULT live in
   swicms.h */

/* ---- in-utterance CMS defaults ------------------------------------------ */

/* sentinel value of forget_factor2 that turns the in-utterance updater off
   (any large number) */
#define SWICMS_INUTT_FORGET_FACTOR2_DISABLE   65535
/* the in-utterance updater is off unless the session overrides this */
#define SWICMS_INUTT_FORGET_FACTOR2_DEFAULT   SWICMS_INUTT_FORGET_FACTOR2_DISABLE
/* stop the in-utterance updater once the cross-utterance estimate becomes
   more reliable */
#define SWICMS_INUTT_DISABLE_AFTER_FRAMES     200
/* hold the in-utterance updater back while its estimate is still poor */
#define SWICMS_INUTT_ENABLE_AFTER_FRAMES      10
| |
| /** |
| * Logging Stuff |
| */ |
| #define LOG_LEVEL 2 |
| #define MODULE_NAME L("swicms.c") |
| //static const char* MTAG = MODULE_NAME; |
| |
| static const char *rcsid = 0 ? (const char *) &rcsid : |
| "$Id: swicms.c,v 1.21.6.16 2008/06/05 19:00:55 stever Exp $"; |
| |
| static ESR_BOOL SWICMS_DEBUG = ESR_FALSE; |
| |
| /* these are good values from cmn/tmn files */ |
| static const imeldata gswicms_cmn1_8 [MAX_CHAN_DIM] = |
| { |
| 158, 141, 99, 125, 101, 162, 113, 138, 128, 143, 123, 141, |
| 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, |
| 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127 |
| }; |
| |
| static const imeldata gswicms_cmn1_11 [MAX_CHAN_DIM] = |
| { |
| 163, 121, 120, 114, 124, 139, 144, 108, 150, 119, 146, 124, |
| 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, |
| 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127 |
| }; |
| |
| static const imeldata gswicms_tmn1_8 [MAX_CHAN_DIM] = |
| { |
| 108, 138, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, |
| 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, |
| 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127 |
| }; |
| |
| static const imeldata gswicms_tmn1_11 [MAX_CHAN_DIM] = |
| { |
| 108, 138, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, |
| 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, |
| 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127 |
| }; |
| |
| static ESR_ReturnCode GetSomeIntsIfAny( const LCHAR* parname, imeldata* parvalue, size_t reqSize) |
| { |
| size_t i, size; |
| ESR_ReturnCode rc; |
| ESR_BOOL exists; |
| IntArrayList* intList = 0; |
| |
| CHKLOG(rc, ESR_SessionContains(parname, &exists)); |
| if (exists) { |
| rc = ESR_SessionGetProperty(parname, (void**)&intList, TYPES_INTARRAYLIST); |
| if (rc != ESR_SUCCESS && rc != ESR_NO_MATCH_ERROR) { |
| /* no match will revert to default data already in static array */ |
| PLogError(L("Error reading %s from session: %s"), parname, ESR_rc2str(rc)); |
| return ESR_FATAL_ERROR; |
| } |
| else if (rc == ESR_SUCCESS) { |
| CHKLOG(rc, IntArrayListGetSize(intList, &size)); |
| if(size != reqSize) { |
| PLogError(L("Error reading %s from session, expected len %d: %s"), parname, reqSize, ESR_rc2str(rc)); |
| return ESR_FATAL_ERROR; |
| } |
| if(reqSize == 1) |
| CHKLOG(rc, IntArrayListGet(intList, 0, parvalue)); |
| else { |
| for (i=0; i<size; ++i) |
| CHKLOG(rc, IntArrayListGet(intList, i, &parvalue[i])); |
| } |
| } |
| } |
| return ESR_SUCCESS; |
| CLEANUP: |
| return rc; |
| } |
| |
| int swicms_init(swicms_norm_info* swicms) |
| { |
| ESR_ReturnCode rc = ESR_SUCCESS; |
| size_t i; |
| ESR_BOOL exists, sessionExists; |
| size_t sample_rate; |
| |
| /* defaults */ |
| swicms->sbindex = SWICMS_SBINDEX_DEFAULT; |
| swicms->cached_num_frames = 0; |
| swicms->forget_factor = SWICMS_FORGET_FACTOR_DEFAULT; |
| swicms->cache_resolution = SWICMS_CACHE_RESOLUTION_DEFAULT; |
| swicms->num_frames_in_cmn = 0; |
| |
| CHKLOG(rc, ESR_SessionExists(&sessionExists)); |
| |
| if (sessionExists) |
| { /* We'll assume this rate is valid or someone else will be complaining. SteveR */ |
| rc = ESR_SessionGetSize_t ( L ( "CREC.Frontend.samplerate" ), &sample_rate ); |
| |
| if ( rc != ESR_SUCCESS ) |
| return ( rc ); |
| } |
| else |
| sample_rate = 11025; |
| |
| /* init the data structures by copying the static data so that we can have a copy if we need to reset */ |
| if ( sample_rate == 8000 ) |
| { |
| for ( i = 0; i < MAX_CHAN_DIM; i++ ) |
| { |
| swicms->cmn [i] = gswicms_cmn1_8 [i]; |
| swicms->tmn [i] = gswicms_tmn1_8 [i]; |
| // _lda_*mn below are OK, but are recalculated in swicms_lda_process() |
| swicms->lda_cmn [i] = 0; /* calculated by swicms_lda_process() */ |
| swicms->lda_tmn [i] = 0; /* calculated by swicms_lda_process() */ |
| } |
| } |
| else |
| { |
| for ( i = 0; i < MAX_CHAN_DIM; i++ ) |
| { |
| swicms->cmn [i] = gswicms_cmn1_11 [i]; |
| swicms->tmn [i] = gswicms_tmn1_11 [i]; |
| // _lda_*mn below are OK, but are recalculated in swicms_lda_process() |
| swicms->lda_cmn [i] = 0; /* calculated by swicms_lda_process() */ |
| swicms->lda_tmn [i] = 0; /* calculated by swicms_lda_process() */ |
| } |
| } |
| CHKLOG(rc, ESR_SessionExists(&sessionExists)); |
| |
| if (sessionExists) |
| { |
| const LCHAR* parname = L("CREC.Frontend.swicms.debug"); |
| CHKLOG(rc, ESR_SessionContains(parname, &exists)); |
| if (exists) { |
| rc = ESR_SessionGetBool(parname, &SWICMS_DEBUG); |
| if (rc != ESR_SUCCESS && rc != ESR_NO_MATCH_ERROR) { |
| PLOG_DBG_ERROR((L("Error reading %s from session: %s"), parname, ESR_rc2str(rc))); |
| return rc; |
| } |
| } |
| |
| rc = GetSomeIntsIfAny( L("CREC.Frontend.swicms.forget_factor"), |
| &swicms->forget_factor, 1); |
| if(rc != ESR_SUCCESS) return rc; |
| |
| rc = GetSomeIntsIfAny( L("CREC.Frontend.swicms.sbindex"), |
| &swicms->sbindex, 1); |
| if(rc != ESR_SUCCESS) return rc; |
| |
| rc = GetSomeIntsIfAny( L("CREC.Frontend.swicms.cmn"), |
| &swicms->cmn[0], MAX_CHAN_DIM); |
| if(rc != ESR_SUCCESS) return rc; |
| |
| if ( sample_rate == 8000 ) |
| { |
| rc = GetSomeIntsIfAny( L("CREC.Frontend.swicms.cmn8"), &swicms->cmn[0], MAX_CHAN_DIM); |
| |
| if(rc != ESR_SUCCESS) |
| return rc; |
| } |
| else |
| { |
| rc = GetSomeIntsIfAny( L("CREC.Frontend.swicms.cmn11"), &swicms->cmn[0], MAX_CHAN_DIM); |
| |
| if(rc != ESR_SUCCESS) |
| return rc; |
| } |
| |
| rc = GetSomeIntsIfAny( L("CREC.Frontend.swicms.tmn"), |
| &swicms->tmn[0], MAX_CHAN_DIM); |
| if(rc != ESR_SUCCESS) return rc; |
| } |
| |
| swicms->is_valid = 0; |
| for (i = 0; i < MAX_CHAN_DIM; i++) |
| swicms->adjust[i] = 255; |
| |
| #ifdef SREC_ENGINE_VERBOSE_LOGGING |
| PLogMessage("swicms->forget_factor = %d\n", swicms->forget_factor); |
| PLogMessage("swicms->cache_resolution = %d\n", swicms->cache_resolution); |
| PLogMessage("swicms->sbindex = %d\n", swicms->sbindex); |
| #endif |
| |
| /* in-utt cms parameters */ |
| swicms->inutt.forget_factor2 = SWICMS_INUTT_FORGET_FACTOR2_DEFAULT; |
| swicms->inutt.disable_after = 200; |
| swicms->inutt.enable_after = 10; /* in-utt is less reliable */ |
| swicms->inutt.num_bou_frames_to_skip = 20; /* silence frames! see windback */ |
| swicms->inutt.num_frames_since_bou = 0; |
| swicms->inutt.num_frames_in_accum = 0; |
| for(i=0; i<MAX_CHAN_DIM; i++) swicms->inutt.accum[i] = 0; |
| |
| if (sessionExists) { |
| rc = GetSomeIntsIfAny(L("CREC.Frontend.swicms.inutt.forget_factor2"), |
| &swicms->inutt.forget_factor2, 1); |
| if(rc != ESR_SUCCESS) return rc; |
| |
| rc = GetSomeIntsIfAny(L("CREC.Frontend.swicms.inutt.disable_after"), |
| &swicms->inutt.disable_after, 1); |
| if(rc != ESR_SUCCESS) return rc; |
| |
| rc = GetSomeIntsIfAny(L("CREC.Frontend.swicms.inutt.enable_after"), |
| &swicms->inutt.enable_after, 1); |
| if(rc != ESR_SUCCESS) return rc; |
| |
| /* we need to estimate the in-utt cmn from speech frames only! so let's |
| make sure to skip some frames before collecting data, */ |
| ESR_SessionContains(L("CREC.Frontend.start_windback"), &exists); |
| if (exists) { |
| ESR_BOOL do_skip_even_frames = ESR_TRUE; |
| ESR_SessionGetBool(L("CREC.Frontend.do_skip_even_frames"), &do_skip_even_frames); |
| ESR_SessionGetInt(L("CREC.Frontend.start_windback"), &swicms->inutt.num_bou_frames_to_skip); |
| if( do_skip_even_frames) |
| swicms->inutt.num_bou_frames_to_skip /= 2; |
| swicms->inutt.num_bou_frames_to_skip -= 5; /* ensure spch frames only */ |
| } |
| } |
| |
| return 0; |
| CLEANUP: |
| return rc; |
| } |
| |
| |
| ESR_ReturnCode swicms_get_cmn ( swicms_norm_info* swicms, LCHAR *cmn_params, size_t* len ) |
| { |
| int dim_count; |
| int i; |
| imeldata temp[MAX_CHAN_DIM]; |
| const size_t INT_LENGTH = 12; |
| |
| if ( swicms->_prep != NULL ) /* lda exists give them transformed lda. */ |
| { |
| for ( dim_count = 0; dim_count < MAX_CHAN_DIM; dim_count++ ) |
| temp [dim_count] = swicms->lda_cmn [dim_count]; |
| inverse_transform_frame( swicms->_prep, temp, 1 /*do_shift*/); |
| } |
| else /* lda does not exist give them raw cmn values */ |
| { |
| for ( dim_count = 0; dim_count < MAX_CHAN_DIM; dim_count++ ) |
| temp [dim_count] = swicms->cmn [dim_count]; |
| } |
| |
| for ( dim_count = 0, i = 0; dim_count < MAX_CHAN_DIM; dim_count++ ) |
| { |
| i += sprintf( cmn_params + i, dim_count==0 ? "%d" : ",%d", temp [dim_count] ); |
| if (i + INT_LENGTH >= *len) { |
| *len = MAX_CHAN_DIM * (INT_LENGTH + 2) * sizeof(LCHAR); |
| return ESR_BUFFER_OVERFLOW; |
| } |
| } |
| |
| return ESR_SUCCESS; |
| } |
| |
| |
| ESR_ReturnCode swicms_set_cmn ( swicms_norm_info* swicms, const char *cmn_params ) |
| { |
| ESR_ReturnCode set_status; |
| int length_of_params; |
| int dim_count; |
| int got_word; |
| int current_position; |
| char *copy_of_params; |
| char *parsed_strings [MAX_CHAN_DIM]; |
| int temp_cmn [MAX_CHAN_DIM]; |
| |
| length_of_params = strlen ( cmn_params ) + 1; |
| copy_of_params = (char*)MALLOC ( length_of_params, NULL ); |
| |
| if ( copy_of_params != NULL ) |
| { |
| set_status = ESR_SUCCESS; |
| memcpy ( copy_of_params, cmn_params, length_of_params ); |
| dim_count = 0; |
| current_position = 0; |
| got_word = 0; |
| parsed_strings [dim_count] = copy_of_params + current_position; |
| |
| while ( ( dim_count < MAX_CHAN_DIM ) && ( set_status == ESR_SUCCESS ) ) |
| { |
| switch ( *( copy_of_params + current_position ) ) |
| { |
| case '\0': |
| if ( got_word == 1 ) |
| { |
| if ( dim_count == ( MAX_CHAN_DIM - 1 ) ) |
| dim_count++; |
| else |
| { |
| PLogError ( "Channel Normalization : Missing Params Must Contain %d Params\n", MAX_CHAN_DIM ); |
| set_status = ESR_INVALID_ARGUMENT; |
| } |
| } |
| else |
| { |
| PLogError ( "Channel Normalization : Missing Params Mus Contain %d Params\n", MAX_CHAN_DIM ); |
| set_status = ESR_INVALID_ARGUMENT; |
| } |
| break; |
| |
| case ',': |
| if ( got_word == 1 ) |
| { |
| if ( dim_count < ( MAX_CHAN_DIM - 1 ) ) |
| { |
| dim_count++; |
| *( copy_of_params + current_position) = '\0'; |
| current_position++; |
| |
| if ( current_position == length_of_params ) |
| { |
| PLogError ( "Channel Normalization : Delimiter At End Of Param String\n" ); |
| set_status = ESR_INVALID_ARGUMENT; |
| } |
| parsed_strings [dim_count] = copy_of_params + current_position; |
| got_word = 0; |
| } |
| else |
| { |
| PLogError ( "Channel Normalization : Too Many Params Must Contain %d Params\n", MAX_CHAN_DIM ); |
| set_status = ESR_INVALID_ARGUMENT; |
| } |
| } |
| else |
| { |
| PLogError ( "Channel Normalization : Too Many Params Must Contain %d Params\n", MAX_CHAN_DIM ); |
| set_status = ESR_INVALID_ARGUMENT; |
| } |
| break; |
| |
| case '0': |
| case '1': |
| case '2': |
| case '3': |
| case '4': |
| case '5': |
| case '6': |
| case '7': |
| case '8': |
| case '9': |
| got_word = 1; |
| current_position++; |
| |
| if ( current_position == length_of_params ) |
| { |
| PLogError ( "Channel Normalization : Too Many Params Must Contain %d Params\n", MAX_CHAN_DIM ); |
| set_status = ESR_INVALID_ARGUMENT; |
| } |
| break; |
| |
| default: |
| PLogError ( "Channel Normalization : Invalid Param : %c : Params Must Contain Only Digits\n" ); |
| set_status = ESR_INVALID_ARGUMENT; |
| break; |
| } |
| } |
| if ( set_status == ESR_SUCCESS ) |
| { |
| dim_count = 0; |
| |
| while ( ( dim_count < MAX_CHAN_DIM ) && ( set_status == ESR_SUCCESS ) ) |
| { |
| temp_cmn [dim_count] = atoi ( parsed_strings [dim_count] ); |
| |
| if ( ( temp_cmn [dim_count] < 0 ) || ( temp_cmn [dim_count] > 255 ) ) |
| { |
| set_status = ESR_INVALID_ARGUMENT; |
| } |
| |
| dim_count++; |
| } |
| if ( set_status == ESR_SUCCESS ) |
| { |
| for ( dim_count = 0; dim_count < MAX_CHAN_DIM; dim_count++ ) |
| swicms->cmn [dim_count] = temp_cmn [dim_count]; |
| if ( swicms->_prep != NULL ) /* Set now if NULL it will automatically be set on first utterance */ |
| linear_transform_frame(swicms->_prep, swicms->lda_cmn, 1 /*do_shift*/); |
| } |
| } |
| FREE ( copy_of_params ); |
| } |
| else |
| { |
| PLogError ( "Channel Normalization Out Of Memory Error\n" ); |
| set_status = ESR_OUT_OF_MEMORY; |
| } |
| swicms->num_frames_in_cmn = 0; |
| return ( set_status ); |
| } |
| |
| |
| int swicms_cache_frame(swicms_norm_info* swicms, imeldata* frame, int dimen) |
| { |
| int i; |
| imeldata *pcache, *pframe; |
| |
| ASSERT(dimen == MAX_CHAN_DIM); |
| i = swicms->cached_num_frames / swicms->cache_resolution; |
| if (i < SWICMS_CACHE_SIZE_DEFAULT) |
| { |
| pcache = swicms->cached_sections[ i]; |
| if (swicms->cached_num_frames % swicms->cache_resolution == 0) |
| { |
| for (i = 0; i < MAX_CHAN_DIM; i++) *pcache++ = 0; |
| pcache -= MAX_CHAN_DIM; |
| } |
| pframe = frame; |
| for (i = 0; i < MAX_CHAN_DIM; i++) *pcache++ += *pframe++; |
| swicms->cached_num_frames++; |
| } |
| |
| return 0; |
| } |
| |
| int apply_channel_normalization_in_swicms(swicms_norm_info *swicms, |
| imeldata* oframe, |
| imeldata* iframe, int dimen) |
| { |
| int ii; |
| ASSERT(dimen == MAX_CHAN_DIM); |
| |
| /* IF inutt is activated at all */ |
| if(swicms->inutt.forget_factor2 != SWICMS_INUTT_FORGET_FACTOR2_DISABLE) { |
| /* AND IF we have not disabled it (due to x-utt more reliable) */ |
| if(swicms->inutt.num_frames_in_accum < swicms->inutt.disable_after) { |
| /* AND IF we have skipped past the silence frames */ |
| if( swicms->inutt.num_frames_since_bou >= swicms->inutt.num_bou_frames_to_skip){ |
| swicms->inutt.num_frames_in_accum++; |
| for(ii=0;ii<dimen;ii++) swicms->inutt.accum[ii] += iframe[ii]; |
| /* AND IF we've already seen at least 10 frames (presumably) of speech */ |
| if(swicms->inutt.num_frames_in_accum>swicms->inutt.enable_after) { |
| /* THEN we update the adjustment in-line with the current utterance! */ |
| for(ii=0;ii<dimen;ii++) { |
| imeldata denom = ( swicms->inutt.forget_factor2 |
| + swicms->inutt.num_frames_in_accum ); |
| /* tmp: weighted average of the old lda_cmn and the new accum */ |
| imeldata tmp=(swicms->lda_cmn[ii]*swicms->inutt.forget_factor2 |
| + swicms->inutt.accum[ii] + denom/2) / denom; |
| swicms->adjust[ii] = swicms->lda_tmn[ii] - tmp; |
| } |
| //printf_vector("swicms->adjust2 "," %d",swicms->adjust, dimen); |
| } |
| } |
| } |
| swicms->inutt.num_frames_since_bou++; |
| } |
| |
| for (ii = 0; ii < dimen; ii++) |
| oframe[ii] = MAKEBYTE(iframe[ii] + swicms->adjust[ii]); |
| return 0; |
| } |
| |
| int swicms_update(swicms_norm_info* swicms, int speech_start, int speech_end) |
| { |
| int i, j; |
| asr_int32_t speech_avg[MAX_CHAN_DIM], backgr_avg[MAX_CHAN_DIM], avg[MAX_CHAN_DIM]; |
| int ff; |
| int nn, speech_nn, backgr_nn; |
| int num_frames = swicms->cached_num_frames; |
| int cache_start, cache_end, backgr_cache_end; |
| int sbindex = swicms->sbindex; |
| |
| /* init for utterance */ |
| swicms->inutt.num_frames_since_bou = 0; |
| |
| swicms->cached_num_frames = 0; |
| cache_start = speech_start; |
| cache_start -= (cache_start % swicms->cache_resolution); |
| cache_start /= swicms->cache_resolution; |
| |
| if (speech_end == MAXframeID) |
| { |
| cache_end = SWICMS_CACHE_SIZE_DEFAULT; |
| } |
| else |
| { |
| if (speech_end < num_frames) |
| cache_end = speech_end; |
| else |
| cache_end = num_frames; |
| cache_end -= (cache_end % swicms->cache_resolution); |
| cache_end /= swicms->cache_resolution; |
| } |
| |
| if (num_frames == 0 || speech_end == 0 || speech_start == speech_end || speech_end == MAXframeID) |
| { |
| if (speech_end != 0 || speech_start != 0) |
| PLogError("Warning: speech_bounds (%d,%d) swicms->cached_num_frames (%d)\n", |
| speech_start, speech_end, num_frames); |
| if (SWICMS_DEBUG) { |
| //printf_vector("swicms->adjust.rep", " %d", swicms->adjust, MAX_CHAN_DIM); |
| } |
| return 1; |
| } |
| |
| backgr_cache_end = (num_frames - num_frames % swicms->cache_resolution) / swicms->cache_resolution; |
| |
| speech_nn = (cache_end - cache_start) * swicms->cache_resolution; |
| backgr_nn = backgr_cache_end * swicms->cache_resolution - speech_nn; |
| |
| for (i = 0; i < MAX_CHAN_DIM; i++) |
| { |
| speech_avg[i] = 0; |
| backgr_avg[i] = 0; |
| for (j = cache_start; j < cache_end; j++) |
| speech_avg[i] += swicms->cached_sections[j][i]; |
| for (j = 0; j < cache_start; j++) |
| backgr_avg[i] += swicms->cached_sections[j][i]; |
| for (j = cache_end; j < backgr_cache_end; j++) |
| backgr_avg[i] += swicms->cached_sections[j][i]; |
| if (speech_nn == 0 && backgr_nn > 0) |
| { |
| backgr_avg[i] /= backgr_nn; |
| speech_avg[i] = backgr_avg[i]; |
| speech_nn = backgr_nn; |
| } |
| else if (speech_nn > 0 && backgr_nn == 0) |
| { |
| speech_avg[i] /= speech_nn; |
| backgr_avg[i] = speech_avg[i]; |
| backgr_nn = speech_nn; |
| } |
| else if (speech_nn > 0 && backgr_nn > 0) |
| { |
| speech_avg[i] /= speech_nn; |
| backgr_avg[i] /= backgr_nn; |
| } |
| else |
| { |
| return 0; |
| } |
| |
| avg[i] = (sbindex * speech_avg[i] + (100 - sbindex) * backgr_avg[i] + 50) / 100; |
| } |
| nn = (sbindex * speech_nn + (100 - sbindex) * backgr_nn + 50) / 100; |
| |
| for (i = 0, ff = 0; i < MAX_CHAN_DIM; i++) |
| { |
| ff += (swicms->lda_tmn[i] - avg[i]); |
| } |
| ff /= MAX_CHAN_DIM; /* sum is now the average offset from TMN */ |
| if (ff > 5) |
| { |
| PLogError("Warning: bad utt mean during swicms_update() (moffs=%d)\n", ff); |
| //printf_vector("swicms->adjust.rep", " %d", swicms->adjust, MAX_CHAN_DIM); |
| return 1; |
| } |
| ff = swicms->forget_factor; |
| if (ff < 9999) |
| { |
| for (i = 0; i < MAX_CHAN_DIM; i++) |
| { |
| swicms->lda_cmn[i] = (swicms->lda_cmn[i] * ff + avg[i] * nn + (ff + nn) / 2) / (ff + nn); |
| swicms->adjust[i] = swicms->lda_tmn[i] - swicms->lda_cmn[i]; |
| } |
| } |
| |
| if (SWICMS_DEBUG) |
| { |
| imeldata temp[MAX_CHAN_DIM]; |
| PLogMessage("swicms_update() used %d frames (%d-%d)", nn, speech_start, speech_end); |
| |
| for(i=0;i<MAX_CHAN_DIM;i++) temp[i]=swicms->lda_cmn[i]; |
| inverse_transform_frame( swicms->_prep, temp, 1 /*do_shift*/); |
| /* use this dump, to put back into CREC.Frontend.swicms.cmn */ |
| printf_vector("swicms.cmn(r) ", " %d", temp, MAX_CHAN_DIM); |
| |
| //printf_vector("swicms.lda_cmn ", " %d", &swicms.lda_cmn [0], MAX_CHAN_DIM); |
| //printf_vector("swicms.lda_tmn ", " %d", &swicms.lda_tmn [0], MAX_CHAN_DIM); |
| //printf_vector("swicms->adjust", " %d", swicms->adjust, MAX_CHAN_DIM); |
| //printf_vector("avg.speech ", " %d", avg, MAX_CHAN_DIM); |
| } |
| else |
| { |
| #ifndef NDEBUG |
| //printf_vector("swicms->adjust", " %d", swicms->adjust, MAX_CHAN_DIM); |
| #endif |
| } |
| swicms->num_frames_in_cmn += nn; |
| return 0; |
| } |
| |
| int swicms_lda_process(swicms_norm_info* swicms, preprocessed* prep) |
| { |
| int i; |
| |
| for (i = 0; i < MAX_CHAN_DIM; i++) swicms->lda_tmn[i] = swicms->tmn[i]; |
| for (i = 0; i < MAX_CHAN_DIM; i++) swicms->lda_cmn[i] = swicms->cmn[i]; |
| linear_transform_frame(prep, swicms->lda_tmn, 1 /*do_shift*/); |
| linear_transform_frame(prep, swicms->lda_cmn, 1 /*do_shift*/); |
| |
| for (i = 0; i < MAX_CHAN_DIM; i++) |
| { |
| swicms->adjust[i] = swicms->lda_tmn[i] - swicms->lda_cmn[i]; |
| } |
| |
| #ifndef NDEBUG |
| //printf_vector("swicms->adjust", " %d", swicms->adjust, MAX_CHAN_DIM); |
| #endif |
| swicms->is_valid = 1; |
| swicms->_prep = prep; |
| |
| if(SWICMS_DEBUG) { |
| imeldata temp[MAX_CHAN_DIM]; |
| printf_vector("swicms->cmn ", " %d", swicms->cmn, MAX_CHAN_DIM); |
| printf_vector("swicms->lda_cmn ", " %d", swicms->lda_cmn, MAX_CHAN_DIM); |
| //printf_vector("swicms->tmn ", " %d", swicms->tmn, MAX_CHAN_DIM); |
| //printf_vector("swicms->lda_tmn ", " %d", swicms->lda_tmn, MAX_CHAN_DIM); |
| //printf_vector("swicms->adjust ", " %d", swicms->adjust, MAX_CHAN_DIM); |
| |
| //for(i=0;i<MAX_CHAN_DIM;i++) temp[i]=swicms->lda_tmn[i]; |
| //inverse_transform_frame( swicms->_prep, temp, 1 /*do_shift*/); |
| //printf_vector("swicms->tmn(r) ", " %d", temp, MAX_CHAN_DIM); |
| |
| for(i=0;i<MAX_CHAN_DIM;i++) temp[i]=swicms->lda_cmn[i]; |
| inverse_transform_frame( swicms->_prep, temp, 1 /*do_shift*/); |
| printf_vector("swicms->cmn(r) ", " %d", temp, MAX_CHAN_DIM); |
| } |
| return 0; |
| } |
| |
| |
| |