| /* Copyright (c) 2018 Gregor Richards |
| * Copyright (c) 2017 Mozilla */ |
| /* |
| Redistribution and use in source and binary forms, with or without |
| modification, are permitted provided that the following conditions |
| are met: |
| |
| - Redistributions of source code must retain the above copyright |
| notice, this list of conditions and the following disclaimer. |
| |
| - Redistributions in binary form must reproduce the above copyright |
| notice, this list of conditions and the following disclaimer in the |
| documentation and/or other materials provided with the distribution. |
| |
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR |
| CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
| LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
| NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #ifdef HAVE_CONFIG_H |
| #include "config.h" |
| #endif |
| |
| #include <stdlib.h> |
| #include <string.h> |
| #include <stdio.h> |
| #include "kiss_fft.h" |
| #include "common.h" |
| #include <math.h> |
| #include "rnnoise.h" |
| #include "pitch.h" |
| #include "arch.h" |
| #include "rnn.h" |
| #include "rnn_data.h" |
| |
| #define FRAME_SIZE_SHIFT 2 |
| #define FRAME_SIZE (120<<FRAME_SIZE_SHIFT) |
| #define WINDOW_SIZE (2*FRAME_SIZE) |
| #define FREQ_SIZE (FRAME_SIZE + 1) |
| |
| #define PITCH_MIN_PERIOD 60 |
| #define PITCH_MAX_PERIOD 768 |
| #define PITCH_FRAME_SIZE 960 |
| #define PITCH_BUF_SIZE (PITCH_MAX_PERIOD+PITCH_FRAME_SIZE) |
| |
| #define SQUARE(x) ((x)*(x)) |
| |
| #define NB_BANDS 22 |
| |
| #define CEPS_MEM 8 |
| #define NB_DELTA_CEPS 6 |
| |
| #define NB_FEATURES (NB_BANDS+3*NB_DELTA_CEPS+2) |
| |
| |
| #ifndef TRAINING |
| #define TRAINING 0 |
| #endif |
| |
| |
| /* The built-in model, used if no file is given as input */ |
| extern const struct RNNModel rnnoise_model_orig; |
| |
| |
| static const opus_int16 eband5ms[] = { |
| /*0 200 400 600 800 1k 1.2 1.4 1.6 2k 2.4 2.8 3.2 4k 4.8 5.6 6.8 8k 9.6 12k 15.6 20k*/ |
| 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 34, 40, 48, 60, 78, 100 |
| }; |
| |
| |
| typedef struct { |
| int init; |
| kiss_fft_state *kfft; |
| float half_window[FRAME_SIZE]; |
| float dct_table[NB_BANDS*NB_BANDS]; |
| } CommonState; |
| |
| struct DenoiseState { |
| float analysis_mem[FRAME_SIZE]; |
| float cepstral_mem[CEPS_MEM][NB_BANDS]; |
| int memid; |
| float synthesis_mem[FRAME_SIZE]; |
| float pitch_buf[PITCH_BUF_SIZE]; |
| float pitch_enh_buf[PITCH_BUF_SIZE]; |
| float last_gain; |
| int last_period; |
| float mem_hp_x[2]; |
| float lastg[NB_BANDS]; |
| RNNState rnn; |
| }; |
| |
| void compute_band_energy(float *bandE, const kiss_fft_cpx *X) { |
| int i; |
| float sum[NB_BANDS] = {0}; |
| for (i=0;i<NB_BANDS-1;i++) |
| { |
| int j; |
| int band_size; |
| band_size = (eband5ms[i+1]-eband5ms[i])<<FRAME_SIZE_SHIFT; |
| for (j=0;j<band_size;j++) { |
| float tmp; |
| float frac = (float)j/band_size; |
| tmp = SQUARE(X[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].r); |
| tmp += SQUARE(X[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].i); |
| sum[i] += (1-frac)*tmp; |
| sum[i+1] += frac*tmp; |
| } |
| } |
| sum[0] *= 2; |
| sum[NB_BANDS-1] *= 2; |
| for (i=0;i<NB_BANDS;i++) |
| { |
| bandE[i] = sum[i]; |
| } |
| } |
| |
| void compute_band_corr(float *bandE, const kiss_fft_cpx *X, const kiss_fft_cpx *P) { |
| int i; |
| float sum[NB_BANDS] = {0}; |
| for (i=0;i<NB_BANDS-1;i++) |
| { |
| int j; |
| int band_size; |
| band_size = (eband5ms[i+1]-eband5ms[i])<<FRAME_SIZE_SHIFT; |
| for (j=0;j<band_size;j++) { |
| float tmp; |
| float frac = (float)j/band_size; |
| tmp = X[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].r * P[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].r; |
| tmp += X[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].i * P[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].i; |
| sum[i] += (1-frac)*tmp; |
| sum[i+1] += frac*tmp; |
| } |
| } |
| sum[0] *= 2; |
| sum[NB_BANDS-1] *= 2; |
| for (i=0;i<NB_BANDS;i++) |
| { |
| bandE[i] = sum[i]; |
| } |
| } |
| |
| void interp_band_gain(float *g, const float *bandE) { |
| int i; |
| memset(g, 0, FREQ_SIZE); |
| for (i=0;i<NB_BANDS-1;i++) |
| { |
| int j; |
| int band_size; |
| band_size = (eband5ms[i+1]-eband5ms[i])<<FRAME_SIZE_SHIFT; |
| for (j=0;j<band_size;j++) { |
| float frac = (float)j/band_size; |
| g[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j] = (1-frac)*bandE[i] + frac*bandE[i+1]; |
| } |
| } |
| } |
| |
| |
| CommonState common; |
| |
| static void check_init() { |
| int i; |
| if (common.init) return; |
| common.kfft = opus_fft_alloc_twiddles(2*FRAME_SIZE, NULL, NULL, NULL, 0); |
| for (i=0;i<FRAME_SIZE;i++) |
| common.half_window[i] = sin(.5*M_PI*sin(.5*M_PI*(i+.5)/FRAME_SIZE) * sin(.5*M_PI*(i+.5)/FRAME_SIZE)); |
| for (i=0;i<NB_BANDS;i++) { |
| int j; |
| for (j=0;j<NB_BANDS;j++) { |
| common.dct_table[i*NB_BANDS + j] = cos((i+.5)*j*M_PI/NB_BANDS); |
| if (j==0) common.dct_table[i*NB_BANDS + j] *= sqrt(.5); |
| } |
| } |
| common.init = 1; |
| } |
| |
| static void dct(float *out, const float *in) { |
| int i; |
| check_init(); |
| for (i=0;i<NB_BANDS;i++) { |
| int j; |
| float sum = 0; |
| for (j=0;j<NB_BANDS;j++) { |
| sum += in[j] * common.dct_table[j*NB_BANDS + i]; |
| } |
| out[i] = sum*sqrt(2./22); |
| } |
| } |
| |
| #if 0 |
| static void idct(float *out, const float *in) { |
| int i; |
| check_init(); |
| for (i=0;i<NB_BANDS;i++) { |
| int j; |
| float sum = 0; |
| for (j=0;j<NB_BANDS;j++) { |
| sum += in[j] * common.dct_table[i*NB_BANDS + j]; |
| } |
| out[i] = sum*sqrt(2./22); |
| } |
| } |
| #endif |
| |
| static void forward_transform(kiss_fft_cpx *out, const float *in) { |
| int i; |
| kiss_fft_cpx x[WINDOW_SIZE]; |
| kiss_fft_cpx y[WINDOW_SIZE]; |
| check_init(); |
| for (i=0;i<WINDOW_SIZE;i++) { |
| x[i].r = in[i]; |
| x[i].i = 0; |
| } |
| opus_fft(common.kfft, x, y, 0); |
| for (i=0;i<FREQ_SIZE;i++) { |
| out[i] = y[i]; |
| } |
| } |
| |
| static void inverse_transform(float *out, const kiss_fft_cpx *in) { |
| int i; |
| kiss_fft_cpx x[WINDOW_SIZE]; |
| kiss_fft_cpx y[WINDOW_SIZE]; |
| check_init(); |
| for (i=0;i<FREQ_SIZE;i++) { |
| x[i] = in[i]; |
| } |
| for (;i<WINDOW_SIZE;i++) { |
| x[i].r = x[WINDOW_SIZE - i].r; |
| x[i].i = -x[WINDOW_SIZE - i].i; |
| } |
| opus_fft(common.kfft, x, y, 0); |
| /* output in reverse order for IFFT. */ |
| out[0] = WINDOW_SIZE*y[0].r; |
| for (i=1;i<WINDOW_SIZE;i++) { |
| out[i] = WINDOW_SIZE*y[WINDOW_SIZE - i].r; |
| } |
| } |
| |
| static void apply_window(float *x) { |
| int i; |
| check_init(); |
| for (i=0;i<FRAME_SIZE;i++) { |
| x[i] *= common.half_window[i]; |
| x[WINDOW_SIZE - 1 - i] *= common.half_window[i]; |
| } |
| } |
| |
| int rnnoise_get_size() { |
| return sizeof(DenoiseState); |
| } |
| |
| int rnnoise_get_frame_size() { |
| return FRAME_SIZE; |
| } |
| |
| int rnnoise_init(DenoiseState *st, RNNModel *model) { |
| memset(st, 0, sizeof(*st)); |
| if (model) |
| st->rnn.model = model; |
| else |
| st->rnn.model = &rnnoise_model_orig; |
| st->rnn.vad_gru_state = calloc(sizeof(float), st->rnn.model->vad_gru_size); |
| st->rnn.noise_gru_state = calloc(sizeof(float), st->rnn.model->noise_gru_size); |
| st->rnn.denoise_gru_state = calloc(sizeof(float), st->rnn.model->denoise_gru_size); |
| return 0; |
| } |
| |
| DenoiseState *rnnoise_create(RNNModel *model) { |
| DenoiseState *st; |
| st = malloc(rnnoise_get_size()); |
| rnnoise_init(st, model); |
| return st; |
| } |
| |
| void rnnoise_destroy(DenoiseState *st) { |
| free(st->rnn.vad_gru_state); |
| free(st->rnn.noise_gru_state); |
| free(st->rnn.denoise_gru_state); |
| free(st); |
| } |
| |
| #if TRAINING |
| int lowpass = FREQ_SIZE; |
| int band_lp = NB_BANDS; |
| #endif |
| |
| static void frame_analysis(DenoiseState *st, kiss_fft_cpx *X, float *Ex, const float *in) { |
| int i; |
| float x[WINDOW_SIZE]; |
| RNN_COPY(x, st->analysis_mem, FRAME_SIZE); |
| for (i=0;i<FRAME_SIZE;i++) x[FRAME_SIZE + i] = in[i]; |
| RNN_COPY(st->analysis_mem, in, FRAME_SIZE); |
| apply_window(x); |
| forward_transform(X, x); |
| #if TRAINING |
| for (i=lowpass;i<FREQ_SIZE;i++) |
| X[i].r = X[i].i = 0; |
| #endif |
| compute_band_energy(Ex, X); |
| } |
| |
| static int compute_frame_features(DenoiseState *st, kiss_fft_cpx *X, kiss_fft_cpx *P, |
| float *Ex, float *Ep, float *Exp, float *features, const float *in) { |
| int i; |
| float E = 0; |
| float *ceps_0, *ceps_1, *ceps_2; |
| float spec_variability = 0; |
| float Ly[NB_BANDS]; |
| float p[WINDOW_SIZE]; |
| float pitch_buf[PITCH_BUF_SIZE>>1]; |
| int pitch_index; |
| float gain; |
| float *(pre[1]); |
| float tmp[NB_BANDS]; |
| float follow, logMax; |
| frame_analysis(st, X, Ex, in); |
| RNN_MOVE(st->pitch_buf, &st->pitch_buf[FRAME_SIZE], PITCH_BUF_SIZE-FRAME_SIZE); |
| RNN_COPY(&st->pitch_buf[PITCH_BUF_SIZE-FRAME_SIZE], in, FRAME_SIZE); |
| pre[0] = &st->pitch_buf[0]; |
| pitch_downsample(pre, pitch_buf, PITCH_BUF_SIZE, 1); |
| pitch_search(pitch_buf+(PITCH_MAX_PERIOD>>1), pitch_buf, PITCH_FRAME_SIZE, |
| PITCH_MAX_PERIOD-3*PITCH_MIN_PERIOD, &pitch_index); |
| pitch_index = PITCH_MAX_PERIOD-pitch_index; |
| |
| gain = remove_doubling(pitch_buf, PITCH_MAX_PERIOD, PITCH_MIN_PERIOD, |
| PITCH_FRAME_SIZE, &pitch_index, st->last_period, st->last_gain); |
| st->last_period = pitch_index; |
| st->last_gain = gain; |
| for (i=0;i<WINDOW_SIZE;i++) |
| p[i] = st->pitch_buf[PITCH_BUF_SIZE-WINDOW_SIZE-pitch_index+i]; |
| apply_window(p); |
| forward_transform(P, p); |
| compute_band_energy(Ep, P); |
| compute_band_corr(Exp, X, P); |
| for (i=0;i<NB_BANDS;i++) Exp[i] = Exp[i]/sqrt(.001+Ex[i]*Ep[i]); |
| dct(tmp, Exp); |
| for (i=0;i<NB_DELTA_CEPS;i++) features[NB_BANDS+2*NB_DELTA_CEPS+i] = tmp[i]; |
| features[NB_BANDS+2*NB_DELTA_CEPS] -= 1.3; |
| features[NB_BANDS+2*NB_DELTA_CEPS+1] -= 0.9; |
| features[NB_BANDS+3*NB_DELTA_CEPS] = .01*(pitch_index-300); |
| logMax = -2; |
| follow = -2; |
| for (i=0;i<NB_BANDS;i++) { |
| Ly[i] = log10(1e-2+Ex[i]); |
| Ly[i] = MAX16(logMax-7, MAX16(follow-1.5, Ly[i])); |
| logMax = MAX16(logMax, Ly[i]); |
| follow = MAX16(follow-1.5, Ly[i]); |
| E += Ex[i]; |
| } |
| if (!TRAINING && E < 0.04) { |
| /* If there's no audio, avoid messing up the state. */ |
| RNN_CLEAR(features, NB_FEATURES); |
| return 1; |
| } |
| dct(features, Ly); |
| features[0] -= 12; |
| features[1] -= 4; |
| ceps_0 = st->cepstral_mem[st->memid]; |
| ceps_1 = (st->memid < 1) ? st->cepstral_mem[CEPS_MEM+st->memid-1] : st->cepstral_mem[st->memid-1]; |
| ceps_2 = (st->memid < 2) ? st->cepstral_mem[CEPS_MEM+st->memid-2] : st->cepstral_mem[st->memid-2]; |
| for (i=0;i<NB_BANDS;i++) ceps_0[i] = features[i]; |
| st->memid++; |
| for (i=0;i<NB_DELTA_CEPS;i++) { |
| features[i] = ceps_0[i] + ceps_1[i] + ceps_2[i]; |
| features[NB_BANDS+i] = ceps_0[i] - ceps_2[i]; |
| features[NB_BANDS+NB_DELTA_CEPS+i] = ceps_0[i] - 2*ceps_1[i] + ceps_2[i]; |
| } |
| /* Spectral variability features. */ |
| if (st->memid == CEPS_MEM) st->memid = 0; |
| for (i=0;i<CEPS_MEM;i++) |
| { |
| int j; |
| float mindist = 1e15f; |
| for (j=0;j<CEPS_MEM;j++) |
| { |
| int k; |
| float dist=0; |
| for (k=0;k<NB_BANDS;k++) |
| { |
| float tmp; |
| tmp = st->cepstral_mem[i][k] - st->cepstral_mem[j][k]; |
| dist += tmp*tmp; |
| } |
| if (j!=i) |
| mindist = MIN32(mindist, dist); |
| } |
| spec_variability += mindist; |
| } |
| features[NB_BANDS+3*NB_DELTA_CEPS+1] = spec_variability/CEPS_MEM-2.1; |
| return TRAINING && E < 0.1; |
| } |
| |
| static void frame_synthesis(DenoiseState *st, float *out, const kiss_fft_cpx *y) { |
| float x[WINDOW_SIZE]; |
| int i; |
| inverse_transform(x, y); |
| apply_window(x); |
| for (i=0;i<FRAME_SIZE;i++) out[i] = x[i] + st->synthesis_mem[i]; |
| RNN_COPY(st->synthesis_mem, &x[FRAME_SIZE], FRAME_SIZE); |
| } |
| |
| static void biquad(float *y, float mem[2], const float *x, const float *b, const float *a, int N) { |
| int i; |
| for (i=0;i<N;i++) { |
| float xi, yi; |
| xi = x[i]; |
| yi = x[i] + mem[0]; |
| mem[0] = mem[1] + (b[0]*(double)xi - a[0]*(double)yi); |
| mem[1] = (b[1]*(double)xi - a[1]*(double)yi); |
| y[i] = yi; |
| } |
| } |
| |
| void pitch_filter(kiss_fft_cpx *X, const kiss_fft_cpx *P, const float *Ex, const float *Ep, |
| const float *Exp, const float *g) { |
| int i; |
| float r[NB_BANDS]; |
| float rf[FREQ_SIZE] = {0}; |
| for (i=0;i<NB_BANDS;i++) { |
| #if 0 |
| if (Exp[i]>g[i]) r[i] = 1; |
| else r[i] = Exp[i]*(1-g[i])/(.001 + g[i]*(1-Exp[i])); |
| r[i] = MIN16(1, MAX16(0, r[i])); |
| #else |
| if (Exp[i]>g[i]) r[i] = 1; |
| else r[i] = SQUARE(Exp[i])*(1-SQUARE(g[i]))/(.001 + SQUARE(g[i])*(1-SQUARE(Exp[i]))); |
| r[i] = sqrt(MIN16(1, MAX16(0, r[i]))); |
| #endif |
| r[i] *= sqrt(Ex[i]/(1e-8+Ep[i])); |
| } |
| interp_band_gain(rf, r); |
| for (i=0;i<FREQ_SIZE;i++) { |
| X[i].r += rf[i]*P[i].r; |
| X[i].i += rf[i]*P[i].i; |
| } |
| float newE[NB_BANDS]; |
| compute_band_energy(newE, X); |
| float norm[NB_BANDS]; |
| float normf[FREQ_SIZE]={0}; |
| for (i=0;i<NB_BANDS;i++) { |
| norm[i] = sqrt(Ex[i]/(1e-8+newE[i])); |
| } |
| interp_band_gain(normf, norm); |
| for (i=0;i<FREQ_SIZE;i++) { |
| X[i].r *= normf[i]; |
| X[i].i *= normf[i]; |
| } |
| } |
| |
| float rnnoise_process_frame(DenoiseState *st, float *out, const float *in) { |
| int i; |
| kiss_fft_cpx X[FREQ_SIZE]; |
| kiss_fft_cpx P[WINDOW_SIZE]; |
| float x[FRAME_SIZE]; |
| float Ex[NB_BANDS], Ep[NB_BANDS]; |
| float Exp[NB_BANDS]; |
| float features[NB_FEATURES]; |
| float g[NB_BANDS]; |
| float gf[FREQ_SIZE]={1}; |
| float vad_prob = 0; |
| int silence; |
| static const float a_hp[2] = {-1.99599, 0.99600}; |
| static const float b_hp[2] = {-2, 1}; |
| biquad(x, st->mem_hp_x, in, b_hp, a_hp, FRAME_SIZE); |
| silence = compute_frame_features(st, X, P, Ex, Ep, Exp, features, x); |
| |
| if (!silence) { |
| compute_rnn(&st->rnn, g, &vad_prob, features); |
| pitch_filter(X, P, Ex, Ep, Exp, g); |
| for (i=0;i<NB_BANDS;i++) { |
| float alpha = .6f; |
| g[i] = MAX16(g[i], alpha*st->lastg[i]); |
| st->lastg[i] = g[i]; |
| } |
| interp_band_gain(gf, g); |
| #if 1 |
| for (i=0;i<FREQ_SIZE;i++) { |
| X[i].r *= gf[i]; |
| X[i].i *= gf[i]; |
| } |
| #endif |
| } |
| |
| frame_synthesis(st, out, X); |
| return vad_prob; |
| } |
| |
| #if TRAINING |
| |
| static float uni_rand() { |
| return rand()/(double)RAND_MAX-.5; |
| } |
| |
| static void rand_resp(float *a, float *b) { |
| a[0] = .75*uni_rand(); |
| a[1] = .75*uni_rand(); |
| b[0] = .75*uni_rand(); |
| b[1] = .75*uni_rand(); |
| } |
| |
| int main(int argc, char **argv) { |
| int i; |
| int count=0; |
| static const float a_hp[2] = {-1.99599, 0.99600}; |
| static const float b_hp[2] = {-2, 1}; |
| float a_noise[2] = {0}; |
| float b_noise[2] = {0}; |
| float a_sig[2] = {0}; |
| float b_sig[2] = {0}; |
| float mem_hp_x[2]={0}; |
| float mem_hp_n[2]={0}; |
| float mem_resp_x[2]={0}; |
| float mem_resp_n[2]={0}; |
| float x[FRAME_SIZE]; |
| float n[FRAME_SIZE]; |
| float xn[FRAME_SIZE]; |
| int vad_cnt=0; |
| int gain_change_count=0; |
| float speech_gain = 1, noise_gain = 1; |
| FILE *f1, *f2; |
| int maxCount; |
| DenoiseState *st; |
| DenoiseState *noise_state; |
| DenoiseState *noisy; |
| st = rnnoise_create(NULL); |
| noise_state = rnnoise_create(NULL); |
| noisy = rnnoise_create(NULL); |
| if (argc!=4) { |
| fprintf(stderr, "usage: %s <speech> <noise> <count>\n", argv[0]); |
| return 1; |
| } |
| f1 = fopen(argv[1], "r"); |
| f2 = fopen(argv[2], "r"); |
| maxCount = atoi(argv[3]); |
| for(i=0;i<150;i++) { |
| short tmp[FRAME_SIZE]; |
| fread(tmp, sizeof(short), FRAME_SIZE, f2); |
| } |
| while (1) { |
| kiss_fft_cpx X[FREQ_SIZE], Y[FREQ_SIZE], N[FREQ_SIZE], P[WINDOW_SIZE]; |
| float Ex[NB_BANDS], Ey[NB_BANDS], En[NB_BANDS], Ep[NB_BANDS]; |
| float Exp[NB_BANDS]; |
| float Ln[NB_BANDS]; |
| float features[NB_FEATURES]; |
| float g[NB_BANDS]; |
| short tmp[FRAME_SIZE]; |
| float vad=0; |
| float E=0; |
| if (count==maxCount) break; |
| if ((count%1000)==0) fprintf(stderr, "%d\r", count); |
| if (++gain_change_count > 2821) { |
| speech_gain = pow(10., (-40+(rand()%60))/20.); |
| noise_gain = pow(10., (-30+(rand()%50))/20.); |
| if (rand()%10==0) noise_gain = 0; |
| noise_gain *= speech_gain; |
| if (rand()%10==0) speech_gain = 0; |
| gain_change_count = 0; |
| rand_resp(a_noise, b_noise); |
| rand_resp(a_sig, b_sig); |
| lowpass = FREQ_SIZE * 3000./24000. * pow(50., rand()/(double)RAND_MAX); |
| for (i=0;i<NB_BANDS;i++) { |
| if (eband5ms[i]<<FRAME_SIZE_SHIFT > lowpass) { |
| band_lp = i; |
| break; |
| } |
| } |
| } |
| if (speech_gain != 0) { |
| fread(tmp, sizeof(short), FRAME_SIZE, f1); |
| if (feof(f1)) { |
| rewind(f1); |
| fread(tmp, sizeof(short), FRAME_SIZE, f1); |
| } |
| for (i=0;i<FRAME_SIZE;i++) x[i] = speech_gain*tmp[i]; |
| for (i=0;i<FRAME_SIZE;i++) E += tmp[i]*(float)tmp[i]; |
| } else { |
| for (i=0;i<FRAME_SIZE;i++) x[i] = 0; |
| E = 0; |
| } |
| if (noise_gain!=0) { |
| fread(tmp, sizeof(short), FRAME_SIZE, f2); |
| if (feof(f2)) { |
| rewind(f2); |
| fread(tmp, sizeof(short), FRAME_SIZE, f2); |
| } |
| for (i=0;i<FRAME_SIZE;i++) n[i] = noise_gain*tmp[i]; |
| } else { |
| for (i=0;i<FRAME_SIZE;i++) n[i] = 0; |
| } |
| biquad(x, mem_hp_x, x, b_hp, a_hp, FRAME_SIZE); |
| biquad(x, mem_resp_x, x, b_sig, a_sig, FRAME_SIZE); |
| biquad(n, mem_hp_n, n, b_hp, a_hp, FRAME_SIZE); |
| biquad(n, mem_resp_n, n, b_noise, a_noise, FRAME_SIZE); |
| for (i=0;i<FRAME_SIZE;i++) xn[i] = x[i] + n[i]; |
| if (E > 1e9f) { |
| vad_cnt=0; |
| } else if (E > 1e8f) { |
| vad_cnt -= 5; |
| } else if (E > 1e7f) { |
| vad_cnt++; |
| } else { |
| vad_cnt+=2; |
| } |
| if (vad_cnt < 0) vad_cnt = 0; |
| if (vad_cnt > 15) vad_cnt = 15; |
| |
| if (vad_cnt >= 10) vad = 0; |
| else if (vad_cnt > 0) vad = 0.5f; |
| else vad = 1.f; |
| |
| frame_analysis(st, Y, Ey, x); |
| frame_analysis(noise_state, N, En, n); |
| for (i=0;i<NB_BANDS;i++) Ln[i] = log10(1e-2+En[i]); |
| int silence = compute_frame_features(noisy, X, P, Ex, Ep, Exp, features, xn); |
| pitch_filter(X, P, Ex, Ep, Exp, g); |
| //printf("%f %d\n", noisy->last_gain, noisy->last_period); |
| for (i=0;i<NB_BANDS;i++) { |
| g[i] = sqrt((Ey[i]+1e-3)/(Ex[i]+1e-3)); |
| if (g[i] > 1) g[i] = 1; |
| if (silence || i > band_lp) g[i] = -1; |
| if (Ey[i] < 5e-2 && Ex[i] < 5e-2) g[i] = -1; |
| if (vad==0 && noise_gain==0) g[i] = -1; |
| } |
| count++; |
| #if 1 |
| fwrite(features, sizeof(float), NB_FEATURES, stdout); |
| fwrite(g, sizeof(float), NB_BANDS, stdout); |
| fwrite(Ln, sizeof(float), NB_BANDS, stdout); |
| fwrite(&vad, sizeof(float), 1, stdout); |
| #endif |
| } |
| fprintf(stderr, "matrix size: %d x %d\n", count, NB_FEATURES + 2*NB_BANDS + 1); |
| fclose(f1); |
| fclose(f2); |
| return 0; |
| } |
| |
| #endif |