blob: 4f94ffceac157a58705ec401c25c0ad10cb1ea02 [file] [log] [blame]
/* Copyright (C) 2002 Jean-Marc Valin
File: speex.c
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include "speex.h"
#include "lpc.h"
#include "lsp.h"
#include "ltp.h"
#include "quant_lsp.h"
#include "cb_search.h"
#include "filters.h"
#include "stack_alloc.h"
/* Codebook tables; only declared here, their definitions are not in the
   visible part of this file. */
/* Referenced only by the disabled (#if 0) stochastic-codebook branch of
   encode(), which reads entries 0..1079. */
extern float stoc[];
/* Table with rows of 8 floats; encode() hands it to split_cb_search(). */
extern float exc_table[][8];
/* Fallback definition for C libraries whose <math.h> does not provide M_PI */
#ifndef M_PI
#define M_PI 3.14159265358979323846 /* pi */
#endif
/* Square of x. The argument is expanded twice, so it must not have side
   effects (no sqr(i++)). */
#define sqr(x) ((x)*(x))
/* Smaller of a and b. One of the arguments is evaluated twice, so both must
   be free of side effects. */
#define min(a,b) ((a) < (b) ? (a) : (b))
/* Prepare an encoder state for use with encode().
 *
 * st   : state structure to fill in; every buffer it points to is allocated
 *        here and must later be released with encoder_destroy().
 * mode : source of the frame/sub-frame/window/buffer sizes, the LPC order,
 *        the two gamma factors and the pitch range.
 *
 * Allocation results are not checked; the function has no way to report
 * failure to its caller.
 */
void encoder_init(EncState *st, SpeexMode *mode)
{
   int k;
   int nb_taps;   /* number of taps in the over-sampling filter */
   int lpc_len;   /* LPC order + 1 */
   int tail;      /* offset of the analysis window inside the long buffers */
   float norm;

   /* Sizes and analysis settings taken from the mode */
   st->frameSize    = mode->frameSize;
   st->subframeSize = mode->subframeSize;
   st->nbSubframes  = mode->frameSize/mode->subframeSize;
   st->windowSize   = mode->windowSize;
   st->bufSize      = mode->bufSize;
   st->lpcSize      = mode->lpcSize;
   st->gamma1       = mode->gamma1;
   st->gamma2       = mode->gamma2;
   st->min_pitch    = mode->pitchStart;
   st->max_pitch    = mode->pitchEnd;

   /* Over-sampling filter for fractional pitch: a sin(x)/x kernel shaped by a
      raised-cosine taper, built symmetrically around the centre tap */
   st->os_fact = 4;
   st->os_filt_ord2 = 4*st->os_fact;
   nb_taps = 2*st->os_filt_ord2+1;
   st->os_filt = malloc(nb_taps*sizeof(float));
   st->os_filt[st->os_filt_ord2] = 1;
   k = 1;
   while (k<=st->os_filt_ord2)
   {
      float x=M_PI*k/st->os_fact;
      float tap=sin(x)/x*(.5+.5*cos(M_PI*k/st->os_filt_ord2));
      st->os_filt[st->os_filt_ord2+k] = tap;
      st->os_filt[st->os_filt_ord2-k] = tap;
      k++;
   }

   /* Scale the taps so that they sum to one */
   norm=0;
   for (k=0;k<nb_taps;k++)
      norm += st->os_filt[k];
   norm=1/norm;
   for (k=0;k<nb_taps;k++)
      st->os_filt[k] *= norm;

   /* Zero-filled history buffers for input, excitation and weighted signal.
      The working pointers address the last windowSize samples of each. */
   tail = st->bufSize - st->windowSize;
   st->inBuf = calloc(st->bufSize,sizeof(float));
   st->frame = st->inBuf + tail;
   st->excBuf = calloc(st->bufSize,sizeof(float));
   st->exc = st->excBuf + tail;
   st->swBuf = calloc(st->bufSize,sizeof(float));
   st->sw = st->swBuf + tail;

   /* Hanning analysis window */
   st->window = malloc(st->windowSize*sizeof(float));
   for (k=0;k<st->windowSize;k++)
      st->window[k]=.5*(1-cos(2*M_PI*k/st->windowSize));

   /* Gaussian lag window applied to the autocorrelation */
   lpc_len = st->lpcSize+1;
   st->lagWindow = malloc(lpc_len*sizeof(float));
   for (k=0;k<lpc_len;k++)
      st->lagWindow[k]=exp(-.5*sqr(2*M_PI*.01*k));

   st->autocorr = malloc(lpc_len*sizeof(float));

   /* Scratch area used through PUSH/POP */
   st->stack = calloc(10000, sizeof(float));

   st->buf2 = malloc(st->windowSize*sizeof(float));

   /* Coefficient vectors holding lpcSize+1 values */
   st->lpc = malloc(lpc_len*sizeof(float));
   st->interp_lpc = malloc(lpc_len*sizeof(float));
   st->interp_qlpc = malloc(lpc_len*sizeof(float));
   st->bw_lpc1 = malloc(lpc_len*sizeof(float));
   st->bw_lpc2 = malloc(lpc_len*sizeof(float));
   st->bw_az = malloc((st->lpcSize*2+1)*sizeof(float));

   /* Vectors holding lpcSize values */
   st->lsp = malloc(st->lpcSize*sizeof(float));
   st->qlsp = malloc(st->lpcSize*sizeof(float));
   st->old_lsp = malloc(st->lpcSize*sizeof(float));
   st->old_qlsp = malloc(st->lpcSize*sizeof(float));
   st->interp_lsp = malloc(st->lpcSize*sizeof(float));
   st->interp_qlsp = malloc(st->lpcSize*sizeof(float));
   st->rc = malloc(st->lpcSize*sizeof(float));

   /* No frame has been encoded yet */
   st->first = 1;

   /* Filter memories start out cleared */
   st->mem_sp = calloc(st->lpcSize, sizeof(float));
   st->mem_sw = calloc(st->lpcSize, sizeof(float));
}
/* Release every buffer allocated by encoder_init().
 *
 * st : encoder state previously set up with encoder_init(). The structure
 *      itself is not freed, and its pointers are left dangling, so it must
 *      be re-initialised before being used again.
 */
void encoder_destroy(EncState *st)
{
   /* Over-sampling filter taps (allocated in encoder_init; this buffer used
      to be leaked) */
   free(st->os_filt);

   /* Signal history buffers and scratch stack */
   free(st->inBuf);
   free(st->excBuf);
   free(st->swBuf);
   free(st->stack);

   /* Analysis window and windowed-signal buffer */
   free(st->window);
   free(st->buf2);

   /* LPC coefficient vectors */
   free(st->lpc);
   free(st->interp_lpc);
   free(st->interp_qlpc);
   free(st->bw_lpc1);
   free(st->bw_lpc2);
   free(st->bw_az);

   /* Autocorrelation and its lag window */
   free(st->autocorr);
   free(st->lagWindow);

   /* LSP vectors and reflection coefficients */
   free(st->lsp);
   free(st->qlsp);
   free(st->old_lsp);
   free(st->interp_lsp);
   free(st->old_qlsp);
   free(st->interp_qlsp);
   free(st->rc);

   /* Filter memories */
   free(st->mem_sp);
   free(st->mem_sw);
}
/* Encode one frame.
 *
 * st   : encoder state set up by encoder_init().
 * in   : frameSize input samples. On return the first frameSize samples of
 *        the analysis window (st->frame), which the sub-frame loop has
 *        overwritten with synthesized speech, are copied back into this
 *        array.
 * bits : not referenced by this function; nothing is written to it.
 *
 * Processing order: shift the history buffers and append the new frame,
 * windowed autocorrelation -> Levinson-Durbin -> LSP conversion and
 * quantization, open-loop pitch estimate on the weighted signal, then for
 * every sub-frame: LSP interpolation, target computation, pitch (adaptive
 * codebook) search, split codebook search and final synthesis.
 *
 * Side effects: prints diagnostics with printf(), and terminates the process
 * with exit(1) when lpc_to_lsp() does not return lpcSize roots.
 */
void encode(EncState *st, float *in, FrameBits *bits)
{
   int i, j, sub, roots;
   float error;

   /* Copy new data in input buffer */
   memmove(st->inBuf, st->inBuf+st->frameSize, (st->bufSize-st->frameSize)*sizeof(float));
   for (i=0;i<st->frameSize;i++)
      st->inBuf[st->bufSize-st->frameSize+i] = in[i];
   /* Shift the excitation and weighted-signal histories by one frame too */
   memmove(st->excBuf, st->excBuf+st->frameSize, (st->bufSize-st->frameSize)*sizeof(float));
   memmove(st->swBuf, st->swBuf+st->frameSize, (st->bufSize-st->frameSize)*sizeof(float));

   /* Window for analysis */
   for (i=0;i<st->windowSize;i++)
      st->buf2[i] = st->frame[i] * st->window[i];

   /* Compute auto-correlation */
   autocorr(st->buf2, st->autocorr, st->lpcSize+1, st->windowSize);

   st->autocorr[0] += 1;        /* prevents NANs */
   st->autocorr[0] *= 1.0001;   /* 40 dB noise floor */
   /* Lag windowing: equivalent to filtering in the power-spectrum domain */
   for (i=0;i<st->lpcSize+1;i++)
      st->autocorr[i] *= st->lagWindow[i];

   /* Levinson-Durbin (the returned value is stored but not used further) */
   error = wld(st->lpc+1, st->autocorr, st->rc, st->lpcSize);
   st->lpc[0]=1;

   /* LPC to LSPs (x-domain) transform */
   roots=lpc_to_lsp (st->lpc, st->lpcSize, st->lsp, 6, 0.02, st->stack);
   if (roots!=st->lpcSize)
   {
      fprintf (stderr, "roots!=st->lpcSize\n");
      exit(1);
   }

   /* x-domain to angle domain*/
   for (i=0;i<st->lpcSize;i++)
      st->lsp[i] = acos(st->lsp[i]);

   /* First frame: old_lsp comes from malloc() in encoder_init() and has never
      been written, so interpolating from it would read indeterminate values.
      Seed it with the current LSPs instead. */
   if (st->first)
   {
      for (i=0;i<st->lpcSize;i++)
         st->old_lsp[i] = st->lsp[i];
   }

   /* LSP Quantization */
   {
      unsigned int id;
      for (i=0;i<st->lpcSize;i++)
         st->qlsp[i]=st->lsp[i];
      id=lsp_quant_nb(st->qlsp,10 );
      lsp_unquant_nb(st->qlsp,10,id);
   }

   /* First frame: same seeding for the quantized LSP history */
   if (st->first)
   {
      for (i=0;i<st->lpcSize;i++)
         st->old_qlsp[i] = st->qlsp[i];
   }

   /*Find open-loop pitch for the whole frame*/
   {
      float *mem = PUSH(st->stack, st->lpcSize);
      /* LPCs from the mid-point between the previous and current LSPs */
      for (i=0;i<st->lpcSize;i++)
         st->interp_lsp[i] = .5*st->old_lsp[i] + .5*st->lsp[i];
      for (i=0;i<st->lpcSize;i++)
         st->interp_lsp[i] = cos(st->interp_lsp[i]);
      lsp_to_lpc(st->interp_lsp, st->interp_lpc, st->lpcSize,st->stack);
      bw_lpc(st->gamma1, st->interp_lpc, st->bw_lpc1, st->lpcSize);
      bw_lpc(st->gamma2, st->interp_lpc, st->bw_lpc2, st->lpcSize);
      /* Filter with copies of the memories so the real ones stay untouched */
      for (i=0;i<st->lpcSize;i++)
         mem[i]=st->mem_sp[i];
      residue_mem(st->frame, st->bw_lpc1, st->sw, st->frameSize, st->lpcSize, mem);
      for (i=0;i<st->lpcSize;i++)
         mem[i]=st->mem_sw[i];
      syn_filt_mem(st->sw, st->bw_lpc2, st->sw, st->frameSize, st->lpcSize, mem);
      open_loop_pitch(st->sw, st->min_pitch, st->max_pitch, st->frameSize, &st->ol_pitch, &st->ol_voiced);
      printf ("Open-loop pitch = %d\n", st->ol_pitch);
      POP(st->stack);
   }

   /* Loop on sub-frames */
   for (sub=0;sub<st->nbSubframes;sub++)
   {
      float tmp, gain[3];
      float esig=0, enoise=0, snr;
      int pitch, offset, pitch_gain_index;
      float *sp, *sw, *res, *exc, *target, *mem;
      /* Offset relative to start of frame */
      offset = st->subframeSize*sub;
      /* Original signal */
      sp=st->frame+offset;
      /* Excitation */
      exc=st->exc+offset;
      /* Weighted signal */
      sw=st->sw+offset;
      /* Filter response */
      res = PUSH(st->stack, st->subframeSize);
      /* Target signal */
      target = PUSH(st->stack, st->subframeSize);
      /* Scratch copy of a filter memory */
      mem = PUSH(st->stack, st->lpcSize);

      /* LSP interpolation (quantized and unquantized) */
      tmp = (.5 + sub)/st->nbSubframes;
      for (i=0;i<st->lpcSize;i++)
         st->interp_lsp[i] = (1-tmp)*st->old_lsp[i] + tmp*st->lsp[i];
      for (i=0;i<st->lpcSize;i++)
         st->interp_qlsp[i] = (1-tmp)*st->old_qlsp[i] + tmp*st->qlsp[i];

      /* Compute interpolated LPCs (quantized and unquantized) */
      for (i=0;i<st->lpcSize;i++)
         st->interp_lsp[i] = cos(st->interp_lsp[i]);
      lsp_to_lpc(st->interp_lsp, st->interp_lpc, st->lpcSize,st->stack);
      for (i=0;i<st->lpcSize;i++)
         st->interp_qlsp[i] = cos(st->interp_qlsp[i]);
      lsp_to_lpc(st->interp_qlsp, st->interp_qlpc, st->lpcSize, st->stack);

      /* Compute bandwidth-expanded (unquantized) LPCs for perceptual weighting */
      bw_lpc(st->gamma1, st->interp_lpc, st->bw_lpc1, st->lpcSize);
      bw_lpc(st->gamma2, st->interp_lpc, st->bw_lpc2, st->lpcSize);

      /* Reset excitation */
      for (i=0;i<st->subframeSize;i++)
         exc[i]=0;

      /* Compute zero response of A(z/g1) / ( A(z/g2) * Aq(z) ) */
      for (i=0;i<st->lpcSize;i++)
         mem[i]=st->mem_sp[i];
      syn_filt_mem(exc, st->interp_qlpc, exc, st->subframeSize, st->lpcSize, mem);
      for (i=0;i<st->lpcSize;i++)
         mem[i]=st->mem_sp[i];
      residue_mem(exc, st->bw_lpc1, res, st->subframeSize, st->lpcSize, mem);
      for (i=0;i<st->lpcSize;i++)
         mem[i]=st->mem_sw[i];
      syn_filt_mem(res, st->bw_lpc2, res, st->subframeSize, st->lpcSize, mem);

      /* Compute weighted signal */
      for (i=0;i<st->lpcSize;i++)
         mem[i]=st->mem_sp[i];
      residue_mem(sp, st->bw_lpc1, sw, st->subframeSize, st->lpcSize, mem);
      for (i=0;i<st->lpcSize;i++)
         mem[i]=st->mem_sw[i];
      syn_filt_mem(sw, st->bw_lpc2, sw, st->subframeSize, st->lpcSize, mem);
      /* Energy of the weighted signal, used for the SNR printouts below */
      for (i=0;i<st->subframeSize;i++)
         esig+=sw[i]*sw[i];

      /* Compute target signal */
      for (i=0;i<st->subframeSize;i++)
         target[i]=sw[i]-res[i];
      for (i=0;i<st->subframeSize;i++)
         exc[i]=0;
#if 1 /*If set to 0, we compute the excitation directly from the target, i.e. we're cheating */
      /* Perform adaptive codebook search (3-tap pitch predictor) */
      pitch = st->ol_pitch;
      /* NOTE(review): both searches below are given the literal range 20..147
         instead of st->min_pitch/st->max_pitch -- confirm this is intended. */
#if 0 /* 1 for fractional pitch, 0 for integer pitch */
      closed_loop_fractional_pitch(target, st->interp_qlpc, st->bw_lpc1, st->bw_lpc2,
                                   exc, st->os_filt, st->os_filt_ord2, st->os_fact, 20, 147,
                                   &gain[0], &pitch, st->lpcSize,
                                   st->subframeSize, st->stack);
#else
      pitch_search_3tap(target, st->interp_qlpc, st->bw_lpc1, st->bw_lpc2,
                        exc, 20, 147, &gain[0], &pitch, &pitch_gain_index, st->lpcSize,
                        st->subframeSize);
      /* Build the pitch contribution from past excitation (negative indices
         reach back into the excitation history buffer) */
      for (i=0;i<st->subframeSize;i++)
         exc[i]=gain[0]*exc[i-pitch]+gain[1]*exc[i-pitch-1]+gain[2]*exc[i-pitch-2];
      printf ("3-tap pitch = %d, gains = [%f %f %f]\n",pitch, gain[0], gain[1], gain[2]);
#endif
      /* Update target for adaptive codebook contribution */
      residue_zero(exc, st->bw_lpc1, res, st->subframeSize, st->lpcSize);
      syn_filt_zero(res, st->interp_qlpc, res, st->subframeSize, st->lpcSize);
      syn_filt_zero(res, st->bw_lpc2, res, st->subframeSize, st->lpcSize);
      for (i=0;i<st->subframeSize;i++)
         target[i]-=res[i];
      /* Residual energy after pitch prediction */
      enoise=0;
      for (i=0;i<st->subframeSize;i++)
         enoise += target[i]*target[i];
      snr = 10*log10((esig+1)/(enoise+1));
      printf ("pitch SNR = %f\n", snr);
#if 0 /* 1 for stochastic excitation, 0 for split-VQ*/
      for(j=0;j<1;j++){
         /*float stoc2[1080];*/
         float *stoc2 = PUSH(st->stack,1080);
         for (i=0;i<1080;i++)
         {
            stoc2[i]=stoc[i];
            if (i-(pitch-1)>=0)
               stoc2[i] += .0*stoc[i-(pitch-1)];
         }
         POP(st->stack);
         /* Perform stochastic codebook search */
         overlap_cb_search(target, st->interp_qlpc, st->bw_lpc1, st->bw_lpc2,
                           stoc2, 1024, &gain[0], &pitch, st->lpcSize,
                           st->subframeSize);
         printf ("gain = %f index = %d energy = %f\n",gain[0], pitch, esig);
         for (i=0;i<st->subframeSize;i++)
            exc[i]+=gain[0]*stoc2[i+pitch];
         /* Update target for adaptive codebook contribution (Useless for now)*/
         residue_zero(stoc2+pitch, st->bw_lpc1, res, st->subframeSize, st->lpcSize);
         syn_filt_zero(res, st->interp_qlpc, res, st->subframeSize, st->lpcSize);
         syn_filt_zero(res, st->bw_lpc2, res, st->subframeSize, st->lpcSize);
         for (i=0;i<st->subframeSize;i++)
            target[i]-=gain[0]*res[i];
      }
#else
      /* Split codebook search over exc_table (64 passed as its size argument) */
      split_cb_search(target, st->interp_qlpc, st->bw_lpc1, st->bw_lpc2,
                      exc_table, 64, &gain[0], &pitch, st->lpcSize,
                      st->subframeSize, exc);
#endif
      /* Compute weighted noise energy, SNR */
      enoise=0;
      for (i=0;i<st->subframeSize;i++)
         enoise += target[i]*target[i];
      snr = 10*log10((esig+1)/(enoise+1));
      printf ("seg SNR = %f\n", snr);
#else
#if 1 /* Code to calculate the exact excitation after pitch prediction */
      /* Save the target; it is restored once the diagnostic pass is done */
      for (i=0;i<st->subframeSize;i++)
         st->buf2[i]=target[i];
#if 0 /* 0 for fractional pitch, 1 for integer */
      pitch_search_3tap(target, st->interp_qlpc, st->bw_lpc1, st->bw_lpc2,
                        exc, 20, 147, &gain[0], &pitch, &pitch_gain_index, st->lpcSize,
                        st->subframeSize);
      for (i=0;i<st->subframeSize;i++)
         exc[i]=gain[0]*exc[i-pitch]+gain[1]*exc[i-pitch-1]+gain[2]*exc[i-pitch-2];
      printf ("3-tap pitch = %d, gains = [%f %f %f]\n",pitch, gain[0], gain[1], gain[2]);
#else
      pitch = st->ol_pitch;
      closed_loop_fractional_pitch(target, st->interp_qlpc, st->bw_lpc1, st->bw_lpc2,
                                   exc, st->os_filt, st->os_filt_ord2, st->os_fact, 20, 147,
                                   &gain[0], &pitch, st->lpcSize,
                                   st->subframeSize, st->stack);
#endif
      /* Update target for adaptive codebook contribution */
      residue_zero(exc, st->bw_lpc1, res, st->subframeSize, st->lpcSize);
      syn_filt_zero(res, st->interp_qlpc, res, st->subframeSize, st->lpcSize);
      syn_filt_zero(res, st->bw_lpc2, res, st->subframeSize, st->lpcSize);
      for (i=0;i<st->subframeSize;i++)
         target[i]-=res[i];
      enoise=0;
      for (i=0;i<st->subframeSize;i++)
         enoise += target[i]*target[i];
      snr = 10*log10((esig+1)/(enoise+1));
      printf ("pitch SNR = %f\n", snr);
      syn_filt_zero(target, st->bw_lpc1, res, st->subframeSize, st->lpcSize);
      residue_zero(res, st->interp_qlpc, exc, st->subframeSize, st->lpcSize);
      residue_zero(exc, st->bw_lpc2, exc, st->subframeSize, st->lpcSize);
      /* Dump the excitation, eight values per line, when pitch SNR exceeds 5 dB */
      if (snr>5)
      {
         for (i=0;i<st->subframeSize;i++)
         {
            if (i%8==0&&i)
               printf("\n");
            printf ("%f ", exc[i]);
         }
         printf ("\n");
      }
      for (i=0;i<st->subframeSize;i++)
         target[i]=st->buf2[i];
#endif
      /* We're cheating to get perfect reconstruction */
      syn_filt_zero(target, st->bw_lpc1, res, st->subframeSize, st->lpcSize);
      residue_zero(res, st->interp_qlpc, exc, st->subframeSize, st->lpcSize);
      residue_zero(exc, st->bw_lpc2, exc, st->subframeSize, st->lpcSize);
#endif
      /*Keep the previous memory*/
      for (i=0;i<st->lpcSize;i++)
         mem[i]=st->mem_sp[i];
      /* Final signal synthesis from excitation */
      syn_filt_mem(exc, st->interp_qlpc, sp, st->subframeSize, st->lpcSize, st->mem_sp);
      /* Compute weighted signal again, from synthesized speech (not sure it's the right thing) */
      residue_mem(sp, st->bw_lpc1, sw, st->subframeSize, st->lpcSize, mem);
      syn_filt_mem(sw, st->bw_lpc2, sw, st->subframeSize, st->lpcSize, st->mem_sw);
      /* Release mem, target and res */
      POP(st->stack);
      POP(st->stack);
      POP(st->stack);
   }

   /* Store the LSPs for interpolation in the next frame */
   for (i=0;i<st->lpcSize;i++)
      st->old_lsp[i] = st->lsp[i];
   for (i=0;i<st->lpcSize;i++)
      st->old_qlsp[i] = st->qlsp[i];

   /* The next frame will not be the first (Duh!) */
   st->first = 0;

   /* Replace input by synthesized speech */
   for (i=0;i<st->frameSize;i++)
      in[i]=st->frame[i];
}
/* Prepare a decoder state.
 *
 * st   : state structure to fill in; the two buffers allocated here must be
 *        released with decoder_destroy().
 * mode : source of the frame/sub-frame/window/buffer sizes, the LPC order
 *        and the two gamma factors.
 *
 * Allocation results are not checked; the function has no way to report
 * failure to its caller.
 */
void decoder_init(DecState *st, SpeexMode *mode)
{
   /* Codec parameters, should eventually have several "modes"*/
   st->frameSize = mode->frameSize;
   st->windowSize = mode->windowSize;
   st->nbSubframes=mode->frameSize/mode->subframeSize;
   st->subframeSize=mode->subframeSize;
   st->lpcSize = mode->lpcSize;
   st->bufSize = mode->bufSize;
   st->gamma1=mode->gamma1;
   st->gamma2=mode->gamma2;

   /* Zero-filled signal and excitation histories, allocated with calloc()
      exactly as encoder_init() does. The working pointers address the last
      windowSize samples of each buffer. */
   st->inBuf = calloc(st->bufSize, sizeof(float));
   st->frame = st->inBuf + st->bufSize - st->windowSize;
   st->excBuf = calloc(st->bufSize, sizeof(float));
   st->exc = st->excBuf + st->bufSize - st->windowSize;
}
/* Release the buffers allocated by decoder_init().
 *
 * st : decoder state previously set up with decoder_init(). The structure
 *      itself is not freed.
 */
void decoder_destroy(DecState *st)
{
   /* Excitation history */
   free(st->excBuf);
   /* Signal history */
   free(st->inBuf);
}
/* Decode one frame.
 *
 * Not implemented yet: the body is empty, so st, bits and out are neither
 * read nor written and `out` keeps whatever the caller put in it.
 */
void decode(DecState *st, FrameBits *bits, float *out)
{
}