...
git-svn-id: http://svn.xiph.org/trunk/speex@4121 0101bb08-14d6-0310-b084-bc0e0c8e3800
diff --git a/configure.in b/configure.in
index af5b086..df043a6 100644
--- a/configure.in
+++ b/configure.in
@@ -5,7 +5,7 @@
SPEEX_MAJOR_VERSION=0
SPEEX_MINOR_VERSION=99
SPEEX_MICRO_VERSION=0
-SPEEX_VERSION=1.0beta3
+SPEEX_VERSION=1.0beta4cvs
SPEEX_BINARY_AGE=0
SPEEX_INTERFACE_AGE=0
diff --git a/doc/manual.lyx b/doc/manual.lyx
index 9d26643..310e3bb 100644
--- a/doc/manual.lyx
+++ b/doc/manual.lyx
@@ -494,17 +494,18 @@
\layout Standard
-During voiced segments, the speech signal is very periodic, so it is possible
- to take advantage of that by expressing the excitation signal
-\begin_inset Formula $e(n)$
+During voiced segments, the speech signal is periodic, so it is possible
+ to take advantage of that property by approximating the excitation signal
+
+\begin_inset Formula $e[n]$
\end_inset
- as
+ by a gain times the past of the excitation:
\layout Standard
\begin_inset Formula \[
-e[n]=\beta e[n-T]+c[n]\]
+e[n]\simeq p[n]=\beta e[n-T]\]
\end_inset
@@ -528,28 +529,60 @@
innovation codebook
\emph default
.
- In the
-\emph on
-z
-\emph default
--domain, the excitation can be expressed as:
-\layout Standard
-
-
-\begin_inset Formula \[
-e(z)=\frac{1}{1-\beta z^{-T}}\: c(z)\]
-
+ We call that long-term prediction since the excitation is predicted from
+
+\begin_inset Formula $e[n-T]$
\end_inset
+ with
+\begin_inset Formula $T\gg N$
+\end_inset
+.
\layout Subsection
Innovation Codebook
\layout Standard
+The final excitation
+\begin_inset Formula $e[n]$
+\end_inset
+
+ will be the sum of the pitch prediction and an
+\emph on
+innovation
+\emph default
+ signal
+\begin_inset Formula $c[n]$
+\end_inset
+
+ taken from a fixed codebook.
+\layout Standard
+
+
+\begin_inset Formula \[
+e[n]=p[n]+c[n]=\beta e[n-T]+c[n]\]
+
+\end_inset
+
This is where most of the bits in a CELP codec are allocated.
It represents the information that couldn't be obtained either from linear
prediction or pitch prediction.
+ In the
+\emph on
+z
+\emph default
+-domain we can represent the final signal
+\begin_inset Formula $X(z)$
+\end_inset
+
+ as
+\begin_inset Formula \[
+X(z)=\frac{C(z)}{A(z)\left(1-\beta z^{-T}\right)}\]
+
+\end_inset
+
+
\layout Subsection
Analysis-by-Synthesis and Error Weighting
@@ -2286,7 +2319,7 @@
\layout Standard
-(Tones/DTMF to be implemented)
+reserved
\end_inset
</cell>
</row>
diff --git a/libspeex/nb_celp.c b/libspeex/nb_celp.c
index c3617f6..e6a289b 100644
--- a/libspeex/nb_celp.c
+++ b/libspeex/nb_celp.c
@@ -384,7 +384,7 @@
break;
mode--;
}
- /*fprintf (stderr, "%f %d\n", st->relative_quality, mode);*/
+ fprintf (stderr, "%f %d\n", st->relative_quality, mode);
speex_encoder_ctl(state, SPEEX_SET_MODE, &mode);
} else {
st->relative_quality = -1;
diff --git a/libspeex/sb_celp.c b/libspeex/sb_celp.c
index 5591fbc..b4662b6 100644
--- a/libspeex/sb_celp.c
+++ b/libspeex/sb_celp.c
@@ -363,7 +363,7 @@
ratio=0;
/*if (ratio>-2)*/
low_qual+=1.0*(ratio+2);
- {
+ /*{
int high_mode=2;
if (low_qual>10)
high_mode=4;
@@ -371,8 +371,26 @@
high_mode=3;
else if (low_qual>5)
high_mode=2;
- /*high_mode=1;*/
speex_encoder_ctl(st, SPEEX_SET_HIGH_MODE, &high_mode);
+ }*/
+      {
+         /* Pick the highest high-band mode (4..1) whose quality threshold low_qual exceeds; mode ends at 0 if none does. */
+         int mode = 4;
+         while (mode)
+         {
+            int v1;
+            float thresh;
+            v1=(int)floor(st->vbr_quality);
+            if (v1>=10) /* last column of the 11-entry row: there is no v1+1 entry to interpolate with */
+               thresh = vbr_hb_thresh[mode][10];
+            else
+               thresh = (st->vbr_quality-v1)*vbr_hb_thresh[mode][v1+1] + (1+v1-st->vbr_quality)*vbr_hb_thresh[mode][v1];
+            if (low_qual > thresh)
+               break;
+            mode--;
+         }
+         /*fprintf (stderr, "%f %d\n", low_qual, mode);*/
+         speex_encoder_ctl(state, SPEEX_SET_HIGH_MODE, &mode);
}
/*fprintf (stderr, "%f %f\n", ratio, low_qual);*/
}
@@ -970,6 +988,7 @@
{
int q;
float qual = (*(float*)ptr)+.5;
+ st->vbr_quality = (*(float*)ptr);
if (qual>10)
qual=10;
q=(int)floor(.5+*(float*)ptr);
diff --git a/libspeex/vbr.c b/libspeex/vbr.c
index f8f28eb..a43db62 100644
--- a/libspeex/vbr.c
+++ b/libspeex/vbr.c
@@ -47,9 +47,9 @@
float vbr_nb_thresh[8][11]={
{-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0}, /* silence */
- { 3.9, 2.5, 2.0, 1.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0}, /* 2 kbps */
- { 8.0, 6.0, 3.9, 4.5, 4.0, 3.5, 3.0, 2.5, 2.0, 1.0, 0.0}, /* 6 kbps */
- {11.0, 8.5, 7.5, 7.0, 6.5, 6.0, 5.5, 5.0, 4.0, 3.0, 1.0}, /* 8 kbps */
+ { 3.9, 2.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0}, /* 2 kbps */
+ { 8.0, 5.6, 4.7, 4.2, 3.9, 3.5, 3.0, 2.5, 2.0, 1.0, 0.0}, /* 6 kbps */
+ {11.0, 8.5, 7.5, 6.5, 5.0, 3.9, 3.9, 3.9, 3.5, 3.0, 1.0}, /* 8 kbps */
{11.0, 11.0, 9.9, 9.0, 8.0, 7.0, 6.5, 6.0, 5.0, 4.0, 2.0}, /* 11 kbps */
{11.0, 11.0, 11.0, 11.0, 9.5, 9.0, 8.0, 7.0, 6.5, 5.0, 3.0}, /* 15 kbps */
{11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 9.5, 8.5, 8.0, 6.5, 4.0}, /* 18 kbps */
@@ -57,6 +57,14 @@
};
+float vbr_hb_thresh[5][11]={ /* high-band VBR thresholds, indexed [high-band mode 0..4][integer VBR quality 0..10] */
+ {-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0}, /* silence */
+ { 3.9, 2.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0}, /* 2 kbps */
+ {11.0, 11.0, 9.9, 9.0, 8.0, 7.0, 6.5, 6.0, 5.0, 4.0, 2.0}, /* 6 kbps */
+ {11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 9.5, 8.5, 8.0, 6.5, 4.0}, /* 10 kbps */
+ {11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 9.8, 7.5, 5.5} /* 18 kbps */
+};
+
void vbr_init(VBRState *vbr)
{
int i;
@@ -104,6 +112,7 @@
non-stationary (harder to notice high-frequency noise)???
*/
+#include <stdio.h>
float vbr_analysis(VBRState *vbr, float *sig, int len, int pitch, float pitch_coef)
{
int i;
@@ -153,13 +162,6 @@
vbr->consec_noise=0;
}
- /* Checking for "pseudo temporal masking" */
- if (ener < .1*vbr->average_energy)
- qual -= .5;
- if (ener < .01*vbr->average_energy)
- qual -= .5;
- if (ener < .001*vbr->average_energy)
- qual -= .5;
/* Checking for very low absolute energy */
if (ener < 30000)
{
@@ -169,26 +171,33 @@
if (ener < 3000)
qual-=.7;
} else {
- /* Checking for energy increases */
- if (ener > vbr->last_energy*4.0)
- qual += .7;
- if (ener > vbr->last_energy*1.8)
- qual += .7;
- if (ener > 2*vbr->average_energy)
- qual += .7;
- if (ener > 4*vbr->average_energy)
- qual += .7;
- if (ener2 > 1.6*ener1)
- qual += .7;
- if (ener2 < .6*ener1)
- qual -= .5;
+ float short_diff, long_diff;
+ short_diff = log((ener+1)/(1+vbr->last_energy));
+ long_diff = log((ener+1)/(1+vbr->average_energy));
+ /*fprintf (stderr, "%f %f\n", short_diff, long_diff);*/
- if (ener < .3*vbr->last_energy)
- qual -= .6;
+ if (long_diff<-5)
+ long_diff=-5;
+ if (long_diff>2)
+ long_diff=2;
+
+ if (long_diff>0)
+ qual += .6*long_diff;
+ if (long_diff<0)
+ qual += .5*long_diff;
+ if (short_diff>0)
+ {
+ if (short_diff>5)
+ short_diff=5;
+ qual += .5*short_diff;
+ }
+ /* Checking for energy increases */
+ if (ener2 > 1.6*ener1)
+ qual += .5;
}
vbr->last_energy = ener;
vbr->soft_pitch = .6*vbr->soft_pitch + .4*pitch_coef;
- qual += 2.5*((pitch_coef-.4) + (vbr->soft_pitch-.4));
+ qual += 2.2*((pitch_coef-.4) + (vbr->soft_pitch-.4));
if (qual < vbr->last_quality)
qual = .5*qual + .5*vbr->last_quality;
@@ -197,7 +206,7 @@
if (qual>10)
qual=10;
- if (vbr->consec_noise>=1)
+ if (vbr->consec_noise>=2)
qual-=1.3;
if (vbr->consec_noise>=5)
qual-=1.3;
diff --git a/libspeex/vbr.h b/libspeex/vbr.h
index a8ca682..51c834e 100644
--- a/libspeex/vbr.h
+++ b/libspeex/vbr.h
@@ -39,6 +39,7 @@
#define VBR_MEMORY_SIZE 5
extern float vbr_nb_thresh[8][11];
+extern float vbr_hb_thresh[5][11];
typedef struct VBRState {
float energy_alpha;