ref: b7c7653941f7add1dbf58360640b0da7ee6e6cf4
parent: fb19ba9126988e123dbde1b2384b87bd1165fc55
author: Jean-Marc Valin <[email protected]>
date: Mon Aug 15 13:30:00 EDT 2016
Speeding up PVQ search by allocating even more pulses in the projection.
--- a/celt/vq.c
+++ b/celt/vq.c
@@ -210,7 +210,8 @@
while (++j<N);
sum = QCONST16(1.f,14);
}
- rcp = EXTRACT16(MULT16_32_Q16(K-1, celt_rcp(sum)));
+ /* Using K+e with e < 1 guarantees we cannot get more than K pulses. */
+ rcp = EXTRACT16(MULT16_32_Q16(K+0.8, celt_rcp(sum)));
j=0; do {
#ifdef FIXED_POINT
/* It's really important to round *towards zero* here */
@@ -225,7 +226,7 @@
pulsesLeft -= iy[j];
} while (++j<N);
}
- celt_assert2(pulsesLeft>=1, "Allocated too many pulses in the quick pass");
+ celt_assert2(pulsesLeft>=0, "Allocated too many pulses in the quick pass");
/* This should never happen, but just in case it does (e.g. on silence)
we fill the first bin with pulses. */
--- a/celt/x86/vq_sse2.c
+++ b/celt/x86/vq_sse2.c
@@ -104,7 +104,8 @@
while (++j<N);
sums = _mm_set_ps1(1.f);
}
- rcp4 = _mm_mul_ps(_mm_set_ps1((float)(K-1)), _mm_rcp_ps(sums));
+ /* Using K+e with e < 1 guarantees we cannot get more than K pulses. */
+ rcp4 = _mm_mul_ps(_mm_set_ps1((float)(K+.8)), _mm_rcp_ps(sums));
xy4 = yy4 = _mm_setzero_ps();
pulses_sum = _mm_setzero_si128();
for (j=0;j<N;j+=4)
@@ -134,7 +135,7 @@
}
X[N] = X[N+1] = X[N+2] = -100;
y[N] = y[N+1] = y[N+2] = 100;
- celt_assert2(pulsesLeft>=1, "Allocated too many pulses in the quick pass");
+ celt_assert2(pulsesLeft>=0, "Allocated too many pulses in the quick pass");
/* This should never happen, but just in case it does (e.g. on silence)
we fill the first bin with pulses. */