ref: 76674feae22db03848a40446beb2fcec70d2180d
parent: e806d6a74129c70d2849cf621968f905149c21e1
author: Jean-Marc Valin <[email protected]>
date: Tue Aug 9 19:22:27 EDT 2016
SSE2 implementation of the PVQ search We used the SSE reciprocal square root instruction to vectorize the serch rather than compare one at a time with multiplies. Speeds up the entire encoder by 8-10%.
--- a/celt/bands.c
+++ b/celt/bands.c
@@ -1036,7 +1036,7 @@
/* Finally do the actual quantization */
if (encode)
{
- cm = alg_quant(X, N, K, spread, B, ec, gain, ctx->resynth);
+ cm = alg_quant(X, N, K, spread, B, ec, gain, ctx->resynth, ctx->arch);
} else {
cm = alg_unquant(X, N, K, spread, B, ec, gain);
}
--- a/celt/tests/test_unit_mathops.c
+++ b/celt/tests/test_unit_mathops.c
@@ -57,6 +57,7 @@
# endif
# if defined(OPUS_X86_MAY_HAVE_SSE2)
# include "x86/pitch_sse2.c"
+# include "x86/vq_sse2.c"
# endif
# if defined(OPUS_X86_MAY_HAVE_SSE4_1)
# include "x86/pitch_sse4_1.c"
--- a/celt/tests/test_unit_rotation.c
+++ b/celt/tests/test_unit_rotation.c
@@ -55,6 +55,7 @@
# endif
# if defined(OPUS_X86_MAY_HAVE_SSE2)
# include "x86/pitch_sse2.c"
+# include "x86/vq_sse2.c"
# endif
# if defined(OPUS_X86_MAY_HAVE_SSE4_1)
# include "x86/pitch_sse4_1.c"
--- a/celt/vq.c
+++ b/celt/vq.c
@@ -158,11 +158,9 @@
return collapse_mask;
}
-unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B, ec_enc *enc,
- opus_val16 gain, int resynth)
+opus_val16 op_pvq_search_c(celt_norm *X, int *iy, int K, int N, int arch)
{
VARDECL(celt_norm, y);
- VARDECL(int, iy);
VARDECL(int, signx);
int i, j;
int pulsesLeft;
@@ -169,18 +167,12 @@
opus_val32 sum;
opus_val32 xy;
opus_val16 yy;
- unsigned collapse_mask;
SAVE_STACK;
- celt_assert2(K>0, "alg_quant() needs at least one pulse");
- celt_assert2(N>1, "alg_quant() needs at least two dimensions");
-
+ (void)arch;
ALLOC(y, N, celt_norm);
- ALLOC(iy, N, int);
ALLOC(signx, N, int);
- exp_rotation(X, N, 1, B, K, spread);
-
/* Get rid of the sign */
sum = 0;
j=0; do {
@@ -322,6 +314,28 @@
but has the same performance otherwise. */
iy[j] = (iy[j]^-signx[j]) + signx[j];
} while (++j<N);
+ RESTORE_STACK;
+ return yy;
+}
+
+unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B, ec_enc *enc,
+ opus_val16 gain, int resynth, int arch)
+{
+ VARDECL(int, iy);
+ opus_val16 yy;
+ unsigned collapse_mask;
+ SAVE_STACK;
+
+ celt_assert2(K>0, "alg_quant() needs at least one pulse");
+ celt_assert2(N>1, "alg_quant() needs at least two dimensions");
+
+ /* Covers vectorization by up to 4. */
+ ALLOC(iy, N+3, int);
+
+ exp_rotation(X, N, 1, B, K, spread);
+
+ yy = op_pvq_search(X, iy, K, N, arch);
+
encode_pulses(iy, N, K, enc);
if (resynth)
--- a/celt/vq.h
+++ b/celt/vq.h
@@ -37,11 +37,21 @@
#include "entdec.h"
#include "modes.h"
+#if (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(FIXED_POINT))
+#include "x86/vq_sse.h"
+#endif
+
#if defined(MIPSr1_ASM)
#include "mips/vq_mipsr1.h"
#endif
+opus_val16 op_pvq_search_c(celt_norm *X, int *iy, int K, int N, int arch);
+#if !defined(OVERRIDE_OP_PVQ_SEARCH)
+#define op_pvq_search(x, iy, K, N, arch) \
+ (op_pvq_search_c(x, iy, K, N, arch))
+#endif
+
/** Algebraic pulse-vector quantiser. The signal x is replaced by the sum of
* the pitch and a combination of pulses such that its norm is still equal
* to 1. This is the function that will typically require the most CPU.
@@ -52,7 +62,7 @@
* @ret A mask indicating which blocks in the band received pulses
*/
unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B, ec_enc *enc,
- opus_val16 gain, int resynth);
+ opus_val16 gain, int resynth, int arch);
/** Algebraic pulse decoder
* @param X Decoded normalised spectrum (returned)
--- /dev/null
+++ b/celt/x86/vq_sse.h
@@ -1,0 +1,50 @@
+/* Copyright (c) 2016 Jean-Marc Valin */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef VQ_SSE_H
+#define VQ_SSE_H
+
+#if defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(FIXED_POINT)
+#define OVERRIDE_OP_PVQ_SEARCH
+
+opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch);
+
+#if defined(OPUS_X86_PRESUME_SSE2)
+#define op_pvq_search(x, iy, K, N, arch) \
+ (op_pvq_search_sse2(x, iy, K, N, arch))
+
+#else
+
+extern opus_val16 (*const OP_PVQ_SEARCH_IMPL[OPUS_ARCHMASK + 1])(
+ celt_norm *_X, int *iy, int K, int N, int arch);
+
+# define op_pvq_search(X, iy, K, N, arch) \
+ ((*OP_PVQ_SEARCH_IMPL[(arch) & OPUS_ARCHMASK])(X, iy, K, N, arch))
+
+#endif
+#endif
+
+#endif
--- /dev/null
+++ b/celt/x86/vq_sse2.c
@@ -1,0 +1,217 @@
+/* Copyright (c) 2007-2008 CSIRO
+ Copyright (c) 2007-2009 Xiph.Org Foundation
+ Copyright (c) 2007-2016 Jean-Marc Valin */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include "celt_lpc.h"
+#include "stack_alloc.h"
+#include "mathops.h"
+#include "vq.h"
+#include "x86cpu.h"
+
+
+#ifndef FIXED_POINT
+
+opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch)
+{
+ int i, j;
+ int pulsesLeft;
+ float xy, yy;
+ VARDECL(celt_norm, y);
+ VARDECL(celt_norm, X);
+ VARDECL(float, signy);
+ __m128 signmask;
+ __m128 sums;
+ __m128i fours;
+ SAVE_STACK;
+
+ (void)arch;
+ /* All bits set to zero, except for the sign bit. */
+ signmask = _mm_set_ps1(-0.f);
+ fours = _mm_set_epi32(4, 4, 4, 4);
+ ALLOC(y, N+3, celt_norm);
+ ALLOC(X, N+3, celt_norm);
+ ALLOC(signy, N+3, float);
+
+ OPUS_COPY(X, _X, N);
+ X[N] = X[N+1] = X[N+2] = 0;
+ sums = _mm_setzero_ps();
+ for (j=0;j<N;j+=4)
+ {
+ __m128 x4, s4;
+ x4 = _mm_loadu_ps(&X[j]);
+ s4 = _mm_cmplt_ps(x4, _mm_setzero_ps());
+ /* Get rid of the sign */
+ x4 = _mm_andnot_ps(signmask, x4);
+ sums = _mm_add_ps(sums, x4);
+ /* Clear y and iy in case we don't do the projection. */
+ _mm_storeu_ps(&y[j], _mm_setzero_ps());
+ _mm_storeu_si128((__m128i*)&iy[j], _mm_setzero_si128());
+ _mm_storeu_ps(&X[j], x4);
+ _mm_storeu_ps(&signy[j], s4);
+ }
+ sums = _mm_add_ps(sums, _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(1, 0, 3, 2)));
+ sums = _mm_add_ps(sums, _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(2, 3, 0, 1)));
+
+ xy = yy = 0;
+
+ pulsesLeft = K;
+
+ /* Do a pre-search by projecting on the pyramid */
+ if (K > (N>>1))
+ {
+ __m128i pulses_sum;
+ __m128 yy4, xy4;
+ __m128 rcp4;
+ opus_val32 sum = _mm_cvtss_f32(sums);
+ /* If X is too small, just replace it with a pulse at 0 */
+ /* Prevents infinities and NaNs from causing too many pulses
+ to be allocated. 64 is an approximation of infinity here. */
+ if (!(sum > EPSILON && sum < 64))
+ {
+ X[0] = QCONST16(1.f,14);
+ j=1; do
+ X[j]=0;
+ while (++j<N);
+ sums = _mm_set_ps1(1.f);
+ }
+ rcp4 = _mm_mul_ps(_mm_set_ps1((float)(K-1)), _mm_rcp_ps(sums));
+ xy4 = yy4 = _mm_setzero_ps();
+ pulses_sum = _mm_setzero_si128();
+ for (j=0;j<N;j+=4)
+ {
+ __m128 rx4, x4, y4;
+ __m128i iy4;
+ x4 = _mm_loadu_ps(&X[j]);
+ rx4 = _mm_mul_ps(x4, rcp4);
+ iy4 = _mm_cvttps_epi32(rx4);
+ pulses_sum = _mm_add_epi32(pulses_sum, iy4);
+ _mm_storeu_si128((__m128i*)&iy[j], iy4);
+ y4 = _mm_cvtepi32_ps(iy4);
+ xy4 = _mm_add_ps(xy4, _mm_mul_ps(x4, y4));
+ yy4 = _mm_add_ps(yy4, _mm_mul_ps(y4, y4));
+ /* double the y[] vector so we don't have to do it in the search loop. */
+ _mm_storeu_ps(&y[j], _mm_add_ps(y4, y4));
+ }
+ pulses_sum = _mm_add_epi32(pulses_sum, _mm_shuffle_epi32(pulses_sum, _MM_SHUFFLE(1, 0, 3, 2)));
+ pulses_sum = _mm_add_epi32(pulses_sum, _mm_shuffle_epi32(pulses_sum, _MM_SHUFFLE(2, 3, 0, 1)));
+ pulsesLeft -= _mm_cvtsi128_si32(pulses_sum);
+ xy4 = _mm_add_ps(xy4, _mm_shuffle_ps(xy4, xy4, _MM_SHUFFLE(1, 0, 3, 2)));
+ xy4 = _mm_add_ps(xy4, _mm_shuffle_ps(xy4, xy4, _MM_SHUFFLE(2, 3, 0, 1)));
+ xy = _mm_cvtss_f32(xy4);
+ yy4 = _mm_add_ps(yy4, _mm_shuffle_ps(yy4, yy4, _MM_SHUFFLE(1, 0, 3, 2)));
+ yy4 = _mm_add_ps(yy4, _mm_shuffle_ps(yy4, yy4, _MM_SHUFFLE(2, 3, 0, 1)));
+ yy = _mm_cvtss_f32(yy4);
+ }
+ X[N] = X[N+1] = X[N+2] = -100;
+ y[N] = y[N+1] = y[N+2] = 100;
+ celt_assert2(pulsesLeft>=1, "Allocated too many pulses in the quick pass");
+
+ /* This should never happen, but just in case it does (e.g. on silence)
+ we fill the first bin with pulses. */
+ if (pulsesLeft > N+3)
+ {
+ opus_val16 tmp = (opus_val16)pulsesLeft;
+ yy = MAC16_16(yy, tmp, tmp);
+ yy = MAC16_16(yy, tmp, y[0]);
+ iy[0] += pulsesLeft;
+ pulsesLeft=0;
+ }
+
+ for (i=0;i<pulsesLeft;i++)
+ {
+ int best_id;
+ __m128 xy4, yy4;
+ __m128 max, max2;
+ __m128i count;
+ __m128i pos;
+ best_id = 0;
+ /* The squared magnitude term gets added anyway, so we might as well
+ add it outside the loop */
+ yy = ADD16(yy, 1);
+ xy4 = _mm_load1_ps(&xy);
+ yy4 = _mm_load1_ps(&yy);
+ max = _mm_setzero_ps();
+ pos = _mm_setzero_si128();
+ count = _mm_set_epi32(3, 2, 1, 0);
+ for (j=0;j<N;j+=4)
+ {
+ __m128 x4, y4, r4;
+ x4 = _mm_loadu_ps(&X[j]);
+ y4 = _mm_loadu_ps(&y[j]);
+ x4 = _mm_add_ps(x4, xy4);
+ y4 = _mm_add_ps(y4, yy4);
+ y4 = _mm_rsqrt_ps(y4);
+ r4 = _mm_mul_ps(x4, y4);
+ /* Update the index of the max. */
+ pos = _mm_max_epi16(pos, _mm_and_si128(count, _mm_castps_si128(_mm_cmpgt_ps(r4, max))));
+ /* Update the max. */
+ max = _mm_max_ps(max, r4);
+ /* Update the indices (+4) */
+ count = _mm_add_epi32(count, fours);
+ }
+ /* Horizontal max */
+ max2 = _mm_max_ps(max, _mm_shuffle_ps(max, max, _MM_SHUFFLE(1, 0, 3, 2)));
+ max2 = _mm_max_ps(max2, _mm_shuffle_ps(max2, max2, _MM_SHUFFLE(2, 3, 0, 1)));
+ /* Now that max2 contains the max at all positions, look at which value(s) of the
+ partial max is equal to the global max. */
+ pos = _mm_and_si128(pos, _mm_castps_si128(_mm_cmpeq_ps(max, max2)));
+ pos = _mm_max_epi16(pos, _mm_unpackhi_epi64(pos, pos));
+ pos = _mm_max_epi16(pos, _mm_shufflelo_epi16(pos, _MM_SHUFFLE(1, 0, 3, 2)));
+ best_id = _mm_cvtsi128_si32(pos);
+
+ /* Updating the sums of the new pulse(s) */
+ xy = ADD32(xy, EXTEND32(X[best_id]));
+ /* We're multiplying y[j] by two so we don't have to do it here */
+ yy = ADD16(yy, y[best_id]);
+
+ /* Only now that we've made the final choice, update y/iy */
+ /* Multiplying y[j] by 2 so we don't have to do it everywhere else */
+ y[best_id] += 2;
+ iy[best_id]++;
+ }
+
+ /* Put the original sign back */
+ for (j=0;j<N;j+=4)
+ {
+ __m128i y4;
+ __m128i s4;
+ y4 = _mm_loadu_si128((__m128i*)&iy[j]);
+ s4 = _mm_castps_si128(_mm_loadu_ps(&signy[j]));
+ y4 = _mm_xor_si128(_mm_add_epi32(y4, s4), s4);
+ _mm_storeu_si128((__m128i*)&iy[j], y4);
+ }
+ RESTORE_STACK;
+ return yy;
+}
+
+#endif
--- a/celt/x86/x86_celt_map.c
+++ b/celt/x86/x86_celt_map.c
@@ -33,6 +33,7 @@
#include "celt_lpc.h"
#include "pitch.h"
#include "pitch_sse.h"
+#include "vq.h"
#if defined(OPUS_HAVE_RTCD)
@@ -149,6 +150,18 @@
};
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)
+opus_val16 (*const OP_PVQ_SEARCH_IMPL[OPUS_ARCHMASK + 1])(
+ celt_norm *_X, int *iy, int K, int N, int arch
+) = {
+ op_pvq_search_c, /* non-sse */
+ op_pvq_search_c,
+ MAY_HAVE_SSE2(op_pvq_search),
+ MAY_HAVE_SSE2(op_pvq_search),
+ MAY_HAVE_SSE2(op_pvq_search)
+};
#endif
#endif
--- a/celt_headers.mk
+++ b/celt_headers.mk
@@ -49,4 +49,5 @@
celt/mips/pitch_mipsr1.h \
celt/mips/vq_mipsr1.h \
celt/x86/pitch_sse.h \
+celt/x86/vq_sse.h \
celt/x86/x86cpu.h
--- a/celt_sources.mk
+++ b/celt_sources.mk
@@ -21,7 +21,7 @@
celt/x86/x86_celt_map.c \
celt/x86/pitch_sse.c
-CELT_SOURCES_SSE2 = celt/x86/pitch_sse2.c
+CELT_SOURCES_SSE2 = celt/x86/pitch_sse2.c celt/x86/vq_sse2.c
CELT_SOURCES_SSE4_1 = celt/x86/celt_lpc_sse.c \
celt/x86/pitch_sse4_1.c