shithub: opus

--- a/Makefile.am

+++ b/Makefile.am

@@ -23,6 +23,9 @@

 endif

 else

 SILK_SOURCES += $(SILK_SOURCES_FLOAT)

+if HAVE_SSE4_1

+SILK_SOURCES += $(SILK_SOURCES_SSE4_1)

+endif

 endif

 if DISABLE_FLOAT_API

--- a/celt/bands.c

+++ b/celt/bands.c

@@ -398,7 +398,7 @@

-static void stereo_merge(celt_norm * OPUS_RESTRICT X, celt_norm * OPUS_RESTRICT Y, opus_val16 mid, int N)

+static void stereo_merge(celt_norm * OPUS_RESTRICT X, celt_norm * OPUS_RESTRICT Y, opus_val16 mid, int N, int arch)

    int j;

    opus_val32 xp=0, side=0;

@@ -410,7 +410,7 @@

    opus_val32 t, lgain, rgain;

    /* Compute the norm of X+Y and X-Y as |X|^2 + |Y|^2 +/- sum(xy) */

-   dual_inner_prod(Y, X, Y, N, &xp, &side);

+   dual_inner_prod(Y, X, Y, N, &xp, &side, arch);

    /* Compensating for the mid normalization */

    xp = MULT16_32_Q15(mid, xp);

    /* mid and side are in Q15, not Q14 like X and Y */

@@ -1348,7 +1348,7 @@

    if (resynth)

       if (N!=2)

-         stereo_merge(X, Y, mid, N);

+         stereo_merge(X, Y, mid, N, ctx->arch);

       if (inv)

          int j;

--- a/celt/celt.c

+++ b/celt/celt.c

@@ -89,10 +89,12 @@

    return ret;

-#ifndef OVERRIDE_COMB_FILTER_CONST

 /* This version should be faster on ARM */

 #ifdef OPUS_ARM_ASM

-static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,

+#ifndef NON_STATIC_COMB_FILTER_CONST_C

+static

+#endif

+void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N,

       opus_val16 g10, opus_val16 g11, opus_val16 g12)

    opus_val32 x0, x1, x2, x3, x4;

@@ -147,7 +149,10 @@

 #endif

 #else

-static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,

+#ifndef NON_STATIC_COMB_FILTER_CONST_C

+static

+#endif

+void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N,

       opus_val16 g10, opus_val16 g11, opus_val16 g12)

    opus_val32 x0, x1, x2, x3, x4;

@@ -171,12 +176,11 @@

 #endif

-#endif

 #ifndef OVERRIDE_comb_filter

 void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,

       opus_val16 g0, opus_val16 g1, int tapset0, int tapset1,

-      const opus_val16 *window, int overlap)

+      const opus_val16 *window, int overlap, int arch)

    int i;

    /* printf ("%d %d %f %f\n", T0, T1, g0, g1); */

@@ -234,7 +238,7 @@

    /* Compute the part with the constant filter. */

-   comb_filter_const(y+i, x+i, T1, N-i, g10, g11, g12);

+   comb_filter_const(y+i, x+i, T1, N-i, g10, g11, g12, arch);

 #endif /* OVERRIDE_comb_filter */

--- a/celt/celt.h

+++ b/celt/celt.h

@@ -201,7 +201,17 @@

 void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,

       opus_val16 g0, opus_val16 g1, int tapset0, int tapset1,

-      const opus_val16 *window, int overlap);

+      const opus_val16 *window, int overlap, int arch);

+#ifdef NON_STATIC_COMB_FILTER_CONST_C /* celt.c drops `static` from comb_filter_const_c when this is defined, so a prototype is exported here */

+void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N,

+                         opus_val16 g10, opus_val16 g11, opus_val16 g12);

+#endif

+#ifndef OVERRIDE_COMB_FILTER_CONST /* default when no arch header overrides it: evaluate and discard arch, call the C version */

+# define comb_filter_const(y, x, T, N, g10, g11, g12, arch)		\

+    ((void)(arch),comb_filter_const_c(y, x, T, N, g10, g11, g12))

+#endif

 void init_caps(const CELTMode *m,int *cap,int LM,int C);

--- a/celt/celt_decoder.c

+++ b/celt/celt_decoder.c

@@ -698,7 +698,7 @@

          comb_filter(etmp, buf+DECODE_BUFFER_SIZE,

               st->postfilter_period, st->postfilter_period, overlap,

               -st->postfilter_gain, -st->postfilter_gain,

-              st->postfilter_tapset, st->postfilter_tapset, NULL, 0);

+              st->postfilter_tapset, st->postfilter_tapset, NULL, 0, st->arch);

          /* Simulate TDAC on the concealed audio so that it blends with the

             MDCT of the next frame. */

@@ -1009,11 +1009,11 @@

       st->postfilter_period_old=IMAX(st->postfilter_period_old, COMBFILTER_MINPERIOD);

       comb_filter(out_syn[c], out_syn[c], st->postfilter_period_old, st->postfilter_period, mode->shortMdctSize,

             st->postfilter_gain_old, st->postfilter_gain, st->postfilter_tapset_old, st->postfilter_tapset,

-            mode->window, overlap);

+            mode->window, overlap, st->arch);

       if (LM!=0)

          comb_filter(out_syn[c]+mode->shortMdctSize, out_syn[c]+mode->shortMdctSize, st->postfilter_period, postfilter_pitch, N-mode->shortMdctSize,

                st->postfilter_gain, postfilter_gain, st->postfilter_tapset, postfilter_tapset,

-               mode->window, overlap);

+               mode->window, overlap, st->arch);

    } while (++c<CC);

    st->postfilter_period_old = st->postfilter_period;

--- a/celt/celt_encoder.c

+++ b/celt/celt_encoder.c

@@ -1163,11 +1163,11 @@

       if (offset)

          comb_filter(in+c*(N+overlap)+overlap, pre[c]+COMBFILTER_MAXPERIOD,

                st->prefilter_period, st->prefilter_period, offset, -st->prefilter_gain, -st->prefilter_gain,

-               st->prefilter_tapset, st->prefilter_tapset, NULL, 0);

+               st->prefilter_tapset, st->prefilter_tapset, NULL, 0, st->arch);

       comb_filter(in+c*(N+overlap)+overlap+offset, pre[c]+COMBFILTER_MAXPERIOD+offset,

             st->prefilter_period, pitch_index, N-offset, -st->prefilter_gain, -gain1,

-            st->prefilter_tapset, prefilter_tapset, mode->window, overlap);

+            st->prefilter_tapset, prefilter_tapset, mode->window, overlap, st->arch);

       OPUS_COPY(st->in_mem+c*(overlap), in+c*(N+overlap)+N, overlap);

       if (N>COMBFILTER_MAXPERIOD)

--- a/celt/celt_lpc.h

+++ b/celt/celt_lpc.h

@@ -48,7 +48,7 @@

          opus_val16 *mem,

          int arch);

-#if !defined(OPUS_X86_MAY_HAVE_SSE4_1)

+#if !defined(OVERRIDE_CELT_FIR)

 #define celt_fir(x, num, y, N, ord, mem, arch) \

     (celt_fir_c(x, num, y, N, ord, mem, arch))

 #endif

--- a/celt/cpu_support.h

+++ b/celt/cpu_support.h

@@ -43,14 +43,16 @@

*/

 #define OPUS_ARCHMASK 3

-#elif (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2) || (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)))

+#elif (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \

+  (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \

+  (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1))

 #include "x86/x86cpu.h"

-/* We currently support 3 x86 variants:

+/* We currently support 4 x86 variants:

  * arch[0] -> non-sse

- * arch[1] -> sse2

- * arch[2] -> sse4.1

- * arch[3] -> NULL

+ * arch[1] -> sse

+ * arch[2] -> sse2

+ * arch[3] -> sse4.1

*/

 #define OPUS_ARCHMASK 3

 int opus_select_arch(void);

--- a/celt/mips/celt_mipsr1.h

+++ b/celt/mips/celt_mipsr1.h

@@ -56,7 +56,7 @@

 #define OVERRIDE_comb_filter

 void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,

       opus_val16 g0, opus_val16 g1, int tapset0, int tapset1,

-      const opus_val16 *window, int overlap)

+      const opus_val16 *window, int overlap, int arch)

    int i;

    opus_val32 x0, x1, x2, x3, x4;

--- a/celt/pitch.c

+++ b/celt/pitch.c

@@ -439,7 +439,7 @@

    T = T0 = *T0_;

    ALLOC(yy_lookup, maxperiod+1, opus_val32);

-   dual_inner_prod(x, x, x-T0, N, &xx, &xy);

+   dual_inner_prod(x, x, x-T0, N, &xx, &xy, arch);

    yy_lookup[0] = xx;

    yy=xx;

    for (i=1;i<=maxperiod;i++)

@@ -483,7 +483,7 @@

          T1b = celt_udiv(2*second_check[k]*T0+k, 2*k);

-      dual_inner_prod(x, &x[-T1], &x[-T1b], N, &xy, &xy2);

+      dual_inner_prod(x, &x[-T1], &x[-T1b], N, &xy, &xy2, arch);

       xy += xy2;

       yy = yy_lookup[T1] + yy_lookup[T1b];

 #ifdef FIXED_POINT

--- a/celt/pitch.h

+++ b/celt/pitch.h

@@ -37,8 +37,8 @@

 #include "modes.h"

 #include "cpu_support.h"

-#if defined(__SSE__) && !defined(FIXED_POINT) \

- || defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)

+#if (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)) \

+  || ((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT))

 #include "x86/pitch_sse.h"

 #endif

@@ -135,8 +135,7 @@

 #endif /* OVERRIDE_XCORR_KERNEL */

-#ifndef OVERRIDE_DUAL_INNER_PROD

-static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,

+static OPUS_INLINE void dual_inner_prod_c(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,

       int N, opus_val32 *xy1, opus_val32 *xy2)

    int i;

@@ -150,6 +149,10 @@

    *xy1 = xy01;

    *xy2 = xy02;

+#ifndef OVERRIDE_DUAL_INNER_PROD /* default when no arch header overrides it: evaluate and discard arch, call dual_inner_prod_c */

+# define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) \

+    ((void)(arch),dual_inner_prod_c(x, y01, y02, N, xy1, xy2))

 #endif

 /*We make sure a C version is always available for cases where the overhead of

@@ -168,6 +171,12 @@

 # define celt_inner_prod(x, y, N, arch) \

     ((void)(arch),celt_inner_prod_c(x, y, N))

 #endif

+#ifdef NON_STATIC_COMB_FILTER_CONST_C

+void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N,

+     opus_val16 g10, opus_val16 g11, opus_val16 g12);

+#endif

 #ifdef FIXED_POINT

 opus_val32

--- a/celt/x86/celt_lpc_sse.c

+++ b/celt/x86/celt_lpc_sse.c

@@ -38,6 +38,8 @@

 #include "pitch.h"

 #include "x86cpu.h"

+#if defined(FIXED_POINT)

 void celt_fir_sse4_1(const opus_val16 *_x,

          const opus_val16 *num,

          opus_val16 *_y,

@@ -126,3 +128,5 @@

 #endif

    RESTORE_STACK;

+#endif

--- a/celt/x86/celt_lpc_sse.h

+++ b/celt/x86/celt_lpc_sse.h

@@ -32,7 +32,9 @@

 #include "config.h"

 #endif

-#if defined(OPUS_X86_MAY_HAVE_SSE4_1)

+#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)

+#define OVERRIDE_CELT_FIR

 void celt_fir_sse4_1(

          const opus_val16 *x,

          const opus_val16 *num,

@@ -42,6 +44,12 @@

          opus_val16 *mem,

          int arch);

+#if defined(OPUS_X86_PRESUME_SSE4_1)

+#define celt_fir(x, num, y, N, ord, mem, arch) \

+    ((void)(arch), celt_fir_sse4_1(x, num, y, N, ord, mem, arch)) /* SSE4.1 presumed at build time: call it directly, no table lookup; arch is parenthesized so any argument expression is cast as a whole */

+#else

 extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(

          const opus_val16 *x,

          const opus_val16 *num,

@@ -55,4 +63,6 @@

     ((*CELT_FIR_IMPL[(arch) & OPUS_ARCHMASK])(x, num, y, N, ord, mem, arch))

 #endif

+#endif

 #endif

--- a/celt/x86/pitch_sse.c

+++ b/celt/x86/pitch_sse.c

@@ -35,3 +35,151 @@

 #include "mathops.h"

 #include "pitch.h"

+#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)

+#include <xmmintrin.h>

+#include "arch.h"

+void xcorr_kernel_sse(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len)

+{ /* SSE cross-correlation kernel for four consecutive lags: on return,
+   * sum[k] has been increased by the sum over j in [0, len) of
+   * x[j]*y[j+k], for k = 0..3. The incoming contents of sum[] are the
+   * starting totals. All loads/stores are unaligned. The 4-wide loads
+   * from y touch elements up to y[len+2]. */

+   int j;

+   __m128 xsum1, xsum2;

+   xsum1 = _mm_loadu_ps(sum); /* start from the caller's running totals */

+   xsum2 = _mm_setzero_ps(); /* second accumulator, folded into xsum1 at the end */

+   for (j = 0; j < len-3; j += 4) /* four samples of x per iteration */

+   {

+      __m128 x0 = _mm_loadu_ps(x+j); /* x[j..j+3] */

+      __m128 yj = _mm_loadu_ps(y+j); /* y[j..j+3] */

+      __m128 y3 = _mm_loadu_ps(y+j+3); /* y[j+3..j+6] */

+      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),yj)); /* x[j] * y[j..j+3] */

+      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55),

+                                          _mm_shuffle_ps(yj,y3,0x49))); /* x[j+1] * y[j+1..j+4] */

+      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa),

+                                          _mm_shuffle_ps(yj,y3,0x9e))); /* x[j+2] * y[j+2..j+5] */

+      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3)); /* x[j+3] * y[j+3..j+6] */

+   }

+   if (j < len) /* at most three leftover samples, handled one at a time */

+   {

+      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));

+      if (++j < len)

+      {

+         xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));

+         if (++j < len)

+         {

+            xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));

+         }

+      }

+   }

+   _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2)); /* combine both accumulators and write back */

+}

+void dual_inner_prod_sse(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,

+      int N, opus_val32 *xy1, opus_val32 *xy2)

+{ /* Computes two inner products that share the same left operand:
+   * *xy1 = sum over i<N of x[i]*y01[i] and *xy2 = sum over i<N of
+   * x[i]*y02[i]. The bulk is done four elements at a time with
+   * unaligned SSE loads; the remaining N%4 elements are added with the
+   * scalar MAC16_16 macro (defined elsewhere; presumably a plain
+   * multiply-accumulate in this float-only build -- confirm). */

+   int i;

+   __m128 xsum1, xsum2;

+   xsum1 = _mm_setzero_ps();

+   xsum2 = _mm_setzero_ps();

+   for (i=0;i<N-3;i+=4)

+   {

+      __m128 xi = _mm_loadu_ps(x+i);

+      __m128 y1i = _mm_loadu_ps(y01+i);

+      __m128 y2i = _mm_loadu_ps(y02+i);

+      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i));

+      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i));

+   }

+   /* Horizontal sum */

+   xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1)); /* lanes 0,1 += lanes 2,3 */

+   xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55)); /* lane 0 += lane 1 */

+   _mm_store_ss(xy1, xsum1);

+   xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2)); /* same reduction for the second product */

+   xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55));

+   _mm_store_ss(xy2, xsum2);

+   for (;i<N;i++) /* scalar tail, accumulating directly into the outputs */

+   {

+      *xy1 = MAC16_16(*xy1, x[i], y01[i]);

+      *xy2 = MAC16_16(*xy2, x[i], y02[i]);

+   }

+}

+opus_val32 celt_inner_prod_sse(const opus_val16 *x, const opus_val16 *y,

+      int N)

+{ /* Returns the inner product sum over i<N of x[i]*y[i]. Four
+   * elements per iteration are accumulated in one SSE register using
+   * unaligned loads; the register is then reduced to a scalar and the
+   * remaining N%4 elements are added with the scalar MAC16_16 macro
+   * (defined elsewhere). */

+   int i;

+   float xy;

+   __m128 sum;

+   sum = _mm_setzero_ps();

+   /* FIXME: We should probably go 8-way and use 2 sums. */

+   for (i=0;i<N-3;i+=4)

+   {

+      __m128 xi = _mm_loadu_ps(x+i);

+      __m128 yi = _mm_loadu_ps(y+i);

+      sum = _mm_add_ps(sum,_mm_mul_ps(xi, yi));

+   }

+   /* Horizontal sum */

+   sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum)); /* lanes 0,1 += lanes 2,3 */

+   sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55)); /* lane 0 += lane 1 */

+   _mm_store_ss(&xy, sum);

+   for (;i<N;i++) /* scalar tail */

+   {

+      xy = MAC16_16(xy, x[i], y[i]);

+   }

+   return xy;

+}

+void comb_filter_const_sse(opus_val32 *y, opus_val32 *x, int T, int N,

+      opus_val16 g10, opus_val16 g11, opus_val16 g12)

+{ /* SSE comb filter with constant gains, four outputs per iteration:
+   *   y[i] = x[i] + g10*x[i-T]
+   *               + g11*(x[i-T+1] + x[i-T-1])
+   *               + g12*(x[i-T+2] + x[i-T-2])
+   * With the partial-sum ordering enabled below, results are not
+   * bit-exact with the non-SSE version (see the #if 0 switch).
+   * The scalar loop for the last N%4 samples is compiled only when
+   * CUSTOM_MODES is defined.
+   * NOTE(review): without CUSTOM_MODES this presumably relies on N
+   * being a multiple of 4 -- confirm against the callers. */

+   int i;

+   __m128 x0v;

+   __m128 g10v, g11v, g12v;

+   g10v = _mm_load1_ps(&g10); /* broadcast each gain to all four lanes */

+   g11v = _mm_load1_ps(&g11);

+   g12v = _mm_load1_ps(&g12);

+   x0v = _mm_loadu_ps(&x[-T-2]); /* x[-T-2..-T+1]; carried across iterations via x0v=x4v */

+   for (i=0;i<N-3;i+=4)

+   {

+      __m128 yi, yi2, x1v, x2v, x3v, x4v;

+      const opus_val32 *xp = &x[i-T-2]; /* xp[k] is x[i-T-2+k]; x0v holds xp[0..3] */

+      yi = _mm_loadu_ps(x+i);

+      x4v = _mm_loadu_ps(xp+4); /* xp[4..7] */

+#if 0

+      /* Slower version with all loads */

+      x1v = _mm_loadu_ps(xp+1);

+      x2v = _mm_loadu_ps(xp+2);

+      x3v = _mm_loadu_ps(xp+3);

+#else

+      x2v = _mm_shuffle_ps(x0v, x4v, 0x4e); /* xp[2..5], built from x0v and x4v instead of a load */

+      x1v = _mm_shuffle_ps(x0v, x2v, 0x99); /* xp[1..4] */

+      x3v = _mm_shuffle_ps(x2v, x4v, 0x99); /* xp[3..6] */

+#endif

+      yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v)); /* centre tap: g10 * x[i-T] */

+#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */

+      yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)));

+      yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));

+#else

+      /* Use partial sums */

+      yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)),

+                       _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));

+      yi = _mm_add_ps(yi, yi2);

+#endif

+      x0v=x4v; /* next iteration's xp[0..3] is this iteration's xp[4..7] */

+      _mm_storeu_ps(y+i, yi);

+   }

+#ifdef CUSTOM_MODES

+   for (;i<N;i++) /* scalar tail for the last N%4 samples */

+   {

+      y[i] = x[i]

+               + MULT16_32_Q15(g10,x[i-T])

+               + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1]))

+               + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2]));

+   }

+#endif

+}

+#endif

--- a/celt/x86/pitch_sse.h

+++ b/celt/x86/pitch_sse.h

@@ -37,17 +37,37 @@

 #include "config.h"

 #endif

-#if defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)

-#if defined(OPUS_X86_MAY_HAVE_SSE4_1)

+#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)

 void xcorr_kernel_sse4_1(

                     const opus_int16 *x,

                     const opus_int16 *y,

                     opus_val32       sum[4],

                     int              len);

+#endif

+#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)

+void xcorr_kernel_sse(

+                    const opus_val16 *x,

+                    const opus_val16 *y,

+                    opus_val32       sum[4],

+                    int              len);

+#endif

+#if defined(OPUS_X86_PRESUME_SSE4_1) && defined(FIXED_POINT) /* ISA presumed at build time: bind xcorr_kernel directly, no table lookup */

+#define OVERRIDE_XCORR_KERNEL

+#define xcorr_kernel(x, y, sum, len, arch) \

+    ((void)(arch), xcorr_kernel_sse4_1(x, y, sum, len))

+#elif defined(OPUS_X86_PRESUME_SSE) && !defined(FIXED_POINT) /* float build with SSE presumed */

+#define OVERRIDE_XCORR_KERNEL

+#define xcorr_kernel(x, y, sum, len, arch) \

+    ((void)(arch), xcorr_kernel_sse(x, y, sum, len))

+#elif (defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)) || (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT))

 extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(

-                    const opus_int16 *x,

-                    const opus_int16 *y,

+                    const opus_val16 *x,

+                    const opus_val16 *y,

                     opus_val32       sum[4],

                     int              len);

@@ -55,6 +75,9 @@

 #define xcorr_kernel(x, y, sum, len, arch) \

     ((*XCORR_KERNEL_IMPL[(arch) & OPUS_ARCHMASK])(x, y, sum, len))

+#endif

+#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)

 opus_val32 celt_inner_prod_sse4_1(

     const opus_int16 *x,

     const opus_int16 *y,

@@ -61,7 +84,7 @@

     int               N);

 #endif

-#if defined(OPUS_X86_MAY_HAVE_SSE2)

+#if defined(OPUS_X86_MAY_HAVE_SSE2) && defined(FIXED_POINT)

 opus_val32 celt_inner_prod_sse2(

     const opus_int16 *x,

     const opus_int16 *y,

@@ -68,168 +91,102 @@

     int               N);

 #endif

+#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT) /* float SSE inner product: guarded by SSE (not SSE2), matching its definition and the macros/tables that reference it */

+opus_val32 celt_inner_prod_sse(

+    const opus_val16 *x,

+    const opus_val16 *y,

+    int               N);

+#endif

+#if defined(OPUS_X86_PRESUME_SSE4_1) && defined(FIXED_POINT) /* ISA presumed at build time: bind celt_inner_prod directly, no table lookup */

+#define OVERRIDE_CELT_INNER_PROD

+#define celt_inner_prod(x, y, N, arch) \

+	((void)(arch), celt_inner_prod_sse4_1(x, y, N))

+#elif defined(OPUS_X86_PRESUME_SSE2) && defined(FIXED_POINT) && !defined(OPUS_X86_MAY_HAVE_SSE4_1) /* SSE2 presumed and no SSE4.1 code built */

+#define OVERRIDE_CELT_INNER_PROD

+#define celt_inner_prod(x, y, N, arch) \

+	((void)(arch), celt_inner_prod_sse2(x, y, N))

+#elif defined(OPUS_X86_PRESUME_SSE) && !defined(FIXED_POINT) /* float build with SSE presumed */

+#define OVERRIDE_CELT_INNER_PROD

+#define celt_inner_prod(x, y, N, arch) \

+	((void)(arch), celt_inner_prod_sse(x, y, N))

+#elif ((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT)) || \

+	(defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT))

 extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(

-                    const opus_int16 *x,

-                    const opus_int16 *y,

+                    const opus_val16 *x,

+                    const opus_val16 *y,

                     int               N);

 #define OVERRIDE_CELT_INNER_PROD

 #define celt_inner_prod(x, y, N, arch) \

     ((*CELT_INNER_PROD_IMPL[(arch) & OPUS_ARCHMASK])(x, y, N))

-#else

-#include <xmmintrin.h>

-#include "arch.h"

+#endif

-#define OVERRIDE_XCORR_KERNEL

-static OPUS_INLINE void xcorr_kernel_sse(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len)

-{

-   int j;

-   __m128 xsum1, xsum2;

-   xsum1 = _mm_loadu_ps(sum);

-   xsum2 = _mm_setzero_ps();

+#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)

-   for (j = 0; j < len-3; j += 4)

-   {

-      __m128 x0 = _mm_loadu_ps(x+j);

-      __m128 yj = _mm_loadu_ps(y+j);

-      __m128 y3 = _mm_loadu_ps(y+j+3);

+#define OVERRIDE_DUAL_INNER_PROD

+#define OVERRIDE_COMB_FILTER_CONST

-      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),yj));

-      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55),

-                                          _mm_shuffle_ps(yj,y3,0x49)));

-      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa),

-                                          _mm_shuffle_ps(yj,y3,0x9e)));

-      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3));

-   }

-   if (j < len)

-   {

-      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));

-      if (++j < len)

-      {

-         xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));

-         if (++j < len)

-         {

-            xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));

-         }

-      }

-   }

-   _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2));

-}

+#undef dual_inner_prod

+#undef comb_filter_const

-#define xcorr_kernel(_x, _y, _z, len, arch) \

-    ((void)(arch),xcorr_kernel_sse(_x, _y, _z, len))

+void dual_inner_prod_sse(const opus_val16 *x,

+	const opus_val16 *y01,

+	const opus_val16 *y02,

+	int               N,

+	opus_val32       *xy1,

+	opus_val32       *xy2);

-#define OVERRIDE_DUAL_INNER_PROD

-static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,

-      int N, opus_val32 *xy1, opus_val32 *xy2)

-{

-   int i;

-   __m128 xsum1, xsum2;

-   xsum1 = _mm_setzero_ps();

-   xsum2 = _mm_setzero_ps();

-   for (i=0;i<N-3;i+=4)

-   {

-      __m128 xi = _mm_loadu_ps(x+i);

-      __m128 y1i = _mm_loadu_ps(y01+i);

-      __m128 y2i = _mm_loadu_ps(y02+i);

-      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i));

-      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i));

-   }

-   /* Horizontal sum */

-   xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1));

-   xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55));

-   _mm_store_ss(xy1, xsum1);

-   xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2));

-   xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55));

-   _mm_store_ss(xy2, xsum2);

-   for (;i<N;i++)

-   {

-      *xy1 = MAC16_16(*xy1, x[i], y01[i]);

-      *xy2 = MAC16_16(*xy2, x[i], y02[i]);

-   }

-}

+void comb_filter_const_sse(opus_val32 *y,

+	opus_val32 *x,

+	int         T,

+	int         N,

+	opus_val16  g10,

+	opus_val16  g11,

+	opus_val16  g12);

-#define OVERRIDE_CELT_INNER_PROD

-static OPUS_INLINE opus_val32 celt_inner_prod_sse(const opus_val16 *x, const opus_val16 *y,

-      int N)

-{

-   int i;

-   float xy;

-   __m128 sum;

-   sum = _mm_setzero_ps();

-   /* FIXME: We should probably go 8-way and use 2 sums. */

-   for (i=0;i<N-3;i+=4)

-   {

-      __m128 xi = _mm_loadu_ps(x+i);

-      __m128 yi = _mm_loadu_ps(y+i);

-      sum = _mm_add_ps(sum,_mm_mul_ps(xi, yi));

-   }

-   /* Horizontal sum */

-   sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));

-   sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));

-   _mm_store_ss(&xy, sum);

-   for (;i<N;i++)

-   {

-      xy = MAC16_16(xy, x[i], y[i]);

-   }

-   return xy;

-}

-#  define celt_inner_prod(_x, _y, len, arch) \

-    ((void)(arch),celt_inner_prod_sse(_x, _y, len))

+#if defined(OPUS_X86_PRESUME_SSE)

+# define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) \

+    ((void)(arch),dual_inner_prod_sse(x, y01, y02, N, xy1, xy2))

-#define OVERRIDE_COMB_FILTER_CONST

-static OPUS_INLINE void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,

-      opus_val16 g10, opus_val16 g11, opus_val16 g12)

-{

-   int i;

-   __m128 x0v;

-   __m128 g10v, g11v, g12v;

-   g10v = _mm_load1_ps(&g10);

-   g11v = _mm_load1_ps(&g11);

-   g12v = _mm_load1_ps(&g12);

-   x0v = _mm_loadu_ps(&x[-T-2]);

-   for (i=0;i<N-3;i+=4)

-   {

-      __m128 yi, yi2, x1v, x2v, x3v, x4v;

-      const opus_val32 *xp = &x[i-T-2];

-      yi = _mm_loadu_ps(x+i);

-      x4v = _mm_loadu_ps(xp+4);

-#if 0

-      /* Slower version with all loads */

-      x1v = _mm_loadu_ps(xp+1);

-      x2v = _mm_loadu_ps(xp+2);

-      x3v = _mm_loadu_ps(xp+3);

+# define comb_filter_const(y, x, T, N, g10, g11, g12, arch) \

+    ((void)(arch),comb_filter_const_sse(y, x, T, N, g10, g11, g12))

 #else

-      x2v = _mm_shuffle_ps(x0v, x4v, 0x4e);

-      x1v = _mm_shuffle_ps(x0v, x2v, 0x99);

-      x3v = _mm_shuffle_ps(x2v, x4v, 0x99);

-#endif

-      yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v));

-#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */

-      yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)));

-      yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));

-#else

-      /* Use partial sums */

-      yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)),

-                       _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));

-      yi = _mm_add_ps(yi, yi2);

+extern void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(

+              const opus_val16 *x,

+              const opus_val16 *y01,

+              const opus_val16 *y02,

+              int               N,

+              opus_val32       *xy1,

+              opus_val32       *xy2);

+#define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch)			\

+    ((*DUAL_INNER_PROD_IMPL[(arch) & OPUS_ARCHMASK])(x, y01, y02, N, xy1, xy2))

+extern void (*const COMB_FILTER_CONST_IMPL[OPUS_ARCHMASK + 1])(

+              opus_val32 *y,

+              opus_val32 *x,

+              int         T,

+              int         N,

+              opus_val16  g10,

+              opus_val16  g11,

+              opus_val16  g12);

+#define comb_filter_const(y, x, T, N, g10, g11, g12, arch)				\

+    ((*COMB_FILTER_CONST_IMPL[(arch) & OPUS_ARCHMASK])(y, x, T, N, g10, g11, g12))

+#define NON_STATIC_COMB_FILTER_CONST_C

 #endif

-      x0v=x4v;

-      _mm_storeu_ps(y+i, yi);

-   }

-#ifdef CUSTOM_MODES

-   for (;i<N;i++)

-   {

-      y[i] = x[i]

-               + MULT16_32_Q15(g10,x[i-T])

-               + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1]))

-               + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2]));

-   }

 #endif

-}

-#endif

 #endif

--- a/celt/x86/x86_celt_map.c

+++ b/celt/x86/x86_celt_map.c

@@ -38,6 +38,8 @@

 # if defined(FIXED_POINT)

+#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)

 void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(

          const opus_val16 *x,

          const opus_val16 *num,

@@ -49,8 +51,8 @@

 ) = {

   celt_fir_c,                /* non-sse */

   celt_fir_c,

+  celt_fir_c,

   MAY_HAVE_SSE4_1(celt_fir), /* sse4.1  */

-  NULL

};

 void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(

@@ -61,10 +63,15 @@

 ) = {

   xcorr_kernel_c,                /* non-sse */

   xcorr_kernel_c,

+  xcorr_kernel_c,

   MAY_HAVE_SSE4_1(xcorr_kernel), /* sse4.1  */

-  NULL

};

+#endif

+#if (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) ||  \

+	(!defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) /* build the table only when celt_inner_prod is dispatched at run time */

 opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(

          const opus_val16 *x,

          const opus_val16 *y,

@@ -71,14 +78,71 @@

          int              N

 ) = {

   celt_inner_prod_c,                /* non-sse */

+  celt_inner_prod_c,                /* sse: no fixed-point SSE version */

   MAY_HAVE_SSE2(celt_inner_prod),

   MAY_HAVE_SSE4_1(celt_inner_prod), /* sse4.1  */

-  NULL

};

+#endif

 # else

-#  error "Floating-point implementation is not supported by x86 RTCD yet." \

- "Reconfigure with --disable-rtcd or send patches."

-# endif

+#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE) /* float build: tables are needed only when SSE is detected at run time rather than presumed */

+void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(

+         const opus_val16 *x,

+         const opus_val16 *y,

+         opus_val32       sum[4],

+         int              len

+) = {

+  xcorr_kernel_c,                /* non-sse */

+  MAY_HAVE_SSE(xcorr_kernel), /* sse */

+  MAY_HAVE_SSE(xcorr_kernel), /* sse2: same entry as sse */

+  MAY_HAVE_SSE(xcorr_kernel), /* sse4.1: same entry as sse */

+};

+opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(

+         const opus_val16 *x,

+         const opus_val16 *y,

+         int              N

+) = {

+  celt_inner_prod_c,                /* non-sse */

+  MAY_HAVE_SSE(celt_inner_prod), /* sse */

+  MAY_HAVE_SSE(celt_inner_prod), /* sse2: same entry as sse */

+  MAY_HAVE_SSE(celt_inner_prod), /* sse4.1: same entry as sse */

+};

+void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(

+                    const opus_val16 *x,

+                    const opus_val16 *y01,

+                    const opus_val16 *y02,

+                    int               N,

+                    opus_val32       *xy1,

+                    opus_val32       *xy2

+) = {

+  dual_inner_prod_c,                /* non-sse */

+  MAY_HAVE_SSE(dual_inner_prod), /* sse */

+  MAY_HAVE_SSE(dual_inner_prod), /* sse2: same entry as sse */

+  MAY_HAVE_SSE(dual_inner_prod), /* sse4.1: same entry as sse */

+};

+void (*const COMB_FILTER_CONST_IMPL[OPUS_ARCHMASK + 1])(

+              opus_val32 *y,

+              opus_val32 *x,

+              int         T,

+              int         N,

+              opus_val16  g10,

+              opus_val16  g11,

+              opus_val16  g12

+) = {

+  comb_filter_const_c,                /* non-sse */

+  MAY_HAVE_SSE(comb_filter_const), /* sse */

+  MAY_HAVE_SSE(comb_filter_const), /* sse2: same entry as sse */

+  MAY_HAVE_SSE(comb_filter_const), /* sse4.1: same entry as sse */

+};

+#endif

+#endif

 #endif

--- a/celt/x86/x86cpu.c

+++ b/celt/x86/x86cpu.c

@@ -35,6 +35,11 @@

 #include "pitch.h"

 #include "x86cpu.h"

+#if (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \

+  (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \

+  (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1))

 #if defined(_MSC_VER)

 #include <intrin.h>

@@ -79,6 +84,7 @@

 typedef struct CPU_Feature{

     /*  SIMD: 128-bit */

+    int HW_SSE;

     int HW_SSE2;

     int HW_SSE41;

 } CPU_Feature;

@@ -93,10 +99,12 @@

     if (nIds >= 1){

         cpuid(info, 1);

+        cpu_feature->HW_SSE = (info[3] & (1 << 25)) != 0;

         cpu_feature->HW_SSE2 = (info[3] & (1 << 26)) != 0;

         cpu_feature->HW_SSE41 = (info[2] & (1 << 19)) != 0;

     else {

+        cpu_feature->HW_SSE = 0;

         cpu_feature->HW_SSE2 = 0;

         cpu_feature->HW_SSE41 = 0;

@@ -110,6 +118,12 @@

     opus_cpu_feature_check(&cpu_feature);

     arch = 0;

+    if (!cpu_feature.HW_SSE)

+    {

+       return arch;

+    }

+    arch++;

     if (!cpu_feature.HW_SSE2)

        return arch;

@@ -124,3 +138,5 @@

     return arch;

+#endif

--- a/celt/x86/x86cpu.h

+++ b/celt/x86/x86cpu.h

@@ -28,6 +28,12 @@

 #if !defined(X86CPU_H)

 # define X86CPU_H

+# if defined(OPUS_X86_MAY_HAVE_SSE) /* MAY_HAVE_SSE(name) names the _sse variant when SSE code is built, else falls back to the _c variant */

+#  define MAY_HAVE_SSE(name) name ## _sse

+# else

+#  define MAY_HAVE_SSE(name) name ## _c

+# endif

 # if defined(OPUS_X86_MAY_HAVE_SSE2)

 #  define MAY_HAVE_SSE2(name) name ## _sse2

 # else

--- a/configure.ac

+++ b/configure.ac

@@ -491,9 +491,6 @@

-      #Currently we only have intrinsic optimizations for floating point

-      AS_IF([test x"$enable_float" = x"no"],

-      [

          AS_IF([test x"$rtcd_support" = x"no"], [rtcd_support=""])

          AS_IF([test x"$OPUS_X86_MAY_HAVE_SSE" = x"1"],

@@ -541,11 +538,6 @@

             [rtcd_support=no],

             [rtcd_support="x86$rtcd_support"],

-      ], [

-            AC_MSG_WARN([Currently only have X86 intrinsics for fixed-point])

-            intrinsics_support=no

-      ]

-    )

     AS_IF([test x"$enable_rtcd" = x"yes" && test x"$rtcd_support" != x""],[

             get_cpuid_by_asm="no"

--- a/silk/x86/SigProc_FIX_sse.h

+++ b/silk/x86/SigProc_FIX_sse.h

@@ -45,6 +45,12 @@

     int                         arch                /* I    Run-time architecture                                       */

);

+#if defined(OPUS_X86_PRESUME_SSE4_1)

+#define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \

+    ((void)(arch), silk_burg_modified_sse4_1(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch))

+#else

 extern void (*const SILK_BURG_MODIFIED_IMPL[OPUS_ARCHMASK + 1])(

     opus_int32                  *res_nrg,           /* O    Residual energy                                             */

     opus_int                    *res_nrg_Q,         /* O    Residual energy Q value                                     */

@@ -59,6 +65,8 @@

 #  define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \

     ((*SILK_BURG_MODIFIED_IMPL[(arch) & OPUS_ARCHMASK])(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch))

+#endif

 opus_int64 silk_inner_prod16_aligned_64_sse4_1(

     const opus_int16 *inVec1,

     const opus_int16 *inVec2,

@@ -65,6 +73,14 @@

     const opus_int   len

);

+#if defined(OPUS_X86_PRESUME_SSE4_1)

+#define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \

+    ((void)(arch),silk_inner_prod16_aligned_64_sse4_1(inVec1, inVec2, len))

+#else

 extern opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[OPUS_ARCHMASK + 1])(

                     const opus_int16 *inVec1,

                     const opus_int16 *inVec2,

@@ -73,5 +89,6 @@

 #  define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \

     ((*SILK_INNER_PROD16_ALIGNED_64_IMPL[(arch) & OPUS_ARCHMASK])(inVec1, inVec2, len))

+#endif

 #endif

 #endif

--- a/silk/x86/main_sse.h

+++ b/silk/x86/main_sse.h

@@ -50,6 +50,15 @@

     opus_int                    L                               /* I    number of vectors in codebook               */

);

+#if defined OPUS_X86_PRESUME_SSE4_1

+#define silk_VQ_WMat_EC(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \

+                          mu_Q9, max_gain_Q7, L, arch) \

+    ((void)(arch),silk_VQ_WMat_EC_sse4_1(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \

+                          mu_Q9, max_gain_Q7, L))

+#else

 extern void (*const SILK_VQ_WMAT_EC_IMPL[OPUS_ARCHMASK + 1])(

     opus_int8                   *ind,                           /* O    index of best codebook vector               */

     opus_int32                  *rate_dist_Q14,                 /* O    best weighted quant error + mu * rate       */

@@ -69,6 +78,8 @@

     ((*SILK_VQ_WMAT_EC_IMPL[(arch) & OPUS_ARCHMASK])(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \

                           mu_Q9, max_gain_Q7, L))

+#endif

 #  define OVERRIDE_silk_NSQ

 void silk_NSQ_sse4_1(

@@ -89,6 +100,15 @@

     const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */

);

+#if defined OPUS_X86_PRESUME_SSE4_1

+#define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \

+                   HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \

+    ((void)(arch),silk_NSQ_sse4_1(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \

+                   HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))

+#else

 extern void (*const SILK_NSQ_IMPL[OPUS_ARCHMASK + 1])(

     const silk_encoder_state    *psEncC,                                    /* I/O  Encoder State                   */

     silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */

@@ -112,6 +132,8 @@

     ((*SILK_NSQ_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \

                    HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))

+#endif

 #  define OVERRIDE_silk_NSQ_del_dec

 void silk_NSQ_del_dec_sse4_1(

@@ -132,6 +154,15 @@

     const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */

);

+#if defined OPUS_X86_PRESUME_SSE4_1

+#define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \

+                           HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \

+    ((void)(arch),silk_NSQ_del_dec_sse4_1(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \

+                           HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))

+#else

 extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])(

     const silk_encoder_state    *psEncC,                                    /* I/O  Encoder State                   */

     silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */

@@ -155,6 +186,8 @@

     ((*SILK_NSQ_DEL_DEC_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \

                            HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))

+#endif

 void silk_noise_shape_quantizer(

     silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */

     opus_int            signalType,             /* I    Signal type                     */

@@ -192,6 +225,11 @@

     const opus_int16   pIn[]

);

+#if defined(OPUS_X86_PRESUME_SSE4_1)

+#define silk_VAD_GetSA_Q8(psEnC, pIn, arch) ((void)(arch),silk_VAD_GetSA_Q8_sse4_1(psEnC, pIn))

+#else

 #  define silk_VAD_GetSA_Q8(psEnC, pIn, arch) \

      ((*SILK_VAD_GETSA_Q8_IMPL[(arch) & OPUS_ARCHMASK])(psEnC, pIn))

@@ -201,6 +239,8 @@

 #  define OVERRIDE_silk_warped_LPC_analysis_filter_FIX

+#endif

 void silk_warped_LPC_analysis_filter_FIX_sse4_1(

           opus_int32            state[],                    /* I/O  State [order + 1]                   */

           opus_int32            res_Q2[],                   /* O    Residual signal [length]            */

@@ -211,6 +251,12 @@

     const opus_int              order                       /* I    Filter order (even)                 */

);

+#if defined(OPUS_X86_PRESUME_SSE4_1)

+#define silk_warped_LPC_analysis_filter_FIX(state, res_Q2, coef_Q13, input, lambda_Q16, length, order, arch) \

+    ((void)(arch),silk_warped_LPC_analysis_filter_FIX_sse4_1(state, res_Q2, coef_Q13, input, lambda_Q16, length, order)) /* SSE4.1 presumed at build time: call the SSE4.1 variant directly, arch is evaluated but unused */

+#else

 extern void (*const SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[OPUS_ARCHMASK + 1])(

           opus_int32            state[],                    /* I/O  State [order + 1]                   */

           opus_int32            res_Q2[],                   /* O    Residual signal [length]            */

@@ -223,6 +269,8 @@

 #  define silk_warped_LPC_analysis_filter_FIX(state, res_Q2, coef_Q13, input, lambda_Q16, length, order, arch) \

     ((*SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[(arch) & OPUS_ARCHMASK])(state, res_Q2, coef_Q13, input, lambda_Q16, length, order))

+#endif

 # endif

 #endif

--- a/silk/x86/x86_silk_map.c

+++ b/silk/x86/x86_silk_map.c

@@ -35,6 +35,10 @@

 #include "pitch.h"

 #include "main.h"

+#if !defined(OPUS_X86_PRESUME_SSE4_1)

+#if defined(FIXED_POINT)

 opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[ OPUS_ARCHMASK + 1 ] )(

     const opus_int16 *inVec1,

     const opus_int16 *inVec2,

@@ -42,10 +46,12 @@

 ) = {

   silk_inner_prod16_aligned_64_c,                  /* non-sse */

   silk_inner_prod16_aligned_64_c,

+  silk_inner_prod16_aligned_64_c,

   MAY_HAVE_SSE4_1( silk_inner_prod16_aligned_64 ), /* sse4.1 */

-  NULL

};

+#endif

 opus_int (*const SILK_VAD_GETSA_Q8_IMPL[ OPUS_ARCHMASK + 1 ] )(

     silk_encoder_state *psEncC,

     const opus_int16   pIn[]

@@ -52,8 +58,8 @@

 ) = {

   silk_VAD_GetSA_Q8_c,                  /* non-sse */

   silk_VAD_GetSA_Q8_c,

+  silk_VAD_GetSA_Q8_c,

   MAY_HAVE_SSE4_1( silk_VAD_GetSA_Q8 ), /* sse4.1 */

-  NULL

};

 void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )(

@@ -75,8 +81,8 @@

 ) = {

   silk_NSQ_c,                  /* non-sse */

   silk_NSQ_c,

+  silk_NSQ_c,

   MAY_HAVE_SSE4_1( silk_NSQ ), /* sse4.1 */

-  NULL

};

 void (*const SILK_VQ_WMAT_EC_IMPL[ OPUS_ARCHMASK + 1 ] )(

@@ -94,8 +100,8 @@

 ) = {

   silk_VQ_WMat_EC_c,                  /* non-sse */

   silk_VQ_WMat_EC_c,

+  silk_VQ_WMat_EC_c,

   MAY_HAVE_SSE4_1( silk_VQ_WMat_EC ), /* sse4.1 */

-  NULL

};

 void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )(

@@ -117,10 +123,12 @@

 ) = {

   silk_NSQ_del_dec_c,                  /* non-sse */

   silk_NSQ_del_dec_c,

+  silk_NSQ_del_dec_c,

   MAY_HAVE_SSE4_1( silk_NSQ_del_dec ), /* sse4.1 */

-  NULL

};

+#if defined(FIXED_POINT)

 void (*const SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[ OPUS_ARCHMASK + 1 ] )(

     opus_int32                  state[],                    /* I/O  State [order + 1]                   */

     opus_int32                  res_Q2[],                   /* O    Residual signal [length]            */

@@ -132,8 +140,8 @@

 ) = {

   silk_warped_LPC_analysis_filter_FIX_c,                  /* non-sse */

   silk_warped_LPC_analysis_filter_FIX_c,

+  silk_warped_LPC_analysis_filter_FIX_c,

   MAY_HAVE_SSE4_1( silk_warped_LPC_analysis_filter_FIX ), /* sse4.1 */

-  NULL

};

 void (*const SILK_BURG_MODIFIED_IMPL[ OPUS_ARCHMASK + 1 ] )(

@@ -149,6 +157,9 @@

 ) = {

   silk_burg_modified_c,                  /* non-sse */

   silk_burg_modified_c,

+  silk_burg_modified_c,

   MAY_HAVE_SSE4_1( silk_burg_modified ), /* sse4.1 */

-  NULL

};

+#endif

+#endif