shithub: opus

ref: 0d95b3b48cb0f852b22693bcf49ed4f0e604def3
parent: 71766ef7a451a48536255c62c273b037ebcb2c42
author: Jean-Marc Valin <[email protected]>
date: Sun Jun 16 23:58:16 EDT 2013

SSE optimization of comb_filter()

Should make it easy to adapt to other architectures.
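
The hook added in celt.c below is the whole porting interface: an architecture
header defines OVERRIDE_COMB_FILTER_CONST and supplies its own static
comb_filter_const() with the same signature, exactly as celt/x86/pitch_sse.h
does for SSE. A minimal sketch of such an override (the header name is
hypothetical and the scalar body is a placeholder for the target ISA's vector
code; neither is part of this patch):

    /* hypothetical celt/arm/pitch_neon.h -- illustrative only */
    #define OVERRIDE_COMB_FILTER_CONST
    static inline void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
          opus_val16 g10, opus_val16 g11, opus_val16 g12)
    {
       int i;
       /* Replace this scalar loop with the target ISA's vector code. The taps
          are symmetric: g10*x[i-T] + g11*(x[i-T-1]+x[i-T+1]) + g12*(x[i-T-2]+x[i-T+2]). */
       for (i=0;i<N;i++)
          y[i] = x[i]
                   + MULT16_32_Q15(g10,x[i-T])
                   + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1]))
                   + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2]));
    }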

--- a/celt/celt.c
+++ b/celt/celt.c
@@ -85,7 +85,32 @@
    return ret;
 }
 
+#ifndef OVERRIDE_COMB_FILTER_CONST
+static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
+      opus_val16 g10, opus_val16 g11, opus_val16 g12)
+{
+   opus_val32 x0, x1, x2, x3, x4;
+   int i;
+   x4 = x[-T-2];
+   x3 = x[-T-1];
+   x2 = x[-T];
+   x1 = x[-T+1];
+   for (i=0;i<N;i++)
+   {
+      x0=x[i-T+2];
+      y[i] = x[i]
+               + MULT16_32_Q15(g10,x2)
+               + MULT16_32_Q15(g11,ADD32(x1,x3))
+               + MULT16_32_Q15(g12,ADD32(x0,x4));
+      x4=x3;
+      x3=x2;
+      x2=x1;
+      x1=x0;
+   }
 
+}
+#endif
+
 void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
       opus_val16 g0, opus_val16 g1, int tapset0, int tapset1,
       const opus_val16 *window, int overlap)
@@ -141,19 +166,9 @@
          OPUS_MOVE(y+overlap, x+overlap, N-overlap);
       return;
    }
-   /* OPT: For machines where the movs are costly, unroll by 5 */
-   for (;i<N;i++)
-   {
-      x0=x[i-T1+2];
-      y[i] = x[i]
-               + MULT16_32_Q15(g10,x2)
-               + MULT16_32_Q15(g11,ADD32(x1,x3))
-               + MULT16_32_Q15(g12,ADD32(x0,x4));
-      x4=x3;
-      x3=x2;
-      x2=x1;
-      x1=x0;
-   }
+
+   /* Compute the part with the constant filter. */
+   comb_filter_const(y+i, x+i, T1, N-i, g10, g11, g12);
 }
 
 const signed char tf_select_table[4][8] = {
--- a/celt/x86/pitch_sse.h
+++ b/celt/x86/pitch_sse.h
@@ -100,5 +100,54 @@
    return xy;
 }
 
+#define OVERRIDE_COMB_FILTER_CONST
+static inline void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
+      opus_val16 g10, opus_val16 g11, opus_val16 g12)
+{
+   int i;
+   __m128 x0v;
+   __m128 g10v, g11v, g12v;
+   g10v = _mm_load1_ps(&g10);
+   g11v = _mm_load1_ps(&g11);
+   g12v = _mm_load1_ps(&g12);
+   x0v = _mm_loadu_ps(&x[-T-2]);
+   for (i=0;i<N-3;i+=4)
+   {
+      __m128 yi, yi2, x1v, x2v, x3v, x4v;
+      const opus_val32 *xp = &x[i-T-2];
+      yi = _mm_loadu_ps(x+i);
+      x4v = _mm_loadu_ps(xp+4);
+#if 0
+      /* Slower version with all loads */
+      x1v = _mm_loadu_ps(xp+1);
+      x2v = _mm_loadu_ps(xp+2);
+      x3v = _mm_loadu_ps(xp+3);
+#else
+      x2v = _mm_shuffle_ps(x0v, x4v, 0x4e);
+      x1v = _mm_shuffle_ps(x0v, x2v, 0x99);
+      x3v = _mm_shuffle_ps(x2v, x4v, 0x99);
+#endif
+
+      yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v));
+#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */
+      yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)));
+      yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
+#else
+      /* Use partial sums */
+      yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)),
+                       _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
+      yi = _mm_add_ps(yi, yi2);
+#endif
+      x0v=x4v;
+      _mm_storeu_ps(y+i, yi);
+   }
+   for (;i<N;i++)
+   {
+      y[i] = x[i]
+               + MULT16_32_Q15(g10,x[i-T])
+               + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1]))
+               + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2]));
+   }
+}
 
 #endif
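
A note on the shuffle trick above, spelled out here for porting (standard SSE
semantics; the lane breakdown is not part of the commit): _mm_shuffle_ps(a, b, imm)
returns { a[imm&3], a[(imm>>2)&3], b[(imm>>4)&3], b[(imm>>6)&3] }. With
xp = &x[i-T-2], x0v holds xp[0..3] (carried over from the previous iteration)
and x4v holds xp[4..7], so the three unaligned middle windows are rebuilt from
registers instead of loaded:

    x2v = _mm_shuffle_ps(x0v, x4v, 0x4e);  /* { xp[2], xp[3], xp[4], xp[5] } = x[i-T]   lanes */
    x1v = _mm_shuffle_ps(x0v, x2v, 0x99);  /* { xp[1], xp[2], xp[3], xp[4] } = x[i-T-1] lanes */
    x3v = _mm_shuffle_ps(x2v, x4v, 0x99);  /* { xp[3], xp[4], xp[5], xp[6] } = x[i-T+1] lanes */

The default partial-sums path reorders the float additions relative to the
scalar code, which is why it is not bit-exact; the #if 0 branch accumulates
into yi in the scalar order, restoring bit-exactness at the cost of a longer
dependency chain on yi.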