shithub: opus

--- a/celt/arch.h

+++ b/celt/arch.h

@@ -208,6 +208,7 @@

 #define MULT32_32_Q31(a,b)     ((a)*(b))

 #define MAC16_32_Q15(c,a,b)     ((c)+(a)*(b))

+#define MAC16_32_Q16(c,a,b)     ((c)+(a)*(b))

 #define MULT16_16_Q11_32(a,b)     ((a)*(b))

 #define MULT16_16_Q11(a,b)     ((a)*(b))

--- a/celt/arm/fixed_armv4.h

+++ b/celt/arm/fixed_armv4.h

@@ -68,6 +68,10 @@

 #undef MAC16_32_Q15

 #define MAC16_32_Q15(c, a, b) ADD32(c, MULT16_32_Q15(a, b))

+/** 16x32 multiply, followed by a 16-bit shift right and 32-bit add.

+    Result fits in 32 bits. */

+#undef MAC16_32_Q16

+#define MAC16_32_Q16(c, a, b) ADD32(c, MULT16_32_Q16(a, b))

 /** 32x32 multiplication, followed by a 31-bit shift right. Results fits in 32 bits */

 #undef MULT32_32_Q31

--- a/celt/arm/fixed_armv5e.h

+++ b/celt/arm/fixed_armv5e.h

@@ -82,6 +82,23 @@

 #define MAC16_32_Q15(c, a, b) (MAC16_32_Q15_armv5e(c, a, b))

+/** 16x32 multiply, followed by a 16-bit shift right and 32-bit add.

+    Result fits in 32 bits. */

+#undef MAC16_32_Q16

+static OPUS_INLINE opus_val32 MAC16_32_Q16_armv5e(opus_val32 c, opus_val16 a,

+ opus_val32 b)

+{

+  int res;

+  __asm__(

+      "#MAC16_32_Q16\n\t"

+      "smlawb %0, %1, %2, %3;\n"

+      : "=r"(res)

+      : "r"(b), "r"(a), "r"(c)

+  );

+  return res;

+}

+#define MAC16_32_Q16(c, a, b) (MAC16_32_Q16_armv5e(c, a, b))

 /** 16x16 multiply-add where the result fits in 32 bits */

 #undef MAC16_16

 static OPUS_INLINE opus_val32 MAC16_16_armv5e(opus_val32 c, opus_val16 a,

--- a/celt/celt.c

+++ b/celt/celt.c

@@ -86,11 +86,38 @@

 #ifndef OVERRIDE_COMB_FILTER_CONST

+/* This version should be faster on ARM */

+#ifdef OPUS_ARM_ASM

 static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,

       opus_val16 g10, opus_val16 g11, opus_val16 g12)

 {

    opus_val32 x0, x1, x2, x3, x4;

    int i;

+   x4 = SHL32(x[-T-2], 1);

+   x3 = SHL32(x[-T-1], 1);

+   x2 = SHL32(x[-T], 1);

+   x1 = SHL32(x[-T+1], 1);

+   for (i=0;i<N;i++)

+   {

+      opus_val32 t;

+      x0=SHL32(x[i-T+2],1);

+      t = MAC16_32_Q16(x[i], g10, x2);

+      t = MAC16_32_Q16(t, g11, ADD32(x1,x3));

+      t = MAC16_32_Q16(t, g12, ADD32(x0,x4));

+      y[i] = t;

+      x4=x3;

+      x3=x2;

+      x2=x1;

+      x1=x0;

+   }

+}

+#else

+static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,

+      opus_val16 g10, opus_val16 g11, opus_val16 g12)

+{

+   opus_val32 x0, x1, x2, x3, x4;

+   int i;

    x4 = x[-T-2];

    x3 = x[-T-1];

    x2 = x[-T];

@@ -109,6 +136,7 @@

+#endif

 #endif

 void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,

--- a/celt/fixed_debug.h

+++ b/celt/fixed_debug.h

@@ -496,6 +496,7 @@

 #define MULT16_32_Q15(a,b) MULT16_32_QX(a,b,15)

 #define MAC16_32_Q15(c,a,b) (celt_mips-=2,ADD32((c),MULT16_32_Q15((a),(b))))

+#define MAC16_32_Q16(c,a,b) (celt_mips-=2,ADD32((c),MULT16_32_Q16((a),(b))))

 static OPUS_INLINE int SATURATE(int a, int b)

--- a/celt/fixed_generic.h

+++ b/celt/fixed_generic.h

@@ -113,7 +113,11 @@

 /** 16x32 multiply, followed by a 15-bit shift right and 32-bit add.

     b must fit in 31 bits.

     Result fits in 32 bits. */

-#define MAC16_32_Q15(c,a,b) ADD32(c,ADD32(MULT16_16((a),SHR((b),15)), SHR(MULT16_16((a),((b)&0x00007fff)),15)))

+#define MAC16_32_Q15(c,a,b) ADD32((c),ADD32(MULT16_16((a),SHR((b),15)), SHR(MULT16_16((a),((b)&0x00007fff)),15)))

+/** 16x32 multiplication, followed by a 16-bit shift right and 32-bit add.

+    Result fits in 32 bits. */

+#define MAC16_32_Q16(c,a,b) ADD32((c),ADD32(MULT16_16((a),SHR((b),16)), SHR(MULT16_16SU((a),((b)&0x0000ffff)),16)))

 #define MULT16_16_Q11_32(a,b) (SHR(MULT16_16((a),(b)),11))

 #define MULT16_16_Q11(a,b) (SHR(MULT16_16((a),(b)),11))