shithub: opus

--- a/celt/arm/arm_celt_map.c

+++ b/celt/arm/arm_celt_map.c

@@ -35,8 +35,6 @@

 #if defined(OPUS_HAVE_RTCD)

-# if defined(FIXED_POINT)

 # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)

 opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y, int N) = {

   celt_inner_prod_c,   /* ARMv4 */

@@ -54,6 +52,7 @@

};

 # endif

+# if defined(FIXED_POINT)

 #  if ((defined(OPUS_ARM_MAY_HAVE_NEON) && !defined(OPUS_ARM_PRESUME_NEON)) || \

     (defined(OPUS_ARM_MAY_HAVE_MEDIA) && !defined(OPUS_ARM_PRESUME_MEDIA)) || \

     (defined(OPUS_ARM_MAY_HAVE_EDSP) && !defined(OPUS_ARM_PRESUME_EDSP)))

--- a/celt/arm/pitch_arm.h

+++ b/celt/arm/pitch_arm.h

@@ -30,8 +30,6 @@

 # include "armcpu.h"

-# if defined(FIXED_POINT)

 # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)

 opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N);

 void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01,

@@ -67,6 +65,8 @@

 #   define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) ((void)(arch), dual_inner_prod_neon(x, y01, y02, N, xy1, xy2))

 #  endif

 # endif

+# if defined(FIXED_POINT)

 #  if defined(OPUS_ARM_MAY_HAVE_NEON)

 opus_val32 celt_pitch_xcorr_neon(const opus_val16 *_x, const opus_val16 *_y,

--- a/celt/arm/pitch_neon_intr.c

+++ b/celt/arm/pitch_neon_intr.c

@@ -126,4 +126,165 @@

 #endif

+#else /* !FIXED_POINT */

+/* ========================================================================== */

+#ifdef OPUS_CHECK_ASM

+/* This part of code simulates floating-point NEON operations. */

+/* celt_inner_prod_neon_float_c_simulation() simulates the floating-point   */

+/* operations of celt_inner_prod_neon(), and both functions should have bit */

+/* exact output.                                                            */

+static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y, int N)

+{

+   int i;

+   opus_val32 xy, xy0 = 0, xy1 = 0, xy2 = 0, xy3 = 0;

+   for (i = 0; i < N - 3; i += 4) {

+      xy0 = MAC16_16(xy0, x[i + 0], y[i + 0]);

+      xy1 = MAC16_16(xy1, x[i + 1], y[i + 1]);

+      xy2 = MAC16_16(xy2, x[i + 2], y[i + 2]);

+      xy3 = MAC16_16(xy3, x[i + 3], y[i + 3]);

+   }

+   xy0 += xy2;

+   xy1 += xy3;

+   xy = xy0 + xy1;

+   for (; i < N; i++) {

+      xy = MAC16_16(xy, x[i], y[i]);

+   }

+   return xy;

+}

+/* dual_inner_prod_neon_float_c_simulation() simulates the floating-point   */

+/* operations of dual_inner_prod_neon(), and both functions should have bit */

+/* exact output.                                                            */

+static void dual_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,

+      int N, opus_val32 *xy1, opus_val32 *xy2)

+{

+   int i;

+   opus_val32 xy01, xy02, xy01_0 = 0, xy01_1 = 0, xy01_2 = 0, xy01_3 = 0, xy02_0 = 0, xy02_1 = 0, xy02_2 = 0, xy02_3 = 0;

+   for (i = 0; i < N - 3; i += 4) {

+      xy01_0 = MAC16_16(xy01_0, x[i + 0], y01[i + 0]);

+      xy01_1 = MAC16_16(xy01_1, x[i + 1], y01[i + 1]);

+      xy01_2 = MAC16_16(xy01_2, x[i + 2], y01[i + 2]);

+      xy01_3 = MAC16_16(xy01_3, x[i + 3], y01[i + 3]);

+      xy02_0 = MAC16_16(xy02_0, x[i + 0], y02[i + 0]);

+      xy02_1 = MAC16_16(xy02_1, x[i + 1], y02[i + 1]);

+      xy02_2 = MAC16_16(xy02_2, x[i + 2], y02[i + 2]);

+      xy02_3 = MAC16_16(xy02_3, x[i + 3], y02[i + 3]);

+   }

+   xy01_0 += xy01_2;

+   xy02_0 += xy02_2;

+   xy01_1 += xy01_3;

+   xy02_1 += xy02_3;

+   xy01 = xy01_0 + xy01_1;

+   xy02 = xy02_0 + xy02_1;

+   for (; i < N; i++) {

+      xy01 = MAC16_16(xy01, x[i], y01[i]);

+      xy02 = MAC16_16(xy02, x[i], y02[i]);

+   }

+   *xy1 = xy01;

+   *xy2 = xy02;

+}

+#endif /* OPUS_CHECK_ASM */

+/* ========================================================================== */

+opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)

+{

+    int i;

+    opus_val32 xy;

+    float32x4_t xy_f32x4 = vdupq_n_f32(0);

+    float32x2_t xy_f32x2;

+    for (i = 0; i < N - 7; i += 8) {

+        float32x4_t x_f32x4, y_f32x4;

+        x_f32x4  = vld1q_f32(&x[i]);

+        y_f32x4  = vld1q_f32(&y[i]);

+        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);

+        x_f32x4  = vld1q_f32(&x[i + 4]);

+        y_f32x4  = vld1q_f32(&y[i + 4]);

+        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);

+    }

+    if (N - i >= 4) {

+        const float32x4_t x_f32x4 = vld1q_f32(&x[i]);

+        const float32x4_t y_f32x4 = vld1q_f32(&y[i]);

+        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);

+        i += 4;

+    }

+    xy_f32x2 = vadd_f32(vget_low_f32(xy_f32x4), vget_high_f32(xy_f32x4));

+    xy_f32x2 = vpadd_f32(xy_f32x2, xy_f32x2);

+    xy       = vget_lane_f32(xy_f32x2, 0);

+    for (; i < N; i++) {

+        xy = MAC16_16(xy, x[i], y[i]);

+    }

+#ifdef OPUS_CHECK_ASM

+    celt_assert(ABS32(celt_inner_prod_neon_float_c_simulation(x, y, N) - xy) <= VERY_SMALL);

 #endif

+    return xy;

+}

+void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,

+        int N, opus_val32 *xy1, opus_val32 *xy2)

+{

+    int i;

+    opus_val32 xy01, xy02;

+    float32x4_t xy01_f32x4 = vdupq_n_f32(0);

+    float32x4_t xy02_f32x4 = vdupq_n_f32(0);

+    float32x2_t xy01_f32x2, xy02_f32x2;

+    for (i = 0; i < N - 7; i += 8) {

+        float32x4_t x_f32x4, y01_f32x4, y02_f32x4;

+        x_f32x4    = vld1q_f32(&x[i]);

+        y01_f32x4  = vld1q_f32(&y01[i]);

+        y02_f32x4  = vld1q_f32(&y02[i]);

+        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);

+        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);

+        x_f32x4    = vld1q_f32(&x[i + 4]);

+        y01_f32x4  = vld1q_f32(&y01[i + 4]);

+        y02_f32x4  = vld1q_f32(&y02[i + 4]);

+        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);

+        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);

+    }

+    if (N - i >= 4) {

+        const float32x4_t x_f32x4   = vld1q_f32(&x[i]);

+        const float32x4_t y01_f32x4 = vld1q_f32(&y01[i]);

+        const float32x4_t y02_f32x4 = vld1q_f32(&y02[i]);

+        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);

+        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);

+        i += 4;

+    }

+    xy01_f32x2 = vadd_f32(vget_low_f32(xy01_f32x4), vget_high_f32(xy01_f32x4));

+    xy02_f32x2 = vadd_f32(vget_low_f32(xy02_f32x4), vget_high_f32(xy02_f32x4));

+    xy01_f32x2 = vpadd_f32(xy01_f32x2, xy01_f32x2);

+    xy02_f32x2 = vpadd_f32(xy02_f32x2, xy02_f32x2);

+    xy01       = vget_lane_f32(xy01_f32x2, 0);

+    xy02       = vget_lane_f32(xy02_f32x2, 0);

+    for (; i < N; i++) {

+        xy01 = MAC16_16(xy01, x[i], y01[i]);

+        xy02 = MAC16_16(xy02, x[i], y02[i]);

+    }

+    *xy1 = xy01;

+    *xy2 = xy02;

+#ifdef OPUS_CHECK_ASM

+    {

+        opus_val32 xy1_c, xy2_c;

+        dual_inner_prod_neon_float_c_simulation(x, y01, y02, N, &xy1_c, &xy2_c);

+        celt_assert(ABS32(xy1_c - *xy1) <= VERY_SMALL);

+        celt_assert(ABS32(xy2_c - *xy2) <= VERY_SMALL);

+    }

+#endif

+}

+#endif /* FIXED_POINT */

--- a/celt/pitch.h

+++ b/celt/pitch.h

@@ -46,8 +46,7 @@

 #include "mips/pitch_mipsr1.h"

 #endif

-#if ((defined(OPUS_ARM_ASM) && defined(FIXED_POINT)) \

-  || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))

+#if (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))

 # include "arm/pitch_arm.h"

 #endif