ref: b9176a4c3e6f0b8cbe4a309afbe117069b9d32e5
parent: ff31b79016082034330263ee14ab995bc9c7101d
author: Jean-Marc Valin <[email protected]>
date: Mon Jun 17 12:37:41 EDT 2013
Makes dual_inner_prod() more generic to increase its use
--- a/celt/bands.c
+++ b/celt/bands.c
@@ -41,6 +41,7 @@
#include "mathops.h"
#include "rate.h"
#include "quant_bands.h"
+#include "pitch.h"
int hysteresis_decision(opus_val16 val, const opus_val16 *thresholds, const opus_val16 *hysteresis, int N, int prev)
{
@@ -383,11 +384,7 @@
opus_val32 t, lgain, rgain;
/* Compute the norm of X+Y and X-Y as |X|^2 + |Y|^2 +/- sum(xy) */
- for (j=0;j<N;j++)
- {
- xp = MAC16_16(xp, X[j], Y[j]);
- side = MAC16_16(side, Y[j], Y[j]);
- }
+ dual_inner_prod(Y, X, Y, N, &xp, &side);
/* Compensating for the mid normalization */
xp = MULT16_32_Q15(mid, xp);
/* mid and side are in Q15, not Q14 like X and Y */
--- a/celt/pitch.c
+++ b/celt/pitch.c
@@ -394,20 +394,6 @@
RESTORE_STACK;
}
-#ifndef OVERRIDE_DUAL_INNER_PROD
-static opus_val32 dual_inner_prod(opus_val16 *x, opus_val16 *y01, opus_val16 *y02, int N)
-{
- int i;
- opus_val32 xy=0;
- for (i=0;i<N;i++)
- {
- xy = MAC16_16(xy, x[i], y01[i]);
- xy = MAC16_16(xy, x[i], y02[i]);
- }
- return xy;
-}
-#endif
-
static const int second_check[16] = {0, 0, 3, 2, 3, 2, 5, 2, 3, 2, 3, 2, 5, 2, 3, 2};
opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
int N, int *T0_, int prev_period, opus_val16 prev_gain)
@@ -415,7 +401,7 @@
int k, i, T, T0;
opus_val16 g, g0;
opus_val16 pg;
- opus_val32 xy,xx,yy;
+ opus_val32 xy,xx,yy,xy2;
opus_val32 xcorr[3];
opus_val32 best_xy, best_yy;
int offset;
@@ -435,12 +421,7 @@
T = T0 = *T0_;
ALLOC(yy_lookup, maxperiod+1, opus_val32);
- xy=xx=0;
- for (i=0;i<N;i++)
- {
- xx = MAC16_16(xx, x[i], x[i]);
- xy = MAC16_16(xy, x[i], x[i-T0]);
- }
+ dual_inner_prod(x, x, x-T0, N, &xx, &xy);
yy_lookup[0] = xx;
yy=xx;
for (i=1;i<=maxperiod;i++)
@@ -484,7 +465,8 @@
{
T1b = (2*second_check[k]*T0+k)/(2*k);
}
- xy = dual_inner_prod(x, &x[-T1], &x[-T1b], N);
+ dual_inner_prod(x, &x[-T1], &x[-T1b], N, &xy, &xy2);
+ xy += xy2;
yy = yy_lookup[T1] + yy_lookup[T1b];
#ifdef FIXED_POINT
{
--- a/celt/pitch.h
+++ b/celt/pitch.h
@@ -118,6 +118,23 @@
}
#endif /* OVERRIDE_XCORR_KERNEL */
+#ifndef OVERRIDE_DUAL_INNER_PROD
+static inline void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
+ int N, opus_val32 *xy1, opus_val32 *xy2)
+{
+ int i;
+ opus_val32 xy01=0;
+ opus_val32 xy02=0;
+ for (i=0;i<N;i++)
+ {
+ xy01 = MAC16_16(xy01, x[i], y01[i]);
+ xy02 = MAC16_16(xy02, x[i], y02[i]);
+ }
+ *xy1 = xy01;
+ *xy2 = xy02;
+}
+#endif
+
#ifdef FIXED_POINT
opus_val32
#else
--- a/celt/x86/pitch_sse.h
+++ b/celt/x86/pitch_sse.h
@@ -72,11 +72,11 @@
}
#define OVERRIDE_DUAL_INNER_PROD
-static inline opus_val32 dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, int N)
+static inline void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
+ int N, opus_val32 *xy1, opus_val32 *xy2)
{
int i;
__m128 xsum1, xsum2;
- opus_val32 xy=0;
xsum1 = _mm_setzero_ps();
xsum2 = _mm_setzero_ps();
for (i=0;i<N-3;i+=4)
@@ -87,17 +87,18 @@
xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i));
xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i));
}
- xsum1 = _mm_add_ps(xsum1,xsum2);
/* Horizontal sum */
xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1));
xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55));
- _mm_store_ss(&xy, xsum1);
+ _mm_store_ss(xy1, xsum1);
+ xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2));
+ xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55));
+ _mm_store_ss(xy2, xsum2);
for (;i<N;i++)
{
- xy = MAC16_16(xy, x[i], y01[i]);
- xy = MAC16_16(xy, x[i], y02[i]);
+ *xy1 = MAC16_16(*xy1, x[i], y01[i]);
+ *xy2 = MAC16_16(*xy2, x[i], y02[i]);
}
- return xy;
}
#define OVERRIDE_COMB_FILTER_CONST