shithub: opus

Download patch

ref: 60eb7d88b4eace91395e553cf70fc4578a950353
parent: 1eaa67c0dce1f642af872cc26b7ff4b057a55bfb
author: Linfeng Zhang <[email protected]>
date: Thu Sep 1 09:44:11 EDT 2016

Update silk_biquad_alt()

Split to silk_biquad_alt_stride1() and silk_biquad_alt_stride2(),
so that it can be optimized more efficiently when stride is 2.

This change in C code is bit exact with the origin.

Change-Id: Idaefe670397016ace2a489e3435ac61b7dbe79d5

Signed-off-by: Jean-Marc Valin <[email protected]>

--- a/silk/LP_variable_cutoff.c
+++ b/silk/LP_variable_cutoff.c
@@ -130,6 +130,6 @@
 
         /* ARMA low-pass filtering */
         silk_assert( TRANSITION_NB == 3 && TRANSITION_NA == 2 );
-        silk_biquad_alt( frame, B_Q28, A_Q28, psLP->In_LP_State, frame, frame_length, 1);
+        silk_biquad_alt_stride1( frame, B_Q28, A_Q28, psLP->In_LP_State, frame, frame_length);
     }
 }
--- a/silk/SigProc_FIX.h
+++ b/silk/SigProc_FIX.h
@@ -100,14 +100,22 @@
  * slower than biquad() but uses more precise coefficients
  * can handle (slowly) varying coefficients
  */
-void silk_biquad_alt(
+void silk_biquad_alt_stride1(
     const opus_int16            *in,                /* I     input signal                                               */
     const opus_int32            *B_Q28,             /* I     MA coefficients [3]                                        */
     const opus_int32            *A_Q28,             /* I     AR coefficients [2]                                        */
     opus_int32                  *S,                 /* I/O   State vector [2]                                           */
     opus_int16                  *out,               /* O     output signal                                              */
-    const opus_int32            len,                /* I     signal length (must be even)                               */
-    opus_int                    stride              /* I     Operate on interleaved signal if > 1                       */
+    const opus_int32            len                 /* I     signal length (must be even)                               */
+);
+
+void silk_biquad_alt_stride2(
+    const opus_int16            *in,                /* I     input signal                                               */
+    const opus_int32            *B_Q28,             /* I     MA coefficients [3]                                        */
+    const opus_int32            *A_Q28,             /* I     AR coefficients [2]                                        */
+    opus_int32                  *S,                 /* I/O   State vector [4]                                           */
+    opus_int16                  *out,               /* O     output signal                                              */
+    const opus_int32            len                 /* I     signal length (must be even)                               */
 );
 
 /* Variable order MA prediction error filter. */
--- a/silk/biquad_alt.c
+++ b/silk/biquad_alt.c
@@ -39,14 +39,13 @@
 #include "SigProc_FIX.h"
 
 /* Second order ARMA filter, alternative implementation */
-void silk_biquad_alt(
+void silk_biquad_alt_stride1(
     const opus_int16            *in,                /* I     input signal                                               */
     const opus_int32            *B_Q28,             /* I     MA coefficients [3]                                        */
     const opus_int32            *A_Q28,             /* I     AR coefficients [2]                                        */
     opus_int32                  *S,                 /* I/O   State vector [2]                                           */
     opus_int16                  *out,               /* O     output signal                                              */
-    const opus_int32            len,                /* I     signal length (must be even)                               */
-    opus_int                    stride              /* I     Operate on interleaved signal if > 1                       */
+    const opus_int32            len                 /* I     signal length (must be even)                               */
 )
 {
     /* DIRECT FORM II TRANSPOSED (uses 2 element state vector) */
@@ -61,7 +60,7 @@
 
     for( k = 0; k < len; k++ ) {
         /* S[ 0 ], S[ 1 ]: Q12 */
-        inval = in[ k * stride ];
+        inval = in[ k ];
         out32_Q14 = silk_LSHIFT( silk_SMLAWB( S[ 0 ], B_Q28[ 0 ], inval ), 2 );
 
         S[ 0 ] = S[1] + silk_RSHIFT_ROUND( silk_SMULWB( out32_Q14, A0_L_Q28 ), 14 );
@@ -73,6 +72,50 @@
         S[ 1 ] = silk_SMLAWB( S[ 1 ], B_Q28[ 2 ], inval );
 
         /* Scale back to Q0 and saturate */
-        out[ k * stride ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14 + (1<<14) - 1, 14 ) );
+        out[ k ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14 + (1<<14) - 1, 14 ) );
+    }
+}
+
+void silk_biquad_alt_stride2(
+    const opus_int16            *in,                /* I     input signal                                               */
+    const opus_int32            *B_Q28,             /* I     MA coefficients [3]                                        */
+    const opus_int32            *A_Q28,             /* I     AR coefficients [2]                                        */
+    opus_int32                  *S,                 /* I/O   State vector [4]                                           */
+    opus_int16                  *out,               /* O     output signal                                              */
+    const opus_int32            len                 /* I     signal length (must be even)                               */
+)
+{
+    /* DIRECT FORM II TRANSPOSED (uses 2 element state vector) */
+    opus_int   k;
+    opus_int32 A0_U_Q28, A0_L_Q28, A1_U_Q28, A1_L_Q28, out32_Q14[ 2 ];
+
+    /* Negate A_Q28 values and split in two parts */
+    A0_L_Q28 = ( -A_Q28[ 0 ] ) & 0x00003FFF;        /* lower part */
+    A0_U_Q28 = silk_RSHIFT( -A_Q28[ 0 ], 14 );      /* upper part */
+    A1_L_Q28 = ( -A_Q28[ 1 ] ) & 0x00003FFF;        /* lower part */
+    A1_U_Q28 = silk_RSHIFT( -A_Q28[ 1 ], 14 );      /* upper part */
+
+    for( k = 0; k < len; k++ ) {
+        /* S[ 0 ], S[ 1 ], S[ 2 ], S[ 3 ]: Q12 */
+        out32_Q14[ 0 ] = silk_LSHIFT( silk_SMLAWB( S[ 0 ], B_Q28[ 0 ], in[ 2 * k + 0 ] ), 2 );
+        out32_Q14[ 1 ] = silk_LSHIFT( silk_SMLAWB( S[ 2 ], B_Q28[ 0 ], in[ 2 * k + 1 ] ), 2 );
+
+        S[ 0 ] = S[ 1 ] + silk_RSHIFT_ROUND( silk_SMULWB( out32_Q14[ 0 ], A0_L_Q28 ), 14 );
+        S[ 2 ] = S[ 3 ] + silk_RSHIFT_ROUND( silk_SMULWB( out32_Q14[ 1 ], A0_L_Q28 ), 14 );
+        S[ 0 ] = silk_SMLAWB( S[ 0 ], out32_Q14[ 0 ], A0_U_Q28 );
+        S[ 2 ] = silk_SMLAWB( S[ 2 ], out32_Q14[ 1 ], A0_U_Q28 );
+        S[ 0 ] = silk_SMLAWB( S[ 0 ], B_Q28[ 1 ], in[ 2 * k + 0 ] );
+        S[ 2 ] = silk_SMLAWB( S[ 2 ], B_Q28[ 1 ], in[ 2 * k + 1 ] );
+
+        S[ 1 ] = silk_RSHIFT_ROUND( silk_SMULWB( out32_Q14[ 0 ], A1_L_Q28 ), 14 );
+        S[ 3 ] = silk_RSHIFT_ROUND( silk_SMULWB( out32_Q14[ 1 ], A1_L_Q28 ), 14 );
+        S[ 1 ] = silk_SMLAWB( S[ 1 ], out32_Q14[ 0 ], A1_U_Q28 );
+        S[ 3 ] = silk_SMLAWB( S[ 3 ], out32_Q14[ 1 ], A1_U_Q28 );
+        S[ 1 ] = silk_SMLAWB( S[ 1 ], B_Q28[ 2 ], in[ 2 * k + 0 ] );
+        S[ 3 ] = silk_SMLAWB( S[ 3 ], B_Q28[ 2 ], in[ 2 * k + 1 ] );
+
+        /* Scale back to Q0 and saturate */
+        out[ 2 * k + 0 ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14[ 0 ] + (1<<14) - 1, 14 ) );
+        out[ 2 * k + 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14[ 1 ] + (1<<14) - 1, 14 ) );
     }
 }
--- a/src/opus_encoder.c
+++ b/src/opus_encoder.c
@@ -364,9 +364,10 @@
    A_Q28[ 1 ] = silk_SMULWW( r_Q22, r_Q22 );
 
 #ifdef FIXED_POINT
-   silk_biquad_alt( in, B_Q28, A_Q28, hp_mem, out, len, channels );
-   if( channels == 2 ) {
-       silk_biquad_alt( in+1, B_Q28, A_Q28, hp_mem+2, out+1, len, channels );
+   if( channels == 1 ) {
+      silk_biquad_alt_stride1( in, B_Q28, A_Q28, hp_mem, out, len );
+   } else {
+      silk_biquad_alt_stride2( in, B_Q28, A_Q28, hp_mem, out, len );
    }
 #else
    silk_biquad_float( in, B_Q28, A_Q28, hp_mem, out, len, channels );