shithub: opus

Download patch

ref: a156c5ece7133383468d4cba33f067595d9da391
parent: 568de0a17b7b8e49fe3fa4d2a3cc4ebd62400cab
author: Jean-Marc Valin <[email protected]>
date: Mon Aug 26 14:54:39 EDT 2013

Makes the SILK pitch search use celt_pitch_xcorr()

Should gives us ARM/SSE optimizations for free.

--- a/celt/pitch.c
+++ b/celt/pitch.c
@@ -250,7 +250,7 @@
 #else
 void
 #endif
-celt_pitch_xcorr(opus_val16 *_x, opus_val16 *_y, opus_val32 *xcorr, int len, int max_pitch)
+celt_pitch_xcorr(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr, int len, int max_pitch)
 {
    int i,j;
 #ifdef FIXED_POINT
--- a/celt/pitch.h
+++ b/celt/pitch.h
@@ -140,6 +140,6 @@
 #else
 void
 #endif
-celt_pitch_xcorr(opus_val16 *_x, opus_val16 *_y, opus_val32 *xcorr, int len, int max_pitch);
+celt_pitch_xcorr(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr, int len, int max_pitch);
 
 #endif
--- a/silk/fixed/pitch_analysis_core_FIX.c
+++ b/silk/fixed/pitch_analysis_core_FIX.c
@@ -36,6 +36,7 @@
 #include "pitch_est_defines.h"
 #include "stack_alloc.h"
 #include "debug.h"
+#include "pitch.h"
 
 #define SCRATCH_SIZE    22
 #define SF_LENGTH_4KHZ  ( PE_SUBFR_LENGTH_MS * 4 )
@@ -96,6 +97,7 @@
     const opus_int16 *input_frame_ptr;
     opus_int   i, k, d, j;
     VARDECL( opus_int16, C );
+    VARDECL( opus_int32, xcorr32 );
     const opus_int16 *target_ptr, *basis_ptr;
     opus_int32 cross_corr, normalizer, energy, shift, energy_basis, energy_target;
     opus_int   d_srch[ PE_D_SRCH_LENGTH ], Cmax, length_d_srch, length_d_comp;
@@ -173,6 +175,7 @@
     * FIRST STAGE, operating in 4 khz
     ******************************************************************************/
     ALLOC( C, nb_subfr * CSTRIDE_8KHZ, opus_int16 );
+    ALLOC( xcorr32, MAX_LAG_4KHZ-MIN_LAG_4KHZ+1, opus_int32 );
     silk_memset( C, 0, (nb_subfr >> 1) * CSTRIDE_4KHZ * sizeof( opus_int16 ) );
     target_ptr = &frame_4kHz[ silk_LSHIFT( SF_LENGTH_4KHZ, 2 ) ];
     for( k = 0; k < nb_subfr >> 1; k++ ) {
@@ -186,8 +189,10 @@
         silk_assert( basis_ptr >= frame_4kHz );
         silk_assert( basis_ptr + SF_LENGTH_8KHZ <= frame_4kHz + frame_length_4kHz );
 
+        celt_pitch_xcorr( target_ptr, target_ptr - MAX_LAG_4KHZ, xcorr32, SF_LENGTH_8KHZ, MAX_LAG_4KHZ - MIN_LAG_4KHZ + 1 );
+
         /* Calculate first vector products before loop */
-        cross_corr = silk_inner_prod_aligned( target_ptr, basis_ptr,  SF_LENGTH_8KHZ );
+        cross_corr = xcorr32[ MAX_LAG_4KHZ - MIN_LAG_4KHZ ];
         normalizer = silk_inner_prod_aligned( target_ptr, target_ptr, SF_LENGTH_8KHZ );
         normalizer = silk_ADD32( normalizer, silk_inner_prod_aligned( basis_ptr,  basis_ptr, SF_LENGTH_8KHZ ) );
         normalizer = silk_ADD32( normalizer, silk_SMULBB( SF_LENGTH_8KHZ, 4000 ) );
@@ -203,7 +208,7 @@
             silk_assert( basis_ptr >= frame_4kHz );
             silk_assert( basis_ptr + SF_LENGTH_8KHZ <= frame_4kHz + frame_length_4kHz );
 
-            cross_corr = silk_inner_prod_aligned( target_ptr, basis_ptr, SF_LENGTH_8KHZ );
+            cross_corr = xcorr32[ MAX_LAG_4KHZ - d ];
 
             /* Add contribution of new sample and remove contribution from oldest sample */
             normalizer = silk_ADD32( normalizer,
@@ -595,11 +600,11 @@
     opus_int          complexity                       /* I Complexity setting          */
 )
 {
-    const opus_int16 *target_ptr, *basis_ptr;
-    opus_int32 cross_corr;
+    const opus_int16 *target_ptr;
     opus_int   i, j, k, lag_counter, lag_low, lag_high;
     opus_int   nb_cbk_search, delta, idx, cbk_size;
     VARDECL( opus_int32, scratch_mem );
+    VARDECL( opus_int32, xcorr32 );
     const opus_int8 *Lag_range_ptr, *Lag_CB_ptr;
     SAVE_STACK;
 
@@ -619,6 +624,7 @@
         cbk_size      = PE_NB_CBKS_STAGE3_10MS;
     }
     ALLOC( scratch_mem, SCRATCH_SIZE, opus_int32 );
+    ALLOC( xcorr32, SCRATCH_SIZE, opus_int32 );
 
     target_ptr = &frame[ silk_LSHIFT( sf_length, 2 ) ]; /* Pointer to middle of frame */
     for( k = 0; k < nb_subfr; k++ ) {
@@ -627,11 +633,11 @@
         /* Calculate the correlations for each subframe */
         lag_low  = matrix_ptr( Lag_range_ptr, k, 0, 2 );
         lag_high = matrix_ptr( Lag_range_ptr, k, 1, 2 );
+        silk_assert(lag_high-lag_low+1 <= SCRATCH_SIZE);
+        celt_pitch_xcorr( target_ptr, target_ptr - start_lag - lag_high, xcorr32, sf_length, lag_high - lag_low + 1 );
         for( j = lag_low; j <= lag_high; j++ ) {
-            basis_ptr = target_ptr - ( start_lag + j );
-            cross_corr = silk_inner_prod_aligned( target_ptr, basis_ptr, sf_length );
             silk_assert( lag_counter < SCRATCH_SIZE );
-            scratch_mem[ lag_counter ] = cross_corr;
+            scratch_mem[ lag_counter ] = xcorr32[ lag_high - j ];
             lag_counter++;
         }
 
--- a/silk/float/pitch_analysis_core_FLP.c
+++ b/silk/float/pitch_analysis_core_FLP.c
@@ -35,6 +35,7 @@
 #include "SigProc_FLP.h"
 #include "SigProc_FIX.h"
 #include "pitch_est_defines.h"
+#include "pitch.h"
 
 #define SCRATCH_SIZE        22
 
@@ -84,6 +85,7 @@
     opus_int32 filt_state[ 6 ];
     silk_float threshold, contour_bias;
     silk_float C[ PE_MAX_NB_SUBFR][ (PE_MAX_LAG >> 1) + 5 ];
+    opus_val32 xcorr[ PE_MAX_LAG_MS * 4 - PE_MIN_LAG_MS * 4 + 1 ];
     silk_float CC[ PE_NB_CBKS_STAGE2_EXT ];
     const silk_float *target_ptr, *basis_ptr;
     double    cross_corr, normalizer, energy, energy_tmp;
@@ -174,8 +176,10 @@
         silk_assert( basis_ptr >= frame_4kHz );
         silk_assert( basis_ptr + sf_length_8kHz <= frame_4kHz + frame_length_4kHz );
 
+        celt_pitch_xcorr( target_ptr, target_ptr-max_lag_4kHz, xcorr, sf_length_8kHz, max_lag_4kHz - min_lag_4kHz + 1 );
+
         /* Calculate first vector products before loop */
-        cross_corr = silk_inner_product_FLP( target_ptr, basis_ptr, sf_length_8kHz );
+        cross_corr = xcorr[ max_lag_4kHz - min_lag_4kHz ];
         normalizer = silk_energy_FLP( target_ptr, sf_length_8kHz ) + 
                      silk_energy_FLP( basis_ptr,  sf_length_8kHz ) + 
                      sf_length_8kHz * 4000.0f;
@@ -190,7 +194,7 @@
             silk_assert( basis_ptr >= frame_4kHz );
             silk_assert( basis_ptr + sf_length_8kHz <= frame_4kHz + frame_length_4kHz );
 
-            cross_corr = silk_inner_product_FLP(target_ptr, basis_ptr, sf_length_8kHz);
+            cross_corr = xcorr[ max_lag_4kHz - d ];
 
             /* Add contribution of new sample and remove contribution from oldest sample */
             normalizer +=
@@ -496,6 +500,7 @@
     opus_int   i, j, k, lag_counter, lag_low, lag_high;
     opus_int   nb_cbk_search, delta, idx, cbk_size;
     silk_float scratch_mem[ SCRATCH_SIZE ];
+    opus_val32 xcorr[ SCRATCH_SIZE ];
     const opus_int8 *Lag_range_ptr, *Lag_CB_ptr;
 
     silk_assert( complexity >= SILK_PE_MIN_COMPLEX );
@@ -521,10 +526,12 @@
         /* Calculate the correlations for each subframe */
         lag_low  = matrix_ptr( Lag_range_ptr, k, 0, 2 );
         lag_high = matrix_ptr( Lag_range_ptr, k, 1, 2 );
+        silk_assert(lag_high-lag_low+1 <= SCRATCH_SIZE);
+        celt_pitch_xcorr( target_ptr, target_ptr - start_lag - lag_high, xcorr, sf_length, lag_high - lag_low + 1 );
         for( j = lag_low; j <= lag_high; j++ ) {
             basis_ptr = target_ptr - ( start_lag + j );
             silk_assert( lag_counter < SCRATCH_SIZE );
-            scratch_mem[ lag_counter ] = (silk_float)silk_inner_product_FLP( target_ptr, basis_ptr, sf_length );
+            scratch_mem[ lag_counter ] = xcorr[ lag_high - j ];
             lag_counter++;
         }