shithub: opus

--- a/silk/NSQ.c

+++ b/silk/NSQ.c

@@ -75,21 +75,21 @@

 void silk_NSQ_c

-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */

-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */

-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */

+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */

+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */

+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */

     const opus_int16            x16[],                                        /* I    Input                           */

-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */

-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */

-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */

-    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs              */

-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */

-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */

-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */

-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */

-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */

-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */

-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */

+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */

+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */

+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */

+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */

+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */

+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */

+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */

+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */

+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */

+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */

+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */

     opus_int            k, lag, start_idx, LSF_interpolation_flag;

@@ -173,9 +173,9 @@

     RESTORE_STACK;

-/***********************************/

-/* silk_noise_shape_quantizer  */

-/***********************************/

+/******************************/

+/* silk_noise_shape_quantizer */

+/******************************/

 #if !defined(OPUS_X86_MAY_HAVE_SSE4_1)

 static OPUS_INLINE

--- a/silk/NSQ_del_dec.c

+++ b/silk/NSQ_del_dec.c

@@ -115,21 +115,21 @@

);

 void silk_NSQ_del_dec_c(

-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */

-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */

-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */

+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */

+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */

+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */

     const opus_int16            x16[],                                        /* I    Input                           */

-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */

-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */

-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */

-    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs              */

-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */

-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */

-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */

-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */

-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */

-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */

-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */

+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */

+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */

+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */

+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */

+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */

+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */

+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */

+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */

+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */

+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */

+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */

     opus_int            i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr;

--- a/silk/SigProc_FIX.h

+++ b/silk/SigProc_FIX.h

@@ -381,7 +381,7 @@

     const opus_int              len                 /*    I vector lengths                                              */

);

-opus_int64 silk_inner_prod16_aligned_64_c(

+opus_int64 silk_inner_prod16_c(

     const opus_int16            *inVec1,            /*    I input vector 1                                              */

     const opus_int16            *inVec2,            /*    I input vector 2                                              */

     const opus_int              len                 /*    I vector lengths                                              */

@@ -613,8 +613,8 @@

 #define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \

     ((void)(arch), silk_burg_modified_c(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch))

-#define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \

-    ((void)(arch),silk_inner_prod16_aligned_64_c(inVec1, inVec2, len))

+#define silk_inner_prod16(inVec1, inVec2, len, arch) \

+    ((void)(arch),silk_inner_prod16_c(inVec1, inVec2, len))

 #endif

 #include "Inlines.h"

--- a/silk/VQ_WMat_EC.c

+++ b/silk/VQ_WMat_EC.c

@@ -64,7 +64,7 @@

     *rate_dist_Q8 = silk_int32_MAX;

     *res_nrg_Q15 = silk_int32_MAX;

     cb_row_Q7 = cb_Q7;

-    /* In things go really bad, at least *ind is set to something safe. */

+    /* If things go really bad, at least *ind is set to something safe. */

     *ind = 0;

     for( k = 0; k < L; k++ ) {

         opus_int32 penalty;

@@ -115,7 +115,7 @@

         if( sum1_Q15 >= 0 ) {

             /* Translate residual energy to bits using high-rate assumption (6 dB ==> 1 bit/sample) */

             bits_res_Q8 = silk_SMULBB( subfr_len, silk_lin2log( sum1_Q15 + penalty) - (15 << 7) );

-            /* In the following line we reduce the codelength component by half ("-1"); seems to slghtly improve quality */

+            /* In the following line we reduce the codelength component by half ("-1"); seems to slightly improve quality */

             bits_tot_Q8 = silk_ADD_LSHIFT32( bits_res_Q8, cl_Q5[ k ], 3-1 );

             if( bits_tot_Q8 <= *rate_dist_Q8 ) {

                 *rate_dist_Q8 = bits_tot_Q8;

--- a/silk/fixed/burg_modified_FIX.c

+++ b/silk/fixed/burg_modified_FIX.c

@@ -68,7 +68,7 @@

     celt_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE );

     /* Compute autocorrelations, added over subframes */

-    C0_64 = silk_inner_prod16_aligned_64( x, x, subfr_length*nb_subfr, arch );

+    C0_64 = silk_inner_prod16( x, x, subfr_length*nb_subfr, arch );

     lz = silk_CLZ64(C0_64);

     rshifts = 32 + 1 + N_BITS_HEAD_ROOM - lz;

     if (rshifts > MAX_RSHIFTS) rshifts = MAX_RSHIFTS;

@@ -87,7 +87,7 @@

             x_ptr = x + s * subfr_length;

             for( n = 1; n < D + 1; n++ ) {

                 C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64(

-                    silk_inner_prod16_aligned_64( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );

+                    silk_inner_prod16( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );

     } else {

@@ -150,7 +150,7 @@

                     C_first_row[ k ] = silk_MLA( C_first_row[ k ], x1, x_ptr[ n - k - 1 ]            ); /* Q( -rshifts ) */

                     C_last_row[ k ]  = silk_MLA( C_last_row[ k ],  x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */

                     Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 17 );                                   /* Q17 */

-                    /* We sometimes have get overflows in the multiplications (even beyond +/- 2^32),

+                    /* We sometimes get overflows in the multiplications (even beyond +/- 2^32),

                        but they cancel each other and the real result seems to always fit in a 32-bit

                        signed integer. This was determined experimentally, not theoretically (unfortunately). */

                     tmp1 = silk_MLA_ovflw( tmp1, x_ptr[ n - k - 1 ],            Atmp1 );                      /* Q17 */

@@ -253,7 +253,7 @@

         if( rshifts > 0 ) {

             for( s = 0; s < nb_subfr; s++ ) {

                 x_ptr = x + s * subfr_length;

-                C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64( x_ptr, x_ptr, D, arch ), rshifts );

+                C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16( x_ptr, x_ptr, D, arch ), rshifts );

         } else {

             for( s = 0; s < nb_subfr; s++ ) {

--- a/silk/fixed/vector_ops_FIX.c

+++ b/silk/fixed/vector_ops_FIX.c

@@ -87,7 +87,7 @@

 #endif

-opus_int64 silk_inner_prod16_aligned_64_c(

+opus_int64 silk_inner_prod16_c(

     const opus_int16            *inVec1,            /*    I input vector 1                                              */

     const opus_int16            *inVec2,            /*    I input vector 2                                              */

     const opus_int              len                 /*    I vector lengths                                              */

--- a/silk/fixed/x86/burg_modified_FIX_sse4_1.c

+++ b/silk/fixed/x86/burg_modified_FIX_sse4_1.c

@@ -1,5 +1,5 @@

-/* Copyright (c) 2014, Cisco Systems, INC

-   Written by XiangMingZhu WeiZhou MinPeng YanWang

+/* Copyright (c) 2014-2020, Cisco Systems, INC

+   Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers

    Redistribution and use in source and binary forms, with or without

    modification, are permitted provided that the following conditions

@@ -42,7 +42,7 @@

 #define MAX_FRAME_SIZE              384             /* subfr_length * nb_subfr = ( 0.005 * 16000 + 16 ) * 4 = 384 */

 #define QA                          25

-#define N_BITS_HEAD_ROOM            2

+#define N_BITS_HEAD_ROOM            3

 #define MIN_RSHIFTS                 -16

 #define MAX_RSHIFTS                 (32 - QA)

@@ -59,7 +59,7 @@

     int                         arch                /* I    Run-time architecture                                       */

-    opus_int         k, n, s, lz, rshifts, rshifts_extra, reached_max_gain;

+    opus_int         k, n, s, lz, rshifts, reached_max_gain;

     opus_int32       C0, num, nrg, rc_Q31, invGain_Q30, Atmp_QA, Atmp1, tmp1, tmp2, x1, x2;

     const opus_int16 *x_ptr;

     opus_int32       C_first_row[ SILK_MAX_ORDER_LPC ];

@@ -68,6 +68,7 @@

     opus_int32       CAf[ SILK_MAX_ORDER_LPC + 1 ];

     opus_int32       CAb[ SILK_MAX_ORDER_LPC + 1 ];

     opus_int32       xcorr[ SILK_MAX_ORDER_LPC ];

+    opus_int64       C0_64;

     __m128i FIRST_3210, LAST_3210, ATMP_3210, TMP1_3210, TMP2_3210, T1_3210, T2_3210, PTR_3210, SUBFR_3210, X1_3210, X2_3210;

     __m128i CONST1 = _mm_set1_epi32(1);

@@ -75,23 +76,18 @@

     celt_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE );

     /* Compute autocorrelations, added over subframes */

-    silk_sum_sqr_shift( &C0, &rshifts, x, nb_subfr * subfr_length );

-    if( rshifts > MAX_RSHIFTS ) {

-        C0 = silk_LSHIFT32( C0, rshifts - MAX_RSHIFTS );

-        silk_assert( C0 > 0 );

-        rshifts = MAX_RSHIFTS;

+    C0_64 = silk_inner_prod16( x, x, subfr_length*nb_subfr, arch );

+    lz = silk_CLZ64(C0_64);

+    rshifts = 32 + 1 + N_BITS_HEAD_ROOM - lz;

+    if (rshifts > MAX_RSHIFTS) rshifts = MAX_RSHIFTS;

+    if (rshifts < MIN_RSHIFTS) rshifts = MIN_RSHIFTS;

+    if (rshifts > 0) {

+        C0 = (opus_int32)silk_RSHIFT64(C0_64, rshifts );

     } else {

-        lz = silk_CLZ32( C0 ) - 1;

-        rshifts_extra = N_BITS_HEAD_ROOM - lz;

-        if( rshifts_extra > 0 ) {

-            rshifts_extra = silk_min( rshifts_extra, MAX_RSHIFTS - rshifts );

-            C0 = silk_RSHIFT32( C0, rshifts_extra );

-        } else {

-            rshifts_extra = silk_max( rshifts_extra, MIN_RSHIFTS - rshifts );

-            C0 = silk_LSHIFT32( C0, -rshifts_extra );

-        }

-        rshifts += rshifts_extra;

+        C0 = silk_LSHIFT32((opus_int32)C0_64, -rshifts );

     CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ) + 1;                                /* Q(-rshifts) */

     silk_memset( C_first_row, 0, SILK_MAX_ORDER_LPC * sizeof( opus_int32 ) );

     if( rshifts > 0 ) {

@@ -99,7 +95,7 @@

             x_ptr = x + s * subfr_length;

             for( n = 1; n < D + 1; n++ ) {

                 C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64(

-                    silk_inner_prod16_aligned_64( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );

+                    silk_inner_prod16( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );

     } else {

@@ -203,8 +199,11 @@

                     C_first_row[ k ] = silk_MLA( C_first_row[ k ], x1, x_ptr[ n - k - 1 ]            ); /* Q( -rshifts ) */

                     C_last_row[ k ]  = silk_MLA( C_last_row[ k ],  x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */

                     Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 17 );                                   /* Q17 */

-                    tmp1 = silk_MLA( tmp1, x_ptr[ n - k - 1 ],            Atmp1 );                      /* Q17 */

-                    tmp2 = silk_MLA( tmp2, x_ptr[ subfr_length - n + k ], Atmp1 );                      /* Q17 */

+                    /* We sometimes get overflows in the multiplications (even beyond +/- 2^32),

+                       but they cancel each other and the real result seems to always fit in a 32-bit

+                       signed integer. This was determined experimentally, not theoretically (unfortunately). */

+                    tmp1 = silk_MLA_ovflw( tmp1, x_ptr[ n - k - 1 ],            Atmp1 );                      /* Q17 */

+                    tmp2 = silk_MLA_ovflw( tmp2, x_ptr[ subfr_length - n + k ], Atmp1 );                      /* Q17 */

                 tmp1 = -tmp1;                /* Q17 */

@@ -350,7 +349,7 @@

         if( rshifts > 0 ) {

             for( s = 0; s < nb_subfr; s++ ) {

                 x_ptr = x + s * subfr_length;

-                C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64( x_ptr, x_ptr, D, arch ), rshifts );

+                C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16( x_ptr, x_ptr, D, arch ), rshifts );

         } else {

             for( s = 0; s < nb_subfr; s++ ) {

@@ -374,4 +373,28 @@

         *res_nrg = silk_SMLAWW( nrg, silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ), -tmp1 );/* Q( -rshifts ) */

         *res_nrg_Q = -rshifts;

+#ifdef OPUS_CHECK_ASM

+    {

+        opus_int32 res_nrg_c = 0;

+        opus_int res_nrg_Q_c = 0;

+        opus_int32 A_Q16_c[ MAX_LPC_ORDER ] = {0};

+        silk_burg_modified_c(

+            &res_nrg_c,

+            &res_nrg_Q_c,

+            A_Q16_c,

+            x,

+            minInvGain_Q30,

+            subfr_length,

+            nb_subfr,

+            D,

+            0

+        );

+        silk_assert( *res_nrg == res_nrg_c );

+        silk_assert( *res_nrg_Q == res_nrg_Q_c );

+        silk_assert( !memcmp( A_Q16, A_Q16_c, D * sizeof( *A_Q16 ) ) );

+    }

+#endif

--- a/silk/fixed/x86/prefilter_FIX_sse.c

+++ /dev/null

@@ -1,160 +1,0 @@

-/* Copyright (c) 2014, Cisco Systems, INC

-   Written by XiangMingZhu WeiZhou MinPeng YanWang

-   Redistribution and use in source and binary forms, with or without

-   modification, are permitted provided that the following conditions

-   are met:

-   - Redistributions of source code must retain the above copyright

-   notice, this list of conditions and the following disclaimer.

-   - Redistributions in binary form must reproduce the above copyright

-   notice, this list of conditions and the following disclaimer in the

-   documentation and/or other materials provided with the distribution.

-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

-   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER

-   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,

-   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,

-   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR

-   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF

-   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING

-   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

-   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-*/

-#ifdef HAVE_CONFIG_H

-#include "config.h"

-#endif

-#include <xmmintrin.h>

-#include <emmintrin.h>

-#include <smmintrin.h>

-#include "main.h"

-#include "celt/x86/x86cpu.h"

-void silk_warped_LPC_analysis_filter_FIX_sse4_1(

-    opus_int32                  state[],                    /* I/O  State [order + 1]                   */

-    opus_int32                  res_Q2[],                   /* O    Residual signal [length]            */

-    const opus_int16            coef_Q13[],                 /* I    Coefficients [order]                */

-    const opus_int16            input[],                    /* I    Input signal [length]               */

-    const opus_int16            lambda_Q16,                 /* I    Warping factor                      */

-    const opus_int              length,                     /* I    Length of input signal              */

-    const opus_int              order                       /* I    Filter order (even)                 */

-)

-{

-    opus_int     n, i;

-    opus_int32   acc_Q11, tmp1, tmp2;

-    /* Order must be even */

-    celt_assert( ( order & 1 ) == 0 );

-    if (order == 10)

-    {

-        if (0 == lambda_Q16)

-        {

-            __m128i coef_Q13_3210, coef_Q13_7654;

-            __m128i coef_Q13_0123, coef_Q13_4567;

-            __m128i state_0123, state_4567;

-            __m128i xmm_product1, xmm_product2;

-            __m128i xmm_tempa, xmm_tempb;

-            register opus_int32 sum;

-            register opus_int32 state_8, state_9, state_a;

-            register opus_int64 coef_Q13_8, coef_Q13_9;

-            celt_assert( length > 0 );

-            coef_Q13_3210 = OP_CVTEPI16_EPI32_M64( &coef_Q13[ 0 ] );

-            coef_Q13_7654 = OP_CVTEPI16_EPI32_M64( &coef_Q13[ 4 ] );

-            coef_Q13_0123 = _mm_shuffle_epi32( coef_Q13_3210, _MM_SHUFFLE( 0, 1, 2, 3 ) );

-            coef_Q13_4567 = _mm_shuffle_epi32( coef_Q13_7654, _MM_SHUFFLE( 0, 1, 2, 3 ) );

-            coef_Q13_8 = (opus_int64) coef_Q13[ 8 ];

-            coef_Q13_9 = (opus_int64) coef_Q13[ 9 ];

-            state_0123 = _mm_loadu_si128( (__m128i *)(&state[ 0 ] ) );

-            state_4567 = _mm_loadu_si128( (__m128i *)(&state[ 4 ] ) );

-            state_0123 = _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) );

-            state_4567 = _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) );

-            state_8 = state[ 8 ];

-            state_9 = state[ 9 ];

-            state_a = 0;

-            for( n = 0; n < length; n++ )

-            {

-                xmm_product1 = _mm_mul_epi32( coef_Q13_0123, state_0123 ); /* 64-bit multiply, only 2 pairs */

-                xmm_product2 = _mm_mul_epi32( coef_Q13_4567, state_4567 );

-                xmm_tempa = _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) );

-                xmm_tempb = _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) );

-                xmm_product1 = _mm_srli_epi64( xmm_product1, 16 ); /* >> 16, zero extending works */

-                xmm_product2 = _mm_srli_epi64( xmm_product2, 16 );

-                xmm_tempa = _mm_mul_epi32( coef_Q13_3210, xmm_tempa );

-                xmm_tempb = _mm_mul_epi32( coef_Q13_7654, xmm_tempb );

-                xmm_tempa = _mm_srli_epi64( xmm_tempa, 16 );

-                xmm_tempb = _mm_srli_epi64( xmm_tempb, 16 );

-                xmm_tempa = _mm_add_epi32( xmm_tempa, xmm_product1 );

-                xmm_tempb = _mm_add_epi32( xmm_tempb, xmm_product2 );

-                xmm_tempa = _mm_add_epi32( xmm_tempa, xmm_tempb );

-                sum  = (opus_int32)((coef_Q13_8 * state_8) >> 16);

-                sum += (opus_int32)((coef_Q13_9 * state_9) >> 16);

-                xmm_tempa = _mm_add_epi32( xmm_tempa, _mm_shuffle_epi32( xmm_tempa, _MM_SHUFFLE( 0, 0, 0, 2 ) ) );

-                sum += _mm_cvtsi128_si32( xmm_tempa);

-                res_Q2[ n ] = silk_LSHIFT( (opus_int32)input[ n ], 2 ) - silk_RSHIFT_ROUND( ( 5 + sum ), 9);

-                /* move right */

-                state_a = state_9;

-                state_9 = state_8;

-                state_8 = _mm_cvtsi128_si32( state_4567 );

-                state_4567 = _mm_alignr_epi8( state_0123, state_4567, 4 );

-                state_0123 = _mm_alignr_epi8( _mm_cvtsi32_si128( silk_LSHIFT( input[ n ], 14 ) ), state_0123, 4 );

-            }

-            _mm_storeu_si128( (__m128i *)( &state[ 0 ] ), _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) ) );

-            _mm_storeu_si128( (__m128i *)( &state[ 4 ] ), _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) ) );

-            state[ 8 ] = state_8;

-            state[ 9 ] = state_9;

-            state[ 10 ] = state_a;

-            return;

-        }

-    }

-    for( n = 0; n < length; n++ ) {

-        /* Output of lowpass section */

-        tmp2 = silk_SMLAWB( state[ 0 ], state[ 1 ], lambda_Q16 );

-        state[ 0 ] = silk_LSHIFT( input[ n ], 14 );

-        /* Output of allpass section */

-        tmp1 = silk_SMLAWB( state[ 1 ], state[ 2 ] - tmp2, lambda_Q16 );

-        state[ 1 ] = tmp2;

-        acc_Q11 = silk_RSHIFT( order, 1 );

-        acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ 0 ] );

-        /* Loop over allpass sections */

-        for( i = 2; i < order; i += 2 ) {

-            /* Output of allpass section */

-            tmp2 = silk_SMLAWB( state[ i ], state[ i + 1 ] - tmp1, lambda_Q16 );

-            state[ i ] = tmp1;

-            acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ i - 1 ] );

-            /* Output of allpass section */

-            tmp1 = silk_SMLAWB( state[ i + 1 ], state[ i + 2 ] - tmp2, lambda_Q16 );

-            state[ i + 1 ] = tmp2;

-            acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ i ] );

-        }

-        state[ order ] = tmp1;

-        acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ order - 1 ] );

-        res_Q2[ n ] = silk_LSHIFT( (opus_int32)input[ n ], 2 ) - silk_RSHIFT_ROUND( acc_Q11, 9 );

-    }

-}

--- a/silk/fixed/x86/vector_ops_FIX_sse4_1.c

+++ b/silk/fixed/x86/vector_ops_FIX_sse4_1.c

@@ -37,39 +37,36 @@

 #include "SigProc_FIX.h"

 #include "pitch.h"

-opus_int64 silk_inner_prod16_aligned_64_sse4_1(

+opus_int64 silk_inner_prod16_sse4_1(

     const opus_int16            *inVec1,            /*    I input vector 1                                              */

     const opus_int16            *inVec2,            /*    I input vector 2                                              */

     const opus_int              len                 /*    I vector lengths                                              */

-    opus_int  i, dataSize8;

+    opus_int  i, dataSize4;

     opus_int64 sum;

-    __m128i xmm_tempa;

-    __m128i inVec1_76543210, acc1;

-    __m128i inVec2_76543210, acc2;

+    __m128i xmm_prod_20, xmm_prod_31;

+    __m128i inVec1_3210, acc1;

+    __m128i inVec2_3210, acc2;

     sum = 0;

-    dataSize8 = len & ~7;

+    dataSize4 = len & ~3;

     acc1 = _mm_setzero_si128();

     acc2 = _mm_setzero_si128();

-    for( i = 0; i < dataSize8; i += 8 ) {

-        inVec1_76543210 = _mm_loadu_si128( (__m128i *)(&inVec1[i + 0] ) );

-        inVec2_76543210 = _mm_loadu_si128( (__m128i *)(&inVec2[i + 0] ) );

+    for( i = 0; i < dataSize4; i += 4 ) {

+        inVec1_3210 = OP_CVTEPI16_EPI32_M64( &inVec1[i + 0] );

+        inVec2_3210 = OP_CVTEPI16_EPI32_M64( &inVec2[i + 0] );

+        xmm_prod_20 = _mm_mul_epi32( inVec1_3210, inVec2_3210 );

-        /* only when all 4 operands are -32768 (0x8000), this results in wrap around */

-        inVec1_76543210 = _mm_madd_epi16( inVec1_76543210, inVec2_76543210 );

+        inVec1_3210 = _mm_shuffle_epi32( inVec1_3210, _MM_SHUFFLE( 0, 3, 2, 1 ) );

+        inVec2_3210 = _mm_shuffle_epi32( inVec2_3210, _MM_SHUFFLE( 0, 3, 2, 1 ) );

+        xmm_prod_31 = _mm_mul_epi32( inVec1_3210, inVec2_3210 );

-        xmm_tempa       = _mm_cvtepi32_epi64( inVec1_76543210 );

-        /* equal shift right 8 bytes */

-        inVec1_76543210 = _mm_shuffle_epi32( inVec1_76543210, _MM_SHUFFLE( 0, 0, 3, 2 ) );

-        inVec1_76543210 = _mm_cvtepi32_epi64( inVec1_76543210 );

-        acc1 = _mm_add_epi64( acc1, xmm_tempa );

-        acc2 = _mm_add_epi64( acc2, inVec1_76543210 );

+        acc1 = _mm_add_epi64( acc1, xmm_prod_20 );

+        acc2 = _mm_add_epi64( acc2, xmm_prod_31 );

     acc1 = _mm_add_epi64( acc1, acc2 );

@@ -83,6 +80,13 @@

     for( ; i < len; i++ ) {

         sum = silk_SMLABB( sum, inVec1[ i ], inVec2[ i ] );

+#ifdef OPUS_CHECK_ASM

+    {

+        opus_int64 sum_c = silk_inner_prod16_c( inVec1, inVec2, len );

+        silk_assert( sum == sum_c );

+    }

+#endif

     return sum;

--- a/silk/main.h

+++ b/silk/main.h

@@ -247,21 +247,21 @@

 /************************************/

 void silk_NSQ_c(

-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */

-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */

-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */

-    const opus_int16            x16[],                                      /* I    Input                           */

-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */

-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */

-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */

-    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I  Noise shaping coefs             */

-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */

-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */

-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */

-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */

-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */

-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */

-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */

+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */

+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */

+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */

+    const opus_int16            x16[],                                        /* I    Input                           */

+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */

+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */

+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */

+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */

+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */

+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */

+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */

+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */

+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */

+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */

+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */

);

 #if !defined(OVERRIDE_silk_NSQ)

@@ -273,21 +273,21 @@

 /* Noise shaping using delayed decision */

 void silk_NSQ_del_dec_c(

-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */

-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */

-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */

-    const opus_int16            x16[],                                      /* I    Input                           */

-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */

-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */

-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */

-    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I  Noise shaping coefs             */

-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */

-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */

-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */

-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */

-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */

-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */

-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */

+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */

+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */

+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */

+    const opus_int16            x16[],                                        /* I    Input                           */

+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */

+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */

+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */

+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */

+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */

+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */

+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */

+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */

+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */

+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */

+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */

);

 #if !defined(OVERRIDE_silk_NSQ_del_dec)

--- a/silk/x86/NSQ_del_dec_sse4_1.c

+++ b/silk/x86/NSQ_del_dec_sse4_1.c

@@ -1,5 +1,5 @@

-/* Copyright (c) 2014, Cisco Systems, INC

-   Written by XiangMingZhu WeiZhou MinPeng YanWang

+/* Copyright (c) 2014-2020, Cisco Systems, INC

+   Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers

    Redistribution and use in source and binary forms, with or without

    modification, are permitted provided that the following conditions

@@ -46,6 +46,7 @@

     opus_int32 Shape_Q14[ DECISION_DELAY ];

     opus_int32 sAR2_Q14[ MAX_SHAPE_LPC_ORDER ];

     opus_int32 LF_AR_Q14;

+    opus_int32 Diff_Q14;

     opus_int32 Seed;

     opus_int32 SeedInit;

     opus_int32 RD_Q10;

@@ -56,6 +57,7 @@

     opus_int32 RD_Q10;

     opus_int32 xq_Q14;

     opus_int32 LF_AR_Q14;

+    opus_int32 Diff_Q14;

     opus_int32 sLTP_shp_Q14;

     opus_int32 LPC_exc_Q14;

 } NSQ_sample_struct;

@@ -66,7 +68,7 @@

     const silk_encoder_state *psEncC,               /* I    Encoder State                       */

     silk_nsq_state      *NSQ,                       /* I/O  NSQ state                           */

     NSQ_del_dec_struct  psDelDec[],                 /* I/O  Delayed decision states             */

-    const opus_int32    x_Q3[],                     /* I    Input in Q3                         */

+    const opus_int16    x16[],                      /* I    Input                               */

     opus_int32          x_sc_Q10[],                 /* O    Input scaled with 1/Gain in Q10     */

     const opus_int16    sLTP[],                     /* I    Re-whitened LTP state in Q0         */

     opus_int32          sLTP_Q15[],                 /* O    LTP state matching scaled input     */

@@ -112,21 +114,21 @@

);

 void silk_NSQ_del_dec_sse4_1(

-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */

-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */

-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */

-    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */

-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */

-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */

-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */

-    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */

-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */

-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */

-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */

-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */

-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */

-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */

-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */

+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */

+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */

+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */

+    const opus_int16            x16[],                                        /* I    Input                           */

+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */

+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */

+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */

+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */

+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */

+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */

+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */

+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */

+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */

+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */

+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */

     opus_int            i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr;

@@ -142,8 +144,39 @@

     VARDECL( opus_int32, delayedGain_Q10 );

     VARDECL( NSQ_del_dec_struct, psDelDec );

     NSQ_del_dec_struct  *psDD;

+#ifdef OPUS_CHECK_ASM

+    silk_nsq_state NSQ_c;

+    SideInfoIndices psIndices_c;

+    opus_int8 pulses_c[ MAX_FRAME_LENGTH ];

+    const opus_int8 *const pulses_a = pulses;

+#endif

     SAVE_STACK;

+#ifdef OPUS_CHECK_ASM

+    ( void )pulses_a;

+    silk_memcpy( &NSQ_c, NSQ, sizeof( NSQ_c ) );

+    silk_memcpy( &psIndices_c, psIndices, sizeof( psIndices_c ) );

+    silk_assert( psEncC->nb_subfr * psEncC->subfr_length <= MAX_FRAME_LENGTH );

+    silk_memcpy( pulses_c, pulses, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) );

+    silk_NSQ_del_dec_c(

+        psEncC,

+        &NSQ_c,

+        &psIndices_c,

+        x16,

+        pulses_c,

+        PredCoef_Q12,

+        LTPCoef_Q14,

+        AR_Q13,

+        HarmShapeGain_Q14,

+        Tilt_Q14,

+        LF_shp_Q14,

+        Gains_Q16,

+        pitchL,

+        Lambda_Q10,

+        LTP_scale_Q14

+    );

+#endif

     /* Set unvoiced lag to the previous one, overwrite later for voiced */

     lag = NSQ->lagPrev;

@@ -158,6 +191,7 @@

         psDD->SeedInit       = psDD->Seed;

         psDD->RD_Q10         = 0;

         psDD->LF_AR_Q14      = NSQ->sLF_AR_shp_Q14;

+        psDD->Diff_Q14       = NSQ->sDiff_shp_Q14;

         psDD->Shape_Q14[ 0 ] = NSQ->sLTP_shp_Q14[ psEncC->ltp_mem_length - 1 ];

         silk_memcpy( psDD->sLPC_Q14, NSQ->sLPC_Q14, NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );

         silk_memcpy( psDD->sAR2_Q14, NSQ->sAR2_Q14, sizeof( NSQ->sAR2_Q14 ) );

@@ -185,8 +219,7 @@

         LSF_interpolation_flag = 1;

-    ALLOC( sLTP_Q15,

-           psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );

+    ALLOC( sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );

     ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );

     ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );

     ALLOC( delayedGain_Q10, DECISION_DELAY, opus_int32 );

@@ -198,7 +231,7 @@

     for( k = 0; k < psEncC->nb_subfr; k++ ) {

         A_Q12      = &PredCoef_Q12[ ( ( k >> 1 ) | ( 1 - LSF_interpolation_flag ) ) * MAX_LPC_ORDER ];

         B_Q14      = &LTPCoef_Q14[ k * LTP_ORDER           ];

-        AR_shp_Q13 = &AR2_Q13[     k * MAX_SHAPE_LPC_ORDER ];

+        AR_shp_Q13 = &AR_Q13[     k * MAX_SHAPE_LPC_ORDER ];

         /* Noise shape parameters */

         silk_assert( HarmShapeGain_Q14[ k ] >= 0 );

@@ -257,7 +290,7 @@

-        silk_nsq_del_dec_scale_states_sse4_1( psEncC, NSQ, psDelDec, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, k,

+        silk_nsq_del_dec_scale_states_sse4_1( psEncC, NSQ, psDelDec, x16, x_sc_Q10, sLTP, sLTP_Q15, k,

             psEncC->nStatesDelayedDecision, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay );

         silk_noise_shape_quantizer_del_dec_sse4_1( NSQ, psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15,

@@ -265,7 +298,7 @@

             Gains_Q16[ k ], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder,

             psEncC->predictLPCOrder, psEncC->warping_Q16, psEncC->nStatesDelayedDecision, &smpl_buf_idx, decisionDelay );

-        x_Q3   += psEncC->subfr_length;

+        x16    += psEncC->subfr_length;

         pulses += psEncC->subfr_length;

         pxq    += psEncC->subfr_length;

@@ -288,6 +321,7 @@

     for( i = 0; i < decisionDelay; i++ ) {

         last_smple_idx = ( last_smple_idx - 1 ) % DECISION_DELAY;

         if( last_smple_idx < 0 ) last_smple_idx += DECISION_DELAY;

         pulses[   i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 );

         pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(

             silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], Gain_Q10 ), 8 ) );

@@ -298,11 +332,19 @@

     /* Update states */

     NSQ->sLF_AR_shp_Q14 = psDD->LF_AR_Q14;

+    NSQ->sDiff_shp_Q14  = psDD->Diff_Q14;

     NSQ->lagPrev        = pitchL[ psEncC->nb_subfr - 1 ];

     /* Save quantized speech signal */

     silk_memmove( NSQ->xq,           &NSQ->xq[           psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );

     silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );

+#ifdef OPUS_CHECK_ASM

+    silk_assert( !memcmp( &NSQ_c, NSQ, sizeof( NSQ_c ) ) );

+    silk_assert( !memcmp( &psIndices_c, psIndices, sizeof( psIndices_c ) ) );

+    silk_assert( !memcmp( pulses_c, pulses_a, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) ) );

+#endif

     RESTORE_STACK;

@@ -345,6 +387,7 @@

     opus_int32   q1_Q0, q1_Q10, q2_Q10, exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;

     opus_int32   tmp1, tmp2, sLF_AR_shp_Q14;

     opus_int32   *pred_lag_ptr, *shp_lag_ptr, *psLPC_Q14;

+    int rdo_offset;

     VARDECL( NSQ_sample_pair, psSampleState );

     NSQ_del_dec_struct *psDD;

     NSQ_sample_struct  *psSS;

@@ -356,6 +399,8 @@

     celt_assert( nStatesDelayedDecision > 0 );

     ALLOC( psSampleState, nStatesDelayedDecision, NSQ_sample_pair );

+    rdo_offset = (Lambda_Q10 >> 1) - 512;

     shp_lag_ptr  = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];

     pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];

     Gain_Q10     = silk_RSHIFT( Gain_Q16, 6 );

@@ -407,8 +452,8 @@

         /* Long-term shaping */

         if( lag > 0 ) {

             /* Symmetric, packed FIR coefficients */

-            n_LTP_Q14 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );

-            n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ],                      HarmShapeFIRPacked_Q14 );

+            n_LTP_Q14 = silk_SMULWB( silk_ADD_SAT32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );

+            n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 );

             n_LTP_Q14 = silk_SUB_LSHIFT32( LTP_pred_Q14, n_LTP_Q14, 2 );            /* Q12 -> Q14 */

             shp_lag_ptr++;

         } else {

@@ -478,7 +523,7 @@

                     psLPC_Q14_tmp   = _mm_srli_epi64( psLPC_Q14_tmp, 16 );

                     tmpb            = _mm_add_epi32( tmpb, psLPC_Q14_tmp );

-                    /* setp 4 */

+                    /* step 4 */

                     psLPC_Q14_tmp   = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -15 ] ) );

                     psLPC_Q14_tmp   = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );

                     tmpa            = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_CDEF );

@@ -511,9 +556,9 @@

                 LPC_pred_Q14 = silk_LSHIFT( LPC_pred_Q14, 4 ); /* Q10 -> Q14 */

                 /* Noise shape feedback */

-                silk_assert( ( shapingLPCOrder & 1 ) == 0 );   /* check that order is even */

+                celt_assert( ( shapingLPCOrder & 1 ) == 0 );   /* check that order is even */

                 /* Output of lowpass section */

-                tmp2 = silk_SMLAWB( psLPC_Q14[ 0 ], psDD->sAR2_Q14[ 0 ], warping_Q16 );

+                tmp2 = silk_SMLAWB( psDD->Diff_Q14, psDD->sAR2_Q14[ 0 ], warping_Q16 );

                 /* Output of allpass section */

                 tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ 0 ], psDD->sAR2_Q14[ 1 ] - tmp2, warping_Q16 );

                 psDD->sAR2_Q14[ 0 ] = tmp2;

@@ -543,9 +588,9 @@

                 /* Input minus prediction plus noise feedback                       */

                 /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP  */

-                tmp1 = silk_ADD32( n_AR_Q14, n_LF_Q14 );                                    /* Q14 */

+                tmp1 = silk_ADD_SAT32( n_AR_Q14, n_LF_Q14 );                                /* Q14 */

                 tmp2 = silk_ADD32( n_LTP_Q14, LPC_pred_Q14 );                               /* Q13 */

-                tmp1 = silk_SUB32( tmp2, tmp1 );                                            /* Q13 */

+                tmp1 = silk_SUB_SAT32( tmp2, tmp1 );                                        /* Q13 */

                 tmp1 = silk_RSHIFT_ROUND( tmp1, 4 );                                        /* Q10 */

                 r_Q10 = silk_SUB32( x_Q10[ i ], tmp1 );                                     /* residual error Q10 */

@@ -559,6 +604,18 @@

                 /* Find two quantization level candidates and measure their rate-distortion */

                 q1_Q10 = silk_SUB32( r_Q10, offset_Q10 );

                 q1_Q0 = silk_RSHIFT( q1_Q10, 10 );

+                if (Lambda_Q10 > 2048) {

+                    /* For aggressive RDO, the bias becomes more than one pulse. */

+                    if (q1_Q10 > rdo_offset) {

+                        q1_Q0 = silk_RSHIFT( q1_Q10 - rdo_offset, 10 );

+                    } else if (q1_Q10 < -rdo_offset) {

+                        q1_Q0 = silk_RSHIFT( q1_Q10 + rdo_offset, 10 );

+                    } else if (q1_Q10 < 0) {

+                        q1_Q0 = -1;

+                    } else {

+                        q1_Q0 = 0;

+                    }

+                }

                 if( q1_Q0 > 0 ) {

                     q1_Q10  = silk_SUB32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 );

                     q1_Q10  = silk_ADD32( q1_Q10, offset_Q10 );

@@ -612,8 +669,9 @@

                 xq_Q14      = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 );

                 /* Update states */

-                sLF_AR_shp_Q14         = silk_SUB32( xq_Q14, n_AR_Q14 );

-                psSS[ 0 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 );

+                psSS[ 0 ].Diff_Q14     = silk_SUB_LSHIFT32( xq_Q14, x_Q10[ i ], 4 );

+                sLF_AR_shp_Q14         = silk_SUB32( psSS[ 0 ].Diff_Q14, n_AR_Q14 );

+                psSS[ 0 ].sLTP_shp_Q14 = silk_SUB_SAT32( sLF_AR_shp_Q14, n_LF_Q14 );

                 psSS[ 0 ].LF_AR_Q14    = sLF_AR_shp_Q14;

                 psSS[ 0 ].LPC_exc_Q14  = LPC_exc_Q14;

                 psSS[ 0 ].xq_Q14       = xq_Q14;

@@ -626,14 +684,14 @@

                     exc_Q14 = -exc_Q14;

                 /* Add predictions */

                 LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 );

                 xq_Q14      = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 );

                 /* Update states */

-                sLF_AR_shp_Q14         = silk_SUB32( xq_Q14, n_AR_Q14 );

-                psSS[ 1 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 );

+                psSS[ 1 ].Diff_Q14     = silk_SUB_LSHIFT32( xq_Q14, x_Q10[ i ], 4 );

+                sLF_AR_shp_Q14         = silk_SUB32( psSS[ 1 ].Diff_Q14, n_AR_Q14 );

+                psSS[ 1 ].sLTP_shp_Q14 = silk_SUB_SAT32( sLF_AR_shp_Q14, n_LF_Q14 );

                 psSS[ 1 ].LF_AR_Q14    = sLF_AR_shp_Q14;

                 psSS[ 1 ].LPC_exc_Q14  = LPC_exc_Q14;

                 psSS[ 1 ].xq_Q14       = xq_Q14;

@@ -705,6 +763,7 @@

             psDD                                     = &psDelDec[ k ];

             psSS                                     = &psSampleState[ k ][ 0 ];

             psDD->LF_AR_Q14                          = psSS->LF_AR_Q14;

+            psDD->Diff_Q14                           = psSS->Diff_Q14;

             psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH + i ] = psSS->xq_Q14;

             psDD->Xq_Q14[    *smpl_buf_idx ]         = psSS->xq_Q14;

             psDD->Q_Q10[     *smpl_buf_idx ]         = psSS->Q_Q10;

@@ -728,7 +787,7 @@

     const silk_encoder_state *psEncC,               /* I    Encoder State                       */

     silk_nsq_state      *NSQ,                       /* I/O  NSQ state                           */

     NSQ_del_dec_struct  psDelDec[],                 /* I/O  Delayed decision states             */

-    const opus_int32    x_Q3[],                     /* I    Input in Q3                         */

+    const opus_int16    x16[],                      /* I    Input                               */

     opus_int32          x_sc_Q10[],                 /* O    Input scaled with 1/Gain in Q10     */

     const opus_int16    sLTP[],                     /* I    Re-whitened LTP state in Q0         */

     opus_int32          sLTP_Q15[],                 /* O    LTP state matching scaled input     */

@@ -742,51 +801,41 @@

     opus_int            i, k, lag;

-    opus_int32          gain_adj_Q16, inv_gain_Q31, inv_gain_Q23;

+    opus_int32          gain_adj_Q16, inv_gain_Q31, inv_gain_Q26;

     NSQ_del_dec_struct  *psDD;

-    __m128i xmm_inv_gain_Q23, xmm_x_Q3_x2x0, xmm_x_Q3_x3x1;

+    __m128i xmm_inv_gain_Q26, xmm_x16_x2x0, xmm_x16_x3x1;

     lag          = pitchL[ subfr ];

     inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );

     silk_assert( inv_gain_Q31 != 0 );

-    /* Calculate gain adjustment factor */

-    if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {

-        gain_adj_Q16 =  silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );

-    } else {

-        gain_adj_Q16 = (opus_int32)1 << 16;

-    }

     /* Scale input */

-    inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 );

+    inv_gain_Q26 = silk_RSHIFT_ROUND( inv_gain_Q31, 5 );

-    /* prepare inv_gain_Q23 in packed 4 32-bits */

-    xmm_inv_gain_Q23 = _mm_set1_epi32(inv_gain_Q23);

+    /* prepare inv_gain_Q26 in packed 4 32-bits */

+    xmm_inv_gain_Q26 = _mm_set1_epi32(inv_gain_Q26);

     for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) {

-        xmm_x_Q3_x2x0 = _mm_loadu_si128( (__m128i *)(&(x_Q3[ i ] ) ) );

+        xmm_x16_x2x0 = OP_CVTEPI16_EPI32_M64( &(x16[ i ] ) );

         /* equal shift right 4 bytes*/

-        xmm_x_Q3_x3x1 = _mm_shuffle_epi32( xmm_x_Q3_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );

+        xmm_x16_x3x1 = _mm_shuffle_epi32( xmm_x16_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );

-        xmm_x_Q3_x2x0 = _mm_mul_epi32( xmm_x_Q3_x2x0, xmm_inv_gain_Q23 );

-        xmm_x_Q3_x3x1 = _mm_mul_epi32( xmm_x_Q3_x3x1, xmm_inv_gain_Q23 );

+        xmm_x16_x2x0 = _mm_mul_epi32( xmm_x16_x2x0, xmm_inv_gain_Q26 );

+        xmm_x16_x3x1 = _mm_mul_epi32( xmm_x16_x3x1, xmm_inv_gain_Q26 );

-        xmm_x_Q3_x2x0 = _mm_srli_epi64( xmm_x_Q3_x2x0, 16 );

-        xmm_x_Q3_x3x1 = _mm_slli_epi64( xmm_x_Q3_x3x1, 16 );

+        xmm_x16_x2x0 = _mm_srli_epi64( xmm_x16_x2x0, 16 );

+        xmm_x16_x3x1 = _mm_slli_epi64( xmm_x16_x3x1, 16 );

-        xmm_x_Q3_x2x0 = _mm_blend_epi16( xmm_x_Q3_x2x0, xmm_x_Q3_x3x1, 0xCC );

+        xmm_x16_x2x0 = _mm_blend_epi16( xmm_x16_x2x0, xmm_x16_x3x1, 0xCC );

-        _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ])), xmm_x_Q3_x2x0 );

+        _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ] ) ), xmm_x16_x2x0 );

     for( ; i < psEncC->subfr_length; i++ ) {

-        x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 );

+        x_sc_Q10[ i ] = silk_SMULWW( x16[ i ], inv_gain_Q26 );

-    /* Save inverse gain */

-    NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];

     /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */

     if( NSQ->rewhite_flag ) {

         if( subfr == 0 ) {

@@ -800,7 +849,9 @@

     /* Adjust for changing gain */

-    if( gain_adj_Q16 != (opus_int32)1 << 16 ) {

+    if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {

+        gain_adj_Q16 =  silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );

         /* Scale long-term shaping state */

             __m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1;

@@ -841,6 +892,7 @@

                 /* Scale scalar states */

                 psDD->LF_AR_Q14 = silk_SMULWW( gain_adj_Q16, psDD->LF_AR_Q14 );

+                psDD->Diff_Q14 = silk_SMULWW( gain_adj_Q16, psDD->Diff_Q14 );

                 /* Scale short-term prediction and shaping states */

                 for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {

@@ -855,5 +907,8 @@

+        /* Save the current gain; the next call compares against it to derive gain_adj_Q16 */

+        NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];

--- a/silk/x86/NSQ_sse4_1.c

+++ b/silk/x86/NSQ_sse4_1.c

@@ -1,5 +1,5 @@

-/* Copyright (c) 2014, Cisco Systems, INC

-   Written by XiangMingZhu WeiZhou MinPeng YanWang

+/* Copyright (c) 2014-2020, Cisco Systems, INC

+   Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers

    Redistribution and use in source and binary forms, with or without

    modification, are permitted provided that the following conditions

@@ -37,17 +37,17 @@

 #include "stack_alloc.h"

 static OPUS_INLINE void silk_nsq_scale_states_sse4_1(

-    const silk_encoder_state *psEncC,           /* I    Encoder State                   */

-    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */

-    const opus_int32    x_Q3[],                 /* I    input in Q3                     */

-    opus_int32          x_sc_Q10[],             /* O    input scaled with 1/Gain        */

-    const opus_int16    sLTP[],                 /* I    re-whitened LTP state in Q0     */

-    opus_int32          sLTP_Q15[],             /* O    LTP state matching scaled input */

-    opus_int            subfr,                  /* I    subframe number                 */

-    const opus_int      LTP_scale_Q14,          /* I                                    */

-    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ], /* I                                 */

-    const opus_int      pitchL[ MAX_NB_SUBFR ], /* I    Pitch lag                       */

-    const opus_int      signal_type             /* I    Signal type                     */

+    const silk_encoder_state *psEncC,              /* I    Encoder State                   */

+    silk_nsq_state      *NSQ,                      /* I/O  NSQ state                       */

+    const opus_int16    x16[],                     /* I    input                           */

+    opus_int32          x_sc_Q10[],                /* O    input scaled with 1/Gain        */

+    const opus_int16    sLTP[],                    /* I    re-whitened LTP state in Q0     */

+    opus_int32          sLTP_Q15[],                /* O    LTP state matching scaled input */

+    opus_int            subfr,                     /* I    subframe number                 */

+    const opus_int      LTP_scale_Q14,             /* I                                    */

+    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ], /* I                                    */

+    const opus_int      pitchL[ MAX_NB_SUBFR ],    /* I    Pitch lag                       */

+    const opus_int      signal_type                /* I    Signal type                     */

);

 static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(

@@ -65,6 +65,7 @@

     opus_int            Tilt_Q14,               /* I    Spectral tilt                   */

     opus_int32          LF_shp_Q14,             /* I                                    */

     opus_int32          Gain_Q16,               /* I                                    */

+    opus_int            Lambda_Q10,             /* I                                    */

     opus_int            offset_Q10,             /* I                                    */

     opus_int            length,                 /* I    Input length                    */

     opus_int32          table[][4]              /* I                                    */

@@ -71,21 +72,21 @@

);

 void silk_NSQ_sse4_1(

-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */

-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */

-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */

-    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */

-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */

-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */

-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */

-    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */

-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */

-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */

-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */

-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */

-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */

-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */

-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */

+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */

+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */

+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */

+    const opus_int16            x16[],                                        /* I    Input                           */

+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */

+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */

+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */

+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */

+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */

+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */

+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */

+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */

+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */

+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */

+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */

     opus_int            k, lag, start_idx, LSF_interpolation_flag;

@@ -101,8 +102,41 @@

     opus_int32   tmp1;

     opus_int32   q1_Q10, q2_Q10, rd1_Q20, rd2_Q20;

+#ifdef OPUS_CHECK_ASM

+    silk_nsq_state NSQ_c;

+    SideInfoIndices psIndices_c;

+    opus_int8 pulses_c[ MAX_FRAME_LENGTH ];

+    const opus_int8 *const pulses_a = pulses;

+#endif

     SAVE_STACK;

+#ifdef OPUS_CHECK_ASM

+    ( void )pulses_a;

+    silk_memcpy( &NSQ_c, NSQ, sizeof( NSQ_c ) );

+    silk_memcpy( &psIndices_c, psIndices, sizeof( psIndices_c ) );

+    silk_assert( psEncC->nb_subfr * psEncC->subfr_length <= MAX_FRAME_LENGTH );

+    silk_memcpy( pulses_c, pulses, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) );

+    silk_NSQ_c(

+        psEncC,

+        &NSQ_c,

+        &psIndices_c,

+        x16,

+        pulses_c,

+        PredCoef_Q12,

+        LTPCoef_Q14,

+        AR_Q13,

+        HarmShapeGain_Q14,

+        Tilt_Q14,

+        LF_shp_Q14,

+        Gains_Q16,

+        pitchL,

+        Lambda_Q10,

+        LTP_scale_Q14

+    );

+#endif

     NSQ->rand_seed = psIndices->Seed;

     /* Set unvoiced lag to the previous one, overwrite later for voiced */

@@ -172,8 +206,7 @@

         LSF_interpolation_flag = 1;

-    ALLOC( sLTP_Q15,

-           psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );

+    ALLOC( sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );

     ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );

     ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );

     /* Set up pointers to start of sub frame */

@@ -183,7 +216,7 @@

     for( k = 0; k < psEncC->nb_subfr; k++ ) {

         A_Q12      = &PredCoef_Q12[ (( k >> 1 ) | ( 1 - LSF_interpolation_flag )) * MAX_LPC_ORDER ];

         B_Q14      = &LTPCoef_Q14[ k * LTP_ORDER ];

-        AR_shp_Q13 = &AR2_Q13[     k * MAX_SHAPE_LPC_ORDER ];

+        AR_shp_Q13 = &AR_Q13[ k * MAX_SHAPE_LPC_ORDER ];

         /* Noise shape parameters */

         silk_assert( HarmShapeGain_Q14[ k ] >= 0 );

@@ -209,12 +242,12 @@

-        silk_nsq_scale_states_sse4_1( psEncC, NSQ, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, k, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType );

+        silk_nsq_scale_states_sse4_1( psEncC, NSQ, x16, x_sc_Q10, sLTP, sLTP_Q15, k, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType );

         if ( opus_likely( ( 10 == psEncC->shapingLPCOrder ) && ( 16 == psEncC->predictLPCOrder) ) )

             silk_noise_shape_quantizer_10_16_sse4_1( NSQ, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, A_Q12, B_Q14,

-                AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ],

+                AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ], Lambda_Q10,

                 offset_Q10, psEncC->subfr_length, &(table[32]) );

         else

@@ -224,7 +257,7 @@

                 offset_Q10, psEncC->subfr_length, psEncC->shapingLPCOrder, psEncC->predictLPCOrder, psEncC->arch );

-        x_Q3   += psEncC->subfr_length;

+        x16    += psEncC->subfr_length;

         pulses += psEncC->subfr_length;

         pxq    += psEncC->subfr_length;

@@ -235,12 +268,19 @@

     /* Save quantized speech and noise shaping signals */

     silk_memmove( NSQ->xq,           &NSQ->xq[           psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );

     silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );

+#ifdef OPUS_CHECK_ASM

+    silk_assert( !memcmp( &NSQ_c, NSQ, sizeof( NSQ_c ) ) );

+    silk_assert( !memcmp( &psIndices_c, psIndices, sizeof( psIndices_c ) ) );

+    silk_assert( !memcmp( pulses_c, pulses_a, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) ) );

+#endif

     RESTORE_STACK;

-/***********************************/

-/* silk_noise_shape_quantizer_10_16  */

-/***********************************/

+/************************************/

+/* silk_noise_shape_quantizer_10_16 */

+/************************************/

 static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(

     silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */

     opus_int            signalType,             /* I    Signal type                     */

@@ -256,6 +296,7 @@

     opus_int            Tilt_Q14,               /* I    Spectral tilt                   */

     opus_int32          LF_shp_Q14,             /* I                                    */

     opus_int32          Gain_Q16,               /* I                                    */

+    opus_int            Lambda_Q10,             /* I                                    */

     opus_int            offset_Q10,             /* I                                    */

     opus_int            length,                 /* I    Input length                    */

     opus_int32          table[][4]              /* I                                    */

@@ -264,7 +305,7 @@

     opus_int     i;

     opus_int32   LTP_pred_Q13, LPC_pred_Q10, n_AR_Q12, n_LTP_Q13;

     opus_int32   n_LF_Q12, r_Q10, q1_Q0, q1_Q10, q2_Q10;

-    opus_int32   exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;

+    opus_int32   exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10, sDiff_shp_Q14;

     opus_int32   tmp1, tmp2, sLF_AR_shp_Q14;

     opus_int32   *psLPC_Q14, *shp_lag_ptr, *pred_lag_ptr;

@@ -279,6 +320,8 @@

     __m128i sAR2_Q14_hi_76543210, sAR2_Q14_lo_76543210;

     __m128i AR_shp_Q13_76543210;

+    int rdo_offset = (Lambda_Q10 >> 1) - 512;

     shp_lag_ptr  = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];

     pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];

     Gain_Q10     = silk_RSHIFT( Gain_Q16, 6 );

@@ -288,6 +331,7 @@

     sLF_AR_shp_Q14 = NSQ->sLF_AR_shp_Q14;

     xq_Q14         = psLPC_Q14[ 0 ];

+    sDiff_shp_Q14  = NSQ->sDiff_shp_Q14;

     LTP_pred_Q13   = 0;

     /* load a_Q12 */

@@ -430,8 +474,8 @@

         sAR2_Q14_hi_76543210 = _mm_slli_si128( sAR2_Q14_hi_76543210, 2 );

         sAR2_Q14_lo_76543210 = _mm_slli_si128( sAR2_Q14_lo_76543210, 2 );

-        sAR2_Q14_hi_76543210 = _mm_insert_epi16( sAR2_Q14_hi_76543210, (xq_Q14 >> 16), 0 );

-        sAR2_Q14_lo_76543210 = _mm_insert_epi16( sAR2_Q14_lo_76543210, (xq_Q14),       0 );

+        sAR2_Q14_hi_76543210 = _mm_insert_epi16( sAR2_Q14_hi_76543210, (sDiff_shp_Q14 >> 16), 0 );

+        sAR2_Q14_lo_76543210 = _mm_insert_epi16( sAR2_Q14_lo_76543210, (sDiff_shp_Q14),       0 );

         /* high part, use pmaddwd, results in 4 32-bit */

         xmm_hi_07 = _mm_madd_epi16( sAR2_Q14_hi_76543210, AR_shp_Q13_76543210 );

@@ -462,7 +506,7 @@

         n_LF_Q12 = silk_SMULWB( NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - 1 ], LF_shp_Q14 );

         n_LF_Q12 = silk_SMLAWT( n_LF_Q12, sLF_AR_shp_Q14, LF_shp_Q14 );

-        silk_assert( lag > 0 || signalType != TYPE_VOICED );

+        celt_assert( lag > 0 || signalType != TYPE_VOICED );

         /* Combine prediction and noise shaping signals */

         tmp1 = silk_SUB32( silk_LSHIFT32( LPC_pred_Q10, 2 ), n_AR_Q12 );        /* Q12 */

@@ -469,7 +513,7 @@

         tmp1 = silk_SUB32( tmp1, n_LF_Q12 );                                    /* Q12 */

         if( lag > 0 ) {

             /* Symmetric, packed FIR coefficients */

-            n_LTP_Q13 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );

+            n_LTP_Q13 = silk_SMULWB( silk_ADD_SAT32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );

             n_LTP_Q13 = silk_SMLAWT( n_LTP_Q13, shp_lag_ptr[ -1 ],                      HarmShapeFIRPacked_Q14 );

             n_LTP_Q13 = silk_LSHIFT( n_LTP_Q13, 1 );

             shp_lag_ptr++;

@@ -495,6 +539,18 @@

         /* Find two quantization level candidates and measure their rate-distortion */

         q1_Q10 = silk_SUB32( r_Q10, offset_Q10 );

         q1_Q0 = silk_RSHIFT( q1_Q10, 10 );

+        if (Lambda_Q10 > 2048) {

+            /* For aggressive RDO, the bias becomes more than one pulse. */

+            if (q1_Q10 > rdo_offset) {

+                q1_Q0 = silk_RSHIFT( q1_Q10 - rdo_offset, 10 );

+            } else if (q1_Q10 < -rdo_offset) {

+                q1_Q0 = silk_RSHIFT( q1_Q10 + rdo_offset, 10 );

+            } else if (q1_Q10 < 0) {

+                q1_Q0 = -1;

+            } else {

+                q1_Q0 = 0;

+            }

+        }

         q1_Q10 = table[q1_Q0][0];

         q2_Q10 = table[q1_Q0][1];

@@ -519,7 +575,8 @@

         /* Update states */

         psLPC_Q14++;

         *psLPC_Q14 = xq_Q14;

-        sLF_AR_shp_Q14 = silk_SUB_LSHIFT32( xq_Q14, n_AR_Q12, 2 );

+        NSQ->sDiff_shp_Q14 = silk_SUB_LSHIFT32( xq_Q14, x_sc_Q10[ i ], 4 );

+        sLF_AR_shp_Q14 = silk_SUB_LSHIFT32( NSQ->sDiff_shp_Q14, n_AR_Q12, 2 );

         NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx ] = silk_SUB_LSHIFT32( sLF_AR_shp_Q14, n_LF_Q12, 2 );

         sLTP_Q15[ NSQ->sLTP_buf_idx ] = silk_LSHIFT( LPC_exc_Q14, 1 );

@@ -600,64 +657,54 @@

 static OPUS_INLINE void silk_nsq_scale_states_sse4_1(

-    const silk_encoder_state *psEncC,           /* I    Encoder State                   */

-    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */

-    const opus_int32    x_Q3[],                 /* I    input in Q3                     */

-    opus_int32          x_sc_Q10[],             /* O    input scaled with 1/Gain        */

-    const opus_int16    sLTP[],                 /* I    re-whitened LTP state in Q0     */

-    opus_int32          sLTP_Q15[],             /* O    LTP state matching scaled input */

-    opus_int            subfr,                  /* I    subframe number                 */

-    const opus_int      LTP_scale_Q14,          /* I                                    */

-    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ], /* I                                 */

-    const opus_int      pitchL[ MAX_NB_SUBFR ], /* I    Pitch lag                       */

-    const opus_int      signal_type             /* I    Signal type                     */

+    const silk_encoder_state *psEncC,              /* I    Encoder State                   */

+    silk_nsq_state      *NSQ,                      /* I/O  NSQ state                       */

+    const opus_int16    x16[],                     /* I    input                           */

+    opus_int32          x_sc_Q10[],                /* O    input scaled with 1/Gain        */

+    const opus_int16    sLTP[],                    /* I    re-whitened LTP state in Q0     */

+    opus_int32          sLTP_Q15[],                /* O    LTP state matching scaled input */

+    opus_int            subfr,                     /* I    subframe number                 */

+    const opus_int      LTP_scale_Q14,             /* I                                    */

+    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ], /* I                                    */

+    const opus_int      pitchL[ MAX_NB_SUBFR ],    /* I    Pitch lag                       */

+    const opus_int      signal_type                /* I    Signal type                     */

     opus_int   i, lag;

-    opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q23;

-    __m128i xmm_inv_gain_Q23, xmm_x_Q3_x2x0, xmm_x_Q3_x3x1;

+    opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26;

+    __m128i xmm_inv_gain_Q26, xmm_x16_x2x0, xmm_x16_x3x1;

     lag          = pitchL[ subfr ];

     inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );

     silk_assert( inv_gain_Q31 != 0 );

-    /* Calculate gain adjustment factor */

-    if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {

-        gain_adj_Q16 =  silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );

-    } else {

-        gain_adj_Q16 = (opus_int32)1 << 16;

-    }

     /* Scale input */

-    inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 );

+    inv_gain_Q26 = silk_RSHIFT_ROUND( inv_gain_Q31, 5 );

-    /* prepare inv_gain_Q23 in packed 4 32-bits */

-    xmm_inv_gain_Q23 = _mm_set1_epi32(inv_gain_Q23);

+    /* prepare inv_gain_Q26 in packed 4 32-bits */

+    xmm_inv_gain_Q26 = _mm_set1_epi32(inv_gain_Q26);

     for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) {

-        xmm_x_Q3_x2x0 = _mm_loadu_si128( (__m128i *)(&(x_Q3[ i ] ) ) );

+        xmm_x16_x2x0 = OP_CVTEPI16_EPI32_M64( &(x16[ i ] ) );

         /* equal shift right 4 bytes*/

-        xmm_x_Q3_x3x1 = _mm_shuffle_epi32( xmm_x_Q3_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );

+        xmm_x16_x3x1 = _mm_shuffle_epi32( xmm_x16_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );

-        xmm_x_Q3_x2x0 = _mm_mul_epi32( xmm_x_Q3_x2x0, xmm_inv_gain_Q23 );

-        xmm_x_Q3_x3x1 = _mm_mul_epi32( xmm_x_Q3_x3x1, xmm_inv_gain_Q23 );

+        xmm_x16_x2x0 = _mm_mul_epi32( xmm_x16_x2x0, xmm_inv_gain_Q26 );

+        xmm_x16_x3x1 = _mm_mul_epi32( xmm_x16_x3x1, xmm_inv_gain_Q26 );

-        xmm_x_Q3_x2x0 = _mm_srli_epi64( xmm_x_Q3_x2x0, 16 );

-        xmm_x_Q3_x3x1 = _mm_slli_epi64( xmm_x_Q3_x3x1, 16 );

+        xmm_x16_x2x0 = _mm_srli_epi64( xmm_x16_x2x0, 16 );

+        xmm_x16_x3x1 = _mm_slli_epi64( xmm_x16_x3x1, 16 );

-        xmm_x_Q3_x2x0 = _mm_blend_epi16( xmm_x_Q3_x2x0, xmm_x_Q3_x3x1, 0xCC );

+        xmm_x16_x2x0 = _mm_blend_epi16( xmm_x16_x2x0, xmm_x16_x3x1, 0xCC );

-        _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ] ) ), xmm_x_Q3_x2x0 );

+        _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ] ) ), xmm_x16_x2x0 );

     for( ; i < psEncC->subfr_length; i++ ) {

-        x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 );

+        x_sc_Q10[ i ] = silk_SMULWW( x16[ i ], inv_gain_Q26 );

-    /* Save inverse gain */

-    NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];

     /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */

     if( NSQ->rewhite_flag ) {

         if( subfr == 0 ) {

@@ -671,7 +718,9 @@

     /* Adjust for changing gain */

-    if( gain_adj_Q16 != (opus_int32)1 << 16 ) {

+    if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {

+        gain_adj_Q16 =  silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );

         /* Scale long-term shaping state */

         __m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1;

@@ -707,6 +756,7 @@

         NSQ->sLF_AR_shp_Q14 = silk_SMULWW( gain_adj_Q16, NSQ->sLF_AR_shp_Q14 );

+        NSQ->sDiff_shp_Q14 = silk_SMULWW( gain_adj_Q16, NSQ->sDiff_shp_Q14 );

         /* Scale short-term prediction and shaping states */

         for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {

@@ -715,5 +765,8 @@

         for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {

             NSQ->sAR2_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sAR2_Q14[ i ] );

+        /* Save the gain so the next call can detect a gain change */

+        NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];

--- a/silk/x86/SigProc_FIX_sse.h

+++ b/silk/x86/SigProc_FIX_sse.h

@@ -67,7 +67,7 @@

 #endif

-opus_int64 silk_inner_prod16_aligned_64_sse4_1(

+opus_int64 silk_inner_prod16_sse4_1(

     const opus_int16 *inVec1,

     const opus_int16 *inVec2,

     const opus_int   len

@@ -76,18 +76,18 @@

 #if defined(OPUS_X86_PRESUME_SSE4_1)

-#define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \

-    ((void)(arch),silk_inner_prod16_aligned_64_sse4_1(inVec1, inVec2, len))

+#define silk_inner_prod16(inVec1, inVec2, len, arch) \

+    ((void)(arch),silk_inner_prod16_sse4_1(inVec1, inVec2, len))

 #else

-extern opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[OPUS_ARCHMASK + 1])(

+extern opus_int64 (*const SILK_INNER_PROD16_IMPL[OPUS_ARCHMASK + 1])(

                     const opus_int16 *inVec1,

                     const opus_int16 *inVec2,

                     const opus_int   len);

-#  define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \

-    ((*SILK_INNER_PROD16_ALIGNED_64_IMPL[(arch) & OPUS_ARCHMASK])(inVec1, inVec2, len))

+#  define silk_inner_prod16(inVec1, inVec2, len, arch) \

+    ((*SILK_INNER_PROD16_IMPL[(arch) & OPUS_ARCHMASK])(inVec1, inVec2, len))

 #endif

 #endif

--- a/silk/x86/VAD_sse4_1.c

+++ b/silk/x86/VAD_sse4_1.c

@@ -1,5 +1,5 @@

-/* Copyright (c) 2014, Cisco Systems, INC

-   Written by XiangMingZhu WeiZhou MinPeng YanWang

+/* Copyright (c) 2014-2020, Cisco Systems, INC

+   Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers

    Redistribution and use in source and binary forms, with or without

    modification, are permitted provided that the following conditions

@@ -63,6 +63,14 @@

     SAVE_STACK;

+#ifdef OPUS_CHECK_ASM

+    silk_encoder_state psEncC_c;

+    opus_int ret_c;

+    silk_memcpy( &psEncC_c, psEncC, sizeof( psEncC_c ) );

+    ret_c = silk_VAD_GetSA_Q8_c( &psEncC_c, pIn );

+#endif

     /* Safety checks */

     silk_assert( VAD_N_BANDS == 4 );

     celt_assert( MAX_FRAME_LENGTH >= psEncC->frame_length );

@@ -233,15 +241,14 @@

         speech_nrg += ( b + 1 ) * silk_RSHIFT( Xnrg[ b ] - psSilk_VAD->NL[ b ], 4 );

+    if( psEncC->frame_length == 20 * psEncC->fs_kHz ) {

+        speech_nrg = silk_RSHIFT32( speech_nrg, 1 );

+    }

     /* Power scaling */

     if( speech_nrg <= 0 ) {

         SA_Q15 = silk_RSHIFT( SA_Q15, 1 );

-    } else if( speech_nrg < 32768 ) {

-        if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {

-            speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 16 );

-        } else {

-            speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 15 );

-        }

+    } else if( speech_nrg < 16384 ) {

+        speech_nrg = silk_LSHIFT32( speech_nrg, 16 );

         /* square-root */

         speech_nrg = silk_SQRT_APPROX( speech_nrg );

@@ -271,6 +278,11 @@

         /* quality = sigmoid( 0.25 * ( SNR_dB - 16 ) ); */

         psEncC->input_quality_bands_Q15[ b ] = silk_sigm_Q15( silk_RSHIFT( SNR_Q7 - 16 * 128, 4 ) );

+#ifdef OPUS_CHECK_ASM

+    silk_assert( ret == ret_c );

+    silk_assert( !memcmp( &psEncC_c, psEncC, sizeof( psEncC_c ) ) );

+#endif

     RESTORE_STACK;

     return( ret );

--- a/silk/x86/VQ_WMat_EC_sse4_1.c

+++ b/silk/x86/VQ_WMat_EC_sse4_1.c

@@ -1,5 +1,5 @@

-/* Copyright (c) 2014, Cisco Systems, INC

-   Written by XiangMingZhu WeiZhou MinPeng YanWang

+/* Copyright (c) 2014-2020, Cisco Systems, INC

+   Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers

    Redistribution and use in source and binary forms, with or without

    modification, are permitted provided that the following conditions

@@ -38,105 +38,136 @@

 /* Entropy constrained matrix-weighted VQ, hard-coded to 5-element vectors, for a single input data vector */

 void silk_VQ_WMat_EC_sse4_1(

     opus_int8                   *ind,                           /* O    index of best codebook vector               */

-    opus_int32                  *rate_dist_Q14,                 /* O    best weighted quant error + mu * rate       */

+    opus_int32                  *res_nrg_Q15,                   /* O    best residual energy                        */

+    opus_int32                  *rate_dist_Q8,                  /* O    best total bitrate                          */

     opus_int                    *gain_Q7,                       /* O    sum of absolute LTP coefficients            */

-    const opus_int16            *in_Q14,                        /* I    input vector to be quantized                */

-    const opus_int32            *W_Q18,                         /* I    weighting matrix                            */

+    const opus_int32            *XX_Q17,                        /* I    correlation matrix                          */

+    const opus_int32            *xX_Q17,                        /* I    correlation vector                          */

     const opus_int8             *cb_Q7,                         /* I    codebook                                    */

     const opus_uint8            *cb_gain_Q7,                    /* I    codebook effective gain                     */

     const opus_uint8            *cl_Q5,                         /* I    code length for each codebook vector        */

-    const opus_int              mu_Q9,                          /* I    tradeoff betw. weighted error and rate      */

+    const opus_int              subfr_len,                      /* I    number of samples per subframe              */

     const opus_int32            max_gain_Q7,                    /* I    maximum sum of absolute LTP coefficients    */

-    opus_int                    L                               /* I    number of vectors in codebook               */

+    const opus_int              L                               /* I    number of vectors in codebook               */

     opus_int   k, gain_tmp_Q7;

     const opus_int8 *cb_row_Q7;

-    opus_int16 diff_Q14[ 5 ];

-    opus_int32 sum1_Q14, sum2_Q16;

+    opus_int32 neg_xX_Q24[ 5 ];

+    opus_int32 sum1_Q15, sum2_Q24;

+    opus_int32 bits_res_Q8, bits_tot_Q8;

+    __m128i v_XX_31_Q17, v_XX_42_Q17, v_cb_row_31_Q7, v_cb_row_42_Q7, v_acc1_Q24, v_acc2_Q24;

-    __m128i C_tmp1, C_tmp2, C_tmp3, C_tmp4, C_tmp5;

+    /* Negate and convert to new Q domain */

+    neg_xX_Q24[ 0 ] = -silk_LSHIFT32( xX_Q17[ 0 ], 7 );

+    neg_xX_Q24[ 1 ] = -silk_LSHIFT32( xX_Q17[ 1 ], 7 );

+    neg_xX_Q24[ 2 ] = -silk_LSHIFT32( xX_Q17[ 2 ], 7 );

+    neg_xX_Q24[ 3 ] = -silk_LSHIFT32( xX_Q17[ 3 ], 7 );

+    neg_xX_Q24[ 4 ] = -silk_LSHIFT32( xX_Q17[ 4 ], 7 );

+    v_XX_31_Q17 = _mm_loadu_si128( (__m128i *)(&XX_Q17[ 1 ] ) );

+    v_XX_42_Q17 = _mm_shuffle_epi32( v_XX_31_Q17, _MM_SHUFFLE( 0, 3, 2, 1 ) );

     /* Loop over codebook */

-    *rate_dist_Q14 = silk_int32_MAX;

+    *rate_dist_Q8 = silk_int32_MAX;

+    *res_nrg_Q15 = silk_int32_MAX;

     cb_row_Q7 = cb_Q7;

+    /* If things go really bad, at least *ind is set to something safe. */

+    *ind = 0;

     for( k = 0; k < L; k++ ) {

+        opus_int32 penalty;

         gain_tmp_Q7 = cb_gain_Q7[k];

-        diff_Q14[ 0 ] = in_Q14[ 0 ] - silk_LSHIFT( cb_row_Q7[ 0 ], 7 );

-        C_tmp1 = OP_CVTEPI16_EPI32_M64( &in_Q14[ 1 ] );

-        C_tmp2 = OP_CVTEPI8_EPI32_M32( &cb_row_Q7[ 1 ] );

-        C_tmp2 = _mm_slli_epi32( C_tmp2, 7 );

-        C_tmp1 = _mm_sub_epi32( C_tmp1, C_tmp2 );

-        diff_Q14[ 1 ] = _mm_extract_epi16( C_tmp1, 0 );

-        diff_Q14[ 2 ] = _mm_extract_epi16( C_tmp1, 2 );

-        diff_Q14[ 3 ] = _mm_extract_epi16( C_tmp1, 4 );

-        diff_Q14[ 4 ] = _mm_extract_epi16( C_tmp1, 6 );

         /* Weighted rate */

-        sum1_Q14 = silk_SMULBB( mu_Q9, cl_Q5[ k ] );

+        /* Quantization error: 1 - 2 * xX * cb + cb' * XX * cb */

+        sum1_Q15 = SILK_FIX_CONST( 1.001, 15 );

         /* Penalty for too large gain */

-        sum1_Q14 = silk_ADD_LSHIFT32( sum1_Q14, silk_max( silk_SUB32( gain_tmp_Q7, max_gain_Q7 ), 0 ), 10 );

+        penalty = silk_LSHIFT32( silk_max( silk_SUB32( gain_tmp_Q7, max_gain_Q7 ), 0 ), 11 );

-        silk_assert( sum1_Q14 >= 0 );

+        /* first row of XX_Q17 */

+        v_cb_row_31_Q7 = OP_CVTEPI8_EPI32_M32( &cb_row_Q7[ 1 ] );

+        v_cb_row_42_Q7 = _mm_shuffle_epi32( v_cb_row_31_Q7, _MM_SHUFFLE( 0, 3, 2, 1 ) );

+        v_cb_row_31_Q7 = _mm_mul_epi32( v_XX_31_Q17, v_cb_row_31_Q7 );

+        v_cb_row_42_Q7 = _mm_mul_epi32( v_XX_42_Q17, v_cb_row_42_Q7 );

+        v_acc1_Q24 = _mm_add_epi64( v_cb_row_31_Q7, v_cb_row_42_Q7);

+        v_acc2_Q24 = _mm_shuffle_epi32( v_acc1_Q24, _MM_SHUFFLE( 1, 0, 3, 2 ) );

+        v_acc1_Q24 = _mm_add_epi64( v_acc1_Q24, v_acc2_Q24);

+        sum2_Q24 = _mm_cvtsi128_si32( v_acc1_Q24 );

+        sum2_Q24 = silk_ADD32( neg_xX_Q24[ 0 ], sum2_Q24 );

+        sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 );

+        sum2_Q24 = silk_MLA( sum2_Q24,        XX_Q17[  0 ], cb_row_Q7[ 0 ] );

+        sum1_Q15 = silk_SMLAWB( sum1_Q15,        sum2_Q24,  cb_row_Q7[ 0 ] );

-        /* first row of W_Q18 */

-        C_tmp3 = _mm_loadu_si128( (__m128i *)(&W_Q18[ 1 ] ) );

-        C_tmp4 = _mm_mul_epi32( C_tmp3, C_tmp1 );

-        C_tmp4 = _mm_srli_si128( C_tmp4, 2 );

+        /* second row of XX_Q17 */

+        sum2_Q24 = silk_MLA( neg_xX_Q24[ 1 ], XX_Q17[  7 ], cb_row_Q7[ 2 ] );

+        sum2_Q24 = silk_MLA( sum2_Q24,        XX_Q17[  8 ], cb_row_Q7[ 3 ] );

+        sum2_Q24 = silk_MLA( sum2_Q24,        XX_Q17[  9 ], cb_row_Q7[ 4 ] );

+        sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 );

+        sum2_Q24 = silk_MLA( sum2_Q24,        XX_Q17[  6 ], cb_row_Q7[ 1 ] );

+        sum1_Q15 = silk_SMLAWB( sum1_Q15,        sum2_Q24,  cb_row_Q7[ 1 ] );

-        C_tmp1 = _mm_shuffle_epi32( C_tmp1, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* shift right 4 bytes */

-        C_tmp3 = _mm_shuffle_epi32( C_tmp3, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* shift right 4 bytes */

+        /* third row of XX_Q17 */

+        sum2_Q24 = silk_MLA( neg_xX_Q24[ 2 ], XX_Q17[ 13 ], cb_row_Q7[ 3 ] );

+        sum2_Q24 = silk_MLA( sum2_Q24,        XX_Q17[ 14 ], cb_row_Q7[ 4 ] );

+        sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 );

+        sum2_Q24 = silk_MLA( sum2_Q24,        XX_Q17[ 12 ], cb_row_Q7[ 2 ] );

+        sum1_Q15 = silk_SMLAWB( sum1_Q15,        sum2_Q24,  cb_row_Q7[ 2 ] );

-        C_tmp5 = _mm_mul_epi32( C_tmp3, C_tmp1 );

-        C_tmp5 = _mm_srli_si128( C_tmp5, 2 );

+        /* fourth row of XX_Q17 */

+        sum2_Q24 = silk_MLA( neg_xX_Q24[ 3 ], XX_Q17[ 19 ], cb_row_Q7[ 4 ] );

+        sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 );

+        sum2_Q24 = silk_MLA( sum2_Q24,        XX_Q17[ 18 ], cb_row_Q7[ 3 ] );

+        sum1_Q15 = silk_SMLAWB( sum1_Q15,        sum2_Q24,  cb_row_Q7[ 3 ] );

-        C_tmp5 = _mm_add_epi32( C_tmp4, C_tmp5 );

-        C_tmp5 = _mm_slli_epi32( C_tmp5, 1 );

+        /* last row of XX_Q17 */

+        sum2_Q24 = silk_LSHIFT32( neg_xX_Q24[ 4 ], 1 );

+        sum2_Q24 = silk_MLA( sum2_Q24,        XX_Q17[ 24 ], cb_row_Q7[ 4 ] );

+        sum1_Q15 = silk_SMLAWB( sum1_Q15,        sum2_Q24,  cb_row_Q7[ 4 ] );

-        C_tmp5 = _mm_add_epi32( C_tmp5, _mm_shuffle_epi32( C_tmp5, _MM_SHUFFLE( 0, 0, 0, 2 ) ) );

-        sum2_Q16 = _mm_cvtsi128_si32( C_tmp5 );

-        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[  0 ], diff_Q14[ 0 ] );

-        sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16,    diff_Q14[ 0 ] );

-        /* second row of W_Q18 */

-        sum2_Q16 = silk_SMULWB(           W_Q18[  7 ], diff_Q14[ 2 ] );

-        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[  8 ], diff_Q14[ 3 ] );

-        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[  9 ], diff_Q14[ 4 ] );

-        sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 );

-        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[  6 ], diff_Q14[ 1 ] );

-        sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16,    diff_Q14[ 1 ] );

-        /* third row of W_Q18 */

-        sum2_Q16 = silk_SMULWB(           W_Q18[ 13 ], diff_Q14[ 3 ] );

-        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 14 ], diff_Q14[ 4 ] );

-        sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 );

-        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 12 ], diff_Q14[ 2 ] );

-        sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16,    diff_Q14[ 2 ] );

-        /* fourth row of W_Q18 */

-        sum2_Q16 = silk_SMULWB(           W_Q18[ 19 ], diff_Q14[ 4 ] );

-        sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 );

-        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 18 ], diff_Q14[ 3 ] );

-        sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16,    diff_Q14[ 3 ] );

-        /* last row of W_Q18 */

-        sum2_Q16 = silk_SMULWB(           W_Q18[ 24 ], diff_Q14[ 4 ] );

-        sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16,    diff_Q14[ 4 ] );

-        silk_assert( sum1_Q14 >= 0 );

         /* find best */

-        if( sum1_Q14 < *rate_dist_Q14 ) {

-            *rate_dist_Q14 = sum1_Q14;

-            *ind = (opus_int8)k;

-            *gain_Q7 = gain_tmp_Q7;

+        if( sum1_Q15 >= 0 ) {

+            /* Translate residual energy to bits using high-rate assumption (6 dB ==> 1 bit/sample) */

+            bits_res_Q8 = silk_SMULBB( subfr_len, silk_lin2log( sum1_Q15 + penalty) - (15 << 7) );

+            /* In the following line we reduce the codelength component by half ("-1"); seems to slightly improve quality */

+            bits_tot_Q8 = silk_ADD_LSHIFT32( bits_res_Q8, cl_Q5[ k ], 3-1 );

+            if( bits_tot_Q8 <= *rate_dist_Q8 ) {

+                *rate_dist_Q8 = bits_tot_Q8;

+                *res_nrg_Q15 = sum1_Q15 + penalty;

+                *ind = (opus_int8)k;

+                *gain_Q7 = gain_tmp_Q7;

+            }

         /* Go to next cbk vector */

         cb_row_Q7 += LTP_ORDER;

+#ifdef OPUS_CHECK_ASM

+    {

+        opus_int8  ind_c = 0;

+        opus_int32 res_nrg_Q15_c = 0;

+        opus_int32 rate_dist_Q8_c = 0;

+        opus_int   gain_Q7_c = 0;

+        silk_VQ_WMat_EC_c(

+            &ind_c,

+            &res_nrg_Q15_c,

+            &rate_dist_Q8_c,

+            &gain_Q7_c,

+            XX_Q17,

+            xX_Q17,

+            cb_Q7,

+            cb_gain_Q7,

+            cl_Q5,

+            subfr_len,

+            max_gain_Q7,

+            L

+        );

+        silk_assert( *ind == ind_c );

+        silk_assert( *res_nrg_Q15 == res_nrg_Q15_c );

+        silk_assert( *rate_dist_Q8 == rate_dist_Q8_c );

+        silk_assert( *gain_Q7 == gain_Q7_c );

+    }

+#endif

--- a/silk/x86/main_sse.h

+++ b/silk/x86/main_sse.h

@@ -34,73 +34,72 @@

 # if defined(OPUS_X86_MAY_HAVE_SSE4_1)

-#if 0 /* FIXME: SSE disabled until silk_VQ_WMat_EC_sse4_1() gets updated. */

 #  define OVERRIDE_silk_VQ_WMat_EC

 void silk_VQ_WMat_EC_sse4_1(

     opus_int8                   *ind,                           /* O    index of best codebook vector               */

-    opus_int32                  *rate_dist_Q14,                 /* O    best weighted quant error + mu * rate       */

+    opus_int32                  *res_nrg_Q15,                   /* O    best residual energy                        */

+    opus_int32                  *rate_dist_Q8,                  /* O    best total bitrate                          */

     opus_int                    *gain_Q7,                       /* O    sum of absolute LTP coefficients            */

-    const opus_int16            *in_Q14,                        /* I    input vector to be quantized                */

-    const opus_int32            *W_Q18,                         /* I    weighting matrix                            */

+    const opus_int32            *XX_Q17,                        /* I    correlation matrix                          */

+    const opus_int32            *xX_Q17,                        /* I    correlation vector                          */

     const opus_int8             *cb_Q7,                         /* I    codebook                                    */

     const opus_uint8            *cb_gain_Q7,                    /* I    codebook effective gain                     */

     const opus_uint8            *cl_Q5,                         /* I    code length for each codebook vector        */

-    const opus_int              mu_Q9,                          /* I    tradeoff betw. weighted error and rate      */

+    const opus_int              subfr_len,                      /* I    number of samples per subframe              */

     const opus_int32            max_gain_Q7,                    /* I    maximum sum of absolute LTP coefficients    */

-    opus_int                    L                               /* I    number of vectors in codebook               */

+    const opus_int              L                               /* I    number of vectors in codebook               */

);

 #if defined OPUS_X86_PRESUME_SSE4_1

-#define silk_VQ_WMat_EC(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \

-                          mu_Q9, max_gain_Q7, L, arch) \

-    ((void)(arch),silk_VQ_WMat_EC_sse4_1(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \

-                          mu_Q9, max_gain_Q7, L))

+#define silk_VQ_WMat_EC(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \

+                          subfr_len, max_gain_Q7, L, arch) \

+    ((void)(arch),silk_VQ_WMat_EC_sse4_1(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \

+                          subfr_len, max_gain_Q7, L))

 #else

 extern void (*const SILK_VQ_WMAT_EC_IMPL[OPUS_ARCHMASK + 1])(

     opus_int8                   *ind,                           /* O    index of best codebook vector               */

-    opus_int32                  *rate_dist_Q14,                 /* O    best weighted quant error + mu * rate       */

+    opus_int32                  *res_nrg_Q15,                   /* O    best residual energy                        */

+    opus_int32                  *rate_dist_Q8,                  /* O    best total bitrate                          */

     opus_int                    *gain_Q7,                       /* O    sum of absolute LTP coefficients            */

-    const opus_int16            *in_Q14,                        /* I    input vector to be quantized                */

-    const opus_int32            *W_Q18,                         /* I    weighting matrix                            */

+    const opus_int32            *XX_Q17,                        /* I    correlation matrix                          */

+    const opus_int32            *xX_Q17,                        /* I    correlation vector                          */

     const opus_int8             *cb_Q7,                         /* I    codebook                                    */

     const opus_uint8            *cb_gain_Q7,                    /* I    codebook effective gain                     */

     const opus_uint8            *cl_Q5,                         /* I    code length for each codebook vector        */

-    const opus_int              mu_Q9,                          /* I    tradeoff betw. weighted error and rate      */

+    const opus_int              subfr_len,                      /* I    number of samples per subframe              */

     const opus_int32            max_gain_Q7,                    /* I    maximum sum of absolute LTP coefficients    */

-    opus_int                    L                               /* I    number of vectors in codebook               */

+    const opus_int              L                               /* I    number of vectors in codebook               */

);

-#  define silk_VQ_WMat_EC(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \

-                          mu_Q9, max_gain_Q7, L, arch) \

-    ((*SILK_VQ_WMAT_EC_IMPL[(arch) & OPUS_ARCHMASK])(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \

-                          mu_Q9, max_gain_Q7, L))

+#  define silk_VQ_WMat_EC(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \

+                          subfr_len, max_gain_Q7, L, arch) \

+    ((*SILK_VQ_WMAT_EC_IMPL[(arch) & OPUS_ARCHMASK])(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \

+                          subfr_len, max_gain_Q7, L))

 #endif

-#endif

-#if 0 /* FIXME: SSE disabled until the NSQ code gets updated. */

 #  define OVERRIDE_silk_NSQ

 void silk_NSQ_sse4_1(

-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */

-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */

-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */

-    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */

-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */

-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */

-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */

-    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */

-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */

-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */

-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */

-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */

-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */

-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */

-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */

+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */

+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */

+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */

+    const opus_int16            x16[],                                        /* I    Input                           */

+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */

+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */

+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */

+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */

+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */

+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */

+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */

+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */

+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */

+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */

+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */

);

 #if defined OPUS_X86_PRESUME_SSE4_1

@@ -113,21 +112,21 @@

 #else

 extern void (*const SILK_NSQ_IMPL[OPUS_ARCHMASK + 1])(

-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */

-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */

-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */

-    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */

-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */

-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */

-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */

-    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */

-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */

-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */

-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */

-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */

-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */

-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */

-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */

+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */

+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */

+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */

+    const opus_int16            x16[],                                        /* I    Input                           */

+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */

+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */

+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */

+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */

+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */

+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */

+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */

+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */

+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */

+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */

+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */

);

 #  define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \

@@ -140,56 +139,55 @@

 #  define OVERRIDE_silk_NSQ_del_dec

 void silk_NSQ_del_dec_sse4_1(

-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */

-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */

-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */

-    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */

-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */

-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */

-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */

-    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */

-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */

-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */

-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */

-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */

-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */

-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */

-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */

+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */

+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */

+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */

+    const opus_int16            x16[],                                        /* I    Input                           */

+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */

+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */

+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */

+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */

+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */

+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */

+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */

+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */

+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */

+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */

+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */

);

 #if defined OPUS_X86_PRESUME_SSE4_1

-#define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \

+#define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \

                            HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \

-    ((void)(arch),silk_NSQ_del_dec_sse4_1(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \

+    ((void)(arch),silk_NSQ_del_dec_sse4_1(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \

                            HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))

 #else

 extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])(

-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */

-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */

-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */

-    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */

-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */

-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */

-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */

-    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */

-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */

-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */

-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */

-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */

-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */

-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */

-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */

+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */

+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */

+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */

+    const opus_int16            x16[],                                        /* I    Input                           */

+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */

+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */

+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */

+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */

+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */

+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */

+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */

+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */

+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */

+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */

+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */

);

-#  define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \

+#  define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \

                            HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \

-    ((*SILK_NSQ_DEL_DEC_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \

+    ((*SILK_NSQ_DEL_DEC_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \

                            HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))

-#endif

 #endif

 void silk_noise_shape_quantizer(

--- a/silk/x86/x86_silk_map.c

+++ b/silk/x86/x86_silk_map.c

@@ -41,16 +41,16 @@

 #include "fixed/main_FIX.h"

-opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[ OPUS_ARCHMASK + 1 ] )(

+opus_int64 (*const SILK_INNER_PROD16_IMPL[ OPUS_ARCHMASK + 1 ] )(

     const opus_int16 *inVec1,

     const opus_int16 *inVec2,

     const opus_int   len

 ) = {

-  silk_inner_prod16_aligned_64_c,                  /* non-sse */

-  silk_inner_prod16_aligned_64_c,

-  silk_inner_prod16_aligned_64_c,

-  MAY_HAVE_SSE4_1( silk_inner_prod16_aligned_64 ), /* sse4.1 */

-  MAY_HAVE_SSE4_1( silk_inner_prod16_aligned_64 )  /* avx */

+  silk_inner_prod16_c,                  /* non-sse */

+  silk_inner_prod16_c,

+  silk_inner_prod16_c,

+  MAY_HAVE_SSE4_1( silk_inner_prod16 ), /* sse4.1 */

+  MAY_HAVE_SSE4_1( silk_inner_prod16 )  /* avx */

};

 #endif

@@ -66,23 +66,22 @@

   MAY_HAVE_SSE4_1( silk_VAD_GetSA_Q8 )  /* avx */

};

-#if 0 /* FIXME: SSE disabled until the NSQ code gets updated. */

 void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )(

-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */

-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */

-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */

-    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */

-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */

-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */

-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */

-    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */

-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */

-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */

-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */

-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */

-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */

-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */

-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */

+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */

+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */

+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */

+    const opus_int16            x16[],                                        /* I    Input                           */

+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */

+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */

+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */

+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */

+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */

+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */

+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */

+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */

+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */

+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */

+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */

 ) = {

   silk_NSQ_c,                  /* non-sse */

   silk_NSQ_c,

@@ -90,21 +89,20 @@

   MAY_HAVE_SSE4_1( silk_NSQ ), /* sse4.1 */

   MAY_HAVE_SSE4_1( silk_NSQ )  /* avx */

};

-#endif

-#if 0 /* FIXME: SSE disabled until silk_VQ_WMat_EC_sse4_1() gets updated. */

 void (*const SILK_VQ_WMAT_EC_IMPL[ OPUS_ARCHMASK + 1 ] )(

     opus_int8                   *ind,                           /* O    index of best codebook vector               */

-    opus_int32                  *rate_dist_Q14,                 /* O    best weighted quant error + mu * rate       */

+    opus_int32                  *res_nrg_Q15,                   /* O    best residual energy                        */

+    opus_int32                  *rate_dist_Q8,                  /* O    best total bitrate                          */

     opus_int                    *gain_Q7,                       /* O    sum of absolute LTP coefficients            */

-    const opus_int16            *in_Q14,                        /* I    input vector to be quantized                */

-    const opus_int32            *W_Q18,                         /* I    weighting matrix                            */

+    const opus_int32            *XX_Q17,                        /* I    correlation matrix                          */

+    const opus_int32            *xX_Q17,                        /* I    correlation vector                          */

     const opus_int8             *cb_Q7,                         /* I    codebook                                    */

     const opus_uint8            *cb_gain_Q7,                    /* I    codebook effective gain                     */

     const opus_uint8            *cl_Q5,                         /* I    code length for each codebook vector        */

-    const opus_int              mu_Q9,                          /* I    tradeoff betw. weighted error and rate      */

+    const opus_int              subfr_len,                      /* I    number of samples per subframe              */

     const opus_int32            max_gain_Q7,                    /* I    maximum sum of absolute LTP coefficients    */

-    opus_int                    L                               /* I    number of vectors in codebook               */

+    const opus_int              L                               /* I    number of vectors in codebook               */

 ) = {

   silk_VQ_WMat_EC_c,                  /* non-sse */

   silk_VQ_WMat_EC_c,

@@ -112,25 +110,23 @@

   MAY_HAVE_SSE4_1( silk_VQ_WMat_EC ), /* sse4.1 */

   MAY_HAVE_SSE4_1( silk_VQ_WMat_EC )  /* avx */

};

-#endif

-#if 0 /* FIXME: SSE disabled until the NSQ code gets updated. */

 void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )(

-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */

-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */

-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */

-    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */

-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */

-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */

-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */

-    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */

-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */

-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */

-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */

-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */

-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */

-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */

-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */

+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */

+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */

+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */

+    const opus_int16            x16[],                                        /* I    Input                           */

+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */

+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */

+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */

+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */

+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */

+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */

+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */

+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */

+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */

+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */

+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */

 ) = {

   silk_NSQ_del_dec_c,                  /* non-sse */

   silk_NSQ_del_dec_c,

@@ -138,7 +134,6 @@

   MAY_HAVE_SSE4_1( silk_NSQ_del_dec ), /* sse4.1 */

   MAY_HAVE_SSE4_1( silk_NSQ_del_dec )  /* avx */

};

-#endif

 #if defined(FIXED_POINT)