ref: 7422189ab16de442554da7f73c3c6f3c15130d22
parent: 23f503ad1c388aa9171af931ccb2f114f0839e0e
author: Timothy B. Terriberry <[email protected]>
date: Fri Jan 2 10:48:54 EST 2015
Fix silk_VQ_WMat_EC_sse4_1().

During review of c95c9a048f32, I replaced a call to _mm_cvtepi8_epi32() with
the OP_CVTEPI16_EPI32_M64() macro (note the 16 instead of 8). Make a separate
OP_CVTEPI8_EPI32_M32() macro and use that instead.

Thanks to Wei Zhou for the report.
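For context, a standalone sketch of why the mix-up mattered (illustrative only,
not part of the patch; compile with -msse4.1). cb_row_Q7 is an opus_int8
buffer, so the 16-bit conversion reinterprets byte pairs as 16-bit lanes and
yields wrong values, while the 8-bit conversion sign-extends the four bytes
the code actually wants:

    #include <smmintrin.h>  /* SSE4.1 intrinsics */
    #include <stdio.h>

    int main(void)
    {
        /* Eight bytes so the 64-bit load below stays in bounds. */
        signed char cb[8] = { -1, 2, -3, 4, 0, 0, 0, 0 };
        int ok[4], bad[4];

        /* Correct: sign-extend four int8 values to int32 (PMOVSXBD). */
        __m128i a = _mm_cvtepi8_epi32(_mm_cvtsi32_si128(*(const int *)cb));
        /* Wrong for int8 data: sign-extend four int16 values (PMOVSXWD),
           which treats each little-endian byte pair as one lane. */
        __m128i b = _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i *)cb));

        _mm_storeu_si128((__m128i *)ok, a);
        _mm_storeu_si128((__m128i *)bad, b);
        printf("epi8 : %d %d %d %d\n", ok[0], ok[1], ok[2], ok[3]);
        printf("epi16: %d %d %d %d\n", bad[0], bad[1], bad[2], bad[3]);
        return 0;
    }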
--- a/celt/x86/x86cpu.h
+++ b/celt/x86/x86cpu.h
@@ -44,18 +44,26 @@
int opus_select_arch(void);
# endif
-/*gcc appears to emit MOVDQA's to load the argument of an _mm_cvtepi16_epi32()
- when optimizations are disabled, even though the actual PMOVSXWD instruction
- takes an m64. Unlike a normal m64 reference, these require 16-byte alignment
- and load 16 bytes instead of 8, possibly reading out of bounds.
+/*gcc appears to emit MOVDQA's to load the argument of an _mm_cvtepi8_epi32()
+ or _mm_cvtepi16_epi32() when optimizations are disabled, even though the
+ actual PMOVSXBD or PMOVSXWD instruction takes an m32 or m64. Unlike a normal
+ memory reference, these require 16-byte alignment and load a full 16 bytes
+ (instead of 4 or 8), possibly reading out of bounds.
- We can insert an explicit MOVQ using _mm_loadl_epi64(), which should have the
- same semantics as an m64 reference in the PMOVSXWD instruction itself, but
- gcc is not smart enough to optimize this out when optimizations ARE enabled.*/
+ We can insert an explicit MOVD or MOVQ using _mm_cvtsi32_si128() or
+ _mm_loadl_epi64(), which should have the same semantics as an m32 or m64
+ reference in the PMOVSXBD or PMOVSXWD instruction itself, but gcc is not
+ smart enough to optimize this out when optimizations ARE enabled.*/
# if !defined(__OPTIMIZE__)
+# define OP_CVTEPI8_EPI32_M32(x) \
+ (_mm_cvtepi8_epi32(_mm_cvtsi32_si128(*(int *)(x))))
+
# define OP_CVTEPI16_EPI32_M64(x) \
(_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(x))))
# else
+# define OP_CVTEPI8_EPI32_M32(x) \
+ (_mm_cvtepi8_epi32(*(__m128i *)(x)))
+
# define OP_CVTEPI16_EPI32_M64(x) \
(_mm_cvtepi16_epi32(*(__m128i *)(x)))
# endif
--- a/silk/x86/VQ_WMat_EC_sse.c
+++ b/silk/x86/VQ_WMat_EC_sse.c
@@ -65,7 +65,7 @@
diff_Q14[ 0 ] = in_Q14[ 0 ] - silk_LSHIFT( cb_row_Q7[ 0 ], 7 );
C_tmp1 = OP_CVTEPI16_EPI32_M64( &in_Q14[ 1 ] );
- C_tmp2 = OP_CVTEPI16_EPI32_M64( &cb_row_Q7[ 1 ] );
+ C_tmp2 = OP_CVTEPI8_EPI32_M32( &cb_row_Q7[ 1 ] );
C_tmp2 = _mm_slli_epi32( C_tmp2, 7 );
C_tmp1 = _mm_sub_epi32( C_tmp1, C_tmp2 );
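
For reference, a scalar sketch of what the four SIMD lanes in the fixed hunk
compute (standalone and illustrative; the buffer contents are made up, only
the types mirror the silk code: in_Q14 holds opus_int16 data in Q14,
cb_row_Q7 holds opus_int8 data in Q7):

    #include <stdio.h>

    int main(void)
    {
        /* Made-up sample values; in the real code these come from the caller. */
        short in_Q14[5]          = { 100, 2000, -3000, 400, 16000 };
        signed char cb_row_Q7[5] = { 1, 12, -20, 3, 90 };
        int diff_Q14[5], k;

        /* Each lane of the SSE4.1 code computes in_Q14[k] - (cb_row_Q7[k] << 7);
           element 0 is handled by the scalar line kept in the patch context. */
        for (k = 0; k < 5; k++)
            diff_Q14[k] = in_Q14[k] - ((int)cb_row_Q7[k] << 7);

        for (k = 0; k < 5; k++)
            printf("diff_Q14[%d] = %d\n", k, diff_Q14[k]);
        return 0;
    }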