ref: e4a9c7f8e41b1ffd2fefe6f91b65e4f74b316349
parent: 0e7ed629975084bf70767d572d52d39a6f689e3a
parent: fe5b8d1a696437f2b29eafd44789e09a43ccff88
author: volvet <[email protected]>
date: Sun May 4 12:29:59 EDT 2014
Merge pull request #779 from zhilwang/intraSad Add IntraSad asm code.
--- a/codec/common/x86/asm_inc.asm
+++ b/codec/common/x86/asm_inc.asm
@@ -100,6 +100,7 @@
%define r1w dx
%define r2w r8w
%define r3w r9w
+%define r6w r11w
%define r0b cl
%define r1b dl
@@ -149,6 +150,7 @@
%define r1w si
%define r2w dx
%define r3w cx
+%define r6w r10w
%define r0b dil
%define r1b sil
@@ -198,6 +200,7 @@
%define r1w cx
%define r2w dx
%define r3w bx
+%define r6w bp
%define r0b al
%define r1b cl
--- a/codec/common/x86/satd_sad.asm
+++ b/codec/common/x86/satd_sad.asm
@@ -353,6 +353,283 @@
;
;***********************************************************************
+
+%macro SSE_DB_1_2REG 2
+ pxor %1, %1
+ pcmpeqw %2, %2
+ psubb %1, %2
+%endmacro
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatdThree4x4_sse2( uint8_t *pDec, int32_t iLineSizeDec, uint8_t *pEnc, int32_t iLinesizeEnc,
+; uint8_t* pRed, int32_t* pBestMode, int32_t, int32_t, int32_t);
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatdThree4x4_sse2
+
+%ifdef X86_32
+ push r3
+ push r4
+ push r5
+ push r6
+ %assign push_num 4
+%else
+ %assign push_num 0
+%endif
+ PUSH_XMM 8
+
+ mov r2, arg3
+ mov r3, arg4
+ SIGN_EXTENSION r3, r3d
+
+ ; load source 4x4 samples and Hadamard transform
+ movd xmm0, [r2]
+ movd xmm1, [r2+r3]
+ lea r2 , [r2+2*r3]
+ movd xmm2, [r2]
+ movd xmm3, [r2+r3]
+ punpckldq xmm0, xmm2
+ punpckldq xmm1, xmm3
+
+ pxor xmm6, xmm6
+ punpcklbw xmm0, xmm6
+ punpcklbw xmm1, xmm6
+
+ movdqa xmm2, xmm0
+ paddw xmm0, xmm1
+ psubw xmm2, xmm1
+ SSE2_XSawp qdq, xmm0, xmm2, xmm3
+
+ movdqa xmm4, xmm0
+ paddw xmm0, xmm3
+ psubw xmm4, xmm3
+
+ movdqa xmm2, xmm0
+ punpcklwd xmm0, xmm4
+ punpckhwd xmm4, xmm2
+
+ SSE2_XSawp dq, xmm0, xmm4, xmm3
+ SSE2_XSawp qdq, xmm0, xmm3, xmm5
+
+ movdqa xmm7, xmm0
+ paddw xmm0, xmm5
+ psubw xmm7, xmm5
+
+ SSE2_XSawp qdq, xmm0, xmm7, xmm1
+
+ ; Hadamard transform results are saved in xmm0 and xmm2
+ movdqa xmm2, xmm0
+ paddw xmm0, xmm1
+ psubw xmm2, xmm1
+
+ ;load top boundary samples: [a b c d]
+ mov r0, arg1
+ mov r1, arg2
+ SIGN_EXTENSION r1, r1d
+ sub r0, r1
+%ifdef UNIX64
+ push r4
+ push r5
+%endif
+
+ movzx r2d, byte [r0]
+ movzx r3d, byte [r0+1]
+ movzx r4d, byte [r0+2]
+ movzx r5d, byte [r0+3]
+
+ ; get the transform results of top boundary samples: [a b c d]
+ add r3d, r2d ; r3d = a + b
+ add r5d, r4d ; r5d = c + d
+ add r2d, r2d ; r2d = a + a
+ add r4d, r4d ; r4d = c + c
+ sub r2d, r3d ; r2d = a + a - a - b = a - b
+ sub r4d, r5d ; r4d = c + c - c - d = c - d
+ add r5d, r3d ; r5d = (a + b) + (c + d)
+ add r3d, r3d
+ sub r3d, r5d ; r3d = (a + b) - (c + d)
+ add r4d, r2d ; r4d = (a - b) + (c - d)
+ add r2d, r2d
+ sub r2d, r4d ; r2d = (a - b) - (c - d) ; [r5d r3d r2d r4d]
+
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm2
+ movd xmm5, r5d ; store the edi for DC mode
+ pxor xmm3, xmm3
+ pxor xmm4, xmm4
+ pinsrw xmm3, r5d, 0
+ pinsrw xmm3, r4d, 4
+ psllw xmm3, 2
+ pinsrw xmm4, r3d, 0
+ pinsrw xmm4, r2d, 4
+ psllw xmm4, 2
+
+ ; get the satd of H
+ psubw xmm0, xmm3
+ psubw xmm2, xmm4
+
+ WELS_AbsW xmm0, xmm1
+ WELS_AbsW xmm2, xmm1
+ paddusw xmm0, xmm2
+ SSE2_SumWHorizon1 xmm0, xmm1 ; satd of V is stored in xmm0
+
+ ;load left boundary samples: [a b c d]'
+ add r0, r1
+
+ movzx r2d, byte [r0-1]
+ movzx r3d, byte [r0+r1-1]
+ lea r0 , [r0+2*r1]
+ movzx r4d, byte [r0-1]
+ movzx r5d, byte [r0+r1-1]
+
+ ; get the transform results of left boundary samples: [a b c d]'
+ add r3d, r2d ; r3d = a + b
+ add r5d, r4d ; r5d = c + d
+ add r2d, r2d ; r2d = a + a
+ add r4d, r4d ; r4d = c + c
+ sub r2d, r3d ; r2d = a + a - a - b = a - b
+ sub r4d, r5d ; r4d = c + c - c - d = c - d
+ add r5d, r3d ; r5d = (a + b) + (c + d)
+ add r3d, r3d
+ sub r3d, r5d ; r3d = (a + b) - (c + d)
+ add r4d, r2d ; r4d = (a - b) + (c - d)
+ add r2d, r2d
+ sub r2d, r4d ; r2d = (a - b) - (c - d) ; [r5d r3d r2d r4d]
+
+ ; store the transform results in xmm3
+ movd xmm3, r5d
+ pinsrw xmm3, r3d, 1
+ pinsrw xmm3, r2d, 2
+ pinsrw xmm3, r4d, 3
+ psllw xmm3, 2
+
+ ; get the satd of V
+ movdqa xmm2, xmm6
+ movdqa xmm4, xmm7
+ psubw xmm2, xmm3
+ WELS_AbsW xmm2, xmm1
+ WELS_AbsW xmm4, xmm1
+ paddusw xmm2, xmm4
+ SSE2_SumWHorizon1 xmm2, xmm1 ; satd of H is stored in xmm2
+
+ ; DC result is stored in xmm1
+ add r5d, 4
+ movd xmm1, r5d
+ paddw xmm1, xmm5
+ psrlw xmm1, 3
+ movdqa xmm5, xmm1
+ psllw xmm1, 4
+
+ ; get the satd of DC
+ psubw xmm6, xmm1
+ WELS_AbsW xmm6, xmm1
+ WELS_AbsW xmm7, xmm1
+ paddusw xmm6, xmm7
+ SSE2_SumWHorizon1 xmm6, xmm1 ; satd of DC is stored in xmm6
+%ifdef UNIX64
+ pop r5
+ pop r4
+%endif
+ ; comparing order: DC H V
+
+ mov r4, arg5
+ movd r2d, xmm6
+ movd r3d, xmm2
+ movd r6d, xmm0
+
+ and r2d, 0xffff
+ shr r2d, 1
+ and r3d, 0xffff
+ shr r3d, 1
+ and r6d, 0xffff
+ shr r6d, 1
+ add r2d, dword arg7
+ add r3d, dword arg8
+ add r6d, dword arg9
+ cmp r2w, r3w
+ jg near not_dc
+ cmp r2w, r6w
+ jg near not_dc_h
+
+ ; for DC mode
+ movd r3d, xmm5
+ imul r3d, 0x01010101
+ movd xmm5, r3d
+ pshufd xmm5, xmm5, 0
+ movdqa [r4], xmm5
+ mov r5, arg6
+ mov dword [r5], 0x02
+ mov retrd, r2d
+ POP_XMM
+%ifdef X86_32
+ pop r6
+ pop r5
+ pop r4
+ pop r3
+%endif
+ ret
+
+not_dc:
+ cmp r3w, r6w
+ jg near not_dc_h
+
+ ; for H mode
+ SSE_DB_1_2REG xmm6, xmm7
+ sub r0, r1
+ sub r0, r1
+ movzx r6d, byte [r0-1]
+ movd xmm0, r6d
+ pmuludq xmm0, xmm6
+
+ movzx r6d, byte [r0+r1-1]
+ movd xmm1, r6d
+ pmuludq xmm1, xmm6
+ punpckldq xmm0, xmm1
+
+ lea r0, [r0+r1*2]
+ movzx r6d, byte [r0-1]
+ movd xmm2, r6d
+ pmuludq xmm2, xmm6
+
+ movzx r6d, byte [r0+r1-1]
+ movd xmm3, r6d
+ pmuludq xmm3, xmm6
+ punpckldq xmm2, xmm3
+ punpcklqdq xmm0, xmm2
+
+ movdqa [r4],xmm0
+
+ mov retrd, r3d
+ mov r5, arg6
+ mov dword [r5], 0x01
+ POP_XMM
+%ifdef X86_32
+ pop r6
+ pop r5
+ pop r4
+ pop r3
+%endif
+ ret
+not_dc_h:
+ sub r0, r1
+ sub r0, r1
+ sub r0, r1
+ movd xmm0, [r0]
+ pshufd xmm0, xmm0, 0
+ movdqa [r4],xmm0
+ mov retrd, r6d
+ mov r5, arg6
+ mov dword [r5], 0x00
+ POP_XMM
+%ifdef X86_32
+ pop r6
+ pop r5
+ pop r4
+ pop r3
+%endif
+ ret
+
+
%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3
pmaddubsw %1, xmm5
movdqa %2, %1
@@ -390,12 +667,12 @@
%macro SSE41_GetX38x4SatdDec 0
pxor xmm7, xmm7
- movq xmm0, [eax]
- movq xmm1, [eax+ebx]
- lea eax, [eax+2*ebx]
- movq xmm2, [eax]
- movq xmm3, [eax+ebx]
- lea eax, [eax+2*ebx]
+ movq xmm0, [r2]
+ movq xmm1, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq xmm2, [r2]
+ movq xmm3, [r2+r3]
+ lea r2, [r2+2*r3]
punpcklbw xmm0, xmm7
punpcklbw xmm1, xmm7
punpcklbw xmm2, xmm7
@@ -405,34 +682,35 @@
SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2
;doesn't need another transpose
%endmacro
+
%macro SSE41_GetX38x4SatdV 2
pxor xmm0, xmm0
- pinsrw xmm0, word[esi+%2], 0
- pinsrw xmm0, word[esi+%2+8], 4
+ pinsrw xmm0, word[r6+%2], 0
+ pinsrw xmm0, word[r6+%2+8], 4
psubsw xmm0, xmm7
pabsw xmm0, xmm0
paddw xmm4, xmm0
pxor xmm0, xmm0
- pinsrw xmm0, word[esi+%2+2], 0
- pinsrw xmm0, word[esi+%2+10], 4
+ pinsrw xmm0, word[r6+%2+2], 0
+ pinsrw xmm0, word[r6+%2+10], 4
psubsw xmm0, xmm1
pabsw xmm0, xmm0
paddw xmm4, xmm0
pxor xmm0, xmm0
- pinsrw xmm0, word[esi+%2+4], 0
- pinsrw xmm0, word[esi+%2+12], 4
+ pinsrw xmm0, word[r6+%2+4], 0
+ pinsrw xmm0, word[r6+%2+12], 4
psubsw xmm0, xmm3
pabsw xmm0, xmm0
paddw xmm4, xmm0
pxor xmm0, xmm0
- pinsrw xmm0, word[esi+%2+6], 0
- pinsrw xmm0, word[esi+%2+14], 4
+ pinsrw xmm0, word[r6+%2+6], 0
+ pinsrw xmm0, word[r6+%2+14], 4
psubsw xmm0, xmm2
pabsw xmm0, xmm0
paddw xmm4, xmm0
%endmacro
%macro SSE41_GetX38x4SatdH 3
- movq xmm0, [esi+%3+8*%1]
+ movq xmm0, [r6+%3+8*%1]
punpcklqdq xmm0, xmm0
psubsw xmm0, xmm7
pabsw xmm0, xmm0
@@ -455,7 +733,7 @@
%endmacro
%macro SSE41_ChromaGetX38x4SatdDC 1
shl %1, 4
- movdqa xmm0, [esi+32+%1]
+ movdqa xmm0, [r6+32+%1]
psubsw xmm0, xmm7
pabsw xmm0, xmm0
paddw xmm6, xmm0
@@ -481,83 +759,93 @@
paddd %1, %3
%endmacro
-
-%ifdef X86_32
WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
- push ebx
- push esi
- push edi
- mov ecx, [esp+16]
- mov edx, [esp+20]
- mov eax, [esp+24]
- mov ebx, [esp+28]
- mov esi, [esp+40] ;temp_satd
+ %assign push_num 0
+ LOAD_7_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r5, r5d
+
+%ifndef X86_32
+ push r12
+ mov r12, r2
+%endif
+
pxor xmm4, xmm4
movdqa xmm5, [HSumSubDB1]
movdqa xmm6, [HSumSubDW1]
movdqa xmm7, [PDW1]
- sub ecx, edx
- movdqu xmm0, [ecx]
+ sub r0, r1
+ movdqu xmm0, [r0]
movhlps xmm1, xmm0
punpcklqdq xmm0, xmm0
punpcklqdq xmm1, xmm1
SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
- movdqa [esi], xmm0 ;V
- movdqa [esi+16], xmm1
- add ecx, edx
- pinsrb xmm0, byte[ecx-1], 0
- pinsrb xmm0, byte[ecx+edx-1], 1
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 2
- pinsrb xmm0, byte[ecx+edx-1], 3
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 4
- pinsrb xmm0, byte[ecx+edx-1], 5
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 6
- pinsrb xmm0, byte[ecx+edx-1], 7
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 8
- pinsrb xmm0, byte[ecx+edx-1], 9
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 10
- pinsrb xmm0, byte[ecx+edx-1], 11
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 12
- pinsrb xmm0, byte[ecx+edx-1], 13
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 14
- pinsrb xmm0, byte[ecx+edx-1], 15
+ movdqa [r6], xmm0 ;V
+ movdqa [r6+16], xmm1
+ add r0, r1
+ pinsrb xmm0, byte[r0-1], 0
+ pinsrb xmm0, byte[r0+r1-1], 1
+ lea r0, [r0+2*r1]
+ pinsrb xmm0, byte[r0-1], 2
+ pinsrb xmm0, byte[r0+r1-1], 3
+ lea r0, [r0+2*r1]
+ pinsrb xmm0, byte[r0-1], 4
+ pinsrb xmm0, byte[r0+r1-1], 5
+ lea r0, [r0+2*r1]
+ pinsrb xmm0, byte[r0-1], 6
+ pinsrb xmm0, byte[r0+r1-1], 7
+ lea r0, [r0+2*r1]
+ pinsrb xmm0, byte[r0-1], 8
+ pinsrb xmm0, byte[r0+r1-1], 9
+ lea r0, [r0+2*r1]
+ pinsrb xmm0, byte[r0-1], 10
+ pinsrb xmm0, byte[r0+r1-1], 11
+ lea r0, [r0+2*r1]
+ pinsrb xmm0, byte[r0-1], 12
+ pinsrb xmm0, byte[r0+r1-1], 13
+ lea r0, [r0+2*r1]
+ pinsrb xmm0, byte[r0-1], 14
+ pinsrb xmm0, byte[r0+r1-1], 15
movhlps xmm1, xmm0
punpcklqdq xmm0, xmm0
punpcklqdq xmm1, xmm1
SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
- movdqa [esi+32], xmm0 ;H
- movdqa [esi+48], xmm1
- movd ecx, xmm4 ;dc
- add ecx, 16 ;(sum+16)
- shr ecx, 5 ;((sum+16)>>5)
- shl ecx, 4 ;
- movd mm4, ecx ; mm4 copy DC
+ movdqa [r6+32], xmm0 ;H
+ movdqa [r6+48], xmm1
+ movd r0d, xmm4 ;dc
+ add r0d, 16 ;(sum+16)
+ shr r0d, 5 ;((sum+16)>>5)
+ shl r0d, 4 ;
+ movd mm4, r0d ; mm4 copy DC
pxor xmm4, xmm4 ;V
pxor xmm5, xmm5 ;H
pxor xmm6, xmm6 ;DC
- mov ecx, 0
- mov edi, 0
+%ifdef UNIX64
+ push r4
+%endif
+ mov r0, 0
+ mov r4, 0
+
.loop16x16_get_satd:
.loopStart1:
- SSE41_I16x16GetX38x4Satd ecx, edi
- inc ecx
- cmp ecx, 4
+ SSE41_I16x16GetX38x4Satd r0, r4
+ inc r0
+ cmp r0, 4
jl .loopStart1
- cmp edi, 16
+ cmp r4, 16
je .loop16x16_get_satd_end
- mov eax, [esp+24]
- add eax, 8
- mov ecx, 0
- add edi, 16
+%ifdef X86_32
+ mov r2, arg3
+%else
+ mov r2, r12
+%endif
+ add r2, 8
+ mov r0, 0
+ add r4, 16
jmp .loop16x16_get_satd
.loop16x16_get_satd_end:
MMX_DW_1_2REG xmm0, xmm1
@@ -568,40 +856,44 @@
SSE41_HSum8W xmm5, xmm0, xmm1
SSE41_HSum8W xmm6, xmm0, xmm1
+%ifdef UNIX64
+ pop r4
+%endif
; comparing order: DC H V
- movd ebx, xmm6 ;DC
- movd edi, xmm5 ;H
- movd ecx, xmm4 ;V
- mov edx, [esp+36]
- shl edx, 1
- add edi, edx
- add ebx, edx
- mov edx, [esp+32]
- cmp ebx, edi
+ movd r3d, xmm6 ;DC
+ movd r1d, xmm5 ;H
+ movd r0d, xmm4 ;V
+%ifndef X86_32
+ pop r12
+%endif
+ shl r5d, 1
+ add r1d, r5d
+ add r3d, r5d
+ mov r4, arg5
+ cmp r3d, r1d
jge near not_dc_16x16
- cmp ebx, ecx
+ cmp r3d, r0d
jge near not_dc_h_16x16
; for DC mode
- mov dword[edx], 2;I16_PRED_DC
- mov eax, ebx
+ mov dword[r4], 2;I16_PRED_DC
+ mov retrd, r3d
jmp near return_satd_intra_16x16_x3
not_dc_16x16:
; for H mode
- cmp edi, ecx
+ cmp r1d, r0d
jge near not_dc_h_16x16
- mov dword[edx], 1;I16_PRED_H
- mov eax, edi
+ mov dword[r4], 1;I16_PRED_H
+ mov retrd, r1d
jmp near return_satd_intra_16x16_x3
not_dc_h_16x16:
; for V mode
- mov dword[edx], 0;I16_PRED_V
- mov eax, ecx
+ mov dword[r4], 0;I16_PRED_V
+ mov retrd, r0d
return_satd_intra_16x16_x3:
WELSEMMS
- pop edi
- pop esi
- pop ebx
+ POP_XMM
+ LOAD_7_PARA_POP
ret
%macro SSE41_ChromaGetX38x8Satd 0
@@ -608,26 +900,26 @@
movdqa xmm5, [HSumSubDB1]
movdqa xmm6, [HSumSubDW1]
movdqa xmm7, [PDW1]
- sub ecx, edx
- movq xmm0, [ecx]
+ sub r0, r1
+ movq xmm0, [r0]
punpcklqdq xmm0, xmm0
SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
- movdqa [esi], xmm0 ;V
- add ecx, edx
- pinsrb xmm0, byte[ecx-1], 0
- pinsrb xmm0, byte[ecx+edx-1], 1
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 2
- pinsrb xmm0, byte[ecx+edx-1], 3
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 4
- pinsrb xmm0, byte[ecx+edx-1], 5
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 6
- pinsrb xmm0, byte[ecx+edx-1], 7
+ movdqa [r6], xmm0 ;V
+ add r0, r1
+ pinsrb xmm0, byte[r0-1], 0
+ pinsrb xmm0, byte[r0+r1-1], 1
+ lea r0, [r0+2*r1]
+ pinsrb xmm0, byte[r0-1], 2
+ pinsrb xmm0, byte[r0+r1-1], 3
+ lea r0, [r0+2*r1]
+ pinsrb xmm0, byte[r0-1], 4
+ pinsrb xmm0, byte[r0+r1-1], 5
+ lea r0, [r0+2*r1]
+ pinsrb xmm0, byte[r0-1], 6
+ pinsrb xmm0, byte[r0+r1-1], 7
punpcklqdq xmm0, xmm0
SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
- movdqa [esi+16], xmm0 ;H
+ movdqa [r6+16], xmm0 ;H
;(sum+2)>>2
movdqa xmm6, [PDQ2]
movdqa xmm5, xmm4
@@ -647,21 +939,19 @@
punpcklqdq xmm4, xmm5
psllq xmm4, 32
psrlq xmm4, 32
- movdqa [esi+32], xmm4
+ movdqa [r6+32], xmm4
punpckhqdq xmm5, xmm6
psllq xmm5, 32
psrlq xmm5, 32
- movdqa [esi+48], xmm5
+ movdqa [r6+48], xmm5
pxor xmm4, xmm4 ;V
pxor xmm5, xmm5 ;H
pxor xmm6, xmm6 ;DC
- mov ecx, 0
-loop_chroma_satdx3_cb_cr:
- SSE41_ChromaGetX38x4Satd ecx, 0
- inc ecx
- cmp ecx, 2
- jl loop_chroma_satdx3_cb_cr
+ mov r0, 0
+ SSE41_ChromaGetX38x4Satd r0, 0
+ inc r0
+ SSE41_ChromaGetX38x4Satd r0, 0
%endmacro
%macro SSEReg2MMX 3
@@ -677,27 +967,22 @@
;for reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41
WELS_EXTERN WelsIntraChroma8x8Combined3Satd_sse41
- push ebx
- push esi
- push edi
- mov ecx, [esp+16]
- mov edx, [esp+20]
- mov eax, [esp+24]
- mov ebx, [esp+28]
- mov esi, [esp+40] ;temp_satd
- xor edi, edi
+ %assign push_num 0
+ LOAD_7_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r5, r5d
loop_chroma_satdx3:
SSE41_ChromaGetX38x8Satd
- cmp edi, 1
- je loop_chroma_satdx3end
- inc edi
SSEReg2MMX xmm4, mm0,mm1
SSEReg2MMX xmm5, mm2,mm3
SSEReg2MMX xmm6, mm5,mm6
- mov ecx, [esp+44]
- mov eax, [esp+48]
- jmp loop_chroma_satdx3
-loop_chroma_satdx3end:
+ mov r0, arg8
+ mov r2, arg9
+
+ SSE41_ChromaGetX38x8Satd
+
MMXReg2SSE xmm0, xmm3, mm0, mm1
MMXReg2SSE xmm1, xmm3, mm2, mm3
MMXReg2SSE xmm2, xmm3, mm5, mm6
@@ -714,39 +999,38 @@
SSE41_HSum8W xmm5, xmm0, xmm1
SSE41_HSum8W xmm6, xmm0, xmm1
; comparing order: DC H V
- movd ebx, xmm6 ;DC
- movd edi, xmm5 ;H
- movd ecx, xmm4 ;V
- mov edx, [esp+36]
- shl edx, 1
- add edi, edx
- add ecx, edx
- mov edx, [esp+32]
- cmp ebx, edi
+ movd r3d, xmm6 ;DC
+ movd r1d, xmm5 ;H
+ movd r0d, xmm4 ;V
+
+
+ shl r5d, 1
+ add r1d, r5d
+ add r0d, r5d
+ cmp r3d, r1d
jge near not_dc_8x8
- cmp ebx, ecx
+ cmp r3d, r0d
jge near not_dc_h_8x8
; for DC mode
- mov dword[edx], 0;I8_PRED_DC
- mov eax, ebx
+ mov dword[r4], 0;I8_PRED_DC
+ mov retrd, r3d
jmp near return_satd_intra_8x8_x3
not_dc_8x8:
; for H mode
- cmp edi, ecx
+ cmp r1d, r0d
jge near not_dc_h_8x8
- mov dword[edx], 1;I8_PRED_H
- mov eax, edi
+ mov dword[r4], 1;I8_PRED_H
+ mov retrd, r1d
jmp near return_satd_intra_8x8_x3
not_dc_h_8x8:
; for V mode
- mov dword[edx], 2;I8_PRED_V
- mov eax, ecx
+ mov dword[r4], 2;I8_PRED_V
+ mov retrd, r0d
return_satd_intra_8x8_x3:
WELSEMMS
- pop edi
- pop esi
- pop ebx
+ POP_XMM
+ LOAD_7_PARA_POP
ret
@@ -769,9 +1053,9 @@
paddw xmm3,xmm6
%endmacro
%macro WelsAddDCValue 4
- movzx %2, byte %1
- mov %3, %2
- add %4, %2
+ movzx %2, byte %1
+ mov %3, %2
+ add %4, %2
%endmacro
;***********************************************************************
@@ -780,133 +1064,139 @@
;
;***********************************************************************
WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
- push ebx
- push esi
- push edi
- mov ecx, [esp+16]
- mov edx, [esp+20]
- mov edi, [esp+40] ;temp_sad
- sub ecx, edx
- movdqa xmm5,[ecx]
- pxor xmm0,xmm0
- psadbw xmm0,xmm5
- movhlps xmm1,xmm0
- paddw xmm0,xmm1
- movd eax,xmm0
+ %assign push_num 0
+ LOAD_7_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r5, r5d
- add ecx,edx
- lea ebx, [edx+2*edx]
- WelsAddDCValue [ecx-1 ], esi, [edi ], eax
- WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
- WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
- WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
- lea ecx, [ecx+4*edx]
- add edi, 64
- WelsAddDCValue [ecx-1 ], esi, [edi ], eax
- WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
- WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
- WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
- lea ecx, [ecx+4*edx]
- add edi, 64
- WelsAddDCValue [ecx-1 ], esi, [edi ], eax
- WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
- WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
- WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
- lea ecx, [ecx+4*edx]
- add edi, 64
- WelsAddDCValue [ecx-1 ], esi, [edi ], eax
- WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
- WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
- WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
- sub edi, 192
- add eax,10h
- shr eax,5
- movd xmm7,eax
- pxor xmm1,xmm1
- pshufb xmm7,xmm1
- pxor xmm4,xmm4
- pxor xmm3,xmm3
- pxor xmm2,xmm2
-;sad begin
- mov eax, [esp+24]
- mov ebx, [esp+28]
- lea esi, [ebx+2*ebx]
- SSSE3_Get16BSadHVDC [edi], [eax]
- SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
- SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
- SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
- add edi, 64
- lea eax, [eax+4*ebx]
- SSSE3_Get16BSadHVDC [edi], [eax]
- SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
- SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
- SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
- add edi, 64
- lea eax, [eax+4*ebx]
- SSSE3_Get16BSadHVDC [edi], [eax]
- SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
- SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
- SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
- add edi, 64
- lea eax, [eax+4*ebx]
- SSSE3_Get16BSadHVDC [edi], [eax]
- SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
- SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
- SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
+ push r5
+ push r4
+ push r3
- pslldq xmm3,4
- por xmm3,xmm2
- movhlps xmm1,xmm3
- paddw xmm3,xmm1
- movhlps xmm0,xmm4
- paddw xmm4,xmm0
-; comparing order: DC H V
- movd ebx, xmm4 ;DC
- movd ecx, xmm3 ;V
+ sub r0, r1
+ movdqa xmm5,[r0]
+ pxor xmm0,xmm0
+ psadbw xmm0,xmm5
+ movhlps xmm1,xmm0
+ paddw xmm0,xmm1
+ movd r5d, xmm0
+
+ add r0,r1
+ lea r3,[r1+2*r1] ;ebx r3
+ WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d ; esi r4d, eax r5d
+ WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d
+ WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
+ WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d
+ lea r0, [r0+4*r1]
+ add r6, 64
+ WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d
+ WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d
+ WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
+ WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d
+ lea r0, [r0+4*r1]
+ add r6, 64
+ WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d
+ WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d
+ WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
+ WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d
+ lea r0, [r0+4*r1]
+ add r6, 64
+ WelsAddDCValue [r0-1 ], r4d, [r6 ], r5d
+ WelsAddDCValue [r0-1+r1 ], r4d, [r6+16], r5d
+ WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
+ WelsAddDCValue [r0-1+r3 ], r4d, [r6+48], r5d
+ sub r6, 192
+ add r5d,10h
+ shr r5d,5
+ movd xmm7,r5d
+ pxor xmm1,xmm1
+ pshufb xmm7,xmm1
+ pxor xmm4,xmm4
+ pxor xmm3,xmm3
+ pxor xmm2,xmm2
+ ;sad begin
+ pop r3
+ lea r4, [r3+2*r3] ;esi r4
+ SSSE3_Get16BSadHVDC [r6], [r2]
+ SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
+ SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
+ SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
+ add r6, 64
+ lea r2, [r2+4*r3]
+ SSSE3_Get16BSadHVDC [r6], [r2]
+ SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
+ SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
+ SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
+ add r6, 64
+ lea r2, [r2+4*r3]
+ SSSE3_Get16BSadHVDC [r6], [r2]
+ SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
+ SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
+ SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
+ add r6, 64
+ lea r2, [r2+4*r3]
+ SSSE3_Get16BSadHVDC [r6], [r2]
+ SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
+ SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
+ SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
+
+ pop r4
+ pop r5
+ pslldq xmm3,4
+ por xmm3,xmm2
+ movhlps xmm1,xmm3
+ paddw xmm3,xmm1
+ movhlps xmm0,xmm4
+ paddw xmm4,xmm0
+ ; comparing order: DC H V
+ movd r1d, xmm4 ;DC ;ebx r1d
+ movd r0d, xmm3 ;V ;ecx r0d
psrldq xmm3, 4
- movd esi, xmm3 ;H
- mov eax, [esp+36] ;lamda
- shl eax, 1
- add esi, eax
- add ebx, eax
- mov edx, [esp+32]
- cmp ebx, esi
+ movd r2d, xmm3 ;H ;esi r2d
+
+ ;mov eax, [esp+36] ;lamda ;eax r5
+ shl r5d, 1
+ add r2d, r5d
+ add r1d, r5d
+ ;mov edx, [esp+32] ;edx r4
+ cmp r1d, r2d
jge near not_dc_16x16_sad
- cmp ebx, ecx
+ cmp r1d, r0d
jge near not_dc_h_16x16_sad
; for DC mode
- mov dword[edx], 2;I16_PRED_DC
- mov eax, ebx
- sub edi, 192
+ mov dword[r4], 2;I16_PRED_DC
+ mov retrd, r1d
+ sub r6, 192
%assign x 0
%rep 16
- movdqa [edi+16*x], xmm7
+ movdqa [r6+16*x], xmm7
%assign x x+1
%endrep
jmp near return_sad_intra_16x16_x3
not_dc_16x16_sad:
; for H mode
- cmp esi, ecx
+ cmp r2d, r0d
jge near not_dc_h_16x16_sad
- mov dword[edx], 1;I16_PRED_H
- mov eax, esi
+ mov dword[r4], 1;I16_PRED_H
+ mov retrd, r2d
jmp near return_sad_intra_16x16_x3
not_dc_h_16x16_sad:
; for V mode
- mov dword[edx], 0;I16_PRED_V
- mov eax, ecx
- sub edi, 192
+ mov dword[r4], 0;I16_PRED_V
+ mov retrd, r0d
+ sub r6, 192
%assign x 0
%rep 16
- movdqa [edi+16*x], xmm5
+ movdqa [r6+16*x], xmm5
%assign x x+1
%endrep
return_sad_intra_16x16_x3:
- pop edi
- pop esi
- pop ebx
+ POP_XMM
+ LOAD_7_PARA_POP
ret
-%endif
+
;***********************************************************************
;
;Pixel_sad_intra_ssse3 END
--- a/codec/encoder/core/inc/sample.h
+++ b/codec/encoder/core/inc/sample.h
@@ -47,6 +47,11 @@
//int32_t WelsSampleSatd4x8( uint8_t *, int32_t, uint8_t *, int32_t );
int32_t WelsSampleSatd4x4_c (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSatdIntra4x4Combined3_c (uint8_t*, int32_t, uint8_t*, int32_t, uint8_t*, int32_t*, int32_t, int32_t, int32_t);
+int32_t WelsSampleSatdIntra16x16Combined3_c (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*);
+int32_t WelsSampleSadIntra16x16Combined3_c (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*);
+int32_t WelsSampleSatdIntra8x8Combined3_c (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*,
+ uint8_t*, uint8_t*);
#if defined(__cplusplus)
extern "C" {
--- a/codec/encoder/core/src/sample.cpp
+++ b/codec/encoder/core/src/sample.cpp
@@ -367,11 +367,11 @@
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16 ] = WelsSampleSatd8x16_sse2;
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x8 ] = WelsSampleSatd16x8_sse2;
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] = WelsSampleSatd16x16_sse2;
- //pFuncList->sSampleDealingFuncs.pfIntra4x4Combined3Satd = WelsSampleSatdThree4x4_sse2;
+ pFuncList->sSampleDealingFuncs.pfIntra4x4Combined3Satd = WelsSampleSatdThree4x4_sse2;
}
if (uiCpuFlag & WELS_CPU_SSSE3) {
- //pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Sad = WelsIntra16x16Combined3Sad_ssse3;
+ pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Sad = WelsIntra16x16Combined3Sad_ssse3;
}
if (uiCpuFlag & WELS_CPU_SSE41) {
@@ -380,8 +380,8 @@
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16] = WelsSampleSatd8x16_sse41;
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x8] = WelsSampleSatd8x8_sse41;
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x4] = WelsSampleSatd4x4_sse41;
- //pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Satd = WelsIntra16x16Combined3Satd_sse41;
- //pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Satd = WelsIntraChroma8x8Combined3Satd_sse41;
+ pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Satd = WelsIntra16x16Combined3Satd_sse41;
+ pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Satd = WelsIntraChroma8x8Combined3Satd_sse41;
}
#endif //(X86_ASM)
--- a/codec/encoder/core/x86/intra_pred.asm
+++ b/codec/encoder/core/x86/intra_pred.asm
@@ -1166,251 +1166,4 @@
pop r4
pop r3
- ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatdThree4x4_sse2( uint8_t *pDec, int32_t iLineSizeDec, uint8_t *pEnc, int32_t iLinesizeEnc,
-; uint8_t* pRed, int32_t* pBestMode, int32_t, int32_t, int32_t);
-;
-;***********************************************************************
-%ifdef X86_32
-WELS_EXTERN WelsSampleSatdThree4x4_sse2
- push ebx
- push esi
- push edi
- mov eax, [esp+24];p_enc
- mov ebx, [esp+28];linesize_enc
-
- ; load source 4x4 samples and Hadamard transform
- movd xmm0, [eax]
- movd xmm1, [eax+ebx]
- lea eax , [eax+2*ebx]
- movd xmm2, [eax]
- movd xmm3, [eax+ebx]
- punpckldq xmm0, xmm2
- punpckldq xmm1, xmm3
-
- pxor xmm6, xmm6
- punpcklbw xmm0, xmm6
- punpcklbw xmm1, xmm6
-
- movdqa xmm2, xmm0
- paddw xmm0, xmm1
- psubw xmm2, xmm1
- SSE2_XSawp qdq, xmm0, xmm2, xmm3
-
- movdqa xmm4, xmm0
- paddw xmm0, xmm3
- psubw xmm4, xmm3
-
- movdqa xmm2, xmm0
- punpcklwd xmm0, xmm4
- punpckhwd xmm4, xmm2
-
- SSE2_XSawp dq, xmm0, xmm4, xmm3
- SSE2_XSawp qdq, xmm0, xmm3, xmm5
-
- movdqa xmm7, xmm0
- paddw xmm0, xmm5
- psubw xmm7, xmm5
-
- SSE2_XSawp qdq, xmm0, xmm7, xmm1
-
- ; Hadamard transform results are saved in xmm0 and xmm2
- movdqa xmm2, xmm0
- paddw xmm0, xmm1
- psubw xmm2, xmm1
-
- ; load top boundary samples: [a b c d]
- mov eax, [esp+16];p_dec
- sub eax, [esp+20];linesize_dec
- movzx ecx, byte [eax]
- movzx edx, byte [eax+1]
- movzx esi, byte [eax+2]
- movzx edi, byte [eax+3]
-
- ; get the transform results of top boundary samples: [a b c d]
- add edx, ecx ; edx = a + b
- add edi, esi ; edi = c + d
- add ecx, ecx ; ecx = a + a
- add esi, esi ; esi = c + c
- sub ecx, edx ; ecx = a + a - a - b = a - b
- sub esi, edi ; esi = c + c - c - d = c - d
- add edi, edx ; edi = (a + b) + (c + d)
- add edx, edx
- sub edx, edi ; edx = (a + b) - (c + d)
- add esi, ecx ; esi = (a - b) + (c - d)
- add ecx, ecx
- sub ecx, esi ; ecx = (a - b) - (c - d) ; [edi edx ecx esi]
-
- movdqa xmm6, xmm0
- movdqa xmm7, xmm2
- movd xmm5, edi ; store the edi for DC mode
- pxor xmm3, xmm3
- pxor xmm4, xmm4
- pinsrw xmm3, edi, 0
- pinsrw xmm3, esi, 4
- psllw xmm3, 2
- pinsrw xmm4, edx, 0
- pinsrw xmm4, ecx, 4
- psllw xmm4, 2
-
- ; get the satd of H
- psubw xmm0, xmm3
- psubw xmm2, xmm4
-
- WELS_AbsW xmm0, xmm1
- WELS_AbsW xmm2, xmm1
- paddusw xmm0, xmm2
- SUMW_HORIZON1 xmm0, xmm1 ; satd of V is stored in xmm0
-
- ; load left boundary samples: [a b c d]'
- mov eax, [esp+16]
- mov ebx, [esp+20]
- movzx ecx, byte [eax-1]
- movzx edx, byte [eax+ebx-1]
- lea eax , [eax+2*ebx]
- movzx esi, byte [eax-1]
- movzx edi, byte [eax+ebx-1]
-
- ; get the transform results of left boundary samples: [a b c d]'
- add edx, ecx ; edx = a + b
- add edi, esi ; edi = c + d
- add ecx, ecx ; ecx = a + a
- add esi, esi ; esi = c + c
- sub ecx, edx ; ecx = a + a - a - b = a - b
- sub esi, edi ; esi = c + c - c - d = c - d
- add edi, edx ; edi = (a + b) + (c + d)
- add edx, edx
- sub edx, edi ; edx = (a + b) - (c + d)
- add esi, ecx ; esi = (a - b) + (c - d)
- add ecx, ecx
- sub ecx, esi ; ecx = (a - b) - (c - d) ; [edi edx ecx esi]'
-
- ; store the transform results in xmm3
- movd xmm3, edi
- pinsrw xmm3, edx, 1
- pinsrw xmm3, ecx, 2
- pinsrw xmm3, esi, 3
- psllw xmm3, 2
-
- ; get the satd of V
- movdqa xmm2, xmm6
- movdqa xmm4, xmm7
- psubw xmm2, xmm3
- WELS_AbsW xmm2, xmm1
- WELS_AbsW xmm4, xmm1
- paddusw xmm2, xmm4
- SUMW_HORIZON1 xmm2, xmm1 ; satd of H is stored in xmm2
-
- ; DC result is stored in xmm1
- add edi, 4
- movd xmm1, edi
- paddw xmm1, xmm5
- psrlw xmm1, 3
- movdqa xmm5, xmm1
- psllw xmm1, 4
-
- ; get the satd of DC
- psubw xmm6, xmm1
- WELS_AbsW xmm6, xmm1
- WELS_AbsW xmm7, xmm1
- paddusw xmm6, xmm7
- SUMW_HORIZON1 xmm6, xmm1 ; satd of DC is stored in xmm6
-
- ; comparing order: DC H V
- mov edx, [esp+32]
- movd eax, xmm6
- movd edi, xmm2
- movd esi, xmm0
- and eax, 0xffff
- shr eax, 1
- and edi, 0xffff
- shr edi, 1
- and esi, 0xffff
- shr esi, 1
- add eax, [esp+40]
- add edi, [esp+44]
- add esi, [esp+48]
- cmp ax, di
- jg near not_dc
- cmp ax, si
- jg near not_dc_h
-
- ; for DC mode
- movd ebx, xmm5
- imul ebx, 0x01010101
- movd xmm5, ebx
- pshufd xmm5, xmm5, 0
- movdqa [edx], xmm5
- mov ebx, [esp+36]
- mov dword [ebx], 0x02
- pop edi
- pop esi
- pop ebx
- ret
-
-not_dc:
- cmp di, si
- jg near not_dc_h
-
- ; for H mode
- SSE_DB_1_2REG xmm6, xmm7
- mov eax, [esp+16]
- mov ebx, [esp+20]
- movzx ecx, byte [eax-1]
- movd xmm0, ecx
- pmuludq xmm0, xmm6
-
- movzx ecx, byte [eax+ebx-1]
- movd xmm1, ecx
- pmuludq xmm1, xmm6
-%if 1
- punpckldq xmm0, xmm1
-%else
- unpcklps xmm0, xmm1
-%endif
- lea eax, [eax+ebx*2]
- movzx ecx, byte [eax-1]
- movd xmm2, ecx
- pmuludq xmm2, xmm6
-
- movzx ecx, byte [eax+ebx-1]
- movd xmm3, ecx
- pmuludq xmm3, xmm6
-%if 1
- punpckldq xmm2, xmm3
- punpcklqdq xmm0, xmm2
-%else
- unpcklps xmm2, xmm3
- unpcklpd xmm0, xmm2
-%endif
- movdqa [edx],xmm0
-
- mov eax, edi
- mov ebx, [esp+36]
- mov dword [ebx], 0x01
-
- pop edi
- pop esi
- pop ebx
- ret
-not_dc_h:
- ; for V mode
- mov eax, [esp+16]
- sub eax, [esp+20]
- movd xmm0, [eax]
- pshufd xmm0, xmm0, 0
- movdqa [edx],xmm0
-
- mov eax, esi
- mov ebx, [esp+36]
- mov dword [ebx], 0x00
-
- pop edi
- pop esi
- pop ebx
- ret
-%endif
-
+ ret
\ No newline at end of file
--- a/test/encoder/EncUT_Sample.cpp
+++ b/test/encoder/EncUT_Sample.cpp
@@ -9,7 +9,136 @@
#include "sad_common.h"
using namespace WelsSVCEnc;
+#ifdef X86_ASM
+TEST(IntraSadSatdFuncTest, WelsIntra16x16Combined3Sad_ssse3){
+ const int32_t iLineSizeDec = 32;
+ const int32_t iLineSizeEnc = 32;
+ int32_t tmpa, tmpb;
+ int32_t iBestMode_c, iBestMode_a, iLambda = 50;
+ CMemoryAlign cMemoryAlign(0);
+ int32_t iCpuCores = 0;
+ uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect(&iCpuCores);
+ if (0 == (m_uiCpuFeatureFlag & WELS_CPU_SSSE3))
+ return;
+ uint8_t* pDec = (uint8_t *)cMemoryAlign.WelsMalloc(iLineSizeDec<<5,"pDec");
+ uint8_t* pEnc = (uint8_t *)cMemoryAlign.WelsMalloc(iLineSizeEnc<<5,"pEnc");
+ uint8_t* pDst = (uint8_t *)cMemoryAlign.WelsMalloc(512,"pDst");
+ srand((uint32_t)time(NULL));
+ for(int i=0;i<(iLineSizeDec<<5);i++)
+ pDec[i]=rand()%256;
+ for(int i=0;i<(iLineSizeEnc<<5);i++)
+ pEnc[i]=rand()%256;
+ for(int i=0;i<512;i++)
+ pDst[i]=rand()%256;
+ tmpa = WelsSampleSadIntra16x16Combined3_c(pDec+128, iLineSizeDec, pEnc,iLineSizeEnc,&iBestMode_c, iLambda, pDst);
+ tmpb = WelsIntra16x16Combined3Sad_ssse3(pDec+128, iLineSizeDec, pEnc,iLineSizeEnc,&iBestMode_a, iLambda, pDst);
+
+ ASSERT_EQ(tmpa, tmpb);
+ ASSERT_EQ(iBestMode_c, iBestMode_a);
+
+ cMemoryAlign.WelsFree(pDec,"pDec");
+ cMemoryAlign.WelsFree(pEnc,"pEnc");
+ cMemoryAlign.WelsFree(pDst,"pDst");
+}
+
+TEST(IntraSadSatdFuncTest, WelsIntra16x16Combined3Satd_sse41){
+ const int32_t iLineSizeDec = 32;
+ const int32_t iLineSizeEnc = 32;
+ int32_t tmpa, tmpb;
+ int32_t iBestMode_c, iBestMode_a, iLambda = 50;
+ CMemoryAlign cMemoryAlign(0);
+ int32_t iCpuCores = 0;
+ uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect(&iCpuCores);
+ if (0 == (m_uiCpuFeatureFlag & WELS_CPU_SSE41))
+ return;
+ uint8_t* pDec = (uint8_t *)cMemoryAlign.WelsMalloc(iLineSizeDec<<5,"pDec");
+ uint8_t* pEnc = (uint8_t *)cMemoryAlign.WelsMalloc(iLineSizeEnc<<5,"pEnc");
+ uint8_t* pDst = (uint8_t *)cMemoryAlign.WelsMalloc(512,"pDst");
+ srand((uint32_t)time(NULL));
+ for(int i=0;i<(iLineSizeDec<<5);i++)
+ pDec[i]=rand()%256;
+ for(int i=0;i<(iLineSizeEnc<<5);i++)
+ pEnc[i]=rand()%256;
+ for(int i=0;i<512;i++)
+ pDst[i]=rand()%256;
+ tmpa = WelsSampleSatdIntra16x16Combined3_c(pDec+128, iLineSizeDec, pEnc,iLineSizeEnc,&iBestMode_c, iLambda, pDst);
+ tmpb = WelsIntra16x16Combined3Satd_sse41(pDec+128, iLineSizeDec, pEnc,iLineSizeEnc,&iBestMode_a, iLambda, pDst);
+ ASSERT_EQ(tmpa, tmpb);
+ ASSERT_EQ(iBestMode_c, iBestMode_a);
+ cMemoryAlign.WelsFree(pDec,"pDec");
+ cMemoryAlign.WelsFree(pEnc,"pEnc");
+ cMemoryAlign.WelsFree(pDst,"pDst");
+}
+
+TEST(IntraSadSatdFuncTest, WelsSampleSatdThree4x4_sse2){
+ const int32_t iLineSizeDec = 32;
+ const int32_t iLineSizeEnc = 32;
+ int32_t tmpa, tmpb;
+ int32_t iBestMode_c, iBestMode_a, iLambda = 50;
+ int32_t lambda[2] = {iLambda << 2, iLambda};
+ int32_t iPredMode = rand()%3;
+ CMemoryAlign cMemoryAlign(0);
+ int32_t iCpuCores = 0;
+ uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect(&iCpuCores);
+ if (0 == (m_uiCpuFeatureFlag & WELS_CPU_SSE2))
+ return;
+ uint8_t* pDec = (uint8_t *)cMemoryAlign.WelsMalloc(iLineSizeDec<<5,"pDec");
+ uint8_t* pEnc = (uint8_t *)cMemoryAlign.WelsMalloc(iLineSizeEnc<<5,"pEnc");
+ uint8_t* pDst = (uint8_t *)cMemoryAlign.WelsMalloc(512,"pDst");
+ srand((uint32_t)time(NULL));
+ for(int i=0;i<(iLineSizeDec<<5);i++)
+ pDec[i]=rand()%256;
+ for(int i=0;i<(iLineSizeEnc<<5);i++)
+ pEnc[i]=rand()%256;
+ for(int i=0;i<512;i++)
+ pDst[i]=rand()%256;
+ tmpa = WelsSampleSatdIntra4x4Combined3_c(pDec+128, iLineSizeDec, pEnc,iLineSizeEnc, pDst, &iBestMode_c, lambda[iPredMode == 2], lambda[iPredMode == 1], lambda[iPredMode == 0]);
+ tmpb = WelsSampleSatdThree4x4_sse2(pDec+128, iLineSizeDec, pEnc,iLineSizeEnc, pDst, &iBestMode_a, lambda[iPredMode == 2], lambda[iPredMode == 1], lambda[iPredMode == 0]);
+ ASSERT_EQ(tmpa, tmpb);
+ ASSERT_EQ(iBestMode_c, iBestMode_a);
+ cMemoryAlign.WelsFree(pDec,"pDec");
+ cMemoryAlign.WelsFree(pEnc,"pEnc");
+ cMemoryAlign.WelsFree(pDst,"pDst");
+}
+
+TEST(IntraSadSatdFuncTest, WelsIntraChroma8x8Combined3Satd_sse41){
+ const int32_t iLineSizeDec = 32;
+ const int32_t iLineSizeEnc = 32;
+ int32_t tmpa, tmpb;
+ int32_t iBestMode_c, iBestMode_a, iLambda = 50;
+ CMemoryAlign cMemoryAlign(0);
+ int32_t iCpuCores = 0;
+ uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect(&iCpuCores);
+ if (0 == (m_uiCpuFeatureFlag & WELS_CPU_SSE41))
+ return;
+ uint8_t* pDecCb = (uint8_t *)cMemoryAlign.WelsMalloc(iLineSizeDec<<5,"pDecCb");
+ uint8_t* pEncCb = (uint8_t *)cMemoryAlign.WelsMalloc(iLineSizeEnc<<5,"pEncCb");
+ uint8_t* pDecCr = (uint8_t *)cMemoryAlign.WelsMalloc(iLineSizeDec<<5,"pDecCr");
+ uint8_t* pEncCr = (uint8_t *)cMemoryAlign.WelsMalloc(iLineSizeEnc<<5,"pEncCr");
+ uint8_t* pDstChma = (uint8_t *)cMemoryAlign.WelsMalloc(512,"pDstChma");
+ srand((uint32_t)time(NULL));
+ for(int i=0;i<(iLineSizeDec<<5);i++){
+ pDecCb[i]=rand()%256;
+ pDecCr[i]=rand()%256;
+ }
+ for(int i=0;i<(iLineSizeEnc<<5);i++){
+ pEncCb[i]=rand()%256;
+ pEncCr[i]=rand()%256;
+ }
+ for(int i=0;i<512;i++)
+ pDstChma[i]=rand()%256;
+ tmpa = WelsSampleSatdIntra8x8Combined3_c(pDecCb+128, iLineSizeDec, pEncCb,iLineSizeEnc,&iBestMode_c, iLambda, pDstChma, pDecCr+128, pEncCr);
+ tmpb = WelsIntraChroma8x8Combined3Satd_sse41(pDecCb+128, iLineSizeDec, pEncCb,iLineSizeEnc,&iBestMode_a, iLambda, pDstChma, pDecCr+128, pEncCr);
+ ASSERT_EQ(tmpa, tmpb);
+ ASSERT_EQ(iBestMode_c, iBestMode_a);
+ cMemoryAlign.WelsFree(pDecCb,"pDecCb");
+ cMemoryAlign.WelsFree(pEncCb,"pEncCb");
+ cMemoryAlign.WelsFree(pDecCr,"pDecCr");
+ cMemoryAlign.WelsFree(pEncCr,"pEncCr");
+ cMemoryAlign.WelsFree(pDstChma,"pDstChma");
+}
+#endif
#define ASSERT_MEMORY_FAIL2X(A, B) \
if (NULL == B) { \
pMemAlign->WelsFree(A, "Sad_SrcA");\