ref: 9230b497282ada7649b2ac4cfd51a7b5d9e2f75d
parent: 301b06ad363e964daccaeb823e88de528a55c6a6
parent: 65b339815efb168f48e256231299a932f397fb51
author: Ethan Hugg <[email protected]>
date: Mon Jan 13 16:21:17 EST 2014
Merge pull request #97 from mstorsjo/asm-source-cleanup

Make all asm sources consistently use unix newlines and remove trailing whitespace
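Note: a cleanup of this kind is normally scripted rather than done by hand. As a reference only (this is a hypothetical helper, not the command actually used for this PR), a minimal Python sketch that performs the same normalization — CRLF/CR to LF plus stripping trailing spaces and tabs — over every .asm file under a directory might look like this:

    import sys
    from pathlib import Path

    def normalize(path: Path) -> bool:
        """Convert CRLF/CR line endings to LF and strip trailing
        whitespace from every line. Returns True if the file changed."""
        raw = path.read_bytes()
        # Unify line endings first so the per-line strip sees clean lines.
        lines = raw.replace(b"\r\n", b"\n").replace(b"\r", b"\n").split(b"\n")
        cleaned = b"\n".join(line.rstrip(b" \t") for line in lines)
        if cleaned != raw:
            path.write_bytes(cleaned)
            return True
        return False

    if __name__ == "__main__":
        # Usage (hypothetical): python normalize_asm.py codec/
        root = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(".")
        for asm in sorted(root.rglob("*.asm")):
            if normalize(asm):
                print(f"normalized {asm}")

Because only line endings and trailing whitespace change, every hunk below shows removed and re-added lines that are byte-identical except for invisible characters, as in the asm_inc.asm and cpuid.asm changes that follow.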
--- a/codec/common/asm_inc.asm
+++ b/codec/common/asm_inc.asm
@@ -154,7 +154,7 @@
%define PUSHRFLAGS pushfq
%define POPRFLAGS popfq
%define retrq rax
-%define retrd eax
+%define retrd eax
%elifdef X86_32 ; X86_32 ;************************************
@@ -233,7 +233,7 @@
%macro LOAD_4_PARA 0
%ifdef X86_32
push r3
- %assign push_num push_num+1
+ %assign push_num push_num+1
mov r0, [esp + push_num*4 + 4]
mov r1, [esp + push_num*4 + 8]
mov r2, [esp + push_num*4 + 12]
@@ -245,7 +245,7 @@
%ifdef X86_32
push r3
push r4
- %assign push_num push_num+2
+ %assign push_num push_num+2
mov r0, [esp + push_num*4 + 4]
mov r1, [esp + push_num*4 + 8]
mov r2, [esp + push_num*4 + 12]
@@ -261,7 +261,7 @@
push r3
push r4
push r5
- %assign push_num push_num+3
+ %assign push_num push_num+3
mov r0, [esp + push_num*4 + 4]
mov r1, [esp + push_num*4 + 8]
mov r2, [esp + push_num*4 + 12]
@@ -280,7 +280,7 @@
push r4
push r5
push r6
- %assign push_num push_num+4
+ %assign push_num push_num+4
mov r0, [esp + push_num*4 + 4]
mov r1, [esp + push_num*4 + 8]
mov r2, [esp + push_num*4 + 12]
@@ -334,7 +334,7 @@
movsx %1, %2
%endif
%endmacro
-
+
%macro WELS_EXTERN 1
%ifdef PREFIX
global _%1
--- a/codec/common/cpuid.asm
+++ b/codec/common/cpuid.asm
@@ -81,17 +81,17 @@
%ifdef WIN64
WelsCPUId:
- push rbx
- push rdx
-
+ push rbx
+ push rdx
+
mov eax, ecx
mov rcx, [r9]
- cpuid
+ cpuid
mov [r9], ecx
mov [r8], ebx
- mov rcx, [rsp + 2*8 + 40]
+ mov rcx, [rsp + 2*8 + 40]
mov [rcx], edx
- pop rdx
+ pop rdx
mov [rdx], eax
pop rbx
@@ -103,8 +103,8 @@
push rcx
push rdx
- mov eax, edi
- mov rcx, [rcx]
+ mov eax, edi
+ mov rcx, [rcx]
cpuid
mov [r8], edx
pop rdx
@@ -156,9 +156,9 @@
%elifdef UNIX64
mov eax, edi
mov ecx, esi
-%else
+%else
mov eax, [esp+4]
- mov ecx, [esp+8]
+ mov ecx, [esp+8]
%endif
; refer to detection of AVX addressed in INTEL AVX manual document
--- a/codec/common/deblock.asm
+++ b/codec/common/deblock.asm
@@ -57,264 +57,264 @@
SECTION .text
-%ifdef WIN64
+%ifdef WIN64
WELS_EXTERN DeblockLumaLt4V_sse2
DeblockLumaLt4V_sse2:
- push rbp
+ push rbp
mov r11,[rsp + 16 + 20h] ; pTC
- sub rsp,1B0h
- lea rbp,[rsp+20h]
- movd xmm4,r8d
- movd xmm2,r9d
- mov qword [rbp+180h],r12
- mov r10,rcx
- movsxd r12,edx
- add edx,edx
- movsxd rdx,edx
- sub r10,r12
- movsx r8d,byte [r11]
- pxor xmm3,xmm3
- punpcklwd xmm2,xmm2
- movaps [rbp+50h],xmm14
- lea rax,[r12+r12*2]
- movdqa xmm14,[rdx+rcx]
- neg rax
- pshufd xmm0,xmm2,0
- movd xmm2,r8d
- movsx edx,byte [r11+1]
- movsx r8d,byte [r11+2]
- movsx r11d,byte [r11+3]
- movaps [rbp+70h],xmm12
- movd xmm1,edx
- movaps [rbp+80h],xmm11
- movd xmm12,r8d
- movd xmm11,r11d
- movdqa xmm5, [rax+rcx]
- lea rax,[r12+r12]
- punpcklwd xmm12,xmm12
- neg rax
- punpcklwd xmm11,xmm11
- movaps [rbp],xmm8
- movdqa xmm8, [r10]
- punpcklwd xmm2,xmm2
- punpcklwd xmm1,xmm1
- punpcklqdq xmm12,xmm12
- punpcklqdq xmm11,xmm11
- punpcklqdq xmm2,xmm2
- punpcklqdq xmm1,xmm1
- shufps xmm12,xmm11,88h
- movdqa xmm11,xmm8
- movaps [rbp+30h],xmm9
- movdqa xmm9,[rcx]
- shufps xmm2,xmm1,88h
- movdqa xmm1,xmm5
- punpcklbw xmm11,xmm3
- movaps [rbp+20h],xmm6
- movaps [rbp+60h],xmm13
- movdqa xmm13,xmm11
- movaps [rbp+90h],xmm10
- movdqa xmm10,xmm9
- movdqa xmm6,[rax+rcx]
- punpcklbw xmm1,xmm3
- movaps [rbp+0A0h],xmm12
- psubw xmm13,xmm1
- movaps [rbp+40h],xmm15
- movdqa xmm15,xmm14
- movaps [rbp+10h],xmm7
- movdqa xmm7,xmm6
- punpcklbw xmm10,xmm3
- movdqa xmm12,[r12+rcx]
- punpcklbw xmm7,xmm3
- punpcklbw xmm12,xmm3
- punpcklbw xmm15,xmm3
- pabsw xmm3,xmm13
- movdqa xmm13,xmm10
- psubw xmm13,xmm15
- movdqa [rbp+0F0h],xmm15
- pabsw xmm15,xmm13
- movdqa xmm13,xmm11
- movdqa [rbp+0B0h],xmm1
- movdqa xmm1,xmm0
- pavgw xmm13,xmm10
- pcmpgtw xmm1,xmm3
- movdqa [rbp+120h],xmm13
- movaps xmm13,xmm2
- punpcklwd xmm4,xmm4
- movdqa xmm3,xmm0
- movdqa [rbp+100h],xmm1
- psubw xmm13,xmm1
- movdqa xmm1,xmm10
- pcmpgtw xmm3,xmm15
- pshufd xmm4,xmm4,0
- psubw xmm1,xmm11
- movdqa [rbp+0D0h],xmm10
- psubw xmm13,xmm3
- movdqa [rbp+110h],xmm3
- pabsw xmm15,xmm1
- movdqa xmm3,xmm4
- psubw xmm10,xmm12
- pcmpgtw xmm3,xmm15
- pabsw xmm15,xmm10
- movdqa xmm10,xmm0
- psllw xmm1,2
- movdqa [rbp+0C0h],xmm11
- psubw xmm11,xmm7
- pcmpgtw xmm10,xmm15
- pabsw xmm11,xmm11
- movdqa xmm15,xmm0
- pand xmm3,xmm10
- pcmpgtw xmm15,xmm11
- movaps xmm11,xmm2
- pxor xmm10,xmm10
- pand xmm3,xmm15
- pcmpgtw xmm11,xmm10
- pcmpeqw xmm10,xmm2
- por xmm11,xmm10
- pand xmm3,xmm11
- movdqa xmm11,xmm7
- psubw xmm11,xmm12
- pxor xmm15,xmm15
- paddw xmm11,xmm1
- psubw xmm15,xmm13
- movdqa [rbp+0E0h],xmm12
- paddw xmm11,[FOUR_16B_SSE2]
- pxor xmm12,xmm12
- psraw xmm11,3
- punpckhbw xmm8,xmm12
- pmaxsw xmm15,xmm11
- punpckhbw xmm5,xmm12
- movdqa xmm11,xmm8
- pminsw xmm13,xmm15
- psubw xmm11,xmm5
- punpckhbw xmm9,xmm12
- pand xmm13,xmm3
- movdqa [rbp+130h],xmm13
- pabsw xmm13,xmm11
- punpckhbw xmm14,xmm12
- movdqa xmm11,xmm9
- psubw xmm11,xmm14
- movdqa xmm15,xmm0
- movdqa [rbp+140h],xmm14
- pabsw xmm14,xmm11
- movdqa xmm11,xmm8
- pcmpgtw xmm15,xmm14
- movdqa xmm1,[r12+rcx]
- pavgw xmm11,xmm9
- movdqa [rbp+170h],xmm11
- movdqa xmm10,xmm9
- punpckhbw xmm6,xmm12
- psubw xmm10,xmm8
- punpckhbw xmm1,xmm12
- movdqa xmm12,xmm0
- movaps xmm11,[rbp+0A0h]
- pcmpgtw xmm12,xmm13
- movaps xmm13,xmm11
- psubw xmm13,xmm12
- movdqa [rbp+160h],xmm15
- psubw xmm13,xmm15
- movdqa xmm15,xmm9
- psubw xmm15,xmm1
- movdqa [rbp+150h],xmm12
- pabsw xmm12,xmm10
- pabsw xmm14,xmm15
- movdqa xmm15,xmm8
- pcmpgtw xmm4,xmm12
- movdqa xmm12,xmm0
- psubw xmm15,xmm6
- pcmpgtw xmm12,xmm14
- pabsw xmm14,xmm15
- psllw xmm10,2
- pcmpgtw xmm0,xmm14
- movdqa xmm14,xmm6
- psubw xmm14,xmm1
- pand xmm4,xmm12
- paddw xmm14,xmm10
- pand xmm4,xmm0
- paddw xmm14,[FOUR_16B_SSE2]
- pxor xmm15,xmm15
- movaps xmm12,xmm11
- psubw xmm15,xmm13
- pxor xmm0,xmm0
- psraw xmm14,3
- pcmpgtw xmm12,xmm0
- pcmpeqw xmm0,xmm11
- pmaxsw xmm15,xmm14
- por xmm12,xmm0
- movdqa xmm0,[rbp+120h]
- pminsw xmm13,xmm15
- movdqa xmm15,[rbp+0B0h]
- movdqa xmm10,xmm7
- pand xmm4,xmm12
- paddw xmm15,xmm0
- pxor xmm12,xmm12
- paddw xmm10,xmm7
- movdqa xmm14,xmm12
- psubw xmm15,xmm10
- psubw xmm14,xmm2
- psraw xmm15,1
- pmaxsw xmm15,xmm14
- movdqa xmm10,xmm6
- pminsw xmm15,xmm2
- paddw xmm10,xmm6
- pand xmm15,xmm3
- psubw xmm12,xmm11
- pand xmm15,[rbp+100h]
- pand xmm13,xmm4
- paddw xmm7,xmm15
- paddw xmm8,xmm13
- movdqa xmm15,[rbp+170h]
- psubw xmm9,xmm13
- paddw xmm5,xmm15
- psubw xmm5,xmm10
- psraw xmm5,1
- pmaxsw xmm5,xmm12
- pminsw xmm5,xmm11
- pand xmm5,xmm4
- pand xmm5,[rbp+150h]
- paddw xmm6,xmm5
- movdqa xmm5,[rbp+0C0h]
- packuswb xmm7,xmm6
- movdqa xmm6,[rbp+130h]
- paddw xmm5,xmm6
- packuswb xmm5,xmm8
- movdqa xmm8,[rbp+0D0h]
- psubw xmm8,xmm6
- movdqa xmm6,[rbp+0F0h]
- paddw xmm6,xmm0
- movdqa xmm0,[rbp+0E0h]
- packuswb xmm8,xmm9
- movdqa xmm9,xmm0
- paddw xmm9,xmm0
- psubw xmm6,xmm9
- psraw xmm6,1
- pmaxsw xmm14,xmm6
- pminsw xmm2,xmm14
- pand xmm2,xmm3
- pand xmm2,[rbp+110h]
- paddw xmm0,xmm2
- movdqa xmm2,[rbp+140h]
- paddw xmm2,xmm15
- movdqa xmm15,xmm1
- paddw xmm15,xmm1
- psubw xmm2,xmm15
- psraw xmm2,1
- pmaxsw xmm12,xmm2
- pminsw xmm11,xmm12
- pand xmm11,xmm4
- pand xmm11,[rbp+160h]
- paddw xmm1,xmm11
- movdqa [rax+rcx],xmm7
- movdqa [r10],xmm5
- packuswb xmm0,xmm1
- movdqa [rcx],xmm8
- movdqa [r12+rcx],xmm0
- mov r12,qword [rbp+180h]
- lea rsp,[rbp+190h]
- pop rbp
- ret
+ sub rsp,1B0h
+ lea rbp,[rsp+20h]
+ movd xmm4,r8d
+ movd xmm2,r9d
+ mov qword [rbp+180h],r12
+ mov r10,rcx
+ movsxd r12,edx
+ add edx,edx
+ movsxd rdx,edx
+ sub r10,r12
+ movsx r8d,byte [r11]
+ pxor xmm3,xmm3
+ punpcklwd xmm2,xmm2
+ movaps [rbp+50h],xmm14
+ lea rax,[r12+r12*2]
+ movdqa xmm14,[rdx+rcx]
+ neg rax
+ pshufd xmm0,xmm2,0
+ movd xmm2,r8d
+ movsx edx,byte [r11+1]
+ movsx r8d,byte [r11+2]
+ movsx r11d,byte [r11+3]
+ movaps [rbp+70h],xmm12
+ movd xmm1,edx
+ movaps [rbp+80h],xmm11
+ movd xmm12,r8d
+ movd xmm11,r11d
+ movdqa xmm5, [rax+rcx]
+ lea rax,[r12+r12]
+ punpcklwd xmm12,xmm12
+ neg rax
+ punpcklwd xmm11,xmm11
+ movaps [rbp],xmm8
+ movdqa xmm8, [r10]
+ punpcklwd xmm2,xmm2
+ punpcklwd xmm1,xmm1
+ punpcklqdq xmm12,xmm12
+ punpcklqdq xmm11,xmm11
+ punpcklqdq xmm2,xmm2
+ punpcklqdq xmm1,xmm1
+ shufps xmm12,xmm11,88h
+ movdqa xmm11,xmm8
+ movaps [rbp+30h],xmm9
+ movdqa xmm9,[rcx]
+ shufps xmm2,xmm1,88h
+ movdqa xmm1,xmm5
+ punpcklbw xmm11,xmm3
+ movaps [rbp+20h],xmm6
+ movaps [rbp+60h],xmm13
+ movdqa xmm13,xmm11
+ movaps [rbp+90h],xmm10
+ movdqa xmm10,xmm9
+ movdqa xmm6,[rax+rcx]
+ punpcklbw xmm1,xmm3
+ movaps [rbp+0A0h],xmm12
+ psubw xmm13,xmm1
+ movaps [rbp+40h],xmm15
+ movdqa xmm15,xmm14
+ movaps [rbp+10h],xmm7
+ movdqa xmm7,xmm6
+ punpcklbw xmm10,xmm3
+ movdqa xmm12,[r12+rcx]
+ punpcklbw xmm7,xmm3
+ punpcklbw xmm12,xmm3
+ punpcklbw xmm15,xmm3
+ pabsw xmm3,xmm13
+ movdqa xmm13,xmm10
+ psubw xmm13,xmm15
+ movdqa [rbp+0F0h],xmm15
+ pabsw xmm15,xmm13
+ movdqa xmm13,xmm11
+ movdqa [rbp+0B0h],xmm1
+ movdqa xmm1,xmm0
+ pavgw xmm13,xmm10
+ pcmpgtw xmm1,xmm3
+ movdqa [rbp+120h],xmm13
+ movaps xmm13,xmm2
+ punpcklwd xmm4,xmm4
+ movdqa xmm3,xmm0
+ movdqa [rbp+100h],xmm1
+ psubw xmm13,xmm1
+ movdqa xmm1,xmm10
+ pcmpgtw xmm3,xmm15
+ pshufd xmm4,xmm4,0
+ psubw xmm1,xmm11
+ movdqa [rbp+0D0h],xmm10
+ psubw xmm13,xmm3
+ movdqa [rbp+110h],xmm3
+ pabsw xmm15,xmm1
+ movdqa xmm3,xmm4
+ psubw xmm10,xmm12
+ pcmpgtw xmm3,xmm15
+ pabsw xmm15,xmm10
+ movdqa xmm10,xmm0
+ psllw xmm1,2
+ movdqa [rbp+0C0h],xmm11
+ psubw xmm11,xmm7
+ pcmpgtw xmm10,xmm15
+ pabsw xmm11,xmm11
+ movdqa xmm15,xmm0
+ pand xmm3,xmm10
+ pcmpgtw xmm15,xmm11
+ movaps xmm11,xmm2
+ pxor xmm10,xmm10
+ pand xmm3,xmm15
+ pcmpgtw xmm11,xmm10
+ pcmpeqw xmm10,xmm2
+ por xmm11,xmm10
+ pand xmm3,xmm11
+ movdqa xmm11,xmm7
+ psubw xmm11,xmm12
+ pxor xmm15,xmm15
+ paddw xmm11,xmm1
+ psubw xmm15,xmm13
+ movdqa [rbp+0E0h],xmm12
+ paddw xmm11,[FOUR_16B_SSE2]
+ pxor xmm12,xmm12
+ psraw xmm11,3
+ punpckhbw xmm8,xmm12
+ pmaxsw xmm15,xmm11
+ punpckhbw xmm5,xmm12
+ movdqa xmm11,xmm8
+ pminsw xmm13,xmm15
+ psubw xmm11,xmm5
+ punpckhbw xmm9,xmm12
+ pand xmm13,xmm3
+ movdqa [rbp+130h],xmm13
+ pabsw xmm13,xmm11
+ punpckhbw xmm14,xmm12
+ movdqa xmm11,xmm9
+ psubw xmm11,xmm14
+ movdqa xmm15,xmm0
+ movdqa [rbp+140h],xmm14
+ pabsw xmm14,xmm11
+ movdqa xmm11,xmm8
+ pcmpgtw xmm15,xmm14
+ movdqa xmm1,[r12+rcx]
+ pavgw xmm11,xmm9
+ movdqa [rbp+170h],xmm11
+ movdqa xmm10,xmm9
+ punpckhbw xmm6,xmm12
+ psubw xmm10,xmm8
+ punpckhbw xmm1,xmm12
+ movdqa xmm12,xmm0
+ movaps xmm11,[rbp+0A0h]
+ pcmpgtw xmm12,xmm13
+ movaps xmm13,xmm11
+ psubw xmm13,xmm12
+ movdqa [rbp+160h],xmm15
+ psubw xmm13,xmm15
+ movdqa xmm15,xmm9
+ psubw xmm15,xmm1
+ movdqa [rbp+150h],xmm12
+ pabsw xmm12,xmm10
+ pabsw xmm14,xmm15
+ movdqa xmm15,xmm8
+ pcmpgtw xmm4,xmm12
+ movdqa xmm12,xmm0
+ psubw xmm15,xmm6
+ pcmpgtw xmm12,xmm14
+ pabsw xmm14,xmm15
+ psllw xmm10,2
+ pcmpgtw xmm0,xmm14
+ movdqa xmm14,xmm6
+ psubw xmm14,xmm1
+ pand xmm4,xmm12
+ paddw xmm14,xmm10
+ pand xmm4,xmm0
+ paddw xmm14,[FOUR_16B_SSE2]
+ pxor xmm15,xmm15
+ movaps xmm12,xmm11
+ psubw xmm15,xmm13
+ pxor xmm0,xmm0
+ psraw xmm14,3
+ pcmpgtw xmm12,xmm0
+ pcmpeqw xmm0,xmm11
+ pmaxsw xmm15,xmm14
+ por xmm12,xmm0
+ movdqa xmm0,[rbp+120h]
+ pminsw xmm13,xmm15
+ movdqa xmm15,[rbp+0B0h]
+ movdqa xmm10,xmm7
+ pand xmm4,xmm12
+ paddw xmm15,xmm0
+ pxor xmm12,xmm12
+ paddw xmm10,xmm7
+ movdqa xmm14,xmm12
+ psubw xmm15,xmm10
+ psubw xmm14,xmm2
+ psraw xmm15,1
+ pmaxsw xmm15,xmm14
+ movdqa xmm10,xmm6
+ pminsw xmm15,xmm2
+ paddw xmm10,xmm6
+ pand xmm15,xmm3
+ psubw xmm12,xmm11
+ pand xmm15,[rbp+100h]
+ pand xmm13,xmm4
+ paddw xmm7,xmm15
+ paddw xmm8,xmm13
+ movdqa xmm15,[rbp+170h]
+ psubw xmm9,xmm13
+ paddw xmm5,xmm15
+ psubw xmm5,xmm10
+ psraw xmm5,1
+ pmaxsw xmm5,xmm12
+ pminsw xmm5,xmm11
+ pand xmm5,xmm4
+ pand xmm5,[rbp+150h]
+ paddw xmm6,xmm5
+ movdqa xmm5,[rbp+0C0h]
+ packuswb xmm7,xmm6
+ movdqa xmm6,[rbp+130h]
+ paddw xmm5,xmm6
+ packuswb xmm5,xmm8
+ movdqa xmm8,[rbp+0D0h]
+ psubw xmm8,xmm6
+ movdqa xmm6,[rbp+0F0h]
+ paddw xmm6,xmm0
+ movdqa xmm0,[rbp+0E0h]
+ packuswb xmm8,xmm9
+ movdqa xmm9,xmm0
+ paddw xmm9,xmm0
+ psubw xmm6,xmm9
+ psraw xmm6,1
+ pmaxsw xmm14,xmm6
+ pminsw xmm2,xmm14
+ pand xmm2,xmm3
+ pand xmm2,[rbp+110h]
+ paddw xmm0,xmm2
+ movdqa xmm2,[rbp+140h]
+ paddw xmm2,xmm15
+ movdqa xmm15,xmm1
+ paddw xmm15,xmm1
+ psubw xmm2,xmm15
+ psraw xmm2,1
+ pmaxsw xmm12,xmm2
+ pminsw xmm11,xmm12
+ pand xmm11,xmm4
+ pand xmm11,[rbp+160h]
+ paddw xmm1,xmm11
+ movdqa [rax+rcx],xmm7
+ movdqa [r10],xmm5
+ packuswb xmm0,xmm1
+ movdqa [rcx],xmm8
+ movdqa [r12+rcx],xmm0
+ mov r12,qword [rbp+180h]
+ lea rsp,[rbp+190h]
+ pop rbp
+ ret
WELS_EXTERN DeblockLumaEq4V_sse2
@@ -321,462 +321,462 @@
ALIGN 16
DeblockLumaEq4V_sse2:
- mov rax,rsp
- push rbx
- push rbp
- push rsi
- push rdi
- sub rsp,1D8h
- movaps [rax-38h],xmm6
- movaps [rax-48h],xmm7
- movaps [rax-58h],xmm8
- pxor xmm1,xmm1
- movsxd r10,edx
- mov rbp,rcx
- mov r11d,r8d
- mov rdx,rcx
- mov rdi,rbp
- mov rbx,rbp
- movdqa xmm5,[rbp]
- movaps [rax-68h],xmm9
- movaps [rax-78h],xmm10
- punpcklbw xmm5,xmm1
- movaps [rax-88h],xmm11
- movaps [rax-98h],xmm12
- movaps [rax-0A8h],xmm13
- movaps [rax-0B8h],xmm14
- movdqa xmm14,[r10+rbp]
- movaps [rax-0C8h],xmm15
- lea eax,[r10*4]
- movsxd r8,eax
- lea eax,[r10+r10*2]
- movsxd rcx,eax
- lea eax,[r10+r10]
- sub rdx,r8
- punpcklbw xmm14,xmm1
- movdqa [rsp+90h],xmm5
- movdqa [rsp+30h],xmm14
- movsxd rsi,eax
- movsx eax,r11w
- sub rdi,rcx
- sub rbx,rsi
- mov r8,rbp
- sub r8,r10
- movd xmm0,eax
- movsx eax,r9w
- movdqa xmm12,[rdi]
- movdqa xmm6, [rsi+rbp]
- movdqa xmm13,[rbx]
- punpcklwd xmm0,xmm0
- pshufd xmm11,xmm0,0
- punpcklbw xmm13,xmm1
- punpcklbw xmm6,xmm1
- movdqa xmm8,[r8]
- movd xmm0,eax
- movdqa xmm10,xmm11
- mov eax,2
- punpcklbw xmm8,xmm1
- punpcklbw xmm12,xmm1
- cwde
- punpcklwd xmm0,xmm0
- psraw xmm10,2
- movdqa xmm1,xmm8
- movdqa [rsp+0F0h],xmm13
- movdqa [rsp+0B0h],xmm8
- pshufd xmm7,xmm0,0
- psubw xmm1,xmm13
- movdqa xmm0,xmm5
- movdqa xmm4,xmm7
- movdqa xmm2,xmm7
- psubw xmm0,xmm8
- pabsw xmm3,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm5
- movdqa [rsp+40h],xmm7
- movdqa [rsp+60h],xmm6
- pcmpgtw xmm4,xmm0
- psubw xmm1,xmm14
- pabsw xmm0,xmm1
- pcmpgtw xmm2,xmm0
- pand xmm4,xmm2
- movdqa xmm0,xmm11
- pcmpgtw xmm0,xmm3
- pand xmm4,xmm0
- movd xmm0,eax
- movdqa [rsp+20h],xmm4
- punpcklwd xmm0,xmm0
- pshufd xmm2,xmm0,0
- paddw xmm10,xmm2
- movdqa [rsp+0A0h],xmm2
- movdqa xmm15,xmm7
- pxor xmm4,xmm4
- movdqa xmm0,xmm8
- psubw xmm0,xmm12
- mov eax,4
- pabsw xmm0,xmm0
- movdqa xmm1,xmm10
- cwde
- pcmpgtw xmm15,xmm0
- pcmpgtw xmm1,xmm3
- movdqa xmm3,xmm7
- movdqa xmm7,[rdx]
- movdqa xmm0,xmm5
- psubw xmm0,xmm6
- pand xmm15,xmm1
- punpcklbw xmm7,xmm4
- movdqa xmm9,xmm15
- pabsw xmm0,xmm0
- psllw xmm7,1
- pandn xmm9,xmm12
- pcmpgtw xmm3,xmm0
- paddw xmm7,xmm12
- movd xmm0,eax
- pand xmm3,xmm1
- paddw xmm7,xmm12
- punpcklwd xmm0,xmm0
- paddw xmm7,xmm12
- pshufd xmm1,xmm0,0
- paddw xmm7,xmm13
- movdqa xmm0,xmm3
- pandn xmm0,xmm6
- paddw xmm7,xmm8
- movdqa [rsp+70h],xmm1
- paddw xmm7,xmm5
- movdqa [rsp+120h],xmm0
- movdqa xmm0,[rcx+rbp]
- punpcklbw xmm0,xmm4
- paddw xmm7,xmm1
- movdqa xmm4,xmm15
- psllw xmm0,1
- psraw xmm7,3
- paddw xmm0,xmm6
- pand xmm7,xmm15
- paddw xmm0,xmm6
- paddw xmm0,xmm6
- paddw xmm0,xmm14
- movdqa xmm6,xmm15
- paddw xmm0,xmm5
- pandn xmm6,xmm13
- paddw xmm0,xmm8
- paddw xmm0,xmm1
- psraw xmm0,3
- movdqa xmm1,xmm12
- paddw xmm1,xmm13
- pand xmm0,xmm3
- movdqa [rsp+100h],xmm0
- movdqa xmm0,xmm8
- paddw xmm0,xmm5
- paddw xmm1,xmm0
- movdqa xmm0,xmm3
- paddw xmm1,xmm2
- psraw xmm1,2
- pandn xmm0,xmm14
- pand xmm4,xmm1
- movdqa [rsp+0E0h],xmm0
- movdqa xmm0,xmm5
- paddw xmm0,xmm8
- movdqa xmm1,[rsp+60h]
- paddw xmm1,xmm14
- movdqa xmm14,xmm3
- paddw xmm1,xmm0
- movdqa xmm0,xmm8
- paddw xmm0,[rsp+30h]
- paddw xmm1,xmm2
- psraw xmm1,2
- pand xmm14,xmm1
- movdqa xmm1,xmm13
- paddw xmm1,xmm13
- paddw xmm1,xmm0
- paddw xmm1,xmm2
- psraw xmm1,2
- movdqa xmm0,[rsp+30h]
- movdqa xmm2,xmm13
- movdqa xmm5,xmm15
- paddw xmm0,[rsp+70h]
- pandn xmm5,xmm1
- paddw xmm2,xmm8
- movdqa xmm8,[rsp+90h]
- movdqa xmm1,xmm12
- paddw xmm2,xmm8
- psllw xmm2,1
- paddw xmm2,xmm0
- paddw xmm1,xmm2
- movdqa xmm0,xmm8
- movdqa xmm8,xmm3
- movdqa xmm2,[rsp+30h]
- paddw xmm0,xmm13
- psraw xmm1,3
- pand xmm15,xmm1
- movdqa xmm1,xmm2
- paddw xmm1,xmm2
- paddw xmm2,[rsp+90h]
- paddw xmm2,[rsp+0B0h]
- paddw xmm1,xmm0
- movdqa xmm0,xmm13
- movdqa xmm13,[r8]
- paddw xmm0, [rsp+70h]
- paddw xmm1, [rsp+0A0h]
- psllw xmm2,1
- paddw xmm2,xmm0
- psraw xmm1,2
- movdqa xmm0, [rdi]
- pandn xmm8,xmm1
- movdqa xmm1, [rsp+60h]
- paddw xmm1,xmm2
- movdqa xmm2, [rbx]
- psraw xmm1,3
- pand xmm3,xmm1
- movdqa xmm1, [rbp]
- movdqa [rsp+0D0h],xmm3
- pxor xmm3,xmm3
- punpckhbw xmm0,xmm3
- punpckhbw xmm1,xmm3
- punpckhbw xmm13,xmm3
- movdqa [rsp+0C0h],xmm0
- movdqa xmm0,[r10+rbp]
- movdqa [rsp],xmm1
- punpckhbw xmm0,xmm3
- punpckhbw xmm2,xmm3
- movdqa [rsp+80h],xmm0
- movdqa xmm0,[rsi+rbp]
- movdqa [rsp+10h],xmm13
- punpckhbw xmm0,xmm3
- movdqa [rsp+50h],xmm0
- movdqa xmm0,xmm1
- movdqa xmm1,xmm13
- psubw xmm0,xmm13
- psubw xmm1,xmm2
- pabsw xmm3,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,[rsp]
- movdqa xmm13,[rsp+40h]
- movdqa [rsp+110h],xmm2
- psubw xmm1, [rsp+80h]
- pcmpgtw xmm13,xmm0
- pcmpgtw xmm11,xmm3
- pabsw xmm0,xmm1
- pcmpgtw xmm10,xmm3
- movdqa xmm1, [rsp+40h]
- movdqa xmm2,xmm1
- movdqa xmm3,xmm1
- pcmpgtw xmm2,xmm0
- movdqa xmm0, [rsp+10h]
- pand xmm13,xmm2
- pand xmm13,xmm11
- movdqa xmm11,[rsp+0C0h]
- psubw xmm0,xmm11
- pabsw xmm0,xmm0
- pcmpgtw xmm3,xmm0
- pand xmm3,xmm10
- movdqa xmm0,[rsp]
- psubw xmm0,[rsp+50h]
- movdqa xmm2,[rdx]
- pabsw xmm0,xmm0
- por xmm7,xmm9
- movdqa xmm9,[rsp+20h]
- pcmpgtw xmm1,xmm0
- pand xmm9,xmm7
- movdqa xmm7,[rsp+20h]
- movdqa xmm0,xmm7
- pandn xmm0,xmm12
- movdqa xmm12,[rsp+110h]
- pand xmm1,xmm10
- movdqa xmm10,[rsp+70h]
- movdqa [rsp+40h],xmm1
- movdqa xmm1,xmm13
- por xmm9,xmm0
- pxor xmm0,xmm0
- por xmm4,xmm6
- movdqa xmm6,xmm7
- punpckhbw xmm2,xmm0
- por xmm15,xmm5
- movdqa xmm5,[rsp+20h]
- movdqa xmm0,xmm3
- psllw xmm2,1
- pandn xmm0,xmm11
- pand xmm6,xmm4
- movdqa xmm4,[rsp]
- paddw xmm2,xmm11
- pand xmm5,xmm15
- movdqa xmm15,[rsp+20h]
- paddw xmm2,xmm11
- paddw xmm2,xmm11
- paddw xmm2,xmm12
- paddw xmm2,[rsp+10h]
- paddw xmm2,[rsp]
- paddw xmm2,xmm10
- psraw xmm2,3
- pand xmm2,xmm3
- por xmm2,xmm0
- pand xmm1,xmm2
- movdqa xmm0,xmm13
- movdqa xmm2,xmm11
- pandn xmm0,xmm11
- paddw xmm2,xmm12
- por xmm1,xmm0
- packuswb xmm9,xmm1
- movdqa xmm0,xmm7
- movdqa xmm7,[rsp+0A0h]
- pandn xmm0,[rsp+0F0h]
- movdqa xmm1,xmm3
- por xmm6,xmm0
- movdqa xmm0,[rsp+10h]
- paddw xmm0,xmm4
- paddw xmm2,xmm0
- paddw xmm2,xmm7
- movdqa xmm0,xmm3
- pandn xmm0,xmm12
- psraw xmm2,2
- pand xmm1,xmm2
- por xmm1,xmm0
- movdqa xmm2,xmm13
- movdqa xmm0,xmm13
- pand xmm2,xmm1
- pandn xmm0,xmm12
- movdqa xmm1,xmm12
- paddw xmm1,[rsp+10h]
- por xmm2,xmm0
- movdqa xmm0,xmm15
- pandn xmm0,[rsp+0B0h]
- paddw xmm1,xmm4
- packuswb xmm6,xmm2
- movdqa xmm2,xmm3
- psllw xmm1,1
- por xmm5,xmm0
- movdqa xmm0,[rsp+80h]
- paddw xmm0,xmm10
- paddw xmm1,xmm0
- paddw xmm11,xmm1
- psraw xmm11,3
- movdqa xmm1,xmm12
- pand xmm2,xmm11
- paddw xmm1,xmm12
- movdqa xmm11,[rsp+80h]
- movdqa xmm0, [rsp+10h]
- por xmm14,[rsp+0E0h]
- paddw xmm0,xmm11
- movdqa xmm4,xmm15
- paddw xmm1,xmm0
- movdqa xmm0,xmm13
- paddw xmm1,xmm7
- psraw xmm1,2
- pandn xmm3,xmm1
- por xmm2,xmm3
- movdqa xmm1,xmm13
- movdqa xmm3,[rsp+10h]
- pandn xmm0,xmm3
- pand xmm1,xmm2
- movdqa xmm2,xmm11
- paddw xmm2,[rsp]
- por xmm1,xmm0
- movdqa xmm0,[rsp+0D0h]
- por xmm0,xmm8
- paddw xmm2,xmm3
- packuswb xmm5,xmm1
- movdqa xmm8,[rsp+40h]
- movdqa xmm1,[rsp+50h]
- movdqa xmm3,xmm8
- pand xmm4,xmm0
- psllw xmm2,1
- movdqa xmm0,xmm15
- pandn xmm0,[rsp+90h]
- por xmm4,xmm0
- movdqa xmm0,xmm12
- paddw xmm0,xmm10
- paddw xmm2,xmm0
- paddw xmm1,xmm2
- movdqa xmm0,[rsp]
- movdqa xmm2,xmm11
- paddw xmm0,xmm12
- movdqa xmm12,[rsp]
- paddw xmm2,xmm11
- paddw xmm2,xmm0
- psraw xmm1,3
- movdqa xmm0,xmm8
- pand xmm3,xmm1
- paddw xmm2,xmm7
- movdqa xmm1,xmm13
- psraw xmm2,2
- pandn xmm0,xmm2
- por xmm3,xmm0
- movdqa xmm2,[rsp+50h]
- movdqa xmm0,xmm13
- pandn xmm0,xmm12
- pand xmm1,xmm3
- paddw xmm2,xmm11
- movdqa xmm3,xmm15
- por xmm1,xmm0
- pand xmm3,xmm14
- movdqa xmm14,[rsp+10h]
- movdqa xmm0,xmm15
- pandn xmm0,[rsp+30h]
- packuswb xmm4,xmm1
- movdqa xmm1,xmm8
- por xmm3,xmm0
- movdqa xmm0,xmm12
- paddw xmm0,xmm14
- paddw xmm2,xmm0
- paddw xmm2,xmm7
- movdqa xmm0,xmm8
- pandn xmm0,xmm11
- psraw xmm2,2
- pand xmm1,xmm2
- por xmm1,xmm0
- movdqa xmm2,xmm13
- movdqa xmm0,xmm13
- pandn xmm0,xmm11
- pand xmm2,xmm1
- movdqa xmm1,xmm15
- por xmm2,xmm0
- packuswb xmm3,xmm2
- movdqa xmm0,[rsp+100h]
- por xmm0,[rsp+120h]
- pand xmm1,xmm0
- movdqa xmm2,[rcx+rbp]
- movdqa xmm7,[rsp+50h]
- pandn xmm15,[rsp+60h]
- lea r11,[rsp+1D8h]
- pxor xmm0,xmm0
- por xmm1,xmm15
- movaps xmm15,[r11-0A8h]
- movdqa [rdi],xmm9
- movaps xmm9,[r11-48h]
- punpckhbw xmm2,xmm0
- psllw xmm2,1
- paddw xmm2,xmm7
- paddw xmm2,xmm7
- movdqa [rbx],xmm6
- movaps xmm6,[r11-18h]
- paddw xmm2,xmm7
- paddw xmm2,xmm11
- movaps xmm11,[r11-68h]
- paddw xmm2,xmm12
- movaps xmm12,[r11-78h]
- paddw xmm2,xmm14
- paddw xmm2,xmm10
- psraw xmm2,3
- movaps xmm10,[r11-58h]
- movaps xmm14,[r11-98h]
- movdqa xmm0,xmm13
- pand xmm2,xmm8
- pandn xmm8,xmm7
- pandn xmm13,xmm7
- por xmm2,xmm8
- movaps xmm7,[r11-28h]
- movaps xmm8,[r11-38h]
- movdqa [r8],xmm5
- pand xmm0,xmm2
- por xmm0,xmm13
- packuswb xmm1,xmm0
- movaps xmm13,[r11-88h]
- movdqa [rbp],xmm4
- movdqa [r10+rbp],xmm3
- movdqa [rsi+rbp],xmm1
- mov rsp,r11
- pop rdi
- pop rsi
- pop rbp
- pop rbx
+ mov rax,rsp
+ push rbx
+ push rbp
+ push rsi
+ push rdi
+ sub rsp,1D8h
+ movaps [rax-38h],xmm6
+ movaps [rax-48h],xmm7
+ movaps [rax-58h],xmm8
+ pxor xmm1,xmm1
+ movsxd r10,edx
+ mov rbp,rcx
+ mov r11d,r8d
+ mov rdx,rcx
+ mov rdi,rbp
+ mov rbx,rbp
+ movdqa xmm5,[rbp]
+ movaps [rax-68h],xmm9
+ movaps [rax-78h],xmm10
+ punpcklbw xmm5,xmm1
+ movaps [rax-88h],xmm11
+ movaps [rax-98h],xmm12
+ movaps [rax-0A8h],xmm13
+ movaps [rax-0B8h],xmm14
+ movdqa xmm14,[r10+rbp]
+ movaps [rax-0C8h],xmm15
+ lea eax,[r10*4]
+ movsxd r8,eax
+ lea eax,[r10+r10*2]
+ movsxd rcx,eax
+ lea eax,[r10+r10]
+ sub rdx,r8
+ punpcklbw xmm14,xmm1
+ movdqa [rsp+90h],xmm5
+ movdqa [rsp+30h],xmm14
+ movsxd rsi,eax
+ movsx eax,r11w
+ sub rdi,rcx
+ sub rbx,rsi
+ mov r8,rbp
+ sub r8,r10
+ movd xmm0,eax
+ movsx eax,r9w
+ movdqa xmm12,[rdi]
+ movdqa xmm6, [rsi+rbp]
+ movdqa xmm13,[rbx]
+ punpcklwd xmm0,xmm0
+ pshufd xmm11,xmm0,0
+ punpcklbw xmm13,xmm1
+ punpcklbw xmm6,xmm1
+ movdqa xmm8,[r8]
+ movd xmm0,eax
+ movdqa xmm10,xmm11
+ mov eax,2
+ punpcklbw xmm8,xmm1
+ punpcklbw xmm12,xmm1
+ cwde
+ punpcklwd xmm0,xmm0
+ psraw xmm10,2
+ movdqa xmm1,xmm8
+ movdqa [rsp+0F0h],xmm13
+ movdqa [rsp+0B0h],xmm8
+ pshufd xmm7,xmm0,0
+ psubw xmm1,xmm13
+ movdqa xmm0,xmm5
+ movdqa xmm4,xmm7
+ movdqa xmm2,xmm7
+ psubw xmm0,xmm8
+ pabsw xmm3,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm5
+ movdqa [rsp+40h],xmm7
+ movdqa [rsp+60h],xmm6
+ pcmpgtw xmm4,xmm0
+ psubw xmm1,xmm14
+ pabsw xmm0,xmm1
+ pcmpgtw xmm2,xmm0
+ pand xmm4,xmm2
+ movdqa xmm0,xmm11
+ pcmpgtw xmm0,xmm3
+ pand xmm4,xmm0
+ movd xmm0,eax
+ movdqa [rsp+20h],xmm4
+ punpcklwd xmm0,xmm0
+ pshufd xmm2,xmm0,0
+ paddw xmm10,xmm2
+ movdqa [rsp+0A0h],xmm2
+ movdqa xmm15,xmm7
+ pxor xmm4,xmm4
+ movdqa xmm0,xmm8
+ psubw xmm0,xmm12
+ mov eax,4
+ pabsw xmm0,xmm0
+ movdqa xmm1,xmm10
+ cwde
+ pcmpgtw xmm15,xmm0
+ pcmpgtw xmm1,xmm3
+ movdqa xmm3,xmm7
+ movdqa xmm7,[rdx]
+ movdqa xmm0,xmm5
+ psubw xmm0,xmm6
+ pand xmm15,xmm1
+ punpcklbw xmm7,xmm4
+ movdqa xmm9,xmm15
+ pabsw xmm0,xmm0
+ psllw xmm7,1
+ pandn xmm9,xmm12
+ pcmpgtw xmm3,xmm0
+ paddw xmm7,xmm12
+ movd xmm0,eax
+ pand xmm3,xmm1
+ paddw xmm7,xmm12
+ punpcklwd xmm0,xmm0
+ paddw xmm7,xmm12
+ pshufd xmm1,xmm0,0
+ paddw xmm7,xmm13
+ movdqa xmm0,xmm3
+ pandn xmm0,xmm6
+ paddw xmm7,xmm8
+ movdqa [rsp+70h],xmm1
+ paddw xmm7,xmm5
+ movdqa [rsp+120h],xmm0
+ movdqa xmm0,[rcx+rbp]
+ punpcklbw xmm0,xmm4
+ paddw xmm7,xmm1
+ movdqa xmm4,xmm15
+ psllw xmm0,1
+ psraw xmm7,3
+ paddw xmm0,xmm6
+ pand xmm7,xmm15
+ paddw xmm0,xmm6
+ paddw xmm0,xmm6
+ paddw xmm0,xmm14
+ movdqa xmm6,xmm15
+ paddw xmm0,xmm5
+ pandn xmm6,xmm13
+ paddw xmm0,xmm8
+ paddw xmm0,xmm1
+ psraw xmm0,3
+ movdqa xmm1,xmm12
+ paddw xmm1,xmm13
+ pand xmm0,xmm3
+ movdqa [rsp+100h],xmm0
+ movdqa xmm0,xmm8
+ paddw xmm0,xmm5
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm3
+ paddw xmm1,xmm2
+ psraw xmm1,2
+ pandn xmm0,xmm14
+ pand xmm4,xmm1
+ movdqa [rsp+0E0h],xmm0
+ movdqa xmm0,xmm5
+ paddw xmm0,xmm8
+ movdqa xmm1,[rsp+60h]
+ paddw xmm1,xmm14
+ movdqa xmm14,xmm3
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm8
+ paddw xmm0,[rsp+30h]
+ paddw xmm1,xmm2
+ psraw xmm1,2
+ pand xmm14,xmm1
+ movdqa xmm1,xmm13
+ paddw xmm1,xmm13
+ paddw xmm1,xmm0
+ paddw xmm1,xmm2
+ psraw xmm1,2
+ movdqa xmm0,[rsp+30h]
+ movdqa xmm2,xmm13
+ movdqa xmm5,xmm15
+ paddw xmm0,[rsp+70h]
+ pandn xmm5,xmm1
+ paddw xmm2,xmm8
+ movdqa xmm8,[rsp+90h]
+ movdqa xmm1,xmm12
+ paddw xmm2,xmm8
+ psllw xmm2,1
+ paddw xmm2,xmm0
+ paddw xmm1,xmm2
+ movdqa xmm0,xmm8
+ movdqa xmm8,xmm3
+ movdqa xmm2,[rsp+30h]
+ paddw xmm0,xmm13
+ psraw xmm1,3
+ pand xmm15,xmm1
+ movdqa xmm1,xmm2
+ paddw xmm1,xmm2
+ paddw xmm2,[rsp+90h]
+ paddw xmm2,[rsp+0B0h]
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm13
+ movdqa xmm13,[r8]
+ paddw xmm0, [rsp+70h]
+ paddw xmm1, [rsp+0A0h]
+ psllw xmm2,1
+ paddw xmm2,xmm0
+ psraw xmm1,2
+ movdqa xmm0, [rdi]
+ pandn xmm8,xmm1
+ movdqa xmm1, [rsp+60h]
+ paddw xmm1,xmm2
+ movdqa xmm2, [rbx]
+ psraw xmm1,3
+ pand xmm3,xmm1
+ movdqa xmm1, [rbp]
+ movdqa [rsp+0D0h],xmm3
+ pxor xmm3,xmm3
+ punpckhbw xmm0,xmm3
+ punpckhbw xmm1,xmm3
+ punpckhbw xmm13,xmm3
+ movdqa [rsp+0C0h],xmm0
+ movdqa xmm0,[r10+rbp]
+ movdqa [rsp],xmm1
+ punpckhbw xmm0,xmm3
+ punpckhbw xmm2,xmm3
+ movdqa [rsp+80h],xmm0
+ movdqa xmm0,[rsi+rbp]
+ movdqa [rsp+10h],xmm13
+ punpckhbw xmm0,xmm3
+ movdqa [rsp+50h],xmm0
+ movdqa xmm0,xmm1
+ movdqa xmm1,xmm13
+ psubw xmm0,xmm13
+ psubw xmm1,xmm2
+ pabsw xmm3,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,[rsp]
+ movdqa xmm13,[rsp+40h]
+ movdqa [rsp+110h],xmm2
+ psubw xmm1, [rsp+80h]
+ pcmpgtw xmm13,xmm0
+ pcmpgtw xmm11,xmm3
+ pabsw xmm0,xmm1
+ pcmpgtw xmm10,xmm3
+ movdqa xmm1, [rsp+40h]
+ movdqa xmm2,xmm1
+ movdqa xmm3,xmm1
+ pcmpgtw xmm2,xmm0
+ movdqa xmm0, [rsp+10h]
+ pand xmm13,xmm2
+ pand xmm13,xmm11
+ movdqa xmm11,[rsp+0C0h]
+ psubw xmm0,xmm11
+ pabsw xmm0,xmm0
+ pcmpgtw xmm3,xmm0
+ pand xmm3,xmm10
+ movdqa xmm0,[rsp]
+ psubw xmm0,[rsp+50h]
+ movdqa xmm2,[rdx]
+ pabsw xmm0,xmm0
+ por xmm7,xmm9
+ movdqa xmm9,[rsp+20h]
+ pcmpgtw xmm1,xmm0
+ pand xmm9,xmm7
+ movdqa xmm7,[rsp+20h]
+ movdqa xmm0,xmm7
+ pandn xmm0,xmm12
+ movdqa xmm12,[rsp+110h]
+ pand xmm1,xmm10
+ movdqa xmm10,[rsp+70h]
+ movdqa [rsp+40h],xmm1
+ movdqa xmm1,xmm13
+ por xmm9,xmm0
+ pxor xmm0,xmm0
+ por xmm4,xmm6
+ movdqa xmm6,xmm7
+ punpckhbw xmm2,xmm0
+ por xmm15,xmm5
+ movdqa xmm5,[rsp+20h]
+ movdqa xmm0,xmm3
+ psllw xmm2,1
+ pandn xmm0,xmm11
+ pand xmm6,xmm4
+ movdqa xmm4,[rsp]
+ paddw xmm2,xmm11
+ pand xmm5,xmm15
+ movdqa xmm15,[rsp+20h]
+ paddw xmm2,xmm11
+ paddw xmm2,xmm11
+ paddw xmm2,xmm12
+ paddw xmm2,[rsp+10h]
+ paddw xmm2,[rsp]
+ paddw xmm2,xmm10
+ psraw xmm2,3
+ pand xmm2,xmm3
+ por xmm2,xmm0
+ pand xmm1,xmm2
+ movdqa xmm0,xmm13
+ movdqa xmm2,xmm11
+ pandn xmm0,xmm11
+ paddw xmm2,xmm12
+ por xmm1,xmm0
+ packuswb xmm9,xmm1
+ movdqa xmm0,xmm7
+ movdqa xmm7,[rsp+0A0h]
+ pandn xmm0,[rsp+0F0h]
+ movdqa xmm1,xmm3
+ por xmm6,xmm0
+ movdqa xmm0,[rsp+10h]
+ paddw xmm0,xmm4
+ paddw xmm2,xmm0
+ paddw xmm2,xmm7
+ movdqa xmm0,xmm3
+ pandn xmm0,xmm12
+ psraw xmm2,2
+ pand xmm1,xmm2
+ por xmm1,xmm0
+ movdqa xmm2,xmm13
+ movdqa xmm0,xmm13
+ pand xmm2,xmm1
+ pandn xmm0,xmm12
+ movdqa xmm1,xmm12
+ paddw xmm1,[rsp+10h]
+ por xmm2,xmm0
+ movdqa xmm0,xmm15
+ pandn xmm0,[rsp+0B0h]
+ paddw xmm1,xmm4
+ packuswb xmm6,xmm2
+ movdqa xmm2,xmm3
+ psllw xmm1,1
+ por xmm5,xmm0
+ movdqa xmm0,[rsp+80h]
+ paddw xmm0,xmm10
+ paddw xmm1,xmm0
+ paddw xmm11,xmm1
+ psraw xmm11,3
+ movdqa xmm1,xmm12
+ pand xmm2,xmm11
+ paddw xmm1,xmm12
+ movdqa xmm11,[rsp+80h]
+ movdqa xmm0, [rsp+10h]
+ por xmm14,[rsp+0E0h]
+ paddw xmm0,xmm11
+ movdqa xmm4,xmm15
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm13
+ paddw xmm1,xmm7
+ psraw xmm1,2
+ pandn xmm3,xmm1
+ por xmm2,xmm3
+ movdqa xmm1,xmm13
+ movdqa xmm3,[rsp+10h]
+ pandn xmm0,xmm3
+ pand xmm1,xmm2
+ movdqa xmm2,xmm11
+ paddw xmm2,[rsp]
+ por xmm1,xmm0
+ movdqa xmm0,[rsp+0D0h]
+ por xmm0,xmm8
+ paddw xmm2,xmm3
+ packuswb xmm5,xmm1
+ movdqa xmm8,[rsp+40h]
+ movdqa xmm1,[rsp+50h]
+ movdqa xmm3,xmm8
+ pand xmm4,xmm0
+ psllw xmm2,1
+ movdqa xmm0,xmm15
+ pandn xmm0,[rsp+90h]
+ por xmm4,xmm0
+ movdqa xmm0,xmm12
+ paddw xmm0,xmm10
+ paddw xmm2,xmm0
+ paddw xmm1,xmm2
+ movdqa xmm0,[rsp]
+ movdqa xmm2,xmm11
+ paddw xmm0,xmm12
+ movdqa xmm12,[rsp]
+ paddw xmm2,xmm11
+ paddw xmm2,xmm0
+ psraw xmm1,3
+ movdqa xmm0,xmm8
+ pand xmm3,xmm1
+ paddw xmm2,xmm7
+ movdqa xmm1,xmm13
+ psraw xmm2,2
+ pandn xmm0,xmm2
+ por xmm3,xmm0
+ movdqa xmm2,[rsp+50h]
+ movdqa xmm0,xmm13
+ pandn xmm0,xmm12
+ pand xmm1,xmm3
+ paddw xmm2,xmm11
+ movdqa xmm3,xmm15
+ por xmm1,xmm0
+ pand xmm3,xmm14
+ movdqa xmm14,[rsp+10h]
+ movdqa xmm0,xmm15
+ pandn xmm0,[rsp+30h]
+ packuswb xmm4,xmm1
+ movdqa xmm1,xmm8
+ por xmm3,xmm0
+ movdqa xmm0,xmm12
+ paddw xmm0,xmm14
+ paddw xmm2,xmm0
+ paddw xmm2,xmm7
+ movdqa xmm0,xmm8
+ pandn xmm0,xmm11
+ psraw xmm2,2
+ pand xmm1,xmm2
+ por xmm1,xmm0
+ movdqa xmm2,xmm13
+ movdqa xmm0,xmm13
+ pandn xmm0,xmm11
+ pand xmm2,xmm1
+ movdqa xmm1,xmm15
+ por xmm2,xmm0
+ packuswb xmm3,xmm2
+ movdqa xmm0,[rsp+100h]
+ por xmm0,[rsp+120h]
+ pand xmm1,xmm0
+ movdqa xmm2,[rcx+rbp]
+ movdqa xmm7,[rsp+50h]
+ pandn xmm15,[rsp+60h]
+ lea r11,[rsp+1D8h]
+ pxor xmm0,xmm0
+ por xmm1,xmm15
+ movaps xmm15,[r11-0A8h]
+ movdqa [rdi],xmm9
+ movaps xmm9,[r11-48h]
+ punpckhbw xmm2,xmm0
+ psllw xmm2,1
+ paddw xmm2,xmm7
+ paddw xmm2,xmm7
+ movdqa [rbx],xmm6
+ movaps xmm6,[r11-18h]
+ paddw xmm2,xmm7
+ paddw xmm2,xmm11
+ movaps xmm11,[r11-68h]
+ paddw xmm2,xmm12
+ movaps xmm12,[r11-78h]
+ paddw xmm2,xmm14
+ paddw xmm2,xmm10
+ psraw xmm2,3
+ movaps xmm10,[r11-58h]
+ movaps xmm14,[r11-98h]
+ movdqa xmm0,xmm13
+ pand xmm2,xmm8
+ pandn xmm8,xmm7
+ pandn xmm13,xmm7
+ por xmm2,xmm8
+ movaps xmm7,[r11-28h]
+ movaps xmm8,[r11-38h]
+ movdqa [r8],xmm5
+ pand xmm0,xmm2
+ por xmm0,xmm13
+ packuswb xmm1,xmm0
+ movaps xmm13,[r11-88h]
+ movdqa [rbp],xmm4
+ movdqa [r10+rbp],xmm3
+ movdqa [rsi+rbp],xmm1
+ mov rsp,r11
+ pop rdi
+ pop rsi
+ pop rbp
+ pop rbx
ret
@@ -784,161 +784,161 @@
ALIGN 16
DeblockChromaLt4V_sse2:
- mov rax,rsp
- push rbx
- push rdi
- sub rsp,0C8h
+ mov rax,rsp
+ push rbx
+ push rdi
+ sub rsp,0C8h
mov r10,qword [rax + 30h] ; pTC
- pxor xmm1,xmm1
- mov rbx,rcx
- movsxd r11,r8d
- movsx ecx,byte [r10]
- movsx r8d,byte [r10+2]
- mov rdi,rdx
- movq xmm2,[rbx]
- movq xmm9,[r11+rbx]
- movsx edx,byte [r10+1]
- mov word [rsp+2],cx
- mov word [rsp],cx
- movsx eax,byte [r10+3]
- mov word [rsp+6],dx
- mov word [rsp+4],dx
- movdqa xmm11,xmm1
- mov word [rsp+0Eh],ax
- mov word [rsp+0Ch],ax
- lea eax,[r11+r11]
- movsxd rcx,eax
- mov rax,rbx
- mov rdx,rdi
- sub rax,rcx
- mov word [rsp+0Ah],r8w
- mov word [rsp+8],r8w
- movdqa xmm6,[rsp]
- movdqa xmm7,xmm6
- movq xmm13, [rax]
- mov rax,rdi
- sub rax,rcx
- mov rcx,rbx
- pcmpgtw xmm7,xmm1
- psubw xmm11,xmm6
- sub rcx,r11
- sub rdx,r11
- movq xmm0,[rax]
- movsx eax,r9w
- movq xmm15,[rcx]
- punpcklqdq xmm13,xmm0
- movq xmm0, [rdx]
- movdqa xmm4,xmm13
- punpcklqdq xmm15,xmm0
- movq xmm0, [rdi]
- punpcklbw xmm4,xmm1
- movdqa xmm12,xmm15
- punpcklqdq xmm2,xmm0
- movq xmm0, [r11+rdi]
- punpcklbw xmm12,xmm1
- movdqa xmm14,xmm2
- punpcklqdq xmm9,xmm0
- punpckhbw xmm2,xmm1
- punpcklbw xmm14,xmm1
- movd xmm0,eax
+ pxor xmm1,xmm1
+ mov rbx,rcx
+ movsxd r11,r8d
+ movsx ecx,byte [r10]
+ movsx r8d,byte [r10+2]
+ mov rdi,rdx
+ movq xmm2,[rbx]
+ movq xmm9,[r11+rbx]
+ movsx edx,byte [r10+1]
+ mov word [rsp+2],cx
+ mov word [rsp],cx
+ movsx eax,byte [r10+3]
+ mov word [rsp+6],dx
+ mov word [rsp+4],dx
+ movdqa xmm11,xmm1
+ mov word [rsp+0Eh],ax
+ mov word [rsp+0Ch],ax
+ lea eax,[r11+r11]
+ movsxd rcx,eax
+ mov rax,rbx
+ mov rdx,rdi
+ sub rax,rcx
+ mov word [rsp+0Ah],r8w
+ mov word [rsp+8],r8w
+ movdqa xmm6,[rsp]
+ movdqa xmm7,xmm6
+ movq xmm13, [rax]
+ mov rax,rdi
+ sub rax,rcx
+ mov rcx,rbx
+ pcmpgtw xmm7,xmm1
+ psubw xmm11,xmm6
+ sub rcx,r11
+ sub rdx,r11
+ movq xmm0,[rax]
+ movsx eax,r9w
+ movq xmm15,[rcx]
+ punpcklqdq xmm13,xmm0
+ movq xmm0, [rdx]
+ movdqa xmm4,xmm13
+ punpcklqdq xmm15,xmm0
+ movq xmm0, [rdi]
+ punpcklbw xmm4,xmm1
+ movdqa xmm12,xmm15
+ punpcklqdq xmm2,xmm0
+ movq xmm0, [r11+rdi]
+ punpcklbw xmm12,xmm1
+ movdqa xmm14,xmm2
+ punpcklqdq xmm9,xmm0
+ punpckhbw xmm2,xmm1
+ punpcklbw xmm14,xmm1
+ movd xmm0,eax
movsx eax,word [rsp + 0C8h + 38h] ; iBeta
- punpckhbw xmm13,xmm1
- punpckhbw xmm15,xmm1
- movdqa xmm3,xmm9
- movdqa [rsp+10h],xmm2
- punpcklwd xmm0,xmm0
- punpckhbw xmm9,xmm1
- punpcklbw xmm3,xmm1
- movdqa xmm1,xmm14
- pshufd xmm10,xmm0,0
- movd xmm0,eax
- mov eax,4
- cwde
- punpcklwd xmm0,xmm0
- pshufd xmm8,xmm0,0
- movd xmm0,eax
- punpcklwd xmm0,xmm0
- pshufd xmm5,xmm0,0
- psubw xmm1,xmm12
- movdqa xmm2,xmm10
- lea r11,[rsp+0C8h]
- psllw xmm1,2
- movdqa xmm0,xmm4
- psubw xmm4,xmm12
- psubw xmm0,xmm3
- psubw xmm3,xmm14
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm11
- psraw xmm1,3
- pmaxsw xmm0,xmm1
- pminsw xmm6,xmm0
- movdqa xmm1,xmm8
- movdqa xmm0,xmm12
- psubw xmm0,xmm14
- pabsw xmm0,xmm0
- pcmpgtw xmm2,xmm0
- pabsw xmm0,xmm4
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm3
- movdqa xmm3,[rsp]
- pand xmm2,xmm1
- movdqa xmm1,xmm8
- pcmpgtw xmm1,xmm0
- movdqa xmm0,xmm13
- pand xmm2,xmm1
- psubw xmm0,xmm9
- psubw xmm13,xmm15
- pand xmm2,xmm7
- pand xmm6,xmm2
- paddw xmm12,xmm6
- psubw xmm14,xmm6
- movdqa xmm2,[rsp+10h]
- movaps xmm6,[r11-18h]
- movdqa xmm1,xmm2
- psubw xmm1,xmm15
- psubw xmm9,xmm2
- psllw xmm1,2
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm15
- psubw xmm0,xmm2
- psraw xmm1,3
- pmaxsw xmm11,xmm1
- pabsw xmm0,xmm0
- movdqa xmm1,xmm8
- pcmpgtw xmm10,xmm0
- pabsw xmm0,xmm13
- pminsw xmm3,xmm11
- movaps xmm11,[r11-68h]
- movaps xmm13,[rsp+40h]
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm9
- movaps xmm9, [r11-48h]
- pand xmm10,xmm1
- pcmpgtw xmm8,xmm0
- pand xmm10,xmm8
- pand xmm10,xmm7
- movaps xmm8,[r11-38h]
- movaps xmm7,[r11-28h]
- pand xmm3,xmm10
- paddw xmm15,xmm3
- psubw xmm2,xmm3
- movaps xmm10,[r11-58h]
- packuswb xmm12,xmm15
- movaps xmm15,[rsp+20h]
- packuswb xmm14,xmm2
- movq [rcx],xmm12
- movq [rbx],xmm14
- psrldq xmm12,8
- psrldq xmm14,8
- movq [rdx],xmm12
- movaps xmm12,[r11-78h]
- movq [rdi],xmm14
- movaps xmm14,[rsp+30h]
- mov rsp,r11
- pop rdi
- pop rbx
+ punpckhbw xmm13,xmm1
+ punpckhbw xmm15,xmm1
+ movdqa xmm3,xmm9
+ movdqa [rsp+10h],xmm2
+ punpcklwd xmm0,xmm0
+ punpckhbw xmm9,xmm1
+ punpcklbw xmm3,xmm1
+ movdqa xmm1,xmm14
+ pshufd xmm10,xmm0,0
+ movd xmm0,eax
+ mov eax,4
+ cwde
+ punpcklwd xmm0,xmm0
+ pshufd xmm8,xmm0,0
+ movd xmm0,eax
+ punpcklwd xmm0,xmm0
+ pshufd xmm5,xmm0,0
+ psubw xmm1,xmm12
+ movdqa xmm2,xmm10
+ lea r11,[rsp+0C8h]
+ psllw xmm1,2
+ movdqa xmm0,xmm4
+ psubw xmm4,xmm12
+ psubw xmm0,xmm3
+ psubw xmm3,xmm14
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm11
+ psraw xmm1,3
+ pmaxsw xmm0,xmm1
+ pminsw xmm6,xmm0
+ movdqa xmm1,xmm8
+ movdqa xmm0,xmm12
+ psubw xmm0,xmm14
+ pabsw xmm0,xmm0
+ pcmpgtw xmm2,xmm0
+ pabsw xmm0,xmm4
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm3
+ movdqa xmm3,[rsp]
+ pand xmm2,xmm1
+ movdqa xmm1,xmm8
+ pcmpgtw xmm1,xmm0
+ movdqa xmm0,xmm13
+ pand xmm2,xmm1
+ psubw xmm0,xmm9
+ psubw xmm13,xmm15
+ pand xmm2,xmm7
+ pand xmm6,xmm2
+ paddw xmm12,xmm6
+ psubw xmm14,xmm6
+ movdqa xmm2,[rsp+10h]
+ movaps xmm6,[r11-18h]
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm15
+ psubw xmm9,xmm2
+ psllw xmm1,2
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm15
+ psubw xmm0,xmm2
+ psraw xmm1,3
+ pmaxsw xmm11,xmm1
+ pabsw xmm0,xmm0
+ movdqa xmm1,xmm8
+ pcmpgtw xmm10,xmm0
+ pabsw xmm0,xmm13
+ pminsw xmm3,xmm11
+ movaps xmm11,[r11-68h]
+ movaps xmm13,[rsp+40h]
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm9
+ movaps xmm9, [r11-48h]
+ pand xmm10,xmm1
+ pcmpgtw xmm8,xmm0
+ pand xmm10,xmm8
+ pand xmm10,xmm7
+ movaps xmm8,[r11-38h]
+ movaps xmm7,[r11-28h]
+ pand xmm3,xmm10
+ paddw xmm15,xmm3
+ psubw xmm2,xmm3
+ movaps xmm10,[r11-58h]
+ packuswb xmm12,xmm15
+ movaps xmm15,[rsp+20h]
+ packuswb xmm14,xmm2
+ movq [rcx],xmm12
+ movq [rbx],xmm14
+ psrldq xmm12,8
+ psrldq xmm14,8
+ movq [rdx],xmm12
+ movaps xmm12,[r11-78h]
+ movq [rdi],xmm14
+ movaps xmm14,[rsp+30h]
+ mov rsp,r11
+ pop rdi
+ pop rbx
ret
@@ -945,151 +945,151 @@
WELS_EXTERN DeblockChromaEq4V_sse2
ALIGN 16
DeblockChromaEq4V_sse2:
- mov rax,rsp
- push rbx
- sub rsp,90h
- pxor xmm1,xmm1
- mov r11,rcx
- mov rbx,rdx
- mov r10d,r9d
- movq xmm13,[r11]
- lea eax,[r8+r8]
- movsxd r9,eax
- mov rax,rcx
- sub rax,r9
- movq xmm14,[rax]
- mov rax,rdx
- sub rax,r9
- movq xmm0,[rax]
- movsxd rax,r8d
- sub rcx,rax
- sub rdx,rax
- movq xmm12,[rax+r11]
- movq xmm10,[rcx]
- punpcklqdq xmm14,xmm0
- movdqa xmm8,xmm14
- movq xmm0,[rdx]
- punpcklbw xmm8,xmm1
- punpckhbw xmm14,xmm1
- punpcklqdq xmm10,xmm0
- movq xmm0,[rbx]
- movdqa xmm5,xmm10
- punpcklqdq xmm13,xmm0
- movq xmm0, [rax+rbx]
- punpcklbw xmm5,xmm1
- movsx eax,r10w
- movdqa xmm9,xmm13
- punpcklqdq xmm12,xmm0
- punpcklbw xmm9,xmm1
- punpckhbw xmm10,xmm1
- movd xmm0,eax
+ mov rax,rsp
+ push rbx
+ sub rsp,90h
+ pxor xmm1,xmm1
+ mov r11,rcx
+ mov rbx,rdx
+ mov r10d,r9d
+ movq xmm13,[r11]
+ lea eax,[r8+r8]
+ movsxd r9,eax
+ mov rax,rcx
+ sub rax,r9
+ movq xmm14,[rax]
+ mov rax,rdx
+ sub rax,r9
+ movq xmm0,[rax]
+ movsxd rax,r8d
+ sub rcx,rax
+ sub rdx,rax
+ movq xmm12,[rax+r11]
+ movq xmm10,[rcx]
+ punpcklqdq xmm14,xmm0
+ movdqa xmm8,xmm14
+ movq xmm0,[rdx]
+ punpcklbw xmm8,xmm1
+ punpckhbw xmm14,xmm1
+ punpcklqdq xmm10,xmm0
+ movq xmm0,[rbx]
+ movdqa xmm5,xmm10
+ punpcklqdq xmm13,xmm0
+ movq xmm0, [rax+rbx]
+ punpcklbw xmm5,xmm1
+ movsx eax,r10w
+ movdqa xmm9,xmm13
+ punpcklqdq xmm12,xmm0
+ punpcklbw xmm9,xmm1
+ punpckhbw xmm10,xmm1
+ movd xmm0,eax
movsx eax,word [rsp + 90h + 8h + 28h] ; iBeta
- punpckhbw xmm13,xmm1
- movdqa xmm7,xmm12
- punpcklwd xmm0,xmm0
- punpckhbw xmm12,xmm1
- pshufd xmm11,xmm0,0
- punpcklbw xmm7,xmm1
- movd xmm0,eax
- movdqa xmm1,xmm8
- psubw xmm1,xmm5
- punpcklwd xmm0,xmm0
- movdqa xmm6,xmm11
- pshufd xmm3,xmm0,0
- movdqa xmm0,xmm5
- psubw xmm0,xmm9
- movdqa xmm2,xmm3
- pabsw xmm0,xmm0
- pcmpgtw xmm6,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm3
- pcmpgtw xmm2,xmm0
- pand xmm6,xmm2
- movdqa xmm0,xmm7
- movdqa xmm2,xmm3
- psubw xmm0,xmm9
- pabsw xmm0,xmm0
- pcmpgtw xmm1,xmm0
- pand xmm6,xmm1
- movdqa xmm0,xmm10
- movdqa xmm1,xmm14
- psubw xmm0,xmm13
- psubw xmm1,xmm10
- pabsw xmm0,xmm0
- pcmpgtw xmm11,xmm0
- pabsw xmm0,xmm1
- pcmpgtw xmm2,xmm0
- pand xmm11,xmm2
- movdqa xmm0,xmm12
- movdqa xmm4,xmm6
- movdqa xmm1,xmm8
- mov eax,2
- cwde
- paddw xmm1,xmm8
- psubw xmm0,xmm13
- paddw xmm1,xmm5
- pabsw xmm0,xmm0
- movdqa xmm2,xmm14
- paddw xmm1,xmm7
- pcmpgtw xmm3,xmm0
- paddw xmm2,xmm14
- movd xmm0,eax
- pand xmm11,xmm3
- paddw xmm7,xmm7
- paddw xmm2,xmm10
- punpcklwd xmm0,xmm0
- paddw xmm2,xmm12
- paddw xmm12,xmm12
- pshufd xmm3,xmm0,0
- paddw xmm7,xmm9
- paddw xmm12,xmm13
- movdqa xmm0,xmm6
- paddw xmm1,xmm3
- pandn xmm0,xmm5
- paddw xmm7,xmm8
- psraw xmm1,2
- paddw xmm12,xmm14
- paddw xmm7,xmm3
- movaps xmm14,[rsp]
- pand xmm4,xmm1
- paddw xmm12,xmm3
- psraw xmm7,2
- movdqa xmm1,xmm11
- por xmm4,xmm0
- psraw xmm12,2
- paddw xmm2,xmm3
- movdqa xmm0,xmm11
- pandn xmm0,xmm10
- psraw xmm2,2
- pand xmm1,xmm2
- por xmm1,xmm0
- packuswb xmm4,xmm1
- movdqa xmm0,xmm11
- movdqa xmm1,xmm6
- pand xmm1,xmm7
- movaps xmm7,[rsp+70h]
- movq [rcx],xmm4
- pandn xmm6,xmm9
- pandn xmm11,xmm13
- pand xmm0,xmm12
- por xmm1,xmm6
- por xmm0,xmm11
- psrldq xmm4,8
- packuswb xmm1,xmm0
- movq [r11],xmm1
- psrldq xmm1,8
- movq [rdx],xmm4
- lea r11,[rsp+90h]
- movaps xmm6,[r11-10h]
- movaps xmm8,[r11-30h]
- movaps xmm9,[r11-40h]
- movq [rbx],xmm1
- movaps xmm10,[r11-50h]
- movaps xmm11,[r11-60h]
- movaps xmm12,[r11-70h]
- movaps xmm13,[r11-80h]
- mov rsp,r11
- pop rbx
+ punpckhbw xmm13,xmm1
+ movdqa xmm7,xmm12
+ punpcklwd xmm0,xmm0
+ punpckhbw xmm12,xmm1
+ pshufd xmm11,xmm0,0
+ punpcklbw xmm7,xmm1
+ movd xmm0,eax
+ movdqa xmm1,xmm8
+ psubw xmm1,xmm5
+ punpcklwd xmm0,xmm0
+ movdqa xmm6,xmm11
+ pshufd xmm3,xmm0,0
+ movdqa xmm0,xmm5
+ psubw xmm0,xmm9
+ movdqa xmm2,xmm3
+ pabsw xmm0,xmm0
+ pcmpgtw xmm6,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm3
+ pcmpgtw xmm2,xmm0
+ pand xmm6,xmm2
+ movdqa xmm0,xmm7
+ movdqa xmm2,xmm3
+ psubw xmm0,xmm9
+ pabsw xmm0,xmm0
+ pcmpgtw xmm1,xmm0
+ pand xmm6,xmm1
+ movdqa xmm0,xmm10
+ movdqa xmm1,xmm14
+ psubw xmm0,xmm13
+ psubw xmm1,xmm10
+ pabsw xmm0,xmm0
+ pcmpgtw xmm11,xmm0
+ pabsw xmm0,xmm1
+ pcmpgtw xmm2,xmm0
+ pand xmm11,xmm2
+ movdqa xmm0,xmm12
+ movdqa xmm4,xmm6
+ movdqa xmm1,xmm8
+ mov eax,2
+ cwde
+ paddw xmm1,xmm8
+ psubw xmm0,xmm13
+ paddw xmm1,xmm5
+ pabsw xmm0,xmm0
+ movdqa xmm2,xmm14
+ paddw xmm1,xmm7
+ pcmpgtw xmm3,xmm0
+ paddw xmm2,xmm14
+ movd xmm0,eax
+ pand xmm11,xmm3
+ paddw xmm7,xmm7
+ paddw xmm2,xmm10
+ punpcklwd xmm0,xmm0
+ paddw xmm2,xmm12
+ paddw xmm12,xmm12
+ pshufd xmm3,xmm0,0
+ paddw xmm7,xmm9
+ paddw xmm12,xmm13
+ movdqa xmm0,xmm6
+ paddw xmm1,xmm3
+ pandn xmm0,xmm5
+ paddw xmm7,xmm8
+ psraw xmm1,2
+ paddw xmm12,xmm14
+ paddw xmm7,xmm3
+ movaps xmm14,[rsp]
+ pand xmm4,xmm1
+ paddw xmm12,xmm3
+ psraw xmm7,2
+ movdqa xmm1,xmm11
+ por xmm4,xmm0
+ psraw xmm12,2
+ paddw xmm2,xmm3
+ movdqa xmm0,xmm11
+ pandn xmm0,xmm10
+ psraw xmm2,2
+ pand xmm1,xmm2
+ por xmm1,xmm0
+ packuswb xmm4,xmm1
+ movdqa xmm0,xmm11
+ movdqa xmm1,xmm6
+ pand xmm1,xmm7
+ movaps xmm7,[rsp+70h]
+ movq [rcx],xmm4
+ pandn xmm6,xmm9
+ pandn xmm11,xmm13
+ pand xmm0,xmm12
+ por xmm1,xmm6
+ por xmm0,xmm11
+ psrldq xmm4,8
+ packuswb xmm1,xmm0
+ movq [r11],xmm1
+ psrldq xmm1,8
+ movq [rdx],xmm4
+ lea r11,[rsp+90h]
+ movaps xmm6,[r11-10h]
+ movaps xmm8,[r11-30h]
+ movaps xmm9,[r11-40h]
+ movq [rbx],xmm1
+ movaps xmm10,[r11-50h]
+ movaps xmm11,[r11-60h]
+ movaps xmm12,[r11-70h]
+ movaps xmm13,[r11-80h]
+ mov rsp,r11
+ pop rbx
ret
@@ -1099,263 +1099,263 @@
WELS_EXTERN DeblockChromaEq4H_sse2
ALIGN 16
DeblockChromaEq4H_sse2:
- mov rax,rsp
- mov [rax+20h],rbx
- push rdi
- sub rsp,140h
- mov rdi,rdx
- lea eax,[r8*4]
- movsxd r10,eax
- mov eax,[rcx-2]
- mov [rsp+10h],eax
- lea rbx,[r10+rdx-2]
- lea r11,[r10+rcx-2]
- movdqa xmm5,[rsp+10h]
- movsxd r10,r8d
- mov eax,[r10+rcx-2]
- lea rdx,[r10+r10*2]
- mov [rsp+20h],eax
- mov eax,[rcx+r10*2-2]
- mov [rsp+30h],eax
- mov eax,[rdx+rcx-2]
- movdqa xmm2,[rsp+20h]
- mov [rsp+40h],eax
- mov eax, [rdi-2]
- movdqa xmm4,[rsp+30h]
- mov [rsp+50h],eax
- mov eax,[r10+rdi-2]
- movdqa xmm3,[rsp+40h]
- mov [rsp+60h],eax
- mov eax,[rdi+r10*2-2]
- punpckldq xmm5,[rsp+50h]
- mov [rsp+70h],eax
- mov eax, [rdx+rdi-2]
- punpckldq xmm2, [rsp+60h]
- mov [rsp+80h],eax
- mov eax,[r11]
- punpckldq xmm4, [rsp+70h]
- mov [rsp+50h],eax
- mov eax,[rbx]
- punpckldq xmm3,[rsp+80h]
- mov [rsp+60h],eax
- mov eax,[r10+r11]
- movdqa xmm0, [rsp+50h]
- punpckldq xmm0, [rsp+60h]
- punpcklqdq xmm5,xmm0
- movdqa [rsp+50h],xmm0
- mov [rsp+50h],eax
- mov eax,[r10+rbx]
- movdqa xmm0,[rsp+50h]
- movdqa xmm1,xmm5
- mov [rsp+60h],eax
- mov eax,[r11+r10*2]
- punpckldq xmm0, [rsp+60h]
- punpcklqdq xmm2,xmm0
- punpcklbw xmm1,xmm2
- punpckhbw xmm5,xmm2
- movdqa [rsp+50h],xmm0
- mov [rsp+50h],eax
- mov eax,[rbx+r10*2]
- movdqa xmm0,[rsp+50h]
- mov [rsp+60h],eax
- mov eax, [rdx+r11]
- movdqa xmm15,xmm1
- punpckldq xmm0,[rsp+60h]
- punpcklqdq xmm4,xmm0
- movdqa [rsp+50h],xmm0
- mov [rsp+50h],eax
- mov eax, [rdx+rbx]
- movdqa xmm0,[rsp+50h]
- mov [rsp+60h],eax
- punpckldq xmm0, [rsp+60h]
- punpcklqdq xmm3,xmm0
- movdqa xmm0,xmm4
- punpcklbw xmm0,xmm3
- punpckhbw xmm4,xmm3
- punpcklwd xmm15,xmm0
- punpckhwd xmm1,xmm0
- movdqa xmm0,xmm5
- movdqa xmm12,xmm15
- punpcklwd xmm0,xmm4
- punpckhwd xmm5,xmm4
- punpckldq xmm12,xmm0
- punpckhdq xmm15,xmm0
- movdqa xmm0,xmm1
- movdqa xmm11,xmm12
- punpckldq xmm0,xmm5
- punpckhdq xmm1,xmm5
- punpcklqdq xmm11,xmm0
- punpckhqdq xmm12,xmm0
- movsx eax,r9w
- movdqa xmm14,xmm15
- punpcklqdq xmm14,xmm1
- punpckhqdq xmm15,xmm1
- pxor xmm1,xmm1
- movd xmm0,eax
- movdqa xmm4,xmm12
- movdqa xmm8,xmm11
- movsx eax,word [rsp+170h] ; iBeta
- punpcklwd xmm0,xmm0
- punpcklbw xmm4,xmm1
- punpckhbw xmm12,xmm1
- movdqa xmm9,xmm14
- movdqa xmm7,xmm15
- movdqa xmm10,xmm15
- pshufd xmm13,xmm0,0
- punpcklbw xmm9,xmm1
- punpckhbw xmm14,xmm1
- movdqa xmm6,xmm13
- movd xmm0,eax
- movdqa [rsp],xmm11
- mov eax,2
- cwde
- punpckhbw xmm11,xmm1
- punpckhbw xmm10,xmm1
- punpcklbw xmm7,xmm1
- punpcklwd xmm0,xmm0
- punpcklbw xmm8,xmm1
- pshufd xmm3,xmm0,0
- movdqa xmm1,xmm8
- movdqa xmm0,xmm4
- psubw xmm0,xmm9
- psubw xmm1,xmm4
- movdqa xmm2,xmm3
- pabsw xmm0,xmm0
- pcmpgtw xmm6,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm3
- pcmpgtw xmm2,xmm0
- pand xmm6,xmm2
- movdqa xmm0,xmm7
- movdqa xmm2,xmm3
- psubw xmm0,xmm9
- pabsw xmm0,xmm0
- pcmpgtw xmm1,xmm0
- pand xmm6,xmm1
- movdqa xmm0,xmm12
- movdqa xmm1,xmm11
- psubw xmm0,xmm14
- psubw xmm1,xmm12
- movdqa xmm5,xmm6
- pabsw xmm0,xmm0
- pcmpgtw xmm13,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm8
- pcmpgtw xmm2,xmm0
- paddw xmm1,xmm8
- movdqa xmm0,xmm10
- pand xmm13,xmm2
- psubw xmm0,xmm14
- paddw xmm1,xmm4
- movdqa xmm2,xmm11
- pabsw xmm0,xmm0
- paddw xmm2,xmm11
- paddw xmm1,xmm7
- pcmpgtw xmm3,xmm0
- paddw xmm2,xmm12
- movd xmm0,eax
- pand xmm13,xmm3
- paddw xmm2,xmm10
- punpcklwd xmm0,xmm0
- pshufd xmm3,xmm0,0
- movdqa xmm0,xmm6
- paddw xmm1,xmm3
- pandn xmm0,xmm4
- paddw xmm2,xmm3
- psraw xmm1,2
- pand xmm5,xmm1
- por xmm5,xmm0
- paddw xmm7,xmm7
- paddw xmm10,xmm10
- psraw xmm2,2
- movdqa xmm1,xmm13
- movdqa xmm0,xmm13
- pandn xmm0,xmm12
- pand xmm1,xmm2
- paddw xmm7,xmm9
- por xmm1,xmm0
- paddw xmm10,xmm14
- paddw xmm7,xmm8
- movdqa xmm0,xmm13
- packuswb xmm5,xmm1
- paddw xmm7,xmm3
- paddw xmm10,xmm11
- movdqa xmm1,xmm6
- paddw xmm10,xmm3
- pandn xmm6,xmm9
- psraw xmm7,2
- pand xmm1,xmm7
- psraw xmm10,2
- pandn xmm13,xmm14
- pand xmm0,xmm10
- por xmm1,xmm6
- movdqa xmm6,[rsp]
- movdqa xmm4,xmm6
- por xmm0,xmm13
- punpcklbw xmm4,xmm5
- punpckhbw xmm6,xmm5
- movdqa xmm3,xmm4
- packuswb xmm1,xmm0
- movdqa xmm0,xmm1
- punpckhbw xmm1,xmm15
- punpcklbw xmm0,xmm15
- punpcklwd xmm3,xmm0
- punpckhwd xmm4,xmm0
- movdqa xmm0,xmm6
- movdqa xmm2,xmm3
- punpcklwd xmm0,xmm1
- punpckhwd xmm6,xmm1
- movdqa xmm1,xmm4
- punpckldq xmm2,xmm0
- punpckhdq xmm3,xmm0
- punpckldq xmm1,xmm6
- movdqa xmm0,xmm2
- punpcklqdq xmm0,xmm1
- punpckhdq xmm4,xmm6
- punpckhqdq xmm2,xmm1
- movdqa [rsp+10h],xmm0
- movdqa [rsp+60h],xmm2
- movdqa xmm0,xmm3
- mov eax,[rsp+10h]
- mov [rcx-2],eax
- mov eax,[rsp+60h]
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm3,xmm4
- mov [r10+rcx-2],eax
- movdqa [rsp+20h],xmm0
- mov eax, [rsp+20h]
- movdqa [rsp+70h],xmm3
- mov [rcx+r10*2-2],eax
- mov eax,[rsp+70h]
- mov [rdx+rcx-2],eax
- mov eax,[rsp+18h]
- mov [r11],eax
- mov eax,[rsp+68h]
- mov [r10+r11],eax
- mov eax,[rsp+28h]
- mov [r11+r10*2],eax
- mov eax,[rsp+78h]
- mov [rdx+r11],eax
- mov eax,[rsp+14h]
- mov [rdi-2],eax
- mov eax,[rsp+64h]
- mov [r10+rdi-2],eax
- mov eax,[rsp+24h]
- mov [rdi+r10*2-2],eax
- mov eax, [rsp+74h]
- mov [rdx+rdi-2],eax
- mov eax, [rsp+1Ch]
- mov [rbx],eax
- mov eax, [rsp+6Ch]
- mov [r10+rbx],eax
- mov eax,[rsp+2Ch]
- mov [rbx+r10*2],eax
- mov eax,[rsp+7Ch]
- mov [rdx+rbx],eax
- lea r11,[rsp+140h]
- mov rbx, [r11+28h]
- mov rsp,r11
- pop rdi
+ mov rax,rsp
+ mov [rax+20h],rbx
+ push rdi
+ sub rsp,140h
+ mov rdi,rdx
+ lea eax,[r8*4]
+ movsxd r10,eax
+ mov eax,[rcx-2]
+ mov [rsp+10h],eax
+ lea rbx,[r10+rdx-2]
+ lea r11,[r10+rcx-2]
+ movdqa xmm5,[rsp+10h]
+ movsxd r10,r8d
+ mov eax,[r10+rcx-2]
+ lea rdx,[r10+r10*2]
+ mov [rsp+20h],eax
+ mov eax,[rcx+r10*2-2]
+ mov [rsp+30h],eax
+ mov eax,[rdx+rcx-2]
+ movdqa xmm2,[rsp+20h]
+ mov [rsp+40h],eax
+ mov eax, [rdi-2]
+ movdqa xmm4,[rsp+30h]
+ mov [rsp+50h],eax
+ mov eax,[r10+rdi-2]
+ movdqa xmm3,[rsp+40h]
+ mov [rsp+60h],eax
+ mov eax,[rdi+r10*2-2]
+ punpckldq xmm5,[rsp+50h]
+ mov [rsp+70h],eax
+ mov eax, [rdx+rdi-2]
+ punpckldq xmm2, [rsp+60h]
+ mov [rsp+80h],eax
+ mov eax,[r11]
+ punpckldq xmm4, [rsp+70h]
+ mov [rsp+50h],eax
+ mov eax,[rbx]
+ punpckldq xmm3,[rsp+80h]
+ mov [rsp+60h],eax
+ mov eax,[r10+r11]
+ movdqa xmm0, [rsp+50h]
+ punpckldq xmm0, [rsp+60h]
+ punpcklqdq xmm5,xmm0
+ movdqa [rsp+50h],xmm0
+ mov [rsp+50h],eax
+ mov eax,[r10+rbx]
+ movdqa xmm0,[rsp+50h]
+ movdqa xmm1,xmm5
+ mov [rsp+60h],eax
+ mov eax,[r11+r10*2]
+ punpckldq xmm0, [rsp+60h]
+ punpcklqdq xmm2,xmm0
+ punpcklbw xmm1,xmm2
+ punpckhbw xmm5,xmm2
+ movdqa [rsp+50h],xmm0
+ mov [rsp+50h],eax
+ mov eax,[rbx+r10*2]
+ movdqa xmm0,[rsp+50h]
+ mov [rsp+60h],eax
+ mov eax, [rdx+r11]
+ movdqa xmm15,xmm1
+ punpckldq xmm0,[rsp+60h]
+ punpcklqdq xmm4,xmm0
+ movdqa [rsp+50h],xmm0
+ mov [rsp+50h],eax
+ mov eax, [rdx+rbx]
+ movdqa xmm0,[rsp+50h]
+ mov [rsp+60h],eax
+ punpckldq xmm0, [rsp+60h]
+ punpcklqdq xmm3,xmm0
+ movdqa xmm0,xmm4
+ punpcklbw xmm0,xmm3
+ punpckhbw xmm4,xmm3
+ punpcklwd xmm15,xmm0
+ punpckhwd xmm1,xmm0
+ movdqa xmm0,xmm5
+ movdqa xmm12,xmm15
+ punpcklwd xmm0,xmm4
+ punpckhwd xmm5,xmm4
+ punpckldq xmm12,xmm0
+ punpckhdq xmm15,xmm0
+ movdqa xmm0,xmm1
+ movdqa xmm11,xmm12
+ punpckldq xmm0,xmm5
+ punpckhdq xmm1,xmm5
+ punpcklqdq xmm11,xmm0
+ punpckhqdq xmm12,xmm0
+ movsx eax,r9w
+ movdqa xmm14,xmm15
+ punpcklqdq xmm14,xmm1
+ punpckhqdq xmm15,xmm1
+ pxor xmm1,xmm1
+ movd xmm0,eax
+ movdqa xmm4,xmm12
+ movdqa xmm8,xmm11
+ movsx eax,word [rsp+170h] ; iBeta
+ punpcklwd xmm0,xmm0
+ punpcklbw xmm4,xmm1
+ punpckhbw xmm12,xmm1
+ movdqa xmm9,xmm14
+ movdqa xmm7,xmm15
+ movdqa xmm10,xmm15
+ pshufd xmm13,xmm0,0
+ punpcklbw xmm9,xmm1
+ punpckhbw xmm14,xmm1
+ movdqa xmm6,xmm13
+ movd xmm0,eax
+ movdqa [rsp],xmm11
+ mov eax,2
+ cwde
+ punpckhbw xmm11,xmm1
+ punpckhbw xmm10,xmm1
+ punpcklbw xmm7,xmm1
+ punpcklwd xmm0,xmm0
+ punpcklbw xmm8,xmm1
+ pshufd xmm3,xmm0,0
+ movdqa xmm1,xmm8
+ movdqa xmm0,xmm4
+ psubw xmm0,xmm9
+ psubw xmm1,xmm4
+ movdqa xmm2,xmm3
+ pabsw xmm0,xmm0
+ pcmpgtw xmm6,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm3
+ pcmpgtw xmm2,xmm0
+ pand xmm6,xmm2
+ movdqa xmm0,xmm7
+ movdqa xmm2,xmm3
+ psubw xmm0,xmm9
+ pabsw xmm0,xmm0
+ pcmpgtw xmm1,xmm0
+ pand xmm6,xmm1
+ movdqa xmm0,xmm12
+ movdqa xmm1,xmm11
+ psubw xmm0,xmm14
+ psubw xmm1,xmm12
+ movdqa xmm5,xmm6
+ pabsw xmm0,xmm0
+ pcmpgtw xmm13,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm8
+ pcmpgtw xmm2,xmm0
+ paddw xmm1,xmm8
+ movdqa xmm0,xmm10
+ pand xmm13,xmm2
+ psubw xmm0,xmm14
+ paddw xmm1,xmm4
+ movdqa xmm2,xmm11
+ pabsw xmm0,xmm0
+ paddw xmm2,xmm11
+ paddw xmm1,xmm7
+ pcmpgtw xmm3,xmm0
+ paddw xmm2,xmm12
+ movd xmm0,eax
+ pand xmm13,xmm3
+ paddw xmm2,xmm10
+ punpcklwd xmm0,xmm0
+ pshufd xmm3,xmm0,0
+ movdqa xmm0,xmm6
+ paddw xmm1,xmm3
+ pandn xmm0,xmm4
+ paddw xmm2,xmm3
+ psraw xmm1,2
+ pand xmm5,xmm1
+ por xmm5,xmm0
+ paddw xmm7,xmm7
+ paddw xmm10,xmm10
+ psraw xmm2,2
+ movdqa xmm1,xmm13
+ movdqa xmm0,xmm13
+ pandn xmm0,xmm12
+ pand xmm1,xmm2
+ paddw xmm7,xmm9
+ por xmm1,xmm0
+ paddw xmm10,xmm14
+ paddw xmm7,xmm8
+ movdqa xmm0,xmm13
+ packuswb xmm5,xmm1
+ paddw xmm7,xmm3
+ paddw xmm10,xmm11
+ movdqa xmm1,xmm6
+ paddw xmm10,xmm3
+ pandn xmm6,xmm9
+ psraw xmm7,2
+ pand xmm1,xmm7
+ psraw xmm10,2
+ pandn xmm13,xmm14
+ pand xmm0,xmm10
+ por xmm1,xmm6
+ movdqa xmm6,[rsp]
+ movdqa xmm4,xmm6
+ por xmm0,xmm13
+ punpcklbw xmm4,xmm5
+ punpckhbw xmm6,xmm5
+ movdqa xmm3,xmm4
+ packuswb xmm1,xmm0
+ movdqa xmm0,xmm1
+ punpckhbw xmm1,xmm15
+ punpcklbw xmm0,xmm15
+ punpcklwd xmm3,xmm0
+ punpckhwd xmm4,xmm0
+ movdqa xmm0,xmm6
+ movdqa xmm2,xmm3
+ punpcklwd xmm0,xmm1
+ punpckhwd xmm6,xmm1
+ movdqa xmm1,xmm4
+ punpckldq xmm2,xmm0
+ punpckhdq xmm3,xmm0
+ punpckldq xmm1,xmm6
+ movdqa xmm0,xmm2
+ punpcklqdq xmm0,xmm1
+ punpckhdq xmm4,xmm6
+ punpckhqdq xmm2,xmm1
+ movdqa [rsp+10h],xmm0
+ movdqa [rsp+60h],xmm2
+ movdqa xmm0,xmm3
+ mov eax,[rsp+10h]
+ mov [rcx-2],eax
+ mov eax,[rsp+60h]
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm3,xmm4
+ mov [r10+rcx-2],eax
+ movdqa [rsp+20h],xmm0
+ mov eax, [rsp+20h]
+ movdqa [rsp+70h],xmm3
+ mov [rcx+r10*2-2],eax
+ mov eax,[rsp+70h]
+ mov [rdx+rcx-2],eax
+ mov eax,[rsp+18h]
+ mov [r11],eax
+ mov eax,[rsp+68h]
+ mov [r10+r11],eax
+ mov eax,[rsp+28h]
+ mov [r11+r10*2],eax
+ mov eax,[rsp+78h]
+ mov [rdx+r11],eax
+ mov eax,[rsp+14h]
+ mov [rdi-2],eax
+ mov eax,[rsp+64h]
+ mov [r10+rdi-2],eax
+ mov eax,[rsp+24h]
+ mov [rdi+r10*2-2],eax
+ mov eax, [rsp+74h]
+ mov [rdx+rdi-2],eax
+ mov eax, [rsp+1Ch]
+ mov [rbx],eax
+ mov eax, [rsp+6Ch]
+ mov [r10+rbx],eax
+ mov eax,[rsp+2Ch]
+ mov [rbx+r10*2],eax
+ mov eax,[rsp+7Ch]
+ mov [rdx+rbx],eax
+ lea r11,[rsp+140h]
+ mov rbx, [r11+28h]
+ mov rsp,r11
+ pop rdi
ret
@@ -1363,283 +1363,283 @@
WELS_EXTERN DeblockChromaLt4H_sse2
ALIGN 16
DeblockChromaLt4H_sse2:
- mov rax,rsp
- push rbx
- push rbp
- push rsi
- push rdi
- push r12
- sub rsp,170h
-
- movsxd rsi,r8d
- lea eax,[r8*4]
- mov r11d,r9d
- movsxd r10,eax
- mov eax, [rcx-2]
- mov r12,rdx
- mov [rsp+40h],eax
- mov eax, [rsi+rcx-2]
- lea rbx,[r10+rcx-2]
- movdqa xmm5,[rsp+40h]
- mov [rsp+50h],eax
- mov eax, [rcx+rsi*2-2]
- lea rbp,[r10+rdx-2]
- movdqa xmm2, [rsp+50h]
- mov [rsp+60h],eax
- lea r10,[rsi+rsi*2]
- mov rdi,rcx
- mov eax,[r10+rcx-2]
- movdqa xmm4,[rsp+60h]
- mov [rsp+70h],eax
- mov eax,[rdx-2]
- mov [rsp+80h],eax
- mov eax, [rsi+rdx-2]
- movdqa xmm3,[rsp+70h]
- mov [rsp+90h],eax
- mov eax,[rdx+rsi*2-2]
- punpckldq xmm5,[rsp+80h]
- mov [rsp+0A0h],eax
- mov eax, [r10+rdx-2]
- punpckldq xmm2,[rsp+90h]
- mov [rsp+0B0h],eax
- mov eax, [rbx]
- punpckldq xmm4,[rsp+0A0h]
- mov [rsp+80h],eax
- mov eax,[rbp]
- punpckldq xmm3,[rsp+0B0h]
- mov [rsp+90h],eax
- mov eax,[rsi+rbx]
- movdqa xmm0,[rsp+80h]
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm5,xmm0
- movdqa [rsp+80h],xmm0
- mov [rsp+80h],eax
- mov eax,[rsi+rbp]
- movdqa xmm0,[rsp+80h]
- movdqa xmm1,xmm5
- mov [rsp+90h],eax
- mov eax,[rbx+rsi*2]
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm2,xmm0
- punpcklbw xmm1,xmm2
- punpckhbw xmm5,xmm2
- movdqa [rsp+80h],xmm0
- mov [rsp+80h],eax
- mov eax,[rbp+rsi*2]
- movdqa xmm0, [rsp+80h]
- mov [rsp+90h],eax
- mov eax,[r10+rbx]
- movdqa xmm7,xmm1
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm4,xmm0
- movdqa [rsp+80h],xmm0
- mov [rsp+80h],eax
- mov eax, [r10+rbp]
- movdqa xmm0,[rsp+80h]
- mov [rsp+90h],eax
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm3,xmm0
- movdqa xmm0,xmm4
- punpcklbw xmm0,xmm3
- punpckhbw xmm4,xmm3
- punpcklwd xmm7,xmm0
- punpckhwd xmm1,xmm0
- movdqa xmm0,xmm5
- movdqa xmm6,xmm7
- punpcklwd xmm0,xmm4
- punpckhwd xmm5,xmm4
- punpckldq xmm6,xmm0
- punpckhdq xmm7,xmm0
- movdqa xmm0,xmm1
- punpckldq xmm0,xmm5
+ mov rax,rsp
+ push rbx
+ push rbp
+ push rsi
+ push rdi
+ push r12
+ sub rsp,170h
+
+ movsxd rsi,r8d
+ lea eax,[r8*4]
+ mov r11d,r9d
+ movsxd r10,eax
+ mov eax, [rcx-2]
+ mov r12,rdx
+ mov [rsp+40h],eax
+ mov eax, [rsi+rcx-2]
+ lea rbx,[r10+rcx-2]
+ movdqa xmm5,[rsp+40h]
+ mov [rsp+50h],eax
+ mov eax, [rcx+rsi*2-2]
+ lea rbp,[r10+rdx-2]
+ movdqa xmm2, [rsp+50h]
+ mov [rsp+60h],eax
+ lea r10,[rsi+rsi*2]
+ mov rdi,rcx
+ mov eax,[r10+rcx-2]
+ movdqa xmm4,[rsp+60h]
+ mov [rsp+70h],eax
+ mov eax,[rdx-2]
+ mov [rsp+80h],eax
+ mov eax, [rsi+rdx-2]
+ movdqa xmm3,[rsp+70h]
+ mov [rsp+90h],eax
+ mov eax,[rdx+rsi*2-2]
+ punpckldq xmm5,[rsp+80h]
+ mov [rsp+0A0h],eax
+ mov eax, [r10+rdx-2]
+ punpckldq xmm2,[rsp+90h]
+ mov [rsp+0B0h],eax
+ mov eax, [rbx]
+ punpckldq xmm4,[rsp+0A0h]
+ mov [rsp+80h],eax
+ mov eax,[rbp]
+ punpckldq xmm3,[rsp+0B0h]
+ mov [rsp+90h],eax
+ mov eax,[rsi+rbx]
+ movdqa xmm0,[rsp+80h]
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm5,xmm0
+ movdqa [rsp+80h],xmm0
+ mov [rsp+80h],eax
+ mov eax,[rsi+rbp]
+ movdqa xmm0,[rsp+80h]
+ movdqa xmm1,xmm5
+ mov [rsp+90h],eax
+ mov eax,[rbx+rsi*2]
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm2,xmm0
+ punpcklbw xmm1,xmm2
+ punpckhbw xmm5,xmm2
+ movdqa [rsp+80h],xmm0
+ mov [rsp+80h],eax
+ mov eax,[rbp+rsi*2]
+ movdqa xmm0, [rsp+80h]
+ mov [rsp+90h],eax
+ mov eax,[r10+rbx]
+ movdqa xmm7,xmm1
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm4,xmm0
+ movdqa [rsp+80h],xmm0
+ mov [rsp+80h],eax
+ mov eax, [r10+rbp]
+ movdqa xmm0,[rsp+80h]
+ mov [rsp+90h],eax
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm3,xmm0
+ movdqa xmm0,xmm4
+ punpcklbw xmm0,xmm3
+ punpckhbw xmm4,xmm3
+ punpcklwd xmm7,xmm0
+ punpckhwd xmm1,xmm0
+ movdqa xmm0,xmm5
+ movdqa xmm6,xmm7
+ punpcklwd xmm0,xmm4
+ punpckhwd xmm5,xmm4
+ punpckldq xmm6,xmm0
+ punpckhdq xmm7,xmm0
+ movdqa xmm0,xmm1
+ punpckldq xmm0,xmm5
mov rax, [rsp+1C8h] ; pTC
- punpckhdq xmm1,xmm5
- movdqa xmm9,xmm6
- punpckhqdq xmm6,xmm0
- punpcklqdq xmm9,xmm0
- movdqa xmm2,xmm7
- movdqa xmm13,xmm6
- movdqa xmm4,xmm9
- movdqa [rsp+10h],xmm9
- punpcklqdq xmm2,xmm1
- punpckhqdq xmm7,xmm1
- pxor xmm1,xmm1
- movsx ecx,byte [rax+3]
- movsx edx,byte [rax+2]
- movsx r8d,byte [rax+1]
- movsx r9d,byte [rax]
- movdqa xmm10,xmm1
- movdqa xmm15,xmm2
- punpckhbw xmm2,xmm1
- punpckhbw xmm6,xmm1
- punpcklbw xmm4,xmm1
- movsx eax,r11w
- mov word [rsp+0Eh],cx
- mov word [rsp+0Ch],cx
- movdqa xmm3,xmm7
- movdqa xmm8,xmm7
- movdqa [rsp+20h],xmm7
- punpcklbw xmm15,xmm1
- punpcklbw xmm13,xmm1
- punpcklbw xmm3,xmm1
- mov word [rsp+0Ah],dx
- mov word [rsp+8],dx
- mov word [rsp+6],r8w
- movd xmm0,eax
- movdqa [rsp+30h],xmm6
- punpckhbw xmm9,xmm1
- punpckhbw xmm8,xmm1
- punpcklwd xmm0,xmm0
+ punpckhdq xmm1,xmm5
+ movdqa xmm9,xmm6
+ punpckhqdq xmm6,xmm0
+ punpcklqdq xmm9,xmm0
+ movdqa xmm2,xmm7
+ movdqa xmm13,xmm6
+ movdqa xmm4,xmm9
+ movdqa [rsp+10h],xmm9
+ punpcklqdq xmm2,xmm1
+ punpckhqdq xmm7,xmm1
+ pxor xmm1,xmm1
+ movsx ecx,byte [rax+3]
+ movsx edx,byte [rax+2]
+ movsx r8d,byte [rax+1]
+ movsx r9d,byte [rax]
+ movdqa xmm10,xmm1
+ movdqa xmm15,xmm2
+ punpckhbw xmm2,xmm1
+ punpckhbw xmm6,xmm1
+ punpcklbw xmm4,xmm1
+ movsx eax,r11w
+ mov word [rsp+0Eh],cx
+ mov word [rsp+0Ch],cx
+ movdqa xmm3,xmm7
+ movdqa xmm8,xmm7
+ movdqa [rsp+20h],xmm7
+ punpcklbw xmm15,xmm1
+ punpcklbw xmm13,xmm1
+ punpcklbw xmm3,xmm1
+ mov word [rsp+0Ah],dx
+ mov word [rsp+8],dx
+ mov word [rsp+6],r8w
+ movd xmm0,eax
+ movdqa [rsp+30h],xmm6
+ punpckhbw xmm9,xmm1
+ punpckhbw xmm8,xmm1
+ punpcklwd xmm0,xmm0
movsx eax,word [rsp+1C0h] ; iBeta
- mov word [rsp+4],r8w
- mov word [rsp+2],r9w
- pshufd xmm12,xmm0,0
- mov word [rsp],r9w
- movd xmm0,eax
- mov eax,4
- cwde
- movdqa xmm14, [rsp]
- movdqa [rsp],xmm2
- movdqa xmm2,xmm12
- punpcklwd xmm0,xmm0
- pshufd xmm11,xmm0,0
- psubw xmm10,xmm14
- movd xmm0,eax
- movdqa xmm7,xmm14
- movdqa xmm6,xmm14
- pcmpgtw xmm7,xmm1
- punpcklwd xmm0,xmm0
- pshufd xmm5,xmm0,0
- movdqa xmm0,xmm4
- movdqa xmm1,xmm15
- psubw xmm4,xmm13
- psubw xmm0,xmm3
- psubw xmm1,xmm13
- psubw xmm3,xmm15
- psllw xmm1,2
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm10
- psraw xmm1,3
- pmaxsw xmm0,xmm1
- pminsw xmm6,xmm0
- movdqa xmm1,xmm11
- movdqa xmm0,xmm13
- psubw xmm0,xmm15
- pabsw xmm0,xmm0
- pcmpgtw xmm2,xmm0
- pabsw xmm0,xmm4
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm3
- pand xmm2,xmm1
- movdqa xmm1,xmm11
- movdqa xmm3,[rsp+30h]
- pcmpgtw xmm1,xmm0
- movdqa xmm0,xmm9
- pand xmm2,xmm1
- psubw xmm0,xmm8
- psubw xmm9,xmm3
- pand xmm2,xmm7
- pand xmm6,xmm2
- psubw xmm15,xmm6
- paddw xmm13,xmm6
- movdqa xmm2,[rsp]
- movdqa xmm1,xmm2
- psubw xmm1,xmm3
- psubw xmm8,xmm2
- psllw xmm1,2
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm3
- movdqa xmm5,[rsp+10h]
- psubw xmm0,xmm2
- psraw xmm1,3
- movdqa xmm4,xmm5
- pabsw xmm0,xmm0
- pmaxsw xmm10,xmm1
- movdqa xmm1,xmm11
- pcmpgtw xmm12,xmm0
- pabsw xmm0,xmm9
- pminsw xmm14,xmm10
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm8
- pcmpgtw xmm11,xmm0
- pand xmm12,xmm1
- movdqa xmm1,[rsp+20h]
- pand xmm12,xmm11
- pand xmm12,xmm7
- pand xmm14,xmm12
- paddw xmm3,xmm14
- psubw xmm2,xmm14
- packuswb xmm13,xmm3
- packuswb xmm15,xmm2
- punpcklbw xmm4,xmm13
- punpckhbw xmm5,xmm13
- movdqa xmm0,xmm15
- punpcklbw xmm0,xmm1
- punpckhbw xmm15,xmm1
- movdqa xmm3,xmm4
- punpcklwd xmm3,xmm0
- punpckhwd xmm4,xmm0
- movdqa xmm0,xmm5
- movdqa xmm2,xmm3
- movdqa xmm1,xmm4
- punpcklwd xmm0,xmm15
- punpckhwd xmm5,xmm15
- punpckldq xmm2,xmm0
- punpckhdq xmm3,xmm0
- punpckldq xmm1,xmm5
- movdqa xmm0,xmm2
- punpcklqdq xmm0,xmm1
- punpckhdq xmm4,xmm5
- punpckhqdq xmm2,xmm1
- movdqa [rsp+40h],xmm0
- movdqa xmm0,xmm3
- movdqa [rsp+90h],xmm2
- mov eax,[rsp+40h]
- mov [rdi-2],eax
- mov eax, [rsp+90h]
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm3,xmm4
- mov [rsi+rdi-2],eax
- movdqa [rsp+50h],xmm0
- mov eax,[rsp+50h]
- movdqa [rsp+0A0h],xmm3
- mov [rdi+rsi*2-2],eax
- mov eax,[rsp+0A0h]
- mov [r10+rdi-2],eax
- mov eax,[rsp+48h]
- mov [rbx],eax
- mov eax,[rsp+98h]
- mov [rsi+rbx],eax
- mov eax,[rsp+58h]
- mov [rbx+rsi*2],eax
- mov eax, [rsp+0A8h]
- mov [r10+rbx],eax
- mov eax, [rsp+44h]
- mov [r12-2],eax
- mov eax,[rsp+94h]
- mov [rsi+r12-2],eax
- mov eax,[rsp+54h]
- mov [r12+rsi*2-2],eax
- mov eax, [rsp+0A4h]
- mov [r10+r12-2],eax
- mov eax,[rsp+4Ch]
- mov [rbp],eax
- mov eax,[rsp+9Ch]
- mov [rsi+rbp],eax
- mov eax, [rsp+5Ch]
- mov [rbp+rsi*2],eax
- mov eax,[rsp+0ACh]
- mov [r10+rbp],eax
- lea r11,[rsp+170h]
- mov rsp,r11
- pop r12
- pop rdi
- pop rsi
- pop rbp
- pop rbx
- ret
+ mov word [rsp+4],r8w
+ mov word [rsp+2],r9w
+ pshufd xmm12,xmm0,0
+ mov word [rsp],r9w
+ movd xmm0,eax
+ mov eax,4
+ cwde
+ movdqa xmm14, [rsp]
+ movdqa [rsp],xmm2
+ movdqa xmm2,xmm12
+ punpcklwd xmm0,xmm0
+ pshufd xmm11,xmm0,0
+ psubw xmm10,xmm14
+ movd xmm0,eax
+ movdqa xmm7,xmm14
+ movdqa xmm6,xmm14
+ pcmpgtw xmm7,xmm1
+ punpcklwd xmm0,xmm0
+ pshufd xmm5,xmm0,0
+ movdqa xmm0,xmm4
+ movdqa xmm1,xmm15
+ psubw xmm4,xmm13
+ psubw xmm0,xmm3
+ psubw xmm1,xmm13
+ psubw xmm3,xmm15
+ psllw xmm1,2
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm10
+ psraw xmm1,3
+ pmaxsw xmm0,xmm1
+ pminsw xmm6,xmm0
+ movdqa xmm1,xmm11
+ movdqa xmm0,xmm13
+ psubw xmm0,xmm15
+ pabsw xmm0,xmm0
+ pcmpgtw xmm2,xmm0
+ pabsw xmm0,xmm4
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm3
+ pand xmm2,xmm1
+ movdqa xmm1,xmm11
+ movdqa xmm3,[rsp+30h]
+ pcmpgtw xmm1,xmm0
+ movdqa xmm0,xmm9
+ pand xmm2,xmm1
+ psubw xmm0,xmm8
+ psubw xmm9,xmm3
+ pand xmm2,xmm7
+ pand xmm6,xmm2
+ psubw xmm15,xmm6
+ paddw xmm13,xmm6
+ movdqa xmm2,[rsp]
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm3
+ psubw xmm8,xmm2
+ psllw xmm1,2
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm3
+ movdqa xmm5,[rsp+10h]
+ psubw xmm0,xmm2
+ psraw xmm1,3
+ movdqa xmm4,xmm5
+ pabsw xmm0,xmm0
+ pmaxsw xmm10,xmm1
+ movdqa xmm1,xmm11
+ pcmpgtw xmm12,xmm0
+ pabsw xmm0,xmm9
+ pminsw xmm14,xmm10
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm8
+ pcmpgtw xmm11,xmm0
+ pand xmm12,xmm1
+ movdqa xmm1,[rsp+20h]
+ pand xmm12,xmm11
+ pand xmm12,xmm7
+ pand xmm14,xmm12
+ paddw xmm3,xmm14
+ psubw xmm2,xmm14
+ packuswb xmm13,xmm3
+ packuswb xmm15,xmm2
+ punpcklbw xmm4,xmm13
+ punpckhbw xmm5,xmm13
+ movdqa xmm0,xmm15
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm15,xmm1
+ movdqa xmm3,xmm4
+ punpcklwd xmm3,xmm0
+ punpckhwd xmm4,xmm0
+ movdqa xmm0,xmm5
+ movdqa xmm2,xmm3
+ movdqa xmm1,xmm4
+ punpcklwd xmm0,xmm15
+ punpckhwd xmm5,xmm15
+ punpckldq xmm2,xmm0
+ punpckhdq xmm3,xmm0
+ punpckldq xmm1,xmm5
+ movdqa xmm0,xmm2
+ punpcklqdq xmm0,xmm1
+ punpckhdq xmm4,xmm5
+ punpckhqdq xmm2,xmm1
+ movdqa [rsp+40h],xmm0
+ movdqa xmm0,xmm3
+ movdqa [rsp+90h],xmm2
+ mov eax,[rsp+40h]
+ mov [rdi-2],eax
+ mov eax, [rsp+90h]
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm3,xmm4
+ mov [rsi+rdi-2],eax
+ movdqa [rsp+50h],xmm0
+ mov eax,[rsp+50h]
+ movdqa [rsp+0A0h],xmm3
+ mov [rdi+rsi*2-2],eax
+ mov eax,[rsp+0A0h]
+ mov [r10+rdi-2],eax
+ mov eax,[rsp+48h]
+ mov [rbx],eax
+ mov eax,[rsp+98h]
+ mov [rsi+rbx],eax
+ mov eax,[rsp+58h]
+ mov [rbx+rsi*2],eax
+ mov eax, [rsp+0A8h]
+ mov [r10+rbx],eax
+ mov eax, [rsp+44h]
+ mov [r12-2],eax
+ mov eax,[rsp+94h]
+ mov [rsi+r12-2],eax
+ mov eax,[rsp+54h]
+ mov [r12+rsi*2-2],eax
+ mov eax, [rsp+0A4h]
+ mov [r10+r12-2],eax
+ mov eax,[rsp+4Ch]
+ mov [rbp],eax
+ mov eax,[rsp+9Ch]
+ mov [rsi+rbp],eax
+ mov eax, [rsp+5Ch]
+ mov [rbp+rsi*2],eax
+ mov eax,[rsp+0ACh]
+ mov [r10+rbp],eax
+ lea r11,[rsp+170h]
+ mov rsp,r11
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbp
+ pop rbx
+ ret
@@ -1649,258 +1649,258 @@
WELS_EXTERN DeblockLumaLt4V_sse2
DeblockLumaLt4V_sse2:
- push rbp
- mov r11,r8 ; pTC
- sub rsp,1B0h
- lea rbp,[rsp+20h]
- movd xmm4,edx
- movd xmm2,ecx
- mov qword [rbp+180h],r12
- mov r10,rdi
- movsxd r12,esi
+ push rbp
+ mov r11,r8 ; pTC
+ sub rsp,1B0h
+ lea rbp,[rsp+20h]
+ movd xmm4,edx
+ movd xmm2,ecx
+ mov qword [rbp+180h],r12
+ mov r10,rdi
+ movsxd r12,esi
add rsi,rsi
- movsxd rdx,esi
- sub r10,r12
- movsx r8d,byte [r11]
- pxor xmm3,xmm3
- punpcklwd xmm2,xmm2
- movaps [rbp+50h],xmm14
- lea rax,[r12+r12*2]
- movdqa xmm14,[rdx+rdi]
- neg rax
- pshufd xmm0,xmm2,0
- movd xmm2,r8d
- movsx rsi,byte [r11+1]
- movsx r8d,byte [r11+2]
- movsx r11d,byte [r11+3]
- movaps [rbp+70h],xmm12
- movd xmm1,esi
- movaps [rbp+80h],xmm11
- movd xmm12,r8d
- movd xmm11,r11d
- movdqa xmm5, [rax+rdi]
- lea rax,[r12+r12]
- punpcklwd xmm12,xmm12
- neg rax
- punpcklwd xmm11,xmm11
- movaps [rbp],xmm8
- movdqa xmm8, [r10]
- punpcklwd xmm2,xmm2
- punpcklwd xmm1,xmm1
- punpcklqdq xmm12,xmm12
- punpcklqdq xmm11,xmm11
- punpcklqdq xmm2,xmm2
- punpcklqdq xmm1,xmm1
- shufps xmm12,xmm11,88h
- movdqa xmm11,xmm8
- movaps [rbp+30h],xmm9
- movdqa xmm9,[rdi]
- shufps xmm2,xmm1,88h
- movdqa xmm1,xmm5
- punpcklbw xmm11,xmm3
- movaps [rbp+20h],xmm6
- movaps [rbp+60h],xmm13
- movdqa xmm13,xmm11
- movaps [rbp+90h],xmm10
- movdqa xmm10,xmm9
- movdqa xmm6,[rax+rdi]
- punpcklbw xmm1,xmm3
- movaps [rbp+0A0h],xmm12
- psubw xmm13,xmm1
- movaps [rbp+40h],xmm15
- movdqa xmm15,xmm14
- movaps [rbp+10h],xmm7
- movdqa xmm7,xmm6
- punpcklbw xmm10,xmm3
- movdqa xmm12,[r12+rdi]
- punpcklbw xmm7,xmm3
- punpcklbw xmm12,xmm3
- punpcklbw xmm15,xmm3
- pabsw xmm3,xmm13
- movdqa xmm13,xmm10
- psubw xmm13,xmm15
- movdqa [rbp+0F0h],xmm15
- pabsw xmm15,xmm13
- movdqa xmm13,xmm11
- movdqa [rbp+0B0h],xmm1
- movdqa xmm1,xmm0
- pavgw xmm13,xmm10
- pcmpgtw xmm1,xmm3
- movdqa [rbp+120h],xmm13
- movaps xmm13,xmm2
- punpcklwd xmm4,xmm4
- movdqa xmm3,xmm0
- movdqa [rbp+100h],xmm1
- psubw xmm13,xmm1
- movdqa xmm1,xmm10
- pcmpgtw xmm3,xmm15
- pshufd xmm4,xmm4,0
- psubw xmm1,xmm11
- movdqa [rbp+0D0h],xmm10
- psubw xmm13,xmm3
- movdqa [rbp+110h],xmm3
- pabsw xmm15,xmm1
- movdqa xmm3,xmm4
- psubw xmm10,xmm12
- pcmpgtw xmm3,xmm15
- pabsw xmm15,xmm10
- movdqa xmm10,xmm0
- psllw xmm1,2
- movdqa [rbp+0C0h],xmm11
- psubw xmm11,xmm7
- pcmpgtw xmm10,xmm15
- pabsw xmm11,xmm11
- movdqa xmm15,xmm0
- pand xmm3,xmm10
- pcmpgtw xmm15,xmm11
- movaps xmm11,xmm2
- pxor xmm10,xmm10
- pand xmm3,xmm15
- pcmpgtw xmm11,xmm10
- pcmpeqw xmm10,xmm2
- por xmm11,xmm10
- pand xmm3,xmm11
- movdqa xmm11,xmm7
- psubw xmm11,xmm12
- pxor xmm15,xmm15
- paddw xmm11,xmm1
- psubw xmm15,xmm13
- movdqa [rbp+0E0h],xmm12
- paddw xmm11,[FOUR_16B_SSE2]
- pxor xmm12,xmm12
- psraw xmm11,3
- punpckhbw xmm8,xmm12
- pmaxsw xmm15,xmm11
- punpckhbw xmm5,xmm12
- movdqa xmm11,xmm8
- pminsw xmm13,xmm15
- psubw xmm11,xmm5
- punpckhbw xmm9,xmm12
- pand xmm13,xmm3
- movdqa [rbp+130h],xmm13
- pabsw xmm13,xmm11
- punpckhbw xmm14,xmm12
- movdqa xmm11,xmm9
- psubw xmm11,xmm14
- movdqa xmm15,xmm0
- movdqa [rbp+140h],xmm14
- pabsw xmm14,xmm11
- movdqa xmm11,xmm8
- pcmpgtw xmm15,xmm14
- movdqa xmm1,[r12+rdi]
- pavgw xmm11,xmm9
- movdqa [rbp+170h],xmm11
- movdqa xmm10,xmm9
- punpckhbw xmm6,xmm12
- psubw xmm10,xmm8
- punpckhbw xmm1,xmm12
- movdqa xmm12,xmm0
- movaps xmm11,[rbp+0A0h]
- pcmpgtw xmm12,xmm13
- movaps xmm13,xmm11
- psubw xmm13,xmm12
- movdqa [rbp+160h],xmm15
- psubw xmm13,xmm15
- movdqa xmm15,xmm9
- psubw xmm15,xmm1
- movdqa [rbp+150h],xmm12
- pabsw xmm12,xmm10
- pabsw xmm14,xmm15
- movdqa xmm15,xmm8
- pcmpgtw xmm4,xmm12
- movdqa xmm12,xmm0
- psubw xmm15,xmm6
- pcmpgtw xmm12,xmm14
- pabsw xmm14,xmm15
- psllw xmm10,2
- pcmpgtw xmm0,xmm14
- movdqa xmm14,xmm6
- psubw xmm14,xmm1
- pand xmm4,xmm12
- paddw xmm14,xmm10
- pand xmm4,xmm0
- paddw xmm14,[FOUR_16B_SSE2]
- pxor xmm15,xmm15
- movaps xmm12,xmm11
- psubw xmm15,xmm13
- pxor xmm0,xmm0
- psraw xmm14,3
- pcmpgtw xmm12,xmm0
- pcmpeqw xmm0,xmm11
- pmaxsw xmm15,xmm14
- por xmm12,xmm0
- movdqa xmm0,[rbp+120h]
- pminsw xmm13,xmm15
- movdqa xmm15,[rbp+0B0h]
- movdqa xmm10,xmm7
- pand xmm4,xmm12
- paddw xmm15,xmm0
- pxor xmm12,xmm12
- paddw xmm10,xmm7
- movdqa xmm14,xmm12
- psubw xmm15,xmm10
- psubw xmm14,xmm2
- psraw xmm15,1
- pmaxsw xmm15,xmm14
- movdqa xmm10,xmm6
- pminsw xmm15,xmm2
- paddw xmm10,xmm6
- pand xmm15,xmm3
- psubw xmm12,xmm11
- pand xmm15,[rbp+100h]
- pand xmm13,xmm4
- paddw xmm7,xmm15
- paddw xmm8,xmm13
- movdqa xmm15,[rbp+170h]
- psubw xmm9,xmm13
- paddw xmm5,xmm15
- psubw xmm5,xmm10
- psraw xmm5,1
- pmaxsw xmm5,xmm12
- pminsw xmm5,xmm11
- pand xmm5,xmm4
- pand xmm5,[rbp+150h]
- paddw xmm6,xmm5
- movdqa xmm5,[rbp+0C0h]
- packuswb xmm7,xmm6
- movdqa xmm6,[rbp+130h]
- paddw xmm5,xmm6
- packuswb xmm5,xmm8
- movdqa xmm8,[rbp+0D0h]
- psubw xmm8,xmm6
- movdqa xmm6,[rbp+0F0h]
- paddw xmm6,xmm0
- movdqa xmm0,[rbp+0E0h]
- packuswb xmm8,xmm9
- movdqa xmm9,xmm0
- paddw xmm9,xmm0
- psubw xmm6,xmm9
- psraw xmm6,1
- pmaxsw xmm14,xmm6
- pminsw xmm2,xmm14
- pand xmm2,xmm3
- pand xmm2,[rbp+110h]
- paddw xmm0,xmm2
- movdqa xmm2,[rbp+140h]
- paddw xmm2,xmm15
- movdqa xmm15,xmm1
- paddw xmm15,xmm1
- psubw xmm2,xmm15
- psraw xmm2,1
- pmaxsw xmm12,xmm2
- pminsw xmm11,xmm12
- pand xmm11,xmm4
- pand xmm11,[rbp+160h]
- paddw xmm1,xmm11
- movdqa [rax+rdi],xmm7
- movdqa [r10],xmm5
- packuswb xmm0,xmm1
- movdqa [rdi],xmm8
- movdqa [r12+rdi],xmm0
- mov r12,qword [rbp+180h]
- lea rsp,[rbp+190h]
- pop rbp
- ret
+ movsxd rdx,esi
+ sub r10,r12
+ movsx r8d,byte [r11]
+ pxor xmm3,xmm3
+ punpcklwd xmm2,xmm2
+ movaps [rbp+50h],xmm14
+ lea rax,[r12+r12*2]
+ movdqa xmm14,[rdx+rdi]
+ neg rax
+ pshufd xmm0,xmm2,0
+ movd xmm2,r8d
+ movsx rsi,byte [r11+1]
+ movsx r8d,byte [r11+2]
+ movsx r11d,byte [r11+3]
+ movaps [rbp+70h],xmm12
+ movd xmm1,esi
+ movaps [rbp+80h],xmm11
+ movd xmm12,r8d
+ movd xmm11,r11d
+ movdqa xmm5, [rax+rdi]
+ lea rax,[r12+r12]
+ punpcklwd xmm12,xmm12
+ neg rax
+ punpcklwd xmm11,xmm11
+ movaps [rbp],xmm8
+ movdqa xmm8, [r10]
+ punpcklwd xmm2,xmm2
+ punpcklwd xmm1,xmm1
+ punpcklqdq xmm12,xmm12
+ punpcklqdq xmm11,xmm11
+ punpcklqdq xmm2,xmm2
+ punpcklqdq xmm1,xmm1
+ shufps xmm12,xmm11,88h
+ movdqa xmm11,xmm8
+ movaps [rbp+30h],xmm9
+ movdqa xmm9,[rdi]
+ shufps xmm2,xmm1,88h
+ movdqa xmm1,xmm5
+ punpcklbw xmm11,xmm3
+ movaps [rbp+20h],xmm6
+ movaps [rbp+60h],xmm13
+ movdqa xmm13,xmm11
+ movaps [rbp+90h],xmm10
+ movdqa xmm10,xmm9
+ movdqa xmm6,[rax+rdi]
+ punpcklbw xmm1,xmm3
+ movaps [rbp+0A0h],xmm12
+ psubw xmm13,xmm1
+ movaps [rbp+40h],xmm15
+ movdqa xmm15,xmm14
+ movaps [rbp+10h],xmm7
+ movdqa xmm7,xmm6
+ punpcklbw xmm10,xmm3
+ movdqa xmm12,[r12+rdi]
+ punpcklbw xmm7,xmm3
+ punpcklbw xmm12,xmm3
+ punpcklbw xmm15,xmm3
+ pabsw xmm3,xmm13
+ movdqa xmm13,xmm10
+ psubw xmm13,xmm15
+ movdqa [rbp+0F0h],xmm15
+ pabsw xmm15,xmm13
+ movdqa xmm13,xmm11
+ movdqa [rbp+0B0h],xmm1
+ movdqa xmm1,xmm0
+ pavgw xmm13,xmm10
+ pcmpgtw xmm1,xmm3
+ movdqa [rbp+120h],xmm13
+ movaps xmm13,xmm2
+ punpcklwd xmm4,xmm4
+ movdqa xmm3,xmm0
+ movdqa [rbp+100h],xmm1
+ psubw xmm13,xmm1
+ movdqa xmm1,xmm10
+ pcmpgtw xmm3,xmm15
+ pshufd xmm4,xmm4,0
+ psubw xmm1,xmm11
+ movdqa [rbp+0D0h],xmm10
+ psubw xmm13,xmm3
+ movdqa [rbp+110h],xmm3
+ pabsw xmm15,xmm1
+ movdqa xmm3,xmm4
+ psubw xmm10,xmm12
+ pcmpgtw xmm3,xmm15
+ pabsw xmm15,xmm10
+ movdqa xmm10,xmm0
+ psllw xmm1,2
+ movdqa [rbp+0C0h],xmm11
+ psubw xmm11,xmm7
+ pcmpgtw xmm10,xmm15
+ pabsw xmm11,xmm11
+ movdqa xmm15,xmm0
+ pand xmm3,xmm10
+ pcmpgtw xmm15,xmm11
+ movaps xmm11,xmm2
+ pxor xmm10,xmm10
+ pand xmm3,xmm15
+ pcmpgtw xmm11,xmm10
+ pcmpeqw xmm10,xmm2
+ por xmm11,xmm10
+ pand xmm3,xmm11
+ movdqa xmm11,xmm7
+ psubw xmm11,xmm12
+ pxor xmm15,xmm15
+ paddw xmm11,xmm1
+ psubw xmm15,xmm13
+ movdqa [rbp+0E0h],xmm12
+ paddw xmm11,[FOUR_16B_SSE2]
+ pxor xmm12,xmm12
+ psraw xmm11,3
+ punpckhbw xmm8,xmm12
+ pmaxsw xmm15,xmm11
+ punpckhbw xmm5,xmm12
+ movdqa xmm11,xmm8
+ pminsw xmm13,xmm15
+ psubw xmm11,xmm5
+ punpckhbw xmm9,xmm12
+ pand xmm13,xmm3
+ movdqa [rbp+130h],xmm13
+ pabsw xmm13,xmm11
+ punpckhbw xmm14,xmm12
+ movdqa xmm11,xmm9
+ psubw xmm11,xmm14
+ movdqa xmm15,xmm0
+ movdqa [rbp+140h],xmm14
+ pabsw xmm14,xmm11
+ movdqa xmm11,xmm8
+ pcmpgtw xmm15,xmm14
+ movdqa xmm1,[r12+rdi]
+ pavgw xmm11,xmm9
+ movdqa [rbp+170h],xmm11
+ movdqa xmm10,xmm9
+ punpckhbw xmm6,xmm12
+ psubw xmm10,xmm8
+ punpckhbw xmm1,xmm12
+ movdqa xmm12,xmm0
+ movaps xmm11,[rbp+0A0h]
+ pcmpgtw xmm12,xmm13
+ movaps xmm13,xmm11
+ psubw xmm13,xmm12
+ movdqa [rbp+160h],xmm15
+ psubw xmm13,xmm15
+ movdqa xmm15,xmm9
+ psubw xmm15,xmm1
+ movdqa [rbp+150h],xmm12
+ pabsw xmm12,xmm10
+ pabsw xmm14,xmm15
+ movdqa xmm15,xmm8
+ pcmpgtw xmm4,xmm12
+ movdqa xmm12,xmm0
+ psubw xmm15,xmm6
+ pcmpgtw xmm12,xmm14
+ pabsw xmm14,xmm15
+ psllw xmm10,2
+ pcmpgtw xmm0,xmm14
+ movdqa xmm14,xmm6
+ psubw xmm14,xmm1
+ pand xmm4,xmm12
+ paddw xmm14,xmm10
+ pand xmm4,xmm0
+ paddw xmm14,[FOUR_16B_SSE2]
+ pxor xmm15,xmm15
+ movaps xmm12,xmm11
+ psubw xmm15,xmm13
+ pxor xmm0,xmm0
+ psraw xmm14,3
+ pcmpgtw xmm12,xmm0
+ pcmpeqw xmm0,xmm11
+ pmaxsw xmm15,xmm14
+ por xmm12,xmm0
+ movdqa xmm0,[rbp+120h]
+ pminsw xmm13,xmm15
+ movdqa xmm15,[rbp+0B0h]
+ movdqa xmm10,xmm7
+ pand xmm4,xmm12
+ paddw xmm15,xmm0
+ pxor xmm12,xmm12
+ paddw xmm10,xmm7
+ movdqa xmm14,xmm12
+ psubw xmm15,xmm10
+ psubw xmm14,xmm2
+ psraw xmm15,1
+ pmaxsw xmm15,xmm14
+ movdqa xmm10,xmm6
+ pminsw xmm15,xmm2
+ paddw xmm10,xmm6
+ pand xmm15,xmm3
+ psubw xmm12,xmm11
+ pand xmm15,[rbp+100h]
+ pand xmm13,xmm4
+ paddw xmm7,xmm15
+ paddw xmm8,xmm13
+ movdqa xmm15,[rbp+170h]
+ psubw xmm9,xmm13
+ paddw xmm5,xmm15
+ psubw xmm5,xmm10
+ psraw xmm5,1
+ pmaxsw xmm5,xmm12
+ pminsw xmm5,xmm11
+ pand xmm5,xmm4
+ pand xmm5,[rbp+150h]
+ paddw xmm6,xmm5
+ movdqa xmm5,[rbp+0C0h]
+ packuswb xmm7,xmm6
+ movdqa xmm6,[rbp+130h]
+ paddw xmm5,xmm6
+ packuswb xmm5,xmm8
+ movdqa xmm8,[rbp+0D0h]
+ psubw xmm8,xmm6
+ movdqa xmm6,[rbp+0F0h]
+ paddw xmm6,xmm0
+ movdqa xmm0,[rbp+0E0h]
+ packuswb xmm8,xmm9
+ movdqa xmm9,xmm0
+ paddw xmm9,xmm0
+ psubw xmm6,xmm9
+ psraw xmm6,1
+ pmaxsw xmm14,xmm6
+ pminsw xmm2,xmm14
+ pand xmm2,xmm3
+ pand xmm2,[rbp+110h]
+ paddw xmm0,xmm2
+ movdqa xmm2,[rbp+140h]
+ paddw xmm2,xmm15
+ movdqa xmm15,xmm1
+ paddw xmm15,xmm1
+ psubw xmm2,xmm15
+ psraw xmm2,1
+ pmaxsw xmm12,xmm2
+ pminsw xmm11,xmm12
+ pand xmm11,xmm4
+ pand xmm11,[rbp+160h]
+ paddw xmm1,xmm11
+ movdqa [rax+rdi],xmm7
+ movdqa [r10],xmm5
+ packuswb xmm0,xmm1
+ movdqa [rdi],xmm8
+ movdqa [r12+rdi],xmm0
+ mov r12,qword [rbp+180h]
+ lea rsp,[rbp+190h]
+ pop rbp
+ ret
WELS_EXTERN DeblockLumaEq4V_sse2
@@ -1907,637 +1907,637 @@
ALIGN 16
DeblockLumaEq4V_sse2:
- mov rax,rsp
- push rbx
- push rbp
+ mov rax,rsp
+ push rbx
+ push rbp
mov r8, rdx
mov r9, rcx
mov rcx, rdi
mov rdx, rsi
- sub rsp,1D8h
- movaps [rax-38h],xmm6
- movaps [rax-48h],xmm7
- movaps [rax-58h],xmm8
- pxor xmm1,xmm1
- movsxd r10,edx
- mov rbp,rcx
- mov r11d,r8d
- mov rdx,rcx
- mov rdi,rbp
- mov rbx,rbp
- movdqa xmm5,[rbp]
- movaps [rax-68h],xmm9
- movaps [rax-78h],xmm10
- punpcklbw xmm5,xmm1
- movaps [rax-88h],xmm11
- movaps [rax-98h],xmm12
- movaps [rax-0A8h],xmm13
- movaps [rax-0B8h],xmm14
- movdqa xmm14,[r10+rbp]
- movaps [rax-0C8h],xmm15
- lea eax,[r10*4]
- movsxd r8,eax
- lea eax,[r10+r10*2]
- movsxd rcx,eax
- lea eax,[r10+r10]
- sub rdx,r8
- punpcklbw xmm14,xmm1
- movdqa [rsp+90h],xmm5
- movdqa [rsp+30h],xmm14
- movsxd rsi,eax
- movsx eax,r11w
- sub rdi,rcx
- sub rbx,rsi
- mov r8,rbp
- sub r8,r10
- movd xmm0,eax
- movsx eax,r9w
- movdqa xmm12,[rdi]
- movdqa xmm6, [rsi+rbp]
- movdqa xmm13,[rbx]
- punpcklwd xmm0,xmm0
- pshufd xmm11,xmm0,0
- punpcklbw xmm13,xmm1
- punpcklbw xmm6,xmm1
- movdqa xmm8,[r8]
- movd xmm0,eax
- movdqa xmm10,xmm11
- mov eax,2
- punpcklbw xmm8,xmm1
- punpcklbw xmm12,xmm1
- cwde
- punpcklwd xmm0,xmm0
- psraw xmm10,2
- movdqa xmm1,xmm8
- movdqa [rsp+0F0h],xmm13
- movdqa [rsp+0B0h],xmm8
- pshufd xmm7,xmm0,0
- psubw xmm1,xmm13
- movdqa xmm0,xmm5
- movdqa xmm4,xmm7
- movdqa xmm2,xmm7
- psubw xmm0,xmm8
- pabsw xmm3,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm5
- movdqa [rsp+40h],xmm7
- movdqa [rsp+60h],xmm6
- pcmpgtw xmm4,xmm0
- psubw xmm1,xmm14
- pabsw xmm0,xmm1
- pcmpgtw xmm2,xmm0
- pand xmm4,xmm2
- movdqa xmm0,xmm11
- pcmpgtw xmm0,xmm3
- pand xmm4,xmm0
- movd xmm0,eax
- movdqa [rsp+20h],xmm4
- punpcklwd xmm0,xmm0
- pshufd xmm2,xmm0,0
- paddw xmm10,xmm2
- movdqa [rsp+0A0h],xmm2
- movdqa xmm15,xmm7
- pxor xmm4,xmm4
- movdqa xmm0,xmm8
- psubw xmm0,xmm12
- mov eax,4
- pabsw xmm0,xmm0
- movdqa xmm1,xmm10
- cwde
- pcmpgtw xmm15,xmm0
- pcmpgtw xmm1,xmm3
- movdqa xmm3,xmm7
- movdqa xmm7,[rdx]
- movdqa xmm0,xmm5
- psubw xmm0,xmm6
- pand xmm15,xmm1
- punpcklbw xmm7,xmm4
- movdqa xmm9,xmm15
- pabsw xmm0,xmm0
- psllw xmm7,1
- pandn xmm9,xmm12
- pcmpgtw xmm3,xmm0
- paddw xmm7,xmm12
- movd xmm0,eax
- pand xmm3,xmm1
- paddw xmm7,xmm12
- punpcklwd xmm0,xmm0
- paddw xmm7,xmm12
- pshufd xmm1,xmm0,0
- paddw xmm7,xmm13
- movdqa xmm0,xmm3
- pandn xmm0,xmm6
- paddw xmm7,xmm8
- movdqa [rsp+70h],xmm1
- paddw xmm7,xmm5
- movdqa [rsp+120h],xmm0
- movdqa xmm0,[rcx+rbp]
- punpcklbw xmm0,xmm4
- paddw xmm7,xmm1
- movdqa xmm4,xmm15
- psllw xmm0,1
- psraw xmm7,3
- paddw xmm0,xmm6
- pand xmm7,xmm15
- paddw xmm0,xmm6
- paddw xmm0,xmm6
- paddw xmm0,xmm14
- movdqa xmm6,xmm15
- paddw xmm0,xmm5
- pandn xmm6,xmm13
- paddw xmm0,xmm8
- paddw xmm0,xmm1
- psraw xmm0,3
- movdqa xmm1,xmm12
- paddw xmm1,xmm13
- pand xmm0,xmm3
- movdqa [rsp+100h],xmm0
- movdqa xmm0,xmm8
- paddw xmm0,xmm5
- paddw xmm1,xmm0
- movdqa xmm0,xmm3
- paddw xmm1,xmm2
- psraw xmm1,2
- pandn xmm0,xmm14
- pand xmm4,xmm1
- movdqa [rsp+0E0h],xmm0
- movdqa xmm0,xmm5
- paddw xmm0,xmm8
- movdqa xmm1,[rsp+60h]
- paddw xmm1,xmm14
- movdqa xmm14,xmm3
- paddw xmm1,xmm0
- movdqa xmm0,xmm8
- paddw xmm0,[rsp+30h]
- paddw xmm1,xmm2
- psraw xmm1,2
- pand xmm14,xmm1
- movdqa xmm1,xmm13
- paddw xmm1,xmm13
- paddw xmm1,xmm0
- paddw xmm1,xmm2
- psraw xmm1,2
- movdqa xmm0,[rsp+30h]
- movdqa xmm2,xmm13
- movdqa xmm5,xmm15
- paddw xmm0,[rsp+70h]
- pandn xmm5,xmm1
- paddw xmm2,xmm8
- movdqa xmm8,[rsp+90h]
- movdqa xmm1,xmm12
- paddw xmm2,xmm8
- psllw xmm2,1
- paddw xmm2,xmm0
- paddw xmm1,xmm2
- movdqa xmm0,xmm8
- movdqa xmm8,xmm3
- movdqa xmm2,[rsp+30h]
- paddw xmm0,xmm13
- psraw xmm1,3
- pand xmm15,xmm1
- movdqa xmm1,xmm2
- paddw xmm1,xmm2
- paddw xmm2,[rsp+90h]
- paddw xmm2,[rsp+0B0h]
- paddw xmm1,xmm0
- movdqa xmm0,xmm13
- movdqa xmm13,[r8]
- paddw xmm0, [rsp+70h]
- paddw xmm1, [rsp+0A0h]
- psllw xmm2,1
- paddw xmm2,xmm0
- psraw xmm1,2
- movdqa xmm0, [rdi]
- pandn xmm8,xmm1
- movdqa xmm1, [rsp+60h]
- paddw xmm1,xmm2
- movdqa xmm2, [rbx]
- psraw xmm1,3
- pand xmm3,xmm1
- movdqa xmm1, [rbp]
- movdqa [rsp+0D0h],xmm3
- pxor xmm3,xmm3
- punpckhbw xmm0,xmm3
- punpckhbw xmm1,xmm3
- punpckhbw xmm13,xmm3
- movdqa [rsp+0C0h],xmm0
- movdqa xmm0,[r10+rbp]
- movdqa [rsp],xmm1
- punpckhbw xmm0,xmm3
- punpckhbw xmm2,xmm3
- movdqa [rsp+80h],xmm0
- movdqa xmm0,[rsi+rbp]
- movdqa [rsp+10h],xmm13
- punpckhbw xmm0,xmm3
- movdqa [rsp+50h],xmm0
- movdqa xmm0,xmm1
- movdqa xmm1,xmm13
- psubw xmm0,xmm13
- psubw xmm1,xmm2
- pabsw xmm3,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,[rsp]
- movdqa xmm13,[rsp+40h]
- movdqa [rsp+110h],xmm2
- psubw xmm1, [rsp+80h]
- pcmpgtw xmm13,xmm0
- pcmpgtw xmm11,xmm3
- pabsw xmm0,xmm1
- pcmpgtw xmm10,xmm3
- movdqa xmm1, [rsp+40h]
- movdqa xmm2,xmm1
- movdqa xmm3,xmm1
- pcmpgtw xmm2,xmm0
- movdqa xmm0, [rsp+10h]
- pand xmm13,xmm2
- pand xmm13,xmm11
- movdqa xmm11,[rsp+0C0h]
- psubw xmm0,xmm11
- pabsw xmm0,xmm0
- pcmpgtw xmm3,xmm0
- pand xmm3,xmm10
- movdqa xmm0,[rsp]
- psubw xmm0,[rsp+50h]
- movdqa xmm2,[rdx]
- pabsw xmm0,xmm0
- por xmm7,xmm9
- movdqa xmm9,[rsp+20h]
- pcmpgtw xmm1,xmm0
- pand xmm9,xmm7
- movdqa xmm7,[rsp+20h]
- movdqa xmm0,xmm7
- pandn xmm0,xmm12
- movdqa xmm12,[rsp+110h]
- pand xmm1,xmm10
- movdqa xmm10,[rsp+70h]
- movdqa [rsp+40h],xmm1
- movdqa xmm1,xmm13
- por xmm9,xmm0
- pxor xmm0,xmm0
- por xmm4,xmm6
- movdqa xmm6,xmm7
- punpckhbw xmm2,xmm0
- por xmm15,xmm5
- movdqa xmm5,[rsp+20h]
- movdqa xmm0,xmm3
- psllw xmm2,1
- pandn xmm0,xmm11
- pand xmm6,xmm4
- movdqa xmm4,[rsp]
- paddw xmm2,xmm11
- pand xmm5,xmm15
- movdqa xmm15,[rsp+20h]
- paddw xmm2,xmm11
- paddw xmm2,xmm11
- paddw xmm2,xmm12
- paddw xmm2,[rsp+10h]
- paddw xmm2,[rsp]
- paddw xmm2,xmm10
- psraw xmm2,3
- pand xmm2,xmm3
- por xmm2,xmm0
- pand xmm1,xmm2
- movdqa xmm0,xmm13
- movdqa xmm2,xmm11
- pandn xmm0,xmm11
- paddw xmm2,xmm12
- por xmm1,xmm0
- packuswb xmm9,xmm1
- movdqa xmm0,xmm7
- movdqa xmm7,[rsp+0A0h]
- pandn xmm0,[rsp+0F0h]
- movdqa xmm1,xmm3
- por xmm6,xmm0
- movdqa xmm0,[rsp+10h]
- paddw xmm0,xmm4
- paddw xmm2,xmm0
- paddw xmm2,xmm7
- movdqa xmm0,xmm3
- pandn xmm0,xmm12
- psraw xmm2,2
- pand xmm1,xmm2
- por xmm1,xmm0
- movdqa xmm2,xmm13
- movdqa xmm0,xmm13
- pand xmm2,xmm1
- pandn xmm0,xmm12
- movdqa xmm1,xmm12
- paddw xmm1,[rsp+10h]
- por xmm2,xmm0
- movdqa xmm0,xmm15
- pandn xmm0,[rsp+0B0h]
- paddw xmm1,xmm4
- packuswb xmm6,xmm2
- movdqa xmm2,xmm3
- psllw xmm1,1
- por xmm5,xmm0
- movdqa xmm0,[rsp+80h]
- paddw xmm0,xmm10
- paddw xmm1,xmm0
- paddw xmm11,xmm1
- psraw xmm11,3
- movdqa xmm1,xmm12
- pand xmm2,xmm11
- paddw xmm1,xmm12
- movdqa xmm11,[rsp+80h]
- movdqa xmm0, [rsp+10h]
- por xmm14,[rsp+0E0h]
- paddw xmm0,xmm11
- movdqa xmm4,xmm15
- paddw xmm1,xmm0
- movdqa xmm0,xmm13
- paddw xmm1,xmm7
- psraw xmm1,2
- pandn xmm3,xmm1
- por xmm2,xmm3
- movdqa xmm1,xmm13
- movdqa xmm3,[rsp+10h]
- pandn xmm0,xmm3
- pand xmm1,xmm2
- movdqa xmm2,xmm11
- paddw xmm2,[rsp]
- por xmm1,xmm0
- movdqa xmm0,[rsp+0D0h]
- por xmm0,xmm8
- paddw xmm2,xmm3
- packuswb xmm5,xmm1
- movdqa xmm8,[rsp+40h]
- movdqa xmm1,[rsp+50h]
- movdqa xmm3,xmm8
- pand xmm4,xmm0
- psllw xmm2,1
- movdqa xmm0,xmm15
- pandn xmm0,[rsp+90h]
- por xmm4,xmm0
- movdqa xmm0,xmm12
- paddw xmm0,xmm10
- paddw xmm2,xmm0
- paddw xmm1,xmm2
- movdqa xmm0,[rsp]
- movdqa xmm2,xmm11
- paddw xmm0,xmm12
- movdqa xmm12,[rsp]
- paddw xmm2,xmm11
- paddw xmm2,xmm0
- psraw xmm1,3
- movdqa xmm0,xmm8
- pand xmm3,xmm1
- paddw xmm2,xmm7
- movdqa xmm1,xmm13
- psraw xmm2,2
- pandn xmm0,xmm2
- por xmm3,xmm0
- movdqa xmm2,[rsp+50h]
- movdqa xmm0,xmm13
- pandn xmm0,xmm12
- pand xmm1,xmm3
- paddw xmm2,xmm11
- movdqa xmm3,xmm15
- por xmm1,xmm0
- pand xmm3,xmm14
- movdqa xmm14,[rsp+10h]
- movdqa xmm0,xmm15
- pandn xmm0,[rsp+30h]
- packuswb xmm4,xmm1
- movdqa xmm1,xmm8
- por xmm3,xmm0
- movdqa xmm0,xmm12
- paddw xmm0,xmm14
- paddw xmm2,xmm0
- paddw xmm2,xmm7
- movdqa xmm0,xmm8
- pandn xmm0,xmm11
- psraw xmm2,2
- pand xmm1,xmm2
- por xmm1,xmm0
- movdqa xmm2,xmm13
- movdqa xmm0,xmm13
- pandn xmm0,xmm11
- pand xmm2,xmm1
- movdqa xmm1,xmm15
- por xmm2,xmm0
- packuswb xmm3,xmm2
- movdqa xmm0,[rsp+100h]
- por xmm0,[rsp+120h]
- pand xmm1,xmm0
- movdqa xmm2,[rcx+rbp]
- movdqa xmm7,[rsp+50h]
- pandn xmm15,[rsp+60h]
- lea r11,[rsp+1D8h]
- pxor xmm0,xmm0
- por xmm1,xmm15
- movaps xmm15,[r11-0A8h]
- movdqa [rdi],xmm9
- movaps xmm9,[r11-48h]
- punpckhbw xmm2,xmm0
- psllw xmm2,1
- paddw xmm2,xmm7
- paddw xmm2,xmm7
- movdqa [rbx],xmm6
- movaps xmm6,[r11-18h]
- paddw xmm2,xmm7
- paddw xmm2,xmm11
- movaps xmm11,[r11-68h]
- paddw xmm2,xmm12
- movaps xmm12,[r11-78h]
- paddw xmm2,xmm14
- paddw xmm2,xmm10
- psraw xmm2,3
- movaps xmm10,[r11-58h]
- movaps xmm14,[r11-98h]
- movdqa xmm0,xmm13
- pand xmm2,xmm8
- pandn xmm8,xmm7
- pandn xmm13,xmm7
- por xmm2,xmm8
- movaps xmm7,[r11-28h]
- movaps xmm8,[r11-38h]
- movdqa [r8],xmm5
- pand xmm0,xmm2
- por xmm0,xmm13
- packuswb xmm1,xmm0
- movaps xmm13,[r11-88h]
- movdqa [rbp],xmm4
- movdqa [r10+rbp],xmm3
- movdqa [rsi+rbp],xmm1
- mov rsp,r11
- pop rbp
- pop rbx
+ sub rsp,1D8h
+ movaps [rax-38h],xmm6
+ movaps [rax-48h],xmm7
+ movaps [rax-58h],xmm8
+ pxor xmm1,xmm1
+ movsxd r10,edx
+ mov rbp,rcx
+ mov r11d,r8d
+ mov rdx,rcx
+ mov rdi,rbp
+ mov rbx,rbp
+ movdqa xmm5,[rbp]
+ movaps [rax-68h],xmm9
+ movaps [rax-78h],xmm10
+ punpcklbw xmm5,xmm1
+ movaps [rax-88h],xmm11
+ movaps [rax-98h],xmm12
+ movaps [rax-0A8h],xmm13
+ movaps [rax-0B8h],xmm14
+ movdqa xmm14,[r10+rbp]
+ movaps [rax-0C8h],xmm15
+ lea eax,[r10*4]
+ movsxd r8,eax
+ lea eax,[r10+r10*2]
+ movsxd rcx,eax
+ lea eax,[r10+r10]
+ sub rdx,r8
+ punpcklbw xmm14,xmm1
+ movdqa [rsp+90h],xmm5
+ movdqa [rsp+30h],xmm14
+ movsxd rsi,eax
+ movsx eax,r11w
+ sub rdi,rcx
+ sub rbx,rsi
+ mov r8,rbp
+ sub r8,r10
+ movd xmm0,eax
+ movsx eax,r9w
+ movdqa xmm12,[rdi]
+ movdqa xmm6, [rsi+rbp]
+ movdqa xmm13,[rbx]
+ punpcklwd xmm0,xmm0
+ pshufd xmm11,xmm0,0
+ punpcklbw xmm13,xmm1
+ punpcklbw xmm6,xmm1
+ movdqa xmm8,[r8]
+ movd xmm0,eax
+ movdqa xmm10,xmm11
+ mov eax,2
+ punpcklbw xmm8,xmm1
+ punpcklbw xmm12,xmm1
+ cwde
+ punpcklwd xmm0,xmm0
+ psraw xmm10,2
+ movdqa xmm1,xmm8
+ movdqa [rsp+0F0h],xmm13
+ movdqa [rsp+0B0h],xmm8
+ pshufd xmm7,xmm0,0
+ psubw xmm1,xmm13
+ movdqa xmm0,xmm5
+ movdqa xmm4,xmm7
+ movdqa xmm2,xmm7
+ psubw xmm0,xmm8
+ pabsw xmm3,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm5
+ movdqa [rsp+40h],xmm7
+ movdqa [rsp+60h],xmm6
+ pcmpgtw xmm4,xmm0
+ psubw xmm1,xmm14
+ pabsw xmm0,xmm1
+ pcmpgtw xmm2,xmm0
+ pand xmm4,xmm2
+ movdqa xmm0,xmm11
+ pcmpgtw xmm0,xmm3
+ pand xmm4,xmm0
+ movd xmm0,eax
+ movdqa [rsp+20h],xmm4
+ punpcklwd xmm0,xmm0
+ pshufd xmm2,xmm0,0
+ paddw xmm10,xmm2
+ movdqa [rsp+0A0h],xmm2
+ movdqa xmm15,xmm7
+ pxor xmm4,xmm4
+ movdqa xmm0,xmm8
+ psubw xmm0,xmm12
+ mov eax,4
+ pabsw xmm0,xmm0
+ movdqa xmm1,xmm10
+ cwde
+ pcmpgtw xmm15,xmm0
+ pcmpgtw xmm1,xmm3
+ movdqa xmm3,xmm7
+ movdqa xmm7,[rdx]
+ movdqa xmm0,xmm5
+ psubw xmm0,xmm6
+ pand xmm15,xmm1
+ punpcklbw xmm7,xmm4
+ movdqa xmm9,xmm15
+ pabsw xmm0,xmm0
+ psllw xmm7,1
+ pandn xmm9,xmm12
+ pcmpgtw xmm3,xmm0
+ paddw xmm7,xmm12
+ movd xmm0,eax
+ pand xmm3,xmm1
+ paddw xmm7,xmm12
+ punpcklwd xmm0,xmm0
+ paddw xmm7,xmm12
+ pshufd xmm1,xmm0,0
+ paddw xmm7,xmm13
+ movdqa xmm0,xmm3
+ pandn xmm0,xmm6
+ paddw xmm7,xmm8
+ movdqa [rsp+70h],xmm1
+ paddw xmm7,xmm5
+ movdqa [rsp+120h],xmm0
+ movdqa xmm0,[rcx+rbp]
+ punpcklbw xmm0,xmm4
+ paddw xmm7,xmm1
+ movdqa xmm4,xmm15
+ psllw xmm0,1
+ psraw xmm7,3
+ paddw xmm0,xmm6
+ pand xmm7,xmm15
+ paddw xmm0,xmm6
+ paddw xmm0,xmm6
+ paddw xmm0,xmm14
+ movdqa xmm6,xmm15
+ paddw xmm0,xmm5
+ pandn xmm6,xmm13
+ paddw xmm0,xmm8
+ paddw xmm0,xmm1
+ psraw xmm0,3
+ movdqa xmm1,xmm12
+ paddw xmm1,xmm13
+ pand xmm0,xmm3
+ movdqa [rsp+100h],xmm0
+ movdqa xmm0,xmm8
+ paddw xmm0,xmm5
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm3
+ paddw xmm1,xmm2
+ psraw xmm1,2
+ pandn xmm0,xmm14
+ pand xmm4,xmm1
+ movdqa [rsp+0E0h],xmm0
+ movdqa xmm0,xmm5
+ paddw xmm0,xmm8
+ movdqa xmm1,[rsp+60h]
+ paddw xmm1,xmm14
+ movdqa xmm14,xmm3
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm8
+ paddw xmm0,[rsp+30h]
+ paddw xmm1,xmm2
+ psraw xmm1,2
+ pand xmm14,xmm1
+ movdqa xmm1,xmm13
+ paddw xmm1,xmm13
+ paddw xmm1,xmm0
+ paddw xmm1,xmm2
+ psraw xmm1,2
+ movdqa xmm0,[rsp+30h]
+ movdqa xmm2,xmm13
+ movdqa xmm5,xmm15
+ paddw xmm0,[rsp+70h]
+ pandn xmm5,xmm1
+ paddw xmm2,xmm8
+ movdqa xmm8,[rsp+90h]
+ movdqa xmm1,xmm12
+ paddw xmm2,xmm8
+ psllw xmm2,1
+ paddw xmm2,xmm0
+ paddw xmm1,xmm2
+ movdqa xmm0,xmm8
+ movdqa xmm8,xmm3
+ movdqa xmm2,[rsp+30h]
+ paddw xmm0,xmm13
+ psraw xmm1,3
+ pand xmm15,xmm1
+ movdqa xmm1,xmm2
+ paddw xmm1,xmm2
+ paddw xmm2,[rsp+90h]
+ paddw xmm2,[rsp+0B0h]
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm13
+ movdqa xmm13,[r8]
+ paddw xmm0, [rsp+70h]
+ paddw xmm1, [rsp+0A0h]
+ psllw xmm2,1
+ paddw xmm2,xmm0
+ psraw xmm1,2
+ movdqa xmm0, [rdi]
+ pandn xmm8,xmm1
+ movdqa xmm1, [rsp+60h]
+ paddw xmm1,xmm2
+ movdqa xmm2, [rbx]
+ psraw xmm1,3
+ pand xmm3,xmm1
+ movdqa xmm1, [rbp]
+ movdqa [rsp+0D0h],xmm3
+ pxor xmm3,xmm3
+ punpckhbw xmm0,xmm3
+ punpckhbw xmm1,xmm3
+ punpckhbw xmm13,xmm3
+ movdqa [rsp+0C0h],xmm0
+ movdqa xmm0,[r10+rbp]
+ movdqa [rsp],xmm1
+ punpckhbw xmm0,xmm3
+ punpckhbw xmm2,xmm3
+ movdqa [rsp+80h],xmm0
+ movdqa xmm0,[rsi+rbp]
+ movdqa [rsp+10h],xmm13
+ punpckhbw xmm0,xmm3
+ movdqa [rsp+50h],xmm0
+ movdqa xmm0,xmm1
+ movdqa xmm1,xmm13
+ psubw xmm0,xmm13
+ psubw xmm1,xmm2
+ pabsw xmm3,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,[rsp]
+ movdqa xmm13,[rsp+40h]
+ movdqa [rsp+110h],xmm2
+ psubw xmm1, [rsp+80h]
+ pcmpgtw xmm13,xmm0
+ pcmpgtw xmm11,xmm3
+ pabsw xmm0,xmm1
+ pcmpgtw xmm10,xmm3
+ movdqa xmm1, [rsp+40h]
+ movdqa xmm2,xmm1
+ movdqa xmm3,xmm1
+ pcmpgtw xmm2,xmm0
+ movdqa xmm0, [rsp+10h]
+ pand xmm13,xmm2
+ pand xmm13,xmm11
+ movdqa xmm11,[rsp+0C0h]
+ psubw xmm0,xmm11
+ pabsw xmm0,xmm0
+ pcmpgtw xmm3,xmm0
+ pand xmm3,xmm10
+ movdqa xmm0,[rsp]
+ psubw xmm0,[rsp+50h]
+ movdqa xmm2,[rdx]
+ pabsw xmm0,xmm0
+ por xmm7,xmm9
+ movdqa xmm9,[rsp+20h]
+ pcmpgtw xmm1,xmm0
+ pand xmm9,xmm7
+ movdqa xmm7,[rsp+20h]
+ movdqa xmm0,xmm7
+ pandn xmm0,xmm12
+ movdqa xmm12,[rsp+110h]
+ pand xmm1,xmm10
+ movdqa xmm10,[rsp+70h]
+ movdqa [rsp+40h],xmm1
+ movdqa xmm1,xmm13
+ por xmm9,xmm0
+ pxor xmm0,xmm0
+ por xmm4,xmm6
+ movdqa xmm6,xmm7
+ punpckhbw xmm2,xmm0
+ por xmm15,xmm5
+ movdqa xmm5,[rsp+20h]
+ movdqa xmm0,xmm3
+ psllw xmm2,1
+ pandn xmm0,xmm11
+ pand xmm6,xmm4
+ movdqa xmm4,[rsp]
+ paddw xmm2,xmm11
+ pand xmm5,xmm15
+ movdqa xmm15,[rsp+20h]
+ paddw xmm2,xmm11
+ paddw xmm2,xmm11
+ paddw xmm2,xmm12
+ paddw xmm2,[rsp+10h]
+ paddw xmm2,[rsp]
+ paddw xmm2,xmm10
+ psraw xmm2,3
+ pand xmm2,xmm3
+ por xmm2,xmm0
+ pand xmm1,xmm2
+ movdqa xmm0,xmm13
+ movdqa xmm2,xmm11
+ pandn xmm0,xmm11
+ paddw xmm2,xmm12
+ por xmm1,xmm0
+ packuswb xmm9,xmm1
+ movdqa xmm0,xmm7
+ movdqa xmm7,[rsp+0A0h]
+ pandn xmm0,[rsp+0F0h]
+ movdqa xmm1,xmm3
+ por xmm6,xmm0
+ movdqa xmm0,[rsp+10h]
+ paddw xmm0,xmm4
+ paddw xmm2,xmm0
+ paddw xmm2,xmm7
+ movdqa xmm0,xmm3
+ pandn xmm0,xmm12
+ psraw xmm2,2
+ pand xmm1,xmm2
+ por xmm1,xmm0
+ movdqa xmm2,xmm13
+ movdqa xmm0,xmm13
+ pand xmm2,xmm1
+ pandn xmm0,xmm12
+ movdqa xmm1,xmm12
+ paddw xmm1,[rsp+10h]
+ por xmm2,xmm0
+ movdqa xmm0,xmm15
+ pandn xmm0,[rsp+0B0h]
+ paddw xmm1,xmm4
+ packuswb xmm6,xmm2
+ movdqa xmm2,xmm3
+ psllw xmm1,1
+ por xmm5,xmm0
+ movdqa xmm0,[rsp+80h]
+ paddw xmm0,xmm10
+ paddw xmm1,xmm0
+ paddw xmm11,xmm1
+ psraw xmm11,3
+ movdqa xmm1,xmm12
+ pand xmm2,xmm11
+ paddw xmm1,xmm12
+ movdqa xmm11,[rsp+80h]
+ movdqa xmm0, [rsp+10h]
+ por xmm14,[rsp+0E0h]
+ paddw xmm0,xmm11
+ movdqa xmm4,xmm15
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm13
+ paddw xmm1,xmm7
+ psraw xmm1,2
+ pandn xmm3,xmm1
+ por xmm2,xmm3
+ movdqa xmm1,xmm13
+ movdqa xmm3,[rsp+10h]
+ pandn xmm0,xmm3
+ pand xmm1,xmm2
+ movdqa xmm2,xmm11
+ paddw xmm2,[rsp]
+ por xmm1,xmm0
+ movdqa xmm0,[rsp+0D0h]
+ por xmm0,xmm8
+ paddw xmm2,xmm3
+ packuswb xmm5,xmm1
+ movdqa xmm8,[rsp+40h]
+ movdqa xmm1,[rsp+50h]
+ movdqa xmm3,xmm8
+ pand xmm4,xmm0
+ psllw xmm2,1
+ movdqa xmm0,xmm15
+ pandn xmm0,[rsp+90h]
+ por xmm4,xmm0
+ movdqa xmm0,xmm12
+ paddw xmm0,xmm10
+ paddw xmm2,xmm0
+ paddw xmm1,xmm2
+ movdqa xmm0,[rsp]
+ movdqa xmm2,xmm11
+ paddw xmm0,xmm12
+ movdqa xmm12,[rsp]
+ paddw xmm2,xmm11
+ paddw xmm2,xmm0
+ psraw xmm1,3
+ movdqa xmm0,xmm8
+ pand xmm3,xmm1
+ paddw xmm2,xmm7
+ movdqa xmm1,xmm13
+ psraw xmm2,2
+ pandn xmm0,xmm2
+ por xmm3,xmm0
+ movdqa xmm2,[rsp+50h]
+ movdqa xmm0,xmm13
+ pandn xmm0,xmm12
+ pand xmm1,xmm3
+ paddw xmm2,xmm11
+ movdqa xmm3,xmm15
+ por xmm1,xmm0
+ pand xmm3,xmm14
+ movdqa xmm14,[rsp+10h]
+ movdqa xmm0,xmm15
+ pandn xmm0,[rsp+30h]
+ packuswb xmm4,xmm1
+ movdqa xmm1,xmm8
+ por xmm3,xmm0
+ movdqa xmm0,xmm12
+ paddw xmm0,xmm14
+ paddw xmm2,xmm0
+ paddw xmm2,xmm7
+ movdqa xmm0,xmm8
+ pandn xmm0,xmm11
+ psraw xmm2,2
+ pand xmm1,xmm2
+ por xmm1,xmm0
+ movdqa xmm2,xmm13
+ movdqa xmm0,xmm13
+ pandn xmm0,xmm11
+ pand xmm2,xmm1
+ movdqa xmm1,xmm15
+ por xmm2,xmm0
+ packuswb xmm3,xmm2
+ movdqa xmm0,[rsp+100h]
+ por xmm0,[rsp+120h]
+ pand xmm1,xmm0
+ movdqa xmm2,[rcx+rbp]
+ movdqa xmm7,[rsp+50h]
+ pandn xmm15,[rsp+60h]
+ lea r11,[rsp+1D8h]
+ pxor xmm0,xmm0
+ por xmm1,xmm15
+ movaps xmm15,[r11-0A8h]
+ movdqa [rdi],xmm9
+ movaps xmm9,[r11-48h]
+ punpckhbw xmm2,xmm0
+ psllw xmm2,1
+ paddw xmm2,xmm7
+ paddw xmm2,xmm7
+ movdqa [rbx],xmm6
+ movaps xmm6,[r11-18h]
+ paddw xmm2,xmm7
+ paddw xmm2,xmm11
+ movaps xmm11,[r11-68h]
+ paddw xmm2,xmm12
+ movaps xmm12,[r11-78h]
+ paddw xmm2,xmm14
+ paddw xmm2,xmm10
+ psraw xmm2,3
+ movaps xmm10,[r11-58h]
+ movaps xmm14,[r11-98h]
+ movdqa xmm0,xmm13
+ pand xmm2,xmm8
+ pandn xmm8,xmm7
+ pandn xmm13,xmm7
+ por xmm2,xmm8
+ movaps xmm7,[r11-28h]
+ movaps xmm8,[r11-38h]
+ movdqa [r8],xmm5
+ pand xmm0,xmm2
+ por xmm0,xmm13
+ packuswb xmm1,xmm0
+ movaps xmm13,[r11-88h]
+ movdqa [rbp],xmm4
+ movdqa [r10+rbp],xmm3
+ movdqa [rsi+rbp],xmm1
+ mov rsp,r11
+ pop rbp
+ pop rbx
ret
WELS_EXTERN DeblockChromaLt4V_sse2
-ALIGN 16
-DeblockChromaLt4V_sse2:
- mov rax,rsp
- push rbx
- push rbp
+ALIGN 16
+DeblockChromaLt4V_sse2:
+ mov rax,rsp
+ push rbx
+ push rbp
mov r10, rdx
mov r11, rcx
mov rcx, rdi
- mov rdx, rsi
+ mov rdx, rsi
mov rsi, r10
mov r10, r9
mov rbp, r8
mov r8, rsi
mov r9, r11
- sub rsp,0C8h
- pxor xmm1,xmm1
- mov rbx,rcx
- movsxd r11,r8d
- movsx ecx,byte [r10]
- movsx r8d,byte [r10+2]
- mov rdi,rdx
- movq xmm2,[rbx]
- movq xmm9,[r11+rbx]
- movsx edx,byte [r10+1]
- mov word [rsp+2],cx
- mov word [rsp],cx
- movsx eax,byte [r10+3]
- mov word [rsp+6],dx
- mov word [rsp+4],dx
- movdqa xmm11,xmm1
- mov word [rsp+0Eh],ax
- mov word [rsp+0Ch],ax
- lea eax,[r11+r11]
- movsxd rcx,eax
- mov rax,rbx
- mov rdx,rdi
- sub rax,rcx
- mov word [rsp+0Ah],r8w
- mov word [rsp+8],r8w
- movdqa xmm6,[rsp]
- movdqa xmm7,xmm6
- movq xmm13, [rax]
- mov rax,rdi
- sub rax,rcx
- mov rcx,rbx
- pcmpgtw xmm7,xmm1
- psubw xmm11,xmm6
- sub rcx,r11
- sub rdx,r11
- movq xmm0,[rax]
- movsx eax,r9w
- movq xmm15,[rcx]
- punpcklqdq xmm13,xmm0
- movq xmm0, [rdx]
- movdqa xmm4,xmm13
- punpcklqdq xmm15,xmm0
- movq xmm0, [rdi]
- punpcklbw xmm4,xmm1
- movdqa xmm12,xmm15
- punpcklqdq xmm2,xmm0
- movq xmm0, [r11+rdi]
- punpcklbw xmm12,xmm1
- movdqa xmm14,xmm2
- punpcklqdq xmm9,xmm0
- punpckhbw xmm2,xmm1
- punpcklbw xmm14,xmm1
- movd xmm0,eax
+ sub rsp,0C8h
+ pxor xmm1,xmm1
+ mov rbx,rcx
+ movsxd r11,r8d
+ movsx ecx,byte [r10]
+ movsx r8d,byte [r10+2]
+ mov rdi,rdx
+ movq xmm2,[rbx]
+ movq xmm9,[r11+rbx]
+ movsx edx,byte [r10+1]
+ mov word [rsp+2],cx
+ mov word [rsp],cx
+ movsx eax,byte [r10+3]
+ mov word [rsp+6],dx
+ mov word [rsp+4],dx
+ movdqa xmm11,xmm1
+ mov word [rsp+0Eh],ax
+ mov word [rsp+0Ch],ax
+ lea eax,[r11+r11]
+ movsxd rcx,eax
+ mov rax,rbx
+ mov rdx,rdi
+ sub rax,rcx
+ mov word [rsp+0Ah],r8w
+ mov word [rsp+8],r8w
+ movdqa xmm6,[rsp]
+ movdqa xmm7,xmm6
+ movq xmm13, [rax]
+ mov rax,rdi
+ sub rax,rcx
+ mov rcx,rbx
+ pcmpgtw xmm7,xmm1
+ psubw xmm11,xmm6
+ sub rcx,r11
+ sub rdx,r11
+ movq xmm0,[rax]
+ movsx eax,r9w
+ movq xmm15,[rcx]
+ punpcklqdq xmm13,xmm0
+ movq xmm0, [rdx]
+ movdqa xmm4,xmm13
+ punpcklqdq xmm15,xmm0
+ movq xmm0, [rdi]
+ punpcklbw xmm4,xmm1
+ movdqa xmm12,xmm15
+ punpcklqdq xmm2,xmm0
+ movq xmm0, [r11+rdi]
+ punpcklbw xmm12,xmm1
+ movdqa xmm14,xmm2
+ punpcklqdq xmm9,xmm0
+ punpckhbw xmm2,xmm1
+ punpcklbw xmm14,xmm1
+ movd xmm0,eax
mov eax, ebp ; iBeta
- punpckhbw xmm13,xmm1
- punpckhbw xmm15,xmm1
- movdqa xmm3,xmm9
- movdqa [rsp+10h],xmm2
- punpcklwd xmm0,xmm0
- punpckhbw xmm9,xmm1
- punpcklbw xmm3,xmm1
- movdqa xmm1,xmm14
- pshufd xmm10,xmm0,0
- movd xmm0,eax
- mov eax,4
- cwde
- punpcklwd xmm0,xmm0
- pshufd xmm8,xmm0,0
- movd xmm0,eax
- punpcklwd xmm0,xmm0
- pshufd xmm5,xmm0,0
- psubw xmm1,xmm12
- movdqa xmm2,xmm10
- lea r11,[rsp+0C8h]
- psllw xmm1,2
- movdqa xmm0,xmm4
- psubw xmm4,xmm12
- psubw xmm0,xmm3
- psubw xmm3,xmm14
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm11
- psraw xmm1,3
- pmaxsw xmm0,xmm1
- pminsw xmm6,xmm0
- movdqa xmm1,xmm8
- movdqa xmm0,xmm12
- psubw xmm0,xmm14
- pabsw xmm0,xmm0
- pcmpgtw xmm2,xmm0
- pabsw xmm0,xmm4
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm3
- movdqa xmm3,[rsp]
- pand xmm2,xmm1
- movdqa xmm1,xmm8
- pcmpgtw xmm1,xmm0
- movdqa xmm0,xmm13
- pand xmm2,xmm1
- psubw xmm0,xmm9
- psubw xmm13,xmm15
- pand xmm2,xmm7
- pand xmm6,xmm2
- paddw xmm12,xmm6
- psubw xmm14,xmm6
- movdqa xmm2,[rsp+10h]
- movaps xmm6,[r11-18h]
- movdqa xmm1,xmm2
- psubw xmm1,xmm15
- psubw xmm9,xmm2
- psllw xmm1,2
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm15
- psubw xmm0,xmm2
- psraw xmm1,3
- pmaxsw xmm11,xmm1
- pabsw xmm0,xmm0
- movdqa xmm1,xmm8
- pcmpgtw xmm10,xmm0
- pabsw xmm0,xmm13
- pminsw xmm3,xmm11
- movaps xmm11,[r11-68h]
- movaps xmm13,[rsp+40h]
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm9
- movaps xmm9, [r11-48h]
- pand xmm10,xmm1
- pcmpgtw xmm8,xmm0
- pand xmm10,xmm8
- pand xmm10,xmm7
- movaps xmm8,[r11-38h]
- movaps xmm7,[r11-28h]
- pand xmm3,xmm10
- paddw xmm15,xmm3
- psubw xmm2,xmm3
- movaps xmm10,[r11-58h]
- packuswb xmm12,xmm15
- movaps xmm15,[rsp+20h]
- packuswb xmm14,xmm2
- movq [rcx],xmm12
- movq [rbx],xmm14
- psrldq xmm12,8
- psrldq xmm14,8
- movq [rdx],xmm12
- movaps xmm12,[r11-78h]
- movq [rdi],xmm14
- movaps xmm14,[rsp+30h]
- mov rsp,r11
- pop rbp
- pop rbx
+ punpckhbw xmm13,xmm1
+ punpckhbw xmm15,xmm1
+ movdqa xmm3,xmm9
+ movdqa [rsp+10h],xmm2
+ punpcklwd xmm0,xmm0
+ punpckhbw xmm9,xmm1
+ punpcklbw xmm3,xmm1
+ movdqa xmm1,xmm14
+ pshufd xmm10,xmm0,0
+ movd xmm0,eax
+ mov eax,4
+ cwde
+ punpcklwd xmm0,xmm0
+ pshufd xmm8,xmm0,0
+ movd xmm0,eax
+ punpcklwd xmm0,xmm0
+ pshufd xmm5,xmm0,0
+ psubw xmm1,xmm12
+ movdqa xmm2,xmm10
+ lea r11,[rsp+0C8h]
+ psllw xmm1,2
+ movdqa xmm0,xmm4
+ psubw xmm4,xmm12
+ psubw xmm0,xmm3
+ psubw xmm3,xmm14
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm11
+ psraw xmm1,3
+ pmaxsw xmm0,xmm1
+ pminsw xmm6,xmm0
+ movdqa xmm1,xmm8
+ movdqa xmm0,xmm12
+ psubw xmm0,xmm14
+ pabsw xmm0,xmm0
+ pcmpgtw xmm2,xmm0
+ pabsw xmm0,xmm4
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm3
+ movdqa xmm3,[rsp]
+ pand xmm2,xmm1
+ movdqa xmm1,xmm8
+ pcmpgtw xmm1,xmm0
+ movdqa xmm0,xmm13
+ pand xmm2,xmm1
+ psubw xmm0,xmm9
+ psubw xmm13,xmm15
+ pand xmm2,xmm7
+ pand xmm6,xmm2
+ paddw xmm12,xmm6
+ psubw xmm14,xmm6
+ movdqa xmm2,[rsp+10h]
+ movaps xmm6,[r11-18h]
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm15
+ psubw xmm9,xmm2
+ psllw xmm1,2
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm15
+ psubw xmm0,xmm2
+ psraw xmm1,3
+ pmaxsw xmm11,xmm1
+ pabsw xmm0,xmm0
+ movdqa xmm1,xmm8
+ pcmpgtw xmm10,xmm0
+ pabsw xmm0,xmm13
+ pminsw xmm3,xmm11
+ movaps xmm11,[r11-68h]
+ movaps xmm13,[rsp+40h]
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm9
+ movaps xmm9, [r11-48h]
+ pand xmm10,xmm1
+ pcmpgtw xmm8,xmm0
+ pand xmm10,xmm8
+ pand xmm10,xmm7
+ movaps xmm8,[r11-38h]
+ movaps xmm7,[r11-28h]
+ pand xmm3,xmm10
+ paddw xmm15,xmm3
+ psubw xmm2,xmm3
+ movaps xmm10,[r11-58h]
+ packuswb xmm12,xmm15
+ movaps xmm15,[rsp+20h]
+ packuswb xmm14,xmm2
+ movq [rcx],xmm12
+ movq [rbx],xmm14
+ psrldq xmm12,8
+ psrldq xmm14,8
+ movq [rdx],xmm12
+ movaps xmm12,[r11-78h]
+ movq [rdi],xmm14
+ movaps xmm14,[rsp+30h]
+ mov rsp,r11
+ pop rbp
+ pop rbx
ret
WELS_EXTERN DeblockChromaEq4V_sse2
ALIGN 16
DeblockChromaEq4V_sse2:
- mov rax,rsp
- push rbx
+ mov rax,rsp
+ push rbx
push rbp
mov rbp, r8
@@ -2545,143 +2545,143 @@
mov r9, rcx
mov rcx, rdi
mov rdx, rsi
-
- sub rsp,90h
- pxor xmm1,xmm1
- mov r11,rcx
- mov rbx,rdx
- mov r10d,r9d
- movq xmm13,[r11]
- lea eax,[r8+r8]
- movsxd r9,eax
- mov rax,rcx
- sub rax,r9
- movq xmm14,[rax]
- mov rax,rdx
- sub rax,r9
- movq xmm0,[rax]
- movsxd rax,r8d
- sub rcx,rax
- sub rdx,rax
- movq xmm12,[rax+r11]
- movq xmm10,[rcx]
- punpcklqdq xmm14,xmm0
- movdqa xmm8,xmm14
- movq xmm0,[rdx]
- punpcklbw xmm8,xmm1
- punpckhbw xmm14,xmm1
- punpcklqdq xmm10,xmm0
- movq xmm0,[rbx]
- movdqa xmm5,xmm10
- punpcklqdq xmm13,xmm0
- movq xmm0, [rax+rbx]
- punpcklbw xmm5,xmm1
- movsx eax,r10w
- movdqa xmm9,xmm13
- punpcklqdq xmm12,xmm0
- punpcklbw xmm9,xmm1
- punpckhbw xmm10,xmm1
- movd xmm0,eax
+
+ sub rsp,90h
+ pxor xmm1,xmm1
+ mov r11,rcx
+ mov rbx,rdx
+ mov r10d,r9d
+ movq xmm13,[r11]
+ lea eax,[r8+r8]
+ movsxd r9,eax
+ mov rax,rcx
+ sub rax,r9
+ movq xmm14,[rax]
+ mov rax,rdx
+ sub rax,r9
+ movq xmm0,[rax]
+ movsxd rax,r8d
+ sub rcx,rax
+ sub rdx,rax
+ movq xmm12,[rax+r11]
+ movq xmm10,[rcx]
+ punpcklqdq xmm14,xmm0
+ movdqa xmm8,xmm14
+ movq xmm0,[rdx]
+ punpcklbw xmm8,xmm1
+ punpckhbw xmm14,xmm1
+ punpcklqdq xmm10,xmm0
+ movq xmm0,[rbx]
+ movdqa xmm5,xmm10
+ punpcklqdq xmm13,xmm0
+ movq xmm0, [rax+rbx]
+ punpcklbw xmm5,xmm1
+ movsx eax,r10w
+ movdqa xmm9,xmm13
+ punpcklqdq xmm12,xmm0
+ punpcklbw xmm9,xmm1
+ punpckhbw xmm10,xmm1
+ movd xmm0,eax
mov eax, ebp ; iBeta
- punpckhbw xmm13,xmm1
- movdqa xmm7,xmm12
- punpcklwd xmm0,xmm0
- punpckhbw xmm12,xmm1
- pshufd xmm11,xmm0,0
- punpcklbw xmm7,xmm1
- movd xmm0,eax
- movdqa xmm1,xmm8
- psubw xmm1,xmm5
- punpcklwd xmm0,xmm0
- movdqa xmm6,xmm11
- pshufd xmm3,xmm0,0
- movdqa xmm0,xmm5
- psubw xmm0,xmm9
- movdqa xmm2,xmm3
- pabsw xmm0,xmm0
- pcmpgtw xmm6,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm3
- pcmpgtw xmm2,xmm0
- pand xmm6,xmm2
- movdqa xmm0,xmm7
- movdqa xmm2,xmm3
- psubw xmm0,xmm9
- pabsw xmm0,xmm0
- pcmpgtw xmm1,xmm0
- pand xmm6,xmm1
- movdqa xmm0,xmm10
- movdqa xmm1,xmm14
- psubw xmm0,xmm13
- psubw xmm1,xmm10
- pabsw xmm0,xmm0
- pcmpgtw xmm11,xmm0
- pabsw xmm0,xmm1
- pcmpgtw xmm2,xmm0
- pand xmm11,xmm2
- movdqa xmm0,xmm12
- movdqa xmm4,xmm6
- movdqa xmm1,xmm8
- mov eax,2
- cwde
- paddw xmm1,xmm8
- psubw xmm0,xmm13
- paddw xmm1,xmm5
- pabsw xmm0,xmm0
- movdqa xmm2,xmm14
- paddw xmm1,xmm7
- pcmpgtw xmm3,xmm0
- paddw xmm2,xmm14
- movd xmm0,eax
- pand xmm11,xmm3
- paddw xmm7,xmm7
- paddw xmm2,xmm10
- punpcklwd xmm0,xmm0
- paddw xmm2,xmm12
- paddw xmm12,xmm12
- pshufd xmm3,xmm0,0
- paddw xmm7,xmm9
- paddw xmm12,xmm13
- movdqa xmm0,xmm6
- paddw xmm1,xmm3
- pandn xmm0,xmm5
- paddw xmm7,xmm8
- psraw xmm1,2
- paddw xmm12,xmm14
- paddw xmm7,xmm3
- ;movaps xmm14,[rsp]
- pand xmm4,xmm1
- paddw xmm12,xmm3
- psraw xmm7,2
- movdqa xmm1,xmm11
- por xmm4,xmm0
- psraw xmm12,2
- paddw xmm2,xmm3
- movdqa xmm0,xmm11
- pandn xmm0,xmm10
- psraw xmm2,2
- pand xmm1,xmm2
- por xmm1,xmm0
- packuswb xmm4,xmm1
- movdqa xmm0,xmm11
- movdqa xmm1,xmm6
- pand xmm1,xmm7
- movq [rcx],xmm4
- pandn xmm6,xmm9
- pandn xmm11,xmm13
- pand xmm0,xmm12
- por xmm1,xmm6
- por xmm0,xmm11
- psrldq xmm4,8
- packuswb xmm1,xmm0
- movq [r11],xmm1
- psrldq xmm1,8
- movq [rdx],xmm4
- lea r11,[rsp+90h]
- movq [rbx],xmm1
- mov rsp,r11
+ punpckhbw xmm13,xmm1
+ movdqa xmm7,xmm12
+ punpcklwd xmm0,xmm0
+ punpckhbw xmm12,xmm1
+ pshufd xmm11,xmm0,0
+ punpcklbw xmm7,xmm1
+ movd xmm0,eax
+ movdqa xmm1,xmm8
+ psubw xmm1,xmm5
+ punpcklwd xmm0,xmm0
+ movdqa xmm6,xmm11
+ pshufd xmm3,xmm0,0
+ movdqa xmm0,xmm5
+ psubw xmm0,xmm9
+ movdqa xmm2,xmm3
+ pabsw xmm0,xmm0
+ pcmpgtw xmm6,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm3
+ pcmpgtw xmm2,xmm0
+ pand xmm6,xmm2
+ movdqa xmm0,xmm7
+ movdqa xmm2,xmm3
+ psubw xmm0,xmm9
+ pabsw xmm0,xmm0
+ pcmpgtw xmm1,xmm0
+ pand xmm6,xmm1
+ movdqa xmm0,xmm10
+ movdqa xmm1,xmm14
+ psubw xmm0,xmm13
+ psubw xmm1,xmm10
+ pabsw xmm0,xmm0
+ pcmpgtw xmm11,xmm0
+ pabsw xmm0,xmm1
+ pcmpgtw xmm2,xmm0
+ pand xmm11,xmm2
+ movdqa xmm0,xmm12
+ movdqa xmm4,xmm6
+ movdqa xmm1,xmm8
+ mov eax,2
+ cwde
+ paddw xmm1,xmm8
+ psubw xmm0,xmm13
+ paddw xmm1,xmm5
+ pabsw xmm0,xmm0
+ movdqa xmm2,xmm14
+ paddw xmm1,xmm7
+ pcmpgtw xmm3,xmm0
+ paddw xmm2,xmm14
+ movd xmm0,eax
+ pand xmm11,xmm3
+ paddw xmm7,xmm7
+ paddw xmm2,xmm10
+ punpcklwd xmm0,xmm0
+ paddw xmm2,xmm12
+ paddw xmm12,xmm12
+ pshufd xmm3,xmm0,0
+ paddw xmm7,xmm9
+ paddw xmm12,xmm13
+ movdqa xmm0,xmm6
+ paddw xmm1,xmm3
+ pandn xmm0,xmm5
+ paddw xmm7,xmm8
+ psraw xmm1,2
+ paddw xmm12,xmm14
+ paddw xmm7,xmm3
+ ;movaps xmm14,[rsp]
+ pand xmm4,xmm1
+ paddw xmm12,xmm3
+ psraw xmm7,2
+ movdqa xmm1,xmm11
+ por xmm4,xmm0
+ psraw xmm12,2
+ paddw xmm2,xmm3
+ movdqa xmm0,xmm11
+ pandn xmm0,xmm10
+ psraw xmm2,2
+ pand xmm1,xmm2
+ por xmm1,xmm0
+ packuswb xmm4,xmm1
+ movdqa xmm0,xmm11
+ movdqa xmm1,xmm6
+ pand xmm1,xmm7
+ movq [rcx],xmm4
+ pandn xmm6,xmm9
+ pandn xmm11,xmm13
+ pand xmm0,xmm12
+ por xmm1,xmm6
+ por xmm0,xmm11
+ psrldq xmm4,8
+ packuswb xmm1,xmm0
+ movq [r11],xmm1
+ psrldq xmm1,8
+ movq [rdx],xmm4
+ lea r11,[rsp+90h]
+ movq [rbx],xmm1
+ mov rsp,r11
pop rbp
- pop rbx
+ pop rbx
ret
@@ -2688,270 +2688,270 @@
WELS_EXTERN DeblockChromaEq4H_sse2
ALIGN 16
DeblockChromaEq4H_sse2:
- mov rax,rsp
- push rbx
- push rbp
+ mov rax,rsp
+ push rbx
+ push rbp
push r12
-
- mov rbp, r8
+
+ mov rbp, r8
mov r8, rdx
mov r9, rcx
mov rcx, rdi
- mov rdx, rsi
+ mov rdx, rsi
mov rdi, rdx
- sub rsp,140h
- lea eax,[r8*4]
- movsxd r10,eax
- mov eax,[rcx-2]
- mov [rsp+10h],eax
- lea rbx,[r10+rdx-2]
- lea r11,[r10+rcx-2]
+ sub rsp,140h
+ lea eax,[r8*4]
+ movsxd r10,eax
+ mov eax,[rcx-2]
+ mov [rsp+10h],eax
+ lea rbx,[r10+rdx-2]
+ lea r11,[r10+rcx-2]
- movdqa xmm5,[rsp+10h]
- movsxd r10,r8d
- mov eax,[r10+rcx-2]
- lea rdx,[r10+r10*2]
- mov [rsp+20h],eax
- mov eax,[rcx+r10*2-2]
- mov [rsp+30h],eax
+ movdqa xmm5,[rsp+10h]
+ movsxd r10,r8d
+ mov eax,[r10+rcx-2]
+ lea rdx,[r10+r10*2]
+ mov [rsp+20h],eax
+ mov eax,[rcx+r10*2-2]
+ mov [rsp+30h],eax
mov eax,[rdx+rcx-2]
- movdqa xmm2,[rsp+20h]
- mov [rsp+40h],eax
- mov eax, [rdi-2]
- movdqa xmm4,[rsp+30h]
- mov [rsp+50h],eax
- mov eax,[r10+rdi-2]
- movdqa xmm3,[rsp+40h]
- mov [rsp+60h],eax
- mov eax,[rdi+r10*2-2]
- punpckldq xmm5,[rsp+50h]
- mov [rsp+70h],eax
- mov eax, [rdx+rdi-2]
- punpckldq xmm2, [rsp+60h]
- mov [rsp+80h],eax
- mov eax,[r11]
- punpckldq xmm4, [rsp+70h]
- mov [rsp+50h],eax
- mov eax,[rbx]
- punpckldq xmm3,[rsp+80h]
- mov [rsp+60h],eax
- mov eax,[r10+r11]
- movdqa xmm0, [rsp+50h]
- punpckldq xmm0, [rsp+60h]
- punpcklqdq xmm5,xmm0
- movdqa [rsp+50h],xmm0
- mov [rsp+50h],eax
- mov eax,[r10+rbx]
- movdqa xmm0,[rsp+50h]
- movdqa xmm1,xmm5
- mov [rsp+60h],eax
- mov eax,[r11+r10*2]
- punpckldq xmm0, [rsp+60h]
- punpcklqdq xmm2,xmm0
- punpcklbw xmm1,xmm2
- punpckhbw xmm5,xmm2
- movdqa [rsp+50h],xmm0
- mov [rsp+50h],eax
- mov eax,[rbx+r10*2]
- movdqa xmm0,[rsp+50h]
- mov [rsp+60h],eax
- mov eax, [rdx+r11]
- movdqa xmm15,xmm1
- punpckldq xmm0,[rsp+60h]
- punpcklqdq xmm4,xmm0
- movdqa [rsp+50h],xmm0
- mov [rsp+50h],eax
- mov eax, [rdx+rbx]
- movdqa xmm0,[rsp+50h]
- mov [rsp+60h],eax
- punpckldq xmm0, [rsp+60h]
- punpcklqdq xmm3,xmm0
- movdqa xmm0,xmm4
- punpcklbw xmm0,xmm3
- punpckhbw xmm4,xmm3
- punpcklwd xmm15,xmm0
- punpckhwd xmm1,xmm0
- movdqa xmm0,xmm5
- movdqa xmm12,xmm15
- punpcklwd xmm0,xmm4
- punpckhwd xmm5,xmm4
- punpckldq xmm12,xmm0
- punpckhdq xmm15,xmm0
- movdqa xmm0,xmm1
- movdqa xmm11,xmm12
- punpckldq xmm0,xmm5
- punpckhdq xmm1,xmm5
- punpcklqdq xmm11,xmm0
- punpckhqdq xmm12,xmm0
- movsx eax,r9w
- movdqa xmm14,xmm15
- punpcklqdq xmm14,xmm1
- punpckhqdq xmm15,xmm1
- pxor xmm1,xmm1
- movd xmm0,eax
- movdqa xmm4,xmm12
- movdqa xmm8,xmm11
- mov eax, ebp ; iBeta
- punpcklwd xmm0,xmm0
- punpcklbw xmm4,xmm1
- punpckhbw xmm12,xmm1
- movdqa xmm9,xmm14
- movdqa xmm7,xmm15
- movdqa xmm10,xmm15
- pshufd xmm13,xmm0,0
- punpcklbw xmm9,xmm1
- punpckhbw xmm14,xmm1
- movdqa xmm6,xmm13
- movd xmm0,eax
- movdqa [rsp],xmm11
- mov eax,2
- cwde
- punpckhbw xmm11,xmm1
- punpckhbw xmm10,xmm1
- punpcklbw xmm7,xmm1
- punpcklwd xmm0,xmm0
- punpcklbw xmm8,xmm1
- pshufd xmm3,xmm0,0
- movdqa xmm1,xmm8
- movdqa xmm0,xmm4
- psubw xmm0,xmm9
- psubw xmm1,xmm4
- movdqa xmm2,xmm3
- pabsw xmm0,xmm0
- pcmpgtw xmm6,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm3
- pcmpgtw xmm2,xmm0
- pand xmm6,xmm2
- movdqa xmm0,xmm7
- movdqa xmm2,xmm3
- psubw xmm0,xmm9
- pabsw xmm0,xmm0
- pcmpgtw xmm1,xmm0
- pand xmm6,xmm1
- movdqa xmm0,xmm12
- movdqa xmm1,xmm11
- psubw xmm0,xmm14
- psubw xmm1,xmm12
- movdqa xmm5,xmm6
- pabsw xmm0,xmm0
- pcmpgtw xmm13,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm8
- pcmpgtw xmm2,xmm0
- paddw xmm1,xmm8
- movdqa xmm0,xmm10
- pand xmm13,xmm2
- psubw xmm0,xmm14
- paddw xmm1,xmm4
- movdqa xmm2,xmm11
- pabsw xmm0,xmm0
- paddw xmm2,xmm11
- paddw xmm1,xmm7
- pcmpgtw xmm3,xmm0
- paddw xmm2,xmm12
- movd xmm0,eax
- pand xmm13,xmm3
- paddw xmm2,xmm10
- punpcklwd xmm0,xmm0
- pshufd xmm3,xmm0,0
- movdqa xmm0,xmm6
- paddw xmm1,xmm3
- pandn xmm0,xmm4
- paddw xmm2,xmm3
- psraw xmm1,2
- pand xmm5,xmm1
- por xmm5,xmm0
- paddw xmm7,xmm7
- paddw xmm10,xmm10
- psraw xmm2,2
- movdqa xmm1,xmm13
- movdqa xmm0,xmm13
- pandn xmm0,xmm12
- pand xmm1,xmm2
- paddw xmm7,xmm9
- por xmm1,xmm0
- paddw xmm10,xmm14
- paddw xmm7,xmm8
- movdqa xmm0,xmm13
- packuswb xmm5,xmm1
- paddw xmm7,xmm3
- paddw xmm10,xmm11
- movdqa xmm1,xmm6
- paddw xmm10,xmm3
- pandn xmm6,xmm9
- psraw xmm7,2
- pand xmm1,xmm7
- psraw xmm10,2
- pandn xmm13,xmm14
- pand xmm0,xmm10
- por xmm1,xmm6
- movdqa xmm6,[rsp]
- movdqa xmm4,xmm6
- por xmm0,xmm13
- punpcklbw xmm4,xmm5
- punpckhbw xmm6,xmm5
- movdqa xmm3,xmm4
- packuswb xmm1,xmm0
- movdqa xmm0,xmm1
- punpckhbw xmm1,xmm15
- punpcklbw xmm0,xmm15
- punpcklwd xmm3,xmm0
- punpckhwd xmm4,xmm0
- movdqa xmm0,xmm6
- movdqa xmm2,xmm3
- punpcklwd xmm0,xmm1
- punpckhwd xmm6,xmm1
- movdqa xmm1,xmm4
- punpckldq xmm2,xmm0
- punpckhdq xmm3,xmm0
- punpckldq xmm1,xmm6
- movdqa xmm0,xmm2
- punpcklqdq xmm0,xmm1
- punpckhdq xmm4,xmm6
- punpckhqdq xmm2,xmm1
- movdqa [rsp+10h],xmm0
- movdqa [rsp+60h],xmm2
- movdqa xmm0,xmm3
- mov eax,[rsp+10h]
- mov [rcx-2],eax
- mov eax,[rsp+60h]
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm3,xmm4
- mov [r10+rcx-2],eax
- movdqa [rsp+20h],xmm0
- mov eax, [rsp+20h]
- movdqa [rsp+70h],xmm3
- mov [rcx+r10*2-2],eax
- mov eax,[rsp+70h]
- mov [rdx+rcx-2],eax
- mov eax,[rsp+18h]
- mov [r11],eax
- mov eax,[rsp+68h]
- mov [r10+r11],eax
- mov eax,[rsp+28h]
- mov [r11+r10*2],eax
- mov eax,[rsp+78h]
- mov [rdx+r11],eax
- mov eax,[rsp+14h]
- mov [rdi-2],eax
- mov eax,[rsp+64h]
- mov [r10+rdi-2],eax
- mov eax,[rsp+24h]
- mov [rdi+r10*2-2],eax
- mov eax, [rsp+74h]
- mov [rdx+rdi-2],eax
- mov eax, [rsp+1Ch]
- mov [rbx],eax
- mov eax, [rsp+6Ch]
- mov [r10+rbx],eax
- mov eax,[rsp+2Ch]
- mov [rbx+r10*2],eax
- mov eax,[rsp+7Ch]
- mov [rdx+rbx],eax
- lea r11,[rsp+140h]
- mov rbx, [r11+28h]
+ movdqa xmm2,[rsp+20h]
+ mov [rsp+40h],eax
+ mov eax, [rdi-2]
+ movdqa xmm4,[rsp+30h]
+ mov [rsp+50h],eax
+ mov eax,[r10+rdi-2]
+ movdqa xmm3,[rsp+40h]
+ mov [rsp+60h],eax
+ mov eax,[rdi+r10*2-2]
+ punpckldq xmm5,[rsp+50h]
+ mov [rsp+70h],eax
+ mov eax, [rdx+rdi-2]
+ punpckldq xmm2, [rsp+60h]
+ mov [rsp+80h],eax
+ mov eax,[r11]
+ punpckldq xmm4, [rsp+70h]
+ mov [rsp+50h],eax
+ mov eax,[rbx]
+ punpckldq xmm3,[rsp+80h]
+ mov [rsp+60h],eax
+ mov eax,[r10+r11]
+ movdqa xmm0, [rsp+50h]
+ punpckldq xmm0, [rsp+60h]
+ punpcklqdq xmm5,xmm0
+ movdqa [rsp+50h],xmm0
+ mov [rsp+50h],eax
+ mov eax,[r10+rbx]
+ movdqa xmm0,[rsp+50h]
+ movdqa xmm1,xmm5
+ mov [rsp+60h],eax
+ mov eax,[r11+r10*2]
+ punpckldq xmm0, [rsp+60h]
+ punpcklqdq xmm2,xmm0
+ punpcklbw xmm1,xmm2
+ punpckhbw xmm5,xmm2
+ movdqa [rsp+50h],xmm0
+ mov [rsp+50h],eax
+ mov eax,[rbx+r10*2]
+ movdqa xmm0,[rsp+50h]
+ mov [rsp+60h],eax
+ mov eax, [rdx+r11]
+ movdqa xmm15,xmm1
+ punpckldq xmm0,[rsp+60h]
+ punpcklqdq xmm4,xmm0
+ movdqa [rsp+50h],xmm0
+ mov [rsp+50h],eax
+ mov eax, [rdx+rbx]
+ movdqa xmm0,[rsp+50h]
+ mov [rsp+60h],eax
+ punpckldq xmm0, [rsp+60h]
+ punpcklqdq xmm3,xmm0
+ movdqa xmm0,xmm4
+ punpcklbw xmm0,xmm3
+ punpckhbw xmm4,xmm3
+ punpcklwd xmm15,xmm0
+ punpckhwd xmm1,xmm0
+ movdqa xmm0,xmm5
+ movdqa xmm12,xmm15
+ punpcklwd xmm0,xmm4
+ punpckhwd xmm5,xmm4
+ punpckldq xmm12,xmm0
+ punpckhdq xmm15,xmm0
+ movdqa xmm0,xmm1
+ movdqa xmm11,xmm12
+ punpckldq xmm0,xmm5
+ punpckhdq xmm1,xmm5
+ punpcklqdq xmm11,xmm0
+ punpckhqdq xmm12,xmm0
+ movsx eax,r9w
+ movdqa xmm14,xmm15
+ punpcklqdq xmm14,xmm1
+ punpckhqdq xmm15,xmm1
+ pxor xmm1,xmm1
+ movd xmm0,eax
+ movdqa xmm4,xmm12
+ movdqa xmm8,xmm11
+ mov eax, ebp ; iBeta
+ punpcklwd xmm0,xmm0
+ punpcklbw xmm4,xmm1
+ punpckhbw xmm12,xmm1
+ movdqa xmm9,xmm14
+ movdqa xmm7,xmm15
+ movdqa xmm10,xmm15
+ pshufd xmm13,xmm0,0
+ punpcklbw xmm9,xmm1
+ punpckhbw xmm14,xmm1
+ movdqa xmm6,xmm13
+ movd xmm0,eax
+ movdqa [rsp],xmm11
+ mov eax,2
+ cwde
+ punpckhbw xmm11,xmm1
+ punpckhbw xmm10,xmm1
+ punpcklbw xmm7,xmm1
+ punpcklwd xmm0,xmm0
+ punpcklbw xmm8,xmm1
+ pshufd xmm3,xmm0,0
+ movdqa xmm1,xmm8
+ movdqa xmm0,xmm4
+ psubw xmm0,xmm9
+ psubw xmm1,xmm4
+ movdqa xmm2,xmm3
+ pabsw xmm0,xmm0
+ pcmpgtw xmm6,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm3
+ pcmpgtw xmm2,xmm0
+ pand xmm6,xmm2
+ movdqa xmm0,xmm7
+ movdqa xmm2,xmm3
+ psubw xmm0,xmm9
+ pabsw xmm0,xmm0
+ pcmpgtw xmm1,xmm0
+ pand xmm6,xmm1
+ movdqa xmm0,xmm12
+ movdqa xmm1,xmm11
+ psubw xmm0,xmm14
+ psubw xmm1,xmm12
+ movdqa xmm5,xmm6
+ pabsw xmm0,xmm0
+ pcmpgtw xmm13,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm8
+ pcmpgtw xmm2,xmm0
+ paddw xmm1,xmm8
+ movdqa xmm0,xmm10
+ pand xmm13,xmm2
+ psubw xmm0,xmm14
+ paddw xmm1,xmm4
+ movdqa xmm2,xmm11
+ pabsw xmm0,xmm0
+ paddw xmm2,xmm11
+ paddw xmm1,xmm7
+ pcmpgtw xmm3,xmm0
+ paddw xmm2,xmm12
+ movd xmm0,eax
+ pand xmm13,xmm3
+ paddw xmm2,xmm10
+ punpcklwd xmm0,xmm0
+ pshufd xmm3,xmm0,0
+ movdqa xmm0,xmm6
+ paddw xmm1,xmm3
+ pandn xmm0,xmm4
+ paddw xmm2,xmm3
+ psraw xmm1,2
+ pand xmm5,xmm1
+ por xmm5,xmm0
+ paddw xmm7,xmm7
+ paddw xmm10,xmm10
+ psraw xmm2,2
+ movdqa xmm1,xmm13
+ movdqa xmm0,xmm13
+ pandn xmm0,xmm12
+ pand xmm1,xmm2
+ paddw xmm7,xmm9
+ por xmm1,xmm0
+ paddw xmm10,xmm14
+ paddw xmm7,xmm8
+ movdqa xmm0,xmm13
+ packuswb xmm5,xmm1
+ paddw xmm7,xmm3
+ paddw xmm10,xmm11
+ movdqa xmm1,xmm6
+ paddw xmm10,xmm3
+ pandn xmm6,xmm9
+ psraw xmm7,2
+ pand xmm1,xmm7
+ psraw xmm10,2
+ pandn xmm13,xmm14
+ pand xmm0,xmm10
+ por xmm1,xmm6
+ movdqa xmm6,[rsp]
+ movdqa xmm4,xmm6
+ por xmm0,xmm13
+ punpcklbw xmm4,xmm5
+ punpckhbw xmm6,xmm5
+ movdqa xmm3,xmm4
+ packuswb xmm1,xmm0
+ movdqa xmm0,xmm1
+ punpckhbw xmm1,xmm15
+ punpcklbw xmm0,xmm15
+ punpcklwd xmm3,xmm0
+ punpckhwd xmm4,xmm0
+ movdqa xmm0,xmm6
+ movdqa xmm2,xmm3
+ punpcklwd xmm0,xmm1
+ punpckhwd xmm6,xmm1
+ movdqa xmm1,xmm4
+ punpckldq xmm2,xmm0
+ punpckhdq xmm3,xmm0
+ punpckldq xmm1,xmm6
+ movdqa xmm0,xmm2
+ punpcklqdq xmm0,xmm1
+ punpckhdq xmm4,xmm6
+ punpckhqdq xmm2,xmm1
+ movdqa [rsp+10h],xmm0
+ movdqa [rsp+60h],xmm2
+ movdqa xmm0,xmm3
+ mov eax,[rsp+10h]
+ mov [rcx-2],eax
+ mov eax,[rsp+60h]
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm3,xmm4
+ mov [r10+rcx-2],eax
+ movdqa [rsp+20h],xmm0
+ mov eax, [rsp+20h]
+ movdqa [rsp+70h],xmm3
+ mov [rcx+r10*2-2],eax
+ mov eax,[rsp+70h]
+ mov [rdx+rcx-2],eax
+ mov eax,[rsp+18h]
+ mov [r11],eax
+ mov eax,[rsp+68h]
+ mov [r10+r11],eax
+ mov eax,[rsp+28h]
+ mov [r11+r10*2],eax
+ mov eax,[rsp+78h]
+ mov [rdx+r11],eax
+ mov eax,[rsp+14h]
+ mov [rdi-2],eax
+ mov eax,[rsp+64h]
+ mov [r10+rdi-2],eax
+ mov eax,[rsp+24h]
+ mov [rdi+r10*2-2],eax
+ mov eax, [rsp+74h]
+ mov [rdx+rdi-2],eax
+ mov eax, [rsp+1Ch]
+ mov [rbx],eax
+ mov eax, [rsp+6Ch]
+ mov [r10+rbx],eax
+ mov eax,[rsp+2Ch]
+ mov [rbx+r10*2],eax
+ mov eax,[rsp+7Ch]
+ mov [rdx+rbx],eax
+ lea r11,[rsp+140h]
+ mov rbx, [r11+28h]
mov rsp,r11
pop r12
pop rbp
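The arithmetic above (paddw chains followed by psraw 2, with a rounding constant of 2 loaded via `mov eax,2`) matches the standard H.264 chroma strong deblocking filter. A minimal scalar sketch of what one lane computes; the function and parameter names here are illustrative, not from this file:

    #include <stdint.h>
    #include <stdlib.h>

    /* Hypothetical scalar model of the chroma "Eq4" (strong) filter step.
     * p1,p0 | q0,q1 are the pixel pairs straddling the edge. */
    static void ChromaEq4Sketch(uint8_t *p1, uint8_t *p0, uint8_t *q0, uint8_t *q1,
                                int iAlpha, int iBeta) {
        if (abs(*p0 - *q0) < iAlpha && abs(*p1 - *p0) < iBeta && abs(*q1 - *q0) < iBeta) {
            /* the paddw x3 + psraw 2 chains above: (2*a + b + c + 2) >> 2 */
            uint8_t np0 = (uint8_t)((2 * *p1 + *p0 + *q1 + 2) >> 2);
            uint8_t nq0 = (uint8_t)((2 * *q1 + *q0 + *p1 + 2) >> 2);
            *p0 = np0;
            *q0 = nq0;
        }
    }

The surrounding punpck*/movdqa blocks only transpose 4-pixel columns into register rows so this row-oriented SIMD filter can run on a vertical edge, then transpose the results back before the dword stores.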
@@ -2962,14 +2962,14 @@
WELS_EXTERN DeblockChromaLt4H_sse2
ALIGN 16
DeblockChromaLt4H_sse2:
- mov rax,rsp
- push rbx
- push rbp
- push r12
+ mov rax,rsp
+ push rbx
+ push rbp
+ push r12
push r13
push r14
- sub rsp,170h
-
+ sub rsp,170h
+
mov r13, r8
mov r14, r9
mov r8, rdx
@@ -2977,275 +2977,275 @@
mov rdx, rdi
mov rcx, rsi
- movsxd rsi,r8d
- lea eax,[r8*4]
- mov r11d,r9d
- movsxd r10,eax
- mov eax, [rcx-2]
- mov r12,rdx
- mov [rsp+40h],eax
- mov eax, [rsi+rcx-2]
- lea rbx,[r10+rcx-2]
- movdqa xmm5,[rsp+40h]
- mov [rsp+50h],eax
- mov eax, [rcx+rsi*2-2]
- lea rbp,[r10+rdx-2]
- movdqa xmm2, [rsp+50h]
- mov [rsp+60h],eax
- lea r10,[rsi+rsi*2]
- mov rdi,rcx
- mov eax,[r10+rcx-2]
- movdqa xmm4,[rsp+60h]
- mov [rsp+70h],eax
- mov eax,[rdx-2]
- mov [rsp+80h],eax
- mov eax, [rsi+rdx-2]
- movdqa xmm3,[rsp+70h]
- mov [rsp+90h],eax
- mov eax,[rdx+rsi*2-2]
- punpckldq xmm5,[rsp+80h]
- mov [rsp+0A0h],eax
- mov eax, [r10+rdx-2]
- punpckldq xmm2,[rsp+90h]
- mov [rsp+0B0h],eax
- mov eax, [rbx]
- punpckldq xmm4,[rsp+0A0h]
- mov [rsp+80h],eax
- mov eax,[rbp]
- punpckldq xmm3,[rsp+0B0h]
- mov [rsp+90h],eax
- mov eax,[rsi+rbx]
- movdqa xmm0,[rsp+80h]
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm5,xmm0
- movdqa [rsp+80h],xmm0
- mov [rsp+80h],eax
- mov eax,[rsi+rbp]
- movdqa xmm0,[rsp+80h]
- movdqa xmm1,xmm5
- mov [rsp+90h],eax
- mov eax,[rbx+rsi*2]
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm2,xmm0
- punpcklbw xmm1,xmm2
- punpckhbw xmm5,xmm2
- movdqa [rsp+80h],xmm0
- mov [rsp+80h],eax
- mov eax,[rbp+rsi*2]
- movdqa xmm0, [rsp+80h]
- mov [rsp+90h],eax
- mov eax,[r10+rbx]
- movdqa xmm7,xmm1
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm4,xmm0
- movdqa [rsp+80h],xmm0
- mov [rsp+80h],eax
- mov eax, [r10+rbp]
- movdqa xmm0,[rsp+80h]
- mov [rsp+90h],eax
- punpckldq xmm0,[rsp+90h]
- punpcklqdq xmm3,xmm0
- movdqa xmm0,xmm4
- punpcklbw xmm0,xmm3
- punpckhbw xmm4,xmm3
- punpcklwd xmm7,xmm0
- punpckhwd xmm1,xmm0
- movdqa xmm0,xmm5
- movdqa xmm6,xmm7
- punpcklwd xmm0,xmm4
- punpckhwd xmm5,xmm4
- punpckldq xmm6,xmm0
- punpckhdq xmm7,xmm0
- movdqa xmm0,xmm1
- punpckldq xmm0,xmm5
+ movsxd rsi,r8d
+ lea eax,[r8*4]
+ mov r11d,r9d
+ movsxd r10,eax
+ mov eax, [rcx-2]
+ mov r12,rdx
+ mov [rsp+40h],eax
+ mov eax, [rsi+rcx-2]
+ lea rbx,[r10+rcx-2]
+ movdqa xmm5,[rsp+40h]
+ mov [rsp+50h],eax
+ mov eax, [rcx+rsi*2-2]
+ lea rbp,[r10+rdx-2]
+ movdqa xmm2, [rsp+50h]
+ mov [rsp+60h],eax
+ lea r10,[rsi+rsi*2]
+ mov rdi,rcx
+ mov eax,[r10+rcx-2]
+ movdqa xmm4,[rsp+60h]
+ mov [rsp+70h],eax
+ mov eax,[rdx-2]
+ mov [rsp+80h],eax
+ mov eax, [rsi+rdx-2]
+ movdqa xmm3,[rsp+70h]
+ mov [rsp+90h],eax
+ mov eax,[rdx+rsi*2-2]
+ punpckldq xmm5,[rsp+80h]
+ mov [rsp+0A0h],eax
+ mov eax, [r10+rdx-2]
+ punpckldq xmm2,[rsp+90h]
+ mov [rsp+0B0h],eax
+ mov eax, [rbx]
+ punpckldq xmm4,[rsp+0A0h]
+ mov [rsp+80h],eax
+ mov eax,[rbp]
+ punpckldq xmm3,[rsp+0B0h]
+ mov [rsp+90h],eax
+ mov eax,[rsi+rbx]
+ movdqa xmm0,[rsp+80h]
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm5,xmm0
+ movdqa [rsp+80h],xmm0
+ mov [rsp+80h],eax
+ mov eax,[rsi+rbp]
+ movdqa xmm0,[rsp+80h]
+ movdqa xmm1,xmm5
+ mov [rsp+90h],eax
+ mov eax,[rbx+rsi*2]
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm2,xmm0
+ punpcklbw xmm1,xmm2
+ punpckhbw xmm5,xmm2
+ movdqa [rsp+80h],xmm0
+ mov [rsp+80h],eax
+ mov eax,[rbp+rsi*2]
+ movdqa xmm0, [rsp+80h]
+ mov [rsp+90h],eax
+ mov eax,[r10+rbx]
+ movdqa xmm7,xmm1
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm4,xmm0
+ movdqa [rsp+80h],xmm0
+ mov [rsp+80h],eax
+ mov eax, [r10+rbp]
+ movdqa xmm0,[rsp+80h]
+ mov [rsp+90h],eax
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm3,xmm0
+ movdqa xmm0,xmm4
+ punpcklbw xmm0,xmm3
+ punpckhbw xmm4,xmm3
+ punpcklwd xmm7,xmm0
+ punpckhwd xmm1,xmm0
+ movdqa xmm0,xmm5
+ movdqa xmm6,xmm7
+ punpcklwd xmm0,xmm4
+ punpckhwd xmm5,xmm4
+ punpckldq xmm6,xmm0
+ punpckhdq xmm7,xmm0
+ movdqa xmm0,xmm1
+ punpckldq xmm0,xmm5
mov rax, r14 ; pTC
- punpckhdq xmm1,xmm5
- movdqa xmm9,xmm6
- punpckhqdq xmm6,xmm0
- punpcklqdq xmm9,xmm0
- movdqa xmm2,xmm7
- movdqa xmm13,xmm6
- movdqa xmm4,xmm9
- movdqa [rsp+10h],xmm9
- punpcklqdq xmm2,xmm1
- punpckhqdq xmm7,xmm1
- pxor xmm1,xmm1
- movsx ecx,byte [rax+3]
- movsx edx,byte [rax+2]
- movsx r8d,byte [rax+1]
- movsx r9d,byte [rax]
- movdqa xmm10,xmm1
- movdqa xmm15,xmm2
- punpckhbw xmm2,xmm1
- punpckhbw xmm6,xmm1
- punpcklbw xmm4,xmm1
- movsx eax,r11w
- mov word [rsp+0Eh],cx
- mov word [rsp+0Ch],cx
- movdqa xmm3,xmm7
- movdqa xmm8,xmm7
- movdqa [rsp+20h],xmm7
- punpcklbw xmm15,xmm1
- punpcklbw xmm13,xmm1
- punpcklbw xmm3,xmm1
- mov word [rsp+0Ah],dx
- mov word [rsp+8],dx
- mov word [rsp+6],r8w
- movd xmm0,eax
- movdqa [rsp+30h],xmm6
- punpckhbw xmm9,xmm1
- punpckhbw xmm8,xmm1
- punpcklwd xmm0,xmm0
+ punpckhdq xmm1,xmm5
+ movdqa xmm9,xmm6
+ punpckhqdq xmm6,xmm0
+ punpcklqdq xmm9,xmm0
+ movdqa xmm2,xmm7
+ movdqa xmm13,xmm6
+ movdqa xmm4,xmm9
+ movdqa [rsp+10h],xmm9
+ punpcklqdq xmm2,xmm1
+ punpckhqdq xmm7,xmm1
+ pxor xmm1,xmm1
+ movsx ecx,byte [rax+3]
+ movsx edx,byte [rax+2]
+ movsx r8d,byte [rax+1]
+ movsx r9d,byte [rax]
+ movdqa xmm10,xmm1
+ movdqa xmm15,xmm2
+ punpckhbw xmm2,xmm1
+ punpckhbw xmm6,xmm1
+ punpcklbw xmm4,xmm1
+ movsx eax,r11w
+ mov word [rsp+0Eh],cx
+ mov word [rsp+0Ch],cx
+ movdqa xmm3,xmm7
+ movdqa xmm8,xmm7
+ movdqa [rsp+20h],xmm7
+ punpcklbw xmm15,xmm1
+ punpcklbw xmm13,xmm1
+ punpcklbw xmm3,xmm1
+ mov word [rsp+0Ah],dx
+ mov word [rsp+8],dx
+ mov word [rsp+6],r8w
+ movd xmm0,eax
+ movdqa [rsp+30h],xmm6
+ punpckhbw xmm9,xmm1
+ punpckhbw xmm8,xmm1
+ punpcklwd xmm0,xmm0
mov eax, r13d ; iBeta
- mov word [rsp+4],r8w
- mov word [rsp+2],r9w
- pshufd xmm12,xmm0,0
- mov word [rsp],r9w
- movd xmm0,eax
- mov eax,4
- cwde
- movdqa xmm14, [rsp]
- movdqa [rsp],xmm2
- movdqa xmm2,xmm12
- punpcklwd xmm0,xmm0
- pshufd xmm11,xmm0,0
- psubw xmm10,xmm14
- movd xmm0,eax
- movdqa xmm7,xmm14
- movdqa xmm6,xmm14
- pcmpgtw xmm7,xmm1
- punpcklwd xmm0,xmm0
- pshufd xmm5,xmm0,0
- movdqa xmm0,xmm4
- movdqa xmm1,xmm15
- psubw xmm4,xmm13
- psubw xmm0,xmm3
- psubw xmm1,xmm13
- psubw xmm3,xmm15
- psllw xmm1,2
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm10
- psraw xmm1,3
- pmaxsw xmm0,xmm1
- pminsw xmm6,xmm0
- movdqa xmm1,xmm11
- movdqa xmm0,xmm13
- psubw xmm0,xmm15
- pabsw xmm0,xmm0
- pcmpgtw xmm2,xmm0
- pabsw xmm0,xmm4
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm3
- pand xmm2,xmm1
- movdqa xmm1,xmm11
- movdqa xmm3,[rsp+30h]
- pcmpgtw xmm1,xmm0
- movdqa xmm0,xmm9
- pand xmm2,xmm1
- psubw xmm0,xmm8
- psubw xmm9,xmm3
- pand xmm2,xmm7
- pand xmm6,xmm2
- psubw xmm15,xmm6
- paddw xmm13,xmm6
- movdqa xmm2,[rsp]
- movdqa xmm1,xmm2
- psubw xmm1,xmm3
- psubw xmm8,xmm2
- psllw xmm1,2
- paddw xmm1,xmm0
- paddw xmm1,xmm5
- movdqa xmm0,xmm3
- movdqa xmm5,[rsp+10h]
- psubw xmm0,xmm2
- psraw xmm1,3
- movdqa xmm4,xmm5
- pabsw xmm0,xmm0
- pmaxsw xmm10,xmm1
- movdqa xmm1,xmm11
- pcmpgtw xmm12,xmm0
- pabsw xmm0,xmm9
- pminsw xmm14,xmm10
- pcmpgtw xmm1,xmm0
- pabsw xmm0,xmm8
- pcmpgtw xmm11,xmm0
- pand xmm12,xmm1
- movdqa xmm1,[rsp+20h]
- pand xmm12,xmm11
- pand xmm12,xmm7
- pand xmm14,xmm12
- paddw xmm3,xmm14
- psubw xmm2,xmm14
- packuswb xmm13,xmm3
- packuswb xmm15,xmm2
- punpcklbw xmm4,xmm13
- punpckhbw xmm5,xmm13
- movdqa xmm0,xmm15
- punpcklbw xmm0,xmm1
- punpckhbw xmm15,xmm1
- movdqa xmm3,xmm4
- punpcklwd xmm3,xmm0
- punpckhwd xmm4,xmm0
- movdqa xmm0,xmm5
- movdqa xmm2,xmm3
- movdqa xmm1,xmm4
- punpcklwd xmm0,xmm15
- punpckhwd xmm5,xmm15
- punpckldq xmm2,xmm0
- punpckhdq xmm3,xmm0
- punpckldq xmm1,xmm5
- movdqa xmm0,xmm2
- punpcklqdq xmm0,xmm1
- punpckhdq xmm4,xmm5
- punpckhqdq xmm2,xmm1
- movdqa [rsp+40h],xmm0
- movdqa xmm0,xmm3
- movdqa [rsp+90h],xmm2
- mov eax,[rsp+40h]
- mov [rdi-2],eax
- mov eax, [rsp+90h]
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm3,xmm4
- mov [rsi+rdi-2],eax
- movdqa [rsp+50h],xmm0
- mov eax,[rsp+50h]
- movdqa [rsp+0A0h],xmm3
- mov [rdi+rsi*2-2],eax
- mov eax,[rsp+0A0h]
- mov [r10+rdi-2],eax
- mov eax,[rsp+48h]
- mov [rbx],eax
- mov eax,[rsp+98h]
- mov [rsi+rbx],eax
- mov eax,[rsp+58h]
- mov [rbx+rsi*2],eax
- mov eax, [rsp+0A8h]
- mov [r10+rbx],eax
- mov eax, [rsp+44h]
- mov [r12-2],eax
- mov eax,[rsp+94h]
- mov [rsi+r12-2],eax
- mov eax,[rsp+54h]
- mov [r12+rsi*2-2],eax
- mov eax, [rsp+0A4h]
- mov [r10+r12-2],eax
- mov eax,[rsp+4Ch]
- mov [rbp],eax
- mov eax,[rsp+9Ch]
- mov [rsi+rbp],eax
- mov eax, [rsp+5Ch]
- mov [rbp+rsi*2],eax
- mov eax,[rsp+0ACh]
- mov [r10+rbp],eax
- lea r11,[rsp+170h]
- mov rsp,r11
+ mov word [rsp+4],r8w
+ mov word [rsp+2],r9w
+ pshufd xmm12,xmm0,0
+ mov word [rsp],r9w
+ movd xmm0,eax
+ mov eax,4
+ cwde
+ movdqa xmm14, [rsp]
+ movdqa [rsp],xmm2
+ movdqa xmm2,xmm12
+ punpcklwd xmm0,xmm0
+ pshufd xmm11,xmm0,0
+ psubw xmm10,xmm14
+ movd xmm0,eax
+ movdqa xmm7,xmm14
+ movdqa xmm6,xmm14
+ pcmpgtw xmm7,xmm1
+ punpcklwd xmm0,xmm0
+ pshufd xmm5,xmm0,0
+ movdqa xmm0,xmm4
+ movdqa xmm1,xmm15
+ psubw xmm4,xmm13
+ psubw xmm0,xmm3
+ psubw xmm1,xmm13
+ psubw xmm3,xmm15
+ psllw xmm1,2
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm10
+ psraw xmm1,3
+ pmaxsw xmm0,xmm1
+ pminsw xmm6,xmm0
+ movdqa xmm1,xmm11
+ movdqa xmm0,xmm13
+ psubw xmm0,xmm15
+ pabsw xmm0,xmm0
+ pcmpgtw xmm2,xmm0
+ pabsw xmm0,xmm4
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm3
+ pand xmm2,xmm1
+ movdqa xmm1,xmm11
+ movdqa xmm3,[rsp+30h]
+ pcmpgtw xmm1,xmm0
+ movdqa xmm0,xmm9
+ pand xmm2,xmm1
+ psubw xmm0,xmm8
+ psubw xmm9,xmm3
+ pand xmm2,xmm7
+ pand xmm6,xmm2
+ psubw xmm15,xmm6
+ paddw xmm13,xmm6
+ movdqa xmm2,[rsp]
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm3
+ psubw xmm8,xmm2
+ psllw xmm1,2
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm3
+ movdqa xmm5,[rsp+10h]
+ psubw xmm0,xmm2
+ psraw xmm1,3
+ movdqa xmm4,xmm5
+ pabsw xmm0,xmm0
+ pmaxsw xmm10,xmm1
+ movdqa xmm1,xmm11
+ pcmpgtw xmm12,xmm0
+ pabsw xmm0,xmm9
+ pminsw xmm14,xmm10
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm8
+ pcmpgtw xmm11,xmm0
+ pand xmm12,xmm1
+ movdqa xmm1,[rsp+20h]
+ pand xmm12,xmm11
+ pand xmm12,xmm7
+ pand xmm14,xmm12
+ paddw xmm3,xmm14
+ psubw xmm2,xmm14
+ packuswb xmm13,xmm3
+ packuswb xmm15,xmm2
+ punpcklbw xmm4,xmm13
+ punpckhbw xmm5,xmm13
+ movdqa xmm0,xmm15
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm15,xmm1
+ movdqa xmm3,xmm4
+ punpcklwd xmm3,xmm0
+ punpckhwd xmm4,xmm0
+ movdqa xmm0,xmm5
+ movdqa xmm2,xmm3
+ movdqa xmm1,xmm4
+ punpcklwd xmm0,xmm15
+ punpckhwd xmm5,xmm15
+ punpckldq xmm2,xmm0
+ punpckhdq xmm3,xmm0
+ punpckldq xmm1,xmm5
+ movdqa xmm0,xmm2
+ punpcklqdq xmm0,xmm1
+ punpckhdq xmm4,xmm5
+ punpckhqdq xmm2,xmm1
+ movdqa [rsp+40h],xmm0
+ movdqa xmm0,xmm3
+ movdqa [rsp+90h],xmm2
+ mov eax,[rsp+40h]
+ mov [rdi-2],eax
+ mov eax, [rsp+90h]
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm3,xmm4
+ mov [rsi+rdi-2],eax
+ movdqa [rsp+50h],xmm0
+ mov eax,[rsp+50h]
+ movdqa [rsp+0A0h],xmm3
+ mov [rdi+rsi*2-2],eax
+ mov eax,[rsp+0A0h]
+ mov [r10+rdi-2],eax
+ mov eax,[rsp+48h]
+ mov [rbx],eax
+ mov eax,[rsp+98h]
+ mov [rsi+rbx],eax
+ mov eax,[rsp+58h]
+ mov [rbx+rsi*2],eax
+ mov eax, [rsp+0A8h]
+ mov [r10+rbx],eax
+ mov eax, [rsp+44h]
+ mov [r12-2],eax
+ mov eax,[rsp+94h]
+ mov [rsi+r12-2],eax
+ mov eax,[rsp+54h]
+ mov [r12+rsi*2-2],eax
+ mov eax, [rsp+0A4h]
+ mov [r10+r12-2],eax
+ mov eax,[rsp+4Ch]
+ mov [rbp],eax
+ mov eax,[rsp+9Ch]
+ mov [rsi+rbp],eax
+ mov eax, [rsp+5Ch]
+ mov [rbp+rsi*2],eax
+ mov eax,[rsp+0ACh]
+ mov [r10+rbp],eax
+ lea r11,[rsp+170h]
+ mov rsp,r11
pop r14
pop r13
- pop r12
- pop rbp
- pop rbx
- ret
+ pop r12
+ pop rbp
+ pop rbx
+ ret
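DeblockChromaLt4H_sse2 above follows the same transpose-filter-transpose pattern, but with the weak (bS < 4) filter: the psllw 2 / paddw / psraw 3 sequence forms the delta term, and pmaxsw/pminsw clamp it to the per-edge tc threshold loaded byte-wise from pTC. A hedged scalar model (illustrative names):

    #include <stdint.h>
    #include <stdlib.h>

    static int Clip3(int iLo, int iHi, int iV) {
        return iV < iLo ? iLo : (iV > iHi ? iHi : iV);
    }

    /* Hypothetical scalar model of the chroma "Lt4" (weak) filter step. */
    static void ChromaLt4Sketch(uint8_t *p1, uint8_t *p0, uint8_t *q0, uint8_t *q1,
                                int iAlpha, int iBeta, int iTc) {
        if (abs(*p0 - *q0) < iAlpha && abs(*p1 - *p0) < iBeta && abs(*q1 - *q0) < iBeta) {
            /* pmaxsw/pminsw clamp the delta into [-iTc, iTc];
             * packuswb later saturates the results back to 0..255 */
            int iDelta = Clip3(-iTc, iTc, ((*q0 - *p0) * 4 + (*p1 - *q1) + 4) >> 3);
            *p0 = (uint8_t)Clip3(0, 255, *p0 + iDelta);
            *q0 = (uint8_t)Clip3(0, 255, *q0 - iDelta);
        }
    }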
@@ -5162,7 +5162,7 @@
mov esp, ebp
pop ebp
ret
-
+
%endif
@@ -5178,16 +5178,16 @@
ALIGN 16
DeblockLumaTransposeH2V_sse2:
- push r3
- push r4
+ push r3
+ push r4
push r5
-%assign push_num 3
- LOAD_3_PARA
+%assign push_num 3
+ LOAD_3_PARA
SIGN_EXTENTION r1, r1d
- mov r5, r7
+ mov r5, r7
mov r3, r7
and r3, 0Fh
sub r7, r3
@@ -5229,7 +5229,7 @@
SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r7]
;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-
+
movdqa [r2], xmm4
movdqa [r2 + 10h], xmm2
movdqa [r2 + 20h], xmm3
@@ -5258,17 +5258,17 @@
DeblockLumaTransposeV2H_sse2:
push r3
- push r4
+ push r4
%assign push_num 2
LOAD_3_PARA
- SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r1, r1d
mov r4, r7
- mov r3, r7
+ mov r3, r7
and r3, 0Fh
- sub r7, r3
+ sub r7, r3
sub r7, 10h
movdqa xmm0, [r2]
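The two transpose helpers exist because the SIMD filter kernels operate on rows of pixels: filtering a vertical edge first gathers each pixel column into a register row (H2V), runs the row-oriented filter, then scatters the rows back into columns (V2H). The `mov r3, r7 / and r3, 0Fh / sub r7, r3` preamble simply rounds the scratch pointer down to a 16-byte boundary so movdqa can be used on it. A plain-C sketch of the transpose itself, with illustrative names; the asm performs the equivalent with punpck* merge networks rather than scalar loops:

    #include <stdint.h>

    /* Illustrative byte transpose: destination rows become source columns. */
    static void TransposeSketch(uint8_t *pDst, int iDstStride,
                                const uint8_t *pSrc, int iSrcStride, int iSize) {
        for (int y = 0; y < iSize; ++y)
            for (int x = 0; x < iSize; ++x)
                pDst[x * iDstStride + y] = pSrc[y * iSrcStride + x];
    }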
--- a/codec/common/expand_picture.asm
+++ b/codec/common/expand_picture.asm
@@ -244,7 +244,7 @@
%macro exp_left_right_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a
;r6 [height]
;r0 [pSrc+0] r5[pSrc-32] r1[stride]
- ;r3 [pSrc+(w-1)] r4[pSrc+w]
+ ;r3 [pSrc+(w-1)] r4[pSrc+w]
%if %1 == 32 ; for luma
.left_right_loops:
@@ -375,13 +375,13 @@
%assign push_num 3
LOAD_4_PARA
-
+
SIGN_EXTENTION r1, r1d
SIGN_EXTENTION r2, r2d
SIGN_EXTENTION r3, r3d
;also prepare for cross border pData top-left:xmm3
-
+
movzx r6d,byte[r0]
SSE2_Copy16Times xmm3,r6d ;xmm3: pSrc[0]
@@ -395,22 +395,22 @@
dec r3 ;h-1
imul r3,r1 ;(h-1)*stride
lea r3,[r0+r3] ;pSrc[(h-1)*stride] r3 = src bottom
-
+
mov r6,r1 ;r6 = stride
sal r6,05h ;r6 = 32*stride
lea r4,[r3+r6] ;r4 = dst bottom
-
+
;also prepare for cross border data: bottom-left with xmm5,bottom-right xmm6
-
+
movzx r6d,byte [r3] ;bottom-left
SSE2_Copy16Times xmm5,r6d
-
+
lea r6,[r3+r2-1]
movzx r6d,byte [r6]
SSE2_Copy16Times xmm6,r6d ;bottom-right
-
+
neg r1 ;r1 = -stride
-
+
push r0
push r1
push r2
@@ -419,13 +419,13 @@
; for both left and right border
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
+
pop r2
pop r1
pop r0
lea r5,[r0-32] ;left border dst luma =32 chroma = -16
-
+
lea r3,[r0+r2-1] ;right border src
lea r4,[r3+1] ;right border dst
@@ -432,7 +432,7 @@
;prepare for cross border data: top-right with xmm4
movzx r6d,byte [r3] ;top-right
SSE2_Copy16Times xmm4,r6d
-
+
neg r1 ;r1 = stride
@@ -444,7 +444,7 @@
push r1
push r2
push r6
-
+
exp_left_right_sse2 32,a
pop r6
@@ -455,22 +455,22 @@
; for cross border [top-left, top-right, bottom-left, bottom-right]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; have done xmm3,..,xmm6 cross pData initialization above, perform padding as below, to be continued..
-
+
neg r1 ;r1 = -stride
lea r3,[r0-32]
lea r3,[r3+r1] ;last line of top-left border
-
+
lea r4,[r0+r2] ;psrc +width
lea r4,[r4+r1] ;psrc +width -stride
-
-
+
+
neg r1 ;r1 = stride
add r6,32 ;height +32(16) ,luma = 32, chroma = 16
imul r6,r1
-
+
lea r5,[r3+r6] ;last line of bottom-left border
lea r6,[r4+r6] ;last line of bottom-right border
-
+
neg r1 ; r1 = -stride
; for left & right border expanding
@@ -477,11 +477,11 @@
exp_cross_sse2 32,a
LOAD_4_PARA_POP
-
+
pop r6
pop r5
pop r4
-
+
%assign push_num 0
@@ -495,7 +495,7 @@
; const int32_t iHeight );
;***********************************************************************----------------
ExpandPictureChromaAlign_sse2:
-
+
push r4
push r5
push r6
@@ -508,7 +508,7 @@
SIGN_EXTENTION r3,r3d
;also prepare for cross border pData top-left:xmm3
-
+
movzx r6d,byte [r0]
SSE2_Copy16Times xmm3,r6d ;xmm3: pSrc[0]
@@ -522,24 +522,24 @@
dec r3 ;h-1
imul r3,r1 ;(h-1)*stride
lea r3,[r0+r3] ;pSrc[(h-1)*stride] r3 = src bottom
-
+
mov r6,r1 ;r6 = stride
sal r6,04h ;r6 = 16*stride
- lea r4,[r3+r6] ;r4 = dst bottom
-
+ lea r4,[r3+r6] ;r4 = dst bottom
+
;also prepare for cross border data: bottom-left with xmm5,bottom-right xmm6
-
+
movzx r6d,byte [r3] ;bottom-left
SSE2_Copy16Times xmm5,r6d
-
+
lea r6,[r3+r2-1]
movzx r6d,byte [r6]
SSE2_Copy16Times xmm6,r6d ;bottom-right
-
+
neg r1 ;r1 = -stride
-
+
push r0
- push r1
+ push r1
push r2
exp_top_bottom_sse2 16
@@ -546,20 +546,20 @@
; for both left and right border
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
+
pop r2
pop r1
pop r0
lea r5,[r0-16] ;left border dst luma =32 chroma = -16
-
- lea r3,[r0+r2-1] ;right border src
+
+ lea r3,[r0+r2-1] ;right border src
lea r4,[r3+1] ;right border dst
;prepare for cross border data: top-right with xmm4
movzx r6d,byte [r3] ;top-right
SSE2_Copy16Times xmm4,r6d
-
+
neg r1 ;r1 = stride
@@ -568,7 +568,7 @@
push r0
- push r1
+ push r1
push r2
push r6
exp_left_right_sse2 16,a
@@ -581,22 +581,22 @@
; for cross border [top-left, top-right, bottom-left, bottom-right]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; have done xmm3,..,xmm6 cross pData initialization above, perform padding as below, to be continued..
-
+
neg r1 ;r1 = -stride
lea r3,[r0-16]
lea r3,[r3+r1] ;last line of top-left border
-
+
lea r4,[r0+r2] ;psrc +width
- lea r4,[r4+r1] ;psrc +width -stride
-
-
+ lea r4,[r4+r1] ;psrc +width -stride
+
+
neg r1 ;r1 = stride
add r6,16 ;height +32(16) ,luma = 32, chroma = 16
imul r6,r1
-
+
lea r5,[r3+r6] ;last line of bottom-left border
lea r6,[r4+r6] ;last line of bottom-right border
-
+
neg r1 ; r1 = -stride
; for left & right border expanding
@@ -603,11 +603,11 @@
exp_cross_sse2 16,a
LOAD_4_PARA_POP
-
+
pop r6
pop r5
pop r4
-
+
%assign push_num 0
@@ -633,7 +633,7 @@
SIGN_EXTENTION r3,r3d
;also prepare for cross border pData top-left:xmm3
-
+
movzx r6d,byte [r0]
SSE2_Copy16Times xmm3,r6d ;xmm3: pSrc[0]
@@ -647,24 +647,24 @@
dec r3 ;h-1
imul r3,r1 ;(h-1)*stride
lea r3,[r0+r3] ;pSrc[(h-1)*stride] r3 = src bottom
-
+
mov r6,r1 ;r6 = stride
sal r6,04h ;r6 = 16*stride
- lea r4,[r3+r6] ;r4 = dst bottom
-
+ lea r4,[r3+r6] ;r4 = dst bottom
+
;also prepare for cross border data: bottom-left with xmm5,bottom-right xmm6
-
+
movzx r6d,byte [r3] ;bottom-left
SSE2_Copy16Times xmm5,r6d
-
+
lea r6,[r3+r2-1]
movzx r6d,byte [r6]
SSE2_Copy16Times xmm6,r6d ;bottom-right
-
+
neg r1 ;r1 = -stride
-
+
push r0
- push r1
+ push r1
push r2
exp_top_bottom_sse2 16
@@ -671,20 +671,20 @@
; for both left and right border
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
+
pop r2
pop r1
pop r0
lea r5,[r0-16] ;left border dst luma =32 chroma = -16
-
- lea r3,[r0+r2-1] ;right border src
+
+ lea r3,[r0+r2-1] ;right border src
lea r4,[r3+1] ;right border dst
;prepare for cross border data: top-right with xmm4
movzx r6d,byte [r3] ;top-right
SSE2_Copy16Times xmm4,r6d
-
+
neg r1 ;r1 = stride
@@ -693,7 +693,7 @@
push r0
- push r1
+ push r1
push r2
push r6
exp_left_right_sse2 16,u
@@ -706,22 +706,22 @@
; for cross border [top-left, top-right, bottom-left, bottom-right]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; have done xmm3,..,xmm6 cross pData initialization above, perform padding as below, to be continued..
-
+
neg r1 ;r1 = -stride
lea r3,[r0-16]
lea r3,[r3+r1] ;last line of top-left border
-
+
lea r4,[r0+r2] ;psrc +width
- lea r4,[r4+r1] ;psrc +width -stride
-
-
+ lea r4,[r4+r1] ;psrc +width -stride
+
+
neg r1 ;r1 = stride
add r6,16 ;height +32(16) ,luma = 32, chroma = 16
imul r6,r1
-
+
lea r5,[r3+r6] ;last line of bottom-left border
lea r6,[r4+r6] ;last line of bottom-right border
-
+
neg r1 ; r1 = -stride
; for left & right border expanding
@@ -728,13 +728,12 @@
exp_cross_sse2 16,u
LOAD_4_PARA_POP
-
+
pop r6
pop r5
pop r4
-
+
%assign push_num 0
ret
-
\ No newline at end of file
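All three ExpandPicture* variants in this file share one plan: replicate the top and bottom rows outward, replicate the left and right columns, and fill the four corner blocks from the corner pixels (the values broadcast into xmm3..xmm6 above). A scalar model of the whole operation, assuming the stride already reserves room for the padding; iPad is 32 for luma and 16 for chroma:

    #include <stdint.h>
    #include <string.h>

    /* Hypothetical scalar model of picture border expansion. */
    static void ExpandBorderSketch(uint8_t *pPic, int iStride,
                                   int iWidth, int iHeight, int iPad) {
        for (int y = 0; y < iHeight; ++y) {      /* left/right columns */
            uint8_t *pRow = pPic + y * iStride;
            memset(pRow - iPad, pRow[0], iPad);
            memset(pRow + iWidth, pRow[iWidth - 1], iPad);
        }
        /* top/bottom rows; copying the already-extended rows fills the corners */
        for (int y = 1; y <= iPad; ++y) {
            memcpy(pPic - y * iStride - iPad, pPic - iPad, iWidth + 2 * iPad);
            memcpy(pPic + (iHeight - 1 + y) * iStride - iPad,
                   pPic + (iHeight - 1) * iStride - iPad, iWidth + 2 * iPad);
        }
    }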
--- a/codec/common/mb_copy.asm
+++ b/codec/common/mb_copy.asm
@@ -1,701 +1,701 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* mb_copy.asm
-;*
-;* Abstract
-;* mb_copy and mb_copy1
-;*
-;* History
-;* 15/09/2009 Created
-;* 12/28/2009 Modified with larger throughput
-;* 12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2,
-;* WelsCopy16x8_mmx, WelsCopy8x16_mmx etc;
-;*
-;*
-;*********************************************************************************************/
-%include "asm_inc.asm"
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-WELS_EXTERN WelsCopy16x16_sse2
-WELS_EXTERN WelsCopy16x16NotAligned_sse2
-WELS_EXTERN WelsCopy8x8_mmx
-WELS_EXTERN WelsCopy16x8NotAligned_sse2 ;
-WELS_EXTERN WelsCopy8x16_mmx ;
-WELS_EXTERN UpdateMbMv_sse2 ;
-
-;***********************************************************************
-; void WelsCopy16x16_sse2( uint8_t* Dst,
-; int32_t iStrideD,
-; uint8_t* Src,
-; int32_t iStrideS )
-;***********************************************************************
-ALIGN 16
-WelsCopy16x16_sse2:
-
- push r4
- push r5
- %assign push_num 2
- LOAD_4_PARA
-
- lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
- lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
-
- movdqa xmm0, [r2]
- movdqa xmm1, [r2+r3]
- movdqa xmm2, [r2+2*r3]
- movdqa xmm3, [r2+r5]
- lea r2, [r2+4*r3]
- movdqa xmm4, [r2]
- movdqa xmm5, [r2+r3]
- movdqa xmm6, [r2+2*r3]
- movdqa xmm7, [r2+r5]
- lea r2, [r2+4*r3]
-
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm2
- movdqa [r0+r4], xmm3
- lea r0, [r0+4*r1]
- movdqa [r0], xmm4
- movdqa [r0+r1], xmm5
- movdqa [r0+2*r1], xmm6
- movdqa [r0+r4], xmm7
- lea r0, [r0+4*r1]
-
- movdqa xmm0, [r2]
- movdqa xmm1, [r2+r3]
- movdqa xmm2, [r2+2*r3]
- movdqa xmm3, [r2+r5]
- lea r2, [r2+4*r3]
- movdqa xmm4, [r2]
- movdqa xmm5, [r2+r3]
- movdqa xmm6, [r2+2*r3]
- movdqa xmm7, [r2+r5]
-
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm2
- movdqa [r0+r4], xmm3
- lea r0, [r0+4*r1]
- movdqa [r0], xmm4
- movdqa [r0+r1], xmm5
- movdqa [r0+2*r1], xmm6
- movdqa [r0+r4], xmm7
- LOAD_4_PARA_POP
- pop r5
- pop r4
- ret
-
-;***********************************************************************
-; void WelsCopy16x16NotAligned_sse2( uint8_t* Dst,
-; int32_t iStrideD,
-; uint8_t* Src,
-; int32_t iStrideS )
-;***********************************************************************
-ALIGN 16
-; dst can be aligned to 16 bytes, but not sure about pSrc, 12/29/2011
-WelsCopy16x16NotAligned_sse2:
- ;push esi
- ;push edi
- ;push ebx
-
- ;mov edi, [esp+16] ; Dst
- ;mov eax, [esp+20] ; iStrideD
- ;mov esi, [esp+24] ; Src
- ;mov ecx, [esp+28] ; iStrideS
-
- push r4
- push r5
- %assign push_num 2
- LOAD_4_PARA
-
- lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
- lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
-
- movdqu xmm0, [r2]
- movdqu xmm1, [r2+r3]
- movdqu xmm2, [r2+2*r3]
- movdqu xmm3, [r2+r5]
- lea r2, [r2+4*r3]
- movdqu xmm4, [r2]
- movdqu xmm5, [r2+r3]
- movdqu xmm6, [r2+2*r3]
- movdqu xmm7, [r2+r5]
- lea r2, [r2+4*r3]
-
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm2
- movdqa [r0+r4], xmm3
- lea r0, [r0+4*r1]
- movdqa [r0], xmm4
- movdqa [r0+r1], xmm5
- movdqa [r0+2*r1], xmm6
- movdqa [r0+r4], xmm7
- lea r0, [r0+4*r1]
-
- movdqu xmm0, [r2]
- movdqu xmm1, [r2+r3]
- movdqu xmm2, [r2+2*r3]
- movdqu xmm3, [r2+r5]
- lea r2, [r2+4*r3]
- movdqu xmm4, [r2]
- movdqu xmm5, [r2+r3]
- movdqu xmm6, [r2+2*r3]
- movdqu xmm7, [r2+r5]
-
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm2
- movdqa [r0+r4], xmm3
- lea r0, [r0+4*r1]
- movdqa [r0], xmm4
- movdqa [r0+r1], xmm5
- movdqa [r0+2*r1], xmm6
- movdqa [r0+r4], xmm7
- LOAD_4_PARA_POP
- pop r5
- pop r4
- ret
-
-; , 12/29/2011
-;***********************************************************************
-; void WelsCopy16x8NotAligned_sse2(uint8_t* Dst,
-; int32_t iStrideD,
-; uint8_t* Src,
-; int32_t iStrideS )
-;***********************************************************************
-ALIGN 16
-WelsCopy16x8NotAligned_sse2:
- ;push esi
- ;push edi
- ;push ebx
-
- ;mov edi, [esp+16] ; Dst
- ;mov eax, [esp+20] ; iStrideD
- ;mov esi, [esp+24] ; Src
- ;mov ecx, [esp+28] ; iStrideS
-
- push r4
- push r5
- %assign push_num 2
- LOAD_4_PARA
-
- lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
- lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
-
- movdqu xmm0, [r2]
- movdqu xmm1, [r2+r3]
- movdqu xmm2, [r2+2*r3]
- movdqu xmm3, [r2+r5]
- lea r2, [r2+4*r3]
- movdqu xmm4, [r2]
- movdqu xmm5, [r2+r3]
- movdqu xmm6, [r2+2*r3]
- movdqu xmm7, [r2+r5]
-
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm1
- movdqa [r0+2*r1], xmm2
- movdqa [r0+r4], xmm3
- lea r0, [r0+4*r1]
- movdqa [r0], xmm4
- movdqa [r0+r1], xmm5
- movdqa [r0+2*r1], xmm6
- movdqa [r0+r4], xmm7
- LOAD_4_PARA_POP
- pop r5
- pop r4
- ret
-
-
-;***********************************************************************
-; void WelsCopy8x16_mmx(uint8_t* Dst,
-; int32_t iStrideD,
-; uint8_t* Src,
-; int32_t iStrideS )
-;***********************************************************************
-ALIGN 16
-WelsCopy8x16_mmx:
- ;push ebx
-
- ;mov eax, [esp + 8 ] ;Dst
- ;mov ecx, [esp + 12] ;iStrideD
- ;mov ebx, [esp + 16] ;Src
- ;mov edx, [esp + 20] ;iStrideS
-
- %assign push_num 0
- LOAD_4_PARA
-
- movq mm0, [r2]
- movq mm1, [r2+r3]
- lea r2, [r2+2*r3]
- movq mm2, [r2]
- movq mm3, [r2+r3]
- lea r2, [r2+2*r3]
- movq mm4, [r2]
- movq mm5, [r2+r3]
- lea r2, [r2+2*r3]
- movq mm6, [r2]
- movq mm7, [r2+r3]
- lea r2, [r2+2*r3]
-
- movq [r0], mm0
- movq [r0+r1], mm1
- lea r0, [r0+2*r1]
- movq [r0], mm2
- movq [r0+r1], mm3
- lea r0, [r0+2*r1]
- movq [r0], mm4
- movq [r0+r1], mm5
- lea r0, [r0+2*r1]
- movq [r0], mm6
- movq [r0+r1], mm7
- lea r0, [r0+2*r1]
-
- movq mm0, [r2]
- movq mm1, [r2+r3]
- lea r2, [r2+2*r3]
- movq mm2, [r2]
- movq mm3, [r2+r3]
- lea r2, [r2+2*r3]
- movq mm4, [r2]
- movq mm5, [r2+r3]
- lea r2, [r2+2*r3]
- movq mm6, [r2]
- movq mm7, [r2+r3]
-
- movq [r0], mm0
- movq [r0+r1], mm1
- lea r0, [r0+2*r1]
- movq [r0], mm2
- movq [r0+r1], mm3
- lea r0, [r0+2*r1]
- movq [r0], mm4
- movq [r0+r1], mm5
- lea r0, [r0+2*r1]
- movq [r0], mm6
- movq [r0+r1], mm7
-
- WELSEMMS
- LOAD_4_PARA_POP
- ret
-
-;***********************************************************************
-; void WelsCopy8x8_mmx( uint8_t* Dst,
-; int32_t iStrideD,
-; uint8_t* Src,
-; int32_t iStrideS )
-;***********************************************************************
-ALIGN 16
-WelsCopy8x8_mmx:
- ;push ebx
- ;push esi
- ;mov eax, [esp + 12] ;Dst
- ;mov ecx, [esp + 16] ;iStrideD
- ;mov esi, [esp + 20] ;Src
- ;mov ebx, [esp + 24] ;iStrideS
-
- push r4
- %assign push_num 1
- LOAD_4_PARA
- lea r4, [r3+2*r3] ;edx, [ebx+2*ebx]
-
- ; to prefetch next loop
- prefetchnta [r2+2*r3]
- prefetchnta [r2+r4]
- movq mm0, [r2]
- movq mm1, [r2+r3]
- lea r2, [r2+2*r3]
- ; to prefetch next loop
- prefetchnta [r2+2*r3]
- prefetchnta [r2+r4]
- movq mm2, [r2]
- movq mm3, [r2+r3]
- lea r2, [r2+2*r3]
- ; to prefetch next loop
- prefetchnta [r2+2*r3]
- prefetchnta [r2+r4]
- movq mm4, [r2]
- movq mm5, [r2+r3]
- lea r2, [r2+2*r3]
- movq mm6, [r2]
- movq mm7, [r2+r3]
-
- movq [r0], mm0
- movq [r0+r1], mm1
- lea r0, [r0+2*r1]
- movq [r0], mm2
- movq [r0+r1], mm3
- lea r0, [r0+2*r1]
- movq [r0], mm4
- movq [r0+r1], mm5
- lea r0, [r0+2*r1]
- movq [r0], mm6
- movq [r0+r1], mm7
-
- WELSEMMS
- ;pop esi
- ;pop ebx
- LOAD_4_PARA_POP
- pop r4
- ret
-
-; (dunhuang@cisco), 12/21/2011
-;***********************************************************************
-; void UpdateMbMv_sse2( SMVUnitXY *pMvBuffer, const SMVUnitXY sMv )
-;***********************************************************************
-ALIGN 16
-UpdateMbMv_sse2:
-
- %assign push_num 0
- LOAD_2_PARA
-
- ;mov eax, [esp+4] ; mv_buffer
- ;movd xmm0, [esp+8] ; _mv
- movd xmm0, r1d ; _mv
- pshufd xmm1, xmm0, $0
- movdqa [r0 ], xmm1
- movdqa [r0+0x10], xmm1
- movdqa [r0+0x20], xmm1
- movdqa [r0+0x30], xmm1
- ret
-
-;*******************************************************************************
-; Macros and other preprocessor constants
-;*******************************************************************************
-
-;*******************************************************************************
-; Local Data (Read Only)
-;*******************************************************************************
-
-;SECTION .rodata data align=16
-
-;*******************************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;*******************************************************************************
-
-ALIGN 16
-
-;*******************************************************************************
-; Code
-;*******************************************************************************
-
-SECTION .text
-
-WELS_EXTERN PixelAvgWidthEq4_mmx
-WELS_EXTERN PixelAvgWidthEq8_mmx
-WELS_EXTERN PixelAvgWidthEq16_sse2
-
-WELS_EXTERN McCopyWidthEq4_mmx
-WELS_EXTERN McCopyWidthEq8_mmx
-WELS_EXTERN McCopyWidthEq16_sse2
-
-
-ALIGN 16
-;*******************************************************************************
-; void_t PixelAvgWidthEq4_mmx( uint8_t *pDst, int iDstStride,
-; uint8_t *pSrcA, int iSrcAStride,
-; uint8_t *pSrcB, int iSrcBStride,
-; int iHeight );
-;*******************************************************************************
-PixelAvgWidthEq4_mmx:
-
- %assign push_num 0
- LOAD_7_PARA
-
-%ifndef X86_32
- movsx r1, r1d
- movsx r3, r3d
- movsx r5, r5d
- movsx r6, r6d
-%endif
-
-ALIGN 4
-.height_loop:
- movd mm0, [r4]
- pavgb mm0, [r2]
- movd [r0], mm0
-
- dec r6
- lea r0, [r0+r1]
- lea r2, [r2+r3]
- lea r4, [r4+r5]
- jne .height_loop
-
- WELSEMMS
- LOAD_7_PARA_POP
- ret
-
-
-ALIGN 16
-;*******************************************************************************
-; void_t PixelAvgWidthEq8_mmx( uint8_t *pDst, int iDstStride,
-; uint8_t *pSrcA, int iSrcAStride,
-; uint8_t *pSrcB, int iSrcBStride,
-; int iHeight );
-;*******************************************************************************
-PixelAvgWidthEq8_mmx:
-
- ;push esi
- ;push edi
- ;push ebp
- ;push ebx
-
- ;mov edi, [esp+20] ; pDst
- ;mov eax, [esp+24] ; iDstStride
- ;mov esi, [esp+28] ; pSrcA
- ;mov ecx, [esp+32] ; iSrcAStride
- ;mov ebp, [esp+36] ; pSrcB
- ;mov edx, [esp+40] ; iSrcBStride
- ;mov ebx, [esp+44] ; iHeight
-
- %assign push_num 0
- LOAD_7_PARA
-
-%ifndef X86_32
- movsx r1, r1d
- movsx r3, r3d
- movsx r5, r5d
- movsx r6, r6d
-%endif
-
-ALIGN 4
-.height_loop:
- movq mm0, [r2]
- pavgb mm0, [r4]
- movq [r0], mm0
- movq mm0, [r2+r3]
- pavgb mm0, [r4+r5]
- movq [r0+r1], mm0
-
- lea r2, [r2+2*r3]
- lea r4, [r4+2*r5]
- lea r0, [r0+2*r1]
-
- sub r6, 2
- jnz .height_loop
-
- WELSEMMS
- LOAD_7_PARA_POP
- ret
-
-
-
-ALIGN 16
-;*******************************************************************************
-; void_t PixelAvgWidthEq16_sse2( uint8_t *pDst, int iDstStride,
-; uint8_t *pSrcA, int iSrcAStride,
-; uint8_t *pSrcB, int iSrcBStride,
-; int iHeight );
-;*******************************************************************************
-PixelAvgWidthEq16_sse2:
-
- %assign push_num 0
- LOAD_7_PARA
-%ifndef X86_32
- movsx r1, r1d
- movsx r3, r3d
- movsx r5, r5d
- movsx r6, r6d
-%endif
-ALIGN 4
-.height_loop:
- movdqu xmm0, [r2]
- movdqu xmm1, [r4]
- pavgb xmm0, xmm1
- ;pavgb xmm0, [r4]
- movdqu [r0], xmm0
-
- movdqu xmm0, [r2+r3]
- movdqu xmm1, [r4+r5]
- pavgb xmm0, xmm1
- movdqu [r0+r1], xmm0
-
- movdqu xmm0, [r2+2*r3]
- movdqu xmm1, [r4+2*r5]
- pavgb xmm0, xmm1
- movdqu [r0+2*r1], xmm0
-
- lea r2, [r2+2*r3]
- lea r4, [r4+2*r5]
- lea r0, [r0+2*r1]
-
- movdqu xmm0, [r2+r3]
- movdqu xmm1, [r4+r5]
- pavgb xmm0, xmm1
- movdqu [r0+r1], xmm0
-
- lea r2, [r2+2*r3]
- lea r4, [r4+2*r5]
- lea r0, [r0+2*r1]
-
- sub r6, 4
- jne .height_loop
-
- WELSEMMS
- LOAD_7_PARA_POP
- ret
-
-ALIGN 16
-;*******************************************************************************
-; void_t McCopyWidthEq4_mmx( uint8_t *pSrc, int iSrcStride,
-; uint8_t *pDst, int iDstStride, int iHeight )
-;*******************************************************************************
-McCopyWidthEq4_mmx:
- ;push esi
- ;push edi
- ;push ebx
-
-
- ;mov esi, [esp+16]
- ;mov eax, [esp+20]
- ;mov edi, [esp+24]
- ;mov ecx, [esp+28]
- ;mov edx, [esp+32]
-
- push r5
- %assign push_num 1
- LOAD_5_PARA
-
-%ifndef X86_32
- movsx r1, r1d
- movsx r3, r3d
- movsx r4, r4d
-%endif
-
-ALIGN 4
-.height_loop:
- mov r5d, [r0]
- mov [r2], r5d
-
- add r0, r1
- add r2, r3
- dec r4
- jnz .height_loop
- WELSEMMS
- LOAD_5_PARA_POP
- pop r5
- ret
-
-ALIGN 16
-;*******************************************************************************
-; void_t McCopyWidthEq8_mmx( uint8_t *pSrc, int iSrcStride,
-; uint8_t *pDst, int iDstStride, int iHeight )
-;*******************************************************************************
-McCopyWidthEq8_mmx:
- ;push esi
- ;push edi
- ;mov esi, [esp+12]
- ;mov eax, [esp+16]
- ;mov edi, [esp+20]
- ;mov ecx, [esp+24]
- ;mov edx, [esp+28]
-
- %assign push_num 0
- LOAD_5_PARA
-
-%ifndef X86_32
- movsx r1, r1d
- movsx r3, r3d
- movsx r4, r4d
-%endif
-
-ALIGN 4
-.height_loop:
- movq mm0, [r0]
- movq [r2], mm0
- add r0, r1
- add r2, r3
- dec r4
- jnz .height_loop
-
- WELSEMMS
- LOAD_5_PARA_POP
- ret
-
-
-ALIGN 16
-;*******************************************************************************
-; void_t McCopyWidthEq16_sse2( uint8_t *pSrc, int iSrcStride, uint8_t *pDst, int iDstStride, int iHeight )
-;*******************************************************************************
-;read unaligned memory
-%macro SSE_READ_UNA 2
- movq %1, [%2]
- movhps %1, [%2+8]
-%endmacro
-
-;write unaligned memory
-%macro SSE_WRITE_UNA 2
- movq [%1], %2
- movhps [%1+8], %2
-%endmacro
-McCopyWidthEq16_sse2:
- ;push esi
- ;push edi
-
- ;mov esi, [esp+12] ; pSrc
- ;mov eax, [esp+16] ; iSrcStride
- ;mov edi, [esp+20] ; pDst
- ;mov edx, [esp+24] ; iDstStride
- ;mov ecx, [esp+28] ; iHeight
-
- %assign push_num 0
- LOAD_5_PARA
-%ifndef X86_32
- movsx r1, r1d
- movsx r3, r3d
- movsx r4, r4d
-%endif
-ALIGN 4
-.height_loop:
- SSE_READ_UNA xmm0, r0
- SSE_READ_UNA xmm1, r0+r1
- SSE_WRITE_UNA r2, xmm0
- SSE_WRITE_UNA r2+r3, xmm1
-
- sub r4, 2
- lea r0, [r0+r1*2]
- lea r2, [r2+r3*2]
- jnz .height_loop
-
- LOAD_5_PARA_POP
- ret
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* mb_copy.asm
+;*
+;* Abstract
+;* mb_copy and mb_copy1
+;*
+;* History
+;* 15/09/2009 Created
+;* 12/28/2009 Modified with larger throughput
+;* 12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2,
+;* WelsCopy16x8_mmx, WelsCopy8x16_mmx etc;
+;*
+;*
+;*********************************************************************************************/
+%include "asm_inc.asm"
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+WELS_EXTERN WelsCopy16x16_sse2
+WELS_EXTERN WelsCopy16x16NotAligned_sse2
+WELS_EXTERN WelsCopy8x8_mmx
+WELS_EXTERN WelsCopy16x8NotAligned_sse2 ;
+WELS_EXTERN WelsCopy8x16_mmx ;
+WELS_EXTERN UpdateMbMv_sse2 ;
+
+;***********************************************************************
+; void WelsCopy16x16_sse2( uint8_t* Dst,
+; int32_t iStrideD,
+; uint8_t* Src,
+; int32_t iStrideS )
+;***********************************************************************
+ALIGN 16
+WelsCopy16x16_sse2:
+
+ push r4
+ push r5
+ %assign push_num 2
+ LOAD_4_PARA
+
+ lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
+ lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
+
+ movdqa xmm0, [r2]
+ movdqa xmm1, [r2+r3]
+ movdqa xmm2, [r2+2*r3]
+ movdqa xmm3, [r2+r5]
+ lea r2, [r2+4*r3]
+ movdqa xmm4, [r2]
+ movdqa xmm5, [r2+r3]
+ movdqa xmm6, [r2+2*r3]
+ movdqa xmm7, [r2+r5]
+ lea r2, [r2+4*r3]
+
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm2
+ movdqa [r0+r4], xmm3
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm4
+ movdqa [r0+r1], xmm5
+ movdqa [r0+2*r1], xmm6
+ movdqa [r0+r4], xmm7
+ lea r0, [r0+4*r1]
+
+ movdqa xmm0, [r2]
+ movdqa xmm1, [r2+r3]
+ movdqa xmm2, [r2+2*r3]
+ movdqa xmm3, [r2+r5]
+ lea r2, [r2+4*r3]
+ movdqa xmm4, [r2]
+ movdqa xmm5, [r2+r3]
+ movdqa xmm6, [r2+2*r3]
+ movdqa xmm7, [r2+r5]
+
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm2
+ movdqa [r0+r4], xmm3
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm4
+ movdqa [r0+r1], xmm5
+ movdqa [r0+2*r1], xmm6
+ movdqa [r0+r4], xmm7
+ LOAD_4_PARA_POP
+ pop r5
+ pop r4
+ ret
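Behaviorally, WelsCopy16x16_sse2 is a strided 16x16 byte copy; the unrolled movdqa body just batches eight aligned row loads per iteration. The scalar equivalent is simply:

    #include <stdint.h>
    #include <string.h>

    /* What the routine computes, minus the SIMD scheduling. */
    static void Copy16x16Sketch(uint8_t *pDst, int32_t iStrideD,
                                const uint8_t *pSrc, int32_t iStrideS) {
        for (int y = 0; y < 16; ++y)
            memcpy(pDst + y * iStrideD, pSrc + y * iStrideS, 16);
    }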
+
+;***********************************************************************
+; void WelsCopy16x16NotAligned_sse2( uint8_t* Dst,
+; int32_t iStrideD,
+; uint8_t* Src,
+; int32_t iStrideS )
+;***********************************************************************
+ALIGN 16
+; dst can be aligned to 16 bytes, but not sure about pSrc, 12/29/2011
+WelsCopy16x16NotAligned_sse2:
+ ;push esi
+ ;push edi
+ ;push ebx
+
+ ;mov edi, [esp+16] ; Dst
+ ;mov eax, [esp+20] ; iStrideD
+ ;mov esi, [esp+24] ; Src
+ ;mov ecx, [esp+28] ; iStrideS
+
+ push r4
+ push r5
+ %assign push_num 2
+ LOAD_4_PARA
+
+ lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
+ lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
+
+ movdqu xmm0, [r2]
+ movdqu xmm1, [r2+r3]
+ movdqu xmm2, [r2+2*r3]
+ movdqu xmm3, [r2+r5]
+ lea r2, [r2+4*r3]
+ movdqu xmm4, [r2]
+ movdqu xmm5, [r2+r3]
+ movdqu xmm6, [r2+2*r3]
+ movdqu xmm7, [r2+r5]
+ lea r2, [r2+4*r3]
+
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm2
+ movdqa [r0+r4], xmm3
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm4
+ movdqa [r0+r1], xmm5
+ movdqa [r0+2*r1], xmm6
+ movdqa [r0+r4], xmm7
+ lea r0, [r0+4*r1]
+
+ movdqu xmm0, [r2]
+ movdqu xmm1, [r2+r3]
+ movdqu xmm2, [r2+2*r3]
+ movdqu xmm3, [r2+r5]
+ lea r2, [r2+4*r3]
+ movdqu xmm4, [r2]
+ movdqu xmm5, [r2+r3]
+ movdqu xmm6, [r2+2*r3]
+ movdqu xmm7, [r2+r5]
+
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm2
+ movdqa [r0+r4], xmm3
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm4
+ movdqa [r0+r1], xmm5
+ movdqa [r0+2*r1], xmm6
+ movdqa [r0+r4], xmm7
+ LOAD_4_PARA_POP
+ pop r5
+ pop r4
+ ret
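The NotAligned variant differs only on the load side: per the comment above, the destination is assumed 16-byte aligned while the source may not be, so loads use movdqu and stores stay movdqa. The per-row pattern in intrinsic form (illustrative helper name):

    #include <emmintrin.h>
    #include <stdint.h>

    /* One row: unaligned 16-byte load, aligned 16-byte store. */
    static inline void CopyRow16U(uint8_t *pDst, const uint8_t *pSrc) {
        __m128i v = _mm_loadu_si128((const __m128i *)pSrc); /* movdqu */
        _mm_store_si128((__m128i *)pDst, v);                /* movdqa */
    }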
+
+; , 12/29/2011
+;***********************************************************************
+; void WelsCopy16x8NotAligned_sse2(uint8_t* Dst,
+; int32_t iStrideD,
+; uint8_t* Src,
+; int32_t iStrideS )
+;***********************************************************************
+ALIGN 16
+WelsCopy16x8NotAligned_sse2:
+ ;push esi
+ ;push edi
+ ;push ebx
+
+ ;mov edi, [esp+16] ; Dst
+ ;mov eax, [esp+20] ; iStrideD
+ ;mov esi, [esp+24] ; Src
+ ;mov ecx, [esp+28] ; iStrideS
+
+ push r4
+ push r5
+ %assign push_num 2
+ LOAD_4_PARA
+
+ lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
+ lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
+
+ movdqu xmm0, [r2]
+ movdqu xmm1, [r2+r3]
+ movdqu xmm2, [r2+2*r3]
+ movdqu xmm3, [r2+r5]
+ lea r2, [r2+4*r3]
+ movdqu xmm4, [r2]
+ movdqu xmm5, [r2+r3]
+ movdqu xmm6, [r2+2*r3]
+ movdqu xmm7, [r2+r5]
+
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm2
+ movdqa [r0+r4], xmm3
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm4
+ movdqa [r0+r1], xmm5
+ movdqa [r0+2*r1], xmm6
+ movdqa [r0+r4], xmm7
+ LOAD_4_PARA_POP
+ pop r5
+ pop r4
+ ret
+
+
+;***********************************************************************
+; void WelsCopy8x16_mmx(uint8_t* Dst,
+; int32_t iStrideD,
+; uint8_t* Src,
+; int32_t iStrideS )
+;***********************************************************************
+ALIGN 16
+WelsCopy8x16_mmx:
+ ;push ebx
+
+ ;mov eax, [esp + 8 ] ;Dst
+ ;mov ecx, [esp + 12] ;iStrideD
+ ;mov ebx, [esp + 16] ;Src
+ ;mov edx, [esp + 20] ;iStrideS
+
+ %assign push_num 0
+ LOAD_4_PARA
+
+ movq mm0, [r2]
+ movq mm1, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm2, [r2]
+ movq mm3, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm4, [r2]
+ movq mm5, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm6, [r2]
+ movq mm7, [r2+r3]
+ lea r2, [r2+2*r3]
+
+ movq [r0], mm0
+ movq [r0+r1], mm1
+ lea r0, [r0+2*r1]
+ movq [r0], mm2
+ movq [r0+r1], mm3
+ lea r0, [r0+2*r1]
+ movq [r0], mm4
+ movq [r0+r1], mm5
+ lea r0, [r0+2*r1]
+ movq [r0], mm6
+ movq [r0+r1], mm7
+ lea r0, [r0+2*r1]
+
+ movq mm0, [r2]
+ movq mm1, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm2, [r2]
+ movq mm3, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm4, [r2]
+ movq mm5, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm6, [r2]
+ movq mm7, [r2+r3]
+
+ movq [r0], mm0
+ movq [r0+r1], mm1
+ lea r0, [r0+2*r1]
+ movq [r0], mm2
+ movq [r0+r1], mm3
+ lea r0, [r0+2*r1]
+ movq [r0], mm4
+ movq [r0+r1], mm5
+ lea r0, [r0+2*r1]
+ movq [r0], mm6
+ movq [r0+r1], mm7
+
+ WELSEMMS
+ LOAD_4_PARA_POP
+ ret
+
+;***********************************************************************
+; void WelsCopy8x8_mmx( uint8_t* Dst,
+; int32_t iStrideD,
+; uint8_t* Src,
+; int32_t iStrideS )
+;***********************************************************************
+ALIGN 16
+WelsCopy8x8_mmx:
+ ;push ebx
+ ;push esi
+ ;mov eax, [esp + 12] ;Dst
+ ;mov ecx, [esp + 16] ;iStrideD
+ ;mov esi, [esp + 20] ;Src
+ ;mov ebx, [esp + 24] ;iStrideS
+
+ push r4
+ %assign push_num 1
+ LOAD_4_PARA
+ lea r4, [r3+2*r3] ;edx, [ebx+2*ebx]
+
+ ; to prefetch next loop
+ prefetchnta [r2+2*r3]
+ prefetchnta [r2+r4]
+ movq mm0, [r2]
+ movq mm1, [r2+r3]
+ lea r2, [r2+2*r3]
+ ; to prefetch next loop
+ prefetchnta [r2+2*r3]
+ prefetchnta [r2+r4]
+ movq mm2, [r2]
+ movq mm3, [r2+r3]
+ lea r2, [r2+2*r3]
+ ; to prefetch next loop
+ prefetchnta [r2+2*r3]
+ prefetchnta [r2+r4]
+ movq mm4, [r2]
+ movq mm5, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm6, [r2]
+ movq mm7, [r2+r3]
+
+ movq [r0], mm0
+ movq [r0+r1], mm1
+ lea r0, [r0+2*r1]
+ movq [r0], mm2
+ movq [r0+r1], mm3
+ lea r0, [r0+2*r1]
+ movq [r0], mm4
+ movq [r0+r1], mm5
+ lea r0, [r0+2*r1]
+ movq [r0], mm6
+ movq [r0+r1], mm7
+
+ WELSEMMS
+ ;pop esi
+ ;pop ebx
+ LOAD_4_PARA_POP
+ pop r4
+ ret
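Two details of the MMX copies are easy to miss: prefetchnta requests the next iteration's source lines with a non-temporal hint (minimal cache pollution), and every MMX routine here ends in WELSEMMS, which by all appearances wraps emms, needed because the MMX registers alias the x87 floating-point stack. In intrinsic form (a sketch; the macro's exact definition lives in asm_inc.asm):

    #include <mmintrin.h>
    #include <xmmintrin.h>

    static void MmxEpilogueSketch(const char *pNextSrc) {
        _mm_prefetch(pNextSrc, _MM_HINT_NTA); /* prefetchnta */
        _mm_empty();                          /* emms: release MMX/x87 state */
    }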
+
+; (dunhuang@cisco), 12/21/2011
+;***********************************************************************
+; void UpdateMbMv_sse2( SMVUnitXY *pMvBuffer, const SMVUnitXY sMv )
+;***********************************************************************
+ALIGN 16
+UpdateMbMv_sse2:
+
+ %assign push_num 0
+ LOAD_2_PARA
+
+ ;mov eax, [esp+4] ; mv_buffer
+ ;movd xmm0, [esp+8] ; _mv
+ movd xmm0, r1d ; _mv
+ pshufd xmm1, xmm0, $0
+ movdqa [r0 ], xmm1
+ movdqa [r0+0x10], xmm1
+ movdqa [r0+0x20], xmm1
+ movdqa [r0+0x30], xmm1
+ ret
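UpdateMbMv_sse2 broadcasts one 4-byte SMVUnitXY value across a macroblock's 16 MV slots: movd places the dword in xmm0, pshufd replicates it to all four lanes, and four movdqa stores cover the 64-byte buffer. A scalar sketch, assuming the natural two-int16 layout of SMVUnitXY:

    #include <stdint.h>

    typedef struct { int16_t iMvX, iMvY; } SMVUnitXY; /* assumed 4-byte layout */

    static void UpdateMbMvSketch(SMVUnitXY *pMvBuffer, SMVUnitXY sMv) {
        for (int i = 0; i < 16; ++i)  /* = four 16-byte stores in the asm */
            pMvBuffer[i] = sMv;
    }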
+
+;*******************************************************************************
+; Macros and other preprocessor constants
+;*******************************************************************************
+
+;*******************************************************************************
+; Local Data (Read Only)
+;*******************************************************************************
+
+;SECTION .rodata data align=16
+
+;*******************************************************************************
+; Various memory constants (trigonometric values or rounding values)
+;*******************************************************************************
+
+ALIGN 16
+
+;*******************************************************************************
+; Code
+;*******************************************************************************
+
+SECTION .text
+
+WELS_EXTERN PixelAvgWidthEq4_mmx
+WELS_EXTERN PixelAvgWidthEq8_mmx
+WELS_EXTERN PixelAvgWidthEq16_sse2
+
+WELS_EXTERN McCopyWidthEq4_mmx
+WELS_EXTERN McCopyWidthEq8_mmx
+WELS_EXTERN McCopyWidthEq16_sse2
+
+
+ALIGN 16
+;*******************************************************************************
+; void_t PixelAvgWidthEq4_mmx( uint8_t *pDst, int iDstStride,
+; uint8_t *pSrcA, int iSrcAStride,
+; uint8_t *pSrcB, int iSrcBStride,
+; int iHeight );
+;*******************************************************************************
+PixelAvgWidthEq4_mmx:
+
+ %assign push_num 0
+ LOAD_7_PARA
+
+%ifndef X86_32
+ movsx r1, r1d
+ movsx r3, r3d
+ movsx r5, r5d
+ movsx r6, r6d
+%endif
+
+ALIGN 4
+.height_loop:
+ movd mm0, [r4]
+ pavgb mm0, [r2]
+ movd [r0], mm0
+
+ dec r6
+ lea r0, [r0+r1]
+ lea r2, [r2+r3]
+ lea r4, [r4+r5]
+ jne .height_loop
+
+ WELSEMMS
+ LOAD_7_PARA_POP
+ ret
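The PixelAvgWidthEq* family rides on pavgb, the rounding unsigned-byte average used for H.264 half-sample interpolation; each output pixel is the upward-rounded mean of the two source blocks. Scalar model:

    #include <stdint.h>

    /* pavgb per byte lane: (a + b + 1) >> 1 */
    static void PixelAvgSketch(uint8_t *pDst, int iDstStride,
                               const uint8_t *pSrcA, int iSrcAStride,
                               const uint8_t *pSrcB, int iSrcBStride,
                               int iWidth, int iHeight) {
        for (int y = 0; y < iHeight; ++y)
            for (int x = 0; x < iWidth; ++x)
                pDst[y * iDstStride + x] = (uint8_t)
                    ((pSrcA[y * iSrcAStride + x] + pSrcB[y * iSrcBStride + x] + 1) >> 1);
    }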
+
+
+ALIGN 16
+;*******************************************************************************
+; void_t PixelAvgWidthEq8_mmx( uint8_t *pDst, int iDstStride,
+; uint8_t *pSrcA, int iSrcAStride,
+; uint8_t *pSrcB, int iSrcBStride,
+; int iHeight );
+;*******************************************************************************
+PixelAvgWidthEq8_mmx:
+
+ ;push esi
+ ;push edi
+ ;push ebp
+ ;push ebx
+
+ ;mov edi, [esp+20] ; pDst
+ ;mov eax, [esp+24] ; iDstStride
+ ;mov esi, [esp+28] ; pSrcA
+ ;mov ecx, [esp+32] ; iSrcAStride
+ ;mov ebp, [esp+36] ; pSrcB
+ ;mov edx, [esp+40] ; iSrcBStride
+ ;mov ebx, [esp+44] ; iHeight
+
+ %assign push_num 0
+ LOAD_7_PARA
+
+%ifndef X86_32
+ movsx r1, r1d
+ movsx r3, r3d
+ movsx r5, r5d
+ movsx r6, r6d
+%endif
+
+ALIGN 4
+.height_loop:
+ movq mm0, [r2]
+ pavgb mm0, [r4]
+ movq [r0], mm0
+ movq mm0, [r2+r3]
+ pavgb mm0, [r4+r5]
+ movq [r0+r1], mm0
+
+ lea r2, [r2+2*r3]
+ lea r4, [r4+2*r5]
+ lea r0, [r0+2*r1]
+
+ sub r6, 2
+ jnz .height_loop
+
+ WELSEMMS
+ LOAD_7_PARA_POP
+ ret
+
+
+
+ALIGN 16
+;*******************************************************************************
+; void_t PixelAvgWidthEq16_sse2( uint8_t *pDst, int iDstStride,
+; uint8_t *pSrcA, int iSrcAStride,
+; uint8_t *pSrcB, int iSrcBStride,
+; int iHeight );
+;*******************************************************************************
+PixelAvgWidthEq16_sse2:
+
+ %assign push_num 0
+ LOAD_7_PARA
+%ifndef X86_32
+ movsx r1, r1d
+ movsx r3, r3d
+ movsx r5, r5d
+ movsx r6, r6d
+%endif
+ALIGN 4
+.height_loop:
+ movdqu xmm0, [r2]
+ movdqu xmm1, [r4]
+ pavgb xmm0, xmm1
+ ;pavgb xmm0, [r4]
+ movdqu [r0], xmm0
+
+ movdqu xmm0, [r2+r3]
+ movdqu xmm1, [r4+r5]
+ pavgb xmm0, xmm1
+ movdqu [r0+r1], xmm0
+
+ movdqu xmm0, [r2+2*r3]
+ movdqu xmm1, [r4+2*r5]
+ pavgb xmm0, xmm1
+ movdqu [r0+2*r1], xmm0
+
+ lea r2, [r2+2*r3]
+ lea r4, [r4+2*r5]
+ lea r0, [r0+2*r1]
+
+ movdqu xmm0, [r2+r3]
+ movdqu xmm1, [r4+r5]
+ pavgb xmm0, xmm1
+ movdqu [r0+r1], xmm0
+
+ lea r2, [r2+2*r3]
+ lea r4, [r4+2*r5]
+ lea r0, [r0+2*r1]
+
+ sub r6, 4
+ jne .height_loop
+
+ WELSEMMS
+ LOAD_7_PARA_POP
+ ret
+
+ALIGN 16
+;*******************************************************************************
+; void_t McCopyWidthEq4_mmx( uint8_t *pSrc, int iSrcStride,
+; uint8_t *pDst, int iDstStride, int iHeight )
+;*******************************************************************************
+McCopyWidthEq4_mmx:
+ ;push esi
+ ;push edi
+ ;push ebx
+
+
+ ;mov esi, [esp+16]
+ ;mov eax, [esp+20]
+ ;mov edi, [esp+24]
+ ;mov ecx, [esp+28]
+ ;mov edx, [esp+32]
+
+ push r5
+ %assign push_num 1
+ LOAD_5_PARA
+
+%ifndef X86_32
+ movsx r1, r1d
+ movsx r3, r3d
+ movsx r4, r4d
+%endif
+
+ALIGN 4
+.height_loop:
+ mov r5d, [r0]
+ mov [r2], r5d
+
+ add r0, r1
+ add r2, r3
+ dec r4
+ jnz .height_loop
+ WELSEMMS
+ LOAD_5_PARA_POP
+ pop r5
+ ret
+
+ALIGN 16
+;*******************************************************************************
+; void_t McCopyWidthEq8_mmx( uint8_t *pSrc, int iSrcStride,
+; uint8_t *pDst, int iDstStride, int iHeight )
+;*******************************************************************************
+McCopyWidthEq8_mmx:
+ ;push esi
+ ;push edi
+ ;mov esi, [esp+12]
+ ;mov eax, [esp+16]
+ ;mov edi, [esp+20]
+ ;mov ecx, [esp+24]
+ ;mov edx, [esp+28]
+
+ %assign push_num 0
+ LOAD_5_PARA
+
+%ifndef X86_32
+ movsx r1, r1d
+ movsx r3, r3d
+ movsx r4, r4d
+%endif
+
+ALIGN 4
+.height_loop:
+ movq mm0, [r0]
+ movq [r2], mm0
+ add r0, r1
+ add r2, r3
+ dec r4
+ jnz .height_loop
+
+ WELSEMMS
+ LOAD_5_PARA_POP
+ ret
+
+
+ALIGN 16
+;*******************************************************************************
+; void_t McCopyWidthEq16_sse2( uint8_t *pSrc, int iSrcStride, uint8_t *pDst, int iDstStride, int iHeight )
+;*******************************************************************************
+;read unaligned memory
+%macro SSE_READ_UNA 2
+ movq %1, [%2]
+ movhps %1, [%2+8]
+%endmacro
+
+;write unaligned memory
+%macro SSE_WRITE_UNA 2
+ movq [%1], %2
+ movhps [%1+8], %2
+%endmacro
+McCopyWidthEq16_sse2:
+ ;push esi
+ ;push edi
+
+ ;mov esi, [esp+12] ; pSrc
+ ;mov eax, [esp+16] ; iSrcStride
+ ;mov edi, [esp+20] ; pDst
+ ;mov edx, [esp+24] ; iDstStride
+ ;mov ecx, [esp+28] ; iHeight
+
+ %assign push_num 0
+ LOAD_5_PARA
+%ifndef X86_32
+ movsx r1, r1d
+ movsx r3, r3d
+ movsx r4, r4d
+%endif
+ALIGN 4
+.height_loop:
+ SSE_READ_UNA xmm0, r0
+ SSE_READ_UNA xmm1, r0+r1
+ SSE_WRITE_UNA r2, xmm0
+ SSE_WRITE_UNA r2+r3, xmm1
+
+ sub r4, 2
+ lea r0, [r0+r1*2]
+ lea r2, [r2+r3*2]
+ jnz .height_loop
+
+ LOAD_5_PARA_POP
+ ret
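The SSE_READ_UNA/SSE_WRITE_UNA macros used above split each unaligned 16-byte access into two 8-byte halves (movq for the low quadword, movhps for the high one), a pattern that predates reliably fast movdqu on unaligned addresses. The C equivalent of one 16-byte move:

    #include <stdint.h>
    #include <string.h>

    /* Two-half unaligned 16-byte copy, mirroring the movq/movhps pairing. */
    static inline void Copy16Unaligned(uint8_t *pDst, const uint8_t *pSrc) {
        uint64_t uiLo, uiHi;
        memcpy(&uiLo, pSrc, 8);       /* movq   xmmN, [src]   */
        memcpy(&uiHi, pSrc + 8, 8);   /* movhps xmmN, [src+8] */
        memcpy(pDst, &uiLo, 8);       /* movq   [dst], xmmN   */
        memcpy(pDst + 8, &uiHi, 8);   /* movhps [dst+8], xmmN */
    }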
--- a/codec/common/mc_chroma.asm
+++ b/codec/common/mc_chroma.asm
@@ -1,345 +1,345 @@
-;*!
-;* \copy
-;* Copyright (c) 2004-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* mc_chroma.asm
-;*
-;* Abstract
-;* mmx motion compensation for chroma
-;*
-;* History
-;* 10/13/2004 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-SECTION .rodata align=16
-
-;***********************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;***********************************************************************
-
-ALIGN 16
-h264_d0x20_sse2:
- dw 32,32,32,32,32,32,32,32
-ALIGN 16
-h264_d0x20_mmx:
- dw 32,32,32,32
-
-
-;=============================================================================
-; Code
-;=============================================================================
-
-SECTION .text
-
-ALIGN 16
-;*******************************************************************************
-; void McChromaWidthEq4_mmx( uint8_t *src,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
-; uint8_t *pABCD,
-; int32_t iHeigh );
-;*******************************************************************************
-WELS_EXTERN McChromaWidthEq4_mmx
-McChromaWidthEq4_mmx:
- ;push esi
- ;push edi
- ;push ebx
-
- %assign push_num 0
- LOAD_6_PARA
-%ifndef X86_32
- movsx r1, r1d
- movsx r3, r3d
- movsx r5, r5d
-%endif
-
- ;mov eax, [esp +12 + 20]
-
- movd mm3, [r4]; [eax]
- WELS_Zero mm7
- punpcklbw mm3, mm3
- movq mm4, mm3
- punpcklwd mm3, mm3
- punpckhwd mm4, mm4
-
- movq mm5, mm3
- punpcklbw mm3, mm7
- punpckhbw mm5, mm7
-
- movq mm6, mm4
- punpcklbw mm4, mm7
- punpckhbw mm6, mm7
-
- ;mov esi, [esp +12+ 4]
- ;mov eax, [esp + 12 + 8]
- ;mov edi, [esp + 12 + 12]
- ;mov edx, [esp + 12 + 16]
- ;mov ecx, [esp + 12 + 24]
-
- lea r4, [r0 + r1] ;lea ebx, [esi + eax]
- movd mm0, [r0]
- movd mm1, [r0+1]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
-.xloop:
-
- pmullw mm0, mm3
- pmullw mm1, mm5
- paddw mm0, mm1
-
- movd mm1, [r4]
- punpcklbw mm1, mm7
- movq mm2, mm1
- pmullw mm1, mm4
- paddw mm0, mm1
-
- movd mm1, [r4+1]
- punpcklbw mm1, mm7
- movq mm7, mm1
- pmullw mm1,mm6
- paddw mm0, mm1
- movq mm1,mm7
-
- paddw mm0, [h264_d0x20_mmx]
- psrlw mm0, 6
-
- WELS_Zero mm7
- packuswb mm0, mm7
- movd [r2], mm0
-
- movq mm0, mm2
-
- lea r2, [r2 + r3]
- lea r4, [r4 + r1]
-
- dec r5
- jnz near .xloop
- WELSEMMS
- LOAD_6_PARA_POP
- ;pop ebx
- ;pop edi
- ;pop esi
- ret
-
-
-ALIGN 16
-;*******************************************************************************
-; void McChromaWidthEq8_sse2( uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
-; uint8_t *pABCD,
-; int32_t iheigh );
-;*******************************************************************************
-WELS_EXTERN McChromaWidthEq8_sse2
-McChromaWidthEq8_sse2:
- ;push esi
- ;push edi
- ;push ebx
-
- %assign push_num 0
- LOAD_6_PARA
-%ifndef X86_32
- movsx r1, r1d
- movsx r3, r3d
- movsx r5, r5d
-%endif
-
- ;mov eax, [esp +12 + 20]
- movd xmm3, [r4]
- WELS_Zero xmm7
- punpcklbw xmm3, xmm3
- punpcklwd xmm3, xmm3
-
- movdqa xmm4, xmm3
- punpckldq xmm3, xmm3
- punpckhdq xmm4, xmm4
- movdqa xmm5, xmm3
- movdqa xmm6, xmm4
-
- punpcklbw xmm3, xmm7
- punpckhbw xmm5, xmm7
- punpcklbw xmm4, xmm7
- punpckhbw xmm6, xmm7
-
- ;mov esi, [esp +12+ 4]
- ;mov eax, [esp + 12 + 8]
- ;mov edi, [esp + 12 + 12]
- ;mov edx, [esp + 12 + 16]
- ;mov ecx, [esp + 12 + 24]
-
- lea r4, [r0 + r1] ;lea ebx, [esi + eax]
- movq xmm0, [r0]
- movq xmm1, [r0+1]
- punpcklbw xmm0, xmm7
- punpcklbw xmm1, xmm7
-.xloop:
-
- pmullw xmm0, xmm3
- pmullw xmm1, xmm5
- paddw xmm0, xmm1
-
- movq xmm1, [r4]
- punpcklbw xmm1, xmm7
- movdqa xmm2, xmm1
- pmullw xmm1, xmm4
- paddw xmm0, xmm1
-
- movq xmm1, [r4+1]
- punpcklbw xmm1, xmm7
- movdqa xmm7, xmm1
- pmullw xmm1, xmm6
- paddw xmm0, xmm1
- movdqa xmm1,xmm7
-
- paddw xmm0, [h264_d0x20_sse2]
- psrlw xmm0, 6
-
- WELS_Zero xmm7
- packuswb xmm0, xmm7
- movq [r2], xmm0
-
- movdqa xmm0, xmm2
-
- lea r2, [r2 + r3]
- lea r4, [r4 + r1]
-
- dec r5
- jnz near .xloop
-
- LOAD_6_PARA_POP
-
- ;pop ebx
- ;pop edi
- ;pop esi
- ret
-
-
-
-
-ALIGN 16
-;***********************************************************************
-; void McChromaWidthEq8_ssse3( uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
-; uint8_t *pABCD,
-; int32_t iHeigh);
-;***********************************************************************
-WELS_EXTERN McChromaWidthEq8_ssse3
-McChromaWidthEq8_ssse3:
- ;push ebx
- ;push esi
- ;push edi
- %assign push_num 0
- LOAD_6_PARA
-%ifndef X86_32
- movsx r1, r1d
- movsx r3, r3d
- movsx r5, r5d
-%endif
-
- ;mov eax, [esp + 12 + 20]
-
- pxor xmm7, xmm7
- movd xmm5, [r4]
- punpcklwd xmm5, xmm5
- punpckldq xmm5, xmm5
- movdqa xmm6, xmm5
- punpcklqdq xmm5, xmm5
- punpckhqdq xmm6, xmm6
-
- ;mov eax, [esp + 12 + 4]
- ;mov edx, [esp + 12 + 8]
- ;mov esi, [esp + 12 + 12]
- ;mov edi, [esp + 12 + 16]
- ;mov ecx, [esp + 12 + 24]
-
- sub r2, r3 ;sub esi, edi
- sub r2, r3
- movdqa xmm7, [h264_d0x20_sse2]
-
- movdqu xmm0, [r0]
- movdqa xmm1, xmm0
- psrldq xmm1, 1
- punpcklbw xmm0, xmm1
-
-.hloop_chroma:
- lea r2, [r2+2*r3]
-
- movdqu xmm2, [r0+r1]
- movdqa xmm3, xmm2
- psrldq xmm3, 1
- punpcklbw xmm2, xmm3
- movdqa xmm4, xmm2
-
- pmaddubsw xmm0, xmm5
- pmaddubsw xmm2, xmm6
- paddw xmm0, xmm2
- paddw xmm0, xmm7
- psrlw xmm0, 6
- packuswb xmm0, xmm0
- movq [r2],xmm0
-
- lea r0, [r0+2*r1]
- movdqu xmm2, [r0]
- movdqa xmm3, xmm2
- psrldq xmm3, 1
- punpcklbw xmm2, xmm3
- movdqa xmm0, xmm2
-
- pmaddubsw xmm4, xmm5
- pmaddubsw xmm2, xmm6
- paddw xmm4, xmm2
- paddw xmm4, xmm7
- psrlw xmm4, 6
- packuswb xmm4, xmm4
- movq [r2+r3],xmm4
-
- sub r5, 2
- jnz .hloop_chroma
-
- LOAD_6_PARA_POP
-
- ;pop edi
- ;pop esi
- ;pop ebx
-
- ret
-
-
+;*!
+;* \copy
+;* Copyright (c) 2004-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* mc_chroma.asm
+;*
+;* Abstract
+;* mmx motion compensation for chroma
+;*
+;* History
+;* 10/13/2004 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+SECTION .rodata align=16
+
+;***********************************************************************
+; Various memory constants (rounding values for the chroma filters)
+;***********************************************************************
+
+ALIGN 16
+h264_d0x20_sse2:
+ dw 32,32,32,32,32,32,32,32
+ALIGN 16
+h264_d0x20_mmx:
+ dw 32,32,32,32
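+
+; Both tables hold the rounding offset 32 (0x20) that the routines below
+; add before their final ">> 6", i.e. the "+ 32" in (sum + 32) >> 6.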
+
+
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text
+
+ALIGN 16
+;*******************************************************************************
+; void McChromaWidthEq4_mmx( uint8_t *src,
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
+; uint8_t *pABCD,
+;                          int32_t iHeight );
+;*******************************************************************************
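+; Computes one 4-pixel row per iteration of the H.264 bilinear chroma
+; filter, with the four byte weights loaded from pABCD:
+;   pDst[x] = ( A*pSrc[x]             + B*pSrc[x+1]
+;             + C*pSrc[x+iSrcStride]  + D*pSrc[x+iSrcStride+1] + 32 ) >> 6
+; where the caller presumably precomputes A = (8-dx)*(8-dy),
+; B = dx*(8-dy), C = (8-dx)*dy, D = dx*dy from the fractional MV (dx,dy).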
+WELS_EXTERN McChromaWidthEq4_mmx
+McChromaWidthEq4_mmx:
+ ;push esi
+ ;push edi
+ ;push ebx
+
+ %assign push_num 0
+ LOAD_6_PARA
+%ifndef X86_32
+ movsx r1, r1d
+ movsx r3, r3d
+ movsx r5, r5d
+%endif
+
+ ;mov eax, [esp +12 + 20]
+
+ movd mm3, [r4]; [eax]
+ WELS_Zero mm7
+ punpcklbw mm3, mm3
+ movq mm4, mm3
+ punpcklwd mm3, mm3
+ punpckhwd mm4, mm4
+
+ movq mm5, mm3
+ punpcklbw mm3, mm7
+ punpckhbw mm5, mm7
+
+ movq mm6, mm4
+ punpcklbw mm4, mm7
+ punpckhbw mm6, mm7
+
+ ;mov esi, [esp +12+ 4]
+ ;mov eax, [esp + 12 + 8]
+ ;mov edi, [esp + 12 + 12]
+ ;mov edx, [esp + 12 + 16]
+ ;mov ecx, [esp + 12 + 24]
+
+ lea r4, [r0 + r1] ;lea ebx, [esi + eax]
+ movd mm0, [r0]
+ movd mm1, [r0+1]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+.xloop:
+
+ pmullw mm0, mm3
+ pmullw mm1, mm5
+ paddw mm0, mm1
+
+ movd mm1, [r4]
+ punpcklbw mm1, mm7
+ movq mm2, mm1
+ pmullw mm1, mm4
+ paddw mm0, mm1
+
+ movd mm1, [r4+1]
+ punpcklbw mm1, mm7
+ movq mm7, mm1
+ pmullw mm1,mm6
+ paddw mm0, mm1
+ movq mm1,mm7
+
+ paddw mm0, [h264_d0x20_mmx]
+ psrlw mm0, 6
+
+ WELS_Zero mm7
+ packuswb mm0, mm7
+ movd [r2], mm0
+
+ movq mm0, mm2
+
+ lea r2, [r2 + r3]
+ lea r4, [r4 + r1]
+
+ dec r5
+ jnz near .xloop
+ WELSEMMS
+ LOAD_6_PARA_POP
+ ;pop ebx
+ ;pop edi
+ ;pop esi
+ ret
+
+
+ALIGN 16
+;*******************************************************************************
+; void McChromaWidthEq8_sse2( uint8_t *pSrc,
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
+; uint8_t *pABCD,
+;                          int32_t iHeight );
+;*******************************************************************************
+WELS_EXTERN McChromaWidthEq8_sse2
+McChromaWidthEq8_sse2:
+ ;push esi
+ ;push edi
+ ;push ebx
+
+ %assign push_num 0
+ LOAD_6_PARA
+%ifndef X86_32
+ movsx r1, r1d
+ movsx r3, r3d
+ movsx r5, r5d
+%endif
+
+ ;mov eax, [esp +12 + 20]
+ movd xmm3, [r4]
+ WELS_Zero xmm7
+ punpcklbw xmm3, xmm3
+ punpcklwd xmm3, xmm3
+
+ movdqa xmm4, xmm3
+ punpckldq xmm3, xmm3
+ punpckhdq xmm4, xmm4
+ movdqa xmm5, xmm3
+ movdqa xmm6, xmm4
+
+ punpcklbw xmm3, xmm7
+ punpckhbw xmm5, xmm7
+ punpcklbw xmm4, xmm7
+ punpckhbw xmm6, xmm7
+
+ ;mov esi, [esp +12+ 4]
+ ;mov eax, [esp + 12 + 8]
+ ;mov edi, [esp + 12 + 12]
+ ;mov edx, [esp + 12 + 16]
+ ;mov ecx, [esp + 12 + 24]
+
+ lea r4, [r0 + r1] ;lea ebx, [esi + eax]
+ movq xmm0, [r0]
+ movq xmm1, [r0+1]
+ punpcklbw xmm0, xmm7
+ punpcklbw xmm1, xmm7
+.xloop:
+
+ pmullw xmm0, xmm3
+ pmullw xmm1, xmm5
+ paddw xmm0, xmm1
+
+ movq xmm1, [r4]
+ punpcklbw xmm1, xmm7
+ movdqa xmm2, xmm1
+ pmullw xmm1, xmm4
+ paddw xmm0, xmm1
+
+ movq xmm1, [r4+1]
+ punpcklbw xmm1, xmm7
+ movdqa xmm7, xmm1
+ pmullw xmm1, xmm6
+ paddw xmm0, xmm1
+ movdqa xmm1,xmm7
+
+ paddw xmm0, [h264_d0x20_sse2]
+ psrlw xmm0, 6
+
+ WELS_Zero xmm7
+ packuswb xmm0, xmm7
+ movq [r2], xmm0
+
+ movdqa xmm0, xmm2
+
+ lea r2, [r2 + r3]
+ lea r4, [r4 + r1]
+
+ dec r5
+ jnz near .xloop
+
+ LOAD_6_PARA_POP
+
+ ;pop ebx
+ ;pop edi
+ ;pop esi
+ ret
+
+
+
+
+ALIGN 16
+;***********************************************************************
+; void McChromaWidthEq8_ssse3( uint8_t *pSrc,
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
+; uint8_t *pABCD,
+;                          int32_t iHeight );
+;***********************************************************************
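+; Same filter as the SSE2 version, but each row's two taps are folded
+; into a single pmaddubsw: the source bytes are interleaved into
+; (s[x], s[x+1]) pairs and multiplied by the interleaved (A,B) weights
+; in xmm5 (or (C,D) in xmm6), yielding A*s[x] + B*s[x+1] per 16-bit lane.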
+WELS_EXTERN McChromaWidthEq8_ssse3
+McChromaWidthEq8_ssse3:
+ ;push ebx
+ ;push esi
+ ;push edi
+ %assign push_num 0
+ LOAD_6_PARA
+%ifndef X86_32
+ movsx r1, r1d
+ movsx r3, r3d
+ movsx r5, r5d
+%endif
+
+ ;mov eax, [esp + 12 + 20]
+
+ pxor xmm7, xmm7
+ movd xmm5, [r4]
+ punpcklwd xmm5, xmm5
+ punpckldq xmm5, xmm5
+ movdqa xmm6, xmm5
+ punpcklqdq xmm5, xmm5
+ punpckhqdq xmm6, xmm6
+
+ ;mov eax, [esp + 12 + 4]
+ ;mov edx, [esp + 12 + 8]
+ ;mov esi, [esp + 12 + 12]
+ ;mov edi, [esp + 12 + 16]
+ ;mov ecx, [esp + 12 + 24]
+
+ sub r2, r3 ;sub esi, edi
+ sub r2, r3
+ movdqa xmm7, [h264_d0x20_sse2]
+
+ movdqu xmm0, [r0]
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+ punpcklbw xmm0, xmm1
+
+.hloop_chroma:
+ lea r2, [r2+2*r3]
+
+ movdqu xmm2, [r0+r1]
+ movdqa xmm3, xmm2
+ psrldq xmm3, 1
+ punpcklbw xmm2, xmm3
+ movdqa xmm4, xmm2
+
+ pmaddubsw xmm0, xmm5
+ pmaddubsw xmm2, xmm6
+ paddw xmm0, xmm2
+ paddw xmm0, xmm7
+ psrlw xmm0, 6
+ packuswb xmm0, xmm0
+ movq [r2],xmm0
+
+ lea r0, [r0+2*r1]
+ movdqu xmm2, [r0]
+ movdqa xmm3, xmm2
+ psrldq xmm3, 1
+ punpcklbw xmm2, xmm3
+ movdqa xmm0, xmm2
+
+ pmaddubsw xmm4, xmm5
+ pmaddubsw xmm2, xmm6
+ paddw xmm4, xmm2
+ paddw xmm4, xmm7
+ psrlw xmm4, 6
+ packuswb xmm4, xmm4
+ movq [r2+r3],xmm4
+
+ sub r5, 2
+ jnz .hloop_chroma
+
+ LOAD_6_PARA_POP
+
+ ;pop edi
+ ;pop esi
+ ;pop ebx
+
+ ret
+
+
--- a/codec/common/mc_luma.asm
+++ b/codec/common/mc_luma.asm
@@ -1,1293 +1,1293 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* mc_luma.asm
-;*
-;* Abstract
-;* sse2 motion compensation
-;*
-;* History
-;* 17/08/2009 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-;*******************************************************************************
-; Local Data (Read Only)
-;*******************************************************************************
-%ifdef FORMAT_COFF
-SECTION .rodata pData
-%else
-SECTION .rodata align=16
-%endif
-
-;*******************************************************************************
-; Various memory constants (rounding values for the interpolation filters)
-;*******************************************************************************
-
-ALIGN 16
-h264_w0x10:
- dw 16, 16, 16, 16
-ALIGN 16
-h264_w0x10_1:
- dw 16, 16, 16, 16, 16, 16, 16, 16
-ALIGN 16
-h264_mc_hc_32:
- dw 32, 32, 32, 32, 32, 32, 32, 32
-
-
-;*******************************************************************************
-; Code
-;*******************************************************************************
-
-SECTION .text
-
-WELS_EXTERN McHorVer20WidthEq4_mmx
-
-
-ALIGN 16
-;*******************************************************************************
-; void_t McHorVer20WidthEq4_mmx( uint8_t *pSrc,
-; int iSrcStride,
-; uint8_t *pDst,
-; int iDstStride,
-; int iHeight)
-;*******************************************************************************
-McHorVer20WidthEq4_mmx:
- ;push esi
- ;push edi
-
- ;mov esi, [esp+12]
- ;mov eax, [esp+16]
- ;mov edi, [esp+20]
- ;mov ecx, [esp+24]
- ;mov edx, [esp+28]
-
- %assign push_num 0
- LOAD_5_PARA
-%ifndef X86_32
- movsx r1, r1d
- movsx r3, r3d
- movsx r4, r4d
-%endif
-
- sub r0, 2
- WELS_Zero mm7
- movq mm6, [h264_w0x10]
-.height_loop:
- movd mm0, [r0]
- punpcklbw mm0, mm7
- movd mm1, [r0+5]
- punpcklbw mm1, mm7
- movd mm2, [r0+1]
- punpcklbw mm2, mm7
- movd mm3, [r0+4]
- punpcklbw mm3, mm7
- movd mm4, [r0+2]
- punpcklbw mm4, mm7
- movd mm5, [r0+3]
- punpcklbw mm5, mm7
-
- paddw mm2, mm3
- paddw mm4, mm5
- psllw mm4, 2
- psubw mm4, mm2
- paddw mm0, mm1
- paddw mm0, mm4
- psllw mm4, 2
- paddw mm0, mm4
- paddw mm0, mm6
- psraw mm0, 5
- packuswb mm0, mm7
- movd [r2], mm0
-
- add r0, r1
- add r2, r3
- dec r4
- jnz .height_loop
-
- WELSEMMS
- LOAD_5_PARA_POP
- ret
-
-;*******************************************************************************
-; Macros and other preprocessor constants
-;*******************************************************************************
-
-
-%macro SSE_LOAD_8P 3
- movq %1, %3
- punpcklbw %1, %2
-%endmacro
-
-%macro FILTER_HV_W8 9
- paddw %1, %6
- movdqa %8, %3
- movdqa %7, %2
- paddw %1, [h264_w0x10_1]
- paddw %8, %4
- paddw %7, %5
- psllw %8, 2
- psubw %8, %7
- paddw %1, %8
- psllw %8, 2
- paddw %1, %8
- psraw %1, 5
- WELS_Zero %8
- packuswb %1, %8
- movq %9, %1
-%endmacro
-
-;*******************************************************************************
-; Code
-;*******************************************************************************
-
-SECTION .text
-WELS_EXTERN McHorVer22Width8HorFirst_sse2
-WELS_EXTERN McHorVer02WidthEq8_sse2
-WELS_EXTERN McHorVer20WidthEq8_sse2
-WELS_EXTERN McHorVer20WidthEq16_sse2
-
-ALIGN 16
-;***********************************************************************
-; void_t McHorVer22Width8HorFirst_sse2(int16_t *pSrc,
-; int16_t iSrcStride,
-; uint8_t *pDst,
-;                       int32_t iDstStride,
-;                       int32_t iHeight );
-;***********************************************************************
-McHorVer22Width8HorFirst_sse2:
- ;push esi
- ;push edi
- ;push ebx
- ;mov esi, [esp+16] ;pSrc
- ;mov eax, [esp+20] ;iSrcStride
- ;mov edi, [esp+24] ;pDst
- ;mov edx, [esp+28] ;iDstStride
- ;mov ebx, [esp+32] ;iHeight
-
- %assign push_num 0
- LOAD_5_PARA
-%ifndef X86_32
- movsx r1, r1d
- movsx r3, r3d
- movsx r4, r4d
-%endif
- pxor xmm7, xmm7
-
-    sub r0, r1      ; the later vertical pass needs 5 extra rows, so start 2 rows up.
- sub r0, r1
-
-.yloop_width_8:
- movq xmm0, [r0]
- punpcklbw xmm0, xmm7
- movq xmm1, [r0+5]
- punpcklbw xmm1, xmm7
- movq xmm2, [r0+1]
- punpcklbw xmm2, xmm7
- movq xmm3, [r0+4]
- punpcklbw xmm3, xmm7
- movq xmm4, [r0+2]
- punpcklbw xmm4, xmm7
- movq xmm5, [r0+3]
- punpcklbw xmm5, xmm7
-
- paddw xmm2, xmm3
- paddw xmm4, xmm5
- psllw xmm4, 2
- psubw xmm4, xmm2
- paddw xmm0, xmm1
- paddw xmm0, xmm4
- psllw xmm4, 2
- paddw xmm0, xmm4
- movdqa [r2], xmm0
-
- add r0, r1
- add r2, r3
- dec r4
- jnz .yloop_width_8
- LOAD_5_PARA_POP
- ret
-
-ALIGN 16
-;*******************************************************************************
-; void_t McHorVer20WidthEq8_sse2( uint8_t *pSrc,
-; int iSrcStride,
-; uint8_t *pDst,
-; int iDstStride,
-;                          int iHeight );
-;*******************************************************************************
-McHorVer20WidthEq8_sse2:
- ;push esi
- ;push edi
-
- ;mov esi, [esp + 12] ;pSrc
- ;mov eax, [esp + 16] ;iSrcStride
- ;mov edi, [esp + 20] ;pDst
- ;mov ecx, [esp + 28] ;iHeight
- ;mov edx, [esp + 24] ;iDstStride
-
- %assign push_num 0
- LOAD_5_PARA
-%ifndef X86_32
- movsx r1, r1d
- movsx r3, r3d
- movsx r4, r4d
-%endif
- lea r0, [r0-2] ;pSrc -= 2;
-
- pxor xmm7, xmm7
- movdqa xmm6, [h264_w0x10_1]
-.y_loop:
- movq xmm0, [r0]
- punpcklbw xmm0, xmm7
- movq xmm1, [r0+5]
- punpcklbw xmm1, xmm7
- movq xmm2, [r0+1]
- punpcklbw xmm2, xmm7
- movq xmm3, [r0+4]
- punpcklbw xmm3, xmm7
- movq xmm4, [r0+2]
- punpcklbw xmm4, xmm7
- movq xmm5, [r0+3]
- punpcklbw xmm5, xmm7
-
- paddw xmm2, xmm3
- paddw xmm4, xmm5
- psllw xmm4, 2
- psubw xmm4, xmm2
- paddw xmm0, xmm1
- paddw xmm0, xmm4
- psllw xmm4, 2
- paddw xmm0, xmm4
- paddw xmm0, xmm6
- psraw xmm0, 5
-
- packuswb xmm0, xmm7
- movq [r2], xmm0
-
- lea r2, [r2+r3]
- lea r0, [r0+r1]
- dec r4
- jnz near .y_loop
-
- LOAD_5_PARA_POP
- ret
-
-ALIGN 16
-;*******************************************************************************
-; void_t McHorVer20WidthEq16_sse2( uint8_t *pSrc,
-; int iSrcStride,
-; uint8_t *pDst,
-; int iDstStride,
-;                          int iHeight );
-;*******************************************************************************
-McHorVer20WidthEq16_sse2:
- ;push esi
- ;push edi
- ;mov esi, [esp + 12] ;pSrc
- ;mov eax, [esp + 16] ;iSrcStride
- ;mov edi, [esp + 20] ;pDst
- ;mov ecx, [esp + 28] ;iHeight
- ;mov edx, [esp + 24] ;iDstStride
-
- %assign push_num 0
- LOAD_5_PARA
-%ifndef X86_32
- movsx r1, r1d
- movsx r3, r3d
- movsx r4, r4d
-%endif
- lea r0, [r0-2] ;pSrc -= 2;
-
- pxor xmm7, xmm7
- movdqa xmm6, [h264_w0x10_1]
-.y_loop:
-
- movq xmm0, [r0]
- punpcklbw xmm0, xmm7
- movq xmm1, [r0+5]
- punpcklbw xmm1, xmm7
- movq xmm2, [r0+1]
- punpcklbw xmm2, xmm7
- movq xmm3, [r0+4]
- punpcklbw xmm3, xmm7
- movq xmm4, [r0+2]
- punpcklbw xmm4, xmm7
- movq xmm5, [r0+3]
- punpcklbw xmm5, xmm7
-
- paddw xmm2, xmm3
- paddw xmm4, xmm5
- psllw xmm4, 2
- psubw xmm4, xmm2
- paddw xmm0, xmm1
- paddw xmm0, xmm4
- psllw xmm4, 2
- paddw xmm0, xmm4
- paddw xmm0, xmm6
- psraw xmm0, 5
- packuswb xmm0, xmm7
- movq [r2], xmm0
-
- movq xmm0, [r0+8]
- punpcklbw xmm0, xmm7
- movq xmm1, [r0+5+8]
- punpcklbw xmm1, xmm7
- movq xmm2, [r0+1+8]
- punpcklbw xmm2, xmm7
- movq xmm3, [r0+4+8]
- punpcklbw xmm3, xmm7
- movq xmm4, [r0+2+8]
- punpcklbw xmm4, xmm7
- movq xmm5, [r0+3+8]
- punpcklbw xmm5, xmm7
-
- paddw xmm2, xmm3
- paddw xmm4, xmm5
- psllw xmm4, 2
- psubw xmm4, xmm2
- paddw xmm0, xmm1
- paddw xmm0, xmm4
- psllw xmm4, 2
- paddw xmm0, xmm4
- paddw xmm0, xmm6
- psraw xmm0, 5
- packuswb xmm0, xmm7
- movq [r2+8], xmm0
-
- lea r2, [r2+r3]
- lea r0, [r0+r1]
- dec r4
- jnz near .y_loop
-
- LOAD_5_PARA_POP
- ret
-
-
-;*******************************************************************************
-; void_t McHorVer02WidthEq8_sse2( uint8_t *pSrc,
-; int iSrcStride,
-; uint8_t *pDst,
-; int iDstStride,
-; int iHeight )
-;*******************************************************************************
-ALIGN 16
-McHorVer02WidthEq8_sse2:
- ;push esi
- ;push edi
- ;mov esi, [esp + 12] ;pSrc
- ;mov edx, [esp + 16] ;iSrcStride
- ;mov edi, [esp + 20] ;pDst
- ;mov eax, [esp + 24] ;iDstStride
- ;mov ecx, [esp + 28] ;iHeight
-
- %assign push_num 0
- LOAD_5_PARA
-%ifndef X86_32
- movsx r1, r1d
- movsx r3, r3d
- movsx r4, r4d
-%endif
- sub r0, r1
- sub r0, r1
-
- WELS_Zero xmm7
-
- SSE_LOAD_8P xmm0, xmm7, [r0]
- SSE_LOAD_8P xmm1, xmm7, [r0+r1]
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm2, xmm7, [r0]
- SSE_LOAD_8P xmm3, xmm7, [r0+r1]
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm4, xmm7, [r0]
- SSE_LOAD_8P xmm5, xmm7, [r0+r1]
-
-.start:
- FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
- dec r4
- jz near .xx_exit
-
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm6, xmm7, [r0]
- FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
- dec r4
- jz near .xx_exit
-
- lea r2, [r2+2*r3]
- SSE_LOAD_8P xmm7, xmm0, [r0+r1]
- FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
- dec r4
- jz near .xx_exit
-
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm0, xmm1, [r0]
- FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
- dec r4
- jz near .xx_exit
-
- lea r2, [r2+2*r3]
- SSE_LOAD_8P xmm1, xmm2, [r0+r1]
- FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
- dec r4
- jz near .xx_exit
-
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm2, xmm3, [r0]
- FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
- dec r4
- jz near .xx_exit
-
- lea r2, [r2+2*r3]
- SSE_LOAD_8P xmm3, xmm4, [r0+r1]
- FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
- dec r4
- jz near .xx_exit
-
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm4, xmm5, [r0]
- FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
- dec r4
- jz near .xx_exit
-
- lea r2, [r2+2*r3]
- SSE_LOAD_8P xmm5, xmm6, [r0+r1]
- jmp near .start
-
-.xx_exit:
- LOAD_5_PARA_POP
- ret
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-WELS_EXTERN McHorVer20Width9Or17_sse2
-WELS_EXTERN McHorVer02Height9Or17_sse2
-WELS_EXTERN McHorVer22Width8VerLastAlign_sse2
-WELS_EXTERN McHorVer22Width8VerLastUnAlign_sse2
-WELS_EXTERN McHorVer22HorFirst_sse2
-
-
-;***********************************************************************
-; void McHorVer02Height9Or17_sse2( uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
-; int32_t iWidth,
-; int32_t iHeight )
-;***********************************************************************
-ALIGN 16
-McHorVer02Height9Or17_sse2:
- ;push esi
- ;push edi
- ;push ebx
-
- ;mov esi, [esp + 16]
- ;mov edx, [esp + 20]
- ;mov edi, [esp + 24]
- ;mov eax, [esp + 28]
- ;mov ecx, [esp + 36]
- ;mov ebx, [esp + 32]
-
- %assign push_num 0
- LOAD_6_PARA
-%ifndef X86_32
- movsx r1, r1d
- movsx r3, r3d
- movsx r4, r4d
- movsx r5, r5d
-%endif
-
-%ifndef X86_32
- push r12
- push r13
- push r14
- mov r12, r0
- mov r13, r2
- mov r14, r5
-%endif
-
- shr r4, 3
- sub r0, r1
- sub r0, r1
-
-.xloop:
- WELS_Zero xmm7
- SSE_LOAD_8P xmm0, xmm7, [r0]
- SSE_LOAD_8P xmm1, xmm7, [r0+r1]
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm2, xmm7, [r0]
- SSE_LOAD_8P xmm3, xmm7, [r0+r1]
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm4, xmm7, [r0]
- SSE_LOAD_8P xmm5, xmm7, [r0+r1]
-
- FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
- dec r5
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm6, xmm7, [r0]
- movdqa xmm0,xmm1
- movdqa xmm1,xmm2
- movdqa xmm2,xmm3
- movdqa xmm3,xmm4
- movdqa xmm4,xmm5
- movdqa xmm5,xmm6
- add r2, r3
- sub r0, r1
-
-.start:
- FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
- dec r5
- jz near .x_loop_dec
-
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm6, xmm7, [r0]
- FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
- dec r5
- jz near .x_loop_dec
-
- lea r2, [r2+2*r3]
- SSE_LOAD_8P xmm7, xmm0, [r0+r1]
- FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
- dec r5
- jz near .x_loop_dec
-
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm0, xmm1, [r0]
- FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
- dec r5
- jz near .x_loop_dec
-
- lea r2, [r2+2*r3]
- SSE_LOAD_8P xmm1, xmm2, [r0+r1]
- FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
- dec r5
- jz near .x_loop_dec
-
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm2, xmm3, [r0]
- FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
- dec r5
- jz near .x_loop_dec
-
- lea r2, [r2+2*r3]
- SSE_LOAD_8P xmm3, xmm4, [r0+r1]
- FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
- dec r5
- jz near .x_loop_dec
-
- lea r0, [r0+2*r1]
- SSE_LOAD_8P xmm4, xmm5, [r0]
- FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
- dec r5
- jz near .x_loop_dec
-
- lea r2, [r2+2*r3]
- SSE_LOAD_8P xmm5, xmm6, [r0+r1]
- jmp near .start
-
-.x_loop_dec:
- dec r4
- jz near .xx_exit
- ;mov esi, [esp + 16]
- ;mov edi, [esp + 24]
- ;mov ecx, [esp + 36]
-%ifdef X86_32
- mov r0, arg1
- mov r2, arg3
- mov r5, arg6
-%else
- mov r0, r12
- mov r2, r13
- mov r5, r14
-%endif
- sub r0, r1
- sub r0, r1
- add r0, 8
- add r2, 8
- jmp near .xloop
-
-.xx_exit:
-%ifndef X86_32
- pop r14
- pop r13
- pop r12
-%endif
- LOAD_6_PARA_POP
- ret
-
-
-ALIGN 16
-;***********************************************************************
-; void McHorVer20Width9Or17_sse2( uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
-; int32_t iWidth,
-; int32_t iHeight
-; );
-;***********************************************************************
-McHorVer20Width9Or17_sse2:
- ;push esi
- ;push edi
- ;push ebx
- ;mov esi, [esp+16]
- ;mov eax, [esp+20]
- ;mov edi, [esp+24]
- ;mov edx, [esp+28]
- ;mov ecx, [esp+32]
- ;mov ebx, [esp+36]
-
- %assign push_num 0
- LOAD_6_PARA
-%ifndef X86_32
- movsx r1, r1d
- movsx r3, r3d
- movsx r4, r4d
- movsx r5, r5d
-%endif
- sub r0, 2
- pxor xmm7, xmm7
-
- cmp r4, 9
- jne near .width_17
-
-.yloop_width_9:
- movq xmm0, [r0]
- punpcklbw xmm0, xmm7
- movq xmm1, [r0+5]
- punpcklbw xmm1, xmm7
- movq xmm2, [r0+1]
- punpcklbw xmm2, xmm7
- movq xmm3, [r0+4]
- punpcklbw xmm3, xmm7
- movq xmm4, [r0+2]
- punpcklbw xmm4, xmm7
- movq xmm5, [r0+3]
- punpcklbw xmm5, xmm7
-
- movdqa xmm7, xmm2
- paddw xmm7, xmm3
- movdqa xmm6, xmm4
- paddw xmm6, xmm5
- psllw xmm6, 2
- psubw xmm6, xmm7
- paddw xmm0, xmm1
- paddw xmm0, xmm6
- psllw xmm6, 2
- paddw xmm0, xmm6
- paddw xmm0, [h264_w0x10_1]
- psraw xmm0, 5
- packuswb xmm0, xmm0
- movd [r2], xmm0
-
- pxor xmm7, xmm7
- movq xmm0, [r0+6]
- punpcklbw xmm0, xmm7
-
- paddw xmm4, xmm1
- paddw xmm5, xmm3
- psllw xmm5, 2
- psubw xmm5, xmm4
- paddw xmm2, xmm0
- paddw xmm2, xmm5
- psllw xmm5, 2
- paddw xmm2, xmm5
- paddw xmm2, [h264_w0x10_1]
- psraw xmm2, 5
- packuswb xmm2, xmm2
- movq [r2+1], xmm2
-
- add r0, r1
- add r2, r3
- dec r5
- jnz .yloop_width_9
- LOAD_6_PARA_POP
- ret
-
-
-.width_17:
-.yloop_width_17:
- movq xmm0, [r0]
- punpcklbw xmm0, xmm7
- movq xmm1, [r0+5]
- punpcklbw xmm1, xmm7
- movq xmm2, [r0+1]
- punpcklbw xmm2, xmm7
- movq xmm3, [r0+4]
- punpcklbw xmm3, xmm7
- movq xmm4, [r0+2]
- punpcklbw xmm4, xmm7
- movq xmm5, [r0+3]
- punpcklbw xmm5, xmm7
-
- paddw xmm2, xmm3
- paddw xmm4, xmm5
- psllw xmm4, 2
- psubw xmm4, xmm2
- paddw xmm0, xmm1
- paddw xmm0, xmm4
- psllw xmm4, 2
- paddw xmm0, xmm4
- paddw xmm0, [h264_w0x10_1]
- psraw xmm0, 5
- packuswb xmm0, xmm0
- movq [r2], xmm0
-
- movq xmm0, [r0+8]
- punpcklbw xmm0, xmm7
- movq xmm1, [r0+5+8]
- punpcklbw xmm1, xmm7
- movq xmm2, [r0+1+8]
- punpcklbw xmm2, xmm7
- movq xmm3, [r0+4+8]
- punpcklbw xmm3, xmm7
- movq xmm4, [r0+2+8]
- punpcklbw xmm4, xmm7
- movq xmm5, [r0+3+8]
- punpcklbw xmm5, xmm7
-
- movdqa xmm7, xmm2
- paddw xmm7, xmm3
- movdqa xmm6, xmm4
- paddw xmm6, xmm5
- psllw xmm6, 2
- psubw xmm6, xmm7
- paddw xmm0, xmm1
- paddw xmm0, xmm6
- psllw xmm6, 2
- paddw xmm0, xmm6
- paddw xmm0, [h264_w0x10_1]
- psraw xmm0, 5
- packuswb xmm0, xmm0
- movd [r2+8], xmm0
-
-
- pxor xmm7, xmm7
- movq xmm0, [r0+6+8]
- punpcklbw xmm0, xmm7
-
- paddw xmm4, xmm1
- paddw xmm5, xmm3
- psllw xmm5, 2
- psubw xmm5, xmm4
- paddw xmm2, xmm0
- paddw xmm2, xmm5
- psllw xmm5, 2
- paddw xmm2, xmm5
- paddw xmm2, [h264_w0x10_1]
- psraw xmm2, 5
- packuswb xmm2, xmm2
- movq [r2+9], xmm2
- add r0, r1
- add r2, r3
- dec r5
- jnz .yloop_width_17
- LOAD_6_PARA_POP
- ret
-
-
-
-ALIGN 16
-;***********************************************************************
-;void McHorVer22HorFirst_sse2
-; (uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t * pTap,
-; int32_t iTapStride,
-;      int32_t iWidth, int32_t iHeight );
-;***********************************************************************
-McHorVer22HorFirst_sse2:
- ;push esi
- ;push edi
- ;push ebx
- ;mov esi, [esp+16]
- ;mov eax, [esp+20]
- ;mov edi, [esp+24]
- ;mov edx, [esp+28]
- ;mov ecx, [esp+32]
- ;mov ebx, [esp+36]
-
- %assign push_num 0
- LOAD_6_PARA
-%ifndef X86_32
- movsx r1, r1d
- movsx r3, r3d
- movsx r4, r4d
- movsx r5, r5d
-%endif
- pxor xmm7, xmm7
-    sub r0, r1      ; the later vertical pass needs 5 extra rows, so start 2 rows up.
- sub r0, r1
-
- cmp r4, 9
- jne near .width_17
-
-.yloop_width_9:
- movq xmm0, [r0]
- punpcklbw xmm0, xmm7
- movq xmm1, [r0+5]
- punpcklbw xmm1, xmm7
- movq xmm2, [r0+1]
- punpcklbw xmm2, xmm7
- movq xmm3, [r0+4]
- punpcklbw xmm3, xmm7
- movq xmm4, [r0+2]
- punpcklbw xmm4, xmm7
- movq xmm5, [r0+3]
- punpcklbw xmm5, xmm7
-
- movdqa xmm7, xmm2
- paddw xmm7, xmm3
- movdqa xmm6, xmm4
- paddw xmm6, xmm5
- psllw xmm6, 2
- psubw xmm6, xmm7
- paddw xmm0, xmm1
- paddw xmm0, xmm6
- psllw xmm6, 2
- paddw xmm0, xmm6
- movd [r2], xmm0
-
- pxor xmm7, xmm7
- movq xmm0, [r0+6]
- punpcklbw xmm0, xmm7
-
- paddw xmm4, xmm1
- paddw xmm5, xmm3
- psllw xmm5, 2
- psubw xmm5, xmm4
- paddw xmm2, xmm0
- paddw xmm2, xmm5
- psllw xmm5, 2
- paddw xmm2, xmm5
- movq [r2+2], xmm2
- movhps [r2+2+8], xmm2
-
- add r0, r1
- add r2, r3
- dec r5
- jnz .yloop_width_9
- LOAD_6_PARA_POP
- ret
-
-
-.width_17:
-.yloop_width_17:
- movq xmm0, [r0]
- punpcklbw xmm0, xmm7
- movq xmm1, [r0+5]
- punpcklbw xmm1, xmm7
- movq xmm2, [r0+1]
- punpcklbw xmm2, xmm7
- movq xmm3, [r0+4]
- punpcklbw xmm3, xmm7
- movq xmm4, [r0+2]
- punpcklbw xmm4, xmm7
- movq xmm5, [r0+3]
- punpcklbw xmm5, xmm7
-
- paddw xmm2, xmm3
- paddw xmm4, xmm5
- psllw xmm4, 2
- psubw xmm4, xmm2
- paddw xmm0, xmm1
- paddw xmm0, xmm4
- psllw xmm4, 2
- paddw xmm0, xmm4
- movdqa [r2], xmm0
-
- movq xmm0, [r0+8]
- punpcklbw xmm0, xmm7
- movq xmm1, [r0+5+8]
- punpcklbw xmm1, xmm7
- movq xmm2, [r0+1+8]
- punpcklbw xmm2, xmm7
- movq xmm3, [r0+4+8]
- punpcklbw xmm3, xmm7
- movq xmm4, [r0+2+8]
- punpcklbw xmm4, xmm7
- movq xmm5, [r0+3+8]
- punpcklbw xmm5, xmm7
-
- movdqa xmm7, xmm2
- paddw xmm7, xmm3
- movdqa xmm6, xmm4
- paddw xmm6, xmm5
- psllw xmm6, 2
- psubw xmm6, xmm7
- paddw xmm0, xmm1
- paddw xmm0, xmm6
- psllw xmm6, 2
- paddw xmm0, xmm6
- movd [r2+16], xmm0
-
-
- pxor xmm7, xmm7
- movq xmm0, [r0+6+8]
- punpcklbw xmm0, xmm7
-
- paddw xmm4, xmm1
- paddw xmm5, xmm3
- psllw xmm5, 2
- psubw xmm5, xmm4
- paddw xmm2, xmm0
- paddw xmm2, xmm5
- psllw xmm5, 2
- paddw xmm2, xmm5
- movq [r2+18], xmm2
- movhps [r2+18+8], xmm2
-
- add r0, r1
- add r2, r3
- dec r5
- jnz .yloop_width_17
- LOAD_6_PARA_POP
- ret
-
-
-%macro FILTER_VER 9
- paddw %1, %6
- movdqa %7, %2
- movdqa %8, %3
-
-
- paddw %7, %5
- paddw %8, %4
-
- psubw %1, %7
- psraw %1, 2
- paddw %1, %8
- psubw %1, %7
- psraw %1, 2
- paddw %8, %1
- paddw %8, [h264_mc_hc_32]
- psraw %8, 6
- packuswb %8, %8
- movq %9, %8
-%endmacro
-;***********************************************************************
-;void McHorVer22Width8VerLastAlign_sse2(
-; uint8_t *pTap,
-; int32_t iTapStride,
-; uint8_t * pDst,
-; int32_t iDstStride,
-; int32_t iWidth,
-; int32_t iHeight);
-;***********************************************************************
-
-McHorVer22Width8VerLastAlign_sse2:
- ;push esi
- ;push edi
- ;push ebx
- ;push ebp
-
- ;mov esi, [esp+20]
- ;mov eax, [esp+24]
- ;mov edi, [esp+28]
- ;mov edx, [esp+32]
- ;mov ebx, [esp+36]
- ;mov ecx, [esp+40]
-
- %assign push_num 0
- LOAD_6_PARA
-%ifndef X86_32
- movsx r1, r1d
- movsx r3, r3d
- movsx r4, r4d
- movsx r5, r5d
-%endif
-%ifndef X86_32
- push r12
- push r13
- push r14
- mov r12, r0
- mov r13, r2
- mov r14, r5
-%endif
-
- shr r4, 3
-
-.width_loop:
- movdqa xmm0, [r0]
- movdqa xmm1, [r0+r1]
- lea r0, [r0+2*r1]
- movdqa xmm2, [r0]
- movdqa xmm3, [r0+r1]
- lea r0, [r0+2*r1]
- movdqa xmm4, [r0]
- movdqa xmm5, [r0+r1]
-
- FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
- dec r5
- lea r0, [r0+2*r1]
- movdqa xmm6, [r0]
-
- movdqa xmm0, xmm1
- movdqa xmm1, xmm2
- movdqa xmm2, xmm3
- movdqa xmm3, xmm4
- movdqa xmm4, xmm5
- movdqa xmm5, xmm6
-
- add r2, r3
- sub r0, r1
-
-.start:
- FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
- dec r5
- jz near .x_loop_dec
-
- lea r0, [r0+2*r1]
- movdqa xmm6, [r0]
- FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
- dec r5
- jz near .x_loop_dec
-
- lea r2, [r2+2*r3]
- movdqa xmm7, [r0+r1]
- FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
- dec r5
- jz near .x_loop_dec
-
- lea r0, [r0+2*r1]
- movdqa xmm0, [r0]
- FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
- dec r5
- jz near .x_loop_dec
-
- lea r2, [r2+2*r3]
- movdqa xmm1, [r0+r1]
- FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
- dec r5
- jz near .x_loop_dec
-
- lea r0, [r0+2*r1]
- movdqa xmm2, [r0]
- FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
- dec r5
- jz near .x_loop_dec
-
- lea r2, [r2+2*r3]
- movdqa xmm3, [r0+r1]
- FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
- dec r5
- jz near .x_loop_dec
-
- lea r0, [r0+2*r1]
- movdqa xmm4, [r0]
- FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
- dec r5
- jz near .x_loop_dec
-
- lea r2, [r2+2*r3]
- movdqa xmm5, [r0+r1]
- jmp near .start
-
-.x_loop_dec:
- dec r4
- jz near .exit
- ;mov esi, [esp+20]
- ;mov edi, [esp+28]
- ;mov ecx, [esp+40]
-%ifdef X86_32
- mov r0, arg1
- mov r2, arg3
- mov r5, arg6
-%else
- mov r0, r12
- mov r2, r13
- mov r5, r14
-%endif
- add r0, 16
- add r2, 8
- jmp .width_loop
-
-.exit:
-%ifndef X86_32
- pop r14
- pop r13
- pop r12
-%endif
- LOAD_6_PARA_POP
- ret
-
-;***********************************************************************
-;void McHorVer22Width8VerLastUnAlign_sse2(
-; uint8_t *pTap,
-; int32_t iTapStride,
-; uint8_t * pDst,
-; int32_t iDstStride,
-; int32_t iWidth,
-; int32_t iHeight);
-;***********************************************************************
-
-McHorVer22Width8VerLastUnAlign_sse2:
- ;push esi
- ;push edi
- ;push ebx
- ;push ebp
-
- ;mov esi, [esp+20]
- ;mov eax, [esp+24]
- ;mov edi, [esp+28]
- ;mov edx, [esp+32]
- ;mov ebx, [esp+36]
- ;mov ecx, [esp+40]
-
- %assign push_num 0
- LOAD_6_PARA
-%ifndef X86_32
- movsx r1, r1d
- movsx r3, r3d
- movsx r4, r4d
- movsx r5, r5d
-%endif
-%ifndef X86_32
- push r12
- push r13
- push r14
- mov r12, r0
- mov r13, r2
- mov r14, r5
-%endif
- shr r4, 3
-
-.width_loop:
- movdqu xmm0, [r0]
- movdqu xmm1, [r0+r1]
- lea r0, [r0+2*r1]
- movdqu xmm2, [r0]
- movdqu xmm3, [r0+r1]
- lea r0, [r0+2*r1]
- movdqu xmm4, [r0]
- movdqu xmm5, [r0+r1]
-
- FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
- dec r5
- lea r0, [r0+2*r1]
- movdqu xmm6, [r0]
-
- movdqa xmm0, xmm1
- movdqa xmm1, xmm2
- movdqa xmm2, xmm3
- movdqa xmm3, xmm4
- movdqa xmm4, xmm5
- movdqa xmm5, xmm6
-
- add r2, r3
- sub r0, r1
-
-.start:
- FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
- dec r5
- jz near .x_loop_dec
-
- lea r0, [r0+2*r1]
- movdqu xmm6, [r0]
- FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
- dec r5
- jz near .x_loop_dec
-
- lea r2, [r2+2*r3]
- movdqu xmm7, [r0+r1]
- FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
- dec r5
- jz near .x_loop_dec
-
- lea r0, [r0+2*r1]
- movdqu xmm0, [r0]
- FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
- dec r5
- jz near .x_loop_dec
-
- lea r2, [r2+2*r3]
- movdqu xmm1, [r0+r1]
- FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
- dec r5
- jz near .x_loop_dec
-
- lea r0, [r0+2*r1]
- movdqu xmm2, [r0]
- FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
- dec r5
- jz near .x_loop_dec
-
- lea r2, [r2+2*r3]
- movdqu xmm3, [r0+r1]
- FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
- dec r5
- jz near .x_loop_dec
-
- lea r0, [r0+2*r1]
- movdqu xmm4, [r0]
- FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
- dec r5
- jz near .x_loop_dec
-
- lea r2, [r2+2*r3]
- movdqu xmm5, [r0+r1]
- jmp near .start
-
-.x_loop_dec:
- dec r4
- jz near .exit
- ;mov esi, [esp+20]
- ;mov edi, [esp+28]
- ;mov ecx, [esp+40]
-%ifdef X86_32
- mov r0, arg1
- mov r2, arg3
- mov r5, arg6
-%else
- mov r0, r12
- mov r2, r13
- mov r5, r14
-%endif
- add r0, 16
- add r2, 8
- jmp .width_loop
-
-.exit:
-%ifndef X86_32
- pop r14
- pop r13
- pop r12
-%endif
- LOAD_6_PARA_POP
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* mc_luma.asm
+;*
+;* Abstract
+;* sse2 motion compensation
+;*
+;* History
+;* 17/08/2009 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+;*******************************************************************************
+; Local Data (Read Only)
+;*******************************************************************************
+%ifdef FORMAT_COFF
+SECTION .rodata pData
+%else
+SECTION .rodata align=16
+%endif
+
+;*******************************************************************************
+; Various memory constants (rounding values for the interpolation filters)
+;*******************************************************************************
+
+ALIGN 16
+h264_w0x10:
+ dw 16, 16, 16, 16
+ALIGN 16
+h264_w0x10_1:
+ dw 16, 16, 16, 16, 16, 16, 16, 16
+ALIGN 16
+h264_mc_hc_32:
+ dw 32, 32, 32, 32, 32, 32, 32, 32
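+
+; Rounding terms: h264_w0x10/h264_w0x10_1 hold the 16 added before the
+; ">> 5" of the one-pass 6-tap filters; h264_mc_hc_32 holds the 32 added
+; before the final ">> 6" of the two-pass (hor then ver) filter.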
+
+
+;*******************************************************************************
+; Code
+;*******************************************************************************
+
+SECTION .text
+
+WELS_EXTERN McHorVer20WidthEq4_mmx
+
+
+ALIGN 16
+;*******************************************************************************
+; void_t McHorVer20WidthEq4_mmx( uint8_t *pSrc,
+; int iSrcStride,
+; uint8_t *pDst,
+; int iDstStride,
+; int iHeight)
+;*******************************************************************************
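+; Horizontal half-pel filter: each output pixel is the H.264 6-tap
+; kernel (1, -5, 20, 20, -5, 1) applied with rounding and saturation,
+;   pDst[x] = clip255( ( pSrc[x-2] + pSrc[x+3]
+;                      - 5*(pSrc[x-1] + pSrc[x+2])
+;                      + 20*(pSrc[x] + pSrc[x+1]) + 16 ) >> 5 )
+; (the clip comes from the final packuswb).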
+McHorVer20WidthEq4_mmx:
+ ;push esi
+ ;push edi
+
+ ;mov esi, [esp+12]
+ ;mov eax, [esp+16]
+ ;mov edi, [esp+20]
+ ;mov ecx, [esp+24]
+ ;mov edx, [esp+28]
+
+ %assign push_num 0
+ LOAD_5_PARA
+%ifndef X86_32
+ movsx r1, r1d
+ movsx r3, r3d
+ movsx r4, r4d
+%endif
+
+ sub r0, 2
+ WELS_Zero mm7
+ movq mm6, [h264_w0x10]
+.height_loop:
+ movd mm0, [r0]
+ punpcklbw mm0, mm7
+ movd mm1, [r0+5]
+ punpcklbw mm1, mm7
+ movd mm2, [r0+1]
+ punpcklbw mm2, mm7
+ movd mm3, [r0+4]
+ punpcklbw mm3, mm7
+ movd mm4, [r0+2]
+ punpcklbw mm4, mm7
+ movd mm5, [r0+3]
+ punpcklbw mm5, mm7
+
+ paddw mm2, mm3
+ paddw mm4, mm5
+ psllw mm4, 2
+ psubw mm4, mm2
+ paddw mm0, mm1
+ paddw mm0, mm4
+ psllw mm4, 2
+ paddw mm0, mm4
+ paddw mm0, mm6
+ psraw mm0, 5
+ packuswb mm0, mm7
+ movd [r2], mm0
+
+ add r0, r1
+ add r2, r3
+ dec r4
+ jnz .height_loop
+
+ WELSEMMS
+ LOAD_5_PARA_POP
+ ret
+
+;*******************************************************************************
+; Macros and other preprocessor constants
+;*******************************************************************************
+
+
+%macro SSE_LOAD_8P 3
+ movq %1, %3
+ punpcklbw %1, %2
+%endmacro
+
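+; FILTER_HV_W8 realizes the (1,-5,20,20,-5,1) kernel without multiplies:
+; with t = 4*(%3 + %4) - (%2 + %5), the accumulator %1 (outer pair plus
+; the rounding 16) gains t + 4*t = 20*(%3 + %4) - 5*(%2 + %5) before the
+; arithmetic ">> 5", i.e. 20x = 16x + 4x via two psllw-by-2 steps.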
+%macro FILTER_HV_W8 9
+ paddw %1, %6
+ movdqa %8, %3
+ movdqa %7, %2
+ paddw %1, [h264_w0x10_1]
+ paddw %8, %4
+ paddw %7, %5
+ psllw %8, 2
+ psubw %8, %7
+ paddw %1, %8
+ psllw %8, 2
+ paddw %1, %8
+ psraw %1, 5
+ WELS_Zero %8
+ packuswb %1, %8
+ movq %9, %1
+%endmacro
+
+;*******************************************************************************
+; Code
+;*******************************************************************************
+
+SECTION .text
+WELS_EXTERN McHorVer22Width8HorFirst_sse2
+WELS_EXTERN McHorVer02WidthEq8_sse2
+WELS_EXTERN McHorVer20WidthEq8_sse2
+WELS_EXTERN McHorVer20WidthEq16_sse2
+
+ALIGN 16
+;***********************************************************************
+; void_t McHorVer22Width8HorFirst_sse2(int16_t *pSrc,
+; int16_t iSrcStride,
+; uint8_t *pDst,
+;                       int32_t iDstStride,
+;                       int32_t iHeight );
+;***********************************************************************
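+; First pass of the two-dimensional (hor then ver) half-pel filter:
+; applies the horizontal 6-tap kernel but stores the raw 16-bit sums
+; with no "+16 >> 5" rounding, so one of the Width8VerLast* routines
+; below can filter vertically over the taps and round once at the end.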
+McHorVer22Width8HorFirst_sse2:
+ ;push esi
+ ;push edi
+ ;push ebx
+ ;mov esi, [esp+16] ;pSrc
+ ;mov eax, [esp+20] ;iSrcStride
+ ;mov edi, [esp+24] ;pDst
+ ;mov edx, [esp+28] ;iDstStride
+ ;mov ebx, [esp+32] ;iHeight
+
+ %assign push_num 0
+ LOAD_5_PARA
+%ifndef X86_32
+ movsx r1, r1d
+ movsx r3, r3d
+ movsx r4, r4d
+%endif
+ pxor xmm7, xmm7
+
+    sub r0, r1      ; the later vertical pass needs 5 extra rows, so start 2 rows up.
+ sub r0, r1
+
+.yloop_width_8:
+ movq xmm0, [r0]
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3]
+ punpcklbw xmm5, xmm7
+
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ psllw xmm4, 2
+ psubw xmm4, xmm2
+ paddw xmm0, xmm1
+ paddw xmm0, xmm4
+ psllw xmm4, 2
+ paddw xmm0, xmm4
+ movdqa [r2], xmm0
+
+ add r0, r1
+ add r2, r3
+ dec r4
+ jnz .yloop_width_8
+ LOAD_5_PARA_POP
+ ret
+
+ALIGN 16
+;*******************************************************************************
+; void_t McHorVer20WidthEq8_sse2( uint8_t *pSrc,
+; int iSrcStride,
+; uint8_t *pDst,
+; int iDstStride,
+;                          int iHeight );
+;*******************************************************************************
+McHorVer20WidthEq8_sse2:
+ ;push esi
+ ;push edi
+
+ ;mov esi, [esp + 12] ;pSrc
+ ;mov eax, [esp + 16] ;iSrcStride
+ ;mov edi, [esp + 20] ;pDst
+ ;mov ecx, [esp + 28] ;iHeight
+ ;mov edx, [esp + 24] ;iDstStride
+
+ %assign push_num 0
+ LOAD_5_PARA
+%ifndef X86_32
+ movsx r1, r1d
+ movsx r3, r3d
+ movsx r4, r4d
+%endif
+ lea r0, [r0-2] ;pSrc -= 2;
+
+ pxor xmm7, xmm7
+ movdqa xmm6, [h264_w0x10_1]
+.y_loop:
+ movq xmm0, [r0]
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3]
+ punpcklbw xmm5, xmm7
+
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ psllw xmm4, 2
+ psubw xmm4, xmm2
+ paddw xmm0, xmm1
+ paddw xmm0, xmm4
+ psllw xmm4, 2
+ paddw xmm0, xmm4
+ paddw xmm0, xmm6
+ psraw xmm0, 5
+
+ packuswb xmm0, xmm7
+ movq [r2], xmm0
+
+ lea r2, [r2+r3]
+ lea r0, [r0+r1]
+ dec r4
+ jnz near .y_loop
+
+ LOAD_5_PARA_POP
+ ret
+
+ALIGN 16
+;*******************************************************************************
+; void_t McHorVer20WidthEq16_sse2( uint8_t *pSrc,
+; int iSrcStride,
+; uint8_t *pDst,
+; int iDstStride,
+;                          int iHeight );
+;*******************************************************************************
+McHorVer20WidthEq16_sse2:
+ ;push esi
+ ;push edi
+ ;mov esi, [esp + 12] ;pSrc
+ ;mov eax, [esp + 16] ;iSrcStride
+ ;mov edi, [esp + 20] ;pDst
+ ;mov ecx, [esp + 28] ;iHeight
+ ;mov edx, [esp + 24] ;iDstStride
+
+ %assign push_num 0
+ LOAD_5_PARA
+%ifndef X86_32
+ movsx r1, r1d
+ movsx r3, r3d
+ movsx r4, r4d
+%endif
+ lea r0, [r0-2] ;pSrc -= 2;
+
+ pxor xmm7, xmm7
+ movdqa xmm6, [h264_w0x10_1]
+.y_loop:
+
+ movq xmm0, [r0]
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3]
+ punpcklbw xmm5, xmm7
+
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ psllw xmm4, 2
+ psubw xmm4, xmm2
+ paddw xmm0, xmm1
+ paddw xmm0, xmm4
+ psllw xmm4, 2
+ paddw xmm0, xmm4
+ paddw xmm0, xmm6
+ psraw xmm0, 5
+ packuswb xmm0, xmm7
+ movq [r2], xmm0
+
+ movq xmm0, [r0+8]
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5+8]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1+8]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4+8]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2+8]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3+8]
+ punpcklbw xmm5, xmm7
+
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ psllw xmm4, 2
+ psubw xmm4, xmm2
+ paddw xmm0, xmm1
+ paddw xmm0, xmm4
+ psllw xmm4, 2
+ paddw xmm0, xmm4
+ paddw xmm0, xmm6
+ psraw xmm0, 5
+ packuswb xmm0, xmm7
+ movq [r2+8], xmm0
+
+ lea r2, [r2+r3]
+ lea r0, [r0+r1]
+ dec r4
+ jnz near .y_loop
+
+ LOAD_5_PARA_POP
+ ret
+
+
+;*******************************************************************************
+; void_t McHorVer02WidthEq8_sse2( uint8_t *pSrc,
+; int iSrcStride,
+; uint8_t *pDst,
+; int iDstStride,
+; int iHeight )
+;*******************************************************************************
+ALIGN 16
+McHorVer02WidthEq8_sse2:
+ ;push esi
+ ;push edi
+ ;mov esi, [esp + 12] ;pSrc
+ ;mov edx, [esp + 16] ;iSrcStride
+ ;mov edi, [esp + 20] ;pDst
+ ;mov eax, [esp + 24] ;iDstStride
+ ;mov ecx, [esp + 28] ;iHeight
+
+ %assign push_num 0
+ LOAD_5_PARA
+%ifndef X86_32
+ movsx r1, r1d
+ movsx r3, r3d
+ movsx r4, r4d
+%endif
+ sub r0, r1
+ sub r0, r1
+
+ WELS_Zero xmm7
+
+ SSE_LOAD_8P xmm0, xmm7, [r0]
+ SSE_LOAD_8P xmm1, xmm7, [r0+r1]
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm2, xmm7, [r0]
+ SSE_LOAD_8P xmm3, xmm7, [r0+r1]
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm4, xmm7, [r0]
+ SSE_LOAD_8P xmm5, xmm7, [r0+r1]
+
+.start:
+ FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+ dec r4
+ jz near .xx_exit
+
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm6, xmm7, [r0]
+ FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
+ dec r4
+ jz near .xx_exit
+
+ lea r2, [r2+2*r3]
+ SSE_LOAD_8P xmm7, xmm0, [r0+r1]
+ FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+ dec r4
+ jz near .xx_exit
+
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm0, xmm1, [r0]
+ FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
+ dec r4
+ jz near .xx_exit
+
+ lea r2, [r2+2*r3]
+ SSE_LOAD_8P xmm1, xmm2, [r0+r1]
+ FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
+ dec r4
+ jz near .xx_exit
+
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm2, xmm3, [r0]
+ FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
+ dec r4
+ jz near .xx_exit
+
+ lea r2, [r2+2*r3]
+ SSE_LOAD_8P xmm3, xmm4, [r0+r1]
+ FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
+ dec r4
+ jz near .xx_exit
+
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm4, xmm5, [r0]
+ FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
+ dec r4
+ jz near .xx_exit
+
+ lea r2, [r2+2*r3]
+ SSE_LOAD_8P xmm5, xmm6, [r0+r1]
+ jmp near .start
+
+.xx_exit:
+ LOAD_5_PARA_POP
+ ret
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+WELS_EXTERN McHorVer20Width9Or17_sse2
+WELS_EXTERN McHorVer02Height9Or17_sse2
+WELS_EXTERN McHorVer22Width8VerLastAlign_sse2
+WELS_EXTERN McHorVer22Width8VerLastUnAlign_sse2
+WELS_EXTERN McHorVer22HorFirst_sse2
+
+
+;***********************************************************************
+; void McHorVer02Height9Or17_sse2( uint8_t *pSrc,
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
+; int32_t iWidth,
+; int32_t iHeight )
+;***********************************************************************
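+; Vertical 6-tap filter processed in 8-pixel columns (iWidth >> 3 of
+; them): xmm0..xmm5 form a rolling six-row window, so once primed each
+; newly loaded row produces one output row via FILTER_HV_W8.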
+ALIGN 16
+McHorVer02Height9Or17_sse2:
+ ;push esi
+ ;push edi
+ ;push ebx
+
+ ;mov esi, [esp + 16]
+ ;mov edx, [esp + 20]
+ ;mov edi, [esp + 24]
+ ;mov eax, [esp + 28]
+ ;mov ecx, [esp + 36]
+ ;mov ebx, [esp + 32]
+
+ %assign push_num 0
+ LOAD_6_PARA
+%ifndef X86_32
+ movsx r1, r1d
+ movsx r3, r3d
+ movsx r4, r4d
+ movsx r5, r5d
+%endif
+
+%ifndef X86_32
+ push r12
+ push r13
+ push r14
+ mov r12, r0
+ mov r13, r2
+ mov r14, r5
+%endif
+
+ shr r4, 3
+ sub r0, r1
+ sub r0, r1
+
+.xloop:
+ WELS_Zero xmm7
+ SSE_LOAD_8P xmm0, xmm7, [r0]
+ SSE_LOAD_8P xmm1, xmm7, [r0+r1]
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm2, xmm7, [r0]
+ SSE_LOAD_8P xmm3, xmm7, [r0+r1]
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm4, xmm7, [r0]
+ SSE_LOAD_8P xmm5, xmm7, [r0+r1]
+
+ FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+ dec r5
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm6, xmm7, [r0]
+ movdqa xmm0,xmm1
+ movdqa xmm1,xmm2
+ movdqa xmm2,xmm3
+ movdqa xmm3,xmm4
+ movdqa xmm4,xmm5
+ movdqa xmm5,xmm6
+ add r2, r3
+ sub r0, r1
+
+.start:
+ FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm6, xmm7, [r0]
+ FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r2, [r2+2*r3]
+ SSE_LOAD_8P xmm7, xmm0, [r0+r1]
+ FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm0, xmm1, [r0]
+ FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r2, [r2+2*r3]
+ SSE_LOAD_8P xmm1, xmm2, [r0+r1]
+ FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm2, xmm3, [r0]
+ FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r2, [r2+2*r3]
+ SSE_LOAD_8P xmm3, xmm4, [r0+r1]
+ FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm4, xmm5, [r0]
+ FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r2, [r2+2*r3]
+ SSE_LOAD_8P xmm5, xmm6, [r0+r1]
+ jmp near .start
+
+.x_loop_dec:
+ dec r4
+ jz near .xx_exit
+ ;mov esi, [esp + 16]
+ ;mov edi, [esp + 24]
+ ;mov ecx, [esp + 36]
+%ifdef X86_32
+ mov r0, arg1
+ mov r2, arg3
+ mov r5, arg6
+%else
+ mov r0, r12
+ mov r2, r13
+ mov r5, r14
+%endif
+ sub r0, r1
+ sub r0, r1
+ add r0, 8
+ add r2, 8
+ jmp near .xloop
+
+.xx_exit:
+%ifndef X86_32
+ pop r14
+ pop r13
+ pop r12
+%endif
+ LOAD_6_PARA_POP
+ ret
+
+
+ALIGN 16
+;***********************************************************************
+; void McHorVer20Width9Or17_sse2( uint8_t *pSrc,
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
+; int32_t iWidth,
+; int32_t iHeight
+; );
+;***********************************************************************
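+; The odd widths are covered with overlapping 8-wide stores: width 9
+; writes a second filtered run at pDst+1, width 17 adds runs at pDst+8
+; and pDst+9, so no scalar tail loop is needed.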
+McHorVer20Width9Or17_sse2:
+ ;push esi
+ ;push edi
+ ;push ebx
+ ;mov esi, [esp+16]
+ ;mov eax, [esp+20]
+ ;mov edi, [esp+24]
+ ;mov edx, [esp+28]
+ ;mov ecx, [esp+32]
+ ;mov ebx, [esp+36]
+
+ %assign push_num 0
+ LOAD_6_PARA
+%ifndef X86_32
+ movsx r1, r1d
+ movsx r3, r3d
+ movsx r4, r4d
+ movsx r5, r5d
+%endif
+ sub r0, 2
+ pxor xmm7, xmm7
+
+ cmp r4, 9
+ jne near .width_17
+
+.yloop_width_9:
+ movq xmm0, [r0]
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3]
+ punpcklbw xmm5, xmm7
+
+ movdqa xmm7, xmm2
+ paddw xmm7, xmm3
+ movdqa xmm6, xmm4
+ paddw xmm6, xmm5
+ psllw xmm6, 2
+ psubw xmm6, xmm7
+ paddw xmm0, xmm1
+ paddw xmm0, xmm6
+ psllw xmm6, 2
+ paddw xmm0, xmm6
+ paddw xmm0, [h264_w0x10_1]
+ psraw xmm0, 5
+ packuswb xmm0, xmm0
+ movd [r2], xmm0
+
+ pxor xmm7, xmm7
+ movq xmm0, [r0+6]
+ punpcklbw xmm0, xmm7
+
+ paddw xmm4, xmm1
+ paddw xmm5, xmm3
+ psllw xmm5, 2
+ psubw xmm5, xmm4
+ paddw xmm2, xmm0
+ paddw xmm2, xmm5
+ psllw xmm5, 2
+ paddw xmm2, xmm5
+ paddw xmm2, [h264_w0x10_1]
+ psraw xmm2, 5
+ packuswb xmm2, xmm2
+ movq [r2+1], xmm2
+
+ add r0, r1
+ add r2, r3
+ dec r5
+ jnz .yloop_width_9
+ LOAD_6_PARA_POP
+ ret
+
+
+.width_17:
+.yloop_width_17:
+ movq xmm0, [r0]
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3]
+ punpcklbw xmm5, xmm7
+
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ psllw xmm4, 2
+ psubw xmm4, xmm2
+ paddw xmm0, xmm1
+ paddw xmm0, xmm4
+ psllw xmm4, 2
+ paddw xmm0, xmm4
+ paddw xmm0, [h264_w0x10_1]
+ psraw xmm0, 5
+ packuswb xmm0, xmm0
+ movq [r2], xmm0
+
+ movq xmm0, [r0+8]
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5+8]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1+8]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4+8]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2+8]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3+8]
+ punpcklbw xmm5, xmm7
+
+ movdqa xmm7, xmm2
+ paddw xmm7, xmm3
+ movdqa xmm6, xmm4
+ paddw xmm6, xmm5
+ psllw xmm6, 2
+ psubw xmm6, xmm7
+ paddw xmm0, xmm1
+ paddw xmm0, xmm6
+ psllw xmm6, 2
+ paddw xmm0, xmm6
+ paddw xmm0, [h264_w0x10_1]
+ psraw xmm0, 5
+ packuswb xmm0, xmm0
+ movd [r2+8], xmm0
+
+
+ pxor xmm7, xmm7
+ movq xmm0, [r0+6+8]
+ punpcklbw xmm0, xmm7
+
+ paddw xmm4, xmm1
+ paddw xmm5, xmm3
+ psllw xmm5, 2
+ psubw xmm5, xmm4
+ paddw xmm2, xmm0
+ paddw xmm2, xmm5
+ psllw xmm5, 2
+ paddw xmm2, xmm5
+ paddw xmm2, [h264_w0x10_1]
+ psraw xmm2, 5
+ packuswb xmm2, xmm2
+ movq [r2+9], xmm2
+ add r0, r1
+ add r2, r3
+ dec r5
+ jnz .yloop_width_17
+ LOAD_6_PARA_POP
+ ret
+
+
+
+ALIGN 16
+;***********************************************************************
+;void McHorVer22HorFirst_sse2
+; (uint8_t *pSrc,
+; int32_t iSrcStride,
+; uint8_t * pTap,
+; int32_t iTapStride,
+;      int32_t iWidth, int32_t iHeight );
+;***********************************************************************
+McHorVer22HorFirst_sse2:
+ ;push esi
+ ;push edi
+ ;push ebx
+ ;mov esi, [esp+16]
+ ;mov eax, [esp+20]
+ ;mov edi, [esp+24]
+ ;mov edx, [esp+28]
+ ;mov ecx, [esp+32]
+ ;mov ebx, [esp+36]
+
+ %assign push_num 0
+ LOAD_6_PARA
+%ifndef X86_32
+ movsx r1, r1d
+ movsx r3, r3d
+ movsx r4, r4d
+ movsx r5, r5d
+%endif
+ pxor xmm7, xmm7
+    sub r0, r1      ; the later vertical pass needs 5 extra rows, so start 2 rows up.
+ sub r0, r1
+
+ cmp r4, 9
+ jne near .width_17
+
+.yloop_width_9:
+ movq xmm0, [r0]
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3]
+ punpcklbw xmm5, xmm7
+
+ movdqa xmm7, xmm2
+ paddw xmm7, xmm3
+ movdqa xmm6, xmm4
+ paddw xmm6, xmm5
+ psllw xmm6, 2
+ psubw xmm6, xmm7
+ paddw xmm0, xmm1
+ paddw xmm0, xmm6
+ psllw xmm6, 2
+ paddw xmm0, xmm6
+ movd [r2], xmm0
+
+ pxor xmm7, xmm7
+ movq xmm0, [r0+6]
+ punpcklbw xmm0, xmm7
+
+ paddw xmm4, xmm1
+ paddw xmm5, xmm3
+ psllw xmm5, 2
+ psubw xmm5, xmm4
+ paddw xmm2, xmm0
+ paddw xmm2, xmm5
+ psllw xmm5, 2
+ paddw xmm2, xmm5
+ movq [r2+2], xmm2
+ movhps [r2+2+8], xmm2
+
+ add r0, r1
+ add r2, r3
+ dec r5
+ jnz .yloop_width_9
+ LOAD_6_PARA_POP
+ ret
+
+
+.width_17:
+.yloop_width_17:
+ movq xmm0, [r0]
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3]
+ punpcklbw xmm5, xmm7
+
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ psllw xmm4, 2
+ psubw xmm4, xmm2
+ paddw xmm0, xmm1
+ paddw xmm0, xmm4
+ psllw xmm4, 2
+ paddw xmm0, xmm4
+ movdqa [r2], xmm0
+
+ movq xmm0, [r0+8]
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5+8]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1+8]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4+8]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2+8]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3+8]
+ punpcklbw xmm5, xmm7
+
+ movdqa xmm7, xmm2
+ paddw xmm7, xmm3
+ movdqa xmm6, xmm4
+ paddw xmm6, xmm5
+ psllw xmm6, 2
+ psubw xmm6, xmm7
+ paddw xmm0, xmm1
+ paddw xmm0, xmm6
+ psllw xmm6, 2
+ paddw xmm0, xmm6
+ movd [r2+16], xmm0
+
+
+ pxor xmm7, xmm7
+ movq xmm0, [r0+6+8]
+ punpcklbw xmm0, xmm7
+
+ paddw xmm4, xmm1
+ paddw xmm5, xmm3
+ psllw xmm5, 2
+ psubw xmm5, xmm4
+ paddw xmm2, xmm0
+ paddw xmm2, xmm5
+ psllw xmm5, 2
+ paddw xmm2, xmm5
+ movq [r2+18], xmm2
+ movhps [r2+18+8], xmm2
+
+ add r0, r1
+ add r2, r3
+ dec r5
+ jnz .yloop_width_17
+ LOAD_6_PARA_POP
+ ret
+
+
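+; FILTER_VER applies the vertical 6-tap pass to the 16-bit taps produced
+; by the HorFirst routines. To stay within 16-bit arithmetic it splits
+; the division into two intermediate arithmetic ">> 2" steps plus a
+; final (sum + 32) >> 6, which approximates the exact
+;   ( a + f - 5*(b + e) + 20*(c + d) + 512 ) >> 10
+; of the two-dimensional filter on the unrounded taps.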
+%macro FILTER_VER 9
+ paddw %1, %6
+ movdqa %7, %2
+ movdqa %8, %3
+
+
+ paddw %7, %5
+ paddw %8, %4
+
+ psubw %1, %7
+ psraw %1, 2
+ paddw %1, %8
+ psubw %1, %7
+ psraw %1, 2
+ paddw %8, %1
+ paddw %8, [h264_mc_hc_32]
+ psraw %8, 6
+ packuswb %8, %8
+ movq %9, %8
+%endmacro
+;***********************************************************************
+;void McHorVer22Width8VerLastAlign_sse2(
+; uint8_t *pTap,
+; int32_t iTapStride,
+; uint8_t * pDst,
+; int32_t iDstStride,
+; int32_t iWidth,
+; int32_t iHeight);
+;***********************************************************************
+
+McHorVer22Width8VerLastAlign_sse2:
+ ;push esi
+ ;push edi
+ ;push ebx
+ ;push ebp
+
+ ;mov esi, [esp+20]
+ ;mov eax, [esp+24]
+ ;mov edi, [esp+28]
+ ;mov edx, [esp+32]
+ ;mov ebx, [esp+36]
+ ;mov ecx, [esp+40]
+
+ %assign push_num 0
+ LOAD_6_PARA
+%ifndef X86_32
+ movsx r1, r1d
+ movsx r3, r3d
+ movsx r4, r4d
+ movsx r5, r5d
+%endif
+%ifndef X86_32
+ push r12
+ push r13
+ push r14
+ mov r12, r0
+ mov r13, r2
+ mov r14, r5
+%endif
+
+ shr r4, 3
+
+.width_loop:
+ movdqa xmm0, [r0]
+ movdqa xmm1, [r0+r1]
+ lea r0, [r0+2*r1]
+ movdqa xmm2, [r0]
+ movdqa xmm3, [r0+r1]
+ lea r0, [r0+2*r1]
+ movdqa xmm4, [r0]
+ movdqa xmm5, [r0+r1]
+
+ FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+ dec r5
+ lea r0, [r0+2*r1]
+ movdqa xmm6, [r0]
+
+ movdqa xmm0, xmm1
+ movdqa xmm1, xmm2
+ movdqa xmm2, xmm3
+ movdqa xmm3, xmm4
+ movdqa xmm4, xmm5
+ movdqa xmm5, xmm6
+
+ add r2, r3
+ sub r0, r1
+
+.start:
+ FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r0, [r0+2*r1]
+ movdqa xmm6, [r0]
+ FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r2, [r2+2*r3]
+ movdqa xmm7, [r0+r1]
+ FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r0, [r0+2*r1]
+ movdqa xmm0, [r0]
+ FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r2, [r2+2*r3]
+ movdqa xmm1, [r0+r1]
+ FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r0, [r0+2*r1]
+ movdqa xmm2, [r0]
+ FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r2, [r2+2*r3]
+ movdqa xmm3, [r0+r1]
+ FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r0, [r0+2*r1]
+ movdqa xmm4, [r0]
+ FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r2, [r2+2*r3]
+ movdqa xmm5, [r0+r1]
+ jmp near .start
+
+.x_loop_dec:
+ dec r4
+ jz near .exit
+ ;mov esi, [esp+20]
+ ;mov edi, [esp+28]
+ ;mov ecx, [esp+40]
+%ifdef X86_32
+ mov r0, arg1
+ mov r2, arg3
+ mov r5, arg6
+%else
+ mov r0, r12
+ mov r2, r13
+ mov r5, r14
+%endif
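+ ; step to the next 8-column strip: 16 bytes in the 16-bit tap buffer,
+ ; 8 bytes in the 8-bit destination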
+ add r0, 16
+ add r2, 8
+ jmp .width_loop
+
+.exit:
+%ifndef X86_32
+ pop r14
+ pop r13
+ pop r12
+%endif
+ LOAD_6_PARA_POP
+ ret
+
+;***********************************************************************
+;void McHorVer22Width8VerLastUnAlign_sse2(
+; uint8_t *pTap,
+; int32_t iTapStride,
+; uint8_t * pDst,
+; int32_t iDstStride,
+; int32_t iWidth,
+; int32_t iHeight);
+;***********************************************************************
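+; Apparently identical to the aligned variant above, except that rows of
+; the tap buffer are fetched with unaligned movdqu loads.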
+
+ McHorVer22Width8VerLastUnAlign_sse2:
+ ;push esi
+ ;push edi
+ ;push ebx
+ ;push ebp
+
+ ;mov esi, [esp+20]
+ ;mov eax, [esp+24]
+ ;mov edi, [esp+28]
+ ;mov edx, [esp+32]
+ ;mov ebx, [esp+36]
+ ;mov ecx, [esp+40]
+
+ %assign push_num 0
+ LOAD_6_PARA
+%ifndef X86_32
+ movsx r1, r1d
+ movsx r3, r3d
+ movsx r4, r4d
+ movsx r5, r5d
+%endif
+%ifndef X86_32
+ push r12
+ push r13
+ push r14
+ mov r12, r0
+ mov r13, r2
+ mov r14, r5
+%endif
+ shr r4, 3
+
+.width_loop:
+ movdqu xmm0, [r0]
+ movdqu xmm1, [r0+r1]
+ lea r0, [r0+2*r1]
+ movdqu xmm2, [r0]
+ movdqu xmm3, [r0+r1]
+ lea r0, [r0+2*r1]
+ movdqu xmm4, [r0]
+ movdqu xmm5, [r0+r1]
+
+ FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+ dec r5
+ lea r0, [r0+2*r1]
+ movdqu xmm6, [r0]
+
+ movdqa xmm0, xmm1
+ movdqa xmm1, xmm2
+ movdqa xmm2, xmm3
+ movdqa xmm3, xmm4
+ movdqa xmm4, xmm5
+ movdqa xmm5, xmm6
+
+ add r2, r3
+ sub r0, r1
+
+.start:
+ FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r0, [r0+2*r1]
+ movdqu xmm6, [r0]
+ FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r2, [r2+2*r3]
+ movdqu xmm7, [r0+r1]
+ FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r0, [r0+2*r1]
+ movdqu xmm0, [r0]
+ FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r2, [r2+2*r3]
+ movdqu xmm1, [r0+r1]
+ FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r0, [r0+2*r1]
+ movdqu xmm2, [r0]
+ FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r2, [r2+2*r3]
+ movdqu xmm3, [r0+r1]
+ FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r0, [r0+2*r1]
+ movdqu xmm4, [r0]
+ FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r2, [r2+2*r3]
+ movdqu xmm5, [r0+r1]
+ jmp near .start
+
+.x_loop_dec:
+ dec r4
+ jz near .exit
+ ;mov esi, [esp+20]
+ ;mov edi, [esp+28]
+ ;mov ecx, [esp+40]
+%ifdef X86_32
+ mov r0, arg1
+ mov r2, arg3
+ mov r5, arg6
+%else
+ mov r0, r12
+ mov r2, r13
+ mov r5, r14
+%endif
+ add r0, 16
+ add r2, 8
+ jmp .width_loop
+
+.exit:
+%ifndef X86_32
+ pop r14
+ pop r13
+ pop r12
+%endif
+ LOAD_6_PARA_POP
ret
\ No newline at end of file
--- a/codec/common/vaa.asm
+++ b/codec/common/vaa.asm
@@ -160,7 +160,7 @@
AnalysisVaaInfoIntra_sse2:
%assign push_num 0
- LOAD_2_PARA
+ LOAD_2_PARA
SIGN_EXTENTION r1,r1d
%ifdef X86_32
@@ -175,16 +175,16 @@
and r5,0fh
sub r7,r5
sub r7,32
-
-
- mov r2,r1
+
+
+ mov r2,r1
sal r2,$1 ;r2 = 2*iLineSize
mov r3,r2
add r3,r1 ;r3 = 3*iLineSize
-
+
mov r4,r2
sal r4,$1 ;r4 = 4*iLineSize
-
+
pxor xmm7, xmm7
; loops
@@ -225,8 +225,8 @@
pshufd xmm2, xmm1, 0B1h
paddd xmm1, xmm2
-
-
+
+
movd r2d, xmm0
and r2, 0ffffh ; effective low word truncated
mov r3, r2
@@ -234,7 +234,7 @@
sar r2, $4
movd retrd, xmm1
sub retrd, r2d
-
+
add r7,32
add r7,r5
@@ -244,7 +244,7 @@
pop r4
pop r3
%endif
-
+
ret
WELS_EXTERN AnalysisVaaInfoIntra_ssse3
@@ -255,7 +255,7 @@
AnalysisVaaInfoIntra_ssse3:
%assign push_num 0
- LOAD_2_PARA
+ LOAD_2_PARA
SIGN_EXTENTION r1,r1d
%ifdef X86_32
@@ -265,41 +265,41 @@
push r6
%assign push_num push_num+4
%endif
-
+
mov r5,r7
and r5,0fh
sub r7,r5
sub r7,32
-
- mov r2,r1
+
+ mov r2,r1
sal r2,$1 ;r2 = 2*iLineSize
mov r3,r2
add r3,r1 ;r3 = 3*iLineSize
-
+
mov r4,r2
sal r4,$1 ;r4 = 4*iLineSize
-
+
pxor xmm7, xmm7
; loops
VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
movq [r7],xmm0
-
+
lea r0,[r0+r4]
VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
movq [r7+8],xmm1
-
-
+
+
lea r0,[r0+r4]
VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
movq [r7+16],xmm0
-
+
lea r0,[r0+r4]
VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
movq [r7+24],xmm1
-
-
+
+
movdqa xmm0,[r7]
movdqa xmm1,[r7+16]
movdqa xmm2, xmm0
@@ -322,7 +322,7 @@
pshufd xmm2, xmm1, 0B1h
paddd xmm1, xmm2
-
+
movd r2d, xmm0
and r2, 0ffffh ; effective low word truncated
mov r3, r2
@@ -339,7 +339,7 @@
pop r4
pop r3
%endif
-
+
ret
WELS_EXTERN MdInterAnalysisVaaInfo_sse41
@@ -368,7 +368,7 @@
paddd xmm3, xmm4
movd r0d, xmm3
cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD
-
+
jb near .threshold_exit
pshufd xmm0, xmm0, 01Bh
pcmpgtd xmm0, xmm1 ; iSadBlock > iAverageSad
@@ -412,7 +412,7 @@
paddd xmm4, xmm5
pshufd xmm5, xmm4, 0B1h
paddd xmm5, xmm4
-
+
movd r0d, xmm5
cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD
jb near .threshold_exit
--- a/codec/decoder/core/asm/intra_pred.asm
+++ b/codec/decoder/core/asm/intra_pred.asm
@@ -477,7 +477,7 @@
SSE2_Copy8Times xmm4, r2d ; mm4 = c,c,c,c,c,c,c,c
;mov esi, [esp + pushsize + 4]
- mov r0, r4
+ mov r0, r4
add r3, 16
imul r2, -3
add r3, r2 ; s = a + 16 + (-3)*c
--- a/codec/encoder/core/asm/dct.asm
+++ b/codec/encoder/core/asm/dct.asm
@@ -186,7 +186,7 @@
movsx r1, r1d
movsx r3, r3d
%endif
-; mov eax, [pDct ]
+; mov eax, [pDct ]
movq mm0, [r4+ 0]
movq mm1, [r4+ 8]
movq mm2, [r4+16]
--- a/codec/encoder/core/asm/memzero.asm
+++ b/codec/encoder/core/asm/memzero.asm
@@ -32,8 +32,8 @@
;* memzero.asm
;*
;* Abstract
-;*
;*
+;*
;* History
;* 9/16/2009 Created
;*
@@ -45,8 +45,8 @@
; Code
;***********************************************************************
-SECTION .text
-
+SECTION .text
+
ALIGN 16
;***********************************************************************
;_inline void __cdecl WelsPrefetchZero_mmx(int8_t const*_A);
@@ -57,7 +57,7 @@
LOAD_1_PARA
;mov eax,[esp+4]
prefetchnta [r0]
- ret
+ ret
ALIGN 16
@@ -71,7 +71,7 @@
LOAD_2_PARA
SIGN_EXTENTION r1, r1d
neg r1
-
+
pxor xmm0, xmm0
.memzeroa64_sse2_loops:
movdqa [r0], xmm0
@@ -79,12 +79,12 @@
movdqa [r0+32], xmm0
movdqa [r0+48], xmm0
add r0, 0x40
-
+
add r1, 0x40
jnz near .memzeroa64_sse2_loops
-
- ret
+ ret
+
ALIGN 16
;***********************************************************************
; void WelsSetMemZeroSize64_mmx(void *dst, int32_t size)
@@ -96,7 +96,7 @@
LOAD_2_PARA
SIGN_EXTENTION r1, r1d
neg r1
-
+
pxor mm0, mm0
.memzero64_mmx_loops:
movq [r0], mm0
@@ -106,16 +106,16 @@
movq [r0+32], mm0
movq [r0+40], mm0
movq [r0+48], mm0
- movq [r0+56], mm0
+ movq [r0+56], mm0
add r0, 0x40
-
+
add r1, 0x40
jnz near .memzero64_mmx_loops
-
- WELSEMMS
- ret
-
-ALIGN 16
+
+ WELSEMMS
+ ret
+
+ALIGN 16
;***********************************************************************
; void WelsSetMemZeroSize8_mmx(void *dst, int32_t size)
;***********************************************************************
@@ -125,17 +125,17 @@
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENTION r1, r1d
- neg r1
+ neg r1
pxor mm0, mm0
-
+
.memzero8_mmx_loops:
movq [r0], mm0
add r0, 0x08
-
+
add r1, 0x08
jnz near .memzero8_mmx_loops
-
- WELSEMMS
- ret
-
+ WELSEMMS
+ ret
+
+
--- a/codec/encoder/core/asm/satd_sad.asm
+++ b/codec/encoder/core/asm/satd_sad.asm
@@ -1,2344 +1,2344 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* satd_sad.asm
-;*
-;* Abstract
-;* WelsSampleSatd4x4_sse2
-;* WelsSampleSatd8x8_sse2
-;* WelsSampleSatd16x8_sse2
-;* WelsSampleSatd8x16_sse2
-;* WelsSampleSatd16x16_sse2
-;*
-;* WelsSampleSad16x8_sse2
-;* WelsSampleSad16x16_sse2
-;*
-;* History
-;* 8/5/2009 Created
-;* 24/9/2009 modified
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-;***********************************************************************
-; Data
-;***********************************************************************
-SECTION .rodata align=16
-
-align 16
-HSumSubDB1: db 1,1,1,1,1,1,1,1,1,-1,1,-1,1,-1,1,-1
-align 16
-HSumSubDW1: dw 1,-1,1,-1,1,-1,1,-1
-align 16
-PDW1: dw 1,1,1,1,1,1,1,1
-align 16
-PDQ2: dw 2,0,0,0,2,0,0,0
-align 16
-HSwapSumSubDB1: times 2 db 1, 1, 1, 1, 1, -1, 1, -1
-
-;***********************************************************************
-; Code
-;***********************************************************************
-SECTION .text
-
-;***********************************************************************
-;
-;Pixel_satd_wxh_sse2 BEGIN
-;
-;***********************************************************************
-%macro MMX_DW_1_2REG 2
- pxor %1, %1
- pcmpeqw %2, %2
- psubw %1, %2
-%endmacro
-
-%macro SSE2_SumWHorizon1 2
- movdqa %2, %1
- psrldq %2, 8
- paddusw %1, %2
- movdqa %2, %1
- psrldq %2, 4
- paddusw %1, %2
- movdqa %2, %1
- psrldq %2, 2
- paddusw %1, %2
-%endmacro
-
-%macro SSE2_HDMTwo4x4 5 ;in: xmm1,xmm2,xmm3,xmm4 pOut: xmm4,xmm2,xmm1,xmm3
- SSE2_SumSub %1, %2, %5
- SSE2_SumSub %3, %4, %5
- SSE2_SumSub %2, %4, %5
- SSE2_SumSub %1, %3, %5
-%endmacro
-
-%macro SSE2_SumAbs4 7
- WELS_AbsW %1, %3
- WELS_AbsW %2, %3
- WELS_AbsW %4, %6
- WELS_AbsW %5, %6
- paddusw %1, %2
- paddusw %4, %5
- paddusw %7, %1
- paddusw %7, %4
-%endmacro
-
-%macro SSE2_SumWHorizon 3
- movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4
- paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04
- punpcklwd %1, %3 ; x1 = d37 d26 d15 d04
- movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
- paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246
- pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357
- paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567
-%endmacro
-
-%macro SSE2_GetSatd8x8 0
- SSE2_LoadDiff8P xmm0,xmm4,xmm7,[r0],[r2]
- SSE2_LoadDiff8P xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_LoadDiff8P xmm2,xmm4,xmm7,[r0],[r2]
- SSE2_LoadDiff8P xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
-
- SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4
- SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4
- SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
- SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_LoadDiff8P xmm0,xmm4,xmm7,[r0],[r2]
- SSE2_LoadDiff8P xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_LoadDiff8P xmm2,xmm4,xmm7,[r0],[r2]
- SSE2_LoadDiff8P xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
-
- SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4
- SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4
- SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
- SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
-%endmacro
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd4x4_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd4x4_sse2
-align 16
-WelsSampleSatd4x4_sse2:
- ;push ebx
- ;mov eax, [esp+8]
- ;mov ebx, [esp+12]
- ;mov ecx, [esp+16]
- ;mov edx, [esp+20]
-
- %assign push_num 0
- LOAD_4_PARA
- SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
- movd xmm0, [r0]
- movd xmm1, [r0+r1]
- lea r0 , [r0+2*r1]
- movd xmm2, [r0]
- movd xmm3, [r0+r1]
- punpckldq xmm0, xmm2
- punpckldq xmm1, xmm3
-
- movd xmm4, [r2]
- movd xmm5, [r2+r3]
- lea r2 , [r2+2*r3]
- movd xmm6, [r2]
- movd xmm7, [r2+r3]
- punpckldq xmm4, xmm6
- punpckldq xmm5, xmm7
-
- pxor xmm6, xmm6
- punpcklbw xmm0, xmm6
- punpcklbw xmm1, xmm6
- punpcklbw xmm4, xmm6
- punpcklbw xmm5, xmm6
-
- psubw xmm0, xmm4
- psubw xmm1, xmm5
-
- movdqa xmm2, xmm0
- paddw xmm0, xmm1
- psubw xmm2, xmm1
- SSE2_XSawp qdq, xmm0, xmm2, xmm3
-
- movdqa xmm4, xmm0
- paddw xmm0, xmm3
- psubw xmm4, xmm3
-
- movdqa xmm2, xmm0
- punpcklwd xmm0, xmm4
- punpckhwd xmm4, xmm2
-
- SSE2_XSawp dq, xmm0, xmm4, xmm3
- SSE2_XSawp qdq, xmm0, xmm3, xmm5
-
- movdqa xmm7, xmm0
- paddw xmm0, xmm5
- psubw xmm7, xmm5
-
- SSE2_XSawp qdq, xmm0, xmm7, xmm1
-
- movdqa xmm2, xmm0
- paddw xmm0, xmm1
- psubw xmm2, xmm1
-
- WELS_AbsW xmm0, xmm3
- paddusw xmm6, xmm0
- WELS_AbsW xmm2, xmm4
- paddusw xmm6, xmm2
- SSE2_SumWHorizon1 xmm6, xmm4
- movd retrd, xmm6
- and retrd, 0xffff
- shr retrd, 1
- LOAD_4_PARA_POP
- ret
-
- ;***********************************************************************
- ;
- ;int32_t WelsSampleSatd8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
- ;
- ;***********************************************************************
- WELS_EXTERN WelsSampleSatd8x8_sse2
-align 16
- WelsSampleSatd8x8_sse2:
- ;push ebx
- ;mov eax, [esp+8]
- ;mov ebx, [esp+12]
- ;mov ecx, [esp+16]
- ;mov edx, [esp+20]
-
- %assign push_num 0
- LOAD_4_PARA
- SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
- pxor xmm6, xmm6
- pxor xmm7, xmm7
- SSE2_GetSatd8x8
- psrlw xmm6, 1
- SSE2_SumWHorizon xmm6,xmm4,xmm7
- movd retrd, xmm6
- LOAD_4_PARA_POP
- ret
-
- ;***********************************************************************
- ;
- ;int32_t WelsSampleSatd8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
- ;
- ;***********************************************************************
- WELS_EXTERN WelsSampleSatd8x16_sse2
-align 16
- WelsSampleSatd8x16_sse2:
- ;push ebx
- ;mov eax, [esp+8]
- ;mov ebx, [esp+12]
- ;mov ecx, [esp+16]
- ;mov edx, [esp+20]
-
- %assign push_num 0
- LOAD_4_PARA
- SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
- pxor xmm6, xmm6
- pxor xmm7, xmm7
-
- SSE2_GetSatd8x8
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_GetSatd8x8
-
- psrlw xmm6, 1
- SSE2_SumWHorizon xmm6,xmm4,xmm7
- movd retrd, xmm6
- LOAD_4_PARA_POP
- ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd16x8_sse2
-align 16
-WelsSampleSatd16x8_sse2:
- ;push ebx
- ;mov eax, [esp+8]
- ;mov ebx, [esp+12]
- ;mov ecx, [esp+16]
- ;mov edx, [esp+20]
-
- %assign push_num 0
- LOAD_4_PARA
- SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
- push r0
- push r2
- pxor xmm6, xmm6
- pxor xmm7, xmm7
-
- SSE2_GetSatd8x8
-
- pop r2
- pop r0
- ;mov eax, [esp+8]
- ;mov ecx, [esp+16]
- add r0, 8
- add r2, 8
- SSE2_GetSatd8x8
-
- psrlw xmm6, 1
- SSE2_SumWHorizon xmm6,xmm4,xmm7
- movd retrd, xmm6
- LOAD_4_PARA_POP
- ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd16x16_sse2
-align 16
-WelsSampleSatd16x16_sse2:
- ;push ebx
- ;mov eax, [esp+8]
- ;mov ebx, [esp+12]
- ;mov ecx, [esp+16]
- ;mov edx, [esp+20]
-
- %assign push_num 0
- LOAD_4_PARA
- SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
- push r0
- push r2
- pxor xmm6, xmm6
- pxor xmm7, xmm7
-
- SSE2_GetSatd8x8
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_GetSatd8x8
-
- pop r2
- pop r0
- ;mov eax, [esp+8]
- ;mov ecx, [esp+16]
- add r0, 8
- add r2, 8
-
- SSE2_GetSatd8x8
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_GetSatd8x8
-
- ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
- psrlw xmm6, 1
- SSE2_SumWHorizon xmm6,xmm4,xmm7
- movd retrd, xmm6
- LOAD_4_PARA_POP
- ret
-
-;***********************************************************************
-;
-;Pixel_satd_wxh_sse2 END
-;
-;***********************************************************************
-
-;***********************************************************************
-;
-;Pixel_satd_intra_sse2 BEGIN
-;
-;***********************************************************************
-
-%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3
- pmaddubsw %1, xmm5
- movdqa %2, %1
- pmaddwd %1, xmm7
- pmaddwd %2, xmm6
- movdqa %3, %1
- punpckldq %1, %2
- punpckhdq %2, %3
- movdqa %3, %1
- punpcklqdq %1, %2
- punpckhqdq %3, %2
- paddd xmm4, %1 ;for dc
- paddd xmm4, %3 ;for dc
- packssdw %1, %3
- psllw %1, 2
-%endmacro
-%macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2
- pmaddubsw %1, xmm5
- movdqa %2, %1
- pmaddwd %1, xmm7
- pmaddwd %2, xmm6
- movdqa %3, %1
- punpckldq %1, %2
- punpckhdq %2, %3
- movdqa %3, %1
- punpcklqdq %1, %2
- punpckhqdq %3, %2
-; paddd xmm4, %1 ;for dc
-; paddd xmm4, %3 ;for dc
- movdqa %4, %1
- punpcklqdq %4, %3
- packssdw %1, %3
- psllw %1, 2
-%endmacro
-
-%macro SSE41_GetX38x4SatdDec 0
- pxor xmm7, xmm7
- movq xmm0, [eax]
- movq xmm1, [eax+ebx]
- lea eax, [eax+2*ebx]
- movq xmm2, [eax]
- movq xmm3, [eax+ebx]
- lea eax, [eax+2*ebx]
- punpcklbw xmm0, xmm7
- punpcklbw xmm1, xmm7
- punpcklbw xmm2, xmm7
- punpcklbw xmm3, xmm7
- SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm7
- SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm7
- SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2
- ;doesn't need another transpose
-%endmacro
-%macro SSE41_GetX38x4SatdV 2
- pxor xmm0, xmm0
- pinsrw xmm0, word[esi+%2], 0
- pinsrw xmm0, word[esi+%2+8], 4
- psubsw xmm0, xmm7
- pabsw xmm0, xmm0
- paddw xmm4, xmm0
- pxor xmm0, xmm0
- pinsrw xmm0, word[esi+%2+2], 0
- pinsrw xmm0, word[esi+%2+10], 4
- psubsw xmm0, xmm1
- pabsw xmm0, xmm0
- paddw xmm4, xmm0
- pxor xmm0, xmm0
- pinsrw xmm0, word[esi+%2+4], 0
- pinsrw xmm0, word[esi+%2+12], 4
- psubsw xmm0, xmm3
- pabsw xmm0, xmm0
- paddw xmm4, xmm0
- pxor xmm0, xmm0
- pinsrw xmm0, word[esi+%2+6], 0
- pinsrw xmm0, word[esi+%2+14], 4
- psubsw xmm0, xmm2
- pabsw xmm0, xmm0
- paddw xmm4, xmm0
-%endmacro
-%macro SSE41_GetX38x4SatdH 3
- movq xmm0, [esi+%3+8*%1]
- punpcklqdq xmm0, xmm0
- psubsw xmm0, xmm7
- pabsw xmm0, xmm0
- paddw xmm5, xmm0
- pabsw xmm1, xmm1
- pabsw xmm2, xmm2
- pabsw xmm3, xmm3
- paddw xmm2, xmm1;for DC
- paddw xmm2, xmm3;for DC
- paddw xmm5, xmm2
-%endmacro
-%macro SSE41_I16X16GetX38x4SatdDC 0
- pxor xmm0, xmm0
- movq2dq xmm0, mm4
- punpcklqdq xmm0, xmm0
- psubsw xmm0, xmm7
- pabsw xmm0, xmm0
- paddw xmm6, xmm0
- paddw xmm6, xmm2
-%endmacro
-%macro SSE41_ChromaGetX38x4SatdDC 1
- shl %1, 4
- movdqa xmm0, [esi+32+%1]
- psubsw xmm0, xmm7
- pabsw xmm0, xmm0
- paddw xmm6, xmm0
- paddw xmm6, xmm2
-%endmacro
-%macro SSE41_I16x16GetX38x4Satd 2
- SSE41_GetX38x4SatdDec
- SSE41_GetX38x4SatdV %1, %2
- SSE41_GetX38x4SatdH %1, %2, 32
- SSE41_I16X16GetX38x4SatdDC
-%endmacro
-%macro SSE41_ChromaGetX38x4Satd 2
- SSE41_GetX38x4SatdDec
- SSE41_GetX38x4SatdV %1, %2
- SSE41_GetX38x4SatdH %1, %2, 16
- SSE41_ChromaGetX38x4SatdDC %1
-%endmacro
-%macro SSE41_HSum8W 3
- pmaddwd %1, %2
- movhlps %3, %1
- paddd %1, %3
- pshuflw %3, %1,0Eh
- paddd %1, %3
-%endmacro
-
-
-%ifdef X86_32
-WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
-WelsIntra16x16Combined3Satd_sse41:
- push ebx
- push esi
- push edi
- mov ecx, [esp+16]
- mov edx, [esp+20]
- mov eax, [esp+24]
- mov ebx, [esp+28]
- mov esi, [esp+40] ;temp_satd
- pxor xmm4, xmm4
- movdqa xmm5, [HSumSubDB1]
- movdqa xmm6, [HSumSubDW1]
- movdqa xmm7, [PDW1]
- sub ecx, edx
- movdqu xmm0, [ecx]
- movhlps xmm1, xmm0
- punpcklqdq xmm0, xmm0
- punpcklqdq xmm1, xmm1
- SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
- SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
- movdqa [esi], xmm0 ;V
- movdqa [esi+16], xmm1
- add ecx, edx
- pinsrb xmm0, byte[ecx-1], 0
- pinsrb xmm0, byte[ecx+edx-1], 1
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 2
- pinsrb xmm0, byte[ecx+edx-1], 3
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 4
- pinsrb xmm0, byte[ecx+edx-1], 5
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 6
- pinsrb xmm0, byte[ecx+edx-1], 7
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 8
- pinsrb xmm0, byte[ecx+edx-1], 9
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 10
- pinsrb xmm0, byte[ecx+edx-1], 11
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 12
- pinsrb xmm0, byte[ecx+edx-1], 13
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 14
- pinsrb xmm0, byte[ecx+edx-1], 15
- movhlps xmm1, xmm0
- punpcklqdq xmm0, xmm0
- punpcklqdq xmm1, xmm1
- SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
- SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
- movdqa [esi+32], xmm0 ;H
- movdqa [esi+48], xmm1
- movd ecx, xmm4 ;dc
- add ecx, 16 ;(sum+16)
- shr ecx, 5 ;((sum+16)>>5)
- shl ecx, 4 ;
- movd mm4, ecx ; mm4 copy DC
- pxor xmm4, xmm4 ;V
- pxor xmm5, xmm5 ;H
- pxor xmm6, xmm6 ;DC
- mov ecx, 0
- mov edi, 0
-.loop16x16_get_satd:
-.loopStart1:
- SSE41_I16x16GetX38x4Satd ecx, edi
- inc ecx
- cmp ecx, 4
- jl .loopStart1
- cmp edi, 16
- je .loop16x16_get_satd_end
- mov eax, [esp+24]
- add eax, 8
- mov ecx, 0
- add edi, 16
- jmp .loop16x16_get_satd
- .loop16x16_get_satd_end:
- MMX_DW_1_2REG xmm0, xmm1
- psrlw xmm4, 1 ;/2
- psrlw xmm5, 1 ;/2
- psrlw xmm6, 1 ;/2
- SSE41_HSum8W xmm4, xmm0, xmm1
- SSE41_HSum8W xmm5, xmm0, xmm1
- SSE41_HSum8W xmm6, xmm0, xmm1
-
- ; comparing order: DC H V
- movd ebx, xmm6 ;DC
- movd edi, xmm5 ;H
- movd ecx, xmm4 ;V
- mov edx, [esp+36]
- shl edx, 1
- add edi, edx
- add ebx, edx
- mov edx, [esp+32]
- cmp ebx, edi
- jge near not_dc_16x16
- cmp ebx, ecx
- jge near not_dc_h_16x16
-
- ; for DC mode
- mov dword[edx], 2;I16_PRED_DC
- mov eax, ebx
- jmp near return_satd_intra_16x16_x3
-not_dc_16x16:
- ; for H mode
- cmp edi, ecx
- jge near not_dc_h_16x16
- mov dword[edx], 1;I16_PRED_H
- mov eax, edi
- jmp near return_satd_intra_16x16_x3
-not_dc_h_16x16:
- ; for V mode
- mov dword[edx], 0;I16_PRED_V
- mov eax, ecx
-return_satd_intra_16x16_x3:
- WELSEMMS
- pop edi
- pop esi
- pop ebx
-ret
-
-%macro SSE41_ChromaGetX38x8Satd 0
- movdqa xmm5, [HSumSubDB1]
- movdqa xmm6, [HSumSubDW1]
- movdqa xmm7, [PDW1]
- sub ecx, edx
- movq xmm0, [ecx]
- punpcklqdq xmm0, xmm0
- SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
- movdqa [esi], xmm0 ;V
- add ecx, edx
- pinsrb xmm0, byte[ecx-1], 0
- pinsrb xmm0, byte[ecx+edx-1], 1
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 2
- pinsrb xmm0, byte[ecx+edx-1], 3
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 4
- pinsrb xmm0, byte[ecx+edx-1], 5
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 6
- pinsrb xmm0, byte[ecx+edx-1], 7
- punpcklqdq xmm0, xmm0
- SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
- movdqa [esi+16], xmm0 ;H
-;(sum+2)>>2
- movdqa xmm6, [PDQ2]
- movdqa xmm5, xmm4
- punpckhqdq xmm5, xmm1
- paddd xmm5, xmm6
- psrld xmm5, 2
-;(sum1+sum2+4)>>3
- paddd xmm6, xmm6
- paddd xmm4, xmm1
- paddd xmm4, xmm6
- psrld xmm4, 3
-;satd *16
- pslld xmm5, 4
- pslld xmm4, 4
-;temp satd
- movdqa xmm6, xmm4
- punpcklqdq xmm4, xmm5
- psllq xmm4, 32
- psrlq xmm4, 32
- movdqa [esi+32], xmm4
- punpckhqdq xmm5, xmm6
- psllq xmm5, 32
- psrlq xmm5, 32
- movdqa [esi+48], xmm5
-
- pxor xmm4, xmm4 ;V
- pxor xmm5, xmm5 ;H
- pxor xmm6, xmm6 ;DC
- mov ecx, 0
-loop_chroma_satdx3_cb_cr:
- SSE41_ChromaGetX38x4Satd ecx, 0
- inc ecx
- cmp ecx, 2
- jl loop_chroma_satdx3_cb_cr
-%endmacro
-
-%macro SSEReg2MMX 3
- movdq2q %2, %1
- movhlps %1, %1
- movdq2q %3, %1
-%endmacro
-%macro MMXReg2SSE 4
- movq2dq %1, %3
- movq2dq %2, %4
- punpcklqdq %1, %2
-%endmacro
-;for reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41
-
-WELS_EXTERN WelsIntraChroma8x8Combined3Satd_sse41
-WelsIntraChroma8x8Combined3Satd_sse41:
- push ebx
- push esi
- push edi
- mov ecx, [esp+16]
- mov edx, [esp+20]
- mov eax, [esp+24]
- mov ebx, [esp+28]
- mov esi, [esp+40] ;temp_satd
- xor edi, edi
-loop_chroma_satdx3:
- SSE41_ChromaGetX38x8Satd
- cmp edi, 1
- je loop_chroma_satdx3end
- inc edi
- SSEReg2MMX xmm4, mm0,mm1
- SSEReg2MMX xmm5, mm2,mm3
- SSEReg2MMX xmm6, mm5,mm6
- mov ecx, [esp+44]
- mov eax, [esp+48]
- jmp loop_chroma_satdx3
-loop_chroma_satdx3end:
- MMXReg2SSE xmm0, xmm3, mm0, mm1
- MMXReg2SSE xmm1, xmm3, mm2, mm3
- MMXReg2SSE xmm2, xmm3, mm5, mm6
-
- paddw xmm4, xmm0
- paddw xmm5, xmm1
- paddw xmm6, xmm2
-
- MMX_DW_1_2REG xmm0, xmm1
- psrlw xmm4, 1 ;/2
- psrlw xmm5, 1 ;/2
- psrlw xmm6, 1 ;/2
- SSE41_HSum8W xmm4, xmm0, xmm1
- SSE41_HSum8W xmm5, xmm0, xmm1
- SSE41_HSum8W xmm6, xmm0, xmm1
- ; comparing order: DC H V
- movd ebx, xmm6 ;DC
- movd edi, xmm5 ;H
- movd ecx, xmm4 ;V
- mov edx, [esp+36]
- shl edx, 1
- add edi, edx
- add ecx, edx
- mov edx, [esp+32]
- cmp ebx, edi
- jge near not_dc_8x8
- cmp ebx, ecx
- jge near not_dc_h_8x8
-
- ; for DC mode
- mov dword[edx], 0;I8_PRED_DC
- mov eax, ebx
- jmp near return_satd_intra_8x8_x3
-not_dc_8x8:
- ; for H mode
- cmp edi, ecx
- jge near not_dc_h_8x8
- mov dword[edx], 1;I8_PRED_H
- mov eax, edi
- jmp near return_satd_intra_8x8_x3
-not_dc_h_8x8:
- ; for V mode
- mov dword[edx], 2;I8_PRED_V
- mov eax, ecx
-return_satd_intra_8x8_x3:
- WELSEMMS
- pop edi
- pop esi
- pop ebx
-ret
-
-
-;***********************************************************************
-;
-;Pixel_satd_intra_sse2 END
-;
-;***********************************************************************
-%macro SSSE3_Get16BSadHVDC 2
- movd xmm6,%1
- pshufb xmm6,xmm1
- movdqa %1, xmm6
- movdqa xmm0,%2
- psadbw xmm0,xmm7
- paddw xmm4,xmm0
- movdqa xmm0,%2
- psadbw xmm0,xmm5
- paddw xmm2,xmm0
- psadbw xmm6,%2
- paddw xmm3,xmm6
-%endmacro
-%macro WelsAddDCValue 4
- movzx %2, byte %1
- mov %3, %2
- add %4, %2
-%endmacro
-
-;***********************************************************************
-;
-;Pixel_sad_intra_ssse3 BEGIN
-;
-;***********************************************************************
-WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
-WelsIntra16x16Combined3Sad_ssse3:
- push ebx
- push esi
- push edi
- mov ecx, [esp+16]
- mov edx, [esp+20]
- mov edi, [esp+40] ;temp_sad
- sub ecx, edx
- movdqa xmm5,[ecx]
- pxor xmm0,xmm0
- psadbw xmm0,xmm5
- movhlps xmm1,xmm0
- paddw xmm0,xmm1
- movd eax,xmm0
-
- add ecx,edx
- lea ebx, [edx+2*edx]
- WelsAddDCValue [ecx-1 ], esi, [edi ], eax
- WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
- WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
- WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
- lea ecx, [ecx+4*edx]
- add edi, 64
- WelsAddDCValue [ecx-1 ], esi, [edi ], eax
- WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
- WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
- WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
- lea ecx, [ecx+4*edx]
- add edi, 64
- WelsAddDCValue [ecx-1 ], esi, [edi ], eax
- WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
- WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
- WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
- lea ecx, [ecx+4*edx]
- add edi, 64
- WelsAddDCValue [ecx-1 ], esi, [edi ], eax
- WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
- WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
- WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
- sub edi, 192
- add eax,10h
- shr eax,5
- movd xmm7,eax
- pxor xmm1,xmm1
- pshufb xmm7,xmm1
- pxor xmm4,xmm4
- pxor xmm3,xmm3
- pxor xmm2,xmm2
-;sad begin
- mov eax, [esp+24]
- mov ebx, [esp+28]
- lea esi, [ebx+2*ebx]
- SSSE3_Get16BSadHVDC [edi], [eax]
- SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
- SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
- SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
- add edi, 64
- lea eax, [eax+4*ebx]
- SSSE3_Get16BSadHVDC [edi], [eax]
- SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
- SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
- SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
- add edi, 64
- lea eax, [eax+4*ebx]
- SSSE3_Get16BSadHVDC [edi], [eax]
- SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
- SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
- SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
- add edi, 64
- lea eax, [eax+4*ebx]
- SSSE3_Get16BSadHVDC [edi], [eax]
- SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
- SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
- SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
-
- pslldq xmm3,4
- por xmm3,xmm2
- movhlps xmm1,xmm3
- paddw xmm3,xmm1
- movhlps xmm0,xmm4
- paddw xmm4,xmm0
-; comparing order: DC H V
- movd ebx, xmm4 ;DC
- movd ecx, xmm3 ;V
- psrldq xmm3, 4
- movd esi, xmm3 ;H
- mov eax, [esp+36] ;lamda
- shl eax, 1
- add esi, eax
- add ebx, eax
- mov edx, [esp+32]
- cmp ebx, esi
- jge near not_dc_16x16_sad
- cmp ebx, ecx
- jge near not_dc_h_16x16_sad
- ; for DC mode
- mov dword[edx], 2;I16_PRED_DC
- mov eax, ebx
- sub edi, 192
-%assign x 0
-%rep 16
- movdqa [edi+16*x], xmm7
-%assign x x+1
-%endrep
- jmp near return_sad_intra_16x16_x3
-not_dc_16x16_sad:
- ; for H mode
- cmp esi, ecx
- jge near not_dc_h_16x16_sad
- mov dword[edx], 1;I16_PRED_H
- mov eax, esi
- jmp near return_sad_intra_16x16_x3
-not_dc_h_16x16_sad:
- ; for V mode
- mov dword[edx], 0;I16_PRED_V
- mov eax, ecx
- sub edi, 192
-%assign x 0
-%rep 16
- movdqa [edi+16*x], xmm5
-%assign x x+1
-%endrep
-return_sad_intra_16x16_x3:
- pop edi
- pop esi
- pop ebx
- ret
-%endif
-;***********************************************************************
-;
-;Pixel_sad_intra_ssse3 END
-;
-;***********************************************************************
-;***********************************************************************
-;
-;Pixel_satd_wxh_sse41 BEGIN
-;
-;***********************************************************************
-
-;SSE4.1
-%macro SSE41_GetSatd8x4 0
- movq xmm0, [r0]
- punpcklqdq xmm0, xmm0
- pmaddubsw xmm0, xmm7
- movq xmm1, [r0+r1]
- punpcklqdq xmm1, xmm1
- pmaddubsw xmm1, xmm7
- movq xmm2, [r2]
- punpcklqdq xmm2, xmm2
- pmaddubsw xmm2, xmm7
- movq xmm3, [r2+r3]
- punpcklqdq xmm3, xmm3
- pmaddubsw xmm3, xmm7
- psubsw xmm0, xmm2
- psubsw xmm1, xmm3
- movq xmm2, [r0+2*r1]
- punpcklqdq xmm2, xmm2
- pmaddubsw xmm2, xmm7
- movq xmm3, [r0+r4]
- punpcklqdq xmm3, xmm3
- pmaddubsw xmm3, xmm7
- movq xmm4, [r2+2*r3]
- punpcklqdq xmm4, xmm4
- pmaddubsw xmm4, xmm7
- movq xmm5, [r2+r5]
- punpcklqdq xmm5, xmm5
- pmaddubsw xmm5, xmm7
- psubsw xmm2, xmm4
- psubsw xmm3, xmm5
- SSE2_HDMTwo4x4 xmm0, xmm1, xmm2, xmm3, xmm4
- pabsw xmm0, xmm0
- pabsw xmm2, xmm2
- pabsw xmm1, xmm1
- pabsw xmm3, xmm3
- movdqa xmm4, xmm3
- pblendw xmm3, xmm1, 0xAA
- pslld xmm1, 16
- psrld xmm4, 16
- por xmm1, xmm4
- pmaxuw xmm1, xmm3
- paddw xmm6, xmm1
- movdqa xmm4, xmm0
- pblendw xmm0, xmm2, 0xAA
- pslld xmm2, 16
- psrld xmm4, 16
- por xmm2, xmm4
- pmaxuw xmm0, xmm2
- paddw xmm6, xmm0
-%endmacro
-
-%macro SSSE3_SumWHorizon 4 ;eax, srcSSE, tempSSE, tempSSE
- MMX_DW_1_2REG %3, %4
- pmaddwd %2, %3
- movhlps %4, %2
- paddd %2, %4
- pshuflw %4, %2,0Eh
- paddd %2, %4
- movd %1, %2
-%endmacro
-;***********************************************************************
-;
-;int32_t WelsSampleSatd4x4_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd4x4_sse41
-WelsSampleSatd4x4_sse41:
- ;push ebx
- ;mov eax,[esp+8]
- ;mov ebx,[esp+12]
- ;mov ecx,[esp+16]
- ;mov edx,[esp+20]
-
- %assign push_num 0
- LOAD_4_PARA
- SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
- movdqa xmm4,[HSwapSumSubDB1]
- movd xmm2,[r2]
- movd xmm5,[r2+r3]
- shufps xmm2,xmm5,0
- movd xmm3,[r2+r3*2]
- lea r2, [r3*2+r2]
- movd xmm5,[r2+r3]
- shufps xmm3,xmm5,0
- movd xmm0,[r0]
- movd xmm5,[r0+r1]
- shufps xmm0,xmm5,0
- movd xmm1,[r0+r1*2]
- lea r0, [r1*2+r0]
- movd xmm5,[r0+r1]
- shufps xmm1,xmm5,0
- pmaddubsw xmm0,xmm4
- pmaddubsw xmm1,xmm4
- pmaddubsw xmm2,xmm4
- pmaddubsw xmm3,xmm4
- psubw xmm0,xmm2
- psubw xmm1,xmm3
- movdqa xmm2,xmm0
- paddw xmm0,xmm1
- psubw xmm1,xmm2
- movdqa xmm2,xmm0
- punpcklqdq xmm0,xmm1
- punpckhqdq xmm2,xmm1
- movdqa xmm1,xmm0
- paddw xmm0,xmm2
- psubw xmm2,xmm1
- movdqa xmm1,xmm0
- pblendw xmm0,xmm2,0AAh
- pslld xmm2,16
- psrld xmm1,16
- por xmm2,xmm1
- pabsw xmm0,xmm0
- pabsw xmm2,xmm2
- pmaxsw xmm0,xmm2
- SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7
- LOAD_4_PARA_POP
- ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd8x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd8x8_sse41
-align 16
-WelsSampleSatd8x8_sse41:
- ;push ebx
- ;push esi
- ;push edi
- ;mov eax, [esp+16]
- ;mov ebx, [esp+20]
- ;mov ecx, [esp+24]
- ;mov edx, [esp+28]
-%ifdef X86_32
- push r4
- push r5
-%endif
- %assign push_num 2
- LOAD_4_PARA
- SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
- movdqa xmm7, [HSumSubDB1]
- lea r4, [r1+r1*2]
- lea r5, [r3+r3*2]
- pxor xmm6, xmm6
- SSE41_GetSatd8x4
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- SSE41_GetSatd8x4
- SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
- LOAD_4_PARA_POP
-%ifdef X86_32
- pop r5
- pop r4
-%endif
- ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd8x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd8x16_sse41
-align 16
-WelsSampleSatd8x16_sse41:
- ;push ebx
- ;push esi
- ;push edi
- ;push ebp
- ;%define pushsize 16
- ;mov eax, [esp+pushsize+4]
- ;mov ebx, [esp+pushsize+8]
- ;mov ecx, [esp+pushsize+12]
- ;mov edx, [esp+pushsize+16]
-%ifdef X86_32
- push r4
- push r5
- push r6
-%endif
- %assign push_num 3
- LOAD_4_PARA
- SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
- movdqa xmm7, [HSumSubDB1]
- lea r4, [r1+r1*2]
- lea r5, [r3+r3*2]
- pxor xmm6, xmm6
- mov r6, 0
-loop_get_satd_8x16:
- SSE41_GetSatd8x4
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- inc r6
- cmp r6, 4
- jl loop_get_satd_8x16
- SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
- LOAD_4_PARA_POP
-%ifdef X86_32
- pop r6
- pop r5
- pop r4
-%endif
- ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd16x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd16x8_sse41
-align 16
-WelsSampleSatd16x8_sse41:
- ;push ebx
- ;push esi
- ;push edi
- ;mov eax, [esp+16]
- ;mov ebx, [esp+20]
- ;mov ecx, [esp+24]
- ;mov edx, [esp+28]
-%ifdef X86_32
- push r4
- push r5
-%endif
- %assign push_num 2
- LOAD_4_PARA
- SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
- push r0
- push r2
-
- movdqa xmm7, [HSumSubDB1]
- lea r4, [r1+r1*2]
- lea r5, [r3+r3*2]
- pxor xmm6, xmm6
- SSE41_GetSatd8x4
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- SSE41_GetSatd8x4
-
- pop r2
- pop r0
- ;mov eax, [esp+16]
- ;mov ecx, [esp+24]
- add r0, 8
- add r2, 8
- SSE41_GetSatd8x4
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- SSE41_GetSatd8x4
- SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
- LOAD_4_PARA_POP
-%ifdef X86_32
- pop r5
- pop r4
-%endif
- ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd16x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-
-WELS_EXTERN WelsSampleSatd16x16_sse41
-align 16
-WelsSampleSatd16x16_sse41:
- ;push ebx
- ;push esi
- ;push edi
- ;push ebp
- ;%define pushsize 16
- ;mov eax, [esp+pushsize+4]
- ;mov ebx, [esp+pushsize+8]
- ;mov ecx, [esp+pushsize+12]
- ;mov edx, [esp+pushsize+16]
-%ifdef X86_32
- push r4
- push r5
- push r6
-%endif
- %assign push_num 3
- LOAD_4_PARA
- SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
-
- push r0
- push r2
-
- movdqa xmm7, [HSumSubDB1]
- lea r4, [r1+r1*2]
- lea r5, [r3+r3*2]
- pxor xmm6, xmm6
- mov r6, 0
-loop_get_satd_16x16_left:
- SSE41_GetSatd8x4
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- inc r6
- cmp r6, 4
- jl loop_get_satd_16x16_left
-
- pop r2
- pop r0
- ;mov eax, [esp+pushsize+4]
- ;mov ecx, [esp+pushsize+12]
- add r0, 8
- add r2, 8
- mov r6, 0
-loop_get_satd_16x16_right:
- SSE41_GetSatd8x4
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- inc r6
- cmp r6, 4
- jl loop_get_satd_16x16_right
- SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
- ;%undef pushsize
- LOAD_4_PARA_POP
-%ifdef X86_32
- pop r6
- pop r5
- pop r4
-%endif
- ret
-
-;***********************************************************************
-;
-;Pixel_satd_wxh_sse41 END
-;
-;***********************************************************************
-
-;***********************************************************************
-;
-;Pixel_sad_wxh_sse2 BEGIN
-;
-;***********************************************************************
-
-%macro SSE2_GetSad2x16 0
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqu xmm1, [r2]
- MOVDQ xmm2, [r0];[eax] must aligned 16
- psadbw xmm1, xmm2
- paddw xmm0, xmm1
- movdqu xmm1, [r2+r3]
- MOVDQ xmm2, [r0+r1]
- psadbw xmm1, xmm2
- paddw xmm0, xmm1
-%endmacro
-
-
-%macro SSE2_GetSad4x16 0
- movdqu xmm0, [r2]
- MOVDQ xmm2, [r0]
- psadbw xmm0, xmm2
- paddw xmm7, xmm0
- movdqu xmm1, [r2+r3]
- MOVDQ xmm2, [r0+r1]
- psadbw xmm1, xmm2
- paddw xmm7, xmm1
- movdqu xmm1, [r2+2*r3]
- MOVDQ xmm2, [r0+2*r1];[eax] must aligned 16
- psadbw xmm1, xmm2
- paddw xmm7, xmm1
- movdqu xmm1, [r2+r5]
- MOVDQ xmm2, [r0+r4]
- psadbw xmm1, xmm2
- paddw xmm7, xmm1
-%endmacro
-
-
-%macro SSE2_GetSad8x4 0
- movq xmm0, [r0]
- movq xmm1, [r0+r1]
- lea r0, [r0+2*r1]
- movhps xmm0, [r0]
- movhps xmm1, [r0+r1]
-
- movq xmm2, [r2]
- movq xmm3, [r2+r3]
- lea r2, [r2+2*r3]
- movhps xmm2, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm2
- psadbw xmm1, xmm3
- paddw xmm6, xmm0
- paddw xmm6, xmm1
-%endmacro
-
-;***********************************************************************
-;
-;int32_t WelsSampleSad16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, )
-;First parameter can align to 16 bytes,
-;In wels, the third parameter can't align to 16 bytes.
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSad16x16_sse2
-align 16
-WelsSampleSad16x16_sse2:
- ;push ebx
- ;push edi
- ;push esi
- ;%define _STACK_SIZE 12
- ;mov eax, [esp+_STACK_SIZE+4 ]
- ;mov ebx, [esp+_STACK_SIZE+8 ]
- ;mov ecx, [esp+_STACK_SIZE+12]
- ;mov edx, [esp+_STACK_SIZE+16]
-%ifdef X86_32
- push r4
- push r5
-%endif
-
- %assign push_num 2
- LOAD_4_PARA
- SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
- lea r4, [3*r1]
- lea r5, [3*r3]
-
- pxor xmm7, xmm7
- SSE2_GetSad4x16
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- SSE2_GetSad4x16
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- SSE2_GetSad4x16
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- SSE2_GetSad4x16
- movhlps xmm0, xmm7
- paddw xmm0, xmm7
- movd retrd, xmm0
- LOAD_4_PARA_POP
-%ifdef X86_32
- pop r5
- pop r4
-%endif
- ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSad16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, )
-;First parameter can align to 16 bytes,
-;In wels, the third parameter can't align to 16 bytes.
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSad16x8_sse2
-align 16
-WelsSampleSad16x8_sse2:
- ;push ebx
- ;mov eax, [esp+8]
- ;mov ebx, [esp+12]
- ;mov ecx, [esp+16]
- ;mov edx, [esp+20]
-
- %assign push_num 0
- LOAD_4_PARA
- SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
- movdqu xmm0, [r2]
- MOVDQ xmm2, [r0]
- psadbw xmm0, xmm2
- movdqu xmm1, [r2+r3]
- MOVDQ xmm2, [r0+r1]
- psadbw xmm1, xmm2
- paddw xmm0, xmm1
-
- SSE2_GetSad2x16
- SSE2_GetSad2x16
- SSE2_GetSad2x16
-
- movhlps xmm1, xmm0
- paddw xmm0, xmm1
- movd retrd, xmm0
- LOAD_4_PARA_POP
- ret
-
-
-
-WELS_EXTERN WelsSampleSad8x16_sse2
-WelsSampleSad8x16_sse2:
- ;push ebx
- ;mov eax, [esp+8]
- ;mov ebx, [esp+12]
- ;mov ecx, [esp+16]
- ;mov edx, [esp+20]
-
- %assign push_num 0
- LOAD_4_PARA
- SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
- pxor xmm6, xmm6
-
- SSE2_GetSad8x4
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_GetSad8x4
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_GetSad8x4
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_GetSad8x4
-
- movhlps xmm0, xmm6
- paddw xmm0, xmm6
- movd retrd, xmm0
- LOAD_4_PARA_POP
- ret
-
-
-%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
-and %1, 0x1f|(%3>>1)
-cmp %1, (32-%2)|(%3>>1)
-%endmacro
-
-WELS_EXTERN WelsSampleSad8x8_sse21
-WelsSampleSad8x8_sse21:
- ;mov ecx, [esp+12]
- ;mov edx, ecx
- ;CACHE_SPLIT_CHECK edx, 8, 64
- ;jle near .pixel_sad_8x8_nsplit
- ;push ebx
- ;push edi
- ;mov eax, [esp+12]
- ;mov ebx, [esp+16]
-
- %assign push_num 0
- mov r2, arg3
- push r2
- CACHE_SPLIT_CHECK r2, 8, 64
- jle near .pixel_sad_8x8_nsplit
- pop r2
-%ifdef X86_32
- push r3
- push r4
- push r5
-%endif
- %assign push_num 3
- mov r0, arg1
- mov r1, arg2
- SIGN_EXTENTION r1, r1d
- pxor xmm7, xmm7
-
- ;ecx r2, edx r4, edi r5
-
- mov r5, r2
- and r5, 0x07
- sub r2, r5
- mov r4, 8
- sub r4, r5
-
- shl r5, 3
- shl r4, 3
- movd xmm5, r5d
- movd xmm6, r4d
- mov r5, 8
- add r5, r2
- mov r3, arg4
- SIGN_EXTENTION r3, r3d
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
-
- movq xmm1, [r2]
- movq xmm2, [r5]
- movhps xmm1, [r2+r3]
- movhps xmm2, [r5+r3]
- psrlq xmm1, xmm5
- psllq xmm2, xmm6
- por xmm1, xmm2
-
- psadbw xmm0, xmm1
- paddw xmm7, xmm0
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- lea r5, [r5+2*r3]
-
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
-
- movq xmm1, [r2]
- movq xmm2, [r5]
- movhps xmm1, [r2+r3]
- movhps xmm2, [r5+r3]
- psrlq xmm1, xmm5
- psllq xmm2, xmm6
- por xmm1, xmm2
-
- psadbw xmm0, xmm1
- paddw xmm7, xmm0
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- lea r5, [r5+2*r3]
-
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
-
- movq xmm1, [r2]
- movq xmm2, [r5]
- movhps xmm1, [r2+r3]
- movhps xmm2, [r5+r3]
- psrlq xmm1, xmm5
- psllq xmm2, xmm6
- por xmm1, xmm2
-
- psadbw xmm0, xmm1
- paddw xmm7, xmm0
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- lea r5, [r5+2*r3]
-
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
-
- movq xmm1, [r2]
- movq xmm2, [r5]
- movhps xmm1, [r2+r3]
- movhps xmm2, [r5+r3]
- psrlq xmm1, xmm5
- psllq xmm2, xmm6
- por xmm1, xmm2
-
- psadbw xmm0, xmm1
- paddw xmm7, xmm0
-
- movhlps xmm0, xmm7
- paddw xmm0, xmm7
- movd retrd, xmm0
-%ifdef X86_32
- pop r5
- pop r4
- pop r3
-%endif
- jmp .return
-
-.pixel_sad_8x8_nsplit:
- ;push ebx
- ;mov eax, [esp+8]
- ;mov ebx, [esp+12]
- ;mov edx, [esp+20]
-
- pop r2
- %assign push_num 0
- LOAD_4_PARA
- SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
- pxor xmm6, xmm6
- SSE2_GetSad8x4
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- SSE2_GetSad8x4
- movhlps xmm0, xmm6
- paddw xmm0, xmm6
- movd retrd, xmm0
- LOAD_4_PARA_POP
-.return:
- ret
-
-
-;***********************************************************************
-;
-;Pixel_sad_wxh_sse2 END
-;
-;***********************************************************************
-
-
-;***********************************************************************
-;
-;Pixel_sad_4_wxh_sse2 BEGIN
-;
-;***********************************************************************
-
-
-%macro SSE2_Get4LW16Sad 5 ;s-1l, s, s+1l, d, address
- psadbw %1, %4
- paddw xmm5, %1
- psadbw %4, %3
- paddw xmm4, %4
- movdqu %4, [%5-1]
- psadbw %4, %2
- paddw xmm6, %4
- movdqu %4, [%5+1]
- psadbw %4, %2
- paddw xmm7, %4
-%endmacro
-WELS_EXTERN WelsSampleSadFour16x16_sse2
-WelsSampleSadFour16x16_sse2:
- ;push ebx
- ;mov eax, [esp+8]
- ;mov ebx, [esp+12]
- ;mov ecx, [esp+16]
- ;mov edx, [esp+20]
-
- %assign push_num 0
- LOAD_5_PARA
- SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
- pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
- pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
- pxor xmm6, xmm6 ;sad pRefMb-1
- pxor xmm7, xmm7 ;sad pRefMb+1
- movdqa xmm0, [r0]
- sub r2, r3
- movdqu xmm3, [r2]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movdqa xmm1, [r0+r1]
- movdqu xmm3, [r2+r3]
- psadbw xmm3, xmm1
- paddw xmm4, xmm3
-
- movdqu xmm2, [r2+r3-1]
- psadbw xmm2, xmm0
- paddw xmm6, xmm2
-
- movdqu xmm3, [r2+r3+1]
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm2, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
- movdqa xmm0, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm1, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
- movdqa xmm2, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm0, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
- movdqa xmm1, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm2, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
- movdqa xmm0, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm1, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
- movdqa xmm2, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm0, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
- movdqa xmm1, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm2, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
- movdqa xmm0, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
- lea r2, [r2+2*r3]
- movdqu xmm3, [r2]
- psadbw xmm2, xmm3
- paddw xmm5, xmm2
-
- movdqu xmm2, [r2-1]
- psadbw xmm2, xmm0
- paddw xmm6, xmm2
-
- movdqu xmm3, [r2+1]
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movdqu xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- ;mov ecx, [esp+24]
- movhlps xmm0, xmm4
- paddw xmm4, xmm0
- movhlps xmm0, xmm5
- paddw xmm5, xmm0
- movhlps xmm0, xmm6
- paddw xmm6, xmm0
- movhlps xmm0, xmm7
- paddw xmm7, xmm0
- punpckldq xmm4, xmm5
- punpckldq xmm6, xmm7
- punpcklqdq xmm4, xmm6
- movdqa [r4],xmm4
- LOAD_5_PARA_POP
- ret
-
-
-WELS_EXTERN WelsSampleSadFour16x8_sse2
-WelsSampleSadFour16x8_sse2:
- ;push ebx
- ;push edi
- ;mov eax, [esp+12]
- ;mov ebx, [esp+16]
- ;mov edi, [esp+20]
- ;mov edx, [esp+24]
-
- %assign push_num 0
- LOAD_5_PARA
- SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
- pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
- pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
- pxor xmm6, xmm6 ;sad pRefMb-1
- pxor xmm7, xmm7 ;sad pRefMb+1
- movdqa xmm0, [r0]
- sub r2, r3
- movdqu xmm3, [r2]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movdqa xmm1, [r0+r1]
- movdqu xmm3, [r2+r3]
- psadbw xmm3, xmm1
- paddw xmm4, xmm3
-
- movdqu xmm2, [r2+r3-1]
- psadbw xmm2, xmm0
- paddw xmm6, xmm2
-
- movdqu xmm3, [r2+r3+1]
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm2, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
- movdqa xmm0, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm1, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
- movdqa xmm2, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movdqa xmm0, [r0]
- movdqu xmm3, [r2]
- SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
- movdqa xmm1, [r0+r1]
- movdqu xmm3, [r2+r3]
- SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
- lea r2, [r2+2*r3]
- movdqu xmm3, [r2]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movdqu xmm0, [r2-1]
- psadbw xmm0, xmm1
- paddw xmm6, xmm0
-
- movdqu xmm3, [r2+1]
- psadbw xmm3, xmm1
- paddw xmm7, xmm3
-
- movdqu xmm3, [r2+r3]
- psadbw xmm1, xmm3
- paddw xmm5, xmm1
-
- ;mov edi, [esp+28]
- movhlps xmm0, xmm4
- paddw xmm4, xmm0
- movhlps xmm0, xmm5
- paddw xmm5, xmm0
- movhlps xmm0, xmm6
- paddw xmm6, xmm0
- movhlps xmm0, xmm7
- paddw xmm7, xmm0
- punpckldq xmm4, xmm5
- punpckldq xmm6, xmm7
- punpcklqdq xmm4, xmm6
- movdqa [r4],xmm4
- LOAD_5_PARA_POP
- ret
-
-WELS_EXTERN WelsSampleSadFour8x16_sse2
-WelsSampleSadFour8x16_sse2:
- ;push ebx
- ;push edi
- ;mov eax, [esp+12]
- ;mov ebx, [esp+16]
- ;mov edi, [esp+20]
- ;mov edx, [esp+24]
-
- %assign push_num 0
- LOAD_5_PARA
- SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
- pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
- pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
- pxor xmm6, xmm6 ;sad pRefMb-1
- pxor xmm7, xmm7 ;sad pRefMb+1
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- sub r2, r3
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
-
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
-
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
-
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
-
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
-
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
-
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
-
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- ;mov edi, [esp+28]
- movhlps xmm0, xmm4
- paddw xmm4, xmm0
- movhlps xmm0, xmm5
- paddw xmm5, xmm0
- movhlps xmm0, xmm6
- paddw xmm6, xmm0
- movhlps xmm0, xmm7
- paddw xmm7, xmm0
- punpckldq xmm4, xmm5
- punpckldq xmm6, xmm7
- punpcklqdq xmm4, xmm6
- movdqa [r4],xmm4
- LOAD_5_PARA_POP
- ret
-
-
-WELS_EXTERN WelsSampleSadFour8x8_sse2
-WelsSampleSadFour8x8_sse2:
- ;push ebx
- ;push edi
- ;mov eax, [esp+12]
- ;mov ebx, [esp+16]
- ;mov edi, [esp+20]
- ;mov edx, [esp+24]
-
- %assign push_num 0
- LOAD_5_PARA
- SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
- pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
- pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
- pxor xmm6, xmm6 ;sad pRefMb-1
- pxor xmm7, xmm7 ;sad pRefMb+1
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- sub r2, r3
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
-
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
-
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movq xmm0, [r0]
- movhps xmm0, [r0+r1]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
-
- movq xmm1, [r2+r3-1]
- movq xmm3, [r2+r3+1]
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- movhps xmm1, [r2-1]
- movhps xmm3, [r2+1]
-
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [r2]
- movhps xmm3, [r2+r3]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- ;mov edi, [esp+28]
- movhlps xmm0, xmm4
- paddw xmm4, xmm0
- movhlps xmm0, xmm5
- paddw xmm5, xmm0
- movhlps xmm0, xmm6
- paddw xmm6, xmm0
- movhlps xmm0, xmm7
- paddw xmm7, xmm0
- punpckldq xmm4, xmm5
- punpckldq xmm6, xmm7
- punpcklqdq xmm4, xmm6
- movdqa [r4],xmm4
- LOAD_5_PARA_POP
- ret
-
-WELS_EXTERN WelsSampleSadFour4x4_sse2
-WelsSampleSadFour4x4_sse2:
- ;push ebx
- ;push edi
- ;mov eax, [esp+12]
- ;mov ebx, [esp+16]
- ;mov edi, [esp+20]
- ;mov edx, [esp+24]
-
- %assign push_num 0
- LOAD_5_PARA
- SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
- movd xmm0, [r0]
- movd xmm1, [r0+r1]
- lea r0, [r0+2*r1]
- movd xmm2, [r0]
- movd xmm3, [r0+r1]
- punpckldq xmm0, xmm1
- punpckldq xmm2, xmm3
- punpcklqdq xmm0, xmm2
- sub r2, r3
- movd xmm1, [r2]
- movd xmm2, [r2+r3]
- punpckldq xmm1, xmm2
- movd xmm2, [r2+r3-1]
- movd xmm3, [r2+r3+1]
-
- lea r2, [r2+2*r3]
-
- movd xmm4, [r2]
- movd xmm5, [r2-1]
- punpckldq xmm2, xmm5
- movd xmm5, [r2+1]
- punpckldq xmm3, xmm5
-
- movd xmm5, [r2+r3]
- punpckldq xmm4, xmm5
-
- punpcklqdq xmm1, xmm4 ;-L
-
- movd xmm5, [r2+r3-1]
- movd xmm6, [r2+r3+1]
-
- lea r2, [r2+2*r3]
- movd xmm7, [r2-1]
- punpckldq xmm5, xmm7
- punpcklqdq xmm2, xmm5 ;-1
- movd xmm7, [r2+1]
- punpckldq xmm6, xmm7
- punpcklqdq xmm3, xmm6 ;+1
- movd xmm6, [r2]
- movd xmm7, [r2+r3]
- punpckldq xmm6, xmm7
- punpcklqdq xmm4, xmm6 ;+L
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
- psadbw xmm4, xmm0
-
- movhlps xmm0, xmm1
- paddw xmm1, xmm0
- movhlps xmm0, xmm2
- paddw xmm2, xmm0
- movhlps xmm0, xmm3
- paddw xmm3, xmm0
- movhlps xmm0, xmm4
- paddw xmm4, xmm0
- ;mov edi, [esp+28]
- punpckldq xmm1, xmm4
- punpckldq xmm2, xmm3
- punpcklqdq xmm1, xmm2
- movdqa [r4],xmm1
- LOAD_5_PARA_POP
- ret
-
-;***********************************************************************
-;
-;Pixel_sad_4_wxh_sse2 END
-;
-;***********************************************************************
-
-WELS_EXTERN WelsSampleSad4x4_mmx
-
-align 16
-;***********************************************************************
-; int32_t __cdecl WelsSampleSad4x4_mmx (uint8_t *, int32_t, uint8_t *, int32_t )
-;***********************************************************************
-WelsSampleSad4x4_mmx:
- ;push ebx
- ;%define pushsize 4
- ;%define pix1address esp+pushsize+4
- ;%define pix1stride esp+pushsize+8
- ;%define pix2address esp+pushsize+12
- ;%define pix2stride esp+pushsize+16
- ;mov eax, [pix1address]
- ;mov ebx, [pix1stride ]
- ;mov ecx, [pix2address]
- ;mov edx, [pix2stride ]
-
- %assign push_num 0
- LOAD_4_PARA
- SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
- movd mm0, [r0]
- movd mm1, [r0+r1]
- punpckldq mm0, mm1
-
- movd mm3, [r2]
- movd mm4, [r2+r3]
- punpckldq mm3, mm4
- psadbw mm0, mm3
-
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
-
- movd mm1, [r0]
- movd mm2, [r0+r1]
- punpckldq mm1, mm2
-
- movd mm3, [r2]
- movd mm4, [r2+r3]
- punpckldq mm3, mm4
- psadbw mm1, mm3
- paddw mm0, mm1
-
- movd retrd, mm0
-
- WELSEMMS
- LOAD_4_PARA_POP
- ret
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* satd_sad.asm
+;*
+;* Abstract
+;* WelsSampleSatd4x4_sse2
+;* WelsSampleSatd8x8_sse2
+;* WelsSampleSatd16x8_sse2
+;* WelsSampleSatd8x16_sse2
+;* WelsSampleSatd16x16_sse2
+;*
+;* WelsSampleSad16x8_sse2
+;* WelsSampleSad16x16_sse2
+;*
+;* History
+;* 8/5/2009 Created
+;*  24/9/2009 Modified
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+;***********************************************************************
+; Data
+;***********************************************************************
+SECTION .rodata align=16
+
+align 16
+HSumSubDB1: db 1,1,1,1,1,1,1,1,1,-1,1,-1,1,-1,1,-1
+align 16
+HSumSubDW1: dw 1,-1,1,-1,1,-1,1,-1
+align 16
+PDW1: dw 1,1,1,1,1,1,1,1
+align 16
+PDQ2: dw 2,0,0,0,2,0,0,0
+align 16
+HSwapSumSubDB1: times 2 db 1, 1, 1, 1, 1, -1, 1, -1
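+; Roles of the constants above, as used by the code below: pmaddubsw with
+; HSumSubDB1 forms adjacent-pixel sums (low qword, weights 1,1) and
+; differences (high qword, weights 1,-1), i.e. the first Hadamard butterfly;
+; pmaddwd with PDW1 / HSumSubDW1 is the corresponding sum/difference stage on
+; words; PDQ2 is the +2 rounding term of the chroma (sum+2)>>2 DC averages;
+; HSwapSumSubDB1 packs two sum pairs and two difference pairs per qword, for
+; WelsSampleSatd4x4_sse41.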
+
+;***********************************************************************
+; Code
+;***********************************************************************
+SECTION .text
+
+;***********************************************************************
+;
+;Pixel_satd_wxh_sse2 BEGIN
+;
+;***********************************************************************
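+; Reference for what the SATD routines below compute (a sketch, not part of
+; the build): for each 4x4 sub-block D of src-ref differences, apply the
+; 4-point Hadamard transform horizontally and vertically and sum the absolute
+; values of the 16 coefficients. The sse2 routines halve that sum at the end
+; (psrlw/shr by 1); the sse41 ones fold the same factor into a max-based last
+; stage instead.
+;
+; MMX_DW_1_2REG %1, %2: set every word of %1 to 1 (as 0 - (-1)); %2 is
+; clobbered as scratch.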
+%macro MMX_DW_1_2REG 2
+ pxor %1, %1
+ pcmpeqw %2, %2
+ psubw %1, %2
+%endmacro
+
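+; SSE2_SumWHorizon1 %1, %2: add the eight words of %1 horizontally in three
+; shift/add steps, leaving the total in the low word of %1; %2 is scratch.
+; paddusw saturates, which should be harmless here since a 4x4 SATD fits in
+; 16 bits.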
+%macro SSE2_SumWHorizon1 2
+ movdqa %2, %1
+ psrldq %2, 8
+ paddusw %1, %2
+ movdqa %2, %1
+ psrldq %2, 4
+ paddusw %1, %2
+ movdqa %2, %1
+ psrldq %2, 2
+ paddusw %1, %2
+%endmacro
+
+%macro SSE2_HDMTwo4x4 5 ;in: xmm1,xmm2,xmm3,xmm4 pOut: xmm4,xmm2,xmm1,xmm3
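+    ; two sum/difference (butterfly) stages over the four packed-word rows,
+    ; i.e. a 4-point Hadamard transform; SSE2_SumSub (defined earlier in this
+    ; file, not shown here) is taken to leave the sum and the difference of
+    ; its first two operands in those operands, with %5 as scratch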
+ SSE2_SumSub %1, %2, %5
+ SSE2_SumSub %3, %4, %5
+ SSE2_SumSub %2, %4, %5
+ SSE2_SumSub %1, %3, %5
+%endmacro
+
+%macro SSE2_SumAbs4 7
+ WELS_AbsW %1, %3
+ WELS_AbsW %2, %3
+ WELS_AbsW %4, %6
+ WELS_AbsW %5, %6
+ paddusw %1, %2
+ paddusw %4, %5
+ paddusw %7, %1
+ paddusw %7, %4
+%endmacro
+
+%macro SSE2_SumWHorizon 3
+ movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4
+ paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04
+ punpcklwd %1, %3 ; x1 = d37 d26 d15 d04
+ movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
+ paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246
+ pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357
+ paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567
+%endmacro
+
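+; SSE2_GetSatd8x8: SATD of one 8x8 block, done as two four-row passes. Each
+; pass loads src-ref differences (SSE2_LoadDiff8P, defined earlier in this
+; file), applies the 4-point Hadamard vertically, transposes with
+; SSE2_TransTwo4x4W, applies it horizontally, and accumulates the absolute
+; coefficients into xmm6 via SSE2_SumAbs4. r0/r2 end up six rows ahead;
+; callers add the remaining 2*stride before the next 8x8.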
+%macro SSE2_GetSatd8x8 0
+ SSE2_LoadDiff8P xmm0,xmm4,xmm7,[r0],[r2]
+ SSE2_LoadDiff8P xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_LoadDiff8P xmm2,xmm4,xmm7,[r0],[r2]
+ SSE2_LoadDiff8P xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
+
+ SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4
+ SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4
+ SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
+ SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_LoadDiff8P xmm0,xmm4,xmm7,[r0],[r2]
+ SSE2_LoadDiff8P xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_LoadDiff8P xmm2,xmm4,xmm7,[r0],[r2]
+ SSE2_LoadDiff8P xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
+
+ SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4
+ SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4
+ SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
+ SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
+%endmacro
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd4x4_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd4x4_sse2
+align 16
+WelsSampleSatd4x4_sse2:
+ ;push ebx
+ ;mov eax, [esp+8]
+ ;mov ebx, [esp+12]
+ ;mov ecx, [esp+16]
+ ;mov edx, [esp+20]
+
+ %assign push_num 0
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ movd xmm0, [r0]
+ movd xmm1, [r0+r1]
+ lea r0 , [r0+2*r1]
+ movd xmm2, [r0]
+ movd xmm3, [r0+r1]
+ punpckldq xmm0, xmm2
+ punpckldq xmm1, xmm3
+
+ movd xmm4, [r2]
+ movd xmm5, [r2+r3]
+ lea r2 , [r2+2*r3]
+ movd xmm6, [r2]
+ movd xmm7, [r2+r3]
+ punpckldq xmm4, xmm6
+ punpckldq xmm5, xmm7
+
+ pxor xmm6, xmm6
+ punpcklbw xmm0, xmm6
+ punpcklbw xmm1, xmm6
+ punpcklbw xmm4, xmm6
+ punpcklbw xmm5, xmm6
+
+ psubw xmm0, xmm4
+ psubw xmm1, xmm5
+
+ movdqa xmm2, xmm0
+ paddw xmm0, xmm1
+ psubw xmm2, xmm1
+ SSE2_XSawp qdq, xmm0, xmm2, xmm3
+
+ movdqa xmm4, xmm0
+ paddw xmm0, xmm3
+ psubw xmm4, xmm3
+
+ movdqa xmm2, xmm0
+ punpcklwd xmm0, xmm4
+ punpckhwd xmm4, xmm2
+
+ SSE2_XSawp dq, xmm0, xmm4, xmm3
+ SSE2_XSawp qdq, xmm0, xmm3, xmm5
+
+ movdqa xmm7, xmm0
+ paddw xmm0, xmm5
+ psubw xmm7, xmm5
+
+ SSE2_XSawp qdq, xmm0, xmm7, xmm1
+
+ movdqa xmm2, xmm0
+ paddw xmm0, xmm1
+ psubw xmm2, xmm1
+
+ WELS_AbsW xmm0, xmm3
+ paddusw xmm6, xmm0
+ WELS_AbsW xmm2, xmm4
+ paddusw xmm6, xmm2
+ SSE2_SumWHorizon1 xmm6, xmm4
+ movd retrd, xmm6
+ and retrd, 0xffff
+ shr retrd, 1
+ LOAD_4_PARA_POP
+ ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd8x8_sse2
+align 16
+WelsSampleSatd8x8_sse2:
+ ;push ebx
+ ;mov eax, [esp+8]
+ ;mov ebx, [esp+12]
+ ;mov ecx, [esp+16]
+ ;mov edx, [esp+20]
+
+ %assign push_num 0
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ pxor xmm6, xmm6
+ pxor xmm7, xmm7
+ SSE2_GetSatd8x8
+ psrlw xmm6, 1
+ SSE2_SumWHorizon xmm6,xmm4,xmm7
+ movd retrd, xmm6
+ LOAD_4_PARA_POP
+ ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd8x16_sse2
+align 16
+WelsSampleSatd8x16_sse2:
+ ;push ebx
+ ;mov eax, [esp+8]
+ ;mov ebx, [esp+12]
+ ;mov ecx, [esp+16]
+ ;mov edx, [esp+20]
+
+ %assign push_num 0
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ pxor xmm6, xmm6
+ pxor xmm7, xmm7
+
+ SSE2_GetSatd8x8
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_GetSatd8x8
+
+ psrlw xmm6, 1
+ SSE2_SumWHorizon xmm6,xmm4,xmm7
+ movd retrd, xmm6
+ LOAD_4_PARA_POP
+ ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd16x8_sse2
+align 16
+WelsSampleSatd16x8_sse2:
+ ;push ebx
+ ;mov eax, [esp+8]
+ ;mov ebx, [esp+12]
+ ;mov ecx, [esp+16]
+ ;mov edx, [esp+20]
+
+ %assign push_num 0
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ push r0
+ push r2
+ pxor xmm6, xmm6
+ pxor xmm7, xmm7
+
+ SSE2_GetSatd8x8
+
+ pop r2
+ pop r0
+ ;mov eax, [esp+8]
+ ;mov ecx, [esp+16]
+ add r0, 8
+ add r2, 8
+ SSE2_GetSatd8x8
+
+ psrlw xmm6, 1
+ SSE2_SumWHorizon xmm6,xmm4,xmm7
+ movd retrd, xmm6
+ LOAD_4_PARA_POP
+ ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd16x16_sse2
+align 16
+WelsSampleSatd16x16_sse2:
+ ;push ebx
+ ;mov eax, [esp+8]
+ ;mov ebx, [esp+12]
+ ;mov ecx, [esp+16]
+ ;mov edx, [esp+20]
+
+ %assign push_num 0
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ push r0
+ push r2
+ pxor xmm6, xmm6
+ pxor xmm7, xmm7
+
+ SSE2_GetSatd8x8
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_GetSatd8x8
+
+ pop r2
+ pop r0
+ ;mov eax, [esp+8]
+ ;mov ecx, [esp+16]
+ add r0, 8
+ add r2, 8
+
+ SSE2_GetSatd8x8
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_GetSatd8x8
+
+ ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
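+    ; e.g. (6>>1) + (8>>1) = 7 = (6+8)>>1, which would not hold for odd terms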
+ psrlw xmm6, 1
+ SSE2_SumWHorizon xmm6,xmm4,xmm7
+ movd retrd, xmm6
+ LOAD_4_PARA_POP
+ ret
+
+;***********************************************************************
+;
+;Pixel_satd_wxh_sse2 END
+;
+;***********************************************************************
+
+;***********************************************************************
+;
+;Pixel_satd_intra_sse2 BEGIN
+;
+;***********************************************************************
+
+%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3
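+    ; two-stage sum/difference (a 4-point Hadamard of each 4-pixel half of
+    ; the 8 border pixels duplicated into both lanes of %1): byte stage via
+    ; HSumSubDB1, word stage via PDW1/HSumSubDW1; the unpacks regroup the
+    ; coefficients, xmm4 accumulates the terms used later for the DC cost,
+    ; and psllw 2 scales everything by 4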
+ pmaddubsw %1, xmm5
+ movdqa %2, %1
+ pmaddwd %1, xmm7
+ pmaddwd %2, xmm6
+ movdqa %3, %1
+ punpckldq %1, %2
+ punpckhdq %2, %3
+ movdqa %3, %1
+ punpcklqdq %1, %2
+ punpckhqdq %3, %2
+ paddd xmm4, %1 ;for dc
+ paddd xmm4, %3 ;for dc
+ packssdw %1, %3
+ psllw %1, 2
+%endmacro
+%macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2
+ pmaddubsw %1, xmm5
+ movdqa %2, %1
+ pmaddwd %1, xmm7
+ pmaddwd %2, xmm6
+ movdqa %3, %1
+ punpckldq %1, %2
+ punpckhdq %2, %3
+ movdqa %3, %1
+ punpcklqdq %1, %2
+ punpckhqdq %3, %2
+; paddd xmm4, %1 ;for dc
+; paddd xmm4, %3 ;for dc
+ movdqa %4, %1
+ punpcklqdq %4, %3
+ packssdw %1, %3
+ psllw %1, 2
+%endmacro
+
+%macro SSE41_GetX38x4SatdDec 0
+ pxor xmm7, xmm7
+ movq xmm0, [eax]
+ movq xmm1, [eax+ebx]
+ lea eax, [eax+2*ebx]
+ movq xmm2, [eax]
+ movq xmm3, [eax+ebx]
+ lea eax, [eax+2*ebx]
+ punpcklbw xmm0, xmm7
+ punpcklbw xmm1, xmm7
+ punpcklbw xmm2, xmm7
+ punpcklbw xmm3, xmm7
+ SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm7
+ SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm7
+ SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2
+ ;doesn't need another transpose
+%endmacro
+%macro SSE41_GetX38x4SatdV 2
+ pxor xmm0, xmm0
+ pinsrw xmm0, word[esi+%2], 0
+ pinsrw xmm0, word[esi+%2+8], 4
+ psubsw xmm0, xmm7
+ pabsw xmm0, xmm0
+ paddw xmm4, xmm0
+ pxor xmm0, xmm0
+ pinsrw xmm0, word[esi+%2+2], 0
+ pinsrw xmm0, word[esi+%2+10], 4
+ psubsw xmm0, xmm1
+ pabsw xmm0, xmm0
+ paddw xmm4, xmm0
+ pxor xmm0, xmm0
+ pinsrw xmm0, word[esi+%2+4], 0
+ pinsrw xmm0, word[esi+%2+12], 4
+ psubsw xmm0, xmm3
+ pabsw xmm0, xmm0
+ paddw xmm4, xmm0
+ pxor xmm0, xmm0
+ pinsrw xmm0, word[esi+%2+6], 0
+ pinsrw xmm0, word[esi+%2+14], 4
+ psubsw xmm0, xmm2
+ pabsw xmm0, xmm0
+ paddw xmm4, xmm0
+%endmacro
+%macro SSE41_GetX38x4SatdH 3
+ movq xmm0, [esi+%3+8*%1]
+ punpcklqdq xmm0, xmm0
+ psubsw xmm0, xmm7
+ pabsw xmm0, xmm0
+ paddw xmm5, xmm0
+ pabsw xmm1, xmm1
+ pabsw xmm2, xmm2
+ pabsw xmm3, xmm3
+    paddw xmm2, xmm1 ;for DC
+    paddw xmm2, xmm3 ;for DC
+ paddw xmm5, xmm2
+%endmacro
+%macro SSE41_I16X16GetX38x4SatdDC 0
+ pxor xmm0, xmm0
+ movq2dq xmm0, mm4
+ punpcklqdq xmm0, xmm0
+ psubsw xmm0, xmm7
+ pabsw xmm0, xmm0
+ paddw xmm6, xmm0
+ paddw xmm6, xmm2
+%endmacro
+%macro SSE41_ChromaGetX38x4SatdDC 1
+ shl %1, 4
+ movdqa xmm0, [esi+32+%1]
+ psubsw xmm0, xmm7
+ pabsw xmm0, xmm0
+ paddw xmm6, xmm0
+ paddw xmm6, xmm2
+%endmacro
+%macro SSE41_I16x16GetX38x4Satd 2
+ SSE41_GetX38x4SatdDec
+ SSE41_GetX38x4SatdV %1, %2
+ SSE41_GetX38x4SatdH %1, %2, 32
+ SSE41_I16X16GetX38x4SatdDC
+%endmacro
+%macro SSE41_ChromaGetX38x4Satd 2
+ SSE41_GetX38x4SatdDec
+ SSE41_GetX38x4SatdV %1, %2
+ SSE41_GetX38x4SatdH %1, %2, 16
+ SSE41_ChromaGetX38x4SatdDC %1
+%endmacro
+%macro SSE41_HSum8W 3
+ pmaddwd %1, %2
+ movhlps %3, %1
+ paddd %1, %3
+ pshuflw %3, %1,0Eh
+ paddd %1, %3
+%endmacro
+
+
+%ifdef X86_32
+WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
+WelsIntra16x16Combined3Satd_sse41:
+ push ebx
+ push esi
+ push edi
+ mov ecx, [esp+16]
+ mov edx, [esp+20]
+ mov eax, [esp+24]
+ mov ebx, [esp+28]
+ mov esi, [esp+40] ;temp_satd
+ pxor xmm4, xmm4
+ movdqa xmm5, [HSumSubDB1]
+ movdqa xmm6, [HSumSubDW1]
+ movdqa xmm7, [PDW1]
+ sub ecx, edx
+ movdqu xmm0, [ecx]
+ movhlps xmm1, xmm0
+ punpcklqdq xmm0, xmm0
+ punpcklqdq xmm1, xmm1
+ SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
+ SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
+ movdqa [esi], xmm0 ;V
+ movdqa [esi+16], xmm1
+ add ecx, edx
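+    ; gather the 16 left-neighbour pixels (the column at x = -1) into xmm0,
+    ; two rows per step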
+ pinsrb xmm0, byte[ecx-1], 0
+ pinsrb xmm0, byte[ecx+edx-1], 1
+ lea ecx, [ecx+2*edx]
+ pinsrb xmm0, byte[ecx-1], 2
+ pinsrb xmm0, byte[ecx+edx-1], 3
+ lea ecx, [ecx+2*edx]
+ pinsrb xmm0, byte[ecx-1], 4
+ pinsrb xmm0, byte[ecx+edx-1], 5
+ lea ecx, [ecx+2*edx]
+ pinsrb xmm0, byte[ecx-1], 6
+ pinsrb xmm0, byte[ecx+edx-1], 7
+ lea ecx, [ecx+2*edx]
+ pinsrb xmm0, byte[ecx-1], 8
+ pinsrb xmm0, byte[ecx+edx-1], 9
+ lea ecx, [ecx+2*edx]
+ pinsrb xmm0, byte[ecx-1], 10
+ pinsrb xmm0, byte[ecx+edx-1], 11
+ lea ecx, [ecx+2*edx]
+ pinsrb xmm0, byte[ecx-1], 12
+ pinsrb xmm0, byte[ecx+edx-1], 13
+ lea ecx, [ecx+2*edx]
+ pinsrb xmm0, byte[ecx-1], 14
+ pinsrb xmm0, byte[ecx+edx-1], 15
+ movhlps xmm1, xmm0
+ punpcklqdq xmm0, xmm0
+ punpcklqdq xmm1, xmm1
+ SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
+ SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
+ movdqa [esi+32], xmm0 ;H
+ movdqa [esi+48], xmm1
+ movd ecx, xmm4 ;dc
+ add ecx, 16 ;(sum+16)
+ shr ecx, 5 ;((sum+16)>>5)
+    shl ecx, 4 ;((sum+16)>>5)*16
+ movd mm4, ecx ; mm4 copy DC
+ pxor xmm4, xmm4 ;V
+ pxor xmm5, xmm5 ;H
+ pxor xmm6, xmm6 ;DC
+ mov ecx, 0
+ mov edi, 0
+.loop16x16_get_satd:
+.loopStart1:
+ SSE41_I16x16GetX38x4Satd ecx, edi
+ inc ecx
+ cmp ecx, 4
+ jl .loopStart1
+ cmp edi, 16
+ je .loop16x16_get_satd_end
+ mov eax, [esp+24]
+ add eax, 8
+ mov ecx, 0
+ add edi, 16
+ jmp .loop16x16_get_satd
+ .loop16x16_get_satd_end:
+ MMX_DW_1_2REG xmm0, xmm1
+ psrlw xmm4, 1 ;/2
+ psrlw xmm5, 1 ;/2
+ psrlw xmm6, 1 ;/2
+ SSE41_HSum8W xmm4, xmm0, xmm1
+ SSE41_HSum8W xmm5, xmm0, xmm1
+ SSE41_HSum8W xmm6, xmm0, xmm1
+
+ ; comparing order: DC H V
+ movd ebx, xmm6 ;DC
+ movd edi, xmm5 ;H
+ movd ecx, xmm4 ;V
+ mov edx, [esp+36]
+ shl edx, 1
+ add edi, edx
+ add ebx, edx
+ mov edx, [esp+32]
+ cmp ebx, edi
+ jge near not_dc_16x16
+ cmp ebx, ecx
+ jge near not_dc_h_16x16
+
+ ; for DC mode
+ mov dword[edx], 2;I16_PRED_DC
+ mov eax, ebx
+ jmp near return_satd_intra_16x16_x3
+not_dc_16x16:
+ ; for H mode
+ cmp edi, ecx
+ jge near not_dc_h_16x16
+ mov dword[edx], 1;I16_PRED_H
+ mov eax, edi
+ jmp near return_satd_intra_16x16_x3
+not_dc_h_16x16:
+ ; for V mode
+ mov dword[edx], 0;I16_PRED_V
+ mov eax, ecx
+return_satd_intra_16x16_x3:
+ WELSEMMS
+ pop edi
+ pop esi
+ pop ebx
+ret
+
+%macro SSE41_ChromaGetX38x8Satd 0
+ movdqa xmm5, [HSumSubDB1]
+ movdqa xmm6, [HSumSubDW1]
+ movdqa xmm7, [PDW1]
+ sub ecx, edx
+ movq xmm0, [ecx]
+ punpcklqdq xmm0, xmm0
+ SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
+ movdqa [esi], xmm0 ;V
+ add ecx, edx
+ pinsrb xmm0, byte[ecx-1], 0
+ pinsrb xmm0, byte[ecx+edx-1], 1
+ lea ecx, [ecx+2*edx]
+ pinsrb xmm0, byte[ecx-1], 2
+ pinsrb xmm0, byte[ecx+edx-1], 3
+ lea ecx, [ecx+2*edx]
+ pinsrb xmm0, byte[ecx-1], 4
+ pinsrb xmm0, byte[ecx+edx-1], 5
+ lea ecx, [ecx+2*edx]
+ pinsrb xmm0, byte[ecx-1], 6
+ pinsrb xmm0, byte[ecx+edx-1], 7
+ punpcklqdq xmm0, xmm0
+ SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
+ movdqa [esi+16], xmm0 ;H
+;(sum+2)>>2
+ movdqa xmm6, [PDQ2]
+ movdqa xmm5, xmm4
+ punpckhqdq xmm5, xmm1
+ paddd xmm5, xmm6
+ psrld xmm5, 2
+;(sum1+sum2+4)>>3
+ paddd xmm6, xmm6
+ paddd xmm4, xmm1
+ paddd xmm4, xmm6
+ psrld xmm4, 3
+;satd *16
+ pslld xmm5, 4
+ pslld xmm4, 4
+;temp satd
+ movdqa xmm6, xmm4
+ punpcklqdq xmm4, xmm5
+ psllq xmm4, 32
+ psrlq xmm4, 32
+ movdqa [esi+32], xmm4
+ punpckhqdq xmm5, xmm6
+ psllq xmm5, 32
+ psrlq xmm5, 32
+ movdqa [esi+48], xmm5
+
+ pxor xmm4, xmm4 ;V
+ pxor xmm5, xmm5 ;H
+ pxor xmm6, xmm6 ;DC
+ mov ecx, 0
+loop_chroma_satdx3_cb_cr:
+ SSE41_ChromaGetX38x4Satd ecx, 0
+ inc ecx
+ cmp ecx, 2
+ jl loop_chroma_satdx3_cb_cr
+%endmacro
+
+%macro SSEReg2MMX 3
+ movdq2q %2, %1
+ movhlps %1, %1
+ movdq2q %3, %1
+%endmacro
+%macro MMXReg2SSE 4
+ movq2dq %1, %3
+ movq2dq %2, %4
+ punpcklqdq %1, %2
+%endmacro
+;the two macros above exist to reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41
+
+WELS_EXTERN WelsIntraChroma8x8Combined3Satd_sse41
+WelsIntraChroma8x8Combined3Satd_sse41:
+ push ebx
+ push esi
+ push edi
+ mov ecx, [esp+16]
+ mov edx, [esp+20]
+ mov eax, [esp+24]
+ mov ebx, [esp+28]
+ mov esi, [esp+40] ;temp_satd
+ xor edi, edi
+loop_chroma_satdx3:
+ SSE41_ChromaGetX38x8Satd
+ cmp edi, 1
+ je loop_chroma_satdx3end
+ inc edi
+ SSEReg2MMX xmm4, mm0,mm1
+ SSEReg2MMX xmm5, mm2,mm3
+ SSEReg2MMX xmm6, mm5,mm6
+ mov ecx, [esp+44]
+ mov eax, [esp+48]
+ jmp loop_chroma_satdx3
+loop_chroma_satdx3end:
+ MMXReg2SSE xmm0, xmm3, mm0, mm1
+ MMXReg2SSE xmm1, xmm3, mm2, mm3
+ MMXReg2SSE xmm2, xmm3, mm5, mm6
+
+ paddw xmm4, xmm0
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+
+ MMX_DW_1_2REG xmm0, xmm1
+ psrlw xmm4, 1 ;/2
+ psrlw xmm5, 1 ;/2
+ psrlw xmm6, 1 ;/2
+ SSE41_HSum8W xmm4, xmm0, xmm1
+ SSE41_HSum8W xmm5, xmm0, xmm1
+ SSE41_HSum8W xmm6, xmm0, xmm1
+ ; comparing order: DC H V
+ movd ebx, xmm6 ;DC
+ movd edi, xmm5 ;H
+ movd ecx, xmm4 ;V
+ mov edx, [esp+36]
+ shl edx, 1
+ add edi, edx
+ add ecx, edx
+ mov edx, [esp+32]
+ cmp ebx, edi
+ jge near not_dc_8x8
+ cmp ebx, ecx
+ jge near not_dc_h_8x8
+
+ ; for DC mode
+ mov dword[edx], 0;I8_PRED_DC
+ mov eax, ebx
+ jmp near return_satd_intra_8x8_x3
+not_dc_8x8:
+ ; for H mode
+ cmp edi, ecx
+ jge near not_dc_h_8x8
+ mov dword[edx], 1;I8_PRED_H
+ mov eax, edi
+ jmp near return_satd_intra_8x8_x3
+not_dc_h_8x8:
+ ; for V mode
+ mov dword[edx], 2;I8_PRED_V
+ mov eax, ecx
+return_satd_intra_8x8_x3:
+ WELSEMMS
+ pop edi
+ pop esi
+ pop ebx
+ret
+
+
+;***********************************************************************
+;
+;Pixel_satd_intra_sse2 END
+;
+;***********************************************************************
+%macro SSSE3_Get16BSadHVDC 2
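+    ; SAD-match the 16-byte source row %2 against three candidate predictions:
+    ; DC (xmm7) accumulated in xmm4, vertical (xmm5, the row above the MB) in
+    ; xmm2, and horizontal (the low byte of %1 broadcast via pshufb and written
+    ; back to %1) in xmm3; xmm1 must hold zero for the broadcast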
+ movd xmm6,%1
+ pshufb xmm6,xmm1
+ movdqa %1, xmm6
+ movdqa xmm0,%2
+ psadbw xmm0,xmm7
+ paddw xmm4,xmm0
+ movdqa xmm0,%2
+ psadbw xmm0,xmm5
+ paddw xmm2,xmm0
+ psadbw xmm6,%2
+ paddw xmm3,xmm6
+%endmacro
+%macro WelsAddDCValue 4
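+    ; load byte %1 zero-extended into %2, store it to the temp slot %3, and
+    ; add it into the running DC sum %4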
+ movzx %2, byte %1
+ mov %3, %2
+ add %4, %2
+%endmacro
+
+;***********************************************************************
+;
+;Pixel_sad_intra_ssse3 BEGIN
+;
+;***********************************************************************
+WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
+WelsIntra16x16Combined3Sad_ssse3:
+ push ebx
+ push esi
+ push edi
+ mov ecx, [esp+16]
+ mov edx, [esp+20]
+ mov edi, [esp+40] ;temp_sad
+ sub ecx, edx
+ movdqa xmm5,[ecx]
+ pxor xmm0,xmm0
+ psadbw xmm0,xmm5
+ movhlps xmm1,xmm0
+ paddw xmm0,xmm1
+ movd eax,xmm0
+
+ add ecx,edx
+ lea ebx, [edx+2*edx]
+ WelsAddDCValue [ecx-1 ], esi, [edi ], eax
+ WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
+ WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
+ WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
+ lea ecx, [ecx+4*edx]
+ add edi, 64
+ WelsAddDCValue [ecx-1 ], esi, [edi ], eax
+ WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
+ WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
+ WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
+ lea ecx, [ecx+4*edx]
+ add edi, 64
+ WelsAddDCValue [ecx-1 ], esi, [edi ], eax
+ WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
+ WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
+ WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
+ lea ecx, [ecx+4*edx]
+ add edi, 64
+ WelsAddDCValue [ecx-1 ], esi, [edi ], eax
+ WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
+ WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
+ WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
+ sub edi, 192
+ add eax,10h
+ shr eax,5
+ movd xmm7,eax
+ pxor xmm1,xmm1
+ pshufb xmm7,xmm1
+ pxor xmm4,xmm4
+ pxor xmm3,xmm3
+ pxor xmm2,xmm2
+;sad begin
+ mov eax, [esp+24]
+ mov ebx, [esp+28]
+ lea esi, [ebx+2*ebx]
+ SSSE3_Get16BSadHVDC [edi], [eax]
+ SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
+ SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
+ SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
+ add edi, 64
+ lea eax, [eax+4*ebx]
+ SSSE3_Get16BSadHVDC [edi], [eax]
+ SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
+ SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
+ SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
+ add edi, 64
+ lea eax, [eax+4*ebx]
+ SSSE3_Get16BSadHVDC [edi], [eax]
+ SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
+ SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
+ SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
+ add edi, 64
+ lea eax, [eax+4*ebx]
+ SSSE3_Get16BSadHVDC [edi], [eax]
+ SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
+ SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
+ SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
+
+ pslldq xmm3,4
+ por xmm3,xmm2
+ movhlps xmm1,xmm3
+ paddw xmm3,xmm1
+ movhlps xmm0,xmm4
+ paddw xmm4,xmm0
+; comparing order: DC H V
+ movd ebx, xmm4 ;DC
+ movd ecx, xmm3 ;V
+ psrldq xmm3, 4
+ movd esi, xmm3 ;H
+ mov eax, [esp+36] ;lamda
+ shl eax, 1
+ add esi, eax
+ add ebx, eax
+ mov edx, [esp+32]
+ cmp ebx, esi
+ jge near not_dc_16x16_sad
+ cmp ebx, ecx
+ jge near not_dc_h_16x16_sad
+ ; for DC mode
+ mov dword[edx], 2;I16_PRED_DC
+ mov eax, ebx
+ sub edi, 192
+%assign x 0
+%rep 16
+ movdqa [edi+16*x], xmm7
+%assign x x+1
+%endrep
+ jmp near return_sad_intra_16x16_x3
+not_dc_16x16_sad:
+ ; for H mode
+ cmp esi, ecx
+ jge near not_dc_h_16x16_sad
+ mov dword[edx], 1;I16_PRED_H
+ mov eax, esi
+ jmp near return_sad_intra_16x16_x3
+not_dc_h_16x16_sad:
+ ; for V mode
+ mov dword[edx], 0;I16_PRED_V
+ mov eax, ecx
+ sub edi, 192
+%assign x 0
+%rep 16
+ movdqa [edi+16*x], xmm5
+%assign x x+1
+%endrep
+return_sad_intra_16x16_x3:
+ pop edi
+ pop esi
+ pop ebx
+ ret
+%endif
+;***********************************************************************
+;
+;Pixel_sad_intra_ssse3 END
+;
+;***********************************************************************
+;***********************************************************************
+;
+;Pixel_satd_wxh_sse41 BEGIN
+;
+;***********************************************************************
+
+;SSE4.1
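+; SSE41_GetSatd8x4: SATD of an 8x4 tile. pmaddubsw against HSumSubDB1 (kept
+; in xmm7) yields the first horizontal butterfly for free (pair sums in the
+; low qword, pair differences in the high one); SSE2_HDMTwo4x4 supplies the
+; vertical stages. The pblendw/shift/por pairing at the end appears to use
+; the identity |a+b| + |a-b| = 2*max(|a|,|b|) so that pmaxuw replaces the
+; last butterfly, which would also explain why the sse41 SATD routines skip
+; the final psrlw-by-1 of the sse2 ones.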
+%macro SSE41_GetSatd8x4 0
+ movq xmm0, [r0]
+ punpcklqdq xmm0, xmm0
+ pmaddubsw xmm0, xmm7
+ movq xmm1, [r0+r1]
+ punpcklqdq xmm1, xmm1
+ pmaddubsw xmm1, xmm7
+ movq xmm2, [r2]
+ punpcklqdq xmm2, xmm2
+ pmaddubsw xmm2, xmm7
+ movq xmm3, [r2+r3]
+ punpcklqdq xmm3, xmm3
+ pmaddubsw xmm3, xmm7
+ psubsw xmm0, xmm2
+ psubsw xmm1, xmm3
+ movq xmm2, [r0+2*r1]
+ punpcklqdq xmm2, xmm2
+ pmaddubsw xmm2, xmm7
+ movq xmm3, [r0+r4]
+ punpcklqdq xmm3, xmm3
+ pmaddubsw xmm3, xmm7
+ movq xmm4, [r2+2*r3]
+ punpcklqdq xmm4, xmm4
+ pmaddubsw xmm4, xmm7
+ movq xmm5, [r2+r5]
+ punpcklqdq xmm5, xmm5
+ pmaddubsw xmm5, xmm7
+ psubsw xmm2, xmm4
+ psubsw xmm3, xmm5
+ SSE2_HDMTwo4x4 xmm0, xmm1, xmm2, xmm3, xmm4
+ pabsw xmm0, xmm0
+ pabsw xmm2, xmm2
+ pabsw xmm1, xmm1
+ pabsw xmm3, xmm3
+ movdqa xmm4, xmm3
+ pblendw xmm3, xmm1, 0xAA
+ pslld xmm1, 16
+ psrld xmm4, 16
+ por xmm1, xmm4
+ pmaxuw xmm1, xmm3
+ paddw xmm6, xmm1
+ movdqa xmm4, xmm0
+ pblendw xmm0, xmm2, 0xAA
+ pslld xmm2, 16
+ psrld xmm4, 16
+ por xmm2, xmm4
+ pmaxuw xmm0, xmm2
+ paddw xmm6, xmm0
+%endmacro
+
+%macro SSSE3_SumWHorizon 4 ;retrd, srcSSE, tempSSE, tempSSE
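+    ; horizontally add the eight words of %2 into the GPR %1: pmaddwd by a
+    ; register of 1s widens to dwords, two fold/add steps follow, movd extracts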
+ MMX_DW_1_2REG %3, %4
+ pmaddwd %2, %3
+ movhlps %4, %2
+ paddd %2, %4
+ pshuflw %4, %2,0Eh
+ paddd %2, %4
+ movd %1, %2
+%endmacro
+;***********************************************************************
+;
+;int32_t WelsSampleSatd4x4_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd4x4_sse41
+WelsSampleSatd4x4_sse41:
+ ;push ebx
+ ;mov eax,[esp+8]
+ ;mov ebx,[esp+12]
+ ;mov ecx,[esp+16]
+ ;mov edx,[esp+20]
+
+ %assign push_num 0
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ movdqa xmm4,[HSwapSumSubDB1]
+ movd xmm2,[r2]
+ movd xmm5,[r2+r3]
+ shufps xmm2,xmm5,0
+ movd xmm3,[r2+r3*2]
+ lea r2, [r3*2+r2]
+ movd xmm5,[r2+r3]
+ shufps xmm3,xmm5,0
+ movd xmm0,[r0]
+ movd xmm5,[r0+r1]
+ shufps xmm0,xmm5,0
+ movd xmm1,[r0+r1*2]
+ lea r0, [r1*2+r0]
+ movd xmm5,[r0+r1]
+ shufps xmm1,xmm5,0
+ pmaddubsw xmm0,xmm4
+ pmaddubsw xmm1,xmm4
+ pmaddubsw xmm2,xmm4
+ pmaddubsw xmm3,xmm4
+ psubw xmm0,xmm2
+ psubw xmm1,xmm3
+ movdqa xmm2,xmm0
+ paddw xmm0,xmm1
+ psubw xmm1,xmm2
+ movdqa xmm2,xmm0
+ punpcklqdq xmm0,xmm1
+ punpckhqdq xmm2,xmm1
+ movdqa xmm1,xmm0
+ paddw xmm0,xmm2
+ psubw xmm2,xmm1
+ movdqa xmm1,xmm0
+ pblendw xmm0,xmm2,0AAh
+ pslld xmm2,16
+ psrld xmm1,16
+ por xmm2,xmm1
+ pabsw xmm0,xmm0
+ pabsw xmm2,xmm2
+ pmaxsw xmm0,xmm2
+ SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7
+ LOAD_4_PARA_POP
+ ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd8x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd8x8_sse41
+align 16
+WelsSampleSatd8x8_sse41:
+ ;push ebx
+ ;push esi
+ ;push edi
+ ;mov eax, [esp+16]
+ ;mov ebx, [esp+20]
+ ;mov ecx, [esp+24]
+ ;mov edx, [esp+28]
+%ifdef X86_32
+ push r4
+ push r5
+%endif
+ %assign push_num 2
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ movdqa xmm7, [HSumSubDB1]
+ lea r4, [r1+r1*2]
+ lea r5, [r3+r3*2]
+ pxor xmm6, xmm6
+ SSE41_GetSatd8x4
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ SSE41_GetSatd8x4
+ SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+ LOAD_4_PARA_POP
+%ifdef X86_32
+ pop r5
+ pop r4
+%endif
+ ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd8x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd8x16_sse41
+align 16
+WelsSampleSatd8x16_sse41:
+ ;push ebx
+ ;push esi
+ ;push edi
+ ;push ebp
+ ;%define pushsize 16
+ ;mov eax, [esp+pushsize+4]
+ ;mov ebx, [esp+pushsize+8]
+ ;mov ecx, [esp+pushsize+12]
+ ;mov edx, [esp+pushsize+16]
+%ifdef X86_32
+ push r4
+ push r5
+ push r6
+%endif
+ %assign push_num 3
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ movdqa xmm7, [HSumSubDB1]
+ lea r4, [r1+r1*2]
+ lea r5, [r3+r3*2]
+ pxor xmm6, xmm6
+ mov r6, 0
+loop_get_satd_8x16:
+ SSE41_GetSatd8x4
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ inc r6
+ cmp r6, 4
+ jl loop_get_satd_8x16
+ SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+ LOAD_4_PARA_POP
+%ifdef X86_32
+ pop r6
+ pop r5
+ pop r4
+%endif
+ ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd16x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd16x8_sse41
+align 16
+WelsSampleSatd16x8_sse41:
+ ;push ebx
+ ;push esi
+ ;push edi
+ ;mov eax, [esp+16]
+ ;mov ebx, [esp+20]
+ ;mov ecx, [esp+24]
+ ;mov edx, [esp+28]
+%ifdef X86_32
+ push r4
+ push r5
+%endif
+ %assign push_num 2
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ push r0
+ push r2
+
+ movdqa xmm7, [HSumSubDB1]
+ lea r4, [r1+r1*2]
+ lea r5, [r3+r3*2]
+ pxor xmm6, xmm6
+ SSE41_GetSatd8x4
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ SSE41_GetSatd8x4
+
+ pop r2
+ pop r0
+ ;mov eax, [esp+16]
+ ;mov ecx, [esp+24]
+ add r0, 8
+ add r2, 8
+ SSE41_GetSatd8x4
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ SSE41_GetSatd8x4
+ SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+ LOAD_4_PARA_POP
+%ifdef X86_32
+ pop r5
+ pop r4
+%endif
+ ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd16x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+;***********************************************************************
+
+WELS_EXTERN WelsSampleSatd16x16_sse41
+align 16
+WelsSampleSatd16x16_sse41:
+ ;push ebx
+ ;push esi
+ ;push edi
+ ;push ebp
+ ;%define pushsize 16
+ ;mov eax, [esp+pushsize+4]
+ ;mov ebx, [esp+pushsize+8]
+ ;mov ecx, [esp+pushsize+12]
+ ;mov edx, [esp+pushsize+16]
+%ifdef X86_32
+ push r4
+ push r5
+ push r6
+%endif
+ %assign push_num 3
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+
+ push r0
+ push r2
+
+ movdqa xmm7, [HSumSubDB1]
+ lea r4, [r1+r1*2]
+ lea r5, [r3+r3*2]
+ pxor xmm6, xmm6
+ mov r6, 0
+loop_get_satd_16x16_left:
+ SSE41_GetSatd8x4
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ inc r6
+ cmp r6, 4
+ jl loop_get_satd_16x16_left
+
+ pop r2
+ pop r0
+ ;mov eax, [esp+pushsize+4]
+ ;mov ecx, [esp+pushsize+12]
+ add r0, 8
+ add r2, 8
+ mov r6, 0
+loop_get_satd_16x16_right:
+ SSE41_GetSatd8x4
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ inc r6
+ cmp r6, 4
+ jl loop_get_satd_16x16_right
+ SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+ ;%undef pushsize
+ LOAD_4_PARA_POP
+%ifdef X86_32
+ pop r6
+ pop r5
+ pop r4
+%endif
+ ret
+
+;***********************************************************************
+;
+;Pixel_satd_wxh_sse41 END
+;
+;***********************************************************************
+
+;***********************************************************************
+;
+;Pixel_sad_wxh_sse2 BEGIN
+;
+;***********************************************************************
+
+%macro SSE2_GetSad2x16 0
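+    ; step to the next two rows and add their 16-byte SADs into xmm0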
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqu xmm1, [r2]
+    MOVDQ xmm2, [r0] ;[r0] must be 16-byte aligned
+ psadbw xmm1, xmm2
+ paddw xmm0, xmm1
+ movdqu xmm1, [r2+r3]
+ MOVDQ xmm2, [r0+r1]
+ psadbw xmm1, xmm2
+ paddw xmm0, xmm1
+%endmacro
+
+
+%macro SSE2_GetSad4x16 0
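+    ; SAD of four 16-byte rows: unaligned loads from r2 against MOVDQ
+    ; (alignable) loads from r0, accumulated into xmm7; r4/r5 hold 3*stride,
+    ; set by the caller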
+ movdqu xmm0, [r2]
+ MOVDQ xmm2, [r0]
+ psadbw xmm0, xmm2
+ paddw xmm7, xmm0
+ movdqu xmm1, [r2+r3]
+ MOVDQ xmm2, [r0+r1]
+ psadbw xmm1, xmm2
+ paddw xmm7, xmm1
+ movdqu xmm1, [r2+2*r3]
+    MOVDQ xmm2, [r0+2*r1] ;[r0] must be 16-byte aligned
+ psadbw xmm1, xmm2
+ paddw xmm7, xmm1
+ movdqu xmm1, [r2+r5]
+ MOVDQ xmm2, [r0+r4]
+ psadbw xmm1, xmm2
+ paddw xmm7, xmm1
+%endmacro
+
+
+%macro SSE2_GetSad8x4 0
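+    ; SAD of four 8-byte rows packed two per xmm register, accumulated into
+    ; xmm6; leaves r0/r2 advanced by two rows (callers advance the other two)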
+ movq xmm0, [r0]
+ movq xmm1, [r0+r1]
+ lea r0, [r0+2*r1]
+ movhps xmm0, [r0]
+ movhps xmm1, [r0+r1]
+
+ movq xmm2, [r2]
+ movq xmm3, [r2+r3]
+ lea r2, [r2+2*r3]
+ movhps xmm2, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm2
+ psadbw xmm1, xmm3
+ paddw xmm6, xmm0
+ paddw xmm6, xmm1
+%endmacro
+
+;***********************************************************************
+;
+;int32_t WelsSampleSad16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t )
+;The first parameter can be aligned to 16 bytes;
+;in wels, the third parameter cannot be assumed to be 16-byte aligned.
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSad16x16_sse2
+align 16
+WelsSampleSad16x16_sse2:
+ ;push ebx
+ ;push edi
+ ;push esi
+ ;%define _STACK_SIZE 12
+ ;mov eax, [esp+_STACK_SIZE+4 ]
+ ;mov ebx, [esp+_STACK_SIZE+8 ]
+ ;mov ecx, [esp+_STACK_SIZE+12]
+ ;mov edx, [esp+_STACK_SIZE+16]
+%ifdef X86_32
+ push r4
+ push r5
+%endif
+
+ %assign push_num 2
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ lea r4, [3*r1]
+ lea r5, [3*r3]
+
+ pxor xmm7, xmm7
+ SSE2_GetSad4x16
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ SSE2_GetSad4x16
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ SSE2_GetSad4x16
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ SSE2_GetSad4x16
+ movhlps xmm0, xmm7
+ paddw xmm0, xmm7
+ movd retrd, xmm0
+ LOAD_4_PARA_POP
+%ifdef X86_32
+ pop r5
+ pop r4
+%endif
+ ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSad16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t )
+;The first parameter can be aligned to 16 bytes;
+;in wels, the third parameter cannot be assumed to be 16-byte aligned.
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSad16x8_sse2
+align 16
+WelsSampleSad16x8_sse2:
+ ;push ebx
+ ;mov eax, [esp+8]
+ ;mov ebx, [esp+12]
+ ;mov ecx, [esp+16]
+ ;mov edx, [esp+20]
+
+ %assign push_num 0
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ movdqu xmm0, [r2]
+ MOVDQ xmm2, [r0]
+ psadbw xmm0, xmm2
+ movdqu xmm1, [r2+r3]
+ MOVDQ xmm2, [r0+r1]
+ psadbw xmm1, xmm2
+ paddw xmm0, xmm1
+
+ SSE2_GetSad2x16
+ SSE2_GetSad2x16
+ SSE2_GetSad2x16
+
+ movhlps xmm1, xmm0
+ paddw xmm0, xmm1
+ movd retrd, xmm0
+ LOAD_4_PARA_POP
+ ret
+
+
+
+WELS_EXTERN WelsSampleSad8x16_sse2
+WelsSampleSad8x16_sse2:
+ ;push ebx
+ ;mov eax, [esp+8]
+ ;mov ebx, [esp+12]
+ ;mov ecx, [esp+16]
+ ;mov edx, [esp+20]
+
+ %assign push_num 0
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ pxor xmm6, xmm6
+
+ SSE2_GetSad8x4
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_GetSad8x4
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_GetSad8x4
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_GetSad8x4
+
+ movhlps xmm0, xmm6
+ paddw xmm0, xmm6
+ movd retrd, xmm0
+ LOAD_4_PARA_POP
+ ret
+
+
+%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
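+    ; with width=8 and cacheline=64 as used below: offset = addr & 0x3f,
+    ; compared against 64-8, so the jle that follows each use takes the
+    ; non-split path whenever an 8-byte load stays inside one 64-byte line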
+    and %1, 0x1f|(%3>>1)
+    cmp %1, (32-%2)|(%3>>1)
+%endmacro
+
+WELS_EXTERN WelsSampleSad8x8_sse21
+WelsSampleSad8x8_sse21:
+ ;mov ecx, [esp+12]
+ ;mov edx, ecx
+ ;CACHE_SPLIT_CHECK edx, 8, 64
+ ;jle near .pixel_sad_8x8_nsplit
+ ;push ebx
+ ;push edi
+ ;mov eax, [esp+12]
+ ;mov ebx, [esp+16]
+
+ %assign push_num 0
+ mov r2, arg3
+ push r2
+ CACHE_SPLIT_CHECK r2, 8, 64
+ jle near .pixel_sad_8x8_nsplit
+ pop r2
+%ifdef X86_32
+ push r3
+ push r4
+ push r5
+%endif
+ %assign push_num 3
+ mov r0, arg1
+ mov r1, arg2
+ SIGN_EXTENTION r1, r1d
+ pxor xmm7, xmm7
+
+ ;ecx r2, edx r4, edi r5
+
+ mov r5, r2
+ and r5, 0x07
+ sub r2, r5
+ mov r4, 8
+ sub r4, r5
+
+ shl r5, 3
+ shl r4, 3
+ movd xmm5, r5d
+ movd xmm6, r4d
+ mov r5, 8
+ add r5, r2
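+    ; r2 was aligned down to an 8-byte boundary and r5 = r2 + 8; each
+    ; unaligned reference row is rebuilt as ([r2] >> 8*misalign) |
+    ; ([r2+8] << 8*(8-misalign)), with the two bit counts kept in xmm5/xmm6
+    ; (psrlq/psllq apply one count to both qword lanes, splicing two rows
+    ; at once)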
+ mov r3, arg4
+ SIGN_EXTENTION r3, r3d
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+
+ movq xmm1, [r2]
+ movq xmm2, [r5]
+ movhps xmm1, [r2+r3]
+ movhps xmm2, [r5+r3]
+ psrlq xmm1, xmm5
+ psllq xmm2, xmm6
+ por xmm1, xmm2
+
+ psadbw xmm0, xmm1
+ paddw xmm7, xmm0
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ lea r5, [r5+2*r3]
+
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+
+ movq xmm1, [r2]
+ movq xmm2, [r5]
+ movhps xmm1, [r2+r3]
+ movhps xmm2, [r5+r3]
+ psrlq xmm1, xmm5
+ psllq xmm2, xmm6
+ por xmm1, xmm2
+
+ psadbw xmm0, xmm1
+ paddw xmm7, xmm0
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ lea r5, [r5+2*r3]
+
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+
+ movq xmm1, [r2]
+ movq xmm2, [r5]
+ movhps xmm1, [r2+r3]
+ movhps xmm2, [r5+r3]
+ psrlq xmm1, xmm5
+ psllq xmm2, xmm6
+ por xmm1, xmm2
+
+ psadbw xmm0, xmm1
+ paddw xmm7, xmm0
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ lea r5, [r5+2*r3]
+
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+
+ movq xmm1, [r2]
+ movq xmm2, [r5]
+ movhps xmm1, [r2+r3]
+ movhps xmm2, [r5+r3]
+ psrlq xmm1, xmm5
+ psllq xmm2, xmm6
+ por xmm1, xmm2
+
+ psadbw xmm0, xmm1
+ paddw xmm7, xmm0
+
+ movhlps xmm0, xmm7
+ paddw xmm0, xmm7
+ movd retrd, xmm0
+%ifdef X86_32
+ pop r5
+ pop r4
+ pop r3
+%endif
+ jmp .return
+
+.pixel_sad_8x8_nsplit:
+ ;push ebx
+ ;mov eax, [esp+8]
+ ;mov ebx, [esp+12]
+ ;mov edx, [esp+20]
+
+ pop r2
+ %assign push_num 0
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ pxor xmm6, xmm6
+ SSE2_GetSad8x4
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_GetSad8x4
+ movhlps xmm0, xmm6
+ paddw xmm0, xmm6
+ movd retrd, xmm0
+ LOAD_4_PARA_POP
+.return:
+ ret
+
+
+;***********************************************************************
+;
+;Pixel_sad_wxh_sse2 END
+;
+;***********************************************************************
+
+
+;***********************************************************************
+;
+;Pixel_sad_4_wxh_sse2 BEGIN
+;
+;***********************************************************************
+
+
+%macro SSE2_Get4LW16Sad 5 ;s-1l, s, s+1l, d, address
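+    ; one reference row [%5] is matched against three source rows: %1 (one
+    ; line up, feeding the pRefMb+stride candidate in xmm5), %3 (one line
+    ; down, pRefMb-stride in xmm4) and %2 (same line, the -1/+1 candidates in
+    ; xmm6/xmm7); %1 and %4 are clobbered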
+ psadbw %1, %4
+ paddw xmm5, %1
+ psadbw %4, %3
+ paddw xmm4, %4
+ movdqu %4, [%5-1]
+ psadbw %4, %2
+ paddw xmm6, %4
+ movdqu %4, [%5+1]
+ psadbw %4, %2
+ paddw xmm7, %4
+%endmacro
+WELS_EXTERN WelsSampleSadFour16x16_sse2
+WelsSampleSadFour16x16_sse2:
+ ;push ebx
+ ;mov eax, [esp+8]
+ ;mov ebx, [esp+12]
+ ;mov ecx, [esp+16]
+ ;mov edx, [esp+20]
+
+ %assign push_num 0
+ LOAD_5_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
+ pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
+ pxor xmm6, xmm6 ;sad pRefMb-1
+ pxor xmm7, xmm7 ;sad pRefMb+1
+ movdqa xmm0, [r0]
+ sub r2, r3
+ movdqu xmm3, [r2]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movdqa xmm1, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ psadbw xmm3, xmm1
+ paddw xmm4, xmm3
+
+ movdqu xmm2, [r2+r3-1]
+ psadbw xmm2, xmm0
+ paddw xmm6, xmm2
+
+ movdqu xmm3, [r2+r3+1]
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm2, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+ movdqa xmm0, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm1, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
+ movdqa xmm2, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm0, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
+ movdqa xmm1, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm2, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+ movdqa xmm0, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm1, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
+ movdqa xmm2, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm0, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
+ movdqa xmm1, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm2, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+ movdqa xmm0, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+ lea r2, [r2+2*r3]
+ movdqu xmm3, [r2]
+ psadbw xmm2, xmm3
+ paddw xmm5, xmm2
+
+ movdqu xmm2, [r2-1]
+ psadbw xmm2, xmm0
+ paddw xmm6, xmm2
+
+ movdqu xmm3, [r2+1]
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movdqu xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ ;mov ecx, [esp+24]
+ movhlps xmm0, xmm4
+ paddw xmm4, xmm0
+ movhlps xmm0, xmm5
+ paddw xmm5, xmm0
+ movhlps xmm0, xmm6
+ paddw xmm6, xmm0
+ movhlps xmm0, xmm7
+ paddw xmm7, xmm0
+ punpckldq xmm4, xmm5
+ punpckldq xmm6, xmm7
+ punpcklqdq xmm4, xmm6
+ movdqa [r4],xmm4
+ LOAD_5_PARA_POP
+ ret
+
+
+WELS_EXTERN WelsSampleSadFour16x8_sse2
+WelsSampleSadFour16x8_sse2:
+ ;push ebx
+ ;push edi
+ ;mov eax, [esp+12]
+ ;mov ebx, [esp+16]
+ ;mov edi, [esp+20]
+ ;mov edx, [esp+24]
+
+ %assign push_num 0
+ LOAD_5_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
+ pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
+ pxor xmm6, xmm6 ;sad pRefMb-1
+ pxor xmm7, xmm7 ;sad pRefMb+1
+ movdqa xmm0, [r0]
+ sub r2, r3
+ movdqu xmm3, [r2]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movdqa xmm1, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ psadbw xmm3, xmm1
+ paddw xmm4, xmm3
+
+ movdqu xmm2, [r2+r3-1]
+ psadbw xmm2, xmm0
+ paddw xmm6, xmm2
+
+ movdqu xmm3, [r2+r3+1]
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm2, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+ movdqa xmm0, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm1, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
+ movdqa xmm2, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movdqa xmm0, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
+ movdqa xmm1, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
+ lea r2, [r2+2*r3]
+ movdqu xmm3, [r2]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ movdqu xmm0, [r2-1]
+ psadbw xmm0, xmm1
+ paddw xmm6, xmm0
+
+ movdqu xmm3, [r2+1]
+ psadbw xmm3, xmm1
+ paddw xmm7, xmm3
+
+ movdqu xmm3, [r2+r3]
+ psadbw xmm1, xmm3
+ paddw xmm5, xmm1
+
+ ;mov edi, [esp+28]
+ movhlps xmm0, xmm4
+ paddw xmm4, xmm0
+ movhlps xmm0, xmm5
+ paddw xmm5, xmm0
+ movhlps xmm0, xmm6
+ paddw xmm6, xmm0
+ movhlps xmm0, xmm7
+ paddw xmm7, xmm0
+ punpckldq xmm4, xmm5
+ punpckldq xmm6, xmm7
+ punpcklqdq xmm4, xmm6
+ movdqa [r4],xmm4
+ LOAD_5_PARA_POP
+ ret
+
+WELS_EXTERN WelsSampleSadFour8x16_sse2
+WelsSampleSadFour8x16_sse2:
+ ;push ebx
+ ;push edi
+ ;mov eax, [esp+12]
+ ;mov ebx, [esp+16]
+ ;mov edi, [esp+20]
+ ;mov edx, [esp+24]
+
+ %assign push_num 0
+ LOAD_5_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
+ pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
+ pxor xmm6, xmm6 ;sad pRefMb-1
+ pxor xmm7, xmm7 ;sad pRefMb+1
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ sub r2, r3
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ ;mov edi, [esp+28]
+ movhlps xmm0, xmm4
+ paddw xmm4, xmm0
+ movhlps xmm0, xmm5
+ paddw xmm5, xmm0
+ movhlps xmm0, xmm6
+ paddw xmm6, xmm0
+ movhlps xmm0, xmm7
+ paddw xmm7, xmm0
+ punpckldq xmm4, xmm5
+ punpckldq xmm6, xmm7
+ punpcklqdq xmm4, xmm6
+ movdqa [r4],xmm4
+ LOAD_5_PARA_POP
+ ret
+
+
+WELS_EXTERN WelsSampleSadFour8x8_sse2
+WelsSampleSadFour8x8_sse2:
+ ;push ebx
+ ;push edi
+ ;mov eax, [esp+12]
+ ;mov ebx, [esp+16]
+ ;mov edi, [esp+20]
+ ;mov edx, [esp+24]
+
+ %assign push_num 0
+ LOAD_5_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
+ pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
+ pxor xmm6, xmm6 ;sad pRefMb-1
+ pxor xmm7, xmm7 ;sad pRefMb+1
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ sub r2, r3
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ ;mov edi, [esp+28]
+ movhlps xmm0, xmm4
+ paddw xmm4, xmm0
+ movhlps xmm0, xmm5
+ paddw xmm5, xmm0
+ movhlps xmm0, xmm6
+ paddw xmm6, xmm0
+ movhlps xmm0, xmm7
+ paddw xmm7, xmm0
+ punpckldq xmm4, xmm5
+ punpckldq xmm6, xmm7
+ punpcklqdq xmm4, xmm6
+ movdqa [r4],xmm4
+ LOAD_5_PARA_POP
+ ret
+
+WELS_EXTERN WelsSampleSadFour4x4_sse2
+WelsSampleSadFour4x4_sse2:
+ ;push ebx
+ ;push edi
+ ;mov eax, [esp+12]
+ ;mov ebx, [esp+16]
+ ;mov edi, [esp+20]
+ ;mov edx, [esp+24]
+
+ %assign push_num 0
+ LOAD_5_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ movd xmm0, [r0]
+ movd xmm1, [r0+r1]
+ lea r0, [r0+2*r1]
+ movd xmm2, [r0]
+ movd xmm3, [r0+r1]
+ punpckldq xmm0, xmm1
+ punpckldq xmm2, xmm3
+ punpcklqdq xmm0, xmm2
+ sub r2, r3
+ movd xmm1, [r2]
+ movd xmm2, [r2+r3]
+ punpckldq xmm1, xmm2
+ movd xmm2, [r2+r3-1]
+ movd xmm3, [r2+r3+1]
+
+ lea r2, [r2+2*r3]
+
+ movd xmm4, [r2]
+ movd xmm5, [r2-1]
+ punpckldq xmm2, xmm5
+ movd xmm5, [r2+1]
+ punpckldq xmm3, xmm5
+
+ movd xmm5, [r2+r3]
+ punpckldq xmm4, xmm5
+
+ punpcklqdq xmm1, xmm4 ;-L
+
+ movd xmm5, [r2+r3-1]
+ movd xmm6, [r2+r3+1]
+
+ lea r2, [r2+2*r3]
+ movd xmm7, [r2-1]
+ punpckldq xmm5, xmm7
+ punpcklqdq xmm2, xmm5 ;-1
+ movd xmm7, [r2+1]
+ punpckldq xmm6, xmm7
+ punpcklqdq xmm3, xmm6 ;+1
+ movd xmm6, [r2]
+ movd xmm7, [r2+r3]
+ punpckldq xmm6, xmm7
+ punpcklqdq xmm4, xmm6 ;+L
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+ psadbw xmm4, xmm0
+
+ movhlps xmm0, xmm1
+ paddw xmm1, xmm0
+ movhlps xmm0, xmm2
+ paddw xmm2, xmm0
+ movhlps xmm0, xmm3
+ paddw xmm3, xmm0
+ movhlps xmm0, xmm4
+ paddw xmm4, xmm0
+ ;mov edi, [esp+28]
+ punpckldq xmm1, xmm4
+ punpckldq xmm2, xmm3
+ punpcklqdq xmm1, xmm2
+ movdqa [r4],xmm1
+ LOAD_5_PARA_POP
+ ret
+
+;***********************************************************************
+;
+;Pixel_sad_4_wxh_sse2 END
+;
+;***********************************************************************
+
+WELS_EXTERN WelsSampleSad4x4_mmx
+
+align 16
+;***********************************************************************
+; int32_t __cdecl WelsSampleSad4x4_mmx (uint8_t *, int32_t, uint8_t *, int32_t )
+;***********************************************************************
+WelsSampleSad4x4_mmx:
+ ;push ebx
+ ;%define pushsize 4
+ ;%define pix1address esp+pushsize+4
+ ;%define pix1stride esp+pushsize+8
+ ;%define pix2address esp+pushsize+12
+ ;%define pix2stride esp+pushsize+16
+ ;mov eax, [pix1address]
+ ;mov ebx, [pix1stride ]
+ ;mov ecx, [pix2address]
+ ;mov edx, [pix2stride ]
+
+ %assign push_num 0
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ movd mm0, [r0]
+ movd mm1, [r0+r1]
+ punpckldq mm0, mm1
+
+ movd mm3, [r2]
+ movd mm4, [r2+r3]
+ punpckldq mm3, mm4
+ psadbw mm0, mm3
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+
+ movd mm1, [r0]
+ movd mm2, [r0+r1]
+ punpckldq mm1, mm2
+
+ movd mm3, [r2]
+ movd mm4, [r2+r3]
+ punpckldq mm3, mm4
+ psadbw mm1, mm3
+ paddw mm0, mm1
+
+ movd retrd, mm0
+
+ WELSEMMS
+ LOAD_4_PARA_POP
+ ret
--- a/codec/processing/src/asm/denoisefilter.asm
+++ b/codec/processing/src/asm/denoisefilter.asm
@@ -179,15 +179,15 @@
;%define stride r1
BilateralLumaFilter8_sse2:
-
- push r3
+
+ push r3
%assign push_num 1
LOAD_2_PARA
pxor xmm7, xmm7
-
+
mov r3, r0
-
+
movq xmm6, [r0]
punpcklbw xmm6, xmm7
movdqa xmm3, [sse2_32]
@@ -218,10 +218,10 @@
packuswb xmm5, xmm5
movq [r3], xmm5
-
+
pop r3
%assign push_num 0
-
+
ret
WELS_EXTERN WaverageChromaFilter8_sse2
@@ -239,11 +239,11 @@
WaverageChromaFilter8_sse2:
push r3
-
+
%assign push_num 1
-
+
LOAD_2_PARA
-
+
mov r3, r1
add r3, r3
sub r0, r3 ; pixels - 2 * stride
@@ -272,8 +272,8 @@
packuswb xmm3, xmm3
movq [r0 + 2], xmm3
-
+
pop r3
-
+
%assign push_num 0
ret
--- a/codec/processing/src/asm/sad.asm
+++ b/codec/processing/src/asm/sad.asm
@@ -84,7 +84,7 @@
;push edi
;mov eax, [esp+12]
;mov ebx, [esp+16]
-
+
%assign push_num 0
mov r2, arg3
push r2
@@ -91,7 +91,7 @@
CACHE_SPLIT_CHECK r2, 8, 64
jle near .pixel_sad_8x8_nsplit
pop r2
-%ifdef X86_32
+%ifdef X86_32
push r3
push r4
push r5
@@ -98,10 +98,10 @@
%endif
%assign push_num 3
mov r0, arg1
- mov r1, arg2
+ mov r1, arg2
SIGN_EXTENTION r1, r1d
pxor xmm7, xmm7
-
+
;ecx r2, edx r4, edi r5
mov r5, r2
@@ -195,18 +195,18 @@
pop r3
%endif
jmp .return
-
+
.pixel_sad_8x8_nsplit:
;push ebx
;mov eax, [esp+8]
;mov ebx, [esp+12]
;mov edx, [esp+20]
-
+
pop r2
%assign push_num 0
LOAD_4_PARA
SIGN_EXTENTION r1, r1d
- SIGN_EXTENTION r3, r3d
+ SIGN_EXTENTION r3, r3d
pxor xmm6, xmm6
SSE2_GetSad8x4
lea r0, [r0+2*r1]