ref: 62fb37d0960afd5b98d76c0f05e8174a708120bc
parent: 732e1c5f789f8d2e280975f9fa23af7025cb6036
author: Sindre Aamås <[email protected]>
date: Tue Feb 9 18:28:14 EST 2016
[Common/x86] DeblockLumaEq4_ssse3 optimizations: Use packed 8-bit operations rather than unpacking to 16-bit. Minimize spills. ~2.31x speedup on Haswell (x86-64). ~2.40x speedup on Haswell (x86 32-bit).
--- a/codec/common/x86/deblock.asm
+++ b/codec/common/x86/deblock.asm
@@ -51,10 +51,12 @@
FOUR_16B_SSE2: dw 4, 4, 4, 4, 4, 4, 4, 4
ALIGN 16
-WELS_DB96_16:
- times 16 db 96
+WELS_DB1_16:
+ times 16 db 1
WELS_DB127_16:
times 16 db 127
+WELS_DB96_16:
+ times 16 db 96
WELS_SHUFB0000111122223333:
times 4 db 0
times 4 db 1
@@ -85,6 +87,12 @@
pcmpgtb %1, %2
%endmacro
+; Unsigned byte compare greater than or equal: each byte of %1 becomes 0xff where %1 >= %2 (unsigned), else 0x00. %2 is only read.
+%macro SSE2_CmpgeUB 2
+ pminub %1, %2 ; %1 = min(%1, %2), unsigned per byte
+ pcmpeqb %1, %2 ; min equals %2 exactly when the original %1 was >= %2
+%endmacro
+
; Clip unsigned bytes to ref +/- diff.
; data=%1 ref=%2 maxdiff_from_ref=%3 clobber=%4
%macro SSE2_ClipUB 4
@@ -95,7 +103,34 @@
pminub %1, %3
%endmacro
+; (a + b + 1 - c) >> 1 per unsigned byte, for c in {0, 1}; computed as pavgb(a, b) - ((a ^ b) & c). Result replaces a.
+; a=%1 b=%2 c=%3 [out:a^b&c]=%4
+%macro SSE2_AvgbFloor1 4
+ movdqa %4, %1 ; %4 = a
+ pxor %4, %2 ; %4 = a ^ b (bit 0 tells whether a + b is odd)
+ pavgb %1, %2 ; %1 = (a + b + 1) >> 1, i.e. the rounded-up average
+ pand %4, %3 ; %4 = (a ^ b) & c
+ psubb %1, %4 ; undo the round-up where c selects it and a + b is odd
+%endmacro
+; (a + b + carry) >> 1 per unsigned byte, carry in {0, 1}. Result replaces a; b (%2) is also modified.
+; a=%1 b=%2 carry-1=%3 (i.e. %3 is 0x00 for carry=1 and 0xff for carry=0)
+%macro SSE2_AvgbFloor2 3
+ pxor %1, %3 ; complement a where carry=0
+ pxor %2, %3 ; complement b where carry=0 (b is left in this state)
+ pavgb %1, %2 ; rounded-up average of the (possibly complemented) inputs
+ pxor %1, %3 ; complement back: ~avg_up(~a, ~b) equals the rounded-down average of a and b
+%endmacro
+
+; a = (a & m) | (b & ~m); bitwise select, result replaces a. The mask operand m is clobbered (left holding b & ~m).
+; a=%1 b=%2 m=%3
+%macro SSE2_Blend 3
+ pand %1, %3 ; keep a where the mask is set
+ pandn %3, %2 ; %3 = ~m & b; this overwrites the mask
+ por %1, %3 ; merge the two halves
+%endmacro
+
+
;*******************************************************************************
; void DeblockLumaLt4V_ssse3(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
; int32_t iBeta, int8_t * pTC)
@@ -211,469 +246,138 @@
ret
-%ifdef WIN64
+; Deblock 3x16 luma pixels for the eq4 case.
+; Filters one side of the edge: rows p0, p1 and p2 are rewritten in memory; p3 is only read.
+; Compose 8-bit averages from pavgbs. I.e. (p1 + p0 + p2 + q0 + 2) >> 2 can be
+; written as (((p1 + p0) >> 1) + ((p2 + q0 + (p1 ^ p0 & 1)) >> 1) + 1) >> 1,
+; which maps to 3 pavgbs.
+; Side effects: %1 is advanced by %2; %6 is clobbered; %5 is clobbered when preserve_p0p1=0. %3/%4 hold the unfiltered p0/p1 on exit only when preserve_p0p1=1.
+; pPix=%1 iStride=%2 [in:q0,out:p0]=%3 [in:q1,out:p1]=%4 bDeltaP0Q0P1P0Q1Q0=%5 bDeltaP2P0=%6 clobber=%7,%8,%9,%10 preserve_p0p1=%11 db1=%12
+%macro SSE2_DeblockLumaEq4_3x16P 12
+ movdqa %7, %3 ; %7 = q0
+ movdqa %8, %6 ; copy the mask because SSE2_Blend clobbers its mask operand
+ MOVDQ %10, [%1 + 1 * %2] ; p1
+ SSE2_Blend %7, %10, %8 ; t0 = bDeltaP2P0 ? q0 : p1
+ movdqa %8, %6 ; fresh mask copy for the next blend
+ MOVDQ %9, [%1 + 2 * %2] ; p2
+ SSE2_Blend %9, %4, %8 ; t1 = bDeltaP2P0 ? p2 : q1
+ SSE2_AvgbFloor1 %4, %9, %12, %8 ; t1 = (t1 + q1) >> 1 (result lands in %4)
+ SSE2_AvgbFloor1 %10, [%1], %12, %8 ; %10 = (p0 + p1) >> 1, %8 = (p0 ^ p1) & 1
+ pxor %8, %12 ; %8 = 1 - (p0 ^ p1 & 1), so the AvgbFloor1 calls below add (p0 ^ p1 & 1) as carry
+ SSE2_AvgbFloor1 %7, %4, %8, %9 ; (t0 + t1 + (p0 ^ p1 & 1)) >> 1
+ MOVDQ %9, [%1 + 2 * %2] ; p2 (reloaded; %9 was used as scratch above)
+ SSE2_AvgbFloor1 %3, %9, %8, %4 ; (p2 + q0 + (p0 ^ p1 & 1)) >> 1
+ pavgb %7, %10 ; p0' = (p0 + p1 + t0 + t1 + 2) >> 2
+ movdqa %8, %10
+ pxor %8, %3 ; (p0 + p1) >> 1 ^ (p2 + q0 + (p0 ^ p1 & 1)) >> 1
+ pand %8, %12 ; & 1
+ pavgb %10, %3 ; p1' = (p0 + p1 + p2 + q0 + 2) >> 2
+ pand %6, %5 ; bDeltaP2P0 &= bDeltaP0Q0P1P0Q1Q0
+%if %11
+ MOVDQ %3, [%1 + 0 * %2] ; p0 (unfiltered value stays in %3 after the macro)
+ movdqa %4, %5 ; blend with a copy so the mask in %5 is not clobbered
+ SSE2_Blend %7, %3, %4 ; p0out = bDeltaP0Q0P1P0Q1Q0 ? p0' : p0
+%else
+ SSE2_Blend %7, [%1 + 0 * %2], %5 ; p0out = bDeltaP0Q0P1P0Q1Q0 ? p0' : p0
+%endif
+ MOVDQ [%1 + 0 * %2], %7 ; store p0
+ add %1, %2 ; step one row away from the edge; offsets below are relative to the p1 row
+ movdqa %7, %10 ; keep p1' for the final blend; %10 is reused for p2'
+ psubb %10, %8 ; (p0 + p1 + p2 + q0) >> 2
+ psubb %8, %12 ; turn the 0/1 parity bit into the carry-1 form (0xff/0x00) that SSE2_AvgbFloor2 takes
+ MOVDQ %4, [%1 + (3 - 1) * %2] ; p3
+ SSE2_AvgbFloor2 %4, %9, %8 ; (p2 + p3 + ((p0 + p1) >> 1 ^ (p2 + q0 + (p0 ^ p1 & 1)) >> 1 & 1)) >> 1
+ pavgb %10, %4 ; p2' = (((p0 + p1 + p2 + q0) >> 1) + p2 + p3 + 2) >> 2
+ movdqa %8, %6 ; mask copy; %6 is still needed for the p1 blend
+ SSE2_Blend %10, [%1 + (2 - 1) * %2], %8 ; p2out = bDeltaP2P0 ? p2' : p2
+ MOVDQ [%1 + (2 - 1) * %2], %10 ; store p2
+%if %11
+ MOVDQ %4, [%1 + (1 - 1) * %2] ; p1 (unfiltered value stays in %4 after the macro)
+ SSE2_Blend %7, %4, %6 ; p1out = bDeltaP2P0 ? p1' : p1
+%else
+ SSE2_Blend %7, [%1 + (1 - 1) * %2], %6 ; p1out = bDeltaP2P0 ? p1' : p1
+%endif
+ MOVDQ [%1 + (1 - 1) * %2], %7 ; store p1
+%endmacro
+;*******************************************************************************
+; void DeblockLumaEq4V_ssse3(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
+; int32_t iBeta)
+;*******************************************************************************
+
WELS_EXTERN DeblockLumaEq4V_ssse3
- mov rax,rsp
- push rbx
- push rbp
- push rsi
- push rdi
- sub rsp,1D8h
- movaps [rax-38h],xmm6
- movaps [rax-48h],xmm7
- movaps [rax-58h],xmm8
- pxor xmm1,xmm1
- movsxd r10,edx
- mov rbp,rcx
- mov r11d,r8d
- mov rdx,rcx
- mov rdi,rbp
- mov rbx,rbp
- movdqa xmm5,[rbp]
- movaps [rax-68h],xmm9
- movaps [rax-78h],xmm10
- punpcklbw xmm5,xmm1
- movaps [rax-88h],xmm11
- movaps [rax-98h],xmm12
- movaps [rax-0A8h],xmm13
- movaps [rax-0B8h],xmm14
- movdqa xmm14,[r10+rbp]
- movaps [rax-0C8h],xmm15
- lea eax,[r10*4]
- movsxd r8,eax
- lea eax,[r10+r10*2]
- movsxd rcx,eax
- lea eax,[r10+r10]
- sub rdx,r8
- punpcklbw xmm14,xmm1
- movdqa [rsp+90h],xmm5
- movdqa [rsp+30h],xmm14
- movsxd rsi,eax
- movsx eax,r11w
- sub rdi,rcx
- sub rbx,rsi
- mov r8,rbp
- sub r8,r10
- movd xmm0,eax
- movsx eax,r9w
- movdqa xmm12,[rdi]
- movdqa xmm6, [rsi+rbp]
- movdqa xmm13,[rbx]
- punpcklwd xmm0,xmm0
- pshufd xmm11,xmm0,0
- punpcklbw xmm13,xmm1
- punpcklbw xmm6,xmm1
- movdqa xmm8,[r8]
- movd xmm0,eax
- movdqa xmm10,xmm11
- mov eax,2
- punpcklbw xmm8,xmm1
- punpcklbw xmm12,xmm1
- cwde
- punpcklwd xmm0,xmm0
- psraw xmm10,2
- movdqa xmm1,xmm8
- movdqa [rsp+0F0h],xmm13
- movdqa [rsp+0B0h],xmm8
- pshufd xmm7,xmm0,0
- psubw xmm1,xmm13
- movdqa xmm0,xmm5
- movdqa xmm4,xmm7
- movdqa xmm2,xmm7
- psubw xmm0,xmm8
- pabsw xmm3,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm5
- movdqa [rsp+40h],xmm7
- movdqa [rsp+60h],xmm6
- pcmpgtw xmm4,xmm0
- psubw xmm1,xmm14
- pabsw xmm0,xmm1
- pcmpgtw xmm2,xmm0
- pand xmm4,xmm2
- movdqa xmm0,xmm11
- pcmpgtw xmm0,xmm3
- pand xmm4,xmm0
- movd xmm0,eax
- movdqa [rsp+20h],xmm4
- punpcklwd xmm0,xmm0
- pshufd xmm2,xmm0,0
- paddw xmm10,xmm2
- movdqa [rsp+0A0h],xmm2
- movdqa xmm15,xmm7
- pxor xmm4,xmm4
- movdqa xmm0,xmm8
- psubw xmm0,xmm12
- mov eax,4
- pabsw xmm0,xmm0
- movdqa xmm1,xmm10
- cwde
- pcmpgtw xmm15,xmm0
- pcmpgtw xmm1,xmm3
- movdqa xmm3,xmm7
- movdqa xmm7,[rdx]
- movdqa xmm0,xmm5
- psubw xmm0,xmm6
- pand xmm15,xmm1
- punpcklbw xmm7,xmm4
- movdqa xmm9,xmm15
- pabsw xmm0,xmm0
- psllw xmm7,1
- pandn xmm9,xmm12
- pcmpgtw xmm3,xmm0
- paddw xmm7,xmm12
- movd xmm0,eax
- pand xmm3,xmm1
- paddw xmm7,xmm12
- punpcklwd xmm0,xmm0
- paddw xmm7,xmm12
- pshufd xmm1,xmm0,0
- paddw xmm7,xmm13
- movdqa xmm0,xmm3
- pandn xmm0,xmm6
- paddw xmm7,xmm8
- movdqa [rsp+70h],xmm1
- paddw xmm7,xmm5
- movdqa [rsp+120h],xmm0
- movdqa xmm0,[rcx+rbp]
- punpcklbw xmm0,xmm4
- paddw xmm7,xmm1
- movdqa xmm4,xmm15
- psllw xmm0,1
- psraw xmm7,3
- paddw xmm0,xmm6
- pand xmm7,xmm15
- paddw xmm0,xmm6
- paddw xmm0,xmm6
- paddw xmm0,xmm14
- movdqa xmm6,xmm15
- paddw xmm0,xmm5
- pandn xmm6,xmm13
- paddw xmm0,xmm8
- paddw xmm0,xmm1
- psraw xmm0,3
- movdqa xmm1,xmm12
- paddw xmm1,xmm13
- pand xmm0,xmm3
- movdqa [rsp+100h],xmm0
- movdqa xmm0,xmm8
- paddw xmm0,xmm5
- paddw xmm1,xmm0
- movdqa xmm0,xmm3
- paddw xmm1,xmm2
- psraw xmm1,2
- pandn xmm0,xmm14
- pand xmm4,xmm1
- movdqa [rsp+0E0h],xmm0
- movdqa xmm0,xmm5
- paddw xmm0,xmm8
- movdqa xmm1,[rsp+60h]
- paddw xmm1,xmm14
- movdqa xmm14,xmm3
- paddw xmm1,xmm0
- movdqa xmm0,xmm8
- paddw xmm0,[rsp+30h]
- paddw xmm1,xmm2
- psraw xmm1,2
- pand xmm14,xmm1
- movdqa xmm1,xmm13
- paddw xmm1,xmm13
- paddw xmm1,xmm0
- paddw xmm1,xmm2
- psraw xmm1,2
- movdqa xmm0,[rsp+30h]
- movdqa xmm2,xmm13
- movdqa xmm5,xmm15
- paddw xmm0,[rsp+70h]
- pandn xmm5,xmm1
- paddw xmm2,xmm8
- movdqa xmm8,[rsp+90h]
- movdqa xmm1,xmm12
- paddw xmm2,xmm8
- psllw xmm2,1
- paddw xmm2,xmm0
- paddw xmm1,xmm2
- movdqa xmm0,xmm8
- movdqa xmm8,xmm3
- movdqa xmm2,[rsp+30h]
- paddw xmm0,xmm13
- psraw xmm1,3
- pand xmm15,xmm1
- movdqa xmm1,xmm2
- paddw xmm1,xmm2
- paddw xmm2,[rsp+90h]
- paddw xmm2,[rsp+0B0h]
- paddw xmm1,xmm0
- movdqa xmm0,xmm13
- movdqa xmm13,[r8]
- paddw xmm0, [rsp+70h]
- paddw xmm1, [rsp+0A0h]
- psllw xmm2,1
- paddw xmm2,xmm0
- psraw xmm1,2
- movdqa xmm0, [rdi]
- pandn xmm8,xmm1
- movdqa xmm1, [rsp+60h]
- paddw xmm1,xmm2
- movdqa xmm2, [rbx]
- psraw xmm1,3
- pand xmm3,xmm1
- movdqa xmm1, [rbp]
- movdqa [rsp+0D0h],xmm3
- pxor xmm3,xmm3
- punpckhbw xmm0,xmm3
- punpckhbw xmm1,xmm3
- punpckhbw xmm13,xmm3
- movdqa [rsp+0C0h],xmm0
- movdqa xmm0,[r10+rbp]
- movdqa [rsp],xmm1
- punpckhbw xmm0,xmm3
- punpckhbw xmm2,xmm3
- movdqa [rsp+80h],xmm0
- movdqa xmm0,[rsi+rbp]
- movdqa [rsp+10h],xmm13
- punpckhbw xmm0,xmm3
- movdqa [rsp+50h],xmm0
- movdqa xmm0,xmm1
- movdqa xmm1,xmm13
- psubw xmm0,xmm13
- psubw xmm1,xmm2
- pabsw xmm3,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,[rsp]
- movdqa xmm13,[rsp+40h]
- movdqa [rsp+110h],xmm2
- psubw xmm1, [rsp+80h]
- pcmpgtw xmm13,xmm0
- pcmpgtw xmm11,xmm3
- pabsw xmm0,xmm1
- pcmpgtw xmm10,xmm3
- movdqa xmm1, [rsp+40h]
- movdqa xmm2,xmm1
- movdqa xmm3,xmm1
- pcmpgtw xmm2,xmm0
- movdqa xmm0, [rsp+10h]
- pand xmm13,xmm2
- pand xmm13,xmm11
- movdqa xmm11,[rsp+0C0h]
- psubw xmm0,xmm11
- pabsw xmm0,xmm0
- pcmpgtw xmm3,xmm0
- pand xmm3,xmm10
- movdqa xmm0,[rsp]
- psubw xmm0,[rsp+50h]
- movdqa xmm2,[rdx]
- pabsw xmm0,xmm0
- por xmm7,xmm9
- movdqa xmm9,[rsp+20h]
- pcmpgtw xmm1,xmm0
- pand xmm9,xmm7
- movdqa xmm7,[rsp+20h]
- movdqa xmm0,xmm7
- pandn xmm0,xmm12
- movdqa xmm12,[rsp+110h]
- pand xmm1,xmm10
- movdqa xmm10,[rsp+70h]
- movdqa [rsp+40h],xmm1
- movdqa xmm1,xmm13
- por xmm9,xmm0
- pxor xmm0,xmm0
- por xmm4,xmm6
- movdqa xmm6,xmm7
- punpckhbw xmm2,xmm0
- por xmm15,xmm5
- movdqa xmm5,[rsp+20h]
- movdqa xmm0,xmm3
- psllw xmm2,1
- pandn xmm0,xmm11
- pand xmm6,xmm4
- movdqa xmm4,[rsp]
- paddw xmm2,xmm11
- pand xmm5,xmm15
- movdqa xmm15,[rsp+20h]
- paddw xmm2,xmm11
- paddw xmm2,xmm11
- paddw xmm2,xmm12
- paddw xmm2,[rsp+10h]
- paddw xmm2,[rsp]
- paddw xmm2,xmm10
- psraw xmm2,3
- pand xmm2,xmm3
- por xmm2,xmm0
- pand xmm1,xmm2
- movdqa xmm0,xmm13
- movdqa xmm2,xmm11
- pandn xmm0,xmm11
- paddw xmm2,xmm12
- por xmm1,xmm0
- packuswb xmm9,xmm1
- movdqa xmm0,xmm7
- movdqa xmm7,[rsp+0A0h]
- pandn xmm0,[rsp+0F0h]
- movdqa xmm1,xmm3
- por xmm6,xmm0
- movdqa xmm0,[rsp+10h]
- paddw xmm0,xmm4
- paddw xmm2,xmm0
- paddw xmm2,xmm7
- movdqa xmm0,xmm3
- pandn xmm0,xmm12
- psraw xmm2,2
- pand xmm1,xmm2
- por xmm1,xmm0
- movdqa xmm2,xmm13
- movdqa xmm0,xmm13
- pand xmm2,xmm1
- pandn xmm0,xmm12
- movdqa xmm1,xmm12
- paddw xmm1,[rsp+10h]
- por xmm2,xmm0
- movdqa xmm0,xmm15
- pandn xmm0,[rsp+0B0h]
- paddw xmm1,xmm4
- packuswb xmm6,xmm2
- movdqa xmm2,xmm3
- psllw xmm1,1
- por xmm5,xmm0
- movdqa xmm0,[rsp+80h]
- paddw xmm0,xmm10
- paddw xmm1,xmm0
- paddw xmm11,xmm1
- psraw xmm11,3
- movdqa xmm1,xmm12
- pand xmm2,xmm11
- paddw xmm1,xmm12
- movdqa xmm11,[rsp+80h]
- movdqa xmm0, [rsp+10h]
- por xmm14,[rsp+0E0h]
- paddw xmm0,xmm11
- movdqa xmm4,xmm15
- paddw xmm1,xmm0
- movdqa xmm0,xmm13
- paddw xmm1,xmm7
- psraw xmm1,2
- pandn xmm3,xmm1
- por xmm2,xmm3
- movdqa xmm1,xmm13
- movdqa xmm3,[rsp+10h]
- pandn xmm0,xmm3
- pand xmm1,xmm2
- movdqa xmm2,xmm11
- paddw xmm2,[rsp]
- por xmm1,xmm0
- movdqa xmm0,[rsp+0D0h]
- por xmm0,xmm8
- paddw xmm2,xmm3
- packuswb xmm5,xmm1
- movdqa xmm8,[rsp+40h]
- movdqa xmm1,[rsp+50h]
- movdqa xmm3,xmm8
- pand xmm4,xmm0
- psllw xmm2,1
- movdqa xmm0,xmm15
- pandn xmm0,[rsp+90h]
- por xmm4,xmm0
- movdqa xmm0,xmm12
- paddw xmm0,xmm10
- paddw xmm2,xmm0
- paddw xmm1,xmm2
- movdqa xmm0,[rsp]
- movdqa xmm2,xmm11
- paddw xmm0,xmm12
- movdqa xmm12,[rsp]
- paddw xmm2,xmm11
- paddw xmm2,xmm0
- psraw xmm1,3
- movdqa xmm0,xmm8
- pand xmm3,xmm1
- paddw xmm2,xmm7
- movdqa xmm1,xmm13
- psraw xmm2,2
- pandn xmm0,xmm2
- por xmm3,xmm0
- movdqa xmm2,[rsp+50h]
- movdqa xmm0,xmm13
- pandn xmm0,xmm12
- pand xmm1,xmm3
- paddw xmm2,xmm11
- movdqa xmm3,xmm15
- por xmm1,xmm0
- pand xmm3,xmm14
- movdqa xmm14,[rsp+10h]
- movdqa xmm0,xmm15
- pandn xmm0,[rsp+30h]
- packuswb xmm4,xmm1
- movdqa xmm1,xmm8
- por xmm3,xmm0
- movdqa xmm0,xmm12
- paddw xmm0,xmm14
- paddw xmm2,xmm0
- paddw xmm2,xmm7
- movdqa xmm0,xmm8
- pandn xmm0,xmm11
- psraw xmm2,2
- pand xmm1,xmm2
- por xmm1,xmm0
- movdqa xmm2,xmm13
- movdqa xmm0,xmm13
- pandn xmm0,xmm11
- pand xmm2,xmm1
- movdqa xmm1,xmm15
- por xmm2,xmm0
- packuswb xmm3,xmm2
- movdqa xmm0,[rsp+100h]
- por xmm0,[rsp+120h]
- pand xmm1,xmm0
- movdqa xmm2,[rcx+rbp]
- movdqa xmm7,[rsp+50h]
- pandn xmm15,[rsp+60h]
- lea r11,[rsp+1D8h]
- pxor xmm0,xmm0
- por xmm1,xmm15
- movaps xmm15,[r11-0A8h]
- movdqa [rdi],xmm9
- movaps xmm9,[r11-48h]
- punpckhbw xmm2,xmm0
- psllw xmm2,1
- paddw xmm2,xmm7
- paddw xmm2,xmm7
- movdqa [rbx],xmm6
- movaps xmm6,[r11-18h]
- paddw xmm2,xmm7
- paddw xmm2,xmm11
- movaps xmm11,[r11-68h]
- paddw xmm2,xmm12
- movaps xmm12,[r11-78h]
- paddw xmm2,xmm14
- paddw xmm2,xmm10
- psraw xmm2,3
- movaps xmm10,[r11-58h]
- movaps xmm14,[r11-98h]
- movdqa xmm0,xmm13
- pand xmm2,xmm8
- pandn xmm8,xmm7
- pandn xmm13,xmm7
- por xmm2,xmm8
- movaps xmm7,[r11-28h]
- movaps xmm8,[r11-38h]
- movdqa [r8],xmm5
- pand xmm0,xmm2
- por xmm0,xmm13
- packuswb xmm1,xmm0
- movaps xmm13,[r11-88h]
- movdqa [rbp],xmm4
- movdqa [r10+rbp],xmm3
- movdqa [rsi+rbp],xmm1
- mov rsp,r11
- pop rdi
- pop rsi
- pop rbp
- pop rbx
+ %assign push_num 0
+ LOAD_4_PARA
+ PUSH_XMM 10
+ SIGN_EXTENSION r1, r1d
+ movd xmm1, arg3d ; iAlpha
+ movd xmm2, arg4d ; iBeta
+ shr r2, 2 ; r2 holds iAlpha here (see the xmm3 comment below)
+ add r2, 1
+ movd xmm3, r2d
+ pxor xmm4, xmm4 ; all-zero shuffle control: the pshufbs below broadcast byte 0 to all 16 lanes
+ pxor xmm1, [WELS_DB127_16] ; bias by 0x7f; NOTE(review): presumably matches what SSE2_CmpltUB does to its first operand (macro body not visible here)
+ pxor xmm2, [WELS_DB127_16]
+ pshufb xmm1, xmm4 ; iAlpha ^ 0x7f
+ pshufb xmm2, xmm4 ; iBeta ^ 0x7f
+ pshufb xmm3, xmm4 ; (iAlpha >> 2) + 1
+ mov r2, r1 ; iStride
+ neg r1 ; -iStride
+ lea r3, [r0 + r1] ; pPix - iStride
+
+ ; Compute masks to enable/disable filtering.
+ MOVDQ xmm7, [r3 + 1 * r1] ; p1
+ MOVDQ xmm6, [r3 + 0 * r1] ; p0
+ MOVDQ xmm0, [r0 + 0 * r2] ; q0
+ movdqa xmm4, xmm6 ; keep p0; xmm6 is overwritten by the abs-diff below
+ SSE2_AbsDiffUB xmm6, xmm0, xmm5 ; |p0 - q0|
+ SSE2_CmpgeUB xmm3, xmm6 ; |p0 - q0| < (iAlpha >> 2) + 2
+ SSE2_CmpltUB xmm6, xmm1, [WELS_DB127_16] ; bDeltaP0Q0 = |p0 - q0| < iAlpha
+ MOVDQ xmm1, [r0 + 1 * r2] ; q1
+ SSE2_AbsDiffUB xmm7, xmm4, xmm5 ; |p1 - p0|
+ SSE2_AbsDiffUB xmm0, xmm1, xmm5 ; |q1 - q0|
+ pmaxub xmm7, xmm0 ; max(|p1 - p0|, |q1 - q0|)
+ SSE2_CmpltUB xmm7, xmm2, [WELS_DB127_16] ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta
+ pand xmm6, xmm7 ; & bDeltaP0Q0
+
+ MOVDQ xmm7, [r3 + 2 * r1] ; p2
+ SSE2_AbsDiffUB xmm7, xmm4, xmm5 ; |p2 - p0|
+ SSE2_CmpltUB xmm7, xmm2, [WELS_DB127_16] ; bDeltaP2P0 = |p2 - p0| < iBeta
+ pand xmm7, xmm3 ; &= |p0 - q0| < (iAlpha >> 2) + 2
+
+ MOVDQ xmm0, [r0 + 0 * r2] ; q0
+ MOVDQ xmm5, [r0 + 2 * r2] ; q2
+ SSE2_AbsDiffUB xmm5, xmm0, xmm4 ; |q2 - q0|
+ SSE2_CmpltUB xmm5, xmm2, [WELS_DB127_16] ; bDeltaQ2Q0 = |q2 - q0| < iBeta
+ pand xmm5, xmm3 ; &= |p0 - q0| < (iAlpha >> 2) + 2
+
+%ifdef X86_32
+ ; Push xmm5 to free up one register. Align stack so as to ensure that failed
+ ; store forwarding penalty cannot occur (up to ~50 cycles for 128-bit on IVB).
+ mov r2, esp ; save the original esp; the stride copy in r2 is not used again on this path
+ sub esp, 16
+ and esp, -16 ; 16-byte aligned slot, as movdqa requires
+ movdqa [esp], xmm5 ; spill bDeltaQ2Q0
+ SSE2_DeblockLumaEq4_3x16P r3, r1, xmm0, xmm1, xmm6, xmm7, xmm2, xmm3, xmm5, xmm4, 1, [WELS_DB1_16] ; p side; preserve_p0p1=1 leaves unfiltered p0/p1 in xmm0/xmm1 for the q-side pass
+ movdqa xmm5, [esp] ; reload bDeltaQ2Q0
+ mov esp, r2 ; restore the original stack pointer
+ neg r1 ; back to +iStride for the q side
+ SSE2_DeblockLumaEq4_3x16P r0, r1, xmm0, xmm1, xmm6, xmm5, xmm2, xmm3, xmm7, xmm4, 0, [WELS_DB1_16] ; q side: same macro mirrored, with xmm5 (bDeltaQ2Q0) as the %6 mask
+%else
+ movdqa xmm9, [WELS_DB1_16] ; keep the per-byte 1 constant in a register
+ SSE2_DeblockLumaEq4_3x16P r3, r1, xmm0, xmm1, xmm6, xmm7, xmm2, xmm3, xmm8, xmm4, 1, xmm9 ; p side; preserve_p0p1=1 leaves unfiltered p0/p1 in xmm0/xmm1 for the q-side pass
+ SSE2_DeblockLumaEq4_3x16P r0, r2, xmm0, xmm1, xmm6, xmm5, xmm2, xmm3, xmm7, xmm4, 0, xmm9 ; q side: same macro mirrored, with xmm5 (bDeltaQ2Q0) as the %6 mask
+%endif
+
+ POP_XMM
+ LOAD_4_PARA_POP
+ ret
+%ifdef WIN64
+
+
WELS_EXTERN DeblockChromaLt4V_ssse3
mov rax,rsp
push rbx
@@ -1538,465 +1242,6 @@
%elifdef UNIX64
-WELS_EXTERN DeblockLumaEq4V_ssse3
- mov rax,rsp
- push rbx
- push rbp
- mov r8, rdx
- mov r9, rcx
- mov rcx, rdi
- mov rdx, rsi
- sub rsp,1D8h
- movaps [rax-38h],xmm6
- movaps [rax-48h],xmm7
- movaps [rax-58h],xmm8
- pxor xmm1,xmm1
- movsxd r10,edx
- mov rbp,rcx
- mov r11d,r8d
- mov rdx,rcx
- mov rdi,rbp
- mov rbx,rbp
- movdqa xmm5,[rbp]
- movaps [rax-68h],xmm9
- movaps [rax-78h],xmm10
- punpcklbw xmm5,xmm1
- movaps [rax-88h],xmm11
- movaps [rax-98h],xmm12
- movaps [rax-0A8h],xmm13
- movaps [rax-0B8h],xmm14
- movdqa xmm14,[r10+rbp]
- movaps [rax-0C8h],xmm15
- lea eax,[r10*4]
- movsxd r8,eax
- lea eax,[r10+r10*2]
- movsxd rcx,eax
- lea eax,[r10+r10]
- sub rdx,r8
- punpcklbw xmm14,xmm1
- movdqa [rsp+90h],xmm5
- movdqa [rsp+30h],xmm14
- movsxd rsi,eax
- movsx eax,r11w
- sub rdi,rcx
- sub rbx,rsi
- mov r8,rbp
- sub r8,r10
- movd xmm0,eax
- movsx eax,r9w
- movdqa xmm12,[rdi]
- movdqa xmm6, [rsi+rbp]
- movdqa xmm13,[rbx]
- punpcklwd xmm0,xmm0
- pshufd xmm11,xmm0,0
- punpcklbw xmm13,xmm1
- punpcklbw xmm6,xmm1
- movdqa xmm8,[r8]
- movd xmm0,eax
- movdqa xmm10,xmm11
- mov eax,2
- punpcklbw xmm8,xmm1
- punpcklbw xmm12,xmm1
- cwde
- punpcklwd xmm0,xmm0
- psraw xmm10,2
- movdqa xmm1,xmm8
- movdqa [rsp+0F0h],xmm13
- movdqa [rsp+0B0h],xmm8
- pshufd xmm7,xmm0,0
- psubw xmm1,xmm13
- movdqa xmm0,xmm5
- movdqa xmm4,xmm7
- movdqa xmm2,xmm7
- psubw xmm0,xmm8
- pabsw xmm3,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,xmm5
- movdqa [rsp+40h],xmm7
- movdqa [rsp+60h],xmm6
- pcmpgtw xmm4,xmm0
- psubw xmm1,xmm14
- pabsw xmm0,xmm1
- pcmpgtw xmm2,xmm0
- pand xmm4,xmm2
- movdqa xmm0,xmm11
- pcmpgtw xmm0,xmm3
- pand xmm4,xmm0
- movd xmm0,eax
- movdqa [rsp+20h],xmm4
- punpcklwd xmm0,xmm0
- pshufd xmm2,xmm0,0
- paddw xmm10,xmm2
- movdqa [rsp+0A0h],xmm2
- movdqa xmm15,xmm7
- pxor xmm4,xmm4
- movdqa xmm0,xmm8
- psubw xmm0,xmm12
- mov eax,4
- pabsw xmm0,xmm0
- movdqa xmm1,xmm10
- cwde
- pcmpgtw xmm15,xmm0
- pcmpgtw xmm1,xmm3
- movdqa xmm3,xmm7
- movdqa xmm7,[rdx]
- movdqa xmm0,xmm5
- psubw xmm0,xmm6
- pand xmm15,xmm1
- punpcklbw xmm7,xmm4
- movdqa xmm9,xmm15
- pabsw xmm0,xmm0
- psllw xmm7,1
- pandn xmm9,xmm12
- pcmpgtw xmm3,xmm0
- paddw xmm7,xmm12
- movd xmm0,eax
- pand xmm3,xmm1
- paddw xmm7,xmm12
- punpcklwd xmm0,xmm0
- paddw xmm7,xmm12
- pshufd xmm1,xmm0,0
- paddw xmm7,xmm13
- movdqa xmm0,xmm3
- pandn xmm0,xmm6
- paddw xmm7,xmm8
- movdqa [rsp+70h],xmm1
- paddw xmm7,xmm5
- movdqa [rsp+120h],xmm0
- movdqa xmm0,[rcx+rbp]
- punpcklbw xmm0,xmm4
- paddw xmm7,xmm1
- movdqa xmm4,xmm15
- psllw xmm0,1
- psraw xmm7,3
- paddw xmm0,xmm6
- pand xmm7,xmm15
- paddw xmm0,xmm6
- paddw xmm0,xmm6
- paddw xmm0,xmm14
- movdqa xmm6,xmm15
- paddw xmm0,xmm5
- pandn xmm6,xmm13
- paddw xmm0,xmm8
- paddw xmm0,xmm1
- psraw xmm0,3
- movdqa xmm1,xmm12
- paddw xmm1,xmm13
- pand xmm0,xmm3
- movdqa [rsp+100h],xmm0
- movdqa xmm0,xmm8
- paddw xmm0,xmm5
- paddw xmm1,xmm0
- movdqa xmm0,xmm3
- paddw xmm1,xmm2
- psraw xmm1,2
- pandn xmm0,xmm14
- pand xmm4,xmm1
- movdqa [rsp+0E0h],xmm0
- movdqa xmm0,xmm5
- paddw xmm0,xmm8
- movdqa xmm1,[rsp+60h]
- paddw xmm1,xmm14
- movdqa xmm14,xmm3
- paddw xmm1,xmm0
- movdqa xmm0,xmm8
- paddw xmm0,[rsp+30h]
- paddw xmm1,xmm2
- psraw xmm1,2
- pand xmm14,xmm1
- movdqa xmm1,xmm13
- paddw xmm1,xmm13
- paddw xmm1,xmm0
- paddw xmm1,xmm2
- psraw xmm1,2
- movdqa xmm0,[rsp+30h]
- movdqa xmm2,xmm13
- movdqa xmm5,xmm15
- paddw xmm0,[rsp+70h]
- pandn xmm5,xmm1
- paddw xmm2,xmm8
- movdqa xmm8,[rsp+90h]
- movdqa xmm1,xmm12
- paddw xmm2,xmm8
- psllw xmm2,1
- paddw xmm2,xmm0
- paddw xmm1,xmm2
- movdqa xmm0,xmm8
- movdqa xmm8,xmm3
- movdqa xmm2,[rsp+30h]
- paddw xmm0,xmm13
- psraw xmm1,3
- pand xmm15,xmm1
- movdqa xmm1,xmm2
- paddw xmm1,xmm2
- paddw xmm2,[rsp+90h]
- paddw xmm2,[rsp+0B0h]
- paddw xmm1,xmm0
- movdqa xmm0,xmm13
- movdqa xmm13,[r8]
- paddw xmm0, [rsp+70h]
- paddw xmm1, [rsp+0A0h]
- psllw xmm2,1
- paddw xmm2,xmm0
- psraw xmm1,2
- movdqa xmm0, [rdi]
- pandn xmm8,xmm1
- movdqa xmm1, [rsp+60h]
- paddw xmm1,xmm2
- movdqa xmm2, [rbx]
- psraw xmm1,3
- pand xmm3,xmm1
- movdqa xmm1, [rbp]
- movdqa [rsp+0D0h],xmm3
- pxor xmm3,xmm3
- punpckhbw xmm0,xmm3
- punpckhbw xmm1,xmm3
- punpckhbw xmm13,xmm3
- movdqa [rsp+0C0h],xmm0
- movdqa xmm0,[r10+rbp]
- movdqa [rsp],xmm1
- punpckhbw xmm0,xmm3
- punpckhbw xmm2,xmm3
- movdqa [rsp+80h],xmm0
- movdqa xmm0,[rsi+rbp]
- movdqa [rsp+10h],xmm13
- punpckhbw xmm0,xmm3
- movdqa [rsp+50h],xmm0
- movdqa xmm0,xmm1
- movdqa xmm1,xmm13
- psubw xmm0,xmm13
- psubw xmm1,xmm2
- pabsw xmm3,xmm0
- pabsw xmm0,xmm1
- movdqa xmm1,[rsp]
- movdqa xmm13,[rsp+40h]
- movdqa [rsp+110h],xmm2
- psubw xmm1, [rsp+80h]
- pcmpgtw xmm13,xmm0
- pcmpgtw xmm11,xmm3
- pabsw xmm0,xmm1
- pcmpgtw xmm10,xmm3
- movdqa xmm1, [rsp+40h]
- movdqa xmm2,xmm1
- movdqa xmm3,xmm1
- pcmpgtw xmm2,xmm0
- movdqa xmm0, [rsp+10h]
- pand xmm13,xmm2
- pand xmm13,xmm11
- movdqa xmm11,[rsp+0C0h]
- psubw xmm0,xmm11
- pabsw xmm0,xmm0
- pcmpgtw xmm3,xmm0
- pand xmm3,xmm10
- movdqa xmm0,[rsp]
- psubw xmm0,[rsp+50h]
- movdqa xmm2,[rdx]
- pabsw xmm0,xmm0
- por xmm7,xmm9
- movdqa xmm9,[rsp+20h]
- pcmpgtw xmm1,xmm0
- pand xmm9,xmm7
- movdqa xmm7,[rsp+20h]
- movdqa xmm0,xmm7
- pandn xmm0,xmm12
- movdqa xmm12,[rsp+110h]
- pand xmm1,xmm10
- movdqa xmm10,[rsp+70h]
- movdqa [rsp+40h],xmm1
- movdqa xmm1,xmm13
- por xmm9,xmm0
- pxor xmm0,xmm0
- por xmm4,xmm6
- movdqa xmm6,xmm7
- punpckhbw xmm2,xmm0
- por xmm15,xmm5
- movdqa xmm5,[rsp+20h]
- movdqa xmm0,xmm3
- psllw xmm2,1
- pandn xmm0,xmm11
- pand xmm6,xmm4
- movdqa xmm4,[rsp]
- paddw xmm2,xmm11
- pand xmm5,xmm15
- movdqa xmm15,[rsp+20h]
- paddw xmm2,xmm11
- paddw xmm2,xmm11
- paddw xmm2,xmm12
- paddw xmm2,[rsp+10h]
- paddw xmm2,[rsp]
- paddw xmm2,xmm10
- psraw xmm2,3
- pand xmm2,xmm3
- por xmm2,xmm0
- pand xmm1,xmm2
- movdqa xmm0,xmm13
- movdqa xmm2,xmm11
- pandn xmm0,xmm11
- paddw xmm2,xmm12
- por xmm1,xmm0
- packuswb xmm9,xmm1
- movdqa xmm0,xmm7
- movdqa xmm7,[rsp+0A0h]
- pandn xmm0,[rsp+0F0h]
- movdqa xmm1,xmm3
- por xmm6,xmm0
- movdqa xmm0,[rsp+10h]
- paddw xmm0,xmm4
- paddw xmm2,xmm0
- paddw xmm2,xmm7
- movdqa xmm0,xmm3
- pandn xmm0,xmm12
- psraw xmm2,2
- pand xmm1,xmm2
- por xmm1,xmm0
- movdqa xmm2,xmm13
- movdqa xmm0,xmm13
- pand xmm2,xmm1
- pandn xmm0,xmm12
- movdqa xmm1,xmm12
- paddw xmm1,[rsp+10h]
- por xmm2,xmm0
- movdqa xmm0,xmm15
- pandn xmm0,[rsp+0B0h]
- paddw xmm1,xmm4
- packuswb xmm6,xmm2
- movdqa xmm2,xmm3
- psllw xmm1,1
- por xmm5,xmm0
- movdqa xmm0,[rsp+80h]
- paddw xmm0,xmm10
- paddw xmm1,xmm0
- paddw xmm11,xmm1
- psraw xmm11,3
- movdqa xmm1,xmm12
- pand xmm2,xmm11
- paddw xmm1,xmm12
- movdqa xmm11,[rsp+80h]
- movdqa xmm0, [rsp+10h]
- por xmm14,[rsp+0E0h]
- paddw xmm0,xmm11
- movdqa xmm4,xmm15
- paddw xmm1,xmm0
- movdqa xmm0,xmm13
- paddw xmm1,xmm7
- psraw xmm1,2
- pandn xmm3,xmm1
- por xmm2,xmm3
- movdqa xmm1,xmm13
- movdqa xmm3,[rsp+10h]
- pandn xmm0,xmm3
- pand xmm1,xmm2
- movdqa xmm2,xmm11
- paddw xmm2,[rsp]
- por xmm1,xmm0
- movdqa xmm0,[rsp+0D0h]
- por xmm0,xmm8
- paddw xmm2,xmm3
- packuswb xmm5,xmm1
- movdqa xmm8,[rsp+40h]
- movdqa xmm1,[rsp+50h]
- movdqa xmm3,xmm8
- pand xmm4,xmm0
- psllw xmm2,1
- movdqa xmm0,xmm15
- pandn xmm0,[rsp+90h]
- por xmm4,xmm0
- movdqa xmm0,xmm12
- paddw xmm0,xmm10
- paddw xmm2,xmm0
- paddw xmm1,xmm2
- movdqa xmm0,[rsp]
- movdqa xmm2,xmm11
- paddw xmm0,xmm12
- movdqa xmm12,[rsp]
- paddw xmm2,xmm11
- paddw xmm2,xmm0
- psraw xmm1,3
- movdqa xmm0,xmm8
- pand xmm3,xmm1
- paddw xmm2,xmm7
- movdqa xmm1,xmm13
- psraw xmm2,2
- pandn xmm0,xmm2
- por xmm3,xmm0
- movdqa xmm2,[rsp+50h]
- movdqa xmm0,xmm13
- pandn xmm0,xmm12
- pand xmm1,xmm3
- paddw xmm2,xmm11
- movdqa xmm3,xmm15
- por xmm1,xmm0
- pand xmm3,xmm14
- movdqa xmm14,[rsp+10h]
- movdqa xmm0,xmm15
- pandn xmm0,[rsp+30h]
- packuswb xmm4,xmm1
- movdqa xmm1,xmm8
- por xmm3,xmm0
- movdqa xmm0,xmm12
- paddw xmm0,xmm14
- paddw xmm2,xmm0
- paddw xmm2,xmm7
- movdqa xmm0,xmm8
- pandn xmm0,xmm11
- psraw xmm2,2
- pand xmm1,xmm2
- por xmm1,xmm0
- movdqa xmm2,xmm13
- movdqa xmm0,xmm13
- pandn xmm0,xmm11
- pand xmm2,xmm1
- movdqa xmm1,xmm15
- por xmm2,xmm0
- packuswb xmm3,xmm2
- movdqa xmm0,[rsp+100h]
- por xmm0,[rsp+120h]
- pand xmm1,xmm0
- movdqa xmm2,[rcx+rbp]
- movdqa xmm7,[rsp+50h]
- pandn xmm15,[rsp+60h]
- lea r11,[rsp+1D8h]
- pxor xmm0,xmm0
- por xmm1,xmm15
- movaps xmm15,[r11-0A8h]
- movdqa [rdi],xmm9
- movaps xmm9,[r11-48h]
- punpckhbw xmm2,xmm0
- psllw xmm2,1
- paddw xmm2,xmm7
- paddw xmm2,xmm7
- movdqa [rbx],xmm6
- movaps xmm6,[r11-18h]
- paddw xmm2,xmm7
- paddw xmm2,xmm11
- movaps xmm11,[r11-68h]
- paddw xmm2,xmm12
- movaps xmm12,[r11-78h]
- paddw xmm2,xmm14
- paddw xmm2,xmm10
- psraw xmm2,3
- movaps xmm10,[r11-58h]
- movaps xmm14,[r11-98h]
- movdqa xmm0,xmm13
- pand xmm2,xmm8
- pandn xmm8,xmm7
- pandn xmm13,xmm7
- por xmm2,xmm8
- movaps xmm7,[r11-28h]
- movaps xmm8,[r11-38h]
- movdqa [r8],xmm5
- pand xmm0,xmm2
- por xmm0,xmm13
- packuswb xmm1,xmm0
- movaps xmm13,[r11-88h]
- movdqa [rbp],xmm4
- movdqa [r10+rbp],xmm3
- movdqa [rsi+rbp],xmm1
- mov rsp,r11
- pop rbp
- pop rbx
- ret
-
WELS_EXTERN DeblockChromaLt4V_ssse3
mov rax,rsp
push rbx
@@ -3831,552 +3076,6 @@
pop ebp
ret
-
-
-;*******************************************************************************
-; void DeblockLumaEq4V_ssse3(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
-; int32_t iBeta)
-;*******************************************************************************
-
-
-WELS_EXTERN DeblockLumaEq4V_ssse3
-
- push ebp
- mov ebp, esp
- and esp, -16 ; fffffff0H
- sub esp, 628 ; 00000274H
- mov eax, dword [ebp+8]
- mov ecx, dword [ebp+12]
- push ebx
- push esi
-
- lea edx, [ecx*4]
- pxor xmm0, xmm0
- movdqa xmm2, xmm0
-
- movdqa xmm0, [ecx+eax]
- mov esi, eax
- sub esi, edx
- movdqa xmm3, [esi]
- movdqa xmm5, [eax]
- push edi
- lea edi, [ecx+ecx]
- lea ebx, [ecx+ecx*2]
- mov dword [esp+640-600], edi
- mov esi, eax
- sub esi, edi
- movdqa xmm1, [esi]
- movdqa [esp+720-272], xmm0
- mov edi, eax
- sub edi, ecx
- movdqa xmm4, [edi]
- add ecx, eax
- mov dword [esp+640-596], ecx
-
- mov ecx, dword [esp+640-600]
- movdqa xmm0, [ecx+eax]
- movdqa [esp+736-272], xmm0
-
- movdqa xmm0, [eax+ebx]
- mov edx, eax
- sub edx, ebx
-
- movsx ebx, word [ebp+16]
- movdqa xmm6, [edx]
- add ecx, eax
- movdqa [esp+752-272], xmm0
- movd xmm0, ebx
-
- movsx ebx, word [ebp+20]
- movdqa xmm7, xmm0
- punpcklwd xmm7, xmm0
- pshufd xmm0, xmm7, 0
- movdqa [esp+640-320], xmm0
- movd xmm0, ebx
- movdqa xmm7, xmm0
- punpcklwd xmm7, xmm0
- pshufd xmm0, xmm7, 0
-
- movdqa xmm7, [esp+736-272]
- punpcklbw xmm7, xmm2
- movdqa [esp+640-416], xmm7
- movdqa [esp+640-512], xmm0
- movdqa xmm0, xmm1
- movdqa [esp+672-272], xmm1
- movdqa xmm1, xmm4
- movdqa [esp+704-272], xmm5
- punpcklbw xmm5, xmm2
- punpcklbw xmm1, xmm2
-
- movdqa xmm7, xmm5
- psubw xmm7, xmm1
- pabsw xmm7, xmm7
- movdqa [esp+640-560], xmm7
- punpcklbw xmm0, xmm2
- movdqa [esp+688-272], xmm4
- movdqa xmm4, [esp+720-272]
- movdqa [esp+640-480], xmm0
-
- movdqa xmm7, xmm1
- psubw xmm7, xmm0
-
- movdqa xmm0, [esp+640-512]
- pabsw xmm7, xmm7
- punpcklbw xmm4, xmm2
- pcmpgtw xmm0, xmm7
- movdqa [esp+640-384], xmm4
- movdqa xmm7, xmm5
- psubw xmm7, xmm4
- movdqa xmm4, [esp+640-512]
- movdqa [esp+656-272], xmm6
- punpcklbw xmm6, xmm2
- pabsw xmm7, xmm7
- movdqa [esp+640-48], xmm2
- movdqa [esp+640-368], xmm6
- movdqa [esp+640-144], xmm1
- movdqa [esp+640-400], xmm5
- pcmpgtw xmm4, xmm7
- pand xmm0, xmm4
- movdqa xmm4, [esp+640-320]
- pcmpgtw xmm4, [esp+640-560]
- pand xmm0, xmm4
-
- mov ebx, 2
- movsx ebx, bx
- movd xmm4, ebx
- movdqa xmm7, xmm4
- punpcklwd xmm7, xmm4
- movdqa xmm4, [esp+640-320]
- psraw xmm4, 2
- pshufd xmm7, xmm7, 0
- paddw xmm4, xmm7
- movdqa [esp+640-576], xmm4
- pcmpgtw xmm4, [esp+640-560]
- movdqa [esp+640-560], xmm4
-
- movdqa xmm4, [esp+640-512]
- movdqa [esp+640-624], xmm7
- movdqa xmm7, xmm1
- psubw xmm7, xmm6
- pabsw xmm7, xmm7
- pcmpgtw xmm4, xmm7
-
- pand xmm4, [esp+640-560]
- movdqa [esp+640-544], xmm4
- movdqa xmm4, [esp+640-512]
- movdqa xmm7, xmm5
- psubw xmm7, [esp+640-416]
- pabsw xmm7, xmm7
- pcmpgtw xmm4, xmm7
-
- pand xmm4, [esp+640-560]
- movdqa [esp+640-560], xmm4
-
- movdqa xmm4, [esp+640-544]
- pandn xmm4, xmm6
- movdqa [esp+640-16], xmm4
- mov ebx, 4
- movsx ebx, bx
- movd xmm4, ebx
- movdqa xmm7, xmm4
- punpcklwd xmm7, xmm4
- movdqa xmm4, xmm3
- punpcklbw xmm4, xmm2
- psllw xmm4, 1
- paddw xmm4, xmm6
- paddw xmm4, xmm6
- paddw xmm4, xmm6
- paddw xmm4, [esp+640-480]
-
- movdqa xmm6, [esp+640-560]
- pshufd xmm7, xmm7, 0
- paddw xmm4, xmm1
- movdqa [esp+640-592], xmm7
- paddw xmm4, xmm5
- paddw xmm4, xmm7
- movdqa xmm7, [esp+640-416]
- pandn xmm6, xmm7
- movdqa [esp+640-80], xmm6
- movdqa xmm6, [esp+752-272]
- punpcklbw xmm6, xmm2
- psllw xmm6, 1
- paddw xmm6, xmm7
- paddw xmm6, xmm7
- paddw xmm6, xmm7
- paddw xmm6, [esp+640-384]
-
- movdqa xmm7, [esp+640-480]
- paddw xmm6, xmm5
- paddw xmm6, xmm1
- paddw xmm6, [esp+640-592]
- psraw xmm6, 3
- pand xmm6, [esp+640-560]
- movdqa [esp+640-112], xmm6
- movdqa xmm6, [esp+640-544]
- pandn xmm6, xmm7
- movdqa [esp+640-336], xmm6
- movdqa xmm6, [esp+640-544]
- movdqa [esp+640-528], xmm6
- movdqa xmm6, [esp+640-368]
- paddw xmm6, xmm7
- movdqa xmm7, xmm1
- psraw xmm4, 3
- pand xmm4, [esp+640-544]
- paddw xmm7, xmm5
- paddw xmm6, xmm7
- paddw xmm6, [esp+640-624]
- movdqa xmm7, [esp+640-528]
-
- paddw xmm5, xmm1
- psraw xmm6, 2
- pand xmm7, xmm6
-
- movdqa xmm6, [esp+640-384]
- movdqa [esp+640-64], xmm7
- movdqa xmm7, [esp+640-560]
- pandn xmm7, xmm6
- movdqa [esp+640-304], xmm7
- movdqa xmm7, [esp+640-560]
- movdqa [esp+640-528], xmm7
- movdqa xmm7, [esp+640-416]
- paddw xmm7, xmm6
- paddw xmm7, xmm5
- paddw xmm7, [esp+640-624]
- movdqa xmm5, [esp+640-528]
- psraw xmm7, 2
- pand xmm5, xmm7
- movdqa [esp+640-32], xmm5
-
- movdqa xmm5, [esp+640-544]
- movdqa [esp+640-528], xmm5
- movdqa xmm5, [esp+640-480]
- movdqa xmm7, xmm5
- paddw xmm7, xmm5
- movdqa xmm5, xmm1
- paddw xmm5, xmm6
- paddw xmm6, [esp+640-592]
- paddw xmm7, xmm5
- paddw xmm7, [esp+640-624]
- movdqa xmm5, [esp+640-528]
- psraw xmm7, 2
- pandn xmm5, xmm7
- movdqa xmm7, [esp+640-480]
- paddw xmm7, xmm1
- paddw xmm7, [esp+640-400]
- movdqa xmm1, [esp+640-544]
- movdqa [esp+640-352], xmm5
- movdqa xmm5, [esp+640-368]
- psllw xmm7, 1
- paddw xmm7, xmm6
- paddw xmm5, xmm7
-
- movdqa xmm7, [esp+640-400]
- psraw xmm5, 3
- pand xmm1, xmm5
- movdqa xmm5, [esp+640-480]
- movdqa [esp+640-96], xmm1
- movdqa xmm1, [esp+640-560]
- movdqa [esp+640-528], xmm1
- movdqa xmm1, [esp+640-384]
- movdqa xmm6, xmm1
- paddw xmm6, xmm1
- paddw xmm1, [esp+640-400]
- paddw xmm1, [esp+640-144]
- paddw xmm7, xmm5
- paddw xmm5, [esp+640-592]
- paddw xmm6, xmm7
- paddw xmm6, [esp+640-624]
- movdqa xmm7, [esp+640-528]
- psraw xmm6, 2
- psllw xmm1, 1
- paddw xmm1, xmm5
-
- movdqa xmm5, [esp+656-272]
- pandn xmm7, xmm6
- movdqa xmm6, [esp+640-416]
- paddw xmm6, xmm1
- movdqa xmm1, [esp+640-560]
- psraw xmm6, 3
- pand xmm1, xmm6
-
- movdqa xmm6, [esp+704-272]
- movdqa [esp+640-128], xmm1
- movdqa xmm1, [esp+672-272]
- punpckhbw xmm1, xmm2
- movdqa [esp+640-448], xmm1
- movdqa xmm1, [esp+688-272]
- punpckhbw xmm1, xmm2
- punpckhbw xmm6, xmm2
- movdqa [esp+640-288], xmm7
- punpckhbw xmm5, xmm2
- movdqa [esp+640-496], xmm1
- movdqa [esp+640-432], xmm6
-
- movdqa xmm7, [esp+720-272]
- punpckhbw xmm7, xmm2
- movdqa [esp+640-464], xmm7
-
- movdqa xmm7, [esp+736-272]
- punpckhbw xmm7, xmm2
- movdqa [esp+640-528], xmm7
-
- movdqa xmm7, xmm6
-
- psubw xmm6, [esp+640-464]
- psubw xmm7, xmm1
- pabsw xmm7, xmm7
- movdqa [esp+640-560], xmm7
- por xmm4, [esp+640-16]
- pabsw xmm6, xmm6
- movdqa xmm7, xmm1
- psubw xmm7, [esp+640-448]
-
- movdqa xmm1, [esp+640-512]
- pabsw xmm7, xmm7
- pcmpgtw xmm1, xmm7
- movdqa xmm7, [esp+640-512]
- pcmpgtw xmm7, xmm6
- movdqa xmm6, [esp+640-320]
- pand xmm1, xmm7
- movdqa xmm7, [esp+640-560]
- pcmpgtw xmm6, xmm7
- pand xmm1, xmm6
-
- movdqa xmm6, [esp+640-576]
- pcmpgtw xmm6, xmm7
-
- movdqa xmm7, [esp+640-496]
- punpckhbw xmm3, xmm2
- movdqa [esp+640-560], xmm6
- movdqa xmm6, [esp+640-512]
- psubw xmm7, xmm5
- pabsw xmm7, xmm7
- pcmpgtw xmm6, xmm7
-
- pand xmm6, [esp+640-560]
- movdqa xmm7, [esp+640-432]
- psubw xmm7, [esp+640-528]
-
- psllw xmm3, 1
- movdqa [esp+640-544], xmm6
- movdqa xmm6, [esp+640-512]
-
- movdqa xmm2, [esp+640-544]
- paddw xmm3, xmm5
- paddw xmm3, xmm5
- paddw xmm3, xmm5
- paddw xmm3, [esp+640-448]
- paddw xmm3, [esp+640-496]
- pabsw xmm7, xmm7
- pcmpgtw xmm6, xmm7
- pand xmm6, [esp+640-560]
- movdqa [esp+640-560], xmm6
-
- movdqa xmm6, xmm0
- pand xmm6, xmm4
- movdqa xmm4, xmm0
- pandn xmm4, [esp+640-368]
- por xmm6, xmm4
- movdqa xmm4, [esp+640-432]
- paddw xmm3, xmm4
- paddw xmm3, [esp+640-592]
- psraw xmm3, 3
- pand xmm3, xmm2
- pandn xmm2, xmm5
- por xmm3, xmm2
- movdqa xmm7, xmm1
- pand xmm7, xmm3
- movdqa xmm3, [esp+640-64]
- por xmm3, [esp+640-336]
- movdqa xmm2, xmm1
- pandn xmm2, xmm5
- por xmm7, xmm2
-
- movdqa xmm2, xmm0
- pand xmm2, xmm3
- movdqa xmm3, xmm0
- pandn xmm3, [esp+640-480]
- por xmm2, xmm3
- packuswb xmm6, xmm7
- movdqa [esp+640-336], xmm2
- movdqa [esp+656-272], xmm6
- movdqa xmm6, [esp+640-544]
- movdqa xmm2, xmm5
- paddw xmm2, [esp+640-448]
- movdqa xmm3, xmm1
- movdqa xmm7, [esp+640-496]
- paddw xmm7, xmm4
- paddw xmm2, xmm7
- paddw xmm2, [esp+640-624]
- movdqa xmm7, [esp+640-544]
- psraw xmm2, 2
- pand xmm6, xmm2
- movdqa xmm2, [esp+640-448]
- pandn xmm7, xmm2
- por xmm6, xmm7
- pand xmm3, xmm6
- movdqa xmm6, xmm1
- pandn xmm6, xmm2
- paddw xmm2, [esp+640-496]
- paddw xmm2, xmm4
- por xmm3, xmm6
- movdqa xmm6, [esp+640-336]
- packuswb xmm6, xmm3
- psllw xmm2, 1
- movdqa [esp+672-272], xmm6
- movdqa xmm6, [esp+640-96]
- por xmm6, [esp+640-352]
-
- movdqa xmm3, xmm0
- pand xmm3, xmm6
- movdqa xmm6, xmm0
- pandn xmm6, [esp+640-144]
- por xmm3, xmm6
- movdqa xmm6, [esp+640-544]
- movdqa [esp+640-352], xmm3
- movdqa xmm3, [esp+640-464]
- paddw xmm3, [esp+640-592]
- paddw xmm2, xmm3
- movdqa xmm3, [esp+640-448]
- paddw xmm5, xmm2
- movdqa xmm2, [esp+640-496]
- psraw xmm5, 3
- pand xmm6, xmm5
- movdqa xmm5, [esp+640-464]
- paddw xmm2, xmm5
- paddw xmm5, [esp+640-432]
- movdqa xmm4, xmm3
- paddw xmm4, xmm3
- paddw xmm4, xmm2
- paddw xmm4, [esp+640-624]
- movdqa xmm2, [esp+640-544]
- paddw xmm3, [esp+640-592]
- psraw xmm4, 2
- pandn xmm2, xmm4
- por xmm6, xmm2
- movdqa xmm7, xmm1
- pand xmm7, xmm6
- movdqa xmm6, [esp+640-496]
- movdqa xmm2, xmm1
- pandn xmm2, xmm6
- por xmm7, xmm2
- movdqa xmm2, [esp+640-352]
- packuswb xmm2, xmm7
- movdqa [esp+688-272], xmm2
- movdqa xmm2, [esp+640-128]
- por xmm2, [esp+640-288]
-
- movdqa xmm4, xmm0
- pand xmm4, xmm2
- paddw xmm5, xmm6
- movdqa xmm2, xmm0
- pandn xmm2, [esp+640-400]
- por xmm4, xmm2
- movdqa xmm2, [esp+640-528]
- psllw xmm5, 1
- paddw xmm5, xmm3
- movdqa xmm3, [esp+640-560]
- paddw xmm2, xmm5
- psraw xmm2, 3
- movdqa [esp+640-288], xmm4
- movdqa xmm4, [esp+640-560]
- pand xmm4, xmm2
- movdqa xmm2, [esp+640-464]
- movdqa xmm5, xmm2
- paddw xmm5, xmm2
- movdqa xmm2, [esp+640-432]
- paddw xmm2, [esp+640-448]
- movdqa xmm7, xmm1
- paddw xmm5, xmm2
- paddw xmm5, [esp+640-624]
- movdqa xmm6, [esp+640-560]
- psraw xmm5, 2
- pandn xmm3, xmm5
- por xmm4, xmm3
- movdqa xmm3, [esp+640-32]
- por xmm3, [esp+640-304]
- pand xmm7, xmm4
- movdqa xmm4, [esp+640-432]
- movdqa xmm5, [esp+640-464]
- movdqa xmm2, xmm1
- pandn xmm2, xmm4
- paddw xmm4, [esp+640-496]
- por xmm7, xmm2
- movdqa xmm2, [esp+640-288]
- packuswb xmm2, xmm7
- movdqa [esp+704-272], xmm2
-
- movdqa xmm2, xmm0
- pand xmm2, xmm3
- movdqa xmm3, xmm0
- pandn xmm3, [esp+640-384]
- por xmm2, xmm3
- movdqa [esp+640-304], xmm2
- movdqa xmm2, [esp+640-528]
- movdqa xmm3, xmm2
- paddw xmm3, [esp+640-464]
- paddw xmm3, xmm4
- paddw xmm3, [esp+640-624]
- psraw xmm3, 2
- pand xmm6, xmm3
- movdqa xmm3, [esp+640-560]
- movdqa xmm4, xmm3
- pandn xmm4, xmm5
- por xmm6, xmm4
- movdqa xmm7, xmm1
- pand xmm7, xmm6
- movdqa xmm6, [esp+640-304]
- movdqa xmm4, xmm1
- pandn xmm4, xmm5
- por xmm7, xmm4
-
- movdqa xmm4, xmm0
- pandn xmm0, [esp+640-416]
- packuswb xmm6, xmm7
- movdqa xmm7, [esp+640-112]
- por xmm7, [esp+640-80]
- pand xmm4, xmm7
- por xmm4, xmm0
- movdqa xmm0, [esp+752-272]
- punpckhbw xmm0, [esp+640-48]
- psllw xmm0, 1
- paddw xmm0, xmm2
- paddw xmm0, xmm2
- paddw xmm0, xmm2
- paddw xmm0, xmm5
- paddw xmm0, [esp+640-432]
- paddw xmm0, [esp+640-496]
- paddw xmm0, [esp+640-592]
- psraw xmm0, 3
- pand xmm0, xmm3
- movdqa xmm7, xmm1
- pandn xmm3, xmm2
- por xmm0, xmm3
- pand xmm7, xmm0
-
- movdqa xmm0, [esp+656-272]
- movdqa [edx], xmm0
-
- movdqa xmm0, [esp+672-272]
-
- mov edx, dword [esp+640-596]
- movdqa [esi], xmm0
- movdqa xmm0, [esp+688-272]
- movdqa [edi], xmm0
- movdqa xmm0, [esp+704-272]
-
- pop edi
- pandn xmm1, xmm2
- movdqa [eax], xmm0
- por xmm7, xmm1
- pop esi
- packuswb xmm4, xmm7
- movdqa [edx], xmm6
- movdqa [ecx], xmm4
- pop ebx
- mov esp, ebp
- pop ebp
- ret
%endif