shithub: openh264

Download patch

ref: fc1601058379affdd1cc3f6d1b3c3d4be7155edf
parent: 62fb37d0960afd5b98d76c0f05e8174a708120bc
author: Sindre Aamås <[email protected]>
date: Fri Feb 12 15:59:22 EST 2016

[Common/x86] DeblockChromaLt4V_ssse3 optimizations

Use packed 8-bit operations rather than unpack to 16-bit.

Avoid spills.

~2.68x speedup on Haswell (x86-64).
~2.38x speedup on Haswell (x86 32-bit).

--- a/codec/common/x86/deblock.asm
+++ b/codec/common/x86/deblock.asm
@@ -375,170 +375,106 @@
     ret
 
 
-%ifdef  WIN64
+;******************************************************************************
+; void DeblockChromaLt4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+;                           int32_t iAlpha, int32_t iBeta, int8_t * pTC);
+;*******************************************************************************
 
-
 WELS_EXTERN DeblockChromaLt4V_ssse3
-    mov         rax,rsp
-    push        rbx
-    push        rdi
-    PUSH_XMM 16
-    sub         rsp,0C8h
-    mov         r10,qword [rax + 30h]  ; pTC
-    pxor        xmm1,xmm1
-    mov         rbx,rcx
-    movsxd      r11,r8d
-    movsx       ecx,byte [r10]
-    movsx       r8d,byte [r10+2]
-    mov         rdi,rdx
-    movq        xmm2,[rbx]
-    movq        xmm9,[r11+rbx]
-    movsx       edx,byte [r10+1]
-    mov         word [rsp+2],cx
-    mov         word [rsp],cx
-    movsx       eax,byte [r10+3]
-    mov         word [rsp+6],dx
-    mov         word [rsp+4],dx
-    movdqa      xmm11,xmm1
-    mov         word [rsp+0Eh],ax
-    mov         word [rsp+0Ch],ax
-    lea         eax,[r11+r11]
-    movsxd      rcx,eax
-    mov         rax,rbx
-    mov         rdx,rdi
-    sub         rax,rcx
-    mov         word [rsp+0Ah],r8w
-    mov         word [rsp+8],r8w
-    movdqa      xmm6,[rsp]
-    movdqa      xmm7,xmm6
-    movq        xmm13, [rax]
-    mov         rax,rdi
-    sub         rax,rcx
-    mov         rcx,rbx
-    pcmpgtw     xmm7,xmm1
-    psubw       xmm11,xmm6
-    sub         rcx,r11
-    sub         rdx,r11
-    movq        xmm0,[rax]
-    movsx       eax,r9w
-    movq        xmm15,[rcx]
-    punpcklqdq  xmm13,xmm0
-    movq        xmm0, [rdx]
-    movdqa      xmm4,xmm13
-    punpcklqdq  xmm15,xmm0
-    movq        xmm0, [rdi]
-    punpcklbw   xmm4,xmm1
-    movdqa      xmm12,xmm15
-    punpcklqdq  xmm2,xmm0
-    movq        xmm0, [r11+rdi]
-    punpcklbw   xmm12,xmm1
-    movdqa      xmm14,xmm2
-    punpcklqdq  xmm9,xmm0
-    punpckhbw   xmm2,xmm1
-    punpcklbw   xmm14,xmm1
-    movd        xmm0,eax
-    movsx       eax,word [rsp + 0C8h + 38h + 160] ; iBeta
-    punpckhbw   xmm13,xmm1
-    punpckhbw   xmm15,xmm1
-    movdqa      xmm3,xmm9
-    movdqa      [rsp+10h],xmm2
-    punpcklwd   xmm0,xmm0
-    punpckhbw   xmm9,xmm1
-    punpcklbw   xmm3,xmm1
-    movdqa      xmm1,xmm14
-    pshufd      xmm10,xmm0,0
-    movd        xmm0,eax
-    mov         eax,4
-    cwde
-    punpcklwd   xmm0,xmm0
-    pshufd      xmm8,xmm0,0
-    movd        xmm0,eax
-    punpcklwd   xmm0,xmm0
-    pshufd      xmm5,xmm0,0
-    psubw       xmm1,xmm12
-    movdqa      xmm2,xmm10
-    lea         r11,[rsp+0C8h]
-    psllw       xmm1,2
-    movdqa      xmm0,xmm4
-    psubw       xmm4,xmm12
-    psubw       xmm0,xmm3
-    psubw       xmm3,xmm14
-    paddw       xmm1,xmm0
-    paddw       xmm1,xmm5
-    movdqa      xmm0,xmm11
-    psraw       xmm1,3
-    pmaxsw      xmm0,xmm1
-    pminsw      xmm6,xmm0
-    movdqa      xmm1,xmm8
-    movdqa      xmm0,xmm12
-    psubw       xmm0,xmm14
-    pabsw       xmm0,xmm0
-    pcmpgtw     xmm2,xmm0
-    pabsw       xmm0,xmm4
-    pcmpgtw     xmm1,xmm0
-    pabsw       xmm0,xmm3
-    movdqa      xmm3,[rsp]
-    pand        xmm2,xmm1
-    movdqa      xmm1,xmm8
-    pcmpgtw     xmm1,xmm0
-    movdqa      xmm0,xmm13
-    pand        xmm2,xmm1
-    psubw       xmm0,xmm9
-    psubw       xmm13,xmm15
-    pand        xmm2,xmm7
-    pand        xmm6,xmm2
-    paddw       xmm12,xmm6
-    psubw       xmm14,xmm6
-    movdqa      xmm2,[rsp+10h]
-    movaps      xmm6,[r11-18h]
-    movdqa      xmm1,xmm2
-    psubw       xmm1,xmm15
-    psubw       xmm9,xmm2
-    psllw       xmm1,2
-    paddw       xmm1,xmm0
-    paddw       xmm1,xmm5
-    movdqa      xmm0,xmm15
-    psubw       xmm0,xmm2
-    psraw       xmm1,3
-    pmaxsw      xmm11,xmm1
-    pabsw       xmm0,xmm0
-    movdqa      xmm1,xmm8
-    pcmpgtw     xmm10,xmm0
-    pabsw       xmm0,xmm13
-    pminsw      xmm3,xmm11
-    movaps      xmm11,[r11-68h]
-    movaps      xmm13,[rsp+40h]
-    pcmpgtw     xmm1,xmm0
-    pabsw       xmm0,xmm9
-    movaps      xmm9, [r11-48h]
-    pand        xmm10,xmm1
-    pcmpgtw     xmm8,xmm0
-    pand        xmm10,xmm8
-    pand        xmm10,xmm7
-    movaps      xmm8,[r11-38h]
-    movaps      xmm7,[r11-28h]
-    pand        xmm3,xmm10
-    paddw       xmm15,xmm3
-    psubw       xmm2,xmm3
-    movaps      xmm10,[r11-58h]
-    packuswb    xmm12,xmm15
-    movaps      xmm15,[rsp+20h]
-    packuswb    xmm14,xmm2
-    movq        [rcx],xmm12
-    movq        [rbx],xmm14
-    psrldq      xmm12,8
-    psrldq      xmm14,8
-    movq        [rdx],xmm12
-    movaps      xmm12,[r11-78h]
-    movq        [rdi],xmm14
-    movaps      xmm14,[rsp+30h]
-    mov         rsp,r11
+    %assign push_num 0
+    LOAD_4_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r2, r2d
+    movd     xmm7, arg4d
+    pxor     xmm0, xmm0
+    pshufb   xmm7, xmm0                       ; iAlpha
+    mov      r3, r2
+    neg      r3                               ; -iStride
+
+    movq     xmm0, [r0 + 0 * r2]              ; q0 cb
+    movhps   xmm0, [r1 + 0 * r2]              ; q0 cr
+    movq     xmm2, [r0 + 1 * r3]              ; p0 cb
+    movhps   xmm2, [r1 + 1 * r3]              ; p0 cr
+
+    movdqa   xmm4, xmm0
+    SSE2_AbsDiffUB xmm4, xmm2, xmm5           ; |p0 - q0|
+    SSE2_CmpgeUB xmm4, xmm7                   ; !bDeltaP0Q0 = |p0 - q0| >= iAlpha
+
+    movq     xmm1, [r0 + 1 * r2]              ; q1 cb
+    movhps   xmm1, [r1 + 1 * r2]              ; q1 cr
+    movq     xmm3, [r0 + 2 * r3]              ; p1 cb
+    movhps   xmm3, [r1 + 2 * r3]              ; p1 cr
+
+    movdqa   xmm5, xmm1
+    SSE2_AbsDiffUB xmm5, xmm0, xmm7           ; |q1 - q0|
+    movdqa   xmm6, xmm3
+    SSE2_AbsDiffUB xmm6, xmm2, xmm7           ; |p1 - p0|
+    pmaxub   xmm5, xmm6                       ; max(|q1 - q0|, |p1 - p0|)
+
+    pxor     xmm6, xmm6
+    movd     xmm7, arg5d
+    pshufb   xmm7, xmm6                       ; iBeta
+
+    SSE2_CmpgeUB xmm5, xmm7                   ; !bDeltaQ1Q0 | !bDeltaP1P0 = max(|q1 - q0|, |p1 - p0|) >= iBeta
+    por      xmm4, xmm5                       ; | !bDeltaP0Q0
+
+%ifidni arg6, r5
+    movd     xmm7, [arg6]
+%else
+    mov      r2, arg6
+    movd     xmm7, [r2]
+%endif
+    punpckldq xmm7, xmm7
+    punpcklbw xmm7, xmm7                      ; iTc
+    pcmpeqw  xmm6, xmm6                       ; FFh
+    movdqa   xmm5, xmm7
+    pcmpgtb  xmm5, xmm6                       ; iTc > -1 ? FFh : 00h
+    pandn    xmm4, xmm7                       ; iTc & bDeltaP0Q0 & bDeltaP1P0 & bDeltaQ1Q0
+    pand     xmm4, xmm5                       ; &= (iTc > -1 ? FFh : 00h)
+
+    ; (q0 - p0 + ((p1 - q1) >> 2) + 1) >> 1 clipped to [-96, 159] and biased to [0, 255].
+    ; A limited range is sufficient because the value is clipped to [-iTc, iTc] later.
+    ; Bias so that unsigned saturation can be used.
+    ; Get ((p1 - q1) >> 2) + 192 via a pxor and two pavgbs.
+    ; q0 - p0 is split into a non-negative and non-positive part. The latter is
+    ; subtracted from the biased value.
+    movdqa   xmm7, xmm2
+    psubusb  xmm7, xmm0  ; clip(p0 - q0, 0, 255)
+    ; ((p1 - q1) >> 2) + 0xc0
+    pxor     xmm1, xmm6  ; q1 ^ 0xff, i.e. (-q1 - 1) & 0xff
+    pavgb    xmm3, xmm1  ; (((p1 - q1 + 0x100) >> 1)
+    pavgb    xmm3, xmm6  ;  + 0x100) >> 1
+    psubusb  xmm3, xmm7  ; -= clip(p0 - q0, 0, 255) saturate.
+    movdqa   xmm5, xmm0
+    psubusb  xmm5, xmm2  ; (clip(q0 - p0, 0, 255)
+    pavgb    xmm5, xmm3  ;  + clip(((p1 - q1 + 0x300) >> 2) - clip(p0 - q0, 0, 255), 0, 255) + 1) >> 1
+
+    ; Unbias and split into a non-negative and a non-positive part.
+    ; Clip each part to iTc via pminub.
+    ; Add/subtract each part to/from p0/q0 and clip.
+    movdqa   xmm6, [WELS_DB96_16]
+    psubusb  xmm6, xmm5
+    psubusb  xmm5, [WELS_DB96_16]
+    pminub   xmm6, xmm4
+    pminub   xmm5, xmm4
+    psubusb  xmm2, xmm6
+    paddusb  xmm2, xmm5
+    paddusb  xmm0, xmm6
+    psubusb  xmm0, xmm5
+
+    movlps   [r0 + 1 * r3], xmm2              ; store p0 cb
+    movhps   [r1 + 1 * r3], xmm2              ; store p0 cr
+    movlps   [r0         ], xmm0              ; store q0 cb
+    movhps   [r1         ], xmm0              ; store q0 cr
+
     POP_XMM
-    pop         rdi
-    pop         rbx
+    LOAD_4_PARA_POP
     ret
 
 
+%ifdef  WIN64
+
+
 WELS_EXTERN DeblockChromaEq4V_ssse3
     mov         rax,rsp
     push        rbx
@@ -1242,172 +1178,6 @@
 %elifdef  UNIX64
 
 
-WELS_EXTERN DeblockChromaLt4V_ssse3
-    mov         rax,rsp
-    push        rbx
-    push        rbp
-    mov         r10,  rdx
-    mov         r11,  rcx
-    mov         rcx,  rdi
-    mov         rdx,  rsi
-    mov         rsi,  r10
-    mov         r10,  r9
-    mov         rbp,  r8
-    mov         r8,   rsi
-    mov         r9,   r11
-    sub         rsp,0C8h
-    pxor        xmm1,xmm1
-    mov         rbx,rcx
-    movsxd      r11,r8d
-    movsx       ecx,byte [r10]
-    movsx       r8d,byte [r10+2]
-    mov         rdi,rdx
-    movq        xmm2,[rbx]
-    movq        xmm9,[r11+rbx]
-    movsx       edx,byte [r10+1]
-    mov         word [rsp+2],cx
-    mov         word [rsp],cx
-    movsx       eax,byte [r10+3]
-    mov         word [rsp+6],dx
-    mov         word [rsp+4],dx
-    movdqa      xmm11,xmm1
-    mov         word [rsp+0Eh],ax
-    mov         word [rsp+0Ch],ax
-    lea         eax,[r11+r11]
-    movsxd      rcx,eax
-    mov         rax,rbx
-    mov         rdx,rdi
-    sub         rax,rcx
-    mov         word [rsp+0Ah],r8w
-    mov         word [rsp+8],r8w
-    movdqa      xmm6,[rsp]
-    movdqa      xmm7,xmm6
-    movq        xmm13, [rax]
-    mov         rax,rdi
-    sub         rax,rcx
-    mov         rcx,rbx
-    pcmpgtw     xmm7,xmm1
-    psubw       xmm11,xmm6
-    sub         rcx,r11
-    sub         rdx,r11
-    movq        xmm0,[rax]
-    movsx       eax,r9w
-    movq        xmm15,[rcx]
-    punpcklqdq  xmm13,xmm0
-    movq        xmm0, [rdx]
-    movdqa      xmm4,xmm13
-    punpcklqdq  xmm15,xmm0
-    movq        xmm0, [rdi]
-    punpcklbw   xmm4,xmm1
-    movdqa      xmm12,xmm15
-    punpcklqdq  xmm2,xmm0
-    movq        xmm0, [r11+rdi]
-    punpcklbw   xmm12,xmm1
-    movdqa      xmm14,xmm2
-    punpcklqdq  xmm9,xmm0
-    punpckhbw   xmm2,xmm1
-    punpcklbw   xmm14,xmm1
-    movd        xmm0,eax
-    mov         eax, ebp ; iBeta
-    punpckhbw   xmm13,xmm1
-    punpckhbw   xmm15,xmm1
-    movdqa      xmm3,xmm9
-    movdqa      [rsp+10h],xmm2
-    punpcklwd   xmm0,xmm0
-    punpckhbw   xmm9,xmm1
-    punpcklbw   xmm3,xmm1
-    movdqa      xmm1,xmm14
-    pshufd      xmm10,xmm0,0
-    movd        xmm0,eax
-    mov         eax,4
-    cwde
-    punpcklwd   xmm0,xmm0
-    pshufd      xmm8,xmm0,0
-    movd        xmm0,eax
-    punpcklwd   xmm0,xmm0
-    pshufd      xmm5,xmm0,0
-    psubw       xmm1,xmm12
-    movdqa      xmm2,xmm10
-    lea         r11,[rsp+0C8h]
-    psllw       xmm1,2
-    movdqa      xmm0,xmm4
-    psubw       xmm4,xmm12
-    psubw       xmm0,xmm3
-    psubw       xmm3,xmm14
-    paddw       xmm1,xmm0
-    paddw       xmm1,xmm5
-    movdqa      xmm0,xmm11
-    psraw       xmm1,3
-    pmaxsw      xmm0,xmm1
-    pminsw      xmm6,xmm0
-    movdqa      xmm1,xmm8
-    movdqa      xmm0,xmm12
-    psubw       xmm0,xmm14
-    pabsw       xmm0,xmm0
-    pcmpgtw     xmm2,xmm0
-    pabsw       xmm0,xmm4
-    pcmpgtw     xmm1,xmm0
-    pabsw       xmm0,xmm3
-    movdqa      xmm3,[rsp]
-    pand        xmm2,xmm1
-    movdqa      xmm1,xmm8
-    pcmpgtw     xmm1,xmm0
-    movdqa      xmm0,xmm13
-    pand        xmm2,xmm1
-    psubw       xmm0,xmm9
-    psubw       xmm13,xmm15
-    pand        xmm2,xmm7
-    pand        xmm6,xmm2
-    paddw       xmm12,xmm6
-    psubw       xmm14,xmm6
-    movdqa      xmm2,[rsp+10h]
-    movaps      xmm6,[r11-18h]
-    movdqa      xmm1,xmm2
-    psubw       xmm1,xmm15
-    psubw       xmm9,xmm2
-    psllw       xmm1,2
-    paddw       xmm1,xmm0
-    paddw       xmm1,xmm5
-    movdqa      xmm0,xmm15
-    psubw       xmm0,xmm2
-    psraw       xmm1,3
-    pmaxsw      xmm11,xmm1
-    pabsw       xmm0,xmm0
-    movdqa      xmm1,xmm8
-    pcmpgtw     xmm10,xmm0
-    pabsw       xmm0,xmm13
-    pminsw      xmm3,xmm11
-    movaps      xmm11,[r11-68h]
-    movaps      xmm13,[rsp+40h]
-    pcmpgtw     xmm1,xmm0
-    pabsw       xmm0,xmm9
-    movaps      xmm9, [r11-48h]
-    pand        xmm10,xmm1
-    pcmpgtw     xmm8,xmm0
-    pand        xmm10,xmm8
-    pand        xmm10,xmm7
-    movaps      xmm8,[r11-38h]
-    movaps      xmm7,[r11-28h]
-    pand        xmm3,xmm10
-    paddw       xmm15,xmm3
-    psubw       xmm2,xmm3
-    movaps      xmm10,[r11-58h]
-    packuswb    xmm12,xmm15
-    movaps      xmm15,[rsp+20h]
-    packuswb    xmm14,xmm2
-    movq        [rcx],xmm12
-    movq        [rbx],xmm14
-    psrldq      xmm12,8
-    psrldq      xmm14,8
-    movq        [rdx],xmm12
-    movaps      xmm12,[r11-78h]
-    movq        [rdi],xmm14
-    movaps      xmm14,[rsp+30h]
-    mov         rsp,r11
-    pop         rbp
-    pop         rbx
-    ret
-
 WELS_EXTERN DeblockChromaEq4V_ssse3
     mov         rax,rsp
     push        rbx
@@ -2281,207 +2051,6 @@
     psrldq      xmm2,8
     movq        [ecx],xmm2
     pop         esi
-    mov         esp,ebp
-    pop         ebp
-    ret
-
-;******************************************************************************
-; void DeblockChromaLt4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-;                           int32_t iAlpha, int32_t iBeta, int8_t * pTC);
-;*******************************************************************************
-
-WELS_EXTERN DeblockChromaLt4V_ssse3
-    push        ebp
-    mov         ebp,esp
-    and         esp,0FFFFFFF0h
-    sub         esp,0E4h
-    push        ebx
-    push        esi
-    mov         esi, [ebp+1Ch]      ;  pTC
-    movsx       ebx, byte [esi+2]
-    push        edi
-    movsx       di,byte [esi+3]
-    mov         word [esp+0Ch],bx
-    movsx       bx,byte  [esi+1]
-    movsx       esi,byte  [esi]
-    mov         word  [esp+0Eh],si
-    movzx       esi,di
-    movd        xmm1,esi
-    movzx       esi,di
-    movd        xmm2,esi
-    mov         si,word  [esp+0Ch]
-    mov         edx, [ebp + 10h]
-    mov         eax, [ebp + 08h]
-    movzx       edi,si
-    movzx       esi,si
-    mov         ecx, [ebp + 0Ch]
-    movd        xmm4,esi
-    movzx       esi,bx
-    movd        xmm5,esi
-    movd        xmm3,edi
-    movzx       esi,bx
-    movd        xmm6,esi
-    mov         si,word [esp+0Eh]
-    movzx       edi,si
-    movzx       esi,si
-    punpcklwd   xmm6,xmm2
-    pxor        xmm0,xmm0
-    movdqa      [esp+40h],xmm0
-    movd        xmm7,edi
-    movd        xmm0,esi
-    lea         esi,[edx+edx]
-    mov         edi,eax
-    sub         edi,esi
-    punpcklwd   xmm5,xmm1
-    movdqa      xmm1,[esp+40h]
-    punpcklwd   xmm0,xmm4
-    movq        xmm4,[edx+ecx]
-    punpcklwd   xmm7,xmm3
-    movq        xmm3,[eax]
-    punpcklwd   xmm0,xmm6
-    movq        xmm6,[edi]
-    punpcklwd   xmm7,xmm5
-    punpcklwd   xmm0,xmm7
-    mov         edi,ecx
-    sub         edi,esi
-    movdqa      xmm2,xmm1
-    psubw       xmm2,xmm0
-    movdqa      [esp+60h],xmm2
-    movq        xmm2, [edi]
-    punpcklqdq  xmm6,xmm2
-    mov         esi,eax
-    sub         esi,edx
-    movq        xmm7,[esi]
-    mov         edi,ecx
-    sub         edi,edx
-    movq        xmm2,[edi]
-    punpcklqdq  xmm7,xmm2
-    movq        xmm2,[ecx]
-    punpcklqdq  xmm3,xmm2
-    movq        xmm2,[edx+eax]
-    movsx       edx,word [ebp + 14h]
-    punpcklqdq  xmm2,xmm4
-    movdqa      [esp+0E0h],xmm2
-    movd        xmm2,edx
-    movsx       edx,word [ebp + 18h]
-    movdqa      xmm4,xmm2
-    punpcklwd   xmm4,xmm2
-    movd        xmm2,edx
-    movdqa      xmm5,xmm2
-    punpcklwd   xmm5,xmm2
-    pshufd      xmm2,xmm5,0
-    movdqa      [esp+50h],xmm2
-    movdqa      xmm2,xmm6
-    punpcklbw   xmm2,xmm1
-    movdqa      [esp+0D0h],xmm3
-    pshufd      xmm4,xmm4,0
-    movdqa      [esp+30h],xmm2
-    punpckhbw   xmm6,xmm1
-    movdqa      [esp+80h],xmm6
-    movdqa      xmm6,[esp+0D0h]
-    punpckhbw   xmm6,xmm1
-    movdqa      [esp+70h],xmm6
-    movdqa      xmm6, [esp+0E0h]
-    punpckhbw   xmm6,xmm1
-    movdqa     [esp+90h],xmm6
-    movdqa      xmm5, [esp+0E0h]
-    movdqa      xmm2,xmm7
-    punpckhbw   xmm7,xmm1
-    punpcklbw   xmm5,xmm1
-    movdqa       [esp+0A0h],xmm7
-    punpcklbw   xmm3,xmm1
-    mov         edx,4
-    punpcklbw   xmm2,xmm1
-    movsx       edx,dx
-    movd        xmm6,edx
-    movdqa      xmm7,xmm6
-    punpcklwd   xmm7,xmm6
-    pshufd      xmm6,xmm7,0
-    movdqa      xmm7,[esp+30h]
-    movdqa      [esp+20h],xmm6
-    psubw       xmm7,xmm5
-    movdqa      xmm6,xmm0
-    pcmpgtw     xmm6,xmm1
-    movdqa      xmm1,[esp+60h]
-    movdqa      [esp+40h],xmm6
-    movdqa      xmm6,xmm3
-    psubw       xmm6,xmm2
-    psllw       xmm6,2
-    paddw       xmm6,xmm7
-    paddw       xmm6, [esp+20h]
-    movdqa      xmm7, [esp+50h]
-    psraw       xmm6,3
-    pmaxsw      xmm1,xmm6
-    movdqa      [esp+10h],xmm0
-    movdqa      xmm6, [esp+10h]
-    pminsw      xmm6,xmm1
-    movdqa      [esp+10h],xmm6
-    movdqa      xmm1,xmm2
-    psubw       xmm1,xmm3
-    pabsw       xmm1,xmm1
-    movdqa      xmm6,xmm4
-    pcmpgtw     xmm6,xmm1
-    movdqa      xmm1, [esp+30h]
-    psubw       xmm1,xmm2
-    pabsw       xmm1,xmm1
-    pcmpgtw     xmm7,xmm1
-    movdqa      xmm1,[esp+50h]
-    pand        xmm6,xmm7
-    movdqa      xmm7,[esp+50h]
-    psubw       xmm5,xmm3
-    pabsw       xmm5,xmm5
-    pcmpgtw     xmm1,xmm5
-    movdqa      xmm5,[esp+80h]
-    psubw       xmm5,[esp+90h]
-    pand        xmm6,xmm1
-    pand        xmm6,[esp+40h]
-    movdqa      xmm1,[esp+10h]
-    pand        xmm1,xmm6
-    movdqa      xmm6,[esp+70h]
-    movdqa      [esp+30h],xmm1
-    movdqa      xmm1,[esp+0A0h]
-    psubw       xmm6,xmm1
-    psllw       xmm6,2
-    paddw       xmm6,xmm5
-    paddw       xmm6,[esp+20h]
-    movdqa      xmm5,[esp+60h]
-    psraw       xmm6,3
-    pmaxsw      xmm5,xmm6
-    pminsw      xmm0,xmm5
-    movdqa      xmm5,[esp+70h]
-    movdqa      xmm6,xmm1
-    psubw       xmm6,xmm5
-    pabsw       xmm6,xmm6
-    pcmpgtw     xmm4,xmm6
-    movdqa      xmm6,[esp+80h]
-    psubw       xmm6,xmm1
-    pabsw       xmm6,xmm6
-    pcmpgtw     xmm7,xmm6
-    movdqa      xmm6,[esp+90h]
-    pand        xmm4,xmm7
-    movdqa      xmm7,[esp+50h]
-    psubw       xmm6,xmm5
-    pabsw       xmm6,xmm6
-    pcmpgtw     xmm7,xmm6
-    pand        xmm4,xmm7
-    pand        xmm4,[esp+40h]
-    pand        xmm0,xmm4
-    movdqa      xmm4,[esp+30h]
-    paddw       xmm2,xmm4
-    paddw       xmm1,xmm0
-    packuswb    xmm2,xmm1
-    movq        [esi],xmm2
-    psubw       xmm3,xmm4
-    psubw       xmm5,xmm0
-    packuswb    xmm3,xmm5
-    movq        [eax],xmm3
-    psrldq      xmm2,8
-    movq        [edi],xmm2
-    pop         edi
-    pop         esi
-    psrldq      xmm3,8
-    movq        [ecx],xmm3
-    pop         ebx
     mov         esp,ebp
     pop         ebp
     ret