shithub: openh264

ref: 85414c1904b17d3b6ed58b1e2b09dd5b6f99e288
dir: /codec/encoder/core/asm/deblock.asm/

View raw version
;*!
;* \copy
;*     Copyright (c)  2009-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  deblock.asm
;*
;*  Abstract
;*      edge loop
;*
;*  History
;*      08/07/2009 Created
;*
;*
;*************************************************************************/
%include "asm_inc.asm"
BITS 32                                 ; assemble everything below as 32-bit code

;*******************************************************************************
; Macros and other preprocessor constants
;*******************************************************************************

; Declare the read-only data section; the attributes differ when FORMAT_COFF
; is defined.  No constants are placed in it in this part of the file.
%ifdef FORMAT_COFF
SECTION .rodata pData
%else
SECTION .rodata align=16
%endif

; All code below is emitted into the code section.
SECTION .text

;********************************************************************************
;  void DeblockChromaEq4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
;                             int32_t iAlpha, int32_t iBeta)
;
;  Filters the edge between row (pPix - iStride) and row pPix in both chroma
;  planes, 8 pixels per plane.  With p1/p0 the two rows above the edge and
;  q0/q1 the two rows starting at pPix, each column is processed as:
;
;      if (|p0 - q0| < iAlpha && |p1 - p0| < iBeta && |q1 - q0| < iBeta) {
;          p0' = (2*p1 + p0 + q1 + 2) >> 2
;          q0' = (2*q1 + q0 + p1 + 2) >> 2
;      }
;
;  Only the p0 and q0 rows are written back.  Only the low 16 bits of iAlpha
;  and iBeta take part in the (signed 16-bit) comparisons.
;
;  Calling convention: arguments are read from the stack at [ebp+8]..[ebp+18h].
;  esi, edi and ebp are saved/restored; eax, ecx, edx, xmm0-xmm7 and the flags
;  are clobbered.
;
;  Instruction set: only SSE2 instructions are used.  The absolute value that
;  used to be computed with pabsw (an SSSE3 instruction, which faults on CPUs
;  that only have SSE2) is computed with DEBLOCK_EQ4V_ABSW below.
;
;  Stack frame (offsets from esp after both pushes, esp is 16-byte aligned):
;      [esp+00h]  saved edi        [esp+04h]  saved esi
;      [esp+10h]  p0 low words, later the rounding constant 2
;      [esp+20h]  q1 high words    [esp+30h]  q0 high words
;      [esp+40h]  p1 high words    [esp+50h]  q0 low words
;      [esp+60h]  p1 low words     [esp+70h]  scratch for DEBLOCK_EQ4V_ABSW
;  "low" = the 8 Cb pixels, "high" = the 8 Cr pixels, widened to 16 bits.
;********************************************************************************

; %1 = |%1| for eight signed words, SSE2 only.  Computes max(x, 0 - x), which
; matches pabsw for every input (including 8000h).  Uses [esp+70h] as scratch,
; so it may only be used after the two pushes in the prologue.
%macro DEBLOCK_EQ4V_ABSW 1
  movdqa      [esp+70h],%1
  pxor        %1,%1
  psubw       %1,[esp+70h]
  pmaxsw      %1,[esp+70h]
%endmacro

WELS_EXTERN   DeblockChromaEq4V_sse2

ALIGN  16
DeblockChromaEq4V_sse2:
  push        ebp
  mov         ebp,esp
  and         esp,0FFFFFFF0h
  sub         esp,78h            ;  68h of locals + 10h scratch slot for the abs macro
  mov         edx,[ebp+10h]      ;  iStride
  mov         eax,[ebp+8]        ;  pPixCb
  mov         ecx,[ebp+0Ch]      ;  pPixCr
  movq        xmm4,[ecx]         ;  q0 (Cr)
  movq        xmm5,[edx+ecx]     ;  q1 (Cr)
  push        esi
  push        edi                ;  esp is 16-byte aligned again from here on
  lea         esi,[edx+edx]      ;  2 * iStride
  mov         edi,eax
  sub         edi,esi
  movq        xmm1,[edi]         ;  p1 (Cb)
  mov         edi,ecx
  sub         edi,esi
  movq        xmm2,[edi]         ;  p1 (Cr)
  punpcklqdq  xmm1,xmm2          ;  xmm1 = p1: Cb in low 8 bytes, Cr in high 8 bytes
  mov         esi,eax
  sub         esi,edx            ;  esi = address of the Cb p0 row (kept for the store)
  movq        xmm2,[esi]
  mov         edi,ecx
  sub         edi,edx            ;  edi = address of the Cr p0 row (kept for the store)
  movq        xmm3,[edi]
  punpcklqdq  xmm2,xmm3          ;  xmm2 = p0 (Cb | Cr)
  movq        xmm3,[eax]
  punpcklqdq  xmm3,xmm4          ;  xmm3 = q0 (Cb | Cr)
  movq        xmm4,[edx+eax]
  mov       edx, [ebp + 14h]     ;  iAlpha
  punpcklqdq  xmm4,xmm5          ;  xmm4 = q1 (Cb | Cr)
  movd        xmm5,edx
  mov       edx, [ebp + 18h]     ;  iBeta
  pxor        xmm0,xmm0          ;  zero, used to widen bytes to words
  movdqa      xmm6,xmm5
  punpcklwd   xmm6,xmm5
  pshufd      xmm5,xmm6,0        ;  xmm5 = low word of iAlpha in all 8 lanes
  movd        xmm6,edx
  movdqa      xmm7,xmm6
  punpcklwd   xmm7,xmm6
  pshufd      xmm6,xmm7,0        ;  xmm6 = low word of iBeta in all 8 lanes
  ; Widen every row to 16-bit lanes (low half = Cb, high half = Cr).
  movdqa      xmm7,xmm1
  punpckhbw   xmm1,xmm0
  punpcklbw   xmm7,xmm0
  movdqa      [esp+40h],xmm1     ;  p1 high
  movdqa      [esp+60h],xmm7     ;  p1 low
  movdqa      xmm7,xmm2
  punpcklbw   xmm7,xmm0
  movdqa      [esp+10h],xmm7     ;  p0 low
  movdqa      xmm7,xmm3
  punpcklbw   xmm7,xmm0
  punpckhbw   xmm3,xmm0
  movdqa      [esp+50h],xmm7     ;  q0 low
  movdqa      xmm7,xmm4
  punpckhbw   xmm4,xmm0          ;  xmm4 = q1 high
  punpckhbw   xmm2,xmm0          ;  xmm2 = p0 high
  punpcklbw   xmm7,xmm0          ;  xmm7 = q1 low
  movdqa      [esp+30h],xmm3     ;  q0 high
  movdqa      xmm3,[esp+10h]     ;  xmm3 = p0 low
  ; Low-half filter mask -> xmm0.
  movdqa      xmm1,xmm3
  psubw       xmm1,[esp+50h]
  DEBLOCK_EQ4V_ABSW xmm1         ;  |p0 - q0|
  movdqa      [esp+20h],xmm4     ;  q1 high
  movdqa      xmm0,xmm5
  pcmpgtw     xmm0,xmm1          ;  alpha > |p0 - q0|
  movdqa      xmm1,[esp+60h]
  psubw       xmm1,xmm3
  DEBLOCK_EQ4V_ABSW xmm1         ;  |p1 - p0|
  movdqa      xmm4,xmm6
  pcmpgtw     xmm4,xmm1          ;  beta > |p1 - p0|
  pand        xmm0,xmm4
  movdqa      xmm1,xmm7
  psubw       xmm1,[esp+50h]
  DEBLOCK_EQ4V_ABSW xmm1         ;  |q1 - q0|
  movdqa      xmm4,xmm6
  pcmpgtw     xmm4,xmm1          ;  beta > |q1 - q0| (and-ed into xmm0 below)
  ; High-half filter mask -> xmm5 (alpha register is consumed here).
  movdqa      xmm1,xmm2
  psubw       xmm1,[esp+30h]
  DEBLOCK_EQ4V_ABSW xmm1         ;  |p0 - q0|
  pcmpgtw     xmm5,xmm1
  movdqa      xmm1,[esp+40h]
  pand        xmm0,xmm4          ;  low-half mask complete
  psubw       xmm1,xmm2
  DEBLOCK_EQ4V_ABSW xmm1         ;  |p1 - p0|
  movdqa      xmm4,xmm6
  pcmpgtw     xmm4,xmm1
  movdqa      xmm1,[esp+20h]
  psubw       xmm1,[esp+30h]
  pand        xmm5,xmm4
  DEBLOCK_EQ4V_ABSW xmm1         ;  |q1 - q0|
  pcmpgtw     xmm6,xmm1
  pand        xmm5,xmm6          ;  high-half mask complete
  ; Broadcast the rounding constant 2 and park it in [esp+10h].
  mov         edx,2
  movsx       edx,dx
  movd        xmm1,edx
  movdqa      xmm4,xmm1
  punpcklwd   xmm4,xmm1
  pshufd      xmm1,xmm4,0
  ; p0' low = (2*p1 + p0 + q1 + 2) >> 2, selected by the mask.
  movdqa      xmm4,[esp+60h]
  movdqa      xmm6,xmm4
  paddw       xmm6,xmm4
  paddw       xmm6,xmm3
  paddw       xmm6,xmm7
  movdqa      [esp+10h],xmm1
  paddw       xmm6,[esp+10h]
  psraw       xmm6,2
  movdqa      xmm4,xmm0
  pandn       xmm4,xmm3          ;  keep the original p0 where the mask is clear
  movdqa      xmm3,[esp+40h]
  movdqa      xmm1,xmm0
  pand        xmm1,xmm6
  por         xmm1,xmm4
  ; p0' high.
  movdqa      xmm6,xmm3
  paddw       xmm6,xmm3
  movdqa      xmm3,[esp+10h]     ;  xmm3 = constant 2 from here on
  paddw       xmm6,xmm2
  paddw       xmm6,[esp+20h]
  paddw       xmm6,xmm3
  psraw       xmm6,2
  movdqa      xmm4,xmm5
  pand        xmm4,xmm6
  movdqa      xmm6,xmm5
  pandn       xmm6,xmm2
  por         xmm4,xmm6
  packuswb    xmm1,xmm4          ;  xmm1 = new p0 row (Cb | Cr) as bytes
  ; q0' low = (2*q1 + q0 + p1 + 2) >> 2, selected by the mask.
  movdqa      xmm4,[esp+50h]
  movdqa      xmm6,xmm7
  paddw       xmm6,xmm7
  paddw       xmm6,xmm4
  paddw       xmm6,[esp+60h]
  paddw       xmm6,xmm3
  psraw       xmm6,2
  movdqa      xmm2,xmm0
  pand        xmm2,xmm6
  pandn       xmm0,xmm4
  por         xmm2,xmm0
  ; q0' high.
  movdqa      xmm0,[esp+20h]
  movdqa      xmm6,xmm0
  paddw       xmm6,xmm0
  movdqa      xmm0,[esp+30h]
  paddw       xmm6,xmm0
  paddw       xmm6,[esp+40h]
  movdqa      xmm4,xmm5
  paddw       xmm6,xmm3
  movq        [esi],xmm1         ;  store Cb p0'
  psraw       xmm6,2
  pand        xmm4,xmm6
  pandn       xmm5,xmm0
  por         xmm4,xmm5
  packuswb    xmm2,xmm4          ;  xmm2 = new q0 row (Cb | Cr) as bytes
  movq        [eax],xmm2         ;  store Cb q0'
  psrldq      xmm1,8
  movq        [edi],xmm1         ;  store Cr p0'
  pop         edi
  psrldq      xmm2,8
  movq        [ecx],xmm2         ;  store Cr q0'
  pop         esi
  mov         esp,ebp
  pop         ebp
  ret

;******************************************************************************
; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
;                           int32_t iAlpha, int32_t iBeta, int8_t * pTC);
;
; Filters the edge between row (pPix - iStride) and row pPix in both chroma
; planes, 8 pixels per plane, with a clipped correction.  With p1/p0 above the
; edge and q0/q1 starting at pPix, and tc = pTC[column / 2] (each of the four
; pTC entries covers two adjacent columns, the same four entries are used for
; Cb and Cr), every column is processed as:
;
;     delta = clip(-tc, tc, (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)
;     if (|p0 - q0| < iAlpha && |p1 - p0| < iBeta && |q1 - q0| < iBeta && tc > 0) {
;         p0' = sat_u8(p0 + delta)
;         q0' = sat_u8(q0 - delta)
;     }
;
; Only the p0 and q0 rows are written back.  Only the low 16 bits of iAlpha and
; iBeta take part in the (signed 16-bit) comparisons.
;
; Calling convention: arguments are read from the stack at [ebp+8]..[ebp+1Ch].
; ebx, esi, edi and ebp are saved/restored; eax, ecx, edx, xmm0-xmm7 and the
; flags are clobbered.
;
; Instruction set: only SSE2 instructions are used.  The absolute value that
; used to be computed with pabsw (an SSSE3 instruction, which faults on CPUs
; that only have SSE2) is computed with DEBLOCK_LT4V_ABSW below.
;
; Stack frame (offsets from esp after the three pushes, esp 16-byte aligned):
;     [esp+00h..0Bh] saved edi/esi/ebx   [esp+0Ch],[esp+0Eh] temporary tc words
;     [esp+10h] clipped delta (low)      [esp+20h] constant 4
;     [esp+30h] p1 low, later masked delta (low)
;     [esp+40h] zero, later the tc > 0 mask
;     [esp+50h] beta                     [esp+60h] -tc
;     [esp+70h] q0 high                  [esp+80h] p1 high
;     [esp+90h] q1 high                  [esp+0A0h] p0 high
;     [esp+0B0h] scratch for DEBLOCK_LT4V_ABSW
;     [esp+0D0h] q0 bytes                [esp+0E0h] q1 bytes
; "low" = the 8 Cb pixels, "high" = the 8 Cr pixels, widened to 16 bits.
;*******************************************************************************

; %1 = |%1| for eight signed words, SSE2 only.  Computes max(x, 0 - x), which
; matches pabsw for every input (including 8000h).  Uses the otherwise unused
; slot [esp+0B0h], so it may only be used after the three pushes.
%macro DEBLOCK_LT4V_ABSW 1
  movdqa      [esp+0B0h],%1
  pxor        %1,%1
  psubw       %1,[esp+0B0h]
  pmaxsw      %1,[esp+0B0h]
%endmacro

WELS_EXTERN  DeblockChromaLt4V_sse2

ALIGN  16
DeblockChromaLt4V_sse2:
  push        ebp
  mov         ebp,esp
  and         esp,0FFFFFFF0h
  sub         esp,0E4h
  push        ebx
  push        esi
  mov         esi, [ebp+1Ch]      ;  pTC
  movsx       ebx, byte [esi+2]
  push        edi                 ;  esp is 16-byte aligned from here on
  ; Gather pTC[0..3] (sign-extended to words) and interleave them into
  ; xmm0 = (tc0,tc0,tc1,tc1,tc2,tc2,tc3,tc3).
  movsx       di,byte [esi+3]
  mov         word [esp+0Ch],bx   ;  tc2
  movsx       bx,byte  [esi+1]    ;  tc1
  movsx       esi,byte  [esi]
  mov         word  [esp+0Eh],si  ;  tc0
  movzx       esi,di
  movd        xmm1,esi
  movzx       esi,di
  movd        xmm2,esi
  mov         si,word  [esp+0Ch]
  mov         edx, [ebp + 10h]    ;  iStride
  mov         eax, [ebp + 08h]    ;  pPixCb
  movzx       edi,si
  movzx       esi,si
  mov         ecx, [ebp + 0Ch]    ;  pPixCr
  movd        xmm4,esi
  movzx       esi,bx
  movd        xmm5,esi
  movd        xmm3,edi
  movzx       esi,bx
  movd        xmm6,esi
  mov         si,word [esp+0Eh]
  movzx       edi,si
  movzx       esi,si
  punpcklwd   xmm6,xmm2
  pxor        xmm0,xmm0
  movdqa      [esp+40h],xmm0      ;  zero vector
  movd        xmm7,edi
  movd        xmm0,esi
  lea         esi,[edx+edx]       ;  2 * iStride
  mov         edi,eax
  sub         edi,esi
  punpcklwd   xmm5,xmm1
  movdqa      xmm1,[esp+40h]      ;  xmm1 = 0
  punpcklwd   xmm0,xmm4
  movq        xmm4,[edx+ecx]      ;  q1 (Cr)
  punpcklwd   xmm7,xmm3
  movq        xmm3,[eax]          ;  q0 (Cb)
  punpcklwd   xmm0,xmm6
  movq        xmm6,[edi]          ;  p1 (Cb)
  punpcklwd   xmm7,xmm5
  punpcklwd   xmm0,xmm7           ;  xmm0 = tc per column
  mov         edi,ecx
  sub         edi,esi
  movdqa      xmm2,xmm1
  psubw       xmm2,xmm0
  movdqa      [esp+60h],xmm2      ;  -tc
  movq        xmm2, [edi]         ;  p1 (Cr)
  punpcklqdq  xmm6,xmm2           ;  xmm6 = p1 (Cb | Cr)
  mov         esi,eax
  sub         esi,edx             ;  esi = address of the Cb p0 row (kept for the store)
  movq        xmm7,[esi]
  mov         edi,ecx
  sub         edi,edx             ;  edi = address of the Cr p0 row (kept for the store)
  movq        xmm2,[edi]
  punpcklqdq  xmm7,xmm2           ;  xmm7 = p0 (Cb | Cr)
  movq        xmm2,[ecx]
  punpcklqdq  xmm3,xmm2           ;  xmm3 = q0 (Cb | Cr)
  movq        xmm2,[edx+eax]
  movsx       edx,word [ebp + 14h] ;  low word of iAlpha
  punpcklqdq  xmm2,xmm4           ;  xmm2 = q1 (Cb | Cr)
  movdqa      [esp+0E0h],xmm2
  movd        xmm2,edx
  movsx       edx,word [ebp + 18h] ;  low word of iBeta
  movdqa      xmm4,xmm2
  punpcklwd   xmm4,xmm2
  movd        xmm2,edx
  movdqa      xmm5,xmm2
  punpcklwd   xmm5,xmm2
  pshufd      xmm2,xmm5,0
  movdqa      [esp+50h],xmm2      ;  beta in all 8 lanes
  ; Widen every row to 16-bit lanes (low half = Cb, high half = Cr).
  movdqa      xmm2,xmm6
  punpcklbw   xmm2,xmm1
  movdqa      [esp+0D0h],xmm3
  pshufd      xmm4,xmm4,0         ;  xmm4 = alpha in all 8 lanes
  movdqa      [esp+30h],xmm2      ;  p1 low
  punpckhbw   xmm6,xmm1
  movdqa      [esp+80h],xmm6      ;  p1 high
  movdqa      xmm6,[esp+0D0h]
  punpckhbw   xmm6,xmm1
  movdqa      [esp+70h],xmm6      ;  q0 high
  movdqa      xmm6, [esp+0E0h]
  punpckhbw   xmm6,xmm1
  movdqa     [esp+90h],xmm6       ;  q1 high
  movdqa      xmm5, [esp+0E0h]
  movdqa      xmm2,xmm7
  punpckhbw   xmm7,xmm1
  punpcklbw   xmm5,xmm1           ;  xmm5 = q1 low
  movdqa       [esp+0A0h],xmm7    ;  p0 high
  punpcklbw   xmm3,xmm1           ;  xmm3 = q0 low
  mov         edx,4
  punpcklbw   xmm2,xmm1           ;  xmm2 = p0 low
  movsx       edx,dx
  movd        xmm6,edx
  movdqa      xmm7,xmm6
  punpcklwd   xmm7,xmm6
  pshufd      xmm6,xmm7,0
  movdqa      xmm7,[esp+30h]
  movdqa      [esp+20h],xmm6      ;  constant 4 in all 8 lanes
  psubw       xmm7,xmm5           ;  p1 - q1 (low)
  movdqa      xmm6,xmm0
  pcmpgtw     xmm6,xmm1           ;  tc > 0
  movdqa      xmm1,[esp+60h]      ;  xmm1 = -tc
  movdqa      [esp+40h],xmm6      ;  tc > 0 mask
  ; Low half: delta = min(tc, max(-tc, (((q0-p0)<<2) + (p1-q1) + 4) >> 3)).
  movdqa      xmm6,xmm3
  psubw       xmm6,xmm2
  psllw       xmm6,2
  paddw       xmm6,xmm7
  paddw       xmm6, [esp+20h]
  movdqa      xmm7, [esp+50h]
  psraw       xmm6,3
  pmaxsw      xmm1,xmm6
  movdqa      [esp+10h],xmm0
  movdqa      xmm6, [esp+10h]
  pminsw      xmm6,xmm1
  movdqa      [esp+10h],xmm6      ;  clipped delta (low)
  ; Low-half filter mask -> xmm6.
  movdqa      xmm1,xmm2
  psubw       xmm1,xmm3
  DEBLOCK_LT4V_ABSW xmm1          ;  |p0 - q0|
  movdqa      xmm6,xmm4
  pcmpgtw     xmm6,xmm1
  movdqa      xmm1, [esp+30h]
  psubw       xmm1,xmm2
  DEBLOCK_LT4V_ABSW xmm1          ;  |p1 - p0|
  pcmpgtw     xmm7,xmm1
  movdqa      xmm1,[esp+50h]
  pand        xmm6,xmm7
  movdqa      xmm7,[esp+50h]
  psubw       xmm5,xmm3
  DEBLOCK_LT4V_ABSW xmm5          ;  |q1 - q0|
  pcmpgtw     xmm1,xmm5
  movdqa      xmm5,[esp+80h]
  psubw       xmm5,[esp+90h]      ;  p1 - q1 (high)
  pand        xmm6,xmm1
  pand        xmm6,[esp+40h]
  movdqa      xmm1,[esp+10h]
  pand        xmm1,xmm6
  movdqa      xmm6,[esp+70h]
  movdqa      [esp+30h],xmm1      ;  masked delta (low)
  ; High half: same clipped delta, left in xmm0.
  movdqa      xmm1,[esp+0A0h]     ;  xmm1 = p0 high
  psubw       xmm6,xmm1
  psllw       xmm6,2
  paddw       xmm6,xmm5
  paddw       xmm6,[esp+20h]
  movdqa      xmm5,[esp+60h]
  psraw       xmm6,3
  pmaxsw      xmm5,xmm6
  pminsw      xmm0,xmm5
  ; High-half filter mask -> xmm4.
  movdqa      xmm5,[esp+70h]      ;  xmm5 = q0 high
  movdqa      xmm6,xmm1
  psubw       xmm6,xmm5
  DEBLOCK_LT4V_ABSW xmm6          ;  |p0 - q0|
  pcmpgtw     xmm4,xmm6
  movdqa      xmm6,[esp+80h]
  psubw       xmm6,xmm1
  DEBLOCK_LT4V_ABSW xmm6          ;  |p1 - p0|
  pcmpgtw     xmm7,xmm6
  movdqa      xmm6,[esp+90h]
  pand        xmm4,xmm7
  movdqa      xmm7,[esp+50h]
  psubw       xmm6,xmm5
  DEBLOCK_LT4V_ABSW xmm6          ;  |q1 - q0|
  pcmpgtw     xmm7,xmm6
  pand        xmm4,xmm7
  pand        xmm4,[esp+40h]
  pand        xmm0,xmm4           ;  masked delta (high)
  ; Apply the correction, saturate to bytes and store.
  movdqa      xmm4,[esp+30h]
  paddw       xmm2,xmm4
  paddw       xmm1,xmm0
  packuswb    xmm2,xmm1           ;  new p0 row (Cb | Cr)
  movq        [esi],xmm2          ;  store Cb p0'
  psubw       xmm3,xmm4
  psubw       xmm5,xmm0
  packuswb    xmm3,xmm5           ;  new q0 row (Cb | Cr)
  movq        [eax],xmm3          ;  store Cb q0'
  psrldq      xmm2,8
  movq        [edi],xmm2          ;  store Cr p0'
  pop         edi
  pop         esi
  psrldq      xmm3,8
  movq        [ecx],xmm3          ;  store Cr q0'
  pop         ebx
  mov         esp,ebp
  pop         ebp
  ret
  
;***************************************************************************
;  void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
;          int32_t iAlpha, int32_t iBeta)
;
;  Filters the edge between column (pPix - 1) and column pPix over 8 rows of
;  both chroma planes.  For every row the four bytes at pPix - 2 are read as
;  p1, p0, q0, q1 and processed as:
;
;      if (|p0 - q0| < iAlpha && |p1 - p0| < iBeta && |q1 - q0| < iBeta) {
;          p0' = (2*p1 + p0 + q1 + 2) >> 2
;          q0' = (2*q1 + q0 + p1 + 2) >> 2
;      }
;
;  The 16 four-byte rows (8 Cb, 8 Cr) are transposed into four 16-byte columns
;  (p1, p0, q0, q1; Cb rows 0-7 in the low half, Cr rows 0-7 in the high half),
;  filtered, transposed back and written out as four bytes per row (p1 and q1
;  are rewritten with their original values).  Only the low 16 bits of iAlpha
;  and iBeta take part in the (signed 16-bit) comparisons.
;
;  Calling convention: arguments are read from the stack at [ebp+8]..[ebp+18h].
;  esi, edi and ebp are saved/restored; eax, ecx, edx, xmm0-xmm7 and the flags
;  are clobbered.
;
;  Instruction set: only SSE2 instructions are used.  The absolute value that
;  used to be computed with pabsw (an SSSE3 instruction, which faults on CPUs
;  that only have SSE2) is computed with DEBLOCK_EQ4H_ABSW below.
;
;  Stack frame (offsets from esp after both pushes, esp is 16-byte aligned):
;      [esp+08h] pPixCr - 2               [esp+0Ch] pPixCr - 2 + 4*iStride
;      [esp+10h] address of the column buffer (esp+80h)
;      [esp+14h] 3*iStride                [esp+18h] pPixCb - 2 + 4*iStride
;      [esp+1Ch] pPixCb - 2               [esp+20h] constant 2
;      [esp+30h] p0 high  [esp+40h] q0 high  [esp+50h] q1 low
;      [esp+60h] p1 high  [esp+70h] q1 high
;      [esp+80h] p1 bytes [esp+90h] p0 bytes [esp+0A0h] q0 bytes [esp+0B0h] q1 bytes
;      [esp+0C0h] scratch for DEBLOCK_EQ4H_ABSW
;***************************************************************************

; %1 = |%1| for eight signed words, SSE2 only.  Computes max(x, 0 - x), which
; matches pabsw for every input (including 8000h).  Uses the otherwise unused
; slot [esp+0C0h], so it may only be used after the two pushes.
%macro DEBLOCK_EQ4H_ABSW 1
  movdqa      [esp+0C0h],%1
  pxor        %1,%1
  psubw       %1,[esp+0C0h]
  pmaxsw      %1,[esp+0C0h]
%endmacro

WELS_EXTERN     DeblockChromaEq4H_sse2

ALIGN  16

DeblockChromaEq4H_sse2:
  push        ebp
  mov         ebp,esp
  and         esp,0FFFFFFF0h
  sub         esp,0C8h
  mov         ecx,dword [ebp+8]       ; pPixCb
  mov         edx,dword [ebp+0Ch]     ; pPixCr
  mov         eax,dword [ebp+10h]     ; iStride
  sub         ecx,2                   ; rows are read starting two pixels left of the edge
  sub         edx,2
  push        esi
  lea         esi,[eax+eax*2]         ; 3 * iStride
  mov         dword [esp+18h],ecx     ; becomes [esp+1Ch] after the next push
  mov         dword [esp+4],edx       ; becomes [esp+8] after the next push
  lea         ecx,[ecx+eax*4]         ; Cb row 4
  lea         edx,[edx+eax*4]         ; Cr row 4
  lea         eax,[esp+7Ch]           ; 16-byte aligned column buffer (= esp+80h after the push)
  push        edi
  mov         dword [esp+14h],esi
  mov         dword [esp+18h],ecx
  mov         dword [esp+0Ch],edx
  mov         dword [esp+10h],eax
  ; Gather 4 bytes from each of the 16 rows:
  ; xmmN (N = 0..3) = (Cb row N, Cr row N, Cb row N+4, Cr row N+4).
  mov         esi,dword [esp+1Ch]
  mov         ecx,dword [ebp+10h]
  mov         edx,dword [esp+14h]
  movd        xmm0,dword [esi]
  movd        xmm1,dword [esi+ecx]
  movd        xmm2,dword [esi+ecx*2]
  movd        xmm3,dword [esi+edx]
  mov         esi,dword  [esp+8]
  movd        xmm4,dword [esi]
  movd        xmm5,dword [esi+ecx]
  movd        xmm6,dword [esi+ecx*2]
  movd        xmm7,dword [esi+edx]
  punpckldq   xmm0,xmm4
  punpckldq   xmm1,xmm5
  punpckldq   xmm2,xmm6
  punpckldq   xmm3,xmm7
  mov         esi,dword [esp+18h]
  mov         edi,dword [esp+0Ch]
  movd        xmm4,dword [esi]
  movd        xmm5,dword [edi]
  punpckldq   xmm4,xmm5
  punpcklqdq  xmm0,xmm4
  movd        xmm4,dword [esi+ecx]
  movd        xmm5,dword [edi+ecx]
  punpckldq   xmm4,xmm5
  punpcklqdq  xmm1,xmm4
  movd        xmm4,dword [esi+ecx*2]
  movd        xmm5,dword [edi+ecx*2]
  punpckldq   xmm4,xmm5
  punpcklqdq  xmm2,xmm4
  movd        xmm4,dword [esi+edx]
  movd        xmm5,dword [edi+edx]
  punpckldq   xmm4,xmm5
  punpcklqdq  xmm3,xmm4
  ; Transpose rows -> columns: xmm0 = p1, xmm5 = p0, xmm1 = q0, xmm6 = q1.
  movdqa      xmm6,xmm0
  punpcklbw   xmm0,xmm1
  punpckhbw   xmm6,xmm1
  movdqa      xmm7,xmm2
  punpcklbw   xmm2,xmm3
  punpckhbw   xmm7,xmm3
  movdqa      xmm4,xmm0
  movdqa      xmm5,xmm6
  punpcklwd   xmm0,xmm2
  punpckhwd   xmm4,xmm2
  punpcklwd   xmm6,xmm7
  punpckhwd   xmm5,xmm7
  movdqa      xmm1,xmm0
  movdqa      xmm2,xmm4
  punpckldq   xmm0,xmm6
  punpckhdq   xmm1,xmm6
  punpckldq   xmm4,xmm5
  punpckhdq   xmm2,xmm5
  movdqa      xmm5,xmm0
  movdqa      xmm6,xmm1
  punpcklqdq  xmm0,xmm4
  punpckhqdq  xmm5,xmm4
  punpcklqdq  xmm1,xmm2
  punpckhqdq  xmm6,xmm2
  mov         edi,dword [esp+10h]
  movdqa      [edi],xmm0              ; p1 column -> [esp+80h]
  movdqa      [edi+10h],xmm5          ; p0 column -> [esp+90h]
  movdqa      [edi+20h],xmm1          ; q0 column -> [esp+0A0h]
  movdqa      [edi+30h],xmm6          ; q1 column -> [esp+0B0h]
  movsx       ecx,word [ebp+14h]      ; low word of iAlpha
  movsx       edx,word [ebp+18h]      ; low word of iBeta
  movdqa      xmm6,[esp+80h]
  movdqa      xmm4,[esp+90h]
  movdqa      xmm5,[esp+0A0h]
  movdqa      xmm7,[esp+0B0h]
  pxor        xmm0,xmm0               ; zero, used to widen bytes to words
  movd        xmm1,ecx
  movdqa      xmm2,xmm1
  punpcklwd   xmm2,xmm1
  pshufd      xmm1,xmm2,0             ; xmm1 = alpha in all 8 lanes
  movd        xmm2,edx
  movdqa      xmm3,xmm2
  punpcklwd   xmm3,xmm2
  pshufd      xmm2,xmm3,0             ; xmm2 = beta in all 8 lanes
  ; Widen the columns to 16-bit lanes (low half = Cb rows, high half = Cr rows).
  movdqa      xmm3,xmm6
  punpckhbw   xmm6,xmm0
  movdqa      [esp+60h],xmm6          ; p1 high
  movdqa      xmm6,[esp+90h]
  punpckhbw   xmm6,xmm0
  movdqa      [esp+30h],xmm6          ; p0 high
  movdqa      xmm6,[esp+0A0h]
  punpckhbw   xmm6,xmm0
  movdqa      [esp+40h],xmm6          ; q0 high
  movdqa      xmm6,[esp+0B0h]
  punpckhbw   xmm6,xmm0
  movdqa      [esp+70h],xmm6          ; q1 high
  punpcklbw   xmm7,xmm0
  punpcklbw   xmm4,xmm0               ; xmm4 = p0 low
  punpcklbw   xmm5,xmm0               ; xmm5 = q0 low
  punpcklbw   xmm3,xmm0               ; xmm3 = p1 low
  movdqa      [esp+50h],xmm7          ; q1 low
  ; Low-half filter mask -> xmm0.
  movdqa      xmm6,xmm4
  psubw       xmm6,xmm5
  DEBLOCK_EQ4H_ABSW xmm6              ; |p0 - q0|
  movdqa      xmm0,xmm1
  pcmpgtw     xmm0,xmm6
  movdqa      xmm6,xmm3
  psubw       xmm6,xmm4
  DEBLOCK_EQ4H_ABSW xmm6              ; |p1 - p0|
  movdqa      xmm7,xmm2
  pcmpgtw     xmm7,xmm6
  movdqa      xmm6,[esp+50h]
  psubw       xmm6,xmm5
  DEBLOCK_EQ4H_ABSW xmm6              ; |q1 - q0|
  pand        xmm0,xmm7
  movdqa      xmm7,xmm2
  pcmpgtw     xmm7,xmm6
  ; High-half filter mask -> xmm1 (alpha register is consumed here).
  movdqa      xmm6,[esp+30h]
  psubw       xmm6,[esp+40h]
  DEBLOCK_EQ4H_ABSW xmm6              ; |p0 - q0|
  pcmpgtw     xmm1,xmm6
  movdqa      xmm6,[esp+60h]
  psubw       xmm6,[esp+30h]
  DEBLOCK_EQ4H_ABSW xmm6              ; |p1 - p0|
  pand        xmm0,xmm7               ; low-half mask complete
  movdqa      xmm7,xmm2
  pcmpgtw     xmm7,xmm6
  movdqa      xmm6,[esp+70h]
  psubw       xmm6,[esp+40h]
  DEBLOCK_EQ4H_ABSW xmm6              ; |q1 - q0|
  pand        xmm1,xmm7
  pcmpgtw     xmm2,xmm6
  pand        xmm1,xmm2               ; high-half mask complete
  ; Broadcast the rounding constant 2.
  mov         eax,2
  movsx       ecx,ax
  movd        xmm2,ecx
  movdqa      xmm6,xmm2
  punpcklwd   xmm6,xmm2
  pshufd      xmm2,xmm6,0
  movdqa      [esp+20h],xmm2
  ; p0' = (2*p1 + p0 + q1 + 2) >> 2 where the mask is set, else p0.
  movdqa      xmm2,xmm3
  paddw       xmm2,xmm3
  paddw       xmm2,xmm4
  paddw       xmm2,[esp+50h]
  paddw       xmm2,[esp+20h]
  psraw       xmm2,2
  movdqa      xmm6,xmm0
  pand        xmm6,xmm2
  movdqa      xmm2,xmm0
  pandn       xmm2,xmm4
  por         xmm6,xmm2
  movdqa      xmm2,[esp+60h]
  movdqa      xmm7,xmm2
  paddw       xmm7,xmm2
  paddw       xmm7,[esp+30h]
  paddw       xmm7,[esp+70h]
  paddw       xmm7,[esp+20h]
  movdqa      xmm4,xmm1
  movdqa      xmm2,xmm1
  pandn       xmm2,[esp+30h]
  psraw       xmm7,2
  pand        xmm4,xmm7
  por         xmm4,xmm2
  movdqa      xmm2,[esp+50h]
  packuswb    xmm6,xmm4
  movdqa      [esp+90h],xmm6          ; new p0 column replaces the old one in the buffer
  ; q0' = (2*q1 + q0 + p1 + 2) >> 2 where the mask is set, else q0.
  movdqa      xmm6,xmm2
  paddw       xmm6,xmm2
  movdqa      xmm2,[esp+20h]
  paddw       xmm6,xmm5
  paddw       xmm6,xmm3
  movdqa      xmm4,xmm0
  pandn       xmm0,xmm5
  paddw       xmm6,xmm2
  psraw       xmm6,2
  pand        xmm4,xmm6
  por         xmm4,xmm0
  movdqa      xmm0,[esp+70h]
  movdqa      xmm5,xmm0
  paddw       xmm5,xmm0
  movdqa      xmm0,[esp+40h]
  paddw       xmm5,xmm0
  paddw       xmm5,[esp+60h]
  movdqa      xmm3,xmm1
  paddw       xmm5,xmm2
  psraw       xmm5,2
  pand        xmm3,xmm5
  pandn       xmm1,xmm0
  por         xmm3,xmm1
  packuswb    xmm4,xmm3
  movdqa      [esp+0A0h],xmm4         ; new q0 column replaces the old one in the buffer
  ; Transpose the four columns back into rows:
  ; xmm0 = rows 0/4, xmm5 = rows 1/5, xmm1 = rows 2/6, xmm6 = rows 3/7,
  ; each as (Cb row, Cr row, Cb row+4, Cr row+4).
  mov         esi,dword [esp+10h]
  movdqa      xmm0,[esi]
  movdqa      xmm1,[esi+10h]
  movdqa      xmm2,[esi+20h]
  movdqa      xmm3,[esi+30h]
  movdqa      xmm6,xmm0
  punpcklbw   xmm0,xmm1
  punpckhbw   xmm6,xmm1
  movdqa      xmm7,xmm2
  punpcklbw   xmm2,xmm3
  punpckhbw   xmm7,xmm3
  movdqa      xmm4,xmm0
  movdqa      xmm5,xmm6
  punpcklwd   xmm0,xmm2
  punpckhwd   xmm4,xmm2
  punpcklwd   xmm6,xmm7
  punpckhwd   xmm5,xmm7
  movdqa      xmm1,xmm0
  movdqa      xmm2,xmm4
  punpckldq   xmm0,xmm6
  punpckhdq   xmm1,xmm6
  punpckldq   xmm4,xmm5
  punpckhdq   xmm2,xmm5
  movdqa      xmm5,xmm0
  movdqa      xmm6,xmm1
  punpcklqdq  xmm0,xmm4
  punpckhqdq  xmm5,xmm4
  punpcklqdq  xmm1,xmm2
  punpckhqdq  xmm6,xmm2
  ; Scatter 4 bytes back to each row: Cb 0-3, Cr 0-3, Cb 4-7, Cr 4-7.
  mov         esi,dword [esp+1Ch]
  mov         ecx,dword [ebp+10h]
  mov         edx,dword [esp+14h]
  mov         edi,dword [esp+8]
  movd        dword [esi],xmm0
  movd        dword [esi+ecx],xmm5
  movd        dword [esi+ecx*2],xmm1
  movd        dword [esi+edx],xmm6
  psrldq      xmm0,4
  psrldq      xmm5,4
  psrldq      xmm1,4
  psrldq      xmm6,4
  mov         esi,dword [esp+18h]
  movd        dword [edi],xmm0
  movd        dword [edi+ecx],xmm5
  movd        dword [edi+ecx*2],xmm1
  movd        dword [edi+edx],xmm6
  psrldq      xmm0,4
  psrldq      xmm5,4
  psrldq      xmm1,4
  psrldq      xmm6,4
  movd        dword [esi],xmm0
  movd        dword [esi+ecx],xmm5
  movd        dword [esi+ecx*2],xmm1
  movd        dword [esi+edx],xmm6
  psrldq      xmm0,4
  psrldq      xmm5,4
  psrldq      xmm1,4
  psrldq      xmm6,4
  mov         edi,dword [esp+0Ch]
  movd        dword [edi],xmm0
  movd        dword [edi+ecx],xmm5
  movd        dword [edi+ecx*2],xmm1
  movd        dword [edi+edx],xmm6
  pop         edi
  pop         esi
  mov         esp,ebp
  pop         ebp
  ret
  
;*******************************************************************************
;    void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
;                                int32_t iAlpha, int32_t iBeta, int8_t * pTC);
;
;  Filters the edge between column (pPix - 1) and column pPix over 8 rows of
;  both chroma planes with a clipped correction.  For every row the four bytes
;  at pPix - 2 are read as p1, p0, q0, q1.  With tc = pTC[row / 2] (each of the
;  four pTC entries covers two adjacent rows, the same four entries are used
;  for Cb and Cr), every row is processed as:
;
;      delta = clip(-tc, tc, (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)
;      if (|p0 - q0| < iAlpha && |p1 - p0| < iBeta && |q1 - q0| < iBeta && tc > 0) {
;          p0' = sat_u8(p0 + delta)
;          q0' = sat_u8(q0 - delta)
;      }
;
;  The 16 four-byte rows (8 Cb, 8 Cr) are transposed into four 16-byte columns
;  (p1, p0, q0, q1; Cb rows 0-7 in the low half, Cr rows 0-7 in the high half),
;  filtered, transposed back and written out as four bytes per row (p1 and q1
;  are rewritten with their original values).  Only the low 16 bits of iAlpha
;  and iBeta take part in the (signed 16-bit) comparisons.
;
;  Calling convention: arguments are read from the stack at [ebp+8]..[ebp+1Ch].
;  esi, edi and ebp are saved/restored; eax, ecx, edx, xmm0-xmm7 and the flags
;  are clobbered.
;
;  Instruction set: only SSE2 instructions are used.  The absolute value that
;  used to be computed with pabsw (an SSSE3 instruction, which faults on CPUs
;  that only have SSE2) is computed with DEBLOCK_LT4H_ABSW below.
;
;  Stack frame (offsets from esp after both pushes, esp is 16-byte aligned):
;      [esp+08h] pPixCr - 2               [esp+0Ch] 3*iStride
;      [esp+10h] pPixCr - 2 + 4*iStride   [esp+14h] pPixCb - 2
;      [esp+18h] pPixCb - 2 + 4*iStride
;      [esp+1Ch] address of the column buffer (esp+70h)
;      [esp+20h] clipped delta (low)      [esp+30h] constant 4
;      [esp+40h] p1 low, later masked delta (low)
;      [esp+50h] beta                     [esp+60h] zero, later the tc > 0 mask
;      [esp+70h] p1 bytes [esp+80h] p0 bytes [esp+90h] q0 bytes [esp+0A0h] q1 bytes
;      [esp+0B0h] p1 high [esp+0C0h] q0 high [esp+0D0h] -tc
;      [esp+0E0h] q1 high [esp+0F0h] p0 high
;      [esp+100h] scratch for DEBLOCK_LT4H_ABSW
;*******************************************************************************

; %1 = |%1| for eight signed words, SSE2 only.  Computes max(x, 0 - x), which
; matches pabsw for every input (including 8000h).  Uses the otherwise unused
; slot [esp+100h], so it may only be used after the two pushes.
%macro DEBLOCK_LT4H_ABSW 1
  movdqa      [esp+100h],%1
  pxor        %1,%1
  psubw       %1,[esp+100h]
  pmaxsw      %1,[esp+100h]
%endmacro

WELS_EXTERN  DeblockChromaLt4H_sse2

ALIGN  16

DeblockChromaLt4H_sse2:
  push        ebp
  mov         ebp,esp
  and         esp,0FFFFFFF0h
  sub         esp,108h
  mov         ecx,dword [ebp+8]       ; pPixCb
  mov         edx,dword [ebp+0Ch]     ; pPixCr
  mov         eax,dword [ebp+10h]     ; iStride
  sub         ecx,2                   ; rows are read starting two pixels left of the edge
  sub         edx,2
  push        esi
  lea         esi,[eax+eax*2]         ; 3 * iStride
  mov         dword [esp+10h],ecx     ; becomes [esp+14h] after the next push
  mov         dword [esp+4],edx       ; becomes [esp+8] after the next push
  lea         ecx,[ecx+eax*4]         ; Cb row 4
  lea         edx,[edx+eax*4]         ; Cr row 4
  lea         eax,[esp+6Ch]           ; 16-byte aligned column buffer (= esp+70h after the push)
  push        edi
  mov         dword [esp+0Ch],esi
  mov         dword [esp+18h],ecx
  mov         dword [esp+10h],edx
  mov         dword [esp+1Ch],eax
  ; Gather 4 bytes from each of the 16 rows:
  ; xmmN (N = 0..3) = (Cb row N, Cr row N, Cb row N+4, Cr row N+4).
  mov         esi,dword [esp+14h]
  mov         ecx,dword [ebp+10h]
  mov         edx,dword [esp+0Ch]
  movd        xmm0,dword [esi]
  movd        xmm1,dword [esi+ecx]
  movd        xmm2,dword [esi+ecx*2]
  movd        xmm3,dword [esi+edx]
  mov         esi,dword [esp+8]
  movd        xmm4,dword [esi]
  movd        xmm5,dword [esi+ecx]
  movd        xmm6,dword [esi+ecx*2]
  movd        xmm7,dword [esi+edx]
  punpckldq   xmm0,xmm4
  punpckldq   xmm1,xmm5
  punpckldq   xmm2,xmm6
  punpckldq   xmm3,xmm7
  mov         esi,dword [esp+18h]
  mov         edi,dword [esp+10h]
  movd        xmm4,dword [esi]
  movd        xmm5,dword [edi]
  punpckldq   xmm4,xmm5
  punpcklqdq  xmm0,xmm4
  movd        xmm4,dword [esi+ecx]
  movd        xmm5,dword [edi+ecx]
  punpckldq   xmm4,xmm5
  punpcklqdq  xmm1,xmm4
  movd        xmm4,dword [esi+ecx*2]
  movd        xmm5,dword [edi+ecx*2]
  punpckldq   xmm4,xmm5
  punpcklqdq  xmm2,xmm4
  movd        xmm4,dword [esi+edx]
  movd        xmm5,dword [edi+edx]
  punpckldq   xmm4,xmm5
  punpcklqdq  xmm3,xmm4
  ; Transpose rows -> columns: xmm0 = p1, xmm5 = p0, xmm1 = q0, xmm6 = q1.
  movdqa      xmm6,xmm0
  punpcklbw   xmm0,xmm1
  punpckhbw   xmm6,xmm1
  movdqa      xmm7,xmm2
  punpcklbw   xmm2,xmm3
  punpckhbw   xmm7,xmm3
  movdqa      xmm4,xmm0
  movdqa      xmm5,xmm6
  punpcklwd   xmm0,xmm2
  punpckhwd   xmm4,xmm2
  punpcklwd   xmm6,xmm7
  punpckhwd   xmm5,xmm7
  movdqa      xmm1,xmm0
  movdqa      xmm2,xmm4
  punpckldq   xmm0,xmm6
  punpckhdq   xmm1,xmm6
  punpckldq   xmm4,xmm5
  punpckhdq   xmm2,xmm5
  movdqa      xmm5,xmm0
  movdqa      xmm6,xmm1
  punpcklqdq  xmm0,xmm4
  punpckhqdq  xmm5,xmm4
  punpcklqdq  xmm1,xmm2
  punpckhqdq  xmm6,xmm2
  mov         edi,dword [esp+1Ch]
  movdqa      [edi],xmm0              ; p1 column -> [esp+70h]
  movdqa      [edi+10h],xmm5          ; p0 column -> [esp+80h]
  movdqa      [edi+20h],xmm1          ; q0 column -> [esp+90h]
  movdqa      [edi+30h],xmm6          ; q1 column -> [esp+0A0h]
  ; Gather pTC[0..3] (sign-extended to words) and interleave them into
  ; xmm0 = (tc0,tc0,tc1,tc1,tc2,tc2,tc3,tc3).
  mov         eax,dword [ebp+1Ch]     ; pTC
  movsx       cx,byte [eax+3]
  movsx       dx,byte [eax+2]
  movsx       si,byte [eax+1]
  movsx       ax,byte [eax]
  movzx       edi,cx
  movzx       ecx,cx
  movd        xmm2,ecx
  movzx       ecx,dx
  movzx       edx,dx
  movd        xmm3,ecx
  movd        xmm4,edx
  movzx       ecx,si
  movzx       edx,si
  movd        xmm5,ecx
  pxor        xmm0,xmm0
  movd        xmm6,edx
  movzx       ecx,ax
  movdqa      [esp+60h],xmm0          ; zero vector
  movzx       edx,ax
  movsx       eax,word [ebp+14h]      ; low word of iAlpha
  punpcklwd   xmm6,xmm2
  movd        xmm1,edi
  movd        xmm7,ecx
  movsx       ecx,word [ebp+18h]      ; low word of iBeta
  movd        xmm0,edx
  punpcklwd   xmm7,xmm3
  punpcklwd   xmm5,xmm1
  movdqa      xmm1,[esp+60h]          ; xmm1 = 0
  punpcklwd   xmm7,xmm5
  movdqa      xmm5,[esp+0A0h]         ; q1 bytes
  punpcklwd   xmm0,xmm4
  punpcklwd   xmm0,xmm6
  movdqa      xmm6, [esp+70h]         ; p1 bytes
  punpcklwd   xmm0,xmm7               ; xmm0 = tc per row
  movdqa      xmm7,[esp+80h]          ; p0 bytes
  movdqa      xmm2,xmm1
  psubw       xmm2,xmm0
  movdqa      [esp+0D0h],xmm2         ; -tc
  movd        xmm2,eax
  movdqa      xmm3,xmm2
  punpcklwd   xmm3,xmm2
  pshufd      xmm4,xmm3,0             ; xmm4 = alpha in all 8 lanes
  movd        xmm2,ecx
  movdqa      xmm3,xmm2
  punpcklwd   xmm3,xmm2
  pshufd      xmm2,xmm3,0
  movdqa      xmm3, [esp+90h]         ; q0 bytes
  movdqa      [esp+50h],xmm2          ; beta in all 8 lanes
  ; Widen the columns to 16-bit lanes (low half = Cb rows, high half = Cr rows).
  movdqa      xmm2,xmm6
  punpcklbw   xmm2,xmm1
  punpckhbw   xmm6,xmm1
  movdqa      [esp+40h],xmm2          ; p1 low
  movdqa      [esp+0B0h],xmm6         ; p1 high
  movdqa      xmm6,[esp+90h]
  movdqa      xmm2,xmm7
  punpckhbw   xmm7,xmm1
  punpckhbw   xmm6,xmm1
  punpcklbw   xmm2,xmm1               ; xmm2 = p0 low
  punpcklbw   xmm3,xmm1               ; xmm3 = q0 low
  punpcklbw   xmm5,xmm1               ; xmm5 = q1 low
  movdqa      [esp+0F0h],xmm7         ; p0 high
  movdqa      [esp+0C0h],xmm6         ; q0 high
  movdqa      xmm6, [esp+0A0h]
  punpckhbw   xmm6,xmm1
  movdqa      [esp+0E0h],xmm6         ; q1 high
  mov         edx,4
  movsx       eax,dx
  movd        xmm6,eax
  movdqa      xmm7,xmm6
  punpcklwd   xmm7,xmm6
  pshufd      xmm6,xmm7,0
  movdqa      [esp+30h],xmm6          ; constant 4 in all 8 lanes
  movdqa      xmm7, [esp+40h]
  psubw       xmm7,xmm5               ; p1 - q1 (low)
  movdqa      xmm6,xmm0
  pcmpgtw     xmm6,xmm1               ; tc > 0
  movdqa      [esp+60h],xmm6          ; tc > 0 mask
  movdqa      xmm1, [esp+0D0h]        ; xmm1 = -tc
  ; Low half: delta = min(tc, max(-tc, (((q0-p0)<<2) + (p1-q1) + 4) >> 3)).
  movdqa      xmm6,xmm3
  psubw       xmm6,xmm2
  psllw       xmm6,2
  paddw       xmm6,xmm7
  paddw       xmm6,[esp+30h]
  psraw       xmm6,3
  pmaxsw      xmm1,xmm6
  movdqa      xmm7,[esp+50h]
  movdqa      [esp+20h],xmm0
  movdqa      xmm6, [esp+20h]
  pminsw      xmm6,xmm1
  movdqa      [esp+20h],xmm6          ; clipped delta (low)
  ; Low-half filter mask -> xmm6.
  movdqa      xmm6,xmm4
  movdqa      xmm1,xmm2
  psubw       xmm1,xmm3
  DEBLOCK_LT4H_ABSW xmm1              ; |p0 - q0|
  pcmpgtw     xmm6,xmm1
  movdqa      xmm1, [esp+40h]
  psubw       xmm1,xmm2
  DEBLOCK_LT4H_ABSW xmm1              ; |p1 - p0|
  pcmpgtw     xmm7,xmm1
  movdqa      xmm1, [esp+50h]
  pand        xmm6,xmm7
  movdqa      xmm7, [esp+50h]
  psubw       xmm5,xmm3
  DEBLOCK_LT4H_ABSW xmm5              ; |q1 - q0|
  pcmpgtw     xmm1,xmm5
  movdqa      xmm5, [esp+0B0h]
  psubw       xmm5,[esp+0E0h]         ; p1 - q1 (high)
  pand        xmm6,xmm1
  pand        xmm6, [esp+60h]
  movdqa      xmm1, [esp+20h]
  pand        xmm1,xmm6
  movdqa      xmm6, [esp+0C0h]
  movdqa      [esp+40h],xmm1          ; masked delta (low)
  ; High half: same clipped delta, left in xmm0.
  movdqa      xmm1, [esp+0F0h]        ; xmm1 = p0 high
  psubw       xmm6,xmm1
  psllw       xmm6,2
  paddw       xmm6,xmm5
  paddw       xmm6, [esp+30h]
  movdqa      xmm5, [esp+0D0h]
  psraw       xmm6,3
  pmaxsw      xmm5,xmm6
  pminsw      xmm0,xmm5
  ; High-half filter mask -> xmm4.
  movdqa      xmm5,[esp+0C0h]         ; xmm5 = q0 high
  movdqa      xmm6,xmm1
  psubw       xmm6,xmm5
  DEBLOCK_LT4H_ABSW xmm6              ; |p0 - q0|
  pcmpgtw     xmm4,xmm6
  movdqa      xmm6,[esp+0B0h]
  psubw       xmm6,xmm1
  DEBLOCK_LT4H_ABSW xmm6              ; |p1 - p0|
  pcmpgtw     xmm7,xmm6
  movdqa      xmm6, [esp+0E0h]
  pand        xmm4,xmm7
  movdqa      xmm7, [esp+50h]
  psubw       xmm6,xmm5
  DEBLOCK_LT4H_ABSW xmm6              ; |q1 - q0|
  pcmpgtw     xmm7,xmm6
  pand        xmm4,xmm7
  pand        xmm4,[esp+60h]
  pand        xmm0,xmm4               ; masked delta (high)
  ; Apply the correction, saturate to bytes and put the new columns in the buffer.
  movdqa      xmm4, [esp+40h]
  paddw       xmm2,xmm4
  paddw       xmm1,xmm0
  psubw       xmm3,xmm4
  psubw       xmm5,xmm0
  packuswb    xmm2,xmm1
  packuswb    xmm3,xmm5
  movdqa      [esp+80h],xmm2          ; new p0 column
  movdqa      [esp+90h],xmm3          ; new q0 column
  ; Transpose the four columns back into rows:
  ; xmm0 = rows 0/4, xmm5 = rows 1/5, xmm1 = rows 2/6, xmm6 = rows 3/7,
  ; each as (Cb row, Cr row, Cb row+4, Cr row+4).
  mov         esi,dword [esp+1Ch]
  movdqa      xmm0, [esi]
  movdqa      xmm1, [esi+10h]
  movdqa      xmm2, [esi+20h]
  movdqa      xmm3, [esi+30h]
  movdqa      xmm6,xmm0
  punpcklbw   xmm0,xmm1
  punpckhbw   xmm6,xmm1
  movdqa      xmm7,xmm2
  punpcklbw   xmm2,xmm3
  punpckhbw   xmm7,xmm3
  movdqa      xmm4,xmm0
  movdqa      xmm5,xmm6
  punpcklwd   xmm0,xmm2
  punpckhwd   xmm4,xmm2
  punpcklwd   xmm6,xmm7
  punpckhwd   xmm5,xmm7
  movdqa      xmm1,xmm0
  movdqa      xmm2,xmm4
  punpckldq   xmm0,xmm6
  punpckhdq   xmm1,xmm6
  punpckldq   xmm4,xmm5
  punpckhdq   xmm2,xmm5
  movdqa      xmm5,xmm0
  movdqa      xmm6,xmm1
  punpcklqdq  xmm0,xmm4
  punpckhqdq  xmm5,xmm4
  punpcklqdq  xmm1,xmm2
  punpckhqdq  xmm6,xmm2
  ; Scatter 4 bytes back to each row: Cb 0-3, Cr 0-3, Cb 4-7, Cr 4-7.
  mov         esi,dword [esp+14h]
  mov         ecx,dword [ebp+10h]
  mov         edx,dword [esp+0Ch]
  mov         edi,dword [esp+8]
  movd        dword [esi],xmm0
  movd        dword [esi+ecx],xmm5
  movd        dword [esi+ecx*2],xmm1
  movd        dword [esi+edx],xmm6
  psrldq      xmm0,4
  psrldq      xmm5,4
  psrldq      xmm1,4
  psrldq      xmm6,4
  mov         esi,dword [esp+18h]
  movd        dword [edi],xmm0
  movd        dword [edi+ecx],xmm5
  movd        dword [edi+ecx*2],xmm1
  movd        dword [edi+edx],xmm6
  psrldq      xmm0,4
  psrldq      xmm5,4
  psrldq      xmm1,4
  psrldq      xmm6,4
  movd        dword [esi],xmm0
  movd        dword [esi+ecx],xmm5
  movd        dword [esi+ecx*2],xmm1
  movd        dword [esi+edx],xmm6
  psrldq      xmm0,4
  psrldq      xmm5,4
  psrldq      xmm1,4
  psrldq      xmm6,4
  mov         edi,dword [esp+10h]
  movd        dword [edi],xmm0
  movd        dword [edi+ecx],xmm5
  movd        dword [edi+ecx*2],xmm1
  movd        dword [edi+edx],xmm6
  pop         edi
  pop         esi
  mov         esp,ebp
  pop         ebp
  ret
  
  
  
;*******************************************************************************
;    void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha, 
;                                 int32_t iBeta, int8_t * pTC)
;*******************************************************************************
  

WELS_EXTERN  DeblockLumaLt4V_sse2
  
ALIGN  16

DeblockLumaLt4V_sse2:
    push	ebp
	mov	ebp, esp
	and	esp, -16				; fffffff0H
	sub	esp, 420				; 000001a4H
	mov	eax, dword [ebp+8]
	mov	ecx, dword [ebp+12]

	pxor	xmm0, xmm0
	push	ebx
	mov	edx, dword [ebp+24]
	movdqa	[esp+424-384], xmm0
	push	esi

	lea	esi, [ecx+ecx*2]
	push	edi
	mov	edi, eax
	sub	edi, esi
	movdqa	xmm0, [edi]

	lea	esi, [ecx+ecx]
	movdqa	[esp+432-208], xmm0
	mov	edi, eax
	sub	edi, esi
	movdqa	xmm0, [edi]
	movdqa	[esp+448-208], xmm0

	mov	ebx, eax
	sub	ebx, ecx
	movdqa	xmm0, [ebx]
	movdqa	[esp+464-208], xmm0

	movdqa	xmm0, [eax]

	add	ecx, eax
	movdqa	[esp+480-208], xmm0
	movdqa	xmm0, [ecx]
	mov	dword [esp+432-404], ecx

	movsx	ecx, word [ebp+16]
	movdqa	[esp+496-208], xmm0
	movdqa	xmm0, [esi+eax]

	movsx	si, byte [edx]
	movdqa	[esp+512-208], xmm0
	movd	xmm0, ecx
	movsx	ecx, word [ebp+20]
	movdqa	xmm1, xmm0
	punpcklwd xmm1, xmm0
	pshufd	xmm0, xmm1, 0
	movdqa	[esp+432-112], xmm0
	movd	xmm0, ecx
	movsx	cx, byte [edx+1]
	movdqa	xmm1, xmm0
	punpcklwd xmm1, xmm0
	mov	dword [esp+432-408], ebx
	movzx	ebx, cx
	pshufd	xmm0, xmm1, 0
	movd	xmm1, ebx
	movzx	ebx, cx
	movd	xmm2, ebx
	movzx	ebx, cx
	movzx	ecx, cx
	movd	xmm4, ecx
	movzx	ecx, si
	movd	xmm5, ecx
	movzx	ecx, si
	movd	xmm6, ecx
	movzx	ecx, si
	movd	xmm7, ecx
	movzx	ecx, si
	movdqa	[esp+432-336], xmm0
	movd	xmm0, ecx

	movsx	cx, byte [edx+3]
	movsx	dx, byte [edx+2]
	movd	xmm3, ebx
	punpcklwd xmm0, xmm4
	movzx	esi, cx
	punpcklwd xmm6, xmm2
	punpcklwd xmm5, xmm1
	punpcklwd xmm0, xmm6
	punpcklwd xmm7, xmm3
	punpcklwd xmm7, xmm5
	punpcklwd xmm0, xmm7
	movdqa	[esp+432-400], xmm0
	movd	xmm0, esi
	movzx	esi, cx
	movd	xmm2, esi
	movzx	esi, cx
	movzx	ecx, cx
	movd	xmm4, ecx
	movzx	ecx, dx
	movd	xmm3, esi
	movd	xmm5, ecx
	punpcklwd xmm5, xmm0

	movdqa	xmm0, [esp+432-384]
	movzx	ecx, dx
	movd	xmm6, ecx
	movzx	ecx, dx
	movzx	edx, dx
	punpcklwd xmm6, xmm2
	movd	xmm7, ecx
	movd	xmm1, edx

	movdqa	xmm2, [esp+448-208]
	punpcklbw xmm2, xmm0

	mov	ecx, 4
	movsx	edx, cx
	punpcklwd xmm7, xmm3
	punpcklwd xmm7, xmm5
	movdqa	xmm5, [esp+496-208]
	movdqa	xmm3, [esp+464-208]
	punpcklbw xmm5, xmm0
	movdqa	[esp+432-240], xmm5
	movdqa	xmm5, [esp+512-208]
	punpcklbw xmm5, xmm0
	movdqa	[esp+432-352], xmm5
	punpcklwd xmm1, xmm4
	movdqa	xmm4, [esp+432-208]
	punpcklwd xmm1, xmm6
	movdqa	xmm6, [esp+480-208]
	punpcklwd xmm1, xmm7
	punpcklbw xmm6, xmm0
	punpcklbw xmm3, xmm0
	punpcklbw xmm4, xmm0
	movdqa	xmm7, xmm3
	psubw	xmm7, xmm4
	pabsw	xmm7, xmm7
	movdqa	[esp+432-272], xmm4
	movdqa	xmm4, [esp+432-336]
	movdqa	xmm5, xmm4
	pcmpgtw	xmm5, xmm7
	movdqa	[esp+432-288], xmm5
	movdqa	xmm7, xmm6
	psubw	xmm7, [esp+432-352]
	pabsw	xmm7, xmm7
	movdqa	xmm5, xmm4
	pcmpgtw	xmm5, xmm7
	movdqa	[esp+432-256], xmm5
	movdqa	xmm5, xmm3
	pavgw	xmm5, xmm6
	movdqa	[esp+432-304], xmm5
	movdqa	xmm5, [esp+432-400]
	psubw	xmm5, [esp+432-288]
	psubw	xmm5, [esp+432-256]
	movdqa	[esp+432-224], xmm5
	movdqa	xmm5, xmm6
	psubw	xmm5, xmm3
	movdqa	[esp+432-32], xmm6
	psubw	xmm6, [esp+432-240]
	movdqa	xmm7, xmm5
	movdqa	[esp+432-384], xmm5
	movdqa	xmm5, [esp+432-112]
	pabsw	xmm7, xmm7
	pcmpgtw	xmm5, xmm7
	pabsw	xmm6, xmm6
	movdqa	xmm7, xmm4
	pcmpgtw	xmm7, xmm6

	pand	xmm5, xmm7
	movdqa	xmm6, xmm3
	psubw	xmm6, xmm2
	pabsw	xmm6, xmm6
	movdqa	xmm7, xmm4
	pcmpgtw	xmm7, xmm6
	movdqa	xmm6, [esp+432-400]
	pand	xmm5, xmm7
	movdqa	xmm7, xmm6
	pcmpeqw	xmm6, xmm0
	pcmpgtw	xmm7, xmm0
	por	xmm7, xmm6
	pand	xmm5, xmm7
	movdqa	[esp+432-320], xmm5
	movd	xmm5, edx
	movdqa	xmm6, xmm5
	punpcklwd xmm6, xmm5
	pshufd	xmm5, xmm6, 0
	movdqa	[esp+432-336], xmm5
	movdqa	xmm5, [esp+432-224]
	movdqa	[esp+432-368], xmm5
	movdqa	xmm6, xmm0
	psubw	xmm6, xmm5
	movdqa	xmm5, [esp+432-384]
	psllw	xmm5, 2
	movdqa	xmm7, xmm2
	psubw	xmm7, [esp+432-240]
	paddw	xmm7, xmm5
	paddw	xmm7, [esp+432-336]
	movdqa	xmm5, [esp+432-368]
	psraw	xmm7, 3
	pmaxsw	xmm6, xmm7
	pminsw	xmm5, xmm6

	pand	xmm5, [esp+432-320]
	movdqa	xmm6, [esp+432-400]
	movdqa	[esp+432-64], xmm5
	movdqa	[esp+432-384], xmm6
	movdqa	xmm5, xmm0
	psubw	xmm5, xmm6
	movdqa	[esp+432-368], xmm5
	movdqa	xmm6, xmm5
	movdqa	xmm5, [esp+432-272]
	paddw	xmm5, [esp+432-304]
	movdqa	xmm7, xmm2
	paddw	xmm7, xmm2
	psubw	xmm5, xmm7
	psraw	xmm5, 1
	pmaxsw	xmm6, xmm5
	movdqa	xmm5, [esp+432-384]
	pminsw	xmm5, xmm6

	pand	xmm5, [esp+432-320]
	pand	xmm5, [esp+432-288]
	movdqa	xmm6, [esp+432-240]
	movdqa	[esp+432-96], xmm5
	movdqa	xmm5, [esp+432-352]
	paddw	xmm5, [esp+432-304]
	movdqa	xmm7, xmm6
	paddw	xmm7, xmm6
	movdqa	xmm6, [esp+432-368]
	psubw	xmm5, xmm7

	movdqa	xmm7, [esp+496-208]
	psraw	xmm5, 1
	pmaxsw	xmm6, xmm5
	movdqa	xmm5, [esp+432-400]
	pminsw	xmm5, xmm6
	pand	xmm5, [esp+432-320]
	pand	xmm5, [esp+432-256]
	movdqa	xmm6, [esp+448-208]
	punpckhbw xmm7, xmm0
	movdqa	[esp+432-352], xmm7

	movdqa	xmm7, [esp+512-208]
	punpckhbw xmm6, xmm0
	movdqa	[esp+432-48], xmm5
	movdqa	xmm5, [esp+432-208]
	movdqa	[esp+432-368], xmm6
	movdqa	xmm6, [esp+464-208]
	punpckhbw xmm7, xmm0
	punpckhbw xmm5, xmm0
	movdqa	[esp+432-384], xmm7
	punpckhbw xmm6, xmm0
	movdqa	[esp+432-400], xmm6

	movdqa	xmm7, [esp+432-400]
	movdqa	xmm6, [esp+480-208]
	psubw	xmm7, xmm5
	movdqa	[esp+432-16], xmm5
	pabsw	xmm7, xmm7
	punpckhbw xmm6, xmm0
	movdqa	xmm5, xmm4
	pcmpgtw	xmm5, xmm7
	movdqa	[esp+432-288], xmm5

	movdqa	xmm7, xmm6
	psubw	xmm7, [esp+432-384]
	pabsw	xmm7, xmm7
	movdqa	xmm5, xmm4
	pcmpgtw	xmm5, xmm7
	movdqa	[esp+432-256], xmm5

	movdqa	xmm5, [esp+432-400]
	movdqa	[esp+432-80], xmm6
	pavgw	xmm5, xmm6
	movdqa	[esp+432-304], xmm5

	movdqa	xmm5, xmm1
	psubw	xmm5, [esp+432-288]
	psubw	xmm5, [esp+432-256]
	movdqa	[esp+432-224], xmm5
	movdqa	xmm5, xmm6
	psubw	xmm5, [esp+432-400]
	psubw	xmm6, [esp+432-352]
	movdqa	[esp+432-272], xmm5
	movdqa	xmm7, xmm5
	movdqa	xmm5, [esp+432-112]
	pabsw	xmm7, xmm7
	pcmpgtw	xmm5, xmm7
	movdqa	xmm7, xmm4
	pabsw	xmm6, xmm6
	pcmpgtw	xmm7, xmm6
	movdqa	xmm6, [esp+432-368]

	pand	xmm5, xmm7
	movdqa	xmm7, [esp+432-400]
	psubw	xmm7, xmm6
	psubw	xmm6, [esp+432-352]
	pabsw	xmm7, xmm7
	pcmpgtw	xmm4, xmm7
	pand	xmm5, xmm4

	paddw	xmm2, [esp+432-96]
	movdqa	xmm4, xmm1
	pcmpgtw	xmm4, xmm0
	movdqa	xmm7, xmm1
	pcmpeqw	xmm7, xmm0
	por	xmm4, xmm7
	pand	xmm5, xmm4
	movdqa	xmm4, [esp+432-224]
	movdqa	[esp+432-320], xmm5
	movdqa	xmm5, [esp+432-272]
	movdqa	xmm7, xmm0
	psubw	xmm7, xmm4
	psubw	xmm0, xmm1
	psllw	xmm5, 2
	paddw	xmm6, xmm5
	paddw	xmm6, [esp+432-336]
	movdqa	xmm5, [esp+432-368]
	movdqa	[esp+432-336], xmm0
	psraw	xmm6, 3
	pmaxsw	xmm7, xmm6
	pminsw	xmm4, xmm7
	pand	xmm4, [esp+432-320]
	movdqa	xmm6, xmm0
	movdqa	xmm0, [esp+432-16]
	paddw	xmm0, [esp+432-304]
	movdqa	[esp+432-272], xmm4
	movdqa	xmm4, [esp+432-368]
	paddw	xmm4, xmm4
	psubw	xmm0, xmm4

	movdqa	xmm4, [esp+432-64]
	psraw	xmm0, 1
	pmaxsw	xmm6, xmm0
	movdqa	xmm0, [esp+432-400]
	movdqa	xmm7, xmm1
	pminsw	xmm7, xmm6
	movdqa	xmm6, [esp+432-320]
	pand	xmm7, xmm6
	pand	xmm7, [esp+432-288]
	paddw	xmm5, xmm7
	packuswb xmm2, xmm5
	movdqa	xmm5, [esp+432-272]
	paddw	xmm0, xmm5
	paddw	xmm3, xmm4
	packuswb xmm3, xmm0

	movdqa	xmm0, [esp+432-32]
	psubw	xmm0, xmm4
	movdqa	xmm4, [esp+432-80]
	psubw	xmm4, xmm5

	movdqa	xmm5, [esp+432-240]
	paddw	xmm5, [esp+432-48]
	packuswb xmm0, xmm4
	movdqa	xmm4, [esp+432-384]
	paddw	xmm4, [esp+432-304]
	movdqa	[esp+480-208], xmm0
	movdqa	xmm0, [esp+432-352]
	movdqa	xmm7, xmm0
	paddw	xmm0, xmm0

	mov	ecx, dword [esp+432-408]

	mov	edx, dword [esp+432-404]
	psubw	xmm4, xmm0
	movdqa	xmm0, [esp+432-336]
	movdqa	[edi], xmm2
	psraw	xmm4, 1
	pmaxsw	xmm0, xmm4
	pminsw	xmm1, xmm0
	movdqa	xmm0, [esp+480-208]

	pop	edi
	pand	xmm1, xmm6
	pand	xmm1, [esp+428-256]
	movdqa	[ecx], xmm3
	paddw	xmm7, xmm1
	pop	esi
	packuswb xmm5, xmm7
	movdqa	[eax], xmm0
	movdqa	[edx], xmm5
	pop	ebx
	mov	esp, ebp
	pop	ebp
	ret


;*******************************************************************************
;    void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha, 
;                                 int32_t iBeta)
;
;    Filters 16 horizontally adjacent pixels across the edge that lies between
;    the row at pPix-iStride and the row at pPix.  Eight rows are read
;    (pPix-4*iStride .. pPix+3*iStride) and six rows are written back
;    (pPix-3*iStride .. pPix+2*iStride).
;    NOTE(review): the arithmetic matches the H.264 luma "bS == 4" (strong)
;    edge filter - confirm against the C reference implementation.
;
;    Row names used in the comments below:
;        p3 = pPix-4*iStride   p2 = pPix-3*iStride   p1 = pPix-2*iStride
;        p0 = pPix-1*iStride   q0 = pPix             q1 = pPix+1*iStride
;        q2 = pPix+2*iStride   q3 = pPix+3*iStride
;    Every row is widened to 16-bit words and processed in two halves:
;    "lo" = pixels 0..7, "hi" = pixels 8..15.
;
;    Per-pixel masks (signed 16-bit compares; only the low 16 bits of iAlpha
;    and iBeta are used):
;        EN = (alpha > |q0-p0|) & (beta > |p0-p1|) & (beta > |q0-q1|)
;        SP = ((alpha>>2)+2 > |q0-p0|) & (beta > |p0-p2|)
;        SQ = ((alpha>>2)+2 > |q0-p0|) & (beta > |q0-q2|)
;
;    Results (a pixel is left unchanged where EN is false):
;        p2' = SP ? (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3        : p2
;        p1' = SP ? (p2 + p1 + p0 + q0 + 2) >> 2                 : p1
;        p0' = SP ? (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3      : (2*p1 + p0 + q1 + 2) >> 2
;        q0' = SQ ? (q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4) >> 3      : (2*q1 + q0 + p1 + 2) >> 2
;        q1' = SQ ? (q2 + q1 + q0 + p0 + 2) >> 2                 : q1
;        q2' = SQ ? (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3        : q2
;
;    Requirements visible in the code:
;      * every row is accessed with movdqa, so each row address must be
;        16-byte aligned (i.e. pPix and iStride multiples of 16);
;      * pabsw is an SSSE3 instruction although the routine carries an _sse2
;        suffix.  NOTE(review): confirm the caller only dispatches here when
;        SSSE3 is available.
;
;    Arguments are read from the stack ([ebp+8] .. [ebp+20]).  ebx, esi, edi
;    and ebp are saved and restored; eax, ecx, edx and xmm0-xmm7 are
;    overwritten.
;
;    Stack frame (offsets relative to esp after the three pushes):
;      raw byte rows    : [656-272]=p2 [672-272]=p1 [688-272]=p0 [704-272]=q0
;                         [720-272]=q1 [736-272]=q2 [752-272]=q3
;                         (the p2..q0 slots are later reused for the packed
;                          results p2'..q0')
;      constants (x8)   : [640-320]=alpha [640-512]=beta [640-576]=(alpha>>2)+2
;                         [640-624]=2 [640-592]=4 [640-48]=0
;      lo-half words    : [640-368]=p2 [640-480]=p1 [640-144]=p0
;                         [640-400]=q0 [640-384]=q1 [640-416]=q2
;      hi-half words    : [640-448]=p1 [640-496]=p0 [640-432]=q0
;                         [640-464]=q1 [640-528]=q2
;      masks            : [640-544]=SP [640-560]=SQ (both rebuilt for the hi half)
;      scalars          : [640-600]=2*iStride [640-596]=pPix+iStride
;*******************************************************************************

WELS_EXTERN  DeblockLumaEq4V_sse2
  
ALIGN  16

DeblockLumaEq4V_sse2:

	; Prologue: align the frame to 16 bytes.  628 bytes of locals plus the
	; three 4-byte pushes (ebx, esi, edi) give 640, so esp is 16-byte aligned
	; again once edi has been pushed; every [esp+...] access comes after that.
	push	ebp
	mov	ebp, esp
	and	esp, -16				; fffffff0H
	sub	esp, 628				; 00000274H
	mov	eax, dword [ebp+8]		; eax = pPix (address of row q0)
	mov	ecx, dword [ebp+12]		; ecx = iStride
	push	ebx
	push	esi

	lea	edx, [ecx*4]			; edx = 4*iStride
	pxor	xmm0, xmm0
	movdqa	xmm2, xmm0			; xmm2 = 0, used to widen bytes to words

	; Load the eight rows and compute the six output row addresses.
	movdqa	xmm0, [ecx+eax]			; q1
	mov	esi, eax
	sub	esi, edx
	movdqa	xmm3, [esi]			; p3 (stays in xmm3 until it is widened)
	movdqa	xmm5, [eax]			; q0
	push	edi
	lea	edi, [ecx+ecx]			; edi = 2*iStride
	lea	ebx, [ecx+ecx*2]		; ebx = 3*iStride
	mov	dword [esp+640-600], edi
	mov	esi, eax
	sub	esi, edi			; esi = address of p1 (kept for the final store)
	movdqa	xmm1, [esi]			; p1
	movdqa	 [esp+720-272], xmm0
	mov	edi, eax
	sub	edi, ecx			; edi = address of p0 (kept for the final store)
	movdqa	xmm4, [edi]			; p0
	add	ecx, eax
	mov	dword [esp+640-596], ecx	; address of q1, reloaded before the final store

	mov	ecx, dword [esp+640-600]
	movdqa	xmm0, [ecx+eax]			; q2
	movdqa	 [esp+736-272], xmm0

	movdqa	xmm0, [eax+ebx]			; q3
	mov	edx, eax
	sub	edx, ebx			; edx = address of p2 (kept for the final store)

	; Broadcast the low 16 bits of iAlpha and iBeta to all eight words.
	movsx	ebx, word [ebp+16]		; iAlpha
	movdqa	xmm6, [edx]			; p2
	add	ecx, eax			; ecx = address of q2 (kept for the final store)
	movdqa	 [esp+752-272], xmm0
	movd	xmm0, ebx

	movsx	ebx, word [ebp+20]		; iBeta
	movdqa	xmm7, xmm0
	punpcklwd xmm7, xmm0
	pshufd	xmm0, xmm7, 0
	movdqa	 [esp+640-320], xmm0		; alpha x8
	movd	xmm0, ebx
	movdqa	xmm7, xmm0
	punpcklwd xmm7, xmm0
	pshufd	xmm0, xmm7, 0

	; ---------------- lo half (pixels 0..7) ----------------
	; Widen the rows to words and build the EN mask in xmm0.
	movdqa	xmm7, [esp+736-272]
	punpcklbw xmm7, xmm2
	movdqa	 [esp+640-416], xmm7		; q2 lo
	movdqa	 [esp+640-512], xmm0		; beta x8
	movdqa	xmm0, xmm1
	movdqa	 [esp+672-272], xmm1
	movdqa	xmm1, xmm4
	movdqa	 [esp+704-272], xmm5
	punpcklbw xmm5, xmm2			; xmm5 = q0 lo
	punpcklbw xmm1, xmm2			; xmm1 = p0 lo

	movdqa	xmm7, xmm5
	psubw	xmm7, xmm1
	pabsw	xmm7, xmm7
	movdqa	 [esp+640-560], xmm7		; |q0-p0| (slot is reused for SQ below)
	punpcklbw xmm0, xmm2			; xmm0 = p1 lo
	movdqa	 [esp+688-272], xmm4
	movdqa	xmm4, [esp+720-272]
	movdqa	 [esp+640-480], xmm0

	movdqa	xmm7, xmm1
	psubw	xmm7, xmm0

	movdqa	xmm0, [esp+640-512]
	pabsw	xmm7, xmm7
	punpcklbw xmm4, xmm2			; xmm4 = q1 lo
	pcmpgtw	xmm0, xmm7			; beta > |p0-p1|
	movdqa	 [esp+640-384], xmm4
	movdqa	xmm7, xmm5
	psubw	xmm7, xmm4
	movdqa	xmm4, [esp+640-512]
	movdqa	 [esp+656-272], xmm6
	punpcklbw xmm6, xmm2			; xmm6 = p2 lo
	pabsw	xmm7, xmm7
	movdqa	 [esp+640-48], xmm2		; keep a zero vector; xmm2 is reused later
	movdqa	 [esp+640-368], xmm6
	movdqa	 [esp+640-144], xmm1
	movdqa	 [esp+640-400], xmm5
	pcmpgtw	xmm4, xmm7			; beta > |q0-q1|
	pand	xmm0, xmm4
	movdqa	xmm4, [esp+640-320]
	pcmpgtw	xmm4, [esp+640-560]		; alpha > |q0-p0|
	pand	xmm0, xmm4			; xmm0 = EN (lo), live until the lo results are merged

	; Build (alpha>>2)+2 and from it the SP / SQ masks.
	mov	ebx, 2
	movsx	ebx, bx
	movd	xmm4, ebx
	movdqa	xmm7, xmm4
	punpcklwd xmm7, xmm4
	movdqa	xmm4, [esp+640-320]
	psraw	xmm4, 2
	pshufd	xmm7, xmm7, 0			; xmm7 = 2 x8
	paddw	xmm4, xmm7
	movdqa	 [esp+640-576], xmm4		; (alpha>>2)+2, reused for the hi half
	pcmpgtw	xmm4, [esp+640-560]
	movdqa	 [esp+640-560], xmm4		; (alpha>>2)+2 > |q0-p0|

	movdqa	xmm4, [esp+640-512]
	movdqa	 [esp+640-624], xmm7		; rounding constant 2
	movdqa	xmm7, xmm1
	psubw	xmm7, xmm6
	pabsw	xmm7, xmm7
	pcmpgtw	xmm4, xmm7			; beta > |p0-p2|

	pand	xmm4, [esp+640-560]
	movdqa	 [esp+640-544], xmm4		; SP (lo)
	movdqa	xmm4, [esp+640-512]
	movdqa	xmm7, xmm5
	psubw	xmm7, [esp+640-416]
	pabsw	xmm7, xmm7
	pcmpgtw	xmm4, xmm7			; beta > |q0-q2|

	pand	xmm4, [esp+640-560]
	movdqa	 [esp+640-560], xmm4		; SQ (lo)

	; p2' : ~SP & p2 is parked in [640-16]; the strong value is built in xmm4.
	movdqa	xmm4, [esp+640-544]
	pandn	xmm4, xmm6
	movdqa	 [esp+640-16], xmm4
	mov	ebx, 4
	movsx	ebx, bx
	movd	xmm4, ebx
	movdqa	xmm7, xmm4
	punpcklwd xmm7, xmm4
	movdqa	xmm4, xmm3
	punpcklbw xmm4, xmm2			; p3 lo
	psllw	xmm4, 1
	paddw	xmm4, xmm6
	paddw	xmm4, xmm6
	paddw	xmm4, xmm6
	paddw	xmm4, [esp+640-480]		; 2*p3 + 3*p2 + p1

	movdqa	xmm6, [esp+640-560]
	pshufd	xmm7, xmm7, 0			; xmm7 = 4 x8
	paddw	xmm4, xmm1
	movdqa	 [esp+640-592], xmm7		; rounding constant 4
	paddw	xmm4, xmm5
	paddw	xmm4, xmm7			; 2*p3 + 3*p2 + p1 + p0 + q0 + 4 (shifted at "psraw xmm4, 3" below)
	; q2' : ~SQ & q2 -> [640-80], SQ & strong value -> [640-112].
	movdqa	xmm7, [esp+640-416]
	pandn	xmm6, xmm7
	movdqa	 [esp+640-80], xmm6
	movdqa	xmm6, [esp+752-272]
	punpcklbw xmm6, xmm2			; q3 lo
	psllw	xmm6, 1
	paddw	xmm6, xmm7
	paddw	xmm6, xmm7
	paddw	xmm6, xmm7
	paddw	xmm6, [esp+640-384]		; 2*q3 + 3*q2 + q1

	movdqa	xmm7, [esp+640-480]
	paddw	xmm6, xmm5
	paddw	xmm6, xmm1
	paddw	xmm6, [esp+640-592]
	psraw	xmm6, 3
	pand	xmm6, [esp+640-560]
	movdqa	 [esp+640-112], xmm6
	; p1' : ~SP & p1 -> [640-336], SP & (p2+p1+p0+q0+2)>>2 -> [640-64].
	movdqa	xmm6, [esp+640-544]
	pandn	xmm6, xmm7
	movdqa	 [esp+640-336], xmm6
	movdqa	xmm6, [esp+640-544]
	movdqa	 [esp+640-528], xmm6		; [640-528] is a scratch copy of the current mask
	movdqa	xmm6, [esp+640-368]
	paddw	xmm6, xmm7
	movdqa	xmm7, xmm1
	psraw	xmm4, 3
	pand	xmm4, [esp+640-544]		; xmm4 = SP & strong p2' (merged with [640-16] later)
	paddw	xmm7, xmm5
	paddw	xmm6, xmm7
	paddw	xmm6, [esp+640-624]
	movdqa	xmm7, [esp+640-528]

	paddw	xmm5, xmm1			; xmm5 = q0 + p0
	psraw	xmm6, 2
	pand	xmm7, xmm6

	; q1' : ~SQ & q1 -> [640-304], SQ & (q2+q1+q0+p0+2)>>2 -> [640-32].
	movdqa	xmm6, [esp+640-384]
	movdqa	 [esp+640-64], xmm7
	movdqa	xmm7, [esp+640-560]
	pandn	xmm7, xmm6
	movdqa	 [esp+640-304], xmm7
	movdqa	xmm7, [esp+640-560]
	movdqa	 [esp+640-528], xmm7
	movdqa	xmm7, [esp+640-416]
	paddw	xmm7, xmm6
	paddw	xmm7, xmm5
	paddw	xmm7, [esp+640-624]
	movdqa	xmm5, [esp+640-528]
	psraw	xmm7, 2
	pand	xmm5, xmm7
	movdqa	 [esp+640-32], xmm5

	; p0' : ~SP & (2*p1+p0+q1+2)>>2 -> [640-352],
	;       SP & (p2+2*p1+2*p0+2*q0+q1+4)>>3 -> [640-96].
	movdqa	xmm5, [esp+640-544]
	movdqa	 [esp+640-528], xmm5
	movdqa	xmm5, [esp+640-480]
	movdqa	xmm7, xmm5
	paddw	xmm7, xmm5
	movdqa	xmm5, xmm1
	paddw	xmm5, xmm6
	paddw	xmm6, [esp+640-592]		; xmm6 = q1 + 4
	paddw	xmm7, xmm5
	paddw	xmm7, [esp+640-624]
	movdqa	xmm5, [esp+640-528]
	psraw	xmm7, 2
	pandn	xmm5, xmm7
	movdqa	xmm7, [esp+640-480]
	paddw	xmm7, xmm1
	paddw	xmm7, [esp+640-400]		; p1 + p0 + q0
	movdqa	xmm1, [esp+640-544]
	movdqa	 [esp+640-352], xmm5
	movdqa	xmm5, [esp+640-368]
	psllw	xmm7, 1
	paddw	xmm7, xmm6
	paddw	xmm5, xmm7

	movdqa	xmm7, [esp+640-400]
	psraw	xmm5, 3
	pand	xmm1, xmm5
	movdqa	xmm5, [esp+640-480]
	movdqa	 [esp+640-96], xmm1
	; q0' : ~SQ & (2*q1+q0+p1+2)>>2 -> [640-288],
	;       SQ & (q2+2*q1+2*q0+2*p0+p1+4)>>3 -> [640-128].
	movdqa	xmm1, [esp+640-560]
	movdqa	 [esp+640-528], xmm1
	movdqa	xmm1, [esp+640-384]
	movdqa	xmm6, xmm1
	paddw	xmm6, xmm1
	paddw	xmm1, [esp+640-400]
	paddw	xmm1, [esp+640-144]		; q1 + q0 + p0
	paddw	xmm7, xmm5
	paddw	xmm5, [esp+640-592]		; xmm5 = p1 + 4
	paddw	xmm6, xmm7
	paddw	xmm6, [esp+640-624]
	movdqa	xmm7, [esp+640-528]
	psraw	xmm6, 2
	psllw	xmm1, 1
	paddw	xmm1, xmm5

	movdqa	xmm5, [esp+656-272]		; raw p2 bytes, widened to the hi half below
	pandn	xmm7, xmm6
	movdqa	xmm6, [esp+640-416]
	paddw	xmm6, xmm1
	movdqa	xmm1, [esp+640-560]
	psraw	xmm6, 3
	pand	xmm1, xmm6

	; ---------------- hi half (pixels 8..15) ----------------
	; Widen the upper eight bytes of each row.
	movdqa	xmm6, [esp+704-272]
	movdqa	 [esp+640-128], xmm1
	movdqa	xmm1, [esp+672-272]
	punpckhbw xmm1, xmm2
	movdqa	 [esp+640-448], xmm1		; p1 hi
	movdqa	xmm1, [esp+688-272]
	punpckhbw xmm1, xmm2			; xmm1 = p0 hi
	punpckhbw xmm6, xmm2			; xmm6 = q0 hi
	movdqa	 [esp+640-288], xmm7
	punpckhbw xmm5, xmm2			; xmm5 = p2 hi
	movdqa	 [esp+640-496], xmm1
	movdqa	 [esp+640-432], xmm6

	movdqa	xmm7, [esp+720-272]
	punpckhbw xmm7, xmm2
	movdqa	 [esp+640-464], xmm7		; q1 hi

	movdqa	xmm7, [esp+736-272]
	punpckhbw xmm7, xmm2
	movdqa	 [esp+640-528], xmm7		; q2 hi

	; EN (hi) is built in xmm1.
	movdqa	xmm7, xmm6

	psubw	xmm6, [esp+640-464]
	psubw	xmm7, xmm1
	pabsw	xmm7, xmm7
	movdqa	 [esp+640-560], xmm7		; |q0-p0| hi
	por	xmm4, [esp+640-16]		; xmm4 = SP ? strong p2' : p2   (lo)
	pabsw	xmm6, xmm6
	movdqa	xmm7, xmm1
	psubw	xmm7, [esp+640-448]

	movdqa	xmm1, [esp+640-512]
	pabsw	xmm7, xmm7
	pcmpgtw	xmm1, xmm7			; beta > |p0-p1|
	movdqa	xmm7, [esp+640-512]
	pcmpgtw	xmm7, xmm6			; beta > |q0-q1|
	movdqa	xmm6, [esp+640-320]
	pand	xmm1, xmm7
	movdqa	xmm7, [esp+640-560]
	pcmpgtw	xmm6, xmm7			; alpha > |q0-p0|
	pand	xmm1, xmm6			; xmm1 = EN (hi), live until the end

	; SP / SQ for the hi half, interleaved with the start of the p2' sum in xmm3.
	movdqa	xmm6, [esp+640-576]
	pcmpgtw	xmm6, xmm7

	movdqa	xmm7, [esp+640-496]
	punpckhbw xmm3, xmm2			; xmm3 = p3 hi
	movdqa	 [esp+640-560], xmm6		; (alpha>>2)+2 > |q0-p0|
	movdqa	xmm6, [esp+640-512]
	psubw	xmm7, xmm5
	pabsw	xmm7, xmm7
	pcmpgtw	xmm6, xmm7			; beta > |p0-p2|

	pand	xmm6, [esp+640-560]
	movdqa	xmm7, [esp+640-432]
	psubw	xmm7, [esp+640-528]

	psllw	xmm3, 1
	movdqa	 [esp+640-544], xmm6		; SP (hi)
	movdqa	xmm6, [esp+640-512]

	movdqa	xmm2, [esp+640-544]		; xmm2 stops being zero from here on
	paddw	xmm3, xmm5
	paddw	xmm3, xmm5
	paddw	xmm3, xmm5
	paddw	xmm3, [esp+640-448]
	paddw	xmm3, [esp+640-496]		; 2*p3 + 3*p2 + p1 + p0
	pabsw	xmm7, xmm7
	pcmpgtw	xmm6, xmm7			; beta > |q0-q2|
	pand	xmm6, [esp+640-560]
	movdqa	 [esp+640-560], xmm6		; SQ (hi)

	; Final p2': EN ? (SP-selected value) : p2, for lo (xmm6) and hi (xmm7).
	movdqa	xmm6, xmm0
	pand	xmm6, xmm4
	movdqa	xmm4, xmm0
	pandn	xmm4, [esp+640-368]
	por	xmm6, xmm4
	movdqa	xmm4, [esp+640-432]		; xmm4 = q0 hi
	paddw	xmm3, xmm4
	paddw	xmm3, [esp+640-592]
	psraw	xmm3, 3
	pand	xmm3, xmm2
	pandn	xmm2, xmm5
	por	xmm3, xmm2
	movdqa	xmm7, xmm1
	pand	xmm7, xmm3
	movdqa	xmm3, [esp+640-64]
	por	xmm3, [esp+640-336]		; SP-selected p1' (lo)
	movdqa	xmm2, xmm1
	pandn	xmm2, xmm5
	por	xmm7, xmm2

	; Final p1' lo -> [640-336]; packed p2' row -> [656-272].
	movdqa	xmm2, xmm0
	pand	xmm2, xmm3
	movdqa	xmm3, xmm0
	pandn	xmm3, [esp+640-480]
	por	xmm2, xmm3
	packuswb xmm6, xmm7
	movdqa	 [esp+640-336], xmm2
	movdqa	 [esp+656-272], xmm6
	; p1' hi = EN ? (SP ? (p2+p1+p0+q0+2)>>2 : p1) : p1
	movdqa	xmm6, [esp+640-544]
	movdqa	xmm2, xmm5
	paddw	xmm2, [esp+640-448]
	movdqa	xmm3, xmm1
	movdqa	xmm7, [esp+640-496]
	paddw	xmm7, xmm4
	paddw	xmm2, xmm7
	paddw	xmm2, [esp+640-624]
	movdqa	xmm7, [esp+640-544]
	psraw	xmm2, 2
	pand	xmm6, xmm2
	movdqa	xmm2, [esp+640-448]
	pandn	xmm7, xmm2
	por	xmm6, xmm7
	pand	xmm3, xmm6
	movdqa	xmm6, xmm1
	pandn	xmm6, xmm2
	paddw	xmm2, [esp+640-496]
	paddw	xmm2, xmm4			; xmm2 = p1 + p0 + q0 (hi), reused for p0'
	por	xmm3, xmm6
	movdqa	xmm6, [esp+640-336]
	packuswb xmm6, xmm3
	psllw	xmm2, 1
	movdqa	 [esp+672-272], xmm6		; packed p1' row
	; Final p0' lo -> [640-352].
	movdqa	xmm6, [esp+640-96]
	por	xmm6, [esp+640-352]

	movdqa	xmm3, xmm0
	pand	xmm3, xmm6
	movdqa	xmm6, xmm0
	pandn	xmm6, [esp+640-144]
	por	xmm3, xmm6
	movdqa	xmm6, [esp+640-544]
	movdqa	 [esp+640-352], xmm3
	; p0' hi: strong value in xmm5, weak value in xmm4, merged by SP then EN.
	movdqa	xmm3, [esp+640-464]
	paddw	xmm3, [esp+640-592]
	paddw	xmm2, xmm3
	movdqa	xmm3, [esp+640-448]
	paddw	xmm5, xmm2			; p2 + 2*(p1+p0+q0) + q1 + 4
	movdqa	xmm2, [esp+640-496]
	psraw	xmm5, 3
	pand	xmm6, xmm5
	movdqa	xmm5, [esp+640-464]
	paddw	xmm2, xmm5
	paddw	xmm5, [esp+640-432]		; xmm5 = q1 + q0 (hi), reused for q0'
	movdqa	xmm4, xmm3
	paddw	xmm4, xmm3
	paddw	xmm4, xmm2
	paddw	xmm4, [esp+640-624]		; 2*p1 + p0 + q1 + 2
	movdqa	xmm2, [esp+640-544]
	paddw	xmm3, [esp+640-592]		; xmm3 = p1 + 4 (hi), reused for q0'
	psraw	xmm4, 2
	pandn	xmm2, xmm4
	por	xmm6, xmm2
	movdqa	xmm7, xmm1
	pand	xmm7, xmm6
	movdqa	xmm6, [esp+640-496]
	movdqa	xmm2, xmm1
	pandn	xmm2, xmm6
	por	xmm7, xmm2
	movdqa	xmm2, [esp+640-352]
	packuswb xmm2, xmm7
	movdqa	 [esp+688-272], xmm2		; packed p0' row
	; Final q0' lo -> [640-288].
	movdqa	xmm2, [esp+640-128]
	por	xmm2, [esp+640-288]

	movdqa	xmm4, xmm0
	pand	xmm4, xmm2
	paddw	xmm5, xmm6			; q1 + q0 + p0 (hi)
	movdqa	xmm2, xmm0
	pandn	xmm2, [esp+640-400]
	por	xmm4, xmm2
	; q0' hi: strong value masked by SQ in xmm4, weak value by ~SQ in xmm3.
	movdqa	xmm2, [esp+640-528]
	psllw	xmm5, 1
	paddw	xmm5, xmm3
	movdqa	xmm3, [esp+640-560]
	paddw	xmm2, xmm5			; q2 + 2*(q1+q0+p0) + p1 + 4
	psraw	xmm2, 3
	movdqa	 [esp+640-288], xmm4
	movdqa	xmm4, [esp+640-560]
	pand	xmm4, xmm2
	movdqa	xmm2, [esp+640-464]
	movdqa	xmm5, xmm2
	paddw	xmm5, xmm2
	movdqa	xmm2, [esp+640-432]
	paddw	xmm2, [esp+640-448]
	movdqa	xmm7, xmm1
	paddw	xmm5, xmm2
	paddw	xmm5, [esp+640-624]		; 2*q1 + q0 + p1 + 2
	movdqa	xmm6, [esp+640-560]
	psraw	xmm5, 2
	pandn	xmm3, xmm5
	por	xmm4, xmm3
	movdqa	xmm3, [esp+640-32]
	por	xmm3, [esp+640-304]		; SQ-selected q1' (lo)
	pand	xmm7, xmm4
	movdqa	xmm4, [esp+640-432]
	movdqa	xmm5, [esp+640-464]		; xmm5 = q1 hi
	movdqa	xmm2, xmm1
	pandn	xmm2, xmm4
	paddw	xmm4, [esp+640-496]		; xmm4 = q0 + p0 (hi)
	por	xmm7, xmm2
	movdqa	xmm2, [esp+640-288]
	packuswb xmm2, xmm7
	movdqa	 [esp+704-272], xmm2		; packed q0' row

	; Final q1' lo -> [640-304], then q1' hi; packed q1' row ends up in xmm6.
	movdqa	xmm2, xmm0
	pand	xmm2, xmm3
	movdqa	xmm3, xmm0
	pandn	xmm3, [esp+640-384]
	por	xmm2, xmm3
	movdqa	 [esp+640-304], xmm2
	movdqa	xmm2, [esp+640-528]		; xmm2 = q2 hi
	movdqa	xmm3, xmm2
	paddw	xmm3, [esp+640-464]
	paddw	xmm3, xmm4
	paddw	xmm3, [esp+640-624]		; q2 + q1 + q0 + p0 + 2
	psraw	xmm3, 2
	pand	xmm6, xmm3
	movdqa	xmm3, [esp+640-560]		; xmm3 = SQ (hi)
	movdqa	xmm4, xmm3
	pandn	xmm4, xmm5
	por	xmm6, xmm4
	movdqa	xmm7, xmm1
	pand	xmm7, xmm6
	movdqa	xmm6, [esp+640-304]
	movdqa	xmm4, xmm1
	pandn	xmm4, xmm5
	por	xmm7, xmm4

	; Final q2' lo in xmm4, q2' hi in xmm7 (completed between the stores below).
	movdqa	xmm4, xmm0
	pandn	xmm0, [esp+640-416]
	packuswb xmm6, xmm7
	movdqa	xmm7, [esp+640-112]
	por	xmm7, [esp+640-80]
	pand	xmm4, xmm7
	por	xmm4, xmm0
	movdqa	xmm0, [esp+752-272]
	punpckhbw xmm0, [esp+640-48]		; q3 hi (zero vector comes from the stack copy)
	psllw	xmm0, 1
	paddw	xmm0, xmm2
	paddw	xmm0, xmm2
	paddw	xmm0, xmm2
	paddw	xmm0, xmm5
	paddw	xmm0, [esp+640-432]
	paddw	xmm0, [esp+640-496]
	paddw	xmm0, [esp+640-592]		; 2*q3 + 3*q2 + q1 + q0 + p0 + 4
	psraw	xmm0, 3
	pand	xmm0, xmm3
	movdqa	xmm7, xmm1
	pandn	xmm3, xmm2
	por	xmm0, xmm3
	pand	xmm7, xmm0

	; Write the six filtered rows back and restore the saved registers.
	; No [esp+...] access follows the first pop.
	movdqa	xmm0, [esp+656-272]
	movdqa	 [edx], xmm0			; p2'

	movdqa	xmm0, [esp+672-272]

	mov	edx, dword [esp+640-596]	; edx = address of q1
	movdqa	 [esi], xmm0			; p1'
	movdqa	xmm0, [esp+688-272]
	movdqa	 [edi], xmm0			; p0'
	movdqa	xmm0, [esp+704-272]

	pop	edi
	pandn	xmm1, xmm2			; ~EN & q2 (hi)
	movdqa	 [eax], xmm0			; q0'
	por	xmm7, xmm1
	pop	esi
	packuswb xmm4, xmm7
	movdqa	 [edx], xmm6			; q1'
	movdqa	 [ecx], xmm4			; q2'
	pop	ebx
	mov	esp, ebp
	pop	ebp
	ret
  
    
;********************************************************************************
;
;   void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
;
;   Gathers a strip that is 8 bytes wide and 16 rows tall, starting at pPixY
;   with a row pitch of iStride, into eight XMM registers (row r in the low
;   qword, row r+8 in the high qword), runs SSE2_TransTwo8x8B over them and
;   stores the eight resulting 16-byte vectors back to back at pDst
;   (128 bytes; movdqa stores, so pDst must be 16-byte aligned).
;
;   Arguments are taken from the stack.  ebp and ebx are saved and restored;
;   eax, ecx, edx and xmm0-xmm7 are overwritten.
;
;********************************************************************************

WELS_EXTERN  DeblockLumaTransposeH2V_sse2

ALIGN  16

DeblockLumaTransposeH2V_sse2:
    push        ebp
    push        ebx
    mov         ebp, esp                    ; two pushes: first argument is at [ebp + 12]
    and         esp, -16
    sub         esp, 16                     ; 16-byte aligned scratch slot at [esp]

    mov         ecx, [ebp + 16]             ; ecx = iStride
    mov         eax, [ebp + 12]             ; eax -> row 0
    lea         ebx, [ecx + ecx*2]          ; ebx = 3 * iStride
    lea         edx, [eax + ecx*8]          ; edx -> row 8

    ; rows 0..3 (low qword) paired with rows 8..11 (high qword); xmm7 is the temporary
    movq        xmm7, [edx]
    movq        xmm0, [eax]
    punpcklqdq  xmm0, xmm7
    movq        xmm7, [edx + ecx]
    movq        xmm1, [eax + ecx]
    punpcklqdq  xmm1, xmm7
    movq        xmm7, [edx + ecx*2]
    movq        xmm2, [eax + ecx*2]
    punpcklqdq  xmm2, xmm7
    movq        xmm7, [edx + ebx]
    movq        xmm3, [eax + ebx]
    punpcklqdq  xmm3, xmm7

    ; step both row pointers down by four rows
    lea         edx, [edx + ecx*4]          ; edx -> row 12
    lea         eax, [eax + ecx*4]          ; eax -> row 4

    ; rows 4..6 paired with rows 12..14
    movq        xmm7, [edx]
    movq        xmm4, [eax]
    punpcklqdq  xmm4, xmm7
    movq        xmm7, [edx + ecx]
    movq        xmm5, [eax + ecx]
    punpcklqdq  xmm5, xmm7
    movq        xmm7, [edx + ecx*2]
    movq        xmm6, [eax + ecx*2]
    punpcklqdq  xmm6, xmm7

    ; rows 7 and 15 land in xmm7 itself, so xmm0 is parked on the stack and
    ; borrowed as the temporary for this last pair
    movdqa      [esp], xmm0
    movq        xmm0, [edx + ebx]
    movq        xmm7, [eax + ebx]
    punpcklqdq  xmm7, xmm0
    movdqa      xmm0, [esp]

    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
    ; macro output order (1-based operand index): m5, m3, m4, m8, m6, m2, m7, m1
    ; i.e. xmm4, xmm2, xmm3, xmm7, xmm5, xmm1, xmm6, xmm0

    mov         eax, [ebp + 20]             ; eax = pDst
    movdqa      [eax],       xmm4
    movdqa      [eax + 16],  xmm2
    movdqa      [eax + 32],  xmm3
    movdqa      [eax + 48],  xmm7
    movdqa      [eax + 64],  xmm5
    movdqa      [eax + 80],  xmm1
    movdqa      [eax + 96],  xmm6
    movdqa      [eax + 112], xmm0

    mov         esp, ebp                    ; drop the aligned scratch area
    pop         ebx
    pop         ebp
    ret
    
    
    
;*******************************************************************************************
;
;   void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
;
;   Loads eight 16-byte vectors from the contiguous 128-byte buffer pSrc
;   (movdqa loads, so pSrc must be 16-byte aligned), runs SSE2_TransTwo8x8B
;   over them and scatters the result as 16 rows of 8 bytes each, starting
;   at pPixY with a row pitch of iStride.  The low qwords of the result
;   registers become rows 0..7 and the high qwords become rows 8..15.
;
;   Arguments are taken from the stack.  ebp is saved and restored; eax, ecx,
;   edx and xmm0-xmm7 are overwritten.
;
;*******************************************************************************************

WELS_EXTERN   DeblockLumaTransposeV2H_sse2

ALIGN  16

DeblockLumaTransposeV2H_sse2:
    push        ebp
    mov         ebp, esp

    mov         edx, [ebp + 8]              ; edx = pPixY (destination row pointer)
    mov         ecx, [ebp + 12]             ; ecx = iStride
    mov         eax, [ebp + 16]             ; eax = pSrc

    and         esp, -16
    sub         esp, 16                     ; 16-byte aligned scratch slot for the macro

    movdqa      xmm7, [eax + 112]
    movdqa      xmm6, [eax + 96]
    movdqa      xmm5, [eax + 80]
    movdqa      xmm4, [eax + 64]
    movdqa      xmm3, [eax + 48]
    movdqa      xmm2, [eax + 32]
    movdqa      xmm1, [eax + 16]
    movdqa      xmm0, [eax]

    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
    ; macro output order (1-based operand index): m5, m3, m4, m8, m6, m2, m7, m1
    ; i.e. xmm4, xmm2, xmm3, xmm7, xmm5, xmm1, xmm6, xmm0

    lea         eax, [ecx + ecx*2]          ; eax = 3 * iStride

    ; rows 0..3 from the low qwords; each register is then shifted so that
    ; its high qword is ready for rows 8..11
    movq        [edx],          xmm4
    movq        [edx + ecx],    xmm2
    movq        [edx + ecx*2],  xmm3
    movq        [edx + eax],    xmm7
    psrldq      xmm4, 8
    psrldq      xmm2, 8
    psrldq      xmm3, 8
    psrldq      xmm7, 8

    ; rows 4..7, same treatment (high qwords go to rows 12..15)
    lea         edx, [edx + ecx*4]
    movq        [edx],          xmm5
    movq        [edx + ecx],    xmm1
    movq        [edx + ecx*2],  xmm6
    movq        [edx + eax],    xmm0
    psrldq      xmm5, 8
    psrldq      xmm1, 8
    psrldq      xmm6, 8
    psrldq      xmm0, 8

    ; rows 8..11
    lea         edx, [edx + ecx*4]
    movq        [edx],          xmm4
    movq        [edx + ecx],    xmm2
    movq        [edx + ecx*2],  xmm3
    movq        [edx + eax],    xmm7

    ; rows 12..15
    lea         edx, [edx + ecx*4]
    movq        [edx],          xmm5
    movq        [edx + ecx],    xmm1
    movq        [edx + ecx*2],  xmm6
    movq        [edx + eax],    xmm0

    mov         esp, ebp                    ; drop the aligned scratch area
    pop         ebp
    ret