; shithub: openh264
;
; ref: 69f2d768b1ea8be8b549f3f47442a620587405b8
; dir: /codec/common/x86/satd_sad.asm/
;
; View raw version
;*!
;* \copy
;*     Copyright (c)  2009-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  satd_sad.asm
;*
;*  Abstract
;*      WelsSampleSatd4x4_sse2
;*      WelsSampleSatd8x8_sse2
;*      WelsSampleSatd16x8_sse2
;*      WelsSampleSatd8x16_sse2
;*      WelsSampleSatd16x16_sse2
;*
;*      WelsSampleSad16x8_sse2
;*      WelsSampleSad16x16_sse2
;*
;*  History
;*      8/5/2009 Created
;*     24/9/2009 modified
;*
;*
;*************************************************************************/

%include "asm_inc.asm"

;***********************************************************************
; Data
;***********************************************************************
; Read-only, 16-byte aligned constant tables used by the SSSE3/SSE4.1 kernels below.
SECTION .rodata align=16

align 16
; pmaddubsw multiplier: bytes 0-7 are (1,1) pairs -> sum of each adjacent byte pair,
; bytes 8-15 are (1,-1) pairs -> difference of each adjacent byte pair.
HSumSubDB1:   db 1,1,1,1,1,1,1,1,1,-1,1,-1,1,-1,1,-1
align 16
; pmaddwd multiplier: (1,-1) per word pair -> difference of each adjacent word pair.
HSumSubDW1:   dw 1,-1,1,-1,1,-1,1,-1
align 16
; Eight words of 1; pmaddwd with this table sums each adjacent word pair.
PDW1:  dw 1,1,1,1,1,1,1,1
align 16
; The value 2 in the low word of each 64-bit half; used as the rounding term of "(sum+2)>>2".
PDQ2:  dw 2,0,0,0,2,0,0,0
align 16
; pmaddubsw multiplier, same 8-byte pattern in both halves: two pair sums then two pair differences.
HSwapSumSubDB1:   times 2 db 1, 1, 1, 1, 1, -1, 1, -1

;***********************************************************************
; Code
;***********************************************************************
SECTION .text

;***********************************************************************
;
;Pixel_satd_wxh_sse2 BEGIN
;
;***********************************************************************
; MMX_DW_1_2REG dst, tmp
;   dst <- the value 1 in every 16-bit lane
;   tmp <- all bits set (0xFFFF in every 16-bit lane)
%macro MMX_DW_1_2REG 2
      pcmpeqw %2, %2            ; tmp = -1 in every word
      pxor    %1, %1            ; dst = 0
      psubw   %1, %2            ; dst = 0 - (-1) = 1 in every word
%endmacro

; SSE2_SumWHorizon1 acc, tmp
;   Folds the eight unsigned 16-bit lanes of acc into lane 0 using saturating adds
;   (paddusw). Only word 0 of acc holds the total afterwards; the remaining words
;   contain partial sums. tmp is clobbered.
%macro  SSE2_SumWHorizon1 2
	; fold upper 4 words onto lower 4 words
	movdqa      %2, %1
	psrldq      %2, 8
	paddusw     %1, %2
	; fold words 2..3 onto words 0..1
	movdqa      %2, %1
	psrldq      %2, 4
	paddusw     %1, %2
	; fold word 1 onto word 0
	movdqa      %2, %1
	psrldq      %2, 2
	paddusw     %1, %2
%endmacro

; SSE2_HDMTwo4x4 a, b, c, d, tmp
;   Applies four SSE2_SumSub butterflies in the order (a,b), (c,d), (b,d), (a,c),
;   with tmp as the scratch register of each butterfly.
;   NOTE(review): SSE2_SumSub is defined outside this file, so which operand ends up
;   holding the sum and which the difference cannot be confirmed from here; the
;   in/out register note on the macro line below is the original author's.
%macro SSE2_HDMTwo4x4 5 ;in: xmm1,xmm2,xmm3,xmm4  pOut: xmm4,xmm2,xmm1,xmm3
   SSE2_SumSub %1, %2, %5
   SSE2_SumSub %3, %4, %5
   SSE2_SumSub %2, %4, %5
   SSE2_SumSub %1, %3, %5
%endmacro

; SSE2_SumAbs4 a, b, tmp1, c, d, tmp2, acc
;   Runs WELS_AbsW on a, b (scratch tmp1) and on c, d (scratch tmp2), then adds the
;   four results into acc per 16-bit lane with unsigned saturation (paddusw).
;   a and c are overwritten with the pair sums a+b and c+d.
;   NOTE(review): WELS_AbsW is defined elsewhere; presumably it replaces each word of
;   its first operand by its absolute value -- confirm against asm_inc.asm.
%macro SSE2_SumAbs4 7
	WELS_AbsW %1, %3
	WELS_AbsW %2, %3
	WELS_AbsW %4, %6
	WELS_AbsW %5, %6
	paddusw       %1, %2
	paddusw       %4, %5
	paddusw       %7, %1
	paddusw       %7, %4
%endmacro

; SSE2_SumWHorizon acc, tmp, zero
;   Horizontal sum of the eight 16-bit lanes of acc; the 32-bit total is left in the
;   low dword of acc. tmp is clobbered. The third operand is interleaved into the high
;   half of every dword by punpcklwd, so it must be all-zero for the widening to act
;   as a zero-extension (every caller in this file passes a register cleared by pxor).
%macro  SSE2_SumWHorizon 3
	movhlps		%2, %1			; x2 = xx xx xx xx d7 d6 d5 d4
	paddw		%1, %2			; x1 = xx xx xx xx d37 d26 d15 d04
	punpcklwd	%1, %3			; x1 =  d37  d26 d15 d04
	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26
	paddd		%1, %2			; x1 = xxxx xxxx d1357 d0246
	pshuflw		%2, %1, 0x4e	; x2 = xxxx xxxx d0246 d1357
	paddd		%1, %2			; x1 = xxxx xxxx xxxx  d01234567
%endmacro

; SSE2_GetSatd8x8
;   Processes eight rows of two 8-pixel-wide blocks as two groups of four rows.
;   In:   r0/r1 = first block pointer / stride, r2/r3 = second block pointer / stride,
;         xmm7  = passed as the third operand of SSE2_LoadDiff8P (callers clear it with pxor),
;         xmm6  = running per-word accumulator.
;   Out:  xmm6 += absolute transformed differences of the eight rows (saturating adds).
;   Side effects: r0 and r2 are advanced by six rows in total (three "lea +2*stride"),
;         so they end up pointing at row 6 of the block; xmm0-xmm5 are clobbered.
;   NOTE(review): SSE2_LoadDiff8P and SSE2_TransTwo4x4W are defined outside this file;
;   presumably "load 8 pixels from both blocks and subtract" and "transpose two 4x4
;   word matrices" -- confirm against asm_inc.asm.
%macro SSE2_GetSatd8x8 0
	; rows 0..3: load differences
	SSE2_LoadDiff8P    xmm0,xmm4,xmm7,[r0],[r2]
	SSE2_LoadDiff8P    xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
	lea                 r0, [r0+2*r1]
	lea                 r2, [r2+2*r3]
	SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[r0],[r2]
	SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[r0+r1],[r2+r3]

	; butterflies, transpose, butterflies again, then accumulate absolute values in xmm6
	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm4
	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm4
	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5
	SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6

	; rows 4..7: same sequence
	lea					r0,    [r0+2*r1]
    lea					r2,    [r2+2*r3]
	SSE2_LoadDiff8P    xmm0,xmm4,xmm7,[r0],[r2]
	SSE2_LoadDiff8P    xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
	lea                 r0, [r0+2*r1]
	lea                 r2, [r2+2*r3]
	SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[r0],[r2]
	SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[r0+r1],[r2+r3]

	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm4
	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm4
	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5
	SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
%endmacro

;***********************************************************************
;
;int32_t WelsSampleSatd4x4_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
;
;***********************************************************************
; WelsSampleSatd4x4_sse2
;   After LOAD_4_PARA: r0/r1 = first block pointer / stride, r2/r3 = second block
;   pointer / stride (strides are sign-extended from 32 bits).
;   Reads four rows of four bytes from each block, transforms the difference and
;   returns in retrd: ((sum of absolute transformed values) & 0xffff) >> 1.
;   NOTE(review): SSE2_XSawp and WELS_AbsW come from outside this file; the shuffle
;   steps are described below only as far as the visible instructions show.
WELS_EXTERN WelsSampleSatd4x4_sse2
	%assign  push_num 0
	LOAD_4_PARA
	PUSH_XMM 8
	SIGN_EXTENSION r1, r1d
	SIGN_EXTENSION r3, r3d
    ; first block: xmm0 = row0 | row2, xmm1 = row1 | row3 (4 bytes each)
    movd      xmm0, [r0]
    movd      xmm1, [r0+r1]
    lea       r0 , [r0+2*r1]
    movd      xmm2, [r0]
    movd      xmm3, [r0+r1]
    punpckldq xmm0, xmm2
    punpckldq xmm1, xmm3

    ; second block: xmm4 = row0 | row2, xmm5 = row1 | row3
    movd      xmm4, [r2]
    movd      xmm5, [r2+r3]
    lea       r2 , [r2+2*r3]
    movd      xmm6, [r2]
    movd      xmm7, [r2+r3]
    punpckldq xmm4, xmm6
    punpckldq xmm5, xmm7

    ; widen bytes to words (xmm6 = 0 and stays zero until it is used as the accumulator)
    pxor      xmm6, xmm6
    punpcklbw xmm0, xmm6
    punpcklbw xmm1, xmm6
    punpcklbw xmm4, xmm6
    punpcklbw xmm5, xmm6

    ; per-pixel difference
    psubw     xmm0, xmm4
    psubw     xmm1, xmm5

    ; first sum/difference stage
    movdqa    xmm2, xmm0
    paddw     xmm0, xmm1
    psubw     xmm2, xmm1
    SSE2_XSawp qdq, xmm0, xmm2, xmm3

    ; second sum/difference stage
    movdqa     xmm4, xmm0
    paddw      xmm0, xmm3
    psubw      xmm4, xmm3

    ; word interleave followed by two XSawp reshuffles
    movdqa         xmm2, xmm0
    punpcklwd      xmm0, xmm4
    punpckhwd      xmm4, xmm2

	SSE2_XSawp     dq,  xmm0, xmm4, xmm3
	SSE2_XSawp     qdq, xmm0, xmm3, xmm5

    ; third sum/difference stage
    movdqa         xmm7, xmm0
    paddw          xmm0, xmm5
    psubw          xmm7, xmm5

	SSE2_XSawp     qdq,  xmm0, xmm7, xmm1

    ; fourth sum/difference stage: results in xmm0 and xmm2
    movdqa         xmm2, xmm0
    paddw          xmm0, xmm1
    psubw          xmm2, xmm1

    ; accumulate absolute values into xmm6 (saturating), then fold to word 0
    WELS_AbsW  xmm0, xmm3
    paddusw        xmm6, xmm0
	WELS_AbsW  xmm2, xmm4
    paddusw        xmm6, xmm2
    SSE2_SumWHorizon1  xmm6, xmm4
	movd           retrd,  xmm6
    ; only word 0 is the total; halve it for the return value
    and            retrd,  0xffff
    shr            retrd,  1
	POP_XMM
	LOAD_4_PARA_POP
	ret

 ;***********************************************************************
 ;
 ;int32_t WelsSampleSatd8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
 ;
 ;***********************************************************************
; WelsSampleSatd8x8_sse2
;   One 8x8 block: a single SSE2_GetSatd8x8 pass, halve the per-word partial sums,
;   then add the eight words together. Result is returned in retrd.
;   After LOAD_4_PARA: r0/r1 and r2/r3 are the two (pointer, stride) pairs.
WELS_EXTERN WelsSampleSatd8x8_sse2
	%assign  push_num 0
	LOAD_4_PARA
	PUSH_XMM 8
	SIGN_EXTENSION r1, r1d
	SIGN_EXTENSION r3, r3d
	pxor    xmm7, xmm7                  ; zero operand for the helper macros
	pxor    xmm6, xmm6                  ; per-word accumulator
	SSE2_GetSatd8x8
	psrlw   xmm6, 1                     ; halve before the horizontal sum
	SSE2_SumWHorizon xmm6, xmm4, xmm7
	movd    retrd, xmm6
	POP_XMM
	LOAD_4_PARA_POP
	ret

 ;***********************************************************************
 ;
 ;int32_t WelsSampleSatd8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
 ;
 ;***********************************************************************
; WelsSampleSatd8x16_sse2
;   8 columns x 16 rows: two SSE2_GetSatd8x8 passes stacked vertically.
;   SSE2_GetSatd8x8 leaves r0/r2 on row 6 of the block it processed, so two more
;   rows are skipped between the passes. Result is returned in retrd.
WELS_EXTERN WelsSampleSatd8x16_sse2
	%assign  push_num 0
	LOAD_4_PARA
	PUSH_XMM 8
	SIGN_EXTENSION r1, r1d
	SIGN_EXTENSION r3, r3d
	pxor    xmm7, xmm7                  ; zero operand for the helper macros
	pxor    xmm6, xmm6                  ; per-word accumulator

	SSE2_GetSatd8x8                     ; rows 0..7
	lea     r2, [r2+2*r3]               ; step both pointers to row 8
	lea     r0, [r0+2*r1]
	SSE2_GetSatd8x8                     ; rows 8..15

	psrlw   xmm6, 1                     ; halve before the horizontal sum
	SSE2_SumWHorizon xmm6, xmm4, xmm7
	movd    retrd, xmm6
	POP_XMM
	LOAD_4_PARA_POP
	ret

;***********************************************************************
;
;int32_t WelsSampleSatd16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
;
;***********************************************************************
; WelsSampleSatd16x8_sse2
;   16 columns x 8 rows: two SSE2_GetSatd8x8 passes side by side. The block pointers
;   are saved on the stack because the macro advances them; the second pass starts
;   8 bytes to the right of the saved pointers. Result is returned in retrd.
WELS_EXTERN WelsSampleSatd16x8_sse2
	%assign  push_num 0
	LOAD_4_PARA
	PUSH_XMM 8
	SIGN_EXTENSION r1, r1d
	SIGN_EXTENSION r3, r3d
	pxor    xmm7, xmm7                  ; zero operand for the helper macros
	pxor    xmm6, xmm6                  ; per-word accumulator
	push    r0                          ; keep the original block pointers
	push    r2

	SSE2_GetSatd8x8                     ; left 8 columns

	pop     r2
	pop     r0
	add     r2, 8                       ; move to the right 8 columns
	add     r0, 8
	SSE2_GetSatd8x8

	psrlw   xmm6, 1                     ; halve before the horizontal sum
	SSE2_SumWHorizon xmm6, xmm4, xmm7
	movd    retrd, xmm6
	POP_XMM
	LOAD_4_PARA_POP
	ret

;***********************************************************************
;
;int32_t WelsSampleSatd16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
;
;***********************************************************************
; WelsSampleSatd16x16_sse2
;   16x16 block as four 8x8 passes: left column (top, bottom), then right column
;   (top, bottom). The original pointers are saved on the stack and restored before
;   the right column. Result is returned in retrd.
WELS_EXTERN WelsSampleSatd16x16_sse2
	%assign  push_num 0
	LOAD_4_PARA
	PUSH_XMM 8
	SIGN_EXTENSION r1, r1d
	SIGN_EXTENSION r3, r3d
	pxor    xmm7, xmm7                  ; zero operand for the helper macros
	pxor    xmm6, xmm6                  ; per-word accumulator
	push    r0                          ; keep the original block pointers
	push    r2

	; left 8 columns
	SSE2_GetSatd8x8                     ; rows 0..7 (pointers end on row 6)
	lea     r2, [r2+2*r3]               ; step to row 8
	lea     r0, [r0+2*r1]
	SSE2_GetSatd8x8                     ; rows 8..15

	; right 8 columns
	pop     r2
	pop     r0
	add     r2, 8
	add     r0, 8

	SSE2_GetSatd8x8
	lea     r2, [r2+2*r3]
	lea     r0, [r0+2*r1]
	SSE2_GetSatd8x8

	; Every per-column SATD sum is even, so halving before the horizontal add
	; loses no precision.
	psrlw   xmm6, 1
	SSE2_SumWHorizon xmm6, xmm4, xmm7
	movd    retrd, xmm6
	POP_XMM
	LOAD_4_PARA_POP
	ret

;***********************************************************************
;
;Pixel_satd_wxh_sse2 END
;
;***********************************************************************

;***********************************************************************
;
;Pixel_satd_intra_sse2 BEGIN
;
;***********************************************************************


; SSE_DB_1_2REG dst, tmp
;   dst <- the value 1 in every byte
;   tmp <- all bits set
%macro SSE_DB_1_2REG 2
      pcmpeqw %2, %2            ; tmp = 0xFF in every byte
      pxor    %1, %1            ; dst = 0
      psubb   %1, %2            ; dst = 0 - (-1) = 1 in every byte
%endmacro

;***********************************************************************
;
;int32_t WelsSampleSatdThree4x4_sse2( uint8_t *pDec, int32_t iLineSizeDec, uint8_t *pEnc, int32_t iLinesizeEnc,
;                             uint8_t* pRed, int32_t* pBestMode, int32_t, int32_t, int32_t);
;
;***********************************************************************
; WelsSampleSatdThree4x4_sse2
;   Evaluates three 4x4 intra predictions (V from the row above, H from the column to
;   the left, DC from both) against the source block in the transform domain, picks
;   the cheapest, writes the 16 predicted bytes and the chosen mode, returns the cost.
;
;   arg1/arg2 : pointer / stride of the reconstructed block whose neighbours are read
;   arg3/arg4 : pointer / stride of the source block
;   arg5      : 16-byte destination for the prediction (written with movdqa, so it
;               must be 16-byte aligned)
;   arg6      : int32_t* receiving the mode: 2 = DC, 1 = H, 0 = V
;   arg7..9   : extra cost added to the DC, H and V SATD respectively
;   Return    : retrd = cost of the selected mode
;
;   Selection order is DC, then H, then V; a later mode only wins when it is strictly
;   cheaper. The costs are full 32-bit values (SATD + extra cost) and are compared as
;   32-bit integers, the same way the 16x16 and chroma variants in this file do it.
WELS_EXTERN WelsSampleSatdThree4x4_sse2

%ifdef X86_32
	push r3
	push r4
	push r5
	push r6
	%assign  push_num 4
%else
	%assign  push_num 0
%endif
	PUSH_XMM 8

	mov  r2, arg3
	mov  r3, arg4
	SIGN_EXTENSION r3, r3d

	; load source 4x4 samples and Hadamard transform
	movd      xmm0, [r2]
	movd      xmm1, [r2+r3]
	lea       r2 , [r2+2*r3]
	movd      xmm2, [r2]
	movd      xmm3, [r2+r3]
	punpckldq xmm0, xmm2
	punpckldq xmm1, xmm3

	pxor      xmm6, xmm6
	punpcklbw xmm0, xmm6
	punpcklbw xmm1, xmm6

	movdqa    xmm2, xmm0
	paddw     xmm0, xmm1
	psubw     xmm2, xmm1
	SSE2_XSawp  qdq, xmm0, xmm2, xmm3

	movdqa    xmm4, xmm0
	paddw     xmm0, xmm3
	psubw     xmm4, xmm3

	movdqa    xmm2, xmm0
	punpcklwd xmm0, xmm4
	punpckhwd xmm4, xmm2

	SSE2_XSawp  dq,  xmm0, xmm4, xmm3
	SSE2_XSawp  qdq, xmm0, xmm3, xmm5

	movdqa    xmm7, xmm0
	paddw     xmm0, xmm5
	psubw     xmm7, xmm5

	SSE2_XSawp  qdq,  xmm0, xmm7, xmm1

	; Hadamard transform results are saved in xmm0 and xmm2
	movdqa    xmm2, xmm0
	paddw     xmm0, xmm1
	psubw     xmm2, xmm1

	;load top boundary samples: [a b c d]
	mov r0, arg1
	mov r1, arg2
	SIGN_EXTENSION r1, r1d
	sub r0, r1
%ifdef UNIX64
	; r4/r5 are used as scratch below; saved here and restored before arg5 is read
	push r4
	push r5
%endif

	movzx     r2d,  byte [r0]
	movzx     r3d,  byte [r0+1]
	movzx     r4d,  byte [r0+2]
	movzx     r5d,  byte [r0+3]

	; get the transform results of top boundary samples: [a b c d]
	add       r3d, r2d ; r3d = a + b
	add       r5d, r4d ; r5d = c + d
	add       r2d, r2d ; r2d = a + a
	add       r4d, r4d ; r4d = c + c
	sub       r2d, r3d ; r2d = a + a - a - b = a - b
	sub       r4d, r5d ; r4d = c + c - c - d = c - d
	add       r5d, r3d ; r5d = (a + b) + (c + d)
	add       r3d, r3d
	sub       r3d, r5d ; r3d = (a + b) - (c + d)
	add       r4d, r2d ; r4d = (a - b) + (c - d)
	add       r2d, r2d
	sub       r2d, r4d ; r2d = (a - b) - (c - d) ; [r5d r3d r2d r4d]

	; keep an untouched copy of the source transform for the H and DC passes
	movdqa    xmm6, xmm0
	movdqa    xmm7, xmm2
	movd      xmm5, r5d ; keep the top-row sum (a+b+c+d) for DC mode
	pxor      xmm3, xmm3
	pxor      xmm4, xmm4
	pinsrw    xmm3, r5d, 0
	pinsrw    xmm3, r4d, 4
	psllw     xmm3, 2
	pinsrw    xmm4, r3d, 0
	pinsrw    xmm4, r2d, 4
	psllw     xmm4, 2

	; get the satd of V (prediction from the top row)
	psubw     xmm0, xmm3
	psubw     xmm2, xmm4

	WELS_AbsW  xmm0, xmm1
	WELS_AbsW  xmm2, xmm1
	paddusw        xmm0, xmm2
	SSE2_SumWHorizon1  xmm0, xmm1 ; satd of V is stored in xmm0

	;load left boundary samples: [a b c d]'
	add r0, r1

	movzx     r2d,  byte [r0-1]
	movzx     r3d,  byte [r0+r1-1]
	lea       r0 , [r0+2*r1]
	movzx     r4d,  byte [r0-1]
	movzx     r5d,  byte [r0+r1-1]

	; get the transform results of left boundary samples: [a b c d]'
	add       r3d, r2d ; r3d = a + b
	add       r5d, r4d ; r5d = c + d
	add       r2d, r2d ; r2d = a + a
	add       r4d, r4d ; r4d = c + c
	sub       r2d, r3d ; r2d = a + a - a - b = a - b
	sub       r4d, r5d ; r4d = c + c - c - d = c - d
	add       r5d, r3d ; r5d = (a + b) + (c + d)
	add       r3d, r3d
	sub       r3d, r5d ; r3d = (a + b) - (c + d)
	add       r4d, r2d ; r4d = (a - b) + (c - d)
	add       r2d, r2d
	sub       r2d, r4d ; r2d = (a - b) - (c - d) ; [r5d r3d r2d r4d]

	; store the transform results in xmm3
	movd      xmm3, r5d
	pinsrw    xmm3, r3d, 1
	pinsrw    xmm3, r2d, 2
	pinsrw    xmm3, r4d, 3
	psllw     xmm3, 2

	; get the satd of H (prediction from the left column)
	movdqa    xmm2, xmm6
	movdqa    xmm4, xmm7
	psubw     xmm2, xmm3
	WELS_AbsW  xmm2, xmm1
	WELS_AbsW  xmm4, xmm1
	paddusw        xmm2, xmm4
	SSE2_SumWHorizon1  xmm2, xmm1 ; satd of H is stored in xmm2

	; DC result is stored in xmm1: DC = (top sum + left sum + 4) >> 3
	add       r5d, 4
	movd      xmm1, r5d
	paddw     xmm1, xmm5
	psrlw     xmm1, 3
	movdqa    xmm5, xmm1 ; xmm5 keeps the DC pixel value for the prediction output
	psllw     xmm1, 4    ; 16*DC, subtracted from word 0 of the source transform

	; get the satd of DC
	psubw          xmm6, xmm1
	WELS_AbsW  xmm6, xmm1
	WELS_AbsW  xmm7, xmm1
	paddusw        xmm6, xmm7
	SSE2_SumWHorizon1  xmm6, xmm1 ; satd of DC is stored in xmm6
%ifdef UNIX64
	pop r5
	pop r4
%endif
	; comparing order: DC H V

	mov  r4, arg5
	movd      r2d, xmm6
	movd      r3d, xmm2
	movd      r6d, xmm0

	; only word 0 of each folded register is the total; halve it, then add the extra cost
	and       r2d, 0xffff
	shr       r2d, 1
	and       r3d, 0xffff
	shr       r3d, 1
	and       r6d, 0xffff
	shr       r6d, 1
	add       r2d, dword arg7
	add       r3d, dword arg8
	add       r6d, dword arg9
	; 32-bit compares: the costs can exceed 0x7fff once the extra cost is added, which
	; a signed 16-bit compare would treat as negative.
	cmp       r2d, r3d
	jg near   not_dc
	cmp       r2d, r6d
	jg near   not_dc_h

	; for DC mode
	movd      r3d, xmm5
	imul      r3d, 0x01010101 ; replicate the DC byte into all four bytes
	movd	  xmm5, r3d
	pshufd    xmm5, xmm5, 0
	movdqa    [r4], xmm5
	mov r5, arg6
	mov       dword [r5], 0x02
	mov retrd, r2d
	POP_XMM
%ifdef X86_32
	pop r6
	pop r5
	pop r4
	pop r3
%endif
	ret

not_dc:
	cmp       r3d, r6d
	jg near   not_dc_h

	; for H mode
	; xmm6 = 0x01 in every byte, so pmuludq replicates each left pixel across a dword
	SSE_DB_1_2REG  xmm6, xmm7
	; r0 was advanced by two rows while reading the left column; step back to row 0
	sub        r0, r1
	sub        r0, r1
	movzx      r6d,  byte [r0-1]
	movd       xmm0, r6d
	pmuludq    xmm0, xmm6

	movzx     r6d,  byte [r0+r1-1]
	movd      xmm1, r6d
	pmuludq   xmm1, xmm6
	punpckldq xmm0, xmm1

	lea       r0,	[r0+r1*2]
	movzx	  r6d,	byte [r0-1]
	movd	  xmm2,	r6d
	pmuludq   xmm2, xmm6

	movzx	  r6d,	byte [r0+r1-1]
	movd	  xmm3,	r6d
	pmuludq   xmm3, xmm6
	punpckldq  xmm2, xmm3
	punpcklqdq xmm0, xmm2

	movdqa	  [r4],xmm0

	mov       retrd, r3d
	mov r5, arg6
	mov       dword [r5], 0x01
	POP_XMM
%ifdef X86_32
	pop r6
	pop r5
	pop r4
	pop r3
%endif
	ret
not_dc_h:
	; for V mode: step back three rows to the row above the block and replicate it
	sub        r0, r1
	sub        r0, r1
	sub        r0, r1
	movd	  xmm0,	[r0]
	pshufd	  xmm0,	xmm0, 0
	movdqa	  [r4],xmm0
	mov       retrd, r6d
	mov r5, arg6
	mov       dword [r5], 0x00
	POP_XMM
%ifdef X86_32
	pop r6
	pop r5
	pop r4
	pop r3
%endif
	ret


; SSE41_I16x16Get8WSumSub val, tmp1, tmp2
;   Expects xmm5 = HSumSubDB1, xmm6 = HSumSubDW1, xmm7 = PDW1 (loaded by the caller).
;   val holds 16 boundary bytes on entry (callers duplicate 8 pixels into both halves).
;   Three multiply-add stages form sums and differences of the pixels; the dword
;   results are added into xmm4 ("for dc"), packed to signed words and multiplied by 4.
;   Out: val = eight words; tmp1 and tmp2 are clobbered.
%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3
	; byte pairs -> words: pair sums in the low half, pair differences in the high half
	pmaddubsw    %1, xmm5
	movdqa       %2, %1
	; word pairs -> dwords: sums in %1, differences in %2
	pmaddwd      %1, xmm7
	pmaddwd      %2, xmm6
	; interleave the two dword sets
	movdqa       %3, %1
	punpckldq    %1, %2
	punpckhdq    %2, %3
	movdqa       %3, %1
	punpcklqdq   %1, %2
	punpckhqdq   %3, %2
	paddd        xmm4, %1 ;for dc
	paddd        xmm4, %3 ;for dc
	packssdw     %1, %3
	psllw        %1, 2
%endmacro
; SSE41_ChromaGet8WSumSub val, tmp1, tmp2, sumOut
;   Same multiply-add and interleave sequence as SSE41_I16x16Get8WSumSub, but instead
;   of accumulating into xmm4 it returns the low qwords of the two interleaved dword
;   registers in sumOut (the caller derives the chroma DC terms from them).
;   Expects xmm5 = HSumSubDB1, xmm6 = HSumSubDW1, xmm7 = PDW1.
;   Out: val = eight words (times 4), sumOut as described; tmp1 and tmp2 are clobbered.
%macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2
	pmaddubsw    %1, xmm5
	movdqa       %2, %1
	pmaddwd      %1, xmm7
	pmaddwd      %2, xmm6
	movdqa       %3, %1
	punpckldq    %1, %2
	punpckhdq    %2, %3
	movdqa       %3, %1
	punpcklqdq   %1, %2
	punpckhqdq   %3, %2
;    paddd        xmm4, %1 ;for dc
;	 paddd        xmm4, %3 ;for dc
	; sumOut = low qword of %1 : low qword of %3
	movdqa       %4, %1
	punpcklqdq   %4, %3
	packssdw     %1, %3
	psllw        %1, 2
%endmacro

; SSE41_GetX38x4SatdDec
;   Loads four rows of eight bytes from [r2] (row stride r3), widens them to words and
;   runs butterflies / transpose / butterflies on them.
;   Side effects: r2 is advanced by four rows; xmm0-xmm3 and xmm7 are overwritten.
;   Per the original note below, the results are left in xmm7, xmm1, xmm3, xmm2
;   (NOTE(review): depends on SSE2_SumSub / SSE2_TransTwo4x4W, defined elsewhere).
%macro SSE41_GetX38x4SatdDec 0
	pxor        xmm7,   xmm7
	movq        xmm0,   [r2]
	movq        xmm1,   [r2+r3]
	lea         r2,    [r2+2*r3]
	movq        xmm2,   [r2]
	movq        xmm3,   [r2+r3]
	lea         r2,    [r2+2*r3]
	punpcklbw   xmm0,   xmm7
	punpcklbw   xmm1,   xmm7
	punpcklbw   xmm2,   xmm7
	punpcklbw   xmm3,   xmm7
	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm7
	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm7
	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2
	;doesn't need another transpose
%endmacro

; SSE41_GetX38x4SatdV idx, off
;   V-mode cost contribution for the 8x4 block currently held in xmm7/xmm1/xmm3/xmm2.
;   For each of the four registers, two words from the buffer at [r6+off] (at +2k and
;   +2k+8, k = 0..3) are placed in lanes 0 and 4 of an otherwise zero register, the
;   block register is subtracted with signed saturation, and the absolute values are
;   added to xmm4.
;   idx (%1) is not referenced in the body. xmm0 is clobbered.
%macro SSE41_GetX38x4SatdV 2
	pxor        xmm0,   xmm0
	pinsrw      xmm0,   word[r6+%2],   0
	pinsrw      xmm0,   word[r6+%2+8], 4
	psubsw      xmm0,   xmm7
	pabsw       xmm0,   xmm0
	paddw       xmm4,   xmm0
	pxor        xmm0,   xmm0
	pinsrw      xmm0,   word[r6+%2+2],  0
	pinsrw      xmm0,   word[r6+%2+10], 4
	psubsw      xmm0,   xmm1
	pabsw       xmm0,   xmm0
	paddw       xmm4,   xmm0
	pxor        xmm0,   xmm0
	pinsrw      xmm0,   word[r6+%2+4],  0
	pinsrw      xmm0,   word[r6+%2+12], 4
	psubsw      xmm0,   xmm3
	pabsw       xmm0,   xmm0
	paddw       xmm4,   xmm0
	pxor        xmm0,   xmm0
	pinsrw      xmm0,   word[r6+%2+6],  0
	pinsrw      xmm0,   word[r6+%2+14], 4
	psubsw      xmm0,   xmm2
	pabsw       xmm0,   xmm0
	paddw       xmm4,   xmm0
%endmacro
; SSE41_GetX38x4SatdH idx, off, base
;   H-mode cost contribution for the 8x4 block held in xmm7/xmm1/xmm3/xmm2.
;   Reads four words from [r6+base+8*idx], duplicates them into both halves, subtracts
;   xmm7 (signed saturation) and adds the absolute values to xmm5. Then xmm1, xmm2 and
;   xmm3 are replaced by their absolute values and summed into xmm2; that sum is added
;   to xmm5 and is also left in xmm2 for the DC macro that follows.
;   off (%2) is not referenced in the body. xmm0-xmm3 are modified.
%macro SSE41_GetX38x4SatdH  3
	movq        xmm0,   [r6+%3+8*%1]
	punpcklqdq  xmm0,   xmm0
	psubsw      xmm0,   xmm7
	pabsw       xmm0,   xmm0
	paddw       xmm5,   xmm0
	pabsw       xmm1,   xmm1
	pabsw       xmm2,   xmm2
	pabsw       xmm3,   xmm3
	paddw       xmm2,   xmm1;for DC
	paddw       xmm2,   xmm3;for DC
	paddw       xmm5,   xmm2
%endmacro
; SSE41_I16X16GetX38x4SatdDC
;   DC-mode cost contribution for the 8x4 block held in xmm7 (first transform row) and
;   xmm2 (sum of absolute values of the other rows, left there by SSE41_GetX38x4SatdH,
;   which must run first).
;   In:  mm4  = DC term in its low dword (set with movd by the caller),
;        xmm6 = DC cost accumulator.
;   Out: xmm6 += |dc - xmm7| + xmm2 per word; xmm0 is clobbered.
;   movq2dq writes the whole destination (low qword from mm4, high qword zeroed), so
;   xmm0 needs no clearing beforehand.
%macro SSE41_I16X16GetX38x4SatdDC 0
	movq2dq     xmm0,   mm4
	punpcklqdq  xmm0,   xmm0        ; same DC term in lane 0 of both halves
	psubsw      xmm0,   xmm7
	pabsw       xmm0,   xmm0
	paddw       xmm6,   xmm0
	paddw       xmm6,   xmm2
%endmacro
; SSE41_ChromaGetX38x4SatdDC idx
;   DC-mode cost contribution for the chroma 8x4 block held in xmm7 / xmm2 (xmm2 is
;   the absolute-value sum left by SSE41_GetX38x4SatdH, which must run first).
;   idx is a register: it is multiplied by 16 IN PLACE (shl) and used to select the
;   16-byte DC vector at [r6+32+16*idx] (aligned load).
;   Out: xmm6 += |dc - xmm7| + xmm2 per word; xmm0 and idx are modified.
%macro SSE41_ChromaGetX38x4SatdDC 1
	shl         %1,     4
	movdqa      xmm0,   [r6+32+%1]
	psubsw      xmm0,   xmm7
	pabsw       xmm0,   xmm0
	paddw       xmm6,   xmm0
	paddw       xmm6,   xmm2
%endmacro
; SSE41_I16x16GetX38x4Satd idx, off
;   One 8x4 step of the 16x16 three-mode evaluation: transform the next four source
;   rows, then add the V, H and DC contributions to xmm4, xmm5 and xmm6.
;   The H table is read at [r6+32+8*idx], the V table at [r6+off].
;   The order matters: the DC step consumes xmm2 as prepared by the H step.
%macro SSE41_I16x16GetX38x4Satd 2
	SSE41_GetX38x4SatdDec
	SSE41_GetX38x4SatdV   %1, %2
	SSE41_GetX38x4SatdH   %1, %2, 32
	SSE41_I16X16GetX38x4SatdDC
%endmacro
; SSE41_ChromaGetX38x4Satd idx, off
;   One 8x4 step of the chroma three-mode evaluation: transform the next four source
;   rows, then add the V, H and DC contributions to xmm4, xmm5 and xmm6.
;   The H table is read at [r6+16+8*idx], the V table at [r6+off].
;   idx must be a register; the DC step shifts it left by 4 in place, so the caller
;   sees a modified value afterwards.
%macro SSE41_ChromaGetX38x4Satd 2
	SSE41_GetX38x4SatdDec
	SSE41_GetX38x4SatdV   %1, %2
	SSE41_GetX38x4SatdH   %1, %2, 16
	SSE41_ChromaGetX38x4SatdDC %1
%endmacro
; SSE41_HSum8W acc, mul, tmp
;   Horizontal sum of the eight signed words of acc. pmaddwd with mul (callers pass
;   words of 1) adds adjacent word pairs into four dwords, which are then folded.
;   Out: low dword of acc = total; tmp is clobbered, mul is only read.
%macro SSE41_HSum8W 3
	pmaddwd     %1, %2
	; fold upper two dwords onto lower two
	movhlps     %3, %1
	paddd       %1, %3
	; fold dword 1 onto dword 0
	pshuflw     %3, %1,0Eh
	paddd       %1, %3
%endmacro

; WelsIntra16x16Combined3Satd_sse41
;   Evaluates the V, H and DC 16x16 intra predictions against the source block in the
;   transform domain and returns the cheapest cost in retrd.
;   Register roles after LOAD_7_PARA, as used by the code below:
;     r0/r1 : block whose top row and left column are read / its stride
;     r2/r3 : source block / its stride
;     arg5  : int32_t* receiving the mode (2 = DC, 1 = H, 0 = V); reloaded into r4 near
;             the end because r4 doubles as a loop offset
;     r5d   : doubled and added to the H and DC costs (not to V)
;     r6    : 64-byte scratch buffer, written with movdqa (must be 16-byte aligned):
;             [r6..r6+31] transformed top row, [r6+32..r6+63] transformed left column
;   NOTE(review): parameter names/meaning beyond the above are not visible here.
;   Uses mm4, hence WELSEMMS before returning.
WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
	%assign  push_num 0
	LOAD_7_PARA
	PUSH_XMM 8
	SIGN_EXTENSION r1, r1d
	SIGN_EXTENSION r3, r3d
	SIGN_EXTENSION r5, r5d

%ifndef X86_32
	; keep the original source pointer; r2 is advanced by the 8x4 macro
	push r12
	mov  r12, r2
%endif

	; xmm4 accumulates the boundary pixel sum ("for dc") during the four Get8WSumSub calls
	pxor        xmm4,   xmm4
	movdqa      xmm5,   [HSumSubDB1]
	movdqa      xmm6,   [HSumSubDW1]
	movdqa      xmm7,   [PDW1]
	; top row: 16 pixels just above the block, split into two 8-pixel halves
	sub         r0,    r1
	movdqu		xmm0,   [r0]
	movhlps		xmm1,   xmm0
	punpcklqdq  xmm0,   xmm0
	punpcklqdq  xmm1,   xmm1
	SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
	SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
	movdqa      [r6],  xmm0 ;V
	movdqa      [r6+16], xmm1
	; left column: gather the 16 pixels at x = -1, two rows per step
	add         r0,    r1
	pinsrb      xmm0,   byte[r0-1], 0
	pinsrb      xmm0,   byte[r0+r1-1], 1
	lea         r0,    [r0+2*r1]
	pinsrb      xmm0,   byte[r0-1],     2
	pinsrb      xmm0,   byte[r0+r1-1], 3
	lea         r0,    [r0+2*r1]
	pinsrb      xmm0,   byte[r0-1],     4
	pinsrb      xmm0,   byte[r0+r1-1], 5
	lea         r0,    [r0+2*r1]
	pinsrb      xmm0,   byte[r0-1],     6
	pinsrb      xmm0,   byte[r0+r1-1], 7
	lea         r0,    [r0+2*r1]
	pinsrb      xmm0,   byte[r0-1],     8
	pinsrb      xmm0,   byte[r0+r1-1], 9
	lea         r0,    [r0+2*r1]
	pinsrb      xmm0,   byte[r0-1],     10
	pinsrb      xmm0,   byte[r0+r1-1], 11
	lea         r0,    [r0+2*r1]
	pinsrb      xmm0,   byte[r0-1],     12
	pinsrb      xmm0,   byte[r0+r1-1], 13
	lea         r0,    [r0+2*r1]
	pinsrb      xmm0,   byte[r0-1],     14
	pinsrb      xmm0,   byte[r0+r1-1], 15
	movhlps		xmm1,   xmm0
	punpcklqdq  xmm0,   xmm0
	punpcklqdq  xmm1,   xmm1
	SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
	SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
	movdqa      [r6+32], xmm0 ;H
	movdqa      [r6+48], xmm1
	movd        r0d,    xmm4 ;dc
	add         r0d,    16   ;(sum+16)
	shr         r0d,    5    ;((sum+16)>>5)
	shl         r0d,    4    ;
	movd        mm4,    r0d  ; mm4 copy DC
	; from here on xmm4/xmm5/xmm6 are the per-word cost accumulators
	pxor        xmm4,   xmm4 ;V
	pxor        xmm5,   xmm5 ;H
	pxor        xmm6,   xmm6 ;DC
%ifdef UNIX64
	push r4
%endif
	; r0 = 8x4 block index within the current 8-column half (0..3),
	; r4 = byte offset into the V table: 0 for the left half, 16 for the right half
	mov         r0,    0
	mov         r4,    0

.loop16x16_get_satd:
.loopStart1:
	SSE41_I16x16GetX38x4Satd r0, r4
	inc          r0
	cmp         r0, 4
	jl          .loopStart1
	cmp         r4, 16
	je          .loop16x16_get_satd_end
	; second pass: restart from the original source pointer, 8 columns to the right
%ifdef X86_32
	mov r2, arg3
%else
	mov r2, r12
%endif
	add         r2, 8
	mov         r0, 0
	add         r4, 16
	jmp         .loop16x16_get_satd
 .loop16x16_get_satd_end:
	; halve the per-word sums, then reduce each accumulator to a single dword
	MMX_DW_1_2REG    xmm0, xmm1
	psrlw       xmm4, 1 ;/2
	psrlw       xmm5, 1 ;/2
	psrlw       xmm6, 1 ;/2
	SSE41_HSum8W     xmm4, xmm0, xmm1
	SSE41_HSum8W     xmm5, xmm0, xmm1
	SSE41_HSum8W     xmm6, xmm0, xmm1

%ifdef UNIX64
	pop r4
%endif
	; comparing order: DC H V
	movd      r3d, xmm6 ;DC
	movd      r1d, xmm5 ;H
	movd      r0d, xmm4 ;V
%ifndef X86_32
	pop r12
%endif
	; add 2*r5d to the H and DC costs
	shl       r5d, 1
	add       r1d, r5d
	add       r3d, r5d
	mov       r4, arg5
	; DC wins only if strictly cheaper than both H and V; H only if strictly cheaper than V
	cmp       r3d, r1d
	jge near   not_dc_16x16
	cmp        r3d, r0d
	jge near   not_dc_h_16x16

	; for DC mode
	mov       dword[r4], 2;I16_PRED_DC
	mov       retrd, r3d
	jmp near return_satd_intra_16x16_x3
not_dc_16x16:
	; for H mode
	cmp       r1d, r0d
	jge near   not_dc_h_16x16
	mov       dword[r4], 1;I16_PRED_H
	mov       retrd, r1d
	jmp near return_satd_intra_16x16_x3
not_dc_h_16x16:
	; for V mode
	mov       dword[r4], 0;I16_PRED_V
	mov       retrd, r0d
return_satd_intra_16x16_x3:
	WELSEMMS
	POP_XMM
	LOAD_7_PARA_POP
ret

; SSE41_ChromaGetX38x8Satd
;   Three-mode (V/H/DC) cost accumulation for one 8x8 chroma plane.
;   In:  r0/r1 = block whose top row and left column are read / its stride,
;        r2/r3 = source block / its stride,
;        r6    = 64-byte, 16-byte aligned scratch buffer.
;   Out: xmm4 / xmm5 / xmm6 = per-word V / H / DC cost accumulators for this plane
;        (they are cleared inside the macro, so they do not carry over between calls).
;   Buffer layout written here: [r6] transformed top row, [r6+16] transformed left
;   column, [r6+32] and [r6+48] DC terms for the upper and lower 8x4 halves.
;   Clobbers r0, r2 (advanced by eight rows), xmm0-xmm7.
%macro SSE41_ChromaGetX38x8Satd 0
	movdqa      xmm5,   [HSumSubDB1]
	movdqa      xmm6,   [HSumSubDW1]
	movdqa      xmm7,   [PDW1]
	; top row (8 pixels above the block); its dword sums are returned in xmm4
	sub         r0,    r1
	movq		xmm0,   [r0]
	punpcklqdq  xmm0,   xmm0
	SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
	movdqa      [r6],  xmm0 ;V
	; left column (8 pixels at x = -1); its dword sums are returned in xmm1
	add         r0,    r1
	pinsrb      xmm0,   byte[r0-1], 0
	pinsrb      xmm0,   byte[r0+r1-1], 1
	lea         r0,    [r0+2*r1]
	pinsrb      xmm0,   byte[r0-1],     2
	pinsrb      xmm0,   byte[r0+r1-1], 3
	lea         r0,    [r0+2*r1]
	pinsrb      xmm0,   byte[r0-1],     4
	pinsrb      xmm0,   byte[r0+r1-1], 5
	lea         r0,    [r0+2*r1]
	pinsrb      xmm0,   byte[r0-1],     6
	pinsrb      xmm0,   byte[r0+r1-1], 7
	punpcklqdq  xmm0,   xmm0
	SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
	movdqa      [r6+16], xmm0 ;H
;(sum+2)>>2
	movdqa      xmm6,   [PDQ2]
	movdqa      xmm5,   xmm4
	punpckhqdq  xmm5,   xmm1
	paddd       xmm5,   xmm6
	psrld       xmm5,   2
;(sum1+sum2+4)>>3
	paddd       xmm6,   xmm6
	paddd       xmm4,   xmm1
	paddd       xmm4,   xmm6
	psrld       xmm4,   3
;satd *16
	pslld       xmm5,   4
	pslld       xmm4,   4
;temp satd
	movdqa      xmm6,   xmm4
	punpcklqdq  xmm4,   xmm5
	; keep only the low dword of each qword
	psllq       xmm4,   32
	psrlq       xmm4,   32
	movdqa      [r6+32], xmm4
	punpckhqdq  xmm5,   xmm6
	psllq       xmm5,   32
	psrlq       xmm5,   32
	movdqa      [r6+48], xmm5

	pxor        xmm4,   xmm4 ;V
	pxor        xmm5,   xmm5 ;H
	pxor        xmm6,   xmm6 ;DC
	; two 8x4 steps; the Satd macro leaves r0 shifted left by 4, so after the first
	; step r0 is 0<<4 = 0 and the inc makes it 1 for the second step
	mov         r0,    0
	SSE41_ChromaGetX38x4Satd r0, 0
	inc             r0
	SSE41_ChromaGetX38x4Satd r0, 0
%endmacro

; SSEReg2MMX src, mmLo, mmHi
;   Spills a 128-bit register into two MMX registers: mmLo = low qword of src,
;   mmHi = high qword of src. src is modified (its high qword is copied onto its low
;   qword by movhlps).
%macro SSEReg2MMX 3
	movdq2q     %2, %1
	movhlps     %1, %1
	movdq2q     %3, %1
%endmacro
; MMXReg2SSE dst, tmp, mmLo, mmHi
;   Rebuilds a 128-bit value from two MMX registers: dst = mmHi:mmLo.
;   tmp is clobbered.
%macro MMXReg2SSE 4
	movq2dq     %2, %4          ; tmp low qword = mmHi
	movq2dq     %1, %3          ; dst low qword = mmLo
	punpcklqdq  %1, %2          ; dst high qword = mmHi
%endmacro
;The two macros above are used to reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41

; WelsIntraChroma8x8Combined3Satd_sse41
;   Evaluates the DC, H and V 8x8 chroma intra predictions over two planes and returns
;   the cheapest combined cost in retrd.
;   Register roles after LOAD_7_PARA, as used by the code below:
;     r0/r1 : first-plane block whose neighbours are read / stride
;     r2/r3 : first-plane source block / stride
;     r4    : int32_t* receiving the mode (0 = DC, 1 = H, 2 = V)
;     r5d   : doubled and added to the H and V costs (not to DC)
;     r6    : 64-byte, 16-byte aligned scratch buffer
;     arg8/arg9 : second-plane counterparts of r0 and r2 (same strides are reused)
;   The first plane's accumulators are parked in MMX registers while the second plane
;   is processed, hence WELSEMMS before returning.
;   NOTE(review): the label loop_chroma_satdx3 is never jumped to within this function.
WELS_EXTERN WelsIntraChroma8x8Combined3Satd_sse41
	%assign  push_num 0
	LOAD_7_PARA
	PUSH_XMM 8
	SIGN_EXTENSION r1, r1d
	SIGN_EXTENSION r3, r3d
	SIGN_EXTENSION r5, r5d
loop_chroma_satdx3:
	; first plane
	SSE41_ChromaGetX38x8Satd
	; park V / H / DC accumulators of the first plane in mm0..mm6
	SSEReg2MMX  xmm4, mm0,mm1
	SSEReg2MMX  xmm5, mm2,mm3
	SSEReg2MMX  xmm6, mm5,mm6
	mov r0,     arg8
	mov r2,     arg9

	; second plane
	SSE41_ChromaGetX38x8Satd

	MMXReg2SSE  xmm0, xmm3, mm0, mm1
	MMXReg2SSE  xmm1, xmm3, mm2, mm3
	MMXReg2SSE  xmm2, xmm3, mm5, mm6

	; combine both planes
	paddw       xmm4, xmm0
	paddw       xmm5, xmm1
	paddw       xmm6, xmm2

	; halve the per-word sums, then reduce each accumulator to a single dword
	MMX_DW_1_2REG    xmm0, xmm1
	psrlw       xmm4, 1 ;/2
	psrlw       xmm5, 1 ;/2
	psrlw       xmm6, 1 ;/2
	SSE41_HSum8W     xmm4, xmm0, xmm1
	SSE41_HSum8W     xmm5, xmm0, xmm1
	SSE41_HSum8W     xmm6, xmm0, xmm1
	; comparing order: DC H V
	movd      r3d, xmm6 ;DC
	movd      r1d, xmm5 ;H
	movd      r0d, xmm4 ;V


	; add 2*r5d to the H and V costs
	shl       r5d, 1
	add       r1d, r5d
	add       r0d, r5d
	; DC wins only if strictly cheaper than both H and V; H only if strictly cheaper than V
	cmp       r3d, r1d
	jge near   not_dc_8x8
	cmp        r3d, r0d
	jge near   not_dc_h_8x8

	; for DC mode
	mov       dword[r4], 0;I8_PRED_DC
	mov       retrd, r3d
	jmp near return_satd_intra_8x8_x3
not_dc_8x8:
	; for H mode
	cmp       r1d, r0d
	jge near   not_dc_h_8x8
	mov       dword[r4], 1;I8_PRED_H
	mov       retrd, r1d
	jmp near return_satd_intra_8x8_x3
not_dc_h_8x8:
	; for V mode
	mov       dword[r4], 2;I8_PRED_V
	mov       retrd, r0d
return_satd_intra_8x8_x3:
	WELSEMMS
	POP_XMM
	LOAD_7_PARA_POP
ret


;***********************************************************************
;
;Pixel_satd_intra_sse2 END
;
;***********************************************************************
; SSSE3_Get16BSadHVDC predRow, srcRow
;   One 16-pixel row of the three-mode SAD evaluation. Both operands are memory
;   operands accessed with aligned instructions, so both must be 16-byte aligned.
;   In:  predRow = buffer row whose first byte is the left-neighbour pixel,
;        srcRow  = 16 source pixels,
;        xmm1 = zero (pshufb mask that replicates byte 0),
;        xmm7 = DC value in every byte, xmm5 = top row (V prediction).
;   Out: predRow is overwritten with the left pixel replicated 16 times (H prediction);
;        xmm4 += SAD(src, DC), xmm2 += SAD(src, V), xmm3 += SAD(src, H).
;   Clobbers xmm0 and xmm6.
%macro SSSE3_Get16BSadHVDC 2
  movd        xmm6,%1
  pshufb      xmm6,xmm1
  movdqa      %1,  xmm6
  movdqa      xmm0,%2
  psadbw      xmm0,xmm7
  paddw       xmm4,xmm0
  movdqa      xmm0,%2
  psadbw      xmm0,xmm5
  paddw       xmm2,xmm0
  psadbw      xmm6,%2
  paddw       xmm3,xmm6
%endmacro
; WelsAddDCValue pixel, tmp, slot, sum
;   Reads one byte from the memory operand pixel (zero-extended into tmp), adds it to
;   the running total sum, and stores the zero-extended value into slot.
%macro WelsAddDCValue 4
  movzx   %2, byte %1       ; tmp = pixel
  add     %4, %2            ; sum += pixel
  mov     %3, %2            ; slot = pixel
%endmacro

;***********************************************************************
;
;Pixel_sad_intra_ssse3 BEGIN
;
;***********************************************************************
; WelsIntra16x16Combined3Sad_ssse3
;   Evaluates the V, H and DC 16x16 intra predictions with SAD, returns the cheapest
;   cost in retrd and leaves the winning prediction in the 256-byte buffer at r6.
;   Register roles after LOAD_7_PARA, as used by the code below:
;     r0/r1 : block whose top row and left column are read / stride; the top row is
;             loaded with movdqa, so r0 - r1 must be 16-byte aligned
;     r2/r3 : source block / stride (rows are read with aligned accesses)
;     r4    : int32_t* receiving the mode (2 = DC, 1 = H, 0 = V)
;     r5d   : doubled and added to the H and DC costs (not to V)
;     r6    : 16 rows x 16 bytes prediction buffer, 16-byte aligned
;   r3, r4 and r5 are saved on the stack at the start because they are reused as
;   scratch while the left column is gathered.
WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
	%assign  push_num 0
	LOAD_7_PARA
	PUSH_XMM 8
	SIGN_EXTENSION r1, r1d
	SIGN_EXTENSION r3, r3d
	SIGN_EXTENSION r5, r5d

	push  r5
	push  r4
	push  r3

	; xmm5 = top row; r5d = sum of its 16 pixels (psadbw against zero, two halves added)
	sub    r0,    r1
	movdqa      xmm5,[r0]
	pxor        xmm0,xmm0
	psadbw      xmm0,xmm5
	movhlps     xmm1,xmm0
	paddw       xmm0,xmm1
	movd        r5d, xmm0

	; left column: store each left pixel at the start of its buffer row and add it to r5d
	add         r0,r1
	lea         r3,[r1+2*r1]    ;ebx r3
	WelsAddDCValue [r0-1     ], r4d, [r6   ], r5d    ; esi r4d, eax r5d
	WelsAddDCValue [r0-1+r1  ], r4d, [r6+16], r5d
	WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
	WelsAddDCValue [r0-1+r3  ], r4d, [r6+48], r5d
	lea         r0, [r0+4*r1]
	add         r6, 64
	WelsAddDCValue [r0-1     ], r4d, [r6   ], r5d
	WelsAddDCValue [r0-1+r1  ], r4d, [r6+16], r5d
	WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
	WelsAddDCValue [r0-1+r3  ], r4d, [r6+48], r5d
	lea         r0, [r0+4*r1]
	add         r6, 64
	WelsAddDCValue [r0-1     ], r4d, [r6   ], r5d
	WelsAddDCValue [r0-1+r1  ], r4d, [r6+16], r5d
	WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
	WelsAddDCValue [r0-1+r3  ], r4d, [r6+48], r5d
	lea         r0, [r0+4*r1]
	add         r6, 64
	WelsAddDCValue [r0-1     ], r4d, [r6   ], r5d
	WelsAddDCValue [r0-1+r1  ], r4d, [r6+16], r5d
	WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
	WelsAddDCValue [r0-1+r3  ], r4d, [r6+48], r5d
	; rewind r6 to the start of the buffer
	sub         r6, 192
	; DC = (top sum + left sum + 16) >> 5, replicated into every byte of xmm7
	add         r5d,10h
	shr         r5d,5
	movd        xmm7,r5d
	pxor        xmm1,xmm1
	pshufb      xmm7,xmm1
	; SAD accumulators: xmm4 = DC, xmm3 = H, xmm2 = V
	pxor        xmm4,xmm4
	pxor        xmm3,xmm3
	pxor        xmm2,xmm2
	;sad begin
	pop   r3
	lea         r4, [r3+2*r3] ;esi r4
	SSSE3_Get16BSadHVDC [r6], [r2]
	SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
	SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
	SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
	add         r6, 64
	lea         r2, [r2+4*r3]
	SSSE3_Get16BSadHVDC [r6], [r2]
	SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
	SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
	SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
	add         r6, 64
	lea         r2, [r2+4*r3]
	SSSE3_Get16BSadHVDC [r6], [r2]
	SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
	SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
	SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
	add         r6, 64
	lea         r2, [r2+4*r3]
	SSSE3_Get16BSadHVDC [r6], [r2]
	SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
	SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
	SSSE3_Get16BSadHVDC [r6+48], [r2+r4]

	pop r4
	pop r5
	; pack H (shifted up one dword) next to V, then add the two 64-bit halves of each
	pslldq      xmm3,4
	por         xmm3,xmm2
	movhlps     xmm1,xmm3
	paddw       xmm3,xmm1
	movhlps     xmm0,xmm4
	paddw       xmm4,xmm0
	; comparing order: DC H V
	movd        r1d, xmm4 ;DC   ;ebx r1d
	movd        r0d, xmm3 ;V    ;ecx r0d
	psrldq      xmm3, 4
	movd        r2d, xmm3 ;H    ;esi r2d

	;mov         eax, [esp+36] ;lamda ;eax r5
	shl         r5d, 1
	add         r2d, r5d
	add         r1d, r5d
	;mov         edx, [esp+32]  ;edx r4
	; DC wins only if strictly cheaper than both H and V; H only if strictly cheaper than V
	cmp         r1d, r2d
	jge near   not_dc_16x16_sad
	cmp        r1d, r0d
	jge near   not_dc_h_16x16_sad
	; for DC mode
	mov       dword[r4], 2;I16_PRED_DC
	mov       retrd, r1d
	; r6 was advanced by 192 during the SAD pass; rewind and fill all 16 rows with DC
	sub        r6, 192
%assign x 0
%rep 16
	movdqa    [r6+16*x], xmm7
%assign x x+1
%endrep
	jmp near return_sad_intra_16x16_x3
not_dc_16x16_sad:
	; for H mode
	; the buffer already holds the H prediction written by SSSE3_Get16BSadHVDC
	cmp       r2d, r0d
	jge near   not_dc_h_16x16_sad
	mov       dword[r4], 1;I16_PRED_H
	mov       retrd, r2d
	jmp near return_sad_intra_16x16_x3
not_dc_h_16x16_sad:
	; for V mode
	mov       dword[r4], 0;I16_PRED_V
	mov       retrd, r0d
	; rewind r6 and fill all 16 rows with the top row
	sub       r6, 192
%assign x 0
%rep 16
	movdqa    [r6+16*x], xmm5
%assign x x+1
%endrep
return_sad_intra_16x16_x3:
	POP_XMM
	LOAD_7_PARA_POP
	ret

;***********************************************************************
;
;Pixel_sad_intra_ssse3 END
;
;***********************************************************************
;***********************************************************************
;
;Pixel_satd_wxh_sse41 BEGIN
;
;***********************************************************************

;SSE4.1
; SSE41_GetSatd8x4
;   Adds the cost of an 8-column x 4-row block pair to xmm6 (per-word accumulator).
;   In:  r0/r1 = first block / stride, r2/r3 = second block / stride,
;        r4 = 3*r1 and r5 = 3*r3 (row-3 offsets, prepared by the caller),
;        xmm7 = HSumSubDB1.
;   Each 8-byte row is duplicated into both halves so that pmaddubsw yields the pair
;   sums in the low half and the pair differences in the high half; rows of the second
;   block are subtracted after that step. Pointers are not advanced.
;   Clobbers xmm0-xmm5.
;   NOTE(review): the pblendw/shift/pmaxuw tail presumably relies on
;   |a+b| + |a-b| = 2*max(|a|,|b|) to fold the last butterfly stage and the final
;   halving into a max -- callers of this macro do not shift the sum; confirm.
%macro SSE41_GetSatd8x4 0
	; rows 0 and 1 of both blocks
	movq             xmm0, [r0]
	punpcklqdq       xmm0, xmm0
	pmaddubsw        xmm0, xmm7
	movq             xmm1, [r0+r1]
	punpcklqdq       xmm1, xmm1
	pmaddubsw        xmm1, xmm7
	movq             xmm2, [r2]
	punpcklqdq       xmm2, xmm2
	pmaddubsw        xmm2, xmm7
	movq             xmm3, [r2+r3]
	punpcklqdq       xmm3, xmm3
	pmaddubsw        xmm3, xmm7
	psubsw           xmm0, xmm2
	psubsw           xmm1, xmm3
	; rows 2 and 3 of both blocks
	movq             xmm2, [r0+2*r1]
	punpcklqdq       xmm2, xmm2
	pmaddubsw        xmm2, xmm7
	movq             xmm3, [r0+r4]
	punpcklqdq       xmm3, xmm3
	pmaddubsw        xmm3, xmm7
	movq             xmm4, [r2+2*r3]
	punpcklqdq       xmm4, xmm4
	pmaddubsw        xmm4, xmm7
	movq             xmm5, [r2+r5]
	punpcklqdq       xmm5, xmm5
	pmaddubsw        xmm5, xmm7
	psubsw           xmm2, xmm4
	psubsw           xmm3, xmm5
	SSE2_HDMTwo4x4   xmm0, xmm1, xmm2, xmm3, xmm4
	pabsw            xmm0, xmm0
	pabsw            xmm2, xmm2
	pabsw            xmm1, xmm1
	pabsw            xmm3, xmm3
	; pair up neighbouring words of xmm1/xmm3 and keep the larger of each pair
	movdqa           xmm4, xmm3
	pblendw          xmm3, xmm1, 0xAA
	pslld            xmm1, 16
	psrld            xmm4, 16
	por              xmm1, xmm4
	pmaxuw           xmm1, xmm3
	paddw            xmm6, xmm1
	; same for xmm0/xmm2
	movdqa           xmm4, xmm0
	pblendw          xmm0, xmm2, 0xAA
	pslld            xmm2, 16
	psrld            xmm4, 16
	por              xmm2, xmm4
	pmaxuw           xmm0, xmm2
	paddw            xmm6, xmm0
%endmacro

; SSSE3_SumWHorizon dst32, src, tmp1, tmp2
;   Horizontal sum of the eight words of src, returned in the 32-bit general register
;   dst32. tmp1 is loaded with words of 1 so that pmaddwd adds adjacent word pairs
;   into dwords (the words are treated as signed); the four dwords are then folded.
;   src, tmp1 and tmp2 are clobbered.
%macro SSSE3_SumWHorizon 4 ;eax, srcSSE, tempSSE, tempSSE
	MMX_DW_1_2REG    %3, %4
	pmaddwd     %2, %3
	movhlps     %4, %2
	paddd       %2, %4
	pshuflw     %4, %2,0Eh
	paddd       %2, %4
	movd		%1, %2
%endmacro
;***********************************************************************
;
;int32_t WelsSampleSatd4x4_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
;
;***********************************************************************
; WelsSampleSatd4x4_sse41
;   4x4 cost of two blocks; result in retrd.
;   After LOAD_4_PARA: r0/r1 = first block / stride, r2/r3 = second block / stride.
;   Each register holds two 4-byte rows, each duplicated (shufps ...,0), so that
;   pmaddubsw with HSwapSumSubDB1 produces pair sums and pair differences per row.
;   NOTE(review): as in SSE41_GetSatd8x4, the blend/shift/pmax tail presumably replaces
;   the last butterfly stage plus the final halving by a max of absolute values
;   -- no shift follows the horizontal sum; confirm.
WELS_EXTERN WelsSampleSatd4x4_sse41
	%assign  push_num 0
	LOAD_4_PARA
	PUSH_XMM 8
	SIGN_EXTENSION r1, r1d
	SIGN_EXTENSION r3, r3d
	movdqa      xmm4,[HSwapSumSubDB1]
	; second block: xmm2 = rows 0,1 ; xmm3 = rows 2,3
	movd        xmm2,[r2]
	movd        xmm5,[r2+r3]
	shufps      xmm2,xmm5,0
	movd        xmm3,[r2+r3*2]
	lea         r2, [r3*2+r2]
	movd        xmm5,[r2+r3]
	shufps      xmm3,xmm5,0
	; first block: xmm0 = rows 0,1 ; xmm1 = rows 2,3
	movd        xmm0,[r0]
	movd        xmm5,[r0+r1]
	shufps      xmm0,xmm5,0
	movd        xmm1,[r0+r1*2]
	lea         r0, [r1*2+r0]
	movd        xmm5,[r0+r1]
	shufps      xmm1,xmm5,0
	; byte pairs -> sum/difference words, then subtract the second block
	pmaddubsw   xmm0,xmm4
	pmaddubsw   xmm1,xmm4
	pmaddubsw   xmm2,xmm4
	pmaddubsw   xmm3,xmm4
	psubw       xmm0,xmm2
	psubw       xmm1,xmm3
	; sum/difference across row pairs
	movdqa      xmm2,xmm0
	paddw       xmm0,xmm1
	psubw       xmm1,xmm2
	; regroup qwords, then another sum/difference stage
	movdqa      xmm2,xmm0
	punpcklqdq  xmm0,xmm1
	punpckhqdq  xmm2,xmm1
	movdqa      xmm1,xmm0
	paddw       xmm0,xmm2
	psubw       xmm2,xmm1
	; pair up neighbouring words and keep the larger absolute value of each pair
	movdqa      xmm1,xmm0
	pblendw     xmm0,xmm2,0AAh
	pslld       xmm2,16
	psrld       xmm1,16
	por         xmm2,xmm1
	pabsw       xmm0,xmm0
	pabsw       xmm2,xmm2
	pmaxsw      xmm0,xmm2
	SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7
	POP_XMM
	LOAD_4_PARA_POP
	ret

;***********************************************************************
;
;int32_t WelsSampleSatd8x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
;
;***********************************************************************
; 8x8 cost: two SSE41_GetSatd8x4 strips (rows 0-3 and rows 4-7) accumulated in
; xmm6 and reduced into retrd.
; r0/r1 = first block pointer/stride, r2/r3 = second block pointer/stride.
WELS_EXTERN WelsSampleSatd8x8_sse41
%ifdef X86_32
	; r4 and r5 are saved here and restored before ret on 32-bit builds.
	push  r4
	push  r5
%endif
	%assign  push_num 2
	LOAD_4_PARA
	PUSH_XMM 8
	SIGN_EXTENSION r1, r1d
	SIGN_EXTENSION r3, r3d
	movdqa      xmm7, [HSumSubDB1]	; coefficient table read by SSE41_GetSatd8x4
	lea         r4,  [r1+r1*2]		; r4 = 3 * first stride
	lea         r5,  [r3+r3*2]		; r5 = 3 * second stride
	pxor		xmm6, xmm6			; clear the accumulator
	SSE41_GetSatd8x4
	lea			r0,	 [r0+4*r1]		; advance both blocks by four rows
	lea			r2,  [r2+4*r3]
	SSE41_GetSatd8x4
	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
	POP_XMM
	LOAD_4_PARA_POP
%ifdef X86_32
	pop  r5
	pop  r4
%endif
	ret

;***********************************************************************
;
;int32_t WelsSampleSatd8x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
;
;***********************************************************************
; 8x16 cost: four SSE41_GetSatd8x4 strips (4 rows each) accumulated in xmm6
; and reduced into retrd.
; r0/r1 = first block pointer/stride, r2/r3 = second block pointer/stride.
; r6 is the strip counter; the loop label is function-local so it neither
; leaks a file-scope symbol nor disturbs NASM local-label scoping.
WELS_EXTERN WelsSampleSatd8x16_sse41
%ifdef X86_32
	; r4, r5 and r6 are saved here and restored before ret on 32-bit builds.
	push  r4
	push  r5
	push  r6
%endif
	%assign  push_num 3
	LOAD_4_PARA
	PUSH_XMM 8
	SIGN_EXTENSION r1, r1d
	SIGN_EXTENSION r3, r3d
	movdqa      xmm7, [HSumSubDB1]	; coefficient table read by SSE41_GetSatd8x4
	lea         r4,  [r1+r1*2]		; r4 = 3 * first stride
	lea         r5,  [r3+r3*2]		; r5 = 3 * second stride
	pxor        xmm6, xmm6			; clear the accumulator
	mov         r6,    4			; four strips of four rows remain
.loop_get_satd_8x16:
	SSE41_GetSatd8x4
	lea			r0,  [r0+4*r1]		; advance both blocks by four rows
	lea			r2,  [r2+4*r3]
	dec         r6
	jnz         .loop_get_satd_8x16
	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
	POP_XMM
	LOAD_4_PARA_POP
%ifdef X86_32
	pop  r6
	pop  r5
	pop  r4
%endif
	ret

;***********************************************************************
;
;int32_t WelsSampleSatd16x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
;
;***********************************************************************
; 16x8 cost: the left 8-pixel column (two 8x4 strips) followed by the right
; 8-pixel column (two 8x4 strips), all accumulated in xmm6 and reduced into
; retrd.
; r0/r1 = first block pointer/stride, r2/r3 = second block pointer/stride.
WELS_EXTERN WelsSampleSatd16x8_sse41
%ifdef X86_32
	; r4 and r5 are saved here and restored before ret on 32-bit builds.
	push  r4
	push  r5
%endif
	%assign  push_num 2
	LOAD_4_PARA
	PUSH_XMM 8
	SIGN_EXTENSION r1, r1d
	SIGN_EXTENSION r3, r3d
	; Keep the block start pointers; the left-column pass advances r0/r2.
	push  r0
	push  r2

	movdqa      xmm7, [HSumSubDB1]	; coefficient table read by SSE41_GetSatd8x4
	lea         r4,  [r1+r1*2]		; r4 = 3 * first stride
	lea         r5,  [r3+r3*2]		; r5 = 3 * second stride
	pxor		xmm6,   xmm6		; clear the accumulator
	; left column, rows 0-3 and 4-7
	SSE41_GetSatd8x4
	lea			r0,  [r0+4*r1]
	lea			r2,  [r2+4*r3]
	SSE41_GetSatd8x4

	; Back to the top row, 8 bytes to the right.
	pop  r2
	pop  r0
	add			r0,    8
	add			r2,    8
	; right column, rows 0-3 and 4-7
	SSE41_GetSatd8x4
	lea			r0,  [r0+4*r1]
	lea			r2,  [r2+4*r3]
	SSE41_GetSatd8x4
	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
	POP_XMM
	LOAD_4_PARA_POP
%ifdef X86_32
	pop  r5
	pop  r4
%endif
	ret

;***********************************************************************
;
;int32_t WelsSampleSatd16x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
;
;***********************************************************************

; 16x16 cost: the left 8-pixel column (four 8x4 strips) followed by the right
; 8-pixel column (four 8x4 strips), all accumulated in xmm6 and reduced into
; retrd.
; r0/r1 = first block pointer/stride, r2/r3 = second block pointer/stride.
; r6 is the strip counter; loop labels are function-local so they neither
; leak file-scope symbols nor disturb NASM local-label scoping.
WELS_EXTERN WelsSampleSatd16x16_sse41
%ifdef X86_32
	; r4, r5 and r6 are saved here and restored before ret on 32-bit builds.
	push  r4
	push  r5
	push  r6
%endif
	%assign  push_num 3
	LOAD_4_PARA
	PUSH_XMM 8
	SIGN_EXTENSION r1, r1d
	SIGN_EXTENSION r3, r3d

	; Keep the block start pointers; the left-column pass advances r0/r2.
	push  r0
	push  r2

	movdqa      xmm7, [HSumSubDB1]	; coefficient table read by SSE41_GetSatd8x4
	lea         r4,  [r1+r1*2]		; r4 = 3 * first stride
	lea         r5,  [r3+r3*2]		; r5 = 3 * second stride
	pxor		xmm6,   xmm6		; clear the accumulator
	mov         r6,    4			; four strips in the left column
.loop_get_satd_16x16_left:
	SSE41_GetSatd8x4
	lea			r0,  [r0+4*r1]		; advance both blocks by four rows
	lea			r2,  [r2+4*r3]
	dec         r6
	jnz         .loop_get_satd_16x16_left

	; Back to the top row, 8 bytes to the right.
	pop  r2
	pop  r0
	add			r0,    8
	add			r2,    8
	mov         r6,    4			; four strips in the right column
.loop_get_satd_16x16_right:
	SSE41_GetSatd8x4
	lea			r0,  [r0+4*r1]
	lea			r2,  [r2+4*r3]
	dec         r6
	jnz         .loop_get_satd_16x16_right
	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
	POP_XMM
	LOAD_4_PARA_POP
%ifdef X86_32
	pop  r6
	pop  r5
	pop  r4
%endif
	ret

;***********************************************************************
;
;Pixel_satd_wxh_sse41 END
;
;***********************************************************************

;***********************************************************************
;
;Pixel_sad_wxh_sse2 BEGIN
;
;***********************************************************************

; Advances r0 and r2 by two rows, then adds the SAD of the next two 16-byte
; rows to xmm0.
;   In : r0 / r1 = first pointer / stride, r2 / r3 = second pointer / stride
;   Out: xmm0 += psadbw results (one partial sum per qword); r0, r2 advanced
;   Clobbers xmm1, xmm2.
; The r2 side is read with movdqu; the r0 side goes through MOVDQ, a macro
; defined outside this chunk.
%macro SSE2_GetSad2x16 0
	lea    r0,    [r0+2*r1]
	lea    r2,    [r2+2*r3]
	movdqu xmm1,   [r2]
	MOVDQ  xmm2,   [r0];[r0] must be 16-byte aligned
	psadbw xmm1,   xmm2
	paddw  xmm0,   xmm1
	movdqu xmm1,   [r2+r3]
	MOVDQ  xmm2,   [r0+r1]
	psadbw xmm1,   xmm2
	paddw  xmm0,   xmm1
%endmacro


; Adds the SAD of four 16-byte rows to xmm7 without moving the pointers.
;   In : r0 / r1 = first pointer / stride,  r4 = 3*r1 (set by the caller)
;        r2 / r3 = second pointer / stride, r5 = 3*r3 (set by the caller)
;   Out: xmm7 += psadbw results (one partial sum per qword)
;   Clobbers xmm0, xmm1, xmm2.
; The r2 side is read with movdqu; the r0 side goes through MOVDQ, a macro
; defined outside this chunk.
%macro SSE2_GetSad4x16 0
	; row 0
	movdqu xmm0,   [r2]
	MOVDQ  xmm2,   [r0]
	psadbw xmm0,   xmm2
	paddw  xmm7,   xmm0
	; row 1
	movdqu xmm1,   [r2+r3]
	MOVDQ  xmm2,   [r0+r1]
	psadbw xmm1,   xmm2
	paddw  xmm7,   xmm1
	; row 2
	movdqu xmm1,   [r2+2*r3]
	MOVDQ  xmm2,   [r0+2*r1];[r0] must be 16-byte aligned
	psadbw xmm1,   xmm2
	paddw  xmm7,   xmm1
	; row 3
	movdqu xmm1,   [r2+r5]
	MOVDQ  xmm2,   [r0+r4]
	psadbw xmm1,   xmm2
	paddw  xmm7,   xmm1
%endmacro


; Adds the SAD of four 8-byte rows to xmm6.
;   In : r0 / r1 = first pointer / stride, r2 / r3 = second pointer / stride
;   Out: xmm6 += psadbw results (one partial sum per qword)
;        r0 and r2 are each advanced by TWO rows (not four); callers add the
;        remaining two rows themselves before the next invocation.
;   Clobbers xmm0-xmm3.
%macro SSE2_GetSad8x4 0
	; xmm0 = {row0, row2}, xmm1 = {row1, row3} of the first block
	movq   xmm0,   [r0]
	movq   xmm1,   [r0+r1]
	lea    r0,     [r0+2*r1]
	movhps xmm0,   [r0]
	movhps xmm1,   [r0+r1]

	; xmm2 = {row0, row2}, xmm3 = {row1, row3} of the second block
	movq   xmm2,   [r2]
	movq   xmm3,   [r2+r3]
	lea    r2,     [r2+2*r3]
	movhps xmm2,   [r2]
	movhps xmm3,   [r2+r3]
	psadbw xmm0,   xmm2
	psadbw xmm1,   xmm3
	paddw  xmm6,   xmm0
	paddw  xmm6,   xmm1
%endmacro

;***********************************************************************
;
;int32_t WelsSampleSad16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
;The first parameter can be aligned to 16 bytes.
;In Wels, the third parameter cannot be assumed to be aligned to 16 bytes.
;
;***********************************************************************
; 16x16 SAD: four SSE2_GetSad4x16 steps of four rows each, accumulated in
; xmm7; the two qword partial sums are then added and returned in retrd.
; r0/r1 = first pointer/stride, r2/r3 = second pointer/stride.
WELS_EXTERN WelsSampleSad16x16_sse2
%ifdef X86_32
	; r4 and r5 are saved here and restored before ret on 32-bit builds.
	push  r4
	push  r5
%endif

	%assign  push_num 2
	LOAD_4_PARA
	PUSH_XMM 8
	SIGN_EXTENSION r1, r1d
	SIGN_EXTENSION r3, r3d
	lea r4, [r1+2*r1]			; r4 = 3 * first stride
	lea r5, [r3+2*r3]			; r5 = 3 * second stride

	pxor   xmm7,   xmm7			; clear the accumulator
	; rows 0-11: three steps, each followed by a four-row advance
%rep 3
	SSE2_GetSad4x16
	lea	   r0,  [r0+4*r1]
	lea	   r2,  [r2+4*r3]
%endrep
	; rows 12-15
	SSE2_GetSad4x16
	; add the high-qword partial sum to the low one
	movhlps xmm0, xmm7
	paddw xmm0, xmm7
	movd retrd, xmm0
	POP_XMM
	LOAD_4_PARA_POP
%ifdef X86_32
	pop  r5
	pop  r4
%endif
	ret

;***********************************************************************
;
;int32_t WelsSampleSad16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
;The first parameter can be aligned to 16 bytes.
;In Wels, the third parameter cannot be assumed to be aligned to 16 bytes.
;
;***********************************************************************
; 16x8 SAD: rows 0-1 are handled inline, rows 2-7 by three SSE2_GetSad2x16
; steps; everything accumulates in xmm0 and is returned in retrd.
; r0/r1 = first pointer/stride, r2/r3 = second pointer/stride.
; Only xmm0-xmm2 are touched, so no PUSH_XMM/POP_XMM pair is used here.
WELS_EXTERN WelsSampleSad16x8_sse2
	%assign  push_num 0
	LOAD_4_PARA
	SIGN_EXTENSION r1, r1d
	SIGN_EXTENSION r3, r3d
	; row 0 seeds the accumulator
	movdqu xmm0,   [r2]
	MOVDQ  xmm2,   [r0]
	psadbw xmm0,   xmm2
	; row 1
	movdqu xmm1,   [r2+r3]
	MOVDQ  xmm2,   [r0+r1]
	psadbw xmm1,   xmm2
	paddw  xmm0,   xmm1

	; rows 2-7; each step advances r0/r2 by two rows before reading
%rep 3
	SSE2_GetSad2x16
%endrep

	; add the high-qword partial sum to the low one
	movhlps     xmm1, xmm0
	paddw       xmm0, xmm1
	movd        retrd,  xmm0
	LOAD_4_PARA_POP
	ret



; 8x16 SAD: four SSE2_GetSad8x4 steps accumulated in xmm6 and returned in
; retrd. r0/r1 = first pointer/stride, r2/r3 = second pointer/stride.
WELS_EXTERN WelsSampleSad8x16_sse2
	%assign  push_num 0
	LOAD_4_PARA
	PUSH_XMM 7
	SIGN_EXTENSION r1, r1d
	SIGN_EXTENSION r3, r3d
	pxor   xmm6,   xmm6			; clear the accumulator

	; rows 0-11: SSE2_GetSad8x4 itself moves r0/r2 down two rows, the lea
	; pair supplies the other two.
%rep 3
	SSE2_GetSad8x4
	lea    r0,    [r0+2*r1]
	lea    r2,    [r2+2*r3]
%endrep
	; rows 12-15
	SSE2_GetSad8x4

	; add the high-qword partial sum to the low one
	movhlps    xmm0, xmm6
	paddw      xmm0, xmm6
	movd       retrd,  xmm0
	POP_XMM
	LOAD_4_PARA_POP
	ret


; Sets the flags for a cacheline-split test and DESTROYS %1 (it is masked in
; place).
;   %1 = register holding the address
;   %2 = access width in bytes
;   %3 = cacheline size in bytes
; With %3 = 64 (the only use in this chunk) this is
;   and %1, 0x3f ; cmp %1, 64-%2
; so "greater" means an access of %2 bytes at that address would run past the
; end of its 64-byte line.
%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
and    %1,  0x1f|(%3>>1)
cmp    %1,  (32-%2)|(%3>>1)
%endmacro

; 8x8 SAD with a dedicated path for a second-block pointer (arg3) whose 8-byte
; row reads would straddle a 64-byte cacheline.
;   arg1/arg2 = first pointer/stride, arg3/arg4 = second pointer/stride
;   result in retrd
;
; Split path: every second-block row is rebuilt from two 8-byte-aligned loads
; (address rounded down to 8, and that address + 8) that are shifted and OR-ed
; together. Register roles on that path:
;   r2   = second pointer rounded down to a multiple of 8
;   r5   = r2 + 8
;   xmm5 = (pointer & 7) * 8      -> right-shift count in bits
;   xmm6 = (8 - (pointer & 7)) * 8 -> left-shift count in bits
;   xmm7 = SAD accumulator
; Non-split path: two SSE2_GetSad8x4 steps accumulated in xmm6.
; Each path ends with its own ret.
WELS_EXTERN WelsSampleSad8x8_sse21
	%assign  push_num 0
	mov		r2,  arg3
	push	r2						; CACHE_SPLIT_CHECK destroys r2
	CACHE_SPLIT_CHECK r2, 8, 64
	jle    near   .pixel_sad_8x8_nsplit
	pop		r2
%ifdef X86_32
	push	r3
	push	r4
	push	r5
%endif
	%assign  push_num 3
	PUSH_XMM 8
	mov		r0,  arg1
	mov		r1,  arg2
	SIGN_EXTENSION r1, r1d
	pxor   xmm7,   xmm7

	mov    r5,    r2
	and    r5,    0x07				; r5 = misalignment in bytes
	sub    r2,    r5				; r2 = aligned-down address
	mov    r4,    8
	sub    r4,    r5				; r4 = 8 - misalignment

	shl    r5,    3					; byte counts -> bit counts
	shl    r4,    3
	movd   xmm5,   r5d
	movd   xmm6,   r4d
	mov    r5,    8
	add    r5,    r2				; r5 = aligned-down address + 8
	mov    r3,    arg4
	SIGN_EXTENSION r3, r3d

	; rows 0-5: three row pairs, each followed by a two-row advance
%rep 3
	movq   xmm0,   [r0]
	movhps xmm0,   [r0+r1]

	movq   xmm1,   [r2]
	movq   xmm2,   [r5]
	movhps xmm1,   [r2+r3]
	movhps xmm2,   [r5+r3]
	psrlq  xmm1,   xmm5				; drop the bytes before the real start
	psllq  xmm2,   xmm6				; position the bytes from the next qword
	por    xmm1,   xmm2				; xmm1 = the two unaligned rows

	psadbw xmm0,   xmm1
	paddw  xmm7,   xmm0

	lea    r0,    [r0+2*r1]
	lea    r2,    [r2+2*r3]
	lea    r5,    [r5+2*r3]
%endrep

	; rows 6-7
	movq   xmm0,   [r0]
	movhps xmm0,   [r0+r1]

	movq   xmm1,   [r2]
	movq   xmm2,   [r5]
	movhps xmm1,   [r2+r3]
	movhps xmm2,   [r5+r3]
	psrlq  xmm1,   xmm5
	psllq  xmm2,   xmm6
	por    xmm1,   xmm2

	psadbw xmm0,   xmm1
	paddw  xmm7,   xmm0

	; add the high-qword partial sum to the low one
	movhlps    xmm0, xmm7
	paddw      xmm0, xmm7
	movd       retrd,  xmm0
	POP_XMM
%ifdef X86_32
	pop	 r5
	pop	 r4
	pop	 r3
%endif
	ret

.pixel_sad_8x8_nsplit:

	pop r2							; undo the push made before the check
	%assign  push_num 0
	LOAD_4_PARA
	PUSH_XMM 7
	SIGN_EXTENSION r1, r1d
	SIGN_EXTENSION r3, r3d
	pxor   xmm6,   xmm6
	SSE2_GetSad8x4
	lea    r0,    [r0+2*r1]
	lea    r2,    [r2+2*r3]
	SSE2_GetSad8x4
	movhlps    xmm0, xmm6
	paddw      xmm0, xmm6
	movd       retrd,  xmm0
	POP_XMM
	LOAD_4_PARA_POP
	ret


;***********************************************************************
;
;Pixel_sad_wxh_sse2 END
;
;***********************************************************************


;***********************************************************************
;
;Pixel_sad_4_wxh_sse2 BEGIN
;
;***********************************************************************


; One row step of the four-neighbour 16-wide SAD.
;   %1 = XMM with the row one line ABOVE the current one (s-1l), overwritten
;   %2 = XMM with the current row (s), preserved
;   %3 = XMM with the row one line BELOW the current one (s+1l), preserved
;   %4 = XMM with the 16 bytes at [%5] (d), overwritten
;   %5 = address expression of those 16 bytes
; Accumulators (fixed registers, labelled by the callers' comments):
;   xmm5 += SAD(%1, [%5])        xmm4 += SAD([%5], %3)
;   xmm6 += SAD([%5-1], %2)      xmm7 += SAD([%5+1], %2)
%macro SSE2_Get4LW16Sad 5 ;s-1l, s, s+1l, d, address
	psadbw %1,   %4
	paddw  xmm5, %1
	psadbw %4,   %3
	paddw  xmm4, %4
	movdqu %4,   [%5-1]
	psadbw %4,   %2
	paddw  xmm6, %4
	movdqu %4,   [%5+1]
	psadbw %4,   %2
	paddw  xmm7, %4
%endmacro
; Four 16x16 SADs of the block at r0 (stride r1) against the block at r2
; (stride r3) displaced by one row up, one row down, one byte left and one
; byte right. The four 32-bit sums are stored to [r4] in the order
;   { r2 - stride, r2 + stride, r2 - 1, r2 + 1 }.
; Rows at r0 are read with movdqa and the result is written with movdqa, so
; both r0 rows and r4 are accessed as 16-byte aligned.
; Rows of the r0 block rotate through xmm0/xmm1/xmm2 so that each is loaded
; once and reused as "below", "current" and "above" by SSE2_Get4LW16Sad.
WELS_EXTERN WelsSampleSadFour16x16_sse2
	%assign  push_num 0
	LOAD_5_PARA
	PUSH_XMM 8
	SIGN_EXTENSION r1, r1d
	SIGN_EXTENSION r3, r3d
	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
	pxor   xmm6,   xmm6    ;sad pRefMb-1
	pxor   xmm7,   xmm7    ;sad pRefMb+1

	; --- head: rows 0 and 1 of the r0 block -------------------------------
	movdqa xmm0,   [r0]
	sub    r2,    r3			; r2 -> one row above the r2 block
	movdqu xmm3,   [r2]
	psadbw xmm3,   xmm0
	paddw  xmm4,   xmm3

	movdqa xmm1,   [r0+r1]
	movdqu xmm3,   [r2+r3]
	psadbw xmm3,   xmm1
	paddw  xmm4,   xmm3

	movdqu xmm2,   [r2+r3-1]
	psadbw xmm2,   xmm0
	paddw  xmm6,   xmm2

	movdqu xmm3,   [r2+r3+1]
	psadbw xmm3,   xmm0
	paddw  xmm7,   xmm3

	; --- body: two passes of six row steps; the register rotation returns
	; --- to its starting assignment after every six steps ------------------
%rep 2
	lea    r0,    [r0+2*r1]
	lea    r2,    [r2+2*r3]
	movdqa xmm2,   [r0]
	movdqu xmm3,   [r2]
	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
	movdqa xmm0,   [r0+r1]
	movdqu xmm3,   [r2+r3]
	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
	lea    r0,    [r0+2*r1]
	lea    r2,    [r2+2*r3]
	movdqa xmm1,   [r0]
	movdqu xmm3,   [r2]
	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
	movdqa xmm2,   [r0+r1]
	movdqu xmm3,   [r2+r3]
	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
	lea    r0,    [r0+2*r1]
	lea    r2,    [r2+2*r3]
	movdqa xmm0,   [r0]
	movdqu xmm3,   [r2]
	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
	movdqa xmm1,   [r0+r1]
	movdqu xmm3,   [r2+r3]
	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
%endrep

	; --- two more row steps (last two rows of the r0 block are loaded) -----
	lea    r0,    [r0+2*r1]
	lea    r2,    [r2+2*r3]
	movdqa xmm2,   [r0]
	movdqu xmm3,   [r2]
	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
	movdqa xmm0,   [r0+r1]
	movdqu xmm3,   [r2+r3]
	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3

	; --- tail: terms the row steps have not covered yet --------------------
	lea    r2,    [r2+2*r3]
	movdqu xmm3,   [r2]
	psadbw xmm2,   xmm3
	paddw xmm5,   xmm2

	movdqu xmm2,   [r2-1]
	psadbw xmm2,   xmm0
	paddw xmm6,   xmm2

	movdqu xmm3,   [r2+1]
	psadbw xmm3,   xmm0
	paddw xmm7,   xmm3

	movdqu xmm3,   [r2+r3]
	psadbw xmm0,   xmm3
	paddw xmm5,   xmm0

	; --- fold the qword halves and pack the four sums ----------------------
	movhlps    xmm0, xmm4
	paddw      xmm4, xmm0
	movhlps    xmm0, xmm5
	paddw      xmm5, xmm0
	movhlps    xmm0, xmm6
	paddw      xmm6, xmm0
	movhlps    xmm0, xmm7
	paddw      xmm7, xmm0
	punpckldq  xmm4, xmm5
	punpckldq  xmm6, xmm7
	punpcklqdq xmm4, xmm6
	movdqa     [r4],xmm4
	POP_XMM
	LOAD_5_PARA_POP
	ret


; Four 16x8 SADs of the block at r0 (stride r1) against the block at r2
; (stride r3) displaced by one row up, one row down, one byte left and one
; byte right. The four 32-bit sums are stored to [r4] in the order
;   { r2 - stride, r2 + stride, r2 - 1, r2 + 1 }.
; Rows at r0 are read with movdqa and the result is written with movdqa, so
; both r0 rows and r4 are accessed as 16-byte aligned.
; Rows of the r0 block rotate through xmm0/xmm1/xmm2 so that each is loaded
; once and reused by successive SSE2_Get4LW16Sad steps.
WELS_EXTERN WelsSampleSadFour16x8_sse2
	%assign  push_num 0
	LOAD_5_PARA
	PUSH_XMM 8
	SIGN_EXTENSION r1, r1d
	SIGN_EXTENSION r3, r3d
	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
	pxor   xmm6,   xmm6    ;sad pRefMb-1
	pxor   xmm7,   xmm7    ;sad pRefMb+1
	; head: rows 0 and 1 of the r0 block
	movdqa xmm0,   [r0]
	sub    r2,    r3			; r2 -> one row above the r2 block
	movdqu xmm3,   [r2]
	psadbw xmm3,   xmm0
	paddw xmm4,   xmm3

	movdqa xmm1,   [r0+r1]
	movdqu xmm3,   [r2+r3]
	psadbw xmm3,   xmm1
	paddw xmm4,   xmm3

	movdqu xmm2,   [r2+r3-1]
	psadbw xmm2,   xmm0
	paddw xmm6,   xmm2

	movdqu xmm3,   [r2+r3+1]
	psadbw xmm3,   xmm0
	paddw xmm7,   xmm3

	; body: six row steps, two per pointer advance
	lea    r0,    [r0+2*r1]
	lea    r2,    [r2+2*r3]
	movdqa xmm2,   [r0]
	movdqu xmm3,   [r2]
	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
	movdqa xmm0,   [r0+r1]
	movdqu xmm3,   [r2+r3]
	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
	lea    r0,    [r0+2*r1]
	lea    r2,    [r2+2*r3]
	movdqa xmm1,   [r0]
	movdqu xmm3,   [r2]
	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
	movdqa xmm2,   [r0+r1]
	movdqu xmm3,   [r2+r3]
	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
	lea    r0,    [r0+2*r1]
	lea    r2,    [r2+2*r3]
	movdqa xmm0,   [r0]
	movdqu xmm3,   [r2]
	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
	movdqa xmm1,   [r0+r1]
	movdqu xmm3,   [r2+r3]
	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
	; tail: terms the row steps have not covered yet
	; (xmm0 and xmm1 hold the last two rows loaded from the r0 block)
	lea    r2,    [r2+2*r3]
	movdqu xmm3,   [r2]
	psadbw xmm0,   xmm3
	paddw xmm5,   xmm0

	movdqu xmm0,   [r2-1]
	psadbw xmm0,   xmm1
	paddw xmm6,   xmm0

	movdqu xmm3,   [r2+1]
	psadbw xmm3,   xmm1
	paddw xmm7,   xmm3

	movdqu xmm3,   [r2+r3]
	psadbw xmm1,   xmm3
	paddw xmm5,   xmm1

	; fold the qword halves of each accumulator and pack the four sums
	movhlps    xmm0, xmm4
	paddw      xmm4, xmm0
	movhlps    xmm0, xmm5
	paddw      xmm5, xmm0
	movhlps    xmm0, xmm6
	paddw      xmm6, xmm0
	movhlps    xmm0, xmm7
	paddw      xmm7, xmm0
	punpckldq  xmm4, xmm5
	punpckldq  xmm6, xmm7
	punpcklqdq xmm4, xmm6
	movdqa     [r4],xmm4
	POP_XMM
	LOAD_5_PARA_POP
	ret

; Four 8x16 SADs of the block at r0 (stride r1) against the block at r2
; (stride r3) displaced by one row up, one row down, one byte left and one
; byte right. The four 32-bit sums are stored to [r4] (written with movdqa,
; i.e. accessed as 16-byte aligned) in the order
;   { r2 - stride, r2 + stride, r2 - 1, r2 + 1 }.
; Rows are handled in pairs: the low qword of a register holds one 8-byte
; row, the high qword the next one. xmm0 always holds the current pair from
; the r0 block; xmm3 carries the r2-side pair from the "+stride" term of one
; row pair into the "-stride" term of the next.
WELS_EXTERN WelsSampleSadFour8x16_sse2
	%assign  push_num 0
	LOAD_5_PARA
	PUSH_XMM 8
	SIGN_EXTENSION r1, r1d
	SIGN_EXTENSION r3, r3d
	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
	pxor   xmm6,   xmm6    ;sad pRefMb-1
	pxor   xmm7,   xmm7    ;sad pRefMb+1

	; --- first row pair ------------------------------------------------------
	; "-stride" term: r2 is moved one row up first
	movq   xmm0,   [r0]
	movhps xmm0,   [r0+r1]
	sub    r2,    r3
	movq   xmm3,   [r2]
	movhps xmm3,   [r2+r3]
	psadbw xmm3,   xmm0
	paddw  xmm4,   xmm3

	; "-1" / "+1" terms
	movq   xmm1,  [r2+r3-1]
	movq   xmm3,  [r2+r3+1]

	lea    r0,    [r0+2*r1]
	lea    r2,    [r2+2*r3]
	movhps xmm1,  [r2-1]
	movhps xmm3,  [r2+1]
	psadbw xmm1,  xmm0
	paddw  xmm6,  xmm1
	psadbw xmm3,  xmm0
	paddw  xmm7,  xmm3

	; "+stride" term; xmm3 is kept for the next pair
	movq   xmm3,  [r2]
	movhps xmm3,  [r2+r3]
	psadbw xmm0,  xmm3
	paddw  xmm5,  xmm0

	; --- remaining seven row pairs -------------------------------------------
%rep 7
	; "-stride" term, reusing the r2-side pair left in xmm3
	movq   xmm0,  [r0]
	movhps xmm0,  [r0+r1]
	psadbw xmm3,  xmm0
	paddw  xmm4,  xmm3

	; "-1" / "+1" terms
	movq   xmm1,  [r2+r3-1]
	movq   xmm3,  [r2+r3+1]

	lea    r0,    [r0+2*r1]
	lea    r2,    [r2+2*r3]
	movhps xmm1,  [r2-1]
	movhps xmm3,  [r2+1]

	psadbw xmm1,  xmm0
	paddw  xmm6,  xmm1
	psadbw xmm3,  xmm0
	paddw  xmm7,  xmm3

	; "+stride" term
	movq   xmm3,  [r2]
	movhps xmm3,  [r2+r3]
	psadbw xmm0,  xmm3
	paddw  xmm5,  xmm0
%endrep

	; --- fold the qword halves and pack the four sums ------------------------
	movhlps    xmm0, xmm4
	paddw      xmm4, xmm0
	movhlps    xmm0, xmm5
	paddw      xmm5, xmm0
	movhlps    xmm0, xmm6
	paddw      xmm6, xmm0
	movhlps    xmm0, xmm7
	paddw      xmm7, xmm0
	punpckldq  xmm4, xmm5
	punpckldq  xmm6, xmm7
	punpcklqdq xmm4, xmm6
	movdqa     [r4],xmm4
	POP_XMM
	LOAD_5_PARA_POP
	ret


; Four 8x8 SADs of the block at r0 (stride r1) against the block at r2
; (stride r3) displaced by one row up, one row down, one byte left and one
; byte right. The four 32-bit sums are stored to [r4] (written with movdqa,
; i.e. accessed as 16-byte aligned) in the order
;   { r2 - stride, r2 + stride, r2 - 1, r2 + 1 }.
; Rows are handled in pairs: the low qword of a register holds one 8-byte
; row, the high qword the next one. xmm0 always holds the current pair from
; the r0 block; xmm3 carries the r2-side pair from the "+stride" term of one
; row pair into the "-stride" term of the next.
WELS_EXTERN WelsSampleSadFour8x8_sse2
	%assign  push_num 0
	LOAD_5_PARA
	PUSH_XMM 8
	SIGN_EXTENSION r1, r1d
	SIGN_EXTENSION r3, r3d
	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
	pxor   xmm6,   xmm6    ;sad pRefMb-1
	pxor   xmm7,   xmm7    ;sad pRefMb+1

	; --- first row pair ------------------------------------------------------
	; "-stride" term: r2 is moved one row up first
	movq   xmm0,   [r0]
	movhps xmm0,   [r0+r1]
	sub    r2,    r3
	movq   xmm3,   [r2]
	movhps xmm3,   [r2+r3]
	psadbw xmm3,   xmm0
	paddw  xmm4,   xmm3

	; "-1" / "+1" terms
	movq   xmm1,  [r2+r3-1]
	movq   xmm3,  [r2+r3+1]

	lea    r0,    [r0+2*r1]
	lea    r2,    [r2+2*r3]
	movhps xmm1,  [r2-1]
	movhps xmm3,  [r2+1]
	psadbw xmm1,  xmm0
	paddw  xmm6,  xmm1
	psadbw xmm3,  xmm0
	paddw  xmm7,  xmm3

	; "+stride" term; xmm3 is kept for the next pair
	movq   xmm3,  [r2]
	movhps xmm3,  [r2+r3]
	psadbw xmm0,  xmm3
	paddw  xmm5,  xmm0

	; --- remaining three row pairs -------------------------------------------
%rep 3
	; "-stride" term, reusing the r2-side pair left in xmm3
	movq   xmm0,  [r0]
	movhps xmm0,  [r0+r1]
	psadbw xmm3,  xmm0
	paddw  xmm4,  xmm3

	; "-1" / "+1" terms
	movq   xmm1,  [r2+r3-1]
	movq   xmm3,  [r2+r3+1]

	lea    r0,    [r0+2*r1]
	lea    r2,    [r2+2*r3]
	movhps xmm1,  [r2-1]
	movhps xmm3,  [r2+1]

	psadbw xmm1,  xmm0
	paddw  xmm6,  xmm1
	psadbw xmm3,  xmm0
	paddw  xmm7,  xmm3

	; "+stride" term
	movq   xmm3,  [r2]
	movhps xmm3,  [r2+r3]
	psadbw xmm0,  xmm3
	paddw  xmm5,  xmm0
%endrep

	; --- fold the qword halves and pack the four sums ------------------------
	movhlps    xmm0, xmm4
	paddw      xmm4, xmm0
	movhlps    xmm0, xmm5
	paddw      xmm5, xmm0
	movhlps    xmm0, xmm6
	paddw      xmm6, xmm0
	movhlps    xmm0, xmm7
	paddw      xmm7, xmm0
	punpckldq  xmm4, xmm5
	punpckldq  xmm6, xmm7
	punpcklqdq xmm4, xmm6
	movdqa     [r4],xmm4
	POP_XMM
	LOAD_5_PARA_POP
	ret

; Four 4x4 SADs of the block at r0 (stride r1) against the block at r2
; (stride r3) displaced by one row up (-L), one row down (+L), one byte left
; (-1) and one byte right (+1). Each 4x4 block is packed into one XMM register
; (four 4-byte rows), so a single psadbw per direction suffices.
; The four 32-bit sums are stored to [r4] (written with movdqa, i.e. accessed
; as 16-byte aligned) in the order { -L, +L, -1, +1 }.
WELS_EXTERN WelsSampleSadFour4x4_sse2
	%assign  push_num 0
	LOAD_5_PARA
	PUSH_XMM 8
	SIGN_EXTENSION r1, r1d
	SIGN_EXTENSION r3, r3d
	; xmm0 = rows 0..3 of the r0 block
	movd   xmm0,   [r0]
	movd   xmm1,   [r0+r1]
	lea        r0,    [r0+2*r1]
	movd       xmm2,   [r0]
	movd       xmm3,   [r0+r1]
	punpckldq  xmm0, xmm1
	punpckldq  xmm2, xmm3
	punpcklqdq xmm0, xmm2
	; r2 -> row -1 of the r2 block; start collecting the shifted blocks
	sub        r2,  r3
	movd       xmm1, [r2]
	movd       xmm2, [r2+r3]
	punpckldq  xmm1, xmm2			; xmm1 = {row -1, row 0}
	movd       xmm2, [r2+r3-1]		; row 0, one byte left
	movd       xmm3, [r2+r3+1]		; row 0, one byte right

	lea        r2,  [r2+2*r3]		; r2 -> row 1

	movd       xmm4, [r2]
	movd       xmm5, [r2-1]
	punpckldq  xmm2, xmm5			; xmm2 = rows 0,1 shifted left
	movd       xmm5, [r2+1]
	punpckldq  xmm3, xmm5			; xmm3 = rows 0,1 shifted right

	movd       xmm5, [r2+r3]
	punpckldq  xmm4, xmm5			; xmm4 = {row 1, row 2}

	punpcklqdq xmm1, xmm4 ;-L

	movd       xmm5, [r2+r3-1]		; row 2, one byte left
	movd       xmm6, [r2+r3+1]		; row 2, one byte right

	lea        r2,  [r2+2*r3]		; r2 -> row 3
	movd       xmm7, [r2-1]
	punpckldq  xmm5, xmm7
	punpcklqdq xmm2, xmm5 ;-1
	movd       xmm7, [r2+1]
	punpckldq  xmm6, xmm7
	punpcklqdq xmm3, xmm6 ;+1
	movd       xmm6, [r2]
	movd       xmm7, [r2+r3]
	punpckldq  xmm6, xmm7			; xmm6 = {row 3, row 4}
	punpcklqdq xmm4, xmm6 ;+L
	; one SAD per direction against the packed r0 block
	psadbw     xmm1, xmm0
	psadbw     xmm2, xmm0
	psadbw     xmm3, xmm0
	psadbw     xmm4, xmm0

	; fold the qword halves of each result and pack the four sums
	movhlps    xmm0, xmm1
	paddw      xmm1, xmm0
	movhlps    xmm0, xmm2
	paddw      xmm2, xmm0
	movhlps    xmm0, xmm3
	paddw      xmm3, xmm0
	movhlps    xmm0, xmm4
	paddw      xmm4, xmm0
	punpckldq  xmm1, xmm4
	punpckldq  xmm2, xmm3
	punpcklqdq xmm1, xmm2
	movdqa     [r4],xmm1
	POP_XMM
	LOAD_5_PARA_POP
	ret

;***********************************************************************
;
;Pixel_sad_4_wxh_sse2 END
;
;***********************************************************************

;***********************************************************************
;   int32_t WelsSampleSad4x4_mmx (uint8_t *, int32_t, uint8_t *, int32_t )
;***********************************************************************
; 4x4 SAD using MMX registers: two 4-byte rows are packed into each 64-bit
; register, so two psadbw operations cover the whole block.
; r0/r1 = first pointer/stride, r2/r3 = second pointer/stride; result in retrd.
WELS_EXTERN WelsSampleSad4x4_mmx
    %assign  push_num 0
	LOAD_4_PARA
	SIGN_EXTENSION r1, r1d
	SIGN_EXTENSION r3, r3d
	; rows 0 and 1
	movd	  mm0, [r0]
	movd	  mm1, [r0+r1]
	punpckldq mm0, mm1

	movd      mm3, [r2]
	movd      mm4, [r2+r3]
	punpckldq mm3, mm4
	psadbw    mm0, mm3

	lea       r0, [r0+2*r1]
	lea       r2, [r2+2*r3]

	; rows 2 and 3
	movd      mm1, [r0]
	movd      mm2, [r0+r1]
	punpckldq mm1, mm2

	movd      mm3, [r2]
	movd      mm4, [r2+r3]
	punpckldq mm3, mm4
	psadbw    mm1, mm3
	paddw     mm0, mm1

    movd      retrd, mm0

	; WELSEMMS is defined outside this chunk; presumably it issues emms to
	; leave MMX state -- TODO confirm.
	WELSEMMS
    LOAD_4_PARA_POP
    ret