shithub: openh264

Download patch

ref: ed9c03408f1ccb93fc4f6a8ce3b23e7f9e0d59c6
parent: 197423f271794ddc8a0d0a9070ab5a4834186ef9
author: Martin Storsjö <martin@martin.st>
date: Tue Mar 18 11:59:42 EDT 2014

Rename the asm subdirectories to x86

This is consistent with having the arm assembly in a subdirectory
called arm.

--- a/codec/build/win32/dec/WelsDecCore.vcproj
+++ b/codec/build/win32/dec/WelsDecCore.vcproj
@@ -349,7 +349,7 @@
 				Filter="*.asm;*.inc"
 				>
 				<File
-					RelativePath="..\..\..\decoder\core\asm\block_add.asm"
+					RelativePath="..\..\..\decoder\core\x86\block_add.asm"
 					>
 					<FileConfiguration
 						Name="Release|Win32"
@@ -429,7 +429,7 @@
 					</FileConfiguration>
 				</File>
 				<File
-					RelativePath="..\..\..\decoder\core\asm\dct.asm"
+					RelativePath="..\..\..\decoder\core\x86\dct.asm"
 					>
 					<FileConfiguration
 						Name="Release|Win32"
@@ -549,7 +549,7 @@
 					</FileConfiguration>
 				</File>
 				<File
-					RelativePath="..\..\..\decoder\core\asm\intra_pred.asm"
+					RelativePath="..\..\..\decoder\core\x86\intra_pred.asm"
 					>
 					<FileConfiguration
 						Name="Release|Win32"
--- a/codec/build/win32/enc/WelsEncCore.vcproj
+++ b/codec/build/win32/enc/WelsEncCore.vcproj
@@ -1670,7 +1670,7 @@
 			Filter="*.asm;*.inc"
 			>
 			<File
-				RelativePath="..\..\..\encoder\core\asm\coeff.asm"
+				RelativePath="..\..\..\encoder\core\x86\coeff.asm"
 				>
 				<FileConfiguration
 					Name="Debug|Win32"
@@ -1750,7 +1750,7 @@
 				</FileConfiguration>
 			</File>
 			<File
-				RelativePath="..\..\..\encoder\core\asm\dct.asm"
+				RelativePath="..\..\..\encoder\core\x86\dct.asm"
 				>
 				<FileConfiguration
 					Name="Debug|Win32"
@@ -1870,7 +1870,7 @@
 				</FileConfiguration>
 			</File>
 			<File
-				RelativePath="..\..\..\encoder\core\asm\intra_pred.asm"
+				RelativePath="..\..\..\encoder\core\x86\intra_pred.asm"
 				>
 				<FileConfiguration
 					Name="Debug|Win32"
@@ -2030,7 +2030,7 @@
 				</FileConfiguration>
 			</File>
 			<File
-				RelativePath="..\..\..\encoder\core\asm\memzero.asm"
+				RelativePath="..\..\..\encoder\core\x86\memzero.asm"
 				>
 				<FileConfiguration
 					Name="Debug|Win32"
@@ -2070,7 +2070,7 @@
 				</FileConfiguration>
 			</File>
 			<File
-				RelativePath="..\..\..\encoder\core\asm\quant.asm"
+				RelativePath="..\..\..\encoder\core\x86\quant.asm"
 				>
 				<FileConfiguration
 					Name="Debug|Win32"
@@ -2150,7 +2150,7 @@
 				</FileConfiguration>
 			</File>
 			<File
-				RelativePath="..\..\..\encoder\core\asm\score.asm"
+				RelativePath="..\..\..\encoder\core\x86\score.asm"
 				>
 				<FileConfiguration
 					Name="Debug|Win32"
--- a/codec/decoder/core/asm/block_add.asm
+++ /dev/null
@@ -1,151 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  block_add.asm
-;*
-;*  Abstract
-;*      add block
-;*
-;*  History
-;*      09/21/2009 Created
-;*
-;*
-;*************************************************************************/
-
-%include  "asm_inc.asm"
-
-;*******************************************************************************
-; Code
-;*******************************************************************************
-
-SECTION .text
-
-
-;*******************************************************************************
-;  void WelsResBlockZero16x16_sse2(int16_t* pBlock,int32_t iStride)
-;  Stores zeros to 16 rows of 32 bytes each, starting at r0; row pitch is 2*r1 bytes.
-WELS_EXTERN   WelsResBlockZero16x16_sse2
-        %assign push_num 0
-        LOAD_2_PARA
-        PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-        lea 	r1, 	[r1*2]	; r1 = 2*r1; r0/r1 presumably hold pBlock/iStride after LOAD_2_PARA (asm_inc.asm) -- TODO confirm
-        lea 	r2,	[r1*3]	; r2 = 3 * row pitch: offset of the 4th row in each group of four
-
-	pxor     xmm7,       xmm7	; xmm7 = 0, the only value stored below
-
-    ; rows 0-3: two aligned 16-byte stores (32 bytes) per row
-	movdqa   [r0],      xmm7
-	movdqa   [r0+10h],  xmm7
-
-	movdqa   [r0+r1],  xmm7
-	movdqa   [r0+r1+10h],     xmm7
-
-    movdqa   [r0+r1*2],   xmm7
-	movdqa   [r0+r1*2+10h],   xmm7
-
-	movdqa   [r0+r2],     xmm7
-	movdqa   [r0+r2+10h],     xmm7
-
-    ;  rows 4-7: advance r0 by four rows, then repeat the same store pattern
-	lea      r0,       [r0+r1*4]
-	movdqa   [r0],      xmm7
-	movdqa   [r0+10h],  xmm7
-
-	movdqa   [r0+r1],  xmm7
-	movdqa   [r0+r1+10h],     xmm7
-
-    movdqa   [r0+r1*2],   xmm7
-	movdqa   [r0+r1*2+10h],   xmm7
-
-	movdqa   [r0+r2],     xmm7
-	movdqa   [r0+r2+10h],     xmm7
-
-	;  rows 8-11
-	lea      r0,       [r0+r1*4]
-	movdqa   [r0],      xmm7
-	movdqa   [r0+10h],  xmm7
-
-	movdqa   [r0+r1],  xmm7
-	movdqa   [r0+r1+10h],     xmm7
-
-    movdqa   [r0+r1*2],   xmm7
-	movdqa   [r0+r1*2+10h],   xmm7
-
-	movdqa   [r0+r2],     xmm7
-	movdqa   [r0+r2+10h],     xmm7
-
-	;  rows 12-15
-	lea      r0,       [r0+r1*4]
-	movdqa   [r0],      xmm7
-	movdqa   [r0+10h],  xmm7
-
-	movdqa   [r0+r1],  xmm7
-	movdqa   [r0+r1+10h],     xmm7
-
-    movdqa   [r0+r1*2],   xmm7
-	movdqa   [r0+r1*2+10h],   xmm7
-
-	movdqa   [r0+r2],     xmm7
-	movdqa   [r0+r2+10h],     xmm7
-; movdqa is an aligned store, so every row address written above must be 16-byte aligned
-	POP_XMM
-	ret
-
-
-;*******************************************************************************
-;  void WelsResBlockZero8x8_sse2(int16_t * pBlock, int32_t iStride)
-;  Stores zeros to 8 rows of 16 bytes each, starting at r0; row pitch is 2*r1 bytes.
-WELS_EXTERN   WelsResBlockZero8x8_sse2
-	  %assign push_num 0
-          LOAD_2_PARA
-          PUSH_XMM 8
-	  SIGN_EXTENSION r1, r1d
-	  lea       r1,     [r1*2]	; r1 = 2*r1; r0/r1 presumably hold pBlock/iStride after LOAD_2_PARA (asm_inc.asm) -- TODO confirm
-	  lea       r2,     [r1*3]	; r2 = 3 * row pitch
-
-	  pxor      xmm7,          xmm7	; xmm7 = 0
-; rows 0-3: one aligned 16-byte store per row
-	  movdqa    [r0],         xmm7
-	  movdqa    [r0+r1],     xmm7
-	  movdqa    [r0+r1*2],   xmm7
-	  movdqa    [r0+r2],     xmm7
-; rows 4-7: advance r0 by four rows and repeat
-	  lea       r0,     [r0+r1*4]
-	  movdqa    [r0],         xmm7
-	  movdqa    [r0+r1],     xmm7
-	  movdqa    [r0+r1*2],   xmm7
-	  movdqa    [r0+r2],     xmm7
-
-; movdqa is an aligned store, so every row address written above must be 16-byte aligned
-	  POP_XMM
-	  ret
-
--- a/codec/decoder/core/asm/dct.asm
+++ /dev/null
@@ -1,115 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  dct.asm
-;*
-;*  Abstract
-;*      WelsDctFourT4_sse2
-;*
-;*  History
-;*      8/4/2009 Created
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-;*******************************************************************************
-; Macros and other preprocessor constants
-; MMX_SumSubDiv2 a, b, out: out = a + (b >> 1); a = (a >> 1) - b; b is not modified
-%macro MMX_SumSubDiv2 3
-    movq    %3, %2	; %3 = b
-    psraw   %3, $01	; %3 = b >> 1 (arithmetic shift on each 16-bit lane)
-    paddw   %3, %1	; %3 = a + (b >> 1)
-    psraw   %1, $01	; %1 = a >> 1
-    psubw   %1, %2	; %1 = (a >> 1) - b
-%endmacro
-; MMX_SumSub a, b, tmp: per 16-bit lane, a = a + b and b = b - a (old values); tmp ends up holding the old b
-%macro MMX_SumSub 3
-	movq    %3, %2	; tmp = old b
-    psubw   %2, %1	; b = b - a
-    paddw   %1, %3	; a = a + old b
-%endmacro
-; MMX_IDCT: chains four add/sub steps; %1 is written before it is read (output only), %6 is scratch
-%macro MMX_IDCT 6
-    MMX_SumSub      %4, %5, %6	; %4 = %4 + %5, %5 = %5 - %4
-    MMX_SumSubDiv2  %3, %2, %1	; %1 = %3 + (%2 >> 1), %3 = (%3 >> 1) - %2
-    MMX_SumSub		%1, %4, %6	; %1 = %1 + %4, %4 = %4 - %1
-	MMX_SumSub		%3, %5, %6	; %3 = %3 + %5, %5 = %5 - %3
-%endmacro
-; MMX_StoreDiff4P data, tmp, round, zero, mem: mem[0..3] = saturate_u8(((data + round) >> 6) + mem[0..3])
-; %4 is expected to be all zero so that punpcklbw widens the loaded bytes to words (callers in this file pass a zeroed register)
-%macro MMX_StoreDiff4P 5
-    movd       %2, %5	; load the 4 bytes currently at the destination
-    punpcklbw  %2, %4	; interleave them with the bytes of %4 (widening to 16-bit when %4 is zero)
-    paddw      %1, %3	; add the rounding term
-    psraw      %1, $06	; arithmetic shift right by 6
-    paddsw     %1, %2	; add the widened destination bytes with signed saturation
-    packuswb   %1, %2	; pack words to bytes with unsigned saturation; result is in the low 4 bytes
-    movd       %5, %1	; store 4 bytes back to the destination
-%endmacro
-
-;*******************************************************************************
-; Code
-;*******************************************************************************
-
-SECTION .text
-
-;*******************************************************************************
-;   void IdctResAddPred_mmx( uint8_t *pPred, const int32_t kiStride, int16_t *pRs )
-;*******************************************************************************
-; Reads 32 bytes at r2, runs two transpose + MMX_IDCT passes, then adds the shifted result into 4 rows of 4 bytes at r0.
-WELS_EXTERN IdctResAddPred_mmx
-    %assign push_num 0
-    LOAD_3_PARA
-    SIGN_EXTENSION r1, r1d
-    movq    mm0, [r2+ 0]	; r0/r1/r2 presumably hold pPred/kiStride/pRs after LOAD_3_PARA (asm_inc.asm) -- TODO confirm
-    movq    mm1, [r2+ 8]	; each movq loads 8 bytes, i.e. four 16-bit values
-    movq    mm2, [r2+16]
-    movq    mm3, [r2+24]
-; MMX_Trans4x4W comes from asm_inc.asm (not shown); presumably a 4x4 word transpose -- TODO confirm
-	MMX_Trans4x4W        mm0, mm1, mm2, mm3, mm4
-	MMX_IDCT			mm1, mm2, mm3, mm4, mm0, mm6
-    MMX_Trans4x4W        mm1, mm3, mm0, mm4, mm2
-	MMX_IDCT			mm3, mm0, mm4, mm2, mm1, mm6
-; WELS_Zero / WELS_DW32 come from asm_inc.asm; presumably mm7 = 0 and mm6 = 32 in every word (rounding for >> 6) -- TODO confirm
-    WELS_Zero			mm7
-    WELS_DW32			mm6
-; rows 0 and 1 (mm0 is scratch for the destination bytes)
-    MMX_StoreDiff4P    mm3, mm0, mm6, mm7, [r0]
-    MMX_StoreDiff4P    mm4, mm0, mm6, mm7, [r0+r1]
-    lea     r0, [r0+2*r1]	; advance to rows 2 and 3
-    MMX_StoreDiff4P    mm1, mm0, mm6, mm7, [r0]
-    MMX_StoreDiff4P    mm2, mm0, mm6, mm7, [r0+r1]
-
-; leave MMX state before returning
-	emms
-    ret
--- a/codec/decoder/core/asm/intra_pred.asm
+++ /dev/null
@@ -1,1414 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  intra_pred.asm
-;*
-;*  Abstract
-;*      sse2 and mmx function for intra predict operations(decoder)
-;*
-;*  History
-;*      18/09/2009 Created
-;*		19/11/2010 Added
-;*					WelsDecoderI16x16LumaPredDcTop_sse2, WelsDecoderI16x16LumaPredDcNA_sse2,
-;*					WelsDecoderIChromaPredDcLeft_mmx, WelsDecoderIChromaPredDcTop_sse2
-;*					and WelsDecoderIChromaPredDcNA_mmx
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-;*******************************************************************************
-; Local Data (Read Only)
-;*******************************************************************************
-
-SECTION .rodata align=16
-%if 1
-	%define WELSEMMS	emms
-%else
-	%define WELSEMMS
-%endif
-; WELSEMMS expands to emms because the "%if 1" branch above is the one assembled
-align 16
-sse2_plane_inc_minus dw -7, -6, -5, -4, -3, -2, -1, 0	; multipliers of b for the left 8 pixels in the 16x16 plane row loop
-align 16
-sse2_plane_inc dw 1, 2, 3, 4, 5, 6, 7, 8	; ascending weights for H/V sums; also multipliers of b for the right 8 pixels
-align 16
-sse2_plane_dec dw 8, 7, 6, 5, 4, 3, 2, 1	; descending weights for the H/V sums of the 16x16 plane mode
-
-; for chroma plane mode
-sse2_plane_inc_c dw 1, 2, 3, 4	; 8 bytes; read with movq in WelsDecoderIChromaPredPlane_sse2
-sse2_plane_dec_c dw 4, 3, 2, 1	; 8 bytes; read with movq in WelsDecoderIChromaPredPlane_sse2
-align 16
-sse2_plane_mul_b_c dw -3, -2, -1, 0, 1, 2, 3, 4	; multipliers of b for the 8 pixels of a chroma plane row
-
-align 16
-mmx_01bytes:		times 16	db 1	; 0x01 in every byte: used as a byte-replicating multiplier and as a low-bit mask
-
-align 16
-mmx_0x02: dw 0x02, 0x00, 0x00, 0x00	; not referenced in the part of the file visible here
-
-align 16
-sse2_dc_0x80: times 16 db 0x80	; not referenced in the part of the file visible here
-align 16
-sse2_wd_0x02: times 8 dw 0x02	; not referenced in the part of the file visible here
-
-;*******************************************************************************
-; macros
-;*******************************************************************************
-;SSE2_PRED_H_4X4_TWO_LINE out, tmp1, tmp2, ptr, stride  (originally documented as xmm0, xmm1, xmm2, eax, ecx)
-;low 64 bits of %1 receive the result: byte [ptr-1] repeated 4 times, then byte [ptr+stride-1] repeated 4 times
-%macro SSE2_PRED_H_4X4_TWO_LINE 5
-    movd		%1,	[%4-1]	; byte 0 of %1 = pixel left of the first row
-	movdqa		%3,	%1
-	punpcklbw	%1,	%3	; every byte doubled
-	movdqa		%3,	%1
-	punpcklbw	%1,	%3	; every byte quadrupled: low dword = 4 copies of the left pixel
-
-	;add			%4,	%5
-	movd		%2,	[%4+%5-1]	; same for the second row, addressed via ptr+stride
-	movdqa		%3,	%2
-	punpcklbw	%2,	%3
-	movdqa		%3,	%2
-	punpcklbw	%2,	%3
-	punpckldq	%1,	%2	; low dword = first row, next dword = second row
-%endmacro
-
-; LOAD_COLUMN out, t1, t2, t3, ptr, stride: gathers byte 3 of 8 consecutive rows into bytes 8-15 of out; ptr advances by 8*stride
-%macro	LOAD_COLUMN 6
-		movd	%1,	[%5]	; 4 bytes of row 0
-		movd	%2,	[%5+%6]	; 4 bytes of row 1
-		punpcklbw %1,	%2	; rows 0/1 interleaved byte by byte
-		lea		%5,	[%5+2*%6]
-		movd	%3,	[%5]	; row 2
-		movd	%2,	[%5+%6]	; row 3
-		punpcklbw %3,	%2
-		punpcklwd %1,	%3	; dword k of %1 = byte k of rows 0..3
-		lea		%5,	[%5+2*%6]
-		movd	%4,	[%5]	; row 4
-		movd	%2,	[%5+%6]	; row 5
-		punpcklbw %4,	%2
-		lea		%5,	[%5+2*%6]
-		movd	%3,	[%5]	; row 6
-		movd	%2,	[%5+%6]	; row 7
-		lea		%5,	[%5+2*%6]	; ptr now points 8 rows below its starting value
-		punpcklbw %3,	%2
-		punpcklwd %4,	%3	; dword k of %4 = byte k of rows 4..7
-		punpckhdq %1,	%4	; %1 = [byte2 rows0-3, byte2 rows4-7, byte3 rows0-3, byte3 rows4-7]
-%endmacro
-; SUMW_HORIZON x1, x2, x3: low dword of x1 = d0+...+d7 (the eight words of x1); x2 is scratch; with x3 = 0 the words are zero-extended, otherwise only the low 16 bits of the sum are reliable
-%macro  SUMW_HORIZON 3
-	movhlps		%2, %1			; x2 = xx xx xx xx d7 d6 d5 d4
-	paddw		%1, %2			; x1 = xx xx xx xx d37 d26 d15 d04
-	punpcklwd	%1, %3			; x1 =  d37  d26 d15 d04
-	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26
-	paddd		%1, %2			; x1 = xxxx xxxx d1357 d0246
-	pshuflw		%2, %1, 0x4e	; x2 = xxxx xxxx d0246 d1357
-	paddd		%1, %2			; x1 = xxxx xxxx xxxx  d01234567
-%endmacro
-; COPY_16_TIMES ptr, out: out = the byte at [ptr-1] copied into all 16 bytes; ptr-16 must be 16-byte aligned (movdqa)
-%macro  COPY_16_TIMES 2
-		movdqa		%2,	[%1-16]	; aligned load of the 16 bytes that end just before ptr
-		psrldq		%2,	15	; keep only the last byte, moved down to byte 0
-		pmuludq		%2,	[mmx_01bytes]	; byte * 0x01010101: the low dword holds 4 copies of the byte
-		pshufd		%2,	%2, 0	; broadcast the low dword to all four dwords
-%endmacro
-; COPY_16_TIMESS ptr, out, off: like COPY_16_TIMES, but replicates the byte at [ptr+off-1]; ptr+off-16 must be 16-byte aligned
-%macro  COPY_16_TIMESS 3
-		movdqa		%2,	[%1+%3-16]	; aligned load of the 16 bytes that end just before ptr+off
-		psrldq		%2,	15	; keep only the last byte, moved down to byte 0
-		pmuludq		%2,	[mmx_01bytes]	; byte * 0x01010101: the low dword holds 4 copies of the byte
-		pshufd		%2,	%2, 0	; broadcast the low dword to all four dwords
-%endmacro
-; LOAD_COLUMN_C out, t1, t2, (unused), ptr, stride: 4-row variant of LOAD_COLUMN, used with MMX registers in this file; byte 3 of rows 0..3 ends up in bytes 4-7 of out; ptr advances by 4*stride
-%macro	LOAD_COLUMN_C 6
-		movd	%1,	[%5]	; 4 bytes of row 0
-		movd	%2,	[%5+%6]	; 4 bytes of row 1
-		punpcklbw %1,%2	; rows 0/1 interleaved byte by byte
-		lea		%5,	[%5+2*%6]
-		movd	%3,	[%5]	; row 2
-		movd	%2,	[%5+%6]	; row 3
-		punpcklbw %3,	%2
-		punpckhwd %1,	%3	; high words interleaved: bytes 0-3 = byte 2 of rows 0..3, bytes 4-7 = byte 3 of rows 0..3
-		lea		%5,	[%5+2*%6]	; ptr now points 4 rows below its starting value
-%endmacro
-; LOAD_2_LEFT_AND_ADD: advances r0 by two rows (2*r1) and adds the bytes at [r0-1] and [r0+r1-1] to r2; overwrites r3
-%macro LOAD_2_LEFT_AND_ADD 0
-        lea         r0, [r0+2*r1]	; move down two rows
-        movzx		r3, byte [r0-0x01]	; byte just left of the new row
-        add			r2, r3
-        movzx		r3, byte [r0+r1-0x01]	; byte just left of the following row
-        add			r2, r3
-%endmacro
-
-;*******************************************************************************
-; Code
-;*******************************************************************************
-
-SECTION .text
-
-
-;*******************************************************************************
-;   void WelsDecoderI4x4LumaPredH_sse2(uint8_t *pPred, const int32_t kiStride)
-;   Fills each of 4 rows with 4 copies of the byte located just left of that row.
-;	pPred must align to 16
-;*******************************************************************************
-WELS_EXTERN WelsDecoderI4x4LumaPredH_sse2
-	%assign push_num 0
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
-; r0/r1 presumably hold pPred/kiStride after LOAD_2_PARA (asm_inc.asm, not shown) -- TODO confirm
-	movzx		r2,	byte [r0-1]	; left neighbour of row 0
-	movd		xmm0,	r2d
-	pmuludq		xmm0,	[mmx_01bytes]	; * 0x01010101: low dword = the byte repeated 4 times
-
-	movzx		r2,	byte [r0+r1-1]	; left neighbour of row 1
-	movd		xmm1,	r2d
-	pmuludq		xmm1,	[mmx_01bytes]
-
-	lea			r0,	[r0+r1]	; r0 -> row 1, so row 2 and row 3 are reachable as r0+r1 and r0+2*r1
-	movzx		r2,	byte [r0+r1-1]	; left neighbour of row 2
-	movd		xmm2,	r2d
-	pmuludq		xmm2,	[mmx_01bytes]
-
-	movzx		r2,	byte [r0+2*r1-1]	; left neighbour of row 3
-	movd		xmm3,	r2d
-	pmuludq		xmm3,	[mmx_01bytes]
-
-	sub         r0,    r1	; back to row 0
-	movd        [r0], xmm0	; row 0
-	movd        [r0+r1], xmm1	; row 1
-	lea         r0, [r0+2*r1]
-	movd        [r0], xmm2	; row 2
-	movd        [r0+r1], xmm3	; row 3
-
-	ret
-
-;*******************************************************************************
-; void WelsDecoderI16x16LumaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
-; Plane prediction: derives b (from the top row), c (from the left column) and s, then writes 16 rows of 16 bytes.
-WELS_EXTERN WelsDecoderI16x16LumaPredPlane_sse2
-		push r3
-		push r4
-		%assign push_num 2
-		LOAD_2_PARA
-		PUSH_XMM 8
-		SIGN_EXTENSION r1, r1d
-		mov r4, r0 ; save r0 in r4
-		sub		r0,	1
-		sub		r0,	r1	; r0 -> byte one column left of and one row above the saved r0
-
-		;for H
-		pxor	xmm7,	xmm7
-		movq	xmm0,	[r0]	; 8 bytes starting at the top-left neighbour
-		movdqa	xmm5,	[sse2_plane_dec]
-		punpcklbw xmm0,	xmm7	; widen to words
-		pmullw	xmm0,	xmm5	; weight by 8..1
-		movq	xmm1,	[r0 + 9]	; 8 bytes starting 9 bytes further right
-		movdqa	xmm6,	[sse2_plane_inc]
-		punpcklbw xmm1,	xmm7
-		pmullw	xmm1,	xmm6	; weight by 1..8
-		psubw	xmm1,	xmm0	; weighted right half minus weighted left half
-
-		SUMW_HORIZON	xmm1,xmm0,xmm2
-		movd    r2d,	xmm1		; H += (i + 1) * (top[8 + i] - top[6 - i]);
-		movsx	r2,	r2w	; only the low 16 bits of the sum are used, sign-extended
-		imul	r2,	5
-		add		r2,	32
-		sar		r2,	6			; b = (5 * H + 32) >> 6;
-		SSE2_Copy8Times	xmm1, r2d	; xmm1 = b,b,b,b,b,b,b,b
-
-		movzx	r3,	BYTE [r0+16]	; r3 = last byte of the row above the block (top[15])
-		sub	r0, 3	; shift left so each 4-byte load has the left-column byte as its byte 3
-		LOAD_COLUMN		xmm0, xmm2, xmm3, xmm4, r0, r1
-
-		add		r0,	3
-		movzx	r2,	BYTE [r0+8*r1]	; left-column byte of the last block row
-		add		r3,	r2
-		shl		r3,	4			;	a = (left[15*kiStride] + top[15]) << 4;
-
-		sub	r0, 3
-		add		r0,	r1	; skip one row before gathering the next 8 left-column bytes
-		LOAD_COLUMN		xmm7, xmm2, xmm3, xmm4, r0, r1
-		pxor	xmm4,	xmm4
-		punpckhbw xmm0,	xmm4	; widen the gathered column bytes (upper 8 bytes) to words
-		pmullw	xmm0,	xmm5	; weight by 8..1
-		punpckhbw xmm7,	xmm4
-		pmullw	xmm7,	xmm6	; weight by 1..8
-		psubw	xmm7,	xmm0
-
-		SUMW_HORIZON   xmm7,xmm0,xmm2
-		movd    r2d,   xmm7			; V
-		movsx	r2,	r2w
-
-		imul	r2,	5
-		add		r2,	32
-		sar		r2,	6				; c = (5 * V + 32) >> 6;
-		SSE2_Copy8Times	xmm4, r2d		; xmm4 = c,c,c,c,c,c,c,c
-
-		mov r0, r4	; restore the original destination pointer
-		add		r3,	16
-		imul	r2,	-7
-		add		r3,	r2		; s = a + 16 + (-7)*c
-		SSE2_Copy8Times	xmm0, r3d		; xmm0 = s,s,s,s,s,s,s,s
-
-		xor		r2,	r2	; row counter = 0
-		movdqa	xmm5,	[sse2_plane_inc_minus]
-; one iteration per row: pixel[x] = clip_u8((s + b*(x-7)) >> 5), then s += c
-get_i16x16_luma_pred_plane_sse2_1:
-		movdqa	xmm2,	xmm1
-		pmullw	xmm2,	xmm5	; b * (-7..0) for the left 8 pixels
-		paddw	xmm2,	xmm0
-		psraw	xmm2,	5
-		movdqa	xmm3,	xmm1
-		pmullw	xmm3,	xmm6	; b * (1..8) for the right 8 pixels
-		paddw	xmm3,	xmm0
-		psraw	xmm3,	5
-		packuswb xmm2,	xmm3	; pack both halves to 16 bytes with unsigned saturation
-		movdqa	[r0],	xmm2	; aligned 16-byte store of one row
-		paddw	xmm0,	xmm4	; s += c for the next row
-		add		r0,	r1
-		inc		r2
-		cmp		r2,	16
-		jnz get_i16x16_luma_pred_plane_sse2_1
-
-		POP_XMM
-		pop r4
-		pop r3
-		ret
-
-
-
-;*******************************************************************************
-; void WelsDecoderI16x16LumaPredH_sse2(uint8_t *pPred, const int32_t kiStride);
-;*******************************************************************************
-; SSE2_PRED_H_16X16_TWO_LINE_DEC ptr, stride: advances ptr by two rows, then fills those two rows from their left-neighbour bytes
-%macro SSE2_PRED_H_16X16_TWO_LINE_DEC 2
-    lea     %1,	[%1+%2*2]
-
-    COPY_16_TIMES %1,	xmm0
-    movdqa  [%1],	xmm0
-    COPY_16_TIMESS %1,	xmm0,	%2
-    movdqa  [%1+%2],	xmm0
-%endmacro
-; Fills each of 16 rows with 16 copies of the byte just left of that row (aligned 16-byte loads and stores).
-WELS_EXTERN WelsDecoderI16x16LumaPredH_sse2
-	%assign push_num 0
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
-; rows 0 and 1; r0/r1 presumably hold pPred/kiStride after LOAD_2_PARA (asm_inc.asm) -- TODO confirm
-    COPY_16_TIMES r0,	xmm0
-    movdqa  [r0],		xmm0
-    COPY_16_TIMESS r0,	xmm0,	r1
-    movdqa  [r0+r1],	xmm0
-; rows 2..15: seven more two-row steps
-	SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
-	SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
-	SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
-	SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
-	SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
-	SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
-	SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
-
-    ret
-
-;*******************************************************************************
-; void WelsDecoderI16x16LumaPredV_sse2(uint8_t *pPred, const int32_t kiStride);
-; Copies the 16 bytes of the row directly above the block into each of the 16 block rows.
-WELS_EXTERN WelsDecoderI16x16LumaPredV_sse2
-	%assign push_num 0
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
-; r0/r1 presumably hold pPred/kiStride after LOAD_2_PARA (asm_inc.asm) -- TODO confirm
-    sub     r0, r1	; r0 -> row above the block
-    movdqa  xmm0, [r0]	; aligned load of the 16 source bytes
-
-    movdqa  [r0+r1], xmm0	; row 0
-    lea     r0, [r0+2*r1]
-    movdqa  [r0],     xmm0	; row 1
-    movdqa  [r0+r1], xmm0	; row 2
-    lea     r0, [r0+2*r1]
-    movdqa  [r0],     xmm0	; row 3
-    movdqa  [r0+r1], xmm0	; row 4
-    lea     r0, [r0+2*r1]
-    movdqa  [r0],     xmm0	; row 5
-    movdqa  [r0+r1], xmm0	; row 6
-    lea     r0, [r0+2*r1]
-    movdqa  [r0],     xmm0	; row 7
-    movdqa  [r0+r1], xmm0	; row 8
-    lea     r0, [r0+2*r1]
-    movdqa  [r0],     xmm0	; row 9
-    movdqa  [r0+r1], xmm0	; row 10
-    lea     r0, [r0+2*r1]
-    movdqa  [r0],     xmm0	; row 11
-    movdqa  [r0+r1], xmm0	; row 12
-    lea     r0, [r0+2*r1]
-    movdqa  [r0],     xmm0	; row 13
-    movdqa  [r0+r1], xmm0	; row 14
-    lea     r0, [r0+2*r1]
-    movdqa  [r0],     xmm0	; row 15
-
-    ret
-
-;*******************************************************************************
-; void WelsDecoderIChromaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
-; Plane prediction for an 8x8 block: derives b, c and s from the top row and left column, then writes 8 rows of 8 bytes.
-WELS_EXTERN WelsDecoderIChromaPredPlane_sse2
-		push r3
-		push r4
-		%assign push_num 2
-		LOAD_2_PARA
-		PUSH_XMM 8
-		SIGN_EXTENSION r1, r1d
-		mov r4, r0	; keep the original destination pointer
-		sub		r0,	1
-		sub		r0,	r1	; r0 -> byte one column left of and one row above the saved r0
-; H: weighted difference between the right and left parts of the row above
-		pxor	mm7,	mm7
-		movq	mm0,	[r0]
-		movq	mm5,	[sse2_plane_dec_c]
-		punpcklbw mm0,	mm7	; low 4 bytes widened to words
-		pmullw	mm0,	mm5	; weight by 4..1
-		movq	mm1,	[r0 + 5]
-		movq	mm6,	[sse2_plane_inc_c]
-		punpcklbw mm1,	mm7
-		pmullw	mm1,	mm6	; weight by 1..4
-		psubw	mm1,	mm0
-
-		movq2dq xmm1,   mm1	; move the 4 word differences into an XMM register for the horizontal sum
-		pxor    xmm2,   xmm2
-		SUMW_HORIZON	xmm1,xmm0,xmm2
-		movd    r2d,	xmm1
-		movsx	r2,	r2w
-		imul	r2,	17
-		add		r2,	16
-		sar		r2,	5			; b = (17 * H + 16) >> 5;
-		SSE2_Copy8Times	xmm1, r2d	; xmm1 = b,b,b,b,b,b,b,b
-
-		movzx	r3,	BYTE [r0+8]	; r3 = last byte of the row above the block (top[7])
-		sub	r0, 3	; shift left so each 4-byte load has the left-column byte as its byte 3
-		LOAD_COLUMN_C	mm0, mm2, mm3, mm4, r0, r1
-
-		add		r0,	3
-		movzx	r2,	BYTE [r0+4*r1]	; left-column byte of the last block row
-		add		r3,	r2
-		shl		r3,	4			; a = (left[7*kiStride] + top[7]) << 4;
-
-		sub	r0, 3
-		add		r0,	r1	; skip one row before gathering the next 4 left-column bytes
-		LOAD_COLUMN_C	mm7, mm2, mm3, mm4, r0, r1
-		pxor	mm4,	mm4
-		punpckhbw mm0,	mm4	; widen the gathered column bytes (upper 4 bytes) to words
-		pmullw	mm0,	mm5	; weight by 4..1
-		punpckhbw mm7,	mm4
-		pmullw	mm7,	mm6	; weight by 1..4
-		psubw	mm7,	mm0
-
-		movq2dq xmm7,   mm7
-		pxor    xmm2,   xmm2
-		SUMW_HORIZON	xmm7,xmm0,xmm2
-		movd    r2d,    xmm7			; V
-		movsx	r2,	r2w
-
-		imul	r2,	17
-		add		r2,	16
-		sar		r2,	5				; c = (17 * V + 16) >> 5;
-		SSE2_Copy8Times	xmm4, r2d		; xmm4 = c,c,c,c,c,c,c,c
-
-		mov 	r0, r4	; restore the original destination pointer
-		add		r3,	16
-		imul	r2,	-3
-		add		r3,	r2				; s = a + 16 + (-3)*c
-		SSE2_Copy8Times	xmm0, r3d		; xmm0 = s,s,s,s,s,s,s,s
-
-		xor		r2,	r2	; row counter = 0
-		movdqa	xmm5,	[sse2_plane_mul_b_c]
-; one iteration per row: pixel[x] = clip_u8((s + b*(x-3)) >> 5), then s += c
-get_i_chroma_pred_plane_sse2_1:
-		movdqa	xmm2,	xmm1
-		pmullw	xmm2,	xmm5	; b * (-3..4)
-		paddw	xmm2,	xmm0
-		psraw	xmm2,	5
-		packuswb xmm2,	xmm2	; pack to bytes with unsigned saturation
-		movq	[r0],	xmm2	; store the 8 bytes of one row
-		paddw	xmm0,	xmm4	; s += c for the next row
-		add		r0,	r1
-		inc		r2
-		cmp		r2,	8
-		jnz get_i_chroma_pred_plane_sse2_1
-
-		POP_XMM
-		pop r4
-		pop r3
-		WELSEMMS
-		ret
-
-;*******************************************************************************
-;	0 |1 |2 |3 |4 |
-;	6 |7 |8 |9 |10|
-;	11|12|13|14|15|
-;	16|17|18|19|20|
-;	21|22|23|24|25|
-;	7 is the start pixel of current 4x4 block
-;	pPred[7] = ([6]+[0]*2+[1]+2)/4
-;	Border bytes 21,16,11,6,0,1,2,3 are packed into one register and filtered with that 3-tap formula.
-;   void WelsDecoderI4x4LumaPredDDR_mmx(uint8_t *pPred, const int32_t kiStride)
-;
-;*******************************************************************************
-WELS_EXTERN WelsDecoderI4x4LumaPredDDR_mmx
-	%assign push_num 0
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
-	mov r2, r0
-; byte positions in the comments below are 1-based: mmX[1] is the lowest byte, mmX[8] the highest
-	movq        mm1,[r2+r1-8]		;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11
-	movq        mm2,[r2-8]			;get value of 6 mm2[8] = 6
-	sub		r2, r1			;move r2 up to the row above the current block (position of 1)
-	punpckhbw   mm2,[r2-8]			;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6]
-	movd        mm3,[r2]			;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3]
-	punpckhwd   mm1,mm2				;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11]
-	psllq       mm3,18h				;mm3[5]=[1]
-	psrlq       mm1,28h				;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
-	por         mm3,mm1				;mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11]
-	movq        mm1,mm3				;mm1[6]=[3],mm1[5]=[2],mm1[4]=[1],mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
-	lea 		r2,[r2+r1*2-8h]		;r2 = address of position 12 minus 8
-	movq        mm4,[r2+r1]		;get value of 16, mm4[8]=[16]
-	psllq       mm3,8				;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0
-	psrlq       mm4,38h				;mm4[1]=[16]
-	por         mm3,mm4				;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16]
-	movq        mm2,mm3				;mm2[7]=[3],mm2[6]=[2],mm2[5]=[1],mm2[4]=[0],mm2[3]=[6],mm2[2]=[11],mm2[1]=[16]
-	movq        mm4,[r2+r1*2]		;mm4[8]=[21]
-	psllq       mm3,8				;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0
-	psrlq       mm4,38h				;mm4[1]=[21]
-	por         mm3,mm4				;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21]
-	movq        mm4,mm3				;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
-	pavgb       mm3,mm1				;mm3=([11]+[21]+1)/2
-	pxor        mm1,mm4				;find odd value in the lowest bit of each byte
-	pand        mm1,[mmx_01bytes]	;set the odd bit
-	psubusb     mm3,mm1				;decrease 1 from odd bytes
-	pavgb       mm2,mm3				;mm2=(([11]+[21]+1)/2+1+[16])/2
-; the four rows are 4-byte windows of mm2, each shifted one byte further than the row below it
-	lea         r0,[r0+r1]
-	movd        [r0+2*r1],mm2	; row 3
-	sub         r0,r1
-	psrlq       mm2,8
-	movd        [r0+2*r1],mm2	; row 2
-	psrlq       mm2,8
-	movd        [r0+r1],mm2	; row 1
-	psrlq       mm2,8
-	movd        [r0],mm2	; row 0
-	WELSEMMS
-	ret
-
-
-;*******************************************************************************
-;	void WelsDecoderIChromaPredH_mmx(uint8_t *pPred, const int32_t kiStride)
-;   For each of 8 rows, fills the row's 8 bytes with the byte just left of that row.
-;*******************************************************************************
-%macro MMX_PRED_H_8X8_ONE_LINE 4
-	movq		%1,		[%3-8]	; load the 8 bytes that end just before %3
-	psrlq		%1,		38h	; keep only the last byte, moved down to byte 0
-; multiplying by 0x0101 duplicates the byte within the low word; pshufw then copies that word to all four words (%2 is unused)
-	pmullw		%1,		[mmx_01bytes]
-	pshufw		%1,		%1,	0
-	movq		[%4],	%1	; store the 8 identical bytes to the destination row
-%endmacro
-; MMX_PRED_H_8X8_ONE_LINEE: same as above, but the source is one row further down ([%3+r1-8]; r1 is hard-coded)
-%macro MMX_PRED_H_8X8_ONE_LINEE 4
-	movq		%1,		[%3+r1-8]
-	psrlq		%1,		38h
-
-	pmullw		%1,		[mmx_01bytes]
-	pshufw		%1,		%1,	0
-	movq		[%4],	%1
-%endmacro
-; r2 walks the source rows and r0 the destination rows, each advanced two rows at a time
-WELS_EXTERN WelsDecoderIChromaPredH_mmx
-	%assign push_num 0
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
-	mov r2, r0
-; row 0, written inline with the same sequence as MMX_PRED_H_8X8_ONE_LINE
-	movq		mm0,	[r2-8]
-	psrlq		mm0,	38h
-
-	pmullw		mm0,		[mmx_01bytes]
-	pshufw		mm0,	mm0,	0
-	movq		[r0],	mm0
-; row 1
-	MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
-; row 2
-	lea			r2, [r2+r1*2]
-	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, r2, r0+2*r1
-; row 3
-	lea         r0, [r0+2*r1]
-	MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
-; row 4
-	lea			r2, [r2+r1*2]
-	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, r2, r0+2*r1
-; row 5
-	lea         r0, [r0+2*r1]
-	MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
-; row 6
-	lea			r2, [r2+r1*2]
-	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, r2, r0+2*r1
-; row 7
-    	lea         r0, [r0+2*r1]
-	MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
-
-	WELSEMMS
-	ret
-
-
-;*******************************************************************************
-;	void WelsDecoderIChromaPredV_mmx(uint8_t *pPred, const int32_t kiStride)
-;   Copies the 8 bytes of the row directly above the block into each of the 8 block rows.
-;*******************************************************************************
-WELS_EXTERN WelsDecoderIChromaPredV_mmx
-	%assign push_num 0
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
-; r0/r1 presumably hold pPred/kiStride after LOAD_2_PARA (asm_inc.asm) -- TODO confirm
-	sub			r0,		r1	; r0 -> row above the block
-	movq		mm0,		[r0]	; the 8 source bytes
-
-	movq		[r0+r1],		mm0	; row 0
-	movq		[r0+2*r1],	mm0	; row 1
-	lea         r0, [r0+2*r1]
-	movq		[r0+r1],      mm0	; row 2
-	movq		[r0+2*r1],    mm0	; row 3
-	lea         r0, [r0+2*r1]
-	movq		[r0+r1],      mm0	; row 4
-	movq		[r0+2*r1],    mm0	; row 5
-	lea         r0, [r0+2*r1]
-	movq		[r0+r1],      mm0	; row 6
-	movq		[r0+2*r1],    mm0	; row 7
-
-	WELSEMMS
-	ret
-
-
-;*******************************************************************************
-;	lt|t0|t1|t2|t3|
-;	l0|
-;	l1|
-;	l2|
-;	l3|
-;	t3 is never used
-;   destination:
-;	|a |b |c |d |
-;	|e |f |a |b |
-;	|g |h |e |f |
-;	|i |j |g |h |
-;   2-tap averages (computed with pavgb):
-;   a = (1 + lt + l0)>>1
-;   e = (1 + l0 + l1)>>1
-;   g = (1 + l1 + l2)>>1
-;   i = (1 + l2 + l3)>>1
-;   3-tap averages (pavgb pair with a rounding correction):
-;   d = (2 + t0 + (t1<<1) + t2)>>2
-;   c = (2 + lt + (t0<<1) + t1)>>2
-;   b = (2 + l0 + (lt<<1) + t0)>>2
-
-;   f = (2 + l1 + (l0<<1) + lt)>>2
-;   h = (2 + l2 + (l1<<1) + l0)>>2
-;   j = (2 + l3 + (l2<<1) + l1)>>2
-;   [b a f e h g j i] + [d c b a] --> mov to memory
-;
-;   void WelsDecoderI4x4LumaPredHD_mmx(uint8_t *pPred, const int32_t kiStride)
-;*******************************************************************************
-WELS_EXTERN WelsDecoderI4x4LumaPredHD_mmx
-	%assign push_num 0
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
-	mov r2, r0
-	sub         r2, r1	; r2 -> row above the block
-	movd        mm0, [r2-1]            ; mm0 = [xx xx xx xx t2 t1 t0 lt]
-	psllq       mm0, 20h                ; mm0 = [t2 t1 t0 lt xx xx xx xx]
-
-	movd        mm1, [r2+2*r1-4]
-	punpcklbw   mm1, [r2+r1-4]        ; mm1[7] = l0, mm1[6] = l1
-	lea         r2, [r2+2*r1]
-	movd        mm2, [r2+2*r1-4]
-	punpcklbw   mm2, [r2+r1-4]        ; mm2[7] = l2, mm2[6] = l3
-	punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
-	psrlq       mm2, 20h
-	pxor        mm0, mm2                ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
-
-	movq        mm1, mm0
-	psrlq       mm1, 10h                ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
-	movq        mm2, mm0
-	psrlq       mm2, 8h                 ; mm2 = [xx t2 t1 t0 lt l0 l1 l2]
-	movq        mm3, mm2
-	movq        mm4, mm1
-	pavgb       mm1, mm0
-
-	pxor        mm4, mm0				; find odd value in the lowest bit of each byte
-	pand        mm4, [mmx_01bytes]	    ; set the odd bit
-	psubusb     mm1, mm4				; decrease 1 from odd bytes
-
-	pavgb       mm2, mm1                ; mm2 = [xx xx d  c  b  f  h  j]
-
-	movq        mm4, mm0
-	pavgb       mm3, mm4                ; mm3 = [xx xx xx xx a  e  g  i]
-	punpcklbw   mm3, mm2                ; mm3 = [b  a  f  e  h  g  j  i]
-
-	psrlq       mm2, 20h
-	psllq       mm2, 30h                ; mm2 = [d  c  0  0  0  0  0  0]
-	movq        mm4, mm3
-	psrlq       mm4, 10h                ; mm4 = [0  0  b  a  f  e  h  g]
-	pxor        mm2, mm4                ; mm2 = [d  c  b  a  xx xx xx xx]
-	psrlq       mm2, 20h                ; mm2 = [xx xx xx xx  d  c  b  a]
-
-	movd        [r0], mm2	; row 0 = a b c d
-	lea         r0, [r0+r1]
-	movd        [r0+2*r1], mm3	; row 3 = i j g h
-	sub         r0, r1
-	psrlq       mm3, 10h
-	movd        [r0+2*r1], mm3	; row 2 = g h e f
-	psrlq       mm3, 10h
-	movd        [r0+r1], mm3	; row 1 = e f a b
-	WELSEMMS
-	ret
-
-
-
-;*******************************************************************************
-;	lt|t0|t1|t2|t3|
-;	l0|
-;	l1|
-;	l2|
-;	l3|
-;	t3 is never used
-;   destination:
-;	|a |b |c |d |
-;	|c |d |e |f |
-;	|e |f |g |g |
-;	|g |g |g |g |
-;   2-tap averages (computed with pavgb); g is a plain copy of l3:
-;   a = (1 + l0 + l1)>>1
-;   c = (1 + l1 + l2)>>1
-;   e = (1 + l2 + l3)>>1
-;   g = l3
-;   3-tap averages (pavgb pair with a rounding correction):
-;   b = (2 + l0 + (l1<<1) + l2)>>2
-;   d = (2 + l1 + (l2<<1) + l3)>>2
-;   f = (2 + l2 + (l3<<1) + l3)>>2
-
-;   [g g f e d c b a] + [g g g g] --> mov to memory
-;
-;   void WelsDecoderI4x4LumaPredHU_mmx(uint8_t *pPred, const int32_t kiStride)
-;*******************************************************************************
-WELS_EXTERN WelsDecoderI4x4LumaPredHU_mmx
-	%assign push_num 0
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
-	mov r2, r0
-; only the left-column bytes (at offset -1 in each row) are read below
-	movd        mm0, [r2-4]            ; mm0[3] = l0
-	punpcklbw   mm0, [r2+r1-4]        ; mm0[7] = l1, mm0[6] = l0
-	lea         r2, [r2+2*r1]
-	movd        mm2, [r2-4]            ; mm2[3] = l2
-	movd        mm4, [r2+r1-4]        ; mm4[3] = l3
-	punpcklbw   mm2, mm4
-	punpckhwd   mm0, mm2                ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
-
-	psrlq       mm4, 18h
-	psllq       mm4, 38h                ; mm4 = [l3 xx xx xx xx xx xx xx]
-	psrlq       mm0, 8h
-	pxor        mm0, mm4                ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
-
-	movq        mm1, mm0
-	psllq       mm1, 8h                 ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
-	movq        mm3, mm1                ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
-	pavgb       mm1, mm0                ; mm1 = [g  e  c  a  xx xx xx xx]
-
-	movq        mm2, mm0
-	psllq       mm2, 10h                ; mm2 = [l2 l1 l0 xx xx xx xx xx]
-	movq        mm5, mm2
-	pavgb       mm2, mm0
-
-	pxor        mm5, mm0				; find odd value in the lowest bit of each byte
-	pand        mm5, [mmx_01bytes]	    ; set the odd bit
-	psubusb     mm2, mm5				; decrease 1 from odd bytes
-
-	pavgb       mm2, mm3                ; mm2 = [f  d  b  xx xx xx xx xx]
-
-	psrlq       mm2, 8h
-	pxor        mm2, mm4                ; mm2 = [g  f  d  b  xx xx xx xx]
-
-	punpckhbw   mm1, mm2                ; mm1 = [g  g  f  e  d  c  b  a]
-	punpckhbw   mm4, mm4                ; mm4 = [g  g  xx xx xx xx xx xx]
-	punpckhbw   mm4, mm4                ; mm4 = [g  g  g  g  xx xx xx xx]
-
-	psrlq       mm4, 20h
-	lea         r0, [r0+r1]
-	movd        [r0+2*r1], mm4	; row 3 = g g g g
-
-	sub         r0, r1
-	movd        [r0], mm1	; row 0 = a b c d
-	psrlq       mm1, 10h
-	movd        [r0+r1], mm1	; row 1 = c d e f
-	psrlq       mm1, 10h
-	movd        [r0+2*r1], mm1	; row 2 = e f g g
-	WELSEMMS
-	ret
-
-
-
-;*******************************************************************************
-;	lt|t0|t1|t2|t3|
-;	l0|
-;	l1|
-;	l2|
-;	l3|
-;	l3 will never been used
-;   destination:
-;	|a |b |c |d |
-;	|e |f |g |h |
-;	|i |a |b |c |
-;	|j |e |f |g |
-
-;   a = (1 + lt + t0)>>1
-;   b = (1 + t0 + t1)>>1
-;   c = (1 + t1 + t2)>>1
-;   d = (1 + t2 + t3)>>1
-
-;   e = (2 + l0 + (lt<<1) + t0)>>2
-;   f = (2 + lt + (t0<<1) + t1)>>2
-;   g = (2 + t0 + (t1<<1) + t2)>>2
-
-;   h = (2 + t1 + (t2<<1) + t3)>>2
-;   i = (2 + lt + (l0<<1) + l1)>>2
-;   j = (2 + l0 + (l1<<1) + l2)>>2
-;
-;   void WelsDecoderI4x4LumaPredVR_mmx(uint8_t *pPred, const int32_t kiStride)
-;*******************************************************************************
-WELS_EXTERN WelsDecoderI4x4LumaPredVR_mmx
-	%assign push_num 0
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
-	mov r2, r0
-	sub         r2, r1
-	movq        mm0, [r2-1]            ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
-	psllq       mm0, 18h                ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
-
-	movd        mm1, [r2+2*r1-4]
-	punpcklbw   mm1, [r2+r1-4]        ; mm1[7] = l0, mm1[6] = l1
-	lea         r2, [r2+2*r1]
-	movq        mm2, [r2+r1-8]        ; mm2[7] = l2
-	punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 xx xx xx xx xx]
-	psrlq       mm2, 28h
-	pxor        mm0, mm2                ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
-
-	movq        mm1, mm0
-	psllq       mm1, 8h                 ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
-	pavgb       mm1, mm0                ; mm1 = [d  c  b  a  xx xx xx xx]
-
-	movq        mm2, mm0
-	psllq       mm2, 10h                ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
-	movq        mm3, mm2
-	pavgb       mm2, mm0
-
-	pxor        mm3, mm0				; find odd value in the lowest bit of each byte
-	pand        mm3, [mmx_01bytes]	    ; set the odd bit
-	psubusb     mm2, mm3				; decrease 1 from odd bytes
-
-	movq        mm3, mm0
-	psllq       mm3, 8h                 ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
-	pavgb       mm3, mm2                ; mm3 = [h  g  f  e  i  j  xx xx]
-	movq        mm2, mm3
-
-	psrlq       mm1, 20h                ; mm1 = [xx xx xx xx d  c  b  a]
-	movd        [r0], mm1
-
-	psrlq       mm2, 20h                ; mm2 = [xx xx xx xx h  g  f  e]
-	movd        [r0+r1], mm2
-
-	movq        mm4, mm3
-	psllq       mm4, 20h
-	psrlq       mm4, 38h                ; mm4 = [xx xx xx xx xx xx xx i]
-
-	movq        mm5, mm3
-	psllq       mm5, 28h
-	psrlq       mm5, 38h                ; mm5 = [xx xx xx xx xx xx xx j]
-
-	psllq       mm1, 8h
-	pxor        mm4, mm1                ; mm4 = [xx xx xx xx c  b  a  i]
-	movd        [r0+2*r1], mm4
-
-	psllq       mm2, 8h
-	pxor        mm5, mm2                ; mm5 = [xx xx xx xx g  f  e  j]
-	lea         r0, [r0+2*r1]
-	movd        [r0+r1], mm5
-	WELSEMMS
-	ret
-
-;*******************************************************************************
-;	lt|t0|t1|t2|t3|t4|t5|t6|t7
-;	l0|
-;	l1|
-;	l2|
-;	l3|
-;	lt,t0,t1,t2,t3 will never been used
-;   destination:
-;	|a |b |c |d |
-;	|b |c |d |e |
-;	|c |d |e |f |
-;	|d |e |f |g |
-
-;   a = (2 + t0 + t2 + (t1<<1))>>2
-;   b = (2 + t1 + t3 + (t2<<1))>>2
-;   c = (2 + t2 + t4 + (t3<<1))>>2
-;   d = (2 + t3 + t5 + (t4<<1))>>2
-
-;   e = (2 + t4 + t6 + (t5<<1))>>2
-;   f = (2 + t5 + t7 + (t6<<1))>>2
-;   g = (2 + t6 + t7 + (t7<<1))>>2
-
-;   [g f e d c b a] --> mov to memory
-;
-;   void WelsDecoderI4x4LumaPredDDL_mmx(uint8_t *pPred, const int32_t kiStride)
-;*******************************************************************************
-WELS_EXTERN WelsDecoderI4x4LumaPredDDL_mmx
-	%assign push_num 0
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
-	mov r2, r0
-	sub         r2, r1
-	movq        mm0, [r2]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
-	movq        mm1, mm0
-	movq        mm2, mm0
-
-	movq        mm3, mm0
-	psrlq       mm3, 38h
-	psllq       mm3, 38h                ; mm3 = [t7 xx xx xx xx xx xx xx]
-
-	psllq       mm1, 8h                 ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
-	psrlq       mm2, 8h
-	pxor        mm2, mm3                ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]
-
-	movq        mm3, mm1
-	pavgb       mm1, mm2
-	pxor        mm3, mm2				; find odd value in the lowest bit of each byte
-	pand        mm3, [mmx_01bytes]	    ; set the odd bit
-	psubusb     mm1, mm3				; decrease 1 from odd bytes
-
-	pavgb       mm0, mm1                ; mm0 = [g f e d c b a xx]
-
-	psrlq       mm0, 8h
-	movd        [r0], mm0
-	psrlq       mm0, 8h
-	movd        [r0+r1], mm0
-	psrlq       mm0, 8h
-	movd        [r0+2*r1], mm0
-	psrlq       mm0, 8h
-	lea         r0, [r0+2*r1]
-	movd        [r0+r1], mm0
-	WELSEMMS
-	ret
-
-
-;*******************************************************************************
-;	lt|t0|t1|t2|t3|t4|t5|t6|t7
-;	l0|
-;	l1|
-;	l2|
-;	l3|
-;	lt,t0,t1,t2,t3 will never been used
-;   destination:
-;	|a |b |c |d |
-;	|e |f |g |h |
-;	|b |c |d |i |
-;	|f |g |h |j |
-
-;   a = (1 + t0 + t1)>>1
-;   b = (1 + t1 + t2)>>1
-;   c = (1 + t2 + t3)>>1
-;   d = (1 + t3 + t4)>>1
-;   i = (1 + t4 + t5)>>1
-
-;   e = (2 + t0 + (t1<<1) + t2)>>2
-;   f = (2 + t1 + (t2<<1) + t3)>>2
-;   g = (2 + t2 + (t3<<1) + t4)>>2
-;   h = (2 + t3 + (t4<<1) + t5)>>2
-;   j = (2 + t4 + (t5<<1) + t6)>>2
-
-;   [i d c b a] + [j h g f e] --> mov to memory
-;
-;   void WelsDecoderI4x4LumaPredVL_mmx(uint8_t *pPred, const int32_t kiStride)
-;*******************************************************************************
-WELS_EXTERN WelsDecoderI4x4LumaPredVL_mmx
-	%assign push_num 0
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
-	mov r2, r0
-
-	sub         r2, r1
-	movq        mm0, [r2]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
-	movq        mm1, mm0
-	movq        mm2, mm0
-
-	psrlq       mm1, 8h                 ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
-	psrlq       mm2, 10h                ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]
-
-	movq        mm3, mm1
-	pavgb       mm3, mm0                ; mm3 = [xx xx xx i  d  c  b  a]
-
-	movq        mm4, mm2
-	pavgb       mm2, mm0
-	pxor        mm4, mm0				; find odd value in the lowest bit of each byte
-	pand        mm4, [mmx_01bytes]	    ; set the odd bit
-	psubusb     mm2, mm4				; decrease 1 from odd bytes
-
-	pavgb       mm2, mm1                ; mm2 = [xx xx xx j  h  g  f  e]
-
-	movd        [r0], mm3
-	psrlq       mm3, 8h
-	movd        [r0+2*r1], mm3
-
-	movd        [r0+r1], mm2
-	psrlq       mm2, 8h
-	lea         r0, [r0+2*r1]
-	movd        [r0+r1], mm2
-	WELSEMMS
-	ret
-
-;*******************************************************************************
-;
-;   void WelsDecoderIChromaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
-;*******************************************************************************
-WELS_EXTERN WelsDecoderIChromaPredDc_sse2
-	push 	r3
-	push 	r4
-	%assign push_num 2
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
-	mov r4, r0
-
-	sub         r0, r1
-	movq        mm0, [r0]
-
-	movzx		r2, byte [r0+r1-0x01] ; l1
-	lea         r0, [r0+2*r1]
-	movzx		r3, byte [r0-0x01]     ; l2
-	add			r2, r3
-	movzx		r3, byte [r0+r1-0x01] ; l3
-	add			r2, r3
-	lea         r0, [r0+2*r1]
-	movzx		r3, byte [r0-0x01]     ; l4
-	add			r2, r3
-	movd        mm1, r2d                 ; mm1 = l1+l2+l3+l4
-
-	movzx		r2, byte [r0+r1-0x01] ; l5
-	lea         r0, [r0+2*r1]
-	movzx		r3, byte [r0-0x01]     ; l6
-	add			r2, r3
-	movzx		r3, byte [r0+r1-0x01] ; l7
-	add			r2, r3
-	lea         r0, [r0+2*r1]
-	movzx		r3, byte [r0-0x01]     ; l8
-	add			r2, r3
-	movd        mm2, r2d                 ; mm2 = l5+l6+l7+l8
-
-	movq        mm3, mm0
-	psrlq       mm0, 0x20
-	psllq       mm3, 0x20
-	psrlq       mm3, 0x20
-	pxor		mm4, mm4
-	psadbw		mm0, mm4
-	psadbw		mm3, mm4                 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
-
-	paddq       mm3, mm1
-	movq        mm1, mm2
-	paddq       mm1, mm0;                ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
-
-	movq        mm4, [mmx_0x02]
-
-	paddq       mm0, mm4
-	psrlq       mm0, 0x02
-
-	paddq       mm2, mm4
-	psrlq       mm2, 0x02
-
-	paddq       mm3, mm4
-	paddq       mm3, mm4
-	psrlq       mm3, 0x03
-
-	paddq       mm1, mm4
-	paddq       mm1, mm4
-	psrlq       mm1, 0x03
-
-	pmuludq     mm0, [mmx_01bytes]
-	pmuludq     mm3, [mmx_01bytes]
-	psllq       mm0, 0x20
-	pxor        mm0, mm3                 ; mm0 = m_up
-
-	pmuludq     mm2, [mmx_01bytes]
-	pmuludq     mm1, [mmx_01bytes]
-	psllq       mm1, 0x20
-	pxor        mm1, mm2                 ; mm2 = m_down
-
-	movq        [r4],       mm0
-	movq        [r4+r1],   mm0
-	movq        [r4+2*r1], mm0
-	lea         r4, [r4+2*r1]
-	movq        [r4+r1],   mm0
-
-	movq        [r4+2*r1], mm1
-	lea         r4, [r4+2*r1]
-	movq        [r4+r1],   mm1
-	movq        [r4+2*r1], mm1
-	lea         r4, [r4+2*r1]
-	movq        [r4+r1],   mm1
-
-	pop r4
-	pop r3
-	WELSEMMS
-	ret
-
-
-
-;*******************************************************************************
-;
-;   void WelsDecoderI16x16LumaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
-;*******************************************************************************
-WELS_EXTERN WelsDecoderI16x16LumaPredDc_sse2
-	push 	r3
-	push 	r4
-	%assign push_num 2
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
-	mov r4, r0
-	sub         r0, r1
-	movdqa      xmm0, [r0]             ; read one row
-	pxor		xmm1, xmm1
-	psadbw		xmm0, xmm1
-	movdqa      xmm1, xmm0
-	psrldq      xmm1, 0x08
-	pslldq      xmm0, 0x08
-	psrldq      xmm0, 0x08
-	paddw       xmm0, xmm1
-
-	movzx		r2, byte [r0+r1-0x01]
-	movzx		r3, byte [r0+2*r1-0x01]
-	add		r2, r3
-	lea    		r0, [r0+r1]
-	LOAD_2_LEFT_AND_ADD
-	LOAD_2_LEFT_AND_ADD
-	LOAD_2_LEFT_AND_ADD
-	LOAD_2_LEFT_AND_ADD
-	LOAD_2_LEFT_AND_ADD
-	LOAD_2_LEFT_AND_ADD
-	LOAD_2_LEFT_AND_ADD
-	add         r2, 0x10
-	movd        xmm1, r2d
-	paddw       xmm0, xmm1
-	psrld       xmm0, 0x05
-	pmuludq     xmm0, [mmx_01bytes]
-	pshufd      xmm0, xmm0, 0
-
-	movdqa      [r4],       xmm0
-	movdqa      [r4+r1],   xmm0
-	movdqa      [r4+2*r1], xmm0
-	lea         r4,         [r4+2*r1]
-
-	movdqa      [r4+r1],   xmm0
-	movdqa      [r4+2*r1], xmm0
-	lea         r4,         [r4+2*r1]
-
-	movdqa      [r4+r1],   xmm0
-	movdqa      [r4+2*r1], xmm0
-	lea         r4,         [r4+2*r1]
-
-	movdqa      [r4+r1],   xmm0
-	movdqa      [r4+2*r1], xmm0
-	lea         r4,         [r4+2*r1]
-
-	movdqa      [r4+r1],   xmm0
-	movdqa      [r4+2*r1], xmm0
-	lea         r4,         [r4+2*r1]
-
-	movdqa      [r4+r1],   xmm0
-	movdqa      [r4+2*r1], xmm0
-	lea         r4,         [r4+2*r1]
-
-	movdqa      [r4+r1],   xmm0
-	movdqa      [r4+2*r1], xmm0
-	lea         r4,         [r4+2*r1]
-
-	movdqa      [r4+r1],   xmm0
-
-	pop r4
-	pop r3
-
-	ret
-
-;*******************************************************************************
-; for intra prediction as follows, 11/19/2010
-;*******************************************************************************
-
-;*******************************************************************************
-;	void WelsDecoderI16x16LumaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride)
-;*******************************************************************************
-WELS_EXTERN WelsDecoderI16x16LumaPredDcTop_sse2
-	%assign push_num 0
-	LOAD_2_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	mov r2, r0
-	sub r2, r1
-	movdqa xmm0, [r2]		; pPred-kiStride, top line
-	pxor xmm7, xmm7
-	psadbw xmm0, xmm7
-	movdqa xmm1, xmm0
-	psrldq xmm1, 8
-	paddw  xmm0, xmm1
-	xor r2, r2
-	movd r2d, xmm0
-	;movdqa xmm1, xmm0
-	;punpcklbw xmm0, xmm7
-	;punpckhbw xmm1, xmm7
-
-	;paddw xmm0, xmm1			; (ub.max(ff) << 4) will not excceed of uw, so can perform it in unit of unsigned word scope
-	;pshufd xmm1, xmm0, 04eh		; 01001110, w3w2w1w0,w7w6w5w4
-	;paddw xmm0, xmm1			; w3+7 w2+6 w1+5 w0+4 w3+7 w2+6 w1+5 w0+4
-	;pshufd xmm1, xmm0, 0b1h		; 10110001, w1+5 w0+4 w3+7 w2+6 w1+5 w0+4 w3+7 w2+6
-	;paddw xmm0, xmm1			; w_o w_e w_o w_e w_o w_e w_o w_e (w_o=1+3+5+7, w_e=0+2+4+6)
-	;pshuflw xmm1, xmm0, 0b1h	; 10110001
-	;paddw xmm0, xmm1			; sum in word unit (x8)
-	;xor r3, r3
-	;movd r3d, xmm0
-	;and edx, 0ffffh
-
-	add r2, 8
-	sar r2, 4
-	SSE2_Copy16Times xmm1, r2d
-	;mov dh, dl
-	;mov r2, edx
-	;shl r2, 010h
-	;or edx, r2
-	;movd xmm1, edx
-	;pshufd xmm0, xmm1, 00h
-	;movdqa xmm1, xmm0
-	movdqa xmm0, xmm1
-	lea r2, [2*r1+r1]		; 3*kiStride
-
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm0
-	movdqa [r0+r2], xmm1
-
-	lea r0, [r0+4*r1]
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm0
-	movdqa [r0+r2], xmm1
-
-	lea r0, [r0+4*r1]
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm0
-	movdqa [r0+r2], xmm1
-
-	lea r0, [r0+4*r1]
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm0
-	movdqa [r0+r2], xmm1
-
-	POP_XMM
-	ret
-
-;*******************************************************************************
-;	void WelsDecoderI16x16LumaPredDcNA_sse2(uint8_t *pPred, const int32_t kiStride)
-;*******************************************************************************
-WELS_EXTERN WelsDecoderI16x16LumaPredDcNA_sse2
-	%assign push_num 0
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
-	lea r2, [2*r1+r1]		; 3*kiStride
-
-	movdqa xmm0, [sse2_dc_0x80]
-	movdqa xmm1, xmm0
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm0
-	movdqa [r0+r2], xmm1
-	lea r0, [r0+4*r1]
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm0
-	movdqa [r0+r2], xmm1
-	lea r0, [r0+4*r1]
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm0
-	movdqa [r0+r2], xmm1
-	lea r0, [r0+4*r1]
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm0
-	movdqa [r0+r2], xmm1
-
-	ret
-
-;*******************************************************************************
-;	void WelsDecoderIChromaPredDcLeft_mmx(uint8_t *pPred, const int32_t kiStride)
-;*******************************************************************************
-WELS_EXTERN WelsDecoderIChromaPredDcLeft_mmx
-	push r3
-	push r4
-	%assign push_num 2
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
-	mov r4, r0
-	; for left
-	dec r0
-	xor r2, r2
-	xor r3, r3
-	movzx r2, byte [r0]
-	movzx r3, byte [r0+r1]
-	add r2, r3
-	lea r0, [r0+2*r1]
-	movzx r3, byte [r0]
-	add r2, r3
-	movzx r3, byte [r0+r1]
-	add r2, r3
-	add r2, 02h
-	sar r2, 02h
-	;SSE2_Copy16Times mm0, r2d
-	mov r3, r2
-	sal r3, 8
-	or r2, r3
-	movd mm1, r2d
-	pshufw mm0, mm1, 00h
-	;mov bh, bl
-	;movd mm1, ebx
-	;pshufw mm0, mm1, 00h	; up64
-	movq mm1, mm0
-	xor r2, r2
-	lea r0, [r0+2*r1]
-	movzx r2, byte [r0]
-	movzx r3, byte [r0+r1]
-	add r2, r3
-	lea r0, [r0+2*r1]
-	movzx r3, byte [r0]
-	add r2, r3
-	movzx r3, byte [r0+r1]
-	add r2, r3
-	add r2, 02h
-	sar r2, 02h
-	mov r3, r2
-	sal r3, 8
-	or r2, r3
-	movd mm3, r2d
-	pshufw mm2, mm3, 00h
-	;mov bh, bl
-	;movd mm3, ebx
-	;pshufw mm2, mm3, 00h	; down64
-	;SSE2_Copy16Times mm2, r2d
-	movq mm3, mm2
-	lea r2, [2*r1+r1]
-	movq [r4], mm0
-	movq [r4+r1], mm1
-	movq [r4+2*r1], mm0
-	movq [r4+r2], mm1
-	lea r4, [r4+4*r1]
-	movq [r4], mm2
-	movq [r4+r1], mm3
-	movq [r4+2*r1], mm2
-	movq [r4+r2], mm3
-	pop r4
-	pop r3
-	emms
-	ret
-
-;*******************************************************************************
-;	void WelsDecoderIChromaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride)
-;*******************************************************************************
-WELS_EXTERN WelsDecoderIChromaPredDcTop_sse2
-	%assign push_num 0
-	LOAD_2_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	mov r2, r0
-	sub r2, r1
-	movq xmm0, [r2]		; top: 8x1 pixels
-	pxor xmm7, xmm7
-	punpcklbw xmm0, xmm7		; ext 8x2 words
-	pshufd xmm1, xmm0, 0B1h		; 10110001 B, w5 w4 w7 w6 w1 w0 w3 w2
-	paddw xmm0, xmm1			; w5+7 w4+6 w5+7 w4+6 w1+3 w0+2 w1+3 w0+2
-	movdqa xmm1, xmm0
-	pshuflw xmm2, xmm0, 0B1h	; 10110001 B, .. w0+2 w1+3 w0+2 w1+3
-	pshufhw xmm3, xmm1, 0B1h	; 10110001 B, w4+6 w5+7 w4+6 w5+7 ..
-	paddw xmm0, xmm2			; .. w0+..+3 w0+..+3 w0+..+3 w0+..+3
-	paddw xmm1, xmm3			; w4+..+7 w4+..+7 w4+..+7 w4+..+7 ..
-	punpckhqdq xmm1, xmm7
-	punpcklqdq xmm0, xmm1		; sum1 sum1 sum1 sum1 sum0 sum0 sum0 sum0
-	movdqa xmm6, [sse2_wd_0x02]
-	paddw xmm0, xmm6
-	psraw xmm0, 02h
-	packuswb xmm0, xmm7
-	lea r2, [2*r1+r1]
-	movq [r0], xmm0
-	movq [r0+r1], xmm0
-	movq [r0+2*r1], xmm0
-	movq [r0+r2], xmm0
-	lea r0, [r0+4*r1]
-	movq [r0], xmm0
-	movq [r0+r1], xmm0
-	movq [r0+2*r1], xmm0
-	movq [r0+r2], xmm0
-	POP_XMM
-	ret
-
-;*******************************************************************************
-;	void WelsDecoderIChromaPredDcNA_mmx(uint8_t *pPred, const int32_t kiStride)
-;*******************************************************************************
-WELS_EXTERN WelsDecoderIChromaPredDcNA_mmx
-	%assign push_num 0
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
-	lea r2, [2*r1+r1]
-	movq mm0, [sse2_dc_0x80]
-	movq mm1, mm0
-	movq [r0], mm0
-	movq [r0+r1], mm1
-	movq [r0+2*r1], mm0
-	movq [r0+r2], mm1
-	lea r0, [r0+4*r1]
-	movq [r0], mm0
-	movq [r0+r1], mm1
-	movq [r0+2*r1], mm0
-	movq [r0+r2], mm1
-	emms
-	ret
-
--- /dev/null
+++ b/codec/decoder/core/x86/block_add.asm
@@ -1,0 +1,151 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  block_add.asm
+;*
+;*  Abstract
+;*      add block
+;*
+;*  History
+;*      09/21/2009 Created
+;*
+;*
+;*************************************************************************/
+
+%include  "asm_inc.asm"
+
+;*******************************************************************************
+; Code
+;*******************************************************************************
+
+SECTION .text
+
+
+;*******************************************************************************
+;  void WelsResBlockZero16x16_sse2(int16_t* pBlock,int32_t iStride)  -- zero-fills 16 rows of 16 int16_t
+;*******************************************************************************
+WELS_EXTERN   WelsResBlockZero16x16_sse2
+        %assign push_num 0              ; nothing pushed before the argument-loading macro
+        LOAD_2_PARA                     ; presumably r0 = pBlock, r1 = iStride (macro from asm_inc.asm)
+        PUSH_XMM 8                      ; presumably preserves xmm registers for the ABI; paired with POP_XMM -- TODO confirm
+	SIGN_EXTENSION r1, r1d          ; presumably widens the 32-bit stride to pointer width -- TODO confirm
+        lea 	r1, 	[r1*2]          ; r1 = 2*iStride: row pitch in bytes if iStride counts int16_t elements
+        lea 	r2,	[r1*3]          ; r2 = 3 * byte pitch, used to address the 4th row of each group
+	; movdqa needs 16-byte-aligned addresses, so pBlock and the byte pitch must be multiples of 16.
+	pxor     xmm7,       xmm7       ; xmm7 = 0, the value stored everywhere below
+
+    ; rows 0-3: two 16-byte stores per row (32 bytes = 16 int16_t)
+	movdqa   [r0],      xmm7
+	movdqa   [r0+10h],  xmm7
+
+	movdqa   [r0+r1],  xmm7
+	movdqa   [r0+r1+10h],     xmm7
+
+    movdqa   [r0+r1*2],   xmm7
+	movdqa   [r0+r1*2+10h],   xmm7
+
+	movdqa   [r0+r2],     xmm7
+	movdqa   [r0+r2+10h],     xmm7
+
+    ; rows 4-7
+	lea      r0,       [r0+r1*4]    ; advance r0 by four rows
+	movdqa   [r0],      xmm7
+	movdqa   [r0+10h],  xmm7
+
+	movdqa   [r0+r1],  xmm7
+	movdqa   [r0+r1+10h],     xmm7
+
+    movdqa   [r0+r1*2],   xmm7
+	movdqa   [r0+r1*2+10h],   xmm7
+
+	movdqa   [r0+r2],     xmm7
+	movdqa   [r0+r2+10h],     xmm7
+
+	; rows 8-11
+	lea      r0,       [r0+r1*4]    ; advance r0 by four rows
+	movdqa   [r0],      xmm7
+	movdqa   [r0+10h],  xmm7
+
+	movdqa   [r0+r1],  xmm7
+	movdqa   [r0+r1+10h],     xmm7
+
+    movdqa   [r0+r1*2],   xmm7
+	movdqa   [r0+r1*2+10h],   xmm7
+
+	movdqa   [r0+r2],     xmm7
+	movdqa   [r0+r2+10h],     xmm7
+
+	; rows 12-15
+	lea      r0,       [r0+r1*4]    ; advance r0 by four rows
+	movdqa   [r0],      xmm7
+	movdqa   [r0+10h],  xmm7
+
+	movdqa   [r0+r1],  xmm7
+	movdqa   [r0+r1+10h],     xmm7
+
+    movdqa   [r0+r1*2],   xmm7
+	movdqa   [r0+r1*2+10h],   xmm7
+
+	movdqa   [r0+r2],     xmm7
+	movdqa   [r0+r2+10h],     xmm7
+
+	POP_XMM                         ; counterpart of PUSH_XMM above
+	ret
+
+
+;*******************************************************************************
+;  void WelsResBlockZero8x8_sse2(int16_t * pBlock, int32_t iStride)  -- zero-fills 8 rows of 8 int16_t
+;*******************************************************************************
+WELS_EXTERN   WelsResBlockZero8x8_sse2
+	  %assign push_num 0            ; nothing pushed before the argument-loading macro
+          LOAD_2_PARA                   ; presumably r0 = pBlock, r1 = iStride (macro from asm_inc.asm)
+          PUSH_XMM 8                    ; presumably preserves xmm registers for the ABI; paired with POP_XMM -- TODO confirm
+	  SIGN_EXTENSION r1, r1d        ; presumably widens the 32-bit stride to pointer width -- TODO confirm
+	  lea       r1,     [r1*2]      ; r1 = 2*iStride: row pitch in bytes if iStride counts int16_t elements
+	  lea       r2,     [r1*3]      ; r2 = 3 * byte pitch, used to address the 4th row of each group
+	  ; movdqa needs 16-byte-aligned addresses, so pBlock and the byte pitch must be multiples of 16.
+	  pxor      xmm7,          xmm7 ; xmm7 = 0, the value stored everywhere below
+	  ; rows 0-3: one 16-byte store per row (8 int16_t)
+	  movdqa    [r0],         xmm7
+	  movdqa    [r0+r1],     xmm7
+	  movdqa    [r0+r1*2],   xmm7
+	  movdqa    [r0+r2],     xmm7
+	  ; rows 4-7
+	  lea       r0,     [r0+r1*4]   ; advance r0 by four rows
+	  movdqa    [r0],         xmm7
+	  movdqa    [r0+r1],     xmm7
+	  movdqa    [r0+r1*2],   xmm7
+	  movdqa    [r0+r2],     xmm7
+
+
+	  POP_XMM                       ; counterpart of PUSH_XMM above
+	  ret
+
--- /dev/null
+++ b/codec/decoder/core/x86/dct.asm
@@ -1,0 +1,115 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  dct.asm
+;*
+;*  Abstract
+;*      WelsDctFourT4_sse2
+;*
+;*  History
+;*      8/4/2009 Created
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+;*******************************************************************************
+; Macros and other preprocessor constants
+;*******************************************************************************
+%macro MMX_SumSubDiv2 3 ; in: %1, %2   out: %3 = %1 + (%2 >> 1), %1 = (%1 >> 1) - %2   (%2 preserved)
+    movq    %3, %2      ; %3 = %2
+    psraw   %3, $01     ; %3 = %2 >> 1, arithmetic shift per 16-bit word ($01 is NASM hex notation)
+    paddw   %3, %1      ; %3 = %1 + (%2 >> 1)
+    psraw   %1, $01     ; %1 = %1 >> 1
+    psubw   %1, %2      ; %1 = (%1 >> 1) - %2
+%endmacro
+
+%macro MMX_SumSub 3     ; in: %1, %2   out: %1 = %1 + %2, %2 = %2 - %1 (old values)   %3 = scratch
+	movq    %3, %2      ; keep a copy of the original %2
+    psubw   %2, %1      ; %2 = %2 - %1, per 16-bit word
+    paddw   %1, %3      ; %1 = %1 + original %2
+%endmacro
+
+%macro MMX_IDCT 6       ; butterfly pass: inputs %2..%5; results left in %1, %3, %4, %5; %1 is overwritten, %6 is scratch
+    MMX_SumSub      %4, %5, %6      ; %4 = %4 + %5, %5 = %5 - %4 (old values)
+    MMX_SumSubDiv2  %3, %2, %1      ; %1 = %3 + (%2 >> 1), %3 = (%3 >> 1) - %2; %2 is left unchanged
+    MMX_SumSub		%1, %4, %6      ; %1 = %1 + %4, %4 = %4 - %1 (values from the two steps above)
+	MMX_SumSub		%3, %5, %6      ; %3 = %3 + %5, %5 = %5 - %3 (values from the steps above)
+%endmacro
+
+
+%macro MMX_StoreDiff4P 5 ; %1 = four 16-bit values, %2 = scratch, %3 = addend, %4 = interleave source, %5 = 4-byte memory operand
+    movd       %2, %5       ; load the four bytes currently at %5
+    punpcklbw  %2, %4       ; interleave with %4; widens bytes to unsigned words when %4 is zero (caller passes a zeroed register)
+    paddw      %1, %3       ; add %3 before the shift (the caller passes its rounding constant here)
+    psraw      %1, $06      ; arithmetic shift right by 6 ($06 is NASM hex notation)
+    paddsw     %1, %2       ; add the widened bytes, signed-saturating per word
+    packuswb   %1, %2       ; clamp words to 0..255 and pack; the low four bytes hold the result
+    movd       %5, %1       ; write the four result bytes back to %5
+%endmacro
+
+;*******************************************************************************
+; Code
+;*******************************************************************************
+
+SECTION .text
+
+;*******************************************************************************
+;   void IdctResAddPred_mmx( uint8_t *pPred, const int32_t kiStride, int16_t *pRs )
+;*******************************************************************************
+;   Transforms the 16 int16_t values at pRs and adds them, rounded and clamped, onto the 4x4 bytes at pPred.
+WELS_EXTERN IdctResAddPred_mmx
+    %assign push_num 0              ; nothing pushed before the argument-loading macro
+    LOAD_3_PARA                     ; presumably r0 = pPred, r1 = kiStride, r2 = pRs (macro from asm_inc.asm)
+    SIGN_EXTENSION r1, r1d          ; presumably widens the 32-bit stride to pointer width -- TODO confirm
+    movq    mm0, [r2+ 0]            ; four qwords = 16 contiguous int16_t (32 bytes) read from pRs
+    movq    mm1, [r2+ 8]
+    movq    mm2, [r2+16]
+    movq    mm3, [r2+24]
+    ; Two transpose + butterfly rounds; MMX_Trans4x4W comes from asm_inc.asm (presumably a 4x4 word transpose, last arg scratch).
+	MMX_Trans4x4W        mm0, mm1, mm2, mm3, mm4
+	MMX_IDCT			mm1, mm2, mm3, mm4, mm0, mm6 ; results in mm1, mm3, mm4, mm0; mm6 is scratch
+    MMX_Trans4x4W        mm1, mm3, mm0, mm4, mm2
+	MMX_IDCT			mm3, mm0, mm4, mm2, mm1, mm6 ; results in mm3, mm4, mm2, mm1; mm0 is free afterwards
+    ; Constants for MMX_StoreDiff4P; both macros come from asm_inc.asm.
+    WELS_Zero			mm7             ; presumably mm7 = 0, used to widen pPred bytes to words -- TODO confirm
+    WELS_DW32			mm6             ; presumably each word of mm6 = 32, the rounding term for the >> 6 -- TODO confirm
+    ; Per row: pPred[row] = clamp(((value + mm6) >> 6) + pPred[row]), four bytes at a time; mm0 is scratch.
+    MMX_StoreDiff4P    mm3, mm0, mm6, mm7, [r0]
+    MMX_StoreDiff4P    mm4, mm0, mm6, mm7, [r0+r1]
+    lea     r0, [r0+2*r1]           ; step down two rows
+    MMX_StoreDiff4P    mm1, mm0, mm6, mm7, [r0]
+    MMX_StoreDiff4P    mm2, mm0, mm6, mm7, [r0+r1]
+
+
+	emms                            ; clear MMX state so later x87 floating-point code works
+    ret
--- /dev/null
+++ b/codec/decoder/core/x86/intra_pred.asm
@@ -1,0 +1,1414 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  intra_pred.asm
+;*
+;*  Abstract
+;*      sse2 and mmx function for intra predict operations(decoder)
+;*
+;*  History
+;*      18/09/2009 Created
+;*		19/11/2010 Added
+;*					WelsDecoderI16x16LumaPredDcTop_sse2, WelsDecoderI16x16LumaPredDcNA_sse2,
+;*					WelsDecoderIChromaPredDcLeft_mmx, WelsDecoderIChromaPredDcTop_sse2
+;*					and WelsDecoderIChromaPredDcNA_mmx
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+;*******************************************************************************
+; Local Data (Read Only)
+;*******************************************************************************
+
+SECTION .rodata align=16
+%if 1
+	%define WELSEMMS	emms
+%else
+	%define WELSEMMS
+%endif
+; With the "%if 1" above, WELSEMMS always expands to emms. The tables below feed the plane / DC / directional predictors.
+align 16
+sse2_plane_inc_minus dw -7, -6, -5, -4, -3, -2, -1, 0	; multipliers of b for the left 8 columns of a 16x16 plane row
+align 16
+sse2_plane_inc dw 1, 2, 3, 4, 5, 6, 7, 8	; weights 1..8 for the H/V sums; also multipliers of b for the right 8 columns
+align 16
+sse2_plane_dec dw 8, 7, 6, 5, 4, 3, 2, 1	; weights 8..1 for the mirrored half of the H/V sums
+
+; for chroma plane mode (two 8-byte tables, read with movq)
+sse2_plane_inc_c dw 1, 2, 3, 4	; weights 1..4
+sse2_plane_dec_c dw 4, 3, 2, 1	; weights 4..1
+align 16
+sse2_plane_mul_b_c dw -3, -2, -1, 0, 1, 2, 3, 4	; multipliers of b for the 8 columns of a chroma plane row
+
+align 16
+mmx_01bytes:		times 16	db 1	; 0x01 in every byte: multiplier that replicates a byte value, and per-byte LSB mask
+
+align 16
+mmx_0x02: dw 0x02, 0x00, 0x00, 0x00	; qword constant 2 (rounding term in the chroma DC predictor)
+
+align 16
+sse2_dc_0x80: times 16 db 0x80	; 16 bytes of 0x80; not referenced in the part of the file reviewed here
+align 16
+sse2_wd_0x02: times 8 dw 0x02	; 8 words of 2; not referenced in the part of the file reviewed here
+
+;*******************************************************************************
+; macros
+;*******************************************************************************
+; args: arg1 = xmm result, arg2 = xmm temp, arg3 = xmm scratch, arg4 = row pointer, arg5 = stride (historically xmm0, xmm1, xmm2, eax, ecx)
+; result: low 64 bits of arg1 = byte [arg4-1] repeated 4 times, followed by byte [arg4+arg5-1] repeated 4 times
+%macro SSE2_PRED_H_4X4_TWO_LINE 5
+    movd		%1,	[%4-1]		; 4 bytes starting at the pixel left of the first row; byte 0 is that pixel
+	movdqa		%3,	%1
+	punpcklbw	%1,	%3		; every byte doubled
+	movdqa		%3,	%1
+	punpcklbw	%1,	%3		; every byte quadrupled: low dword = left pixel x4
+
+	;add			%4,	%5	(disabled: the second row is addressed as [arg4+arg5] instead, so arg4 is not modified)
+	movd		%2,	[%4+%5-1]	; same sequence for the next row
+	movdqa		%3,	%2
+	punpcklbw	%2,	%3
+	movdqa		%3,	%2
+	punpcklbw	%2,	%3		; low dword = second row's left pixel x4
+	punpckldq	%1,	%2		; dword 0 = first row, dword 1 = second row
+%endmacro
+
+; LOAD_COLUMN: gather one byte column over 8 rows. args: arg1 = result, arg2/arg3/arg4 = scratch, arg5 = pointer (advanced by 8 rows), arg6 = stride
+%macro	LOAD_COLUMN 6
+		movd	%1,	[%5]			; row 0: 4 bytes
+		movd	%2,	[%5+%6]			; row 1
+		punpcklbw %1,	%2			; bytes of rows 0/1 interleaved
+		lea		%5,	[%5+2*%6]
+		movd	%3,	[%5]			; row 2
+		movd	%2,	[%5+%6]			; row 3
+		punpcklbw %3,	%2			; bytes of rows 2/3 interleaved
+		punpcklwd %1,	%3			; dword k of arg1 = byte k of rows 0..3
+		lea		%5,	[%5+2*%6]
+		movd	%4,	[%5]			; row 4
+		movd	%2,	[%5+%6]			; row 5
+		punpcklbw %4,	%2
+		lea		%5,	[%5+2*%6]
+		movd	%3,	[%5]			; row 6
+		movd	%2,	[%5+%6]			; row 7
+		lea		%5,	[%5+2*%6]		; pointer now 8 rows below its entry value
+		punpcklbw %3,	%2
+		punpcklwd %4,	%3			; dword k of arg4 = byte k of rows 4..7
+		punpckhdq %1,	%4			; high 8 bytes of arg1 = byte 3 of rows 0..7 (bytes 8..11 rows 0..3, bytes 12..15 rows 4..7)
+%endmacro
+
+%macro  SUMW_HORIZON 3				; arg1 = 8 words d0..d7 in, sum out in its low dword; arg2 = scratch; arg3 = word source for the widening step
+	movhlps		%2, %1			; x2 low half = xx xx xx xx d7 d6 d5 d4
+	paddw		%1, %2			; x1 low half = d37 d26 d15 d04 (pairwise word sums)
+	punpcklwd	%1, %3			; x1 = four dwords: low word = d37/d26/d15/d04, high word taken from arg3
+	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26
+	paddd		%1, %2			; x1 = xxxx xxxx d1357 d0246
+	pshuflw		%2, %1, 0x4e	; x2 = xxxx xxxx d0246 d1357 (low two dwords swapped)
+	paddd		%1, %2			; x1 low dword = d01234567; bits above 15 are only meaningful if arg3 was zero
+%endmacro
+
+%macro  COPY_16_TIMES 2				; arg1 = pointer, arg2 = xmm out: byte [arg1-1] replicated into all 16 bytes
+		movdqa		%2,	[%1-16]		; aligned 16-byte load ending at the byte left of arg1 (requires arg1 16-byte aligned)
+		psrldq		%2,	15			; keep only the top byte, now in byte 0; everything else is 0
+		pmuludq		%2,	[mmx_01bytes]	; value * 0x01010101 -> the byte repeated across the low dword
+		pshufd		%2,	%2, 0		; broadcast the low dword to all four dwords
+%endmacro
+
+%macro  COPY_16_TIMESS 3			; like COPY_16_TIMES but for address arg1+arg3: arg2 = byte [arg1+arg3-1] replicated 16 times
+		movdqa		%2,	[%1+%3-16]	; aligned 16-byte load ending at the byte left of arg1+arg3
+		psrldq		%2,	15			; isolate the top byte in byte 0
+		pmuludq		%2,	[mmx_01bytes]	; replicate it across the low dword
+		pshufd		%2,	%2, 0		; broadcast to the whole register
+%endmacro
+
+%macro	LOAD_COLUMN_C 6				; 4-row variant: arg1 = result, arg2/arg3 = scratch, arg4 = unused, arg5 = pointer (advanced 4 rows), arg6 = stride
+		movd	%1,	[%5]			; row 0: 4 bytes
+		movd	%2,	[%5+%6]			; row 1
+		punpcklbw %1,%2				; bytes of rows 0/1 interleaved
+		lea		%5,	[%5+2*%6]
+		movd	%3,	[%5]			; row 2
+		movd	%2,	[%5+%6]			; row 3
+		punpcklbw %3,	%2			; bytes of rows 2/3 interleaved
+		punpckhwd %1,	%3			; with 64-bit registers: high 4 bytes of arg1 = byte 3 of rows 0..3
+		lea		%5,	[%5+2*%6]		; pointer now 4 rows below its entry value
+%endmacro
+
+%macro LOAD_2_LEFT_AND_ADD 0                    ; uses fixed registers: r0 = row pointer (advanced), r1 = stride, r2 = running sum, r3 = scratch
+        lea         r0, [r0+2*r1]               ; move down two rows
+        movzx		r3, byte [r0-0x01]          ; left neighbour of the new row
+        add			r2, r3
+        movzx		r3, byte [r0+r1-0x01]       ; left neighbour of the row after it
+        add			r2, r3
+%endmacro
+
+;*******************************************************************************
+; Code
+;*******************************************************************************
+
+SECTION .text
+
+
+;*******************************************************************************
+;   void WelsDecoderI4x4LumaPredH_sse2(uint8_t *pPred, const int32_t kiStride)
+;   4x4 horizontal prediction: row k is filled with the byte at pPred[k*kiStride - 1]
+;	pPred must align to 16 (original note; NOTE(review): only byte loads and movd stores touch pPred here - confirm)
+;*******************************************************************************
+WELS_EXTERN WelsDecoderI4x4LumaPredH_sse2
+	%assign push_num 0
+	LOAD_2_PARA					; macro not visible here; presumably r0 = pPred, r1 = kiStride
+	SIGN_EXTENSION r1, r1d
+
+	movzx		r2,	byte [r0-1]			; left neighbour of row 0
+	movd		xmm0,	r2d
+	pmuludq		xmm0,	[mmx_01bytes]	; * 0x01010101 -> the byte repeated in the low dword
+
+	movzx		r2,	byte [r0+r1-1]		; left neighbour of row 1
+	movd		xmm1,	r2d
+	pmuludq		xmm1,	[mmx_01bytes]
+
+	lea			r0,	[r0+r1]				; r0 = row 1, so rows 2 and 3 are reachable as [r0+r1] and [r0+2*r1]
+	movzx		r2,	byte [r0+r1-1]		; left neighbour of row 2
+	movd		xmm2,	r2d
+	pmuludq		xmm2,	[mmx_01bytes]
+
+	movzx		r2,	byte [r0+2*r1-1]	; left neighbour of row 3
+	movd		xmm3,	r2d
+	pmuludq		xmm3,	[mmx_01bytes]
+
+	sub         r0,    r1				; back to row 0
+	movd        [r0], xmm0				; row 0
+	movd        [r0+r1], xmm1			; row 1
+	lea         r0, [r0+2*r1]
+	movd        [r0], xmm2				; row 2
+	movd        [r0+r1], xmm3			; row 3
+
+	ret
+
+;*******************************************************************************
+; void WelsDecoderI16x16LumaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
+;*******************************************************************************
+WELS_EXTERN WelsDecoderI16x16LumaPredPlane_sse2
+		push r3
+		push r4
+		%assign push_num 2
+		LOAD_2_PARA				; macro not visible here; presumably r0 = pPred, r1 = kiStride
+		PUSH_XMM 8				; macro not visible here; presumably preserves xmm registers where the ABI requires it
+		SIGN_EXTENSION r1, r1d
+		mov r4, r0 ; save r0 in r4
+		sub		r0,	1
+		sub		r0,	r1			; r0 = pPred - kiStride - 1: the top-left neighbour
+
+		;for H
+		pxor	xmm7,	xmm7
+		movq	xmm0,	[r0]			; 8 bytes: top-left corner and top[0..6]
+		movdqa	xmm5,	[sse2_plane_dec]
+		punpcklbw xmm0,	xmm7			; to words
+		pmullw	xmm0,	xmm5			; weighted 8..1
+		movq	xmm1,	[r0 + 9]		; 8 bytes: top[8..15]
+		movdqa	xmm6,	[sse2_plane_inc]
+		punpcklbw xmm1,	xmm7
+		pmullw	xmm1,	xmm6			; weighted 1..8
+		psubw	xmm1,	xmm0			; per-lane differences whose sum is H
+
+		SUMW_HORIZON	xmm1,xmm0,xmm2	; xmm2 is not cleared here; only the low 16 bits of the sum are used below
+		movd    r2d,	xmm1		; H += (i + 1) * (top[8 + i] - top[6 - i]);
+		movsx	r2,	r2w				; keep the signed 16-bit sum
+		imul	r2,	5
+		add		r2,	32
+		sar		r2,	6			; b = (5 * H + 32) >> 6;
+		SSE2_Copy8Times	xmm1, r2d	; xmm1 = b,b,b,b,b,b,b,b
+
+		movzx	r3,	BYTE [r0+16]		; top[15]
+		sub	r0, 3						; so that the left-neighbour column is byte 3 of each 4-byte load
+		LOAD_COLUMN		xmm0, xmm2, xmm3, xmm4, r0, r1	; high half of xmm0 = top-left corner and left[0..6]; r0 moves down 8 rows
+
+		add		r0,	3
+		movzx	r2,	BYTE [r0+8*r1]		; left[15]
+		add		r3,	r2
+		shl		r3,	4			;	a = (left[15*kiStride] + top[15]) << 4;
+
+		sub	r0, 3
+		add		r0,	r1					; skip left[7]; start at left[8]
+		LOAD_COLUMN		xmm7, xmm2, xmm3, xmm4, r0, r1	; high half of xmm7 = left[8..15]
+		pxor	xmm4,	xmm4
+		punpckhbw xmm0,	xmm4			; column bytes to words
+		pmullw	xmm0,	xmm5			; weighted 8..1
+		punpckhbw xmm7,	xmm4
+		pmullw	xmm7,	xmm6			; weighted 1..8
+		psubw	xmm7,	xmm0			; per-lane differences whose sum is V
+
+		SUMW_HORIZON   xmm7,xmm0,xmm2
+		movd    r2d,   xmm7			; V
+		movsx	r2,	r2w
+
+		imul	r2,	5
+		add		r2,	32
+		sar		r2,	6				; c = (5 * V + 32) >> 6;
+		SSE2_Copy8Times	xmm4, r2d		; xmm4 = c,c,c,c,c,c,c,c
+
+		mov r0, r4					; r0 = pPred again (output pointer)
+		add		r3,	16
+		imul	r2,	-7
+		add		r3,	r2		; s = a + 16 + (-7)*c
+		SSE2_Copy8Times	xmm0, r3d		; xmm0 = s,s,s,s,s,s,s,s
+
+		xor		r2,	r2				; row counter
+		movdqa	xmm5,	[sse2_plane_inc_minus]	; column multipliers -7..0 (xmm6 still holds 1..8)
+
+get_i16x16_luma_pred_plane_sse2_1:
+		movdqa	xmm2,	xmm1
+		pmullw	xmm2,	xmm5			; b * (-7..0)
+		paddw	xmm2,	xmm0			; + s
+		psraw	xmm2,	5				; left 8 pixels of the row
+		movdqa	xmm3,	xmm1
+		pmullw	xmm3,	xmm6			; b * (1..8)
+		paddw	xmm3,	xmm0
+		psraw	xmm3,	5				; right 8 pixels of the row
+		packuswb xmm2,	xmm3			; clamp to 0..255 and pack 16 bytes
+		movdqa	[r0],	xmm2			; aligned store of one row
+		paddw	xmm0,	xmm4			; s += c for the next row
+		add		r0,	r1
+		inc		r2
+		cmp		r2,	16
+		jnz get_i16x16_luma_pred_plane_sse2_1	; 16 rows
+
+		POP_XMM
+		pop r4
+		pop r3
+		ret
+
+
+
+;*******************************************************************************
+; void WelsDecoderI16x16LumaPredH_sse2(uint8_t *pPred, const int32_t kiStride);
+;*******************************************************************************
+
+%macro SSE2_PRED_H_16X16_TWO_LINE_DEC 2	; arg1 = row pointer (advanced by two rows), arg2 = stride; clobbers xmm0
+    lea     %1,	[%1+%2*2]			; step down two rows first
+
+    COPY_16_TIMES %1,	xmm0			; xmm0 = left neighbour of this row, replicated 16 times
+    movdqa  [%1],	xmm0				; aligned 16-byte store of the row
+    COPY_16_TIMESS %1,	xmm0,	%2		; same for the following row
+    movdqa  [%1+%2],	xmm0
+%endmacro
+
+WELS_EXTERN WelsDecoderI16x16LumaPredH_sse2
+	%assign push_num 0
+	LOAD_2_PARA
+	SIGN_EXTENSION r1, r1d
+
+	; Horizontal prediction: each of the 16 rows is filled with the byte that
+	; sits immediately to its left (broadcast by COPY_16_TIMES / COPY_16_TIMESS).
+
+	; rows 0 and 1, addressed from the unmodified pointer in r0
+	COPY_16_TIMES  r0, xmm0
+	movdqa  [r0], xmm0
+	COPY_16_TIMESS r0, xmm0, r1
+	movdqa  [r0+r1], xmm0
+
+	; rows 2..15: seven expansions, each stepping r0 down two rows before storing
+%rep 7
+	SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
+%endrep
+	ret
+
+;*******************************************************************************
+; void WelsDecoderI16x16LumaPredV_sse2(uint8_t *pPred, const int32_t kiStride);
+;*******************************************************************************
+WELS_EXTERN WelsDecoderI16x16LumaPredV_sse2
+	%assign push_num 0
+	LOAD_2_PARA
+	SIGN_EXTENSION r1, r1d
+
+	; Vertical prediction: every one of the 16 rows of the block becomes a
+	; copy of the 16 bytes located one stride above the block.
+	;   r0   = running row pointer (starts one row above the block)
+	;   r1   = stride
+	;   xmm0 = the row above the block
+	; movdqa is used for the load and for all stores, so every row address
+	; has to be 16-byte aligned.
+
+	sub     r0, r1
+	movdqa  xmm0, [r0]
+
+	; row 0
+	movdqa  [r0+r1], xmm0
+
+	; rows 1..14, two per repetition; the preprocessor expands this into the
+	; same straight-line instruction sequence as a hand-written unroll
+%rep 7
+	lea     r0, [r0+2*r1]
+	movdqa  [r0],    xmm0
+	movdqa  [r0+r1], xmm0
+%endrep
+
+	; row 15
+	lea     r0, [r0+2*r1]
+	movdqa  [r0],    xmm0
+
+	; only xmm0 and r0 were modified
+
+	ret
+
+;*******************************************************************************
+; void WelsDecoderIChromaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
+;*******************************************************************************
+WELS_EXTERN WelsDecoderIChromaPredPlane_sse2
+		push r3
+		push r4
+		%assign push_num 2
+		LOAD_2_PARA				; macro not visible here; presumably r0 = pPred, r1 = kiStride
+		PUSH_XMM 8				; macro not visible here; presumably preserves xmm registers where the ABI requires it
+		SIGN_EXTENSION r1, r1d
+		mov r4, r0				; keep pPred for the output loop
+		sub		r0,	1
+		sub		r0,	r1			; r0 = pPred - kiStride - 1: the top-left neighbour
+
+		pxor	mm7,	mm7
+		movq	mm0,	[r0]			; 8 bytes starting at the top-left corner
+		movq	mm5,	[sse2_plane_dec_c]
+		punpcklbw mm0,	mm7			; low 4 bytes to words: corner, top[0..2]
+		pmullw	mm0,	mm5			; weighted 4..1
+		movq	mm1,	[r0 + 5]		; 8 bytes starting at top[4]
+		movq	mm6,	[sse2_plane_inc_c]
+		punpcklbw mm1,	mm7			; low 4 bytes to words: top[4..7]
+		pmullw	mm1,	mm6			; weighted 1..4
+		psubw	mm1,	mm0			; per-lane differences whose sum is H
+
+		movq2dq xmm1,   mm1			; move to SSE; upper half is zero
+		pxor    xmm2,   xmm2
+		SUMW_HORIZON	xmm1,xmm0,xmm2
+		movd    r2d,	xmm1			; H
+		movsx	r2,	r2w
+		imul	r2,	17
+		add		r2,	16
+		sar		r2,	5			; b = (17 * H + 16) >> 5;
+		SSE2_Copy8Times	xmm1, r2d	; xmm1 = b,b,b,b,b,b,b,b
+
+		movzx	r3,	BYTE [r0+8]		; top[7]
+		sub	r0, 3					; so that the left-neighbour column is byte 3 of each 4-byte load
+		LOAD_COLUMN_C	mm0, mm2, mm3, mm4, r0, r1	; high half of mm0 = corner and left[0..2]; r0 moves down 4 rows
+
+		add		r0,	3
+		movzx	r2,	BYTE [r0+4*r1]		; left[7]
+		add		r3,	r2
+		shl		r3,	4			; a = (left[7*kiStride] + top[7]) << 4;
+
+		sub	r0, 3
+		add		r0,	r1				; skip left[3]; start at left[4]
+		LOAD_COLUMN_C	mm7, mm2, mm3, mm4, r0, r1	; high half of mm7 = left[4..7]
+		pxor	mm4,	mm4
+		punpckhbw mm0,	mm4			; column bytes to words
+		pmullw	mm0,	mm5			; weighted 4..1
+		punpckhbw mm7,	mm4
+		pmullw	mm7,	mm6			; weighted 1..4
+		psubw	mm7,	mm0			; per-lane differences whose sum is V
+
+		movq2dq xmm7,   mm7
+		pxor    xmm2,   xmm2
+		SUMW_HORIZON	xmm7,xmm0,xmm2
+		movd    r2d,    xmm7			; V
+		movsx	r2,	r2w
+
+		imul	r2,	17
+		add		r2,	16
+		sar		r2,	5				; c = (17 * V + 16) >> 5;
+		SSE2_Copy8Times	xmm4, r2d		; xmm4 = c,c,c,c,c,c,c,c
+
+		mov 	r0, r4				; r0 = pPred again (output pointer)
+		add		r3,	16
+		imul	r2,	-3
+		add		r3,	r2				; s = a + 16 + (-3)*c
+		SSE2_Copy8Times	xmm0, r3d		; xmm0 = s,s,s,s,s,s,s,s
+
+		xor		r2,	r2				; row counter
+		movdqa	xmm5,	[sse2_plane_mul_b_c]	; column multipliers -3..4
+
+get_i_chroma_pred_plane_sse2_1:
+		movdqa	xmm2,	xmm1
+		pmullw	xmm2,	xmm5			; b * (-3..4)
+		paddw	xmm2,	xmm0			; + s
+		psraw	xmm2,	5
+		packuswb xmm2,	xmm2			; clamp to 0..255; low 8 bytes = the row
+		movq	[r0],	xmm2			; store 8 pixels
+		paddw	xmm0,	xmm4			; s += c for the next row
+		add		r0,	r1
+		inc		r2
+		cmp		r2,	8
+		jnz get_i_chroma_pred_plane_sse2_1	; 8 rows
+
+		POP_XMM
+		pop r4
+		pop r3
+		WELSEMMS					; emms: MMX registers were used above
+		ret
+
+;*******************************************************************************
+;	0 |1 |2 |3 |4 |
+;	6 |7 |8 |9 |10|
+;	11|12|13|14|15|
+;	16|17|18|19|20|
+;	21|22|23|24|25|
+;	7 is the start pixel of current 4x4 block
+;	pPred[7] = ([6]+[0]*2+[1]+2)/4
+;
+;   void WelsDecoderI4x4LumaPredDDR_mmx(uint8_t *pPred, const int32_t kiStride)
+;
+;*******************************************************************************
+WELS_EXTERN WelsDecoderI4x4LumaPredDDR_mmx
+	%assign push_num 0
+	LOAD_2_PARA					; macro not visible here; presumably r0 = pPred, r1 = kiStride
+	SIGN_EXTENSION r1, r1d
+	mov r2, r0					; r2 = working pointer; r0 stays on the block for the stores
+; notation below: [k] = pixel k of the diagram in the header; mmN[1] = lowest byte, mmN[8] = highest byte
+	movq        mm1,[r2+r1-8]		;8 bytes ending at the left neighbour of row 1: mm1[8] = [11]
+	movq        mm2,[r2-8]			;8 bytes ending at the left neighbour of row 0: mm2[8] = [6]
+	sub		r2, r1			;r2 = row above the block (address of pixel 1)
+	punpckhbw   mm2,[r2-8]			;mm2[8] = [0], mm2[7] = [6]
+	movd        mm3,[r2]			;mm3[1]=[1],mm3[2]=[2],mm3[3]=[3],mm3[4]=[4]
+	punpckhwd   mm1,mm2				;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11]
+	psllq       mm3,18h				;shift up 3 bytes: mm3[4]=[1],mm3[5]=[2],mm3[6]=[3],mm3[7]=[4]
+	psrlq       mm1,28h				;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
+	por         mm3,mm1				;mm3[7]=[4],mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11]
+	movq        mm1,mm3				;mm1 = same sequence, starting at [11]
+	lea 		r2,[r2+r1*2-8h]		;r2 = address of pixel 12 minus 8
+	movq        mm4,[r2+r1]		;8 bytes ending at the left neighbour of row 2: mm4[8]=[16]
+	psllq       mm3,8				;mm3[8]=[4],mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0
+	psrlq       mm4,38h				;mm4[1]=[16]
+	por         mm3,mm4				;mm3[1]=[16]
+	movq        mm2,mm3				;mm2 = sequence starting at [16]
+	movq        mm4,[r2+r1*2]		;mm4[8]=[21]
+	psllq       mm3,8				;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0
+	psrlq       mm4,38h				;mm4[1]=[21]
+	por         mm3,mm4				;mm3 = sequence starting at [21]
+	movq        mm4,mm3				;mm4 = copy of the sequence starting at [21]
+	pavgb       mm3,mm1				;per byte: (x + z + 1) >> 1 of the two outer taps, lowest byte = ([21]+[11]+1)/2
+	pxor        mm1,mm4				;find odd value in the lowest bit of each byte
+	pand        mm1,[mmx_01bytes]	;set the odd bit
+	psubusb     mm3,mm1				;decrease 1 from odd bytes -> (x + z) >> 1
+	pavgb       mm2,mm3				;per byte: (x + 2*y + z + 2) >> 2, lowest byte = ([21]+2*[16]+[11]+2)/4
+
+	lea         r0,[r0+r1]
+	movd        [r0+2*r1],mm2		;row 3 = bytes 1..4
+	sub         r0,r1
+	psrlq       mm2,8
+	movd        [r0+2*r1],mm2		;row 2 = the window moved up by one byte
+	psrlq       mm2,8
+	movd        [r0+r1],mm2			;row 1
+	psrlq       mm2,8
+	movd        [r0],mm2			;row 0
+	WELSEMMS
+	ret
+
+
+;*******************************************************************************
+;	void WelsDecoderIChromaPredH_mmx(uint8_t *pPred, const int32_t kiStride)
+;   fill each of the 8 rows (8 pixels wide) with that row's left neighbour pixel
+;*******************************************************************************
+%macro MMX_PRED_H_8X8_ONE_LINE 4	; arg1 = mm work register, arg2 = unused, arg3 = source row pointer, arg4 = destination address expression
+	movq		%1,		[%3-8]		; 8 bytes ending at the pixel left of arg3
+	psrlq		%1,		38h			; keep only that pixel, in byte 0
+; word 0 * 0x0101 duplicates the byte within the word; pshufw 0 then copies word 0 to all four words
+	pmullw		%1,		[mmx_01bytes]
+	pshufw		%1,		%1,	0
+	movq		[%4],	%1			; store 8 identical bytes
+%endmacro
+
+%macro MMX_PRED_H_8X8_ONE_LINEE 4	; same as MMX_PRED_H_8X8_ONE_LINE but reads one row further down; r1 is hard-coded as the stride
+	movq		%1,		[%3+r1-8]	; 8 bytes ending at the pixel left of arg3 + r1
+	psrlq		%1,		38h			; keep only that pixel, in byte 0
+; word 0 * 0x0101 duplicates the byte within the word; pshufw 0 then copies word 0 to all four words
+	pmullw		%1,		[mmx_01bytes]
+	pshufw		%1,		%1,	0
+	movq		[%4],	%1			; store 8 identical bytes
+%endmacro
+
+WELS_EXTERN WelsDecoderIChromaPredH_mmx
+	%assign push_num 0
+	LOAD_2_PARA					; macro not visible here; presumably r0 = pPred, r1 = kiStride
+	SIGN_EXTENSION r1, r1d
+	mov r2, r0					; r2 = source row pointer, r0 = destination row pointer
+; row 0, written out inline (same steps as MMX_PRED_H_8X8_ONE_LINE)
+	movq		mm0,	[r2-8]
+	psrlq		mm0,	38h
+
+	pmullw		mm0,		[mmx_01bytes]
+	pshufw		mm0,	mm0,	0
+	movq		[r0],	mm0
+; r2 advances on even rows, r0 on odd rows, so each row is reached with [.. + r1] or [.. + 2*r1]
+	MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1		; row 1
+
+	lea			r2, [r2+r1*2]
+	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, r2, r0+2*r1		; row 2
+
+	lea         r0, [r0+2*r1]
+	MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1		; row 3
+
+	lea			r2, [r2+r1*2]
+	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, r2, r0+2*r1		; row 4
+
+	lea         r0, [r0+2*r1]
+	MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1		; row 5
+
+	lea			r2, [r2+r1*2]
+	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, r2, r0+2*r1		; row 6
+
+    	lea         r0, [r0+2*r1]
+	MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1		; row 7
+
+	WELSEMMS
+	ret
+
+
+;*******************************************************************************
+;	void WelsDecoderIChromaPredV_mmx(uint8_t *pPred, const int32_t kiStride)
+;   copy 8 pixels from top 8 pixels
+;*******************************************************************************
+WELS_EXTERN WelsDecoderIChromaPredV_mmx
+	%assign push_num 0
+	LOAD_2_PARA
+	SIGN_EXTENSION r1, r1d
+
+	; Vertical chroma prediction: all 8 rows become a copy of the 8 bytes one
+	; stride above the block.  r0 = running pointer, r1 = stride, mm0 = top row.
+	sub     r0, r1
+	movq    mm0, [r0]
+
+	; rows 0..5: two stores per repetition, then step r0 down two rows
+%rep 3
+	movq    [r0+r1],   mm0
+	movq    [r0+2*r1], mm0
+	lea     r0, [r0+2*r1]
+%endrep
+	; rows 6 and 7
+	movq    [r0+r1],   mm0
+	movq    [r0+2*r1], mm0
+
+	WELSEMMS
+	ret
+
+
+;*******************************************************************************
+;	lt|t0|t1|t2|t3|
+;	l0|
+;	l1|
+;	l2|
+;	l3|
+;	t3 is never used
+;   destination:
+;	|a |b |c |d |
+;	|e |f |a |b |
+;	|g |h |e |f |
+;	|i |j |g |h |
+
+;   a = (1 + lt + l0)>>1
+;   e = (1 + l0 + l1)>>1
+;   g = (1 + l1 + l2)>>1
+;   i = (1 + l2 + l3)>>1
+
+;   d = (2 + t0 + (t1<<1) + t2)>>2
+;   c = (2 + lt + (t0<<1) + t1)>>2
+;   b = (2 + l0 + (lt<<1) + t0)>>2
+
+;   f = (2 + l1 + (l0<<1) + lt)>>2
+;   h = (2 + l2 + (l1<<1) + l0)>>2
+;   j = (2 + l3 + (l2<<1) + l1)>>2
+;   [b a f e h g j i] + [d c b a] --> mov to memory
+;
+;   void WelsDecoderI4x4LumaPredHD_mmx(uint8_t *pPred, const int32_t kiStride)
+;*******************************************************************************
+WELS_EXTERN WelsDecoderI4x4LumaPredHD_mmx
+	%assign push_num 0
+	LOAD_2_PARA					; macro not visible here; presumably r0 = pPred, r1 = kiStride
+	SIGN_EXTENSION r1, r1d
+	mov r2, r0
+	sub         r2, r1					; r2 = row above the block
+	movd        mm0, [r2-1]            ; mm0 = [xx xx xx xx t2 t1 t0 lt]
+	psllq       mm0, 20h                ; mm0 = [t2 t1 t0 lt xx xx xx xx]
+
+	movd        mm1, [r2+2*r1-4]		; 4 bytes ending at l1
+	punpcklbw   mm1, [r2+r1-4]        ; mm1[7] = l0, mm1[6] = l1
+	lea         r2, [r2+2*r1]
+	movd        mm2, [r2+2*r1-4]		; 4 bytes ending at l3
+	punpcklbw   mm2, [r2+r1-4]        ; mm2[7] = l2, mm2[6] = l3
+	punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
+	psrlq       mm2, 20h
+	pxor        mm0, mm2                ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
+
+	movq        mm1, mm0
+	psrlq       mm1, 10h                ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
+	movq        mm2, mm0
+	psrlq       mm2, 8h                 ; mm2 = [xx t2 t1 t0 lt l0 l1 l2]
+	movq        mm3, mm2
+	movq        mm4, mm1
+	pavgb       mm1, mm0				; rounded-up average of the two outer taps
+
+	pxor        mm4, mm0				; find odd value in the lowest bit of each byte
+	pand        mm4, [mmx_01bytes]	    ; set the odd bit
+	psubusb     mm1, mm4				; decrease 1 from odd bytes -> (x + z) >> 1
+
+	pavgb       mm2, mm1                ; 3-tap (x + 2*y + z + 2) >> 2: mm2 = [xx xx d  c  b  f  h  j]
+
+	movq        mm4, mm0
+	pavgb       mm3, mm4                ; 2-tap averages: mm3 = [xx xx xx xx a  e  g  i]
+	punpcklbw   mm3, mm2                ; mm3 = [b  a  f  e  h  g  j  i]
+
+	psrlq       mm2, 20h
+	psllq       mm2, 30h                ; mm2 = [d  c  0  0  0  0  0  0]
+	movq        mm4, mm3
+	psrlq       mm4, 10h                ; mm4 = [0  0  b  a  f  e  h  g]
+	pxor        mm2, mm4                ; mm2 = [d  c  b  a  xx xx xx xx]
+	psrlq       mm2, 20h                ; mm2 = [xx xx xx xx  d  c  b  a]
+
+	movd        [r0], mm2				; row 0 = a b c d
+	lea         r0, [r0+r1]
+	movd        [r0+2*r1], mm3			; row 3 = i j g h
+	sub         r0, r1
+	psrlq       mm3, 10h
+	movd        [r0+2*r1], mm3			; row 2 = g h e f
+	psrlq       mm3, 10h
+	movd        [r0+r1], mm3			; row 1 = e f a b
+	WELSEMMS
+	ret
+
+
+
+;*******************************************************************************
+;	lt|t0|t1|t2|t3|
+;	l0|
+;	l1|
+;	l2|
+;	l3|
+;	lt and t0..t3 are never used (only the left column l0..l3 is read)
+;   destination:
+;	|a |b |c |d |
+;	|c |d |e |f |
+;	|e |f |g |g |
+;	|g |g |g |g |
+
+;   a = (1 + l0 + l1)>>1
+;   c = (1 + l1 + l2)>>1
+;   e = (1 + l2 + l3)>>1
+;   g = l3
+
+;   b = (2 + l0 + (l1<<1) + l2)>>2
+;   d = (2 + l1 + (l2<<1) + l3)>>2
+;   f = (2 + l2 + (l3<<1) + l3)>>2
+
+;   [g g f e d c b a] + [g g g g] --> mov to memory
+;
+;   void WelsDecoderI4x4LumaPredHU_mmx(uint8_t *pPred, const int32_t kiStride)
+;*******************************************************************************
+WELS_EXTERN WelsDecoderI4x4LumaPredHU_mmx
+	%assign push_num 0
+	LOAD_2_PARA					; macro not visible here; presumably r0 = pPred, r1 = kiStride
+	SIGN_EXTENSION r1, r1d
+	mov r2, r0
+
+	movd        mm0, [r2-4]            ; mm0[3] = l0
+	punpcklbw   mm0, [r2+r1-4]        ; mm0[7] = l1, mm0[6] = l0
+	lea         r2, [r2+2*r1]
+	movd        mm2, [r2-4]            ; mm2[3] = l2
+	movd        mm4, [r2+r1-4]        ; mm4[3] = l3
+	punpcklbw   mm2, mm4				; mm2[7] = l3, mm2[6] = l2
+	punpckhwd   mm0, mm2                ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
+
+	psrlq       mm4, 18h				; l3 alone in byte 0
+	psllq       mm4, 38h                ; mm4 = [l3 0  0  0  0  0  0  0]
+	psrlq       mm0, 8h
+	pxor        mm0, mm4                ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
+
+	movq        mm1, mm0
+	psllq       mm1, 8h                 ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
+	movq        mm3, mm1                ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
+	pavgb       mm1, mm0                ; mm1 = [g  e  c  a  xx xx xx xx]
+
+	movq        mm2, mm0
+	psllq       mm2, 10h                ; mm2 = [l2 l1 l0 xx xx xx xx xx]
+	movq        mm5, mm2
+	pavgb       mm2, mm0				; rounded-up average of the two outer taps
+
+	pxor        mm5, mm0				; find odd value in the lowest bit of each byte
+	pand        mm5, [mmx_01bytes]	    ; set the odd bit
+	psubusb     mm2, mm5				; decrease 1 from odd bytes -> (x + z) >> 1
+
+	pavgb       mm2, mm3                ; mm2 = [f  d  b  xx xx xx xx xx]
+
+	psrlq       mm2, 8h
+	pxor        mm2, mm4                ; mm2 = [g  f  d  b  xx xx xx xx]
+
+	punpckhbw   mm1, mm2                ; mm1 = [g  g  f  e  d  c  b  a]
+	punpckhbw   mm4, mm4                ; mm4 = [g  g  0  0  0  0  0  0]
+	punpckhbw   mm4, mm4                ; mm4 = [g  g  g  g  0  0  0  0]
+
+	psrlq       mm4, 20h				; g g g g in the low dword
+	lea         r0, [r0+r1]
+	movd        [r0+2*r1], mm4			; row 3 = g g g g
+
+	sub         r0, r1
+	movd        [r0], mm1				; row 0 = a b c d
+	psrlq       mm1, 10h
+	movd        [r0+r1], mm1			; row 1 = c d e f
+	psrlq       mm1, 10h
+	movd        [r0+2*r1], mm1			; row 2 = e f g g
+	WELSEMMS
+	ret
+
+
+
+;*******************************************************************************
+;	lt|t0|t1|t2|t3|
+;	l0|
+;	l1|
+;	l2|
+;	l3|
+;	l3 is never used
+;   destination:
+;	|a |b |c |d |
+;	|e |f |g |h |
+;	|i |a |b |c |
+;	|j |e |f |g |
+
+;   a = (1 + lt + t0)>>1
+;   b = (1 + t0 + t1)>>1
+;   c = (1 + t1 + t2)>>1
+;   d = (1 + t2 + t3)>>1
+
+;   e = (2 + l0 + (lt<<1) + t0)>>2
+;   f = (2 + lt + (t0<<1) + t1)>>2
+;   g = (2 + t0 + (t1<<1) + t2)>>2
+
+;   h = (2 + t1 + (t2<<1) + t3)>>2
+;   i = (2 + lt + (l0<<1) + l1)>>2
+;   j = (2 + l0 + (l1<<1) + l2)>>2
+;
+;   void WelsDecoderI4x4LumaPredVR_mmx(uint8_t *pPred, const int32_t kiStride)
+;*******************************************************************************
+WELS_EXTERN WelsDecoderI4x4LumaPredVR_mmx
+	%assign push_num 0
+	LOAD_2_PARA					; macro not visible here; presumably r0 = pPred, r1 = kiStride
+	SIGN_EXTENSION r1, r1d
+	mov r2, r0
+	sub         r2, r1					; r2 = row above the block
+	movq        mm0, [r2-1]            ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
+	psllq       mm0, 18h                ; mm0 = [t3 t2 t1 t0 lt 0  0  0]
+
+	movd        mm1, [r2+2*r1-4]		; 4 bytes ending at l1
+	punpcklbw   mm1, [r2+r1-4]        ; mm1[7] = l0, mm1[6] = l1
+	lea         r2, [r2+2*r1]
+	movq        mm2, [r2+r1-8]        ; mm2[7] = l2
+	punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 xx xx xx xx xx]
+	psrlq       mm2, 28h
+	pxor        mm0, mm2                ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
+
+	movq        mm1, mm0
+	psllq       mm1, 8h                 ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
+	pavgb       mm1, mm0                ; mm1 = [d  c  b  a  xx xx xx xx]
+
+	movq        mm2, mm0
+	psllq       mm2, 10h                ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
+	movq        mm3, mm2
+	pavgb       mm2, mm0				; rounded-up average of the two outer taps
+
+	pxor        mm3, mm0				; find odd value in the lowest bit of each byte
+	pand        mm3, [mmx_01bytes]	    ; set the odd bit
+	psubusb     mm2, mm3				; decrease 1 from odd bytes -> (x + z) >> 1
+
+	movq        mm3, mm0
+	psllq       mm3, 8h                 ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
+	pavgb       mm3, mm2                ; mm3 = [h  g  f  e  i  j  xx xx]
+	movq        mm2, mm3
+
+	psrlq       mm1, 20h                ; mm1 = [xx xx xx xx d  c  b  a]
+	movd        [r0], mm1				; row 0 = a b c d
+
+	psrlq       mm2, 20h                ; mm2 = [xx xx xx xx h  g  f  e]
+	movd        [r0+r1], mm2			; row 1 = e f g h
+
+	movq        mm4, mm3
+	psllq       mm4, 20h
+	psrlq       mm4, 38h                ; mm4 = [0  0  0  0  0  0  0  i]
+
+	movq        mm5, mm3
+	psllq       mm5, 28h
+	psrlq       mm5, 38h                ; mm5 = [0  0  0  0  0  0  0  j]
+
+	psllq       mm1, 8h
+	pxor        mm4, mm1                ; mm4 = [xx xx xx xx c  b  a  i]
+	movd        [r0+2*r1], mm4			; row 2 = i a b c
+
+	psllq       mm2, 8h
+	pxor        mm5, mm2                ; mm5 = [xx xx xx xx g  f  e  j]
+	lea         r0, [r0+2*r1]
+	movd        [r0+r1], mm5			; row 3 = j e f g
+	WELSEMMS
+	ret
+
+;*******************************************************************************
+;	lt|t0|t1|t2|t3|t4|t5|t6|t7
+;	l0|
+;	l1|
+;	l2|
+;	l3|
+;	lt and l0..l3 are never used (only the top row t0..t7 is read)
+;   destination:
+;	|a |b |c |d |
+;	|b |c |d |e |
+;	|c |d |e |f |
+;	|d |e |f |g |
+
+;   a = (2 + t0 + t2 + (t1<<1))>>2
+;   b = (2 + t1 + t3 + (t2<<1))>>2
+;   c = (2 + t2 + t4 + (t3<<1))>>2
+;   d = (2 + t3 + t5 + (t4<<1))>>2
+
+;   e = (2 + t4 + t6 + (t5<<1))>>2
+;   f = (2 + t5 + t7 + (t6<<1))>>2
+;   g = (2 + t6 + t7 + (t7<<1))>>2
+
+;   [g f e d c b a] --> mov to memory
+;
+;   void WelsDecoderI4x4LumaPredDDL_mmx(uint8_t *pPred, const int32_t kiStride)
+;*******************************************************************************
+WELS_EXTERN WelsDecoderI4x4LumaPredDDL_mmx
+	%assign push_num 0
+	LOAD_2_PARA					; macro not visible here; presumably r0 = pPred, r1 = kiStride
+	SIGN_EXTENSION r1, r1d
+	mov r2, r0
+	sub         r2, r1					; r2 = row above the block
+	movq        mm0, [r2]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
+	movq        mm1, mm0
+	movq        mm2, mm0
+
+	movq        mm3, mm0
+	psrlq       mm3, 38h
+	psllq       mm3, 38h                ; mm3 = [t7 0  0  0  0  0  0  0]
+
+	psllq       mm1, 8h                 ; mm1 = [t6 t5 t4 t3 t2 t1 t0 0]
+	psrlq       mm2, 8h
+	pxor        mm2, mm3                ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]
+
+	movq        mm3, mm1
+	pavgb       mm1, mm2				; rounded-up average of the left and right neighbours of each tap
+	pxor        mm3, mm2				; find odd value in the lowest bit of each byte
+	pand        mm3, [mmx_01bytes]	    ; set the odd bit
+	psubusb     mm1, mm3				; decrease 1 from odd bytes -> (x + z) >> 1
+
+	pavgb       mm0, mm1                ; mm0 = [g f e d c b a xx]
+
+	psrlq       mm0, 8h
+	movd        [r0], mm0				; row 0 = a b c d
+	psrlq       mm0, 8h
+	movd        [r0+r1], mm0			; row 1 = b c d e
+	psrlq       mm0, 8h
+	movd        [r0+2*r1], mm0			; row 2 = c d e f
+	psrlq       mm0, 8h
+	lea         r0, [r0+2*r1]
+	movd        [r0+r1], mm0			; row 3 = d e f g
+	WELSEMMS
+	ret
+
+
+;*******************************************************************************
+;	lt|t0|t1|t2|t3|t4|t5|t6|t7
+;	l0|
+;	l1|
+;	l2|
+;	l3|
+;	lt and l0..l3 are never used (only the top row is read)
+;   destination:
+;	|a |b |c |d |
+;	|e |f |g |h |
+;	|b |c |d |i |
+;	|f |g |h |j |
+
+;   a = (1 + t0 + t1)>>1
+;   b = (1 + t1 + t2)>>1
+;   c = (1 + t2 + t3)>>1
+;   d = (1 + t3 + t4)>>1
+;   i = (1 + t4 + t5)>>1
+
+;   e = (2 + t0 + (t1<<1) + t2)>>2
+;   f = (2 + t1 + (t2<<1) + t3)>>2
+;   g = (2 + t2 + (t3<<1) + t4)>>2
+;   h = (2 + t3 + (t4<<1) + t5)>>2
+;   j = (2 + t4 + (t5<<1) + t6)>>2
+
+;   [i d c b a] + [j h g f e] --> mov to memory
+;
+;   void WelsDecoderI4x4LumaPredVL_mmx(uint8_t *pPred, const int32_t kiStride)
+;*******************************************************************************
+WELS_EXTERN WelsDecoderI4x4LumaPredVL_mmx
+	%assign push_num 0
+	LOAD_2_PARA					; macro not visible here; presumably r0 = pPred, r1 = kiStride
+	SIGN_EXTENSION r1, r1d
+	mov r2, r0
+
+	sub         r2, r1					; r2 = row above the block
+	movq        mm0, [r2]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
+	movq        mm1, mm0
+	movq        mm2, mm0
+
+	psrlq       mm1, 8h                 ; mm1 = [0  t7 t6 t5 t4 t3 t2 t1]
+	psrlq       mm2, 10h                ; mm2 = [0  0  t7 t6 t5 t4 t3 t2]
+
+	movq        mm3, mm1
+	pavgb       mm3, mm0                ; 2-tap averages: mm3 = [xx xx xx i  d  c  b  a]
+
+	movq        mm4, mm2
+	pavgb       mm2, mm0				; rounded-up average of the two outer taps
+	pxor        mm4, mm0				; find odd value in the lowest bit of each byte
+	pand        mm4, [mmx_01bytes]	    ; set the odd bit
+	psubusb     mm2, mm4				; decrease 1 from odd bytes -> (x + z) >> 1
+
+	pavgb       mm2, mm1                ; 3-tap results: mm2 = [xx xx xx j  h  g  f  e]
+
+	movd        [r0], mm3				; row 0 = a b c d
+	psrlq       mm3, 8h
+	movd        [r0+2*r1], mm3			; row 2 = b c d i
+
+	movd        [r0+r1], mm2			; row 1 = e f g h
+	psrlq       mm2, 8h
+	lea         r0, [r0+2*r1]
+	movd        [r0+r1], mm2			; row 3 = f g h j
+	WELSEMMS
+	ret
+
+;*******************************************************************************
+;
+;   void WelsDecoderIChromaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
+;*******************************************************************************
+WELS_EXTERN WelsDecoderIChromaPredDc_sse2
+	push 	r3
+	push 	r4
+	%assign push_num 2
+	LOAD_2_PARA					; macro not visible here; presumably r0 = pPred, r1 = kiStride
+	SIGN_EXTENSION r1, r1d
+	mov r4, r0					; r4 = pPred, kept for the stores
+
+	sub         r0, r1				; r0 = row above the block
+	movq        mm0, [r0]			; the 8 top neighbours
+
+	movzx		r2, byte [r0+r1-0x01] ; l1 (left neighbour of the first row; l1..l8 are numbered from 1)
+	lea         r0, [r0+2*r1]
+	movzx		r3, byte [r0-0x01]     ; l2
+	add			r2, r3
+	movzx		r3, byte [r0+r1-0x01] ; l3
+	add			r2, r3
+	lea         r0, [r0+2*r1]
+	movzx		r3, byte [r0-0x01]     ; l4
+	add			r2, r3
+	movd        mm1, r2d                 ; mm1 = l1+l2+l3+l4
+
+	movzx		r2, byte [r0+r1-0x01] ; l5
+	lea         r0, [r0+2*r1]
+	movzx		r3, byte [r0-0x01]     ; l6
+	add			r2, r3
+	movzx		r3, byte [r0+r1-0x01] ; l7
+	add			r2, r3
+	lea         r0, [r0+2*r1]
+	movzx		r3, byte [r0-0x01]     ; l8
+	add			r2, r3
+	movd        mm2, r2d                 ; mm2 = l5+l6+l7+l8
+
+	movq        mm3, mm0
+	psrlq       mm0, 0x20				; mm0 = right 4 top pixels
+	psllq       mm3, 0x20
+	psrlq       mm3, 0x20				; mm3 = left 4 top pixels
+	pxor		mm4, mm4
+	psadbw		mm0, mm4				; sum of the right 4 top pixels
+	psadbw		mm3, mm4                 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
+
+	paddq       mm3, mm1				; sum1 = top-left 4 + upper-left 4 (8 samples)
+	movq        mm1, mm2
+	paddq       mm1, mm0;                ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
+
+	movq        mm4, [mmx_0x02]			; rounding constant 2
+
+	paddq       mm0, mm4
+	psrlq       mm0, 0x02				; (sum2 + 2) >> 2: 4-sample average
+
+	paddq       mm2, mm4
+	psrlq       mm2, 0x02				; (sum3 + 2) >> 2
+
+	paddq       mm3, mm4
+	paddq       mm3, mm4
+	psrlq       mm3, 0x03				; (sum1 + 4) >> 3: 8-sample average
+
+	paddq       mm1, mm4
+	paddq       mm1, mm4
+	psrlq       mm1, 0x03				; (sum4 + 4) >> 3
+
+	pmuludq     mm0, [mmx_01bytes]		; * 0x01010101: replicate each DC value into 4 bytes
+	pmuludq     mm3, [mmx_01bytes]
+	psllq       mm0, 0x20
+	pxor        mm0, mm3                 ; mm0 = m_up (left 4 bytes from sum1, right 4 bytes from sum2)
+
+	pmuludq     mm2, [mmx_01bytes]
+	pmuludq     mm1, [mmx_01bytes]
+	psllq       mm1, 0x20
+	pxor        mm1, mm2                 ; mm1 = m_down (left 4 bytes from sum3, right 4 bytes from sum4)
+
+	movq        [r4],       mm0			; rows 0..3 <- m_up
+	movq        [r4+r1],   mm0
+	movq        [r4+2*r1], mm0
+	lea         r4, [r4+2*r1]
+	movq        [r4+r1],   mm0
+
+	movq        [r4+2*r1], mm1			; rows 4..7 <- m_down
+	lea         r4, [r4+2*r1]
+	movq        [r4+r1],   mm1
+	movq        [r4+2*r1], mm1
+	lea         r4, [r4+2*r1]
+	movq        [r4+r1],   mm1
+
+	pop r4
+	pop r3
+	WELSEMMS
+	ret
+
+
+
+;*******************************************************************************
+;
+;   void WelsDecoderI16x16LumaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
+;*******************************************************************************
+WELS_EXTERN WelsDecoderI16x16LumaPredDc_sse2
+	push    r3
+	push    r4
+	%assign push_num 2
+	LOAD_2_PARA
+	SIGN_EXTENSION r1, r1d
+
+	; DC prediction: the 16x16 block is filled with
+	;   (sum of 16 top neighbours + sum of 16 left neighbours + 16) >> 5
+	;   r0   = scan pointer for the neighbours
+	;   r1   = stride
+	;   r2   = running sum of the left neighbours
+	;   r3   = scratch for one left neighbour
+	;   r4   = output pointer (start of the block)
+	;   xmm0 = top sum, later the broadcast DC value
+
+	mov     r4, r0
+	sub     r0, r1
+
+	; ---- top neighbours ----
+	; psadbw against zero leaves one byte sum per 8-byte half; the two
+	; halves are then added together in the low qword
+	movdqa  xmm0, [r0]
+	pxor    xmm1, xmm1
+	psadbw  xmm0, xmm1
+	movdqa  xmm1, xmm0
+	psrldq  xmm1, 0x08
+	pslldq  xmm0, 0x08
+	psrldq  xmm0, 0x08
+	paddw   xmm0, xmm1
+
+	; ---- left neighbours ----
+	; rows 0 and 1 are read relative to the row above the block
+	movzx   r2, byte [r0+r1-0x01]
+	movzx   r3, byte [r0+2*r1-0x01]
+	add     r2, r3
+	lea     r0, [r0+r1]
+	; rows 2..15, two per expansion
+%rep 7
+	LOAD_2_LEFT_AND_ADD
+%endrep
+
+	; ---- dc = (top + left + 16) >> 5 ----
+	add     r2, 0x10
+	movd    xmm1, r2d
+	paddw   xmm0, xmm1
+	psrld   xmm0, 0x05
+	; multiply by 0x01010101 to repeat the byte, then broadcast the dword
+	pmuludq xmm0, [mmx_01bytes]
+	pshufd  xmm0, xmm0, 0
+
+	; ---- store the 16 rows (aligned stores) ----
+	; row 0
+	movdqa  [r4], xmm0
+	; rows 1..14: two stores per repetition, then step r4 down two rows;
+	; expands to the same sequence as the hand-unrolled form
+%rep 7
+	movdqa  [r4+r1],   xmm0
+	movdqa  [r4+2*r1], xmm0
+	lea     r4, [r4+2*r1]
+%endrep
+	; row 15
+	movdqa  [r4+r1], xmm0
+
+	; restore the registers pushed on entry
+	pop     r4
+	pop     r3
+
+	; no MMX register is used in this function, so no emms is needed
+	ret
+
+;*******************************************************************************
+; for intra prediction as follows, 11/19/2010
+;*******************************************************************************
+
+;*******************************************************************************
+;	void WelsDecoderI16x16LumaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride) -- DC from the top row only: (sum of the 16 pixels above + 8) >> 4 is written to 16 rows of 16 bytes
+;*******************************************************************************
+WELS_EXTERN WelsDecoderI16x16LumaPredDcTop_sse2
+	%assign push_num 0
+	LOAD_2_PARA
+	PUSH_XMM 8
+	SIGN_EXTENSION r1, r1d
+	mov r2, r0
+	sub r2, r1				; r2 -> the row directly above the block
+	movdqa xmm0, [r2]		; pPred-kiStride, top line (movdqa: must be 16-byte aligned)
+	pxor xmm7, xmm7
+	psadbw xmm0, xmm7		; two partial sums: bytes 0..7 -> low qword, bytes 8..15 -> high qword
+	movdqa xmm1, xmm0
+	psrldq xmm1, 8			; bring the high-qword sum down to the low qword
+	paddw  xmm0, xmm1		; xmm0 low word = sum of all 16 top pixels
+	xor r2, r2
+	movd r2d, xmm0			; r2 = low dword of xmm0, i.e. the top-row sum
+	;movdqa xmm1, xmm0
+	;punpcklbw xmm0, xmm7
+	;punpckhbw xmm1, xmm7
+
+	;paddw xmm0, xmm1			; (ub.max(ff) << 4) will not exceed the uw range, so the sum can be done on unsigned words
+	;pshufd xmm1, xmm0, 04eh		; 01001110, w3w2w1w0,w7w6w5w4
+	;paddw xmm0, xmm1			; w3+7 w2+6 w1+5 w0+4 w3+7 w2+6 w1+5 w0+4
+	;pshufd xmm1, xmm0, 0b1h		; 10110001, w1+5 w0+4 w3+7 w2+6 w1+5 w0+4 w3+7 w2+6
+	;paddw xmm0, xmm1			; w_o w_e w_o w_e w_o w_e w_o w_e (w_o=1+3+5+7, w_e=0+2+4+6)
+	;pshuflw xmm1, xmm0, 0b1h	; 10110001
+	;paddw xmm0, xmm1			; sum in word unit (x8)
+	;xor r3, r3
+	;movd r3d, xmm0
+	;and edx, 0ffffh
+
+	add r2, 8				; rounding term for the >> 4 below
+	sar r2, 4				; r2 = DC value
+	SSE2_Copy16Times xmm1, r2d
+	;mov dh, dl
+	;mov r2, edx
+	;shl r2, 010h
+	;or edx, r2
+	;movd xmm1, edx
+	;pshufd xmm0, xmm1, 00h
+	;movdqa xmm1, xmm0
+	movdqa xmm0, xmm1		; xmm0 and xmm1 hold the same fill value; NOTE(review): presumably the DC byte in all 16 lanes (SSE2_Copy16Times is not in view)
+	lea r2, [2*r1+r1]		; 3*kiStride
+
+	movdqa [r0], xmm0		; rows 0..3
+	movdqa [r0+r1], xmm1
+	movdqa [r0+2*r1], xmm0
+	movdqa [r0+r2], xmm1
+
+	lea r0, [r0+4*r1]		; rows 4..7
+	movdqa [r0], xmm0
+	movdqa [r0+r1], xmm1
+	movdqa [r0+2*r1], xmm0
+	movdqa [r0+r2], xmm1
+
+	lea r0, [r0+4*r1]		; rows 8..11
+	movdqa [r0], xmm0
+	movdqa [r0+r1], xmm1
+	movdqa [r0+2*r1], xmm0
+	movdqa [r0+r2], xmm1
+
+	lea r0, [r0+4*r1]		; rows 12..15
+	movdqa [r0], xmm0
+	movdqa [r0+r1], xmm1
+	movdqa [r0+2*r1], xmm0
+	movdqa [r0+r2], xmm1
+
+	POP_XMM
+	ret
+
+;*******************************************************************************
+;	void WelsDecoderI16x16LumaPredDcNA_sse2(uint8_t *pPred, const int32_t kiStride) -- no neighbours are read: 16 rows of 16 bytes are filled with the constant loaded from sse2_dc_0x80
+;*******************************************************************************
+WELS_EXTERN WelsDecoderI16x16LumaPredDcNA_sse2
+	%assign push_num 0
+	LOAD_2_PARA
+	SIGN_EXTENSION r1, r1d
+	lea r2, [2*r1+r1]		; 3*kiStride
+
+	movdqa xmm0, [sse2_dc_0x80]	; NOTE(review): presumably 16 bytes of 0x80, going by the label (definition not in view)
+	movdqa xmm1, xmm0		; xmm0 and xmm1 hold the same fill value
+	movdqa [r0], xmm0		; rows 0..3 (movdqa: every row must be 16-byte aligned)
+	movdqa [r0+r1], xmm1
+	movdqa [r0+2*r1], xmm0
+	movdqa [r0+r2], xmm1
+	lea r0, [r0+4*r1]		; rows 4..7
+	movdqa [r0], xmm0
+	movdqa [r0+r1], xmm1
+	movdqa [r0+2*r1], xmm0
+	movdqa [r0+r2], xmm1
+	lea r0, [r0+4*r1]		; rows 8..11
+	movdqa [r0], xmm0
+	movdqa [r0+r1], xmm1
+	movdqa [r0+2*r1], xmm0
+	movdqa [r0+r2], xmm1
+	lea r0, [r0+4*r1]		; rows 12..15
+	movdqa [r0], xmm0
+	movdqa [r0+r1], xmm1
+	movdqa [r0+2*r1], xmm0
+	movdqa [r0+r2], xmm1
+
+	ret
+
+;*******************************************************************************
+;	void WelsDecoderIChromaPredDcLeft_mmx(uint8_t *pPred, const int32_t kiStride) -- 8x8 DC from the left column only: rows 0..3 get (left[0..3] sum + 2) >> 2, rows 4..7 get (left[4..7] sum + 2) >> 2
+;*******************************************************************************
+WELS_EXTERN WelsDecoderIChromaPredDcLeft_mmx
+	push r3				; r3/r4 are used as scratch below and restored before ret
+	push r4
+	%assign push_num 2
+	LOAD_2_PARA
+	SIGN_EXTENSION r1, r1d
+	mov r4, r0			; r4 = store pointer (block start)
+	; for left
+	dec r0				; r0 -> byte immediately left of row 0
+	xor r2, r2
+	xor r3, r3
+	movzx r2, byte [r0]		; left neighbour of row 0
+	movzx r3, byte [r0+r1]		; row 1
+	add r2, r3
+	lea r0, [r0+2*r1]
+	movzx r3, byte [r0]		; row 2
+	add r2, r3
+	movzx r3, byte [r0+r1]		; row 3
+	add r2, r3
+	add r2, 02h			; rounding term for the >> 2 below
+	sar r2, 02h			; r2 = DC of the upper four rows
+	;SSE2_Copy16Times mm0, r2d
+	mov r3, r2
+	sal r3, 8
+	or r2, r3			; low word of r2 = DC byte duplicated twice
+	movd mm1, r2d
+	pshufw mm0, mm1, 00h		; broadcast that word to all four words: 8 copies of the DC byte
+	;mov bh, bl
+	;movd mm1, ebx
+	;pshufw mm0, mm1, 00h	; up64
+	movq mm1, mm0			; mm0 and mm1 both hold the upper fill value
+	xor r2, r2
+	lea r0, [r0+2*r1]
+	movzx r2, byte [r0]		; left neighbour of row 4
+	movzx r3, byte [r0+r1]		; row 5
+	add r2, r3
+	lea r0, [r0+2*r1]
+	movzx r3, byte [r0]		; row 6
+	add r2, r3
+	movzx r3, byte [r0+r1]		; row 7
+	add r2, r3
+	add r2, 02h			; same rounding as above
+	sar r2, 02h			; r2 = DC of the lower four rows
+	mov r3, r2
+	sal r3, 8
+	or r2, r3			; low word of r2 = DC byte duplicated twice
+	movd mm3, r2d
+	pshufw mm2, mm3, 00h		; broadcast to 8 bytes
+	;mov bh, bl
+	;movd mm3, ebx
+	;pshufw mm2, mm3, 00h	; down64
+	;SSE2_Copy16Times mm2, r2d
+	movq mm3, mm2			; mm2 and mm3 both hold the lower fill value
+	lea r2, [2*r1+r1]		; 3*kiStride
+	movq [r4], mm0			; rows 0..3: upper DC
+	movq [r4+r1], mm1
+	movq [r4+2*r1], mm0
+	movq [r4+r2], mm1
+	lea r4, [r4+4*r1]
+	movq [r4], mm2			; rows 4..7: lower DC
+	movq [r4+r1], mm3
+	movq [r4+2*r1], mm2
+	movq [r4+r2], mm3
+	pop r4				; restore in reverse push order
+	pop r3
+	emms				; leave MMX state so later x87 code works
+	ret
+
+;*******************************************************************************
+;	void WelsDecoderIChromaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride) -- 8x8 DC from the top row only: columns 0..3 use the mean of top[0..3], columns 4..7 the mean of top[4..7], repeated over 8 rows
+;*******************************************************************************
+WELS_EXTERN WelsDecoderIChromaPredDcTop_sse2
+	%assign push_num 0
+	LOAD_2_PARA
+	PUSH_XMM 8
+	SIGN_EXTENSION r1, r1d
+	mov r2, r0
+	sub r2, r1			; r2 -> the row directly above the block
+	movq xmm0, [r2]		; top: 8x1 pixels
+	pxor xmm7, xmm7
+	punpcklbw xmm0, xmm7		; ext 8x2 words
+	pshufd xmm1, xmm0, 0B1h		; 10110001 B, w5 w4 w7 w6 w1 w0 w3 w2
+	paddw xmm0, xmm1			; w5+7 w4+6 w5+7 w4+6 w1+3 w0+2 w1+3 w0+2
+	movdqa xmm1, xmm0
+	pshuflw xmm2, xmm0, 0B1h	; 10110001 B, .. w0+2 w1+3 w0+2 w1+3
+	pshufhw xmm3, xmm1, 0B1h	; 10110001 B, w4+6 w5+7 w4+6 w5+7 ..
+	paddw xmm0, xmm2			; .. w0+..+3 w0+..+3 w0+..+3 w0+..+3
+	paddw xmm1, xmm3			; w4+..+7 w4+..+7 w4+..+7 w4+..+7 ..
+	punpckhqdq xmm1, xmm7		; move the w4..7 sums to the low qword; high qword becomes zero
+	punpcklqdq xmm0, xmm1		; sum1 sum1 sum1 sum1 sum0 sum0 sum0 sum0
+	movdqa xmm6, [sse2_wd_0x02]	; NOTE(review): presumably eight words of 2, the rounding term (definition not in view)
+	paddw xmm0, xmm6
+	psraw xmm0, 02h			; divide each 4-pixel sum by 4
+	packuswb xmm0, xmm7		; pack to bytes: low qword = 4 x dc0 followed by 4 x dc1
+	lea r2, [2*r1+r1]		; 3*kiStride
+	movq [r0], xmm0			; rows 0..3
+	movq [r0+r1], xmm0
+	movq [r0+2*r1], xmm0
+	movq [r0+r2], xmm0
+	lea r0, [r0+4*r1]		; rows 4..7
+	movq [r0], xmm0
+	movq [r0+r1], xmm0
+	movq [r0+2*r1], xmm0
+	movq [r0+r2], xmm0
+	POP_XMM
+	ret
+
+;*******************************************************************************
+;	void WelsDecoderIChromaPredDcNA_mmx(uint8_t *pPred, const int32_t kiStride) -- no neighbours are read: 8 rows of 8 bytes are filled with the first 8 bytes of the constant at sse2_dc_0x80
+;*******************************************************************************
+WELS_EXTERN WelsDecoderIChromaPredDcNA_mmx
+	%assign push_num 0
+	LOAD_2_PARA
+	SIGN_EXTENSION r1, r1d
+	lea r2, [2*r1+r1]		; 3*kiStride
+	movq mm0, [sse2_dc_0x80]	; NOTE(review): presumably 0x80 in every byte, going by the label (definition not in view)
+	movq mm1, mm0			; mm0 and mm1 hold the same fill value
+	movq [r0], mm0			; rows 0..3
+	movq [r0+r1], mm1
+	movq [r0+2*r1], mm0
+	movq [r0+r2], mm1
+	lea r0, [r0+4*r1]		; rows 4..7
+	movq [r0], mm0
+	movq [r0+r1], mm1
+	movq [r0+2*r1], mm0
+	movq [r0+r2], mm1
+	emms				; leave MMX state so later x87 code works
+	ret
+
--- a/codec/decoder/targets.mk
+++ b/codec/decoder/targets.mk
@@ -27,9 +27,9 @@
 
 ifeq ($(ASM_ARCH), x86)
 DECODER_ASM_SRCS=\
-	$(DECODER_SRCDIR)/core/asm/block_add.asm\
-	$(DECODER_SRCDIR)/core/asm/dct.asm\
-	$(DECODER_SRCDIR)/core/asm/intra_pred.asm\
+	$(DECODER_SRCDIR)/core/x86/block_add.asm\
+	$(DECODER_SRCDIR)/core/x86/dct.asm\
+	$(DECODER_SRCDIR)/core/x86/intra_pred.asm\
 
 DECODER_OBJS += $(DECODER_ASM_SRCS:.asm=.$(OBJ))
 endif
--- a/codec/encoder/core/asm/coeff.asm
+++ /dev/null
@@ -1,459 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2010-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  memzero.asm
-;*
-;*  Abstract
-;*     cavlc
-;*
-;*  History
-;*      09/08/2010 Created
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-
-
-%ifdef X86_32
-SECTION .rodata align=16
-
-align 16
-sse2_b8 db 8, 8, 8, 8, 8, 8, 8, 8
-
-ALIGN  16
-sse2_b_1 db -1, -1, -1, -1, -1, -1, -1, 0, -1, -1, -1, -1, -1, -1, -1, -1
-
-align 16
-byte_1pos_table:
-	db 0,0,0,0,0,0,0,0, ;0
-	db 0,0,0,0,0,0,0,1, ;1
-	db 1,0,0,0,0,0,0,1, ;2
-	db 1,0,0,0,0,0,0,2, ;3
-	db 2,0,0,0,0,0,0,1, ;4
-	db 2,0,0,0,0,0,0,2, ;5
-	db 2,1,0,0,0,0,0,2, ;6
-	db 2,1,0,0,0,0,0,3, ;7
-	db 3,0,0,0,0,0,0,1, ;8
-	db 3,0,0,0,0,0,0,2, ;9
-	db 3,1,0,0,0,0,0,2, ;10
-	db 3,1,0,0,0,0,0,3, ;11
-	db 3,2,0,0,0,0,0,2, ;12
-	db 3,2,0,0,0,0,0,3, ;13
-	db 3,2,1,0,0,0,0,3, ;14
-	db 3,2,1,0,0,0,0,4, ;15
-	db 4,0,0,0,0,0,0,1, ;16
-	db 4,0,0,0,0,0,0,2, ;17
-	db 4,1,0,0,0,0,0,2, ;18
-	db 4,1,0,0,0,0,0,3, ;19
-	db 4,2,0,0,0,0,0,2, ;20
-	db 4,2,0,0,0,0,0,3, ;21
-	db 4,2,1,0,0,0,0,3, ;22
-	db 4,2,1,0,0,0,0,4, ;23
-	db 4,3,0,0,0,0,0,2, ;24
-	db 4,3,0,0,0,0,0,3, ;25
-	db 4,3,1,0,0,0,0,3, ;26
-	db 4,3,1,0,0,0,0,4, ;27
-	db 4,3,2,0,0,0,0,3, ;28
-	db 4,3,2,0,0,0,0,4, ;29
-	db 4,3,2,1,0,0,0,4, ;30
-	db 4,3,2,1,0,0,0,5, ;31
-	db 5,0,0,0,0,0,0,1, ;32
-	db 5,0,0,0,0,0,0,2, ;33
-	db 5,1,0,0,0,0,0,2, ;34
-	db 5,1,0,0,0,0,0,3, ;35
-	db 5,2,0,0,0,0,0,2, ;36
-	db 5,2,0,0,0,0,0,3, ;37
-	db 5,2,1,0,0,0,0,3, ;38
-	db 5,2,1,0,0,0,0,4, ;39
-	db 5,3,0,0,0,0,0,2, ;40
-	db 5,3,0,0,0,0,0,3, ;41
-	db 5,3,1,0,0,0,0,3, ;42
-	db 5,3,1,0,0,0,0,4, ;43
-	db 5,3,2,0,0,0,0,3, ;44
-	db 5,3,2,0,0,0,0,4, ;45
-	db 5,3,2,1,0,0,0,4, ;46
-	db 5,3,2,1,0,0,0,5, ;47
-	db 5,4,0,0,0,0,0,2, ;48
-	db 5,4,0,0,0,0,0,3, ;49
-	db 5,4,1,0,0,0,0,3, ;50
-	db 5,4,1,0,0,0,0,4, ;51
-	db 5,4,2,0,0,0,0,3, ;52
-	db 5,4,2,0,0,0,0,4, ;53
-	db 5,4,2,1,0,0,0,4, ;54
-	db 5,4,2,1,0,0,0,5, ;55
-	db 5,4,3,0,0,0,0,3, ;56
-	db 5,4,3,0,0,0,0,4, ;57
-	db 5,4,3,1,0,0,0,4, ;58
-	db 5,4,3,1,0,0,0,5, ;59
-	db 5,4,3,2,0,0,0,4, ;60
-	db 5,4,3,2,0,0,0,5, ;61
-	db 5,4,3,2,1,0,0,5, ;62
-	db 5,4,3,2,1,0,0,6, ;63
-	db 6,0,0,0,0,0,0,1, ;64
-	db 6,0,0,0,0,0,0,2, ;65
-	db 6,1,0,0,0,0,0,2, ;66
-	db 6,1,0,0,0,0,0,3, ;67
-	db 6,2,0,0,0,0,0,2, ;68
-	db 6,2,0,0,0,0,0,3, ;69
-	db 6,2,1,0,0,0,0,3, ;70
-	db 6,2,1,0,0,0,0,4, ;71
-	db 6,3,0,0,0,0,0,2, ;72
-	db 6,3,0,0,0,0,0,3, ;73
-	db 6,3,1,0,0,0,0,3, ;74
-	db 6,3,1,0,0,0,0,4, ;75
-	db 6,3,2,0,0,0,0,3, ;76
-	db 6,3,2,0,0,0,0,4, ;77
-	db 6,3,2,1,0,0,0,4, ;78
-	db 6,3,2,1,0,0,0,5, ;79
-	db 6,4,0,0,0,0,0,2, ;80
-	db 6,4,0,0,0,0,0,3, ;81
-	db 6,4,1,0,0,0,0,3, ;82
-	db 6,4,1,0,0,0,0,4, ;83
-	db 6,4,2,0,0,0,0,3, ;84
-	db 6,4,2,0,0,0,0,4, ;85
-	db 6,4,2,1,0,0,0,4, ;86
-	db 6,4,2,1,0,0,0,5, ;87
-	db 6,4,3,0,0,0,0,3, ;88
-	db 6,4,3,0,0,0,0,4, ;89
-	db 6,4,3,1,0,0,0,4, ;90
-	db 6,4,3,1,0,0,0,5, ;91
-	db 6,4,3,2,0,0,0,4, ;92
-	db 6,4,3,2,0,0,0,5, ;93
-	db 6,4,3,2,1,0,0,5, ;94
-	db 6,4,3,2,1,0,0,6, ;95
-	db 6,5,0,0,0,0,0,2, ;96
-	db 6,5,0,0,0,0,0,3, ;97
-	db 6,5,1,0,0,0,0,3, ;98
-	db 6,5,1,0,0,0,0,4, ;99
-	db 6,5,2,0,0,0,0,3, ;100
-	db 6,5,2,0,0,0,0,4, ;101
-	db 6,5,2,1,0,0,0,4, ;102
-	db 6,5,2,1,0,0,0,5, ;103
-	db 6,5,3,0,0,0,0,3, ;104
-	db 6,5,3,0,0,0,0,4, ;105
-	db 6,5,3,1,0,0,0,4, ;106
-	db 6,5,3,1,0,0,0,5, ;107
-	db 6,5,3,2,0,0,0,4, ;108
-	db 6,5,3,2,0,0,0,5, ;109
-	db 6,5,3,2,1,0,0,5, ;110
-	db 6,5,3,2,1,0,0,6, ;111
-	db 6,5,4,0,0,0,0,3, ;112
-	db 6,5,4,0,0,0,0,4, ;113
-	db 6,5,4,1,0,0,0,4, ;114
-	db 6,5,4,1,0,0,0,5, ;115
-	db 6,5,4,2,0,0,0,4, ;116
-	db 6,5,4,2,0,0,0,5, ;117
-	db 6,5,4,2,1,0,0,5, ;118
-	db 6,5,4,2,1,0,0,6, ;119
-	db 6,5,4,3,0,0,0,4, ;120
-	db 6,5,4,3,0,0,0,5, ;121
-	db 6,5,4,3,1,0,0,5, ;122
-	db 6,5,4,3,1,0,0,6, ;123
-	db 6,5,4,3,2,0,0,5, ;124
-	db 6,5,4,3,2,0,0,6, ;125
-	db 6,5,4,3,2,1,0,6, ;126
-	db 6,5,4,3,2,1,0,7, ;127
-	db 7,0,0,0,0,0,0,1, ;128
-	db 7,0,0,0,0,0,0,2, ;129
-	db 7,1,0,0,0,0,0,2, ;130
-	db 7,1,0,0,0,0,0,3, ;131
-	db 7,2,0,0,0,0,0,2, ;132
-	db 7,2,0,0,0,0,0,3, ;133
-	db 7,2,1,0,0,0,0,3, ;134
-	db 7,2,1,0,0,0,0,4, ;135
-	db 7,3,0,0,0,0,0,2, ;136
-	db 7,3,0,0,0,0,0,3, ;137
-	db 7,3,1,0,0,0,0,3, ;138
-	db 7,3,1,0,0,0,0,4, ;139
-	db 7,3,2,0,0,0,0,3, ;140
-	db 7,3,2,0,0,0,0,4, ;141
-	db 7,3,2,1,0,0,0,4, ;142
-	db 7,3,2,1,0,0,0,5, ;143
-	db 7,4,0,0,0,0,0,2, ;144
-	db 7,4,0,0,0,0,0,3, ;145
-	db 7,4,1,0,0,0,0,3, ;146
-	db 7,4,1,0,0,0,0,4, ;147
-	db 7,4,2,0,0,0,0,3, ;148
-	db 7,4,2,0,0,0,0,4, ;149
-	db 7,4,2,1,0,0,0,4, ;150
-	db 7,4,2,1,0,0,0,5, ;151
-	db 7,4,3,0,0,0,0,3, ;152
-	db 7,4,3,0,0,0,0,4, ;153
-	db 7,4,3,1,0,0,0,4, ;154
-	db 7,4,3,1,0,0,0,5, ;155
-	db 7,4,3,2,0,0,0,4, ;156
-	db 7,4,3,2,0,0,0,5, ;157
-	db 7,4,3,2,1,0,0,5, ;158
-	db 7,4,3,2,1,0,0,6, ;159
-	db 7,5,0,0,0,0,0,2, ;160
-	db 7,5,0,0,0,0,0,3, ;161
-	db 7,5,1,0,0,0,0,3, ;162
-	db 7,5,1,0,0,0,0,4, ;163
-	db 7,5,2,0,0,0,0,3, ;164
-	db 7,5,2,0,0,0,0,4, ;165
-	db 7,5,2,1,0,0,0,4, ;166
-	db 7,5,2,1,0,0,0,5, ;167
-	db 7,5,3,0,0,0,0,3, ;168
-	db 7,5,3,0,0,0,0,4, ;169
-	db 7,5,3,1,0,0,0,4, ;170
-	db 7,5,3,1,0,0,0,5, ;171
-	db 7,5,3,2,0,0,0,4, ;172
-	db 7,5,3,2,0,0,0,5, ;173
-	db 7,5,3,2,1,0,0,5, ;174
-	db 7,5,3,2,1,0,0,6, ;175
-	db 7,5,4,0,0,0,0,3, ;176
-	db 7,5,4,0,0,0,0,4, ;177
-	db 7,5,4,1,0,0,0,4, ;178
-	db 7,5,4,1,0,0,0,5, ;179
-	db 7,5,4,2,0,0,0,4, ;180
-	db 7,5,4,2,0,0,0,5, ;181
-	db 7,5,4,2,1,0,0,5, ;182
-	db 7,5,4,2,1,0,0,6, ;183
-	db 7,5,4,3,0,0,0,4, ;184
-	db 7,5,4,3,0,0,0,5, ;185
-	db 7,5,4,3,1,0,0,5, ;186
-	db 7,5,4,3,1,0,0,6, ;187
-	db 7,5,4,3,2,0,0,5, ;188
-	db 7,5,4,3,2,0,0,6, ;189
-	db 7,5,4,3,2,1,0,6, ;190
-	db 7,5,4,3,2,1,0,7, ;191
-	db 7,6,0,0,0,0,0,2, ;192
-	db 7,6,0,0,0,0,0,3, ;193
-	db 7,6,1,0,0,0,0,3, ;194
-	db 7,6,1,0,0,0,0,4, ;195
-	db 7,6,2,0,0,0,0,3, ;196
-	db 7,6,2,0,0,0,0,4, ;197
-	db 7,6,2,1,0,0,0,4, ;198
-	db 7,6,2,1,0,0,0,5, ;199
-	db 7,6,3,0,0,0,0,3, ;200
-	db 7,6,3,0,0,0,0,4, ;201
-	db 7,6,3,1,0,0,0,4, ;202
-	db 7,6,3,1,0,0,0,5, ;203
-	db 7,6,3,2,0,0,0,4, ;204
-	db 7,6,3,2,0,0,0,5, ;205
-	db 7,6,3,2,1,0,0,5, ;206
-	db 7,6,3,2,1,0,0,6, ;207
-	db 7,6,4,0,0,0,0,3, ;208
-	db 7,6,4,0,0,0,0,4, ;209
-	db 7,6,4,1,0,0,0,4, ;210
-	db 7,6,4,1,0,0,0,5, ;211
-	db 7,6,4,2,0,0,0,4, ;212
-	db 7,6,4,2,0,0,0,5, ;213
-	db 7,6,4,2,1,0,0,5, ;214
-	db 7,6,4,2,1,0,0,6, ;215
-	db 7,6,4,3,0,0,0,4, ;216
-	db 7,6,4,3,0,0,0,5, ;217
-	db 7,6,4,3,1,0,0,5, ;218
-	db 7,6,4,3,1,0,0,6, ;219
-	db 7,6,4,3,2,0,0,5, ;220
-	db 7,6,4,3,2,0,0,6, ;221
-	db 7,6,4,3,2,1,0,6, ;222
-	db 7,6,4,3,2,1,0,7, ;223
-	db 7,6,5,0,0,0,0,3, ;224
-	db 7,6,5,0,0,0,0,4, ;225
-	db 7,6,5,1,0,0,0,4, ;226
-	db 7,6,5,1,0,0,0,5, ;227
-	db 7,6,5,2,0,0,0,4, ;228
-	db 7,6,5,2,0,0,0,5, ;229
-	db 7,6,5,2,1,0,0,5, ;230
-	db 7,6,5,2,1,0,0,6, ;231
-	db 7,6,5,3,0,0,0,4, ;232
-	db 7,6,5,3,0,0,0,5, ;233
-	db 7,6,5,3,1,0,0,5, ;234
-	db 7,6,5,3,1,0,0,6, ;235
-	db 7,6,5,3,2,0,0,5, ;236
-	db 7,6,5,3,2,0,0,6, ;237
-	db 7,6,5,3,2,1,0,6, ;238
-	db 7,6,5,3,2,1,0,7, ;239
-	db 7,6,5,4,0,0,0,4, ;240
-	db 7,6,5,4,0,0,0,5, ;241
-	db 7,6,5,4,1,0,0,5, ;242
-	db 7,6,5,4,1,0,0,6, ;243
-	db 7,6,5,4,2,0,0,5, ;244
-	db 7,6,5,4,2,0,0,6, ;245
-	db 7,6,5,4,2,1,0,6, ;246
-	db 7,6,5,4,2,1,0,7, ;247
-	db 7,6,5,4,3,0,0,5, ;248
-	db 7,6,5,4,3,0,0,6, ;249
-	db 7,6,5,4,3,1,0,6, ;250
-	db 7,6,5,4,3,1,0,7, ;251
-	db 7,6,5,4,3,2,0,6, ;252
-	db 7,6,5,4,3,2,0,7, ;253
-	db 7,6,5,4,3,2,1,7, ;254
-	db 7,6,5,4,3,2,1,8, ;255
-
-;***********************************************************************
-; Code
-;***********************************************************************
-SECTION .text
-
-
-
-;***********************************************************************
-;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
-;***********************************************************************
-WELS_EXTERN CavlcParamCal_sse2
-	push ebx
-	push edi
-	push esi
-
-	mov			eax,	[esp+16]	;coffLevel
-	mov			edi,	[esp+24]	;Level
-	mov			ebx,	[esp+32]	;endIdx
-	cmp			ebx,	3
-	jne			.Level16
-	pxor		xmm1,	xmm1
-	movq		xmm0,	[eax]	; removed QWORD
-	jmp			.Cal_begin
-.Level16:
-	movdqa		xmm0,	[eax]
-	movdqa		xmm1,	[eax+16]
-.Cal_begin:
-    movdqa		xmm2,	xmm0
-	packsswb	xmm0,	xmm1
-	movdqa		xmm4,	xmm0
-	pxor		xmm3,	xmm3
-	pcmpgtb		xmm0,	xmm3
-	pcmpgtb		xmm3,	xmm4
-	por			xmm0,	xmm3
-	pmovmskb	edx,	xmm0
-	cmp			edx,	0
-	je near   .return
-	movdqa		xmm6,	[sse2_b_1]
-	pcmpeqw		xmm7,	xmm7	;generate -1
-    mov			ebx,	0xff
-    ;pinsrw		xmm6,	ebx,	3
-
-    mov       bl,   dh
-
-	lea       ebx,  [byte_1pos_table+8*ebx]
-	movq      xmm0, [ebx]
-	pextrw    ecx,  xmm0, 3
-	shr       ecx,  8
-    mov       dh,   cl
-
-.loopHighFind0:
-    cmp       ecx,   0
-    je        .loopHighFind0End
-    ;mov       esi, [ebx]
-    ;and       esi, 0xff
-    movzx	  esi, byte [ebx]
-    add       esi, 8
-    mov       esi, [eax+2*esi]
-    mov       [edi], si
-    add       edi,   2
-    ;add       ebx,   1
-    inc		  ebx
-    dec       ecx
-	jmp       .loopHighFind0
-.loopHighFind0End:
-    mov       cl,   dh
-    cmp       cl,   8
-	pand      xmm0, xmm6
-    jne       .LowByteFind0
-    sub       edi,   2
-    mov       esi,   [eax+16]
-    mov       [edi], esi
-    add       edi,   2
-.LowByteFind0:
-    and       edx,  0xff
-	lea       ebx,  [byte_1pos_table+8*edx]
-	movq      xmm1, [ebx]
-    pextrw    esi,  xmm1, 3
-    or        esi,  0xff
-    or        ecx,  0xff00
-    and       ecx,  esi
-    shr       esi,  8
-    pand      xmm1, xmm6
-.loopLowFind0:
-    cmp       esi, 0
-    je        .loopLowFind0End
-	;mov       edx, [ebx]
-	;and       edx, 0xff
-	movzx	  edx,	byte [ebx]
-	mov       edx, [eax+2*edx]
-	mov       [edi], dx
-	add       edi,   2
-	;add       ebx,   1
-	inc		  ebx
-    dec       esi
-	jmp       .loopLowFind0
-.loopLowFind0End:
-    cmp       ch,  8
-    jne       .getLevelEnd
-    sub       edi, 2
-    mov       edx, [eax]
-    mov       [edi], dx
-.getLevelEnd:
-	mov      edx, [esp+28]	;total_coeffs
-    ;mov      ebx,   ecx
-    ;and      ebx,   0xff
-    movzx	 ebx,	byte cl
-    add      cl,    ch
-	mov      [edx], cl
-;getRun
-    movq     xmm5, [sse2_b8]
-    paddb    xmm0, xmm5
-    pxor     xmm2, xmm2
-    pxor     xmm3, xmm3
-    mov      eax,  8
-    sub      eax,  ebx
-    shl      eax,  3
-    shl      ebx,  3
-	pinsrw   xmm2, ebx, 0
-    pinsrw   xmm3, eax, 0
-    psllq    xmm0, xmm3
-    psrlq    xmm0, xmm3
-    movdqa   xmm4, xmm1
-    psllq    xmm1, xmm2
-    psrlq    xmm4, xmm3
-    punpcklqdq xmm1, xmm4
-    por      xmm0,  xmm1
-
-    pextrw   eax,   xmm0, 0
-    and		 eax,   0xff
-    inc      eax
-    sub      al,    cl
-	movdqa   xmm1,  xmm0
-	paddb    xmm1,  xmm7
-	psrldq   xmm0,  1
-	psubb    xmm1,  xmm0
-    mov      ecx,   [esp+20] ;run
-	movdqa   [ecx], xmm1
-;getRunEnd
-.return:
-	pop esi
-	pop edi
-	pop ebx
-	ret
-%endif
--- a/codec/encoder/core/asm/dct.asm
+++ /dev/null
@@ -1,504 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        ?Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        ?Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  dct.asm
-;*
-;*  Abstract
-;*      WelsDctFourT4_sse2
-;*
-;*  History
-;*      8/4/2009 Created
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-SECTION .rodata align=16
-
-;***********************************************************************
-; Constant
-;***********************************************************************
-
-align 16
-SSE2_DeQuant8 dw  10, 13, 10, 13, 13, 16, 13, 16,
-			dw	10, 13, 10, 13, 13, 16, 13, 16,
-            dw  11, 14, 11, 14, 14, 18, 14, 18,
-			dw  11, 14, 11, 14, 14, 18, 14, 18,
-			dw  13, 16, 13, 16, 16, 20, 16, 20,
-			dw  13, 16, 13, 16, 16, 20, 16, 20,
-            dw  14, 18, 14, 18, 18, 23, 18, 23,
-			dw  14, 18, 14, 18, 18, 23, 18, 23,
-			dw  16, 20, 16, 20, 20, 25, 20, 25,
-			dw  16, 20, 16, 20, 20, 25, 20, 25,
-            dw  18, 23, 18, 23, 23, 29, 23, 29,
-			dw  18, 23, 18, 23, 23, 29, 23, 29
-
-
-;***********************************************************************
-; MMX functions
-;***********************************************************************
-
-%macro MMX_LoadDiff4P 5
-	movd        %1, [%3]
-	movd        %2, [%4]
-	punpcklbw   %1, %5
-	punpcklbw   %2, %5
-	psubw       %1, %2
-%endmacro
-
-%macro MMX_LoadDiff4x4P 10 ;d0, d1, d2, d3, pix1address, pix1stride, pix2address, pix2stride, tmp(mm), 0(mm)
-	MMX_LoadDiff4P %1, %9, %5,    %7,    %10
-	MMX_LoadDiff4P %2, %9, %5+%6, %7+%8, %10
-	lea  %5, [%5+2*%6]
-	lea  %7, [%7+2*%8]
-	MMX_LoadDiff4P %3, %9, %5,    %7,    %10
-	MMX_LoadDiff4P %4, %9, %5+%6, %7+%8, %10
-%endmacro
-
-%macro MMX_SumSubMul2 3
-	movq    %3, %1
-	psllw   %1, $01
-	paddw   %1, %2
-	psllw   %2, $01
-    psubw   %3, %2
-%endmacro
-
-%macro MMX_SumSubDiv2 3
-    movq    %3, %2
-    psraw   %3, $01
-    paddw   %3, %1
-    psraw   %1, $01
-    psubw   %1, %2
-%endmacro
-
-%macro MMX_SumSub 3
-	movq    %3, %2
-    psubw   %2, %1
-    paddw   %1, %3
-%endmacro
-
-%macro MMX_DCT 6
-    MMX_SumSub		%4, %1, %6
-    MMX_SumSub		%3, %2, %6
-    MMX_SumSub		%3, %4, %6
-    MMX_SumSubMul2  %1, %2, %5
-%endmacro
-
-%macro MMX_IDCT 6
-    MMX_SumSub      %4, %5, %6
-    MMX_SumSubDiv2  %3, %2, %1
-    MMX_SumSub		%1, %4, %6
-	MMX_SumSub		%3, %5, %6
-%endmacro
-
-%macro MMX_StoreDiff4P 6
-    movd       %2, %6
-    punpcklbw  %2, %4
-    paddw      %1, %3
-    psraw      %1, $06
-    paddsw     %1, %2
-    packuswb   %1, %2
-    movd       %5, %1
-%endmacro
-SECTION .text
-;***********************************************************************
-;   void WelsDctT4_mmx( int16_t *pDct[4], uint8_t *pix1, int32_t i_pix1, uint8_t *pix2, int32_t i_pix2 )
-;***********************************************************************
-WELS_EXTERN WelsDctT4_mmx
-    %assign push_num 0
-    LOAD_5_PARA
-    SIGN_EXTENSION r2, r2d
-    SIGN_EXTENSION r4, r4d
-    WELS_Zero    mm7
-
-    MMX_LoadDiff4x4P mm1, mm2, mm3, mm4, r1, r2, r3, r4, mm0, mm7
-
-    MMX_DCT			mm1, mm2, mm3 ,mm4, mm5, mm6
-    MMX_Trans4x4W	mm3, mm1, mm4, mm5, mm2
-
-    MMX_DCT			mm3, mm5, mm2 ,mm4, mm1, mm6
-    MMX_Trans4x4W	mm2, mm3, mm4, mm1, mm5
-
-    movq    [r0+ 0],   mm2
-    movq    [r0+ 8],   mm1
-    movq    [r0+16],   mm5
-    movq    [r0+24],   mm4
-    WELSEMMS
-    LOAD_5_PARA_POP
-    ret
-
-
-;***********************************************************************
-;   void WelsIDctT4Rec_mmx(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs)
-;***********************************************************************
-WELS_EXTERN WelsIDctT4Rec_mmx
-    %assign push_num 0
-    LOAD_5_PARA
-    SIGN_EXTENSION r1, r1d
-    SIGN_EXTENSION r3, r3d
-    movq    mm0, [r4+ 0]
-    movq    mm1, [r4+ 8]
-    movq    mm2, [r4+16]
-    movq    mm3, [r4+24]
-
-	MMX_Trans4x4W		mm0, mm1, mm2, mm3, mm4
-	MMX_IDCT			mm1, mm2, mm3, mm4, mm0, mm6
-    MMX_Trans4x4W		mm1, mm3, mm0, mm4, mm2
-	MMX_IDCT			mm3, mm0, mm4, mm2, mm1, mm6
-
-    WELS_Zero			mm7
-    WELS_DW32			mm6
-
-    MMX_StoreDiff4P		mm3, mm0, mm6, mm7, [r0], [r2]
-    MMX_StoreDiff4P		mm4, mm0, mm6, mm7, [r0+r1], [r2+r3]
-    lea     r0, [r0+2*r1]
-    lea     r2, [r2+2*r3]
-    MMX_StoreDiff4P		mm1, mm0, mm6, mm7, [r0], [r2]
-    MMX_StoreDiff4P		mm2, mm0, mm6, mm7, [r0+r1], [r2+r3]
-
-	WELSEMMS
-    LOAD_5_PARA_POP
-    ret
-
-
-;***********************************************************************
-; SSE2 functions
-;***********************************************************************
-%macro SSE2_Store4x8p 6
-	SSE2_XSawp qdq, %2, %3, %6
-	SSE2_XSawp qdq, %4, %5, %3
-	MOVDQ    [%1+0x00], %2
-	MOVDQ    [%1+0x10], %4
-	MOVDQ    [%1+0x20], %6
-	MOVDQ    [%1+0x30], %3
-%endmacro
-
-%macro SSE2_Load4x8p 6
-	MOVDQ    %2,	[%1+0x00]
-	MOVDQ    %4,	[%1+0x10]
-	MOVDQ    %6,	[%1+0x20]
-	MOVDQ    %3,	[%1+0x30]
-	SSE2_XSawp qdq, %4, %3, %5
-	SSE2_XSawp qdq, %2, %6, %3
-%endmacro
-
-%macro SSE2_SumSubMul2 3
-    movdqa  %3, %1
-    paddw   %1, %1
-    paddw   %1, %2
-    psubw   %3, %2
-    psubw   %3, %2
-%endmacro
-
-%macro SSE2_SumSubDiv2 4
-    movdqa  %4, %1
-    movdqa  %3, %2
-    psraw   %2, $01
-    psraw   %4, $01
-    paddw   %1, %2
-    psubw   %4, %3
-%endmacro
-
-%macro SSE2_StoreDiff8p 6
-    paddw       %1, %3
-    psraw       %1, $06
-    movq		%2, %6
-    punpcklbw   %2, %4
-    paddsw      %2, %1
-    packuswb    %2, %2
-    movq	    %5, %2
-%endmacro
-
-%macro SSE2_StoreDiff8p 5
-    movq		%2, %5
-    punpcklbw   %2, %3
-    paddsw      %2, %1
-    packuswb    %2, %2
-    movq	    %4, %2
-%endmacro
-
-%macro SSE2_Load8DC	6
-	movdqa		%1,		%6		; %1 = dc0 dc1
-	paddw       %1,		%5
-    psraw       %1,		$06		; (dc + 32) >> 6
-
-    movdqa		%2,		%1
-    psrldq		%2,		4
- 	punpcklwd	%2,		%2
-	punpckldq	%2,		%2		; %2 = dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
-
-    movdqa		%3,		%1
-    psrldq		%3,		8
- 	punpcklwd	%3,		%3
-	punpckldq	%3,		%3		; %3 = dc4 dc4 dc4 dc4 dc5 dc5 dc5 dc5
-
-	movdqa		%4,		%1
-    psrldq		%4,		12
- 	punpcklwd	%4,		%4
-	punpckldq	%4,		%4		; %4 = dc6 dc6 dc6 dc6 dc7 dc7 dc7 dc7
-
-	punpcklwd	%1,		%1
-	punpckldq	%1,		%1		; %1 = dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
-%endmacro
-
-%macro SSE2_DCT 6
-    SSE2_SumSub		%6, %3,	%5
-	SSE2_SumSub		%1, %2, %5
-	SSE2_SumSub		%3, %2, %5
-	SSE2_SumSubMul2		%6, %1, %4
-%endmacro
-
-%macro SSE2_IDCT 7
-    SSE2_SumSub       %7, %2, %6
-    SSE2_SumSubDiv2     %1, %3, %5, %4
-    SSE2_SumSub	     %2, %1, %5
-    SSE2_SumSub		 %7, %4, %5
-%endmacro
-
-;***********************************************************************
-; void WelsDctFourT4_sse2(int16_t *pDct, uint8_t *pix1, int32_t i_pix1, uint8_t *pix2, int32_t i_pix2 )
-;***********************************************************************
-WELS_EXTERN WelsDctFourT4_sse2
-    %assign push_num 0
-    LOAD_5_PARA
-    PUSH_XMM 8
-    SIGN_EXTENSION r2, r2d
-    SIGN_EXTENSION r4, r4d
-    pxor    xmm7, xmm7
-	;Load 4x8
-	SSE2_LoadDiff8P    xmm0, xmm6, xmm7, [r1], [r3]
-    SSE2_LoadDiff8P    xmm1, xmm6, xmm7, [r1+r2], [r3+r4]
-	lea		r1, [r1 + 2 * r2]
-	lea		r3, [r3 + 2 * r4]
-	SSE2_LoadDiff8P    xmm2, xmm6, xmm7, [r1], [r3]
-    SSE2_LoadDiff8P    xmm3, xmm6, xmm7, [r1+r2], [r3+r4]
-
-	SSE2_DCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
-	SSE2_TransTwo4x4W	xmm2, xmm0, xmm3, xmm4, xmm1
-	SSE2_DCT			xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
-	SSE2_TransTwo4x4W	xmm4, xmm2, xmm1, xmm3, xmm0
-
-	SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
-
-	lea		r1, [r1 + 2 * r2]
-	lea		r3, [r3 + 2 * r4]
-
-	;Load 4x8
-	SSE2_LoadDiff8P    xmm0, xmm6, xmm7, [r1      ], [r3    ]
-    SSE2_LoadDiff8P    xmm1, xmm6, xmm7, [r1+r2  ], [r3+r4]
-	lea		r1, [r1 + 2 * r2]
-	lea		r3, [r3 + 2 * r4]
-    SSE2_LoadDiff8P    xmm2, xmm6, xmm7, [r1], [r3]
-    SSE2_LoadDiff8P    xmm3, xmm6, xmm7, [r1+r2], [r3+r4]
-
-	SSE2_DCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
-	SSE2_TransTwo4x4W	xmm2, xmm0, xmm3, xmm4, xmm1
-    SSE2_DCT			xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
-	SSE2_TransTwo4x4W	xmm4, xmm2, xmm1, xmm3, xmm0
-
-	lea		r0, [r0+64]
-	SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
-
-	POP_XMM
-	LOAD_5_PARA_POP
-    ret
-
-
-;***********************************************************************
-; void WelsIDctFourT4Rec_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs);
-;***********************************************************************
-WELS_EXTERN WelsIDctFourT4Rec_sse2
-	%assign push_num 0
-	LOAD_5_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	;Load 4x8
-	SSE2_Load4x8p  r4, xmm0, xmm1, xmm4, xmm2, xmm5
-
-	SSE2_TransTwo4x4W	xmm0, xmm1, xmm4, xmm2, xmm3
-  	SSE2_IDCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
-    SSE2_TransTwo4x4W	xmm1, xmm4, xmm0, xmm2, xmm3
-    SSE2_IDCT			xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
-
-	WELS_Zero			xmm7
-    WELS_DW32			xmm6
-
-	SSE2_StoreDiff8p   xmm4, xmm5, xmm6, xmm7, [r0		],	[r2]
-	SSE2_StoreDiff8p   xmm0, xmm5, xmm6, xmm7, [r0 + r1	],	[r2 + r3]
-	lea		r0, [r0 + 2 * r1]
-	lea		r2, [r2 + 2 * r3]
-	SSE2_StoreDiff8p   xmm1, xmm5, xmm6, xmm7, [r0],			[r2]
-	SSE2_StoreDiff8p   xmm2, xmm5, xmm6, xmm7, [r0 + r1	],	[r2 + r3]
-
-    add		r4, 64
-	lea		r0, [r0 + 2 * r1]
-	lea		r2, [r2 + 2 * r3]
-   	SSE2_Load4x8p  r4, xmm0, xmm1, xmm4, xmm2, xmm5
-
-	SSE2_TransTwo4x4W   xmm0, xmm1, xmm4, xmm2, xmm3
-	SSE2_IDCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
-    SSE2_TransTwo4x4W   xmm1, xmm4, xmm0, xmm2, xmm3
-	SSE2_IDCT			xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
-
-	WELS_Zero			xmm7
-    WELS_DW32			xmm6
-
-	SSE2_StoreDiff8p   xmm4, xmm5, xmm6, xmm7, [r0		],	[r2]
-	SSE2_StoreDiff8p   xmm0, xmm5, xmm6, xmm7, [r0 + r1	],	[r2 + r3]
-	lea		r0, [r0 + 2 * r1]
-	lea		r2, [r2 + 2 * r3]
-	SSE2_StoreDiff8p   xmm1, xmm5, xmm6, xmm7, [r0],			[r2]
-	SSE2_StoreDiff8p   xmm2, xmm5, xmm6, xmm7, [r0 + r1],	[r2 + r3]
-	POP_XMM
-	LOAD_5_PARA_POP
-   ; pop		esi
-   ; pop		ebx
-    ret
-
-%macro SSE2_StoreDiff4x8p 8
-   	SSE2_StoreDiff8p    %1, %3, %4, [%5],			[%6]
-	SSE2_StoreDiff8p    %1, %3, %4, [%5 + %7],		[%6 + %8]
-	SSE2_StoreDiff8p    %2, %3, %4, [%5 + 8],		[%6 + 8]
-	SSE2_StoreDiff8p    %2, %3, %4, [%5 + %7 + 8],	[%6 + %8 + 8]
-%endmacro
-
- ;***********************************************************************
-; void WelsIDctRecI16x16Dc_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *dct_dc)
-;***********************************************************************
-WELS_EXTERN WelsIDctRecI16x16Dc_sse2
-	%assign push_num 0
-	LOAD_5_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	pxor		xmm7,		xmm7
-    WELS_DW32	xmm6
-
-	SSE2_Load8DC			xmm0, xmm1, xmm2, xmm3, xmm6, [r4]
-	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
-
-	lea			r0,		[r0 + 2 * r1]
-	lea			r2,		[r2 + 2 * r3]
-	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
-
-	lea			r0,		[r0 + 2 * r1]
-	lea			r2,		[r2 + 2 * r3]
-	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
-
-	lea			r0,		[r0 + 2 * r1]
-	lea			r2,		[r2 + 2 * r3]
-	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
-
-	SSE2_Load8DC			xmm0, xmm1, xmm2, xmm3, xmm6, [r4 + 16]
-	lea			r0,		[r0 + 2 * r1]
-	lea			r2,		[r2 + 2 * r3]
-	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
-
-	lea			r0,		[r0 + 2 * r1]
-	lea			r2,		[r2 + 2 * r3]
-	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
-
-	lea			r0,		[r0 + 2 * r1]
-	lea			r2,		[r2 + 2 * r3]
-	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
-
-	lea			r0,		[r0 + 2 * r1]
-	lea			r2,		[r2 + 2 * r3]
-	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
-	POP_XMM
-	LOAD_5_PARA_POP
-    ret
-
-
-
-%macro SSE2_SumSubD 3
-	movdqa  %3, %2
-    paddd   %2, %1
-    psubd   %1, %3
-%endmacro
-
-%macro SSE2_SumSubDiv2D 4
-	paddd   %1, %2
-	paddd	%1, %3
-	psrad	%1,	 1
-	movdqa	%4, %1
-	psubd	%4, %2
-%endmacro
-%macro		SSE2_Load4Col	5
-	movsx		r2,		WORD[%5]
- 	movd		%1,			r2d
- 	movsx		r2,		WORD[%5 + 0x20]
- 	movd		%2,			r2d
-	punpckldq	%1,			%2
-	movsx		r2,		WORD[%5 + 0x80]
- 	movd		%3,			r2d
-	movsx		r2,		WORD[%5 + 0xa0]
- 	movd		%4,			r2d
-	punpckldq	%3,			%4
-	punpcklqdq	%1,			%3
-%endmacro
-
-;***********************************************************************
-;void WelsHadamardT4Dc_sse2( int16_t *luma_dc, int16_t *pDct)
-;***********************************************************************
-WELS_EXTERN WelsHadamardT4Dc_sse2
-		%assign push_num 0
-		LOAD_2_PARA
-		PUSH_XMM 8
-		SSE2_Load4Col	    xmm1, xmm5, xmm6, xmm0, r1
-		SSE2_Load4Col	    xmm2, xmm5, xmm6, xmm0, r1 + 0x40
-		SSE2_Load4Col	    xmm3, xmm5, xmm6, xmm0, r1 + 0x100
-		SSE2_Load4Col	    xmm4, xmm5, xmm6, xmm0, r1 + 0x140
-
-		SSE2_SumSubD		xmm1, xmm2, xmm7
-		SSE2_SumSubD		xmm3, xmm4, xmm7
-		SSE2_SumSubD		xmm2, xmm4, xmm7
-		SSE2_SumSubD		xmm1, xmm3, xmm7
-
-		SSE2_Trans4x4D		xmm4, xmm2, xmm1, xmm3, xmm5	; pOut: xmm4,xmm3,xmm5,xmm1
-
-		SSE2_SumSubD		xmm4, xmm3, xmm7
-		SSE2_SumSubD		xmm5, xmm1, xmm7
-
-		WELS_DD1 xmm6
-		SSE2_SumSubDiv2D	xmm3, xmm1, xmm6, xmm0			; pOut: xmm3 = (xmm3+xmm1+1)/2, xmm0 = (xmm3-xmm1+1)/2
-		SSE2_SumSubDiv2D	xmm4, xmm5, xmm6, xmm1			; pOut: xmm4 = (xmm4+xmm5+1)/2, xmm1 = (xmm4-xmm5+1)/2
-        SSE2_Trans4x4D		xmm3, xmm0, xmm1, xmm4, xmm2	; pOut: xmm3,xmm4,xmm2,xmm1
-
-		packssdw	xmm3,	xmm4
-		packssdw	xmm2,	xmm1
-		movdqa	[r0+ 0],   xmm3
-		movdqa	[r0+16],   xmm2
-
-		POP_XMM
-		ret
--- a/codec/encoder/core/asm/intra_pred.asm
+++ /dev/null
@@ -1,1416 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  intra_pred.asm
-;*
-;*  Abstract
-;*      sse2 function for intra predict operations
-;*
-;*  History
-;*      18/09/2009 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-SECTION .rodata align=16
-
-align 16
-sse2_plane_inc_minus dw -7, -6, -5, -4, -3, -2, -1, 0
-align 16
-sse2_plane_inc dw 1, 2, 3, 4, 5, 6, 7, 8
-align 16
-sse2_plane_dec dw 8, 7, 6, 5, 4, 3, 2, 1
-
-; for chroma plane mode
-sse2_plane_inc_c dw 1, 2, 3, 4
-sse2_plane_dec_c dw 4, 3, 2, 1
-align 16
-sse2_plane_mul_b_c dw -3, -2, -1, 0, 1, 2, 3, 4
-
-align 16
-mmx_01bytes:		times 16	db 1
-
-align 16
-mmx_0x02: dw 0x02, 0x00, 0x00, 0x00
-
-
-;***********************************************************************
-; macros
-;***********************************************************************
-;dB 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
-;%1 will keep the last result
-%macro SSE_DB_1_2REG 2
-      pxor %1, %1
-      pcmpeqw %2, %2
-      psubb %1, %2
-%endmacro
-
-;xmm0, xmm1, xmm2, eax, ecx
-;lower 64 bits of xmm0 save the result
-%macro SSE2_PRED_H_4X4_TWO_LINE 5
-    movd		%1,	[%4-1]
-	movdqa		%3,	%1
-	punpcklbw	%1,	%3
-	movdqa		%3,	%1
-	punpcklbw	%1,	%3
-
-	;add			%4,	%5
-	movd		%2,	[%4+%5-1]
-	movdqa		%3,	%2
-	punpcklbw	%2,	%3
-	movdqa		%3,	%2
-	punpcklbw	%2,	%3
-	punpckldq	%1,	%2
-%endmacro
-
-%macro  SUMW_HORIZON1 2
-	movdqa      %2, %1
-	psrldq      %2, 8
-	paddusw     %1, %2
-	movdqa      %2, %1
-	psrldq      %2, 4
-	paddusw     %1, %2
-	movdqa      %2, %1
-	psrldq      %2, 2
-	paddusw     %1, %2
-%endmacro
-
-%macro	LOAD_COLUMN 6
-		movd	%1,	[%5]
-		movd	%2,	[%5+%6]
-		punpcklbw %1,	%2
-		lea		%5,	[%5+2*%6]
-		movd	%3,	[%5]
-		movd	%2,	[%5+%6]
-		punpcklbw %3,	%2
-		punpcklwd %1,	%3
-		lea		%5,	[%5+2*%6]
-		movd	%4,	[%5]
-		movd	%2,	[%5+%6]
-		punpcklbw %4,	%2
-		lea		%5,	[%5+2*%6]
-		movd	%3,	[%5]
-		movd	%2,	[%5+%6]
-		lea		%5,	[%5+2*%6]
-		punpcklbw %3,	%2
-		punpcklwd %4,	%3
-		punpckhdq %1,	%4
-%endmacro
-
-%macro  SUMW_HORIZON 3
-	movhlps		%2, %1			; x2 = xx xx xx xx d7 d6 d5 d4
-	paddw		%1, %2			; x1 = xx xx xx xx d37 d26 d15 d04
-	punpcklwd	%1, %3			; x1 =  d37  d26 d15 d04
-	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26
-	paddd		%1, %2			; x1 = xxxx xxxx d1357 d0246
-	pshuflw		%2, %1, 0x4e	; x2 = xxxx xxxx d0246 d1357
-	paddd		%1, %2			; x1 = xxxx xxxx xxxx  d01234567
-%endmacro
-
-
-%macro  COPY_16_TIMES 2
-		movdqa		%2,	[%1-16]
-		psrldq		%2,	15
-		pmuludq		%2,	[mmx_01bytes]
-		pshufd		%2,	%2, 0
-%endmacro
-
-%macro  COPY_16_TIMESS 3
-		movdqa		%2,	[%1+%3-16]
-		psrldq		%2,	15
-		pmuludq		%2,	[mmx_01bytes]
-		pshufd		%2,	%2, 0
-%endmacro
-
-%macro	LOAD_COLUMN_C 6
-		movd	%1,	[%5]
-		movd	%2,	[%5+%6]
-		punpcklbw %1,%2
-		lea		%5,	[%5+2*%6]
-		movd	%3,	[%5]
-		movd	%2,	[%5+%6]
-		punpcklbw %3,	%2
-		punpckhwd %1,	%3
-		lea		%5,	[%5+2*%6]
-%endmacro
-
-%macro LOAD_2_LEFT_AND_ADD 0
-        lea         r1, [r1+2*r2]
-        movzx		r4, byte [r1-0x01]
-        add			r3, r4
-        movzx		r4, byte [r1+r2-0x01]
-        add			r3, r4
-%endmacro
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-;***********************************************************************
-;   void WelsI4x4LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
-;
-;	pred must align to 16
-;***********************************************************************
-WELS_EXTERN WelsI4x4LumaPredH_sse2
-	push r3
-	%assign push_num 1
-	LOAD_3_PARA
-	SIGN_EXTENSION r2, r2d
-	movzx		r3,	byte [r1-1]
-	movd		xmm0,	r3d
-	pmuludq		xmm0,	[mmx_01bytes]
-
-	movzx		r3,	byte [r1+r2-1]
-	movd		xmm1,	r3d
-	pmuludq		xmm1,	[mmx_01bytes]
-
-	unpcklps	xmm0,	xmm1
-
-	lea			r1,	[r1+r2*2]
-	movzx		r3,	byte [r1-1]
-	movd		xmm2,	r3d
-	pmuludq		xmm2,	[mmx_01bytes]
-
-	movzx		r3,	byte [r1+r2-1]
-	movd		xmm3,	r3d
-	pmuludq		xmm3,	[mmx_01bytes]
-
-	unpcklps	xmm2,	xmm3
-	unpcklpd	xmm0,	xmm2
-
-	movdqa		[r0],	xmm0
-	pop r3
-	ret
-
-;***********************************************************************
-; void WelsI16x16LumaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
-;***********************************************************************
-WELS_EXTERN WelsI16x16LumaPredPlane_sse2
-		push r3
-		push r4
-		%assign push_num 2
-		LOAD_3_PARA
-		PUSH_XMM 8
-		SIGN_EXTENSION r2, r2d
-		sub		r1,	1
-		sub		r1,	r2
-
-		;for H
-		pxor	xmm7,	xmm7
-		movq	xmm0,	[r1]
-		movdqa	xmm5,	[sse2_plane_dec]
-		punpcklbw xmm0,	xmm7
-		pmullw	xmm0,	xmm5
-		movq	xmm1,	[r1 + 9]
-		movdqa	xmm6,	[sse2_plane_inc]
-		punpcklbw xmm1,	xmm7
-		pmullw	xmm1,	xmm6
-		psubw	xmm1,	xmm0
-
-		SUMW_HORIZON	xmm1,xmm0,xmm2
-		movd    r3d,	xmm1		; H += (i + 1) * (top[8 + i] - top[6 - i]);
-		movsx	r3,	r3w
-		imul	r3,	5
-		add		r3,	32
-		sar		r3,	6			; b = (5 * H + 32) >> 6;
-		SSE2_Copy8Times	xmm1, r3d	; xmm1 = b,b,b,b,b,b,b,b
-
-		movzx	r4,	BYTE [r1+16]
-		sub	r1, 3
-		LOAD_COLUMN		xmm0, xmm2, xmm3, xmm4, r1, r2
-
-		add		r1,	3
-		movzx	r3,	BYTE [r1+8*r2]
-		add		r4,	r3
-		shl		r4,	4			;	a = (left[15*stride] + top[15]) << 4;
-
-		sub	r1, 3
-		add		r1,	r2
-		LOAD_COLUMN		xmm7, xmm2, xmm3, xmm4, r1, r2
-		pxor	xmm4,	xmm4
-		punpckhbw xmm0,	xmm4
-		pmullw	xmm0,	xmm5
-		punpckhbw xmm7,	xmm4
-		pmullw	xmm7,	xmm6
-		psubw	xmm7,	xmm0
-
-		SUMW_HORIZON   xmm7,xmm0,xmm2
-		movd    r3d,   xmm7			; V
-		movsx	r3,	r3w
-		imul	r3,	5
-		add		r3,	32
-		sar		r3,	6				; c = (5 * V + 32) >> 6;
-		SSE2_Copy8Times	xmm4, r3d		; xmm4 = c,c,c,c,c,c,c,c
-
-		add		r4,	16
-		imul	r3,	-7
-		add		r3,	r4				; s = a + 16 + (-7)*c
-		SSE2_Copy8Times	xmm0, r3d		; xmm0 = s,s,s,s,s,s,s,s
-
-		xor		r3,	r3
-		movdqa	xmm5,	[sse2_plane_inc_minus]
-
-get_i16x16_luma_pred_plane_sse2_1:
-		movdqa	xmm2,	xmm1
-		pmullw	xmm2,	xmm5
-		paddw	xmm2,	xmm0
-		psraw	xmm2,	5
-		movdqa	xmm3,	xmm1
-		pmullw	xmm3,	xmm6
-		paddw	xmm3,	xmm0
-		psraw	xmm3,	5
-		packuswb xmm2,	xmm3
-		movdqa	[r0],	xmm2
-		paddw	xmm0,	xmm4
-		add		r0,	16
-		inc		r3
-		cmp		r3,	16
-		jnz get_i16x16_luma_pred_plane_sse2_1
-		POP_XMM
-		pop r4
-		pop r3
-		ret
-
-;***********************************************************************
-; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
-;***********************************************************************
-
-%macro SSE2_PRED_H_16X16_ONE_LINE 0
-	add r0, 16
-	add r1, r2
-	movzx r3, byte [r1]
-	SSE2_Copy16Times xmm0, r3d
-	movdqa [r0], xmm0
-%endmacro
-
-WELS_EXTERN WelsI16x16LumaPredH_sse2
-	push r3
-	%assign push_num 1
-	LOAD_3_PARA
-	SIGN_EXTENSION r2, r2d
-	dec r1
-	movzx r3, byte [r1]
-	SSE2_Copy16Times xmm0, r3d
-	movdqa [r0], xmm0
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	pop r3
-    ret
-
-;***********************************************************************
-; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
-;***********************************************************************
-WELS_EXTERN WelsI16x16LumaPredV_sse2
-    %assign push_num 0
-    LOAD_3_PARA
-    SIGN_EXTENSION r2, r2d
-    sub     r1, r2
-    movdqa  xmm0, [r1]
-
-    movdqa  [r0], xmm0
-    movdqa  [r0+10h], xmm0
-    movdqa  [r0+20h], xmm0
-    movdqa  [r0+30h], xmm0
-    movdqa  [r0+40h], xmm0
-    movdqa  [r0+50h], xmm0
-    movdqa  [r0+60h], xmm0
-    movdqa  [r0+70h], xmm0
-    movdqa  [r0+80h], xmm0
-    movdqa  [r0+90h], xmm0
-    movdqa  [r0+160], xmm0
-    movdqa  [r0+176], xmm0
-    movdqa  [r0+192], xmm0
-    movdqa  [r0+208], xmm0
-    movdqa  [r0+224], xmm0
-    movdqa  [r0+240], xmm0
-
-    ret
-
-;***********************************************************************
-; void WelsIChromaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
-;***********************************************************************
-WELS_EXTERN WelsIChromaPredPlane_sse2
-		push r3
-		push r4
-		%assign push_num 2
-		LOAD_3_PARA
-		PUSH_XMM 8
-		SIGN_EXTENSION r2, r2d
-		sub		r1,	1
-		sub		r1,	r2
-
-		pxor	mm7,	mm7
-		movq	mm0,	[r1]
-		movq	mm5,	[sse2_plane_dec_c]
-		punpcklbw mm0,	mm7
-		pmullw	mm0,	mm5
-		movq	mm1,	[r1 + 5]
-		movq	mm6,	[sse2_plane_inc_c]
-		punpcklbw mm1,	mm7
-		pmullw	mm1,	mm6
-		psubw	mm1,	mm0
-
-		movq2dq xmm1,   mm1
-		pxor    xmm2,   xmm2
-		SUMW_HORIZON	xmm1,xmm0,xmm2
-		movd    r3d,	xmm1
-		movsx	r3,	r3w
-		imul	r3,	17
-		add		r3,	16
-		sar		r3,	5			; b = (17 * H + 16) >> 5;
-		SSE2_Copy8Times	xmm1, r3d	; mm1 = b,b,b,b,b,b,b,b
-
-		movzx	r3,	BYTE [r1+8]
-		sub	r1, 3
-		LOAD_COLUMN_C	mm0, mm2, mm3, mm4, r1, r2
-
-		add		r1,	3
-		movzx	r4,	BYTE [r1+4*r2]
-		add		r4,	r3
-		shl		r4,	4			; a = (left[7*stride] + top[7]) << 4;
-
-		sub	r1, 3
-		add		r1,	r2
-		LOAD_COLUMN_C	mm7, mm2, mm3, mm4, r1, r2
-		pxor	mm4,	mm4
-		punpckhbw mm0,	mm4
-		pmullw	mm0,	mm5
-		punpckhbw mm7,	mm4
-		pmullw	mm7,	mm6
-		psubw	mm7,	mm0
-
-		movq2dq xmm7,   mm7
-		pxor    xmm2,   xmm2
-		SUMW_HORIZON	xmm7,xmm0,xmm2
-		movd    r3d,    xmm7			; V
-		movsx	r3,	r3w
-		imul	r3,	17
-		add		r3,	16
-		sar		r3,	5				; c = (17 * V + 16) >> 5;
-		SSE2_Copy8Times	xmm4, r3d	; mm4 = c,c,c,c,c,c,c,c
-
-		add		r4,	16
-		imul	r3,	-3
-		add		r3,	r4		; s = a + 16 + (-3)*c
-		SSE2_Copy8Times	xmm0, r3d	; xmm0 = s,s,s,s,s,s,s,s
-
-		xor		r3,	r3
-		movdqa	xmm5,	[sse2_plane_mul_b_c]
-
-get_i_chroma_pred_plane_sse2_1:
-		movdqa	xmm2,	xmm1
-		pmullw	xmm2,	xmm5
-		paddw	xmm2,	xmm0
-		psraw	xmm2,	5
-		packuswb xmm2,	xmm2
-		movq	[r0],	xmm2
-		paddw	xmm0,	xmm4
-		add		r0,	8
-		inc		r3
-		cmp		r3,	8
-		jnz get_i_chroma_pred_plane_sse2_1
-		POP_XMM
-		pop r4
-		pop r3
-		WELSEMMS
-		ret
-
-;***********************************************************************
-;	0 |1 |2 |3 |4 |
-;	6 |7 |8 |9 |10|
-;	11|12|13|14|15|
-;	16|17|18|19|20|
-;	21|22|23|24|25|
-;	7 is the start pixel of current 4x4 block
-;	pred[7] = ([6]+[0]*2+[1]+2)/4
-;
-;   void WelsI4x4LumaPredDDR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;
-;***********************************************************************
-WELS_EXTERN WelsI4x4LumaPredDDR_mmx
-	%assign push_num 0
-	LOAD_3_PARA
-	SIGN_EXTENSION r2, r2d
-	movq        mm1,[r1+r2-8]		;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11
-	movq        mm2,[r1-8]			;get value of 6 mm2[8] = 6
-	sub		r1, r2			;mov eax to above line of current block(postion of 1)
-	punpckhbw   mm2,[r1-8]			;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6]
-	movd        mm3,[r1]			;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3]
-	punpckhwd   mm1,mm2				;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11]
-	psllq       mm3,18h				;mm3[5]=[1]
-	psrlq       mm1,28h				;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
-	por         mm3,mm1				;mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11]
-	movq        mm1,mm3				;mm1[6]=[3],mm1[5]=[2],mm1[4]=[1],mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
-	lea  	    r1,[r1+r2*2-8h]		;set eax point to 12
-	movq        mm4,[r1+r2]		;get value of 16, mm4[8]=[16]
-	psllq       mm3,8				;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0
-	psrlq       mm4,38h				;mm4[1]=[16]
-	por         mm3,mm4				;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16]
-	movq        mm2,mm3				;mm2[7]=[3],mm2[6]=[2],mm2[5]=[1],mm2[4]=[0],mm2[3]=[6],mm2[2]=[11],mm2[1]=[16]
-	movq        mm4,[r1+r2*2]		;mm4[8]=[21]
-	psllq       mm3,8				;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0
-	psrlq       mm4,38h				;mm4[1]=[21]
-	por         mm3,mm4				;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21]
-	movq        mm4,mm3				;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
-	pavgb       mm3,mm1				;mm3=([11]+[21]+1)/2
-	pxor        mm1,mm4				;find odd value in the lowest bit of each byte
-	pand        mm1,[mmx_01bytes]	;set the odd bit
-	psubusb     mm3,mm1				;decrease 1 from odd bytes
-	pavgb       mm2,mm3				;mm2=(([11]+[21]+1)/2+1+[16])/2
-
-	movd        [r0+12],mm2
-	psrlq       mm2,8
-	movd        [r0+8],mm2
-	psrlq       mm2,8
-	movd        [r0+4],mm2
-	psrlq       mm2,8
-	movd        [r0],mm2
-	WELSEMMS
-	ret
-
-;***********************************************************************
-;	0 |1 |2 |3 |4 |
-;	5 |6 |7 |8 |9 |
-;	10|11|12|13|14|
-;	15|16|17|18|19|
-;	20|21|22|23|24|
-;	6 is the start pixel of current 4x4 block
-;	pred[6] = ([1]+[2]+[3]+[4]+[5]+[10]+[15]+[20]+4)/8
-;
-;   void WelsI4x4LumaPredDc_sse2(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;
-;***********************************************************************
-WELS_EXTERN WelsI4x4LumaPredDc_sse2
-	push r3
-	push r4
-	%assign push_num 2
-	LOAD_3_PARA
-	SIGN_EXTENSION r2, r2d
-	movzx		r4,	byte [r1-1h]
-	sub			r1,	r2
-	movd		xmm0,	[r1]
-	pxor		xmm1,	xmm1
-	psadbw		xmm0,	xmm1
-	xor r3, r3
-	movd		r3d,	xmm0
-	add			r3,	r4
-	movzx		r4,	byte [r1+r2*2-1h]
-	add			r3,	r4
-
-	lea			r1,	[r1+r2*2-1]
-	movzx		r4,	byte [r1+r2]
-	add			r3,	r4
-
-	movzx		r4,	byte [r1+r2*2]
-	add			r3,	r4
-	add			r3,	4
-	sar			r3,	3
-	imul		r3,	0x01010101
-
-	movd		xmm0,	r3d
-	pshufd		xmm0,	xmm0,	0
-	movdqa		[r0],	xmm0
-	pop r4
-	pop r3
-	ret
-
-;***********************************************************************
-;	void WelsIChromaPredH_mmx(uint8_t *pred, uint8_t *pRef, int32_t stride)
-;   copy 8 pixel of 8 line from left
-;***********************************************************************
-%macro MMX_PRED_H_8X8_ONE_LINE 4
-	movq		%1,		[%3-8]
-	psrlq		%1,		38h
-
-	;pmuludq		%1,		[mmx_01bytes]		;extend to 4 bytes
-	pmullw		%1,		[mmx_01bytes]
-	pshufw		%1,		%1,	0
-	movq		[%4],	%1
-%endmacro
-
-%macro MMX_PRED_H_8X8_ONE_LINEE 4
-	movq		%1,		[%3+r2-8]
-	psrlq		%1,		38h
-
-	;pmuludq		%1,		[mmx_01bytes]		;extend to 4 bytes
-	pmullw		%1,		[mmx_01bytes]
-	pshufw		%1,		%1,	0
-	movq		[%4],	%1
-%endmacro
-
-WELS_EXTERN WelsIChromaPredH_mmx
-	%assign push_num 0
-	LOAD_3_PARA
-	SIGN_EXTENSION r2, r2d
-	movq		mm0,	[r1-8]
-	psrlq		mm0,	38h
-
-	;pmuludq		mm0,	[mmx_01bytes]		;extend to 4 bytes
-	pmullw		mm0,		[mmx_01bytes]
-	pshufw		mm0,	mm0,	0
-	movq		[r0],	mm0
-
-	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, r1,r0+8
-
-	lea			r1,[r1+r2*2]
-	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, r1,r0+16
-
-	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, r1,r0+24
-
-	lea			r1,[r1+r2*2]
-	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, r1,r0+32
-
-	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, r1,r0+40
-
-	lea			r1,[r1+r2*2]
-	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, r1,r0+48
-
-	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, r1,r0+56
-	WELSEMMS
-	ret
-
-;***********************************************************************
-;	void WelsI4x4LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
-;   copy pixels from top 4 pixels
-;***********************************************************************
-WELS_EXTERN WelsI4x4LumaPredV_sse2
-	%assign push_num 0
-	LOAD_3_PARA
-	SIGN_EXTENSION r2, r2d
-	sub			r1,	r2
-	movd		xmm0,	[r1]
-	pshufd		xmm0,	xmm0,	0
-	movdqa		[r0],	xmm0
-	ret
-
-;***********************************************************************
-;	void WelsIChromaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
-;   copy 8 pixels from top 8 pixels
-;***********************************************************************
-WELS_EXTERN WelsIChromaPredV_sse2
-	%assign push_num 0
-	LOAD_3_PARA
-	SIGN_EXTENSION r2, r2d
-	sub		r1,		r2
-	movq		xmm0,		[r1]
-	movdqa		xmm1,		xmm0
-	punpcklqdq	xmm0,		xmm1
-	movdqa		[r0],		xmm0
-	movdqa		[r0+16],	xmm0
-	movdqa		[r0+32],	xmm0
-	movdqa		[r0+48],	xmm0
-	ret
-
-;***********************************************************************
-;	lt|t0|t1|t2|t3|
-;	l0|
-;	l1|
-;	l2|
-;	l3|
-;	t3 will never been used
-;   destination:
-;	|a |b |c |d |
-;	|e |f |a |b |
-;	|g |h |e |f |
-;	|i |j |g |h |
-
-;   a = (1 + lt + l0)>>1
-;   e = (1 + l0 + l1)>>1
-;   g = (1 + l1 + l2)>>1
-;   i = (1 + l2 + l3)>>1
-
-;   d = (2 + t0 + (t1<<1) + t2)>>2
-;   c = (2 + lt + (t0<<1) + t1)>>2
-;   b = (2 + l0 + (lt<<1) + t0)>>2
-
-;   f = (2 + l1 + (l0<<1) + lt)>>2
-;   h = (2 + l2 + (l1<<1) + l0)>>2
-;   j = (2 + l3 + (l2<<1) + l1)>>2
-;   [b a f e h g j i] + [d c b a] --> mov to memory
-;
-;   void WelsI4x4LumaPredHD_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;***********************************************************************
-WELS_EXTERN WelsI4x4LumaPredHD_mmx
-	%assign push_num 0
-	LOAD_3_PARA
-	SIGN_EXTENSION r2, r2d
-	sub         r1, r2
-	movd        mm0, [r1-1]            ; mm0 = [xx xx xx xx t2 t1 t0 lt]
-	psllq       mm0, 20h                ; mm0 = [t2 t1 t0 lt xx xx xx xx]
-
-	movd        mm1, [r1+2*r2-4]
-	punpcklbw   mm1, [r1+r2-4]        ; mm1[7] = l0, mm1[6] = l1
-	lea         r1, [r1+2*r2]
-	movd        mm2, [r1+2*r2-4]
-	punpcklbw   mm2, [r1+r2-4]        ; mm2[7] = l2, mm2[6] = l3
-	punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
-	psrlq       mm2, 20h
-	pxor        mm0, mm2                ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
-
-	movq        mm1, mm0
-	psrlq       mm1, 10h                ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
-	movq        mm2, mm0
-	psrlq       mm2, 8h                 ; mm2 = [xx t2 t1 t0 lt l0 l1 l2]
-	movq        mm3, mm2
-	movq        mm4, mm1
-	pavgb       mm1, mm0
-
-	pxor        mm4, mm0				; find odd value in the lowest bit of each byte
-	pand        mm4, [mmx_01bytes]	    ; set the odd bit
-	psubusb     mm1, mm4				; decrease 1 from odd bytes
-
-	pavgb       mm2, mm1                ; mm2 = [xx xx d  c  b  f  h  j]
-
-	movq        mm4, mm0
-	pavgb       mm3, mm4                ; mm3 = [xx xx xx xx a  e  g  i]
-	punpcklbw   mm3, mm2                ; mm3 = [b  a  f  e  h  g  j  i]
-
-	psrlq       mm2, 20h
-	psllq       mm2, 30h                ; mm2 = [d  c  0  0  0  0  0  0]
-	movq        mm4, mm3
-	psrlq       mm4, 10h                ; mm4 = [0  0  b  a  f  e  h  j]
-	pxor        mm2, mm4                ; mm2 = [d  c  b  a  xx xx xx xx]
-	psrlq       mm2, 20h                ; mm2 = [xx xx xx xx  d  c  b  a]
-
-	movd        [r0], mm2
-	movd        [r0+12], mm3
-	psrlq       mm3, 10h
-	movd        [r0+8], mm3
-	psrlq       mm3, 10h
-	movd        [r0+4], mm3
-	WELSEMMS
-	ret
-
-;***********************************************************************
-;	lt|t0|t1|t2|t3|
-;	l0|
-;	l1|
-;	l2|
-;	l3|
-;	t3 will never been used
-;   destination:
-;	|a |b |c |d |
-;	|c |d |e |f |
-;	|e |f |g |g |
-;	|g |g |g |g |
-
-;   a = (1 + l0 + l1)>>1
-;   c = (1 + l1 + l2)>>1
-;   e = (1 + l2 + l3)>>1
-;   g = l3
-
-;   b = (2 + l0 + (l1<<1) + l2)>>2
-;   d = (2 + l1 + (l2<<1) + l3)>>2
-;   f = (2 + l2 + (l3<<1) + l3)>>2
-
-;   [g g f e d c b a] + [g g g g] --> mov to memory
-;
-;   void WelsI4x4LumaPredHU_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;***********************************************************************
-WELS_EXTERN WelsI4x4LumaPredHU_mmx
-	%assign push_num 0
-	LOAD_3_PARA
-	SIGN_EXTENSION r2, r2d
-	movd        mm0, [r1-4]            ; mm0[3] = l0
-	punpcklbw   mm0, [r1+r2-4]        ; mm0[7] = l1, mm0[6] = l0
-	lea         r1, [r1+2*r2]
-	movd        mm2, [r1-4]            ; mm2[3] = l2
-	movd        mm4, [r1+r2-4]        ; mm4[3] = l3
-	punpcklbw   mm2, mm4
-	punpckhwd   mm0, mm2                ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
-
-	psrlq       mm4, 18h
-	psllq       mm4, 38h                ; mm4 = [l3 xx xx xx xx xx xx xx]
-	psrlq       mm0, 8h
-	pxor        mm0, mm4                ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
-
-	movq        mm1, mm0
-	psllq       mm1, 8h                 ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
-	movq        mm3, mm1                ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
-	pavgb       mm1, mm0                ; mm1 = [g  e  c  a  xx xx xx xx]
-
-	movq        mm2, mm0
-	psllq       mm2, 10h                ; mm2 = [l2 l1 l0 xx xx xx xx xx]
-	movq        mm5, mm2
-	pavgb       mm2, mm0
-
-	pxor        mm5, mm0				; find odd value in the lowest bit of each byte
-	pand        mm5, [mmx_01bytes]	    ; set the odd bit
-	psubusb     mm2, mm5				; decrease 1 from odd bytes
-
-	pavgb       mm2, mm3                ; mm2 = [f  d  b  xx xx xx xx xx]
-
-	psrlq       mm2, 8h
-	pxor        mm2, mm4                ; mm2 = [g  f  d  b  xx xx xx xx]
-
-	punpckhbw   mm1, mm2                ; mm1 = [g  g  f  e  d  c  b  a]
-	punpckhbw   mm4, mm4                ; mm4 = [g  g  xx xx xx xx xx xx]
-	punpckhbw   mm4, mm4                ; mm4 = [g  g  g  g  xx xx xx xx]
-
-	psrlq       mm4, 20h
-	movd        [r0+12], mm4
-
-	movd        [r0], mm1
-	psrlq       mm1, 10h
-	movd        [r0+4], mm1
-	psrlq       mm1, 10h
-	movd        [r0+8], mm1
-	WELSEMMS
-	ret
-
-
-
-;***********************************************************************
-;	lt|t0|t1|t2|t3|
-;	l0|
-;	l1|
-;	l2|
-;	l3|
-;	l3 will never been used
-;   destination:
-;	|a |b |c |d |
-;	|e |f |g |h |
-;	|i |a |b |c |
-;	|j |e |f |g |
-
-;   a = (1 + lt + t0)>>1
-;   b = (1 + t0 + t1)>>1
-;   c = (1 + t1 + t2)>>1
-;   d = (1 + t2 + t3)>>1
-
-;   e = (2 + l0 + (lt<<1) + t0)>>2
-;   f = (2 + lt + (t0<<1) + t1)>>2
-;   g = (2 + t0 + (t1<<1) + t2)>>2
-
-;   h = (2 + t1 + (t2<<1) + t3)>>2
-;   i = (2 + lt + (l0<<1) + l1)>>2
-;   j = (2 + l0 + (l1<<1) + l2)>>2
-;
-;   void WelsI4x4LumaPredVR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;***********************************************************************
-WELS_EXTERN WelsI4x4LumaPredVR_mmx
-	%assign push_num 0
-	LOAD_3_PARA
-	SIGN_EXTENSION r2, r2d
-	sub         r1, r2
-	movq        mm0, [r1-1]            ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
-	psllq       mm0, 18h                ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
-
-	movd        mm1, [r1+2*r2-4]
-	punpcklbw   mm1, [r1+r2-4]        ; mm1[7] = l0, mm1[6] = l1
-	lea         r1, [r1+2*r2]
-	movq        mm2, [r1+r2-8]        ; mm2[7] = l2
-	punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 xx xx xx xx xx]
-	psrlq       mm2, 28h
-	pxor        mm0, mm2                ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
-
-	movq        mm1, mm0
-	psllq       mm1, 8h                 ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
-	pavgb       mm1, mm0                ; mm1 = [d  c  b  a  xx xx xx xx]
-
-	movq        mm2, mm0
-	psllq       mm2, 10h                ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
-	movq        mm3, mm2
-	pavgb       mm2, mm0
-
-	pxor        mm3, mm0				; find odd value in the lowest bit of each byte
-	pand        mm3, [mmx_01bytes]	    ; set the odd bit
-	psubusb     mm2, mm3				; decrease 1 from odd bytes
-
-	movq        mm3, mm0
-	psllq       mm3, 8h                 ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
-	pavgb       mm3, mm2                ; mm3 = [h  g  f  e  i  j  xx xx]
-	movq        mm2, mm3
-
-	psrlq       mm1, 20h                ; mm1 = [xx xx xx xx d  c  b  a]
-	movd        [r0], mm1
-
-	psrlq       mm2, 20h                ; mm2 = [xx xx xx xx h  g  f  e]
-	movd        [r0+4], mm2
-
-	movq        mm4, mm3
-	psllq       mm4, 20h
-	psrlq       mm4, 38h                ; mm4 = [xx xx xx xx xx xx xx i]
-
-	movq        mm5, mm3
-	psllq       mm5, 28h
-	psrlq       mm5, 38h                ; mm5 = [xx xx xx xx xx xx xx j]
-
-	psllq       mm1, 8h
-	pxor        mm4, mm1                ; mm4 = [xx xx xx xx c  b  a  i]
-	movd        [r0+8], mm4
-
-	psllq       mm2, 8h
-	pxor        mm5, mm2                ; mm5 = [xx xx xx xx g  f  e  j]
-	movd        [r0+12], mm5
-	WELSEMMS
-	ret
-
-;***********************************************************************
-;	lt|t0|t1|t2|t3|t4|t5|t6|t7
-;	l0|
-;	l1|
-;	l2|
-;	l3|
-;	lt,t0,t1,t2,t3 will never been used
-;   destination:
-;	|a |b |c |d |
-;	|b |c |d |e |
-;	|c |d |e |f |
-;	|d |e |f |g |
-
-;   a = (2 + t0 + t2 + (t1<<1))>>2
-;   b = (2 + t1 + t3 + (t2<<1))>>2
-;   c = (2 + t2 + t4 + (t3<<1))>>2
-;   d = (2 + t3 + t5 + (t4<<1))>>2
-
-;   e = (2 + t4 + t6 + (t5<<1))>>2
-;   f = (2 + t5 + t7 + (t6<<1))>>2
-;   g = (2 + t6 + t7 + (t7<<1))>>2
-
-;   [g f e d c b a] --> mov to memory
-;
-;   void WelsI4x4LumaPredDDL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;***********************************************************************
-WELS_EXTERN WelsI4x4LumaPredDDL_mmx
-	%assign push_num 0
-	LOAD_3_PARA
-	SIGN_EXTENSION r2, r2d
-	sub         r1, r2
-	movq        mm0, [r1]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
-	movq        mm1, mm0
-	movq        mm2, mm0
-
-	movq        mm3, mm0
-	psrlq       mm3, 38h
-	psllq       mm3, 38h                ; mm3 = [t7 xx xx xx xx xx xx xx]
-
-	psllq       mm1, 8h                 ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
-	psrlq       mm2, 8h
-	pxor        mm2, mm3                ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]
-
-	movq        mm3, mm1
-	pavgb       mm1, mm2
-	pxor        mm3, mm2				; find odd value in the lowest bit of each byte
-	pand        mm3, [mmx_01bytes]	    ; set the odd bit
-	psubusb     mm1, mm3				; decrease 1 from odd bytes
-
-	pavgb       mm0, mm1                ; mm0 = [g f e d c b a xx]
-
-	psrlq       mm0, 8h
-	movd        [r0], mm0
-	psrlq       mm0, 8h
-	movd        [r0+4], mm0
-	psrlq       mm0, 8h
-	movd        [r0+8], mm0
-	psrlq       mm0, 8h
-	movd        [r0+12], mm0
-	WELSEMMS
-	ret
-
-
-;***********************************************************************
-;	lt|t0|t1|t2|t3|t4|t5|t6|t7
-;	l0|
-;	l1|
-;	l2|
-;	l3|
-;	lt,t0,t1,t2,t3 will never been used
-;   destination:
-;	|a |b |c |d |
-;	|e |f |g |h |
-;	|b |c |d |i |
-;	|f |g |h |j |
-
-;   a = (1 + t0 + t1)>>1
-;   b = (1 + t1 + t2)>>1
-;   c = (1 + t2 + t3)>>1
-;   d = (1 + t3 + t4)>>1
-;   i = (1 + t4 + t5)>>1
-
-;   e = (2 + t0 + (t1<<1) + t2)>>2
-;   f = (2 + t1 + (t2<<1) + t3)>>2
-;   g = (2 + t2 + (t3<<1) + t4)>>2
-;   h = (2 + t3 + (t4<<1) + t5)>>2
-;   j = (2 + t4 + (t5<<1) + t6)>>2
-
-;   [i d c b a] + [j h g f e] --> mov to memory
-;
-;   void WelsI4x4LumaPredVL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;***********************************************************************
-WELS_EXTERN WelsI4x4LumaPredVL_mmx
-	%assign push_num 0
-	LOAD_3_PARA
-	SIGN_EXTENSION r2, r2d
-	sub         r1, r2
-	movq        mm0, [r1]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
-	movq        mm1, mm0
-	movq        mm2, mm0
-
-	psrlq       mm1, 8h                 ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
-	psrlq       mm2, 10h                ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]
-
-	movq        mm3, mm1
-	pavgb       mm3, mm0                ; mm3 = [xx xx xx i  d  c  b  a]
-
-	movq        mm4, mm2
-	pavgb       mm2, mm0
-	pxor        mm4, mm0				; find odd value in the lowest bit of each byte
-	pand        mm4, [mmx_01bytes]	    ; set the odd bit
-	psubusb     mm2, mm4				; decrease 1 from odd bytes
-
-	pavgb       mm2, mm1                ; mm2 = [xx xx xx j  h  g  f  e]
-
-	movd        [r0], mm3
-	psrlq       mm3, 8h
-	movd        [r0+8], mm3
-
-	movd        [r0+4], mm2
-	psrlq       mm2, 8h
-	movd        [r0+12], mm2
-	WELSEMMS
-	ret
-
-;***********************************************************************
-;
-;   void WelsIChromaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
-;***********************************************************************
-WELS_EXTERN WelsIChromaPredDc_sse2
-	push r3
-	push r4
-	%assign push_num 2
-	LOAD_3_PARA
-	SIGN_EXTENSION r2, r2d
-	sub         r1, r2
-	movq        mm0, [r1]
-
-	movzx		r3, byte [r1+r2-0x01] ; l1
-	lea         	r1, [r1+2*r2]
-	movzx		r4, byte [r1-0x01]     ; l2
-	add		r3, r4
-	movzx		r4, byte [r1+r2-0x01] ; l3
-	add		r3, r4
-	lea         	r1, [r1+2*r2]
-	movzx		r4, byte [r1-0x01]     ; l4
-	add		r3, r4
-	movd        	mm1, r3d                 ; mm1 = l1+l2+l3+l4
-
-	movzx		r3, byte [r1+r2-0x01] ; l5
-	lea         	r1, [r1+2*r2]
-	movzx		r4, byte [r1-0x01]     ; l6
-	add		r3, r4
-	movzx		r4, byte [r1+r2-0x01] ; l7
-	add		r3, r4
-	lea         	r1, [r1+2*r2]
-	movzx		r4, byte [r1-0x01]     ; l8
-	add		r3, r4
-	movd        	mm2, r3d                 ; mm2 = l5+l6+l7+l8
-
-	movq        mm3, mm0
-	psrlq       mm0, 0x20
-	psllq       mm3, 0x20
-	psrlq       mm3, 0x20
-	pxor		mm4, mm4
-	psadbw		mm0, mm4
-	psadbw		mm3, mm4                 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
-
-	paddq       mm3, mm1
-	movq        mm1, mm2
-	paddq       mm1, mm0;                ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
-
-	movq        mm4, [mmx_0x02]
-
-	paddq       mm0, mm4
-	psrlq       mm0, 0x02
-
-	paddq       mm2, mm4
-	psrlq       mm2, 0x02
-
-	paddq       mm3, mm4
-	paddq       mm3, mm4
-	psrlq       mm3, 0x03
-
-	paddq       mm1, mm4
-	paddq       mm1, mm4
-	psrlq       mm1, 0x03
-
-	pmuludq     mm0, [mmx_01bytes]
-	pmuludq     mm3, [mmx_01bytes]
-	psllq       mm0, 0x20
-	pxor        mm0, mm3                 ; mm0 = m_up
-
-	pmuludq     mm2, [mmx_01bytes]
-	pmuludq     mm1, [mmx_01bytes]
-	psllq       mm1, 0x20
-	pxor        mm1, mm2                 ; mm2 = m_down
-
-	movq        [r0], mm0
-	movq        [r0+0x08], mm0
-	movq        [r0+0x10], mm0
-	movq        [r0+0x18], mm0
-
-	movq        [r0+0x20], mm1
-	movq        [r0+0x28], mm1
-	movq        [r0+0x30], mm1
-	movq        [r0+0x38], mm1
-
-	pop r4
-	pop r3
-	WELSEMMS
-	ret
-
-
-
-;***********************************************************************
-;
-;   void WelsI16x16LumaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
-;***********************************************************************
-WELS_EXTERN WelsI16x16LumaPredDc_sse2
-	push r3
-	push r4
-	%assign push_num 2
-	LOAD_3_PARA
-	SIGN_EXTENSION r2, r2d
-	sub         r1, r2
-	movdqa      xmm0, [r1]             ; read one row
-	pxor		xmm1, xmm1
-	psadbw		xmm0, xmm1
-	movdqa      xmm1, xmm0
-	psrldq      xmm1, 0x08
-	pslldq      xmm0, 0x08
-	psrldq      xmm0, 0x08
-	paddw       xmm0, xmm1
-
-	movzx		r3, byte [r1+r2-0x01]
-	movzx		r4, byte [r1+2*r2-0x01]
-	add		r3, r4
-	lea         r1, [r1+r2]
-	LOAD_2_LEFT_AND_ADD
-	LOAD_2_LEFT_AND_ADD
-	LOAD_2_LEFT_AND_ADD
-	LOAD_2_LEFT_AND_ADD
-	LOAD_2_LEFT_AND_ADD
-	LOAD_2_LEFT_AND_ADD
-	LOAD_2_LEFT_AND_ADD
-	add         r3, 0x10
-	movd        xmm1, r3d
-	paddw       xmm0, xmm1
-	psrld       xmm0, 0x05
-	pmuludq     xmm0, [mmx_01bytes]
-	pshufd      xmm0, xmm0, 0
-
-	movdqa      [r0], xmm0
-	movdqa      [r0+0x10], xmm0
-	movdqa      [r0+0x20], xmm0
-	movdqa      [r0+0x30], xmm0
-	movdqa      [r0+0x40], xmm0
-	movdqa      [r0+0x50], xmm0
-	movdqa      [r0+0x60], xmm0
-	movdqa      [r0+0x70], xmm0
-	movdqa      [r0+0x80], xmm0
-	movdqa      [r0+0x90], xmm0
-	movdqa      [r0+0xa0], xmm0
-	movdqa      [r0+0xb0], xmm0
-	movdqa      [r0+0xc0], xmm0
-	movdqa      [r0+0xd0], xmm0
-	movdqa      [r0+0xe0], xmm0
-	movdqa      [r0+0xf0], xmm0
-
-	pop r4
-	pop r3
-	ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatdThree4x4_sse2( uint8_t *pDec, int32_t iLineSizeDec, uint8_t *pEnc, int32_t iLinesizeEnc,
-;                             uint8_t* pPred, int32_t* pBestMode, int32_t, int32_t, int32_t);
-;
-;***********************************************************************
-%ifdef X86_32
-WELS_EXTERN WelsSampleSatdThree4x4_sse2
-	push      ebx
-	push      esi
-	push      edi
-	mov       eax,  [esp+24];p_enc
-	mov       ebx,  [esp+28];linesize_enc
-
-	; load source 4x4 samples and Hadamard transform
-    movd      xmm0, [eax]
-    movd      xmm1, [eax+ebx]
-    lea       eax , [eax+2*ebx]
-    movd      xmm2, [eax]
-    movd      xmm3, [eax+ebx]
-    punpckldq xmm0, xmm2
-    punpckldq xmm1, xmm3
-
-    pxor      xmm6, xmm6
-    punpcklbw xmm0, xmm6
-    punpcklbw xmm1, xmm6
-
-    movdqa    xmm2, xmm0
-    paddw     xmm0, xmm1
-    psubw     xmm2, xmm1
-    SSE2_XSawp  qdq, xmm0, xmm2, xmm3
-
-    movdqa    xmm4, xmm0
-    paddw     xmm0, xmm3
-    psubw     xmm4, xmm3
-
-    movdqa    xmm2, xmm0
-    punpcklwd xmm0, xmm4
-    punpckhwd xmm4, xmm2
-
-	SSE2_XSawp  dq,  xmm0, xmm4, xmm3
-	SSE2_XSawp  qdq, xmm0, xmm3, xmm5
-
-    movdqa    xmm7, xmm0
-    paddw     xmm0, xmm5
-    psubw     xmm7, xmm5
-
-	SSE2_XSawp  qdq,  xmm0, xmm7, xmm1
-
-    ; Hadamard transform results are saved in xmm0 and xmm2
-    movdqa    xmm2, xmm0
-    paddw     xmm0, xmm1
-    psubw     xmm2, xmm1
-
-	; load top boundary samples: [a b c d]
-    mov       eax,  [esp+16];p_dec
-	sub		  eax,	[esp+20];linesize_dec
-	movzx     ecx,  byte [eax]
-	movzx     edx,  byte [eax+1]
-	movzx     esi,  byte [eax+2]
-	movzx     edi,  byte [eax+3]
-
-	; get the transform results of top boundary samples: [a b c d]
-	add       edx, ecx ; edx = a + b
-	add       edi, esi ; edi = c + d
-	add       ecx, ecx ; ecx = a + a
-	add       esi, esi ; esi = c + c
-	sub       ecx, edx ; ecx = a + a - a - b = a - b
-	sub       esi, edi ; esi = c + c - c - d = c - d
-	add       edi, edx ; edi = (a + b) + (c + d)
-	add       edx, edx
-	sub       edx, edi ; edx = (a + b) - (c + d)
-	add       esi, ecx ; esi = (a - b) + (c - d)
-	add       ecx, ecx
-	sub       ecx, esi ; ecx = (a - b) - (c - d) ; [edi edx ecx esi]
-
-	movdqa    xmm6, xmm0
-	movdqa    xmm7, xmm2
-	movd      xmm5, edi ; store the edi for DC mode
-	pxor      xmm3, xmm3
-	pxor      xmm4, xmm4
-	pinsrw    xmm3, edi, 0
-	pinsrw    xmm3, esi, 4
-	psllw     xmm3, 2
-	pinsrw    xmm4, edx, 0
-	pinsrw    xmm4, ecx, 4
-	psllw     xmm4, 2
-
-	; get the satd of V (prediction from the top boundary samples)
-	psubw     xmm0, xmm3
-	psubw     xmm2, xmm4
-
-	WELS_AbsW  xmm0, xmm1
-	WELS_AbsW  xmm2, xmm1
-    paddusw        xmm0, xmm2
-    SUMW_HORIZON1  xmm0, xmm1 ; satd of V is stored in xmm0
-
-	; load left boundary samples: [a b c d]'
-    mov       eax,  [esp+16]
-	mov       ebx,  [esp+20]
-	movzx     ecx,  byte [eax-1]
-	movzx     edx,  byte [eax+ebx-1]
-	lea       eax , [eax+2*ebx]
-	movzx     esi,  byte [eax-1]
-	movzx     edi,  byte [eax+ebx-1]
-
-	; get the transform results of left boundary samples: [a b c d]'
-	add       edx, ecx ; edx = a + b
-	add       edi, esi ; edi = c + d
-	add       ecx, ecx ; ecx = a + a
-	add       esi, esi ; esi = c + c
-	sub       ecx, edx ; ecx = a + a - a - b = a - b
-	sub       esi, edi ; esi = c + c - c - d = c - d
-	add       edi, edx ; edi = (a + b) + (c + d)
-	add       edx, edx
-	sub       edx, edi ; edx = (a + b) - (c + d)
-	add       esi, ecx ; esi = (a - b) + (c - d)
-	add       ecx, ecx
-	sub       ecx, esi ; ecx = (a - b) - (c - d) ; [edi edx ecx esi]'
-
-	; store the transform results in xmm3
-    movd      xmm3, edi
-	pinsrw    xmm3, edx, 1
-	pinsrw    xmm3, ecx, 2
-	pinsrw    xmm3, esi, 3
-	psllw     xmm3, 2
-
-	; get the satd of H (prediction from the left boundary samples)
-	movdqa    xmm2, xmm6
-	movdqa    xmm4, xmm7
-	psubw     xmm2, xmm3
-	WELS_AbsW  xmm2, xmm1
-	WELS_AbsW  xmm4, xmm1
-    paddusw        xmm2, xmm4
-    SUMW_HORIZON1  xmm2, xmm1 ; satd of H is stored in xmm2
-
-	; DC result is stored in xmm1
-	add       edi, 4
-	movd      xmm1, edi
-	paddw     xmm1, xmm5
-	psrlw     xmm1, 3
-	movdqa    xmm5, xmm1
-	psllw     xmm1, 4
-
-    ; get the satd of DC
-    psubw          xmm6, xmm1
-    WELS_AbsW  xmm6, xmm1
-	WELS_AbsW  xmm7, xmm1
-    paddusw        xmm6, xmm7
-    SUMW_HORIZON1  xmm6, xmm1 ; satd of DC is stored in xmm6
-
-    ; comparing order: DC H V
-    mov       edx, [esp+32]
-    movd      eax, xmm6
-    movd      edi, xmm2
-    movd      esi, xmm0
-    and       eax, 0xffff
-    shr       eax, 1
-    and       edi, 0xffff
-    shr       edi, 1
-    and       esi, 0xffff
-    shr       esi, 1
-    add       eax, [esp+40]
-    add       edi, [esp+44]
-    add       esi, [esp+48]
-    cmp       ax, di
-    jg near   not_dc
-    cmp       ax, si
-    jg near   not_dc_h
-
-    ; for DC mode
-    movd      ebx, xmm5
-    imul      ebx, 0x01010101
-    movd	  xmm5, ebx
-	pshufd    xmm5, xmm5, 0
-	movdqa    [edx], xmm5
-	mov       ebx, [esp+36]
-	mov       dword [ebx], 0x02
-	pop       edi
-    pop       esi
-    pop       ebx
-    ret
-
-not_dc:
-    cmp       di, si
-    jg near   not_dc_h
-
-    ; for H mode
-    SSE_DB_1_2REG  xmm6, xmm7
-    mov       eax,  [esp+16]
-	mov       ebx,  [esp+20]
-    movzx     ecx,  byte [eax-1]
-	movd      xmm0, ecx
-    pmuludq   xmm0, xmm6
-
-	movzx     ecx,  byte [eax+ebx-1]
-	movd      xmm1, ecx
-    pmuludq   xmm1, xmm6
-%if 1
-    punpckldq xmm0, xmm1
-%else
-	unpcklps  xmm0,	xmm1
-%endif
-	lea       eax,	[eax+ebx*2]
-	movzx	  ecx,	byte [eax-1]
-	movd	  xmm2,	ecx
-    pmuludq   xmm2, xmm6
-
-	movzx	  ecx,	byte [eax+ebx-1]
-	movd	  xmm3,	ecx
-    pmuludq   xmm3, xmm6
-%if 1
-    punpckldq  xmm2, xmm3
-    punpcklqdq xmm0, xmm2
-%else
-	unpcklps  xmm2,	xmm3
-	unpcklpd  xmm0,	xmm2
-%endif
-	movdqa	  [edx],xmm0
-
-	mov       eax, edi
-    mov       ebx, [esp+36]
-	mov       dword [ebx], 0x01
-
-    pop       edi
-    pop       esi
-    pop       ebx
-    ret
-not_dc_h:
-    ; for V mode
-    mov       eax,  [esp+16]
-    sub		  eax,	[esp+20]
-	movd	  xmm0,	[eax]
-	pshufd	  xmm0,	xmm0, 0
-	movdqa	  [edx],xmm0
-
-	mov       eax, esi
-    mov       ebx, [esp+36]
-	mov       dword [ebx], 0x00
-
-    pop       edi
-    pop       esi
-    pop       ebx
-    ret
-%endif
-
--- a/codec/encoder/core/asm/memzero.asm
+++ /dev/null
@@ -1,132 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  memzero.asm
-;*
-;*  Abstract
-;*
-;*
-;*  History
-;*      9/16/2009 Created
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-;***********************************************************************
-;void WelsPrefetchZero_mmx(int8_t const*_A);
-;***********************************************************************
-WELS_EXTERN WelsPrefetchZero_mmx
-	%assign  push_num 0
-	LOAD_1_PARA
-	prefetchnta [r0]
-	ret
-
-
-;***********************************************************************
-;   void WelsSetMemZeroAligned64_sse2(void *dst, int32_t size)
-;***********************************************************************
-WELS_EXTERN WelsSetMemZeroAligned64_sse2
-
-		%assign  push_num 0
-		LOAD_2_PARA
-		SIGN_EXTENSION r1, r1d
-		neg		r1
-
-		pxor	xmm0,		xmm0
-.memzeroa64_sse2_loops:
-		movdqa	[r0],		xmm0
-		movdqa	[r0+16],	xmm0
-		movdqa	[r0+32],	xmm0
-		movdqa	[r0+48],	xmm0
-		add		r0, 0x40
-
-		add r1, 0x40
-		jnz near .memzeroa64_sse2_loops
-
-		ret
-
-;***********************************************************************
-;   void WelsSetMemZeroSize64_mmx(void *dst, int32_t size)
-;***********************************************************************
-WELS_EXTERN WelsSetMemZeroSize64_mmx
-
-		%assign  push_num 0
-		LOAD_2_PARA
-		SIGN_EXTENSION r1, r1d
-		neg		r1
-
-		pxor	mm0,		mm0
-.memzero64_mmx_loops:
-		movq	[r0],		mm0
-		movq	[r0+8],	mm0
-		movq	[r0+16],	mm0
-		movq	[r0+24],	mm0
-		movq	[r0+32],	mm0
-		movq	[r0+40],	mm0
-		movq	[r0+48],	mm0
-		movq	[r0+56],	mm0
-		add		r0,		0x40
-
-		add r1, 0x40
-		jnz near .memzero64_mmx_loops
-
-		WELSEMMS
-		ret
-
-;***********************************************************************
-;   void WelsSetMemZeroSize8_mmx(void *dst, int32_t size)
-;***********************************************************************
-WELS_EXTERN WelsSetMemZeroSize8_mmx
-
-		%assign  push_num 0
-		LOAD_2_PARA
-		SIGN_EXTENSION r1, r1d
-		neg		r1
-		pxor	mm0,		mm0
-
-.memzero8_mmx_loops:
-		movq	[r0],		mm0
-		add		r0,		0x08
-
-		add		r1,		0x08
-		jnz near .memzero8_mmx_loops
-
-		WELSEMMS
-		ret
-
-
--- a/codec/encoder/core/asm/quant.asm
+++ /dev/null
@@ -1,370 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  quant.asm
-;*
-;*  Abstract
-;*      sse2 quantize inter-block
-;*
-;*  History
-;*      7/6/2009 Created
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-
-SECTION .text
-;************************************************
-;NEW_QUANT
-;************************************************
-
-%macro SSE2_Quant8  5
-		MOVDQ	%1, %5
-		pxor	%2, %2
-		pcmpgtw	%2, %1
-		pxor	%1, %2
-		psubw	%1, %2
-		paddusw	%1, %3
-		pmulhuw	%1, %4
-		pxor	%1, %2
-		psubw	%1, %2
-		MOVDQ	%5, %1
-%endmacro
-
-%macro SSE2_QuantMax8  6
-		MOVDQ	%1, %5
-		pxor	%2, %2
-		pcmpgtw	%2, %1
-		pxor	%1, %2
-		psubw	%1, %2
-		paddusw	%1, %3
-		pmulhuw	%1, %4
-		pmaxsw	%6, %1
-		pxor	%1, %2
-		psubw	%1, %2
-		MOVDQ	%5, %1
-%endmacro
-
-%define pDct				esp + 4
-%define ff					esp + 8
-%define mf					esp + 12
-%define max					esp + 16
-;***********************************************************************
-;	void WelsQuant4x4_sse2(int16_t *pDct, int16_t* ff,  int16_t *mf);
-;***********************************************************************
-WELS_EXTERN WelsQuant4x4_sse2
-		%assign push_num 0
-                LOAD_3_PARA
-		movdqa	xmm2, [r1]
-		movdqa	xmm3, [r2]
-
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
-
-		ret
-
-;***********************************************************************
-;void WelsQuant4x4Dc_sse2(int16_t *pDct, const int16_t ff, int16_t mf);
-;***********************************************************************
-WELS_EXTERN WelsQuant4x4Dc_sse2
- 		%assign push_num 0
-		LOAD_3_PARA
-		SIGN_EXTENSIONW r1, r1w
-		SIGN_EXTENSIONW r2, r2w
-		SSE2_Copy8Times xmm3, r2d
-
-		SSE2_Copy8Times xmm2, r1d
-
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
-
-		ret
-
-;***********************************************************************
-;	void WelsQuantFour4x4_sse2(int16_t *pDct, int16_t* ff,  int16_t *mf);
-;***********************************************************************
-WELS_EXTERN WelsQuantFour4x4_sse2
-		%assign push_num 0
-		LOAD_3_PARA
-		MOVDQ	xmm2, [r1]
-		MOVDQ	xmm3, [r2]
-
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x20]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x30]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x40]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x50]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x60]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x70]
-
-		ret
-
-;***********************************************************************
-;	void WelsQuantFour4x4Max_sse2(int16_t *pDct, int16_t* ff,  int16_t *mf, int16_t *max);
-;***********************************************************************
-WELS_EXTERN WelsQuantFour4x4Max_sse2
-		%assign push_num 0
-		LOAD_4_PARA
-		PUSH_XMM 8
-		MOVDQ	xmm2, [r1]
-		MOVDQ	xmm3, [r2]
-
-		pxor	xmm4, xmm4
-		pxor	xmm5, xmm5
-		pxor	xmm6, xmm6
-		pxor	xmm7, xmm7
-		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [r0	  ], xmm4
-		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x10], xmm4
-		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x20], xmm5
-		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x30], xmm5
-		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x40], xmm6
-		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x50], xmm6
-		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x60], xmm7
-		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x70], xmm7
-
-		SSE2_TransTwo4x4W xmm4, xmm5, xmm6, xmm7, xmm0
-		pmaxsw  xmm0,  xmm4
-		pmaxsw  xmm0,  xmm5
-		pmaxsw  xmm0,  xmm7
-		movdqa	xmm1,  xmm0
-		punpckhqdq	xmm0, xmm1
-		pmaxsw	xmm0, xmm1
-
-		movq	[r3], xmm0
-		POP_XMM
-		LOAD_4_PARA_POP
-		ret
-
-%macro  MMX_Copy4Times 2
-		movd		%1, %2
-		punpcklwd	%1, %1
-		punpckldq	%1,	%1
-%endmacro
-
-SECTION .text
-
-%macro MMX_Quant4  4
-		pxor	%2, %2
-		pcmpgtw	%2, %1
-		pxor	%1, %2
-		psubw	%1, %2
-		paddusw	%1, %3
-		pmulhuw	%1, %4
-		pxor	%1, %2
-		psubw	%1, %2
-%endmacro
-
-;***********************************************************************
-;int32_t WelsHadamardQuant2x2_mmx(int16_t *rs, const int16_t ff, int16_t mf, int16_t * pDct, int16_t * block);
-;***********************************************************************
-WELS_EXTERN WelsHadamardQuant2x2_mmx
-		%assign push_num 0
-		LOAD_5_PARA
-		SIGN_EXTENSIONW r1, r1w
-		SIGN_EXTENSIONW r2, r2w
-		movd		mm0,			[r0]
-		movd		mm1,			[r0 + 0x20]
-		punpcklwd	mm0,			mm1
-		movd		mm3,			[r0 + 0x40]
-		movd		mm1,			[r0 + 0x60]
-		punpcklwd	mm3,			mm1
-
-		;hdm_2x2,	mm0 = dct0 dct1, mm3 = dct2 dct3
-		movq		mm5,			mm3
-		paddw		mm3,			mm0
-		psubw		mm0,			mm5
-		punpcklwd	mm3,			mm0
-		movq		mm1,			mm3
-		psrlq		mm1,			32
-		movq		mm5,			mm1
-		paddw		mm1,			mm3
-		psubw		mm3,			mm5
-		punpcklwd	mm1,			mm3
-
-		;quant_2x2_dc
-		MMX_Copy4Times	mm3,		r2d
-		MMX_Copy4Times	mm2,		r1d
-		MMX_Quant4		mm1,	mm0,	mm2,	mm3
-
-		; store dct_2x2
-		movq		[r3],			mm1
-		movq		[r4],			mm1
-
-		; pNonZeroCount of dct_2x2
-		pcmpeqb		mm2,			mm2		; mm2 = FF
-		pxor		mm3,			mm3
-		packsswb	mm1,			mm3
-		pcmpeqb		mm1,			mm3		; set FF if equal, 0 if not equal
-		psubsb		mm1,			mm2		; set 0 if equal, 1 if not equal
-		psadbw		mm1,			mm3		;
-		mov			r1w,				0
-		mov			[r0],			r1w
-		mov			[r0 + 0x20],	r1w
-		mov			[r0 + 0x40],	r1w
-		mov			[r0 + 0x60],	r1w
-
-
-		movd		retrd,		mm1
-
-		WELSEMMS
-		LOAD_5_PARA_POP
-		ret
-
-;***********************************************************************
-;int32_t WelsHadamardQuant2x2Skip_mmx(int16_t *pDct, int16_t ff,  int16_t mf);
-;***********************************************************************
-WELS_EXTERN WelsHadamardQuant2x2Skip_mmx
-		%assign push_num 0
-		LOAD_3_PARA
-		SIGN_EXTENSIONW r1, r1w
-		SIGN_EXTENSIONW r2, r2w
-		movd		mm0,			[r0]
-		movd		mm1,			[r0 + 0x20]
-		punpcklwd	mm0,			mm1
-		movd		mm3,			[r0 + 0x40]
-		movd		mm1,			[r0 + 0x60]
-		punpcklwd	mm3,			mm1
-
-		;hdm_2x2,	mm0 = dct0 dct1, mm3 = dct2 dct3
-		movq		mm5,			mm3
-		paddw		mm3,			mm0
-		psubw		mm0,			mm5
-		punpcklwd	mm3,			mm0
-		movq		mm1,			mm3
-		psrlq		mm1,			32
-		movq		mm5,			mm1
-		paddw		mm1,			mm3
-		psubw		mm3,			mm5
-		punpcklwd	mm1,			mm3
-
-		;quant_2x2_dc
-		MMX_Copy4Times	mm3,		r2d
-		MMX_Copy4Times	mm2,		r1d
-		MMX_Quant4		mm1,	mm0,	mm2,	mm3
-
-		; pNonZeroCount of dct_2x2
-		pcmpeqb		mm2,			mm2		; mm2 = FF
-		pxor		mm3,			mm3
-		packsswb	mm1,			mm3
-		pcmpeqb		mm1,			mm3		; set FF if equal, 0 if not equal
-		psubsb		mm1,			mm2		; set 0 if equal, 1 if not equal
-		psadbw		mm1,			mm3		;
-		movd		retrd,			mm1
-
-		WELSEMMS
-		ret
-
-
-%macro SSE2_DeQuant8 3
-    MOVDQ  %2, %1
-    pmullw %2, %3
-    MOVDQ  %1, %2
-%endmacro
-
-
-;***********************************************************************
-; void WelsDequant4x4_sse2(int16_t *pDct, const uint16_t* mf);
-;***********************************************************************
-WELS_EXTERN WelsDequant4x4_sse2
-	%assign push_num 0
-	LOAD_2_PARA
-
-	movdqa  xmm1, [r1]
-	SSE2_DeQuant8 [r0	],  xmm0, xmm1
-	SSE2_DeQuant8 [r0 + 0x10],  xmm0, xmm1
-
-    ret
-
-;***********************************************************************====
-;void WelsDequantFour4x4_sse2(int16_t *pDct, const uint16_t* mf);
-;***********************************************************************====
-
-WELS_EXTERN WelsDequantFour4x4_sse2
-	%assign push_num 0
-	LOAD_2_PARA
-
-	movdqa  xmm1, [r1]
-	SSE2_DeQuant8 [r0	],  xmm0, xmm1
-	SSE2_DeQuant8 [r0+0x10	],  xmm0, xmm1
-	SSE2_DeQuant8 [r0+0x20	],  xmm0, xmm1
-	SSE2_DeQuant8 [r0+0x30	],  xmm0, xmm1
-	SSE2_DeQuant8 [r0+0x40	],  xmm0, xmm1
-	SSE2_DeQuant8 [r0+0x50	],  xmm0, xmm1
-	SSE2_DeQuant8 [r0+0x60	],  xmm0, xmm1
-	SSE2_DeQuant8 [r0+0x70	],  xmm0, xmm1
-
-    ret
-
-;***********************************************************************
-;void WelsDequantIHadamard4x4_sse2(int16_t *rs, const uint16_t mf);
-;***********************************************************************
-WELS_EXTERN WelsDequantIHadamard4x4_sse2
-		%assign push_num 0
-		LOAD_2_PARA
-		%ifndef X86_32
-		movzx r1, r1w
-		%endif
-
-		; WelsDequantLumaDc4x4
-		SSE2_Copy8Times	xmm1,		r1d
-		;psrlw		xmm1,		2		; for the (>>2) in ihdm
-		MOVDQ		xmm0,		[r0]
-		MOVDQ		xmm2,		[r0+0x10]
-		pmullw		xmm0,		xmm1
-		pmullw		xmm2,		xmm1
-
-		; ihdm_4x4
-		movdqa		xmm1,		xmm0
-		psrldq		xmm1,		8
-		movdqa		xmm3,		xmm2
-		psrldq		xmm3,		8
-
-		SSE2_SumSub		xmm0, xmm3,	xmm5					; xmm0 = xmm0 - xmm3, xmm3 = xmm0 + xmm3
-		SSE2_SumSub		xmm1, xmm2, xmm5					; xmm1 = xmm1 - xmm2, xmm2 = xmm1 + xmm2
-		SSE2_SumSub		xmm3, xmm2, xmm5					; xmm3 = xmm3 - xmm2, xmm2 = xmm3 + xmm2
-		SSE2_SumSub		xmm0, xmm1, xmm5               		; xmm0 = xmm0 - xmm1, xmm1 = xmm0 + xmm1
-
-		SSE2_TransTwo4x4W	xmm2, xmm1, xmm3, xmm0, xmm4
-		SSE2_SumSub		xmm2, xmm4,	xmm5
-		SSE2_SumSub		xmm1, xmm0, xmm5
-		SSE2_SumSub		xmm4, xmm0, xmm5
-		SSE2_SumSub		xmm2, xmm1, xmm5
-		SSE2_TransTwo4x4W	xmm0, xmm1, xmm4, xmm2, xmm3
-
-		punpcklqdq	xmm0,		xmm1
-		MOVDQ		[r0],		xmm0
-
-		punpcklqdq	xmm2,		xmm3
-		MOVDQ		[r0+16],	xmm2
-		ret
--- a/codec/encoder/core/asm/score.asm
+++ /dev/null
@@ -1,339 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  score.asm
-;*
-;*  Abstract
-;*      scan/score/count of sse2
-;*
-;*  History
-;*      8/21/2009 Created
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-;***********************************************************************
-; Macros
-;***********************************************************************
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-SECTION .rodata align=16
-
-;align 16
-;se2_2 dw 2, 2, 2, 2, 2, 2, 2, 2
-align 16
-sse2_1: dw 1, 1, 1, 1, 1, 1, 1, 1
-align 16
-sse2_b1: db 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
-i_ds_table: db 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-align 16
-sse2_plane_inc_minus: dw -7, -6, -5, -4, -3, -2, -1, 0
-align 16
-sse2_plane_inc: dw 1, 2, 3, 4, 5, 6, 7, 8
-align 16
-sse2_plane_dec: dw 8, 7, 6, 5, 4, 3, 2, 1
-align 16
-pb_scanacdc_maska:db 0,1,2,3,8,9,14,15,10,11,4,5,6,7,12,13
-align 16
-pb_scanacdc_maskb:db 2,3,8,9,10,11,4,5,0,1,6,7,12,13,14,15
-align 16
-pb_scandc_maska:db 2,3,8,9,14,15,10,11,4,5,6,7,12,13,0,1
-align 16
-pb_scandc_maskb:db 8,9,10,11,4,5,0,1,6,7,12,13,14,15,128,128
-
-align 16
-nozero_count_table:
-db  0,1,1,2,1,2,2,3,1,2
-db  2,3,2,3,3,4,1,2,2,3
-db  2,3,3,4,2,3,3,4,3,4
-db  4,5,1,2,2,3,2,3,3,4
-db  2,3,3,4,3,4,4,5,2,3
-db  3,4,3,4,4,5,3,4,4,5
-db  4,5,5,6,1,2,2,3,2,3
-db  3,4,2,3,3,4,3,4,4,5
-db  2,3,3,4,3,4,4,5,3,4
-db  4,5,4,5,5,6,2,3,3,4
-db  3,4,4,5,3,4,4,5,4,5
-db  5,6,3,4,4,5,4,5,5,6
-db  4,5,5,6,5,6,6,7,1,2
-db  2,3,2,3,3,4,2,3,3,4
-db  3,4,4,5,2,3,3,4,3,4
-db  4,5,3,4,4,5,4,5,5,6
-db  2,3,3,4,3,4,4,5,3,4
-db  4,5,4,5,5,6,3,4,4,5
-db  4,5,5,6,4,5,5,6,5,6
-db  6,7,2,3,3,4,3,4,4,5
-db  3,4,4,5,4,5,5,6,3,4
-db  4,5,4,5,5,6,4,5,5,6
-db  5,6,6,7,3,4,4,5,4,5
-db  5,6,4,5,5,6,5,6,6,7
-db  4,5,5,6,5,6,6,7,5,6
-db  6,7,6,7,7,8
-
-align 16
-high_mask_table:
-	db  0, 0, 0, 3, 0, 2, 3, 6, 0, 2
-	db  2, 5, 3, 5, 6, 9, 0, 1, 2, 5
-	db  2, 4, 5, 8, 3, 5, 5, 8, 6, 8
-	db  9,12, 0, 1, 1, 4, 2, 4, 5, 8
-	db  2, 4, 4, 7, 5, 7, 8,11, 3, 4
-	db  5, 8, 5, 7, 8,11, 6, 8, 8,11
-	db  9,11,12,15, 0, 1, 1, 4, 1, 3
-	db  4, 7, 2, 4, 4, 7, 5, 7, 8,11
-	db  2, 3, 4, 7, 4, 6, 7,10, 5, 7
-	db  7,10, 8,10,11,14, 3, 4, 4, 7
-	db  5, 7, 8,11, 5, 7, 7,10, 8,10
-	db 11,14, 6, 7, 8,11, 8,10,11,14
-	db  9,11,11,14,12,14,15,18, 0, 0
-	db  1, 4, 1, 3, 4, 7, 1, 3, 3, 6
-	db  4, 6, 7,10, 2, 3, 4, 7, 4, 6
-	db  7,10, 5, 7, 7,10, 8,10,11,14
-	db  2, 3, 3, 6, 4, 6, 7,10, 4, 6
-	db  6, 9, 7, 9,10,13, 5, 6, 7,10
-	db  7, 9,10,13, 8,10,10,13,11,13
-	db 14,17, 3, 4, 4, 7, 4, 6, 7,10
-	db  5, 7, 7,10, 8,10,11,14, 5, 6
-	db  7,10, 7, 9,10,13, 8,10,10,13
-	db 11,13,14,17, 6, 7, 7,10, 8,10
-	db 11,14, 8,10,10,13,11,13,14,17
-	db  9,10,11,14,11,13,14,17,12,14
-	db 14,17,15,17,18,21
-
-align 16
-low_mask_table:
-    db  0, 3, 2, 6, 2, 5, 5, 9, 1, 5
-    db  4, 8, 5, 8, 8,12, 1, 4, 4, 8
-    db  4, 7, 7,11, 4, 8, 7,11, 8,11
-    db 11,15, 1, 4, 3, 7, 4, 7, 7,11
-    db  3, 7, 6,10, 7,10,10,14, 4, 7
-    db  7,11, 7,10,10,14, 7,11,10,14
-    db 11,14,14,18, 0, 4, 3, 7, 3, 6
-    db  6,10, 3, 7, 6,10, 7,10,10,14
-    db  3, 6, 6,10, 6, 9, 9,13, 6,10
-    db  9,13,10,13,13,17, 4, 7, 6,10
-    db  7,10,10,14, 6,10, 9,13,10,13
-    db 13,17, 7,10,10,14,10,13,13,17
-    db 10,14,13,17,14,17,17,21, 0, 3
-    db  3, 7, 3, 6, 6,10, 2, 6, 5, 9
-    db  6, 9, 9,13, 3, 6, 6,10, 6, 9
-    db  9,13, 6,10, 9,13,10,13,13,17
-    db  3, 6, 5, 9, 6, 9, 9,13, 5, 9
-    db  8,12, 9,12,12,16, 6, 9, 9,13
-    db  9,12,12,16, 9,13,12,16,13,16
-    db 16,20, 3, 7, 6,10, 6, 9, 9,13
-    db  6,10, 9,13,10,13,13,17, 6, 9
-    db  9,13, 9,12,12,16, 9,13,12,16
-    db 13,16,16,20, 7,10, 9,13,10,13
-    db 13,17, 9,13,12,16,13,16,16,20
-    db 10,13,13,17,13,16,16,20,13,17
-    db 16,20,17,20,20,24
-
-
-SECTION .text
-
-;***********************************************************************
-;void WelsScan4x4DcAc_sse2( int16_t level[16], int16_t *pDct )
-;***********************************************************************
-WELS_EXTERN WelsScan4x4DcAc_sse2
-	%ifdef X86_32
-	push r3
-	%assign push_num 1
-	%else
-	%assign push_num 0
-	%endif
-	LOAD_2_PARA
-	movdqa     xmm0, [r1]			; 7 6 5 4 3 2 1 0
-	movdqa     xmm1, [r1+16]		; f e d c b a 9 8
-	pextrw     r2d, xmm0, 7			; ecx = 7
-	pextrw     r3d, xmm1, 2			; edx = a
-	pextrw     r1d, xmm0, 5			; eax = 5
-	pinsrw     xmm1, r2d, 2			; f e d c b 7 9 8
-	pinsrw     xmm0, r1d, 7			; 5 6 5 4 3 2 1 0
-	pextrw     r2d, xmm1, 0			; ecx = 8
-	pinsrw     xmm0, r2d, 5			; 5 6 8 4 3 2 1 0
-	pinsrw     xmm1, r3d, 0			; f e d c b 7 9 a
-	pshufd     xmm2, xmm0, 0xd8		; 5 6 3 2 8 4 1 0
-	pshufd     xmm3, xmm1, 0xd8		; f e b 7 d c 9 a
-	pshufhw    xmm0, xmm2, 0x93		; 6 3 2 5 8 4 1 0
-	pshuflw    xmm1, xmm3, 0x39		; f e b 7 a d c 9
-	movdqa     [r0],xmm0
-	movdqa     [r0+16], xmm1
-	%ifdef X86_32
-	pop r3
-	%endif
-	ret
-
-;***********************************************************************
-;void WelsScan4x4DcAc_ssse3( int16_t level[16], int16_t *pDct )
-;***********************************************************************
-WELS_EXTERN WelsScan4x4DcAc_ssse3
-	%assign push_num 0
-	LOAD_2_PARA
-	movdqa     xmm0, [r1]
-	movdqa     xmm1, [r1+16]
-	pextrw		r2d,  xmm0, 7			; ecx = [7]
-	pextrw		r1d,  xmm1, 0			; eax = [8]
-	pinsrw		xmm0, r1d, 7			; xmm0[7]	=	[8]
-	pinsrw		xmm1, r2d, 0			; xmm1[0]	=	[7]
-	pshufb		xmm1, [pb_scanacdc_maskb]
-	pshufb		xmm0, [pb_scanacdc_maska]
-
-	movdqa     [r0],xmm0
-	movdqa     [r0+16], xmm1
-	ret
-;***********************************************************************
-;void WelsScan4x4Ac_sse2( int16_t* zig_value, int16_t* pDct )
-;***********************************************************************
-WELS_EXTERN WelsScan4x4Ac_sse2
-	%assign push_num 0
-	LOAD_2_PARA
-	movdqa     xmm0, [r1]
-	movdqa     xmm1, [r1+16]
-	movdqa     xmm2, xmm0
-	punpcklqdq xmm0, xmm1
-	punpckhqdq xmm2, xmm1
-
-	movdqa     xmm3, xmm0
-	punpckldq  xmm0, xmm2
-	punpckhdq  xmm3, xmm2
-	pextrw     r1d , xmm0, 3
-	pextrw     r2d , xmm0, 7
-	pinsrw     xmm0, r1d,  7
-	pextrw     r1d,  xmm3, 4
-	pinsrw     xmm3, r2d,  4
-	pextrw     r2d,  xmm3, 0
-	pinsrw     xmm3, r1d,  0
-	pinsrw     xmm0, r2d,  3
-
-	pshufhw    xmm1, xmm0, 0x93
-	pshuflw    xmm2, xmm3, 0x39
-
-    movdqa     xmm3, xmm2
-    psrldq     xmm1, 2
-    pslldq     xmm3, 14
-    por        xmm1, xmm3
-    psrldq     xmm2, 2
-	movdqa     [r0],xmm1
-	movdqa     [r0+16], xmm2
-	ret
-
-
-;***********************************************************************
-;int32_t WelsCalculateSingleCtr4x4_sse2( int16_t *pDct );
-;***********************************************************************
-WELS_EXTERN WelsCalculateSingleCtr4x4_sse2
-	%ifdef X86_32
-	push r3
-	%assign push_num 1
-	%else
-	%assign push_num 0
-	%endif
-	LOAD_1_PARA
-	movdqa    xmm0, [r0]
-	movdqa    xmm1, [r0+16]
-
-	packsswb  xmm0, xmm1
-	; below is the register map: r0 - eax, r1 - ebx, r2 - ecx, r3 - edx
-	xor r3, r3
-    pxor      xmm3, xmm3
-    pcmpeqb   xmm0, xmm3
-    pmovmskb  r3d,  xmm0
-
-    xor       r3,  0xffff
-
-	xor       r0,  r0
-	mov       r2,  7
-	mov       r1,  8
-.loop_low8_find1:
-	bt        r3,  r2
-	jc        .loop_high8_find1
-	dec		  r2
-	jnz      .loop_low8_find1
-.loop_high8_find1:
-	bt        r3, r1
-	jc        .find1end
-	inc       r1
-	cmp       r1,16
-	jb        .loop_high8_find1
-.find1end:
-	sub       r1, r2
-	sub       r1, 1
-	lea	  r2,  [i_ds_table]
-	add       r0b,  [r2+r1]
-	mov       r1, r3
-	and       r3, 0xff
-	shr       r1, 8
-	and       r1, 0xff
-	lea	  r2 , [low_mask_table]
-	add       r0b,  [r2 +r3]
-	lea	  r2, [high_mask_table]
-	add       r0b,  [r2+r1]
-	%ifdef X86_32
-	pop r3
-	%else
-	mov retrd, r0d
-	%endif
-	ret
-
-
-;***********************************************************************
-; int32_t WelsGetNoneZeroCount_sse2(int16_t* level);
-;***********************************************************************
-WELS_EXTERN WelsGetNoneZeroCount_sse2
-	%assign push_num 0
-	LOAD_1_PARA
-	movdqa    xmm0, [r0]
-	movdqa    xmm1, [r0+16]
-	pxor      xmm2, xmm2
-	pcmpeqw   xmm0, xmm2
-	pcmpeqw   xmm1, xmm2
-	packsswb  xmm1, xmm0
-	xor r1, r1
-	pmovmskb  r1d,  xmm1
-	xor       r1d,  0xffff
-	mov       r2,  r1
-	and       r1,  0xff
-	shr       r2,  8
-;	and       ecx,  0xff	; we do not need this due to high 16bits equal to 0 yet
-;	xor       retr,  retr
-	;add       al,  [nozero_count_table+r2]
-	lea 	  r0 , [nozero_count_table]
-	movzx	  r2, byte [r0+r2]
-	movzx	  r1,   byte [r0+r1]
-	mov	  retrq, r2
-	add	  retrq, r1
-	;add       al,  [nozero_count_table+r1]
-	ret
-
--- /dev/null
+++ b/codec/encoder/core/x86/coeff.asm
@@ -1,0 +1,459 @@
+;*!
+;* \copy
+;*     Copyright (c)  2010-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  coeff.asm
+;*
+;*  Abstract
+;*     cavlc
+;*
+;*  History
+;*      09/08/2010 Created
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+
+
+%ifdef X86_32
+SECTION .rodata align=16
+
+align 16
+sse2_b8 db 8, 8, 8, 8, 8, 8, 8, 8	; eight bytes of 8; CavlcParamCal_sse2 adds it to the high-half bit positions to turn them into coefficient indices 8..15
+
+ALIGN  16
+sse2_b_1 db -1, -1, -1, -1, -1, -1, -1, 0, -1, -1, -1, -1, -1, -1, -1, -1	; AND mask: clears byte 7 (the count byte of a byte_1pos_table row) and keeps every other byte
+
+align 16
+byte_1pos_table:	; 256 rows x 8 bytes, indexed by an 8-bit mask: bytes 0..6 = positions of the set bits, highest first, zero padded; byte 7 = number of set bits (row 255 has no room for its eighth position, 0)
+	db 0,0,0,0,0,0,0,0, ;0
+	db 0,0,0,0,0,0,0,1, ;1
+	db 1,0,0,0,0,0,0,1, ;2
+	db 1,0,0,0,0,0,0,2, ;3
+	db 2,0,0,0,0,0,0,1, ;4
+	db 2,0,0,0,0,0,0,2, ;5
+	db 2,1,0,0,0,0,0,2, ;6
+	db 2,1,0,0,0,0,0,3, ;7
+	db 3,0,0,0,0,0,0,1, ;8
+	db 3,0,0,0,0,0,0,2, ;9
+	db 3,1,0,0,0,0,0,2, ;10
+	db 3,1,0,0,0,0,0,3, ;11
+	db 3,2,0,0,0,0,0,2, ;12
+	db 3,2,0,0,0,0,0,3, ;13
+	db 3,2,1,0,0,0,0,3, ;14
+	db 3,2,1,0,0,0,0,4, ;15
+	db 4,0,0,0,0,0,0,1, ;16
+	db 4,0,0,0,0,0,0,2, ;17
+	db 4,1,0,0,0,0,0,2, ;18
+	db 4,1,0,0,0,0,0,3, ;19
+	db 4,2,0,0,0,0,0,2, ;20
+	db 4,2,0,0,0,0,0,3, ;21
+	db 4,2,1,0,0,0,0,3, ;22
+	db 4,2,1,0,0,0,0,4, ;23
+	db 4,3,0,0,0,0,0,2, ;24
+	db 4,3,0,0,0,0,0,3, ;25
+	db 4,3,1,0,0,0,0,3, ;26
+	db 4,3,1,0,0,0,0,4, ;27
+	db 4,3,2,0,0,0,0,3, ;28
+	db 4,3,2,0,0,0,0,4, ;29
+	db 4,3,2,1,0,0,0,4, ;30
+	db 4,3,2,1,0,0,0,5, ;31
+	db 5,0,0,0,0,0,0,1, ;32
+	db 5,0,0,0,0,0,0,2, ;33
+	db 5,1,0,0,0,0,0,2, ;34
+	db 5,1,0,0,0,0,0,3, ;35
+	db 5,2,0,0,0,0,0,2, ;36
+	db 5,2,0,0,0,0,0,3, ;37
+	db 5,2,1,0,0,0,0,3, ;38
+	db 5,2,1,0,0,0,0,4, ;39
+	db 5,3,0,0,0,0,0,2, ;40
+	db 5,3,0,0,0,0,0,3, ;41
+	db 5,3,1,0,0,0,0,3, ;42
+	db 5,3,1,0,0,0,0,4, ;43
+	db 5,3,2,0,0,0,0,3, ;44
+	db 5,3,2,0,0,0,0,4, ;45
+	db 5,3,2,1,0,0,0,4, ;46
+	db 5,3,2,1,0,0,0,5, ;47
+	db 5,4,0,0,0,0,0,2, ;48
+	db 5,4,0,0,0,0,0,3, ;49
+	db 5,4,1,0,0,0,0,3, ;50
+	db 5,4,1,0,0,0,0,4, ;51
+	db 5,4,2,0,0,0,0,3, ;52
+	db 5,4,2,0,0,0,0,4, ;53
+	db 5,4,2,1,0,0,0,4, ;54
+	db 5,4,2,1,0,0,0,5, ;55
+	db 5,4,3,0,0,0,0,3, ;56
+	db 5,4,3,0,0,0,0,4, ;57
+	db 5,4,3,1,0,0,0,4, ;58
+	db 5,4,3,1,0,0,0,5, ;59
+	db 5,4,3,2,0,0,0,4, ;60
+	db 5,4,3,2,0,0,0,5, ;61
+	db 5,4,3,2,1,0,0,5, ;62
+	db 5,4,3,2,1,0,0,6, ;63
+	db 6,0,0,0,0,0,0,1, ;64
+	db 6,0,0,0,0,0,0,2, ;65
+	db 6,1,0,0,0,0,0,2, ;66
+	db 6,1,0,0,0,0,0,3, ;67
+	db 6,2,0,0,0,0,0,2, ;68
+	db 6,2,0,0,0,0,0,3, ;69
+	db 6,2,1,0,0,0,0,3, ;70
+	db 6,2,1,0,0,0,0,4, ;71
+	db 6,3,0,0,0,0,0,2, ;72
+	db 6,3,0,0,0,0,0,3, ;73
+	db 6,3,1,0,0,0,0,3, ;74
+	db 6,3,1,0,0,0,0,4, ;75
+	db 6,3,2,0,0,0,0,3, ;76
+	db 6,3,2,0,0,0,0,4, ;77
+	db 6,3,2,1,0,0,0,4, ;78
+	db 6,3,2,1,0,0,0,5, ;79
+	db 6,4,0,0,0,0,0,2, ;80
+	db 6,4,0,0,0,0,0,3, ;81
+	db 6,4,1,0,0,0,0,3, ;82
+	db 6,4,1,0,0,0,0,4, ;83
+	db 6,4,2,0,0,0,0,3, ;84
+	db 6,4,2,0,0,0,0,4, ;85
+	db 6,4,2,1,0,0,0,4, ;86
+	db 6,4,2,1,0,0,0,5, ;87
+	db 6,4,3,0,0,0,0,3, ;88
+	db 6,4,3,0,0,0,0,4, ;89
+	db 6,4,3,1,0,0,0,4, ;90
+	db 6,4,3,1,0,0,0,5, ;91
+	db 6,4,3,2,0,0,0,4, ;92
+	db 6,4,3,2,0,0,0,5, ;93
+	db 6,4,3,2,1,0,0,5, ;94
+	db 6,4,3,2,1,0,0,6, ;95
+	db 6,5,0,0,0,0,0,2, ;96
+	db 6,5,0,0,0,0,0,3, ;97
+	db 6,5,1,0,0,0,0,3, ;98
+	db 6,5,1,0,0,0,0,4, ;99
+	db 6,5,2,0,0,0,0,3, ;100
+	db 6,5,2,0,0,0,0,4, ;101
+	db 6,5,2,1,0,0,0,4, ;102
+	db 6,5,2,1,0,0,0,5, ;103
+	db 6,5,3,0,0,0,0,3, ;104
+	db 6,5,3,0,0,0,0,4, ;105
+	db 6,5,3,1,0,0,0,4, ;106
+	db 6,5,3,1,0,0,0,5, ;107
+	db 6,5,3,2,0,0,0,4, ;108
+	db 6,5,3,2,0,0,0,5, ;109
+	db 6,5,3,2,1,0,0,5, ;110
+	db 6,5,3,2,1,0,0,6, ;111
+	db 6,5,4,0,0,0,0,3, ;112
+	db 6,5,4,0,0,0,0,4, ;113
+	db 6,5,4,1,0,0,0,4, ;114
+	db 6,5,4,1,0,0,0,5, ;115
+	db 6,5,4,2,0,0,0,4, ;116
+	db 6,5,4,2,0,0,0,5, ;117
+	db 6,5,4,2,1,0,0,5, ;118
+	db 6,5,4,2,1,0,0,6, ;119
+	db 6,5,4,3,0,0,0,4, ;120
+	db 6,5,4,3,0,0,0,5, ;121
+	db 6,5,4,3,1,0,0,5, ;122
+	db 6,5,4,3,1,0,0,6, ;123
+	db 6,5,4,3,2,0,0,5, ;124
+	db 6,5,4,3,2,0,0,6, ;125
+	db 6,5,4,3,2,1,0,6, ;126
+	db 6,5,4,3,2,1,0,7, ;127
+	db 7,0,0,0,0,0,0,1, ;128
+	db 7,0,0,0,0,0,0,2, ;129
+	db 7,1,0,0,0,0,0,2, ;130
+	db 7,1,0,0,0,0,0,3, ;131
+	db 7,2,0,0,0,0,0,2, ;132
+	db 7,2,0,0,0,0,0,3, ;133
+	db 7,2,1,0,0,0,0,3, ;134
+	db 7,2,1,0,0,0,0,4, ;135
+	db 7,3,0,0,0,0,0,2, ;136
+	db 7,3,0,0,0,0,0,3, ;137
+	db 7,3,1,0,0,0,0,3, ;138
+	db 7,3,1,0,0,0,0,4, ;139
+	db 7,3,2,0,0,0,0,3, ;140
+	db 7,3,2,0,0,0,0,4, ;141
+	db 7,3,2,1,0,0,0,4, ;142
+	db 7,3,2,1,0,0,0,5, ;143
+	db 7,4,0,0,0,0,0,2, ;144
+	db 7,4,0,0,0,0,0,3, ;145
+	db 7,4,1,0,0,0,0,3, ;146
+	db 7,4,1,0,0,0,0,4, ;147
+	db 7,4,2,0,0,0,0,3, ;148
+	db 7,4,2,0,0,0,0,4, ;149
+	db 7,4,2,1,0,0,0,4, ;150
+	db 7,4,2,1,0,0,0,5, ;151
+	db 7,4,3,0,0,0,0,3, ;152
+	db 7,4,3,0,0,0,0,4, ;153
+	db 7,4,3,1,0,0,0,4, ;154
+	db 7,4,3,1,0,0,0,5, ;155
+	db 7,4,3,2,0,0,0,4, ;156
+	db 7,4,3,2,0,0,0,5, ;157
+	db 7,4,3,2,1,0,0,5, ;158
+	db 7,4,3,2,1,0,0,6, ;159
+	db 7,5,0,0,0,0,0,2, ;160
+	db 7,5,0,0,0,0,0,3, ;161
+	db 7,5,1,0,0,0,0,3, ;162
+	db 7,5,1,0,0,0,0,4, ;163
+	db 7,5,2,0,0,0,0,3, ;164
+	db 7,5,2,0,0,0,0,4, ;165
+	db 7,5,2,1,0,0,0,4, ;166
+	db 7,5,2,1,0,0,0,5, ;167
+	db 7,5,3,0,0,0,0,3, ;168
+	db 7,5,3,0,0,0,0,4, ;169
+	db 7,5,3,1,0,0,0,4, ;170
+	db 7,5,3,1,0,0,0,5, ;171
+	db 7,5,3,2,0,0,0,4, ;172
+	db 7,5,3,2,0,0,0,5, ;173
+	db 7,5,3,2,1,0,0,5, ;174
+	db 7,5,3,2,1,0,0,6, ;175
+	db 7,5,4,0,0,0,0,3, ;176
+	db 7,5,4,0,0,0,0,4, ;177
+	db 7,5,4,1,0,0,0,4, ;178
+	db 7,5,4,1,0,0,0,5, ;179
+	db 7,5,4,2,0,0,0,4, ;180
+	db 7,5,4,2,0,0,0,5, ;181
+	db 7,5,4,2,1,0,0,5, ;182
+	db 7,5,4,2,1,0,0,6, ;183
+	db 7,5,4,3,0,0,0,4, ;184
+	db 7,5,4,3,0,0,0,5, ;185
+	db 7,5,4,3,1,0,0,5, ;186
+	db 7,5,4,3,1,0,0,6, ;187
+	db 7,5,4,3,2,0,0,5, ;188
+	db 7,5,4,3,2,0,0,6, ;189
+	db 7,5,4,3,2,1,0,6, ;190
+	db 7,5,4,3,2,1,0,7, ;191
+	db 7,6,0,0,0,0,0,2, ;192
+	db 7,6,0,0,0,0,0,3, ;193
+	db 7,6,1,0,0,0,0,3, ;194
+	db 7,6,1,0,0,0,0,4, ;195
+	db 7,6,2,0,0,0,0,3, ;196
+	db 7,6,2,0,0,0,0,4, ;197
+	db 7,6,2,1,0,0,0,4, ;198
+	db 7,6,2,1,0,0,0,5, ;199
+	db 7,6,3,0,0,0,0,3, ;200
+	db 7,6,3,0,0,0,0,4, ;201
+	db 7,6,3,1,0,0,0,4, ;202
+	db 7,6,3,1,0,0,0,5, ;203
+	db 7,6,3,2,0,0,0,4, ;204
+	db 7,6,3,2,0,0,0,5, ;205
+	db 7,6,3,2,1,0,0,5, ;206
+	db 7,6,3,2,1,0,0,6, ;207
+	db 7,6,4,0,0,0,0,3, ;208
+	db 7,6,4,0,0,0,0,4, ;209
+	db 7,6,4,1,0,0,0,4, ;210
+	db 7,6,4,1,0,0,0,5, ;211
+	db 7,6,4,2,0,0,0,4, ;212
+	db 7,6,4,2,0,0,0,5, ;213
+	db 7,6,4,2,1,0,0,5, ;214
+	db 7,6,4,2,1,0,0,6, ;215
+	db 7,6,4,3,0,0,0,4, ;216
+	db 7,6,4,3,0,0,0,5, ;217
+	db 7,6,4,3,1,0,0,5, ;218
+	db 7,6,4,3,1,0,0,6, ;219
+	db 7,6,4,3,2,0,0,5, ;220
+	db 7,6,4,3,2,0,0,6, ;221
+	db 7,6,4,3,2,1,0,6, ;222
+	db 7,6,4,3,2,1,0,7, ;223
+	db 7,6,5,0,0,0,0,3, ;224
+	db 7,6,5,0,0,0,0,4, ;225
+	db 7,6,5,1,0,0,0,4, ;226
+	db 7,6,5,1,0,0,0,5, ;227
+	db 7,6,5,2,0,0,0,4, ;228
+	db 7,6,5,2,0,0,0,5, ;229
+	db 7,6,5,2,1,0,0,5, ;230
+	db 7,6,5,2,1,0,0,6, ;231
+	db 7,6,5,3,0,0,0,4, ;232
+	db 7,6,5,3,0,0,0,5, ;233
+	db 7,6,5,3,1,0,0,5, ;234
+	db 7,6,5,3,1,0,0,6, ;235
+	db 7,6,5,3,2,0,0,5, ;236
+	db 7,6,5,3,2,0,0,6, ;237
+	db 7,6,5,3,2,1,0,6, ;238
+	db 7,6,5,3,2,1,0,7, ;239
+	db 7,6,5,4,0,0,0,4, ;240
+	db 7,6,5,4,0,0,0,5, ;241
+	db 7,6,5,4,1,0,0,5, ;242
+	db 7,6,5,4,1,0,0,6, ;243
+	db 7,6,5,4,2,0,0,5, ;244
+	db 7,6,5,4,2,0,0,6, ;245
+	db 7,6,5,4,2,1,0,6, ;246
+	db 7,6,5,4,2,1,0,7, ;247
+	db 7,6,5,4,3,0,0,5, ;248
+	db 7,6,5,4,3,0,0,6, ;249
+	db 7,6,5,4,3,1,0,6, ;250
+	db 7,6,5,4,3,1,0,7, ;251
+	db 7,6,5,4,3,2,0,6, ;252
+	db 7,6,5,4,3,2,0,7, ;253
+	db 7,6,5,4,3,2,1,7, ;254
+	db 7,6,5,4,3,2,1,8, ;255
+
+;***********************************************************************
+; Code
+;***********************************************************************
+SECTION .text
+
+
+
+;***********************************************************************
+;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
+;***********************************************************************
+WELS_EXTERN CavlcParamCal_sse2	; gathers the non-zero coefficients (highest index first) into Level, the gaps between them into run and their number into *total_coeffs; returns (last non-zero index + 1) - total in eax
+	push ebx	; ebx/edi/esi are used as scratch and restored at .return
+	push edi
+	push esi
+
+	mov			eax,	[esp+16]	;coffLevel
+	mov			edi,	[esp+24]	;Level
+	mov			ebx,	[esp+32]	;endIdx
+	cmp			ebx,	3
+	jne			.Level16
+	pxor		xmm1,	xmm1
+	movq		xmm0,	[eax]	; endIdx == 3: take only coffLevel[0..3], every other lane is zero
+	jmp			.Cal_begin
+.Level16:
+	movdqa		xmm0,	[eax]	; coffLevel[0..7]  (movdqa: coffLevel must be 16-byte aligned)
+	movdqa		xmm1,	[eax+16]	; coffLevel[8..15]
+.Cal_begin:
+    movdqa		xmm2,	xmm0
+	packsswb	xmm0,	xmm1	; one signed-saturated byte per coefficient: zero stays zero, non-zero stays non-zero
+	movdqa		xmm4,	xmm0
+	pxor		xmm3,	xmm3
+	pcmpgtb		xmm0,	xmm3	; 0xff where byte > 0
+	pcmpgtb		xmm3,	xmm4	; 0xff where byte < 0
+	por			xmm0,	xmm3
+	pmovmskb	edx,	xmm0	; edx bit i = 1 <=> coffLevel[i] != 0
+	cmp			edx,	0
+	je near   .return	; NOTE(review): nothing is written on this path and eax still holds coffLevel - callers apparently must pre-set *total_coeffs; confirm
+	movdqa		xmm6,	[sse2_b_1]
+	pcmpeqw		xmm7,	xmm7	;generate -1
+    mov			ebx,	0xff
+    ;pinsrw		xmm6,	ebx,	3
+
+    mov       bl,   dh	; ebx = mask bits 8..15 (bits 8..31 of ebx are already zero)
+
+	lea       ebx,  [byte_1pos_table+8*ebx]	; ebx -> table row for the high mask byte
+	movq      xmm0, [ebx]
+	pextrw    ecx,  xmm0, 3
+	shr       ecx,  8	; ecx = row byte 7 = number of non-zero coefficients among 8..15
+    mov       dh,   cl	; keep that count in dh across the loop
+
+.loopHighFind0:
+    cmp       ecx,   0
+    je        .loopHighFind0End
+    ; esi = next bit position from the table row (highest first);
+    ; adding 8 turns it into a coefficient index in the high half
+    movzx	  esi, byte [ebx]
+    add       esi, 8
+    movzx     esi, word [eax+2*esi]	; 16-bit load: only si is stored, and a dword load at index 15 reads past the block
+    mov       [edi], si	; append the coefficient to Level
+    add       edi,   2
+    ;add       ebx,   1
+    inc		  ebx
+    dec       ecx
+	jmp       .loopHighFind0
+.loopHighFind0End:
+    mov       cl,   dh
+    cmp       cl,   8
+	pand      xmm0, xmm6	; drop the count byte from the row copy (SSE op, the flags of cmp survive)
+    jne       .LowByteFind0
+    sub       edi,   2	; all eight set: the 8th iteration used the count byte as a position (NOTE(review): it loaded coffLevel[16]) - redo that store
+    movzx     esi,   word [eax+16]	; the missing position is 0, i.e. coffLevel[8]
+    mov       [edi], si	; 16-bit store like every other Level write (was a 32-bit store spilling into the next slot)
+    add       edi,   2
+.LowByteFind0:
+    and       edx,  0xff	; edx = mask bits 0..7
+	lea       ebx,  [byte_1pos_table+8*edx]
+	movq      xmm1, [ebx]
+    pextrw    esi,  xmm1, 3
+    or        esi,  0xff
+    or        ecx,  0xff00
+    and       ecx,  esi	; ch = count for coefficients 0..7, cl = count for 8..15
+    shr       esi,  8	; esi = loop counter for the low half
+    pand      xmm1, xmm6
+.loopLowFind0:
+    cmp       esi, 0
+    je        .loopLowFind0End
+	; edx = next bit position from the table row, which for the
+	; low half already is the coefficient index
+	movzx	  edx,	byte [ebx]
+	movzx     edx, word [eax+2*edx]	; 16-bit load, see the high-half loop
+	mov       [edi], dx
+	add       edi,   2
+	;add       ebx,   1
+	inc		  ebx
+    dec       esi
+	jmp       .loopLowFind0
+.loopLowFind0End:
+    cmp       ch,  8
+    jne       .getLevelEnd
+    sub       edi, 2	; same fix-up as above: the eighth entry is coffLevel[0]
+    movzx     edx, word [eax]
+    mov       [edi], dx
+.getLevelEnd:
+	mov      edx, [esp+28]	;total_coeffs
+    ; cl = count from coefficients 8..15, ch = count from 0..7
+    movzx	 ebx,	byte cl
+    add      cl,    ch	; cl = total number of non-zero coefficients
+    movzx    ecx,   cl	; widen: total_coeffs is int32_t*, so store all four bytes (was a byte store)
+	mov      [edx], ecx
+;getRun
+    movq     xmm5, [sse2_b8]
+    paddb    xmm0, xmm5	; high-half positions -> coefficient indices (+8)
+    pxor     xmm2, xmm2
+    pxor     xmm3, xmm3
+    mov      eax,  8
+    sub      eax,  ebx
+    shl      eax,  3	; eax = 8 * (8 - high count), a shift amount in bits
+    shl      ebx,  3	; ebx = 8 * high count
+	pinsrw   xmm2, ebx, 0
+    pinsrw   xmm3, eax, 0
+    psllq    xmm0, xmm3
+    psrlq    xmm0, xmm3	; keep only the first 'high count' index bytes
+    movdqa   xmm4, xmm1
+    psllq    xmm1, xmm2	; low-half indices moved up behind the high-half ones
+    psrlq    xmm4, xmm3	; ... and the bytes that were shifted out of the low qword
+    punpcklqdq xmm1, xmm4
+    por      xmm0,  xmm1	; xmm0 = index of every non-zero coefficient, one byte each, highest first
+
+    pextrw   eax,   xmm0, 0
+    and		 eax,   0xff	; index of the last non-zero coefficient
+    inc      eax
+    sub      al,    cl	; return value: (last index + 1) - total
+	movdqa   xmm1,  xmm0
+	paddb    xmm1,  xmm7
+	psrldq   xmm0,  1
+	psubb    xmm1,  xmm0	; per byte: index[i] - 1 - index[i+1]
+    mov      ecx,   [esp+20] ;run
+	movdqa   [ecx], xmm1	; always stores 16 bytes; run must be 16-byte aligned
+;getRunEnd
+.return:
+	pop esi
+	pop edi
+	pop ebx
+	ret
+%endif
--- /dev/null
+++ b/codec/encoder/core/x86/dct.asm
@@ -1,0 +1,504 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  dct.asm
+;*
+;*  Abstract
+;*      WelsDctFourT4_sse2
+;*
+;*  History
+;*      8/4/2009 Created
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+SECTION .rodata align=16
+
+;***********************************************************************
+; Constant
+;***********************************************************************
+
+align 16
+SSE2_DeQuant8 dw  10, 13, 10, 13, 13, 16, 13, 16,	; 12 rows of eight words, every distinct row stored twice; no code in this file references the table
+			dw	10, 13, 10, 13, 13, 16, 13, 16,
+            dw  11, 14, 11, 14, 14, 18, 14, 18,
+			dw  11, 14, 11, 14, 14, 18, 14, 18,
+			dw  13, 16, 13, 16, 16, 20, 16, 20,
+			dw  13, 16, 13, 16, 16, 20, 16, 20,
+            dw  14, 18, 14, 18, 18, 23, 18, 23,
+			dw  14, 18, 14, 18, 18, 23, 18, 23,
+			dw  16, 20, 16, 20, 20, 25, 20, 25,
+			dw  16, 20, 16, 20, 20, 25, 20, 25,
+            dw  18, 23, 18, 23, 23, 29, 23, 29,
+			dw  18, 23, 18, 23, 23, 29, 23, 29
+
+
+;***********************************************************************
+; MMX functions
+;***********************************************************************
+
+%macro MMX_LoadDiff4P 5	; %1 = result, %2 = scratch, %3/%4 = the two source addresses, %5 = register interleaved as the high bytes (callers pass mm7 after WELS_Zero)
+	movd        %1, [%3]	; four bytes from the first source
+	movd        %2, [%4]	; four bytes from the second source
+	punpcklbw   %1, %5	; widen bytes to words
+	punpcklbw   %2, %5
+	psubw       %1, %2	; %1 = four 16-bit differences [%3] - [%4]
+%endmacro
+
+%macro MMX_LoadDiff4x4P 10 ;d0, d1, d2, d3, pix1address, pix1stride, pix2address, pix2stride, tmp(mm), 0(mm)
+	MMX_LoadDiff4P %1, %9, %5,    %7,    %10	; row 0 difference
+	MMX_LoadDiff4P %2, %9, %5+%6, %7+%8, %10	; row 1 (address + stride)
+	lea  %5, [%5+2*%6]	; side effect: both address registers are advanced by two rows
+	lea  %7, [%7+2*%8]
+	MMX_LoadDiff4P %3, %9, %5,    %7,    %10	; row 2
+	MMX_LoadDiff4P %4, %9, %5+%6, %7+%8, %10	; row 3
+%endmacro
+
+%macro MMX_SumSubMul2 3	; in: %1 = a, %2 = b; out: %1 = 2a + b, %3 = a - 2b; %2 is left holding 2b
+	movq    %3, %1
+	psllw   %1, $01	; 2a
+	paddw   %1, %2
+	psllw   %2, $01	; 2b (overwrites the input)
+    psubw   %3, %2
+%endmacro
+
+%macro MMX_SumSubDiv2 3	; in: %1 = a, %2 = b; out: %3 = a + (b >> 1), %1 = (a >> 1) - b; %2 unchanged
+    movq    %3, %2
+    psraw   %3, $01	; arithmetic shift keeps the sign
+    paddw   %3, %1
+    psraw   %1, $01
+    psubw   %1, %2
+%endmacro
+
+%macro MMX_SumSub 3	; in: %1 = a, %2 = b; out: %1 = a + b, %2 = b - a; %3 = scratch (ends up holding b)
+	movq    %3, %2
+    psubw   %2, %1
+    paddw   %1, %3
+%endmacro
+
+%macro MMX_DCT 6	; in: %1..%4 = s1..s4; out: %3 = s1+s2+s3+s4, %1 = 2(s1-s4)+(s2-s3), %4 = (s1+s4)-(s2+s3), %5 = (s1-s4)-2(s2-s3); %2 and %6 are clobbered
+    MMX_SumSub		%4, %1, %6	; %4 = s1 + s4, %1 = s1 - s4
+    MMX_SumSub		%3, %2, %6	; %3 = s2 + s3, %2 = s2 - s3
+    MMX_SumSub		%3, %4, %6	; %3 = sum of all four, %4 = (s1+s4) - (s2+s3)
+    MMX_SumSubMul2  %1, %2, %5
+%endmacro
+
+%macro MMX_IDCT 6	; inverse butterfly over %2..%5; %1 and %6 are scratch on entry; results end up in %1, %3, %4 and %5
+    MMX_SumSub      %4, %5, %6	; %4 = %4 + %5, %5 = %5 - %4
+    MMX_SumSubDiv2  %3, %2, %1	; %1 = %3 + (%2 >> 1), %3 = (%3 >> 1) - %2
+    MMX_SumSub		%1, %4, %6	; %1 = %1 + %4, %4 = %4 - %1
+	MMX_SumSub		%3, %5, %6	; %3 = %3 + %5, %5 = %5 - %3
+%endmacro
+
+%macro MMX_StoreDiff4P 6	; %1 = residual words, %2 = scratch, %3 = rounding term, %4 = register interleaved as high bytes, %5 = destination operand, %6 = prediction operand
+    movd       %2, %6	; four prediction bytes
+    punpcklbw  %2, %4	; widen to words
+    paddw      %1, %3
+    psraw      %1, $06	; (residual + %3) >> 6
+    paddsw     %1, %2	; add the prediction with signed saturation
+    packuswb   %1, %2	; clamp to unsigned bytes
+    movd       %5, %1	; store four bytes
+%endmacro
+SECTION .text
+;***********************************************************************
+;   void WelsDctT4_mmx( int16_t *pDct[4], uint8_t *pix1, int32_t i_pix1, uint8_t *pix2, int32_t i_pix2 )
+;***********************************************************************
+WELS_EXTERN WelsDctT4_mmx	; MMX: 4x4 block of (pix1 - pix2) differences, two MMX_DCT passes with transposes, 16 words written to pDct
+    %assign push_num 0
+    LOAD_5_PARA	; presumably r0..r4 = pDct, pix1, i_pix1, pix2, i_pix2 in prototype order (macro defined outside this file)
+    SIGN_EXTENSION r2, r2d
+    SIGN_EXTENSION r4, r4d
+    WELS_Zero    mm7
+
+    MMX_LoadDiff4x4P mm1, mm2, mm3, mm4, r1, r2, r3, r4, mm0, mm7	; mm1..mm4 = four rows of 16-bit differences; advances r1 and r3
+
+    MMX_DCT			mm1, mm2, mm3 ,mm4, mm5, mm6	; first pass
+    MMX_Trans4x4W	mm3, mm1, mm4, mm5, mm2
+
+    MMX_DCT			mm3, mm5, mm2 ,mm4, mm1, mm6	; second pass on the transposed data
+    MMX_Trans4x4W	mm2, mm3, mm4, mm1, mm5
+
+    movq    [r0+ 0],   mm2	; four 8-byte rows = 16 coefficients
+    movq    [r0+ 8],   mm1
+    movq    [r0+16],   mm5
+    movq    [r0+24],   mm4
+    WELSEMMS
+    LOAD_5_PARA_POP
+    ret
+
+
+;***********************************************************************
+;   void WelsIDctT4Rec_mmx(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs)
+;***********************************************************************
+WELS_EXTERN WelsIDctT4Rec_mmx	; MMX: inverse-transforms the 16 words at rs, adds them to a 4x4 prediction block and stores 4x4 bytes to rec
+    %assign push_num 0
+    LOAD_5_PARA	; presumably r0..r4 = rec, stride, pred, pred_stride, rs in prototype order (macro defined outside this file)
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    movq    mm0, [r4+ 0]	; four rows of four 16-bit coefficients
+    movq    mm1, [r4+ 8]
+    movq    mm2, [r4+16]
+    movq    mm3, [r4+24]
+
+	MMX_Trans4x4W		mm0, mm1, mm2, mm3, mm4
+	MMX_IDCT			mm1, mm2, mm3, mm4, mm0, mm6	; first pass
+    MMX_Trans4x4W		mm1, mm3, mm0, mm4, mm2
+	MMX_IDCT			mm3, mm0, mm4, mm2, mm1, mm6	; second pass
+
+    WELS_Zero			mm7
+    WELS_DW32			mm6	; presumably words of 32, the rounding term for the >> 6 in MMX_StoreDiff4P - confirm in asm_inc.asm
+
+    MMX_StoreDiff4P		mm3, mm0, mm6, mm7, [r0], [r2]	; row 0
+    MMX_StoreDiff4P		mm4, mm0, mm6, mm7, [r0+r1], [r2+r3]	; row 1
+    lea     r0, [r0+2*r1]
+    lea     r2, [r2+2*r3]
+    MMX_StoreDiff4P		mm1, mm0, mm6, mm7, [r0], [r2]	; row 2
+    MMX_StoreDiff4P		mm2, mm0, mm6, mm7, [r0+r1], [r2+r3]	; row 3
+
+	WELSEMMS
+    LOAD_5_PARA_POP
+    ret
+
+
+;***********************************************************************
+; SSE2 functions
+;***********************************************************************
+%macro SSE2_Store4x8p 6	; %1 = base address, %2..%5 = data registers, %6 = scratch; writes 64 bytes at %1
+	SSE2_XSawp qdq, %2, %3, %6	; regroup 64-bit halves (macro defined outside this file)
+	SSE2_XSawp qdq, %4, %5, %3
+	MOVDQ    [%1+0x00], %2
+	MOVDQ    [%1+0x10], %4
+	MOVDQ    [%1+0x20], %6
+	MOVDQ    [%1+0x30], %3
+%endmacro
+
+%macro SSE2_Load4x8p 6	; %1 = base address; reads 64 bytes at %1 into %2, %4, %6, %3 and regroups them; counterpart of SSE2_Store4x8p
+	MOVDQ    %2,	[%1+0x00]
+	MOVDQ    %4,	[%1+0x10]
+	MOVDQ    %6,	[%1+0x20]
+	MOVDQ    %3,	[%1+0x30]
+	SSE2_XSawp qdq, %4, %3, %5	; regroup 64-bit halves (macro defined outside this file)
+	SSE2_XSawp qdq, %2, %6, %3
+%endmacro
+
+%macro SSE2_SumSubMul2 3	; in: %1 = a, %2 = b; out: %1 = 2a + b, %3 = a - 2b; unlike the MMX variant %2 is preserved
+    movdqa  %3, %1
+    paddw   %1, %1	; 2a
+    paddw   %1, %2
+    psubw   %3, %2
+    psubw   %3, %2	; subtracting b twice avoids modifying %2
+%endmacro
+
+%macro SSE2_SumSubDiv2 4	; in: %1 = a, %2 = b; out: %1 = a + (b >> 1), %4 = (a >> 1) - b; %2 is left as b >> 1 and %3 as a copy of b
+    movdqa  %4, %1
+    movdqa  %3, %2
+    psraw   %2, $01	; arithmetic shifts keep the sign
+    psraw   %4, $01
+    paddw   %1, %2
+    psubw   %4, %3
+%endmacro
+
+%macro SSE2_StoreDiff8p 6	; 6-parameter form: %1 = residual words, %2 = scratch, %3 = rounding term, %4 = register interleaved as high bytes, %5 = destination operand, %6 = prediction operand
+    paddw       %1, %3
+    psraw       %1, $06	; (residual + %3) >> 6
+    movq		%2, %6	; eight prediction bytes
+    punpcklbw   %2, %4	; widen to words
+    paddsw      %2, %1	; prediction + residual, signed saturation
+    packuswb    %2, %2	; clamp to unsigned bytes
+    movq	    %5, %2	; store eight bytes
+%endmacro
+
+%macro SSE2_StoreDiff8p 5	; 5-parameter overload (NASM selects by argument count): %1 = ready-to-add words, %2 = scratch, %3 = register interleaved as high bytes, %4 = destination operand, %5 = prediction operand; no rounding or shift here
+    movq		%2, %5	; eight prediction bytes
+    punpcklbw   %2, %3	; widen to words
+    paddsw      %2, %1	; signed-saturating add; %1 is not modified
+    packuswb    %2, %2	; clamp to unsigned bytes
+    movq	    %4, %2	; store eight bytes
+%endmacro
+
+%macro SSE2_Load8DC	6	; %1..%4 = outputs, %5 = rounding term added before the shift, %6 = aligned memory operand holding eight 16-bit DC values
+	movdqa		%1,		%6		; %1 = dc0 dc1 ... dc7 (all eight words)
+	paddw       %1,		%5
+    psraw       %1,		$06		; (dc + 32) >> 6
+
+    movdqa		%2,		%1
+    psrldq		%2,		4	; byte shift: drop dc0 and dc1
+ 	punpcklwd	%2,		%2
+	punpckldq	%2,		%2		; %2 = dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
+
+    movdqa		%3,		%1
+    psrldq		%3,		8	; drop dc0..dc3
+ 	punpcklwd	%3,		%3
+	punpckldq	%3,		%3		; %3 = dc4 dc4 dc4 dc4 dc5 dc5 dc5 dc5
+
+	movdqa		%4,		%1
+    psrldq		%4,		12	; drop dc0..dc5
+ 	punpcklwd	%4,		%4
+	punpckldq	%4,		%4		; %4 = dc6 dc6 dc6 dc6 dc7 dc7 dc7 dc7
+
+	punpcklwd	%1,		%1
+	punpckldq	%1,		%1		; %1 = dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
+%endmacro
+
+%macro SSE2_DCT 6	; one forward butterfly pass over %1, %2, %3, %6 with %4/%5 as scratch; built on SSE2_SumSub, which is defined outside this file
+    SSE2_SumSub		%6, %3,	%5
+	SSE2_SumSub		%1, %2, %5
+	SSE2_SumSub		%3, %2, %5
+	SSE2_SumSubMul2		%6, %1, %4	; %6 = 2*%6 + %1, %4 = %6 - 2*%1
+%endmacro
+
+%macro SSE2_IDCT 7	; one inverse butterfly pass over %1, %2, %3, %7 with %4..%6 as scratch; built on SSE2_SumSub, which is defined outside this file
+    SSE2_SumSub       %7, %2, %6
+    SSE2_SumSubDiv2     %1, %3, %5, %4	; %1 = %1 + (%3 >> 1), %4 = (%1 >> 1) - %3
+    SSE2_SumSub	     %2, %1, %5
+    SSE2_SumSub		 %7, %4, %5
+%endmacro
+
+;***********************************************************************
+; void WelsDctFourT4_sse2(int16_t *pDct, uint8_t *pix1, int32_t i_pix1, uint8_t *pix2, int32_t i_pix2 )
+;***********************************************************************
+WELS_EXTERN WelsDctFourT4_sse2	; SSE2: two passes of four rows each; every pass transforms the pix1 - pix2 differences and writes 64 bytes of coefficients to pDct
+    %assign push_num 0
+    LOAD_5_PARA	; presumably r0..r4 = pDct, pix1, i_pix1, pix2, i_pix2 in prototype order (macro defined outside this file)
+    PUSH_XMM 8
+    SIGN_EXTENSION r2, r2d
+    SIGN_EXTENSION r4, r4d
+    pxor    xmm7, xmm7	; zero register handed to SSE2_LoadDiff8P
+	;Load 4x8
+	SSE2_LoadDiff8P    xmm0, xmm6, xmm7, [r1], [r3]	; rows 0..3 -> xmm0..xmm3 (macro defined outside this file)
+    SSE2_LoadDiff8P    xmm1, xmm6, xmm7, [r1+r2], [r3+r4]
+	lea		r1, [r1 + 2 * r2]
+	lea		r3, [r3 + 2 * r4]
+	SSE2_LoadDiff8P    xmm2, xmm6, xmm7, [r1], [r3]
+    SSE2_LoadDiff8P    xmm3, xmm6, xmm7, [r1+r2], [r3+r4]
+
+	SSE2_DCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm0	; first pass
+	SSE2_TransTwo4x4W	xmm2, xmm0, xmm3, xmm4, xmm1
+	SSE2_DCT			xmm0, xmm4, xmm1, xmm3, xmm5, xmm2	; second pass
+	SSE2_TransTwo4x4W	xmm4, xmm2, xmm1, xmm3, xmm0
+
+	SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5	; first 64 bytes of output
+
+	lea		r1, [r1 + 2 * r2]	; move on to rows 4..7
+	lea		r3, [r3 + 2 * r4]
+
+	;Load 4x8
+	SSE2_LoadDiff8P    xmm0, xmm6, xmm7, [r1      ], [r3    ]
+    SSE2_LoadDiff8P    xmm1, xmm6, xmm7, [r1+r2  ], [r3+r4]
+	lea		r1, [r1 + 2 * r2]
+	lea		r3, [r3 + 2 * r4]
+    SSE2_LoadDiff8P    xmm2, xmm6, xmm7, [r1], [r3]
+    SSE2_LoadDiff8P    xmm3, xmm6, xmm7, [r1+r2], [r3+r4]
+
+	SSE2_DCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
+	SSE2_TransTwo4x4W	xmm2, xmm0, xmm3, xmm4, xmm1
+    SSE2_DCT			xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
+	SSE2_TransTwo4x4W	xmm4, xmm2, xmm1, xmm3, xmm0
+
+	lea		r0, [r0+64]	; second 64 bytes of output
+	SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
+
+	POP_XMM
+	LOAD_5_PARA_POP
+    ret
+
+
+;***********************************************************************
+; void WelsIDctFourT4Rec_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs);
+;***********************************************************************
+WELS_EXTERN WelsIDctFourT4Rec_sse2	; SSE2: two passes; each reads 64 bytes of coefficients from rs, inverse-transforms them, adds the prediction and stores four rows of eight bytes to rec
+	%assign push_num 0
+	LOAD_5_PARA	; presumably r0..r4 = rec, stride, pred, pred_stride, rs in prototype order (macro defined outside this file)
+	PUSH_XMM 8
+	SIGN_EXTENSION r1, r1d
+	SIGN_EXTENSION r3, r3d
+	;Load 4x8
+	SSE2_Load4x8p  r4, xmm0, xmm1, xmm4, xmm2, xmm5
+
+	SSE2_TransTwo4x4W	xmm0, xmm1, xmm4, xmm2, xmm3
+  	SSE2_IDCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0	; first pass
+    SSE2_TransTwo4x4W	xmm1, xmm4, xmm0, xmm2, xmm3
+    SSE2_IDCT			xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1	; second pass
+
+	WELS_Zero			xmm7
+    WELS_DW32			xmm6	; presumably words of 32, the rounding term for the >> 6 in SSE2_StoreDiff8p - confirm in asm_inc.asm
+
+	SSE2_StoreDiff8p   xmm4, xmm5, xmm6, xmm7, [r0		],	[r2]	; rows 0..3 (6-parameter form: round, shift, add prediction, store)
+	SSE2_StoreDiff8p   xmm0, xmm5, xmm6, xmm7, [r0 + r1	],	[r2 + r3]
+	lea		r0, [r0 + 2 * r1]
+	lea		r2, [r2 + 2 * r3]
+	SSE2_StoreDiff8p   xmm1, xmm5, xmm6, xmm7, [r0],			[r2]
+	SSE2_StoreDiff8p   xmm2, xmm5, xmm6, xmm7, [r0 + r1	],	[r2 + r3]
+
+    add		r4, 64	; next 64 bytes of coefficients
+	lea		r0, [r0 + 2 * r1]	; rows 4..7
+	lea		r2, [r2 + 2 * r3]
+   	SSE2_Load4x8p  r4, xmm0, xmm1, xmm4, xmm2, xmm5
+
+	SSE2_TransTwo4x4W   xmm0, xmm1, xmm4, xmm2, xmm3
+	SSE2_IDCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
+    SSE2_TransTwo4x4W   xmm1, xmm4, xmm0, xmm2, xmm3
+	SSE2_IDCT			xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
+
+	WELS_Zero			xmm7
+    WELS_DW32			xmm6	; reloaded because xmm6 served as scratch in SSE2_IDCT
+
+	SSE2_StoreDiff8p   xmm4, xmm5, xmm6, xmm7, [r0		],	[r2]
+	SSE2_StoreDiff8p   xmm0, xmm5, xmm6, xmm7, [r0 + r1	],	[r2 + r3]
+	lea		r0, [r0 + 2 * r1]
+	lea		r2, [r2 + 2 * r3]
+	SSE2_StoreDiff8p   xmm1, xmm5, xmm6, xmm7, [r0],			[r2]
+	SSE2_StoreDiff8p   xmm2, xmm5, xmm6, xmm7, [r0 + r1],	[r2 + r3]
+	POP_XMM
+	LOAD_5_PARA_POP
+   ; pop		esi
+   ; pop		ebx
+    ret
+
+%macro SSE2_StoreDiff4x8p 8	; %1 = words for the left 8 pixels, %2 = words for the right 8 pixels, %3 = scratch, %4 = zero register, %5/%6 = destination/prediction address, %7/%8 = their strides; covers two rows of 16 pixels
+   	SSE2_StoreDiff8p    %1, %3, %4, [%5],			[%6]	; row 0, pixels 0..7 (5-parameter form: add and store only)
+	SSE2_StoreDiff8p    %1, %3, %4, [%5 + %7],		[%6 + %8]	; row 1, pixels 0..7
+	SSE2_StoreDiff8p    %2, %3, %4, [%5 + 8],		[%6 + 8]	; row 0, pixels 8..15
+	SSE2_StoreDiff8p    %2, %3, %4, [%5 + %7 + 8],	[%6 + %8 + 8]	; row 1, pixels 8..15
+%endmacro
+
+ ;***********************************************************************
+; void WelsIDctRecI16x16Dc_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *dct_dc)
+;***********************************************************************
+WELS_EXTERN WelsIDctRecI16x16Dc_sse2	; SSE2: adds one rounded DC value, (dc + rounding) >> 6, to each 4-pixel-wide group of the prediction and stores 16 rows of 16 bytes to rec
+	%assign push_num 0
+	LOAD_5_PARA	; presumably r0..r4 = rec, stride, pred, pred_stride, dct_dc in prototype order (macro defined outside this file)
+	PUSH_XMM 8
+	SIGN_EXTENSION r1, r1d
+	SIGN_EXTENSION r3, r3d
+	pxor		xmm7,		xmm7
+    WELS_DW32	xmm6	; presumably words of 32, the rounding term used by SSE2_Load8DC - confirm in asm_inc.asm
+
+	SSE2_Load8DC			xmm0, xmm1, xmm2, xmm3, xmm6, [r4]	; first eight DC values, each replicated four times
+	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3	; rows 0-1
+
+	lea			r0,		[r0 + 2 * r1]
+	lea			r2,		[r2 + 2 * r3]
+	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3	; rows 2-3, same DC values
+
+	lea			r0,		[r0 + 2 * r1]
+	lea			r2,		[r2 + 2 * r3]
+	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3	; rows 4-5, DC values 4..7
+
+	lea			r0,		[r0 + 2 * r1]
+	lea			r2,		[r2 + 2 * r3]
+	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3	; rows 6-7
+
+	SSE2_Load8DC			xmm0, xmm1, xmm2, xmm3, xmm6, [r4 + 16]	; DC values 8..15 for the lower eight rows
+	lea			r0,		[r0 + 2 * r1]
+	lea			r2,		[r2 + 2 * r3]
+	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3	; rows 8-9
+
+	lea			r0,		[r0 + 2 * r1]
+	lea			r2,		[r2 + 2 * r3]
+	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3	; rows 10-11
+
+	lea			r0,		[r0 + 2 * r1]
+	lea			r2,		[r2 + 2 * r3]
+	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3	; rows 12-13
+
+	lea			r0,		[r0 + 2 * r1]
+	lea			r2,		[r2 + 2 * r3]
+	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3	; rows 14-15
+	POP_XMM
+	LOAD_5_PARA_POP
+    ret
+
+
+
+%macro SSE2_SumSubD 3	; 32-bit lanes; in: %1 = a, %2 = b; out: %2 = a + b, %1 = a - b; %3 = scratch (ends up holding b)
+	movdqa  %3, %2
+    paddd   %2, %1
+    psubd   %1, %3
+%endmacro
+
+%macro SSE2_SumSubDiv2D 4	; 32-bit lanes; in: %1 = a, %2 = b, %3 = rounding term; out: %1 = (a + b + %3) >> 1, %4 = that result - b
+	paddd   %1, %2
+	paddd	%1, %3
+	psrad	%1,	 1	; arithmetic shift keeps the sign
+	movdqa	%4, %1
+	psubd	%4, %2
+%endmacro
+%macro		SSE2_Load4Col	5	; %1 = result (four sign-extended dwords), %2..%4 = scratch xmm, %5 = base address expression; clobbers r2
+	movsx		r2,		WORD[%5]	; word at +0x00
+ 	movd		%1,			r2d
+ 	movsx		r2,		WORD[%5 + 0x20]	; word at +0x20
+ 	movd		%2,			r2d
+	punpckldq	%1,			%2
+	movsx		r2,		WORD[%5 + 0x80]	; word at +0x80
+ 	movd		%3,			r2d
+	movsx		r2,		WORD[%5 + 0xa0]	; word at +0xa0
+ 	movd		%4,			r2d
+	punpckldq	%3,			%4
+	punpcklqdq	%1,			%3	; %1 = { [+0x00], [+0x20], [+0x80], [+0xa0] }
+%endmacro
+
+;***********************************************************************
+;void WelsHadamardT4Dc_sse2( int16_t *luma_dc, int16_t *pDct)
+;***********************************************************************
+WELS_EXTERN WelsHadamardT4Dc_sse2
+		%assign push_num 0
+		LOAD_2_PARA
+		PUSH_XMM 8
+		SSE2_Load4Col	    xmm1, xmm5, xmm6, xmm0, r1
+		SSE2_Load4Col	    xmm2, xmm5, xmm6, xmm0, r1 + 0x40
+		SSE2_Load4Col	    xmm3, xmm5, xmm6, xmm0, r1 + 0x100
+		SSE2_Load4Col	    xmm4, xmm5, xmm6, xmm0, r1 + 0x140
+
+		SSE2_SumSubD		xmm1, xmm2, xmm7
+		SSE2_SumSubD		xmm3, xmm4, xmm7
+		SSE2_SumSubD		xmm2, xmm4, xmm7
+		SSE2_SumSubD		xmm1, xmm3, xmm7
+
+		SSE2_Trans4x4D		xmm4, xmm2, xmm1, xmm3, xmm5	; pOut: xmm4,xmm3,xmm5,xmm1
+
+		SSE2_SumSubD		xmm4, xmm3, xmm7
+		SSE2_SumSubD		xmm5, xmm1, xmm7
+
+		WELS_DD1 xmm6
+		SSE2_SumSubDiv2D	xmm3, xmm1, xmm6, xmm0			; pOut: xmm3 = (xmm3+xmm1+1)/2, xmm0 = (xmm3-xmm1+1)/2
+		SSE2_SumSubDiv2D	xmm4, xmm5, xmm6, xmm1			; pOut: xmm4 = (xmm4+xmm5+1)/2, xmm1 = (xmm4-xmm5+1)/2
+        SSE2_Trans4x4D		xmm3, xmm0, xmm1, xmm4, xmm2	; pOut: xmm3,xmm4,xmm2,xmm1
+
+		packssdw	xmm3,	xmm4
+		packssdw	xmm2,	xmm1
+		movdqa	[r0+ 0],   xmm3
+		movdqa	[r0+16],   xmm2
+
+		POP_XMM
+		ret
--- /dev/null
+++ b/codec/encoder/core/x86/intra_pred.asm
@@ -1,0 +1,1416 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  intra_pred.asm
+;*
+;*  Abstract
+;*      sse2 function for intra predict operations
+;*
+;*  History
+;*      18/09/2009 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+SECTION .rodata align=16
+
+align 16
+sse2_plane_inc_minus dw -7, -6, -5, -4, -3, -2, -1, 0	; multiplied by b for columns 0..7 in the 16x16 plane loop
+align 16
+sse2_plane_inc dw 1, 2, 3, 4, 5, 6, 7, 8	; weights for the far half of H/V; also multiplied by b for columns 8..15
+align 16
+sse2_plane_dec dw 8, 7, 6, 5, 4, 3, 2, 1	; weights for the near half of H/V (16x16 plane)
+
+; for chroma plane mode (both 8-byte tables below are read with movq)
+sse2_plane_inc_c dw 1, 2, 3, 4
+sse2_plane_dec_c dw 4, 3, 2, 1
+align 16
+sse2_plane_mul_b_c dw -3, -2, -1, 0, 1, 2, 3, 4	; multiplied by b for columns 0..7 in the chroma plane loop
+
+align 16
+mmx_01bytes:		times 16	db 1	; 0x01 in every byte: multiplier that replicates a byte, and LSB mask for the pavgb rounding fix
+
+align 16
+mmx_0x02: dw 0x02, 0x00, 0x00, 0x00	; 64-bit constant 2, the rounding term in WelsIChromaPredDc_sse2
+
+
+;***********************************************************************
+; macros
+;***********************************************************************
+; Build sixteen bytes of 0x01 in a register without touching memory.
+; %1 receives the result; %2 is scratch and is left holding all ones.
+%macro SSE_DB_1_2REG 2
+    pxor        %1, %1                  ; %1 = 0
+    pcmpeqw     %2, %2                  ; %2 = 0xFF in every byte
+    psubb       %1, %2                  ; 0 - (-1) = 1 in every byte
+%endmacro
+
+; Horizontal 4x4 prediction for two rows. %1, %2, %3 = xmm registers (%3 is scratch), %4 = row pointer, %5 = stride.
+; Result is left in the low 64 bits of %1: first row x4 in dword 0, second row x4 in dword 1.
+%macro SSE2_PRED_H_4X4_TWO_LINE 5
+    movd        %1, [%4 - 1]            ; byte 0 = pixel left of the first row
+    movdqa      %3, %1
+    punpcklbw   %1, %3                  ; every byte doubled
+    movdqa      %3, %1
+    punpcklbw   %1, %3                  ; low dword = left pixel x4
+
+    ; the second row is addressed as %4 + %5, so %4 itself is not advanced
+    movd        %2, [%4 + %5 - 1]       ; byte 0 = pixel left of the second row
+    movdqa      %3, %2
+    punpcklbw   %2, %3
+    movdqa      %3, %2
+    punpcklbw   %2, %3                  ; low dword = left pixel x4
+    punpckldq   %1, %2                  ; pack both rows into the low qword of %1
+%endmacro
+
+%macro SUMW_HORIZON1 2
+    movdqa      %2, %1                  ; fold the 8 words of %1 with unsigned saturating adds; %2 is scratch
+    psrldq      %2, 8
+    paddusw     %1, %2                  ; 8 -> 4 partial sums
+    movdqa      %2, %1
+    psrldq      %2, 4
+    paddusw     %1, %2                  ; 4 -> 2 partial sums
+    movdqa      %2, %1
+    psrldq      %2, 2
+    paddusw     %1, %2                  ; word 0 of %1 = total
+%endmacro
+
+%macro LOAD_COLUMN 6
+    movd        %1, [%5]                ; transpose 8 rows x 4 bytes starting at %5, row stride %6
+    movd        %2, [%5 + %6]
+    punpcklbw   %1, %2                  ; rows 0/1 byte-interleaved
+    lea         %5, [%5 + 2 * %6]
+    movd        %3, [%5]
+    movd        %2, [%5 + %6]
+    punpcklbw   %3, %2                  ; rows 2/3 byte-interleaved
+    punpcklwd   %1, %3                  ; dword k of %1 = byte column k of rows 0..3
+    lea         %5, [%5 + 2 * %6]
+    movd        %4, [%5]
+    movd        %2, [%5 + %6]
+    punpcklbw   %4, %2                  ; rows 4/5 byte-interleaved
+    lea         %5, [%5 + 2 * %6]
+    movd        %3, [%5]
+    movd        %2, [%5 + %6]
+    punpcklbw   %3, %2                  ; rows 6/7 byte-interleaved
+    lea         %5, [%5 + 2 * %6]       ; %5 has now advanced by 8 rows
+    punpcklwd   %4, %3                  ; dword k of %4 = byte column k of rows 4..7
+    punpckhdq   %1, %4                  ; high qword of %1 = byte column 3 of rows 0..7; %2..%4 are clobbered
+%endmacro
+
+%macro  SUMW_HORIZON 3
+	movhlps		%2, %1			; sum the 8 words of %1 (x1), %2 (x2) is scratch: x2 = xx xx xx xx d7 d6 d5 d4
+	paddw		%1, %2			; x1 = xx xx xx xx d37 d26 d15 d04
+	punpcklwd	%1, %3			; x1 = d37 d26 d15 d04 as dwords; upper halves come from %3 (zero %3 for a clean 32-bit sum)
+	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26
+	paddd		%1, %2			; x1 = xxxx xxxx d1357 d0246
+	pshuflw		%2, %1, 0x4e	; x2 = xxxx xxxx d0246 d1357
+	paddd		%1, %2			; x1 = xxxx xxxx xxxx  d01234567 (the low 16 bits are correct whatever %3 held)
+%endmacro
+
+
+%macro COPY_16_TIMES 2
+    movdqa      %2, [%1 - 16]           ; aligned 16-byte load whose last byte is the pixel at %1 - 1
+    psrldq      %2, 15                  ; keep only that pixel, in byte 0
+    pmuludq     %2, [mmx_01bytes]       ; x 0x01010101: pixel replicated across dword 0
+    pshufd      %2, %2, 0               ; broadcast dword 0, %2 = 16 copies of the pixel
+%endmacro
+
+%macro COPY_16_TIMESS 3
+    movdqa      %2, [%1 + %3 - 16]      ; as COPY_16_TIMES, but for the pixel at %1 + %3 - 1 (aligned load)
+    psrldq      %2, 15                  ; keep only that pixel, in byte 0
+    pmuludq     %2, [mmx_01bytes]       ; x 0x01010101: pixel replicated across dword 0
+    pshufd      %2, %2, 0               ; broadcast dword 0, %2 = 16 copies of the pixel
+%endmacro
+
+%macro LOAD_COLUMN_C 6
+    movd        %1, [%5]                ; transpose 4 rows x 4 bytes starting at %5, row stride %6 (%4 is not used)
+    movd        %2, [%5 + %6]
+    punpcklbw   %1, %2                  ; rows 0/1 byte-interleaved
+    lea         %5, [%5 + 2 * %6]
+    movd        %3, [%5]
+    movd        %2, [%5 + %6]
+    punpcklbw   %3, %2                  ; rows 2/3 byte-interleaved
+    punpckhwd   %1, %3                  ; with 64-bit registers: high dword of %1 = byte column 3 of rows 0..3
+    lea         %5, [%5 + 2 * %6]       ; %5 has now advanced by 4 rows
+%endmacro
+
+%macro LOAD_2_LEFT_AND_ADD 0
+    lea         r1, [r1 + 2 * r2]       ; step two rows down
+    movzx       r4, byte [r1 - 0x01]
+    add         r3, r4                  ; r3 += pixel left of this row
+    movzx       r4, byte [r1 + r2 - 0x01]
+    add         r3, r4                  ; r3 += pixel left of the following row
+%endmacro
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+;***********************************************************************
+;   void WelsI4x4LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
+;
+;	pred must be 16-byte aligned (it is written with movdqa)
+;***********************************************************************
+WELS_EXTERN WelsI4x4LumaPredH_sse2
+    push        r3
+    %assign push_num 1
+    LOAD_3_PARA
+    SIGN_EXTENSION r2, r2d
+    movzx       r3, byte [r1 - 1]       ; pixel left of row 0
+    movd        xmm0, r3d
+    pmuludq     xmm0, [mmx_01bytes]     ; x 0x01010101: pixel replicated in dword 0
+
+    movzx       r3, byte [r1 + r2 - 1]  ; pixel left of row 1
+    movd        xmm1, r3d
+    pmuludq     xmm1, [mmx_01bytes]
+
+    unpcklps    xmm0, xmm1              ; xmm0 low qword = [row1 x4 | row0 x4]
+
+    lea         r1, [r1 + r2 * 2]       ; move on to rows 2 and 3
+    movzx       r3, byte [r1 - 1]       ; pixel left of row 2
+    movd        xmm2, r3d
+    pmuludq     xmm2, [mmx_01bytes]
+
+    movzx       r3, byte [r1 + r2 - 1]  ; pixel left of row 3
+    movd        xmm3, r3d
+    pmuludq     xmm3, [mmx_01bytes]
+
+    unpcklps    xmm2, xmm3              ; xmm2 low qword = [row3 x4 | row2 x4]
+    unpcklpd    xmm0, xmm2              ; xmm0 = the four predicted rows, 4 bytes each
+
+    movdqa      [r0], xmm0              ; one aligned 16-byte store covers the whole 4x4 block
+    pop         r3
+    ret
+
+;***********************************************************************
+; void WelsI16x16LumaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
+;***********************************************************************
+WELS_EXTERN WelsI16x16LumaPredPlane_sse2
+		push r3
+		push r4
+		%assign push_num 2
+		LOAD_3_PARA
+		PUSH_XMM 8
+		SIGN_EXTENSION r2, r2d
+		sub		r1,	1
+		sub		r1,	r2	; r1 = pRef - stride - 1, the top-left neighbour
+
+		;for H
+		pxor	xmm7,	xmm7
+		movq	xmm0,	[r1]	; top-left and top[0..6]
+		movdqa	xmm5,	[sse2_plane_dec]
+		punpcklbw xmm0,	xmm7	; bytes -> words
+		pmullw	xmm0,	xmm5	; weighted 8..1
+		movq	xmm1,	[r1 + 9]	; top[8..15]
+		movdqa	xmm6,	[sse2_plane_inc]
+		punpcklbw xmm1,	xmm7
+		pmullw	xmm1,	xmm6	; weighted 1..8
+		psubw	xmm1,	xmm0
+
+		SUMW_HORIZON	xmm1,xmm0,xmm2	; xmm2 is not zeroed here; only the low 16 bits of the sum are read below
+		movd    r3d,	xmm1		; H += (i + 1) * (top[8 + i] - top[6 - i]);
+		movsx	r3,	r3w
+		imul	r3,	5
+		add		r3,	32
+		sar		r3,	6			; b = (5 * H + 32) >> 6;
+		SSE2_Copy8Times	xmm1, r3d	; xmm1 = b,b,b,b,b,b,b,b
+
+		movzx	r4,	BYTE [r1+16]	; r4 = top[15]
+		sub	r1, 3	; so that the wanted left column is byte 3 of each LOAD_COLUMN row
+		LOAD_COLUMN		xmm0, xmm2, xmm3, xmm4, r1, r2	; high qword of xmm0 = left column, 8 rows from the top-left row; r1 advances 8 rows
+
+		add		r1,	3
+		movzx	r3,	BYTE [r1+8*r2]	; r3 = left[15*stride]
+		add		r4,	r3
+		shl		r4,	4			;	a = (left[15*stride] + top[15]) << 4;
+
+		sub	r1, 3
+		add		r1,	r2	; skip one row before loading the second half of the column
+		LOAD_COLUMN		xmm7, xmm2, xmm3, xmm4, r1, r2	; high qword of xmm7 = left column, last 8 rows
+		pxor	xmm4,	xmm4
+		punpckhbw xmm0,	xmm4	; bytes -> words
+		pmullw	xmm0,	xmm5	; weighted 8..1
+		punpckhbw xmm7,	xmm4
+		pmullw	xmm7,	xmm6	; weighted 1..8
+		psubw	xmm7,	xmm0
+
+		SUMW_HORIZON   xmm7,xmm0,xmm2
+		movd    r3d,   xmm7			; V
+		movsx	r3,	r3w
+		imul	r3,	5
+		add		r3,	32
+		sar		r3,	6				; c = (5 * V + 32) >> 6;
+		SSE2_Copy8Times	xmm4, r3d		; xmm4 = c,c,c,c,c,c,c,c
+
+		add		r4,	16
+		imul	r3,	-7
+		add		r3,	r4				; s = a + 16 + (-7)*c
+		SSE2_Copy8Times	xmm0, r3d		; xmm0 = s,s,s,s,s,s,s,s
+
+		xor		r3,	r3	; r3 = row counter
+		movdqa	xmm5,	[sse2_plane_inc_minus]	; xmm5 = -7..0 (columns 0..7); xmm6 still holds 1..8 (columns 8..15)
+
+get_i16x16_luma_pred_plane_sse2_1:
+		movdqa	xmm2,	xmm1
+		pmullw	xmm2,	xmm5	; b * (x - 7) for columns 0..7
+		paddw	xmm2,	xmm0
+		psraw	xmm2,	5	; (s + b * (x - 7)) >> 5
+		movdqa	xmm3,	xmm1
+		pmullw	xmm3,	xmm6	; b * (x - 7) for columns 8..15
+		paddw	xmm3,	xmm0
+		psraw	xmm3,	5
+		packuswb xmm2,	xmm3	; clip to 0..255 and pack the 16 pixels of this row
+		movdqa	[r0],	xmm2	; aligned store of one output row
+		paddw	xmm0,	xmm4	; s += c for the next row
+		add		r0,	16	; the loop advances the output by 16 bytes per row
+		inc		r3
+		cmp		r3,	16
+		jnz get_i16x16_luma_pred_plane_sse2_1
+		POP_XMM
+		pop r4
+		pop r3
+		ret
+
+;***********************************************************************
+; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
+;***********************************************************************
+
+%macro SSE2_PRED_H_16X16_ONE_LINE 0
+    add         r0, 16                  ; next output row (rows are stored 16 bytes apart)
+    add         r1, r2                  ; next reference row; r1 points at the left neighbour column
+    movzx       r3, byte [r1]
+    SSE2_Copy16Times xmm0, r3d
+    movdqa      [r0], xmm0              ; aligned store of the replicated pixel
+%endmacro
+
+WELS_EXTERN WelsI16x16LumaPredH_sse2
+    push        r3
+    %assign push_num 1
+    LOAD_3_PARA
+    SIGN_EXTENSION r2, r2d
+    dec         r1                      ; r1 -> pixel left of row 0
+    movzx       r3, byte [r1]
+    SSE2_Copy16Times xmm0, r3d
+    movdqa      [r0], xmm0              ; row 0
+    SSE2_PRED_H_16X16_ONE_LINE          ; row 1
+    SSE2_PRED_H_16X16_ONE_LINE          ; row 2
+    SSE2_PRED_H_16X16_ONE_LINE          ; row 3
+    SSE2_PRED_H_16X16_ONE_LINE          ; row 4
+    SSE2_PRED_H_16X16_ONE_LINE          ; row 5
+    SSE2_PRED_H_16X16_ONE_LINE          ; row 6
+    SSE2_PRED_H_16X16_ONE_LINE          ; row 7
+    SSE2_PRED_H_16X16_ONE_LINE          ; row 8
+    SSE2_PRED_H_16X16_ONE_LINE          ; row 9
+    SSE2_PRED_H_16X16_ONE_LINE          ; row 10
+    SSE2_PRED_H_16X16_ONE_LINE          ; row 11
+    SSE2_PRED_H_16X16_ONE_LINE          ; row 12
+    SSE2_PRED_H_16X16_ONE_LINE          ; row 13
+    SSE2_PRED_H_16X16_ONE_LINE          ; row 14
+    SSE2_PRED_H_16X16_ONE_LINE          ; row 15
+    pop         r3
+    ret
+
+;***********************************************************************
+; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
+;***********************************************************************
+WELS_EXTERN WelsI16x16LumaPredV_sse2
+	%assign push_num 0
+	LOAD_3_PARA
+	SIGN_EXTENSION r2, r2d
+	sub         r1, r2                  ; r1 -> row directly above the block
+	movdqa      xmm0, [r1]              ; 16 top neighbours (aligned load)
+
+	movdqa      [r0 + 0x00], xmm0       ; replicate that row into all 16 output rows
+	movdqa      [r0 + 0x10], xmm0
+	movdqa      [r0 + 0x20], xmm0
+	movdqa      [r0 + 0x30], xmm0
+	movdqa      [r0 + 0x40], xmm0
+	movdqa      [r0 + 0x50], xmm0
+	movdqa      [r0 + 0x60], xmm0
+	movdqa      [r0 + 0x70], xmm0
+	movdqa      [r0 + 0x80], xmm0
+	movdqa      [r0 + 0x90], xmm0
+	movdqa      [r0 + 0xa0], xmm0
+	movdqa      [r0 + 0xb0], xmm0
+	movdqa      [r0 + 0xc0], xmm0
+	movdqa      [r0 + 0xd0], xmm0
+	movdqa      [r0 + 0xe0], xmm0
+	movdqa      [r0 + 0xf0], xmm0
+
+	ret
+
+;***********************************************************************
+; void WelsIChromaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
+;***********************************************************************
+WELS_EXTERN WelsIChromaPredPlane_sse2
+		push r3
+		push r4
+		%assign push_num 2
+		LOAD_3_PARA
+		PUSH_XMM 8
+		SIGN_EXTENSION r2, r2d
+		sub		r1,	1
+		sub		r1,	r2	; r1 = pRef - stride - 1, the top-left neighbour
+
+		pxor	mm7,	mm7
+		movq	mm0,	[r1]	; top-left and top[0..6]; only the low 4 bytes are unpacked
+		movq	mm5,	[sse2_plane_dec_c]
+		punpcklbw mm0,	mm7	; bytes -> words
+		pmullw	mm0,	mm5	; weighted 4..1
+		movq	mm1,	[r1 + 5]	; top[4..7] in the low 4 bytes
+		movq	mm6,	[sse2_plane_inc_c]
+		punpcklbw mm1,	mm7
+		pmullw	mm1,	mm6	; weighted 1..4
+		psubw	mm1,	mm0
+
+		movq2dq xmm1,   mm1	; continue in SSE2 registers for the horizontal sum
+		pxor    xmm2,   xmm2
+		SUMW_HORIZON	xmm1,xmm0,xmm2
+		movd    r3d,	xmm1	; H
+		movsx	r3,	r3w
+		imul	r3,	17
+		add		r3,	16
+		sar		r3,	5			; b = (17 * H + 16) >> 5;
+		SSE2_Copy8Times	xmm1, r3d	; xmm1 = b,b,b,b,b,b,b,b
+
+		movzx	r3,	BYTE [r1+8]	; r3 = top[7]
+		sub	r1, 3	; so that the wanted left column is byte 3 of each LOAD_COLUMN_C row
+		LOAD_COLUMN_C	mm0, mm2, mm3, mm4, r1, r2	; high dword of mm0 = left column, 4 rows from the top-left row; r1 advances 4 rows
+
+		add		r1,	3
+		movzx	r4,	BYTE [r1+4*r2]	; r4 = left[7*stride]
+		add		r4,	r3
+		shl		r4,	4			; a = (left[7*stride] + top[7]) << 4;
+
+		sub	r1, 3
+		add		r1,	r2	; skip one row before loading the second half of the column
+		LOAD_COLUMN_C	mm7, mm2, mm3, mm4, r1, r2	; high dword of mm7 = left column, last 4 rows
+		pxor	mm4,	mm4
+		punpckhbw mm0,	mm4	; bytes -> words
+		pmullw	mm0,	mm5	; weighted 4..1
+		punpckhbw mm7,	mm4
+		pmullw	mm7,	mm6	; weighted 1..4
+		psubw	mm7,	mm0
+
+		movq2dq xmm7,   mm7
+		pxor    xmm2,   xmm2
+		SUMW_HORIZON	xmm7,xmm0,xmm2
+		movd    r3d,    xmm7			; V
+		movsx	r3,	r3w
+		imul	r3,	17
+		add		r3,	16
+		sar		r3,	5				; c = (17 * V + 16) >> 5;
+		SSE2_Copy8Times	xmm4, r3d	; xmm4 = c,c,c,c,c,c,c,c
+
+		add		r4,	16
+		imul	r3,	-3
+		add		r3,	r4		; s = a + 16 + (-3)*c
+		SSE2_Copy8Times	xmm0, r3d	; xmm0 = s,s,s,s,s,s,s,s
+
+		xor		r3,	r3	; r3 = row counter
+		movdqa	xmm5,	[sse2_plane_mul_b_c]	; xmm5 = -3..4, i.e. x - 3 for columns 0..7
+
+get_i_chroma_pred_plane_sse2_1:
+		movdqa	xmm2,	xmm1
+		pmullw	xmm2,	xmm5	; b * (x - 3)
+		paddw	xmm2,	xmm0
+		psraw	xmm2,	5	; (s + b * (x - 3)) >> 5
+		packuswb xmm2,	xmm2	; clip to 0..255
+		movq	[r0],	xmm2	; store one 8-pixel output row
+		paddw	xmm0,	xmm4	; s += c for the next row
+		add		r0,	8	; the loop advances the output by 8 bytes per row
+		inc		r3
+		cmp		r3,	8
+		jnz get_i_chroma_pred_plane_sse2_1
+		POP_XMM
+		pop r4
+		pop r3
+		WELSEMMS
+		ret
+
+;***********************************************************************
+;	0 |1 |2 |3 |4 |
+;	6 |7 |8 |9 |10|
+;	11|12|13|14|15|
+;	16|17|18|19|20|
+;	21|22|23|24|25|
+;	7 is the start pixel of current 4x4 block
+;	pred[7] = ([6]+[0]*2+[1]+2)/4
+;
+;   void WelsI4x4LumaPredDDR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
+;
+;***********************************************************************
+WELS_EXTERN WelsI4x4LumaPredDDR_mmx
+	%assign push_num 0
+	LOAD_3_PARA
+	SIGN_EXTENSION r2, r2d
+	movq        mm1,[r1+r2-8]		;load the 8 bytes ending at value 11 (a qword load at -8 instead of a byte load): mm1[8] = [11]
+	movq        mm2,[r1-8]			;get value of 6 mm2[8] = 6
+	sub		r1, r2			;move r1 to the row above the current block (position of 1)
+	punpckhbw   mm2,[r1-8]			;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6]
+	movd        mm3,[r1]			;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3]
+	punpckhwd   mm1,mm2				;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11]
+	psllq       mm3,18h				;mm3[4]=[1]
+	psrlq       mm1,28h				;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
+	por         mm3,mm1				;mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11]
+	movq        mm1,mm3				;mm1[6]=[3],mm1[5]=[2],mm1[4]=[1],mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
+	lea  	    r1,[r1+r2*2-8h]		;r1 = (position of 12) - 8
+	movq        mm4,[r1+r2]		;get value of 16, mm4[8]=[16]
+	psllq       mm3,8				;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0
+	psrlq       mm4,38h				;mm4[1]=[16]
+	por         mm3,mm4				;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16]
+	movq        mm2,mm3				;mm2[7]=[3],mm2[6]=[2],mm2[5]=[1],mm2[4]=[0],mm2[3]=[6],mm2[2]=[11],mm2[1]=[16]
+	movq        mm4,[r1+r2*2]		;mm4[8]=[21]
+	psllq       mm3,8				;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0
+	psrlq       mm4,38h				;mm4[1]=[21]
+	por         mm3,mm4				;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21]
+	movq        mm4,mm3				;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
+	pavgb       mm3,mm1				;per byte: average of the two outer taps, rounded up, e.g. mm3[1]=([11]+[21]+1)/2
+	pxor        mm1,mm4				;find odd value in the lowest bit of each byte
+	pand        mm1,[mmx_01bytes]	;set the odd bit
+	psubusb     mm3,mm1				;decrease 1 from odd bytes, giving floor((outer0+outer1)/2)
+	pavgb       mm2,mm3				;average with the centre tap = (outer0 + 2*centre + outer1 + 2)>>2, e.g. mm2[1] from [11],[16],[21]
+
+	movd        [r0+12],mm2	;last output row (4 bytes per row)
+	psrlq       mm2,8
+	movd        [r0+8],mm2
+	psrlq       mm2,8
+	movd        [r0+4],mm2
+	psrlq       mm2,8
+	movd        [r0],mm2	;first output row
+	WELSEMMS
+	ret
+
+;***********************************************************************
+;	0 |1 |2 |3 |4 |
+;	5 |6 |7 |8 |9 |
+;	10|11|12|13|14|
+;	15|16|17|18|19|
+;	20|21|22|23|24|
+;	6 is the start pixel of current 4x4 block
+;	pred[6] = ([1]+[2]+[3]+[4]+[5]+[10]+[15]+[20]+4)/8
+;
+;   void WelsI4x4LumaPredDc_sse2(uint8_t *pred,uint8_t *pRef,int32_t stride)
+;
+;***********************************************************************
+WELS_EXTERN WelsI4x4LumaPredDc_sse2
+    push        r3
+    push        r4
+    %assign push_num 2
+    LOAD_3_PARA
+    SIGN_EXTENSION r2, r2d
+    movzx       r4, byte [r1 - 1]           ; pixel left of row 0
+    sub         r1, r2                      ; r1 -> row above the block
+    movd        xmm0, [r1]                  ; four top neighbours
+    pxor        xmm1, xmm1
+    psadbw      xmm0, xmm1                  ; xmm0 = sum of the top neighbours
+    xor         r3, r3
+    movd        r3d, xmm0
+    add         r3, r4                      ; + left pixel of row 0
+    movzx       r4, byte [r1 + r2 * 2 - 1]  ; pixel left of row 1
+    add         r3, r4
+
+    lea         r1, [r1 + r2 * 2 - 1]       ; r1 -> pixel left of row 1
+    movzx       r4, byte [r1 + r2]          ; pixel left of row 2
+    add         r3, r4
+
+    movzx       r4, byte [r1 + r2 * 2]      ; pixel left of row 3
+    add         r3, r4
+    add         r3, 4                       ; rounding
+    sar         r3, 3                       ; dc = (sum of 8 neighbours + 4) >> 3
+    imul        r3, 0x01010101              ; replicate dc into four bytes
+
+    movd        xmm0, r3d
+    pshufd      xmm0, xmm0, 0               ; 16 copies of dc
+    movdqa      [r0], xmm0                  ; aligned store of the whole 4x4 block
+    pop         r4
+    pop         r3
+    ret
+
+;***********************************************************************
+;	void WelsIChromaPredH_mmx(uint8_t *pred, uint8_t *pRef, int32_t stride)
+;   fill each of the 8 rows with 8 copies of the pixel to its left
+;***********************************************************************
+%macro MMX_PRED_H_8X8_ONE_LINE 4
+    movq        %1, [%3 - 8]            ; byte 7 = pixel left of the row at %3 (%2 is not used)
+    psrlq       %1, 0x38                ; move that pixel down to byte 0
+
+    ; pmullw by words of 0x0101 is used here in place of pmuludq
+    pmullw      %1, [mmx_01bytes]       ; word 0 = pixel in both bytes
+    pshufw      %1, %1, 0               ; broadcast word 0: eight copies of the pixel
+    movq        [%4], %1
+%endmacro
+
+%macro MMX_PRED_H_8X8_ONE_LINEE 4
+    movq        %1, [%3 + r2 - 8]       ; byte 7 = pixel left of the row at %3 + r2 (%2 is not used)
+    psrlq       %1, 0x38                ; move that pixel down to byte 0
+
+    ; pmullw by words of 0x0101 is used here in place of pmuludq
+    pmullw      %1, [mmx_01bytes]       ; word 0 = pixel in both bytes
+    pshufw      %1, %1, 0               ; broadcast word 0: eight copies of the pixel
+    movq        [%4], %1
+%endmacro
+
+WELS_EXTERN WelsIChromaPredH_mmx
+    %assign push_num 0
+    LOAD_3_PARA
+    SIGN_EXTENSION r2, r2d
+    movq        mm0, [r1 - 8]           ; row 0: byte 7 = pixel left of the row
+    psrlq       mm0, 0x38               ; move it down to byte 0
+
+    ; pmullw by words of 0x0101 is used here in place of pmuludq
+    pmullw      mm0, [mmx_01bytes]
+    pshufw      mm0, mm0, 0             ; eight copies of the pixel
+    movq        [r0], mm0
+
+    MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1, r0 + 8      ; row 1
+
+    lea         r1, [r1 + r2 * 2]
+    MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1, r0 + 16      ; row 2
+
+    MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1, r0 + 24     ; row 3
+
+    lea         r1, [r1 + r2 * 2]
+    MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1, r0 + 32      ; row 4
+
+    MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1, r0 + 40     ; row 5
+
+    lea         r1, [r1 + r2 * 2]
+    MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1, r0 + 48      ; row 6
+
+    MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1, r0 + 56     ; row 7
+    WELSEMMS
+    ret
+
+;***********************************************************************
+;	void WelsI4x4LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
+;   copy pixels from top 4 pixels
+;***********************************************************************
+WELS_EXTERN WelsI4x4LumaPredV_sse2
+    %assign push_num 0
+    LOAD_3_PARA
+    SIGN_EXTENSION r2, r2d
+    sub         r1, r2                  ; r1 -> row above the block
+    movd        xmm0, [r1]              ; four top neighbours
+    pshufd      xmm0, xmm0, 0           ; repeat them for all four rows
+    movdqa      [r0], xmm0              ; aligned store of the whole 4x4 block
+    ret
+
+;***********************************************************************
+;	void WelsIChromaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
+;   copy 8 pixels from top 8 pixels
+;***********************************************************************
+WELS_EXTERN WelsIChromaPredV_sse2
+    %assign push_num 0
+    LOAD_3_PARA
+    SIGN_EXTENSION r2, r2d
+    sub         r1, r2                  ; r1 -> row above the block
+    movq        xmm0, [r1]              ; eight top neighbours
+    movdqa      xmm1, xmm0
+    punpcklqdq  xmm0, xmm1              ; xmm0 = that row twice, i.e. two output rows
+    movdqa      [r0 + 0x00], xmm0       ; rows 0-1
+    movdqa      [r0 + 0x10], xmm0       ; rows 2-3
+    movdqa      [r0 + 0x20], xmm0       ; rows 4-5
+    movdqa      [r0 + 0x30], xmm0       ; rows 6-7
+    ret
+
+;***********************************************************************
+;	lt|t0|t1|t2|t3|
+;	l0|
+;	l1|
+;	l2|
+;	l3|
+;	t3 is never used
+;   destination:
+;	|a |b |c |d |
+;	|e |f |a |b |
+;	|g |h |e |f |
+;	|i |j |g |h |
+
+;   a = (1 + lt + l0)>>1
+;   e = (1 + l0 + l1)>>1
+;   g = (1 + l1 + l2)>>1
+;   i = (1 + l2 + l3)>>1
+
+;   d = (2 + t0 + (t1<<1) + t2)>>2
+;   c = (2 + lt + (t0<<1) + t1)>>2
+;   b = (2 + l0 + (lt<<1) + t0)>>2
+
+;   f = (2 + l1 + (l0<<1) + lt)>>2
+;   h = (2 + l2 + (l1<<1) + l0)>>2
+;   j = (2 + l3 + (l2<<1) + l1)>>2
+;   [b a f e h g j i] + [d c b a] --> mov to memory
+;
+;   void WelsI4x4LumaPredHD_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
+;***********************************************************************
+WELS_EXTERN WelsI4x4LumaPredHD_mmx
+	%assign push_num 0
+	LOAD_3_PARA
+	SIGN_EXTENSION r2, r2d
+	sub         r1, r2	; r1 -> row above the block
+	movd        mm0, [r1-1]            ; mm0 = [xx xx xx xx t2 t1 t0 lt]
+	psllq       mm0, 20h                ; mm0 = [t2 t1 t0 lt xx xx xx xx]
+
+	movd        mm1, [r1+2*r2-4]
+	punpcklbw   mm1, [r1+r2-4]        ; mm1[7] = l0, mm1[6] = l1
+	lea         r1, [r1+2*r2]
+	movd        mm2, [r1+2*r2-4]
+	punpcklbw   mm2, [r1+r2-4]        ; mm2[7] = l2, mm2[6] = l3
+	punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
+	psrlq       mm2, 20h
+	pxor        mm0, mm2                ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
+
+	movq        mm1, mm0
+	psrlq       mm1, 10h                ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
+	movq        mm2, mm0
+	psrlq       mm2, 8h                 ; mm2 = [xx t2 t1 t0 lt l0 l1 l2]
+	movq        mm3, mm2
+	movq        mm4, mm1
+	pavgb       mm1, mm0	; average of the two outer taps, rounded up
+
+	pxor        mm4, mm0				; find odd value in the lowest bit of each byte
+	pand        mm4, [mmx_01bytes]	    ; set the odd bit
+	psubusb     mm1, mm4				; decrease 1 from odd bytes
+
+	pavgb       mm2, mm1                ; mm2 = [xx xx d  c  b  f  h  j]
+
+	movq        mm4, mm0
+	pavgb       mm3, mm4                ; mm3 = [xx xx xx xx a  e  g  i]
+	punpcklbw   mm3, mm2                ; mm3 = [b  a  f  e  h  g  j  i]
+
+	psrlq       mm2, 20h
+	psllq       mm2, 30h                ; mm2 = [d  c  0  0  0  0  0  0]
+	movq        mm4, mm3
+	psrlq       mm4, 10h                ; mm4 = [0  0  b  a  f  e  h  g]
+	pxor        mm2, mm4                ; mm2 = [d  c  b  a  xx xx xx xx]
+	psrlq       mm2, 20h                ; mm2 = [xx xx xx xx  d  c  b  a]
+
+	movd        [r0], mm2	; row 0 = a b c d
+	movd        [r0+12], mm3	; row 3 = i j g h
+	psrlq       mm3, 10h
+	movd        [r0+8], mm3	; row 2 = g h e f
+	psrlq       mm3, 10h
+	movd        [r0+4], mm3	; row 1 = e f a b
+	WELSEMMS
+	ret
+
+;***********************************************************************
+;	lt|t0|t1|t2|t3|
+;	l0|
+;	l1|
+;	l2|
+;	l3|
+;	lt and t0..t3 are never used (only the left column l0..l3 is read)
+;   destination:
+;	|a |b |c |d |
+;	|c |d |e |f |
+;	|e |f |g |g |
+;	|g |g |g |g |
+
+;   a = (1 + l0 + l1)>>1
+;   c = (1 + l1 + l2)>>1
+;   e = (1 + l2 + l3)>>1
+;   g = l3
+
+;   b = (2 + l0 + (l1<<1) + l2)>>2
+;   d = (2 + l1 + (l2<<1) + l3)>>2
+;   f = (2 + l2 + (l3<<1) + l3)>>2
+
+;   [g g f e d c b a] + [g g g g] --> mov to memory
+;
+;   void WelsI4x4LumaPredHU_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
+;***********************************************************************
+WELS_EXTERN WelsI4x4LumaPredHU_mmx
+	%assign push_num 0
+	LOAD_3_PARA
+	SIGN_EXTENSION r2, r2d
+	movd        mm0, [r1-4]            ; mm0[3] = l0
+	punpcklbw   mm0, [r1+r2-4]        ; mm0[7] = l1, mm0[6] = l0
+	lea         r1, [r1+2*r2]
+	movd        mm2, [r1-4]            ; mm2[3] = l2
+	movd        mm4, [r1+r2-4]        ; mm4[3] = l3
+	punpcklbw   mm2, mm4
+	punpckhwd   mm0, mm2                ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
+
+	psrlq       mm4, 18h
+	psllq       mm4, 38h                ; mm4 = [l3 xx xx xx xx xx xx xx]
+	psrlq       mm0, 8h
+	pxor        mm0, mm4                ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
+
+	movq        mm1, mm0
+	psllq       mm1, 8h                 ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
+	movq        mm3, mm1                ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
+	pavgb       mm1, mm0                ; mm1 = [g  e  c  a  xx xx xx xx]
+
+	movq        mm2, mm0
+	psllq       mm2, 10h                ; mm2 = [l2 l1 l0 xx xx xx xx xx]
+	movq        mm5, mm2
+	pavgb       mm2, mm0	; average of the two outer taps, rounded up
+
+	pxor        mm5, mm0				; find odd value in the lowest bit of each byte
+	pand        mm5, [mmx_01bytes]	    ; set the odd bit
+	psubusb     mm2, mm5				; decrease 1 from odd bytes
+
+	pavgb       mm2, mm3                ; mm2 = [f  d  b  xx xx xx xx xx]
+
+	psrlq       mm2, 8h
+	pxor        mm2, mm4                ; mm2 = [g  f  d  b  xx xx xx xx]
+
+	punpckhbw   mm1, mm2                ; mm1 = [g  g  f  e  d  c  b  a]
+	punpckhbw   mm4, mm4                ; mm4 = [g  g  xx xx xx xx xx xx]
+	punpckhbw   mm4, mm4                ; mm4 = [g  g  g  g  xx xx xx xx]
+
+	psrlq       mm4, 20h
+	movd        [r0+12], mm4	; row 3 = g g g g
+
+	movd        [r0], mm1	; row 0 = a b c d
+	psrlq       mm1, 10h
+	movd        [r0+4], mm1	; row 1 = c d e f
+	psrlq       mm1, 10h
+	movd        [r0+8], mm1	; row 2 = e f g g
+	WELSEMMS
+	ret
+
+
+
+;***********************************************************************
+;	lt|t0|t1|t2|t3|
+;	l0|
+;	l1|
+;	l2|
+;	l3|
+;	l3 is never used
+;   destination:
+;	|a |b |c |d |
+;	|e |f |g |h |
+;	|i |a |b |c |
+;	|j |e |f |g |
+
+;   a = (1 + lt + t0)>>1
+;   b = (1 + t0 + t1)>>1
+;   c = (1 + t1 + t2)>>1
+;   d = (1 + t2 + t3)>>1
+
+;   e = (2 + l0 + (lt<<1) + t0)>>2
+;   f = (2 + lt + (t0<<1) + t1)>>2
+;   g = (2 + t0 + (t1<<1) + t2)>>2
+
+;   h = (2 + t1 + (t2<<1) + t3)>>2
+;   i = (2 + lt + (l0<<1) + l1)>>2
+;   j = (2 + l0 + (l1<<1) + l2)>>2
+;
+;   void WelsI4x4LumaPredVR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
+;***********************************************************************
+WELS_EXTERN WelsI4x4LumaPredVR_mmx
+	%assign push_num 0
+	LOAD_3_PARA
+	SIGN_EXTENSION r2, r2d
+	sub         r1, r2	; r1 -> row above the block
+	movq        mm0, [r1-1]            ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
+	psllq       mm0, 18h                ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
+
+	movd        mm1, [r1+2*r2-4]
+	punpcklbw   mm1, [r1+r2-4]        ; mm1[7] = l0, mm1[6] = l1
+	lea         r1, [r1+2*r2]
+	movq        mm2, [r1+r2-8]        ; mm2[7] = l2
+	punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 xx xx xx xx xx]
+	psrlq       mm2, 28h
+	pxor        mm0, mm2                ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
+
+	movq        mm1, mm0
+	psllq       mm1, 8h                 ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
+	pavgb       mm1, mm0                ; mm1 = [d  c  b  a  xx xx xx xx]
+
+	movq        mm2, mm0
+	psllq       mm2, 10h                ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
+	movq        mm3, mm2
+	pavgb       mm2, mm0	; average of the two outer taps, rounded up
+
+	pxor        mm3, mm0				; find odd value in the lowest bit of each byte
+	pand        mm3, [mmx_01bytes]	    ; set the odd bit
+	psubusb     mm2, mm3				; decrease 1 from odd bytes
+
+	movq        mm3, mm0
+	psllq       mm3, 8h                 ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
+	pavgb       mm3, mm2                ; mm3 = [h  g  f  e  i  j  xx xx]
+	movq        mm2, mm3
+
+	psrlq       mm1, 20h                ; mm1 = [xx xx xx xx d  c  b  a]
+	movd        [r0], mm1	; row 0 = a b c d
+
+	psrlq       mm2, 20h                ; mm2 = [xx xx xx xx h  g  f  e]
+	movd        [r0+4], mm2	; row 1 = e f g h
+
+	movq        mm4, mm3
+	psllq       mm4, 20h
+	psrlq       mm4, 38h                ; mm4 = [xx xx xx xx xx xx xx i]
+
+	movq        mm5, mm3
+	psllq       mm5, 28h
+	psrlq       mm5, 38h                ; mm5 = [xx xx xx xx xx xx xx j]
+
+	psllq       mm1, 8h
+	pxor        mm4, mm1                ; mm4 = [xx xx xx xx c  b  a  i]
+	movd        [r0+8], mm4	; row 2 = i a b c
+
+	psllq       mm2, 8h
+	pxor        mm5, mm2                ; mm5 = [xx xx xx xx g  f  e  j]
+	movd        [r0+12], mm5	; row 3 = j e f g
+	WELSEMMS
+	ret
+
+;***********************************************************************
+;	lt|t0|t1|t2|t3|t4|t5|t6|t7
+;	l0|
+;	l1|
+;	l2|
+;	l3|
+;	lt and l0..l3 are never used (only the top row t0..t7 is read)
+;   destination:
+;	|a |b |c |d |
+;	|b |c |d |e |
+;	|c |d |e |f |
+;	|d |e |f |g |
+
+;   a = (2 + t0 + t2 + (t1<<1))>>2
+;   b = (2 + t1 + t3 + (t2<<1))>>2
+;   c = (2 + t2 + t4 + (t3<<1))>>2
+;   d = (2 + t3 + t5 + (t4<<1))>>2
+
+;   e = (2 + t4 + t6 + (t5<<1))>>2
+;   f = (2 + t5 + t7 + (t6<<1))>>2
+;   g = (2 + t6 + t7 + (t7<<1))>>2
+
+;   [g f e d c b a] --> mov to memory
+;
+;   void WelsI4x4LumaPredDDL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
+;***********************************************************************
+WELS_EXTERN WelsI4x4LumaPredDDL_mmx
+	%assign push_num 0
+	LOAD_3_PARA
+	SIGN_EXTENSION r2, r2d
+	sub         r1, r2	; r1 -> row above the block
+	movq        mm0, [r1]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
+	movq        mm1, mm0
+	movq        mm2, mm0
+
+	movq        mm3, mm0
+	psrlq       mm3, 38h
+	psllq       mm3, 38h                ; mm3 = [t7 xx xx xx xx xx xx xx]
+
+	psllq       mm1, 8h                 ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
+	psrlq       mm2, 8h
+	pxor        mm2, mm3                ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]
+
+	movq        mm3, mm1
+	pavgb       mm1, mm2	; average of the two outer taps, rounded up
+	pxor        mm3, mm2				; find odd value in the lowest bit of each byte
+	pand        mm3, [mmx_01bytes]	    ; set the odd bit
+	psubusb     mm1, mm3				; decrease 1 from odd bytes
+
+	pavgb       mm0, mm1                ; mm0 = [g f e d c b a xx]
+
+	psrlq       mm0, 8h
+	movd        [r0], mm0	; row 0 = a b c d
+	psrlq       mm0, 8h
+	movd        [r0+4], mm0	; row 1 = b c d e
+	psrlq       mm0, 8h
+	movd        [r0+8], mm0	; row 2 = c d e f
+	psrlq       mm0, 8h
+	movd        [r0+12], mm0	; row 3 = d e f g
+	WELSEMMS
+	ret
+
+
+;***********************************************************************
+;	lt|t0|t1|t2|t3|t4|t5|t6|t7
+;	l0|
+;	l1|
+;	l2|
+;	l3|
+;	lt and l0..l3 are never used (only the top row is read)
+;   destination:
+;	|a |b |c |d |
+;	|e |f |g |h |
+;	|b |c |d |i |
+;	|f |g |h |j |
+
+;   a = (1 + t0 + t1)>>1
+;   b = (1 + t1 + t2)>>1
+;   c = (1 + t2 + t3)>>1
+;   d = (1 + t3 + t4)>>1
+;   i = (1 + t4 + t5)>>1
+
+;   e = (2 + t0 + (t1<<1) + t2)>>2
+;   f = (2 + t1 + (t2<<1) + t3)>>2
+;   g = (2 + t2 + (t3<<1) + t4)>>2
+;   h = (2 + t3 + (t4<<1) + t5)>>2
+;   j = (2 + t4 + (t5<<1) + t6)>>2
+
+;   [i d c b a] + [j h g f e] --> mov to memory
+;
+;   void WelsI4x4LumaPredVL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
+;***********************************************************************
+WELS_EXTERN WelsI4x4LumaPredVL_mmx
+	%assign push_num 0
+	LOAD_3_PARA
+	SIGN_EXTENSION r2, r2d
+	sub         r1, r2	; r1 -> row above the block
+	movq        mm0, [r1]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
+	movq        mm1, mm0
+	movq        mm2, mm0
+
+	psrlq       mm1, 8h                 ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
+	psrlq       mm2, 10h                ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]
+
+	movq        mm3, mm1
+	pavgb       mm3, mm0                ; mm3 = [xx xx xx i  d  c  b  a]
+
+	movq        mm4, mm2
+	pavgb       mm2, mm0	; average of the two outer taps, rounded up
+	pxor        mm4, mm0				; find odd value in the lowest bit of each byte
+	pand        mm4, [mmx_01bytes]	    ; set the odd bit
+	psubusb     mm2, mm4				; decrease 1 from odd bytes
+
+	pavgb       mm2, mm1                ; mm2 = [xx xx xx j  h  g  f  e]
+
+	movd        [r0], mm3	; row 0 = a b c d
+	psrlq       mm3, 8h
+	movd        [r0+8], mm3	; row 2 = b c d i
+
+	movd        [r0+4], mm2	; row 1 = e f g h
+	psrlq       mm2, 8h
+	movd        [r0+12], mm2	; row 3 = f g h j
+	WELSEMMS
+	ret
+
+;***********************************************************************
+;
+;   void WelsIChromaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
+;***********************************************************************
+WELS_EXTERN WelsIChromaPredDc_sse2
+	push r3
+	push r4
+	%assign push_num 2
+	LOAD_3_PARA
+	SIGN_EXTENSION r2, r2d
+	sub         r1, r2	; r1 -> row above the block
+	movq        mm0, [r1]	; mm0 = the eight top neighbours
+
+	movzx		r3, byte [r1+r2-0x01] ; l1
+	lea         	r1, [r1+2*r2]
+	movzx		r4, byte [r1-0x01]     ; l2
+	add		r3, r4
+	movzx		r4, byte [r1+r2-0x01] ; l3
+	add		r3, r4
+	lea         	r1, [r1+2*r2]
+	movzx		r4, byte [r1-0x01]     ; l4
+	add		r3, r4
+	movd        	mm1, r3d                 ; mm1 = l1+l2+l3+l4
+
+	movzx		r3, byte [r1+r2-0x01] ; l5
+	lea         	r1, [r1+2*r2]
+	movzx		r4, byte [r1-0x01]     ; l6
+	add		r3, r4
+	movzx		r4, byte [r1+r2-0x01] ; l7
+	add		r3, r4
+	lea         	r1, [r1+2*r2]
+	movzx		r4, byte [r1-0x01]     ; l8
+	add		r3, r4
+	movd        	mm2, r3d                 ; mm2 = l5+l6+l7+l8
+
+	movq        mm3, mm0
+	psrlq       mm0, 0x20	; mm0 = top neighbours 4..7
+	psllq       mm3, 0x20
+	psrlq       mm3, 0x20	; mm3 = top neighbours 0..3
+	pxor		mm4, mm4
+	psadbw		mm0, mm4	; mm0 = sum of top neighbours 4..7
+	psadbw		mm3, mm4                 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
+
+	paddq       mm3, mm1
+	movq        mm1, mm2
+	paddq       mm1, mm0;                ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
+
+	movq        mm4, [mmx_0x02]	; mm4 = 2
+
+	paddq       mm0, mm4
+	psrlq       mm0, 0x02	; (sum2 + 2) >> 2
+
+	paddq       mm2, mm4
+	psrlq       mm2, 0x02	; (sum3 + 2) >> 2
+
+	paddq       mm3, mm4
+	paddq       mm3, mm4
+	psrlq       mm3, 0x03	; (sum1 + 4) >> 3
+
+	paddq       mm1, mm4
+	paddq       mm1, mm4
+	psrlq       mm1, 0x03	; (sum4 + 4) >> 3
+
+	pmuludq     mm0, [mmx_01bytes]	; x 0x01010101: replicate each DC value into 4 bytes
+	pmuludq     mm3, [mmx_01bytes]
+	psllq       mm0, 0x20
+	pxor        mm0, mm3                 ; mm0 = m_up
+
+	pmuludq     mm2, [mmx_01bytes]
+	pmuludq     mm1, [mmx_01bytes]
+	psllq       mm1, 0x20
+	pxor        mm1, mm2                 ; mm1 = m_down
+
+	movq        [r0], mm0	; rows 0..3 (8 bytes per row) = m_up
+	movq        [r0+0x08], mm0
+	movq        [r0+0x10], mm0
+	movq        [r0+0x18], mm0
+
+	movq        [r0+0x20], mm1	; rows 4..7 = m_down
+	movq        [r0+0x28], mm1
+	movq        [r0+0x30], mm1
+	movq        [r0+0x38], mm1
+
+	pop r4
+	pop r3
+	WELSEMMS
+	ret
+
+
+
+;***********************************************************************
+;
+;   void WelsI16x16LumaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
+;***********************************************************************
+WELS_EXTERN WelsI16x16LumaPredDc_sse2
+    push        r3
+    push        r4
+    %assign push_num 2
+    LOAD_3_PARA
+    SIGN_EXTENSION r2, r2d
+    sub         r1, r2                  ; r1 -> row above the block
+    movdqa      xmm0, [r1]              ; 16 top neighbours (aligned load)
+    pxor        xmm1, xmm1
+    psadbw      xmm0, xmm1              ; two partial sums, one per qword
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 0x08              ; xmm1 = partial sum of the high 8 bytes
+    pslldq      xmm0, 0x08
+    psrldq      xmm0, 0x08              ; xmm0 = partial sum of the low 8 bytes only
+    paddw       xmm0, xmm1              ; xmm0 = sum of the whole top row
+
+    movzx       r3, byte [r1 + r2 - 0x01]       ; pixel left of row 0
+    movzx       r4, byte [r1 + 2 * r2 - 0x01]   ; pixel left of row 1
+    add         r3, r4
+    lea         r1, [r1 + r2]           ; r1 -> row 0; each macro below steps two rows
+    LOAD_2_LEFT_AND_ADD                 ; rows 2-3
+    LOAD_2_LEFT_AND_ADD                 ; rows 4-5
+    LOAD_2_LEFT_AND_ADD                 ; rows 6-7
+    LOAD_2_LEFT_AND_ADD                 ; rows 8-9
+    LOAD_2_LEFT_AND_ADD                 ; rows 10-11
+    LOAD_2_LEFT_AND_ADD                 ; rows 12-13
+    LOAD_2_LEFT_AND_ADD                 ; rows 14-15
+    add         r3, 0x10                ; rounding term
+    movd        xmm1, r3d
+    paddw       xmm0, xmm1              ; top sum + left sum + 16
+    psrld       xmm0, 0x05              ; dc = (sum of 32 neighbours + 16) >> 5
+    pmuludq     xmm0, [mmx_01bytes]     ; replicate dc into four bytes
+    pshufd      xmm0, xmm0, 0           ; 16 copies of dc
+
+    movdqa      [r0 + 0x00], xmm0       ; fill all 16 output rows
+    movdqa      [r0 + 0x10], xmm0
+    movdqa      [r0 + 0x20], xmm0
+    movdqa      [r0 + 0x30], xmm0
+    movdqa      [r0 + 0x40], xmm0
+    movdqa      [r0 + 0x50], xmm0
+    movdqa      [r0 + 0x60], xmm0
+    movdqa      [r0 + 0x70], xmm0
+    movdqa      [r0 + 0x80], xmm0
+    movdqa      [r0 + 0x90], xmm0
+    movdqa      [r0 + 0xa0], xmm0
+    movdqa      [r0 + 0xb0], xmm0
+    movdqa      [r0 + 0xc0], xmm0
+    movdqa      [r0 + 0xd0], xmm0
+    movdqa      [r0 + 0xe0], xmm0
+    movdqa      [r0 + 0xf0], xmm0
+
+    pop         r4
+    pop         r3
+    ret
+
+;***********************************************************************
+; Evaluates the V, H and DC 4x4 intra modes by SATD, writes the winning prediction and mode, returns its cost in eax.
+;int32_t WelsSampleSatdThree4x4_sse2( uint8_t *pDec, int32_t iLineSizeDec, uint8_t *pEnc, int32_t iLinesizeEnc,
+;                             uint8_t* pRed, int32_t* pBestMode, int32_t, int32_t, int32_t);
+; Stack args (32-bit only); after the three pushes below pDec is at [esp+16] and the last int32_t at [esp+48].
+;***********************************************************************
+%ifdef X86_32
+WELS_EXTERN WelsSampleSatdThree4x4_sse2
+	push      ebx
+	push      esi
+	push      edi
+	mov       eax,  [esp+24];p_enc
+	mov       ebx,  [esp+28];linesize_enc
+
+	; load source 4x4 samples and Hadamard transform
+    movd      xmm0, [eax]
+    movd      xmm1, [eax+ebx]
+    lea       eax , [eax+2*ebx]
+    movd      xmm2, [eax]
+    movd      xmm3, [eax+ebx]
+    punpckldq xmm0, xmm2
+    punpckldq xmm1, xmm3
+
+    pxor      xmm6, xmm6
+    punpcklbw xmm0, xmm6
+    punpcklbw xmm1, xmm6
+
+    movdqa    xmm2, xmm0
+    paddw     xmm0, xmm1
+    psubw     xmm2, xmm1
+    SSE2_XSawp  qdq, xmm0, xmm2, xmm3
+
+    movdqa    xmm4, xmm0
+    paddw     xmm0, xmm3
+    psubw     xmm4, xmm3
+
+    movdqa    xmm2, xmm0
+    punpcklwd xmm0, xmm4
+    punpckhwd xmm4, xmm2
+
+	SSE2_XSawp  dq,  xmm0, xmm4, xmm3
+	SSE2_XSawp  qdq, xmm0, xmm3, xmm5
+
+    movdqa    xmm7, xmm0
+    paddw     xmm0, xmm5
+    psubw     xmm7, xmm5
+
+	SSE2_XSawp  qdq,  xmm0, xmm7, xmm1
+
+    ; Hadamard transform results are saved in xmm0 and xmm2
+    movdqa    xmm2, xmm0
+    paddw     xmm0, xmm1
+    psubw     xmm2, xmm1
+
+	; load top boundary samples: [a b c d]
+    mov       eax,  [esp+16];p_dec
+	sub		  eax,	[esp+20];linesize_dec
+	movzx     ecx,  byte [eax]
+	movzx     edx,  byte [eax+1]
+	movzx     esi,  byte [eax+2]
+	movzx     edi,  byte [eax+3]
+
+	; get the transform results of top boundary samples: [a b c d]
+	add       edx, ecx ; edx = a + b
+	add       edi, esi ; edi = c + d
+	add       ecx, ecx ; ecx = a + a
+	add       esi, esi ; esi = c + c
+	sub       ecx, edx ; ecx = a + a - a - b = a - b
+	sub       esi, edi ; esi = c + c - c - d = c - d
+	add       edi, edx ; edi = (a + b) + (c + d)
+	add       edx, edx
+	sub       edx, edi ; edx = (a + b) - (c + d)
+	add       esi, ecx ; esi = (a - b) + (c - d)
+	add       ecx, ecx
+	sub       ecx, esi ; ecx = (a - b) - (c - d) ; [edi edx ecx esi]
+
+	movdqa    xmm6, xmm0 ; keep copies of the source transform for the H and DC candidates
+	movdqa    xmm7, xmm2
+	movd      xmm5, edi ; store the edi for DC mode
+	pxor      xmm3, xmm3
+	pxor      xmm4, xmm4
+	pinsrw    xmm3, edi, 0
+	pinsrw    xmm3, esi, 4
+	psllw     xmm3, 2
+	pinsrw    xmm4, edx, 0
+	pinsrw    xmm4, ecx, 4
+	psllw     xmm4, 2
+
+	; get the satd of V (prediction built from the top boundary row)
+	psubw     xmm0, xmm3
+	psubw     xmm2, xmm4
+
+	WELS_AbsW  xmm0, xmm1
+	WELS_AbsW  xmm2, xmm1
+    paddusw        xmm0, xmm2
+    SUMW_HORIZON1  xmm0, xmm1 ; satd of V is stored in xmm0
+
+	; load left boundary samples: [a b c d]'
+    mov       eax,  [esp+16]
+	mov       ebx,  [esp+20]
+	movzx     ecx,  byte [eax-1]
+	movzx     edx,  byte [eax+ebx-1]
+	lea       eax , [eax+2*ebx]
+	movzx     esi,  byte [eax-1]
+	movzx     edi,  byte [eax+ebx-1]
+
+	; get the transform results of left boundary samples: [a b c d]'
+	add       edx, ecx ; edx = a + b
+	add       edi, esi ; edi = c + d
+	add       ecx, ecx ; ecx = a + a
+	add       esi, esi ; esi = c + c
+	sub       ecx, edx ; ecx = a + a - a - b = a - b
+	sub       esi, edi ; esi = c + c - c - d = c - d
+	add       edi, edx ; edi = (a + b) + (c + d)
+	add       edx, edx
+	sub       edx, edi ; edx = (a + b) - (c + d)
+	add       esi, ecx ; esi = (a - b) + (c - d)
+	add       ecx, ecx
+	sub       ecx, esi ; ecx = (a - b) - (c - d) ; [edi edx ecx esi]'
+
+	; store the transform results in xmm3
+    movd      xmm3, edi
+	pinsrw    xmm3, edx, 1
+	pinsrw    xmm3, ecx, 2
+	pinsrw    xmm3, esi, 3
+	psllw     xmm3, 2
+
+	; get the satd of H (prediction built from the left boundary column)
+	movdqa    xmm2, xmm6
+	movdqa    xmm4, xmm7
+	psubw     xmm2, xmm3
+	WELS_AbsW  xmm2, xmm1
+	WELS_AbsW  xmm4, xmm1
+    paddusw        xmm2, xmm4
+    SUMW_HORIZON1  xmm2, xmm1 ; satd of H is stored in xmm2
+
+	; DC result is stored in xmm1
+	add       edi, 4 ; left sum + rounding
+	movd      xmm1, edi
+	paddw     xmm1, xmm5 ; + top sum
+	psrlw     xmm1, 3 ; dc = (top + left + 4) >> 3
+	movdqa    xmm5, xmm1 ; keep dc for filling the prediction
+	psllw     xmm1, 4 ; a flat block of value dc has a single transform coefficient, 16*dc
+
+    ; get the satd of DC
+    psubw          xmm6, xmm1
+    WELS_AbsW  xmm6, xmm1
+	WELS_AbsW  xmm7, xmm1
+    paddusw        xmm6, xmm7
+    SUMW_HORIZON1  xmm6, xmm1 ; satd of DC is stored in xmm6
+
+    ; comparing order: DC H V
+    mov       edx, [esp+32] ; edx = prediction output buffer (5th argument)
+    movd      eax, xmm6
+    movd      edi, xmm2
+    movd      esi, xmm0
+    and       eax, 0xffff
+    shr       eax, 1
+    and       edi, 0xffff
+    shr       edi, 1
+    and       esi, 0xffff
+    shr       esi, 1
+    add       eax, [esp+40] ; DC cost += 7th argument
+    add       edi, [esp+44] ; H cost  += 8th argument
+    add       esi, [esp+48] ; V cost  += 9th argument
+    cmp       ax, di ; note: signed 16-bit comparisons
+    jg near   not_dc
+    cmp       ax, si
+    jg near   not_dc_h
+
+    ; for DC mode
+    movd      ebx, xmm5
+    imul      ebx, 0x01010101 ; replicate the dc byte across the dword
+    movd	  xmm5, ebx
+	pshufd    xmm5, xmm5, 0
+	movdqa    [edx], xmm5 ; 16 prediction bytes, aligned store
+	mov       ebx, [esp+36]
+	mov       dword [ebx], 0x02 ; *pBestMode = 2 (DC); eax already holds the DC cost
+	pop       edi
+    pop       esi
+    pop       ebx
+    ret
+
+not_dc:
+    cmp       di, si
+    jg near   not_dc_h
+
+    ; for H mode
+    SSE_DB_1_2REG  xmm6, xmm7 ; macro defined elsewhere; presumably fills xmm6 with 0x01 bytes - TODO confirm
+    mov       eax,  [esp+16]
+	mov       ebx,  [esp+20]
+    movzx     ecx,  byte [eax-1]
+	movd      xmm0, ecx
+    pmuludq   xmm0, xmm6
+
+	movzx     ecx,  byte [eax+ebx-1]
+	movd      xmm1, ecx
+    pmuludq   xmm1, xmm6
+%if 1
+    punpckldq xmm0, xmm1
+%else
+	unpcklps  xmm0,	xmm1
+%endif
+	lea       eax,	[eax+ebx*2]
+	movzx	  ecx,	byte [eax-1]
+	movd	  xmm2,	ecx
+    pmuludq   xmm2, xmm6
+
+	movzx	  ecx,	byte [eax+ebx-1]
+	movd	  xmm3,	ecx
+    pmuludq   xmm3, xmm6
+%if 1
+    punpckldq  xmm2, xmm3
+    punpcklqdq xmm0, xmm2
+%else
+	unpcklps  xmm2,	xmm3
+	unpcklpd  xmm0,	xmm2
+%endif
+	movdqa	  [edx],xmm0
+
+	mov       eax, edi ; return the H cost
+    mov       ebx, [esp+36]
+	mov       dword [ebx], 0x01 ; *pBestMode = 1 (H)
+
+    pop       edi
+    pop       esi
+    pop       ebx
+    ret
+not_dc_h:
+    ; for V mode
+    mov       eax,  [esp+16]
+    sub		  eax,	[esp+20]
+	movd	  xmm0,	[eax] ; the four pixels above the block
+	pshufd	  xmm0,	xmm0, 0 ; repeated for all four rows
+	movdqa	  [edx],xmm0
+
+	mov       eax, esi ; return the V cost
+    mov       ebx, [esp+36]
+	mov       dword [ebx], 0x00 ; *pBestMode = 0 (V)
+
+    pop       edi
+    pop       esi
+    pop       ebx
+    ret
+%endif
+
--- /dev/null
+++ b/codec/encoder/core/x86/memzero.asm
@@ -1,0 +1,132 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  memzero.asm
+;*
+;*  Abstract
+;*
+;*
+;*  History
+;*      9/16/2009 Created
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+;***********************************************************************
+;void WelsPrefetchZero_mmx(int8_t const*_A);
+;***********************************************************************
+WELS_EXTERN WelsPrefetchZero_mmx
+	%assign  push_num 0
+	LOAD_1_PARA
+	prefetchnta [r0]	; non-temporal prefetch hint for the cache line at _A; nothing is written
+	ret
+
+
+;***********************************************************************
+;   void WelsSetMemZeroAligned64_sse2(void *dst, int32_t size)
+;***********************************************************************
+WELS_EXTERN WelsSetMemZeroAligned64_sse2
+		; Clears size bytes at dst, 64 per iteration; dst must be 16-byte aligned (movdqa) and size a multiple of 64.
+		%assign  push_num 0
+		LOAD_2_PARA
+		SIGN_EXTENSION r1, r1d
+		neg		r1								; r1 = -size, counted up towards zero
+		jns		.memzeroa64_sse2_end			; size <= 0: nothing to clear (previously overran dst)
+		pxor	xmm0,		xmm0
+.memzeroa64_sse2_loops:
+		movdqa	[r0],		xmm0
+		movdqa	[r0+16],	xmm0
+		movdqa	[r0+32],	xmm0
+		movdqa	[r0+48],	xmm0
+		add		r0, 0x40
+		; r1 reaches zero exactly when size bytes have been cleared
+		add r1, 0x40
+		jnz near .memzeroa64_sse2_loops
+.memzeroa64_sse2_end:
+		ret
+
+;***********************************************************************
+;   void WelsSetMemZeroSize64_mmx(void *dst, int32_t size)
+;***********************************************************************
+WELS_EXTERN WelsSetMemZeroSize64_mmx
+		; Clears size bytes at dst with 8-byte MMX stores, 64 per iteration; size must be a multiple of 64.
+		%assign  push_num 0
+		LOAD_2_PARA
+		SIGN_EXTENSION r1, r1d
+		neg		r1								; r1 = -size, counted up towards zero
+		jns		.memzero64_mmx_end				; size <= 0: nothing to clear (previously overran dst)
+		pxor	mm0,		mm0
+.memzero64_mmx_loops:
+		movq	[r0],		mm0
+		movq	[r0+8],	mm0
+		movq	[r0+16],	mm0
+		movq	[r0+24],	mm0
+		movq	[r0+32],	mm0
+		movq	[r0+40],	mm0
+		movq	[r0+48],	mm0
+		movq	[r0+56],	mm0
+		add		r0,		0x40
+		add r1, 0x40							; reaches zero once size bytes are cleared
+		jnz near .memzero64_mmx_loops
+
+		WELSEMMS
+.memzero64_mmx_end:								; early exit lands here: no MMX register was touched on that path
+		ret
+
+;***********************************************************************
+;   void WelsSetMemZeroSize8_mmx(void *dst, int32_t size)
+;***********************************************************************
+WELS_EXTERN WelsSetMemZeroSize8_mmx
+		; Clears size bytes at dst with one 8-byte MMX store per iteration; size must be a multiple of 8.
+		%assign  push_num 0
+		LOAD_2_PARA
+		SIGN_EXTENSION r1, r1d
+		neg		r1								; r1 = -size, counted up towards zero
+		jns		.memzero8_mmx_end				; size <= 0: nothing to clear (previously overran dst)
+		pxor	mm0,		mm0
+.memzero8_mmx_loops:
+		movq	[r0],		mm0
+		add		r0,		0x08
+		add		r1,		0x08					; reaches zero once size bytes are cleared
+		jnz near .memzero8_mmx_loops
+
+		WELSEMMS
+.memzero8_mmx_end:								; early exit lands here: no MMX register was touched on that path
+		ret
+
+
--- /dev/null
+++ b/codec/encoder/core/x86/quant.asm
@@ -1,0 +1,370 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  quant.asm
+;*
+;*  Abstract
+;*      sse2 quantize inter-block
+;*
+;*  History
+;*      7/6/2009 Created
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+
+SECTION .text
+;************************************************
+;NEW_QUANT
+;************************************************
+
+%macro SSE2_Quant8  5
+		MOVDQ	%1, %5		; %1 = 8 coefficients loaded from memory operand %5 (%2 is scratch)
+		pxor	%2, %2
+		pcmpgtw	%2, %1		; %2 = 0xFFFF in lanes where the coefficient is negative
+		pxor	%1, %2
+		psubw	%1, %2		; %1 = |coef| (conditional two's-complement negate)
+		paddusw	%1, %3		; + %3 (rounding offset ff), unsigned saturating
+		pmulhuw	%1, %4		; * %4 (multiplier mf), keeping the high 16 bits of the product
+		pxor	%1, %2
+		psubw	%1, %2		; restore the original sign
+		MOVDQ	%5, %1		; write the quantized coefficients back in place
+%endmacro
+
+%macro SSE2_QuantMax8  6
+		MOVDQ	%1, %5		; %1 = 8 coefficients loaded from memory operand %5 (%2 is scratch)
+		pxor	%2, %2
+		pcmpgtw	%2, %1		; %2 = 0xFFFF in lanes where the coefficient is negative
+		pxor	%1, %2
+		psubw	%1, %2		; %1 = |coef|
+		paddusw	%1, %3		; + %3 (rounding offset ff), unsigned saturating
+		pmulhuw	%1, %4		; * %4 (multiplier mf), keeping the high 16 bits of the product
+		pmaxsw	%6, %1		; %6 = running per-lane maximum of the quantized magnitudes
+		pxor	%1, %2
+		psubw	%1, %2		; restore the original sign
+		MOVDQ	%5, %1		; write the quantized coefficients back in place
+%endmacro
+
+%define pDct				esp + 4
+%define ff					esp + 8
+%define mf					esp + 12
+%define max					esp + 16
+;***********************************************************************
+;	void WelsQuant4x4_sse2(int16_t *pDct, int16_t* ff,  int16_t *mf);
+;***********************************************************************
+WELS_EXTERN WelsQuant4x4_sse2
+		%assign push_num 0
+                LOAD_3_PARA
+		movdqa	xmm2, [r1]		; xmm2 = 8 rounding offsets from ff (aligned load)
+		movdqa	xmm3, [r2]		; xmm3 = 8 multipliers from mf (aligned load)
+
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0]			; coefficients 0-7, quantized in place
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]	; coefficients 8-15, same ff/mf vectors
+
+		ret
+
+;***********************************************************************
+;void WelsQuant4x4Dc_sse2(int16_t *pDct, const int16_t ff, int16_t mf);
+;***********************************************************************
+WELS_EXTERN WelsQuant4x4Dc_sse2
+ 		%assign push_num 0
+		LOAD_3_PARA
+		SIGN_EXTENSIONW r1, r1w
+		SIGN_EXTENSIONW r2, r2w
+		SSE2_Copy8Times xmm3, r2d		; macro defined elsewhere; presumably broadcasts scalar mf to all 8 words
+
+		SSE2_Copy8Times xmm2, r1d		; likewise for scalar ff
+
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0]			; coefficients 0-7, quantized in place
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]	; coefficients 8-15
+
+		ret
+
+;***********************************************************************
+;	void WelsQuantFour4x4_sse2(int16_t *pDct, int16_t* ff,  int16_t *mf);
+;***********************************************************************
+WELS_EXTERN WelsQuantFour4x4_sse2
+		%assign push_num 0
+		LOAD_3_PARA
+		MOVDQ	xmm2, [r1]		; xmm2 = 8 rounding offsets from ff
+		MOVDQ	xmm3, [r2]		; xmm3 = 8 multipliers from mf
+
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0]			; 8 x 16 bytes = 64 coefficients quantized in place
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x20]
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x30]
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x40]
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x50]
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x60]
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x70]
+
+		ret
+
+;***********************************************************************
+;	void WelsQuantFour4x4Max_sse2(int16_t *pDct, int16_t* ff,  int16_t *mf, int16_t *max);
+;***********************************************************************
+WELS_EXTERN WelsQuantFour4x4Max_sse2
+		%assign push_num 0
+		LOAD_4_PARA
+		PUSH_XMM 8
+		MOVDQ	xmm2, [r1]		; xmm2 = ff, consumed as 8 words by paddusw inside the macro
+		MOVDQ	xmm3, [r2]		; xmm3 = mf, 8 words
+
+		pxor	xmm4, xmm4		; xmm4..xmm7 = per-lane running maxima, one register per 32-byte group
+		pxor	xmm5, xmm5
+		pxor	xmm6, xmm6
+		pxor	xmm7, xmm7
+		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [r0	  ], xmm4
+		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x10], xmm4
+		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x20], xmm5
+		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x30], xmm5
+		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x40], xmm6
+		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x50], xmm6
+		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x60], xmm7
+		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x70], xmm7
+
+		SSE2_TransTwo4x4W xmm4, xmm5, xmm6, xmm7, xmm0	; macro defined elsewhere; presumably a word transpose with xmm0 as an output - TODO confirm
+		pmaxsw  xmm0,  xmm4		; fold the transposed registers together lane by lane
+		pmaxsw  xmm0,  xmm5
+		pmaxsw  xmm0,  xmm7
+		movdqa	xmm1,  xmm0
+		punpckhqdq	xmm0, xmm1	; xmm0 = its own upper half in both qwords
+		pmaxsw	xmm0, xmm1		; low qword = max(upper half, lower half)
+
+		movq	[r3], xmm0		; store four int16_t maxima to max[0..3]
+		POP_XMM
+		LOAD_4_PARA_POP
+		ret
+
+%macro  MMX_Copy4Times 2
+		movd		%1, %2		; %1 = MMX destination, %2 = 32-bit source; only its low word is replicated
+		punpcklwd	%1, %1		; low word duplicated into words 0 and 1
+		punpckldq	%1,	%1		; low dword duplicated: the word now fills all four lanes
+%endmacro
+
+SECTION .text
+
+%macro MMX_Quant4  4
+		pxor	%2, %2		; %1 = 4 coefficients (in/out), %2 = scratch
+		pcmpgtw	%2, %1		; %2 = 0xFFFF in lanes where the coefficient is negative
+		pxor	%1, %2
+		psubw	%1, %2		; %1 = |coef|
+		paddusw	%1, %3		; + %3 (rounding offset ff), unsigned saturating
+		pmulhuw	%1, %4		; * %4 (multiplier mf), keeping the high 16 bits of the product
+		pxor	%1, %2
+		psubw	%1, %2		; restore the original sign
+%endmacro
+
+;***********************************************************************
+;int32_t WelsHadamardQuant2x2_mmx(int16_t *rs, const int16_t ff, int16_t mf, int16_t * pDct, int16_t * block);
+;***********************************************************************
+WELS_EXTERN WelsHadamardQuant2x2_mmx
+		%assign push_num 0
+		LOAD_5_PARA
+		SIGN_EXTENSIONW r1, r1w
+		SIGN_EXTENSIONW r2, r2w
+		movd		mm0,			[r0]			; the first word at byte offsets 0x00/0x20/0x40/0x60 of rs is gathered
+		movd		mm1,			[r0 + 0x20]
+		punpcklwd	mm0,			mm1
+		movd		mm3,			[r0 + 0x40]
+		movd		mm1,			[r0 + 0x60]
+		punpcklwd	mm3,			mm1
+
+		;hdm_2x2,	mm0 = dct0 dct1, mm3 = dct2 dct3
+		movq		mm5,			mm3
+		paddw		mm3,			mm0
+		psubw		mm0,			mm5
+		punpcklwd	mm3,			mm0
+		movq		mm1,			mm3
+		psrlq		mm1,			32
+		movq		mm5,			mm1
+		paddw		mm1,			mm3
+		psubw		mm3,			mm5
+		punpcklwd	mm1,			mm3
+
+		;quant_2x2_dc
+		MMX_Copy4Times	mm3,		r2d			; mm3 = mf in all four words
+		MMX_Copy4Times	mm2,		r1d			; mm2 = ff in all four words
+		MMX_Quant4		mm1,	mm0,	mm2,	mm3
+
+		; store dct_2x2
+		movq		[r3],			mm1			; the same four quantized values go to pDct ...
+		movq		[r4],			mm1			; ... and to block
+
+		; pNonZeroCount of dct_2x2
+		pcmpeqb		mm2,			mm2		; mm2 = FF
+		pxor		mm3,			mm3
+		packsswb	mm1,			mm3		; 4 values packed to bytes; the upper 4 bytes are zero
+		pcmpeqb		mm1,			mm3		; set FF if equal, 0 if not equal
+		psubsb		mm1,			mm2		; set 0 if equal, 1 if not equal
+		psadbw		mm1,			mm3		; sum of the bytes = number of nonzero values
+		mov			r1w,				0
+		mov			[r0],			r1w		; clear the four source words in rs
+		mov			[r0 + 0x20],	r1w
+		mov			[r0 + 0x40],	r1w
+		mov			[r0 + 0x60],	r1w
+
+
+		movd		retrd,		mm1			; return the nonzero count
+
+		WELSEMMS
+		LOAD_5_PARA_POP
+		ret
+
+;***********************************************************************
+;int32_t WelsHadamardQuant2x2Skip_mmx(int16_t *pDct, int16_t ff,  int16_t mf);
+;***********************************************************************
+WELS_EXTERN WelsHadamardQuant2x2Skip_mmx
+		%assign push_num 0
+		LOAD_3_PARA
+		SIGN_EXTENSIONW r1, r1w
+		SIGN_EXTENSIONW r2, r2w
+		movd		mm0,			[r0]			; the first word at byte offsets 0x00/0x20/0x40/0x60 of pDct is gathered
+		movd		mm1,			[r0 + 0x20]
+		punpcklwd	mm0,			mm1
+		movd		mm3,			[r0 + 0x40]
+		movd		mm1,			[r0 + 0x60]
+		punpcklwd	mm3,			mm1
+
+		;hdm_2x2,	mm0 = dct0 dct1, mm3 = dct2 dct3
+		movq		mm5,			mm3
+		paddw		mm3,			mm0
+		psubw		mm0,			mm5
+		punpcklwd	mm3,			mm0
+		movq		mm1,			mm3
+		psrlq		mm1,			32
+		movq		mm5,			mm1
+		paddw		mm1,			mm3
+		psubw		mm3,			mm5
+		punpcklwd	mm1,			mm3
+
+		;quant_2x2_dc
+		MMX_Copy4Times	mm3,		r2d			; mm3 = mf in all four words
+		MMX_Copy4Times	mm2,		r1d			; mm2 = ff in all four words
+		MMX_Quant4		mm1,	mm0,	mm2,	mm3
+
+		; pNonZeroCount of dct_2x2
+		pcmpeqb		mm2,			mm2		; mm2 = FF
+		pxor		mm3,			mm3
+		packsswb	mm1,			mm3		; 4 values packed to bytes; the upper 4 bytes are zero
+		pcmpeqb		mm1,			mm3		; set FF if equal, 0 if not equal
+		psubsb		mm1,			mm2		; set 0 if equal, 1 if not equal
+		psadbw		mm1,			mm3		; sum of the bytes = number of nonzero values
+		movd		retrd,			mm1		; return the count; nothing is written back to memory
+
+		WELSEMMS
+		ret
+
+
+%macro SSE2_DeQuant8 3
+    MOVDQ  %2, %1		; %1 = memory operand holding 8 coefficients, %2 = scratch register
+    pmullw %2, %3		; multiply by %3 (dequant factors), keeping the low 16 bits of each product
+    MOVDQ  %1, %2		; write the result back in place
+%endmacro
+
+
+;***********************************************************************
+; void WelsDequant4x4_sse2(int16_t *pDct, const uint16_t* mf);
+;***********************************************************************
+WELS_EXTERN WelsDequant4x4_sse2
+	%assign push_num 0
+	LOAD_2_PARA
+
+	movdqa  xmm1, [r1]						; xmm1 = 8 dequant factors (aligned load), reused for both halves
+	SSE2_DeQuant8 [r0	],  xmm0, xmm1		; coefficients 0-7, in place
+	SSE2_DeQuant8 [r0 + 0x10],  xmm0, xmm1	; coefficients 8-15, in place
+
+    ret
+
+;***********************************************************************====
+;void WelsDequantFour4x4_sse2(int16_t *pDct, const uint16_t* mf);
+;***********************************************************************====
+
+WELS_EXTERN WelsDequantFour4x4_sse2
+	%assign push_num 0
+	LOAD_2_PARA
+
+	movdqa  xmm1, [r1]						; xmm1 = 8 dequant factors (aligned load), reused for every group
+	SSE2_DeQuant8 [r0	],  xmm0, xmm1		; 8 x 16 bytes = 64 coefficients scaled in place
+	SSE2_DeQuant8 [r0+0x10	],  xmm0, xmm1
+	SSE2_DeQuant8 [r0+0x20	],  xmm0, xmm1
+	SSE2_DeQuant8 [r0+0x30	],  xmm0, xmm1
+	SSE2_DeQuant8 [r0+0x40	],  xmm0, xmm1
+	SSE2_DeQuant8 [r0+0x50	],  xmm0, xmm1
+	SSE2_DeQuant8 [r0+0x60	],  xmm0, xmm1
+	SSE2_DeQuant8 [r0+0x70	],  xmm0, xmm1
+
+    ret
+
+;***********************************************************************
+;void WelsDequantIHadamard4x4_sse2(int16_t *rs, const uint16_t mf);
+;***********************************************************************
+WELS_EXTERN WelsDequantIHadamard4x4_sse2
+		%assign push_num 0
+		LOAD_2_PARA
+		%ifndef X86_32
+		movzx r1, r1w					; mf is unsigned 16-bit: zero-extend it
+		%endif
+
+		; WelsDequantLumaDc4x4
+		SSE2_Copy8Times	xmm1,		r1d	; macro defined elsewhere; presumably broadcasts mf to all 8 words
+		;psrlw		xmm1,		2		; for the (>>2) in ihdm
+		MOVDQ		xmm0,		[r0]
+		MOVDQ		xmm2,		[r0+0x10]
+		pmullw		xmm0,		xmm1	; scale all 16 coefficients by mf
+		pmullw		xmm2,		xmm1
+
+		; ihdm_4x4
+		movdqa		xmm1,		xmm0
+		psrldq		xmm1,		8		; xmm1 = upper four words of xmm0
+		movdqa		xmm3,		xmm2
+		psrldq		xmm3,		8		; xmm3 = upper four words of xmm2
+
+		SSE2_SumSub		xmm0, xmm3,	xmm5					; xmm0 = xmm0 - xmm3, xmm3 = xmm0 + xmm3
+		SSE2_SumSub		xmm1, xmm2, xmm5					; xmm1 = xmm1 - xmm2, xmm2 = xmm1 + xmm2
+		SSE2_SumSub		xmm3, xmm2, xmm5					; xmm3 = xmm3 - xmm2, xmm2 = xmm3 + xmm2
+		SSE2_SumSub		xmm0, xmm1, xmm5               		; xmm0 = xmm0 - xmm1, xmm1 = xmm0 + xmm1
+
+		SSE2_TransTwo4x4W	xmm2, xmm1, xmm3, xmm0, xmm4
+		SSE2_SumSub		xmm2, xmm4,	xmm5
+		SSE2_SumSub		xmm1, xmm0, xmm5
+		SSE2_SumSub		xmm4, xmm0, xmm5
+		SSE2_SumSub		xmm2, xmm1, xmm5
+		SSE2_TransTwo4x4W	xmm0, xmm1, xmm4, xmm2, xmm3
+
+		punpcklqdq	xmm0,		xmm1	; pack two 4-word rows into one register
+		MOVDQ		[r0],		xmm0	; write the first 8 results back over rs
+
+		punpcklqdq	xmm2,		xmm3
+		MOVDQ		[r0+16],	xmm2	; write the last 8 results
+		ret
--- /dev/null
+++ b/codec/encoder/core/x86/score.asm
@@ -1,0 +1,339 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  score.asm
+;*
+;*  Abstract
+;*      scan/score/count of sse2
+;*
+;*  History
+;*      8/21/2009 Created
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+;***********************************************************************
+; Macros
+;***********************************************************************
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+SECTION .rodata align=16
+
+;align 16
+;se2_2 dw 2, 2, 2, 2, 2, 2, 2, 2
+align 16
+sse2_1: dw 1, 1, 1, 1, 1, 1, 1, 1
+align 16
+sse2_b1: db 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+i_ds_table: db 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+align 16
+sse2_plane_inc_minus: dw -7, -6, -5, -4, -3, -2, -1, 0
+align 16
+sse2_plane_inc: dw 1, 2, 3, 4, 5, 6, 7, 8
+align 16
+sse2_plane_dec: dw 8, 7, 6, 5, 4, 3, 2, 1
+align 16
+pb_scanacdc_maska:db 0,1,2,3,8,9,14,15,10,11,4,5,6,7,12,13
+align 16
+pb_scanacdc_maskb:db 2,3,8,9,10,11,4,5,0,1,6,7,12,13,14,15
+align 16
+pb_scandc_maska:db 2,3,8,9,14,15,10,11,4,5,6,7,12,13,0,1
+align 16
+pb_scandc_maskb:db 8,9,10,11,4,5,0,1,6,7,12,13,14,15,128,128
+
+align 16
+nozero_count_table:
+db  0,1,1,2,1,2,2,3,1,2
+db  2,3,2,3,3,4,1,2,2,3
+db  2,3,3,4,2,3,3,4,3,4
+db  4,5,1,2,2,3,2,3,3,4
+db  2,3,3,4,3,4,4,5,2,3
+db  3,4,3,4,4,5,3,4,4,5
+db  4,5,5,6,1,2,2,3,2,3
+db  3,4,2,3,3,4,3,4,4,5
+db  2,3,3,4,3,4,4,5,3,4
+db  4,5,4,5,5,6,2,3,3,4
+db  3,4,4,5,3,4,4,5,4,5
+db  5,6,3,4,4,5,4,5,5,6
+db  4,5,5,6,5,6,6,7,1,2
+db  2,3,2,3,3,4,2,3,3,4
+db  3,4,4,5,2,3,3,4,3,4
+db  4,5,3,4,4,5,4,5,5,6
+db  2,3,3,4,3,4,4,5,3,4
+db  4,5,4,5,5,6,3,4,4,5
+db  4,5,5,6,4,5,5,6,5,6
+db  6,7,2,3,3,4,3,4,4,5
+db  3,4,4,5,4,5,5,6,3,4
+db  4,5,4,5,5,6,4,5,5,6
+db  5,6,6,7,3,4,4,5,4,5
+db  5,6,4,5,5,6,5,6,6,7
+db  4,5,5,6,5,6,6,7,5,6
+db  6,7,6,7,7,8
+
+align 16
+high_mask_table:
+	db  0, 0, 0, 3, 0, 2, 3, 6, 0, 2
+	db  2, 5, 3, 5, 6, 9, 0, 1, 2, 5
+	db  2, 4, 5, 8, 3, 5, 5, 8, 6, 8
+	db  9,12, 0, 1, 1, 4, 2, 4, 5, 8
+	db  2, 4, 4, 7, 5, 7, 8,11, 3, 4
+	db  5, 8, 5, 7, 8,11, 6, 8, 8,11
+	db  9,11,12,15, 0, 1, 1, 4, 1, 3
+	db  4, 7, 2, 4, 4, 7, 5, 7, 8,11
+	db  2, 3, 4, 7, 4, 6, 7,10, 5, 7
+	db  7,10, 8,10,11,14, 3, 4, 4, 7
+	db  5, 7, 8,11, 5, 7, 7,10, 8,10
+	db 11,14, 6, 7, 8,11, 8,10,11,14
+	db  9,11,11,14,12,14,15,18, 0, 0
+	db  1, 4, 1, 3, 4, 7, 1, 3, 3, 6
+	db  4, 6, 7,10, 2, 3, 4, 7, 4, 6
+	db  7,10, 5, 7, 7,10, 8,10,11,14
+	db  2, 3, 3, 6, 4, 6, 7,10, 4, 6
+	db  6, 9, 7, 9,10,13, 5, 6, 7,10
+	db  7, 9,10,13, 8,10,10,13,11,13
+	db 14,17, 3, 4, 4, 7, 4, 6, 7,10
+	db  5, 7, 7,10, 8,10,11,14, 5, 6
+	db  7,10, 7, 9,10,13, 8,10,10,13
+	db 11,13,14,17, 6, 7, 7,10, 8,10
+	db 11,14, 8,10,10,13,11,13,14,17
+	db  9,10,11,14,11,13,14,17,12,14
+	db 14,17,15,17,18,21
+
+align 16
+low_mask_table:
+    db  0, 3, 2, 6, 2, 5, 5, 9, 1, 5
+    db  4, 8, 5, 8, 8,12, 1, 4, 4, 8
+    db  4, 7, 7,11, 4, 8, 7,11, 8,11
+    db 11,15, 1, 4, 3, 7, 4, 7, 7,11
+    db  3, 7, 6,10, 7,10,10,14, 4, 7
+    db  7,11, 7,10,10,14, 7,11,10,14
+    db 11,14,14,18, 0, 4, 3, 7, 3, 6
+    db  6,10, 3, 7, 6,10, 7,10,10,14
+    db  3, 6, 6,10, 6, 9, 9,13, 6,10
+    db  9,13,10,13,13,17, 4, 7, 6,10
+    db  7,10,10,14, 6,10, 9,13,10,13
+    db 13,17, 7,10,10,14,10,13,13,17
+    db 10,14,13,17,14,17,17,21, 0, 3
+    db  3, 7, 3, 6, 6,10, 2, 6, 5, 9
+    db  6, 9, 9,13, 3, 6, 6,10, 6, 9
+    db  9,13, 6,10, 9,13,10,13,13,17
+    db  3, 6, 5, 9, 6, 9, 9,13, 5, 9
+    db  8,12, 9,12,12,16, 6, 9, 9,13
+    db  9,12,12,16, 9,13,12,16,13,16
+    db 16,20, 3, 7, 6,10, 6, 9, 9,13
+    db  6,10, 9,13,10,13,13,17, 6, 9
+    db  9,13, 9,12,12,16, 9,13,12,16
+    db 13,16,16,20, 7,10, 9,13,10,13
+    db 13,17, 9,13,12,16,13,16,16,20
+    db 10,13,13,17,13,16,16,20,13,17
+    db 16,20,17,20,20,24
+
+
+SECTION .text
+
+;***********************************************************************
+;void WelsScan4x4DcAc_sse2( int16_t level[16], int16_t *pDct )
+;***********************************************************************
+WELS_EXTERN WelsScan4x4DcAc_sse2
+	%ifdef X86_32
+	push r3
+	%assign push_num 1
+	%else
+	%assign push_num 0
+	%endif
+	LOAD_2_PARA
+	movdqa     xmm0, [r1]			; 7 6 5 4 3 2 1 0
+	movdqa     xmm1, [r1+16]		; f e d c b a 9 8
+	pextrw     r2d, xmm0, 7			; r2d = 7
+	pextrw     r3d, xmm1, 2			; r3d = a
+	pextrw     r1d, xmm0, 5			; r1d = 5 (pDct pointer no longer needed)
+	pinsrw     xmm1, r2d, 2			; f e d c b 7 9 8
+	pinsrw     xmm0, r1d, 7			; 5 6 5 4 3 2 1 0
+	pextrw     r2d, xmm1, 0			; r2d = 8
+	pinsrw     xmm0, r2d, 5			; 5 6 8 4 3 2 1 0
+	pinsrw     xmm1, r3d, 0			; f e d c b 7 9 a
+	pshufd     xmm2, xmm0, 0xd8		; 5 6 3 2 8 4 1 0
+	pshufd     xmm3, xmm1, 0xd8		; f e b 7 d c 9 a
+	pshufhw    xmm0, xmm2, 0x93		; 6 3 2 5 8 4 1 0
+	pshuflw    xmm1, xmm3, 0x39		; f e b 7 a d c 9
+	movdqa     [r0],xmm0			; level[0..7]
+	movdqa     [r0+16], xmm1		; level[8..15]
+	%ifdef X86_32
+	pop r3
+	%endif
+	ret
+
+;***********************************************************************
+;void WelsScan4x4DcAc_ssse3( int16_t level[16], int16_t *pDct )
+;***********************************************************************
+WELS_EXTERN WelsScan4x4DcAc_ssse3
+	%assign push_num 0
+	LOAD_2_PARA
+	movdqa     xmm0, [r1]
+	movdqa     xmm1, [r1+16]
+	pextrw		r2d,  xmm0, 7			; r2d = [7]
+	pextrw		r1d,  xmm1, 0			; r1d = [8] (pDct pointer no longer needed)
+	pinsrw		xmm0, r1d, 7			; xmm0[7]	=	[8]
+	pinsrw		xmm1, r2d, 0			; xmm1[0]	=	[7]
+	pshufb		xmm1, [pb_scanacdc_maskb]	; byte shuffle within each half using the mask tables
+	pshufb		xmm0, [pb_scanacdc_maska]
+
+	movdqa     [r0],xmm0
+	movdqa     [r0+16], xmm1
+	ret
+;***********************************************************************
+;void WelsScan4x4Ac_sse2( int16_t* zig_value, int16_t* pDct )
+;***********************************************************************
+WELS_EXTERN WelsScan4x4Ac_sse2
+	%assign push_num 0
+	LOAD_2_PARA
+	movdqa     xmm0, [r1]				; coefficients 0-7 (aligned load)
+	movdqa     xmm1, [r1+16]			; coefficients 8-15
+	movdqa     xmm2, xmm0
+	punpcklqdq xmm0, xmm1				; xmm0 = b a 9 8 3 2 1 0
+	punpckhqdq xmm2, xmm1				; xmm2 = f e d c 7 6 5 4
+
+	movdqa     xmm3, xmm0
+	punpckldq  xmm0, xmm2
+	punpckhdq  xmm3, xmm2
+	pextrw     r1d , xmm0, 3			; r1/r2 are scratch from here on: words are swapped between lanes
+	pextrw     r2d , xmm0, 7
+	pinsrw     xmm0, r1d,  7
+	pextrw     r1d,  xmm3, 4
+	pinsrw     xmm3, r2d,  4
+	pextrw     r2d,  xmm3, 0
+	pinsrw     xmm3, r1d,  0
+	pinsrw     xmm0, r2d,  3
+
+	pshufhw    xmm1, xmm0, 0x93
+	pshuflw    xmm2, xmm3, 0x39
+
+    movdqa     xmm3, xmm2
+    psrldq     xmm1, 2					; drop the lowest word of the first half
+    pslldq     xmm3, 14					; lowest word of the second half moves to the top lane
+    por        xmm1, xmm3				; ... and fills the slot vacated in the first half
+    psrldq     xmm2, 2					; second half shifts down by one word; its top word becomes 0
+	movdqa     [r0],xmm1
+	movdqa     [r0+16], xmm2
+	ret
+
+
+;***********************************************************************
+;int32_t WelsCalculateSingleCtr4x4_sse2( int16_t *pDct );
+;***********************************************************************
+WELS_EXTERN WelsCalculateSingleCtr4x4_sse2
+	%ifdef X86_32
+	push r3
+	%assign push_num 1
+	%else
+	%assign push_num 0
+	%endif
+	LOAD_1_PARA
+	movdqa    xmm0, [r0]
+	movdqa    xmm1, [r0+16]
+
+	packsswb  xmm0, xmm1		; 16 coefficients saturated to bytes (nonzero stays nonzero)
+	; below is the register map: r0 - eax, r1 - ebx, r2 - ecx, r3 - edx
+	xor r3, r3
+    pxor      xmm3, xmm3
+    pcmpeqb   xmm0, xmm3		; FF where the coefficient is zero
+    pmovmskb  r3d,  xmm0		; one bit per coefficient: 1 = zero
+
+    xor       r3,  0xffff		; invert: bit i set <=> coefficient i is nonzero
+
+	xor       r0,  r0			; r0 = score accumulator (pDct pointer no longer needed)
+	mov       r2,  7			; r2 scans the low byte downwards from bit 7
+	mov       r1,  8			; r1 scans the high byte upwards from bit 8
+.loop_low8_find1:
+	bt        r3,  r2
+	jc        .loop_high8_find1	; found the highest set bit of the low byte
+	dec		  r2
+	jnz      .loop_low8_find1	; stops with r2 = 0; bit 0 itself is never tested
+.loop_high8_find1:
+	bt        r3, r1
+	jc        .find1end			; found the lowest set bit of the high byte
+	inc       r1
+	cmp       r1,16
+	jb        .loop_high8_find1	; r1 = 16 if the high byte has no set bit
+.find1end:
+	sub       r1, r2
+	sub       r1, 1				; r1 = number of bit positions strictly between the two
+	lea	  r2,  [i_ds_table]
+	add       r0b,  [r2+r1]		; score += i_ds_table[gap]
+	mov       r1, r3
+	and       r3, 0xff			; r3 = low-byte mask
+	shr       r1, 8
+	and       r1, 0xff			; r1 = high-byte mask
+	lea	  r2 , [low_mask_table]
+	add       r0b,  [r2 +r3]	; score += low_mask_table[low byte]
+	lea	  r2, [high_mask_table]
+	add       r0b,  [r2+r1]		; score += high_mask_table[high byte]
+	%ifdef X86_32
+	pop r3
+	%else
+	mov retrd, r0d				; on X86_32 the result is left in r0 (presumably the return register there)
+	%endif
+	ret
+
+
+;***********************************************************************
+; int32_t WelsGetNoneZeroCount_sse2(int16_t* level);
+;***********************************************************************
+WELS_EXTERN WelsGetNoneZeroCount_sse2
+	%assign push_num 0
+	LOAD_1_PARA
+	movdqa    xmm0, [r0]			; level[0..7] (aligned load)
+	movdqa    xmm1, [r0+16]			; level[8..15]
+	pxor      xmm2, xmm2
+	pcmpeqw   xmm0, xmm2			; FFFF where the coefficient is zero
+	pcmpeqw   xmm1, xmm2
+	packsswb  xmm1, xmm0			; 16 bytes: FF = zero coefficient, 00 = nonzero
+	xor r1, r1
+	pmovmskb  r1d,  xmm1			; one bit per coefficient: 1 = zero
+	xor       r1d,  0xffff			; invert: 1 = nonzero
+	mov       r2,  r1
+	and       r1,  0xff				; r1 = low 8 bits of the mask
+	shr       r2,  8				; r2 = high 8 bits of the mask
+;	no 'and r2, 0xff' is needed here: bits 16 and above of the mask are already zero
+;	xor       retr,  retr
+	;add       al,  [nozero_count_table+r2]
+	lea 	  r0 , [nozero_count_table]
+	movzx	  r2, byte [r0+r2]		; table gives the number of set bits in a byte value
+	movzx	  r1,   byte [r0+r1]
+	mov	  retrq, r2
+	add	  retrq, r1				; return = set bits in high byte + set bits in low byte
+	;add       al,  [nozero_count_table+r1]
+	ret
+
--- a/codec/encoder/targets.mk
+++ b/codec/encoder/targets.mk
@@ -37,12 +37,12 @@
 
 ifeq ($(ASM_ARCH), x86)
 ENCODER_ASM_SRCS=\
-	$(ENCODER_SRCDIR)/core/asm/coeff.asm\
-	$(ENCODER_SRCDIR)/core/asm/dct.asm\
-	$(ENCODER_SRCDIR)/core/asm/intra_pred.asm\
-	$(ENCODER_SRCDIR)/core/asm/memzero.asm\
-	$(ENCODER_SRCDIR)/core/asm/quant.asm\
-	$(ENCODER_SRCDIR)/core/asm/score.asm\
+	$(ENCODER_SRCDIR)/core/x86/coeff.asm\
+	$(ENCODER_SRCDIR)/core/x86/dct.asm\
+	$(ENCODER_SRCDIR)/core/x86/intra_pred.asm\
+	$(ENCODER_SRCDIR)/core/x86/memzero.asm\
+	$(ENCODER_SRCDIR)/core/x86/quant.asm\
+	$(ENCODER_SRCDIR)/core/x86/score.asm\
 
 ENCODER_OBJS += $(ENCODER_ASM_SRCS:.asm=.$(OBJ))
 endif
--- a/codec/processing/build/win32/WelsVP_2008.vcproj
+++ b/codec/processing/build/win32/WelsVP_2008.vcproj
@@ -514,7 +514,7 @@
 				</FileConfiguration>
 			</File>
 			<File
-				RelativePath="..\..\src\asm\denoisefilter.asm"
+				RelativePath="..\..\src\x86\denoisefilter.asm"
 				>
 				<FileConfiguration
 					Name="Debug|Win32"
@@ -554,7 +554,7 @@
 				</FileConfiguration>
 			</File>
 			<File
-				RelativePath="..\..\src\asm\downsample_bilinear.asm"
+				RelativePath="..\..\src\x86\downsample_bilinear.asm"
 				>
 				<FileConfiguration
 					Name="Debug|Win32"
@@ -634,7 +634,7 @@
 				</FileConfiguration>
 			</File>
 			<File
-				RelativePath="..\..\src\asm\vaa.asm"
+				RelativePath="..\..\src\x86\vaa.asm"
 				>
 				<FileConfiguration
 					Name="Debug|Win32"
--- a/codec/processing/src/asm/denoisefilter.asm
+++ /dev/null
@@ -1,272 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2010-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  predenoise.asm
-;*
-;*  Abstract
-;*      denoise for SVC2.1
-;*  History
-;*      4/13/2010 Created
-;*      7/30/2010 Modified
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-;***********************************************************************
-; Constant
-;***********************************************************************
-SECTION .rodata align=16
-
-sse2_32 times 8 dw 32
-sse2_20 times 8 dw 20
-
-
-
-;***********************************************************************
-; Code
-;***********************************************************************
-SECTION .text
-
-%macro	WEIGHT_LINE	9
-		movq		%2,	%9
-		punpcklbw	%2,	%7
-		movdqa		%8,	%2
-
-		movdqa		%1,	%6
-		psubusb		%1,	%8
-		psubusb		%8,	%6
-		por			%8,	%1		; ABS(curPixel - centerPixel);
-
-		movdqa		%1,	%3
-		psubusb		%1,	%8
-
-		pmullw		%1,	%1
-		psrlw		%1,	5
-		pmullw		%2,	%1
-		paddusw		%4,	%1
-		paddusw		%5,	%2
-%endmacro
-
-%macro	WEIGHT_LINE1_UV	4
-		movdqa		%2,	%1
-		punpcklbw	%2,	%4
-		paddw		%3,	%2
-
-		movdqa		%2,	%1
-		psrldq		%2,	1
-		punpcklbw	%2,	%4
-		paddw		%3,	%2
-
-		movdqa		%2,	%1
-		psrldq		%2,	2
-		punpcklbw	%2,	%4
-		psllw		%2,	1
-		paddw		%3,	%2
-
-		movdqa		%2,	%1
-		psrldq		%2,	3
-		punpcklbw	%2,	%4
-		paddw		%3,	%2
-
-		movdqa		%2,	%1
-		psrldq		%2,	4
-		punpcklbw	%2,	%4
-		paddw		%3,	%2
-%endmacro
-
-%macro	WEIGHT_LINE2_UV	4
-		movdqa		%2,	%1
-		punpcklbw	%2,	%4
-		paddw		%3,	%2
-
-		movdqa		%2,	%1
-		psrldq		%2,	1
-		punpcklbw	%2,	%4
-		psllw		%2,	1
-		paddw		%3,	%2
-
-		movdqa		%2,	%1
-		psrldq		%2,	2
-		punpcklbw	%2,	%4
-		psllw		%2,	2
-		paddw		%3,	%2
-
-		movdqa		%2,	%1
-		psrldq		%2,	3
-		punpcklbw	%2,	%4
-		psllw		%2,	1
-		paddw		%3,	%2
-
-		movdqa		%2,	%1
-		psrldq		%2,	4
-		punpcklbw	%2,	%4
-		paddw		%3,	%2
-%endmacro
-
-%macro	WEIGHT_LINE3_UV	4
-		movdqa		%2,	%1
-		punpcklbw	%2,	%4
-		psllw		%2,	1
-		paddw		%3,	%2
-
-		movdqa		%2,	%1
-		psrldq		%2,	1
-		punpcklbw	%2,	%4
-		psllw		%2,	2
-		paddw		%3,	%2
-
-		movdqa		%2,	%1
-		psrldq		%2,	2
-		punpcklbw	%2,	%4
-		pmullw		%2,	[sse2_20]
-		paddw		%3,	%2
-
-		movdqa		%2,	%1
-		psrldq		%2,	3
-		punpcklbw	%2,	%4
-		psllw		%2,	2
-		paddw		%3,	%2
-
-		movdqa		%2,	%1
-		psrldq		%2,	4
-		punpcklbw	%2,	%4
-		psllw		%2,	1
-		paddw		%3,	%2
-%endmacro
-
-;***********************************************************************
-;  BilateralLumaFilter8_sse2(uint8_t *pixels, int stride);
-;***********************************************************************
-;	1	2	3
-;	4	0	5
-;	6	7	8
-;	0:	the center point
-
-WELS_EXTERN BilateralLumaFilter8_sse2
-
-        push r3
-        %assign push_num 1
-        LOAD_2_PARA
-        PUSH_XMM 8
-
-		pxor		xmm7,	xmm7
-
-		mov         r3,     r0
-
-		movq        xmm6,   [r0]
-		punpcklbw	xmm6,	xmm7
-		movdqa		xmm3,	[sse2_32]
-		pxor		xmm4,	xmm4		; nTotWeight
-		pxor		xmm5,	xmm5		; nSum
-
-        dec         r0
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0]			; pixel 4
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0 + 2]		; pixel 5
-
-		sub			r0,	r1
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0]			; pixel 1
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0 + 1]		; pixel 2
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0 + 2]		; pixel 3
-
-		lea			r0,	[r0 + r1 * 2]
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0]			; pixel 6
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0 + 1]		; pixel 7
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0 + 2]		; pixel 8
-
-		pcmpeqw		xmm0,	xmm0
-		psrlw		xmm0,	15
-		psllw		xmm0,	8
-		psubusw		xmm0,	xmm4
-		pmullw		xmm0,	xmm6
-		paddusw		xmm5,	xmm0
-		psrlw		xmm5,	8
-		packuswb	xmm5,	xmm5
-		movq		[r3],	xmm5
-
-
-		POP_XMM
-		pop r3
-		%assign push_num 0
-
-		ret
-
-;***********************************************************************
-; void		WaverageChromaFilter8_sse2(uint8_t *pixels, int stride);
-;***********************************************************************
-;5x5 filter:
-;1	1	2	1	1
-;1	2	4	2	1
-;2	4	20	4	2
-;1	2	4	2	1
-;1	1	2	1	1
-
-WELS_EXTERN WaverageChromaFilter8_sse2
-
-        push r3
-
-        %assign push_num 1
-
-        LOAD_2_PARA
-
-        mov		r3,	r1
-		add		r3,	r3
-		sub		r0,	r3			; pixels - 2 * stride
-		sub		r0,	2
-
-		pxor	xmm0,	xmm0
-		pxor	xmm3,	xmm3
-
-		movdqu		xmm1,	[r0]
-		WEIGHT_LINE1_UV	xmm1,	xmm2,	xmm3,	xmm0
-
-		movdqu		xmm1,	[r0 + r1]
-		WEIGHT_LINE2_UV	xmm1,	xmm2,	xmm3,	xmm0
-
-		add		r0,	r3
-		movdqu		xmm1,	[r0]
-		WEIGHT_LINE3_UV	xmm1,	xmm2,	xmm3,	xmm0
-
-		movdqu		xmm1,	[r0 + r1]
-		WEIGHT_LINE2_UV	xmm1,	xmm2,	xmm3,	xmm0
-
-		movdqu		xmm1,	[r0 + r1 * 2]
-		WEIGHT_LINE1_UV	xmm1,	xmm2,	xmm3,	xmm0
-
-		psrlw		xmm3,		6
-		packuswb	xmm3,		xmm3
-		movq		[r0 + 2],		xmm3
-
-
-        pop r3
-
-        %assign push_num 0
-		ret
--- a/codec/processing/src/asm/downsample_bilinear.asm
+++ /dev/null
@@ -1,1205 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*	upsampling.asm
-;*
-;*  Abstract
-;*		SIMD for pixel domain down sampling
-;*
-;*  History
-;*		10/22/2009	Created
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-%ifdef X86_32
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-
-;***********************************************************************
-; Some constants
-;***********************************************************************
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-SECTION .rodata align=16
-
-;***********************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;***********************************************************************
-
-ALIGN 16
-shufb_mask_low:
-	db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h
-shufb_mask_high:
-	db 01h, 80h, 03h, 80h, 05h, 80h, 07h, 80h, 09h, 80h, 0bh, 80h, 0dh, 80h, 0fh, 80h
-
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx32_sse(	unsigned char* pDst, const int iDstStride,
-;					unsigned char* pSrc, const int iSrcStride,
-;					const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride
-	mov ebp, [esp+44]	; iSrcHeight
-
-	sar ebp, $01			; iSrcHeight >> 1
-
-.yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $01			; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth restored at ebx
-	sar eax, $04			; (iSrcWidth >> 1) / 16		; loop count = num_of_mb
-	neg ebx				; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 32 bytes
-.xloops:
-	; 1st part horizonal loop: x16 bytes
-	;               mem  hi<-       ->lo
-	;1st Line Src:	mm0: d D c C b B a A	mm1: h H g G f F e E
-	;2nd Line Src:	mm2: l L k K j J i I   	mm3: p P o O n N m M
-	;=> target:
-	;: H G F E D C B A, P O N M L K J I
-	;: h g f e d c b a, p o n m l k j i
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	movq mm0, [esi]			; 1st pSrc line
-	movq mm1, [esi+8]		; 1st pSrc line + 8
-	movq mm2, [esi+ecx]		; 2nd pSrc line
-	movq mm3, [esi+ecx+8]	; 2nd pSrc line + 8
-
-	; to handle mm0, mm1, mm2, mm3
-	pshufw mm4, mm0, 0d8h	; d D b B c C a A ; 11011000 B
-	pshufw mm5, mm4, 04eh	; c C a A d D b B ; 01001110 B
-	punpcklbw mm4, mm5		; d c D C b a B A
-	pshufw mm4, mm4, 0d8h  	; d c b a D C B A ; 11011000 B: mm4
-
-	pshufw mm5, mm1, 0d8h	; h H f F g G e E ; 11011000 B
-	pshufw mm6, mm5, 04eh	; g G e E h H f F ; 01001110 B
-	punpcklbw mm5, mm6		; h g H G f e F E
-	pshufw mm5, mm5, 0d8h  	; h g f e H G F E ; 11011000 B: mm5
-
-	pshufw mm6, mm2, 0d8h	; l L j J k K i I ; 11011000 B
-	pshufw mm7, mm6, 04eh	; k K i I l L j J ; 01001110 B
-	punpcklbw mm6, mm7		; l k L K j i J I
-	pshufw mm6, mm6, 0d8h  	; l k j i L K J I ; 11011000 B: mm6
-
-	pshufw mm7, mm3, 0d8h	; p P n N o O m M ; 11011000 B
-	pshufw mm0, mm7, 04eh	; o O m M p P n N ; 01001110 B
-	punpcklbw mm7, mm0 		; p o P O n m N M
-	pshufw mm7, mm7, 0d8h  	; p o n m P O N M ; 11011000 B: mm7
-
-	; to handle mm4, mm5, mm6, mm7
-	movq mm0, mm4		;
-	punpckldq mm0, mm5 	; H G F E D C B A
-	punpckhdq mm4, mm5 	; h g f e d c b a
-
-	movq mm1, mm6
-	punpckldq mm1, mm7 	; P O N M L K J I
-	punpckhdq mm6, mm7 	; p o n m l k j i
-
-	; avg within MB horizon width (16 x 2 lines)
-	pavgb mm0, mm4		; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
-	pavgb mm1, mm6		; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
-	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
-
-	; 2nd part horizonal loop: x16 bytes
-	;               mem  hi<-       ->lo
-	;1st Line Src:	mm0: d D c C b B a A	mm1: h H g G f F e E
-	;2nd Line Src:	mm2: l L k K j J i I   	mm3: p P o O n N m M
-	;=> target:
-	;: H G F E D C B A, P O N M L K J I
-	;: h g f e d c b a, p o n m l k j i
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	movq mm1, [esi+16]		; 1st pSrc line + 16
-	movq mm2, [esi+24]		; 1st pSrc line + 24
-	movq mm3, [esi+ecx+16]	; 2nd pSrc line + 16
-	movq mm4, [esi+ecx+24]	; 2nd pSrc line + 24
-
-	; to handle mm1, mm2, mm3, mm4
-	pshufw mm5, mm1, 0d8h	; d D b B c C a A ; 11011000 B
-	pshufw mm6, mm5, 04eh	; c C a A d D b B ; 01001110 B
-	punpcklbw mm5, mm6		; d c D C b a B A
-	pshufw mm5, mm5, 0d8h  	; d c b a D C B A ; 11011000 B: mm5
-
-	pshufw mm6, mm2, 0d8h	; h H f F g G e E ; 11011000 B
-	pshufw mm7, mm6, 04eh	; g G e E h H f F ; 01001110 B
-	punpcklbw mm6, mm7		; h g H G f e F E
-	pshufw mm6, mm6, 0d8h  	; h g f e H G F E ; 11011000 B: mm6
-
-	pshufw mm7, mm3, 0d8h	; l L j J k K i I ; 11011000 B
-	pshufw mm1, mm7, 04eh	; k K i I l L j J ; 01001110 B
-	punpcklbw mm7, mm1		; l k L K j i J I
-	pshufw mm7, mm7, 0d8h  	; l k j i L K J I ; 11011000 B: mm7
-
-	pshufw mm1, mm4, 0d8h	; p P n N o O m M ; 11011000 B
-	pshufw mm2, mm1, 04eh	; o O m M p P n N ; 01001110 B
-	punpcklbw mm1, mm2 		; p o P O n m N M
-	pshufw mm1, mm1, 0d8h  	; p o n m P O N M ; 11011000 B: mm1
-
-	; to handle mm5, mm6, mm7, mm1
-	movq mm2, mm5
-	punpckldq mm2, mm6 	; H G F E D C B A
-	punpckhdq mm5, mm6 	; h g f e d c b a
-
-	movq mm3, mm7
-	punpckldq mm3, mm1 	; P O N M L K J I
-	punpckhdq mm7, mm1 	; p o n m l k j i
-
-	; avg within MB horizon width (16 x 2 lines)
-	pavgb mm2, mm5		; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
-	pavgb mm3, mm7		; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
-	pavgb mm2, mm3		; (temp_row1+temp_row2+1)>>1, done in another 2nd horizonal part
-
-	movq [edi  ], mm0
-	movq [edi+8], mm2
-
-	; next SMB
-	lea esi, [esi+32]
-	lea edi, [edi+16]
-
-	dec eax
-	jg near .xloops
-
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
-
-	dec ebp
-	jg near .yloops
-
-	WELSEMMS
-	pop ebp
-	pop	edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-
-;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx16_sse( unsigned char* pDst, const int iDstStride,
-;					  unsigned char* pSrc, const int iSrcStride,
-;					  const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride
-	mov ebp, [esp+44]	; iSrcHeight
-
-	sar ebp, $01		; iSrcHeight >> 1
-
-.yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $01		; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth restored at ebx
-	sar eax, $03		; (iSrcWidth >> 1) / 8		; loop count = num_of_mb
-	neg ebx			; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 16 bytes
-.xloops:
-	; 1st part horizonal loop: x16 bytes
-	;               mem  hi<-       ->lo
-	;1st Line Src:	mm0: d D c C b B a A	mm1: h H g G f F e E
-	;2nd Line Src:	mm2: l L k K j J i I   	mm3: p P o O n N m M
-	;=> target:
-	;: H G F E D C B A, P O N M L K J I
-	;: h g f e d c b a, p o n m l k j i
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	movq mm0, [esi]			; 1st pSrc line
-	movq mm1, [esi+8]		; 1st pSrc line + 8
-	movq mm2, [esi+ecx]		; 2nd pSrc line
-	movq mm3, [esi+ecx+8]	; 2nd pSrc line + 8
-
-	; to handle mm0, mm1, mm2, mm3
-	pshufw mm4, mm0, 0d8h	; d D b B c C a A ; 11011000 B
-	pshufw mm5, mm4, 04eh	; c C a A d D b B ; 01001110 B
-	punpcklbw mm4, mm5		; d c D C b a B A
-	pshufw mm4, mm4, 0d8h  	; d c b a D C B A ; 11011000 B: mm4
-
-	pshufw mm5, mm1, 0d8h	; h H f F g G e E ; 11011000 B
-	pshufw mm6, mm5, 04eh	; g G e E h H f F ; 01001110 B
-	punpcklbw mm5, mm6		; h g H G f e F E
-	pshufw mm5, mm5, 0d8h  	; h g f e H G F E ; 11011000 B: mm5
-
-	pshufw mm6, mm2, 0d8h	; l L j J k K i I ; 11011000 B
-	pshufw mm7, mm6, 04eh	; k K i I l L j J ; 01001110 B
-	punpcklbw mm6, mm7		; l k L K j i J I
-	pshufw mm6, mm6, 0d8h  	; l k j i L K J I ; 11011000 B: mm6
-
-	pshufw mm7, mm3, 0d8h	; p P n N o O m M ; 11011000 B
-	pshufw mm0, mm7, 04eh	; o O m M p P n N ; 01001110 B
-	punpcklbw mm7, mm0 		; p o P O n m N M
-	pshufw mm7, mm7, 0d8h  	; p o n m P O N M ; 11011000 B: mm7
-
-	; to handle mm4, mm5, mm6, mm7
-	movq mm0, mm4		;
-	punpckldq mm0, mm5 	; H G F E D C B A
-	punpckhdq mm4, mm5 	; h g f e d c b a
-
-	movq mm1, mm6
-	punpckldq mm1, mm7 	; P O N M L K J I
-	punpckhdq mm6, mm7 	; p o n m l k j i
-
-	; avg within MB horizon width (16 x 2 lines)
-	pavgb mm0, mm4		; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
-	pavgb mm1, mm6		; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
-	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
-
-	movq [edi  ], mm0
-
-	; next SMB
-	lea esi, [esi+16]
-	lea edi, [edi+8]
-
-	dec eax
-	jg near .xloops
-
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
-
-	dec ebp
-	jg near .yloops
-
-	WELSEMMS
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-
-;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx8_sse( unsigned char* pDst, const int iDstStride,
-;					  unsigned char* pSrc, const int iSrcStride,
-;					  const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-WELS_EXTERN DyadicBilinearDownsamplerWidthx8_sse
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride
-	mov ebp, [esp+44]	; iSrcHeight
-
-	sar ebp, $01		; iSrcHeight >> 1
-
-.yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $01		; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth restored at ebx
-	sar eax, $02		; (iSrcWidth >> 1) / 4		; loop count = num_of_mb
-	neg ebx			; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 8 bytes
-.xloops:
-	; 1st part horizonal loop: x8 bytes
-	;               mem  hi<-       ->lo
-	;1st Line Src:	mm0: d D c C b B a A
-	;2nd Line Src:	mm1: h H g G f F e E
-	;=> target:
-	;: H G F E D C B A
-	;: h g f e d c b a
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	movq mm0, [esi]			; 1st pSrc line
-	movq mm1, [esi+ecx]		; 2nd pSrc line
-
-	; to handle mm0, mm1, mm2, mm3
-	pshufw mm2, mm0, 0d8h	; d D b B c C a A ; 11011000 B
-	pshufw mm3, mm2, 04eh	; c C a A d D b B ; 01001110 B
-	punpcklbw mm2, mm3		; d c D C b a B A
-	pshufw mm2, mm2, 0d8h  	; d c b a D C B A ; 11011000 B: mm4
-
-	pshufw mm4, mm1, 0d8h	; h H f F g G e E ; 11011000 B
-	pshufw mm5, mm4, 04eh	; g G e E h H f F ; 01001110 B
-	punpcklbw mm4, mm5		; h g H G f e F E
-	pshufw mm4, mm4, 0d8h  	; h g f e H G F E ; 11011000 B: mm5
-
-	; to handle mm2, mm4
-	movq mm0, mm2		;
-	punpckldq mm0, mm4 	; H G F E D C B A
-	punpckhdq mm2, mm4 	; h g f e d c b a
-
-	; avg within MB horizon width (16 x 2 lines)
-	pavgb mm0, mm2		; (H+h+1)>>1, .., (A+a+1)>>1, temp_row1, 2
-	pshufw mm1, mm0, 04eh	; 01001110 B
-	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
-
-	movd [edi],	mm0
-
-	; next unit
-	lea esi, [esi+8]
-	lea edi, [edi+4]
-
-	dec eax
-	jg near .xloops
-
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
-
-	dec ebp
-	jg near .yloops
-
-	WELSEMMS
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-
-
-
-; got about 50% improvement over DyadicBilinearDownsamplerWidthx32_sse
-;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx32_ssse3(	unsigned char* pDst, const int iDstStride,
-;					unsigned char* pSrc, const int iSrcStride,
-;					const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride
-	mov ebp, [esp+44]	; iSrcHeight
-
-	sar ebp, $01			; iSrcHeight >> 1
-
-	movdqa xmm7, [shufb_mask_low]	; mask low
-	movdqa xmm6, [shufb_mask_high]	; mask high
-
-.yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $01			; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth restored at ebx
-	sar eax, $04			; (iSrcWidth >> 1) / 16		; loop count = num_of_mb
-	neg ebx				; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 32 bytes
-.xloops:
-	; 1st part horizonal loop: x16 bytes
-	;               mem  hi<-       ->lo
-	;1st Line Src:	xmm0: h H g G f F e E d D c C b B a A
-	;				xmm1: p P o O n N m M l L k K j J i I
-	;2nd Line Src:	xmm2: h H g G f F e E d D c C b B a A
-	;				xmm3: p P o O n N m M l L k K j J i I
-	;=> target:
-	;: P O N M L K J I H G F E D C B A
-	;: p o n m l k j i h g f e d c b a
-	;: P ..                          A
-	;: p ..                          a
-
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	movdqa xmm0, [esi]			; 1st_src_line
-	movdqa xmm1, [esi+16]		; 1st_src_line + 16
-	movdqa xmm2, [esi+ecx]		; 2nd_src_line
-	movdqa xmm3, [esi+ecx+16]	; 2nd_src_line + 16
-
-	; packing & avg
-	movdqa xmm4, xmm0			; h H g G f F e E d D c C b B a A
-	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
-	pshufb xmm4, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-	; another implementation for xmm4 high bits
-;	psubb xmm4, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-;	psrlw xmm4, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-	pavgb xmm0, xmm4
-
-	movdqa xmm5, xmm1
-	pshufb xmm1, xmm7
-	pshufb xmm5, xmm6
-;	psubb xmm5, xmm1
-;	psrlw xmm5, 8
-	pavgb xmm1, xmm5
-
-	movdqa xmm4, xmm2
-	pshufb xmm2, xmm7
-	pshufb xmm4, xmm6
-;	psubb xmm4, xmm2
-;	psrlw xmm4, 8
-	pavgb xmm2, xmm4
-
-	movdqa xmm5, xmm3
-	pshufb xmm3, xmm7
-	pshufb xmm5, xmm6
-;	psubb xmm5, xmm3
-;	psrlw xmm5, 8
-	pavgb xmm3, xmm5
-
-	packuswb xmm0, xmm1
-	packuswb xmm2, xmm3
-	pavgb xmm0, xmm2
-
-	; write pDst
-	movdqa [edi], xmm0
-
-	; next SMB
-	lea esi, [esi+32]
-	lea edi, [edi+16]
-
-	dec eax
-	jg near .xloops
-
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
-
-	dec ebp
-	jg near .yloops
-
-	pop ebp
-	pop	edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-
-;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx16_ssse3( unsigned char* pDst, const int iDstStride,
-;					  unsigned char* pSrc, const int iSrcStride,
-;					  const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride
-	mov ebp, [esp+44]	; iSrcHeight
-
-	sar ebp, $01		; iSrcHeight >> 1
-	movdqa xmm7, [shufb_mask_low]	; mask low
-	movdqa xmm6, [shufb_mask_high]	; mask high
-
-.yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $01		; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth restored at ebx
-	sar eax, $03		; (iSrcWidth >> 1) / 8		; loop count = num_of_mb
-	neg ebx			; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 16 bytes
-.xloops:
-	; horizonal loop: x16 bytes by source
-	;               mem  hi<-       ->lo
-	;1st line pSrc:	xmm0: h H g G f F e E d D c C b B a A
-	;2nd line pSrc:  xmm1: p P o O n N m M l L k K j J i I
-	;=> target:
-	;: H G F E D C B A, P O N M L K J I
-	;: h g f e d c b a, p o n m l k j i
-
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	movdqa xmm0, [esi]			; 1st_src_line
-	movdqa xmm1, [esi+ecx]		; 2nd_src_line
-
-	; packing & avg
-	movdqa xmm2, xmm0			; h H g G f F e E d D c C b B a A
-	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
-	pshufb xmm2, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-	; another implementation for xmm2 high bits
-;	psubb xmm2, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-;	psrlw xmm2, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-	pavgb xmm0, xmm2
-
-	movdqa xmm3, xmm1
-	pshufb xmm1, xmm7
-	pshufb xmm3, xmm6
-;	psubb xmm3, xmm1
-;	psrlw xmm3, 8
-	pavgb xmm1, xmm3
-
-	pavgb xmm0, xmm1
-	packuswb xmm0, xmm1
-
-	; write pDst
-	movq [edi], xmm0
-
-	; next SMB
-	lea esi, [esi+16]
-	lea edi, [edi+8]
-
-	dec eax
-	jg near .xloops
-
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
-
-	dec ebp
-	jg near .yloops
-
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-
-; got about 65% improvement over DyadicBilinearDownsamplerWidthx32_sse
-;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx32_sse4(	unsigned char* pDst, const int iDstStride,
-;					unsigned char* pSrc, const int iSrcStride,
-;					const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse4
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride
-	mov ebp, [esp+44]	; iSrcHeight
-
-	sar ebp, $01			; iSrcHeight >> 1
-
-	movdqa xmm7, [shufb_mask_low]	; mask low
-	movdqa xmm6, [shufb_mask_high]	; mask high
-
-.yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $01			; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth restored at ebx
-	sar eax, $04			; (iSrcWidth >> 1) / 16		; loop count = num_of_mb
-	neg ebx				; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 32 bytes
-.xloops:
-	; 1st part horizonal loop: x16 bytes
-	;               mem  hi<-       ->lo
-	;1st Line Src:	xmm0: h H g G f F e E d D c C b B a A
-	;				xmm1: p P o O n N m M l L k K j J i I
-	;2nd Line Src:	xmm2: h H g G f F e E d D c C b B a A
-	;				xmm3: p P o O n N m M l L k K j J i I
-	;=> target:
-	;: P O N M L K J I H G F E D C B A
-	;: p o n m l k j i h g f e d c b a
-	;: P ..                          A
-	;: p ..                          a
-
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	movntdqa xmm0, [esi]			; 1st_src_line
-	movntdqa xmm1, [esi+16]		; 1st_src_line + 16
-	movntdqa xmm2, [esi+ecx]		; 2nd_src_line
-	movntdqa xmm3, [esi+ecx+16]	; 2nd_src_line + 16
-
-	; packing & avg
-	movdqa xmm4, xmm0			; h H g G f F e E d D c C b B a A
-	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
-	pshufb xmm4, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-;	psubb xmm4, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-;	psrlw xmm4, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-	pavgb xmm0, xmm4
-
-	movdqa xmm5, xmm1
-	pshufb xmm1, xmm7
-	pshufb xmm5, xmm6
-;	psubb xmm5, xmm1
-;	psrlw xmm5, 8
-	pavgb xmm1, xmm5
-
-	movdqa xmm4, xmm2
-	pshufb xmm2, xmm7
-	pshufb xmm4, xmm6
-;	psubb xmm4, xmm2
-;	psrlw xmm4, 8
-	pavgb xmm2, xmm4
-
-	movdqa xmm5, xmm3
-	pshufb xmm3, xmm7
-	pshufb xmm5, xmm6
-;	psubb xmm5, xmm3
-;	psrlw xmm5, 8
-	pavgb xmm3, xmm5
-
-	packuswb xmm0, xmm1
-	packuswb xmm2, xmm3
-	pavgb xmm0, xmm2
-
-	; write pDst
-	movdqa [edi], xmm0
-
-	; next SMB
-	lea esi, [esi+32]
-	lea edi, [edi+16]
-
-	dec eax
-	jg near .xloops
-
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
-
-	dec ebp
-	jg near .yloops
-
-	pop ebp
-	pop	edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-
-;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx16_sse4( unsigned char* pDst, const int iDstStride,
-;					  unsigned char* pSrc, const int iSrcStride,
-;					  const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse4
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride
-	mov ebp, [esp+44]	; iSrcHeight
-
-	sar ebp, $01		; iSrcHeight >> 1
-	movdqa xmm7, [shufb_mask_low]	; mask low
-	movdqa xmm6, [shufb_mask_high]	; mask high
-
-.yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $01		; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth restored at ebx
-	sar eax, $03		; (iSrcWidth >> 1) / 8		; loop count = num_of_mb
-	neg ebx			; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 16 bytes
-.xloops:
-	; horizonal loop: x16 bytes by source
-	;               mem  hi<-       ->lo
-	;1st line pSrc:	xmm0: h H g G f F e E d D c C b B a A
-	;2nd line pSrc:  xmm1: p P o O n N m M l L k K j J i I
-	;=> target:
-	;: H G F E D C B A, P O N M L K J I
-	;: h g f e d c b a, p o n m l k j i
-
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	movntdqa xmm0, [esi]			; 1st_src_line
-	movntdqa xmm1, [esi+ecx]		; 2nd_src_line
-
-	; packing & avg
-	movdqa xmm2, xmm0			; h H g G f F e E d D c C b B a A
-	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
-	pshufb xmm2, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-;	psubb xmm2, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-;	psrlw xmm2, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-	pavgb xmm0, xmm2
-
-	movdqa xmm3, xmm1
-	pshufb xmm1, xmm7
-	pshufb xmm3, xmm6
-;	psubb xmm3, xmm1
-;	psrlw xmm3, 8
-	pavgb xmm1, xmm3
-
-	pavgb xmm0, xmm1
-	packuswb xmm0, xmm1
-
-	; write pDst
-	movq [edi], xmm0
-
-	; next SMB
-	lea esi, [esi+16]
-	lea edi, [edi+8]
-
-	dec eax
-	jg near .xloops
-
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
-
-	dec ebp
-	jg near .yloops
-
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-
-
-
-
-
-;**************************************************************************************************************
-;int GeneralBilinearAccurateDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
-;							unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
-;                           unsigned int uiScaleX, unsigned int uiScaleY );
-;{
-;**************************************************************************************************************
-
-WELS_EXTERN	GeneralBilinearAccurateDownsampler_sse2
-	push	ebp
-	push	esi
-	push	edi
-	push	ebx
-%define		pushsize	16
-%define		localsize	28
-%define		pDstData		esp + pushsize + localsize + 4
-%define		dwDstStride		esp + pushsize + localsize + 8
-%define		dwDstWidth		esp + pushsize + localsize + 12
-%define		dwDstHeight		esp + pushsize + localsize + 16
-%define		pSrcData		esp + pushsize + localsize + 20
-%define		dwSrcStride		esp + pushsize + localsize + 24
-%define		dwSrcWidth		esp + pushsize + localsize + 28
-%define		dwSrcHeight		esp + pushsize + localsize + 32
-%define		scale			esp + 0
-%define		uiScaleX			esp + pushsize + localsize + 36
-%define		uiScaleY			esp + pushsize + localsize + 40
-%define		tmpHeight		esp + 12
-%define		yInverse		esp + 16
-%define		xInverse		esp + 20
-%define		dstStep			esp + 24
-	sub		esp,			localsize
-
-	pxor	xmm0,	xmm0
-	mov		edx,	32767
-	mov		eax,	[uiScaleX]
-	and		eax,	32767
-	mov		ebx,	eax
-	neg		ebx
-	and		ebx,	32767
-	movd	xmm1,		eax						; uinc(uiScaleX mod 32767)
-	movd	xmm2,		ebx						; -uinc
-	psllq	xmm1,		32
-	por		xmm1,		xmm2					; 0 0  uinc  -uinc   (dword)
-	pshufd	xmm7,		xmm1,	01000100b		; xmm7: uinc -uinc uinc -uinc
-
-	mov		eax,	[uiScaleY]
-	and		eax,	32767
-	mov		ebx,	eax
-	neg		ebx
-	and		ebx,	32767
-	movd	xmm6,		eax						; vinc(uiScaleY mod 32767)
-	movd	xmm2,		ebx						; -vinc
-	psllq	xmm6,		32
-	por		xmm6,		xmm2					; 0 0 vinc -vinc (dword)
-	pshufd	xmm6,		xmm6,	01010000b		; xmm6: vinc vinc -vinc -vinc
-
-	mov		edx,		40003fffh
-	movd	xmm5,		edx
-	punpcklwd	xmm5,	xmm0					; 16384 16383
-	pshufd	xmm5,		xmm5,	01000100b		; xmm5: 16384 16383 16384 16383
-
-
-DOWNSAMPLE:
-
-	mov		eax,			[dwDstHeight]
-	mov		edi,			[pDstData]
-	mov		edx,			[dwDstStride]
-	mov		ecx,			[dwDstWidth]
-	sub		edx,			ecx
-	mov		[dstStep],	edx				; stride - width
-	dec		eax
-	mov		[tmpHeight],	eax
-	mov		eax,			16384
-	mov		[yInverse],		eax
-
-	pshufd	xmm4,		xmm5,	01010000b	; initial v to 16384 16384 16383 16383
-
-HEIGHT:
-	mov		eax,	[yInverse]
-	mov		esi,	[pSrcData]
-	shr		eax,	15
-	mul		dword [dwSrcStride]
-	add		esi,	eax					; get current row address
-	mov		ebp,	esi
-	add		ebp,	[dwSrcStride]
-
-	mov		eax,		16384
-	mov		[xInverse],		eax
-	mov		ecx,			[dwDstWidth]
-	dec		ecx
-
-	movdqa	xmm3,		xmm5			; initial u to 16384 16383 16384 16383
-
-WIDTH:
-	mov		eax,		[xInverse]
-	shr		eax,		15
-
-	movd	xmm1,		[esi+eax]		; xxxxxxba
-	movd	xmm2,		[ebp+eax]		; xxxxxxdc
-	pxor	xmm0,		xmm0
-	punpcklwd	xmm1,	xmm2			; xxxxdcba
-	punpcklbw	xmm1,	xmm0			; 0d0c0b0a
-	punpcklwd	xmm1,	xmm0			; 000d000c000b000a
-
-	movdqa	xmm2,	xmm4	; xmm2:  vv(1-v)(1-v)  tmpv
-	pmaddwd	xmm2,	xmm3	; mul u(1-u)u(1-u) on xmm2
-	movdqa	xmm0,	xmm2
-	pmuludq	xmm2,	xmm1
-	psrlq	xmm0,	32
-	psrlq	xmm1,	32
-	pmuludq	xmm0,	xmm1
-	paddq	xmm2,	xmm0
-	pshufd	xmm1,	xmm2,	00001110b
-	paddq	xmm2,	xmm1
-	psrlq	xmm2,	29
-
-	movd	eax,	xmm2
-	inc		eax
-	shr		eax,	1
-	mov		[edi],	al
-	inc		edi
-
-	mov		eax,		[uiScaleX]
-	add		[xInverse],	eax
-
-	paddw	xmm3,		xmm7			; inc u
-	psllw	xmm3,		1
-	psrlw	xmm3,		1
-
-	loop	WIDTH
-
-WIDTH_END:
-	mov		eax,		[xInverse]
-	shr		eax,		15
-	mov		cl,			[esi+eax]
-	mov		[edi],		cl
-	inc		edi
-
-	mov		eax,		[uiScaleY]
-	add		[yInverse],	eax
-	add		edi,		[dstStep]
-
-	paddw	xmm4,	xmm6				; inc v
-	psllw	xmm4,	1
-	psrlw	xmm4,	1
-
-	dec		dword [tmpHeight]
-	jg		HEIGHT
-
-
-LAST_ROW:
-	mov		eax,	[yInverse]
-	mov		esi,	[pSrcData]
-	shr		eax,	15
-	mul		dword [dwSrcStride]
-	add		esi,	eax					; get current row address
-
-	mov		eax,		16384
-	mov		[xInverse],		eax
-	mov		ecx,			[dwDstWidth]
-
-LAST_ROW_WIDTH:
-	mov		eax,		[xInverse]
-	shr		eax,		15
-
-	mov		al,			[esi+eax]
-	mov		[edi],	al
-	inc		edi
-
-	mov		eax,		[uiScaleX]
-	add		[xInverse],	eax
-
-	loop	LAST_ROW_WIDTH
-
-LAST_ROW_END:
-
-	add		esp,			localsize
-	pop		ebx
-	pop		edi
-	pop		esi
-	pop		ebp
-%undef		pushsize
-%undef		localsize
-%undef		pSrcData
-%undef		dwSrcWidth
-%undef		dwSrcHeight
-%undef		dwSrcStride
-%undef		pDstData
-%undef		dwDstWidth
-%undef		dwDstHeight
-%undef		dwDstStride
-%undef		scale
-%undef		uiScaleX
-%undef		uiScaleY
-%undef		tmpHeight
-%undef		yInverse
-%undef		xInverse
-%undef		dstStep
-	ret
-
-
-
-
-;**************************************************************************************************************
-;int GeneralBilinearFastDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
-;				unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
-;               unsigned int uiScaleX, unsigned int uiScaleY );
-;{
-;**************************************************************************************************************
-
-WELS_EXTERN	GeneralBilinearFastDownsampler_sse2
-	push	ebp
-	push	esi
-	push	edi
-	push	ebx
-%define		pushsize	16
-%define		localsize	28
-%define		pDstData		esp + pushsize + localsize + 4
-%define		dwDstStride		esp + pushsize + localsize + 8
-%define		dwDstWidth		esp + pushsize + localsize + 12
-%define		dwDstHeight		esp + pushsize + localsize + 16
-%define		pSrcData		esp + pushsize + localsize + 20
-%define		dwSrcStride		esp + pushsize + localsize + 24
-%define		dwSrcWidth		esp + pushsize + localsize + 28
-%define		dwSrcHeight		esp + pushsize + localsize + 32
-%define		scale			esp + 0
-%define		uiScaleX			esp + pushsize + localsize + 36
-%define		uiScaleY			esp + pushsize + localsize + 40
-%define		tmpHeight		esp + 12
-%define		yInverse		esp + 16
-%define		xInverse		esp + 20
-%define		dstStep			esp + 24
-	sub		esp,			localsize
-
-	pxor	xmm0,	xmm0
-	mov		edx,	65535
-	mov		eax,	[uiScaleX]
-	and		eax,	edx
-	mov		ebx,	eax
-	neg		ebx
-	and		ebx,	65535
-	movd	xmm1,		eax						; uinc(uiScaleX mod 65536)
-	movd	xmm2,		ebx						; -uinc
-	psllq	xmm1,		32
-	por		xmm1,		xmm2					; 0 uinc 0 -uinc
-	pshuflw	xmm7,		xmm1,	10001000b		; xmm7: uinc -uinc uinc -uinc
-
-	mov		eax,	[uiScaleY]
-	and		eax,	32767
-	mov		ebx,	eax
-	neg		ebx
-	and		ebx,	32767
-	movd	xmm6,		eax						; vinc(uiScaleY mod 32767)
-	movd	xmm2,		ebx						; -vinc
-	psllq	xmm6,		32
-	por		xmm6,		xmm2					; 0 vinc 0 -vinc
-	pshuflw	xmm6,		xmm6,	10100000b		; xmm6: vinc vinc -vinc -vinc
-
-	mov		edx,		80007fffh				; 32768 32767
-	movd	xmm5,		edx
-	pshuflw	xmm5,		xmm5,		01000100b	; 32768 32767 32768 32767
-	mov		ebx,		16384
-
-
-FAST_DOWNSAMPLE:
-
-	mov		eax,			[dwDstHeight]
-	mov		edi,			[pDstData]
-	mov		edx,			[dwDstStride]
-	mov		ecx,			[dwDstWidth]
-	sub		edx,			ecx
-	mov		[dstStep],	edx				; stride - width
-	dec		eax
-	mov		[tmpHeight],	eax
-	mov		eax,		16384
-	mov		[yInverse],		eax
-
-	pshuflw	xmm4,		xmm5,	01010000b
-	psrlw	xmm4,		1				; initial v to 16384 16384 16383 16383
-
-FAST_HEIGHT:
-	mov		eax,	[yInverse]
-	mov		esi,	[pSrcData]
-	shr		eax,	15
-	mul		dword [dwSrcStride]
-	add		esi,	eax					; get current row address
-	mov		ebp,	esi
-	add		ebp,	[dwSrcStride]
-
-	mov		eax,		32768
-	mov		[xInverse],		eax
-	mov		ecx,			[dwDstWidth]
-	dec		ecx
-
-	movdqa	xmm3,		xmm5			; initial u to 32768 32767 32768 32767
-
-FAST_WIDTH:
-	mov		eax,		[xInverse]
-	shr		eax,		16
-
-	movd	xmm1,		[esi+eax]		; xxxxxxba
-	movd	xmm2,		[ebp+eax]		; xxxxxxdc
-	punpcklwd	xmm1,	xmm2			; xxxxdcba
-	punpcklbw	xmm1,	xmm0			; 0d0c0b0a
-
-	movdqa	xmm2,	xmm4	; xmm2:  vv(1-v)(1-v)  tmpv
-	pmulhuw	xmm2,	xmm3	; mul u(1-u)u(1-u) on xmm2
-	pmaddwd		xmm2,	xmm1
-	pshufd	xmm1,	xmm2,	00000001b
-	paddd	xmm2,	xmm1
-	movd	xmm1,	ebx
-	paddd	xmm2,	xmm1
-	psrld	xmm2,	15
-
-	packuswb	xmm2,	xmm0
-	movd	eax,	xmm2
-	mov		[edi],	al
-	inc		edi
-
-	mov		eax,		[uiScaleX]
-	add		[xInverse],	eax
-
-	paddw	xmm3,		xmm7			; inc u
-
-	loop	FAST_WIDTH
-
-FAST_WIDTH_END:
-	mov		eax,		[xInverse]
-	shr		eax,		16
-	mov		cl,			[esi+eax]
-	mov		[edi],		cl
-	inc		edi
-
-	mov		eax,		[uiScaleY]
-	add		[yInverse],	eax
-	add		edi,		[dstStep]
-
-	paddw	xmm4,	xmm6				; inc v
-	psllw	xmm4,	1
-	psrlw	xmm4,	1
-
-	dec		dword [tmpHeight]
-	jg		FAST_HEIGHT
-
-
-FAST_LAST_ROW:
-	mov		eax,	[yInverse]
-	mov		esi,	[pSrcData]
-	shr		eax,	15
-	mul		dword [dwSrcStride]
-	add		esi,	eax					; get current row address
-
-	mov		eax,		32768
-	mov		[xInverse],		eax
-	mov		ecx,			[dwDstWidth]
-
-FAST_LAST_ROW_WIDTH:
-	mov		eax,		[xInverse]
-	shr		eax,		16
-
-	mov		al,			[esi+eax]
-	mov		[edi],	al
-	inc		edi
-
-	mov		eax,		[uiScaleX]
-	add		[xInverse],	eax
-
-	loop	FAST_LAST_ROW_WIDTH
-
-FAST_LAST_ROW_END:
-
-	add		esp,			localsize
-	pop		ebx
-	pop		edi
-	pop		esi
-	pop		ebp
-%undef		pushsize
-%undef		localsize
-%undef		pSrcData
-%undef		dwSrcWidth
-%undef		dwSrcHeight
-%undef		dwSrcStride
-%undef		pDstData
-%undef		dwDstWidth
-%undef		dwDstHeight
-%undef		dwDstStride
-%undef		scale
-%undef		uiScaleX
-%undef		uiScaleY
-%undef		tmpHeight
-%undef		yInverse
-%undef		xInverse
-%undef		dstStep
-	ret
-%endif
--- a/codec/processing/src/asm/vaa.asm
+++ /dev/null
@@ -1,2030 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2010-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*      vaa.asm
-;*
-;*      Abstract
-;*      sse2 for pVaa routines
-;*
-;*  History
-;*      04/14/2010      Created
-;*              06/07/2010      Added AnalysisVaaInfoIntra_sse2(ssse3)
-;*              06/10/2010      Tune rc_sad_frame_sse2 and got about 40% improvement
-;*              08/11/2010      Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-%macro SUM_SQR_SSE2     3       ; dst, pSrc, zero
-  movdqa %1, %2
-  punpcklbw %1, %3
-  punpckhbw %2, %3
-  pmaddwd %1, %1
-  pmaddwd %2, %2
-  paddd %1, %2
-  pshufd %2, %1, 04Eh   ; 01001110 B
-  paddd %1, %2
-  pshufd %2, %1, 0B1h   ; 10110001 B
-  paddd %1, %2
-%endmacro       ; END OF SUM_SQR_SSE2
-
-%macro WELS_SAD_16x2_SSE2  3 ;esi :%1 edi:%2 ebx:%3
-  movdqa        xmm1,   [%1]
-  movdqa        xmm2,   [%2]
-  movdqa        xmm3,   [%1+%3]
-  movdqa        xmm4,   [%2+%3]
-  psadbw        xmm1,   xmm2
-  psadbw        xmm3,   xmm4
-  paddd xmm6,   xmm1
-  paddd xmm6,   xmm3
-  lea           %1,     [%1+%3*2]
-  lea           %2,     [%2+%3*2]
-%endmacro
-
-; by comparing it outperforms than phaddw(SSSE3) sets
-%macro SUM_WORD_8x2_SSE2        2       ; dst(pSrc), tmp
-  ; @sum_8x2 begin
-  pshufd %2, %1, 04Eh   ; 01001110 B
-  paddw %1, %2
-  pshuflw %2, %1, 04Eh  ; 01001110 B
-  paddw %1, %2
-  pshuflw %2, %1, 0B1h  ; 10110001 B
-  paddw %1, %2
-  ; end of @sum_8x2
-%endmacro       ; END of SUM_WORD_8x2_SSE2
-
-%macro  WELS_SAD_SUM_SQSUM_16x1_SSE2 3 ;esi:%1,edi:%2,ebx:%3
-  movdqa        xmm1,   [%1]
-  movdqa        xmm2,   [%2]
-  movdqa        xmm3,   xmm1
-  psadbw        xmm3,   xmm2
-  paddd         xmm6,   xmm3
-
-  movdqa        xmm3,   xmm1
-  psadbw        xmm3,   xmm0
-  paddd         xmm5,   xmm3
-
-  movdqa        xmm2,   xmm1
-  punpcklbw     xmm1,   xmm0
-  punpckhbw     xmm2,   xmm0
-  pmaddwd               xmm1,   xmm1
-  pmaddwd               xmm2,   xmm2
-  paddd         xmm4,   xmm1
-  paddd         xmm4,   xmm2
-
-  add           %1,     %3
-  add           %2,     %3
-%endmacro
-
-%macro  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 3 ;esi:%1 edi:%2 ebx:%3
-  movdqa        xmm1,   [%1]
-  movdqa        xmm2,   [%2]
-  movdqa        xmm3,   xmm1
-  psadbw        xmm3,   xmm2
-  paddd         xmm7,   xmm3    ; sad
-
-  movdqa        xmm3,   xmm1
-  pmaxub        xmm3,   xmm2
-  pminub        xmm2,   xmm1
-  psubb xmm3,   xmm2    ; diff
-
-  movdqa        xmm2,   xmm1
-  psadbw        xmm2,   xmm0
-  paddd xmm6,   xmm2    ; sum
-
-  movdqa                xmm2,   xmm1
-  punpcklbw     xmm1,   xmm0
-  punpckhbw     xmm2,   xmm0
-  pmaddwd               xmm1,   xmm1
-  pmaddwd               xmm2,   xmm2
-  paddd         xmm5,   xmm1
-  paddd         xmm5,   xmm2    ; sqsum
-
-  movdqa                xmm1,   xmm3
-  punpcklbw     xmm1,   xmm0
-  punpckhbw     xmm3,   xmm0
-  pmaddwd               xmm1,   xmm1
-  pmaddwd               xmm3,   xmm3
-  paddd         xmm4,   xmm1
-  paddd         xmm4,   xmm3    ; sqdiff
-
-  add           %1,     %3
-  add           %2,     %3
-%endmacro
-
-%macro  WELS_SAD_SD_MAD_16x1_SSE2       7 ;esi:%5 edi:%6 ebx:%7
-%define sad_reg                 %1
-%define sum_cur_reg             %2
-%define sum_ref_reg             %3
-%define mad_reg                 %4
-  movdqa        xmm1,           [%5]
-  movdqa        xmm2,           [%6]
-  movdqa        xmm3,           xmm1
-  psadbw        xmm3,           xmm0
-  paddd         sum_cur_reg,    xmm3    ; sum_cur
-  movdqa        xmm3,           xmm2
-  psadbw        xmm3,           xmm0
-  paddd sum_ref_reg,                    xmm3    ; sum_ref
-
-  movdqa        xmm3,           xmm1
-  pmaxub        xmm3,           xmm2
-  pminub        xmm2,           xmm1
-  psubb xmm3,           xmm2    ; abs diff
-  pmaxub        mad_reg,        xmm3    ; max abs diff
-
-  psadbw        xmm3,           xmm0
-  paddd sad_reg,        xmm3    ; sad
-
-  add                   %5,             %7
-  add                   %6,             %7
-%endmacro
-
-
-%macro  WELS_MAX_REG_SSE2       1       ; xmm1, xmm2, xmm3 can be used
-%define max_reg  %1
-  movdqa        xmm1,           max_reg
-  psrldq        xmm1,           4
-  pmaxub        max_reg,        xmm1
-  movdqa        xmm1,           max_reg
-  psrldq        xmm1,           2
-  pmaxub        max_reg,        xmm1
-  movdqa        xmm1,           max_reg
-  psrldq        xmm1,           1
-  pmaxub        max_reg,        xmm1
-%endmacro
-
-%macro  WELS_SAD_BGD_SQDIFF_16x1_SSE2   7 ;esi:%5 edi:%6 ebx:%7
-%define sad_reg         %1
-%define sum_reg         %2
-%define mad_reg         %3
-%define sqdiff_reg      %4
-  movdqa                xmm1,           [%5]
-  movdqa                xmm2,           xmm1
-  movdqa                xmm3,           xmm1
-  punpcklbw     xmm2,           xmm0
-  punpckhbw     xmm3,           xmm0
-  pmaddwd               xmm2,           xmm2
-  pmaddwd               xmm3,           xmm3
-  paddd         xmm2,           xmm3
-  movdqa                xmm3,           xmm2
-  psllq         xmm2,           32
-  psrlq         xmm3,           32
-  psllq         xmm3,           32
-  paddd         xmm2,           xmm3
-  paddd         sad_reg,        xmm2            ; sqsum
-
-  movdqa        xmm2,           [%6]
-  movdqa        xmm3,           xmm1
-  psadbw        xmm3,           xmm0
-  paddd sum_reg,                        xmm3    ; sum_cur
-  movdqa        xmm3,           xmm2
-  psadbw        xmm3,           xmm0
-  pslldq        xmm3,           4
-  paddd sum_reg,                        xmm3    ; sum_ref
-
-  movdqa        xmm3,           xmm1
-  pmaxub        xmm3,           xmm2
-  pminub        xmm2,           xmm1
-  psubb xmm3,           xmm2    ; abs diff
-  pmaxub        mad_reg,        xmm3    ; max abs diff
-
-  movdqa        xmm1,           xmm3
-  psadbw        xmm3,           xmm0
-  paddd sad_reg,        xmm3    ; sad
-
-  movdqa                xmm3,   xmm1
-  punpcklbw     xmm1,   xmm0
-  punpckhbw     xmm3,   xmm0
-  pmaddwd               xmm1,   xmm1
-  pmaddwd               xmm3,   xmm3
-  paddd         sqdiff_reg,     xmm1
-  paddd         sqdiff_reg,     xmm3    ; sqdiff
-
-  add           %5,     %7
-  add           %6,     %7
-%endmacro
-
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-%ifdef X86_32
-
-;***********************************************************************
-;   void SampleVariance16x16_sse2(      uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
-;***********************************************************************
-WELS_EXTERN SampleVariance16x16_sse2
-  push esi
-  push edi
-  push ebx
-
-  sub esp, 16
-  %define SUM                   [esp]
-  %define SUM_CUR               [esp+4]
-  %define SQR                   [esp+8]
-  %define SQR_CUR               [esp+12]
-  %define PUSH_SIZE     28      ; 12 + 16
-
-  mov edi, [esp+PUSH_SIZE+4]    ; y_ref
-  mov edx, [esp+PUSH_SIZE+8]    ; y_ref_stride
-  mov esi, [esp+PUSH_SIZE+12]   ; y_src
-  mov eax, [esp+PUSH_SIZE+16]   ; y_src_stride
-  mov ecx, 010h                         ; height = 16
-
-  pxor xmm7, xmm7
-  movdqu SUM, xmm7
-
-.hloops:
-  movdqa xmm0, [edi]            ; y_ref
-  movdqa xmm1, [esi]            ; y_src
-  movdqa xmm2, xmm0             ; store first for future process
-  movdqa xmm3, xmm1
-  ; sum += diff;
-  movdqa xmm4, xmm0
-  psadbw xmm4, xmm1             ; 2 parts, [0,..,15], [64,..,79]
-  ; to be continued for sum
-  pshufd xmm5, xmm4, 0C6h       ; 11000110 B
-  paddw xmm4, xmm5
-  movd ebx, xmm4
-  add SUM, ebx
-
-  ; sqr += diff * diff;
-  pmaxub xmm0, xmm1
-  pminub xmm1, xmm2
-  psubb xmm0, xmm1                              ; diff
-  SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero
-  movd ebx, xmm1
-  add SQR, ebx
-
-  ; sum_cur += y_src[x];
-  movdqa xmm0, xmm3             ; cur_orig
-  movdqa xmm1, xmm0
-  punpcklbw xmm0, xmm7
-  punpckhbw xmm1, xmm7
-  paddw xmm0, xmm1              ; 8x2
-  SUM_WORD_8x2_SSE2 xmm0, xmm1
-  movd ebx, xmm0
-  and ebx, 0ffffh
-  add SUM_CUR, ebx
-
-  ; sqr_cur += y_src[x] * y_src[x];
-  SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero
-  movd ebx, xmm0
-  add SQR_CUR, ebx
-
-  lea edi, [edi+edx]
-  lea esi, [esi+eax]
-  dec ecx
-  jnz near .hloops
-
-  mov ebx, 0
-  mov bx, word SUM
-  sar ebx, 8
-  imul ebx, ebx
-  mov ecx, SQR
-  sar ecx, 8
-  sub ecx, ebx
-  mov edi, [esp+PUSH_SIZE+20]   ; pMotionTexture
-  mov [edi], cx                         ; to store uiMotionIndex
-  mov ebx, 0
-  mov bx, word SUM_CUR
-  sar ebx, 8
-  imul ebx, ebx
-  mov ecx, SQR_CUR
-  sar ecx, 8
-  sub ecx, ebx
-  mov [edi+2], cx                               ; to store uiTextureIndex
-
-  %undef SUM
-  %undef SUM_CUR
-  %undef SQR
-  %undef SQR_CUR
-  %undef PUSH_SIZE
-
-  add esp, 16
-  pop ebx
-  pop edi
-  pop esi
-
-  ret
-
-
-
-;*************************************************************************************************************
-;void VAACalcSad_sse2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight
-;                                                               int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8)
-;*************************************************************************************************************
-
-
-WELS_EXTERN VAACalcSad_sse2
-%define         cur_data                        esp + pushsize + 4
-%define         ref_data                        esp + pushsize + 8
-%define         iPicWidth                       esp + pushsize + 12
-%define         iPicHeight                      esp + pushsize + 16
-%define         iPicStride                      esp + pushsize + 20
-%define         psadframe                       esp + pushsize + 24
-%define         psad8x8                         esp + pushsize + 28
-%define         pushsize        12
-  push  esi
-  push  edi
-  push  ebx
-  mov           esi,    [cur_data]
-  mov           edi,    [ref_data]
-  mov           ebx,    [iPicStride]
-  mov           edx,    [psad8x8]
-  mov           eax,    ebx
-
-  shr           dword [iPicWidth],      4                                       ; iPicWidth/16
-  shr           dword [iPicHeight],     4                                       ; iPicHeight/16
-  shl           eax,    4                                                               ; iPicStride*16
-  pxor  xmm0,   xmm0
-  pxor  xmm7,   xmm7            ; iFrameSad
-height_loop:
-  mov           ecx,    dword [iPicWidth]
-  push  esi
-  push  edi
-width_loop:
-  pxor  xmm6,   xmm6            ;
-  WELS_SAD_16x2_SSE2 esi,edi,ebx
-  WELS_SAD_16x2_SSE2 esi,edi,ebx
-  WELS_SAD_16x2_SSE2 esi,edi,ebx
-  WELS_SAD_16x2_SSE2 esi,edi,ebx
-  paddd xmm7,           xmm6
-  movd  [edx],          xmm6
-  psrldq        xmm6,           8
-  movd  [edx+4],        xmm6
-
-  pxor  xmm6,   xmm6
-  WELS_SAD_16x2_SSE2 esi,edi,ebx
-  WELS_SAD_16x2_SSE2 esi,edi,ebx
-  WELS_SAD_16x2_SSE2 esi,edi,ebx
-  WELS_SAD_16x2_SSE2 esi,edi,ebx
-  paddd xmm7,           xmm6
-  movd  [edx+8],        xmm6
-  psrldq        xmm6,           8
-  movd  [edx+12],       xmm6
-
-  add           edx,    16
-  sub           esi,    eax
-  sub           edi,    eax
-  add           esi,    16
-  add           edi,    16
-
-  dec           ecx
-  jnz           width_loop
-
-  pop           edi
-  pop           esi
-  add           esi,    eax
-  add           edi,    eax
-
-  dec   dword [iPicHeight]
-  jnz           height_loop
-
-  mov           edx,    [psadframe]
-  movdqa        xmm5,   xmm7
-  psrldq        xmm7,   8
-  paddd xmm7,   xmm5
-  movd  [edx],  xmm7
-
-%undef          cur_data
-%undef          ref_data
-%undef          iPicWidth
-%undef          iPicHeight
-%undef          iPicStride
-%undef          psadframe
-%undef          psad8x8
-%undef          pushsize
-  pop           ebx
-  pop           edi
-  pop           esi
-  ret
-
-%else  ;64-bit
-
-;***********************************************************************
-;   void SampleVariance16x16_sse2(      uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
-;***********************************************************************
-WELS_EXTERN SampleVariance16x16_sse2
-  %define SUM                   r10;[esp]
-  %define SUM_CUR               r11;[esp+4]
-  %define SQR                   r13;[esp+8]
-  %define SQR_CUR               r15;[esp+12]
-
-  push r12
-  push r13
-  push r14
-  push r15
-  %assign push_num 4
-  LOAD_5_PARA
-  PUSH_XMM 8
-  SIGN_EXTENSION r1,r1d
-  SIGN_EXTENSION r3,r3d
-
-  mov r12,010h
-  pxor xmm7, xmm7
-  movq SUM, xmm7
-  movq SUM_CUR,xmm7
-  movq SQR,xmm7
-  movq SQR_CUR,xmm7
-
-.hloops:
-  mov r14,0
-  movdqa xmm0, [r0]             ; y_ref
-  movdqa xmm1, [r2]             ; y_src
-  movdqa xmm2, xmm0             ; store first for future process
-  movdqa xmm3, xmm1
-  ; sum += diff;
-  movdqa xmm4, xmm0
-  psadbw xmm4, xmm1             ; 2 parts, [0,..,15], [64,..,79]
-  ; to be continued for sum
-  pshufd xmm5, xmm4, 0C6h       ; 11000110 B
-  paddw xmm4, xmm5
-  movd r14d, xmm4
-  add SUM, r14
-
-  ; sqr += diff * diff;
-  pmaxub xmm0, xmm1
-  pminub xmm1, xmm2
-  psubb xmm0, xmm1                              ; diff
-  SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero
-  movd r14d, xmm1
-  add SQR, r14
-
-  ; sum_cur += y_src[x];
-  movdqa xmm0, xmm3             ; cur_orig
-  movdqa xmm1, xmm0
-  punpcklbw xmm0, xmm7
-  punpckhbw xmm1, xmm7
-  paddw xmm0, xmm1              ; 8x2
-  SUM_WORD_8x2_SSE2 xmm0, xmm1
-  movd r14d, xmm0
-  and r14, 0ffffh
-  add SUM_CUR, r14
-
-  ; sqr_cur += y_src[x] * y_src[x];
-  SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero
-  movd r14d, xmm0
-  add SQR_CUR, r14
-
-  lea r0, [r0+r1]
-  lea r2, [r2+r3]
-  dec r12
-  jnz near .hloops
-
-  mov r0, SUM
-  sar r0, 8
-  imul r0, r0
-  mov r1, SQR
-  sar r1, 8
-  sub r1, r0
-  mov [r4], r1w                         ; to store uiMotionIndex
-  mov r0, SUM_CUR
-  sar r0, 8
-  imul r0, r0
-  mov r1, SQR_CUR
-  sar r1, 8
-  sub r1, r0
-  mov [r4+2], r1w                               ; to store uiTextureIndex
-
-  POP_XMM
-  LOAD_5_PARA_POP
-  pop r15
-  pop r14
-  pop r13
-  pop r12
-
-
-  %assign push_num 0
-
-  ret
-
-
-;*************************************************************************************************************
-;void VAACalcSad_sse2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight
-;                                                               int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8)
-;*************************************************************************************************************
-
-
-WELS_EXTERN VAACalcSad_sse2
-%define         cur_data                        r0
-%define         ref_data                        r1
-%define         iPicWidth                       r2
-%define         iPicHeight              r3
-%define         iPicStride              r4
-%define         psadframe                       r5
-%define         psad8x8                         r6
-
-  push r12
-  push r13
-  %assign push_num 2
-  LOAD_7_PARA
-  PUSH_XMM 8
-  SIGN_EXTENSION r2,r2d
-  SIGN_EXTENSION r3,r3d
-  SIGN_EXTENSION r4,r4d
-
-  mov   r12,r4
-  shr           r2,     4                                       ; iPicWidth/16
-  shr           r3,     4                                       ; iPicHeight/16
-
-  shl           r12,    4                                                               ; iPicStride*16
-  pxor  xmm0,   xmm0
-  pxor  xmm7,   xmm7            ; iFrameSad
-height_loop:
-  mov           r13,    r2
-  push  r0
-  push  r1
-width_loop:
-  pxor  xmm6,   xmm6
-  WELS_SAD_16x2_SSE2 r0,r1,r4
-  WELS_SAD_16x2_SSE2 r0,r1,r4
-  WELS_SAD_16x2_SSE2 r0,r1,r4
-  WELS_SAD_16x2_SSE2 r0,r1,r4
-  paddd xmm7,           xmm6
-  movd  [r6],           xmm6
-  psrldq        xmm6,           8
-  movd  [r6+4], xmm6
-
-  pxor  xmm6,   xmm6
-  WELS_SAD_16x2_SSE2 r0,r1,r4
-  WELS_SAD_16x2_SSE2 r0,r1,r4
-  WELS_SAD_16x2_SSE2 r0,r1,r4
-  WELS_SAD_16x2_SSE2 r0,r1,r4
-  paddd xmm7,           xmm6
-  movd  [r6+8], xmm6
-  psrldq        xmm6,           8
-  movd  [r6+12],        xmm6
-
-  add           r6,     16
-  sub           r0,     r12
-  sub           r1,     r12
-  add           r0,     16
-  add           r1,     16
-
-  dec           r13
-  jnz           width_loop
-
-  pop           r1
-  pop           r0
-  add           r0,     r12
-  add           r1,     r12
-
-  dec   r3
-  jnz           height_loop
-
-  ;mov          r13,    [psadframe]
-  movdqa        xmm5,   xmm7
-  psrldq        xmm7,   8
-  paddd xmm7,   xmm5
-  movd  [psadframe],    xmm7
-
-%undef          cur_data
-%undef          ref_data
-%undef          iPicWidth
-%undef          iPicHeight
-%undef          iPicStride
-%undef          psadframe
-%undef          psad8x8
-%undef          pushsize
-  POP_XMM
-  LOAD_7_PARA_POP
-  pop r13
-  pop r12
-  %assign push_num 0
-  ret
-
-%endif
-
-
-%ifdef X86_32
-;*************************************************************************************************************
-;void VAACalcSadVar_sse2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight
-;               int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16)
-;*************************************************************************************************************
-
-
-WELS_EXTERN VAACalcSadVar_sse2
-%define         localsize               8
-%define         cur_data                        esp + pushsize + localsize + 4
-%define         ref_data                        esp + pushsize + localsize + 8
-%define         iPicWidth                       esp + pushsize + localsize + 12
-%define         iPicHeight                      esp + pushsize + localsize + 16
-%define         iPicStride                      esp + pushsize + localsize + 20
-%define         psadframe                       esp + pushsize + localsize + 24
-%define         psad8x8                         esp + pushsize + localsize + 28
-%define         psum16x16                       esp + pushsize + localsize + 32
-%define         psqsum16x16                     esp + pushsize + localsize + 36
-%define         tmp_esi                         esp + 0
-%define         tmp_edi                         esp + 4
-%define         pushsize                16
-  push  ebp
-  push  esi
-  push  edi
-  push  ebx
-  sub           esp,    localsize
-  mov           esi,    [cur_data]
-  mov           edi,    [ref_data]
-  mov           ebx,    [iPicStride]
-  mov           edx,    [psad8x8]
-  mov           eax,    ebx
-
-  shr           dword [iPicWidth],      4                                       ; iPicWidth/16
-  shr           dword [iPicHeight],     4                                       ; iPicHeight/16
-  shl           eax,    4                                                       ; iPicStride*16
-  pxor  xmm0,   xmm0
-  pxor  xmm7,   xmm7            ; iFrameSad
-var_height_loop:
-  mov           ecx,    dword [iPicWidth]
-  mov           [tmp_esi],      esi
-  mov           [tmp_edi],      edi
-var_width_loop:
-  pxor  xmm6,   xmm6            ; hiQuad_loQuad pSad8x8
-  pxor  xmm5,   xmm5            ; pSum16x16
-  pxor  xmm4,   xmm4            ; sqsum_16x16
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  paddd xmm7,           xmm6
-  movd  [edx],          xmm6
-  psrldq        xmm6,           8
-  movd  [edx+4],        xmm6
-
-  pxor  xmm6,   xmm6
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  paddd xmm7,           xmm6
-  movd  [edx+8],        xmm6
-  psrldq        xmm6,           8
-  movd  [edx+12],       xmm6
-
-  mov           ebp,    [psum16x16]
-  movdqa        xmm1,   xmm5
-  psrldq        xmm1,   8
-  paddd xmm5,   xmm1
-  movd  [ebp],  xmm5
-  add           dword [psum16x16], 4
-
-  movdqa        xmm5,   xmm4
-  psrldq        xmm5,   8
-  paddd xmm4,   xmm5
-  movdqa        xmm3,   xmm4
-  psrldq        xmm3,   4
-  paddd xmm4,   xmm3
-
-  mov           ebp,    [psqsum16x16]
-  movd  [ebp],  xmm4
-  add           dword [psqsum16x16], 4
-
-  add           edx,    16
-  sub           esi,    eax
-  sub           edi,    eax
-  add           esi,    16
-  add           edi,    16
-
-  dec           ecx
-  jnz           var_width_loop
-
-  mov           esi,    [tmp_esi]
-  mov           edi,    [tmp_edi]
-  add           esi,    eax
-  add           edi,    eax
-
-  dec   dword [iPicHeight]
-  jnz           var_height_loop
-
-  mov           edx,    [psadframe]
-  movdqa        xmm5,   xmm7
-  psrldq        xmm7,   8
-  paddd xmm7,   xmm5
-  movd  [edx],  xmm7
-
-  add           esp,    localsize
-  pop           ebx
-  pop           edi
-  pop           esi
-  pop           ebp
-%undef          cur_data
-%undef          ref_data
-%undef          iPicWidth
-%undef          iPicHeight
-%undef          iPicStride
-%undef          psadframe
-%undef          psad8x8
-%undef          psum16x16
-%undef          psqsum16x16
-%undef          tmp_esi
-%undef          tmp_edi
-%undef          pushsize
-%undef          localsize
-  ret
-
-%else  ;64-bit
-
-;*************************************************************************************************************
-;void VAACalcSadVar_sse2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
-;               int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16)
-;*************************************************************************************************************
-
-
-WELS_EXTERN VAACalcSadVar_sse2
-; 64-bit implementation.  Walks the picture in 16x16 blocks and, per block, stores
-;   psad8x8     : four dwords (two after each 8-row half)
-;   psum16x16   : one dword
-;   psqsum16x16 : one dword
-; then stores one dword of accumulated frame SAD through psadframe.
-; The per-row arithmetic lives in WELS_SAD_SUM_SQSUM_16x1_SSE2, which is defined
-; outside this function; the accumulator roles given below follow the existing
-; inline comments (xmm6 = SAD, xmm5 = sum, xmm4 = square sum, xmm7 = frame SAD).
-;
-; Register roles inside the loops:
-;   r0 / r1   current / reference read pointers
-;   r2        width  / 16 column counter (pushed once per block row)
-;   r3        height / 16 row counter
-;   r4        iPicStride,  r13 = iPicStride * 16
-;   r11 / r12 copies of r0 / r1 taken at the start of each block row
-;   r14       psad8x8 write cursor,  r15 scratch pointer
-;
-; Both loops are dec/jnz with the test at the bottom, so a width or height
-; below 16 (count == 0 after the shift) would wrap the counter.
-; The psum16x16 / psqsum16x16 argument slots are advanced in place.
-; push_num is kept in step with every push/pop; presumably the argN macros
-; (defined outside this chunk) use it to locate stack arguments.
-%define         cur_data                        arg1 ;r0
-%define         ref_data                        arg2 ;r1
-%define         iPicWidth                       arg3 ;r2
-%define         iPicHeight                  arg4 ;r3
-%define         iPicStride                  arg5
-%define         psadframe                       arg6
-%define         psad8x8                         arg7
-%define         psum16x16                       arg8
-%define         psqsum16x16                 arg9
-
-  push r12
-  push r13
-  push r14
-  push r15
-  %assign push_num 4
-  PUSH_XMM 8
-
-%ifdef WIN64
-  mov r4, arg5  ;iPicStride
-  mov r5, arg6  ;psadframe (r5 is not referenced again in this function)
-%endif
-  mov r14,arg7                  ; r14 = psad8x8 write cursor
-  SIGN_EXTENSION r2,r2d
-  SIGN_EXTENSION r3,r3d
-  SIGN_EXTENSION r4,r4d
-
-  mov   r13,r4
-  shr   r2,4    ; iPicWidth/16
-  shr   r3,4    ; iPicHeight/16
-
-  shl   r13,4   ; iPicStride*16
-  pxor  xmm0,   xmm0
-  pxor  xmm7,   xmm7            ; iFrameSad
-var_height_loop:
-  push    r2                    ; keep the column count for the next block row
-  %assign push_num push_num+1
-  mov           r11,    r0      ; remember where this block row starts
-  mov           r12,    r1
-var_width_loop:
-  pxor  xmm6,   xmm6            ; hiQuad_loQuad pSad8x8
-  pxor  xmm5,   xmm5            ; pSum16x16
-  pxor  xmm4,   xmm4            ; sqsum_16x16
-  ; rows 0..7 of the block
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  paddd xmm7,           xmm6    ; fold both 8x8 SADs into the frame total
-  movd  [r14],          xmm6    ; dword 0 of xmm6
-  psrldq        xmm6,           8
-  movd  [r14+4],        xmm6    ; former dword 2 of xmm6
-
-  pxor  xmm6,   xmm6
-  ; rows 8..15 of the block
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  paddd   xmm7,           xmm6
-  movd    [r14+8],        xmm6
-  psrldq  xmm6,           8
-  movd    [r14+12],       xmm6
-
-  mov             r15,    psum16x16
-  movdqa  xmm1,   xmm5
-  psrldq  xmm1,   8
-  paddd   xmm5,   xmm1          ; dword 0 = dword 0 + dword 2
-  movd    [r15],  xmm5
-  ; The slot holds a 64-bit pointer (it is loaded into r15 above), so it has to
-  ; be advanced with a 64-bit add; a dword add would drop the carry into the
-  ; upper half.
-  add             qword psum16x16, 4
-
-  ; horizontal add of the four dwords in xmm4
-  movdqa  xmm5,   xmm4
-  psrldq  xmm5,   8
-  paddd   xmm4,   xmm5
-  movdqa  xmm3,   xmm4
-  psrldq  xmm3,   4
-  paddd   xmm4,   xmm3
-
-  mov             r15,    psqsum16x16
-  movd    [r15],  xmm4
-  add             qword psqsum16x16, 4    ; 64-bit pointer slot, see psum16x16 above
-
-  add             r14,16        ; four psad8x8 dwords were written
-  ; step back the 16 rows consumed above and move 16 pixels to the right
-  sub             r0,     r13
-  sub             r1,     r13
-  add             r0,     16
-  add             r1,     16
-
-  dec             r2
-  jnz             var_width_loop
-
-  pop     r2
-  %assign push_num push_num-1
-  ; next block row: row start + 16 * stride
-  mov             r0,     r11
-  mov             r1,     r12
-  add             r0,     r13
-  add             r1,     r13
-  dec     r3
-  jnz             var_height_loop
-
-  mov             r15,    psadframe
-  movdqa  xmm5,   xmm7
-  psrldq  xmm7,   8
-  paddd   xmm7,   xmm5          ; dword 0 = dword 0 + dword 2 of the frame accumulator
-  movd    [r15],  xmm7
-
-  POP_XMM
-  pop r15
-  pop r14
-  pop r13
-  pop r12
-%assign push_num 0
-%undef          cur_data
-%undef          ref_data
-%undef          iPicWidth
-%undef          iPicHeight
-%undef          iPicStride
-%undef          psadframe
-%undef          psad8x8
-%undef          psum16x16
-%undef          psqsum16x16
-%undef          tmp_esi
-%undef          tmp_edi
-%undef          pushsize
-%undef          localsize
-  ret
-
-%endif
-
-%ifdef X86_32
-
-;*************************************************************************************************************
-;void VAACalcSadSsd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
-;       int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
-;*************************************************************************************************************
-
-
-WELS_EXTERN VAACalcSadSsd_sse2
-; 32-bit implementation.  All arguments are read from the stack and the function
-; ends in a plain `ret`.  Walks the picture in 16x16 blocks and, per block, stores
-;   psad8x8      : four dwords (two after each 8-row half)
-;   psum16x16    : one dword
-;   psqsum16x16  : one dword
-;   psqdiff16x16 : one dword
-; then stores the accumulated frame SAD (kept in the stack local tmp_sadframe)
-; through psadframe.
-; The per-row arithmetic lives in WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2, which is
-; defined outside this function; accumulator roles follow the inline comments.
-;
-; Register roles inside the loops:
-;   esi / edi  current / reference read pointers
-;   ebx        iPicStride,  eax = iPicStride * 16
-;   ecx        column counter (reloaded from iPicWidth per block row)
-;   edx        psad8x8 write cursor,  ebp scratch
-;
-; The argument slots iPicWidth, iPicHeight, psum16x16, psqsum16x16 and
-; psqdiff16x16 are modified in place on the caller's stack.
-; Both loops test at the bottom, so a width or height below 16 would wrap.
-%define         localsize               12
-%define         cur_data                        esp + pushsize + localsize + 4
-%define         ref_data                        esp + pushsize + localsize + 8
-%define         iPicWidth                       esp + pushsize + localsize + 12
-%define         iPicHeight                      esp + pushsize + localsize + 16
-%define         iPicStride                      esp + pushsize + localsize + 20
-%define         psadframe                       esp + pushsize + localsize + 24
-%define         psad8x8                         esp + pushsize + localsize + 28
-%define         psum16x16                       esp + pushsize + localsize + 32
-%define         psqsum16x16                     esp + pushsize + localsize + 36
-%define         psqdiff16x16            esp + pushsize + localsize + 40
-%define         tmp_esi                         esp + 0
-%define         tmp_edi                         esp + 4
-%define         tmp_sadframe            esp + 8
-%define         pushsize                16
-  push    ebp
-  push    esi
-  push    edi
-  push    ebx
-  sub             esp,    localsize
-
-  ; (the original loaded iPicWidth/iPicHeight into ecx here and again after the
-  ;  shifts; ecx is rewritten at sqdiff_height_loop before any use, so those
-  ;  loads were dead and have been removed)
-  mov             esi,    [cur_data]
-  mov             edi,    [ref_data]
-  mov             ebx,    [iPicStride]
-  mov             edx,    [psad8x8]
-  mov             eax,    ebx
-
-  shr             dword [iPicWidth],      4                                       ; iPicWidth/16
-  shr             dword [iPicHeight],     4                                       ; iPicHeight/16
-  shl             eax,    4                                                       ; iPicStride*16
-  pxor    xmm0,   xmm0
-  movd    [tmp_sadframe], xmm0            ; frame SAD accumulator = 0
-sqdiff_height_loop:
-  mov             ecx,    dword [iPicWidth]
-  mov             [tmp_esi],      esi     ; remember where this block row starts
-  mov             [tmp_edi],      edi
-sqdiff_width_loop:
-  pxor    xmm7,   xmm7            ; hiQuad_loQuad pSad8x8
-  pxor    xmm6,   xmm6            ; pSum16x16
-  pxor    xmm5,   xmm5            ; sqsum_16x16  four dword
-  pxor    xmm4,   xmm4            ; sqdiff_16x16  four Dword
-  ; rows 0..7 of the block
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  movdqa  xmm1,           xmm7
-  movd    [edx],          xmm7    ; dword 0 of xmm7
-  psrldq  xmm7,           8
-  paddd   xmm1,           xmm7    ; dword 0 = sum of both 8x8 SADs
-  movd    [edx+4],        xmm7    ; former dword 2 of xmm7
-  movd    ebp,            xmm1
-  add             [tmp_sadframe], ebp
-
-  pxor    xmm7,   xmm7
-  ; rows 8..15 of the block
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  movdqa  xmm1,           xmm7
-  movd    [edx+8],        xmm7
-  psrldq  xmm7,           8
-  paddd   xmm1,           xmm7
-  movd    [edx+12],       xmm7
-  movd    ebp,            xmm1
-  add             [tmp_sadframe], ebp
-
-  mov             ebp,    [psum16x16]
-  movdqa  xmm1,   xmm6
-  psrldq  xmm1,   8
-  paddd   xmm6,   xmm1            ; dword 0 = dword 0 + dword 2
-  movd    [ebp],  xmm6
-  add             dword [psum16x16], 4
-
-  ; horizontal add of the four dwords in xmm5
-  mov             ebp,    [psqsum16x16]
-  pshufd  xmm6,   xmm5,   14 ;00001110
-  paddd   xmm6,   xmm5
-  pshufd  xmm5,   xmm6,   1  ;00000001
-  paddd   xmm5,   xmm6
-  movd    [ebp],  xmm5
-  add             dword [psqsum16x16], 4
-
-  ; horizontal add of the four dwords in xmm4
-  mov             ebp,    [psqdiff16x16]
-  pshufd  xmm5,   xmm4,   14      ; 00001110
-  paddd   xmm5,   xmm4
-  pshufd  xmm4,   xmm5,   1       ; 00000001
-  paddd   xmm4,   xmm5
-  movd    [ebp],  xmm4
-  add             dword   [psqdiff16x16], 4
-
-  add             edx,    16      ; four psad8x8 dwords were written
-  ; step back the 16 rows consumed above and move 16 pixels to the right
-  sub             esi,    eax
-  sub             edi,    eax
-  add             esi,    16
-  add             edi,    16
-
-  dec             ecx
-  jnz             sqdiff_width_loop
-
-  ; next block row: row start + 16 * stride
-  mov             esi,    [tmp_esi]
-  mov             edi,    [tmp_edi]
-  add             esi,    eax
-  add             edi,    eax
-
-  dec     dword [iPicHeight]
-  jnz             sqdiff_height_loop
-
-  mov             ebx,    [tmp_sadframe]
-  mov             eax,    [psadframe]
-  mov             [eax],  ebx
-
-  add             esp,    localsize
-  pop             ebx
-  pop             edi
-  pop             esi
-  pop             ebp
-%undef          cur_data
-%undef          ref_data
-%undef          iPicWidth
-%undef          iPicHeight
-%undef          iPicStride
-%undef          psadframe
-%undef          psad8x8
-%undef          psum16x16
-%undef          psqsum16x16
-%undef          psqdiff16x16
-%undef          tmp_esi
-%undef          tmp_edi
-%undef          tmp_sadframe
-%undef          pushsize
-%undef          localsize
-  ret
-
-%else
-
-
-;*************************************************************************************************************
-;void VAACalcSadSsd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
-;       int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
-;*************************************************************************************************************
-
-
-WELS_EXTERN VAACalcSadSsd_sse2
-; 64-bit implementation.  Walks the picture in 16x16 blocks and, per block, stores
-;   psad8x8      : four dwords (two after each 8-row half)
-;   psum16x16    : one dword
-;   psqsum16x16  : one dword
-;   psqdiff16x16 : one dword
-; then stores the low dword of the frame SAD accumulator (xmm8) through psadframe.
-; The per-row arithmetic lives in WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2, which is
-; defined outside this function; accumulator roles follow the inline comments.
-;
-; Register roles inside the loops:
-;   r0 / r1   current / reference read pointers
-;   r2        width  / 16 column counter (pushed once per block row)
-;   r3        height / 16 row counter
-;   r4        iPicStride,  r13 = iPicStride * 16
-;   r10 / r11 copies of r0 / r1 taken at the start of each block row
-;   r14       psad8x8 write cursor,  r15 scratch
-;
-; Both loops test at the bottom, so a width or height below 16 would wrap.
-; The psum16x16 / psqsum16x16 / psqdiff16x16 argument slots are advanced in place.
-; push_num is kept in step with every push/pop; presumably the argN macros
-; (defined outside this chunk) use it to locate stack arguments.
-; localsize is defined for symmetry with the 32-bit variant but is not used here.
-%define         localsize               12
-%define         cur_data                        arg1;r0
-%define         ref_data                        arg2;r1
-%define         iPicWidth                       arg3;r2
-%define         iPicHeight                      arg4;r3
-%define         iPicStride                      arg5;
-%define         psadframe                       arg6;
-%define         psad8x8                         arg7;
-%define         psum16x16                       arg8;
-%define         psqsum16x16                     arg9;
-%define         psqdiff16x16                    arg10
-
-  push r12
-  push r13
-  push r14
-  push r15
-  %assign push_num 4
-  PUSH_XMM 10
-
-%ifdef WIN64
-  mov r4,arg5                   ; iPicStride
-%endif
-  mov r14,arg7                  ; r14 = psad8x8 write cursor
-  SIGN_EXTENSION r2,r2d
-  SIGN_EXTENSION r3,r3d
-  SIGN_EXTENSION r4,r4d
-
-  mov        r13,r4
-  shr     r2,4   ; iPicWidth/16
-  shr     r3,4   ; iPicHeight/16
-  shl     r13,4   ; iPicStride*16
-  pxor    xmm0,   xmm0
-  pxor  xmm8, xmm8  ;framesad
-  pxor  xmm9, xmm9
-sqdiff_height_loop:
-  push r2                       ; keep the column count for the next block row
-  %assign push_num push_num +1
-  mov             r10,    r0    ; remember where this block row starts
-  mov             r11,    r1
-sqdiff_width_loop:
-  pxor    xmm7,   xmm7            ; hiQuad_loQuad pSad8x8
-  pxor    xmm6,   xmm6            ; pSum16x16
-  pxor    xmm5,   xmm5            ; sqsum_16x16  four dword
-  pxor    xmm4,   xmm4            ; sqdiff_16x16  four Dword
-  ; rows 0..7 of the block
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  movdqa  xmm1,           xmm7
-  movd    [r14],          xmm7    ; dword 0 of xmm7
-  psrldq  xmm7,           8
-  paddd   xmm1,           xmm7    ; dword 0 = sum of both 8x8 SADs
-  movd    [r14+4],        xmm7    ; former dword 2 of xmm7
-  ; route through r15d so only the low dword reaches the frame accumulator
-  movd    r15d,           xmm1
-  movd  xmm9, r15d
-  paddd xmm8,xmm9
-
-
-  pxor    xmm7,   xmm7
-  ; rows 8..15 of the block
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  movdqa  xmm1,           xmm7
-  movd    [r14+8],        xmm7
-  psrldq  xmm7,           8
-  paddd   xmm1,           xmm7
-  movd    [r14+12],       xmm7
-  movd    r15d,           xmm1
-  movd  xmm9, r15d
-  paddd xmm8,xmm9
-
-  mov             r15,    psum16x16
-  movdqa  xmm1,   xmm6
-  psrldq  xmm1,   8
-  paddd   xmm6,   xmm1            ; dword 0 = dword 0 + dword 2
-  movd    [r15],  xmm6
-  ; The slot holds a 64-bit pointer (it is loaded into r15 above), so it has to
-  ; be advanced with a 64-bit add; a dword add would drop the carry into the
-  ; upper half.
-  add             qword psum16x16, 4
-
-  ; horizontal add of the four dwords in xmm5
-  mov             r15,    psqsum16x16
-  pshufd  xmm6,   xmm5,   14 ;00001110
-  paddd   xmm6,   xmm5
-  pshufd  xmm5,   xmm6,   1  ;00000001
-  paddd   xmm5,   xmm6
-  movd    [r15],  xmm5
-  add             qword psqsum16x16, 4    ; 64-bit pointer slot, see psum16x16 above
-
-  ; horizontal add of the four dwords in xmm4
-  mov             r15,    psqdiff16x16
-  pshufd  xmm5,   xmm4,   14      ; 00001110
-  paddd   xmm5,   xmm4
-  pshufd  xmm4,   xmm5,   1       ; 00000001
-  paddd   xmm4,   xmm5
-  movd    [r15],  xmm4
-  add             qword   psqdiff16x16,   4    ; 64-bit pointer slot, see psum16x16 above
-
-  add             r14,16          ; four psad8x8 dwords were written
-  ; step back the 16 rows consumed above and move 16 pixels to the right
-  sub             r0,     r13
-  sub             r1,     r13
-  add             r0,     16
-  add             r1,     16
-
-  dec             r2
-  jnz             sqdiff_width_loop
-
-  pop r2
-  %assign push_num push_num -1
-
-  ; next block row: row start + 16 * stride
-  mov             r0,     r10
-  mov             r1,     r11
-  add             r0,     r13
-  add             r1,     r13
-
-  dec     r3
-  jnz             sqdiff_height_loop
-
-  mov             r13,    psadframe
-  movd    [r13],  xmm8
-
-  POP_XMM
-  pop r15
-  pop r14
-  pop r13
-  pop r12
-  %assign push_num 0
-
-%undef          cur_data
-%undef          ref_data
-%undef          iPicWidth
-%undef          iPicHeight
-%undef          iPicStride
-%undef          psadframe
-%undef          psad8x8
-%undef          psum16x16
-%undef          psqsum16x16
-%undef          psqdiff16x16
-%undef          tmp_esi
-%undef          tmp_edi
-%undef          tmp_sadframe
-%undef          pushsize
-%undef          localsize
-  ret
-
-
-
-%endif
-
-%ifdef X86_32
-;*************************************************************************************************************
-;void VAACalcSadBgd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
-;                               int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8)
-;*************************************************************************************************************
-
-
-WELS_EXTERN VAACalcSadBgd_sse2
-; 32-bit implementation.  All arguments are read from the stack and the function
-; ends in a plain `ret`.  Walks the picture in 16x16 blocks and, per block, stores
-;   psad8x8  : 16 bytes (four dwords) with one aligned movdqa
-;   p_sd8x8  : 16 bytes (four dwords) with one aligned movdqa
-;   p_mad8x8 : four bytes (two after each 8-row half)
-; then stores the frame SAD (accumulated in ebp) through psadframe.
-; movdqa faults on unaligned memory, so psad8x8 and p_sd8x8 must be 16-byte aligned.
-; The per-row arithmetic lives in WELS_SAD_SD_MAD_16x1_SSE2 and the byte-maximum
-; reduction in WELS_MAX_REG_SSE2; both are defined outside this function, so the
-; accumulator roles below follow the existing inline comments.
-;
-; Register roles inside the loops:
-;   esi / edi  current / reference read pointers
-;   ebx        iPicStride,  eax = iPicStride * 16
-;   ecx        column counter; spilled to tmp_ecx because cl is used for byte stores
-;   edx        scratch pointer / value,  ebp frame SAD
-;
-; The argument slots iPicWidth, iPicHeight, psad8x8, p_sd8x8 and p_mad8x8 are
-; modified in place on the caller's stack.
-; Both loops test at the bottom, so a width or height below 16 would wrap.
-%define         localsize               12
-%define         cur_data                        esp + pushsize + localsize + 4
-%define         ref_data                        esp + pushsize + localsize + 8
-%define         iPicWidth                       esp + pushsize + localsize + 12
-%define         iPicHeight                      esp + pushsize + localsize + 16
-%define         iPicStride                      esp + pushsize + localsize + 20
-%define         psadframe                       esp + pushsize + localsize + 24
-%define         psad8x8                         esp + pushsize + localsize + 28
-%define         p_sd8x8                         esp + pushsize + localsize + 32
-%define         p_mad8x8                        esp + pushsize + localsize + 36
-%define         tmp_esi                         esp + 0
-%define         tmp_edi                         esp + 4
-%define         tmp_ecx                         esp + 8
-%define         pushsize                16
-  push    ebp
-  push    esi
-  push    edi
-  push    ebx
-  sub             esp,    localsize
-  mov             esi,    [cur_data]
-  mov             edi,    [ref_data]
-  mov             ebx,    [iPicStride]
-  mov             eax,    ebx
-
-  shr             dword [iPicWidth],      4                                       ; iPicWidth/16
-  shr             dword [iPicHeight],     4                                       ; iPicHeight/16
-  shl             eax,    4                                                       ; iPicStride*16
-  xor             ebp,    ebp                                                     ; frame SAD = 0
-  pxor    xmm0,   xmm0
-bgd_height_loop:
-  mov             ecx,    dword [iPicWidth]
-  mov             [tmp_esi],      esi     ; remember where this block row starts
-  mov             [tmp_edi],      edi
-bgd_width_loop:
-  pxor    xmm7,   xmm7            ; pSad8x8
-  pxor    xmm6,   xmm6            ; sum_cur_8x8
-  pxor    xmm5,   xmm5            ; sum_ref_8x8
-  pxor    xmm4,   xmm4            ; pMad8x8
-  ; rows 0..7 of the block
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-
-
-  mov                     edx,            [p_mad8x8]
-  WELS_MAX_REG_SSE2       xmm4
-
-  ; store byte 0 of each qword of xmm4 as the two MAD bytes of this half
-  mov                     [tmp_ecx],      ecx     ; free cl for the byte stores
-  movhlps         xmm1,   xmm4
-  movd            ecx,    xmm4
-  mov                     [edx],  cl
-  movd            ecx,    xmm1
-  mov                     [edx+1],cl
-  add                     edx,    2
-  mov                     [p_mad8x8],     edx
-
-
-  ; move the first-half results up one dword so the second half lands beside them
-  pslldq          xmm7,   4
-  pslldq          xmm6,   4
-  pslldq          xmm5,   4
-
-
-  pxor    xmm4,   xmm4            ; pMad8x8
-  ; rows 8..15 of the block
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-
-  mov                     edx,            [p_mad8x8]
-  WELS_MAX_REG_SSE2       xmm4
-
-  movhlps         xmm1,   xmm4
-  movd            ecx,    xmm4
-  mov                     [edx],  cl
-  movd            ecx,    xmm1
-  mov                     [edx+1],cl
-  add                     edx,    2
-  mov                     [p_mad8x8],     edx
-
-  ; data in xmm7, xmm6, xmm5:  D1 D3 D0 D2
-
-  mov             edx,    [psad8x8]
-  pshufd  xmm1,   xmm7,   10001101b               ; D3 D2 D1 D0
-  movdqa  [edx],  xmm1
-  add             edx,    16
-  mov             [psad8x8],      edx                                     ; sad8x8
-
-  paddd   xmm1,   xmm7                                    ; D1+3 D3+2 D0+1 D2+0
-  pshufd  xmm2,   xmm1,   00000011b                       ; dword 0 = D1+3
-  paddd   xmm1,   xmm2                                    ; dword 0 = D0+D1+D2+D3
-  movd    edx,    xmm1
-  add             ebp,    edx                                             ; sad frame
-
-  mov             edx,    [p_sd8x8]
-  psubd   xmm6,   xmm5                                    ; sum_cur - sum_ref per 8x8
-  pshufd  xmm1,   xmm6,   10001101b
-  movdqa  [edx],  xmm1
-  add             edx,    16
-  mov             [p_sd8x8],      edx
-
-
-  ; (the original had a second `add edx, 16` here; edx is reloaded before its
-  ;  next use, so the instruction was dead and has been removed)
-  ; step back the 16 rows consumed above and move 16 pixels to the right
-  sub             esi,    eax
-  sub             edi,    eax
-  add             esi,    16
-  add             edi,    16
-
-  mov             ecx,    [tmp_ecx]
-  dec             ecx
-  jnz             bgd_width_loop
-
-  ; next block row: row start + 16 * stride
-  mov             esi,    [tmp_esi]
-  mov             edi,    [tmp_edi]
-  add             esi,    eax
-  add             edi,    eax
-
-  dec             dword [iPicHeight]
-  jnz             bgd_height_loop
-
-  mov             edx,    [psadframe]
-  mov             [edx],  ebp
-
-  add             esp,    localsize
-  pop             ebx
-  pop             edi
-  pop             esi
-  pop             ebp
-%undef          cur_data
-%undef          ref_data
-%undef          iPicWidth
-%undef          iPicHeight
-%undef          iPicStride
-%undef          psadframe
-%undef          psad8x8
-%undef          p_sd8x8
-%undef          p_mad8x8
-%undef          tmp_esi
-%undef          tmp_edi
-%undef          tmp_ecx
-%undef          pushsize
-%undef          localsize
-  ret
-
-
-
-;*************************************************************************************************************
-;void VAACalcSadSsdBgd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
-;                int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16,
-;                       int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8)
-;*************************************************************************************************************
-
-
-WELS_EXTERN VAACalcSadSsdBgd_sse2
-; 32-bit implementation.  All arguments are read from the stack and the function
-; ends in a plain `ret`.  Walks the picture in 16x16 blocks and, per block, stores
-;   psad8x8      : four dwords (two after each 8-row half)
-;   psum16x16    : one dword (written after the top half, added to after the bottom)
-;   psqsum16x16  : one dword
-;   psqdiff16x16 : one dword
-;   p_sd8x8      : four dwords (two after each 8-row half)
-;   p_mad8x8     : four bytes  (two after each 8-row half)
-; then stores the frame SAD (kept in the stack local tmp_sadframe) through psadframe.
-; The per-row arithmetic lives in WELS_SAD_BGD_SQDIFF_16x1_SSE2 and the byte-maximum
-; reduction in WELS_MAX_REG_SSE2; both are defined outside this function, so the
-; accumulator layouts below follow the existing inline comments.
-;
-; Register roles inside the loops:
-;   esi / edi  current / reference read pointers
-;   ebx        iPicStride,  eax = iPicStride * 16
-;   ecx        column counter; spilled to tmp_ecx because cl is used for byte stores
-;   edx, ebp   scratch
-;
-; Every output-pointer argument slot, plus iPicWidth and iPicHeight, is modified
-; in place on the caller's stack.
-; Both loops test at the bottom, so a width or height below 16 would wrap.
-%define         localsize               16
-%define         cur_data                        esp + pushsize + localsize + 4
-%define         ref_data                        esp + pushsize + localsize + 8
-%define         iPicWidth                       esp + pushsize + localsize + 12
-%define         iPicHeight                      esp + pushsize + localsize + 16
-%define         iPicStride                      esp + pushsize + localsize + 20
-%define         psadframe                       esp + pushsize + localsize + 24
-%define         psad8x8                         esp + pushsize + localsize + 28
-%define         psum16x16                       esp + pushsize + localsize + 32
-%define         psqsum16x16                     esp + pushsize + localsize + 36
-%define         psqdiff16x16            esp + pushsize + localsize + 40
-%define         p_sd8x8                         esp + pushsize + localsize + 44
-%define         p_mad8x8                        esp + pushsize + localsize + 48
-%define         tmp_esi                         esp + 0
-%define         tmp_edi                         esp + 4
-%define         tmp_sadframe            esp + 8
-%define         tmp_ecx                         esp + 12
-%define         pushsize                16
-  push    ebp
-  push    esi
-  push    edi
-  push    ebx
-  sub             esp,    localsize
-  mov             esi,    [cur_data]
-  mov             edi,    [ref_data]
-  mov             ebx,    [iPicStride]
-  mov             eax,    ebx
-
-  shr             dword [iPicWidth],      4                                       ; iPicWidth/16
-  shr             dword [iPicHeight],     4                                       ; iPicHeight/16
-  shl             eax,    4                                                       ; iPicStride*16
-  pxor    xmm0,   xmm0
-  movd    [tmp_sadframe], xmm0            ; frame SAD accumulator = 0
-sqdiff_bgd_height_loop:
-  mov             ecx,    dword [iPicWidth]
-  mov             [tmp_esi],      esi     ; remember where this block row starts
-  mov             [tmp_edi],      edi
-sqdiff_bgd_width_loop:
-  pxor    xmm7,   xmm7            ; pSad8x8 interleaves sqsum16x16:  sqsum1 sad1 sqsum0 sad0
-  pxor    xmm6,   xmm6            ; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
-  pxor    xmm5,   xmm5            ; pMad8x8
-  pxor    xmm4,   xmm4            ; sqdiff_16x16  four Dword
-  ; rows 0..7 of the block
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-
-  mov             edx,            [psad8x8]
-  movdqa  xmm2,           xmm7
-  pshufd  xmm1,           xmm2,           00001110b       ; dword 0 = sad1
-  movd    [edx],          xmm2                            ; sad0
-  movd    [edx+4],        xmm1                            ; sad1
-  add             edx,            8
-  mov             [psad8x8],      edx                     ; sad8x8
-
-  paddd   xmm1,                           xmm2            ; dword 0 = sad0 + sad1
-  movd    edx,                            xmm1
-  add             [tmp_sadframe],         edx                     ; iFrameSad
-
-  mov             edx,            [psum16x16]
-  movdqa  xmm1,           xmm6
-  pshufd  xmm2,           xmm1,           00001110b
-  paddd   xmm1,           xmm2                            ; dword 0 = Scur0 + Scur1
-  movd    [edx],          xmm1                            ; sum (top half; bottom half is added below)
-
-  mov             edx,            [p_sd8x8]
-  pshufd  xmm1,           xmm6,           11110101b                       ; Sref1 Sref1 Sref0 Sref0
-  psubd   xmm6,           xmm1            ; 00 diff1 00 diff0
-  pshufd  xmm1,           xmm6,           00001000b                       ;  xx xx diff1 diff0
-  movq    [edx],          xmm1
-  add             edx,            8
-  mov             [p_sd8x8],      edx
-
-  mov                     edx,            [p_mad8x8]
-  WELS_MAX_REG_SSE2       xmm5
-  ; store byte 0 of each qword of xmm5 as the two MAD bytes of this half
-  mov                     [tmp_ecx],      ecx     ; free cl for the byte stores
-  movhlps         xmm1,   xmm5
-  movd            ecx,    xmm5
-  mov                     [edx],  cl
-  movd            ecx,    xmm1
-  mov                     [edx+1],cl
-  add                     edx,    2
-  mov                     [p_mad8x8],     edx
-
-  psrlq   xmm7,   32
-  psllq   xmm7,   32                      ; clear sad
-  pxor    xmm6,   xmm6            ; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
-  pxor    xmm5,   xmm5            ; pMad8x8
-  ; rows 8..15 of the block
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-
-  mov             edx,            [psad8x8]
-  movdqa  xmm2,           xmm7
-  pshufd  xmm1,           xmm2,           00001110b
-  movd    [edx],          xmm2
-  movd    [edx+4],        xmm1
-  add             edx,            8
-  mov             [psad8x8],      edx                     ; sad8x8
-
-  paddd   xmm1,                           xmm2
-  movd    edx,                            xmm1
-  add             [tmp_sadframe],         edx                     ; iFrameSad
-
-  mov             edx,                    [psum16x16]
-  movdqa  xmm1,                   xmm6
-  pshufd  xmm2,                   xmm1,           00001110b
-  paddd   xmm1,                   xmm2
-  movd    ebp,                    xmm1                            ; sum
-  add             [edx],                  ebp                     ; top half + bottom half
-  add             edx,                    4
-  mov             [psum16x16],    edx
-
-  mov             edx,                    [psqsum16x16]
-  psrlq   xmm7,                   32                              ; bring the sqsum dwords down
-  pshufd  xmm2,                   xmm7,           00001110b
-  paddd   xmm2,                   xmm7                            ; dword 0 = sqsum0 + sqsum1
-  movd    [edx],                  xmm2                            ; sqsum
-  add             edx,                    4
-  mov             [psqsum16x16],  edx
-
-  mov             edx,            [p_sd8x8]
-  pshufd  xmm1,           xmm6,           11110101b                       ; Sref1 Sref1 Sref0 Sref0
-  psubd   xmm6,           xmm1            ; 00 diff1 00 diff0
-  pshufd  xmm1,           xmm6,           00001000b                       ;  xx xx diff1 diff0
-  movq    [edx],          xmm1
-  add             edx,            8
-  mov             [p_sd8x8],      edx
-
-  mov             edx,            [p_mad8x8]
-  WELS_MAX_REG_SSE2       xmm5
-  movhlps         xmm1,   xmm5
-  movd            ecx,    xmm5
-  mov                     [edx],  cl
-  movd            ecx,    xmm1
-  mov                     [edx+1],cl
-  add                     edx,    2
-  mov                     [p_mad8x8],     edx
-
-  ; horizontal add of the four dwords in xmm4
-  mov             edx,            [psqdiff16x16]
-  pshufd  xmm1,           xmm4,           00001110b
-  paddd   xmm4,           xmm1
-  pshufd  xmm1,           xmm4,           00000001b
-  paddd   xmm4,           xmm1
-  movd    [edx],          xmm4
-  add             edx,            4
-  mov             [psqdiff16x16], edx
-
-  ; (the original had an `add edx, 16` here; edx is reloaded before its next
-  ;  use, so the instruction was dead and has been removed)
-  ; step back the 16 rows consumed above and move 16 pixels to the right
-  sub             esi,    eax
-  sub             edi,    eax
-  add             esi,    16
-  add             edi,    16
-
-  mov             ecx,    [tmp_ecx]
-  dec             ecx
-  jnz             sqdiff_bgd_width_loop
-
-  ; next block row: row start + 16 * stride
-  mov             esi,    [tmp_esi]
-  mov             edi,    [tmp_edi]
-  add             esi,    eax
-  add             edi,    eax
-
-  dec     dword [iPicHeight]
-  jnz             sqdiff_bgd_height_loop
-
-  mov             edx,    [psadframe]
-  mov             ebp,    [tmp_sadframe]
-  mov             [edx],  ebp
-
-  add             esp,    localsize
-  pop             ebx
-  pop             edi
-  pop             esi
-  pop             ebp
-%undef          cur_data
-%undef          ref_data
-%undef          iPicWidth
-%undef          iPicHeight
-%undef          iPicStride
-%undef          psadframe
-%undef          psad8x8
-%undef          psum16x16
-%undef          psqsum16x16
-%undef          psqdiff16x16
-%undef          p_sd8x8
-%undef          p_mad8x8
-%undef          tmp_esi
-%undef          tmp_edi
-%undef          tmp_sadframe
-%undef          tmp_ecx
-%undef          pushsize
-%undef          localsize
-   ret
-%else
-
-;*************************************************************************************************************
-;void VAACalcSadBgd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
-;                               int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8)
-;*************************************************************************************************************
-
-
-WELS_EXTERN VAACalcSadBgd_sse2
-%define         cur_data                        arg1;
-%define         ref_data                        arg2;
-%define         iPicWidth                       arg3;
-%define         iPicHeight                      arg4;
-%define         iPicStride                      arg5;
-%define         psadframe                       arg6;
-%define         psad8x8                         arg7;
-%define         p_sd8x8                         arg8;
-%define         p_mad8x8                        arg9;
-
-  push r12
-  push r13
-  push r14
-  push r15
-%assign push_num 4
-  PUSH_XMM 10
-%ifdef WIN64
-  mov r4,arg5
-  ;  mov r5,arg6
-%endif
-  mov r14,arg7
-  SIGN_EXTENSION r2,r2d
-  SIGN_EXTENSION r3,r3d
-  SIGN_EXTENSION r4,r4d
-
-
-  mov     r13,r4
-  mov     r15,r0
-  shr     r2,4
-  shr     r3,4
-  shl     r13,4
-  pxor    xmm0,   xmm0
-  pxor    xmm8,   xmm8
-  pxor    xmm9,   xmm9
-bgd_height_loop:
-  ;mov            ecx,    dword [iPicWidth]
-  push r2
-  %assign push_num push_num+1
-  mov             r10,    r15
-  mov             r11,    r1
-bgd_width_loop:
-  pxor    xmm7,   xmm7            ; pSad8x8
-  pxor    xmm6,   xmm6            ; sum_cur_8x8
-  pxor    xmm5,   xmm5            ; sum_ref_8x8
-  pxor    xmm4,   xmm4            ; pMad8x8
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-
-
-  mov                     r14,            p_mad8x8
-  WELS_MAX_REG_SSE2       xmm4
-
-  ;mov                    [tmp_ecx],      ecx
-  movhlps         xmm1,   xmm4
-  movd            r0d,    xmm4
-
-
-  mov                     [r14],  r0b
-  movd            r0d,    xmm1
-  mov                     [r14+1],r0b
-  add                     r14,    2
-  ;mov                     p_mad8x8,       r14
-
-
-  pslldq          xmm7,   4
-  pslldq          xmm6,   4
-  pslldq          xmm5,   4
-
-
-  pxor    xmm4,   xmm4            ; pMad8x8
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-
-  ;mov                     r14,            [p_mad8x8]
-  WELS_MAX_REG_SSE2       xmm4
-
-  movhlps         xmm1,   xmm4
-  movd            r0d,    xmm4
-  mov                     [r14],  r0b
-  movd            r0d,    xmm1
-  mov                     [r14+1],r0b
-  add                     r14,    2
-  mov                     p_mad8x8,       r14
-
-  ; data in xmm7, xmm6, xmm5:  D1 D3 D0 D2
-
-  mov             r14,    psad8x8
-  pshufd  xmm1,   xmm7,   10001101b               ; D3 D2 D1 D0
-  movdqa  [r14],  xmm1
-  add             r14,    16
-  mov             psad8x8,        r14                                     ; sad8x8
-
-  paddd   xmm1,   xmm7                                    ; D1+3 D3+2 D0+1 D2+0
-  pshufd  xmm2,   xmm1,   00000011b
-  paddd   xmm1,   xmm2
-  movd    r14d,   xmm1
-  movd    xmm9, r14d
-  paddd   xmm8,   xmm9                                            ; sad frame
-
-  mov             r14,    p_sd8x8
-  psubd   xmm6,   xmm5
-  pshufd  xmm1,   xmm6,   10001101b
-  movdqa  [r14],  xmm1
-  add             r14,    16
-  mov             p_sd8x8,        r14
-
-
-  ;add            edx,    16
-  sub             r15,    r13
-  sub             r1,     r13
-  add             r15,    16
-  add             r1,     16
-
-
-  dec             r2
-  jnz             bgd_width_loop
-  pop     r2
-%assign push_num push_num-1
-  mov             r15,    r10
-  mov             r1,     r11
-  add             r15,    r13
-  add             r1,     r13
-
-  dec             r3
-  jnz             bgd_height_loop
-
-  mov             r13,    psadframe
-  movd    [r13],  xmm8
-
-  POP_XMM
-  pop r15
-  pop r14
-  pop r13
-  pop r12
-%assign push_num 0
-%undef          cur_data
-%undef          ref_data
-%undef          iPicWidth
-%undef          iPicHeight
-%undef          iPicStride
-%undef          psadframe
-%undef          psad8x8
-%undef          p_sd8x8
-%undef          p_mad8x8
-%undef          tmp_esi
-%undef          tmp_edi
-%undef          pushsize
-%undef          localsize
-  ret
-
-
-
-;*************************************************************************************************************
-;void VAACalcSadSsdBgd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
-;                int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16,
-;                       int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8)
-;*************************************************************************************************************
-
-
-WELS_EXTERN VAACalcSadSsdBgd_sse2
-%define         cur_data                        arg1;
-%define         ref_data                        arg2;
-%define         iPicWidth                       arg3;
-%define         iPicHeight                      arg4;
-%define         iPicStride                      arg5;
-%define         psadframe                       arg6;
-%define         psad8x8                         arg7;
-%define         psum16x16                       arg8;
-%define         psqsum16x16                     arg9;
-%define         psqdiff16x16                    arg10;
-%define         p_sd8x8                         arg11
-%define         p_mad8x8                        arg12
-
-  push r12
-  push r13
-  push r14
-  push r15
-%assign push_num 4
-  PUSH_XMM 10
-%ifdef WIN64
-  mov r4,arg5
-  ;mov r5,arg6
-%endif
-  SIGN_EXTENSION r2,r2d
-  SIGN_EXTENSION r3,r3d
-  SIGN_EXTENSION r4,r4d
-
-  mov     r13,r4
-  shr             r2,     4                                       ; iPicWidth/16
-  shr             r3,     4                                       ; iPicHeight/16
-  shl             r13,    4                                                       ; iPicStride*16
-  pxor    xmm0,   xmm0
-  pxor    xmm8,   xmm8
-  pxor    xmm9,   xmm9
-
-
-sqdiff_bgd_height_loop:
-  mov             r10,    r0
-  mov             r11,    r1
-  push r2
-%assign push_num push_num+1
-sqdiff_bgd_width_loop:
-
-  pxor    xmm7,   xmm7            ; pSad8x8 interleaves sqsum16x16:  sqsum1 sad1 sqsum0 sad0
-  pxor    xmm6,   xmm6            ; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
-  pxor    xmm5,   xmm5            ; pMad8x8
-  pxor    xmm4,   xmm4            ; sqdiff_16x16  four Dword
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-
-  mov             r14,            psad8x8
-  movdqa  xmm2,           xmm7
-  pshufd  xmm1,           xmm2,           00001110b
-  movd    [r14],          xmm2
-  movd    [r14+4],        xmm1
-  add             r14,            8
-  mov             psad8x8,        r14                     ; sad8x8
-
-  paddd   xmm1,                           xmm2
-  movd    r14d,                           xmm1
-  movd    xmm9,r14d
-  paddd           xmm8,           xmm9                    ; iFrameSad
-
-  mov             r14,            psum16x16
-  movdqa  xmm1,           xmm6
-  pshufd  xmm2,           xmm1,           00001110b
-  paddd   xmm1,           xmm2
-  movd    [r14],          xmm1                            ; sum
-
-  mov             r14,            p_sd8x8
-  pshufd  xmm1,           xmm6,           11110101b                       ; Sref1 Sref1 Sref0 Sref0
-  psubd   xmm6,           xmm1            ; 00 diff1 00 diff0
-  pshufd  xmm1,           xmm6,           00001000b                       ;  xx xx diff1 diff0
-  movq    [r14],          xmm1
-  add             r14,            8
-  mov             p_sd8x8,        r14
-
-  mov                     r14,            p_mad8x8
-  WELS_MAX_REG_SSE2       xmm5
-
-  movhlps         xmm1,   xmm5
-  push r0
-  movd            r0d,    xmm5
-  mov                     [r14],  r0b
-  movd            r0d,    xmm1
-  mov                     [r14+1],r0b
-  pop r0
-  add                     r14,    2
-  mov                     p_mad8x8,       r14
-
-  psrlq   xmm7,   32
-  psllq   xmm7,   32                      ; clear sad
-  pxor    xmm6,   xmm6            ; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
-  pxor    xmm5,   xmm5            ; pMad8x8
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-
-  mov             r14,            psad8x8
-  movdqa  xmm2,           xmm7
-  pshufd  xmm1,           xmm2,           00001110b
-  movd    [r14],          xmm2
-  movd    [r14+4],        xmm1
-  add             r14,            8
-  mov             psad8x8,        r14                     ; sad8x8
-
-  paddd   xmm1,                           xmm2
-  movd    r14d,                           xmm1
-  movd    xmm9, r14d
-  paddd   xmm8,           xmm9            ; iFrameSad
-
-  mov             r14,                    psum16x16
-  movdqa  xmm1,                   xmm6
-  pshufd  xmm2,                   xmm1,           00001110b
-  paddd   xmm1,                   xmm2
-  movd    r15d,                   xmm1                            ; sum
-  add             [r14],                  r15d
-  add             r14,                    4
-  mov             psum16x16,      r14
-
-  mov             r14,                    psqsum16x16
-  psrlq   xmm7,                   32
-  pshufd  xmm2,                   xmm7,           00001110b
-  paddd   xmm2,                   xmm7
-  movd    [r14],                  xmm2                            ; sqsum
-  add             r14,                    4
-  mov             psqsum16x16,    r14
-
-  mov             r14,            p_sd8x8
-  pshufd  xmm1,           xmm6,           11110101b                       ; Sref1 Sref1 Sref0 Sref0
-  psubd   xmm6,           xmm1            ; 00 diff1 00 diff0
-  pshufd  xmm1,           xmm6,           00001000b                       ;  xx xx diff1 diff0
-  movq    [r14],          xmm1
-  add             r14,            8
-  mov             p_sd8x8,        r14
-
-  mov             r14,            p_mad8x8
-  WELS_MAX_REG_SSE2       xmm5
-
-
-  movhlps         xmm1,   xmm5
-  push r0
-  movd            r0d,    xmm5
-  mov                     [r14],  r0b
-  movd            r0d,    xmm1
-  mov                     [r14+1],r0b
-  pop r0
-  add                     r14,    2
-  mov                     p_mad8x8,       r14
-
-  mov             r14,            psqdiff16x16
-  pshufd  xmm1,           xmm4,           00001110b
-  paddd   xmm4,           xmm1
-  pshufd  xmm1,           xmm4,           00000001b
-  paddd   xmm4,           xmm1
-  movd    [r14],          xmm4
-  add             r14,            4
-  mov             psqdiff16x16,   r14
-
-  add             r14,    16
-  sub             r0,     r13
-  sub             r1,     r13
-  add             r0,     16
-  add             r1,     16
-
-  dec             r2
-  jnz             sqdiff_bgd_width_loop
-  pop r2
-  %assign push_num push_num-1
-  mov             r0,     r10
-  mov             r1,     r11
-  add             r0,     r13
-  add             r1,     r13
-
-  dec     r3
-  jnz             sqdiff_bgd_height_loop
-
-  mov             r14,    psadframe
-  movd    [r14],  xmm8
-
-  POP_XMM
-  pop r15
-  pop r14
-  pop r13
-  pop r12
-%assign push_num 0
-%undef          cur_data
-%undef          ref_data
-%undef          iPicWidth
-%undef          iPicHeight
-%undef          iPicStride
-%undef          psadframe
-%undef          psad8x8
-%undef          psum16x16
-%undef          psqsum16x16
-%undef          psqdiff16x16
-%undef          p_sd8x8
-%undef          p_mad8x8
-%undef          tmp_esi
-%undef          tmp_edi
-%undef          pushsize
-%undef          localsize
-  ret
-%endif
--- /dev/null
+++ b/codec/processing/src/x86/denoisefilter.asm
@@ -1,0 +1,272 @@
+;*!
+;* \copy
+;*     Copyright (c)  2010-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  denoisefilter.asm
+;*
+;*  Abstract
+;*      denoise for SVC2.1
+;*  History
+;*      4/13/2010 Created
+;*      7/30/2010 Modified
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+;***********************************************************************
+; Constant
+;***********************************************************************
+SECTION .rodata align=16
+
+sse2_32 times 8 dw 32
+sse2_20 times 8 dw 20
+
+
+
+;***********************************************************************
+; Code
+;***********************************************************************
+SECTION .text
+
+%macro	WEIGHT_LINE	9		; %1,%2,%8: scratch xmm  %3: base weight words  %4: weight total  %5: weighted sum  %6: center pixels as words  %7: zero at the call sites  %9: memory operand, 8 neighbour bytes
+		movq		%2,	%9					; load 8 neighbour pixels
+		punpcklbw	%2,	%7					; interleave with %7 to widen the bytes to words
+		movdqa		%8,	%2
+		; absolute difference built from two unsigned-saturating subtracts OR-ed together
+		movdqa		%1,	%6
+		psubusb		%1,	%8					; max(center - neighbour, 0)
+		psubusb		%8,	%6					; max(neighbour - center, 0)
+		por			%8,	%1		; ABS(curPixel - centerPixel);
+		; weight = (max(%3 - absdiff, 0) squared) >> 5; byte-wise subtract, so the high byte of every word in %3 is assumed to be zero
+		movdqa		%1,	%3
+		psubusb		%1,	%8
+
+		pmullw		%1,	%1
+		psrlw		%1,	5
+		pmullw		%2,	%1					; neighbour * weight (low 16 bits of each product)
+		paddusw		%4,	%1					; accumulate the weight, saturating
+		paddusw		%5,	%2					; accumulate the weighted pixel, saturating
+%endmacro
+
+%macro	WEIGHT_LINE1_UV	4		; %1: 16 source bytes of one row  %2: scratch  %3: word accumulator  %4: zero at the call sites; horizontal taps weighted 1 1 2 1 1
+		movdqa		%2,	%1
+		punpcklbw	%2,	%4					; low 8 bytes -> 8 words
+		paddw		%3,	%2					; tap 0, weight 1
+
+		movdqa		%2,	%1
+		psrldq		%2,	1					; lane j now holds source byte j+1
+		punpcklbw	%2,	%4
+		paddw		%3,	%2					; tap 1, weight 1
+
+		movdqa		%2,	%1
+		psrldq		%2,	2
+		punpcklbw	%2,	%4
+		psllw		%2,	1					; x2
+		paddw		%3,	%2					; tap 2, weight 2
+
+		movdqa		%2,	%1
+		psrldq		%2,	3
+		punpcklbw	%2,	%4
+		paddw		%3,	%2					; tap 3, weight 1
+
+		movdqa		%2,	%1
+		psrldq		%2,	4
+		punpcklbw	%2,	%4
+		paddw		%3,	%2					; tap 4, weight 1
+%endmacro
+
+%macro	WEIGHT_LINE2_UV	4		; %1: 16 source bytes of one row  %2: scratch  %3: word accumulator  %4: zero at the call sites; horizontal taps weighted 1 2 4 2 1
+		movdqa		%2,	%1
+		punpcklbw	%2,	%4					; low 8 bytes -> 8 words
+		paddw		%3,	%2					; tap 0, weight 1
+
+		movdqa		%2,	%1
+		psrldq		%2,	1					; lane j now holds source byte j+1
+		punpcklbw	%2,	%4
+		psllw		%2,	1					; x2
+		paddw		%3,	%2					; tap 1, weight 2
+
+		movdqa		%2,	%1
+		psrldq		%2,	2
+		punpcklbw	%2,	%4
+		psllw		%2,	2					; x4
+		paddw		%3,	%2					; tap 2, weight 4
+
+		movdqa		%2,	%1
+		psrldq		%2,	3
+		punpcklbw	%2,	%4
+		psllw		%2,	1					; x2
+		paddw		%3,	%2					; tap 3, weight 2
+
+		movdqa		%2,	%1
+		psrldq		%2,	4
+		punpcklbw	%2,	%4
+		paddw		%3,	%2					; tap 4, weight 1
+%endmacro
+
+%macro	WEIGHT_LINE3_UV	4		; %1: 16 source bytes of one row  %2: scratch  %3: word accumulator  %4: zero at the call sites; horizontal taps weighted 2 4 20 4 2
+		movdqa		%2,	%1
+		punpcklbw	%2,	%4					; low 8 bytes -> 8 words
+		psllw		%2,	1					; x2
+		paddw		%3,	%2					; tap 0, weight 2
+
+		movdqa		%2,	%1
+		psrldq		%2,	1					; lane j now holds source byte j+1
+		punpcklbw	%2,	%4
+		psllw		%2,	2					; x4
+		paddw		%3,	%2					; tap 1, weight 4
+
+		movdqa		%2,	%1
+		psrldq		%2,	2
+		punpcklbw	%2,	%4
+		pmullw		%2,	[sse2_20]			; x20 (sse2_20 holds eight words of 20)
+		paddw		%3,	%2					; tap 2, weight 20
+
+		movdqa		%2,	%1
+		psrldq		%2,	3
+		punpcklbw	%2,	%4
+		psllw		%2,	2					; x4
+		paddw		%3,	%2					; tap 3, weight 4
+
+		movdqa		%2,	%1
+		psrldq		%2,	4
+		punpcklbw	%2,	%4
+		psllw		%2,	1					; x2
+		paddw		%3,	%2					; tap 4, weight 2
+%endmacro
+
+;***********************************************************************
+;  BilateralLumaFilter8_sse2(uint8_t *pixels, int stride);
+;***********************************************************************
+;	1	2	3
+;	4	0	5
+;	6	7	8
+;	0:	the center point; 1-8: its neighbours, each processed by one WEIGHT_LINE
+;	Filters 8 adjacent center pixels in place: out = (sum(w*n) + (256 - sum(w)) * c) >> 8
+WELS_EXTERN BilateralLumaFilter8_sse2
+
+        push r3
+        %assign push_num 1
+        LOAD_2_PARA						; NOTE(review): macro defined outside this file; presumably leaves pixels in r0 and stride in r1 - confirm
+        PUSH_XMM 8						; NOTE(review): macro defined outside this file; presumably saves xmm registers where the ABI needs it - confirm
+
+		pxor		xmm7,	xmm7			; zero register for byte -> word unpacking
+
+		mov         r3,     r0				; keep the center address for the final store
+
+		movq        xmm6,   [r0]
+		punpcklbw	xmm6,	xmm7			; 8 center pixels as words
+		movdqa		xmm3,	[sse2_32]		; eight words of 32, the base value used by WEIGHT_LINE
+		pxor		xmm4,	xmm4		; nTotWeight
+		pxor		xmm5,	xmm5		; nSum
+
+        dec         r0					; r0 -> left neighbour on the center row
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0]			; pixel 4
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0 + 2]		; pixel 5
+
+		sub			r0,	r1				; up one row
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0]			; pixel 1
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0 + 1]		; pixel 2
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0 + 2]		; pixel 3
+
+		lea			r0,	[r0 + r1 * 2]	; down two rows: the row below the center
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0]			; pixel 6
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0 + 1]		; pixel 7
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0 + 2]		; pixel 8
+
+		pcmpeqw		xmm0,	xmm0		; all ones
+		psrlw		xmm0,	15			; each word = 1
+		psllw		xmm0,	8			; each word = 256
+		psubusw		xmm0,	xmm4		; 256 - nTotWeight: the share left for the center pixel
+		pmullw		xmm0,	xmm6		; center pixel * remaining share
+		paddusw		xmm5,	xmm0
+		psrlw		xmm5,	8			; divide by 256
+		packuswb	xmm5,	xmm5		; words -> bytes with unsigned saturation
+		movq		[r3],	xmm5		; overwrite the 8 center pixels
+
+
+		POP_XMM
+		pop r3
+		%assign push_num 0
+
+		ret
+
+;***********************************************************************
+; void		WaverageChromaFilter8_sse2(uint8_t *pixels, int stride);
+;***********************************************************************
+;5x5 filter (the weights sum to 64; the result is shifted right by 6):
+;1	1	2	1	1
+;1	2	4	2	1
+;2	4	20	4	2
+;1	2	4	2	1
+;1	1	2	1	1
+;	Produces 8 filtered bytes and stores them at pixels[0..7].
+WELS_EXTERN WaverageChromaFilter8_sse2
+
+        push r3
+
+        %assign push_num 1
+
+        LOAD_2_PARA					; NOTE(review): macro defined outside this file; presumably leaves pixels in r0 and stride in r1 - confirm
+
+        mov		r3,	r1
+		add		r3,	r3			; r3 = 2 * stride
+		sub		r0,	r3			; pixels - 2 * stride
+		sub		r0,	2			; and two bytes to the left: top-left corner of the 5x5 window
+
+		pxor	xmm0,	xmm0		; zero register for byte -> word unpacking
+		pxor	xmm3,	xmm3		; weighted sum, 8 words
+
+		movdqu		xmm1,	[r0]				; row -2
+		WEIGHT_LINE1_UV	xmm1,	xmm2,	xmm3,	xmm0
+
+		movdqu		xmm1,	[r0 + r1]			; row -1
+		WEIGHT_LINE2_UV	xmm1,	xmm2,	xmm3,	xmm0
+
+		add		r0,	r3			; r0 = pixels - 2 (center row)
+		movdqu		xmm1,	[r0]				; row 0
+		WEIGHT_LINE3_UV	xmm1,	xmm2,	xmm3,	xmm0
+
+		movdqu		xmm1,	[r0 + r1]			; row +1
+		WEIGHT_LINE2_UV	xmm1,	xmm2,	xmm3,	xmm0
+
+		movdqu		xmm1,	[r0 + r1 * 2]		; row +2
+		WEIGHT_LINE1_UV	xmm1,	xmm2,	xmm3,	xmm0
+
+		psrlw		xmm3,		6				; divide by 64, truncating
+		packuswb	xmm3,		xmm3
+		movq		[r0 + 2],		xmm3		; r0 + 2 is the original pixels pointer
+
+
+        pop r3
+
+        %assign push_num 0
+		ret
--- /dev/null
+++ b/codec/processing/src/x86/downsample_bilinear.asm
@@ -1,0 +1,1205 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*	downsample_bilinear.asm
+;*
+;*  Abstract
+;*		SIMD for pixel domain down sampling
+;*
+;*  History
+;*		10/22/2009	Created
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+%ifdef X86_32
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+
+;***********************************************************************
+; Some constants
+;***********************************************************************
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+SECTION .rodata align=16
+
+;***********************************************************************
+; Various memory constants (trigonometric values or rounding values)
+;***********************************************************************
+
+ALIGN 16
+shufb_mask_low:
+	db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h
+shufb_mask_high:
+	db 01h, 80h, 03h, 80h, 05h, 80h, 07h, 80h, 09h, 80h, 0bh, 80h, 0dh, 80h, 0fh, 80h
+
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+;***********************************************************************
+;	void DyadicBilinearDownsamplerWidthx32_sse(	unsigned char* pDst, const int iDstStride,
+;					unsigned char* pSrc, const int iSrcStride,
+;					const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse	; averages each 2x2 source block into one output byte; 32 source bytes per row per inner iteration
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp
+	; five pushes plus the return address: the first stack argument is at [esp+24]
+	mov edi, [esp+24]	; pDst
+	mov edx, [esp+28]	; iDstStride
+	mov esi, [esp+32]	; pSrc
+	mov ecx, [esp+36]	; iSrcStride
+	mov ebp, [esp+44]	; iSrcHeight
+
+	sar ebp, $01			; iSrcHeight >> 1
+
+.yloops:
+	mov eax, [esp+40]	; iSrcWidth
+	sar eax, $01			; iSrcWidth >> 1
+	mov ebx, eax		; iDstWidth kept in ebx
+	sar eax, $04			; (iSrcWidth >> 1) / 16		; loop count = num_of_mb
+	neg ebx				; - (iSrcWidth >> 1)
+	; each loop = source bandwidth: 32 bytes
+.xloops:
+	; 1st part of the horizontal loop: 16 source bytes
+	;               mem  hi<-       ->lo
+	;1st Line Src:	mm0: d D c C b B a A	mm1: h H g G f F e E
+	;2nd Line Src:	mm2: l L k K j J i I   	mm3: p P o O n N m M
+	;=> target:
+	;: H G F E D C B A, P O N M L K J I
+	;: h g f e d c b a, p o n m l k j i
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	movq mm0, [esi]			; 1st pSrc line
+	movq mm1, [esi+8]		; 1st pSrc line + 8
+	movq mm2, [esi+ecx]		; 2nd pSrc line
+	movq mm3, [esi+ecx+8]	; 2nd pSrc line + 8
+
+	; to handle mm0, mm1, mm2, mm3: split each register into its even bytes (low half) and odd bytes (high half)
+	pshufw mm4, mm0, 0d8h	; d D b B c C a A ; 11011000 B
+	pshufw mm5, mm4, 04eh	; c C a A d D b B ; 01001110 B
+	punpcklbw mm4, mm5		; d c D C b a B A
+	pshufw mm4, mm4, 0d8h  	; d c b a D C B A ; 11011000 B: mm4
+
+	pshufw mm5, mm1, 0d8h	; h H f F g G e E ; 11011000 B
+	pshufw mm6, mm5, 04eh	; g G e E h H f F ; 01001110 B
+	punpcklbw mm5, mm6		; h g H G f e F E
+	pshufw mm5, mm5, 0d8h  	; h g f e H G F E ; 11011000 B: mm5
+
+	pshufw mm6, mm2, 0d8h	; l L j J k K i I ; 11011000 B
+	pshufw mm7, mm6, 04eh	; k K i I l L j J ; 01001110 B
+	punpcklbw mm6, mm7		; l k L K j i J I
+	pshufw mm6, mm6, 0d8h  	; l k j i L K J I ; 11011000 B: mm6
+
+	pshufw mm7, mm3, 0d8h	; p P n N o O m M ; 11011000 B
+	pshufw mm0, mm7, 04eh	; o O m M p P n N ; 01001110 B
+	punpcklbw mm7, mm0 		; p o P O n m N M
+	pshufw mm7, mm7, 0d8h  	; p o n m P O N M ; 11011000 B: mm7
+
+	; to handle mm4, mm5, mm6, mm7: gather the even bytes and the odd bytes of each line into whole registers
+	movq mm0, mm4		;
+	punpckldq mm0, mm5 	; H G F E D C B A
+	punpckhdq mm4, mm5 	; h g f e d c b a
+
+	movq mm1, mm6
+	punpckldq mm1, mm7 	; P O N M L K J I
+	punpckhdq mm6, mm7 	; p o n m l k j i
+
+	; avg within MB horizontal width (16 x 2 lines); pavgb rounds up at each of the two stages
+	pavgb mm0, mm4		; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+	pavgb mm1, mm6		; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1, held in mm0 until the 2nd horizontal part is done, then both are stored
+
+	; 2nd part of the horizontal loop: 16 source bytes
+	;               mem  hi<-       ->lo
+	;1st Line Src:	mm0: d D c C b B a A	mm1: h H g G f F e E
+	;2nd Line Src:	mm2: l L k K j J i I   	mm3: p P o O n N m M
+	;=> target:
+	;: H G F E D C B A, P O N M L K J I
+	;: h g f e d c b a, p o n m l k j i
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	movq mm1, [esi+16]		; 1st pSrc line + 16
+	movq mm2, [esi+24]		; 1st pSrc line + 24
+	movq mm3, [esi+ecx+16]	; 2nd pSrc line + 16
+	movq mm4, [esi+ecx+24]	; 2nd pSrc line + 24
+
+	; to handle mm1, mm2, mm3, mm4 (mm0 still holds the result of the 1st part)
+	pshufw mm5, mm1, 0d8h	; d D b B c C a A ; 11011000 B
+	pshufw mm6, mm5, 04eh	; c C a A d D b B ; 01001110 B
+	punpcklbw mm5, mm6		; d c D C b a B A
+	pshufw mm5, mm5, 0d8h  	; d c b a D C B A ; 11011000 B: mm5
+
+	pshufw mm6, mm2, 0d8h	; h H f F g G e E ; 11011000 B
+	pshufw mm7, mm6, 04eh	; g G e E h H f F ; 01001110 B
+	punpcklbw mm6, mm7		; h g H G f e F E
+	pshufw mm6, mm6, 0d8h  	; h g f e H G F E ; 11011000 B: mm6
+
+	pshufw mm7, mm3, 0d8h	; l L j J k K i I ; 11011000 B
+	pshufw mm1, mm7, 04eh	; k K i I l L j J ; 01001110 B
+	punpcklbw mm7, mm1		; l k L K j i J I
+	pshufw mm7, mm7, 0d8h  	; l k j i L K J I ; 11011000 B: mm7
+
+	pshufw mm1, mm4, 0d8h	; p P n N o O m M ; 11011000 B
+	pshufw mm2, mm1, 04eh	; o O m M p P n N ; 01001110 B
+	punpcklbw mm1, mm2 		; p o P O n m N M
+	pshufw mm1, mm1, 0d8h  	; p o n m P O N M ; 11011000 B: mm1
+
+	; to handle mm5, mm6, mm7, mm1
+	movq mm2, mm5
+	punpckldq mm2, mm6 	; H G F E D C B A
+	punpckhdq mm5, mm6 	; h g f e d c b a
+
+	movq mm3, mm7
+	punpckldq mm3, mm1 	; P O N M L K J I
+	punpckhdq mm7, mm1 	; p o n m l k j i
+
+	; avg within MB horizontal width (16 x 2 lines)
+	pavgb mm2, mm5		; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+	pavgb mm3, mm7		; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+	pavgb mm2, mm3		; (temp_row1+temp_row2+1)>>1, result of the 2nd horizontal part
+
+	movq [edi  ], mm0	; output bytes 0..7
+	movq [edi+8], mm2	; output bytes 8..15
+
+	; next SMB
+	lea esi, [esi+32]
+	lea edi, [edi+16]
+
+	dec eax
+	jg near .xloops		; repeat while the count is still positive; the body has already run once before this test
+
+	; next line
+	lea esi, [esi+2*ecx]	; next end of lines
+	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
+	lea edi, [edi+edx]
+	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
+
+	dec ebp
+	jg near .yloops
+
+	WELSEMMS			; NOTE(review): macro defined outside this file; presumably clears the MMX state (emms) - confirm
+	pop ebp
+	pop	edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+
+;***********************************************************************
+;	void DyadicBilinearDownsamplerWidthx16_sse( unsigned char* pDst, const int iDstStride,
+;					  unsigned char* pSrc, const int iSrcStride,
+;					  const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse	; averages each 2x2 source block into one output byte; 16 source bytes per row per inner iteration
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp
+	; five pushes plus the return address: the first stack argument is at [esp+24]
+	mov edi, [esp+24]	; pDst
+	mov edx, [esp+28]	; iDstStride
+	mov esi, [esp+32]	; pSrc
+	mov ecx, [esp+36]	; iSrcStride
+	mov ebp, [esp+44]	; iSrcHeight
+
+	sar ebp, $01		; iSrcHeight >> 1
+
+.yloops:
+	mov eax, [esp+40]	; iSrcWidth
+	sar eax, $01		; iSrcWidth >> 1
+	mov ebx, eax		; iDstWidth kept in ebx
+	sar eax, $03		; (iSrcWidth >> 1) / 8		; loop count = num_of_mb
+	neg ebx			; - (iSrcWidth >> 1)
+	; each loop = source bandwidth: 16 bytes
+.xloops:
+	; horizontal loop body: 16 source bytes
+	;               mem  hi<-       ->lo
+	;1st Line Src:	mm0: d D c C b B a A	mm1: h H g G f F e E
+	;2nd Line Src:	mm2: l L k K j J i I   	mm3: p P o O n N m M
+	;=> target:
+	;: H G F E D C B A, P O N M L K J I
+	;: h g f e d c b a, p o n m l k j i
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	movq mm0, [esi]			; 1st pSrc line
+	movq mm1, [esi+8]		; 1st pSrc line + 8
+	movq mm2, [esi+ecx]		; 2nd pSrc line
+	movq mm3, [esi+ecx+8]	; 2nd pSrc line + 8
+
+	; to handle mm0, mm1, mm2, mm3: split each register into its even bytes (low half) and odd bytes (high half)
+	pshufw mm4, mm0, 0d8h	; d D b B c C a A ; 11011000 B
+	pshufw mm5, mm4, 04eh	; c C a A d D b B ; 01001110 B
+	punpcklbw mm4, mm5		; d c D C b a B A
+	pshufw mm4, mm4, 0d8h  	; d c b a D C B A ; 11011000 B: mm4
+
+	pshufw mm5, mm1, 0d8h	; h H f F g G e E ; 11011000 B
+	pshufw mm6, mm5, 04eh	; g G e E h H f F ; 01001110 B
+	punpcklbw mm5, mm6		; h g H G f e F E
+	pshufw mm5, mm5, 0d8h  	; h g f e H G F E ; 11011000 B: mm5
+
+	pshufw mm6, mm2, 0d8h	; l L j J k K i I ; 11011000 B
+	pshufw mm7, mm6, 04eh	; k K i I l L j J ; 01001110 B
+	punpcklbw mm6, mm7		; l k L K j i J I
+	pshufw mm6, mm6, 0d8h  	; l k j i L K J I ; 11011000 B: mm6
+
+	pshufw mm7, mm3, 0d8h	; p P n N o O m M ; 11011000 B
+	pshufw mm0, mm7, 04eh	; o O m M p P n N ; 01001110 B
+	punpcklbw mm7, mm0 		; p o P O n m N M
+	pshufw mm7, mm7, 0d8h  	; p o n m P O N M ; 11011000 B: mm7
+
+	; to handle mm4, mm5, mm6, mm7: gather the even bytes and the odd bytes of each line into whole registers
+	movq mm0, mm4		;
+	punpckldq mm0, mm5 	; H G F E D C B A
+	punpckhdq mm4, mm5 	; h g f e d c b a
+
+	movq mm1, mm6
+	punpckldq mm1, mm7 	; P O N M L K J I
+	punpckhdq mm6, mm7 	; p o n m l k j i
+
+	; avg within MB horizontal width (16 x 2 lines); pavgb rounds up at each of the two stages
+	pavgb mm0, mm4		; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+	pavgb mm1, mm6		; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1, the 8 output bytes for this iteration
+
+	movq [edi  ], mm0
+
+	; next SMB
+	lea esi, [esi+16]
+	lea edi, [edi+8]
+
+	dec eax
+	jg near .xloops		; repeat while the count is still positive; the body has already run once before this test
+
+	; next line
+	lea esi, [esi+2*ecx]	; next end of lines
+	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
+	lea edi, [edi+edx]
+	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
+
+	dec ebp
+	jg near .yloops
+
+	WELSEMMS		; NOTE(review): macro defined outside this file; presumably clears the MMX state (emms) - confirm
+	pop ebp
+	pop edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+
+;***********************************************************************
+;	void DyadicBilinearDownsamplerWidthx8_sse( unsigned char* pDst, const int iDstStride, unsigned char* pSrc,
+;					  const int iSrcStride, const int iSrcWidth, const int iSrcHeight );
+;	2:1 downsample in x and y (MMX regs): per 2x2 source block, pavgb the horizontal pair of each row, then pavgb the two rows.
+;***********************************************************************
+WELS_EXTERN DyadicBilinearDownsamplerWidthx8_sse
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp		; 5 pushes (20 bytes) + return address: first stack argument is at [esp+24]
+
+	mov edi, [esp+24]	; pDst
+	mov edx, [esp+28]	; iDstStride
+	mov esi, [esp+32]	; pSrc
+	mov ecx, [esp+36]	; iSrcStride
+	mov ebp, [esp+44]	; iSrcHeight
+
+	sar ebp, $01		; ebp = iSrcHeight >> 1 = number of output rows (row-pair counter)
+
+.yloops:
+	mov eax, [esp+40]	; iSrcWidth
+	sar eax, $01		; iSrcWidth >> 1
+	mov ebx, eax		; ebx = iDstWidth (negated below)
+	sar eax, $02		; eax = (iSrcWidth >> 1) / 4 = x-loop count (4 output bytes per iteration)
+	neg ebx			; ebx = -(iSrcWidth >> 1), used to rewind the pointers at the end of the row
+	; each x iteration consumes 8 source bytes from each of the two rows
+.xloops:
+	; horizontal loop: 8 source bytes per row
+	;               mem  hi<-       ->lo
+	;1st Line Src:	mm0: d D c C b B a A
+	;2nd Line Src:	mm1: h H g G f F e E
+	;=> target (even-position bytes upper-case, odd-position bytes lower-case):
+	;: H G F E D C B A
+	;: h g f e d c b a
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	movq mm0, [esi]			; 1st pSrc line
+	movq mm1, [esi+ecx]		; 2nd pSrc line
+
+	; deinterleave mm0 into mm2 and mm1 into mm4 (mm3 / mm5 are scratch)
+	pshufw mm2, mm0, 0d8h	; d D b B c C a A ; 11011000 B
+	pshufw mm3, mm2, 04eh	; c C a A d D b B ; 01001110 B
+	punpcklbw mm2, mm3		; d c D C b a B A
+	pshufw mm2, mm2, 0d8h  	; d c b a D C B A ; 11011000 B: mm2
+
+	pshufw mm4, mm1, 0d8h	; h H f F g G e E ; 11011000 B
+	pshufw mm5, mm4, 04eh	; g G e E h H f F ; 01001110 B
+	punpcklbw mm4, mm5		; h g H G f e F E
+	pshufw mm4, mm4, 0d8h  	; h g f e H G F E ; 11011000 B: mm4
+
+	; combine mm2, mm4 so even and odd bytes of both rows line up
+	movq mm0, mm2		; keep a copy: punpckldq below overwrites its destination
+	punpckldq mm0, mm4 	; H G F E D C B A
+	punpckhdq mm2, mm4 	; h g f e d c b a
+
+	; horizontal then vertical average (8 bytes x 2 lines in, 4 bytes out)
+	pavgb mm0, mm2		; (H+h+1)>>1, .., (A+a+1)>>1: low dword = row 1, high dword = row 2
+	pshufw mm1, mm0, 04eh	; 01001110 B: swap the two dwords so row 2 lines up with row 1
+	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1 in both dwords; only the low dword is stored
+
+	movd [edi],	mm0		; write 4 output bytes
+
+	; next unit
+	lea esi, [esi+8]
+	lea edi, [edi+4]
+
+	dec eax
+	jg near .xloops		; do-while: the body always runs at least once per row
+
+	; next line
+	lea esi, [esi+2*ecx]	; skip the two source rows just consumed
+	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
+	lea edi, [edi+edx]		; next destination row
+	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
+
+	dec ebp
+	jg near .yloops
+
+	WELSEMMS			; macro defined outside this view; presumably clears MMX state (emms) - confirm in asm_inc.asm
+	pop ebp
+	pop edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+
+
+
+; got about 50% improvement over DyadicBilinearDownsamplerWidthx32_sse
+;***********************************************************************
+;	void DyadicBilinearDownsamplerWidthx32_ssse3(	unsigned char* pDst, const int iDstStride, unsigned char* pSrc,
+;					const int iSrcStride, const int iSrcWidth, const int iSrcHeight );
+;	2:1 downsample in x and y: 32 source bytes x 2 rows -> 16 output bytes per iteration; uses movdqa, so 16-byte aligned rows are required.
+;***********************************************************************
+WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp			; 5 pushes (20 bytes) + return address: first stack argument is at [esp+24]
+
+	mov edi, [esp+24]	; pDst
+	mov edx, [esp+28]	; iDstStride
+	mov esi, [esp+32]	; pSrc
+	mov ecx, [esp+36]	; iSrcStride
+	mov ebp, [esp+44]	; iSrcHeight
+
+	sar ebp, $01			; ebp = iSrcHeight >> 1 = number of output rows
+
+	movdqa xmm7, [shufb_mask_low]	; mask low  (defined outside this view; per the comments below it keeps even bytes, zero-extended to words)
+	movdqa xmm6, [shufb_mask_high]	; mask high (defined outside this view; per the comments below it keeps odd bytes, zero-extended to words)
+
+.yloops:
+	mov eax, [esp+40]	; iSrcWidth
+	sar eax, $01			; iSrcWidth >> 1
+	mov ebx, eax		; ebx = iDstWidth (negated below)
+	sar eax, $04			; eax = (iSrcWidth >> 1) / 16 = x-loop count (16 output bytes per iteration)
+	neg ebx				; ebx = -(iSrcWidth >> 1), used to rewind the pointers at the end of the row
+	; each loop = source bandwidth: 32 bytes
+.xloops:
+	; horizontal loop: 2 x 16 source bytes per row
+	;               mem  hi<-       ->lo
+	;1st Line Src:	xmm0: h H g G f F e E d D c C b B a A
+	;				xmm1: p P o O n N m M l L k K j J i I
+	;2nd Line Src:	xmm2: h H g G f F e E d D c C b B a A
+	;				xmm3: p P o O n N m M l L k K j J i I
+	;=> target:
+	;: P O N M L K J I H G F E D C B A
+	;: p o n m l k j i h g f e d c b a
+	;: P ..                          A
+	;: p ..                          a
+
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	movdqa xmm0, [esi]			; 1st_src_line
+	movdqa xmm1, [esi+16]		; 1st_src_line + 16
+	movdqa xmm2, [esi+ecx]		; 2nd_src_line
+	movdqa xmm3, [esi+ecx+16]	; 2nd_src_line + 16
+
+	; split each register into even/odd bytes (as words) and average them horizontally
+	movdqa xmm4, xmm0			; h H g G f F e E d D c C b B a A
+	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+	pshufb xmm4, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+	; another implementation for xmm4 high bits
+;	psubb xmm4, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+;	psrlw xmm4, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+	pavgb xmm0, xmm4			; row 1, bytes 0..15: (even+odd+1)>>1 in the low byte of each word
+
+	movdqa xmm5, xmm1
+	pshufb xmm1, xmm7
+	pshufb xmm5, xmm6
+;	psubb xmm5, xmm1
+;	psrlw xmm5, 8
+	pavgb xmm1, xmm5			; row 1, bytes 16..31
+
+	movdqa xmm4, xmm2
+	pshufb xmm2, xmm7
+	pshufb xmm4, xmm6
+;	psubb xmm4, xmm2
+;	psrlw xmm4, 8
+	pavgb xmm2, xmm4			; row 2, bytes 0..15
+
+	movdqa xmm5, xmm3
+	pshufb xmm3, xmm7
+	pshufb xmm5, xmm6
+;	psubb xmm5, xmm3
+;	psrlw xmm5, 8
+	pavgb xmm3, xmm5			; row 2, bytes 16..31
+
+	packuswb xmm0, xmm1			; row 1: 16 horizontal averages packed back to bytes
+	packuswb xmm2, xmm3			; row 2: 16 horizontal averages packed back to bytes
+	pavgb xmm0, xmm2			; vertical average of the two rows
+
+	; write pDst
+	movdqa [edi], xmm0			; aligned 16-byte store
+
+	; next SMB
+	lea esi, [esi+32]
+	lea edi, [edi+16]
+
+	dec eax
+	jg near .xloops			; do-while: the body always runs at least once per row
+
+	; next line
+	lea esi, [esi+2*ecx]	; skip the two source rows just consumed
+	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
+	lea edi, [edi+edx]		; next destination row
+	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
+
+	dec ebp
+	jg near .yloops
+
+	pop ebp
+	pop	edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+
+;***********************************************************************
+;	void DyadicBilinearDownsamplerWidthx16_ssse3( unsigned char* pDst, const int iDstStride, unsigned char* pSrc,
+;					  const int iSrcStride, const int iSrcWidth, const int iSrcHeight );
+;	2:1 downsample in x and y: 16 source bytes x 2 rows -> 8 output bytes per iteration; source rows are loaded with movdqa (16-byte aligned).
+;***********************************************************************
+WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp		; 5 pushes (20 bytes) + return address: first stack argument is at [esp+24]
+
+	mov edi, [esp+24]	; pDst
+	mov edx, [esp+28]	; iDstStride
+	mov esi, [esp+32]	; pSrc
+	mov ecx, [esp+36]	; iSrcStride
+	mov ebp, [esp+44]	; iSrcHeight
+
+	sar ebp, $01		; ebp = iSrcHeight >> 1 = number of output rows
+	movdqa xmm7, [shufb_mask_low]	; mask low  (defined outside this view; per the comments below it keeps even bytes as words)
+	movdqa xmm6, [shufb_mask_high]	; mask high (defined outside this view; per the comments below it keeps odd bytes as words)
+
+.yloops:
+	mov eax, [esp+40]	; iSrcWidth
+	sar eax, $01		; iSrcWidth >> 1
+	mov ebx, eax		; ebx = iDstWidth (negated below)
+	sar eax, $03		; eax = (iSrcWidth >> 1) / 8 = x-loop count (8 output bytes per iteration)
+	neg ebx			; ebx = -(iSrcWidth >> 1), used to rewind the pointers at the end of the row
+	; each loop = source bandwidth: 16 bytes
+.xloops:
+	; horizontal loop: 16 source bytes per row
+	;               mem  hi<-       ->lo
+	;1st line pSrc:	xmm0: h H g G f F e E d D c C b B a A
+	;2nd line pSrc:  xmm1: p P o O n N m M l L k K j J i I
+	;=> per row, after the two shuffles (as words):
+	;: 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A   (even bytes)
+	;: 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a   (odd bytes)
+
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	movdqa xmm0, [esi]			; 1st_src_line
+	movdqa xmm1, [esi+ecx]		; 2nd_src_line
+
+	; split each row into even/odd bytes (as words) and average them horizontally
+	movdqa xmm2, xmm0			; h H g G f F e E d D c C b B a A
+	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+	pshufb xmm2, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+	; another implementation for xmm2 high bits
+;	psubb xmm2, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+;	psrlw xmm2, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+	pavgb xmm0, xmm2			; row 1 horizontal averages (low byte of each word)
+
+	movdqa xmm3, xmm1
+	pshufb xmm1, xmm7
+	pshufb xmm3, xmm6
+;	psubb xmm3, xmm1
+;	psrlw xmm3, 8
+	pavgb xmm1, xmm3			; row 2 horizontal averages (low byte of each word)
+
+	pavgb xmm0, xmm1			; vertical average of the two rows, still one result per word
+	packuswb xmm0, xmm1			; low 8 bytes = the 8 results; the high 8 bytes (from xmm1) are not stored
+
+	; write pDst
+	movq [edi], xmm0			; store only the low 8 bytes
+
+	; next SMB
+	lea esi, [esi+16]
+	lea edi, [edi+8]
+
+	dec eax
+	jg near .xloops		; do-while: the body always runs at least once per row
+
+	; next line
+	lea esi, [esi+2*ecx]	; skip the two source rows just consumed
+	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
+	lea edi, [edi+edx]		; next destination row
+	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
+
+	dec ebp
+	jg near .yloops
+
+	pop ebp
+	pop edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+
+; got about 65% improvement over DyadicBilinearDownsamplerWidthx32_sse
+;***********************************************************************
+;	void DyadicBilinearDownsamplerWidthx32_sse4(	unsigned char* pDst, const int iDstStride, unsigned char* pSrc,
+;					const int iSrcStride, const int iSrcWidth, const int iSrcHeight );
+;	2:1 downsample in x and y: 32 source bytes x 2 rows -> 16 output bytes per iteration; source is read with movntdqa (SSE4.1).
+;***********************************************************************
+WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse4
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp			; 5 pushes (20 bytes) + return address: first stack argument is at [esp+24]
+
+	mov edi, [esp+24]	; pDst
+	mov edx, [esp+28]	; iDstStride
+	mov esi, [esp+32]	; pSrc
+	mov ecx, [esp+36]	; iSrcStride
+	mov ebp, [esp+44]	; iSrcHeight
+
+	sar ebp, $01			; ebp = iSrcHeight >> 1 = number of output rows
+
+	movdqa xmm7, [shufb_mask_low]	; mask low  (defined outside this view; per the comments below it keeps even bytes as words)
+	movdqa xmm6, [shufb_mask_high]	; mask high (defined outside this view; per the comments below it keeps odd bytes as words)
+
+.yloops:
+	mov eax, [esp+40]	; iSrcWidth
+	sar eax, $01			; iSrcWidth >> 1
+	mov ebx, eax		; ebx = iDstWidth (negated below)
+	sar eax, $04			; eax = (iSrcWidth >> 1) / 16 = x-loop count (16 output bytes per iteration)
+	neg ebx				; ebx = -(iSrcWidth >> 1), used to rewind the pointers at the end of the row
+	; each loop = source bandwidth: 32 bytes
+.xloops:
+	; horizontal loop: 2 x 16 source bytes per row
+	;               mem  hi<-       ->lo
+	;1st Line Src:	xmm0: h H g G f F e E d D c C b B a A
+	;				xmm1: p P o O n N m M l L k K j J i I
+	;2nd Line Src:	xmm2: h H g G f F e E d D c C b B a A
+	;				xmm3: p P o O n N m M l L k K j J i I
+	;=> target:
+	;: P O N M L K J I H G F E D C B A
+	;: p o n m l k j i h g f e d c b a
+	;: P ..                          A
+	;: p ..                          a
+
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	movntdqa xmm0, [esi]			; 1st_src_line (non-temporal-hint load; address must be 16-byte aligned)
+	movntdqa xmm1, [esi+16]		; 1st_src_line + 16
+	movntdqa xmm2, [esi+ecx]		; 2nd_src_line
+	movntdqa xmm3, [esi+ecx+16]	; 2nd_src_line + 16
+
+	; split each register into even/odd bytes (as words) and average them horizontally
+	movdqa xmm4, xmm0			; h H g G f F e E d D c C b B a A
+	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+	pshufb xmm4, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+;	psubb xmm4, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+;	psrlw xmm4, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+	pavgb xmm0, xmm4			; row 1, bytes 0..15: (even+odd+1)>>1 in the low byte of each word
+
+	movdqa xmm5, xmm1
+	pshufb xmm1, xmm7
+	pshufb xmm5, xmm6
+;	psubb xmm5, xmm1
+;	psrlw xmm5, 8
+	pavgb xmm1, xmm5			; row 1, bytes 16..31
+
+	movdqa xmm4, xmm2
+	pshufb xmm2, xmm7
+	pshufb xmm4, xmm6
+;	psubb xmm4, xmm2
+;	psrlw xmm4, 8
+	pavgb xmm2, xmm4			; row 2, bytes 0..15
+
+	movdqa xmm5, xmm3
+	pshufb xmm3, xmm7
+	pshufb xmm5, xmm6
+;	psubb xmm5, xmm3
+;	psrlw xmm5, 8
+	pavgb xmm3, xmm5			; row 2, bytes 16..31
+
+	packuswb xmm0, xmm1			; row 1: 16 horizontal averages packed back to bytes
+	packuswb xmm2, xmm3			; row 2: 16 horizontal averages packed back to bytes
+	pavgb xmm0, xmm2			; vertical average of the two rows
+
+	; write pDst
+	movdqa [edi], xmm0			; aligned 16-byte store
+
+	; next SMB
+	lea esi, [esi+32]
+	lea edi, [edi+16]
+
+	dec eax
+	jg near .xloops			; do-while: the body always runs at least once per row
+
+	; next line
+	lea esi, [esi+2*ecx]	; skip the two source rows just consumed
+	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
+	lea edi, [edi+edx]		; next destination row
+	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
+
+	dec ebp
+	jg near .yloops
+
+	pop ebp
+	pop	edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+
+;***********************************************************************
+;	void DyadicBilinearDownsamplerWidthx16_sse4( unsigned char* pDst, const int iDstStride, unsigned char* pSrc,
+;					  const int iSrcStride, const int iSrcWidth, const int iSrcHeight );
+;	2:1 downsample in x and y: 16 source bytes x 2 rows -> 8 output bytes per iteration; source is read with movntdqa (SSE4.1).
+;***********************************************************************
+WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse4
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp		; 5 pushes (20 bytes) + return address: first stack argument is at [esp+24]
+
+	mov edi, [esp+24]	; pDst
+	mov edx, [esp+28]	; iDstStride
+	mov esi, [esp+32]	; pSrc
+	mov ecx, [esp+36]	; iSrcStride
+	mov ebp, [esp+44]	; iSrcHeight
+
+	sar ebp, $01		; ebp = iSrcHeight >> 1 = number of output rows
+	movdqa xmm7, [shufb_mask_low]	; mask low  (defined outside this view; per the comments below it keeps even bytes as words)
+	movdqa xmm6, [shufb_mask_high]	; mask high (defined outside this view; per the comments below it keeps odd bytes as words)
+
+.yloops:
+	mov eax, [esp+40]	; iSrcWidth
+	sar eax, $01		; iSrcWidth >> 1
+	mov ebx, eax		; ebx = iDstWidth (negated below)
+	sar eax, $03		; eax = (iSrcWidth >> 1) / 8 = x-loop count (8 output bytes per iteration)
+	neg ebx			; ebx = -(iSrcWidth >> 1), used to rewind the pointers at the end of the row
+	; each loop = source bandwidth: 16 bytes
+.xloops:
+	; horizontal loop: 16 source bytes per row
+	;               mem  hi<-       ->lo
+	;1st line pSrc:	xmm0: h H g G f F e E d D c C b B a A
+	;2nd line pSrc:  xmm1: p P o O n N m M l L k K j J i I
+	;=> per row, after the two shuffles (as words):
+	;: 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A   (even bytes)
+	;: 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a   (odd bytes)
+
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	movntdqa xmm0, [esi]			; 1st_src_line (non-temporal-hint load; address must be 16-byte aligned)
+	movntdqa xmm1, [esi+ecx]		; 2nd_src_line
+
+	; split each row into even/odd bytes (as words) and average them horizontally
+	movdqa xmm2, xmm0			; h H g G f F e E d D c C b B a A
+	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+	pshufb xmm2, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+;	psubb xmm2, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+;	psrlw xmm2, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+	pavgb xmm0, xmm2			; row 1 horizontal averages (low byte of each word)
+
+	movdqa xmm3, xmm1
+	pshufb xmm1, xmm7
+	pshufb xmm3, xmm6
+;	psubb xmm3, xmm1
+;	psrlw xmm3, 8
+	pavgb xmm1, xmm3			; row 2 horizontal averages (low byte of each word)
+
+	pavgb xmm0, xmm1			; vertical average of the two rows, still one result per word
+	packuswb xmm0, xmm1			; low 8 bytes = the 8 results; the high 8 bytes (from xmm1) are not stored
+
+	; write pDst
+	movq [edi], xmm0			; store only the low 8 bytes
+
+	; next SMB
+	lea esi, [esi+16]
+	lea edi, [edi+8]
+
+	dec eax
+	jg near .xloops		; do-while: the body always runs at least once per row
+
+	; next line
+	lea esi, [esi+2*ecx]	; skip the two source rows just consumed
+	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
+	lea edi, [edi+edx]		; next destination row
+	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
+
+	dec ebp
+	jg near .yloops
+
+	pop ebp
+	pop edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+
+
+
+
+
+;**************************************************************************************************************
+;int GeneralBilinearAccurateDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
+;							unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
+;                           unsigned int uiScaleX, unsigned int uiScaleY );
+; Arbitrary-ratio bilinear resample: source x/y positions are fixed point with 15 fractional bits, stepped by uiScaleX / uiScaleY.
+;**************************************************************************************************************
+
+WELS_EXTERN	GeneralBilinearAccurateDownsampler_sse2
+	push	ebp
+	push	esi
+	push	edi
+	push	ebx
+%define		pushsize	16
+%define		localsize	28
+%define		pDstData		esp + pushsize + localsize + 4
+%define		dwDstStride		esp + pushsize + localsize + 8
+%define		dwDstWidth		esp + pushsize + localsize + 12
+%define		dwDstHeight		esp + pushsize + localsize + 16
+%define		pSrcData		esp + pushsize + localsize + 20
+%define		dwSrcStride		esp + pushsize + localsize + 24
+%define		dwSrcWidth		esp + pushsize + localsize + 28
+%define		dwSrcHeight		esp + pushsize + localsize + 32
+%define		scale			esp + 0
+%define		uiScaleX			esp + pushsize + localsize + 36
+%define		uiScaleY			esp + pushsize + localsize + 40
+%define		tmpHeight		esp + 12
+%define		yInverse		esp + 16
+%define		xInverse		esp + 20
+%define		dstStep			esp + 24
+	sub		esp,			localsize	; reserve the locals; all the esp-relative names above are valid from here on
+
+	pxor	xmm0,	xmm0
+	mov		edx,	32767						; NOTE(review): edx is not read before it is reloaded with 40003fffh below
+	mov		eax,	[uiScaleX]
+	and		eax,	32767						; keep the low 15 bits (fractional part of the x step)
+	mov		ebx,	eax
+	neg		ebx
+	and		ebx,	32767						; (-uinc) reduced to 15 bits
+	movd	xmm1,		eax						; uinc (uiScaleX & 32767)
+	movd	xmm2,		ebx						; -uinc
+	psllq	xmm1,		32
+	por		xmm1,		xmm2					; 0 0  uinc  -uinc   (dword)
+	pshufd	xmm7,		xmm1,	01000100b		; xmm7: uinc -uinc uinc -uinc
+
+	mov		eax,	[uiScaleY]
+	and		eax,	32767						; keep the low 15 bits (fractional part of the y step)
+	mov		ebx,	eax
+	neg		ebx
+	and		ebx,	32767						; (-vinc) reduced to 15 bits
+	movd	xmm6,		eax						; vinc (uiScaleY & 32767)
+	movd	xmm2,		ebx						; -vinc
+	psllq	xmm6,		32
+	por		xmm6,		xmm2					; 0 0 vinc -vinc (dword)
+	pshufd	xmm6,		xmm6,	01010000b		; xmm6: vinc vinc -vinc -vinc
+
+	mov		edx,		40003fffh				; two words: 16384 (high) and 16383 (low)
+	movd	xmm5,		edx
+	punpcklwd	xmm5,	xmm0					; 16384 16383 (zero-extended to dwords)
+	pshufd	xmm5,		xmm5,	01000100b		; xmm5: 16384 16383 16384 16383
+
+
+DOWNSAMPLE:
+
+	mov		eax,			[dwDstHeight]
+	mov		edi,			[pDstData]
+	mov		edx,			[dwDstStride]
+	mov		ecx,			[dwDstWidth]
+	sub		edx,			ecx
+	mov		[dstStep],	edx				; stride - width
+	dec		eax
+	mov		[tmpHeight],	eax			; rows handled by the interpolating loop = iDstHeight - 1
+	mov		eax,			16384
+	mov		[yInverse],		eax			; y position starts at 16384 = 0.5 with 15 fractional bits
+
+	pshufd	xmm4,		xmm5,	01010000b	; initial v to 16384 16384 16383 16383
+
+HEIGHT:									; NOTE(review): body runs once before tmpHeight is tested, so iDstHeight == 1 still emits this row plus LAST_ROW
+	mov		eax,	[yInverse]
+	mov		esi,	[pSrcData]
+	shr		eax,	15					; integer source row
+	mul		dword [dwSrcStride]			; eax = row * stride (mul also overwrites edx)
+	add		esi,	eax					; get current row address
+	mov		ebp,	esi
+	add		ebp,	[dwSrcStride]		; ebp = the source row below esi
+
+	mov		eax,		16384
+	mov		[xInverse],		eax			; x position starts at 0.5 as well
+	mov		ecx,			[dwDstWidth]
+	dec		ecx							; iDstWidth - 1 interpolated pixels; the last column is copied at WIDTH_END
+
+	movdqa	xmm3,		xmm5			; initial u to 16384 16383 16384 16383
+
+WIDTH:									; NOTE(review): if iDstWidth == 1 then ecx == 0 here and `loop` below wraps ecx instead of exiting
+	mov		eax,		[xInverse]
+	shr		eax,		15				; integer source column
+
+	movd	xmm1,		[esi+eax]		; xxxxxxba  (4 bytes are read, only the low 2 are used)
+	movd	xmm2,		[ebp+eax]		; xxxxxxdc  (4 bytes are read, only the low 2 are used)
+	pxor	xmm0,		xmm0
+	punpcklwd	xmm1,	xmm2			; xxxxdcba
+	punpcklbw	xmm1,	xmm0			; 0d0c0b0a
+	punpcklwd	xmm1,	xmm0			; 000d000c000b000a
+
+	movdqa	xmm2,	xmm4	; xmm2:  vv(1-v)(1-v)  tmpv
+	pmaddwd	xmm2,	xmm3	; mul u(1-u)u(1-u) on xmm2: one 32-bit weight per pixel (high words of every dword are 0)
+	movdqa	xmm0,	xmm2
+	pmuludq	xmm2,	xmm1	; 64-bit products weight*pixel for dwords 0 and 2
+	psrlq	xmm0,	32
+	psrlq	xmm1,	32
+	pmuludq	xmm0,	xmm1	; 64-bit products weight*pixel for dwords 1 and 3
+	paddq	xmm2,	xmm0
+	pshufd	xmm1,	xmm2,	00001110b	; bring the upper qword down
+	paddq	xmm2,	xmm1	; low qword = sum of the four weighted pixels
+	psrlq	xmm2,	29
+
+	movd	eax,	xmm2
+	inc		eax
+	shr		eax,	1		; together with the shift by 29: round and shift right by 30 in total
+	mov		[edi],	al
+	inc		edi
+
+	mov		eax,		[uiScaleX]
+	add		[xInverse],	eax				; advance the source x position
+
+	paddw	xmm3,		xmm7			; inc u
+	psllw	xmm3,		1
+	psrlw	xmm3,		1				; shift pair clears bit 15: weights stay 15-bit
+
+	loop	WIDTH						; dec ecx, repeat while ecx != 0
+
+WIDTH_END:
+	mov		eax,		[xInverse]
+	shr		eax,		15
+	mov		cl,			[esi+eax]		; last column: copy the source pixel without interpolation
+	mov		[edi],		cl
+	inc		edi
+
+	mov		eax,		[uiScaleY]
+	add		[yInverse],	eax				; advance the source y position
+	add		edi,		[dstStep]		; skip the destination row padding
+
+	paddw	xmm4,	xmm6				; inc v
+	psllw	xmm4,	1
+	psrlw	xmm4,	1					; shift pair clears bit 15: weights stay 15-bit
+
+	dec		dword [tmpHeight]
+	jg		HEIGHT
+
+
+LAST_ROW:								; final output row: copied from a single source row, no interpolation
+	mov		eax,	[yInverse]
+	mov		esi,	[pSrcData]
+	shr		eax,	15
+	mul		dword [dwSrcStride]
+	add		esi,	eax					; get current row address
+
+	mov		eax,		16384
+	mov		[xInverse],		eax
+	mov		ecx,			[dwDstWidth]	; NOTE(review): `loop` below assumes iDstWidth >= 1
+
+LAST_ROW_WIDTH:
+	mov		eax,		[xInverse]
+	shr		eax,		15
+
+	mov		al,			[esi+eax]
+	mov		[edi],	al
+	inc		edi
+
+	mov		eax,		[uiScaleX]
+	add		[xInverse],	eax
+
+	loop	LAST_ROW_WIDTH
+
+LAST_ROW_END:
+
+	add		esp,			localsize
+	pop		ebx
+	pop		edi
+	pop		esi
+	pop		ebp
+%undef		pushsize
+%undef		localsize
+%undef		pSrcData
+%undef		dwSrcWidth
+%undef		dwSrcHeight
+%undef		dwSrcStride
+%undef		pDstData
+%undef		dwDstWidth
+%undef		dwDstHeight
+%undef		dwDstStride
+%undef		scale
+%undef		uiScaleX
+%undef		uiScaleY
+%undef		tmpHeight
+%undef		yInverse
+%undef		xInverse
+%undef		dstStep
+	ret									; NOTE(review): prototype says int, but no instruction loads a dedicated return value into eax
+
+
+
+
+;**************************************************************************************************************
+;int GeneralBilinearFastDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
+;				unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
+;               unsigned int uiScaleX, unsigned int uiScaleY );
+; Arbitrary-ratio bilinear resample with 16-bit weights: x uses 16 fractional bits (shr 16), y uses 15 fractional bits (shr 15).
+;**************************************************************************************************************
+
+WELS_EXTERN	GeneralBilinearFastDownsampler_sse2
+	push	ebp
+	push	esi
+	push	edi
+	push	ebx
+%define		pushsize	16
+%define		localsize	28
+%define		pDstData		esp + pushsize + localsize + 4
+%define		dwDstStride		esp + pushsize + localsize + 8
+%define		dwDstWidth		esp + pushsize + localsize + 12
+%define		dwDstHeight		esp + pushsize + localsize + 16
+%define		pSrcData		esp + pushsize + localsize + 20
+%define		dwSrcStride		esp + pushsize + localsize + 24
+%define		dwSrcWidth		esp + pushsize + localsize + 28
+%define		dwSrcHeight		esp + pushsize + localsize + 32
+%define		scale			esp + 0
+%define		uiScaleX			esp + pushsize + localsize + 36
+%define		uiScaleY			esp + pushsize + localsize + 40
+%define		tmpHeight		esp + 12
+%define		yInverse		esp + 16
+%define		xInverse		esp + 20
+%define		dstStep			esp + 24
+	sub		esp,			localsize	; reserve the locals; all the esp-relative names above are valid from here on
+
+	pxor	xmm0,	xmm0						; xmm0 stays zero for the whole function (unpack / pack helper)
+	mov		edx,	65535
+	mov		eax,	[uiScaleX]
+	and		eax,	edx							; keep the low 16 bits (fractional part of the x step)
+	mov		ebx,	eax
+	neg		ebx
+	and		ebx,	65535						; (-uinc) reduced to 16 bits
+	movd	xmm1,		eax						; uinc (uiScaleX & 65535)
+	movd	xmm2,		ebx						; -uinc
+	psllq	xmm1,		32
+	por		xmm1,		xmm2					; 0 uinc 0 -uinc
+	pshuflw	xmm7,		xmm1,	10001000b		; xmm7: uinc -uinc uinc -uinc
+
+	mov		eax,	[uiScaleY]
+	and		eax,	32767						; keep the low 15 bits (fractional part of the y step)
+	mov		ebx,	eax
+	neg		ebx
+	and		ebx,	32767						; (-vinc) reduced to 15 bits
+	movd	xmm6,		eax						; vinc (uiScaleY & 32767)
+	movd	xmm2,		ebx						; -vinc
+	psllq	xmm6,		32
+	por		xmm6,		xmm2					; 0 vinc 0 -vinc
+	pshuflw	xmm6,		xmm6,	10100000b		; xmm6: vinc vinc -vinc -vinc
+
+	mov		edx,		80007fffh				; 32768 32767
+	movd	xmm5,		edx
+	pshuflw	xmm5,		xmm5,		01000100b	; 32768 32767 32768 32767
+	mov		ebx,		16384					; rounding term added before the final shift by 15; ebx is not modified afterwards
+
+
+FAST_DOWNSAMPLE:
+
+	mov		eax,			[dwDstHeight]
+	mov		edi,			[pDstData]
+	mov		edx,			[dwDstStride]
+	mov		ecx,			[dwDstWidth]
+	sub		edx,			ecx
+	mov		[dstStep],	edx				; stride - width
+	dec		eax
+	mov		[tmpHeight],	eax			; rows handled by the interpolating loop = iDstHeight - 1
+	mov		eax,		16384
+	mov		[yInverse],		eax			; y position starts at 16384 = 0.5 with 15 fractional bits
+
+	pshuflw	xmm4,		xmm5,	01010000b
+	psrlw	xmm4,		1				; initial v to 16384 16384 16383 16383
+
+FAST_HEIGHT:							; NOTE(review): body runs once before tmpHeight is tested, so iDstHeight == 1 still emits this row plus FAST_LAST_ROW
+	mov		eax,	[yInverse]
+	mov		esi,	[pSrcData]
+	shr		eax,	15					; integer source row
+	mul		dword [dwSrcStride]			; eax = row * stride (mul also overwrites edx)
+	add		esi,	eax					; get current row address
+	mov		ebp,	esi
+	add		ebp,	[dwSrcStride]		; ebp = the source row below esi
+
+	mov		eax,		32768
+	mov		[xInverse],		eax			; x position starts at 32768 = 0.5 with 16 fractional bits
+	mov		ecx,			[dwDstWidth]
+	dec		ecx							; iDstWidth - 1 interpolated pixels; the last column is copied at FAST_WIDTH_END
+
+	movdqa	xmm3,		xmm5			; initial u to 32768 32767 32768 32767
+
+FAST_WIDTH:								; NOTE(review): if iDstWidth == 1 then ecx == 0 here and `loop` below wraps ecx instead of exiting
+	mov		eax,		[xInverse]
+	shr		eax,		16				; integer source column
+
+	movd	xmm1,		[esi+eax]		; xxxxxxba  (4 bytes are read, only the low 2 are used)
+	movd	xmm2,		[ebp+eax]		; xxxxxxdc  (4 bytes are read, only the low 2 are used)
+	punpcklwd	xmm1,	xmm2			; xxxxdcba
+	punpcklbw	xmm1,	xmm0			; 0d0c0b0a
+
+	movdqa	xmm2,	xmm4	; xmm2:  vv(1-v)(1-v)  tmpv
+	pmulhuw	xmm2,	xmm3	; mul u(1-u)u(1-u) on xmm2, keeping the high 16 bits of each product
+	pmaddwd		xmm2,	xmm1	; dword0 = w0*a + w1*b, dword1 = w2*c + w3*d
+	pshufd	xmm1,	xmm2,	00000001b	; bring dword1 down to dword0
+	paddd	xmm2,	xmm1	; dword0 = sum of the four weighted pixels
+	movd	xmm1,	ebx		; rounding term 16384
+	paddd	xmm2,	xmm1
+	psrld	xmm2,	15
+
+	packuswb	xmm2,	xmm0	; saturate to an unsigned byte
+	movd	eax,	xmm2
+	mov		[edi],	al
+	inc		edi
+
+	mov		eax,		[uiScaleX]
+	add		[xInverse],	eax				; advance the source x position
+
+	paddw	xmm3,		xmm7			; inc u (full 16-bit words, wraps naturally)
+
+	loop	FAST_WIDTH					; dec ecx, repeat while ecx != 0
+
+FAST_WIDTH_END:
+	mov		eax,		[xInverse]
+	shr		eax,		16
+	mov		cl,			[esi+eax]		; last column: copy the source pixel without interpolation
+	mov		[edi],		cl
+	inc		edi
+
+	mov		eax,		[uiScaleY]
+	add		[yInverse],	eax				; advance the source y position
+	add		edi,		[dstStep]		; skip the destination row padding
+
+	paddw	xmm4,	xmm6				; inc v
+	psllw	xmm4,	1
+	psrlw	xmm4,	1					; shift pair clears bit 15: v weights stay 15-bit
+
+	dec		dword [tmpHeight]
+	jg		FAST_HEIGHT
+
+
+FAST_LAST_ROW:							; final output row: copied from a single source row, no interpolation
+	mov		eax,	[yInverse]
+	mov		esi,	[pSrcData]
+	shr		eax,	15
+	mul		dword [dwSrcStride]
+	add		esi,	eax					; get current row address
+
+	mov		eax,		32768
+	mov		[xInverse],		eax
+	mov		ecx,			[dwDstWidth]	; NOTE(review): `loop` below assumes iDstWidth >= 1
+
+FAST_LAST_ROW_WIDTH:
+	mov		eax,		[xInverse]
+	shr		eax,		16
+
+	mov		al,			[esi+eax]
+	mov		[edi],	al
+	inc		edi
+
+	mov		eax,		[uiScaleX]
+	add		[xInverse],	eax
+
+	loop	FAST_LAST_ROW_WIDTH
+
+FAST_LAST_ROW_END:
+
+	add		esp,			localsize
+	pop		ebx
+	pop		edi
+	pop		esi
+	pop		ebp
+%undef		pushsize
+%undef		localsize
+%undef		pSrcData
+%undef		dwSrcWidth
+%undef		dwSrcHeight
+%undef		dwSrcStride
+%undef		pDstData
+%undef		dwDstWidth
+%undef		dwDstHeight
+%undef		dwDstStride
+%undef		scale
+%undef		uiScaleX
+%undef		uiScaleY
+%undef		tmpHeight
+%undef		yInverse
+%undef		xInverse
+%undef		dstStep
+	ret									; NOTE(review): prototype says int, but no instruction loads a dedicated return value into eax
+%endif
--- /dev/null
+++ b/codec/processing/src/x86/vaa.asm
@@ -1,0 +1,2030 @@
+;*!
+;* \copy
+;*     Copyright (c)  2010-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*      vaa.asm
+;*
+;*      Abstract
+;*      sse2 for pVaa routines
+;*
+;*  History
+;*      04/14/2010      Created
+;*              06/07/2010      Added AnalysisVaaInfoIntra_sse2(ssse3)
+;*              06/10/2010      Tune rc_sad_frame_sse2 and got about 40% improvement
+;*              08/11/2010      Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+%macro SUM_SQR_SSE2     3       ; dst, pSrc, zero: dst = sum of squares of the 16 bytes in pSrc (in every dword); pSrc is clobbered
+  movdqa %1, %2
+  punpcklbw %1, %3      ; low 8 bytes -> words (zero must hold 0)
+  punpckhbw %2, %3      ; high 8 bytes -> words
+  pmaddwd %1, %1        ; square and add adjacent pairs -> 4 dwords
+  pmaddwd %2, %2
+  paddd %1, %2
+  pshufd %2, %1, 04Eh   ; 01001110 B: swap the two qwords
+  paddd %1, %2
+  pshufd %2, %1, 0B1h   ; 10110001 B: swap adjacent dwords
+  paddd %1, %2          ; every dword of dst now holds the full sum
+%endmacro       ; END OF SUM_SQR_SSE2
+
+%macro WELS_SAD_16x2_SSE2  3 ;esi :%1 edi:%2 ebx:%3 ; xmm6 += SAD of two 16-byte rows at %1 vs %2 (stride %3); clobbers xmm1-xmm4
+  movdqa        xmm1,   [%1]            ; row 0 of the first block (aligned load)
+  movdqa        xmm2,   [%2]            ; row 0 of the second block
+  movdqa        xmm3,   [%1+%3]         ; row 1 of the first block
+  movdqa        xmm4,   [%2+%3]         ; row 1 of the second block
+  psadbw        xmm1,   xmm2            ; two partial SADs, one per qword
+  psadbw        xmm3,   xmm4
+  paddd xmm6,   xmm1
+  paddd xmm6,   xmm3
+  lea           %1,     [%1+%3*2]       ; advance both pointers by two rows
+  lea           %2,     [%2+%3*2]
+%endmacro
+
+; per the original author's measurement, this outperforms the phaddw (SSSE3) alternative
+%macro SUM_WORD_8x2_SSE2        2       ; dst(pSrc), tmp: horizontal add of the 8 words in dst; the total ends up in word 0
+  ; @sum_8x2 begin
+  pshufd %2, %1, 04Eh   ; 01001110 B: swap the two qwords
+  paddw %1, %2          ; word i += word i+4
+  pshuflw %2, %1, 04Eh  ; 01001110 B: swap the two dwords of the low qword
+  paddw %1, %2          ; word 0 += word 2, word 1 += word 3
+  pshuflw %2, %1, 0B1h  ; 10110001 B: swap adjacent words of the low qword
+  paddw %1, %2          ; word 0 = sum of all 8 words (16-bit wrap-around arithmetic)
+  ; end of @sum_8x2
+%endmacro       ; END of SUM_WORD_8x2_SSE2
+
+%macro  WELS_SAD_SUM_SQSUM_16x1_SSE2 3 ;esi:%1,edi:%2,ebx:%3 ; one 16-byte row: xmm6 += SAD, xmm5 += sum, xmm4 += square sum of [%1]
+  movdqa        xmm1,   [%1]            ; row from the first pointer (aligned load)
+  movdqa        xmm2,   [%2]            ; row from the second pointer
+  movdqa        xmm3,   xmm1
+  psadbw        xmm3,   xmm2
+  paddd         xmm6,   xmm3            ; SAD accumulator (one partial sum per qword)
+
+  movdqa        xmm3,   xmm1
+  psadbw        xmm3,   xmm0            ; SAD against xmm0 = byte sum; assumes the caller zeroed xmm0 - TODO confirm at call sites
+  paddd         xmm5,   xmm3            ; sum accumulator
+
+  movdqa        xmm2,   xmm1
+  punpcklbw     xmm1,   xmm0            ; bytes -> words (again relies on xmm0 == 0)
+  punpckhbw     xmm2,   xmm0
+  pmaddwd               xmm1,   xmm1    ; squares, adjacent pairs added -> dwords
+  pmaddwd               xmm2,   xmm2
+  paddd         xmm4,   xmm1
+  paddd         xmm4,   xmm2            ; square-sum accumulator (4 partial dwords)
+
+  add           %1,     %3              ; advance both pointers by one row
+  add           %2,     %3
+%endmacro
+
+%macro  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 3 ;esi:%1 edi:%2 ebx:%3 ; one 16-byte row: xmm7 += sad, xmm6 += sum, xmm5 += sqsum, xmm4 += sqdiff
+  movdqa        xmm1,   [%1]            ; row from the first pointer (aligned load)
+  movdqa        xmm2,   [%2]            ; row from the second pointer
+  movdqa        xmm3,   xmm1
+  psadbw        xmm3,   xmm2
+  paddd         xmm7,   xmm3    ; sad
+
+  movdqa        xmm3,   xmm1
+  pmaxub        xmm3,   xmm2
+  pminub        xmm2,   xmm1
+  psubb xmm3,   xmm2    ; diff = max - min = per-byte absolute difference
+
+  movdqa        xmm2,   xmm1
+  psadbw        xmm2,   xmm0    ; byte sum of [%1]; assumes the caller zeroed xmm0 - TODO confirm at call sites
+  paddd xmm6,   xmm2    ; sum
+
+  movdqa                xmm2,   xmm1
+  punpcklbw     xmm1,   xmm0
+  punpckhbw     xmm2,   xmm0
+  pmaddwd               xmm1,   xmm1
+  pmaddwd               xmm2,   xmm2
+  paddd         xmm5,   xmm1
+  paddd         xmm5,   xmm2    ; sqsum
+
+  movdqa                xmm1,   xmm3
+  punpcklbw     xmm1,   xmm0
+  punpckhbw     xmm3,   xmm0
+  pmaddwd               xmm1,   xmm1
+  pmaddwd               xmm3,   xmm3
+  paddd         xmm4,   xmm1
+  paddd         xmm4,   xmm3    ; sqdiff
+
+  add           %1,     %3      ; advance both pointers by one row
+  add           %2,     %3
+%endmacro
+
+%macro  WELS_SAD_SD_MAD_16x1_SSE2       7 ;esi:%5 edi:%6 ebx:%7 ; %1..%4 = sad / sum_cur / sum_ref / mad accumulators; one 16-byte row per use
+%define sad_reg                 %1
+%define sum_cur_reg             %2
+%define sum_ref_reg             %3
+%define mad_reg                 %4
+  movdqa        xmm1,           [%5]    ; row from the first pointer ("cur" per the accumulator names)
+  movdqa        xmm2,           [%6]    ; row from the second pointer ("ref" per the accumulator names)
+  movdqa        xmm3,           xmm1
+  psadbw        xmm3,           xmm0    ; byte sum; assumes the caller zeroed xmm0 - TODO confirm at call sites
+  paddd         sum_cur_reg,    xmm3    ; sum_cur
+  movdqa        xmm3,           xmm2
+  psadbw        xmm3,           xmm0
+  paddd sum_ref_reg,                    xmm3    ; sum_ref
+
+  movdqa        xmm3,           xmm1
+  pmaxub        xmm3,           xmm2
+  pminub        xmm2,           xmm1
+  psubb xmm3,           xmm2    ; abs diff
+  pmaxub        mad_reg,        xmm3    ; max abs diff
+
+  psadbw        xmm3,           xmm0    ; summing the abs diffs gives the SAD of the row
+  paddd sad_reg,        xmm3    ; sad
+
+  add                   %5,             %7      ; advance both pointers by one row
+  add                   %6,             %7
+%endmacro
+
+
+%macro  WELS_MAX_REG_SSE2       1       ; byte-max reduction inside %1; only xmm1 is used as scratch here
+%define max_reg  %1
+  movdqa        xmm1,           max_reg
+  psrldq        xmm1,           4
+  pmaxub        max_reg,        xmm1    ; byte i = max(byte i, byte i+4)
+  movdqa        xmm1,           max_reg
+  psrldq        xmm1,           2
+  pmaxub        max_reg,        xmm1    ; ... and byte i+2
+  movdqa        xmm1,           max_reg
+  psrldq        xmm1,           1
+  pmaxub        max_reg,        xmm1    ; byte 0 = max of bytes 0..7, byte 8 = max of bytes 8..15
+%endmacro
+
+%macro  WELS_SAD_BGD_SQDIFF_16x1_SSE2   7 ;esi:%5 edi:%6 ebx:%7 ; %1..%4 = sad+sqsum / sum_cur+sum_ref / mad / sqdiff accumulators
+%define sad_reg         %1
+%define sum_reg         %2
+%define mad_reg         %3
+%define sqdiff_reg      %4
+  movdqa                xmm1,           [%5]    ; row from the first pointer (aligned load)
+  movdqa                xmm2,           xmm1
+  movdqa                xmm3,           xmm1
+  punpcklbw     xmm2,           xmm0            ; bytes -> words; assumes the caller zeroed xmm0 - TODO confirm at call sites
+  punpckhbw     xmm3,           xmm0
+  pmaddwd               xmm2,           xmm2
+  pmaddwd               xmm3,           xmm3
+  paddd         xmm2,           xmm3            ; four partial square sums s0..s3
+  movdqa                xmm3,           xmm2
+  psllq         xmm2,           32              ; dwords: 0 s0 0 s2 (low to high)
+  psrlq         xmm3,           32
+  psllq         xmm3,           32              ; dwords: 0 s1 0 s3 (low to high)
+  paddd         xmm2,           xmm3            ; odd dwords hold s0+s1 and s2+s3, even dwords are 0
+  paddd         sad_reg,        xmm2            ; sqsum (kept in the odd dwords of sad_reg)
+
+  movdqa        xmm2,           [%6]            ; row from the second pointer
+  movdqa        xmm3,           xmm1
+  psadbw        xmm3,           xmm0
+  paddd sum_reg,                        xmm3    ; sum_cur (even dwords of sum_reg)
+  movdqa        xmm3,           xmm2
+  psadbw        xmm3,           xmm0
+  pslldq        xmm3,           4               ; move the sums up one dword
+  paddd sum_reg,                        xmm3    ; sum_ref (odd dwords of sum_reg)
+
+  movdqa        xmm3,           xmm1
+  pmaxub        xmm3,           xmm2
+  pminub        xmm2,           xmm1
+  psubb xmm3,           xmm2    ; abs diff
+  pmaxub        mad_reg,        xmm3    ; max abs diff
+
+  movdqa        xmm1,           xmm3            ; keep the abs diffs for the sqdiff step
+  psadbw        xmm3,           xmm0
+  paddd sad_reg,        xmm3    ; sad (even dwords of sad_reg)
+
+  movdqa                xmm3,   xmm1
+  punpcklbw     xmm1,   xmm0
+  punpckhbw     xmm3,   xmm0
+  pmaddwd               xmm1,   xmm1
+  pmaddwd               xmm3,   xmm3
+  paddd         sqdiff_reg,     xmm1
+  paddd         sqdiff_reg,     xmm3    ; sqdiff
+
+  add           %5,     %7      ; advance both pointers by one row
+  add           %6,     %7
+%endmacro
+
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+%ifdef X86_32
+
+;***********************************************************************
+;   void SampleVariance16x16_sse2(      uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
+;***********************************************************************
+WELS_EXTERN SampleVariance16x16_sse2
+  push esi
+  push edi
+  push ebx
+  ; For one 16x16 block: accumulate sum / square-sum of |y_ref - y_src| and of y_src, then store two 16-bit variance values.
+  sub esp, 16                   ; four dword accumulators
+  %define SUM                   [esp]
+  %define SUM_CUR               [esp+4]
+  %define SQR                   [esp+8]
+  %define SQR_CUR               [esp+12]
+  %define PUSH_SIZE     28      ; 12 + 16
+
+  mov edi, [esp+PUSH_SIZE+4]    ; y_ref
+  mov edx, [esp+PUSH_SIZE+8]    ; y_ref_stride
+  mov esi, [esp+PUSH_SIZE+12]   ; y_src
+  mov eax, [esp+PUSH_SIZE+16]   ; y_src_stride
+  mov ecx, 010h                         ; height = 16
+
+  pxor xmm7, xmm7               ; xmm7 = 0 for the byte -> word unpacks
+  movdqu SUM, xmm7              ; one 16-byte store clears SUM, SUM_CUR, SQR and SQR_CUR
+
+.hloops:
+  movdqa xmm0, [edi]            ; y_ref
+  movdqa xmm1, [esi]            ; y_src
+  movdqa xmm2, xmm0             ; store first for future process
+  movdqa xmm3, xmm1
+  ; sum += diff;
+  movdqa xmm4, xmm0
+  psadbw xmm4, xmm1             ; 2 parts, [0,..,15], [64,..,79]
+  ; to be continued for sum
+  pshufd xmm5, xmm4, 0C6h       ; 11000110 B: bring the upper partial sum down to dword 0
+  paddw xmm4, xmm5
+  movd ebx, xmm4
+  add SUM, ebx
+
+  ; sqr += diff * diff;
+  pmaxub xmm0, xmm1
+  pminub xmm1, xmm2
+  psubb xmm0, xmm1                              ; diff
+  SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero
+  movd ebx, xmm1
+  add SQR, ebx
+
+  ; sum_cur += y_src[x];
+  movdqa xmm0, xmm3             ; cur_orig
+  movdqa xmm1, xmm0
+  punpcklbw xmm0, xmm7
+  punpckhbw xmm1, xmm7
+  paddw xmm0, xmm1              ; 8x2
+  SUM_WORD_8x2_SSE2 xmm0, xmm1
+  movd ebx, xmm0
+  and ebx, 0ffffh               ; only word 0 carries the total
+  add SUM_CUR, ebx
+
+  ; sqr_cur += y_src[x] * y_src[x];
+  SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero
+  movd ebx, xmm0
+  add SQR_CUR, ebx
+
+  lea edi, [edi+edx]
+  lea esi, [esi+eax]
+  dec ecx
+  jnz near .hloops
+
+  ; zero-extending load: same value as "mov ebx, 0 / mov bx, word SUM" without the partial-register write
+  movzx ebx, word SUM
+  sar ebx, 8                    ; mean of |diff| over 256 pixels
+  imul ebx, ebx
+  mov ecx, SQR
+  sar ecx, 8                    ; mean of diff^2
+  sub ecx, ebx                  ; mean of squares - square of mean
+  mov edi, [esp+PUSH_SIZE+20]   ; pMotionTexture
+  mov [edi], cx                         ; to store uiMotionIndex
+  ; zero-extending load: same value as "mov ebx, 0 / mov bx, word SUM_CUR" without the partial-register write
+  movzx ebx, word SUM_CUR
+  sar ebx, 8                    ; mean of y_src over 256 pixels
+  imul ebx, ebx
+  mov ecx, SQR_CUR
+  sar ecx, 8                    ; mean of y_src^2
+  sub ecx, ebx                  ; mean of squares - square of mean
+  mov [edi+2], cx                               ; to store uiTextureIndex
+
+  %undef SUM
+  %undef SUM_CUR
+  %undef SQR
+  %undef SQR_CUR
+  %undef PUSH_SIZE
+
+  add esp, 16
+  pop ebx
+  pop edi
+  pop esi
+
+  ret
+
+
+
+;*************************************************************************************************************
+;void VAACalcSad_sse2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;                                                               int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8)
+;*************************************************************************************************************
+
+
+WELS_EXTERN VAACalcSad_sse2   ; x86-32 variant: walks the picture in 16x16 blocks, writing SADs to psad8x8 and a total to *psadframe
+%define         cur_data                        esp + pushsize + 4
+%define         ref_data                        esp + pushsize + 8
+%define         iPicWidth                       esp + pushsize + 12
+%define         iPicHeight                      esp + pushsize + 16
+%define         iPicStride                      esp + pushsize + 20
+%define         psadframe                       esp + pushsize + 24
+%define         psad8x8                         esp + pushsize + 28
+%define         pushsize        12      ; three pushed registers; expanded at use, so defining it after the macros above is fine
+  push  esi
+  push  edi
+  push  ebx
+  mov           esi,    [cur_data]
+  mov           edi,    [ref_data]
+  mov           ebx,    [iPicStride]
+  mov           edx,    [psad8x8]
+  mov           eax,    ebx
+
+  shr           dword [iPicWidth],      4                                       ; iPicWidth/16, updated in the argument's stack slot
+  shr           dword [iPicHeight],     4                                       ; iPicHeight/16, updated in the argument's stack slot
+  shl           eax,    4                                                               ; iPicStride*16
+  pxor  xmm0,   xmm0            ; xmm0 = 0; not referenced directly below - presumably used by WELS_SAD_16x2_SSE2 (defined elsewhere)
+  pxor  xmm7,   xmm7            ; iFrameSad
+height_loop:                    ; one iteration per row of 16x16 blocks
+  mov           ecx,    dword [iPicWidth]       ; ecx = 16-pixel-wide blocks left in this row
+  push  esi                     ; save the row start; the esp-relative argument macros are off by 8 until the pops below
+  push  edi
+width_loop:                     ; one iteration per 16x16 block
+  pxor  xmm6,   xmm6            ; accumulator for the first group of four macro invocations
+  WELS_SAD_16x2_SSE2 esi,edi,ebx
+  WELS_SAD_16x2_SSE2 esi,edi,ebx
+  WELS_SAD_16x2_SSE2 esi,edi,ebx
+  WELS_SAD_16x2_SSE2 esi,edi,ebx
+  paddd xmm7,           xmm6    ; add this group's SADs to the frame total
+  movd  [edx],          xmm6    ; psad8x8[0] = dword 0 of xmm6
+  psrldq        xmm6,           8
+  movd  [edx+4],        xmm6    ; psad8x8[1] = dword 2 of xmm6
+
+  pxor  xmm6,   xmm6            ; accumulator for the second group of four macro invocations
+  WELS_SAD_16x2_SSE2 esi,edi,ebx
+  WELS_SAD_16x2_SSE2 esi,edi,ebx
+  WELS_SAD_16x2_SSE2 esi,edi,ebx
+  WELS_SAD_16x2_SSE2 esi,edi,ebx
+  paddd xmm7,           xmm6    ; add this group's SADs to the frame total
+  movd  [edx+8],        xmm6    ; psad8x8[2] = dword 0 of xmm6
+  psrldq        xmm6,           8
+  movd  [edx+12],       xmm6    ; psad8x8[3] = dword 2 of xmm6
+
+  add           edx,    16      ; four int32 outputs were written
+  sub           esi,    eax     ; back up by 16 rows (presumably what the eight macro invocations advanced)
+  sub           edi,    eax
+  add           esi,    16      ; step right to the next 16-pixel column
+  add           edi,    16
+
+  dec           ecx
+  jnz           width_loop
+
+  pop           edi             ; restore the row start saved at height_loop
+  pop           esi
+  add           esi,    eax     ; move down 16 rows
+  add           edi,    eax
+
+  dec   dword [iPicHeight]      ; the remaining-rows counter lives in the argument slot
+  jnz           height_loop
+
+  mov           edx,    [psadframe]
+  movdqa        xmm5,   xmm7
+  psrldq        xmm7,   8
+  paddd xmm7,   xmm5            ; low dword = dword 0 + dword 2 of the frame accumulator
+  movd  [edx],  xmm7            ; *psadframe = frame SAD
+
+%undef          cur_data
+%undef          ref_data
+%undef          iPicWidth
+%undef          iPicHeight
+%undef          iPicStride
+%undef          psadframe
+%undef          psad8x8
+%undef          pushsize
+  pop           ebx
+  pop           edi
+  pop           esi
+  ret
+
+%else  ;64-bit
+
+;***********************************************************************
+;   void SampleVariance16x16_sse2(      uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
+;***********************************************************************
+WELS_EXTERN SampleVariance16x16_sse2   ; 64-bit variant: the four accumulators live in registers instead of stack locals
+  %define SUM                   r10     ; sum of the per-row SADs          (32-bit build: [esp])
+  %define SUM_CUR               r11     ; sum of the y_src pixel values    (32-bit build: [esp+4])
+  %define SQR                   r13     ; SUM_SQR_SSE2 results, diffs      (32-bit build: [esp+8])
+  %define SQR_CUR               r15     ; SUM_SQR_SSE2 results, y_src      (32-bit build: [esp+12])
+
+  push r12
+  push r13
+  push r14
+  push r15
+  %assign push_num 4
+  LOAD_5_PARA                   ; argument-loading macro defined elsewhere; the code below uses r0..r4
+  PUSH_XMM 8
+  SIGN_EXTENSION r1,r1d         ; y_ref_stride
+  SIGN_EXTENSION r3,r3d         ; y_src_stride
+
+  mov r12,010h                  ; height = 16
+  pxor xmm7, xmm7               ; xmm7 = 0, passed as the zero operand to the unpacks and SUM_SQR_SSE2
+  movq SUM, xmm7                ; clear the four accumulators
+  movq SUM_CUR,xmm7
+  movq SQR,xmm7
+  movq SQR_CUR,xmm7
+
+.hloops:                        ; one iteration per 16-pixel row
+  mov r14,0                     ; r14 is the scratch register for moving results out of xmm
+  movdqa xmm0, [r0]             ; y_ref (aligned load: the row must be 16-byte aligned)
+  movdqa xmm1, [r2]             ; y_src (aligned load: the row must be 16-byte aligned)
+  movdqa xmm2, xmm0             ; keep a copy of the y_ref row for pminub below
+  movdqa xmm3, xmm1             ; keep a copy of the y_src row for sum_cur / sqr_cur
+  ; sum += diff;
+  movdqa xmm4, xmm0
+  psadbw xmm4, xmm1             ; two partial SADs, in bits [0..15] and [64..79]
+  ; fold the two partial SADs into the low word
+  pshufd xmm5, xmm4, 0C6h       ; 11000110b: exchanges dwords 0 and 2
+  paddw xmm4, xmm5              ; low word = SAD of the whole row
+  movd r14d, xmm4
+  add SUM, r14
+
+  ; sqr += diff * diff;
+  pmaxub xmm0, xmm1             ; per-byte max(y_ref, y_src)
+  pminub xmm1, xmm2             ; per-byte min(y_src, y_ref)
+  psubb xmm0, xmm1                              ; max - min = |y_ref - y_src| per byte
+  SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero (macro defined outside this section)
+  movd r14d, xmm1
+  add SQR, r14
+
+  ; sum_cur += y_src[x];
+  movdqa xmm0, xmm3             ; cur_orig
+  movdqa xmm1, xmm0
+  punpcklbw xmm0, xmm7          ; low 8 pixels widened to words
+  punpckhbw xmm1, xmm7          ; high 8 pixels widened to words
+  paddw xmm0, xmm1              ; 8x2
+  SUM_WORD_8x2_SSE2 xmm0, xmm1  ; presumably reduces the 8 words to a total in the low word (macro not shown here)
+  movd r14d, xmm0
+  and r14, 0ffffh               ; keep only the low word of the result
+  add SUM_CUR, r14
+
+  ; sqr_cur += y_src[x] * y_src[x];
+  SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero
+  movd r14d, xmm0
+  add SQR_CUR, r14
+
+  lea r0, [r0+r1]               ; y_ref += y_ref_stride
+  lea r2, [r2+r3]               ; y_src += y_src_stride
+  dec r12
+  jnz near .hloops
+
+  mov r0, SUM
+  sar r0, 8                     ; SUM >> 8
+  imul r0, r0                   ; (SUM >> 8)^2
+  mov r1, SQR
+  sar r1, 8                     ; SQR >> 8
+  sub r1, r0                    ; (SQR >> 8) - (SUM >> 8)^2
+  mov [r4], r1w                         ; 16-bit store to uiMotionIndex
+  mov r0, SUM_CUR
+  sar r0, 8                     ; SUM_CUR >> 8
+  imul r0, r0                   ; (SUM_CUR >> 8)^2
+  mov r1, SQR_CUR
+  sar r1, 8                     ; SQR_CUR >> 8
+  sub r1, r0                    ; (SQR_CUR >> 8) - (SUM_CUR >> 8)^2
+  mov [r4+2], r1w                               ; 16-bit store to uiTextureIndex
+  POP_XMM                       ; epilogue mirrors the prologue in reverse order
+  LOAD_5_PARA_POP
+  pop r15
+  pop r14
+  pop r13
+  pop r12
+  %undef SUM                    ; do not let the accumulator aliases leak into later code
+  %undef SUM_CUR
+  %undef SQR
+  %undef SQR_CUR
+  %assign push_num 0
+  ret
+
+
+;*************************************************************************************************************
+;void VAACalcSad_sse2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;                                                               int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8)
+;*************************************************************************************************************
+
+
+WELS_EXTERN VAACalcSad_sse2   ; 64-bit variant: walks the picture in 16x16 blocks, writing SADs to psad8x8 and a total to *psadframe
+%define         cur_data                        r0
+%define         ref_data                        r1
+%define         iPicWidth                       r2
+%define         iPicHeight              r3
+%define         iPicStride              r4
+%define         psadframe                       r5
+%define         psad8x8                         r6
+  ; the code below mostly names r0..r6 directly; only psadframe is used through its alias
+  push r12
+  push r13
+  %assign push_num 2
+  LOAD_7_PARA                   ; argument-loading macro defined elsewhere; the code below uses r0..r6
+  PUSH_XMM 8
+  SIGN_EXTENSION r2,r2d         ; iPicWidth
+  SIGN_EXTENSION r3,r3d         ; iPicHeight
+  SIGN_EXTENSION r4,r4d         ; iPicStride
+
+  mov   r12,r4
+  shr           r2,     4                                       ; iPicWidth/16
+  shr           r3,     4                                       ; iPicHeight/16
+
+  shl           r12,    4                                                               ; iPicStride*16
+  pxor  xmm0,   xmm0            ; xmm0 = 0; not referenced directly below - presumably used by WELS_SAD_16x2_SSE2 (defined elsewhere)
+  pxor  xmm7,   xmm7            ; iFrameSad
+height_loop:                    ; one iteration per row of 16x16 blocks
+  mov           r13,    r2      ; r13 = 16-pixel-wide blocks left in this row
+  push  r0                      ; save the row start of cur_data
+  push  r1                      ; save the row start of ref_data
+width_loop:                     ; one iteration per 16x16 block
+  pxor  xmm6,   xmm6            ; accumulator for the first group of four macro invocations
+  WELS_SAD_16x2_SSE2 r0,r1,r4
+  WELS_SAD_16x2_SSE2 r0,r1,r4
+  WELS_SAD_16x2_SSE2 r0,r1,r4
+  WELS_SAD_16x2_SSE2 r0,r1,r4
+  paddd xmm7,           xmm6    ; add this group's SADs to the frame total
+  movd  [r6],           xmm6    ; psad8x8[0] = dword 0 of xmm6
+  psrldq        xmm6,           8
+  movd  [r6+4], xmm6            ; psad8x8[1] = dword 2 of xmm6
+
+  pxor  xmm6,   xmm6            ; accumulator for the second group of four macro invocations
+  WELS_SAD_16x2_SSE2 r0,r1,r4
+  WELS_SAD_16x2_SSE2 r0,r1,r4
+  WELS_SAD_16x2_SSE2 r0,r1,r4
+  WELS_SAD_16x2_SSE2 r0,r1,r4
+  paddd xmm7,           xmm6    ; add this group's SADs to the frame total
+  movd  [r6+8], xmm6            ; psad8x8[2] = dword 0 of xmm6
+  psrldq        xmm6,           8
+  movd  [r6+12],        xmm6    ; psad8x8[3] = dword 2 of xmm6
+
+  add           r6,     16      ; four int32 outputs were written
+  sub           r0,     r12     ; back up by 16 rows (presumably what the eight macro invocations advanced)
+  sub           r1,     r12
+  add           r0,     16      ; step right to the next 16-pixel column
+  add           r1,     16
+
+  dec           r13
+  jnz           width_loop
+
+  pop           r1              ; restore the row start saved at height_loop
+  pop           r0
+  add           r0,     r12     ; move down 16 rows
+  add           r1,     r12
+
+  dec   r3
+  jnz           height_loop
+
+  ;mov          r13,    [psadframe]
+  movdqa        xmm5,   xmm7
+  psrldq        xmm7,   8
+  paddd xmm7,   xmm5            ; low dword = dword 0 + dword 2 of the frame accumulator
+  movd  [psadframe],    xmm7    ; psadframe expands to r5, so this stores through the pointer: *psadframe = frame SAD
+
+%undef          cur_data
+%undef          ref_data
+%undef          iPicWidth
+%undef          iPicHeight
+%undef          iPicStride
+%undef          psadframe
+%undef          psad8x8
+%undef          pushsize
+  POP_XMM
+  LOAD_7_PARA_POP
+  pop r13
+  pop r12
+  %assign push_num 0
+  ret
+
+%endif
+
+
+%ifdef X86_32
+;*************************************************************************************************************
+;void VAACalcSadVar_sse2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;               int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16)
+;*************************************************************************************************************
+
+
+WELS_EXTERN VAACalcSadVar_sse2   ; x86-32 variant: per 16x16 block writes four SADs, one sum and one square sum, plus a frame SAD
+%define         localsize               8       ; two dword locals: tmp_esi and tmp_edi
+%define         cur_data                        esp + pushsize + localsize + 4
+%define         ref_data                        esp + pushsize + localsize + 8
+%define         iPicWidth                       esp + pushsize + localsize + 12
+%define         iPicHeight                      esp + pushsize + localsize + 16
+%define         iPicStride                      esp + pushsize + localsize + 20
+%define         psadframe                       esp + pushsize + localsize + 24
+%define         psad8x8                         esp + pushsize + localsize + 28
+%define         psum16x16                       esp + pushsize + localsize + 32
+%define         psqsum16x16                     esp + pushsize + localsize + 36
+%define         tmp_esi                         esp + 0
+%define         tmp_edi                         esp + 4
+%define         pushsize                16      ; four pushed registers; expanded at use, so defining it last is fine
+  push  ebp
+  push  esi
+  push  edi
+  push  ebx
+  sub           esp,    localsize
+  mov           esi,    [cur_data]
+  mov           edi,    [ref_data]
+  mov           ebx,    [iPicStride]
+  mov           edx,    [psad8x8]
+  mov           eax,    ebx
+
+  shr           dword [iPicWidth],      4                                       ; iPicWidth/16, updated in the argument's stack slot
+  shr           dword [iPicHeight],     4                                       ; iPicHeight/16, updated in the argument's stack slot
+  shl           eax,    4                                                       ; iPicStride*16
+  pxor  xmm0,   xmm0            ; xmm0 = 0; not referenced directly below - presumably used by the SAD macro (defined elsewhere)
+  pxor  xmm7,   xmm7            ; iFrameSad
+var_height_loop:                ; one iteration per row of 16x16 blocks
+  mov           ecx,    dword [iPicWidth]       ; ecx = 16-pixel-wide blocks left in this row
+  mov           [tmp_esi],      esi             ; remember the row start in the locals (esp stays fixed)
+  mov           [tmp_edi],      edi
+var_width_loop:                 ; one iteration per 16x16 block
+  pxor  xmm6,   xmm6            ; hiQuad_loQuad pSad8x8
+  pxor  xmm5,   xmm5            ; pSum16x16
+  pxor  xmm4,   xmm4            ; sqsum_16x16
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+  paddd xmm7,           xmm6    ; add the first group's SADs to the frame total
+  movd  [edx],          xmm6    ; psad8x8[0] = dword 0 of xmm6
+  psrldq        xmm6,           8
+  movd  [edx+4],        xmm6    ; psad8x8[1] = dword 2 of xmm6
+
+  pxor  xmm6,   xmm6            ; restart the SAD accumulator for the second group of eight invocations
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+  paddd xmm7,           xmm6    ; add the second group's SADs to the frame total
+  movd  [edx+8],        xmm6    ; psad8x8[2] = dword 0 of xmm6
+  psrldq        xmm6,           8
+  movd  [edx+12],       xmm6    ; psad8x8[3] = dword 2 of xmm6
+
+  mov           ebp,    [psum16x16]     ; ebp is a scratch pointer; the output cursor lives in the argument slot
+  movdqa        xmm1,   xmm5
+  psrldq        xmm1,   8
+  paddd xmm5,   xmm1            ; low dword = dword 0 + dword 2 of the sum accumulator
+  movd  [ebp],  xmm5            ; *psum16x16 = block sum
+  add           dword [psum16x16], 4    ; advance the stored pointer by one int32
+
+  movdqa        xmm5,   xmm4
+  psrldq        xmm5,   8
+  paddd xmm4,   xmm5            ; dwords 0,1 = d0+d2, d1+d3
+  movdqa        xmm3,   xmm4
+  psrldq        xmm3,   4
+  paddd xmm4,   xmm3            ; low dword = total of all four dwords
+
+  mov           ebp,    [psqsum16x16]
+  movd  [ebp],  xmm4            ; *psqsum16x16 = block square sum
+  add           dword [psqsum16x16], 4  ; advance the stored pointer by one int32
+
+  add           edx,    16      ; four int32 SAD outputs were written
+  sub           esi,    eax     ; back up by 16 rows (presumably what the sixteen macro invocations advanced)
+  sub           edi,    eax
+  add           esi,    16      ; step right to the next 16-pixel column
+  add           edi,    16
+
+  dec           ecx
+  jnz           var_width_loop
+
+  mov           esi,    [tmp_esi]       ; back to the row start
+  mov           edi,    [tmp_edi]
+  add           esi,    eax             ; move down 16 rows
+  add           edi,    eax
+
+  dec   dword [iPicHeight]      ; the remaining-rows counter lives in the argument slot
+  jnz           var_height_loop
+
+  mov           edx,    [psadframe]
+  movdqa        xmm5,   xmm7
+  psrldq        xmm7,   8
+  paddd xmm7,   xmm5            ; low dword = dword 0 + dword 2 of the frame accumulator
+  movd  [edx],  xmm7            ; *psadframe = frame SAD
+
+  add           esp,    localsize
+  pop           ebx
+  pop           edi
+  pop           esi
+  pop           ebp
+%undef          cur_data
+%undef          ref_data
+%undef          iPicWidth
+%undef          iPicHeight
+%undef          iPicStride
+%undef          psadframe
+%undef          psad8x8
+%undef          psum16x16
+%undef          psqsum16x16
+%undef          tmp_esi
+%undef          tmp_edi
+%undef          pushsize
+%undef          localsize
+  ret
+
+%else  ;64-bit
+
+;*************************************************************************************************************
+;void VAACalcSadVar_sse2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;               int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16)
+;*************************************************************************************************************
+
+
+WELS_EXTERN VAACalcSadVar_sse2   ; 64-bit variant: per 16x16 block writes four SADs, one sum and one square sum, plus a frame SAD
+%define         cur_data                        arg1 ;r0
+%define         ref_data                        arg2 ;r1
+%define         iPicWidth                       arg3 ;r2
+%define         iPicHeight                  arg4 ;r3
+%define         iPicStride                  arg5
+%define         psadframe                       arg6
+%define         psad8x8                         arg7
+%define         psum16x16                       arg8
+%define         psqsum16x16                 arg9
+  ; r12..r15 are saved here and used as scratch below (together with r11)
+  push r12
+  push r13
+  push r14
+  push r15
+  %assign push_num 4
+  PUSH_XMM 8
+
+%ifdef WIN64
+  mov r4, arg5  ;iPicStride
+  mov r5, arg6  ;psadframe (r5 is not referenced again in this body; psadframe is re-read from arg6 at the end)
+%endif
+  mov r14,arg7                  ; r14 = psad8x8 output cursor
+  SIGN_EXTENSION r2,r2d         ; iPicWidth
+  SIGN_EXTENSION r3,r3d         ; iPicHeight
+  SIGN_EXTENSION r4,r4d         ; iPicStride
+
+  mov   r13,r4
+  shr   r2,4                    ; iPicWidth/16
+  shr   r3,4                    ; iPicHeight/16
+
+  shl   r13,4   ; iPicStride*16
+  pxor  xmm0,   xmm0            ; xmm0 = 0; not referenced directly below - presumably used by the SAD macro (defined elsewhere)
+  pxor  xmm7,   xmm7            ; iFrameSad
+var_height_loop:                ; one iteration per row of 16x16 blocks
+  push    r2                    ; r2 doubles as the inner-loop counter, so keep the per-row block count on the stack
+  %assign push_num push_num+1   ; keeps the stack-based argN macros correct while r2 is pushed
+  mov           r11,    r0      ; remember the row start of cur_data
+  mov           r12,    r1      ; remember the row start of ref_data
+var_width_loop:                 ; one iteration per 16x16 block
+  pxor  xmm6,   xmm6            ; hiQuad_loQuad pSad8x8
+  pxor  xmm5,   xmm5            ; pSum16x16
+  pxor  xmm4,   xmm4            ; sqsum_16x16
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+  paddd xmm7,           xmm6    ; add the first group's SADs to the frame total
+  movd  [r14],          xmm6    ; psad8x8[0] = dword 0 of xmm6
+  psrldq        xmm6,           8
+  movd  [r14+4],        xmm6    ; psad8x8[1] = dword 2 of xmm6
+
+  pxor  xmm6,   xmm6            ; restart the SAD accumulator for the second group of eight invocations
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+  paddd   xmm7,           xmm6  ; add the second group's SADs to the frame total
+  movd    [r14+8],        xmm6  ; psad8x8[2] = dword 0 of xmm6
+  psrldq  xmm6,           8
+  movd    [r14+12],       xmm6  ; psad8x8[3] = dword 2 of xmm6
+
+  mov             r15,    psum16x16     ; the output cursor is a 64-bit pointer kept in its argument slot
+  movdqa  xmm1,   xmm5
+  psrldq  xmm1,   8
+  paddd   xmm5,   xmm1          ; low dword = dword 0 + dword 2 of the sum accumulator
+  movd    [r15],  xmm5          ; *psum16x16 = block sum
+  add             qword psum16x16, 4    ; 64-bit add: a dword add would lose the carry into the pointer's upper half
+
+  movdqa  xmm5,   xmm4
+  psrldq  xmm5,   8
+  paddd   xmm4,   xmm5          ; dwords 0,1 = d0+d2, d1+d3
+  movdqa  xmm3,   xmm4
+  psrldq  xmm3,   4
+  paddd   xmm4,   xmm3          ; low dword = total of all four dwords
+
+  mov             r15,    psqsum16x16
+  movd    [r15],  xmm4          ; *psqsum16x16 = block square sum
+  add             qword psqsum16x16, 4  ; 64-bit add, same reason as for psum16x16
+
+  add             r14,16        ; four int32 SAD outputs were written
+  sub             r0,     r13   ; back up by 16 rows (presumably what the sixteen macro invocations advanced)
+  sub             r1,     r13
+  add             r0,     16    ; step right to the next 16-pixel column
+  add             r1,     16
+
+  dec             r2
+  jnz             var_width_loop
+
+  pop     r2                    ; restore the per-row block count
+  %assign push_num push_num-1
+  mov             r0,     r11   ; back to the row start
+  mov             r1,     r12
+  add             r0,     r13   ; move down 16 rows
+  add             r1,     r13
+  dec     r3
+  jnz             var_height_loop
+
+  mov             r15,    psadframe
+  movdqa  xmm5,   xmm7
+  psrldq  xmm7,   8
+  paddd   xmm7,   xmm5          ; low dword = dword 0 + dword 2 of the frame accumulator
+  movd    [r15],  xmm7          ; *psadframe = frame SAD
+
+  POP_XMM
+  pop r15
+  pop r14
+  pop r13
+  pop r12
+%assign push_num 0
+%undef          cur_data
+%undef          ref_data
+%undef          iPicWidth
+%undef          iPicHeight
+%undef          iPicStride
+%undef          psadframe
+%undef          psad8x8
+%undef          psum16x16
+%undef          psqsum16x16
+%undef          tmp_esi
+%undef          tmp_edi
+%undef          pushsize
+%undef          localsize
+  ret
+
+%endif
+
+%ifdef X86_32
+
+;*************************************************************************************************************
+;void VAACalcSadSsd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;       int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
+;*************************************************************************************************************
+
+
+WELS_EXTERN VAACalcSadSsd_sse2   ; x86-32 variant: per 16x16 block writes four SADs, sum, square sum and squared-difference sum
+%define         localsize               12      ; three dword locals: tmp_esi, tmp_edi, tmp_sadframe
+%define         cur_data                        esp + pushsize + localsize + 4
+%define         ref_data                        esp + pushsize + localsize + 8
+%define         iPicWidth                       esp + pushsize + localsize + 12
+%define         iPicHeight                      esp + pushsize + localsize + 16
+%define         iPicStride                      esp + pushsize + localsize + 20
+%define         psadframe                       esp + pushsize + localsize + 24
+%define         psad8x8                         esp + pushsize + localsize + 28
+%define         psum16x16                       esp + pushsize + localsize + 32
+%define         psqsum16x16                     esp + pushsize + localsize + 36
+%define         psqdiff16x16            esp + pushsize + localsize + 40
+%define         tmp_esi                         esp + 0
+%define         tmp_edi                         esp + 4
+%define         tmp_sadframe            esp + 8
+%define         pushsize                16      ; four pushed registers; expanded at use, so defining it last is fine
+  push    ebp
+  push    esi
+  push    edi
+  push    ebx
+  sub             esp,    localsize
+
+  ; register roles: esi = cur_data cursor, edi = ref_data cursor, ebx = iPicStride,
+  ;                 edx = psad8x8 cursor, eax = 16*iPicStride, ecx = blocks left in the row
+  mov             esi,    [cur_data]
+  mov             edi,    [ref_data]
+  mov             ebx,    [iPicStride]
+  mov             edx,    [psad8x8]
+  mov             eax,    ebx
+
+  shr             dword [iPicWidth],      4                                       ; iPicWidth/16, updated in the argument's stack slot
+  shr             dword [iPicHeight],     4                                       ; iPicHeight/16, updated in the argument's stack slot
+  shl             eax,    4                                                       ; iPicStride*16
+  ; the frame SAD is accumulated in the tmp_sadframe stack slot; ebp is only a
+  ; scratch register (SAD transfer and output-pointer loads)
+  pxor    xmm0,   xmm0
+  movd    [tmp_sadframe], xmm0    ; frame SAD = 0
+sqdiff_height_loop:               ; one iteration per row of 16x16 blocks
+  mov             ecx,    dword [iPicWidth]       ; ecx = 16-pixel-wide blocks left in this row
+  mov             [tmp_esi],      esi             ; remember the row start in the locals
+  mov             [tmp_edi],      edi
+sqdiff_width_loop:                ; one iteration per 16x16 block
+  pxor    xmm7,   xmm7            ; hiQuad_loQuad pSad8x8
+  pxor    xmm6,   xmm6            ; pSum16x16
+  pxor    xmm5,   xmm5            ; sqsum_16x16  four dword
+  pxor    xmm4,   xmm4            ; sqdiff_16x16  four Dword
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+  movdqa  xmm1,           xmm7
+  movd    [edx],          xmm7    ; psad8x8[0] = dword 0 of xmm7
+  psrldq  xmm7,           8
+  paddd   xmm1,           xmm7    ; low dword = dword 0 + dword 2
+  movd    [edx+4],        xmm7    ; psad8x8[1] = original dword 2 of xmm7
+  movd    ebp,            xmm1
+  add             [tmp_sadframe], ebp     ; frame SAD += both SADs of the first group
+
+  pxor    xmm7,   xmm7            ; restart the SAD accumulator for the second group of eight invocations
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+  movdqa  xmm1,           xmm7
+  movd    [edx+8],        xmm7    ; psad8x8[2] = dword 0 of xmm7
+  psrldq  xmm7,           8
+  paddd   xmm1,           xmm7    ; low dword = dword 0 + dword 2
+  movd    [edx+12],       xmm7    ; psad8x8[3] = original dword 2 of xmm7
+  movd    ebp,            xmm1
+  add             [tmp_sadframe], ebp     ; frame SAD += both SADs of the second group
+
+  mov             ebp,    [psum16x16]     ; the output cursor lives in the argument slot
+  movdqa  xmm1,   xmm6
+  psrldq  xmm1,   8
+  paddd   xmm6,   xmm1            ; low dword = dword 0 + dword 2 of the sum accumulator
+  movd    [ebp],  xmm6            ; *psum16x16 = block sum
+  add             dword [psum16x16], 4    ; advance the stored pointer by one int32
+
+  mov             ebp,    [psqsum16x16]
+  pshufd  xmm6,   xmm5,   14 ;00001110b: brings dwords 2,3 down to positions 0,1
+  paddd   xmm6,   xmm5            ; dwords 0,1 = d0+d2, d1+d3
+  pshufd  xmm5,   xmm6,   1  ;00000001b: brings dword 1 down to position 0
+  paddd   xmm5,   xmm6            ; low dword = total of all four dwords
+  movd    [ebp],  xmm5            ; *psqsum16x16 = block square sum
+  add             dword [psqsum16x16], 4  ; advance the stored pointer by one int32
+
+  mov             ebp,    [psqdiff16x16]
+  pshufd  xmm5,   xmm4,   14      ; 00001110b: brings dwords 2,3 down to positions 0,1
+  paddd   xmm5,   xmm4            ; dwords 0,1 = d0+d2, d1+d3
+  pshufd  xmm4,   xmm5,   1       ; 00000001b: brings dword 1 down to position 0
+  paddd   xmm4,   xmm5            ; low dword = total of all four dwords
+  movd    [ebp],  xmm4            ; *psqdiff16x16 = block squared-difference sum
+  add             dword   [psqdiff16x16], 4       ; advance the stored pointer by one int32
+
+  add             edx,    16      ; four int32 SAD outputs were written
+  sub             esi,    eax     ; back up by 16 rows (presumably what the sixteen macro invocations advanced)
+  sub             edi,    eax
+  add             esi,    16      ; step right to the next 16-pixel column
+  add             edi,    16
+
+  dec             ecx
+  jnz             sqdiff_width_loop
+
+  mov             esi,    [tmp_esi]       ; back to the row start
+  mov             edi,    [tmp_edi]
+  add             esi,    eax             ; move down 16 rows
+  add             edi,    eax
+
+  dec     dword [iPicHeight]      ; the remaining-rows counter lives in the argument slot
+  jnz             sqdiff_height_loop
+
+  mov             ebx,    [tmp_sadframe]
+  mov             eax,    [psadframe]
+  mov             [eax],  ebx     ; *psadframe = frame SAD
+
+  add             esp,    localsize
+  pop             ebx
+  pop             edi
+  pop             esi
+  pop             ebp
+%undef          cur_data
+%undef          ref_data
+%undef          iPicWidth
+%undef          iPicHeight
+%undef          iPicStride
+%undef          psadframe
+%undef          psad8x8
+%undef          psum16x16
+%undef          psqsum16x16
+%undef          psqdiff16x16
+%undef          tmp_esi
+%undef          tmp_edi
+%undef          tmp_sadframe
+%undef          pushsize
+%undef          localsize
+  ret
+
+%else
+
+
+;*************************************************************************************************************
+;void VAACalcSadSsd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;       int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
+;*************************************************************************************************************
+
+
+WELS_EXTERN VAACalcSadSsd_sse2   ; 64-bit variant: per 16x16 block writes four SADs, sum, square sum and squared-difference sum
+%define         localsize               12      ; not referenced in this 64-bit body
+%define         cur_data                        arg1;r0
+%define         ref_data                        arg2;r1
+%define         iPicWidth                       arg3;r2
+%define         iPicHeight                      arg4;r3
+%define         iPicStride                      arg5;
+%define         psadframe                       arg6;
+%define         psad8x8                         arg7;
+%define         psum16x16                       arg8;
+%define         psqsum16x16                     arg9;
+%define         psqdiff16x16                    arg10
+  ; r12..r15 are saved here; r13..r15 plus r10/r11 are used as scratch below
+  push r12
+  push r13
+  push r14
+  push r15
+  %assign push_num 4
+  PUSH_XMM 10                   ; this body uses xmm0..xmm9
+
+%ifdef WIN64
+  mov r4,arg5                   ; iPicStride
+%endif
+  mov r14,arg7                  ; r14 = psad8x8 output cursor
+  SIGN_EXTENSION r2,r2d         ; iPicWidth
+  SIGN_EXTENSION r3,r3d         ; iPicHeight
+  SIGN_EXTENSION r4,r4d         ; iPicStride
+
+  mov        r13,r4
+  shr     r2,4   ; iPicWidth/16
+  shr     r3,4   ; iPicHeight/16
+  shl     r13,4   ; iPicStride*16
+  pxor    xmm0,   xmm0
+  pxor  xmm8, xmm8  ;framesad
+  pxor  xmm9, xmm9  ; scratch used to feed each block's SAD into xmm8
+sqdiff_height_loop:               ; one iteration per row of 16x16 blocks
+  ;mov            ecx,    dword [iPicWidth]
+  ;mov      r14,r2
+  push r2                         ; r2 doubles as the inner-loop counter, so keep the per-row block count on the stack
+  %assign push_num push_num +1    ; keeps the stack-based argN macros correct while r2 is pushed
+  mov             r10,    r0      ; remember the row start of cur_data
+  mov             r11,    r1      ; remember the row start of ref_data
+sqdiff_width_loop:                ; one iteration per 16x16 block
+  pxor    xmm7,   xmm7            ; hiQuad_loQuad pSad8x8
+  pxor    xmm6,   xmm6            ; pSum16x16
+  pxor    xmm5,   xmm5            ; sqsum_16x16  four dword
+  pxor    xmm4,   xmm4            ; sqdiff_16x16  four Dword
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+  movdqa  xmm1,           xmm7
+  movd    [r14],          xmm7    ; psad8x8[0] = dword 0 of xmm7
+  psrldq  xmm7,           8
+  paddd   xmm1,           xmm7    ; low dword = dword 0 + dword 2
+  movd    [r14+4],        xmm7    ; psad8x8[1] = original dword 2 of xmm7
+  movd    r15d,           xmm1
+  movd  xmm9, r15d                ; round trip through r15d leaves only the low dword set in xmm9
+  paddd xmm8,xmm9                 ; frame SAD += both SADs of the first group
+
+
+  pxor    xmm7,   xmm7            ; restart the SAD accumulator for the second group of eight invocations
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+  movdqa  xmm1,           xmm7
+  movd    [r14+8],        xmm7    ; psad8x8[2] = dword 0 of xmm7
+  psrldq  xmm7,           8
+  paddd   xmm1,           xmm7    ; low dword = dword 0 + dword 2
+  movd    [r14+12],       xmm7    ; psad8x8[3] = original dword 2 of xmm7
+  movd    r15d,           xmm1
+  movd  xmm9, r15d                ; round trip through r15d leaves only the low dword set in xmm9
+  paddd xmm8,xmm9                 ; frame SAD += both SADs of the second group
+
+  mov             r15,    psum16x16       ; the output cursor is a 64-bit pointer kept in its argument slot
+  movdqa  xmm1,   xmm6
+  psrldq  xmm1,   8
+  paddd   xmm6,   xmm1            ; low dword = dword 0 + dword 2 of the sum accumulator
+  movd    [r15],  xmm6            ; *psum16x16 = block sum
+  add             qword psum16x16, 4      ; 64-bit add: a dword add would lose the carry into the pointer's upper half
+
+  mov             r15,    psqsum16x16
+  pshufd  xmm6,   xmm5,   14 ;00001110b: brings dwords 2,3 down to positions 0,1
+  paddd   xmm6,   xmm5            ; dwords 0,1 = d0+d2, d1+d3
+  pshufd  xmm5,   xmm6,   1  ;00000001b: brings dword 1 down to position 0
+  paddd   xmm5,   xmm6            ; low dword = total of all four dwords
+  movd    [r15],  xmm5            ; *psqsum16x16 = block square sum
+  add             qword psqsum16x16, 4    ; 64-bit add, same reason as for psum16x16
+
+  mov             r15,    psqdiff16x16
+  pshufd  xmm5,   xmm4,   14      ; 00001110b: brings dwords 2,3 down to positions 0,1
+  paddd   xmm5,   xmm4            ; dwords 0,1 = d0+d2, d1+d3
+  pshufd  xmm4,   xmm5,   1       ; 00000001b: brings dword 1 down to position 0
+  paddd   xmm4,   xmm5            ; low dword = total of all four dwords
+  movd    [r15],  xmm4            ; *psqdiff16x16 = block squared-difference sum
+  add             qword   psqdiff16x16,   4       ; 64-bit add, same reason as for psum16x16
+
+  add             r14,16          ; four int32 SAD outputs were written
+  sub             r0,     r13     ; back up by 16 rows (presumably what the sixteen macro invocations advanced)
+  sub             r1,     r13
+  add             r0,     16      ; step right to the next 16-pixel column
+  add             r1,     16
+
+  dec             r2
+  jnz             sqdiff_width_loop
+
+  pop r2                          ; restore the per-row block count
+  %assign push_num push_num -1
+
+  mov             r0,     r10     ; back to the row start
+  mov             r1,     r11
+  add             r0,     r13     ; move down 16 rows
+  add             r1,     r13
+
+  dec     r3
+  jnz             sqdiff_height_loop
+
+  mov             r13,    psadframe       ; r13 (16*stride) is no longer needed, reuse it as the output pointer
+  movd    [r13],  xmm8            ; *psadframe = frame SAD
+
+  POP_XMM
+  pop r15
+  pop r14
+  pop r13
+  pop r12
+  %assign push_num 0
+
+%undef          cur_data
+%undef          ref_data
+%undef          iPicWidth
+%undef          iPicHeight
+%undef          iPicStride
+%undef          psadframe
+%undef          psad8x8
+%undef          psum16x16
+%undef          psqsum16x16
+%undef          psqdiff16x16
+%undef          tmp_esi
+%undef          tmp_edi
+%undef          tmp_sadframe
+%undef          pushsize
+%undef          localsize
+  ret
+
+
+
+%endif
+
+%ifdef X86_32
+;*************************************************************************************************************
+;void VAACalcSadBgd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;                               int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8)
+;*************************************************************************************************************
+
+
+WELS_EXTERN VAACalcSadBgd_sse2
+%define         localsize               12
+%define         cur_data                        esp + pushsize + localsize + 4
+%define         ref_data                        esp + pushsize + localsize + 8
+%define         iPicWidth                       esp + pushsize + localsize + 12
+%define         iPicHeight                      esp + pushsize + localsize + 16
+%define         iPicStride                      esp + pushsize + localsize + 20
+%define         psadframe                       esp + pushsize + localsize + 24
+%define         psad8x8                         esp + pushsize + localsize + 28
+%define         p_sd8x8                         esp + pushsize + localsize + 32
+%define         p_mad8x8                        esp + pushsize + localsize + 36
+%define         tmp_esi                         esp + 0
+%define         tmp_edi                         esp + 4
+%define         tmp_ecx                         esp + 8
+%define         pushsize                16
+  push    ebp
+  push    esi
+  push    edi
+  push    ebx
+  sub             esp,    localsize
+  mov             esi,    [cur_data]
+  mov             edi,    [ref_data]
+  mov             ebx,    [iPicStride]
+  mov             eax,    ebx
+
+  shr             dword [iPicWidth],      4                                       ; iPicWidth/16
+  shr             dword [iPicHeight],     4                                       ; iPicHeight/16
+  shl             eax,    4                                                       ; iPicStride*16
+  xor             ebp,    ebp
+  pxor    xmm0,   xmm0
+bgd_height_loop:
+  mov             ecx,    dword [iPicWidth]
+  mov             [tmp_esi],      esi
+  mov             [tmp_edi],      edi
+bgd_width_loop:
+  pxor    xmm7,   xmm7            ; pSad8x8
+  pxor    xmm6,   xmm6            ; sum_cur_8x8
+  pxor    xmm5,   xmm5            ; sum_ref_8x8
+  pxor    xmm4,   xmm4            ; pMad8x8
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+
+
+  mov                     edx,            [p_mad8x8]
+  WELS_MAX_REG_SSE2       xmm4
+
+  ;movdqa         xmm1,   xmm4
+  ;punpcklbw      xmm1,   xmm0
+  ;punpcklwd      xmm1,   xmm0
+  ;movd           [edx],  xmm1
+  ;punpckhbw      xmm4,   xmm0
+  ;punpcklwd      xmm4,   xmm0
+  ;movd           [edx+4],        xmm4
+  ;add                    edx,            8
+  ;mov                    [p_mad8x8],     edx
+  mov                     [tmp_ecx],      ecx
+  movhlps         xmm1,   xmm4
+  movd            ecx,    xmm4
+  mov                     [edx],  cl
+  movd            ecx,    xmm1
+  mov                     [edx+1],cl
+  add                     edx,    2
+  mov                     [p_mad8x8],     edx
+
+
+  pslldq          xmm7,   4
+  pslldq          xmm6,   4
+  pslldq          xmm5,   4
+
+
+  pxor    xmm4,   xmm4            ; pMad8x8
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+
+  mov                     edx,            [p_mad8x8]
+  WELS_MAX_REG_SSE2       xmm4
+
+  ;movdqa         xmm1,   xmm4
+  ;punpcklbw      xmm1,   xmm0
+  ;punpcklwd      xmm1,   xmm0
+  ;movd           [edx],  xmm1
+  ;punpckhbw      xmm4,   xmm0
+  ;punpcklwd      xmm4,   xmm0
+  ;movd           [edx+4],        xmm4
+  ;add                    edx,            8
+  ;mov                    [p_mad8x8],     edx
+  movhlps         xmm1,   xmm4
+  movd            ecx,    xmm4
+  mov                     [edx],  cl
+  movd            ecx,    xmm1
+  mov                     [edx+1],cl
+  add                     edx,    2
+  mov                     [p_mad8x8],     edx
+
+  ; data in xmm7, xmm6, xmm5:  D1 D3 D0 D2
+
+  mov             edx,    [psad8x8]
+  pshufd  xmm1,   xmm7,   10001101b               ; D3 D2 D1 D0
+  movdqa  [edx],  xmm1
+  add             edx,    16
+  mov             [psad8x8],      edx                                     ; sad8x8
+
+  paddd   xmm1,   xmm7                                    ; D1+3 D3+2 D0+1 D2+0
+  pshufd  xmm2,   xmm1,   00000011b
+  paddd   xmm1,   xmm2
+  movd    edx,    xmm1
+  add             ebp,    edx                                             ; sad frame
+
+  mov             edx,    [p_sd8x8]
+  psubd   xmm6,   xmm5
+  pshufd  xmm1,   xmm6,   10001101b
+  movdqa  [edx],  xmm1
+  add             edx,    16
+  mov             [p_sd8x8],      edx
+
+
+  add             edx,    16
+  sub             esi,    eax
+  sub             edi,    eax
+  add             esi,    16
+  add             edi,    16
+
+  mov             ecx,    [tmp_ecx]
+  dec             ecx
+  jnz             bgd_width_loop
+
+  mov             esi,    [tmp_esi]
+  mov             edi,    [tmp_edi]
+  add             esi,    eax
+  add             edi,    eax
+
+  dec             dword [iPicHeight]
+  jnz             bgd_height_loop
+
+  mov             edx,    [psadframe]
+  mov             [edx],  ebp
+
+  add             esp,    localsize
+  pop             ebx
+  pop             edi
+  pop             esi
+  pop             ebp
+%undef          cur_data
+%undef          ref_data
+%undef          iPicWidth
+%undef          iPicHeight
+%undef          iPicStride
+%undef          psadframe
+%undef          psad8x8
+%undef          p_sd8x8
+%undef          p_mad8x8
+%undef          tmp_esi
+%undef          tmp_edi
+%undef          pushsize
+%undef          localsize
+  ret
+
+
+
+;*************************************************************************************************************
+;void VAACalcSadSsdBgd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;                int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16,
+;                       int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8)
+;
+; x86-32 variant: every argument is read from the stack through the esp-relative %defines below.
+; The picture is walked in 16x16 blocks (iPicWidth/16 per row, iPicHeight/16 rows); each block is
+; handled as two passes of eight 16-pixel rows.  Per 16x16 block the routine writes
+;   psad8x8, p_sd8x8                     : 4 dwords each (two per pass)
+;   p_mad8x8                             : 4 bytes       (two per pass)
+;   psum16x16, psqsum16x16, psqdiff16x16 : 1 dword each
+; and, after the last block, stores the SAD accumulated over the whole picture to *psadframe.
+; The output pointers are advanced in place in their argument slots and the iPicWidth/iPicHeight slots are
+; overwritten with block counts.  ebp/esi/edi/ebx are saved and restored; eax, ecx, edx and xmm registers are clobbered.
+;*************************************************************************************************************
+
+WELS_EXTERN VAACalcSadSsdBgd_sse2
+%define         localsize               16
+%define         cur_data                esp + pushsize + localsize + 4
+%define         ref_data                esp + pushsize + localsize + 8
+%define         iPicWidth               esp + pushsize + localsize + 12
+%define         iPicHeight              esp + pushsize + localsize + 16
+%define         iPicStride              esp + pushsize + localsize + 20
+%define         psadframe               esp + pushsize + localsize + 24
+%define         psad8x8                 esp + pushsize + localsize + 28
+%define         psum16x16               esp + pushsize + localsize + 32
+%define         psqsum16x16             esp + pushsize + localsize + 36
+%define         psqdiff16x16            esp + pushsize + localsize + 40
+%define         p_sd8x8                 esp + pushsize + localsize + 44
+%define         p_mad8x8                esp + pushsize + localsize + 48
+; Locals: tmp_esi/tmp_edi = start of the current block row, tmp_sadframe = running picture SAD, tmp_ecx = parked width counter.
+%define         tmp_esi                 esp + 0
+%define         tmp_edi                 esp + 4
+%define         tmp_sadframe            esp + 8
+%define         tmp_ecx                 esp + 12
+%define         pushsize                16
+  push    ebp
+  push    esi
+  push    edi
+  push    ebx
+  sub     esp,    localsize
+  mov     esi,    [cur_data]
+  mov     edi,    [ref_data]
+  mov     ebx,    [iPicStride]
+  mov     eax,    ebx
+
+  shr     dword [iPicWidth],      4       ; iPicWidth/16  = 16x16 blocks per row
+  shr     dword [iPicHeight],     4       ; iPicHeight/16 = rows of 16x16 blocks
+  shl     eax,    4                       ; eax = iPicStride*16, the byte distance between block rows
+  pxor    xmm0,   xmm0
+  movd    [tmp_sadframe], xmm0            ; running picture SAD starts at 0
+sqdiff_bgd_height_loop:
+  mov     ecx,    dword [iPicWidth]       ; ecx = blocks left in this row
+  mov     [tmp_esi],      esi
+  mov     [tmp_edi],      edi
+sqdiff_bgd_width_loop:
+  pxor    xmm7,   xmm7            ; pSad8x8 interleaves sqsum16x16:  sqsum1 sad1 sqsum0 sad0
+  pxor    xmm6,   xmm6            ; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
+  pxor    xmm5,   xmm5            ; pMad8x8
+  pxor    xmm4,   xmm4            ; sqdiff_16x16  four Dword
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+
+  mov     edx,            [psad8x8]
+  movdqa  xmm2,           xmm7
+  pshufd  xmm1,           xmm2,           00001110b       ; dword0 = sad1
+  movd    [edx],          xmm2                            ; sad0
+  movd    [edx+4],        xmm1                            ; sad1
+  add     edx,            8
+  mov     [psad8x8],      edx                     ; sad8x8
+
+  paddd   xmm1,           xmm2                    ; dword0 = sad0 + sad1
+  movd    edx,            xmm1
+  add     [tmp_sadframe], edx                     ; iFrameSad
+
+  mov     edx,            [psum16x16]
+  movdqa  xmm1,           xmm6
+  pshufd  xmm2,           xmm1,           00001110b
+  paddd   xmm1,           xmm2                    ; dword0 = Scur0 + Scur1
+  movd    [edx],          xmm1                    ; sum of the first pass; the second pass adds to it
+
+  mov     edx,            [p_sd8x8]
+  pshufd  xmm1,           xmm6,           11110101b                       ; Sref1 Sref1 Sref0 Sref0
+  psubd   xmm6,           xmm1            ; 00 diff1 00 diff0
+  pshufd  xmm1,           xmm6,           00001000b                       ;  xx xx diff1 diff0
+  movq    [edx],          xmm1
+  add     edx,            8
+  mov     [p_sd8x8],      edx
+
+  mov     edx,            [p_mad8x8]
+  WELS_MAX_REG_SSE2       xmm5
+  ; Only the low byte of each 64-bit half of xmm5 is stored: one MAD byte per 8x8 block
+  ; (presumably WELS_MAX_REG_SSE2 leaves the per-half maximum there; the macro is defined elsewhere).
+  ; ecx doubles as the byte-store scratch, so the width counter is parked in tmp_ecx first.
+  mov     [tmp_ecx],      ecx
+  movhlps xmm1,   xmm5
+  movd    ecx,    xmm5
+  mov     [edx],  cl
+  movd    ecx,    xmm1
+  mov     [edx+1],cl
+  add     edx,    2
+  mov     [p_mad8x8],     edx
+
+  psrlq   xmm7,   32
+  psllq   xmm7,   32                      ; clear sad
+  pxor    xmm6,   xmm6            ; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
+  pxor    xmm5,   xmm5            ; pMad8x8
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+
+  mov     edx,            [psad8x8]
+  movdqa  xmm2,           xmm7
+  pshufd  xmm1,           xmm2,           00001110b
+  movd    [edx],          xmm2
+  movd    [edx+4],        xmm1
+  add     edx,            8
+  mov     [psad8x8],      edx                     ; sad8x8
+
+  paddd   xmm1,           xmm2
+  movd    edx,            xmm1
+  add     [tmp_sadframe], edx                     ; iFrameSad
+
+  mov     edx,            [psum16x16]
+  movdqa  xmm1,           xmm6
+  pshufd  xmm2,           xmm1,           00001110b
+  paddd   xmm1,           xmm2
+  movd    ebp,            xmm1                    ; sum
+  add     [edx],          ebp                     ; complete the 16x16 sum started by the first pass
+  add     edx,            4
+  mov     [psum16x16],    edx
+
+  mov     edx,            [psqsum16x16]
+  psrlq   xmm7,           32                      ; drop the sad dwords, keep sqsum0 / sqsum1
+  pshufd  xmm2,           xmm7,           00001110b
+  paddd   xmm2,           xmm7
+  movd    [edx],          xmm2                    ; sqsum
+  add     edx,            4
+  mov     [psqsum16x16],  edx
+
+  mov     edx,            [p_sd8x8]
+  pshufd  xmm1,           xmm6,           11110101b                       ; Sref1 Sref1 Sref0 Sref0
+  psubd   xmm6,           xmm1            ; 00 diff1 00 diff0
+  pshufd  xmm1,           xmm6,           00001000b                       ;  xx xx diff1 diff0
+  movq    [edx],          xmm1
+  add     edx,            8
+  mov     [p_sd8x8],      edx
+
+  mov     edx,            [p_mad8x8]
+  WELS_MAX_REG_SSE2       xmm5
+  ; Second pass: same two-byte store.  ecx is still parked in tmp_ecx from the first pass
+  ; and is reloaded just before the loop test.
+  movhlps xmm1,   xmm5
+  movd    ecx,    xmm5
+  mov     [edx],  cl
+  movd    ecx,    xmm1
+  mov     [edx+1],cl
+  add     edx,    2
+  mov     [p_mad8x8],     edx
+
+  mov     edx,            [psqdiff16x16]
+  pshufd  xmm1,           xmm4,           00001110b
+  paddd   xmm4,           xmm1
+  pshufd  xmm1,           xmm4,           00000001b
+  paddd   xmm4,           xmm1                    ; dword0 = sum of the four sqdiff dwords
+  movd    [edx],          xmm4
+  add     edx,            4
+  mov     [psqdiff16x16], edx
+
+  ; esi/edi are expected to sit 16 rows below the block start here (each macro call is passed the
+  ; stride in ebx): step back one block row (eax = iPicStride*16) and 16 pixels to the right.
+  sub     esi,    eax
+  sub     edi,    eax
+  add     esi,    16
+  add     edi,    16
+
+  mov     ecx,    [tmp_ecx]
+  dec     ecx
+  jnz     sqdiff_bgd_width_loop
+
+  mov     esi,    [tmp_esi]
+  mov     edi,    [tmp_edi]
+  add     esi,    eax
+  add     edi,    eax
+
+  dec     dword [iPicHeight]
+  jnz     sqdiff_bgd_height_loop
+
+  mov     edx,    [psadframe]
+  mov     ebp,    [tmp_sadframe]
+  mov     [edx],  ebp
+
+  add     esp,    localsize
+  pop     ebx
+  pop     edi
+  pop     esi
+  pop     ebp
+%undef          cur_data
+%undef          ref_data
+%undef          iPicWidth
+%undef          iPicHeight
+%undef          iPicStride
+%undef          psadframe
+%undef          psad8x8
+%undef          psum16x16
+%undef          psqsum16x16
+%undef          psqdiff16x16
+%undef          p_sd8x8
+%undef          p_mad8x8
+%undef          tmp_esi
+%undef          tmp_edi
+%undef          tmp_sadframe
+%undef          tmp_ecx
+%undef          pushsize
+%undef          localsize
+   ret
+%else
+
+;*************************************************************************************************************
+;void VAACalcSadBgd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;                               int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8)
+;
+; x86-64 variant; argN, rN, PUSH_XMM, POP_XMM and SIGN_EXTENSION are defined outside this function.
+; The picture is walked in 16x16 blocks (iPicWidth/16 per row, iPicHeight/16 rows); each block is handled
+; as two passes of eight 16-pixel rows.  Per 16x16 block the routine writes 4 dwords to psad8x8, 4 dwords
+; to p_sd8x8 (both with movdqa, so both buffers must be 16-byte aligned) and 4 bytes to p_mad8x8; after the
+; last block it stores the SAD accumulated over the whole picture to *psadframe.
+; Register roles: r15 = cur_data cursor (r0 is reused as byte-store scratch), r1 = ref_data cursor,
+; r2/r3 = block column/row counters, r4 = iPicStride, r13 = iPicStride*16, r10/r11 = start of the current
+; block row in cur/ref, r14 = output-pointer scratch, xmm8 = running picture SAD.
+;*************************************************************************************************************
+
+WELS_EXTERN VAACalcSadBgd_sse2
+%define         cur_data                        arg1
+%define         ref_data                        arg2
+%define         iPicWidth                       arg3
+%define         iPicHeight                      arg4
+%define         iPicStride                      arg5
+%define         psadframe                       arg6
+%define         psad8x8                         arg7
+%define         p_sd8x8                         arg8
+%define         p_mad8x8                        arg9
+
+  ; r12 is saved and restored although this body never references it.
+  push r12
+  push r13
+  push r14
+  push r15
+%assign push_num 4
+  PUSH_XMM 10
+%ifdef WIN64
+  ; Win64 passes only four arguments in registers, so iPicStride is fetched from the stack.
+  mov r4,arg5
+%endif
+  SIGN_EXTENSION r2,r2d
+  SIGN_EXTENSION r3,r3d
+  SIGN_EXTENSION r4,r4d
+
+  mov     r13,r4
+  mov     r15,r0                  ; cur_data cursor lives in r15 so that r0 can serve as scratch
+  shr     r2,4                    ; iPicWidth/16  = 16x16 blocks per row
+  shr     r3,4                    ; iPicHeight/16 = rows of 16x16 blocks
+  shl     r13,4                   ; r13 = iPicStride*16, the byte distance between block rows
+  pxor    xmm0,   xmm0
+  pxor    xmm8,   xmm8            ; running picture SAD
+
+bgd_height_loop:
+  ; The inner loop counts r2 down to zero, so the per-row block count is kept on the stack;
+  ; push_num follows the push so that stack-based argN references keep addressing the right slots.
+  push r2
+  %assign push_num push_num+1
+  mov             r10,    r15
+  mov             r11,    r1
+bgd_width_loop:
+  pxor    xmm7,   xmm7            ; pSad8x8
+  pxor    xmm6,   xmm6            ; sum_cur_8x8
+  pxor    xmm5,   xmm5            ; sum_ref_8x8
+  pxor    xmm4,   xmm4            ; pMad8x8
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+
+  mov                     r14,            p_mad8x8
+  WELS_MAX_REG_SSE2       xmm4
+  ; Only the low byte of each 64-bit half of xmm4 is stored: one MAD byte per 8x8 block
+  ; (presumably WELS_MAX_REG_SSE2 leaves the per-half maximum there; the macro is defined elsewhere).
+  movhlps         xmm1,   xmm4
+  movd            r0d,    xmm4
+  mov                     [r14],  r0b
+  movd            r0d,    xmm1
+  mov                     [r14+1],r0b
+  add                     r14,    2       ; r14 carries the advanced pointer through the second pass
+
+  ; Move the first-pass totals up one dword so the second pass accumulates next to them.
+  pslldq          xmm7,   4
+  pslldq          xmm6,   4
+  pslldq          xmm5,   4
+
+  pxor    xmm4,   xmm4            ; pMad8x8
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+
+  ; Second pass: r14 still points at the next free p_mad8x8 byte.
+  WELS_MAX_REG_SSE2       xmm4
+  movhlps         xmm1,   xmm4
+  movd            r0d,    xmm4
+  mov                     [r14],  r0b
+  movd            r0d,    xmm1
+  mov                     [r14+1],r0b
+  add                     r14,    2
+  mov                     p_mad8x8,       r14
+
+  ; data in xmm7, xmm6, xmm5:  D1 D3 D0 D2
+
+  ; The four 8x8 SADs of this block are reordered to D0..D3 and written with one aligned store.
+  mov             r14,    psad8x8
+  pshufd  xmm1,   xmm7,   10001101b               ; D3 D2 D1 D0
+  movdqa  [r14],  xmm1
+  add             r14,    16
+  mov             psad8x8,        r14                                     ; sad8x8
+
+  ; movd zero-extends into xmm9, so only the block total in dword 0 is added to xmm8.
+  paddd   xmm1,   xmm7                                    ; D1+3 D3+2 D0+1 D2+0
+  pshufd  xmm2,   xmm1,   00000011b
+  paddd   xmm1,   xmm2
+  movd    r14d,   xmm1
+  movd    xmm9, r14d
+  paddd   xmm8,   xmm9                                            ; sad frame
+
+  mov             r14,    p_sd8x8
+  psubd   xmm6,   xmm5                            ; sum_cur - sum_ref per 8x8 block
+  pshufd  xmm1,   xmm6,   10001101b
+  movdqa  [r14],  xmm1
+  add             r14,    16
+  mov             p_sd8x8,        r14
+
+  ; r15/r1 are expected to sit 16 rows below the block start here (each macro call is passed the
+  ; stride in r4): step back one block row (r13 = iPicStride*16) and 16 pixels to the right.
+  sub             r15,    r13
+  sub             r1,     r13
+  add             r15,    16
+  add             r1,     16
+
+  dec             r2
+  jnz             bgd_width_loop
+  pop     r2
+%assign push_num push_num-1
+  ; Next block row: restart from the saved row start and move down 16 picture rows.
+  mov             r15,    r10
+  mov             r1,     r11
+  add             r15,    r13
+  add             r1,     r13
+
+  dec             r3
+  jnz             bgd_height_loop
+
+  mov             r13,    psadframe
+  movd    [r13],  xmm8                    ; *psadframe = SAD of the whole picture
+
+  POP_XMM
+  pop r15
+  pop r14
+  pop r13
+  pop r12
+%assign push_num 0
+%undef          cur_data
+%undef          ref_data
+%undef          iPicWidth
+%undef          iPicHeight
+%undef          iPicStride
+%undef          psadframe
+%undef          psad8x8
+%undef          p_sd8x8
+%undef          p_mad8x8
+  ret
+
+
+
+;*************************************************************************************************************
+;void VAACalcSadSsdBgd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;                int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16,
+;                       int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8)
+;
+; x86-64 variant; argN, rN, PUSH_XMM, POP_XMM and SIGN_EXTENSION are defined outside this function.
+; The picture is walked in 16x16 blocks (iPicWidth/16 per row, iPicHeight/16 rows); each block is handled
+; as two passes of eight 16-pixel rows.  Per 16x16 block the routine writes
+;   psad8x8, p_sd8x8                     : 4 dwords each (two per pass)
+;   p_mad8x8                             : 4 bytes       (two per pass)
+;   psum16x16, psqsum16x16, psqdiff16x16 : 1 dword each
+; and, after the last block, stores the SAD accumulated over the whole picture to *psadframe.
+; Register roles: r0/r1 = cur/ref cursors, r2/r3 = block column/row counters, r4 = iPicStride, r13 = iPicStride*16,
+; r10/r11 = start of the current block row in cur/ref, r14 = output-pointer scratch, r15 = data scratch, xmm8 = running SAD.
+;*************************************************************************************************************
+
+WELS_EXTERN VAACalcSadSsdBgd_sse2
+%define         cur_data                        arg1
+%define         ref_data                        arg2
+%define         iPicWidth                       arg3
+%define         iPicHeight                      arg4
+%define         iPicStride                      arg5
+%define         psadframe                       arg6
+%define         psad8x8                         arg7
+%define         psum16x16                       arg8
+%define         psqsum16x16                     arg9
+%define         psqdiff16x16                    arg10
+%define         p_sd8x8                         arg11
+%define         p_mad8x8                        arg12
+
+  ; r12 is saved and restored although this body never references it.
+  push r12
+  push r13
+  push r14
+  push r15
+%assign push_num 4
+  PUSH_XMM 10
+%ifdef WIN64
+  ; Win64 passes only four arguments in registers, so iPicStride is fetched from the stack.
+  mov r4,arg5
+%endif
+  SIGN_EXTENSION r2,r2d
+  SIGN_EXTENSION r3,r3d
+  SIGN_EXTENSION r4,r4d
+
+  mov     r13,r4
+  shr             r2,     4                                       ; iPicWidth/16
+  shr             r3,     4                                       ; iPicHeight/16
+  shl             r13,    4                                                       ; iPicStride*16
+  pxor    xmm0,   xmm0
+  pxor    xmm8,   xmm8                    ; running picture SAD
+sqdiff_bgd_height_loop:
+  mov             r10,    r0
+  mov             r11,    r1
+  ; The inner loop counts r2 down to zero; push_num follows the push so stack-based argN stay valid.
+  push r2
+%assign push_num push_num+1
+sqdiff_bgd_width_loop:
+  pxor    xmm7,   xmm7            ; pSad8x8 interleaves sqsum16x16:  sqsum1 sad1 sqsum0 sad0
+  pxor    xmm6,   xmm6            ; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
+  pxor    xmm5,   xmm5            ; pMad8x8
+  pxor    xmm4,   xmm4            ; sqdiff_16x16  four Dword
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+
+  mov             r14,            psad8x8
+  movdqa  xmm2,           xmm7
+  pshufd  xmm1,           xmm2,           00001110b       ; dword0 = sad1
+  movd    [r14],          xmm2                            ; sad0
+  movd    [r14+4],        xmm1                            ; sad1
+  add             r14,            8
+  mov             psad8x8,        r14                     ; sad8x8
+
+  paddd   xmm1,                           xmm2            ; dword0 = sad0 + sad1
+  movd    r14d,                           xmm1
+  movd    xmm9,r14d                                       ; zero-extended copy of the pass total
+  paddd           xmm8,           xmm9                    ; iFrameSad
+
+  mov             r14,            psum16x16
+  movdqa  xmm1,           xmm6
+  pshufd  xmm2,           xmm1,           00001110b
+  paddd   xmm1,           xmm2                            ; dword0 = Scur0 + Scur1
+  movd    [r14],          xmm1                            ; sum of the first pass; the second pass adds to it
+
+  mov             r14,            p_sd8x8
+  pshufd  xmm1,           xmm6,           11110101b                       ; Sref1 Sref1 Sref0 Sref0
+  psubd   xmm6,           xmm1            ; 00 diff1 00 diff0
+  pshufd  xmm1,           xmm6,           00001000b                       ;  xx xx diff1 diff0
+  movq    [r14],          xmm1
+  add             r14,            8
+  mov             p_sd8x8,        r14
+
+  mov                     r14,            p_mad8x8
+  WELS_MAX_REG_SSE2       xmm5
+  ; Only the low byte of each 64-bit half of xmm5 is stored (one MAD byte per 8x8 block).  r15 is the
+  ; scratch: it is saved in the prologue and holds no live value, so r0 needs no push/pop around the stores.
+  movhlps         xmm1,   xmm5
+  movd            r15d,   xmm5
+  mov                     [r14],  r15b
+  movd            r15d,   xmm1
+  mov                     [r14+1],r15b
+  add                     r14,    2
+  mov                     p_mad8x8,       r14
+
+  psrlq   xmm7,   32
+  psllq   xmm7,   32                      ; clear sad
+  pxor    xmm6,   xmm6            ; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
+  pxor    xmm5,   xmm5            ; pMad8x8
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+
+  mov             r14,            psad8x8
+  movdqa  xmm2,           xmm7
+  pshufd  xmm1,           xmm2,           00001110b
+  movd    [r14],          xmm2
+  movd    [r14+4],        xmm1
+  add             r14,            8
+  mov             psad8x8,        r14                     ; sad8x8
+
+  paddd   xmm1,                           xmm2
+  movd    r14d,                           xmm1
+  movd    xmm9, r14d
+  paddd   xmm8,           xmm9            ; iFrameSad
+
+  mov             r14,                    psum16x16
+  movdqa  xmm1,                   xmm6
+  pshufd  xmm2,                   xmm1,           00001110b
+  paddd   xmm1,                   xmm2
+  movd    r15d,                   xmm1                            ; sum
+  add             [r14],                  r15d                    ; complete the 16x16 sum started by the first pass
+  add             r14,                    4
+  mov             psum16x16,      r14
+
+  mov             r14,                    psqsum16x16
+  psrlq   xmm7,                   32                              ; drop the sad dwords, keep sqsum0 / sqsum1
+  pshufd  xmm2,                   xmm7,           00001110b
+  paddd   xmm2,                   xmm7
+  movd    [r14],                  xmm2                            ; sqsum
+  add             r14,                    4
+  mov             psqsum16x16,    r14
+
+  mov             r14,            p_sd8x8
+  pshufd  xmm1,           xmm6,           11110101b                       ; Sref1 Sref1 Sref0 Sref0
+  psubd   xmm6,           xmm1            ; 00 diff1 00 diff0
+  pshufd  xmm1,           xmm6,           00001000b                       ;  xx xx diff1 diff0
+  movq    [r14],          xmm1
+  add             r14,            8
+  mov             p_sd8x8,        r14
+
+  mov             r14,            p_mad8x8
+  WELS_MAX_REG_SSE2       xmm5
+  movhlps         xmm1,   xmm5
+  movd            r15d,   xmm5
+  mov                     [r14],  r15b
+  movd            r15d,   xmm1
+  mov                     [r14+1],r15b
+  add                     r14,    2
+  mov                     p_mad8x8,       r14
+
+  mov             r14,            psqdiff16x16
+  pshufd  xmm1,           xmm4,           00001110b
+  paddd   xmm4,           xmm1
+  pshufd  xmm1,           xmm4,           00000001b
+  paddd   xmm4,           xmm1                    ; dword0 = sum of the four sqdiff dwords
+  movd    [r14],          xmm4
+  add             r14,            4
+  mov             psqdiff16x16,   r14
+
+  ; r0/r1 are expected to sit 16 rows below the block start here (each macro call is passed the
+  ; stride in r4): step back one block row (r13 = iPicStride*16) and 16 pixels to the right.
+  sub             r0,     r13
+  sub             r1,     r13
+  add             r0,     16
+  add             r1,     16
+
+  dec             r2
+  jnz             sqdiff_bgd_width_loop
+  pop r2
+  %assign push_num push_num-1
+  ; Next block row: restart from the saved row start and move down 16 picture rows.
+  mov             r0,     r10
+  mov             r1,     r11
+  add             r0,     r13
+  add             r1,     r13
+
+  dec     r3
+  jnz             sqdiff_bgd_height_loop
+
+  mov             r14,    psadframe
+  movd    [r14],  xmm8                    ; *psadframe = SAD of the whole picture
+
+  POP_XMM
+  pop r15
+  pop r14
+  pop r13
+  pop r12
+%assign push_num 0
+%undef          cur_data
+%undef          ref_data
+%undef          iPicWidth
+%undef          iPicHeight
+%undef          iPicStride
+%undef          psadframe
+%undef          psad8x8
+%undef          psum16x16
+%undef          psqsum16x16
+%undef          psqdiff16x16
+%undef          p_sd8x8
+%undef          p_mad8x8
+  ret
+%endif
--- a/codec/processing/targets.mk
+++ b/codec/processing/targets.mk
@@ -21,9 +21,9 @@
 
 ifeq ($(ASM_ARCH), x86)
 PROCESSING_ASM_SRCS=\
-	$(PROCESSING_SRCDIR)/src/asm/denoisefilter.asm\
-	$(PROCESSING_SRCDIR)/src/asm/downsample_bilinear.asm\
-	$(PROCESSING_SRCDIR)/src/asm/vaa.asm\
+	$(PROCESSING_SRCDIR)/src/x86/denoisefilter.asm\
+	$(PROCESSING_SRCDIR)/src/x86/downsample_bilinear.asm\
+	$(PROCESSING_SRCDIR)/src/x86/vaa.asm\
 
 PROCESSING_OBJS += $(PROCESSING_ASM_SRCS:.asm=.$(OBJ))
 endif