shithub: openh264

ref: 9230b497282ada7649b2ac4cfd51a7b5d9e2f75d
parent: 301b06ad363e964daccaeb823e88de528a55c6a6
parent: 65b339815efb168f48e256231299a932f397fb51
author: Ethan Hugg <[email protected]>
date: Mon Jan 13 16:21:17 EST 2014

Merge pull request #97 from mstorsjo/asm-source-cleanup

Make all asm sources consistently use unix newlines and remove trailing whitespace
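
The change itself is mechanical, so it is the kind of cleanup that is usually scripted rather than edited by hand. As a minimal sketch (assuming plain-text sources; this is an illustration, not the tool actually used for this pull request), a few lines of Python can normalize a file to Unix (LF) newlines and strip trailing spaces and tabs:

#!/usr/bin/env python3
# Minimal sketch: rewrite each named file with LF-only newlines and no
# trailing whitespace. Illustrative only; not the script used for this PR.
import sys

def clean(path):
    with open(path, 'rb') as f:      # binary read keeps CRLF/CR bytes intact
        data = f.read()
    # Normalize CRLF and lone CR endings to LF.
    data = data.replace(b'\r\n', b'\n').replace(b'\r', b'\n')
    # Strip trailing spaces and tabs from every line.
    lines = [line.rstrip(b' \t') for line in data.split(b'\n')]
    with open(path, 'wb') as f:      # binary write avoids newline re-translation
        f.write(b'\n'.join(lines))

if __name__ == '__main__':
    for p in sys.argv[1:]:
        clean(p)

Invoked as, say, python3 clean.py codec/common/*.asm, this reproduces the class of change shown in the diff below: the instructions are untouched, and only line endings and trailing whitespace differ.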

--- a/codec/common/asm_inc.asm
+++ b/codec/common/asm_inc.asm
@@ -154,7 +154,7 @@
 %define  PUSHRFLAGS     pushfq
 %define  POPRFLAGS      popfq
 %define  retrq          rax
-%define  retrd          eax 
+%define  retrd          eax
 
 %elifdef X86_32 ; X86_32 ;************************************
 
@@ -233,7 +233,7 @@
 %macro LOAD_4_PARA 0
     %ifdef X86_32
         push r3
-        %assign  push_num push_num+1	
+        %assign  push_num push_num+1
         mov r0, [esp + push_num*4 + 4]
         mov r1, [esp + push_num*4 + 8]
         mov r2, [esp + push_num*4 + 12]
@@ -245,7 +245,7 @@
     %ifdef X86_32
         push r3
         push r4
-        %assign  push_num push_num+2	
+        %assign  push_num push_num+2
         mov r0, [esp + push_num*4 + 4]
         mov r1, [esp + push_num*4 + 8]
         mov r2, [esp + push_num*4 + 12]
@@ -261,7 +261,7 @@
 	push r3
         push r4
         push r5
-        %assign  push_num push_num+3	
+        %assign  push_num push_num+3
         mov r0, [esp + push_num*4 + 4]
         mov r1, [esp + push_num*4 + 8]
         mov r2, [esp + push_num*4 + 12]
@@ -280,7 +280,7 @@
         push r4
         push r5
         push r6
-        %assign  push_num push_num+4	
+        %assign  push_num push_num+4
         mov r0, [esp + push_num*4 + 4]
         mov r1, [esp + push_num*4 + 8]
         mov r2, [esp + push_num*4 + 12]
@@ -334,7 +334,7 @@
             movsx %1, %2
     %endif
 %endmacro
- 
+
 %macro WELS_EXTERN 1
     %ifdef PREFIX
         global _%1
--- a/codec/common/cpuid.asm
+++ b/codec/common/cpuid.asm
@@ -81,17 +81,17 @@
 %ifdef       WIN64
 
 WelsCPUId:
-    push     rbx        
-    push     rdx    
- 
+    push     rbx
+    push     rdx
+
     mov      eax,     ecx
     mov      rcx,     [r9]
-    cpuid  
+    cpuid
     mov      [r9],    ecx
     mov      [r8],    ebx
-    mov      rcx,    [rsp + 2*8 + 40]        
+    mov      rcx,    [rsp + 2*8 + 40]
     mov      [rcx],   edx
-    pop      rdx 
+    pop      rdx
     mov      [rdx],   eax
 
     pop      rbx
@@ -103,8 +103,8 @@
     push     rcx
     push     rdx
 
-    mov      eax,     edi 
-    mov      rcx,     [rcx]   
+    mov      eax,     edi
+    mov      rcx,     [rcx]
     cpuid
     mov      [r8],    edx
     pop      rdx
@@ -156,9 +156,9 @@
 %elifdef   UNIX64
         mov eax, edi
         mov ecx, esi
-%else 
+%else
         mov eax, [esp+4]
-        mov ecx, [esp+8]  
+        mov ecx, [esp+8]
 %endif
 
         ; refer to detection of AVX addressed in INTEL AVX manual document
--- a/codec/common/deblock.asm
+++ b/codec/common/deblock.asm
@@ -57,264 +57,264 @@
 
 SECTION .text
 
-%ifdef  WIN64 
+%ifdef  WIN64
 
 
 WELS_EXTERN   DeblockLumaLt4V_sse2
 
 DeblockLumaLt4V_sse2:
-  push        rbp      
+  push        rbp
   mov         r11,[rsp + 16 + 20h]  ; pTC
-  sub         rsp,1B0h                                                       
-  lea         rbp,[rsp+20h]                                                  
-  movd        xmm4,r8d                                                                                                  
-  movd        xmm2,r9d                                                       
-  mov         qword [rbp+180h],r12                                       
-  mov         r10,rcx                                                        
-  movsxd      r12,edx                                                        
-  add         edx,edx                                                        
-  movsxd      rdx,edx                                                        
-  sub         r10,r12                                                        
-  movsx       r8d,byte [r11]                                             
-  pxor        xmm3,xmm3                                                      
-  punpcklwd   xmm2,xmm2                                                      
-  movaps      [rbp+50h],xmm14                                    
-  lea         rax,[r12+r12*2]                                                
-  movdqa      xmm14,[rdx+rcx]                                    
-  neg         rax                                                            
-  pshufd      xmm0,xmm2,0                                                    
-  movd        xmm2,r8d                                                       
-  movsx       edx,byte [r11+1]                                           
-  movsx       r8d,byte [r11+2]                                           
-  movsx       r11d,byte [r11+3]                                          
-  movaps      [rbp+70h],xmm12                                    
-  movd        xmm1,edx                                                       
-  movaps      [rbp+80h],xmm11                                    
-  movd        xmm12,r8d                                                      
-  movd        xmm11,r11d                                                     
-  movdqa      xmm5, [rax+rcx]                                     
-  lea         rax,[r12+r12]                                                  
-  punpcklwd   xmm12,xmm12                                                    
-  neg         rax                                                            
-  punpcklwd   xmm11,xmm11                                                    
-  movaps      [rbp],xmm8                                         
-  movdqa      xmm8, [r10]                                         
-  punpcklwd   xmm2,xmm2                                                      
-  punpcklwd   xmm1,xmm1                                                      
-  punpcklqdq  xmm12,xmm12                                                    
-  punpcklqdq  xmm11,xmm11                                                    
-  punpcklqdq  xmm2,xmm2                                                      
-  punpcklqdq  xmm1,xmm1                                                      
-  shufps      xmm12,xmm11,88h                                                
-  movdqa      xmm11,xmm8                                                     
-  movaps      [rbp+30h],xmm9                                     
-  movdqa      xmm9,[rcx]                                         
-  shufps      xmm2,xmm1,88h                                                  
-  movdqa      xmm1,xmm5                                                      
-  punpcklbw   xmm11,xmm3                                                     
-  movaps      [rbp+20h],xmm6                                     
-  movaps      [rbp+60h],xmm13                                    
-  movdqa      xmm13,xmm11                                                    
-  movaps      [rbp+90h],xmm10                                    
-  movdqa      xmm10,xmm9                                                     
-  movdqa      xmm6,[rax+rcx]                                     
-  punpcklbw   xmm1,xmm3                                                      
-  movaps      [rbp+0A0h],xmm12                                   
-  psubw       xmm13,xmm1                                                     
-  movaps      [rbp+40h],xmm15                                    
-  movdqa      xmm15,xmm14                                                    
-  movaps      [rbp+10h],xmm7                                     
-  movdqa      xmm7,xmm6                                                      
-  punpcklbw   xmm10,xmm3                                                     
-  movdqa      xmm12,[r12+rcx]                                    
-  punpcklbw   xmm7,xmm3                                                      
-  punpcklbw   xmm12,xmm3                                                     
-  punpcklbw   xmm15,xmm3                                                     
-  pabsw       xmm3,xmm13                                                     
-  movdqa      xmm13,xmm10                                                    
-  psubw       xmm13,xmm15                                                    
-  movdqa      [rbp+0F0h],xmm15                                   
-  pabsw       xmm15,xmm13                                                    
-  movdqa      xmm13,xmm11                                                    
-  movdqa      [rbp+0B0h],xmm1                                    
-  movdqa      xmm1,xmm0                                                      
-  pavgw       xmm13,xmm10                                                    
-  pcmpgtw     xmm1,xmm3                                                      
-  movdqa      [rbp+120h],xmm13                                   
-  movaps      xmm13,xmm2                                                     
-  punpcklwd   xmm4,xmm4                                                      
-  movdqa      xmm3,xmm0                                                      
-  movdqa      [rbp+100h],xmm1                                    
-  psubw       xmm13,xmm1                                                     
-  movdqa      xmm1,xmm10                                                     
-  pcmpgtw     xmm3,xmm15                                                     
-  pshufd      xmm4,xmm4,0                                                    
-  psubw       xmm1,xmm11                                                     
-  movdqa      [rbp+0D0h],xmm10                                   
-  psubw       xmm13,xmm3                                                     
-  movdqa      [rbp+110h],xmm3                                    
-  pabsw       xmm15,xmm1                                                     
-  movdqa      xmm3,xmm4                                                      
-  psubw       xmm10,xmm12                                                    
-  pcmpgtw     xmm3,xmm15                                                     
-  pabsw       xmm15,xmm10                                                    
-  movdqa      xmm10,xmm0                                                     
-  psllw       xmm1,2                                                         
-  movdqa      [rbp+0C0h],xmm11                                   
-  psubw       xmm11,xmm7                                                     
-  pcmpgtw     xmm10,xmm15                                                    
-  pabsw       xmm11,xmm11                                                    
-  movdqa      xmm15,xmm0                                                     
-  pand        xmm3,xmm10                                                     
-  pcmpgtw     xmm15,xmm11                                                    
-  movaps      xmm11,xmm2                                                     
-  pxor        xmm10,xmm10                                                    
-  pand        xmm3,xmm15                                                     
-  pcmpgtw     xmm11,xmm10                                                    
-  pcmpeqw     xmm10,xmm2                                                     
-  por         xmm11,xmm10                                                    
-  pand        xmm3,xmm11                                                     
-  movdqa      xmm11,xmm7                                                     
-  psubw       xmm11,xmm12                                                    
-  pxor        xmm15,xmm15                                                    
-  paddw       xmm11,xmm1                                                     
-  psubw       xmm15,xmm13                                                    
-  movdqa      [rbp+0E0h],xmm12                                   
-  paddw       xmm11,[FOUR_16B_SSE2] 
-  pxor        xmm12,xmm12                                                    
-  psraw       xmm11,3                                                        
-  punpckhbw   xmm8,xmm12                                                     
-  pmaxsw      xmm15,xmm11                                                    
-  punpckhbw   xmm5,xmm12                                                     
-  movdqa      xmm11,xmm8                                                     
-  pminsw      xmm13,xmm15                                                    
-  psubw       xmm11,xmm5                                                     
-  punpckhbw   xmm9,xmm12                                                     
-  pand        xmm13,xmm3                                                     
-  movdqa      [rbp+130h],xmm13                                   
-  pabsw       xmm13,xmm11                                                    
-  punpckhbw   xmm14,xmm12                                                    
-  movdqa      xmm11,xmm9                                                     
-  psubw       xmm11,xmm14                                                    
-  movdqa      xmm15,xmm0                                                     
-  movdqa      [rbp+140h],xmm14                                   
-  pabsw       xmm14,xmm11                                                    
-  movdqa      xmm11,xmm8                                                     
-  pcmpgtw     xmm15,xmm14                                                    
-  movdqa      xmm1,[r12+rcx]                                     
-  pavgw       xmm11,xmm9                                                     
-  movdqa      [rbp+170h],xmm11                                   
-  movdqa      xmm10,xmm9                                                     
-  punpckhbw   xmm6,xmm12                                                     
-  psubw       xmm10,xmm8                                                     
-  punpckhbw   xmm1,xmm12                                                     
-  movdqa      xmm12,xmm0                                                     
-  movaps      xmm11,[rbp+0A0h]                                   
-  pcmpgtw     xmm12,xmm13                                                    
-  movaps      xmm13,xmm11                                                    
-  psubw       xmm13,xmm12                                                    
-  movdqa      [rbp+160h],xmm15                                   
-  psubw       xmm13,xmm15                                                    
-  movdqa      xmm15,xmm9                                                     
-  psubw       xmm15,xmm1                                                     
-  movdqa      [rbp+150h],xmm12                                   
-  pabsw       xmm12,xmm10                                                    
-  pabsw       xmm14,xmm15                                                    
-  movdqa      xmm15,xmm8                                                     
-  pcmpgtw     xmm4,xmm12                                                     
-  movdqa      xmm12,xmm0                                                     
-  psubw       xmm15,xmm6                                                     
-  pcmpgtw     xmm12,xmm14                                                    
-  pabsw       xmm14,xmm15                                                    
-  psllw       xmm10,2                                                        
-  pcmpgtw     xmm0,xmm14                                                     
-  movdqa      xmm14,xmm6                                                     
-  psubw       xmm14,xmm1                                                     
-  pand        xmm4,xmm12                                                     
-  paddw       xmm14,xmm10                                                    
-  pand        xmm4,xmm0                                                      
-  paddw       xmm14,[FOUR_16B_SSE2] 
-  pxor        xmm15,xmm15                                                    
-  movaps      xmm12,xmm11                                                    
-  psubw       xmm15,xmm13                                                    
-  pxor        xmm0,xmm0                                                      
-  psraw       xmm14,3                                                        
-  pcmpgtw     xmm12,xmm0                                                     
-  pcmpeqw     xmm0,xmm11                                                     
-  pmaxsw      xmm15,xmm14                                                    
-  por         xmm12,xmm0                                                     
-  movdqa      xmm0,[rbp+120h]                                    
-  pminsw      xmm13,xmm15                                                    
-  movdqa      xmm15,[rbp+0B0h]                                   
-  movdqa      xmm10,xmm7                                                     
-  pand        xmm4,xmm12                                                     
-  paddw       xmm15,xmm0                                                     
-  pxor        xmm12,xmm12                                                    
-  paddw       xmm10,xmm7                                                     
-  movdqa      xmm14,xmm12                                                    
-  psubw       xmm15,xmm10                                                    
-  psubw       xmm14,xmm2                                                     
-  psraw       xmm15,1                                                        
-  pmaxsw      xmm15,xmm14                                                    
-  movdqa      xmm10,xmm6                                                     
-  pminsw      xmm15,xmm2                                                     
-  paddw       xmm10,xmm6                                                     
-  pand        xmm15,xmm3                                                     
-  psubw       xmm12,xmm11                                                    
-  pand        xmm15,[rbp+100h]                                   
-  pand        xmm13,xmm4                                                     
-  paddw       xmm7,xmm15                                                     
-  paddw       xmm8,xmm13                                                     
-  movdqa      xmm15,[rbp+170h]                                   
-  psubw       xmm9,xmm13                                                     
-  paddw       xmm5,xmm15                                                     
-  psubw       xmm5,xmm10                                                     
-  psraw       xmm5,1                                                         
-  pmaxsw      xmm5,xmm12                                                     
-  pminsw      xmm5,xmm11                                                     
-  pand        xmm5,xmm4                                                      
-  pand        xmm5,[rbp+150h]                                    
-  paddw       xmm6,xmm5                                                      
-  movdqa      xmm5,[rbp+0C0h]                                    
-  packuswb    xmm7,xmm6                                                      
-  movdqa      xmm6,[rbp+130h]                                    
-  paddw       xmm5,xmm6                                                      
-  packuswb    xmm5,xmm8                                                      
-  movdqa      xmm8,[rbp+0D0h]                                    
-  psubw       xmm8,xmm6                                                      
-  movdqa      xmm6,[rbp+0F0h]                                    
-  paddw       xmm6,xmm0                                                      
-  movdqa      xmm0,[rbp+0E0h]                                    
-  packuswb    xmm8,xmm9                                                      
-  movdqa      xmm9,xmm0                                                      
-  paddw       xmm9,xmm0                                                      
-  psubw       xmm6,xmm9                                                      
-  psraw       xmm6,1                                                         
-  pmaxsw      xmm14,xmm6                                                     
-  pminsw      xmm2,xmm14                                                     
-  pand        xmm2,xmm3                                                      
-  pand        xmm2,[rbp+110h]                                    
-  paddw       xmm0,xmm2                                                      
-  movdqa      xmm2,[rbp+140h]                                    
-  paddw       xmm2,xmm15                                                     
-  movdqa      xmm15,xmm1                                                     
-  paddw       xmm15,xmm1                                                     
-  psubw       xmm2,xmm15                                                     
-  psraw       xmm2,1                                                         
-  pmaxsw      xmm12,xmm2                                                     
-  pminsw      xmm11,xmm12                                                    
-  pand        xmm11,xmm4                                                     
-  pand        xmm11,[rbp+160h]                                   
-  paddw       xmm1,xmm11                                                     
-  movdqa      [rax+rcx],xmm7                                     
-  movdqa      [r10],xmm5                                         
-  packuswb    xmm0,xmm1                                                      
-  movdqa      [rcx],xmm8                                         
-  movdqa      [r12+rcx],xmm0                                                                        
-  mov         r12,qword [rbp+180h]                                       
-  lea         rsp,[rbp+190h]                                                 
-  pop         rbp                                                            
-  ret                                                                        
+  sub         rsp,1B0h
+  lea         rbp,[rsp+20h]
+  movd        xmm4,r8d
+  movd        xmm2,r9d
+  mov         qword [rbp+180h],r12
+  mov         r10,rcx
+  movsxd      r12,edx
+  add         edx,edx
+  movsxd      rdx,edx
+  sub         r10,r12
+  movsx       r8d,byte [r11]
+  pxor        xmm3,xmm3
+  punpcklwd   xmm2,xmm2
+  movaps      [rbp+50h],xmm14
+  lea         rax,[r12+r12*2]
+  movdqa      xmm14,[rdx+rcx]
+  neg         rax
+  pshufd      xmm0,xmm2,0
+  movd        xmm2,r8d
+  movsx       edx,byte [r11+1]
+  movsx       r8d,byte [r11+2]
+  movsx       r11d,byte [r11+3]
+  movaps      [rbp+70h],xmm12
+  movd        xmm1,edx
+  movaps      [rbp+80h],xmm11
+  movd        xmm12,r8d
+  movd        xmm11,r11d
+  movdqa      xmm5, [rax+rcx]
+  lea         rax,[r12+r12]
+  punpcklwd   xmm12,xmm12
+  neg         rax
+  punpcklwd   xmm11,xmm11
+  movaps      [rbp],xmm8
+  movdqa      xmm8, [r10]
+  punpcklwd   xmm2,xmm2
+  punpcklwd   xmm1,xmm1
+  punpcklqdq  xmm12,xmm12
+  punpcklqdq  xmm11,xmm11
+  punpcklqdq  xmm2,xmm2
+  punpcklqdq  xmm1,xmm1
+  shufps      xmm12,xmm11,88h
+  movdqa      xmm11,xmm8
+  movaps      [rbp+30h],xmm9
+  movdqa      xmm9,[rcx]
+  shufps      xmm2,xmm1,88h
+  movdqa      xmm1,xmm5
+  punpcklbw   xmm11,xmm3
+  movaps      [rbp+20h],xmm6
+  movaps      [rbp+60h],xmm13
+  movdqa      xmm13,xmm11
+  movaps      [rbp+90h],xmm10
+  movdqa      xmm10,xmm9
+  movdqa      xmm6,[rax+rcx]
+  punpcklbw   xmm1,xmm3
+  movaps      [rbp+0A0h],xmm12
+  psubw       xmm13,xmm1
+  movaps      [rbp+40h],xmm15
+  movdqa      xmm15,xmm14
+  movaps      [rbp+10h],xmm7
+  movdqa      xmm7,xmm6
+  punpcklbw   xmm10,xmm3
+  movdqa      xmm12,[r12+rcx]
+  punpcklbw   xmm7,xmm3
+  punpcklbw   xmm12,xmm3
+  punpcklbw   xmm15,xmm3
+  pabsw       xmm3,xmm13
+  movdqa      xmm13,xmm10
+  psubw       xmm13,xmm15
+  movdqa      [rbp+0F0h],xmm15
+  pabsw       xmm15,xmm13
+  movdqa      xmm13,xmm11
+  movdqa      [rbp+0B0h],xmm1
+  movdqa      xmm1,xmm0
+  pavgw       xmm13,xmm10
+  pcmpgtw     xmm1,xmm3
+  movdqa      [rbp+120h],xmm13
+  movaps      xmm13,xmm2
+  punpcklwd   xmm4,xmm4
+  movdqa      xmm3,xmm0
+  movdqa      [rbp+100h],xmm1
+  psubw       xmm13,xmm1
+  movdqa      xmm1,xmm10
+  pcmpgtw     xmm3,xmm15
+  pshufd      xmm4,xmm4,0
+  psubw       xmm1,xmm11
+  movdqa      [rbp+0D0h],xmm10
+  psubw       xmm13,xmm3
+  movdqa      [rbp+110h],xmm3
+  pabsw       xmm15,xmm1
+  movdqa      xmm3,xmm4
+  psubw       xmm10,xmm12
+  pcmpgtw     xmm3,xmm15
+  pabsw       xmm15,xmm10
+  movdqa      xmm10,xmm0
+  psllw       xmm1,2
+  movdqa      [rbp+0C0h],xmm11
+  psubw       xmm11,xmm7
+  pcmpgtw     xmm10,xmm15
+  pabsw       xmm11,xmm11
+  movdqa      xmm15,xmm0
+  pand        xmm3,xmm10
+  pcmpgtw     xmm15,xmm11
+  movaps      xmm11,xmm2
+  pxor        xmm10,xmm10
+  pand        xmm3,xmm15
+  pcmpgtw     xmm11,xmm10
+  pcmpeqw     xmm10,xmm2
+  por         xmm11,xmm10
+  pand        xmm3,xmm11
+  movdqa      xmm11,xmm7
+  psubw       xmm11,xmm12
+  pxor        xmm15,xmm15
+  paddw       xmm11,xmm1
+  psubw       xmm15,xmm13
+  movdqa      [rbp+0E0h],xmm12
+  paddw       xmm11,[FOUR_16B_SSE2]
+  pxor        xmm12,xmm12
+  psraw       xmm11,3
+  punpckhbw   xmm8,xmm12
+  pmaxsw      xmm15,xmm11
+  punpckhbw   xmm5,xmm12
+  movdqa      xmm11,xmm8
+  pminsw      xmm13,xmm15
+  psubw       xmm11,xmm5
+  punpckhbw   xmm9,xmm12
+  pand        xmm13,xmm3
+  movdqa      [rbp+130h],xmm13
+  pabsw       xmm13,xmm11
+  punpckhbw   xmm14,xmm12
+  movdqa      xmm11,xmm9
+  psubw       xmm11,xmm14
+  movdqa      xmm15,xmm0
+  movdqa      [rbp+140h],xmm14
+  pabsw       xmm14,xmm11
+  movdqa      xmm11,xmm8
+  pcmpgtw     xmm15,xmm14
+  movdqa      xmm1,[r12+rcx]
+  pavgw       xmm11,xmm9
+  movdqa      [rbp+170h],xmm11
+  movdqa      xmm10,xmm9
+  punpckhbw   xmm6,xmm12
+  psubw       xmm10,xmm8
+  punpckhbw   xmm1,xmm12
+  movdqa      xmm12,xmm0
+  movaps      xmm11,[rbp+0A0h]
+  pcmpgtw     xmm12,xmm13
+  movaps      xmm13,xmm11
+  psubw       xmm13,xmm12
+  movdqa      [rbp+160h],xmm15
+  psubw       xmm13,xmm15
+  movdqa      xmm15,xmm9
+  psubw       xmm15,xmm1
+  movdqa      [rbp+150h],xmm12
+  pabsw       xmm12,xmm10
+  pabsw       xmm14,xmm15
+  movdqa      xmm15,xmm8
+  pcmpgtw     xmm4,xmm12
+  movdqa      xmm12,xmm0
+  psubw       xmm15,xmm6
+  pcmpgtw     xmm12,xmm14
+  pabsw       xmm14,xmm15
+  psllw       xmm10,2
+  pcmpgtw     xmm0,xmm14
+  movdqa      xmm14,xmm6
+  psubw       xmm14,xmm1
+  pand        xmm4,xmm12
+  paddw       xmm14,xmm10
+  pand        xmm4,xmm0
+  paddw       xmm14,[FOUR_16B_SSE2]
+  pxor        xmm15,xmm15
+  movaps      xmm12,xmm11
+  psubw       xmm15,xmm13
+  pxor        xmm0,xmm0
+  psraw       xmm14,3
+  pcmpgtw     xmm12,xmm0
+  pcmpeqw     xmm0,xmm11
+  pmaxsw      xmm15,xmm14
+  por         xmm12,xmm0
+  movdqa      xmm0,[rbp+120h]
+  pminsw      xmm13,xmm15
+  movdqa      xmm15,[rbp+0B0h]
+  movdqa      xmm10,xmm7
+  pand        xmm4,xmm12
+  paddw       xmm15,xmm0
+  pxor        xmm12,xmm12
+  paddw       xmm10,xmm7
+  movdqa      xmm14,xmm12
+  psubw       xmm15,xmm10
+  psubw       xmm14,xmm2
+  psraw       xmm15,1
+  pmaxsw      xmm15,xmm14
+  movdqa      xmm10,xmm6
+  pminsw      xmm15,xmm2
+  paddw       xmm10,xmm6
+  pand        xmm15,xmm3
+  psubw       xmm12,xmm11
+  pand        xmm15,[rbp+100h]
+  pand        xmm13,xmm4
+  paddw       xmm7,xmm15
+  paddw       xmm8,xmm13
+  movdqa      xmm15,[rbp+170h]
+  psubw       xmm9,xmm13
+  paddw       xmm5,xmm15
+  psubw       xmm5,xmm10
+  psraw       xmm5,1
+  pmaxsw      xmm5,xmm12
+  pminsw      xmm5,xmm11
+  pand        xmm5,xmm4
+  pand        xmm5,[rbp+150h]
+  paddw       xmm6,xmm5
+  movdqa      xmm5,[rbp+0C0h]
+  packuswb    xmm7,xmm6
+  movdqa      xmm6,[rbp+130h]
+  paddw       xmm5,xmm6
+  packuswb    xmm5,xmm8
+  movdqa      xmm8,[rbp+0D0h]
+  psubw       xmm8,xmm6
+  movdqa      xmm6,[rbp+0F0h]
+  paddw       xmm6,xmm0
+  movdqa      xmm0,[rbp+0E0h]
+  packuswb    xmm8,xmm9
+  movdqa      xmm9,xmm0
+  paddw       xmm9,xmm0
+  psubw       xmm6,xmm9
+  psraw       xmm6,1
+  pmaxsw      xmm14,xmm6
+  pminsw      xmm2,xmm14
+  pand        xmm2,xmm3
+  pand        xmm2,[rbp+110h]
+  paddw       xmm0,xmm2
+  movdqa      xmm2,[rbp+140h]
+  paddw       xmm2,xmm15
+  movdqa      xmm15,xmm1
+  paddw       xmm15,xmm1
+  psubw       xmm2,xmm15
+  psraw       xmm2,1
+  pmaxsw      xmm12,xmm2
+  pminsw      xmm11,xmm12
+  pand        xmm11,xmm4
+  pand        xmm11,[rbp+160h]
+  paddw       xmm1,xmm11
+  movdqa      [rax+rcx],xmm7
+  movdqa      [r10],xmm5
+  packuswb    xmm0,xmm1
+  movdqa      [rcx],xmm8
+  movdqa      [r12+rcx],xmm0
+  mov         r12,qword [rbp+180h]
+  lea         rsp,[rbp+190h]
+  pop         rbp
+  ret
 
 
 WELS_EXTERN   DeblockLumaEq4V_sse2
@@ -321,462 +321,462 @@
 
 ALIGN  16
 DeblockLumaEq4V_sse2:
-  mov         rax,rsp 
-  push        rbx  
-  push        rbp  
-  push        rsi  
-  push        rdi  
-  sub         rsp,1D8h 
-  movaps      [rax-38h],xmm6 
-  movaps      [rax-48h],xmm7 
-  movaps      [rax-58h],xmm8 
-  pxor        xmm1,xmm1 
-  movsxd      r10,edx 
-  mov         rbp,rcx 
-  mov         r11d,r8d 
-  mov         rdx,rcx 
-  mov         rdi,rbp 
-  mov         rbx,rbp 
-  movdqa      xmm5,[rbp] 
-  movaps      [rax-68h],xmm9 
-  movaps      [rax-78h],xmm10 
-  punpcklbw   xmm5,xmm1 
-  movaps      [rax-88h],xmm11 
-  movaps      [rax-98h],xmm12 
-  movaps      [rax-0A8h],xmm13 
-  movaps      [rax-0B8h],xmm14 
-  movdqa      xmm14,[r10+rbp] 
-  movaps      [rax-0C8h],xmm15 
-  lea         eax,[r10*4] 
-  movsxd      r8,eax 
-  lea         eax,[r10+r10*2] 
-  movsxd      rcx,eax 
-  lea         eax,[r10+r10] 
-  sub         rdx,r8 
-  punpcklbw   xmm14,xmm1 
-  movdqa      [rsp+90h],xmm5 
-  movdqa      [rsp+30h],xmm14 
-  movsxd      rsi,eax 
-  movsx       eax,r11w 
-  sub         rdi,rcx 
-  sub         rbx,rsi 
-  mov         r8,rbp 
-  sub         r8,r10 
-  movd        xmm0,eax 
-  movsx       eax,r9w 
-  movdqa      xmm12,[rdi] 
-  movdqa      xmm6, [rsi+rbp] 
-  movdqa      xmm13,[rbx] 
-  punpcklwd   xmm0,xmm0 
-  pshufd      xmm11,xmm0,0 
-  punpcklbw   xmm13,xmm1 
-  punpcklbw   xmm6,xmm1 
-  movdqa      xmm8,[r8] 
-  movd        xmm0,eax 
-  movdqa      xmm10,xmm11 
-  mov         eax,2 
-  punpcklbw   xmm8,xmm1 
-  punpcklbw   xmm12,xmm1 
-  cwde             
-  punpcklwd   xmm0,xmm0 
-  psraw       xmm10,2 
-  movdqa      xmm1,xmm8 
-  movdqa      [rsp+0F0h],xmm13 
-  movdqa      [rsp+0B0h],xmm8 
-  pshufd      xmm7,xmm0,0 
-  psubw       xmm1,xmm13 
-  movdqa      xmm0,xmm5 
-  movdqa      xmm4,xmm7 
-  movdqa      xmm2,xmm7 
-  psubw       xmm0,xmm8 
-  pabsw       xmm3,xmm0 
-  pabsw       xmm0,xmm1 
-  movdqa      xmm1,xmm5 
-  movdqa      [rsp+40h],xmm7 
-  movdqa      [rsp+60h],xmm6 
-  pcmpgtw     xmm4,xmm0 
-  psubw       xmm1,xmm14 
-  pabsw       xmm0,xmm1 
-  pcmpgtw     xmm2,xmm0 
-  pand        xmm4,xmm2 
-  movdqa      xmm0,xmm11 
-  pcmpgtw     xmm0,xmm3 
-  pand        xmm4,xmm0 
-  movd        xmm0,eax 
-  movdqa      [rsp+20h],xmm4 
-  punpcklwd   xmm0,xmm0 
-  pshufd      xmm2,xmm0,0 
-  paddw       xmm10,xmm2 
-  movdqa      [rsp+0A0h],xmm2 
-  movdqa      xmm15,xmm7 
-  pxor        xmm4,xmm4 
-  movdqa      xmm0,xmm8 
-  psubw       xmm0,xmm12 
-  mov         eax,4 
-  pabsw       xmm0,xmm0 
-  movdqa      xmm1,xmm10 
-  cwde             
-  pcmpgtw     xmm15,xmm0 
-  pcmpgtw     xmm1,xmm3 
-  movdqa      xmm3,xmm7 
-  movdqa      xmm7,[rdx] 
-  movdqa      xmm0,xmm5 
-  psubw       xmm0,xmm6 
-  pand        xmm15,xmm1 
-  punpcklbw   xmm7,xmm4 
-  movdqa      xmm9,xmm15 
-  pabsw       xmm0,xmm0 
-  psllw       xmm7,1 
-  pandn       xmm9,xmm12 
-  pcmpgtw     xmm3,xmm0 
-  paddw       xmm7,xmm12 
-  movd        xmm0,eax 
-  pand        xmm3,xmm1 
-  paddw       xmm7,xmm12 
-  punpcklwd   xmm0,xmm0 
-  paddw       xmm7,xmm12 
-  pshufd      xmm1,xmm0,0 
-  paddw       xmm7,xmm13 
-  movdqa      xmm0,xmm3 
-  pandn       xmm0,xmm6 
-  paddw       xmm7,xmm8 
-  movdqa      [rsp+70h],xmm1 
-  paddw       xmm7,xmm5 
-  movdqa      [rsp+120h],xmm0 
-  movdqa      xmm0,[rcx+rbp] 
-  punpcklbw   xmm0,xmm4 
-  paddw       xmm7,xmm1 
-  movdqa      xmm4,xmm15 
-  psllw       xmm0,1 
-  psraw       xmm7,3 
-  paddw       xmm0,xmm6 
-  pand        xmm7,xmm15 
-  paddw       xmm0,xmm6 
-  paddw       xmm0,xmm6 
-  paddw       xmm0,xmm14 
-  movdqa      xmm6,xmm15 
-  paddw       xmm0,xmm5 
-  pandn       xmm6,xmm13 
-  paddw       xmm0,xmm8 
-  paddw       xmm0,xmm1 
-  psraw       xmm0,3 
-  movdqa      xmm1,xmm12 
-  paddw       xmm1,xmm13 
-  pand        xmm0,xmm3 
-  movdqa      [rsp+100h],xmm0 
-  movdqa      xmm0,xmm8 
-  paddw       xmm0,xmm5 
-  paddw       xmm1,xmm0 
-  movdqa      xmm0,xmm3 
-  paddw       xmm1,xmm2 
-  psraw       xmm1,2 
-  pandn       xmm0,xmm14 
-  pand        xmm4,xmm1 
-  movdqa      [rsp+0E0h],xmm0 
-  movdqa      xmm0,xmm5 
-  paddw       xmm0,xmm8 
-  movdqa      xmm1,[rsp+60h] 
-  paddw       xmm1,xmm14 
-  movdqa      xmm14,xmm3 
-  paddw       xmm1,xmm0 
-  movdqa      xmm0,xmm8 
-  paddw       xmm0,[rsp+30h] 
-  paddw       xmm1,xmm2 
-  psraw       xmm1,2 
-  pand        xmm14,xmm1 
-  movdqa      xmm1,xmm13 
-  paddw       xmm1,xmm13 
-  paddw       xmm1,xmm0 
-  paddw       xmm1,xmm2 
-  psraw       xmm1,2 
-  movdqa      xmm0,[rsp+30h] 
-  movdqa      xmm2,xmm13 
-  movdqa      xmm5,xmm15 
-  paddw       xmm0,[rsp+70h] 
-  pandn       xmm5,xmm1 
-  paddw       xmm2,xmm8 
-  movdqa      xmm8,[rsp+90h] 
-  movdqa      xmm1,xmm12 
-  paddw       xmm2,xmm8 
-  psllw       xmm2,1 
-  paddw       xmm2,xmm0 
-  paddw       xmm1,xmm2 
-  movdqa      xmm0,xmm8 
-  movdqa      xmm8,xmm3 
-  movdqa      xmm2,[rsp+30h] 
-  paddw       xmm0,xmm13 
-  psraw       xmm1,3 
-  pand        xmm15,xmm1 
-  movdqa      xmm1,xmm2 
-  paddw       xmm1,xmm2 
-  paddw       xmm2,[rsp+90h] 
-  paddw       xmm2,[rsp+0B0h] 
-  paddw       xmm1,xmm0 
-  movdqa      xmm0,xmm13 
-  movdqa      xmm13,[r8] 
-  paddw       xmm0, [rsp+70h] 
-  paddw       xmm1, [rsp+0A0h] 
-  psllw       xmm2,1 
-  paddw       xmm2,xmm0 
-  psraw       xmm1,2 
-  movdqa      xmm0, [rdi] 
-  pandn       xmm8,xmm1 
-  movdqa      xmm1, [rsp+60h] 
-  paddw       xmm1,xmm2 
-  movdqa      xmm2, [rbx] 
-  psraw       xmm1,3 
-  pand        xmm3,xmm1 
-  movdqa      xmm1, [rbp] 
-  movdqa      [rsp+0D0h],xmm3 
-  pxor        xmm3,xmm3 
-  punpckhbw   xmm0,xmm3 
-  punpckhbw   xmm1,xmm3 
-  punpckhbw   xmm13,xmm3 
-  movdqa      [rsp+0C0h],xmm0 
-  movdqa      xmm0,[r10+rbp] 
-  movdqa      [rsp],xmm1 
-  punpckhbw   xmm0,xmm3 
-  punpckhbw   xmm2,xmm3 
-  movdqa      [rsp+80h],xmm0 
-  movdqa      xmm0,[rsi+rbp] 
-  movdqa      [rsp+10h],xmm13 
-  punpckhbw   xmm0,xmm3 
-  movdqa      [rsp+50h],xmm0 
-  movdqa      xmm0,xmm1 
-  movdqa      xmm1,xmm13 
-  psubw       xmm0,xmm13 
-  psubw       xmm1,xmm2 
-  pabsw       xmm3,xmm0 
-  pabsw       xmm0,xmm1 
-  movdqa      xmm1,[rsp] 
-  movdqa      xmm13,[rsp+40h] 
-  movdqa      [rsp+110h],xmm2 
-  psubw       xmm1, [rsp+80h] 
-  pcmpgtw     xmm13,xmm0 
-  pcmpgtw     xmm11,xmm3 
-  pabsw       xmm0,xmm1 
-  pcmpgtw     xmm10,xmm3 
-  movdqa      xmm1, [rsp+40h] 
-  movdqa      xmm2,xmm1 
-  movdqa      xmm3,xmm1 
-  pcmpgtw     xmm2,xmm0 
-  movdqa      xmm0, [rsp+10h] 
-  pand        xmm13,xmm2 
-  pand        xmm13,xmm11 
-  movdqa      xmm11,[rsp+0C0h] 
-  psubw       xmm0,xmm11 
-  pabsw       xmm0,xmm0 
-  pcmpgtw     xmm3,xmm0 
-  pand        xmm3,xmm10 
-  movdqa      xmm0,[rsp] 
-  psubw       xmm0,[rsp+50h] 
-  movdqa      xmm2,[rdx] 
-  pabsw       xmm0,xmm0 
-  por         xmm7,xmm9 
-  movdqa      xmm9,[rsp+20h] 
-  pcmpgtw     xmm1,xmm0 
-  pand        xmm9,xmm7 
-  movdqa      xmm7,[rsp+20h] 
-  movdqa      xmm0,xmm7 
-  pandn       xmm0,xmm12 
-  movdqa      xmm12,[rsp+110h] 
-  pand        xmm1,xmm10 
-  movdqa      xmm10,[rsp+70h] 
-  movdqa      [rsp+40h],xmm1 
-  movdqa      xmm1,xmm13 
-  por         xmm9,xmm0 
-  pxor        xmm0,xmm0 
-  por         xmm4,xmm6 
-  movdqa      xmm6,xmm7 
-  punpckhbw   xmm2,xmm0 
-  por         xmm15,xmm5 
-  movdqa      xmm5,[rsp+20h] 
-  movdqa      xmm0,xmm3 
-  psllw       xmm2,1 
-  pandn       xmm0,xmm11 
-  pand        xmm6,xmm4 
-  movdqa      xmm4,[rsp] 
-  paddw       xmm2,xmm11 
-  pand        xmm5,xmm15 
-  movdqa      xmm15,[rsp+20h] 
-  paddw       xmm2,xmm11 
-  paddw       xmm2,xmm11 
-  paddw       xmm2,xmm12 
-  paddw       xmm2,[rsp+10h] 
-  paddw       xmm2,[rsp] 
-  paddw       xmm2,xmm10 
-  psraw       xmm2,3 
-  pand        xmm2,xmm3 
-  por         xmm2,xmm0 
-  pand        xmm1,xmm2 
-  movdqa      xmm0,xmm13 
-  movdqa      xmm2,xmm11 
-  pandn       xmm0,xmm11 
-  paddw       xmm2,xmm12 
-  por         xmm1,xmm0 
-  packuswb    xmm9,xmm1 
-  movdqa      xmm0,xmm7 
-  movdqa      xmm7,[rsp+0A0h] 
-  pandn       xmm0,[rsp+0F0h] 
-  movdqa      xmm1,xmm3 
-  por         xmm6,xmm0 
-  movdqa      xmm0,[rsp+10h] 
-  paddw       xmm0,xmm4 
-  paddw       xmm2,xmm0 
-  paddw       xmm2,xmm7 
-  movdqa      xmm0,xmm3 
-  pandn       xmm0,xmm12 
-  psraw       xmm2,2 
-  pand        xmm1,xmm2 
-  por         xmm1,xmm0 
-  movdqa      xmm2,xmm13 
-  movdqa      xmm0,xmm13 
-  pand        xmm2,xmm1 
-  pandn       xmm0,xmm12 
-  movdqa      xmm1,xmm12 
-  paddw       xmm1,[rsp+10h] 
-  por         xmm2,xmm0 
-  movdqa      xmm0,xmm15 
-  pandn       xmm0,[rsp+0B0h] 
-  paddw       xmm1,xmm4 
-  packuswb    xmm6,xmm2 
-  movdqa      xmm2,xmm3 
-  psllw       xmm1,1 
-  por         xmm5,xmm0 
-  movdqa      xmm0,[rsp+80h] 
-  paddw       xmm0,xmm10 
-  paddw       xmm1,xmm0 
-  paddw       xmm11,xmm1 
-  psraw       xmm11,3 
-  movdqa      xmm1,xmm12 
-  pand        xmm2,xmm11 
-  paddw       xmm1,xmm12 
-  movdqa      xmm11,[rsp+80h] 
-  movdqa      xmm0, [rsp+10h] 
-  por         xmm14,[rsp+0E0h] 
-  paddw       xmm0,xmm11 
-  movdqa      xmm4,xmm15 
-  paddw       xmm1,xmm0 
-  movdqa      xmm0,xmm13 
-  paddw       xmm1,xmm7 
-  psraw       xmm1,2 
-  pandn       xmm3,xmm1 
-  por         xmm2,xmm3 
-  movdqa      xmm1,xmm13 
-  movdqa      xmm3,[rsp+10h] 
-  pandn       xmm0,xmm3 
-  pand        xmm1,xmm2 
-  movdqa      xmm2,xmm11 
-  paddw       xmm2,[rsp] 
-  por         xmm1,xmm0 
-  movdqa      xmm0,[rsp+0D0h] 
-  por         xmm0,xmm8 
-  paddw       xmm2,xmm3 
-  packuswb    xmm5,xmm1 
-  movdqa      xmm8,[rsp+40h] 
-  movdqa      xmm1,[rsp+50h] 
-  movdqa      xmm3,xmm8 
-  pand        xmm4,xmm0 
-  psllw       xmm2,1 
-  movdqa      xmm0,xmm15 
-  pandn       xmm0,[rsp+90h] 
-  por         xmm4,xmm0 
-  movdqa      xmm0,xmm12 
-  paddw       xmm0,xmm10 
-  paddw       xmm2,xmm0 
-  paddw       xmm1,xmm2 
-  movdqa      xmm0,[rsp] 
-  movdqa      xmm2,xmm11 
-  paddw       xmm0,xmm12 
-  movdqa      xmm12,[rsp] 
-  paddw       xmm2,xmm11 
-  paddw       xmm2,xmm0 
-  psraw       xmm1,3 
-  movdqa      xmm0,xmm8 
-  pand        xmm3,xmm1 
-  paddw       xmm2,xmm7 
-  movdqa      xmm1,xmm13 
-  psraw       xmm2,2 
-  pandn       xmm0,xmm2 
-  por         xmm3,xmm0 
-  movdqa      xmm2,[rsp+50h] 
-  movdqa      xmm0,xmm13 
-  pandn       xmm0,xmm12 
-  pand        xmm1,xmm3 
-  paddw       xmm2,xmm11 
-  movdqa      xmm3,xmm15 
-  por         xmm1,xmm0 
-  pand        xmm3,xmm14 
-  movdqa      xmm14,[rsp+10h] 
-  movdqa      xmm0,xmm15 
-  pandn       xmm0,[rsp+30h] 
-  packuswb    xmm4,xmm1 
-  movdqa      xmm1,xmm8 
-  por         xmm3,xmm0 
-  movdqa      xmm0,xmm12 
-  paddw       xmm0,xmm14 
-  paddw       xmm2,xmm0 
-  paddw       xmm2,xmm7 
-  movdqa      xmm0,xmm8 
-  pandn       xmm0,xmm11 
-  psraw       xmm2,2 
-  pand        xmm1,xmm2 
-  por         xmm1,xmm0 
-  movdqa      xmm2,xmm13 
-  movdqa      xmm0,xmm13 
-  pandn       xmm0,xmm11 
-  pand        xmm2,xmm1 
-  movdqa      xmm1,xmm15 
-  por         xmm2,xmm0 
-  packuswb    xmm3,xmm2 
-  movdqa      xmm0,[rsp+100h] 
-  por         xmm0,[rsp+120h] 
-  pand        xmm1,xmm0 
-  movdqa      xmm2,[rcx+rbp] 
-  movdqa      xmm7,[rsp+50h] 
-  pandn       xmm15,[rsp+60h] 
-  lea         r11,[rsp+1D8h] 
-  pxor        xmm0,xmm0 
-  por         xmm1,xmm15 
-  movaps      xmm15,[r11-0A8h] 
-  movdqa      [rdi],xmm9 
-  movaps      xmm9,[r11-48h] 
-  punpckhbw   xmm2,xmm0 
-  psllw       xmm2,1 
-  paddw       xmm2,xmm7 
-  paddw       xmm2,xmm7 
-  movdqa      [rbx],xmm6 
-  movaps      xmm6,[r11-18h] 
-  paddw       xmm2,xmm7 
-  paddw       xmm2,xmm11 
-  movaps      xmm11,[r11-68h] 
-  paddw       xmm2,xmm12 
-  movaps      xmm12,[r11-78h] 
-  paddw       xmm2,xmm14 
-  paddw       xmm2,xmm10 
-  psraw       xmm2,3 
-  movaps      xmm10,[r11-58h] 
-  movaps      xmm14,[r11-98h] 
-  movdqa      xmm0,xmm13 
-  pand        xmm2,xmm8 
-  pandn       xmm8,xmm7 
-  pandn       xmm13,xmm7 
-  por         xmm2,xmm8 
-  movaps      xmm7,[r11-28h] 
-  movaps      xmm8,[r11-38h] 
-  movdqa      [r8],xmm5 
-  pand        xmm0,xmm2 
-  por         xmm0,xmm13 
-  packuswb    xmm1,xmm0 
-  movaps      xmm13,[r11-88h] 
-  movdqa      [rbp],xmm4 
-  movdqa      [r10+rbp],xmm3 
-  movdqa      [rsi+rbp],xmm1 
-  mov         rsp,r11 
-  pop         rdi  
-  pop         rsi  
-  pop         rbp  
-  pop         rbx  
+  mov         rax,rsp
+  push        rbx
+  push        rbp
+  push        rsi
+  push        rdi
+  sub         rsp,1D8h
+  movaps      [rax-38h],xmm6
+  movaps      [rax-48h],xmm7
+  movaps      [rax-58h],xmm8
+  pxor        xmm1,xmm1
+  movsxd      r10,edx
+  mov         rbp,rcx
+  mov         r11d,r8d
+  mov         rdx,rcx
+  mov         rdi,rbp
+  mov         rbx,rbp
+  movdqa      xmm5,[rbp]
+  movaps      [rax-68h],xmm9
+  movaps      [rax-78h],xmm10
+  punpcklbw   xmm5,xmm1
+  movaps      [rax-88h],xmm11
+  movaps      [rax-98h],xmm12
+  movaps      [rax-0A8h],xmm13
+  movaps      [rax-0B8h],xmm14
+  movdqa      xmm14,[r10+rbp]
+  movaps      [rax-0C8h],xmm15
+  lea         eax,[r10*4]
+  movsxd      r8,eax
+  lea         eax,[r10+r10*2]
+  movsxd      rcx,eax
+  lea         eax,[r10+r10]
+  sub         rdx,r8
+  punpcklbw   xmm14,xmm1
+  movdqa      [rsp+90h],xmm5
+  movdqa      [rsp+30h],xmm14
+  movsxd      rsi,eax
+  movsx       eax,r11w
+  sub         rdi,rcx
+  sub         rbx,rsi
+  mov         r8,rbp
+  sub         r8,r10
+  movd        xmm0,eax
+  movsx       eax,r9w
+  movdqa      xmm12,[rdi]
+  movdqa      xmm6, [rsi+rbp]
+  movdqa      xmm13,[rbx]
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm11,xmm0,0
+  punpcklbw   xmm13,xmm1
+  punpcklbw   xmm6,xmm1
+  movdqa      xmm8,[r8]
+  movd        xmm0,eax
+  movdqa      xmm10,xmm11
+  mov         eax,2
+  punpcklbw   xmm8,xmm1
+  punpcklbw   xmm12,xmm1
+  cwde
+  punpcklwd   xmm0,xmm0
+  psraw       xmm10,2
+  movdqa      xmm1,xmm8
+  movdqa      [rsp+0F0h],xmm13
+  movdqa      [rsp+0B0h],xmm8
+  pshufd      xmm7,xmm0,0
+  psubw       xmm1,xmm13
+  movdqa      xmm0,xmm5
+  movdqa      xmm4,xmm7
+  movdqa      xmm2,xmm7
+  psubw       xmm0,xmm8
+  pabsw       xmm3,xmm0
+  pabsw       xmm0,xmm1
+  movdqa      xmm1,xmm5
+  movdqa      [rsp+40h],xmm7
+  movdqa      [rsp+60h],xmm6
+  pcmpgtw     xmm4,xmm0
+  psubw       xmm1,xmm14
+  pabsw       xmm0,xmm1
+  pcmpgtw     xmm2,xmm0
+  pand        xmm4,xmm2
+  movdqa      xmm0,xmm11
+  pcmpgtw     xmm0,xmm3
+  pand        xmm4,xmm0
+  movd        xmm0,eax
+  movdqa      [rsp+20h],xmm4
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm2,xmm0,0
+  paddw       xmm10,xmm2
+  movdqa      [rsp+0A0h],xmm2
+  movdqa      xmm15,xmm7
+  pxor        xmm4,xmm4
+  movdqa      xmm0,xmm8
+  psubw       xmm0,xmm12
+  mov         eax,4
+  pabsw       xmm0,xmm0
+  movdqa      xmm1,xmm10
+  cwde
+  pcmpgtw     xmm15,xmm0
+  pcmpgtw     xmm1,xmm3
+  movdqa      xmm3,xmm7
+  movdqa      xmm7,[rdx]
+  movdqa      xmm0,xmm5
+  psubw       xmm0,xmm6
+  pand        xmm15,xmm1
+  punpcklbw   xmm7,xmm4
+  movdqa      xmm9,xmm15
+  pabsw       xmm0,xmm0
+  psllw       xmm7,1
+  pandn       xmm9,xmm12
+  pcmpgtw     xmm3,xmm0
+  paddw       xmm7,xmm12
+  movd        xmm0,eax
+  pand        xmm3,xmm1
+  paddw       xmm7,xmm12
+  punpcklwd   xmm0,xmm0
+  paddw       xmm7,xmm12
+  pshufd      xmm1,xmm0,0
+  paddw       xmm7,xmm13
+  movdqa      xmm0,xmm3
+  pandn       xmm0,xmm6
+  paddw       xmm7,xmm8
+  movdqa      [rsp+70h],xmm1
+  paddw       xmm7,xmm5
+  movdqa      [rsp+120h],xmm0
+  movdqa      xmm0,[rcx+rbp]
+  punpcklbw   xmm0,xmm4
+  paddw       xmm7,xmm1
+  movdqa      xmm4,xmm15
+  psllw       xmm0,1
+  psraw       xmm7,3
+  paddw       xmm0,xmm6
+  pand        xmm7,xmm15
+  paddw       xmm0,xmm6
+  paddw       xmm0,xmm6
+  paddw       xmm0,xmm14
+  movdqa      xmm6,xmm15
+  paddw       xmm0,xmm5
+  pandn       xmm6,xmm13
+  paddw       xmm0,xmm8
+  paddw       xmm0,xmm1
+  psraw       xmm0,3
+  movdqa      xmm1,xmm12
+  paddw       xmm1,xmm13
+  pand        xmm0,xmm3
+  movdqa      [rsp+100h],xmm0
+  movdqa      xmm0,xmm8
+  paddw       xmm0,xmm5
+  paddw       xmm1,xmm0
+  movdqa      xmm0,xmm3
+  paddw       xmm1,xmm2
+  psraw       xmm1,2
+  pandn       xmm0,xmm14
+  pand        xmm4,xmm1
+  movdqa      [rsp+0E0h],xmm0
+  movdqa      xmm0,xmm5
+  paddw       xmm0,xmm8
+  movdqa      xmm1,[rsp+60h]
+  paddw       xmm1,xmm14
+  movdqa      xmm14,xmm3
+  paddw       xmm1,xmm0
+  movdqa      xmm0,xmm8
+  paddw       xmm0,[rsp+30h]
+  paddw       xmm1,xmm2
+  psraw       xmm1,2
+  pand        xmm14,xmm1
+  movdqa      xmm1,xmm13
+  paddw       xmm1,xmm13
+  paddw       xmm1,xmm0
+  paddw       xmm1,xmm2
+  psraw       xmm1,2
+  movdqa      xmm0,[rsp+30h]
+  movdqa      xmm2,xmm13
+  movdqa      xmm5,xmm15
+  paddw       xmm0,[rsp+70h]
+  pandn       xmm5,xmm1
+  paddw       xmm2,xmm8
+  movdqa      xmm8,[rsp+90h]
+  movdqa      xmm1,xmm12
+  paddw       xmm2,xmm8
+  psllw       xmm2,1
+  paddw       xmm2,xmm0
+  paddw       xmm1,xmm2
+  movdqa      xmm0,xmm8
+  movdqa      xmm8,xmm3
+  movdqa      xmm2,[rsp+30h]
+  paddw       xmm0,xmm13
+  psraw       xmm1,3
+  pand        xmm15,xmm1
+  movdqa      xmm1,xmm2
+  paddw       xmm1,xmm2
+  paddw       xmm2,[rsp+90h]
+  paddw       xmm2,[rsp+0B0h]
+  paddw       xmm1,xmm0
+  movdqa      xmm0,xmm13
+  movdqa      xmm13,[r8]
+  paddw       xmm0, [rsp+70h]
+  paddw       xmm1, [rsp+0A0h]
+  psllw       xmm2,1
+  paddw       xmm2,xmm0
+  psraw       xmm1,2
+  movdqa      xmm0, [rdi]
+  pandn       xmm8,xmm1
+  movdqa      xmm1, [rsp+60h]
+  paddw       xmm1,xmm2
+  movdqa      xmm2, [rbx]
+  psraw       xmm1,3
+  pand        xmm3,xmm1
+  movdqa      xmm1, [rbp]
+  movdqa      [rsp+0D0h],xmm3
+  pxor        xmm3,xmm3
+  punpckhbw   xmm0,xmm3
+  punpckhbw   xmm1,xmm3
+  punpckhbw   xmm13,xmm3
+  movdqa      [rsp+0C0h],xmm0
+  movdqa      xmm0,[r10+rbp]
+  movdqa      [rsp],xmm1
+  punpckhbw   xmm0,xmm3
+  punpckhbw   xmm2,xmm3
+  movdqa      [rsp+80h],xmm0
+  movdqa      xmm0,[rsi+rbp]
+  movdqa      [rsp+10h],xmm13
+  punpckhbw   xmm0,xmm3
+  movdqa      [rsp+50h],xmm0
+  movdqa      xmm0,xmm1
+  movdqa      xmm1,xmm13
+  psubw       xmm0,xmm13
+  psubw       xmm1,xmm2
+  pabsw       xmm3,xmm0
+  pabsw       xmm0,xmm1
+  movdqa      xmm1,[rsp]
+  movdqa      xmm13,[rsp+40h]
+  movdqa      [rsp+110h],xmm2
+  psubw       xmm1, [rsp+80h]
+  pcmpgtw     xmm13,xmm0
+  pcmpgtw     xmm11,xmm3
+  pabsw       xmm0,xmm1
+  pcmpgtw     xmm10,xmm3
+  movdqa      xmm1, [rsp+40h]
+  movdqa      xmm2,xmm1
+  movdqa      xmm3,xmm1
+  pcmpgtw     xmm2,xmm0
+  movdqa      xmm0, [rsp+10h]
+  pand        xmm13,xmm2
+  pand        xmm13,xmm11
+  movdqa      xmm11,[rsp+0C0h]
+  psubw       xmm0,xmm11
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm3,xmm0
+  pand        xmm3,xmm10
+  movdqa      xmm0,[rsp]
+  psubw       xmm0,[rsp+50h]
+  movdqa      xmm2,[rdx]
+  pabsw       xmm0,xmm0
+  por         xmm7,xmm9
+  movdqa      xmm9,[rsp+20h]
+  pcmpgtw     xmm1,xmm0
+  pand        xmm9,xmm7
+  movdqa      xmm7,[rsp+20h]
+  movdqa      xmm0,xmm7
+  pandn       xmm0,xmm12
+  movdqa      xmm12,[rsp+110h]
+  pand        xmm1,xmm10
+  movdqa      xmm10,[rsp+70h]
+  movdqa      [rsp+40h],xmm1
+  movdqa      xmm1,xmm13
+  por         xmm9,xmm0
+  pxor        xmm0,xmm0
+  por         xmm4,xmm6
+  movdqa      xmm6,xmm7
+  punpckhbw   xmm2,xmm0
+  por         xmm15,xmm5
+  movdqa      xmm5,[rsp+20h]
+  movdqa      xmm0,xmm3
+  psllw       xmm2,1
+  pandn       xmm0,xmm11
+  pand        xmm6,xmm4
+  movdqa      xmm4,[rsp]
+  paddw       xmm2,xmm11
+  pand        xmm5,xmm15
+  movdqa      xmm15,[rsp+20h]
+  paddw       xmm2,xmm11
+  paddw       xmm2,xmm11
+  paddw       xmm2,xmm12
+  paddw       xmm2,[rsp+10h]
+  paddw       xmm2,[rsp]
+  paddw       xmm2,xmm10
+  psraw       xmm2,3
+  pand        xmm2,xmm3
+  por         xmm2,xmm0
+  pand        xmm1,xmm2
+  movdqa      xmm0,xmm13
+  movdqa      xmm2,xmm11
+  pandn       xmm0,xmm11
+  paddw       xmm2,xmm12
+  por         xmm1,xmm0
+  packuswb    xmm9,xmm1
+  movdqa      xmm0,xmm7
+  movdqa      xmm7,[rsp+0A0h]
+  pandn       xmm0,[rsp+0F0h]
+  movdqa      xmm1,xmm3
+  por         xmm6,xmm0
+  movdqa      xmm0,[rsp+10h]
+  paddw       xmm0,xmm4
+  paddw       xmm2,xmm0
+  paddw       xmm2,xmm7
+  movdqa      xmm0,xmm3
+  pandn       xmm0,xmm12
+  psraw       xmm2,2
+  pand        xmm1,xmm2
+  por         xmm1,xmm0
+  movdqa      xmm2,xmm13
+  movdqa      xmm0,xmm13
+  pand        xmm2,xmm1
+  pandn       xmm0,xmm12
+  movdqa      xmm1,xmm12
+  paddw       xmm1,[rsp+10h]
+  por         xmm2,xmm0
+  movdqa      xmm0,xmm15
+  pandn       xmm0,[rsp+0B0h]
+  paddw       xmm1,xmm4
+  packuswb    xmm6,xmm2
+  movdqa      xmm2,xmm3
+  psllw       xmm1,1
+  por         xmm5,xmm0
+  movdqa      xmm0,[rsp+80h]
+  paddw       xmm0,xmm10
+  paddw       xmm1,xmm0
+  paddw       xmm11,xmm1
+  psraw       xmm11,3
+  movdqa      xmm1,xmm12
+  pand        xmm2,xmm11
+  paddw       xmm1,xmm12
+  movdqa      xmm11,[rsp+80h]
+  movdqa      xmm0, [rsp+10h]
+  por         xmm14,[rsp+0E0h]
+  paddw       xmm0,xmm11
+  movdqa      xmm4,xmm15
+  paddw       xmm1,xmm0
+  movdqa      xmm0,xmm13
+  paddw       xmm1,xmm7
+  psraw       xmm1,2
+  pandn       xmm3,xmm1
+  por         xmm2,xmm3
+  movdqa      xmm1,xmm13
+  movdqa      xmm3,[rsp+10h]
+  pandn       xmm0,xmm3
+  pand        xmm1,xmm2
+  movdqa      xmm2,xmm11
+  paddw       xmm2,[rsp]
+  por         xmm1,xmm0
+  movdqa      xmm0,[rsp+0D0h]
+  por         xmm0,xmm8
+  paddw       xmm2,xmm3
+  packuswb    xmm5,xmm1
+  movdqa      xmm8,[rsp+40h]
+  movdqa      xmm1,[rsp+50h]
+  movdqa      xmm3,xmm8
+  pand        xmm4,xmm0
+  psllw       xmm2,1
+  movdqa      xmm0,xmm15
+  pandn       xmm0,[rsp+90h]
+  por         xmm4,xmm0
+  movdqa      xmm0,xmm12
+  paddw       xmm0,xmm10
+  paddw       xmm2,xmm0
+  paddw       xmm1,xmm2
+  movdqa      xmm0,[rsp]
+  movdqa      xmm2,xmm11
+  paddw       xmm0,xmm12
+  movdqa      xmm12,[rsp]
+  paddw       xmm2,xmm11
+  paddw       xmm2,xmm0
+  psraw       xmm1,3
+  movdqa      xmm0,xmm8
+  pand        xmm3,xmm1
+  paddw       xmm2,xmm7
+  movdqa      xmm1,xmm13
+  psraw       xmm2,2
+  pandn       xmm0,xmm2
+  por         xmm3,xmm0
+  movdqa      xmm2,[rsp+50h]
+  movdqa      xmm0,xmm13
+  pandn       xmm0,xmm12
+  pand        xmm1,xmm3
+  paddw       xmm2,xmm11
+  movdqa      xmm3,xmm15
+  por         xmm1,xmm0
+  pand        xmm3,xmm14
+  movdqa      xmm14,[rsp+10h]
+  movdqa      xmm0,xmm15
+  pandn       xmm0,[rsp+30h]
+  packuswb    xmm4,xmm1
+  movdqa      xmm1,xmm8
+  por         xmm3,xmm0
+  movdqa      xmm0,xmm12
+  paddw       xmm0,xmm14
+  paddw       xmm2,xmm0
+  paddw       xmm2,xmm7
+  movdqa      xmm0,xmm8
+  pandn       xmm0,xmm11
+  psraw       xmm2,2
+  pand        xmm1,xmm2
+  por         xmm1,xmm0
+  movdqa      xmm2,xmm13
+  movdqa      xmm0,xmm13
+  pandn       xmm0,xmm11
+  pand        xmm2,xmm1
+  movdqa      xmm1,xmm15
+  por         xmm2,xmm0
+  packuswb    xmm3,xmm2
+  movdqa      xmm0,[rsp+100h]
+  por         xmm0,[rsp+120h]
+  pand        xmm1,xmm0
+  movdqa      xmm2,[rcx+rbp]
+  movdqa      xmm7,[rsp+50h]
+  pandn       xmm15,[rsp+60h]
+  lea         r11,[rsp+1D8h]
+  pxor        xmm0,xmm0
+  por         xmm1,xmm15
+  movaps      xmm15,[r11-0A8h]
+  movdqa      [rdi],xmm9
+  movaps      xmm9,[r11-48h]
+  punpckhbw   xmm2,xmm0
+  psllw       xmm2,1
+  paddw       xmm2,xmm7
+  paddw       xmm2,xmm7
+  movdqa      [rbx],xmm6
+  movaps      xmm6,[r11-18h]
+  paddw       xmm2,xmm7
+  paddw       xmm2,xmm11
+  movaps      xmm11,[r11-68h]
+  paddw       xmm2,xmm12
+  movaps      xmm12,[r11-78h]
+  paddw       xmm2,xmm14
+  paddw       xmm2,xmm10
+  psraw       xmm2,3
+  movaps      xmm10,[r11-58h]
+  movaps      xmm14,[r11-98h]
+  movdqa      xmm0,xmm13
+  pand        xmm2,xmm8
+  pandn       xmm8,xmm7
+  pandn       xmm13,xmm7
+  por         xmm2,xmm8
+  movaps      xmm7,[r11-28h]
+  movaps      xmm8,[r11-38h]
+  movdqa      [r8],xmm5
+  pand        xmm0,xmm2
+  por         xmm0,xmm13
+  packuswb    xmm1,xmm0
+  movaps      xmm13,[r11-88h]
+  movdqa      [rbp],xmm4
+  movdqa      [r10+rbp],xmm3
+  movdqa      [rsi+rbp],xmm1
+  mov         rsp,r11
+  pop         rdi
+  pop         rsi
+  pop         rbp
+  pop         rbx
   ret
 
 
@@ -784,161 +784,161 @@
 
 ALIGN  16
 DeblockChromaLt4V_sse2:
-  mov         rax,rsp 
-  push        rbx  
-  push        rdi     
-  sub         rsp,0C8h 
+  mov         rax,rsp
+  push        rbx
+  push        rdi
+  sub         rsp,0C8h
   mov         r10,qword [rax + 30h]  ; pTC
-  pxor        xmm1,xmm1 
-  mov         rbx,rcx 
-  movsxd      r11,r8d 
-  movsx       ecx,byte [r10] 
-  movsx       r8d,byte [r10+2] 
-  mov         rdi,rdx 
-  movq        xmm2,[rbx] 
-  movq        xmm9,[r11+rbx] 
-  movsx       edx,byte [r10+1] 
-  mov         word [rsp+2],cx 
-  mov         word [rsp],cx 
-  movsx       eax,byte [r10+3] 
-  mov         word [rsp+6],dx 
-  mov         word [rsp+4],dx 
-  movdqa      xmm11,xmm1 
-  mov         word [rsp+0Eh],ax 
-  mov         word [rsp+0Ch],ax 
-  lea         eax,[r11+r11] 
-  movsxd      rcx,eax 
-  mov         rax,rbx 
-  mov         rdx,rdi 
-  sub         rax,rcx 
-  mov         word [rsp+0Ah],r8w 
-  mov         word [rsp+8],r8w 
-  movdqa      xmm6,[rsp] 
-  movdqa      xmm7,xmm6 
-  movq        xmm13, [rax] 
-  mov         rax,rdi 
-  sub         rax,rcx 
-  mov         rcx,rbx 
-  pcmpgtw     xmm7,xmm1 
-  psubw       xmm11,xmm6 
-  sub         rcx,r11 
-  sub         rdx,r11 
-  movq        xmm0,[rax] 
-  movsx       eax,r9w 
-  movq        xmm15,[rcx] 
-  punpcklqdq  xmm13,xmm0 
-  movq        xmm0, [rdx] 
-  movdqa      xmm4,xmm13 
-  punpcklqdq  xmm15,xmm0 
-  movq        xmm0, [rdi] 
-  punpcklbw   xmm4,xmm1 
-  movdqa      xmm12,xmm15 
-  punpcklqdq  xmm2,xmm0 
-  movq        xmm0, [r11+rdi] 
-  punpcklbw   xmm12,xmm1 
-  movdqa      xmm14,xmm2 
-  punpcklqdq  xmm9,xmm0 
-  punpckhbw   xmm2,xmm1 
-  punpcklbw   xmm14,xmm1 
-  movd        xmm0,eax 
+  pxor        xmm1,xmm1
+  mov         rbx,rcx
+  movsxd      r11,r8d
+  movsx       ecx,byte [r10]
+  movsx       r8d,byte [r10+2]
+  mov         rdi,rdx
+  movq        xmm2,[rbx]
+  movq        xmm9,[r11+rbx]
+  movsx       edx,byte [r10+1]
+  mov         word [rsp+2],cx
+  mov         word [rsp],cx
+  movsx       eax,byte [r10+3]
+  mov         word [rsp+6],dx
+  mov         word [rsp+4],dx
+  movdqa      xmm11,xmm1
+  mov         word [rsp+0Eh],ax
+  mov         word [rsp+0Ch],ax
+  lea         eax,[r11+r11]
+  movsxd      rcx,eax
+  mov         rax,rbx
+  mov         rdx,rdi
+  sub         rax,rcx
+  mov         word [rsp+0Ah],r8w
+  mov         word [rsp+8],r8w
+  movdqa      xmm6,[rsp]
+  movdqa      xmm7,xmm6
+  movq        xmm13, [rax]
+  mov         rax,rdi
+  sub         rax,rcx
+  mov         rcx,rbx
+  pcmpgtw     xmm7,xmm1
+  psubw       xmm11,xmm6
+  sub         rcx,r11
+  sub         rdx,r11
+  movq        xmm0,[rax]
+  movsx       eax,r9w
+  movq        xmm15,[rcx]
+  punpcklqdq  xmm13,xmm0
+  movq        xmm0, [rdx]
+  movdqa      xmm4,xmm13
+  punpcklqdq  xmm15,xmm0
+  movq        xmm0, [rdi]
+  punpcklbw   xmm4,xmm1
+  movdqa      xmm12,xmm15
+  punpcklqdq  xmm2,xmm0
+  movq        xmm0, [r11+rdi]
+  punpcklbw   xmm12,xmm1
+  movdqa      xmm14,xmm2
+  punpcklqdq  xmm9,xmm0
+  punpckhbw   xmm2,xmm1
+  punpcklbw   xmm14,xmm1
+  movd        xmm0,eax
   movsx       eax,word [rsp + 0C8h + 38h] ; iBeta
-  punpckhbw   xmm13,xmm1 
-  punpckhbw   xmm15,xmm1 
-  movdqa      xmm3,xmm9 
-  movdqa      [rsp+10h],xmm2 
-  punpcklwd   xmm0,xmm0 
-  punpckhbw   xmm9,xmm1 
-  punpcklbw   xmm3,xmm1 
-  movdqa      xmm1,xmm14 
-  pshufd      xmm10,xmm0,0 
-  movd        xmm0,eax 
-  mov         eax,4 
-  cwde             
-  punpcklwd   xmm0,xmm0 
-  pshufd      xmm8,xmm0,0 
-  movd        xmm0,eax 
-  punpcklwd   xmm0,xmm0 
-  pshufd      xmm5,xmm0,0 
-  psubw       xmm1,xmm12 
-  movdqa      xmm2,xmm10 
-  lea         r11,[rsp+0C8h] 
-  psllw       xmm1,2 
-  movdqa      xmm0,xmm4 
-  psubw       xmm4,xmm12 
-  psubw       xmm0,xmm3 
-  psubw       xmm3,xmm14 
-  paddw       xmm1,xmm0 
-  paddw       xmm1,xmm5 
-  movdqa      xmm0,xmm11 
-  psraw       xmm1,3 
-  pmaxsw      xmm0,xmm1 
-  pminsw      xmm6,xmm0 
-  movdqa      xmm1,xmm8 
-  movdqa      xmm0,xmm12 
-  psubw       xmm0,xmm14 
-  pabsw       xmm0,xmm0 
-  pcmpgtw     xmm2,xmm0 
-  pabsw       xmm0,xmm4 
-  pcmpgtw     xmm1,xmm0 
-  pabsw       xmm0,xmm3 
-  movdqa      xmm3,[rsp] 
-  pand        xmm2,xmm1 
-  movdqa      xmm1,xmm8 
-  pcmpgtw     xmm1,xmm0 
-  movdqa      xmm0,xmm13 
-  pand        xmm2,xmm1 
-  psubw       xmm0,xmm9 
-  psubw       xmm13,xmm15 
-  pand        xmm2,xmm7 
-  pand        xmm6,xmm2 
-  paddw       xmm12,xmm6 
-  psubw       xmm14,xmm6 
-  movdqa      xmm2,[rsp+10h] 
-  movaps      xmm6,[r11-18h] 
-  movdqa      xmm1,xmm2 
-  psubw       xmm1,xmm15 
-  psubw       xmm9,xmm2 
-  psllw       xmm1,2 
-  paddw       xmm1,xmm0 
-  paddw       xmm1,xmm5 
-  movdqa      xmm0,xmm15 
-  psubw       xmm0,xmm2 
-  psraw       xmm1,3 
-  pmaxsw      xmm11,xmm1 
-  pabsw       xmm0,xmm0 
-  movdqa      xmm1,xmm8 
-  pcmpgtw     xmm10,xmm0 
-  pabsw       xmm0,xmm13 
-  pminsw      xmm3,xmm11 
-  movaps      xmm11,[r11-68h] 
-  movaps      xmm13,[rsp+40h] 
-  pcmpgtw     xmm1,xmm0 
-  pabsw       xmm0,xmm9 
-  movaps      xmm9, [r11-48h] 
-  pand        xmm10,xmm1 
-  pcmpgtw     xmm8,xmm0 
-  pand        xmm10,xmm8 
-  pand        xmm10,xmm7 
-  movaps      xmm8,[r11-38h] 
-  movaps      xmm7,[r11-28h] 
-  pand        xmm3,xmm10 
-  paddw       xmm15,xmm3 
-  psubw       xmm2,xmm3 
-  movaps      xmm10,[r11-58h] 
-  packuswb    xmm12,xmm15 
-  movaps      xmm15,[rsp+20h] 
-  packuswb    xmm14,xmm2 
-  movq        [rcx],xmm12 
-  movq        [rbx],xmm14 
-  psrldq      xmm12,8 
-  psrldq      xmm14,8 
-  movq        [rdx],xmm12 
-  movaps      xmm12,[r11-78h] 
-  movq        [rdi],xmm14 
-  movaps      xmm14,[rsp+30h] 
-  mov         rsp,r11 
-  pop         rdi  
-  pop         rbx  
+  punpckhbw   xmm13,xmm1
+  punpckhbw   xmm15,xmm1
+  movdqa      xmm3,xmm9
+  movdqa      [rsp+10h],xmm2
+  punpcklwd   xmm0,xmm0
+  punpckhbw   xmm9,xmm1
+  punpcklbw   xmm3,xmm1
+  movdqa      xmm1,xmm14
+  pshufd      xmm10,xmm0,0
+  movd        xmm0,eax
+  mov         eax,4
+  cwde
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm8,xmm0,0
+  movd        xmm0,eax
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm5,xmm0,0
+  psubw       xmm1,xmm12
+  movdqa      xmm2,xmm10
+  lea         r11,[rsp+0C8h]
+  psllw       xmm1,2
+  movdqa      xmm0,xmm4
+  psubw       xmm4,xmm12
+  psubw       xmm0,xmm3
+  psubw       xmm3,xmm14
+  paddw       xmm1,xmm0
+  paddw       xmm1,xmm5
+  movdqa      xmm0,xmm11
+  psraw       xmm1,3
+  pmaxsw      xmm0,xmm1
+  pminsw      xmm6,xmm0
+  movdqa      xmm1,xmm8
+  movdqa      xmm0,xmm12
+  psubw       xmm0,xmm14
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm2,xmm0
+  pabsw       xmm0,xmm4
+  pcmpgtw     xmm1,xmm0
+  pabsw       xmm0,xmm3
+  movdqa      xmm3,[rsp]
+  pand        xmm2,xmm1
+  movdqa      xmm1,xmm8
+  pcmpgtw     xmm1,xmm0
+  movdqa      xmm0,xmm13
+  pand        xmm2,xmm1
+  psubw       xmm0,xmm9
+  psubw       xmm13,xmm15
+  pand        xmm2,xmm7
+  pand        xmm6,xmm2
+  paddw       xmm12,xmm6
+  psubw       xmm14,xmm6
+  movdqa      xmm2,[rsp+10h]
+  movaps      xmm6,[r11-18h]
+  movdqa      xmm1,xmm2
+  psubw       xmm1,xmm15
+  psubw       xmm9,xmm2
+  psllw       xmm1,2
+  paddw       xmm1,xmm0
+  paddw       xmm1,xmm5
+  movdqa      xmm0,xmm15
+  psubw       xmm0,xmm2
+  psraw       xmm1,3
+  pmaxsw      xmm11,xmm1
+  pabsw       xmm0,xmm0
+  movdqa      xmm1,xmm8
+  pcmpgtw     xmm10,xmm0
+  pabsw       xmm0,xmm13
+  pminsw      xmm3,xmm11
+  movaps      xmm11,[r11-68h]
+  movaps      xmm13,[rsp+40h]
+  pcmpgtw     xmm1,xmm0
+  pabsw       xmm0,xmm9
+  movaps      xmm9, [r11-48h]
+  pand        xmm10,xmm1
+  pcmpgtw     xmm8,xmm0
+  pand        xmm10,xmm8
+  pand        xmm10,xmm7
+  movaps      xmm8,[r11-38h]
+  movaps      xmm7,[r11-28h]
+  pand        xmm3,xmm10
+  paddw       xmm15,xmm3
+  psubw       xmm2,xmm3
+  movaps      xmm10,[r11-58h]
+  packuswb    xmm12,xmm15
+  movaps      xmm15,[rsp+20h]
+  packuswb    xmm14,xmm2
+  movq        [rcx],xmm12
+  movq        [rbx],xmm14
+  psrldq      xmm12,8
+  psrldq      xmm14,8
+  movq        [rdx],xmm12
+  movaps      xmm12,[r11-78h]
+  movq        [rdi],xmm14
+  movaps      xmm14,[rsp+30h]
+  mov         rsp,r11
+  pop         rdi
+  pop         rbx
   ret
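
Like the rest of this patch, the hunk above only strips trailing whitespace; the logic of DeblockChromaLt4V_sse2 is untouched. What the SSE2 sequence implements, the psllw-by-2 / add-4 / psraw-3 chain clamped by the pmaxsw/pminsw pair against the word-duplicated pTC table built at [rsp], is the standard H.264 bS<4 chroma edge filter. A minimal scalar sketch for one plane, with illustrative names (the real routine packs the Cb and Cr rows into a single 8-lane register pass, so this is not the project's C code):

#include <stdint.h>
#include <stdlib.h>

/* Hedged scalar sketch of the bS<4 chroma filter. */
static uint8_t Clip255 (int32_t v) {
  return (uint8_t) (v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void DeblockChromaLt4V_scalar (uint8_t* pPix, int32_t iStride,
                                      int32_t iAlpha, int32_t iBeta,
                                      const int8_t* pTc) {
  for (int32_t i = 0; i < 8; i++) {
    int32_t iTc = pTc[i >> 1];                    /* one tc per 2 columns  */
    if (iTc <= 0) continue;                       /* the pcmpgtw tc,0 gate */
    int32_t p1 = pPix[i - 2 * iStride], p0 = pPix[i - iStride];
    int32_t q0 = pPix[i],               q1 = pPix[i + iStride];
    if (abs (p0 - q0) >= iAlpha || abs (p1 - p0) >= iBeta ||
        abs (q1 - q0) >= iBeta)
      continue;                                   /* the alpha/beta masks  */
    int32_t d = ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3;
    if (d < -iTc) d = -iTc; else if (d > iTc) d = iTc;
    pPix[i - iStride] = Clip255 (p0 + d);         /* p0' */
    pPix[i]           = Clip255 (q0 - d);         /* q0' */
  }
}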
 
 
@@ -945,151 +945,151 @@
 WELS_EXTERN   DeblockChromaEq4V_sse2
 ALIGN 16
 DeblockChromaEq4V_sse2:
-  mov         rax,rsp 
-  push        rbx  
-  sub         rsp,90h 
-  pxor        xmm1,xmm1 
-  mov         r11,rcx 
-  mov         rbx,rdx 
-  mov         r10d,r9d   
-  movq        xmm13,[r11] 
-  lea         eax,[r8+r8] 
-  movsxd      r9,eax 
-  mov         rax,rcx 
-  sub         rax,r9 
-  movq        xmm14,[rax] 
-  mov         rax,rdx 
-  sub         rax,r9 
-  movq        xmm0,[rax] 
-  movsxd      rax,r8d 
-  sub         rcx,rax 
-  sub         rdx,rax 
-  movq        xmm12,[rax+r11] 
-  movq        xmm10,[rcx] 
-  punpcklqdq  xmm14,xmm0 
-  movdqa      xmm8,xmm14 
-  movq        xmm0,[rdx] 
-  punpcklbw   xmm8,xmm1 
-  punpckhbw   xmm14,xmm1 
-  punpcklqdq  xmm10,xmm0 
-  movq        xmm0,[rbx] 
-  movdqa      xmm5,xmm10 
-  punpcklqdq  xmm13,xmm0 
-  movq        xmm0, [rax+rbx] 
-  punpcklbw   xmm5,xmm1 
-  movsx       eax,r10w 
-  movdqa      xmm9,xmm13 
-  punpcklqdq  xmm12,xmm0 
-  punpcklbw   xmm9,xmm1 
-  punpckhbw   xmm10,xmm1 
-  movd        xmm0,eax 
+  mov         rax,rsp
+  push        rbx
+  sub         rsp,90h
+  pxor        xmm1,xmm1
+  mov         r11,rcx
+  mov         rbx,rdx
+  mov         r10d,r9d
+  movq        xmm13,[r11]
+  lea         eax,[r8+r8]
+  movsxd      r9,eax
+  mov         rax,rcx
+  sub         rax,r9
+  movq        xmm14,[rax]
+  mov         rax,rdx
+  sub         rax,r9
+  movq        xmm0,[rax]
+  movsxd      rax,r8d
+  sub         rcx,rax
+  sub         rdx,rax
+  movq        xmm12,[rax+r11]
+  movq        xmm10,[rcx]
+  punpcklqdq  xmm14,xmm0
+  movdqa      xmm8,xmm14
+  movq        xmm0,[rdx]
+  punpcklbw   xmm8,xmm1
+  punpckhbw   xmm14,xmm1
+  punpcklqdq  xmm10,xmm0
+  movq        xmm0,[rbx]
+  movdqa      xmm5,xmm10
+  punpcklqdq  xmm13,xmm0
+  movq        xmm0, [rax+rbx]
+  punpcklbw   xmm5,xmm1
+  movsx       eax,r10w
+  movdqa      xmm9,xmm13
+  punpcklqdq  xmm12,xmm0
+  punpcklbw   xmm9,xmm1
+  punpckhbw   xmm10,xmm1
+  movd        xmm0,eax
   movsx       eax,word [rsp + 90h + 8h + 28h]   ; iBeta
-  punpckhbw   xmm13,xmm1 
-  movdqa      xmm7,xmm12 
-  punpcklwd   xmm0,xmm0 
-  punpckhbw   xmm12,xmm1 
-  pshufd      xmm11,xmm0,0 
-  punpcklbw   xmm7,xmm1 
-  movd        xmm0,eax 
-  movdqa      xmm1,xmm8 
-  psubw       xmm1,xmm5 
-  punpcklwd   xmm0,xmm0 
-  movdqa      xmm6,xmm11 
-  pshufd      xmm3,xmm0,0 
-  movdqa      xmm0,xmm5 
-  psubw       xmm0,xmm9 
-  movdqa      xmm2,xmm3 
-  pabsw       xmm0,xmm0 
-  pcmpgtw     xmm6,xmm0 
-  pabsw       xmm0,xmm1 
-  movdqa      xmm1,xmm3 
-  pcmpgtw     xmm2,xmm0 
-  pand        xmm6,xmm2 
-  movdqa      xmm0,xmm7 
-  movdqa      xmm2,xmm3 
-  psubw       xmm0,xmm9 
-  pabsw       xmm0,xmm0 
-  pcmpgtw     xmm1,xmm0 
-  pand        xmm6,xmm1 
-  movdqa      xmm0,xmm10 
-  movdqa      xmm1,xmm14 
-  psubw       xmm0,xmm13 
-  psubw       xmm1,xmm10 
-  pabsw       xmm0,xmm0 
-  pcmpgtw     xmm11,xmm0 
-  pabsw       xmm0,xmm1 
-  pcmpgtw     xmm2,xmm0 
-  pand        xmm11,xmm2 
-  movdqa      xmm0,xmm12 
-  movdqa      xmm4,xmm6 
-  movdqa      xmm1,xmm8 
-  mov         eax,2 
-  cwde             
-  paddw       xmm1,xmm8 
-  psubw       xmm0,xmm13 
-  paddw       xmm1,xmm5 
-  pabsw       xmm0,xmm0 
-  movdqa      xmm2,xmm14 
-  paddw       xmm1,xmm7 
-  pcmpgtw     xmm3,xmm0 
-  paddw       xmm2,xmm14 
-  movd        xmm0,eax 
-  pand        xmm11,xmm3 
-  paddw       xmm7,xmm7 
-  paddw       xmm2,xmm10 
-  punpcklwd   xmm0,xmm0 
-  paddw       xmm2,xmm12 
-  paddw       xmm12,xmm12 
-  pshufd      xmm3,xmm0,0 
-  paddw       xmm7,xmm9 
-  paddw       xmm12,xmm13 
-  movdqa      xmm0,xmm6 
-  paddw       xmm1,xmm3 
-  pandn       xmm0,xmm5 
-  paddw       xmm7,xmm8 
-  psraw       xmm1,2 
-  paddw       xmm12,xmm14 
-  paddw       xmm7,xmm3 
-  movaps      xmm14,[rsp] 
-  pand        xmm4,xmm1 
-  paddw       xmm12,xmm3 
-  psraw       xmm7,2 
-  movdqa      xmm1,xmm11 
-  por         xmm4,xmm0 
-  psraw       xmm12,2 
-  paddw       xmm2,xmm3 
-  movdqa      xmm0,xmm11 
-  pandn       xmm0,xmm10 
-  psraw       xmm2,2 
-  pand        xmm1,xmm2 
-  por         xmm1,xmm0 
-  packuswb    xmm4,xmm1 
-  movdqa      xmm0,xmm11 
-  movdqa      xmm1,xmm6 
-  pand        xmm1,xmm7 
-  movaps      xmm7,[rsp+70h] 
-  movq        [rcx],xmm4 
-  pandn       xmm6,xmm9 
-  pandn       xmm11,xmm13 
-  pand        xmm0,xmm12 
-  por         xmm1,xmm6 
-  por         xmm0,xmm11 
-  psrldq      xmm4,8 
-  packuswb    xmm1,xmm0 
-  movq        [r11],xmm1 
-  psrldq      xmm1,8 
-  movq        [rdx],xmm4 
-  lea         r11,[rsp+90h] 
-  movaps      xmm6,[r11-10h] 
-  movaps      xmm8,[r11-30h] 
-  movaps      xmm9,[r11-40h] 
-  movq        [rbx],xmm1 
-  movaps      xmm10,[r11-50h] 
-  movaps      xmm11,[r11-60h] 
-  movaps      xmm12,[r11-70h] 
-  movaps      xmm13,[r11-80h] 
-  mov         rsp,r11 
-  pop         rbx  
+  punpckhbw   xmm13,xmm1
+  movdqa      xmm7,xmm12
+  punpcklwd   xmm0,xmm0
+  punpckhbw   xmm12,xmm1
+  pshufd      xmm11,xmm0,0
+  punpcklbw   xmm7,xmm1
+  movd        xmm0,eax
+  movdqa      xmm1,xmm8
+  psubw       xmm1,xmm5
+  punpcklwd   xmm0,xmm0
+  movdqa      xmm6,xmm11
+  pshufd      xmm3,xmm0,0
+  movdqa      xmm0,xmm5
+  psubw       xmm0,xmm9
+  movdqa      xmm2,xmm3
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm6,xmm0
+  pabsw       xmm0,xmm1
+  movdqa      xmm1,xmm3
+  pcmpgtw     xmm2,xmm0
+  pand        xmm6,xmm2
+  movdqa      xmm0,xmm7
+  movdqa      xmm2,xmm3
+  psubw       xmm0,xmm9
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm1,xmm0
+  pand        xmm6,xmm1
+  movdqa      xmm0,xmm10
+  movdqa      xmm1,xmm14
+  psubw       xmm0,xmm13
+  psubw       xmm1,xmm10
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm11,xmm0
+  pabsw       xmm0,xmm1
+  pcmpgtw     xmm2,xmm0
+  pand        xmm11,xmm2
+  movdqa      xmm0,xmm12
+  movdqa      xmm4,xmm6
+  movdqa      xmm1,xmm8
+  mov         eax,2
+  cwde
+  paddw       xmm1,xmm8
+  psubw       xmm0,xmm13
+  paddw       xmm1,xmm5
+  pabsw       xmm0,xmm0
+  movdqa      xmm2,xmm14
+  paddw       xmm1,xmm7
+  pcmpgtw     xmm3,xmm0
+  paddw       xmm2,xmm14
+  movd        xmm0,eax
+  pand        xmm11,xmm3
+  paddw       xmm7,xmm7
+  paddw       xmm2,xmm10
+  punpcklwd   xmm0,xmm0
+  paddw       xmm2,xmm12
+  paddw       xmm12,xmm12
+  pshufd      xmm3,xmm0,0
+  paddw       xmm7,xmm9
+  paddw       xmm12,xmm13
+  movdqa      xmm0,xmm6
+  paddw       xmm1,xmm3
+  pandn       xmm0,xmm5
+  paddw       xmm7,xmm8
+  psraw       xmm1,2
+  paddw       xmm12,xmm14
+  paddw       xmm7,xmm3
+  movaps      xmm14,[rsp]
+  pand        xmm4,xmm1
+  paddw       xmm12,xmm3
+  psraw       xmm7,2
+  movdqa      xmm1,xmm11
+  por         xmm4,xmm0
+  psraw       xmm12,2
+  paddw       xmm2,xmm3
+  movdqa      xmm0,xmm11
+  pandn       xmm0,xmm10
+  psraw       xmm2,2
+  pand        xmm1,xmm2
+  por         xmm1,xmm0
+  packuswb    xmm4,xmm1
+  movdqa      xmm0,xmm11
+  movdqa      xmm1,xmm6
+  pand        xmm1,xmm7
+  movaps      xmm7,[rsp+70h]
+  movq        [rcx],xmm4
+  pandn       xmm6,xmm9
+  pandn       xmm11,xmm13
+  pand        xmm0,xmm12
+  por         xmm1,xmm6
+  por         xmm0,xmm11
+  psrldq      xmm4,8
+  packuswb    xmm1,xmm0
+  movq        [r11],xmm1
+  psrldq      xmm1,8
+  movq        [rdx],xmm4
+  lea         r11,[rsp+90h]
+  movaps      xmm6,[r11-10h]
+  movaps      xmm8,[r11-30h]
+  movaps      xmm9,[r11-40h]
+  movq        [rbx],xmm1
+  movaps      xmm10,[r11-50h]
+  movaps      xmm11,[r11-60h]
+  movaps      xmm12,[r11-70h]
+  movaps      xmm13,[r11-80h]
+  mov         rsp,r11
+  pop         rbx
   ret
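
The DeblockChromaEq4V_sse2 hunk above is whitespace-only as well. Its mov eax,2 broadcast and the paddw chains followed by psraw 2 compute the bS==4 chroma smoothing, with the pand/pandn/por triples selecting between filtered and original samples under the alpha/beta masks. A scalar sketch under the same assumptions as the sketch above:

#include <stdint.h>
#include <stdlib.h>

/* Hedged scalar sketch of the bS==4 chroma filter; illustrative names. */
static void DeblockChromaEq4V_scalar (uint8_t* pPix, int32_t iStride,
                                      int32_t iAlpha, int32_t iBeta) {
  for (int32_t i = 0; i < 8; i++) {
    int32_t p1 = pPix[i - 2 * iStride], p0 = pPix[i - iStride];
    int32_t q0 = pPix[i],               q1 = pPix[i + iStride];
    if (abs (p0 - q0) < iAlpha && abs (p1 - p0) < iBeta &&
        abs (q1 - q0) < iBeta) {
      pPix[i - iStride] = (uint8_t) ((2 * p1 + p0 + q1 + 2) >> 2);  /* p0' */
      pPix[i]           = (uint8_t) ((2 * q1 + q0 + p1 + 2) >> 2);  /* q0' */
    }
  }
}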
 
 
@@ -1099,263 +1099,263 @@
 WELS_EXTERN   DeblockChromaEq4H_sse2
 ALIGN  16
 DeblockChromaEq4H_sse2:
-  mov         rax,rsp 
-  mov         [rax+20h],rbx 
-  push        rdi  
-  sub         rsp,140h    
-  mov         rdi,rdx 
-  lea         eax,[r8*4] 
-  movsxd      r10,eax 
-  mov         eax,[rcx-2] 
-  mov         [rsp+10h],eax 
-  lea         rbx,[r10+rdx-2] 
-  lea         r11,[r10+rcx-2] 
-  movdqa      xmm5,[rsp+10h] 
-  movsxd      r10,r8d 
-  mov         eax,[r10+rcx-2] 
-  lea         rdx,[r10+r10*2] 
-  mov         [rsp+20h],eax 
-  mov         eax,[rcx+r10*2-2] 
-  mov         [rsp+30h],eax 
-  mov         eax,[rdx+rcx-2] 
-  movdqa      xmm2,[rsp+20h] 
-  mov         [rsp+40h],eax 
-  mov         eax, [rdi-2] 
-  movdqa      xmm4,[rsp+30h] 
-  mov         [rsp+50h],eax 
-  mov         eax,[r10+rdi-2] 
-  movdqa      xmm3,[rsp+40h] 
-  mov         [rsp+60h],eax 
-  mov         eax,[rdi+r10*2-2] 
-  punpckldq   xmm5,[rsp+50h] 
-  mov         [rsp+70h],eax 
-  mov         eax, [rdx+rdi-2] 
-  punpckldq   xmm2, [rsp+60h] 
-  mov          [rsp+80h],eax 
-  mov         eax,[r11] 
-  punpckldq   xmm4, [rsp+70h] 
-  mov         [rsp+50h],eax 
-  mov         eax,[rbx] 
-  punpckldq   xmm3,[rsp+80h] 
-  mov         [rsp+60h],eax 
-  mov         eax,[r10+r11] 
-  movdqa      xmm0, [rsp+50h] 
-  punpckldq   xmm0, [rsp+60h] 
-  punpcklqdq  xmm5,xmm0 
-  movdqa      [rsp+50h],xmm0 
-  mov         [rsp+50h],eax 
-  mov         eax,[r10+rbx] 
-  movdqa      xmm0,[rsp+50h] 
-  movdqa      xmm1,xmm5 
-  mov         [rsp+60h],eax 
-  mov         eax,[r11+r10*2] 
-  punpckldq   xmm0, [rsp+60h] 
-  punpcklqdq  xmm2,xmm0 
-  punpcklbw   xmm1,xmm2 
-  punpckhbw   xmm5,xmm2 
-  movdqa      [rsp+50h],xmm0 
-  mov         [rsp+50h],eax 
-  mov         eax,[rbx+r10*2] 
-  movdqa      xmm0,[rsp+50h] 
-  mov         [rsp+60h],eax 
-  mov         eax, [rdx+r11] 
-  movdqa      xmm15,xmm1 
-  punpckldq   xmm0,[rsp+60h] 
-  punpcklqdq  xmm4,xmm0 
-  movdqa      [rsp+50h],xmm0 
-  mov         [rsp+50h],eax 
-  mov         eax, [rdx+rbx] 
-  movdqa      xmm0,[rsp+50h] 
-  mov         [rsp+60h],eax 
-  punpckldq   xmm0, [rsp+60h] 
-  punpcklqdq  xmm3,xmm0 
-  movdqa      xmm0,xmm4 
-  punpcklbw   xmm0,xmm3 
-  punpckhbw   xmm4,xmm3 
-  punpcklwd   xmm15,xmm0 
-  punpckhwd   xmm1,xmm0 
-  movdqa      xmm0,xmm5 
-  movdqa      xmm12,xmm15 
-  punpcklwd   xmm0,xmm4 
-  punpckhwd   xmm5,xmm4 
-  punpckldq   xmm12,xmm0 
-  punpckhdq   xmm15,xmm0 
-  movdqa      xmm0,xmm1 
-  movdqa      xmm11,xmm12 
-  punpckldq   xmm0,xmm5 
-  punpckhdq   xmm1,xmm5 
-  punpcklqdq  xmm11,xmm0 
-  punpckhqdq  xmm12,xmm0 
-  movsx       eax,r9w 
-  movdqa      xmm14,xmm15 
-  punpcklqdq  xmm14,xmm1 
-  punpckhqdq  xmm15,xmm1 
-  pxor        xmm1,xmm1 
-  movd        xmm0,eax 
-  movdqa      xmm4,xmm12 
-  movdqa      xmm8,xmm11 
-  movsx       eax,word [rsp+170h] ; iBeta
-  punpcklwd   xmm0,xmm0 
-  punpcklbw   xmm4,xmm1 
-  punpckhbw   xmm12,xmm1 
-  movdqa      xmm9,xmm14 
-  movdqa      xmm7,xmm15 
-  movdqa      xmm10,xmm15 
-  pshufd      xmm13,xmm0,0 
-  punpcklbw   xmm9,xmm1 
-  punpckhbw   xmm14,xmm1 
-  movdqa      xmm6,xmm13 
-  movd        xmm0,eax 
-  movdqa      [rsp],xmm11 
-  mov         eax,2 
-  cwde             
-  punpckhbw   xmm11,xmm1 
-  punpckhbw   xmm10,xmm1 
-  punpcklbw   xmm7,xmm1 
-  punpcklwd   xmm0,xmm0 
-  punpcklbw   xmm8,xmm1 
-  pshufd      xmm3,xmm0,0 
-  movdqa      xmm1,xmm8 
-  movdqa      xmm0,xmm4 
-  psubw       xmm0,xmm9 
-  psubw       xmm1,xmm4 
-  movdqa      xmm2,xmm3 
-  pabsw       xmm0,xmm0 
-  pcmpgtw     xmm6,xmm0 
-  pabsw       xmm0,xmm1 
-  movdqa      xmm1,xmm3 
-  pcmpgtw     xmm2,xmm0 
-  pand        xmm6,xmm2 
-  movdqa      xmm0,xmm7 
-  movdqa      xmm2,xmm3 
-  psubw       xmm0,xmm9 
-  pabsw       xmm0,xmm0 
-  pcmpgtw     xmm1,xmm0 
-  pand        xmm6,xmm1 
-  movdqa      xmm0,xmm12 
-  movdqa      xmm1,xmm11 
-  psubw       xmm0,xmm14 
-  psubw       xmm1,xmm12 
-  movdqa      xmm5,xmm6 
-  pabsw       xmm0,xmm0 
-  pcmpgtw     xmm13,xmm0 
-  pabsw       xmm0,xmm1 
-  movdqa      xmm1,xmm8 
-  pcmpgtw     xmm2,xmm0 
-  paddw       xmm1,xmm8 
-  movdqa      xmm0,xmm10 
-  pand        xmm13,xmm2 
-  psubw       xmm0,xmm14 
-  paddw       xmm1,xmm4 
-  movdqa      xmm2,xmm11 
-  pabsw       xmm0,xmm0 
-  paddw       xmm2,xmm11 
-  paddw       xmm1,xmm7 
-  pcmpgtw     xmm3,xmm0 
-  paddw       xmm2,xmm12 
-  movd        xmm0,eax 
-  pand        xmm13,xmm3 
-  paddw       xmm2,xmm10 
-  punpcklwd   xmm0,xmm0 
-  pshufd      xmm3,xmm0,0 
-  movdqa      xmm0,xmm6 
-  paddw       xmm1,xmm3 
-  pandn       xmm0,xmm4 
-  paddw       xmm2,xmm3 
-  psraw       xmm1,2 
-  pand        xmm5,xmm1 
-  por         xmm5,xmm0 
-  paddw       xmm7,xmm7 
-  paddw       xmm10,xmm10 
-  psraw       xmm2,2 
-  movdqa      xmm1,xmm13 
-  movdqa      xmm0,xmm13 
-  pandn       xmm0,xmm12 
-  pand        xmm1,xmm2 
-  paddw       xmm7,xmm9 
-  por         xmm1,xmm0 
-  paddw       xmm10,xmm14 
-  paddw       xmm7,xmm8 
-  movdqa      xmm0,xmm13 
-  packuswb    xmm5,xmm1 
-  paddw       xmm7,xmm3 
-  paddw       xmm10,xmm11 
-  movdqa      xmm1,xmm6 
-  paddw       xmm10,xmm3 
-  pandn       xmm6,xmm9 
-  psraw       xmm7,2 
-  pand        xmm1,xmm7 
-  psraw       xmm10,2 
-  pandn       xmm13,xmm14 
-  pand        xmm0,xmm10 
-  por         xmm1,xmm6 
-  movdqa      xmm6,[rsp] 
-  movdqa      xmm4,xmm6 
-  por         xmm0,xmm13 
-  punpcklbw   xmm4,xmm5 
-  punpckhbw   xmm6,xmm5 
-  movdqa      xmm3,xmm4 
-  packuswb    xmm1,xmm0 
-  movdqa      xmm0,xmm1 
-  punpckhbw   xmm1,xmm15 
-  punpcklbw   xmm0,xmm15 
-  punpcklwd   xmm3,xmm0 
-  punpckhwd   xmm4,xmm0 
-  movdqa      xmm0,xmm6 
-  movdqa      xmm2,xmm3 
-  punpcklwd   xmm0,xmm1 
-  punpckhwd   xmm6,xmm1 
-  movdqa      xmm1,xmm4 
-  punpckldq   xmm2,xmm0 
-  punpckhdq   xmm3,xmm0 
-  punpckldq   xmm1,xmm6 
-  movdqa      xmm0,xmm2 
-  punpcklqdq  xmm0,xmm1 
-  punpckhdq   xmm4,xmm6 
-  punpckhqdq  xmm2,xmm1 
-  movdqa      [rsp+10h],xmm0 
-  movdqa      [rsp+60h],xmm2 
-  movdqa      xmm0,xmm3 
-  mov         eax,[rsp+10h] 
-  mov         [rcx-2],eax 
-  mov         eax,[rsp+60h] 
-  punpcklqdq  xmm0,xmm4 
-  punpckhqdq  xmm3,xmm4 
-  mov         [r10+rcx-2],eax 
-  movdqa      [rsp+20h],xmm0 
-  mov         eax, [rsp+20h] 
-  movdqa      [rsp+70h],xmm3 
-  mov         [rcx+r10*2-2],eax 
-  mov         eax,[rsp+70h] 
-  mov         [rdx+rcx-2],eax 
-  mov         eax,[rsp+18h] 
-  mov         [r11],eax 
-  mov         eax,[rsp+68h] 
-  mov         [r10+r11],eax 
-  mov         eax,[rsp+28h] 
-  mov         [r11+r10*2],eax 
-  mov         eax,[rsp+78h] 
-  mov         [rdx+r11],eax 
-  mov         eax,[rsp+14h] 
-  mov         [rdi-2],eax 
-  mov         eax,[rsp+64h] 
-  mov         [r10+rdi-2],eax 
-  mov         eax,[rsp+24h] 
-  mov         [rdi+r10*2-2],eax 
-  mov         eax, [rsp+74h] 
-  mov         [rdx+rdi-2],eax 
-  mov         eax, [rsp+1Ch] 
-  mov         [rbx],eax 
-  mov         eax, [rsp+6Ch] 
-  mov         [r10+rbx],eax 
-  mov         eax,[rsp+2Ch] 
-  mov         [rbx+r10*2],eax 
-  mov         eax,[rsp+7Ch] 
-  mov         [rdx+rbx],eax  
-  lea         r11,[rsp+140h] 
-  mov         rbx, [r11+28h]    
-  mov         rsp,r11 
-  pop         rdi  
+  mov         rax,rsp
+  mov         [rax+20h],rbx
+  push        rdi
+  sub         rsp,140h
+  mov         rdi,rdx
+  lea         eax,[r8*4]
+  movsxd      r10,eax
+  mov         eax,[rcx-2]
+  mov         [rsp+10h],eax
+  lea         rbx,[r10+rdx-2]
+  lea         r11,[r10+rcx-2]
+  movdqa      xmm5,[rsp+10h]
+  movsxd      r10,r8d
+  mov         eax,[r10+rcx-2]
+  lea         rdx,[r10+r10*2]
+  mov         [rsp+20h],eax
+  mov         eax,[rcx+r10*2-2]
+  mov         [rsp+30h],eax
+  mov         eax,[rdx+rcx-2]
+  movdqa      xmm2,[rsp+20h]
+  mov         [rsp+40h],eax
+  mov         eax, [rdi-2]
+  movdqa      xmm4,[rsp+30h]
+  mov         [rsp+50h],eax
+  mov         eax,[r10+rdi-2]
+  movdqa      xmm3,[rsp+40h]
+  mov         [rsp+60h],eax
+  mov         eax,[rdi+r10*2-2]
+  punpckldq   xmm5,[rsp+50h]
+  mov         [rsp+70h],eax
+  mov         eax, [rdx+rdi-2]
+  punpckldq   xmm2, [rsp+60h]
+  mov          [rsp+80h],eax
+  mov         eax,[r11]
+  punpckldq   xmm4, [rsp+70h]
+  mov         [rsp+50h],eax
+  mov         eax,[rbx]
+  punpckldq   xmm3,[rsp+80h]
+  mov         [rsp+60h],eax
+  mov         eax,[r10+r11]
+  movdqa      xmm0, [rsp+50h]
+  punpckldq   xmm0, [rsp+60h]
+  punpcklqdq  xmm5,xmm0
+  movdqa      [rsp+50h],xmm0
+  mov         [rsp+50h],eax
+  mov         eax,[r10+rbx]
+  movdqa      xmm0,[rsp+50h]
+  movdqa      xmm1,xmm5
+  mov         [rsp+60h],eax
+  mov         eax,[r11+r10*2]
+  punpckldq   xmm0, [rsp+60h]
+  punpcklqdq  xmm2,xmm0
+  punpcklbw   xmm1,xmm2
+  punpckhbw   xmm5,xmm2
+  movdqa      [rsp+50h],xmm0
+  mov         [rsp+50h],eax
+  mov         eax,[rbx+r10*2]
+  movdqa      xmm0,[rsp+50h]
+  mov         [rsp+60h],eax
+  mov         eax, [rdx+r11]
+  movdqa      xmm15,xmm1
+  punpckldq   xmm0,[rsp+60h]
+  punpcklqdq  xmm4,xmm0
+  movdqa      [rsp+50h],xmm0
+  mov         [rsp+50h],eax
+  mov         eax, [rdx+rbx]
+  movdqa      xmm0,[rsp+50h]
+  mov         [rsp+60h],eax
+  punpckldq   xmm0, [rsp+60h]
+  punpcklqdq  xmm3,xmm0
+  movdqa      xmm0,xmm4
+  punpcklbw   xmm0,xmm3
+  punpckhbw   xmm4,xmm3
+  punpcklwd   xmm15,xmm0
+  punpckhwd   xmm1,xmm0
+  movdqa      xmm0,xmm5
+  movdqa      xmm12,xmm15
+  punpcklwd   xmm0,xmm4
+  punpckhwd   xmm5,xmm4
+  punpckldq   xmm12,xmm0
+  punpckhdq   xmm15,xmm0
+  movdqa      xmm0,xmm1
+  movdqa      xmm11,xmm12
+  punpckldq   xmm0,xmm5
+  punpckhdq   xmm1,xmm5
+  punpcklqdq  xmm11,xmm0
+  punpckhqdq  xmm12,xmm0
+  movsx       eax,r9w
+  movdqa      xmm14,xmm15
+  punpcklqdq  xmm14,xmm1
+  punpckhqdq  xmm15,xmm1
+  pxor        xmm1,xmm1
+  movd        xmm0,eax
+  movdqa      xmm4,xmm12
+  movdqa      xmm8,xmm11
+  movsx       eax,word [rsp+170h] ; iBeta
+  punpcklwd   xmm0,xmm0
+  punpcklbw   xmm4,xmm1
+  punpckhbw   xmm12,xmm1
+  movdqa      xmm9,xmm14
+  movdqa      xmm7,xmm15
+  movdqa      xmm10,xmm15
+  pshufd      xmm13,xmm0,0
+  punpcklbw   xmm9,xmm1
+  punpckhbw   xmm14,xmm1
+  movdqa      xmm6,xmm13
+  movd        xmm0,eax
+  movdqa      [rsp],xmm11
+  mov         eax,2
+  cwde
+  punpckhbw   xmm11,xmm1
+  punpckhbw   xmm10,xmm1
+  punpcklbw   xmm7,xmm1
+  punpcklwd   xmm0,xmm0
+  punpcklbw   xmm8,xmm1
+  pshufd      xmm3,xmm0,0
+  movdqa      xmm1,xmm8
+  movdqa      xmm0,xmm4
+  psubw       xmm0,xmm9
+  psubw       xmm1,xmm4
+  movdqa      xmm2,xmm3
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm6,xmm0
+  pabsw       xmm0,xmm1
+  movdqa      xmm1,xmm3
+  pcmpgtw     xmm2,xmm0
+  pand        xmm6,xmm2
+  movdqa      xmm0,xmm7
+  movdqa      xmm2,xmm3
+  psubw       xmm0,xmm9
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm1,xmm0
+  pand        xmm6,xmm1
+  movdqa      xmm0,xmm12
+  movdqa      xmm1,xmm11
+  psubw       xmm0,xmm14
+  psubw       xmm1,xmm12
+  movdqa      xmm5,xmm6
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm13,xmm0
+  pabsw       xmm0,xmm1
+  movdqa      xmm1,xmm8
+  pcmpgtw     xmm2,xmm0
+  paddw       xmm1,xmm8
+  movdqa      xmm0,xmm10
+  pand        xmm13,xmm2
+  psubw       xmm0,xmm14
+  paddw       xmm1,xmm4
+  movdqa      xmm2,xmm11
+  pabsw       xmm0,xmm0
+  paddw       xmm2,xmm11
+  paddw       xmm1,xmm7
+  pcmpgtw     xmm3,xmm0
+  paddw       xmm2,xmm12
+  movd        xmm0,eax
+  pand        xmm13,xmm3
+  paddw       xmm2,xmm10
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm3,xmm0,0
+  movdqa      xmm0,xmm6
+  paddw       xmm1,xmm3
+  pandn       xmm0,xmm4
+  paddw       xmm2,xmm3
+  psraw       xmm1,2
+  pand        xmm5,xmm1
+  por         xmm5,xmm0
+  paddw       xmm7,xmm7
+  paddw       xmm10,xmm10
+  psraw       xmm2,2
+  movdqa      xmm1,xmm13
+  movdqa      xmm0,xmm13
+  pandn       xmm0,xmm12
+  pand        xmm1,xmm2
+  paddw       xmm7,xmm9
+  por         xmm1,xmm0
+  paddw       xmm10,xmm14
+  paddw       xmm7,xmm8
+  movdqa      xmm0,xmm13
+  packuswb    xmm5,xmm1
+  paddw       xmm7,xmm3
+  paddw       xmm10,xmm11
+  movdqa      xmm1,xmm6
+  paddw       xmm10,xmm3
+  pandn       xmm6,xmm9
+  psraw       xmm7,2
+  pand        xmm1,xmm7
+  psraw       xmm10,2
+  pandn       xmm13,xmm14
+  pand        xmm0,xmm10
+  por         xmm1,xmm6
+  movdqa      xmm6,[rsp]
+  movdqa      xmm4,xmm6
+  por         xmm0,xmm13
+  punpcklbw   xmm4,xmm5
+  punpckhbw   xmm6,xmm5
+  movdqa      xmm3,xmm4
+  packuswb    xmm1,xmm0
+  movdqa      xmm0,xmm1
+  punpckhbw   xmm1,xmm15
+  punpcklbw   xmm0,xmm15
+  punpcklwd   xmm3,xmm0
+  punpckhwd   xmm4,xmm0
+  movdqa      xmm0,xmm6
+  movdqa      xmm2,xmm3
+  punpcklwd   xmm0,xmm1
+  punpckhwd   xmm6,xmm1
+  movdqa      xmm1,xmm4
+  punpckldq   xmm2,xmm0
+  punpckhdq   xmm3,xmm0
+  punpckldq   xmm1,xmm6
+  movdqa      xmm0,xmm2
+  punpcklqdq  xmm0,xmm1
+  punpckhdq   xmm4,xmm6
+  punpckhqdq  xmm2,xmm1
+  movdqa      [rsp+10h],xmm0
+  movdqa      [rsp+60h],xmm2
+  movdqa      xmm0,xmm3
+  mov         eax,[rsp+10h]
+  mov         [rcx-2],eax
+  mov         eax,[rsp+60h]
+  punpcklqdq  xmm0,xmm4
+  punpckhqdq  xmm3,xmm4
+  mov         [r10+rcx-2],eax
+  movdqa      [rsp+20h],xmm0
+  mov         eax, [rsp+20h]
+  movdqa      [rsp+70h],xmm3
+  mov         [rcx+r10*2-2],eax
+  mov         eax,[rsp+70h]
+  mov         [rdx+rcx-2],eax
+  mov         eax,[rsp+18h]
+  mov         [r11],eax
+  mov         eax,[rsp+68h]
+  mov         [r10+r11],eax
+  mov         eax,[rsp+28h]
+  mov         [r11+r10*2],eax
+  mov         eax,[rsp+78h]
+  mov         [rdx+r11],eax
+  mov         eax,[rsp+14h]
+  mov         [rdi-2],eax
+  mov         eax,[rsp+64h]
+  mov         [r10+rdi-2],eax
+  mov         eax,[rsp+24h]
+  mov         [rdi+r10*2-2],eax
+  mov         eax, [rsp+74h]
+  mov         [rdx+rdi-2],eax
+  mov         eax, [rsp+1Ch]
+  mov         [rbx],eax
+  mov         eax, [rsp+6Ch]
+  mov         [r10+rbx],eax
+  mov         eax,[rsp+2Ch]
+  mov         [rbx+r10*2],eax
+  mov         eax,[rsp+7Ch]
+  mov         [rdx+rbx],eax
+  lea         r11,[rsp+140h]
+  mov         rbx, [r11+28h]
+  mov         rsp,r11
+  pop         rdi
   ret
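
The H variants (DeblockChromaEq4H_sse2 above, DeblockChromaLt4H_sse2 in the next hunk) filter a vertical edge, so most of their extra code is data movement: the long punpcklbw/punpcklwd/punpckldq/punpcklqdq ladders transpose the 4-byte groups straddling the edge into the row layout the vertical math expects, and the dword loads and stores through eax at the head and tail do the gather and scatter. A sketch of that gather step, with illustrative names:

#include <stdint.h>

/* Hedged sketch: collect the four pixels around a vertical edge
 * (p1 p0 q0 q1) for each of 8 rows into per-tap arrays; the SSE2 code
 * performs the same rearrangement entirely in registers. */
static void GatherEdgeColumns (const uint8_t* pPix, int32_t iStride,
                               uint8_t p1[8], uint8_t p0[8],
                               uint8_t q0[8], uint8_t q1[8]) {
  for (int32_t row = 0; row < 8; row++) {
    const uint8_t* s = pPix + (int64_t) row * iStride - 2;
    p1[row] = s[0]; p0[row] = s[1]; q0[row] = s[2]; q1[row] = s[3];
  }
}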
 
 
@@ -1363,283 +1363,283 @@
 WELS_EXTERN DeblockChromaLt4H_sse2
 ALIGN  16
 DeblockChromaLt4H_sse2:
-  mov         rax,rsp 
-  push        rbx  
-  push        rbp  
-  push        rsi  
-  push        rdi  
-  push        r12  
-  sub         rsp,170h  
-  
-  movsxd      rsi,r8d 
-  lea         eax,[r8*4] 
-  mov         r11d,r9d 
-  movsxd      r10,eax 
-  mov         eax, [rcx-2] 
-  mov         r12,rdx 
-  mov         [rsp+40h],eax 
-  mov         eax, [rsi+rcx-2] 
-  lea         rbx,[r10+rcx-2] 
-  movdqa      xmm5,[rsp+40h] 
-  mov         [rsp+50h],eax 
-  mov         eax, [rcx+rsi*2-2] 
-  lea         rbp,[r10+rdx-2] 
-  movdqa      xmm2, [rsp+50h] 
-  mov         [rsp+60h],eax 
-  lea         r10,[rsi+rsi*2] 
-  mov         rdi,rcx 
-  mov         eax,[r10+rcx-2] 
-  movdqa      xmm4,[rsp+60h] 
-  mov         [rsp+70h],eax 
-  mov         eax,[rdx-2] 
-  mov         [rsp+80h],eax 
-  mov         eax, [rsi+rdx-2] 
-  movdqa      xmm3,[rsp+70h] 
-  mov         [rsp+90h],eax 
-  mov         eax,[rdx+rsi*2-2] 
-  punpckldq   xmm5,[rsp+80h] 
-  mov         [rsp+0A0h],eax 
-  mov         eax, [r10+rdx-2] 
-  punpckldq   xmm2,[rsp+90h] 
-  mov         [rsp+0B0h],eax 
-  mov         eax, [rbx] 
-  punpckldq   xmm4,[rsp+0A0h] 
-  mov         [rsp+80h],eax 
-  mov         eax,[rbp] 
-  punpckldq   xmm3,[rsp+0B0h] 
-  mov         [rsp+90h],eax 
-  mov         eax,[rsi+rbx] 
-  movdqa      xmm0,[rsp+80h] 
-  punpckldq   xmm0,[rsp+90h] 
-  punpcklqdq  xmm5,xmm0 
-  movdqa      [rsp+80h],xmm0 
-  mov         [rsp+80h],eax 
-  mov         eax,[rsi+rbp] 
-  movdqa      xmm0,[rsp+80h] 
-  movdqa      xmm1,xmm5 
-  mov         [rsp+90h],eax 
-  mov         eax,[rbx+rsi*2] 
-  punpckldq   xmm0,[rsp+90h] 
-  punpcklqdq  xmm2,xmm0 
-  punpcklbw   xmm1,xmm2 
-  punpckhbw   xmm5,xmm2 
-  movdqa      [rsp+80h],xmm0 
-  mov         [rsp+80h],eax 
-  mov         eax,[rbp+rsi*2] 
-  movdqa      xmm0, [rsp+80h] 
-  mov         [rsp+90h],eax 
-  mov         eax,[r10+rbx] 
-  movdqa      xmm7,xmm1 
-  punpckldq   xmm0,[rsp+90h] 
-  punpcklqdq  xmm4,xmm0 
-  movdqa      [rsp+80h],xmm0 
-  mov         [rsp+80h],eax 
-  mov         eax, [r10+rbp] 
-  movdqa      xmm0,[rsp+80h] 
-  mov         [rsp+90h],eax 
-  punpckldq   xmm0,[rsp+90h] 
-  punpcklqdq  xmm3,xmm0 
-  movdqa      xmm0,xmm4 
-  punpcklbw   xmm0,xmm3 
-  punpckhbw   xmm4,xmm3 
-  punpcklwd   xmm7,xmm0 
-  punpckhwd   xmm1,xmm0 
-  movdqa      xmm0,xmm5 
-  movdqa      xmm6,xmm7 
-  punpcklwd   xmm0,xmm4 
-  punpckhwd   xmm5,xmm4 
-  punpckldq   xmm6,xmm0 
-  punpckhdq   xmm7,xmm0 
-  movdqa      xmm0,xmm1 
-  punpckldq   xmm0,xmm5 
+  mov         rax,rsp
+  push        rbx
+  push        rbp
+  push        rsi
+  push        rdi
+  push        r12
+  sub         rsp,170h
+
+  movsxd      rsi,r8d
+  lea         eax,[r8*4]
+  mov         r11d,r9d
+  movsxd      r10,eax
+  mov         eax, [rcx-2]
+  mov         r12,rdx
+  mov         [rsp+40h],eax
+  mov         eax, [rsi+rcx-2]
+  lea         rbx,[r10+rcx-2]
+  movdqa      xmm5,[rsp+40h]
+  mov         [rsp+50h],eax
+  mov         eax, [rcx+rsi*2-2]
+  lea         rbp,[r10+rdx-2]
+  movdqa      xmm2, [rsp+50h]
+  mov         [rsp+60h],eax
+  lea         r10,[rsi+rsi*2]
+  mov         rdi,rcx
+  mov         eax,[r10+rcx-2]
+  movdqa      xmm4,[rsp+60h]
+  mov         [rsp+70h],eax
+  mov         eax,[rdx-2]
+  mov         [rsp+80h],eax
+  mov         eax, [rsi+rdx-2]
+  movdqa      xmm3,[rsp+70h]
+  mov         [rsp+90h],eax
+  mov         eax,[rdx+rsi*2-2]
+  punpckldq   xmm5,[rsp+80h]
+  mov         [rsp+0A0h],eax
+  mov         eax, [r10+rdx-2]
+  punpckldq   xmm2,[rsp+90h]
+  mov         [rsp+0B0h],eax
+  mov         eax, [rbx]
+  punpckldq   xmm4,[rsp+0A0h]
+  mov         [rsp+80h],eax
+  mov         eax,[rbp]
+  punpckldq   xmm3,[rsp+0B0h]
+  mov         [rsp+90h],eax
+  mov         eax,[rsi+rbx]
+  movdqa      xmm0,[rsp+80h]
+  punpckldq   xmm0,[rsp+90h]
+  punpcklqdq  xmm5,xmm0
+  movdqa      [rsp+80h],xmm0
+  mov         [rsp+80h],eax
+  mov         eax,[rsi+rbp]
+  movdqa      xmm0,[rsp+80h]
+  movdqa      xmm1,xmm5
+  mov         [rsp+90h],eax
+  mov         eax,[rbx+rsi*2]
+  punpckldq   xmm0,[rsp+90h]
+  punpcklqdq  xmm2,xmm0
+  punpcklbw   xmm1,xmm2
+  punpckhbw   xmm5,xmm2
+  movdqa      [rsp+80h],xmm0
+  mov         [rsp+80h],eax
+  mov         eax,[rbp+rsi*2]
+  movdqa      xmm0, [rsp+80h]
+  mov         [rsp+90h],eax
+  mov         eax,[r10+rbx]
+  movdqa      xmm7,xmm1
+  punpckldq   xmm0,[rsp+90h]
+  punpcklqdq  xmm4,xmm0
+  movdqa      [rsp+80h],xmm0
+  mov         [rsp+80h],eax
+  mov         eax, [r10+rbp]
+  movdqa      xmm0,[rsp+80h]
+  mov         [rsp+90h],eax
+  punpckldq   xmm0,[rsp+90h]
+  punpcklqdq  xmm3,xmm0
+  movdqa      xmm0,xmm4
+  punpcklbw   xmm0,xmm3
+  punpckhbw   xmm4,xmm3
+  punpcklwd   xmm7,xmm0
+  punpckhwd   xmm1,xmm0
+  movdqa      xmm0,xmm5
+  movdqa      xmm6,xmm7
+  punpcklwd   xmm0,xmm4
+  punpckhwd   xmm5,xmm4
+  punpckldq   xmm6,xmm0
+  punpckhdq   xmm7,xmm0
+  movdqa      xmm0,xmm1
+  punpckldq   xmm0,xmm5
   mov         rax, [rsp+1C8h]    ; pTC
-  punpckhdq   xmm1,xmm5 
-  movdqa      xmm9,xmm6 
-  punpckhqdq  xmm6,xmm0 
-  punpcklqdq  xmm9,xmm0 
-  movdqa      xmm2,xmm7 
-  movdqa      xmm13,xmm6 
-  movdqa      xmm4,xmm9 
-  movdqa      [rsp+10h],xmm9 
-  punpcklqdq  xmm2,xmm1 
-  punpckhqdq  xmm7,xmm1 
-  pxor        xmm1,xmm1 
-  movsx       ecx,byte [rax+3] 
-  movsx       edx,byte [rax+2] 
-  movsx       r8d,byte [rax+1] 
-  movsx       r9d,byte [rax] 
-  movdqa      xmm10,xmm1 
-  movdqa      xmm15,xmm2 
-  punpckhbw   xmm2,xmm1 
-  punpckhbw   xmm6,xmm1 
-  punpcklbw   xmm4,xmm1 
-  movsx       eax,r11w 
-  mov         word [rsp+0Eh],cx 
-  mov         word [rsp+0Ch],cx 
-  movdqa      xmm3,xmm7 
-  movdqa      xmm8,xmm7 
-  movdqa      [rsp+20h],xmm7 
-  punpcklbw   xmm15,xmm1 
-  punpcklbw   xmm13,xmm1 
-  punpcklbw   xmm3,xmm1 
-  mov         word [rsp+0Ah],dx 
-  mov         word [rsp+8],dx 
-  mov         word [rsp+6],r8w 
-  movd        xmm0,eax 
-  movdqa      [rsp+30h],xmm6 
-  punpckhbw   xmm9,xmm1 
-  punpckhbw   xmm8,xmm1 
-  punpcklwd   xmm0,xmm0 
+  punpckhdq   xmm1,xmm5
+  movdqa      xmm9,xmm6
+  punpckhqdq  xmm6,xmm0
+  punpcklqdq  xmm9,xmm0
+  movdqa      xmm2,xmm7
+  movdqa      xmm13,xmm6
+  movdqa      xmm4,xmm9
+  movdqa      [rsp+10h],xmm9
+  punpcklqdq  xmm2,xmm1
+  punpckhqdq  xmm7,xmm1
+  pxor        xmm1,xmm1
+  movsx       ecx,byte [rax+3]
+  movsx       edx,byte [rax+2]
+  movsx       r8d,byte [rax+1]
+  movsx       r9d,byte [rax]
+  movdqa      xmm10,xmm1
+  movdqa      xmm15,xmm2
+  punpckhbw   xmm2,xmm1
+  punpckhbw   xmm6,xmm1
+  punpcklbw   xmm4,xmm1
+  movsx       eax,r11w
+  mov         word [rsp+0Eh],cx
+  mov         word [rsp+0Ch],cx
+  movdqa      xmm3,xmm7
+  movdqa      xmm8,xmm7
+  movdqa      [rsp+20h],xmm7
+  punpcklbw   xmm15,xmm1
+  punpcklbw   xmm13,xmm1
+  punpcklbw   xmm3,xmm1
+  mov         word [rsp+0Ah],dx
+  mov         word [rsp+8],dx
+  mov         word [rsp+6],r8w
+  movd        xmm0,eax
+  movdqa      [rsp+30h],xmm6
+  punpckhbw   xmm9,xmm1
+  punpckhbw   xmm8,xmm1
+  punpcklwd   xmm0,xmm0
   movsx       eax,word [rsp+1C0h]   ; iBeta
-  mov         word [rsp+4],r8w 
-  mov         word [rsp+2],r9w 
-  pshufd      xmm12,xmm0,0 
-  mov         word [rsp],r9w 
-  movd        xmm0,eax 
-  mov         eax,4 
-  cwde             
-  movdqa      xmm14, [rsp] 
-  movdqa      [rsp],xmm2 
-  movdqa      xmm2,xmm12 
-  punpcklwd   xmm0,xmm0 
-  pshufd      xmm11,xmm0,0 
-  psubw       xmm10,xmm14 
-  movd        xmm0,eax 
-  movdqa      xmm7,xmm14 
-  movdqa      xmm6,xmm14 
-  pcmpgtw     xmm7,xmm1 
-  punpcklwd   xmm0,xmm0 
-  pshufd      xmm5,xmm0,0 
-  movdqa      xmm0,xmm4 
-  movdqa      xmm1,xmm15 
-  psubw       xmm4,xmm13 
-  psubw       xmm0,xmm3 
-  psubw       xmm1,xmm13 
-  psubw       xmm3,xmm15 
-  psllw       xmm1,2 
-  paddw       xmm1,xmm0 
-  paddw       xmm1,xmm5 
-  movdqa      xmm0,xmm10 
-  psraw       xmm1,3 
-  pmaxsw      xmm0,xmm1 
-  pminsw      xmm6,xmm0 
-  movdqa      xmm1,xmm11 
-  movdqa      xmm0,xmm13 
-  psubw       xmm0,xmm15 
-  pabsw       xmm0,xmm0 
-  pcmpgtw     xmm2,xmm0 
-  pabsw       xmm0,xmm4 
-  pcmpgtw     xmm1,xmm0 
-  pabsw       xmm0,xmm3 
-  pand        xmm2,xmm1 
-  movdqa      xmm1,xmm11 
-  movdqa      xmm3,[rsp+30h] 
-  pcmpgtw     xmm1,xmm0 
-  movdqa      xmm0,xmm9 
-  pand        xmm2,xmm1 
-  psubw       xmm0,xmm8 
-  psubw       xmm9,xmm3 
-  pand        xmm2,xmm7 
-  pand        xmm6,xmm2 
-  psubw       xmm15,xmm6 
-  paddw       xmm13,xmm6 
-  movdqa      xmm2,[rsp] 
-  movdqa      xmm1,xmm2 
-  psubw       xmm1,xmm3 
-  psubw       xmm8,xmm2 
-  psllw       xmm1,2 
-  paddw       xmm1,xmm0 
-  paddw       xmm1,xmm5 
-  movdqa      xmm0,xmm3 
-  movdqa      xmm5,[rsp+10h] 
-  psubw       xmm0,xmm2 
-  psraw       xmm1,3 
-  movdqa      xmm4,xmm5 
-  pabsw       xmm0,xmm0 
-  pmaxsw      xmm10,xmm1 
-  movdqa      xmm1,xmm11 
-  pcmpgtw     xmm12,xmm0 
-  pabsw       xmm0,xmm9 
-  pminsw      xmm14,xmm10 
-  pcmpgtw     xmm1,xmm0 
-  pabsw       xmm0,xmm8 
-  pcmpgtw     xmm11,xmm0 
-  pand        xmm12,xmm1 
-  movdqa      xmm1,[rsp+20h] 
-  pand        xmm12,xmm11 
-  pand        xmm12,xmm7 
-  pand        xmm14,xmm12 
-  paddw       xmm3,xmm14 
-  psubw       xmm2,xmm14 
-  packuswb    xmm13,xmm3 
-  packuswb    xmm15,xmm2 
-  punpcklbw   xmm4,xmm13 
-  punpckhbw   xmm5,xmm13 
-  movdqa      xmm0,xmm15 
-  punpcklbw   xmm0,xmm1 
-  punpckhbw   xmm15,xmm1 
-  movdqa      xmm3,xmm4 
-  punpcklwd   xmm3,xmm0 
-  punpckhwd   xmm4,xmm0 
-  movdqa      xmm0,xmm5 
-  movdqa      xmm2,xmm3 
-  movdqa      xmm1,xmm4 
-  punpcklwd   xmm0,xmm15 
-  punpckhwd   xmm5,xmm15 
-  punpckldq   xmm2,xmm0 
-  punpckhdq   xmm3,xmm0 
-  punpckldq   xmm1,xmm5 
-  movdqa      xmm0,xmm2 
-  punpcklqdq  xmm0,xmm1 
-  punpckhdq   xmm4,xmm5 
-  punpckhqdq  xmm2,xmm1 
-  movdqa      [rsp+40h],xmm0 
-  movdqa      xmm0,xmm3 
-  movdqa      [rsp+90h],xmm2 
-  mov         eax,[rsp+40h] 
-  mov         [rdi-2],eax 
-  mov         eax, [rsp+90h] 
-  punpcklqdq  xmm0,xmm4 
-  punpckhqdq  xmm3,xmm4 
-  mov         [rsi+rdi-2],eax 
-  movdqa      [rsp+50h],xmm0 
-  mov         eax,[rsp+50h] 
-  movdqa      [rsp+0A0h],xmm3 
-  mov         [rdi+rsi*2-2],eax 
-  mov         eax,[rsp+0A0h] 
-  mov         [r10+rdi-2],eax 
-  mov         eax,[rsp+48h] 
-  mov         [rbx],eax 
-  mov         eax,[rsp+98h] 
-  mov         [rsi+rbx],eax 
-  mov         eax,[rsp+58h] 
-  mov         [rbx+rsi*2],eax 
-  mov         eax, [rsp+0A8h] 
-  mov         [r10+rbx],eax 
-  mov         eax, [rsp+44h] 
-  mov         [r12-2],eax 
-  mov         eax,[rsp+94h] 
-  mov         [rsi+r12-2],eax 
-  mov         eax,[rsp+54h] 
-  mov         [r12+rsi*2-2],eax 
-  mov         eax, [rsp+0A4h] 
-  mov         [r10+r12-2],eax 
-  mov         eax,[rsp+4Ch] 
-  mov         [rbp],eax 
-  mov         eax,[rsp+9Ch] 
-  mov         [rsi+rbp],eax 
-  mov         eax, [rsp+5Ch] 
-  mov         [rbp+rsi*2],eax 
-  mov         eax,[rsp+0ACh] 
-  mov         [r10+rbp],eax   
-  lea         r11,[rsp+170h]    
-  mov         rsp,r11 
-  pop         r12  
-  pop         rdi  
-  pop         rsi  
-  pop         rbp  
-  pop         rbx  
-  ret 
+  mov         word [rsp+4],r8w
+  mov         word [rsp+2],r9w
+  pshufd      xmm12,xmm0,0
+  mov         word [rsp],r9w
+  movd        xmm0,eax
+  mov         eax,4
+  cwde
+  movdqa      xmm14, [rsp]
+  movdqa      [rsp],xmm2
+  movdqa      xmm2,xmm12
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm11,xmm0,0
+  psubw       xmm10,xmm14
+  movd        xmm0,eax
+  movdqa      xmm7,xmm14
+  movdqa      xmm6,xmm14
+  pcmpgtw     xmm7,xmm1
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm5,xmm0,0
+  movdqa      xmm0,xmm4
+  movdqa      xmm1,xmm15
+  psubw       xmm4,xmm13
+  psubw       xmm0,xmm3
+  psubw       xmm1,xmm13
+  psubw       xmm3,xmm15
+  psllw       xmm1,2
+  paddw       xmm1,xmm0
+  paddw       xmm1,xmm5
+  movdqa      xmm0,xmm10
+  psraw       xmm1,3
+  pmaxsw      xmm0,xmm1
+  pminsw      xmm6,xmm0
+  movdqa      xmm1,xmm11
+  movdqa      xmm0,xmm13
+  psubw       xmm0,xmm15
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm2,xmm0
+  pabsw       xmm0,xmm4
+  pcmpgtw     xmm1,xmm0
+  pabsw       xmm0,xmm3
+  pand        xmm2,xmm1
+  movdqa      xmm1,xmm11
+  movdqa      xmm3,[rsp+30h]
+  pcmpgtw     xmm1,xmm0
+  movdqa      xmm0,xmm9
+  pand        xmm2,xmm1
+  psubw       xmm0,xmm8
+  psubw       xmm9,xmm3
+  pand        xmm2,xmm7
+  pand        xmm6,xmm2
+  psubw       xmm15,xmm6
+  paddw       xmm13,xmm6
+  movdqa      xmm2,[rsp]
+  movdqa      xmm1,xmm2
+  psubw       xmm1,xmm3
+  psubw       xmm8,xmm2
+  psllw       xmm1,2
+  paddw       xmm1,xmm0
+  paddw       xmm1,xmm5
+  movdqa      xmm0,xmm3
+  movdqa      xmm5,[rsp+10h]
+  psubw       xmm0,xmm2
+  psraw       xmm1,3
+  movdqa      xmm4,xmm5
+  pabsw       xmm0,xmm0
+  pmaxsw      xmm10,xmm1
+  movdqa      xmm1,xmm11
+  pcmpgtw     xmm12,xmm0
+  pabsw       xmm0,xmm9
+  pminsw      xmm14,xmm10
+  pcmpgtw     xmm1,xmm0
+  pabsw       xmm0,xmm8
+  pcmpgtw     xmm11,xmm0
+  pand        xmm12,xmm1
+  movdqa      xmm1,[rsp+20h]
+  pand        xmm12,xmm11
+  pand        xmm12,xmm7
+  pand        xmm14,xmm12
+  paddw       xmm3,xmm14
+  psubw       xmm2,xmm14
+  packuswb    xmm13,xmm3
+  packuswb    xmm15,xmm2
+  punpcklbw   xmm4,xmm13
+  punpckhbw   xmm5,xmm13
+  movdqa      xmm0,xmm15
+  punpcklbw   xmm0,xmm1
+  punpckhbw   xmm15,xmm1
+  movdqa      xmm3,xmm4
+  punpcklwd   xmm3,xmm0
+  punpckhwd   xmm4,xmm0
+  movdqa      xmm0,xmm5
+  movdqa      xmm2,xmm3
+  movdqa      xmm1,xmm4
+  punpcklwd   xmm0,xmm15
+  punpckhwd   xmm5,xmm15
+  punpckldq   xmm2,xmm0
+  punpckhdq   xmm3,xmm0
+  punpckldq   xmm1,xmm5
+  movdqa      xmm0,xmm2
+  punpcklqdq  xmm0,xmm1
+  punpckhdq   xmm4,xmm5
+  punpckhqdq  xmm2,xmm1
+  movdqa      [rsp+40h],xmm0
+  movdqa      xmm0,xmm3
+  movdqa      [rsp+90h],xmm2
+  mov         eax,[rsp+40h]
+  mov         [rdi-2],eax
+  mov         eax, [rsp+90h]
+  punpcklqdq  xmm0,xmm4
+  punpckhqdq  xmm3,xmm4
+  mov         [rsi+rdi-2],eax
+  movdqa      [rsp+50h],xmm0
+  mov         eax,[rsp+50h]
+  movdqa      [rsp+0A0h],xmm3
+  mov         [rdi+rsi*2-2],eax
+  mov         eax,[rsp+0A0h]
+  mov         [r10+rdi-2],eax
+  mov         eax,[rsp+48h]
+  mov         [rbx],eax
+  mov         eax,[rsp+98h]
+  mov         [rsi+rbx],eax
+  mov         eax,[rsp+58h]
+  mov         [rbx+rsi*2],eax
+  mov         eax, [rsp+0A8h]
+  mov         [r10+rbx],eax
+  mov         eax, [rsp+44h]
+  mov         [r12-2],eax
+  mov         eax,[rsp+94h]
+  mov         [rsi+r12-2],eax
+  mov         eax,[rsp+54h]
+  mov         [r12+rsi*2-2],eax
+  mov         eax, [rsp+0A4h]
+  mov         [r10+r12-2],eax
+  mov         eax,[rsp+4Ch]
+  mov         [rbp],eax
+  mov         eax,[rsp+9Ch]
+  mov         [rsi+rbp],eax
+  mov         eax, [rsp+5Ch]
+  mov         [rbp+rsi*2],eax
+  mov         eax,[rsp+0ACh]
+  mov         [r10+rbp],eax
+  lea         r11,[rsp+170h]
+  mov         rsp,r11
+  pop         r12
+  pop         rdi
+  pop         rsi
+  pop         rbp
+  pop         rbx
+  ret
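
The remaining hunk applies the same cleanup to the second copy of DeblockLumaLt4V_sse2 (the System V calling-convention path, judging by its rdi/rsi/edx/ecx argument use, versus the WIN64 copy earlier in the file). Luma bS<4 filtering extends the chroma formula with conditional p1/q1 taps: the stored comparison masks are subtracted from tc (a true pcmpgtw lane is -1, so each true condition adds one) and also gate the extra taps, while pavgw supplies the (p0+q0+1)>>1 rounding average and FOUR_16B_SSE2 the rounding constant. A scalar sketch, again with illustrative names rather than the project's C code:

#include <stdint.h>
#include <stdlib.h>

/* Hedged scalar sketch of the bS<4 luma filter. */
static int32_t Clip3 (int32_t lo, int32_t hi, int32_t v) {
  return v < lo ? lo : (v > hi ? hi : v);
}

static void DeblockLumaLt4V_scalar (uint8_t* pPix, int32_t iStride,
                                    int32_t iAlpha, int32_t iBeta,
                                    const int8_t* pTc) {
  for (int32_t i = 0; i < 16; i++) {
    int32_t iTc0 = pTc[i >> 2];                 /* one tc0 per 4 columns */
    if (iTc0 < 0) continue;                     /* tc0 == -1: edge off   */
    int32_t p2 = pPix[i - 3 * iStride], p1 = pPix[i - 2 * iStride];
    int32_t p0 = pPix[i - iStride];
    int32_t q0 = pPix[i], q1 = pPix[i + iStride], q2 = pPix[i + 2 * iStride];
    if (abs (p0 - q0) >= iAlpha || abs (p1 - p0) >= iBeta ||
        abs (q1 - q0) >= iBeta)
      continue;
    int32_t bP = abs (p2 - p0) < iBeta;         /* gates the p1 tap      */
    int32_t bQ = abs (q2 - q0) < iBeta;         /* gates the q1 tap      */
    int32_t iTc  = iTc0 + bP + bQ;
    int32_t d    = Clip3 (-iTc, iTc, ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3);
    int32_t iAvg = (p0 + q0 + 1) >> 1;          /* the pavgw step        */
    pPix[i - iStride] = (uint8_t) Clip3 (0, 255, p0 + d);
    pPix[i]           = (uint8_t) Clip3 (0, 255, q0 - d);
    if (bP)
      pPix[i - 2 * iStride] =
          (uint8_t) (p1 + Clip3 (-iTc0, iTc0, (p2 + iAvg - 2 * p1) >> 1));
    if (bQ)
      pPix[i + iStride] =
          (uint8_t) (q1 + Clip3 (-iTc0, iTc0, (q2 + iAvg - 2 * q1) >> 1));
  }
}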
 
 
 
@@ -1649,258 +1649,258 @@
 WELS_EXTERN   DeblockLumaLt4V_sse2
 
 DeblockLumaLt4V_sse2:
-  push        rbp      
-  mov         r11,r8  ; pTC                                                    
-  sub         rsp,1B0h                                                       
-  lea         rbp,[rsp+20h]                                                  
-  movd        xmm4,edx                                                                                                  
-  movd        xmm2,ecx                                                       
-  mov         qword [rbp+180h],r12                                       
-  mov         r10,rdi                                                        
-  movsxd      r12,esi                                                        
+  push        rbp
+  mov         r11,r8  ; pTC
+  sub         rsp,1B0h
+  lea         rbp,[rsp+20h]
+  movd        xmm4,edx
+  movd        xmm2,ecx
+  mov         qword [rbp+180h],r12
+  mov         r10,rdi
+  movsxd      r12,esi
   add         rsi,rsi
-  movsxd      rdx,esi 
-  sub         r10,r12                                                        
-  movsx       r8d,byte [r11]                                             
-  pxor        xmm3,xmm3                                                      
-  punpcklwd   xmm2,xmm2                                                      
-  movaps      [rbp+50h],xmm14                                    
-  lea         rax,[r12+r12*2]                                                
-  movdqa      xmm14,[rdx+rdi]                                    
-  neg         rax                                                            
-  pshufd      xmm0,xmm2,0                                                    
-  movd        xmm2,r8d                                                       
-  movsx       rsi,byte [r11+1]                                           
-  movsx       r8d,byte [r11+2]                                           
-  movsx       r11d,byte [r11+3]                                          
-  movaps      [rbp+70h],xmm12                                    
-  movd        xmm1,esi                                                      
-  movaps      [rbp+80h],xmm11                                    
-  movd        xmm12,r8d                                                      
-  movd        xmm11,r11d                                                     
-  movdqa      xmm5, [rax+rdi]                                     
-  lea         rax,[r12+r12]                                                  
-  punpcklwd   xmm12,xmm12                                                    
-  neg         rax                                                            
-  punpcklwd   xmm11,xmm11                                                    
-  movaps      [rbp],xmm8                                         
-  movdqa      xmm8, [r10]                                         
-  punpcklwd   xmm2,xmm2                                                      
-  punpcklwd   xmm1,xmm1                                                      
-  punpcklqdq  xmm12,xmm12                                                    
-  punpcklqdq  xmm11,xmm11                                                    
-  punpcklqdq  xmm2,xmm2                                                      
-  punpcklqdq  xmm1,xmm1                                                      
-  shufps      xmm12,xmm11,88h                                                
-  movdqa      xmm11,xmm8                                                     
-  movaps      [rbp+30h],xmm9                                     
-  movdqa      xmm9,[rdi]                                         
-  shufps      xmm2,xmm1,88h                                                  
-  movdqa      xmm1,xmm5                                                      
-  punpcklbw   xmm11,xmm3                                                     
-  movaps      [rbp+20h],xmm6                                     
-  movaps      [rbp+60h],xmm13                                    
-  movdqa      xmm13,xmm11                                                    
-  movaps      [rbp+90h],xmm10                                    
-  movdqa      xmm10,xmm9                                                     
-  movdqa      xmm6,[rax+rdi]                                     
-  punpcklbw   xmm1,xmm3                                                      
-  movaps      [rbp+0A0h],xmm12                                   
-  psubw       xmm13,xmm1                                                     
-  movaps      [rbp+40h],xmm15                                    
-  movdqa      xmm15,xmm14                                                    
-  movaps      [rbp+10h],xmm7                                     
-  movdqa      xmm7,xmm6                                                      
-  punpcklbw   xmm10,xmm3                                                     
-  movdqa      xmm12,[r12+rdi]                                    
-  punpcklbw   xmm7,xmm3                                                      
-  punpcklbw   xmm12,xmm3                                                     
-  punpcklbw   xmm15,xmm3                                                     
-  pabsw       xmm3,xmm13                                                     
-  movdqa      xmm13,xmm10                                                    
-  psubw       xmm13,xmm15                                                    
-  movdqa      [rbp+0F0h],xmm15                                   
-  pabsw       xmm15,xmm13                                                    
-  movdqa      xmm13,xmm11                                                    
-  movdqa      [rbp+0B0h],xmm1                                    
-  movdqa      xmm1,xmm0                                                      
-  pavgw       xmm13,xmm10                                                    
-  pcmpgtw     xmm1,xmm3                                                      
-  movdqa      [rbp+120h],xmm13                                   
-  movaps      xmm13,xmm2                                                     
-  punpcklwd   xmm4,xmm4                                                      
-  movdqa      xmm3,xmm0                                                      
-  movdqa      [rbp+100h],xmm1                                    
-  psubw       xmm13,xmm1                                                     
-  movdqa      xmm1,xmm10                                                     
-  pcmpgtw     xmm3,xmm15                                                     
-  pshufd      xmm4,xmm4,0                                                    
-  psubw       xmm1,xmm11                                                     
-  movdqa      [rbp+0D0h],xmm10                                   
-  psubw       xmm13,xmm3                                                     
-  movdqa      [rbp+110h],xmm3                                    
-  pabsw       xmm15,xmm1                                                     
-  movdqa      xmm3,xmm4                                                      
-  psubw       xmm10,xmm12                                                    
-  pcmpgtw     xmm3,xmm15                                                     
-  pabsw       xmm15,xmm10                                                    
-  movdqa      xmm10,xmm0                                                     
-  psllw       xmm1,2                                                         
-  movdqa      [rbp+0C0h],xmm11                                   
-  psubw       xmm11,xmm7                                                     
-  pcmpgtw     xmm10,xmm15                                                    
-  pabsw       xmm11,xmm11                                                    
-  movdqa      xmm15,xmm0                                                     
-  pand        xmm3,xmm10                                                     
-  pcmpgtw     xmm15,xmm11                                                    
-  movaps      xmm11,xmm2                                                     
-  pxor        xmm10,xmm10                                                    
-  pand        xmm3,xmm15                                                     
-  pcmpgtw     xmm11,xmm10                                                    
-  pcmpeqw     xmm10,xmm2                                                     
-  por         xmm11,xmm10                                                    
-  pand        xmm3,xmm11                                                     
-  movdqa      xmm11,xmm7                                                     
-  psubw       xmm11,xmm12                                                    
-  pxor        xmm15,xmm15                                                    
-  paddw       xmm11,xmm1                                                     
-  psubw       xmm15,xmm13                                                    
-  movdqa      [rbp+0E0h],xmm12                                   
-  paddw       xmm11,[FOUR_16B_SSE2] 
-  pxor        xmm12,xmm12                                                    
-  psraw       xmm11,3                                                        
-  punpckhbw   xmm8,xmm12                                                     
-  pmaxsw      xmm15,xmm11                                                    
-  punpckhbw   xmm5,xmm12                                                     
-  movdqa      xmm11,xmm8                                                     
-  pminsw      xmm13,xmm15                                                    
-  psubw       xmm11,xmm5                                                     
-  punpckhbw   xmm9,xmm12                                                     
-  pand        xmm13,xmm3                                                     
-  movdqa      [rbp+130h],xmm13                                   
-  pabsw       xmm13,xmm11                                                    
-  punpckhbw   xmm14,xmm12                                                    
-  movdqa      xmm11,xmm9                                                     
-  psubw       xmm11,xmm14                                                    
-  movdqa      xmm15,xmm0                                                     
-  movdqa      [rbp+140h],xmm14                                   
-  pabsw       xmm14,xmm11                                                    
-  movdqa      xmm11,xmm8                                                     
-  pcmpgtw     xmm15,xmm14                                                    
-  movdqa      xmm1,[r12+rdi]                                     
-  pavgw       xmm11,xmm9                                                     
-  movdqa      [rbp+170h],xmm11                                   
-  movdqa      xmm10,xmm9                                                     
-  punpckhbw   xmm6,xmm12                                                     
-  psubw       xmm10,xmm8                                                     
-  punpckhbw   xmm1,xmm12                                                     
-  movdqa      xmm12,xmm0                                                     
-  movaps      xmm11,[rbp+0A0h]                                   
-  pcmpgtw     xmm12,xmm13                                                    
-  movaps      xmm13,xmm11                                                    
-  psubw       xmm13,xmm12                                                    
-  movdqa      [rbp+160h],xmm15                                   
-  psubw       xmm13,xmm15                                                    
-  movdqa      xmm15,xmm9                                                     
-  psubw       xmm15,xmm1                                                     
-  movdqa      [rbp+150h],xmm12                                   
-  pabsw       xmm12,xmm10                                                    
-  pabsw       xmm14,xmm15                                                    
-  movdqa      xmm15,xmm8                                                     
-  pcmpgtw     xmm4,xmm12                                                     
-  movdqa      xmm12,xmm0                                                     
-  psubw       xmm15,xmm6                                                     
-  pcmpgtw     xmm12,xmm14                                                    
-  pabsw       xmm14,xmm15                                                    
-  psllw       xmm10,2                                                        
-  pcmpgtw     xmm0,xmm14                                                     
-  movdqa      xmm14,xmm6                                                     
-  psubw       xmm14,xmm1                                                     
-  pand        xmm4,xmm12                                                     
-  paddw       xmm14,xmm10                                                    
-  pand        xmm4,xmm0                                                      
-  paddw       xmm14,[FOUR_16B_SSE2] 
-  pxor        xmm15,xmm15                                                    
-  movaps      xmm12,xmm11                                                    
-  psubw       xmm15,xmm13                                                    
-  pxor        xmm0,xmm0                                                      
-  psraw       xmm14,3                                                        
-  pcmpgtw     xmm12,xmm0                                                     
-  pcmpeqw     xmm0,xmm11                                                     
-  pmaxsw      xmm15,xmm14                                                    
-  por         xmm12,xmm0                                                     
-  movdqa      xmm0,[rbp+120h]                                    
-  pminsw      xmm13,xmm15                                                    
-  movdqa      xmm15,[rbp+0B0h]                                   
-  movdqa      xmm10,xmm7                                                     
-  pand        xmm4,xmm12                                                     
-  paddw       xmm15,xmm0                                                     
-  pxor        xmm12,xmm12                                                    
-  paddw       xmm10,xmm7                                                     
-  movdqa      xmm14,xmm12                                                    
-  psubw       xmm15,xmm10                                                    
-  psubw       xmm14,xmm2                                                     
-  psraw       xmm15,1                                                        
-  pmaxsw      xmm15,xmm14                                                    
-  movdqa      xmm10,xmm6                                                     
-  pminsw      xmm15,xmm2                                                     
-  paddw       xmm10,xmm6                                                     
-  pand        xmm15,xmm3                                                     
-  psubw       xmm12,xmm11                                                    
-  pand        xmm15,[rbp+100h]                                   
-  pand        xmm13,xmm4                                                     
-  paddw       xmm7,xmm15                                                     
-  paddw       xmm8,xmm13                                                     
-  movdqa      xmm15,[rbp+170h]                                   
-  psubw       xmm9,xmm13                                                     
-  paddw       xmm5,xmm15                                                     
-  psubw       xmm5,xmm10                                                     
-  psraw       xmm5,1                                                         
-  pmaxsw      xmm5,xmm12                                                     
-  pminsw      xmm5,xmm11                                                     
-  pand        xmm5,xmm4                                                      
-  pand        xmm5,[rbp+150h]                                    
-  paddw       xmm6,xmm5                                                      
-  movdqa      xmm5,[rbp+0C0h]                                    
-  packuswb    xmm7,xmm6                                                      
-  movdqa      xmm6,[rbp+130h]                                    
-  paddw       xmm5,xmm6                                                      
-  packuswb    xmm5,xmm8                                                      
-  movdqa      xmm8,[rbp+0D0h]                                    
-  psubw       xmm8,xmm6                                                      
-  movdqa      xmm6,[rbp+0F0h]                                    
-  paddw       xmm6,xmm0                                                      
-  movdqa      xmm0,[rbp+0E0h]                                    
-  packuswb    xmm8,xmm9                                                      
-  movdqa      xmm9,xmm0                                                      
-  paddw       xmm9,xmm0                                                      
-  psubw       xmm6,xmm9                                                      
-  psraw       xmm6,1                                                         
-  pmaxsw      xmm14,xmm6                                                     
-  pminsw      xmm2,xmm14                                                     
-  pand        xmm2,xmm3                                                      
-  pand        xmm2,[rbp+110h]                                    
-  paddw       xmm0,xmm2                                                      
-  movdqa      xmm2,[rbp+140h]                                    
-  paddw       xmm2,xmm15                                                     
-  movdqa      xmm15,xmm1                                                     
-  paddw       xmm15,xmm1                                                     
-  psubw       xmm2,xmm15                                                     
-  psraw       xmm2,1                                                         
-  pmaxsw      xmm12,xmm2                                                     
-  pminsw      xmm11,xmm12                                                    
-  pand        xmm11,xmm4                                                     
-  pand        xmm11,[rbp+160h]                                   
-  paddw       xmm1,xmm11                                                     
-  movdqa      [rax+rdi],xmm7                                     
-  movdqa      [r10],xmm5                                         
-  packuswb    xmm0,xmm1                                                      
-  movdqa      [rdi],xmm8                                         
-  movdqa      [r12+rdi],xmm0                                                                        
-  mov         r12,qword [rbp+180h]                                       
-  lea         rsp,[rbp+190h]                                                 
-  pop         rbp                                                            
-  ret 
+  movsxd      rdx,esi
+  sub         r10,r12
+  movsx       r8d,byte [r11]
+  pxor        xmm3,xmm3
+  punpcklwd   xmm2,xmm2
+  movaps      [rbp+50h],xmm14
+  lea         rax,[r12+r12*2]
+  movdqa      xmm14,[rdx+rdi]
+  neg         rax
+  pshufd      xmm0,xmm2,0
+  movd        xmm2,r8d
+  movsx       rsi,byte [r11+1]
+  movsx       r8d,byte [r11+2]
+  movsx       r11d,byte [r11+3]
+  movaps      [rbp+70h],xmm12
+  movd        xmm1,esi
+  movaps      [rbp+80h],xmm11
+  movd        xmm12,r8d
+  movd        xmm11,r11d
+  movdqa      xmm5, [rax+rdi]
+  lea         rax,[r12+r12]
+  punpcklwd   xmm12,xmm12
+  neg         rax
+  punpcklwd   xmm11,xmm11
+  movaps      [rbp],xmm8
+  movdqa      xmm8, [r10]
+  punpcklwd   xmm2,xmm2
+  punpcklwd   xmm1,xmm1
+  punpcklqdq  xmm12,xmm12
+  punpcklqdq  xmm11,xmm11
+  punpcklqdq  xmm2,xmm2
+  punpcklqdq  xmm1,xmm1
+  shufps      xmm12,xmm11,88h
+  movdqa      xmm11,xmm8
+  movaps      [rbp+30h],xmm9
+  movdqa      xmm9,[rdi]
+  shufps      xmm2,xmm1,88h
+  movdqa      xmm1,xmm5
+  punpcklbw   xmm11,xmm3
+  movaps      [rbp+20h],xmm6
+  movaps      [rbp+60h],xmm13
+  movdqa      xmm13,xmm11
+  movaps      [rbp+90h],xmm10
+  movdqa      xmm10,xmm9
+  movdqa      xmm6,[rax+rdi]
+  punpcklbw   xmm1,xmm3
+  movaps      [rbp+0A0h],xmm12
+  psubw       xmm13,xmm1
+  movaps      [rbp+40h],xmm15
+  movdqa      xmm15,xmm14
+  movaps      [rbp+10h],xmm7
+  movdqa      xmm7,xmm6
+  punpcklbw   xmm10,xmm3
+  movdqa      xmm12,[r12+rdi]
+  punpcklbw   xmm7,xmm3
+  punpcklbw   xmm12,xmm3
+  punpcklbw   xmm15,xmm3
+  pabsw       xmm3,xmm13
+  movdqa      xmm13,xmm10
+  psubw       xmm13,xmm15
+  movdqa      [rbp+0F0h],xmm15
+  pabsw       xmm15,xmm13
+  movdqa      xmm13,xmm11
+  movdqa      [rbp+0B0h],xmm1
+  movdqa      xmm1,xmm0
+  pavgw       xmm13,xmm10
+  pcmpgtw     xmm1,xmm3
+  movdqa      [rbp+120h],xmm13
+  movaps      xmm13,xmm2
+  punpcklwd   xmm4,xmm4
+  movdqa      xmm3,xmm0
+  movdqa      [rbp+100h],xmm1
+  psubw       xmm13,xmm1
+  movdqa      xmm1,xmm10
+  pcmpgtw     xmm3,xmm15
+  pshufd      xmm4,xmm4,0
+  psubw       xmm1,xmm11
+  movdqa      [rbp+0D0h],xmm10
+  psubw       xmm13,xmm3
+  movdqa      [rbp+110h],xmm3
+  pabsw       xmm15,xmm1
+  movdqa      xmm3,xmm4
+  psubw       xmm10,xmm12
+  pcmpgtw     xmm3,xmm15
+  pabsw       xmm15,xmm10
+  movdqa      xmm10,xmm0
+  psllw       xmm1,2
+  movdqa      [rbp+0C0h],xmm11
+  psubw       xmm11,xmm7
+  pcmpgtw     xmm10,xmm15
+  pabsw       xmm11,xmm11
+  movdqa      xmm15,xmm0
+  pand        xmm3,xmm10
+  pcmpgtw     xmm15,xmm11
+  movaps      xmm11,xmm2
+  pxor        xmm10,xmm10
+  pand        xmm3,xmm15
+  pcmpgtw     xmm11,xmm10
+  pcmpeqw     xmm10,xmm2
+  por         xmm11,xmm10
+  pand        xmm3,xmm11
+  movdqa      xmm11,xmm7
+  psubw       xmm11,xmm12
+  pxor        xmm15,xmm15
+  paddw       xmm11,xmm1
+  psubw       xmm15,xmm13
+  movdqa      [rbp+0E0h],xmm12
+  paddw       xmm11,[FOUR_16B_SSE2]
+  pxor        xmm12,xmm12
+  psraw       xmm11,3
+  punpckhbw   xmm8,xmm12
+  pmaxsw      xmm15,xmm11
+  punpckhbw   xmm5,xmm12
+  movdqa      xmm11,xmm8
+  pminsw      xmm13,xmm15
+  psubw       xmm11,xmm5
+  punpckhbw   xmm9,xmm12
+  pand        xmm13,xmm3
+  movdqa      [rbp+130h],xmm13
+  pabsw       xmm13,xmm11
+  punpckhbw   xmm14,xmm12
+  movdqa      xmm11,xmm9
+  psubw       xmm11,xmm14
+  movdqa      xmm15,xmm0
+  movdqa      [rbp+140h],xmm14
+  pabsw       xmm14,xmm11
+  movdqa      xmm11,xmm8
+  pcmpgtw     xmm15,xmm14
+  movdqa      xmm1,[r12+rdi]
+  pavgw       xmm11,xmm9
+  movdqa      [rbp+170h],xmm11
+  movdqa      xmm10,xmm9
+  punpckhbw   xmm6,xmm12
+  psubw       xmm10,xmm8
+  punpckhbw   xmm1,xmm12
+  movdqa      xmm12,xmm0
+  movaps      xmm11,[rbp+0A0h]
+  pcmpgtw     xmm12,xmm13
+  movaps      xmm13,xmm11
+  psubw       xmm13,xmm12
+  movdqa      [rbp+160h],xmm15
+  psubw       xmm13,xmm15
+  movdqa      xmm15,xmm9
+  psubw       xmm15,xmm1
+  movdqa      [rbp+150h],xmm12
+  pabsw       xmm12,xmm10
+  pabsw       xmm14,xmm15
+  movdqa      xmm15,xmm8
+  pcmpgtw     xmm4,xmm12
+  movdqa      xmm12,xmm0
+  psubw       xmm15,xmm6
+  pcmpgtw     xmm12,xmm14
+  pabsw       xmm14,xmm15
+  psllw       xmm10,2
+  pcmpgtw     xmm0,xmm14
+  movdqa      xmm14,xmm6
+  psubw       xmm14,xmm1
+  pand        xmm4,xmm12
+  paddw       xmm14,xmm10
+  pand        xmm4,xmm0
+  paddw       xmm14,[FOUR_16B_SSE2]
+  pxor        xmm15,xmm15
+  movaps      xmm12,xmm11
+  psubw       xmm15,xmm13
+  pxor        xmm0,xmm0
+  psraw       xmm14,3
+  pcmpgtw     xmm12,xmm0
+  pcmpeqw     xmm0,xmm11
+  pmaxsw      xmm15,xmm14
+  por         xmm12,xmm0
+  movdqa      xmm0,[rbp+120h]
+  pminsw      xmm13,xmm15
+  movdqa      xmm15,[rbp+0B0h]
+  movdqa      xmm10,xmm7
+  pand        xmm4,xmm12
+  paddw       xmm15,xmm0
+  pxor        xmm12,xmm12
+  paddw       xmm10,xmm7
+  movdqa      xmm14,xmm12
+  psubw       xmm15,xmm10
+  psubw       xmm14,xmm2
+  psraw       xmm15,1
+  pmaxsw      xmm15,xmm14
+  movdqa      xmm10,xmm6
+  pminsw      xmm15,xmm2
+  paddw       xmm10,xmm6
+  pand        xmm15,xmm3
+  psubw       xmm12,xmm11
+  pand        xmm15,[rbp+100h]
+  pand        xmm13,xmm4
+  paddw       xmm7,xmm15
+  paddw       xmm8,xmm13
+  movdqa      xmm15,[rbp+170h]
+  psubw       xmm9,xmm13
+  paddw       xmm5,xmm15
+  psubw       xmm5,xmm10
+  psraw       xmm5,1
+  pmaxsw      xmm5,xmm12
+  pminsw      xmm5,xmm11
+  pand        xmm5,xmm4
+  pand        xmm5,[rbp+150h]
+  paddw       xmm6,xmm5
+  movdqa      xmm5,[rbp+0C0h]
+  packuswb    xmm7,xmm6
+  movdqa      xmm6,[rbp+130h]
+  paddw       xmm5,xmm6
+  packuswb    xmm5,xmm8
+  movdqa      xmm8,[rbp+0D0h]
+  psubw       xmm8,xmm6
+  movdqa      xmm6,[rbp+0F0h]
+  paddw       xmm6,xmm0
+  movdqa      xmm0,[rbp+0E0h]
+  packuswb    xmm8,xmm9
+  movdqa      xmm9,xmm0
+  paddw       xmm9,xmm0
+  psubw       xmm6,xmm9
+  psraw       xmm6,1
+  pmaxsw      xmm14,xmm6
+  pminsw      xmm2,xmm14
+  pand        xmm2,xmm3
+  pand        xmm2,[rbp+110h]
+  paddw       xmm0,xmm2
+  movdqa      xmm2,[rbp+140h]
+  paddw       xmm2,xmm15
+  movdqa      xmm15,xmm1
+  paddw       xmm15,xmm1
+  psubw       xmm2,xmm15
+  psraw       xmm2,1
+  pmaxsw      xmm12,xmm2
+  pminsw      xmm11,xmm12
+  pand        xmm11,xmm4
+  pand        xmm11,[rbp+160h]
+  paddw       xmm1,xmm11
+  movdqa      [rax+rdi],xmm7
+  movdqa      [r10],xmm5
+  packuswb    xmm0,xmm1
+  movdqa      [rdi],xmm8
+  movdqa      [r12+rdi],xmm0
+  mov         r12,qword [rbp+180h]
+  lea         rsp,[rbp+190h]
+  pop         rbp
+  ret
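
That closes the WIN64 body of DeblockLumaLt4V_sse2. For orientation while reading the hunk: the routine vectorizes the H.264 bS<4 luma edge filter over 16 pixel columns at once — the pabsw/pcmpgtw chains build the |p0−q0|<α, |p1−p0|<β, |q1−q0|<β masks, and the psllw/psraw/pmaxsw/pminsw run is the tC-clipped delta. Below is a minimal scalar sketch of that arithmetic, assuming the textbook form from H.264 §8.7.2.3; clip3 and the sample names are illustrative, not openh264 identifiers.

    #include <stdint.h>
    #include <stdlib.h>

    static int clip3(int lo, int hi, int v) { return v < lo ? lo : v > hi ? hi : v; }

    /* One luma sample position across the edge, bS < 4.
     * px = { p2, p1, p0, q0, q1, q2 }, tc0 from the bS/QP table. */
    static void luma_lt4(uint8_t px[6], int alpha, int beta, int tc0)
    {
        int p2 = px[0], p1 = px[1], p0 = px[2], q0 = px[3], q1 = px[4], q2 = px[5];
        if (abs(p0 - q0) >= alpha || abs(p1 - p0) >= beta || abs(q1 - q0) >= beta)
            return;                                   /* masks all zero: pixels untouched */
        int ap = abs(p2 - p0), aq = abs(q2 - q0);
        int tc = tc0 + (ap < beta) + (aq < beta);     /* widen the clip per filtered side */
        int d  = clip3(-tc, tc, (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3);
        px[2] = (uint8_t)clip3(0, 255, p0 + d);       /* packuswb does this clamp in asm */
        px[3] = (uint8_t)clip3(0, 255, q0 - d);
        if (ap < beta)                                /* inner taps, clipped to +-tc0 */
            px[1] = (uint8_t)(p1 + clip3(-tc0, tc0, (p2 + ((p0 + q0 + 1) >> 1) - 2 * p1) >> 1));
        if (aq < beta)
            px[4] = (uint8_t)(q1 + clip3(-tc0, tc0, (q2 + ((p0 + q0 + 1) >> 1) - 2 * q1) >> 1));
    }
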
 
 
 WELS_EXTERN DeblockLumaEq4V_sse2
@@ -1907,637 +1907,637 @@
 
 ALIGN  16
 DeblockLumaEq4V_sse2:
-  mov         rax,rsp 
-  push        rbx  
-  push        rbp   
+  mov         rax,rsp
+  push        rbx
+  push        rbp
   mov         r8,   rdx
   mov         r9,   rcx
   mov         rcx,  rdi
   mov         rdx,  rsi
-  sub         rsp,1D8h 
-  movaps      [rax-38h],xmm6 
-  movaps      [rax-48h],xmm7 
-  movaps      [rax-58h],xmm8 
-  pxor        xmm1,xmm1 
-  movsxd      r10,edx 
-  mov         rbp,rcx 
-  mov         r11d,r8d 
-  mov         rdx,rcx 
-  mov         rdi,rbp 
-  mov         rbx,rbp 
-  movdqa      xmm5,[rbp] 
-  movaps      [rax-68h],xmm9 
-  movaps      [rax-78h],xmm10 
-  punpcklbw   xmm5,xmm1 
-  movaps      [rax-88h],xmm11 
-  movaps      [rax-98h],xmm12 
-  movaps      [rax-0A8h],xmm13 
-  movaps      [rax-0B8h],xmm14 
-  movdqa      xmm14,[r10+rbp] 
-  movaps      [rax-0C8h],xmm15 
-  lea         eax,[r10*4] 
-  movsxd      r8,eax 
-  lea         eax,[r10+r10*2] 
-  movsxd      rcx,eax 
-  lea         eax,[r10+r10] 
-  sub         rdx,r8 
-  punpcklbw   xmm14,xmm1 
-  movdqa      [rsp+90h],xmm5 
-  movdqa      [rsp+30h],xmm14 
-  movsxd      rsi,eax 
-  movsx       eax,r11w 
-  sub         rdi,rcx 
-  sub         rbx,rsi 
-  mov         r8,rbp 
-  sub         r8,r10 
-  movd        xmm0,eax 
-  movsx       eax,r9w 
-  movdqa      xmm12,[rdi] 
-  movdqa      xmm6, [rsi+rbp] 
-  movdqa      xmm13,[rbx] 
-  punpcklwd   xmm0,xmm0 
-  pshufd      xmm11,xmm0,0 
-  punpcklbw   xmm13,xmm1 
-  punpcklbw   xmm6,xmm1 
-  movdqa      xmm8,[r8] 
-  movd        xmm0,eax 
-  movdqa      xmm10,xmm11 
-  mov         eax,2 
-  punpcklbw   xmm8,xmm1 
-  punpcklbw   xmm12,xmm1 
-  cwde             
-  punpcklwd   xmm0,xmm0 
-  psraw       xmm10,2 
-  movdqa      xmm1,xmm8 
-  movdqa      [rsp+0F0h],xmm13 
-  movdqa      [rsp+0B0h],xmm8 
-  pshufd      xmm7,xmm0,0 
-  psubw       xmm1,xmm13 
-  movdqa      xmm0,xmm5 
-  movdqa      xmm4,xmm7 
-  movdqa      xmm2,xmm7 
-  psubw       xmm0,xmm8 
-  pabsw       xmm3,xmm0 
-  pabsw       xmm0,xmm1 
-  movdqa      xmm1,xmm5 
-  movdqa      [rsp+40h],xmm7 
-  movdqa      [rsp+60h],xmm6 
-  pcmpgtw     xmm4,xmm0 
-  psubw       xmm1,xmm14 
-  pabsw       xmm0,xmm1 
-  pcmpgtw     xmm2,xmm0 
-  pand        xmm4,xmm2 
-  movdqa      xmm0,xmm11 
-  pcmpgtw     xmm0,xmm3 
-  pand        xmm4,xmm0 
-  movd        xmm0,eax 
-  movdqa      [rsp+20h],xmm4 
-  punpcklwd   xmm0,xmm0 
-  pshufd      xmm2,xmm0,0 
-  paddw       xmm10,xmm2 
-  movdqa      [rsp+0A0h],xmm2 
-  movdqa      xmm15,xmm7 
-  pxor        xmm4,xmm4 
-  movdqa      xmm0,xmm8 
-  psubw       xmm0,xmm12 
-  mov         eax,4 
-  pabsw       xmm0,xmm0 
-  movdqa      xmm1,xmm10 
-  cwde             
-  pcmpgtw     xmm15,xmm0 
-  pcmpgtw     xmm1,xmm3 
-  movdqa      xmm3,xmm7 
-  movdqa      xmm7,[rdx] 
-  movdqa      xmm0,xmm5 
-  psubw       xmm0,xmm6 
-  pand        xmm15,xmm1 
-  punpcklbw   xmm7,xmm4 
-  movdqa      xmm9,xmm15 
-  pabsw       xmm0,xmm0 
-  psllw       xmm7,1 
-  pandn       xmm9,xmm12 
-  pcmpgtw     xmm3,xmm0 
-  paddw       xmm7,xmm12 
-  movd        xmm0,eax 
-  pand        xmm3,xmm1 
-  paddw       xmm7,xmm12 
-  punpcklwd   xmm0,xmm0 
-  paddw       xmm7,xmm12 
-  pshufd      xmm1,xmm0,0 
-  paddw       xmm7,xmm13 
-  movdqa      xmm0,xmm3 
-  pandn       xmm0,xmm6 
-  paddw       xmm7,xmm8 
-  movdqa      [rsp+70h],xmm1 
-  paddw       xmm7,xmm5 
-  movdqa      [rsp+120h],xmm0 
-  movdqa      xmm0,[rcx+rbp] 
-  punpcklbw   xmm0,xmm4 
-  paddw       xmm7,xmm1 
-  movdqa      xmm4,xmm15 
-  psllw       xmm0,1 
-  psraw       xmm7,3 
-  paddw       xmm0,xmm6 
-  pand        xmm7,xmm15 
-  paddw       xmm0,xmm6 
-  paddw       xmm0,xmm6 
-  paddw       xmm0,xmm14 
-  movdqa      xmm6,xmm15 
-  paddw       xmm0,xmm5 
-  pandn       xmm6,xmm13 
-  paddw       xmm0,xmm8 
-  paddw       xmm0,xmm1 
-  psraw       xmm0,3 
-  movdqa      xmm1,xmm12 
-  paddw       xmm1,xmm13 
-  pand        xmm0,xmm3 
-  movdqa      [rsp+100h],xmm0 
-  movdqa      xmm0,xmm8 
-  paddw       xmm0,xmm5 
-  paddw       xmm1,xmm0 
-  movdqa      xmm0,xmm3 
-  paddw       xmm1,xmm2 
-  psraw       xmm1,2 
-  pandn       xmm0,xmm14 
-  pand        xmm4,xmm1 
-  movdqa      [rsp+0E0h],xmm0 
-  movdqa      xmm0,xmm5 
-  paddw       xmm0,xmm8 
-  movdqa      xmm1,[rsp+60h] 
-  paddw       xmm1,xmm14 
-  movdqa      xmm14,xmm3 
-  paddw       xmm1,xmm0 
-  movdqa      xmm0,xmm8 
-  paddw       xmm0,[rsp+30h] 
-  paddw       xmm1,xmm2 
-  psraw       xmm1,2 
-  pand        xmm14,xmm1 
-  movdqa      xmm1,xmm13 
-  paddw       xmm1,xmm13 
-  paddw       xmm1,xmm0 
-  paddw       xmm1,xmm2 
-  psraw       xmm1,2 
-  movdqa      xmm0,[rsp+30h] 
-  movdqa      xmm2,xmm13 
-  movdqa      xmm5,xmm15 
-  paddw       xmm0,[rsp+70h] 
-  pandn       xmm5,xmm1 
-  paddw       xmm2,xmm8 
-  movdqa      xmm8,[rsp+90h] 
-  movdqa      xmm1,xmm12 
-  paddw       xmm2,xmm8 
-  psllw       xmm2,1 
-  paddw       xmm2,xmm0 
-  paddw       xmm1,xmm2 
-  movdqa      xmm0,xmm8 
-  movdqa      xmm8,xmm3 
-  movdqa      xmm2,[rsp+30h] 
-  paddw       xmm0,xmm13 
-  psraw       xmm1,3 
-  pand        xmm15,xmm1 
-  movdqa      xmm1,xmm2 
-  paddw       xmm1,xmm2 
-  paddw       xmm2,[rsp+90h] 
-  paddw       xmm2,[rsp+0B0h] 
-  paddw       xmm1,xmm0 
-  movdqa      xmm0,xmm13 
-  movdqa      xmm13,[r8] 
-  paddw       xmm0, [rsp+70h] 
-  paddw       xmm1, [rsp+0A0h] 
-  psllw       xmm2,1 
-  paddw       xmm2,xmm0 
-  psraw       xmm1,2 
-  movdqa      xmm0, [rdi] 
-  pandn       xmm8,xmm1 
-  movdqa      xmm1, [rsp+60h] 
-  paddw       xmm1,xmm2 
-  movdqa      xmm2, [rbx] 
-  psraw       xmm1,3 
-  pand        xmm3,xmm1 
-  movdqa      xmm1, [rbp] 
-  movdqa      [rsp+0D0h],xmm3 
-  pxor        xmm3,xmm3 
-  punpckhbw   xmm0,xmm3 
-  punpckhbw   xmm1,xmm3 
-  punpckhbw   xmm13,xmm3 
-  movdqa      [rsp+0C0h],xmm0 
-  movdqa      xmm0,[r10+rbp] 
-  movdqa      [rsp],xmm1 
-  punpckhbw   xmm0,xmm3 
-  punpckhbw   xmm2,xmm3 
-  movdqa      [rsp+80h],xmm0 
-  movdqa      xmm0,[rsi+rbp] 
-  movdqa      [rsp+10h],xmm13 
-  punpckhbw   xmm0,xmm3 
-  movdqa      [rsp+50h],xmm0 
-  movdqa      xmm0,xmm1 
-  movdqa      xmm1,xmm13 
-  psubw       xmm0,xmm13 
-  psubw       xmm1,xmm2 
-  pabsw       xmm3,xmm0 
-  pabsw       xmm0,xmm1 
-  movdqa      xmm1,[rsp] 
-  movdqa      xmm13,[rsp+40h] 
-  movdqa      [rsp+110h],xmm2 
-  psubw       xmm1, [rsp+80h] 
-  pcmpgtw     xmm13,xmm0 
-  pcmpgtw     xmm11,xmm3 
-  pabsw       xmm0,xmm1 
-  pcmpgtw     xmm10,xmm3 
-  movdqa      xmm1, [rsp+40h] 
-  movdqa      xmm2,xmm1 
-  movdqa      xmm3,xmm1 
-  pcmpgtw     xmm2,xmm0 
-  movdqa      xmm0, [rsp+10h] 
-  pand        xmm13,xmm2 
-  pand        xmm13,xmm11 
-  movdqa      xmm11,[rsp+0C0h] 
-  psubw       xmm0,xmm11 
-  pabsw       xmm0,xmm0 
-  pcmpgtw     xmm3,xmm0 
-  pand        xmm3,xmm10 
-  movdqa      xmm0,[rsp] 
-  psubw       xmm0,[rsp+50h] 
-  movdqa      xmm2,[rdx] 
-  pabsw       xmm0,xmm0 
-  por         xmm7,xmm9 
-  movdqa      xmm9,[rsp+20h] 
-  pcmpgtw     xmm1,xmm0 
-  pand        xmm9,xmm7 
-  movdqa      xmm7,[rsp+20h] 
-  movdqa      xmm0,xmm7 
-  pandn       xmm0,xmm12 
-  movdqa      xmm12,[rsp+110h] 
-  pand        xmm1,xmm10 
-  movdqa      xmm10,[rsp+70h] 
-  movdqa      [rsp+40h],xmm1 
-  movdqa      xmm1,xmm13 
-  por         xmm9,xmm0 
-  pxor        xmm0,xmm0 
-  por         xmm4,xmm6 
-  movdqa      xmm6,xmm7 
-  punpckhbw   xmm2,xmm0 
-  por         xmm15,xmm5 
-  movdqa      xmm5,[rsp+20h] 
-  movdqa      xmm0,xmm3 
-  psllw       xmm2,1 
-  pandn       xmm0,xmm11 
-  pand        xmm6,xmm4 
-  movdqa      xmm4,[rsp] 
-  paddw       xmm2,xmm11 
-  pand        xmm5,xmm15 
-  movdqa      xmm15,[rsp+20h] 
-  paddw       xmm2,xmm11 
-  paddw       xmm2,xmm11 
-  paddw       xmm2,xmm12 
-  paddw       xmm2,[rsp+10h] 
-  paddw       xmm2,[rsp] 
-  paddw       xmm2,xmm10 
-  psraw       xmm2,3 
-  pand        xmm2,xmm3 
-  por         xmm2,xmm0 
-  pand        xmm1,xmm2 
-  movdqa      xmm0,xmm13 
-  movdqa      xmm2,xmm11 
-  pandn       xmm0,xmm11 
-  paddw       xmm2,xmm12 
-  por         xmm1,xmm0 
-  packuswb    xmm9,xmm1 
-  movdqa      xmm0,xmm7 
-  movdqa      xmm7,[rsp+0A0h] 
-  pandn       xmm0,[rsp+0F0h] 
-  movdqa      xmm1,xmm3 
-  por         xmm6,xmm0 
-  movdqa      xmm0,[rsp+10h] 
-  paddw       xmm0,xmm4 
-  paddw       xmm2,xmm0 
-  paddw       xmm2,xmm7 
-  movdqa      xmm0,xmm3 
-  pandn       xmm0,xmm12 
-  psraw       xmm2,2 
-  pand        xmm1,xmm2 
-  por         xmm1,xmm0 
-  movdqa      xmm2,xmm13 
-  movdqa      xmm0,xmm13 
-  pand        xmm2,xmm1 
-  pandn       xmm0,xmm12 
-  movdqa      xmm1,xmm12 
-  paddw       xmm1,[rsp+10h] 
-  por         xmm2,xmm0 
-  movdqa      xmm0,xmm15 
-  pandn       xmm0,[rsp+0B0h] 
-  paddw       xmm1,xmm4 
-  packuswb    xmm6,xmm2 
-  movdqa      xmm2,xmm3 
-  psllw       xmm1,1 
-  por         xmm5,xmm0 
-  movdqa      xmm0,[rsp+80h] 
-  paddw       xmm0,xmm10 
-  paddw       xmm1,xmm0 
-  paddw       xmm11,xmm1 
-  psraw       xmm11,3 
-  movdqa      xmm1,xmm12 
-  pand        xmm2,xmm11 
-  paddw       xmm1,xmm12 
-  movdqa      xmm11,[rsp+80h] 
-  movdqa      xmm0, [rsp+10h] 
-  por         xmm14,[rsp+0E0h] 
-  paddw       xmm0,xmm11 
-  movdqa      xmm4,xmm15 
-  paddw       xmm1,xmm0 
-  movdqa      xmm0,xmm13 
-  paddw       xmm1,xmm7 
-  psraw       xmm1,2 
-  pandn       xmm3,xmm1 
-  por         xmm2,xmm3 
-  movdqa      xmm1,xmm13 
-  movdqa      xmm3,[rsp+10h] 
-  pandn       xmm0,xmm3 
-  pand        xmm1,xmm2 
-  movdqa      xmm2,xmm11 
-  paddw       xmm2,[rsp] 
-  por         xmm1,xmm0 
-  movdqa      xmm0,[rsp+0D0h] 
-  por         xmm0,xmm8 
-  paddw       xmm2,xmm3 
-  packuswb    xmm5,xmm1 
-  movdqa      xmm8,[rsp+40h] 
-  movdqa      xmm1,[rsp+50h] 
-  movdqa      xmm3,xmm8 
-  pand        xmm4,xmm0 
-  psllw       xmm2,1 
-  movdqa      xmm0,xmm15 
-  pandn       xmm0,[rsp+90h] 
-  por         xmm4,xmm0 
-  movdqa      xmm0,xmm12 
-  paddw       xmm0,xmm10 
-  paddw       xmm2,xmm0 
-  paddw       xmm1,xmm2 
-  movdqa      xmm0,[rsp] 
-  movdqa      xmm2,xmm11 
-  paddw       xmm0,xmm12 
-  movdqa      xmm12,[rsp] 
-  paddw       xmm2,xmm11 
-  paddw       xmm2,xmm0 
-  psraw       xmm1,3 
-  movdqa      xmm0,xmm8 
-  pand        xmm3,xmm1 
-  paddw       xmm2,xmm7 
-  movdqa      xmm1,xmm13 
-  psraw       xmm2,2 
-  pandn       xmm0,xmm2 
-  por         xmm3,xmm0 
-  movdqa      xmm2,[rsp+50h] 
-  movdqa      xmm0,xmm13 
-  pandn       xmm0,xmm12 
-  pand        xmm1,xmm3 
-  paddw       xmm2,xmm11 
-  movdqa      xmm3,xmm15 
-  por         xmm1,xmm0 
-  pand        xmm3,xmm14 
-  movdqa      xmm14,[rsp+10h] 
-  movdqa      xmm0,xmm15 
-  pandn       xmm0,[rsp+30h] 
-  packuswb    xmm4,xmm1 
-  movdqa      xmm1,xmm8 
-  por         xmm3,xmm0 
-  movdqa      xmm0,xmm12 
-  paddw       xmm0,xmm14 
-  paddw       xmm2,xmm0 
-  paddw       xmm2,xmm7 
-  movdqa      xmm0,xmm8 
-  pandn       xmm0,xmm11 
-  psraw       xmm2,2 
-  pand        xmm1,xmm2 
-  por         xmm1,xmm0 
-  movdqa      xmm2,xmm13 
-  movdqa      xmm0,xmm13 
-  pandn       xmm0,xmm11 
-  pand        xmm2,xmm1 
-  movdqa      xmm1,xmm15 
-  por         xmm2,xmm0 
-  packuswb    xmm3,xmm2 
-  movdqa      xmm0,[rsp+100h] 
-  por         xmm0,[rsp+120h] 
-  pand        xmm1,xmm0 
-  movdqa      xmm2,[rcx+rbp] 
-  movdqa      xmm7,[rsp+50h] 
-  pandn       xmm15,[rsp+60h] 
-  lea         r11,[rsp+1D8h] 
-  pxor        xmm0,xmm0 
-  por         xmm1,xmm15 
-  movaps      xmm15,[r11-0A8h] 
-  movdqa      [rdi],xmm9 
-  movaps      xmm9,[r11-48h] 
-  punpckhbw   xmm2,xmm0 
-  psllw       xmm2,1 
-  paddw       xmm2,xmm7 
-  paddw       xmm2,xmm7 
-  movdqa      [rbx],xmm6 
-  movaps      xmm6,[r11-18h] 
-  paddw       xmm2,xmm7 
-  paddw       xmm2,xmm11 
-  movaps      xmm11,[r11-68h] 
-  paddw       xmm2,xmm12 
-  movaps      xmm12,[r11-78h] 
-  paddw       xmm2,xmm14 
-  paddw       xmm2,xmm10 
-  psraw       xmm2,3 
-  movaps      xmm10,[r11-58h] 
-  movaps      xmm14,[r11-98h] 
-  movdqa      xmm0,xmm13 
-  pand        xmm2,xmm8 
-  pandn       xmm8,xmm7 
-  pandn       xmm13,xmm7 
-  por         xmm2,xmm8 
-  movaps      xmm7,[r11-28h] 
-  movaps      xmm8,[r11-38h] 
-  movdqa      [r8],xmm5 
-  pand        xmm0,xmm2 
-  por         xmm0,xmm13 
-  packuswb    xmm1,xmm0 
-  movaps      xmm13,[r11-88h] 
-  movdqa      [rbp],xmm4 
-  movdqa      [r10+rbp],xmm3 
-  movdqa      [rsi+rbp],xmm1 
-  mov         rsp,r11   
-  pop         rbp  
-  pop         rbx  
+  sub         rsp,1D8h
+  movaps      [rax-38h],xmm6
+  movaps      [rax-48h],xmm7
+  movaps      [rax-58h],xmm8
+  pxor        xmm1,xmm1
+  movsxd      r10,edx
+  mov         rbp,rcx
+  mov         r11d,r8d
+  mov         rdx,rcx
+  mov         rdi,rbp
+  mov         rbx,rbp
+  movdqa      xmm5,[rbp]
+  movaps      [rax-68h],xmm9
+  movaps      [rax-78h],xmm10
+  punpcklbw   xmm5,xmm1
+  movaps      [rax-88h],xmm11
+  movaps      [rax-98h],xmm12
+  movaps      [rax-0A8h],xmm13
+  movaps      [rax-0B8h],xmm14
+  movdqa      xmm14,[r10+rbp]
+  movaps      [rax-0C8h],xmm15
+  lea         eax,[r10*4]
+  movsxd      r8,eax
+  lea         eax,[r10+r10*2]
+  movsxd      rcx,eax
+  lea         eax,[r10+r10]
+  sub         rdx,r8
+  punpcklbw   xmm14,xmm1
+  movdqa      [rsp+90h],xmm5
+  movdqa      [rsp+30h],xmm14
+  movsxd      rsi,eax
+  movsx       eax,r11w
+  sub         rdi,rcx
+  sub         rbx,rsi
+  mov         r8,rbp
+  sub         r8,r10
+  movd        xmm0,eax
+  movsx       eax,r9w
+  movdqa      xmm12,[rdi]
+  movdqa      xmm6, [rsi+rbp]
+  movdqa      xmm13,[rbx]
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm11,xmm0,0
+  punpcklbw   xmm13,xmm1
+  punpcklbw   xmm6,xmm1
+  movdqa      xmm8,[r8]
+  movd        xmm0,eax
+  movdqa      xmm10,xmm11
+  mov         eax,2
+  punpcklbw   xmm8,xmm1
+  punpcklbw   xmm12,xmm1
+  cwde
+  punpcklwd   xmm0,xmm0
+  psraw       xmm10,2
+  movdqa      xmm1,xmm8
+  movdqa      [rsp+0F0h],xmm13
+  movdqa      [rsp+0B0h],xmm8
+  pshufd      xmm7,xmm0,0
+  psubw       xmm1,xmm13
+  movdqa      xmm0,xmm5
+  movdqa      xmm4,xmm7
+  movdqa      xmm2,xmm7
+  psubw       xmm0,xmm8
+  pabsw       xmm3,xmm0
+  pabsw       xmm0,xmm1
+  movdqa      xmm1,xmm5
+  movdqa      [rsp+40h],xmm7
+  movdqa      [rsp+60h],xmm6
+  pcmpgtw     xmm4,xmm0
+  psubw       xmm1,xmm14
+  pabsw       xmm0,xmm1
+  pcmpgtw     xmm2,xmm0
+  pand        xmm4,xmm2
+  movdqa      xmm0,xmm11
+  pcmpgtw     xmm0,xmm3
+  pand        xmm4,xmm0
+  movd        xmm0,eax
+  movdqa      [rsp+20h],xmm4
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm2,xmm0,0
+  paddw       xmm10,xmm2
+  movdqa      [rsp+0A0h],xmm2
+  movdqa      xmm15,xmm7
+  pxor        xmm4,xmm4
+  movdqa      xmm0,xmm8
+  psubw       xmm0,xmm12
+  mov         eax,4
+  pabsw       xmm0,xmm0
+  movdqa      xmm1,xmm10
+  cwde
+  pcmpgtw     xmm15,xmm0
+  pcmpgtw     xmm1,xmm3
+  movdqa      xmm3,xmm7
+  movdqa      xmm7,[rdx]
+  movdqa      xmm0,xmm5
+  psubw       xmm0,xmm6
+  pand        xmm15,xmm1
+  punpcklbw   xmm7,xmm4
+  movdqa      xmm9,xmm15
+  pabsw       xmm0,xmm0
+  psllw       xmm7,1
+  pandn       xmm9,xmm12
+  pcmpgtw     xmm3,xmm0
+  paddw       xmm7,xmm12
+  movd        xmm0,eax
+  pand        xmm3,xmm1
+  paddw       xmm7,xmm12
+  punpcklwd   xmm0,xmm0
+  paddw       xmm7,xmm12
+  pshufd      xmm1,xmm0,0
+  paddw       xmm7,xmm13
+  movdqa      xmm0,xmm3
+  pandn       xmm0,xmm6
+  paddw       xmm7,xmm8
+  movdqa      [rsp+70h],xmm1
+  paddw       xmm7,xmm5
+  movdqa      [rsp+120h],xmm0
+  movdqa      xmm0,[rcx+rbp]
+  punpcklbw   xmm0,xmm4
+  paddw       xmm7,xmm1
+  movdqa      xmm4,xmm15
+  psllw       xmm0,1
+  psraw       xmm7,3
+  paddw       xmm0,xmm6
+  pand        xmm7,xmm15
+  paddw       xmm0,xmm6
+  paddw       xmm0,xmm6
+  paddw       xmm0,xmm14
+  movdqa      xmm6,xmm15
+  paddw       xmm0,xmm5
+  pandn       xmm6,xmm13
+  paddw       xmm0,xmm8
+  paddw       xmm0,xmm1
+  psraw       xmm0,3
+  movdqa      xmm1,xmm12
+  paddw       xmm1,xmm13
+  pand        xmm0,xmm3
+  movdqa      [rsp+100h],xmm0
+  movdqa      xmm0,xmm8
+  paddw       xmm0,xmm5
+  paddw       xmm1,xmm0
+  movdqa      xmm0,xmm3
+  paddw       xmm1,xmm2
+  psraw       xmm1,2
+  pandn       xmm0,xmm14
+  pand        xmm4,xmm1
+  movdqa      [rsp+0E0h],xmm0
+  movdqa      xmm0,xmm5
+  paddw       xmm0,xmm8
+  movdqa      xmm1,[rsp+60h]
+  paddw       xmm1,xmm14
+  movdqa      xmm14,xmm3
+  paddw       xmm1,xmm0
+  movdqa      xmm0,xmm8
+  paddw       xmm0,[rsp+30h]
+  paddw       xmm1,xmm2
+  psraw       xmm1,2
+  pand        xmm14,xmm1
+  movdqa      xmm1,xmm13
+  paddw       xmm1,xmm13
+  paddw       xmm1,xmm0
+  paddw       xmm1,xmm2
+  psraw       xmm1,2
+  movdqa      xmm0,[rsp+30h]
+  movdqa      xmm2,xmm13
+  movdqa      xmm5,xmm15
+  paddw       xmm0,[rsp+70h]
+  pandn       xmm5,xmm1
+  paddw       xmm2,xmm8
+  movdqa      xmm8,[rsp+90h]
+  movdqa      xmm1,xmm12
+  paddw       xmm2,xmm8
+  psllw       xmm2,1
+  paddw       xmm2,xmm0
+  paddw       xmm1,xmm2
+  movdqa      xmm0,xmm8
+  movdqa      xmm8,xmm3
+  movdqa      xmm2,[rsp+30h]
+  paddw       xmm0,xmm13
+  psraw       xmm1,3
+  pand        xmm15,xmm1
+  movdqa      xmm1,xmm2
+  paddw       xmm1,xmm2
+  paddw       xmm2,[rsp+90h]
+  paddw       xmm2,[rsp+0B0h]
+  paddw       xmm1,xmm0
+  movdqa      xmm0,xmm13
+  movdqa      xmm13,[r8]
+  paddw       xmm0, [rsp+70h]
+  paddw       xmm1, [rsp+0A0h]
+  psllw       xmm2,1
+  paddw       xmm2,xmm0
+  psraw       xmm1,2
+  movdqa      xmm0, [rdi]
+  pandn       xmm8,xmm1
+  movdqa      xmm1, [rsp+60h]
+  paddw       xmm1,xmm2
+  movdqa      xmm2, [rbx]
+  psraw       xmm1,3
+  pand        xmm3,xmm1
+  movdqa      xmm1, [rbp]
+  movdqa      [rsp+0D0h],xmm3
+  pxor        xmm3,xmm3
+  punpckhbw   xmm0,xmm3
+  punpckhbw   xmm1,xmm3
+  punpckhbw   xmm13,xmm3
+  movdqa      [rsp+0C0h],xmm0
+  movdqa      xmm0,[r10+rbp]
+  movdqa      [rsp],xmm1
+  punpckhbw   xmm0,xmm3
+  punpckhbw   xmm2,xmm3
+  movdqa      [rsp+80h],xmm0
+  movdqa      xmm0,[rsi+rbp]
+  movdqa      [rsp+10h],xmm13
+  punpckhbw   xmm0,xmm3
+  movdqa      [rsp+50h],xmm0
+  movdqa      xmm0,xmm1
+  movdqa      xmm1,xmm13
+  psubw       xmm0,xmm13
+  psubw       xmm1,xmm2
+  pabsw       xmm3,xmm0
+  pabsw       xmm0,xmm1
+  movdqa      xmm1,[rsp]
+  movdqa      xmm13,[rsp+40h]
+  movdqa      [rsp+110h],xmm2
+  psubw       xmm1, [rsp+80h]
+  pcmpgtw     xmm13,xmm0
+  pcmpgtw     xmm11,xmm3
+  pabsw       xmm0,xmm1
+  pcmpgtw     xmm10,xmm3
+  movdqa      xmm1, [rsp+40h]
+  movdqa      xmm2,xmm1
+  movdqa      xmm3,xmm1
+  pcmpgtw     xmm2,xmm0
+  movdqa      xmm0, [rsp+10h]
+  pand        xmm13,xmm2
+  pand        xmm13,xmm11
+  movdqa      xmm11,[rsp+0C0h]
+  psubw       xmm0,xmm11
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm3,xmm0
+  pand        xmm3,xmm10
+  movdqa      xmm0,[rsp]
+  psubw       xmm0,[rsp+50h]
+  movdqa      xmm2,[rdx]
+  pabsw       xmm0,xmm0
+  por         xmm7,xmm9
+  movdqa      xmm9,[rsp+20h]
+  pcmpgtw     xmm1,xmm0
+  pand        xmm9,xmm7
+  movdqa      xmm7,[rsp+20h]
+  movdqa      xmm0,xmm7
+  pandn       xmm0,xmm12
+  movdqa      xmm12,[rsp+110h]
+  pand        xmm1,xmm10
+  movdqa      xmm10,[rsp+70h]
+  movdqa      [rsp+40h],xmm1
+  movdqa      xmm1,xmm13
+  por         xmm9,xmm0
+  pxor        xmm0,xmm0
+  por         xmm4,xmm6
+  movdqa      xmm6,xmm7
+  punpckhbw   xmm2,xmm0
+  por         xmm15,xmm5
+  movdqa      xmm5,[rsp+20h]
+  movdqa      xmm0,xmm3
+  psllw       xmm2,1
+  pandn       xmm0,xmm11
+  pand        xmm6,xmm4
+  movdqa      xmm4,[rsp]
+  paddw       xmm2,xmm11
+  pand        xmm5,xmm15
+  movdqa      xmm15,[rsp+20h]
+  paddw       xmm2,xmm11
+  paddw       xmm2,xmm11
+  paddw       xmm2,xmm12
+  paddw       xmm2,[rsp+10h]
+  paddw       xmm2,[rsp]
+  paddw       xmm2,xmm10
+  psraw       xmm2,3
+  pand        xmm2,xmm3
+  por         xmm2,xmm0
+  pand        xmm1,xmm2
+  movdqa      xmm0,xmm13
+  movdqa      xmm2,xmm11
+  pandn       xmm0,xmm11
+  paddw       xmm2,xmm12
+  por         xmm1,xmm0
+  packuswb    xmm9,xmm1
+  movdqa      xmm0,xmm7
+  movdqa      xmm7,[rsp+0A0h]
+  pandn       xmm0,[rsp+0F0h]
+  movdqa      xmm1,xmm3
+  por         xmm6,xmm0
+  movdqa      xmm0,[rsp+10h]
+  paddw       xmm0,xmm4
+  paddw       xmm2,xmm0
+  paddw       xmm2,xmm7
+  movdqa      xmm0,xmm3
+  pandn       xmm0,xmm12
+  psraw       xmm2,2
+  pand        xmm1,xmm2
+  por         xmm1,xmm0
+  movdqa      xmm2,xmm13
+  movdqa      xmm0,xmm13
+  pand        xmm2,xmm1
+  pandn       xmm0,xmm12
+  movdqa      xmm1,xmm12
+  paddw       xmm1,[rsp+10h]
+  por         xmm2,xmm0
+  movdqa      xmm0,xmm15
+  pandn       xmm0,[rsp+0B0h]
+  paddw       xmm1,xmm4
+  packuswb    xmm6,xmm2
+  movdqa      xmm2,xmm3
+  psllw       xmm1,1
+  por         xmm5,xmm0
+  movdqa      xmm0,[rsp+80h]
+  paddw       xmm0,xmm10
+  paddw       xmm1,xmm0
+  paddw       xmm11,xmm1
+  psraw       xmm11,3
+  movdqa      xmm1,xmm12
+  pand        xmm2,xmm11
+  paddw       xmm1,xmm12
+  movdqa      xmm11,[rsp+80h]
+  movdqa      xmm0, [rsp+10h]
+  por         xmm14,[rsp+0E0h]
+  paddw       xmm0,xmm11
+  movdqa      xmm4,xmm15
+  paddw       xmm1,xmm0
+  movdqa      xmm0,xmm13
+  paddw       xmm1,xmm7
+  psraw       xmm1,2
+  pandn       xmm3,xmm1
+  por         xmm2,xmm3
+  movdqa      xmm1,xmm13
+  movdqa      xmm3,[rsp+10h]
+  pandn       xmm0,xmm3
+  pand        xmm1,xmm2
+  movdqa      xmm2,xmm11
+  paddw       xmm2,[rsp]
+  por         xmm1,xmm0
+  movdqa      xmm0,[rsp+0D0h]
+  por         xmm0,xmm8
+  paddw       xmm2,xmm3
+  packuswb    xmm5,xmm1
+  movdqa      xmm8,[rsp+40h]
+  movdqa      xmm1,[rsp+50h]
+  movdqa      xmm3,xmm8
+  pand        xmm4,xmm0
+  psllw       xmm2,1
+  movdqa      xmm0,xmm15
+  pandn       xmm0,[rsp+90h]
+  por         xmm4,xmm0
+  movdqa      xmm0,xmm12
+  paddw       xmm0,xmm10
+  paddw       xmm2,xmm0
+  paddw       xmm1,xmm2
+  movdqa      xmm0,[rsp]
+  movdqa      xmm2,xmm11
+  paddw       xmm0,xmm12
+  movdqa      xmm12,[rsp]
+  paddw       xmm2,xmm11
+  paddw       xmm2,xmm0
+  psraw       xmm1,3
+  movdqa      xmm0,xmm8
+  pand        xmm3,xmm1
+  paddw       xmm2,xmm7
+  movdqa      xmm1,xmm13
+  psraw       xmm2,2
+  pandn       xmm0,xmm2
+  por         xmm3,xmm0
+  movdqa      xmm2,[rsp+50h]
+  movdqa      xmm0,xmm13
+  pandn       xmm0,xmm12
+  pand        xmm1,xmm3
+  paddw       xmm2,xmm11
+  movdqa      xmm3,xmm15
+  por         xmm1,xmm0
+  pand        xmm3,xmm14
+  movdqa      xmm14,[rsp+10h]
+  movdqa      xmm0,xmm15
+  pandn       xmm0,[rsp+30h]
+  packuswb    xmm4,xmm1
+  movdqa      xmm1,xmm8
+  por         xmm3,xmm0
+  movdqa      xmm0,xmm12
+  paddw       xmm0,xmm14
+  paddw       xmm2,xmm0
+  paddw       xmm2,xmm7
+  movdqa      xmm0,xmm8
+  pandn       xmm0,xmm11
+  psraw       xmm2,2
+  pand        xmm1,xmm2
+  por         xmm1,xmm0
+  movdqa      xmm2,xmm13
+  movdqa      xmm0,xmm13
+  pandn       xmm0,xmm11
+  pand        xmm2,xmm1
+  movdqa      xmm1,xmm15
+  por         xmm2,xmm0
+  packuswb    xmm3,xmm2
+  movdqa      xmm0,[rsp+100h]
+  por         xmm0,[rsp+120h]
+  pand        xmm1,xmm0
+  movdqa      xmm2,[rcx+rbp]
+  movdqa      xmm7,[rsp+50h]
+  pandn       xmm15,[rsp+60h]
+  lea         r11,[rsp+1D8h]
+  pxor        xmm0,xmm0
+  por         xmm1,xmm15
+  movaps      xmm15,[r11-0A8h]
+  movdqa      [rdi],xmm9
+  movaps      xmm9,[r11-48h]
+  punpckhbw   xmm2,xmm0
+  psllw       xmm2,1
+  paddw       xmm2,xmm7
+  paddw       xmm2,xmm7
+  movdqa      [rbx],xmm6
+  movaps      xmm6,[r11-18h]
+  paddw       xmm2,xmm7
+  paddw       xmm2,xmm11
+  movaps      xmm11,[r11-68h]
+  paddw       xmm2,xmm12
+  movaps      xmm12,[r11-78h]
+  paddw       xmm2,xmm14
+  paddw       xmm2,xmm10
+  psraw       xmm2,3
+  movaps      xmm10,[r11-58h]
+  movaps      xmm14,[r11-98h]
+  movdqa      xmm0,xmm13
+  pand        xmm2,xmm8
+  pandn       xmm8,xmm7
+  pandn       xmm13,xmm7
+  por         xmm2,xmm8
+  movaps      xmm7,[r11-28h]
+  movaps      xmm8,[r11-38h]
+  movdqa      [r8],xmm5
+  pand        xmm0,xmm2
+  por         xmm0,xmm13
+  packuswb    xmm1,xmm0
+  movaps      xmm13,[r11-88h]
+  movdqa      [rbp],xmm4
+  movdqa      [r10+rbp],xmm3
+  movdqa      [rsi+rbp],xmm1
+  mov         rsp,r11
+  pop         rbp
+  pop         rbx
   ret
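
DeblockLumaEq4V_sse2 above is the bS==4 strong filter: no tC clipping, only weighted averages, selected per side by the |p2−p0|<β test and the tighter |p0−q0|<(α>>2)+2 threshold — visible near the top of the routine as the psraw-by-2 on the broadcast α vector followed by a paddw of the constant 2. A scalar sketch of the p side, assuming the standard H.264 §8.7.2.4 taps and the same helpers as the previous sketch (names illustrative); the q side is symmetric, and the basic α/β edge condition is assumed already checked.

    /* p side of the bS == 4 luma filter; q side mirrors it. */
    static void luma_eq4_pside(uint8_t *p3, uint8_t *p2, uint8_t *p1, uint8_t *p0,
                               int q0, int q1, int alpha, int beta)
    {
        if (abs(*p2 - *p0) < beta && abs(*p0 - q0) < (alpha >> 2) + 2) {
            /* strong path: p0..p2 all rewritten from the p3..q1 window */
            int np0 = (*p2 + 2 * *p1 + 2 * *p0 + 2 * q0 + q1 + 4) >> 3;
            int np1 = (*p2 + *p1 + *p0 + q0 + 2) >> 2;
            int np2 = (2 * *p3 + 3 * *p2 + *p1 + *p0 + q0 + 4) >> 3;
            *p0 = (uint8_t)np0; *p1 = (uint8_t)np1; *p2 = (uint8_t)np2;
        } else {
            /* weak fallback: only p0 moves */
            *p0 = (uint8_t)((2 * *p1 + *p0 + q1 + 2) >> 2);
        }
    }
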
 
 WELS_EXTERN  DeblockChromaLt4V_sse2
-ALIGN  16 
-DeblockChromaLt4V_sse2: 
-  mov         rax,rsp 
-  push        rbx  
-  push        rbp    
+ALIGN  16
+DeblockChromaLt4V_sse2:
+  mov         rax,rsp
+  push        rbx
+  push        rbp
   mov         r10,  rdx
   mov         r11,  rcx
   mov         rcx,  rdi
-  mov         rdx,  rsi  
+  mov         rdx,  rsi
   mov         rsi,  r10
   mov         r10,  r9
   mov         rbp,  r8
   mov         r8,   rsi
   mov         r9,   r11
-  sub         rsp,0C8h   
-  pxor        xmm1,xmm1 
-  mov         rbx,rcx 
-  movsxd      r11,r8d 
-  movsx       ecx,byte [r10] 
-  movsx       r8d,byte [r10+2] 
-  mov         rdi,rdx 
-  movq        xmm2,[rbx] 
-  movq        xmm9,[r11+rbx] 
-  movsx       edx,byte [r10+1] 
-  mov         word [rsp+2],cx 
-  mov         word [rsp],cx 
-  movsx       eax,byte [r10+3] 
-  mov         word [rsp+6],dx 
-  mov         word [rsp+4],dx 
-  movdqa      xmm11,xmm1 
-  mov         word [rsp+0Eh],ax 
-  mov         word [rsp+0Ch],ax 
-  lea         eax,[r11+r11] 
-  movsxd      rcx,eax 
-  mov         rax,rbx 
-  mov         rdx,rdi 
-  sub         rax,rcx 
-  mov         word [rsp+0Ah],r8w 
-  mov         word [rsp+8],r8w 
-  movdqa      xmm6,[rsp] 
-  movdqa      xmm7,xmm6 
-  movq        xmm13, [rax] 
-  mov         rax,rdi 
-  sub         rax,rcx 
-  mov         rcx,rbx 
-  pcmpgtw     xmm7,xmm1 
-  psubw       xmm11,xmm6 
-  sub         rcx,r11 
-  sub         rdx,r11 
-  movq        xmm0,[rax] 
-  movsx       eax,r9w 
-  movq        xmm15,[rcx] 
-  punpcklqdq  xmm13,xmm0 
-  movq        xmm0, [rdx] 
-  movdqa      xmm4,xmm13 
-  punpcklqdq  xmm15,xmm0 
-  movq        xmm0, [rdi] 
-  punpcklbw   xmm4,xmm1 
-  movdqa      xmm12,xmm15 
-  punpcklqdq  xmm2,xmm0 
-  movq        xmm0, [r11+rdi] 
-  punpcklbw   xmm12,xmm1 
-  movdqa      xmm14,xmm2 
-  punpcklqdq  xmm9,xmm0 
-  punpckhbw   xmm2,xmm1 
-  punpcklbw   xmm14,xmm1 
-  movd        xmm0,eax 
+  sub         rsp,0C8h
+  pxor        xmm1,xmm1
+  mov         rbx,rcx
+  movsxd      r11,r8d
+  movsx       ecx,byte [r10]
+  movsx       r8d,byte [r10+2]
+  mov         rdi,rdx
+  movq        xmm2,[rbx]
+  movq        xmm9,[r11+rbx]
+  movsx       edx,byte [r10+1]
+  mov         word [rsp+2],cx
+  mov         word [rsp],cx
+  movsx       eax,byte [r10+3]
+  mov         word [rsp+6],dx
+  mov         word [rsp+4],dx
+  movdqa      xmm11,xmm1
+  mov         word [rsp+0Eh],ax
+  mov         word [rsp+0Ch],ax
+  lea         eax,[r11+r11]
+  movsxd      rcx,eax
+  mov         rax,rbx
+  mov         rdx,rdi
+  sub         rax,rcx
+  mov         word [rsp+0Ah],r8w
+  mov         word [rsp+8],r8w
+  movdqa      xmm6,[rsp]
+  movdqa      xmm7,xmm6
+  movq        xmm13, [rax]
+  mov         rax,rdi
+  sub         rax,rcx
+  mov         rcx,rbx
+  pcmpgtw     xmm7,xmm1
+  psubw       xmm11,xmm6
+  sub         rcx,r11
+  sub         rdx,r11
+  movq        xmm0,[rax]
+  movsx       eax,r9w
+  movq        xmm15,[rcx]
+  punpcklqdq  xmm13,xmm0
+  movq        xmm0, [rdx]
+  movdqa      xmm4,xmm13
+  punpcklqdq  xmm15,xmm0
+  movq        xmm0, [rdi]
+  punpcklbw   xmm4,xmm1
+  movdqa      xmm12,xmm15
+  punpcklqdq  xmm2,xmm0
+  movq        xmm0, [r11+rdi]
+  punpcklbw   xmm12,xmm1
+  movdqa      xmm14,xmm2
+  punpcklqdq  xmm9,xmm0
+  punpckhbw   xmm2,xmm1
+  punpcklbw   xmm14,xmm1
+  movd        xmm0,eax
   mov         eax, ebp ; iBeta
-  punpckhbw   xmm13,xmm1 
-  punpckhbw   xmm15,xmm1 
-  movdqa      xmm3,xmm9 
-  movdqa      [rsp+10h],xmm2 
-  punpcklwd   xmm0,xmm0 
-  punpckhbw   xmm9,xmm1 
-  punpcklbw   xmm3,xmm1 
-  movdqa      xmm1,xmm14 
-  pshufd      xmm10,xmm0,0 
-  movd        xmm0,eax 
-  mov         eax,4 
-  cwde             
-  punpcklwd   xmm0,xmm0 
-  pshufd      xmm8,xmm0,0 
-  movd        xmm0,eax 
-  punpcklwd   xmm0,xmm0 
-  pshufd      xmm5,xmm0,0 
-  psubw       xmm1,xmm12 
-  movdqa      xmm2,xmm10 
-  lea         r11,[rsp+0C8h] 
-  psllw       xmm1,2 
-  movdqa      xmm0,xmm4 
-  psubw       xmm4,xmm12 
-  psubw       xmm0,xmm3 
-  psubw       xmm3,xmm14 
-  paddw       xmm1,xmm0 
-  paddw       xmm1,xmm5 
-  movdqa      xmm0,xmm11 
-  psraw       xmm1,3 
-  pmaxsw      xmm0,xmm1 
-  pminsw      xmm6,xmm0 
-  movdqa      xmm1,xmm8 
-  movdqa      xmm0,xmm12 
-  psubw       xmm0,xmm14 
-  pabsw       xmm0,xmm0 
-  pcmpgtw     xmm2,xmm0 
-  pabsw       xmm0,xmm4 
-  pcmpgtw     xmm1,xmm0 
-  pabsw       xmm0,xmm3 
-  movdqa      xmm3,[rsp] 
-  pand        xmm2,xmm1 
-  movdqa      xmm1,xmm8 
-  pcmpgtw     xmm1,xmm0 
-  movdqa      xmm0,xmm13 
-  pand        xmm2,xmm1 
-  psubw       xmm0,xmm9 
-  psubw       xmm13,xmm15 
-  pand        xmm2,xmm7 
-  pand        xmm6,xmm2 
-  paddw       xmm12,xmm6 
-  psubw       xmm14,xmm6 
-  movdqa      xmm2,[rsp+10h] 
-  movaps      xmm6,[r11-18h] 
-  movdqa      xmm1,xmm2 
-  psubw       xmm1,xmm15 
-  psubw       xmm9,xmm2 
-  psllw       xmm1,2 
-  paddw       xmm1,xmm0 
-  paddw       xmm1,xmm5 
-  movdqa      xmm0,xmm15 
-  psubw       xmm0,xmm2 
-  psraw       xmm1,3 
-  pmaxsw      xmm11,xmm1 
-  pabsw       xmm0,xmm0 
-  movdqa      xmm1,xmm8 
-  pcmpgtw     xmm10,xmm0 
-  pabsw       xmm0,xmm13 
-  pminsw      xmm3,xmm11 
-  movaps      xmm11,[r11-68h] 
-  movaps      xmm13,[rsp+40h] 
-  pcmpgtw     xmm1,xmm0 
-  pabsw       xmm0,xmm9 
-  movaps      xmm9, [r11-48h] 
-  pand        xmm10,xmm1 
-  pcmpgtw     xmm8,xmm0 
-  pand        xmm10,xmm8 
-  pand        xmm10,xmm7 
-  movaps      xmm8,[r11-38h] 
-  movaps      xmm7,[r11-28h] 
-  pand        xmm3,xmm10 
-  paddw       xmm15,xmm3 
-  psubw       xmm2,xmm3 
-  movaps      xmm10,[r11-58h] 
-  packuswb    xmm12,xmm15 
-  movaps      xmm15,[rsp+20h] 
-  packuswb    xmm14,xmm2 
-  movq        [rcx],xmm12 
-  movq        [rbx],xmm14 
-  psrldq      xmm12,8 
-  psrldq      xmm14,8 
-  movq        [rdx],xmm12 
-  movaps      xmm12,[r11-78h] 
-  movq        [rdi],xmm14 
-  movaps      xmm14,[rsp+30h] 
-  mov         rsp,r11 
-  pop         rbp  
-  pop         rbx  
+  punpckhbw   xmm13,xmm1
+  punpckhbw   xmm15,xmm1
+  movdqa      xmm3,xmm9
+  movdqa      [rsp+10h],xmm2
+  punpcklwd   xmm0,xmm0
+  punpckhbw   xmm9,xmm1
+  punpcklbw   xmm3,xmm1
+  movdqa      xmm1,xmm14
+  pshufd      xmm10,xmm0,0
+  movd        xmm0,eax
+  mov         eax,4
+  cwde
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm8,xmm0,0
+  movd        xmm0,eax
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm5,xmm0,0
+  psubw       xmm1,xmm12
+  movdqa      xmm2,xmm10
+  lea         r11,[rsp+0C8h]
+  psllw       xmm1,2
+  movdqa      xmm0,xmm4
+  psubw       xmm4,xmm12
+  psubw       xmm0,xmm3
+  psubw       xmm3,xmm14
+  paddw       xmm1,xmm0
+  paddw       xmm1,xmm5
+  movdqa      xmm0,xmm11
+  psraw       xmm1,3
+  pmaxsw      xmm0,xmm1
+  pminsw      xmm6,xmm0
+  movdqa      xmm1,xmm8
+  movdqa      xmm0,xmm12
+  psubw       xmm0,xmm14
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm2,xmm0
+  pabsw       xmm0,xmm4
+  pcmpgtw     xmm1,xmm0
+  pabsw       xmm0,xmm3
+  movdqa      xmm3,[rsp]
+  pand        xmm2,xmm1
+  movdqa      xmm1,xmm8
+  pcmpgtw     xmm1,xmm0
+  movdqa      xmm0,xmm13
+  pand        xmm2,xmm1
+  psubw       xmm0,xmm9
+  psubw       xmm13,xmm15
+  pand        xmm2,xmm7
+  pand        xmm6,xmm2
+  paddw       xmm12,xmm6
+  psubw       xmm14,xmm6
+  movdqa      xmm2,[rsp+10h]
+  movaps      xmm6,[r11-18h]
+  movdqa      xmm1,xmm2
+  psubw       xmm1,xmm15
+  psubw       xmm9,xmm2
+  psllw       xmm1,2
+  paddw       xmm1,xmm0
+  paddw       xmm1,xmm5
+  movdqa      xmm0,xmm15
+  psubw       xmm0,xmm2
+  psraw       xmm1,3
+  pmaxsw      xmm11,xmm1
+  pabsw       xmm0,xmm0
+  movdqa      xmm1,xmm8
+  pcmpgtw     xmm10,xmm0
+  pabsw       xmm0,xmm13
+  pminsw      xmm3,xmm11
+  movaps      xmm11,[r11-68h]
+  movaps      xmm13,[rsp+40h]
+  pcmpgtw     xmm1,xmm0
+  pabsw       xmm0,xmm9
+  movaps      xmm9, [r11-48h]
+  pand        xmm10,xmm1
+  pcmpgtw     xmm8,xmm0
+  pand        xmm10,xmm8
+  pand        xmm10,xmm7
+  movaps      xmm8,[r11-38h]
+  movaps      xmm7,[r11-28h]
+  pand        xmm3,xmm10
+  paddw       xmm15,xmm3
+  psubw       xmm2,xmm3
+  movaps      xmm10,[r11-58h]
+  packuswb    xmm12,xmm15
+  movaps      xmm15,[rsp+20h]
+  packuswb    xmm14,xmm2
+  movq        [rcx],xmm12
+  movq        [rbx],xmm14
+  psrldq      xmm12,8
+  psrldq      xmm14,8
+  movq        [rdx],xmm12
+  movaps      xmm12,[r11-78h]
+  movq        [rdi],xmm14
+  movaps      xmm14,[rsp+30h]
+  mov         rsp,r11
+  pop         rbp
+  pop         rbx
   ret
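
DeblockChromaLt4V_sse2 is the chroma counterpart of the bS<4 filter: the four per-edge tC0 bytes read from the pTC pointer are broadcast into a word vector (the run of mov word [rsp+...] stores near the top), and only p0/q0 move — chroma never touches p1/q1. The spec widens the clip by a fixed tC = tC0 + 1 for chroma; whether that +1 happens here or in the caller isn't visible in this hunk, so the sketch below simply follows the spec form, reusing the helpers from the first sketch.

    /* One chroma sample position across the edge, bS < 4. */
    static void chroma_lt4(uint8_t *p1, uint8_t *p0, uint8_t *q0, uint8_t *q1,
                           int alpha, int beta, int tc0)
    {
        if (abs(*p0 - *q0) >= alpha || abs(*p1 - *p0) >= beta || abs(*q1 - *q0) >= beta)
            return;
        int tc = tc0 + 1;                       /* fixed widening for chroma (spec rule) */
        int d  = clip3(-tc, tc, (((*q0 - *p0) << 2) + (*p1 - *q1) + 4) >> 3);
        *p0 = (uint8_t)clip3(0, 255, *p0 + d);  /* p1/q1 stay read-only for chroma */
        *q0 = (uint8_t)clip3(0, 255, *q0 - d);
    }
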
 
 WELS_EXTERN   DeblockChromaEq4V_sse2
 ALIGN 16
 DeblockChromaEq4V_sse2:
-  mov         rax,rsp 
-  push        rbx  
+  mov         rax,rsp
+  push        rbx
   push        rbp
 
   mov         rbp, r8
@@ -2545,143 +2545,143 @@
   mov         r9, rcx
   mov         rcx, rdi
   mov         rdx, rsi
-  
-  sub         rsp,90h 
-  pxor        xmm1,xmm1 
-  mov         r11,rcx 
-  mov         rbx,rdx 
-  mov         r10d,r9d   
-  movq        xmm13,[r11] 
-  lea         eax,[r8+r8] 
-  movsxd      r9,eax 
-  mov         rax,rcx 
-  sub         rax,r9 
-  movq        xmm14,[rax] 
-  mov         rax,rdx 
-  sub         rax,r9 
-  movq        xmm0,[rax] 
-  movsxd      rax,r8d 
-  sub         rcx,rax 
-  sub         rdx,rax 
-  movq        xmm12,[rax+r11] 
-  movq        xmm10,[rcx] 
-  punpcklqdq  xmm14,xmm0 
-  movdqa      xmm8,xmm14 
-  movq        xmm0,[rdx] 
-  punpcklbw   xmm8,xmm1 
-  punpckhbw   xmm14,xmm1 
-  punpcklqdq  xmm10,xmm0 
-  movq        xmm0,[rbx] 
-  movdqa      xmm5,xmm10 
-  punpcklqdq  xmm13,xmm0 
-  movq        xmm0, [rax+rbx] 
-  punpcklbw   xmm5,xmm1 
-  movsx       eax,r10w 
-  movdqa      xmm9,xmm13 
-  punpcklqdq  xmm12,xmm0 
-  punpcklbw   xmm9,xmm1 
-  punpckhbw   xmm10,xmm1 
-  movd        xmm0,eax 
+
+  sub         rsp,90h
+  pxor        xmm1,xmm1
+  mov         r11,rcx
+  mov         rbx,rdx
+  mov         r10d,r9d
+  movq        xmm13,[r11]
+  lea         eax,[r8+r8]
+  movsxd      r9,eax
+  mov         rax,rcx
+  sub         rax,r9
+  movq        xmm14,[rax]
+  mov         rax,rdx
+  sub         rax,r9
+  movq        xmm0,[rax]
+  movsxd      rax,r8d
+  sub         rcx,rax
+  sub         rdx,rax
+  movq        xmm12,[rax+r11]
+  movq        xmm10,[rcx]
+  punpcklqdq  xmm14,xmm0
+  movdqa      xmm8,xmm14
+  movq        xmm0,[rdx]
+  punpcklbw   xmm8,xmm1
+  punpckhbw   xmm14,xmm1
+  punpcklqdq  xmm10,xmm0
+  movq        xmm0,[rbx]
+  movdqa      xmm5,xmm10
+  punpcklqdq  xmm13,xmm0
+  movq        xmm0, [rax+rbx]
+  punpcklbw   xmm5,xmm1
+  movsx       eax,r10w
+  movdqa      xmm9,xmm13
+  punpcklqdq  xmm12,xmm0
+  punpcklbw   xmm9,xmm1
+  punpckhbw   xmm10,xmm1
+  movd        xmm0,eax
   mov         eax, ebp   ; iBeta
-  punpckhbw   xmm13,xmm1 
-  movdqa      xmm7,xmm12 
-  punpcklwd   xmm0,xmm0 
-  punpckhbw   xmm12,xmm1 
-  pshufd      xmm11,xmm0,0 
-  punpcklbw   xmm7,xmm1 
-  movd        xmm0,eax 
-  movdqa      xmm1,xmm8 
-  psubw       xmm1,xmm5 
-  punpcklwd   xmm0,xmm0 
-  movdqa      xmm6,xmm11 
-  pshufd      xmm3,xmm0,0 
-  movdqa      xmm0,xmm5 
-  psubw       xmm0,xmm9 
-  movdqa      xmm2,xmm3 
-  pabsw       xmm0,xmm0 
-  pcmpgtw     xmm6,xmm0 
-  pabsw       xmm0,xmm1 
-  movdqa      xmm1,xmm3 
-  pcmpgtw     xmm2,xmm0 
-  pand        xmm6,xmm2 
-  movdqa      xmm0,xmm7 
-  movdqa      xmm2,xmm3 
-  psubw       xmm0,xmm9 
-  pabsw       xmm0,xmm0 
-  pcmpgtw     xmm1,xmm0 
-  pand        xmm6,xmm1 
-  movdqa      xmm0,xmm10 
-  movdqa      xmm1,xmm14 
-  psubw       xmm0,xmm13 
-  psubw       xmm1,xmm10 
-  pabsw       xmm0,xmm0 
-  pcmpgtw     xmm11,xmm0 
-  pabsw       xmm0,xmm1 
-  pcmpgtw     xmm2,xmm0 
-  pand        xmm11,xmm2 
-  movdqa      xmm0,xmm12 
-  movdqa      xmm4,xmm6 
-  movdqa      xmm1,xmm8 
-  mov         eax,2 
-  cwde             
-  paddw       xmm1,xmm8 
-  psubw       xmm0,xmm13 
-  paddw       xmm1,xmm5 
-  pabsw       xmm0,xmm0 
-  movdqa      xmm2,xmm14 
-  paddw       xmm1,xmm7 
-  pcmpgtw     xmm3,xmm0 
-  paddw       xmm2,xmm14 
-  movd        xmm0,eax 
-  pand        xmm11,xmm3 
-  paddw       xmm7,xmm7 
-  paddw       xmm2,xmm10 
-  punpcklwd   xmm0,xmm0 
-  paddw       xmm2,xmm12 
-  paddw       xmm12,xmm12 
-  pshufd      xmm3,xmm0,0 
-  paddw       xmm7,xmm9 
-  paddw       xmm12,xmm13 
-  movdqa      xmm0,xmm6 
-  paddw       xmm1,xmm3 
-  pandn       xmm0,xmm5 
-  paddw       xmm7,xmm8 
-  psraw       xmm1,2 
-  paddw       xmm12,xmm14 
-  paddw       xmm7,xmm3 
-  ;movaps      xmm14,[rsp] 
-  pand        xmm4,xmm1 
-  paddw       xmm12,xmm3 
-  psraw       xmm7,2 
-  movdqa      xmm1,xmm11 
-  por         xmm4,xmm0 
-  psraw       xmm12,2 
-  paddw       xmm2,xmm3 
-  movdqa      xmm0,xmm11 
-  pandn       xmm0,xmm10 
-  psraw       xmm2,2 
-  pand        xmm1,xmm2 
-  por         xmm1,xmm0 
-  packuswb    xmm4,xmm1 
-  movdqa      xmm0,xmm11 
-  movdqa      xmm1,xmm6 
-  pand        xmm1,xmm7 
-  movq        [rcx],xmm4 
-  pandn       xmm6,xmm9 
-  pandn       xmm11,xmm13 
-  pand        xmm0,xmm12 
-  por         xmm1,xmm6 
-  por         xmm0,xmm11 
-  psrldq      xmm4,8 
-  packuswb    xmm1,xmm0 
-  movq        [r11],xmm1 
-  psrldq      xmm1,8 
-  movq        [rdx],xmm4 
-  lea         r11,[rsp+90h] 
-  movq        [rbx],xmm1 
-  mov         rsp,r11 
+  punpckhbw   xmm13,xmm1
+  movdqa      xmm7,xmm12
+  punpcklwd   xmm0,xmm0
+  punpckhbw   xmm12,xmm1
+  pshufd      xmm11,xmm0,0
+  punpcklbw   xmm7,xmm1
+  movd        xmm0,eax
+  movdqa      xmm1,xmm8
+  psubw       xmm1,xmm5
+  punpcklwd   xmm0,xmm0
+  movdqa      xmm6,xmm11
+  pshufd      xmm3,xmm0,0
+  movdqa      xmm0,xmm5
+  psubw       xmm0,xmm9
+  movdqa      xmm2,xmm3
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm6,xmm0
+  pabsw       xmm0,xmm1
+  movdqa      xmm1,xmm3
+  pcmpgtw     xmm2,xmm0
+  pand        xmm6,xmm2
+  movdqa      xmm0,xmm7
+  movdqa      xmm2,xmm3
+  psubw       xmm0,xmm9
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm1,xmm0
+  pand        xmm6,xmm1
+  movdqa      xmm0,xmm10
+  movdqa      xmm1,xmm14
+  psubw       xmm0,xmm13
+  psubw       xmm1,xmm10
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm11,xmm0
+  pabsw       xmm0,xmm1
+  pcmpgtw     xmm2,xmm0
+  pand        xmm11,xmm2
+  movdqa      xmm0,xmm12
+  movdqa      xmm4,xmm6
+  movdqa      xmm1,xmm8
+  mov         eax,2
+  cwde
+  paddw       xmm1,xmm8
+  psubw       xmm0,xmm13
+  paddw       xmm1,xmm5
+  pabsw       xmm0,xmm0
+  movdqa      xmm2,xmm14
+  paddw       xmm1,xmm7
+  pcmpgtw     xmm3,xmm0
+  paddw       xmm2,xmm14
+  movd        xmm0,eax
+  pand        xmm11,xmm3
+  paddw       xmm7,xmm7
+  paddw       xmm2,xmm10
+  punpcklwd   xmm0,xmm0
+  paddw       xmm2,xmm12
+  paddw       xmm12,xmm12
+  pshufd      xmm3,xmm0,0
+  paddw       xmm7,xmm9
+  paddw       xmm12,xmm13
+  movdqa      xmm0,xmm6
+  paddw       xmm1,xmm3
+  pandn       xmm0,xmm5
+  paddw       xmm7,xmm8
+  psraw       xmm1,2
+  paddw       xmm12,xmm14
+  paddw       xmm7,xmm3
+  ;movaps      xmm14,[rsp]
+  pand        xmm4,xmm1
+  paddw       xmm12,xmm3
+  psraw       xmm7,2
+  movdqa      xmm1,xmm11
+  por         xmm4,xmm0
+  psraw       xmm12,2
+  paddw       xmm2,xmm3
+  movdqa      xmm0,xmm11
+  pandn       xmm0,xmm10
+  psraw       xmm2,2
+  pand        xmm1,xmm2
+  por         xmm1,xmm0
+  packuswb    xmm4,xmm1
+  movdqa      xmm0,xmm11
+  movdqa      xmm1,xmm6
+  pand        xmm1,xmm7
+  movq        [rcx],xmm4
+  pandn       xmm6,xmm9
+  pandn       xmm11,xmm13
+  pand        xmm0,xmm12
+  por         xmm1,xmm6
+  por         xmm0,xmm11
+  psrldq      xmm4,8
+  packuswb    xmm1,xmm0
+  movq        [r11],xmm1
+  psrldq      xmm1,8
+  movq        [rdx],xmm4
+  lea         r11,[rsp+90h]
+  movq        [rbx],xmm1
+  mov         rsp,r11
   pop         rbp
-  pop         rbx  
+  pop         rbx
   ret
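
DeblockChromaEq4V_sse2 reduces to a pair of 3-tap averages once the α/β masks pass — there is no tC at all for chroma at bS==4, which is why the arithmetic core of the routine is just the paddw/psraw ladder around the broadcast constant 2. Scalar form, same assumptions as the sketches above:

    /* Chroma bS == 4: p0/q0 replaced by fixed 3-tap averages. */
    static void chroma_eq4(uint8_t *p1, uint8_t *p0, uint8_t *q0, uint8_t *q1,
                           int alpha, int beta)
    {
        if (abs(*p0 - *q0) >= alpha || abs(*p1 - *p0) >= beta || abs(*q1 - *q0) >= beta)
            return;
        int np0 = (2 * *p1 + *p0 + *q1 + 2) >> 2;
        int nq0 = (2 * *q1 + *q0 + *p1 + 2) >> 2;  /* both use the original samples */
        *p0 = (uint8_t)np0;
        *q0 = (uint8_t)nq0;
    }
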
 
 
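The final hunk below covers DeblockChromaEq4H_sse2, the horizontal-edge variant: the filter math is identical to the vertical case, but each row's p1 p0 q0 q1 samples sit contiguously in memory at offset −2 from the edge, so the routine first gathers one dword per row and rotates rows into columns with the long punpckl{dq,qdq,bw,wd} ladder, filters the columns, then scatters the dwords back. A hypothetical scalar view of that gather for one 8-row chroma plane (pPix/iStride are illustrative names, not the asm's registers):

    /* Gather the four samples straddling a horizontal edge into columns. */
    static void gather_columns(const uint8_t *pPix, int iStride,
                               uint8_t p1[8], uint8_t p0[8],
                               uint8_t q0[8], uint8_t q1[8])
    {
        for (int row = 0; row < 8; row++) {
            const uint8_t *r = pPix + row * iStride - 2;  /* p1 p0 | q0 q1 */
            p1[row] = r[0]; p0[row] = r[1];
            q0[row] = r[2]; q1[row] = r[3];
        }
        /* ...filter the columns as in the vertical case, then store the
           middle two bytes of each row back... */
    }
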
@@ -2688,270 +2688,270 @@
 WELS_EXTERN   DeblockChromaEq4H_sse2
 ALIGN  16
 DeblockChromaEq4H_sse2:
-  mov         rax,rsp 
-  push        rbx 
-  push        rbp 
+  mov         rax,rsp
+  push        rbx
+  push        rbp
   push        r12
-  
-  mov         rbp,   r8  
+
+  mov         rbp,   r8
   mov         r8,    rdx
   mov         r9,    rcx
   mov         rcx,   rdi
-  mov         rdx,   rsi  
+  mov         rdx,   rsi
   mov         rdi,   rdx
 
-  sub         rsp,140h     
-  lea         eax,[r8*4] 
-  movsxd      r10,eax 
-  mov         eax,[rcx-2] 
-  mov         [rsp+10h],eax 
-  lea         rbx,[r10+rdx-2] 
-  lea         r11,[r10+rcx-2] 
+  sub         rsp,140h
+  lea         eax,[r8*4]
+  movsxd      r10,eax
+  mov         eax,[rcx-2]
+  mov         [rsp+10h],eax
+  lea         rbx,[r10+rdx-2]
+  lea         r11,[r10+rcx-2]
 
-  movdqa      xmm5,[rsp+10h] 
-  movsxd      r10,r8d 
-  mov         eax,[r10+rcx-2] 
-  lea         rdx,[r10+r10*2] 
-  mov         [rsp+20h],eax 
-  mov         eax,[rcx+r10*2-2] 
-  mov         [rsp+30h],eax 
+  movdqa      xmm5,[rsp+10h]
+  movsxd      r10,r8d
+  mov         eax,[r10+rcx-2]
+  lea         rdx,[r10+r10*2]
+  mov         [rsp+20h],eax
+  mov         eax,[rcx+r10*2-2]
+  mov         [rsp+30h],eax
   mov         eax,[rdx+rcx-2]
-  movdqa      xmm2,[rsp+20h] 
-  mov         [rsp+40h],eax 
-  mov         eax, [rdi-2] 
-  movdqa      xmm4,[rsp+30h] 
-  mov         [rsp+50h],eax 
-  mov         eax,[r10+rdi-2] 
-  movdqa      xmm3,[rsp+40h] 
-  mov         [rsp+60h],eax 
-  mov         eax,[rdi+r10*2-2] 
-  punpckldq   xmm5,[rsp+50h] 
-  mov         [rsp+70h],eax 
-  mov         eax, [rdx+rdi-2] 
-  punpckldq   xmm2, [rsp+60h] 
-  mov          [rsp+80h],eax 
-  mov         eax,[r11] 
-  punpckldq   xmm4, [rsp+70h] 
-  mov         [rsp+50h],eax 
-  mov         eax,[rbx] 
-  punpckldq   xmm3,[rsp+80h] 
-  mov         [rsp+60h],eax 
-  mov         eax,[r10+r11] 
-  movdqa      xmm0, [rsp+50h] 
-  punpckldq   xmm0, [rsp+60h] 
-  punpcklqdq  xmm5,xmm0 
-  movdqa      [rsp+50h],xmm0 
-  mov         [rsp+50h],eax 
-  mov         eax,[r10+rbx] 
-  movdqa      xmm0,[rsp+50h] 
-  movdqa      xmm1,xmm5 
-  mov         [rsp+60h],eax 
-  mov         eax,[r11+r10*2] 
-  punpckldq   xmm0, [rsp+60h] 
-  punpcklqdq  xmm2,xmm0 
-  punpcklbw   xmm1,xmm2 
-  punpckhbw   xmm5,xmm2 
-  movdqa      [rsp+50h],xmm0 
-  mov         [rsp+50h],eax 
-  mov         eax,[rbx+r10*2] 
-  movdqa      xmm0,[rsp+50h] 
-  mov         [rsp+60h],eax 
-  mov         eax, [rdx+r11] 
-  movdqa      xmm15,xmm1 
-  punpckldq   xmm0,[rsp+60h] 
-  punpcklqdq  xmm4,xmm0 
-  movdqa      [rsp+50h],xmm0 
-  mov         [rsp+50h],eax 
-  mov         eax, [rdx+rbx] 
-  movdqa      xmm0,[rsp+50h] 
-  mov         [rsp+60h],eax 
-  punpckldq   xmm0, [rsp+60h] 
-  punpcklqdq  xmm3,xmm0 
-  movdqa      xmm0,xmm4 
-  punpcklbw   xmm0,xmm3 
-  punpckhbw   xmm4,xmm3 
-  punpcklwd   xmm15,xmm0 
-  punpckhwd   xmm1,xmm0 
-  movdqa      xmm0,xmm5 
-  movdqa      xmm12,xmm15 
-  punpcklwd   xmm0,xmm4 
-  punpckhwd   xmm5,xmm4 
-  punpckldq   xmm12,xmm0 
-  punpckhdq   xmm15,xmm0 
-  movdqa      xmm0,xmm1 
-  movdqa      xmm11,xmm12 
-  punpckldq   xmm0,xmm5 
-  punpckhdq   xmm1,xmm5 
-  punpcklqdq  xmm11,xmm0 
-  punpckhqdq  xmm12,xmm0 
-  movsx       eax,r9w 
-  movdqa      xmm14,xmm15 
-  punpcklqdq  xmm14,xmm1 
-  punpckhqdq  xmm15,xmm1 
-  pxor        xmm1,xmm1 
-  movd        xmm0,eax 
-  movdqa      xmm4,xmm12 
-  movdqa      xmm8,xmm11 
-  mov         eax, ebp ; iBeta
-  punpcklwd   xmm0,xmm0 
-  punpcklbw   xmm4,xmm1 
-  punpckhbw   xmm12,xmm1 
-  movdqa      xmm9,xmm14 
-  movdqa      xmm7,xmm15 
-  movdqa      xmm10,xmm15 
-  pshufd      xmm13,xmm0,0 
-  punpcklbw   xmm9,xmm1 
-  punpckhbw   xmm14,xmm1 
-  movdqa      xmm6,xmm13 
-  movd        xmm0,eax 
-  movdqa      [rsp],xmm11 
-  mov         eax,2 
-  cwde             
-  punpckhbw   xmm11,xmm1 
-  punpckhbw   xmm10,xmm1 
-  punpcklbw   xmm7,xmm1 
-  punpcklwd   xmm0,xmm0 
-  punpcklbw   xmm8,xmm1 
-  pshufd      xmm3,xmm0,0 
-  movdqa      xmm1,xmm8 
-  movdqa      xmm0,xmm4 
-  psubw       xmm0,xmm9 
-  psubw       xmm1,xmm4 
-  movdqa      xmm2,xmm3 
-  pabsw       xmm0,xmm0 
-  pcmpgtw     xmm6,xmm0 
-  pabsw       xmm0,xmm1 
-  movdqa      xmm1,xmm3 
-  pcmpgtw     xmm2,xmm0 
-  pand        xmm6,xmm2 
-  movdqa      xmm0,xmm7 
-  movdqa      xmm2,xmm3 
-  psubw       xmm0,xmm9 
-  pabsw       xmm0,xmm0 
-  pcmpgtw     xmm1,xmm0 
-  pand        xmm6,xmm1 
-  movdqa      xmm0,xmm12 
-  movdqa      xmm1,xmm11 
-  psubw       xmm0,xmm14 
-  psubw       xmm1,xmm12 
-  movdqa      xmm5,xmm6 
-  pabsw       xmm0,xmm0 
-  pcmpgtw     xmm13,xmm0 
-  pabsw       xmm0,xmm1 
-  movdqa      xmm1,xmm8 
-  pcmpgtw     xmm2,xmm0 
-  paddw       xmm1,xmm8 
-  movdqa      xmm0,xmm10 
-  pand        xmm13,xmm2 
-  psubw       xmm0,xmm14 
-  paddw       xmm1,xmm4 
-  movdqa      xmm2,xmm11 
-  pabsw       xmm0,xmm0 
-  paddw       xmm2,xmm11 
-  paddw       xmm1,xmm7 
-  pcmpgtw     xmm3,xmm0 
-  paddw       xmm2,xmm12 
-  movd        xmm0,eax 
-  pand        xmm13,xmm3 
-  paddw       xmm2,xmm10 
-  punpcklwd   xmm0,xmm0 
-  pshufd      xmm3,xmm0,0 
-  movdqa      xmm0,xmm6 
-  paddw       xmm1,xmm3 
-  pandn       xmm0,xmm4 
-  paddw       xmm2,xmm3 
-  psraw       xmm1,2 
-  pand        xmm5,xmm1 
-  por         xmm5,xmm0 
-  paddw       xmm7,xmm7 
-  paddw       xmm10,xmm10 
-  psraw       xmm2,2 
-  movdqa      xmm1,xmm13 
-  movdqa      xmm0,xmm13 
-  pandn       xmm0,xmm12 
-  pand        xmm1,xmm2 
-  paddw       xmm7,xmm9 
-  por         xmm1,xmm0 
-  paddw       xmm10,xmm14 
-  paddw       xmm7,xmm8 
-  movdqa      xmm0,xmm13 
-  packuswb    xmm5,xmm1 
-  paddw       xmm7,xmm3 
-  paddw       xmm10,xmm11 
-  movdqa      xmm1,xmm6 
-  paddw       xmm10,xmm3 
-  pandn       xmm6,xmm9 
-  psraw       xmm7,2 
-  pand        xmm1,xmm7 
-  psraw       xmm10,2 
-  pandn       xmm13,xmm14 
-  pand        xmm0,xmm10 
-  por         xmm1,xmm6 
-  movdqa      xmm6,[rsp] 
-  movdqa      xmm4,xmm6 
-  por         xmm0,xmm13 
-  punpcklbw   xmm4,xmm5 
-  punpckhbw   xmm6,xmm5 
-  movdqa      xmm3,xmm4 
-  packuswb    xmm1,xmm0 
-  movdqa      xmm0,xmm1 
-  punpckhbw   xmm1,xmm15 
-  punpcklbw   xmm0,xmm15 
-  punpcklwd   xmm3,xmm0 
-  punpckhwd   xmm4,xmm0 
-  movdqa      xmm0,xmm6 
-  movdqa      xmm2,xmm3 
-  punpcklwd   xmm0,xmm1 
-  punpckhwd   xmm6,xmm1 
-  movdqa      xmm1,xmm4 
-  punpckldq   xmm2,xmm0 
-  punpckhdq   xmm3,xmm0 
-  punpckldq   xmm1,xmm6 
-  movdqa      xmm0,xmm2 
-  punpcklqdq  xmm0,xmm1 
-  punpckhdq   xmm4,xmm6 
-  punpckhqdq  xmm2,xmm1 
-  movdqa      [rsp+10h],xmm0 
-  movdqa      [rsp+60h],xmm2 
-  movdqa      xmm0,xmm3 
-  mov         eax,[rsp+10h] 
-  mov         [rcx-2],eax 
-  mov         eax,[rsp+60h] 
-  punpcklqdq  xmm0,xmm4 
-  punpckhqdq  xmm3,xmm4 
-  mov         [r10+rcx-2],eax 
-  movdqa      [rsp+20h],xmm0 
-  mov         eax, [rsp+20h] 
-  movdqa      [rsp+70h],xmm3 
-  mov         [rcx+r10*2-2],eax 
-  mov         eax,[rsp+70h] 
-  mov         [rdx+rcx-2],eax 
-  mov         eax,[rsp+18h] 
-  mov         [r11],eax 
-  mov         eax,[rsp+68h] 
-  mov         [r10+r11],eax 
-  mov         eax,[rsp+28h] 
-  mov         [r11+r10*2],eax 
-  mov         eax,[rsp+78h] 
-  mov         [rdx+r11],eax 
-  mov         eax,[rsp+14h] 
-  mov         [rdi-2],eax 
-  mov         eax,[rsp+64h] 
-  mov         [r10+rdi-2],eax 
-  mov         eax,[rsp+24h] 
-  mov         [rdi+r10*2-2],eax 
-  mov         eax, [rsp+74h] 
-  mov         [rdx+rdi-2],eax 
-  mov         eax, [rsp+1Ch] 
-  mov         [rbx],eax 
-  mov         eax, [rsp+6Ch] 
-  mov         [r10+rbx],eax 
-  mov         eax,[rsp+2Ch] 
-  mov         [rbx+r10*2],eax 
-  mov         eax,[rsp+7Ch] 
-  mov         [rdx+rbx],eax  
-  lea         r11,[rsp+140h] 
-  mov         rbx, [r11+28h]    
+  movdqa      xmm2,[rsp+20h]
+  mov         [rsp+40h],eax
+  mov         eax, [rdi-2]
+  movdqa      xmm4,[rsp+30h]
+  mov         [rsp+50h],eax
+  mov         eax,[r10+rdi-2]
+  movdqa      xmm3,[rsp+40h]
+  mov         [rsp+60h],eax
+  mov         eax,[rdi+r10*2-2]
+  punpckldq   xmm5,[rsp+50h]
+  mov         [rsp+70h],eax
+  mov         eax, [rdx+rdi-2]
+  punpckldq   xmm2, [rsp+60h]
+  mov          [rsp+80h],eax
+  mov         eax,[r11]
+  punpckldq   xmm4, [rsp+70h]
+  mov         [rsp+50h],eax
+  mov         eax,[rbx]
+  punpckldq   xmm3,[rsp+80h]
+  mov         [rsp+60h],eax
+  mov         eax,[r10+r11]
+  movdqa      xmm0, [rsp+50h]
+  punpckldq   xmm0, [rsp+60h]
+  punpcklqdq  xmm5,xmm0
+  movdqa      [rsp+50h],xmm0
+  mov         [rsp+50h],eax
+  mov         eax,[r10+rbx]
+  movdqa      xmm0,[rsp+50h]
+  movdqa      xmm1,xmm5
+  mov         [rsp+60h],eax
+  mov         eax,[r11+r10*2]
+  punpckldq   xmm0, [rsp+60h]
+  punpcklqdq  xmm2,xmm0
+  punpcklbw   xmm1,xmm2
+  punpckhbw   xmm5,xmm2
+  movdqa      [rsp+50h],xmm0
+  mov         [rsp+50h],eax
+  mov         eax,[rbx+r10*2]
+  movdqa      xmm0,[rsp+50h]
+  mov         [rsp+60h],eax
+  mov         eax, [rdx+r11]
+  movdqa      xmm15,xmm1
+  punpckldq   xmm0,[rsp+60h]
+  punpcklqdq  xmm4,xmm0
+  movdqa      [rsp+50h],xmm0
+  mov         [rsp+50h],eax
+  mov         eax, [rdx+rbx]
+  movdqa      xmm0,[rsp+50h]
+  mov         [rsp+60h],eax
+  punpckldq   xmm0, [rsp+60h]
+  punpcklqdq  xmm3,xmm0
+  movdqa      xmm0,xmm4
+  punpcklbw   xmm0,xmm3
+  punpckhbw   xmm4,xmm3
+  punpcklwd   xmm15,xmm0
+  punpckhwd   xmm1,xmm0
+  movdqa      xmm0,xmm5
+  movdqa      xmm12,xmm15
+  punpcklwd   xmm0,xmm4
+  punpckhwd   xmm5,xmm4
+  punpckldq   xmm12,xmm0
+  punpckhdq   xmm15,xmm0
+  movdqa      xmm0,xmm1
+  movdqa      xmm11,xmm12
+  punpckldq   xmm0,xmm5
+  punpckhdq   xmm1,xmm5
+  punpcklqdq  xmm11,xmm0
+  punpckhqdq  xmm12,xmm0
+  movsx       eax,r9w
+  movdqa      xmm14,xmm15
+  punpcklqdq  xmm14,xmm1
+  punpckhqdq  xmm15,xmm1
+  pxor        xmm1,xmm1
+  movd        xmm0,eax
+  movdqa      xmm4,xmm12
+  movdqa      xmm8,xmm11
+  mov         eax, ebp ; iBeta
+  punpcklwd   xmm0,xmm0
+  punpcklbw   xmm4,xmm1
+  punpckhbw   xmm12,xmm1
+  movdqa      xmm9,xmm14
+  movdqa      xmm7,xmm15
+  movdqa      xmm10,xmm15
+  pshufd      xmm13,xmm0,0
+  punpcklbw   xmm9,xmm1
+  punpckhbw   xmm14,xmm1
+  movdqa      xmm6,xmm13
+  movd        xmm0,eax
+  movdqa      [rsp],xmm11
+  mov         eax,2
+  cwde
+  punpckhbw   xmm11,xmm1
+  punpckhbw   xmm10,xmm1
+  punpcklbw   xmm7,xmm1
+  punpcklwd   xmm0,xmm0
+  punpcklbw   xmm8,xmm1
+  pshufd      xmm3,xmm0,0
+  movdqa      xmm1,xmm8
+  movdqa      xmm0,xmm4
+  psubw       xmm0,xmm9
+  psubw       xmm1,xmm4
+  movdqa      xmm2,xmm3
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm6,xmm0
+  pabsw       xmm0,xmm1
+  movdqa      xmm1,xmm3
+  pcmpgtw     xmm2,xmm0
+  pand        xmm6,xmm2
+  movdqa      xmm0,xmm7
+  movdqa      xmm2,xmm3
+  psubw       xmm0,xmm9
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm1,xmm0
+  pand        xmm6,xmm1
+  movdqa      xmm0,xmm12
+  movdqa      xmm1,xmm11
+  psubw       xmm0,xmm14
+  psubw       xmm1,xmm12
+  movdqa      xmm5,xmm6
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm13,xmm0
+  pabsw       xmm0,xmm1
+  movdqa      xmm1,xmm8
+  pcmpgtw     xmm2,xmm0
+  paddw       xmm1,xmm8
+  movdqa      xmm0,xmm10
+  pand        xmm13,xmm2
+  psubw       xmm0,xmm14
+  paddw       xmm1,xmm4
+  movdqa      xmm2,xmm11
+  pabsw       xmm0,xmm0
+  paddw       xmm2,xmm11
+  paddw       xmm1,xmm7
+  pcmpgtw     xmm3,xmm0
+  paddw       xmm2,xmm12
+  movd        xmm0,eax
+  pand        xmm13,xmm3
+  paddw       xmm2,xmm10
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm3,xmm0,0
+  movdqa      xmm0,xmm6
+  paddw       xmm1,xmm3
+  pandn       xmm0,xmm4
+  paddw       xmm2,xmm3
+  psraw       xmm1,2
+  pand        xmm5,xmm1
+  por         xmm5,xmm0
+  paddw       xmm7,xmm7
+  paddw       xmm10,xmm10
+  psraw       xmm2,2
+  movdqa      xmm1,xmm13
+  movdqa      xmm0,xmm13
+  pandn       xmm0,xmm12
+  pand        xmm1,xmm2
+  paddw       xmm7,xmm9
+  por         xmm1,xmm0
+  paddw       xmm10,xmm14
+  paddw       xmm7,xmm8
+  movdqa      xmm0,xmm13
+  packuswb    xmm5,xmm1
+  paddw       xmm7,xmm3
+  paddw       xmm10,xmm11
+  movdqa      xmm1,xmm6
+  paddw       xmm10,xmm3
+  pandn       xmm6,xmm9
+  psraw       xmm7,2
+  pand        xmm1,xmm7
+  psraw       xmm10,2
+  pandn       xmm13,xmm14
+  pand        xmm0,xmm10
+  por         xmm1,xmm6
+  movdqa      xmm6,[rsp]
+  movdqa      xmm4,xmm6
+  por         xmm0,xmm13
+  punpcklbw   xmm4,xmm5
+  punpckhbw   xmm6,xmm5
+  movdqa      xmm3,xmm4
+  packuswb    xmm1,xmm0
+  movdqa      xmm0,xmm1
+  punpckhbw   xmm1,xmm15
+  punpcklbw   xmm0,xmm15
+  punpcklwd   xmm3,xmm0
+  punpckhwd   xmm4,xmm0
+  movdqa      xmm0,xmm6
+  movdqa      xmm2,xmm3
+  punpcklwd   xmm0,xmm1
+  punpckhwd   xmm6,xmm1
+  movdqa      xmm1,xmm4
+  punpckldq   xmm2,xmm0
+  punpckhdq   xmm3,xmm0
+  punpckldq   xmm1,xmm6
+  movdqa      xmm0,xmm2
+  punpcklqdq  xmm0,xmm1
+  punpckhdq   xmm4,xmm6
+  punpckhqdq  xmm2,xmm1
+  movdqa      [rsp+10h],xmm0
+  movdqa      [rsp+60h],xmm2
+  movdqa      xmm0,xmm3
+  mov         eax,[rsp+10h]
+  mov         [rcx-2],eax
+  mov         eax,[rsp+60h]
+  punpcklqdq  xmm0,xmm4
+  punpckhqdq  xmm3,xmm4
+  mov         [r10+rcx-2],eax
+  movdqa      [rsp+20h],xmm0
+  mov         eax, [rsp+20h]
+  movdqa      [rsp+70h],xmm3
+  mov         [rcx+r10*2-2],eax
+  mov         eax,[rsp+70h]
+  mov         [rdx+rcx-2],eax
+  mov         eax,[rsp+18h]
+  mov         [r11],eax
+  mov         eax,[rsp+68h]
+  mov         [r10+r11],eax
+  mov         eax,[rsp+28h]
+  mov         [r11+r10*2],eax
+  mov         eax,[rsp+78h]
+  mov         [rdx+r11],eax
+  mov         eax,[rsp+14h]
+  mov         [rdi-2],eax
+  mov         eax,[rsp+64h]
+  mov         [r10+rdi-2],eax
+  mov         eax,[rsp+24h]
+  mov         [rdi+r10*2-2],eax
+  mov         eax, [rsp+74h]
+  mov         [rdx+rdi-2],eax
+  mov         eax, [rsp+1Ch]
+  mov         [rbx],eax
+  mov         eax, [rsp+6Ch]
+  mov         [r10+rbx],eax
+  mov         eax,[rsp+2Ch]
+  mov         [rbx+r10*2],eax
+  mov         eax,[rsp+7Ch]
+  mov         [rdx+rbx],eax
+  lea         r11,[rsp+140h]
+  mov         rbx, [r11+28h]
   mov         rsp,r11
   pop         r12
   pop         rbp
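For reference: the (2*a + b + c + 2) >> 2 arithmetic above, gated on the alpha/beta absolute-difference masks (pabsw/pcmpgtw/pand), corresponds to the H.264 strong (bS == 4) chroma filter. A minimal scalar sketch of one pixel pair, with illustrative helper names (ChromaEq4Px is not part of these sources):

    #include <stdint.h>
    #include <stdlib.h>

    /* One pixel pair of the strong (bS==4) chroma filter; p1,p0 sit on
       one side of the edge, q0,q1 on the other. Sketch only. */
    static void ChromaEq4Px(uint8_t *p1, uint8_t *p0, uint8_t *q0,
                            uint8_t *q1, int iAlpha, int iBeta) {
        if (abs(*p0 - *q0) < iAlpha &&
            abs(*p1 - *p0) < iBeta &&
            abs(*q1 - *q0) < iBeta) {
            uint8_t np0 = (uint8_t)((2 * *p1 + *p0 + *q1 + 2) >> 2);
            uint8_t nq0 = (uint8_t)((2 * *q1 + *q0 + *p1 + 2) >> 2);
            *p0 = np0;  /* the SIMD path saturates via packuswb */
            *q0 = nq0;
        }
    }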
@@ -2962,14 +2962,14 @@
 WELS_EXTERN DeblockChromaLt4H_sse2
 ALIGN  16
 DeblockChromaLt4H_sse2:
-  mov         rax,rsp 
-  push        rbx  
-  push        rbp  
-  push        r12  
+  mov         rax,rsp
+  push        rbx
+  push        rbp
+  push        r12
   push        r13
   push        r14
-  sub         rsp,170h  
-  
+  sub         rsp,170h
+
   mov         r13,   r8
   mov         r14,   r9
   mov         r8,    rdx
@@ -2977,275 +2977,275 @@
   mov         rdx,   rdi
   mov         rcx,   rsi
 
-  movsxd      rsi,r8d 
-  lea         eax,[r8*4] 
-  mov         r11d,r9d 
-  movsxd      r10,eax 
-  mov         eax, [rcx-2] 
-  mov         r12,rdx 
-  mov         [rsp+40h],eax 
-  mov         eax, [rsi+rcx-2] 
-  lea         rbx,[r10+rcx-2] 
-  movdqa      xmm5,[rsp+40h] 
-  mov         [rsp+50h],eax 
-  mov         eax, [rcx+rsi*2-2] 
-  lea         rbp,[r10+rdx-2] 
-  movdqa      xmm2, [rsp+50h] 
-  mov         [rsp+60h],eax 
-  lea         r10,[rsi+rsi*2] 
-  mov         rdi,rcx 
-  mov         eax,[r10+rcx-2] 
-  movdqa      xmm4,[rsp+60h] 
-  mov         [rsp+70h],eax 
-  mov         eax,[rdx-2] 
-  mov         [rsp+80h],eax 
-  mov         eax, [rsi+rdx-2] 
-  movdqa      xmm3,[rsp+70h] 
-  mov         [rsp+90h],eax 
-  mov         eax,[rdx+rsi*2-2] 
-  punpckldq   xmm5,[rsp+80h] 
-  mov         [rsp+0A0h],eax 
-  mov         eax, [r10+rdx-2] 
-  punpckldq   xmm2,[rsp+90h] 
-  mov         [rsp+0B0h],eax 
-  mov         eax, [rbx] 
-  punpckldq   xmm4,[rsp+0A0h] 
-  mov         [rsp+80h],eax 
-  mov         eax,[rbp] 
-  punpckldq   xmm3,[rsp+0B0h] 
-  mov         [rsp+90h],eax 
-  mov         eax,[rsi+rbx] 
-  movdqa      xmm0,[rsp+80h] 
-  punpckldq   xmm0,[rsp+90h] 
-  punpcklqdq  xmm5,xmm0 
-  movdqa      [rsp+80h],xmm0 
-  mov         [rsp+80h],eax 
-  mov         eax,[rsi+rbp] 
-  movdqa      xmm0,[rsp+80h] 
-  movdqa      xmm1,xmm5 
-  mov         [rsp+90h],eax 
-  mov         eax,[rbx+rsi*2] 
-  punpckldq   xmm0,[rsp+90h] 
-  punpcklqdq  xmm2,xmm0 
-  punpcklbw   xmm1,xmm2 
-  punpckhbw   xmm5,xmm2 
-  movdqa      [rsp+80h],xmm0 
-  mov         [rsp+80h],eax 
-  mov         eax,[rbp+rsi*2] 
-  movdqa      xmm0, [rsp+80h] 
-  mov         [rsp+90h],eax 
-  mov         eax,[r10+rbx] 
-  movdqa      xmm7,xmm1 
-  punpckldq   xmm0,[rsp+90h] 
-  punpcklqdq  xmm4,xmm0 
-  movdqa      [rsp+80h],xmm0 
-  mov         [rsp+80h],eax 
-  mov         eax, [r10+rbp] 
-  movdqa      xmm0,[rsp+80h] 
-  mov         [rsp+90h],eax 
-  punpckldq   xmm0,[rsp+90h] 
-  punpcklqdq  xmm3,xmm0 
-  movdqa      xmm0,xmm4 
-  punpcklbw   xmm0,xmm3 
-  punpckhbw   xmm4,xmm3 
-  punpcklwd   xmm7,xmm0 
-  punpckhwd   xmm1,xmm0 
-  movdqa      xmm0,xmm5 
-  movdqa      xmm6,xmm7 
-  punpcklwd   xmm0,xmm4 
-  punpckhwd   xmm5,xmm4 
-  punpckldq   xmm6,xmm0 
-  punpckhdq   xmm7,xmm0 
-  movdqa      xmm0,xmm1 
-  punpckldq   xmm0,xmm5 
+  movsxd      rsi,r8d
+  lea         eax,[r8*4]
+  mov         r11d,r9d
+  movsxd      r10,eax
+  mov         eax, [rcx-2]
+  mov         r12,rdx
+  mov         [rsp+40h],eax
+  mov         eax, [rsi+rcx-2]
+  lea         rbx,[r10+rcx-2]
+  movdqa      xmm5,[rsp+40h]
+  mov         [rsp+50h],eax
+  mov         eax, [rcx+rsi*2-2]
+  lea         rbp,[r10+rdx-2]
+  movdqa      xmm2, [rsp+50h]
+  mov         [rsp+60h],eax
+  lea         r10,[rsi+rsi*2]
+  mov         rdi,rcx
+  mov         eax,[r10+rcx-2]
+  movdqa      xmm4,[rsp+60h]
+  mov         [rsp+70h],eax
+  mov         eax,[rdx-2]
+  mov         [rsp+80h],eax
+  mov         eax, [rsi+rdx-2]
+  movdqa      xmm3,[rsp+70h]
+  mov         [rsp+90h],eax
+  mov         eax,[rdx+rsi*2-2]
+  punpckldq   xmm5,[rsp+80h]
+  mov         [rsp+0A0h],eax
+  mov         eax, [r10+rdx-2]
+  punpckldq   xmm2,[rsp+90h]
+  mov         [rsp+0B0h],eax
+  mov         eax, [rbx]
+  punpckldq   xmm4,[rsp+0A0h]
+  mov         [rsp+80h],eax
+  mov         eax,[rbp]
+  punpckldq   xmm3,[rsp+0B0h]
+  mov         [rsp+90h],eax
+  mov         eax,[rsi+rbx]
+  movdqa      xmm0,[rsp+80h]
+  punpckldq   xmm0,[rsp+90h]
+  punpcklqdq  xmm5,xmm0
+  movdqa      [rsp+80h],xmm0
+  mov         [rsp+80h],eax
+  mov         eax,[rsi+rbp]
+  movdqa      xmm0,[rsp+80h]
+  movdqa      xmm1,xmm5
+  mov         [rsp+90h],eax
+  mov         eax,[rbx+rsi*2]
+  punpckldq   xmm0,[rsp+90h]
+  punpcklqdq  xmm2,xmm0
+  punpcklbw   xmm1,xmm2
+  punpckhbw   xmm5,xmm2
+  movdqa      [rsp+80h],xmm0
+  mov         [rsp+80h],eax
+  mov         eax,[rbp+rsi*2]
+  movdqa      xmm0, [rsp+80h]
+  mov         [rsp+90h],eax
+  mov         eax,[r10+rbx]
+  movdqa      xmm7,xmm1
+  punpckldq   xmm0,[rsp+90h]
+  punpcklqdq  xmm4,xmm0
+  movdqa      [rsp+80h],xmm0
+  mov         [rsp+80h],eax
+  mov         eax, [r10+rbp]
+  movdqa      xmm0,[rsp+80h]
+  mov         [rsp+90h],eax
+  punpckldq   xmm0,[rsp+90h]
+  punpcklqdq  xmm3,xmm0
+  movdqa      xmm0,xmm4
+  punpcklbw   xmm0,xmm3
+  punpckhbw   xmm4,xmm3
+  punpcklwd   xmm7,xmm0
+  punpckhwd   xmm1,xmm0
+  movdqa      xmm0,xmm5
+  movdqa      xmm6,xmm7
+  punpcklwd   xmm0,xmm4
+  punpckhwd   xmm5,xmm4
+  punpckldq   xmm6,xmm0
+  punpckhdq   xmm7,xmm0
+  movdqa      xmm0,xmm1
+  punpckldq   xmm0,xmm5
   mov         rax, r14    ; pTC
-  punpckhdq   xmm1,xmm5 
-  movdqa      xmm9,xmm6 
-  punpckhqdq  xmm6,xmm0 
-  punpcklqdq  xmm9,xmm0 
-  movdqa      xmm2,xmm7 
-  movdqa      xmm13,xmm6 
-  movdqa      xmm4,xmm9 
-  movdqa      [rsp+10h],xmm9 
-  punpcklqdq  xmm2,xmm1 
-  punpckhqdq  xmm7,xmm1 
-  pxor        xmm1,xmm1 
-  movsx       ecx,byte [rax+3] 
-  movsx       edx,byte [rax+2] 
-  movsx       r8d,byte [rax+1] 
-  movsx       r9d,byte [rax] 
-  movdqa      xmm10,xmm1 
-  movdqa      xmm15,xmm2 
-  punpckhbw   xmm2,xmm1 
-  punpckhbw   xmm6,xmm1 
-  punpcklbw   xmm4,xmm1 
-  movsx       eax,r11w 
-  mov         word [rsp+0Eh],cx 
-  mov         word [rsp+0Ch],cx 
-  movdqa      xmm3,xmm7 
-  movdqa      xmm8,xmm7 
-  movdqa      [rsp+20h],xmm7 
-  punpcklbw   xmm15,xmm1 
-  punpcklbw   xmm13,xmm1 
-  punpcklbw   xmm3,xmm1 
-  mov         word [rsp+0Ah],dx 
-  mov         word [rsp+8],dx 
-  mov         word [rsp+6],r8w 
-  movd        xmm0,eax 
-  movdqa      [rsp+30h],xmm6 
-  punpckhbw   xmm9,xmm1 
-  punpckhbw   xmm8,xmm1 
-  punpcklwd   xmm0,xmm0 
+  punpckhdq   xmm1,xmm5
+  movdqa      xmm9,xmm6
+  punpckhqdq  xmm6,xmm0
+  punpcklqdq  xmm9,xmm0
+  movdqa      xmm2,xmm7
+  movdqa      xmm13,xmm6
+  movdqa      xmm4,xmm9
+  movdqa      [rsp+10h],xmm9
+  punpcklqdq  xmm2,xmm1
+  punpckhqdq  xmm7,xmm1
+  pxor        xmm1,xmm1
+  movsx       ecx,byte [rax+3]
+  movsx       edx,byte [rax+2]
+  movsx       r8d,byte [rax+1]
+  movsx       r9d,byte [rax]
+  movdqa      xmm10,xmm1
+  movdqa      xmm15,xmm2
+  punpckhbw   xmm2,xmm1
+  punpckhbw   xmm6,xmm1
+  punpcklbw   xmm4,xmm1
+  movsx       eax,r11w
+  mov         word [rsp+0Eh],cx
+  mov         word [rsp+0Ch],cx
+  movdqa      xmm3,xmm7
+  movdqa      xmm8,xmm7
+  movdqa      [rsp+20h],xmm7
+  punpcklbw   xmm15,xmm1
+  punpcklbw   xmm13,xmm1
+  punpcklbw   xmm3,xmm1
+  mov         word [rsp+0Ah],dx
+  mov         word [rsp+8],dx
+  mov         word [rsp+6],r8w
+  movd        xmm0,eax
+  movdqa      [rsp+30h],xmm6
+  punpckhbw   xmm9,xmm1
+  punpckhbw   xmm8,xmm1
+  punpcklwd   xmm0,xmm0
   mov         eax, r13d   ; iBeta
-  mov         word [rsp+4],r8w 
-  mov         word [rsp+2],r9w 
-  pshufd      xmm12,xmm0,0 
-  mov         word [rsp],r9w 
-  movd        xmm0,eax 
-  mov         eax,4 
-  cwde             
-  movdqa      xmm14, [rsp] 
-  movdqa      [rsp],xmm2 
-  movdqa      xmm2,xmm12 
-  punpcklwd   xmm0,xmm0 
-  pshufd      xmm11,xmm0,0 
-  psubw       xmm10,xmm14 
-  movd        xmm0,eax 
-  movdqa      xmm7,xmm14 
-  movdqa      xmm6,xmm14 
-  pcmpgtw     xmm7,xmm1 
-  punpcklwd   xmm0,xmm0 
-  pshufd      xmm5,xmm0,0 
-  movdqa      xmm0,xmm4 
-  movdqa      xmm1,xmm15 
-  psubw       xmm4,xmm13 
-  psubw       xmm0,xmm3 
-  psubw       xmm1,xmm13 
-  psubw       xmm3,xmm15 
-  psllw       xmm1,2 
-  paddw       xmm1,xmm0 
-  paddw       xmm1,xmm5 
-  movdqa      xmm0,xmm10 
-  psraw       xmm1,3 
-  pmaxsw      xmm0,xmm1 
-  pminsw      xmm6,xmm0 
-  movdqa      xmm1,xmm11 
-  movdqa      xmm0,xmm13 
-  psubw       xmm0,xmm15 
-  pabsw       xmm0,xmm0 
-  pcmpgtw     xmm2,xmm0 
-  pabsw       xmm0,xmm4 
-  pcmpgtw     xmm1,xmm0 
-  pabsw       xmm0,xmm3 
-  pand        xmm2,xmm1 
-  movdqa      xmm1,xmm11 
-  movdqa      xmm3,[rsp+30h] 
-  pcmpgtw     xmm1,xmm0 
-  movdqa      xmm0,xmm9 
-  pand        xmm2,xmm1 
-  psubw       xmm0,xmm8 
-  psubw       xmm9,xmm3 
-  pand        xmm2,xmm7 
-  pand        xmm6,xmm2 
-  psubw       xmm15,xmm6 
-  paddw       xmm13,xmm6 
-  movdqa      xmm2,[rsp] 
-  movdqa      xmm1,xmm2 
-  psubw       xmm1,xmm3 
-  psubw       xmm8,xmm2 
-  psllw       xmm1,2 
-  paddw       xmm1,xmm0 
-  paddw       xmm1,xmm5 
-  movdqa      xmm0,xmm3 
-  movdqa      xmm5,[rsp+10h] 
-  psubw       xmm0,xmm2 
-  psraw       xmm1,3 
-  movdqa      xmm4,xmm5 
-  pabsw       xmm0,xmm0 
-  pmaxsw      xmm10,xmm1 
-  movdqa      xmm1,xmm11 
-  pcmpgtw     xmm12,xmm0 
-  pabsw       xmm0,xmm9 
-  pminsw      xmm14,xmm10 
-  pcmpgtw     xmm1,xmm0 
-  pabsw       xmm0,xmm8 
-  pcmpgtw     xmm11,xmm0 
-  pand        xmm12,xmm1 
-  movdqa      xmm1,[rsp+20h] 
-  pand        xmm12,xmm11 
-  pand        xmm12,xmm7 
-  pand        xmm14,xmm12 
-  paddw       xmm3,xmm14 
-  psubw       xmm2,xmm14 
-  packuswb    xmm13,xmm3 
-  packuswb    xmm15,xmm2 
-  punpcklbw   xmm4,xmm13 
-  punpckhbw   xmm5,xmm13 
-  movdqa      xmm0,xmm15 
-  punpcklbw   xmm0,xmm1 
-  punpckhbw   xmm15,xmm1 
-  movdqa      xmm3,xmm4 
-  punpcklwd   xmm3,xmm0 
-  punpckhwd   xmm4,xmm0 
-  movdqa      xmm0,xmm5 
-  movdqa      xmm2,xmm3 
-  movdqa      xmm1,xmm4 
-  punpcklwd   xmm0,xmm15 
-  punpckhwd   xmm5,xmm15 
-  punpckldq   xmm2,xmm0 
-  punpckhdq   xmm3,xmm0 
-  punpckldq   xmm1,xmm5 
-  movdqa      xmm0,xmm2 
-  punpcklqdq  xmm0,xmm1 
-  punpckhdq   xmm4,xmm5 
-  punpckhqdq  xmm2,xmm1 
-  movdqa      [rsp+40h],xmm0 
-  movdqa      xmm0,xmm3 
-  movdqa      [rsp+90h],xmm2 
-  mov         eax,[rsp+40h] 
-  mov         [rdi-2],eax 
-  mov         eax, [rsp+90h] 
-  punpcklqdq  xmm0,xmm4 
-  punpckhqdq  xmm3,xmm4 
-  mov         [rsi+rdi-2],eax 
-  movdqa      [rsp+50h],xmm0 
-  mov         eax,[rsp+50h] 
-  movdqa      [rsp+0A0h],xmm3 
-  mov         [rdi+rsi*2-2],eax 
-  mov         eax,[rsp+0A0h] 
-  mov         [r10+rdi-2],eax 
-  mov         eax,[rsp+48h] 
-  mov         [rbx],eax 
-  mov         eax,[rsp+98h] 
-  mov         [rsi+rbx],eax 
-  mov         eax,[rsp+58h] 
-  mov         [rbx+rsi*2],eax 
-  mov         eax, [rsp+0A8h] 
-  mov         [r10+rbx],eax 
-  mov         eax, [rsp+44h] 
-  mov         [r12-2],eax 
-  mov         eax,[rsp+94h] 
-  mov         [rsi+r12-2],eax 
-  mov         eax,[rsp+54h] 
-  mov         [r12+rsi*2-2],eax 
-  mov         eax, [rsp+0A4h] 
-  mov         [r10+r12-2],eax 
-  mov         eax,[rsp+4Ch] 
-  mov         [rbp],eax 
-  mov         eax,[rsp+9Ch] 
-  mov         [rsi+rbp],eax 
-  mov         eax, [rsp+5Ch] 
-  mov         [rbp+rsi*2],eax 
-  mov         eax,[rsp+0ACh] 
-  mov         [r10+rbp],eax   
-  lea         r11,[rsp+170h]    
-  mov         rsp,r11 
+  mov         word [rsp+4],r8w
+  mov         word [rsp+2],r9w
+  pshufd      xmm12,xmm0,0
+  mov         word [rsp],r9w
+  movd        xmm0,eax
+  mov         eax,4
+  cwde
+  movdqa      xmm14, [rsp]
+  movdqa      [rsp],xmm2
+  movdqa      xmm2,xmm12
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm11,xmm0,0
+  psubw       xmm10,xmm14
+  movd        xmm0,eax
+  movdqa      xmm7,xmm14
+  movdqa      xmm6,xmm14
+  pcmpgtw     xmm7,xmm1
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm5,xmm0,0
+  movdqa      xmm0,xmm4
+  movdqa      xmm1,xmm15
+  psubw       xmm4,xmm13
+  psubw       xmm0,xmm3
+  psubw       xmm1,xmm13
+  psubw       xmm3,xmm15
+  psllw       xmm1,2
+  paddw       xmm1,xmm0
+  paddw       xmm1,xmm5
+  movdqa      xmm0,xmm10
+  psraw       xmm1,3
+  pmaxsw      xmm0,xmm1
+  pminsw      xmm6,xmm0
+  movdqa      xmm1,xmm11
+  movdqa      xmm0,xmm13
+  psubw       xmm0,xmm15
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm2,xmm0
+  pabsw       xmm0,xmm4
+  pcmpgtw     xmm1,xmm0
+  pabsw       xmm0,xmm3
+  pand        xmm2,xmm1
+  movdqa      xmm1,xmm11
+  movdqa      xmm3,[rsp+30h]
+  pcmpgtw     xmm1,xmm0
+  movdqa      xmm0,xmm9
+  pand        xmm2,xmm1
+  psubw       xmm0,xmm8
+  psubw       xmm9,xmm3
+  pand        xmm2,xmm7
+  pand        xmm6,xmm2
+  psubw       xmm15,xmm6
+  paddw       xmm13,xmm6
+  movdqa      xmm2,[rsp]
+  movdqa      xmm1,xmm2
+  psubw       xmm1,xmm3
+  psubw       xmm8,xmm2
+  psllw       xmm1,2
+  paddw       xmm1,xmm0
+  paddw       xmm1,xmm5
+  movdqa      xmm0,xmm3
+  movdqa      xmm5,[rsp+10h]
+  psubw       xmm0,xmm2
+  psraw       xmm1,3
+  movdqa      xmm4,xmm5
+  pabsw       xmm0,xmm0
+  pmaxsw      xmm10,xmm1
+  movdqa      xmm1,xmm11
+  pcmpgtw     xmm12,xmm0
+  pabsw       xmm0,xmm9
+  pminsw      xmm14,xmm10
+  pcmpgtw     xmm1,xmm0
+  pabsw       xmm0,xmm8
+  pcmpgtw     xmm11,xmm0
+  pand        xmm12,xmm1
+  movdqa      xmm1,[rsp+20h]
+  pand        xmm12,xmm11
+  pand        xmm12,xmm7
+  pand        xmm14,xmm12
+  paddw       xmm3,xmm14
+  psubw       xmm2,xmm14
+  packuswb    xmm13,xmm3
+  packuswb    xmm15,xmm2
+  punpcklbw   xmm4,xmm13
+  punpckhbw   xmm5,xmm13
+  movdqa      xmm0,xmm15
+  punpcklbw   xmm0,xmm1
+  punpckhbw   xmm15,xmm1
+  movdqa      xmm3,xmm4
+  punpcklwd   xmm3,xmm0
+  punpckhwd   xmm4,xmm0
+  movdqa      xmm0,xmm5
+  movdqa      xmm2,xmm3
+  movdqa      xmm1,xmm4
+  punpcklwd   xmm0,xmm15
+  punpckhwd   xmm5,xmm15
+  punpckldq   xmm2,xmm0
+  punpckhdq   xmm3,xmm0
+  punpckldq   xmm1,xmm5
+  movdqa      xmm0,xmm2
+  punpcklqdq  xmm0,xmm1
+  punpckhdq   xmm4,xmm5
+  punpckhqdq  xmm2,xmm1
+  movdqa      [rsp+40h],xmm0
+  movdqa      xmm0,xmm3
+  movdqa      [rsp+90h],xmm2
+  mov         eax,[rsp+40h]
+  mov         [rdi-2],eax
+  mov         eax, [rsp+90h]
+  punpcklqdq  xmm0,xmm4
+  punpckhqdq  xmm3,xmm4
+  mov         [rsi+rdi-2],eax
+  movdqa      [rsp+50h],xmm0
+  mov         eax,[rsp+50h]
+  movdqa      [rsp+0A0h],xmm3
+  mov         [rdi+rsi*2-2],eax
+  mov         eax,[rsp+0A0h]
+  mov         [r10+rdi-2],eax
+  mov         eax,[rsp+48h]
+  mov         [rbx],eax
+  mov         eax,[rsp+98h]
+  mov         [rsi+rbx],eax
+  mov         eax,[rsp+58h]
+  mov         [rbx+rsi*2],eax
+  mov         eax, [rsp+0A8h]
+  mov         [r10+rbx],eax
+  mov         eax, [rsp+44h]
+  mov         [r12-2],eax
+  mov         eax,[rsp+94h]
+  mov         [rsi+r12-2],eax
+  mov         eax,[rsp+54h]
+  mov         [r12+rsi*2-2],eax
+  mov         eax, [rsp+0A4h]
+  mov         [r10+r12-2],eax
+  mov         eax,[rsp+4Ch]
+  mov         [rbp],eax
+  mov         eax,[rsp+9Ch]
+  mov         [rsi+rbp],eax
+  mov         eax, [rsp+5Ch]
+  mov         [rbp+rsi*2],eax
+  mov         eax,[rsp+0ACh]
+  mov         [r10+rbp],eax
+  lea         r11,[rsp+170h]
+  mov         rsp,r11
   pop         r14
   pop         r13
-  pop         r12  
-  pop         rbp  
-  pop         rbx  
-  ret 
+  pop         r12
+  pop         rbp
+  pop         rbx
+  ret
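For reference: DeblockChromaLt4H_sse2 transposes 4-pixel columns into rows, applies the tc-clipped (bS < 4) chroma filter, and transposes back for the column writes above. The psllw/paddw/psraw sequence followed by pmaxsw/pminsw matches the standard delta computation; a scalar sketch with illustrative names (Clip3 and ChromaLt4Px are not in these sources):

    #include <stdint.h>
    #include <stdlib.h>

    static int Clip3(int iLo, int iHi, int iV) {
        return iV < iLo ? iLo : (iV > iHi ? iHi : iV);
    }

    /* One pixel pair of the tc-clipped (bS<4) chroma filter. */
    static void ChromaLt4Px(uint8_t *p1, uint8_t *p0, uint8_t *q0,
                            uint8_t *q1, int iAlpha, int iBeta, int iTc) {
        if (iTc > 0 &&
            abs(*p0 - *q0) < iAlpha &&
            abs(*p1 - *p0) < iBeta &&
            abs(*q1 - *q0) < iBeta) {
            int iDelta = Clip3(-iTc, iTc,
                               (((*q0 - *p0) * 4) + (*p1 - *q1) + 4) >> 3);
            *p0 = (uint8_t)Clip3(0, 255, *p0 + iDelta);
            *q0 = (uint8_t)Clip3(0, 255, *q0 - iDelta);
        }
    }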
 
 
 
@@ -5162,7 +5162,7 @@
 	mov	esp, ebp
 	pop	ebp
 	ret
-    
+
 %endif
 
 
@@ -5178,16 +5178,16 @@
 ALIGN  16
 
 DeblockLumaTransposeH2V_sse2:
-    push     r3 
-    push     r4  
+    push     r3
+    push     r4
     push     r5
 
-%assign   push_num   3 
-    LOAD_3_PARA    
+%assign   push_num   3
+    LOAD_3_PARA
 
     SIGN_EXTENTION   r1, r1d
 
-    mov      r5,    r7 
+    mov      r5,    r7
     mov      r3,    r7
     and      r3,    0Fh
     sub      r7,    r3
@@ -5229,7 +5229,7 @@
 
     SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r7]
     ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-   
+
     movdqa  [r2],    xmm4
     movdqa  [r2 + 10h],  xmm2
     movdqa  [r2 + 20h],  xmm3
@@ -5258,17 +5258,17 @@
 
 DeblockLumaTransposeV2H_sse2:
     push     r3
-    push     r4 
+    push     r4
 
 %assign  push_num 2
     LOAD_3_PARA
 
-    SIGN_EXTENTION   r1, r1d 
+    SIGN_EXTENTION   r1, r1d
 
     mov      r4,    r7
-    mov      r3,    r7 
+    mov      r3,    r7
     and      r3,    0Fh
-    sub      r7,    r3 
+    sub      r7,    r3
     sub      r7,    10h
 
     movdqa   xmm0,   [r2]
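For reference: the DeblockLumaTransposeH2V/V2H helpers above re-lay pixels out so the vertical-edge case can reuse the horizontal filter kernel; SSE2_TransTwo8x8B performs two interleaved 8x8 byte transposes with punpck stages. The underlying operation, as a naive scalar sketch:

    #include <stdint.h>

    /* Naive 8x8 byte transpose: dst[j][i] = src[i][j]. The SIMD macro
       achieves this with punpcklbw/punpcklwd/punpckldq/punpcklqdq. */
    static void Transpose8x8(uint8_t dst[8][8], const uint8_t src[8][8]) {
        for (int i = 0; i < 8; i++)
            for (int j = 0; j < 8; j++)
                dst[j][i] = src[i][j];
    }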
--- a/codec/common/expand_picture.asm
+++ b/codec/common/expand_picture.asm
@@ -244,7 +244,7 @@
 %macro exp_left_right_sse2	2	; iPaddingSize [luma(32)/chroma(16)], u/a
     ;r6 [height]
     ;r0 [pSrc+0]  r5[pSrc-32] r1[stride]
-    ;r3 [pSrc+(w-1)] r4[pSrc+w] 
+    ;r3 [pSrc+(w-1)] r4[pSrc+w]
 
 %if %1 == 32		; for luma
 .left_right_loops:
@@ -375,13 +375,13 @@
 
     %assign push_num 3
     LOAD_4_PARA
-    
+
     SIGN_EXTENTION r1, r1d
     SIGN_EXTENTION r2, r2d
     SIGN_EXTENTION r3, r3d
 
     ;also prepare for cross border pData top-left:xmm3
-    
+
     movzx r6d,byte[r0]
     SSE2_Copy16Times xmm3,r6d         ;xmm3: pSrc[0]
 
@@ -395,22 +395,22 @@
     dec r3                      ;h-1
     imul r3,r1                  ;(h-1)*stride
     lea  r3,[r0+r3]             ;pSrc[(h-1)*stride]  r3 = src bottom
-    
+
     mov r6,r1                    ;r6 = stride
     sal r6,05h                   ;r6 = 32*stride
     lea r4,[r3+r6]               ;r4 = dst bottom
-    
+
     ;also prepare for cross border data: bottom-left with xmm5,bottom-right xmm6
-    
+
     movzx r6d,byte [r3]             ;bottom-left
     SSE2_Copy16Times xmm5,r6d
-    
+
     lea r6,[r3+r2-1]
     movzx r6d,byte [r6]
     SSE2_Copy16Times xmm6,r6d ;bottom-right
-    
+
     neg r1  ;r1 = -stride
-    
+
     push r0
     push r1
     push r2
@@ -419,13 +419,13 @@
 
 	; for both left and right border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-    
+
     pop r2
     pop r1
     pop r0
 
     lea r5,[r0-32]                          ;left border dst (pad: luma 32, chroma 16)
-    
+
     lea r3,[r0+r2-1]                        ;right border src
     lea r4,[r3+1]                           ;right border dst
 
@@ -432,7 +432,7 @@
     ;prepare for cross border data: top-right with xmm4
      movzx r6d,byte [r3]                         ;top-right
      SSE2_Copy16Times xmm4,r6d
-    
+
     neg r1   ;r1 = stride
 
 
@@ -444,7 +444,7 @@
     push r1
     push r2
     push r6
-    
+
     exp_left_right_sse2  32,a
 
     pop r6
@@ -455,22 +455,22 @@
 	; for cross border [top-left, top-right, bottom-left, bottom-right]
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
     ; have done xmm3,..,xmm6 cross pData initialization above, perform padding as below, To be continued..
-    
+
     neg r1  ;r1 = -stride
     lea r3,[r0-32]
     lea r3,[r3+r1]    ;last line of top-left border
-    
+
     lea r4,[r0+r2]    ;psrc +width
     lea r4,[r4+r1]    ;psrc +width -stride
-    
-    
+
+
     neg r1  ;r1 = stride
     add r6,32         ;height + padding (luma: 32, chroma: 16)
     imul r6,r1
-    
+
     lea r5,[r3+r6]    ;last line of bottom-left border
     lea r6,[r4+r6]    ;last line of bottom-right border
-    
+
     neg r1 ; r1 = -stride
 
     ; for left & right border expanding
@@ -477,11 +477,11 @@
     exp_cross_sse2 32,a
 
     LOAD_4_PARA_POP
-    
+
     pop r6
     pop r5
     pop r4
-    
+
     %assign push_num 0
 
 
@@ -495,7 +495,7 @@
 ;										const int32_t iHeight	);
 ;***********************************************************************----------------
 ExpandPictureChromaAlign_sse2:
-	
+
     push r4
     push r5
     push r6
@@ -508,7 +508,7 @@
     SIGN_EXTENTION r3,r3d
 
     ;also prepare for cross border pData top-left:xmm3
-    
+
     movzx r6d,byte [r0]
     SSE2_Copy16Times xmm3,r6d         ;xmm3: pSrc[0]
 
@@ -522,24 +522,24 @@
     dec r3                      ;h-1
     imul r3,r1                  ;(h-1)*stride
     lea  r3,[r0+r3]             ;pSrc[(h-1)*stride]  r3 = src bottom
-    
+
     mov r6,r1                    ;r6 = stride
     sal r6,04h                   ;r6 = 16*stride
-    lea r4,[r3+r6]               ;r4 = dst bottom 
-    
+    lea r4,[r3+r6]               ;r4 = dst bottom
+
     ;also prepare for cross border data: bottom-left with xmm5,bottom-right xmm6
-    
+
     movzx r6d,byte [r3]             ;bottom-left
     SSE2_Copy16Times xmm5,r6d
-    
+
     lea r6,[r3+r2-1]
     movzx r6d,byte [r6]
     SSE2_Copy16Times xmm6,r6d ;bottom-right
-    
+
     neg r1  ;r1 = -stride
-    
+
     push r0
-    push r1 
+    push r1
     push r2
 
     exp_top_bottom_sse2 16
@@ -546,20 +546,20 @@
 
 	; for both left and right border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-    
+
     pop r2
     pop r1
     pop r0
 
     lea r5,[r0-16]                          ;left border dst (pad: luma 32, chroma 16)
-    
-    lea r3,[r0+r2-1]                        ;right border src 
+
+    lea r3,[r0+r2-1]                        ;right border src
     lea r4,[r3+1]                           ;right border dst
 
     ;prepare for cross border data: top-right with xmm4
     movzx r6d,byte [r3]                         ;top-right
     SSE2_Copy16Times xmm4,r6d
-    
+
     neg r1   ;r1 = stride
 
 
@@ -568,7 +568,7 @@
 
 
     push r0
-    push r1 
+    push r1
     push r2
 	push r6
     exp_left_right_sse2 16,a
@@ -581,22 +581,22 @@
 	; for cross border [top-left, top-right, bottom-left, bottom-right]
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
     ; have done xmm3,..,xmm6 cross pData initialization above, perform padding as below, To be continued..
-    
+
     neg r1  ;r1 = -stride
     lea r3,[r0-16]
     lea r3,[r3+r1]    ;last line of top-left border
-    
+
     lea r4,[r0+r2]    ;psrc +width
-    lea r4,[r4+r1]    ;psrc +width -stride  
-    
-    
+    lea r4,[r4+r1]    ;psrc +width -stride
+
+
     neg r1  ;r1 = stride
     add r6,16         ;height + padding (luma: 32, chroma: 16)
     imul r6,r1
-    
+
     lea r5,[r3+r6]    ;last line of bottom-left border
     lea r6,[r4+r6]    ;last line of bottom-right border
-    
+
     neg r1 ; r1 = -stride
 
     ; for left & right border expanding
@@ -603,11 +603,11 @@
     exp_cross_sse2 16,a
 
     LOAD_4_PARA_POP
-    
+
     pop r6
     pop r5
     pop r4
-    
+
     %assign push_num 0
 
 
@@ -633,7 +633,7 @@
     SIGN_EXTENTION r3,r3d
 
     ;also prepare for cross border pData top-left:xmm3
-    
+
     movzx r6d,byte [r0]
     SSE2_Copy16Times xmm3,r6d         ;xmm3: pSrc[0]
 
@@ -647,24 +647,24 @@
     dec r3                      ;h-1
     imul r3,r1                  ;(h-1)*stride
     lea  r3,[r0+r3]             ;pSrc[(h-1)*stride]  r3 = src bottom
-    
+
     mov r6,r1                    ;r6 = stride
     sal r6,04h                   ;r6 = 16*stride
-    lea r4,[r3+r6]               ;r4 = dst bottom 
-    
+    lea r4,[r3+r6]               ;r4 = dst bottom
+
     ;also prepare for cross border data: bottom-left with xmm5,bottom-right xmm6
-    
+
     movzx r6d,byte [r3]             ;bottom-left
     SSE2_Copy16Times xmm5,r6d
-    
+
     lea r6,[r3+r2-1]
     movzx r6d,byte [r6]
     SSE2_Copy16Times xmm6,r6d ;bottom-right
-    
+
     neg r1  ;r1 = -stride
-    
+
     push r0
-    push r1 
+    push r1
     push r2
 
     exp_top_bottom_sse2 16
@@ -671,20 +671,20 @@
 
 	; for both left and right border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-    
+
     pop r2
     pop r1
     pop r0
 
     lea r5,[r0-16]                          ;left border dst (pad: luma 32, chroma 16)
-    
-    lea r3,[r0+r2-1]                        ;right border src 
+
+    lea r3,[r0+r2-1]                        ;right border src
     lea r4,[r3+1]                           ;right border dst
 
     ;prepare for cross border data: top-right with xmm4
     movzx r6d,byte [r3]                         ;top-right
     SSE2_Copy16Times xmm4,r6d
-    
+
     neg r1   ;r1 = stride
 
 
@@ -693,7 +693,7 @@
 
 
     push r0
-    push r1 
+    push r1
     push r2
 	push r6
     exp_left_right_sse2 16,u
@@ -706,22 +706,22 @@
 	; for cross border [top-left, top-right, bottom-left, bottom-right]
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
     ; have done xmm3,..,xmm6 cross pData initialization above, perform padding as below, To be continued..
-    
+
     neg r1  ;r1 = -stride
     lea r3,[r0-16]
     lea r3,[r3+r1]    ;last line of top-left border
-    
+
     lea r4,[r0+r2]    ;psrc +width
-    lea r4,[r4+r1]    ;psrc +width -stride  
-    
-    
+    lea r4,[r4+r1]    ;psrc +width -stride
+
+
     neg r1  ;r1 = stride
     add r6,16         ;height + padding (luma: 32, chroma: 16)
     imul r6,r1
-    
+
     lea r5,[r3+r6]    ;last line of bottom-left border
     lea r6,[r4+r6]    ;last line of bottom-right border
-    
+
     neg r1 ; r1 = -stride
 
     ; for left & right border expanding
@@ -728,13 +728,12 @@
     exp_cross_sse2 16,u
 
     LOAD_4_PARA_POP
-    
+
     pop r6
     pop r5
     pop r4
-    
+
     %assign push_num 0
 
 
 	ret
-    
\ No newline at end of file
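For reference: the ExpandPicture* routines above pad a decoded plane outward by replicating edge pixels (32 on each side for luma, 16 for chroma), covering top/bottom, left/right, and the four corners. A scalar sketch producing the same result (the order of steps differs from the asm; ExpandPlane is an illustrative name):

    #include <stdint.h>
    #include <string.h>

    /* Replicate the borders of an iWidth x iHeight plane by iPad pixels.
       Left/right first, then full padded rows for top/bottom, which
       also fills the corners. iPad = 32 for luma, 16 for chroma. */
    static void ExpandPlane(uint8_t *pPic, int iStride, int iWidth,
                            int iHeight, int iPad) {
        for (int y = 0; y < iHeight; y++) {
            uint8_t *pRow = pPic + y * iStride;
            for (int x = 1; x <= iPad; x++) {
                pRow[-x] = pRow[0];
                pRow[iWidth - 1 + x] = pRow[iWidth - 1];
            }
        }
        for (int y = 1; y <= iPad; y++) {
            memcpy(pPic - y * iStride - iPad,
                   pPic - iPad, iWidth + 2 * iPad);
            memcpy(pPic + (iHeight - 1 + y) * iStride - iPad,
                   pPic + (iHeight - 1) * iStride - iPad,
                   iWidth + 2 * iPad);
        }
    }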
--- a/codec/common/mb_copy.asm
+++ b/codec/common/mb_copy.asm
@@ -1,701 +1,701 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  mb_copy.asm
-;*
-;*  Abstract
-;*      mb_copy and mb_copy1
-;*
-;*  History
-;*      15/09/2009 Created
-;*		12/28/2009 Modified with larger throughput
-;*		12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2,
-;*				   WelsCopy16x8_mmx, WelsCopy8x16_mmx etc;
-;*
-;*
-;*********************************************************************************************/
-%include "asm_inc.asm"
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-WELS_EXTERN WelsCopy16x16_sse2
-WELS_EXTERN WelsCopy16x16NotAligned_sse2
-WELS_EXTERN WelsCopy8x8_mmx
-WELS_EXTERN WelsCopy16x8NotAligned_sse2	;
-WELS_EXTERN WelsCopy8x16_mmx		;
-WELS_EXTERN UpdateMbMv_sse2		;
-
-;***********************************************************************
-; void WelsCopy16x16_sse2(	uint8_t* Dst,
-;							int32_t  iStrideD,
-;							uint8_t* Src,
-;							int32_t  iStrideS )
-;***********************************************************************
-ALIGN 16
-WelsCopy16x16_sse2:
-
-	push r4
-	push r5
-	%assign  push_num 2
-    LOAD_4_PARA
-
-	lea r4, [r1+2*r1]	;ebx, [eax+2*eax]	; x3
-	lea r5, [r3+2*r3]	;edx, [ecx+2*ecx]	; x3
-
-	movdqa xmm0, [r2]
-	movdqa xmm1, [r2+r3]
-	movdqa xmm2, [r2+2*r3]
-	movdqa xmm3, [r2+r5]
-	lea r2, [r2+4*r3]
-	movdqa xmm4, [r2]
-	movdqa xmm5, [r2+r3]
-	movdqa xmm6, [r2+2*r3]
-	movdqa xmm7, [r2+r5]
-	lea r2, [r2+4*r3]
-
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm2
-	movdqa [r0+r4], xmm3
-	lea r0, [r0+4*r1]
-	movdqa [r0], xmm4
-	movdqa [r0+r1], xmm5
-	movdqa [r0+2*r1], xmm6
-	movdqa [r0+r4], xmm7
-	lea r0, [r0+4*r1]
-
-	movdqa xmm0, [r2]
-	movdqa xmm1, [r2+r3]
-	movdqa xmm2, [r2+2*r3]
-	movdqa xmm3, [r2+r5]
-	lea r2, [r2+4*r3]
-	movdqa xmm4, [r2]
-	movdqa xmm5, [r2+r3]
-	movdqa xmm6, [r2+2*r3]
-	movdqa xmm7, [r2+r5]
-
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm2
-	movdqa [r0+r4], xmm3
-	lea r0, [r0+4*r1]
-	movdqa [r0], xmm4
-	movdqa [r0+r1], xmm5
-	movdqa [r0+2*r1], xmm6
-	movdqa [r0+r4], xmm7
-	LOAD_4_PARA_POP
-	pop r5
-	pop r4	
-	ret
-
-;***********************************************************************
-; void WelsCopy16x16NotAligned_sse2(	uint8_t* Dst,
-;							int32_t  iStrideD,
-;							uint8_t* Src,
-;							int32_t  iStrideS )
-;***********************************************************************
-ALIGN 16
-; dst can be align with 16 bytes, but not sure about pSrc, 12/29/2011
-WelsCopy16x16NotAligned_sse2:
-	;push esi
-	;push edi
-	;push ebx
-
-	;mov edi, [esp+16]	; Dst
-	;mov eax, [esp+20]	; iStrideD
-	;mov esi, [esp+24]	; Src
-	;mov ecx, [esp+28]	; iStrideS
-	
-	push r4
-	push r5
-	%assign  push_num 2
-    LOAD_4_PARA
-
-	lea r4, [r1+2*r1]	;ebx, [eax+2*eax]	; x3
-	lea r5, [r3+2*r3]	;edx, [ecx+2*ecx]	; x3
-
-	movdqu xmm0, [r2]
-	movdqu xmm1, [r2+r3]
-	movdqu xmm2, [r2+2*r3]
-	movdqu xmm3, [r2+r5]
-	lea r2, [r2+4*r3]
-	movdqu xmm4, [r2]
-	movdqu xmm5, [r2+r3]
-	movdqu xmm6, [r2+2*r3]
-	movdqu xmm7, [r2+r5]
-	lea r2, [r2+4*r3]
-
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm2
-	movdqa [r0+r4], xmm3
-	lea r0, [r0+4*r1]
-	movdqa [r0], xmm4
-	movdqa [r0+r1], xmm5
-	movdqa [r0+2*r1], xmm6
-	movdqa [r0+r4], xmm7
-	lea r0, [r0+4*r1]
-
-	movdqu xmm0, [r2]
-	movdqu xmm1, [r2+r3]
-	movdqu xmm2, [r2+2*r3]
-	movdqu xmm3, [r2+r5]
-	lea r2, [r2+4*r3]
-	movdqu xmm4, [r2]
-	movdqu xmm5, [r2+r3]
-	movdqu xmm6, [r2+2*r3]
-	movdqu xmm7, [r2+r5]
-
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm2
-	movdqa [r0+r4], xmm3
-	lea r0, [r0+4*r1]
-	movdqa [r0], xmm4
-	movdqa [r0+r1], xmm5
-	movdqa [r0+2*r1], xmm6
-	movdqa [r0+r4], xmm7
-	LOAD_4_PARA_POP
-	pop r5
-	pop r4
-	ret
-
-; , 12/29/2011
-;***********************************************************************
-; void WelsCopy16x8NotAligned_sse2(uint8_t* Dst,
-;							int32_t  iStrideD,
-;							uint8_t* Src,
-;							int32_t  iStrideS )
-;***********************************************************************
-ALIGN 16
-WelsCopy16x8NotAligned_sse2:
-	;push esi
-	;push edi
-	;push ebx
-
-	;mov edi, [esp+16]	; Dst
-	;mov eax, [esp+20]	; iStrideD
-	;mov esi, [esp+24]	; Src
-	;mov ecx, [esp+28]	; iStrideS
-	
-	push r4
-	push r5
-	%assign  push_num 2
-    LOAD_4_PARA
-
-	lea r4, [r1+2*r1]	;ebx, [eax+2*eax]	; x3
-	lea r5, [r3+2*r3]	;edx, [ecx+2*ecx]	; x3
-
-	movdqu xmm0, [r2]
-	movdqu xmm1, [r2+r3]
-	movdqu xmm2, [r2+2*r3]
-	movdqu xmm3, [r2+r5]
-	lea r2, [r2+4*r3]
-	movdqu xmm4, [r2]
-	movdqu xmm5, [r2+r3]
-	movdqu xmm6, [r2+2*r3]
-	movdqu xmm7, [r2+r5]
-
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm2
-	movdqa [r0+r4], xmm3
-	lea r0, [r0+4*r1]
-	movdqa [r0], xmm4
-	movdqa [r0+r1], xmm5
-	movdqa [r0+2*r1], xmm6
-	movdqa [r0+r4], xmm7
-	LOAD_4_PARA_POP
-	pop r5
-	pop r4	
-	ret
-
-
-;***********************************************************************
-; void WelsCopy8x16_mmx(uint8_t* Dst,
-;                       int32_t  iStrideD,
-;                       uint8_t* Src,
-;                       int32_t  iStrideS )
-;***********************************************************************
-ALIGN 16
-WelsCopy8x16_mmx:
-	;push ebx
-
-	;mov eax, [esp + 8 ]           ;Dst
-	;mov ecx, [esp + 12]           ;iStrideD
-	;mov ebx, [esp + 16]           ;Src
-	;mov edx, [esp + 20]           ;iStrideS
-	
-	%assign  push_num 0
-    LOAD_4_PARA
-
-	movq mm0, [r2]
-	movq mm1, [r2+r3]
-	lea r2, [r2+2*r3]
-	movq mm2, [r2]
-	movq mm3, [r2+r3]
-	lea r2, [r2+2*r3]
-	movq mm4, [r2]
-	movq mm5, [r2+r3]
-	lea r2, [r2+2*r3]
-	movq mm6, [r2]
-	movq mm7, [r2+r3]
-	lea r2, [r2+2*r3]
-	
-	movq [r0], mm0
-	movq [r0+r1], mm1
-	lea r0, [r0+2*r1]
-	movq [r0], mm2
-	movq [r0+r1], mm3
-	lea r0, [r0+2*r1]
-	movq [r0], mm4
-	movq [r0+r1], mm5
-	lea r0, [r0+2*r1]
-	movq [r0], mm6
-	movq [r0+r1], mm7
-	lea r0, [r0+2*r1]
-
-	movq mm0, [r2]
-	movq mm1, [r2+r3]
-	lea r2, [r2+2*r3]
-	movq mm2, [r2]
-	movq mm3, [r2+r3]
-	lea r2, [r2+2*r3]
-	movq mm4, [r2]
-	movq mm5, [r2+r3]
-	lea r2, [r2+2*r3]
-	movq mm6, [r2]
-	movq mm7, [r2+r3]
-	
-	movq [r0], mm0
-	movq [r0+r1], mm1
-	lea r0, [r0+2*r1]
-	movq [r0], mm2
-	movq [r0+r1], mm3
-	lea r0, [r0+2*r1]
-	movq [r0], mm4
-	movq [r0+r1], mm5
-	lea r0, [r0+2*r1]
-	movq [r0], mm6
-	movq [r0+r1], mm7
-
-	WELSEMMS
-	LOAD_4_PARA_POP
-	ret
-
-;***********************************************************************
-; void WelsCopy8x8_mmx(  uint8_t* Dst,
-;                        int32_t  iStrideD,
-;                        uint8_t* Src,
-;                        int32_t  iStrideS )
-;***********************************************************************
-ALIGN 16
-WelsCopy8x8_mmx:
-	;push ebx
-	;push esi
-	;mov eax, [esp + 12]           ;Dst
-	;mov ecx, [esp + 16]           ;iStrideD
-	;mov esi, [esp + 20]           ;Src
-	;mov ebx, [esp + 24]           ;iStrideS
-	
-	push r4
-	%assign  push_num 1
-    LOAD_4_PARA
-	lea r4, [r3+2*r3]	;edx, [ebx+2*ebx]
-
-	; to prefetch next loop
-	prefetchnta [r2+2*r3]
-	prefetchnta [r2+r4]
-	movq mm0, [r2]
-	movq mm1, [r2+r3]
-	lea r2, [r2+2*r3]
-	; to prefetch next loop
-	prefetchnta [r2+2*r3]
-	prefetchnta [r2+r4]
-	movq mm2, [r2]
-	movq mm3, [r2+r3]
-	lea r2, [r2+2*r3]
-	; to prefetch next loop
-	prefetchnta [r2+2*r3]
-	prefetchnta [r2+r4]
-	movq mm4, [r2]
-	movq mm5, [r2+r3]
-	lea r2, [r2+2*r3]
-	movq mm6, [r2]
-	movq mm7, [r2+r3]
-
-	movq [r0], mm0
-	movq [r0+r1], mm1
-	lea r0, [r0+2*r1]
-	movq [r0], mm2
-	movq [r0+r1], mm3
-	lea r0, [r0+2*r1]
-	movq [r0], mm4
-	movq [r0+r1], mm5
-	lea r0, [r0+2*r1]
-	movq [r0], mm6
-	movq [r0+r1], mm7
-
-	WELSEMMS
-	;pop esi
-	;pop ebx	
-	LOAD_4_PARA_POP
-	pop r4
-	ret
-
-; (dunhuang@cisco), 12/21/2011
-;***********************************************************************
-; void UpdateMbMv_sse2( SMVUnitXY *pMvBuffer, const SMVUnitXY sMv )
-;***********************************************************************
-ALIGN 16
-UpdateMbMv_sse2:
-
-    %assign  push_num 0
-    LOAD_2_PARA
-    
-	;mov eax, [esp+4]	; mv_buffer
-	;movd xmm0, [esp+8]	; _mv
-	movd xmm0, r1d	; _mv
-	pshufd xmm1, xmm0, $0
-	movdqa [r0     ], xmm1
-	movdqa [r0+0x10], xmm1
-	movdqa [r0+0x20], xmm1
-	movdqa [r0+0x30], xmm1
-	ret
-
-;*******************************************************************************
-; Macros and other preprocessor constants
-;*******************************************************************************
-
-;*******************************************************************************
-; Local Data (Read Only)
-;*******************************************************************************
-
-;SECTION .rodata data align=16
-
-;*******************************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;*******************************************************************************
-
-ALIGN 16
-
-;*******************************************************************************
-; Code
-;*******************************************************************************
-
-SECTION .text
-
-WELS_EXTERN PixelAvgWidthEq4_mmx
-WELS_EXTERN PixelAvgWidthEq8_mmx
-WELS_EXTERN PixelAvgWidthEq16_sse2
-
-WELS_EXTERN McCopyWidthEq4_mmx
-WELS_EXTERN McCopyWidthEq8_mmx
-WELS_EXTERN McCopyWidthEq16_sse2
-
-
-ALIGN 16
-;*******************************************************************************
-; void_t PixelAvgWidthEq4_mmx( uint8_t *pDst,  int iDstStride,
-;                           uint8_t *pSrcA, int iSrcAStride,
-;                           uint8_t *pSrcB, int iSrcBStride,
-;                           int iHeight );
-;*******************************************************************************
-PixelAvgWidthEq4_mmx:
- 
-    %assign  push_num 0
-    LOAD_7_PARA
-
-%ifndef X86_32
-	movsx	r1, r1d
-	movsx	r3, r3d
-	movsx	r5, r5d
-	movsx	r6, r6d
-%endif
-
-ALIGN 4
-.height_loop:
-	movd        mm0, [r4]
-    pavgb       mm0, [r2]
-    movd        [r0], mm0
-
-    dec         r6
-    lea         r0, [r0+r1]
-    lea         r2, [r2+r3]
-    lea         r4, [r4+r5]
-    jne         .height_loop
-
-	WELSEMMS
-	LOAD_7_PARA_POP
-    ret
-
-
-ALIGN 16
-;*******************************************************************************
-; void_t PixelAvgWidthEq8_mmx( uint8_t *pDst,  int iDstStride,
-;                           uint8_t *pSrcA, int iSrcAStride,
-;                           uint8_t *pSrcB, int iSrcBStride,
-;                           int iHeight );
-;*******************************************************************************
-PixelAvgWidthEq8_mmx:
-
-    ;push        esi
-    ;push        edi
-    ;push        ebp
-    ;push        ebx
-
-    ;mov         edi, [esp+20]       ; pDst
-    ;mov         eax, [esp+24]       ; iDstStride
-    ;mov         esi, [esp+28]       ; pSrcA
-    ;mov         ecx, [esp+32]       ; iSrcAStride
-    ;mov         ebp, [esp+36]       ; pSrcB
-    ;mov         edx, [esp+40]       ; iSrcBStride
-    ;mov         ebx, [esp+44]       ; iHeight
-    
-    %assign  push_num 0
-    LOAD_7_PARA
-
-%ifndef X86_32
-	movsx	r1, r1d
-	movsx	r3, r3d
-	movsx	r5, r5d
-	movsx	r6, r6d
-%endif
-    
-ALIGN 4
-.height_loop:
-	movq        mm0, [r2]
-    pavgb       mm0, [r4]
-    movq        [r0], mm0
-    movq        mm0, [r2+r3]
-    pavgb       mm0, [r4+r5]
-    movq		[r0+r1], mm0
-
-    lea			r2,  [r2+2*r3]
-    lea			r4,  [r4+2*r5]
-    lea			r0,  [r0+2*r1]
-
-    sub         r6, 2
-    jnz         .height_loop
-
-	WELSEMMS
-	LOAD_7_PARA_POP
-    ret
-
-
-
-ALIGN 16
-;*******************************************************************************
-; void_t PixelAvgWidthEq16_sse2( uint8_t *pDst,  int iDstStride,
-;                          uint8_t *pSrcA, int iSrcAStride,
-;                          uint8_t *pSrcB, int iSrcBStride,
-;                          int iHeight );
-;*******************************************************************************
-PixelAvgWidthEq16_sse2:
-        
-    %assign  push_num 0
-    LOAD_7_PARA
-%ifndef X86_32
-	movsx	r1, r1d
-	movsx	r3, r3d
-	movsx	r5, r5d
-	movsx	r6, r6d
-%endif
-ALIGN 4
-.height_loop:
-	movdqu      xmm0, [r2]
-	movdqu	    xmm1, [r4]
-	pavgb	    xmm0, xmm1
-	;pavgb       xmm0, [r4]
-    movdqu      [r0], xmm0
-
-	movdqu      xmm0, [r2+r3]
-	movdqu      xmm1, [r4+r5]
-	pavgb	    xmm0, xmm1
-    movdqu      [r0+r1], xmm0
-
-	movdqu      xmm0, [r2+2*r3]
-	movdqu       xmm1, [r4+2*r5]
-	pavgb	    xmm0, xmm1
-    movdqu      [r0+2*r1], xmm0
-
-    lea         r2, [r2+2*r3]
-    lea			r4, [r4+2*r5]
-    lea			r0, [r0+2*r1]
-
-	movdqu      xmm0, [r2+r3]
-	movdqu      xmm1, [r4+r5]
-	pavgb	    xmm0, xmm1
-    movdqu      [r0+r1], xmm0
-
-    lea         r2, [r2+2*r3]
-    lea			r4, [r4+2*r5]
-    lea			r0, [r0+2*r1]
-    
-    sub         r6, 4
-    jne         .height_loop
-
-	WELSEMMS
-	LOAD_7_PARA_POP
-    ret
-
-ALIGN 16
-;*******************************************************************************
-;  void_t McCopyWidthEq4_mmx( uint8_t *pSrc, int iSrcStride,
-;                          uint8_t *pDst, int iDstStride, int iHeight )
-;*******************************************************************************
-McCopyWidthEq4_mmx:
-    ;push    esi
-    ;push    edi
-    ;push    ebx
-
-
-    ;mov esi,  [esp+16]
-    ;mov eax, [esp+20]
-    ;mov edi,  [esp+24]
-    ;mov ecx,  [esp+28]
-    ;mov edx,  [esp+32]
-    
-    push	r5
-    %assign  push_num 1
-    LOAD_5_PARA
-   
-%ifndef X86_32
-	movsx	r1, r1d
-	movsx	r3, r3d
-	movsx	r4, r4d
-%endif
-    
-ALIGN 4
-.height_loop:
-	mov r5d, [r0]
-	mov [r2], r5d
-
-	add r0, r1
-	add r2, r3
-	dec r4
-	jnz .height_loop
-	WELSEMMS
-    LOAD_5_PARA_POP
-    pop	   r5
-    ret
-
-ALIGN 16
-;*******************************************************************************
-;   void_t McCopyWidthEq8_mmx( uint8_t *pSrc, int iSrcStride,
-;                           uint8_t *pDst, int iDstStride, int iHeight )
-;*******************************************************************************
-McCopyWidthEq8_mmx:
-    ;push  esi
-    ;push  edi
-	;mov  esi, [esp+12]
-	;mov eax, [esp+16]
-	;mov edi, [esp+20]
-	;mov ecx, [esp+24]
-	;mov edx, [esp+28]
-	
-    %assign  push_num 0
-    LOAD_5_PARA
-
-%ifndef X86_32
-	movsx	r1, r1d
-	movsx	r3, r3d
-	movsx	r4, r4d
-%endif
-
-ALIGN 4
-.height_loop:
-	movq mm0, [r0]
-	movq [r2], mm0
-	add r0, r1
-	add r2, r3
-	dec r4
-	jnz .height_loop
-
-	WELSEMMS
-	LOAD_5_PARA_POP
-    ret
-
-
-ALIGN 16
-;*******************************************************************************
-;   void_t McCopyWidthEq16_sse2( uint8_t *pSrc, int iSrcStride, uint8_t *pDst, int iDstStride, int iHeight )
-;*******************************************************************************
-;read unaligned memory
-%macro SSE_READ_UNA 2
-	movq	%1, [%2]
-	movhps	%1,	[%2+8]
-%endmacro
-
-;write unaligned memory
-%macro SSE_WRITE_UNA 2
-	movq	[%1],	%2
-	movhps	[%1+8], %2
-%endmacro
-McCopyWidthEq16_sse2:
-    ;push    esi
-    ;push    edi
-
-    ;mov     esi, [esp+12]       ; pSrc
-    ;mov     eax, [esp+16]       ; iSrcStride
-    ;mov     edi, [esp+20]       ; pDst
-    ;mov     edx, [esp+24]       ; iDstStride
-    ;mov     ecx, [esp+28]       ; iHeight
-
-    %assign  push_num 0
-    LOAD_5_PARA
-%ifndef X86_32
-	movsx	r1, r1d
-	movsx	r3, r3d
-	movsx	r4, r4d
-%endif
-ALIGN 4
-.height_loop:
-    SSE_READ_UNA	xmm0, r0
-    SSE_READ_UNA	xmm1, r0+r1
-    SSE_WRITE_UNA	r2, xmm0
-    SSE_WRITE_UNA	r2+r3, xmm1
-
-	sub		r4,	2
-    lea     r0, [r0+r1*2]
-    lea     r2, [r2+r3*2]
-    jnz     .height_loop
-
-	LOAD_5_PARA_POP
-    ret
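For reference: the PixelAvgWidthEq* routines removed above (and re-added with unix newlines below) implement the rounded bi-prediction average, where pavgb computes (a + b + 1) >> 1 per unsigned byte; the McCopyWidthEq* routines are plain strided copies, with SSE_READ_UNA/SSE_WRITE_UNA splitting each unaligned 16-byte access into movq + movhps halves. A scalar sketch of the averaging step (PixelAvgRow is an illustrative name):

    #include <stdint.h>

    /* Per-byte rounding average: the scalar meaning of pavgb. */
    static void PixelAvgRow(uint8_t *pDst, const uint8_t *pSrcA,
                            const uint8_t *pSrcB, int iWidth) {
        for (int i = 0; i < iWidth; i++)
            pDst[i] = (uint8_t)((pSrcA[i] + pSrcB[i] + 1) >> 1);
    }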
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  mb_copy.asm
+;*
+;*  Abstract
+;*      mb_copy and mb_copy1
+;*
+;*  History
+;*      15/09/2009 Created
+;*		12/28/2009 Modified with larger throughput
+;*		12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2,
+;*				   WelsCopy16x8_mmx, WelsCopy8x16_mmx etc;
+;*
+;*
+;*********************************************************************************************/
+%include "asm_inc.asm"
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+WELS_EXTERN WelsCopy16x16_sse2
+WELS_EXTERN WelsCopy16x16NotAligned_sse2
+WELS_EXTERN WelsCopy8x8_mmx
+WELS_EXTERN WelsCopy16x8NotAligned_sse2	;
+WELS_EXTERN WelsCopy8x16_mmx		;
+WELS_EXTERN UpdateMbMv_sse2		;
+
+;***********************************************************************
+; void WelsCopy16x16_sse2(	uint8_t* Dst,
+;							int32_t  iStrideD,
+;							uint8_t* Src,
+;							int32_t  iStrideS )
+;***********************************************************************
+ALIGN 16
+WelsCopy16x16_sse2:
+
+	push r4
+	push r5
+	%assign  push_num 2
+    LOAD_4_PARA
+
+	lea r4, [r1+2*r1]	;ebx, [eax+2*eax]	; x3
+	lea r5, [r3+2*r3]	;edx, [ecx+2*ecx]	; x3
+
+	movdqa xmm0, [r2]
+	movdqa xmm1, [r2+r3]
+	movdqa xmm2, [r2+2*r3]
+	movdqa xmm3, [r2+r5]
+	lea r2, [r2+4*r3]
+	movdqa xmm4, [r2]
+	movdqa xmm5, [r2+r3]
+	movdqa xmm6, [r2+2*r3]
+	movdqa xmm7, [r2+r5]
+	lea r2, [r2+4*r3]
+
+	movdqa [r0], xmm0
+	movdqa [r0+r1], xmm1
+	movdqa [r0+2*r1], xmm2
+	movdqa [r0+r4], xmm3
+	lea r0, [r0+4*r1]
+	movdqa [r0], xmm4
+	movdqa [r0+r1], xmm5
+	movdqa [r0+2*r1], xmm6
+	movdqa [r0+r4], xmm7
+	lea r0, [r0+4*r1]
+
+	movdqa xmm0, [r2]
+	movdqa xmm1, [r2+r3]
+	movdqa xmm2, [r2+2*r3]
+	movdqa xmm3, [r2+r5]
+	lea r2, [r2+4*r3]
+	movdqa xmm4, [r2]
+	movdqa xmm5, [r2+r3]
+	movdqa xmm6, [r2+2*r3]
+	movdqa xmm7, [r2+r5]
+
+	movdqa [r0], xmm0
+	movdqa [r0+r1], xmm1
+	movdqa [r0+2*r1], xmm2
+	movdqa [r0+r4], xmm3
+	lea r0, [r0+4*r1]
+	movdqa [r0], xmm4
+	movdqa [r0+r1], xmm5
+	movdqa [r0+2*r1], xmm6
+	movdqa [r0+r4], xmm7
+	LOAD_4_PARA_POP
+	pop r5
+	pop r4
+	ret
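For reference: WelsCopy16x16_sse2 moves a 16x16 block between two independently strided buffers, four aligned rows per batch through xmm0-xmm7. A scalar equivalent (Copy16x16_c is an illustrative name):

    #include <stdint.h>
    #include <string.h>

    /* Scalar equivalent of the 16x16 strided block copy. */
    static void Copy16x16_c(uint8_t *pDst, int32_t iStrideD,
                            const uint8_t *pSrc, int32_t iStrideS) {
        for (int i = 0; i < 16; i++)
            memcpy(pDst + i * iStrideD, pSrc + i * iStrideS, 16);
    }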
+
+;***********************************************************************
+; void WelsCopy16x16NotAligned_sse2(	uint8_t* Dst,
+;							int32_t  iStrideD,
+;							uint8_t* Src,
+;							int32_t  iStrideS )
+;***********************************************************************
+ALIGN 16
+; dst can be aligned to 16 bytes, but pSrc alignment is not guaranteed, 12/29/2011
+WelsCopy16x16NotAligned_sse2:
+	;push esi
+	;push edi
+	;push ebx
+
+	;mov edi, [esp+16]	; Dst
+	;mov eax, [esp+20]	; iStrideD
+	;mov esi, [esp+24]	; Src
+	;mov ecx, [esp+28]	; iStrideS
+
+	push r4
+	push r5
+	%assign  push_num 2
+    LOAD_4_PARA
+
+	lea r4, [r1+2*r1]	;ebx, [eax+2*eax]	; x3
+	lea r5, [r3+2*r3]	;edx, [ecx+2*ecx]	; x3
+
+	movdqu xmm0, [r2]
+	movdqu xmm1, [r2+r3]
+	movdqu xmm2, [r2+2*r3]
+	movdqu xmm3, [r2+r5]
+	lea r2, [r2+4*r3]
+	movdqu xmm4, [r2]
+	movdqu xmm5, [r2+r3]
+	movdqu xmm6, [r2+2*r3]
+	movdqu xmm7, [r2+r5]
+	lea r2, [r2+4*r3]
+
+	movdqa [r0], xmm0
+	movdqa [r0+r1], xmm1
+	movdqa [r0+2*r1], xmm2
+	movdqa [r0+r4], xmm3
+	lea r0, [r0+4*r1]
+	movdqa [r0], xmm4
+	movdqa [r0+r1], xmm5
+	movdqa [r0+2*r1], xmm6
+	movdqa [r0+r4], xmm7
+	lea r0, [r0+4*r1]
+
+	movdqu xmm0, [r2]
+	movdqu xmm1, [r2+r3]
+	movdqu xmm2, [r2+2*r3]
+	movdqu xmm3, [r2+r5]
+	lea r2, [r2+4*r3]
+	movdqu xmm4, [r2]
+	movdqu xmm5, [r2+r3]
+	movdqu xmm6, [r2+2*r3]
+	movdqu xmm7, [r2+r5]
+
+	movdqa [r0], xmm0
+	movdqa [r0+r1], xmm1
+	movdqa [r0+2*r1], xmm2
+	movdqa [r0+r4], xmm3
+	lea r0, [r0+4*r1]
+	movdqa [r0], xmm4
+	movdqa [r0+r1], xmm5
+	movdqa [r0+2*r1], xmm6
+	movdqa [r0+r4], xmm7
+	LOAD_4_PARA_POP
+	pop r5
+	pop r4
+	ret
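For reference: the NotAligned variant differs only in loading the source with movdqu (no 16-byte alignment assumed) while still storing with movdqa, so Dst must remain 16-byte aligned, as the comment above the label notes. One row expressed in intrinsics (a sketch; CopyRow16NotAligned is an illustrative name):

    #include <emmintrin.h>
    #include <stdint.h>

    /* Unaligned load, aligned store, mirroring movdqu/movdqa above. */
    static void CopyRow16NotAligned(uint8_t *pDst, const uint8_t *pSrc) {
        __m128i v = _mm_loadu_si128((const __m128i *)pSrc);
        _mm_store_si128((__m128i *)pDst, v);
    }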
+
+; , 12/29/2011
+;***********************************************************************
+; void WelsCopy16x8NotAligned_sse2(uint8_t* Dst,
+;							int32_t  iStrideD,
+;							uint8_t* Src,
+;							int32_t  iStrideS )
+;***********************************************************************
+ALIGN 16
+WelsCopy16x8NotAligned_sse2:
+	;push esi
+	;push edi
+	;push ebx
+
+	;mov edi, [esp+16]	; Dst
+	;mov eax, [esp+20]	; iStrideD
+	;mov esi, [esp+24]	; Src
+	;mov ecx, [esp+28]	; iStrideS
+
+	push r4
+	push r5
+	%assign  push_num 2
+    LOAD_4_PARA
+
+	lea r4, [r1+2*r1]	;ebx, [eax+2*eax]	; x3
+	lea r5, [r3+2*r3]	;edx, [ecx+2*ecx]	; x3
+
+	movdqu xmm0, [r2]
+	movdqu xmm1, [r2+r3]
+	movdqu xmm2, [r2+2*r3]
+	movdqu xmm3, [r2+r5]
+	lea r2, [r2+4*r3]
+	movdqu xmm4, [r2]
+	movdqu xmm5, [r2+r3]
+	movdqu xmm6, [r2+2*r3]
+	movdqu xmm7, [r2+r5]
+
+	movdqa [r0], xmm0
+	movdqa [r0+r1], xmm1
+	movdqa [r0+2*r1], xmm2
+	movdqa [r0+r4], xmm3
+	lea r0, [r0+4*r1]
+	movdqa [r0], xmm4
+	movdqa [r0+r1], xmm5
+	movdqa [r0+2*r1], xmm6
+	movdqa [r0+r4], xmm7
+	LOAD_4_PARA_POP
+	pop r5
+	pop r4
+	ret
+
+
+;***********************************************************************
+; void WelsCopy8x16_mmx(uint8_t* Dst,
+;                       int32_t  iStrideD,
+;                       uint8_t* Src,
+;                       int32_t  iStrideS )
+;***********************************************************************
+ALIGN 16
+WelsCopy8x16_mmx:
+	;push ebx
+
+	;mov eax, [esp + 8 ]           ;Dst
+	;mov ecx, [esp + 12]           ;iStrideD
+	;mov ebx, [esp + 16]           ;Src
+	;mov edx, [esp + 20]           ;iStrideS
+
+	%assign  push_num 0
+    LOAD_4_PARA
+
+	movq mm0, [r2]
+	movq mm1, [r2+r3]
+	lea r2, [r2+2*r3]
+	movq mm2, [r2]
+	movq mm3, [r2+r3]
+	lea r2, [r2+2*r3]
+	movq mm4, [r2]
+	movq mm5, [r2+r3]
+	lea r2, [r2+2*r3]
+	movq mm6, [r2]
+	movq mm7, [r2+r3]
+	lea r2, [r2+2*r3]
+
+	movq [r0], mm0
+	movq [r0+r1], mm1
+	lea r0, [r0+2*r1]
+	movq [r0], mm2
+	movq [r0+r1], mm3
+	lea r0, [r0+2*r1]
+	movq [r0], mm4
+	movq [r0+r1], mm5
+	lea r0, [r0+2*r1]
+	movq [r0], mm6
+	movq [r0+r1], mm7
+	lea r0, [r0+2*r1]
+
+	movq mm0, [r2]
+	movq mm1, [r2+r3]
+	lea r2, [r2+2*r3]
+	movq mm2, [r2]
+	movq mm3, [r2+r3]
+	lea r2, [r2+2*r3]
+	movq mm4, [r2]
+	movq mm5, [r2+r3]
+	lea r2, [r2+2*r3]
+	movq mm6, [r2]
+	movq mm7, [r2+r3]
+
+	movq [r0], mm0
+	movq [r0+r1], mm1
+	lea r0, [r0+2*r1]
+	movq [r0], mm2
+	movq [r0+r1], mm3
+	lea r0, [r0+2*r1]
+	movq [r0], mm4
+	movq [r0+r1], mm5
+	lea r0, [r0+2*r1]
+	movq [r0], mm6
+	movq [r0+r1], mm7
+
+	WELSEMMS
+	LOAD_4_PARA_POP
+	ret
+
+;***********************************************************************
+; void WelsCopy8x8_mmx(  uint8_t* Dst,
+;                        int32_t  iStrideD,
+;                        uint8_t* Src,
+;                        int32_t  iStrideS )
+;***********************************************************************
+ALIGN 16
+WelsCopy8x8_mmx:
+	;push ebx
+	;push esi
+	;mov eax, [esp + 12]           ;Dst
+	;mov ecx, [esp + 16]           ;iStrideD
+	;mov esi, [esp + 20]           ;Src
+	;mov ebx, [esp + 24]           ;iStrideS
+
+	push r4
+	%assign  push_num 1
+    LOAD_4_PARA
+	lea r4, [r3+2*r3]	;edx, [ebx+2*ebx]
+
+	; to prefetch next loop
+	prefetchnta [r2+2*r3]
+	prefetchnta [r2+r4]
+	movq mm0, [r2]
+	movq mm1, [r2+r3]
+	lea r2, [r2+2*r3]
+	; to prefetch next loop
+	prefetchnta [r2+2*r3]
+	prefetchnta [r2+r4]
+	movq mm2, [r2]
+	movq mm3, [r2+r3]
+	lea r2, [r2+2*r3]
+	; to prefetch next loop
+	prefetchnta [r2+2*r3]
+	prefetchnta [r2+r4]
+	movq mm4, [r2]
+	movq mm5, [r2+r3]
+	lea r2, [r2+2*r3]
+	movq mm6, [r2]
+	movq mm7, [r2+r3]
+
+	movq [r0], mm0
+	movq [r0+r1], mm1
+	lea r0, [r0+2*r1]
+	movq [r0], mm2
+	movq [r0+r1], mm3
+	lea r0, [r0+2*r1]
+	movq [r0], mm4
+	movq [r0+r1], mm5
+	lea r0, [r0+2*r1]
+	movq [r0], mm6
+	movq [r0+r1], mm7
+
+	WELSEMMS
+	;pop esi
+	;pop ebx
+	LOAD_4_PARA_POP
+	pop r4
+	ret
+
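The prefetchnta pairs above pull the next two source rows toward the cache with a non-temporal hint before the current pair is read. Roughly the same idea expressed with the SSE prefetch intrinsic (a sketch, not code from this patch):

    #include <stdint.h>
    #include <xmmintrin.h>

    /* Hint the next two source rows before copying the current pair,
       approximately what the prefetchnta instructions above do. */
    static void CopyTwoRowsWithPrefetch(uint8_t *pDst, int32_t iStrideD,
                                        const uint8_t *pSrc, int32_t iStrideS) {
        _mm_prefetch((const char *)(pSrc + 2 * iStrideS), _MM_HINT_NTA);
        _mm_prefetch((const char *)(pSrc + 3 * iStrideS), _MM_HINT_NTA);
        for (int i = 0; i < 2; i++) {
            for (int j = 0; j < 8; j++) pDst[j] = pSrc[j]; /* 8-byte row */
            pDst += iStrideD;
            pSrc += iStrideS;
        }
    }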
+; (dunhuang@cisco), 12/21/2011
+;***********************************************************************
+; void UpdateMbMv_sse2( SMVUnitXY *pMvBuffer, const SMVUnitXY sMv )
+;***********************************************************************
+ALIGN 16
+UpdateMbMv_sse2:
+
+    %assign  push_num 0
+    LOAD_2_PARA
+
+	;mov eax, [esp+4]	; mv_buffer
+	;movd xmm0, [esp+8]	; _mv
+	movd xmm0, r1d	; _mv
+	pshufd xmm1, xmm0, $0
+	movdqa [r0     ], xmm1
+	movdqa [r0+0x10], xmm1
+	movdqa [r0+0x20], xmm1
+	movdqa [r0+0x30], xmm1
+	ret
+
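UpdateMbMv_sse2 broadcasts a single 4-byte motion vector to all 16 4x4-block slots of a macroblock: pshufd replicates the 32-bit value across the register, and four 16-byte stores fill the 64-byte buffer. A C sketch, assuming SMVUnitXY is a pair of int16_t components (the actual struct layout is not shown in this patch):

    #include <stdint.h>

    typedef struct { int16_t iMvX, iMvY; } SMVUnitXY;  /* assumed layout */

    /* Fill all 16 per-4x4-block MV entries with the same vector. */
    static void UpdateMbMv_c(SMVUnitXY *pMvBuffer, const SMVUnitXY sMv) {
        for (int i = 0; i < 16; i++)
            pMvBuffer[i] = sMv;
    }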
+;*******************************************************************************
+; Macros and other preprocessor constants
+;*******************************************************************************
+
+;*******************************************************************************
+; Local Data (Read Only)
+;*******************************************************************************
+
+;SECTION .rodata data align=16
+
+;*******************************************************************************
+; Various memory constants (trigonometric values or rounding values)
+;*******************************************************************************
+
+ALIGN 16
+
+;*******************************************************************************
+; Code
+;*******************************************************************************
+
+SECTION .text
+
+WELS_EXTERN PixelAvgWidthEq4_mmx
+WELS_EXTERN PixelAvgWidthEq8_mmx
+WELS_EXTERN PixelAvgWidthEq16_sse2
+
+WELS_EXTERN McCopyWidthEq4_mmx
+WELS_EXTERN McCopyWidthEq8_mmx
+WELS_EXTERN McCopyWidthEq16_sse2
+
+
+ALIGN 16
+;*******************************************************************************
+; void_t PixelAvgWidthEq4_mmx( uint8_t *pDst,  int iDstStride,
+;                           uint8_t *pSrcA, int iSrcAStride,
+;                           uint8_t *pSrcB, int iSrcBStride,
+;                           int iHeight );
+;*******************************************************************************
+PixelAvgWidthEq4_mmx:
+
+    %assign  push_num 0
+    LOAD_7_PARA
+
+%ifndef X86_32
+	movsx	r1, r1d
+	movsx	r3, r3d
+	movsx	r5, r5d
+	movsx	r6, r6d
+%endif
+
+ALIGN 4
+.height_loop:
+	movd        mm0, [r4]
+    pavgb       mm0, [r2]
+    movd        [r0], mm0
+
+    dec         r6
+    lea         r0, [r0+r1]
+    lea         r2, [r2+r3]
+    lea         r4, [r4+r5]
+    jne         .height_loop
+
+	WELSEMMS
+	LOAD_7_PARA_POP
+    ret
+
+
+ALIGN 16
+;*******************************************************************************
+; void_t PixelAvgWidthEq8_mmx( uint8_t *pDst,  int iDstStride,
+;                           uint8_t *pSrcA, int iSrcAStride,
+;                           uint8_t *pSrcB, int iSrcBStride,
+;                           int iHeight );
+;*******************************************************************************
+PixelAvgWidthEq8_mmx:
+
+    ;push        esi
+    ;push        edi
+    ;push        ebp
+    ;push        ebx
+
+    ;mov         edi, [esp+20]       ; pDst
+    ;mov         eax, [esp+24]       ; iDstStride
+    ;mov         esi, [esp+28]       ; pSrcA
+    ;mov         ecx, [esp+32]       ; iSrcAStride
+    ;mov         ebp, [esp+36]       ; pSrcB
+    ;mov         edx, [esp+40]       ; iSrcBStride
+    ;mov         ebx, [esp+44]       ; iHeight
+
+    %assign  push_num 0
+    LOAD_7_PARA
+
+%ifndef X86_32
+	movsx	r1, r1d
+	movsx	r3, r3d
+	movsx	r5, r5d
+	movsx	r6, r6d
+%endif
+
+ALIGN 4
+.height_loop:
+	movq        mm0, [r2]
+    pavgb       mm0, [r4]
+    movq        [r0], mm0
+    movq        mm0, [r2+r3]
+    pavgb       mm0, [r4+r5]
+    movq		[r0+r1], mm0
+
+    lea			r2,  [r2+2*r3]
+    lea			r4,  [r4+2*r5]
+    lea			r0,  [r0+2*r1]
+
+    sub         r6, 2
+    jnz         .height_loop
+
+	WELSEMMS
+	LOAD_7_PARA_POP
+    ret
+
+
+
+ALIGN 16
+;*******************************************************************************
+; void_t PixelAvgWidthEq16_sse2( uint8_t *pDst,  int iDstStride,
+;                          uint8_t *pSrcA, int iSrcAStride,
+;                          uint8_t *pSrcB, int iSrcBStride,
+;                          int iHeight );
+;*******************************************************************************
+PixelAvgWidthEq16_sse2:
+
+    %assign  push_num 0
+    LOAD_7_PARA
+%ifndef X86_32
+	movsx	r1, r1d
+	movsx	r3, r3d
+	movsx	r5, r5d
+	movsx	r6, r6d
+%endif
+ALIGN 4
+.height_loop:
+	movdqu      xmm0, [r2]
+	movdqu	    xmm1, [r4]
+	pavgb	    xmm0, xmm1
+	;pavgb       xmm0, [r4]
+    movdqu      [r0], xmm0
+
+	movdqu      xmm0, [r2+r3]
+	movdqu      xmm1, [r4+r5]
+	pavgb	    xmm0, xmm1
+    movdqu      [r0+r1], xmm0
+
+	movdqu      xmm0, [r2+2*r3]
+	movdqu       xmm1, [r4+2*r5]
+	pavgb	    xmm0, xmm1
+    movdqu      [r0+2*r1], xmm0
+
+    lea         r2, [r2+2*r3]
+    lea			r4, [r4+2*r5]
+    lea			r0, [r0+2*r1]
+
+	movdqu      xmm0, [r2+r3]
+	movdqu      xmm1, [r4+r5]
+	pavgb	    xmm0, xmm1
+    movdqu      [r0+r1], xmm0
+
+    lea         r2, [r2+2*r3]
+    lea			r4, [r4+2*r5]
+    lea			r0, [r0+2*r1]
+
+    sub         r6, 4
+    jne         .height_loop
+
+	WELSEMMS
+	LOAD_7_PARA_POP
+    ret
+
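The three PixelAvgWidthEq* routines above implement the bi-prediction average; pavgb computes a rounded average, (a + b + 1) >> 1, in every byte lane. A scalar model with illustrative names:

    #include <stdint.h>

    /* Rounded per-pixel average of two prediction blocks. */
    static void PixelAvg_c(uint8_t *pDst, int iDstStride,
                           const uint8_t *pSrcA, int iSrcAStride,
                           const uint8_t *pSrcB, int iSrcBStride,
                           int iWidth, int iHeight) {
        for (int y = 0; y < iHeight; y++) {
            for (int x = 0; x < iWidth; x++)
                pDst[x] = (uint8_t)((pSrcA[x] + pSrcB[x] + 1) >> 1);
            pDst += iDstStride;
            pSrcA += iSrcAStride;
            pSrcB += iSrcBStride;
        }
    }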
+ALIGN 16
+;*******************************************************************************
+;  void_t McCopyWidthEq4_mmx( uint8_t *pSrc, int iSrcStride,
+;                          uint8_t *pDst, int iDstStride, int iHeight )
+;*******************************************************************************
+McCopyWidthEq4_mmx:
+    ;push    esi
+    ;push    edi
+    ;push    ebx
+
+
+    ;mov esi,  [esp+16]
+    ;mov eax, [esp+20]
+    ;mov edi,  [esp+24]
+    ;mov ecx,  [esp+28]
+    ;mov edx,  [esp+32]
+
+    push	r5
+    %assign  push_num 1
+    LOAD_5_PARA
+
+%ifndef X86_32
+	movsx	r1, r1d
+	movsx	r3, r3d
+	movsx	r4, r4d
+%endif
+
+ALIGN 4
+.height_loop:
+	mov r5d, [r0]
+	mov [r2], r5d
+
+	add r0, r1
+	add r2, r3
+	dec r4
+	jnz .height_loop
+	WELSEMMS
+    LOAD_5_PARA_POP
+    pop	   r5
+    ret
+
+ALIGN 16
+;*******************************************************************************
+;   void_t McCopyWidthEq8_mmx( uint8_t *pSrc, int iSrcStride,
+;                           uint8_t *pDst, int iDstStride, int iHeight )
+;*******************************************************************************
+McCopyWidthEq8_mmx:
+    ;push  esi
+    ;push  edi
+	;mov  esi, [esp+12]
+	;mov eax, [esp+16]
+	;mov edi, [esp+20]
+	;mov ecx, [esp+24]
+	;mov edx, [esp+28]
+
+    %assign  push_num 0
+    LOAD_5_PARA
+
+%ifndef X86_32
+	movsx	r1, r1d
+	movsx	r3, r3d
+	movsx	r4, r4d
+%endif
+
+ALIGN 4
+.height_loop:
+	movq mm0, [r0]
+	movq [r2], mm0
+	add r0, r1
+	add r2, r3
+	dec r4
+	jnz .height_loop
+
+	WELSEMMS
+	LOAD_5_PARA_POP
+    ret
+
+
+ALIGN 16
+;*******************************************************************************
+;   void_t McCopyWidthEq16_sse2( uint8_t *pSrc, int iSrcStride, uint8_t *pDst, int iDstStride, int iHeight )
+;*******************************************************************************
+;read unaligned memory
+%macro SSE_READ_UNA 2
+	movq	%1, [%2]
+	movhps	%1,	[%2+8]
+%endmacro
+
+;write unaligned memory
+%macro SSE_WRITE_UNA 2
+	movq	[%1],	%2
+	movhps	[%1+8], %2
+%endmacro
+McCopyWidthEq16_sse2:
+    ;push    esi
+    ;push    edi
+
+    ;mov     esi, [esp+12]       ; pSrc
+    ;mov     eax, [esp+16]       ; iSrcStride
+    ;mov     edi, [esp+20]       ; pDst
+    ;mov     edx, [esp+24]       ; iDstStride
+    ;mov     ecx, [esp+28]       ; iHeight
+
+    %assign  push_num 0
+    LOAD_5_PARA
+%ifndef X86_32
+	movsx	r1, r1d
+	movsx	r3, r3d
+	movsx	r4, r4d
+%endif
+ALIGN 4
+.height_loop:
+    SSE_READ_UNA	xmm0, r0
+    SSE_READ_UNA	xmm1, r0+r1
+    SSE_WRITE_UNA	r2, xmm0
+    SSE_WRITE_UNA	r2+r3, xmm1
+
+	sub		r4,	2
+    lea     r0, [r0+r1*2]
+    lea     r2, [r2+r3*2]
+    jnz     .height_loop
+
+	LOAD_5_PARA_POP
+    ret
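The SSE_READ_UNA/SSE_WRITE_UNA macros above split one unaligned 16-byte access into two independent 8-byte halves (movq for the low half, movhps for the high half), which was generally cheaper than movdqu on the processors this code targeted. A trivial C model:

    #include <stdint.h>
    #include <string.h>

    /* 16-byte unaligned copy done as two 8-byte halves. */
    static void Copy16Una(uint8_t *pDst, const uint8_t *pSrc) {
        memcpy(pDst, pSrc, 8);          /* movq  : low 8 bytes  */
        memcpy(pDst + 8, pSrc + 8, 8);  /* movhps: high 8 bytes */
    }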
--- a/codec/common/mc_chroma.asm
+++ b/codec/common/mc_chroma.asm
@@ -1,345 +1,345 @@
-;*!
-;* \copy
-;*     Copyright (c)  2004-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  mc_chroma.asm
-;*
-;*  Abstract
-;*      mmx motion compensation for chroma
-;*
-;*  History
-;*      10/13/2004 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-SECTION .rodata align=16
-
-;***********************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;***********************************************************************
-
-ALIGN 16
-h264_d0x20_sse2:
-	dw 32,32,32,32,32,32,32,32
-ALIGN 16
-h264_d0x20_mmx:
-	dw 32,32,32,32
-
-
-;=============================================================================
-; Code
-;=============================================================================
-
-SECTION .text
-
-ALIGN 16
-;*******************************************************************************
-; void McChromaWidthEq4_mmx( uint8_t *src,
-;							int32_t iSrcStride,
-;							uint8_t *pDst,
-;							int32_t iDstStride,
-;							uint8_t *pABCD,
-;							int32_t iHeigh );
-;*******************************************************************************
-WELS_EXTERN McChromaWidthEq4_mmx
-McChromaWidthEq4_mmx:
-	;push esi
-	;push edi
-	;push ebx
-	
-	%assign  push_num 0
-	LOAD_6_PARA 
-%ifndef X86_32
-	movsx	r1, r1d
-	movsx	r3, r3d
-	movsx	r5, r5d
-%endif
-	
-	;mov eax, [esp +12 + 20]
-	
-	movd mm3, [r4];	[eax]
-	WELS_Zero mm7
-	punpcklbw mm3, mm3
-	movq      mm4, mm3
-	punpcklwd mm3, mm3
-	punpckhwd mm4, mm4
-
-	movq	  mm5, mm3
-	punpcklbw mm3, mm7
-	punpckhbw mm5, mm7
-
-	movq	  mm6, mm4
-	punpcklbw mm4, mm7
-	punpckhbw mm6, mm7
-
-	;mov esi, [esp +12+ 4]
-	;mov eax, [esp + 12 + 8]
-	;mov edi, [esp + 12 + 12]
-	;mov edx, [esp + 12 + 16]
-    ;mov ecx, [esp + 12 + 24]
-
-	lea r4, [r0 + r1] ;lea ebx, [esi + eax]
-	movd mm0, [r0]
-	movd mm1, [r0+1]
-	punpcklbw mm0, mm7
-	punpcklbw mm1, mm7
-.xloop:
-
-	pmullw mm0, mm3
-	pmullw mm1, mm5
-	paddw  mm0, mm1
-
-	movd  mm1, [r4]
-	punpcklbw mm1, mm7
-	movq mm2, mm1
-	pmullw mm1, mm4
-	paddw mm0, mm1
-
-	movd mm1, [r4+1]
-	punpcklbw mm1, mm7
-	movq mm7, mm1
-	pmullw mm1,mm6
-	paddw mm0, mm1
-	movq mm1,mm7
-
-	paddw mm0, [h264_d0x20_mmx]
-	psrlw mm0, 6
-
-	WELS_Zero mm7
-	packuswb mm0, mm7
-	movd [r2], mm0
-
-	movq mm0, mm2
-
-	lea r2, [r2 + r3]
-	lea r4, [r4 + r1]
-
-	dec r5
-	jnz near .xloop
-	WELSEMMS
-	LOAD_6_PARA_POP
-	;pop ebx
-	;pop edi
-	;pop esi
-	ret
-
-
-ALIGN 16
-;*******************************************************************************
-; void McChromaWidthEq8_sse2( uint8_t *pSrc,
-;						int32_t iSrcStride,
-;						uint8_t *pDst,
-;						int32_t iDstStride,
-;						uint8_t *pABCD,
-;						int32_t iheigh );
-;*******************************************************************************
-WELS_EXTERN McChromaWidthEq8_sse2
-McChromaWidthEq8_sse2:
-	;push esi
-	;push edi
-	;push ebx
-
-	%assign  push_num 0
-	LOAD_6_PARA 	
-%ifndef X86_32
-	movsx	r1, r1d
-	movsx	r3, r3d
-	movsx	r5, r5d
-%endif
-
-	;mov eax, [esp +12 + 20]
-	movd xmm3, [r4]
-	WELS_Zero xmm7
-	punpcklbw  xmm3, xmm3
-	punpcklwd  xmm3, xmm3
-
-	movdqa	   xmm4, xmm3
-	punpckldq  xmm3, xmm3
-	punpckhdq  xmm4, xmm4
-	movdqa     xmm5, xmm3
-	movdqa	   xmm6, xmm4
-
-	punpcklbw  xmm3, xmm7
-	punpckhbw  xmm5, xmm7
-	punpcklbw  xmm4, xmm7
-	punpckhbw  xmm6, xmm7
-
-	;mov esi, [esp +12+ 4]
-	;mov eax, [esp + 12 + 8]
-	;mov edi, [esp + 12 + 12]
-	;mov edx, [esp + 12 + 16]
-    ;mov ecx, [esp + 12 + 24]
-
-	lea r4, [r0 + r1] ;lea ebx, [esi + eax]
-	movq xmm0, [r0]
-	movq xmm1, [r0+1]
-	punpcklbw xmm0, xmm7
-	punpcklbw xmm1, xmm7
-.xloop:
-
-	pmullw xmm0, xmm3
-	pmullw xmm1, xmm5
-	paddw  xmm0, xmm1
-
-	movq  xmm1, [r4]
-	punpcklbw xmm1, xmm7
-	movdqa xmm2, xmm1
-	pmullw xmm1, xmm4
-	paddw xmm0, xmm1
-
-	movq xmm1, [r4+1]
-	punpcklbw xmm1, xmm7
-	movdqa xmm7, xmm1
-	pmullw xmm1, xmm6
-	paddw xmm0, xmm1
-	movdqa xmm1,xmm7
-
-	paddw xmm0, [h264_d0x20_sse2]
-	psrlw xmm0, 6
-
-	WELS_Zero xmm7
-	packuswb xmm0, xmm7
-	movq [r2], xmm0
-
-	movdqa xmm0, xmm2
-
-	lea r2, [r2 + r3]
-	lea r4, [r4 + r1]
-
-	dec r5
-	jnz near .xloop
-	
-	LOAD_6_PARA_POP
-
-	;pop ebx
-	;pop edi
-	;pop esi
-	ret
-
-
-
-
-ALIGN 16
-;***********************************************************************
-; void McChromaWidthEq8_ssse3( uint8_t *pSrc,
-;						 int32_t iSrcStride,
-;                        uint8_t *pDst,
-;                        int32_t iDstStride,
-;                        uint8_t *pABCD,
-;					     int32_t iHeigh);
-;***********************************************************************
-WELS_EXTERN McChromaWidthEq8_ssse3
-McChromaWidthEq8_ssse3:
-	;push ebx
-	;push esi
-	;push edi
-	%assign  push_num 0
-	LOAD_6_PARA
-%ifndef X86_32
-	movsx	r1, r1d
-	movsx	r3, r3d
-	movsx	r5, r5d
-%endif
-	
-	;mov eax, [esp + 12 + 20]
-
-    pxor      xmm7, xmm7
-    movd   xmm5, [r4]
-    punpcklwd xmm5, xmm5
-    punpckldq xmm5, xmm5
-    movdqa    xmm6, xmm5
-    punpcklqdq xmm5, xmm5
-    punpckhqdq xmm6, xmm6
-
-	;mov eax, [esp + 12 + 4]
-	;mov edx, [esp + 12 + 8]
-	;mov esi, [esp + 12 + 12]
-	;mov edi, [esp + 12 + 16]
-    ;mov ecx, [esp + 12 + 24]
-
-    sub r2, r3 ;sub esi, edi
-    sub r2, r3
-	movdqa xmm7, [h264_d0x20_sse2]
-
-	movdqu xmm0, [r0]
-	movdqa xmm1, xmm0
-	psrldq xmm1, 1
-	punpcklbw xmm0, xmm1
-
-.hloop_chroma:
-	lea	r2, [r2+2*r3]
-
-	movdqu xmm2, [r0+r1]
-	movdqa xmm3, xmm2
-	psrldq xmm3, 1
-	punpcklbw xmm2, xmm3
-	movdqa      xmm4, xmm2
-
-    pmaddubsw  xmm0, xmm5
-    pmaddubsw  xmm2, xmm6
-    paddw      xmm0, xmm2
-    paddw      xmm0, xmm7
-	psrlw      xmm0, 6
-    packuswb   xmm0, xmm0
-    movq       [r2],xmm0
-
-    lea r0, [r0+2*r1]
-    movdqu xmm2, [r0]
-    movdqa xmm3, xmm2
-    psrldq xmm3, 1
-    punpcklbw xmm2, xmm3
-    movdqa      xmm0, xmm2
-
-    pmaddubsw  xmm4, xmm5
-    pmaddubsw  xmm2, xmm6
-    paddw      xmm4, xmm2
-    paddw      xmm4, xmm7
-	psrlw      xmm4, 6
-    packuswb   xmm4, xmm4
-    movq       [r2+r3],xmm4
-
-	sub r5, 2
-	jnz .hloop_chroma
-	
-	LOAD_6_PARA_POP
-	
-	;pop edi
-	;pop esi
-	;pop ebx
-
-	ret
-
-
+;*!
+;* \copy
+;*     Copyright (c)  2004-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  mc_chroma.asm
+;*
+;*  Abstract
+;*      mmx motion compensation for chroma
+;*
+;*  History
+;*      10/13/2004 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+SECTION .rodata align=16
+
+;***********************************************************************
+; Various memory constants (trigonometric values or rounding values)
+;***********************************************************************
+
+ALIGN 16
+h264_d0x20_sse2:
+	dw 32,32,32,32,32,32,32,32
+ALIGN 16
+h264_d0x20_mmx:
+	dw 32,32,32,32
+
+
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text
+
+ALIGN 16
+;*******************************************************************************
+; void McChromaWidthEq4_mmx( uint8_t *src,
+;							int32_t iSrcStride,
+;							uint8_t *pDst,
+;							int32_t iDstStride,
+;							uint8_t *pABCD,
+;							int32_t iHeight );
+;*******************************************************************************
+WELS_EXTERN McChromaWidthEq4_mmx
+McChromaWidthEq4_mmx:
+	;push esi
+	;push edi
+	;push ebx
+
+	%assign  push_num 0
+	LOAD_6_PARA
+%ifndef X86_32
+	movsx	r1, r1d
+	movsx	r3, r3d
+	movsx	r5, r5d
+%endif
+
+	;mov eax, [esp +12 + 20]
+
+	movd mm3, [r4];	[eax]
+	WELS_Zero mm7
+	punpcklbw mm3, mm3
+	movq      mm4, mm3
+	punpcklwd mm3, mm3
+	punpckhwd mm4, mm4
+
+	movq	  mm5, mm3
+	punpcklbw mm3, mm7
+	punpckhbw mm5, mm7
+
+	movq	  mm6, mm4
+	punpcklbw mm4, mm7
+	punpckhbw mm6, mm7
+
+	;mov esi, [esp +12+ 4]
+	;mov eax, [esp + 12 + 8]
+	;mov edi, [esp + 12 + 12]
+	;mov edx, [esp + 12 + 16]
+    ;mov ecx, [esp + 12 + 24]
+
+	lea r4, [r0 + r1] ;lea ebx, [esi + eax]
+	movd mm0, [r0]
+	movd mm1, [r0+1]
+	punpcklbw mm0, mm7
+	punpcklbw mm1, mm7
+.xloop:
+
+	pmullw mm0, mm3
+	pmullw mm1, mm5
+	paddw  mm0, mm1
+
+	movd  mm1, [r4]
+	punpcklbw mm1, mm7
+	movq mm2, mm1
+	pmullw mm1, mm4
+	paddw mm0, mm1
+
+	movd mm1, [r4+1]
+	punpcklbw mm1, mm7
+	movq mm7, mm1
+	pmullw mm1,mm6
+	paddw mm0, mm1
+	movq mm1,mm7
+
+	paddw mm0, [h264_d0x20_mmx]
+	psrlw mm0, 6
+
+	WELS_Zero mm7
+	packuswb mm0, mm7
+	movd [r2], mm0
+
+	movq mm0, mm2
+
+	lea r2, [r2 + r3]
+	lea r4, [r4 + r1]
+
+	dec r5
+	jnz near .xloop
+	WELSEMMS
+	LOAD_6_PARA_POP
+	;pop ebx
+	;pop edi
+	;pop esi
+	ret
+
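McChromaWidthEq4_mmx above is the standard H.264 bilinear chroma interpolation: pABCD is taken to hold the four weights A, B, C, D (summing to 64, derived from the fractional MV offsets) as consecutive bytes, and each output pixel is the weighted average of a 2x2 neighbourhood, rounded by 32 and shifted down by 6. A scalar sketch under that layout assumption:

    #include <stdint.h>

    /* Bilinear chroma MC; pABCD assumed to hold bytes A,B,C,D, A+B+C+D == 64. */
    static void McChroma_c(const uint8_t *pSrc, int32_t iSrcStride,
                           uint8_t *pDst, int32_t iDstStride,
                           const uint8_t *pABCD, int32_t iWidth, int32_t iHeight) {
        const int A = pABCD[0], B = pABCD[1], C = pABCD[2], D = pABCD[3];
        for (int y = 0; y < iHeight; y++) {
            for (int x = 0; x < iWidth; x++)
                pDst[x] = (uint8_t)((A * pSrc[x] + B * pSrc[x + 1] +
                                     C * pSrc[x + iSrcStride] +
                                     D * pSrc[x + iSrcStride + 1] + 32) >> 6);
            pSrc += iSrcStride;
            pDst += iDstStride;
        }
    }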
+
+ALIGN 16
+;*******************************************************************************
+; void McChromaWidthEq8_sse2( uint8_t *pSrc,
+;						int32_t iSrcStride,
+;						uint8_t *pDst,
+;						int32_t iDstStride,
+;						uint8_t *pABCD,
+;						int32_t iHeight );
+;*******************************************************************************
+WELS_EXTERN McChromaWidthEq8_sse2
+McChromaWidthEq8_sse2:
+	;push esi
+	;push edi
+	;push ebx
+
+	%assign  push_num 0
+	LOAD_6_PARA
+%ifndef X86_32
+	movsx	r1, r1d
+	movsx	r3, r3d
+	movsx	r5, r5d
+%endif
+
+	;mov eax, [esp +12 + 20]
+	movd xmm3, [r4]
+	WELS_Zero xmm7
+	punpcklbw  xmm3, xmm3
+	punpcklwd  xmm3, xmm3
+
+	movdqa	   xmm4, xmm3
+	punpckldq  xmm3, xmm3
+	punpckhdq  xmm4, xmm4
+	movdqa     xmm5, xmm3
+	movdqa	   xmm6, xmm4
+
+	punpcklbw  xmm3, xmm7
+	punpckhbw  xmm5, xmm7
+	punpcklbw  xmm4, xmm7
+	punpckhbw  xmm6, xmm7
+
+	;mov esi, [esp +12+ 4]
+	;mov eax, [esp + 12 + 8]
+	;mov edi, [esp + 12 + 12]
+	;mov edx, [esp + 12 + 16]
+    ;mov ecx, [esp + 12 + 24]
+
+	lea r4, [r0 + r1] ;lea ebx, [esi + eax]
+	movq xmm0, [r0]
+	movq xmm1, [r0+1]
+	punpcklbw xmm0, xmm7
+	punpcklbw xmm1, xmm7
+.xloop:
+
+	pmullw xmm0, xmm3
+	pmullw xmm1, xmm5
+	paddw  xmm0, xmm1
+
+	movq  xmm1, [r4]
+	punpcklbw xmm1, xmm7
+	movdqa xmm2, xmm1
+	pmullw xmm1, xmm4
+	paddw xmm0, xmm1
+
+	movq xmm1, [r4+1]
+	punpcklbw xmm1, xmm7
+	movdqa xmm7, xmm1
+	pmullw xmm1, xmm6
+	paddw xmm0, xmm1
+	movdqa xmm1,xmm7
+
+	paddw xmm0, [h264_d0x20_sse2]
+	psrlw xmm0, 6
+
+	WELS_Zero xmm7
+	packuswb xmm0, xmm7
+	movq [r2], xmm0
+
+	movdqa xmm0, xmm2
+
+	lea r2, [r2 + r3]
+	lea r4, [r4 + r1]
+
+	dec r5
+	jnz near .xloop
+
+	LOAD_6_PARA_POP
+
+	;pop ebx
+	;pop edi
+	;pop esi
+	ret
+
+
+
+
+ALIGN 16
+;***********************************************************************
+; void McChromaWidthEq8_ssse3( uint8_t *pSrc,
+;						 int32_t iSrcStride,
+;                        uint8_t *pDst,
+;                        int32_t iDstStride,
+;                        uint8_t *pABCD,
+;					     int32_t iHeight);
+;***********************************************************************
+WELS_EXTERN McChromaWidthEq8_ssse3
+McChromaWidthEq8_ssse3:
+	;push ebx
+	;push esi
+	;push edi
+	%assign  push_num 0
+	LOAD_6_PARA
+%ifndef X86_32
+	movsx	r1, r1d
+	movsx	r3, r3d
+	movsx	r5, r5d
+%endif
+
+	;mov eax, [esp + 12 + 20]
+
+    pxor      xmm7, xmm7
+    movd   xmm5, [r4]
+    punpcklwd xmm5, xmm5
+    punpckldq xmm5, xmm5
+    movdqa    xmm6, xmm5
+    punpcklqdq xmm5, xmm5
+    punpckhqdq xmm6, xmm6
+
+	;mov eax, [esp + 12 + 4]
+	;mov edx, [esp + 12 + 8]
+	;mov esi, [esp + 12 + 12]
+	;mov edi, [esp + 12 + 16]
+    ;mov ecx, [esp + 12 + 24]
+
+    sub r2, r3 ;sub esi, edi
+    sub r2, r3
+	movdqa xmm7, [h264_d0x20_sse2]
+
+	movdqu xmm0, [r0]
+	movdqa xmm1, xmm0
+	psrldq xmm1, 1
+	punpcklbw xmm0, xmm1
+
+.hloop_chroma:
+	lea	r2, [r2+2*r3]
+
+	movdqu xmm2, [r0+r1]
+	movdqa xmm3, xmm2
+	psrldq xmm3, 1
+	punpcklbw xmm2, xmm3
+	movdqa      xmm4, xmm2
+
+    pmaddubsw  xmm0, xmm5
+    pmaddubsw  xmm2, xmm6
+    paddw      xmm0, xmm2
+    paddw      xmm0, xmm7
+	psrlw      xmm0, 6
+    packuswb   xmm0, xmm0
+    movq       [r2],xmm0
+
+    lea r0, [r0+2*r1]
+    movdqu xmm2, [r0]
+    movdqa xmm3, xmm2
+    psrldq xmm3, 1
+    punpcklbw xmm2, xmm3
+    movdqa      xmm0, xmm2
+
+    pmaddubsw  xmm4, xmm5
+    pmaddubsw  xmm2, xmm6
+    paddw      xmm4, xmm2
+    paddw      xmm4, xmm7
+	psrlw      xmm4, 6
+    packuswb   xmm4, xmm4
+    movq       [r2+r3],xmm4
+
+	sub r5, 2
+	jnz .hloop_chroma
+
+	LOAD_6_PARA_POP
+
+	;pop edi
+	;pop esi
+	;pop ebx
+
+	ret
+
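The SSSE3 variant above interleaves each source pixel with its right neighbour (psrldq by 1 followed by punpcklbw), so a single pmaddubsw per row applies a whole (A,B) or (C,D) weight pair. pmaddubsw multiplies unsigned bytes by signed bytes and adds adjacent products into saturated 16-bit lanes; a scalar model of that instruction (illustrative only):

    #include <stdint.h>

    /* Model of pmaddubsw as used above: a[] holds interleaved source
       pixels (unsigned), b[] interleaved weights (signed). */
    static void Maddubs_c(int16_t res[8], const uint8_t a[16], const int8_t b[16]) {
        for (int i = 0; i < 8; i++) {
            int v = a[2 * i] * b[2 * i] + a[2 * i + 1] * b[2 * i + 1];
            if (v > 32767) v = 32767;     /* pmaddubsw saturates */
            if (v < -32768) v = -32768;
            res[i] = (int16_t)v;
        }
    }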
+
--- a/codec/common/mc_luma.asm
+++ b/codec/common/mc_luma.asm
@@ -1,1293 +1,1293 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  mc_luma.asm
-;*
-;*  Abstract
-;*      sse2 motion compensation
-;*
-;*  History
-;*      17/08/2009 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-;*******************************************************************************
-; Local Data (Read Only)
-;*******************************************************************************
-%ifdef FORMAT_COFF
-SECTION .rodata pData
-%else
-SECTION .rodata align=16
-%endif
-
-;*******************************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;*******************************************************************************
-
-ALIGN 16
-h264_w0x10:
-	dw 16, 16, 16, 16	
-ALIGN 16
-h264_w0x10_1:
-	dw 16, 16, 16, 16, 16, 16, 16, 16
-ALIGN 16
-h264_mc_hc_32:
-	dw 32, 32, 32, 32, 32, 32, 32, 32
-
-
-;*******************************************************************************
-; Code
-;*******************************************************************************
-
-SECTION .text
-
-WELS_EXTERN McHorVer20WidthEq4_mmx
-
-
-ALIGN 16
-;*******************************************************************************
-; void_t McHorVer20WidthEq4_mmx( uint8_t *pSrc,
-;                       int iSrcStride,
-;						uint8_t *pDst,
-;						int iDstStride,
-;						int iHeight)
-;*******************************************************************************
-McHorVer20WidthEq4_mmx:
-	;push esi
-	;push edi
-
-	;mov esi, [esp+12]
-	;mov eax, [esp+16]
-	;mov edi, [esp+20]
-	;mov ecx, [esp+24]
-	;mov edx, [esp+28]
-	
-    %assign  push_num 0
-    LOAD_5_PARA
-%ifndef X86_32
-	movsx	r1, r1d
-	movsx	r3, r3d
-	movsx	r4, r4d
-%endif
-    
-	sub r0, 2
-	WELS_Zero mm7
-	movq mm6, [h264_w0x10]
-.height_loop:
-	movd mm0, [r0]
-	punpcklbw mm0, mm7
-	movd mm1, [r0+5]
-	punpcklbw mm1, mm7
-	movd mm2, [r0+1]
-	punpcklbw mm2, mm7
-	movd mm3, [r0+4]
-	punpcklbw mm3, mm7
-	movd mm4, [r0+2]
-	punpcklbw mm4, mm7
-	movd mm5, [r0+3]
-	punpcklbw mm5, mm7
-
-	paddw mm2, mm3
-	paddw mm4, mm5
-	psllw mm4, 2
-	psubw mm4, mm2
-	paddw mm0, mm1
-	paddw mm0, mm4
-	psllw mm4, 2
-	paddw mm0, mm4
-	paddw mm0, mm6
-	psraw mm0, 5
-	packuswb mm0, mm7
-	movd [r2], mm0
-
-	add r0, r1
-	add r2, r3
-	dec r4
-	jnz .height_loop
-
-	WELSEMMS
-	LOAD_5_PARA_POP
-	ret
-
-;*******************************************************************************
-; Macros and other preprocessor constants
-;*******************************************************************************
-
-
-%macro SSE_LOAD_8P 3
-	movq %1, %3
-	punpcklbw %1, %2
-%endmacro
-
-%macro FILTER_HV_W8 9
-	paddw	%1, %6
-	movdqa	%8, %3
-	movdqa	%7, %2
-	paddw	%1, [h264_w0x10_1]
-	paddw	%8, %4
-	paddw	%7, %5
-	psllw	%8, 2
-	psubw	%8, %7
-	paddw	%1, %8
-	psllw	%8, 2
-	paddw	%1, %8
-	psraw   %1, 5
-	WELS_Zero %8
-	packuswb %1, %8
-	movq    %9, %1
-%endmacro
-
-;*******************************************************************************
-; Code
-;*******************************************************************************
-
-SECTION .text
-WELS_EXTERN McHorVer22Width8HorFirst_sse2
-WELS_EXTERN McHorVer02WidthEq8_sse2
-WELS_EXTERN McHorVer20WidthEq8_sse2
-WELS_EXTERN McHorVer20WidthEq16_sse2
-
-ALIGN 16
-;***********************************************************************
-; void_t McHorVer22Width8HorFirst_sse2(int16_t *pSrc,
-;                       int16_t iSrcStride,
-;						uint8_t *pDst,
-;						int32_t iDstStride
-;						int32_t iHeight
-;                       )
-;***********************************************************************
-McHorVer22Width8HorFirst_sse2:
-	;push esi
-	;push edi
-	;push ebx
-	;mov esi, [esp+16]     ;pSrc
-	;mov eax, [esp+20]	;iSrcStride
-	;mov edi, [esp+24]		;pDst
-	;mov edx, [esp+28]	;iDstStride
-	;mov ebx, [esp+32]	;iHeight
-	
-	%assign  push_num 0
-    LOAD_5_PARA
-%ifndef X86_32
-	movsx	r1, r1d
-	movsx	r3, r3d
-	movsx	r4, r4d
-%endif
-	pxor xmm7, xmm7
-
-	sub r0, r1				;;;;;;;;need more 5 lines.
-	sub r0, r1
-
-.yloop_width_8:
-	movq xmm0, [r0]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [r0+5]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [r0+1]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [r0+4]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [r0+2]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [r0+3]
-	punpcklbw xmm5, xmm7
-
-	paddw xmm2, xmm3
-	paddw xmm4, xmm5
-	psllw xmm4, 2
-	psubw xmm4, xmm2
-	paddw xmm0, xmm1
-	paddw xmm0, xmm4
-	psllw xmm4, 2
-	paddw xmm0, xmm4
-	movdqa [r2], xmm0
-
-	add r0, r1
-	add r2, r3
-	dec r4
-	jnz .yloop_width_8
-	LOAD_5_PARA_POP
-	ret
-
-ALIGN 16
-;*******************************************************************************
-; void_t McHorVer20WidthEq8_sse2(  uint8_t *pSrc,
-;                       int iSrcStride,
-;												uint8_t *pDst,
-;												int iDstStride,
-;												int iHeight,
-;                      );
-;*******************************************************************************
-McHorVer20WidthEq8_sse2:
-	;push	esi
-	;push	edi
-
-	;mov esi, [esp + 12]         ;pSrc
-	;mov eax, [esp + 16]         ;iSrcStride
-	;mov edi, [esp + 20]         ;pDst
-	;mov ecx, [esp + 28]         ;iHeight
-	;mov edx, [esp + 24]			;iDstStride
-	
-	%assign  push_num 0
-    LOAD_5_PARA
-%ifndef X86_32
-	movsx	r1, r1d
-	movsx	r3, r3d
-	movsx	r4, r4d
-%endif
-	lea r0, [r0-2]            ;pSrc -= 2;
-
-	pxor xmm7, xmm7
-	movdqa xmm6, [h264_w0x10_1]
-.y_loop:
-	movq xmm0, [r0]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [r0+5]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [r0+1]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [r0+4]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [r0+2]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [r0+3]
-	punpcklbw xmm5, xmm7
-
-	paddw xmm2, xmm3
-	paddw xmm4, xmm5
-	psllw xmm4, 2
-	psubw xmm4, xmm2
-	paddw xmm0, xmm1
-	paddw xmm0, xmm4
-	psllw xmm4, 2
-	paddw xmm0, xmm4
-	paddw xmm0, xmm6
-	psraw xmm0, 5
-
-	packuswb xmm0, xmm7
-	movq [r2], xmm0
-
-	lea r2, [r2+r3]
-	lea r0, [r0+r1]
-	dec r4
-	jnz near .y_loop
-
-	LOAD_5_PARA_POP
-	ret
-
-ALIGN 16
-;*******************************************************************************
-; void_t McHorVer20WidthEq16_sse2(  uint8_t *pSrc,
-;                       int iSrcStride,
-;												uint8_t *pDst,
-;												int iDstStride,
-;												int iHeight,
-;                      );
-;*******************************************************************************
-McHorVer20WidthEq16_sse2:
-	;push	esi
-	;push	edi
-	;mov esi, [esp + 12]         ;pSrc
-	;mov eax, [esp + 16]         ;iSrcStride
-	;mov edi, [esp + 20]         ;pDst
-	;mov ecx, [esp + 28]         ;iHeight
-	;mov edx, [esp + 24]			;iDstStride
-	
-	%assign  push_num 0
-    LOAD_5_PARA
-%ifndef X86_32
-	movsx	r1, r1d
-	movsx	r3, r3d
-	movsx	r4, r4d
-%endif
-	lea r0, [r0-2]            ;pSrc -= 2;
-
-	pxor xmm7, xmm7
-	movdqa xmm6, [h264_w0x10_1]
-.y_loop:
-
-	movq xmm0, [r0]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [r0+5]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [r0+1]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [r0+4]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [r0+2]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [r0+3]
-	punpcklbw xmm5, xmm7
-
-	paddw xmm2, xmm3
-	paddw xmm4, xmm5
-	psllw xmm4, 2
-	psubw xmm4, xmm2
-	paddw xmm0, xmm1
-	paddw xmm0, xmm4
-	psllw xmm4, 2
-	paddw xmm0, xmm4
-	paddw xmm0, xmm6
-	psraw xmm0, 5
-	packuswb xmm0, xmm7
-	movq [r2], xmm0
-
-	movq xmm0, [r0+8]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [r0+5+8]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [r0+1+8]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [r0+4+8]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [r0+2+8]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [r0+3+8]
-	punpcklbw xmm5, xmm7
-
-	paddw xmm2, xmm3
-	paddw xmm4, xmm5
-	psllw xmm4, 2
-	psubw xmm4, xmm2
-	paddw xmm0, xmm1
-	paddw xmm0, xmm4
-	psllw xmm4, 2
-	paddw xmm0, xmm4
-	paddw xmm0, xmm6
-	psraw xmm0, 5
-	packuswb xmm0, xmm7
-	movq [r2+8], xmm0
-
-	lea r2, [r2+r3]
-	lea r0, [r0+r1]
-	dec r4
-	jnz near .y_loop
-	
-	LOAD_5_PARA_POP
-	ret
-
-
-;*******************************************************************************
-; void_t McHorVer02WidthEq8_sse2( uint8_t *pSrc,
-;                       int iSrcStride,
-;                       uint8_t *pDst,
-;                       int iDstStride,
-;                       int iHeight )
-;*******************************************************************************
-ALIGN 16
-McHorVer02WidthEq8_sse2:
-	;push esi
-	;push edi
-	;mov esi, [esp + 12]           ;pSrc
-	;mov edx, [esp + 16]	          ;iSrcStride
-	;mov edi, [esp + 20]           ;pDst
-	;mov eax, [esp + 24]           ;iDstStride
-	;mov ecx, [esp + 28]           ;iHeight
-
-	%assign  push_num 0
-    LOAD_5_PARA
-%ifndef X86_32
-	movsx	r1, r1d
-	movsx	r3, r3d
-	movsx	r4, r4d
-%endif
-	sub r0, r1
-	sub r0, r1
-
-	WELS_Zero xmm7
-
-	SSE_LOAD_8P xmm0, xmm7, [r0]
-	SSE_LOAD_8P xmm1, xmm7, [r0+r1]
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm2, xmm7, [r0]
-	SSE_LOAD_8P xmm3, xmm7, [r0+r1]
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm4, xmm7, [r0]
-	SSE_LOAD_8P xmm5, xmm7, [r0+r1]
-
-.start:
-	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
-	dec r4
-	jz near .xx_exit
-
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm6, xmm7, [r0]
-	FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
-	dec r4
-	jz near .xx_exit
-
-	lea r2, [r2+2*r3]
-	SSE_LOAD_8P xmm7, xmm0, [r0+r1]
-	FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
-	dec r4
-	jz near .xx_exit
-
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm0, xmm1, [r0]
-	FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
-	dec r4
-	jz near .xx_exit
-
-	lea r2, [r2+2*r3]
-	SSE_LOAD_8P xmm1, xmm2, [r0+r1]
-	FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
-	dec r4
-	jz near .xx_exit
-
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm2, xmm3, [r0]
-	FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
-	dec r4
-	jz near .xx_exit
-
-	lea r2, [r2+2*r3]
-	SSE_LOAD_8P xmm3, xmm4, [r0+r1]
-	FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
-	dec r4
-	jz near .xx_exit
-
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm4, xmm5, [r0]
-	FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
-	dec r4
-	jz near .xx_exit
-
-	lea r2, [r2+2*r3]
-	SSE_LOAD_8P xmm5, xmm6, [r0+r1]
-	jmp near .start
-
-.xx_exit:
-	LOAD_5_PARA_POP
-	ret
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-WELS_EXTERN McHorVer20Width9Or17_sse2
-WELS_EXTERN McHorVer02Height9Or17_sse2
-WELS_EXTERN McHorVer22Width8VerLastAlign_sse2
-WELS_EXTERN McHorVer22Width8VerLastUnAlign_sse2
-WELS_EXTERN McHorVer22HorFirst_sse2
-
-
-;***********************************************************************
-; void McHorVer02Height9Or17_sse2(	uint8_t *pSrc,
-;                       int32_t iSrcStride,
-;                       uint8_t *pDst,
-;                       int32_t iDstStride,
-;						int32_t iWidth,
-;                       int32_t iHeight )
-;***********************************************************************
-ALIGN 16
-McHorVer02Height9Or17_sse2:
-	;push esi
-	;push edi
-	;push ebx
-
-	;mov esi, [esp + 16]
-	;mov edx, [esp + 20]
-	;mov edi, [esp + 24]
-	;mov eax, [esp + 28]
-	;mov ecx, [esp + 36]
-	;mov ebx, [esp + 32]
-	
-	%assign  push_num 0
-    LOAD_6_PARA
-%ifndef X86_32
-	movsx	r1, r1d
-	movsx	r3, r3d
-	movsx	r4, r4d
-	movsx	r5, r5d
-%endif
- 
-%ifndef X86_32
-	push r12
-	push r13
-	push r14
-	mov  r12, r0
-	mov	 r13, r2
-	mov	 r14, r5
-%endif
-    
-	shr r4, 3
-	sub r0, r1
-	sub r0, r1
-
-.xloop:
-	WELS_Zero xmm7
-	SSE_LOAD_8P xmm0, xmm7, [r0]
-	SSE_LOAD_8P xmm1, xmm7, [r0+r1]
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm2, xmm7, [r0]
-	SSE_LOAD_8P xmm3, xmm7, [r0+r1]
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm4, xmm7, [r0]
-	SSE_LOAD_8P xmm5, xmm7, [r0+r1]
-
-	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
-	dec r5
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm6, xmm7, [r0]
-	movdqa xmm0,xmm1
-	movdqa xmm1,xmm2
-	movdqa xmm2,xmm3
-	movdqa xmm3,xmm4
-	movdqa xmm4,xmm5
-	movdqa xmm5,xmm6
-	add r2, r3
-	sub r0, r1
-
-.start:
-	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm6, xmm7, [r0]
-	FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r2, [r2+2*r3]
-	SSE_LOAD_8P xmm7, xmm0, [r0+r1]
-	FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm0, xmm1, [r0]
-	FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r2, [r2+2*r3]
-	SSE_LOAD_8P xmm1, xmm2, [r0+r1]
-	FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm2, xmm3, [r0]
-	FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r2, [r2+2*r3]
-	SSE_LOAD_8P xmm3, xmm4, [r0+r1]
-	FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm4, xmm5, [r0]
-	FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r2, [r2+2*r3]
-	SSE_LOAD_8P xmm5, xmm6, [r0+r1]
-	jmp near .start
-
-.x_loop_dec:
-	dec r4
-	jz  near .xx_exit
-	;mov esi, [esp + 16]
-	;mov edi, [esp + 24]
-	;mov ecx, [esp + 36]
-%ifdef X86_32
-	mov	r0, arg1
-	mov r2, arg3
-	mov r5, arg6
-%else
-	mov r0, r12
-	mov r2, r13
-	mov r5, r14
-%endif
-	sub r0, r1
-	sub r0, r1
-	add r0, 8
-	add r2, 8	
-	jmp near .xloop
-
-.xx_exit:
-%ifndef X86_32
-	pop r14
-	pop r13
-	pop r12
-%endif
-	LOAD_6_PARA_POP
-	ret
-
-
-ALIGN 16
-;***********************************************************************
-; void McHorVer20Width9Or17_sse2(		uint8_t *pSrc,
-;                       int32_t iSrcStride,
-;						uint8_t *pDst,
-;						int32_t iDstStride,
-;						int32_t iWidth,
-;						int32_t iHeight
-;                      );
-;***********************************************************************
-McHorVer20Width9Or17_sse2:
-	;push esi
-	;push edi
-	;push ebx
-	;mov esi, [esp+16]
-	;mov eax, [esp+20]
-	;mov edi, [esp+24]
-	;mov edx, [esp+28]
-	;mov ecx, [esp+32]
-	;mov ebx, [esp+36]
-
-	%assign  push_num 0
-    LOAD_6_PARA
-%ifndef X86_32
-	movsx	r1, r1d
-	movsx	r3, r3d
-	movsx	r4, r4d
-	movsx	r5, r5d
-%endif    
-	sub r0, 2
-	pxor xmm7, xmm7
-
-	cmp r4, 9
-	jne near .width_17
-
-.yloop_width_9:
-	movq xmm0, [r0]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [r0+5]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [r0+1]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [r0+4]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [r0+2]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [r0+3]
-	punpcklbw xmm5, xmm7
-
-	movdqa xmm7, xmm2
-	paddw   xmm7, xmm3
-	movdqa xmm6, xmm4
-	paddw   xmm6, xmm5
-	psllw xmm6, 2
-	psubw xmm6, xmm7
-	paddw xmm0, xmm1
-	paddw xmm0, xmm6
-	psllw xmm6, 2
-	paddw xmm0, xmm6
-	paddw xmm0, [h264_w0x10_1]
-	psraw  xmm0, 5
-	packuswb xmm0, xmm0
-	movd [r2], xmm0
-
-	pxor  xmm7, xmm7
-	movq xmm0, [r0+6]
-	punpcklbw xmm0, xmm7
-
-	paddw xmm4, xmm1
-	paddw xmm5, xmm3
-	psllw xmm5, 2
-	psubw xmm5, xmm4
-	paddw xmm2, xmm0
-	paddw xmm2, xmm5
-	psllw xmm5, 2
-	paddw xmm2, xmm5
-	paddw xmm2, [h264_w0x10_1]
-	psraw  xmm2, 5
-	packuswb xmm2, xmm2
-	movq [r2+1], xmm2
-
-	add r0, r1
-	add r2, r3
-	dec r5
-	jnz .yloop_width_9
-	LOAD_6_PARA_POP
-	ret
-
-
-.width_17:
-.yloop_width_17:
-	movq xmm0, [r0]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [r0+5]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [r0+1]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [r0+4]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [r0+2]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [r0+3]
-	punpcklbw xmm5, xmm7
-
-	paddw xmm2, xmm3
-	paddw xmm4, xmm5
-	psllw xmm4, 2
-	psubw xmm4, xmm2
-	paddw xmm0, xmm1
-	paddw xmm0, xmm4
-	psllw xmm4, 2
-	paddw xmm0, xmm4
-	paddw xmm0, [h264_w0x10_1]
-	psraw  xmm0, 5
-	packuswb xmm0, xmm0
-	movq [r2], xmm0
-
-	movq xmm0, [r0+8]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [r0+5+8]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [r0+1+8]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [r0+4+8]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [r0+2+8]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [r0+3+8]
-	punpcklbw xmm5, xmm7
-
-	movdqa xmm7, xmm2
-	paddw   xmm7, xmm3
-	movdqa xmm6, xmm4
-	paddw   xmm6, xmm5
-	psllw xmm6, 2
-	psubw xmm6, xmm7
-	paddw xmm0, xmm1
-	paddw xmm0, xmm6
-	psllw xmm6, 2
-	paddw xmm0, xmm6
-	paddw xmm0, [h264_w0x10_1]
-	psraw  xmm0, 5
-	packuswb xmm0, xmm0
-	movd [r2+8], xmm0
-
-
-	pxor  xmm7, xmm7
-	movq xmm0, [r0+6+8]
-	punpcklbw xmm0, xmm7
-
-	paddw xmm4, xmm1
-	paddw xmm5, xmm3
-	psllw xmm5, 2
-	psubw xmm5, xmm4
-	paddw xmm2, xmm0
-	paddw xmm2, xmm5
-	psllw xmm5, 2
-	paddw xmm2, xmm5
-	paddw xmm2, [h264_w0x10_1]
-	psraw  xmm2, 5
-	packuswb xmm2, xmm2
-	movq [r2+9], xmm2
-	add r0, r1
-	add r2, r3
-	dec r5
-	jnz .yloop_width_17
-	LOAD_6_PARA_POP
-	ret
-
-
-
-ALIGN 16
-;***********************************************************************
-;void McHorVer22HorFirst_sse2
-;							(uint8_t *pSrc,
-;							int32_t iSrcStride,
-;							uint8_t * pTap,
-;							int32_t iTapStride,
-;							int32_t iWidth,int32_t iHeight);
-;***********************************************************************
-McHorVer22HorFirst_sse2:
-	;push esi
-	;push edi
-	;push ebx
-	;mov esi, [esp+16]
-	;mov eax, [esp+20]
-	;mov edi, [esp+24]
-	;mov edx, [esp+28]
-	;mov ecx, [esp+32]
-	;mov ebx, [esp+36]
-	
-	%assign  push_num 0
-    LOAD_6_PARA
-%ifndef X86_32
-	movsx	r1, r1d
-	movsx	r3, r3d
-	movsx	r4, r4d
-	movsx	r5, r5d
-%endif    
-	pxor xmm7, xmm7
-	sub r0, r1				;;;;;;;;need more 5 lines.
-	sub r0, r1
-
-	cmp r4, 9
-	jne near .width_17
-
-.yloop_width_9:
-	movq xmm0, [r0]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [r0+5]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [r0+1]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [r0+4]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [r0+2]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [r0+3]
-	punpcklbw xmm5, xmm7
-
-	movdqa xmm7, xmm2
-	paddw   xmm7, xmm3
-	movdqa xmm6, xmm4
-	paddw   xmm6, xmm5
-	psllw xmm6, 2
-	psubw xmm6, xmm7
-	paddw xmm0, xmm1
-	paddw xmm0, xmm6
-	psllw xmm6, 2
-	paddw xmm0, xmm6
-	movd [r2], xmm0
-
-	pxor  xmm7, xmm7
-	movq xmm0, [r0+6]
-	punpcklbw xmm0, xmm7
-
-	paddw xmm4, xmm1
-	paddw xmm5, xmm3
-	psllw xmm5, 2
-	psubw xmm5, xmm4
-	paddw xmm2, xmm0
-	paddw xmm2, xmm5
-	psllw xmm5, 2
-	paddw xmm2, xmm5
-	movq [r2+2], xmm2
-	movhps [r2+2+8], xmm2
-
-	add r0, r1
-	add r2, r3
-	dec r5
-	jnz .yloop_width_9
-	LOAD_6_PARA_POP
-	ret
-
-
-.width_17:
-.yloop_width_17:
-	movq xmm0, [r0]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [r0+5]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [r0+1]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [r0+4]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [r0+2]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [r0+3]
-	punpcklbw xmm5, xmm7
-
-	paddw xmm2, xmm3
-	paddw xmm4, xmm5
-	psllw xmm4, 2
-	psubw xmm4, xmm2
-	paddw xmm0, xmm1
-	paddw xmm0, xmm4
-	psllw xmm4, 2
-	paddw xmm0, xmm4
-	movdqa [r2], xmm0
-
-	movq xmm0, [r0+8]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [r0+5+8]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [r0+1+8]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [r0+4+8]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [r0+2+8]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [r0+3+8]
-	punpcklbw xmm5, xmm7
-
-	movdqa xmm7, xmm2
-	paddw   xmm7, xmm3
-	movdqa xmm6, xmm4
-	paddw   xmm6, xmm5
-	psllw xmm6, 2
-	psubw xmm6, xmm7
-	paddw xmm0, xmm1
-	paddw xmm0, xmm6
-	psllw xmm6, 2
-	paddw xmm0, xmm6
-	movd [r2+16], xmm0
-
-
-	pxor  xmm7, xmm7
-	movq xmm0, [r0+6+8]
-	punpcklbw xmm0, xmm7
-
-	paddw xmm4, xmm1
-	paddw xmm5, xmm3
-	psllw xmm5, 2
-	psubw xmm5, xmm4
-	paddw xmm2, xmm0
-	paddw xmm2, xmm5
-	psllw xmm5, 2
-	paddw xmm2, xmm5
-	movq [r2+18], xmm2
-	movhps [r2+18+8], xmm2
-
-	add r0, r1
-	add r2, r3
-	dec r5
-	jnz .yloop_width_17
-	LOAD_6_PARA_POP
-	ret
-
-
-%macro FILTER_VER 9
-	paddw  %1, %6
-	movdqa %7, %2
-	movdqa %8, %3
-
-
-	paddw %7, %5
-	paddw %8, %4
-
-	psubw  %1, %7
-	psraw   %1, 2
-	paddw  %1, %8
-	psubw  %1, %7
-	psraw   %1, 2
-	paddw  %8, %1
-	paddw  %8, [h264_mc_hc_32]
-	psraw   %8, 6
-	packuswb %8, %8
-	movq %9, %8
-%endmacro
-;***********************************************************************
-;void McHorVer22Width8VerLastAlign_sse2(
-;											uint8_t *pTap,
-;											int32_t iTapStride,
-;											uint8_t * pDst,
-;											int32_t iDstStride,
-;											int32_t iWidth,
-;											int32_t iHeight);
-;***********************************************************************
-
- McHorVer22Width8VerLastAlign_sse2:
-	;push esi
-	;push edi
-	;push ebx
-	;push ebp
-
-	;mov esi, [esp+20]
-	;mov eax, [esp+24]
-	;mov edi, [esp+28]
-	;mov edx, [esp+32]
-	;mov ebx, [esp+36]
-	;mov ecx, [esp+40]
-	
-	%assign  push_num 0
-    LOAD_6_PARA
-%ifndef X86_32
-	movsx	r1, r1d
-	movsx	r3, r3d
-	movsx	r4, r4d
-	movsx	r5, r5d
-%endif
-%ifndef X86_32
-	push r12
-	push r13
-	push r14
-	mov  r12, r0
-	mov	 r13, r2
-	mov	 r14, r5
-%endif
-
-	shr r4, 3
-
-.width_loop:
-	movdqa xmm0, [r0]
-	movdqa xmm1, [r0+r1]
-	lea r0, [r0+2*r1]
-	movdqa xmm2, [r0]
-	movdqa xmm3, [r0+r1]
-	lea r0, [r0+2*r1]
-	movdqa xmm4, [r0]
-	movdqa xmm5, [r0+r1]
-
-	FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
-	dec r5
-	lea r0, [r0+2*r1]
-	movdqa xmm6, [r0]
-
-	movdqa xmm0, xmm1
-	movdqa xmm1, xmm2
-	movdqa xmm2, xmm3
-	movdqa xmm3, xmm4
-	movdqa xmm4, xmm5
-	movdqa xmm5, xmm6
-
-	add r2, r3
-	sub r0, r1
-
-.start:
-	FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r0, [r0+2*r1]
-	movdqa xmm6, [r0]
-	FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r2, [r2+2*r3]
-	movdqa xmm7, [r0+r1]
-	FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r0, [r0+2*r1]
-	movdqa xmm0, [r0]
-	FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r2, [r2+2*r3]
-	movdqa xmm1, [r0+r1]
-	FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r0, [r0+2*r1]
-	movdqa xmm2, [r0]
-	FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r2, [r2+2*r3]
-	movdqa xmm3, [r0+r1]
-	FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r0, [r0+2*r1]
-	movdqa xmm4, [r0]
-	FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r2, [r2+2*r3]
-	movdqa xmm5, [r0+r1]
-	jmp near .start
-
-.x_loop_dec:
-	dec r4
-	jz near .exit
-	;mov esi, [esp+20]
-	;mov edi, [esp+28]
-	;mov ecx, [esp+40]
-%ifdef X86_32
-	mov	r0, arg1
-	mov r2, arg3
-	mov r5, arg6
-%else
-	mov r0, r12
-	mov r2, r13
-	mov r5, r14
-%endif
-	add r0, 16
-	add r2, 8
-	jmp .width_loop
-
-.exit:
-%ifndef X86_32
-	pop r14
-	pop r13
-	pop r12
-%endif
-	LOAD_6_PARA_POP
-	ret
-
-;***********************************************************************
-;void McHorVer22Width8VerLastUnAlign_sse2(
-;											uint8_t *pTap,
-;											int32_t iTapStride,
-;											uint8_t * pDst,
-;											int32_t iDstStride,
-;											int32_t iWidth,
-;											int32_t iHeight);
-;***********************************************************************
-
- McHorVer22Width8VerLastUnAlign_sse2:
-	;push esi
-	;push edi
-	;push ebx
-	;push ebp
-
-	;mov esi, [esp+20]
-	;mov eax, [esp+24]
-	;mov edi, [esp+28]
-	;mov edx, [esp+32]
-	;mov ebx, [esp+36]
-	;mov ecx, [esp+40]
-
-	%assign  push_num 0
-    LOAD_6_PARA
-%ifndef X86_32
-	movsx	r1, r1d
-	movsx	r3, r3d
-	movsx	r4, r4d
-	movsx	r5, r5d
-%endif
-%ifndef X86_32
-	push r12
-	push r13
-	push r14
-	mov  r12, r0
-	mov	 r13, r2
-	mov	 r14, r5
-%endif
-	shr r4, 3
-
-.width_loop:
-	movdqu xmm0, [r0]
-	movdqu xmm1, [r0+r1]
-	lea r0, [r0+2*r1]
-	movdqu xmm2, [r0]
-	movdqu xmm3, [r0+r1]
-	lea r0, [r0+2*r1]
-	movdqu xmm4, [r0]
-	movdqu xmm5, [r0+r1]
-
-	FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
-	dec r5
-	lea r0, [r0+2*r1]
-	movdqu xmm6, [r0]
-
-	movdqa xmm0, xmm1
-	movdqa xmm1, xmm2
-	movdqa xmm2, xmm3
-	movdqa xmm3, xmm4
-	movdqa xmm4, xmm5
-	movdqa xmm5, xmm6
-
-	add r2, r3
-	sub r0, r1
-
-.start:
-	FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r0, [r0+2*r1]
-	movdqu xmm6, [r0]
-	FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r2, [r2+2*r3]
-	movdqu xmm7, [r0+r1]
-	FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r0, [r0+2*r1]
-	movdqu xmm0, [r0]
-	FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r2, [r2+2*r3]
-	movdqu xmm1, [r0+r1]
-	FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r0, [r0+2*r1]
-	movdqu xmm2, [r0]
-	FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r2, [r2+2*r3]
-	movdqu xmm3, [r0+r1]
-	FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r0, [r0+2*r1]
-	movdqu xmm4, [r0]
-	FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r2, [r2+2*r3]
-	movdqu xmm5, [r0+r1]
-	jmp near .start
-
-.x_loop_dec:
-	dec r4
-	jz near .exit
-	;mov esi, [esp+20]
-	;mov edi, [esp+28]
-	;mov ecx, [esp+40]
-%ifdef X86_32
-	mov	r0, arg1
-	mov r2, arg3
-	mov r5, arg6
-%else
-	mov r0, r12
-	mov r2, r13
-	mov r5, r14
-%endif
-	add r0, 16
-	add r2, 8
-	jmp .width_loop
-
-.exit:
-%ifndef X86_32
-	pop r14
-	pop r13
-	pop r12
-%endif
-	LOAD_6_PARA_POP
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  mc_luma.asm
+;*
+;*  Abstract
+;*      sse2 motion compensation
+;*
+;*  History
+;*      17/08/2009 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+;*******************************************************************************
+; Local Data (Read Only)
+;*******************************************************************************
+%ifdef FORMAT_COFF
+SECTION .rodata pData
+%else
+SECTION .rodata align=16
+%endif
+
+;*******************************************************************************
+; Various memory constants (trigonometric values or rounding values)
+;*******************************************************************************
+
+ALIGN 16
+h264_w0x10:
+	dw 16, 16, 16, 16
+ALIGN 16
+h264_w0x10_1:
+	dw 16, 16, 16, 16, 16, 16, 16, 16
+ALIGN 16
+h264_mc_hc_32:
+	dw 32, 32, 32, 32, 32, 32, 32, 32
+
+
+;*******************************************************************************
+; Code
+;*******************************************************************************
+
+SECTION .text
+
+WELS_EXTERN McHorVer20WidthEq4_mmx
+
+
+ALIGN 16
+;*******************************************************************************
+; void_t McHorVer20WidthEq4_mmx( uint8_t *pSrc,
+;                       int iSrcStride,
+;						uint8_t *pDst,
+;						int iDstStride,
+;						int iHeight)
+;*******************************************************************************
+McHorVer20WidthEq4_mmx:
+	;push esi
+	;push edi
+
+	;mov esi, [esp+12]
+	;mov eax, [esp+16]
+	;mov edi, [esp+20]
+	;mov ecx, [esp+24]
+	;mov edx, [esp+28]
+
+    %assign  push_num 0
+    LOAD_5_PARA
+%ifndef X86_32
+	movsx	r1, r1d
+	movsx	r3, r3d
+	movsx	r4, r4d
+%endif
+
+	sub r0, 2
+	WELS_Zero mm7
+	movq mm6, [h264_w0x10]
+.height_loop:
+	movd mm0, [r0]
+	punpcklbw mm0, mm7
+	movd mm1, [r0+5]
+	punpcklbw mm1, mm7
+	movd mm2, [r0+1]
+	punpcklbw mm2, mm7
+	movd mm3, [r0+4]
+	punpcklbw mm3, mm7
+	movd mm4, [r0+2]
+	punpcklbw mm4, mm7
+	movd mm5, [r0+3]
+	punpcklbw mm5, mm7
+
+	paddw mm2, mm3
+	paddw mm4, mm5
+	psllw mm4, 2
+	psubw mm4, mm2
+	paddw mm0, mm1
+	paddw mm0, mm4
+	psllw mm4, 2
+	paddw mm0, mm4
+	paddw mm0, mm6
+	psraw mm0, 5
+	packuswb mm0, mm7
+	movd [r2], mm0
+
+	add r0, r1
+	add r2, r3
+	dec r4
+	jnz .height_loop
+
+	WELSEMMS
+	LOAD_5_PARA_POP
+	ret
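
The routine above is the horizontal six-tap H.264 half-pel filter with taps (1, -5, 20, 20, -5, 1). The shift/add sequence between the loads and the store avoids multiplications: with v = p[-2]+p[3], t = p[-1]+p[2] and s = p[0]+p[1], it computes v - 5t + 20s as v + (4s - t) + 4*(4s - t), then rounds with +16 and >>5. A minimal C sketch of the same arithmetic (the _ref names and Clip255 are illustrative, not part of the source):

#include <stdint.h>

/* Plain-C sketch of the horizontal six-tap used above. */
static uint8_t Clip255 (int v) {
    return (uint8_t) (v < 0 ? 0 : (v > 255 ? 255 : v));
}

void McHorVer20WidthEq4_ref (const uint8_t* pSrc, int iSrcStride,
                             uint8_t* pDst, int iDstStride, int iHeight) {
    pSrc -= 2;                                   /* same as "sub r0, 2" */
    for (int y = 0; y < iHeight; y++) {
        for (int x = 0; x < 4; x++) {
            int v = pSrc[x]     + pSrc[x + 5];   /* outer taps  (*1)  */
            int t = pSrc[x + 1] + pSrc[x + 4];   /* middle taps (*-5) */
            int s = pSrc[x + 2] + pSrc[x + 3];   /* inner taps  (*20) */
            int m = (s << 2) - t;                /* 4s - t            */
            /* v - 5t + 20s == v + m + 4m; +16, >>5 rounds and scales */
            pDst[x] = Clip255 ((v + m + (m << 2) + 16) >> 5);
        }
        pSrc += iSrcStride;
        pDst += iDstStride;
    }
}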
+
+;*******************************************************************************
+; Macros and other preprocessor constants
+;*******************************************************************************
+
+
+%macro SSE_LOAD_8P 3
+	movq %1, %3
+	punpcklbw %1, %2
+%endmacro
+
+%macro FILTER_HV_W8 9
+	paddw	%1, %6
+	movdqa	%8, %3
+	movdqa	%7, %2
+	paddw	%1, [h264_w0x10_1]
+	paddw	%8, %4
+	paddw	%7, %5
+	psllw	%8, 2
+	psubw	%8, %7
+	paddw	%1, %8
+	psllw	%8, 2
+	paddw	%1, %8
+	psraw   %1, 5
+	WELS_Zero %8
+	packuswb %1, %8
+	movq    %9, %1
+%endmacro
+
+;*******************************************************************************
+; Code
+;*******************************************************************************
+
+SECTION .text
+WELS_EXTERN McHorVer22Width8HorFirst_sse2
+WELS_EXTERN McHorVer02WidthEq8_sse2
+WELS_EXTERN McHorVer20WidthEq8_sse2
+WELS_EXTERN McHorVer20WidthEq16_sse2
+
+ALIGN 16
+;***********************************************************************
+; void_t McHorVer22Width8HorFirst_sse2( uint8_t *pSrc,
+;                                        int32_t iSrcStride,
+;                                        int16_t *pDst,
+;                                        int32_t iDstStride,
+;                                        int32_t iHeight )
+;***********************************************************************
+McHorVer22Width8HorFirst_sse2:
+	;push esi
+	;push edi
+	;push ebx
+	;mov esi, [esp+16]     ;pSrc
+	;mov eax, [esp+20]	;iSrcStride
+	;mov edi, [esp+24]		;pDst
+	;mov edx, [esp+28]	;iDstStride
+	;mov ebx, [esp+32]	;iHeight
+
+	%assign  push_num 0
+    LOAD_5_PARA
+%ifndef X86_32
+	movsx	r1, r1d
+	movsx	r3, r3d
+	movsx	r4, r4d
+%endif
+	pxor xmm7, xmm7
+
+	sub r0, r1				; the 6-tap filter needs 5 extra source lines
+	sub r0, r1
+
+.yloop_width_8:
+	movq xmm0, [r0]
+	punpcklbw xmm0, xmm7
+	movq xmm1, [r0+5]
+	punpcklbw xmm1, xmm7
+	movq xmm2, [r0+1]
+	punpcklbw xmm2, xmm7
+	movq xmm3, [r0+4]
+	punpcklbw xmm3, xmm7
+	movq xmm4, [r0+2]
+	punpcklbw xmm4, xmm7
+	movq xmm5, [r0+3]
+	punpcklbw xmm5, xmm7
+
+	paddw xmm2, xmm3
+	paddw xmm4, xmm5
+	psllw xmm4, 2
+	psubw xmm4, xmm2
+	paddw xmm0, xmm1
+	paddw xmm0, xmm4
+	psllw xmm4, 2
+	paddw xmm0, xmm4
+	movdqa [r2], xmm0
+
+	add r0, r1
+	add r2, r3
+	dec r4
+	jnz .yloop_width_8
+	LOAD_5_PARA_POP
+	ret
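
McHorVer22Width8HorFirst_sse2 is the first stage of the diagonal (hv) half-pel case: it runs the horizontal filter but keeps the unrounded 16-bit sums (note the plain movdqa store and the absence of the +16/>>5 step) in a tap buffer that a vertical second stage consumes. A hedged C sketch of that stage, assuming (as the asm appears to) that the caller already offset pSrc two columns left and passes the extended row count; iTapStride is counted in int16 elements here:

#include <stdint.h>

/* Sketch of the hv half-pel first stage: full-precision int16 taps,
 * no rounding or clipping; iRows should be iHeight + 5. */
void McHorVer22HorFirst_ref (const uint8_t* pSrc, int32_t iSrcStride,
                             int16_t* pTap, int32_t iTapStride,
                             int32_t iRows) {
    pSrc -= 2 * iSrcStride;            /* the two "sub r0, r1" */
    for (int y = 0; y < iRows; y++) {
        for (int x = 0; x < 8; x++) {
            int v = pSrc[x]     + pSrc[x + 5];
            int t = pSrc[x + 1] + pSrc[x + 4];
            int s = pSrc[x + 2] + pSrc[x + 3];
            pTap[x] = (int16_t) (v - 5 * t + 20 * s);  /* full precision */
        }
        pSrc += iSrcStride;
        pTap += iTapStride;
    }
}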
+
+ALIGN 16
+;*******************************************************************************
+; void_t McHorVer20WidthEq8_sse2( uint8_t *pSrc,
+;                                 int iSrcStride,
+;                                 uint8_t *pDst,
+;                                 int iDstStride,
+;                                 int iHeight );
+;*******************************************************************************
+McHorVer20WidthEq8_sse2:
+	;push	esi
+	;push	edi
+
+	;mov esi, [esp + 12]         ;pSrc
+	;mov eax, [esp + 16]         ;iSrcStride
+	;mov edi, [esp + 20]         ;pDst
+	;mov ecx, [esp + 28]         ;iHeight
+	;mov edx, [esp + 24]			;iDstStride
+
+	%assign  push_num 0
+    LOAD_5_PARA
+%ifndef X86_32
+	movsx	r1, r1d
+	movsx	r3, r3d
+	movsx	r4, r4d
+%endif
+	lea r0, [r0-2]            ;pSrc -= 2;
+
+	pxor xmm7, xmm7
+	movdqa xmm6, [h264_w0x10_1]
+.y_loop:
+	movq xmm0, [r0]
+	punpcklbw xmm0, xmm7
+	movq xmm1, [r0+5]
+	punpcklbw xmm1, xmm7
+	movq xmm2, [r0+1]
+	punpcklbw xmm2, xmm7
+	movq xmm3, [r0+4]
+	punpcklbw xmm3, xmm7
+	movq xmm4, [r0+2]
+	punpcklbw xmm4, xmm7
+	movq xmm5, [r0+3]
+	punpcklbw xmm5, xmm7
+
+	paddw xmm2, xmm3
+	paddw xmm4, xmm5
+	psllw xmm4, 2
+	psubw xmm4, xmm2
+	paddw xmm0, xmm1
+	paddw xmm0, xmm4
+	psllw xmm4, 2
+	paddw xmm0, xmm4
+	paddw xmm0, xmm6
+	psraw xmm0, 5
+
+	packuswb xmm0, xmm7
+	movq [r2], xmm0
+
+	lea r2, [r2+r3]
+	lea r0, [r0+r1]
+	dec r4
+	jnz near .y_loop
+
+	LOAD_5_PARA_POP
+	ret
+
+ALIGN 16
+;*******************************************************************************
+; void_t McHorVer20WidthEq16_sse2( uint8_t *pSrc,
+;                                  int iSrcStride,
+;                                  uint8_t *pDst,
+;                                  int iDstStride,
+;                                  int iHeight );
+;*******************************************************************************
+McHorVer20WidthEq16_sse2:
+	;push	esi
+	;push	edi
+	;mov esi, [esp + 12]         ;pSrc
+	;mov eax, [esp + 16]         ;iSrcStride
+	;mov edi, [esp + 20]         ;pDst
+	;mov ecx, [esp + 28]         ;iHeight
+	;mov edx, [esp + 24]			;iDstStride
+
+	%assign  push_num 0
+    LOAD_5_PARA
+%ifndef X86_32
+	movsx	r1, r1d
+	movsx	r3, r3d
+	movsx	r4, r4d
+%endif
+	lea r0, [r0-2]            ;pSrc -= 2;
+
+	pxor xmm7, xmm7
+	movdqa xmm6, [h264_w0x10_1]
+.y_loop:
+
+	movq xmm0, [r0]
+	punpcklbw xmm0, xmm7
+	movq xmm1, [r0+5]
+	punpcklbw xmm1, xmm7
+	movq xmm2, [r0+1]
+	punpcklbw xmm2, xmm7
+	movq xmm3, [r0+4]
+	punpcklbw xmm3, xmm7
+	movq xmm4, [r0+2]
+	punpcklbw xmm4, xmm7
+	movq xmm5, [r0+3]
+	punpcklbw xmm5, xmm7
+
+	paddw xmm2, xmm3
+	paddw xmm4, xmm5
+	psllw xmm4, 2
+	psubw xmm4, xmm2
+	paddw xmm0, xmm1
+	paddw xmm0, xmm4
+	psllw xmm4, 2
+	paddw xmm0, xmm4
+	paddw xmm0, xmm6
+	psraw xmm0, 5
+	packuswb xmm0, xmm7
+	movq [r2], xmm0
+
+	movq xmm0, [r0+8]
+	punpcklbw xmm0, xmm7
+	movq xmm1, [r0+5+8]
+	punpcklbw xmm1, xmm7
+	movq xmm2, [r0+1+8]
+	punpcklbw xmm2, xmm7
+	movq xmm3, [r0+4+8]
+	punpcklbw xmm3, xmm7
+	movq xmm4, [r0+2+8]
+	punpcklbw xmm4, xmm7
+	movq xmm5, [r0+3+8]
+	punpcklbw xmm5, xmm7
+
+	paddw xmm2, xmm3
+	paddw xmm4, xmm5
+	psllw xmm4, 2
+	psubw xmm4, xmm2
+	paddw xmm0, xmm1
+	paddw xmm0, xmm4
+	psllw xmm4, 2
+	paddw xmm0, xmm4
+	paddw xmm0, xmm6
+	psraw xmm0, 5
+	packuswb xmm0, xmm7
+	movq [r2+8], xmm0
+
+	lea r2, [r2+r3]
+	lea r0, [r0+r1]
+	dec r4
+	jnz near .y_loop
+
+	LOAD_5_PARA_POP
+	ret
+
+
+;*******************************************************************************
+; void_t McHorVer02WidthEq8_sse2( uint8_t *pSrc,
+;                       int iSrcStride,
+;                       uint8_t *pDst,
+;                       int iDstStride,
+;                       int iHeight )
+;*******************************************************************************
+ALIGN 16
+McHorVer02WidthEq8_sse2:
+	;push esi
+	;push edi
+	;mov esi, [esp + 12]           ;pSrc
+	;mov edx, [esp + 16]	          ;iSrcStride
+	;mov edi, [esp + 20]           ;pDst
+	;mov eax, [esp + 24]           ;iDstStride
+	;mov ecx, [esp + 28]           ;iHeight
+
+	%assign  push_num 0
+    LOAD_5_PARA
+%ifndef X86_32
+	movsx	r1, r1d
+	movsx	r3, r3d
+	movsx	r4, r4d
+%endif
+	sub r0, r1
+	sub r0, r1
+
+	WELS_Zero xmm7
+
+	SSE_LOAD_8P xmm0, xmm7, [r0]
+	SSE_LOAD_8P xmm1, xmm7, [r0+r1]
+	lea r0, [r0+2*r1]
+	SSE_LOAD_8P xmm2, xmm7, [r0]
+	SSE_LOAD_8P xmm3, xmm7, [r0+r1]
+	lea r0, [r0+2*r1]
+	SSE_LOAD_8P xmm4, xmm7, [r0]
+	SSE_LOAD_8P xmm5, xmm7, [r0+r1]
+
+.start:
+	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+	dec r4
+	jz near .xx_exit
+
+	lea r0, [r0+2*r1]
+	SSE_LOAD_8P xmm6, xmm7, [r0]
+	FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
+	dec r4
+	jz near .xx_exit
+
+	lea r2, [r2+2*r3]
+	SSE_LOAD_8P xmm7, xmm0, [r0+r1]
+	FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+	dec r4
+	jz near .xx_exit
+
+	lea r0, [r0+2*r1]
+	SSE_LOAD_8P xmm0, xmm1, [r0]
+	FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
+	dec r4
+	jz near .xx_exit
+
+	lea r2, [r2+2*r3]
+	SSE_LOAD_8P xmm1, xmm2, [r0+r1]
+	FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
+	dec r4
+	jz near .xx_exit
+
+	lea r0, [r0+2*r1]
+	SSE_LOAD_8P xmm2, xmm3, [r0]
+	FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
+	dec r4
+	jz near .xx_exit
+
+	lea r2, [r2+2*r3]
+	SSE_LOAD_8P xmm3, xmm4, [r0+r1]
+	FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
+	dec r4
+	jz near .xx_exit
+
+	lea r0, [r0+2*r1]
+	SSE_LOAD_8P xmm4, xmm5, [r0]
+	FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
+	dec r4
+	jz near .xx_exit
+
+	lea r2, [r2+2*r3]
+	SSE_LOAD_8P xmm5, xmm6, [r0+r1]
+	jmp near .start
+
+.xx_exit:
+	LOAD_5_PARA_POP
+	ret
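
The .start loop above is software-pipelined: the six source rows live in xmm0..xmm5, each unrolled step loads exactly one new row into the register that just went free, and the register roles rotate instead of data moving between registers. A C sketch of the same sliding six-row window (illustrative; a direct C port would simply index the rows):

#include <stdint.h>
#include <string.h>

/* row[] plays the role of xmm0..xmm5: one new row enters per output
 * row while the other five are reused (hypothetical reference code). */
void McHorVer02WidthEq8_ref (const uint8_t* pSrc, int iSrcStride,
                             uint8_t* pDst, int iDstStride, int iHeight) {
    const uint8_t* row[6];
    for (int i = 0; i < 6; i++)            /* prime the window: rows -2..3 */
        row[i] = pSrc + (i - 2) * iSrcStride;
    for (int y = 0; y < iHeight; y++) {
        for (int x = 0; x < 8; x++) {
            int v = row[0][x] + row[5][x];
            int t = row[1][x] + row[4][x];
            int s = row[2][x] + row[3][x];
            int m = (s << 2) - t;
            int o = (v + m + (m << 2) + 16) >> 5;
            pDst[x] = (uint8_t) (o < 0 ? 0 : (o > 255 ? 255 : o));
        }
        memmove (row, row + 1, 5 * sizeof row[0]);  /* rotate the window */
        row[5] = row[4] + iSrcStride;               /* admit one new row */
        pDst += iDstStride;
    }
}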
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+WELS_EXTERN McHorVer20Width9Or17_sse2
+WELS_EXTERN McHorVer02Height9Or17_sse2
+WELS_EXTERN McHorVer22Width8VerLastAlign_sse2
+WELS_EXTERN McHorVer22Width8VerLastUnAlign_sse2
+WELS_EXTERN McHorVer22HorFirst_sse2
+
+
+;***********************************************************************
+; void McHorVer02Height9Or17_sse2( uint8_t *pSrc,
+;                       int32_t iSrcStride,
+;                       uint8_t *pDst,
+;                       int32_t iDstStride,
+;                       int32_t iWidth,
+;                       int32_t iHeight )
+;***********************************************************************
+ALIGN 16
+McHorVer02Height9Or17_sse2:
+	;push esi
+	;push edi
+	;push ebx
+
+	;mov esi, [esp + 16]
+	;mov edx, [esp + 20]
+	;mov edi, [esp + 24]
+	;mov eax, [esp + 28]
+	;mov ecx, [esp + 36]
+	;mov ebx, [esp + 32]
+
+	%assign  push_num 0
+    LOAD_6_PARA
+%ifndef X86_32
+	movsx	r1, r1d
+	movsx	r3, r3d
+	movsx	r4, r4d
+	movsx	r5, r5d
+%endif
+
+%ifndef X86_32
+	push r12
+	push r13
+	push r14
+	mov  r12, r0
+	mov	 r13, r2
+	mov	 r14, r5
+%endif
+
+	shr r4, 3
+	sub r0, r1
+	sub r0, r1
+
+.xloop:
+	WELS_Zero xmm7
+	SSE_LOAD_8P xmm0, xmm7, [r0]
+	SSE_LOAD_8P xmm1, xmm7, [r0+r1]
+	lea r0, [r0+2*r1]
+	SSE_LOAD_8P xmm2, xmm7, [r0]
+	SSE_LOAD_8P xmm3, xmm7, [r0+r1]
+	lea r0, [r0+2*r1]
+	SSE_LOAD_8P xmm4, xmm7, [r0]
+	SSE_LOAD_8P xmm5, xmm7, [r0+r1]
+
+	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+	dec r5
+	lea r0, [r0+2*r1]
+	SSE_LOAD_8P xmm6, xmm7, [r0]
+	movdqa xmm0,xmm1
+	movdqa xmm1,xmm2
+	movdqa xmm2,xmm3
+	movdqa xmm3,xmm4
+	movdqa xmm4,xmm5
+	movdqa xmm5,xmm6
+	add r2, r3
+	sub r0, r1
+
+.start:
+	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r0, [r0+2*r1]
+	SSE_LOAD_8P xmm6, xmm7, [r0]
+	FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r2, [r2+2*r3]
+	SSE_LOAD_8P xmm7, xmm0, [r0+r1]
+	FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r0, [r0+2*r1]
+	SSE_LOAD_8P xmm0, xmm1, [r0]
+	FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r2, [r2+2*r3]
+	SSE_LOAD_8P xmm1, xmm2, [r0+r1]
+	FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r0, [r0+2*r1]
+	SSE_LOAD_8P xmm2, xmm3, [r0]
+	FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r2, [r2+2*r3]
+	SSE_LOAD_8P xmm3, xmm4, [r0+r1]
+	FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r0, [r0+2*r1]
+	SSE_LOAD_8P xmm4, xmm5, [r0]
+	FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r2, [r2+2*r3]
+	SSE_LOAD_8P xmm5, xmm6, [r0+r1]
+	jmp near .start
+
+.x_loop_dec:
+	dec r4
+	jz  near .xx_exit
+	;mov esi, [esp + 16]
+	;mov edi, [esp + 24]
+	;mov ecx, [esp + 36]
+%ifdef X86_32
+	mov	r0, arg1
+	mov r2, arg3
+	mov r5, arg6
+%else
+	mov r0, r12
+	mov r2, r13
+	mov r5, r14
+%endif
+	sub r0, r1
+	sub r0, r1
+	add r0, 8
+	add r2, 8
+	jmp near .xloop
+
+.xx_exit:
+%ifndef X86_32
+	pop r14
+	pop r13
+	pop r12
+%endif
+	LOAD_6_PARA_POP
+	ret
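
This routine tiles the block into 8-pixel-wide column strips: shr r4, 3 converts iWidth into a strip count, .x_loop_dec restores pSrc/pDst/iHeight (from r12-r14 on x86-64, from the stack arguments on x86_32) and steps both pointers 8 pixels right. A sketch of that driver structure, with filter_col8 standing in for one full-height 8-wide vertical pass (hypothetical names):

#include <stdint.h>

/* Column-strip driver matching the .xloop/.x_loop_dec structure. */
void FilterByColumnStrips (const uint8_t* pSrc, int iSrcStride,
                           uint8_t* pDst, int iDstStride,
                           int iWidth, int iHeight,
                           void (*filter_col8) (const uint8_t*, int,
                                                uint8_t*, int, int)) {
    for (int strips = iWidth >> 3; strips > 0; strips--) {  /* shr r4, 3 */
        filter_col8 (pSrc - 2 * iSrcStride, iSrcStride,  /* sub r0, r1 x2 */
                     pDst, iDstStride, iHeight);
        pSrc += 8;                       /* add r0, 8 */
        pDst += 8;                       /* add r2, 8 */
    }
}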
+
+
+ALIGN 16
+;***********************************************************************
+; void McHorVer20Width9Or17_sse2( uint8_t *pSrc,
+;                                 int32_t iSrcStride,
+;                                 uint8_t *pDst,
+;                                 int32_t iDstStride,
+;                                 int32_t iWidth,
+;                                 int32_t iHeight );
+;***********************************************************************
+McHorVer20Width9Or17_sse2:
+	;push esi
+	;push edi
+	;push ebx
+	;mov esi, [esp+16]
+	;mov eax, [esp+20]
+	;mov edi, [esp+24]
+	;mov edx, [esp+28]
+	;mov ecx, [esp+32]
+	;mov ebx, [esp+36]
+
+	%assign  push_num 0
+    LOAD_6_PARA
+%ifndef X86_32
+	movsx	r1, r1d
+	movsx	r3, r3d
+	movsx	r4, r4d
+	movsx	r5, r5d
+%endif
+	sub r0, 2
+	pxor xmm7, xmm7
+
+	cmp r4, 9
+	jne near .width_17
+
+.yloop_width_9:
+	movq xmm0, [r0]
+	punpcklbw xmm0, xmm7
+	movq xmm1, [r0+5]
+	punpcklbw xmm1, xmm7
+	movq xmm2, [r0+1]
+	punpcklbw xmm2, xmm7
+	movq xmm3, [r0+4]
+	punpcklbw xmm3, xmm7
+	movq xmm4, [r0+2]
+	punpcklbw xmm4, xmm7
+	movq xmm5, [r0+3]
+	punpcklbw xmm5, xmm7
+
+	movdqa xmm7, xmm2
+	paddw   xmm7, xmm3
+	movdqa xmm6, xmm4
+	paddw   xmm6, xmm5
+	psllw xmm6, 2
+	psubw xmm6, xmm7
+	paddw xmm0, xmm1
+	paddw xmm0, xmm6
+	psllw xmm6, 2
+	paddw xmm0, xmm6
+	paddw xmm0, [h264_w0x10_1]
+	psraw  xmm0, 5
+	packuswb xmm0, xmm0
+	movd [r2], xmm0
+
+	pxor  xmm7, xmm7
+	movq xmm0, [r0+6]
+	punpcklbw xmm0, xmm7
+
+	paddw xmm4, xmm1
+	paddw xmm5, xmm3
+	psllw xmm5, 2
+	psubw xmm5, xmm4
+	paddw xmm2, xmm0
+	paddw xmm2, xmm5
+	psllw xmm5, 2
+	paddw xmm2, xmm5
+	paddw xmm2, [h264_w0x10_1]
+	psraw  xmm2, 5
+	packuswb xmm2, xmm2
+	movq [r2+1], xmm2
+
+	add r0, r1
+	add r2, r3
+	dec r5
+	jnz .yloop_width_9
+	LOAD_6_PARA_POP
+	ret
+
+
+.width_17:
+.yloop_width_17:
+	movq xmm0, [r0]
+	punpcklbw xmm0, xmm7
+	movq xmm1, [r0+5]
+	punpcklbw xmm1, xmm7
+	movq xmm2, [r0+1]
+	punpcklbw xmm2, xmm7
+	movq xmm3, [r0+4]
+	punpcklbw xmm3, xmm7
+	movq xmm4, [r0+2]
+	punpcklbw xmm4, xmm7
+	movq xmm5, [r0+3]
+	punpcklbw xmm5, xmm7
+
+	paddw xmm2, xmm3
+	paddw xmm4, xmm5
+	psllw xmm4, 2
+	psubw xmm4, xmm2
+	paddw xmm0, xmm1
+	paddw xmm0, xmm4
+	psllw xmm4, 2
+	paddw xmm0, xmm4
+	paddw xmm0, [h264_w0x10_1]
+	psraw  xmm0, 5
+	packuswb xmm0, xmm0
+	movq [r2], xmm0
+
+	movq xmm0, [r0+8]
+	punpcklbw xmm0, xmm7
+	movq xmm1, [r0+5+8]
+	punpcklbw xmm1, xmm7
+	movq xmm2, [r0+1+8]
+	punpcklbw xmm2, xmm7
+	movq xmm3, [r0+4+8]
+	punpcklbw xmm3, xmm7
+	movq xmm4, [r0+2+8]
+	punpcklbw xmm4, xmm7
+	movq xmm5, [r0+3+8]
+	punpcklbw xmm5, xmm7
+
+	movdqa xmm7, xmm2
+	paddw   xmm7, xmm3
+	movdqa xmm6, xmm4
+	paddw   xmm6, xmm5
+	psllw xmm6, 2
+	psubw xmm6, xmm7
+	paddw xmm0, xmm1
+	paddw xmm0, xmm6
+	psllw xmm6, 2
+	paddw xmm0, xmm6
+	paddw xmm0, [h264_w0x10_1]
+	psraw  xmm0, 5
+	packuswb xmm0, xmm0
+	movd [r2+8], xmm0
+
+
+	pxor  xmm7, xmm7
+	movq xmm0, [r0+6+8]
+	punpcklbw xmm0, xmm7
+
+	paddw xmm4, xmm1
+	paddw xmm5, xmm3
+	psllw xmm5, 2
+	psubw xmm5, xmm4
+	paddw xmm2, xmm0
+	paddw xmm2, xmm5
+	psllw xmm5, 2
+	paddw xmm2, xmm5
+	paddw xmm2, [h264_w0x10_1]
+	psraw  xmm2, 5
+	packuswb xmm2, xmm2
+	movq [r2+9], xmm2
+	add r0, r1
+	add r2, r3
+	dec r5
+	jnz .yloop_width_17
+	LOAD_6_PARA_POP
+	ret
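
The odd widths are finished without a scalar tail: for width 17 the code emits a full 8-wide result at offset 0, then re-runs the filter shifted right and lets the stores overlap (movd at +8, movq at +9), so already-correct bytes are simply rewritten with the same values. A sketch of the idea, with filter8 a hypothetical single 8-wide filter application over one row:

#include <stdint.h>

/* Overlapped-store tail handling for width 17 (illustrative): the
 * third call overwrites bytes 9..15 with identical results and adds
 * the one genuinely new pixel at 16. */
void FilterWidth17Row (const uint8_t* pSrc, uint8_t* pDst,
                       void (*filter8) (const uint8_t*, uint8_t*)) {
    filter8 (pSrc,     pDst);        /* pixels 0..7            */
    filter8 (pSrc + 8, pDst + 8);    /* pixels 8..15           */
    filter8 (pSrc + 9, pDst + 9);    /* pixels 9..16 (overlap) */
}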
+
+
+
+ALIGN 16
+;***********************************************************************
+;void McHorVer22HorFirst_sse2( uint8_t *pSrc,
+;                              int32_t iSrcStride,
+;                              int16_t *pTap,
+;                              int32_t iTapStride,
+;                              int32_t iWidth, int32_t iHeight );
+;***********************************************************************
+McHorVer22HorFirst_sse2:
+	;push esi
+	;push edi
+	;push ebx
+	;mov esi, [esp+16]
+	;mov eax, [esp+20]
+	;mov edi, [esp+24]
+	;mov edx, [esp+28]
+	;mov ecx, [esp+32]
+	;mov ebx, [esp+36]
+
+	%assign  push_num 0
+    LOAD_6_PARA
+%ifndef X86_32
+	movsx	r1, r1d
+	movsx	r3, r3d
+	movsx	r4, r4d
+	movsx	r5, r5d
+%endif
+	pxor xmm7, xmm7
+	sub r0, r1				; the 6-tap filter needs 5 extra source lines
+	sub r0, r1
+
+	cmp r4, 9
+	jne near .width_17
+
+.yloop_width_9:
+	movq xmm0, [r0]
+	punpcklbw xmm0, xmm7
+	movq xmm1, [r0+5]
+	punpcklbw xmm1, xmm7
+	movq xmm2, [r0+1]
+	punpcklbw xmm2, xmm7
+	movq xmm3, [r0+4]
+	punpcklbw xmm3, xmm7
+	movq xmm4, [r0+2]
+	punpcklbw xmm4, xmm7
+	movq xmm5, [r0+3]
+	punpcklbw xmm5, xmm7
+
+	movdqa xmm7, xmm2
+	paddw   xmm7, xmm3
+	movdqa xmm6, xmm4
+	paddw   xmm6, xmm5
+	psllw xmm6, 2
+	psubw xmm6, xmm7
+	paddw xmm0, xmm1
+	paddw xmm0, xmm6
+	psllw xmm6, 2
+	paddw xmm0, xmm6
+	movd [r2], xmm0
+
+	pxor  xmm7, xmm7
+	movq xmm0, [r0+6]
+	punpcklbw xmm0, xmm7
+
+	paddw xmm4, xmm1
+	paddw xmm5, xmm3
+	psllw xmm5, 2
+	psubw xmm5, xmm4
+	paddw xmm2, xmm0
+	paddw xmm2, xmm5
+	psllw xmm5, 2
+	paddw xmm2, xmm5
+	movq [r2+2], xmm2
+	movhps [r2+2+8], xmm2
+
+	add r0, r1
+	add r2, r3
+	dec r5
+	jnz .yloop_width_9
+	LOAD_6_PARA_POP
+	ret
+
+
+.width_17:
+.yloop_width_17:
+	movq xmm0, [r0]
+	punpcklbw xmm0, xmm7
+	movq xmm1, [r0+5]
+	punpcklbw xmm1, xmm7
+	movq xmm2, [r0+1]
+	punpcklbw xmm2, xmm7
+	movq xmm3, [r0+4]
+	punpcklbw xmm3, xmm7
+	movq xmm4, [r0+2]
+	punpcklbw xmm4, xmm7
+	movq xmm5, [r0+3]
+	punpcklbw xmm5, xmm7
+
+	paddw xmm2, xmm3
+	paddw xmm4, xmm5
+	psllw xmm4, 2
+	psubw xmm4, xmm2
+	paddw xmm0, xmm1
+	paddw xmm0, xmm4
+	psllw xmm4, 2
+	paddw xmm0, xmm4
+	movdqa [r2], xmm0
+
+	movq xmm0, [r0+8]
+	punpcklbw xmm0, xmm7
+	movq xmm1, [r0+5+8]
+	punpcklbw xmm1, xmm7
+	movq xmm2, [r0+1+8]
+	punpcklbw xmm2, xmm7
+	movq xmm3, [r0+4+8]
+	punpcklbw xmm3, xmm7
+	movq xmm4, [r0+2+8]
+	punpcklbw xmm4, xmm7
+	movq xmm5, [r0+3+8]
+	punpcklbw xmm5, xmm7
+
+	movdqa xmm7, xmm2
+	paddw   xmm7, xmm3
+	movdqa xmm6, xmm4
+	paddw   xmm6, xmm5
+	psllw xmm6, 2
+	psubw xmm6, xmm7
+	paddw xmm0, xmm1
+	paddw xmm0, xmm6
+	psllw xmm6, 2
+	paddw xmm0, xmm6
+	movd [r2+16], xmm0
+
+
+	pxor  xmm7, xmm7
+	movq xmm0, [r0+6+8]
+	punpcklbw xmm0, xmm7
+
+	paddw xmm4, xmm1
+	paddw xmm5, xmm3
+	psllw xmm5, 2
+	psubw xmm5, xmm4
+	paddw xmm2, xmm0
+	paddw xmm2, xmm5
+	psllw xmm5, 2
+	paddw xmm2, xmm5
+	movq [r2+18], xmm2
+	movhps [r2+18+8], xmm2
+
+	add r0, r1
+	add r2, r3
+	dec r5
+	jnz .yloop_width_17
+	LOAD_6_PARA_POP
+	ret
+
+
+%macro FILTER_VER 9
+	paddw  %1, %6
+	movdqa %7, %2
+	movdqa %8, %3
+
+
+	paddw %7, %5
+	paddw %8, %4
+
+	psubw  %1, %7
+	psraw   %1, 2
+	paddw  %1, %8
+	psubw  %1, %7
+	psraw   %1, 2
+	paddw  %8, %1
+	paddw  %8, [h264_mc_hc_32]
+	psraw   %8, 6
+	packuswb %8, %8
+	movq %9, %8
+%endmacro
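
FILTER_VER normalizes the two-dimensionally filtered taps without multiplies or 32-bit widening. With v, t and s the outer, middle and inner pair sums of the int16 taps, the staged shifts compute s + ((((v - t) >> 2) + s - t) >> 2), which in exact arithmetic equals (20s - 5t + v)/16; the final +32 and >>6 then complete the (sum + 512) >> 10 rounding of the full 2D filter. Spelled out in C (a sketch; tap/stride names are illustrative, stride counted in int16 elements):

#include <stdint.h>

/* Second-stage normalization used by FILTER_VER, one output pixel. */
static uint8_t McHorVer22VerTap_ref (const int16_t* tap, int stride) {
    int v = tap[0]          + tap[5 * stride];
    int t = tap[1 * stride] + tap[4 * stride];
    int s = tap[2 * stride] + tap[3 * stride];
    int r = (v - t) >> 2;            /* psubw + psraw 2          */
    r = (r + s - t) >> 2;            /* ~ (20s - 5t + v) / 16    */
    int o = (s + r + 32) >> 6;       /* paddw h264_mc_hc_32, psraw 6 */
    return (uint8_t) (o < 0 ? 0 : (o > 255 ? 255 : o));  /* packuswb */
}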
+;***********************************************************************
+;void McHorVer22Width8VerLastAlign_sse2(
+;											uint8_t *pTap,
+;											int32_t iTapStride,
+;											uint8_t * pDst,
+;											int32_t iDstStride,
+;											int32_t iWidth,
+;											int32_t iHeight);
+;***********************************************************************
+
+ McHorVer22Width8VerLastAlign_sse2:
+	;push esi
+	;push edi
+	;push ebx
+	;push ebp
+
+	;mov esi, [esp+20]
+	;mov eax, [esp+24]
+	;mov edi, [esp+28]
+	;mov edx, [esp+32]
+	;mov ebx, [esp+36]
+	;mov ecx, [esp+40]
+
+	%assign  push_num 0
+    LOAD_6_PARA
+%ifndef X86_32
+	movsx	r1, r1d
+	movsx	r3, r3d
+	movsx	r4, r4d
+	movsx	r5, r5d
+%endif
+%ifndef X86_32
+	push r12
+	push r13
+	push r14
+	mov  r12, r0
+	mov	 r13, r2
+	mov	 r14, r5
+%endif
+
+	shr r4, 3
+
+.width_loop:
+	movdqa xmm0, [r0]
+	movdqa xmm1, [r0+r1]
+	lea r0, [r0+2*r1]
+	movdqa xmm2, [r0]
+	movdqa xmm3, [r0+r1]
+	lea r0, [r0+2*r1]
+	movdqa xmm4, [r0]
+	movdqa xmm5, [r0+r1]
+
+	FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+	dec r5
+	lea r0, [r0+2*r1]
+	movdqa xmm6, [r0]
+
+	movdqa xmm0, xmm1
+	movdqa xmm1, xmm2
+	movdqa xmm2, xmm3
+	movdqa xmm3, xmm4
+	movdqa xmm4, xmm5
+	movdqa xmm5, xmm6
+
+	add r2, r3
+	sub r0, r1
+
+.start:
+	FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r0, [r0+2*r1]
+	movdqa xmm6, [r0]
+	FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r2, [r2+2*r3]
+	movdqa xmm7, [r0+r1]
+	FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r0, [r0+2*r1]
+	movdqa xmm0, [r0]
+	FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r2, [r2+2*r3]
+	movdqa xmm1, [r0+r1]
+	FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r0, [r0+2*r1]
+	movdqa xmm2, [r0]
+	FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r2, [r2+2*r3]
+	movdqa xmm3, [r0+r1]
+	FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r0, [r0+2*r1]
+	movdqa xmm4, [r0]
+	FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r2, [r2+2*r3]
+	movdqa xmm5, [r0+r1]
+	jmp near .start
+
+.x_loop_dec:
+	dec r4
+	jz near .exit
+	;mov esi, [esp+20]
+	;mov edi, [esp+28]
+	;mov ecx, [esp+40]
+%ifdef X86_32
+	mov	r0, arg1
+	mov r2, arg3
+	mov r5, arg6
+%else
+	mov r0, r12
+	mov r2, r13
+	mov r5, r14
+%endif
+	add r0, 16
+	add r2, 8
+	jmp .width_loop
+
+.exit:
+%ifndef X86_32
+	pop r14
+	pop r13
+	pop r12
+%endif
+	LOAD_6_PARA_POP
+	ret
+
+;***********************************************************************
+;void McHorVer22Width8VerLastUnAlign_sse2(
+;											uint8_t *pTap,
+;											int32_t iTapStride,
+;											uint8_t * pDst,
+;											int32_t iDstStride,
+;											int32_t iWidth,
+;											int32_t iHeight);
+;***********************************************************************
+
+ McHorVer22Width8VerLastUnAlign_sse2:
+	;push esi
+	;push edi
+	;push ebx
+	;push ebp
+
+	;mov esi, [esp+20]
+	;mov eax, [esp+24]
+	;mov edi, [esp+28]
+	;mov edx, [esp+32]
+	;mov ebx, [esp+36]
+	;mov ecx, [esp+40]
+
+	%assign  push_num 0
+    LOAD_6_PARA
+%ifndef X86_32
+	movsx	r1, r1d
+	movsx	r3, r3d
+	movsx	r4, r4d
+	movsx	r5, r5d
+%endif
+%ifndef X86_32
+	push r12
+	push r13
+	push r14
+	mov  r12, r0
+	mov	 r13, r2
+	mov	 r14, r5
+%endif
+	shr r4, 3
+
+.width_loop:
+	movdqu xmm0, [r0]
+	movdqu xmm1, [r0+r1]
+	lea r0, [r0+2*r1]
+	movdqu xmm2, [r0]
+	movdqu xmm3, [r0+r1]
+	lea r0, [r0+2*r1]
+	movdqu xmm4, [r0]
+	movdqu xmm5, [r0+r1]
+
+	FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+	dec r5
+	lea r0, [r0+2*r1]
+	movdqu xmm6, [r0]
+
+	movdqa xmm0, xmm1
+	movdqa xmm1, xmm2
+	movdqa xmm2, xmm3
+	movdqa xmm3, xmm4
+	movdqa xmm4, xmm5
+	movdqa xmm5, xmm6
+
+	add r2, r3
+	sub r0, r1
+
+.start:
+	FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r0, [r0+2*r1]
+	movdqu xmm6, [r0]
+	FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r2, [r2+2*r3]
+	movdqu xmm7, [r0+r1]
+	FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r0, [r0+2*r1]
+	movdqu xmm0, [r0]
+	FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r2, [r2+2*r3]
+	movdqu xmm1, [r0+r1]
+	FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r0, [r0+2*r1]
+	movdqu xmm2, [r0]
+	FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r2, [r2+2*r3]
+	movdqu xmm3, [r0+r1]
+	FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r0, [r0+2*r1]
+	movdqu xmm4, [r0]
+	FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r2, [r2+2*r3]
+	movdqu xmm5, [r0+r1]
+	jmp near .start
+
+.x_loop_dec:
+	dec r4
+	jz near .exit
+	;mov esi, [esp+20]
+	;mov edi, [esp+28]
+	;mov ecx, [esp+40]
+%ifdef X86_32
+	mov	r0, arg1
+	mov r2, arg3
+	mov r5, arg6
+%else
+	mov r0, r12
+	mov r2, r13
+	mov r5, r14
+%endif
+	add r0, 16
+	add r2, 8
+	jmp .width_loop
+
+.exit:
+%ifndef X86_32
+	pop r14
+	pop r13
+	pop r12
+%endif
+	LOAD_6_PARA_POP
 	ret
\ No newline at end of file
--- a/codec/common/vaa.asm
+++ b/codec/common/vaa.asm
@@ -160,7 +160,7 @@
 AnalysisVaaInfoIntra_sse2:
 
     %assign push_num 0
-    LOAD_2_PARA 
+    LOAD_2_PARA
     SIGN_EXTENTION r1,r1d
 
 %ifdef X86_32
@@ -175,16 +175,16 @@
     and  r5,0fh
     sub  r7,r5
     sub  r7,32
-    
-    
-    mov r2,r1    
+
+
+    mov r2,r1
     sal r2,$1   ;r2 = 2*iLineSize
     mov r3,r2
     add r3,r1   ;r3 = 3*iLineSize
-    
+
     mov r4,r2
     sal r4,$1   ;r4 = 4*iLineSize
-    
+
 	pxor xmm7, xmm7
 
 	; loops
@@ -225,8 +225,8 @@
 	pshufd xmm2, xmm1, 0B1h
 	paddd xmm1, xmm2
 
-	
-	
+
+
 	movd r2d, xmm0
 	and r2, 0ffffh		; effective low word truncated
 	mov r3, r2
@@ -234,7 +234,7 @@
 	sar r2, $4
 	movd retrd, xmm1
 	sub retrd, r2d
-	
+
 	add r7,32
 	add r7,r5
 
@@ -244,7 +244,7 @@
 	pop r4
 	pop r3
 %endif
-	
+
 	ret
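
The reduction at the end of this routine turns the packed results into an activity measure: xmm0 carries the sum of the sixteen 4x4 sub-block averages and xmm1 the sum of their squares, so the returned value is sqsum - sum*sum/16, i.e. sixteen times the variance of those averages. The equivalent scalar form (a sketch; avg[] is assumed to hold the sixteen sub-block averages, each in 0..255):

#include <stdint.h>

/* What the imul/sar $4/sub tail computes:
 * 16 * Var(avg) = sum(x^2) - sum(x)^2 / 16 over 16 values. */
int32_t VaaIntraActivity_ref (const uint16_t avg[16]) {
    int32_t sum = 0, sqsum = 0;
    for (int i = 0; i < 16; i++) {
        sum   += avg[i];
        sqsum += avg[i] * avg[i];
    }
    return sqsum - ((sum * sum) >> 4);
}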
 
 WELS_EXTERN AnalysisVaaInfoIntra_ssse3
@@ -255,7 +255,7 @@
 AnalysisVaaInfoIntra_ssse3:
 
     %assign push_num 0
-    LOAD_2_PARA 
+    LOAD_2_PARA
     SIGN_EXTENTION r1,r1d
 
 %ifdef X86_32
@@ -265,41 +265,41 @@
     push r6
     %assign push_num push_num+4
 %endif
-   
+
     mov  r5,r7
     and  r5,0fh
     sub  r7,r5
     sub  r7,32
-    
 
-    mov r2,r1    
+
+    mov r2,r1
     sal r2,$1   ;r2 = 2*iLineSize
     mov r3,r2
     add r3,r1   ;r3 = 3*iLineSize
-    
+
     mov r4,r2
     sal r4,$1   ;r4 = 4*iLineSize
-     
+
 	pxor xmm7, xmm7
 
 	; loops
 	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
     movq [r7],xmm0
-    
+
 	lea r0,[r0+r4]
 	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
     movq [r7+8],xmm1
-    
-    
+
+
 	lea r0,[r0+r4]
 	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
     movq [r7+16],xmm0
-    
+
 	lea r0,[r0+r4]
 	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
     movq [r7+24],xmm1
-    
-    
+
+
 	movdqa xmm0,[r7]
 	movdqa xmm1,[r7+16]
 	movdqa xmm2, xmm0
@@ -322,7 +322,7 @@
 	pshufd xmm2, xmm1, 0B1h
 	paddd xmm1, xmm2
 
-    
+
     movd r2d, xmm0
     and r2, 0ffffh          ; effective low word truncated
     mov r3, r2
@@ -339,7 +339,7 @@
 	pop r4
 	pop r3
 %endif
-	
+
 	ret
 
 WELS_EXTERN MdInterAnalysisVaaInfo_sse41
@@ -368,7 +368,7 @@
 	paddd xmm3, xmm4
 	movd r0d, xmm3
 	cmp r0d, 20	; INTER_VARIANCE_SAD_THRESHOLD
-	
+
 	jb near .threshold_exit
 	pshufd xmm0, xmm0, 01Bh
 	pcmpgtd xmm0, xmm1	; iSadBlock > iAverageSad
@@ -412,7 +412,7 @@
 	paddd xmm4, xmm5
 	pshufd xmm5, xmm4, 0B1h
 	paddd xmm5, xmm4
-	
+
 	movd r0d, xmm5
 	cmp r0d, 20	; INTER_VARIANCE_SAD_THRESHOLD
 	jb near .threshold_exit
--- a/codec/decoder/core/asm/intra_pred.asm
+++ b/codec/decoder/core/asm/intra_pred.asm
@@ -477,7 +477,7 @@
 		SSE2_Copy8Times	xmm4, r2d		; mm4 = c,c,c,c,c,c,c,c
 
 		;mov		esi,	[esp + pushsize + 4]
-		mov 	r0, r4 
+		mov 	r0, r4
 		add		r3,	16
 		imul	r2,	-3
 		add		r3,	r2				; s = a + 16 + (-3)*c
--- a/codec/encoder/core/asm/dct.asm
+++ b/codec/encoder/core/asm/dct.asm
@@ -186,7 +186,7 @@
 	movsx r1, r1d
 	movsx r3, r3d
 	%endif
-;	mov     eax, [pDct   ] 
+;	mov     eax, [pDct   ]
     movq    mm0, [r4+ 0]
     movq    mm1, [r4+ 8]
     movq    mm2, [r4+16]
--- a/codec/encoder/core/asm/memzero.asm
+++ b/codec/encoder/core/asm/memzero.asm
@@ -32,8 +32,8 @@
 ;*  memzero.asm
 ;*
 ;*  Abstract
-;*      
 ;*
+;*
 ;*  History
 ;*      9/16/2009 Created
 ;*
@@ -45,8 +45,8 @@
 ; Code
 ;***********************************************************************
 
-SECTION .text			
-		
+SECTION .text
+
 ALIGN 16
 ;***********************************************************************
 ;_inline void __cdecl WelsPrefetchZero_mmx(int8_t const*_A);
@@ -57,7 +57,7 @@
 	LOAD_1_PARA
 	;mov  eax,[esp+4]
 	prefetchnta [r0]
-	ret 			
+	ret
 
 
 ALIGN 16
@@ -71,7 +71,7 @@
 		LOAD_2_PARA
 		SIGN_EXTENTION r1, r1d
 		neg		r1
-			
+
 		pxor	xmm0,		xmm0
 .memzeroa64_sse2_loops:
 		movdqa	[r0],		xmm0
@@ -79,12 +79,12 @@
 		movdqa	[r0+32],	xmm0
 		movdqa	[r0+48],	xmm0
 		add		r0, 0x40
-		
+
 		add r1, 0x40
 		jnz near .memzeroa64_sse2_loops
-			
-		ret	
 
+		ret
+
 ALIGN 16
 ;***********************************************************************
 ;   void WelsSetMemZeroSize64_mmx(void *dst, int32_t size)
@@ -96,7 +96,7 @@
 		LOAD_2_PARA
 		SIGN_EXTENTION r1, r1d
 		neg		r1
-			
+
 		pxor	mm0,		mm0
 .memzero64_mmx_loops:
 		movq	[r0],		mm0
@@ -106,16 +106,16 @@
 		movq	[r0+32],	mm0
 		movq	[r0+40],	mm0
 		movq	[r0+48],	mm0
-		movq	[r0+56],	mm0		
+		movq	[r0+56],	mm0
 		add		r0,		0x40
-		
+
 		add r1, 0x40
 		jnz near .memzero64_mmx_loops
-			
-		WELSEMMS	
-		ret	
-	
-ALIGN 16		
+
+		WELSEMMS
+		ret
+
+ALIGN 16
 ;***********************************************************************
 ;   void WelsSetMemZeroSize8_mmx(void *dst, int32_t size)
 ;***********************************************************************
@@ -125,17 +125,17 @@
 		%assign  push_num 0
 		LOAD_2_PARA
 		SIGN_EXTENTION r1, r1d
-		neg		r1			
+		neg		r1
 		pxor	mm0,		mm0
-		
+
 .memzero8_mmx_loops:
 		movq	[r0],		mm0
 		add		r0,		0x08
-	
+
 		add		r1,		0x08
 		jnz near .memzero8_mmx_loops
-		
-		WELSEMMS	
-		ret	
 
-							
+		WELSEMMS
+		ret
+
+
--- a/codec/encoder/core/asm/satd_sad.asm
+++ b/codec/encoder/core/asm/satd_sad.asm
@@ -1,2344 +1,2344 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  satd_sad.asm
-;*
-;*  Abstract
-;*      WelsSampleSatd4x4_sse2
-;*      WelsSampleSatd8x8_sse2
-;*      WelsSampleSatd16x8_sse2
-;*      WelsSampleSatd8x16_sse2
-;*      WelsSampleSatd16x16_sse2
-;*
-;*      WelsSampleSad16x8_sse2
-;*      WelsSampleSad16x16_sse2
-;*
-;*  History
-;*      8/5/2009 Created
-;*     24/9/2009 modified
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-;***********************************************************************
-; Data
-;***********************************************************************
-SECTION .rodata align=16
-
-align 16
-HSumSubDB1:   db 1,1,1,1,1,1,1,1,1,-1,1,-1,1,-1,1,-1
-align 16
-HSumSubDW1:   dw 1,-1,1,-1,1,-1,1,-1
-align 16
-PDW1:  dw 1,1,1,1,1,1,1,1
-align 16
-PDQ2:  dw 2,0,0,0,2,0,0,0
-align 16
-HSwapSumSubDB1:   times 2 db 1, 1, 1, 1, 1, -1, 1, -1
-
-;***********************************************************************
-; Code
-;***********************************************************************
-SECTION .text
-
-;***********************************************************************
-;
-;Pixel_satd_wxh_sse2 BEGIN
-;
-;***********************************************************************
-%macro MMX_DW_1_2REG 2
-      pxor %1, %1
-      pcmpeqw %2, %2
-      psubw %1, %2
-%endmacro
-
-%macro  SSE2_SumWHorizon1 2
-	movdqa      %2, %1
-	psrldq      %2, 8
-	paddusw     %1, %2
-	movdqa      %2, %1
-	psrldq      %2, 4
-	paddusw     %1, %2
-	movdqa      %2, %1
-	psrldq      %2, 2
-	paddusw     %1, %2
-%endmacro
-
-%macro SSE2_HDMTwo4x4 5 ;in: xmm1,xmm2,xmm3,xmm4  pOut: xmm4,xmm2,xmm1,xmm3
-   SSE2_SumSub %1, %2, %5
-   SSE2_SumSub %3, %4, %5
-   SSE2_SumSub %2, %4, %5
-   SSE2_SumSub %1, %3, %5
-%endmacro
-
-%macro SSE2_SumAbs4 7
-	WELS_AbsW %1, %3
-	WELS_AbsW %2, %3
-	WELS_AbsW %4, %6
-	WELS_AbsW %5, %6
-	paddusw       %1, %2
-	paddusw       %4, %5
-	paddusw       %7, %1
-	paddusw       %7, %4
-%endmacro
-
-%macro  SSE2_SumWHorizon 3
-	movhlps		%2, %1			; x2 = xx xx xx xx d7 d6 d5 d4
-	paddw		%1, %2			; x1 = xx xx xx xx d37 d26 d15 d04
-	punpcklwd	%1, %3			; x1 =  d37  d26 d15 d04
-	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26
-	paddd		%1, %2			; x1 = xxxx xxxx d1357 d0246
-	pshuflw		%2, %1, 0x4e	; x2 = xxxx xxxx d0246 d1357
-	paddd		%1, %2			; x1 = xxxx xxxx xxxx  d01234567
-%endmacro
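
SSE2_SumWHorizon folds a vector of eight word sums down to one scalar, widening to dwords against a zero register after the first fold so the running SATD total cannot wrap. The same reduction with SSE2 intrinsics (a sketch, not the shipped code; _mm_srli_si128 plays the role of movhlps, which is equivalent for the low-half result):

#include <emmintrin.h>
#include <stdint.h>

/* Horizontal sum of 8 unsigned word accumulators. */
static int32_t SumWHorizon_ref (__m128i x) {
    x = _mm_add_epi16 (x, _mm_srli_si128 (x, 8));         /* 8 -> 4 words   */
    x = _mm_unpacklo_epi16 (x, _mm_setzero_si128 ());     /* widen to dwords */
    x = _mm_add_epi32 (x, _mm_srli_si128 (x, 8));         /* 4 -> 2 dwords  */
    x = _mm_add_epi32 (x, _mm_shufflelo_epi16 (x, 0x4e)); /* 2 -> 1 dword   */
    return _mm_cvtsi128_si32 (x);
}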
-
-%macro SSE2_GetSatd8x8 0
-	SSE2_LoadDiff8P    xmm0,xmm4,xmm7,[r0],[r2]
-	SSE2_LoadDiff8P    xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
-	lea                 r0, [r0+2*r1]
-	lea                 r2, [r2+2*r3]
-	SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[r0],[r2]
-	SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
-
-	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm4
-	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm4
-	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5
-	SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
-
-	lea					r0,    [r0+2*r1]
-    lea					r2,    [r2+2*r3]
-	SSE2_LoadDiff8P    xmm0,xmm4,xmm7,[r0],[r2]
-	SSE2_LoadDiff8P    xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
-	lea                 r0, [r0+2*r1]
-	lea                 r2, [r2+2*r3]
-	SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[r0],[r2]
-	SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
-
-	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm4
-	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm4
-	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5
-	SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
-%endmacro
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd4x4_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd4x4_sse2
-align 16
-WelsSampleSatd4x4_sse2:
-	;push      ebx
-	;mov       eax,  [esp+8]
-	;mov       ebx,  [esp+12]
-	;mov       ecx,  [esp+16]
-	;mov       edx,  [esp+20]
-	
-	%assign  push_num 0
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-    movd      xmm0, [r0]
-    movd      xmm1, [r0+r1]
-    lea       r0 , [r0+2*r1]
-    movd      xmm2, [r0]
-    movd      xmm3, [r0+r1]
-    punpckldq xmm0, xmm2
-    punpckldq xmm1, xmm3
-
-    movd      xmm4, [r2]
-    movd      xmm5, [r2+r3]
-    lea       r2 , [r2+2*r3]
-    movd      xmm6, [r2]
-    movd      xmm7, [r2+r3]
-    punpckldq xmm4, xmm6
-    punpckldq xmm5, xmm7
-
-    pxor      xmm6, xmm6
-    punpcklbw xmm0, xmm6
-    punpcklbw xmm1, xmm6
-    punpcklbw xmm4, xmm6
-    punpcklbw xmm5, xmm6
-
-    psubw     xmm0, xmm4
-    psubw     xmm1, xmm5
-
-    movdqa    xmm2, xmm0
-    paddw     xmm0, xmm1
-    psubw     xmm2, xmm1
-    SSE2_XSawp qdq, xmm0, xmm2, xmm3
-
-    movdqa     xmm4, xmm0
-    paddw      xmm0, xmm3
-    psubw      xmm4, xmm3
-
-    movdqa         xmm2, xmm0
-    punpcklwd      xmm0, xmm4
-    punpckhwd      xmm4, xmm2
-
-	SSE2_XSawp     dq,  xmm0, xmm4, xmm3
-	SSE2_XSawp     qdq, xmm0, xmm3, xmm5
-
-    movdqa         xmm7, xmm0
-    paddw          xmm0, xmm5
-    psubw          xmm7, xmm5
-
-	SSE2_XSawp     qdq,  xmm0, xmm7, xmm1
-
-    movdqa         xmm2, xmm0
-    paddw          xmm0, xmm1
-    psubw          xmm2, xmm1
-
-    WELS_AbsW  xmm0, xmm3
-    paddusw        xmm6, xmm0
-	WELS_AbsW  xmm2, xmm4
-    paddusw        xmm6, xmm2
-    SSE2_SumWHorizon1  xmm6, xmm4
-	movd           retrd,  xmm6
-    and            retrd,  0xffff
-    shr            retrd,  1
-	LOAD_4_PARA_POP
-	ret
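
The function above is the 4x4 SATD kernel: it forms the difference block, applies a 4x4 Hadamard transform horizontally and vertically, sums the absolute transformed coefficients and halves the total (the final shr). A plain-C reference of the same quantity (a sketch; any orthogonal ordering of the Hadamard butterflies yields the same absolute sum):

#include <stdint.h>
#include <stdlib.h>

/* Straightforward 4x4 SATD reference. */
int32_t SampleSatd4x4_ref (const uint8_t* p1, int s1,
                           const uint8_t* p2, int s2) {
    int d[4][4], sum = 0;
    for (int i = 0; i < 4; i++)
        for (int j = 0; j < 4; j++)
            d[i][j] = p1[i * s1 + j] - p2[i * s2 + j];
    for (int i = 0; i < 4; i++) {                    /* horizontal pass */
        int a = d[i][0] + d[i][2], b = d[i][0] - d[i][2];
        int c = d[i][1] + d[i][3], e = d[i][1] - d[i][3];
        d[i][0] = a + c; d[i][1] = b + e; d[i][2] = a - c; d[i][3] = b - e;
    }
    for (int j = 0; j < 4; j++) {                    /* vertical pass   */
        int a = d[0][j] + d[2][j], b = d[0][j] - d[2][j];
        int c = d[1][j] + d[3][j], e = d[1][j] - d[3][j];
        sum += abs (a + c) + abs (b + e) + abs (a - c) + abs (b - e);
    }
    return sum >> 1;                 /* matches the final "shr retrd, 1" */
}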
-
- ;***********************************************************************
- ;
- ;int32_t WelsSampleSatd8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
- ;
- ;***********************************************************************
- WELS_EXTERN WelsSampleSatd8x8_sse2
-align 16
- WelsSampleSatd8x8_sse2:
-	 ;push   ebx
-	 ;mov    eax,    [esp+8]
-	 ;mov    ebx,    [esp+12]
-	 ;mov    ecx,    [esp+16]
-	 ;mov    edx,    [esp+20]
-	 
-	%assign  push_num 0
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d	
-	pxor   xmm6,   xmm6
-    pxor   xmm7,   xmm7
-    SSE2_GetSatd8x8
-    psrlw   xmm6,  1
-	SSE2_SumWHorizon   xmm6,xmm4,xmm7
-	movd    retrd,   xmm6
-	LOAD_4_PARA_POP
-	ret
-
- ;***********************************************************************
- ;
- ;int32_t WelsSampleSatd8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
- ;
- ;***********************************************************************
- WELS_EXTERN WelsSampleSatd8x16_sse2
-align 16
- WelsSampleSatd8x16_sse2:
-	 ;push   ebx
-	 ;mov    eax,    [esp+8]
-	 ;mov    ebx,    [esp+12]
-	 ;mov    ecx,    [esp+16]
-	 ;mov    edx,    [esp+20]
-	 
-	 %assign  push_num 0
-	 LOAD_4_PARA
-	 SIGN_EXTENTION r1, r1d
-	 SIGN_EXTENTION r3, r3d	 
-	 pxor   xmm6,   xmm6
-     pxor   xmm7,   xmm7
-
-	 SSE2_GetSatd8x8
-     lea    r0,    [r0+2*r1]
-     lea    r2,    [r2+2*r3]
-	 SSE2_GetSatd8x8
-
-	 psrlw   xmm6,  1
-	 SSE2_SumWHorizon   xmm6,xmm4,xmm7
-	 movd    retrd,   xmm6
-	 LOAD_4_PARA_POP
-	 ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd16x8_sse2
-align 16
-WelsSampleSatd16x8_sse2:
-	;push   ebx
-	;mov    eax,    [esp+8]
-	;mov    ebx,    [esp+12]
-	;mov    ecx,    [esp+16]
-	;mov    edx,    [esp+20]
-	
-	%assign  push_num 0
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d	
-	push r0
-	push r2	
-	pxor   xmm6,   xmm6
-    pxor   xmm7,   xmm7
-
-	SSE2_GetSatd8x8
-	
-	pop r2
-	pop r0
-	;mov    eax,    [esp+8]
-    ;mov    ecx,    [esp+16]
-    add    r0,    8
-    add    r2,    8
-	SSE2_GetSatd8x8
-
-	psrlw   xmm6,  1
-	SSE2_SumWHorizon   xmm6,xmm4,xmm7
-	movd    retrd,   xmm6
-	LOAD_4_PARA_POP
-	ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd16x16_sse2
-align 16
-WelsSampleSatd16x16_sse2:
-	;push   ebx
-	;mov    eax,    [esp+8]
-	;mov    ebx,    [esp+12]
-	;mov    ecx,    [esp+16]
-	;mov    edx,    [esp+20]
-	
-	%assign  push_num 0
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-	push r0
-	push r2	
-	pxor   xmm6,   xmm6
-    pxor   xmm7,   xmm7
-
-	SSE2_GetSatd8x8
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	SSE2_GetSatd8x8
-
-	pop r2
-	pop r0
-	;mov    eax,    [esp+8]
-	;mov    ecx,    [esp+16]
-	add    r0,    8
-	add    r2,    8
-
-	SSE2_GetSatd8x8
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	SSE2_GetSatd8x8
-
- ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
-    psrlw   xmm6,  1
-	SSE2_SumWHorizon   xmm6,xmm4,xmm7
-	movd    retrd,   xmm6
-	LOAD_4_PARA_POP
-	ret
-
-;***********************************************************************
-;
-;Pixel_satd_wxh_sse2 END
-;
-;***********************************************************************
-
-;***********************************************************************
-;
-;Pixel_satd_intra_sse2 BEGIN
-;
-;***********************************************************************
-
-%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3
-	pmaddubsw    %1, xmm5
-	movdqa       %2, %1
-	pmaddwd      %1, xmm7
-	pmaddwd      %2, xmm6
-	movdqa       %3, %1
-	punpckldq    %1, %2
-	punpckhdq    %2, %3
-	movdqa       %3, %1
-	punpcklqdq   %1, %2
-	punpckhqdq   %3, %2
-	paddd        xmm4, %1 ;for dc
-	paddd        xmm4, %3 ;for dc
-	packssdw     %1, %3
-	psllw        %1, 2
-%endmacro
-%macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2
-	pmaddubsw    %1, xmm5
-	movdqa       %2, %1
-	pmaddwd      %1, xmm7
-	pmaddwd      %2, xmm6
-	movdqa       %3, %1
-	punpckldq    %1, %2
-	punpckhdq    %2, %3
-	movdqa       %3, %1
-	punpcklqdq   %1, %2
-	punpckhqdq   %3, %2
-;    paddd        xmm4, %1 ;for dc
-;	 paddd        xmm4, %3 ;for dc
-	movdqa       %4, %1
-	punpcklqdq   %4, %3
-	packssdw     %1, %3
-	psllw        %1, 2
-%endmacro
-
-%macro SSE41_GetX38x4SatdDec 0
-	pxor        xmm7,   xmm7
-	movq        xmm0,   [eax]
-	movq        xmm1,   [eax+ebx]
-	lea         eax,    [eax+2*ebx]
-	movq        xmm2,   [eax]
-	movq        xmm3,   [eax+ebx]
-	lea         eax,    [eax+2*ebx]
-	punpcklbw   xmm0,   xmm7
-	punpcklbw   xmm1,   xmm7
-	punpcklbw   xmm2,   xmm7
-	punpcklbw   xmm3,   xmm7
-	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm7
-	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm7
-	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2
-	;doesn't need another transpose
-%endmacro
-%macro SSE41_GetX38x4SatdV 2
-	pxor        xmm0,   xmm0
-	pinsrw      xmm0,   word[esi+%2],   0
-	pinsrw      xmm0,   word[esi+%2+8], 4
-	psubsw      xmm0,   xmm7
-	pabsw       xmm0,   xmm0
-	paddw       xmm4,   xmm0
-	pxor        xmm0,   xmm0
-	pinsrw      xmm0,   word[esi+%2+2],  0
-	pinsrw      xmm0,   word[esi+%2+10], 4
-	psubsw      xmm0,   xmm1
-	pabsw       xmm0,   xmm0
-	paddw       xmm4,   xmm0
-	pxor        xmm0,   xmm0
-	pinsrw      xmm0,   word[esi+%2+4],  0
-	pinsrw      xmm0,   word[esi+%2+12], 4
-	psubsw      xmm0,   xmm3
-	pabsw       xmm0,   xmm0
-	paddw       xmm4,   xmm0
-	pxor        xmm0,   xmm0
-	pinsrw      xmm0,   word[esi+%2+6],  0
-	pinsrw      xmm0,   word[esi+%2+14], 4
-	psubsw      xmm0,   xmm2
-	pabsw       xmm0,   xmm0
-	paddw       xmm4,   xmm0
-%endmacro
-%macro SSE41_GetX38x4SatdH  3
-	movq        xmm0,   [esi+%3+8*%1]
-	punpcklqdq  xmm0,   xmm0
-	psubsw      xmm0,   xmm7
-	pabsw       xmm0,   xmm0
-	paddw       xmm5,   xmm0
-	pabsw       xmm1,   xmm1
-	pabsw       xmm2,   xmm2
-	pabsw       xmm3,   xmm3
-	paddw       xmm2,   xmm1;for DC
-	paddw       xmm2,   xmm3;for DC
-	paddw       xmm5,   xmm2
-%endmacro
-%macro SSE41_I16X16GetX38x4SatdDC 0
-	pxor        xmm0,   xmm0
-	movq2dq     xmm0,   mm4
-	punpcklqdq  xmm0,   xmm0
-	psubsw      xmm0,   xmm7
-	pabsw       xmm0,   xmm0
-	paddw       xmm6,   xmm0
-	paddw       xmm6,   xmm2
-%endmacro
-%macro SSE41_ChromaGetX38x4SatdDC 1
-	shl         %1,     4
-	movdqa      xmm0,   [esi+32+%1]
-	psubsw      xmm0,   xmm7
-	pabsw       xmm0,   xmm0
-	paddw       xmm6,   xmm0
-	paddw       xmm6,   xmm2
-%endmacro
-%macro SSE41_I16x16GetX38x4Satd 2
-	SSE41_GetX38x4SatdDec
-	SSE41_GetX38x4SatdV   %1, %2
-	SSE41_GetX38x4SatdH   %1, %2, 32
-	SSE41_I16X16GetX38x4SatdDC
-%endmacro
-%macro SSE41_ChromaGetX38x4Satd 2
-	SSE41_GetX38x4SatdDec
-	SSE41_GetX38x4SatdV   %1, %2
-	SSE41_GetX38x4SatdH   %1, %2, 16
-	SSE41_ChromaGetX38x4SatdDC %1
-%endmacro
-%macro SSE41_HSum8W 3
-	pmaddwd     %1, %2
-	movhlps     %3, %1
-	paddd       %1, %3
-	pshuflw     %3, %1,0Eh
-	paddd       %1, %3
-%endmacro
-
-
-%ifdef X86_32
-WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
-WelsIntra16x16Combined3Satd_sse41:
-	push   ebx
-	push   esi
-	push   edi
-	mov    ecx,    [esp+16]
-	mov    edx,    [esp+20]
-	mov    eax,    [esp+24]
-	mov    ebx,    [esp+28]
-	mov    esi,    [esp+40] ;temp_satd
-	pxor        xmm4,   xmm4
-	movdqa      xmm5,   [HSumSubDB1]
-	movdqa      xmm6,   [HSumSubDW1]
-	movdqa      xmm7,   [PDW1]
-	sub         ecx,    edx
-	movdqu 		xmm0,   [ecx]
-	movhlps		xmm1,   xmm0
-	punpcklqdq  xmm0,   xmm0
-	punpcklqdq  xmm1,   xmm1
-	SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
-	SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
-	movdqa      [esi],  xmm0 ;V
-	movdqa      [esi+16], xmm1
-	add         ecx,    edx
-	pinsrb      xmm0,   byte[ecx-1], 0
-	pinsrb      xmm0,   byte[ecx+edx-1], 1
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     2
-	pinsrb      xmm0,   byte[ecx+edx-1], 3
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     4
-	pinsrb      xmm0,   byte[ecx+edx-1], 5
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     6
-	pinsrb      xmm0,   byte[ecx+edx-1], 7
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     8
-	pinsrb      xmm0,   byte[ecx+edx-1], 9
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     10
-	pinsrb      xmm0,   byte[ecx+edx-1], 11
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     12
-	pinsrb      xmm0,   byte[ecx+edx-1], 13
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     14
-	pinsrb      xmm0,   byte[ecx+edx-1], 15
-	movhlps		xmm1,   xmm0
-	punpcklqdq  xmm0,   xmm0
-	punpcklqdq  xmm1,   xmm1
-	SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
-	SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
-	movdqa      [esi+32], xmm0 ;H
-	movdqa      [esi+48], xmm1
-	movd        ecx,    xmm4 ;dc
-	add         ecx,    16   ;(sum+16)
-	shr         ecx,    5    ;((sum+16)>>5)
-	shl         ecx,    4    ;
-	movd        mm4,    ecx  ; mm4 copy DC
-	pxor        xmm4,   xmm4 ;V
-	pxor        xmm5,   xmm5 ;H
-	pxor        xmm6,   xmm6 ;DC
-	mov         ecx,    0
-	mov         edi,    0
-.loop16x16_get_satd:
-.loopStart1:
-	SSE41_I16x16GetX38x4Satd ecx, edi
-	inc          ecx
-	cmp         ecx, 4
-	jl          .loopStart1
-	cmp         edi, 16
-	je          .loop16x16_get_satd_end
-	mov         eax, [esp+24]
-	add         eax, 8
-	mov         ecx, 0
-	add         edi, 16
-	jmp         .loop16x16_get_satd
- .loop16x16_get_satd_end:
-	MMX_DW_1_2REG    xmm0, xmm1
-	psrlw       xmm4, 1 ;/2
-	psrlw       xmm5, 1 ;/2
-	psrlw       xmm6, 1 ;/2
-	SSE41_HSum8W     xmm4, xmm0, xmm1
-	SSE41_HSum8W     xmm5, xmm0, xmm1
-	SSE41_HSum8W     xmm6, xmm0, xmm1
-
-	; comparing order: DC H V
-	movd      ebx, xmm6 ;DC
-	movd      edi, xmm5 ;H
-	movd      ecx, xmm4 ;V
-	mov      edx, [esp+36]
-	shl       edx, 1
-	add       edi, edx
-	add       ebx, edx
-	mov       edx, [esp+32]
-	cmp       ebx, edi
-	jge near   not_dc_16x16
-	cmp        ebx, ecx
-	jge near   not_dc_h_16x16
-
-	; for DC mode
-	mov       dword[edx], 2;I16_PRED_DC
-	mov       eax, ebx
-	jmp near return_satd_intra_16x16_x3
-not_dc_16x16:
-	; for H mode
-	cmp       edi, ecx
-	jge near   not_dc_h_16x16
-	mov       dword[edx], 1;I16_PRED_H
-	mov       eax, edi
-	jmp near return_satd_intra_16x16_x3
-not_dc_h_16x16:
-	; for V mode
-	mov       dword[edx], 0;I16_PRED_V
-	mov       eax, ecx
-return_satd_intra_16x16_x3:
-	WELSEMMS
-	pop         edi
-	pop         esi
-	pop         ebx
-ret
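
The comparison block at the end orders the candidates DC, H, V, biases the H and DC costs by 2*lambda (the shl on the lambda argument), and writes the winning I16_PRED_* value through the mode pointer. The same decision in C (a sketch with illustrative names; the mode constants are the ones annotated in the listing):

#include <stdint.h>

enum { I16_PRED_V = 0, I16_PRED_H = 1, I16_PRED_DC = 2 };

/* Candidate order and tie handling follow the asm: on equal cost,
 * V is preferred over H, and both over DC. */
int32_t PickIntra16x16Mode_ref (int32_t satd_v, int32_t satd_h,
                                int32_t satd_dc, int32_t lambda,
                                int32_t* pBestMode) {
    int32_t cost_h  = satd_h  + 2 * lambda;
    int32_t cost_dc = satd_dc + 2 * lambda;
    if (cost_dc < cost_h && cost_dc < satd_v) {
        *pBestMode = I16_PRED_DC; return cost_dc;
    }
    if (cost_h < satd_v) { *pBestMode = I16_PRED_H; return cost_h; }
    *pBestMode = I16_PRED_V; return satd_v;
}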
-
-%macro SSE41_ChromaGetX38x8Satd 0
-	movdqa      xmm5,   [HSumSubDB1]
-	movdqa      xmm6,   [HSumSubDW1]
-	movdqa      xmm7,   [PDW1]
-	sub         ecx,    edx
-	movq 		xmm0,   [ecx]
-	punpcklqdq  xmm0,   xmm0
-	SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
-	movdqa      [esi],  xmm0 ;V
-	add         ecx,    edx
-	pinsrb      xmm0,   byte[ecx-1], 0
-	pinsrb      xmm0,   byte[ecx+edx-1], 1
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     2
-	pinsrb      xmm0,   byte[ecx+edx-1], 3
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     4
-	pinsrb      xmm0,   byte[ecx+edx-1], 5
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     6
-	pinsrb      xmm0,   byte[ecx+edx-1], 7
-	punpcklqdq  xmm0,   xmm0
-	SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
-	movdqa      [esi+16], xmm0 ;H
-;(sum+2)>>2
-	movdqa      xmm6,   [PDQ2]
-	movdqa      xmm5,   xmm4
-	punpckhqdq  xmm5,   xmm1
-	paddd       xmm5,   xmm6
-	psrld       xmm5,   2
-;(sum1+sum2+4)>>3
-	paddd       xmm6,   xmm6
-	paddd       xmm4,   xmm1
-	paddd       xmm4,   xmm6
-	psrld       xmm4,   3
-;satd *16
-	pslld       xmm5,   4
-	pslld       xmm4,   4
-;temp satd
-	movdqa      xmm6,   xmm4
-	punpcklqdq  xmm4,   xmm5
-	psllq       xmm4,   32
-	psrlq       xmm4,   32
-	movdqa      [esi+32], xmm4
-	punpckhqdq  xmm5,   xmm6
-	psllq       xmm5,   32
-	psrlq       xmm5,   32
-	movdqa      [esi+48], xmm5
-
-	pxor        xmm4,   xmm4 ;V
-	pxor        xmm5,   xmm5 ;H
-	pxor        xmm6,   xmm6 ;DC
-	mov         ecx,    0
-loop_chroma_satdx3_cb_cr:
-	SSE41_ChromaGetX38x4Satd ecx, 0
-	inc             ecx
-	cmp             ecx, 2
-	jl              loop_chroma_satdx3_cb_cr
-%endmacro
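
The middle of this macro derives the chroma DC predictor terms from the edge sums: the corner 4x4 blocks that see both a top and a left edge use (sum1 + sum2 + 4) >> 3, side blocks use (sum + 2) >> 2, and the results are shifted left by 4 so they compare directly against the transform-domain SATD terms. The reference formulas for the four quadrants (a sketch; the asm computes the same quantities SIMD-wide):

#include <stdint.h>

/* H.264 chroma 8x8 DC predictor value per 4x4 quadrant. */
void Chroma8x8DcValues_ref (const uint8_t top[8], const uint8_t left[8],
                            int dc[4]) {
    int t0 = 0, t1 = 0, l0 = 0, l1 = 0;
    for (int i = 0; i < 4; i++) {
        t0 += top[i];  t1 += top[i + 4];
        l0 += left[i]; l1 += left[i + 4];
    }
    dc[0] = (t0 + l0 + 4) >> 3;   /* top-left: both edges     */
    dc[1] = (t1 + 2) >> 2;        /* top-right: top edge only */
    dc[2] = (l1 + 2) >> 2;        /* bottom-left: left edge   */
    dc[3] = (t1 + l1 + 4) >> 3;   /* bottom-right: both edges */
}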
-
-%macro SSEReg2MMX 3
-	movdq2q     %2, %1
-	movhlps     %1, %1
-	movdq2q     %3, %1
-%endmacro
-%macro MMXReg2SSE 4
-	movq2dq     %1, %3
-	movq2dq     %2, %4
-	punpcklqdq  %1, %2
-%endmacro
-;for reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41
-
-WELS_EXTERN WelsIntraChroma8x8Combined3Satd_sse41
-WelsIntraChroma8x8Combined3Satd_sse41:
-	push   ebx
-	push   esi
-	push   edi
-	mov    ecx,    [esp+16]
-	mov    edx,    [esp+20]
-	mov    eax,    [esp+24]
-	mov    ebx,    [esp+28]
-	mov    esi,    [esp+40] ;temp_satd
-	xor    edi,    edi
-loop_chroma_satdx3:
-	SSE41_ChromaGetX38x8Satd
-	cmp             edi, 1
-	je              loop_chroma_satdx3end
-	inc             edi
-	SSEReg2MMX  xmm4, mm0,mm1
-	SSEReg2MMX  xmm5, mm2,mm3
-	SSEReg2MMX  xmm6, mm5,mm6
-	mov         ecx,  [esp+44]
-	mov         eax,  [esp+48]
-	jmp         loop_chroma_satdx3
-loop_chroma_satdx3end:
-	MMXReg2SSE  xmm0, xmm3, mm0, mm1
-	MMXReg2SSE  xmm1, xmm3, mm2, mm3
-	MMXReg2SSE  xmm2, xmm3, mm5, mm6
-
-	paddw       xmm4, xmm0
-	paddw       xmm5, xmm1
-	paddw       xmm6, xmm2
-
-	MMX_DW_1_2REG    xmm0, xmm1
-	psrlw       xmm4, 1 ;/2
-	psrlw       xmm5, 1 ;/2
-	psrlw       xmm6, 1 ;/2
-	SSE41_HSum8W     xmm4, xmm0, xmm1
-	SSE41_HSum8W     xmm5, xmm0, xmm1
-	SSE41_HSum8W     xmm6, xmm0, xmm1
-	; comparing order: DC H V
-	movd      ebx, xmm6 ;DC
-	movd      edi, xmm5 ;H
-	movd      ecx, xmm4 ;V
-	mov       edx, [esp+36]
-	shl       edx, 1
-	add       edi, edx
-	add       ecx, edx
-	mov       edx, [esp+32]
-	cmp       ebx, edi
-	jge near   not_dc_8x8
-	cmp        ebx, ecx
-	jge near   not_dc_h_8x8
-
-	; for DC mode
-	mov       dword[edx], 0;I8_PRED_DC
-	mov       eax, ebx
-	jmp near return_satd_intra_8x8_x3
-not_dc_8x8:
-	; for H mode
-	cmp       edi, ecx
-	jge near   not_dc_h_8x8
-	mov       dword[edx], 1;I8_PRED_H
-	mov       eax, edi
-	jmp near return_satd_intra_8x8_x3
-not_dc_h_8x8:
-	; for V mode
-	mov       dword[edx], 2;I8_PRED_V
-	mov       eax, ecx
-return_satd_intra_8x8_x3:
-	WELSEMMS
-	pop         edi
-	pop         esi
-	pop         ebx
-ret
-
-
-;***********************************************************************
-;
-;Pixel_satd_intra_sse2 END
-;
-;***********************************************************************
-%macro SSSE3_Get16BSadHVDC 2
-  movd        xmm6,%1
-  pshufb      xmm6,xmm1
-  movdqa      %1,  xmm6
-  movdqa      xmm0,%2
-  psadbw      xmm0,xmm7
-  paddw       xmm4,xmm0
-  movdqa      xmm0,%2
-  psadbw      xmm0,xmm5
-  paddw       xmm2,xmm0
-  psadbw      xmm6,%2
-  paddw       xmm3,xmm6
-%endmacro
-%macro WelsAddDCValue 4
-    movzx   %2, byte %1
-    mov    %3, %2
-    add     %4, %2
-%endmacro
-
-;***********************************************************************
-;
-;Pixel_sad_intra_ssse3 BEGIN
-;
-;***********************************************************************
-WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
-WelsIntra16x16Combined3Sad_ssse3:
-	push   ebx
-	push   esi
-	push   edi
-	mov    ecx,    [esp+16]
-	mov    edx,    [esp+20]
-	mov    edi,    [esp+40] ;temp_sad
-	sub    ecx,    edx
-    movdqa      xmm5,[ecx]
-    pxor        xmm0,xmm0
-    psadbw      xmm0,xmm5
-    movhlps     xmm1,xmm0
-    paddw       xmm0,xmm1
-    movd        eax,xmm0
-
-    add         ecx,edx
-    lea         ebx, [edx+2*edx]
-    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
-    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
-    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
-    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
-    lea         ecx, [ecx+4*edx]
-    add         edi, 64
-    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
-    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
-    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
-    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
-    lea         ecx, [ecx+4*edx]
-    add         edi, 64
-    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
-    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
-    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
-    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
-    lea         ecx, [ecx+4*edx]
-    add         edi, 64
-    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
-    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
-    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
-    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
-    sub        edi, 192
-    add         eax,10h
-    shr         eax,5
-    movd        xmm7,eax
-    pxor        xmm1,xmm1
-    pshufb      xmm7,xmm1
-    pxor        xmm4,xmm4
-    pxor        xmm3,xmm3
-    pxor        xmm2,xmm2
-;sad begin
-	mov    eax,    [esp+24]
-	mov    ebx,    [esp+28]
-    lea         esi, [ebx+2*ebx]
-    SSSE3_Get16BSadHVDC [edi], [eax]
-    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
-    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
-    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
-    add         edi, 64
-    lea         eax, [eax+4*ebx]
-    SSSE3_Get16BSadHVDC [edi], [eax]
-    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
-    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
-    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
-    add         edi, 64
-    lea         eax, [eax+4*ebx]
-    SSSE3_Get16BSadHVDC [edi], [eax]
-    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
-    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
-    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
-    add         edi, 64
-    lea         eax, [eax+4*ebx]
-    SSSE3_Get16BSadHVDC [edi], [eax]
-    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
-    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
-    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
-
-    pslldq      xmm3,4
-    por         xmm3,xmm2
-    movhlps     xmm1,xmm3
-    paddw       xmm3,xmm1
-    movhlps     xmm0,xmm4
-    paddw       xmm4,xmm0
-; comparing order: DC H V
-	movd        ebx, xmm4 ;DC
-	movd        ecx, xmm3 ;V
-	psrldq      xmm3, 4
-	movd        esi, xmm3 ;H
-	mov         eax, [esp+36] ;lamda
-	shl         eax, 1
-	add         esi, eax
-	add         ebx, eax
-	mov         edx, [esp+32]
-	cmp         ebx, esi
-	jge near   not_dc_16x16_sad
-	cmp        ebx, ecx
-	jge near   not_dc_h_16x16_sad
-	; for DC mode
-	mov       dword[edx], 2;I16_PRED_DC
-	mov       eax, ebx
-    sub        edi, 192
-%assign x 0
-%rep 16
-    movdqa    [edi+16*x], xmm7
-%assign x x+1
-%endrep
-	jmp near return_sad_intra_16x16_x3
-not_dc_16x16_sad:
-	; for H mode
-	cmp       esi, ecx
-	jge near   not_dc_h_16x16_sad
-	mov       dword[edx], 1;I16_PRED_H
-	mov       eax, esi
-	jmp near return_sad_intra_16x16_x3
-not_dc_h_16x16_sad:
-	; for V mode
-	mov       dword[edx], 0;I16_PRED_V
-	mov       eax, ecx
-    sub       edi, 192
-%assign x 0
-%rep 16
-    movdqa    [edi+16*x], xmm5
-%assign x x+1
-%endrep
-return_sad_intra_16x16_x3:
-	pop    edi
-	pop    esi
-	pop    ebx
-	ret
-%endif
-;***********************************************************************
-;
-;Pixel_sad_intra_ssse3 END
-;
-;***********************************************************************
-;***********************************************************************
-;
-;Pixel_satd_wxh_sse41 BEGIN
-;
-;***********************************************************************
-
-;SSE4.1
-%macro SSE41_GetSatd8x4 0
-	movq             xmm0, [r0]
-	punpcklqdq       xmm0, xmm0
-	pmaddubsw        xmm0, xmm7
-	movq             xmm1, [r0+r1]
-	punpcklqdq       xmm1, xmm1
-	pmaddubsw        xmm1, xmm7
-	movq             xmm2, [r2]
-	punpcklqdq       xmm2, xmm2
-	pmaddubsw        xmm2, xmm7
-	movq             xmm3, [r2+r3]
-	punpcklqdq       xmm3, xmm3
-	pmaddubsw        xmm3, xmm7
-	psubsw           xmm0, xmm2
-	psubsw           xmm1, xmm3
-	movq             xmm2, [r0+2*r1]
-	punpcklqdq       xmm2, xmm2
-	pmaddubsw        xmm2, xmm7
-	movq             xmm3, [r0+r4]
-	punpcklqdq       xmm3, xmm3
-	pmaddubsw        xmm3, xmm7
-	movq             xmm4, [r2+2*r3]
-	punpcklqdq       xmm4, xmm4
-	pmaddubsw        xmm4, xmm7
-	movq             xmm5, [r2+r5]
-	punpcklqdq       xmm5, xmm5
-	pmaddubsw        xmm5, xmm7
-	psubsw           xmm2, xmm4
-	psubsw           xmm3, xmm5
-	SSE2_HDMTwo4x4   xmm0, xmm1, xmm2, xmm3, xmm4
-	pabsw            xmm0, xmm0
-	pabsw            xmm2, xmm2
-	pabsw            xmm1, xmm1
-	pabsw            xmm3, xmm3
-	movdqa           xmm4, xmm3
-	pblendw          xmm3, xmm1, 0xAA
-	pslld            xmm1, 16
-	psrld            xmm4, 16
-	por              xmm1, xmm4
-	pmaxuw           xmm1, xmm3
-	paddw            xmm6, xmm1
-	movdqa           xmm4, xmm0
-	pblendw          xmm0, xmm2, 0xAA
-	pslld            xmm2, 16
-	psrld            xmm4, 16
-	por              xmm2, xmm4
-	pmaxuw           xmm0, xmm2
-	paddw            xmm6, xmm0
-%endmacro
-
-%macro SSSE3_SumWHorizon 4 ;eax, srcSSE, tempSSE, tempSSE
-	MMX_DW_1_2REG    %3, %4
-	pmaddwd     %2, %3
-	movhlps     %4, %2
-	paddd       %2, %4
-	pshuflw     %4, %2,0Eh
-	paddd       %2, %4
-	movd		%1, %2
-%endmacro
-;***********************************************************************
-;
-;int32_t WelsSampleSatd4x4_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd4x4_sse41
-WelsSampleSatd4x4_sse41:
-	;push        ebx
-	;mov         eax,[esp+8]
-	;mov         ebx,[esp+12]
-	;mov         ecx,[esp+16]
-	;mov         edx,[esp+20]
-	
-	%assign  push_num 0
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d	
-	movdqa      xmm4,[HSwapSumSubDB1]
-	movd        xmm2,[r2]
-	movd        xmm5,[r2+r3]
-	shufps      xmm2,xmm5,0
-	movd        xmm3,[r2+r3*2]
-	lea         r2, [r3*2+r2]
-	movd        xmm5,[r2+r3]
-	shufps      xmm3,xmm5,0
-	movd        xmm0,[r0]
-	movd        xmm5,[r0+r1]
-	shufps      xmm0,xmm5,0
-	movd        xmm1,[r0+r1*2]
-	lea         r0, [r1*2+r0]
-	movd        xmm5,[r0+r1]
-	shufps      xmm1,xmm5,0
-	pmaddubsw   xmm0,xmm4
-	pmaddubsw   xmm1,xmm4
-	pmaddubsw   xmm2,xmm4
-	pmaddubsw   xmm3,xmm4
-	psubw       xmm0,xmm2
-	psubw       xmm1,xmm3
-	movdqa      xmm2,xmm0
-	paddw       xmm0,xmm1
-	psubw       xmm1,xmm2
-	movdqa      xmm2,xmm0
-	punpcklqdq  xmm0,xmm1
-	punpckhqdq  xmm2,xmm1
-	movdqa      xmm1,xmm0
-	paddw       xmm0,xmm2
-	psubw       xmm2,xmm1
-	movdqa      xmm1,xmm0
-	pblendw     xmm0,xmm2,0AAh
-	pslld       xmm2,16
-	psrld       xmm1,16
-	por         xmm2,xmm1
-	pabsw       xmm0,xmm0
-	pabsw       xmm2,xmm2
-	pmaxsw      xmm0,xmm2
-	SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7
-	LOAD_4_PARA_POP
-	ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd8x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd8x8_sse41
-align 16
-WelsSampleSatd8x8_sse41:
-	;push   ebx
-	;push   esi
-	;push   edi
-	;mov    eax,    [esp+16]
-	;mov    ebx,    [esp+20]
-	;mov    ecx,    [esp+24]
-	;mov    edx,    [esp+28]
-%ifdef X86_32	
-	push  r4
-	push  r5
-%endif	
-	%assign  push_num 2
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d	
-	movdqa      xmm7, [HSumSubDB1]
-	lea         r4,  [r1+r1*2]
-	lea         r5,  [r3+r3*2]
-	pxor		xmm6, xmm6
-	SSE41_GetSatd8x4
-	lea			r0,	 [r0+4*r1]
-	lea			r2,  [r2+4*r3]
-	SSE41_GetSatd8x4
-	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
-	LOAD_4_PARA_POP
-%ifdef X86_32
-	pop  r5
-	pop  r4
-%endif
-	ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd8x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd8x16_sse41
-align 16
-WelsSampleSatd8x16_sse41:
-	;push   ebx
-	;push   esi
-	;push   edi
-	;push   ebp
-	;%define pushsize   16
-	;mov    eax,    [esp+pushsize+4]
-	;mov    ebx,    [esp+pushsize+8]
-	;mov    ecx,    [esp+pushsize+12]
-	;mov    edx,    [esp+pushsize+16]
-%ifdef X86_32	
-	push  r4
-	push  r5
-	push  r6
-%endif	
-	%assign  push_num 3
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d	
-	movdqa      xmm7, [HSumSubDB1]
-	lea         r4,  [r1+r1*2]
-	lea         r5,  [r3+r3*2]
-	pxor        xmm6, xmm6
-	mov         r6,    0
-loop_get_satd_8x16:
-	SSE41_GetSatd8x4
-	lea			r0,  [r0+4*r1]
-	lea			r2,  [r2+4*r3]
-	inc         r6
-	cmp         r6,  4
-	jl          loop_get_satd_8x16
-	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
-	LOAD_4_PARA_POP
-%ifdef X86_32
-	pop  r6
-	pop  r5
-	pop  r4
-%endif
-	ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd16x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd16x8_sse41
-align 16
-WelsSampleSatd16x8_sse41:
-	;push   ebx
-	;push   esi
-	;push   edi
-	;mov    eax,    [esp+16]
-	;mov    ebx,    [esp+20]
-	;mov    ecx,    [esp+24]
-	;mov    edx,    [esp+28]
-%ifdef X86_32	
-	push  r4
-	push  r5
-%endif	
-	%assign  push_num 2
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d	
-	push  r0
-	push  r2
-	
-	movdqa      xmm7, [HSumSubDB1]
-	lea         r4,  [r1+r1*2]
-	lea         r5,  [r3+r3*2]
-	pxor		xmm6,   xmm6
-	SSE41_GetSatd8x4
-	lea			r0,  [r0+4*r1]
-	lea			r2,  [r2+4*r3]
-	SSE41_GetSatd8x4
-	
-	pop  r2
-	pop  r0
-	;mov			eax,    [esp+16]
-	;mov			ecx,    [esp+24]
-	add			r0,    8
-	add			r2,    8
-	SSE41_GetSatd8x4
-	lea			r0,  [r0+4*r1]
-	lea			r2,  [r2+4*r3]
-	SSE41_GetSatd8x4
-	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
-	LOAD_4_PARA_POP
-%ifdef X86_32
-	pop  r5
-	pop  r4
-%endif
-	ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd16x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
-;
-;***********************************************************************
-
-WELS_EXTERN WelsSampleSatd16x16_sse41
-align 16
-WelsSampleSatd16x16_sse41:
-	;push   ebx
-	;push   esi
-	;push   edi
-	;push   ebp
-	;%define pushsize   16
-	;mov    eax,    [esp+pushsize+4]
-	;mov    ebx,    [esp+pushsize+8]
-	;mov    ecx,    [esp+pushsize+12]
-	;mov    edx,    [esp+pushsize+16]
-%ifdef X86_32	
-	push  r4
-	push  r5
-	push  r6
-%endif	
-	%assign  push_num 3
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-	
-	push  r0
-	push  r2
-	
-	movdqa      xmm7, [HSumSubDB1]
-	lea         r4,  [r1+r1*2]
-	lea         r5,  [r3+r3*2]
-	pxor		xmm6,   xmm6
-	mov         r6,    0
-loop_get_satd_16x16_left:
-	SSE41_GetSatd8x4
-	lea			r0,  [r0+4*r1]
-	lea			r2,  [r2+4*r3]
-	inc         r6
-	cmp         r6,  4
-	jl          loop_get_satd_16x16_left
-
-	pop  r2
-	pop  r0	
-	;mov			eax,    [esp+pushsize+4]
-	;mov			ecx,    [esp+pushsize+12]
-	add			r0,    8
-	add			r2,    8
-	mov         r6,    0
-loop_get_satd_16x16_right:
-	SSE41_GetSatd8x4
-	lea			r0,  [r0+4*r1]
-	lea			r2,  [r2+4*r3]
-	inc         r6
-	cmp         r6,  4
-	jl          loop_get_satd_16x16_right
-	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
-	;%undef pushsize
-	LOAD_4_PARA_POP
-%ifdef X86_32
-	pop  r6
-	pop  r5
-	pop  r4
-%endif
-	ret
-
-;***********************************************************************
-;
-;Pixel_satd_wxh_sse41 END
-;
-;***********************************************************************
-
-;***********************************************************************
-;
-;Pixel_sad_wxh_sse2 BEGIN
-;
-;***********************************************************************
-
-%macro SSE2_GetSad2x16 0
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqu xmm1,   [r2]
-	MOVDQ  xmm2,   [r0];[r0] must be 16-byte aligned
-	psadbw xmm1,   xmm2
-	paddw  xmm0,   xmm1
-	movdqu xmm1,   [r2+r3]
-	MOVDQ  xmm2,   [r0+r1]
-	psadbw xmm1,   xmm2
-	paddw  xmm0,   xmm1
-%endmacro
-
-
-%macro SSE2_GetSad4x16 0
-	movdqu xmm0,   [r2]
-	MOVDQ  xmm2,   [r0]
-	psadbw xmm0,   xmm2
-	paddw  xmm7,   xmm0
-	movdqu xmm1,   [r2+r3]
-	MOVDQ  xmm2,   [r0+r1]
-	psadbw xmm1,   xmm2
-	paddw  xmm7,   xmm1
-	movdqu xmm1,   [r2+2*r3]
-	MOVDQ  xmm2,   [r0+2*r1];[r0] must be 16-byte aligned
-	psadbw xmm1,   xmm2
-	paddw  xmm7,   xmm1
-	movdqu xmm1,   [r2+r5]
-	MOVDQ  xmm2,   [r0+r4]
-	psadbw xmm1,   xmm2
-	paddw  xmm7,   xmm1
-%endmacro
-
-
-%macro SSE2_GetSad8x4 0
-	movq   xmm0,   [r0]
-	movq   xmm1,   [r0+r1]
-	lea    r0,     [r0+2*r1]
-	movhps xmm0,   [r0]
-	movhps xmm1,   [r0+r1]
-
-	movq   xmm2,   [r2]
-	movq   xmm3,   [r2+r3]
-	lea    r2,     [r2+2*r3]
-	movhps xmm2,   [r2]
-	movhps xmm3,   [r2+r3]
-	psadbw xmm0,   xmm2
-	psadbw xmm1,   xmm3
-	paddw  xmm6,   xmm0
-	paddw  xmm6,   xmm1
-%endmacro
-
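-; The three macros above rely on psadbw, which yields one 16-bit sum of
-; absolute byte differences per 64-bit half of the register; rows are
-; accumulated with paddw and the two halves are only folded at the end of
-; each function (movhlps + paddw). A scalar sketch of what a WxH SAD
-; computes (plain C, illustrative only, not part of this file):
-;
-;   #include <stdint.h>
-;   #include <stdlib.h>
-;   static int32_t SampleSadRef (const uint8_t* pSrc, int32_t iSrcStride,
-;                                const uint8_t* pRef, int32_t iRefStride,
-;                                int32_t iW, int32_t iH) {
-;       int32_t iSad = 0;
-;       for (int32_t y = 0; y < iH; y++)
-;           for (int32_t x = 0; x < iW; x++)
-;               iSad += abs (pSrc[y * iSrcStride + x] - pRef[y * iRefStride + x]);
-;       return iSad;
-;   }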
-;***********************************************************************
-;
-;int32_t WelsSampleSad16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t )
-;The first parameter is assumed to be 16-byte aligned;
-;in wels, the third parameter cannot be guaranteed 16-byte aligned.
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSad16x16_sse2
-align 16
-WelsSampleSad16x16_sse2:
-	;push ebx
-	;push edi
-	;push esi
-	;%define _STACK_SIZE		12
-	;mov eax, [esp+_STACK_SIZE+4 ]
-	;mov	ebx, [esp+_STACK_SIZE+8 ]	
-	;mov ecx, [esp+_STACK_SIZE+12]
-	;mov edx, [esp+_STACK_SIZE+16]
-%ifdef X86_32
-	push  r4
-	push  r5
-%endif	
-
-	%assign  push_num 2
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-	lea r4, [3*r1]
-	lea r5, [3*r3]
-
-	pxor   xmm7,   xmm7
-	SSE2_GetSad4x16
-	lea	   r0,  [r0+4*r1]
-	lea	   r2,  [r2+4*r3]
-	SSE2_GetSad4x16
-	lea	   r0,  [r0+4*r1]
-	lea	   r2,  [r2+4*r3]
-	SSE2_GetSad4x16
-	lea	   r0,  [r0+4*r1]
-	lea	   r2,  [r2+4*r3]
-	SSE2_GetSad4x16
-	movhlps xmm0, xmm7
-	paddw xmm0, xmm7
-	movd retrd, xmm0
-	LOAD_4_PARA_POP
-%ifdef X86_32
-	pop  r5
-	pop  r4
-%endif
-	ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSad16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t )
-;The first parameter is assumed to be 16-byte aligned;
-;in wels, the third parameter cannot be guaranteed 16-byte aligned.
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSad16x8_sse2
-align 16
-WelsSampleSad16x8_sse2:
-	;push   ebx
-	;mov    eax,    [esp+8]
-	;mov    ebx,    [esp+12]
-	;mov    ecx,    [esp+16]
-	;mov    edx,    [esp+20]
-	
-	%assign  push_num 0
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d	
-	movdqu xmm0,   [r2]
-	MOVDQ  xmm2,   [r0]
-	psadbw xmm0,   xmm2
-	movdqu xmm1,   [r2+r3]
-	MOVDQ  xmm2,   [r0+r1]
-	psadbw xmm1,   xmm2
-	paddw  xmm0,   xmm1
-
-	SSE2_GetSad2x16
-	SSE2_GetSad2x16
-	SSE2_GetSad2x16
-
-	movhlps     xmm1, xmm0
-	paddw       xmm0, xmm1
-	movd        retrd,  xmm0
-	LOAD_4_PARA_POP
-	ret
-
-
-
-WELS_EXTERN WelsSampleSad8x16_sse2
-WelsSampleSad8x16_sse2:
-	;push   ebx
-	;mov    eax,    [esp+8]
-	;mov    ebx,    [esp+12]
-	;mov    ecx,    [esp+16]
-	;mov    edx,    [esp+20]
-	
-	%assign  push_num 0
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d
-    pxor   xmm6,   xmm6
-
-	SSE2_GetSad8x4
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-    SSE2_GetSad8x4
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	SSE2_GetSad8x4
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-    SSE2_GetSad8x4
-
-    movhlps    xmm0, xmm6
-	paddw      xmm0, xmm6
-	movd       retrd,  xmm0
-	LOAD_4_PARA_POP
-	ret
-
-
-%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
-and    %1,  0x1f|(%3>>1)
-cmp    %1,  (32-%2)|(%3>>1)
-%endmacro
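-; With cacheline %3 = 64 this reduces to comparing (addr & 63) against
-; (64 - width): roughly the C test ((addr & (cacheline-1)) > cacheline - width),
-; true exactly when a width-byte load at addr would straddle a cacheline
-; boundary (the odd 0x1f|(%3>>1) form also builds the right mask for a
-; 32-byte cacheline). In the split path below, the reference pointer is
-; rounded down to an 8-byte boundary and each misaligned row is rebuilt as
-; (lo >> 8*mis) | (hi << 8*(8-mis)) from two in-line loads, so no single
-; load straddles the line.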
-
-WELS_EXTERN WelsSampleSad8x8_sse21
-WelsSampleSad8x8_sse21:
-    ;mov    ecx,    [esp+12]
-	;mov    edx,    ecx
-    ;CACHE_SPLIT_CHECK edx, 8, 64
-	;jle    near   .pixel_sad_8x8_nsplit
-	;push   ebx
-	;push   edi
-	;mov    eax,    [esp+12]
-	;mov    ebx,    [esp+16]
-	
-	%assign  push_num 0
-	mov		r2,  arg3
-	push	r2
-	CACHE_SPLIT_CHECK r2, 8, 64
-	jle    near   .pixel_sad_8x8_nsplit
-	pop		r2
-%ifdef X86_32	
-	push	r3
-	push	r4
-	push	r5
-%endif
-	%assign  push_num 3
-	mov		r0,  arg1
-	mov		r1,  arg2	
-	SIGN_EXTENTION r1, r1d
-    pxor   xmm7,   xmm7
-    
-    ;ecx r2, edx r4, edi r5
-
-    mov    r5,    r2
-    and    r5,    0x07
-    sub    r2,    r5
-    mov    r4,    8
-    sub    r4,    r5
-
-    shl    r5,    3
-    shl    r4,    3
-    movd   xmm5,   r5d
-    movd   xmm6,   r4d
-	mov    r5,    8
-	add    r5,    r2
-    mov    r3,    arg4
-	SIGN_EXTENTION r3, r3d
-    movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
-
-	movq   xmm1,   [r2]
-	movq   xmm2,   [r5]
-	movhps xmm1,   [r2+r3]
-	movhps xmm2,   [r5+r3]
-	psrlq  xmm1,   xmm5
-	psllq  xmm2,   xmm6
-	por    xmm1,   xmm2
-
-	psadbw xmm0,   xmm1
-	paddw  xmm7,   xmm0
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	lea    r5,    [r5+2*r3]
-
-    movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
-
-	movq   xmm1,   [r2]
-	movq   xmm2,   [r5]
-	movhps xmm1,   [r2+r3]
-	movhps xmm2,   [r5+r3]
-	psrlq  xmm1,   xmm5
-	psllq  xmm2,   xmm6
-	por    xmm1,   xmm2
-
-	psadbw xmm0,   xmm1
-	paddw  xmm7,   xmm0
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	lea    r5,    [r5+2*r3]
-
-    movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
-
-	movq   xmm1,   [r2]
-	movq   xmm2,   [r5]
-	movhps xmm1,   [r2+r3]
-	movhps xmm2,   [r5+r3]
-	psrlq  xmm1,   xmm5
-	psllq  xmm2,   xmm6
-	por    xmm1,   xmm2
-
-	psadbw xmm0,   xmm1
-	paddw  xmm7,   xmm0
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	lea    r5,    [r5+2*r3]
-
-    movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
-
-	movq   xmm1,   [r2]
-	movq   xmm2,   [r5]
-	movhps xmm1,   [r2+r3]
-	movhps xmm2,   [r5+r3]
-	psrlq  xmm1,   xmm5
-	psllq  xmm2,   xmm6
-	por    xmm1,   xmm2
-
-	psadbw xmm0,   xmm1
-	paddw  xmm7,   xmm0
-
-    movhlps    xmm0, xmm7
-	paddw      xmm0, xmm7
-	movd       retrd,  xmm0
-%ifdef X86_32
-	pop	 r5
-	pop	 r4
-	pop	 r3
-%endif
-	jmp        .return
-	
-.pixel_sad_8x8_nsplit:
-    ;push   ebx
-    ;mov    eax,    [esp+8]
-	;mov    ebx,    [esp+12]
-	;mov    edx,    [esp+20]
-	
-	pop r2
-	%assign  push_num 0
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d	
-	pxor   xmm6,   xmm6
-	SSE2_GetSad8x4
-    lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-    SSE2_GetSad8x4
-    movhlps    xmm0, xmm6
-	paddw      xmm0, xmm6
-	movd       retrd,  xmm0
-	LOAD_4_PARA_POP
-.return:
-	ret
-
-
-;***********************************************************************
-;
-;Pixel_sad_wxh_sse2 END
-;
-;***********************************************************************
-
-
-;***********************************************************************
-;
-;Pixel_sad_4_wxh_sse2 BEGIN
-;
-;***********************************************************************
-
-
-%macro SSE2_Get4LW16Sad 5 ;s-1l, s, s+1l, d, address
-	psadbw %1,   %4
-	paddw  xmm5, %1
-	psadbw %4,   %3
-	paddw  xmm4, %4
-	movdqu %4,   [%5-1]
-	psadbw %4,   %2
-	paddw  xmm6, %4
-	movdqu %4,   [%5+1]
-	psadbw %4,   %2
-	paddw  xmm7, %4
-%endmacro
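-; Per reference row d loaded at %5, this macro folds all four one-pel
-; neighbours of the reference into the running sums: d is SAD'ed against the
-; src row above (xmm5, pRefMb+stride) and the src row below (xmm4,
-; pRefMb-stride), and the unaligned loads at %5-1/%5+1 are SAD'ed against the
-; centre src row (xmm6/xmm7). Roughly, per row i (illustrative C bookkeeping):
-;   sad[+stride] += SAD16(src[i-1], ref[i]);  sad[-stride] += SAD16(src[i+1], ref[i]);
-;   sad[-1]      += SAD16(src[i], ref[i]-1);  sad[+1]      += SAD16(src[i], ref[i]+1);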
-WELS_EXTERN WelsSampleSadFour16x16_sse2
-WelsSampleSadFour16x16_sse2:
-	;push ebx
-	;mov    eax,    [esp+8]
-	;mov    ebx,    [esp+12]
-	;mov    ecx,    [esp+16]
-	;mov    edx,    [esp+20]
-	
-	%assign  push_num 0
-	LOAD_5_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d		
-	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
-	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
-	pxor   xmm6,   xmm6    ;sad pRefMb-1
-	pxor   xmm7,   xmm7    ;sad pRefMb+1
-	movdqa xmm0,   [r0]
-	sub    r2,    r3
-	movdqu xmm3,   [r2]
-	psadbw xmm3,   xmm0
-	paddw  xmm4,   xmm3
-
-	movdqa xmm1,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	psadbw xmm3,   xmm1
-	paddw  xmm4,   xmm3
-
-	movdqu xmm2,   [r2+r3-1]
-	psadbw xmm2,   xmm0
-	paddw  xmm6,   xmm2
-
-	movdqu xmm3,   [r2+r3+1]
-	psadbw xmm3,   xmm0
-	paddw  xmm7,   xmm3
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm2,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
-	movdqa xmm0,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm1,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
-	movdqa xmm2,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm0,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
-	movdqa xmm1,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm2,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
-	movdqa xmm0,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm1,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
-	movdqa xmm2,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm0,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
-	movdqa xmm1,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm2,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
-	movdqa xmm0,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
-	lea    r2,    [r2+2*r3]
-	movdqu xmm3,   [r2]
-	psadbw xmm2,   xmm3
-	paddw xmm5,   xmm2
-
-	movdqu xmm2,   [r2-1]
-	psadbw xmm2,   xmm0
-	paddw xmm6,   xmm2
-
-	movdqu xmm3,   [r2+1]
-	psadbw xmm3,   xmm0
-	paddw xmm7,   xmm3
-
-	movdqu xmm3,   [r2+r3]
-	psadbw xmm0,   xmm3
-	paddw xmm5,   xmm0
-
-	;mov        ecx,  [esp+24]
-	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0
-	movhlps    xmm0, xmm5
-	paddw      xmm5, xmm0
-	movhlps    xmm0, xmm6
-	paddw      xmm6, xmm0
-	movhlps    xmm0, xmm7
-	paddw      xmm7, xmm0
-	punpckldq  xmm4, xmm5
-	punpckldq  xmm6, xmm7
-	punpcklqdq xmm4, xmm6
-	movdqa     [r4],xmm4
-	LOAD_5_PARA_POP
-	ret
-
-
-WELS_EXTERN WelsSampleSadFour16x8_sse2
-WelsSampleSadFour16x8_sse2:
-	;push ebx
-	;push edi
-	;mov    eax,    [esp+12]
-	;mov    ebx,    [esp+16]
-	;mov    edi,    [esp+20]
-	;mov    edx,    [esp+24]
-	
-	%assign  push_num 0
-	LOAD_5_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d		
-	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
-	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
-	pxor   xmm6,   xmm6    ;sad pRefMb-1
-	pxor   xmm7,   xmm7    ;sad pRefMb+1
-	movdqa xmm0,   [r0]
-	sub    r2,    r3
-	movdqu xmm3,   [r2]
-	psadbw xmm3,   xmm0
-	paddw xmm4,   xmm3
-
-	movdqa xmm1,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	psadbw xmm3,   xmm1
-	paddw xmm4,   xmm3
-
-	movdqu xmm2,   [r2+r3-1]
-	psadbw xmm2,   xmm0
-	paddw xmm6,   xmm2
-
-	movdqu xmm3,   [r2+r3+1]
-	psadbw xmm3,   xmm0
-	paddw xmm7,   xmm3
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm2,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
-	movdqa xmm0,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm1,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
-	movdqa xmm2,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm0,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
-	movdqa xmm1,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
-	lea    r2,    [r2+2*r3]
-	movdqu xmm3,   [r2]
-	psadbw xmm0,   xmm3
-	paddw xmm5,   xmm0
-
-	movdqu xmm0,   [r2-1]
-	psadbw xmm0,   xmm1
-	paddw xmm6,   xmm0
-
-	movdqu xmm3,   [r2+1]
-	psadbw xmm3,   xmm1
-	paddw xmm7,   xmm3
-
-	movdqu xmm3,   [r2+r3]
-	psadbw xmm1,   xmm3
-	paddw xmm5,   xmm1
-
-	;mov        edi,  [esp+28]
-	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0
-	movhlps    xmm0, xmm5
-	paddw      xmm5, xmm0
-	movhlps    xmm0, xmm6
-	paddw      xmm6, xmm0
-	movhlps    xmm0, xmm7
-	paddw      xmm7, xmm0
-	punpckldq  xmm4, xmm5
-	punpckldq  xmm6, xmm7
-	punpcklqdq xmm4, xmm6
-	movdqa     [r4],xmm4
-	LOAD_5_PARA_POP
-	ret
-
-WELS_EXTERN WelsSampleSadFour8x16_sse2
-WelsSampleSadFour8x16_sse2:
-	;push ebx
-	;push edi
-	;mov    eax,    [esp+12]
-	;mov    ebx,    [esp+16]
-	;mov    edi,    [esp+20]
-	;mov    edx,    [esp+24]
-	
-	%assign  push_num 0
-	LOAD_5_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d		
-	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
-	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
-	pxor   xmm6,   xmm6    ;sad pRefMb-1
-	pxor   xmm7,   xmm7    ;sad pRefMb+1
-	movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
-	sub    r2,    r3
-	movq   xmm3,   [r2]
-	movhps xmm3,   [r2+r3]
-	psadbw xmm3,   xmm0
-	paddw  xmm4,   xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	;mov        edi,  [esp+28]
-	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0
-	movhlps    xmm0, xmm5
-	paddw      xmm5, xmm0
-	movhlps    xmm0, xmm6
-	paddw      xmm6, xmm0
-	movhlps    xmm0, xmm7
-	paddw      xmm7, xmm0
-	punpckldq  xmm4, xmm5
-	punpckldq  xmm6, xmm7
-	punpcklqdq xmm4, xmm6
-	movdqa     [r4],xmm4
-	LOAD_5_PARA_POP
-	ret
-
-
-WELS_EXTERN WelsSampleSadFour8x8_sse2
-WelsSampleSadFour8x8_sse2:
-	;push ebx
-	;push edi
-	;mov    eax,    [esp+12]
-	;mov    ebx,    [esp+16]
-	;mov    edi,    [esp+20]
-	;mov    edx,    [esp+24]
-	
-	%assign  push_num 0
-	LOAD_5_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d		
-	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
-	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
-	pxor   xmm6,   xmm6    ;sad pRefMb-1
-	pxor   xmm7,   xmm7    ;sad pRefMb+1
-	movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
-	sub    r2,    r3
-	movq   xmm3,   [r2]
-	movhps xmm3,   [r2+r3]
-	psadbw xmm3,   xmm0
-	paddw  xmm4,   xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	;mov        edi,  [esp+28]
-	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0
-	movhlps    xmm0, xmm5
-	paddw      xmm5, xmm0
-	movhlps    xmm0, xmm6
-	paddw      xmm6, xmm0
-	movhlps    xmm0, xmm7
-	paddw      xmm7, xmm0
-	punpckldq  xmm4, xmm5
-	punpckldq  xmm6, xmm7
-	punpcklqdq xmm4, xmm6
-	movdqa     [r4],xmm4
-	LOAD_5_PARA_POP
-	ret
-
-WELS_EXTERN WelsSampleSadFour4x4_sse2
-WelsSampleSadFour4x4_sse2:
-	;push ebx
-	;push edi
-	;mov    eax,    [esp+12]
-	;mov    ebx,    [esp+16]
-	;mov    edi,    [esp+20]
-	;mov    edx,    [esp+24]
-	
-	%assign  push_num 0
-	LOAD_5_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d		
-	movd   xmm0,   [r0]
-	movd   xmm1,   [r0+r1]
-	lea        r0,    [r0+2*r1]
-	movd       xmm2,   [r0]
-	movd       xmm3,   [r0+r1]
-	punpckldq  xmm0, xmm1
-	punpckldq  xmm2, xmm3
-	punpcklqdq xmm0, xmm2
-	sub        r2,  r3
-	movd       xmm1, [r2]
-	movd       xmm2, [r2+r3]
-	punpckldq  xmm1, xmm2
-	movd       xmm2, [r2+r3-1]
-	movd       xmm3, [r2+r3+1]
-
-	lea        r2,  [r2+2*r3]
-
-	movd       xmm4, [r2]
-	movd       xmm5, [r2-1]
-	punpckldq  xmm2, xmm5
-	movd       xmm5, [r2+1]
-	punpckldq  xmm3, xmm5
-
-	movd       xmm5, [r2+r3]
-	punpckldq  xmm4, xmm5
-
-	punpcklqdq xmm1, xmm4 ;-L
-
-	movd       xmm5, [r2+r3-1]
-	movd       xmm6, [r2+r3+1]
-
-	lea        r2,  [r2+2*r3]
-	movd       xmm7, [r2-1]
-	punpckldq  xmm5, xmm7
-	punpcklqdq xmm2, xmm5 ;-1
-	movd       xmm7, [r2+1]
-	punpckldq  xmm6, xmm7
-	punpcklqdq xmm3, xmm6 ;+1
-	movd       xmm6, [r2]
-	movd       xmm7, [r2+r3]
-	punpckldq  xmm6, xmm7
-	punpcklqdq xmm4, xmm6 ;+L
-	psadbw     xmm1, xmm0
-	psadbw     xmm2, xmm0
-	psadbw     xmm3, xmm0
-	psadbw     xmm4, xmm0
-
-	movhlps    xmm0, xmm1
-	paddw      xmm1, xmm0
-	movhlps    xmm0, xmm2
-	paddw      xmm2, xmm0
-	movhlps    xmm0, xmm3
-	paddw      xmm3, xmm0
-	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0
-	;mov        edi,  [esp+28]
-	punpckldq  xmm1, xmm4
-	punpckldq  xmm2, xmm3
-	punpcklqdq xmm1, xmm2
-	movdqa     [r4],xmm1
-	LOAD_5_PARA_POP
-	ret
-
-;***********************************************************************
-;
-;Pixel_sad_4_wxh_sse2 END
-;
-;***********************************************************************
-
-WELS_EXTERN WelsSampleSad4x4_mmx
-
-align 16
-;***********************************************************************
-;   int32_t __cdecl WelsSampleSad4x4_mmx (uint8_t *, int32_t, uint8_t *, int32_t )
-;***********************************************************************
-WelsSampleSad4x4_mmx:
-    ;push    ebx
-	;%define pushsize     4
-	;%define pix1address	 esp+pushsize+4
-	;%define pix1stride   esp+pushsize+8
-	;%define pix2address  esp+pushsize+12
-	;%define pix2stride   esp+pushsize+16
-    ;mov		  eax, [pix1address]
-    ;mov		  ebx, [pix1stride ]
-    ;mov		  ecx, [pix2address]
-    ;mov		  edx, [pix2stride ]
-    
-    %assign  push_num 0
-	LOAD_4_PARA
-	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d	
-	movd	  mm0, [r0]
-	movd	  mm1, [r0+r1]
-	punpckldq mm0, mm1
-
-	movd      mm3, [r2]
-	movd      mm4, [r2+r3]
-	punpckldq mm3, mm4
-	psadbw    mm0, mm3
-
-	lea       r0, [r0+2*r1]
-	lea       r2, [r2+2*r3]
-
-	movd      mm1, [r0]
-	movd      mm2, [r0+r1]
-	punpckldq mm1, mm2
-
-	movd      mm3, [r2]
-	movd      mm4, [r2+r3]
-	punpckldq mm3, mm4
-	psadbw    mm1, mm3
-	paddw     mm0, mm1
-
-    movd      retrd, mm0
-
-	WELSEMMS
-    LOAD_4_PARA_POP
-    ret
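-; The 4x4 case packs two rows per MMX register with punpckldq, so each psadbw
-; prices eight pixels at once; two such rounds cover the block and the two
-; partial sums are folded with paddw before the movd to the return register.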
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  satd_sad.asm
+;*
+;*  Abstract
+;*      WelsSampleSatd4x4_sse2
+;*      WelsSampleSatd8x8_sse2
+;*      WelsSampleSatd16x8_sse2
+;*      WelsSampleSatd8x16_sse2
+;*      WelsSampleSatd16x16_sse2
+;*
+;*      WelsSampleSad16x8_sse2
+;*      WelsSampleSad16x16_sse2
+;*
+;*  History
+;*      8/5/2009 Created
+;*     24/9/2009 modified
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+;***********************************************************************
+; Data
+;***********************************************************************
+SECTION .rodata align=16
+
+align 16
+HSumSubDB1:   db 1,1,1,1,1,1,1,1,1,-1,1,-1,1,-1,1,-1
+align 16
+HSumSubDW1:   dw 1,-1,1,-1,1,-1,1,-1
+align 16
+PDW1:  dw 1,1,1,1,1,1,1,1
+align 16
+PDQ2:  dw 2,0,0,0,2,0,0,0
+align 16
+HSwapSumSubDB1:   times 2 db 1, 1, 1, 1, 1, -1, 1, -1
+
+;***********************************************************************
+; Code
+;***********************************************************************
+SECTION .text
+
+;***********************************************************************
+;
+;Pixel_satd_wxh_sse2 BEGIN
+;
+;***********************************************************************
+%macro MMX_DW_1_2REG 2
+      pxor %1, %1
+      pcmpeqw %2, %2
+      psubw %1, %2
+%endmacro
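+; Constant-generation idiom: pcmpeqw sets every word of %2 to -1, and
+; subtracting that from the zeroed %1 leaves each word of %1 equal to 1,
+; i.e. a register of dw 1s built without a memory load.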
+
+%macro  SSE2_SumWHorizon1 2
+	movdqa      %2, %1
+	psrldq      %2, 8
+	paddusw     %1, %2
+	movdqa      %2, %1
+	psrldq      %2, 4
+	paddusw     %1, %2
+	movdqa      %2, %1
+	psrldq      %2, 2
+	paddusw     %1, %2
+%endmacro
+
+%macro SSE2_HDMTwo4x4 5 ;in: xmm1,xmm2,xmm3,xmm4  pOut: xmm4,xmm2,xmm1,xmm3
+   SSE2_SumSub %1, %2, %5
+   SSE2_SumSub %3, %4, %5
+   SSE2_SumSub %2, %4, %5
+   SSE2_SumSub %1, %3, %5
+%endmacro
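+; Four sum/sub passes implement the two stages of a 4-point Hadamard
+; butterfly across the four input registers (assuming SSE2_SumSub leaves the
+; sum and the difference of its operands in place, as the callers here
+; require); up to output sign and ordering this is, per lane (illustrative C):
+;   t0 = a + b; t1 = a - b; t2 = c + d; t3 = c - d;
+;   out = { t0 + t2, t0 - t2, t1 + t3, t1 - t3 };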
+
+%macro SSE2_SumAbs4 7
+	WELS_AbsW %1, %3
+	WELS_AbsW %2, %3
+	WELS_AbsW %4, %6
+	WELS_AbsW %5, %6
+	paddusw       %1, %2
+	paddusw       %4, %5
+	paddusw       %7, %1
+	paddusw       %7, %4
+%endmacro
+
+%macro  SSE2_SumWHorizon 3
+	movhlps		%2, %1			; x2 = xx xx xx xx d7 d6 d5 d4
+	paddw		%1, %2			; x1 = xx xx xx xx d37 d26 d15 d04
+	punpcklwd	%1, %3			; x1 =  d37  d26 d15 d04
+	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26
+	paddd		%1, %2			; x1 = xxxx xxxx d1357 d0246
+	pshuflw		%2, %1, 0x4e	; x2 = xxxx xxxx d0246 d1357
+	paddd		%1, %2			; x1 = xxxx xxxx xxxx  d01234567
+%endmacro
+
+%macro SSE2_GetSatd8x8 0
+	SSE2_LoadDiff8P    xmm0,xmm4,xmm7,[r0],[r2]
+	SSE2_LoadDiff8P    xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
+	lea                 r0, [r0+2*r1]
+	lea                 r2, [r2+2*r3]
+	SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[r0],[r2]
+	SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
+
+	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm4
+	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm4
+	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5
+	SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
+
+	lea					r0,    [r0+2*r1]
+    lea					r2,    [r2+2*r3]
+	SSE2_LoadDiff8P    xmm0,xmm4,xmm7,[r0],[r2]
+	SSE2_LoadDiff8P    xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
+	lea                 r0, [r0+2*r1]
+	lea                 r2, [r2+2*r3]
+	SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[r0],[r2]
+	SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
+
+	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm4
+	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm4
+	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5
+	SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
+%endmacro
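+; Net effect: the 8x8 residual is transformed as two 8x4 halves (4x4 Hadamard
+; on each half), and the absolute transform coefficients are accumulated as
+; unsigned words in xmm6; callers halve that total (psrlw/shr by 1) to reach
+; the conventional SATD scale.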
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd4x4_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd4x4_sse2
+align 16
+WelsSampleSatd4x4_sse2:
+	;push      ebx
+	;mov       eax,  [esp+8]
+	;mov       ebx,  [esp+12]
+	;mov       ecx,  [esp+16]
+	;mov       edx,  [esp+20]
+
+	%assign  push_num 0
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+    movd      xmm0, [r0]
+    movd      xmm1, [r0+r1]
+    lea       r0 , [r0+2*r1]
+    movd      xmm2, [r0]
+    movd      xmm3, [r0+r1]
+    punpckldq xmm0, xmm2
+    punpckldq xmm1, xmm3
+
+    movd      xmm4, [r2]
+    movd      xmm5, [r2+r3]
+    lea       r2 , [r2+2*r3]
+    movd      xmm6, [r2]
+    movd      xmm7, [r2+r3]
+    punpckldq xmm4, xmm6
+    punpckldq xmm5, xmm7
+
+    pxor      xmm6, xmm6
+    punpcklbw xmm0, xmm6
+    punpcklbw xmm1, xmm6
+    punpcklbw xmm4, xmm6
+    punpcklbw xmm5, xmm6
+
+    psubw     xmm0, xmm4
+    psubw     xmm1, xmm5
+
+    movdqa    xmm2, xmm0
+    paddw     xmm0, xmm1
+    psubw     xmm2, xmm1
+    SSE2_XSawp qdq, xmm0, xmm2, xmm3
+
+    movdqa     xmm4, xmm0
+    paddw      xmm0, xmm3
+    psubw      xmm4, xmm3
+
+    movdqa         xmm2, xmm0
+    punpcklwd      xmm0, xmm4
+    punpckhwd      xmm4, xmm2
+
+	SSE2_XSawp     dq,  xmm0, xmm4, xmm3
+	SSE2_XSawp     qdq, xmm0, xmm3, xmm5
+
+    movdqa         xmm7, xmm0
+    paddw          xmm0, xmm5
+    psubw          xmm7, xmm5
+
+	SSE2_XSawp     qdq,  xmm0, xmm7, xmm1
+
+    movdqa         xmm2, xmm0
+    paddw          xmm0, xmm1
+    psubw          xmm2, xmm1
+
+    WELS_AbsW  xmm0, xmm3
+    paddusw        xmm6, xmm0
+	WELS_AbsW  xmm2, xmm4
+    paddusw        xmm6, xmm2
+    SSE2_SumWHorizon1  xmm6, xmm4
+	movd           retrd,  xmm6
+    and            retrd,  0xffff
+    shr            retrd,  1
+	LOAD_4_PARA_POP
+	ret
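+; A scalar reference for what the 4x4 kernel above computes, kept here as
+; documentation only (plain C sketch, not part of the build; the SIMD code
+; may differ in coefficient sign/order, which does not change the sum of
+; magnitudes):
+;
+;   #include <stdint.h>
+;   #include <stdlib.h>
+;   static int32_t SampleSatd4x4Ref (const uint8_t* p1, int32_t s1,
+;                                    const uint8_t* p2, int32_t s2) {
+;       int16_t d[4][4]; int32_t m[4][4], iSatd = 0;
+;       for (int i = 0; i < 4; i++)             // residual
+;           for (int j = 0; j < 4; j++)
+;               d[i][j] = (int16_t) (p1[i * s1 + j] - p2[i * s2 + j]);
+;       for (int i = 0; i < 4; i++) {           // horizontal 4-point Hadamard
+;           int32_t a = d[i][0] + d[i][2], b = d[i][0] - d[i][2];
+;           int32_t c = d[i][1] + d[i][3], e = d[i][1] - d[i][3];
+;           m[i][0] = a + c; m[i][1] = b + e; m[i][2] = a - c; m[i][3] = b - e;
+;       }
+;       for (int j = 0; j < 4; j++) {           // vertical pass + |.| sum
+;           int32_t a = m[0][j] + m[2][j], b = m[0][j] - m[2][j];
+;           int32_t c = m[1][j] + m[3][j], e = m[1][j] - m[3][j];
+;           iSatd += abs (a + c) + abs (b + e) + abs (a - c) + abs (b - e);
+;       }
+;       return iSatd >> 1;                      // matches the final 'shr retrd, 1'
+;   }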
+
+ ;***********************************************************************
+ ;
+ ;int32_t WelsSampleSatd8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+ ;
+ ;***********************************************************************
+ WELS_EXTERN WelsSampleSatd8x8_sse2
+align 16
+ WelsSampleSatd8x8_sse2:
+	 ;push   ebx
+	 ;mov    eax,    [esp+8]
+	 ;mov    ebx,    [esp+12]
+	 ;mov    ecx,    [esp+16]
+	 ;mov    edx,    [esp+20]
+
+	%assign  push_num 0
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	pxor   xmm6,   xmm6
+    pxor   xmm7,   xmm7
+    SSE2_GetSatd8x8
+    psrlw   xmm6,  1
+	SSE2_SumWHorizon   xmm6,xmm4,xmm7
+	movd    retrd,   xmm6
+	LOAD_4_PARA_POP
+	ret
+
+ ;***********************************************************************
+ ;
+ ;int32_t WelsSampleSatd8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+ ;
+ ;***********************************************************************
+ WELS_EXTERN WelsSampleSatd8x16_sse2
+align 16
+ WelsSampleSatd8x16_sse2:
+	 ;push   ebx
+	 ;mov    eax,    [esp+8]
+	 ;mov    ebx,    [esp+12]
+	 ;mov    ecx,    [esp+16]
+	 ;mov    edx,    [esp+20]
+
+	 %assign  push_num 0
+	 LOAD_4_PARA
+	 SIGN_EXTENTION r1, r1d
+	 SIGN_EXTENTION r3, r3d
+	 pxor   xmm6,   xmm6
+     pxor   xmm7,   xmm7
+
+	 SSE2_GetSatd8x8
+     lea    r0,    [r0+2*r1]
+     lea    r2,    [r2+2*r3]
+	 SSE2_GetSatd8x8
+
+	 psrlw   xmm6,  1
+	 SSE2_SumWHorizon   xmm6,xmm4,xmm7
+	 movd    retrd,   xmm6
+	 LOAD_4_PARA_POP
+	 ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd16x8_sse2
+align 16
+WelsSampleSatd16x8_sse2:
+	;push   ebx
+	;mov    eax,    [esp+8]
+	;mov    ebx,    [esp+12]
+	;mov    ecx,    [esp+16]
+	;mov    edx,    [esp+20]
+
+	%assign  push_num 0
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	push r0
+	push r2
+	pxor   xmm6,   xmm6
+    pxor   xmm7,   xmm7
+
+	SSE2_GetSatd8x8
+
+	pop r2
+	pop r0
+	;mov    eax,    [esp+8]
+    ;mov    ecx,    [esp+16]
+    add    r0,    8
+    add    r2,    8
+	SSE2_GetSatd8x8
+
+	psrlw   xmm6,  1
+	SSE2_SumWHorizon   xmm6,xmm4,xmm7
+	movd    retrd,   xmm6
+	LOAD_4_PARA_POP
+	ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd16x16_sse2
+align 16
+WelsSampleSatd16x16_sse2:
+	;push   ebx
+	;mov    eax,    [esp+8]
+	;mov    ebx,    [esp+12]
+	;mov    ecx,    [esp+16]
+	;mov    edx,    [esp+20]
+
+	%assign  push_num 0
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	push r0
+	push r2
+	pxor   xmm6,   xmm6
+    pxor   xmm7,   xmm7
+
+	SSE2_GetSatd8x8
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	SSE2_GetSatd8x8
+
+	pop r2
+	pop r0
+	;mov    eax,    [esp+8]
+	;mov    ecx,    [esp+16]
+	add    r0,    8
+	add    r2,    8
+
+	SSE2_GetSatd8x8
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	SSE2_GetSatd8x8
+
+ ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
+    psrlw   xmm6,  1
+	SSE2_SumWHorizon   xmm6,xmm4,xmm7
+	movd    retrd,   xmm6
+	LOAD_4_PARA_POP
+	ret
+
+;***********************************************************************
+;
+;Pixel_satd_wxh_sse2 END
+;
+;***********************************************************************
+
+;***********************************************************************
+;
+;Pixel_satd_intra_sse2 BEGIN
+;
+;***********************************************************************
+
+%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3
+	pmaddubsw    %1, xmm5
+	movdqa       %2, %1
+	pmaddwd      %1, xmm7
+	pmaddwd      %2, xmm6
+	movdqa       %3, %1
+	punpckldq    %1, %2
+	punpckhdq    %2, %3
+	movdqa       %3, %1
+	punpcklqdq   %1, %2
+	punpckhqdq   %3, %2
+	paddd        xmm4, %1 ;for dc
+	paddd        xmm4, %3 ;for dc
+	packssdw     %1, %3
+	psllw        %1, 2
+%endmacro
+%macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2
+	pmaddubsw    %1, xmm5
+	movdqa       %2, %1
+	pmaddwd      %1, xmm7
+	pmaddwd      %2, xmm6
+	movdqa       %3, %1
+	punpckldq    %1, %2
+	punpckhdq    %2, %3
+	movdqa       %3, %1
+	punpcklqdq   %1, %2
+	punpckhqdq   %3, %2
+;    paddd        xmm4, %1 ;for dc
+;	 paddd        xmm4, %3 ;for dc
+	movdqa       %4, %1
+	punpcklqdq   %4, %3
+	packssdw     %1, %3
+	psllw        %1, 2
+%endmacro
+
+%macro SSE41_GetX38x4SatdDec 0
+	pxor        xmm7,   xmm7
+	movq        xmm0,   [eax]
+	movq        xmm1,   [eax+ebx]
+	lea         eax,    [eax+2*ebx]
+	movq        xmm2,   [eax]
+	movq        xmm3,   [eax+ebx]
+	lea         eax,    [eax+2*ebx]
+	punpcklbw   xmm0,   xmm7
+	punpcklbw   xmm1,   xmm7
+	punpcklbw   xmm2,   xmm7
+	punpcklbw   xmm3,   xmm7
+	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm7
+	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm7
+	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2
+	;doesn't need another transpose
+%endmacro
+%macro SSE41_GetX38x4SatdV 2
+	pxor        xmm0,   xmm0
+	pinsrw      xmm0,   word[esi+%2],   0
+	pinsrw      xmm0,   word[esi+%2+8], 4
+	psubsw      xmm0,   xmm7
+	pabsw       xmm0,   xmm0
+	paddw       xmm4,   xmm0
+	pxor        xmm0,   xmm0
+	pinsrw      xmm0,   word[esi+%2+2],  0
+	pinsrw      xmm0,   word[esi+%2+10], 4
+	psubsw      xmm0,   xmm1
+	pabsw       xmm0,   xmm0
+	paddw       xmm4,   xmm0
+	pxor        xmm0,   xmm0
+	pinsrw      xmm0,   word[esi+%2+4],  0
+	pinsrw      xmm0,   word[esi+%2+12], 4
+	psubsw      xmm0,   xmm3
+	pabsw       xmm0,   xmm0
+	paddw       xmm4,   xmm0
+	pxor        xmm0,   xmm0
+	pinsrw      xmm0,   word[esi+%2+6],  0
+	pinsrw      xmm0,   word[esi+%2+14], 4
+	psubsw      xmm0,   xmm2
+	pabsw       xmm0,   xmm0
+	paddw       xmm4,   xmm0
+%endmacro
+%macro SSE41_GetX38x4SatdH  3
+	movq        xmm0,   [esi+%3+8*%1]
+	punpcklqdq  xmm0,   xmm0
+	psubsw      xmm0,   xmm7
+	pabsw       xmm0,   xmm0
+	paddw       xmm5,   xmm0
+	pabsw       xmm1,   xmm1
+	pabsw       xmm2,   xmm2
+	pabsw       xmm3,   xmm3
+	paddw       xmm2,   xmm1;for DC
+	paddw       xmm2,   xmm3;for DC
+	paddw       xmm5,   xmm2
+%endmacro
+%macro SSE41_I16X16GetX38x4SatdDC 0
+	pxor        xmm0,   xmm0
+	movq2dq     xmm0,   mm4
+	punpcklqdq  xmm0,   xmm0
+	psubsw      xmm0,   xmm7
+	pabsw       xmm0,   xmm0
+	paddw       xmm6,   xmm0
+	paddw       xmm6,   xmm2
+%endmacro
+%macro SSE41_ChromaGetX38x4SatdDC 1
+	shl         %1,     4
+	movdqa      xmm0,   [esi+32+%1]
+	psubsw      xmm0,   xmm7
+	pabsw       xmm0,   xmm0
+	paddw       xmm6,   xmm0
+	paddw       xmm6,   xmm2
+%endmacro
+%macro SSE41_I16x16GetX38x4Satd 2
+	SSE41_GetX38x4SatdDec
+	SSE41_GetX38x4SatdV   %1, %2
+	SSE41_GetX38x4SatdH   %1, %2, 32
+	SSE41_I16X16GetX38x4SatdDC
+%endmacro
+%macro SSE41_ChromaGetX38x4Satd 2
+	SSE41_GetX38x4SatdDec
+	SSE41_GetX38x4SatdV   %1, %2
+	SSE41_GetX38x4SatdH   %1, %2, 16
+	SSE41_ChromaGetX38x4SatdDC %1
+%endmacro
+%macro SSE41_HSum8W 3
+	pmaddwd     %1, %2
+	movhlps     %3, %1
+	paddd       %1, %3
+	pshuflw     %3, %1,0Eh
+	paddd       %1, %3
+%endmacro
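+; pmaddwd against a register of word 1s (from MMX_DW_1_2REG) widens the eight
+; word sums to four dwords in one step; the movhlps/pshuflw folds then reduce
+; those four dwords to a single total in the low dword of %1.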
+
+
+%ifdef X86_32
+WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
+WelsIntra16x16Combined3Satd_sse41:
+	push   ebx
+	push   esi
+	push   edi
+	mov    ecx,    [esp+16]
+	mov    edx,    [esp+20]
+	mov    eax,    [esp+24]
+	mov    ebx,    [esp+28]
+	mov    esi,    [esp+40] ;temp_satd
+	pxor        xmm4,   xmm4
+	movdqa      xmm5,   [HSumSubDB1]
+	movdqa      xmm6,   [HSumSubDW1]
+	movdqa      xmm7,   [PDW1]
+	sub         ecx,    edx
+	movdqu 		xmm0,   [ecx]
+	movhlps		xmm1,   xmm0
+	punpcklqdq  xmm0,   xmm0
+	punpcklqdq  xmm1,   xmm1
+	SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
+	SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
+	movdqa      [esi],  xmm0 ;V
+	movdqa      [esi+16], xmm1
+	add         ecx,    edx
+	pinsrb      xmm0,   byte[ecx-1], 0
+	pinsrb      xmm0,   byte[ecx+edx-1], 1
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     2
+	pinsrb      xmm0,   byte[ecx+edx-1], 3
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     4
+	pinsrb      xmm0,   byte[ecx+edx-1], 5
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     6
+	pinsrb      xmm0,   byte[ecx+edx-1], 7
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     8
+	pinsrb      xmm0,   byte[ecx+edx-1], 9
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     10
+	pinsrb      xmm0,   byte[ecx+edx-1], 11
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     12
+	pinsrb      xmm0,   byte[ecx+edx-1], 13
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     14
+	pinsrb      xmm0,   byte[ecx+edx-1], 15
+	movhlps		xmm1,   xmm0
+	punpcklqdq  xmm0,   xmm0
+	punpcklqdq  xmm1,   xmm1
+	SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
+	SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
+	movdqa      [esi+32], xmm0 ;H
+	movdqa      [esi+48], xmm1
+	movd        ecx,    xmm4 ;dc
+	add         ecx,    16   ;(sum+16)
+	shr         ecx,    5    ;((sum+16)>>5)
+	shl         ecx,    4    ; x16: DC coefficient of a flat 4x4 block under the unnormalized Hadamard
+	movd        mm4,    ecx  ; mm4 copy DC
+	pxor        xmm4,   xmm4 ;V
+	pxor        xmm5,   xmm5 ;H
+	pxor        xmm6,   xmm6 ;DC
+	mov         ecx,    0
+	mov         edi,    0
+.loop16x16_get_satd:
+.loopStart1:
+	SSE41_I16x16GetX38x4Satd ecx, edi
+	inc          ecx
+	cmp         ecx, 4
+	jl          .loopStart1
+	cmp         edi, 16
+	je          .loop16x16_get_satd_end
+	mov         eax, [esp+24]
+	add         eax, 8
+	mov         ecx, 0
+	add         edi, 16
+	jmp         .loop16x16_get_satd
+ .loop16x16_get_satd_end:
+	MMX_DW_1_2REG    xmm0, xmm1
+	psrlw       xmm4, 1 ;/2
+	psrlw       xmm5, 1 ;/2
+	psrlw       xmm6, 1 ;/2
+	SSE41_HSum8W     xmm4, xmm0, xmm1
+	SSE41_HSum8W     xmm5, xmm0, xmm1
+	SSE41_HSum8W     xmm6, xmm0, xmm1
+
+	; comparing order: DC H V
+	movd      ebx, xmm6 ;DC
+	movd      edi, xmm5 ;H
+	movd      ecx, xmm4 ;V
+	mov      edx, [esp+36]
+	shl       edx, 1
+	add       edi, edx
+	add       ebx, edx
+	mov       edx, [esp+32]
+	cmp       ebx, edi
+	jge near   not_dc_16x16
+	cmp        ebx, ecx
+	jge near   not_dc_h_16x16
+
+	; for DC mode
+	mov       dword[edx], 2;I16_PRED_DC
+	mov       eax, ebx
+	jmp near return_satd_intra_16x16_x3
+not_dc_16x16:
+	; for H mode
+	cmp       edi, ecx
+	jge near   not_dc_h_16x16
+	mov       dword[edx], 1;I16_PRED_H
+	mov       eax, edi
+	jmp near return_satd_intra_16x16_x3
+not_dc_h_16x16:
+	; for V mode
+	mov       dword[edx], 0;I16_PRED_V
+	mov       eax, ecx
+return_satd_intra_16x16_x3:
+	WELSEMMS
+	pop         edi
+	pop         esi
+	pop         ebx
+ret
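+; Mode decision above: the DC and H SATD totals are biased by 2*lambda before
+; the three-way minimum (V carries no bias here), and each jge means an equal
+; cost prefers the later-compared candidate; the chosen prediction mode is
+; stored through the pointer at [esp+32] and the winning cost returned in eax.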
+
+%macro SSE41_ChromaGetX38x8Satd 0
+	movdqa      xmm5,   [HSumSubDB1]
+	movdqa      xmm6,   [HSumSubDW1]
+	movdqa      xmm7,   [PDW1]
+	sub         ecx,    edx
+	movq 		xmm0,   [ecx]
+	punpcklqdq  xmm0,   xmm0
+	SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
+	movdqa      [esi],  xmm0 ;V
+	add         ecx,    edx
+	pinsrb      xmm0,   byte[ecx-1], 0
+	pinsrb      xmm0,   byte[ecx+edx-1], 1
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     2
+	pinsrb      xmm0,   byte[ecx+edx-1], 3
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     4
+	pinsrb      xmm0,   byte[ecx+edx-1], 5
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     6
+	pinsrb      xmm0,   byte[ecx+edx-1], 7
+	punpcklqdq  xmm0,   xmm0
+	SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
+	movdqa      [esi+16], xmm0 ;H
+;(sum+2)>>2
+	movdqa      xmm6,   [PDQ2]
+	movdqa      xmm5,   xmm4
+	punpckhqdq  xmm5,   xmm1
+	paddd       xmm5,   xmm6
+	psrld       xmm5,   2
+;(sum1+sum2+4)>>3
+	paddd       xmm6,   xmm6
+	paddd       xmm4,   xmm1
+	paddd       xmm4,   xmm6
+	psrld       xmm4,   3
+;satd *16
+	pslld       xmm5,   4
+	pslld       xmm4,   4
+;temp satd
+	movdqa      xmm6,   xmm4
+	punpcklqdq  xmm4,   xmm5
+	psllq       xmm4,   32
+	psrlq       xmm4,   32
+	movdqa      [esi+32], xmm4
+	punpckhqdq  xmm5,   xmm6
+	psllq       xmm5,   32
+	psrlq       xmm5,   32
+	movdqa      [esi+48], xmm5
+
+	pxor        xmm4,   xmm4 ;V
+	pxor        xmm5,   xmm5 ;H
+	pxor        xmm6,   xmm6 ;DC
+	mov         ecx,    0
+loop_chroma_satdx3_cb_cr:
+	SSE41_ChromaGetX38x4Satd ecx, 0
+	inc             ecx
+	cmp             ecx, 2
+	jl              loop_chroma_satdx3_cb_cr
+%endmacro
+
+%macro SSEReg2MMX 3
+	movdq2q     %2, %1
+	movhlps     %1, %1
+	movdq2q     %3, %1
+%endmacro
+%macro MMXReg2SSE 4
+	movq2dq     %1, %3
+	movq2dq     %2, %4
+	punpcklqdq  %1, %2
+%endmacro
+;SSEReg2MMX/MMXReg2SSE spill between SSE and MMX registers to reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41
+
+WELS_EXTERN WelsIntraChroma8x8Combined3Satd_sse41
+WelsIntraChroma8x8Combined3Satd_sse41:
+	push   ebx
+	push   esi
+	push   edi
+	mov    ecx,    [esp+16]
+	mov    edx,    [esp+20]
+	mov    eax,    [esp+24]
+	mov    ebx,    [esp+28]
+	mov    esi,    [esp+40] ;temp_satd
+	xor    edi,    edi
+loop_chroma_satdx3:
+	SSE41_ChromaGetX38x8Satd
+	cmp             edi, 1
+	je              loop_chroma_satdx3end
+	inc             edi
+	SSEReg2MMX  xmm4, mm0,mm1
+	SSEReg2MMX  xmm5, mm2,mm3
+	SSEReg2MMX  xmm6, mm5,mm6
+	mov         ecx,  [esp+44]
+	mov         eax,  [esp+48]
+	jmp         loop_chroma_satdx3
+loop_chroma_satdx3end:
+	MMXReg2SSE  xmm0, xmm3, mm0, mm1
+	MMXReg2SSE  xmm1, xmm3, mm2, mm3
+	MMXReg2SSE  xmm2, xmm3, mm5, mm6
+
+	paddw       xmm4, xmm0
+	paddw       xmm5, xmm1
+	paddw       xmm6, xmm2
+
+	MMX_DW_1_2REG    xmm0, xmm1
+	psrlw       xmm4, 1 ;/2
+	psrlw       xmm5, 1 ;/2
+	psrlw       xmm6, 1 ;/2
+	SSE41_HSum8W     xmm4, xmm0, xmm1
+	SSE41_HSum8W     xmm5, xmm0, xmm1
+	SSE41_HSum8W     xmm6, xmm0, xmm1
+	; comparing order: DC H V
+	movd      ebx, xmm6 ;DC
+	movd      edi, xmm5 ;H
+	movd      ecx, xmm4 ;V
+	mov       edx, [esp+36]
+	shl       edx, 1
+	add       edi, edx
+	add       ecx, edx
+	mov       edx, [esp+32]
+	cmp       ebx, edi
+	jge near   not_dc_8x8
+	cmp        ebx, ecx
+	jge near   not_dc_h_8x8
+
+	; for DC mode
+	mov       dword[edx], 0;I8_PRED_DC
+	mov       eax, ebx
+	jmp near return_satd_intra_8x8_x3
+not_dc_8x8:
+	; for H mode
+	cmp       edi, ecx
+	jge near   not_dc_h_8x8
+	mov       dword[edx], 1;I8_PRED_H
+	mov       eax, edi
+	jmp near return_satd_intra_8x8_x3
+not_dc_h_8x8:
+	; for V mode
+	mov       dword[edx], 2;I8_PRED_V
+	mov       eax, ecx
+return_satd_intra_8x8_x3:
+	WELSEMMS
+	pop         edi
+	pop         esi
+	pop         ebx
+ret
+
+
+;***********************************************************************
+;
+;Pixel_satd_intra_sse2 END
+;
+;***********************************************************************
+%macro SSSE3_Get16BSadHVDC 2
+  movd        xmm6,%1
+  pshufb      xmm6,xmm1
+  movdqa      %1,  xmm6
+  movdqa      xmm0,%2
+  psadbw      xmm0,xmm7
+  paddw       xmm4,xmm0
+  movdqa      xmm0,%2
+  psadbw      xmm0,xmm5
+  paddw       xmm2,xmm0
+  psadbw      xmm6,%2
+  paddw       xmm3,xmm6
+%endmacro
+%macro WelsAddDCValue 4
+    movzx   %2, byte %1
+    mov    %3, %2
+    add     %4, %2
+%endmacro
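+; WelsAddDCValue reads one left-border pixel (%1), stores it into the scratch
+; prediction buffer (%3) for the later H-mode SAD, and adds it into the
+; running DC sum (%4); %2 is just the zero-extended temporary.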
+
+;***********************************************************************
+;
+;Pixel_sad_intra_ssse3 BEGIN
+;
+;***********************************************************************
+WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
+WelsIntra16x16Combined3Sad_ssse3:
+	push   ebx
+	push   esi
+	push   edi
+	mov    ecx,    [esp+16]
+	mov    edx,    [esp+20]
+	mov    edi,    [esp+40] ;temp_sad
+	sub    ecx,    edx
+    movdqa      xmm5,[ecx]
+    pxor        xmm0,xmm0
+    psadbw      xmm0,xmm5
+    movhlps     xmm1,xmm0
+    paddw       xmm0,xmm1
+    movd        eax,xmm0
+
+    add         ecx,edx
+    lea         ebx, [edx+2*edx]
+    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
+    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
+    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
+    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
+    lea         ecx, [ecx+4*edx]
+    add         edi, 64
+    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
+    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
+    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
+    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
+    lea         ecx, [ecx+4*edx]
+    add         edi, 64
+    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
+    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
+    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
+    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
+    lea         ecx, [ecx+4*edx]
+    add         edi, 64
+    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
+    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
+    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
+    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
+    sub        edi, 192
+    add         eax,10h
+    shr         eax,5
+    movd        xmm7,eax
+    pxor        xmm1,xmm1
+    pshufb      xmm7,xmm1
+    pxor        xmm4,xmm4
+    pxor        xmm3,xmm3
+    pxor        xmm2,xmm2
+;sad begin
+	mov    eax,    [esp+24]
+	mov    ebx,    [esp+28]
+    lea         esi, [ebx+2*ebx]
+    SSSE3_Get16BSadHVDC [edi], [eax]
+    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
+    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
+    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
+    add         edi, 64
+    lea         eax, [eax+4*ebx]
+    SSSE3_Get16BSadHVDC [edi], [eax]
+    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
+    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
+    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
+    add         edi, 64
+    lea         eax, [eax+4*ebx]
+    SSSE3_Get16BSadHVDC [edi], [eax]
+    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
+    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
+    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
+    add         edi, 64
+    lea         eax, [eax+4*ebx]
+    SSSE3_Get16BSadHVDC [edi], [eax]
+    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
+    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
+    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
+
+    pslldq      xmm3,4
+    por         xmm3,xmm2
+    movhlps     xmm1,xmm3
+    paddw       xmm3,xmm1
+    movhlps     xmm0,xmm4
+    paddw       xmm4,xmm0
+; comparing order: DC H V
+	movd        ebx, xmm4 ;DC
+	movd        ecx, xmm3 ;V
+	psrldq      xmm3, 4
+	movd        esi, xmm3 ;H
+	mov         eax, [esp+36] ;lambda
+	shl         eax, 1
+	add         esi, eax
+	add         ebx, eax
+	mov         edx, [esp+32]
+	cmp         ebx, esi
+	jge near   not_dc_16x16_sad
+	cmp        ebx, ecx
+	jge near   not_dc_h_16x16_sad
+	; for DC mode
+	mov       dword[edx], 2;I16_PRED_DC
+	mov       eax, ebx
+    sub        edi, 192
+%assign x 0
+%rep 16
+    movdqa    [edi+16*x], xmm7
+%assign x x+1
+%endrep
+	jmp near return_sad_intra_16x16_x3
+not_dc_16x16_sad:
+	; for H mode
+	cmp       esi, ecx
+	jge near   not_dc_h_16x16_sad
+	mov       dword[edx], 1;I16_PRED_H
+	mov       eax, esi
+	jmp near return_sad_intra_16x16_x3
+not_dc_h_16x16_sad:
+	; for V mode
+	mov       dword[edx], 0;I16_PRED_V
+	mov       eax, ecx
+    sub       edi, 192
+%assign x 0
+%rep 16
+    movdqa    [edi+16*x], xmm5
+%assign x x+1
+%endrep
+return_sad_intra_16x16_x3:
+	pop    edi
+	pop    esi
+	pop    ebx
+	ret
+%endif
+;***********************************************************************
+;
+;Pixel_sad_intra_ssse3 END
+;
+;***********************************************************************
+;***********************************************************************
+;
+;Pixel_satd_wxh_sse41 BEGIN
+;
+;***********************************************************************
+
+;SSE4.1
+%macro SSE41_GetSatd8x4 0
+	movq             xmm0, [r0]
+	punpcklqdq       xmm0, xmm0
+	pmaddubsw        xmm0, xmm7
+	movq             xmm1, [r0+r1]
+	punpcklqdq       xmm1, xmm1
+	pmaddubsw        xmm1, xmm7
+	movq             xmm2, [r2]
+	punpcklqdq       xmm2, xmm2
+	pmaddubsw        xmm2, xmm7
+	movq             xmm3, [r2+r3]
+	punpcklqdq       xmm3, xmm3
+	pmaddubsw        xmm3, xmm7
+	psubsw           xmm0, xmm2
+	psubsw           xmm1, xmm3
+	movq             xmm2, [r0+2*r1]
+	punpcklqdq       xmm2, xmm2
+	pmaddubsw        xmm2, xmm7
+	movq             xmm3, [r0+r4]
+	punpcklqdq       xmm3, xmm3
+	pmaddubsw        xmm3, xmm7
+	movq             xmm4, [r2+2*r3]
+	punpcklqdq       xmm4, xmm4
+	pmaddubsw        xmm4, xmm7
+	movq             xmm5, [r2+r5]
+	punpcklqdq       xmm5, xmm5
+	pmaddubsw        xmm5, xmm7
+	psubsw           xmm2, xmm4
+	psubsw           xmm3, xmm5
+	SSE2_HDMTwo4x4   xmm0, xmm1, xmm2, xmm3, xmm4
+	pabsw            xmm0, xmm0
+	pabsw            xmm2, xmm2
+	pabsw            xmm1, xmm1
+	pabsw            xmm3, xmm3
+	movdqa           xmm4, xmm3
+	pblendw          xmm3, xmm1, 0xAA
+	pslld            xmm1, 16
+	psrld            xmm4, 16
+	por              xmm1, xmm4
+	pmaxuw           xmm1, xmm3
+	paddw            xmm6, xmm1
+	movdqa           xmm4, xmm0
+	pblendw          xmm0, xmm2, 0xAA
+	pslld            xmm2, 16
+	psrld            xmm4, 16
+	por              xmm2, xmm4
+	pmaxuw           xmm0, xmm2
+	paddw            xmm6, xmm0
+%endmacro
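+
+; For reference, a scalar C sketch of the 4x4 Hadamard-transformed SAD that
+; the macro above accumulates (it handles the two 4x4 halves of an 8x4 block
+; at once via pmaddubsw and a pmaxuw pairing trick; the overall scale factor
+; depends on the normalization convention). All names are illustrative:
+;
+;   #include <stdint.h>
+;
+;   static int32_t Satd4x4_c (const uint8_t* pSrc, int32_t iStrideSrc,
+;                             const uint8_t* pRef, int32_t iStrideRef) {
+;     int32_t d[4][4], i, j, iSum = 0;
+;     for (i = 0; i < 4; i++)                 /* pixel differences */
+;       for (j = 0; j < 4; j++)
+;         d[i][j] = pSrc[i * iStrideSrc + j] - pRef[i * iStrideRef + j];
+;     for (i = 0; i < 4; i++) {               /* horizontal 4-point Hadamard */
+;       int32_t s0 = d[i][0] + d[i][2], s1 = d[i][1] + d[i][3];
+;       int32_t t0 = d[i][0] - d[i][2], t1 = d[i][1] - d[i][3];
+;       d[i][0] = s0 + s1; d[i][1] = s0 - s1;
+;       d[i][2] = t0 + t1; d[i][3] = t0 - t1;
+;     }
+;     for (j = 0; j < 4; j++) {               /* vertical pass + abs sum */
+;       int32_t s0 = d[0][j] + d[2][j], s1 = d[1][j] + d[3][j];
+;       int32_t t0 = d[0][j] - d[2][j], t1 = d[1][j] - d[3][j];
+;       int32_t o0 = s0 + s1, o1 = s0 - s1, o2 = t0 + t1, o3 = t0 - t1;
+;       iSum += (o0 < 0 ? -o0 : o0) + (o1 < 0 ? -o1 : o1)
+;             + (o2 < 0 ? -o2 : o2) + (o3 < 0 ? -o3 : o3);
+;     }
+;     return iSum;
+;   }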
+
+%macro SSSE3_SumWHorizon 4 ;retrd, srcSSE, tempSSE, tempSSE
+	MMX_DW_1_2REG    %3, %4
+	pmaddwd     %2, %3
+	movhlps     %4, %2
+	paddd       %2, %4
+	pshuflw     %4, %2,0Eh
+	paddd       %2, %4
+	movd		%1, %2
+%endmacro
+;***********************************************************************
+;
+;int32_t WelsSampleSatd4x4_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd4x4_sse41
+WelsSampleSatd4x4_sse41:
+	;push        ebx
+	;mov         eax,[esp+8]
+	;mov         ebx,[esp+12]
+	;mov         ecx,[esp+16]
+	;mov         edx,[esp+20]
+
+	%assign  push_num 0
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	movdqa      xmm4,[HSwapSumSubDB1]
+	movd        xmm2,[r2]
+	movd        xmm5,[r2+r3]
+	shufps      xmm2,xmm5,0
+	movd        xmm3,[r2+r3*2]
+	lea         r2, [r3*2+r2]
+	movd        xmm5,[r2+r3]
+	shufps      xmm3,xmm5,0
+	movd        xmm0,[r0]
+	movd        xmm5,[r0+r1]
+	shufps      xmm0,xmm5,0
+	movd        xmm1,[r0+r1*2]
+	lea         r0, [r1*2+r0]
+	movd        xmm5,[r0+r1]
+	shufps      xmm1,xmm5,0
+	pmaddubsw   xmm0,xmm4
+	pmaddubsw   xmm1,xmm4
+	pmaddubsw   xmm2,xmm4
+	pmaddubsw   xmm3,xmm4
+	psubw       xmm0,xmm2
+	psubw       xmm1,xmm3
+	movdqa      xmm2,xmm0
+	paddw       xmm0,xmm1
+	psubw       xmm1,xmm2
+	movdqa      xmm2,xmm0
+	punpcklqdq  xmm0,xmm1
+	punpckhqdq  xmm2,xmm1
+	movdqa      xmm1,xmm0
+	paddw       xmm0,xmm2
+	psubw       xmm2,xmm1
+	movdqa      xmm1,xmm0
+	pblendw     xmm0,xmm2,0AAh
+	pslld       xmm2,16
+	psrld       xmm1,16
+	por         xmm2,xmm1
+	pabsw       xmm0,xmm0
+	pabsw       xmm2,xmm2
+	pmaxsw      xmm0,xmm2
+	SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7
+	LOAD_4_PARA_POP
+	ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd8x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd8x8_sse41
+align 16
+WelsSampleSatd8x8_sse41:
+	;push   ebx
+	;push   esi
+	;push   edi
+	;mov    eax,    [esp+16]
+	;mov    ebx,    [esp+20]
+	;mov    ecx,    [esp+24]
+	;mov    edx,    [esp+28]
+%ifdef X86_32
+	push  r4
+	push  r5
+%endif
+	%assign  push_num 2
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	movdqa      xmm7, [HSumSubDB1]
+	lea         r4,  [r1+r1*2]
+	lea         r5,  [r3+r3*2]
+	pxor		xmm6, xmm6
+	SSE41_GetSatd8x4
+	lea			r0,	 [r0+4*r1]
+	lea			r2,  [r2+4*r3]
+	SSE41_GetSatd8x4
+	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+	LOAD_4_PARA_POP
+%ifdef X86_32
+	pop  r5
+	pop  r4
+%endif
+	ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd8x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd8x16_sse41
+align 16
+WelsSampleSatd8x16_sse41:
+	;push   ebx
+	;push   esi
+	;push   edi
+	;push   ebp
+	;%define pushsize   16
+	;mov    eax,    [esp+pushsize+4]
+	;mov    ebx,    [esp+pushsize+8]
+	;mov    ecx,    [esp+pushsize+12]
+	;mov    edx,    [esp+pushsize+16]
+%ifdef X86_32
+	push  r4
+	push  r5
+	push  r6
+%endif
+	%assign  push_num 3
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	movdqa      xmm7, [HSumSubDB1]
+	lea         r4,  [r1+r1*2]
+	lea         r5,  [r3+r3*2]
+	pxor        xmm6, xmm6
+	mov         r6,    0
+loop_get_satd_8x16:
+	SSE41_GetSatd8x4
+	lea			r0,  [r0+4*r1]
+	lea			r2,  [r2+4*r3]
+	inc         r6
+	cmp         r6,  4
+	jl          loop_get_satd_8x16
+	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+	LOAD_4_PARA_POP
+%ifdef X86_32
+	pop  r6
+	pop  r5
+	pop  r4
+%endif
+	ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd16x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd16x8_sse41
+align 16
+WelsSampleSatd16x8_sse41:
+	;push   ebx
+	;push   esi
+	;push   edi
+	;mov    eax,    [esp+16]
+	;mov    ebx,    [esp+20]
+	;mov    ecx,    [esp+24]
+	;mov    edx,    [esp+28]
+%ifdef X86_32
+	push  r4
+	push  r5
+%endif
+	%assign  push_num 2
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	push  r0
+	push  r2
+
+	movdqa      xmm7, [HSumSubDB1]
+	lea         r4,  [r1+r1*2]
+	lea         r5,  [r3+r3*2]
+	pxor		xmm6,   xmm6
+	SSE41_GetSatd8x4
+	lea			r0,  [r0+4*r1]
+	lea			r2,  [r2+4*r3]
+	SSE41_GetSatd8x4
+
+	pop  r2
+	pop  r0
+	;mov			eax,    [esp+16]
+	;mov			ecx,    [esp+24]
+	add			r0,    8
+	add			r2,    8
+	SSE41_GetSatd8x4
+	lea			r0,  [r0+4*r1]
+	lea			r2,  [r2+4*r3]
+	SSE41_GetSatd8x4
+	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+	LOAD_4_PARA_POP
+%ifdef X86_32
+	pop  r5
+	pop  r4
+%endif
+	ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd16x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+;***********************************************************************
+
+WELS_EXTERN WelsSampleSatd16x16_sse41
+align 16
+WelsSampleSatd16x16_sse41:
+	;push   ebx
+	;push   esi
+	;push   edi
+	;push   ebp
+	;%define pushsize   16
+	;mov    eax,    [esp+pushsize+4]
+	;mov    ebx,    [esp+pushsize+8]
+	;mov    ecx,    [esp+pushsize+12]
+	;mov    edx,    [esp+pushsize+16]
+%ifdef X86_32
+	push  r4
+	push  r5
+	push  r6
+%endif
+	%assign  push_num 3
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+
+	push  r0
+	push  r2
+
+	movdqa      xmm7, [HSumSubDB1]
+	lea         r4,  [r1+r1*2]
+	lea         r5,  [r3+r3*2]
+	pxor		xmm6,   xmm6
+	mov         r6,    0
+loop_get_satd_16x16_left:
+	SSE41_GetSatd8x4
+	lea			r0,  [r0+4*r1]
+	lea			r2,  [r2+4*r3]
+	inc         r6
+	cmp         r6,  4
+	jl          loop_get_satd_16x16_left
+
+	pop  r2
+	pop  r0
+	;mov			eax,    [esp+pushsize+4]
+	;mov			ecx,    [esp+pushsize+12]
+	add			r0,    8
+	add			r2,    8
+	mov         r6,    0
+loop_get_satd_16x16_right:
+	SSE41_GetSatd8x4
+	lea			r0,  [r0+4*r1]
+	lea			r2,  [r2+4*r3]
+	inc         r6
+	cmp         r6,  4
+	jl          loop_get_satd_16x16_right
+	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+	;%undef pushsize
+	LOAD_4_PARA_POP
+%ifdef X86_32
+	pop  r6
+	pop  r5
+	pop  r4
+%endif
+	ret
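+
+; The 16x16 routine above works in 8x4 tiles: four tiles down the left eight
+; columns (with the base pointers saved), then four down the right. A C
+; sketch of that decomposition, with Satd8x4 as a hypothetical scalar helper:
+;
+;   #include <stdint.h>
+;
+;   int32_t Satd8x4 (const uint8_t*, int32_t, const uint8_t*, int32_t);
+;
+;   static int32_t Satd16x16 (const uint8_t* pSrc, int32_t iStrideSrc,
+;                             const uint8_t* pRef, int32_t iStrideRef) {
+;     int32_t x, y, iSum = 0;
+;     for (x = 0; x < 16; x += 8)             /* left half, then right half */
+;       for (y = 0; y < 16; y += 4)           /* four 8x4 tiles per half */
+;         iSum += Satd8x4 (pSrc + y * iStrideSrc + x, iStrideSrc,
+;                          pRef + y * iStrideRef + x, iStrideRef);
+;     return iSum;
+;   }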
+
+;***********************************************************************
+;
+;Pixel_satd_wxh_sse41 END
+;
+;***********************************************************************
+
+;***********************************************************************
+;
+;Pixel_sad_wxh_sse2 BEGIN
+;
+;***********************************************************************
+
+%macro SSE2_GetSad2x16 0
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqu xmm1,   [r2]
+	MOVDQ  xmm2,   [r0] ;[r0] must be 16-byte aligned
+	psadbw xmm1,   xmm2
+	paddw  xmm0,   xmm1
+	movdqu xmm1,   [r2+r3]
+	MOVDQ  xmm2,   [r0+r1]
+	psadbw xmm1,   xmm2
+	paddw  xmm0,   xmm1
+%endmacro
+
+
+%macro SSE2_GetSad4x16 0
+	movdqu xmm0,   [r2]
+	MOVDQ  xmm2,   [r0]
+	psadbw xmm0,   xmm2
+	paddw  xmm7,   xmm0
+	movdqu xmm1,   [r2+r3]
+	MOVDQ  xmm2,   [r0+r1]
+	psadbw xmm1,   xmm2
+	paddw  xmm7,   xmm1
+	movdqu xmm1,   [r2+2*r3]
+	MOVDQ  xmm2,   [r0+2*r1] ;[r0] must be 16-byte aligned
+	psadbw xmm1,   xmm2
+	paddw  xmm7,   xmm1
+	movdqu xmm1,   [r2+r5]
+	MOVDQ  xmm2,   [r0+r4]
+	psadbw xmm1,   xmm2
+	paddw  xmm7,   xmm1
+%endmacro
+
+
+%macro SSE2_GetSad8x4 0
+	movq   xmm0,   [r0]
+	movq   xmm1,   [r0+r1]
+	lea    r0,     [r0+2*r1]
+	movhps xmm0,   [r0]
+	movhps xmm1,   [r0+r1]
+
+	movq   xmm2,   [r2]
+	movq   xmm3,   [r2+r3]
+	lea    r2,     [r2+2*r3]
+	movhps xmm2,   [r2]
+	movhps xmm3,   [r2+r3]
+	psadbw xmm0,   xmm2
+	psadbw xmm1,   xmm3
+	paddw  xmm6,   xmm0
+	paddw  xmm6,   xmm1
+%endmacro
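+
+; Each Sad macro above accumulates what psadbw computes in hardware: sums of
+; absolute byte differences. A scalar C reference for a WxH block
+; (illustrative names):
+;
+;   #include <stdint.h>
+;
+;   static int32_t SadWxH_c (const uint8_t* pSrc, int32_t iStrideSrc,
+;                            const uint8_t* pRef, int32_t iStrideRef,
+;                            int32_t iWidth, int32_t iHeight) {
+;     int32_t x, y, iSad = 0;
+;     for (y = 0; y < iHeight; y++)
+;       for (x = 0; x < iWidth; x++) {
+;         int32_t iDiff = pSrc[y * iStrideSrc + x] - pRef[y * iStrideRef + x];
+;         iSad += iDiff >= 0 ? iDiff : -iDiff;
+;       }
+;     return iSad;
+;   }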
+
+;***********************************************************************
+;
+;int32_t WelsSampleSad16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+;The first parameter can be assumed to be 16-byte aligned;
+;in wels, the third parameter cannot be assumed to be aligned.
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSad16x16_sse2
+align 16
+WelsSampleSad16x16_sse2:
+	;push ebx
+	;push edi
+	;push esi
+	;%define _STACK_SIZE		12
+	;mov eax, [esp+_STACK_SIZE+4 ]
+	;mov	ebx, [esp+_STACK_SIZE+8 ]
+	;mov ecx, [esp+_STACK_SIZE+12]
+	;mov edx, [esp+_STACK_SIZE+16]
+%ifdef X86_32
+	push  r4
+	push  r5
+%endif
+
+	%assign  push_num 2
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	lea r4, [3*r1]
+	lea r5, [3*r3]
+
+	pxor   xmm7,   xmm7
+	SSE2_GetSad4x16
+	lea	   r0,  [r0+4*r1]
+	lea	   r2,  [r2+4*r3]
+	SSE2_GetSad4x16
+	lea	   r0,  [r0+4*r1]
+	lea	   r2,  [r2+4*r3]
+	SSE2_GetSad4x16
+	lea	   r0,  [r0+4*r1]
+	lea	   r2,  [r2+4*r3]
+	SSE2_GetSad4x16
+	movhlps xmm0, xmm7
+	paddw xmm0, xmm7
+	movd retrd, xmm0
+	LOAD_4_PARA_POP
+%ifdef X86_32
+	pop  r5
+	pop  r4
+%endif
+	ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSad16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+;The first parameter can be assumed to be 16-byte aligned;
+;in wels, the third parameter cannot be assumed to be aligned.
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSad16x8_sse2
+align 16
+WelsSampleSad16x8_sse2:
+	;push   ebx
+	;mov    eax,    [esp+8]
+	;mov    ebx,    [esp+12]
+	;mov    ecx,    [esp+16]
+	;mov    edx,    [esp+20]
+
+	%assign  push_num 0
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	movdqu xmm0,   [r2]
+	MOVDQ  xmm2,   [r0]
+	psadbw xmm0,   xmm2
+	movdqu xmm1,   [r2+r3]
+	MOVDQ  xmm2,   [r0+r1]
+	psadbw xmm1,   xmm2
+	paddw  xmm0,   xmm1
+
+	SSE2_GetSad2x16
+	SSE2_GetSad2x16
+	SSE2_GetSad2x16
+
+	movhlps     xmm1, xmm0
+	paddw       xmm0, xmm1
+	movd        retrd,  xmm0
+	LOAD_4_PARA_POP
+	ret
+
+
+
+WELS_EXTERN WelsSampleSad8x16_sse2
+WelsSampleSad8x16_sse2:
+	;push   ebx
+	;mov    eax,    [esp+8]
+	;mov    ebx,    [esp+12]
+	;mov    ecx,    [esp+16]
+	;mov    edx,    [esp+20]
+
+	%assign  push_num 0
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+    pxor   xmm6,   xmm6
+
+	SSE2_GetSad8x4
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+    SSE2_GetSad8x4
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	SSE2_GetSad8x4
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+    SSE2_GetSad8x4
+
+    movhlps    xmm0, xmm6
+	paddw      xmm0, xmm6
+	movd       retrd,  xmm0
+	LOAD_4_PARA_POP
+	ret
+
+
+%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
+and    %1,  0x1f|(%3>>1)
+cmp    %1,  (32-%2)|(%3>>1)
+%endmacro
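+
+; With the cacheline sizes used here (32 or 64, encoded through the %3>>1
+; bit trick), the check above is equivalent to asking whether a width-byte
+; unaligned load at the given address would straddle a cacheline boundary.
+; A C sketch of the predicate (a following jle takes the "no split" path
+; when it is false):
+;
+;   #include <stdint.h>
+;
+;   static int IsCachelineSplit (uintptr_t uiAddr, int iWidth, int iCacheline) {
+;     return (int) (uiAddr % (uintptr_t) iCacheline) > iCacheline - iWidth;
+;   }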
+
+WELS_EXTERN WelsSampleSad8x8_sse21
+WelsSampleSad8x8_sse21:
+    ;mov    ecx,    [esp+12]
+	;mov    edx,    ecx
+    ;CACHE_SPLIT_CHECK edx, 8, 64
+	;jle    near   .pixel_sad_8x8_nsplit
+	;push   ebx
+	;push   edi
+	;mov    eax,    [esp+12]
+	;mov    ebx,    [esp+16]
+
+	%assign  push_num 0
+	mov		r2,  arg3
+	push	r2
+	CACHE_SPLIT_CHECK r2, 8, 64
+	jle    near   .pixel_sad_8x8_nsplit
+	pop		r2
+%ifdef X86_32
+	push	r3
+	push	r4
+	push	r5
+%endif
+	%assign  push_num 3
+	mov		r0,  arg1
+	mov		r1,  arg2
+	SIGN_EXTENTION r1, r1d
+    pxor   xmm7,   xmm7
+
+    ;register mapping: ecx->r2, edx->r4, edi->r5
+
+    mov    r5,    r2
+    and    r5,    0x07
+    sub    r2,    r5
+    mov    r4,    8
+    sub    r4,    r5
+
+    shl    r5,    3
+    shl    r4,    3
+    movd   xmm5,   r5d
+    movd   xmm6,   r4d
+	mov    r5,    8
+	add    r5,    r2
+    mov    r3,    arg4
+	SIGN_EXTENTION r3, r3d
+    movq   xmm0,   [r0]
+	movhps xmm0,   [r0+r1]
+
+	movq   xmm1,   [r2]
+	movq   xmm2,   [r5]
+	movhps xmm1,   [r2+r3]
+	movhps xmm2,   [r5+r3]
+	psrlq  xmm1,   xmm5
+	psllq  xmm2,   xmm6
+	por    xmm1,   xmm2
+
+	psadbw xmm0,   xmm1
+	paddw  xmm7,   xmm0
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	lea    r5,    [r5+2*r3]
+
+    movq   xmm0,   [r0]
+	movhps xmm0,   [r0+r1]
+
+	movq   xmm1,   [r2]
+	movq   xmm2,   [r5]
+	movhps xmm1,   [r2+r3]
+	movhps xmm2,   [r5+r3]
+	psrlq  xmm1,   xmm5
+	psllq  xmm2,   xmm6
+	por    xmm1,   xmm2
+
+	psadbw xmm0,   xmm1
+	paddw  xmm7,   xmm0
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	lea    r5,    [r5+2*r3]
+
+    movq   xmm0,   [r0]
+	movhps xmm0,   [r0+r1]
+
+	movq   xmm1,   [r2]
+	movq   xmm2,   [r5]
+	movhps xmm1,   [r2+r3]
+	movhps xmm2,   [r5+r3]
+	psrlq  xmm1,   xmm5
+	psllq  xmm2,   xmm6
+	por    xmm1,   xmm2
+
+	psadbw xmm0,   xmm1
+	paddw  xmm7,   xmm0
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	lea    r5,    [r5+2*r3]
+
+    movq   xmm0,   [r0]
+	movhps xmm0,   [r0+r1]
+
+	movq   xmm1,   [r2]
+	movq   xmm2,   [r5]
+	movhps xmm1,   [r2+r3]
+	movhps xmm2,   [r5+r3]
+	psrlq  xmm1,   xmm5
+	psllq  xmm2,   xmm6
+	por    xmm1,   xmm2
+
+	psadbw xmm0,   xmm1
+	paddw  xmm7,   xmm0
+
+    movhlps    xmm0, xmm7
+	paddw      xmm0, xmm7
+	movd       retrd,  xmm0
+%ifdef X86_32
+	pop	 r5
+	pop	 r4
+	pop	 r3
+%endif
+	jmp        .return
+
+.pixel_sad_8x8_nsplit:
+    ;push   ebx
+    ;mov    eax,    [esp+8]
+	;mov    ebx,    [esp+12]
+	;mov    edx,    [esp+20]
+
+	pop r2
+	%assign  push_num 0
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	pxor   xmm6,   xmm6
+	SSE2_GetSad8x4
+    lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+    SSE2_GetSad8x4
+    movhlps    xmm0, xmm6
+	paddw      xmm0, xmm6
+	movd       retrd,  xmm0
+	LOAD_4_PARA_POP
+.return:
+	ret
+
+
+;***********************************************************************
+;
+;Pixel_sad_wxh_sse2 END
+;
+;***********************************************************************
+
+
+;***********************************************************************
+;
+;Pixel_sad_4_wxh_sse2 BEGIN
+;
+;***********************************************************************
+
+
+%macro SSE2_Get4LW16Sad 5 ;src(row-1), src(row), src(row+1), ref row, ref address
+	psadbw %1,   %4
+	paddw  xmm5, %1
+	psadbw %4,   %3
+	paddw  xmm4, %4
+	movdqu %4,   [%5-1]
+	psadbw %4,   %2
+	paddw  xmm6, %4
+	movdqu %4,   [%5+1]
+	psadbw %4,   %2
+	paddw  xmm7, %4
+%endmacro
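+
+; The SadFour routines below score four motion-search neighbours of a
+; reference position in one pass, storing the results in the order
+; top (pRef - stride), bottom (pRef + stride), left (pRef - 1),
+; right (pRef + 1). A C sketch of the contract, with Sad16x16_c as a
+; hypothetical scalar helper:
+;
+;   #include <stdint.h>
+;
+;   int32_t Sad16x16_c (const uint8_t*, int32_t, const uint8_t*, int32_t);
+;
+;   static void SadFour16x16_c (const uint8_t* pSrc, int32_t iStrideSrc,
+;                               const uint8_t* pRef, int32_t iStrideRef,
+;                               int32_t* pSad) {
+;     pSad[0] = Sad16x16_c (pSrc, iStrideSrc, pRef - iStrideRef, iStrideRef);
+;     pSad[1] = Sad16x16_c (pSrc, iStrideSrc, pRef + iStrideRef, iStrideRef);
+;     pSad[2] = Sad16x16_c (pSrc, iStrideSrc, pRef - 1,          iStrideRef);
+;     pSad[3] = Sad16x16_c (pSrc, iStrideSrc, pRef + 1,          iStrideRef);
+;   }
+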
+WELS_EXTERN WelsSampleSadFour16x16_sse2
+WelsSampleSadFour16x16_sse2:
+	;push ebx
+	;mov    eax,    [esp+8]
+	;mov    ebx,    [esp+12]
+	;mov    ecx,    [esp+16]
+	;mov    edx,    [esp+20]
+
+	%assign  push_num 0
+	LOAD_5_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
+	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
+	pxor   xmm6,   xmm6    ;sad pRefMb-1
+	pxor   xmm7,   xmm7    ;sad pRefMb+1
+	movdqa xmm0,   [r0]
+	sub    r2,    r3
+	movdqu xmm3,   [r2]
+	psadbw xmm3,   xmm0
+	paddw  xmm4,   xmm3
+
+	movdqa xmm1,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	psadbw xmm3,   xmm1
+	paddw  xmm4,   xmm3
+
+	movdqu xmm2,   [r2+r3-1]
+	psadbw xmm2,   xmm0
+	paddw  xmm6,   xmm2
+
+	movdqu xmm3,   [r2+r3+1]
+	psadbw xmm3,   xmm0
+	paddw  xmm7,   xmm3
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm2,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+	movdqa xmm0,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm1,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
+	movdqa xmm2,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm0,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
+	movdqa xmm1,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm2,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+	movdqa xmm0,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm1,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
+	movdqa xmm2,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm0,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
+	movdqa xmm1,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm2,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+	movdqa xmm0,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+	lea    r2,    [r2+2*r3]
+	movdqu xmm3,   [r2]
+	psadbw xmm2,   xmm3
+	paddw xmm5,   xmm2
+
+	movdqu xmm2,   [r2-1]
+	psadbw xmm2,   xmm0
+	paddw xmm6,   xmm2
+
+	movdqu xmm3,   [r2+1]
+	psadbw xmm3,   xmm0
+	paddw xmm7,   xmm3
+
+	movdqu xmm3,   [r2+r3]
+	psadbw xmm0,   xmm3
+	paddw xmm5,   xmm0
+
+	;mov        ecx,  [esp+24]
+	movhlps    xmm0, xmm4
+	paddw      xmm4, xmm0
+	movhlps    xmm0, xmm5
+	paddw      xmm5, xmm0
+	movhlps    xmm0, xmm6
+	paddw      xmm6, xmm0
+	movhlps    xmm0, xmm7
+	paddw      xmm7, xmm0
+	punpckldq  xmm4, xmm5
+	punpckldq  xmm6, xmm7
+	punpcklqdq xmm4, xmm6
+	movdqa     [r4],xmm4
+	LOAD_5_PARA_POP
+	ret
+
+
+WELS_EXTERN WelsSampleSadFour16x8_sse2
+WelsSampleSadFour16x8_sse2:
+	;push ebx
+	;push edi
+	;mov    eax,    [esp+12]
+	;mov    ebx,    [esp+16]
+	;mov    edi,    [esp+20]
+	;mov    edx,    [esp+24]
+
+	%assign  push_num 0
+	LOAD_5_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
+	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
+	pxor   xmm6,   xmm6    ;sad pRefMb-1
+	pxor   xmm7,   xmm7    ;sad pRefMb+1
+	movdqa xmm0,   [r0]
+	sub    r2,    r3
+	movdqu xmm3,   [r2]
+	psadbw xmm3,   xmm0
+	paddw xmm4,   xmm3
+
+	movdqa xmm1,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	psadbw xmm3,   xmm1
+	paddw xmm4,   xmm3
+
+	movdqu xmm2,   [r2+r3-1]
+	psadbw xmm2,   xmm0
+	paddw xmm6,   xmm2
+
+	movdqu xmm3,   [r2+r3+1]
+	psadbw xmm3,   xmm0
+	paddw xmm7,   xmm3
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm2,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+	movdqa xmm0,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm1,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
+	movdqa xmm2,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm0,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
+	movdqa xmm1,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
+	lea    r2,    [r2+2*r3]
+	movdqu xmm3,   [r2]
+	psadbw xmm0,   xmm3
+	paddw xmm5,   xmm0
+
+	movdqu xmm0,   [r2-1]
+	psadbw xmm0,   xmm1
+	paddw xmm6,   xmm0
+
+	movdqu xmm3,   [r2+1]
+	psadbw xmm3,   xmm1
+	paddw xmm7,   xmm3
+
+	movdqu xmm3,   [r2+r3]
+	psadbw xmm1,   xmm3
+	paddw xmm5,   xmm1
+
+	;mov        edi,  [esp+28]
+	movhlps    xmm0, xmm4
+	paddw      xmm4, xmm0
+	movhlps    xmm0, xmm5
+	paddw      xmm5, xmm0
+	movhlps    xmm0, xmm6
+	paddw      xmm6, xmm0
+	movhlps    xmm0, xmm7
+	paddw      xmm7, xmm0
+	punpckldq  xmm4, xmm5
+	punpckldq  xmm6, xmm7
+	punpcklqdq xmm4, xmm6
+	movdqa     [r4],xmm4
+	LOAD_5_PARA_POP
+	ret
+
+WELS_EXTERN WelsSampleSadFour8x16_sse2
+WelsSampleSadFour8x16_sse2:
+	;push ebx
+	;push edi
+	;mov    eax,    [esp+12]
+	;mov    ebx,    [esp+16]
+	;mov    edi,    [esp+20]
+	;mov    edx,    [esp+24]
+
+	%assign  push_num 0
+	LOAD_5_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
+	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
+	pxor   xmm6,   xmm6    ;sad pRefMb-1
+	pxor   xmm7,   xmm7    ;sad pRefMb+1
+	movq   xmm0,   [r0]
+	movhps xmm0,   [r0+r1]
+	sub    r2,    r3
+	movq   xmm3,   [r2]
+	movhps xmm3,   [r2+r3]
+	psadbw xmm3,   xmm0
+	paddw  xmm4,   xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	;mov        edi,  [esp+28]
+	movhlps    xmm0, xmm4
+	paddw      xmm4, xmm0
+	movhlps    xmm0, xmm5
+	paddw      xmm5, xmm0
+	movhlps    xmm0, xmm6
+	paddw      xmm6, xmm0
+	movhlps    xmm0, xmm7
+	paddw      xmm7, xmm0
+	punpckldq  xmm4, xmm5
+	punpckldq  xmm6, xmm7
+	punpcklqdq xmm4, xmm6
+	movdqa     [r4],xmm4
+	LOAD_5_PARA_POP
+	ret
+
+
+WELS_EXTERN WelsSampleSadFour8x8_sse2
+WelsSampleSadFour8x8_sse2:
+	;push ebx
+	;push edi
+	;mov    eax,    [esp+12]
+	;mov    ebx,    [esp+16]
+	;mov    edi,    [esp+20]
+	;mov    edx,    [esp+24]
+
+	%assign  push_num 0
+	LOAD_5_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
+	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
+	pxor   xmm6,   xmm6    ;sad pRefMb-1
+	pxor   xmm7,   xmm7    ;sad pRefMb+1
+	movq   xmm0,   [r0]
+	movhps xmm0,   [r0+r1]
+	sub    r2,    r3
+	movq   xmm3,   [r2]
+	movhps xmm3,   [r2+r3]
+	psadbw xmm3,   xmm0
+	paddw  xmm4,   xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	;mov        edi,  [esp+28]
+	movhlps    xmm0, xmm4
+	paddw      xmm4, xmm0
+	movhlps    xmm0, xmm5
+	paddw      xmm5, xmm0
+	movhlps    xmm0, xmm6
+	paddw      xmm6, xmm0
+	movhlps    xmm0, xmm7
+	paddw      xmm7, xmm0
+	punpckldq  xmm4, xmm5
+	punpckldq  xmm6, xmm7
+	punpcklqdq xmm4, xmm6
+	movdqa     [r4],xmm4
+	LOAD_5_PARA_POP
+	ret
+
+WELS_EXTERN WelsSampleSadFour4x4_sse2
+WelsSampleSadFour4x4_sse2:
+	;push ebx
+	;push edi
+	;mov    eax,    [esp+12]
+	;mov    ebx,    [esp+16]
+	;mov    edi,    [esp+20]
+	;mov    edx,    [esp+24]
+
+	%assign  push_num 0
+	LOAD_5_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	movd   xmm0,   [r0]
+	movd   xmm1,   [r0+r1]
+	lea        r0,    [r0+2*r1]
+	movd       xmm2,   [r0]
+	movd       xmm3,   [r0+r1]
+	punpckldq  xmm0, xmm1
+	punpckldq  xmm2, xmm3
+	punpcklqdq xmm0, xmm2
+	sub        r2,  r3
+	movd       xmm1, [r2]
+	movd       xmm2, [r2+r3]
+	punpckldq  xmm1, xmm2
+	movd       xmm2, [r2+r3-1]
+	movd       xmm3, [r2+r3+1]
+
+	lea        r2,  [r2+2*r3]
+
+	movd       xmm4, [r2]
+	movd       xmm5, [r2-1]
+	punpckldq  xmm2, xmm5
+	movd       xmm5, [r2+1]
+	punpckldq  xmm3, xmm5
+
+	movd       xmm5, [r2+r3]
+	punpckldq  xmm4, xmm5
+
+	punpcklqdq xmm1, xmm4 ;-L
+
+	movd       xmm5, [r2+r3-1]
+	movd       xmm6, [r2+r3+1]
+
+	lea        r2,  [r2+2*r3]
+	movd       xmm7, [r2-1]
+	punpckldq  xmm5, xmm7
+	punpcklqdq xmm2, xmm5 ;-1
+	movd       xmm7, [r2+1]
+	punpckldq  xmm6, xmm7
+	punpcklqdq xmm3, xmm6 ;+1
+	movd       xmm6, [r2]
+	movd       xmm7, [r2+r3]
+	punpckldq  xmm6, xmm7
+	punpcklqdq xmm4, xmm6 ;+L
+	psadbw     xmm1, xmm0
+	psadbw     xmm2, xmm0
+	psadbw     xmm3, xmm0
+	psadbw     xmm4, xmm0
+
+	movhlps    xmm0, xmm1
+	paddw      xmm1, xmm0
+	movhlps    xmm0, xmm2
+	paddw      xmm2, xmm0
+	movhlps    xmm0, xmm3
+	paddw      xmm3, xmm0
+	movhlps    xmm0, xmm4
+	paddw      xmm4, xmm0
+	;mov        edi,  [esp+28]
+	punpckldq  xmm1, xmm4
+	punpckldq  xmm2, xmm3
+	punpcklqdq xmm1, xmm2
+	movdqa     [r4],xmm1
+	LOAD_5_PARA_POP
+	ret
+
+;***********************************************************************
+;
+;Pixel_sad_4_wxh_sse2 END
+;
+;***********************************************************************
+
+WELS_EXTERN WelsSampleSad4x4_mmx
+
+align 16
+;***********************************************************************
+;   int32_t __cdecl WelsSampleSad4x4_mmx (uint8_t *, int32_t, uint8_t *, int32_t )
+;***********************************************************************
+WelsSampleSad4x4_mmx:
+    ;push    ebx
+	;%define pushsize     4
+	;%define pix1address	 esp+pushsize+4
+	;%define pix1stride   esp+pushsize+8
+	;%define pix2address  esp+pushsize+12
+	;%define pix2stride   esp+pushsize+16
+    ;mov		  eax, [pix1address]
+    ;mov		  ebx, [pix1stride ]
+    ;mov		  ecx, [pix2address]
+    ;mov		  edx, [pix2stride ]
+
+    %assign  push_num 0
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	movd	  mm0, [r0]
+	movd	  mm1, [r0+r1]
+	punpckldq mm0, mm1
+
+	movd      mm3, [r2]
+	movd      mm4, [r2+r3]
+	punpckldq mm3, mm4
+	psadbw    mm0, mm3
+
+	lea       r0, [r0+2*r1]
+	lea       r2, [r2+2*r3]
+
+	movd      mm1, [r0]
+	movd      mm2, [r0+r1]
+	punpckldq mm1, mm2
+
+	movd      mm3, [r2]
+	movd      mm4, [r2+r3]
+	punpckldq mm3, mm4
+	psadbw    mm1, mm3
+	paddw     mm0, mm1
+
+    movd      retrd, mm0
+
+	WELSEMMS
+    LOAD_4_PARA_POP
+    ret
--- a/codec/processing/src/asm/denoisefilter.asm
+++ b/codec/processing/src/asm/denoisefilter.asm
@@ -179,15 +179,15 @@
 ;%define         stride r1
 
 BilateralLumaFilter8_sse2:
-       
-        push r3 
+
+        push r3
         %assign push_num 1
         LOAD_2_PARA
 
 		pxor		xmm7,	xmm7
-	
+
 		mov         r3,     r0
-		
+
 		movq        xmm6,   [r0]
 		punpcklbw	xmm6,	xmm7
 		movdqa		xmm3,	[sse2_32]
@@ -218,10 +218,10 @@
 		packuswb	xmm5,	xmm5
 		movq		[r3],	xmm5
 
-       
+
 		pop r3
 		%assign push_num 0
-		
+
 		ret
 
 WELS_EXTERN WaverageChromaFilter8_sse2
@@ -239,11 +239,11 @@
 WaverageChromaFilter8_sse2:
 
         push r3
-       
+
         %assign push_num 1
-        
+
         LOAD_2_PARA
-        
+
         mov		r3,	r1
 		add		r3,	r3
 		sub		r0,	r3			; pixels - 2 * stride
@@ -272,8 +272,8 @@
 		packuswb	xmm3,		xmm3
 		movq		[r0 + 2],		xmm3
 
-              
+
         pop r3
-        
+
         %assign push_num 0
 		ret
--- a/codec/processing/src/asm/sad.asm
+++ b/codec/processing/src/asm/sad.asm
@@ -84,7 +84,7 @@
 	;push   edi
 	;mov    eax,    [esp+12]
 	;mov    ebx,    [esp+16]
-	
+
 	%assign  push_num 0
 	mov		r2,  arg3
 	push	r2
@@ -91,7 +91,7 @@
 	CACHE_SPLIT_CHECK r2, 8, 64
 	jle    near   .pixel_sad_8x8_nsplit
 	pop		r2
-%ifdef X86_32	
+%ifdef X86_32
 	push	r3
 	push	r4
 	push	r5
@@ -98,10 +98,10 @@
 %endif
 	%assign  push_num 3
 	mov		r0,  arg1
-	mov		r1,  arg2	
+	mov		r1,  arg2
 	SIGN_EXTENTION r1, r1d
     pxor   xmm7,   xmm7
-    
+
     ;ecx r2, edx r4, edi r5
 
     mov    r5,    r2
@@ -195,18 +195,18 @@
 	pop	 r3
 %endif
 	jmp        .return
-	
+
 .pixel_sad_8x8_nsplit:
     ;push   ebx
     ;mov    eax,    [esp+8]
 	;mov    ebx,    [esp+12]
 	;mov    edx,    [esp+20]
-	
+
 	pop r2
 	%assign  push_num 0
 	LOAD_4_PARA
 	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d	
+	SIGN_EXTENTION r3, r3d
 	pxor   xmm6,   xmm6
 	SSE2_GetSad8x4
     lea    r0,    [r0+2*r1]