shithub: openh264

Download patch

ref: fb1958ad13d2172f3b2091fc88b1715bd358a705
parent: b7b01faf29d039578d7a8c9d36d1b4ed25a0ae93
parent: 4633626d69220ed0a1776fbe510454a94279f80e
author: volvet <qizh@cisco.com>
date: Tue Mar 18 11:04:54 EDT 2014

Merge pull request #519 from mstorsjo/push-xmm-registers

Backup/restore the xmm6-xmm15 SSE registers within asm functions on win64

Reviewed by zhiliang

--- a/codec/common/asm_inc.asm
+++ b/codec/common/asm_inc.asm
@@ -335,6 +335,82 @@
     %endif
 %endmacro
 
+%macro PUSH_XMM 1 ; argument = number of xmm registers the function uses; on WIN64 every used register from xmm6 upward is saved on the stack
+    %ifdef WIN64 ; expands to nothing on other targets
+        %assign xmm_num_regs %1
+        %if xmm_num_regs > 6 ; xmm_num_regs stays defined so that POP_XMM knows how many registers to restore
+            %ifdef push_num ; add 2 per saved register; presumably push_num counts 8-byte stack slots used for stack-argument offsets - confirm against the argN macros
+                %assign push_num push_num+2*(%1-6)
+            %endif
+            sub rsp, 16*(%1 - 6) ; reserve one 16-byte slot per saved register
+            movdqu [rsp], xmm6 ; unaligned store, so rsp does not have to be 16-byte aligned here
+        %endif
+        %if xmm_num_regs > 7 ; each further register is stored in the next 16-byte slot, only when the function uses it
+            movdqu [rsp+16], xmm7
+        %endif
+        %if xmm_num_regs > 8
+            movdqu [rsp+32], xmm8
+        %endif
+        %if xmm_num_regs > 9
+            movdqu [rsp+48], xmm9
+        %endif
+        %if xmm_num_regs > 10
+            movdqu [rsp+64], xmm10
+        %endif
+        %if xmm_num_regs > 11
+            movdqu [rsp+80], xmm11
+        %endif
+        %if xmm_num_regs > 12
+            movdqu [rsp+96], xmm12
+        %endif
+        %if xmm_num_regs > 13
+            movdqu [rsp+112], xmm13
+        %endif
+        %if xmm_num_regs > 14
+            movdqu [rsp+128], xmm14
+        %endif
+        %if xmm_num_regs > 15 ; an argument of 16 saves the full xmm6-xmm15 range (160 bytes)
+            movdqu [rsp+144], xmm15
+        %endif
+    %endif
+%endmacro
+
+%macro POP_XMM 0 ; on WIN64, reload the registers stored by PUSH_XMM and release their stack area
+    %ifdef WIN64 ; expands to nothing on other targets
+        %if xmm_num_regs > 15 ; xmm_num_regs is the value assigned by the PUSH_XMM that precedes this point in the source text
+            movdqu xmm15, [rsp+144] ; rsp must hold the value it had right after PUSH_XMM, since the offsets mirror the stores there
+        %endif
+        %if xmm_num_regs > 14
+            movdqu xmm14, [rsp+128]
+        %endif
+        %if xmm_num_regs > 13
+            movdqu xmm13, [rsp+112]
+        %endif
+        %if xmm_num_regs > 12
+            movdqu xmm12, [rsp+96]
+        %endif
+        %if xmm_num_regs > 11
+            movdqu xmm11, [rsp+80]
+        %endif
+        %if xmm_num_regs > 10
+            movdqu xmm10, [rsp+64]
+        %endif
+        %if xmm_num_regs > 9
+            movdqu xmm9, [rsp+48]
+        %endif
+        %if xmm_num_regs > 8
+            movdqu xmm8, [rsp+32]
+        %endif
+        %if xmm_num_regs > 7
+            movdqu xmm7, [rsp+16]
+        %endif
+        %if xmm_num_regs > 6
+            movdqu xmm6, [rsp]
+            add rsp, 16*(xmm_num_regs - 6) ; free the save area; push_num is not adjusted back by this macro
+        %endif
+    %endif
+%endmacro
+
 %macro SIGN_EXTENSION 2
     %ifndef X86_32
             movsxd %1, %2
--- a/codec/common/cpu.cpp
+++ b/codec/common/cpu.cpp
@@ -207,9 +207,6 @@
   }
 }
 
-void WelsXmmRegEmptyOp(void * pSrc) {
-}
-
 #elif defined(HAVE_NEON) //For supporting both android platform and iOS platform
 #if defined(ANDROID_NDK)
 uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors)
--- a/codec/common/cpu.h
+++ b/codec/common/cpu.h
@@ -67,60 +67,14 @@
  */
 void     WelsCPURestore (const uint32_t kuiCPU);
 
-#ifdef  WIN64
-void     WelsXmmRegStore(void * src);
-void     WelsXmmRegLoad(void * src);
-#endif
-
 #else
 #define WelsEmms()
 #endif
 
-void     WelsXmmRegEmptyOp(void * pSrc);
-
 uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors);
 
 #if defined(__cplusplus)
 }
 #endif//__cplusplus
-
-typedef  void (*WelsXmmRegProtectFunc)(void * pSrc);
-
-
-#if defined(WIN64) && defined(X86_ASM)
-#define   XMMREG_PROTECT_DECLARE(name) \
-  WelsXmmRegProtectFunc name##load;\
-  WelsXmmRegProtectFunc name##store;\
-  uint8_t               name##Buffer[160];
-
-#define   XMMREG_PROTECT_INIT(name) \
-  { \
-    uint32_t uiCpuFlag = WelsCPUFeatureDetect(NULL);\
-    if( uiCpuFlag & WELS_CPU_SSE2 ){\
-      name##load = WelsXmmRegLoad;\
-      name##store = WelsXmmRegStore; \
-    } else { \
-      name##load = WelsXmmRegEmptyOp; \
-      name##store = WelsXmmRegEmptyOp; \
-    } \
-  }
-
-#define   XMMREG_PROTECT_UNINIT(name) \
-
-#define   XMMREG_PROTECT_STORE(name) \
-  name##store(name##Buffer);
-
-#define   XMMREG_PROTECT_LOAD(name) \
-  name##load(name##Buffer);
-
-#else
-
-#define   XMMREG_PROTECT_DECLARE(name)
-#define   XMMREG_PROTECT_INIT(name)
-#define   XMMREG_PROTECT_UNINIT(name)
-#define   XMMREG_PROTECT_STORE(name)
-#define   XMMREG_PROTECT_LOAD(name)
-
-#endif
 
 #endif//WELS_CPU_DETECTION_H__
--- a/codec/common/cpuid.asm
+++ b/codec/common/cpuid.asm
@@ -210,44 +210,3 @@
 	emms	; empty mmx technology states
 	ret
 
-
-%ifdef     WIN64
-
-WELS_EXTERN WelsXmmRegStore
-ALIGN 16
-;******************************************************************************************
-;   void WelsXmmRegStore(void *src)
-;******************************************************************************************
-WelsXmmRegStore:
-  movdqu [rcx], xmm6
-  movdqu [rcx+16], xmm7
-  movdqu [rcx+32], xmm8
-  movdqu [rcx+48], xmm9
-  movdqu [rcx+64], xmm10
-  movdqu [rcx+80], xmm11
-  movdqu [rcx+96], xmm12
-  movdqu [rcx+112], xmm13
-  movdqu [rcx+128], xmm14
-  movdqu [rcx+144], xmm15
-  ret
-
-WELS_EXTERN WelsXmmRegLoad
-ALIGN 16
-;******************************************************************************************
-;   void WelsXmmRegLoad(void *src)
-;******************************************************************************************
-WelsXmmRegLoad:
-  movdqu xmm6, [rcx]
-  movdqu xmm7, [rcx+16]
-  movdqu xmm8, [rcx+32]
-  movdqu xmm9, [rcx+48]
-  movdqu xmm10, [rcx+64]
-  movdqu xmm11, [rcx+80]
-  movdqu xmm12, [rcx+96]
-  movdqu xmm13, [rcx+112]
-  movdqu xmm14, [rcx+128]
-  movdqu xmm15, [rcx+144]
-  ret
-%endif
-
-
--- a/codec/common/deblock.asm
+++ b/codec/common/deblock.asm
@@ -63,6 +63,7 @@
 WELS_EXTERN   DeblockLumaLt4V_ssse3
   push        rbp
   mov         r11,[rsp + 16 + 20h]  ; pTC
+  PUSH_XMM 16
   sub         rsp,1B0h
   lea         rbp,[rsp+20h]
   movd        xmm4,r8d
@@ -311,6 +312,7 @@
   movdqa      [r12+rcx],xmm0
   mov         r12,qword [rbp+180h]
   lea         rsp,[rbp+190h]
+  POP_XMM
   pop         rbp
   ret
 
@@ -779,6 +781,7 @@
   mov         rax,rsp
   push        rbx
   push        rdi
+  PUSH_XMM 16
   sub         rsp,0C8h
   mov         r10,qword [rax + 30h]  ; pTC
   pxor        xmm1,xmm1
@@ -833,7 +836,7 @@
   punpckhbw   xmm2,xmm1
   punpcklbw   xmm14,xmm1
   movd        xmm0,eax
-  movsx       eax,word [rsp + 0C8h + 38h] ; iBeta
+  movsx       eax,word [rsp + 0C8h + 38h + 160] ; iBeta
   punpckhbw   xmm13,xmm1
   punpckhbw   xmm15,xmm1
   movdqa      xmm3,xmm9
@@ -929,6 +932,7 @@
   movq        [rdi],xmm14
   movaps      xmm14,[rsp+30h]
   mov         rsp,r11
+  POP_XMM
   pop         rdi
   pop         rbx
   ret
@@ -937,6 +941,7 @@
 WELS_EXTERN   DeblockChromaEq4V_ssse3
   mov         rax,rsp
   push        rbx
+  PUSH_XMM 15
   sub         rsp,90h
   pxor        xmm1,xmm1
   mov         r11,rcx
@@ -973,7 +978,7 @@
   punpcklbw   xmm9,xmm1
   punpckhbw   xmm10,xmm1
   movd        xmm0,eax
-  movsx       eax,word [rsp + 90h + 8h + 28h]   ; iBeta
+  movsx       eax,word [rsp + 90h + 8h + 28h + 144]   ; iBeta
   punpckhbw   xmm13,xmm1
   movdqa      xmm7,xmm12
   punpcklwd   xmm0,xmm0
@@ -1079,6 +1084,7 @@
   movaps      xmm12,[r11-70h]
   movaps      xmm13,[r11-80h]
   mov         rsp,r11
+  POP_XMM
   pop         rbx
   ret
 
@@ -1090,6 +1096,7 @@
   mov         rax,rsp
   mov         [rax+20h],rbx
   push        rdi
+  PUSH_XMM 16
   sub         rsp,140h
   mov         rdi,rdx
   lea         eax,[r8*4]
@@ -1182,7 +1189,7 @@
   movd        xmm0,eax
   movdqa      xmm4,xmm12
   movdqa      xmm8,xmm11
-  movsx       eax,word [rsp+170h] ; iBeta
+  movsx       eax,word [rsp+170h + 160] ; iBeta
   punpcklwd   xmm0,xmm0
   punpcklbw   xmm4,xmm1
   punpckhbw   xmm12,xmm1
@@ -1340,9 +1347,9 @@
   mov         [rbx+r10*2],eax
   mov         eax,[rsp+7Ch]
   mov         [rdx+rbx],eax
-  lea         r11,[rsp+140h]
-  mov         rbx, [r11+28h]
-  mov         rsp,r11
+  lea         rsp,[rsp+140h]
+  POP_XMM
+  mov         rbx, [rsp+28h]
   pop         rdi
   ret
 
@@ -1355,6 +1362,7 @@
   push        rsi
   push        rdi
   push        r12
+  PUSH_XMM 16
   sub         rsp,170h
 
   movsxd      rsi,r8d
@@ -1438,7 +1446,7 @@
   punpckhdq   xmm7,xmm0
   movdqa      xmm0,xmm1
   punpckldq   xmm0,xmm5
-  mov         rax, [rsp+1C8h]    ; pTC
+  mov         rax, [rsp+1C8h+160]    ; pTC
   punpckhdq   xmm1,xmm5
   movdqa      xmm9,xmm6
   punpckhqdq  xmm6,xmm0
@@ -1476,7 +1484,7 @@
   punpckhbw   xmm9,xmm1
   punpckhbw   xmm8,xmm1
   punpcklwd   xmm0,xmm0
-  movsx       eax,word [rsp+1C0h]   ; iBeta
+  movsx       eax,word [rsp+1C0h+160]   ; iBeta
   mov         word [rsp+4],r8w
   mov         word [rsp+2],r9w
   pshufd      xmm12,xmm0,0
@@ -1620,6 +1628,7 @@
   mov         [r10+rbp],eax
   lea         r11,[rsp+170h]
   mov         rsp,r11
+  POP_XMM
   pop         r12
   pop         rdi
   pop         rsi
@@ -5132,6 +5141,7 @@
 
 %assign   push_num   3
     LOAD_3_PARA
+    PUSH_XMM 8
 
     SIGN_EXTENSION   r1, r1d
 
@@ -5188,6 +5198,7 @@
     movdqa  [r2 + 70h],  xmm0
 
     mov     r7,   r5
+    POP_XMM
     pop     r5
     pop     r4
     pop     r3
@@ -5206,6 +5217,7 @@
 
 %assign  push_num 2
     LOAD_3_PARA
+    PUSH_XMM 8
 
     SIGN_EXTENSION   r1, r1d
 
@@ -5263,6 +5275,7 @@
 
 
     mov      r7,   r4
+    POP_XMM
     pop      r4
     pop      r3
     ret
--- a/codec/common/expand_picture.asm
+++ b/codec/common/expand_picture.asm
@@ -360,6 +360,7 @@
 
     %assign push_num 3
     LOAD_4_PARA
+    PUSH_XMM 7
 
     SIGN_EXTENSION r1, r1d
     SIGN_EXTENSION r2, r2d
@@ -461,6 +462,7 @@
     ; for left & right border expanding
     exp_cross_sse2 32,a
 
+    POP_XMM
     LOAD_4_PARA_POP
 
     pop r6
@@ -486,6 +488,7 @@
 
     %assign push_num 3
     LOAD_4_PARA
+    PUSH_XMM 7
 
     SIGN_EXTENSION r1,r1d
     SIGN_EXTENSION r2,r2d
@@ -586,6 +589,7 @@
     ; for left & right border expanding
     exp_cross_sse2 16,a
 
+    POP_XMM
     LOAD_4_PARA_POP
 
     pop r6
@@ -610,6 +614,7 @@
 
     %assign push_num 3
     LOAD_4_PARA
+    PUSH_XMM 7
 
     SIGN_EXTENSION r1,r1d
     SIGN_EXTENSION r2,r2d
@@ -710,6 +715,7 @@
     ; for left & right border expanding
     exp_cross_sse2 16,u
 
+    POP_XMM
     LOAD_4_PARA_POP
 
     pop r6
--- a/codec/common/mb_copy.asm
+++ b/codec/common/mb_copy.asm
@@ -67,6 +67,7 @@
 	push r5
 	%assign  push_num 2
     LOAD_4_PARA
+    PUSH_XMM 8
 
 	lea r4, [r1+2*r1]	;ebx, [eax+2*eax]	; x3
 	lea r5, [r3+2*r3]	;edx, [ecx+2*ecx]	; x3
@@ -112,6 +113,7 @@
 	movdqa [r0+r1], xmm5
 	movdqa [r0+2*r1], xmm6
 	movdqa [r0+r4], xmm7
+	POP_XMM
 	LOAD_4_PARA_POP
 	pop r5
 	pop r4
@@ -129,6 +131,7 @@
 	push r5
 	%assign  push_num 2
     LOAD_4_PARA
+    PUSH_XMM 8
 
 	lea r4, [r1+2*r1]	;ebx, [eax+2*eax]	; x3
 	lea r5, [r3+2*r3]	;edx, [ecx+2*ecx]	; x3
@@ -174,6 +177,7 @@
 	movdqa [r0+r1], xmm5
 	movdqa [r0+2*r1], xmm6
 	movdqa [r0+r4], xmm7
+	POP_XMM
 	LOAD_4_PARA_POP
 	pop r5
 	pop r4
@@ -191,6 +195,7 @@
 	push r5
 	%assign  push_num 2
     LOAD_4_PARA
+    PUSH_XMM 8
 
 	lea r4, [r1+2*r1]	;ebx, [eax+2*eax]	; x3
 	lea r5, [r3+2*r3]	;edx, [ecx+2*ecx]	; x3
@@ -214,6 +219,7 @@
 	movdqa [r0+r1], xmm5
 	movdqa [r0+2*r1], xmm6
 	movdqa [r0+r4], xmm7
+	POP_XMM
 	LOAD_4_PARA_POP
 	pop r5
 	pop r4
--- a/codec/common/mc_chroma.asm
+++ b/codec/common/mc_chroma.asm
@@ -149,6 +149,7 @@
 WELS_EXTERN McChromaWidthEq8_sse2
 	%assign  push_num 0
 	LOAD_6_PARA
+	PUSH_XMM 8
 	SIGN_EXTENSION	r1, r1d
 	SIGN_EXTENSION	r3, r3d
 	SIGN_EXTENSION	r5, r5d
@@ -208,6 +209,7 @@
 	dec r5
 	jnz near .xloop
 
+	POP_XMM
 	LOAD_6_PARA_POP
 
 	ret
@@ -226,6 +228,7 @@
 WELS_EXTERN McChromaWidthEq8_ssse3
 	%assign  push_num 0
 	LOAD_6_PARA
+	PUSH_XMM 8
 	SIGN_EXTENSION	r1, r1d
 	SIGN_EXTENSION	r3, r3d
 	SIGN_EXTENSION	r5, r5d
@@ -282,6 +285,7 @@
 	sub r5, 2
 	jnz .hloop_chroma
 
+	POP_XMM
 	LOAD_6_PARA_POP
 
 	ret
--- a/codec/common/mc_luma.asm
+++ b/codec/common/mc_luma.asm
@@ -171,6 +171,7 @@
 WELS_EXTERN McHorVer22Width8HorFirst_sse2
 	%assign  push_num 0
     LOAD_5_PARA
+    PUSH_XMM 8
 	SIGN_EXTENSION	r1, r1d
 	SIGN_EXTENSION	r3, r3d
 	SIGN_EXTENSION	r4, r4d
@@ -207,6 +208,7 @@
 	add r2, r3
 	dec r4
 	jnz .yloop_width_8
+	POP_XMM
 	LOAD_5_PARA_POP
 	ret
 
@@ -221,6 +223,7 @@
 WELS_EXTERN McHorVer20WidthEq8_sse2
 	%assign  push_num 0
     LOAD_5_PARA
+    PUSH_XMM 8
 	SIGN_EXTENSION	r1, r1d
 	SIGN_EXTENSION	r3, r3d
 	SIGN_EXTENSION	r4, r4d
@@ -261,6 +264,7 @@
 	dec r4
 	jnz near .y_loop
 
+	POP_XMM
 	LOAD_5_PARA_POP
 	ret
 
@@ -275,6 +279,7 @@
 WELS_EXTERN McHorVer20WidthEq16_sse2
 	%assign  push_num 0
     LOAD_5_PARA
+    PUSH_XMM 8
 	SIGN_EXTENSION	r1, r1d
 	SIGN_EXTENSION	r3, r3d
 	SIGN_EXTENSION	r4, r4d
@@ -341,6 +346,7 @@
 	dec r4
 	jnz near .y_loop
 
+	POP_XMM
 	LOAD_5_PARA_POP
 	ret
 
@@ -355,6 +361,7 @@
 WELS_EXTERN McHorVer02WidthEq8_sse2
 	%assign  push_num 0
     LOAD_5_PARA
+    PUSH_XMM 8
 	SIGN_EXTENSION	r1, r1d
 	SIGN_EXTENSION	r3, r3d
 	SIGN_EXTENSION	r4, r4d
@@ -424,6 +431,7 @@
 	jmp near .start
 
 .xx_exit:
+	POP_XMM
 	LOAD_5_PARA_POP
 	ret
 
@@ -446,6 +454,7 @@
 WELS_EXTERN McHorVer02Height9Or17_sse2
 	%assign  push_num 0
     LOAD_6_PARA
+    PUSH_XMM 8
 	SIGN_EXTENSION	r1, r1d
 	SIGN_EXTENSION	r3, r3d
 	SIGN_EXTENSION	r4, r4d
@@ -563,6 +572,7 @@
 	pop r13
 	pop r12
 %endif
+	POP_XMM
 	LOAD_6_PARA_POP
 	ret
 
@@ -579,6 +589,7 @@
 WELS_EXTERN McHorVer20Width9Or17_sse2
 	%assign  push_num 0
     LOAD_6_PARA
+    PUSH_XMM 8
 	SIGN_EXTENSION	r1, r1d
 	SIGN_EXTENSION	r3, r3d
 	SIGN_EXTENSION	r4, r4d
@@ -639,6 +650,7 @@
 	add r2, r3
 	dec r5
 	jnz .yloop_width_9
+	POP_XMM
 	LOAD_6_PARA_POP
 	ret
 
@@ -720,6 +732,7 @@
 	add r2, r3
 	dec r5
 	jnz .yloop_width_17
+	POP_XMM
 	LOAD_6_PARA_POP
 	ret
 
@@ -736,6 +749,7 @@
 WELS_EXTERN McHorVer22HorFirst_sse2
 	%assign  push_num 0
     LOAD_6_PARA
+    PUSH_XMM 8
 	SIGN_EXTENSION	r1, r1d
 	SIGN_EXTENSION	r3, r3d
 	SIGN_EXTENSION	r4, r4d
@@ -792,6 +806,7 @@
 	add r2, r3
 	dec r5
 	jnz .yloop_width_9
+	POP_XMM
 	LOAD_6_PARA_POP
 	ret
 
@@ -866,6 +881,7 @@
 	add r2, r3
 	dec r5
 	jnz .yloop_width_17
+	POP_XMM
 	LOAD_6_PARA_POP
 	ret
 
@@ -903,6 +919,7 @@
 WELS_EXTERN McHorVer22Width8VerLastAlign_sse2
 	%assign  push_num 0
     LOAD_6_PARA
+    PUSH_XMM 8
 	SIGN_EXTENSION	r1, r1d
 	SIGN_EXTENSION	r3, r3d
 	SIGN_EXTENSION	r4, r4d
@@ -1016,6 +1033,7 @@
 	pop r13
 	pop r12
 %endif
+	POP_XMM
 	LOAD_6_PARA_POP
 	ret
 
@@ -1032,6 +1050,7 @@
 WELS_EXTERN McHorVer22Width8VerLastUnAlign_sse2
 	%assign  push_num 0
     LOAD_6_PARA
+    PUSH_XMM 8
 	SIGN_EXTENSION	r1, r1d
 	SIGN_EXTENSION	r3, r3d
 	SIGN_EXTENSION	r4, r4d
@@ -1144,5 +1163,6 @@
 	pop r13
 	pop r12
 %endif
+	POP_XMM
 	LOAD_6_PARA_POP
 	ret
--- a/codec/common/satd_sad.asm
+++ b/codec/common/satd_sad.asm
@@ -158,6 +158,7 @@
 WELS_EXTERN WelsSampleSatd4x4_sse2
 	%assign  push_num 0
 	LOAD_4_PARA
+	PUSH_XMM 8
 	SIGN_EXTENSION r1, r1d
 	SIGN_EXTENSION r3, r3d
     movd      xmm0, [r0]
@@ -219,6 +220,7 @@
 	movd           retrd,  xmm6
     and            retrd,  0xffff
     shr            retrd,  1
+	POP_XMM
 	LOAD_4_PARA_POP
 	ret
 
@@ -230,6 +232,7 @@
 WELS_EXTERN WelsSampleSatd8x8_sse2
 	%assign  push_num 0
 	LOAD_4_PARA
+	PUSH_XMM 8
 	SIGN_EXTENSION r1, r1d
 	SIGN_EXTENSION r3, r3d
 	pxor   xmm6,   xmm6
@@ -238,6 +241,7 @@
     psrlw   xmm6,  1
 	SSE2_SumWHorizon   xmm6,xmm4,xmm7
 	movd    retrd,   xmm6
+	POP_XMM
 	LOAD_4_PARA_POP
 	ret
 
@@ -249,6 +253,7 @@
 WELS_EXTERN WelsSampleSatd8x16_sse2
 	 %assign  push_num 0
 	 LOAD_4_PARA
+	 PUSH_XMM 8
 	 SIGN_EXTENSION r1, r1d
 	 SIGN_EXTENSION r3, r3d
 	 pxor   xmm6,   xmm6
@@ -262,6 +267,7 @@
 	 psrlw   xmm6,  1
 	 SSE2_SumWHorizon   xmm6,xmm4,xmm7
 	 movd    retrd,   xmm6
+	 POP_XMM
 	 LOAD_4_PARA_POP
 	 ret
 
@@ -273,6 +279,7 @@
 WELS_EXTERN WelsSampleSatd16x8_sse2
 	%assign  push_num 0
 	LOAD_4_PARA
+	PUSH_XMM 8
 	SIGN_EXTENSION r1, r1d
 	SIGN_EXTENSION r3, r3d
 	push r0
@@ -291,6 +298,7 @@
 	psrlw   xmm6,  1
 	SSE2_SumWHorizon   xmm6,xmm4,xmm7
 	movd    retrd,   xmm6
+	POP_XMM
 	LOAD_4_PARA_POP
 	ret
 
@@ -302,6 +310,7 @@
 WELS_EXTERN WelsSampleSatd16x16_sse2
 	%assign  push_num 0
 	LOAD_4_PARA
+	PUSH_XMM 8
 	SIGN_EXTENSION r1, r1d
 	SIGN_EXTENSION r3, r3d
 	push r0
@@ -328,6 +337,7 @@
     psrlw   xmm6,  1
 	SSE2_SumWHorizon   xmm6,xmm4,xmm7
 	movd    retrd,   xmm6
+	POP_XMM
 	LOAD_4_PARA_POP
 	ret
 
@@ -976,6 +986,7 @@
 WELS_EXTERN WelsSampleSatd4x4_sse41
 	%assign  push_num 0
 	LOAD_4_PARA
+	PUSH_XMM 8
 	SIGN_EXTENSION r1, r1d
 	SIGN_EXTENSION r3, r3d
 	movdqa      xmm4,[HSwapSumSubDB1]
@@ -1017,6 +1028,7 @@
 	pabsw       xmm2,xmm2
 	pmaxsw      xmm0,xmm2
 	SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7
+	POP_XMM
 	LOAD_4_PARA_POP
 	ret
 
@@ -1032,6 +1044,7 @@
 %endif
 	%assign  push_num 2
 	LOAD_4_PARA
+	PUSH_XMM 8
 	SIGN_EXTENSION r1, r1d
 	SIGN_EXTENSION r3, r3d
 	movdqa      xmm7, [HSumSubDB1]
@@ -1043,6 +1056,7 @@
 	lea			r2,  [r2+4*r3]
 	SSE41_GetSatd8x4
 	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+	POP_XMM
 	LOAD_4_PARA_POP
 %ifdef X86_32
 	pop  r5
@@ -1063,6 +1077,7 @@
 %endif
 	%assign  push_num 3
 	LOAD_4_PARA
+	PUSH_XMM 8
 	SIGN_EXTENSION r1, r1d
 	SIGN_EXTENSION r3, r3d
 	movdqa      xmm7, [HSumSubDB1]
@@ -1078,6 +1093,7 @@
 	cmp         r6,  4
 	jl          loop_get_satd_8x16
 	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+	POP_XMM
 	LOAD_4_PARA_POP
 %ifdef X86_32
 	pop  r6
@@ -1098,6 +1114,7 @@
 %endif
 	%assign  push_num 2
 	LOAD_4_PARA
+	PUSH_XMM 8
 	SIGN_EXTENSION r1, r1d
 	SIGN_EXTENSION r3, r3d
 	push  r0
@@ -1121,6 +1138,7 @@
 	lea			r2,  [r2+4*r3]
 	SSE41_GetSatd8x4
 	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+	POP_XMM
 	LOAD_4_PARA_POP
 %ifdef X86_32
 	pop  r5
@@ -1142,6 +1160,7 @@
 %endif
 	%assign  push_num 3
 	LOAD_4_PARA
+	PUSH_XMM 8
 	SIGN_EXTENSION r1, r1d
 	SIGN_EXTENSION r3, r3d
 
@@ -1174,6 +1193,7 @@
 	cmp         r6,  4
 	jl          loop_get_satd_16x16_right
 	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+	POP_XMM
 	LOAD_4_PARA_POP
 %ifdef X86_32
 	pop  r6
@@ -1261,6 +1281,7 @@
 
 	%assign  push_num 2
 	LOAD_4_PARA
+	PUSH_XMM 8
 	SIGN_EXTENSION r1, r1d
 	SIGN_EXTENSION r3, r3d
 	lea r4, [3*r1]
@@ -1280,6 +1301,7 @@
 	movhlps xmm0, xmm7
 	paddw xmm0, xmm7
 	movd retrd, xmm0
+	POP_XMM
 	LOAD_4_PARA_POP
 %ifdef X86_32
 	pop  r5
@@ -1322,6 +1344,7 @@
 WELS_EXTERN WelsSampleSad8x16_sse2
 	%assign  push_num 0
 	LOAD_4_PARA
+	PUSH_XMM 7
 	SIGN_EXTENSION r1, r1d
 	SIGN_EXTENSION r3, r3d
     pxor   xmm6,   xmm6
@@ -1340,6 +1363,7 @@
     movhlps    xmm0, xmm6
 	paddw      xmm0, xmm6
 	movd       retrd,  xmm0
+	POP_XMM
 	LOAD_4_PARA_POP
 	ret
 
@@ -1362,6 +1386,7 @@
 	push	r5
 %endif
 	%assign  push_num 3
+	PUSH_XMM 8
 	mov		r0,  arg1
 	mov		r1,  arg2
 	SIGN_EXTENSION r1, r1d
@@ -1454,6 +1479,7 @@
     movhlps    xmm0, xmm7
 	paddw      xmm0, xmm7
 	movd       retrd,  xmm0
+	POP_XMM
 %ifdef X86_32
 	pop	 r5
 	pop	 r4
@@ -1466,6 +1492,7 @@
 	pop r2
 	%assign  push_num 0
 	LOAD_4_PARA
+	PUSH_XMM 7
 	SIGN_EXTENSION r1, r1d
 	SIGN_EXTENSION r3, r3d
 	pxor   xmm6,   xmm6
@@ -1476,6 +1503,7 @@
     movhlps    xmm0, xmm6
 	paddw      xmm0, xmm6
 	movd       retrd,  xmm0
+	POP_XMM
 	LOAD_4_PARA_POP
 .return:
 	ret
@@ -1510,6 +1538,7 @@
 WELS_EXTERN WelsSampleSadFour16x16_sse2
 	%assign  push_num 0
 	LOAD_5_PARA
+	PUSH_XMM 8
 	SIGN_EXTENSION r1, r1d
 	SIGN_EXTENSION r3, r3d
 	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
@@ -1620,6 +1649,7 @@
 	punpckldq  xmm6, xmm7
 	punpcklqdq xmm4, xmm6
 	movdqa     [r4],xmm4
+	POP_XMM
 	LOAD_5_PARA_POP
 	ret
 
@@ -1627,6 +1657,7 @@
 WELS_EXTERN WelsSampleSadFour16x8_sse2
 	%assign  push_num 0
 	LOAD_5_PARA
+	PUSH_XMM 8
 	SIGN_EXTENSION r1, r1d
 	SIGN_EXTENSION r3, r3d
 	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
@@ -1705,6 +1736,7 @@
 	punpckldq  xmm6, xmm7
 	punpcklqdq xmm4, xmm6
 	movdqa     [r4],xmm4
+	POP_XMM
 	LOAD_5_PARA_POP
 	ret
 
@@ -1711,6 +1743,7 @@
 WELS_EXTERN WelsSampleSadFour8x16_sse2
 	%assign  push_num 0
 	LOAD_5_PARA
+	PUSH_XMM 8
 	SIGN_EXTENSION r1, r1d
 	SIGN_EXTENSION r3, r3d
 	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
@@ -1915,6 +1948,7 @@
 	punpckldq  xmm6, xmm7
 	punpcklqdq xmm4, xmm6
 	movdqa     [r4],xmm4
+	POP_XMM
 	LOAD_5_PARA_POP
 	ret
 
@@ -1922,6 +1956,7 @@
 WELS_EXTERN WelsSampleSadFour8x8_sse2
 	%assign  push_num 0
 	LOAD_5_PARA
+	PUSH_XMM 8
 	SIGN_EXTENSION r1, r1d
 	SIGN_EXTENSION r3, r3d
 	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
@@ -2035,6 +2070,7 @@
 	punpckldq  xmm6, xmm7
 	punpcklqdq xmm4, xmm6
 	movdqa     [r4],xmm4
+	POP_XMM
 	LOAD_5_PARA_POP
 	ret
 
--- a/codec/common/vaa.asm
+++ b/codec/common/vaa.asm
@@ -149,6 +149,7 @@
 
     %assign push_num 0
     LOAD_2_PARA
+    PUSH_XMM 8
     SIGN_EXTENSION r1,r1d
 
 %ifdef X86_32
@@ -232,6 +233,7 @@
 	pop r4
 	pop r3
 %endif
+	POP_XMM
 
 	ret
 
@@ -242,6 +244,7 @@
 
     %assign push_num 0
     LOAD_2_PARA
+    PUSH_XMM 8
     SIGN_EXTENSION r1,r1d
 
 %ifdef X86_32
@@ -325,6 +328,7 @@
 	pop r4
 	pop r3
 %endif
+	POP_XMM
 
 	ret
 
--- a/codec/decoder/core/asm/block_add.asm
+++ b/codec/decoder/core/asm/block_add.asm
@@ -55,6 +55,7 @@
 WELS_EXTERN   WelsResBlockZero16x16_sse2
         %assign push_num 0
         LOAD_2_PARA
+        PUSH_XMM 8
 	SIGN_EXTENSION r1, r1d
         lea 	r1, 	[r1*2]
         lea 	r2,	[r1*3]
@@ -116,6 +117,7 @@
 	movdqa   [r0+r2],     xmm7
 	movdqa   [r0+r2+10h],     xmm7
 
+	POP_XMM
 	ret
 
 
@@ -125,6 +127,7 @@
 WELS_EXTERN   WelsResBlockZero8x8_sse2
 	  %assign push_num 0
           LOAD_2_PARA
+          PUSH_XMM 8
 	  SIGN_EXTENSION r1, r1d
 	  lea       r1,     [r1*2]
 	  lea       r2,     [r1*3]
@@ -143,5 +146,6 @@
 	  movdqa    [r0+r2],     xmm7
 
 
+	  POP_XMM
 	  ret
 
--- a/codec/decoder/core/asm/intra_pred.asm
+++ b/codec/decoder/core/asm/intra_pred.asm
@@ -223,6 +223,7 @@
 		push r4
 		%assign push_num 2
 		LOAD_2_PARA
+		PUSH_XMM 8
 		SIGN_EXTENSION r1, r1d
 		mov r4, r0 ; save r0 in r4
 		sub		r0,	1
@@ -302,6 +303,7 @@
 		cmp		r2,	16
 		jnz get_i16x16_luma_pred_plane_sse2_1
 
+		POP_XMM
 		pop r4
 		pop r3
 		ret
@@ -387,6 +389,7 @@
 		push r4
 		%assign push_num 2
 		LOAD_2_PARA
+		PUSH_XMM 8
 		SIGN_EXTENSION r1, r1d
 		mov r4, r0
 		sub		r0,	1
@@ -465,6 +468,7 @@
 		cmp		r2,	8
 		jnz get_i_chroma_pred_plane_sse2_1
 
+		POP_XMM
 		pop r4
 		pop r3
 		WELSEMMS
@@ -1181,6 +1185,7 @@
 WELS_EXTERN WelsDecoderI16x16LumaPredDcTop_sse2
 	%assign push_num 0
 	LOAD_2_PARA
+	PUSH_XMM 8
 	SIGN_EXTENSION r1, r1d
 	mov r2, r0
 	sub r2, r1
@@ -1243,6 +1248,7 @@
 	movdqa [r0+2*r1], xmm0
 	movdqa [r0+r2], xmm1
 
+	POP_XMM
 	ret
 
 ;*******************************************************************************
@@ -1355,6 +1361,7 @@
 WELS_EXTERN WelsDecoderIChromaPredDcTop_sse2
 	%assign push_num 0
 	LOAD_2_PARA
+	PUSH_XMM 8
 	SIGN_EXTENSION r1, r1d
 	mov r2, r0
 	sub r2, r1
@@ -1384,6 +1391,7 @@
 	movq [r0+r1], xmm0
 	movq [r0+2*r1], xmm0
 	movq [r0+r2], xmm0
+	POP_XMM
 	ret
 
 ;*******************************************************************************
--- a/codec/decoder/plus/inc/welsDecoderExt.h
+++ b/codec/decoder/plus/inc/welsDecoderExt.h
@@ -104,8 +104,6 @@
 void InitDecoder (void);
 void UninitDecoder (void);
 
-XMMREG_PROTECT_DECLARE(CWelsH264Decoder);
-
 #ifdef OUTPUT_BIT_STREAM
 WelsFileHandle* m_pFBS;
 WelsFileHandle* m_pFBSSize;
--- a/codec/decoder/plus/src/welsDecoderExt.cpp
+++ b/codec/decoder/plus/src/welsDecoderExt.cpp
@@ -101,7 +101,6 @@
   m_pTrace = CreateWelsTrace (Wels_Trace_Type);
 
   IWelsTrace::WelsVTrace (m_pTrace, IWelsTrace::WELS_LOG_INFO, "CWelsDecoder::CWelsDecoder() entry");
-  XMMREG_PROTECT_INIT(CWelsH264Decoder);
 
 #ifdef OUTPUT_BIT_STREAM
   SWelsTime sCurTime;
@@ -167,7 +166,6 @@
   IWelsTrace::WelsVTrace (m_pTrace, IWelsTrace::WELS_LOG_INFO, "CWelsDecoder::~CWelsDecoder()");
 
   UninitDecoder();
-  XMMREG_PROTECT_UNINIT(CWelsH264Decoder);
 
 #ifdef OUTPUT_BIT_STREAM
   if (m_pFBS) {
@@ -361,10 +359,8 @@
 
   m_pDecContext->iFeedbackTidInAu             = -1; //initialize
 
-  XMMREG_PROTECT_STORE(CWelsH264Decoder);
   WelsDecodeBs (m_pDecContext, kpSrc, kiSrcLen, (unsigned char**)ppDst,
                 pDstInfo); //iErrorCode has been modified in this function
-  XMMREG_PROTECT_LOAD(CWelsH264Decoder);
 
   if (m_pDecContext->iErrorCode) {
     ENalUnitType eNalType =
--- a/codec/encoder/core/asm/dct.asm
+++ b/codec/encoder/core/asm/dct.asm
@@ -290,6 +290,7 @@
 WELS_EXTERN WelsDctFourT4_sse2
     %assign push_num 0
     LOAD_5_PARA
+    PUSH_XMM 8
     SIGN_EXTENSION r2, r2d
     SIGN_EXTENSION r4, r4d
     pxor    xmm7, xmm7
@@ -327,6 +328,7 @@
 	lea		r0, [r0+64]
 	SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
 
+	POP_XMM
 	LOAD_5_PARA_POP
     ret
 
@@ -337,6 +339,7 @@
 WELS_EXTERN WelsIDctFourT4Rec_sse2
 	%assign push_num 0
 	LOAD_5_PARA
+	PUSH_XMM 8
 	SIGN_EXTENSION r1, r1d
 	SIGN_EXTENSION r3, r3d
 	;Load 4x8
@@ -376,6 +379,7 @@
 	lea		r2, [r2 + 2 * r3]
 	SSE2_StoreDiff8p   xmm1, xmm5, xmm6, xmm7, [r0],			[r2]
 	SSE2_StoreDiff8p   xmm2, xmm5, xmm6, xmm7, [r0 + r1],	[r2 + r3]
+	POP_XMM
 	LOAD_5_PARA_POP
    ; pop		esi
    ; pop		ebx
@@ -394,6 +398,7 @@
 WELS_EXTERN WelsIDctRecI16x16Dc_sse2
 	%assign push_num 0
 	LOAD_5_PARA
+	PUSH_XMM 8
 	SIGN_EXTENSION r1, r1d
 	SIGN_EXTENSION r3, r3d
 	pxor		xmm7,		xmm7
@@ -430,6 +435,7 @@
 	lea			r0,		[r0 + 2 * r1]
 	lea			r2,		[r2 + 2 * r3]
 	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
+	POP_XMM
 	LOAD_5_PARA_POP
     ret
 
@@ -468,6 +474,7 @@
 WELS_EXTERN WelsHadamardT4Dc_sse2
 		%assign push_num 0
 		LOAD_2_PARA
+		PUSH_XMM 8
 		SSE2_Load4Col	    xmm1, xmm5, xmm6, xmm0, r1
 		SSE2_Load4Col	    xmm2, xmm5, xmm6, xmm0, r1 + 0x40
 		SSE2_Load4Col	    xmm3, xmm5, xmm6, xmm0, r1 + 0x100
@@ -493,4 +500,5 @@
 		movdqa	[r0+ 0],   xmm3
 		movdqa	[r0+16],   xmm2
 
+		POP_XMM
 		ret
--- a/codec/encoder/core/asm/intra_pred.asm
+++ b/codec/encoder/core/asm/intra_pred.asm
@@ -229,6 +229,7 @@
 		push r4
 		%assign push_num 2
 		LOAD_3_PARA
+		PUSH_XMM 8
 		SIGN_EXTENSION r2, r2d
 		sub		r1,	1
 		sub		r1,	r2
@@ -304,6 +305,7 @@
 		inc		r3
 		cmp		r3,	16
 		jnz get_i16x16_luma_pred_plane_sse2_1
+		POP_XMM
 		pop r4
 		pop r3
 		ret
@@ -384,6 +386,7 @@
 		push r4
 		%assign push_num 2
 		LOAD_3_PARA
+		PUSH_XMM 8
 		SIGN_EXTENSION r2, r2d
 		sub		r1,	1
 		sub		r1,	r2
@@ -458,6 +461,7 @@
 		inc		r3
 		cmp		r3,	8
 		jnz get_i_chroma_pred_plane_sse2_1
+		POP_XMM
 		pop r4
 		pop r3
 		WELSEMMS
--- a/codec/encoder/core/asm/quant.asm
+++ b/codec/encoder/core/asm/quant.asm
@@ -136,6 +136,7 @@
 WELS_EXTERN WelsQuantFour4x4Max_sse2
 		%assign push_num 0
 		LOAD_4_PARA
+		PUSH_XMM 8
 		MOVDQ	xmm2, [r1]
 		MOVDQ	xmm3, [r2]
 
@@ -161,6 +162,7 @@
 		pmaxsw	xmm0, xmm1
 
 		movq	[r3], xmm0
+		POP_XMM
 		LOAD_4_PARA_POP
 		ret
 
--- a/codec/encoder/plus/inc/welsEncoderExt.h
+++ b/codec/encoder/plus/inc/welsEncoderExt.h
@@ -132,8 +132,6 @@
   void    InitEncoder (void);
   int32_t RawData2SrcPic (const uint8_t* pSrc);
   void    DumpSrcPicture (const uint8_t* pSrc);
-
-  XMMREG_PROTECT_DECLARE(CWelsH264SVCEncoder);
 };
 }
 #endif // !defined(AFX_WELSH264ENCODER_H__D9FAA1D1_5403_47E1_8E27_78F11EE65F02__INCLUDED_)
--- a/codec/encoder/plus/src/welsEncoderExt.cpp
+++ b/codec/encoder/plus/src/welsEncoderExt.cpp
@@ -138,7 +138,6 @@
 #endif//OUTPUT_BIT_STREAM
 
   InitEncoder();
-  XMMREG_PROTECT_INIT(CWelsH264SVCEncoder);
 }
 
 CWelsH264SVCEncoder::~CWelsH264SVCEncoder() {
@@ -172,7 +171,6 @@
 #endif//OUTPUT_BIT_STREAM
 
   Uninitialize();
-  XMMREG_PROTECT_UNINIT(CWelsH264SVCEncoder);
 }
 
 void CWelsH264SVCEncoder::InitEncoder (void) {
@@ -551,9 +549,7 @@
 
   int32_t iFrameTypeReturned = 0;
   int32_t iFrameType = videoFrameTypeInvalid;
-  XMMREG_PROTECT_STORE(CWelsH264SVCEncoder);
   const int32_t kiEncoderReturn = WelsEncoderEncodeExt (m_pEncContext, pBsInfo, pSrcPic);
-  XMMREG_PROTECT_LOAD(CWelsH264SVCEncoder);
 
   if(kiEncoderReturn == ENC_RETURN_MEMALLOCERR) {
     WelsUninitEncoderExt (&m_pEncContext);
--- a/codec/processing/src/adaptivequantization/AdaptiveQuantization.cpp
+++ b/codec/processing/src/adaptivequantization/AdaptiveQuantization.cpp
@@ -49,11 +49,9 @@
   m_pfVar   = NULL;
   WelsMemset (&m_sAdaptiveQuantParam, 0, sizeof (m_sAdaptiveQuantParam));
   WelsInitVarFunc (m_pfVar, m_CPUFlag);
-  XMMREG_PROTECT_INIT(AdaptiveQuantization);
 }
 
 CAdaptiveQuantization::~CAdaptiveQuantization() {
-  XMMREG_PROTECT_UNINIT(AdaptiveQuantization);
 }
 
 EResult CAdaptiveQuantization::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
@@ -102,7 +100,6 @@
       pRefFrameTmp  = pRefFrameY;
       pCurFrameTmp  = pCurFrameY;
       for (i = 0; i < iMbWidth; i++) {
-        XMMREG_PROTECT_STORE(AdaptiveQuantization);
         iSumDiff =  pVaaCalcResults->pSad8x8[iMbIndex][0];
         iSumDiff += pVaaCalcResults->pSad8x8[iMbIndex][1];
         iSumDiff += pVaaCalcResults->pSad8x8[iMbIndex][2];
@@ -111,7 +108,6 @@
         iSQDiff = pVaaCalcResults->pSsd16x16[iMbIndex];
         uiSum = pVaaCalcResults->pSum16x16[iMbIndex];
         iSQSum = pVaaCalcResults->pSumOfSquare16x16[iMbIndex];
-        XMMREG_PROTECT_LOAD(AdaptiveQuantization);
 
         iSumDiff = iSumDiff >> 8;
         pMotionTexture->uiMotionIndex = (iSQDiff >> 8) - (iSumDiff * iSumDiff);
@@ -134,9 +130,7 @@
       pRefFrameTmp  = pRefFrameY;
       pCurFrameTmp  = pCurFrameY;
       for (i = 0; i < iMbWidth; i++) {
-        XMMREG_PROTECT_STORE(AdaptiveQuantization);
         m_pfVar (pRefFrameTmp, iRefStride, pCurFrameTmp, iCurStride, pMotionTexture);
-        XMMREG_PROTECT_LOAD(AdaptiveQuantization);
         dAverageMotionIndex += pMotionTexture->uiMotionIndex;
         dAverageTextureIndex += pMotionTexture->uiTextureIndex;
         pMotionTexture++;
--- a/codec/processing/src/adaptivequantization/AdaptiveQuantization.h
+++ b/codec/processing/src/adaptivequantization/AdaptiveQuantization.h
@@ -84,7 +84,6 @@
   PVarFunc			                   m_pfVar;
   int32_t                                  m_CPUFlag;
   SAdaptiveQuantizationParam    m_sAdaptiveQuantParam;
-  XMMREG_PROTECT_DECLARE(AdaptiveQuantization);
 };
 
 WELSVP_NAMESPACE_END
--- a/codec/processing/src/asm/denoisefilter.asm
+++ b/codec/processing/src/asm/denoisefilter.asm
@@ -176,6 +176,7 @@
         push r3
         %assign push_num 1
         LOAD_2_PARA
+        PUSH_XMM 8
 
 		pxor		xmm7,	xmm7
 
@@ -212,6 +213,7 @@
 		movq		[r3],	xmm5
 
 
+		POP_XMM
 		pop r3
 		%assign push_num 0
 
--- a/codec/processing/src/asm/vaa.asm
+++ b/codec/processing/src/asm/vaa.asm
@@ -452,6 +452,7 @@
   push r15
   %assign push_num 4
   LOAD_5_PARA
+  PUSH_XMM 8
   SIGN_EXTENSION r1,r1d
   SIGN_EXTENSION r3,r3d
 
@@ -521,6 +522,7 @@
   sub r1, r0
   mov [r4+2], r1w                               ; to store uiTextureIndex
 
+  POP_XMM
   LOAD_5_PARA_POP
   pop r15
   pop r14
@@ -552,6 +554,7 @@
   push r13
   %assign push_num 2
   LOAD_7_PARA
+  PUSH_XMM 8
   SIGN_EXTENSION r2,r2d
   SIGN_EXTENSION r3,r3d
   SIGN_EXTENSION r4,r4d
@@ -619,6 +622,7 @@
 %undef          psadframe
 %undef          psad8x8
 %undef          pushsize
+  POP_XMM
   LOAD_7_PARA_POP
   pop r13
   pop r12
@@ -785,6 +789,7 @@
   push r14
   push r15
   %assign push_num 4
+  PUSH_XMM 8
 
 %ifdef WIN64
   mov r4, arg5  ;iPicStride
@@ -880,6 +885,7 @@
   paddd   xmm7,   xmm5
   movd    [r15],  xmm7
 
+  POP_XMM
   pop r15
   pop r14
   pop r13
@@ -1082,6 +1088,7 @@
   push r14
   push r15
   %assign push_num 4
+  PUSH_XMM 10
 
 %ifdef WIN64
   mov r4,arg5
@@ -1192,6 +1199,7 @@
   mov             r13,    psadframe
   movd    [r13],  xmm8
 
+  POP_XMM
   pop r15
   pop r14
   pop r13
@@ -1648,6 +1656,7 @@
   push r14
   push r15
 %assign push_num 4
+  PUSH_XMM 10
 %ifdef WIN64
   mov r4,arg5
   ;  mov r5,arg6
@@ -1773,6 +1782,7 @@
   mov             r13,    psadframe
   movd    [r13],  xmm8
 
+  POP_XMM
   pop r15
   pop r14
   pop r13
@@ -1821,6 +1831,7 @@
   push r14
   push r15
 %assign push_num 4
+  PUSH_XMM 10
 %ifdef WIN64
   mov r4,arg5
   ;mov r5,arg6
@@ -1993,6 +2004,7 @@
   mov             r14,    psadframe
   movd    [r14],  xmm8
 
+  POP_XMM
   pop r15
   pop r14
   pop r13