ref: fb1958ad13d2172f3b2091fc88b1715bd358a705
parent: b7b01faf29d039578d7a8c9d36d1b4ed25a0ae93
parent: 4633626d69220ed0a1776fbe510454a94279f80e
author: volvet <qizh@cisco.com>
date: Tue Mar 18 11:04:54 EDT 2014
Merge pull request #519 from mstorsjo/push-xmm-registers Backup/restore the xmm6-xmm15 SSE registers within asm functions on win64 Reviewed by zhiliang
--- a/codec/common/asm_inc.asm
+++ b/codec/common/asm_inc.asm
@@ -335,6 +335,82 @@
%endif
%endmacro
+%macro PUSH_XMM 1 ; PUSH_XMM n: on WIN64, spill xmm6..xmm(n-1) into newly reserved stack space; expands to nothing when WIN64 is undefined or n <= 6
+ %ifdef WIN64 ; xmm6-xmm15 are callee-saved under the Microsoft x64 ABI only
+ %assign xmm_num_regs %1 ; recorded at assemble time so a later POP_XMM reloads the same register set
+ %if xmm_num_regs > 6
+ %ifdef push_num
+ %assign push_num push_num+2*(%1-6) ; keep push_num in step with rsp: +2 per 16-byte slot (presumably push_num counts 8-byte pushes - callers set it to their number of push instructions)
+ %endif
+ sub rsp, 16*(%1 - 6) ; reserve one 16-byte slot per register; xmmK is stored at [rsp + 16*(K-6)]
+ movdqu [rsp], xmm6 ; movdqu throughout: no 16-byte alignment of rsp is assumed
+ %endif
+ %if xmm_num_regs > 7
+ movdqu [rsp+16], xmm7
+ %endif
+ %if xmm_num_regs > 8
+ movdqu [rsp+32], xmm8
+ %endif
+ %if xmm_num_regs > 9
+ movdqu [rsp+48], xmm9
+ %endif
+ %if xmm_num_regs > 10
+ movdqu [rsp+64], xmm10
+ %endif
+ %if xmm_num_regs > 11
+ movdqu [rsp+80], xmm11
+ %endif
+ %if xmm_num_regs > 12
+ movdqu [rsp+96], xmm12
+ %endif
+ %if xmm_num_regs > 13
+ movdqu [rsp+112], xmm13
+ %endif
+ %if xmm_num_regs > 14
+ movdqu [rsp+128], xmm14
+ %endif
+ %if xmm_num_regs > 15 ; n = 16 saves the full xmm6..xmm15 set (160 bytes); a larger n would reserve more stack but store nothing further
+ movdqu [rsp+144], xmm15
+ %endif
+ %endif ; WIN64
+%endmacro
+
+%macro POP_XMM 0 ; POP_XMM: undo the most recent PUSH_XMM - reload the saved xmm registers and release their stack slots; expands to nothing when WIN64 is undefined
+ %ifdef WIN64
+ %if xmm_num_regs > 15 ; xmm_num_regs is the count recorded by the last PUSH_XMM expansion (assemble-time state, not a runtime value)
+ movdqu xmm15, [rsp+144] ; offsets mirror PUSH_XMM: xmmK is reloaded from [rsp + 16*(K-6)]
+ %endif
+ %if xmm_num_regs > 14
+ movdqu xmm14, [rsp+128]
+ %endif
+ %if xmm_num_regs > 13
+ movdqu xmm13, [rsp+112]
+ %endif
+ %if xmm_num_regs > 12
+ movdqu xmm12, [rsp+96]
+ %endif
+ %if xmm_num_regs > 11
+ movdqu xmm11, [rsp+80]
+ %endif
+ %if xmm_num_regs > 10
+ movdqu xmm10, [rsp+64]
+ %endif
+ %if xmm_num_regs > 9
+ movdqu xmm9, [rsp+48]
+ %endif
+ %if xmm_num_regs > 8
+ movdqu xmm8, [rsp+32]
+ %endif
+ %if xmm_num_regs > 7
+ movdqu xmm7, [rsp+16]
+ %endif
+ %if xmm_num_regs > 6
+ movdqu xmm6, [rsp]
+ add rsp, 16*(xmm_num_regs - 6) ; release the save area; rsp must hold the value it had right after PUSH_XMM when this code runs
+ %endif ; NOTE: push_num is not adjusted back here, so the macro can be expanded on several exit paths of one function
+ %endif ; WIN64
+%endmacro
+
%macro SIGN_EXTENSION 2
%ifndef X86_32
movsxd %1, %2
--- a/codec/common/cpu.cpp
+++ b/codec/common/cpu.cpp
@@ -207,9 +207,6 @@
}
}
-void WelsXmmRegEmptyOp(void * pSrc) {
-}
-
#elif defined(HAVE_NEON) //For supporting both android platform and iOS platform
#if defined(ANDROID_NDK)
uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors)
--- a/codec/common/cpu.h
+++ b/codec/common/cpu.h
@@ -67,60 +67,14 @@
*/
void WelsCPURestore (const uint32_t kuiCPU);
-#ifdef WIN64
-void WelsXmmRegStore(void * src);
-void WelsXmmRegLoad(void * src);
-#endif
-
#else
#define WelsEmms()
#endif
-void WelsXmmRegEmptyOp(void * pSrc);
-
uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors);
#if defined(__cplusplus)
}
#endif//__cplusplus
-
-typedef void (*WelsXmmRegProtectFunc)(void * pSrc);
-
-
-#if defined(WIN64) && defined(X86_ASM)
-#define XMMREG_PROTECT_DECLARE(name) \
- WelsXmmRegProtectFunc name##load;\
- WelsXmmRegProtectFunc name##store;\
- uint8_t name##Buffer[160];
-
-#define XMMREG_PROTECT_INIT(name) \
- { \
- uint32_t uiCpuFlag = WelsCPUFeatureDetect(NULL);\
- if( uiCpuFlag & WELS_CPU_SSE2 ){\
- name##load = WelsXmmRegLoad;\
- name##store = WelsXmmRegStore; \
- } else { \
- name##load = WelsXmmRegEmptyOp; \
- name##store = WelsXmmRegEmptyOp; \
- } \
- }
-
-#define XMMREG_PROTECT_UNINIT(name) \
-
-#define XMMREG_PROTECT_STORE(name) \
- name##store(name##Buffer);
-
-#define XMMREG_PROTECT_LOAD(name) \
- name##load(name##Buffer);
-
-#else
-
-#define XMMREG_PROTECT_DECLARE(name)
-#define XMMREG_PROTECT_INIT(name)
-#define XMMREG_PROTECT_UNINIT(name)
-#define XMMREG_PROTECT_STORE(name)
-#define XMMREG_PROTECT_LOAD(name)
-
-#endif
#endif//WELS_CPU_DETECTION_H__
--- a/codec/common/cpuid.asm
+++ b/codec/common/cpuid.asm
@@ -210,44 +210,3 @@
emms ; empty mmx technology states
ret
-
-%ifdef WIN64
-
-WELS_EXTERN WelsXmmRegStore
-ALIGN 16
-;******************************************************************************************
-; void WelsXmmRegStore(void *src)
-;******************************************************************************************
-WelsXmmRegStore:
- movdqu [rcx], xmm6
- movdqu [rcx+16], xmm7
- movdqu [rcx+32], xmm8
- movdqu [rcx+48], xmm9
- movdqu [rcx+64], xmm10
- movdqu [rcx+80], xmm11
- movdqu [rcx+96], xmm12
- movdqu [rcx+112], xmm13
- movdqu [rcx+128], xmm14
- movdqu [rcx+144], xmm15
- ret
-
-WELS_EXTERN WelsXmmRegLoad
-ALIGN 16
-;******************************************************************************************
-; void WelsXmmRegLoad(void *src)
-;******************************************************************************************
-WelsXmmRegLoad:
- movdqu xmm6, [rcx]
- movdqu xmm7, [rcx+16]
- movdqu xmm8, [rcx+32]
- movdqu xmm9, [rcx+48]
- movdqu xmm10, [rcx+64]
- movdqu xmm11, [rcx+80]
- movdqu xmm12, [rcx+96]
- movdqu xmm13, [rcx+112]
- movdqu xmm14, [rcx+128]
- movdqu xmm15, [rcx+144]
- ret
-%endif
-
-
--- a/codec/common/deblock.asm
+++ b/codec/common/deblock.asm
@@ -63,6 +63,7 @@
WELS_EXTERN DeblockLumaLt4V_ssse3
push rbp
mov r11,[rsp + 16 + 20h] ; pTC
+ PUSH_XMM 16
sub rsp,1B0h
lea rbp,[rsp+20h]
movd xmm4,r8d
@@ -311,6 +312,7 @@
movdqa [r12+rcx],xmm0
mov r12,qword [rbp+180h]
lea rsp,[rbp+190h]
+ POP_XMM
pop rbp
ret
@@ -779,6 +781,7 @@
mov rax,rsp
push rbx
push rdi
+ PUSH_XMM 16
sub rsp,0C8h
mov r10,qword [rax + 30h] ; pTC
pxor xmm1,xmm1
@@ -833,7 +836,7 @@
punpckhbw xmm2,xmm1
punpcklbw xmm14,xmm1
movd xmm0,eax
- movsx eax,word [rsp + 0C8h + 38h] ; iBeta
+ movsx eax,word [rsp + 0C8h + 38h + 160] ; iBeta
punpckhbw xmm13,xmm1
punpckhbw xmm15,xmm1
movdqa xmm3,xmm9
@@ -929,6 +932,7 @@
movq [rdi],xmm14
movaps xmm14,[rsp+30h]
mov rsp,r11
+ POP_XMM
pop rdi
pop rbx
ret
@@ -937,6 +941,7 @@
WELS_EXTERN DeblockChromaEq4V_ssse3
mov rax,rsp
push rbx
+ PUSH_XMM 15
sub rsp,90h
pxor xmm1,xmm1
mov r11,rcx
@@ -973,7 +978,7 @@
punpcklbw xmm9,xmm1
punpckhbw xmm10,xmm1
movd xmm0,eax
- movsx eax,word [rsp + 90h + 8h + 28h] ; iBeta
+ movsx eax,word [rsp + 90h + 8h + 28h + 144] ; iBeta
punpckhbw xmm13,xmm1
movdqa xmm7,xmm12
punpcklwd xmm0,xmm0
@@ -1079,6 +1084,7 @@
movaps xmm12,[r11-70h]
movaps xmm13,[r11-80h]
mov rsp,r11
+ POP_XMM
pop rbx
ret
@@ -1090,6 +1096,7 @@
mov rax,rsp
mov [rax+20h],rbx
push rdi
+ PUSH_XMM 16
sub rsp,140h
mov rdi,rdx
lea eax,[r8*4]
@@ -1182,7 +1189,7 @@
movd xmm0,eax
movdqa xmm4,xmm12
movdqa xmm8,xmm11
- movsx eax,word [rsp+170h] ; iBeta
+ movsx eax,word [rsp+170h + 160] ; iBeta
punpcklwd xmm0,xmm0
punpcklbw xmm4,xmm1
punpckhbw xmm12,xmm1
@@ -1340,9 +1347,9 @@
mov [rbx+r10*2],eax
mov eax,[rsp+7Ch]
mov [rdx+rbx],eax
- lea r11,[rsp+140h]
- mov rbx, [r11+28h]
- mov rsp,r11
+ lea rsp,[rsp+140h]
+ POP_XMM
+ mov rbx, [rsp+28h]
pop rdi
ret
@@ -1355,6 +1362,7 @@
push rsi
push rdi
push r12
+ PUSH_XMM 16
sub rsp,170h
movsxd rsi,r8d
@@ -1438,7 +1446,7 @@
punpckhdq xmm7,xmm0
movdqa xmm0,xmm1
punpckldq xmm0,xmm5
- mov rax, [rsp+1C8h] ; pTC
+ mov rax, [rsp+1C8h+160] ; pTC
punpckhdq xmm1,xmm5
movdqa xmm9,xmm6
punpckhqdq xmm6,xmm0
@@ -1476,7 +1484,7 @@
punpckhbw xmm9,xmm1
punpckhbw xmm8,xmm1
punpcklwd xmm0,xmm0
- movsx eax,word [rsp+1C0h] ; iBeta
+ movsx eax,word [rsp+1C0h+160] ; iBeta
mov word [rsp+4],r8w
mov word [rsp+2],r9w
pshufd xmm12,xmm0,0
@@ -1620,6 +1628,7 @@
mov [r10+rbp],eax
lea r11,[rsp+170h]
mov rsp,r11
+ POP_XMM
pop r12
pop rdi
pop rsi
@@ -5132,6 +5141,7 @@
%assign push_num 3
LOAD_3_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
@@ -5188,6 +5198,7 @@
movdqa [r2 + 70h], xmm0
mov r7, r5
+ POP_XMM
pop r5
pop r4
pop r3
@@ -5206,6 +5217,7 @@
%assign push_num 2
LOAD_3_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
@@ -5263,6 +5275,7 @@
mov r7, r4
+ POP_XMM
pop r4
pop r3
ret
--- a/codec/common/expand_picture.asm
+++ b/codec/common/expand_picture.asm
@@ -360,6 +360,7 @@
%assign push_num 3
LOAD_4_PARA
+ PUSH_XMM 7
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r2, r2d
@@ -461,6 +462,7 @@
; for left & right border expanding
exp_cross_sse2 32,a
+ POP_XMM
LOAD_4_PARA_POP
pop r6
@@ -486,6 +488,7 @@
%assign push_num 3
LOAD_4_PARA
+ PUSH_XMM 7
SIGN_EXTENSION r1,r1d
SIGN_EXTENSION r2,r2d
@@ -586,6 +589,7 @@
; for left & right border expanding
exp_cross_sse2 16,a
+ POP_XMM
LOAD_4_PARA_POP
pop r6
@@ -610,6 +614,7 @@
%assign push_num 3
LOAD_4_PARA
+ PUSH_XMM 7
SIGN_EXTENSION r1,r1d
SIGN_EXTENSION r2,r2d
@@ -710,6 +715,7 @@
; for left & right border expanding
exp_cross_sse2 16,u
+ POP_XMM
LOAD_4_PARA_POP
pop r6
--- a/codec/common/mb_copy.asm
+++ b/codec/common/mb_copy.asm
@@ -67,6 +67,7 @@
push r5
%assign push_num 2
LOAD_4_PARA
+ PUSH_XMM 8
lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
@@ -112,6 +113,7 @@
movdqa [r0+r1], xmm5
movdqa [r0+2*r1], xmm6
movdqa [r0+r4], xmm7
+ POP_XMM
LOAD_4_PARA_POP
pop r5
pop r4
@@ -129,6 +131,7 @@
push r5
%assign push_num 2
LOAD_4_PARA
+ PUSH_XMM 8
lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
@@ -174,6 +177,7 @@
movdqa [r0+r1], xmm5
movdqa [r0+2*r1], xmm6
movdqa [r0+r4], xmm7
+ POP_XMM
LOAD_4_PARA_POP
pop r5
pop r4
@@ -191,6 +195,7 @@
push r5
%assign push_num 2
LOAD_4_PARA
+ PUSH_XMM 8
lea r4, [r1+2*r1] ;ebx, [eax+2*eax] ; x3
lea r5, [r3+2*r3] ;edx, [ecx+2*ecx] ; x3
@@ -214,6 +219,7 @@
movdqa [r0+r1], xmm5
movdqa [r0+2*r1], xmm6
movdqa [r0+r4], xmm7
+ POP_XMM
LOAD_4_PARA_POP
pop r5
pop r4
--- a/codec/common/mc_chroma.asm
+++ b/codec/common/mc_chroma.asm
@@ -149,6 +149,7 @@
WELS_EXTERN McChromaWidthEq8_sse2
%assign push_num 0
LOAD_6_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r5, r5d
@@ -208,6 +209,7 @@
dec r5
jnz near .xloop
+ POP_XMM
LOAD_6_PARA_POP
ret
@@ -226,6 +228,7 @@
WELS_EXTERN McChromaWidthEq8_ssse3
%assign push_num 0
LOAD_6_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r5, r5d
@@ -282,6 +285,7 @@
sub r5, 2
jnz .hloop_chroma
+ POP_XMM
LOAD_6_PARA_POP
ret
--- a/codec/common/mc_luma.asm
+++ b/codec/common/mc_luma.asm
@@ -171,6 +171,7 @@
WELS_EXTERN McHorVer22Width8HorFirst_sse2
%assign push_num 0
LOAD_5_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
@@ -207,6 +208,7 @@
add r2, r3
dec r4
jnz .yloop_width_8
+ POP_XMM
LOAD_5_PARA_POP
ret
@@ -221,6 +223,7 @@
WELS_EXTERN McHorVer20WidthEq8_sse2
%assign push_num 0
LOAD_5_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
@@ -261,6 +264,7 @@
dec r4
jnz near .y_loop
+ POP_XMM
LOAD_5_PARA_POP
ret
@@ -275,6 +279,7 @@
WELS_EXTERN McHorVer20WidthEq16_sse2
%assign push_num 0
LOAD_5_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
@@ -341,6 +346,7 @@
dec r4
jnz near .y_loop
+ POP_XMM
LOAD_5_PARA_POP
ret
@@ -355,6 +361,7 @@
WELS_EXTERN McHorVer02WidthEq8_sse2
%assign push_num 0
LOAD_5_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
@@ -424,6 +431,7 @@
jmp near .start
.xx_exit:
+ POP_XMM
LOAD_5_PARA_POP
ret
@@ -446,6 +454,7 @@
WELS_EXTERN McHorVer02Height9Or17_sse2
%assign push_num 0
LOAD_6_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
@@ -563,6 +572,7 @@
pop r13
pop r12
%endif
+ POP_XMM
LOAD_6_PARA_POP
ret
@@ -579,6 +589,7 @@
WELS_EXTERN McHorVer20Width9Or17_sse2
%assign push_num 0
LOAD_6_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
@@ -639,6 +650,7 @@
add r2, r3
dec r5
jnz .yloop_width_9
+ POP_XMM
LOAD_6_PARA_POP
ret
@@ -720,6 +732,7 @@
add r2, r3
dec r5
jnz .yloop_width_17
+ POP_XMM
LOAD_6_PARA_POP
ret
@@ -736,6 +749,7 @@
WELS_EXTERN McHorVer22HorFirst_sse2
%assign push_num 0
LOAD_6_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
@@ -792,6 +806,7 @@
add r2, r3
dec r5
jnz .yloop_width_9
+ POP_XMM
LOAD_6_PARA_POP
ret
@@ -866,6 +881,7 @@
add r2, r3
dec r5
jnz .yloop_width_17
+ POP_XMM
LOAD_6_PARA_POP
ret
@@ -903,6 +919,7 @@
WELS_EXTERN McHorVer22Width8VerLastAlign_sse2
%assign push_num 0
LOAD_6_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
@@ -1016,6 +1033,7 @@
pop r13
pop r12
%endif
+ POP_XMM
LOAD_6_PARA_POP
ret
@@ -1032,6 +1050,7 @@
WELS_EXTERN McHorVer22Width8VerLastUnAlign_sse2
%assign push_num 0
LOAD_6_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
@@ -1144,5 +1163,6 @@
pop r13
pop r12
%endif
+ POP_XMM
LOAD_6_PARA_POP
ret
--- a/codec/common/satd_sad.asm
+++ b/codec/common/satd_sad.asm
@@ -158,6 +158,7 @@
WELS_EXTERN WelsSampleSatd4x4_sse2
%assign push_num 0
LOAD_4_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
movd xmm0, [r0]
@@ -219,6 +220,7 @@
movd retrd, xmm6
and retrd, 0xffff
shr retrd, 1
+ POP_XMM
LOAD_4_PARA_POP
ret
@@ -230,6 +232,7 @@
WELS_EXTERN WelsSampleSatd8x8_sse2
%assign push_num 0
LOAD_4_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
pxor xmm6, xmm6
@@ -238,6 +241,7 @@
psrlw xmm6, 1
SSE2_SumWHorizon xmm6,xmm4,xmm7
movd retrd, xmm6
+ POP_XMM
LOAD_4_PARA_POP
ret
@@ -249,6 +253,7 @@
WELS_EXTERN WelsSampleSatd8x16_sse2
%assign push_num 0
LOAD_4_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
pxor xmm6, xmm6
@@ -262,6 +267,7 @@
psrlw xmm6, 1
SSE2_SumWHorizon xmm6,xmm4,xmm7
movd retrd, xmm6
+ POP_XMM
LOAD_4_PARA_POP
ret
@@ -273,6 +279,7 @@
WELS_EXTERN WelsSampleSatd16x8_sse2
%assign push_num 0
LOAD_4_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
push r0
@@ -291,6 +298,7 @@
psrlw xmm6, 1
SSE2_SumWHorizon xmm6,xmm4,xmm7
movd retrd, xmm6
+ POP_XMM
LOAD_4_PARA_POP
ret
@@ -302,6 +310,7 @@
WELS_EXTERN WelsSampleSatd16x16_sse2
%assign push_num 0
LOAD_4_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
push r0
@@ -328,6 +337,7 @@
psrlw xmm6, 1
SSE2_SumWHorizon xmm6,xmm4,xmm7
movd retrd, xmm6
+ POP_XMM
LOAD_4_PARA_POP
ret
@@ -976,6 +986,7 @@
WELS_EXTERN WelsSampleSatd4x4_sse41
%assign push_num 0
LOAD_4_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
movdqa xmm4,[HSwapSumSubDB1]
@@ -1017,6 +1028,7 @@
pabsw xmm2,xmm2
pmaxsw xmm0,xmm2
SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7
+ POP_XMM
LOAD_4_PARA_POP
ret
@@ -1032,6 +1044,7 @@
%endif
%assign push_num 2
LOAD_4_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
movdqa xmm7, [HSumSubDB1]
@@ -1043,6 +1056,7 @@
lea r2, [r2+4*r3]
SSE41_GetSatd8x4
SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+ POP_XMM
LOAD_4_PARA_POP
%ifdef X86_32
pop r5
@@ -1063,6 +1077,7 @@
%endif
%assign push_num 3
LOAD_4_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
movdqa xmm7, [HSumSubDB1]
@@ -1078,6 +1093,7 @@
cmp r6, 4
jl loop_get_satd_8x16
SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+ POP_XMM
LOAD_4_PARA_POP
%ifdef X86_32
pop r6
@@ -1098,6 +1114,7 @@
%endif
%assign push_num 2
LOAD_4_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
push r0
@@ -1121,6 +1138,7 @@
lea r2, [r2+4*r3]
SSE41_GetSatd8x4
SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+ POP_XMM
LOAD_4_PARA_POP
%ifdef X86_32
pop r5
@@ -1142,6 +1160,7 @@
%endif
%assign push_num 3
LOAD_4_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
@@ -1174,6 +1193,7 @@
cmp r6, 4
jl loop_get_satd_16x16_right
SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+ POP_XMM
LOAD_4_PARA_POP
%ifdef X86_32
pop r6
@@ -1261,6 +1281,7 @@
%assign push_num 2
LOAD_4_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
lea r4, [3*r1]
@@ -1280,6 +1301,7 @@
movhlps xmm0, xmm7
paddw xmm0, xmm7
movd retrd, xmm0
+ POP_XMM
LOAD_4_PARA_POP
%ifdef X86_32
pop r5
@@ -1322,6 +1344,7 @@
WELS_EXTERN WelsSampleSad8x16_sse2
%assign push_num 0
LOAD_4_PARA
+ PUSH_XMM 7
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
pxor xmm6, xmm6
@@ -1340,6 +1363,7 @@
movhlps xmm0, xmm6
paddw xmm0, xmm6
movd retrd, xmm0
+ POP_XMM
LOAD_4_PARA_POP
ret
@@ -1362,6 +1386,7 @@
push r5
%endif
%assign push_num 3
+ PUSH_XMM 8
mov r0, arg1
mov r1, arg2
SIGN_EXTENSION r1, r1d
@@ -1454,6 +1479,7 @@
movhlps xmm0, xmm7
paddw xmm0, xmm7
movd retrd, xmm0
+ POP_XMM
%ifdef X86_32
pop r5
pop r4
@@ -1466,6 +1492,7 @@
pop r2
%assign push_num 0
LOAD_4_PARA
+ PUSH_XMM 7
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
pxor xmm6, xmm6
@@ -1476,6 +1503,7 @@
movhlps xmm0, xmm6
paddw xmm0, xmm6
movd retrd, xmm0
+ POP_XMM
LOAD_4_PARA_POP
.return:
ret
@@ -1510,6 +1538,7 @@
WELS_EXTERN WelsSampleSadFour16x16_sse2
%assign push_num 0
LOAD_5_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
@@ -1620,6 +1649,7 @@
punpckldq xmm6, xmm7
punpcklqdq xmm4, xmm6
movdqa [r4],xmm4
+ POP_XMM
LOAD_5_PARA_POP
ret
@@ -1627,6 +1657,7 @@
WELS_EXTERN WelsSampleSadFour16x8_sse2
%assign push_num 0
LOAD_5_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
@@ -1705,6 +1736,7 @@
punpckldq xmm6, xmm7
punpcklqdq xmm4, xmm6
movdqa [r4],xmm4
+ POP_XMM
LOAD_5_PARA_POP
ret
@@ -1711,6 +1743,7 @@
WELS_EXTERN WelsSampleSadFour8x16_sse2
%assign push_num 0
LOAD_5_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
@@ -1915,6 +1948,7 @@
punpckldq xmm6, xmm7
punpcklqdq xmm4, xmm6
movdqa [r4],xmm4
+ POP_XMM
LOAD_5_PARA_POP
ret
@@ -1922,6 +1956,7 @@
WELS_EXTERN WelsSampleSadFour8x8_sse2
%assign push_num 0
LOAD_5_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
@@ -2035,6 +2070,7 @@
punpckldq xmm6, xmm7
punpcklqdq xmm4, xmm6
movdqa [r4],xmm4
+ POP_XMM
LOAD_5_PARA_POP
ret
--- a/codec/common/vaa.asm
+++ b/codec/common/vaa.asm
@@ -149,6 +149,7 @@
%assign push_num 0
LOAD_2_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1,r1d
%ifdef X86_32
@@ -232,6 +233,7 @@
pop r4
pop r3
%endif
+ POP_XMM
ret
@@ -242,6 +244,7 @@
%assign push_num 0
LOAD_2_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1,r1d
%ifdef X86_32
@@ -325,6 +328,7 @@
pop r4
pop r3
%endif
+ POP_XMM
ret
--- a/codec/decoder/core/asm/block_add.asm
+++ b/codec/decoder/core/asm/block_add.asm
@@ -55,6 +55,7 @@
WELS_EXTERN WelsResBlockZero16x16_sse2
%assign push_num 0
LOAD_2_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
lea r1, [r1*2]
lea r2, [r1*3]
@@ -116,6 +117,7 @@
movdqa [r0+r2], xmm7
movdqa [r0+r2+10h], xmm7
+ POP_XMM
ret
@@ -125,6 +127,7 @@
WELS_EXTERN WelsResBlockZero8x8_sse2
%assign push_num 0
LOAD_2_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
lea r1, [r1*2]
lea r2, [r1*3]
@@ -143,5 +146,6 @@
movdqa [r0+r2], xmm7
+ POP_XMM
ret
--- a/codec/decoder/core/asm/intra_pred.asm
+++ b/codec/decoder/core/asm/intra_pred.asm
@@ -223,6 +223,7 @@
push r4
%assign push_num 2
LOAD_2_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
mov r4, r0 ; save r0 in r4
sub r0, 1
@@ -302,6 +303,7 @@
cmp r2, 16
jnz get_i16x16_luma_pred_plane_sse2_1
+ POP_XMM
pop r4
pop r3
ret
@@ -387,6 +389,7 @@
push r4
%assign push_num 2
LOAD_2_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
mov r4, r0
sub r0, 1
@@ -465,6 +468,7 @@
cmp r2, 8
jnz get_i_chroma_pred_plane_sse2_1
+ POP_XMM
pop r4
pop r3
WELSEMMS
@@ -1181,6 +1185,7 @@
WELS_EXTERN WelsDecoderI16x16LumaPredDcTop_sse2
%assign push_num 0
LOAD_2_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
mov r2, r0
sub r2, r1
@@ -1243,6 +1248,7 @@
movdqa [r0+2*r1], xmm0
movdqa [r0+r2], xmm1
+ POP_XMM
ret
;*******************************************************************************
@@ -1355,6 +1361,7 @@
WELS_EXTERN WelsDecoderIChromaPredDcTop_sse2
%assign push_num 0
LOAD_2_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
mov r2, r0
sub r2, r1
@@ -1384,6 +1391,7 @@
movq [r0+r1], xmm0
movq [r0+2*r1], xmm0
movq [r0+r2], xmm0
+ POP_XMM
ret
;*******************************************************************************
--- a/codec/decoder/plus/inc/welsDecoderExt.h
+++ b/codec/decoder/plus/inc/welsDecoderExt.h
@@ -104,8 +104,6 @@
void InitDecoder (void);
void UninitDecoder (void);
-XMMREG_PROTECT_DECLARE(CWelsH264Decoder);
-
#ifdef OUTPUT_BIT_STREAM
WelsFileHandle* m_pFBS;
WelsFileHandle* m_pFBSSize;
--- a/codec/decoder/plus/src/welsDecoderExt.cpp
+++ b/codec/decoder/plus/src/welsDecoderExt.cpp
@@ -101,7 +101,6 @@
m_pTrace = CreateWelsTrace (Wels_Trace_Type);
IWelsTrace::WelsVTrace (m_pTrace, IWelsTrace::WELS_LOG_INFO, "CWelsDecoder::CWelsDecoder() entry");
- XMMREG_PROTECT_INIT(CWelsH264Decoder);
#ifdef OUTPUT_BIT_STREAM
SWelsTime sCurTime;
@@ -167,7 +166,6 @@
IWelsTrace::WelsVTrace (m_pTrace, IWelsTrace::WELS_LOG_INFO, "CWelsDecoder::~CWelsDecoder()");
UninitDecoder();
- XMMREG_PROTECT_UNINIT(CWelsH264Decoder);
#ifdef OUTPUT_BIT_STREAM
if (m_pFBS) {
@@ -361,10 +359,8 @@
m_pDecContext->iFeedbackTidInAu = -1; //initialize
- XMMREG_PROTECT_STORE(CWelsH264Decoder);
WelsDecodeBs (m_pDecContext, kpSrc, kiSrcLen, (unsigned char**)ppDst,
pDstInfo); //iErrorCode has been modified in this function
- XMMREG_PROTECT_LOAD(CWelsH264Decoder);
if (m_pDecContext->iErrorCode) {
ENalUnitType eNalType =
--- a/codec/encoder/core/asm/dct.asm
+++ b/codec/encoder/core/asm/dct.asm
@@ -290,6 +290,7 @@
WELS_EXTERN WelsDctFourT4_sse2
%assign push_num 0
LOAD_5_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r2, r2d
SIGN_EXTENSION r4, r4d
pxor xmm7, xmm7
@@ -327,6 +328,7 @@
lea r0, [r0+64]
SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
+ POP_XMM
LOAD_5_PARA_POP
ret
@@ -337,6 +339,7 @@
WELS_EXTERN WelsIDctFourT4Rec_sse2
%assign push_num 0
LOAD_5_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
;Load 4x8
@@ -376,6 +379,7 @@
lea r2, [r2 + 2 * r3]
SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [r0], [r2]
SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [r0 + r1], [r2 + r3]
+ POP_XMM
LOAD_5_PARA_POP
; pop esi
; pop ebx
@@ -394,6 +398,7 @@
WELS_EXTERN WelsIDctRecI16x16Dc_sse2
%assign push_num 0
LOAD_5_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
pxor xmm7, xmm7
@@ -430,6 +435,7 @@
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
+ POP_XMM
LOAD_5_PARA_POP
ret
@@ -468,6 +474,7 @@
WELS_EXTERN WelsHadamardT4Dc_sse2
%assign push_num 0
LOAD_2_PARA
+ PUSH_XMM 8
SSE2_Load4Col xmm1, xmm5, xmm6, xmm0, r1
SSE2_Load4Col xmm2, xmm5, xmm6, xmm0, r1 + 0x40
SSE2_Load4Col xmm3, xmm5, xmm6, xmm0, r1 + 0x100
@@ -493,4 +500,5 @@
movdqa [r0+ 0], xmm3
movdqa [r0+16], xmm2
+ POP_XMM
ret
--- a/codec/encoder/core/asm/intra_pred.asm
+++ b/codec/encoder/core/asm/intra_pred.asm
@@ -229,6 +229,7 @@
push r4
%assign push_num 2
LOAD_3_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r2, r2d
sub r1, 1
sub r1, r2
@@ -304,6 +305,7 @@
inc r3
cmp r3, 16
jnz get_i16x16_luma_pred_plane_sse2_1
+ POP_XMM
pop r4
pop r3
ret
@@ -384,6 +386,7 @@
push r4
%assign push_num 2
LOAD_3_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r2, r2d
sub r1, 1
sub r1, r2
@@ -458,6 +461,7 @@
inc r3
cmp r3, 8
jnz get_i_chroma_pred_plane_sse2_1
+ POP_XMM
pop r4
pop r3
WELSEMMS
--- a/codec/encoder/core/asm/quant.asm
+++ b/codec/encoder/core/asm/quant.asm
@@ -136,6 +136,7 @@
WELS_EXTERN WelsQuantFour4x4Max_sse2
%assign push_num 0
LOAD_4_PARA
+ PUSH_XMM 8
MOVDQ xmm2, [r1]
MOVDQ xmm3, [r2]
@@ -161,6 +162,7 @@
pmaxsw xmm0, xmm1
movq [r3], xmm0
+ POP_XMM
LOAD_4_PARA_POP
ret
--- a/codec/encoder/plus/inc/welsEncoderExt.h
+++ b/codec/encoder/plus/inc/welsEncoderExt.h
@@ -132,8 +132,6 @@
void InitEncoder (void);
int32_t RawData2SrcPic (const uint8_t* pSrc);
void DumpSrcPicture (const uint8_t* pSrc);
-
- XMMREG_PROTECT_DECLARE(CWelsH264SVCEncoder);
};
}
#endif // !defined(AFX_WELSH264ENCODER_H__D9FAA1D1_5403_47E1_8E27_78F11EE65F02__INCLUDED_)
--- a/codec/encoder/plus/src/welsEncoderExt.cpp
+++ b/codec/encoder/plus/src/welsEncoderExt.cpp
@@ -138,7 +138,6 @@
#endif//OUTPUT_BIT_STREAM
InitEncoder();
- XMMREG_PROTECT_INIT(CWelsH264SVCEncoder);
}
CWelsH264SVCEncoder::~CWelsH264SVCEncoder() {
@@ -172,7 +171,6 @@
#endif//OUTPUT_BIT_STREAM
Uninitialize();
- XMMREG_PROTECT_UNINIT(CWelsH264SVCEncoder);
}
void CWelsH264SVCEncoder::InitEncoder (void) {
@@ -551,9 +549,7 @@
int32_t iFrameTypeReturned = 0;
int32_t iFrameType = videoFrameTypeInvalid;
- XMMREG_PROTECT_STORE(CWelsH264SVCEncoder);
const int32_t kiEncoderReturn = WelsEncoderEncodeExt (m_pEncContext, pBsInfo, pSrcPic);
- XMMREG_PROTECT_LOAD(CWelsH264SVCEncoder);
if(kiEncoderReturn == ENC_RETURN_MEMALLOCERR) {
WelsUninitEncoderExt (&m_pEncContext);
--- a/codec/processing/src/adaptivequantization/AdaptiveQuantization.cpp
+++ b/codec/processing/src/adaptivequantization/AdaptiveQuantization.cpp
@@ -49,11 +49,9 @@
m_pfVar = NULL;
WelsMemset (&m_sAdaptiveQuantParam, 0, sizeof (m_sAdaptiveQuantParam));
WelsInitVarFunc (m_pfVar, m_CPUFlag);
- XMMREG_PROTECT_INIT(AdaptiveQuantization);
}
CAdaptiveQuantization::~CAdaptiveQuantization() {
- XMMREG_PROTECT_UNINIT(AdaptiveQuantization);
}
EResult CAdaptiveQuantization::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
@@ -102,7 +100,6 @@
pRefFrameTmp = pRefFrameY;
pCurFrameTmp = pCurFrameY;
for (i = 0; i < iMbWidth; i++) {
- XMMREG_PROTECT_STORE(AdaptiveQuantization);
iSumDiff = pVaaCalcResults->pSad8x8[iMbIndex][0];
iSumDiff += pVaaCalcResults->pSad8x8[iMbIndex][1];
iSumDiff += pVaaCalcResults->pSad8x8[iMbIndex][2];
@@ -111,7 +108,6 @@
iSQDiff = pVaaCalcResults->pSsd16x16[iMbIndex];
uiSum = pVaaCalcResults->pSum16x16[iMbIndex];
iSQSum = pVaaCalcResults->pSumOfSquare16x16[iMbIndex];
- XMMREG_PROTECT_LOAD(AdaptiveQuantization);
iSumDiff = iSumDiff >> 8;
pMotionTexture->uiMotionIndex = (iSQDiff >> 8) - (iSumDiff * iSumDiff);
@@ -134,9 +130,7 @@
pRefFrameTmp = pRefFrameY;
pCurFrameTmp = pCurFrameY;
for (i = 0; i < iMbWidth; i++) {
- XMMREG_PROTECT_STORE(AdaptiveQuantization);
m_pfVar (pRefFrameTmp, iRefStride, pCurFrameTmp, iCurStride, pMotionTexture);
- XMMREG_PROTECT_LOAD(AdaptiveQuantization);
dAverageMotionIndex += pMotionTexture->uiMotionIndex;
dAverageTextureIndex += pMotionTexture->uiTextureIndex;
pMotionTexture++;
--- a/codec/processing/src/adaptivequantization/AdaptiveQuantization.h
+++ b/codec/processing/src/adaptivequantization/AdaptiveQuantization.h
@@ -84,7 +84,6 @@
PVarFunc m_pfVar;
int32_t m_CPUFlag;
SAdaptiveQuantizationParam m_sAdaptiveQuantParam;
- XMMREG_PROTECT_DECLARE(AdaptiveQuantization);
};
WELSVP_NAMESPACE_END
--- a/codec/processing/src/asm/denoisefilter.asm
+++ b/codec/processing/src/asm/denoisefilter.asm
@@ -176,6 +176,7 @@
push r3
%assign push_num 1
LOAD_2_PARA
+ PUSH_XMM 8
pxor xmm7, xmm7
@@ -212,6 +213,7 @@
movq [r3], xmm5
+ POP_XMM
pop r3
%assign push_num 0
--- a/codec/processing/src/asm/vaa.asm
+++ b/codec/processing/src/asm/vaa.asm
@@ -452,6 +452,7 @@
push r15
%assign push_num 4
LOAD_5_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r1,r1d
SIGN_EXTENSION r3,r3d
@@ -521,6 +522,7 @@
sub r1, r0
mov [r4+2], r1w ; to store uiTextureIndex
+ POP_XMM
LOAD_5_PARA_POP
pop r15
pop r14
@@ -552,6 +554,7 @@
push r13
%assign push_num 2
LOAD_7_PARA
+ PUSH_XMM 8
SIGN_EXTENSION r2,r2d
SIGN_EXTENSION r3,r3d
SIGN_EXTENSION r4,r4d
@@ -619,6 +622,7 @@
%undef psadframe
%undef psad8x8
%undef pushsize
+ POP_XMM
LOAD_7_PARA_POP
pop r13
pop r12
@@ -785,6 +789,7 @@
push r14
push r15
%assign push_num 4
+ PUSH_XMM 8
%ifdef WIN64
mov r4, arg5 ;iPicStride
@@ -880,6 +885,7 @@
paddd xmm7, xmm5
movd [r15], xmm7
+ POP_XMM
pop r15
pop r14
pop r13
@@ -1082,6 +1088,7 @@
push r14
push r15
%assign push_num 4
+ PUSH_XMM 10
%ifdef WIN64
mov r4,arg5
@@ -1192,6 +1199,7 @@
mov r13, psadframe
movd [r13], xmm8
+ POP_XMM
pop r15
pop r14
pop r13
@@ -1648,6 +1656,7 @@
push r14
push r15
%assign push_num 4
+ PUSH_XMM 10
%ifdef WIN64
mov r4,arg5
; mov r5,arg6
@@ -1773,6 +1782,7 @@
mov r13, psadframe
movd [r13], xmm8
+ POP_XMM
pop r15
pop r14
pop r13
@@ -1821,6 +1831,7 @@
push r14
push r15
%assign push_num 4
+ PUSH_XMM 10
%ifdef WIN64
mov r4,arg5
;mov r5,arg6
@@ -1993,6 +2004,7 @@
mov r14, psadframe
movd [r14], xmm8
+ POP_XMM
pop r15
pop r14
pop r13