ref: eec968234d3af6bb734ef31594f0255d5e4d0598
parent: 918b211990ec2ab891cac748aa3561d5b0db74f8
author: Martin Storsjö <[email protected]>
date: Sun Mar 16 09:23:24 EDT 2014
Fold ALIGN 16 and the function label into WELS_EXTERN. This simplifies the structure for all x86 assembly functions, reducing the amount of duplicated code structure.
--- a/codec/common/asm_inc.asm
+++ b/codec/common/asm_inc.asm
@@ -342,6 +342,7 @@
%endmacro
%macro WELS_EXTERN 1
+ ALIGN 16
%ifdef PREFIX
global _%1
%define %1 _%1
@@ -348,6 +349,7 @@
%else
global %1
%endif
+ %1:
%endmacro
%macro WELS_AbsW 2
--- a/codec/common/cpuid.asm
+++ b/codec/common/cpuid.asm
@@ -55,12 +55,10 @@
; refer to "The IA-32 Intel(R) Architecture Software Developers Manual, Volume 2A A-M"
; section CPUID - CPU Identification
-WELS_EXTERN WelsCPUIdVerify
-ALIGN 16
;******************************************************************************************
; int32_t WelsCPUIdVerify()
;******************************************************************************************
-WelsCPUIdVerify:
+WELS_EXTERN WelsCPUIdVerify
push r1
PUSHRFLAGS
PUSHRFLAGS
@@ -73,14 +71,12 @@
pop r1
ret
-WELS_EXTERN WelsCPUId
-ALIGN 16
;****************************************************************************************************
; void WelsCPUId( int32_t uiIndex, int32_t *pFeatureA, int32_t *pFeatureB, int32_t *pFeatureC, int32_t *pFeatureD )
;****************************************************************************************************
%ifdef WIN64
-WelsCPUId:
+WELS_EXTERN WelsCPUId
push rbx
push rdx
@@ -98,7 +94,7 @@
ret
%elifdef UNIX64
-WelsCPUId:
+WELS_EXTERN WelsCPUId
push rbx
push rcx
push rdx
@@ -118,7 +114,7 @@
%elifdef X86_32
-WelsCPUId:
+WELS_EXTERN WelsCPUId
push ebx
push edi
@@ -143,13 +139,11 @@
%endif
-WELS_EXTERN WelsCPUSupportAVX
; need call after cpuid=1 and eax, ecx flag got then
-ALIGN 16
;****************************************************************************************************
; int32_t WelsCPUSupportAVX( uint32_t eax, uint32_t ecx )
;****************************************************************************************************
-WelsCPUSupportAVX:
+WELS_EXTERN WelsCPUSupportAVX
%ifdef WIN64
mov eax, ecx
mov ecx, edx
@@ -178,13 +172,11 @@
ret
-WELS_EXTERN WelsCPUSupportFMA
; need call after cpuid=1 and eax, ecx flag got then
-ALIGN 16
;****************************************************************************************************
; int32_t WelsCPUSupportFMA( uint32_t eax, uint32_t ecx )
;****************************************************************************************************
-WelsCPUSupportFMA:
+WELS_EXTERN WelsCPUSupportFMA
%ifdef WIN64
mov eax, ecx
mov ecx, edx
@@ -211,12 +203,10 @@
mov eax, 0
ret
-WELS_EXTERN WelsEmms
-ALIGN 16
;******************************************************************************************
; void WelsEmms()
;******************************************************************************************
-WelsEmms:
+WELS_EXTERN WelsEmms
emms ; empty mmx technology states
ret
--- a/codec/common/deblock.asm
+++ b/codec/common/deblock.asm
@@ -61,8 +61,6 @@
WELS_EXTERN DeblockLumaLt4V_ssse3
-
-DeblockLumaLt4V_ssse3:
push rbp
mov r11,[rsp + 16 + 20h] ; pTC
sub rsp,1B0h
@@ -318,9 +316,6 @@
WELS_EXTERN DeblockLumaEq4V_ssse3
-
-ALIGN 16
-DeblockLumaEq4V_ssse3:
mov rax,rsp
push rbx
push rbp
@@ -781,9 +776,6 @@
WELS_EXTERN DeblockChromaLt4V_ssse3
-
-ALIGN 16
-DeblockChromaLt4V_ssse3:
mov rax,rsp
push rbx
push rdi
@@ -943,8 +935,6 @@
WELS_EXTERN DeblockChromaEq4V_ssse3
-ALIGN 16
-DeblockChromaEq4V_ssse3:
mov rax,rsp
push rbx
sub rsp,90h
@@ -1097,8 +1087,6 @@
WELS_EXTERN DeblockChromaEq4H_ssse3
-ALIGN 16
-DeblockChromaEq4H_ssse3:
mov rax,rsp
mov [rax+20h],rbx
push rdi
@@ -1361,8 +1349,6 @@
WELS_EXTERN DeblockChromaLt4H_ssse3
-ALIGN 16
-DeblockChromaLt4H_ssse3:
mov rax,rsp
push rbx
push rbp
@@ -1647,8 +1633,6 @@
WELS_EXTERN DeblockLumaLt4V_ssse3
-
-DeblockLumaLt4V_ssse3:
push rbp
mov r11,r8 ; pTC
sub rsp,1B0h
@@ -1904,9 +1888,6 @@
WELS_EXTERN DeblockLumaEq4V_ssse3
-
-ALIGN 16
-DeblockLumaEq4V_ssse3:
mov rax,rsp
push rbx
push rbp
@@ -2366,8 +2347,6 @@
ret
WELS_EXTERN DeblockChromaLt4V_ssse3
-ALIGN 16
-DeblockChromaLt4V_ssse3:
mov rax,rsp
push rbx
push rbp
@@ -2534,8 +2513,6 @@
ret
WELS_EXTERN DeblockChromaEq4V_ssse3
-
-DeblockChromaEq4V_ssse3:
mov rax,rsp
push rbx
push rbp
@@ -2685,9 +2662,6 @@
ret
WELS_EXTERN DeblockChromaEq4H_ssse3
-
-ALIGN 16
-DeblockChromaEq4H_ssse3:
mov rax,rsp
push rbx
push rbp
@@ -2960,8 +2934,6 @@
WELS_EXTERN DeblockChromaLt4H_ssse3
-ALIGN 16
-DeblockChromaLt4H_ssse3:
mov rax,rsp
push rbx
push rbp
@@ -3256,9 +3228,6 @@
; int32_t iAlpha, int32_t iBeta)
;********************************************************************************
WELS_EXTERN DeblockChromaEq4V_ssse3
-
-ALIGN 16
-DeblockChromaEq4V_ssse3:
push ebp
mov ebp,esp
and esp,0FFFFFFF0h
@@ -3426,8 +3395,6 @@
;*******************************************************************************
WELS_EXTERN DeblockChromaLt4V_ssse3
-
-DeblockChromaLt4V_ssse3:
push ebp
mov ebp,esp
and esp,0FFFFFFF0h
@@ -3629,10 +3596,6 @@
;***************************************************************************
WELS_EXTERN DeblockChromaEq4H_ssse3
-
-ALIGN 16
-
-DeblockChromaEq4H_ssse3:
push ebp
mov ebp,esp
and esp,0FFFFFFF0h
@@ -3914,10 +3877,6 @@
;*******************************************************************************
WELS_EXTERN DeblockChromaLt4H_ssse3
-
-ALIGN 16
-
-DeblockChromaLt4H_ssse3:
push ebp
mov ebp,esp
and esp,0FFFFFFF0h
@@ -4230,10 +4189,6 @@
WELS_EXTERN DeblockLumaLt4V_ssse3
-
-ALIGN 16
-
-DeblockLumaLt4V_ssse3:
push ebp
mov ebp, esp
and esp, -16 ; fffffff0H
@@ -4620,12 +4575,9 @@
; int32_t iBeta)
;*******************************************************************************
+
WELS_EXTERN DeblockLumaEq4V_ssse3
-ALIGN 16
-
-DeblockLumaEq4V_ssse3:
-
push ebp
mov ebp, esp
and esp, -16 ; fffffff0H
@@ -5174,10 +5126,6 @@
;********************************************************************************
WELS_EXTERN DeblockLumaTransposeH2V_sse2
-
-ALIGN 16
-
-DeblockLumaTransposeH2V_sse2:
push r3
push r4
push r5
@@ -5253,10 +5201,6 @@
;*******************************************************************************************
WELS_EXTERN DeblockLumaTransposeV2H_sse2
-
-ALIGN 16
-
-DeblockLumaTransposeV2H_sse2:
push r3
push r4
--- a/codec/common/expand_picture.asm
+++ b/codec/common/expand_picture.asm
@@ -67,9 +67,6 @@
SECTION .text
-WELS_EXTERN ExpandPictureLuma_sse2
-WELS_EXTERN ExpandPictureChromaAlign_sse2 ; for chroma alignment
-WELS_EXTERN ExpandPictureChromaUnalign_sse2 ; for chroma unalignment
;;;;;;;expanding result;;;;;;;
@@ -360,7 +357,6 @@
%endif
%endmacro
-ALIGN 16
;***********************************************************************----------------
; void ExpandPictureLuma_sse2( uint8_t *pDst,
; const int32_t iStride,
@@ -367,7 +363,7 @@
; const int32_t iWidth,
; const int32_t iHeight );
;***********************************************************************----------------
-ExpandPictureLuma_sse2:
+WELS_EXTERN ExpandPictureLuma_sse2
push r4
push r5
@@ -487,7 +483,6 @@
ret
-ALIGN 16
;***********************************************************************----------------
; void ExpandPictureChromaAlign_sse2( uint8_t *pDst,
; const int32_t iStride,
@@ -494,7 +489,7 @@
; const int32_t iWidth,
; const int32_t iHeight );
;***********************************************************************----------------
-ExpandPictureChromaAlign_sse2:
+WELS_EXTERN ExpandPictureChromaAlign_sse2
push r4
push r5
@@ -613,7 +608,6 @@
ret
-ALIGN 16
;***********************************************************************----------------
; void ExpandPictureChromaUnalign_sse2( uint8_t *pDst,
; const int32_t iStride,
@@ -620,7 +614,7 @@
; const int32_t iWidth,
; const int32_t iHeight );
;***********************************************************************----------------
-ExpandPictureChromaUnalign_sse2:
+WELS_EXTERN ExpandPictureChromaUnalign_sse2
push r4
push r5
push r6
--- a/codec/common/mb_copy.asm
+++ b/codec/common/mb_copy.asm
@@ -54,12 +54,6 @@
SECTION .text
-WELS_EXTERN WelsCopy16x16_sse2
-WELS_EXTERN WelsCopy16x16NotAligned_sse2
-WELS_EXTERN WelsCopy8x8_mmx
-WELS_EXTERN WelsCopy16x8NotAligned_sse2 ;
-WELS_EXTERN WelsCopy8x16_mmx ;
-WELS_EXTERN UpdateMbMv_sse2 ;
;***********************************************************************
; void WelsCopy16x16_sse2( uint8_t* Dst,
@@ -67,8 +61,7 @@
; uint8_t* Src,
; int32_t iStrideS )
;***********************************************************************
-ALIGN 16
-WelsCopy16x16_sse2:
+WELS_EXTERN WelsCopy16x16_sse2
push r4
push r5
@@ -130,9 +123,8 @@
; uint8_t* Src,
; int32_t iStrideS )
;***********************************************************************
-ALIGN 16
; dst can be align with 16 bytes, but not sure about pSrc, 12/29/2011
-WelsCopy16x16NotAligned_sse2:
+WELS_EXTERN WelsCopy16x16NotAligned_sse2
push r4
push r5
%assign push_num 2
@@ -194,8 +186,7 @@
; uint8_t* Src,
; int32_t iStrideS )
;***********************************************************************
-ALIGN 16
-WelsCopy16x8NotAligned_sse2:
+WELS_EXTERN WelsCopy16x8NotAligned_sse2
push r4
push r5
%assign push_num 2
@@ -235,8 +226,7 @@
; uint8_t* Src,
; int32_t iStrideS )
;***********************************************************************
-ALIGN 16
-WelsCopy8x16_mmx:
+WELS_EXTERN WelsCopy8x16_mmx
%assign push_num 0
LOAD_4_PARA
@@ -300,8 +290,7 @@
; uint8_t* Src,
; int32_t iStrideS )
;***********************************************************************
-ALIGN 16
-WelsCopy8x8_mmx:
+WELS_EXTERN WelsCopy8x8_mmx
push r4
%assign push_num 1
LOAD_4_PARA
@@ -349,8 +338,7 @@
;***********************************************************************
; void UpdateMbMv_sse2( SMVUnitXY *pMvBuffer, const SMVUnitXY sMv )
;***********************************************************************
-ALIGN 16
-UpdateMbMv_sse2:
+WELS_EXTERN UpdateMbMv_sse2
%assign push_num 0
LOAD_2_PARA
@@ -385,16 +373,9 @@
SECTION .text
-WELS_EXTERN PixelAvgWidthEq4_mmx
-WELS_EXTERN PixelAvgWidthEq8_mmx
-WELS_EXTERN PixelAvgWidthEq16_sse2
-WELS_EXTERN McCopyWidthEq4_mmx
-WELS_EXTERN McCopyWidthEq8_mmx
-WELS_EXTERN McCopyWidthEq16_sse2
-ALIGN 16
;*******************************************************************************
; void PixelAvgWidthEq4_mmx( uint8_t *pDst, int iDstStride,
; uint8_t *pSrcA, int iSrcAStride,
@@ -401,7 +382,7 @@
; uint8_t *pSrcB, int iSrcBStride,
; int iHeight );
;*******************************************************************************
-PixelAvgWidthEq4_mmx:
+WELS_EXTERN PixelAvgWidthEq4_mmx
%assign push_num 0
LOAD_7_PARA
@@ -428,7 +409,6 @@
ret
-ALIGN 16
;*******************************************************************************
; void PixelAvgWidthEq8_mmx( uint8_t *pDst, int iDstStride,
; uint8_t *pSrcA, int iSrcAStride,
@@ -435,7 +415,7 @@
; uint8_t *pSrcB, int iSrcBStride,
; int iHeight );
;*******************************************************************************
-PixelAvgWidthEq8_mmx:
+WELS_EXTERN PixelAvgWidthEq8_mmx
%assign push_num 0
LOAD_7_PARA
@@ -466,7 +446,6 @@
-ALIGN 16
;*******************************************************************************
; void PixelAvgWidthEq16_sse2( uint8_t *pDst, int iDstStride,
; uint8_t *pSrcA, int iSrcAStride,
@@ -473,7 +452,7 @@
; uint8_t *pSrcB, int iSrcBStride,
; int iHeight );
;*******************************************************************************
-PixelAvgWidthEq16_sse2:
+WELS_EXTERN PixelAvgWidthEq16_sse2
%assign push_num 0
LOAD_7_PARA
@@ -519,12 +498,11 @@
LOAD_7_PARA_POP
ret
-ALIGN 16
;*******************************************************************************
; void McCopyWidthEq4_mmx( uint8_t *pSrc, int iSrcStride,
; uint8_t *pDst, int iDstStride, int iHeight )
;*******************************************************************************
-McCopyWidthEq4_mmx:
+WELS_EXTERN McCopyWidthEq4_mmx
push r5
%assign push_num 1
LOAD_5_PARA
@@ -547,12 +525,11 @@
pop r5
ret
-ALIGN 16
;*******************************************************************************
; void McCopyWidthEq8_mmx( uint8_t *pSrc, int iSrcStride,
; uint8_t *pDst, int iDstStride, int iHeight )
;*******************************************************************************
-McCopyWidthEq8_mmx:
+WELS_EXTERN McCopyWidthEq8_mmx
%assign push_num 0
LOAD_5_PARA
@@ -574,7 +551,6 @@
ret
-ALIGN 16
;*******************************************************************************
; void McCopyWidthEq16_sse2( uint8_t *pSrc, int iSrcStride, uint8_t *pDst, int iDstStride, int iHeight )
;*******************************************************************************
@@ -589,7 +565,7 @@
movq [%1], %2
movhps [%1+8], %2
%endmacro
-McCopyWidthEq16_sse2:
+WELS_EXTERN McCopyWidthEq16_sse2
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
--- a/codec/common/mc_chroma.asm
+++ b/codec/common/mc_chroma.asm
@@ -65,7 +65,6 @@
SECTION .text
-ALIGN 16
;*******************************************************************************
; void McChromaWidthEq4_mmx( const uint8_t *src,
; int32_t iSrcStride,
@@ -75,7 +74,6 @@
; int32_t iHeigh );
;*******************************************************************************
WELS_EXTERN McChromaWidthEq4_mmx
-McChromaWidthEq4_mmx:
%assign push_num 0
LOAD_6_PARA
SIGN_EXTENSION r1, r1d
@@ -140,7 +138,6 @@
ret
-ALIGN 16
;*******************************************************************************
; void McChromaWidthEq8_sse2( const uint8_t *pSrc,
; int32_t iSrcStride,
@@ -150,7 +147,6 @@
; int32_t iheigh );
;*******************************************************************************
WELS_EXTERN McChromaWidthEq8_sse2
-McChromaWidthEq8_sse2:
%assign push_num 0
LOAD_6_PARA
SIGN_EXTENSION r1, r1d
@@ -219,7 +215,6 @@
-ALIGN 16
;***********************************************************************
; void McChromaWidthEq8_ssse3( const uint8_t *pSrc,
; int32_t iSrcStride,
@@ -229,7 +224,6 @@
; int32_t iHeigh);
;***********************************************************************
WELS_EXTERN McChromaWidthEq8_ssse3
-McChromaWidthEq8_ssse3:
%assign push_num 0
LOAD_6_PARA
SIGN_EXTENSION r1, r1d
--- a/codec/common/mc_luma.asm
+++ b/codec/common/mc_luma.asm
@@ -71,10 +71,8 @@
SECTION .text
-WELS_EXTERN McHorVer20WidthEq4_mmx
-ALIGN 16
;*******************************************************************************
; void McHorVer20WidthEq4_mmx( const uint8_t *pSrc,
; int iSrcStride,
@@ -82,7 +80,7 @@
; int iDstStride,
; int iHeight)
;*******************************************************************************
-McHorVer20WidthEq4_mmx:
+WELS_EXTERN McHorVer20WidthEq4_mmx
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
@@ -161,12 +159,7 @@
;*******************************************************************************
SECTION .text
-WELS_EXTERN McHorVer22Width8HorFirst_sse2
-WELS_EXTERN McHorVer02WidthEq8_sse2
-WELS_EXTERN McHorVer20WidthEq8_sse2
-WELS_EXTERN McHorVer20WidthEq16_sse2
-ALIGN 16
;***********************************************************************
; void McHorVer22Width8HorFirst_sse2(const int16_t *pSrc,
; int16_t iSrcStride,
@@ -175,7 +168,7 @@
; int32_t iHeight
; )
;***********************************************************************
-McHorVer22Width8HorFirst_sse2:
+WELS_EXTERN McHorVer22Width8HorFirst_sse2
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
@@ -217,7 +210,6 @@
LOAD_5_PARA_POP
ret
-ALIGN 16
;*******************************************************************************
; void McHorVer20WidthEq8_sse2( const uint8_t *pSrc,
; int iSrcStride,
@@ -226,7 +218,7 @@
; int iHeight,
; );
;*******************************************************************************
-McHorVer20WidthEq8_sse2:
+WELS_EXTERN McHorVer20WidthEq8_sse2
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
@@ -272,7 +264,6 @@
LOAD_5_PARA_POP
ret
-ALIGN 16
;*******************************************************************************
; void McHorVer20WidthEq16_sse2( const uint8_t *pSrc,
; int iSrcStride,
@@ -281,7 +272,7 @@
; int iHeight,
; );
;*******************************************************************************
-McHorVer20WidthEq16_sse2:
+WELS_EXTERN McHorVer20WidthEq16_sse2
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
@@ -361,8 +352,7 @@
; int iDstStride,
; int iHeight )
;*******************************************************************************
-ALIGN 16
-McHorVer02WidthEq8_sse2:
+WELS_EXTERN McHorVer02WidthEq8_sse2
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
@@ -443,11 +433,6 @@
SECTION .text
-WELS_EXTERN McHorVer20Width9Or17_sse2
-WELS_EXTERN McHorVer02Height9Or17_sse2
-WELS_EXTERN McHorVer22Width8VerLastAlign_sse2
-WELS_EXTERN McHorVer22Width8VerLastUnAlign_sse2
-WELS_EXTERN McHorVer22HorFirst_sse2
;***********************************************************************
@@ -458,8 +443,7 @@
; int32_t iWidth,
; int32_t iHeight )
;***********************************************************************
-ALIGN 16
-McHorVer02Height9Or17_sse2:
+WELS_EXTERN McHorVer02Height9Or17_sse2
%assign push_num 0
LOAD_6_PARA
SIGN_EXTENSION r1, r1d
@@ -583,7 +567,6 @@
ret
-ALIGN 16
;***********************************************************************
; void McHorVer20Width9Or17_sse2( const uint8_t *pSrc,
; int32_t iSrcStride,
@@ -593,7 +576,7 @@
; int32_t iHeight
; );
;***********************************************************************
-McHorVer20Width9Or17_sse2:
+WELS_EXTERN McHorVer20Width9Or17_sse2
%assign push_num 0
LOAD_6_PARA
SIGN_EXTENSION r1, r1d
@@ -742,7 +725,6 @@
-ALIGN 16
;***********************************************************************
;void McHorVer22HorFirst_sse2
; (const uint8_t *pSrc,
@@ -751,7 +733,7 @@
; int32_t iTapStride,
; int32_t iWidth,int32_t iHeight);
;***********************************************************************
-McHorVer22HorFirst_sse2:
+WELS_EXTERN McHorVer22HorFirst_sse2
%assign push_num 0
LOAD_6_PARA
SIGN_EXTENSION r1, r1d
@@ -918,7 +900,7 @@
; int32_t iHeight);
;***********************************************************************
- McHorVer22Width8VerLastAlign_sse2:
+WELS_EXTERN McHorVer22Width8VerLastAlign_sse2
%assign push_num 0
LOAD_6_PARA
SIGN_EXTENSION r1, r1d
@@ -1047,7 +1029,7 @@
; int32_t iHeight);
;***********************************************************************
- McHorVer22Width8VerLastUnAlign_sse2:
+WELS_EXTERN McHorVer22Width8VerLastUnAlign_sse2
%assign push_num 0
LOAD_6_PARA
SIGN_EXTENSION r1, r1d
--- a/codec/common/satd_sad.asm
+++ b/codec/common/satd_sad.asm
@@ -156,8 +156,6 @@
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd4x4_sse2
-align 16
-WelsSampleSatd4x4_sse2:
%assign push_num 0
LOAD_4_PARA
SIGN_EXTENSION r1, r1d
@@ -229,9 +227,7 @@
;int32_t WelsSampleSatd8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
;
;***********************************************************************
- WELS_EXTERN WelsSampleSatd8x8_sse2
-align 16
- WelsSampleSatd8x8_sse2:
+WELS_EXTERN WelsSampleSatd8x8_sse2
%assign push_num 0
LOAD_4_PARA
SIGN_EXTENSION r1, r1d
@@ -250,9 +246,7 @@
;int32_t WelsSampleSatd8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
;
;***********************************************************************
- WELS_EXTERN WelsSampleSatd8x16_sse2
-align 16
- WelsSampleSatd8x16_sse2:
+WELS_EXTERN WelsSampleSatd8x16_sse2
%assign push_num 0
LOAD_4_PARA
SIGN_EXTENSION r1, r1d
@@ -277,8 +271,6 @@
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd16x8_sse2
-align 16
-WelsSampleSatd16x8_sse2:
%assign push_num 0
LOAD_4_PARA
SIGN_EXTENSION r1, r1d
@@ -308,8 +300,6 @@
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd16x16_sse2
-align 16
-WelsSampleSatd16x16_sse2:
%assign push_num 0
LOAD_4_PARA
SIGN_EXTENSION r1, r1d
@@ -484,7 +474,6 @@
%ifdef X86_32
WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
-WelsIntra16x16Combined3Satd_sse41:
push ebx
push esi
push edi
@@ -678,7 +667,6 @@
;for reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41
WELS_EXTERN WelsIntraChroma8x8Combined3Satd_sse41
-WelsIntraChroma8x8Combined3Satd_sse41:
push ebx
push esi
push edi
@@ -782,7 +770,6 @@
;
;***********************************************************************
WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
-WelsIntra16x16Combined3Sad_ssse3:
push ebx
push esi
push edi
@@ -987,7 +974,6 @@
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd4x4_sse41
-WelsSampleSatd4x4_sse41:
%assign push_num 0
LOAD_4_PARA
SIGN_EXTENSION r1, r1d
@@ -1040,8 +1026,6 @@
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd8x8_sse41
-align 16
-WelsSampleSatd8x8_sse41:
%ifdef X86_32
push r4
push r5
@@ -1072,8 +1056,6 @@
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd8x16_sse41
-align 16
-WelsSampleSatd8x16_sse41:
%ifdef X86_32
push r4
push r5
@@ -1110,8 +1092,6 @@
;
;***********************************************************************
WELS_EXTERN WelsSampleSatd16x8_sse41
-align 16
-WelsSampleSatd16x8_sse41:
%ifdef X86_32
push r4
push r5
@@ -1155,8 +1135,6 @@
;***********************************************************************
WELS_EXTERN WelsSampleSatd16x16_sse41
-align 16
-WelsSampleSatd16x16_sse41:
%ifdef X86_32
push r4
push r5
@@ -1276,8 +1254,6 @@
;
;***********************************************************************
WELS_EXTERN WelsSampleSad16x16_sse2
-align 16
-WelsSampleSad16x16_sse2:
%ifdef X86_32
push r4
push r5
@@ -1319,8 +1295,6 @@
;
;***********************************************************************
WELS_EXTERN WelsSampleSad16x8_sse2
-align 16
-WelsSampleSad16x8_sse2:
%assign push_num 0
LOAD_4_PARA
SIGN_EXTENSION r1, r1d
@@ -1346,7 +1320,6 @@
WELS_EXTERN WelsSampleSad8x16_sse2
-WelsSampleSad8x16_sse2:
%assign push_num 0
LOAD_4_PARA
SIGN_EXTENSION r1, r1d
@@ -1377,7 +1350,6 @@
%endmacro
WELS_EXTERN WelsSampleSad8x8_sse21
-WelsSampleSad8x8_sse21:
%assign push_num 0
mov r2, arg3
push r2
@@ -1536,7 +1508,6 @@
paddw xmm7, %4
%endmacro
WELS_EXTERN WelsSampleSadFour16x16_sse2
-WelsSampleSadFour16x16_sse2:
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
@@ -1654,7 +1625,6 @@
WELS_EXTERN WelsSampleSadFour16x8_sse2
-WelsSampleSadFour16x8_sse2:
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
@@ -1739,7 +1709,6 @@
ret
WELS_EXTERN WelsSampleSadFour8x16_sse2
-WelsSampleSadFour8x16_sse2:
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
@@ -1951,7 +1920,6 @@
WELS_EXTERN WelsSampleSadFour8x8_sse2
-WelsSampleSadFour8x8_sse2:
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
@@ -2071,7 +2039,6 @@
ret
WELS_EXTERN WelsSampleSadFour4x4_sse2
-WelsSampleSadFour4x4_sse2:
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
@@ -2144,13 +2111,10 @@
;
;***********************************************************************
-WELS_EXTERN WelsSampleSad4x4_mmx
-
-align 16
;***********************************************************************
; int32_t WelsSampleSad4x4_mmx (uint8_t *, int32_t, uint8_t *, int32_t )
;***********************************************************************
-WelsSampleSad4x4_mmx:
+WELS_EXTERN WelsSampleSad4x4_mmx
%assign push_num 0
LOAD_4_PARA
SIGN_EXTENSION r1, r1d
--- a/codec/common/vaa.asm
+++ b/codec/common/vaa.asm
@@ -152,12 +152,10 @@
; , 6/7/2010
-WELS_EXTERN AnalysisVaaInfoIntra_sse2
;***********************************************************************
; int32_t AnalysisVaaInfoIntra_sse2( uint8_t *pDataY, const int32_t iLineSize );
;***********************************************************************
-ALIGN 16
-AnalysisVaaInfoIntra_sse2:
+WELS_EXTERN AnalysisVaaInfoIntra_sse2
%assign push_num 0
LOAD_2_PARA
@@ -247,12 +245,10 @@
ret
-WELS_EXTERN AnalysisVaaInfoIntra_ssse3
;***********************************************************************
; int32_t AnalysisVaaInfoIntra_ssse3( uint8_t *pDataY, const int32_t iLineSize );
;***********************************************************************
-ALIGN 16
-AnalysisVaaInfoIntra_ssse3:
+WELS_EXTERN AnalysisVaaInfoIntra_ssse3
%assign push_num 0
LOAD_2_PARA
@@ -342,12 +338,10 @@
ret
-WELS_EXTERN MdInterAnalysisVaaInfo_sse41
;***********************************************************************
; uint8_t MdInterAnalysisVaaInfo_sse41( int32_t *pSad8x8 )
;***********************************************************************
-ALIGN 16
-MdInterAnalysisVaaInfo_sse41:
+WELS_EXTERN MdInterAnalysisVaaInfo_sse41
%assign push_num 0
LOAD_1_PARA
movdqa xmm0,[r0]
@@ -378,12 +372,10 @@
mov retrd, 15
ret
-WELS_EXTERN MdInterAnalysisVaaInfo_sse2
;***********************************************************************
; uint8_t MdInterAnalysisVaaInfo_sse2( int32_t *pSad8x8 )
;***********************************************************************
-ALIGN 16
-MdInterAnalysisVaaInfo_sse2:
+WELS_EXTERN MdInterAnalysisVaaInfo_sse2
%assign push_num 0
LOAD_1_PARA
movdqa xmm0, [r0]
--- a/codec/decoder/core/asm/block_add.asm
+++ b/codec/decoder/core/asm/block_add.asm
@@ -49,13 +49,10 @@
SECTION .text
-WELS_EXTERN WelsResBlockZero16x16_sse2
-
-ALIGN 16
;*******************************************************************************
; void WelsResBlockZero16x16_sse2(int16_t* pBlock,int32_t iStride)
;*******************************************************************************
-WelsResBlockZero16x16_sse2:
+WELS_EXTERN WelsResBlockZero16x16_sse2
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
@@ -122,13 +119,10 @@
ret
-WELS_EXTERN WelsResBlockZero8x8_sse2
-
-ALIGN 16
;*******************************************************************************
; void WelsResBlockZero8x8_sse2(int16_t * pBlock, int32_t iStride)
;*******************************************************************************
-WelsResBlockZero8x8_sse2:
+WELS_EXTERN WelsResBlockZero8x8_sse2
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
--- a/codec/decoder/core/asm/dct.asm
+++ b/codec/decoder/core/asm/dct.asm
@@ -83,14 +83,11 @@
SECTION .text
-WELS_EXTERN IdctResAddPred_mmx
-
-ALIGN 16
;*******************************************************************************
; void IdctResAddPred_mmx( uint8_t *pPred, const int32_t kiStride, int16_t *pRs )
;*******************************************************************************
-IdctResAddPred_mmx:
+WELS_EXTERN IdctResAddPred_mmx
%assign push_num 0
LOAD_3_PARA
SIGN_EXTENSION r1, r1d
--- a/codec/decoder/core/asm/intra_pred.asm
+++ b/codec/decoder/core/asm/intra_pred.asm
@@ -177,18 +177,14 @@
;*******************************************************************************
SECTION .text
-WELS_EXTERN WelsDecoderI4x4LumaPredH_sse2
-WELS_EXTERN WelsDecoderI4x4LumaPredDDR_mmx
-WELS_EXTERN WelsDecoderI16x16LumaPredPlane_sse2
-ALIGN 16
;*******************************************************************************
; void WelsDecoderI4x4LumaPredH_sse2(uint8_t *pPred, const int32_t kiStride)
;
; pPred must align to 16
;*******************************************************************************
-WelsDecoderI4x4LumaPredH_sse2:
+WELS_EXTERN WelsDecoderI4x4LumaPredH_sse2
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
@@ -222,7 +218,7 @@
;*******************************************************************************
; void WelsDecoderI16x16LumaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
;*******************************************************************************
-WelsDecoderI16x16LumaPredPlane_sse2:
+WELS_EXTERN WelsDecoderI16x16LumaPredPlane_sse2
push r3
push r4
%assign push_num 2
@@ -326,7 +322,6 @@
%endmacro
WELS_EXTERN WelsDecoderI16x16LumaPredH_sse2
-WelsDecoderI16x16LumaPredH_sse2:
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
@@ -350,7 +345,6 @@
; void WelsDecoderI16x16LumaPredV_sse2(uint8_t *pPred, const int32_t kiStride);
;*******************************************************************************
WELS_EXTERN WelsDecoderI16x16LumaPredV_sse2
-WelsDecoderI16x16LumaPredV_sse2:
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
@@ -389,7 +383,6 @@
; void WelsDecoderIChromaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
;*******************************************************************************
WELS_EXTERN WelsDecoderIChromaPredPlane_sse2
-WelsDecoderIChromaPredPlane_sse2:
push r3
push r4
%assign push_num 2
@@ -477,7 +470,6 @@
WELSEMMS
ret
-ALIGN 16
;*******************************************************************************
; 0 |1 |2 |3 |4 |
; 6 |7 |8 |9 |10|
@@ -490,7 +482,7 @@
; void WelsDecoderI4x4LumaPredDDR_mmx(uint8_t *pPred, const int32_t kiStride)
;
;*******************************************************************************
-WelsDecoderI4x4LumaPredDDR_mmx:
+WELS_EXTERN WelsDecoderI4x4LumaPredDDR_mmx
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
@@ -536,7 +528,6 @@
ret
-ALIGN 16
;*******************************************************************************
; void WelsDecoderIChromaPredH_mmx(uint8_t *pPred, const int32_t kiStride)
; copy 8 pixel of 8 line from left
@@ -560,7 +551,6 @@
%endmacro
WELS_EXTERN WelsDecoderIChromaPredH_mmx
-WelsDecoderIChromaPredH_mmx:
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
@@ -597,13 +587,11 @@
ret
-ALIGN 16
;*******************************************************************************
; void WelsDecoderIChromaPredV_mmx(uint8_t *pPred, const int32_t kiStride)
; copy 8 pixels from top 8 pixels
;*******************************************************************************
WELS_EXTERN WelsDecoderIChromaPredV_mmx
-WelsDecoderIChromaPredV_mmx:
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
@@ -627,7 +615,6 @@
ret
- ALIGN 16
;*******************************************************************************
; lt|t0|t1|t2|t3|
; l0|
@@ -658,7 +645,6 @@
; void WelsDecoderI4x4LumaPredHD_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredHD_mmx
-WelsDecoderI4x4LumaPredHD_mmx:
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
@@ -714,7 +700,6 @@
-ALIGN 16
;*******************************************************************************
; lt|t0|t1|t2|t3|
; l0|
@@ -742,7 +727,6 @@
; void WelsDecoderI4x4LumaPredHU_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredHU_mmx
-WelsDecoderI4x4LumaPredHU_mmx:
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
@@ -799,7 +783,6 @@
-ALIGN 16
;*******************************************************************************
; lt|t0|t1|t2|t3|
; l0|
@@ -829,7 +812,6 @@
; void WelsDecoderI4x4LumaPredVR_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredVR_mmx
-WelsDecoderI4x4LumaPredVR_mmx:
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
@@ -889,7 +871,6 @@
WELSEMMS
ret
-ALIGN 16
;*******************************************************************************
; lt|t0|t1|t2|t3|t4|t5|t6|t7
; l0|
@@ -917,7 +898,6 @@
; void WelsDecoderI4x4LumaPredDDL_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredDDL_mmx
-WelsDecoderI4x4LumaPredDDL_mmx:
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
@@ -956,7 +936,6 @@
ret
-ALIGN 16
;*******************************************************************************
; lt|t0|t1|t2|t3|t4|t5|t6|t7
; l0|
@@ -987,7 +966,6 @@
; void WelsDecoderI4x4LumaPredVL_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredVL_mmx
-WelsDecoderI4x4LumaPredVL_mmx:
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
@@ -1023,13 +1001,11 @@
WELSEMMS
ret
-ALIGN 16
;*******************************************************************************
;
; void WelsDecoderIChromaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderIChromaPredDc_sse2
-WelsDecoderIChromaPredDc_sse2:
push r3
push r4
%assign push_num 2
@@ -1120,13 +1096,11 @@
-ALIGN 16
;*******************************************************************************
;
; void WelsDecoderI16x16LumaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderI16x16LumaPredDc_sse2
-WelsDecoderI16x16LumaPredDc_sse2:
push r3
push r4
%assign push_num 2
@@ -1201,12 +1175,10 @@
; for intra prediction as follows, 11/19/2010
;*******************************************************************************
-ALIGN 16
;*******************************************************************************
; void WelsDecoderI16x16LumaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderI16x16LumaPredDcTop_sse2
-WelsDecoderI16x16LumaPredDcTop_sse2:
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
@@ -1273,12 +1245,10 @@
ret
-ALIGN 16
;*******************************************************************************
; void WelsDecoderI16x16LumaPredDcNA_sse2(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderI16x16LumaPredDcNA_sse2
-WelsDecoderI16x16LumaPredDcNA_sse2:
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
@@ -1308,12 +1278,10 @@
ret
-ALIGN 16
;*******************************************************************************
; void WelsDecoderIChromaPredDcLeft_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderIChromaPredDcLeft_mmx
-WelsDecoderIChromaPredDcLeft_mmx:
push r3
push r4
%assign push_num 2
@@ -1381,12 +1349,10 @@
emms
ret
-ALIGN 16
;*******************************************************************************
; void WelsDecoderIChromaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderIChromaPredDcTop_sse2
-WelsDecoderIChromaPredDcTop_sse2:
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
@@ -1420,12 +1386,10 @@
movq [r0+r2], xmm0
ret
-ALIGN 16
;*******************************************************************************
; void WelsDecoderIChromaPredDcNA_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderIChromaPredDcNA_mmx
-WelsDecoderIChromaPredDcNA_mmx:
%assign push_num 0
LOAD_2_PARA
SIGN_EXTENSION r1, r1d
--- a/codec/encoder/core/asm/coeff.asm
+++ b/codec/encoder/core/asm/coeff.asm
@@ -323,7 +323,6 @@
;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
;***********************************************************************
WELS_EXTERN CavlcParamCal_sse2
-CavlcParamCal_sse2:
push ebx
push edi
push esi
--- a/codec/encoder/core/asm/dct.asm
+++ b/codec/encoder/core/asm/dct.asm
@@ -130,12 +130,10 @@
movd %5, %1
%endmacro
SECTION .text
-ALIGN 16
;***********************************************************************
; void WelsDctT4_mmx( int16_t *pDct[4], uint8_t *pix1, int32_t i_pix1, uint8_t *pix2, int32_t i_pix2 )
;***********************************************************************
WELS_EXTERN WelsDctT4_mmx
-WelsDctT4_mmx:
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r2, r2d
@@ -163,7 +161,6 @@
; void WelsIDctT4Rec_mmx(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs)
;***********************************************************************
WELS_EXTERN WelsIDctT4Rec_mmx
-WelsIDctT4Rec_mmx:
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
@@ -291,8 +288,6 @@
; void WelsDctFourT4_sse2(int16_t *pDct, uint8_t *pix1, int32_t i_pix1, uint8_t *pix2, int32_t i_pix2 )
;***********************************************************************
WELS_EXTERN WelsDctFourT4_sse2
-ALIGN 16
-WelsDctFourT4_sse2:
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r2, r2d
@@ -340,8 +335,6 @@
; void WelsIDctFourT4Rec_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs);
;***********************************************************************
WELS_EXTERN WelsIDctFourT4Rec_sse2
-ALIGN 16
-WelsIDctFourT4Rec_sse2:
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
@@ -399,8 +392,6 @@
; void WelsIDctRecI16x16Dc_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *dct_dc)
;***********************************************************************
WELS_EXTERN WelsIDctRecI16x16Dc_sse2
-ALIGN 16
-WelsIDctRecI16x16Dc_sse2:
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
@@ -475,7 +466,6 @@
;void WelsHadamardT4Dc_sse2( int16_t *luma_dc, int16_t *pDct)
;***********************************************************************
WELS_EXTERN WelsHadamardT4Dc_sse2
-WelsHadamardT4Dc_sse2:
%assign push_num 0
LOAD_2_PARA
SSE2_Load4Col xmm1, xmm5, xmm6, xmm0, r1
--- a/codec/encoder/core/asm/intra_pred.asm
+++ b/codec/encoder/core/asm/intra_pred.asm
@@ -188,18 +188,13 @@
;***********************************************************************
SECTION .text
-WELS_EXTERN WelsI4x4LumaPredH_sse2
-WELS_EXTERN WelsI4x4LumaPredDDR_mmx
-WELS_EXTERN WelsI4x4LumaPredDc_sse2
-WELS_EXTERN WelsI16x16LumaPredPlane_sse2
-ALIGN 16
;***********************************************************************
; void WelsI4x4LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
;
; pred must align to 16
;***********************************************************************
-WelsI4x4LumaPredH_sse2:
+WELS_EXTERN WelsI4x4LumaPredH_sse2
push r3
%assign push_num 1
LOAD_3_PARA
@@ -233,7 +228,7 @@
;***********************************************************************
; void WelsI16x16LumaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
;***********************************************************************
-WelsI16x16LumaPredPlane_sse2:
+WELS_EXTERN WelsI16x16LumaPredPlane_sse2
push r3
push r4
%assign push_num 2
@@ -330,7 +325,6 @@
%endmacro
WELS_EXTERN WelsI16x16LumaPredH_sse2
-WelsI16x16LumaPredH_sse2:
push r3
%assign push_num 1
LOAD_3_PARA
@@ -361,7 +355,6 @@
; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
;***********************************************************************
WELS_EXTERN WelsI16x16LumaPredV_sse2
-WelsI16x16LumaPredV_sse2:
%assign push_num 0
LOAD_3_PARA
SIGN_EXTENSION r2, r2d
@@ -391,7 +384,6 @@
; void WelsIChromaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
;***********************************************************************
WELS_EXTERN WelsIChromaPredPlane_sse2
-WelsIChromaPredPlane_sse2:
push r3
push r4
%assign push_num 2
@@ -475,7 +467,6 @@
WELSEMMS
ret
-ALIGN 16
;***********************************************************************
; 0 |1 |2 |3 |4 |
; 6 |7 |8 |9 |10|
@@ -488,7 +479,7 @@
; void WelsI4x4LumaPredDDR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;
;***********************************************************************
-WelsI4x4LumaPredDDR_mmx:
+WELS_EXTERN WelsI4x4LumaPredDDR_mmx
%assign push_num 0
LOAD_3_PARA
SIGN_EXTENSION r2, r2d
@@ -529,7 +520,6 @@
WELSEMMS
ret
-ALIGN 16
;***********************************************************************
; 0 |1 |2 |3 |4 |
; 5 |6 |7 |8 |9 |
@@ -542,7 +532,7 @@
; void WelsI4x4LumaPredDc_sse2(uint8_t *pred,uint8_t *pRef,int32_t stride)
;
;***********************************************************************
-WelsI4x4LumaPredDc_sse2:
+WELS_EXTERN WelsI4x4LumaPredDc_sse2
push r3
push r4
%assign push_num 2
@@ -576,7 +566,6 @@
pop r3
ret
-ALIGN 16
;***********************************************************************
; void WelsIChromaPredH_mmx(uint8_t *pred, uint8_t *pRef, int32_t stride)
; copy 8 pixel of 8 line from left
@@ -602,7 +591,6 @@
%endmacro
WELS_EXTERN WelsIChromaPredH_mmx
-WelsIChromaPredH_mmx:
%assign push_num 0
LOAD_3_PARA
SIGN_EXTENSION r2, r2d
@@ -633,13 +621,11 @@
WELSEMMS
ret
-ALIGN 16
;***********************************************************************
; void WelsI4x4LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
; copy pixels from top 4 pixels
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredV_sse2
-WelsI4x4LumaPredV_sse2:
%assign push_num 0
LOAD_3_PARA
SIGN_EXTENSION r2, r2d
@@ -649,13 +635,11 @@
movdqa [r0], xmm0
ret
-ALIGN 16
;***********************************************************************
; void WelsIChromaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
; copy 8 pixels from top 8 pixels
;***********************************************************************
WELS_EXTERN WelsIChromaPredV_sse2
-WelsIChromaPredV_sse2:
%assign push_num 0
LOAD_3_PARA
SIGN_EXTENSION r2, r2d
@@ -669,7 +653,6 @@
movdqa [r0+48], xmm0
ret
- ALIGN 16
;***********************************************************************
; lt|t0|t1|t2|t3|
; l0|
@@ -700,7 +683,6 @@
; void WelsI4x4LumaPredHD_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredHD_mmx
-WelsI4x4LumaPredHD_mmx:
%assign push_num 0
LOAD_3_PARA
SIGN_EXTENSION r2, r2d
@@ -751,7 +733,6 @@
WELSEMMS
ret
-ALIGN 16
;***********************************************************************
; lt|t0|t1|t2|t3|
; l0|
@@ -779,7 +760,6 @@
; void WelsI4x4LumaPredHU_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredHU_mmx
-WelsI4x4LumaPredHU_mmx:
%assign push_num 0
LOAD_3_PARA
SIGN_EXTENSION r2, r2d
@@ -832,7 +812,6 @@
-ALIGN 16
;***********************************************************************
; lt|t0|t1|t2|t3|
; l0|
@@ -862,7 +841,6 @@
; void WelsI4x4LumaPredVR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredVR_mmx
-WelsI4x4LumaPredVR_mmx:
%assign push_num 0
LOAD_3_PARA
SIGN_EXTENSION r2, r2d
@@ -920,7 +898,6 @@
WELSEMMS
ret
-ALIGN 16
;***********************************************************************
; lt|t0|t1|t2|t3|t4|t5|t6|t7
; l0|
@@ -948,7 +925,6 @@
; void WelsI4x4LumaPredDDL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredDDL_mmx
-WelsI4x4LumaPredDDL_mmx:
%assign push_num 0
LOAD_3_PARA
SIGN_EXTENSION r2, r2d
@@ -985,7 +961,6 @@
ret
-ALIGN 16
;***********************************************************************
; lt|t0|t1|t2|t3|t4|t5|t6|t7
; l0|
@@ -1016,7 +991,6 @@
; void WelsI4x4LumaPredVL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredVL_mmx
-WelsI4x4LumaPredVL_mmx:
%assign push_num 0
LOAD_3_PARA
SIGN_EXTENSION r2, r2d
@@ -1049,13 +1023,11 @@
WELSEMMS
ret
-ALIGN 16
;***********************************************************************
;
; void WelsIChromaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
;***********************************************************************
WELS_EXTERN WelsIChromaPredDc_sse2
-WelsIChromaPredDc_sse2:
push r3
push r4
%assign push_num 2
@@ -1141,13 +1113,11 @@
-ALIGN 16
;***********************************************************************
;
; void WelsI16x16LumaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
;***********************************************************************
WELS_EXTERN WelsI16x16LumaPredDc_sse2
-WelsI16x16LumaPredDc_sse2:
push r3
push r4
%assign push_num 2
@@ -1210,8 +1180,6 @@
;***********************************************************************
%ifdef X86_32
WELS_EXTERN WelsSampleSatdThree4x4_sse2
-align 16
-WelsSampleSatdThree4x4_sse2:
push ebx
push esi
push edi
--- a/codec/encoder/core/asm/memzero.asm
+++ b/codec/encoder/core/asm/memzero.asm
@@ -47,12 +47,10 @@
SECTION .text
-ALIGN 16
;***********************************************************************
;void WelsPrefetchZero_mmx(int8_t const*_A);
;***********************************************************************
WELS_EXTERN WelsPrefetchZero_mmx
-WelsPrefetchZero_mmx:
%assign push_num 0
LOAD_1_PARA
prefetchnta [r0]
@@ -59,12 +57,10 @@
ret
-ALIGN 16
;***********************************************************************
; void WelsSetMemZeroAligned64_sse2(void *dst, int32_t size)
;***********************************************************************
WELS_EXTERN WelsSetMemZeroAligned64_sse2
-WelsSetMemZeroAligned64_sse2:
%assign push_num 0
LOAD_2_PARA
@@ -84,12 +80,10 @@
ret
-ALIGN 16
;***********************************************************************
; void WelsSetMemZeroSize64_mmx(void *dst, int32_t size)
;***********************************************************************
WELS_EXTERN WelsSetMemZeroSize64_mmx
-WelsSetMemZeroSize64_mmx:
%assign push_num 0
LOAD_2_PARA
@@ -114,12 +108,10 @@
WELSEMMS
ret
-ALIGN 16
;***********************************************************************
; void WelsSetMemZeroSize8_mmx(void *dst, int32_t size)
;***********************************************************************
WELS_EXTERN WelsSetMemZeroSize8_mmx
-WelsSetMemZeroSize8_mmx:
%assign push_num 0
LOAD_2_PARA
--- a/codec/encoder/core/asm/quant.asm
+++ b/codec/encoder/core/asm/quant.asm
@@ -83,8 +83,6 @@
; void WelsQuant4x4_sse2(int16_t *pDct, int16_t* ff, int16_t *mf);
;***********************************************************************
WELS_EXTERN WelsQuant4x4_sse2
-align 16
-WelsQuant4x4_sse2:
%assign push_num 0
LOAD_3_PARA
movdqa xmm2, [r1]
@@ -99,8 +97,6 @@
;void WelsQuant4x4Dc_sse2(int16_t *pDct, const int16_t ff, int16_t mf);
;***********************************************************************
WELS_EXTERN WelsQuant4x4Dc_sse2
-align 16
-WelsQuant4x4Dc_sse2:
%assign push_num 0
LOAD_3_PARA
SIGN_EXTENSION r1, r1w
@@ -118,8 +114,6 @@
; void WelsQuantFour4x4_sse2(int16_t *pDct, int16_t* ff, int16_t *mf);
;***********************************************************************
WELS_EXTERN WelsQuantFour4x4_sse2
-align 16
-WelsQuantFour4x4_sse2:
%assign push_num 0
LOAD_3_PARA
MOVDQ xmm2, [r1]
@@ -140,8 +134,6 @@
; void WelsQuantFour4x4Max_sse2(int16_t *pDct, int32_t* f, int16_t *mf, int16_t *max);
;***********************************************************************
WELS_EXTERN WelsQuantFour4x4Max_sse2
-align 16
-WelsQuantFour4x4Max_sse2:
%assign push_num 0
LOAD_4_PARA
MOVDQ xmm2, [r1]
@@ -195,8 +187,6 @@
;int32_t WelsHadamardQuant2x2_mmx(int16_t *rs, const int16_t ff, int16_t mf, int16_t * pDct, int16_t * block);
;***********************************************************************
WELS_EXTERN WelsHadamardQuant2x2_mmx
-align 16
-WelsHadamardQuant2x2_mmx:
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r1, r1w
@@ -253,8 +243,6 @@
;int32_t WelsHadamardQuant2x2Skip_mmx(int16_t *pDct, int16_t ff, int16_t mf);
;***********************************************************************
WELS_EXTERN WelsHadamardQuant2x2Skip_mmx
-align 16
-WelsHadamardQuant2x2Skip_mmx:
%assign push_num 0
LOAD_3_PARA
SIGN_EXTENSION r1, r1w
@@ -303,13 +291,10 @@
%endmacro
-ALIGN 16
;***********************************************************************
; void WelsDequant4x4_sse2(int16_t *pDct, const uint16_t* mf);
;***********************************************************************
-align 16
WELS_EXTERN WelsDequant4x4_sse2
-WelsDequant4x4_sse2:
%assign push_num 0
LOAD_2_PARA
@@ -323,10 +308,7 @@
;void WelsDequantFour4x4_sse2(int16_t *pDct, const uint16_t* mf);
;***********************************************************************====
-align 16
-
WELS_EXTERN WelsDequantFour4x4_sse2
-WelsDequantFour4x4_sse2:
%assign push_num 0
LOAD_2_PARA
@@ -346,8 +328,6 @@
;void WelsDequantIHadamard4x4_sse2(int16_t *rs, const uint16_t mf);
;***********************************************************************
WELS_EXTERN WelsDequantIHadamard4x4_sse2
-align 16
-WelsDequantIHadamard4x4_sse2:
%assign push_num 0
LOAD_2_PARA
%ifndef X86_32
--- a/codec/encoder/core/asm/score.asm
+++ b/codec/encoder/core/asm/score.asm
@@ -166,9 +166,7 @@
;***********************************************************************
;void WelsScan4x4DcAc_sse2( int16_t level[16], int16_t *pDct )
;***********************************************************************
-ALIGN 16
WELS_EXTERN WelsScan4x4DcAc_sse2
-WelsScan4x4DcAc_sse2:
%ifdef X86_32
push r3
%assign push_num 1
@@ -200,9 +198,7 @@
;***********************************************************************
;void WelsScan4x4DcAc_ssse3( int16_t level[16], int16_t *pDct )
;***********************************************************************
-ALIGN 16
WELS_EXTERN WelsScan4x4DcAc_ssse3
-WelsScan4x4DcAc_ssse3:
%assign push_num 0
LOAD_2_PARA
movdqa xmm0, [r1]
@@ -220,9 +216,7 @@
;***********************************************************************
;void WelsScan4x4Ac_sse2( int16_t* zig_value, int16_t* pDct )
;***********************************************************************
-ALIGN 16
WELS_EXTERN WelsScan4x4Ac_sse2
-WelsScan4x4Ac_sse2:
%assign push_num 0
LOAD_2_PARA
movdqa xmm0, [r1]
@@ -259,9 +253,7 @@
;***********************************************************************
;void int32_t WelsCalculateSingleCtr4x4_sse2( int16_t *pDct );
;***********************************************************************
-ALIGN 16
WELS_EXTERN WelsCalculateSingleCtr4x4_sse2
-WelsCalculateSingleCtr4x4_sse2:
%ifdef X86_32
push r3
%assign push_num 1
@@ -319,9 +311,7 @@
;***********************************************************************
; int32_t WelsGetNoneZeroCount_sse2(int16_t* level);
;***********************************************************************
-ALIGN 16
WELS_EXTERN WelsGetNoneZeroCount_sse2
-WelsGetNoneZeroCount_sse2:
%assign push_num 0
LOAD_1_PARA
movdqa xmm0, [r0]
--- a/codec/processing/src/asm/denoisefilter.asm
+++ b/codec/processing/src/asm/denoisefilter.asm
@@ -163,8 +163,6 @@
paddw %3, %2
%endmacro
-ALIGN 16
-WELS_EXTERN BilateralLumaFilter8_sse2
;***********************************************************************
; BilateralLumaFilter8_sse2(uint8_t *pixels, int stride);
;***********************************************************************
@@ -173,7 +171,7 @@
; 6 7 8
; 0: the center point
-BilateralLumaFilter8_sse2:
+WELS_EXTERN BilateralLumaFilter8_sse2
push r3
%assign push_num 1
@@ -219,7 +217,6 @@
ret
-WELS_EXTERN WaverageChromaFilter8_sse2
;***********************************************************************
; void WaverageChromaFilter8_sse2(uint8_t *pixels, int stride);
;***********************************************************************
@@ -230,8 +227,7 @@
;1 2 4 2 1
;1 1 2 1 1
-ALIGN 16
-WaverageChromaFilter8_sse2:
+WELS_EXTERN WaverageChromaFilter8_sse2
push r3
--- a/codec/processing/src/asm/downsample_bilinear.asm
+++ b/codec/processing/src/asm/downsample_bilinear.asm
@@ -66,8 +66,6 @@
db 01h, 80h, 03h, 80h, 05h, 80h, 07h, 80h, 09h, 80h, 0bh, 80h, 0dh, 80h, 0fh, 80h
-ALIGN 16
-
;***********************************************************************
; Code
;***********************************************************************
@@ -74,14 +72,12 @@
SECTION .text
-WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse
;***********************************************************************
; void DyadicBilinearDownsamplerWidthx32_sse( unsigned char* pDst, const int iDstStride,
; unsigned char* pSrc, const int iSrcStride,
; const int iSrcWidth, const int iSrcHeight );
;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx32_sse:
+WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse
push ebx
push edx
push esi
@@ -227,14 +223,12 @@
pop ebx
ret
-WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse
;***********************************************************************
; void DyadicBilinearDownsamplerWidthx16_sse( unsigned char* pDst, const int iDstStride,
; unsigned char* pSrc, const int iSrcStride,
; const int iSrcWidth, const int iSrcHeight );
;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx16_sse:
+WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse
push ebx
push edx
push esi
@@ -331,14 +325,12 @@
pop ebx
ret
-WELS_EXTERN DyadicBilinearDownsamplerWidthx8_sse
;***********************************************************************
; void DyadicBilinearDownsamplerWidthx8_sse( unsigned char* pDst, const int iDstStride,
; unsigned char* pSrc, const int iSrcStride,
; const int iSrcWidth, const int iSrcHeight );
;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx8_sse:
+WELS_EXTERN DyadicBilinearDownsamplerWidthx8_sse
push ebx
push edx
push esi
@@ -422,14 +414,12 @@
; got about 50% improvement over DyadicBilinearDownsamplerWidthx32_sse
-WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
;***********************************************************************
; void DyadicBilinearDownsamplerWidthx32_ssse3( unsigned char* pDst, const int iDstStride,
; unsigned char* pSrc, const int iSrcStride,
; const int iSrcWidth, const int iSrcHeight );
;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx32_ssse3:
+WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
push ebx
push edx
push esi
@@ -533,14 +523,12 @@
pop ebx
ret
-WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3
;***********************************************************************
; void DyadicBilinearDownsamplerWidthx16_ssse3( unsigned char* pDst, const int iDstStride,
; unsigned char* pSrc, const int iSrcStride,
; const int iSrcWidth, const int iSrcHeight );
;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx16_ssse3:
+WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3
push ebx
push edx
push esi
@@ -623,14 +611,12 @@
ret
; got about 65% improvement over DyadicBilinearDownsamplerWidthx32_sse
-WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse4
;***********************************************************************
; void DyadicBilinearDownsamplerWidthx32_sse4( unsigned char* pDst, const int iDstStride,
; unsigned char* pSrc, const int iSrcStride,
; const int iSrcWidth, const int iSrcHeight );
;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx32_sse4:
+WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse4
push ebx
push edx
push esi
@@ -733,14 +719,12 @@
pop ebx
ret
-WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse4
;***********************************************************************
; void DyadicBilinearDownsamplerWidthx16_sse4( unsigned char* pDst, const int iDstStride,
; unsigned char* pSrc, const int iSrcStride,
; const int iSrcWidth, const int iSrcHeight );
;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx16_sse4:
+WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse4
push ebx
push edx
push esi
@@ -825,7 +809,6 @@
-WELS_EXTERN GeneralBilinearAccurateDownsampler_sse2
;**************************************************************************************************************
;int GeneralBilinearAccurateDownsampler_sse2( unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
; unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
@@ -833,8 +816,7 @@
;{
;**************************************************************************************************************
-ALIGN 16
-GeneralBilinearAccurateDownsampler_sse2:
+WELS_EXTERN GeneralBilinearAccurateDownsampler_sse2
push ebp
push esi
push edi
@@ -1029,7 +1011,6 @@
-WELS_EXTERN GeneralBilinearFastDownsampler_sse2
;**************************************************************************************************************
;int GeneralBilinearFastDownsampler_sse2( unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
; unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
@@ -1037,8 +1018,7 @@
;{
;**************************************************************************************************************
-ALIGN 16
-GeneralBilinearFastDownsampler_sse2:
+WELS_EXTERN GeneralBilinearFastDownsampler_sse2
push ebp
push esi
push edi
--- a/codec/processing/src/asm/vaa.asm
+++ b/codec/processing/src/asm/vaa.asm
@@ -255,12 +255,10 @@
%ifdef X86_32
-WELS_EXTERN SampleVariance16x16_sse2
;***********************************************************************
; void SampleVariance16x16_sse2( uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
;***********************************************************************
-ALIGN 16
-SampleVariance16x16_sse2:
+WELS_EXTERN SampleVariance16x16_sse2
push esi
push edi
push ebx
@@ -357,7 +355,6 @@
-WELS_EXTERN VAACalcSad_sse2
;*************************************************************************************************************
;void VAACalcSad_sse2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight
; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8)
@@ -364,8 +361,7 @@
;*************************************************************************************************************
-ALIGN 16
-VAACalcSad_sse2:
+WELS_EXTERN VAACalcSad_sse2
%define cur_data esp + pushsize + 4
%define ref_data esp + pushsize + 8
%define iPicWidth esp + pushsize + 12
@@ -451,12 +447,10 @@
%else ;64-bit
-WELS_EXTERN SampleVariance16x16_sse2
;***********************************************************************
; void SampleVariance16x16_sse2( uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
;***********************************************************************
-ALIGN 16
-SampleVariance16x16_sse2:
+WELS_EXTERN SampleVariance16x16_sse2
%define SUM r10;[esp]
%define SUM_CUR r11;[esp+4]
%define SQR r13;[esp+8]
@@ -549,7 +543,6 @@
ret
-WELS_EXTERN VAACalcSad_sse2
;*************************************************************************************************************
;void VAACalcSad_sse2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight
; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8)
@@ -556,8 +549,7 @@
;*************************************************************************************************************
-ALIGN 16
-VAACalcSad_sse2:
+WELS_EXTERN VAACalcSad_sse2
%define cur_data r0
%define ref_data r1
%define iPicWidth r2
@@ -647,7 +639,6 @@
%ifdef X86_32
-WELS_EXTERN VAACalcSadVar_sse2
;*************************************************************************************************************
;void VAACalcSadVar_sse2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight
; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16)
@@ -654,8 +645,7 @@
;*************************************************************************************************************
-ALIGN 16
-VAACalcSadVar_sse2:
+WELS_EXTERN VAACalcSadVar_sse2
%define localsize 8
%define cur_data esp + pushsize + localsize + 4
%define ref_data esp + pushsize + localsize + 8
@@ -783,7 +773,6 @@
%else ;64-bit
-WELS_EXTERN VAACalcSadVar_sse2
;*************************************************************************************************************
;void VAACalcSadVar_sse2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight
; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16)
@@ -790,8 +779,7 @@
;*************************************************************************************************************
-ALIGN 16
-VAACalcSadVar_sse2:
+WELS_EXTERN VAACalcSadVar_sse2
%define cur_data arg1 ;r0
%define ref_data arg2 ;r1
%define iPicWidth arg3 ;r2
@@ -926,7 +914,6 @@
%ifdef X86_32
-WELS_EXTERN VAACalcSadSsd_sse2
;*************************************************************************************************************
;void VAACalcSadSsd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
; int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
@@ -933,8 +920,7 @@
;*************************************************************************************************************
-ALIGN 16
-VAACalcSadSsd_sse2:
+WELS_EXTERN VAACalcSadSsd_sse2
%define localsize 12
%define cur_data esp + pushsize + localsize + 4
%define ref_data esp + pushsize + localsize + 8
@@ -1082,7 +1068,6 @@
%else
-WELS_EXTERN VAACalcSadSsd_sse2
;*************************************************************************************************************
;void VAACalcSadSsd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
; int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
@@ -1089,8 +1074,7 @@
;*************************************************************************************************************
-ALIGN 16
-VAACalcSadSsd_sse2:
+WELS_EXTERN VAACalcSadSsd_sse2
%define localsize 12
%define cur_data arg1;r0
%define ref_data arg2;r1
@@ -1246,7 +1230,6 @@
%endif
%ifdef X86_32
-WELS_EXTERN VAACalcSadBgd_sse2
;*************************************************************************************************************
;void VAACalcSadBgd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8)
@@ -1253,8 +1236,7 @@
;*************************************************************************************************************
-ALIGN 16
-VAACalcSadBgd_sse2:
+WELS_EXTERN VAACalcSadBgd_sse2
%define localsize 12
%define cur_data esp + pushsize + localsize + 4
%define ref_data esp + pushsize + localsize + 8
@@ -1425,7 +1407,6 @@
-WELS_EXTERN VAACalcSadSsdBgd_sse2
;*************************************************************************************************************
;void VAACalcSadSsdBgd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16,
@@ -1433,8 +1414,7 @@
;*************************************************************************************************************
-ALIGN 16
-VAACalcSadSsdBgd_sse2:
+WELS_EXTERN VAACalcSadSsdBgd_sse2
%define localsize 16
%define cur_data esp + pushsize + localsize + 4
%define ref_data esp + pushsize + localsize + 8
@@ -1656,7 +1636,6 @@
ret
%else
-WELS_EXTERN VAACalcSadBgd_sse2
;*************************************************************************************************************
;void VAACalcSadBgd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8)
@@ -1663,8 +1642,7 @@
;*************************************************************************************************************
-ALIGN 16
-VAACalcSadBgd_sse2:
+WELS_EXTERN VAACalcSadBgd_sse2
%define cur_data arg1;
%define ref_data arg2;
%define iPicWidth arg3;
@@ -1827,7 +1805,6 @@
-WELS_EXTERN VAACalcSadSsdBgd_sse2
;*************************************************************************************************************
;void VAACalcSadSsdBgd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16,
@@ -1835,8 +1812,7 @@
;*************************************************************************************************************
-ALIGN 16
-VAACalcSadSsdBgd_sse2:
+WELS_EXTERN VAACalcSadSsdBgd_sse2
%define cur_data arg1;
%define ref_data arg2;
%define iPicWidth arg3;