shithub: openh264

Download patch

ref: eec968234d3af6bb734ef31594f0255d5e4d0598
parent: 918b211990ec2ab891cac748aa3561d5b0db74f8
author: Martin Storsjö <[email protected]>
date: Sun Mar 16 09:23:24 EDT 2014

Fold ALIGN 16 and the function label into WELS_EXTERN

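This simplifies the structure of all x86 assembly functions and
reduces duplicated boilerplate: WELS_EXTERN now emits the ALIGN 16
directive and the function label itself, so the individual functions
no longer repeat them. Functions that previously had no ALIGN 16
before their label are now 16-byte aligned as well.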

--- a/codec/common/asm_inc.asm
+++ b/codec/common/asm_inc.asm
@@ -342,6 +342,7 @@
 %endmacro
 
 %macro WELS_EXTERN 1
+    ALIGN 16
     %ifdef PREFIX
         global _%1
         %define %1 _%1
@@ -348,6 +349,7 @@
     %else
         global %1
     %endif
+    %1:
 %endmacro
 
 %macro WELS_AbsW 2
--- a/codec/common/cpuid.asm
+++ b/codec/common/cpuid.asm
@@ -55,12 +55,10 @@
 ; refer to "The IA-32 Intel(R) Architecture Software Developers Manual, Volume 2A A-M"
 ; section CPUID - CPU Identification
 
-WELS_EXTERN WelsCPUIdVerify
-ALIGN 16
 ;******************************************************************************************
 ;   int32_t WelsCPUIdVerify()
 ;******************************************************************************************
-WelsCPUIdVerify:
+WELS_EXTERN WelsCPUIdVerify
     push    r1
     PUSHRFLAGS
     PUSHRFLAGS
@@ -73,14 +71,12 @@
     pop      r1
     ret
 
-WELS_EXTERN WelsCPUId
-ALIGN 16
 ;****************************************************************************************************
 ;   void WelsCPUId( int32_t uiIndex, int32_t *pFeatureA, int32_t *pFeatureB, int32_t *pFeatureC, int32_t *pFeatureD )
 ;****************************************************************************************************
 %ifdef       WIN64
 
-WelsCPUId:
+WELS_EXTERN WelsCPUId
     push     rbx
     push     rdx
 
@@ -98,7 +94,7 @@
     ret
 
 %elifdef     UNIX64
-WelsCPUId:
+WELS_EXTERN WelsCPUId
     push     rbx
     push     rcx
     push     rdx
@@ -118,7 +114,7 @@
 
 %elifdef     X86_32
 
-WelsCPUId:
+WELS_EXTERN WelsCPUId
     push	ebx
     push	edi
 
@@ -143,13 +139,11 @@
 
 %endif
 
-WELS_EXTERN WelsCPUSupportAVX
 ; need call after cpuid=1 and eax, ecx flag got then
-ALIGN 16
 ;****************************************************************************************************
 ;   int32_t WelsCPUSupportAVX( uint32_t eax, uint32_t ecx )
 ;****************************************************************************************************
-WelsCPUSupportAVX:
+WELS_EXTERN WelsCPUSupportAVX
 %ifdef     WIN64
         mov   eax,    ecx
         mov   ecx,    edx
@@ -178,13 +172,11 @@
         ret
 
 
-WELS_EXTERN  WelsCPUSupportFMA
 ; need call after cpuid=1 and eax, ecx flag got then
-ALIGN 16
 ;****************************************************************************************************
 ;   int32_t WelsCPUSupportFMA( uint32_t eax, uint32_t ecx )
 ;****************************************************************************************************
-WelsCPUSupportFMA:
+WELS_EXTERN  WelsCPUSupportFMA
 %ifdef     WIN64
         mov   eax,   ecx
         mov   ecx,   edx
@@ -211,12 +203,10 @@
 	mov eax, 0
 	ret
 
-WELS_EXTERN WelsEmms
-ALIGN 16
 ;******************************************************************************************
 ;   void WelsEmms()
 ;******************************************************************************************
-WelsEmms:
+WELS_EXTERN WelsEmms
 	emms	; empty mmx technology states
 	ret
 
--- a/codec/common/deblock.asm
+++ b/codec/common/deblock.asm
@@ -61,8 +61,6 @@
 
 
 WELS_EXTERN   DeblockLumaLt4V_ssse3
-
-DeblockLumaLt4V_ssse3:
   push        rbp
   mov         r11,[rsp + 16 + 20h]  ; pTC
   sub         rsp,1B0h
@@ -318,9 +316,6 @@
 
 
 WELS_EXTERN   DeblockLumaEq4V_ssse3
-
-ALIGN  16
-DeblockLumaEq4V_ssse3:
   mov         rax,rsp
   push        rbx
   push        rbp
@@ -781,9 +776,6 @@
 
 
 WELS_EXTERN  DeblockChromaLt4V_ssse3
-
-ALIGN  16
-DeblockChromaLt4V_ssse3:
   mov         rax,rsp
   push        rbx
   push        rdi
@@ -943,8 +935,6 @@
 
 
 WELS_EXTERN   DeblockChromaEq4V_ssse3
-ALIGN 16
-DeblockChromaEq4V_ssse3:
   mov         rax,rsp
   push        rbx
   sub         rsp,90h
@@ -1097,8 +1087,6 @@
 
 
 WELS_EXTERN   DeblockChromaEq4H_ssse3
-ALIGN  16
-DeblockChromaEq4H_ssse3:
   mov         rax,rsp
   mov         [rax+20h],rbx
   push        rdi
@@ -1361,8 +1349,6 @@
 
 
 WELS_EXTERN DeblockChromaLt4H_ssse3
-ALIGN  16
-DeblockChromaLt4H_ssse3:
   mov         rax,rsp
   push        rbx
   push        rbp
@@ -1647,8 +1633,6 @@
 
 
 WELS_EXTERN   DeblockLumaLt4V_ssse3
-
-DeblockLumaLt4V_ssse3:
   push        rbp
   mov         r11,r8  ; pTC
   sub         rsp,1B0h
@@ -1904,9 +1888,6 @@
 
 
 WELS_EXTERN DeblockLumaEq4V_ssse3
-
-ALIGN  16
-DeblockLumaEq4V_ssse3:
   mov         rax,rsp
   push        rbx
   push        rbp
@@ -2366,8 +2347,6 @@
   ret
 
 WELS_EXTERN  DeblockChromaLt4V_ssse3
-ALIGN  16
-DeblockChromaLt4V_ssse3:
   mov         rax,rsp
   push        rbx
   push        rbp
@@ -2534,8 +2513,6 @@
   ret
 
 WELS_EXTERN DeblockChromaEq4V_ssse3
-
-DeblockChromaEq4V_ssse3:
   mov         rax,rsp
   push        rbx
   push        rbp
@@ -2685,9 +2662,6 @@
   ret
 
 WELS_EXTERN DeblockChromaEq4H_ssse3
-
-ALIGN  16
-DeblockChromaEq4H_ssse3:
   mov         rax,rsp
   push        rbx
   push        rbp
@@ -2960,8 +2934,6 @@
 
 
 WELS_EXTERN DeblockChromaLt4H_ssse3
-ALIGN  16
-DeblockChromaLt4H_ssse3:
   mov         rax,rsp
   push        rbx
   push        rbp
@@ -3256,9 +3228,6 @@
 ;                             int32_t iAlpha, int32_t iBeta)
 ;********************************************************************************
 WELS_EXTERN   DeblockChromaEq4V_ssse3
-
-ALIGN  16
-DeblockChromaEq4V_ssse3:
   push        ebp
   mov         ebp,esp
   and         esp,0FFFFFFF0h
@@ -3426,8 +3395,6 @@
 ;*******************************************************************************
 
 WELS_EXTERN  DeblockChromaLt4V_ssse3
-
-DeblockChromaLt4V_ssse3:
   push        ebp
   mov         ebp,esp
   and         esp,0FFFFFFF0h
@@ -3629,10 +3596,6 @@
 ;***************************************************************************
 
 WELS_EXTERN     DeblockChromaEq4H_ssse3
-
-ALIGN  16
-
-DeblockChromaEq4H_ssse3:
   push        ebp
   mov         ebp,esp
   and         esp,0FFFFFFF0h
@@ -3914,10 +3877,6 @@
 ;*******************************************************************************
 
 WELS_EXTERN  DeblockChromaLt4H_ssse3
-
-ALIGN  16
-
-DeblockChromaLt4H_ssse3:
   push        ebp
   mov         ebp,esp
   and         esp,0FFFFFFF0h
@@ -4230,10 +4189,6 @@
 
 
 WELS_EXTERN  DeblockLumaLt4V_ssse3
-
-ALIGN  16
-
-DeblockLumaLt4V_ssse3:
     push	ebp
 	mov	ebp, esp
 	and	esp, -16				; fffffff0H
@@ -4620,12 +4575,9 @@
 ;                                 int32_t iBeta)
 ;*******************************************************************************
 
+
 WELS_EXTERN  DeblockLumaEq4V_ssse3
 
-ALIGN  16
-
-DeblockLumaEq4V_ssse3:
-
 	push	ebp
 	mov	ebp, esp
 	and	esp, -16				; fffffff0H
@@ -5174,10 +5126,6 @@
 ;********************************************************************************
 
 WELS_EXTERN  DeblockLumaTransposeH2V_sse2
-
-ALIGN  16
-
-DeblockLumaTransposeH2V_sse2:
     push     r3
     push     r4
     push     r5
@@ -5253,10 +5201,6 @@
 ;*******************************************************************************************
 
 WELS_EXTERN   DeblockLumaTransposeV2H_sse2
-
-ALIGN  16
-
-DeblockLumaTransposeV2H_sse2:
     push     r3
     push     r4
 
--- a/codec/common/expand_picture.asm
+++ b/codec/common/expand_picture.asm
@@ -67,9 +67,6 @@
 
 SECTION .text
 
-WELS_EXTERN ExpandPictureLuma_sse2
-WELS_EXTERN ExpandPictureChromaAlign_sse2	; for chroma alignment
-WELS_EXTERN ExpandPictureChromaUnalign_sse2	; for chroma unalignment
 
 ;;;;;;;expanding result;;;;;;;
 
@@ -360,7 +357,6 @@
 %endif
 %endmacro
 
-ALIGN 16
 ;***********************************************************************----------------
 ; void ExpandPictureLuma_sse2(	uint8_t *pDst,
 ;									const int32_t iStride,
@@ -367,7 +363,7 @@
 ;									const int32_t iWidth,
 ;									const int32_t iHeight	);
 ;***********************************************************************----------------
-ExpandPictureLuma_sse2:
+WELS_EXTERN ExpandPictureLuma_sse2
 
     push r4
     push r5
@@ -487,7 +483,6 @@
 
 	ret
 
-ALIGN 16
 ;***********************************************************************----------------
 ; void ExpandPictureChromaAlign_sse2(	uint8_t *pDst,
 ;										const int32_t iStride,
@@ -494,7 +489,7 @@
 ;										const int32_t iWidth,
 ;										const int32_t iHeight	);
 ;***********************************************************************----------------
-ExpandPictureChromaAlign_sse2:
+WELS_EXTERN ExpandPictureChromaAlign_sse2
 
     push r4
     push r5
@@ -613,7 +608,6 @@
 
 	ret
 
-ALIGN 16
 ;***********************************************************************----------------
 ; void ExpandPictureChromaUnalign_sse2(	uint8_t *pDst,
 ;										const int32_t iStride,
@@ -620,7 +614,7 @@
 ;										const int32_t iWidth,
 ;										const int32_t iHeight	);
 ;***********************************************************************----------------
-ExpandPictureChromaUnalign_sse2:
+WELS_EXTERN ExpandPictureChromaUnalign_sse2
 	push r4
     push r5
     push r6
--- a/codec/common/mb_copy.asm
+++ b/codec/common/mb_copy.asm
@@ -54,12 +54,6 @@
 
 SECTION .text
 
-WELS_EXTERN WelsCopy16x16_sse2
-WELS_EXTERN WelsCopy16x16NotAligned_sse2
-WELS_EXTERN WelsCopy8x8_mmx
-WELS_EXTERN WelsCopy16x8NotAligned_sse2	;
-WELS_EXTERN WelsCopy8x16_mmx		;
-WELS_EXTERN UpdateMbMv_sse2		;
 
 ;***********************************************************************
 ; void WelsCopy16x16_sse2(	uint8_t* Dst,
@@ -67,8 +61,7 @@
 ;							uint8_t* Src,
 ;							int32_t  iStrideS )
 ;***********************************************************************
-ALIGN 16
-WelsCopy16x16_sse2:
+WELS_EXTERN WelsCopy16x16_sse2
 
 	push r4
 	push r5
@@ -130,9 +123,8 @@
 ;							uint8_t* Src,
 ;							int32_t  iStrideS )
 ;***********************************************************************
-ALIGN 16
 ; dst can be align with 16 bytes, but not sure about pSrc, 12/29/2011
-WelsCopy16x16NotAligned_sse2:
+WELS_EXTERN WelsCopy16x16NotAligned_sse2
 	push r4
 	push r5
 	%assign  push_num 2
@@ -194,8 +186,7 @@
 ;							uint8_t* Src,
 ;							int32_t  iStrideS )
 ;***********************************************************************
-ALIGN 16
-WelsCopy16x8NotAligned_sse2:
+WELS_EXTERN WelsCopy16x8NotAligned_sse2
 	push r4
 	push r5
 	%assign  push_num 2
@@ -235,8 +226,7 @@
 ;                       uint8_t* Src,
 ;                       int32_t  iStrideS )
 ;***********************************************************************
-ALIGN 16
-WelsCopy8x16_mmx:
+WELS_EXTERN WelsCopy8x16_mmx
 	%assign  push_num 0
     LOAD_4_PARA
 
@@ -300,8 +290,7 @@
 ;                        uint8_t* Src,
 ;                        int32_t  iStrideS )
 ;***********************************************************************
-ALIGN 16
-WelsCopy8x8_mmx:
+WELS_EXTERN WelsCopy8x8_mmx
 	push r4
 	%assign  push_num 1
     LOAD_4_PARA
@@ -349,8 +338,7 @@
 ;***********************************************************************
 ; void UpdateMbMv_sse2( SMVUnitXY *pMvBuffer, const SMVUnitXY sMv )
 ;***********************************************************************
-ALIGN 16
-UpdateMbMv_sse2:
+WELS_EXTERN UpdateMbMv_sse2
 
     %assign  push_num 0
     LOAD_2_PARA
@@ -385,16 +373,9 @@
 
 SECTION .text
 
-WELS_EXTERN PixelAvgWidthEq4_mmx
-WELS_EXTERN PixelAvgWidthEq8_mmx
-WELS_EXTERN PixelAvgWidthEq16_sse2
 
-WELS_EXTERN McCopyWidthEq4_mmx
-WELS_EXTERN McCopyWidthEq8_mmx
-WELS_EXTERN McCopyWidthEq16_sse2
 
 
-ALIGN 16
 ;*******************************************************************************
 ; void PixelAvgWidthEq4_mmx( uint8_t *pDst,  int iDstStride,
 ;                           uint8_t *pSrcA, int iSrcAStride,
@@ -401,7 +382,7 @@
 ;                           uint8_t *pSrcB, int iSrcBStride,
 ;                           int iHeight );
 ;*******************************************************************************
-PixelAvgWidthEq4_mmx:
+WELS_EXTERN PixelAvgWidthEq4_mmx
 
     %assign  push_num 0
     LOAD_7_PARA
@@ -428,7 +409,6 @@
     ret
 
 
-ALIGN 16
 ;*******************************************************************************
 ; void PixelAvgWidthEq8_mmx( uint8_t *pDst,  int iDstStride,
 ;                           uint8_t *pSrcA, int iSrcAStride,
@@ -435,7 +415,7 @@
 ;                           uint8_t *pSrcB, int iSrcBStride,
 ;                           int iHeight );
 ;*******************************************************************************
-PixelAvgWidthEq8_mmx:
+WELS_EXTERN PixelAvgWidthEq8_mmx
     %assign  push_num 0
     LOAD_7_PARA
 
@@ -466,7 +446,6 @@
 
 
 
-ALIGN 16
 ;*******************************************************************************
 ; void PixelAvgWidthEq16_sse2( uint8_t *pDst,  int iDstStride,
 ;                          uint8_t *pSrcA, int iSrcAStride,
@@ -473,7 +452,7 @@
 ;                          uint8_t *pSrcB, int iSrcBStride,
 ;                          int iHeight );
 ;*******************************************************************************
-PixelAvgWidthEq16_sse2:
+WELS_EXTERN PixelAvgWidthEq16_sse2
 
     %assign  push_num 0
     LOAD_7_PARA
@@ -519,12 +498,11 @@
 	LOAD_7_PARA_POP
     ret
 
-ALIGN 16
 ;*******************************************************************************
 ;  void McCopyWidthEq4_mmx( uint8_t *pSrc, int iSrcStride,
 ;                          uint8_t *pDst, int iDstStride, int iHeight )
 ;*******************************************************************************
-McCopyWidthEq4_mmx:
+WELS_EXTERN McCopyWidthEq4_mmx
     push	r5
     %assign  push_num 1
     LOAD_5_PARA
@@ -547,12 +525,11 @@
     pop	   r5
     ret
 
-ALIGN 16
 ;*******************************************************************************
 ;   void McCopyWidthEq8_mmx( uint8_t *pSrc, int iSrcStride,
 ;                           uint8_t *pDst, int iDstStride, int iHeight )
 ;*******************************************************************************
-McCopyWidthEq8_mmx:
+WELS_EXTERN McCopyWidthEq8_mmx
     %assign  push_num 0
     LOAD_5_PARA
 
@@ -574,7 +551,6 @@
     ret
 
 
-ALIGN 16
 ;*******************************************************************************
 ;   void McCopyWidthEq16_sse2( uint8_t *pSrc, int iSrcStride, uint8_t *pDst, int iDstStride, int iHeight )
 ;*******************************************************************************
@@ -589,7 +565,7 @@
 	movq	[%1],	%2
 	movhps	[%1+8], %2
 %endmacro
-McCopyWidthEq16_sse2:
+WELS_EXTERN McCopyWidthEq16_sse2
     %assign  push_num 0
     LOAD_5_PARA
 	SIGN_EXTENSION	r1, r1d
--- a/codec/common/mc_chroma.asm
+++ b/codec/common/mc_chroma.asm
@@ -65,7 +65,6 @@
 
 SECTION .text
 
-ALIGN 16
 ;*******************************************************************************
 ; void McChromaWidthEq4_mmx( const uint8_t *src,
 ;							int32_t iSrcStride,
@@ -75,7 +74,6 @@
 ;							int32_t iHeigh );
 ;*******************************************************************************
 WELS_EXTERN McChromaWidthEq4_mmx
-McChromaWidthEq4_mmx:
 	%assign  push_num 0
 	LOAD_6_PARA
 	SIGN_EXTENSION	r1, r1d
@@ -140,7 +138,6 @@
 	ret
 
 
-ALIGN 16
 ;*******************************************************************************
 ; void McChromaWidthEq8_sse2( const uint8_t *pSrc,
 ;						int32_t iSrcStride,
@@ -150,7 +147,6 @@
 ;						int32_t iheigh );
 ;*******************************************************************************
 WELS_EXTERN McChromaWidthEq8_sse2
-McChromaWidthEq8_sse2:
 	%assign  push_num 0
 	LOAD_6_PARA
 	SIGN_EXTENSION	r1, r1d
@@ -219,7 +215,6 @@
 
 
 
-ALIGN 16
 ;***********************************************************************
 ; void McChromaWidthEq8_ssse3( const uint8_t *pSrc,
 ;						 int32_t iSrcStride,
@@ -229,7 +224,6 @@
 ;					     int32_t iHeigh);
 ;***********************************************************************
 WELS_EXTERN McChromaWidthEq8_ssse3
-McChromaWidthEq8_ssse3:
 	%assign  push_num 0
 	LOAD_6_PARA
 	SIGN_EXTENSION	r1, r1d
--- a/codec/common/mc_luma.asm
+++ b/codec/common/mc_luma.asm
@@ -71,10 +71,8 @@
 
 SECTION .text
 
-WELS_EXTERN McHorVer20WidthEq4_mmx
 
 
-ALIGN 16
 ;*******************************************************************************
 ; void McHorVer20WidthEq4_mmx( const uint8_t *pSrc,
 ;                       int iSrcStride,
@@ -82,7 +80,7 @@
 ;						int iDstStride,
 ;						int iHeight)
 ;*******************************************************************************
-McHorVer20WidthEq4_mmx:
+WELS_EXTERN McHorVer20WidthEq4_mmx
     %assign  push_num 0
     LOAD_5_PARA
 	SIGN_EXTENSION	r1, r1d
@@ -161,12 +159,7 @@
 ;*******************************************************************************
 
 SECTION .text
-WELS_EXTERN McHorVer22Width8HorFirst_sse2
-WELS_EXTERN McHorVer02WidthEq8_sse2
-WELS_EXTERN McHorVer20WidthEq8_sse2
-WELS_EXTERN McHorVer20WidthEq16_sse2
 
-ALIGN 16
 ;***********************************************************************
 ; void McHorVer22Width8HorFirst_sse2(const int16_t *pSrc,
 ;                       int16_t iSrcStride,
@@ -175,7 +168,7 @@
 ;						int32_t iHeight
 ;                       )
 ;***********************************************************************
-McHorVer22Width8HorFirst_sse2:
+WELS_EXTERN McHorVer22Width8HorFirst_sse2
 	%assign  push_num 0
     LOAD_5_PARA
 	SIGN_EXTENSION	r1, r1d
@@ -217,7 +210,6 @@
 	LOAD_5_PARA_POP
 	ret
 
-ALIGN 16
 ;*******************************************************************************
 ; void McHorVer20WidthEq8_sse2(  const uint8_t *pSrc,
 ;                       int iSrcStride,
@@ -226,7 +218,7 @@
 ;												int iHeight,
 ;                      );
 ;*******************************************************************************
-McHorVer20WidthEq8_sse2:
+WELS_EXTERN McHorVer20WidthEq8_sse2
 	%assign  push_num 0
     LOAD_5_PARA
 	SIGN_EXTENSION	r1, r1d
@@ -272,7 +264,6 @@
 	LOAD_5_PARA_POP
 	ret
 
-ALIGN 16
 ;*******************************************************************************
 ; void McHorVer20WidthEq16_sse2(  const uint8_t *pSrc,
 ;                       int iSrcStride,
@@ -281,7 +272,7 @@
 ;												int iHeight,
 ;                      );
 ;*******************************************************************************
-McHorVer20WidthEq16_sse2:
+WELS_EXTERN McHorVer20WidthEq16_sse2
 	%assign  push_num 0
     LOAD_5_PARA
 	SIGN_EXTENSION	r1, r1d
@@ -361,8 +352,7 @@
 ;                       int iDstStride,
 ;                       int iHeight )
 ;*******************************************************************************
-ALIGN 16
-McHorVer02WidthEq8_sse2:
+WELS_EXTERN McHorVer02WidthEq8_sse2
 	%assign  push_num 0
     LOAD_5_PARA
 	SIGN_EXTENSION	r1, r1d
@@ -443,11 +433,6 @@
 
 SECTION .text
 
-WELS_EXTERN McHorVer20Width9Or17_sse2
-WELS_EXTERN McHorVer02Height9Or17_sse2
-WELS_EXTERN McHorVer22Width8VerLastAlign_sse2
-WELS_EXTERN McHorVer22Width8VerLastUnAlign_sse2
-WELS_EXTERN McHorVer22HorFirst_sse2
 
 
 ;***********************************************************************
@@ -458,8 +443,7 @@
 ;						int32_t iWidth,
 ;                       int32_t iHeight )
 ;***********************************************************************
-ALIGN 16
-McHorVer02Height9Or17_sse2:
+WELS_EXTERN McHorVer02Height9Or17_sse2
 	%assign  push_num 0
     LOAD_6_PARA
 	SIGN_EXTENSION	r1, r1d
@@ -583,7 +567,6 @@
 	ret
 
 
-ALIGN 16
 ;***********************************************************************
 ; void McHorVer20Width9Or17_sse2(		const uint8_t *pSrc,
 ;                       int32_t iSrcStride,
@@ -593,7 +576,7 @@
 ;						int32_t iHeight
 ;                      );
 ;***********************************************************************
-McHorVer20Width9Or17_sse2:
+WELS_EXTERN McHorVer20Width9Or17_sse2
 	%assign  push_num 0
     LOAD_6_PARA
 	SIGN_EXTENSION	r1, r1d
@@ -742,7 +725,6 @@
 
 
 
-ALIGN 16
 ;***********************************************************************
 ;void McHorVer22HorFirst_sse2
 ;							(const uint8_t *pSrc,
@@ -751,7 +733,7 @@
 ;							int32_t iTapStride,
 ;							int32_t iWidth,int32_t iHeight);
 ;***********************************************************************
-McHorVer22HorFirst_sse2:
+WELS_EXTERN McHorVer22HorFirst_sse2
 	%assign  push_num 0
     LOAD_6_PARA
 	SIGN_EXTENSION	r1, r1d
@@ -918,7 +900,7 @@
 ;											int32_t iHeight);
 ;***********************************************************************
 
- McHorVer22Width8VerLastAlign_sse2:
+WELS_EXTERN McHorVer22Width8VerLastAlign_sse2
 	%assign  push_num 0
     LOAD_6_PARA
 	SIGN_EXTENSION	r1, r1d
@@ -1047,7 +1029,7 @@
 ;											int32_t iHeight);
 ;***********************************************************************
 
- McHorVer22Width8VerLastUnAlign_sse2:
+WELS_EXTERN McHorVer22Width8VerLastUnAlign_sse2
 	%assign  push_num 0
     LOAD_6_PARA
 	SIGN_EXTENSION	r1, r1d
--- a/codec/common/satd_sad.asm
+++ b/codec/common/satd_sad.asm
@@ -156,8 +156,6 @@
 ;
 ;***********************************************************************
 WELS_EXTERN WelsSampleSatd4x4_sse2
-align 16
-WelsSampleSatd4x4_sse2:
 	%assign  push_num 0
 	LOAD_4_PARA
 	SIGN_EXTENSION r1, r1d
@@ -229,9 +227,7 @@
  ;int32_t WelsSampleSatd8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
  ;
  ;***********************************************************************
- WELS_EXTERN WelsSampleSatd8x8_sse2
-align 16
- WelsSampleSatd8x8_sse2:
+WELS_EXTERN WelsSampleSatd8x8_sse2
 	%assign  push_num 0
 	LOAD_4_PARA
 	SIGN_EXTENSION r1, r1d
@@ -250,9 +246,7 @@
  ;int32_t WelsSampleSatd8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
  ;
  ;***********************************************************************
- WELS_EXTERN WelsSampleSatd8x16_sse2
-align 16
- WelsSampleSatd8x16_sse2:
+WELS_EXTERN WelsSampleSatd8x16_sse2
 	 %assign  push_num 0
 	 LOAD_4_PARA
 	 SIGN_EXTENSION r1, r1d
@@ -277,8 +271,6 @@
 ;
 ;***********************************************************************
 WELS_EXTERN WelsSampleSatd16x8_sse2
-align 16
-WelsSampleSatd16x8_sse2:
 	%assign  push_num 0
 	LOAD_4_PARA
 	SIGN_EXTENSION r1, r1d
@@ -308,8 +300,6 @@
 ;
 ;***********************************************************************
 WELS_EXTERN WelsSampleSatd16x16_sse2
-align 16
-WelsSampleSatd16x16_sse2:
 	%assign  push_num 0
 	LOAD_4_PARA
 	SIGN_EXTENSION r1, r1d
@@ -484,7 +474,6 @@
 
 %ifdef X86_32
 WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
-WelsIntra16x16Combined3Satd_sse41:
 	push   ebx
 	push   esi
 	push   edi
@@ -678,7 +667,6 @@
 ;for reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41
 
 WELS_EXTERN WelsIntraChroma8x8Combined3Satd_sse41
-WelsIntraChroma8x8Combined3Satd_sse41:
 	push   ebx
 	push   esi
 	push   edi
@@ -782,7 +770,6 @@
 ;
 ;***********************************************************************
 WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
-WelsIntra16x16Combined3Sad_ssse3:
 	push   ebx
 	push   esi
 	push   edi
@@ -987,7 +974,6 @@
 ;
 ;***********************************************************************
 WELS_EXTERN WelsSampleSatd4x4_sse41
-WelsSampleSatd4x4_sse41:
 	%assign  push_num 0
 	LOAD_4_PARA
 	SIGN_EXTENSION r1, r1d
@@ -1040,8 +1026,6 @@
 ;
 ;***********************************************************************
 WELS_EXTERN WelsSampleSatd8x8_sse41
-align 16
-WelsSampleSatd8x8_sse41:
 %ifdef X86_32
 	push  r4
 	push  r5
@@ -1072,8 +1056,6 @@
 ;
 ;***********************************************************************
 WELS_EXTERN WelsSampleSatd8x16_sse41
-align 16
-WelsSampleSatd8x16_sse41:
 %ifdef X86_32
 	push  r4
 	push  r5
@@ -1110,8 +1092,6 @@
 ;
 ;***********************************************************************
 WELS_EXTERN WelsSampleSatd16x8_sse41
-align 16
-WelsSampleSatd16x8_sse41:
 %ifdef X86_32
 	push  r4
 	push  r5
@@ -1155,8 +1135,6 @@
 ;***********************************************************************
 
 WELS_EXTERN WelsSampleSatd16x16_sse41
-align 16
-WelsSampleSatd16x16_sse41:
 %ifdef X86_32
 	push  r4
 	push  r5
@@ -1276,8 +1254,6 @@
 ;
 ;***********************************************************************
 WELS_EXTERN WelsSampleSad16x16_sse2
-align 16
-WelsSampleSad16x16_sse2:
 %ifdef X86_32
 	push  r4
 	push  r5
@@ -1319,8 +1295,6 @@
 ;
 ;***********************************************************************
 WELS_EXTERN WelsSampleSad16x8_sse2
-align 16
-WelsSampleSad16x8_sse2:
 	%assign  push_num 0
 	LOAD_4_PARA
 	SIGN_EXTENSION r1, r1d
@@ -1346,7 +1320,6 @@
 
 
 WELS_EXTERN WelsSampleSad8x16_sse2
-WelsSampleSad8x16_sse2:
 	%assign  push_num 0
 	LOAD_4_PARA
 	SIGN_EXTENSION r1, r1d
@@ -1377,7 +1350,6 @@
 %endmacro
 
 WELS_EXTERN WelsSampleSad8x8_sse21
-WelsSampleSad8x8_sse21:
 	%assign  push_num 0
 	mov		r2,  arg3
 	push	r2
@@ -1536,7 +1508,6 @@
 	paddw  xmm7, %4
 %endmacro
 WELS_EXTERN WelsSampleSadFour16x16_sse2
-WelsSampleSadFour16x16_sse2:
 	%assign  push_num 0
 	LOAD_5_PARA
 	SIGN_EXTENSION r1, r1d
@@ -1654,7 +1625,6 @@
 
 
 WELS_EXTERN WelsSampleSadFour16x8_sse2
-WelsSampleSadFour16x8_sse2:
 	%assign  push_num 0
 	LOAD_5_PARA
 	SIGN_EXTENSION r1, r1d
@@ -1739,7 +1709,6 @@
 	ret
 
 WELS_EXTERN WelsSampleSadFour8x16_sse2
-WelsSampleSadFour8x16_sse2:
 	%assign  push_num 0
 	LOAD_5_PARA
 	SIGN_EXTENSION r1, r1d
@@ -1951,7 +1920,6 @@
 
 
 WELS_EXTERN WelsSampleSadFour8x8_sse2
-WelsSampleSadFour8x8_sse2:
 	%assign  push_num 0
 	LOAD_5_PARA
 	SIGN_EXTENSION r1, r1d
@@ -2071,7 +2039,6 @@
 	ret
 
 WELS_EXTERN WelsSampleSadFour4x4_sse2
-WelsSampleSadFour4x4_sse2:
 	%assign  push_num 0
 	LOAD_5_PARA
 	SIGN_EXTENSION r1, r1d
@@ -2144,13 +2111,10 @@
 ;
 ;***********************************************************************
 
-WELS_EXTERN WelsSampleSad4x4_mmx
-
-align 16
 ;***********************************************************************
 ;   int32_t WelsSampleSad4x4_mmx (uint8_t *, int32_t, uint8_t *, int32_t )
 ;***********************************************************************
-WelsSampleSad4x4_mmx:
+WELS_EXTERN WelsSampleSad4x4_mmx
     %assign  push_num 0
 	LOAD_4_PARA
 	SIGN_EXTENSION r1, r1d
--- a/codec/common/vaa.asm
+++ b/codec/common/vaa.asm
@@ -152,12 +152,10 @@
 
 ; , 6/7/2010
 
-WELS_EXTERN AnalysisVaaInfoIntra_sse2
 ;***********************************************************************
 ;	int32_t AnalysisVaaInfoIntra_sse2(	uint8_t *pDataY, const int32_t iLineSize );
 ;***********************************************************************
-ALIGN 16
-AnalysisVaaInfoIntra_sse2:
+WELS_EXTERN AnalysisVaaInfoIntra_sse2
 
     %assign push_num 0
     LOAD_2_PARA
@@ -247,12 +245,10 @@
 
 	ret
 
-WELS_EXTERN AnalysisVaaInfoIntra_ssse3
 ;***********************************************************************
 ;	int32_t AnalysisVaaInfoIntra_ssse3(	uint8_t *pDataY, const int32_t iLineSize );
 ;***********************************************************************
-ALIGN 16
-AnalysisVaaInfoIntra_ssse3:
+WELS_EXTERN AnalysisVaaInfoIntra_ssse3
 
     %assign push_num 0
     LOAD_2_PARA
@@ -342,12 +338,10 @@
 
 	ret
 
-WELS_EXTERN MdInterAnalysisVaaInfo_sse41
 ;***********************************************************************
 ;	uint8_t MdInterAnalysisVaaInfo_sse41( int32_t *pSad8x8 )
 ;***********************************************************************
-ALIGN 16
-MdInterAnalysisVaaInfo_sse41:
+WELS_EXTERN MdInterAnalysisVaaInfo_sse41
 	%assign push_num 0
 	LOAD_1_PARA
 	movdqa xmm0,[r0]
@@ -378,12 +372,10 @@
 	mov retrd, 15
 	ret
 
-WELS_EXTERN MdInterAnalysisVaaInfo_sse2
 ;***********************************************************************
 ;	uint8_t MdInterAnalysisVaaInfo_sse2( int32_t *pSad8x8 )
 ;***********************************************************************
-ALIGN 16
-MdInterAnalysisVaaInfo_sse2:
+WELS_EXTERN MdInterAnalysisVaaInfo_sse2
 	%assign push_num 0
 	LOAD_1_PARA
 	movdqa xmm0, [r0]
--- a/codec/decoder/core/asm/block_add.asm
+++ b/codec/decoder/core/asm/block_add.asm
@@ -49,13 +49,10 @@
 SECTION .text
 
 
-WELS_EXTERN   WelsResBlockZero16x16_sse2
-
-ALIGN    16
 ;*******************************************************************************
 ;  void WelsResBlockZero16x16_sse2(int16_t* pBlock,int32_t iStride)
 ;*******************************************************************************
-WelsResBlockZero16x16_sse2:
+WELS_EXTERN   WelsResBlockZero16x16_sse2
         %assign push_num 0
         LOAD_2_PARA
 	SIGN_EXTENSION r1, r1d
@@ -122,13 +119,10 @@
 	ret
 
 
-WELS_EXTERN   WelsResBlockZero8x8_sse2
-
-ALIGN    16
 ;*******************************************************************************
 ;  void WelsResBlockZero8x8_sse2(int16_t * pBlock, int32_t iStride)
 ;*******************************************************************************
-WelsResBlockZero8x8_sse2:
+WELS_EXTERN   WelsResBlockZero8x8_sse2
 	  %assign push_num 0
           LOAD_2_PARA
 	  SIGN_EXTENSION r1, r1d
--- a/codec/decoder/core/asm/dct.asm
+++ b/codec/decoder/core/asm/dct.asm
@@ -83,14 +83,11 @@
 
 SECTION .text
 
-WELS_EXTERN IdctResAddPred_mmx
-
-ALIGN 16
 ;*******************************************************************************
 ;   void IdctResAddPred_mmx( uint8_t *pPred, const int32_t kiStride, int16_t *pRs )
 ;*******************************************************************************
 
-IdctResAddPred_mmx:
+WELS_EXTERN IdctResAddPred_mmx
     %assign push_num 0
     LOAD_3_PARA
     SIGN_EXTENSION r1, r1d
--- a/codec/decoder/core/asm/intra_pred.asm
+++ b/codec/decoder/core/asm/intra_pred.asm
@@ -177,18 +177,14 @@
 ;*******************************************************************************
 
 SECTION .text
-WELS_EXTERN WelsDecoderI4x4LumaPredH_sse2
-WELS_EXTERN WelsDecoderI4x4LumaPredDDR_mmx
-WELS_EXTERN WelsDecoderI16x16LumaPredPlane_sse2
 
 
-ALIGN 16
 ;*******************************************************************************
 ;   void WelsDecoderI4x4LumaPredH_sse2(uint8_t *pPred, const int32_t kiStride)
 ;
 ;	pPred must align to 16
 ;*******************************************************************************
-WelsDecoderI4x4LumaPredH_sse2:
+WELS_EXTERN WelsDecoderI4x4LumaPredH_sse2
 	%assign push_num 0
 	LOAD_2_PARA
 	SIGN_EXTENSION r1, r1d
@@ -222,7 +218,7 @@
 ;*******************************************************************************
 ; void WelsDecoderI16x16LumaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
 ;*******************************************************************************
-WelsDecoderI16x16LumaPredPlane_sse2:
+WELS_EXTERN WelsDecoderI16x16LumaPredPlane_sse2
 		push r3
 		push r4
 		%assign push_num 2
@@ -326,7 +322,6 @@
 %endmacro
 
 WELS_EXTERN WelsDecoderI16x16LumaPredH_sse2
-WelsDecoderI16x16LumaPredH_sse2:
 	%assign push_num 0
 	LOAD_2_PARA
 	SIGN_EXTENSION r1, r1d
@@ -350,7 +345,6 @@
 ; void WelsDecoderI16x16LumaPredV_sse2(uint8_t *pPred, const int32_t kiStride);
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderI16x16LumaPredV_sse2
-WelsDecoderI16x16LumaPredV_sse2:
 	%assign push_num 0
 	LOAD_2_PARA
 	SIGN_EXTENSION r1, r1d
@@ -389,7 +383,6 @@
 ; void WelsDecoderIChromaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderIChromaPredPlane_sse2
-WelsDecoderIChromaPredPlane_sse2:
 		push r3
 		push r4
 		%assign push_num 2
@@ -477,7 +470,6 @@
 		WELSEMMS
 		ret
 
-ALIGN 16
 ;*******************************************************************************
 ;	0 |1 |2 |3 |4 |
 ;	6 |7 |8 |9 |10|
@@ -490,7 +482,7 @@
 ;   void WelsDecoderI4x4LumaPredDDR_mmx(uint8_t *pPred, const int32_t kiStride)
 ;
 ;*******************************************************************************
-WelsDecoderI4x4LumaPredDDR_mmx:
+WELS_EXTERN WelsDecoderI4x4LumaPredDDR_mmx
 	%assign push_num 0
 	LOAD_2_PARA
 	SIGN_EXTENSION r1, r1d
@@ -536,7 +528,6 @@
 	ret
 
 
-ALIGN 16
 ;*******************************************************************************
 ;	void WelsDecoderIChromaPredH_mmx(uint8_t *pPred, const int32_t kiStride)
 ;   copy 8 pixel of 8 line from left
@@ -560,7 +551,6 @@
 %endmacro
 
 WELS_EXTERN WelsDecoderIChromaPredH_mmx
-WelsDecoderIChromaPredH_mmx:
 	%assign push_num 0
 	LOAD_2_PARA
 	SIGN_EXTENSION r1, r1d
@@ -597,13 +587,11 @@
 	ret
 
 
-ALIGN 16
 ;*******************************************************************************
 ;	void WelsDecoderIChromaPredV_mmx(uint8_t *pPred, const int32_t kiStride)
 ;   copy 8 pixels from top 8 pixels
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderIChromaPredV_mmx
-WelsDecoderIChromaPredV_mmx:
 	%assign push_num 0
 	LOAD_2_PARA
 	SIGN_EXTENSION r1, r1d
@@ -627,7 +615,6 @@
 	ret
 
 
-	ALIGN 16
 ;*******************************************************************************
 ;	lt|t0|t1|t2|t3|
 ;	l0|
@@ -658,7 +645,6 @@
 ;   void WelsDecoderI4x4LumaPredHD_mmx(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderI4x4LumaPredHD_mmx
-WelsDecoderI4x4LumaPredHD_mmx:
 	%assign push_num 0
 	LOAD_2_PARA
 	SIGN_EXTENSION r1, r1d
@@ -714,7 +700,6 @@
 
 
 
-ALIGN 16
 ;*******************************************************************************
 ;	lt|t0|t1|t2|t3|
 ;	l0|
@@ -742,7 +727,6 @@
 ;   void WelsDecoderI4x4LumaPredHU_mmx(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderI4x4LumaPredHU_mmx
-WelsDecoderI4x4LumaPredHU_mmx:
 	%assign push_num 0
 	LOAD_2_PARA
 	SIGN_EXTENSION r1, r1d
@@ -799,7 +783,6 @@
 
 
 
-ALIGN 16
 ;*******************************************************************************
 ;	lt|t0|t1|t2|t3|
 ;	l0|
@@ -829,7 +812,6 @@
 ;   void WelsDecoderI4x4LumaPredVR_mmx(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderI4x4LumaPredVR_mmx
-WelsDecoderI4x4LumaPredVR_mmx:
 	%assign push_num 0
 	LOAD_2_PARA
 	SIGN_EXTENSION r1, r1d
@@ -889,7 +871,6 @@
 	WELSEMMS
 	ret
 
-ALIGN 16
 ;*******************************************************************************
 ;	lt|t0|t1|t2|t3|t4|t5|t6|t7
 ;	l0|
@@ -917,7 +898,6 @@
 ;   void WelsDecoderI4x4LumaPredDDL_mmx(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderI4x4LumaPredDDL_mmx
-WelsDecoderI4x4LumaPredDDL_mmx:
 	%assign push_num 0
 	LOAD_2_PARA
 	SIGN_EXTENSION r1, r1d
@@ -956,7 +936,6 @@
 	ret
 
 
-ALIGN 16
 ;*******************************************************************************
 ;	lt|t0|t1|t2|t3|t4|t5|t6|t7
 ;	l0|
@@ -987,7 +966,6 @@
 ;   void WelsDecoderI4x4LumaPredVL_mmx(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderI4x4LumaPredVL_mmx
-WelsDecoderI4x4LumaPredVL_mmx:
 	%assign push_num 0
 	LOAD_2_PARA
 	SIGN_EXTENSION r1, r1d
@@ -1023,13 +1001,11 @@
 	WELSEMMS
 	ret
 
-ALIGN 16
 ;*******************************************************************************
 ;
 ;   void WelsDecoderIChromaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderIChromaPredDc_sse2
-WelsDecoderIChromaPredDc_sse2:
 	push 	r3
 	push 	r4
 	%assign push_num 2
@@ -1120,13 +1096,11 @@
 
 
 
-ALIGN 16
 ;*******************************************************************************
 ;
 ;   void WelsDecoderI16x16LumaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderI16x16LumaPredDc_sse2
-WelsDecoderI16x16LumaPredDc_sse2:
 	push 	r3
 	push 	r4
 	%assign push_num 2
@@ -1201,12 +1175,10 @@
 ; for intra prediction as follows, 11/19/2010
 ;*******************************************************************************
 
-ALIGN 16
 ;*******************************************************************************
 ;	void WelsDecoderI16x16LumaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderI16x16LumaPredDcTop_sse2
-WelsDecoderI16x16LumaPredDcTop_sse2:
 	%assign push_num 0
 	LOAD_2_PARA
 	SIGN_EXTENSION r1, r1d
@@ -1273,12 +1245,10 @@
 
 	ret
 
-ALIGN 16
 ;*******************************************************************************
 ;	void WelsDecoderI16x16LumaPredDcNA_sse2(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderI16x16LumaPredDcNA_sse2
-WelsDecoderI16x16LumaPredDcNA_sse2:
 	%assign push_num 0
 	LOAD_2_PARA
 	SIGN_EXTENSION r1, r1d
@@ -1308,12 +1278,10 @@
 
 	ret
 
-ALIGN 16
 ;*******************************************************************************
 ;	void WelsDecoderIChromaPredDcLeft_mmx(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderIChromaPredDcLeft_mmx
-WelsDecoderIChromaPredDcLeft_mmx:
 	push r3
 	push r4
 	%assign push_num 2
@@ -1381,12 +1349,10 @@
 	emms
 	ret
 
-ALIGN 16
 ;*******************************************************************************
 ;	void WelsDecoderIChromaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderIChromaPredDcTop_sse2
-WelsDecoderIChromaPredDcTop_sse2:
 	%assign push_num 0
 	LOAD_2_PARA
 	SIGN_EXTENSION r1, r1d
@@ -1420,12 +1386,10 @@
 	movq [r0+r2], xmm0
 	ret
 
-ALIGN 16
 ;*******************************************************************************
 ;	void WelsDecoderIChromaPredDcNA_mmx(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderIChromaPredDcNA_mmx
-WelsDecoderIChromaPredDcNA_mmx:
 	%assign push_num 0
 	LOAD_2_PARA
 	SIGN_EXTENSION r1, r1d
--- a/codec/encoder/core/asm/coeff.asm
+++ b/codec/encoder/core/asm/coeff.asm
@@ -323,7 +323,6 @@
 ;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
 ;***********************************************************************
 WELS_EXTERN CavlcParamCal_sse2
-CavlcParamCal_sse2:
 	push ebx
 	push edi
 	push esi
--- a/codec/encoder/core/asm/dct.asm
+++ b/codec/encoder/core/asm/dct.asm
@@ -130,12 +130,10 @@
     movd       %5, %1
 %endmacro
 SECTION .text
-ALIGN 16
 ;***********************************************************************
 ;   void WelsDctT4_mmx( int16_t *pDct[4], uint8_t *pix1, int32_t i_pix1, uint8_t *pix2, int32_t i_pix2 )
 ;***********************************************************************
 WELS_EXTERN WelsDctT4_mmx
-WelsDctT4_mmx:
     %assign push_num 0
     LOAD_5_PARA
     SIGN_EXTENSION r2, r2d
@@ -163,7 +161,6 @@
 ;   void WelsIDctT4Rec_mmx(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs)
 ;***********************************************************************
 WELS_EXTERN WelsIDctT4Rec_mmx
-WelsIDctT4Rec_mmx:
     %assign push_num 0
     LOAD_5_PARA
     SIGN_EXTENSION r1, r1d
@@ -291,8 +288,6 @@
 ; void WelsDctFourT4_sse2(int16_t *pDct, uint8_t *pix1, int32_t i_pix1, uint8_t *pix2, int32_t i_pix2 )
 ;***********************************************************************
 WELS_EXTERN WelsDctFourT4_sse2
-ALIGN 16
-WelsDctFourT4_sse2:
     %assign push_num 0
     LOAD_5_PARA
     SIGN_EXTENSION r2, r2d
@@ -340,8 +335,6 @@
 ; void WelsIDctFourT4Rec_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs);
 ;***********************************************************************
 WELS_EXTERN WelsIDctFourT4Rec_sse2
-ALIGN 16
-WelsIDctFourT4Rec_sse2:
 	%assign push_num 0
 	LOAD_5_PARA
 	SIGN_EXTENSION r1, r1d
@@ -399,8 +392,6 @@
 ; void WelsIDctRecI16x16Dc_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *dct_dc)
 ;***********************************************************************
 WELS_EXTERN WelsIDctRecI16x16Dc_sse2
-ALIGN 16
-WelsIDctRecI16x16Dc_sse2:
 	%assign push_num 0
 	LOAD_5_PARA
 	SIGN_EXTENSION r1, r1d
@@ -475,7 +466,6 @@
 ;void WelsHadamardT4Dc_sse2( int16_t *luma_dc, int16_t *pDct)
 ;***********************************************************************
 WELS_EXTERN WelsHadamardT4Dc_sse2
-WelsHadamardT4Dc_sse2:
 		%assign push_num 0
 		LOAD_2_PARA
 		SSE2_Load4Col	    xmm1, xmm5, xmm6, xmm0, r1
--- a/codec/encoder/core/asm/intra_pred.asm
+++ b/codec/encoder/core/asm/intra_pred.asm
@@ -188,18 +188,13 @@
 ;***********************************************************************
 
 SECTION .text
-WELS_EXTERN WelsI4x4LumaPredH_sse2
-WELS_EXTERN WelsI4x4LumaPredDDR_mmx
-WELS_EXTERN WelsI4x4LumaPredDc_sse2
-WELS_EXTERN WelsI16x16LumaPredPlane_sse2
 
-ALIGN 16
 ;***********************************************************************
 ;   void WelsI4x4LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
 ;
 ;	pred must align to 16
 ;***********************************************************************
-WelsI4x4LumaPredH_sse2:
+WELS_EXTERN WelsI4x4LumaPredH_sse2
 	push r3
 	%assign push_num 1
 	LOAD_3_PARA
@@ -233,7 +228,7 @@
 ;***********************************************************************
 ; void WelsI16x16LumaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
 ;***********************************************************************
-WelsI16x16LumaPredPlane_sse2:
+WELS_EXTERN WelsI16x16LumaPredPlane_sse2
 		push r3
 		push r4
 		%assign push_num 2
@@ -330,7 +325,6 @@
 %endmacro
 
 WELS_EXTERN WelsI16x16LumaPredH_sse2
-WelsI16x16LumaPredH_sse2:
 	push r3
 	%assign push_num 1
 	LOAD_3_PARA
@@ -361,7 +355,6 @@
 ; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
 ;***********************************************************************
 WELS_EXTERN WelsI16x16LumaPredV_sse2
-WelsI16x16LumaPredV_sse2:
     %assign push_num 0
     LOAD_3_PARA
     SIGN_EXTENSION r2, r2d
@@ -391,7 +384,6 @@
 ; void WelsIChromaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
 ;***********************************************************************
 WELS_EXTERN WelsIChromaPredPlane_sse2
-WelsIChromaPredPlane_sse2:
 		push r3
 		push r4
 		%assign push_num 2
@@ -475,7 +467,6 @@
 		WELSEMMS
 		ret
 
-ALIGN 16
 ;***********************************************************************
 ;	0 |1 |2 |3 |4 |
 ;	6 |7 |8 |9 |10|
@@ -488,7 +479,7 @@
 ;   void WelsI4x4LumaPredDDR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
 ;
 ;***********************************************************************
-WelsI4x4LumaPredDDR_mmx:
+WELS_EXTERN WelsI4x4LumaPredDDR_mmx
 	%assign push_num 0
 	LOAD_3_PARA
 	SIGN_EXTENSION r2, r2d
@@ -529,7 +520,6 @@
 	WELSEMMS
 	ret
 
-ALIGN 16
 ;***********************************************************************
 ;	0 |1 |2 |3 |4 |
 ;	5 |6 |7 |8 |9 |
@@ -542,7 +532,7 @@
 ;   void WelsI4x4LumaPredDc_sse2(uint8_t *pred,uint8_t *pRef,int32_t stride)
 ;
 ;***********************************************************************
-WelsI4x4LumaPredDc_sse2:
+WELS_EXTERN WelsI4x4LumaPredDc_sse2
 	push r3
 	push r4
 	%assign push_num 2
@@ -576,7 +566,6 @@
 	pop r3
 	ret
 
-ALIGN 16
 ;***********************************************************************
 ;	void WelsIChromaPredH_mmx(uint8_t *pred, uint8_t *pRef, int32_t stride)
 ;   copy 8 pixel of 8 line from left
@@ -602,7 +591,6 @@
 %endmacro
 
 WELS_EXTERN WelsIChromaPredH_mmx
-WelsIChromaPredH_mmx:
 	%assign push_num 0
 	LOAD_3_PARA
 	SIGN_EXTENSION r2, r2d
@@ -633,13 +621,11 @@
 	WELSEMMS
 	ret
 
-ALIGN 16
 ;***********************************************************************
 ;	void WelsI4x4LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
 ;   copy pixels from top 4 pixels
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredV_sse2
-WelsI4x4LumaPredV_sse2:
 	%assign push_num 0
 	LOAD_3_PARA
 	SIGN_EXTENSION r2, r2d
@@ -649,13 +635,11 @@
 	movdqa		[r0],	xmm0
 	ret
 
-ALIGN 16
 ;***********************************************************************
 ;	void WelsIChromaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
 ;   copy 8 pixels from top 8 pixels
 ;***********************************************************************
 WELS_EXTERN WelsIChromaPredV_sse2
-WelsIChromaPredV_sse2:
 	%assign push_num 0
 	LOAD_3_PARA
 	SIGN_EXTENSION r2, r2d
@@ -669,7 +653,6 @@
 	movdqa		[r0+48],	xmm0
 	ret
 
-	ALIGN 16
 ;***********************************************************************
 ;	lt|t0|t1|t2|t3|
 ;	l0|
@@ -700,7 +683,6 @@
 ;   void WelsI4x4LumaPredHD_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredHD_mmx
-WelsI4x4LumaPredHD_mmx:
 	%assign push_num 0
 	LOAD_3_PARA
 	SIGN_EXTENSION r2, r2d
@@ -751,7 +733,6 @@
 	WELSEMMS
 	ret
 
-ALIGN 16
 ;***********************************************************************
 ;	lt|t0|t1|t2|t3|
 ;	l0|
@@ -779,7 +760,6 @@
 ;   void WelsI4x4LumaPredHU_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredHU_mmx
-WelsI4x4LumaPredHU_mmx:
 	%assign push_num 0
 	LOAD_3_PARA
 	SIGN_EXTENSION r2, r2d
@@ -832,7 +812,6 @@
 
 
 
-ALIGN 16
 ;***********************************************************************
 ;	lt|t0|t1|t2|t3|
 ;	l0|
@@ -862,7 +841,6 @@
 ;   void WelsI4x4LumaPredVR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredVR_mmx
-WelsI4x4LumaPredVR_mmx:
 	%assign push_num 0
 	LOAD_3_PARA
 	SIGN_EXTENSION r2, r2d
@@ -920,7 +898,6 @@
 	WELSEMMS
 	ret
 
-ALIGN 16
 ;***********************************************************************
 ;	lt|t0|t1|t2|t3|t4|t5|t6|t7
 ;	l0|
@@ -948,7 +925,6 @@
 ;   void WelsI4x4LumaPredDDL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredDDL_mmx
-WelsI4x4LumaPredDDL_mmx:
 	%assign push_num 0
 	LOAD_3_PARA
 	SIGN_EXTENSION r2, r2d
@@ -985,7 +961,6 @@
 	ret
 
 
-ALIGN 16
 ;***********************************************************************
 ;	lt|t0|t1|t2|t3|t4|t5|t6|t7
 ;	l0|
@@ -1016,7 +991,6 @@
 ;   void WelsI4x4LumaPredVL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredVL_mmx
-WelsI4x4LumaPredVL_mmx:
 	%assign push_num 0
 	LOAD_3_PARA
 	SIGN_EXTENSION r2, r2d
@@ -1049,13 +1023,11 @@
 	WELSEMMS
 	ret
 
-ALIGN 16
 ;***********************************************************************
 ;
 ;   void WelsIChromaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
 ;***********************************************************************
 WELS_EXTERN WelsIChromaPredDc_sse2
-WelsIChromaPredDc_sse2:
 	push r3
 	push r4
 	%assign push_num 2
@@ -1141,13 +1113,11 @@
 
 
 
-ALIGN 16
 ;***********************************************************************
 ;
 ;   void WelsI16x16LumaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
 ;***********************************************************************
 WELS_EXTERN WelsI16x16LumaPredDc_sse2
-WelsI16x16LumaPredDc_sse2:
 	push r3
 	push r4
 	%assign push_num 2
@@ -1210,8 +1180,6 @@
 ;***********************************************************************
 %ifdef X86_32
 WELS_EXTERN WelsSampleSatdThree4x4_sse2
-align 16
-WelsSampleSatdThree4x4_sse2:
 	push      ebx
 	push      esi
 	push      edi
--- a/codec/encoder/core/asm/memzero.asm
+++ b/codec/encoder/core/asm/memzero.asm
@@ -47,12 +47,10 @@
 
 SECTION .text
 
-ALIGN 16
 ;***********************************************************************
 ;void WelsPrefetchZero_mmx(int8_t const*_A);
 ;***********************************************************************
 WELS_EXTERN WelsPrefetchZero_mmx
-WelsPrefetchZero_mmx:
 	%assign  push_num 0
 	LOAD_1_PARA
 	prefetchnta [r0]
@@ -59,12 +57,10 @@
 	ret
 
 
-ALIGN 16
 ;***********************************************************************
 ;   void WelsSetMemZeroAligned64_sse2(void *dst, int32_t size)
 ;***********************************************************************
 WELS_EXTERN WelsSetMemZeroAligned64_sse2
-WelsSetMemZeroAligned64_sse2:
 
 		%assign  push_num 0
 		LOAD_2_PARA
@@ -84,12 +80,10 @@
 
 		ret
 
-ALIGN 16
 ;***********************************************************************
 ;   void WelsSetMemZeroSize64_mmx(void *dst, int32_t size)
 ;***********************************************************************
 WELS_EXTERN WelsSetMemZeroSize64_mmx
-WelsSetMemZeroSize64_mmx:
 
 		%assign  push_num 0
 		LOAD_2_PARA
@@ -114,12 +108,10 @@
 		WELSEMMS
 		ret
 
-ALIGN 16
 ;***********************************************************************
 ;   void WelsSetMemZeroSize8_mmx(void *dst, int32_t size)
 ;***********************************************************************
 WELS_EXTERN WelsSetMemZeroSize8_mmx
-WelsSetMemZeroSize8_mmx:
 
 		%assign  push_num 0
 		LOAD_2_PARA
--- a/codec/encoder/core/asm/quant.asm
+++ b/codec/encoder/core/asm/quant.asm
@@ -83,8 +83,6 @@
 ;	void WelsQuant4x4_sse2(int16_t *pDct, int16_t* ff,  int16_t *mf);
 ;***********************************************************************
 WELS_EXTERN WelsQuant4x4_sse2
-align 16
-WelsQuant4x4_sse2:
 		%assign push_num 0
                 LOAD_3_PARA
 		movdqa	xmm2, [r1]
@@ -99,8 +97,6 @@
 ;void WelsQuant4x4Dc_sse2(int16_t *pDct, const int16_t ff, int16_t mf);
 ;***********************************************************************
 WELS_EXTERN WelsQuant4x4Dc_sse2
-align 16
-WelsQuant4x4Dc_sse2:
  		%assign push_num 0
 		LOAD_3_PARA
 		SIGN_EXTENSION r1, r1w
@@ -118,8 +114,6 @@
 ;	void WelsQuantFour4x4_sse2(int16_t *pDct, int16_t* ff,  int16_t *mf);
 ;***********************************************************************
 WELS_EXTERN WelsQuantFour4x4_sse2
-align 16
-WelsQuantFour4x4_sse2:
 		%assign push_num 0
 		LOAD_3_PARA
 		MOVDQ	xmm2, [r1]
@@ -140,8 +134,6 @@
 ;	void WelsQuantFour4x4Max_sse2(int16_t *pDct, int32_t* f,  int16_t *mf, int16_t *max);
 ;***********************************************************************
 WELS_EXTERN WelsQuantFour4x4Max_sse2
-align 16
-WelsQuantFour4x4Max_sse2:
 		%assign push_num 0
 		LOAD_4_PARA
 		MOVDQ	xmm2, [r1]
@@ -195,8 +187,6 @@
 ;int32_t WelsHadamardQuant2x2_mmx(int16_t *rs, const int16_t ff, int16_t mf, int16_t * pDct, int16_t * block);
 ;***********************************************************************
 WELS_EXTERN WelsHadamardQuant2x2_mmx
-align 16
-WelsHadamardQuant2x2_mmx:
 		%assign push_num 0
 		LOAD_5_PARA
 		SIGN_EXTENSION r1, r1w
@@ -253,8 +243,6 @@
 ;int32_t WelsHadamardQuant2x2Skip_mmx(int16_t *pDct, int16_t ff,  int16_t mf);
 ;***********************************************************************
 WELS_EXTERN WelsHadamardQuant2x2Skip_mmx
-align 16
-WelsHadamardQuant2x2Skip_mmx:
 		%assign push_num 0
 		LOAD_3_PARA
 		SIGN_EXTENSION r1, r1w
@@ -303,13 +291,10 @@
 %endmacro
 
 
-ALIGN  16
 ;***********************************************************************
 ; void WelsDequant4x4_sse2(int16_t *pDct, const uint16_t* mf);
 ;***********************************************************************
-align 16
 WELS_EXTERN WelsDequant4x4_sse2
-WelsDequant4x4_sse2:
 	%assign push_num 0
 	LOAD_2_PARA
 
@@ -323,10 +308,7 @@
 ;void WelsDequantFour4x4_sse2(int16_t *pDct, const uint16_t* mf);
 ;***********************************************************************====
 
-align 16
-
 WELS_EXTERN WelsDequantFour4x4_sse2
-WelsDequantFour4x4_sse2:
 	%assign push_num 0
 	LOAD_2_PARA
 
@@ -346,8 +328,6 @@
 ;void WelsDequantIHadamard4x4_sse2(int16_t *rs, const uint16_t mf);
 ;***********************************************************************
 WELS_EXTERN WelsDequantIHadamard4x4_sse2
-align 16
-WelsDequantIHadamard4x4_sse2:
 		%assign push_num 0
 		LOAD_2_PARA
 		%ifndef X86_32
--- a/codec/encoder/core/asm/score.asm
+++ b/codec/encoder/core/asm/score.asm
@@ -166,9 +166,7 @@
 ;***********************************************************************
 ;void WelsScan4x4DcAc_sse2( int16_t level[16], int16_t *pDct )
 ;***********************************************************************
-ALIGN 16
 WELS_EXTERN WelsScan4x4DcAc_sse2
-WelsScan4x4DcAc_sse2:
 	%ifdef X86_32
 	push r3
 	%assign push_num 1
@@ -200,9 +198,7 @@
 ;***********************************************************************
 ;void WelsScan4x4DcAc_ssse3( int16_t level[16], int16_t *pDct )
 ;***********************************************************************
-ALIGN 16
 WELS_EXTERN WelsScan4x4DcAc_ssse3
-WelsScan4x4DcAc_ssse3:
 	%assign push_num 0
 	LOAD_2_PARA
 	movdqa     xmm0, [r1]
@@ -220,9 +216,7 @@
 ;***********************************************************************
 ;void WelsScan4x4Ac_sse2( int16_t* zig_value, int16_t* pDct )
 ;***********************************************************************
-ALIGN 16
 WELS_EXTERN WelsScan4x4Ac_sse2
-WelsScan4x4Ac_sse2:
 	%assign push_num 0
 	LOAD_2_PARA
 	movdqa     xmm0, [r1]
@@ -259,9 +253,7 @@
 ;***********************************************************************
 ;void int32_t WelsCalculateSingleCtr4x4_sse2( int16_t *pDct );
 ;***********************************************************************
-ALIGN 16
 WELS_EXTERN WelsCalculateSingleCtr4x4_sse2
-WelsCalculateSingleCtr4x4_sse2:
 	%ifdef X86_32
 	push r3
 	%assign push_num 1
@@ -319,9 +311,7 @@
 ;***********************************************************************
 ; int32_t WelsGetNoneZeroCount_sse2(int16_t* level);
 ;***********************************************************************
-ALIGN 16
 WELS_EXTERN WelsGetNoneZeroCount_sse2
-WelsGetNoneZeroCount_sse2:
 	%assign push_num 0
 	LOAD_1_PARA
 	movdqa    xmm0, [r0]
--- a/codec/processing/src/asm/denoisefilter.asm
+++ b/codec/processing/src/asm/denoisefilter.asm
@@ -163,8 +163,6 @@
 		paddw		%3,	%2
 %endmacro
 
-ALIGN 16
-WELS_EXTERN BilateralLumaFilter8_sse2
 ;***********************************************************************
 ;  BilateralLumaFilter8_sse2(uint8_t *pixels, int stride);
 ;***********************************************************************
@@ -173,7 +171,7 @@
 ;	6	7	8
 ;	0:	the center point
 
-BilateralLumaFilter8_sse2:
+WELS_EXTERN BilateralLumaFilter8_sse2
 
         push r3
         %assign push_num 1
@@ -219,7 +217,6 @@
 
 		ret
 
-WELS_EXTERN WaverageChromaFilter8_sse2
 ;***********************************************************************
 ; void		WaverageChromaFilter8_sse2(uint8_t *pixels, int stride);
 ;***********************************************************************
@@ -230,8 +227,7 @@
 ;1	2	4	2	1
 ;1	1	2	1	1
 
-ALIGN 16
-WaverageChromaFilter8_sse2:
+WELS_EXTERN WaverageChromaFilter8_sse2
 
         push r3
 
--- a/codec/processing/src/asm/downsample_bilinear.asm
+++ b/codec/processing/src/asm/downsample_bilinear.asm
@@ -66,8 +66,6 @@
 	db 01h, 80h, 03h, 80h, 05h, 80h, 07h, 80h, 09h, 80h, 0bh, 80h, 0dh, 80h, 0fh, 80h
 
 
-ALIGN 16
-
 ;***********************************************************************
 ; Code
 ;***********************************************************************
@@ -74,14 +72,12 @@
 
 SECTION .text
 
-WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse
 ;***********************************************************************
 ;	void DyadicBilinearDownsamplerWidthx32_sse(	unsigned char* pDst, const int iDstStride,
 ;					unsigned char* pSrc, const int iSrcStride,
 ;					const int iSrcWidth, const int iSrcHeight );
 ;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx32_sse:
+WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse
 	push ebx
 	push edx
 	push esi
@@ -227,14 +223,12 @@
 	pop ebx
 	ret
 
-WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse
 ;***********************************************************************
 ;	void DyadicBilinearDownsamplerWidthx16_sse( unsigned char* pDst, const int iDstStride,
 ;					  unsigned char* pSrc, const int iSrcStride,
 ;					  const int iSrcWidth, const int iSrcHeight );
 ;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx16_sse:
+WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse
 	push ebx
 	push edx
 	push esi
@@ -331,14 +325,12 @@
 	pop ebx
 	ret
 
-WELS_EXTERN DyadicBilinearDownsamplerWidthx8_sse
 ;***********************************************************************
 ;	void DyadicBilinearDownsamplerWidthx8_sse( unsigned char* pDst, const int iDstStride,
 ;					  unsigned char* pSrc, const int iSrcStride,
 ;					  const int iSrcWidth, const int iSrcHeight );
 ;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx8_sse:
+WELS_EXTERN DyadicBilinearDownsamplerWidthx8_sse
 	push ebx
 	push edx
 	push esi
@@ -422,14 +414,12 @@
 
 
 ; got about 50% improvement over DyadicBilinearDownsamplerWidthx32_sse
-WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
 ;***********************************************************************
 ;	void DyadicBilinearDownsamplerWidthx32_ssse3(	unsigned char* pDst, const int iDstStride,
 ;					unsigned char* pSrc, const int iSrcStride,
 ;					const int iSrcWidth, const int iSrcHeight );
 ;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx32_ssse3:
+WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
 	push ebx
 	push edx
 	push esi
@@ -533,14 +523,12 @@
 	pop ebx
 	ret
 
-WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3
 ;***********************************************************************
 ;	void DyadicBilinearDownsamplerWidthx16_ssse3( unsigned char* pDst, const int iDstStride,
 ;					  unsigned char* pSrc, const int iSrcStride,
 ;					  const int iSrcWidth, const int iSrcHeight );
 ;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx16_ssse3:
+WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3
 	push ebx
 	push edx
 	push esi
@@ -623,14 +611,12 @@
 	ret
 
 ; got about 65% improvement over DyadicBilinearDownsamplerWidthx32_sse
-WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse4
 ;***********************************************************************
 ;	void DyadicBilinearDownsamplerWidthx32_sse4(	unsigned char* pDst, const int iDstStride,
 ;					unsigned char* pSrc, const int iSrcStride,
 ;					const int iSrcWidth, const int iSrcHeight );
 ;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx32_sse4:
+WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse4
 	push ebx
 	push edx
 	push esi
@@ -733,14 +719,12 @@
 	pop ebx
 	ret
 
-WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse4
 ;***********************************************************************
 ;	void DyadicBilinearDownsamplerWidthx16_sse4( unsigned char* pDst, const int iDstStride,
 ;					  unsigned char* pSrc, const int iSrcStride,
 ;					  const int iSrcWidth, const int iSrcHeight );
 ;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx16_sse4:
+WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse4
 	push ebx
 	push edx
 	push esi
@@ -825,7 +809,6 @@
 
 
 
-WELS_EXTERN	GeneralBilinearAccurateDownsampler_sse2
 ;**************************************************************************************************************
 ;int GeneralBilinearAccurateDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
 ;							unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
@@ -833,8 +816,7 @@
 ;{
 ;**************************************************************************************************************
 
-ALIGN 16
-GeneralBilinearAccurateDownsampler_sse2:
+WELS_EXTERN	GeneralBilinearAccurateDownsampler_sse2
 	push	ebp
 	push	esi
 	push	edi
@@ -1029,7 +1011,6 @@
 
 
 
-WELS_EXTERN	GeneralBilinearFastDownsampler_sse2
 ;**************************************************************************************************************
 ;int GeneralBilinearFastDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
 ;				unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
@@ -1037,8 +1018,7 @@
 ;{
 ;**************************************************************************************************************
 
-ALIGN 16
-GeneralBilinearFastDownsampler_sse2:
+WELS_EXTERN	GeneralBilinearFastDownsampler_sse2
 	push	ebp
 	push	esi
 	push	edi
--- a/codec/processing/src/asm/vaa.asm
+++ b/codec/processing/src/asm/vaa.asm
@@ -255,12 +255,10 @@
 
 %ifdef X86_32
 
-WELS_EXTERN SampleVariance16x16_sse2
 ;***********************************************************************
 ;   void SampleVariance16x16_sse2(      uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
 ;***********************************************************************
-ALIGN 16
-SampleVariance16x16_sse2:
+WELS_EXTERN SampleVariance16x16_sse2
   push esi
   push edi
   push ebx
@@ -357,7 +355,6 @@
 
 
 
-WELS_EXTERN VAACalcSad_sse2
 ;*************************************************************************************************************
 ;void VAACalcSad_sse2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight
 ;                                                               int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8)
@@ -364,8 +361,7 @@
 ;*************************************************************************************************************
 
 
-ALIGN 16
-VAACalcSad_sse2:
+WELS_EXTERN VAACalcSad_sse2
 %define         cur_data                        esp + pushsize + 4
 %define         ref_data                        esp + pushsize + 8
 %define         iPicWidth                       esp + pushsize + 12
@@ -451,12 +447,10 @@
 
 %else  ;64-bit
 
-WELS_EXTERN SampleVariance16x16_sse2
 ;***********************************************************************
 ;   void SampleVariance16x16_sse2(      uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
 ;***********************************************************************
-ALIGN 16
-SampleVariance16x16_sse2:
+WELS_EXTERN SampleVariance16x16_sse2
   %define SUM                   r10;[esp]
   %define SUM_CUR               r11;[esp+4]
   %define SQR                   r13;[esp+8]
@@ -549,7 +543,6 @@
   ret
 
 
-WELS_EXTERN VAACalcSad_sse2
 ;*************************************************************************************************************
 ;void VAACalcSad_sse2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight
 ;                                                               int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8)
@@ -556,8 +549,7 @@
 ;*************************************************************************************************************
 
 
-ALIGN 16
-VAACalcSad_sse2:
+WELS_EXTERN VAACalcSad_sse2
 %define         cur_data                        r0
 %define         ref_data                        r1
 %define         iPicWidth                       r2
@@ -647,7 +639,6 @@
 
 
 %ifdef X86_32
-WELS_EXTERN VAACalcSadVar_sse2
 ;*************************************************************************************************************
 ;void VAACalcSadVar_sse2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight
 ;               int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16)
@@ -654,8 +645,7 @@
 ;*************************************************************************************************************
 
 
-ALIGN 16
-VAACalcSadVar_sse2:
+WELS_EXTERN VAACalcSadVar_sse2
 %define         localsize               8
 %define         cur_data                        esp + pushsize + localsize + 4
 %define         ref_data                        esp + pushsize + localsize + 8
@@ -783,7 +773,6 @@
 
 %else  ;64-bit
 
-WELS_EXTERN VAACalcSadVar_sse2
 ;*************************************************************************************************************
 ;void VAACalcSadVar_sse2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight
 ;               int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16)
@@ -790,8 +779,7 @@
 ;*************************************************************************************************************
 
 
-ALIGN 16
-VAACalcSadVar_sse2:
+WELS_EXTERN VAACalcSadVar_sse2
 %define         cur_data                        arg1 ;r0
 %define         ref_data                        arg2 ;r1
 %define         iPicWidth                       arg3 ;r2
@@ -926,7 +914,6 @@
 
 %ifdef X86_32
 
-WELS_EXTERN VAACalcSadSsd_sse2
 ;*************************************************************************************************************
 ;void VAACalcSadSsd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
 ;       int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
@@ -933,8 +920,7 @@
 ;*************************************************************************************************************
 
 
-ALIGN 16
-VAACalcSadSsd_sse2:
+WELS_EXTERN VAACalcSadSsd_sse2
 %define         localsize               12
 %define         cur_data                        esp + pushsize + localsize + 4
 %define         ref_data                        esp + pushsize + localsize + 8
@@ -1082,7 +1068,6 @@
 %else
 
 
-WELS_EXTERN VAACalcSadSsd_sse2
 ;*************************************************************************************************************
 ;void VAACalcSadSsd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
 ;       int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
@@ -1089,8 +1074,7 @@
 ;*************************************************************************************************************
 
 
-ALIGN 16
-VAACalcSadSsd_sse2:
+WELS_EXTERN VAACalcSadSsd_sse2
 %define         localsize               12
 %define         cur_data                        arg1;r0
 %define         ref_data                        arg2;r1
@@ -1246,7 +1230,6 @@
 %endif
 
 %ifdef X86_32
-WELS_EXTERN VAACalcSadBgd_sse2
 ;*************************************************************************************************************
 ;void VAACalcSadBgd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
 ;                               int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8)
@@ -1253,8 +1236,7 @@
 ;*************************************************************************************************************
 
 
-ALIGN 16
-VAACalcSadBgd_sse2:
+WELS_EXTERN VAACalcSadBgd_sse2
 %define         localsize               12
 %define         cur_data                        esp + pushsize + localsize + 4
 %define         ref_data                        esp + pushsize + localsize + 8
@@ -1425,7 +1407,6 @@
 
 
 
-WELS_EXTERN VAACalcSadSsdBgd_sse2
 ;*************************************************************************************************************
 ;void VAACalcSadSsdBgd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
 ;                int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16,
@@ -1433,8 +1414,7 @@
 ;*************************************************************************************************************
 
 
-ALIGN 16
-VAACalcSadSsdBgd_sse2:
+WELS_EXTERN VAACalcSadSsdBgd_sse2
 %define         localsize               16
 %define         cur_data                        esp + pushsize + localsize + 4
 %define         ref_data                        esp + pushsize + localsize + 8
@@ -1656,7 +1636,6 @@
    ret
 %else
 
-WELS_EXTERN VAACalcSadBgd_sse2
 ;*************************************************************************************************************
 ;void VAACalcSadBgd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
 ;                               int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8)
@@ -1663,8 +1642,7 @@
 ;*************************************************************************************************************
 
 
-ALIGN 16
-VAACalcSadBgd_sse2:
+WELS_EXTERN VAACalcSadBgd_sse2
 %define         cur_data                        arg1;
 %define         ref_data                        arg2;
 %define         iPicWidth                       arg3;
@@ -1827,7 +1805,6 @@
 
 
 
-WELS_EXTERN VAACalcSadSsdBgd_sse2
 ;*************************************************************************************************************
 ;void VAACalcSadSsdBgd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
 ;                int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16,
@@ -1835,8 +1812,7 @@
 ;*************************************************************************************************************
 
 
-ALIGN 16
-VAACalcSadSsdBgd_sse2:
+WELS_EXTERN VAACalcSadSsdBgd_sse2
 %define         cur_data                        arg1;
 %define         ref_data                        arg2;
 %define         iPicWidth                       arg3;