shithub: openh264

ref: e7cc8c2780e6ab2c409dc11b73cc204999fb7e7a
parent: 248f324c62bae5b2e2dd202e8e251c670de8af5a
author: Licai Guo <[email protected]>
date: Wed Mar 5 11:54:05 EST 2014

Add ARM assembly code for processing.
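For context on the renames in this patch (e.g. enc_avc_non_zero_count_neon becoming WelsNonZeroCount_neon, and satd_intra_16x16_x3_opt_neon becoming WelsIntra16x16Combined3Satd_neon), the assembly entry points are being aligned with the Wels-style names used by the C/C++ callers. The sketch below illustrates how such a NEON routine is typically declared and selected at run time; the prototype, flag check, and helper name are assumptions for illustration, not code taken from this patch.

    #include <stdint.h>

    #ifdef HAVE_NEON
    /* NEON entry point as renamed by this patch (codec/common/deblocking_neon.S).
     * The single-pointer prototype is assumed from the asm, which loads the
     * 24-byte non-zero-count array from r0. */
    extern void WelsNonZeroCount_neon (int8_t* pNonZeroCount);
    #endif

    /* Hypothetical helper: pick the NEON path when the CPU reports NEON support. */
    static void UpdateNonZeroCount (int8_t* pNonZeroCount, int bHasNeon) {
    #ifdef HAVE_NEON
      if (bHasNeon) {
        WelsNonZeroCount_neon (pNonZeroCount);
        return;
      }
    #endif
      /* Portable C fallback would run here; omitted in this sketch. */
      (void) pNonZeroCount;
      (void) bHasNeon;
    }

The same pattern would apply to the other routines renamed below, such as WelsIntra16x16Combined3Satd_neon and WelsDecoderIChromaPredDc_neon.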

--- a/codec/common/deblocking_neon.S
+++ b/codec/common/deblocking_neon.S
@@ -795,7 +795,7 @@
 WELS_ASM_FUNC_END
 
 
-WELS_ASM_FUNC_BEGIN enc_avc_non_zero_count_neon
+WELS_ASM_FUNC_BEGIN WelsNonZeroCount_neon
 
     vld1.64	{d0-d2}, [r0]
 
@@ -810,29 +810,28 @@
 WELS_ASM_FUNC_END
 
 #ifdef APPLE_IOS
-
-.macro BS_NZC_CHECK 
+.macro BS_NZC_CHECK
     vld1.8   {d0,d1}, [$0]
     /* Arrenge the input data --- TOP */
 	ands     r6, $1, #2
 	beq      bs_nzc_check_jump0
-	
+
     sub      r6, $0, $2, lsl #4
 	sub      r6, $2, lsl #3
     add      r6, #12
     vld1.32  d3[1], [r6]
-	
-bs_nzc_check_jump0:	
+
+bs_nzc_check_jump0:
     vext.8   q1, q1, q0, #12
 	vadd.u8  $3, q0, q1
 
-    
+
     /* Arrenge the input data --- LEFT */
 	ands     r6, $1, #1
 	beq      bs_nzc_check_jump1
-	
+
     sub      r6, $0, #21
-	add      r7, r6, #4 
+	add      r7, r6, #4
     vld1.8   d3[4], [r6]
 	add      r6, r7, #4
     vld1.8   d3[5], [r7]
@@ -839,10 +838,10 @@
 	add      r7, r6, #4
     vld1.8   d3[6], [r6]
     vld1.8   d3[7], [r7]
-	
+
 bs_nzc_check_jump1:
-	vzip.8   d0, d1	
 	vzip.8   d0, d1
+	vzip.8   d0, d1
     vext.8   q1, q1, q0, #12
 	vadd.u8  $4, q0, q1
 .endm
@@ -852,41 +851,41 @@
     vabd.s16  q5, $0, $1
     vabd.s16  q6, $1, $2
 	vdup.s16  $0, r6
-    vabd.s16  q7, $2, $3	
-    vabd.s16  q8, $3, $4	    
-    
+    vabd.s16  q7, $2, $3
+    vabd.s16  q8, $3, $4
+
     vcge.s16  q5, $0
     vcge.s16  q6, $0
     vcge.s16  q7, $0
-    vcge.s16  q8, $0 
-	
+    vcge.s16  q8, $0
+
 	vpadd.i16 d10, d10, d11
     vpadd.i16 d11, d12, d13
     vpadd.i16 d12, d14, d15
-    vpadd.i16 d13, d16, d17  
-   
+    vpadd.i16 d13, d16, d17
+
     vaddhn.i16  $5, q5, q5
     vaddhn.i16  $6, q6, q6
 .endm
 
-.macro BS_MV_CHECK 
+.macro BS_MV_CHECK
     vldm   $0, {q0,q1,q2,q3}
 
     /* Arrenge the input data --- TOP */
 	ands     r6, $1, #2
 	beq      bs_mv_check_jump0
-		
+
     sub      r6, $0, $2, lsl #6
     add      r6, #48
     vld1.8   {d8, d9}, [r6]
-	
+
 bs_mv_check_jump0:
     BS_COMPARE_MV  q4, q0, q1, q2, q3, $3, $4
-    
+
     /* Arrenge the input data --- LEFT */
 	ands     r6, $1, #1
 	beq      bs_mv_check_jump1
-	
+
     sub      r6, $0, #52
     add      r7, r6, #16
 	vld1.32   d8[0], [r6]
@@ -895,7 +894,7 @@
 	add      r7, r6, #16
     vld1.32   d9[0], [r6]
     vld1.32   d9[1], [r7]
-	
+
 bs_mv_check_jump1:
 	vzip.32   q0, q2
 	vzip.32   q1, q3
@@ -904,7 +903,6 @@
     BS_COMPARE_MV  q4, q0, q1, q2, q3, $5, $6
 .endm
 #else
-
 .macro BS_NZC_CHECK  arg0, arg1, arg2, arg3, arg4
     vld1.8   {d0,d1}, [\arg0]
     /* Arrenge the input data --- TOP */
@@ -999,28 +997,28 @@
 .endm
 #endif
 
- 
+
 WELS_ASM_FUNC_BEGIN DeblockingBSCalcEnc_neon
-	
+
 	stmdb sp!, {r5-r7}
-	
+
 	ldr  r5, [sp, #12]	//Save BS to r5
-	
+
 	/* Checking the nzc status */
 	BS_NZC_CHECK r0, r2, r3, q14, q15 //q14,q15 save the nzc status
-        
+
 	/* For checking bS[I] = 2 */
 	mov      r6, #2
 	vcgt.s8  q14, q14, #0
 	vdup.u8  q0, r6
 	vcgt.s8  q15, q15, #0
-	
+
 	vand.u8  q14, q14, q0 //q14 save the nzc check result all the time --- for dir is top
 	vand.u8  q15, q15, q0 //q15 save the nzc check result all the time --- for dir is left
-	
+
 	/* Checking the mv status*/
 	BS_MV_CHECK r1, r2, r3, d24, d25, d26, d27//q12, q13 save the mv status
-	
+
 	/* For checking bS[I] = 1 */
     mov      r6, #1
 	vdup.u8  q0, r6
@@ -1027,12 +1025,12 @@
 
 	vand.u8  q12, q12, q0 //q12 save the nzc check result all the time --- for dir is top
 	vand.u8  q13, q13, q0 //q13 save the nzc check result all the time --- for dir is left
-	
-	
+
+
 	/* Check bS[I] is '1' or '2' */
 	vmax.u8 q1, q12, q14
 	vmax.u8 q0, q13, q15
-	
+
 	//vstm r5, {q0, q1}
     vst1.32 {q0, q1}, [r5]
 	ldmia sp!, {r5-r7}
--- a/codec/common/expand_picture.S
+++ b/codec/common/expand_picture.S
@@ -34,13 +34,13 @@
 .text
 #include "arm_arch_common_macro.S"
 
- 
+
 WELS_ASM_FUNC_BEGIN ExpandPictureLuma_neon
     stmdb sp!, {r4-r8}
 	//Save the dst
 	mov r7, r0
 	mov r8, r3
-	
+
 	add r4, r7, r2
 	sub r4, #1
     //For the left and right expand
@@ -58,40 +58,40 @@
 	subs r8, #1
 	bne	_expand_picture_luma_loop2
 
-	//for the top and bottom expand              
+	//for the top and bottom expand
 	add r2, #64
 	sub r0, #32
 	mla r4, r1, r3, r0
 	sub r4, r1
 _expand_picture_luma_loop0:
-	mov r5, #32 
-    mls r5, r5, r1, r0 
+	mov r5, #32
+    mls r5, r5, r1, r0
 	add r6, r4, r1
 	vld1.8 {q0}, [r0]!
 	vld1.8 {q1}, [r4]!
-	
+
 	mov r8, #32
-_expand_picture_luma_loop1:	
-	vst1.8 {q0}, [r5], r1 
-	vst1.8 {q1}, [r6], r1 
+_expand_picture_luma_loop1:
+	vst1.8 {q0}, [r5], r1
+	vst1.8 {q1}, [r6], r1
 	subs r8, #1
     bne _expand_picture_luma_loop1
-	 
+
 	subs r2, #16
 	bne	_expand_picture_luma_loop0
 
     //vldreq.32 d0, [r0]
-	
+
 	ldmia sp!, {r4-r8}
 WELS_ASM_FUNC_END
 
- 
+
 WELS_ASM_FUNC_BEGIN ExpandPictureChroma_neon
     stmdb sp!, {r4-r8}
 	//Save the dst
 	mov r7, r0
 	mov r8, r3
-	
+
 	add r4, r7, r2
 	sub r4, #1
     //For the left and right expand
@@ -107,31 +107,31 @@
 	subs r8, #1
 	bne	_expand_picture_chroma_loop2
 
-	//for the top and bottom expand              
+	//for the top and bottom expand
 	add r2, #32
 	sub r0, #16
 	mla r4, r1, r3, r0
 	sub r4, r1
 _expand_picture_chroma_loop0:
-	mov r5, #16 
-    mls r5, r5, r1, r0 
+	mov r5, #16
+    mls r5, r5, r1, r0
 	add r6, r4, r1
 	vld1.8 {q0}, [r0]!
 	vld1.8 {q1}, [r4]!
-	
+
 	mov r8, #16
-_expand_picture_chroma_loop1:	
-	vst1.8 {q0}, [r5], r1 
-	vst1.8 {q1}, [r6], r1 
+_expand_picture_chroma_loop1:
+	vst1.8 {q0}, [r5], r1
+	vst1.8 {q1}, [r6], r1
 	subs r8, #1
     bne _expand_picture_chroma_loop1
-	 
+
 	subs r2, #16
 	bne	_expand_picture_chroma_loop0
 
     //vldreq.32 d0, [r0]
-	
+
 	ldmia sp!, {r4-r8}
 WELS_ASM_FUNC_END
 
-#endif
\ No newline at end of file
+#endif
--- a/codec/decoder/core/arm/intra_pred_neon.S
+++ b/codec/decoder/core/arm/intra_pred_neon.S
@@ -533,7 +533,7 @@
 WELS_ASM_FUNC_END
 
 
-WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredDC_neon
+WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredDc_neon
     //stmdb sp!, { r2-r5, lr}
     //Load the left column data (8 bytes)
     sub r2, r0, #1
--- a/codec/encoder/core/arm/intra_pred_neon.S
+++ b/codec/encoder/core/arm/intra_pred_neon.S
@@ -61,15 +61,15 @@
 .endm
 #endif
 
- 
+
 WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredV_neon
 	//Get the top line data to 'q0'
 	sub  r3, r1, r2
 	vldm r3, {d0, d1}
-    
+
 	//mov  r2, #16
 	mov  r3, #4
-	//Set the top line to the each line of MB(16*16) 
+	//Set the top line to the each line of MB(16*16)
 loop_0_get_i16x16_luma_pred_v:
 	vst1.8 {d0,d1}, [r0]!
 	vst1.8 {d0,d1}, [r0]!
@@ -76,10 +76,10 @@
 	vst1.8 {d0,d1}, [r0]!
 	vst1.8 {d0,d1}, [r0]!
 	subs  r3, #1
-	bne  loop_0_get_i16x16_luma_pred_v																
+	bne  loop_0_get_i16x16_luma_pred_v
 WELS_ASM_FUNC_END
 
- 
+
 WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredH_neon
     //stmdb sp!, {r4, lr}
 	sub  r1, r1, #1
@@ -87,10 +87,10 @@
 loop_0_get_i16x16_luma_pred_h:
 	//Get one byte data from left side
 	vld1.8 {d0[],d1[]}, [r1], r2
-	vld1.8 {d2[],d3[]}, [r1], r2	
-	vld1.8 {d4[],d5[]}, [r1], r2	
+	vld1.8 {d2[],d3[]}, [r1], r2
+	vld1.8 {d4[],d5[]}, [r1], r2
 	vld1.8 {d6[],d7[]}, [r1], r2
-	
+
 	//Set the line of MB using the left side byte data
 	vst1.8 {d0,d1}, [r0]!
 	//add r0, #16
@@ -100,9 +100,9 @@
 	//add r0, #16
 	vst1.8 {d6,d7}, [r0]!
 	//add r0, #16
-	
+
 	subs  r3, #1
-	bne  loop_0_get_i16x16_luma_pred_h		
+	bne  loop_0_get_i16x16_luma_pred_h
 
 WELS_ASM_FUNC_END
 
@@ -113,11 +113,11 @@
 	sub r3, r1, #1
 	GET_8BYTE_DATA d0, r3, r2
 	GET_8BYTE_DATA d1, r3, r2
-	
+
 	//Get the top horizontal line data
-	sub  r3, r1, r2			
+	sub  r3, r1, r2
 	vldm r3, {d2, d3}
-	
+
 	//Calculate the sum of top horizontal line data and vertical line data
 	vpaddl.u8 q0, q0
 	vpaddl.u8 q1, q1
@@ -125,11 +125,11 @@
 	vadd.u16  d0, d0, d1
 	vpaddl.u16 d0, d0
 	vpaddl.u32 d0, d0
-	
-	//Calculate the mean value 
+
+	//Calculate the mean value
 	vrshr.u16  d0, d0, #5
 	vdup.8     q0, d0[0]
-	
+
 	//Set the mean value to the all of member of MB
 	mov  r3, #4
 loop_0_get_i16x16_luma_pred_dc_both:
@@ -138,8 +138,8 @@
 	vst1.8 {d0,d1}, [r0]!
 	vst1.8 {d0,d1}, [r0]!
 	subs  r3, #1
-	bne  loop_0_get_i16x16_luma_pred_dc_both					
-			
+	bne  loop_0_get_i16x16_luma_pred_dc_both
+
 WELS_ASM_FUNC_END
 
 
@@ -146,13 +146,13 @@
 //The table for SIMD instruction {(8,7,6,5,4,3,2,1) * 5}
 CONST0_GET_I16X16_LUMA_PRED_PLANE: .long 0x191e2328, 0x050a0f14
 
-//The table for SIMD instruction {-7,-6,-5,-4,-3,-2,-1,0}                
+//The table for SIMD instruction {-7,-6,-5,-4,-3,-2,-1,0}
 CONST1_GET_I16X16_LUMA_PRED_PLANE: .long 0xfcfbfaf9, 0x00fffefd
-                  
 
+
 WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredPlane_neon
 	//stmdb sp!, { r4, lr}
-        
+
 	//Load the table {(8,7,6,5,4,3,2,1) * 5}
 	adr r3, CONST0_GET_I16X16_LUMA_PRED_PLANE
 	vldr    d0, [r3]
@@ -161,25 +161,25 @@
 	sub       r3,  r1, r2
 	sub       r1,  r3, #1
 	vld1.8    d1, [r1]
-	
+
 	//Pack the top[8] ~ top[15] to d2
 	add       r1, #9
 	vld1.8    d2, [r1]
-    
+
 	//Save the top[15] to d6 for next step
 	vdup.u8   d6,   d2[7]
-	
+
 	//Get and pack left[-1] ~ left[6] to d4
 	sub       r1,  r3, #1
 	GET_8BYTE_DATA d4, r1, r2
-	
+
 	//Get and pack left[8] ~ left[15] to d3
 	add       r1,  r2
 	GET_8BYTE_DATA d3, r1, r2
-	
+
 	//Save the left[15] to d7 for next step
 	vdup.u8   d7,   d3[7]
-    
+
 	//revert the sequence of d2,d3
 	vrev64.8   q1, q1
 
@@ -186,26 +186,26 @@
 	vsubl.u8   q2, d3, d4 //q2={left[8]-left[6],left[9]-left[5],left[10]-left[4], ...}
 	vsubl.u8   q1, d2, d1 //q1={top[8]-top[6],top[9]-top[5],top[10]-top[4], ...}
 
-        
+
 	vmovl.u8   q0, d0
 	vmul.s16   q1, q0, q1 //q1 = q1*{(8,7,6,5,4,3,2,1) * 5}
 	vmul.s16   q2, q0, q2 //q2 = q2*{(8,7,6,5,4,3,2,1) * 5}
-	
+
 	//Calculate the sum of items of q1, q2
 	vpadd.s16  d0, d2, d3
 	vpadd.s16  d1, d4, d5
 	vpaddl.s16 q0, q0
 	vpaddl.s32 q0, q0
-	
+
 	//Get the value of 'b', 'c' and extend to q1, q2.
 	vrshr.s64  q0, #6
 	vdup.s16   q1, d0[0]
 	vdup.s16   q2, d1[0]
-	
+
 	//Load the table {-7,-6,-5,-4,-3,-2,-1,0} to d0
 	adr r3, CONST1_GET_I16X16_LUMA_PRED_PLANE
 	vld1.32   {d0}, [r3]
-	
+
 	//Get the value of 'a' and save to q3
 	vaddl.u8  q3, d6, d7
 	vshl.u16  q3, #4
@@ -214,22 +214,22 @@
 	vmovl.s8  q0, d0
 	vmla.s16  q3, q0, q1
 	vmla.s16  q3, q2, d0[0]
-	
+
 	//Calculate a+'b'*{1,2,3,4,5,6,7,8} + c*{-7}
 	vshl.s16  q5, q1, #3
 	vadd.s16  q5, q3
-	
+
 	//right shift 5 bits and rounding
 	vqrshrun.s16 d0, q3, #5
 	vqrshrun.s16 d1, q5, #5
-	
+
 	//Set the line of MB
 	vst1.u32  {d0,d1}, [r0]!
-	
-	
+
+
 	//Do the same processing for setting other lines
 	mov  r3, #15
-loop_0_get_i16x16_luma_pred_plane:	
+loop_0_get_i16x16_luma_pred_plane:
 	vadd.s16  q3, q2
 	vadd.s16  q5, q2
 	vqrshrun.s16 d0, q3, #5
@@ -236,35 +236,35 @@
 	vqrshrun.s16 d1, q5, #5
 	vst1.u32  {d0,d1}, [r0]!
 	subs  r3, #1
-	bne  loop_0_get_i16x16_luma_pred_plane	
-		
+	bne  loop_0_get_i16x16_luma_pred_plane
+
 WELS_ASM_FUNC_END
 
- 
+
 WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredV_neon
 	//stmdb sp!, { r2-r5, lr}
 	//Load the top row (4 bytes)
 	sub  r3, r1, r2
 	ldr  r3, [r3]
-	
+
 	//Set the luma MB using top line
 	str  r3, [r0], #4
 	str  r3, [r0], #4
 	str  r3, [r0], #4
 	str  r3, [r0]
-        
+
 WELS_ASM_FUNC_END
 
- 
+
 WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredH_neon
 	//stmdb sp!, { r2-r5, lr}
 	//Load the left column (4 bytes)
 	sub  r3, r1, #1
 	vld1.8 {d0[]}, [r3], r2
-	vld1.8 {d1[]}, [r3], r2	
-	vld1.8 {d2[]}, [r3], r2	
+	vld1.8 {d1[]}, [r3], r2
+	vld1.8 {d2[]}, [r3], r2
 	vld1.8 {d3[]}, [r3]
-	
+
 	//Set the luma MB using the left side byte
 	vst1.32 {d0[0]}, [r0]!
 	vst1.32 {d1[0]}, [r0]!
@@ -279,36 +279,36 @@
 	//Load the top row data(8 bytes)
 	sub    r3,  r1, r2
 	vld1.32  {d0}, [r3]
-	
+
 	//For "t7 + (t7<<1)"
 	vdup.8   d1,  d0[7]
-	
+
 	//calculate "t0+t1,t1+t2,t2+t3...t6+t7,t7+t7"
 	vext.8   d1,  d0, d1, #1
 	vaddl.u8 q1,  d1, d0
-	
+
 	//calculate "x,t0+t1+t1+t2,t1+t2+t2+t3,...t5+t6+t6+t7,t6+t7+t7+t7"
 	vext.8   q2,  q1, q1, #14
 	vadd.u16 q0,  q1, q2
-	
+
 	//right shift 2 bits and rounding
 	vqrshrn.u16  d0,  q0, #2
-	
+
 	//Save "ddl0, ddl1, ddl2, ddl3"
 	vext.8   d1, d0, d0, #1
 	vst1.32  d1[0], [r0]!
-	
+
 	//Save "ddl1, ddl2, ddl3, ddl4"
 	vext.8   d1, d0, d0, #2
 	vst1.32  d1[0], [r0]!
-	
+
 	//Save "ddl2, ddl3, ddl4, ddl5"
 	vext.8   d1, d0, d0, #3
-	vst1.32  d1[0], [r0]!	
-	
+	vst1.32  d1[0], [r0]!
+
 	//Save "ddl3, ddl4, ddl5, ddl6"
-	vst1.32  d0[1], [r0]	
-		
+	vst1.32  d0[1], [r0]
+
 WELS_ASM_FUNC_END
 
 
@@ -317,29 +317,29 @@
 	//Load the top row (4 bytes)
 	sub    r3,  r1, r2
 	vld1.32  {d0[1]}, [r3]
-	
+
 	//Load the left column (5 bytes)
 	sub    r3,  #1
 	vld1.8 {d0[3]}, [r3], r2
-	vld1.8 {d0[2]}, [r3], r2	
+	vld1.8 {d0[2]}, [r3], r2
 	vld1.8 {d0[1]}, [r3], r2
-	vld1.8 {d0[0]}, [r3], r2	
+	vld1.8 {d0[0]}, [r3], r2
 	vld1.8 {d1[7]}, [r3] //For packing the right sequence to do SIMD processing
-	
-	
+
+
 	vext.8   d2, d1, d0, #7   //d0:{L2,L1,L0,LT,T0,T1,T2,T3}
 	                          //d2:{L3,L2,L1,L0,LT,T0,T1,T2}
-	
+
 	//q2:{L2+L3,L1+L2,L0+L1...T1+T2,T2+T3}
 	vaddl.u8 q2, d2, d0
-	
+
 	//q1:{TL0+LT0,LT0+T01,...L12+L23}
 	vext.8   q3, q3, q2, #14
 	vadd.u16 q1, q2, q3
-	
+
 	//right shift 2 bits and rounding
 	vqrshrn.u16 d0, q1, #2
-	
+
 	//Adjust the data sequence for setting luma MB of 'pred'
 	vst1.32   d0[1], [r0]!
 	vext.8    d0, d0, d0, #7
@@ -358,19 +358,19 @@
 	sub    r3,  r1, r2
 	vld1.32  {d0}, [r3]
 
-        
+
 	vext.8   d1,  d0, d0, #1
 	vaddl.u8 q1,  d1, d0     //q1:{t0+t1,t1+t2,t2+t3...t5+t6,x,x}
-	
+
 	vext.8   q2,  q1, q1, #2
 	vadd.u16 q2,  q1, q2     //q2:{t0+t1+t1+t2,t1+t2+t2+t3,...t4+t5+t5+t6,x,x}
-	
+
 	//calculate the "vl0,vl1,vl2,vl3,vl4"
 	vqrshrn.u16  d0,  q1, #1
-	
+
 	//calculate the "vl5,vl6,vl7,vl8,vl9"
 	vqrshrn.u16  d1,  q2, #2
-	
+
 	//Adjust the data sequence for setting the luma MB
 	vst1.32  d0[0], [r0]!
 	vst1.32  d1[0], [r0]!
@@ -378,7 +378,7 @@
 	vext.8   d1,  d1, d1, #1
 	vst1.32  d0[0], [r0]!
 	vst1.32  d1[0], [r0]
-	
+
 WELS_ASM_FUNC_END
 
 
@@ -387,34 +387,34 @@
 	//Load the top row (4 bytes)
 	sub       r3,  r1, r2
 	vld1.32   {d0[1]}, [r3]
-	
+
 	//Load the left column (4 bytes)
 	sub       r3,  #1
-	vld1.8    {d0[3]}, [r3], r2	
+	vld1.8    {d0[3]}, [r3], r2
 	vld1.8    {d0[2]}, [r3], r2
-	vld1.8    {d0[1]}, [r3], r2	
-	vld1.8    {d0[0]}, [r3]	
+	vld1.8    {d0[1]}, [r3], r2
+	vld1.8    {d0[0]}, [r3]
 
-        
+
 	vext.8    d1, d0, d0, #7
 	vaddl.u8  q1, d0, d1      //q1:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3}
-	
+
 	vext.u8   q2, q1, q1, #14
 	vadd.u16  q2, q2, q1      //q2:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3}
-	
+
 	//Calculate the vr0 ~ vr9
 	vqrshrn.u16 d1, q2, #2
 	vqrshrn.u16 d0, q1, #1
-	
+
 	//Adjust the data sequence for setting the luma MB
 	vst1.32  d0[1], [r0]!
 	vst1.32  d1[1], [r0]!
 	//add    r2, r0, r1
 	vst1.8   d1[3], [r0]!
-	vst1.16  d0[2], [r0]!    
+	vst1.16  d0[2], [r0]!
 	vst1.8   d0[6], [r0]!
 	vst1.8   d1[2], [r0]!
-	vst1.16  d1[2], [r0]!    
+	vst1.16  d1[2], [r0]!
 	vst1.8   d1[6], [r0]
 WELS_ASM_FUNC_END
 
@@ -426,29 +426,29 @@
 	mov       r1,  #3
 	mul       r1,  r2
 	add       r1,  r3
-	vld1.8    {d0[]},  [r1]	   	   
-	vld1.8    {d0[4]}, [r3], r2	
+	vld1.8    {d0[]},  [r1]
+	vld1.8    {d0[4]}, [r3], r2
 	vld1.8    {d0[5]}, [r3], r2
-	vld1.8    {d0[6]}, [r3], r2 //d0:{L3,L3,L3,L3,L0,L1,L2,L3}	
+	vld1.8    {d0[6]}, [r3], r2 //d0:{L3,L3,L3,L3,L0,L1,L2,L3}
 
 	vext.8    d1, d0, d0, #1
-	vaddl.u8  q2, d0, d1        //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}	
-	
+	vaddl.u8  q2, d0, d1        //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}
+
 	vext.u8   d2, d5, d4, #2
-	vadd.u16  d3, d2, d5        //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3} 
-	
+	vadd.u16  d3, d2, d5        //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3}
+
 	//Calculate the hu0 ~ hu5
 	vqrshrn.u16 d2, q2, #1
 	vqrshrn.u16 d1, q1, #2
-	
+
 	//Adjust the data sequence for setting the luma MB
 	vzip.8   d2, d1
 	vst1.32  d1[0], [r0]!
-	vext.8   d2, d1, d1, #2	
+	vext.8   d2, d1, d1, #2
 	vst1.32  d2[0], [r0]!
 	vst1.32  d1[1], [r0]!
 	vst1.32  d0[0], [r0]
-	
+
 WELS_ASM_FUNC_END
 
 
@@ -458,22 +458,22 @@
 	sub       r3,  r1, r2
 	sub       r3,  #1
 	vld1.32   {d0[1]}, [r3], r2
-	vld1.8    {d0[3]}, [r3], r2	
+	vld1.8    {d0[3]}, [r3], r2
 	vld1.8    {d0[2]}, [r3], r2
-	vld1.8    {d0[1]}, [r3], r2	
+	vld1.8    {d0[1]}, [r3], r2
 	vld1.8    {d0[0]}, [r3]	    //d0:{L3,L2,L1,L0,LT,T0,T1,T2}
 
 
 	vext.8    d1, d0, d0, #7
 	vaddl.u8  q1, d0, d1        //q1:{x,L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2}
-	
+
 	vext.u8   q2, q1, q1, #14   //q2:{x,x, L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1}
 	vadd.u16  q3, q2, q1        //q3:{x,x,L3+L2+L2+L1,L2+L1+L1+L0,L1+L0+L0+LT,L0+LT+LT+T0,LT+T0+T0+T1,T0+T1+T1+T2}
-	
+
 	//Calculate the hd0~hd9
 	vqrshrn.u16 d1, q3, #2
 	vqrshrn.u16 d0, q2, #1
-	
+
 	//Adjust the data sequence for setting the luma MB
 	vmov      d3, d1
 	vtrn.8    d0, d1
@@ -501,25 +501,25 @@
 	vst1.8 {d0}, [r0]!
 	vst1.8 {d0}, [r0]!
 	vst1.8 {d0}, [r0]!
-	vst1.8 {d0}, [r0]			
-													
+	vst1.8 {d0}, [r0]
+
 WELS_ASM_FUNC_END
 
- 
+
 WELS_ASM_FUNC_BEGIN WelsIChromaPredH_neon
 	//stmdb sp!, { r2-r5, lr}
 	////Get the left column (8 byte)
 	sub  r3, r1, #1
 	vld1.8 {d0[]}, [r3], r2
-	vld1.8 {d1[]}, [r3], r2	
-	vld1.8 {d2[]}, [r3], r2	
+	vld1.8 {d1[]}, [r3], r2
+	vld1.8 {d2[]}, [r3], r2
 	vld1.8 {d3[]}, [r3], r2
 	vld1.8 {d4[]}, [r3], r2
-	vld1.8 {d5[]}, [r3], r2	
-	vld1.8 {d6[]}, [r3], r2	
+	vld1.8 {d5[]}, [r3], r2
+	vld1.8 {d6[]}, [r3], r2
 	vld1.8 {d7[]}, [r3]
-	 
-	//Set the chroma MB using left column data 
+
+	//Set the chroma MB using left column data
 	vst1.8 {d0}, [r0]!
 	vst1.8 {d1}, [r0]!
 	vst1.8 {d2}, [r0]!
@@ -527,8 +527,8 @@
 	vst1.8 {d4}, [r0]!
 	vst1.8 {d5}, [r0]!
 	vst1.8 {d6}, [r0]!
-	vst1.8 {d7}, [r0]	
-	
+	vst1.8 {d7}, [r0]
+
 WELS_ASM_FUNC_END
 
 
@@ -536,36 +536,36 @@
     //stmdb sp!, { r2-r5, lr}
     //Load the left column data (8 bytes)
     sub r3, r1, #1
-    GET_8BYTE_DATA d0, r3, r2	
-    
+    GET_8BYTE_DATA d0, r3, r2
+
     //Load the top row data (8 bytes)
-    sub  r3, r1, r2			
+    sub  r3, r1, r2
     vldr d1, [r3]
-    
+
     //Calculate the sum of left column and top row
     vpaddl.u8  q0, q0
     vpaddl.u16 q0, q0
     vadd.u32   d2, d0, d1 //'m1' save to d2
-    
-    vrshr.u32  q0, q0, #2 //calculate 'm2','m3' 
-    vrshr.u32  d2, d2, #3 //calculate 'm4' 
-    
+
+    vrshr.u32  q0, q0, #2 //calculate 'm2','m3'
+    vrshr.u32  d2, d2, #3 //calculate 'm4'
+
     //duplicate the 'mx' to a vector line
     vdup.8     d4, d2[0]
     vdup.8     d5, d1[4]
     vdup.8     d6, d0[4]
     vdup.8     d7, d2[4]
-    
-    //Set the chroma MB 
+
+    //Set the chroma MB
     vst2.32 {d4[0],d5[0]}, [r0]!
     vst2.32 {d4[0],d5[0]}, [r0]!
-    vst2.32 {d4[0],d5[0]}, [r0]!	
     vst2.32 {d4[0],d5[0]}, [r0]!
+    vst2.32 {d4[0],d5[0]}, [r0]!
     vst2.32 {d6[0],d7[0]}, [r0]!
     vst2.32 {d6[0],d7[0]}, [r0]!
     vst2.32 {d6[0],d7[0]}, [r0]!
     vst2.32 {d6[0],d7[0]}, [r0]
-    		
+
 WELS_ASM_FUNC_END
 
 
@@ -579,36 +579,36 @@
 	//Load the top row data
 	sub  r3, r1, #1
 	sub  r3, r2
-	vld1.32 {d1[0]}, [r3] 
+	vld1.32 {d1[0]}, [r3]
 	add  r3, #5
 	vld1.32 {d0[0]}, [r3]
-	
+
 	//Load the left column data
 	sub  r3, #5
 	vld1.8 {d1[4]}, [r3], r2
-	vld1.8 {d1[5]}, [r3], r2	
+	vld1.8 {d1[5]}, [r3], r2
 	vld1.8 {d1[6]}, [r3], r2
-	vld1.8 {d1[7]}, [r3], r2 //d1:{LT,T0,T1,T2,LT,L0,L1,L2}	
+	vld1.8 {d1[7]}, [r3], r2 //d1:{LT,T0,T1,T2,LT,L0,L1,L2}
 	add  r3, r2
 	vld1.8 {d0[4]}, [r3], r2
 	vld1.8 {d0[5]}, [r3], r2
 	vld1.8 {d0[6]}, [r3], r2
 	vld1.8 {d0[7]}, [r3]     //d0:{T4,T5,T6,T7,L4,L5,L6.L7}
-	
-	
+
+
 	//Save T7 to d3 for next step
 	vdup.u8   d3,   d0[3]
 	//Save L7 to d4 for next step
 	vdup.u8   d4,   d0[7]
-	
+
 	//Calculate the value of 'a' and save to q2
 	vaddl.u8  q2, d3, d4
 	vshl.u16  q2, #4
-	
+
 	//Load the table {{1,2,3,4,1,2,3,4}*17}
 	adr r3, CONST0_GET_I_CHROMA_PRED_PLANE
 	vld1.32   {d2}, [r3]
-	
+
 	//Calculate the 'b','c', and save to q0
 	vrev32.8  d1, d1
 	vsubl.u8  q0, d0, d1
@@ -617,32 +617,32 @@
 	vpaddl.s16 q0, q0
 	vpaddl.s32 q0, q0
 	vrshr.s64  q0, #5
-	
+
 	//Load the table {-3,-2,-1,0,1,2,3,4} to q3
 	adr r3, CONST1_GET_I_CHROMA_PRED_PLANE
 	vld1.32   {d6, d7}, [r3]
-	
+
 	//Duplicate the 'b','c' to q0, q1 for SIMD instruction
 	vdup.s16   q1, d1[0]
 	vdup.s16   q0, d0[0]
-		
+
 	//Calculate the "(a + b * (j - 3) + c * (- 3) + 16) >> 5;"
 	vmla.s16   q2, q0, q3
 	vmla.s16   q2, q1, d6[0]
 	vqrshrun.s16 d0, q2, #5
-	
+
 	//Set a line of chroma MB
 	vst1.u32  {d0}, [r0]!
-	
+
 	//Do the same processing for each line.
 	mov  r3, #7
-loop_0_get_i_chroma_pred_plane:	
+loop_0_get_i_chroma_pred_plane:
 	vadd.s16   q2, q1
 	vqrshrun.s16 d0, q2, #5
 	vst1.u32  {d0}, [r0]!
 	subs  r3, #1
-	bne  loop_0_get_i_chroma_pred_plane		
-    
+	bne  loop_0_get_i_chroma_pred_plane
+
 WELS_ASM_FUNC_END
 
 #endif
--- a/codec/encoder/core/arm/intra_pred_sad_3_opt_neon.S
+++ b/codec/encoder/core/arm/intra_pred_sad_3_opt_neon.S
@@ -29,14 +29,14 @@
  *     POSSIBILITY OF SUCH DAMAGE.
  *
  */
- 
+
 #ifdef HAVE_NEON
 .text
 #include "arm_arch_common_macro.S"
 
- 
+
 #ifdef APPLE_IOS
- //The data sequence will be used 
+ //The data sequence will be used
 .macro GET_8BYTE_DATA_L0
 	vld1.8 {$0[0]}, [$1], $2
 	vld1.8 {$0[1]}, [$1], $2
@@ -49,7 +49,7 @@
 .endm
 
 
-.macro HDM_TRANSFORM_4X4_L0 
+.macro HDM_TRANSFORM_4X4_L0
 
 	//Do the vertical transform
 	vaddl.u8 q0, $0, $1 //{0,4,8,12,1,5,9,13}
@@ -57,15 +57,15 @@
 	vswp  d1, d2
 	vadd.s16 q2, q0, q1 //{0,1,2,3,4,5,6,7}
 	vsub.s16 q1, q0, q1 //{12,13,14,15,8,9,10,11}
-	
+
 	//Do the horizontal transform
 	vtrn.32 q2, q1
 	vadd.s16 q0, q2, q1
 	vsub.s16 q1, q2, q1
-	
+
 	vtrn.16 q0, q1
 	vadd.s16 q2, q0, q1
-	vsub.s16 q1, q0, q1 	
+	vsub.s16 q1, q0, q1
 
 	vmov.s16 d0, d4
 	vmov.s16 d1, d2
@@ -76,9 +76,9 @@
 	vtrn.32 d0, d1 //{0,1,3,2}
 	vaba.s16 $5, d0, $2 //16x16_v
 	vaba.s16 $5, d1, $8
-	vaba.s16 $5, d5, $8	
+	vaba.s16 $5, d5, $8
 	vadd.u16 $5, d3
-	
+
 	//16x16_h
 	vtrn.16 d4, d5 //{0,4,12,8}
 	vaba.s16 $6, d4, $3 //16x16_h
@@ -87,7 +87,7 @@
 	vadd.u16 d2, d3
 	vadd.u16 d2, d5
 	vadd.u16 $6, d2
-	
+
 	//16x16_dc_both
 	vaba.s16 $7, d4, $4 //16x16_dc_both
 	vadd.u16 $7, d2
@@ -95,7 +95,7 @@
 .endm
 
 #else
- //The data sequence will be used 
+ //The data sequence will be used
 .macro GET_8BYTE_DATA_L0 arg0, arg1, arg2
 	vld1.8 {\arg0[0]}, [\arg1], \arg2
 	vld1.8 {\arg0[1]}, [\arg1], \arg2
@@ -115,15 +115,15 @@
 	vswp  d1, d2
 	vadd.s16 q2, q0, q1 //{0,1,2,3,4,5,6,7}
 	vsub.s16 q1, q0, q1 //{12,13,14,15,8,9,10,11}
-	
+
 	//Do the horizontal transform
 	vtrn.32 q2, q1
 	vadd.s16 q0, q2, q1
 	vsub.s16 q1, q2, q1
-	
+
 	vtrn.16 q0, q1
 	vadd.s16 q2, q0, q1
-	vsub.s16 q1, q0, q1 	
+	vsub.s16 q1, q0, q1
 
 	vmov.s16 d0, d4
 	vmov.s16 d1, d2
@@ -134,9 +134,9 @@
 	vtrn.32 d0, d1 //{0,1,3,2}
 	vaba.s16 \arg5, d0, \arg2 //16x16_v
 	vaba.s16 \arg5, d1, \arg8
-	vaba.s16 \arg5, d5, \arg8	
+	vaba.s16 \arg5, d5, \arg8
 	vadd.u16 \arg5, d3
-	
+
 	//16x16_h
 	vtrn.16 d4, d5 //{0,4,12,8}
 	vaba.s16 \arg6, d4, \arg3 //16x16_h
@@ -145,7 +145,7 @@
 	vadd.u16 d2, d3
 	vadd.u16 d2, d5
 	vadd.u16 \arg6, d2
-	
+
 	//16x16_dc_both
 	vaba.s16 \arg7, d4, \arg4 //16x16_dc_both
 	vadd.u16 \arg7, d2
@@ -152,20 +152,20 @@
 .endm
 #endif
 
-WELS_ASM_FUNC_BEGIN satd_intra_16x16_x3_opt_neon
+WELS_ASM_FUNC_BEGIN WelsIntra16x16Combined3Satd_neon
     stmdb sp!, {r4-r7, lr}
 
 	//Get the top line data to 'q15'(16 bytes)
 	sub  r7, r0, r1
     vld1.8 {q15}, [r7]
-    
+
 	//Get the left colume data to 'q14' (16 bytes)
 	sub  r7, r0, #1
 	GET_8BYTE_DATA_L0 d28, r7, r1
-	GET_8BYTE_DATA_L0 d29, r7, r1	
-	
+	GET_8BYTE_DATA_L0 d29, r7, r1
+
 	//Calculate the mean value and save to 'q13->d27(reserve the d26)' (2 bytes)
-	//Calculate the 16x16_dc_both mode SATD	
+	//Calculate the 16x16_dc_both mode SATD
 	vaddl.u8 q0, d30, d31
 	vaddl.u8 q1, d28, d29
 	vadd.u16 q0, q1
@@ -172,15 +172,15 @@
 	vadd.u16 d0, d1
 	vpaddl.u16 d0, d0
 	vpaddl.u32 d0, d0
-	
-	//Calculate the mean value 
+
+	//Calculate the mean value
 	vrshr.u16  d0, #5
-	vshl.u16   d27, d0, #4 
-	
-	
+	vshl.u16   d27, d0, #4
+
+
 	//Calculate the 16x16_v mode SATD and save to "q11, 12"
 	vshll.u8 q0, d30, #2
-	vshll.u8 q1, d31, #2	
+	vshll.u8 q1, d31, #2
 	vtrn.32  q0, q1
 	vadd.s16 q2, q0, q1
 	vsub.s16 q1, q0, q1
@@ -191,7 +191,7 @@
 	                  //{8,9,11,10, 12,13,15,14} q11
     //Calculate the 16x16_h mode SATD and save to "q9, q10"
 	vshll.u8 q0, d28, #2
-	vshll.u8 q1, d29, #2	
+	vshll.u8 q1, d29, #2
 	vtrn.32  q0, q1
 	vadd.s16 q2, q0, q1
 	vsub.s16 q1, q0, q1
@@ -199,64 +199,64 @@
 	vadd.s16 q10, q2, q1
 	vsub.s16 q9,  q2, q1
 	vtrn.32  q10, q9  //{0,1,3,2, 4,5,7,6} q10
-	                  //{8,9,11,10, 12,13,15,14} q9	
-	
+	                  //{8,9,11,10, 12,13,15,14} q9
+
 	vmov.i32 d17, #0//Save the SATD of DC_BOTH
 	vmov.i32 d16, #0//Save the SATD of H
 	vmov.i32 d15, #0//Save the SATD of V
 	vmov.i32 d14, #0//For zero D register
-	//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes	
+	//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
 	vld1.32  {q3}, [r2], r3
 	vld1.32  {q4}, [r2], r3
 	vld1.32  {q5}, [r2], r3
-	vld1.32  {q6}, [r2], r3	
+	vld1.32  {q6}, [r2], r3
 	vtrn.32  q3, q4
-	vtrn.32  q5, q6	
-	
-    HDM_TRANSFORM_4X4_L0 d6, d10, d24, d20, d27, d15, d16, d17, d14 
+	vtrn.32  q5, q6
+
+    HDM_TRANSFORM_4X4_L0 d6, d10, d24, d20, d27, d15, d16, d17, d14
     HDM_TRANSFORM_4X4_L0 d7, d11, d22, d20, d27, d15, d16, d17, d14
     HDM_TRANSFORM_4X4_L0 d8, d12, d25, d20, d27, d15, d16, d17, d14
-    HDM_TRANSFORM_4X4_L0 d9, d13, d23, d20, d27, d15, d16, d17, d14		
+    HDM_TRANSFORM_4X4_L0 d9, d13, d23, d20, d27, d15, d16, d17, d14
 
-	//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes	
+	//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
 	vld1.32  {q3}, [r2], r3
 	vld1.32  {q4}, [r2], r3
 	vld1.32  {q5}, [r2], r3
-	vld1.32  {q6}, [r2], r3	
+	vld1.32  {q6}, [r2], r3
 	vtrn.32  q3, q4
-	vtrn.32  q5, q6	
-	
-    HDM_TRANSFORM_4X4_L0 d6, d10, d24, d21, d27, d15, d16, d17, d14 
+	vtrn.32  q5, q6
+
+    HDM_TRANSFORM_4X4_L0 d6, d10, d24, d21, d27, d15, d16, d17, d14
     HDM_TRANSFORM_4X4_L0 d7, d11, d22, d21, d27, d15, d16, d17, d14
     HDM_TRANSFORM_4X4_L0 d8, d12, d25, d21, d27, d15, d16, d17, d14
-    HDM_TRANSFORM_4X4_L0 d9, d13, d23, d21, d27, d15, d16, d17, d14		
-	
-	//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes	
+    HDM_TRANSFORM_4X4_L0 d9, d13, d23, d21, d27, d15, d16, d17, d14
+
+	//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
 	vld1.32  {q3}, [r2], r3
 	vld1.32  {q4}, [r2], r3
 	vld1.32  {q5}, [r2], r3
-	vld1.32  {q6}, [r2], r3	
+	vld1.32  {q6}, [r2], r3
 	vtrn.32  q3, q4
-	vtrn.32  q5, q6	
-	
-    HDM_TRANSFORM_4X4_L0 d6, d10, d24, d18, d27, d15, d16, d17, d14 
+	vtrn.32  q5, q6
+
+    HDM_TRANSFORM_4X4_L0 d6, d10, d24, d18, d27, d15, d16, d17, d14
     HDM_TRANSFORM_4X4_L0 d7, d11, d22, d18, d27, d15, d16, d17, d14
     HDM_TRANSFORM_4X4_L0 d8, d12, d25, d18, d27, d15, d16, d17, d14
-    HDM_TRANSFORM_4X4_L0 d9, d13, d23, d18, d27, d15, d16, d17, d14		
-	
-	//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes	
+    HDM_TRANSFORM_4X4_L0 d9, d13, d23, d18, d27, d15, d16, d17, d14
+
+	//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
 	vld1.32  {q3}, [r2], r3
 	vld1.32  {q4}, [r2], r3
 	vld1.32  {q5}, [r2], r3
-	vld1.32  {q6}, [r2], r3	
+	vld1.32  {q6}, [r2], r3
 	vtrn.32  q3, q4
-	vtrn.32  q5, q6	
-	
-    HDM_TRANSFORM_4X4_L0 d6, d10, d24, d19, d27, d15, d16, d17, d14 
+	vtrn.32  q5, q6
+
+    HDM_TRANSFORM_4X4_L0 d6, d10, d24, d19, d27, d15, d16, d17, d14
     HDM_TRANSFORM_4X4_L0 d7, d11, d22, d19, d27, d15, d16, d17, d14
     HDM_TRANSFORM_4X4_L0 d8, d12, d25, d19, d27, d15, d16, d17, d14
-    HDM_TRANSFORM_4X4_L0 d9, d13, d23, d19, d27, d15, d16, d17, d14		
-	
+    HDM_TRANSFORM_4X4_L0 d9, d13, d23, d19, d27, d15, d16, d17, d14
+
 	//Get the data from stack
 	ldr r5, [sp, #20] //the addr of Best_mode
 	ldr r6, [sp, #24] //the value of i_lambda
@@ -266,19 +266,19 @@
 	vpaddl.u16 d15, d15
 	vpaddl.u32 d15, d15
 	vmov.u32   r0, d15[0]
-	
+
 	//vadd.u16   d22, d23
 	vrshr.u16  d16, #1
 	vpaddl.u16 d16, d16
 	vpaddl.u32 d16, d16
-	vmov.u32   r1, d16[0] 
+	vmov.u32   r1, d16[0]
 	add  r1, r6, lsl #1
-	
+
 	//vadd.u16   d20, d21
 	vrshr.u16  d17, #1
 	vpaddl.u16 d17, d17
 	vpaddl.u32 d17, d17
-	vmov.u32   r2, d17[0] 
+	vmov.u32   r2, d17[0]
 	add  r2, r6, lsl #1
 
     mov r4, #0
@@ -295,20 +295,20 @@
 WELS_ASM_FUNC_END
 
 
-WELS_ASM_FUNC_BEGIN sad_intra_16x16_x3_opt_neon
+WELS_ASM_FUNC_BEGIN WelsIntra16x16Combined3Sad_neon
     stmdb sp!, {r4-r7, lr}
-	
+
 	//Get the top line data to 'q15'(16 bytes)
 	sub  r4, r0, r1
     vld1.8 {q15}, [r4]
-    
+
 	//Get the left colume data to 'q14' (16 bytes)
 	sub  r4, r0, #1
 	GET_8BYTE_DATA_L0 d28, r4, r1
-	GET_8BYTE_DATA_L0 d29, r4, r1	
-	
+	GET_8BYTE_DATA_L0 d29, r4, r1
+
 	//Calculate the mean value and save to 'q13' (8 bytes)
-	//Calculate the 16x16_dc_both mode SATD	
+	//Calculate the 16x16_dc_both mode SATD
 	vaddl.u8 q0, d30, d31
 	vaddl.u8 q1, d28, d29
 	vadd.u16 q0, q1
@@ -315,40 +315,40 @@
 	vadd.u16 d0, d1
 	vpaddl.u16 d0, d0
 	vpaddl.u32 d0, d0
-	
-	//Calculate the mean value 
+
+	//Calculate the mean value
 	vrshr.u16  d0, d0, #5
 	vdup.8     q13, d0[0]
-	
+
 	sub  r4, r0, #1
-	
+
 	vmov.i32 q12, #0//Save the SATD of DC_BOTH
 	vmov.i32 q11, #0//Save the SATD of H
 	vmov.i32 q10, #0//Save the SATD of V
-	
+
 	mov lr, #16
 sad_intra_16x16_x3_opt_loop0:
     //Get the left colume data to 'd0' (16 bytes)
-	vld1.8 {d0[]}, [r4], r1	
+	vld1.8 {d0[]}, [r4], r1
 
-	//Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes	
+	//Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
 	vld1.8  {q1}, [r2], r3
-	
+
 	subs lr, #1
 	//Do the SAD for top colume
 	vabal.u8  q12, d30, d2
-	vabal.u8  q12, d31, d3	
+	vabal.u8  q12, d31, d3
 
 	//Do the SAD for left colume
 	vabal.u8  q11, d0, d2
-	vabal.u8  q11, d0, d3	
+	vabal.u8  q11, d0, d3
 
 	//Do the SAD for mean value
 	vabal.u8  q10, d26, d2
-	vabal.u8  q10, d26, d3	
-	
+	vabal.u8  q10, d26, d3
+
 	bne sad_intra_16x16_x3_opt_loop0
-	
+
 	//Get the data from stack
 	ldr r5, [sp, #20] //the addr of Best_mode
 	ldr r6, [sp, #24] //the value of i_lambda
@@ -357,19 +357,19 @@
 	vpaddl.u16 d24, d24
 	vpaddl.u32 d24, d24
 	vmov.u32   r0, d24[0]
-	
+
 	vadd.u16   d22, d23
 	vpaddl.u16 d22, d22
 	vpaddl.u32 d22, d22
-	vmov.u32   r1, d22[0] 
+	vmov.u32   r1, d22[0]
 	add  r1, r6, lsl #1
-	
+
 	vadd.u16   d20, d21
 	vpaddl.u16 d20, d20
 	vpaddl.u32 d20, d20
-	vmov.u32   r2, d20[0] 
+	vmov.u32   r2, d20[0]
 	add  r2, r6, lsl #1
-		
+
     mov r4, #0
     cmp r1, r0
     movcc r0, r1
@@ -384,120 +384,120 @@
 WELS_ASM_FUNC_END
 
 
-WELS_ASM_FUNC_BEGIN sad_intra_8x8_x3_opt_neon
+WELS_ASM_FUNC_BEGIN WelsIntra8x8Combined3Sad_neon
     stmdb sp!, {r4-r7, lr}
-	
+
 	//Get the data from stack
 	ldr r4, [sp, #32] //p_dec_cr
 	ldr r5, [sp, #36] //p_enc_cr
-	
+
 	//Get the left colume data to 'd28(cb), d30(cr)' (16 bytes)
 	sub  r6, r0, #1
 	GET_8BYTE_DATA_L0 d28, r6, r1
-	sub  r6, r4, #1	
-	GET_8BYTE_DATA_L0 d30, r6, r1	
-	
+	sub  r6, r4, #1
+	GET_8BYTE_DATA_L0 d30, r6, r1
+
 	//Get the top line data to 'd29(cb), d31(cr)'(16 bytes)
 	sub  r6, r0, r1
     vld1.8 {d29}, [r6]
 	sub  r6, r4, r1
     vld1.8 {d31}, [r6]
-        
+
 	//Calculate the sum of left column and top row
 	vmov.i32   q0, q14
     vpaddl.u8  q0, q0
     vpaddl.u16 q0, q0
     vadd.u32   d2, d0, d1 //'m1' save to d2
-    vrshr.u32  q0, q0, #2 //calculate 'm2','m3' 
-    vrshr.u32  d2, d2, #3 //calculate 'm4' 
-    
-    //duplicate the 'mx' to a vector line  
+    vrshr.u32  q0, q0, #2 //calculate 'm2','m3'
+    vrshr.u32  d2, d2, #3 //calculate 'm4'
+
+    //duplicate the 'mx' to a vector line
     vdup.8     d27, d2[0]
     vdup.8     d26, d1[4]
 	vtrn.32    d27, d26
-	
+
     vdup.8     d26, d0[4]
     vdup.8     d25, d2[4]
     vtrn.32    d26, d25   //Save to "d27, d26"
-	
+
 	vmov.i32   q0, q15
     vpaddl.u8  q0, q0
     vpaddl.u16 q0, q0
     vadd.u32   d2, d0, d1 //'m1' save to d2
-    vrshr.u32  q0, q0, #2 //calculate 'm2','m3' 
-    vrshr.u32  d2, d2, #3 //calculate 'm4' 
-    
+    vrshr.u32  q0, q0, #2 //calculate 'm2','m3'
+    vrshr.u32  d2, d2, #3 //calculate 'm4'
+
     //duplicate the 'mx' to a vector line
     vdup.8     d25, d2[0]
     vdup.8     d24, d1[4]
 	vtrn.32    d25, d24
-	
+
     vdup.8     d24, d0[4]
     vdup.8     d23, d2[4]
 	vtrn.32    d24, d23   //Save to "d25, d24"
-	
+
 	vmov.i32 q11, #0//Save the SATD of DC_BOTH
 	vmov.i32 q10, #0//Save the SATD of H
 	vmov.i32 q9 , #0//Save the SATD of V
 	sub  r6, r0, #1
-	sub  r7, r4, #1	
+	sub  r7, r4, #1
 	mov lr, #4
 sad_intra_8x8_x3_opt_loop0:
 
-	//Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes	
+	//Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
 	vld1.8  {d0}, [r2], r3
 	vld1.8  {d1}, [r5], r3
-	
+
     //Get the left colume data to 'd0' (16 bytes)
-	vld1.8 {d2[]}, [r6], r1	
-	vld1.8 {d3[]}, [r7], r1	
-		
+	vld1.8 {d2[]}, [r6], r1
+	vld1.8 {d3[]}, [r7], r1
+
 	subs lr, #1
 
-	
+
 	//Do the SAD for top colume
-	vabal.u8  q11, d29, d0 
-	vabal.u8  q11, d31, d1	
+	vabal.u8  q11, d29, d0
+	vabal.u8  q11, d31, d1
 
 	//Do the SAD for left colume
 	vabal.u8  q10, d2, d0
-	vabal.u8  q10, d3, d1	
+	vabal.u8  q10, d3, d1
 
 	//Do the SAD for mean value
 	vabal.u8  q9, d27, d0
-	vabal.u8  q9, d25, d1	
-	
-	
+	vabal.u8  q9, d25, d1
+
+
 	bne sad_intra_8x8_x3_opt_loop0
 
 	mov lr, #4
 sad_intra_8x8_x3_opt_loop1:
 
-	//Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes	
+	//Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
 	vld1.8  {d0}, [r2], r3
 	vld1.8  {d1}, [r5], r3
-	
+
     //Get the left colume data to 'd0' (16 bytes)
-	vld1.8 {d2[]}, [r6], r1	
-	vld1.8 {d3[]}, [r7], r1	
-		
+	vld1.8 {d2[]}, [r6], r1
+	vld1.8 {d3[]}, [r7], r1
+
 	subs lr, #1
 
-	
+
 	//Do the SAD for top colume
-	vabal.u8  q11, d29, d0 
-	vabal.u8  q11, d31, d1	
+	vabal.u8  q11, d29, d0
+	vabal.u8  q11, d31, d1
 
 	//Do the SAD for left colume
 	vabal.u8  q10, d2, d0
-	vabal.u8  q10, d3, d1	
+	vabal.u8  q10, d3, d1
 
 	//Do the SAD for mean value
 	vabal.u8  q9, d26, d0
-	vabal.u8  q9, d24, d1	
-	
-	
-	bne sad_intra_8x8_x3_opt_loop1	
+	vabal.u8  q9, d24, d1
+
+
+	bne sad_intra_8x8_x3_opt_loop1
 	//Get the data from stack
 	ldr r5, [sp, #20] //the addr of Best_mode
 	ldr r6, [sp, #24] //the value of i_lambda
@@ -505,13 +505,13 @@
 	vadd.u16   d22, d23
 	vpaddl.u16 d22, d22
 	vpaddl.u32 d22, d22
-	vmov.u32   r0, d22[0] 
+	vmov.u32   r0, d22[0]
 	add  r0, r6, lsl #1
-	
+
 	vadd.u16   d20, d21
 	vpaddl.u16 d20, d20
 	vpaddl.u32 d20, d20
-	vmov.u32   r1, d20[0] 
+	vmov.u32   r1, d20[0]
 	add  r1, r6, lsl #1
 
 	vadd.u16   d18, d19
@@ -533,28 +533,28 @@
 WELS_ASM_FUNC_END
 
 
-WELS_ASM_FUNC_BEGIN satd_intra_8x8_x3_opt_neon
+WELS_ASM_FUNC_BEGIN WelsIntra8x8Combined3Satd_neon
     stmdb sp!, {r4-r7, lr}
-    
+
 	//Get the data from stack
 	ldr r4, [sp, #32] //p_dec_cr
 	ldr r5, [sp, #36] //p_enc_cr
-	
+
 	//Get the top line data to 'd29(cb), d31(cr)'(16 bytes)
 	sub  r6, r0, r1
     vld1.8 {d29}, [r6]
 	sub  r6, r4, r1
     vld1.8 {d31}, [r6]
-        
+
 	//Get the left colume data to 'd28(cb), d30(cr)' (16 bytes)
 	sub  r6, r0, #1
 	GET_8BYTE_DATA_L0 d28, r6, r1
-	sub  r6, r4, #1	
-	GET_8BYTE_DATA_L0 d30, r6, r1	
-		
+	sub  r6, r4, #1
+	GET_8BYTE_DATA_L0 d30, r6, r1
+
 	//Calculate the 16x16_v mode SATD and save to "q12, 13"
 	vshll.u8 q0, d29, #2
-	vshll.u8 q1, d31, #2	
+	vshll.u8 q1, d31, #2
 	vtrn.32  q0, q1
 	vadd.s16 q2, q0, q1
 	vsub.s16 q1, q0, q1
@@ -565,7 +565,7 @@
 	                  //{8,9,11,10, 12,13,15,14} q12
     //Calculate the 16x16_h mode SATD and save to "q10, q11"
 	vshll.u8 q0, d28, #2
-	vshll.u8 q1, d30, #2	
+	vshll.u8 q1, d30, #2
 	vtrn.32  q0, q1
 	vadd.s16 q2, q0, q1
 	vsub.s16 q1, q0, q1
@@ -573,69 +573,69 @@
 	vadd.s16 q11, q2, q1
 	vsub.s16 q10,  q2, q1
 	vtrn.32  q11, q10  //{0,1,3,2, 4,5,7,6} q11
-	                   //{8,9,11,10, 12,13,15,14} q10	
-			
+	                   //{8,9,11,10, 12,13,15,14} q10
+
 	//Calculate the sum of left column and top row
 	//vmov.i32   q0, q14
     vpaddl.u8  q0, q14
     vpaddl.u16 q0, q0
-    vadd.u32   d2, d0, d1 
+    vadd.u32   d2, d0, d1
 
     vpaddl.u8  q2, q15
     vpaddl.u16 q2, q2
-    vadd.u32   d3, d4, d5 
-	
+    vadd.u32   d3, d4, d5
+
 	vtrn.32    q0, q2
 	vrshr.u32  q1, #3
-	vrshr.u32  q2, #2	
+	vrshr.u32  q2, #2
 	vshll.u32  q9, d4, #4 // {2cb, 2cr} q9
 	vshll.u32  q8, d5, #4 // {1cb, 1cr} q8
 	vshll.u32  q7, d2, #4 // {0cb, 3cb} q7
 	vshll.u32  q6, d3, #4 // {0cr, 3cr} q6
-	
-	
+
+
     vmov.i32 d28, #0//Save the SATD of DC_BOTH
 	vmov.i32 d10, #0//Save the SATD of H
 	vmov.i32 d11, #0//Save the SATD of V
 	vmov.i32 d30, #0//For zero D register
-	//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes	
+	//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
 	vld1.32  {d6}, [r2], r3
 	vld1.32  {d7}, [r2], r3
 	vld1.32  {d8}, [r2], r3
-	vld1.32  {d9}, [r2], r3	
+	vld1.32  {d9}, [r2], r3
 	vtrn.32  d6, d7
-	vtrn.32  d8, d9	
+	vtrn.32  d8, d9
     HDM_TRANSFORM_4X4_L0 d6, d8, d26, d22, d14, d11, d10, d28, d30
     HDM_TRANSFORM_4X4_L0 d7, d9, d27, d22, d16, d11, d10, d28, d30
-	
+
 	vld1.32  {d6}, [r5], r3
 	vld1.32  {d7}, [r5], r3
 	vld1.32  {d8}, [r5], r3
-	vld1.32  {d9}, [r5], r3	
+	vld1.32  {d9}, [r5], r3
 	vtrn.32  d6, d7
-	vtrn.32  d8, d9	
+	vtrn.32  d8, d9
     HDM_TRANSFORM_4X4_L0 d6, d8, d24, d20, d12, d11, d10, d28, d30
-    HDM_TRANSFORM_4X4_L0 d7, d9, d25, d20, d17, d11, d10, d28, d30		
+    HDM_TRANSFORM_4X4_L0 d7, d9, d25, d20, d17, d11, d10, d28, d30
 
-	//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes	
+	//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
 	vld1.32  {d6}, [r2], r3
 	vld1.32  {d7}, [r2], r3
 	vld1.32  {d8}, [r2], r3
-	vld1.32  {d9}, [r2], r3	
+	vld1.32  {d9}, [r2], r3
 	vtrn.32  d6, d7
-	vtrn.32  d8, d9	
-    HDM_TRANSFORM_4X4_L0 d6, d8, d26, d23, d18, d11, d10, d28, d30 
+	vtrn.32  d8, d9
+    HDM_TRANSFORM_4X4_L0 d6, d8, d26, d23, d18, d11, d10, d28, d30
     HDM_TRANSFORM_4X4_L0 d7, d9, d27, d23, d15, d11, d10, d28, d30
-	
+
 	vld1.32  {d6}, [r5], r3
 	vld1.32  {d7}, [r5], r3
 	vld1.32  {d8}, [r5], r3
-	vld1.32  {d9}, [r5], r3	
+	vld1.32  {d9}, [r5], r3
 	vtrn.32  d6, d7
-	vtrn.32  d8, d9	
+	vtrn.32  d8, d9
     HDM_TRANSFORM_4X4_L0 d6, d8, d24, d21, d19, d11, d10, d28, d30
-    HDM_TRANSFORM_4X4_L0 d7, d9, d25, d21, d13, d11, d10, d28, d30	
-		
+    HDM_TRANSFORM_4X4_L0 d7, d9, d25, d21, d13, d11, d10, d28, d30
+
 	//Get the data from stack
 	ldr r5, [sp, #20] //the addr of Best_mode
 	ldr r6, [sp, #24] //the value of i_lambda
@@ -643,13 +643,13 @@
 	vrshr.u16  d11, #1
 	vpaddl.u16 d11, d11
 	vpaddl.u32 d11, d11
-	vmov.u32   lr, d11[0] 
+	vmov.u32   lr, d11[0]
 	add  lr, r6, lsl #1
-	
+
 	vrshr.u16  d10, #1
 	vpaddl.u16 d10, d10
 	vpaddl.u32 d10, d10
-	vmov.u32   r3, d10[0] 
+	vmov.u32   r3, d10[0]
 	add  r3, r6, lsl #1
 
 	vrshr.u16  d28, #1
@@ -672,13 +672,13 @@
 WELS_ASM_FUNC_END
 
 
-WELS_ASM_FUNC_BEGIN satd_intra_4x4_x3_opt_neon
+WELS_ASM_FUNC_BEGIN WelsIntra4x4Combined3Satd_neon
     stmdb sp!, {r4-r7, lr}
 
     //Get the top line data to 'd31[0~3]'(4 bytes)
 	sub  r7, r0, r1
     vld1.32 {d31[0]}, [r7]
-    
+
 	//Get the left colume data to 'd31[4~7]' (4 bytes)
 	sub  r7, r0, #1
     vld1.8 {d31[4]}, [r7], r1
@@ -685,18 +685,18 @@
     vld1.8 {d31[5]}, [r7], r1
     vld1.8 {d31[6]}, [r7], r1
     vld1.8 {d31[7]}, [r7], r1
-	
+
 	//Calculate the mean value and save to 'd30' (2 bytes)
 	vpaddl.u8 d0, d31
 	vpaddl.u16 d0, d0
-	vpaddl.u32 d0, d0	
-	//Calculate the mean value 
+	vpaddl.u32 d0, d0
+	//Calculate the mean value
 	vrshr.u16  d0, #3
-	vshl.u16   d30, d0, #4 
-	
+	vshl.u16   d30, d0, #4
+
 	//Calculate the 16x16_v mode SATD and save to "d29"
-    //Calculate the 16x16_h mode SATD and save to "d28"	
-	vshll.u8 q0, d31, #2	
+    //Calculate the 16x16_h mode SATD and save to "d28"
+	vshll.u8 q0, d31, #2
 	vtrn.32  d0, d1
 	vadd.s16 d2, d0, d1
 	vsub.s16 d1, d0, d1
@@ -710,12 +710,12 @@
 	vmov.i32 d26, #0//Save the SATD of H
 	vmov.i32 d25, #0//Save the SATD of V
 	vmov.i32 d24, #0//For zero D register
-	
-	//Load the p_enc data and save to "d22,d23"--- 4X4 bytes	
+
+	//Load the p_enc data and save to "d22,d23"--- 4X4 bytes
 	vld1.32  {d23[0]}, [r2], r3
 	vld1.32  {d23[1]}, [r2], r3
 	vld1.32  {d22[0]}, [r2], r3
-	vld1.32  {d22[1]}, [r2], r3	
+	vld1.32  {d22[1]}, [r2], r3
 
     HDM_TRANSFORM_4X4_L0 d23, d22, d29, d28, d30, d25, d26, d27, d24
 
@@ -723,17 +723,17 @@
 	ldr r5, [sp, #28] //the value of lambda2
 	ldr r6, [sp, #32] //the value of lambda1
 	ldr r7, [sp, #36] //the value of lambda0
-	
+
 	vrshr.u16  d25, #1
 	vpaddl.u16 d25, d25
 	vpaddl.u32 d25, d25
-	vmov.u32   r0, d25[0] 
+	vmov.u32   r0, d25[0]
 	add  r0, r7
-	
+
 	vrshr.u16  d26, #1
 	vpaddl.u16 d26, d26
 	vpaddl.u32 d26, d26
-	vmov.u32   r1, d26[0] 
+	vmov.u32   r1, d26[0]
 	add  r1, r6
 
 	vrshr.u16  d27, #1
@@ -741,10 +741,10 @@
 	vpaddl.u32 d27, d27
 	vmov.u32   r2, d27[0]
 	add  r2, r5
-	
+
 	ldr r5, [sp, #20] //p_dst
-	ldr r6, [sp, #24] //the addr of Best_mode	
-						
+	ldr r6, [sp, #24] //the addr of Best_mode
+
 	mov r4, r0
 	cmp r1, r4
 	movcc r4, r1
@@ -770,8 +770,8 @@
 	vdup.8 d0, d31[4]
 	vdup.8 d1, d31[5]
 	vdup.8 d2, d31[6]
-	vdup.8 d3, d31[7]	
-	vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r5]							
+	vdup.8 d3, d31[7]
+	vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r5]
 
 	bl satd_intra_4x4_x3_opt_end
 satd_intra_4x4_x3_opt_jump1:
@@ -783,11 +783,11 @@
 	vst1.32 {d31[0]}, [r5]!
 	vst1.32 {d31[0]}, [r5]!
 
-			
+
 satd_intra_4x4_x3_opt_end:
-	mov r0, r4	
-		
+	mov r0, r4
+
 	ldmia sp!, {r4-r7, lr}
 WELS_ASM_FUNC_END
 
-#endif
\ No newline at end of file
+#endif
--- a/codec/encoder/core/arm/mc_neon.S
+++ b/codec/encoder/core/arm/mc_neon.S
@@ -1,1963 +1,1963 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#ifdef	HAVE_NEON
-.text
-#include "arm_arch_common_macro.S"
-
-#ifdef APPLE_IOS
-.macro	AVERAGE_TWO_8BITS 
-//	{	// input:dst_d, src_d A and B; working: q13
-		vaddl.u8	q13, $2, $1
-		vrshrn.u16		$0, q13, #1		
-//	}
-.endm
-
-.macro	FILTER_6TAG_8BITS 
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b;
-		vaddl.u8	q12, $0, $5	//q12=src[-2]+src[3]
-		vaddl.u8	q13, $2, $3	//src[0]+src[1]
-		vmla.u16	q12, q13, $7	//q12 += 20*(src[0]+src[1]), 2 cycles
-		vaddl.u8	q13, $1, $4	//src[-1]+src[2]
-		vmls.s16	q12, q13, $8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
-		vqrshrun.s16		$6, q12, #5
-//	}
-.endm
-
-.macro	FILTER_SINGLE_TAG_8BITS		// when width=17/9, used 
-//	{	// input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2},
-		vrev64.8	$2, $0				// X[5][4][3][2][1][0]O
-		vaddl.u8	$3, $0, $2			// each 16bits, *[50][41][32][23][14][05]*	
-		vmul.s16	$0, $2, $1			// 0+1*[50]-5*[41]+20[32]
-		vpadd.s16	$0, $0, $0
-		vpadd.s16	$0, $0, $0
-		vqrshrun.s16	$0, $4, #5
-//	}
-.endm
-
-.macro	FILTER_6TAG_8BITS_AVERAGE_WITH_0 
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b;
-		vaddl.u8	q12, $0, $5	//q12=src[-2]+src[3]
-		vaddl.u8	q13, $2, $3	//src[0]+src[1]
-		vmla.u16	q12, q13, $7	//q12 += 20*(src[0]+src[1]), 2 cycles
-		vaddl.u8	q13, $1, $4	//src[-1]+src[2]
-		vmls.s16	q12, q13, $8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
-		vqrshrun.s16		$6, q12, #5
-		vaddl.u8	q13, $2, $6
-		vrshrn.u16		$6, q13, #1		
-//	}
-.endm
-
-.macro	FILTER_6TAG_8BITS_AVERAGE_WITH_1 
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b;
-		vaddl.u8	q12, $0, $5	//q12=src[-2]+src[3]
-		vaddl.u8	q13, $2, $3	//src[0]+src[1]
-		vmla.u16	q12, q13, $7	//q12 += 20*(src[0]+src[1]), 2 cycles
-		vaddl.u8	q13, $1, $4	//src[-1]+src[2]
-		vmls.s16	q12, q13, $8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
-		vqrshrun.s16		$6, q12, #5
-		vaddl.u8	q13, $3, $6
-		vrshrn.u16		$6, q13, #1		
-//	}
-.endm
-
-.macro	FILTER_6TAG_8BITS_TO_16BITS 
-//	{	// input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q,
-		vaddl.u8	$6, $0, $5		//dst_q=src[-2]+src[3]
-		vaddl.u8	q13, $2, $3	//src[0]+src[1]
-		vmla.u16	$6, q13, $7	//dst_q += 20*(src[0]+src[1]), 2 cycles
-		vaddl.u8	q13, $1, $4	//src[-1]+src[2]
-		vmls.s16	$6, q13, $8	//dst_q -= 5*(src[-1]+src[2]), 2 cycles
-//	}
-.endm
-
-.macro	FILTER_3_IN_16BITS_TO_8BITS
-//	{	// input:a, b, c, dst_d;
-		vsub.s16	$0, $0, $1			//a-b
-		vshr.s16	$0, $0, #2			//(a-b)/4
-		vsub.s16	$0, $0, $1			//(a-b)/4-b
-		vadd.s16	$0, $0, $2			//(a-b)/4-b+c
-		vshr.s16	$0, $0, #2			//((a-b)/4-b+c)/4
-		vadd.s16	$0, $0, $2			//((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-		vqrshrun.s16	$3, $0, #6		//(+32)>>6
-//	}
-.endm
-
-.macro	UNPACK_2_16BITS_TO_ABC
-//	{	// input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a
-		vext.16	$4, $0, $1, #2		//src[0]
-		vext.16	$3, $0, $1, #3		//src[1]
-		vadd.s16	$4, $3					//c=src[0]+src[1]
-
-		vext.16	$3, $0, $1, #1		//src[-1]
-		vext.16	$2, $0, $1, #4		//src[2]
-		vadd.s16	$3, $2					//b=src[-1]+src[2]	
-
-		vext.16	$2, $0, $1, #5		//src[3]	
-		vadd.s16	$2, $0					//a=src[-2]+src[3]
-//	}
-.endm
-
-.macro	UNPACK_1_IN_8x16BITS_TO_8BITS
-//	{	// each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
-		vext.16	$3, $3, $3, #7	// 0x????, [0][1][2][3][4][5],
-		vrev64.16	$1, $1
-		vadd.u16	$2, $1				// C[2+3],B[1+4],A[0+5],
-		vshr.s64	$1, $2, #16
-		vshr.s64	$0, $2, #32		// Output: C $2, B $1, A $0
-		
-		vsub.s16	$0, $0, $1			//a-b
-		vshr.s16	$0, $0, #2			//(a-b)/4
-		vsub.s16	$0, $0, $1			//(a-b)/4-b
-		vadd.s16	$0, $0, $2			//(a-b)/4-b+c
-		vshr.s16	$0, $0, #2			//((a-b)/4-b+c)/4
-		vadd.s16	$1, $0, $2			//((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-		vqrshrun.s16	$0, $3, #6		//(+32)>>6					
-//	}
-.endm
-#else
-.macro	AVERAGE_TWO_8BITS arg0, arg1,arg2
-//	{	// input:dst_d, src_d A and B; working: q13
-		vaddl.u8	q13, \arg2, \arg1
-		vrshrn.u16		\arg0, q13, #1		
-//	}
-.endm
-
-.macro	FILTER_6TAG_8BITS arg0, arg1,arg2, arg3, arg4,arg5, arg6, arg7,arg8 
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b
-		vaddl.u8	q12, \arg0, \arg5	//q12=src[-2]+src[3]
-		vaddl.u8	q13, \arg2, \arg3	//src[0]+src[1]
-		vmla.u16	q12, q13, \arg7	//q12 += 20*(src[0]+src[1]), 2 cycles
-		vaddl.u8	q13, \arg1, \arg4	//src[-1]+src[2]
-		vmls.s16	q12, q13, \arg8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
-		vqrshrun.s16		\arg6, q12, #5
-//	}
-.endm
-
-.macro	FILTER_SINGLE_TAG_8BITS arg0, arg1,arg2, arg3, arg4,arg5		// when width=17/9, used 
-//	{	// input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2}
-		vrev64.8	\arg2, \arg0				// X[5][4][3][2][1][0]O
-		vaddl.u8	\arg3, \arg0, \arg2			// each 16bits, *[50][41][32][23][14][05]*	
-		vmul.s16	\arg0, \arg2, \arg1			// 0+1*[50]-5*[41]+20[32]
-		vpadd.s16	\arg0, \arg0, \arg0
-		vpadd.s16	\arg0, \arg0, \arg0
-		vqrshrun.s16	\arg0, \arg4, #5
-//	}
-.endm
-
-.macro	FILTER_6TAG_8BITS_AVERAGE_WITH_0  arg0, arg1,arg2, arg3, arg4,arg5, arg6, arg7,arg8
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d
-		vaddl.u8	q12, \arg0, \arg5	//q12=src[-2]+src[3]
-		vaddl.u8	q13, \arg2, \arg3	//src[0]+src[1]
-		vmla.u16	q12, q13, \arg7	//q12 += 20*(src[0]+src[1]), 2 cycles
-		vaddl.u8	q13, \arg1, \arg4	//src[-1]+src[2]
-		vmls.s16	q12, q13, \arg8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
-		vqrshrun.s16		\arg6, q12, #5
-		vaddl.u8	q13, \arg2, \arg6
-		vrshrn.u16		\arg6, q13, #1		
-//	}
-.endm
-
-.macro	FILTER_6TAG_8BITS_AVERAGE_WITH_1 arg0, arg1,arg2, arg3, arg4,arg5, arg6, arg7,arg8
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d
-		vaddl.u8	q12, \arg0, \arg5	//q12=src[-2]+src[3]
-		vaddl.u8	q13, \arg2, \arg3	//src[0]+src[1]
-		vmla.u16	q12, q13, \arg7	//q12 += 20*(src[0]+src[1]), 2 cycles
-		vaddl.u8	q13, \arg1, \arg4	//src[-1]+src[2]
-		vmls.s16	q12, q13, \arg8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
-		vqrshrun.s16		\arg6, q12, #5
-		vaddl.u8	q13, \arg3, \arg6
-		vrshrn.u16		\arg6, q13, #1		
-//	}
-.endm
-
-.macro	FILTER_6TAG_8BITS_TO_16BITS arg0, arg1,arg2, arg3, arg4,arg5, arg6, arg7,arg8 
-//	{	// input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3]
-		vaddl.u8	\arg6, \arg0, \arg5		//dst_q=src[-2]+src[3]
-		vaddl.u8	q13, \arg2, \arg3	//src[0]+src[1]
-		vmla.u16	\arg6, q13, \arg7	//dst_q += 20*(src[0]+src[1]), 2 cycles
-		vaddl.u8	q13, \arg1, \arg4	//src[-1]+src[2]
-		vmls.s16	\arg6, q13, \arg8	//dst_q -= 5*(src[-1]+src[2]), 2 cycles
-//	}
-.endm
-
-.macro	FILTER_3_IN_16BITS_TO_8BITS arg0, arg1,arg2, arg3
-//	{	// input:a, b, c, dst_d;
-		vsub.s16	\arg0, \arg0, \arg1			//a-b
-		vshr.s16	\arg0, \arg0, #2			//(a-b)/4
-		vsub.s16	\arg0, \arg0, \arg1			//(a-b)/4-b
-		vadd.s16	\arg0, \arg0, \arg2			//(a-b)/4-b+c
-		vshr.s16	\arg0, \arg0, #2			//((a-b)/4-b+c)/4
-		vadd.s16	\arg0, \arg0, \arg2			//((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-		vqrshrun.s16	\arg3, \arg0, #6		//(+32)>>6
-//	}
-.endm
-
-.macro	UNPACK_2_16BITS_TO_ABC arg0, arg1,arg2, arg3, arg4
-//	{	// input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5)
-		vext.16	\arg4, \arg0, \arg1, #2		//src[0]
-		vext.16	\arg3, \arg0, \arg1, #3		//src[1]
-		vadd.s16	\arg4, \arg3					//c=src[0]+src[1]
-
-		vext.16	\arg3, \arg0, \arg1, #1		//src[-1]
-		vext.16	\arg2, \arg0, \arg1, #4		//src[2]
-		vadd.s16	\arg3, \arg2					//b=src[-1]+src[2]	
-
-		vext.16	\arg2, \arg0, \arg1, #5		//src[3]	
-		vadd.s16	\arg2, \arg0					//a=src[-2]+src[3]
-//	}
-.endm
-
-.macro	UNPACK_1_IN_8x16BITS_TO_8BITS arg0, arg1,arg2, arg3
-//	{	// each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
-		vext.16	\arg3, \arg3, \arg3, #7	// 0x????, [0][1][2][3][4][5]
-		vrev64.16	\arg1, \arg1
-		vadd.u16	\arg2, \arg1				// C[2+3],B[1+4],A[0+5]
-		vshr.s64	\arg1, \arg2, #16
-		vshr.s64	\arg0, \arg2, #32		// Output: C \arg2, B \arg1, A \arg0
-		
-		vsub.s16	\arg0, \arg0, \arg1			//a-b
-		vshr.s16	\arg0, \arg0, #2			//(a-b)/4
-		vsub.s16	\arg0, \arg0, \arg1			//(a-b)/4-b
-		vadd.s16	\arg0, \arg0, \arg2			//(a-b)/4-b+c
-		vshr.s16	\arg0, \arg0, #2			//((a-b)/4-b+c)/4
-		vadd.s16	\arg1, \arg0, \arg2			//((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-		vqrshrun.s16	\arg0, \arg3, #6		//(+32)>>6					
-//	}
-.endm
-#endif
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_h_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
-		
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
-
-w16_h_mc_luma_loop:							
-	vld1.u8	{d0,d1,d2}, [r0], r1	//only use 21(16+5); q0=src[-2]
-	pld			[r0]
-	pld			[r0, #16]
-	
-	vext.8		q2, q0, q1, #1		//q2=src[-1]
-	vext.8		q3, q0, q1, #2		//q3=src[0]
-	vext.8		q4, q0, q1, #3		//q4=src[1]
-	vext.8		q5, q0, q1, #4		//q5=src[2]
-	vext.8		q6, q0, q1, #5		//q6=src[3]
-	
-	FILTER_6TAG_8BITS 	d0, d4, d6, d8, d10, d12, d2, q14, q15
-
-	FILTER_6TAG_8BITS 	d1, d5, d7, d9, d11, d13, d3, q14, q15
-
-	sub		r4, #1	
-	vst1.u8	{d2, d3}, [r2], r3		//write 16Byte
-
-	cmp		r4, #0
-	bne		w16_h_mc_luma_loop
-	pop		{r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w17_h_neon
-	push		{r4-r5}
-	mov			r4, #20
-	mov			r5, #1
-	sub			r4, r4, r4, lsl #(16-2)
-	lsl			r5, #16
-	ror			r4, #16
-	vmov		d3, r5, r4					// 0x0014FFFB00010000
-	
-	sub			r3, #16
-	ldr			r4, [sp, #8]
-		
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
-			
-w17_h_mc_luma_loop:							
-	vld1.u8	{d0,d1,d2}, [r0], r1	//only use 22(17+5); q0=src[-2]
-	
-	vext.8		q2, q0, q1, #1		//q2=src[-1]
-	vext.8		q3, q0, q1, #2		//q3=src[0]
-	vext.8		q4, q0, q1, #3		//q4=src[1]
-	vext.8		q5, q0, q1, #4		//q5=src[2]
-	vext.8		q6, q0, q1, #5		//q6=src[3]
-	
-	FILTER_6TAG_8BITS 	d0, d4, d6, d8, d10, d12, d14, q14, q15
-
-	FILTER_6TAG_8BITS 	d1, d5, d7, d9, d11, d13, d15, q14, q15
-
-	vst1.u8	{d14, d15}, [r2]!		//write [0:15] Byte
-
-	vsli.64	d2, d2, #8				// [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X
-	FILTER_SINGLE_TAG_8BITS	d2, d3, d14, q7, q1
-
-	vst1.u8	{d2[0]}, [r2], r3		//write 16th Byte
-		
-	sub		r4, #1	
-	cmp		r4, #0
-	bne		w17_h_mc_luma_loop
-	pop		{r4-r5}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w9_h_neon
-	push		{r4-r5}
-	mov			r4, #20
-	mov			r5, #1
-	sub			r4, r4, r4, lsl #(16-2)
-	lsl			r5, #16
-	ror			r4, #16
-	vmov		d7, r5, r4					// 0x0014FFFB00010000
-	
-	sub			r3, #8	
-	ldr			r4, [sp, #8]
-
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
-
-w9_h_mc_luma_loop:							
-	vld1.u8	{d0,d1}, [r0], r1	//only use 14(9+5); q0=src[-2]
-	pld			[r0]
-	
-	vext.8		d2, d0, d1, #1		//d2=src[-1]
-	vext.8		d3, d0, d1, #2		//d3=src[0]
-	vext.8		d4, d0, d1, #3		//d4=src[1]
-	vext.8		d5, d0, d1, #4		//d5=src[2]
-	vext.8		d6, d0, d1, #5		//d6=src[3]
-	
-	FILTER_6TAG_8BITS 	d0, d2, d3, d4, d5, d6, d8, q14, q15
-
-	sub		r4, #1	
-	vst1.u8	{d8}, [r2]!		//write [0:7] Byte
-
-	vsli.64	d2, d1, #8				// [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X
-	FILTER_SINGLE_TAG_8BITS	d2, d7, d14, q7, q1
-	vst1.u8	{d2[0]}, [r2], r3		//write 8th Byte
-
-	cmp		r4, #0
-	bne		w9_h_mc_luma_loop
-	pop		{r4-r5}		
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_h_neon
-	push		{r4, r5, r6}
-	ldr			r6, [sp, #12]
-		
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
-
-w4_h_mc_luma_loop:							
-	vld1.u8	{d0, d1}, [r0], r1	//only use 9(4+5);d0: 1st row src[-2:5]
-	pld			[r0]
-	vld1.u8	{d2, d3}, [r0], r1	//d2: 2nd row src[-2:5]
-	pld			[r0]
-
-	vext.8		d4, d0, d1, #1		//d4: 1st row src[-1:6]
-	vext.8		d5, d2, d3, #1		//d5: 2nd row src[-1:6]
-	vext.8		q3, q2, q2, #1		//src[0:6 *]
-	vext.8		q4, q2, q2, #2		//src[1:6 * *]
-	
-	vtrn.32	q3, q4					//q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
-	vtrn.32	d6, d7					//d6:[0:3]; d7[1:4]		
-	vtrn.32		d0, d2				//d0:[-2:1]; d2[2:5]
-	vtrn.32		d4, d5				//d4:[-1:2]; d5[3:6]
-	
-	FILTER_6TAG_8BITS 	d0, d4, d6, d7, d2, d5, d1, q14, q15
-	
-	vmov		r4, r5, d1	
-	str	r4, [r2], r3
-	str	r5, [r2], r3
-		
-	sub		r6, #2
-	cmp		r6, #0
-	bne		w4_h_mc_luma_loop
-	
-	pop		{r4, r5, r6}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_xy_10_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
-		
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
-
-w16_xy_10_mc_luma_loop:							
-	vld1.u8	{d0,d1,d2}, [r0], r1	//only use 21(16+5); q0=src[-2]
-	pld			[r0]
-	pld			[r0, #16]
-	
-	vext.8		q2, q0, q1, #1		//q2=src[-1]
-	vext.8		q3, q0, q1, #2		//q3=src[0]
-	vext.8		q4, q0, q1, #3		//q4=src[1]
-	vext.8		q5, q0, q1, #4		//q5=src[2]
-	vext.8		q6, q0, q1, #5		//q6=src[3]
-	
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d0, d4, d6, d8, d10, d12, d2, q14, q15
-
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d1, d5, d7, d9, d11, d13, d3, q14, q15
-
-	sub		r4, #1	
-	vst1.u8	{d2, d3}, [r2], r3		//write 16Byte
-
-	cmp		r4, #0
-	bne		w16_xy_10_mc_luma_loop
-	pop		{r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w8_xy_10_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
-
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
-
-w8_xy_10_mc_luma_loop:							
-	vld1.u8	{d0,d1}, [r0], r1	//only use 13(8+5); q0=src[-2]
-	pld			[r0]
-	
-	vext.8		d2, d0, d1, #1		//d2=src[-1]
-	vext.8		d3, d0, d1, #2		//d3=src[0]
-	vext.8		d4, d0, d1, #3		//d4=src[1]
-	vext.8		d5, d0, d1, #4		//d5=src[2]
-	vext.8		d6, d0, d1, #5		//d6=src[3]
-	
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d0, d2, d3, d4, d5, d6, d1, q14, q15
-
-	sub		r4, #1	
-	vst1.u8	{d1}, [r2], r3
-
-	cmp		r4, #0
-	bne		w8_xy_10_mc_luma_loop
-	pop		{r4}		
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_xy_10_neon
-	push		{r4, r5, r6}
-	ldr			r6, [sp, #12]
-		
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
-
-w4_xy_10_mc_luma_loop:							
-	vld1.u8	{d0, d1}, [r0], r1	//only use 9(4+5);d0: 1st row src[-2:5]
-	pld			[r0]
-	vld1.u8	{d2, d3}, [r0], r1	//d2: 2nd row src[-2:5]
-	pld			[r0]
-
-	vext.8		d4, d0, d1, #1		//d4: 1st row src[-1:6]
-	vext.8		d5, d2, d3, #1		//d5: 2nd row src[-1:6]
-	vext.8		q3, q2, q2, #1		//src[0:6 *]
-	vext.8		q4, q2, q2, #2		//src[1:6 * *]
-	
-	vtrn.32	q3, q4					//q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
-	vtrn.32	d6, d7					//d6:[0:3]; d7[1:4]		
-	vtrn.32		d0, d2				//d0:[-2:1]; d2[2:5]
-	vtrn.32		d4, d5				//d4:[-1:2]; d5[3:6]
-	
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d0, d4, d6, d7, d2, d5, d1, q14, q15
-	
-	vmov		r4, r5, d1	
-	str	r4, [r2], r3
-	str	r5, [r2], r3
-		
-	sub		r6, #2
-	cmp		r6, #0
-	bne		w4_xy_10_mc_luma_loop
-	
-	pop		{r4, r5, r6}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_xy_30_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
-		
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
-
-w16_xy_30_mc_luma_loop:							
-	vld1.u8	{d0,d1,d2}, [r0], r1	//only use 21(16+5); q0=src[-2]
-	pld			[r0]
-	pld			[r0, #16]
-	
-	vext.8		q2, q0, q1, #1		//q2=src[-1]
-	vext.8		q3, q0, q1, #2		//q3=src[0]
-	vext.8		q4, q0, q1, #3		//q4=src[1]
-	vext.8		q5, q0, q1, #4		//q5=src[2]
-	vext.8		q6, q0, q1, #5		//q6=src[3]
-	
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d0, d4, d6, d8, d10, d12, d2, q14, q15
-
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d1, d5, d7, d9, d11, d13, d3, q14, q15
-
-	sub		r4, #1	
-	vst1.u8	{d2, d3}, [r2], r3		//write 16Byte
-
-	cmp		r4, #0
-	bne		w16_xy_30_mc_luma_loop
-	pop		{r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w8_xy_30_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
-
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
-
-w8_xy_30_mc_luma_loop:							
-	vld1.u8	{d0,d1}, [r0], r1	//only use 13(8+5); q0=src[-2]
-	pld			[r0]
-	
-	vext.8		d2, d0, d1, #1		//d2=src[-1]
-	vext.8		d3, d0, d1, #2		//d3=src[0]
-	vext.8		d4, d0, d1, #3		//d4=src[1]
-	vext.8		d5, d0, d1, #4		//d5=src[2]
-	vext.8		d6, d0, d1, #5		//d6=src[3]
-	
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d0, d2, d3, d4, d5, d6, d1, q14, q15
-
-	sub		r4, #1	
-	vst1.u8	{d1}, [r2], r3
-
-	cmp		r4, #0
-	bne		w8_xy_30_mc_luma_loop
-	pop		{r4}		
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_xy_30_neon
-	push		{r4, r5, r6}
-	ldr			r6, [sp, #12]
-		
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
-
-w4_xy_30_mc_luma_loop:							
-	vld1.u8	{d0, d1}, [r0], r1	//only use 9(4+5);d0: 1st row src[-2:5]
-	pld			[r0]
-	vld1.u8	{d2, d3}, [r0], r1	//d2: 2nd row src[-2:5]
-	pld			[r0]
-
-	vext.8		d4, d0, d1, #1		//d4: 1st row src[-1:6]
-	vext.8		d5, d2, d3, #1		//d5: 2nd row src[-1:6]
-	vext.8		q3, q2, q2, #1		//src[0:6 *]
-	vext.8		q4, q2, q2, #2		//src[1:6 * *]
-	
-	vtrn.32	q3, q4					//q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
-	vtrn.32	d6, d7					//d6:[0:3]; d7[1:4]		
-	vtrn.32		d0, d2				//d0:[-2:1]; d2[2:5]
-	vtrn.32		d4, d5				//d4:[-1:2]; d5[3:6]
-	
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d0, d4, d6, d7, d2, d5, d1, q14, q15
-	
-	vmov		r4, r5, d1	
-	str	r4, [r2], r3
-	str	r5, [r2], r3
-		
-	sub		r6, #2
-	cmp		r6, #0
-	bne		w4_xy_30_mc_luma_loop
-	
-	pop		{r4, r5, r6}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_xy_01_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
-	
-	sub			r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20	
-	vld1.u8	{q0}, [r0], r1		//q0=src[-2]
-	vld1.u8	{q1}, [r0], r1		//q1=src[-1]
-	
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5	
-	vld1.u8	{q2}, [r0], r1		//q2=src[0]
-	vld1.u8	{q3}, [r0], r1		//q3=src[1]		
-	vld1.u8	{q4}, [r0], r1		//q4=src[2]
-
-w16_xy_01_luma_loop:
-
-	vld1.u8	{q5}, [r0], r1		//q5=src[3]
-	
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d0, d2, d4, d6, d8, d10, d12, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d1, d3, d5, d7, d9, d11, d13, q14, q15
-	vld1.u8	{q0}, [r0], r1		//read 2nd row	
-	vst1.u8	{q6}, [r2], r3			//write 1st 16Byte
-	
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d2, d4, d6, d8, d10, d0, d12, q14, q15
-	pld			[r0]	
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d3, d5, d7, d9, d11, d1, d13, q14, q15
-	vld1.u8	{q1}, [r0], r1		//read 3rd row	
-	vst1.u8	{q6}, [r2], r3			//write 2nd 16Byte
-
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d4, d6, d8, d10, d0, d2, d12, q14, q15
-	pld			[r0]	
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d5, d7, d9, d11, d1, d3, d13, q14, q15
-	vld1.u8	{q2}, [r0], r1		//read 4th row	
-	vst1.u8	{q6}, [r2], r3			//write 3rd 16Byte
-
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d6, d8, d10, d0, d2, d4, d12, q14, q15
-	pld			[r0]	
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d7, d9, d11, d1, d3, d5, d13, q14, q15
-	vld1.u8	{q3}, [r0], r1		//read 5th row	
-	vst1.u8	{q6}, [r2], r3			//write 4th 16Byte
-
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d8, d10, d0, d2, d4, d6, d12, q14, q15
-	pld			[r0]	
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d9, d11, d1, d3, d5, d7, d13, q14, q15
-	vld1.u8	{q4}, [r0], r1		//read 6th row	
-	vst1.u8	{q6}, [r2], r3			//write 5th 16Byte
-	
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d10, d0, d2, d4, d6, d8, d12, q14, q15
-	pld			[r0]	
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d11, d1, d3, d5, d7, d9, d13, q14, q15
-	vld1.u8	{q5}, [r0], r1		//read 7th row	
-	vst1.u8	{q6}, [r2], r3			//write 6th 16Byte	
-
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d0, d2, d4, d6, d8, d10, d12, q14, q15
-	pld			[r0]	
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d1, d3, d5, d7, d9, d11, d13, q14, q15
-	vld1.u8	{q0}, [r0], r1		//read 8th row	
-	vst1.u8	{q6}, [r2], r3			//write 7th 16Byte	
-
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d2, d4, d6, d8, d10, d0, d12, q14, q15
-	pld			[r0]	
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d3, d5, d7, d9, d11, d1, d13, q14, q15
-	vst1.u8	{q6}, [r2], r3			//write 8th 16Byte	
-
-	//q2, q3, q4, q5, q0 --> q0~q4
-	vswp	q0, q4
-	vswp	q0, q2
-	vmov	q1, q3
-	vmov	q3, q5						//q0~q4
-		
-	sub		r4, #8	
-	cmp		r4, #0
-	bne		w16_xy_01_luma_loop
-	pop		{r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w8_xy_01_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
-				
-	sub			r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20	
-	vld1.u8	{d0}, [r0], r1		//d0=src[-2]
-	vld1.u8	{d1}, [r0], r1		//d1=src[-1]
-	
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5	
-	vld1.u8	{d2}, [r0], r1		//d2=src[0]
-	vld1.u8	{d3}, [r0], r1		//d3=src[1]	
-
-	vld1.u8	{d4}, [r0], r1		//d4=src[2]
-	vld1.u8	{d5}, [r0], r1		//d5=src[3]
-	
-w8_xy_01_mc_luma_loop:
-	
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d0, d1, d2, d3, d4, d5, d12, q14, q15
-	vld1.u8	{d0}, [r0], r1		//read 2nd row	
-	vst1.u8	{d12}, [r2], r3		//write 1st 8Byte
-	
-	pld			[r0]	
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d1, d2, d3, d4, d5, d0, d12, q14, q15
-	vld1.u8	{d1}, [r0], r1		//read 3rd row	
-	vst1.u8	{d12}, [r2], r3		//write 2nd 8Byte
-
-	pld			[r0]	
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d2, d3, d4, d5, d0, d1, d12, q14, q15
-	vld1.u8	{d2}, [r0], r1		//read 4th row	
-	vst1.u8	{d12}, [r2], r3		//write 3rd 8Byte
-
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d3, d4, d5, d0, d1, d2, d12, q14, q15	
-	vld1.u8	{d3}, [r0], r1		//read 5th row	
-	vst1.u8	{d12}, [r2], r3		//write 4th 8Byte
-
-	//d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
-	vswp	q0, q2
-	vswp	q1, q2
-		
-	sub		r4, #4	
-	cmp		r4, #0
-	bne		w8_xy_01_mc_luma_loop
-		
-	pop		{r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_xy_01_neon
-	push		{r4, r5, r6, r7}
-	sub			r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20	
-	ldr		r4, [r0], r1		//r4=src[-2]
-	ldr		r5, [r0], r1		//r5=src[-1]
-	
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5	
-	ldr		r6, [r0], r1		//r6=src[0]
-	ldr		r7, [r0], r1		//r7=src[1]
-	
-	vmov		d0, r4, r5
-	vmov		d1, r5, r6
-	vmov		d2, r6, r7
-
-	ldr		r4, [r0], r1		//r4=src[2]			
-	vmov		d3, r7, r4	
-	ldr			r7, [sp, #16]
-		
-w4_xy_01_mc_luma_loop:
-
-	//using reserving r4
-	ldr		r5, [r0], r1		//r5=src[3]
-	ldr		r6, [r0], r1		//r6=src[0]	
-	vmov		d4, r4, r5
-	vmov		d5, r5, r6			//reserved r6
-	
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d0, d1, d2, d3, d4, d5, d12, q14, q15
-	vmov		r4, r5, d12
-	str	r4, [r2], r3			//write 1st 4Byte
-	str	r5, [r2], r3			//write 2nd 4Byte
-		
-	ldr		r5, [r0], r1		//r5=src[1]
-	ldr		r4, [r0], r1		//r4=src[2]
-	vmov		d0, r6, r5
-	vmov		d1, r5, r4			//reserved r4
-	
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d2, d3, d4, d5, d0, d1, d12, q14, q15	
-	vmov		r5, r6, d12
-	str	r5, [r2], r3			//write 3rd 4Byte
-	str	r6, [r2], r3			//write 4th 4Byte
-
-	//d4, d5, d0, d1 --> d0, d1, d2, d3
-	vmov	q1, q0
-	vmov	q0, q2
-		
-	sub		r7, #4	
-	cmp		r7, #0
-	bne		w4_xy_01_mc_luma_loop
-		
-	pop		{r4, r5, r6, r7}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_xy_03_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
-	
-	sub			r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20	
-	vld1.u8	{q0}, [r0], r1		//q0=src[-2]
-	vld1.u8	{q1}, [r0], r1		//q1=src[-1]
-	
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5	
-	vld1.u8	{q2}, [r0], r1		//q2=src[0]
-	vld1.u8	{q3}, [r0], r1		//q3=src[1]		
-	vld1.u8	{q4}, [r0], r1		//q4=src[2]
-
-w16_xy_03_luma_loop:
-
-	vld1.u8	{q5}, [r0], r1		//q5=src[3]
-	
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d0, d2, d4, d6, d8, d10, d12, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d1, d3, d5, d7, d9, d11, d13, q14, q15
-	vld1.u8	{q0}, [r0], r1		//read 2nd row	
-	vst1.u8	{q6}, [r2], r3			//write 1st 16Byte
-	
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d2, d4, d6, d8, d10, d0, d12, q14, q15
-	pld			[r0]	
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d3, d5, d7, d9, d11, d1, d13, q14, q15
-	vld1.u8	{q1}, [r0], r1		//read 3rd row	
-	vst1.u8	{q6}, [r2], r3			//write 2nd 16Byte
-
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d4, d6, d8, d10, d0, d2, d12, q14, q15
-	pld			[r0]	
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d5, d7, d9, d11, d1, d3, d13, q14, q15
-	vld1.u8	{q2}, [r0], r1		//read 4th row	
-	vst1.u8	{q6}, [r2], r3			//write 3rd 16Byte
-
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d6, d8, d10, d0, d2, d4, d12, q14, q15
-	pld			[r0]	
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d7, d9, d11, d1, d3, d5, d13, q14, q15
-	vld1.u8	{q3}, [r0], r1		//read 5th row	
-	vst1.u8	{q6}, [r2], r3			//write 4th 16Byte
-
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d8, d10, d0, d2, d4, d6, d12, q14, q15
-	pld			[r0]	
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d9, d11, d1, d3, d5, d7, d13, q14, q15
-	vld1.u8	{q4}, [r0], r1		//read 6th row	
-	vst1.u8	{q6}, [r2], r3			//write 5th 16Byte
-	
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d10, d0, d2, d4, d6, d8, d12, q14, q15
-	pld			[r0]	
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d11, d1, d3, d5, d7, d9, d13, q14, q15
-	vld1.u8	{q5}, [r0], r1		//read 7th row	
-	vst1.u8	{q6}, [r2], r3			//write 6th 16Byte	
-
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d0, d2, d4, d6, d8, d10, d12, q14, q15
-	pld			[r0]	
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d1, d3, d5, d7, d9, d11, d13, q14, q15
-	vld1.u8	{q0}, [r0], r1		//read 8th row	
-	vst1.u8	{q6}, [r2], r3			//write 7th 16Byte	
-
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d2, d4, d6, d8, d10, d0, d12, q14, q15
-	pld			[r0]	
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d3, d5, d7, d9, d11, d1, d13, q14, q15
-	vst1.u8	{q6}, [r2], r3			//write 8th 16Byte	
-
-	//q2, q3, q4, q5, q0 --> q0~q4
-	vswp	q0, q4
-	vswp	q0, q2
-	vmov	q1, q3
-	vmov	q3, q5						//q0~q4
-		
-	sub		r4, #8	
-	cmp		r4, #0
-	bne		w16_xy_03_luma_loop
-	pop		{r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w8_xy_03_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
-				
-	sub			r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20	
-	vld1.u8	{d0}, [r0], r1		//d0=src[-2]
-	vld1.u8	{d1}, [r0], r1		//d1=src[-1]
-	
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5	
-	vld1.u8	{d2}, [r0], r1		//d2=src[0]
-	vld1.u8	{d3}, [r0], r1		//d3=src[1]	
-
-	vld1.u8	{d4}, [r0], r1		//d4=src[2]
-	vld1.u8	{d5}, [r0], r1		//d5=src[3]
-	
-w8_xy_03_mc_luma_loop:
-	
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d0, d1, d2, d3, d4, d5, d12, q14, q15
-	vld1.u8	{d0}, [r0], r1		//read 2nd row	
-	vst1.u8	{d12}, [r2], r3		//write 1st 8Byte
-	
-	pld			[r0]	
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d1, d2, d3, d4, d5, d0, d12, q14, q15
-	vld1.u8	{d1}, [r0], r1		//read 3rd row	
-	vst1.u8	{d12}, [r2], r3		//write 2nd 8Byte
-
-	pld			[r0]	
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d2, d3, d4, d5, d0, d1, d12, q14, q15
-	vld1.u8	{d2}, [r0], r1		//read 4th row	
-	vst1.u8	{d12}, [r2], r3		//write 3rd 8Byte
-
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d3, d4, d5, d0, d1, d2, d12, q14, q15	
-	vld1.u8	{d3}, [r0], r1		//read 5th row	
-	vst1.u8	{d12}, [r2], r3		//write 4th 8Byte
-
-	//d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
-	vswp	q0, q2
-	vswp	q1, q2
-		
-	sub		r4, #4	
-	cmp		r4, #0
-	bne		w8_xy_03_mc_luma_loop
-		
-	pop		{r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_xy_03_neon
-	push		{r4, r5, r6, r7}
-	sub			r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20	
-	ldr		r4, [r0], r1		//r4=src[-2]
-	ldr		r5, [r0], r1		//r5=src[-1]
-	
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5	
-	ldr		r6, [r0], r1		//r6=src[0]
-	ldr		r7, [r0], r1		//r7=src[1]
-	
-	vmov		d0, r4, r5
-	vmov		d1, r5, r6
-	vmov		d2, r6, r7
-
-	ldr		r4, [r0], r1		//r4=src[2]			
-	vmov		d3, r7, r4	
-	ldr			r7, [sp, #16]
-		
-w4_xy_03_mc_luma_loop:
-
-	//using reserving r4
-	ldr		r5, [r0], r1		//r5=src[3]
-	ldr		r6, [r0], r1		//r6=src[0]	
-	vmov		d4, r4, r5
-	vmov		d5, r5, r6			//reserved r6
-	
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d0, d1, d2, d3, d4, d5, d12, q14, q15
-	vmov		r4, r5, d12
-	str	r4, [r2], r3			//write 1st 4Byte
-	str	r5, [r2], r3			//write 2nd 4Byte
-		
-	ldr		r5, [r0], r1		//r5=src[1]
-	ldr		r4, [r0], r1		//r4=src[2]
-	vmov		d0, r6, r5
-	vmov		d1, r5, r4			//reserved r4
-	
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d2, d3, d4, d5, d0, d1, d12, q14, q15	
-	vmov		r5, r6, d12
-	str	r5, [r2], r3			//write 3rd 4Byte
-	str	r6, [r2], r3			//write 4th 4Byte
-
-	//d4, d5, d0, d1 --> d0, d1, d2, d3
-	vmov	q1, q0
-	vmov	q0, q2
-		
-	sub		r7, #4	
-	cmp		r7, #0
-	bne		w4_xy_03_mc_luma_loop
-		
-	pop		{r4, r5, r6, r7}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_v_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
-	
-	sub			r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20	
-	vld1.u8	{q0}, [r0], r1		//q0=src[-2]
-	vld1.u8	{q1}, [r0], r1		//q1=src[-1]
-	
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5	
-	vld1.u8	{q2}, [r0], r1		//q2=src[0]
-	vld1.u8	{q3}, [r0], r1		//q3=src[1]		
-	vld1.u8	{q4}, [r0], r1		//q4=src[2]
-
-w16_v_mc_luma_loop:
-
-	vld1.u8	{q5}, [r0], r1		//q5=src[3]
-	
-	FILTER_6TAG_8BITS 	d0, d2, d4, d6, d8, d10, d12, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d1, d3, d5, d7, d9, d11, d13, q14, q15
-	vld1.u8	{q0}, [r0], r1		//read 2nd row	
-	vst1.u8	{q6}, [r2], r3			//write 1st 16Byte
-	
-	FILTER_6TAG_8BITS 	d2, d4, d6, d8, d10, d0, d12, q14, q15
-	pld			[r0]	
-	FILTER_6TAG_8BITS 	d3, d5, d7, d9, d11, d1, d13, q14, q15
-	vld1.u8	{q1}, [r0], r1		//read 3rd row	
-	vst1.u8	{q6}, [r2], r3			//write 2nd 16Byte
-
-	FILTER_6TAG_8BITS 	d4, d6, d8, d10, d0, d2, d12, q14, q15
-	pld			[r0]	
-	FILTER_6TAG_8BITS 	d5, d7, d9, d11, d1, d3, d13, q14, q15
-	vld1.u8	{q2}, [r0], r1		//read 4th row	
-	vst1.u8	{q6}, [r2], r3			//write 3rd 16Byte
-
-	FILTER_6TAG_8BITS 	d6, d8, d10, d0, d2, d4, d12, q14, q15
-	pld			[r0]	
-	FILTER_6TAG_8BITS 	d7, d9, d11, d1, d3, d5, d13, q14, q15
-	vld1.u8	{q3}, [r0], r1		//read 5th row	
-	vst1.u8	{q6}, [r2], r3			//write 4th 16Byte
-
-	FILTER_6TAG_8BITS 	d8, d10, d0, d2, d4, d6, d12, q14, q15
-	pld			[r0]	
-	FILTER_6TAG_8BITS 	d9, d11, d1, d3, d5, d7, d13, q14, q15
-	vld1.u8	{q4}, [r0], r1		//read 6th row	
-	vst1.u8	{q6}, [r2], r3			//write 5th 16Byte
-	
-	FILTER_6TAG_8BITS 	d10, d0, d2, d4, d6, d8, d12, q14, q15
-	pld			[r0]	
-	FILTER_6TAG_8BITS 	d11, d1, d3, d5, d7, d9, d13, q14, q15
-	vld1.u8	{q5}, [r0], r1		//read 7th row	
-	vst1.u8	{q6}, [r2], r3			//write 6th 16Byte	
-
-	FILTER_6TAG_8BITS 	d0, d2, d4, d6, d8, d10, d12, q14, q15
-	pld			[r0]	
-	FILTER_6TAG_8BITS 	d1, d3, d5, d7, d9, d11, d13, q14, q15
-	vld1.u8	{q0}, [r0], r1		//read 8th row	
-	vst1.u8	{q6}, [r2], r3			//write 7th 16Byte	
-
-	FILTER_6TAG_8BITS 	d2, d4, d6, d8, d10, d0, d12, q14, q15
-	pld			[r0]	
-	FILTER_6TAG_8BITS 	d3, d5, d7, d9, d11, d1, d13, q14, q15
-	vst1.u8	{q6}, [r2], r3			//write 8th 16Byte	
-
-	//q2, q3, q4, q5, q0 --> q0~q4
-	vswp	q0, q4
-	vswp	q0, q2
-	vmov	q1, q3
-	vmov	q3, q5						//q0~q4
-		
-	sub		r4, #8	
-	cmp		r4, #0
-	bne		w16_v_mc_luma_loop
-	pop		{r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w17_v_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
-	
-	sub			r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20	
-	vld1.u8	{q0}, [r0], r1		//q0=src[-2]
-	vld1.u8	{q1}, [r0], r1		//q1=src[-1]
-	
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5	
-	vld1.u8	{q2}, [r0], r1		//q2=src[0]
-	vld1.u8	{q3}, [r0], r1		//q3=src[1]		
-	vld1.u8	{q4}, [r0], r1		//q4=src[2]
-
-w17_v_mc_luma_loop:
-
-	vld1.u8	{q5}, [r0], r1		//q5=src[3]
-	
-	FILTER_6TAG_8BITS 	d0, d2, d4, d6, d8, d10, d12, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d1, d3, d5, d7, d9, d11, d13, q14, q15
-	vld1.u8	{q0}, [r0], r1		//read 2nd row	
-	vst1.u8	{q6}, [r2], r3			//write 1st 16Byte
-	
-	FILTER_6TAG_8BITS 	d2, d4, d6, d8, d10, d0, d12, q14, q15
-	pld			[r0]	
-	FILTER_6TAG_8BITS 	d3, d5, d7, d9, d11, d1, d13, q14, q15
-	vld1.u8	{q1}, [r0], r1		//read 3rd row	
-	vst1.u8	{q6}, [r2], r3			//write 2nd 16Byte
-
-	FILTER_6TAG_8BITS 	d4, d6, d8, d10, d0, d2, d12, q14, q15
-	pld			[r0]	
-	FILTER_6TAG_8BITS 	d5, d7, d9, d11, d1, d3, d13, q14, q15
-	vld1.u8	{q2}, [r0], r1		//read 4th row	
-	vst1.u8	{q6}, [r2], r3			//write 3rd 16Byte
-
-	FILTER_6TAG_8BITS 	d6, d8, d10, d0, d2, d4, d12, q14, q15
-	pld			[r0]	
-	FILTER_6TAG_8BITS 	d7, d9, d11, d1, d3, d5, d13, q14, q15
-	vld1.u8	{q3}, [r0], r1		//read 5th row	
-	vst1.u8	{q6}, [r2], r3			//write 4th 16Byte
-
-	FILTER_6TAG_8BITS 	d8, d10, d0, d2, d4, d6, d12, q14, q15
-	pld			[r0]	
-	FILTER_6TAG_8BITS 	d9, d11, d1, d3, d5, d7, d13, q14, q15
-	vld1.u8	{q4}, [r0], r1		//read 6th row	
-	vst1.u8	{q6}, [r2], r3			//write 5th 16Byte
-	
-	FILTER_6TAG_8BITS 	d10, d0, d2, d4, d6, d8, d12, q14, q15
-	pld			[r0]	
-	FILTER_6TAG_8BITS 	d11, d1, d3, d5, d7, d9, d13, q14, q15
-	vld1.u8	{q5}, [r0], r1		//read 7th row	
-	vst1.u8	{q6}, [r2], r3			//write 6th 16Byte	
-
-	FILTER_6TAG_8BITS 	d0, d2, d4, d6, d8, d10, d12, q14, q15
-	pld			[r0]	
-	FILTER_6TAG_8BITS 	d1, d3, d5, d7, d9, d11, d13, q14, q15
-	vld1.u8	{q0}, [r0], r1		//read 8th row	
-	vst1.u8	{q6}, [r2], r3			//write 7th 16Byte	
-
-	FILTER_6TAG_8BITS 	d2, d4, d6, d8, d10, d0, d12, q14, q15
-	pld			[r0]	
-	FILTER_6TAG_8BITS 	d3, d5, d7, d9, d11, d1, d13, q14, q15
-	vst1.u8	{q6}, [r2], r3			//write 8th 16Byte	
-
-	//q2, q3, q4, q5, q0 --> q0~q4
-	vswp	q0, q4
-	vswp	q0, q2
-	vmov	q1, q3
-	vmov	q3, q5						//q0~q4
-		
-	sub		r4, #8	
-	cmp		r4, #1
-	bne		w17_v_mc_luma_loop
-	// the last 16Bytes
-	vld1.u8	{q5}, [r0], r1		//q5=src[3]
-	FILTER_6TAG_8BITS 	d0, d2, d4, d6, d8, d10, d12, q14, q15
-	FILTER_6TAG_8BITS 	d1, d3, d5, d7, d9, d11, d13, q14, q15
-	vst1.u8	{q6}, [r2], r3			//write 1st 16Byte
-	
-	pop		{r4}
-WELS_ASM_FUNC_END
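-
-// The 16/17-wide vertical filters keep five source rows resident in q0-q4,
-// load the next row into q5 and emit eight output rows per iteration,
-// renumbering the row registers with vswp/vmov instead of reloading them, so
-// the height is assumed to be a multiple of 8; the width-17 variant exits the
-// loop when the counter reaches 1 and filters the single tail row afterwards.
-// The 8/9-wide versions below use the same rotation four rows at a time.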
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w9_v_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
-				
-	sub			r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20	
-	vld1.u8	{d0}, [r0], r1		//d0=src[-2]
-	vld1.u8	{d1}, [r0], r1		//d1=src[-1]
-	
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5	
-	vld1.u8	{d2}, [r0], r1		//d2=src[0]
-	vld1.u8	{d3}, [r0], r1		//d3=src[1]	
-
-	vld1.u8	{d4}, [r0], r1		//d4=src[2]
-	vld1.u8	{d5}, [r0], r1		//d5=src[3]
-	
-w9_v_mc_luma_loop:
-	
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d0, d1, d2, d3, d4, d5, d12, q14, q15
-	vld1.u8	{d0}, [r0], r1		//read 2nd row	
-	vst1.u8	{d12}, [r2], r3		//write 1st 8Byte
-	
-	pld			[r0]	
-	FILTER_6TAG_8BITS 	d1, d2, d3, d4, d5, d0, d12, q14, q15
-	vld1.u8	{d1}, [r0], r1		//read 3rd row	
-	vst1.u8	{d12}, [r2], r3		//write 2nd 8Byte
-
-	pld			[r0]	
-	FILTER_6TAG_8BITS 	d2, d3, d4, d5, d0, d1, d12, q14, q15
-	vld1.u8	{d2}, [r0], r1		//read 4th row	
-	vst1.u8	{d12}, [r2], r3		//write 3rd 8Byte
-
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d3, d4, d5, d0, d1, d2, d12, q14, q15	
-	vld1.u8	{d3}, [r0], r1		//read 5th row	
-	vst1.u8	{d12}, [r2], r3		//write 4th 8Byte
-
-	//d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
-	vswp	q0, q2
-	vswp	q1, q2
-		
-	sub		r4, #4	
-	cmp		r4, #1
-	bne		w9_v_mc_luma_loop
-
-	FILTER_6TAG_8BITS 	d0, d1, d2, d3, d4, d5, d12, q14, q15
-	vst1.u8	{d12}, [r2], r3		//write last 8Byte
-				
-	pop		{r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_v_neon
-	push		{r4, r5, r6, r7}
-	sub			r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20	
-	ldr		r4, [r0], r1		//r4=src[-2]
-	ldr		r5, [r0], r1		//r5=src[-1]
-	
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5	
-	ldr		r6, [r0], r1		//r6=src[0]
-	ldr		r7, [r0], r1		//r7=src[1]
-	
-	vmov		d0, r4, r5
-	vmov		d1, r5, r6
-	vmov		d2, r6, r7
-
-	ldr		r4, [r0], r1		//r4=src[2]			
-	vmov		d3, r7, r4	
-	ldr			r7, [sp, #16]
-		
-w4_v_mc_luma_loop:
-	
-//	pld			[r0]
-	//using reserving r4
-	ldr		r5, [r0], r1		//r5=src[3]
-	ldr		r6, [r0], r1		//r6=src[0]	
-	vmov		d4, r4, r5
-	vmov		d5, r5, r6			//reserved r6
-	
-	FILTER_6TAG_8BITS 	d0, d1, d2, d3, d4, d5, d12, q14, q15
-	vmov		r4, r5, d12
-	str	r4, [r2], r3			//write 1st 4Byte
-	str	r5, [r2], r3			//write 2nd 4Byte
-		
-	ldr		r5, [r0], r1		//r5=src[1]
-	ldr		r4, [r0], r1		//r4=src[2]
-	vmov		d0, r6, r5
-	vmov		d1, r5, r4			//reserved r4
-	
-	FILTER_6TAG_8BITS 	d2, d3, d4, d5, d0, d1, d12, q14, q15	
-	vmov		r5, r6, d12
-	str	r5, [r2], r3			//write 3rd 4Byte
-	str	r6, [r2], r3			//write 4th 4Byte
-
-	//d4, d5, d0, d1 --> d0, d1, d2, d3
-	vmov	q1, q0
-	vmov	q0, q2
-		
-	sub		r7, #4	
-	cmp		r7, #0
-	bne		w4_v_mc_luma_loop
-		
-	pop		{r4, r5, r6, r7}
-WELS_ASM_FUNC_END
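-
-// For the 4-pixel-wide case the source rows are fetched with plain ldr into
-// r4-r7 and packed two rows per d register, so one FILTER_6TAG_8BITS call
-// produces two output rows, written back as two 4-byte str stores via r4/r5.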
-	
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_hv_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
-		
-	sub			r0, #2					//src[-2]
-	sub			r0, r1, lsl #1		//src[-2*src_stride-2]
-	pld			[r0]
-	pld			[r0, r1]
-	
-	vmov.u16	q14, #0x0014			// 20
-	vld1.u8	{d0-d2}, [r0], r1		//use 21(16+5), =src[-2]
-	vld1.u8	{d3-d5}, [r0], r1		//use 21(16+5), =src[-1]
-	
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5
-	
-	vld1.u8	{d6-d8}, [r0], r1		//use 21(16+5), =src[0]
-	vld1.u8	{d9-d11}, [r0], r1	//use 21(16+5), =src[1]		
-	pld			[r0]
-	pld			[r0, r1]
-	vld1.u8	{d12-d14}, [r0], r1	//use 21(16+5), =src[2]
-
-w16_hv_mc_luma_loop:
-
-	vld1.u8	{d15-d17}, [r0], r1	//use 21(16+5), =src[3]
-	//the 1st row
-	pld			[r0]
-	// vertical filtered into q9/q10			
-	FILTER_6TAG_8BITS_TO_16BITS 	d0, d3, d6, d9, d12, d15, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d1, d4, d7,d10, d13, d16,q10, q14, q15	// 8 avail
-	// horizon filtered 
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0	//output to q0[0]
-	// vertical filtered into q10/q11
-	FILTER_6TAG_8BITS_TO_16BITS 	d2, d5, d8,d11, d14, d17,q11, q14, q15	// only 5 avail	
-	// horizon filtered 
-	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1	//output to q0[1]
-	vst1.u8	{q0}, [r2], r3		//write 16Byte
-	
-	
-	vld1.u8	{d0-d2}, [r0], r1		//read 2nd row	
-	//the 2nd row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d3, d6, d9, d12, d15, d0, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d4, d7,d10, d13, d16, d1,q10, q14, q15	// 8 avail		
-	// horizon filtered 
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3	//output to d3
-	// vertical filtered into q10/q11
-	FILTER_6TAG_8BITS_TO_16BITS 	d5, d8,d11, d14, d17, d2,q11, q14, q15	// only 5 avail	
-	// horizon filtered 
-	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4	//output to d4
-	vst1.u8	{d3, d4}, [r2], r3		//write 16Byte
-	
-	vld1.u8	{d3-d5}, [r0], r1		//read 3rd row	
-	//the 3rd row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d6, d9, d12, d15, d0, d3, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d7,d10, d13, d16, d1, d4,q10, q14, q15	// 8 avail	
-	// horizon filtered 
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6	//output to d6
-	// vertical filtered into q10/q11
-	FILTER_6TAG_8BITS_TO_16BITS 	d8,d11, d14, d17, d2, d5,q11, q14, q15	// only 5 avail		
-	// horizon filtered 
-	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7	//output to d7
-	vst1.u8	{d6, d7}, [r2], r3		//write 16Byte
-		
-	vld1.u8	{d6-d8}, [r0], r1		//read 4th row	
-	//the 4th row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	 d9, d12, d15, d0, d3, d6, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS		d10, d13, d16, d1, d4, d7,q10, q14, q15	// 8 avail		
-	// horizon filtered 
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9	//output to d9
-	// vertical filtered into q10/q11
-	FILTER_6TAG_8BITS_TO_16BITS 	d11, d14, d17, d2, d5, d8,q11, q14, q15	// only 5 avail	
-	// horizon filtered 
-	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10	//output to d10
-	vst1.u8	{d9, d10}, [r2], r3		//write 16Byte
-	
-	//d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14
-	vswp	q0, q6
-	vswp	q6, q3
-	vmov	q5, q2	
-	vmov	q2, q8
-	
-	vmov	d20,d8
-	vmov	q4, q1				
-	vmov	q1, q7
-	vmov	d14,d20
-			
-	sub		r4, #4	
-	cmp		r4, #0
-	bne		w16_hv_mc_luma_loop
-	pop		{r4}
-WELS_ASM_FUNC_END
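-
-// The *_hv (centre) cases are computed separably: each output row is first
-// filtered vertically into unscaled 16-bit sums with
-// FILTER_6TAG_8BITS_TO_16BITS, then UNPACK_2_16BITS_TO_ABC and
-// FILTER_3_IN_16BITS_TO_8BITS apply the horizontal 6-tap to those
-// intermediates; four output rows are produced per iteration before the row
-// registers are rotated with vswp/vmov.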
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w17_hv_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
-		
-	sub			r0, #2					//src[-2]
-	sub			r0, r1, lsl #1		//src[-2*src_stride-2]
-	pld			[r0]
-	pld			[r0, r1]
-	
-	vmov.u16	q14, #0x0014			// 20
-	vld1.u8	{d0-d2}, [r0], r1		//use 21(17+5), =src[-2]
-	vld1.u8	{d3-d5}, [r0], r1		//use 21(17+5), =src[-1]
-	
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5
-	
-	vld1.u8	{d6-d8}, [r0], r1		//use 21(17+5), =src[0]
-	vld1.u8	{d9-d11}, [r0], r1	//use 21(17+5), =src[1]		
-	pld			[r0]
-	pld			[r0, r1]
-	vld1.u8	{d12-d14}, [r0], r1	//use 21(17+5), =src[2]
-	sub			r3, #16
-
-w17_hv_mc_luma_loop:
-
-	vld1.u8	{d15-d17}, [r0], r1	//use 21(17+5), =src[3]
-	//the 1st row
-	pld			[r0]
-	// vertical filtered into q9/q10			
-	FILTER_6TAG_8BITS_TO_16BITS 	d0, d3, d6, d9, d12, d15, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d1, d4, d7,d10, d13, d16,q10, q14, q15	// 8 avail
-	// horizon filtered 
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0	//output to q0[0]
-	// vertical filtered into q10/q11
-	FILTER_6TAG_8BITS_TO_16BITS 	d2, d5, d8,d11, d14, d17,q11, q14, q15	// only 6 avail	
-	// horizon filtered 
-	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1	//output to q0[1]
-	vst1.u8	{d0, d1}, [r2]!			//write 16Byte
-	UNPACK_1_IN_8x16BITS_TO_8BITS	d2, d22, d23, q11 //output to d2[0]
-	vst1.u8	{d2[0]}, [r2], r3		//write 16th Byte	
-	
-	vld1.u8	{d0-d2}, [r0], r1		//read 2nd row	
-	//the 2nd row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d3, d6, d9, d12, d15, d0, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d4, d7,d10, d13, d16, d1,q10, q14, q15	// 8 avail		
-	// horizon filtered 
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3	//output to d3
-	// vertical filtered into q10/q11
-	FILTER_6TAG_8BITS_TO_16BITS 	d5, d8,d11, d14, d17, d2,q11, q14, q15	// only 6 avail	
-	// horizon filtered 
-	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4	//output to d4
-	vst1.u8	{d3, d4}, [r2]!		//write 16Byte
-	UNPACK_1_IN_8x16BITS_TO_8BITS	d5, d22, d23, q11 //output to d5[0]
-	vst1.u8	{d5[0]}, [r2], r3		//write 16th Byte	
-		
-	vld1.u8	{d3-d5}, [r0], r1		//read 3rd row	
-	//the 3rd row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d6, d9, d12, d15, d0, d3, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d7,d10, d13, d16, d1, d4,q10, q14, q15	// 8 avail	
-	// horizon filtered 
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6	//output to d6	
-	// vertical filtered into q10/q11
-	FILTER_6TAG_8BITS_TO_16BITS 	d8,d11, d14, d17, d2, d5,q11, q14, q15	// only 6 avail		
-	// horizon filtered 
-	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7	//output to d7
-	vst1.u8	{d6, d7}, [r2]!		//write 16Byte
-	UNPACK_1_IN_8x16BITS_TO_8BITS	d8, d22, d23, q11 //output to d8[0]
-	vst1.u8	{d8[0]}, [r2], r3		//write 16th Byte		
-		
-	vld1.u8	{d6-d8}, [r0], r1		//read 4th row	
-	//the 4th row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	 d9, d12, d15, d0, d3, d6, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS		d10, d13, d16, d1, d4, d7,q10, q14, q15	// 8 avail		
-	// horizon filtered 
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9	//output to d9
-	// vertical filtered into q10/q11
-	FILTER_6TAG_8BITS_TO_16BITS 	d11, d14, d17, d2, d5, d8,q11, q14, q15	// only 6 avail	
-	// horizon filtered 
-	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10	//output to d10
-	vst1.u8	{d9, d10}, [r2], r3		//write 16Byte
-	UNPACK_1_IN_8x16BITS_TO_8BITS	d11, d22, d23, q11 //output to d11[0]
-	vst1.u8	{d11[0]}, [r2], r3		//write 16th Byte	
-		
-	//d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14
-	vswp	q0, q6
-	vswp	q6, q3
-	vmov	q5, q2	
-	vmov	q2, q8
-	
-	vmov	d20,d8
-	vmov	q4, q1				
-	vmov	q1, q7
-	vmov	d14,d20
-			
-	sub		r4, #4	
-	cmp		r4, #1
-	bne		w17_hv_mc_luma_loop
-	//the last row	
-	vld1.u8	{d15-d17}, [r0], r1	//use 21(17+5), =src[3]
-	// vertical filtered into q9/q10			
-	FILTER_6TAG_8BITS_TO_16BITS 	d0, d3, d6, d9, d12, d15, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d1, d4, d7,d10, d13, d16,q10, q14, q15	// 8 avail
-	// horizon filtered 
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0	//output to q0[0]
-	// vertical filtered into q10/q11
-	FILTER_6TAG_8BITS_TO_16BITS 	d2, d5, d8,d11, d14, d17,q11, q14, q15	// only 6 avail	
-	// horizon filtered 
-	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1	//output to q0[1]
-	vst1.u8	{q0}, [r2]!			//write 16Byte	
-	UNPACK_1_IN_8x16BITS_TO_8BITS	d2, d22, d23, q11 //output to d2[0]
-	vst1.u8	{d2[0]}, [r2], r3		//write 16th Byte	
-	
-	pop		{r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w9_hv_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
-		
-	sub			r0, #2				//src[-2]
-	sub			r0, r1, lsl #1	//src[-2*src_stride-2]
-	pld			[r0]
-	pld			[r0, r1]
-	
-	vmov.u16	q14, #0x0014		// 20
-	vld1.u8	{q0}, [r0], r1	//use 14(9+5), =src[-2]
-	vld1.u8	{q1}, [r0], r1	//use 14(9+5), =src[-1]
-	
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2		// 5
-	
-	vld1.u8	{q2}, [r0], r1	//use 14(9+5), =src[0]
-	vld1.u8	{q3}, [r0], r1	//use 14(9+5), =src[1]		
-	pld			[r0]
-	pld			[r0, r1]
-	vld1.u8	{q4}, [r0], r1	//use 14(9+5), =src[2]
-	sub			r3, #8
-
-w9_hv_mc_luma_loop:
-
-	vld1.u8	{q5}, [r0], r1	//use 14(9+5), =src[3]
-	//the 1st row
-	pld			[r0]
-	// vertical filtered into q6/q7			
-	FILTER_6TAG_8BITS_TO_16BITS 	d0, d2, d4, d6, d8, d10, q6, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d1, d3, d5, d7, d9, d11, q7, q14, q15	// 6 avail
-	// horizon filtered 
-	UNPACK_2_16BITS_TO_ABC	q6, q7, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12	//output to q6[0]
-	vst1.u8	d12, [r2]!				//write 8Byte
-	UNPACK_1_IN_8x16BITS_TO_8BITS	d13, d14, d15, q7 //output to d13[0]
-	vst1.u8	{d13[0]}, [r2], r3	//write 8th Byte	
-	
-	vld1.u8	{q0}, [r0], r1		//read 2nd row	
-	//the 2nd row
-	pld			[r0]
-	// vertical filtered into q6/q7			
-	FILTER_6TAG_8BITS_TO_16BITS 	d2, d4, d6, d8, d10, d0, q6, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d3, d5, d7, d9, d11, d1, q7, q14, q15	// 6 avail
-	// horizon filtered 
-	UNPACK_2_16BITS_TO_ABC	q6, q7, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12	//output to q6[0]
-	vst1.u8	d12, [r2]!				//write 8Byte
-	UNPACK_1_IN_8x16BITS_TO_8BITS	d13, d14, d15, q7 //output to d13[0]
-	vst1.u8	{d13[0]}, [r2], r3	//write 8th Byte	
-
-	vld1.u8	{q1}, [r0], r1		//read 3rd row	
-	//the 3rd row
-	pld			[r0]
-	// vertical filtered into q6/q7			
-	FILTER_6TAG_8BITS_TO_16BITS 	d4, d6, d8, d10, d0, d2, q6, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d5, d7, d9, d11, d1, d3, q7, q14, q15	// 6 avail
-	// horizon filtered 
-	UNPACK_2_16BITS_TO_ABC	q6, q7, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12	//output to q6[0]
-	vst1.u8	d12, [r2]!				//write 8Byte
-	UNPACK_1_IN_8x16BITS_TO_8BITS	d13, d14, d15, q7 //output to d13[0]
-	vst1.u8	{d13[0]}, [r2], r3	//write 8th Byte	
-	
-	vld1.u8	{q2}, [r0], r1		//read 4th row	
-	//the 4th row
-	pld			[r0]
-	// vertical filtered into q6/q7			
-	FILTER_6TAG_8BITS_TO_16BITS 	d6, d8, d10, d0, d2, d4, q6, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d7, d9, d11, d1, d3, d5, q7, q14, q15	// 6 avail
-	// horizon filtered 
-	UNPACK_2_16BITS_TO_ABC	q6, q7, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12	//output to q6[0]
-	vst1.u8	d12, [r2]!			//write 8Byte
-	UNPACK_1_IN_8x16BITS_TO_8BITS	d13, d14, d15, q7 //output to d13[0]
-	vst1.u8	{d13[0]}, [r2], r3	//write 8th Byte	
-
-	//q4~q5, q0~q2, --> q0~q4
-	vswp	q0, q4
-	vswp	q2, q4
-	vmov	q3, q1	
-	vmov	q1, q5
-			
-	sub		r4, #4	
-	cmp		r4, #1
-	bne		w9_hv_mc_luma_loop
-	//the last row
-	vld1.u8	{q5}, [r0], r1	//use 14(9+5), =src[3]
-	// vertical filtered into q6/q7			
-	FILTER_6TAG_8BITS_TO_16BITS 	d0, d2, d4, d6, d8, d10, q6, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d1, d3, d5, d7, d9, d11, q7, q14, q15	// 6 avail
-	// horizon filtered 
-	UNPACK_2_16BITS_TO_ABC	q6, q7, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12	//output to q6[0]
-	vst1.u8	d12, [r2]!				//write 8Byte
-	UNPACK_1_IN_8x16BITS_TO_8BITS	d13, d14, d15, q7 //output to d13[0]
-	vst1.u8	{d13[0]}, [r2], r3	//write 8th Byte		
-	pop		{r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_hv_neon
-	push		{r4 ,r5, r6}
-	ldr			r6, [sp, #12]
-	
-	sub			r0, #2				//src[-2]
-	sub			r0, r1, lsl #1	//src[-2*src_stride-2]
-	pld			[r0]
-	pld			[r0, r1]
-	
-	vmov.u16	q14, #0x0014		// 20
-	vld1.u8	{q0}, [r0], r1	//use 9(4+5), =src[-2]
-	vld1.u8	{q1}, [r0], r1	//use 9(4+5), =src[-1]
-	
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2		// 5
-	
-	vld1.u8	{q2}, [r0], r1	//use 9(4+5), =src[0]
-	vld1.u8	{q3}, [r0], r1	//use 9(4+5), =src[1]		
-	pld			[r0]
-	pld			[r0, r1]
-	vld1.u8	{q4}, [r0], r1	//use 9(4+5), =src[2]
-	
-w4_hv_mc_luma_loop:
-
-	vld1.u8	{q5}, [r0], r1	//use 9(4+5), =src[3]
-	vld1.u8	{q6}, [r0], r1	//use 9(4+5), =src[4]
-		
-	//the 1st&2nd row
-	pld			[r0]
-	pld			[r0, r1]	
-	// vertical filtered			
-	FILTER_6TAG_8BITS_TO_16BITS 	d0, d2, d4, d6, d8, d10, q7, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d1, d3, d5, d7, d9, d11, q8, q14, q15	// 1 avail
-		
-	FILTER_6TAG_8BITS_TO_16BITS 	d2, d4, d6, d8,d10, d12, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d3, d5, d7, d9,d11, d13,q10, q14, q15	// 1 avail	
-	// horizon filtered 
-	UNPACK_2_16BITS_TO_ABC	q7, q8, q11, q12, q13	//4 avail
-	UNPACK_2_16BITS_TO_ABC	q9,q10, q0, q7, q8		//4 avail
-	
-	vmov	d23, d0
-	vmov	d25, d14
-	vmov	d27, d16
-		
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22	//output to q11[0]
-	vmov		r4 ,r5, d22
-	str		r4, [r2], r3				//write 4Byte
-	str		r5, [r2], r3				//write 4Byte
-			
-	//the 3rd&4th row
-	vld1.u8	{q0}, [r0], r1	//use 9(4+5), =src[3]
-	vld1.u8	{q1}, [r0], r1	//use 9(4+5), =src[4]	
-	pld			[r0]
-	pld			[r0, r1]	
-	// vertical filtered			
-	FILTER_6TAG_8BITS_TO_16BITS 	d4, d6, d8, d10, d12, d0, q7, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d5, d7, d9, d11, d13, d1, q8, q14, q15	// 1 avail
-		
-	FILTER_6TAG_8BITS_TO_16BITS 	d6, d8,d10, d12, d0, d2, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d7, d9,d11, d13, d1, d3,q10, q14, q15	// 1 avail	
-	// horizon filtered 
-	UNPACK_2_16BITS_TO_ABC	q7, q8, q11, q12, q13	//4 avail
-	UNPACK_2_16BITS_TO_ABC	q9,q10, q2, q7, q8		//4 avail
-	
-	vmov	d23, d4
-	vmov	d25, d14
-	vmov	d27, d16
-		
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22	//output to q11[0]
-	vmov		r4 ,r5, d22
-	str		r4, [r2], r3				//write 4Byte
-	str		r5, [r2], r3				//write 4Byte
-
-	//q4~q6, q0~q1, --> q0~q4
-	vswp	q4, q0
-	vmov	q3, q4	
-	vmov	q4, q1
-	vmov	q1, q5	
-	vmov	q2, q6
-					
-	sub		r6, #4	
-	cmp		r6, #0
-	bne		w4_hv_mc_luma_loop
-
-	pop		{r4, r5, r6}
-WELS_ASM_FUNC_END
-	
-
-WELS_ASM_FUNC_BEGIN enc_mc_copy_w16_neon
-	push		{r4}
-	ldr			r4, [sp, #4]	
-w16_copy_loop:
-	vld1.u8		{q0}, [r0], r1
-	vld1.u8		{q1}, [r0], r1
-	vst1.u8		{q0}, [r2], r3	
-	vst1.u8		{q1}, [r2], r3	
-	sub			r4, #2
-	cmp			r4, #0
-	bne			w16_copy_loop
-	
-	pop		{r4}	
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_copy_w8_neon
-	push		{r4}
-	ldr			r4, [sp, #4]	
-w8_copy_loop:
-	vld1.u8		{d0}, [r0], r1
-	vld1.u8		{d1}, [r0], r1
-	vst1.u8		{d0}, [r2], r3	
-	vst1.u8		{d1}, [r2], r3	
-	sub			r4, #2
-	cmp			r4, #0
-	bne			w8_copy_loop
-	
-	pop		{r4}	
-WELS_ASM_FUNC_END
-	
-
-WELS_ASM_FUNC_BEGIN enc_mc_copy_w4_neon
-	push		{r4, r5, r6}
-	ldr			r4, [sp, #12]	
-w4_copy_loop:
-	ldr		r5, [r0], r1
-	ldr		r6, [r0], r1
-	str		r5, [r2], r3	
-	str		r6, [r2], r3
-		
-	sub			r4, #2
-	cmp			r4, #0
-	bne			w4_copy_loop
-	
-	pop		{r4, r5, r6}	
-WELS_ASM_FUNC_END
-	
-
-WELS_ASM_FUNC_BEGIN enc_pixel_avg_w16_neon
-	push		{r4}
-	ldr			r4, [sp, #4]	
-w16_pix_avg_loop:
-	vld1.u8		{q0}, [r2]!
-	vld1.u8		{q1}, [r3]!
-	vld1.u8		{q2}, [r2]!
-	vld1.u8		{q3}, [r3]!
-	
-	vld1.u8		{q4}, [r2]!
-	vld1.u8		{q5}, [r3]!
-	vld1.u8		{q6}, [r2]!
-	vld1.u8		{q7}, [r3]!
-		
-	AVERAGE_TWO_8BITS		d0, d0, d2
-	AVERAGE_TWO_8BITS		d1, d1, d3
-	vst1.u8		{q0}, [r0], r1
-		
-	AVERAGE_TWO_8BITS		d4, d4, d6
-	AVERAGE_TWO_8BITS		d5, d5, d7
-	vst1.u8		{q2}, [r0], r1
-	
-	AVERAGE_TWO_8BITS		d8, d8, d10
-	AVERAGE_TWO_8BITS		d9, d9, d11
-	vst1.u8		{q4}, [r0], r1
-		
-	AVERAGE_TWO_8BITS		d12, d12, d14
-	AVERAGE_TWO_8BITS		d13, d13, d15
-	vst1.u8		{q6}, [r0], r1
-			
-	sub			r4, #4
-	cmp			r4, #0
-	bne			w16_pix_avg_loop
-	
-	pop		{r4}	
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_pix_avg_w16_neon
-	push		{r4, r5, r6}
-	ldr			r4, [sp, #12]
-	ldr			r5, [sp, #16]
-	ldr			r6, [sp, #20]	
-		
-enc_w16_pix_avg_loop:
-	vld1.u8		{q0}, [r2], r3
-	vld1.u8		{q1}, [r4], r5
-	vld1.u8		{q2}, [r2], r3
-	vld1.u8		{q3}, [r4], r5
-	
-	vld1.u8		{q4}, [r2], r3
-	vld1.u8		{q5}, [r4], r5
-	vld1.u8		{q6}, [r2], r3
-	vld1.u8		{q7}, [r4], r5
-		
-	AVERAGE_TWO_8BITS		d0, d0, d2
-	AVERAGE_TWO_8BITS		d1, d1, d3
-	vst1.u8		{q0}, [r0], r1
-		
-	AVERAGE_TWO_8BITS		d4, d4, d6
-	AVERAGE_TWO_8BITS		d5, d5, d7
-	vst1.u8		{q2}, [r0], r1
-	
-	AVERAGE_TWO_8BITS		d8, d8, d10
-	AVERAGE_TWO_8BITS		d9, d9, d11
-	vst1.u8		{q4}, [r0], r1
-		
-	AVERAGE_TWO_8BITS		d12, d12, d14
-	AVERAGE_TWO_8BITS		d13, d13, d15
-	vst1.u8		{q6}, [r0], r1
-			
-	sub			r6, #4
-	cmp			r6, #0
-	bne			enc_w16_pix_avg_loop
-	
-	pop		{r4, r5, r6}
-WELS_ASM_FUNC_END
-	
-
-WELS_ASM_FUNC_BEGIN enc_pix_avg_w8_neon
-	push		{r4, r5, r6}
-	ldr			r4, [sp, #12]
-	ldr			r5, [sp, #16]
-	ldr			r6, [sp, #20]	
-enc_w8_pix_avg_loop:
-
-	vld1.u8		{d0}, [r2], r3
-	vld1.u8		{d2}, [r4], r5
-	vld1.u8		{d1}, [r2], r3
-	vld1.u8		{d3}, [r4], r5
-		
-	AVERAGE_TWO_8BITS		d0, d0, d2
-	AVERAGE_TWO_8BITS		d1, d1, d3
-	vst1.u8		{d0}, [r0], r1
-	vst1.u8		{d1}, [r0], r1
-
-	vld1.u8		{d4}, [r2], r3
-	vld1.u8		{d6}, [r4], r5
-	vld1.u8		{d5}, [r2], r3	
-	vld1.u8		{d7}, [r4], r5
-			
-	AVERAGE_TWO_8BITS		d4, d4, d6
-	AVERAGE_TWO_8BITS		d5, d5, d7
-	vst1.u8		{d4}, [r0], r1
-	vst1.u8		{d5}, [r0], r1	
-			
-	sub			r6, #4
-	cmp			r6, #0
-	bne			enc_w8_pix_avg_loop
-	
-	pop		{r4, r5, r6}	
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_chroma_w8_neon
-
-	push		{r4, r5}
-	ldr			r4, [sp, #8]
-	ldr			r5, [sp, #12]
-	vld1.u8	{d31}, [r4]		//load A/B/C/D
-	vld1.u8		{q0}, [r0], r1	//src[x]
-
-	vdup.u8	d28, d31[0]			//A     
-	vdup.u8	d29, d31[1]			//B
-	vdup.u8	d30, d31[2]			//C     
-	vdup.u8	d31, d31[3]			//D	
-	
-	vext.u8		d1, d0, d1, #1		//src[x+1]
-	
-w8_mc_chroma_loop:	// each two pxl row
-	vld1.u8		{q1}, [r0], r1	//src[x+stride]
-	vld1.u8		{q2}, [r0], r1	//src[x+2*stride]
-	vext.u8		d3, d2, d3, #1		//src[x+stride+1]
-	vext.u8		d5, d4, d5, #1		//src[x+2*stride+1]
-			
-	vmull.u8		q3, d0, d28			//(src[x] * A)		
-	vmlal.u8		q3, d1, d29			//+=(src[x+1] * B)
-	vmlal.u8		q3, d2, d30			//+=(src[x+stride] * C)
-	vmlal.u8		q3, d3, d31			//+=(src[x+stride+1] * D)
-	vrshrn.u16		d6, q3, #6
-	vst1.u8	d6, [r2], r3
-	
-	vmull.u8		q3, d2, d28			//(src[x] * A)		
-	vmlal.u8		q3, d3, d29			//+=(src[x+1] * B)
-	vmlal.u8		q3, d4, d30			//+=(src[x+stride] * C)
-	vmlal.u8		q3, d5, d31			//+=(src[x+stride+1] * D)
-	vrshrn.u16		d6, q3, #6
-	vst1.u8	d6, [r2], r3	
-		
-	vmov		q0, q2
-	sub			r5, #2
-	cmp			r5, #0
-	bne			w8_mc_chroma_loop
-	
-	pop		{r4, r5}	
-WELS_ASM_FUNC_END
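-
-// Chroma interpolation is bilinear: d28-d31 are broadcast from the four
-// weights A/B/C/D loaded through the fifth argument (presumably the usual
-// (8-dx)*(8-dy), dx*(8-dy), (8-dx)*dy, dx*dy table), and every output byte is
-// (A*p00 + B*p01 + C*p10 + D*p11 + 32) >> 6 via vmull/vmlal and vrshrn #6.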
-	
-
-WELS_ASM_FUNC_BEGIN enc_mc_chroma_w4_neon
-
-	push		{r4, r5, r6}
-	ldr			r4, [sp, #12]
-	ldr			r6, [sp, #16]
-	vld1.u8	{d31}, [r4]		//load A/B/C/D
-
-	vdup.u8	d28, d31[0]			//A     
-	vdup.u8	d29, d31[1]			//B
-	vdup.u8	d30, d31[2]			//C     
-	vdup.u8	d31, d31[3]			//D	
-	
-w4_mc_chroma_loop:	// each two pxl row
-	vld1.u8		{d0}, [r0], r1	//a::src[x]
-	vld1.u8		{d2}, [r0], r1	//b::src[x+stride]
-	vld1.u8		{d4}, [r0]			//c::src[x+2*stride]
-	
-	vshr.u64		d1, d0, #8	
-	vshr.u64		d3, d2, #8	
-	vshr.u64		d5, d4, #8	
-	
-	vmov			q3, q1				//b::[0:7]+b::[1~8]
-	vtrn.32		q0, q1				//d0{a::[0:3]+b::[0:3]}; d1{a::[1:4]+b::[1:4]}
-	vtrn.32		q3, q2				//d6{b::[0:3]+c::[0:3]}; d7{b::[1:4]+c::[1:4]}
-			
-	vmull.u8		q1, d0, d28			//(src[x] * A)		
-	vmlal.u8		q1, d1, d29			//+=(src[x+1] * B)
-	vmlal.u8		q1, d6, d30			//+=(src[x+stride] * C)
-	vmlal.u8		q1, d7, d31			//+=(src[x+stride+1] * D)
-	
-	vrshrn.u16		d2, q1, #6
-	vmov		r4, r5, d2
-	str	r4, [r2], r3
-	str	r5, [r2], r3
-
-	sub			r6, #2
-	cmp			r6, #0
-	bne			w4_mc_chroma_loop
-	
-	pop		{r4, r5, r6}	
-WELS_ASM_FUNC_END
-#endif
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifdef	HAVE_NEON
+.text
+#include "arm_arch_common_macro.S"
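+
+// The helper macros below are defined twice: the APPLE_IOS branch references
+// macro parameters as $0..$8, while the GNU as branch after the #else names
+// them \arg0..\arg8.  Both variants expand to the same NEON instruction
+// sequences.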
+
+#ifdef APPLE_IOS
+.macro	AVERAGE_TWO_8BITS
+//	{	// input:dst_d, src_d A and B; working: q13
+    vaddl.u8	q13, $2, $1
+    vrshrn.u16		$0, q13, #1
+//	}
+.endm
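+
+// i.e. dst[i] = (A[i] + B[i] + 1) >> 1 per 8-bit lane: the widening add keeps
+// the 9-bit sum exact and vrshrn.u16 #1 narrows it back with rounding.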
+
+.macro	FILTER_6TAG_8BITS
+//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b;
+    vaddl.u8	q12, $0, $5	//q12=src[-2]+src[3]
+    vaddl.u8	q13, $2, $3	//src[0]+src[1]
+    vmla.u16	q12, q13, $7	//q12 += 20*(src[0]+src[1]), 2 cycles
+    vaddl.u8	q13, $1, $4	//src[-1]+src[2]
+    vmls.s16	q12, q13, $8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
+    vqrshrun.s16		$6, q12, #5
+//	}
+.endm
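+
+// Scalar view of the 6-tag filter above (8-bit samples assumed):
+//   t      = (src[-2] + src[3]) - 5*(src[-1] + src[2]) + 20*(src[0] + src[1])
+//   dst[i] = clip255((t + 16) >> 5)      // vqrshrun.s16 #5 = round + saturate
+// e.g. a flat run of 128s gives t = 4096 and (4096 + 16) >> 5 = 128.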
+
+.macro	FILTER_SINGLE_TAG_8BITS		// used when width is 17 or 9
+//	{	// input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2},
+    vrev64.8	$2, $0				// X[5][4][3][2][1][0]O
+    vaddl.u8	$3, $0, $2			// each 16bits, *[50][41][32][23][14][05]*
+    vmul.s16	$0, $2, $1			// 0 + 1*[50] - 5*[41] + 20*[32]
+    vpadd.s16	$0, $0, $0
+    vpadd.s16	$0, $0, $0
+    vqrshrun.s16	$0, $4, #5
+//	}
+.endm
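+
+// Produces the single extra pixel needed by the width-17/9 cases: the input d
+// register holds a dummy byte followed by src[0..5], so the byte reverse plus
+// the widening add form the pair sums src[0]+src[5], src[1]+src[4],
+// src[2]+src[3] (and one junk lane), vmul.s16 weights them with the packed
+// lanes {0, 1, -5, 20} in $1, and the two vpadd.s16 steps fold everything
+// into the 6-tap sum before the usual (t + 16) >> 5 rounding.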
+
+.macro	FILTER_6TAG_8BITS_AVERAGE_WITH_0
+//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b;
+    vaddl.u8	q12, $0, $5	//q12=src[-2]+src[3]
+    vaddl.u8	q13, $2, $3	//src[0]+src[1]
+    vmla.u16	q12, q13, $7	//q12 += 20*(src[0]+src[1]), 2 cycles
+    vaddl.u8	q13, $1, $4	//src[-1]+src[2]
+    vmls.s16	q12, q13, $8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
+    vqrshrun.s16		$6, q12, #5
+    vaddl.u8	q13, $2, $6
+    vrshrn.u16		$6, q13, #1
+//	}
+.endm
+
+.macro	FILTER_6TAG_8BITS_AVERAGE_WITH_1
+//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b;
+    vaddl.u8	q12, $0, $5	//q12=src[-2]+src[3]
+    vaddl.u8	q13, $2, $3	//src[0]+src[1]
+    vmla.u16	q12, q13, $7	//q12 += 20*(src[0]+src[1]), 2 cycles
+    vaddl.u8	q13, $1, $4	//src[-1]+src[2]
+    vmls.s16	q12, q13, $8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
+    vqrshrun.s16		$6, q12, #5
+    vaddl.u8	q13, $3, $6
+    vrshrn.u16		$6, q13, #1
+//	}
+.endm
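+
+// Both _AVERAGE_WITH_ variants first form the same half-sample value as
+// FILTER_6TAG_8BITS and then average it with the nearest integer sample,
+// (src[0] + half + 1) >> 1 for _WITH_0 and (src[1] + half + 1) >> 1 for
+// _WITH_1; this is how the quarter-sample positions (the enc_mc_luma_*_xy_*
+// functions below) are derived from the half-sample result.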
+
+.macro	FILTER_6TAG_8BITS_TO_16BITS
+//	{	// input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q,
+    vaddl.u8	$6, $0, $5		//dst_q=src[-2]+src[3]
+    vaddl.u8	q13, $2, $3	//src[0]+src[1]
+    vmla.u16	$6, q13, $7	//dst_q += 20*(src[0]+src[1]), 2 cycles
+    vaddl.u8	q13, $1, $4	//src[-1]+src[2]
+    vmls.s16	$6, q13, $8	//dst_q -= 5*(src[-1]+src[2]), 2 cycles
+//	}
+.endm
+
+.macro	FILTER_3_IN_16BITS_TO_8BITS
+//	{	// input:a, b, c, dst_d;
+    vsub.s16	$0, $0, $1			//a-b
+    vshr.s16	$0, $0, #2			//(a-b)/4
+    vsub.s16	$0, $0, $1			//(a-b)/4-b
+    vadd.s16	$0, $0, $2			//(a-b)/4-b+c
+    vshr.s16	$0, $0, #2			//((a-b)/4-b+c)/4
+    vadd.s16	$0, $0, $2			//((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+    vqrshrun.s16	$3, $0, #6		//(+32)>>6
+//	}
+.endm
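+
+// Here a, b and c are pair sums of the unscaled 16-bit vertical results, so
+// the shift/add chain evaluates (a - 5*b + 20*c)/16 while staying inside
+// 16 bits, and vqrshrun #6 supplies the remaining (+32) >> 6; the combined
+// scaling of 16*64 = 1024 corresponds to the >> 10 normalisation of the
+// two-dimensional half-sample filter.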
+
+.macro	UNPACK_2_16BITS_TO_ABC
+//	{	// input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a
+    vext.16	$4, $0, $1, #2		//src[0]
+    vext.16	$3, $0, $1, #3		//src[1]
+    vadd.s16	$4, $3					//c=src[0]+src[1]
+
+    vext.16	$3, $0, $1, #1		//src[-1]
+    vext.16	$2, $0, $1, #4		//src[2]
+    vadd.s16	$3, $2					//b=src[-1]+src[2]
+
+    vext.16	$2, $0, $1, #5		//src[3]
+    vadd.s16	$2, $0					//a=src[-2]+src[3]
+//	}
+.endm
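+
+// Given eight-plus consecutive 16-bit vertical sums starting at column -2, the
+// vext.16 extractions build, per output column, a = m[-2]+m[3], b = m[-1]+m[2]
+// and c = m[0]+m[1] in $2/$3/$4, ready for FILTER_3_IN_16BITS_TO_8BITS above.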
+
+.macro	UNPACK_1_IN_8x16BITS_TO_8BITS
+//	{	// each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
+    vext.16	$3, $3, $3, #7	// 0x????, [0][1][2][3][4][5],
+    vrev64.16	$1, $1
+    vadd.u16	$2, $1				// C[2+3],B[1+4],A[0+5],
+    vshr.s64	$1, $2, #16
+    vshr.s64	$0, $2, #32		// Output: C $2, B $1, A $0
+
+    vsub.s16	$0, $0, $1			//a-b
+    vshr.s16	$0, $0, #2			//(a-b)/4
+    vsub.s16	$0, $0, $1			//(a-b)/4-b
+    vadd.s16	$0, $0, $2			//(a-b)/4-b+c
+    vshr.s16	$0, $0, #2			//((a-b)/4-b+c)/4
+    vadd.s16	$1, $0, $2			//((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+    vqrshrun.s16	$0, $3, #6		//(+32)>>6
+//	}
+.endm
+#else
+.macro	AVERAGE_TWO_8BITS arg0, arg1,arg2
+//	{	// input:dst_d, src_d A and B; working: q13
+    vaddl.u8	q13, \arg2, \arg1
+    vrshrn.u16		\arg0, q13, #1
+//	}
+.endm
+
+.macro	FILTER_6TAG_8BITS arg0, arg1,arg2, arg3, arg4,arg5, arg6, arg7,arg8
+//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b
+    vaddl.u8	q12, \arg0, \arg5	//q12=src[-2]+src[3]
+    vaddl.u8	q13, \arg2, \arg3	//src[0]+src[1]
+    vmla.u16	q12, q13, \arg7	//q12 += 20*(src[0]+src[1]), 2 cycles
+    vaddl.u8	q13, \arg1, \arg4	//src[-1]+src[2]
+    vmls.s16	q12, q13, \arg8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
+    vqrshrun.s16		\arg6, q12, #5
+//	}
+.endm
+
+.macro	FILTER_SINGLE_TAG_8BITS arg0, arg1,arg2, arg3, arg4,arg5		// used when width is 17 or 9
+//	{	// input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2}
+    vrev64.8	\arg2, \arg0				// X[5][4][3][2][1][0]O
+    vaddl.u8	\arg3, \arg0, \arg2			// each 16bits, *[50][41][32][23][14][05]*
+    vmul.s16	\arg0, \arg2, \arg1			// 0 + 1*[50] - 5*[41] + 20*[32]
+    vpadd.s16	\arg0, \arg0, \arg0
+    vpadd.s16	\arg0, \arg0, \arg0
+    vqrshrun.s16	\arg0, \arg4, #5
+//	}
+.endm
+
+.macro	FILTER_6TAG_8BITS_AVERAGE_WITH_0  arg0, arg1,arg2, arg3, arg4,arg5, arg6, arg7,arg8
+//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d
+    vaddl.u8	q12, \arg0, \arg5	//q12=src[-2]+src[3]
+    vaddl.u8	q13, \arg2, \arg3	//src[0]+src[1]
+    vmla.u16	q12, q13, \arg7	//q12 += 20*(src[0]+src[1]), 2 cycles
+    vaddl.u8	q13, \arg1, \arg4	//src[-1]+src[2]
+    vmls.s16	q12, q13, \arg8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
+    vqrshrun.s16		\arg6, q12, #5
+    vaddl.u8	q13, \arg2, \arg6
+    vrshrn.u16		\arg6, q13, #1
+//	}
+.endm
+
+.macro	FILTER_6TAG_8BITS_AVERAGE_WITH_1 arg0, arg1,arg2, arg3, arg4,arg5, arg6, arg7,arg8
+//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d
+    vaddl.u8	q12, \arg0, \arg5	//q12=src[-2]+src[3]
+    vaddl.u8	q13, \arg2, \arg3	//src[0]+src[1]
+    vmla.u16	q12, q13, \arg7	//q12 += 20*(src[0]+src[1]), 2 cycles
+    vaddl.u8	q13, \arg1, \arg4	//src[-1]+src[2]
+    vmls.s16	q12, q13, \arg8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
+    vqrshrun.s16		\arg6, q12, #5
+    vaddl.u8	q13, \arg3, \arg6
+    vrshrn.u16		\arg6, q13, #1
+//	}
+.endm
+
+.macro	FILTER_6TAG_8BITS_TO_16BITS arg0, arg1,arg2, arg3, arg4,arg5, arg6, arg7,arg8
+//	{	// input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3]
+    vaddl.u8	\arg6, \arg0, \arg5		//dst_q=src[-2]+src[3]
+    vaddl.u8	q13, \arg2, \arg3	//src[0]+src[1]
+    vmla.u16	\arg6, q13, \arg7	//dst_q += 20*(src[0]+src[1]), 2 cycles
+    vaddl.u8	q13, \arg1, \arg4	//src[-1]+src[2]
+    vmls.s16	\arg6, q13, \arg8	//dst_q -= 5*(src[-1]+src[2]), 2 cycles
+//	}
+.endm
+
+.macro	FILTER_3_IN_16BITS_TO_8BITS arg0, arg1,arg2, arg3
+//	{	// input:a, b, c, dst_d;
+    vsub.s16	\arg0, \arg0, \arg1			//a-b
+    vshr.s16	\arg0, \arg0, #2			//(a-b)/4
+    vsub.s16	\arg0, \arg0, \arg1			//(a-b)/4-b
+    vadd.s16	\arg0, \arg0, \arg2			//(a-b)/4-b+c
+    vshr.s16	\arg0, \arg0, #2			//((a-b)/4-b+c)/4
+    vadd.s16	\arg0, \arg0, \arg2			//((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+    vqrshrun.s16	\arg3, \arg0, #6		//(+32)>>6
+//	}
+.endm
+
+.macro	UNPACK_2_16BITS_TO_ABC arg0, arg1,arg2, arg3, arg4
+//	{	// input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5)
+    vext.16	\arg4, \arg0, \arg1, #2		//src[0]
+    vext.16	\arg3, \arg0, \arg1, #3		//src[1]
+    vadd.s16	\arg4, \arg3					//c=src[0]+src[1]
+
+    vext.16	\arg3, \arg0, \arg1, #1		//src[-1]
+    vext.16	\arg2, \arg0, \arg1, #4		//src[2]
+    vadd.s16	\arg3, \arg2					//b=src[-1]+src[2]
+
+    vext.16	\arg2, \arg0, \arg1, #5		//src[3]
+    vadd.s16	\arg2, \arg0					//a=src[-2]+src[3]
+//	}
+.endm
+
+.macro	UNPACK_1_IN_8x16BITS_TO_8BITS arg0, arg1,arg2, arg3
+//	{	// each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
+    vext.16	\arg3, \arg3, \arg3, #7	// 0x????, [0][1][2][3][4][5]
+    vrev64.16	\arg1, \arg1
+    vadd.u16	\arg2, \arg1				// C[2+3],B[1+4],A[0+5]
+    vshr.s64	\arg1, \arg2, #16
+    vshr.s64	\arg0, \arg2, #32		// Output: C \arg2, B \arg1, A \arg0
+
+    vsub.s16	\arg0, \arg0, \arg1			//a-b
+    vshr.s16	\arg0, \arg0, #2			//(a-b)/4
+    vsub.s16	\arg0, \arg0, \arg1			//(a-b)/4-b
+    vadd.s16	\arg0, \arg0, \arg2			//(a-b)/4-b+c
+    vshr.s16	\arg0, \arg0, #2			//((a-b)/4-b+c)/4
+    vadd.s16	\arg1, \arg0, \arg2			//((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+    vqrshrun.s16	\arg0, \arg3, #6		//(+32)>>6
+//	}
+.endm
+#endif
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_h_neon
+	push		{r4}
+	ldr			r4, [sp, #4]
+
+	sub			r0, #2
+	vmov.u16	q14, #0x0014				// 20
+	vshr.u16	q15, q14, #2				// 5
+
+w16_h_mc_luma_loop:
+	vld1.u8	{d0,d1,d2}, [r0], r1	//only use 21(16+5); q0=src[-2]
+	pld			[r0]
+	pld			[r0, #16]
+
+	vext.8		q2, q0, q1, #1		//q2=src[-1]
+	vext.8		q3, q0, q1, #2		//q3=src[0]
+	vext.8		q4, q0, q1, #3		//q4=src[1]
+	vext.8		q5, q0, q1, #4		//q5=src[2]
+	vext.8		q6, q0, q1, #5		//q6=src[3]
+
+	FILTER_6TAG_8BITS 	d0, d4, d6, d8, d10, d12, d2, q14, q15
+
+	FILTER_6TAG_8BITS 	d1, d5, d7, d9, d11, d13, d3, q14, q15
+
+	sub		r4, #1
+	vst1.u8	{d2, d3}, [r2], r3		//write 16Byte
+
+	cmp		r4, #0
+	bne		w16_h_mc_luma_loop
+	pop		{r4}
+WELS_ASM_FUNC_END
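+
+// The mc_luma functions appear to take (src, src_stride, dst, dst_stride) in
+// r0-r3 with the row count as the first stack argument (fetched into r4 or r6
+// after the push).  Each iteration here loads three d registers of source
+// (21 of the 24 bytes are used), forms the five shifted views with vext.8 and
+// stores one filtered 16-byte row.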
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w17_h_neon
+	push		{r4-r5}
+	mov			r4, #20
+	mov			r5, #1
+	sub			r4, r4, r4, lsl #(16-2)
+	lsl			r5, #16
+	ror			r4, #16
+	vmov		d3, r5, r4					// 0x0014FFFB00010000
+
+	sub			r3, #16
+	ldr			r4, [sp, #8]
+
+	sub			r0, #2
+	vmov.u16	q14, #0x0014				// 20
+	vshr.u16	q15, q14, #2				// 5
+
+w17_h_mc_luma_loop:
+	vld1.u8	{d0,d1,d2}, [r0], r1	//only use 22(17+5); q0=src[-2]
+
+	vext.8		q2, q0, q1, #1		//q2=src[-1]
+	vext.8		q3, q0, q1, #2		//q3=src[0]
+	vext.8		q4, q0, q1, #3		//q4=src[1]
+	vext.8		q5, q0, q1, #4		//q5=src[2]
+	vext.8		q6, q0, q1, #5		//q6=src[3]
+
+	FILTER_6TAG_8BITS 	d0, d4, d6, d8, d10, d12, d14, q14, q15
+
+	FILTER_6TAG_8BITS 	d1, d5, d7, d9, d11, d13, d15, q14, q15
+
+	vst1.u8	{d14, d15}, [r2]!		//write [0:15] Byte
+
+	vsli.64	d2, d2, #8				// [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X
+	FILTER_SINGLE_TAG_8BITS	d2, d3, d14, q7, q1
+
+	vst1.u8	{d2[0]}, [r2], r3		//write 16th Byte
+
+	sub		r4, #1
+	cmp		r4, #0
+	bne		w17_h_mc_luma_loop
+	pop		{r4-r5}
+WELS_ASM_FUNC_END
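+
+// The width-17 variant writes 16 bytes with a post-increment store plus the
+// 17th byte separately, hence the dst_stride adjustment (sub r3, #16).  The
+// integer preamble builds d3 = 0x0014FFFB00010000 without a literal pool:
+// 20 - (20 << 14) = 0xFFFB0014, rotated by 16 into 0x0014FFFB and paired with
+// 0x00010000, giving the 16-bit lanes {0, 1, -5, 20} consumed by
+// FILTER_SINGLE_TAG_8BITS for the last column.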
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w9_h_neon
+	push		{r4-r5}
+	mov			r4, #20
+	mov			r5, #1
+	sub			r4, r4, r4, lsl #(16-2)
+	lsl			r5, #16
+	ror			r4, #16
+	vmov		d7, r5, r4					// 0x0014FFFB00010000
+
+	sub			r3, #8
+	ldr			r4, [sp, #8]
+
+	sub			r0, #2
+	vmov.u16	q14, #0x0014				// 20
+	vshr.u16	q15, q14, #2				// 5
+
+w9_h_mc_luma_loop:
+	vld1.u8	{d0,d1}, [r0], r1	//only use 14(9+5); q0=src[-2]
+	pld			[r0]
+
+	vext.8		d2, d0, d1, #1		//d2=src[-1]
+	vext.8		d3, d0, d1, #2		//d3=src[0]
+	vext.8		d4, d0, d1, #3		//d4=src[1]
+	vext.8		d5, d0, d1, #4		//d5=src[2]
+	vext.8		d6, d0, d1, #5		//d6=src[3]
+
+	FILTER_6TAG_8BITS 	d0, d2, d3, d4, d5, d6, d8, q14, q15
+
+	sub		r4, #1
+	vst1.u8	{d8}, [r2]!		//write [0:7] Byte
+
+	vsli.64	d2, d1, #8				// [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X
+	FILTER_SINGLE_TAG_8BITS	d2, d7, d14, q7, q1
+	vst1.u8	{d2[0]}, [r2], r3		//write 8th Byte
+
+	cmp		r4, #0
+	bne		w9_h_mc_luma_loop
+	pop		{r4-r5}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_h_neon
+	push		{r4, r5, r6}
+	ldr			r6, [sp, #12]
+
+	sub			r0, #2
+	vmov.u16	q14, #0x0014				// 20
+	vshr.u16	q15, q14, #2				// 5
+
+w4_h_mc_luma_loop:
+	vld1.u8	{d0, d1}, [r0], r1	//only use 9(4+5);d0: 1st row src[-2:5]
+	pld			[r0]
+	vld1.u8	{d2, d3}, [r0], r1	//d2: 2nd row src[-2:5]
+	pld			[r0]
+
+	vext.8		d4, d0, d1, #1		//d4: 1st row src[-1:6]
+	vext.8		d5, d2, d3, #1		//d5: 2nd row src[-1:6]
+	vext.8		q3, q2, q2, #1		//src[0:6 *]
+	vext.8		q4, q2, q2, #2		//src[1:6 * *]
+
+	vtrn.32	q3, q4					//q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
+	vtrn.32	d6, d7					//d6:[0:3]; d7[1:4]
+	vtrn.32		d0, d2				//d0:[-2:1]; d2[2:5]
+	vtrn.32		d4, d5				//d4:[-1:2]; d5[3:6]
+
+	FILTER_6TAG_8BITS 	d0, d4, d6, d7, d2, d5, d1, q14, q15
+
+	vmov		r4, r5, d1
+	str	r4, [r2], r3
+	str	r5, [r2], r3
+
+	sub		r6, #2
+	cmp		r6, #0
+	bne		w4_h_mc_luma_loop
+
+	pop		{r4, r5, r6}
+WELS_ASM_FUNC_END
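+
+// The width-4 path filters two rows per iteration: the vtrn.32 shuffles pack
+// 4-pixel groups of both rows into single d registers, so one
+// FILTER_6TAG_8BITS call yields eight result bytes that are split into two
+// 4-byte stores through r4/r5.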
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_xy_10_neon
+	push		{r4}
+	ldr			r4, [sp, #4]
+
+	sub			r0, #2
+	vmov.u16	q14, #0x0014				// 20
+	vshr.u16	q15, q14, #2				// 5
+
+w16_xy_10_mc_luma_loop:
+	vld1.u8	{d0,d1,d2}, [r0], r1	//only use 21(16+5); q0=src[-2]
+	pld			[r0]
+	pld			[r0, #16]
+
+	vext.8		q2, q0, q1, #1		//q2=src[-1]
+	vext.8		q3, q0, q1, #2		//q3=src[0]
+	vext.8		q4, q0, q1, #3		//q4=src[1]
+	vext.8		q5, q0, q1, #4		//q5=src[2]
+	vext.8		q6, q0, q1, #5		//q6=src[3]
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d0, d4, d6, d8, d10, d12, d2, q14, q15
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d1, d5, d7, d9, d11, d13, d3, q14, q15
+
+	sub		r4, #1
+	vst1.u8	{d2, d3}, [r2], r3		//write 16Byte
+
+	cmp		r4, #0
+	bne		w16_xy_10_mc_luma_loop
+	pop		{r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w8_xy_10_neon
+	push		{r4}
+	ldr			r4, [sp, #4]
+
+	sub			r0, #2
+	vmov.u16	q14, #0x0014				// 20
+	vshr.u16	q15, q14, #2				// 5
+
+w8_xy_10_mc_luma_loop:
+	vld1.u8	{d0,d1}, [r0], r1	//only use 13(8+5); q0=src[-2]
+	pld			[r0]
+
+	vext.8		d2, d0, d1, #1		//d2=src[-1]
+	vext.8		d3, d0, d1, #2		//d3=src[0]
+	vext.8		d4, d0, d1, #3		//d4=src[1]
+	vext.8		d5, d0, d1, #4		//d5=src[2]
+	vext.8		d6, d0, d1, #5		//d6=src[3]
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d0, d2, d3, d4, d5, d6, d1, q14, q15
+
+	sub		r4, #1
+	vst1.u8	{d1}, [r2], r3
+
+	cmp		r4, #0
+	bne		w8_xy_10_mc_luma_loop
+	pop		{r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_xy_10_neon
+	push		{r4, r5, r6}
+	ldr			r6, [sp, #12]
+
+	sub			r0, #2
+	vmov.u16	q14, #0x0014				// 20
+	vshr.u16	q15, q14, #2				// 5
+
+w4_xy_10_mc_luma_loop:
+	vld1.u8	{d0, d1}, [r0], r1	//only use 9(4+5);d0: 1st row src[-2:5]
+	pld			[r0]
+	vld1.u8	{d2, d3}, [r0], r1	//d2: 2nd row src[-2:5]
+	pld			[r0]
+
+	vext.8		d4, d0, d1, #1		//d4: 1st row src[-1:6]
+	vext.8		d5, d2, d3, #1		//d5: 2nd row src[-1:6]
+	vext.8		q3, q2, q2, #1		//src[0:6 *]
+	vext.8		q4, q2, q2, #2		//src[1:6 * *]
+
+	vtrn.32	q3, q4					//q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
+	vtrn.32	d6, d7					//d6:[0:3]; d7[1:4]
+	vtrn.32		d0, d2				//d0:[-2:1]; d2[2:5]
+	vtrn.32		d4, d5				//d4:[-1:2]; d5[3:6]
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d0, d4, d6, d7, d2, d5, d1, q14, q15
+
+	vmov		r4, r5, d1
+	str	r4, [r2], r3
+	str	r5, [r2], r3
+
+	sub		r6, #2
+	cmp		r6, #0
+	bne		w4_xy_10_mc_luma_loop
+
+	pop		{r4, r5, r6}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_xy_30_neon
+	push		{r4}
+	ldr			r4, [sp, #4]
+
+	sub			r0, #2
+	vmov.u16	q14, #0x0014				// 20
+	vshr.u16	q15, q14, #2				// 5
+
+w16_xy_30_mc_luma_loop:
+	vld1.u8	{d0,d1,d2}, [r0], r1	//only use 21(16+5); q0=src[-2]
+	pld			[r0]
+	pld			[r0, #16]
+
+	vext.8		q2, q0, q1, #1		//q2=src[-1]
+	vext.8		q3, q0, q1, #2		//q3=src[0]
+	vext.8		q4, q0, q1, #3		//q4=src[1]
+	vext.8		q5, q0, q1, #4		//q5=src[2]
+	vext.8		q6, q0, q1, #5		//q6=src[3]
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d0, d4, d6, d8, d10, d12, d2, q14, q15
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d1, d5, d7, d9, d11, d13, d3, q14, q15
+
+	sub		r4, #1
+	vst1.u8	{d2, d3}, [r2], r3		//write 16Byte
+
+	cmp		r4, #0
+	bne		w16_xy_30_mc_luma_loop
+	pop		{r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w8_xy_30_neon
+	push		{r4}
+	ldr			r4, [sp, #4]
+
+	sub			r0, #2
+	vmov.u16	q14, #0x0014				// 20
+	vshr.u16	q15, q14, #2				// 5
+
+w8_xy_30_mc_luma_loop:
+	vld1.u8	{d0,d1}, [r0], r1	//only use 13(8+5); q0=src[-2]
+	pld			[r0]
+
+	vext.8		d2, d0, d1, #1		//d2=src[-1]
+	vext.8		d3, d0, d1, #2		//d3=src[0]
+	vext.8		d4, d0, d1, #3		//d4=src[1]
+	vext.8		d5, d0, d1, #4		//d5=src[2]
+	vext.8		d6, d0, d1, #5		//d6=src[3]
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d0, d2, d3, d4, d5, d6, d1, q14, q15
+
+	sub		r4, #1
+	vst1.u8	{d1}, [r2], r3
+
+	cmp		r4, #0
+	bne		w8_xy_30_mc_luma_loop
+	pop		{r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_xy_30_neon
+	push		{r4, r5, r6}
+	ldr			r6, [sp, #12]
+
+	sub			r0, #2
+	vmov.u16	q14, #0x0014				// 20
+	vshr.u16	q15, q14, #2				// 5
+
+w4_xy_30_mc_luma_loop:
+	vld1.u8	{d0, d1}, [r0], r1	//only use 9(4+5);d0: 1st row src[-2:5]
+	pld			[r0]
+	vld1.u8	{d2, d3}, [r0], r1	//d2: 2nd row src[-2:5]
+	pld			[r0]
+
+	vext.8		d4, d0, d1, #1		//d4: 1st row src[-1:6]
+	vext.8		d5, d2, d3, #1		//d5: 2nd row src[-1:6]
+	vext.8		q3, q2, q2, #1		//src[0:6 *]
+	vext.8		q4, q2, q2, #2		//src[1:6 * *]
+
+	vtrn.32	q3, q4					//q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
+	vtrn.32	d6, d7					//d6:[0:3]; d7[1:4]
+	vtrn.32		d0, d2				//d0:[-2:1]; d2[2:5]
+	vtrn.32		d4, d5				//d4:[-1:2]; d5[3:6]
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d0, d4, d6, d7, d2, d5, d1, q14, q15
+
+	vmov		r4, r5, d1
+	str	r4, [r2], r3
+	str	r5, [r2], r3
+
+	sub		r6, #2
+	cmp		r6, #0
+	bne		w4_xy_30_mc_luma_loop
+
+	pop		{r4, r5, r6}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_xy_01_neon
+	push		{r4}
+	ldr			r4, [sp, #4]
+
+	sub			r0, r1, lsl #1		//src[-2*src_stride]
+	pld			[r0]
+	pld			[r0, r1]
+	vmov.u16	q14, #0x0014			// 20
+	vld1.u8	{q0}, [r0], r1		//q0=src[-2]
+	vld1.u8	{q1}, [r0], r1		//q1=src[-1]
+
+	pld			[r0]
+	pld			[r0, r1]
+	vshr.u16	q15, q14, #2			// 5
+	vld1.u8	{q2}, [r0], r1		//q2=src[0]
+	vld1.u8	{q3}, [r0], r1		//q3=src[1]
+	vld1.u8	{q4}, [r0], r1		//q4=src[2]
+
+w16_xy_01_luma_loop:
+
+	vld1.u8	{q5}, [r0], r1		//q5=src[3]
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d0, d2, d4, d6, d8, d10, d12, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d1, d3, d5, d7, d9, d11, d13, q14, q15
+	vld1.u8	{q0}, [r0], r1		//read 2nd row
+	vst1.u8	{q6}, [r2], r3			//write 1st 16Byte
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d2, d4, d6, d8, d10, d0, d12, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d3, d5, d7, d9, d11, d1, d13, q14, q15
+	vld1.u8	{q1}, [r0], r1		//read 3rd row
+	vst1.u8	{q6}, [r2], r3			//write 2nd 16Byte
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d4, d6, d8, d10, d0, d2, d12, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d5, d7, d9, d11, d1, d3, d13, q14, q15
+	vld1.u8	{q2}, [r0], r1		//read 4th row
+	vst1.u8	{q6}, [r2], r3			//write 3rd 16Byte
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d6, d8, d10, d0, d2, d4, d12, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d7, d9, d11, d1, d3, d5, d13, q14, q15
+	vld1.u8	{q3}, [r0], r1		//read 5th row
+	vst1.u8	{q6}, [r2], r3			//write 4th 16Byte
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d8, d10, d0, d2, d4, d6, d12, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d9, d11, d1, d3, d5, d7, d13, q14, q15
+	vld1.u8	{q4}, [r0], r1		//read 6th row
+	vst1.u8	{q6}, [r2], r3			//write 5th 16Byte
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d10, d0, d2, d4, d6, d8, d12, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d11, d1, d3, d5, d7, d9, d13, q14, q15
+	vld1.u8	{q5}, [r0], r1		//read 7th row
+	vst1.u8	{q6}, [r2], r3			//write 6th 16Byte
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d0, d2, d4, d6, d8, d10, d12, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d1, d3, d5, d7, d9, d11, d13, q14, q15
+	vld1.u8	{q0}, [r0], r1		//read 8th row
+	vst1.u8	{q6}, [r2], r3			//write 7th 16Byte
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d2, d4, d6, d8, d10, d0, d12, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d3, d5, d7, d9, d11, d1, d13, q14, q15
+	vst1.u8	{q6}, [r2], r3			//write 8th 16Byte
+
+	//q2, q3, q4, q5, q0 --> q0~q4
+	vswp	q0, q4
+	vswp	q0, q2
+	vmov	q1, q3
+	vmov	q3, q5						//q0~q4
+
+	sub		r4, #8
+	cmp		r4, #0
+	bne		w16_xy_01_luma_loop
+	pop		{r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w8_xy_01_neon
+	push		{r4}
+	ldr			r4, [sp, #4]
+
+	sub			r0, r1, lsl #1		//src[-2*src_stride]
+	pld			[r0]
+	pld			[r0, r1]
+	vmov.u16	q14, #0x0014			// 20
+	vld1.u8	{d0}, [r0], r1		//d0=src[-2]
+	vld1.u8	{d1}, [r0], r1		//d1=src[-1]
+
+	pld			[r0]
+	pld			[r0, r1]
+	vshr.u16	q15, q14, #2			// 5
+	vld1.u8	{d2}, [r0], r1		//d2=src[0]
+	vld1.u8	{d3}, [r0], r1		//d3=src[1]
+
+	vld1.u8	{d4}, [r0], r1		//d4=src[2]
+	vld1.u8	{d5}, [r0], r1		//d5=src[3]
+
+w8_xy_01_mc_luma_loop:
+
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d0, d1, d2, d3, d4, d5, d12, q14, q15
+	vld1.u8	{d0}, [r0], r1		//read 2nd row
+	vst1.u8	{d12}, [r2], r3		//write 1st 8Byte
+
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d1, d2, d3, d4, d5, d0, d12, q14, q15
+	vld1.u8	{d1}, [r0], r1		//read 3rd row
+	vst1.u8	{d12}, [r2], r3		//write 2nd 8Byte
+
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d2, d3, d4, d5, d0, d1, d12, q14, q15
+	vld1.u8	{d2}, [r0], r1		//read 4th row
+	vst1.u8	{d12}, [r2], r3		//write 3rd 8Byte
+
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d3, d4, d5, d0, d1, d2, d12, q14, q15
+	vld1.u8	{d3}, [r0], r1		//read 5th row
+	vst1.u8	{d12}, [r2], r3		//write 4th 8Byte
+
+	//d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
+	vswp	q0, q2
+	vswp	q1, q2
+
+	sub		r4, #4
+	cmp		r4, #0
+	bne		w8_xy_01_mc_luma_loop
+
+	pop		{r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_xy_01_neon
+	push		{r4, r5, r6, r7}
+	sub			r0, r1, lsl #1		//src[-2*src_stride]
+	pld			[r0]
+	pld			[r0, r1]
+	vmov.u16	q14, #0x0014			// 20
+	ldr		r4, [r0], r1		//r4=src[-2]
+	ldr		r5, [r0], r1		//r5=src[-1]
+
+	pld			[r0]
+	pld			[r0, r1]
+	vshr.u16	q15, q14, #2			// 5
+	ldr		r6, [r0], r1		//r6=src[0]
+	ldr		r7, [r0], r1		//r7=src[1]
+
+	vmov		d0, r4, r5
+	vmov		d1, r5, r6
+	vmov		d2, r6, r7
+
+	ldr		r4, [r0], r1		//r4=src[2]
+	vmov		d3, r7, r4
+	ldr			r7, [sp, #16]
+
+w4_xy_01_mc_luma_loop:
+
+	//reuse r4, which still holds the word kept from the setup/previous iteration

+	ldr		r5, [r0], r1		//r5=src[3]
+	ldr		r6, [r0], r1		//r6=src[0]
+	vmov		d4, r4, r5
+	vmov		d5, r5, r6			//reserved r6
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d0, d1, d2, d3, d4, d5, d12, q14, q15
+	vmov		r4, r5, d12
+	str	r4, [r2], r3			//write 1st 4Byte
+	str	r5, [r2], r3			//write 2nd 4Byte
+
+	ldr		r5, [r0], r1		//r5=src[1]
+	ldr		r4, [r0], r1		//r4=src[2]
+	vmov		d0, r6, r5
+	vmov		d1, r5, r4			//reserved r4
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d2, d3, d4, d5, d0, d1, d12, q14, q15
+	vmov		r5, r6, d12
+	str	r5, [r2], r3			//write 3rd 4Byte
+	str	r6, [r2], r3			//write 4th 4Byte
+
+	//d4, d5, d0, d1 --> d0, d1, d2, d3
+	vmov	q1, q0
+	vmov	q0, q2
+
+	sub		r7, #4
+	cmp		r7, #0
+	bne		w4_xy_01_mc_luma_loop
+
+	pop		{r4, r5, r6, r7}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_xy_03_neon
+	push		{r4}
+	ldr			r4, [sp, #4]
+
+	sub			r0, r1, lsl #1		//src[-2*src_stride]
+	pld			[r0]
+	pld			[r0, r1]
+	vmov.u16	q14, #0x0014			// 20
+	vld1.u8	{q0}, [r0], r1		//q0=src[-2]
+	vld1.u8	{q1}, [r0], r1		//q1=src[-1]
+
+	pld			[r0]
+	pld			[r0, r1]
+	vshr.u16	q15, q14, #2			// 5
+	vld1.u8	{q2}, [r0], r1		//q2=src[0]
+	vld1.u8	{q3}, [r0], r1		//q3=src[1]
+	vld1.u8	{q4}, [r0], r1		//q4=src[2]
+
+w16_xy_03_luma_loop:
+
+	vld1.u8	{q5}, [r0], r1		//q5=src[3]
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d0, d2, d4, d6, d8, d10, d12, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d1, d3, d5, d7, d9, d11, d13, q14, q15
+	vld1.u8	{q0}, [r0], r1		//read 2nd row
+	vst1.u8	{q6}, [r2], r3			//write 1st 16Byte
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d2, d4, d6, d8, d10, d0, d12, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d3, d5, d7, d9, d11, d1, d13, q14, q15
+	vld1.u8	{q1}, [r0], r1		//read 3rd row
+	vst1.u8	{q6}, [r2], r3			//write 2nd 16Byte
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d4, d6, d8, d10, d0, d2, d12, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d5, d7, d9, d11, d1, d3, d13, q14, q15
+	vld1.u8	{q2}, [r0], r1		//read 4th row
+	vst1.u8	{q6}, [r2], r3			//write 3rd 16Byte
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d6, d8, d10, d0, d2, d4, d12, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d7, d9, d11, d1, d3, d5, d13, q14, q15
+	vld1.u8	{q3}, [r0], r1		//read 5th row
+	vst1.u8	{q6}, [r2], r3			//write 4th 16Byte
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d8, d10, d0, d2, d4, d6, d12, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d9, d11, d1, d3, d5, d7, d13, q14, q15
+	vld1.u8	{q4}, [r0], r1		//read 6th row
+	vst1.u8	{q6}, [r2], r3			//write 5th 16Byte
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d10, d0, d2, d4, d6, d8, d12, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d11, d1, d3, d5, d7, d9, d13, q14, q15
+	vld1.u8	{q5}, [r0], r1		//read 7th row
+	vst1.u8	{q6}, [r2], r3			//write 6th 16Byte
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d0, d2, d4, d6, d8, d10, d12, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d1, d3, d5, d7, d9, d11, d13, q14, q15
+	vld1.u8	{q0}, [r0], r1		//read 8th row
+	vst1.u8	{q6}, [r2], r3			//write 7th 16Byte
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d2, d4, d6, d8, d10, d0, d12, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d3, d5, d7, d9, d11, d1, d13, q14, q15
+	vst1.u8	{q6}, [r2], r3			//write 8th 16Byte
+
+	//q2, q3, q4, q5, q0 --> q0~q4
+	vswp	q0, q4
+	vswp	q0, q2
+	vmov	q1, q3
+	vmov	q3, q5						//q0~q4
+
+	sub		r4, #8
+	cmp		r4, #0
+	bne		w16_xy_03_luma_loop
+	pop		{r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w8_xy_03_neon
+	push		{r4}
+	ldr			r4, [sp, #4]
+
+	sub			r0, r1, lsl #1		//src[-2*src_stride]
+	pld			[r0]
+	pld			[r0, r1]
+	vmov.u16	q14, #0x0014			// 20
+	vld1.u8	{d0}, [r0], r1		//d0=src[-2]
+	vld1.u8	{d1}, [r0], r1		//d1=src[-1]
+
+	pld			[r0]
+	pld			[r0, r1]
+	vshr.u16	q15, q14, #2			// 5
+	vld1.u8	{d2}, [r0], r1		//d2=src[0]
+	vld1.u8	{d3}, [r0], r1		//d3=src[1]
+
+	vld1.u8	{d4}, [r0], r1		//d4=src[2]
+	vld1.u8	{d5}, [r0], r1		//d5=src[3]
+
+w8_xy_03_mc_luma_loop:
+
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d0, d1, d2, d3, d4, d5, d12, q14, q15
+	vld1.u8	{d0}, [r0], r1		//read 2nd row
+	vst1.u8	{d12}, [r2], r3		//write 1st 8Byte
+
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d1, d2, d3, d4, d5, d0, d12, q14, q15
+	vld1.u8	{d1}, [r0], r1		//read 3rd row
+	vst1.u8	{d12}, [r2], r3		//write 2nd 8Byte
+
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d2, d3, d4, d5, d0, d1, d12, q14, q15
+	vld1.u8	{d2}, [r0], r1		//read 4th row
+	vst1.u8	{d12}, [r2], r3		//write 3rd 8Byte
+
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d3, d4, d5, d0, d1, d2, d12, q14, q15
+	vld1.u8	{d3}, [r0], r1		//read 5th row
+	vst1.u8	{d12}, [r2], r3		//write 4th 8Byte
+
+	//d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
+	vswp	q0, q2
+	vswp	q1, q2
+
+	sub		r4, #4
+	cmp		r4, #0
+	bne		w8_xy_03_mc_luma_loop
+
+	pop		{r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_xy_03_neon
+	push		{r4, r5, r6, r7}
+	sub			r0, r1, lsl #1		//src[-2*src_stride]
+	pld			[r0]
+	pld			[r0, r1]
+	vmov.u16	q14, #0x0014			// 20
+	ldr		r4, [r0], r1		//r4=src[-2]
+	ldr		r5, [r0], r1		//r5=src[-1]
+
+	pld			[r0]
+	pld			[r0, r1]
+	vshr.u16	q15, q14, #2			// 5
+	ldr		r6, [r0], r1		//r6=src[0]
+	ldr		r7, [r0], r1		//r7=src[1]
+
+	vmov		d0, r4, r5
+	vmov		d1, r5, r6
+	vmov		d2, r6, r7
+
+	ldr		r4, [r0], r1		//r4=src[2]
+	vmov		d3, r7, r4
+	ldr			r7, [sp, #16]
+
+w4_xy_03_mc_luma_loop:
+
+	//reuse r4, which still holds the word kept from the setup/previous iteration
+	ldr		r5, [r0], r1		//r5=src[3]
+	ldr		r6, [r0], r1		//r6=src[0]
+	vmov		d4, r4, r5
+	vmov		d5, r5, r6			//reserved r6
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d0, d1, d2, d3, d4, d5, d12, q14, q15
+	vmov		r4, r5, d12
+	str	r4, [r2], r3			//write 1st 4Byte
+	str	r5, [r2], r3			//write 2nd 4Byte
+
+	ldr		r5, [r0], r1		//r5=src[1]
+	ldr		r4, [r0], r1		//r4=src[2]
+	vmov		d0, r6, r5
+	vmov		d1, r5, r4			//reserved r4
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d2, d3, d4, d5, d0, d1, d12, q14, q15
+	vmov		r5, r6, d12
+	str	r5, [r2], r3			//write 3rd 4Byte
+	str	r6, [r2], r3			//write 4th 4Byte
+
+	//d4, d5, d0, d1 --> d0, d1, d2, d3
+	vmov	q1, q0
+	vmov	q0, q2
+
+	sub		r7, #4
+	cmp		r7, #0
+	bne		w4_xy_03_mc_luma_loop
+
+	pop		{r4, r5, r6, r7}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_v_neon
+	push		{r4}
+	ldr			r4, [sp, #4]
+
+	sub			r0, r1, lsl #1		//src[-2*src_stride]
+	pld			[r0]
+	pld			[r0, r1]
+	vmov.u16	q14, #0x0014			// 20
+	vld1.u8	{q0}, [r0], r1		//q0=src[-2]
+	vld1.u8	{q1}, [r0], r1		//q1=src[-1]
+
+	pld			[r0]
+	pld			[r0, r1]
+	vshr.u16	q15, q14, #2			// 5
+	vld1.u8	{q2}, [r0], r1		//q2=src[0]
+	vld1.u8	{q3}, [r0], r1		//q3=src[1]
+	vld1.u8	{q4}, [r0], r1		//q4=src[2]
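+	// Five source rows (src[-2]..src[2]) stay resident in q0-q4; each pass of the loop below
+	// loads one new row, filters vertically, and rotates the registers, emitting eight output
+	// rows per iteration.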
+
+w16_v_mc_luma_loop:
+
+	vld1.u8	{q5}, [r0], r1		//q5=src[3]
+
+	FILTER_6TAG_8BITS 	d0, d2, d4, d6, d8, d10, d12, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d1, d3, d5, d7, d9, d11, d13, q14, q15
+	vld1.u8	{q0}, [r0], r1		//read 2nd row
+	vst1.u8	{q6}, [r2], r3			//write 1st 16Byte
+
+	FILTER_6TAG_8BITS 	d2, d4, d6, d8, d10, d0, d12, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d3, d5, d7, d9, d11, d1, d13, q14, q15
+	vld1.u8	{q1}, [r0], r1		//read 3rd row
+	vst1.u8	{q6}, [r2], r3			//write 2nd 16Byte
+
+	FILTER_6TAG_8BITS 	d4, d6, d8, d10, d0, d2, d12, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d5, d7, d9, d11, d1, d3, d13, q14, q15
+	vld1.u8	{q2}, [r0], r1		//read 4th row
+	vst1.u8	{q6}, [r2], r3			//write 3rd 16Byte
+
+	FILTER_6TAG_8BITS 	d6, d8, d10, d0, d2, d4, d12, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d7, d9, d11, d1, d3, d5, d13, q14, q15
+	vld1.u8	{q3}, [r0], r1		//read 5th row
+	vst1.u8	{q6}, [r2], r3			//write 4th 16Byte
+
+	FILTER_6TAG_8BITS 	d8, d10, d0, d2, d4, d6, d12, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d9, d11, d1, d3, d5, d7, d13, q14, q15
+	vld1.u8	{q4}, [r0], r1		//read 6th row
+	vst1.u8	{q6}, [r2], r3			//write 5th 16Byte
+
+	FILTER_6TAG_8BITS 	d10, d0, d2, d4, d6, d8, d12, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d11, d1, d3, d5, d7, d9, d13, q14, q15
+	vld1.u8	{q5}, [r0], r1		//read 7th row
+	vst1.u8	{q6}, [r2], r3			//write 6th 16Byte
+
+	FILTER_6TAG_8BITS 	d0, d2, d4, d6, d8, d10, d12, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d1, d3, d5, d7, d9, d11, d13, q14, q15
+	vld1.u8	{q0}, [r0], r1		//read 8th row
+	vst1.u8	{q6}, [r2], r3			//write 7th 16Byte
+
+	FILTER_6TAG_8BITS 	d2, d4, d6, d8, d10, d0, d12, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d3, d5, d7, d9, d11, d1, d13, q14, q15
+	vst1.u8	{q6}, [r2], r3			//write 8th 16Byte
+
+	//q2, q3, q4, q5, q0 --> q0~q4
+	vswp	q0, q4
+	vswp	q0, q2
+	vmov	q1, q3
+	vmov	q3, q5						//q0~q4
+
+	sub		r4, #8
+	cmp		r4, #0
+	bne		w16_v_mc_luma_loop
+	pop		{r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w17_v_neon
+	push		{r4}
+	ldr			r4, [sp, #4]
+
+	sub			r0, r1, lsl #1		//src[-2*src_stride]
+	pld			[r0]
+	pld			[r0, r1]
+	vmov.u16	q14, #0x0014			// 20
+	vld1.u8	{q0}, [r0], r1		//q0=src[-2]
+	vld1.u8	{q1}, [r0], r1		//q1=src[-1]
+
+	pld			[r0]
+	pld			[r0, r1]
+	vshr.u16	q15, q14, #2			// 5
+	vld1.u8	{q2}, [r0], r1		//q2=src[0]
+	vld1.u8	{q3}, [r0], r1		//q3=src[1]
+	vld1.u8	{q4}, [r0], r1		//q4=src[2]
+
+w17_v_mc_luma_loop:
+
+	vld1.u8	{q5}, [r0], r1		//q5=src[3]
+
+	FILTER_6TAG_8BITS 	d0, d2, d4, d6, d8, d10, d12, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d1, d3, d5, d7, d9, d11, d13, q14, q15
+	vld1.u8	{q0}, [r0], r1		//read 2nd row
+	vst1.u8	{q6}, [r2], r3			//write 1st 16Byte
+
+	FILTER_6TAG_8BITS 	d2, d4, d6, d8, d10, d0, d12, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d3, d5, d7, d9, d11, d1, d13, q14, q15
+	vld1.u8	{q1}, [r0], r1		//read 3rd row
+	vst1.u8	{q6}, [r2], r3			//write 2nd 16Byte
+
+	FILTER_6TAG_8BITS 	d4, d6, d8, d10, d0, d2, d12, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d5, d7, d9, d11, d1, d3, d13, q14, q15
+	vld1.u8	{q2}, [r0], r1		//read 4th row
+	vst1.u8	{q6}, [r2], r3			//write 3rd 16Byte
+
+	FILTER_6TAG_8BITS 	d6, d8, d10, d0, d2, d4, d12, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d7, d9, d11, d1, d3, d5, d13, q14, q15
+	vld1.u8	{q3}, [r0], r1		//read 5th row
+	vst1.u8	{q6}, [r2], r3			//write 4th 16Byte
+
+	FILTER_6TAG_8BITS 	d8, d10, d0, d2, d4, d6, d12, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d9, d11, d1, d3, d5, d7, d13, q14, q15
+	vld1.u8	{q4}, [r0], r1		//read 6th row
+	vst1.u8	{q6}, [r2], r3			//write 5th 16Byte
+
+	FILTER_6TAG_8BITS 	d10, d0, d2, d4, d6, d8, d12, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d11, d1, d3, d5, d7, d9, d13, q14, q15
+	vld1.u8	{q5}, [r0], r1		//read 7th row
+	vst1.u8	{q6}, [r2], r3			//write 6th 16Byte
+
+	FILTER_6TAG_8BITS 	d0, d2, d4, d6, d8, d10, d12, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d1, d3, d5, d7, d9, d11, d13, q14, q15
+	vld1.u8	{q0}, [r0], r1		//read 8th row
+	vst1.u8	{q6}, [r2], r3			//write 7th 16Byte
+
+	FILTER_6TAG_8BITS 	d2, d4, d6, d8, d10, d0, d12, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d3, d5, d7, d9, d11, d1, d13, q14, q15
+	vst1.u8	{q6}, [r2], r3			//write 8th 16Byte
+
+	//q2, q3, q4, q5, q0 --> q0~q4
+	vswp	q0, q4
+	vswp	q0, q2
+	vmov	q1, q3
+	vmov	q3, q5						//q0~q4
+
+	sub		r4, #8
+	cmp		r4, #1
+	bne		w17_v_mc_luma_loop
+	// the last 16Bytes
+	vld1.u8	{q5}, [r0], r1		//q5=src[3]
+	FILTER_6TAG_8BITS 	d0, d2, d4, d6, d8, d10, d12, q14, q15
+	FILTER_6TAG_8BITS 	d1, d3, d5, d7, d9, d11, d13, q14, q15
+	vst1.u8	{q6}, [r2], r3			//write 1st 16Byte
+
+	pop		{r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w9_v_neon
+	push		{r4}
+	ldr			r4, [sp, #4]
+
+	sub			r0, r1, lsl #1		//src[-2*src_stride]
+	pld			[r0]
+	pld			[r0, r1]
+	vmov.u16	q14, #0x0014			// 20
+	vld1.u8	{d0}, [r0], r1		//d0=src[-2]
+	vld1.u8	{d1}, [r0], r1		//d1=src[-1]
+
+	pld			[r0]
+	pld			[r0, r1]
+	vshr.u16	q15, q14, #2			// 5
+	vld1.u8	{d2}, [r0], r1		//d2=src[0]
+	vld1.u8	{d3}, [r0], r1		//d3=src[1]
+
+	vld1.u8	{d4}, [r0], r1		//d4=src[2]
+	vld1.u8	{d5}, [r0], r1		//d5=src[3]
+
+w9_v_mc_luma_loop:
+
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d0, d1, d2, d3, d4, d5, d12, q14, q15
+	vld1.u8	{d0}, [r0], r1		//read 2nd row
+	vst1.u8	{d12}, [r2], r3		//write 1st 8Byte
+
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d1, d2, d3, d4, d5, d0, d12, q14, q15
+	vld1.u8	{d1}, [r0], r1		//read 3rd row
+	vst1.u8	{d12}, [r2], r3		//write 2nd 8Byte
+
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d2, d3, d4, d5, d0, d1, d12, q14, q15
+	vld1.u8	{d2}, [r0], r1		//read 4th row
+	vst1.u8	{d12}, [r2], r3		//write 3rd 8Byte
+
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d3, d4, d5, d0, d1, d2, d12, q14, q15
+	vld1.u8	{d3}, [r0], r1		//read 5th row
+	vst1.u8	{d12}, [r2], r3		//write 4th 8Byte
+
+	//d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
+	vswp	q0, q2
+	vswp	q1, q2
+
+	sub		r4, #4
+	cmp		r4, #1
+	bne		w9_v_mc_luma_loop
+
+	FILTER_6TAG_8BITS 	d0, d1, d2, d3, d4, d5, d12, q14, q15
+	vst1.u8	{d12}, [r2], r3		//write last 8Byte
+
+	pop		{r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_v_neon
+	push		{r4, r5, r6, r7}
+	sub			r0, r1, lsl #1		//src[-2*src_stride]
+	pld			[r0]
+	pld			[r0, r1]
+	vmov.u16	q14, #0x0014			// 20
+	ldr		r4, [r0], r1		//r4=src[-2]
+	ldr		r5, [r0], r1		//r5=src[-1]
+
+	pld			[r0]
+	pld			[r0, r1]
+	vshr.u16	q15, q14, #2			// 5
+	ldr		r6, [r0], r1		//r6=src[0]
+	ldr		r7, [r0], r1		//r7=src[1]
+
+	vmov		d0, r4, r5
+	vmov		d1, r5, r6
+	vmov		d2, r6, r7
+
+	ldr		r4, [r0], r1		//r4=src[2]
+	vmov		d3, r7, r4
+	ldr			r7, [sp, #16]
+
+w4_v_mc_luma_loop:
+
+//	pld			[r0]
+	//reuse r4, which still holds the word kept from the setup/previous iteration
+	ldr		r5, [r0], r1		//r5=src[3]
+	ldr		r6, [r0], r1		//r6=src[0]
+	vmov		d4, r4, r5
+	vmov		d5, r5, r6			//reserved r6
+
+	FILTER_6TAG_8BITS 	d0, d1, d2, d3, d4, d5, d12, q14, q15
+	vmov		r4, r5, d12
+	str	r4, [r2], r3			//write 1st 4Byte
+	str	r5, [r2], r3			//write 2nd 4Byte
+
+	ldr		r5, [r0], r1		//r5=src[1]
+	ldr		r4, [r0], r1		//r4=src[2]
+	vmov		d0, r6, r5
+	vmov		d1, r5, r4			//reserved r4
+
+	FILTER_6TAG_8BITS 	d2, d3, d4, d5, d0, d1, d12, q14, q15
+	vmov		r5, r6, d12
+	str	r5, [r2], r3			//write 3rd 4Byte
+	str	r6, [r2], r3			//write 4th 4Byte
+
+	//d4, d5, d0, d1 --> d0, d1, d2, d3
+	vmov	q1, q0
+	vmov	q0, q2
+
+	sub		r7, #4
+	cmp		r7, #0
+	bne		w4_v_mc_luma_loop
+
+	pop		{r4, r5, r6, r7}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_hv_neon
+	push		{r4}
+	ldr			r4, [sp, #4]
+
+	sub			r0, #2					//src[-2]
+	sub			r0, r1, lsl #1		//src[-2*src_stride-2]
+	pld			[r0]
+	pld			[r0, r1]
+
+	vmov.u16	q14, #0x0014			// 20
+	vld1.u8	{d0-d2}, [r0], r1		//use 21(16+5), =src[-2]
+	vld1.u8	{d3-d5}, [r0], r1		//use 21(16+5), =src[-1]
+
+	pld			[r0]
+	pld			[r0, r1]
+	vshr.u16	q15, q14, #2			// 5
+
+	vld1.u8	{d6-d8}, [r0], r1		//use 21(16+5), =src[0]
+	vld1.u8	{d9-d11}, [r0], r1	//use 21(16+5), =src[1]
+	pld			[r0]
+	pld			[r0, r1]
+	vld1.u8	{d12-d14}, [r0], r1	//use 21(16+5), =src[2]
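+	// hv (centre) case: each row is presumably built by first applying the vertical 6-tap filter
+	// into 16-bit intermediates (FILTER_6TAG_8BITS_TO_16BITS) and then running the horizontal
+	// 6-tap over those intermediates (UNPACK_2_16BITS_TO_ABC + FILTER_3_IN_16BITS_TO_8BITS),
+	// matching the H.264 centre half-pel definition.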
+
+w16_hv_mc_luma_loop:
+
+	vld1.u8	{d15-d17}, [r0], r1	//use 21(16+5), =src[3]
+	//the 1st row
+	pld			[r0]
+	// vertical filtered into q9/q10
+	FILTER_6TAG_8BITS_TO_16BITS 	d0, d3, d6, d9, d12, d15, q9, q14, q15	// 8 avail
+	FILTER_6TAG_8BITS_TO_16BITS 	d1, d4, d7,d10, d13, d16,q10, q14, q15	// 8 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0	//output to q0[0]
+	// vertical filtered into q10/q11
+	FILTER_6TAG_8BITS_TO_16BITS 	d2, d5, d8,d11, d14, d17,q11, q14, q15	// only 5 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1	//output to q0[1]
+	vst1.u8	{q0}, [r2], r3		//write 16Byte
+
+
+	vld1.u8	{d0-d2}, [r0], r1		//read 2nd row
+	//the 2nd row
+	pld			[r0]
+	// vertical filtered into q9/q10
+	FILTER_6TAG_8BITS_TO_16BITS 	d3, d6, d9, d12, d15, d0, q9, q14, q15	// 8 avail
+	FILTER_6TAG_8BITS_TO_16BITS 	d4, d7,d10, d13, d16, d1,q10, q14, q15	// 8 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3	//output to d3
+	// vertical filtered into q10/q11
+	FILTER_6TAG_8BITS_TO_16BITS 	d5, d8,d11, d14, d17, d2,q11, q14, q15	// only 5 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4	//output to d4
+	vst1.u8	{d3, d4}, [r2], r3		//write 16Byte
+
+	vld1.u8	{d3-d5}, [r0], r1		//read 3rd row
+	//the 3rd row
+	pld			[r0]
+	// vertical filtered into q9/q10
+	FILTER_6TAG_8BITS_TO_16BITS 	d6, d9, d12, d15, d0, d3, q9, q14, q15	// 8 avail
+	FILTER_6TAG_8BITS_TO_16BITS 	d7,d10, d13, d16, d1, d4,q10, q14, q15	// 8 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6	//output to d6
+	// vertical filtered into q10/q11
+	FILTER_6TAG_8BITS_TO_16BITS 	d8,d11, d14, d17, d2, d5,q11, q14, q15	// only 5 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7	//output to d7
+	vst1.u8	{d6, d7}, [r2], r3		//write 16Byte
+
+	vld1.u8	{d6-d8}, [r0], r1		//read 4th row
+	//the 4th row
+	pld			[r0]
+	// vertical filtered into q9/q10
+	FILTER_6TAG_8BITS_TO_16BITS 	 d9, d12, d15, d0, d3, d6, q9, q14, q15	// 8 avail
+	FILTER_6TAG_8BITS_TO_16BITS		d10, d13, d16, d1, d4, d7,q10, q14, q15	// 8 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9	//output to d9
+	// vertical filtered into q10/q11
+	FILTER_6TAG_8BITS_TO_16BITS 	d11, d14, d17, d2, d5, d8,q11, q14, q15	// only 5 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10	//output to d10
+	vst1.u8	{d9, d10}, [r2], r3		//write 16Byte
+
+	//d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14
+	vswp	q0, q6
+	vswp	q6, q3
+	vmov	q5, q2
+	vmov	q2, q8
+
+	vmov	d20,d8
+	vmov	q4, q1
+	vmov	q1, q7
+	vmov	d14,d20
+
+	sub		r4, #4
+	cmp		r4, #0
+	bne		w16_hv_mc_luma_loop
+	pop		{r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w17_hv_neon
+	push		{r4}
+	ldr			r4, [sp, #4]
+
+	sub			r0, #2					//src[-2]
+	sub			r0, r1, lsl #1		//src[-2*src_stride-2]
+	pld			[r0]
+	pld			[r0, r1]
+
+	vmov.u16	q14, #0x0014			// 20
+	vld1.u8	{d0-d2}, [r0], r1		//use 21(17+5), =src[-2]
+	vld1.u8	{d3-d5}, [r0], r1		//use 21(17+5), =src[-1]
+
+	pld			[r0]
+	pld			[r0, r1]
+	vshr.u16	q15, q14, #2			// 5
+
+	vld1.u8	{d6-d8}, [r0], r1		//use 21(17+5), =src[0]
+	vld1.u8	{d9-d11}, [r0], r1	//use 21(17+5), =src[1]
+	pld			[r0]
+	pld			[r0, r1]
+	vld1.u8	{d12-d14}, [r0], r1	//use 21(17+5), =src[2]
+	sub			r3, #16
+
+w17_hv_mc_luma_loop:
+
+	vld1.u8	{d15-d17}, [r0], r1	//use 21(17+5), =src[3]
+	//the 1st row
+	pld			[r0]
+	// vertical filtered into q9/q10
+	FILTER_6TAG_8BITS_TO_16BITS 	d0, d3, d6, d9, d12, d15, q9, q14, q15	// 8 avail
+	FILTER_6TAG_8BITS_TO_16BITS 	d1, d4, d7,d10, d13, d16,q10, q14, q15	// 8 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0	//output to q0[0]
+	// vertical filtered into q10/q11
+	FILTER_6TAG_8BITS_TO_16BITS 	d2, d5, d8,d11, d14, d17,q11, q14, q15	// only 6 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1	//output to q0[1]
+	vst1.u8	{d0, d1}, [r2]!			//write 16Byte
+	UNPACK_1_IN_8x16BITS_TO_8BITS	d2, d22, d23, q11 //output to d2[0]
+	vst1.u8	{d2[0]}, [r2], r3		//write 16th Byte
+
+	vld1.u8	{d0-d2}, [r0], r1		//read 2nd row
+	//the 2nd row
+	pld			[r0]
+	// vertical filtered into q9/q10
+	FILTER_6TAG_8BITS_TO_16BITS 	d3, d6, d9, d12, d15, d0, q9, q14, q15	// 8 avail
+	FILTER_6TAG_8BITS_TO_16BITS 	d4, d7,d10, d13, d16, d1,q10, q14, q15	// 8 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3	//output to d3
+	// vertical filtered into q10/q11
+	FILTER_6TAG_8BITS_TO_16BITS 	d5, d8,d11, d14, d17, d2,q11, q14, q15	// only 6 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4	//output to d4
+	vst1.u8	{d3, d4}, [r2]!		//write 16Byte
+	UNPACK_1_IN_8x16BITS_TO_8BITS	d5, d22, d23, q11 //output to d5[0]
+	vst1.u8	{d5[0]}, [r2], r3		//write 16th Byte
+
+	vld1.u8	{d3-d5}, [r0], r1		//read 3rd row
+	//the 3rd row
+	pld			[r0]
+	// vertical filtered into q9/q10
+	FILTER_6TAG_8BITS_TO_16BITS 	d6, d9, d12, d15, d0, d3, q9, q14, q15	// 8 avail
+	FILTER_6TAG_8BITS_TO_16BITS 	d7,d10, d13, d16, d1, d4,q10, q14, q15	// 8 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6	//output to d6
+	// vertical filtered into q10/q11
+	FILTER_6TAG_8BITS_TO_16BITS 	d8,d11, d14, d17, d2, d5,q11, q14, q15	// only 6 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7	//output to d7
+	vst1.u8	{d6, d7}, [r2]!		//write 16Byte
+	UNPACK_1_IN_8x16BITS_TO_8BITS	d8, d22, d23, q11 //output to d8[0]
+	vst1.u8	{d8[0]}, [r2], r3		//write 16th Byte
+
+	vld1.u8	{d6-d8}, [r0], r1		//read 4th row
+	//the 4th row
+	pld			[r0]
+	// vertical filtered into q9/q10
+	FILTER_6TAG_8BITS_TO_16BITS 	 d9, d12, d15, d0, d3, d6, q9, q14, q15	// 8 avail
+	FILTER_6TAG_8BITS_TO_16BITS		d10, d13, d16, d1, d4, d7,q10, q14, q15	// 8 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9	//output to d9
+	// vertical filtered into q10/q11
+	FILTER_6TAG_8BITS_TO_16BITS 	d11, d14, d17, d2, d5, d8,q11, q14, q15	// only 6 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10	//output to d10
+	vst1.u8	{d9, d10}, [r2]!		//write 16Byte
+	UNPACK_1_IN_8x16BITS_TO_8BITS	d11, d22, d23, q11 //output to d11[0]
+	vst1.u8	{d11[0]}, [r2], r3		//write 16th Byte
+
+	//d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14
+	vswp	q0, q6
+	vswp	q6, q3
+	vmov	q5, q2
+	vmov	q2, q8
+
+	vmov	d20,d8
+	vmov	q4, q1
+	vmov	q1, q7
+	vmov	d14,d20
+
+	sub		r4, #4
+	cmp		r4, #1
+	bne		w17_hv_mc_luma_loop
+	//the last row
+	vld1.u8	{d15-d17}, [r0], r1	//use 21(17+5), =src[3]
+	// vertical filtered into q9/q10
+	FILTER_6TAG_8BITS_TO_16BITS 	d0, d3, d6, d9, d12, d15, q9, q14, q15	// 8 avail
+	FILTER_6TAG_8BITS_TO_16BITS 	d1, d4, d7,d10, d13, d16,q10, q14, q15	// 8 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0	//output to q0[0]
+	// vertical filtered into q10/q11
+	FILTER_6TAG_8BITS_TO_16BITS 	d2, d5, d8,d11, d14, d17,q11, q14, q15	// only 6 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1	//output to q0[1]
+	vst1.u8	{q0}, [r2]!			//write 16Byte
+	UNPACK_1_IN_8x16BITS_TO_8BITS	d2, d22, d23, q11 //output to d2[0]
+	vst1.u8	{d2[0]}, [r2], r3		//write 16th Byte
+
+	pop		{r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w9_hv_neon
+	push		{r4}
+	ldr			r4, [sp, #4]
+
+	sub			r0, #2				//src[-2]
+	sub			r0, r1, lsl #1	//src[-2*src_stride-2]
+	pld			[r0]
+	pld			[r0, r1]
+
+	vmov.u16	q14, #0x0014		// 20
+	vld1.u8	{q0}, [r0], r1	//use 14(9+5), =src[-2]
+	vld1.u8	{q1}, [r0], r1	//use 14(9+5), =src[-1]
+
+	pld			[r0]
+	pld			[r0, r1]
+	vshr.u16	q15, q14, #2		// 5
+
+	vld1.u8	{q2}, [r0], r1	//use 14(9+5), =src[0]
+	vld1.u8	{q3}, [r0], r1	//use 14(9+5), =src[1]
+	pld			[r0]
+	pld			[r0, r1]
+	vld1.u8	{q4}, [r0], r1	//use 14(9+5), =src[2]
+	sub			r3, #8
+
+w9_hv_mc_luma_loop:
+
+	vld1.u8	{q5}, [r0], r1	//use 14(9+5), =src[3]
+	//the 1st row
+	pld			[r0]
+	// vertical filtered into q6/q7
+	FILTER_6TAG_8BITS_TO_16BITS 	d0, d2, d4, d6, d8, d10, q6, q14, q15	// 8 avail
+	FILTER_6TAG_8BITS_TO_16BITS 	d1, d3, d5, d7, d9, d11, q7, q14, q15	// 6 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q6, q7, q11, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12	//output to q6[0]
+	vst1.u8	d12, [r2]!				//write 8Byte
+	UNPACK_1_IN_8x16BITS_TO_8BITS	d13, d14, d15, q7 //output to d13[0]
+	vst1.u8	{d13[0]}, [r2], r3	//write 8th Byte
+
+	vld1.u8	{q0}, [r0], r1		//read 2nd row
+	//the 2nd row
+	pld			[r0]
+	// vertical filtered into q6/q7
+	FILTER_6TAG_8BITS_TO_16BITS 	d2, d4, d6, d8, d10, d0, q6, q14, q15	// 8 avail
+	FILTER_6TAG_8BITS_TO_16BITS 	d3, d5, d7, d9, d11, d1, q7, q14, q15	// 6 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q6, q7, q11, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12	//output to q6[0]
+	vst1.u8	d12, [r2]!				//write 8Byte
+	UNPACK_1_IN_8x16BITS_TO_8BITS	d13, d14, d15, q7 //output to d13[0]
+	vst1.u8	{d13[0]}, [r2], r3	//write 8th Byte
+
+	vld1.u8	{q1}, [r0], r1		//read 3rd row
+	//the 3rd row
+	pld			[r0]
+	// vertical filtered into q6/q7
+	FILTER_6TAG_8BITS_TO_16BITS 	d4, d6, d8, d10, d0, d2, q6, q14, q15	// 8 avail
+	FILTER_6TAG_8BITS_TO_16BITS 	d5, d7, d9, d11, d1, d3, q7, q14, q15	// 6 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q6, q7, q11, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12	//output to q6[0]
+	vst1.u8	d12, [r2]!				//write 8Byte
+	UNPACK_1_IN_8x16BITS_TO_8BITS	d13, d14, d15, q7 //output to d13[0]
+	vst1.u8	{d13[0]}, [r2], r3	//write 8th Byte
+
+	vld1.u8	{q2}, [r0], r1		//read 4th row
+	//the 4th row
+	pld			[r0]
+	// vertical filtered into q6/q7
+	FILTER_6TAG_8BITS_TO_16BITS 	d6, d8, d10, d0, d2, d4, q6, q14, q15	// 8 avail
+	FILTER_6TAG_8BITS_TO_16BITS 	d7, d9, d11, d1, d3, d5, q7, q14, q15	// 6 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q6, q7, q11, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12	//output to q6[0]
+	vst1.u8	d12, [r2]!			//write 8Byte
+	UNPACK_1_IN_8x16BITS_TO_8BITS	d13, d14, d15, q7 //output to d13[0]
+	vst1.u8	{d13[0]}, [r2], r3	//write 8th Byte
+
+	//q4~q5, q0~q2, --> q0~q4
+	vswp	q0, q4
+	vswp	q2, q4
+	vmov	q3, q1
+	vmov	q1, q5
+
+	sub		r4, #4
+	cmp		r4, #1
+	bne		w9_hv_mc_luma_loop
+	//the last row
+	vld1.u8	{q5}, [r0], r1	//use 14(9+5), =src[3]
+	// vertical filtered into q6/q7
+	FILTER_6TAG_8BITS_TO_16BITS 	d0, d2, d4, d6, d8, d10, q6, q14, q15	// 8 avail
+	FILTER_6TAG_8BITS_TO_16BITS 	d1, d3, d5, d7, d9, d11, q7, q14, q15	// 6 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q6, q7, q11, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12	//output to q6[0]
+	vst1.u8	d12, [r2]!				//write 8Byte
+	UNPACK_1_IN_8x16BITS_TO_8BITS	d13, d14, d15, q7 //output to d13[0]
+	vst1.u8	{d13[0]}, [r2], r3	//write 8th Byte
+	pop		{r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_hv_neon
+	push		{r4, r5, r6}
+	ldr			r6, [sp, #12]
+
+	sub			r0, #2				//src[-2]
+	sub			r0, r1, lsl #1	//src[-2*src_stride-2]
+	pld			[r0]
+	pld			[r0, r1]
+
+	vmov.u16	q14, #0x0014		// 20
+	vld1.u8	{q0}, [r0], r1	//use 9(4+5), =src[-2]
+	vld1.u8	{q1}, [r0], r1	//use 9(4+5), =src[-1]
+
+	pld			[r0]
+	pld			[r0, r1]
+	vshr.u16	q15, q14, #2		// 5
+
+	vld1.u8	{q2}, [r0], r1	//use 9(4+5), =src[0]
+	vld1.u8	{q3}, [r0], r1	//use 9(4+5), =src[1]
+	pld			[r0]
+	pld			[r0, r1]
+	vld1.u8	{q4}, [r0], r1	//use 9(4+5), =src[2]
+
+w4_hv_mc_luma_loop:
+
+	vld1.u8	{q5}, [r0], r1	//use 9(4+5), =src[3]
+	vld1.u8	{q6}, [r0], r1	//use 9(4+5), =src[4]
+
+	//the 1st&2nd row
+	pld			[r0]
+	pld			[r0, r1]
+	// vertical filtered
+	FILTER_6TAG_8BITS_TO_16BITS 	d0, d2, d4, d6, d8, d10, q7, q14, q15	// 8 avail
+	FILTER_6TAG_8BITS_TO_16BITS 	d1, d3, d5, d7, d9, d11, q8, q14, q15	// 1 avail
+
+	FILTER_6TAG_8BITS_TO_16BITS 	d2, d4, d6, d8,d10, d12, q9, q14, q15	// 8 avail
+	FILTER_6TAG_8BITS_TO_16BITS 	d3, d5, d7, d9,d11, d13,q10, q14, q15	// 1 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q7, q8, q11, q12, q13	//4 avail
+	UNPACK_2_16BITS_TO_ABC	q9,q10, q0, q7, q8		//4 avail
+
+	vmov	d23, d0
+	vmov	d25, d14
+	vmov	d27, d16
+
+	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22	//output to q11[0]
+	vmov		r4, r5, d22
+	str		r4, [r2], r3				//write 4Byte
+	str		r5, [r2], r3				//write 4Byte
+
+	//the 3rd&4th row
+	vld1.u8	{q0}, [r0], r1	//use 9(4+5), =src[3]
+	vld1.u8	{q1}, [r0], r1	//use 9(4+5), =src[4]
+	pld			[r0]
+	pld			[r0, r1]
+	// vertical filtered
+	FILTER_6TAG_8BITS_TO_16BITS 	d4, d6, d8, d10, d12, d0, q7, q14, q15	// 8 avail
+	FILTER_6TAG_8BITS_TO_16BITS 	d5, d7, d9, d11, d13, d1, q8, q14, q15	// 1 avail
+
+	FILTER_6TAG_8BITS_TO_16BITS 	d6, d8,d10, d12, d0, d2, q9, q14, q15	// 8 avail
+	FILTER_6TAG_8BITS_TO_16BITS 	d7, d9,d11, d13, d1, d3,q10, q14, q15	// 1 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q7, q8, q11, q12, q13	//4 avail
+	UNPACK_2_16BITS_TO_ABC	q9,q10, q2, q7, q8		//4 avail
+
+	vmov	d23, d4
+	vmov	d25, d14
+	vmov	d27, d16
+
+	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22	//output to q11[0]
+	vmov		r4, r5, d22
+	str		r4, [r2], r3				//write 4Byte
+	str		r5, [r2], r3				//write 4Byte
+
+	//q4~q6, q0~q1, --> q0~q4
+	vswp	q4, q0
+	vmov	q3, q4
+	vmov	q4, q1
+	vmov	q1, q5
+	vmov	q2, q6
+
+	sub		r6, #4
+	cmp		r6, #0
+	bne		w4_hv_mc_luma_loop
+
+	pop		{r4, r5, r6}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_copy_w16_neon
+	push		{r4}
+	ldr			r4, [sp, #4]
+w16_copy_loop:
+	vld1.u8		{q0}, [r0], r1
+	vld1.u8		{q1}, [r0], r1
+	vst1.u8		{q0}, [r2], r3
+	vst1.u8		{q1}, [r2], r3
+	sub			r4, #2
+	cmp			r4, #0
+	bne			w16_copy_loop
+
+	pop		{r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_copy_w8_neon
+	push		{r4}
+	ldr			r4, [sp, #4]
+w8_copy_loop:
+	vld1.u8		{d0}, [r0], r1
+	vld1.u8		{d1}, [r0], r1
+	vst1.u8		{d0}, [r2], r3
+	vst1.u8		{d1}, [r2], r3
+	sub			r4, #2
+	cmp			r4, #0
+	bne			w8_copy_loop
+
+	pop		{r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_copy_w4_neon
+	push		{r4, r5, r6}
+	ldr			r4, [sp, #12]
+w4_copy_loop:
+	ldr		r5, [r0], r1
+	ldr		r6, [r0], r1
+	str		r5, [r2], r3
+	str		r6, [r2], r3
+
+	sub			r4, #2
+	cmp			r4, #0
+	bne			w4_copy_loop
+
+	pop		{r4, r5, r6}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_pixel_avg_w16_neon
+	push		{r4}
+	ldr			r4, [sp, #4]
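+	// AVERAGE_TWO_8BITS is assumed to be a rounding byte average ((a + b + 1) >> 1). This routine
+	// reads both inputs as contiguous 16-byte rows (post-increment), while enc_pix_avg_w16_neon
+	// below takes separate strides for the two sources.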
+w16_pix_avg_loop:
+	vld1.u8		{q0}, [r2]!
+	vld1.u8		{q1}, [r3]!
+	vld1.u8		{q2}, [r2]!
+	vld1.u8		{q3}, [r3]!
+
+	vld1.u8		{q4}, [r2]!
+	vld1.u8		{q5}, [r3]!
+	vld1.u8		{q6}, [r2]!
+	vld1.u8		{q7}, [r3]!
+
+	AVERAGE_TWO_8BITS		d0, d0, d2
+	AVERAGE_TWO_8BITS		d1, d1, d3
+	vst1.u8		{q0}, [r0], r1
+
+	AVERAGE_TWO_8BITS		d4, d4, d6
+	AVERAGE_TWO_8BITS		d5, d5, d7
+	vst1.u8		{q2}, [r0], r1
+
+	AVERAGE_TWO_8BITS		d8, d8, d10
+	AVERAGE_TWO_8BITS		d9, d9, d11
+	vst1.u8		{q4}, [r0], r1
+
+	AVERAGE_TWO_8BITS		d12, d12, d14
+	AVERAGE_TWO_8BITS		d13, d13, d15
+	vst1.u8		{q6}, [r0], r1
+
+	sub			r4, #4
+	cmp			r4, #0
+	bne			w16_pix_avg_loop
+
+	pop		{r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_pix_avg_w16_neon
+	push		{r4, r5, r6}
+	ldr			r4, [sp, #12]
+	ldr			r5, [sp, #16]
+	ldr			r6, [sp, #20]
+
+enc_w16_pix_avg_loop:
+	vld1.u8		{q0}, [r2], r3
+	vld1.u8		{q1}, [r4], r5
+	vld1.u8		{q2}, [r2], r3
+	vld1.u8		{q3}, [r4], r5
+
+	vld1.u8		{q4}, [r2], r3
+	vld1.u8		{q5}, [r4], r5
+	vld1.u8		{q6}, [r2], r3
+	vld1.u8		{q7}, [r4], r5
+
+	AVERAGE_TWO_8BITS		d0, d0, d2
+	AVERAGE_TWO_8BITS		d1, d1, d3
+	vst1.u8		{q0}, [r0], r1
+
+	AVERAGE_TWO_8BITS		d4, d4, d6
+	AVERAGE_TWO_8BITS		d5, d5, d7
+	vst1.u8		{q2}, [r0], r1
+
+	AVERAGE_TWO_8BITS		d8, d8, d10
+	AVERAGE_TWO_8BITS		d9, d9, d11
+	vst1.u8		{q4}, [r0], r1
+
+	AVERAGE_TWO_8BITS		d12, d12, d14
+	AVERAGE_TWO_8BITS		d13, d13, d15
+	vst1.u8		{q6}, [r0], r1
+
+	sub			r6, #4
+	cmp			r6, #0
+	bne			enc_w16_pix_avg_loop
+
+	pop		{r4, r5, r6}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_pix_avg_w8_neon
+	push		{r4, r5, r6}
+	ldr			r4, [sp, #12]
+	ldr			r5, [sp, #16]
+	ldr			r6, [sp, #20]
+enc_w8_pix_avg_loop:
+
+	vld1.u8		{d0}, [r2], r3
+	vld1.u8		{d2}, [r4], r5
+	vld1.u8		{d1}, [r2], r3
+	vld1.u8		{d3}, [r4], r5
+
+	AVERAGE_TWO_8BITS		d0, d0, d2
+	AVERAGE_TWO_8BITS		d1, d1, d3
+	vst1.u8		{d0}, [r0], r1
+	vst1.u8		{d1}, [r0], r1
+
+	vld1.u8		{d4}, [r2], r3
+	vld1.u8		{d6}, [r4], r5
+	vld1.u8		{d5}, [r2], r3
+	vld1.u8		{d7}, [r4], r5
+
+	AVERAGE_TWO_8BITS		d4, d4, d6
+	AVERAGE_TWO_8BITS		d5, d5, d7
+	vst1.u8		{d4}, [r0], r1
+	vst1.u8		{d5}, [r0], r1
+
+	sub			r6, #4
+	cmp			r6, #0
+	bne			enc_w8_pix_avg_loop
+
+	pop		{r4, r5, r6}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_chroma_w8_neon
+
+	push		{r4, r5}
+	ldr			r4, [sp, #8]
+	ldr			r5, [sp, #12]
+	vld1.u8	{d31}, [r4]		//load A/B/C/D
+	vld1.u8		{q0}, [r0], r1	//src[x]
+
+	vdup.u8	d28, d31[0]			//A
+	vdup.u8	d29, d31[1]			//B
+	vdup.u8	d30, d31[2]			//C
+	vdup.u8	d31, d31[3]			//D
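+	// Chroma MC: each output is (A*p00 + B*p01 + C*p10 + D*p11 + 32) >> 6 via vmull/vmlal and
+	// vrshrn #6; A/B/C/D come from the caller's weight table and are presumably the standard
+	// H.264 bilinear weights derived from the fractional motion vector.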
+
+	vext.u8		d1, d0, d1, #1		//src[x+1]
+
+w8_mc_chroma_loop:	// process two pixel rows per iteration
+	vld1.u8		{q1}, [r0], r1	//src[x+stride]
+	vld1.u8		{q2}, [r0], r1	//src[x+2*stride]
+	vext.u8		d3, d2, d3, #1		//src[x+stride+1]
+	vext.u8		d5, d4, d5, #1		//src[x+2*stride+1]
+
+	vmull.u8		q3, d0, d28			//(src[x] * A)
+	vmlal.u8		q3, d1, d29			//+=(src[x+1] * B)
+	vmlal.u8		q3, d2, d30			//+=(src[x+stride] * C)
+	vmlal.u8		q3, d3, d31			//+=(src[x+stride+1] * D)
+	vrshrn.u16		d6, q3, #6
+	vst1.u8	d6, [r2], r3
+
+	vmull.u8		q3, d2, d28			//(src[x+stride] * A)
+	vmlal.u8		q3, d3, d29			//+=(src[x+stride+1] * B)
+	vmlal.u8		q3, d4, d30			//+=(src[x+2*stride] * C)
+	vmlal.u8		q3, d5, d31			//+=(src[x+2*stride+1] * D)
+	vrshrn.u16		d6, q3, #6
+	vst1.u8	d6, [r2], r3
+
+	vmov		q0, q2
+	sub			r5, #2
+	cmp			r5, #0
+	bne			w8_mc_chroma_loop
+
+	pop		{r4, r5}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_chroma_w4_neon
+
+	push		{r4, r5, r6}
+	ldr			r4, [sp, #12]
+	ldr			r6, [sp, #16]
+	vld1.u8	{d31}, [r4]		//load A/B/C/D
+
+	vdup.u8	d28, d31[0]			//A
+	vdup.u8	d29, d31[1]			//B
+	vdup.u8	d30, d31[2]			//C
+	vdup.u8	d31, d31[3]			//D
+
+w4_mc_chroma_loop:	// process two pixel rows per iteration
+	vld1.u8		{d0}, [r0], r1	//a::src[x]
+	vld1.u8		{d2}, [r0], r1	//b::src[x+stride]
+	vld1.u8		{d4}, [r0]			//c::src[x+2*stride]
+
+	vshr.u64		d1, d0, #8
+	vshr.u64		d3, d2, #8
+	vshr.u64		d5, d4, #8
+
+	vmov			q3, q1				//b::[0:7]+b::[1~8]
+	vtrn.32		q0, q1				//d0{a::[0:3]+b::[0:3]}; d1{a::[1:4]+b::[1:4]}
+	vtrn.32		q3, q2				//d6{b::[0:3]+c::[0:3]}; d7{b::[1:4]+c::[1:4]}
+
+	vmull.u8		q1, d0, d28			//(src[x] * A)
+	vmlal.u8		q1, d1, d29			//+=(src[x+1] * B)
+	vmlal.u8		q1, d6, d30			//+=(src[x+stride] * C)
+	vmlal.u8		q1, d7, d31			//+=(src[x+stride+1] * D)
+
+	vrshrn.u16		d2, q1, #6
+	vmov		r4, r5, d2
+	str	r4, [r2], r3
+	str	r5, [r2], r3
+
+	sub			r6, #2
+	cmp			r6, #0
+	bne			w4_mc_chroma_loop
+
+	pop		{r4, r5, r6}
+WELS_ASM_FUNC_END
+#endif
--- a/codec/encoder/core/arm/memory_neon.S
+++ b/codec/encoder/core/arm/memory_neon.S
@@ -60,4 +60,4 @@
     vst1.64 {d0}, [r0]!
 WELS_ASM_FUNC_END
 
-#endif
\ No newline at end of file
+#endif
--- a/codec/encoder/core/arm/pixel_neon.S
+++ b/codec/encoder/core/arm/pixel_neon.S
@@ -35,73 +35,73 @@
 #include "arm_arch_common_macro.S"
 
 .macro SATD_16x4
-    vld1.64     {q0}, [r0,:128], r1   
-    vld1.64     {q1}, [r2], r3   
+    vld1.64     {q0}, [r0,:128], r1
+    vld1.64     {q1}, [r2], r3
 
-    vsubl.u8    q4,  d0,  d2        
-    vld1.64     {q2}, [r0,:128], r1  
+    vsubl.u8    q4,  d0,  d2
+    vld1.64     {q2}, [r0,:128], r1
 
-    vsubl.u8    q6, d1,  d3       
-    vld1.64     {q3}, [r2], r3   
+    vsubl.u8    q6, d1,  d3
+    vld1.64     {q3}, [r2], r3
 
-    vsubl.u8    q5,  d4,  d6        
-    vld1.64     {q0}, [r0,:128], r1  
+    vsubl.u8    q5,  d4,  d6
+    vld1.64     {q0}, [r0,:128], r1
 
-    vsubl.u8    q7, d5,  d7	
+    vsubl.u8    q7, d5,  d7
     vld1.64     {q1}, [r2], r3
 
     vsubl.u8    q8, d0,  d2
-    vld1.64     {q2}, [r0,:128], r1 
+    vld1.64     {q2}, [r0,:128], r1
 
     vsubl.u8    q10, d1,  d3
-    vadd.s16    q0,  q4,  q5  
+    vadd.s16    q0,  q4,  q5
 
-    vld1.64     {q3}, [r2], r3  
-    vsub.s16    q1,  q4,  q5     
+    vld1.64     {q3}, [r2], r3
+    vsub.s16    q1,  q4,  q5
 
-    vsubl.u8    q9, d4,  d6     
-    vsubl.u8    q11, d5,  d7    
+    vsubl.u8    q9, d4,  d6
+    vsubl.u8    q11, d5,  d7
 
-    vadd.s16    q2, q8, q9       
-    vsub.s16    q3, q8, q9		
+    vadd.s16    q2, q8, q9
+    vsub.s16    q3, q8, q9
 
-    vadd.s16    q4, q6, q7			
+    vadd.s16    q4, q6, q7
     vsub.s16	q5, q6, q7
 
-    vadd.s16    q6, q10, q11	
-    vsub.s16	q7, q10, q11	
+    vadd.s16    q6, q10, q11
+    vsub.s16	q7, q10, q11
 
-    vadd.s16    q8, q0, q2      
-    vsub.s16    q10, q0, q2 
+    vadd.s16    q8, q0, q2
+    vsub.s16    q10, q0, q2
 
-    vadd.s16    q9, q4, q6  
-    vsub.s16    q11, q4, q6 	
+    vadd.s16    q9, q4, q6
+    vsub.s16    q11, q4, q6
 
-    vsub.s16    q0, q1, q3		
-    vadd.s16    q2, q1, q3     
+    vsub.s16    q0, q1, q3
+    vadd.s16    q2, q1, q3
 
-    vsub.s16    q1, q5, q7		
-    vadd.s16    q3, q5, q7    
+    vsub.s16    q1, q5, q7
+    vadd.s16    q3, q5, q7
 
-    vtrn.16 q8, q10  
-    vtrn.16 q9, q11	
+    vtrn.16 q8, q10
+    vtrn.16 q9, q11
 
-    vadd.s16 q4, q8, q10		
-    vabd.s16 q6, q8, q10	
+    vadd.s16 q4, q8, q10
+    vabd.s16 q6, q8, q10
 
-    vadd.s16 q5, q9, q11		
-    vabd.s16 q7, q9, q11	
+    vadd.s16 q5, q9, q11
+    vabd.s16 q7, q9, q11
 
     vabs.s16 q4, q4
     vabs.s16 q5, q5
 
-    vtrn.16 q0, q2			
-    vtrn.16 q1, q3			
+    vtrn.16 q0, q2
+    vtrn.16 q1, q3
 
-    vadd.s16 q8, q0, q2			  
-    vabd.s16 q10, q0, q2		 
+    vadd.s16 q8, q0, q2
+    vabd.s16 q10, q0, q2
 
-    vadd.s16 q9, q1, q3			  
+    vadd.s16 q9, q1, q3
     vabd.s16 q11, q1, q3
 
     vabs.s16 q8, q8
@@ -128,31 +128,31 @@
     vld1.64     {d1}, [r2], r3
 
     vld1.64     {d2}, [r0,:64], r1
-    vsubl.u8    q4, d0, d1            
+    vsubl.u8    q4, d0, d1
 
     vld1.64     {d3}, [r2], r3
-    vsubl.u8    q5, d2, d3         
+    vsubl.u8    q5, d2, d3
 
     vld1.64     {d4}, [r0,:64], r1
     vld1.64     {d5}, [r2], r3
 
-    vadd.s16    q8, q4, q5                 
-    vsubl.u8    q6, d4, d5             
+    vadd.s16    q8, q4, q5
+    vsubl.u8    q6, d4, d5
 
     vld1.64     {d6}, [r0,:64], r1
     vld1.64     {d7}, [r2], r3
 
-    vsubl.u8    q7, d6,  d7               
-    vsub.s16    q9, q4, q5       
+    vsubl.u8    q7, d6,  d7
+    vsub.s16    q9, q4, q5
 
-    vadd.s16    q10, q6, q7         
-    vsub.s16    q11, q6, q7         
+    vadd.s16    q10, q6, q7
+    vsub.s16    q11, q6, q7
 
-    vadd.s16    q0, q8, q10               
-    vsub.s16    q1, q8, q10      
+    vadd.s16    q0, q8, q10
+    vsub.s16    q1, q8, q10
 
-    vsub.s16    q2, q9, q11            
-    vadd.s16    q3, q9, q11        
+    vsub.s16    q2, q9, q11
+    vadd.s16    q3, q9, q11
 
     vtrn.16     q0, q1
     vtrn.16     q2, q3
@@ -220,7 +220,7 @@
 .endm
 
 
-WELS_ASM_FUNC_BEGIN pixel_sad_16x16_neon
+WELS_ASM_FUNC_BEGIN WelsSampleSad16x16_neon
 
     vld1.64 {q0}, [r0, :128], r1
     vld1.64 {q1}, [r2], r3
@@ -260,7 +260,7 @@
 WELS_ASM_FUNC_END
 
 
-WELS_ASM_FUNC_BEGIN pixel_sad_16x8_neon
+WELS_ASM_FUNC_BEGIN WelsSampleSad16x8_neon
 
     vld1.64 {q0}, [r0, :128], r1
     vld1.64 {q1}, [r2], r3
@@ -298,7 +298,7 @@
 WELS_ASM_FUNC_END
 
 
-WELS_ASM_FUNC_BEGIN pixel_sad_8x16_neon
+WELS_ASM_FUNC_BEGIN WelsSampleSad8x16_neon
 
     vld1.64 {d0}, [r0, :64], r1
     vld1.64 {d1}, [r2], r3
@@ -332,7 +332,7 @@
 WELS_ASM_FUNC_END
 
 
-WELS_ASM_FUNC_BEGIN pixel_sad_8x8_neon
+WELS_ASM_FUNC_BEGIN WelsSampleSad8x8_neon
 
     vld1.64 {d0}, [r0, :64], r1
     vld1.64 {d1}, [r2], r3
@@ -364,7 +364,7 @@
 WELS_ASM_FUNC_END
 
 
-WELS_ASM_FUNC_BEGIN pixel_sad_4x4_neon
+WELS_ASM_FUNC_BEGIN WelsSampleSad4x4_neon
     stmdb sp!, {r4-r5, lr}
 
 	//Loading a horizontal line data (4 bytes)
@@ -376,23 +376,23 @@
     //line 1
 	ldr r4, [r0], r1
 	ldr r5, [r2], r3
-	usada8  lr, r4, r5, lr	
+	usada8  lr, r4, r5, lr
 
-    //line 2	
+    //line 2
 	ldr r4, [r0], r1
 	ldr r5, [r2], r3
-	usada8  lr, r4, r5, lr	
-	
+	usada8  lr, r4, r5, lr
+
 	//line 3
 	ldr r4, [r0]
 	ldr r5, [r2]
-	usada8  r0, r4, r5, lr	
+	usada8  r0, r4, r5, lr
 
 	ldmia sp!, {r4-r5, lr}
 WELS_ASM_FUNC_END
 
 
-WELS_ASM_FUNC_BEGIN pixel_sad_4_16x16_neon
+WELS_ASM_FUNC_BEGIN WelsSampleSadFour16x16_neon
 
     stmdb sp!, {r4-r5, lr}
 
@@ -400,30 +400,30 @@
 	sub   r4, r2, #1
 	add   r5, r2, #1
 	sub   r2, r3
-			
+
     //Loading a horizontal line data (16 bytes)
 	vld1.8 {q0}, [r0], r1 //save pix1
-	
+
 	vld1.8 {q1}, [r2], r3 //save pix2 - stride
 	vld1.8 {q6}, [r2], r3 //save pix2
 	vld1.8 {q2}, [r2], r3 //save pix2 + stride
-	
+
 	vld1.8 {q3}, [r4], r3 //save pix2 - 1
-	vld1.8 {q4}, [r5], r3 //save pix2 + 1	
-		    
+	vld1.8 {q4}, [r5], r3 //save pix2 + 1
+
 	//Do the SAD for 16 bytes
 	vabdl.u8  q15, d0, d2
 	vabal.u8  q15, d1, d3
-	
+
 	vabdl.u8  q13, d0, d4
 	vabal.u8  q13, d1, d5
-	
+
 	vabdl.u8  q11, d0, d6
 	vabal.u8  q11, d1, d7
-	
+
 	vabdl.u8  q9, d0, d8
-	vabal.u8  q9, d1, d9			
-	
+	vabal.u8  q9, d1, d9
+
 	mov lr, #15
 pixel_sad_4_16x16_loop_0:
 
@@ -436,13 +436,13 @@
 	vabal.u8  q15, d1, d3
 	vld1.8 {q3}, [r4], r3 //save pix2 - 1
 	vabal.u8  q13, d0, d4
-	vld1.8 {q4}, [r5], r3 //save pix2 + 1	
+	vld1.8 {q4}, [r5], r3 //save pix2 + 1
     vabal.u8  q13, d1, d5
 	subs lr, #1
 
 	vabal.u8  q11, d0, d6
 	vabal.u8  q11, d1, d7
-	
+
 	vabal.u8  q9, d0, d8
 	vabal.u8  q9, d1, d9
 
@@ -451,18 +451,18 @@
 
     //Save SAD to 'r0'
 	ldr   r0, [sp, #12]
-	
+
 	vadd.u16   d0, d30, d31
 	vadd.u16   d1, d26, d27
 	vadd.u16   d2, d22, d23
 	vadd.u16   d3, d18, d19
-	
+
 	vpaddl.u16 q0, q0
 	vpaddl.u16 q1, q1
-	
+
 	vpaddl.u32 q0, q0
 	vpaddl.u32 q1, q1
-		
+
 	vshl.u32   q0, #4
 	vshl.u32   q1, #4
 	vst4.32    {d0[0],d1[0],d2[0],d3[0]}, [r0]
@@ -471,37 +471,37 @@
 WELS_ASM_FUNC_END
 
 
-WELS_ASM_FUNC_BEGIN pixel_sad_4_16x8_neon
+WELS_ASM_FUNC_BEGIN WelsSampleSadFour16x8_neon
     stmdb sp!, {r4-r5, lr}
-	
+
 	//Generate the pix2 start addr
 	sub   r4, r2, #1
 	add   r5, r2, #1
 	sub   r2, r3
-			
+
     //Loading a horizontal line data (16 bytes)
 	vld1.8 {q0}, [r0], r1 //save pix1
-	
+
 	vld1.8 {q1}, [r2], r3 //save pix2 - stride
 	vld1.8 {q6}, [r2], r3 //save pix2
 	vld1.8 {q2}, [r2], r3 //save pix2 + stride
-	
+
 	vld1.8 {q3}, [r4], r3 //save pix2 - 1
-	vld1.8 {q4}, [r5], r3 //save pix2 + 1	
-		    
+	vld1.8 {q4}, [r5], r3 //save pix2 + 1
+
 	//Do the SAD for 16 bytes
 	vabdl.u8  q15, d0, d2
 	vabal.u8  q15, d1, d3
-	
+
 	vabdl.u8  q13, d0, d4
 	vabal.u8  q13, d1, d5
-	
+
 	vabdl.u8  q11, d0, d6
 	vabal.u8  q11, d1, d7
-	
+
 	vabdl.u8  q9, d0, d8
-	vabal.u8  q9, d1, d9			
-	
+	vabal.u8  q9, d1, d9
+
 	mov lr, #7
 pixel_sad_4_16x8_loop_0:
 
@@ -514,67 +514,67 @@
 	vabal.u8  q15, d1, d3
 	vld1.8 {q3}, [r4], r3 //save pix2 - 1
 	vabal.u8  q13, d0, d4
-	vld1.8 {q4}, [r5], r3 //save pix2 + 1	
+	vld1.8 {q4}, [r5], r3 //save pix2 + 1
     vabal.u8  q13, d1, d5
 	subs lr, #1
 
 	vabal.u8  q11, d0, d6
 	vabal.u8  q11, d1, d7
-	
+
 	vabal.u8  q9, d0, d8
 	vabal.u8  q9, d1, d9
-	
+
 	bne pixel_sad_4_16x8_loop_0
 
     //Save SAD to 'r0'
 	ldr   r0, [sp, #12]
-	
+
 	vadd.u16   d0, d30, d31
 	vadd.u16   d1, d26, d27
 	vadd.u16   d2, d22, d23
 	vadd.u16   d3, d18, d19
-	
+
 	vpaddl.u16 q0, q0
 	vpaddl.u16 q1, q1
-	
+
 	vpaddl.u32 q0, q0
 	vpaddl.u32 q1, q1
-		
+
 	vshl.u32   q0, #4
 	vshl.u32   q1, #4
 	vst4.32    {d0[0],d1[0],d2[0],d3[0]}, [r0]
-	
+
 	ldmia sp!, {r4-r5, lr}
 WELS_ASM_FUNC_END
 
- 
-WELS_ASM_FUNC_BEGIN pixel_sad_4_8x16_neon
+
+WELS_ASM_FUNC_BEGIN WelsSampleSadFour8x16_neon
     stmdb sp!, {r4-r5, lr}
-	
+
 	//Generate the pix2 start addr
 	sub   r4, r2, #1
 	add   r5, r2, #1
 	sub   r2, r3
-			
+
     //Loading a horizontal line data (8 bytes)
 	vld1.8 {d0}, [r0], r1 //save pix1
-	
+
 	vld1.8 {d1}, [r2], r3 //save pix2 - stride
 	vld1.8 {d6}, [r2], r3 //save pix2
 	vld1.8 {d2}, [r2], r3 //save pix2 + stride
-	
+
 	vld1.8 {d3}, [r4], r3 //save pix2 - 1
-	vld1.8 {d4}, [r5], r3 //save pix2 + 1	
-		    
+	vld1.8 {d4}, [r5], r3 //save pix2 + 1
+
 	//Do the SAD for 8 bytes
 	vabdl.u8  q15, d0, d1
 	vabdl.u8  q14, d0, d2
 	vabdl.u8  q13, d0, d3
-	vabdl.u8  q12, d0, d4		
-	
+	vabdl.u8  q12, d0, d4
+
 	mov lr, #15
 pixel_sad_4_8x16_loop_0:
-	
+
     //Loading a horizontal line data (8 bytes)
 	vld1.8 {d0}, [r0], r1 //save pix1
 	vmov.8 d1,   d6       //save pix2 - stride
@@ -582,7 +582,7 @@
 	vld1.8 {d2}, [r2], r3 //save pix2 + stride
 	vld1.8 {d3}, [r4], r3 //save pix2 - 1
 	vabal.u8  q15, d0, d1
-	
+
 	vld1.8 {d4}, [r5], r3 //save pix2 + 1
 	//Do the SAD for 8 bytes
 	vabal.u8  q14, d0, d2
@@ -594,50 +594,50 @@
 
     //Save SAD to 'r0'
 	ldr   r0, [sp, #12]
-	
+
 	vadd.u16   d0, d30, d31
 	vadd.u16   d1, d28, d29
 	vadd.u16   d2, d26, d27
 	vadd.u16   d3, d24, d25
-	
+
 	vpaddl.u16 q0, q0
 	vpaddl.u16 q1, q1
-	
+
 	vpaddl.u32 q0, q0
 	vpaddl.u32 q1, q1
-		
+
 	vshl.u32   q0, #4
 	vshl.u32   q1, #4
 	vst4.32    {d0[0],d1[0],d2[0],d3[0]}, [r0]
-	
+
 	ldmia sp!, {r4-r5, lr}
 WELS_ASM_FUNC_END
 
 
-WELS_ASM_FUNC_BEGIN pixel_sad_4_8x8_neon
+WELS_ASM_FUNC_BEGIN WelsSampleSadFour8x8_neon
 	stmdb sp!, {r4-r5, lr}
-	
+
 	//Generate the pix2 start addr
 	sub   r4, r2, #1
 	add   r5, r2, #1
 	sub   r2, r3
-			
+
     //Loading a horizontal line data (8 bytes)
 	vld1.8 {d0}, [r0], r1 //save pix1
-	
+
 	vld1.8 {d1}, [r2], r3 //save pix2 - stride
 	vld1.8 {d6}, [r2], r3 //save pix2
 	vld1.8 {d2}, [r2], r3 //save pix2 + stride
-	
+
 	vld1.8 {d3}, [r4], r3 //save pix2 - 1
-	vld1.8 {d4}, [r5], r3 //save pix2 + 1	
-		    
+	vld1.8 {d4}, [r5], r3 //save pix2 + 1
+
 	//Do the SAD for 8 bytes
 	vabdl.u8  q15, d0, d1
 	vabdl.u8  q14, d0, d2
 	vabdl.u8  q13, d0, d3
-	vabdl.u8  q12, d0, d4		
-	
+	vabdl.u8  q12, d0, d4
+
 	mov lr, #7
 pixel_sad_4_8x8_loop_0:
 
@@ -648,7 +648,7 @@
 	vld1.8 {d2}, [r2], r3 //save pix2 + stride
 	vld1.8 {d3}, [r4], r3 //save pix2 - 1
 	vabal.u8  q15, d0, d1
-	
+
 	vld1.8 {d4}, [r5], r3 //save pix2 + 1
 	//Do the SAD for 8 bytes
 	vabal.u8  q14, d0, d2
@@ -659,34 +659,34 @@
 
     //Save SAD to 'r0'
 	ldr   r0, [sp, #12]
-	
+
 	vadd.u16   d0, d30, d31
 	vadd.u16   d1, d28, d29
 	vadd.u16   d2, d26, d27
 	vadd.u16   d3, d24, d25
-	
+
 	vpaddl.u16 q0, q0
 	vpaddl.u16 q1, q1
-	
+
 	vpaddl.u32 q0, q0
 	vpaddl.u32 q1, q1
-		
+
 	vshl.u32   q0, #4
 	vshl.u32   q1, #4
 	vst4.32    {d0[0],d1[0],d2[0],d3[0]}, [r0]
-	
+
 	ldmia sp!, {r4-r5, lr}
 WELS_ASM_FUNC_END
 
 
-WELS_ASM_FUNC_BEGIN pixel_sad_4_4x4_neon
+WELS_ASM_FUNC_BEGIN WelsSampleSadFour4x4_neon
 
 	vld1.32  {d0[0]}, [r0], r1
 	vld1.32  {d0[1]}, [r0], r1
 	vld1.32  {d1[0]}, [r0], r1
 	vld1.32  {d1[1]}, [r0]
-	
-	
+
+
 	sub   r0, r2, r3
 	vld1.32  {d2[0]}, [r0], r3
 	vld1.32  {d2[1]}, [r0], r3
@@ -693,32 +693,32 @@
 	vld1.32  {d3[0]}, [r0], r3
 	vld1.32  {d3[1]}, [r0], r3
 	vld1.32  {d4[0]}, [r0], r3
-	vld1.32  {d4[1]}, [r0]		
-					
-	sub   r0,  r2, #1				
+	vld1.32  {d4[1]}, [r0]
+
+	sub   r0,  r2, #1
 	vld1.32  {d5[0]}, [r0], r3
 	vld1.32  {d5[1]}, [r0], r3
 	vld1.32  {d6[0]}, [r0], r3
-	vld1.32  {d6[1]}, [r0]	
-	
-	add   r0,  r2, #1				
+	vld1.32  {d6[1]}, [r0]
+
+	add   r0,  r2, #1
 	vld1.32  {d7[0]}, [r0], r3
 	vld1.32  {d7[1]}, [r0], r3
 	vld1.32  {d8[0]}, [r0], r3
 	vld1.32  {d8[1]}, [r0]
-	
+
 	vabdl.u8  q15, d0, d2
 	vabdl.u8  q14, d1, d3
-	
+
 	vabdl.u8  q13, d0, d3
 	vabdl.u8  q12, d1, d4
-	
+
 	vabdl.u8  q11, d0, d5
 	vabdl.u8  q10, d1, d6
-	
+
 	vabdl.u8  q9, d0, d7
 	vabdl.u8  q8, d1, d8
-	
+
 	//Save SAD to 'r4'
 	ldr   r0, [sp]
 	vadd.u16   q0, q14, q15
@@ -725,18 +725,18 @@
 	vadd.u16   q1, q12, q13
 	vadd.u16   q2, q10, q11
 	vadd.u16   q3, q8 , q9
-	
+
 	vadd.u16   d0, d1
 	vadd.u16   d1, d2, d3
 	vadd.u16   d2, d4, d5
 	vadd.u16   d3, d6, d7
-	
+
 	vpaddl.u16 q0, q0
 	vpaddl.u16 q1, q1
-	
+
 	vpaddl.u32 q0, q0
 	vpaddl.u32 q1, q1
-		
+
 	vshl.u32   q0, #4
 	vshl.u32   q1, #4
 	vst4.32    {d0[0],d1[0],d2[0],d3[0]}, [r0]
@@ -744,7 +744,7 @@
 WELS_ASM_FUNC_END
 
 
-WELS_ASM_FUNC_BEGIN pixel_satd_16x16_neon
+WELS_ASM_FUNC_BEGIN WelsSampleSatd16x16_neon
 
     SATD_16x4
     vadd.u16    q15,  q0, q2
@@ -769,7 +769,7 @@
 WELS_ASM_FUNC_END
 
 
-WELS_ASM_FUNC_BEGIN pixel_satd_16x8_neon
+WELS_ASM_FUNC_BEGIN WelsSampleSatd16x8_neon
 
     SATD_16x4
     vadd.u16    q15,  q0, q2
@@ -786,7 +786,7 @@
 WELS_ASM_FUNC_END
 
 
-WELS_ASM_FUNC_BEGIN pixel_satd_8x16_neon
+WELS_ASM_FUNC_BEGIN WelsSampleSatd8x16_neon
 
     SATD_8x4
     vadd.u16    q15,  q0, q1
@@ -811,7 +811,7 @@
 WELS_ASM_FUNC_END
 
 
-WELS_ASM_FUNC_BEGIN pixel_satd_8x8_neon
+WELS_ASM_FUNC_BEGIN WelsSampleSatd8x8_neon
 
     SATD_8x4
     vadd.u16    q15,  q0, q1
@@ -828,7 +828,7 @@
 WELS_ASM_FUNC_END
 
 
-WELS_ASM_FUNC_BEGIN pixel_satd_4x4_neon
+WELS_ASM_FUNC_BEGIN WelsSampleSatd4x4_neon
 
     //Load the pix1 data --- 16 bytes
 	vld1.32  {d0[0]}, [r0], r1
@@ -836,11 +836,11 @@
 	vld1.32  {d1[0]}, [r0], r1
 	vld1.32  {d1[1]}, [r0]
 
-    //Load the pix2 data --- 16 bytes	
+    //Load the pix2 data --- 16 bytes
 	vld1.32  {d2[0]}, [r2], r3
 	vld1.32  {d2[1]}, [r2], r3
 	vld1.32  {d3[0]}, [r2], r3
-	vld1.32  {d3[1]}, [r2]	
+	vld1.32  {d3[1]}, [r2]
 
     //Get the difference
     vsubl.u8 q15, d0, d2 //{0,1,2,3,4,5,6,7}
@@ -861,15 +861,15 @@
     vtrn.16 q13, q12
     vadd.s16 q15, q13, q12
 
-    //Do the SAD	
-    vabs.s16 q15, q15	
+    //Do the SAD
+    vabs.s16 q15, q15
     vabd.s16 q14, q13, q12
 
     vadd.u16 q0, q15, q14
 
     vrhadd.u16 d0, d1
-	vpaddl.u16 d0, d0
-	vpaddl.u32 d0, d0
+    vpaddl.u16 d0, d0
+    vpaddl.u32 d0, d0
 
 	vmov.u32   r0, d0[0]
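WelsSampleSatd4x4_neon follows the usual SATD recipe visible above: form the 4x4 residual (vsubl.u8), apply a 4x4 Hadamard transform via the transpose/add/abs sequence, and sum the absolute transform coefficients. A scalar sketch under those assumptions (the final halving is the conventional normalisation and may not match the assembly's rounding bit-for-bit):

    #include <stdint.h>
    #include <stdlib.h>

    static int32_t Satd4x4_c (const uint8_t* p1, int s1, const uint8_t* p2, int s2) {
        int32_t d[16], m[16], satd = 0;
        for (int y = 0; y < 4; y++)                      /* residual */
            for (int x = 0; x < 4; x++)
                d[y * 4 + x] = p1[y * s1 + x] - p2[y * s2 + x];
        for (int i = 0; i < 4; i++) {                    /* horizontal Hadamard */
            int32_t a = d[i*4+0] + d[i*4+3], b = d[i*4+1] + d[i*4+2];
            int32_t c = d[i*4+1] - d[i*4+2], e = d[i*4+0] - d[i*4+3];
            m[i*4+0] = a + b; m[i*4+1] = c + e; m[i*4+2] = a - b; m[i*4+3] = e - c;
        }
        for (int i = 0; i < 4; i++) {                    /* vertical Hadamard + abs */
            int32_t a = m[0+i] + m[12+i], b = m[4+i] + m[8+i];
            int32_t c = m[4+i] - m[8+i], e = m[0+i] - m[12+i];
            satd += abs (a + b) + abs (c + e) + abs (a - b) + abs (e - c);
        }
        return (satd + 1) >> 1;
    }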
 
--- a/codec/encoder/core/arm/reconstruct_neon.S
+++ b/codec/encoder/core/arm/reconstruct_neon.S
@@ -1,1312 +1,1312 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#ifdef  HAVE_NEON
-.text
-#include "arm_arch_common_macro.S"
-
-#ifdef APPLE_IOS
-.macro	LORD_ALIGNED_DATA_WITH_STRIDE
-//	{	//	input: $0~$3, src*, src_stride	
-		vld1.64	{$0}, [$4,:128], $5
-		vld1.64	{$1}, [$4,:128], $5
-		vld1.64	{$2}, [$4,:128], $5
-		vld1.64	{$3}, [$4,:128], $5
-//	}
-.endm
-
-.macro	STORE_ALIGNED_DATA_WITH_STRIDE
-//	{	//	input: $0~$3, dst*, dst_stride	
-		vst1.64	{$0}, [$4,:128], $5
-		vst1.64	{$1}, [$4,:128], $5
-		vst1.64	{$2}, [$4,:128], $5
-		vst1.64	{$3}, [$4,:128], $5
-//	}
-.endm
-
-.macro	LORD_UNALIGNED_DATA_WITH_STRIDE
-//	{	//	input: $0~$3, src*, src_stride	
-		vld1.64	{$0}, [$4], $5
-		vld1.64	{$1}, [$4], $5
-		vld1.64	{$2}, [$4], $5
-		vld1.64	{$3}, [$4], $5
-//	}
-.endm
-
-.macro	STORE_UNALIGNED_DATA_WITH_STRIDE
-//	{	//	input: $0~$3, dst*, dst_stride	
-		vst1.64	{$0}, [$4], $5
-		vst1.64	{$1}, [$4], $5
-		vst1.64	{$2}, [$4], $5
-		vst1.64	{$3}, [$4], $5
-//	}
-.endm
-
-.macro	LOAD_4x4_DATA_FOR_DCT
-//	{	//	input: $0~$3, src1*, src1_stride, src2*, src2_stride	
-		vld2.16	{$0[0],$1[0]}, [$4], $5
-		vld2.16	{$2[0],$3[0]}, [$6], $7
-		vld2.16	{$0[1],$1[1]}, [$4], $5
-		vld2.16	{$2[1],$3[1]}, [$6], $7
-
-		vld2.16	{$0[2],$1[2]}, [$4], $5
-		vld2.16	{$2[2],$3[2]}, [$6], $7
-		vld2.16	{$0[3],$1[3]}, [$4], $5
-		vld2.16	{$2[3],$3[3]}, [$6], $7
-//	}
-.endm
-
-.macro	LOAD_8x8_DATA_FOR_DCT
-//	{	//	input: $0~$3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride
-		vld1.64	{$0}, [$8], r2
-		vld1.64	{$4}, [$9], r4
-		vld1.64	{$1}, [$8], r2
-		vld1.64	{$5}, [$9], r4
-
-		vld1.64	{$2}, [$8], r2
-		vld1.64	{$6}, [$9], r4
-		vld1.64	{$3}, [$8], r2
-		vld1.64	{$7}, [$9], r4
-//	}
-.endm
-	
-.macro	DCT_ROW_TRANSFORM_TOTAL_16BITS
-//	{	//	input: src_d[0]~[3], working: [4]~[7]
-		vadd.s16		$4, $0, $3			//int16 s[0] = data[i] + data[i3];
-		vsub.s16		$7, $0, $3			//int16 s[3] = data[i] - data[i3];
-		vadd.s16		$5, $1, $2			//int16 s[1] = data[i1] + data[i2];
-		vsub.s16		$6, $1, $2			//int16 s[2] = data[i1] - data[i2];		
-		
-		vadd.s16		$0, $4, $5			//int16 dct[i ] = s[0] + s[1];
-		vsub.s16		$2, $4, $5			//int16 dct[i2] = s[0] - s[1];		
-		vshl.s16		$1, $7, #1
-		vshl.s16		$3, $6, #1
-		vadd.s16		$1, $1, $6			//int16 dct[i1] = (s[3] << 1) + s[2];
-		vsub.s16		$3, $7, $3			//int16 dct[i3] = s[3] - (s[2] << 1);
-//	}
-.endm
-
-.macro	MATRIX_TRANSFORM_EACH_16BITS
-//	{	//	input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
-		vtrn.s16		$0, $1				//[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
-		vtrn.s16		$2, $3				//[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
-		vtrn.32		$0, $2				//[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
-		vtrn.32		$1, $3				//[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
-//	}
-.endm
-
-.macro	NEWQUANT_COEF_EACH_16BITS	// if coef <= 0, - coef; else , coef;
-//	{	//	input:	coef, ff (dst), ff_d0, ff_d1, mf_d0, md_d1
-		veor.s16		$6, $6			// init 0 , and keep 0;
-		vaba.s16		$1, $0, $6		// f + abs(coef - 0)
-		vmull.s16		$7, $2, $4
-		vmull.s16		$8, $3, $5
-		vshr.s32		$7, #16
-		vshr.s32		$8, #16	
-		vmovn.s32		$2, $7
-		vmovn.s32		$3, $8	
-	
-		vcgt.s16		$7, $0, #0		// if true, location of coef == 11111111
-		vbif.s16		$6, $1, $7		// if (x<0) reserved part; else keep 0 untouched
-		vshl.s16		$6, #1
-		vsub.s16		$1, $1, $6		// if x > 0, -= 0; else x-= 2x
-//	}
-.endm
-
-.macro	NEWQUANT_COEF_EACH_16BITS_MAX	// if coef <= 0, - coef; else , coef;
-//	{	//	input:	coef, ff (dst), ff_d0, ff_d1, mf_d0(max), md_d1
-		veor.s16		$6, $6			// init 0 , and keep 0;
-		vaba.s16		$1, $0, $6		// f + abs(coef - 0)
-		vmull.s16		$7, $2, $4
-		vmull.s16		$8, $3, $5
-		vshr.s32		$7, #16
-		vshr.s32		$8, #16	
-		vmovn.s32		$2, $7
-		vmovn.s32		$3, $8	
-	
-		vcgt.s16		$7, $0, #0		// if true, location of coef == 11111111
-		vbif.s16		$6, $1, $7		// if (x<0) reserved part; else keep 0 untouched
-		vshl.s16		$6, #1
-		vmax.s16		$9, $2, $3
-		vsub.s16		$1, $1, $6		// if x > 0, -= 0; else x-= 2x
-//	}
-.endm
-
-.macro	QUANT_DUALWORD_COEF_EACH_16BITS	// if coef <= 0, - coef; else , coef;
-//	{	//	input:	coef, ff (dst), mf , working_d (all 0), working_q
-		vaba.s16		$1, $0, $3		// f + abs(coef - 0)
-		vmull.s16		$4, $1, $2		// *= mf
-		vshr.s32		$4, #16	
-		vmovn.s32		$1, $4			// >> 16
-	
-		vcgt.s16		$2, $0, #0		// if true, location of coef == 11111111
-		vbif.s16		$3, $1, $2		// if (x<0) reserved part; else keep 0 untouched
-		vshl.s16		$3, #1
-		vsub.s16		$1, $1, $3		// if x > 0, -= 0; else x-= 2x
-//	}
-.endm
-	
-.macro	DC_ZERO_COUNT_IN_DUALWORD
-//	{	//	input:	coef, dst_d, working_d (all 0x01)
-		vceq.s16	$1, $0, #0	
-		vand.s16	$1, $2
-		vpadd.s16	$1, $1, $1
-		vpadd.s16	$1, $1, $1
-//	}
-.endm
-
-.macro	SELECT_MAX_IN_ABS_COEF
-//	{	//	input:	coef_0, coef_1, max_q (identical to the following two)
-		vmax.s16		$2, $0, $1		// max 1st in $3 & max 2nd in $4
-		vpmax.s16		$3, $3, $4		// max 1st in $3[0][1] & max 2nd in $3[2][3]
-		vpmax.s16		$3, $3, $4		// max 1st in $3[0][1]
-//	}
-.endm
-	
-.macro	ZERO_COUNT_IN_2_QUARWORD
-//	{	//	input:	coef_0 (identical to $3 $4), coef_1 (identical to $5 $6), mask_q
-		vceq.s16	$0, #0
-		vceq.s16	$1, #0		
-		vand.s16	$0, $2
-		vand.s16	$1, $2
-	
-		vpadd.s16	$3, $3, $5
-		vpadd.s16	$4, $4, $6
-		vpadd.s16	$3, $3, $4		// 8-->4
-		vpadd.s16	$3, $3, $3
-		vpadd.s16	$3, $3, $3
-//	}
-.endm
-
-.macro	HDM_QUANT_2x2_TOTAL_16BITS
-//	{	//	input: src_d[0]~[3], working_d, dst_d
-		vshr.s64	$1, $0, #32
-		vadd.s16	$2, $0, $1		// [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
-		vsub.s16	$1, $0, $1		// [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];
-		vtrn.s16	$2, $1
-		vtrn.s32	$2, $1
-//	}
-.endm
-
-.macro	IHDM_4x4_TOTAL_16BITS
-//	{	//	input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2
-		vshr.s64	$1, $0, #32
-		vadd.s16	$2, $0, $1		// [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];
-		vsub.s16	$1, $0, $1		// [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];
-		vtrn.s16	$2, $1
-		vrev32.16	$1, $1	
-		vtrn.s32	$2, $1			// [0] = rs[0] + rs[2];[1] = rs[0] - rs[2];[2] = rs[1] - rs[3];[3] = rs[1] + rs[3];
-
-		vrev64.16	$1, $2	
-		vadd.s16	$0, $2, $1		// [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];
-		vsub.s16	$1, $2, $1	
-		vrev32.16	$1, $1			// [0] = rs[1] - rs[2];[1] = rs[0] - rs[3];
-		vtrn.s32	$0, $1			// [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];[2] = rs[1] - rs[2];[3] = rs[0] - rs[3];
-//	}
-.endm
-	
-.macro	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP
-//	{	//	input: pred_d[0]/[1](output), dct_q0/1, working_q0/1;
-		vmovl.u8		$4,$0
-		vmovl.u8		$5,$1
-		vadd.s16		$4,$2
-		vadd.s16		$5,$3	
-		vqmovun.s16	$0,$4
-		vqmovun.s16	$1,$5	
-//	}
-.endm
-	
-.macro	ROW_TRANSFORM_1_STEP_TOTAL_16BITS
-//	{	//	input: src_d[0]~[3], output: e_d[0]~[3];
-		vadd.s16		$4, $0, $2			//int16 e[i][0] = src[0] + src[2];
-		vsub.s16		$5, $0, $2			//int16 e[i][1] = src[0] - src[2];
-		vshr.s16		$6, $1, #1
-		vshr.s16		$7, $3, #1
-		vsub.s16		$6, $6, $3			//int16 e[i][2] = (src[1]>>1)-src[3];	
-		vadd.s16		$7, $1, $7			//int16 e[i][3] = src[1] + (src[3]>>1);		
-//	}
-.endm
-
-.macro	TRANSFORM_TOTAL_16BITS	// both row & col transform used
-//	{	//	output: f_q[0]~[3], input: e_q[0]~[3];
-		vadd.s16		$0, $4, $7			//int16 f[i][0] = e[i][0] + e[i][3];
-		vadd.s16		$1, $5, $6			//int16 f[i][1] = e[i][1] + e[i][2];
-		vsub.s16		$2, $5, $6			//int16 f[i][2] = e[i][1] - e[i][2];
-		vsub.s16		$3, $4, $7			//int16 f[i][3] = e[i][0] - e[i][3];
-//	}
-.endm
-				
-
-.macro	ROW_TRANSFORM_0_STEP
-//	{	//	input: src_d[0]~[3], output: e_q[0]~[3];
-		vaddl.s16		$4, $0, $2			//int32 e[i][0] = src[0] + src[2];
-		vsubl.s16		$5, $0, $2			//int32 e[i][1] = src[0] - src[2];
-		vsubl.s16		$6, $1, $3			//int32 e[i][2] = src[1] - src[3];	
-		vaddl.s16		$7, $1, $3			//int32 e[i][3] = src[1] + src[3];		
-//	}
-.endm
-
-.macro	ROW_TRANSFORM_1_STEP
-//	{	//	input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
-		vaddl.s16		$4, $0, $2			//int32 e[i][0] = src[0] + src[2];
-		vsubl.s16		$5, $0, $2			//int32 e[i][1] = src[0] - src[2];
-		vshr.s16		$8, $1, #1
-		vshr.s16		$9, $3, #1
-		vsubl.s16		$6, $8, $3			//int32 e[i][2] = (src[1]>>1)-src[3];	
-		vaddl.s16		$7, $1, $9			//int32 e[i][3] = src[1] + (src[3]>>1);		
-//	}
-.endm
-
-.macro	TRANSFORM_4BYTES	// both row & col transform used
-//	{	//	output: f_q[0]~[3], input: e_q[0]~[3];
-		vadd.s32		$0, $4, $7			//int16 f[i][0] = e[i][0] + e[i][3];
-		vadd.s32		$1, $5, $6			//int16 f[i][1] = e[i][1] + e[i][2];
-		vsub.s32		$2, $5, $6			//int16 f[i][2] = e[i][1] - e[i][2];
-		vsub.s32		$3, $4, $7			//int16 f[i][3] = e[i][0] - e[i][3];
-//	}
-.endm
-
-.macro	COL_TRANSFORM_0_STEP
-//	{	//	input: src_q[0]~[3], output: e_q[0]~[3];
-		vadd.s32		$4, $0, $2			//int32 e[0][j] = f[0][j] + f[2][j];
-		vsub.s32		$5, $0, $2			//int32 e[1][j] = f[0][j] - f[2][j];
-		vsub.s32		$6, $1, $3			//int32 e[2][j] = (f[1][j]>>1) - f[3][j];	
-		vadd.s32		$7, $1, $3			//int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-//	}
-.endm
-
-.macro	COL_TRANSFORM_1_STEP
-//	{	//	input: src_q[0]~[3], output: e_q[0]~[3];
-		vadd.s32		$4, $0, $2			//int32 e[0][j] = f[0][j] + f[2][j];
-		vsub.s32		$5, $0, $2			//int32 e[1][j] = f[0][j] - f[2][j];
-		vshr.s32		$6, $1, #1
-		vshr.s32		$7, $3, #1
-		vsub.s32		$6, $6, $3			//int32 e[2][j] = (f[1][j]>>1) - f[3][j];	
-		vadd.s32		$7, $1, $7			//int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-//	}
-.endm
-#else
-.macro	LORD_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
-//	{	//	input: \arg0~\arg3, src*, src_stride	
-		vld1.64	{\arg0}, [\arg4,:128], \arg5
-		vld1.64	{\arg1}, [\arg4,:128], \arg5
-		vld1.64	{\arg2}, [\arg4,:128], \arg5
-		vld1.64	{\arg3}, [\arg4,:128], \arg5
-//	}
-.endm
-
-.macro	STORE_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
-//	{	//	input: \arg0~\arg3, dst*, dst_stride	
-		vst1.64	{\arg0}, [\arg4,:128], \arg5
-		vst1.64	{\arg1}, [\arg4,:128], \arg5
-		vst1.64	{\arg2}, [\arg4,:128], \arg5
-		vst1.64	{\arg3}, [\arg4,:128], \arg5
-//	}
-.endm
-
-.macro	LORD_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
-//	{	//	input: \arg0~\arg3, src*, src_stride	
-		vld1.64	{\arg0}, [\arg4], \arg5
-		vld1.64	{\arg1}, [\arg4], \arg5
-		vld1.64	{\arg2}, [\arg4], \arg5
-		vld1.64	{\arg3}, [\arg4], \arg5
-//	}
-.endm
-
-.macro	STORE_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
-//	{	//	input: \arg0~\arg3, dst*, dst_stride	
-		vst1.64	{\arg0}, [\arg4], \arg5
-		vst1.64	{\arg1}, [\arg4], \arg5
-		vst1.64	{\arg2}, [\arg4], \arg5
-		vst1.64	{\arg3}, [\arg4], \arg5
-//	}
-.endm
-
-.macro	LOAD_4x4_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-//	{	//	input: \arg0~\arg3, src1*, src1_stride, src2*, src2_stride	
-		vld2.16	{\arg0[0],\arg1[0]}, [\arg4], \arg5
-		vld2.16	{\arg2[0],\arg3[0]}, [\arg6], \arg7
-		vld2.16	{\arg0[1],\arg1[1]}, [\arg4], \arg5
-		vld2.16	{\arg2[1],\arg3[1]}, [\arg6], \arg7
-
-		vld2.16	{\arg0[2],\arg1[2]}, [\arg4], \arg5
-		vld2.16	{\arg2[2],\arg3[2]}, [\arg6], \arg7
-		vld2.16	{\arg0[3],\arg1[3]}, [\arg4], \arg5
-		vld2.16	{\arg2[3],\arg3[3]}, [\arg6], \arg7
-//	}
-.endm
-
-.macro	LOAD_8x8_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
-//	{	//	input: \arg0~\arg3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride
-		vld1.64	{\arg0}, [\arg8], r2
-		vld1.64	{\arg4}, [\arg9], r4
-		vld1.64	{\arg1}, [\arg8], r2
-		vld1.64	{\arg5}, [\arg9], r4
-
-		vld1.64	{\arg2}, [\arg8], r2
-		vld1.64	{\arg6}, [\arg9], r4
-		vld1.64	{\arg3}, [\arg8], r2
-		vld1.64	{\arg7}, [\arg9], r4
-//	}
-.endm
-	
-.macro	DCT_ROW_TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-//	{	//	input: src_d[0]~[3], working: [4]~[7]
-		vadd.s16		\arg4, \arg0, \arg3			//int16 s[0] = data[i] + data[i3];
-		vsub.s16		\arg7, \arg0, \arg3			//int16 s[3] = data[i] - data[i3];
-		vadd.s16		\arg5, \arg1, \arg2			//int16 s[1] = data[i1] + data[i2];
-		vsub.s16		\arg6, \arg1, \arg2			//int16 s[2] = data[i1] - data[i2];		
-		
-		vadd.s16		\arg0, \arg4, \arg5			//int16 dct[i ] = s[0] + s[1];
-		vsub.s16		\arg2, \arg4, \arg5			//int16 dct[i2] = s[0] - s[1];		
-		vshl.s16		\arg1, \arg7, #1
-		vshl.s16		\arg3, \arg6, #1
-		vadd.s16		\arg1, \arg1, \arg6			//int16 dct[i1] = (s[3] << 1) + s[2];
-		vsub.s16		\arg3, \arg7, \arg3			//int16 dct[i3] = s[3] - (s[2] << 1);
-//	}
-.endm
-
-.macro	MATRIX_TRANSFORM_EACH_16BITS arg0, arg1, arg2, arg3
-//	{	//	input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
-		vtrn.s16		\arg0, \arg1				//[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
-		vtrn.s16		\arg2, \arg3				//[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
-		vtrn.32		\arg0, \arg2				//[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
-		vtrn.32		\arg1, \arg3				//[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
-//	}
-.endm
-
-.macro	NEWQUANT_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-//	{	//	input:	coef, ff (dst), ff_d0, ff_d1, mf_d0, md_d1
-		veor.s16		\arg6, \arg6			// init 0 , and keep 0;
-		vaba.s16		\arg1, \arg0, \arg6		// f + abs(coef - 0)
-		vmull.s16		\arg7, \arg2, \arg4
-		vmull.s16		\arg8, \arg3, \arg5
-		vshr.s32		\arg7, #16
-		vshr.s32		\arg8, #16	
-		vmovn.s32		\arg2, \arg7
-		vmovn.s32		\arg3, \arg8	
-	
-		vcgt.s16		\arg7, \arg0, #0		// if true, location of coef == 11111111
-		vbif.s16		\arg6, \arg1, \arg7		// if (x<0) reserved part; else keep 0 untouched
-		vshl.s16		\arg6, #1
-		vsub.s16		\arg1, \arg1, \arg6		// if x > 0, -= 0; else x-= 2x
-//	}
-.endm
-
-.macro	NEWQUANT_COEF_EACH_16BITS_MAX arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
-//	{	//	input:	coef, ff (dst), ff_d0, ff_d1, mf_d0(max), md_d1
-		veor.s16		\arg6, \arg6			// init 0 , and keep 0;
-		vaba.s16		\arg1, \arg0, \arg6		// f + abs(coef - 0)
-		vmull.s16		\arg7, \arg2, \arg4
-		vmull.s16		\arg8, \arg3, \arg5
-		vshr.s32		\arg7, #16
-		vshr.s32		\arg8, #16	
-		vmovn.s32		\arg2, \arg7
-		vmovn.s32		\arg3, \arg8	
-	
-		vcgt.s16		\arg7, \arg0, #0		// if true, location of coef == 11111111
-		vbif.s16		\arg6, \arg1, \arg7		// if (x<0) reserved part; else keep 0 untouched
-		vshl.s16		\arg6, #1
-		vmax.s16		\arg9, \arg2, \arg3
-		vsub.s16		\arg1, \arg1, \arg6		// if x > 0, -= 0; else x-= 2x
-//	}
-.endm
-
-.macro	QUANT_DUALWORD_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4
-//	{	//	input:	coef, ff (dst), mf , working_d (all 0), working_q
-		vaba.s16		\arg1, \arg0, \arg3		// f + abs(coef - 0)
-		vmull.s16		\arg4, \arg1, \arg2		// *= mf
-		vshr.s32		\arg4, #16	
-		vmovn.s32		\arg1, \arg4			// >> 16
-	
-		vcgt.s16		\arg2, \arg0, #0		// if true, location of coef == 11111111
-		vbif.s16		\arg3, \arg1, \arg2		// if (x<0) reserved part; else keep 0 untouched
-		vshl.s16		\arg3, #1
-		vsub.s16		\arg1, \arg1, \arg3		// if x > 0, -= 0; else x-= 2x
-//	}
-.endm
-	
-.macro	DC_ZERO_COUNT_IN_DUALWORD arg0, arg1, arg2
-//	{	//	input:	coef, dst_d, working_d (all 0x01)
-		vceq.s16	\arg1, \arg0, #0	
-		vand.s16	\arg1, \arg2
-		vpadd.s16	\arg1, \arg1, \arg1
-		vpadd.s16	\arg1, \arg1, \arg1
-//	}
-.endm
-
-.macro	SELECT_MAX_IN_ABS_COEF arg0, arg1, arg2, arg3, arg4
-//	{	//	input:	coef_0, coef_1, max_q (identical to the following two), output: max_d0, max_d1
-		vmax.s16		\arg2, \arg0, \arg1		// max 1st in \arg3 & max 2nd in \arg4
-		vpmax.s16		\arg3, \arg3, \arg4		// max 1st in \arg3[0][1] & max 2nd in \arg3[2][3]
-		vpmax.s16		\arg3, \arg3, \arg4		// max 1st in \arg3[0][1]
-//	}
-.endm
-	
-.macro	ZERO_COUNT_IN_2_QUARWORD arg0, arg1, arg2, arg3, arg4, arg5, arg6
-//	{	//	input:	coef_0 (identical to \arg3 \arg4), coef_1 (identical to \arg5 \arg6), mask_q
-		vceq.s16	\arg0, #0
-		vceq.s16	\arg1, #0		
-		vand.s16	\arg0, \arg2
-		vand.s16	\arg1, \arg2
-	
-		vpadd.s16	\arg3, \arg3, \arg5
-		vpadd.s16	\arg4, \arg4, \arg6
-		vpadd.s16	\arg3, \arg3, \arg4		// 8-->4
-		vpadd.s16	\arg3, \arg3, \arg3
-		vpadd.s16	\arg3, \arg3, \arg3
-//	}
-.endm
-
-.macro	HDM_QUANT_2x2_TOTAL_16BITS arg0, arg1, arg2
-//	{	//	input: src_d[0]~[3], working_d, dst_d
-		vshr.s64	\arg1, \arg0, #32
-		vadd.s16	\arg2, \arg0, \arg1		// [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
-		vsub.s16	\arg1, \arg0, \arg1		// [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];
-		vtrn.s16	\arg2, \arg1
-		vtrn.s32	\arg2, \arg1
-//	}
-.endm
-
-.macro	IHDM_4x4_TOTAL_16BITS arg0, arg1, arg2
-//	{	//	input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2
-		vshr.s64	\arg1, \arg0, #32
-		vadd.s16	\arg2, \arg0, \arg1		// [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];
-		vsub.s16	\arg1, \arg0, \arg1		// [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];
-		vtrn.s16	\arg2, \arg1
-		vrev32.16	\arg1, \arg1	
-		vtrn.s32	\arg2, \arg1			// [0] = rs[0] + rs[2];[1] = rs[0] - rs[2];[2] = rs[1] - rs[3];[3] = rs[1] + rs[3];
-
-		vrev64.16	\arg1, \arg2	
-		vadd.s16	\arg0, \arg2, \arg1		// [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];
-		vsub.s16	\arg1, \arg2, \arg1	
-		vrev32.16	\arg1, \arg1			// [0] = rs[1] - rs[2];[1] = rs[0] - rs[3];
-		vtrn.s32	\arg0, \arg1			// [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];[2] = rs[1] - rs[2];[3] = rs[0] - rs[3];
-//	}
-.endm
-	
-.macro	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP arg0, arg1, arg2, arg3, arg4, arg5
-//	{	//	input: pred_d[0]/[1](output), dct_q0/1, working_q0/1;
-		vmovl.u8		\arg4,\arg0
-		vmovl.u8		\arg5,\arg1
-		vadd.s16		\arg4,\arg2
-		vadd.s16		\arg5,\arg3	
-		vqmovun.s16	\arg0,\arg4
-		vqmovun.s16	\arg1,\arg5	
-//	}
-.endm
-	
-.macro	ROW_TRANSFORM_1_STEP_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-//	{	//	input: src_d[0]~[3], output: e_d[0]~[3];
-		vadd.s16		\arg4, \arg0, \arg2			//int16 e[i][0] = src[0] + src[2];
-		vsub.s16		\arg5, \arg0, \arg2			//int16 e[i][1] = src[0] - src[2];
-		vshr.s16		\arg6, \arg1, #1
-		vshr.s16		\arg7, \arg3, #1
-		vsub.s16		\arg6, \arg6, \arg3			//int16 e[i][2] = (src[1]>>1)-src[3];	
-		vadd.s16		\arg7, \arg1, \arg7			//int16 e[i][3] = src[1] + (src[3]>>1);		
-//	}
-.endm
-
-.macro	TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7	// both row & col transform used
-//	{	//	output: f_q[0]~[3], input: e_q[0]~[3];
-		vadd.s16		\arg0, \arg4, \arg7			//int16 f[i][0] = e[i][0] + e[i][3];
-		vadd.s16		\arg1, \arg5, \arg6			//int16 f[i][1] = e[i][1] + e[i][2];
-		vsub.s16		\arg2, \arg5, \arg6			//int16 f[i][2] = e[i][1] - e[i][2];
-		vsub.s16		\arg3, \arg4, \arg7			//int16 f[i][3] = e[i][0] - e[i][3];
-//	}
-.endm
-				
-
-.macro	ROW_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-//	{	//	input: src_d[0]~[3], output: e_q[0]~[3];
-		vaddl.s16		\arg4, \arg0, \arg2			//int32 e[i][0] = src[0] + src[2];
-		vsubl.s16		\arg5, \arg0, \arg2			//int32 e[i][1] = src[0] - src[2];
-		vsubl.s16		\arg6, \arg1, \arg3			//int32 e[i][2] = src[1] - src[3];	
-		vaddl.s16		\arg7, \arg1, \arg3			//int32 e[i][3] = src[1] + src[3];		
-//	}
-.endm
-
-.macro	ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
-//	{	//	input: src_d[0]~[3], output: e_q[0]~[3]; working: \arg8 \arg9
-		vaddl.s16		\arg4, \arg0, \arg2			//int32 e[i][0] = src[0] + src[2];
-		vsubl.s16		\arg5, \arg0, \arg2			//int32 e[i][1] = src[0] - src[2];
-		vshr.s16		\arg8, \arg1, #1
-		vshr.s16		\arg9, \arg3, #1
-		vsubl.s16		\arg6, \arg8, \arg3			//int32 e[i][2] = (src[1]>>1)-src[3];	
-		vaddl.s16		\arg7, \arg1, \arg9			//int32 e[i][3] = src[1] + (src[3]>>1);		
-//	}
-.endm
-
-.macro	TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7	// both row & col transform used
-//	{	//	output: f_q[0]~[3], input: e_q[0]~[3];
-		vadd.s32		\arg0, \arg4, \arg7			//int16 f[i][0] = e[i][0] + e[i][3];
-		vadd.s32		\arg1, \arg5, \arg6			//int16 f[i][1] = e[i][1] + e[i][2];
-		vsub.s32		\arg2, \arg5, \arg6			//int16 f[i][2] = e[i][1] - e[i][2];
-		vsub.s32		\arg3, \arg4, \arg7			//int16 f[i][3] = e[i][0] - e[i][3];
-//	}
-.endm
-
-.macro	COL_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-//	{	//	input: src_q[0]~[3], output: e_q[0]~[3];
-		vadd.s32		\arg4, \arg0, \arg2			//int32 e[0][j] = f[0][j] + f[2][j];
-		vsub.s32		\arg5, \arg0, \arg2			//int32 e[1][j] = f[0][j] - f[2][j];
-		vsub.s32		\arg6, \arg1, \arg3			//int32 e[2][j] = (f[1][j]>>1) - f[3][j];	
-		vadd.s32		\arg7, \arg1, \arg3			//int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-//	}
-.endm
-
-.macro	COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-//	{	//	input: src_q[0]~[3], output: e_q[0]~[3];
-		vadd.s32		\arg4, \arg0, \arg2			//int32 e[0][j] = f[0][j] + f[2][j];
-		vsub.s32		\arg5, \arg0, \arg2			//int32 e[1][j] = f[0][j] - f[2][j];
-		vshr.s32		\arg6, \arg1, #1
-		vshr.s32		\arg7, \arg3, #1
-		vsub.s32		\arg6, \arg6, \arg3			//int32 e[2][j] = (f[1][j]>>1) - f[3][j];	
-		vadd.s32		\arg7, \arg1, \arg7			//int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-//	}
-.endm
-#endif
-
-
-WELS_ASM_FUNC_BEGIN WelsCopy8x8_neon
-
-	LORD_UNALIGNED_DATA_WITH_STRIDE	d0, d1, d2, d3, r2, r3
-
-	STORE_UNALIGNED_DATA_WITH_STRIDE	d0, d1, d2, d3, r0, r1
-
-	LORD_UNALIGNED_DATA_WITH_STRIDE	d4, d5, d6, d7, r2, r3
-
-	STORE_UNALIGNED_DATA_WITH_STRIDE	d4, d5, d6, d7, r0, r1
-	
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN WelsCopy16x16_neon
-
-	LORD_ALIGNED_DATA_WITH_STRIDE	q0, q1, q2, q3, r2, r3
-
-	STORE_ALIGNED_DATA_WITH_STRIDE	q0, q1, q2, q3, r0, r1
-
-	LORD_ALIGNED_DATA_WITH_STRIDE	q4, q5, q6, q7, r2, r3
-
-	STORE_ALIGNED_DATA_WITH_STRIDE	q4, q5, q6, q7, r0, r1
-
-	LORD_ALIGNED_DATA_WITH_STRIDE	q0, q1, q2, q3, r2, r3
-
-	STORE_ALIGNED_DATA_WITH_STRIDE	q0, q1, q2, q3, r0, r1
-
-	LORD_ALIGNED_DATA_WITH_STRIDE	q4, q5, q6, q7, r2, r3
-
-	STORE_ALIGNED_DATA_WITH_STRIDE	q4, q5, q6, q7, r0, r1
-	
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN WelsCopy16x16NotAligned_neon
-
-	LORD_UNALIGNED_DATA_WITH_STRIDE	q0, q1, q2, q3, r2, r3
-
-	STORE_UNALIGNED_DATA_WITH_STRIDE	q0, q1, q2, q3, r0, r1
-
-	LORD_UNALIGNED_DATA_WITH_STRIDE	q4, q5, q6, q7, r2, r3
-
-	STORE_UNALIGNED_DATA_WITH_STRIDE	q4, q5, q6, q7, r0, r1
-
-	LORD_UNALIGNED_DATA_WITH_STRIDE	q0, q1, q2, q3, r2, r3
-
-	STORE_UNALIGNED_DATA_WITH_STRIDE	q0, q1, q2, q3, r0, r1
-
-	LORD_UNALIGNED_DATA_WITH_STRIDE	q4, q5, q6, q7, r2, r3
-
-	STORE_UNALIGNED_DATA_WITH_STRIDE	q4, q5, q6, q7, r0, r1
-	
-WELS_ASM_FUNC_END
-  	
-
-WELS_ASM_FUNC_BEGIN WelsCopy16x8NotAligned_neon
-
-	LORD_UNALIGNED_DATA_WITH_STRIDE	q0, q1, q2, q3, r2, r3
-
-	STORE_UNALIGNED_DATA_WITH_STRIDE	q0, q1, q2, q3, r0, r1
-
-	LORD_UNALIGNED_DATA_WITH_STRIDE	q4, q5, q6, q7, r2, r3
-
-	STORE_UNALIGNED_DATA_WITH_STRIDE	q4, q5, q6, q7, r0, r1
-	
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN WelsCopy8x16_neon
-
-	LORD_UNALIGNED_DATA_WITH_STRIDE	d0, d1, d2, d3, r2, r3
-
-	STORE_UNALIGNED_DATA_WITH_STRIDE	d0, d1, d2, d3, r0, r1
-
-	LORD_UNALIGNED_DATA_WITH_STRIDE	d4, d5, d6, d7, r2, r3
-
-	STORE_UNALIGNED_DATA_WITH_STRIDE	d4, d5, d6, d7, r0, r1
-	
-	LORD_UNALIGNED_DATA_WITH_STRIDE	d0, d1, d2, d3, r2, r3
-
-	STORE_UNALIGNED_DATA_WITH_STRIDE	d0, d1, d2, d3, r0, r1
-
-	LORD_UNALIGNED_DATA_WITH_STRIDE	d4, d5, d6, d7, r2, r3
-
-	STORE_UNALIGNED_DATA_WITH_STRIDE	d4, d5, d6, d7, r0, r1
-	
-WELS_ASM_FUNC_END
-  	
-
-
-WELS_ASM_FUNC_BEGIN WelsDctT4_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
-
-	LOAD_4x4_DATA_FOR_DCT	d4, d5, d6, d7, r1, r2, r3, r4
-		
-	vsubl.u8	q0, d4, d6
-	vsubl.u8	q1, d5, d7
-	vtrn.s32	q0, q1
-	vswp		d1, d2
-
-	// horizontal transform
-	DCT_ROW_TRANSFORM_TOTAL_16BITS		d0, d1, d2, d3, d4, d5, d6, d7
-
-	// transform element
-	MATRIX_TRANSFORM_EACH_16BITS	d0, d1, d2, d3
-	
-	//	vertical transform
-	DCT_ROW_TRANSFORM_TOTAL_16BITS		d0, d1, d2, d3, d4, d5, d6, d7
-
-	// transform element
-	MATRIX_TRANSFORM_EACH_16BITS	d0, d1, d2, d3
-	
-	vst1.s16		{q0, q1}, [r0]!
-
-	pop		{r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN WelsDctFourT4_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
-
-	LOAD_8x8_DATA_FOR_DCT	d8, d9, d10, d11, d12, d13, d14, d15, r1, r3
-
-	vsubl.u8	q0,  d8, d12
-	vsubl.u8	q1,  d9, d13
-	vsubl.u8	q2, d10, d14
-	vsubl.u8	q3, d11, d15
-	MATRIX_TRANSFORM_EACH_16BITS	q0, q1, q2, q3
-
-	// horizontal transform
-	DCT_ROW_TRANSFORM_TOTAL_16BITS		q0, q1, q2, q3, q4, q5, q6, q7
-
-	// transform element
-	MATRIX_TRANSFORM_EACH_16BITS	q0, q1, q2, q3
-	
-	//	vertical transform
-	DCT_ROW_TRANSFORM_TOTAL_16BITS		q0, q1, q2, q3, q4, q5, q6, q7
-
-	vswp		d1, d2
-	vswp		d5, d6
-	vswp		q1, q2	
-	vst1.s16		{q0, q1}, [r0]!
-	vst1.s16		{q2, q3}, [r0]!
-	
-	////////////////
-	LOAD_8x8_DATA_FOR_DCT	d8, d9, d10, d11, d12, d13, d14, d15, r1, r3
-
-	vsubl.u8	q0,  d8, d12
-	vsubl.u8	q1,  d9, d13
-	vsubl.u8	q2, d10, d14
-	vsubl.u8	q3, d11, d15
-	MATRIX_TRANSFORM_EACH_16BITS	q0, q1, q2, q3
-
-	// horizontal transform
-	DCT_ROW_TRANSFORM_TOTAL_16BITS		q0, q1, q2, q3, q4, q5, q6, q7
-
-	// transform element
-	MATRIX_TRANSFORM_EACH_16BITS	q0, q1, q2, q3
-	
-	//	vertical transform
-	DCT_ROW_TRANSFORM_TOTAL_16BITS		q0, q1, q2, q3, q4, q5, q6, q7
-	
-	vswp		d1, d2
-	vswp		d5, d6
-	vswp		q1, q2
-	vst1.s16		{q0, q1}, [r0]!
-	vst1.s16		{q2, q3}, [r0]!
-			
-	pop		{r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN WelsQuant4x4_neon
-	vld1.s16		{q2}, [r1]
-	vld1.s16		{q0, q1}, [r0]
-	vld1.s16		{q3}, [r2]	
-
-	vmov			q4, q2
-	
-	NEWQUANT_COEF_EACH_16BITS	q0, q2, d4, d5, d6, d7, q5, q6, q7
-	vst1.s16		{q2}, [r0]!
-
-	NEWQUANT_COEF_EACH_16BITS	q1, q4, d8, d9, d6, d7, q5, q6, q7
-	vst1.s16		{q4}, [r0]!
-
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN WelsQuant4x4Dc_neon
-
-	vld1.s16		{q0, q1}, [r0]
-	vdup.s16		q2, r1		// even ff range [0, 768]
-	vdup.s16		q3, r2
-
-	vmov			q4, q2
-
-	NEWQUANT_COEF_EACH_16BITS	q0, q2, d4, d5, d6, d7, q5, q6, q7
-	vst1.s16		{q2}, [r0]!
-
-	NEWQUANT_COEF_EACH_16BITS	q1, q4, d8, d9, d6, d7, q5, q6, q7
-	vst1.s16		{q4}, [r0]!
-
-WELS_ASM_FUNC_END
-	
-
-WELS_ASM_FUNC_BEGIN WelsQuantFour4x4_neon
-	vld1.s16		{q2}, [r1]
-	vld1.s16		{q3}, [r2]	
-	mov				r1, r0
-	
-	vld1.s16		{q0, q1}, [r0]!
-	vmov			q4, q2
-	NEWQUANT_COEF_EACH_16BITS	q0, q4, d8, d9, d6, d7, q5, q6, q7
-	vst1.s16		{q4}, [r1]!
-	vmov			q4, q2
-	NEWQUANT_COEF_EACH_16BITS	q1, q4, d8, d9, d6, d7, q5, q6, q7
-	vst1.s16		{q4}, [r1]!
-	
-	vld1.s16		{q0, q1}, [r0]!
-	vmov			q4, q2
-	NEWQUANT_COEF_EACH_16BITS	q0, q4, d8, d9, d6, d7, q5, q6, q7
-	vst1.s16		{q4}, [r1]!
-	vmov			q4, q2
-	NEWQUANT_COEF_EACH_16BITS	q1, q4, d8, d9, d6, d7, q5, q6, q7
-	vst1.s16		{q4}, [r1]!	
-
-	vld1.s16		{q0, q1}, [r0]!	
-	vmov			q4, q2
-	NEWQUANT_COEF_EACH_16BITS	q0, q4, d8, d9, d6, d7, q5, q6, q7
-	vst1.s16		{q4}, [r1]!
-	vmov			q4, q2
-	NEWQUANT_COEF_EACH_16BITS	q1, q4, d8, d9, d6, d7, q5, q6, q7
-	vst1.s16		{q4}, [r1]!
-
-	vld1.s16		{q0, q1}, [r0]!	
-	vmov			q4, q2
-	NEWQUANT_COEF_EACH_16BITS	q0, q4, d8, d9, d6, d7, q5, q6, q7
-	vst1.s16		{q4}, [r1]!
-	vmov			q4, q2
-	NEWQUANT_COEF_EACH_16BITS	q1, q4, d8, d9, d6, d7, q5, q6, q7
-	vst1.s16		{q4}, [r1]!	
-	
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN WelsQuantFour4x4Max_neon
-	vld1.s16		{q2}, [r1]
-	vld1.s16		{q3}, [r2]	
-	mov				r1, r0
-
-	vld1.s16		{q0, q1}, [r0]!
-	vmov			q4, q2
-	NEWQUANT_COEF_EACH_16BITS_MAX	q0, q4, d8, d9, d6, d7, q5, q6, q7, d18
-	vst1.s16		{q4}, [r1]!
-	vmov			q8, q2
-	NEWQUANT_COEF_EACH_16BITS_MAX	q1, q8, d16, d17, d6, d7, q5, q6, q7, d20
-	vst1.s16		{q8}, [r1]!		// then 1st 16 elem in d18 & d20
-	
-	vld1.s16		{q0, q1}, [r0]!
-	vmov			q4, q2
-	NEWQUANT_COEF_EACH_16BITS_MAX	q0, q4, d8, d9, d6, d7, q5, q6, q7, d19
-	vst1.s16		{q4}, [r1]!
-	vmov			q8, q2
-	NEWQUANT_COEF_EACH_16BITS_MAX	q1, q8, d16, d17, d6, d7, q5, q6, q7, d21
-	vst1.s16		{q8}, [r1]!	// then 2nd 16 elem in d19 & d21
-	
-	SELECT_MAX_IN_ABS_COEF	q9, q10, q0, d0, d1
-	vst1.s32		{d0[0]}, [r3]!
-	
-	///////////
-	vld1.s16		{q0, q1}, [r0]!
-	vmov			q4, q2
-	NEWQUANT_COEF_EACH_16BITS_MAX	q0, q4, d8, d9, d6, d7, q5, q6, q7, d18
-	vst1.s16		{q4}, [r1]!
-	vmov			q8, q2
-	NEWQUANT_COEF_EACH_16BITS_MAX	q1, q8, d16, d17, d6, d7, q5, q6, q7, d20
-	vst1.s16		{q8}, [r1]!		// then 3rd 16 elem in d18 & d20
-	
-	vld1.s16		{q0, q1}, [r0]!
-	vmov			q4, q2
-	NEWQUANT_COEF_EACH_16BITS_MAX	q0, q4, d8, d9, d6, d7, q5, q6, q7, d19
-	vst1.s16		{q4}, [r1]!
-	vmov			q8, q2
-	NEWQUANT_COEF_EACH_16BITS_MAX	q1, q8, d16, d17, d6, d7, q5, q6, q7, d21
-	vst1.s16		{q8}, [r1]!	// then 4th 16 elem in d19 & d21
-	
-	SELECT_MAX_IN_ABS_COEF	q9, q10, q0, d0, d1
-	vst1.s32		{d0[0]}, [r3]!
-		
-WELS_ASM_FUNC_END
-	          
-
-WELS_ASM_FUNC_BEGIN WelsHadamardT4Dc_neon
-	push	{r2,r3}
-	mov		r2, #64	// 2*16*sizeof(int16_t)
-	add		r3, r1, #32
-
-	vld1.s16		{d0}, [r1], r2
-	vld1.s16		{d1}, [r3], r2
-	vld1.s16		{d4}, [r1], r2
-	vld1.s16		{d5}, [r3], r2
-	vld1.s16		{d2}, [r1], r2
-	vld1.s16		{d3}, [r3], r2
-	vld1.s16		{d6}, [r1], r2
-	vld1.s16		{d7}, [r3], r2
-	vtrn.16		q0, q2		// d0[0 4], d1[1 5]
-	vtrn.16		q1, q3		// d2[2 6], d3[3 7]
-	
-	vld1.s16		{d8}, [r1], r2
-	vld1.s16		{d9}, [r3], r2
-	vld1.s16		{d12}, [r1], r2
-	vld1.s16		{d13}, [r3], r2
-	vld1.s16		{d10}, [r1], r2
-	vld1.s16		{d11}, [r3], r2
-	vld1.s16		{d14}, [r1], r2
-	vld1.s16		{d15}, [r3], r2
-	vtrn.16		q4, q6		// d8[08 12], d9[09 13]
-	vtrn.16		q5, q7		//d10[10 14],d11[11 15]
-	
-	vtrn.32		q0, q4		// d0 [0 4 08 12] = dct[idx],		d1[1 5 09 13] = dct[idx+16]
-	vtrn.32		q1, q5		// d2 [2 6 10 14] = dct[idx+64],	d3[3 7 11 15] = dct[idx+80]
-	
-	ROW_TRANSFORM_0_STEP	d0, d1, d3, d2, q4, q7, q6, q5
-	
-	TRANSFORM_4BYTES		q0, q1, q3, q2, q4, q7, q6, q5
-
-	// transform element 32bits
-	vtrn.s32		q0, q1				//[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
-	vtrn.s32		q2, q3				//[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
-	vswp			d1, d4				//[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
-	vswp			d3, d6				//[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
-	
-	COL_TRANSFORM_0_STEP	q0, q1, q3, q2, q4, q7, q6, q5
-	
-	TRANSFORM_4BYTES		q0, q1, q3, q2, q4, q7, q6, q5		
-	
-	vrshrn.s32		d8, q0, #1
-	vrshrn.s32		d9, q1, #1	
-	vrshrn.s32		d10, q2, #1
-	vrshrn.s32		d11, q3, #1
-	vst1.16	{q4, q5}, [r0]	//store
-
-	pop		{r2,r3}
-WELS_ASM_FUNC_END
-	
-
-WELS_ASM_FUNC_BEGIN WelsHadamardQuant2x2_neon
-
-	vdup.s16	d1, r1				//ff
-	vdup.s16	d2, r2				//mf
-	veor		d3, d3
-		
-	mov			r1, #32
-	mov			r2, r0
-	
-	vld1.s16	{d0[0]}, [r0], r1		//rs[00]
-	vst1.s16	{d3[0]}, [r2], r1		//rs[00]=0	
-	vld1.s16	{d0[1]}, [r0], r1		//rs[16]
-	vst1.s16	{d3[0]}, [r2], r1		//rs[16]=0		
-	vld1.s16	{d0[2]}, [r0], r1		//rs[32]	
-	vst1.s16	{d3[0]}, [r2], r1		//rs[32]=0		
-	vld1.s16	{d0[3]}, [r0], r1		//rs[48]
-	vst1.s16	{d3[0]}, [r2], r1		//rs[48]=0	
-
-	HDM_QUANT_2x2_TOTAL_16BITS	d0, d4, d5		// output d5
-
-	HDM_QUANT_2x2_TOTAL_16BITS	d5, d4, d0		// output d0
-		
-	QUANT_DUALWORD_COEF_EACH_16BITS	d0, d1, d2, d3, q2	
-	
-	vst1.s16	d1, [r3]		// store to dct
-	ldr			r2, [sp, #0]
-	vst1.s16	d1, [r2]		// store to block
-
-	mov			r1, #1
-	vdup.s16	d3, r1
-	DC_ZERO_COUNT_IN_DUALWORD	d1, d0, d3
-	
-	vmov	r0, r1, d0
-	and		r0, #0x07		// range [0~4]
-	rsb		r0, #4
-WELS_ASM_FUNC_END
-	
-
-WELS_ASM_FUNC_BEGIN WelsHadamardQuant2x2SkipKernel_neon
-
-	vdup.s16	d3, r1
-	mov			r1, #32
-	vld1.s16	{d0[0]}, [r0], r1		//rs[00]
-	vld1.s16	{d0[1]}, [r0], r1		//rs[16]
-	vld1.s16	{d0[2]}, [r0], r1		//rs[32]	
-	vld1.s16	{d0[3]}, [r0], r1		//rs[48]
-
-	HDM_QUANT_2x2_TOTAL_16BITS	d0, d1, d2		// output d2
-
-	HDM_QUANT_2x2_TOTAL_16BITS	d2, d1, d0		// output d0
-
-	vabs.s16	d1, d0
-	vcgt.s16	d1, d1, d3		// abs(dct[i])>threshold;
-	vmov	r0, r1, d1
-	orr		r0, r1
-WELS_ASM_FUNC_END
-	
-
-WELS_ASM_FUNC_BEGIN WelsGetNoneZeroCount_neon
-	push	{r1}
-	vld1.s16	{q0, q1}, [r0]
-	vmov.s16	q8, #1
-	
-	ZERO_COUNT_IN_2_QUARWORD	q0, q1, q8, d0, d1, d2, d3
-	vmov	r0, r1, d0
-	and		r0, #0x1F	// range [0~16]
-	rsb		r0, #16
-	pop		{r1}
-WELS_ASM_FUNC_END
-	
-
-WELS_ASM_FUNC_BEGIN WelsDequant4x4_neon
-	vld1.s16	{q0, q1}, [r0]
-	vld1.u16	{q2}, [r1]
-
-	vmul.s16	q4, q0, q2
-	vmul.s16	q5, q1, q2
-	
-	vst1.s16	{q4, q5}, [r0]
-WELS_ASM_FUNC_END
-	
-
-WELS_ASM_FUNC_BEGIN WelsDequantFour4x4_neon
-	vld1.u16	{q8}, [r1]
-	mov		r1, r0
-	vld1.s16	{q0, q1}, [r0]!
-	vld1.s16	{q2, q3}, [r0]!
-	vmul.s16	q0, q0, q8	
-	vld1.s16	{q4, q5}, [r0]!
-	vmul.s16	q1, q1, q8	
-	vld1.s16	{q6, q7}, [r0]!
-	
-	vst1.s16	{q0, q1}, [r1]!
-				
-	vmul.s16	q2, q2, q8
-	vmul.s16	q3, q3, q8
-	vmul.s16	q4, q4, q8
-	vst1.s16	{q2, q3}, [r1]!
-		
-	vmul.s16	q5, q5, q8
-	vmul.s16	q6, q6, q8
-	vmul.s16	q7, q7, q8	
-	vst1.s16	{q4, q5}, [r1]!		
-	vst1.s16	{q6, q7}, [r1]!
-	
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN WelsDequantIHadamard4x4_neon
-
-	vld1.s16	{q0, q1}, [r0]
-	vdup.s16	q4, r1
-	
-	IHDM_4x4_TOTAL_16BITS	q0, q2, q3
-	IHDM_4x4_TOTAL_16BITS	q1, q2, q3
-	
-	MATRIX_TRANSFORM_EACH_16BITS	d0, d1, d2, d3
-	
-	IHDM_4x4_TOTAL_16BITS	q0, q2, q3
-	vmul.s16	q0, q4
-	
-	IHDM_4x4_TOTAL_16BITS	q1, q2, q3		
-	vmul.s16	q1, q4
-
-	MATRIX_TRANSFORM_EACH_16BITS	d0, d1, d2, d3	
-	vst1.s16	{q0, q1}, [r0]
-WELS_ASM_FUNC_END
-	
-
-WELS_ASM_FUNC_BEGIN WelsIDctT4Rec_neon
-	vld1.u32		{d14[0]}, [r2], r3
-	push			{r4}
-	ldr				r4, [sp, #4]
-	vld1.u32		{d14[1]}, [r2], r3
-		
-	vld4.s16		{d0, d1, d2, d3}, [r4]		// cost 3 cycles!	
-	vld1.u32		{d15[0]}, [r2], r3	
-	vld1.u32		{d15[1]}, [r2], r3			// q7 is pred
-	
-	ROW_TRANSFORM_1_STEP_TOTAL_16BITS		d0, d1, d2, d3, d4, d5, d6, d7
-	
-	TRANSFORM_TOTAL_16BITS		d0, d1, d2, d3, d4, d5, d6, d7
-	
-	MATRIX_TRANSFORM_EACH_16BITS	d0, d1, d2, d3
-
-	ROW_TRANSFORM_1_STEP_TOTAL_16BITS		d0, d1, d2, d3, d4, d5, d6, d7
-	
-	TRANSFORM_TOTAL_16BITS		d0, d1, d2, d3, d4, d5, d6, d7
-	vrshr.s16		d0, d0, #6
-	vrshr.s16		d1, d1, #6
-	vrshr.s16		d2, d2, #6
-	vrshr.s16		d3, d3, #6
-				
-	//after rounding 6, clip into [0, 255]
-	vmovl.u8		q2,d14
-	vadd.s16		q0,q2
-	vqmovun.s16	d14,q0
-	vst1.32		{d14[0]},[r0],r1
-	vst1.32		{d14[1]},[r0],r1
-	
-	vmovl.u8		q2,d15
-	vadd.s16		q1,q2
-	vqmovun.s16	d15,q1
-	vst1.32		{d15[0]},[r0],r1
-	vst1.32		{d15[1]},[r0]
-	
-	pop			{r4}
-WELS_ASM_FUNC_END
-	
-
-WELS_ASM_FUNC_BEGIN WelsIDctFourT4Rec_neon
-
-	vld1.u64		{d16}, [r2], r3
-	push			{r4}
-	ldr				r4, [sp, #4]
-	vld1.u64		{d17}, [r2], r3
-		
-	vld4.s16		{d0, d1, d2, d3}, [r4]!		// cost 3 cycles!	
-	vld1.u64		{d18}, [r2], r3	
-	vld1.u64		{d19}, [r2], r3
-	vld4.s16		{d4, d5, d6, d7}, [r4]!		// cost 3 cycles!	
-	vswp			d1, d4
-	vswp			d3, d6
-	vswp			q1, q2						// q0~q3
-	
-	ROW_TRANSFORM_1_STEP_TOTAL_16BITS		q0, q1, q2, q3, q4, q5, q6, q7
-	
-	TRANSFORM_TOTAL_16BITS		q0, q1, q2, q3, q4, q5, q6, q7
-	
-	MATRIX_TRANSFORM_EACH_16BITS	q0, q1, q2, q3
-
-	ROW_TRANSFORM_1_STEP_TOTAL_16BITS		q0, q1, q2, q3, q4, q5, q6, q7
-	
-	TRANSFORM_TOTAL_16BITS		q0, q1, q2, q3, q4, q5, q6, q7
-	vrshr.s16		q0, q0, #6
-	vrshr.s16		q1, q1, #6
-	vrshr.s16		q2, q2, #6
-	vrshr.s16		q3, q3, #6
-				
-	//after rounding 6, clip into [0, 255]
-	vmovl.u8		q4,d16
-	vadd.s16		q0,q4
-	vqmovun.s16	d16,q0
-	vst1.u8		{d16},[r0],r1
-
-	vmovl.u8		q4,d17
-	vadd.s16		q1,q4
-	vqmovun.s16	d17,q1
-	vst1.u8		{d17},[r0],r1
-
-	vmovl.u8		q4,d18
-	vadd.s16		q2,q4
-	vqmovun.s16	d18,q2
-	vst1.u8		{d18},[r0],r1
-
-	vmovl.u8		q4,d19
-	vadd.s16		q3,q4
-	vqmovun.s16	d19,q3
-	vst1.u8		{d19},[r0],r1
-
-	vld1.u64		{d16}, [r2], r3
-	vld1.u64		{d17}, [r2], r3
-		
-	vld4.s16		{d0, d1, d2, d3}, [r4]!		// cost 3 cycles!	
-	vld1.u64		{d18}, [r2], r3	
-	vld1.u64		{d19}, [r2], r3
-	vld4.s16		{d4, d5, d6, d7}, [r4]!		// cost 3 cycles!	
-	vswp			d1, d4
-	vswp			d3, d6
-	vswp			q1, q2						// q0~q3
-	
-	ROW_TRANSFORM_1_STEP_TOTAL_16BITS		q0, q1, q2, q3, q4, q5, q6, q7
-	
-	TRANSFORM_TOTAL_16BITS		q0, q1, q2, q3, q4, q5, q6, q7
-	
-	MATRIX_TRANSFORM_EACH_16BITS	q0, q1, q2, q3
-
-	ROW_TRANSFORM_1_STEP_TOTAL_16BITS		q0, q1, q2, q3, q4, q5, q6, q7
-	
-	TRANSFORM_TOTAL_16BITS		q0, q1, q2, q3, q4, q5, q6, q7
-	vrshr.s16		q0, q0, #6
-	vrshr.s16		q1, q1, #6
-	vrshr.s16		q2, q2, #6
-	vrshr.s16		q3, q3, #6
-				
-	//after rounding 6, clip into [0, 255]
-	vmovl.u8		q4,d16
-	vadd.s16		q0,q4
-	vqmovun.s16	d16,q0
-	vst1.u8		{d16},[r0],r1
-
-	vmovl.u8		q4,d17
-	vadd.s16		q1,q4
-	vqmovun.s16	d17,q1
-	vst1.u8		{d17},[r0],r1
-
-	vmovl.u8		q4,d18
-	vadd.s16		q2,q4
-	vqmovun.s16	d18,q2
-	vst1.u8		{d18},[r0],r1
-
-	vmovl.u8		q4,d19
-	vadd.s16		q3,q4
-	vqmovun.s16	d19,q3
-	vst1.u8		{d19},[r0],r1	
-	
-	pop			{r4}
-WELS_ASM_FUNC_END
-	
-
-WELS_ASM_FUNC_BEGIN WelsIDctRecI16x16Dc_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
-	
-	vld1.s16	{q8,q9}, [r4]
-	vrshr.s16		q8, q8, #6
-	vrshr.s16		q9, q9, #6
-
-	vdup.s16	d20, d16[0]
-	vdup.s16	d21, d16[1]	
-	vdup.s16	d22, d16[2]
-	vdup.s16	d23, d16[3]
-	
-	vld1.u8	{q0}, [r2], r3		
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
-
-	vld1.u8	{q0}, [r2], r3		
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
-
-	vld1.u8	{q0}, [r2], r3		
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
-
-	vld1.u8	{q0}, [r2], r3		
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
-
-	vdup.s16	d20, d17[0]
-	vdup.s16	d21, d17[1]	
-	vdup.s16	d22, d17[2]
-	vdup.s16	d23, d17[3]
-	
-	vld1.u8	{q0}, [r2], r3		
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
-
-	vld1.u8	{q0}, [r2], r3		
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
-
-	vld1.u8	{q0}, [r2], r3		
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
-
-	vld1.u8	{q0}, [r2], r3		
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
-
-	vdup.s16	d20, d18[0]
-	vdup.s16	d21, d18[1]	
-	vdup.s16	d22, d18[2]
-	vdup.s16	d23, d18[3]
-	
-	vld1.u8	{q0}, [r2], r3		
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
-
-	vld1.u8	{q0}, [r2], r3		
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
-
-	vld1.u8	{q0}, [r2], r3		
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
-
-	vld1.u8	{q0}, [r2], r3		
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
-
-	vdup.s16	d20, d19[0]
-	vdup.s16	d21, d19[1]	
-	vdup.s16	d22, d19[2]
-	vdup.s16	d23, d19[3]
-	
-	vld1.u8	{q0}, [r2], r3		
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
-
-	vld1.u8	{q0}, [r2], r3		
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
-
-	vld1.u8	{q0}, [r2], r3		
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
-
-	vld1.u8	{q0}, [r2], r3		
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1	
-				
-	pop			{r4}
-WELS_ASM_FUNC_END
-#endif
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifdef  HAVE_NEON
+.text
+#include "arm_arch_common_macro.S"
+
+#ifdef APPLE_IOS
+.macro	LORD_ALIGNED_DATA_WITH_STRIDE
+//	{	//	input: $0~$3, src*, src_stride
+    vld1.64	{$0}, [$4,:128], $5
+    vld1.64	{$1}, [$4,:128], $5
+    vld1.64	{$2}, [$4,:128], $5
+    vld1.64	{$3}, [$4,:128], $5
+//	}
+.endm
+
+.macro	STORE_ALIGNED_DATA_WITH_STRIDE
+//	{	//	input: $0~$3, dst*, dst_stride
+    vst1.64	{$0}, [$4,:128], $5
+    vst1.64	{$1}, [$4,:128], $5
+    vst1.64	{$2}, [$4,:128], $5
+    vst1.64	{$3}, [$4,:128], $5
+//	}
+.endm
+
+.macro	LORD_UNALIGNED_DATA_WITH_STRIDE
+//	{	//	input: $0~$3, src*, src_stride
+    vld1.64	{$0}, [$4], $5
+    vld1.64	{$1}, [$4], $5
+    vld1.64	{$2}, [$4], $5
+    vld1.64	{$3}, [$4], $5
+//	}
+.endm
+
+.macro	STORE_UNALIGNED_DATA_WITH_STRIDE
+//	{	//	input: $0~$3, dst*, dst_stride
+    vst1.64	{$0}, [$4], $5
+    vst1.64	{$1}, [$4], $5
+    vst1.64	{$2}, [$4], $5
+    vst1.64	{$3}, [$4], $5
+//	}
+.endm
+
+.macro	LOAD_4x4_DATA_FOR_DCT
+//	{	//	input: $0~$3, src1*, src1_stride, src2*, src2_stride
+    vld2.16	{$0[0],$1[0]}, [$4], $5
+    vld2.16	{$2[0],$3[0]}, [$6], $7
+    vld2.16	{$0[1],$1[1]}, [$4], $5
+    vld2.16	{$2[1],$3[1]}, [$6], $7
+
+    vld2.16	{$0[2],$1[2]}, [$4], $5
+    vld2.16	{$2[2],$3[2]}, [$6], $7
+    vld2.16	{$0[3],$1[3]}, [$4], $5
+    vld2.16	{$2[3],$3[3]}, [$6], $7
+//	}
+.endm
+
+.macro	LOAD_8x8_DATA_FOR_DCT
+//	{	//	input: $0~$3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride
+    vld1.64	{$0}, [$8], r2
+    vld1.64	{$4}, [$9], r4
+    vld1.64	{$1}, [$8], r2
+    vld1.64	{$5}, [$9], r4
+
+    vld1.64	{$2}, [$8], r2
+    vld1.64	{$6}, [$9], r4
+    vld1.64	{$3}, [$8], r2
+    vld1.64	{$7}, [$9], r4
+//	}
+.endm
+
+.macro	DCT_ROW_TRANSFORM_TOTAL_16BITS
+//	{	//	input: src_d[0]~[3], working: [4]~[7]
+    vadd.s16		$4, $0, $3			//int16 s[0] = data[i] + data[i3];
+    vsub.s16		$7, $0, $3			//int16 s[3] = data[i] - data[i3];
+    vadd.s16		$5, $1, $2			//int16 s[1] = data[i1] + data[i2];
+    vsub.s16		$6, $1, $2			//int16 s[2] = data[i1] - data[i2];
+
+    vadd.s16		$0, $4, $5			//int16 dct[i ] = s[0] + s[1];
+    vsub.s16		$2, $4, $5			//int16 dct[i2] = s[0] - s[1];
+    vshl.s16		$1, $7, #1
+    vshl.s16		$3, $6, #1
+    vadd.s16		$1, $1, $6			//int16 dct[i1] = (s[3] << 1) + s[2];
+    vsub.s16		$3, $7, $3			//int16 dct[i3] = s[3] - (s[2] << 1);
+//	}
+.endm
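Written out in C, the macro above is exactly the 4-point forward core transform spelled out in its comments, applied to one row (helper name illustrative):

    #include <stdint.h>

    static void DctRow4 (int16_t d[4]) {
        int16_t s0 = d[0] + d[3], s3 = d[0] - d[3];
        int16_t s1 = d[1] + d[2], s2 = d[1] - d[2];
        d[0] = s0 + s1;            /* dct[i ] = s[0] + s[1]        */
        d[2] = s0 - s1;            /* dct[i2] = s[0] - s[1]        */
        d[1] = (s3 << 1) + s2;     /* dct[i1] = (s[3] << 1) + s[2] */
        d[3] = s3 - (s2 << 1);     /* dct[i3] = s[3] - (s[2] << 1) */
    }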
+
+.macro	MATRIX_TRANSFORM_EACH_16BITS
+//	{	//	input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
+    vtrn.s16		$0, $1				//[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
+    vtrn.s16		$2, $3				//[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
+    vtrn.32		$0, $2				//[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
+    vtrn.32		$1, $3				//[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
+//	}
+.endm
+
+.macro	NEWQUANT_COEF_EACH_16BITS	// if coef <= 0, - coef; else , coef;
+//	{	//	input:	coef, ff (dst), ff_d0, ff_d1, mf_d0, md_d1
+    veor.s16		$6, $6			// init 0 , and keep 0;
+    vaba.s16		$1, $0, $6		// f + abs(coef - 0)
+    vmull.s16		$7, $2, $4
+    vmull.s16		$8, $3, $5
+    vshr.s32		$7, #16
+    vshr.s32		$8, #16
+    vmovn.s32		$2, $7
+    vmovn.s32		$3, $8
+
+    vcgt.s16		$7, $0, #0		// if true, location of coef == 11111111
+    vbif.s16		$6, $1, $7		// if (x<0) reserved part; else keep 0 untouched
+    vshl.s16		$6, #1
+    vsub.s16		$1, $1, $6		// if x > 0, -= 0; else x-= 2x
+//	}
+.endm
+
+.macro	NEWQUANT_COEF_EACH_16BITS_MAX	// if coef <= 0, - coef; else , coef;
+//	{	//	input:	coef, ff (dst), ff_d0, ff_d1, mf_d0(max), md_d1
+    veor.s16		$6, $6			// init 0 , and keep 0;
+    vaba.s16		$1, $0, $6		// f + abs(coef - 0)
+    vmull.s16		$7, $2, $4
+    vmull.s16		$8, $3, $5
+    vshr.s32		$7, #16
+    vshr.s32		$8, #16
+    vmovn.s32		$2, $7
+    vmovn.s32		$3, $8
+
+    vcgt.s16		$7, $0, #0		// if true, location of coef == 11111111
+    vbif.s16		$6, $1, $7		// if (x<0) reserved part; else keep 0 untouched
+    vshl.s16		$6, #1
+    vmax.s16		$9, $2, $3
+    vsub.s16		$1, $1, $6		// if x > 0, -= 0; else x-= 2x
+//	}
+.endm
+
+.macro	QUANT_DUALWORD_COEF_EACH_16BITS	// if coef <= 0, - coef; else , coef;
+//	{	//	input:	coef, ff (dst), mf , working_d (all 0), working_q
+    vaba.s16		$1, $0, $3		// f + abs(coef - 0)
+    vmull.s16		$4, $1, $2		// *= mf
+    vshr.s32		$4, #16
+    vmovn.s32		$1, $4			// >> 16
+
+    vcgt.s16		$2, $0, #0		// if true, location of coef == 11111111
+    vbif.s16		$3, $1, $2		// if (x<0) reserved part; else keep 0 untouched
+    vshl.s16		$3, #1
+    vsub.s16		$1, $1, $3		// if x > 0, -= 0; else x-= 2x
+//	}
+.endm
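All three quantisation macros above follow the same per-coefficient formula: add the rounding offset ff to |coef|, multiply by the quantisation multiplier mf with a 16x16->32 multiply, keep the top 16 bits, then restore the sign of the input. In scalar form (function name assumed; ff/mf come from the caller's tables):

    #include <stdint.h>
    #include <stdlib.h>

    static int16_t QuantCoef (int16_t coef, int16_t ff, int16_t mf) {
        int16_t a = (int16_t)(abs (coef) + ff);       /* vaba.s16: ff + |coef|        */
        int32_t level = ((int32_t) a * mf) >> 16;     /* vmull.s16 + vshr.s32 #16     */
        return (int16_t)(coef > 0 ? level : -level);  /* vcgt/vbif/vshl/vsub sign fix */
    }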
+
+.macro	DC_ZERO_COUNT_IN_DUALWORD
+//	{	//	input:	coef, dst_d, working_d (all 0x01)
+    vceq.s16	$1, $0, #0
+    vand.s16	$1, $2
+    vpadd.s16	$1, $1, $1
+    vpadd.s16	$1, $1, $1
+//	}
+.endm
+
+.macro	SELECT_MAX_IN_ABS_COEF
+//	{	//	input:	coef_0, coef_1, max_q (identical to the following two)
+    vmax.s16		$2, $0, $1		// max 1st in $3 & max 2nd in $4
+    vpmax.s16		$3, $3, $4		// max 1st in $3[0][1] & max 2nd in $3[2][3]
+    vpmax.s16		$3, $3, $4		// max 1st in $3[0][1]
+//	}
+.endm
+
+.macro	ZERO_COUNT_IN_2_QUARWORD
+//	{	//	input:	coef_0 (identical to $3 $4), coef_1 (identical to $5 $6), mask_q
+    vceq.s16	$0, #0
+    vceq.s16	$1, #0
+    vand.s16	$0, $2
+    vand.s16	$1, $2
+
+    vpadd.s16	$3, $3, $5
+    vpadd.s16	$4, $4, $6
+    vpadd.s16	$3, $3, $4		// 8-->4
+    vpadd.s16	$3, $3, $3
+    vpadd.s16	$3, $3, $3
+//	}
+.endm
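This zero-count feeds WelsGetNoneZeroCount_neon: each of the 16 coefficients is compared with zero, the matches are masked to 1 and summed, and the caller subtracts the total from 16. Scalar sketch (name assumed):

    #include <stdint.h>

    static int NonZeroCount16 (const int16_t* coef) {
        int zeros = 0;
        for (int i = 0; i < 16; i++)
            zeros += (coef[i] == 0);
        return 16 - zeros;
    }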
+
+.macro	HDM_QUANT_2x2_TOTAL_16BITS
+//	{	//	input: src_d[0]~[3], working_d, dst_d
+    vshr.s64	$1, $0, #32
+    vadd.s16	$2, $0, $1		// [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
+    vsub.s16	$1, $0, $1		// [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];
+    vtrn.s16	$2, $1
+    vtrn.s32	$2, $1
+//	}
+.endm
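Applied twice in WelsHadamardQuant2x2_neon, this macro builds the 2x2 Hadamard of the four chroma DC coefficients that sit at rs[0], rs[16], rs[32] and rs[48] of the residual buffer. Scalar sketch (helper name and output lane order are illustrative):

    #include <stdint.h>

    static void HadamardDc2x2 (const int16_t* rs, int16_t dc[4]) {
        int16_t a = rs[0], b = rs[16], c = rs[32], d = rs[48];
        dc[0] = a + b + c + d;
        dc[1] = a - b + c - d;
        dc[2] = a + b - c - d;
        dc[3] = a - b - c + d;
    }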
+
+.macro	IHDM_4x4_TOTAL_16BITS
+//	{	//	input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2
+    vshr.s64	$1, $0, #32
+    vadd.s16	$2, $0, $1		// [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];
+    vsub.s16	$1, $0, $1		// [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];
+    vtrn.s16	$2, $1
+    vrev32.16	$1, $1
+    vtrn.s32	$2, $1			// [0] = rs[0] + rs[2];[1] = rs[0] - rs[2];[2] = rs[1] - rs[3];[3] = rs[1] + rs[3];
+
+    vrev64.16	$1, $2
+    vadd.s16	$0, $2, $1		// [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];
+    vsub.s16	$1, $2, $1
+    vrev32.16	$1, $1			// [0] = rs[1] - rs[2];[1] = rs[0] - rs[3];
+    vtrn.s32	$0, $1			// [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];[2] = rs[1] - rs[2];[3] = rs[0] - rs[3];
+//	}
+.endm
+
+.macro	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP
+//	{	//	input: pred_d[0]/[1](output), dct_q0/1, working_q0/1;
+    vmovl.u8		$4,$0
+    vmovl.u8		$5,$1
+    vadd.s16		$4,$2
+    vadd.s16		$5,$3
+    vqmovun.s16	$0,$4
+    vqmovun.s16	$1,$5
+//	}
+.endm
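Per pixel, the macro widens the 8-bit prediction, adds the 16-bit residual and saturates the result back into [0, 255] (vqmovun.s16). In C (name assumed):

    #include <stdint.h>

    static uint8_t AddPredClip (uint8_t pred, int16_t resid) {
        int32_t v = (int32_t) pred + resid;
        return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }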
+
+.macro	ROW_TRANSFORM_1_STEP_TOTAL_16BITS
+//	{	//	input: src_d[0]~[3], output: e_d[0]~[3];
+    vadd.s16		$4, $0, $2			//int16 e[i][0] = src[0] + src[2];
+    vsub.s16		$5, $0, $2			//int16 e[i][1] = src[0] - src[2];
+    vshr.s16		$6, $1, #1
+    vshr.s16		$7, $3, #1
+    vsub.s16		$6, $6, $3			//int16 e[i][2] = (src[1]>>1)-src[3];
+    vadd.s16		$7, $1, $7			//int16 e[i][3] = src[1] + (src[3]>>1);
+//	}
+.endm
+
+.macro	TRANSFORM_TOTAL_16BITS	// both row & col transform used
+//	{	//	output: f_q[0]~[3], input: e_q[0]~[3];
+    vadd.s16		$0, $4, $7			//int16 f[i][0] = e[i][0] + e[i][3];
+    vadd.s16		$1, $5, $6			//int16 f[i][1] = e[i][1] + e[i][2];
+    vsub.s16		$2, $5, $6			//int16 f[i][2] = e[i][1] - e[i][2];
+    vsub.s16		$3, $4, $7			//int16 f[i][3] = e[i][0] - e[i][3];
+//	}
+.endm
+
+
+.macro	ROW_TRANSFORM_0_STEP
+//	{	//	input: src_d[0]~[3], output: e_q[0]~[3];
+    vaddl.s16		$4, $0, $2			//int32 e[i][0] = src[0] + src[2];
+    vsubl.s16		$5, $0, $2			//int32 e[i][1] = src[0] - src[2];
+    vsubl.s16		$6, $1, $3			//int32 e[i][2] = src[1] - src[3];
+    vaddl.s16		$7, $1, $3			//int32 e[i][3] = src[1] + src[3];
+//	}
+.endm
+
+.macro	ROW_TRANSFORM_1_STEP
+//	{	//	input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
+    vaddl.s16		$4, $0, $2			//int32 e[i][0] = src[0] + src[2];
+    vsubl.s16		$5, $0, $2			//int32 e[i][1] = src[0] - src[2];
+    vshr.s16		$8, $1, #1
+    vshr.s16		$9, $3, #1
+    vsubl.s16		$6, $8, $3			//int32 e[i][2] = (src[1]>>1)-src[3];
+    vaddl.s16		$7, $1, $9			//int32 e[i][3] = src[1] + (src[3]>>1);
+//	}
+.endm
+
+.macro	TRANSFORM_4BYTES	// both row & col transform used
+//	{	//	output: f_q[0]~[3], input: e_q[0]~[3];
+    vadd.s32		$0, $4, $7			//int16 f[i][0] = e[i][0] + e[i][3];
+    vadd.s32		$1, $5, $6			//int16 f[i][1] = e[i][1] + e[i][2];
+    vsub.s32		$2, $5, $6			//int16 f[i][2] = e[i][1] - e[i][2];
+    vsub.s32		$3, $4, $7			//int16 f[i][3] = e[i][0] - e[i][3];
+//	}
+.endm
+
+.macro	COL_TRANSFORM_0_STEP
+//	{	//	input: src_q[0]~[3], output: e_q[0]~[3];
+    vadd.s32		$4, $0, $2			//int32 e[0][j] = f[0][j] + f[2][j];
+    vsub.s32		$5, $0, $2			//int32 e[1][j] = f[0][j] - f[2][j];
+    vsub.s32		$6, $1, $3			//int32 e[2][j] = (f[1][j]>>1) - f[3][j];
+    vadd.s32		$7, $1, $3			//int32 e[3][j] = f[1][j] + (f[3][j]>>1);
+//	}
+.endm
+
+.macro	COL_TRANSFORM_1_STEP
+//	{	//	input: src_q[0]~[3], output: e_q[0]~[3];
+    vadd.s32		$4, $0, $2			//int32 e[0][j] = f[0][j] + f[2][j];
+    vsub.s32		$5, $0, $2			//int32 e[1][j] = f[0][j] - f[2][j];
+    vshr.s32		$6, $1, #1
+    vshr.s32		$7, $3, #1
+    vsub.s32		$6, $6, $3			//int32 e[2][j] = (f[1][j]>>1) - f[3][j];
+    vadd.s32		$7, $1, $7			//int32 e[3][j] = f[1][j] + (f[3][j]>>1);
+//	}
+.endm
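COL_TRANSFORM_1_STEP (and its row counterpart above) followed by TRANSFORM_4BYTES performs one 4-point pass of the inverse core transform described in the comments. Scalar form of a single pass (helper name assumed):

    #include <stdint.h>

    static void IdctPass4 (int32_t f[4]) {
        int32_t e0 = f[0] + f[2];            /* e[0] = f[0] + f[2]        */
        int32_t e1 = f[0] - f[2];            /* e[1] = f[0] - f[2]        */
        int32_t e2 = (f[1] >> 1) - f[3];     /* e[2] = (f[1] >> 1) - f[3] */
        int32_t e3 = f[1] + (f[3] >> 1);     /* e[3] = f[1] + (f[3] >> 1) */
        f[0] = e0 + e3;  f[1] = e1 + e2;
        f[2] = e1 - e2;  f[3] = e0 - e3;
    }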
+#else
+.macro	LORD_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
+//	{	//	input: \arg0~\arg3, src*, src_stride
+    vld1.64	{\arg0}, [\arg4,:128], \arg5
+    vld1.64	{\arg1}, [\arg4,:128], \arg5
+    vld1.64	{\arg2}, [\arg4,:128], \arg5
+    vld1.64	{\arg3}, [\arg4,:128], \arg5
+//	}
+.endm
+
+.macro	STORE_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
+//	{	//	input: \arg0~\arg3, dst*, dst_stride
+    vst1.64	{\arg0}, [\arg4,:128], \arg5
+    vst1.64	{\arg1}, [\arg4,:128], \arg5
+    vst1.64	{\arg2}, [\arg4,:128], \arg5
+    vst1.64	{\arg3}, [\arg4,:128], \arg5
+//	}
+.endm
+
+.macro	LORD_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
+//	{	//	input: \arg0~\arg3, src*, src_stride
+    vld1.64	{\arg0}, [\arg4], \arg5
+    vld1.64	{\arg1}, [\arg4], \arg5
+    vld1.64	{\arg2}, [\arg4], \arg5
+    vld1.64	{\arg3}, [\arg4], \arg5
+//	}
+.endm
+
+.macro	STORE_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
+//	{	//	input: \arg0~\arg3, dst*, dst_stride
+    vst1.64	{\arg0}, [\arg4], \arg5
+    vst1.64	{\arg1}, [\arg4], \arg5
+    vst1.64	{\arg2}, [\arg4], \arg5
+    vst1.64	{\arg3}, [\arg4], \arg5
+//	}
+.endm
+
+.macro	LOAD_4x4_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+//	{	//	input: \arg0~\arg3, src1*, src1_stride, src2*, src2_stride
+    vld2.16	{\arg0[0],\arg1[0]}, [\arg4], \arg5
+    vld2.16	{\arg2[0],\arg3[0]}, [\arg6], \arg7
+    vld2.16	{\arg0[1],\arg1[1]}, [\arg4], \arg5
+    vld2.16	{\arg2[1],\arg3[1]}, [\arg6], \arg7
+
+    vld2.16	{\arg0[2],\arg1[2]}, [\arg4], \arg5
+    vld2.16	{\arg2[2],\arg3[2]}, [\arg6], \arg7
+    vld2.16	{\arg0[3],\arg1[3]}, [\arg4], \arg5
+    vld2.16	{\arg2[3],\arg3[3]}, [\arg6], \arg7
+//	}
+.endm
+
+.macro	LOAD_8x8_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
+//	{	//	input: \arg0~\arg3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride
+    vld1.64	{\arg0}, [\arg8], r2
+    vld1.64	{\arg4}, [\arg9], r4
+    vld1.64	{\arg1}, [\arg8], r2
+    vld1.64	{\arg5}, [\arg9], r4
+
+    vld1.64	{\arg2}, [\arg8], r2
+    vld1.64	{\arg6}, [\arg9], r4
+    vld1.64	{\arg3}, [\arg8], r2
+    vld1.64	{\arg7}, [\arg9], r4
+//	}
+.endm
+
+.macro	DCT_ROW_TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+//	{	//	input: src_d[0]~[3], working: [4]~[7]
+    vadd.s16		\arg4, \arg0, \arg3			//int16 s[0] = data[i] + data[i3];
+    vsub.s16		\arg7, \arg0, \arg3			//int16 s[3] = data[i] - data[i3];
+    vadd.s16		\arg5, \arg1, \arg2			//int16 s[1] = data[i1] + data[i2];
+    vsub.s16		\arg6, \arg1, \arg2			//int16 s[2] = data[i1] - data[i2];
+
+    vadd.s16		\arg0, \arg4, \arg5			//int16 dct[i ] = s[0] + s[1];
+    vsub.s16		\arg2, \arg4, \arg5			//int16 dct[i2] = s[0] - s[1];
+    vshl.s16		\arg1, \arg7, #1
+    vshl.s16		\arg3, \arg6, #1
+    vadd.s16		\arg1, \arg1, \arg6			//int16 dct[i1] = (s[3] << 1) + s[2];
+    vsub.s16		\arg3, \arg7, \arg3			//int16 dct[i3] = s[3] - (s[2] << 1);
+//	}
+.endm
+
+.macro	MATRIX_TRANSFORM_EACH_16BITS arg0, arg1, arg2, arg3
+//	{	//	input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
+    vtrn.s16		\arg0, \arg1				//[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
+    vtrn.s16		\arg2, \arg3				//[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
+    vtrn.32		\arg0, \arg2				//[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
+    vtrn.32		\arg1, \arg3				//[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
+//	}
+.endm
+
+.macro	NEWQUANT_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+//	{	//	input:	coef, ff (dst), ff_d0, ff_d1, mf_d0, md_d1
+    veor.s16		\arg6, \arg6			// init 0 , and keep 0;
+    vaba.s16		\arg1, \arg0, \arg6		// f + abs(coef - 0)
+    vmull.s16		\arg7, \arg2, \arg4
+    vmull.s16		\arg8, \arg3, \arg5
+    vshr.s32		\arg7, #16
+    vshr.s32		\arg8, #16
+    vmovn.s32		\arg2, \arg7
+    vmovn.s32		\arg3, \arg8
+
+    vcgt.s16		\arg7, \arg0, #0		// if true, location of coef == 11111111
+    vbif.s16		\arg6, \arg1, \arg7		// if (x<0) reserved part; else keep 0 untouched
+    vshl.s16		\arg6, #1
+    vsub.s16		\arg1, \arg1, \arg6		// if x > 0, -= 0; else x-= 2x
+//	}
+.endm
+
+.macro	NEWQUANT_COEF_EACH_16BITS_MAX arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
+//	{	//	input:	coef, ff (dst), ff_d0, ff_d1, mf_d0(max), md_d1
+    veor.s16		\arg6, \arg6			// init 0 , and keep 0;
+    vaba.s16		\arg1, \arg0, \arg6		// f + abs(coef - 0)
+    vmull.s16		\arg7, \arg2, \arg4
+    vmull.s16		\arg8, \arg3, \arg5
+    vshr.s32		\arg7, #16
+    vshr.s32		\arg8, #16
+    vmovn.s32		\arg2, \arg7
+    vmovn.s32		\arg3, \arg8
+
+    vcgt.s16		\arg7, \arg0, #0		// if true, location of coef == 11111111
+    vbif.s16		\arg6, \arg1, \arg7		// if (x<0) reserved part; else keep 0 untouched
+    vshl.s16		\arg6, #1
+    vmax.s16		\arg9, \arg2, \arg3
+    vsub.s16		\arg1, \arg1, \arg6		// if x > 0, -= 0; else x-= 2x
+//	}
+.endm
+
+.macro	QUANT_DUALWORD_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4
+//	{	//	input:	coef, ff (dst), mf , working_d (all 0), working_q
+    vaba.s16		\arg1, \arg0, \arg3		// f + abs(coef - 0)
+    vmull.s16		\arg4, \arg1, \arg2		// *= mf
+    vshr.s32		\arg4, #16
+    vmovn.s32		\arg1, \arg4			// >> 16
+
+    vcgt.s16		\arg2, \arg0, #0		// if true, location of coef == 11111111
+    vbif.s16		\arg3, \arg1, \arg2		// if (x<0) reserved part; else keep 0 untouched
+    vshl.s16		\arg3, #1
+    vsub.s16		\arg1, \arg1, \arg3		// if x > 0, -= 0; else x-= 2x
+//	}
+.endm
+
+.macro	DC_ZERO_COUNT_IN_DUALWORD arg0, arg1, arg2
+//	{	//	input:	coef, dst_d, working_d (all 0x01)
+    vceq.s16	\arg1, \arg0, #0
+    vand.s16	\arg1, \arg2
+    vpadd.s16	\arg1, \arg1, \arg1
+    vpadd.s16	\arg1, \arg1, \arg1
+//	}
+.endm
+
+.macro	SELECT_MAX_IN_ABS_COEF arg0, arg1, arg2, arg3, arg4
+//	{	//	input:	coef_0, coef_1, max_q (identical to the following two), output: max_d0, max_d1
+    vmax.s16		\arg2, \arg0, \arg1		// max 1st in \arg3 & max 2nd in \arg4
+    vpmax.s16		\arg3, \arg3, \arg4		// max 1st in \arg3[0][1] & max 2nd in \arg3[2][3]
+    vpmax.s16		\arg3, \arg3, \arg4		// max 1st in \arg3[0][1]
+//	}
+.endm
+
+.macro	ZERO_COUNT_IN_2_QUARWORD arg0, arg1, arg2, arg3, arg4, arg5, arg6
+//	{	//	input:	coef_0 (identical to \arg3 \arg4), coef_1 (identical to \arg5 \arg6), mask_q
+    vceq.s16	\arg0, #0
+    vceq.s16	\arg1, #0
+    vand.s16	\arg0, \arg2
+    vand.s16	\arg1, \arg2
+
+    vpadd.s16	\arg3, \arg3, \arg5
+    vpadd.s16	\arg4, \arg4, \arg6
+    vpadd.s16	\arg3, \arg3, \arg4		// 8-->4
+    vpadd.s16	\arg3, \arg3, \arg3
+    vpadd.s16	\arg3, \arg3, \arg3
+//	}
+.endm
+
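+// HDM_QUANT_2x2_TOTAL_16BITS performs one sum/difference pass over the four
+// chroma DC values packed in a single d register; WelsHadamardQuant2x2_neon
+// applies it twice (the transposes are folded in) to obtain the full 2x2
+// Hadamard of rs[0], rs[16], rs[32] and rs[48].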
+.macro	HDM_QUANT_2x2_TOTAL_16BITS arg0, arg1, arg2
+//	{	//	input: src_d[0]~[3], working_d, dst_d
+    vshr.s64	\arg1, \arg0, #32
+    vadd.s16	\arg2, \arg0, \arg1		// [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
+    vsub.s16	\arg1, \arg0, \arg1		// [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];
+    vtrn.s16	\arg2, \arg1
+    vtrn.s32	\arg2, \arg1
+//	}
+.endm
+
+.macro	IHDM_4x4_TOTAL_16BITS arg0, arg1, arg2
+//	{	//	input: src_q[0]~[3] (dst), working_q0, working_q1
+    vshr.s64	\arg1, \arg0, #32
+    vadd.s16	\arg2, \arg0, \arg1		// [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];
+    vsub.s16	\arg1, \arg0, \arg1		// [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];
+    vtrn.s16	\arg2, \arg1
+    vrev32.16	\arg1, \arg1
+    vtrn.s32	\arg2, \arg1			// [0] = rs[0] + rs[2];[1] = rs[0] - rs[2];[2] = rs[1] - rs[3];[3] = rs[1] + rs[3];
+
+    vrev64.16	\arg1, \arg2
+    vadd.s16	\arg0, \arg2, \arg1		// [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];
+    vsub.s16	\arg1, \arg2, \arg1
+    vrev32.16	\arg1, \arg1			// [0] = rs[1] - rs[2];[1] = rs[0] - rs[3];
+    vtrn.s32	\arg0, \arg1			// [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];[2] = rs[1] - rs[2];[3] = rs[0] - rs[3];
+//	}
+.endm
+
+.macro	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP arg0, arg1, arg2, arg3, arg4, arg5
+//	{	//	input: pred_d[0]/[1](output), dct_q0/1, working_q0/1;
+    vmovl.u8		\arg4,\arg0
+    vmovl.u8		\arg5,\arg1
+    vadd.s16		\arg4,\arg2
+    vadd.s16		\arg5,\arg3
+    vqmovun.s16	\arg0,\arg4
+    vqmovun.s16	\arg1,\arg5
+//	}
+.endm
+
+.macro	ROW_TRANSFORM_1_STEP_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+//	{	//	input: src_d[0]~[3], output: e_d[0]~[3];
+    vadd.s16		\arg4, \arg0, \arg2			//int16 e[i][0] = src[0] + src[2];
+    vsub.s16		\arg5, \arg0, \arg2			//int16 e[i][1] = src[0] - src[2];
+    vshr.s16		\arg6, \arg1, #1
+    vshr.s16		\arg7, \arg3, #1
+    vsub.s16		\arg6, \arg6, \arg3			//int16 e[i][2] = (src[1]>>1)-src[3];
+    vadd.s16		\arg7, \arg1, \arg7			//int16 e[i][3] = src[1] + (src[3]>>1);
+//	}
+.endm
+
+.macro	TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7	// both row & col transform used
+//	{	//	output: f_q[0]~[3], input: e_q[0]~[3];
+    vadd.s16		\arg0, \arg4, \arg7			//int16 f[i][0] = e[i][0] + e[i][3];
+    vadd.s16		\arg1, \arg5, \arg6			//int16 f[i][1] = e[i][1] + e[i][2];
+    vsub.s16		\arg2, \arg5, \arg6			//int16 f[i][2] = e[i][1] - e[i][2];
+    vsub.s16		\arg3, \arg4, \arg7			//int16 f[i][3] = e[i][0] - e[i][3];
+//	}
+.endm
+
+
+.macro	ROW_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+//	{	//	input: src_d[0]~[3], output: e_q[0]~[3];
+    vaddl.s16		\arg4, \arg0, \arg2			//int32 e[i][0] = src[0] + src[2];
+    vsubl.s16		\arg5, \arg0, \arg2			//int32 e[i][1] = src[0] - src[2];
+    vsubl.s16		\arg6, \arg1, \arg3			//int32 e[i][2] = src[1] - src[3];
+    vaddl.s16		\arg7, \arg1, \arg3			//int32 e[i][3] = src[1] + src[3];
+//	}
+.endm
+
+.macro	ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
+//	{	//	input: src_d[0]~[3], output: e_q[0]~[3]; working: \arg8 \arg9
+    vaddl.s16		\arg4, \arg0, \arg2			//int32 e[i][0] = src[0] + src[2];
+    vsubl.s16		\arg5, \arg0, \arg2			//int32 e[i][1] = src[0] - src[2];
+    vshr.s16		\arg8, \arg1, #1
+    vshr.s16		\arg9, \arg3, #1
+    vsubl.s16		\arg6, \arg8, \arg3			//int32 e[i][2] = (src[1]>>1)-src[3];
+    vaddl.s16		\arg7, \arg1, \arg9			//int32 e[i][3] = src[1] + (src[3]>>1);
+//	}
+.endm
+
+.macro	TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7	// both row & col transform used
+//	{	//	output: f_q[0]~[3], input: e_q[0]~[3];
+    vadd.s32		\arg0, \arg4, \arg7			//int16 f[i][0] = e[i][0] + e[i][3];
+    vadd.s32		\arg1, \arg5, \arg6			//int16 f[i][1] = e[i][1] + e[i][2];
+    vsub.s32		\arg2, \arg5, \arg6			//int16 f[i][2] = e[i][1] - e[i][2];
+    vsub.s32		\arg3, \arg4, \arg7			//int16 f[i][3] = e[i][0] - e[i][3];
+//	}
+.endm
+
+.macro	COL_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+//	{	//	input: src_q[0]~[3], output: e_q[0]~[3];
+    vadd.s32		\arg4, \arg0, \arg2			//int32 e[0][j] = f[0][j] + f[2][j];
+    vsub.s32		\arg5, \arg0, \arg2			//int32 e[1][j] = f[0][j] - f[2][j];
+    vsub.s32		\arg6, \arg1, \arg3			//int32 e[2][j] = f[1][j] - f[3][j];
+    vadd.s32		\arg7, \arg1, \arg3			//int32 e[3][j] = f[1][j] + f[3][j];
+//	}
+.endm
+
+.macro	COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+//	{	//	input: src_q[0]~[3], output: e_q[0]~[3];
+    vadd.s32		\arg4, \arg0, \arg2			//int32 e[0][j] = f[0][j] + f[2][j];
+    vsub.s32		\arg5, \arg0, \arg2			//int32 e[1][j] = f[0][j] - f[2][j];
+    vshr.s32		\arg6, \arg1, #1
+    vshr.s32		\arg7, \arg3, #1
+    vsub.s32		\arg6, \arg6, \arg3			//int32 e[2][j] = (f[1][j]>>1) - f[3][j];
+    vadd.s32		\arg7, \arg1, \arg7			//int32 e[3][j] = f[1][j] + (f[3][j]>>1);
+//	}
+.endm
+#endif
+
+
+WELS_ASM_FUNC_BEGIN WelsCopy8x8_neon
+
+	LORD_UNALIGNED_DATA_WITH_STRIDE	d0, d1, d2, d3, r2, r3
+
+	STORE_UNALIGNED_DATA_WITH_STRIDE	d0, d1, d2, d3, r0, r1
+
+	LORD_UNALIGNED_DATA_WITH_STRIDE	d4, d5, d6, d7, r2, r3
+
+	STORE_UNALIGNED_DATA_WITH_STRIDE	d4, d5, d6, d7, r0, r1
+
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN WelsCopy16x16_neon
+
+	LORD_ALIGNED_DATA_WITH_STRIDE	q0, q1, q2, q3, r2, r3
+
+	STORE_ALIGNED_DATA_WITH_STRIDE	q0, q1, q2, q3, r0, r1
+
+	LORD_ALIGNED_DATA_WITH_STRIDE	q4, q5, q6, q7, r2, r3
+
+	STORE_ALIGNED_DATA_WITH_STRIDE	q4, q5, q6, q7, r0, r1
+
+	LORD_ALIGNED_DATA_WITH_STRIDE	q0, q1, q2, q3, r2, r3
+
+	STORE_ALIGNED_DATA_WITH_STRIDE	q0, q1, q2, q3, r0, r1
+
+	LORD_ALIGNED_DATA_WITH_STRIDE	q4, q5, q6, q7, r2, r3
+
+	STORE_ALIGNED_DATA_WITH_STRIDE	q4, q5, q6, q7, r0, r1
+
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN WelsCopy16x16NotAligned_neon
+
+	LORD_UNALIGNED_DATA_WITH_STRIDE	q0, q1, q2, q3, r2, r3
+
+	STORE_UNALIGNED_DATA_WITH_STRIDE	q0, q1, q2, q3, r0, r1
+
+	LORD_UNALIGNED_DATA_WITH_STRIDE	q4, q5, q6, q7, r2, r3
+
+	STORE_UNALIGNED_DATA_WITH_STRIDE	q4, q5, q6, q7, r0, r1
+
+	LORD_UNALIGNED_DATA_WITH_STRIDE	q0, q1, q2, q3, r2, r3
+
+	STORE_UNALIGNED_DATA_WITH_STRIDE	q0, q1, q2, q3, r0, r1
+
+	LORD_UNALIGNED_DATA_WITH_STRIDE	q4, q5, q6, q7, r2, r3
+
+	STORE_UNALIGNED_DATA_WITH_STRIDE	q4, q5, q6, q7, r0, r1
+
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN WelsCopy16x8NotAligned_neon
+
+	LORD_UNALIGNED_DATA_WITH_STRIDE	q0, q1, q2, q3, r2, r3
+
+	STORE_UNALIGNED_DATA_WITH_STRIDE	q0, q1, q2, q3, r0, r1
+
+	LORD_UNALIGNED_DATA_WITH_STRIDE	q4, q5, q6, q7, r2, r3
+
+	STORE_UNALIGNED_DATA_WITH_STRIDE	q4, q5, q6, q7, r0, r1
+
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN WelsCopy8x16_neon
+
+	LORD_UNALIGNED_DATA_WITH_STRIDE	d0, d1, d2, d3, r2, r3
+
+	STORE_UNALIGNED_DATA_WITH_STRIDE	d0, d1, d2, d3, r0, r1
+
+	LORD_UNALIGNED_DATA_WITH_STRIDE	d4, d5, d6, d7, r2, r3
+
+	STORE_UNALIGNED_DATA_WITH_STRIDE	d4, d5, d6, d7, r0, r1
+
+	LORD_UNALIGNED_DATA_WITH_STRIDE	d0, d1, d2, d3, r2, r3
+
+	STORE_UNALIGNED_DATA_WITH_STRIDE	d0, d1, d2, d3, r0, r1
+
+	LORD_UNALIGNED_DATA_WITH_STRIDE	d4, d5, d6, d7, r2, r3
+
+	STORE_UNALIGNED_DATA_WITH_STRIDE	d4, d5, d6, d7, r0, r1
+
+WELS_ASM_FUNC_END
+
+
+
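+// WelsDctT4_neon: loads one 4x4 block from each of the two input pixel buffers,
+// forms the residual with vsubl.u8, runs the 4x4 integer forward transform
+// over rows and then columns (MATRIX_TRANSFORM_EACH_16BITS transposes between
+// the passes), and stores the 16 coefficients to [r0].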
+WELS_ASM_FUNC_BEGIN WelsDctT4_neon
+	push		{r4}
+	ldr			r4, [sp, #4]
+
+	LOAD_4x4_DATA_FOR_DCT	d4, d5, d6, d7, r1, r2, r3, r4
+
+	vsubl.u8	q0, d4, d6
+	vsubl.u8	q1, d5, d7
+	vtrn.s32	q0, q1
+	vswp		d1, d2
+
+	// horizontal transform
+	DCT_ROW_TRANSFORM_TOTAL_16BITS		d0, d1, d2, d3, d4, d5, d6, d7
+
+	// transform element
+	MATRIX_TRANSFORM_EACH_16BITS	d0, d1, d2, d3
+
+	//	vertical transform
+	DCT_ROW_TRANSFORM_TOTAL_16BITS		d0, d1, d2, d3, d4, d5, d6, d7
+
+	// transform element
+	MATRIX_TRANSFORM_EACH_16BITS	d0, d1, d2, d3
+
+	vst1.s16		{q0, q1}, [r0]!
+
+	pop		{r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN WelsDctFourT4_neon
+	push		{r4}
+	ldr			r4, [sp, #4]
+
+	LOAD_8x8_DATA_FOR_DCT	d8, d9, d10, d11, d12, d13, d14, d15, r1, r3
+
+	vsubl.u8	q0,  d8, d12
+	vsubl.u8	q1,  d9, d13
+	vsubl.u8	q2, d10, d14
+	vsubl.u8	q3, d11, d15
+	MATRIX_TRANSFORM_EACH_16BITS	q0, q1, q2, q3
+
+	// horizontal transform
+	DCT_ROW_TRANSFORM_TOTAL_16BITS		q0, q1, q2, q3, q4, q5, q6, q7
+
+	// transform element
+	MATRIX_TRANSFORM_EACH_16BITS	q0, q1, q2, q3
+
+	//	vertical transform
+	DCT_ROW_TRANSFORM_TOTAL_16BITS		q0, q1, q2, q3, q4, q5, q6, q7
+
+	vswp		d1, d2
+	vswp		d5, d6
+	vswp		q1, q2
+	vst1.s16		{q0, q1}, [r0]!
+	vst1.s16		{q2, q3}, [r0]!
+
+	////////////////
+	LOAD_8x8_DATA_FOR_DCT	d8, d9, d10, d11, d12, d13, d14, d15, r1, r3
+
+	vsubl.u8	q0,  d8, d12
+	vsubl.u8	q1,  d9, d13
+	vsubl.u8	q2, d10, d14
+	vsubl.u8	q3, d11, d15
+	MATRIX_TRANSFORM_EACH_16BITS	q0, q1, q2, q3
+
+	// horizontal transform
+	DCT_ROW_TRANSFORM_TOTAL_16BITS		q0, q1, q2, q3, q4, q5, q6, q7
+
+	// transform element
+	MATRIX_TRANSFORM_EACH_16BITS	q0, q1, q2, q3
+
+	//	vertical transform
+	DCT_ROW_TRANSFORM_TOTAL_16BITS		q0, q1, q2, q3, q4, q5, q6, q7
+
+	vswp		d1, d2
+	vswp		d5, d6
+	vswp		q1, q2
+	vst1.s16		{q0, q1}, [r0]!
+	vst1.s16		{q2, q3}, [r0]!
+
+	pop		{r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN WelsQuant4x4_neon
+	vld1.s16		{q2}, [r1]
+	vld1.s16		{q0, q1}, [r0]
+	vld1.s16		{q3}, [r2]
+
+	vmov			q4, q2
+
+	NEWQUANT_COEF_EACH_16BITS	q0, q2, d4, d5, d6, d7, q5, q6, q7
+	vst1.s16		{q2}, [r0]!
+
+	NEWQUANT_COEF_EACH_16BITS	q1, q4, d8, d9, d6, d7, q5, q6, q7
+	vst1.s16		{q4}, [r0]!
+
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN WelsQuant4x4Dc_neon
+
+	vld1.s16		{q0, q1}, [r0]
+	vdup.s16		q2, r1		// even ff range [0, 768]
+	vdup.s16		q3, r2
+
+	vmov			q4, q2
+
+	NEWQUANT_COEF_EACH_16BITS	q0, q2, d4, d5, d6, d7, q5, q6, q7
+	vst1.s16		{q2}, [r0]!
+
+	NEWQUANT_COEF_EACH_16BITS	q1, q4, d8, d9, d6, d7, q5, q6, q7
+	vst1.s16		{q4}, [r0]!
+
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN WelsQuantFour4x4_neon
+	vld1.s16		{q2}, [r1]
+	vld1.s16		{q3}, [r2]
+	mov				r1, r0
+
+	vld1.s16		{q0, q1}, [r0]!
+	vmov			q4, q2
+	NEWQUANT_COEF_EACH_16BITS	q0, q4, d8, d9, d6, d7, q5, q6, q7
+	vst1.s16		{q4}, [r1]!
+	vmov			q4, q2
+	NEWQUANT_COEF_EACH_16BITS	q1, q4, d8, d9, d6, d7, q5, q6, q7
+	vst1.s16		{q4}, [r1]!
+
+	vld1.s16		{q0, q1}, [r0]!
+	vmov			q4, q2
+	NEWQUANT_COEF_EACH_16BITS	q0, q4, d8, d9, d6, d7, q5, q6, q7
+	vst1.s16		{q4}, [r1]!
+	vmov			q4, q2
+	NEWQUANT_COEF_EACH_16BITS	q1, q4, d8, d9, d6, d7, q5, q6, q7
+	vst1.s16		{q4}, [r1]!
+
+	vld1.s16		{q0, q1}, [r0]!
+	vmov			q4, q2
+	NEWQUANT_COEF_EACH_16BITS	q0, q4, d8, d9, d6, d7, q5, q6, q7
+	vst1.s16		{q4}, [r1]!
+	vmov			q4, q2
+	NEWQUANT_COEF_EACH_16BITS	q1, q4, d8, d9, d6, d7, q5, q6, q7
+	vst1.s16		{q4}, [r1]!
+
+	vld1.s16		{q0, q1}, [r0]!
+	vmov			q4, q2
+	NEWQUANT_COEF_EACH_16BITS	q0, q4, d8, d9, d6, d7, q5, q6, q7
+	vst1.s16		{q4}, [r1]!
+	vmov			q4, q2
+	NEWQUANT_COEF_EACH_16BITS	q1, q4, d8, d9, d6, d7, q5, q6, q7
+	vst1.s16		{q4}, [r1]!
+
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN WelsQuantFour4x4Max_neon
+	vld1.s16		{q2}, [r1]
+	vld1.s16		{q3}, [r2]
+	mov				r1, r0
+
+	vld1.s16		{q0, q1}, [r0]!
+	vmov			q4, q2
+	NEWQUANT_COEF_EACH_16BITS_MAX	q0, q4, d8, d9, d6, d7, q5, q6, q7, d18
+	vst1.s16		{q4}, [r1]!
+	vmov			q8, q2
+	NEWQUANT_COEF_EACH_16BITS_MAX	q1, q8, d16, d17, d6, d7, q5, q6, q7, d20
+	vst1.s16		{q8}, [r1]!		// then 1st 16 elem in d18 & d20
+
+	vld1.s16		{q0, q1}, [r0]!
+	vmov			q4, q2
+	NEWQUANT_COEF_EACH_16BITS_MAX	q0, q4, d8, d9, d6, d7, q5, q6, q7, d19
+	vst1.s16		{q4}, [r1]!
+	vmov			q8, q2
+	NEWQUANT_COEF_EACH_16BITS_MAX	q1, q8, d16, d17, d6, d7, q5, q6, q7, d21
+	vst1.s16		{q8}, [r1]!	// then 2nd 16 elem in d19 & d21
+
+	SELECT_MAX_IN_ABS_COEF	q9, q10, q0, d0, d1
+	vst1.s32		{d0[0]}, [r3]!
+
+	///////////
+	vld1.s16		{q0, q1}, [r0]!
+	vmov			q4, q2
+	NEWQUANT_COEF_EACH_16BITS_MAX	q0, q4, d8, d9, d6, d7, q5, q6, q7, d18
+	vst1.s16		{q4}, [r1]!
+	vmov			q8, q2
+	NEWQUANT_COEF_EACH_16BITS_MAX	q1, q8, d16, d17, d6, d7, q5, q6, q7, d20
+	vst1.s16		{q8}, [r1]!		// then 3rd 16 elem in d18 & d20
+
+	vld1.s16		{q0, q1}, [r0]!
+	vmov			q4, q2
+	NEWQUANT_COEF_EACH_16BITS_MAX	q0, q4, d8, d9, d6, d7, q5, q6, q7, d19
+	vst1.s16		{q4}, [r1]!
+	vmov			q8, q2
+	NEWQUANT_COEF_EACH_16BITS_MAX	q1, q8, d16, d17, d6, d7, q5, q6, q7, d21
+	vst1.s16		{q8}, [r1]!	// then 4th 16 elem in d19 & d21
+
+	SELECT_MAX_IN_ABS_COEF	q9, q10, q0, d0, d1
+	vst1.s32		{d0[0]}, [r3]!
+
+WELS_ASM_FUNC_END
+
+
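+// WelsHadamardT4Dc_neon: gathers the 16 DC coefficients from the strided dct
+// buffer (64-byte stride, second pointer offset by 32 bytes), applies a 4x4
+// Hadamard transform through ROW/COL_TRANSFORM_0_STEP and TRANSFORM_4BYTES,
+// and stores the results rounded by (x + 1) >> 1 (vrshrn #1).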
+WELS_ASM_FUNC_BEGIN WelsHadamardT4Dc_neon
+	push	{r2,r3}
+	mov		r2, #64	// 2*16*sizeof(int16_t)
+	add		r3, r1, #32
+
+	vld1.s16		{d0}, [r1], r2
+	vld1.s16		{d1}, [r3], r2
+	vld1.s16		{d4}, [r1], r2
+	vld1.s16		{d5}, [r3], r2
+	vld1.s16		{d2}, [r1], r2
+	vld1.s16		{d3}, [r3], r2
+	vld1.s16		{d6}, [r1], r2
+	vld1.s16		{d7}, [r3], r2
+	vtrn.16		q0, q2		// d0[0 4], d1[1 5]
+	vtrn.16		q1, q3		// d2[2 6], d3[3 7]
+
+	vld1.s16		{d8}, [r1], r2
+	vld1.s16		{d9}, [r3], r2
+	vld1.s16		{d12}, [r1], r2
+	vld1.s16		{d13}, [r3], r2
+	vld1.s16		{d10}, [r1], r2
+	vld1.s16		{d11}, [r3], r2
+	vld1.s16		{d14}, [r1], r2
+	vld1.s16		{d15}, [r3], r2
+	vtrn.16		q4, q6		// d8[08 12], d9[09 13]
+	vtrn.16		q5, q7		//d10[10 14],d11[11 15]
+
+	vtrn.32		q0, q4		// d0 [0 4 08 12] = dct[idx],		d1[1 5 09 13] = dct[idx+16]
+	vtrn.32		q1, q5		// d2 [2 6 10 14] = dct[idx+64],	d3[3 7 11 15] = dct[idx+80]
+
+	ROW_TRANSFORM_0_STEP	d0, d1, d3, d2, q4, q7, q6, q5
+
+	TRANSFORM_4BYTES		q0, q1, q3, q2, q4, q7, q6, q5
+
+	// transform element 32bits
+	vtrn.s32		q0, q1				//[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
+	vtrn.s32		q2, q3				//[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
+	vswp			d1, d4				//[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
+	vswp			d3, d6				//[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
+
+	COL_TRANSFORM_0_STEP	q0, q1, q3, q2, q4, q7, q6, q5
+
+	TRANSFORM_4BYTES		q0, q1, q3, q2, q4, q7, q6, q5
+
+	vrshrn.s32		d8, q0, #1
+	vrshrn.s32		d9, q1, #1
+	vrshrn.s32		d10, q2, #1
+	vrshrn.s32		d11, q3, #1
+	vst1.16	{q4, q5}, [r0]	//store
+
+	pop		{r2,r3}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN WelsHadamardQuant2x2_neon
+
+	vdup.s16	d1, r1				//ff
+	vdup.s16	d2, r2				//mf
+	veor		d3, d3
+
+	mov			r1, #32
+	mov			r2, r0
+
+	vld1.s16	{d0[0]}, [r0], r1		//rs[00]
+	vst1.s16	{d3[0]}, [r2], r1		//rs[00]=0
+	vld1.s16	{d0[1]}, [r0], r1		//rs[16]
+	vst1.s16	{d3[0]}, [r2], r1		//rs[16]=0
+	vld1.s16	{d0[2]}, [r0], r1		//rs[32]
+	vst1.s16	{d3[0]}, [r2], r1		//rs[32]=0
+	vld1.s16	{d0[3]}, [r0], r1		//rs[48]
+	vst1.s16	{d3[0]}, [r2], r1		//rs[48]=0
+
+	HDM_QUANT_2x2_TOTAL_16BITS	d0, d4, d5		// output d5
+
+	HDM_QUANT_2x2_TOTAL_16BITS	d5, d4, d0		// output d0
+
+	QUANT_DUALWORD_COEF_EACH_16BITS	d0, d1, d2, d3, q2
+
+	vst1.s16	d1, [r3]		// store to dct
+	ldr			r2, [sp, #0]
+	vst1.s16	d1, [r2]		// store to block
+
+	mov			r1, #1
+	vdup.s16	d3, r1
+	DC_ZERO_COUNT_IN_DUALWORD	d1, d0, d3
+
+	vmov	r0, r1, d0
+	and		r0, #0x07		// range [0~4]
+	rsb		r0, #4
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN WelsHadamardQuant2x2SkipKernel_neon
+
+	vdup.s16	d3, r1
+	mov			r1, #32
+	vld1.s16	{d0[0]}, [r0], r1		//rs[00]
+	vld1.s16	{d0[1]}, [r0], r1		//rs[16]
+	vld1.s16	{d0[2]}, [r0], r1		//rs[32]
+	vld1.s16	{d0[3]}, [r0], r1		//rs[48]
+
+	HDM_QUANT_2x2_TOTAL_16BITS	d0, d1, d2		// output d2
+
+	HDM_QUANT_2x2_TOTAL_16BITS	d2, d1, d0		// output d0
+
+	vabs.s16	d1, d0
+	vcgt.s16	d1, d1, d3		// abs(dct[i])>threshold;
+	vmov	r0, r1, d1
+	orr		r0, r1
+WELS_ASM_FUNC_END
+
+
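+// WelsGetNoneZeroCount_neon: counts the zero coefficients among the 16 input
+// values with ZERO_COUNT_IN_2_QUARWORD, then returns 16 minus that count
+// (the and #0x1F / rsb #16 pair), i.e. the number of non-zero coefficients.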
+WELS_ASM_FUNC_BEGIN WelsGetNoneZeroCount_neon
+	push	{r1}
+	vld1.s16	{q0, q1}, [r0]
+	vmov.s16	q8, #1
+
+	ZERO_COUNT_IN_2_QUARWORD	q0, q1, q8, d0, d1, d2, d3
+	vmov	r0, r1, d0
+	and		r0, #0x1F	// range [0~16]
+	rsb		r0, #16
+	pop		{r1}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN WelsDequant4x4_neon
+	vld1.s16	{q0, q1}, [r0]
+	vld1.u16	{q2}, [r1]
+
+	vmul.s16	q4, q0, q2
+	vmul.s16	q5, q1, q2
+
+	vst1.s16	{q4, q5}, [r0]
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN WelsDequantFour4x4_neon
+	vld1.u16	{q8}, [r1]
+	mov		r1, r0
+	vld1.s16	{q0, q1}, [r0]!
+	vld1.s16	{q2, q3}, [r0]!
+	vmul.s16	q0, q0, q8
+	vld1.s16	{q4, q5}, [r0]!
+	vmul.s16	q1, q1, q8
+	vld1.s16	{q6, q7}, [r0]!
+
+	vst1.s16	{q0, q1}, [r1]!
+
+	vmul.s16	q2, q2, q8
+	vmul.s16	q3, q3, q8
+	vmul.s16	q4, q4, q8
+	vst1.s16	{q2, q3}, [r1]!
+
+	vmul.s16	q5, q5, q8
+	vmul.s16	q6, q6, q8
+	vmul.s16	q7, q7, q8
+	vst1.s16	{q4, q5}, [r1]!
+	vst1.s16	{q6, q7}, [r1]!
+
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN WelsDequantIHadamard4x4_neon
+
+	vld1.s16	{q0, q1}, [r0]
+	vdup.s16	q4, r1
+
+	IHDM_4x4_TOTAL_16BITS	q0, q2, q3
+	IHDM_4x4_TOTAL_16BITS	q1, q2, q3
+
+	MATRIX_TRANSFORM_EACH_16BITS	d0, d1, d2, d3
+
+	IHDM_4x4_TOTAL_16BITS	q0, q2, q3
+	vmul.s16	q0, q4
+
+	IHDM_4x4_TOTAL_16BITS	q1, q2, q3
+	vmul.s16	q1, q4
+
+	MATRIX_TRANSFORM_EACH_16BITS	d0, d1, d2, d3
+	vst1.s16	{q0, q1}, [r0]
+WELS_ASM_FUNC_END
+
+
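+// WelsIDctT4Rec_neon: inverse-transforms one 4x4 coefficient block (two
+// ROW_TRANSFORM_1_STEP_TOTAL_16BITS / TRANSFORM_TOTAL_16BITS passes with a
+// transpose in between), rounds with vrshr #6, adds the prediction held in
+// q7, and clips to [0, 255] via vqmovun before storing the reconstruction.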
+WELS_ASM_FUNC_BEGIN WelsIDctT4Rec_neon
+	vld1.u32		{d14[0]}, [r2], r3
+	push			{r4}
+	ldr				r4, [sp, #4]
+	vld1.u32		{d14[1]}, [r2], r3
+
+	vld4.s16		{d0, d1, d2, d3}, [r4]		// cost 3 cycles!
+	vld1.u32		{d15[0]}, [r2], r3
+	vld1.u32		{d15[1]}, [r2], r3			// q7 is pred
+
+	ROW_TRANSFORM_1_STEP_TOTAL_16BITS		d0, d1, d2, d3, d4, d5, d6, d7
+
+	TRANSFORM_TOTAL_16BITS		d0, d1, d2, d3, d4, d5, d6, d7
+
+	MATRIX_TRANSFORM_EACH_16BITS	d0, d1, d2, d3
+
+	ROW_TRANSFORM_1_STEP_TOTAL_16BITS		d0, d1, d2, d3, d4, d5, d6, d7
+
+	TRANSFORM_TOTAL_16BITS		d0, d1, d2, d3, d4, d5, d6, d7
+	vrshr.s16		d0, d0, #6
+	vrshr.s16		d1, d1, #6
+	vrshr.s16		d2, d2, #6
+	vrshr.s16		d3, d3, #6
+
+	//after rounding 6, clip into [0, 255]
+	vmovl.u8		q2,d14
+	vadd.s16		q0,q2
+	vqmovun.s16	d14,q0
+	vst1.32		{d14[0]},[r0],r1
+	vst1.32		{d14[1]},[r0],r1
+
+	vmovl.u8		q2,d15
+	vadd.s16		q1,q2
+	vqmovun.s16	d15,q1
+	vst1.32		{d15[0]},[r0],r1
+	vst1.32		{d15[1]},[r0]
+
+	pop			{r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN WelsIDctFourT4Rec_neon
+
+	vld1.u64		{d16}, [r2], r3
+	push			{r4}
+	ldr				r4, [sp, #4]
+	vld1.u64		{d17}, [r2], r3
+
+	vld4.s16		{d0, d1, d2, d3}, [r4]!		// cost 3 cycles!
+	vld1.u64		{d18}, [r2], r3
+	vld1.u64		{d19}, [r2], r3
+	vld4.s16		{d4, d5, d6, d7}, [r4]!		// cost 3 cycles!
+	vswp			d1, d4
+	vswp			d3, d6
+	vswp			q1, q2						// q0~q3
+
+	ROW_TRANSFORM_1_STEP_TOTAL_16BITS		q0, q1, q2, q3, q4, q5, q6, q7
+
+	TRANSFORM_TOTAL_16BITS		q0, q1, q2, q3, q4, q5, q6, q7
+
+	MATRIX_TRANSFORM_EACH_16BITS	q0, q1, q2, q3
+
+	ROW_TRANSFORM_1_STEP_TOTAL_16BITS		q0, q1, q2, q3, q4, q5, q6, q7
+
+	TRANSFORM_TOTAL_16BITS		q0, q1, q2, q3, q4, q5, q6, q7
+	vrshr.s16		q0, q0, #6
+	vrshr.s16		q1, q1, #6
+	vrshr.s16		q2, q2, #6
+	vrshr.s16		q3, q3, #6
+
+	//after rounding 6, clip into [0, 255]
+	vmovl.u8		q4,d16
+	vadd.s16		q0,q4
+	vqmovun.s16	d16,q0
+	vst1.u8		{d16},[r0],r1
+
+	vmovl.u8		q4,d17
+	vadd.s16		q1,q4
+	vqmovun.s16	d17,q1
+	vst1.u8		{d17},[r0],r1
+
+	vmovl.u8		q4,d18
+	vadd.s16		q2,q4
+	vqmovun.s16	d18,q2
+	vst1.u8		{d18},[r0],r1
+
+	vmovl.u8		q4,d19
+	vadd.s16		q3,q4
+	vqmovun.s16	d19,q3
+	vst1.u8		{d19},[r0],r1
+
+	vld1.u64		{d16}, [r2], r3
+	vld1.u64		{d17}, [r2], r3
+
+	vld4.s16		{d0, d1, d2, d3}, [r4]!		// cost 3 cycles!
+	vld1.u64		{d18}, [r2], r3
+	vld1.u64		{d19}, [r2], r3
+	vld4.s16		{d4, d5, d6, d7}, [r4]!		// cost 3 cycles!
+	vswp			d1, d4
+	vswp			d3, d6
+	vswp			q1, q2						// q0~q3
+
+	ROW_TRANSFORM_1_STEP_TOTAL_16BITS		q0, q1, q2, q3, q4, q5, q6, q7
+
+	TRANSFORM_TOTAL_16BITS		q0, q1, q2, q3, q4, q5, q6, q7
+
+	MATRIX_TRANSFORM_EACH_16BITS	q0, q1, q2, q3
+
+	ROW_TRANSFORM_1_STEP_TOTAL_16BITS		q0, q1, q2, q3, q4, q5, q6, q7
+
+	TRANSFORM_TOTAL_16BITS		q0, q1, q2, q3, q4, q5, q6, q7
+	vrshr.s16		q0, q0, #6
+	vrshr.s16		q1, q1, #6
+	vrshr.s16		q2, q2, #6
+	vrshr.s16		q3, q3, #6
+
+	//after rounding 6, clip into [0, 255]
+	vmovl.u8		q4,d16
+	vadd.s16		q0,q4
+	vqmovun.s16	d16,q0
+	vst1.u8		{d16},[r0],r1
+
+	vmovl.u8		q4,d17
+	vadd.s16		q1,q4
+	vqmovun.s16	d17,q1
+	vst1.u8		{d17},[r0],r1
+
+	vmovl.u8		q4,d18
+	vadd.s16		q2,q4
+	vqmovun.s16	d18,q2
+	vst1.u8		{d18},[r0],r1
+
+	vmovl.u8		q4,d19
+	vadd.s16		q3,q4
+	vqmovun.s16	d19,q3
+	vst1.u8		{d19},[r0],r1
+
+	pop			{r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN WelsIDctRecI16x16Dc_neon
+	push		{r4}
+	ldr			r4, [sp, #4]
+
+	vld1.s16	{q8,q9}, [r4]
+	vrshr.s16		q8, q8, #6
+	vrshr.s16		q9, q9, #6
+
+	vdup.s16	d20, d16[0]
+	vdup.s16	d21, d16[1]
+	vdup.s16	d22, d16[2]
+	vdup.s16	d23, d16[3]
+
+	vld1.u8	{q0}, [r2], r3
+	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
+	vst1.u8	{q0}, [r0], r1
+
+	vld1.u8	{q0}, [r2], r3
+	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
+	vst1.u8	{q0}, [r0], r1
+
+	vld1.u8	{q0}, [r2], r3
+	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
+	vst1.u8	{q0}, [r0], r1
+
+	vld1.u8	{q0}, [r2], r3
+	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
+	vst1.u8	{q0}, [r0], r1
+
+	vdup.s16	d20, d17[0]
+	vdup.s16	d21, d17[1]
+	vdup.s16	d22, d17[2]
+	vdup.s16	d23, d17[3]
+
+	vld1.u8	{q0}, [r2], r3
+	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
+	vst1.u8	{q0}, [r0], r1
+
+	vld1.u8	{q0}, [r2], r3
+	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
+	vst1.u8	{q0}, [r0], r1
+
+	vld1.u8	{q0}, [r2], r3
+	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
+	vst1.u8	{q0}, [r0], r1
+
+	vld1.u8	{q0}, [r2], r3
+	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
+	vst1.u8	{q0}, [r0], r1
+
+	vdup.s16	d20, d18[0]
+	vdup.s16	d21, d18[1]
+	vdup.s16	d22, d18[2]
+	vdup.s16	d23, d18[3]
+
+	vld1.u8	{q0}, [r2], r3
+	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
+	vst1.u8	{q0}, [r0], r1
+
+	vld1.u8	{q0}, [r2], r3
+	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
+	vst1.u8	{q0}, [r0], r1
+
+	vld1.u8	{q0}, [r2], r3
+	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
+	vst1.u8	{q0}, [r0], r1
+
+	vld1.u8	{q0}, [r2], r3
+	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
+	vst1.u8	{q0}, [r0], r1
+
+	vdup.s16	d20, d19[0]
+	vdup.s16	d21, d19[1]
+	vdup.s16	d22, d19[2]
+	vdup.s16	d23, d19[3]
+
+	vld1.u8	{q0}, [r2], r3
+	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
+	vst1.u8	{q0}, [r0], r1
+
+	vld1.u8	{q0}, [r2], r3
+	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
+	vst1.u8	{q0}, [r0], r1
+
+	vld1.u8	{q0}, [r2], r3
+	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
+	vst1.u8	{q0}, [r0], r1
+
+	vld1.u8	{q0}, [r2], r3
+	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
+	vst1.u8	{q0}, [r0], r1
+
+	pop			{r4}
+WELS_ASM_FUNC_END
+#endif
--- a/codec/encoder/core/inc/sample.h
+++ b/codec/encoder/core/inc/sample.h
@@ -110,6 +110,33 @@
 
 #endif//X86_ASM
 
+#if defined (HAVE_NEON)
+
+int32_t WelsSampleSad4x4_neon (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSad16x16_neon (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSad16x8_neon (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSad8x16_neon (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSad8x8_neon (uint8_t*, int32_t, uint8_t*, int32_t);
+
+void WelsSampleSadFour16x16_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
+void WelsSampleSadFour16x8_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
+void WelsSampleSadFour8x16_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
+void WelsSampleSadFour8x8_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
+void WelsSampleSadFour4x4_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
+
+int32_t WelsSampleSatd8x8_neon (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSatd16x8_neon (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSatd8x16_neon (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSatd16x16_neon (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSatd4x4_neon (uint8_t*, int32_t, uint8_t*, int32_t);
+
+int32_t WelsIntra16x16Combined3Satd_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*);
+int32_t WelsIntra16x16Combined3Sad_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*);
+int32_t WelsIntra8x8Combined3Satd_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*, uint8_t*, uint8_t*);
+int32_t WelsIntra8x8Combined3Sad_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*, uint8_t*, uint8_t*);
+int32_t WelsIntra4x4Combined3Satd_neon (uint8_t*, int32_t, uint8_t*, int32_t, uint8_t*, int32_t*, int32_t, int32_t, int32_t);
+
+#endif
 
 #if defined(__cplusplus)
 }
--- a/codec/encoder/core/src/sample.cpp
+++ b/codec/encoder/core/src/sample.cpp
@@ -482,6 +482,33 @@
 
 #endif //(X86_ASM)
 
+#if defined (HAVE_NEON)
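+  // Route the SAD/SATD and intra-cost function pointers to the NEON versions
+  // when the runtime CPU flag reports NEON support.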
+  if (uiCpuFlag & WELS_CPU_NEON) {
+    pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_4x4  ] = WelsSampleSad4x4_neon;
+    pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16] = WelsSampleSad16x16_neon;
+    pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x8 ] = WelsSampleSad16x8_neon;
+    pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x16] = WelsSampleSad8x16_neon;
+    pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8] = WelsSampleSad8x8_neon;
+
+    pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x16] = WelsSampleSadFour16x16_neon;
+    pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x8] = WelsSampleSadFour16x8_neon;
+    pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x16] = WelsSampleSadFour8x16_neon;
+    pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x8] = WelsSampleSadFour8x8_neon;
+    pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_4x4] = WelsSampleSadFour4x4_neon;
+
+    pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x4  ] = WelsSampleSatd4x4_neon;
+    pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x8  ] = WelsSampleSatd8x8_neon;
+    pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16 ] = WelsSampleSatd8x16_neon;
+    pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x8 ] = WelsSampleSatd16x8_neon;
+    pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] = WelsSampleSatd16x16_neon;
+
+    pFuncList->sSampleDealingFuncs.pfIntra4x4Combined3Satd   = WelsIntra4x4Combined3Satd_neon;
+    pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Satd   = WelsIntra8x8Combined3Satd_neon;
+    pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Sad    = WelsIntra8x8Combined3Sad_neon;
+    pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Satd = WelsIntra16x16Combined3Satd_neon;
+    pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Sad  = WelsIntra16x16Combined3Sad_neon;
+  }
+#endif
 }
 
 } // namespace WelsSVCEnc
--- a/codec/processing/src/adaptivequantization/AdaptiveQuantization.cpp
+++ b/codec/processing/src/adaptivequantization/AdaptiveQuantization.cpp
@@ -231,6 +231,11 @@
     pfVar = SampleVariance16x16_sse2;
   }
 #endif
+#ifdef HAVE_NEON
+  if (iCpuFlag & WELS_CPU_NEON) {
+    pfVar = SampleVariance16x16_neon;
+  }
+#endif
 }
 
 void SampleVariance16x16_c (uint8_t* pRefY, int32_t iRefStride, uint8_t* pSrcY, int32_t iSrcStride,
--- a/codec/processing/src/adaptivequantization/AdaptiveQuantization.h
+++ b/codec/processing/src/adaptivequantization/AdaptiveQuantization.h
@@ -62,6 +62,11 @@
 WELSVP_EXTERN_C_END
 #endif
 
+#ifdef HAVE_NEON
+WELSVP_EXTERN_C_BEGIN
+VarFunc      SampleVariance16x16_neon;
+WELSVP_EXTERN_C_END
+#endif
 
 class CAdaptiveQuantization : public IStrategy {
  public:
--- a/codec/processing/src/arm/adaptive_quantization.S
+++ b/codec/processing/src/arm/adaptive_quantization.S
@@ -35,7 +35,7 @@
 #include "arm_arch_common_macro.S"
 
 #ifdef APPLE_IOS
-.macro SQR_ADD_16BYTES 
+.macro SQR_ADD_16BYTES
 	vmull.u8 q3, $0, $0
 	vmull.u8 q8, $1, $1
 	vpadal.u16 $2, q3
@@ -51,23 +51,23 @@
 #endif
 
 
-WELS_ASM_FUNC_BEGIN pixel_var_16x16_neon
+WELS_ASM_FUNC_BEGIN SampleVariance16x16_neon
     stmdb sp!, {r4}
 
 	vld1.8   {q15}, [r0], r1 //save the ref data (16bytes)
 	vld1.8   {q14}, [r2], r3 //save the src data (16bytes)
-	
-	
-	vabd.u8  q13, q14, q15 
+
+
+	vabd.u8  q13, q14, q15
 	vmull.u8 q12, d27, d27
 	vmull.u8 q11, d26, d26
 	vaddl.u16 q12, d24, d25
 	vpadal.u16 q12, q11     //sqr
 
-    vaddl.u8 q13, d26, d27 //sum   
-			 
+    vaddl.u8 q13, d26, d27 //sum
+
 	vaddl.u8 q10, d28, d29 //sum_cur
-	
+
 	vmull.u8 q9,  d29, d29
 	vmull.u8 q8,  d28, d28
 	vaddl.u16 q9, d18, d19       //sqr_cur
@@ -78,35 +78,35 @@
 
 	vld1.8 {q0}, [r0], r1 //save the ref data (16bytes)
 	vld1.8 {q1}, [r2], r3 //save the src data (16bytes)
-    
+
 	vabd.u8 q2, q0, q1
-		
+
 	//q10 save sum_cur
 	vpadal.u8 q10, q1
 
 	//q12 save sqr
 	SQR_ADD_16BYTES d4, d5, q12
-	
+
     //q13 save sum
 	vpadal.u8 q13, q2
 
 	subs r4, #1
-	
-	//q9 save sqr_cur	
-	SQR_ADD_16BYTES d2, d3, q9	
-	
-	bne pixel_var_16x16_loop0		
-	
+
+	//q9 save sqr_cur
+	SQR_ADD_16BYTES d2, d3, q9
+
+	bne pixel_var_16x16_loop0
+
 	vadd.u16 d0, d26, d27 //sum
-	vadd.u16 d1, d20, d21 //sum_cur	 
+	vadd.u16 d1, d20, d21 //sum_cur
 	vpaddl.u16 q0, q0
 	vadd.u32 d2, d24, d25 //sqr
 	vadd.u32 d3, d18, d19 //sqr_cur
 	vpadd.u32 d0, d0, d1
 	vpadd.u32 d1, d2, d3
-	
+
 	ldr       r4, [sp, #4]
-	
+
 	vshr.u32  q0, q0, #8
 	vmul.u32  d0, d0
 	vsub.u32  d0, d1, d0
@@ -117,4 +117,4 @@
 
 WELS_ASM_FUNC_END
 
-#endif
\ No newline at end of file
+#endif
--- a/codec/processing/src/arm/down_sample_neon.S
+++ b/codec/processing/src/arm/down_sample_neon.S
@@ -35,29 +35,29 @@
 #include "arm_arch_common_macro.S"
 
 
-WELS_ASM_FUNC_BEGIN	comp_ds_bilinear_neon
+WELS_ASM_FUNC_BEGIN	DyadicBilinearDownsampler_neon
 	stmdb	sp!, {r4-r8, lr}
-	
+
 	//Get	the	width	and	height
 	ldr	 r4, [sp,	#24]	//src_width
 	ldr	 r5, [sp,	#28]	//src_height
-	
+
 	//Initialize the register
 	mov	r6,	r2
 	mov	r8,	r0
 	mov	lr,	#0
-	lsr	r5,	#1	
-	
+	lsr	r5,	#1
+
 	//Save the tailer	for	the	unasigned	size
 	mla	 r7, r1, r5, r0
 	vld1.32	{q15}, [r7]
-	
+
 	add	r7,	r2,	r3
 	//processing a colume	data
-comp_ds_bilinear_loop0:	
+comp_ds_bilinear_loop0:
 
 	vld1.8 {q0,q1},	[r2]!
-	vld1.8 {q2,q3},	[r7]!			
+	vld1.8 {q2,q3},	[r7]!
 	vpaddl.u8	q0,	q0
 	vpaddl.u8	q1,	q1
 	vpaddl.u8	q2,	q2
@@ -70,9 +70,9 @@
 	vrhadd.u16 q1, q3
 	vmovn.u16	d0,	q0
 	vmovn.u16	d1,	q1
-	vst1.32	{q0},	[r0]!	
+	vst1.32	{q0},	[r0]!
 	add	lr,	#32
-	
+
 	cmp	lr,	r4
 	movcs	lr,	#0
 	addcs	r6,	r3,	lsl	#1
@@ -82,10 +82,10 @@
 	movcs	r0,	r8
 	subscs r5, #1
 	bne	comp_ds_bilinear_loop0
-	
+
 	//restore	the	tailer for the unasigned size
 	vst1.32	{q15}, [r0]
-	
+
 	ldmia	sp!, {r4-r8,lr}
 WELS_ASM_FUNC_END
 
@@ -96,29 +96,29 @@
     //Get	the	width	and	height
 	ldr	 r4, [sp,	#20]	//src_width
 	ldr	 r5, [sp,	#24]	//src_height
-	
+
 	//Get	the	difference
-	sub	lr,	r3,	r4 
+	sub	lr,	r3,	r4
 	sub	r1,	r1,	r4,	lsr	#1
-	
+
 	lsr	r5,	#1
-	
+
 	//processing a colume	data
-comp_ds_bilinear_w_x8_loop0:	
-	
+comp_ds_bilinear_w_x8_loop0:
+
 	lsr	r6,	r4,	#3
 	add	r7,	r2,	r3
 	//processing a line	data
 comp_ds_bilinear_w_x8_loop1:
-		
+
 	vld1.8 {d0}, [r2]!
-	vld1.8 {d1}, [r7]!			
+	vld1.8 {d1}, [r7]!
 	vpaddl.u8	q0,	q0
 	vrshr.u16	q0,	#1
 	vrhadd.u16 d0, d1
-	
+
 	vmovn.u16	d0,	q0
-	vst1.32	{d0[0]}, [r0]!		
+	vst1.32	{d0[0]}, [r0]!
 	subs r6, #1
 	bne	comp_ds_bilinear_w_x8_loop1
 
@@ -126,7 +126,7 @@
 	add	r0,	r1
 	subs r5, #1
 	bne	comp_ds_bilinear_w_x8_loop0
-	
+
     ldmia	sp!, {r4-r7,lr}
 WELS_ASM_FUNC_END
 
@@ -137,31 +137,31 @@
     //Get	the	width	and	height
 	ldr	 r4, [sp,	#20]	//src_width
 	ldr	 r5, [sp,	#24]	//src_height
-	
+
 	//Get	the	difference
-	sub	lr,	r3,	r4 
+	sub	lr,	r3,	r4
 	sub	r1,	r1,	r4,	lsr	#1
-	
+
 	lsr	r5,	#1
-	
+
 	//processing a colume	data
-comp_ds_bilinear_w_x16_loop0:	
-	
+comp_ds_bilinear_w_x16_loop0:
+
 	lsr	r6,	r4,	#4
 	add	r7,	r2,	r3
 	//processing a line	data
 comp_ds_bilinear_w_x16_loop1:
-		
+
 	vld1.8 {q0}, [r2]!
-	vld1.8 {q1}, [r7]!			
+	vld1.8 {q1}, [r7]!
 	vpaddl.u8	q0,	q0
 	vpaddl.u8	q1,	q1
 	vrshr.u16	q0,	#1
 	vrshr.u16	q1,	#1
 	vrhadd.u16 q0, q1
-	
+
 	vmovn.u16	d0,	q0
-	vst1.32	{d0},	[r0]!		
+	vst1.32	{d0},	[r0]!
 	subs r6, #1
 	bne	comp_ds_bilinear_w_x16_loop1
 
@@ -169,34 +169,34 @@
 	add	r0,	r1
 	subs r5, #1
 	bne	comp_ds_bilinear_w_x16_loop0
-	
+
 	ldmia	sp!, {r4-r7,lr}
 WELS_ASM_FUNC_END
 
 
-WELS_ASM_FUNC_BEGIN	comp_ds_bilinear_w_x32_neon
+WELS_ASM_FUNC_BEGIN	DyadicBilinearDownsamplerWidthx32_neon
 	stmdb	sp!, {r4-r7, lr}
 
 	//Get	the	width	and	height
 	ldr	 r4, [sp,	#20]	//src_width
 	ldr	 r5, [sp,	#24]	//src_height
-	
+
 	//Get	the	difference
-	sub	lr,	r3,	r4 
+	sub	lr,	r3,	r4
 	sub	r1,	r1,	r4,	lsr	#1
-	
+
 	lsr	r5,	#1
-	
+
 	//processing a colume	data
-comp_ds_bilinear_w_x32_loop0:	
-	
+comp_ds_bilinear_w_x32_loop0:
+
 	lsr	r6,	r4,	#5
 	add	r7,	r2,	r3
 	//processing a line	data
 comp_ds_bilinear_w_x32_loop1:
-		
+
 	vld1.8 {q0,q1},	[r2]!
-	vld1.8 {q2,q3},	[r7]!			
+	vld1.8 {q2,q3},	[r7]!
 	vpaddl.u8	q0,	q0
 	vpaddl.u8	q1,	q1
 	vpaddl.u8	q2,	q2
@@ -207,10 +207,10 @@
 	vrshr.u16	q3,	#1
 	vrhadd.u16 q0, q2
 	vrhadd.u16 q1, q3
-		
+
 	vmovn.u16	d0,	q0
 	vmovn.u16	d1,	q1
-	vst1.32	{q0},	[r0]!		
+	vst1.32	{q0},	[r0]!
 	subs r6, #1
 	bne	comp_ds_bilinear_w_x32_loop1
 
@@ -218,14 +218,14 @@
 	add	r0,	r1
 	subs r5, #1
 	bne	comp_ds_bilinear_w_x32_loop0
-	
+
 	ldmia	sp!, {r4-r7,lr}
 WELS_ASM_FUNC_END
 
 
-WELS_ASM_FUNC_BEGIN general_ds_bilinear_accurate_neon
+WELS_ASM_FUNC_BEGIN GeneralBilinearAccurateDownsampler_neon
     stmdb sp!, {r4-r12, lr}
-    
+
 	//Get the data from stack
 	ldr r4, [sp, #40] //the addr of src
 	ldr r5, [sp, #44] //the value of src_stride
@@ -245,11 +245,11 @@
 	and		r9, r7, r10			// r9 vinc(scaleY mod 32767)
     mov     r11, #-1
 	mul		r11, r9			// r11 -vinc
-	
+
 	vdup.s16 d2, r9
 	vdup.s16 d3, r11
 	vext.8   d5, d3, d2, #4		// vinc vinc -vinc -vinc
-	
+
     mov		 r11, #0x40000000
     mov      r12, #0x4000
     sub      r12, #1
@@ -261,13 +261,13 @@
     sub      r11, #1
 	vdup.s16 d9, r11
 	vext.8	 d7, d9, d8, #4		//init v  16384 16384 16383 16383
-	
-	veor    q14,     q14	
-	sub		r1,		r2			// stride - width			
+
+	veor    q14,     q14
+	sub		r1,		r2			// stride - width
 	mov		r8,		#16384		// yInverse
 	sub		r3,		#1
-	
-_HEIGHT:	
+
+_HEIGHT:
     ldr     r4, [sp, #40]           //the addr of src
     mov		r11,	r8
     lsr		r11,	#15
@@ -275,8 +275,8 @@
 	add		r11,	r4					// get current row address
 	mov		r12,	r11
 	add		r12,	r5
-	
-	mov		r9,		#16384				// xInverse	
+
+	mov		r9,		#16384				// xInverse
 	sub		r10, r2, #1
     vmov.s16 d6, d1
 
@@ -288,8 +288,8 @@
     add     r4,     r12,lr
 	vld2.8	{d28[4],d29[4]},	[r4]		//q14: 000d000b000c000a;
 	vzip.32		d28, d29					//q14: 000d000c000b000a;
-	
-	vmull.u16	q13, d6, d7			//q13: init u  *  init  v	
+
+	vmull.u16	q13, d6, d7			//q13: init u  *  init  v
 	vmull.u32	q12, d26,d28
 	vmlal.u32	q12, d27,d29
 	vqadd.u64	d24, d24,d25
@@ -296,13 +296,13 @@
 	vrshr.u64	d24, #30
 
 	vst1.8	{d24[0]},	[r0]!
-	add		r9,	r6	
+	add		r9,	r6
 	vadd.u16	d6, d0				// inc u
 	vshl.u16	d6, #1
 	vshr.u16	d6, #1
 	subs	r10, #1
 	bne		_WIDTH
-	
+
 WIDTH_END:
     lsr		r9,		#15
     add     r4,r11,r9
@@ -317,26 +317,26 @@
 	subs	r3,		#1
 	bne		_HEIGHT
 
-LAST_ROW:	
+LAST_ROW:
     ldr     r4, [sp, #40]           //the addr of src
     lsr		r8,	#15
 	mul		r8, r5
-	add		r4,	r8					// get current row address	
+	add		r4,	r8					// get current row address
 	mov		r9,		#16384
 
 _LAST_ROW_WIDTH:
 	mov		r11,	r9
     lsr		r11,	#15
-    
+
 	add     r3,     r4,r11
 	vld1.8	{d0[0]},	[r3]
-	vst1.8	{d0[0]},	[r0]	
-	add		r0,		#1	
-	add		r9,		r6	
+	vst1.8	{d0[0]},	[r0]
+	add		r0,		#1
+	add		r9,		r6
 	subs	r2,		#1
 	bne		_LAST_ROW_WIDTH
-	
+
 	ldmia sp!, {r4-r12, lr}
 WELS_ASM_FUNC_END
 
-#endif
\ No newline at end of file
+#endif
--- a/codec/processing/src/arm/pixel_sad_neon.S
+++ b/codec/processing/src/arm/pixel_sad_neon.S
@@ -35,24 +35,24 @@
 #include "arm_arch_common_macro.S"
 
 
-WELS_ASM_FUNC_BEGIN pixel_sad_8x8_neon
+WELS_ASM_FUNC_BEGIN WelsSampleSad8x8_neon
     stmdb sp!, {lr}
 	//Loading a horizontal line data (8 bytes)
-	vld1.8 {d0}, [r0], r1 
+	vld1.8 {d0}, [r0], r1
 	vld1.8 {d1}, [r2], r3
-    
+
 	//Do the SAD for 8 bytes
 	vabdl.u8  q1, d0, d1
-	
+
 	mov lr, #7
 pixel_sad_8x8_loop0:
 
     //Loading a horizontal line data (8 bytes)
-	vld1.8 {d0}, [r0], r1 
+	vld1.8 {d0}, [r0], r1
 	vld1.8 {d1}, [r2], r3
 
 	subs lr, #1
-	
+
 	//Do the SAD for 8 bytes
 	vabal.u8  q1, d0, d1
 	bne pixel_sad_8x8_loop0
@@ -65,4 +65,4 @@
 	ldmia sp!, {lr}
 WELS_ASM_FUNC_END
 
-#endif
\ No newline at end of file
+#endif
--- a/codec/processing/src/arm/vaa_calc_neon.S
+++ b/codec/processing/src/arm/vaa_calc_neon.S
@@ -36,7 +36,7 @@
 
 #ifdef APPLE_IOS
 
-.macro ABS_SUB_SUM_16BYTES 
+.macro ABS_SUB_SUM_16BYTES
 	vld1.32 {q15}, [$0], $2
 	vld1.32 {q14}, [$1], $2
 	vabal.u8 $3, d30, d28
@@ -43,22 +43,22 @@
 	vabal.u8 $4, d31, d29
 .endm
 
-.macro ABS_SUB_SUM_8x16BYTES 
+.macro ABS_SUB_SUM_8x16BYTES
 	vld1.32 {q15}, [$0], $2
 	vld1.32 {q14}, [$1], $2
 	vabdl.u8 $3, d30, d28
 	vabdl.u8 $4, d31, d29
-	
+
 	ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
 	ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
 	ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
 	ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
-	ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4			
 	ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
 	ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
+	ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
 .endm
 
-.macro SAD_8X16BITS 
+.macro SAD_8X16BITS
 	vadd.u16 d31, $0, $1
 	vpaddl.u16 d31, d31
 	vpaddl.u32 $2, d31
@@ -73,19 +73,19 @@
 	vabal.u8 \arg4, d31, d29
 .endm
 
-.macro ABS_SUB_SUM_8x16BYTES arg0, arg1, arg2, arg3, arg4 
+.macro ABS_SUB_SUM_8x16BYTES arg0, arg1, arg2, arg3, arg4
 	vld1.32 {q15}, [\arg0], \arg2
 	vld1.32 {q14}, [\arg1], \arg2
 	vabdl.u8 \arg3, d30, d28
 	vabdl.u8 \arg4, d31, d29
-	
+
 	ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
 	ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
 	ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
 	ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
-	ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4			
 	ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
 	ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
+	ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
 .endm
 
 .macro SAD_8X16BITS arg0, arg1, arg2
@@ -96,67 +96,67 @@
 #endif
 
 
-WELS_ASM_FUNC_BEGIN vaa_calc_sad_neon
+WELS_ASM_FUNC_BEGIN VAACalcSad_neon
 
-    stmdb sp!, {r4-r8}	
-	
+    stmdb sp!, {r4-r8}
+
 	ldr r4, [sp, #20] //load pic_stride
 	ldr r5, [sp, #28] //load psad8x8
-   
+
 	//Initial the Q4 register for save the "psadframe"
 	vmov.s64 q4, #0
-	
+
 	//Get the jump distance to use on loop codes
 	lsl r8, r4, #4
 	sub r7, r8, #16 //R7 keep the 16*pic_stride-16
 	sub r8, r2      //R8 keep the 16*pic_stride-pic_width
-	
+
 vaa_calc_sad_loop0:
 
     //R6 keep the pic_width
     mov r6, r2
-	
-vaa_calc_sad_loop1:    
 
+vaa_calc_sad_loop1:
+
 	//Process the 16x16 bytes
 	ABS_SUB_SUM_8x16BYTES r0, r1, r4, q0, q1
 	ABS_SUB_SUM_8x16BYTES r0, r1, r4, q2, q3
-	
+
 	//Do the SAD
 	SAD_8X16BITS d0, d1, d0
 	SAD_8X16BITS d2, d3, d1
 	SAD_8X16BITS d4, d5, d2
-	SAD_8X16BITS d6, d7, d3	
-	
+	SAD_8X16BITS d6, d7, d3
+
 	//Write to "psad8x8" buffer
-	vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r5]! 
-	
+	vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r5]!
 
+
 	//Adjust the input address
 	sub r0, r7
 	sub r1, r7
-								  				  			      
+
 	subs r6, #16
-	
-	//Save to calculate "psadframe"	
+
+	//Save to calculate "psadframe"
 	vadd.u32 q0, q1
 	vadd.u32 q4, q0
-	
+
 	bne vaa_calc_sad_loop1
-	
+
 	//Adjust the input address
 	add r0, r8
 	add r1, r8
-	
+
     subs r3, #16
-	bne vaa_calc_sad_loop0			
-	
+	bne vaa_calc_sad_loop0
+
 	ldr r6, [sp, #24] //load psadframe
 	vadd.u32 d8, d9
 	vst1.32 {d8[0]}, [r6]
-	
+
 	ldmia sp!, {r4-r8}
-													
+
 WELS_ASM_FUNC_END
 
 
@@ -164,12 +164,12 @@
 .macro  SAD_SD_MAD_16BYTES
 	vld1.32 {q0}, [$0], $2
 	vld1.32 {q1}, [$1], $2
-	
+
 	vpadal.u8 $3, q0
 	vpadal.u8 $4, q1
-	
-	vabd.u8 q0, q0, q1       
-	vmax.u8 $5, q0 
+
+	vabd.u8 q0, q0, q1
+	vmax.u8 $5, q0
 	vpadal.u8 $6, q0
 .endm
 
@@ -177,13 +177,13 @@
 	vld1.32 {q0}, [$0], $2
 	vld1.32 {q1}, [$1], $2
 
-	vpaddl.u8 q2, q0   
+	vpaddl.u8 q2, q0
 	vpaddl.u8 q3, q1
-	
-	vabd.u8 $3, q0, q1 
+
+	vabd.u8 $3, q0, q1
 	vpaddl.u8 $4, $3       //abs_diff
 
-		
+
     SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
     SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
     SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
@@ -191,7 +191,7 @@
     SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
     SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
     SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
-	
+
 	vsub.u16 $5, q2, q3
 .endm
 
@@ -203,18 +203,18 @@
 	vpaddl.u16 $3, $3
 	vpaddl.u32 $3, $3
 	vpaddl.s16 $4, $4
-	vpaddl.s32 $4, $4	
+	vpaddl.s32 $4, $4
 .endm
 #else
-.macro  SAD_SD_MAD_16BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6   
+.macro  SAD_SD_MAD_16BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6
 	vld1.32 {q0}, [\arg0], \arg2
 	vld1.32 {q1}, [\arg1], \arg2
-	
+
 	vpadal.u8 \arg3, q0
 	vpadal.u8 \arg4, q1
-	
-	vabd.u8 q0, q0, q1       
-	vmax.u8 \arg5, q0 
+
+	vabd.u8 q0, q0, q1
+	vmax.u8 \arg5, q0
 	vpadal.u8 \arg6, q0
 .endm
 
@@ -222,13 +222,13 @@
 	vld1.32 {q0}, [\arg0], \arg2
 	vld1.32 {q1}, [\arg1], \arg2
 
-	vpaddl.u8 q2, q0   
+	vpaddl.u8 q2, q0
 	vpaddl.u8 q3, q1
-	
-	vabd.u8 \arg3, q0, q1 
+
+	vabd.u8 \arg3, q0, q1
 	vpaddl.u8 \arg4, \arg3       //abs_diff
 
-		
+
     SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4
     SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4
     SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4
@@ -236,7 +236,7 @@
     SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4
     SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4
     SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4
-	
+
 	vsub.u16 \arg5, q2, q3
 .endm
 
@@ -248,69 +248,69 @@
 	vpaddl.u16 \arg3, \arg3
 	vpaddl.u32 \arg3, \arg3
 	vpaddl.s16 \arg4, \arg4
-	vpaddl.s32 \arg4, \arg4	
+	vpaddl.s32 \arg4, \arg4
 .endm
 #endif
 
-WELS_ASM_FUNC_BEGIN vaa_calc_sad_bgd_neon
+WELS_ASM_FUNC_BEGIN VAACalcSadBgd_neon
 
     stmdb sp!, {r4-r10}
-	
+
 	ldr r4, [sp, #28] //load pic_stride
 	ldr r5, [sp, #36] //load psad8x8
     ldr r6, [sp, #40] //load psd8x8
     ldr r7, [sp, #44] //load pmad8x8
-   
+
 	//Initial the Q4 register for save the "psadframe"
 	vmov.s64 q15, #0
-	
+
 	//Get the jump distance to use on loop codes
 	lsl r10, r4, #4
 	sub r9, r10, #16 //R9 keep the 16*pic_stride-16
 	sub r10, r2      //R10 keep the 16*pic_stride-pic_width
-	
+
 vaa_calc_sad_bgd_loop0:
 
     //R6 keep the pic_width
     mov r8, r2
-	
-vaa_calc_sad_bgd_loop1:    
 
+vaa_calc_sad_bgd_loop1:
+
 	//Process the 16x16 bytes        pmad psad psd
 	SAD_SD_MAD_8x16BYTES r0, r1, r4, q13, q11, q9
 	SAD_SD_MAD_8x16BYTES r0, r1, r4, q14, q12, q10
-	
+
     SAD_SD_MAD_CALC d26, d27, d16, q11, q9
-    SAD_SD_MAD_CALC d28, d29, d17, q12, q10	
+    SAD_SD_MAD_CALC d28, d29, d17, q12, q10
 
 	//Write to "psad8x8" buffer
-	vst4.32 {d22[0],d23[0],d24[0],d25[0]}, [r5]! 
+	vst4.32 {d22[0],d23[0],d24[0],d25[0]}, [r5]!
 	//Adjust the input address
 	sub r0, r9
 	sub r1, r9
 	//Write to "psd8x8" buffer
-	vst4.32 {d18[0],d19[0],d20[0],d21[0]}, [r6]! 
+	vst4.32 {d18[0],d19[0],d20[0],d21[0]}, [r6]!
 	subs r8, #16
 	//Write to "pmad8x8" buffer
-	vst2.16 {d16[0],d17[0]}, [r7]! 
-	//Save to calculate "psadframe"	
+	vst2.16 {d16[0],d17[0]}, [r7]!
+	//Save to calculate "psadframe"
 	vadd.u32 q11, q12
 	vadd.u32 q15, q11
-	
+
 	bne vaa_calc_sad_bgd_loop1
-	
+
 	//Adjust the input address
 	add r0, r10
 	add r1, r10
-	
+
     subs r3, #16
-	bne vaa_calc_sad_bgd_loop0			
-	
+	bne vaa_calc_sad_bgd_loop0
+
 	ldr r8, [sp, #32] //load psadframe
 	vadd.u32 d30, d31
-	vst1.32 {d30[0]}, [r8]	
+	vst1.32 {d30[0]}, [r8]
 	ldmia sp!, {r4-r10}
-													
+
 WELS_ASM_FUNC_END
 
 
@@ -318,7 +318,7 @@
 .macro  SSD_MUL_SUM_16BYTES_RESET
 	vmull.u8 $3, $0, $0
 	vpaddl.u16 $2, $3
-	
+
 	vmull.u8 $3, $1, $1
 	vpadal.u16 $2, $3
 .endm
@@ -326,7 +326,7 @@
 .macro  SSD_MUL_SUM_16BYTES
 	vmull.u8 $3, $0, $0
 	vpadal.u16 $2, $3
-	
+
 	vmull.u8 $3, $1, $1
 	vpadal.u16 $2, $3
 .endm
@@ -333,21 +333,21 @@
 
 .macro SAD_SSD_BGD_16
 	vld1.8 {q0}, [$0], $2 //load cur_row
-	
+
 	vpadal.u8 q3, q0	//add cur_row together
 	vpadal.u8 q4, q1	//add ref_row together
-	
+
 	vabd.u8 q2, q0, q1	//abs_diff
-	
+
 	vmax.u8 q5, q2								//l_mad for 16 bytes reset for every 8x16
-	
+
 	vpadal.u8 $3, q2							//l_sad for 16 bytes reset for every 8x16
-	
+
 	SSD_MUL_SUM_16BYTES d4,d5, q8, q11			//q8 for l_sqiff	reset for every 16x16
-	
+
 	vld1.8 {q1}, [$1], $2 //load ref_row
 	vpadal.u8 q9, q0							//q9 for l_sum		reset for every 16x16
-	
+
 	SSD_MUL_SUM_16BYTES d0,d1, q10, q11			//q10 for lsqsum	reset for every 16x16
 .endm
 
@@ -354,20 +354,20 @@
 //the last row of a 16x16 block
 .macro SAD_SSD_BGD_16_end
 	vld1.8 {q0}, [$0], $1 //load cur_row
-	
+
 	vpadal.u8 q3, q0	//add cur_row together
 	vpadal.u8 q4, q1	//add ref_row together
-	
+
 	vabd.u8 q2, q0, q1	//abs_diff
-	
+
 	vmax.u8 q5, q2								//l_mad for 16 bytes reset for every 8x16
-	
+
 	vpadal.u8 $2, q2							//l_sad for 16 bytes reset for every 8x16
-	
+
 	SSD_MUL_SUM_16BYTES d4,d5, q8, q11			//q8 for l_sqiff	reset for every 16x16
-	
+
 	vpadal.u8 q9, q0							//q9 for l_sum		reset for every 16x16
-	
+
 	SSD_MUL_SUM_16BYTES d0,d1, q10, q11			//q10 for lsqsum	reset for every 16x16
 .endm
 
@@ -374,23 +374,23 @@
 //for the begin of a 8x16 block, use some instructions to reset the register
 .macro SAD_SSD_BGD_16_RESET_8x8
 	vld1.8 {q0}, [$0], $2 //load cur_row
-	
+
 	vpaddl.u8 q3, q0	//add cur_row together
 	vpaddl.u8 q4, q1	//add ref_row together
-	
+
 	vabd.u8 q2, q0, q1	//abs_diff
-	
+
 	vmov q5,q2         //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
-	
+
 	vpaddl.u8 $3, q2							//l_sad for 16 bytes reset for every 8x16
-	
-	
+
+
 	SSD_MUL_SUM_16BYTES d4,d5, q8, q11			//q8 for l_sqiff	reset for every 16x16
-	
+
 	vld1.8 {q1}, [$1], $2 //load ref_row
-	
+
 	vpadal.u8 q9, q0							//q9 for l_sum		reset for every 16x16
-	
+
 	SSD_MUL_SUM_16BYTES d0,d1, q10, q11			//q10 for lsqsum	reset for every 16x16
 .endm
 
@@ -398,18 +398,18 @@
 .macro SAD_SSD_BGD_16_RESET_16x16
 	vld1.8 {q0}, [$0], $2 //load cur_row
 	vld1.8 {q1}, [$1], $2 //load ref_row
-	
+
 	vpaddl.u8 q3, q0	//add cur_row together
 	vpaddl.u8 q4, q1	//add ref_row together
-	
+
 	vabd.u8 q2, q0, q1	//abs_diff
-	
+
 	vmov q5,q2         //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
-	
+
 	vpaddl.u8 $3, q2							//l_sad for 16 bytes reset for every 8x16
-	
+
 	SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11			//q8 for l_sqiff	reset for every 16x16
-	
+
 	vld1.8 {q1}, [$1], $2 //load ref_row
 
 	vpaddl.u8 q9, q0								//q9 for l_sum		reset for every 16x16
@@ -419,24 +419,24 @@
 
 //for each 8x16 block
 .macro SAD_SSD_BGD_CALC_8x16
-	
+
 	vpmax.u8 d10, d10, d11 //4 numbers
 	vpmax.u8 d10, d10, d10 //2 numbers
 	vpmax.u8 d10, d10, d10 //1 number1
-	
+
 	vmov $0, d10			//d26 d27 keeps the l_mad
-	
+
 	//p_sd8x8			fix me
-	vpaddl.u16 q3, q3			
+	vpaddl.u16 q3, q3
 	vpaddl.u16 q4, q4
-	
+
 	vsub.i32 $1, q3, q4
 	vpaddl.u32 $1, $1
-	
+
 	//psad8x8
 	vpaddl.u16 $2, $2
 	vpaddl.u32 $2, $2
-	
+
 	//psadframe
 	vadd.i32 q12, $2
 .endm
@@ -451,9 +451,9 @@
 	SAD_SSD_BGD_16 $0, $1, $2, q6
 	SAD_SSD_BGD_16 $0, $1, $2, q6
 	SAD_SSD_BGD_16 $0, $1, $2, q6
-	
+
 	SAD_SSD_BGD_CALC_8x16 d26, q14, q6
-	
+
 	//for another 8x16
 	SAD_SSD_BGD_16_RESET_8x8 $0, $1, $2, q7
 	SAD_SSD_BGD_16 $0, $1, $2, q7
@@ -463,20 +463,20 @@
 	SAD_SSD_BGD_16 $0, $1, $2, q7
 	SAD_SSD_BGD_16 $0, $1, $2, q7
 	SAD_SSD_BGD_16_end $0, $2, q7
-	
+
 	SAD_SSD_BGD_CALC_8x16 d27, q15, q7
 .endm
 
-.macro  SSD_SAD_SD_MAD_PADDL    
+.macro  SSD_SAD_SD_MAD_PADDL
 	vpaddl.s16 $0, $0
-	vpaddl.s32 $0, $0	
-	vadd.i32 $1, $1, $2	
+	vpaddl.s32 $0, $0
+	vadd.i32 $1, $1, $2
 .endm
 #else
 .macro  SSD_MUL_SUM_16BYTES_RESET arg0, arg1, arg2, arg3
 	vmull.u8   \arg3, \arg0, \arg0
 	vpaddl.u16 \arg2, \arg3
-	
+
 	vmull.u8   \arg3, \arg1, \arg1
 	vpadal.u16 \arg2, \arg3
 .endm
@@ -484,7 +484,7 @@
 .macro  SSD_MUL_SUM_16BYTES arg0, arg1, arg2, arg3
 	vmull.u8   \arg3, \arg0, \arg0
 	vpadal.u16 \arg2, \arg3
-	
+
 	vmull.u8   \arg3, \arg1, \arg1
 	vpadal.u16 \arg2, \arg3
 .endm
@@ -491,21 +491,21 @@
 
 .macro SAD_SSD_BGD_16 arg0, arg1, arg2, arg3
 	vld1.8 {q0}, [\arg0], \arg2 //load cur_row
-	
+
 	vpadal.u8 q3, q0	//add cur_row together
 	vpadal.u8 q4, q1	//add ref_row together
-	
+
 	vabd.u8 q2, q0, q1	//abs_diff
-	
+
 	vmax.u8 q5, q2								//l_mad for 16 bytes reset for every 8x16
-	
+
 	vpadal.u8 \arg3, q2							//l_sad for 16 bytes reset for every 8x16
-	
+
 	SSD_MUL_SUM_16BYTES d4,d5, q8, q11			//q8 for l_sqiff	reset for every 16x16
-	
+
 	vld1.8 {q1}, [\arg1], \arg2 //load ref_row
 	vpadal.u8 q9, q0							//q9 for l_sum		reset for every 16x16
-	
+
 	SSD_MUL_SUM_16BYTES d0,d1, q10, q11			//q10 for lsqsum	reset for every 16x16
 .endm
 
@@ -512,20 +512,20 @@
 //the last row of a 16x16 block
 .macro SAD_SSD_BGD_16_end arg0, arg1, arg2
 	vld1.8 {q0}, [\arg0], \arg1 //load cur_row
-	
+
 	vpadal.u8 q3, q0	//add cur_row together
 	vpadal.u8 q4, q1	//add ref_row together
-	
+
 	vabd.u8 q2, q0, q1	//abs_diff
-	
+
 	vmax.u8 q5, q2								//l_mad for 16 bytes reset for every 8x16
-	
+
 	vpadal.u8 \arg2, q2							//l_sad for 16 bytes reset for every 8x16
-	
+
 	SSD_MUL_SUM_16BYTES d4,d5, q8, q11			//q8 for l_sqiff	reset for every 16x16
-	
+
 	vpadal.u8 q9, q0							//q9 for l_sum		reset for every 16x16
-	
+
 	SSD_MUL_SUM_16BYTES d0,d1, q10, q11			//q10 for lsqsum	reset for every 16x16
 .endm
 
@@ -532,23 +532,23 @@
 //for the begin of a 8x16 block, use some instructions to reset the register
 .macro SAD_SSD_BGD_16_RESET_8x8 arg0, arg1, arg2, arg3
 	vld1.8 {q0}, [\arg0], \arg2 //load cur_row
-	
+
 	vpaddl.u8 q3, q0	//add cur_row together
 	vpaddl.u8 q4, q1	//add ref_row together
-	
+
 	vabd.u8 q2, q0, q1	//abs_diff
-	
+
 	vmov q5,q2         //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
-	
+
 	vpaddl.u8 \arg3, q2							//l_sad for 16 bytes reset for every 8x16
-	
-	
+
+
 	SSD_MUL_SUM_16BYTES d4,d5, q8, q11			//q8 for l_sqiff	reset for every 16x16
-	
+
 	vld1.8 {q1}, [\arg1], \arg2 //load ref_row
-	
+
 	vpadal.u8 q9, q0							//q9 for l_sum		reset for every 16x16
-	
+
 	SSD_MUL_SUM_16BYTES d0,d1, q10, q11			//q10 for lsqsum	reset for every 16x16
 .endm
 
@@ -556,18 +556,18 @@
 .macro SAD_SSD_BGD_16_RESET_16x16 arg0, arg1, arg2, arg3
 	vld1.8 {q0}, [\arg0], \arg2 //load cur_row
 	vld1.8 {q1}, [\arg1], \arg2 //load ref_row
-	
+
 	vpaddl.u8 q3, q0	//add cur_row together
 	vpaddl.u8 q4, q1	//add ref_row together
-	
+
 	vabd.u8 q2, q0, q1	//abs_diff
-	
+
 	vmov q5,q2         //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
-	
+
 	vpaddl.u8 \arg3, q2							//l_sad for 16 bytes reset for every 8x16
-	
+
 	SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11			//q8 for l_sqiff	reset for every 16x16
-	
+
 	vld1.8 {q1}, [\arg1], \arg2 //load ref_row
 
 	vpaddl.u8 q9, q0								//q9 for l_sum		reset for every 16x16
@@ -577,24 +577,24 @@
 
 //for each 8x16 block
 .macro SAD_SSD_BGD_CALC_8x16 arg0, arg1, arg2
-	
+
 	vpmax.u8 d10, d10, d11 //4 numbers
 	vpmax.u8 d10, d10, d10 //2 numbers
 	vpmax.u8 d10, d10, d10 //1 number1
-	
+
 	vmov \arg0, d10			//d26 d27 keeps the l_mad
-	
+
 	//p_sd8x8
-	vpaddl.u16 q3, q3			
+	vpaddl.u16 q3, q3
 	vpaddl.u16 q4, q4
-	
+
 	vsub.i32 \arg1, q3, q4
 	vpaddl.u32 \arg1, \arg1
-	
+
 	//psad8x8
 	vpaddl.u16 \arg2, \arg2
 	vpaddl.u32 \arg2, \arg2
-	
+
 	//psadframe
 	vadd.i32 q12, \arg2
 .endm
@@ -609,9 +609,9 @@
 	SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
 	SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
 	SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
-	
+
 	SAD_SSD_BGD_CALC_8x16 d26, q14, q6
-	
+
 	//for another 8x16
 	SAD_SSD_BGD_16_RESET_8x8 \arg0, \arg1, \arg2, q7
 	SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
@@ -621,30 +621,30 @@
 	SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
 	SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
 	SAD_SSD_BGD_16_end \arg0, \arg2, q7
-	
+
 	SAD_SSD_BGD_CALC_8x16 d27, q15, q7
 .endm
 
-.macro  SSD_SAD_SD_MAD_PADDL arg0, arg1, arg2    
+.macro  SSD_SAD_SD_MAD_PADDL arg0, arg1, arg2
 	vpaddl.s16 \arg0, \arg0
-	vpaddl.s32 \arg0, \arg0	
-	vadd.i32 \arg1, \arg1, \arg2	
+	vpaddl.s32 \arg0, \arg0
+	vadd.i32 \arg1, \arg1, \arg2
 .endm
 #endif
 
 
-WELS_ASM_FUNC_BEGIN vaa_calc_sad_ssd_bgd_neon
+WELS_ASM_FUNC_BEGIN VAACalcSadSsdBgd_neon
 	stmdb sp!, {r0-r12, r14}
-	
+
 	ldr r4, [sp, #56] //r4 keeps the pic_stride
-	
+
 	sub r5, r4, #1
 	lsl r5, r5, #4 //r5 keeps the little step
-	
+
 	lsl r6, r4, #4
 	sub r6, r2, r6	//r6 keeps the big step
-	
-	
+
+
 	ldr r8, [sp, #64]//psad8x8
 	ldr r9, [sp, #68]//psum16x16
 	ldr r10, [sp, #72]//psqsum16x16
@@ -651,62 +651,62 @@
 	ldr r11, [sp, #76]//psqdiff16x16
 	ldr r12, [sp, #80]//p_sd8x8
 	ldr r14, [sp, #84]//p_mad8x8
-	
+
 	vmov.i8 q12, #0
-		
+
 vaa_calc_sad_ssd_bgd_height_loop:
 
     mov r7, r2
 vaa_calc_sad_ssd_bgd_width_loop:
-    
+
     //l_sd q14&q15, l_mad q13, l_sad q6 & q7, l_sqdiff	q8, l_sum q9, l_sqsum q10
     SAD_SSD_BGD_16x16 r0,r1,r4
-    
+
     //psad8x8
     vst4.32 {d12[0], d13[0], d14[0], d15[0]}, [r8]!
-    
+
     sub r0, r0, r5 //jump to next 16x16
     sub r1, r1, r5 //jump to next 16x16
-    
+
     //p_sd8x8
     vst4.32 {d28[0], d29[0],d30[0], d31[0]}, [r12]!
 
     //p_mad8x8
     vst2.16 {d26[0], d27[0]}, [r14]!
-    
+
     //psqdiff16x16
-    vpaddl.s32 q8, q8	
+    vpaddl.s32 q8, q8
     vadd.i32 d16, d16, d17
-    
+
     vst1.32 {d16[0]}, [r11]! //psqdiff16x16
-    
+
     //psum16x16
     SSD_SAD_SD_MAD_PADDL q9, d18, d19
     vst1.32 {d18[0]}, [r9]! //psum16x16
 
     //psqsum16x16
-    vpaddl.s32 q10, q10	
-    vadd.i32 d20, d20, d21	
+    vpaddl.s32 q10, q10
+    vadd.i32 d20, d20, d21
     vst1.32 {d20[0]}, [r10]! //psqsum16x16
-    
+
     subs r7, #16
-    
+
     bne vaa_calc_sad_ssd_bgd_width_loop
-		
+
     sub r0, r0, r6		//jump to next 16 x width
     sub r1, r1, r6		//jump to next 16 x width
-    
+
     subs r3, #16
 bne vaa_calc_sad_ssd_bgd_height_loop
-	
+
 	//psadframe
 	ldr r7, [sp, #60]//psadframe
-	
+
 	vadd.i32 d24, d24, d25
 	vst1.32 {d24[0]}, [r7]
-	
+
 	ldmia sp!, {r0-r12, r14}
-	
+
 WELS_ASM_FUNC_END
 
 
@@ -713,33 +713,33 @@
 #ifdef APPLE_IOS
 .macro SAD_VAR_16
 	vld1.8 {q0}, [$0], $2 //load cur_row
-	
+
 	vpadal.u8 q3, q0	//add cur_row together
 	vpadal.u8 q4, q1	//add ref_row together
-	
+
 	vabd.u8 q2, q0, q1	//abs_diff
-	
+
 	vpadal.u8 $3, q2							//l_sad for 16 bytes reset for every 8x16
-	
+
 	vld1.8 {q1}, [$1], $2
-	
+
 	vpadal.u8 q9, q0							//q9 for l_sum		reset for every 16x16
-	
+
 	SSD_MUL_SUM_16BYTES d0,d1, q10, q11			//q10 for lsqsum	reset for every 16x16
 .endm
 
 .macro SAD_VAR_16_END
 	vld1.8 {q0}, [$0], $1 //load cur_row
-	
+
 	vpadal.u8 q3, q0	//add cur_row together
 	vpadal.u8 q4, q1	//add ref_row together
-	
+
 	vabd.u8 q2, q0, q1	//abs_diff
-	
+
 	vpadal.u8 $2, q2							//l_sad for 16 bytes reset for every 8x16
-	
+
 	vpadal.u8 q9, q0							//q9 for l_sum		reset for every 16x16
-	
+
 	SSD_MUL_SUM_16BYTES d0,d1, q10, q11			//q10 for lsqsum	reset for every 16x16
 .endm
 
@@ -746,35 +746,35 @@
 .macro SAD_VAR_16_RESET_16x16
 	vld1.8 {q0}, [$0], $2 //load cur_row
 	vld1.8 {q1}, [$1], $2
-	
+
 	vpaddl.u8 q3, q0	//add cur_row together
 	vpaddl.u8 q4, q1	//add ref_row together
-	
+
 	vabd.u8 q2, q0, q1	//abs_diff
-	
+
 	vpaddl.u8 $3, q2							//l_sad for 16 bytes reset for every 8x16
-	
+
 	vld1.8 {q1}, [$1], $2
-	
+
 	vpaddl.u8 q9, q0							//q9 for l_sum		reset for every 16x16
-	
+
 	SSD_MUL_SUM_16BYTES_RESET d0,d1, q10, q11
 .endm
 
 .macro SAD_VAR_16_RESET_8x8
 	vld1.8 {q0}, [$0], $2 //load cur_row
-	
+
 	vpaddl.u8 q3, q0	//add cur_row together
 	vpaddl.u8 q4, q1	//add ref_row together
-	
+
 	vabd.u8 q2, q0, q1	//abs_diff
-	
+
 	vpaddl.u8 $3, q2							//l_sad for 16 bytes reset for every 8x16
-	
+
 	vld1.8 {q1}, [$1], $2
-	
+
 	vpadal.u8 q9, q0							//q9 for l_sum		reset for every 16x16
-	
+
 	SSD_MUL_SUM_16BYTES d0,d1, q10, q11			//q10 for lsqsum	reset for every 16x16
 .endm
 
@@ -788,7 +788,7 @@
 	SAD_VAR_16 $0, $1, $2, q6
 	SAD_VAR_16 $0, $1, $2, q6
 	SAD_VAR_16 $0, $1, $2, q6
-	
+
 	vpaddl.u16 q6, q6
 	vpaddl.u32 q6, q6
 	vadd.i32 q12, q6
@@ -802,42 +802,42 @@
 	SAD_VAR_16 $0, $1, $2, q7
 	SAD_VAR_16 $0, $1, $2, q7
 	SAD_VAR_16_END $0, $2, q7
-	
+
 	vpaddl.u16 q7, q7
 	vpaddl.u32 q7, q7
-	
+
 	vadd.i32 q12, q7
 .endm
 #else
 .macro SAD_VAR_16 arg0, arg1, arg2, arg3
 	vld1.8 {q0}, [\arg0], \arg2 //load cur_row
-	
+
 	vpadal.u8 q3, q0	//add cur_row together
 	vpadal.u8 q4, q1	//add ref_row together
-	
+
 	vabd.u8 q2, q0, q1	//abs_diff
-	
+
 	vpadal.u8 \arg3, q2							//l_sad for 16 bytes reset for every 8x16
-	
+
 	vld1.8 {q1}, [\arg1], \arg2
-	
+
 	vpadal.u8 q9, q0							//q9 for l_sum		reset for every 16x16
-	
+
 	SSD_MUL_SUM_16BYTES d0,d1, q10, q11			//q10 for lsqsum	reset for every 16x16
 .endm
 
 .macro SAD_VAR_16_END arg0, arg1, arg2
 	vld1.8 {q0}, [\arg0], \arg1 //load cur_row
-	
+
 	vpadal.u8 q3, q0	//add cur_row together
 	vpadal.u8 q4, q1	//add ref_row together
-	
+
 	vabd.u8 q2, q0, q1	//abs_diff
-	
+
 	vpadal.u8 \arg2, q2							//l_sad for 16 bytes reset for every 8x16
-	
+
 	vpadal.u8 q9, q0							//q9 for l_sum		reset for every 16x16
-	
+
 	SSD_MUL_SUM_16BYTES d0,d1, q10, q11			//q10 for lsqsum	reset for every 16x16
 .endm
 
@@ -845,35 +845,35 @@
 .macro SAD_VAR_16_RESET_16x16 arg0, arg1, arg2, arg3
 	vld1.8 {q0}, [\arg0], \arg2 //load cur_row
 	vld1.8 {q1}, [\arg1], \arg2
-	
+
 	vpaddl.u8 q3, q0	//add cur_row together
 	vpaddl.u8 q4, q1	//add ref_row together
-	
+
 	vabd.u8 q2, q0, q1	//abs_diff
-	
+
 	vpaddl.u8 \arg3, q2							//l_sad for 16 bytes reset for every 8x16
-	
+
 	vld1.8 {q1}, [\arg1], \arg2
-	
+
 	vpaddl.u8 q9, q0							//q9 for l_sum		reset for every 16x16
-	
+
 	SSD_MUL_SUM_16BYTES_RESET d0,d1, q10, q11
 .endm
 
 .macro SAD_VAR_16_RESET_8x8 arg0, arg1, arg2, arg3
 	vld1.8 {q0}, [\arg0], \arg2 //load cur_row
-	
+
 	vpaddl.u8 q3, q0	//add cur_row together
 	vpaddl.u8 q4, q1	//add ref_row together
-	
+
 	vabd.u8 q2, q0, q1	//abs_diff
-	
+
 	vpaddl.u8 \arg3, q2							//l_sad for 16 bytes reset for every 8x16
-	
+
 	vld1.8 {q1}, [\arg1], \arg2
-	
+
 	vpadal.u8 q9, q0							//q9 for l_sum		reset for every 16x16
-	
+
 	SSD_MUL_SUM_16BYTES d0,d1, q10, q11			//q10 for lsqsum	reset for every 16x16
 .endm
 
@@ -887,7 +887,7 @@
 	SAD_VAR_16 \arg0, \arg1, \arg2, q6
 	SAD_VAR_16 \arg0, \arg1, \arg2, q6
 	SAD_VAR_16 \arg0, \arg1, \arg2, q6
-	
+
 	vpaddl.u16 q6, q6
 	vpaddl.u32 q6, q6
 	vadd.i32 q12, q6
@@ -901,26 +901,26 @@
 	SAD_VAR_16 \arg0, \arg1, \arg2, q7
 	SAD_VAR_16 \arg0, \arg1, \arg2, q7
 	SAD_VAR_16_END \arg0, \arg2, q7
-	
+
 	vpaddl.u16 q7, q7
 	vpaddl.u32 q7, q7
-	
+
 	vadd.i32 q12, q7
 .endm
 #endif
 
 
-WELS_ASM_FUNC_BEGIN vaa_calc_sad_var_neon
+WELS_ASM_FUNC_BEGIN VAACalcSadVar_neon
 	stmdb sp!, {r4-r11}
-	
+
 	ldr r4, [sp, #32] //r4 keeps the pic_stride
-	
+
 	sub r5, r4, #1
 	lsl r5, r5, #4 //r5 keeps the little step
-	
+
 	lsl r6, r4, #4
 	sub r6, r2, r6	//r6 keeps the big step
-	
+
 	ldr r7,		[sp, #36]	//psadframe
 	ldr r8,		[sp, #40]	//psad8x8
 	ldr r9,		[sp, #44]	//psum16x16
@@ -936,25 +936,25 @@
     SAD_VAR_16x16 r0,r1,r4
     //psad8x8
     vst4.32 {d12[0], d13[0], d14[0], d15[0]}, [r8]!
-    
+
     sub r0, r0, r5 //jump to next 16x16
     sub r1, r1, r5 //jump to next 16x16
-    
+
     //psum16x16
     SSD_SAD_SD_MAD_PADDL q9, d18, d19
     vst1.32 {d18[0]}, [r9]! //psum16x16
-    
+
     //psqsum16x16
-    vpaddl.s32 q10, q10	
+    vpaddl.s32 q10, q10
     subs r11, #16
-    vadd.i32 d20, d20, d21	
+    vadd.i32 d20, d20, d21
     vst1.32 {d20[0]}, [r10]! //psqsum16x16
-        
+
     bne vaa_calc_sad_var_width_loop
-    
+
     sub r0, r0, r6		//jump to next 16 x width
     sub r1, r1, r6		//jump to next 16 x width
-    
+
     subs r3, #16
 bne vaa_calc_sad_var_height_loop
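Editor's note: r5 and r6 above drive the block traversal — r5 = (iPicStride - 1) * 16 is the "little step" that rewinds from the bottom of one 16x16 block to the top of the next block to its right, and r6 = iPicWidth - 16 * iPicStride is the "big step" that rewinds from the end of a 16-row band to the start of the next band. A scalar C++ sketch of the equivalent pointer walk (parameter names are placeholders, not the project's API):

    #include <cstdint>

    // Scalar sketch of the 16x16 block walk driven by the "little step" (r5) and "big step" (r6).
    static void WalkBlocks16x16 (const uint8_t* pCur, const uint8_t* pRef,
                                 int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride) {
      const int32_t kiLittleStep = (iPicStride - 1) << 4;         // r5 = 16*stride - 16
      const int32_t kiBigStep    = iPicWidth - (iPicStride << 4); // r6 = width - 16*stride
      for (int32_t y = 0; y < iPicHeight; y += 16) {              // ..._height_loop
        for (int32_t x = 0; x < iPicWidth; x += 16) {             // ..._width_loop
          // The SAD_VAR_16x16 macro loads 16 rows, advancing each pointer by
          // iPicStride per row, i.e. +16*stride for the whole block:
          pCur += iPicStride << 4;
          pRef += iPicStride << 4;
          // "little step": back to the top row, one block (16 pixels) to the right.
          pCur -= kiLittleStep;
          pRef -= kiLittleStep;
        }
        // "big step": from the end of this 16-row band to the start of the next one.
        pCur -= kiBigStep;
        pRef -= kiBigStep;
      }
    }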
 
@@ -968,25 +968,25 @@
 #ifdef APPLE_IOS
 .macro SAD_SSD_16
 	SAD_VAR_16 $0, $1, $2, $3
-	
+
 	SSD_MUL_SUM_16BYTES d4,d5,q8, q11
 .endm
 
 .macro SAD_SSD_16_END
 	SAD_VAR_16_END $0, $1, $2, $3
-	
+
 	SSD_MUL_SUM_16BYTES d4,d5,q8, q11			//q8 for l_sqdiff	reset for every 16x16
 .endm
 
 .macro SAD_SSD_16_RESET_16x16
 	SAD_VAR_16_RESET_16x16 $0, $1, $2, $3
-	
+
 	SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11			//q8 for l_sqdiff	reset for every 16x16
 .endm
 
 .macro SAD_SSD_16_RESET_8x8
 	SAD_VAR_16_RESET_8x8 $0, $1, $2, $3
-	
+
 	SSD_MUL_SUM_16BYTES d4,d5,q8, q11			//q8 for l_sqdiff	reset for every 16x16
 .endm
 
@@ -1000,7 +1000,7 @@
 	SAD_SSD_16 $0, $1, $2, q6
 	SAD_SSD_16 $0, $1, $2, q6
 	SAD_SSD_16 $0, $1, $2, q6
-	
+
 	vpaddl.u16 q6, q6
 	vpaddl.u32 q6, q6
 	vadd.i32 q12, q6
@@ -1014,34 +1014,34 @@
 	SAD_SSD_16 $0, $1, $2, q7
 	SAD_SSD_16 $0, $1, $2, q7
 	SAD_SSD_16_END $0, $2, q7
-	
+
 	vpaddl.u16 q7, q7
 	vpaddl.u32 q7, q7
-	
+
 	vadd.i32 q12, q7
 .endm
 #else
 .macro SAD_SSD_16 arg0, arg1, arg2, arg3
 	SAD_VAR_16 \arg0, \arg1, \arg2, \arg3
-	
+
 	SSD_MUL_SUM_16BYTES d4,d5,q8, q11
 .endm
 
 .macro SAD_SSD_16_END arg0, arg1, arg2, arg3
 	SAD_VAR_16_END \arg0, \arg1, \arg2, \arg3
-	
+
 	SSD_MUL_SUM_16BYTES d4,d5,q8, q11			//q8 for l_sqdiff	reset for every 16x16
 .endm
 
 .macro SAD_SSD_16_RESET_16x16 arg0, arg1, arg2, arg3
 	SAD_VAR_16_RESET_16x16 \arg0, \arg1, \arg2, \arg3
-	
+
 	SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11			//q8 for l_sqdiff	reset for every 16x16
 .endm
 
 .macro SAD_SSD_16_RESET_8x8 arg0, arg1, arg2, arg3
 	SAD_VAR_16_RESET_8x8 \arg0, \arg1, \arg2, \arg3
-	
+
 	SSD_MUL_SUM_16BYTES d4,d5,q8, q11			//q8 for l_sqdiff	reset for every 16x16
 .endm
 
@@ -1055,7 +1055,7 @@
 	SAD_SSD_16 \arg0, \arg1, \arg2, q6
 	SAD_SSD_16 \arg0, \arg1, \arg2, q6
 	SAD_SSD_16 \arg0, \arg1, \arg2, q6
-	
+
 	vpaddl.u16 q6, q6
 	vpaddl.u32 q6, q6
 	vadd.i32 q12, q6
@@ -1069,26 +1069,26 @@
 	SAD_SSD_16 \arg0, \arg1, \arg2, q7
 	SAD_SSD_16 \arg0, \arg1, \arg2, q7
 	SAD_SSD_16_END \arg0, \arg2, q7
-	
+
 	vpaddl.u16 q7, q7
 	vpaddl.u32 q7, q7
-	
+
 	vadd.i32 q12, q7
 .endm
 #endif
 
 
-WELS_ASM_FUNC_BEGIN vaa_calc_sad_ssd_neon
+WELS_ASM_FUNC_BEGIN VAACalcSadSsd_neon
 	stmdb sp!, {r4-r12}
 
 	ldr r4, [sp, #36] //r4 keeps the pic_stride
-	
+
 	sub r5, r4, #1
 	lsl r5, r5, #4 //r5 keeps the little step
-	
+
 	lsl r6, r4, #4
 	sub r6, r2, r6	//r6 keeps the big step
-	
+
 	ldr r7,		[sp, #40]	//psadframe
 	ldr r8,		[sp, #44]	//psad8x8
 	ldr r9,		[sp, #48]	//psum16x16
@@ -1105,32 +1105,32 @@
     SAD_SSD_16x16 r0,r1,r4
     //psad8x8
     vst4.32 {d12[0], d13[0], d14[0], d15[0]}, [r8]!
-    
+
     sub r0, r0, r5 //jump to next 16x16
     sub r1, r1, r5 //jump to next 16x16
-    
+
     //psum16x16
     vpaddl.s16 q9, q9
-    vpaddl.s32 q9, q9	
+    vpaddl.s32 q9, q9
     vadd.i32 d18, d18, d19
     vst1.32 {d18[0]}, [r9]! //psum16x16
 
     //psqsum16x16
-    vpaddl.s32 q10, q10	
-    vadd.i32 d20, d20, d21	
+    vpaddl.s32 q10, q10
+    vadd.i32 d20, d20, d21
     vst1.32 {d20[0]}, [r10]! //psqsum16x16
-    
+
     //psqdiff16x16
-    vpaddl.s32 q8, q8	
+    vpaddl.s32 q8, q8
     vadd.i32 d16, d16, d17
     subs r12, #16
     vst1.32 {d16[0]}, [r11]! //psqdiff16x16
-        
+
     bne vaa_calc_sad_ssd_width_loop
-    
+
     sub r0, r0, r6		//jump to next 16 x width
     sub r1, r1, r6		//jump to next 16 x width
-    
+
     subs r3, #16
 	bne vaa_calc_sad_ssd_height_loop
 
@@ -1140,4 +1140,4 @@
 	ldmia sp!, {r4-r12}
 WELS_ASM_FUNC_END
 
-#endif
\ No newline at end of file
+#endif
--- a/codec/processing/src/downsample/downsample.cpp
+++ b/codec/processing/src/downsample/downsample.cpp
@@ -75,6 +75,16 @@
   }
 #endif//X86_ASM
 
+#if defined(HAVE_NEON)
+  if (iCpuFlag & WELS_CPU_NEON) {
+    sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_neon;
+    sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsampler_neon;
+    sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsampler_neon;
+    sDownsampleFunc.pfHalfAverage[3] = DyadicBilinearDownsampler_neon;
+    sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_neon;
+    sDownsampleFunc.pfGeneralRatioLuma	 = GeneralBilinearAccurateDownsamplerWrap_neon;
+  }
+#endif
 }
 
 EResult CDownsampling::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pDstPixMap) {
--- a/codec/processing/src/downsample/downsample.h
+++ b/codec/processing/src/downsample/downsample.h
@@ -103,7 +103,20 @@
 WELSVP_EXTERN_C_END
 #endif
 
+#ifdef HAVE_NEON
+WELSVP_EXTERN_C_BEGIN
+// no restriction on iSrcWidth
+HalveDownsampleFunc		DyadicBilinearDownsampler_neon;
+// iSrcWidth must be a multiple of 32 pixels
+HalveDownsampleFunc		DyadicBilinearDownsamplerWidthx32_neon;
 
+GeneralDownsampleFunc   GeneralBilinearAccurateDownsamplerWrap_neon;
+
+void GeneralBilinearAccurateDownsampler_neon( uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight,
+									   uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY);
+
+WELSVP_EXTERN_C_END
+#endif
 
 
 class CDownsampling : public IStrategy {
--- a/codec/processing/src/downsample/downsamplefuncs.cpp
+++ b/codec/processing/src/downsample/downsamplefuncs.cpp
@@ -229,4 +229,14 @@
 //}
 #endif //X86_ASM
 
+#ifdef HAVE_NEON
+void GeneralBilinearAccurateDownsamplerWrap_neon(uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight,
+                        uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
+  const int32_t kiScaleBit = 15;
+  const uint32_t kuiScale = (1 << kiScaleBit);
+  uint32_t uiScalex = (uint32_t)((float)kiSrcWidth / (float)kiDstWidth * kuiScale);
+  uint32_t uiScaley = (uint32_t)((float)kiSrcHeight / (float)kiDstHeight * kuiScale);
+  GeneralBilinearAccurateDownsampler_neon(pDst, kiDstStride, kiDstWidth, kiDstHeight, pSrc, kiSrcStride, uiScalex, uiScaley);
+}
+#endif
 WELSVP_NAMESPACE_END
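Editor's note: the wrapper above converts the source/destination ratio into a 15-bit fixed-point scale (kuiScale = 1 << 15), so a destination coordinate maps back to the source as roughly (pos * scale) >> 15. A small illustrative helper, assuming nothing beyond what the wrapper computes (MapDstToSrc is a hypothetical name, not part of the patch):

    #include <cstdint>

    // Hypothetical helper interpreting the Q15 ratio passed to GeneralBilinearAccurateDownsampler_neon.
    static inline int32_t MapDstToSrc (int32_t iDstPos, uint32_t kuiScale /* Q15 src/dst ratio */) {
      return (int32_t) (((uint32_t) iDstPos * kuiScale) >> 15);
    }

    // Example: a 1920-wide source scaled to 1280 gives
    //   uiScalex = (uint32_t)(1920.0f / 1280.0f * (1 << 15)) = 49152,
    // so destination x = 640 samples near source x = (640 * 49152) >> 15 = 960.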
--- a/codec/processing/src/scenechangedetection/SceneChangeDetection.cpp
+++ b/codec/processing/src/scenechangedetection/SceneChangeDetection.cpp
@@ -130,6 +130,12 @@
     pfSad = WelsSampleSad8x8_sse21;
   }
 #endif
+
+#ifdef HAVE_NEON
+  if (iCpuFlag & WELS_CPU_NEON) {
+    pfSad = WelsSampleSad8x8_neon;
+  }
+#endif
 }
 
 
--- a/codec/processing/src/scenechangedetection/SceneChangeDetectionCommon.h
+++ b/codec/processing/src/scenechangedetection/SceneChangeDetectionCommon.h
@@ -60,6 +60,12 @@
 WELSVP_EXTERN_C_END
 #endif
 
+#ifdef HAVE_NEON
+WELSVP_EXTERN_C_BEGIN
+SadFunc      WelsSampleSad8x8_neon;
+WELSVP_EXTERN_C_END
+#endif
+
 WELSVP_NAMESPACE_END
 
 #endif
--- a/codec/processing/src/vaacalc/vaacalculation.cpp
+++ b/codec/processing/src/vaacalc/vaacalculation.cpp
@@ -65,6 +65,15 @@
     sVaaFuncs.pfVAACalcSadVar		= VAACalcSadVar_sse2;
   }
 #endif//X86_ASM
+#ifdef HAVE_NEON
+  if ((iCpuFlag & WELS_CPU_NEON) == WELS_CPU_NEON) {
+    sVaaFuncs.pfVAACalcSad			= VAACalcSad_neon;
+    sVaaFuncs.pfVAACalcSadBgd		= VAACalcSadBgd_neon;
+    sVaaFuncs.pfVAACalcSadSsd		= VAACalcSadSsd_neon;
+    sVaaFuncs.pfVAACalcSadSsdBgd	= VAACalcSadSsdBgd_neon;
+    sVaaFuncs.pfVAACalcSadVar		= VAACalcSadVar_neon;
+  }
+#endif//HAVE_NEON
 }
 
 EResult CVAACalculation::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
--- a/codec/processing/src/vaacalc/vaacalculation.h
+++ b/codec/processing/src/vaacalc/vaacalculation.h
@@ -103,6 +103,16 @@
 WELSVP_EXTERN_C_END
 #endif
 
+#ifdef HAVE_NEON
+WELSVP_EXTERN_C_BEGIN
+VAACalcSadBgdFunc		VAACalcSadBgd_neon;
+VAACalcSadSsdBgdFunc	VAACalcSadSsdBgd_neon;
+VAACalcSadFunc			    VAACalcSad_neon;
+VAACalcSadVarFunc		VAACalcSadVar_neon;
+VAACalcSadSsdFunc		VAACalcSadSsd_neon;
+WELSVP_EXTERN_C_END
+#endif
+
 class CVAACalculation : public IStrategy {
  public:
   CVAACalculation (int32_t iCpuFlag);