ref: e7cc8c2780e6ab2c409dc11b73cc204999fb7e7a
parent: 248f324c62bae5b2e2dd202e8e251c670de8af5a
author: Licai Guo <[email protected]>
date: Wed Mar 5 11:54:05 EST 2014
Add arm asm code for processing.
--- a/codec/common/deblocking_neon.S
+++ b/codec/common/deblocking_neon.S
@@ -795,7 +795,7 @@
WELS_ASM_FUNC_END
-WELS_ASM_FUNC_BEGIN enc_avc_non_zero_count_neon
+WELS_ASM_FUNC_BEGIN WelsNonZeroCount_neon
vld1.64 {d0-d2}, [r0]
@@ -810,29 +810,28 @@
WELS_ASM_FUNC_END
#ifdef APPLE_IOS
-
-.macro BS_NZC_CHECK
+.macro BS_NZC_CHECK
vld1.8 {d0,d1}, [$0]
    /* Arrange the input data --- TOP */
ands r6, $1, #2
beq bs_nzc_check_jump0
-
+
sub r6, $0, $2, lsl #4
sub r6, $2, lsl #3
add r6, #12
vld1.32 d3[1], [r6]
-
-bs_nzc_check_jump0:
+
+bs_nzc_check_jump0:
vext.8 q1, q1, q0, #12
vadd.u8 $3, q0, q1
-
+
    /* Arrange the input data --- LEFT */
ands r6, $1, #1
beq bs_nzc_check_jump1
-
+
sub r6, $0, #21
- add r7, r6, #4
+ add r7, r6, #4
vld1.8 d3[4], [r6]
add r6, r7, #4
vld1.8 d3[5], [r7]
@@ -839,10 +838,10 @@
add r7, r6, #4
vld1.8 d3[6], [r6]
vld1.8 d3[7], [r7]
-
+
bs_nzc_check_jump1:
- vzip.8 d0, d1
vzip.8 d0, d1
+ vzip.8 d0, d1
vext.8 q1, q1, q0, #12
vadd.u8 $4, q0, q1
.endm
@@ -852,41 +851,41 @@
vabd.s16 q5, $0, $1
vabd.s16 q6, $1, $2
vdup.s16 $0, r6
- vabd.s16 q7, $2, $3
- vabd.s16 q8, $3, $4
-
+ vabd.s16 q7, $2, $3
+ vabd.s16 q8, $3, $4
+
vcge.s16 q5, $0
vcge.s16 q6, $0
vcge.s16 q7, $0
- vcge.s16 q8, $0
-
+ vcge.s16 q8, $0
+
vpadd.i16 d10, d10, d11
vpadd.i16 d11, d12, d13
vpadd.i16 d12, d14, d15
- vpadd.i16 d13, d16, d17
-
+ vpadd.i16 d13, d16, d17
+
vaddhn.i16 $5, q5, q5
vaddhn.i16 $6, q6, q6
.endm
-.macro BS_MV_CHECK
+.macro BS_MV_CHECK
vldm $0, {q0,q1,q2,q3}
    /* Arrange the input data --- TOP */
ands r6, $1, #2
beq bs_mv_check_jump0
-
+
sub r6, $0, $2, lsl #6
add r6, #48
vld1.8 {d8, d9}, [r6]
-
+
bs_mv_check_jump0:
BS_COMPARE_MV q4, q0, q1, q2, q3, $3, $4
-
+
    /* Arrange the input data --- LEFT */
ands r6, $1, #1
beq bs_mv_check_jump1
-
+
sub r6, $0, #52
add r7, r6, #16
vld1.32 d8[0], [r6]
@@ -895,7 +894,7 @@
add r7, r6, #16
vld1.32 d9[0], [r6]
vld1.32 d9[1], [r7]
-
+
bs_mv_check_jump1:
vzip.32 q0, q2
vzip.32 q1, q3
@@ -904,7 +903,6 @@
BS_COMPARE_MV q4, q0, q1, q2, q3, $5, $6
.endm
#else
-
.macro BS_NZC_CHECK arg0, arg1, arg2, arg3, arg4
vld1.8 {d0,d1}, [\arg0]
    /* Arrange the input data --- TOP */
@@ -999,28 +997,28 @@
.endm
#endif
-
+
WELS_ASM_FUNC_BEGIN DeblockingBSCalcEnc_neon
-
+
stmdb sp!, {r5-r7}
-
+
ldr r5, [sp, #12] //Save BS to r5
-
+
/* Checking the nzc status */
BS_NZC_CHECK r0, r2, r3, q14, q15 //q14,q15 save the nzc status
-
+
/* For checking bS[I] = 2 */
mov r6, #2
vcgt.s8 q14, q14, #0
vdup.u8 q0, r6
vcgt.s8 q15, q15, #0
-
+
vand.u8 q14, q14, q0 //q14 save the nzc check result all the time --- for dir is top
vand.u8 q15, q15, q0 //q15 save the nzc check result all the time --- for dir is left
-
+
/* Checking the mv status*/
BS_MV_CHECK r1, r2, r3, d24, d25, d26, d27//q12, q13 save the mv status
-
+
/* For checking bS[I] = 1 */
mov r6, #1
vdup.u8 q0, r6
@@ -1027,12 +1025,12 @@
vand.u8 q12, q12, q0 //q12 save the nzc check result all the time --- for dir is top
vand.u8 q13, q13, q0 //q13 save the nzc check result all the time --- for dir is left
-
-
+
+
/* Check bS[I] is '1' or '2' */
vmax.u8 q1, q12, q14
vmax.u8 q0, q13, q15
-
+
//vstm r5, {q0, q1}
vst1.32 {q0, q1}, [r5]
ldmia sp!, {r5-r7}
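
For reference, a minimal scalar sketch (plain C, not part of this patch) of the per-edge boundary-strength rule that DeblockingBSCalcEnc_neon vectorizes above: non-zero coefficients in either neighbouring 4x4 block give bS = 2, otherwise a large enough motion-vector difference gives bS = 1, and the two results are merged with a max (the final vmax.u8). The 4 quarter-pel threshold and the helper name are assumptions for illustration only.

    #include <stdint.h>
    #include <stdlib.h>

    /* Illustrative only: one edge position, luma. */
    static uint8_t CalcBoundaryStrength (uint8_t nzc_p, uint8_t nzc_q,
                                         const int16_t mv_p[2], const int16_t mv_q[2]) {
      uint8_t bs_nzc = (nzc_p + nzc_q > 0) ? 2 : 0;            /* vcgt.s8 > 0, then vand #2  */
      uint8_t bs_mv  = (abs (mv_p[0] - mv_q[0]) >= 4 ||         /* BS_COMPARE_MV; threshold 4 */
                        abs (mv_p[1] - mv_q[1]) >= 4) ? 1 : 0;  /* is an assumption           */
      return bs_nzc > bs_mv ? bs_nzc : bs_mv;                   /* vmax.u8 q1 / q0            */
    }
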
--- a/codec/common/expand_picture.S
+++ b/codec/common/expand_picture.S
@@ -34,13 +34,13 @@
.text
#include "arm_arch_common_macro.S"
-
+
WELS_ASM_FUNC_BEGIN ExpandPictureLuma_neon
stmdb sp!, {r4-r8}
//Save the dst
mov r7, r0
mov r8, r3
-
+
add r4, r7, r2
sub r4, #1
//For the left and right expand
@@ -58,40 +58,40 @@
subs r8, #1
bne _expand_picture_luma_loop2
- //for the top and bottom expand
+ //for the top and bottom expand
add r2, #64
sub r0, #32
mla r4, r1, r3, r0
sub r4, r1
_expand_picture_luma_loop0:
- mov r5, #32
- mls r5, r5, r1, r0
+ mov r5, #32
+ mls r5, r5, r1, r0
add r6, r4, r1
vld1.8 {q0}, [r0]!
vld1.8 {q1}, [r4]!
-
+
mov r8, #32
-_expand_picture_luma_loop1:
- vst1.8 {q0}, [r5], r1
- vst1.8 {q1}, [r6], r1
+_expand_picture_luma_loop1:
+ vst1.8 {q0}, [r5], r1
+ vst1.8 {q1}, [r6], r1
subs r8, #1
bne _expand_picture_luma_loop1
-
+
subs r2, #16
bne _expand_picture_luma_loop0
//vldreq.32 d0, [r0]
-
+
ldmia sp!, {r4-r8}
WELS_ASM_FUNC_END
-
+
WELS_ASM_FUNC_BEGIN ExpandPictureChroma_neon
stmdb sp!, {r4-r8}
//Save the dst
mov r7, r0
mov r8, r3
-
+
add r4, r7, r2
sub r4, #1
//For the left and right expand
@@ -107,31 +107,31 @@
subs r8, #1
bne _expand_picture_chroma_loop2
- //for the top and bottom expand
+ //for the top and bottom expand
add r2, #32
sub r0, #16
mla r4, r1, r3, r0
sub r4, r1
_expand_picture_chroma_loop0:
- mov r5, #16
- mls r5, r5, r1, r0
+ mov r5, #16
+ mls r5, r5, r1, r0
add r6, r4, r1
vld1.8 {q0}, [r0]!
vld1.8 {q1}, [r4]!
-
+
mov r8, #16
-_expand_picture_chroma_loop1:
- vst1.8 {q0}, [r5], r1
- vst1.8 {q1}, [r6], r1
+_expand_picture_chroma_loop1:
+ vst1.8 {q0}, [r5], r1
+ vst1.8 {q1}, [r6], r1
subs r8, #1
bne _expand_picture_chroma_loop1
-
+
subs r2, #16
bne _expand_picture_chroma_loop0
//vldreq.32 d0, [r0]
-
+
ldmia sp!, {r4-r8}
WELS_ASM_FUNC_END
-#endif
\ No newline at end of file
+#endif
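
As an aside on what ExpandPictureLuma_neon / ExpandPictureChroma_neon compute: the plane borders are padded by 32 pixels for luma and 16 for chroma, replicating the edge columns sideways and then copying the widened top and bottom rows outward. A hedged scalar sketch in C; the function name and parameterisation are illustrative, not the project API.

    #include <stdint.h>
    #include <string.h>

    /* pic points at the top-left pixel of a width x height plane with the
     * given stride; pad is 32 for luma and 16 for chroma in the asm above. */
    static void ExpandPlaneBorders (uint8_t* pic, int stride, int width, int height, int pad) {
      for (int y = 0; y < height; y++) {                 /* left/right columns */
        uint8_t* row = pic + y * stride;
        memset (row - pad, row[0], pad);
        memset (row + width, row[width - 1], pad);
      }
      for (int i = 1; i <= pad; i++) {                   /* top/bottom rows */
        memcpy (pic - pad - i * stride,
                pic - pad, width + 2 * pad);
        memcpy (pic - pad + (height - 1 + i) * stride,
                pic - pad + (height - 1) * stride, width + 2 * pad);
      }
    }
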
--- a/codec/decoder/core/arm/intra_pred_neon.S
+++ b/codec/decoder/core/arm/intra_pred_neon.S
@@ -533,7 +533,7 @@
WELS_ASM_FUNC_END
-WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredDC_neon
+WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredDc_neon
//stmdb sp!, { r2-r5, lr}
//Load the left column data (8 bytes)
sub r2, r0, #1
--- a/codec/encoder/core/arm/intra_pred_neon.S
+++ b/codec/encoder/core/arm/intra_pred_neon.S
@@ -61,15 +61,15 @@
.endm
#endif
-
+
WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredV_neon
//Get the top line data to 'q0'
sub r3, r1, r2
vldm r3, {d0, d1}
-
+
//mov r2, #16
mov r3, #4
- //Set the top line to the each line of MB(16*16)
+ //Set the top line to the each line of MB(16*16)
loop_0_get_i16x16_luma_pred_v:
vst1.8 {d0,d1}, [r0]!
vst1.8 {d0,d1}, [r0]!
@@ -76,10 +76,10 @@
vst1.8 {d0,d1}, [r0]!
vst1.8 {d0,d1}, [r0]!
subs r3, #1
- bne loop_0_get_i16x16_luma_pred_v
+ bne loop_0_get_i16x16_luma_pred_v
WELS_ASM_FUNC_END
-
+
WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredH_neon
//stmdb sp!, {r4, lr}
sub r1, r1, #1
@@ -87,10 +87,10 @@
loop_0_get_i16x16_luma_pred_h:
//Get one byte data from left side
vld1.8 {d0[],d1[]}, [r1], r2
- vld1.8 {d2[],d3[]}, [r1], r2
- vld1.8 {d4[],d5[]}, [r1], r2
+ vld1.8 {d2[],d3[]}, [r1], r2
+ vld1.8 {d4[],d5[]}, [r1], r2
vld1.8 {d6[],d7[]}, [r1], r2
-
+
//Set the line of MB using the left side byte data
vst1.8 {d0,d1}, [r0]!
//add r0, #16
@@ -100,9 +100,9 @@
//add r0, #16
vst1.8 {d6,d7}, [r0]!
//add r0, #16
-
+
subs r3, #1
- bne loop_0_get_i16x16_luma_pred_h
+ bne loop_0_get_i16x16_luma_pred_h
WELS_ASM_FUNC_END
@@ -113,11 +113,11 @@
sub r3, r1, #1
GET_8BYTE_DATA d0, r3, r2
GET_8BYTE_DATA d1, r3, r2
-
+
//Get the top horizontal line data
- sub r3, r1, r2
+ sub r3, r1, r2
vldm r3, {d2, d3}
-
+
//Calculate the sum of top horizontal line data and vertical line data
vpaddl.u8 q0, q0
vpaddl.u8 q1, q1
@@ -125,11 +125,11 @@
vadd.u16 d0, d0, d1
vpaddl.u16 d0, d0
vpaddl.u32 d0, d0
-
- //Calculate the mean value
+
+ //Calculate the mean value
vrshr.u16 d0, d0, #5
vdup.8 q0, d0[0]
-
+
//Set the mean value to the all of member of MB
mov r3, #4
loop_0_get_i16x16_luma_pred_dc_both:
@@ -138,8 +138,8 @@
vst1.8 {d0,d1}, [r0]!
vst1.8 {d0,d1}, [r0]!
subs r3, #1
- bne loop_0_get_i16x16_luma_pred_dc_both
-
+ bne loop_0_get_i16x16_luma_pred_dc_both
+
WELS_ASM_FUNC_END
@@ -146,13 +146,13 @@
//The table for SIMD instruction {(8,7,6,5,4,3,2,1) * 5}
CONST0_GET_I16X16_LUMA_PRED_PLANE: .long 0x191e2328, 0x050a0f14
-//The table for SIMD instruction {-7,-6,-5,-4,-3,-2,-1,0}
+//The table for SIMD instruction {-7,-6,-5,-4,-3,-2,-1,0}
CONST1_GET_I16X16_LUMA_PRED_PLANE: .long 0xfcfbfaf9, 0x00fffefd
-
+
WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredPlane_neon
//stmdb sp!, { r4, lr}
-
+
//Load the table {(8,7,6,5,4,3,2,1) * 5}
adr r3, CONST0_GET_I16X16_LUMA_PRED_PLANE
vldr d0, [r3]
@@ -161,25 +161,25 @@
sub r3, r1, r2
sub r1, r3, #1
vld1.8 d1, [r1]
-
+
//Pack the top[8] ~ top[15] to d2
add r1, #9
vld1.8 d2, [r1]
-
+
//Save the top[15] to d6 for next step
vdup.u8 d6, d2[7]
-
+
//Get and pack left[-1] ~ left[6] to d4
sub r1, r3, #1
GET_8BYTE_DATA d4, r1, r2
-
+
//Get and pack left[8] ~ left[15] to d3
add r1, r2
GET_8BYTE_DATA d3, r1, r2
-
+
//Save the left[15] to d7 for next step
vdup.u8 d7, d3[7]
-
+
//revert the sequence of d2,d3
vrev64.8 q1, q1
@@ -186,26 +186,26 @@
vsubl.u8 q2, d3, d4 //q2={left[8]-left[6],left[9]-left[5],left[10]-left[4], ...}
vsubl.u8 q1, d2, d1 //q1={top[8]-top[6],top[9]-top[5],top[10]-top[4], ...}
-
+
vmovl.u8 q0, d0
vmul.s16 q1, q0, q1 //q1 = q1*{(8,7,6,5,4,3,2,1) * 5}
vmul.s16 q2, q0, q2 //q2 = q2*{(8,7,6,5,4,3,2,1) * 5}
-
+
//Calculate the sum of items of q1, q2
vpadd.s16 d0, d2, d3
vpadd.s16 d1, d4, d5
vpaddl.s16 q0, q0
vpaddl.s32 q0, q0
-
+
//Get the value of 'b', 'c' and extend to q1, q2.
vrshr.s64 q0, #6
vdup.s16 q1, d0[0]
vdup.s16 q2, d1[0]
-
+
//Load the table {-7,-6,-5,-4,-3,-2,-1,0} to d0
adr r3, CONST1_GET_I16X16_LUMA_PRED_PLANE
vld1.32 {d0}, [r3]
-
+
//Get the value of 'a' and save to q3
vaddl.u8 q3, d6, d7
vshl.u16 q3, #4
@@ -214,22 +214,22 @@
vmovl.s8 q0, d0
vmla.s16 q3, q0, q1
vmla.s16 q3, q2, d0[0]
-
+
//Calculate a+'b'*{1,2,3,4,5,6,7,8} + c*{-7}
vshl.s16 q5, q1, #3
vadd.s16 q5, q3
-
+
//right shift 5 bits and rounding
vqrshrun.s16 d0, q3, #5
vqrshrun.s16 d1, q5, #5
-
+
//Set the line of MB
vst1.u32 {d0,d1}, [r0]!
-
-
+
+
//Do the same processing for setting other lines
mov r3, #15
-loop_0_get_i16x16_luma_pred_plane:
+loop_0_get_i16x16_luma_pred_plane:
vadd.s16 q3, q2
vadd.s16 q5, q2
vqrshrun.s16 d0, q3, #5
@@ -236,35 +236,35 @@
vqrshrun.s16 d1, q5, #5
vst1.u32 {d0,d1}, [r0]!
subs r3, #1
- bne loop_0_get_i16x16_luma_pred_plane
-
+ bne loop_0_get_i16x16_luma_pred_plane
+
WELS_ASM_FUNC_END
-
+
WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredV_neon
//stmdb sp!, { r2-r5, lr}
//Load the top row (4 bytes)
sub r3, r1, r2
ldr r3, [r3]
-
+
//Set the luma MB using top line
str r3, [r0], #4
str r3, [r0], #4
str r3, [r0], #4
str r3, [r0]
-
+
WELS_ASM_FUNC_END
-
+
WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredH_neon
//stmdb sp!, { r2-r5, lr}
//Load the left column (4 bytes)
sub r3, r1, #1
vld1.8 {d0[]}, [r3], r2
- vld1.8 {d1[]}, [r3], r2
- vld1.8 {d2[]}, [r3], r2
+ vld1.8 {d1[]}, [r3], r2
+ vld1.8 {d2[]}, [r3], r2
vld1.8 {d3[]}, [r3]
-
+
//Set the luma MB using the left side byte
vst1.32 {d0[0]}, [r0]!
vst1.32 {d1[0]}, [r0]!
@@ -279,36 +279,36 @@
//Load the top row data(8 bytes)
sub r3, r1, r2
vld1.32 {d0}, [r3]
-
+
//For "t7 + (t7<<1)"
vdup.8 d1, d0[7]
-
+
//calculate "t0+t1,t1+t2,t2+t3...t6+t7,t7+t7"
vext.8 d1, d0, d1, #1
vaddl.u8 q1, d1, d0
-
+
//calculate "x,t0+t1+t1+t2,t1+t2+t2+t3,...t5+t6+t6+t7,t6+t7+t7+t7"
vext.8 q2, q1, q1, #14
vadd.u16 q0, q1, q2
-
+
//right shift 2 bits and rounding
vqrshrn.u16 d0, q0, #2
-
+
//Save "ddl0, ddl1, ddl2, ddl3"
vext.8 d1, d0, d0, #1
vst1.32 d1[0], [r0]!
-
+
//Save "ddl1, ddl2, ddl3, ddl4"
vext.8 d1, d0, d0, #2
vst1.32 d1[0], [r0]!
-
+
//Save "ddl2, ddl3, ddl4, ddl5"
vext.8 d1, d0, d0, #3
- vst1.32 d1[0], [r0]!
-
+ vst1.32 d1[0], [r0]!
+
//Save "ddl3, ddl4, ddl5, ddl6"
- vst1.32 d0[1], [r0]
-
+ vst1.32 d0[1], [r0]
+
WELS_ASM_FUNC_END
@@ -317,29 +317,29 @@
//Load the top row (4 bytes)
sub r3, r1, r2
vld1.32 {d0[1]}, [r3]
-
+
//Load the left column (5 bytes)
sub r3, #1
vld1.8 {d0[3]}, [r3], r2
- vld1.8 {d0[2]}, [r3], r2
+ vld1.8 {d0[2]}, [r3], r2
vld1.8 {d0[1]}, [r3], r2
- vld1.8 {d0[0]}, [r3], r2
+ vld1.8 {d0[0]}, [r3], r2
vld1.8 {d1[7]}, [r3] //For packing the right sequence to do SIMD processing
-
-
+
+
vext.8 d2, d1, d0, #7 //d0:{L2,L1,L0,LT,T0,T1,T2,T3}
//d2:{L3,L2,L1,L0,LT,T0,T1,T2}
-
+
//q2:{L2+L3,L1+L2,L0+L1...T1+T2,T2+T3}
vaddl.u8 q2, d2, d0
-
+
//q1:{TL0+LT0,LT0+T01,...L12+L23}
vext.8 q3, q3, q2, #14
vadd.u16 q1, q2, q3
-
+
//right shift 2 bits and rounding
vqrshrn.u16 d0, q1, #2
-
+
//Adjust the data sequence for setting luma MB of 'pred'
vst1.32 d0[1], [r0]!
vext.8 d0, d0, d0, #7
@@ -358,19 +358,19 @@
sub r3, r1, r2
vld1.32 {d0}, [r3]
-
+
vext.8 d1, d0, d0, #1
vaddl.u8 q1, d1, d0 //q1:{t0+t1,t1+t2,t2+t3...t5+t6,x,x}
-
+
vext.8 q2, q1, q1, #2
vadd.u16 q2, q1, q2 //q2:{t0+t1+t1+t2,t1+t2+t2+t3,...t4+t5+t5+t6,x,x}
-
+
//calculate the "vl0,vl1,vl2,vl3,vl4"
vqrshrn.u16 d0, q1, #1
-
+
//calculate the "vl5,vl6,vl7,vl8,vl9"
vqrshrn.u16 d1, q2, #2
-
+
//Adjust the data sequence for setting the luma MB
vst1.32 d0[0], [r0]!
vst1.32 d1[0], [r0]!
@@ -378,7 +378,7 @@
vext.8 d1, d1, d1, #1
vst1.32 d0[0], [r0]!
vst1.32 d1[0], [r0]
-
+
WELS_ASM_FUNC_END
@@ -387,34 +387,34 @@
//Load the top row (4 bytes)
sub r3, r1, r2
vld1.32 {d0[1]}, [r3]
-
+
//Load the left column (4 bytes)
sub r3, #1
- vld1.8 {d0[3]}, [r3], r2
+ vld1.8 {d0[3]}, [r3], r2
vld1.8 {d0[2]}, [r3], r2
- vld1.8 {d0[1]}, [r3], r2
- vld1.8 {d0[0]}, [r3]
+ vld1.8 {d0[1]}, [r3], r2
+ vld1.8 {d0[0]}, [r3]
-
+
vext.8 d1, d0, d0, #7
vaddl.u8 q1, d0, d1 //q1:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3}
-
+
vext.u8 q2, q1, q1, #14
vadd.u16 q2, q2, q1 //q2:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3}
-
+
//Calculate the vr0 ~ vr9
vqrshrn.u16 d1, q2, #2
vqrshrn.u16 d0, q1, #1
-
+
//Adjust the data sequence for setting the luma MB
vst1.32 d0[1], [r0]!
vst1.32 d1[1], [r0]!
//add r2, r0, r1
vst1.8 d1[3], [r0]!
- vst1.16 d0[2], [r0]!
+ vst1.16 d0[2], [r0]!
vst1.8 d0[6], [r0]!
vst1.8 d1[2], [r0]!
- vst1.16 d1[2], [r0]!
+ vst1.16 d1[2], [r0]!
vst1.8 d1[6], [r0]
WELS_ASM_FUNC_END
@@ -426,29 +426,29 @@
mov r1, #3
mul r1, r2
add r1, r3
- vld1.8 {d0[]}, [r1]
- vld1.8 {d0[4]}, [r3], r2
+ vld1.8 {d0[]}, [r1]
+ vld1.8 {d0[4]}, [r3], r2
vld1.8 {d0[5]}, [r3], r2
- vld1.8 {d0[6]}, [r3], r2 //d0:{L3,L3,L3,L3,L0,L1,L2,L3}
+ vld1.8 {d0[6]}, [r3], r2 //d0:{L3,L3,L3,L3,L0,L1,L2,L3}
vext.8 d1, d0, d0, #1
- vaddl.u8 q2, d0, d1 //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}
-
+ vaddl.u8 q2, d0, d1 //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}
+
vext.u8 d2, d5, d4, #2
- vadd.u16 d3, d2, d5 //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3}
-
+ vadd.u16 d3, d2, d5 //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3}
+
//Calculate the hu0 ~ hu5
vqrshrn.u16 d2, q2, #1
vqrshrn.u16 d1, q1, #2
-
+
//Adjust the data sequence for setting the luma MB
vzip.8 d2, d1
vst1.32 d1[0], [r0]!
- vext.8 d2, d1, d1, #2
+ vext.8 d2, d1, d1, #2
vst1.32 d2[0], [r0]!
vst1.32 d1[1], [r0]!
vst1.32 d0[0], [r0]
-
+
WELS_ASM_FUNC_END
@@ -458,22 +458,22 @@
sub r3, r1, r2
sub r3, #1
vld1.32 {d0[1]}, [r3], r2
- vld1.8 {d0[3]}, [r3], r2
+ vld1.8 {d0[3]}, [r3], r2
vld1.8 {d0[2]}, [r3], r2
- vld1.8 {d0[1]}, [r3], r2
+ vld1.8 {d0[1]}, [r3], r2
vld1.8 {d0[0]}, [r3] //d0:{L3,L2,L1,L0,LT,T0,T1,T2}
vext.8 d1, d0, d0, #7
vaddl.u8 q1, d0, d1 //q1:{x,L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2}
-
+
vext.u8 q2, q1, q1, #14 //q2:{x,x, L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1}
vadd.u16 q3, q2, q1 //q3:{x,x,L3+L2+L2+L1,L2+L1+L1+L0,L1+L0+L0+LT,L0+LT+LT+T0,LT+T0+T0+T1,T0+T1+T1+T2}
-
+
//Calculate the hd0~hd9
vqrshrn.u16 d1, q3, #2
vqrshrn.u16 d0, q2, #1
-
+
//Adjust the data sequence for setting the luma MB
vmov d3, d1
vtrn.8 d0, d1
@@ -501,25 +501,25 @@
vst1.8 {d0}, [r0]!
vst1.8 {d0}, [r0]!
vst1.8 {d0}, [r0]!
- vst1.8 {d0}, [r0]
-
+ vst1.8 {d0}, [r0]
+
WELS_ASM_FUNC_END
-
+
WELS_ASM_FUNC_BEGIN WelsIChromaPredH_neon
//stmdb sp!, { r2-r5, lr}
////Get the left column (8 byte)
sub r3, r1, #1
vld1.8 {d0[]}, [r3], r2
- vld1.8 {d1[]}, [r3], r2
- vld1.8 {d2[]}, [r3], r2
+ vld1.8 {d1[]}, [r3], r2
+ vld1.8 {d2[]}, [r3], r2
vld1.8 {d3[]}, [r3], r2
vld1.8 {d4[]}, [r3], r2
- vld1.8 {d5[]}, [r3], r2
- vld1.8 {d6[]}, [r3], r2
+ vld1.8 {d5[]}, [r3], r2
+ vld1.8 {d6[]}, [r3], r2
vld1.8 {d7[]}, [r3]
-
- //Set the chroma MB using left column data
+
+ //Set the chroma MB using left column data
vst1.8 {d0}, [r0]!
vst1.8 {d1}, [r0]!
vst1.8 {d2}, [r0]!
@@ -527,8 +527,8 @@
vst1.8 {d4}, [r0]!
vst1.8 {d5}, [r0]!
vst1.8 {d6}, [r0]!
- vst1.8 {d7}, [r0]
-
+ vst1.8 {d7}, [r0]
+
WELS_ASM_FUNC_END
@@ -536,36 +536,36 @@
//stmdb sp!, { r2-r5, lr}
//Load the left column data (8 bytes)
sub r3, r1, #1
- GET_8BYTE_DATA d0, r3, r2
-
+ GET_8BYTE_DATA d0, r3, r2
+
//Load the top row data (8 bytes)
- sub r3, r1, r2
+ sub r3, r1, r2
vldr d1, [r3]
-
+
//Calculate the sum of left column and top row
vpaddl.u8 q0, q0
vpaddl.u16 q0, q0
vadd.u32 d2, d0, d1 //'m1' save to d2
-
- vrshr.u32 q0, q0, #2 //calculate 'm2','m3'
- vrshr.u32 d2, d2, #3 //calculate 'm4'
-
+
+ vrshr.u32 q0, q0, #2 //calculate 'm2','m3'
+ vrshr.u32 d2, d2, #3 //calculate 'm4'
+
//duplicate the 'mx' to a vector line
vdup.8 d4, d2[0]
vdup.8 d5, d1[4]
vdup.8 d6, d0[4]
vdup.8 d7, d2[4]
-
- //Set the chroma MB
+
+ //Set the chroma MB
vst2.32 {d4[0],d5[0]}, [r0]!
vst2.32 {d4[0],d5[0]}, [r0]!
- vst2.32 {d4[0],d5[0]}, [r0]!
vst2.32 {d4[0],d5[0]}, [r0]!
+ vst2.32 {d4[0],d5[0]}, [r0]!
vst2.32 {d6[0],d7[0]}, [r0]!
vst2.32 {d6[0],d7[0]}, [r0]!
vst2.32 {d6[0],d7[0]}, [r0]!
vst2.32 {d6[0],d7[0]}, [r0]
-
+
WELS_ASM_FUNC_END
@@ -579,36 +579,36 @@
//Load the top row data
sub r3, r1, #1
sub r3, r2
- vld1.32 {d1[0]}, [r3]
+ vld1.32 {d1[0]}, [r3]
add r3, #5
vld1.32 {d0[0]}, [r3]
-
+
//Load the left column data
sub r3, #5
vld1.8 {d1[4]}, [r3], r2
- vld1.8 {d1[5]}, [r3], r2
+ vld1.8 {d1[5]}, [r3], r2
vld1.8 {d1[6]}, [r3], r2
- vld1.8 {d1[7]}, [r3], r2 //d1:{LT,T0,T1,T2,LT,L0,L1,L2}
+ vld1.8 {d1[7]}, [r3], r2 //d1:{LT,T0,T1,T2,LT,L0,L1,L2}
add r3, r2
vld1.8 {d0[4]}, [r3], r2
vld1.8 {d0[5]}, [r3], r2
vld1.8 {d0[6]}, [r3], r2
vld1.8 {d0[7]}, [r3] //d0:{T4,T5,T6,T7,L4,L5,L6.L7}
-
-
+
+
//Save T7 to d3 for next step
vdup.u8 d3, d0[3]
//Save L7 to d4 for next step
vdup.u8 d4, d0[7]
-
+
//Calculate the value of 'a' and save to q2
vaddl.u8 q2, d3, d4
vshl.u16 q2, #4
-
+
//Load the table {{1,2,3,4,1,2,3,4}*17}
adr r3, CONST0_GET_I_CHROMA_PRED_PLANE
vld1.32 {d2}, [r3]
-
+
//Calculate the 'b','c', and save to q0
vrev32.8 d1, d1
vsubl.u8 q0, d0, d1
@@ -617,32 +617,32 @@
vpaddl.s16 q0, q0
vpaddl.s32 q0, q0
vrshr.s64 q0, #5
-
+
//Load the table {-3,-2,-1,0,1,2,3,4} to q3
adr r3, CONST1_GET_I_CHROMA_PRED_PLANE
vld1.32 {d6, d7}, [r3]
-
+
//Duplicate the 'b','c' to q0, q1 for SIMD instruction
vdup.s16 q1, d1[0]
vdup.s16 q0, d0[0]
-
+
//Calculate the "(a + b * (j - 3) + c * (- 3) + 16) >> 5;"
vmla.s16 q2, q0, q3
vmla.s16 q2, q1, d6[0]
vqrshrun.s16 d0, q2, #5
-
+
//Set a line of chroma MB
vst1.u32 {d0}, [r0]!
-
+
//Do the same processing for each line.
mov r3, #7
-loop_0_get_i_chroma_pred_plane:
+loop_0_get_i_chroma_pred_plane:
vadd.s16 q2, q1
vqrshrun.s16 d0, q2, #5
vst1.u32 {d0}, [r0]!
subs r3, #1
- bne loop_0_get_i_chroma_pred_plane
-
+ bne loop_0_get_i_chroma_pred_plane
+
WELS_ASM_FUNC_END
#endif
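
The I16x16 DC ("DcBoth") path above averages the 16 top and 16 left neighbours with a rounding shift (vrshr.u16 #5) and fills the block with the result; WelsIChromaPredDC applies the same idea per 4x4 quadrant. A scalar C sketch of the luma case, with illustrative names (pred is written as the packed 16x16 block the asm stores to):

    #include <stdint.h>
    #include <string.h>

    static void I16x16LumaPredDcBoth (uint8_t* pred, const uint8_t* ref, int stride) {
      int sum = 0;
      for (int i = 0; i < 16; i++) {
        sum += ref[i - stride];          /* top row     */
        sum += ref[i * stride - 1];      /* left column */
      }
      const uint8_t dc = (uint8_t) ((sum + 16) >> 5);   /* rounding, as vrshr.u16 #5 */
      for (int y = 0; y < 16; y++)
        memset (pred + 16 * y, dc, 16);
    }
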
--- a/codec/encoder/core/arm/intra_pred_sad_3_opt_neon.S
+++ b/codec/encoder/core/arm/intra_pred_sad_3_opt_neon.S
@@ -29,14 +29,14 @@
* POSSIBILITY OF SUCH DAMAGE.
*
*/
-
+
#ifdef HAVE_NEON
.text
#include "arm_arch_common_macro.S"
-
+
#ifdef APPLE_IOS
- //The data sequence will be used
+ //The data sequence will be used
.macro GET_8BYTE_DATA_L0
vld1.8 {$0[0]}, [$1], $2
vld1.8 {$0[1]}, [$1], $2
@@ -49,7 +49,7 @@
.endm
-.macro HDM_TRANSFORM_4X4_L0
+.macro HDM_TRANSFORM_4X4_L0
//Do the vertical transform
vaddl.u8 q0, $0, $1 //{0,4,8,12,1,5,9,13}
@@ -57,15 +57,15 @@
vswp d1, d2
vadd.s16 q2, q0, q1 //{0,1,2,3,4,5,6,7}
vsub.s16 q1, q0, q1 //{12,13,14,15,8,9,10,11}
-
+
//Do the horizontal transform
vtrn.32 q2, q1
vadd.s16 q0, q2, q1
vsub.s16 q1, q2, q1
-
+
vtrn.16 q0, q1
vadd.s16 q2, q0, q1
- vsub.s16 q1, q0, q1
+ vsub.s16 q1, q0, q1
vmov.s16 d0, d4
vmov.s16 d1, d2
@@ -76,9 +76,9 @@
vtrn.32 d0, d1 //{0,1,3,2}
vaba.s16 $5, d0, $2 //16x16_v
vaba.s16 $5, d1, $8
- vaba.s16 $5, d5, $8
+ vaba.s16 $5, d5, $8
vadd.u16 $5, d3
-
+
//16x16_h
vtrn.16 d4, d5 //{0,4,12,8}
vaba.s16 $6, d4, $3 //16x16_h
@@ -87,7 +87,7 @@
vadd.u16 d2, d3
vadd.u16 d2, d5
vadd.u16 $6, d2
-
+
//16x16_dc_both
vaba.s16 $7, d4, $4 //16x16_dc_both
vadd.u16 $7, d2
@@ -95,7 +95,7 @@
.endm
#else
- //The data sequence will be used
+ //The data sequence will be used
.macro GET_8BYTE_DATA_L0 arg0, arg1, arg2
vld1.8 {\arg0[0]}, [\arg1], \arg2
vld1.8 {\arg0[1]}, [\arg1], \arg2
@@ -115,15 +115,15 @@
vswp d1, d2
vadd.s16 q2, q0, q1 //{0,1,2,3,4,5,6,7}
vsub.s16 q1, q0, q1 //{12,13,14,15,8,9,10,11}
-
+
//Do the horizontal transform
vtrn.32 q2, q1
vadd.s16 q0, q2, q1
vsub.s16 q1, q2, q1
-
+
vtrn.16 q0, q1
vadd.s16 q2, q0, q1
- vsub.s16 q1, q0, q1
+ vsub.s16 q1, q0, q1
vmov.s16 d0, d4
vmov.s16 d1, d2
@@ -134,9 +134,9 @@
vtrn.32 d0, d1 //{0,1,3,2}
vaba.s16 \arg5, d0, \arg2 //16x16_v
vaba.s16 \arg5, d1, \arg8
- vaba.s16 \arg5, d5, \arg8
+ vaba.s16 \arg5, d5, \arg8
vadd.u16 \arg5, d3
-
+
//16x16_h
vtrn.16 d4, d5 //{0,4,12,8}
vaba.s16 \arg6, d4, \arg3 //16x16_h
@@ -145,7 +145,7 @@
vadd.u16 d2, d3
vadd.u16 d2, d5
vadd.u16 \arg6, d2
-
+
//16x16_dc_both
vaba.s16 \arg7, d4, \arg4 //16x16_dc_both
vadd.u16 \arg7, d2
@@ -152,20 +152,20 @@
.endm
#endif
-WELS_ASM_FUNC_BEGIN satd_intra_16x16_x3_opt_neon
+WELS_ASM_FUNC_BEGIN WelsIntra16x16Combined3Satd_neon
stmdb sp!, {r4-r7, lr}
//Get the top line data to 'q15'(16 bytes)
sub r7, r0, r1
vld1.8 {q15}, [r7]
-
+
   //Get the left column data to 'q14' (16 bytes)
sub r7, r0, #1
GET_8BYTE_DATA_L0 d28, r7, r1
- GET_8BYTE_DATA_L0 d29, r7, r1
-
+ GET_8BYTE_DATA_L0 d29, r7, r1
+
//Calculate the mean value and save to 'q13->d27(reserve the d26)' (2 bytes)
- //Calculate the 16x16_dc_both mode SATD
+ //Calculate the 16x16_dc_both mode SATD
vaddl.u8 q0, d30, d31
vaddl.u8 q1, d28, d29
vadd.u16 q0, q1
@@ -172,15 +172,15 @@
vadd.u16 d0, d1
vpaddl.u16 d0, d0
vpaddl.u32 d0, d0
-
- //Calculate the mean value
+
+ //Calculate the mean value
vrshr.u16 d0, #5
- vshl.u16 d27, d0, #4
-
-
+ vshl.u16 d27, d0, #4
+
+
//Calculate the 16x16_v mode SATD and save to "q11, 12"
vshll.u8 q0, d30, #2
- vshll.u8 q1, d31, #2
+ vshll.u8 q1, d31, #2
vtrn.32 q0, q1
vadd.s16 q2, q0, q1
vsub.s16 q1, q0, q1
@@ -191,7 +191,7 @@
//{8,9,11,10, 12,13,15,14} q11
//Calculate the 16x16_h mode SATD and save to "q9, q10"
vshll.u8 q0, d28, #2
- vshll.u8 q1, d29, #2
+ vshll.u8 q1, d29, #2
vtrn.32 q0, q1
vadd.s16 q2, q0, q1
vsub.s16 q1, q0, q1
@@ -199,64 +199,64 @@
vadd.s16 q10, q2, q1
vsub.s16 q9, q2, q1
vtrn.32 q10, q9 //{0,1,3,2, 4,5,7,6} q10
- //{8,9,11,10, 12,13,15,14} q9
-
+ //{8,9,11,10, 12,13,15,14} q9
+
vmov.i32 d17, #0//Save the SATD of DC_BOTH
vmov.i32 d16, #0//Save the SATD of H
vmov.i32 d15, #0//Save the SATD of V
vmov.i32 d14, #0//For zero D register
- //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
+ //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
vld1.32 {q3}, [r2], r3
vld1.32 {q4}, [r2], r3
vld1.32 {q5}, [r2], r3
- vld1.32 {q6}, [r2], r3
+ vld1.32 {q6}, [r2], r3
vtrn.32 q3, q4
- vtrn.32 q5, q6
-
- HDM_TRANSFORM_4X4_L0 d6, d10, d24, d20, d27, d15, d16, d17, d14
+ vtrn.32 q5, q6
+
+ HDM_TRANSFORM_4X4_L0 d6, d10, d24, d20, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d7, d11, d22, d20, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d8, d12, d25, d20, d27, d15, d16, d17, d14
- HDM_TRANSFORM_4X4_L0 d9, d13, d23, d20, d27, d15, d16, d17, d14
+ HDM_TRANSFORM_4X4_L0 d9, d13, d23, d20, d27, d15, d16, d17, d14
- //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
+ //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
vld1.32 {q3}, [r2], r3
vld1.32 {q4}, [r2], r3
vld1.32 {q5}, [r2], r3
- vld1.32 {q6}, [r2], r3
+ vld1.32 {q6}, [r2], r3
vtrn.32 q3, q4
- vtrn.32 q5, q6
-
- HDM_TRANSFORM_4X4_L0 d6, d10, d24, d21, d27, d15, d16, d17, d14
+ vtrn.32 q5, q6
+
+ HDM_TRANSFORM_4X4_L0 d6, d10, d24, d21, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d7, d11, d22, d21, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d8, d12, d25, d21, d27, d15, d16, d17, d14
- HDM_TRANSFORM_4X4_L0 d9, d13, d23, d21, d27, d15, d16, d17, d14
-
- //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
+ HDM_TRANSFORM_4X4_L0 d9, d13, d23, d21, d27, d15, d16, d17, d14
+
+ //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
vld1.32 {q3}, [r2], r3
vld1.32 {q4}, [r2], r3
vld1.32 {q5}, [r2], r3
- vld1.32 {q6}, [r2], r3
+ vld1.32 {q6}, [r2], r3
vtrn.32 q3, q4
- vtrn.32 q5, q6
-
- HDM_TRANSFORM_4X4_L0 d6, d10, d24, d18, d27, d15, d16, d17, d14
+ vtrn.32 q5, q6
+
+ HDM_TRANSFORM_4X4_L0 d6, d10, d24, d18, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d7, d11, d22, d18, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d8, d12, d25, d18, d27, d15, d16, d17, d14
- HDM_TRANSFORM_4X4_L0 d9, d13, d23, d18, d27, d15, d16, d17, d14
-
- //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
+ HDM_TRANSFORM_4X4_L0 d9, d13, d23, d18, d27, d15, d16, d17, d14
+
+ //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
vld1.32 {q3}, [r2], r3
vld1.32 {q4}, [r2], r3
vld1.32 {q5}, [r2], r3
- vld1.32 {q6}, [r2], r3
+ vld1.32 {q6}, [r2], r3
vtrn.32 q3, q4
- vtrn.32 q5, q6
-
- HDM_TRANSFORM_4X4_L0 d6, d10, d24, d19, d27, d15, d16, d17, d14
+ vtrn.32 q5, q6
+
+ HDM_TRANSFORM_4X4_L0 d6, d10, d24, d19, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d7, d11, d22, d19, d27, d15, d16, d17, d14
HDM_TRANSFORM_4X4_L0 d8, d12, d25, d19, d27, d15, d16, d17, d14
- HDM_TRANSFORM_4X4_L0 d9, d13, d23, d19, d27, d15, d16, d17, d14
-
+ HDM_TRANSFORM_4X4_L0 d9, d13, d23, d19, d27, d15, d16, d17, d14
+
//Get the data from stack
ldr r5, [sp, #20] //the addr of Best_mode
ldr r6, [sp, #24] //the value of i_lambda
@@ -266,19 +266,19 @@
vpaddl.u16 d15, d15
vpaddl.u32 d15, d15
vmov.u32 r0, d15[0]
-
+
//vadd.u16 d22, d23
vrshr.u16 d16, #1
vpaddl.u16 d16, d16
vpaddl.u32 d16, d16
- vmov.u32 r1, d16[0]
+ vmov.u32 r1, d16[0]
add r1, r6, lsl #1
-
+
//vadd.u16 d20, d21
vrshr.u16 d17, #1
vpaddl.u16 d17, d17
vpaddl.u32 d17, d17
- vmov.u32 r2, d17[0]
+ vmov.u32 r2, d17[0]
add r2, r6, lsl #1
mov r4, #0
@@ -295,20 +295,20 @@
WELS_ASM_FUNC_END
-WELS_ASM_FUNC_BEGIN sad_intra_16x16_x3_opt_neon
+WELS_ASM_FUNC_BEGIN WelsIntra16x16Combined3Sad_neon
stmdb sp!, {r4-r7, lr}
-
+
//Get the top line data to 'q15'(16 bytes)
sub r4, r0, r1
vld1.8 {q15}, [r4]
-
+
   //Get the left column data to 'q14' (16 bytes)
sub r4, r0, #1
GET_8BYTE_DATA_L0 d28, r4, r1
- GET_8BYTE_DATA_L0 d29, r4, r1
-
+ GET_8BYTE_DATA_L0 d29, r4, r1
+
//Calculate the mean value and save to 'q13' (8 bytes)
- //Calculate the 16x16_dc_both mode SATD
+ //Calculate the 16x16_dc_both mode SATD
vaddl.u8 q0, d30, d31
vaddl.u8 q1, d28, d29
vadd.u16 q0, q1
@@ -315,40 +315,40 @@
vadd.u16 d0, d1
vpaddl.u16 d0, d0
vpaddl.u32 d0, d0
-
- //Calculate the mean value
+
+ //Calculate the mean value
vrshr.u16 d0, d0, #5
vdup.8 q13, d0[0]
-
+
sub r4, r0, #1
-
+
vmov.i32 q12, #0//Save the SATD of DC_BOTH
vmov.i32 q11, #0//Save the SATD of H
vmov.i32 q10, #0//Save the SATD of V
-
+
mov lr, #16
sad_intra_16x16_x3_opt_loop0:
   //Get the left column data to 'd0' (16 bytes)
- vld1.8 {d0[]}, [r4], r1
+ vld1.8 {d0[]}, [r4], r1
- //Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
+ //Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
vld1.8 {q1}, [r2], r3
-
+
subs lr, #1
   //Do the SAD for top column
vabal.u8 q12, d30, d2
- vabal.u8 q12, d31, d3
+ vabal.u8 q12, d31, d3
   //Do the SAD for left column
vabal.u8 q11, d0, d2
- vabal.u8 q11, d0, d3
+ vabal.u8 q11, d0, d3
//Do the SAD for mean value
vabal.u8 q10, d26, d2
- vabal.u8 q10, d26, d3
-
+ vabal.u8 q10, d26, d3
+
bne sad_intra_16x16_x3_opt_loop0
-
+
//Get the data from stack
ldr r5, [sp, #20] //the addr of Best_mode
ldr r6, [sp, #24] //the value of i_lambda
@@ -357,19 +357,19 @@
vpaddl.u16 d24, d24
vpaddl.u32 d24, d24
vmov.u32 r0, d24[0]
-
+
vadd.u16 d22, d23
vpaddl.u16 d22, d22
vpaddl.u32 d22, d22
- vmov.u32 r1, d22[0]
+ vmov.u32 r1, d22[0]
add r1, r6, lsl #1
-
+
vadd.u16 d20, d21
vpaddl.u16 d20, d20
vpaddl.u32 d20, d20
- vmov.u32 r2, d20[0]
+ vmov.u32 r2, d20[0]
add r2, r6, lsl #1
-
+
mov r4, #0
cmp r1, r0
movcc r0, r1
@@ -384,120 +384,120 @@
WELS_ASM_FUNC_END
-WELS_ASM_FUNC_BEGIN sad_intra_8x8_x3_opt_neon
+WELS_ASM_FUNC_BEGIN WelsIntra8x8Combined3Sad_neon
stmdb sp!, {r4-r7, lr}
-
+
//Get the data from stack
ldr r4, [sp, #32] //p_dec_cr
ldr r5, [sp, #36] //p_enc_cr
-
+
   //Get the left column data to 'd28(cb), d30(cr)' (16 bytes)
sub r6, r0, #1
GET_8BYTE_DATA_L0 d28, r6, r1
- sub r6, r4, #1
- GET_8BYTE_DATA_L0 d30, r6, r1
-
+ sub r6, r4, #1
+ GET_8BYTE_DATA_L0 d30, r6, r1
+
//Get the top line data to 'd29(cb), d31(cr)'(16 bytes)
sub r6, r0, r1
vld1.8 {d29}, [r6]
sub r6, r4, r1
vld1.8 {d31}, [r6]
-
+
//Calculate the sum of left column and top row
vmov.i32 q0, q14
vpaddl.u8 q0, q0
vpaddl.u16 q0, q0
vadd.u32 d2, d0, d1 //'m1' save to d2
- vrshr.u32 q0, q0, #2 //calculate 'm2','m3'
- vrshr.u32 d2, d2, #3 //calculate 'm4'
-
- //duplicate the 'mx' to a vector line
+ vrshr.u32 q0, q0, #2 //calculate 'm2','m3'
+ vrshr.u32 d2, d2, #3 //calculate 'm4'
+
+ //duplicate the 'mx' to a vector line
vdup.8 d27, d2[0]
vdup.8 d26, d1[4]
vtrn.32 d27, d26
-
+
vdup.8 d26, d0[4]
vdup.8 d25, d2[4]
vtrn.32 d26, d25 //Save to "d27, d26"
-
+
vmov.i32 q0, q15
vpaddl.u8 q0, q0
vpaddl.u16 q0, q0
vadd.u32 d2, d0, d1 //'m1' save to d2
- vrshr.u32 q0, q0, #2 //calculate 'm2','m3'
- vrshr.u32 d2, d2, #3 //calculate 'm4'
-
+ vrshr.u32 q0, q0, #2 //calculate 'm2','m3'
+ vrshr.u32 d2, d2, #3 //calculate 'm4'
+
//duplicate the 'mx' to a vector line
vdup.8 d25, d2[0]
vdup.8 d24, d1[4]
vtrn.32 d25, d24
-
+
vdup.8 d24, d0[4]
vdup.8 d23, d2[4]
vtrn.32 d24, d23 //Save to "d25, d24"
-
+
vmov.i32 q11, #0//Save the SATD of DC_BOTH
vmov.i32 q10, #0//Save the SATD of H
vmov.i32 q9 , #0//Save the SATD of V
sub r6, r0, #1
- sub r7, r4, #1
+ sub r7, r4, #1
mov lr, #4
sad_intra_8x8_x3_opt_loop0:
- //Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
+ //Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
vld1.8 {d0}, [r2], r3
vld1.8 {d1}, [r5], r3
-
+
   //Get the left column data to 'd0' (16 bytes)
- vld1.8 {d2[]}, [r6], r1
- vld1.8 {d3[]}, [r7], r1
-
+ vld1.8 {d2[]}, [r6], r1
+ vld1.8 {d3[]}, [r7], r1
+
subs lr, #1
-
+
   //Do the SAD for top column
- vabal.u8 q11, d29, d0
- vabal.u8 q11, d31, d1
+ vabal.u8 q11, d29, d0
+ vabal.u8 q11, d31, d1
   //Do the SAD for left column
vabal.u8 q10, d2, d0
- vabal.u8 q10, d3, d1
+ vabal.u8 q10, d3, d1
//Do the SAD for mean value
vabal.u8 q9, d27, d0
- vabal.u8 q9, d25, d1
-
-
+ vabal.u8 q9, d25, d1
+
+
bne sad_intra_8x8_x3_opt_loop0
mov lr, #4
sad_intra_8x8_x3_opt_loop1:
- //Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
+ //Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
vld1.8 {d0}, [r2], r3
vld1.8 {d1}, [r5], r3
-
+
   //Get the left column data to 'd0' (16 bytes)
- vld1.8 {d2[]}, [r6], r1
- vld1.8 {d3[]}, [r7], r1
-
+ vld1.8 {d2[]}, [r6], r1
+ vld1.8 {d3[]}, [r7], r1
+
subs lr, #1
-
+
   //Do the SAD for top column
- vabal.u8 q11, d29, d0
- vabal.u8 q11, d31, d1
+ vabal.u8 q11, d29, d0
+ vabal.u8 q11, d31, d1
   //Do the SAD for left column
vabal.u8 q10, d2, d0
- vabal.u8 q10, d3, d1
+ vabal.u8 q10, d3, d1
//Do the SAD for mean value
vabal.u8 q9, d26, d0
- vabal.u8 q9, d24, d1
-
-
- bne sad_intra_8x8_x3_opt_loop1
+ vabal.u8 q9, d24, d1
+
+
+ bne sad_intra_8x8_x3_opt_loop1
//Get the data from stack
ldr r5, [sp, #20] //the addr of Best_mode
ldr r6, [sp, #24] //the value of i_lambda
@@ -505,13 +505,13 @@
vadd.u16 d22, d23
vpaddl.u16 d22, d22
vpaddl.u32 d22, d22
- vmov.u32 r0, d22[0]
+ vmov.u32 r0, d22[0]
add r0, r6, lsl #1
-
+
vadd.u16 d20, d21
vpaddl.u16 d20, d20
vpaddl.u32 d20, d20
- vmov.u32 r1, d20[0]
+ vmov.u32 r1, d20[0]
add r1, r6, lsl #1
vadd.u16 d18, d19
@@ -533,28 +533,28 @@
WELS_ASM_FUNC_END
-WELS_ASM_FUNC_BEGIN satd_intra_8x8_x3_opt_neon
+WELS_ASM_FUNC_BEGIN WelsIntra8x8Combined3Satd_neon
stmdb sp!, {r4-r7, lr}
-
+
//Get the data from stack
ldr r4, [sp, #32] //p_dec_cr
ldr r5, [sp, #36] //p_enc_cr
-
+
//Get the top line data to 'd29(cb), d31(cr)'(16 bytes)
sub r6, r0, r1
vld1.8 {d29}, [r6]
sub r6, r4, r1
vld1.8 {d31}, [r6]
-
+
   //Get the left column data to 'd28(cb), d30(cr)' (16 bytes)
sub r6, r0, #1
GET_8BYTE_DATA_L0 d28, r6, r1
- sub r6, r4, #1
- GET_8BYTE_DATA_L0 d30, r6, r1
-
+ sub r6, r4, #1
+ GET_8BYTE_DATA_L0 d30, r6, r1
+
//Calculate the 16x16_v mode SATD and save to "q12, 13"
vshll.u8 q0, d29, #2
- vshll.u8 q1, d31, #2
+ vshll.u8 q1, d31, #2
vtrn.32 q0, q1
vadd.s16 q2, q0, q1
vsub.s16 q1, q0, q1
@@ -565,7 +565,7 @@
//{8,9,11,10, 12,13,15,14} q12
//Calculate the 16x16_h mode SATD and save to "q10, q11"
vshll.u8 q0, d28, #2
- vshll.u8 q1, d30, #2
+ vshll.u8 q1, d30, #2
vtrn.32 q0, q1
vadd.s16 q2, q0, q1
vsub.s16 q1, q0, q1
@@ -573,69 +573,69 @@
vadd.s16 q11, q2, q1
vsub.s16 q10, q2, q1
vtrn.32 q11, q10 //{0,1,3,2, 4,5,7,6} q11
- //{8,9,11,10, 12,13,15,14} q10
-
+ //{8,9,11,10, 12,13,15,14} q10
+
//Calculate the sum of left column and top row
//vmov.i32 q0, q14
vpaddl.u8 q0, q14
vpaddl.u16 q0, q0
- vadd.u32 d2, d0, d1
+ vadd.u32 d2, d0, d1
vpaddl.u8 q2, q15
vpaddl.u16 q2, q2
- vadd.u32 d3, d4, d5
-
+ vadd.u32 d3, d4, d5
+
vtrn.32 q0, q2
vrshr.u32 q1, #3
- vrshr.u32 q2, #2
+ vrshr.u32 q2, #2
vshll.u32 q9, d4, #4 // {2cb, 2cr} q9
vshll.u32 q8, d5, #4 // {1cb, 1cr} q8
vshll.u32 q7, d2, #4 // {0cb, 3cb} q7
vshll.u32 q6, d3, #4 // {0cr, 3cr} q6
-
-
+
+
vmov.i32 d28, #0//Save the SATD of DC_BOTH
vmov.i32 d10, #0//Save the SATD of H
vmov.i32 d11, #0//Save the SATD of V
vmov.i32 d30, #0//For zero D register
- //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
+ //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
vld1.32 {d6}, [r2], r3
vld1.32 {d7}, [r2], r3
vld1.32 {d8}, [r2], r3
- vld1.32 {d9}, [r2], r3
+ vld1.32 {d9}, [r2], r3
vtrn.32 d6, d7
- vtrn.32 d8, d9
+ vtrn.32 d8, d9
HDM_TRANSFORM_4X4_L0 d6, d8, d26, d22, d14, d11, d10, d28, d30
HDM_TRANSFORM_4X4_L0 d7, d9, d27, d22, d16, d11, d10, d28, d30
-
+
vld1.32 {d6}, [r5], r3
vld1.32 {d7}, [r5], r3
vld1.32 {d8}, [r5], r3
- vld1.32 {d9}, [r5], r3
+ vld1.32 {d9}, [r5], r3
vtrn.32 d6, d7
- vtrn.32 d8, d9
+ vtrn.32 d8, d9
HDM_TRANSFORM_4X4_L0 d6, d8, d24, d20, d12, d11, d10, d28, d30
- HDM_TRANSFORM_4X4_L0 d7, d9, d25, d20, d17, d11, d10, d28, d30
+ HDM_TRANSFORM_4X4_L0 d7, d9, d25, d20, d17, d11, d10, d28, d30
- //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
+ //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
vld1.32 {d6}, [r2], r3
vld1.32 {d7}, [r2], r3
vld1.32 {d8}, [r2], r3
- vld1.32 {d9}, [r2], r3
+ vld1.32 {d9}, [r2], r3
vtrn.32 d6, d7
- vtrn.32 d8, d9
- HDM_TRANSFORM_4X4_L0 d6, d8, d26, d23, d18, d11, d10, d28, d30
+ vtrn.32 d8, d9
+ HDM_TRANSFORM_4X4_L0 d6, d8, d26, d23, d18, d11, d10, d28, d30
HDM_TRANSFORM_4X4_L0 d7, d9, d27, d23, d15, d11, d10, d28, d30
-
+
vld1.32 {d6}, [r5], r3
vld1.32 {d7}, [r5], r3
vld1.32 {d8}, [r5], r3
- vld1.32 {d9}, [r5], r3
+ vld1.32 {d9}, [r5], r3
vtrn.32 d6, d7
- vtrn.32 d8, d9
+ vtrn.32 d8, d9
HDM_TRANSFORM_4X4_L0 d6, d8, d24, d21, d19, d11, d10, d28, d30
- HDM_TRANSFORM_4X4_L0 d7, d9, d25, d21, d13, d11, d10, d28, d30
-
+ HDM_TRANSFORM_4X4_L0 d7, d9, d25, d21, d13, d11, d10, d28, d30
+
//Get the data from stack
ldr r5, [sp, #20] //the addr of Best_mode
ldr r6, [sp, #24] //the value of i_lambda
@@ -643,13 +643,13 @@
vrshr.u16 d11, #1
vpaddl.u16 d11, d11
vpaddl.u32 d11, d11
- vmov.u32 lr, d11[0]
+ vmov.u32 lr, d11[0]
add lr, r6, lsl #1
-
+
vrshr.u16 d10, #1
vpaddl.u16 d10, d10
vpaddl.u32 d10, d10
- vmov.u32 r3, d10[0]
+ vmov.u32 r3, d10[0]
add r3, r6, lsl #1
vrshr.u16 d28, #1
@@ -672,13 +672,13 @@
WELS_ASM_FUNC_END
-WELS_ASM_FUNC_BEGIN satd_intra_4x4_x3_opt_neon
+WELS_ASM_FUNC_BEGIN WelsIntra4x4Combined3Satd_neon
stmdb sp!, {r4-r7, lr}
//Get the top line data to 'd31[0~3]'(4 bytes)
sub r7, r0, r1
vld1.32 {d31[0]}, [r7]
-
+
   //Get the left column data to 'd31[4~7]' (4 bytes)
sub r7, r0, #1
vld1.8 {d31[4]}, [r7], r1
@@ -685,18 +685,18 @@
vld1.8 {d31[5]}, [r7], r1
vld1.8 {d31[6]}, [r7], r1
vld1.8 {d31[7]}, [r7], r1
-
+
//Calculate the mean value and save to 'd30' (2 bytes)
vpaddl.u8 d0, d31
vpaddl.u16 d0, d0
- vpaddl.u32 d0, d0
- //Calculate the mean value
+ vpaddl.u32 d0, d0
+ //Calculate the mean value
vrshr.u16 d0, #3
- vshl.u16 d30, d0, #4
-
+ vshl.u16 d30, d0, #4
+
//Calculate the 16x16_v mode SATD and save to "d29"
- //Calculate the 16x16_h mode SATD and save to "d28"
- vshll.u8 q0, d31, #2
+ //Calculate the 16x16_h mode SATD and save to "d28"
+ vshll.u8 q0, d31, #2
vtrn.32 d0, d1
vadd.s16 d2, d0, d1
vsub.s16 d1, d0, d1
@@ -710,12 +710,12 @@
vmov.i32 d26, #0//Save the SATD of H
vmov.i32 d25, #0//Save the SATD of V
vmov.i32 d24, #0//For zero D register
-
- //Load the p_enc data and save to "d22,d23"--- 4X4 bytes
+
+ //Load the p_enc data and save to "d22,d23"--- 4X4 bytes
vld1.32 {d23[0]}, [r2], r3
vld1.32 {d23[1]}, [r2], r3
vld1.32 {d22[0]}, [r2], r3
- vld1.32 {d22[1]}, [r2], r3
+ vld1.32 {d22[1]}, [r2], r3
HDM_TRANSFORM_4X4_L0 d23, d22, d29, d28, d30, d25, d26, d27, d24
@@ -723,17 +723,17 @@
ldr r5, [sp, #28] //the value of lambda2
ldr r6, [sp, #32] //the value of lambda1
ldr r7, [sp, #36] //the value of lambda0
-
+
vrshr.u16 d25, #1
vpaddl.u16 d25, d25
vpaddl.u32 d25, d25
- vmov.u32 r0, d25[0]
+ vmov.u32 r0, d25[0]
add r0, r7
-
+
vrshr.u16 d26, #1
vpaddl.u16 d26, d26
vpaddl.u32 d26, d26
- vmov.u32 r1, d26[0]
+ vmov.u32 r1, d26[0]
add r1, r6
vrshr.u16 d27, #1
@@ -741,10 +741,10 @@
vpaddl.u32 d27, d27
vmov.u32 r2, d27[0]
add r2, r5
-
+
ldr r5, [sp, #20] //p_dst
- ldr r6, [sp, #24] //the addr of Best_mode
-
+ ldr r6, [sp, #24] //the addr of Best_mode
+
mov r4, r0
cmp r1, r4
movcc r4, r1
@@ -770,8 +770,8 @@
vdup.8 d0, d31[4]
vdup.8 d1, d31[5]
vdup.8 d2, d31[6]
- vdup.8 d3, d31[7]
- vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r5]
+ vdup.8 d3, d31[7]
+ vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r5]
bl satd_intra_4x4_x3_opt_end
satd_intra_4x4_x3_opt_jump1:
@@ -783,11 +783,11 @@
vst1.32 {d31[0]}, [r5]!
vst1.32 {d31[0]}, [r5]!
-
+
satd_intra_4x4_x3_opt_end:
- mov r0, r4
-
+ mov r0, r4
+
ldmia sp!, {r4-r7, lr}
WELS_ASM_FUNC_END
-#endif
\ No newline at end of file
+#endif
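
At the end of WelsIntra16x16Combined3Sad_neon (and similarly its SATD counterpart) above, the three distortion sums become mode costs by adding a lambda penalty (i_lambda << 1) to the H and DC candidates; the minimum cost is returned and the winning mode index is stored through the Best_mode pointer. A scalar sketch of that final comparison; the 0/1/2 mode numbering is an assumption read off the register usage, not a confirmed enum.

    #include <stdint.h>

    static uint32_t PickIntra16x16Mode (uint32_t sad_v, uint32_t sad_h, uint32_t sad_dc,
                                        uint32_t lambda, int* best_mode) {
      uint32_t cost_v  = sad_v;                  /* r0: no extra penalty   */
      uint32_t cost_h  = sad_h  + (lambda << 1); /* r1: add r1, r6, lsl #1 */
      uint32_t cost_dc = sad_dc + (lambda << 1); /* r2: add r2, r6, lsl #1 */
      uint32_t best = cost_v;
      *best_mode = 0;
      if (cost_h  < best) { best = cost_h;  *best_mode = 1; }
      if (cost_dc < best) { best = cost_dc; *best_mode = 2; }
      return best;
    }
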
--- a/codec/encoder/core/arm/mc_neon.S
+++ b/codec/encoder/core/arm/mc_neon.S
@@ -1,1963 +1,1963 @@
-/*!
- * \copy
- * Copyright (c) 2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#ifdef HAVE_NEON
-.text
-#include "arm_arch_common_macro.S"
-
-#ifdef APPLE_IOS
-.macro AVERAGE_TWO_8BITS
-// { // input:dst_d, src_d A and B; working: q13
- vaddl.u8 q13, $2, $1
- vrshrn.u16 $0, q13, #1
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b;
- vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
- vaddl.u8 q13, $2, $3 //src[0]+src[1]
- vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, $1, $4 //src[-1]+src[2]
- vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
- vqrshrun.s16 $6, q12, #5
-// }
-.endm
-
-.macro FILTER_SINGLE_TAG_8BITS // when width=17/9, used
-// { // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2},
- vrev64.8 $2, $0 // X[5][4][3][2][1][0]O
- vaddl.u8 $3, $0, $2 // each 16bits, *[50][41][32][23][14][05]*
- vmul.s16 $0, $2, $1 // 0+1*[50]-5*[41]+20[32]
- vpadd.s16 $0, $0, $0
- vpadd.s16 $0, $0, $0
- vqrshrun.s16 $0, $4, #5
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS_AVERAGE_WITH_0
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b;
- vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
- vaddl.u8 q13, $2, $3 //src[0]+src[1]
- vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, $1, $4 //src[-1]+src[2]
- vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
- vqrshrun.s16 $6, q12, #5
- vaddl.u8 q13, $2, $6
- vrshrn.u16 $6, q13, #1
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS_AVERAGE_WITH_1
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b;
- vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
- vaddl.u8 q13, $2, $3 //src[0]+src[1]
- vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, $1, $4 //src[-1]+src[2]
- vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
- vqrshrun.s16 $6, q12, #5
- vaddl.u8 q13, $3, $6
- vrshrn.u16 $6, q13, #1
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS_TO_16BITS
-// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q,
- vaddl.u8 $6, $0, $5 //dst_q=src[-2]+src[3]
- vaddl.u8 q13, $2, $3 //src[0]+src[1]
- vmla.u16 $6, q13, $7 //dst_q += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, $1, $4 //src[-1]+src[2]
- vmls.s16 $6, q13, $8 //dst_q -= 5*(src[-1]+src[2]), 2 cycles
-// }
-.endm
-
-.macro FILTER_3_IN_16BITS_TO_8BITS
-// { // input:a, b, c, dst_d;
- vsub.s16 $0, $0, $1 //a-b
- vshr.s16 $0, $0, #2 //(a-b)/4
- vsub.s16 $0, $0, $1 //(a-b)/4-b
- vadd.s16 $0, $0, $2 //(a-b)/4-b+c
- vshr.s16 $0, $0, #2 //((a-b)/4-b+c)/4
- vadd.s16 $0, $0, $2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- vqrshrun.s16 $3, $0, #6 //(+32)>>6
-// }
-.endm
-
-.macro UNPACK_2_16BITS_TO_ABC
-// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a
- vext.16 $4, $0, $1, #2 //src[0]
- vext.16 $3, $0, $1, #3 //src[1]
- vadd.s16 $4, $3 //c=src[0]+src[1]
-
- vext.16 $3, $0, $1, #1 //src[-1]
- vext.16 $2, $0, $1, #4 //src[2]
- vadd.s16 $3, $2 //b=src[-1]+src[2]
-
- vext.16 $2, $0, $1, #5 //src[3]
- vadd.s16 $2, $0 //a=src[-2]+src[3]
-// }
-.endm
-
-.macro UNPACK_1_IN_8x16BITS_TO_8BITS
-// { // each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
- vext.16 $3, $3, $3, #7 // 0x????, [0][1][2][3][4][5],
- vrev64.16 $1, $1
- vadd.u16 $2, $1 // C[2+3],B[1+4],A[0+5],
- vshr.s64 $1, $2, #16
- vshr.s64 $0, $2, #32 // Output: C $2, B $1, A $0
-
- vsub.s16 $0, $0, $1 //a-b
- vshr.s16 $0, $0, #2 //(a-b)/4
- vsub.s16 $0, $0, $1 //(a-b)/4-b
- vadd.s16 $0, $0, $2 //(a-b)/4-b+c
- vshr.s16 $0, $0, #2 //((a-b)/4-b+c)/4
- vadd.s16 $1, $0, $2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- vqrshrun.s16 $0, $3, #6 //(+32)>>6
-// }
-.endm
-#else
-.macro AVERAGE_TWO_8BITS arg0, arg1,arg2
-// { // input:dst_d, src_d A and B; working: q13
- vaddl.u8 q13, \arg2, \arg1
- vrshrn.u16 \arg0, q13, #1
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS arg0, arg1,arg2, arg3, arg4,arg5, arg6, arg7,arg8
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b
- vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3]
- vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
- vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
- vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
- vqrshrun.s16 \arg6, q12, #5
-// }
-.endm
-
-.macro FILTER_SINGLE_TAG_8BITS arg0, arg1,arg2, arg3, arg4,arg5 // when width=17/9, used
-// { // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2}
- vrev64.8 \arg2, \arg0 // X[5][4][3][2][1][0]O
- vaddl.u8 \arg3, \arg0, \arg2 // each 16bits, *[50][41][32][23][14][05]*
- vmul.s16 \arg0, \arg2, \arg1 // 0+1*[50]-5*[41]+20[32]
- vpadd.s16 \arg0, \arg0, \arg0
- vpadd.s16 \arg0, \arg0, \arg0
- vqrshrun.s16 \arg0, \arg4, #5
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS_AVERAGE_WITH_0 arg0, arg1,arg2, arg3, arg4,arg5, arg6, arg7,arg8
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d
- vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3]
- vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
- vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
- vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
- vqrshrun.s16 \arg6, q12, #5
- vaddl.u8 q13, \arg2, \arg6
- vrshrn.u16 \arg6, q13, #1
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS_AVERAGE_WITH_1 arg0, arg1,arg2, arg3, arg4,arg5, arg6, arg7,arg8
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d
- vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3]
- vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
- vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
- vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
- vqrshrun.s16 \arg6, q12, #5
- vaddl.u8 q13, \arg3, \arg6
- vrshrn.u16 \arg6, q13, #1
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS_TO_16BITS arg0, arg1,arg2, arg3, arg4,arg5, arg6, arg7,arg8
-// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3]
- vaddl.u8 \arg6, \arg0, \arg5 //dst_q=src[-2]+src[3]
- vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
- vmla.u16 \arg6, q13, \arg7 //dst_q += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
- vmls.s16 \arg6, q13, \arg8 //dst_q -= 5*(src[-1]+src[2]), 2 cycles
-// }
-.endm
-
-.macro FILTER_3_IN_16BITS_TO_8BITS arg0, arg1,arg2, arg3
-// { // input:a, b, c, dst_d;
- vsub.s16 \arg0, \arg0, \arg1 //a-b
- vshr.s16 \arg0, \arg0, #2 //(a-b)/4
- vsub.s16 \arg0, \arg0, \arg1 //(a-b)/4-b
- vadd.s16 \arg0, \arg0, \arg2 //(a-b)/4-b+c
- vshr.s16 \arg0, \arg0, #2 //((a-b)/4-b+c)/4
- vadd.s16 \arg0, \arg0, \arg2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- vqrshrun.s16 \arg3, \arg0, #6 //(+32)>>6
-// }
-.endm
-
-.macro UNPACK_2_16BITS_TO_ABC arg0, arg1,arg2, arg3, arg4
-// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5)
- vext.16 \arg4, \arg0, \arg1, #2 //src[0]
- vext.16 \arg3, \arg0, \arg1, #3 //src[1]
- vadd.s16 \arg4, \arg3 //c=src[0]+src[1]
-
- vext.16 \arg3, \arg0, \arg1, #1 //src[-1]
- vext.16 \arg2, \arg0, \arg1, #4 //src[2]
- vadd.s16 \arg3, \arg2 //b=src[-1]+src[2]
-
- vext.16 \arg2, \arg0, \arg1, #5 //src[3]
- vadd.s16 \arg2, \arg0 //a=src[-2]+src[3]
-// }
-.endm
-
-.macro UNPACK_1_IN_8x16BITS_TO_8BITS arg0, arg1,arg2, arg3
-// { // each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
- vext.16 \arg3, \arg3, \arg3, #7 // 0x????, [0][1][2][3][4][5]
- vrev64.16 \arg1, \arg1
- vadd.u16 \arg2, \arg1 // C[2+3],B[1+4],A[0+5]
- vshr.s64 \arg1, \arg2, #16
- vshr.s64 \arg0, \arg2, #32 // Output: C \arg2, B \arg1, A \arg0
-
- vsub.s16 \arg0, \arg0, \arg1 //a-b
- vshr.s16 \arg0, \arg0, #2 //(a-b)/4
- vsub.s16 \arg0, \arg0, \arg1 //(a-b)/4-b
- vadd.s16 \arg0, \arg0, \arg2 //(a-b)/4-b+c
- vshr.s16 \arg0, \arg0, #2 //((a-b)/4-b+c)/4
- vadd.s16 \arg1, \arg0, \arg2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- vqrshrun.s16 \arg0, \arg3, #6 //(+32)>>6
-// }
-.endm
-#endif
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_h_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
-
-w16_h_mc_luma_loop:
- vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2]
- pld [r0]
- pld [r0, #16]
-
- vext.8 q2, q0, q1, #1 //q2=src[-1]
- vext.8 q3, q0, q1, #2 //q3=src[0]
- vext.8 q4, q0, q1, #3 //q4=src[1]
- vext.8 q5, q0, q1, #4 //q5=src[2]
- vext.8 q6, q0, q1, #5 //q6=src[3]
-
- FILTER_6TAG_8BITS d0, d4, d6, d8, d10, d12, d2, q14, q15
-
- FILTER_6TAG_8BITS d1, d5, d7, d9, d11, d13, d3, q14, q15
-
- sub r4, #1
- vst1.u8 {d2, d3}, [r2], r3 //write 16Byte
-
- cmp r4, #0
- bne w16_h_mc_luma_loop
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w17_h_neon
- push {r4-r5}
- mov r4, #20
- mov r5, #1
- sub r4, r4, r4, lsl #(16-2)
- lsl r5, #16
- ror r4, #16
- vmov d3, r5, r4 // 0x0014FFFB00010000
-
- sub r3, #16
- ldr r4, [sp, #8]
-
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
-
-w17_h_mc_luma_loop:
- vld1.u8 {d0,d1,d2}, [r0], r1 //only use 22(17+5); q0=src[-2]
-
- vext.8 q2, q0, q1, #1 //q2=src[-1]
- vext.8 q3, q0, q1, #2 //q3=src[0]
- vext.8 q4, q0, q1, #3 //q4=src[1]
- vext.8 q5, q0, q1, #4 //q5=src[2]
- vext.8 q6, q0, q1, #5 //q6=src[3]
-
- FILTER_6TAG_8BITS d0, d4, d6, d8, d10, d12, d14, q14, q15
-
- FILTER_6TAG_8BITS d1, d5, d7, d9, d11, d13, d15, q14, q15
-
- vst1.u8 {d14, d15}, [r2]! //write [0:15] Byte
-
- vsli.64 d2, d2, #8 // [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X
- FILTER_SINGLE_TAG_8BITS d2, d3, d14, q7, q1
-
- vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte
-
- sub r4, #1
- cmp r4, #0
- bne w17_h_mc_luma_loop
- pop {r4-r5}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w9_h_neon
- push {r4-r5}
- mov r4, #20
- mov r5, #1
- sub r4, r4, r4, lsl #(16-2)
- lsl r5, #16
- ror r4, #16
- vmov d7, r5, r4 // 0x0014FFFB00010000
-
- sub r3, #8
- ldr r4, [sp, #8]
-
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
-
-w9_h_mc_luma_loop:
- vld1.u8 {d0,d1}, [r0], r1 //only use 14(9+5); q0=src[-2]
- pld [r0]
-
- vext.8 d2, d0, d1, #1 //d2=src[-1]
- vext.8 d3, d0, d1, #2 //d3=src[0]
- vext.8 d4, d0, d1, #3 //d4=src[1]
- vext.8 d5, d0, d1, #4 //d5=src[2]
- vext.8 d6, d0, d1, #5 //d6=src[3]
-
- FILTER_6TAG_8BITS d0, d2, d3, d4, d5, d6, d8, q14, q15
-
- sub r4, #1
- vst1.u8 {d8}, [r2]! //write [0:7] Byte
-
- vsli.64 d2, d1, #8 // [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X
- FILTER_SINGLE_TAG_8BITS d2, d7, d14, q7, q1
- vst1.u8 {d2[0]}, [r2], r3 //write 8th Byte
-
- cmp r4, #0
- bne w9_h_mc_luma_loop
- pop {r4-r5}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_h_neon
- push {r4, r5, r6}
- ldr r6, [sp, #12]
-
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
-
-w4_h_mc_luma_loop:
- vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5]
- pld [r0]
- vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5]
- pld [r0]
-
- vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6]
- vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6]
- vext.8 q3, q2, q2, #1 //src[0:6 *]
- vext.8 q4, q2, q2, #2 //src[1:6 * *]
-
- vtrn.32 q3, q4 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
- vtrn.32 d6, d7 //d6:[0:3]; d7[1:4]
- vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5]
- vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6]
-
- FILTER_6TAG_8BITS d0, d4, d6, d7, d2, d5, d1, q14, q15
-
- vmov r4, r5, d1
- str r4, [r2], r3
- str r5, [r2], r3
-
- sub r6, #2
- cmp r6, #0
- bne w4_h_mc_luma_loop
-
- pop {r4, r5, r6}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_xy_10_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
-
-w16_xy_10_mc_luma_loop:
- vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2]
- pld [r0]
- pld [r0, #16]
-
- vext.8 q2, q0, q1, #1 //q2=src[-1]
- vext.8 q3, q0, q1, #2 //q3=src[0]
- vext.8 q4, q0, q1, #3 //q4=src[1]
- vext.8 q5, q0, q1, #4 //q5=src[2]
- vext.8 q6, q0, q1, #5 //q6=src[3]
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d4, d6, d8, d10, d12, d2, q14, q15
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d5, d7, d9, d11, d13, d3, q14, q15
-
- sub r4, #1
- vst1.u8 {d2, d3}, [r2], r3 //write 16Byte
-
- cmp r4, #0
- bne w16_xy_10_mc_luma_loop
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w8_xy_10_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
-
-w8_xy_10_mc_luma_loop:
- vld1.u8 {d0,d1}, [r0], r1 //only use 13(8+5); q0=src[-2]
- pld [r0]
-
- vext.8 d2, d0, d1, #1 //d2=src[-1]
- vext.8 d3, d0, d1, #2 //d3=src[0]
- vext.8 d4, d0, d1, #3 //d4=src[1]
- vext.8 d5, d0, d1, #4 //d5=src[2]
- vext.8 d6, d0, d1, #5 //d6=src[3]
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d3, d4, d5, d6, d1, q14, q15
-
- sub r4, #1
- vst1.u8 {d1}, [r2], r3
-
- cmp r4, #0
- bne w8_xy_10_mc_luma_loop
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_xy_10_neon
- push {r4, r5, r6}
- ldr r6, [sp, #12]
-
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
-
-w4_xy_10_mc_luma_loop:
- vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5]
- pld [r0]
- vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5]
- pld [r0]
-
- vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6]
- vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6]
- vext.8 q3, q2, q2, #1 //src[0:6 *]
- vext.8 q4, q2, q2, #2 //src[1:6 * *]
-
- vtrn.32 q3, q4 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
- vtrn.32 d6, d7 //d6:[0:3]; d7[1:4]
- vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5]
- vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6]
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d4, d6, d7, d2, d5, d1, q14, q15
-
- vmov r4, r5, d1
- str r4, [r2], r3
- str r5, [r2], r3
-
- sub r6, #2
- cmp r6, #0
- bne w4_xy_10_mc_luma_loop
-
- pop {r4, r5, r6}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_xy_30_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
-
-w16_xy_30_mc_luma_loop:
- vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2]
- pld [r0]
- pld [r0, #16]
-
- vext.8 q2, q0, q1, #1 //q2=src[-1]
- vext.8 q3, q0, q1, #2 //q3=src[0]
- vext.8 q4, q0, q1, #3 //q4=src[1]
- vext.8 q5, q0, q1, #4 //q5=src[2]
- vext.8 q6, q0, q1, #5 //q6=src[3]
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d4, d6, d8, d10, d12, d2, q14, q15
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d5, d7, d9, d11, d13, d3, q14, q15
-
- sub r4, #1
- vst1.u8 {d2, d3}, [r2], r3 //write 16Byte
-
- cmp r4, #0
- bne w16_xy_30_mc_luma_loop
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w8_xy_30_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
-
-w8_xy_30_mc_luma_loop:
- vld1.u8 {d0,d1}, [r0], r1 //only use 13(8+5); q0=src[-2]
- pld [r0]
-
- vext.8 d2, d0, d1, #1 //d2=src[-1]
- vext.8 d3, d0, d1, #2 //d3=src[0]
- vext.8 d4, d0, d1, #3 //d4=src[1]
- vext.8 d5, d0, d1, #4 //d5=src[2]
- vext.8 d6, d0, d1, #5 //d6=src[3]
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d3, d4, d5, d6, d1, q14, q15
-
- sub r4, #1
- vst1.u8 {d1}, [r2], r3
-
- cmp r4, #0
- bne w8_xy_30_mc_luma_loop
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_xy_30_neon
- push {r4, r5, r6}
- ldr r6, [sp, #12]
-
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
-
-w4_xy_30_mc_luma_loop:
- vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5]
- pld [r0]
- vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5]
- pld [r0]
-
- vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6]
- vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6]
- vext.8 q3, q2, q2, #1 //src[0:6 *]
- vext.8 q4, q2, q2, #2 //src[1:6 * *]
-
- vtrn.32 q3, q4 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
- vtrn.32 d6, d7 //d6:[0:3]; d7[1:4]
- vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5]
- vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6]
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d4, d6, d7, d2, d5, d1, q14, q15
-
- vmov r4, r5, d1
- str r4, [r2], r3
- str r5, [r2], r3
-
- sub r6, #2
- cmp r6, #0
- bne w4_xy_30_mc_luma_loop
-
- pop {r4, r5, r6}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_xy_01_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {q0}, [r0], r1 //q0=src[-2]
- vld1.u8 {q1}, [r0], r1 //q1=src[-1]
-
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- vld1.u8 {q2}, [r0], r1 //q2=src[0]
- vld1.u8 {q3}, [r0], r1 //q3=src[1]
- vld1.u8 {q4}, [r0], r1 //q4=src[2]
-
-w16_xy_01_luma_loop:
-
- vld1.u8 {q5}, [r0], r1 //q5=src[3]
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d4, d6, d8, d10, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d3, d5, d7, d9, d11, d13, q14, q15
- vld1.u8 {q0}, [r0], r1 //read 2nd row
- vst1.u8 {q6}, [r2], r3 //write 1st 16Byte
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d4, d6, d8, d10, d0, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d5, d7, d9, d11, d1, d13, q14, q15
- vld1.u8 {q1}, [r0], r1 //read 3rd row
- vst1.u8 {q6}, [r2], r3 //write 2nd 16Byte
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d4, d6, d8, d10, d0, d2, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d5, d7, d9, d11, d1, d3, d13, q14, q15
- vld1.u8 {q2}, [r0], r1 //read 4th row
- vst1.u8 {q6}, [r2], r3 //write 3rd 16Byte
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d6, d8, d10, d0, d2, d4, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d7, d9, d11, d1, d3, d5, d13, q14, q15
- vld1.u8 {q3}, [r0], r1 //read 5th row
- vst1.u8 {q6}, [r2], r3 //write 4th 16Byte
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d8, d10, d0, d2, d4, d6, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d9, d11, d1, d3, d5, d7, d13, q14, q15
- vld1.u8 {q4}, [r0], r1 //read 6th row
- vst1.u8 {q6}, [r2], r3 //write 5th 16Byte
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d10, d0, d2, d4, d6, d8, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d11, d1, d3, d5, d7, d9, d13, q14, q15
- vld1.u8 {q5}, [r0], r1 //read 7th row
- vst1.u8 {q6}, [r2], r3 //write 6th 16Byte
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d4, d6, d8, d10, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d3, d5, d7, d9, d11, d13, q14, q15
- vld1.u8 {q0}, [r0], r1 //read 8th row
- vst1.u8 {q6}, [r2], r3 //write 7th 16Byte
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d4, d6, d8, d10, d0, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d5, d7, d9, d11, d1, d13, q14, q15
- vst1.u8 {q6}, [r2], r3 //write 8th 16Byte
-
- //q2, q3, q4, q5, q0 --> q0~q4
- vswp q0, q4
- vswp q0, q2
- vmov q1, q3
- vmov q3, q5 //q0~q4
-
- sub r4, #8
- cmp r4, #0
- bne w16_xy_01_luma_loop
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w8_xy_01_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {d0}, [r0], r1 //d0=src[-2]
- vld1.u8 {d1}, [r0], r1 //d1=src[-1]
-
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- vld1.u8 {d2}, [r0], r1 //d2=src[0]
- vld1.u8 {d3}, [r0], r1 //d3=src[1]
-
- vld1.u8 {d4}, [r0], r1 //d4=src[2]
- vld1.u8 {d5}, [r0], r1 //d5=src[3]
-
-w8_xy_01_mc_luma_loop:
-
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d1, d2, d3, d4, d5, d12, q14, q15
- vld1.u8 {d0}, [r0], r1 //read 2nd row
- vst1.u8 {d12}, [r2], r3 //write 1st 8Byte
-
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d2, d3, d4, d5, d0, d12, q14, q15
- vld1.u8 {d1}, [r0], r1 //read 3rd row
- vst1.u8 {d12}, [r2], r3 //write 2nd 8Byte
-
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d3, d4, d5, d0, d1, d12, q14, q15
- vld1.u8 {d2}, [r0], r1 //read 4th row
- vst1.u8 {d12}, [r2], r3 //write 3rd 8Byte
-
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d4, d5, d0, d1, d2, d12, q14, q15
- vld1.u8 {d3}, [r0], r1 //read 5th row
- vst1.u8 {d12}, [r2], r3 //write 4th 8Byte
-
- //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
- vswp q0, q2
- vswp q1, q2
-
- sub r4, #4
- cmp r4, #0
- bne w8_xy_01_mc_luma_loop
-
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_xy_01_neon
- push {r4, r5, r6, r7}
- sub r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- ldr r4, [r0], r1 //r4=src[-2]
- ldr r5, [r0], r1 //r5=src[-1]
-
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- ldr r6, [r0], r1 //r6=src[0]
- ldr r7, [r0], r1 //r7=src[1]
-
- vmov d0, r4, r5
- vmov d1, r5, r6
- vmov d2, r6, r7
-
- ldr r4, [r0], r1 //r4=src[2]
- vmov d3, r7, r4
- ldr r7, [sp, #16]
-
-w4_xy_01_mc_luma_loop:
-
- //using reserving r4
- ldr r5, [r0], r1 //r5=src[3]
- ldr r6, [r0], r1 //r6=src[0]
- vmov d4, r4, r5
- vmov d5, r5, r6 //reserved r6
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d1, d2, d3, d4, d5, d12, q14, q15
- vmov r4, r5, d12
- str r4, [r2], r3 //write 1st 4Byte
- str r5, [r2], r3 //write 2nd 4Byte
-
- ldr r5, [r0], r1 //r5=src[1]
- ldr r4, [r0], r1 //r4=src[2]
- vmov d0, r6, r5
- vmov d1, r5, r4 //reserved r4
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d3, d4, d5, d0, d1, d12, q14, q15
- vmov r5, r6, d12
- str r5, [r2], r3 //write 3rd 4Byte
- str r6, [r2], r3 //write 4th 4Byte
-
- //d4, d5, d0, d1 --> d0, d1, d2, d3
- vmov q1, q0
- vmov q0, q2
-
- sub r7, #4
- cmp r7, #0
- bne w4_xy_01_mc_luma_loop
-
- pop {r4, r5, r6, r7}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_xy_03_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {q0}, [r0], r1 //q0=src[-2]
- vld1.u8 {q1}, [r0], r1 //q1=src[-1]
-
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- vld1.u8 {q2}, [r0], r1 //q2=src[0]
- vld1.u8 {q3}, [r0], r1 //q3=src[1]
- vld1.u8 {q4}, [r0], r1 //q4=src[2]
-
-w16_xy_03_luma_loop:
-
- vld1.u8 {q5}, [r0], r1 //q5=src[3]
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d4, d6, d8, d10, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d3, d5, d7, d9, d11, d13, q14, q15
- vld1.u8 {q0}, [r0], r1 //read 2nd row
- vst1.u8 {q6}, [r2], r3 //write 1st 16Byte
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d4, d6, d8, d10, d0, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d5, d7, d9, d11, d1, d13, q14, q15
- vld1.u8 {q1}, [r0], r1 //read 3rd row
- vst1.u8 {q6}, [r2], r3 //write 2nd 16Byte
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d4, d6, d8, d10, d0, d2, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d5, d7, d9, d11, d1, d3, d13, q14, q15
- vld1.u8 {q2}, [r0], r1 //read 4th row
- vst1.u8 {q6}, [r2], r3 //write 3rd 16Byte
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d6, d8, d10, d0, d2, d4, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d7, d9, d11, d1, d3, d5, d13, q14, q15
- vld1.u8 {q3}, [r0], r1 //read 5th row
- vst1.u8 {q6}, [r2], r3 //write 4th 16Byte
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d8, d10, d0, d2, d4, d6, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d9, d11, d1, d3, d5, d7, d13, q14, q15
- vld1.u8 {q4}, [r0], r1 //read 6th row
- vst1.u8 {q6}, [r2], r3 //write 5th 16Byte
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d10, d0, d2, d4, d6, d8, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d11, d1, d3, d5, d7, d9, d13, q14, q15
- vld1.u8 {q5}, [r0], r1 //read 7th row
- vst1.u8 {q6}, [r2], r3 //write 6th 16Byte
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d4, d6, d8, d10, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d3, d5, d7, d9, d11, d13, q14, q15
- vld1.u8 {q0}, [r0], r1 //read 8th row
- vst1.u8 {q6}, [r2], r3 //write 7th 16Byte
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d4, d6, d8, d10, d0, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d5, d7, d9, d11, d1, d13, q14, q15
- vst1.u8 {q6}, [r2], r3 //write 8th 16Byte
-
- //q2, q3, q4, q5, q0 --> q0~q4
- vswp q0, q4
- vswp q0, q2
- vmov q1, q3
- vmov q3, q5 //q0~q4
-
- sub r4, #8
- cmp r4, #0
- bne w16_xy_03_luma_loop
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w8_xy_03_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {d0}, [r0], r1 //d0=src[-2]
- vld1.u8 {d1}, [r0], r1 //d1=src[-1]
-
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- vld1.u8 {d2}, [r0], r1 //d2=src[0]
- vld1.u8 {d3}, [r0], r1 //d3=src[1]
-
- vld1.u8 {d4}, [r0], r1 //d4=src[2]
- vld1.u8 {d5}, [r0], r1 //d5=src[3]
-
-w8_xy_03_mc_luma_loop:
-
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d1, d2, d3, d4, d5, d12, q14, q15
- vld1.u8 {d0}, [r0], r1 //read 2nd row
- vst1.u8 {d12}, [r2], r3 //write 1st 8Byte
-
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d2, d3, d4, d5, d0, d12, q14, q15
- vld1.u8 {d1}, [r0], r1 //read 3rd row
- vst1.u8 {d12}, [r2], r3 //write 2nd 8Byte
-
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d3, d4, d5, d0, d1, d12, q14, q15
- vld1.u8 {d2}, [r0], r1 //read 4th row
- vst1.u8 {d12}, [r2], r3 //write 3rd 8Byte
-
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d4, d5, d0, d1, d2, d12, q14, q15
- vld1.u8 {d3}, [r0], r1 //read 5th row
- vst1.u8 {d12}, [r2], r3 //write 4th 8Byte
-
- //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
- vswp q0, q2
- vswp q1, q2
-
- sub r4, #4
- cmp r4, #0
- bne w8_xy_03_mc_luma_loop
-
- pop {r4}
- WELS_ASM_FUNC_END
-
- WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_xy_03_neon
- push {r4, r5, r6, r7}
- sub r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- ldr r4, [r0], r1 //r4=src[-2]
- ldr r5, [r0], r1 //r5=src[-1]
-
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- ldr r6, [r0], r1 //r6=src[0]
- ldr r7, [r0], r1 //r7=src[1]
-
- vmov d0, r4, r5
- vmov d1, r5, r6
- vmov d2, r6, r7
-
- ldr r4, [r0], r1 //r4=src[2]
- vmov d3, r7, r4
- ldr r7, [sp, #16]
-
-w4_xy_03_mc_luma_loop:
-
- //using reserving r4
- ldr r5, [r0], r1 //r5=src[3]
- ldr r6, [r0], r1 //r6=src[0]
- vmov d4, r4, r5
- vmov d5, r5, r6 //reserved r6
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d1, d2, d3, d4, d5, d12, q14, q15
- vmov r4, r5, d12
- str r4, [r2], r3 //write 1st 4Byte
- str r5, [r2], r3 //write 2nd 4Byte
-
- ldr r5, [r0], r1 //r5=src[1]
- ldr r4, [r0], r1 //r4=src[2]
- vmov d0, r6, r5
- vmov d1, r5, r4 //reserved r4
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d3, d4, d5, d0, d1, d12, q14, q15
- vmov r5, r6, d12
- str r5, [r2], r3 //write 3rd 4Byte
- str r6, [r2], r3 //write 4th 4Byte
-
- //d4, d5, d0, d1 --> d0, d1, d2, d3
- vmov q1, q0
- vmov q0, q2
-
- sub r7, #4
- cmp r7, #0
- bne w4_xy_03_mc_luma_loop
-
- pop {r4, r5, r6, r7}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_v_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {q0}, [r0], r1 //q0=src[-2]
- vld1.u8 {q1}, [r0], r1 //q1=src[-1]
-
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- vld1.u8 {q2}, [r0], r1 //q2=src[0]
- vld1.u8 {q3}, [r0], r1 //q3=src[1]
- vld1.u8 {q4}, [r0], r1 //q4=src[2]
-
-w16_v_mc_luma_loop:
-
- vld1.u8 {q5}, [r0], r1 //q5=src[3]
-
- FILTER_6TAG_8BITS d0, d2, d4, d6, d8, d10, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d1, d3, d5, d7, d9, d11, d13, q14, q15
- vld1.u8 {q0}, [r0], r1 //read 2nd row
- vst1.u8 {q6}, [r2], r3 //write 1st 16Byte
-
- FILTER_6TAG_8BITS d2, d4, d6, d8, d10, d0, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d3, d5, d7, d9, d11, d1, d13, q14, q15
- vld1.u8 {q1}, [r0], r1 //read 3rd row
- vst1.u8 {q6}, [r2], r3 //write 2nd 16Byte
-
- FILTER_6TAG_8BITS d4, d6, d8, d10, d0, d2, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d5, d7, d9, d11, d1, d3, d13, q14, q15
- vld1.u8 {q2}, [r0], r1 //read 4th row
- vst1.u8 {q6}, [r2], r3 //write 3rd 16Byte
-
- FILTER_6TAG_8BITS d6, d8, d10, d0, d2, d4, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d7, d9, d11, d1, d3, d5, d13, q14, q15
- vld1.u8 {q3}, [r0], r1 //read 5th row
- vst1.u8 {q6}, [r2], r3 //write 4th 16Byte
-
- FILTER_6TAG_8BITS d8, d10, d0, d2, d4, d6, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d9, d11, d1, d3, d5, d7, d13, q14, q15
- vld1.u8 {q4}, [r0], r1 //read 6th row
- vst1.u8 {q6}, [r2], r3 //write 5th 16Byte
-
- FILTER_6TAG_8BITS d10, d0, d2, d4, d6, d8, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d11, d1, d3, d5, d7, d9, d13, q14, q15
- vld1.u8 {q5}, [r0], r1 //read 7th row
- vst1.u8 {q6}, [r2], r3 //write 6th 16Byte
-
- FILTER_6TAG_8BITS d0, d2, d4, d6, d8, d10, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d1, d3, d5, d7, d9, d11, d13, q14, q15
- vld1.u8 {q0}, [r0], r1 //read 8th row
- vst1.u8 {q6}, [r2], r3 //write 7th 16Byte
-
- FILTER_6TAG_8BITS d2, d4, d6, d8, d10, d0, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d3, d5, d7, d9, d11, d1, d13, q14, q15
- vst1.u8 {q6}, [r2], r3 //write 8th 16Byte
-
- //q2, q3, q4, q5, q0 --> q0~q4
- vswp q0, q4
- vswp q0, q2
- vmov q1, q3
- vmov q3, q5 //q0~q4
-
- sub r4, #8
- cmp r4, #0
- bne w16_v_mc_luma_loop
- pop {r4}
- WELS_ASM_FUNC_END
-
- WELS_ASM_FUNC_BEGIN enc_mc_luma_w17_v_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {q0}, [r0], r1 //q0=src[-2]
- vld1.u8 {q1}, [r0], r1 //q1=src[-1]
-
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- vld1.u8 {q2}, [r0], r1 //q2=src[0]
- vld1.u8 {q3}, [r0], r1 //q3=src[1]
- vld1.u8 {q4}, [r0], r1 //q4=src[2]
-
-w17_v_mc_luma_loop:
-
- vld1.u8 {q5}, [r0], r1 //q5=src[3]
-
- FILTER_6TAG_8BITS d0, d2, d4, d6, d8, d10, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d1, d3, d5, d7, d9, d11, d13, q14, q15
- vld1.u8 {q0}, [r0], r1 //read 2nd row
- vst1.u8 {q6}, [r2], r3 //write 1st 16Byte
-
- FILTER_6TAG_8BITS d2, d4, d6, d8, d10, d0, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d3, d5, d7, d9, d11, d1, d13, q14, q15
- vld1.u8 {q1}, [r0], r1 //read 3rd row
- vst1.u8 {q6}, [r2], r3 //write 2nd 16Byte
-
- FILTER_6TAG_8BITS d4, d6, d8, d10, d0, d2, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d5, d7, d9, d11, d1, d3, d13, q14, q15
- vld1.u8 {q2}, [r0], r1 //read 4th row
- vst1.u8 {q6}, [r2], r3 //write 3rd 16Byte
-
- FILTER_6TAG_8BITS d6, d8, d10, d0, d2, d4, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d7, d9, d11, d1, d3, d5, d13, q14, q15
- vld1.u8 {q3}, [r0], r1 //read 5th row
- vst1.u8 {q6}, [r2], r3 //write 4th 16Byte
-
- FILTER_6TAG_8BITS d8, d10, d0, d2, d4, d6, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d9, d11, d1, d3, d5, d7, d13, q14, q15
- vld1.u8 {q4}, [r0], r1 //read 6th row
- vst1.u8 {q6}, [r2], r3 //write 5th 16Byte
-
- FILTER_6TAG_8BITS d10, d0, d2, d4, d6, d8, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d11, d1, d3, d5, d7, d9, d13, q14, q15
- vld1.u8 {q5}, [r0], r1 //read 7th row
- vst1.u8 {q6}, [r2], r3 //write 6th 16Byte
-
- FILTER_6TAG_8BITS d0, d2, d4, d6, d8, d10, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d1, d3, d5, d7, d9, d11, d13, q14, q15
- vld1.u8 {q0}, [r0], r1 //read 8th row
- vst1.u8 {q6}, [r2], r3 //write 7th 16Byte
-
- FILTER_6TAG_8BITS d2, d4, d6, d8, d10, d0, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d3, d5, d7, d9, d11, d1, d13, q14, q15
- vst1.u8 {q6}, [r2], r3 //write 8th 16Byte
-
- //q2, q3, q4, q5, q0 --> q0~q4
- vswp q0, q4
- vswp q0, q2
- vmov q1, q3
- vmov q3, q5 //q0~q4
-
- sub r4, #8
- cmp r4, #1
- bne w17_v_mc_luma_loop
- // the last 16Bytes
- vld1.u8 {q5}, [r0], r1 //q5=src[3]
- FILTER_6TAG_8BITS d0, d2, d4, d6, d8, d10, d12, q14, q15
- FILTER_6TAG_8BITS d1, d3, d5, d7, d9, d11, d13, q14, q15
- vst1.u8 {q6}, [r2], r3 //write 1st 16Byte
-
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w9_v_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {d0}, [r0], r1 //d0=src[-2]
- vld1.u8 {d1}, [r0], r1 //d1=src[-1]
-
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- vld1.u8 {d2}, [r0], r1 //d2=src[0]
- vld1.u8 {d3}, [r0], r1 //d3=src[1]
-
- vld1.u8 {d4}, [r0], r1 //d4=src[2]
- vld1.u8 {d5}, [r0], r1 //d5=src[3]
-
-w9_v_mc_luma_loop:
-
- pld [r0]
- FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d12, q14, q15
- vld1.u8 {d0}, [r0], r1 //read 2nd row
- vst1.u8 {d12}, [r2], r3 //write 1st 8Byte
-
- pld [r0]
- FILTER_6TAG_8BITS d1, d2, d3, d4, d5, d0, d12, q14, q15
- vld1.u8 {d1}, [r0], r1 //read 3rd row
- vst1.u8 {d12}, [r2], r3 //write 2nd 8Byte
-
- pld [r0]
- FILTER_6TAG_8BITS d2, d3, d4, d5, d0, d1, d12, q14, q15
- vld1.u8 {d2}, [r0], r1 //read 4th row
- vst1.u8 {d12}, [r2], r3 //write 3rd 8Byte
-
- pld [r0]
- FILTER_6TAG_8BITS d3, d4, d5, d0, d1, d2, d12, q14, q15
- vld1.u8 {d3}, [r0], r1 //read 5th row
- vst1.u8 {d12}, [r2], r3 //write 4th 8Byte
-
- //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
- vswp q0, q2
- vswp q1, q2
-
- sub r4, #4
- cmp r4, #1
- bne w9_v_mc_luma_loop
-
- FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d12, q14, q15
- vst1.u8 {d12}, [r2], r3 //write last 8Byte
-
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_v_neon
- push {r4, r5, r6, r7}
- sub r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- ldr r4, [r0], r1 //r4=src[-2]
- ldr r5, [r0], r1 //r5=src[-1]
-
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- ldr r6, [r0], r1 //r6=src[0]
- ldr r7, [r0], r1 //r7=src[1]
-
- vmov d0, r4, r5
- vmov d1, r5, r6
- vmov d2, r6, r7
-
- ldr r4, [r0], r1 //r4=src[2]
- vmov d3, r7, r4
- ldr r7, [sp, #16]
-
-w4_v_mc_luma_loop:
-
-// pld [r0]
- //using reserving r4
- ldr r5, [r0], r1 //r5=src[3]
- ldr r6, [r0], r1 //r6=src[0]
- vmov d4, r4, r5
- vmov d5, r5, r6 //reserved r6
-
- FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d12, q14, q15
- vmov r4, r5, d12
- str r4, [r2], r3 //write 1st 4Byte
- str r5, [r2], r3 //write 2nd 4Byte
-
- ldr r5, [r0], r1 //r5=src[1]
- ldr r4, [r0], r1 //r4=src[2]
- vmov d0, r6, r5
- vmov d1, r5, r4 //reserved r4
-
- FILTER_6TAG_8BITS d2, d3, d4, d5, d0, d1, d12, q14, q15
- vmov r5, r6, d12
- str r5, [r2], r3 //write 3rd 4Byte
- str r6, [r2], r3 //write 4th 4Byte
-
- //d4, d5, d0, d1 --> d0, d1, d2, d3
- vmov q1, q0
- vmov q0, q2
-
- sub r7, #4
- cmp r7, #0
- bne w4_v_mc_luma_loop
-
- pop {r4, r5, r6, r7}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_hv_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, #2 //src[-2]
- sub r0, r1, lsl #1 //src[-2*src_stride-2]
- pld [r0]
- pld [r0, r1]
-
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {d0-d2}, [r0], r1 //use 21(16+5), =src[-2]
- vld1.u8 {d3-d5}, [r0], r1 //use 21(16+5), =src[-1]
-
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
-
- vld1.u8 {d6-d8}, [r0], r1 //use 21(16+5), =src[0]
- vld1.u8 {d9-d11}, [r0], r1 //use 21(16+5), =src[1]
- pld [r0]
- pld [r0, r1]
- vld1.u8 {d12-d14}, [r0], r1 //use 21(16+5), =src[2]
-
-w16_hv_mc_luma_loop:
-
- vld1.u8 {d15-d17}, [r0], r1 //use 21(16+5), =src[3]
- //the 1st row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0]
- // vertical filtered into q10/q11
- FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 5 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1]
- vst1.u8 {q0}, [r2], r3 //write 16Byte
-
-
- vld1.u8 {d0-d2}, [r0], r1 //read 2nd row
- //the 2nd row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d3, d6, d9, d12, d15, d0, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d4, d7,d10, d13, d16, d1,q10, q14, q15 // 8 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3 //output to d3
- // vertical filtered into q10/q11
- FILTER_6TAG_8BITS_TO_16BITS d5, d8,d11, d14, d17, d2,q11, q14, q15 // only 5 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4 //output to d4
- vst1.u8 {d3, d4}, [r2], r3 //write 16Byte
-
- vld1.u8 {d3-d5}, [r0], r1 //read 3rd row
- //the 3rd row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d6, d9, d12, d15, d0, d3, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d7,d10, d13, d16, d1, d4,q10, q14, q15 // 8 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6 //output to d6
- // vertical filtered into q10/q11
- FILTER_6TAG_8BITS_TO_16BITS d8,d11, d14, d17, d2, d5,q11, q14, q15 // only 5 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7 //output to d7
- vst1.u8 {d6, d7}, [r2], r3 //write 16Byte
-
- vld1.u8 {d6-d8}, [r0], r1 //read 4th row
- //the 4th row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d9, d12, d15, d0, d3, d6, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d10, d13, d16, d1, d4, d7,q10, q14, q15 // 8 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9 //output to d9
- // vertical filtered into q10/q11
- FILTER_6TAG_8BITS_TO_16BITS d11, d14, d17, d2, d5, d8,q11, q14, q15 // only 5 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10 //output to d10
- vst1.u8 {d9, d10}, [r2], r3 //write 16Byte
-
- //d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14
- vswp q0, q6
- vswp q6, q3
- vmov q5, q2
- vmov q2, q8
-
- vmov d20,d8
- vmov q4, q1
- vmov q1, q7
- vmov d14,d20
-
- sub r4, #4
- cmp r4, #0
- bne w16_hv_mc_luma_loop
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w17_hv_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, #2 //src[-2]
- sub r0, r1, lsl #1 //src[-2*src_stride-2]
- pld [r0]
- pld [r0, r1]
-
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {d0-d2}, [r0], r1 //use 21(17+5), =src[-2]
- vld1.u8 {d3-d5}, [r0], r1 //use 21(17+5), =src[-1]
-
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
-
- vld1.u8 {d6-d8}, [r0], r1 //use 21(17+5), =src[0]
- vld1.u8 {d9-d11}, [r0], r1 //use 21(17+5), =src[1]
- pld [r0]
- pld [r0, r1]
- vld1.u8 {d12-d14}, [r0], r1 //use 21(17+5), =src[2]
- sub r3, #16
-
-w17_hv_mc_luma_loop:
-
- vld1.u8 {d15-d17}, [r0], r1 //use 21(17+5), =src[3]
- //the 1st row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0]
- // vertical filtered into q10/q11
- FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 6 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1]
- vst1.u8 {d0, d1}, [r2]! //write 16Byte
- UNPACK_1_IN_8x16BITS_TO_8BITS d2, d22, d23, q11 //output to d2[0]
- vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte
-
- vld1.u8 {d0-d2}, [r0], r1 //read 2nd row
- //the 2nd row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d3, d6, d9, d12, d15, d0, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d4, d7,d10, d13, d16, d1,q10, q14, q15 // 8 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3 //output to d3
- // vertical filtered into q10/q11
- FILTER_6TAG_8BITS_TO_16BITS d5, d8,d11, d14, d17, d2,q11, q14, q15 // only 6 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4 //output to d4
- vst1.u8 {d3, d4}, [r2]! //write 16Byte
- UNPACK_1_IN_8x16BITS_TO_8BITS d5, d22, d23, q11 //output to d5[0]
- vst1.u8 {d5[0]}, [r2], r3 //write 16th Byte
-
- vld1.u8 {d3-d5}, [r0], r1 //read 3rd row
- //the 3rd row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d6, d9, d12, d15, d0, d3, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d7,d10, d13, d16, d1, d4,q10, q14, q15 // 8 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6 //output to d6
- // vertical filtered into q10/q11
- FILTER_6TAG_8BITS_TO_16BITS d8,d11, d14, d17, d2, d5,q11, q14, q15 // only 6 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7 //output to d7
- vst1.u8 {d6, d7}, [r2]! //write 16Byte
- UNPACK_1_IN_8x16BITS_TO_8BITS d8, d22, d23, q11 //output to d8[0]
- vst1.u8 {d8[0]}, [r2], r3 //write 16th Byte
-
- vld1.u8 {d6-d8}, [r0], r1 //read 4th row
- //the 4th row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d9, d12, d15, d0, d3, d6, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d10, d13, d16, d1, d4, d7,q10, q14, q15 // 8 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9 //output to d9
- // vertical filtered into q10/q11
- FILTER_6TAG_8BITS_TO_16BITS d11, d14, d17, d2, d5, d8,q11, q14, q15 // only 6 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10 //output to d10
- vst1.u8 {d9, d10}, [r2], r3 //write 16Byte
- UNPACK_1_IN_8x16BITS_TO_8BITS d11, d22, d23, q11 //output to d11[0]
- vst1.u8 {d11[0]}, [r2], r3 //write 16th Byte
-
- //d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14
- vswp q0, q6
- vswp q6, q3
- vmov q5, q2
- vmov q2, q8
-
- vmov d20,d8
- vmov q4, q1
- vmov q1, q7
- vmov d14,d20
-
- sub r4, #4
- cmp r4, #1
- bne w17_hv_mc_luma_loop
- //the last row
- vld1.u8 {d15-d17}, [r0], r1 //use 21(17+5), =src[3]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0]
- // vertical filtered into q10/q11
- FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 6 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1]
- vst1.u8 {q0}, [r2]! //write 16Byte
- UNPACK_1_IN_8x16BITS_TO_8BITS d2, d22, d23, q11 //output to d2[0]
- vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte
-
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w9_hv_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, #2 //src[-2]
- sub r0, r1, lsl #1 //src[-2*src_stride-2]
- pld [r0]
- pld [r0, r1]
-
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {q0}, [r0], r1 //use 14(9+5), =src[-2]
- vld1.u8 {q1}, [r0], r1 //use 14(9+5), =src[-1]
-
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
-
- vld1.u8 {q2}, [r0], r1 //use 14(9+5), =src[0]
- vld1.u8 {q3}, [r0], r1 //use 14(9+5), =src[1]
- pld [r0]
- pld [r0, r1]
- vld1.u8 {q4}, [r0], r1 //use 14(9+5), =src[2]
- sub r3, #8
-
-w9_hv_mc_luma_loop:
-
- vld1.u8 {q5}, [r0], r1 //use 14(9+5), =src[3]
- //the 1st row
- pld [r0]
- // vertical filtered into q6/q7
- FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d10, q6, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d11, q7, q14, q15 // 6 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0]
- vst1.u8 d12, [r2]! //write 8Byte
- UNPACK_1_IN_8x16BITS_TO_8BITS d13, d14, d15, q7 //output to d13[0]
- vst1.u8 {d13[0]}, [r2], r3 //write 8th Byte
-
- vld1.u8 {q0}, [r0], r1 //read 2nd row
- //the 2nd row
- pld [r0]
- // vertical filtered into q6/q7
- FILTER_6TAG_8BITS_TO_16BITS d2, d4, d6, d8, d10, d0, q6, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d3, d5, d7, d9, d11, d1, q7, q14, q15 // 6 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0]
- vst1.u8 d12, [r2]! //write 8Byte
- UNPACK_1_IN_8x16BITS_TO_8BITS d13, d14, d15, q7 //output to d13[0]
- vst1.u8 {d13[0]}, [r2], r3 //write 8th Byte
-
- vld1.u8 {q1}, [r0], r1 //read 3rd row
- //the 3rd row
- pld [r0]
- // vertical filtered into q6/q7
- FILTER_6TAG_8BITS_TO_16BITS d4, d6, d8, d10, d0, d2, q6, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d5, d7, d9, d11, d1, d3, q7, q14, q15 // 6 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0]
- vst1.u8 d12, [r2]! //write 8Byte
- UNPACK_1_IN_8x16BITS_TO_8BITS d13, d14, d15, q7 //output to d13[0]
- vst1.u8 {d13[0]}, [r2], r3 //write 8th Byte
-
- vld1.u8 {q2}, [r0], r1 //read 4th row
- //the 4th row
- pld [r0]
- // vertical filtered into q6/q7
- FILTER_6TAG_8BITS_TO_16BITS d6, d8, d10, d0, d2, d4, q6, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d7, d9, d11, d1, d3, d5, q7, q14, q15 // 6 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0]
- vst1.u8 d12, [r2]! //write 8Byte
- UNPACK_1_IN_8x16BITS_TO_8BITS d13, d14, d15, q7 //output to d13[0]
- vst1.u8 {d13[0]}, [r2], r3 //write 8th Byte
-
- //q4~q5, q0~q2, --> q0~q4
- vswp q0, q4
- vswp q2, q4
- vmov q3, q1
- vmov q1, q5
-
- sub r4, #4
- cmp r4, #1
- bne w9_hv_mc_luma_loop
- //the last row
- vld1.u8 {q5}, [r0], r1 //use 14(9+5), =src[3]
- // vertical filtered into q6/q7
- FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d10, q6, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d11, q7, q14, q15 // 6 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0]
- vst1.u8 d12, [r2]! //write 8Byte
- UNPACK_1_IN_8x16BITS_TO_8BITS d13, d14, d15, q7 //output to d13[0]
- vst1.u8 {d13[0]}, [r2], r3 //write 8th Byte
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_hv_neon
- push {r4 ,r5, r6}
- ldr r6, [sp, #12]
-
- sub r0, #2 //src[-2]
- sub r0, r1, lsl #1 //src[-2*src_stride-2]
- pld [r0]
- pld [r0, r1]
-
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {q0}, [r0], r1 //use 9(4+5), =src[-2]
- vld1.u8 {q1}, [r0], r1 //use 9(4+5), =src[-1]
-
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
-
- vld1.u8 {q2}, [r0], r1 //use 9(4+5), =src[0]
- vld1.u8 {q3}, [r0], r1 //use 9(4+5), =src[1]
- pld [r0]
- pld [r0, r1]
- vld1.u8 {q4}, [r0], r1 //use 9(4+5), =src[2]
-
-w4_hv_mc_luma_loop:
-
- vld1.u8 {q5}, [r0], r1 //use 9(4+5), =src[3]
- vld1.u8 {q6}, [r0], r1 //use 9(4+5), =src[4]
-
- //the 1st&2nd row
- pld [r0]
- pld [r0, r1]
- // vertical filtered
- FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d10, q7, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d11, q8, q14, q15 // 1 avail
-
- FILTER_6TAG_8BITS_TO_16BITS d2, d4, d6, d8,d10, d12, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d3, d5, d7, d9,d11, d13,q10, q14, q15 // 1 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q7, q8, q11, q12, q13 //4 avail
- UNPACK_2_16BITS_TO_ABC q9,q10, q0, q7, q8 //4 avail
-
- vmov d23, d0
- vmov d25, d14
- vmov d27, d16
-
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22 //output to q11[0]
- vmov r4 ,r5, d22
- str r4, [r2], r3 //write 4Byte
- str r5, [r2], r3 //write 4Byte
-
- //the 3rd&4th row
- vld1.u8 {q0}, [r0], r1 //use 9(4+5), =src[3]
- vld1.u8 {q1}, [r0], r1 //use 9(4+5), =src[4]
- pld [r0]
- pld [r0, r1]
- // vertical filtered
- FILTER_6TAG_8BITS_TO_16BITS d4, d6, d8, d10, d12, d0, q7, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d5, d7, d9, d11, d13, d1, q8, q14, q15 // 1 avail
-
- FILTER_6TAG_8BITS_TO_16BITS d6, d8,d10, d12, d0, d2, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d7, d9,d11, d13, d1, d3,q10, q14, q15 // 1 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q7, q8, q11, q12, q13 //4 avail
- UNPACK_2_16BITS_TO_ABC q9,q10, q2, q7, q8 //4 avail
-
- vmov d23, d4
- vmov d25, d14
- vmov d27, d16
-
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22 //output to q11[0]
- vmov r4 ,r5, d22
- str r4, [r2], r3 //write 4Byte
- str r5, [r2], r3 //write 4Byte
-
- //q4~q6, q0~q1, --> q0~q4
- vswp q4, q0
- vmov q3, q4
- vmov q4, q1
- vmov q1, q5
- vmov q2, q6
-
- sub r6, #4
- cmp r6, #0
- bne w4_hv_mc_luma_loop
-
- pop {r4, r5, r6}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_copy_w16_neon
- push {r4}
- ldr r4, [sp, #4]
-w16_copy_loop:
- vld1.u8 {q0}, [r0], r1
- vld1.u8 {q1}, [r0], r1
- vst1.u8 {q0}, [r2], r3
- vst1.u8 {q1}, [r2], r3
- sub r4, #2
- cmp r4, #0
- bne w16_copy_loop
-
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_copy_w8_neon
- push {r4}
- ldr r4, [sp, #4]
-w8_copy_loop:
- vld1.u8 {d0}, [r0], r1
- vld1.u8 {d1}, [r0], r1
- vst1.u8 {d0}, [r2], r3
- vst1.u8 {d1}, [r2], r3
- sub r4, #2
- cmp r4, #0
- bne w8_copy_loop
-
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_copy_w4_neon
- push {r4, r5, r6}
- ldr r4, [sp, #12]
-w4_copy_loop:
- ldr r5, [r0], r1
- ldr r6, [r0], r1
- str r5, [r2], r3
- str r6, [r2], r3
-
- sub r4, #2
- cmp r4, #0
- bne w4_copy_loop
-
- pop {r4, r5, r6}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_pixel_avg_w16_neon
- push {r4}
- ldr r4, [sp, #4]
-w16_pix_avg_loop:
- vld1.u8 {q0}, [r2]!
- vld1.u8 {q1}, [r3]!
- vld1.u8 {q2}, [r2]!
- vld1.u8 {q3}, [r3]!
-
- vld1.u8 {q4}, [r2]!
- vld1.u8 {q5}, [r3]!
- vld1.u8 {q6}, [r2]!
- vld1.u8 {q7}, [r3]!
-
- AVERAGE_TWO_8BITS d0, d0, d2
- AVERAGE_TWO_8BITS d1, d1, d3
- vst1.u8 {q0}, [r0], r1
-
- AVERAGE_TWO_8BITS d4, d4, d6
- AVERAGE_TWO_8BITS d5, d5, d7
- vst1.u8 {q2}, [r0], r1
-
- AVERAGE_TWO_8BITS d8, d8, d10
- AVERAGE_TWO_8BITS d9, d9, d11
- vst1.u8 {q4}, [r0], r1
-
- AVERAGE_TWO_8BITS d12, d12, d14
- AVERAGE_TWO_8BITS d13, d13, d15
- vst1.u8 {q6}, [r0], r1
-
- sub r4, #4
- cmp r4, #0
- bne w16_pix_avg_loop
-
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_pix_avg_w16_neon
- push {r4, r5, r6}
- ldr r4, [sp, #12]
- ldr r5, [sp, #16]
- ldr r6, [sp, #20]
-
-enc_w16_pix_avg_loop:
- vld1.u8 {q0}, [r2], r3
- vld1.u8 {q1}, [r4], r5
- vld1.u8 {q2}, [r2], r3
- vld1.u8 {q3}, [r4], r5
-
- vld1.u8 {q4}, [r2], r3
- vld1.u8 {q5}, [r4], r5
- vld1.u8 {q6}, [r2], r3
- vld1.u8 {q7}, [r4], r5
-
- AVERAGE_TWO_8BITS d0, d0, d2
- AVERAGE_TWO_8BITS d1, d1, d3
- vst1.u8 {q0}, [r0], r1
-
- AVERAGE_TWO_8BITS d4, d4, d6
- AVERAGE_TWO_8BITS d5, d5, d7
- vst1.u8 {q2}, [r0], r1
-
- AVERAGE_TWO_8BITS d8, d8, d10
- AVERAGE_TWO_8BITS d9, d9, d11
- vst1.u8 {q4}, [r0], r1
-
- AVERAGE_TWO_8BITS d12, d12, d14
- AVERAGE_TWO_8BITS d13, d13, d15
- vst1.u8 {q6}, [r0], r1
-
- sub r6, #4
- cmp r6, #0
- bne enc_w16_pix_avg_loop
-
- pop {r4, r5, r6}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_pix_avg_w8_neon
- push {r4, r5, r6}
- ldr r4, [sp, #12]
- ldr r5, [sp, #16]
- ldr r6, [sp, #20]
-enc_w8_pix_avg_loop:
-
- vld1.u8 {d0}, [r2], r3
- vld1.u8 {d2}, [r4], r5
- vld1.u8 {d1}, [r2], r3
- vld1.u8 {d3}, [r4], r5
-
- AVERAGE_TWO_8BITS d0, d0, d2
- AVERAGE_TWO_8BITS d1, d1, d3
- vst1.u8 {d0}, [r0], r1
- vst1.u8 {d1}, [r0], r1
-
- vld1.u8 {d4}, [r2], r3
- vld1.u8 {d6}, [r4], r5
- vld1.u8 {d5}, [r2], r3
- vld1.u8 {d7}, [r4], r5
-
- AVERAGE_TWO_8BITS d4, d4, d6
- AVERAGE_TWO_8BITS d5, d5, d7
- vst1.u8 {d4}, [r0], r1
- vst1.u8 {d5}, [r0], r1
-
- sub r6, #4
- cmp r6, #0
- bne enc_w8_pix_avg_loop
-
- pop {r4, r5, r6}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_chroma_w8_neon
-
- push {r4, r5}
- ldr r4, [sp, #8]
- ldr r5, [sp, #12]
- vld1.u8 {d31}, [r4] //load A/B/C/D
- vld1.u8 {q0}, [r0], r1 //src[x]
-
- vdup.u8 d28, d31[0] //A
- vdup.u8 d29, d31[1] //B
- vdup.u8 d30, d31[2] //C
- vdup.u8 d31, d31[3] //D
-
- vext.u8 d1, d0, d1, #1 //src[x+1]
-
-w8_mc_chroma_loop: // each two pxl row
- vld1.u8 {q1}, [r0], r1 //src[x+stride]
- vld1.u8 {q2}, [r0], r1 //src[x+2*stride]
- vext.u8 d3, d2, d3, #1 //src[x+stride+1]
- vext.u8 d5, d4, d5, #1 //src[x+2*stride+1]
-
- vmull.u8 q3, d0, d28 //(src[x] * A)
- vmlal.u8 q3, d1, d29 //+=(src[x+1] * B)
- vmlal.u8 q3, d2, d30 //+=(src[x+stride] * C)
- vmlal.u8 q3, d3, d31 //+=(src[x+stride+1] * D)
- vrshrn.u16 d6, q3, #6
- vst1.u8 d6, [r2], r3
-
- vmull.u8 q3, d2, d28 //(src[x] * A)
- vmlal.u8 q3, d3, d29 //+=(src[x+1] * B)
- vmlal.u8 q3, d4, d30 //+=(src[x+stride] * C)
- vmlal.u8 q3, d5, d31 //+=(src[x+stride+1] * D)
- vrshrn.u16 d6, q3, #6
- vst1.u8 d6, [r2], r3
-
- vmov q0, q2
- sub r5, #2
- cmp r5, #0
- bne w8_mc_chroma_loop
-
- pop {r4, r5}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN enc_mc_chroma_w4_neon
-
- push {r4, r5, r6}
- ldr r4, [sp, #12]
- ldr r6, [sp, #16]
- vld1.u8 {d31}, [r4] //load A/B/C/D
-
- vdup.u8 d28, d31[0] //A
- vdup.u8 d29, d31[1] //B
- vdup.u8 d30, d31[2] //C
- vdup.u8 d31, d31[3] //D
-
-w4_mc_chroma_loop: // each two pxl row
- vld1.u8 {d0}, [r0], r1 //a::src[x]
- vld1.u8 {d2}, [r0], r1 //b::src[x+stride]
- vld1.u8 {d4}, [r0] //c::src[x+2*stride]
-
- vshr.u64 d1, d0, #8
- vshr.u64 d3, d2, #8
- vshr.u64 d5, d4, #8
-
- vmov q3, q1 //b::[0:7]+b::[1~8]
- vtrn.32 q0, q1 //d0{a::[0:3]+b::[0:3]}; d1{a::[1:4]+b::[1:4]}
- vtrn.32 q3, q2 //d6{b::[0:3]+c::[0:3]}; d7{b::[1:4]+c::[1:4]}
-
- vmull.u8 q1, d0, d28 //(src[x] * A)
- vmlal.u8 q1, d1, d29 //+=(src[x+1] * B)
- vmlal.u8 q1, d6, d30 //+=(src[x+stride] * C)
- vmlal.u8 q1, d7, d31 //+=(src[x+stride+1] * D)
-
- vrshrn.u16 d2, q1, #6
- vmov r4, r5, d2
- str r4, [r2], r3
- str r5, [r2], r3
-
- sub r6, #2
- cmp r6, #0
- bne w4_mc_chroma_loop
-
- pop {r4, r5, r6}
-WELS_ASM_FUNC_END
-#endif
+/*!
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifdef HAVE_NEON
+.text
+#include "arm_arch_common_macro.S"
+
+#ifdef APPLE_IOS
+.macro AVERAGE_TWO_8BITS
+// { // input:dst_d, src_d A and B; working: q13
+ vaddl.u8 q13, $2, $1
+ vrshrn.u16 $0, q13, #1
+// }
+.endm
+
+.macro FILTER_6TAG_8BITS
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b;
+ vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
+ vaddl.u8 q13, $2, $3 //src[0]+src[1]
+ vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles
+ vaddl.u8 q13, $1, $4 //src[-1]+src[2]
+ vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
+ vqrshrun.s16 $6, q12, #5
+// }
+.endm
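+// Note: FILTER_6TAG_8BITS computes the H.264 half-pel 6-tap filter per byte lane:
+//   out = clip8((src[-2] + src[3] - 5*(src[-1]+src[2]) + 20*(src[0]+src[1]) + 16) >> 5)
+// e.g. a flat input of 128 gives (256 - 1280 + 5120 + 16) >> 5 = 128, so DC is preserved.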
+
+.macro FILTER_SINGLE_TAG_8BITS // used when width=17/9
+// { // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2},
+ vrev64.8 $2, $0 // X[5][4][3][2][1][0]O
+ vaddl.u8 $3, $0, $2 // each 16bits, *[50][41][32][23][14][05]*
+ vmul.s16 $0, $2, $1 // 0+1*[50]-5*[41]+20[32]
+ vpadd.s16 $0, $0, $0
+ vpadd.s16 $0, $0, $0
+ vqrshrun.s16 $0, $4, #5
+// }
+.endm
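+// Note: FILTER_SINGLE_TAG_8BITS produces the one extra pixel needed when width is 17/9.
+// It folds the symmetric taps by adding the source bytes to their byte-reverse, multiplies
+// the four surviving 16-bit lanes by {0, 1, -5, 20} (the constant built by the caller),
+// reduces with two vpadd, then applies the same rounding (+16)>>5 narrow as the 6-tap macro.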
+
+.macro FILTER_6TAG_8BITS_AVERAGE_WITH_0
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b;
+ vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
+ vaddl.u8 q13, $2, $3 //src[0]+src[1]
+ vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles
+ vaddl.u8 q13, $1, $4 //src[-1]+src[2]
+ vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
+ vqrshrun.s16 $6, q12, #5
+ vaddl.u8 q13, $2, $6
+ vrshrn.u16 $6, q13, #1
+// }
+.endm
+
+.macro FILTER_6TAG_8BITS_AVERAGE_WITH_1
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b;
+ vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
+ vaddl.u8 q13, $2, $3 //src[0]+src[1]
+ vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles
+ vaddl.u8 q13, $1, $4 //src[-1]+src[2]
+ vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
+ vqrshrun.s16 $6, q12, #5
+ vaddl.u8 q13, $3, $6
+ vrshrn.u16 $6, q13, #1
+// }
+.endm
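+// Note: the _AVERAGE_WITH_0/_AVERAGE_WITH_1 variants append the quarter-pel step: after the
+// 6-tap half-pel result they average with src[0] (xy_10/xy_01) or src[1] (xy_30/xy_03),
+// i.e. out = (half + pel + 1) >> 1, done via vaddl.u8 plus vrshrn #1.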
+
+.macro FILTER_6TAG_8BITS_TO_16BITS
+// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q,
+ vaddl.u8 $6, $0, $5 //dst_q=src[-2]+src[3]
+ vaddl.u8 q13, $2, $3 //src[0]+src[1]
+ vmla.u16 $6, q13, $7 //dst_q += 20*(src[0]+src[1]), 2 cycles
+ vaddl.u8 q13, $1, $4 //src[-1]+src[2]
+ vmls.s16 $6, q13, $8 //dst_q -= 5*(src[-1]+src[2]), 2 cycles
+// }
+.endm
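+// Note: FILTER_6TAG_8BITS_TO_16BITS keeps the unscaled 16-bit vertical sum
+// src[-2]+src[3]-5*(src[-1]+src[2])+20*(src[0]+src[1]), so the hv paths can run the
+// horizontal pass (UNPACK_2_16BITS_TO_ABC + FILTER_3_IN_16BITS_TO_8BITS) before normalizing.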
+
+.macro FILTER_3_IN_16BITS_TO_8BITS
+// { // input:a, b, c, dst_d;
+ vsub.s16 $0, $0, $1 //a-b
+ vshr.s16 $0, $0, #2 //(a-b)/4
+ vsub.s16 $0, $0, $1 //(a-b)/4-b
+ vadd.s16 $0, $0, $2 //(a-b)/4-b+c
+ vshr.s16 $0, $0, #2 //((a-b)/4-b+c)/4
+ vadd.s16 $0, $0, $2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+ vqrshrun.s16 $3, $0, #6 //(+32)>>6
+// }
+.endm
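+// Note: the shift sequence above evaluates ((a-b)/4 - b + c)/4 + c, which equals
+// (a - 5*b + 20*c)/16 up to the truncation of the two arithmetic shifts; the final
+// vqrshrun #6 applies the remaining (+32)>>6, so the 2D filter totals roughly (+512)>>10.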
+
+.macro UNPACK_2_16BITS_TO_ABC
+// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a
+ vext.16 $4, $0, $1, #2 //src[0]
+ vext.16 $3, $0, $1, #3 //src[1]
+ vadd.s16 $4, $3 //c=src[0]+src[1]
+
+ vext.16 $3, $0, $1, #1 //src[-1]
+ vext.16 $2, $0, $1, #4 //src[2]
+ vadd.s16 $3, $2 //b=src[-1]+src[2]
+
+ vext.16 $2, $0, $1, #5 //src[3]
+ vadd.s16 $2, $0 //a=src[-2]+src[3]
+// }
+.endm
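+// Note: UNPACK_2_16BITS_TO_ABC slides the 6-tap window across the 16-bit row held in two q
+// registers; per output lane it returns a = t[-2]+t[3], b = t[-1]+t[2], c = t[0]+t[1],
+// ready for FILTER_3_IN_16BITS_TO_8BITS above.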
+
+.macro UNPACK_1_IN_8x16BITS_TO_8BITS
+// { // each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
+ vext.16 $3, $3, $3, #7 // 0x????, [0][1][2][3][4][5],
+ vrev64.16 $1, $1
+ vadd.u16 $2, $1 // C[2+3],B[1+4],A[0+5],
+ vshr.s64 $1, $2, #16
+ vshr.s64 $0, $2, #32 // Output: C $2, B $1, A $0
+
+ vsub.s16 $0, $0, $1 //a-b
+ vshr.s16 $0, $0, #2 //(a-b)/4
+ vsub.s16 $0, $0, $1 //(a-b)/4-b
+ vadd.s16 $0, $0, $2 //(a-b)/4-b+c
+ vshr.s16 $0, $0, #2 //((a-b)/4-b+c)/4
+ vadd.s16 $1, $0, $2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+ vqrshrun.s16 $0, $3, #6 //(+32)>>6
+// }
+.endm
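+// Note: UNPACK_1_IN_8x16BITS_TO_8BITS handles the single trailing column of the 17/9-wide
+// cases: it folds the six 16-bit taps into a, b, c by reversal and pairwise adds, then
+// applies the same (a-5*b+20*c)/16 combine and (+32)>>6 narrow as FILTER_3_IN_16BITS_TO_8BITS.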
+#else
+.macro AVERAGE_TWO_8BITS arg0, arg1,arg2
+// { // input:dst_d, src_d A and B; working: q13
+ vaddl.u8 q13, \arg2, \arg1
+ vrshrn.u16 \arg0, q13, #1
+// }
+.endm
+
+.macro FILTER_6TAG_8BITS arg0, arg1,arg2, arg3, arg4,arg5, arg6, arg7,arg8
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b
+ vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3]
+ vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
+ vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles
+ vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
+ vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
+ vqrshrun.s16 \arg6, q12, #5
+// }
+.endm
+
+.macro FILTER_SINGLE_TAG_8BITS arg0, arg1,arg2, arg3, arg4,arg5 // used when width=17/9
+// { // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2}
+ vrev64.8 \arg2, \arg0 // X[5][4][3][2][1][0]O
+ vaddl.u8 \arg3, \arg0, \arg2 // each 16bits, *[50][41][32][23][14][05]*
+ vmul.s16 \arg0, \arg2, \arg1 // 0+1*[50]-5*[41]+20[32]
+ vpadd.s16 \arg0, \arg0, \arg0
+ vpadd.s16 \arg0, \arg0, \arg0
+ vqrshrun.s16 \arg0, \arg4, #5
+// }
+.endm
+
+.macro FILTER_6TAG_8BITS_AVERAGE_WITH_0 arg0, arg1,arg2, arg3, arg4,arg5, arg6, arg7,arg8
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d
+ vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3]
+ vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
+ vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles
+ vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
+ vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
+ vqrshrun.s16 \arg6, q12, #5
+ vaddl.u8 q13, \arg2, \arg6
+ vrshrn.u16 \arg6, q13, #1
+// }
+.endm
+
+.macro FILTER_6TAG_8BITS_AVERAGE_WITH_1 arg0, arg1,arg2, arg3, arg4,arg5, arg6, arg7,arg8
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d
+ vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3]
+ vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
+ vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles
+ vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
+ vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
+ vqrshrun.s16 \arg6, q12, #5
+ vaddl.u8 q13, \arg3, \arg6
+ vrshrn.u16 \arg6, q13, #1
+// }
+.endm
+
+.macro FILTER_6TAG_8BITS_TO_16BITS arg0, arg1,arg2, arg3, arg4,arg5, arg6, arg7,arg8
+// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3]
+ vaddl.u8 \arg6, \arg0, \arg5 //dst_q=src[-2]+src[3]
+ vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
+ vmla.u16 \arg6, q13, \arg7 //dst_q += 20*(src[0]+src[1]), 2 cycles
+ vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
+ vmls.s16 \arg6, q13, \arg8 //dst_q -= 5*(src[-1]+src[2]), 2 cycles
+// }
+.endm
+
+.macro FILTER_3_IN_16BITS_TO_8BITS arg0, arg1,arg2, arg3
+// { // input:a, b, c, dst_d;
+ vsub.s16 \arg0, \arg0, \arg1 //a-b
+ vshr.s16 \arg0, \arg0, #2 //(a-b)/4
+ vsub.s16 \arg0, \arg0, \arg1 //(a-b)/4-b
+ vadd.s16 \arg0, \arg0, \arg2 //(a-b)/4-b+c
+ vshr.s16 \arg0, \arg0, #2 //((a-b)/4-b+c)/4
+ vadd.s16 \arg0, \arg0, \arg2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+ vqrshrun.s16 \arg3, \arg0, #6 //(+32)>>6
+// }
+.endm
+
+.macro UNPACK_2_16BITS_TO_ABC arg0, arg1,arg2, arg3, arg4
+// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5)
+ vext.16 \arg4, \arg0, \arg1, #2 //src[0]
+ vext.16 \arg3, \arg0, \arg1, #3 //src[1]
+ vadd.s16 \arg4, \arg3 //c=src[0]+src[1]
+
+ vext.16 \arg3, \arg0, \arg1, #1 //src[-1]
+ vext.16 \arg2, \arg0, \arg1, #4 //src[2]
+ vadd.s16 \arg3, \arg2 //b=src[-1]+src[2]
+
+ vext.16 \arg2, \arg0, \arg1, #5 //src[3]
+ vadd.s16 \arg2, \arg0 //a=src[-2]+src[3]
+// }
+.endm
+
+.macro UNPACK_1_IN_8x16BITS_TO_8BITS arg0, arg1,arg2, arg3
+// { // each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
+ vext.16 \arg3, \arg3, \arg3, #7 // 0x????, [0][1][2][3][4][5]
+ vrev64.16 \arg1, \arg1
+ vadd.u16 \arg2, \arg1 // C[2+3],B[1+4],A[0+5]
+ vshr.s64 \arg1, \arg2, #16
+ vshr.s64 \arg0, \arg2, #32 // Output: C \arg2, B \arg1, A \arg0
+
+ vsub.s16 \arg0, \arg0, \arg1 //a-b
+ vshr.s16 \arg0, \arg0, #2 //(a-b)/4
+ vsub.s16 \arg0, \arg0, \arg1 //(a-b)/4-b
+ vadd.s16 \arg0, \arg0, \arg2 //(a-b)/4-b+c
+ vshr.s16 \arg0, \arg0, #2 //((a-b)/4-b+c)/4
+ vadd.s16 \arg1, \arg0, \arg2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+ vqrshrun.s16 \arg0, \arg3, #6 //(+32)>>6
+// }
+.endm
+#endif
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_h_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
+
+w16_h_mc_luma_loop:
+ vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2]
+ pld [r0]
+ pld [r0, #16]
+
+ vext.8 q2, q0, q1, #1 //q2=src[-1]
+ vext.8 q3, q0, q1, #2 //q3=src[0]
+ vext.8 q4, q0, q1, #3 //q4=src[1]
+ vext.8 q5, q0, q1, #4 //q5=src[2]
+ vext.8 q6, q0, q1, #5 //q6=src[3]
+
+ FILTER_6TAG_8BITS d0, d4, d6, d8, d10, d12, d2, q14, q15
+
+ FILTER_6TAG_8BITS d1, d5, d7, d9, d11, d13, d3, q14, q15
+
+ sub r4, #1
+ vst1.u8 {d2, d3}, [r2], r3 //write 16Byte
+
+ cmp r4, #0
+ bne w16_h_mc_luma_loop
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w17_h_neon
+ push {r4-r5}
+ mov r4, #20
+ mov r5, #1
+ sub r4, r4, r4, lsl #(16-2)
+ lsl r5, #16
+ ror r4, #16
+ vmov d3, r5, r4 // 0x0014FFFB00010000
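+    // d3 now holds the 16-bit lanes {0, 1, -5, 20} (0x0014FFFB00010000): the tap weights
+    // consumed by FILTER_SINGLE_TAG_8BITS for the extra (17th) output pixel of each row.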
+
+ sub r3, #16
+ ldr r4, [sp, #8]
+
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
+
+w17_h_mc_luma_loop:
+ vld1.u8 {d0,d1,d2}, [r0], r1 //only use 22(17+5); q0=src[-2]
+
+ vext.8 q2, q0, q1, #1 //q2=src[-1]
+ vext.8 q3, q0, q1, #2 //q3=src[0]
+ vext.8 q4, q0, q1, #3 //q4=src[1]
+ vext.8 q5, q0, q1, #4 //q5=src[2]
+ vext.8 q6, q0, q1, #5 //q6=src[3]
+
+ FILTER_6TAG_8BITS d0, d4, d6, d8, d10, d12, d14, q14, q15
+
+ FILTER_6TAG_8BITS d1, d5, d7, d9, d11, d13, d15, q14, q15
+
+ vst1.u8 {d14, d15}, [r2]! //write [0:15] Byte
+
+ vsli.64 d2, d2, #8 // [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X
+ FILTER_SINGLE_TAG_8BITS d2, d3, d14, q7, q1
+
+ vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte
+
+ sub r4, #1
+ cmp r4, #0
+ bne w17_h_mc_luma_loop
+ pop {r4-r5}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w9_h_neon
+ push {r4-r5}
+ mov r4, #20
+ mov r5, #1
+ sub r4, r4, r4, lsl #(16-2)
+ lsl r5, #16
+ ror r4, #16
+ vmov d7, r5, r4 // 0x0014FFFB00010000
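+    // d7 = {0, 1, -5, 20}, the same single-tap weights as built in enc_mc_luma_w17_h_neon.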
+
+ sub r3, #8
+ ldr r4, [sp, #8]
+
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
+
+w9_h_mc_luma_loop:
+ vld1.u8 {d0,d1}, [r0], r1 //only use 14(9+5); q0=src[-2]
+ pld [r0]
+
+ vext.8 d2, d0, d1, #1 //d2=src[-1]
+ vext.8 d3, d0, d1, #2 //d3=src[0]
+ vext.8 d4, d0, d1, #3 //d4=src[1]
+ vext.8 d5, d0, d1, #4 //d5=src[2]
+ vext.8 d6, d0, d1, #5 //d6=src[3]
+
+ FILTER_6TAG_8BITS d0, d2, d3, d4, d5, d6, d8, q14, q15
+
+ sub r4, #1
+ vst1.u8 {d8}, [r2]! //write [0:7] Byte
+
+ vsli.64 d2, d1, #8 // [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X
+ FILTER_SINGLE_TAG_8BITS d2, d7, d14, q7, q1
+ vst1.u8 {d2[0]}, [r2], r3 //write 8th Byte
+
+ cmp r4, #0
+ bne w9_h_mc_luma_loop
+ pop {r4-r5}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_h_neon
+ push {r4, r5, r6}
+ ldr r6, [sp, #12]
+
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
+
+w4_h_mc_luma_loop:
+ vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5]
+ pld [r0]
+ vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5]
+ pld [r0]
+
+ vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6]
+ vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6]
+ vext.8 q3, q2, q2, #1 //src[0:6 *]
+ vext.8 q4, q2, q2, #2 //src[1:6 * *]
+
+ vtrn.32 q3, q4 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
+ vtrn.32 d6, d7 //d6:[0:3]; d7[1:4]
+ vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5]
+ vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6]
+
+ FILTER_6TAG_8BITS d0, d4, d6, d7, d2, d5, d1, q14, q15
+
+ vmov r4, r5, d1
+ str r4, [r2], r3
+ str r5, [r2], r3
+
+ sub r6, #2
+ cmp r6, #0
+ bne w4_h_mc_luma_loop
+
+ pop {r4, r5, r6}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_xy_10_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
+
+w16_xy_10_mc_luma_loop:
+ vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2]
+ pld [r0]
+ pld [r0, #16]
+
+ vext.8 q2, q0, q1, #1 //q2=src[-1]
+ vext.8 q3, q0, q1, #2 //q3=src[0]
+ vext.8 q4, q0, q1, #3 //q4=src[1]
+ vext.8 q5, q0, q1, #4 //q5=src[2]
+ vext.8 q6, q0, q1, #5 //q6=src[3]
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d4, d6, d8, d10, d12, d2, q14, q15
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d5, d7, d9, d11, d13, d3, q14, q15
+
+ sub r4, #1
+ vst1.u8 {d2, d3}, [r2], r3 //write 16Byte
+
+ cmp r4, #0
+ bne w16_xy_10_mc_luma_loop
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w8_xy_10_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
+
+w8_xy_10_mc_luma_loop:
+ vld1.u8 {d0,d1}, [r0], r1 //only use 13(8+5); q0=src[-2]
+ pld [r0]
+
+ vext.8 d2, d0, d1, #1 //d2=src[-1]
+ vext.8 d3, d0, d1, #2 //d3=src[0]
+ vext.8 d4, d0, d1, #3 //d4=src[1]
+ vext.8 d5, d0, d1, #4 //d5=src[2]
+ vext.8 d6, d0, d1, #5 //d6=src[3]
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d3, d4, d5, d6, d1, q14, q15
+
+ sub r4, #1
+ vst1.u8 {d1}, [r2], r3
+
+ cmp r4, #0
+ bne w8_xy_10_mc_luma_loop
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_xy_10_neon
+ push {r4, r5, r6}
+ ldr r6, [sp, #12]
+
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
+
+w4_xy_10_mc_luma_loop:
+ vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5]
+ pld [r0]
+ vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5]
+ pld [r0]
+
+ vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6]
+ vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6]
+ vext.8 q3, q2, q2, #1 //src[0:6 *]
+ vext.8 q4, q2, q2, #2 //src[1:6 * *]
+
+ vtrn.32 q3, q4 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
+ vtrn.32 d6, d7 //d6:[0:3]; d7[1:4]
+ vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5]
+ vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6]
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d4, d6, d7, d2, d5, d1, q14, q15
+
+ vmov r4, r5, d1
+ str r4, [r2], r3
+ str r5, [r2], r3
+
+ sub r6, #2
+ cmp r6, #0
+ bne w4_xy_10_mc_luma_loop
+
+ pop {r4, r5, r6}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_xy_30_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
+
+w16_xy_30_mc_luma_loop:
+ vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2]
+ pld [r0]
+ pld [r0, #16]
+
+ vext.8 q2, q0, q1, #1 //q2=src[-1]
+ vext.8 q3, q0, q1, #2 //q3=src[0]
+ vext.8 q4, q0, q1, #3 //q4=src[1]
+ vext.8 q5, q0, q1, #4 //q5=src[2]
+ vext.8 q6, q0, q1, #5 //q6=src[3]
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d4, d6, d8, d10, d12, d2, q14, q15
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d5, d7, d9, d11, d13, d3, q14, q15
+
+ sub r4, #1
+ vst1.u8 {d2, d3}, [r2], r3 //write 16Byte
+
+ cmp r4, #0
+ bne w16_xy_30_mc_luma_loop
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w8_xy_30_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
+
+w8_xy_30_mc_luma_loop:
+ vld1.u8 {d0,d1}, [r0], r1 //only use 13(8+5); q0=src[-2]
+ pld [r0]
+
+ vext.8 d2, d0, d1, #1 //d2=src[-1]
+ vext.8 d3, d0, d1, #2 //d3=src[0]
+ vext.8 d4, d0, d1, #3 //d4=src[1]
+ vext.8 d5, d0, d1, #4 //d5=src[2]
+ vext.8 d6, d0, d1, #5 //d6=src[3]
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d3, d4, d5, d6, d1, q14, q15
+
+ sub r4, #1
+ vst1.u8 {d1}, [r2], r3
+
+ cmp r4, #0
+ bne w8_xy_30_mc_luma_loop
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_xy_30_neon
+ push {r4, r5, r6}
+ ldr r6, [sp, #12]
+
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
+
+w4_xy_30_mc_luma_loop:
+ vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5]
+ pld [r0]
+ vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5]
+ pld [r0]
+
+ vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6]
+ vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6]
+ vext.8 q3, q2, q2, #1 //src[0:6 *]
+ vext.8 q4, q2, q2, #2 //src[1:6 * *]
+
+ vtrn.32 q3, q4 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
+ vtrn.32 d6, d7 //d6:[0:3]; d7[1:4]
+ vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5]
+ vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6]
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d4, d6, d7, d2, d5, d1, q14, q15
+
+ vmov r4, r5, d1
+ str r4, [r2], r3
+ str r5, [r2], r3
+
+ sub r6, #2
+ cmp r6, #0
+ bne w4_xy_30_mc_luma_loop
+
+ pop {r4, r5, r6}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_xy_01_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {q0}, [r0], r1 //q0=src[-2]
+ vld1.u8 {q1}, [r0], r1 //q1=src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ vld1.u8 {q2}, [r0], r1 //q2=src[0]
+ vld1.u8 {q3}, [r0], r1 //q3=src[1]
+ vld1.u8 {q4}, [r0], r1 //q4=src[2]
+
+w16_xy_01_luma_loop:
+
+ vld1.u8 {q5}, [r0], r1 //q5=src[3]
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d4, d6, d8, d10, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d3, d5, d7, d9, d11, d13, q14, q15
+ vld1.u8 {q0}, [r0], r1 //read 2nd row
+ vst1.u8 {q6}, [r2], r3 //write 1st 16Byte
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d4, d6, d8, d10, d0, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d5, d7, d9, d11, d1, d13, q14, q15
+ vld1.u8 {q1}, [r0], r1 //read 3rd row
+ vst1.u8 {q6}, [r2], r3 //write 2nd 16Byte
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d4, d6, d8, d10, d0, d2, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d5, d7, d9, d11, d1, d3, d13, q14, q15
+ vld1.u8 {q2}, [r0], r1 //read 4th row
+ vst1.u8 {q6}, [r2], r3 //write 3rd 16Byte
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d6, d8, d10, d0, d2, d4, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d7, d9, d11, d1, d3, d5, d13, q14, q15
+ vld1.u8 {q3}, [r0], r1 //read 5th row
+ vst1.u8 {q6}, [r2], r3 //write 4th 16Byte
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d8, d10, d0, d2, d4, d6, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d9, d11, d1, d3, d5, d7, d13, q14, q15
+ vld1.u8 {q4}, [r0], r1 //read 6th row
+ vst1.u8 {q6}, [r2], r3 //write 5th 16Byte
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d10, d0, d2, d4, d6, d8, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d11, d1, d3, d5, d7, d9, d13, q14, q15
+ vld1.u8 {q5}, [r0], r1 //read 7th row
+ vst1.u8 {q6}, [r2], r3 //write 6th 16Byte
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d4, d6, d8, d10, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d3, d5, d7, d9, d11, d13, q14, q15
+ vld1.u8 {q0}, [r0], r1 //read 8th row
+ vst1.u8 {q6}, [r2], r3 //write 7th 16Byte
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d4, d6, d8, d10, d0, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d5, d7, d9, d11, d1, d13, q14, q15
+ vst1.u8 {q6}, [r2], r3 //write 8th 16Byte
+
+ //q2, q3, q4, q5, q0 --> q0~q4
+ vswp q0, q4
+ vswp q0, q2
+ vmov q1, q3
+ vmov q3, q5 //q0~q4
+
+ sub r4, #8
+ cmp r4, #0
+ bne w16_xy_01_luma_loop
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w8_xy_01_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {d0}, [r0], r1 //d0=src[-2]
+ vld1.u8 {d1}, [r0], r1 //d1=src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ vld1.u8 {d2}, [r0], r1 //d2=src[0]
+ vld1.u8 {d3}, [r0], r1 //d3=src[1]
+
+ vld1.u8 {d4}, [r0], r1 //d4=src[2]
+ vld1.u8 {d5}, [r0], r1 //d5=src[3]
+
+w8_xy_01_mc_luma_loop:
+
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d1, d2, d3, d4, d5, d12, q14, q15
+ vld1.u8 {d0}, [r0], r1 //read 2nd row
+ vst1.u8 {d12}, [r2], r3 //write 1st 8Byte
+
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d2, d3, d4, d5, d0, d12, q14, q15
+ vld1.u8 {d1}, [r0], r1 //read 3rd row
+ vst1.u8 {d12}, [r2], r3 //write 2nd 8Byte
+
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d3, d4, d5, d0, d1, d12, q14, q15
+ vld1.u8 {d2}, [r0], r1 //read 4th row
+ vst1.u8 {d12}, [r2], r3 //write 3rd 8Byte
+
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d4, d5, d0, d1, d2, d12, q14, q15
+ vld1.u8 {d3}, [r0], r1 //read 5th row
+ vst1.u8 {d12}, [r2], r3 //write 4th 8Byte
+
+ //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
+ vswp q0, q2
+ vswp q1, q2
+
+ sub r4, #4
+ cmp r4, #0
+ bne w8_xy_01_mc_luma_loop
+
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_xy_01_neon
+ push {r4, r5, r6, r7}
+ sub r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ ldr r4, [r0], r1 //r4=src[-2]
+ ldr r5, [r0], r1 //r5=src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ ldr r6, [r0], r1 //r6=src[0]
+ ldr r7, [r0], r1 //r7=src[1]
+
+ vmov d0, r4, r5
+ vmov d1, r5, r6
+ vmov d2, r6, r7
+
+ ldr r4, [r0], r1 //r4=src[2]
+ vmov d3, r7, r4
+ ldr r7, [sp, #16]
+
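+    // 4-pixel rows travel through ARM core registers (ldr/str) and are packed two rows per
+    // d register via vmov, so each FILTER_6TAG call below yields two 4-byte output rows.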
+w4_xy_01_mc_luma_loop:
+
+    //r4 still holds src[2] from the setup above or from the previous iteration
+ ldr r5, [r0], r1 //r5=src[3]
+ ldr r6, [r0], r1 //r6=src[0]
+ vmov d4, r4, r5
+ vmov d5, r5, r6 //reserved r6
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d1, d2, d3, d4, d5, d12, q14, q15
+ vmov r4, r5, d12
+ str r4, [r2], r3 //write 1st 4Byte
+ str r5, [r2], r3 //write 2nd 4Byte
+
+ ldr r5, [r0], r1 //r5=src[1]
+ ldr r4, [r0], r1 //r4=src[2]
+ vmov d0, r6, r5
+ vmov d1, r5, r4 //reserved r4
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d3, d4, d5, d0, d1, d12, q14, q15
+ vmov r5, r6, d12
+ str r5, [r2], r3 //write 3rd 4Byte
+ str r6, [r2], r3 //write 4th 4Byte
+
+ //d4, d5, d0, d1 --> d0, d1, d2, d3
+ vmov q1, q0
+ vmov q0, q2
+
+ sub r7, #4
+ cmp r7, #0
+ bne w4_xy_01_mc_luma_loop
+
+ pop {r4, r5, r6, r7}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_xy_03_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {q0}, [r0], r1 //q0=src[-2]
+ vld1.u8 {q1}, [r0], r1 //q1=src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ vld1.u8 {q2}, [r0], r1 //q2=src[0]
+ vld1.u8 {q3}, [r0], r1 //q3=src[1]
+ vld1.u8 {q4}, [r0], r1 //q4=src[2]
+
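+    // same vertical 6-tap as the xy_01 variant above; FILTER_6TAG_8BITS_AVERAGE_WITH_1
+    // presumably averages the half-pel result with src[1] instead, i.e. the (0,3) quarter-pel.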
+w16_xy_03_luma_loop:
+
+ vld1.u8 {q5}, [r0], r1 //q5=src[3]
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d4, d6, d8, d10, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d3, d5, d7, d9, d11, d13, q14, q15
+ vld1.u8 {q0}, [r0], r1 //read 2nd row
+ vst1.u8 {q6}, [r2], r3 //write 1st 16Byte
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d4, d6, d8, d10, d0, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d5, d7, d9, d11, d1, d13, q14, q15
+ vld1.u8 {q1}, [r0], r1 //read 3rd row
+ vst1.u8 {q6}, [r2], r3 //write 2nd 16Byte
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d4, d6, d8, d10, d0, d2, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d5, d7, d9, d11, d1, d3, d13, q14, q15
+ vld1.u8 {q2}, [r0], r1 //read 4th row
+ vst1.u8 {q6}, [r2], r3 //write 3rd 16Byte
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d6, d8, d10, d0, d2, d4, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d7, d9, d11, d1, d3, d5, d13, q14, q15
+ vld1.u8 {q3}, [r0], r1 //read 5th row
+ vst1.u8 {q6}, [r2], r3 //write 4th 16Byte
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d8, d10, d0, d2, d4, d6, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d9, d11, d1, d3, d5, d7, d13, q14, q15
+ vld1.u8 {q4}, [r0], r1 //read 6th row
+ vst1.u8 {q6}, [r2], r3 //write 5th 16Byte
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d10, d0, d2, d4, d6, d8, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d11, d1, d3, d5, d7, d9, d13, q14, q15
+ vld1.u8 {q5}, [r0], r1 //read 7th row
+ vst1.u8 {q6}, [r2], r3 //write 6th 16Byte
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d4, d6, d8, d10, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d3, d5, d7, d9, d11, d13, q14, q15
+ vld1.u8 {q0}, [r0], r1 //read 8th row
+ vst1.u8 {q6}, [r2], r3 //write 7th 16Byte
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d4, d6, d8, d10, d0, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d5, d7, d9, d11, d1, d13, q14, q15
+ vst1.u8 {q6}, [r2], r3 //write 8th 16Byte
+
+ //q2, q3, q4, q5, q0 --> q0~q4
+ vswp q0, q4
+ vswp q0, q2
+ vmov q1, q3
+ vmov q3, q5 //q0~q4
+
+ sub r4, #8
+ cmp r4, #0
+ bne w16_xy_03_luma_loop
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w8_xy_03_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {d0}, [r0], r1 //d0=src[-2]
+ vld1.u8 {d1}, [r0], r1 //d1=src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ vld1.u8 {d2}, [r0], r1 //d2=src[0]
+ vld1.u8 {d3}, [r0], r1 //d3=src[1]
+
+ vld1.u8 {d4}, [r0], r1 //d4=src[2]
+ vld1.u8 {d5}, [r0], r1 //d5=src[3]
+
+w8_xy_03_mc_luma_loop:
+
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d1, d2, d3, d4, d5, d12, q14, q15
+ vld1.u8 {d0}, [r0], r1 //read 2nd row
+ vst1.u8 {d12}, [r2], r3 //write 1st 8Byte
+
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d2, d3, d4, d5, d0, d12, q14, q15
+ vld1.u8 {d1}, [r0], r1 //read 3rd row
+ vst1.u8 {d12}, [r2], r3 //write 2nd 8Byte
+
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d3, d4, d5, d0, d1, d12, q14, q15
+ vld1.u8 {d2}, [r0], r1 //read 4th row
+ vst1.u8 {d12}, [r2], r3 //write 3rd 8Byte
+
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d4, d5, d0, d1, d2, d12, q14, q15
+ vld1.u8 {d3}, [r0], r1 //read 5th row
+ vst1.u8 {d12}, [r2], r3 //write 4th 8Byte
+
+ //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
+ vswp q0, q2
+ vswp q1, q2
+
+ sub r4, #4
+ cmp r4, #0
+ bne w8_xy_03_mc_luma_loop
+
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_xy_03_neon
+ push {r4, r5, r6, r7}
+ sub r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ ldr r4, [r0], r1 //r4=src[-2]
+ ldr r5, [r0], r1 //r5=src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ ldr r6, [r0], r1 //r6=src[0]
+ ldr r7, [r0], r1 //r7=src[1]
+
+ vmov d0, r4, r5
+ vmov d1, r5, r6
+ vmov d2, r6, r7
+
+ ldr r4, [r0], r1 //r4=src[2]
+ vmov d3, r7, r4
+ ldr r7, [sp, #16]
+
+w4_xy_03_mc_luma_loop:
+
+    //r4 still holds src[2] from the setup above or from the previous iteration
+ ldr r5, [r0], r1 //r5=src[3]
+ ldr r6, [r0], r1 //r6=src[0]
+ vmov d4, r4, r5
+ vmov d5, r5, r6 //reserved r6
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d1, d2, d3, d4, d5, d12, q14, q15
+ vmov r4, r5, d12
+ str r4, [r2], r3 //write 1st 4Byte
+ str r5, [r2], r3 //write 2nd 4Byte
+
+ ldr r5, [r0], r1 //r5=src[1]
+ ldr r4, [r0], r1 //r4=src[2]
+ vmov d0, r6, r5
+ vmov d1, r5, r4 //reserved r4
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d3, d4, d5, d0, d1, d12, q14, q15
+ vmov r5, r6, d12
+ str r5, [r2], r3 //write 3rd 4Byte
+ str r6, [r2], r3 //write 4th 4Byte
+
+ //d4, d5, d0, d1 --> d0, d1, d2, d3
+ vmov q1, q0
+ vmov q0, q2
+
+ sub r7, #4
+ cmp r7, #0
+ bne w4_xy_03_mc_luma_loop
+
+ pop {r4, r5, r6, r7}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_v_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {q0}, [r0], r1 //q0=src[-2]
+ vld1.u8 {q1}, [r0], r1 //q1=src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ vld1.u8 {q2}, [r0], r1 //q2=src[0]
+ vld1.u8 {q3}, [r0], r1 //q3=src[1]
+ vld1.u8 {q4}, [r0], r1 //q4=src[2]
+
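+    // plain vertical half-pel: FILTER_6TAG_8BITS stores the 6-tap result directly, presumably
+    // clip(((a-5b+20c+20d-5e+f)+16)>>5) as in the H.264 spec, with no quarter-pel averaging.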
+w16_v_mc_luma_loop:
+
+ vld1.u8 {q5}, [r0], r1 //q5=src[3]
+
+ FILTER_6TAG_8BITS d0, d2, d4, d6, d8, d10, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d1, d3, d5, d7, d9, d11, d13, q14, q15
+ vld1.u8 {q0}, [r0], r1 //read 2nd row
+ vst1.u8 {q6}, [r2], r3 //write 1st 16Byte
+
+ FILTER_6TAG_8BITS d2, d4, d6, d8, d10, d0, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d3, d5, d7, d9, d11, d1, d13, q14, q15
+ vld1.u8 {q1}, [r0], r1 //read 3rd row
+ vst1.u8 {q6}, [r2], r3 //write 2nd 16Byte
+
+ FILTER_6TAG_8BITS d4, d6, d8, d10, d0, d2, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d5, d7, d9, d11, d1, d3, d13, q14, q15
+ vld1.u8 {q2}, [r0], r1 //read 4th row
+ vst1.u8 {q6}, [r2], r3 //write 3rd 16Byte
+
+ FILTER_6TAG_8BITS d6, d8, d10, d0, d2, d4, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d7, d9, d11, d1, d3, d5, d13, q14, q15
+ vld1.u8 {q3}, [r0], r1 //read 5th row
+ vst1.u8 {q6}, [r2], r3 //write 4th 16Byte
+
+ FILTER_6TAG_8BITS d8, d10, d0, d2, d4, d6, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d9, d11, d1, d3, d5, d7, d13, q14, q15
+ vld1.u8 {q4}, [r0], r1 //read 6th row
+ vst1.u8 {q6}, [r2], r3 //write 5th 16Byte
+
+ FILTER_6TAG_8BITS d10, d0, d2, d4, d6, d8, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d11, d1, d3, d5, d7, d9, d13, q14, q15
+ vld1.u8 {q5}, [r0], r1 //read 7th row
+ vst1.u8 {q6}, [r2], r3 //write 6th 16Byte
+
+ FILTER_6TAG_8BITS d0, d2, d4, d6, d8, d10, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d1, d3, d5, d7, d9, d11, d13, q14, q15
+ vld1.u8 {q0}, [r0], r1 //read 8th row
+ vst1.u8 {q6}, [r2], r3 //write 7th 16Byte
+
+ FILTER_6TAG_8BITS d2, d4, d6, d8, d10, d0, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d3, d5, d7, d9, d11, d1, d13, q14, q15
+ vst1.u8 {q6}, [r2], r3 //write 8th 16Byte
+
+ //q2, q3, q4, q5, q0 --> q0~q4
+ vswp q0, q4
+ vswp q0, q2
+ vmov q1, q3
+ vmov q3, q5 //q0~q4
+
+ sub r4, #8
+ cmp r4, #0
+ bne w16_v_mc_luma_loop
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w17_v_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {q0}, [r0], r1 //q0=src[-2]
+ vld1.u8 {q1}, [r0], r1 //q1=src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ vld1.u8 {q2}, [r0], r1 //q2=src[0]
+ vld1.u8 {q3}, [r0], r1 //q3=src[1]
+ vld1.u8 {q4}, [r0], r1 //q4=src[2]
+
+w17_v_mc_luma_loop:
+
+ vld1.u8 {q5}, [r0], r1 //q5=src[3]
+
+ FILTER_6TAG_8BITS d0, d2, d4, d6, d8, d10, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d1, d3, d5, d7, d9, d11, d13, q14, q15
+ vld1.u8 {q0}, [r0], r1 //read 2nd row
+ vst1.u8 {q6}, [r2], r3 //write 1st 16Byte
+
+ FILTER_6TAG_8BITS d2, d4, d6, d8, d10, d0, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d3, d5, d7, d9, d11, d1, d13, q14, q15
+ vld1.u8 {q1}, [r0], r1 //read 3rd row
+ vst1.u8 {q6}, [r2], r3 //write 2nd 16Byte
+
+ FILTER_6TAG_8BITS d4, d6, d8, d10, d0, d2, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d5, d7, d9, d11, d1, d3, d13, q14, q15
+ vld1.u8 {q2}, [r0], r1 //read 4th row
+ vst1.u8 {q6}, [r2], r3 //write 3rd 16Byte
+
+ FILTER_6TAG_8BITS d6, d8, d10, d0, d2, d4, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d7, d9, d11, d1, d3, d5, d13, q14, q15
+ vld1.u8 {q3}, [r0], r1 //read 5th row
+ vst1.u8 {q6}, [r2], r3 //write 4th 16Byte
+
+ FILTER_6TAG_8BITS d8, d10, d0, d2, d4, d6, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d9, d11, d1, d3, d5, d7, d13, q14, q15
+ vld1.u8 {q4}, [r0], r1 //read 6th row
+ vst1.u8 {q6}, [r2], r3 //write 5th 16Byte
+
+ FILTER_6TAG_8BITS d10, d0, d2, d4, d6, d8, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d11, d1, d3, d5, d7, d9, d13, q14, q15
+ vld1.u8 {q5}, [r0], r1 //read 7th row
+ vst1.u8 {q6}, [r2], r3 //write 6th 16Byte
+
+ FILTER_6TAG_8BITS d0, d2, d4, d6, d8, d10, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d1, d3, d5, d7, d9, d11, d13, q14, q15
+ vld1.u8 {q0}, [r0], r1 //read 8th row
+ vst1.u8 {q6}, [r2], r3 //write 7th 16Byte
+
+ FILTER_6TAG_8BITS d2, d4, d6, d8, d10, d0, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d3, d5, d7, d9, d11, d1, d13, q14, q15
+ vst1.u8 {q6}, [r2], r3 //write 8th 16Byte
+
+ //q2, q3, q4, q5, q0 --> q0~q4
+ vswp q0, q4
+ vswp q0, q2
+ vmov q1, q3
+ vmov q3, q5 //q0~q4
+
+ sub r4, #8
+ cmp r4, #1
+ bne w17_v_mc_luma_loop
+ // the last 16Bytes
+ vld1.u8 {q5}, [r0], r1 //q5=src[3]
+ FILTER_6TAG_8BITS d0, d2, d4, d6, d8, d10, d12, q14, q15
+ FILTER_6TAG_8BITS d1, d3, d5, d7, d9, d11, d13, q14, q15
+ vst1.u8 {q6}, [r2], r3 //write 1st 16Byte
+
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w9_v_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {d0}, [r0], r1 //d0=src[-2]
+ vld1.u8 {d1}, [r0], r1 //d1=src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ vld1.u8 {d2}, [r0], r1 //d2=src[0]
+ vld1.u8 {d3}, [r0], r1 //d3=src[1]
+
+ vld1.u8 {d4}, [r0], r1 //d4=src[2]
+ vld1.u8 {d5}, [r0], r1 //d5=src[3]
+
+w9_v_mc_luma_loop:
+
+ pld [r0]
+ FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d12, q14, q15
+ vld1.u8 {d0}, [r0], r1 //read 2nd row
+ vst1.u8 {d12}, [r2], r3 //write 1st 8Byte
+
+ pld [r0]
+ FILTER_6TAG_8BITS d1, d2, d3, d4, d5, d0, d12, q14, q15
+ vld1.u8 {d1}, [r0], r1 //read 3rd row
+ vst1.u8 {d12}, [r2], r3 //write 2nd 8Byte
+
+ pld [r0]
+ FILTER_6TAG_8BITS d2, d3, d4, d5, d0, d1, d12, q14, q15
+ vld1.u8 {d2}, [r0], r1 //read 4th row
+ vst1.u8 {d12}, [r2], r3 //write 3rd 8Byte
+
+ pld [r0]
+ FILTER_6TAG_8BITS d3, d4, d5, d0, d1, d2, d12, q14, q15
+ vld1.u8 {d3}, [r0], r1 //read 5th row
+ vst1.u8 {d12}, [r2], r3 //write 4th 8Byte
+
+ //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
+ vswp q0, q2
+ vswp q1, q2
+
+ sub r4, #4
+ cmp r4, #1
+ bne w9_v_mc_luma_loop
+
+ FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d12, q14, q15
+ vst1.u8 {d12}, [r2], r3 //write last 8Byte
+
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_v_neon
+ push {r4, r5, r6, r7}
+ sub r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ ldr r4, [r0], r1 //r4=src[-2]
+ ldr r5, [r0], r1 //r5=src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ ldr r6, [r0], r1 //r6=src[0]
+ ldr r7, [r0], r1 //r7=src[1]
+
+ vmov d0, r4, r5
+ vmov d1, r5, r6
+ vmov d2, r6, r7
+
+ ldr r4, [r0], r1 //r4=src[2]
+ vmov d3, r7, r4
+ ldr r7, [sp, #16]
+
+w4_v_mc_luma_loop:
+
+// pld [r0]
+    //r4 still holds src[2] from the setup above or from the previous iteration
+ ldr r5, [r0], r1 //r5=src[3]
+ ldr r6, [r0], r1 //r6=src[0]
+ vmov d4, r4, r5
+ vmov d5, r5, r6 //reserved r6
+
+ FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d12, q14, q15
+ vmov r4, r5, d12
+ str r4, [r2], r3 //write 1st 4Byte
+ str r5, [r2], r3 //write 2nd 4Byte
+
+ ldr r5, [r0], r1 //r5=src[1]
+ ldr r4, [r0], r1 //r4=src[2]
+ vmov d0, r6, r5
+ vmov d1, r5, r4 //reserved r4
+
+ FILTER_6TAG_8BITS d2, d3, d4, d5, d0, d1, d12, q14, q15
+ vmov r5, r6, d12
+ str r5, [r2], r3 //write 3rd 4Byte
+ str r6, [r2], r3 //write 4th 4Byte
+
+ //d4, d5, d0, d1 --> d0, d1, d2, d3
+ vmov q1, q0
+ vmov q0, q2
+
+ sub r7, #4
+ cmp r7, #0
+ bne w4_v_mc_luma_loop
+
+ pop {r4, r5, r6, r7}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w16_hv_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, #2 //src[-2]
+ sub r0, r1, lsl #1 //src[-2*src_stride-2]
+ pld [r0]
+ pld [r0, r1]
+
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {d0-d2}, [r0], r1 //use 21(16+5), =src[-2]
+ vld1.u8 {d3-d5}, [r0], r1 //use 21(16+5), =src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+
+ vld1.u8 {d6-d8}, [r0], r1 //use 21(16+5), =src[0]
+ vld1.u8 {d9-d11}, [r0], r1 //use 21(16+5), =src[1]
+ pld [r0]
+ pld [r0, r1]
+ vld1.u8 {d12-d14}, [r0], r1 //use 21(16+5), =src[2]
+
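+    // centre (half,half) position: FILTER_6TAG_8BITS_TO_16BITS first applies the vertical 6-tap
+    // and keeps 16-bit intermediates; UNPACK_2_16BITS_TO_ABC / FILTER_3_IN_16BITS_TO_8BITS then
+    // presumably run the horizontal 6-tap over those intermediates and narrow with the final
+    // (+512)>>10 rounding. The 24-byte row loads cover the extra columns the horizontal tap needs.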
+w16_hv_mc_luma_loop:
+
+ vld1.u8 {d15-d17}, [r0], r1 //use 21(16+5), =src[3]
+ //the 1st row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0]
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 5 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1]
+ vst1.u8 {q0}, [r2], r3 //write 16Byte
+
+
+ vld1.u8 {d0-d2}, [r0], r1 //read 2nd row
+ //the 2nd row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d3, d6, d9, d12, d15, d0, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d4, d7,d10, d13, d16, d1,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3 //output to d3
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d5, d8,d11, d14, d17, d2,q11, q14, q15 // only 5 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4 //output to d4
+ vst1.u8 {d3, d4}, [r2], r3 //write 16Byte
+
+ vld1.u8 {d3-d5}, [r0], r1 //read 3rd row
+ //the 3rd row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d6, d9, d12, d15, d0, d3, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d7,d10, d13, d16, d1, d4,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6 //output to d6
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d8,d11, d14, d17, d2, d5,q11, q14, q15 // only 5 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7 //output to d7
+ vst1.u8 {d6, d7}, [r2], r3 //write 16Byte
+
+ vld1.u8 {d6-d8}, [r0], r1 //read 4th row
+ //the 4th row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d9, d12, d15, d0, d3, d6, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d10, d13, d16, d1, d4, d7,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9 //output to d9
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d11, d14, d17, d2, d5, d8,q11, q14, q15 // only 5 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10 //output to d10
+ vst1.u8 {d9, d10}, [r2], r3 //write 16Byte
+
+ //d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14
+ vswp q0, q6
+ vswp q6, q3
+ vmov q5, q2
+ vmov q2, q8
+
+ vmov d20,d8
+ vmov q4, q1
+ vmov q1, q7
+ vmov d14,d20
+
+ sub r4, #4
+ cmp r4, #0
+ bne w16_hv_mc_luma_loop
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w17_hv_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, #2 //src[-2]
+ sub r0, r1, lsl #1 //src[-2*src_stride-2]
+ pld [r0]
+ pld [r0, r1]
+
+ vmov.u16 q14, #0x0014 // 20
+    vld1.u8 {d0-d2}, [r0], r1  //use 22(17+5), =src[-2]
+    vld1.u8 {d3-d5}, [r0], r1  //use 22(17+5), =src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+
+    vld1.u8 {d6-d8}, [r0], r1  //use 22(17+5), =src[0]
+    vld1.u8 {d9-d11}, [r0], r1 //use 22(17+5), =src[1]
+ pld [r0]
+ pld [r0, r1]
+    vld1.u8 {d12-d14}, [r0], r1    //use 22(17+5), =src[2]
+ sub r3, #16
+
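+    // each output row is written as 16 bytes with post-increment plus a single 17th byte,
+    // hence the "sub r3, #16" above so the strided store still advances exactly one row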
+w17_hv_mc_luma_loop:
+
+    vld1.u8 {d15-d17}, [r0], r1    //use 22(17+5), =src[3]
+ //the 1st row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0]
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1]
+ vst1.u8 {d0, d1}, [r2]! //write 16Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d2, d22, d23, q11 //output to d2[0]
+ vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte
+
+ vld1.u8 {d0-d2}, [r0], r1 //read 2nd row
+ //the 2nd row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d3, d6, d9, d12, d15, d0, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d4, d7,d10, d13, d16, d1,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3 //output to d3
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d5, d8,d11, d14, d17, d2,q11, q14, q15 // only 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4 //output to d4
+ vst1.u8 {d3, d4}, [r2]! //write 16Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d5, d22, d23, q11 //output to d5[0]
+ vst1.u8 {d5[0]}, [r2], r3 //write 16th Byte
+
+ vld1.u8 {d3-d5}, [r0], r1 //read 3rd row
+ //the 3rd row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d6, d9, d12, d15, d0, d3, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d7,d10, d13, d16, d1, d4,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6 //output to d6
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d8,d11, d14, d17, d2, d5,q11, q14, q15 // only 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7 //output to d7
+ vst1.u8 {d6, d7}, [r2]! //write 16Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d8, d22, d23, q11 //output to d8[0]
+ vst1.u8 {d8[0]}, [r2], r3 //write 16th Byte
+
+ vld1.u8 {d6-d8}, [r0], r1 //read 4th row
+ //the 4th row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d9, d12, d15, d0, d3, d6, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d10, d13, d16, d1, d4, d7,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9 //output to d9
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d11, d14, d17, d2, d5, d8,q11, q14, q15 // only 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10 //output to d10
+ vst1.u8 {d9, d10}, [r2], r3 //write 16Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d11, d22, d23, q11 //output to d11[0]
+ vst1.u8 {d11[0]}, [r2], r3 //write 16th Byte
+
+ //d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14
+ vswp q0, q6
+ vswp q6, q3
+ vmov q5, q2
+ vmov q2, q8
+
+ vmov d20,d8
+ vmov q4, q1
+ vmov q1, q7
+ vmov d14,d20
+
+ sub r4, #4
+ cmp r4, #1
+ bne w17_hv_mc_luma_loop
+ //the last row
+    vld1.u8 {d15-d17}, [r0], r1    //use 22(17+5), =src[3]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0]
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1]
+ vst1.u8 {q0}, [r2]! //write 16Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d2, d22, d23, q11 //output to d2[0]
+ vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte
+
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w9_hv_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, #2 //src[-2]
+ sub r0, r1, lsl #1 //src[-2*src_stride-2]
+ pld [r0]
+ pld [r0, r1]
+
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {q0}, [r0], r1 //use 14(9+5), =src[-2]
+ vld1.u8 {q1}, [r0], r1 //use 14(9+5), =src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+
+ vld1.u8 {q2}, [r0], r1 //use 14(9+5), =src[0]
+ vld1.u8 {q3}, [r0], r1 //use 14(9+5), =src[1]
+ pld [r0]
+ pld [r0, r1]
+ vld1.u8 {q4}, [r0], r1 //use 14(9+5), =src[2]
+ sub r3, #8
+
+w9_hv_mc_luma_loop:
+
+ vld1.u8 {q5}, [r0], r1 //use 14(9+5), =src[3]
+ //the 1st row
+ pld [r0]
+ // vertical filtered into q6/q7
+ FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d10, q6, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d11, q7, q14, q15 // 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0]
+ vst1.u8 d12, [r2]! //write 8Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d13, d14, d15, q7 //output to d13[0]
+ vst1.u8 {d13[0]}, [r2], r3 //write 8th Byte
+
+ vld1.u8 {q0}, [r0], r1 //read 2nd row
+ //the 2nd row
+ pld [r0]
+ // vertical filtered into q6/q7
+ FILTER_6TAG_8BITS_TO_16BITS d2, d4, d6, d8, d10, d0, q6, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d3, d5, d7, d9, d11, d1, q7, q14, q15 // 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0]
+ vst1.u8 d12, [r2]! //write 8Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d13, d14, d15, q7 //output to d13[0]
+ vst1.u8 {d13[0]}, [r2], r3 //write 8th Byte
+
+ vld1.u8 {q1}, [r0], r1 //read 3rd row
+ //the 3rd row
+ pld [r0]
+ // vertical filtered into q6/q7
+ FILTER_6TAG_8BITS_TO_16BITS d4, d6, d8, d10, d0, d2, q6, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d5, d7, d9, d11, d1, d3, q7, q14, q15 // 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0]
+ vst1.u8 d12, [r2]! //write 8Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d13, d14, d15, q7 //output to d13[0]
+ vst1.u8 {d13[0]}, [r2], r3 //write 8th Byte
+
+ vld1.u8 {q2}, [r0], r1 //read 4th row
+ //the 4th row
+ pld [r0]
+ // vertical filtered into q6/q7
+ FILTER_6TAG_8BITS_TO_16BITS d6, d8, d10, d0, d2, d4, q6, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d7, d9, d11, d1, d3, d5, q7, q14, q15 // 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0]
+ vst1.u8 d12, [r2]! //write 8Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d13, d14, d15, q7 //output to d13[0]
+ vst1.u8 {d13[0]}, [r2], r3 //write 8th Byte
+
+ //q4~q5, q0~q2, --> q0~q4
+ vswp q0, q4
+ vswp q2, q4
+ vmov q3, q1
+ vmov q1, q5
+
+ sub r4, #4
+ cmp r4, #1
+ bne w9_hv_mc_luma_loop
+ //the last row
+ vld1.u8 {q5}, [r0], r1 //use 14(9+5), =src[3]
+ // vertical filtered into q6/q7
+ FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d10, q6, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d11, q7, q14, q15 // 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0]
+ vst1.u8 d12, [r2]! //write 8Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d13, d14, d15, q7 //output to d13[0]
+ vst1.u8 {d13[0]}, [r2], r3 //write 8th Byte
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_hv_neon
+ push {r4 ,r5, r6}
+ ldr r6, [sp, #12]
+
+ sub r0, #2 //src[-2]
+ sub r0, r1, lsl #1 //src[-2*src_stride-2]
+ pld [r0]
+ pld [r0, r1]
+
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {q0}, [r0], r1 //use 9(4+5), =src[-2]
+ vld1.u8 {q1}, [r0], r1 //use 9(4+5), =src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+
+ vld1.u8 {q2}, [r0], r1 //use 9(4+5), =src[0]
+ vld1.u8 {q3}, [r0], r1 //use 9(4+5), =src[1]
+ pld [r0]
+ pld [r0, r1]
+ vld1.u8 {q4}, [r0], r1 //use 9(4+5), =src[2]
+
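+    // 4-wide centre half-pel: the vertical 6-tap keeps 16-bit rows in q7~q10, and the second
+    // row's A/B/C terms are spliced beside the first row's (vmov d23/d25/d27 below) so a single
+    // FILTER_3_IN_16BITS_TO_8BITS call emits two 4-pixel output rows at once.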
+w4_hv_mc_luma_loop:
+
+ vld1.u8 {q5}, [r0], r1 //use 9(4+5), =src[3]
+ vld1.u8 {q6}, [r0], r1 //use 9(4+5), =src[4]
+
+ //the 1st&2nd row
+ pld [r0]
+ pld [r0, r1]
+ // vertical filtered
+ FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d10, q7, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d11, q8, q14, q15 // 1 avail
+
+ FILTER_6TAG_8BITS_TO_16BITS d2, d4, d6, d8,d10, d12, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d3, d5, d7, d9,d11, d13,q10, q14, q15 // 1 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q7, q8, q11, q12, q13 //4 avail
+ UNPACK_2_16BITS_TO_ABC q9,q10, q0, q7, q8 //4 avail
+
+ vmov d23, d0
+ vmov d25, d14
+ vmov d27, d16
+
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22 //output to q11[0]
+ vmov r4 ,r5, d22
+ str r4, [r2], r3 //write 4Byte
+ str r5, [r2], r3 //write 4Byte
+
+ //the 3rd&4th row
+ vld1.u8 {q0}, [r0], r1 //use 9(4+5), =src[3]
+ vld1.u8 {q1}, [r0], r1 //use 9(4+5), =src[4]
+ pld [r0]
+ pld [r0, r1]
+ // vertical filtered
+ FILTER_6TAG_8BITS_TO_16BITS d4, d6, d8, d10, d12, d0, q7, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d5, d7, d9, d11, d13, d1, q8, q14, q15 // 1 avail
+
+ FILTER_6TAG_8BITS_TO_16BITS d6, d8,d10, d12, d0, d2, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d7, d9,d11, d13, d1, d3,q10, q14, q15 // 1 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q7, q8, q11, q12, q13 //4 avail
+ UNPACK_2_16BITS_TO_ABC q9,q10, q2, q7, q8 //4 avail
+
+ vmov d23, d4
+ vmov d25, d14
+ vmov d27, d16
+
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22 //output to q11[0]
+ vmov r4 ,r5, d22
+ str r4, [r2], r3 //write 4Byte
+ str r5, [r2], r3 //write 4Byte
+
+ //q4~q6, q0~q1, --> q0~q4
+ vswp q4, q0
+ vmov q3, q4
+ vmov q4, q1
+ vmov q1, q5
+ vmov q2, q6
+
+ sub r6, #4
+ cmp r6, #0
+ bne w4_hv_mc_luma_loop
+
+ pop {r4, r5, r6}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_copy_w16_neon
+ push {r4}
+ ldr r4, [sp, #4]
+w16_copy_loop:
+ vld1.u8 {q0}, [r0], r1
+ vld1.u8 {q1}, [r0], r1
+ vst1.u8 {q0}, [r2], r3
+ vst1.u8 {q1}, [r2], r3
+ sub r4, #2
+ cmp r4, #0
+ bne w16_copy_loop
+
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_copy_w8_neon
+ push {r4}
+ ldr r4, [sp, #4]
+w8_copy_loop:
+ vld1.u8 {d0}, [r0], r1
+ vld1.u8 {d1}, [r0], r1
+ vst1.u8 {d0}, [r2], r3
+ vst1.u8 {d1}, [r2], r3
+ sub r4, #2
+ cmp r4, #0
+ bne w8_copy_loop
+
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_copy_w4_neon
+ push {r4, r5, r6}
+ ldr r4, [sp, #12]
+w4_copy_loop:
+ ldr r5, [r0], r1
+ ldr r6, [r0], r1
+ str r5, [r2], r3
+ str r6, [r2], r3
+
+ sub r4, #2
+ cmp r4, #0
+ bne w4_copy_loop
+
+ pop {r4, r5, r6}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_pixel_avg_w16_neon
+ push {r4}
+ ldr r4, [sp, #4]
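+    // bi-prediction style block average: AVERAGE_TWO_8BITS presumably performs a rounding
+    // halving add (vrhadd-like) of the two 16-byte inputs; four rows are averaged per pass.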
+w16_pix_avg_loop:
+ vld1.u8 {q0}, [r2]!
+ vld1.u8 {q1}, [r3]!
+ vld1.u8 {q2}, [r2]!
+ vld1.u8 {q3}, [r3]!
+
+ vld1.u8 {q4}, [r2]!
+ vld1.u8 {q5}, [r3]!
+ vld1.u8 {q6}, [r2]!
+ vld1.u8 {q7}, [r3]!
+
+ AVERAGE_TWO_8BITS d0, d0, d2
+ AVERAGE_TWO_8BITS d1, d1, d3
+ vst1.u8 {q0}, [r0], r1
+
+ AVERAGE_TWO_8BITS d4, d4, d6
+ AVERAGE_TWO_8BITS d5, d5, d7
+ vst1.u8 {q2}, [r0], r1
+
+ AVERAGE_TWO_8BITS d8, d8, d10
+ AVERAGE_TWO_8BITS d9, d9, d11
+ vst1.u8 {q4}, [r0], r1
+
+ AVERAGE_TWO_8BITS d12, d12, d14
+ AVERAGE_TWO_8BITS d13, d13, d15
+ vst1.u8 {q6}, [r0], r1
+
+ sub r4, #4
+ cmp r4, #0
+ bne w16_pix_avg_loop
+
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_pix_avg_w16_neon
+ push {r4, r5, r6}
+ ldr r4, [sp, #12]
+ ldr r5, [sp, #16]
+ ldr r6, [sp, #20]
+
+enc_w16_pix_avg_loop:
+ vld1.u8 {q0}, [r2], r3
+ vld1.u8 {q1}, [r4], r5
+ vld1.u8 {q2}, [r2], r3
+ vld1.u8 {q3}, [r4], r5
+
+ vld1.u8 {q4}, [r2], r3
+ vld1.u8 {q5}, [r4], r5
+ vld1.u8 {q6}, [r2], r3
+ vld1.u8 {q7}, [r4], r5
+
+ AVERAGE_TWO_8BITS d0, d0, d2
+ AVERAGE_TWO_8BITS d1, d1, d3
+ vst1.u8 {q0}, [r0], r1
+
+ AVERAGE_TWO_8BITS d4, d4, d6
+ AVERAGE_TWO_8BITS d5, d5, d7
+ vst1.u8 {q2}, [r0], r1
+
+ AVERAGE_TWO_8BITS d8, d8, d10
+ AVERAGE_TWO_8BITS d9, d9, d11
+ vst1.u8 {q4}, [r0], r1
+
+ AVERAGE_TWO_8BITS d12, d12, d14
+ AVERAGE_TWO_8BITS d13, d13, d15
+ vst1.u8 {q6}, [r0], r1
+
+ sub r6, #4
+ cmp r6, #0
+ bne enc_w16_pix_avg_loop
+
+ pop {r4, r5, r6}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_pix_avg_w8_neon
+ push {r4, r5, r6}
+ ldr r4, [sp, #12]
+ ldr r5, [sp, #16]
+ ldr r6, [sp, #20]
+enc_w8_pix_avg_loop:
+
+ vld1.u8 {d0}, [r2], r3
+ vld1.u8 {d2}, [r4], r5
+ vld1.u8 {d1}, [r2], r3
+ vld1.u8 {d3}, [r4], r5
+
+ AVERAGE_TWO_8BITS d0, d0, d2
+ AVERAGE_TWO_8BITS d1, d1, d3
+ vst1.u8 {d0}, [r0], r1
+ vst1.u8 {d1}, [r0], r1
+
+ vld1.u8 {d4}, [r2], r3
+ vld1.u8 {d6}, [r4], r5
+ vld1.u8 {d5}, [r2], r3
+ vld1.u8 {d7}, [r4], r5
+
+ AVERAGE_TWO_8BITS d4, d4, d6
+ AVERAGE_TWO_8BITS d5, d5, d7
+ vst1.u8 {d4}, [r0], r1
+ vst1.u8 {d5}, [r0], r1
+
+ sub r6, #4
+ cmp r6, #0
+ bne enc_w8_pix_avg_loop
+
+ pop {r4, r5, r6}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_chroma_w8_neon
+
+ push {r4, r5}
+ ldr r4, [sp, #8]
+ ldr r5, [sp, #12]
+ vld1.u8 {d31}, [r4] //load A/B/C/D
+ vld1.u8 {q0}, [r0], r1 //src[x]
+
+ vdup.u8 d28, d31[0] //A
+ vdup.u8 d29, d31[1] //B
+ vdup.u8 d30, d31[2] //C
+ vdup.u8 d31, d31[3] //D
+
+ vext.u8 d1, d0, d1, #1 //src[x+1]
+
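+    // chroma interpolation: d28~d31 hold the bilinear weights A/B/C/D (presumably
+    // (8-dx)*(8-dy), dx*(8-dy), (8-dx)*dy and dx*dy); vrshrn #6 applies the (+32)>>6 rounding.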
+w8_mc_chroma_loop:  // each iteration handles two pixel rows
+ vld1.u8 {q1}, [r0], r1 //src[x+stride]
+ vld1.u8 {q2}, [r0], r1 //src[x+2*stride]
+ vext.u8 d3, d2, d3, #1 //src[x+stride+1]
+ vext.u8 d5, d4, d5, #1 //src[x+2*stride+1]
+
+ vmull.u8 q3, d0, d28 //(src[x] * A)
+ vmlal.u8 q3, d1, d29 //+=(src[x+1] * B)
+ vmlal.u8 q3, d2, d30 //+=(src[x+stride] * C)
+ vmlal.u8 q3, d3, d31 //+=(src[x+stride+1] * D)
+ vrshrn.u16 d6, q3, #6
+ vst1.u8 d6, [r2], r3
+
+ vmull.u8 q3, d2, d28 //(src[x] * A)
+ vmlal.u8 q3, d3, d29 //+=(src[x+1] * B)
+ vmlal.u8 q3, d4, d30 //+=(src[x+stride] * C)
+ vmlal.u8 q3, d5, d31 //+=(src[x+stride+1] * D)
+ vrshrn.u16 d6, q3, #6
+ vst1.u8 d6, [r2], r3
+
+ vmov q0, q2
+ sub r5, #2
+ cmp r5, #0
+ bne w8_mc_chroma_loop
+
+ pop {r4, r5}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN enc_mc_chroma_w4_neon
+
+ push {r4, r5, r6}
+ ldr r4, [sp, #12]
+ ldr r6, [sp, #16]
+ vld1.u8 {d31}, [r4] //load A/B/C/D
+
+ vdup.u8 d28, d31[0] //A
+ vdup.u8 d29, d31[1] //B
+ vdup.u8 d30, d31[2] //C
+ vdup.u8 d31, d31[3] //D
+
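+    // 4-wide chroma: three 8-byte rows are loaded, vshr.u64 #8 forms the x+1 taps, and vtrn.32
+    // packs two adjacent rows into each d register so one vmull/vmlal chain produces two
+    // 4-byte output rows per iteration.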
+w4_mc_chroma_loop:  // each iteration handles two pixel rows
+ vld1.u8 {d0}, [r0], r1 //a::src[x]
+ vld1.u8 {d2}, [r0], r1 //b::src[x+stride]
+ vld1.u8 {d4}, [r0] //c::src[x+2*stride]
+
+ vshr.u64 d1, d0, #8
+ vshr.u64 d3, d2, #8
+ vshr.u64 d5, d4, #8
+
+ vmov q3, q1 //b::[0:7]+b::[1~8]
+ vtrn.32 q0, q1 //d0{a::[0:3]+b::[0:3]}; d1{a::[1:4]+b::[1:4]}
+ vtrn.32 q3, q2 //d6{b::[0:3]+c::[0:3]}; d7{b::[1:4]+c::[1:4]}
+
+ vmull.u8 q1, d0, d28 //(src[x] * A)
+ vmlal.u8 q1, d1, d29 //+=(src[x+1] * B)
+ vmlal.u8 q1, d6, d30 //+=(src[x+stride] * C)
+ vmlal.u8 q1, d7, d31 //+=(src[x+stride+1] * D)
+
+ vrshrn.u16 d2, q1, #6
+ vmov r4, r5, d2
+ str r4, [r2], r3
+ str r5, [r2], r3
+
+ sub r6, #2
+ cmp r6, #0
+ bne w4_mc_chroma_loop
+
+ pop {r4, r5, r6}
+WELS_ASM_FUNC_END
+#endif
--- a/codec/encoder/core/arm/memory_neon.S
+++ b/codec/encoder/core/arm/memory_neon.S
@@ -60,4 +60,4 @@
vst1.64 {d0}, [r0]!
WELS_ASM_FUNC_END
-#endif
\ No newline at end of file
+#endif
--- a/codec/encoder/core/arm/pixel_neon.S
+++ b/codec/encoder/core/arm/pixel_neon.S
@@ -35,73 +35,73 @@
#include "arm_arch_common_macro.S"
.macro SATD_16x4
- vld1.64 {q0}, [r0,:128], r1
- vld1.64 {q1}, [r2], r3
+ vld1.64 {q0}, [r0,:128], r1
+ vld1.64 {q1}, [r2], r3
- vsubl.u8 q4, d0, d2
- vld1.64 {q2}, [r0,:128], r1
+ vsubl.u8 q4, d0, d2
+ vld1.64 {q2}, [r0,:128], r1
- vsubl.u8 q6, d1, d3
- vld1.64 {q3}, [r2], r3
+ vsubl.u8 q6, d1, d3
+ vld1.64 {q3}, [r2], r3
- vsubl.u8 q5, d4, d6
- vld1.64 {q0}, [r0,:128], r1
+ vsubl.u8 q5, d4, d6
+ vld1.64 {q0}, [r0,:128], r1
- vsubl.u8 q7, d5, d7
+ vsubl.u8 q7, d5, d7
vld1.64 {q1}, [r2], r3
vsubl.u8 q8, d0, d2
- vld1.64 {q2}, [r0,:128], r1
+ vld1.64 {q2}, [r0,:128], r1
vsubl.u8 q10, d1, d3
- vadd.s16 q0, q4, q5
+ vadd.s16 q0, q4, q5
- vld1.64 {q3}, [r2], r3
- vsub.s16 q1, q4, q5
+ vld1.64 {q3}, [r2], r3
+ vsub.s16 q1, q4, q5
- vsubl.u8 q9, d4, d6
- vsubl.u8 q11, d5, d7
+ vsubl.u8 q9, d4, d6
+ vsubl.u8 q11, d5, d7
- vadd.s16 q2, q8, q9
- vsub.s16 q3, q8, q9
+ vadd.s16 q2, q8, q9
+ vsub.s16 q3, q8, q9
- vadd.s16 q4, q6, q7
+ vadd.s16 q4, q6, q7
vsub.s16 q5, q6, q7
- vadd.s16 q6, q10, q11
- vsub.s16 q7, q10, q11
+ vadd.s16 q6, q10, q11
+ vsub.s16 q7, q10, q11
- vadd.s16 q8, q0, q2
- vsub.s16 q10, q0, q2
+ vadd.s16 q8, q0, q2
+ vsub.s16 q10, q0, q2
- vadd.s16 q9, q4, q6
- vsub.s16 q11, q4, q6
+ vadd.s16 q9, q4, q6
+ vsub.s16 q11, q4, q6
- vsub.s16 q0, q1, q3
- vadd.s16 q2, q1, q3
+ vsub.s16 q0, q1, q3
+ vadd.s16 q2, q1, q3
- vsub.s16 q1, q5, q7
- vadd.s16 q3, q5, q7
+ vsub.s16 q1, q5, q7
+ vadd.s16 q3, q5, q7
- vtrn.16 q8, q10
- vtrn.16 q9, q11
+ vtrn.16 q8, q10
+ vtrn.16 q9, q11
- vadd.s16 q4, q8, q10
- vabd.s16 q6, q8, q10
+ vadd.s16 q4, q8, q10
+ vabd.s16 q6, q8, q10
- vadd.s16 q5, q9, q11
- vabd.s16 q7, q9, q11
+ vadd.s16 q5, q9, q11
+ vabd.s16 q7, q9, q11
vabs.s16 q4, q4
vabs.s16 q5, q5
- vtrn.16 q0, q2
- vtrn.16 q1, q3
+ vtrn.16 q0, q2
+ vtrn.16 q1, q3
- vadd.s16 q8, q0, q2
- vabd.s16 q10, q0, q2
+ vadd.s16 q8, q0, q2
+ vabd.s16 q10, q0, q2
- vadd.s16 q9, q1, q3
+ vadd.s16 q9, q1, q3
vabd.s16 q11, q1, q3
vabs.s16 q8, q8
@@ -128,31 +128,31 @@
vld1.64 {d1}, [r2], r3
vld1.64 {d2}, [r0,:64], r1
- vsubl.u8 q4, d0, d1
+ vsubl.u8 q4, d0, d1
vld1.64 {d3}, [r2], r3
- vsubl.u8 q5, d2, d3
+ vsubl.u8 q5, d2, d3
vld1.64 {d4}, [r0,:64], r1
vld1.64 {d5}, [r2], r3
- vadd.s16 q8, q4, q5
- vsubl.u8 q6, d4, d5
+ vadd.s16 q8, q4, q5
+ vsubl.u8 q6, d4, d5
vld1.64 {d6}, [r0,:64], r1
vld1.64 {d7}, [r2], r3
- vsubl.u8 q7, d6, d7
- vsub.s16 q9, q4, q5
+ vsubl.u8 q7, d6, d7
+ vsub.s16 q9, q4, q5
- vadd.s16 q10, q6, q7
- vsub.s16 q11, q6, q7
+ vadd.s16 q10, q6, q7
+ vsub.s16 q11, q6, q7
- vadd.s16 q0, q8, q10
- vsub.s16 q1, q8, q10
+ vadd.s16 q0, q8, q10
+ vsub.s16 q1, q8, q10
- vsub.s16 q2, q9, q11
- vadd.s16 q3, q9, q11
+ vsub.s16 q2, q9, q11
+ vadd.s16 q3, q9, q11
vtrn.16 q0, q1
vtrn.16 q2, q3
@@ -220,7 +220,7 @@
.endm
-WELS_ASM_FUNC_BEGIN pixel_sad_16x16_neon
+WELS_ASM_FUNC_BEGIN WelsSampleSad16x16_neon
vld1.64 {q0}, [r0, :128], r1
vld1.64 {q1}, [r2], r3
@@ -260,7 +260,7 @@
WELS_ASM_FUNC_END
-WELS_ASM_FUNC_BEGIN pixel_sad_16x8_neon
+WELS_ASM_FUNC_BEGIN WelsSampleSad16x8_neon
vld1.64 {q0}, [r0, :128], r1
vld1.64 {q1}, [r2], r3
@@ -298,7 +298,7 @@
WELS_ASM_FUNC_END
-WELS_ASM_FUNC_BEGIN pixel_sad_8x16_neon
+WELS_ASM_FUNC_BEGIN WelsSampleSad8x16_neon
vld1.64 {d0}, [r0, :64], r1
vld1.64 {d1}, [r2], r3
@@ -332,7 +332,7 @@
WELS_ASM_FUNC_END
-WELS_ASM_FUNC_BEGIN pixel_sad_8x8_neon
+WELS_ASM_FUNC_BEGIN WelsSampleSad8x8_neon
vld1.64 {d0}, [r0, :64], r1
vld1.64 {d1}, [r2], r3
@@ -364,7 +364,7 @@
WELS_ASM_FUNC_END
-WELS_ASM_FUNC_BEGIN pixel_sad_4x4_neon
+WELS_ASM_FUNC_BEGIN WelsSampleSad4x4_neon
stmdb sp!, {r4-r5, lr}
//Loading a horizontal line data (4 bytes)
@@ -376,23 +376,23 @@
//line 1
ldr r4, [r0], r1
ldr r5, [r2], r3
- usada8 lr, r4, r5, lr
+ usada8 lr, r4, r5, lr
- //line 2
+ //line 2
ldr r4, [r0], r1
ldr r5, [r2], r3
- usada8 lr, r4, r5, lr
-
+ usada8 lr, r4, r5, lr
+
//line 3
ldr r4, [r0]
ldr r5, [r2]
- usada8 r0, r4, r5, lr
+ usada8 r0, r4, r5, lr
ldmia sp!, {r4-r5, lr}
WELS_ASM_FUNC_END
-WELS_ASM_FUNC_BEGIN pixel_sad_4_16x16_neon
+WELS_ASM_FUNC_BEGIN WelsSampleSadFour16x16_neon
stmdb sp!, {r4-r5, lr}
@@ -400,30 +400,30 @@
sub r4, r2, #1
add r5, r2, #1
sub r2, r3
-
+
//Loading a horizontal line data (16 bytes)
vld1.8 {q0}, [r0], r1 //save pix1
-
+
vld1.8 {q1}, [r2], r3 //save pix2 - stride
vld1.8 {q6}, [r2], r3 //save pix2
vld1.8 {q2}, [r2], r3 //save pix2 + stride
-
+
vld1.8 {q3}, [r4], r3 //save pix2 - 1
- vld1.8 {q4}, [r5], r3 //save pix2 + 1
-
+ vld1.8 {q4}, [r5], r3 //save pix2 + 1
+
//Do the SAD for 16 bytes
vabdl.u8 q15, d0, d2
vabal.u8 q15, d1, d3
-
+
vabdl.u8 q13, d0, d4
vabal.u8 q13, d1, d5
-
+
vabdl.u8 q11, d0, d6
vabal.u8 q11, d1, d7
-
+
vabdl.u8 q9, d0, d8
- vabal.u8 q9, d1, d9
-
+ vabal.u8 q9, d1, d9
+
mov lr, #15
pixel_sad_4_16x16_loop_0:
@@ -436,13 +436,13 @@
vabal.u8 q15, d1, d3
vld1.8 {q3}, [r4], r3 //save pix2 - 1
vabal.u8 q13, d0, d4
- vld1.8 {q4}, [r5], r3 //save pix2 + 1
+ vld1.8 {q4}, [r5], r3 //save pix2 + 1
vabal.u8 q13, d1, d5
subs lr, #1
vabal.u8 q11, d0, d6
vabal.u8 q11, d1, d7
-
+
vabal.u8 q9, d0, d8
vabal.u8 q9, d1, d9
@@ -451,18 +451,18 @@
//Save SAD to 'r0'
ldr r0, [sp, #12]
-
+
vadd.u16 d0, d30, d31
vadd.u16 d1, d26, d27
vadd.u16 d2, d22, d23
vadd.u16 d3, d18, d19
-
+
vpaddl.u16 q0, q0
vpaddl.u16 q1, q1
-
+
vpaddl.u32 q0, q0
vpaddl.u32 q1, q1
-
+
vshl.u32 q0, #4
vshl.u32 q1, #4
vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
@@ -471,37 +471,37 @@
WELS_ASM_FUNC_END
-WELS_ASM_FUNC_BEGIN pixel_sad_4_16x8_neon
+WELS_ASM_FUNC_BEGIN WelsSampleSadFour16x8_neon
stmdb sp!, {r4-r5, lr}
-
+
//Generate the pix2 start addr
sub r4, r2, #1
add r5, r2, #1
sub r2, r3
-
+
//Loading a horizontal line data (16 bytes)
vld1.8 {q0}, [r0], r1 //save pix1
-
+
vld1.8 {q1}, [r2], r3 //save pix2 - stride
vld1.8 {q6}, [r2], r3 //save pix2
vld1.8 {q2}, [r2], r3 //save pix2 + stride
-
+
vld1.8 {q3}, [r4], r3 //save pix2 - 1
- vld1.8 {q4}, [r5], r3 //save pix2 + 1
-
+ vld1.8 {q4}, [r5], r3 //save pix2 + 1
+
//Do the SAD for 16 bytes
vabdl.u8 q15, d0, d2
vabal.u8 q15, d1, d3
-
+
vabdl.u8 q13, d0, d4
vabal.u8 q13, d1, d5
-
+
vabdl.u8 q11, d0, d6
vabal.u8 q11, d1, d7
-
+
vabdl.u8 q9, d0, d8
- vabal.u8 q9, d1, d9
-
+ vabal.u8 q9, d1, d9
+
mov lr, #7
pixel_sad_4_16x8_loop_0:
@@ -514,67 +514,67 @@
vabal.u8 q15, d1, d3
vld1.8 {q3}, [r4], r3 //save pix2 - 1
vabal.u8 q13, d0, d4
- vld1.8 {q4}, [r5], r3 //save pix2 + 1
+ vld1.8 {q4}, [r5], r3 //save pix2 + 1
vabal.u8 q13, d1, d5
subs lr, #1
vabal.u8 q11, d0, d6
vabal.u8 q11, d1, d7
-
+
vabal.u8 q9, d0, d8
vabal.u8 q9, d1, d9
-
+
bne pixel_sad_4_16x8_loop_0
//Save SAD to 'r0'
ldr r0, [sp, #12]
-
+
vadd.u16 d0, d30, d31
vadd.u16 d1, d26, d27
vadd.u16 d2, d22, d23
vadd.u16 d3, d18, d19
-
+
vpaddl.u16 q0, q0
vpaddl.u16 q1, q1
-
+
vpaddl.u32 q0, q0
vpaddl.u32 q1, q1
-
+
vshl.u32 q0, #4
vshl.u32 q1, #4
vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
-
+
ldmia sp!, {r4-r5, lr}
WELS_ASM_FUNC_END
-
-WELS_ASM_FUNC_BEGIN pixel_sad_4_8x16_neon
+
+WELS_ASM_FUNC_BEGIN WelsSampleSadFour8x16_neon
stmdb sp!, {r4-r5, lr}
-
+
//Generate the pix2 start addr
sub r4, r2, #1
add r5, r2, #1
sub r2, r3
-
+
//Loading a horizontal line data (8 bytes)
vld1.8 {d0}, [r0], r1 //save pix1
-
+
vld1.8 {d1}, [r2], r3 //save pix2 - stride
vld1.8 {d6}, [r2], r3 //save pix2
vld1.8 {d2}, [r2], r3 //save pix2 + stride
-
+
vld1.8 {d3}, [r4], r3 //save pix2 - 1
- vld1.8 {d4}, [r5], r3 //save pix2 + 1
-
+ vld1.8 {d4}, [r5], r3 //save pix2 + 1
+
//Do the SAD for 8 bytes
vabdl.u8 q15, d0, d1
vabdl.u8 q14, d0, d2
vabdl.u8 q13, d0, d3
- vabdl.u8 q12, d0, d4
-
+ vabdl.u8 q12, d0, d4
+
mov lr, #15
pixel_sad_4_8x16_loop_0:
-
+
//Loading a horizontal line data (8 bytes)
vld1.8 {d0}, [r0], r1 //save pix1
vmov.8 d1, d6 //save pix2 - stride
@@ -582,7 +582,7 @@
vld1.8 {d2}, [r2], r3 //save pix2 + stride
vld1.8 {d3}, [r4], r3 //save pix2 - 1
vabal.u8 q15, d0, d1
-
+
vld1.8 {d4}, [r5], r3 //save pix2 + 1
//Do the SAD for 8 bytes
vabal.u8 q14, d0, d2
@@ -594,50 +594,50 @@
//Save SAD to 'r0'
ldr r0, [sp, #12]
-
+
vadd.u16 d0, d30, d31
vadd.u16 d1, d28, d29
vadd.u16 d2, d26, d27
vadd.u16 d3, d24, d25
-
+
vpaddl.u16 q0, q0
vpaddl.u16 q1, q1
-
+
vpaddl.u32 q0, q0
vpaddl.u32 q1, q1
-
+
vshl.u32 q0, #4
vshl.u32 q1, #4
vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
-
+
ldmia sp!, {r4-r5, lr}
WELS_ASM_FUNC_END
-WELS_ASM_FUNC_BEGIN pixel_sad_4_8x8_neon
+WELS_ASM_FUNC_BEGIN WelsSampleSadFour8x8_neon
stmdb sp!, {r4-r5, lr}
-
+
//Generate the pix2 start addr
sub r4, r2, #1
add r5, r2, #1
sub r2, r3
-
+
//Loading a horizontal line data (8 bytes)
vld1.8 {d0}, [r0], r1 //save pix1
-
+
vld1.8 {d1}, [r2], r3 //save pix2 - stride
vld1.8 {d6}, [r2], r3 //save pix2
vld1.8 {d2}, [r2], r3 //save pix2 + stride
-
+
vld1.8 {d3}, [r4], r3 //save pix2 - 1
- vld1.8 {d4}, [r5], r3 //save pix2 + 1
-
+ vld1.8 {d4}, [r5], r3 //save pix2 + 1
+
//Do the SAD for 8 bytes
vabdl.u8 q15, d0, d1
vabdl.u8 q14, d0, d2
vabdl.u8 q13, d0, d3
- vabdl.u8 q12, d0, d4
-
+ vabdl.u8 q12, d0, d4
+
mov lr, #7
pixel_sad_4_8x8_loop_0:
@@ -648,7 +648,7 @@
vld1.8 {d2}, [r2], r3 //save pix2 + stride
vld1.8 {d3}, [r4], r3 //save pix2 - 1
vabal.u8 q15, d0, d1
-
+
vld1.8 {d4}, [r5], r3 //save pix2 + 1
//Do the SAD for 8 bytes
vabal.u8 q14, d0, d2
@@ -659,34 +659,34 @@
//Save SAD to 'r0'
ldr r0, [sp, #12]
-
+
vadd.u16 d0, d30, d31
vadd.u16 d1, d28, d29
vadd.u16 d2, d26, d27
vadd.u16 d3, d24, d25
-
+
vpaddl.u16 q0, q0
vpaddl.u16 q1, q1
-
+
vpaddl.u32 q0, q0
vpaddl.u32 q1, q1
-
+
vshl.u32 q0, #4
vshl.u32 q1, #4
vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
-
+
ldmia sp!, {r4-r5, lr}
WELS_ASM_FUNC_END
-WELS_ASM_FUNC_BEGIN pixel_sad_4_4x4_neon
+WELS_ASM_FUNC_BEGIN WelsSampleSadFour4x4_neon
vld1.32 {d0[0]}, [r0], r1
vld1.32 {d0[1]}, [r0], r1
vld1.32 {d1[0]}, [r0], r1
vld1.32 {d1[1]}, [r0]
-
-
+
+
sub r0, r2, r3
vld1.32 {d2[0]}, [r0], r3
vld1.32 {d2[1]}, [r0], r3
@@ -693,32 +693,32 @@
vld1.32 {d3[0]}, [r0], r3
vld1.32 {d3[1]}, [r0], r3
vld1.32 {d4[0]}, [r0], r3
- vld1.32 {d4[1]}, [r0]
-
- sub r0, r2, #1
+ vld1.32 {d4[1]}, [r0]
+
+ sub r0, r2, #1
vld1.32 {d5[0]}, [r0], r3
vld1.32 {d5[1]}, [r0], r3
vld1.32 {d6[0]}, [r0], r3
- vld1.32 {d6[1]}, [r0]
-
- add r0, r2, #1
+ vld1.32 {d6[1]}, [r0]
+
+ add r0, r2, #1
vld1.32 {d7[0]}, [r0], r3
vld1.32 {d7[1]}, [r0], r3
vld1.32 {d8[0]}, [r0], r3
vld1.32 {d8[1]}, [r0]
-
+
vabdl.u8 q15, d0, d2
vabdl.u8 q14, d1, d3
-
+
vabdl.u8 q13, d0, d3
vabdl.u8 q12, d1, d4
-
+
vabdl.u8 q11, d0, d5
vabdl.u8 q10, d1, d6
-
+
vabdl.u8 q9, d0, d7
vabdl.u8 q8, d1, d8
-
+
//Save SAD to 'r4'
ldr r0, [sp]
vadd.u16 q0, q14, q15
@@ -725,18 +725,18 @@
vadd.u16 q1, q12, q13
vadd.u16 q2, q10, q11
vadd.u16 q3, q8 , q9
-
+
vadd.u16 d0, d1
vadd.u16 d1, d2, d3
vadd.u16 d2, d4, d5
vadd.u16 d3, d6, d7
-
+
vpaddl.u16 q0, q0
vpaddl.u16 q1, q1
-
+
vpaddl.u32 q0, q0
vpaddl.u32 q1, q1
-
+
vshl.u32 q0, #4
vshl.u32 q1, #4
vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]
@@ -744,7 +744,7 @@
WELS_ASM_FUNC_END
-WELS_ASM_FUNC_BEGIN pixel_satd_16x16_neon
+WELS_ASM_FUNC_BEGIN WelsSampleSatd16x16_neon
SATD_16x4
vadd.u16 q15, q0, q2
@@ -769,7 +769,7 @@
WELS_ASM_FUNC_END
-WELS_ASM_FUNC_BEGIN pixel_satd_16x8_neon
+WELS_ASM_FUNC_BEGIN WelsSampleSatd16x8_neon
SATD_16x4
vadd.u16 q15, q0, q2
@@ -786,7 +786,7 @@
WELS_ASM_FUNC_END
-WELS_ASM_FUNC_BEGIN pixel_satd_8x16_neon
+WELS_ASM_FUNC_BEGIN WelsSampleSatd8x16_neon
SATD_8x4
vadd.u16 q15, q0, q1
@@ -811,7 +811,7 @@
WELS_ASM_FUNC_END
-WELS_ASM_FUNC_BEGIN pixel_satd_8x8_neon
+WELS_ASM_FUNC_BEGIN WelsSampleSatd8x8_neon
SATD_8x4
vadd.u16 q15, q0, q1
@@ -828,7 +828,7 @@
WELS_ASM_FUNC_END
-WELS_ASM_FUNC_BEGIN pixel_satd_4x4_neon
+WELS_ASM_FUNC_BEGIN WelsSampleSatd4x4_neon
//Load the pix1 data --- 16 bytes
vld1.32 {d0[0]}, [r0], r1
@@ -836,11 +836,11 @@
vld1.32 {d1[0]}, [r0], r1
vld1.32 {d1[1]}, [r0]
- //Load the pix2 data --- 16 bytes
+ //Load the pix2 data --- 16 bytes
vld1.32 {d2[0]}, [r2], r3
vld1.32 {d2[1]}, [r2], r3
vld1.32 {d3[0]}, [r2], r3
- vld1.32 {d3[1]}, [r2]
+ vld1.32 {d3[1]}, [r2]
//Get the difference
vsubl.u8 q15, d0, d2 //{0,1,2,3,4,5,6,7}
@@ -861,15 +861,15 @@
vtrn.16 q13, q12
vadd.s16 q15, q13, q12
- //Do the SAD
- vabs.s16 q15, q15
+ //Do the SAD
+ vabs.s16 q15, q15
vabd.s16 q14, q13, q12
vadd.u16 q0, q15, q14
vrhadd.u16 d0, d1
- vpaddl.u16 d0, d0
- vpaddl.u32 d0, d0
+ vpaddl.u16 d0, d0
+ vpaddl.u32 d0, d0
vmov.u32 r0, d0[0]
--- a/codec/encoder/core/arm/reconstruct_neon.S
+++ b/codec/encoder/core/arm/reconstruct_neon.S
@@ -1,1312 +1,1312 @@
-/*!
- * \copy
- * Copyright (c) 2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#ifdef HAVE_NEON
-.text
-#include "arm_arch_common_macro.S"
-
-#ifdef APPLE_IOS
-.macro LORD_ALIGNED_DATA_WITH_STRIDE
-// { // input: $0~$3, src*, src_stride
- vld1.64 {$0}, [$4,:128], $5
- vld1.64 {$1}, [$4,:128], $5
- vld1.64 {$2}, [$4,:128], $5
- vld1.64 {$3}, [$4,:128], $5
-// }
-.endm
-
-.macro STORE_ALIGNED_DATA_WITH_STRIDE
-// { // input: $0~$3, dst*, dst_stride
- vst1.64 {$0}, [$4,:128], $5
- vst1.64 {$1}, [$4,:128], $5
- vst1.64 {$2}, [$4,:128], $5
- vst1.64 {$3}, [$4,:128], $5
-// }
-.endm
-
-.macro LORD_UNALIGNED_DATA_WITH_STRIDE
-// { // input: $0~$3, src*, src_stride
- vld1.64 {$0}, [$4], $5
- vld1.64 {$1}, [$4], $5
- vld1.64 {$2}, [$4], $5
- vld1.64 {$3}, [$4], $5
-// }
-.endm
-
-.macro STORE_UNALIGNED_DATA_WITH_STRIDE
-// { // input: $0~$3, dst*, dst_stride
- vst1.64 {$0}, [$4], $5
- vst1.64 {$1}, [$4], $5
- vst1.64 {$2}, [$4], $5
- vst1.64 {$3}, [$4], $5
-// }
-.endm
-
-.macro LOAD_4x4_DATA_FOR_DCT
-// { // input: $0~$3, src1*, src1_stride, src2*, src2_stride
- vld2.16 {$0[0],$1[0]}, [$4], $5
- vld2.16 {$2[0],$3[0]}, [$6], $7
- vld2.16 {$0[1],$1[1]}, [$4], $5
- vld2.16 {$2[1],$3[1]}, [$6], $7
-
- vld2.16 {$0[2],$1[2]}, [$4], $5
- vld2.16 {$2[2],$3[2]}, [$6], $7
- vld2.16 {$0[3],$1[3]}, [$4], $5
- vld2.16 {$2[3],$3[3]}, [$6], $7
-// }
-.endm
-
-.macro LOAD_8x8_DATA_FOR_DCT
-// { // input: $0~$3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride
- vld1.64 {$0}, [$8], r2
- vld1.64 {$4}, [$9], r4
- vld1.64 {$1}, [$8], r2
- vld1.64 {$5}, [$9], r4
-
- vld1.64 {$2}, [$8], r2
- vld1.64 {$6}, [$9], r4
- vld1.64 {$3}, [$8], r2
- vld1.64 {$7}, [$9], r4
-// }
-.endm
-
-.macro DCT_ROW_TRANSFORM_TOTAL_16BITS
-// { // input: src_d[0]~[3], working: [4]~[7]
- vadd.s16 $4, $0, $3 //int16 s[0] = data[i] + data[i3];
- vsub.s16 $7, $0, $3 //int16 s[3] = data[i] - data[i3];
- vadd.s16 $5, $1, $2 //int16 s[1] = data[i1] + data[i2];
- vsub.s16 $6, $1, $2 //int16 s[2] = data[i1] - data[i2];
-
- vadd.s16 $0, $4, $5 //int16 dct[i ] = s[0] + s[1];
- vsub.s16 $2, $4, $5 //int16 dct[i2] = s[0] - s[1];
- vshl.s16 $1, $7, #1
- vshl.s16 $3, $6, #1
- vadd.s16 $1, $1, $6 //int16 dct[i1] = (s[3] << 1) + s[2];
- vsub.s16 $3, $7, $3 //int16 dct[i3] = s[3] - (s[2] << 1);
-// }
-.endm
-
-.macro MATRIX_TRANSFORM_EACH_16BITS
-// { // input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
- vtrn.s16 $0, $1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
- vtrn.s16 $2, $3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
- vtrn.32 $0, $2 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
- vtrn.32 $1, $3 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
-// }
-.endm
-
-.macro NEWQUANT_COEF_EACH_16BITS // if coef <= 0, - coef; else , coef;
-// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0, md_d1
- veor.s16 $6, $6 // init 0 , and keep 0;
- vaba.s16 $1, $0, $6 // f + abs(coef - 0)
- vmull.s16 $7, $2, $4
- vmull.s16 $8, $3, $5
- vshr.s32 $7, #16
- vshr.s32 $8, #16
- vmovn.s32 $2, $7
- vmovn.s32 $3, $8
-
- vcgt.s16 $7, $0, #0 // if true, location of coef == 11111111
- vbif.s16 $6, $1, $7 // if (x<0) reserved part; else keep 0 untouched
- vshl.s16 $6, #1
- vsub.s16 $1, $1, $6 // if x > 0, -= 0; else x-= 2x
-// }
-.endm
-
-.macro NEWQUANT_COEF_EACH_16BITS_MAX // if coef <= 0, - coef; else , coef;
-// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0(max), md_d1
- veor.s16 $6, $6 // init 0 , and keep 0;
- vaba.s16 $1, $0, $6 // f + abs(coef - 0)
- vmull.s16 $7, $2, $4
- vmull.s16 $8, $3, $5
- vshr.s32 $7, #16
- vshr.s32 $8, #16
- vmovn.s32 $2, $7
- vmovn.s32 $3, $8
-
- vcgt.s16 $7, $0, #0 // if true, location of coef == 11111111
- vbif.s16 $6, $1, $7 // if (x<0) reserved part; else keep 0 untouched
- vshl.s16 $6, #1
- vmax.s16 $9, $2, $3
- vsub.s16 $1, $1, $6 // if x > 0, -= 0; else x-= 2x
-// }
-.endm
-
-.macro QUANT_DUALWORD_COEF_EACH_16BITS // if coef <= 0, - coef; else , coef;
-// { // input: coef, ff (dst), mf , working_d (all 0), working_q
- vaba.s16 $1, $0, $3 // f + abs(coef - 0)
- vmull.s16 $4, $1, $2 // *= mf
- vshr.s32 $4, #16
- vmovn.s32 $1, $4 // >> 16
-
- vcgt.s16 $2, $0, #0 // if true, location of coef == 11111111
- vbif.s16 $3, $1, $2 // if (x<0) reserved part; else keep 0 untouched
- vshl.s16 $3, #1
- vsub.s16 $1, $1, $3 // if x > 0, -= 0; else x-= 2x
-// }
-.endm
-
-.macro DC_ZERO_COUNT_IN_DUALWORD
-// { // input: coef, dst_d, working_d (all 0x01)
- vceq.s16 $1, $0, #0
- vand.s16 $1, $2
- vpadd.s16 $1, $1, $1
- vpadd.s16 $1, $1, $1
-// }
-.endm
-
-.macro SELECT_MAX_IN_ABS_COEF
-// { // input: coef_0, coef_1, max_q (identy to follow two)
- vmax.s16 $2, $0, $1 // max 1st in $3 & max 2nd in $4
- vpmax.s16 $3, $3, $4 // max 1st in $3[0][1] & max 2nd in $3[2][3]
- vpmax.s16 $3, $3, $4 // max 1st in $3[0][1]
-// }
-.endm
-
-.macro ZERO_COUNT_IN_2_QUARWORD
-// { // input: coef_0 (identy to $3 $4), coef_1(identy to $5 $6), mask_q
- vceq.s16 $0, #0
- vceq.s16 $1, #0
- vand.s16 $0, $2
- vand.s16 $1, $2
-
- vpadd.s16 $3, $3, $5
- vpadd.s16 $4, $4, $6
- vpadd.s16 $3, $3, $4 // 8-->4
- vpadd.s16 $3, $3, $3
- vpadd.s16 $3, $3, $3
-// }
-.endm
-
-.macro HDM_QUANT_2x2_TOTAL_16BITS
-// { // input: src_d[0]~[3], working_d, dst_d
- vshr.s64 $1, $0, #32
- vadd.s16 $2, $0, $1 // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
- vsub.s16 $1, $0, $1 // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];
- vtrn.s16 $2, $1
- vtrn.s32 $2, $1
-// }
-.endm
-
-.macro IHDM_4x4_TOTAL_16BITS
-// { // input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2
- vshr.s64 $1, $0, #32
- vadd.s16 $2, $0, $1 // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];
- vsub.s16 $1, $0, $1 // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];
- vtrn.s16 $2, $1
- vrev32.16 $1, $1
- vtrn.s32 $2, $1 // [0] = rs[0] + rs[2];[1] = rs[0] - rs[2];[2] = rs[1] - rs[3];[3] = rs[1] + rs[3];
-
- vrev64.16 $1, $2
- vadd.s16 $0, $2, $1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];
- vsub.s16 $1, $2, $1
- vrev32.16 $1, $1 // [0] = rs[1] - rs[2];[1] = rs[0] - rs[3];
- vtrn.s32 $0, $1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];[2] = rs[1] - rs[2];[3] = rs[0] - rs[3];
-// }
-.endm
-
-.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP
-// { // input: pred_d[0]/[1](output), dct_q0/1, working_q0/1;
- vmovl.u8 $4,$0
- vmovl.u8 $5,$1
- vadd.s16 $4,$2
- vadd.s16 $5,$3
- vqmovun.s16 $0,$4
- vqmovun.s16 $1,$5
-// }
-.endm
-
-.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS
-// { // input: src_d[0]~[3], output: e_d[0]~[3];
- vadd.s16 $4, $0, $2 //int16 e[i][0] = src[0] + src[2];
- vsub.s16 $5, $0, $2 //int16 e[i][1] = src[0] - src[2];
- vshr.s16 $6, $1, #1
- vshr.s16 $7, $3, #1
- vsub.s16 $6, $6, $3 //int16 e[i][2] = (src[1]>>1)-src[3];
- vadd.s16 $7, $1, $7 //int16 e[i][3] = src[1] + (src[3]>>1);
-// }
-.endm
-
-.macro TRANSFORM_TOTAL_16BITS // both row & col transform used
-// { // output: f_q[0]~[3], input: e_q[0]~[3];
- vadd.s16 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3];
- vadd.s16 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2];
- vsub.s16 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2];
- vsub.s16 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3];
-// }
-.endm
-
-
-.macro ROW_TRANSFORM_0_STEP
-// { // input: src_d[0]~[3], output: e_q[0]~[3];
- vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2];
- vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2];
- vsubl.s16 $6, $1, $3 //int32 e[i][2] = src[1] - src[3];
- vaddl.s16 $7, $1, $3 //int32 e[i][3] = src[1] + src[3];
-// }
-.endm
-
-.macro ROW_TRANSFORM_1_STEP
-// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
- vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2];
- vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2];
- vshr.s16 $8, $1, #1
- vshr.s16 $9, $3, #1
- vsubl.s16 $6, $8, $3 //int32 e[i][2] = (src[1]>>1)-src[3];
- vaddl.s16 $7, $1, $9 //int32 e[i][3] = src[1] + (src[3]>>1);
-// }
-.endm
-
-.macro TRANSFORM_4BYTES // both row & col transform used
-// { // output: f_q[0]~[3], input: e_q[0]~[3];
- vadd.s32 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3];
- vadd.s32 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2];
- vsub.s32 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2];
- vsub.s32 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3];
-// }
-.endm
-
-.macro COL_TRANSFORM_0_STEP
-// { // input: src_q[0]~[3], output: e_q[0]~[3];
- vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j];
- vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j];
- vsub.s32 $6, $1, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
- vadd.s32 $7, $1, $3 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-// }
-.endm
-
-.macro COL_TRANSFORM_1_STEP
-// { // input: src_q[0]~[3], output: e_q[0]~[3];
- vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j];
- vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j];
- vshr.s32 $6, $1, #1
- vshr.s32 $7, $3, #1
- vsub.s32 $6, $6, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
- vadd.s32 $7, $1, $7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-// }
-.endm
-#else
-.macro LORD_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
-// { // input: \arg0~\arg3, src*, src_stride
- vld1.64 {\arg0}, [\arg4,:128], \arg5
- vld1.64 {\arg1}, [\arg4,:128], \arg5
- vld1.64 {\arg2}, [\arg4,:128], \arg5
- vld1.64 {\arg3}, [\arg4,:128], \arg5
-// }
-.endm
-
-.macro STORE_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
-// { // input: \arg0~\arg3, dst*, dst_stride
- vst1.64 {\arg0}, [\arg4,:128], \arg5
- vst1.64 {\arg1}, [\arg4,:128], \arg5
- vst1.64 {\arg2}, [\arg4,:128], \arg5
- vst1.64 {\arg3}, [\arg4,:128], \arg5
-// }
-.endm
-
-.macro LORD_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
-// { // input: \arg0~\arg3, src*, src_stride
- vld1.64 {\arg0}, [\arg4], \arg5
- vld1.64 {\arg1}, [\arg4], \arg5
- vld1.64 {\arg2}, [\arg4], \arg5
- vld1.64 {\arg3}, [\arg4], \arg5
-// }
-.endm
-
-.macro STORE_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
-// { // input: \arg0~\arg3, dst*, dst_stride
- vst1.64 {\arg0}, [\arg4], \arg5
- vst1.64 {\arg1}, [\arg4], \arg5
- vst1.64 {\arg2}, [\arg4], \arg5
- vst1.64 {\arg3}, [\arg4], \arg5
-// }
-.endm
-
-.macro LOAD_4x4_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-// { // input: \arg0~\arg3, src1*, src1_stride, src2*, src2_stride
- vld2.16 {\arg0[0],\arg1[0]}, [\arg4], \arg5
- vld2.16 {\arg2[0],\arg3[0]}, [\arg6], \arg7
- vld2.16 {\arg0[1],\arg1[1]}, [\arg4], \arg5
- vld2.16 {\arg2[1],\arg3[1]}, [\arg6], \arg7
-
- vld2.16 {\arg0[2],\arg1[2]}, [\arg4], \arg5
- vld2.16 {\arg2[2],\arg3[2]}, [\arg6], \arg7
- vld2.16 {\arg0[3],\arg1[3]}, [\arg4], \arg5
- vld2.16 {\arg2[3],\arg3[3]}, [\arg6], \arg7
-// }
-.endm
-
-.macro LOAD_8x8_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
-// { // input: \arg0~\arg3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride
- vld1.64 {\arg0}, [\arg8], r2
- vld1.64 {\arg4}, [\arg9], r4
- vld1.64 {\arg1}, [\arg8], r2
- vld1.64 {\arg5}, [\arg9], r4
-
- vld1.64 {\arg2}, [\arg8], r2
- vld1.64 {\arg6}, [\arg9], r4
- vld1.64 {\arg3}, [\arg8], r2
- vld1.64 {\arg7}, [\arg9], r4
-// }
-.endm
-
-.macro DCT_ROW_TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-// { // input: src_d[0]~[3], working: [4]~[7]
- vadd.s16 \arg4, \arg0, \arg3 //int16 s[0] = data[i] + data[i3];
- vsub.s16 \arg7, \arg0, \arg3 //int16 s[3] = data[i] - data[i3];
- vadd.s16 \arg5, \arg1, \arg2 //int16 s[1] = data[i1] + data[i2];
- vsub.s16 \arg6, \arg1, \arg2 //int16 s[2] = data[i1] - data[i2];
-
- vadd.s16 \arg0, \arg4, \arg5 //int16 dct[i ] = s[0] + s[1];
- vsub.s16 \arg2, \arg4, \arg5 //int16 dct[i2] = s[0] - s[1];
- vshl.s16 \arg1, \arg7, #1
- vshl.s16 \arg3, \arg6, #1
- vadd.s16 \arg1, \arg1, \arg6 //int16 dct[i1] = (s[3] << 1) + s[2];
- vsub.s16 \arg3, \arg7, \arg3 //int16 dct[i3] = s[3] - (s[2] << 1);
-// }
-.endm
-
-.macro MATRIX_TRANSFORM_EACH_16BITS arg0, arg1, arg2, arg3
-// { // input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
- vtrn.s16 \arg0, \arg1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
- vtrn.s16 \arg2, \arg3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
- vtrn.32 \arg0, \arg2 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
- vtrn.32 \arg1, \arg3 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
-// }
-.endm
-
-.macro NEWQUANT_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0, md_d1
- veor.s16 \arg6, \arg6 // init 0 , and keep 0;
- vaba.s16 \arg1, \arg0, \arg6 // f + abs(coef - 0)
- vmull.s16 \arg7, \arg2, \arg4
- vmull.s16 \arg8, \arg3, \arg5
- vshr.s32 \arg7, #16
- vshr.s32 \arg8, #16
- vmovn.s32 \arg2, \arg7
- vmovn.s32 \arg3, \arg8
-
- vcgt.s16 \arg7, \arg0, #0 // if true, location of coef == 11111111
- vbif.s16 \arg6, \arg1, \arg7 // if (x<0) reserved part; else keep 0 untouched
- vshl.s16 \arg6, #1
- vsub.s16 \arg1, \arg1, \arg6 // if x > 0, -= 0; else x-= 2x
-// }
-.endm
-
-.macro NEWQUANT_COEF_EACH_16BITS_MAX arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
-// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0(max), md_d1
- veor.s16 \arg6, \arg6 // init 0 , and keep 0;
- vaba.s16 \arg1, \arg0, \arg6 // f + abs(coef - 0)
- vmull.s16 \arg7, \arg2, \arg4
- vmull.s16 \arg8, \arg3, \arg5
- vshr.s32 \arg7, #16
- vshr.s32 \arg8, #16
- vmovn.s32 \arg2, \arg7
- vmovn.s32 \arg3, \arg8
-
- vcgt.s16 \arg7, \arg0, #0 // if true, location of coef == 11111111
- vbif.s16 \arg6, \arg1, \arg7 // if (x<0) reserved part; else keep 0 untouched
- vshl.s16 \arg6, #1
- vmax.s16 \arg9, \arg2, \arg3
- vsub.s16 \arg1, \arg1, \arg6 // if x > 0, -= 0; else x-= 2x
-// }
-.endm
-
-.macro QUANT_DUALWORD_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4
-// { // input: coef, ff (dst), mf , working_d (all 0), working_q
- vaba.s16 \arg1, \arg0, \arg3 // f + abs(coef - 0)
- vmull.s16 \arg4, \arg1, \arg2 // *= mf
- vshr.s32 \arg4, #16
- vmovn.s32 \arg1, \arg4 // >> 16
-
- vcgt.s16 \arg2, \arg0, #0 // if true, location of coef == 11111111
- vbif.s16 \arg3, \arg1, \arg2 // if (x<0) reserved part; else keep 0 untouched
- vshl.s16 \arg3, #1
- vsub.s16 \arg1, \arg1, \arg3 // if x > 0, -= 0; else x-= 2x
-// }
-.endm
-
-.macro DC_ZERO_COUNT_IN_DUALWORD arg0, arg1, arg2
-// { // input: coef, dst_d, working_d (all 0x01)
- vceq.s16 \arg1, \arg0, #0
- vand.s16 \arg1, \arg2
- vpadd.s16 \arg1, \arg1, \arg1
- vpadd.s16 \arg1, \arg1, \arg1
-// }
-.endm
-
-.macro SELECT_MAX_IN_ABS_COEF arg0, arg1, arg2, arg3, arg4
-// { // input: coef_0, coef_1, max_q (identy to follow two), output: max_d0, max_d1
- vmax.s16 \arg2, \arg0, \arg1 // max 1st in \arg3 & max 2nd in \arg4
- vpmax.s16 \arg3, \arg3, \arg4 // max 1st in \arg3[0][1] & max 2nd in \arg3[2][3]
- vpmax.s16 \arg3, \arg3, \arg4 // max 1st in \arg3[0][1]
-// }
-.endm
-
-.macro ZERO_COUNT_IN_2_QUARWORD arg0, arg1, arg2, arg3, arg4, arg5, arg6
-// { // input: coef_0 (identy to \arg3 \arg4), coef_1(identy to \arg5 \arg6), mask_q
- vceq.s16 \arg0, #0
- vceq.s16 \arg1, #0
- vand.s16 \arg0, \arg2
- vand.s16 \arg1, \arg2
-
- vpadd.s16 \arg3, \arg3, \arg5
- vpadd.s16 \arg4, \arg4, \arg6
- vpadd.s16 \arg3, \arg3, \arg4 // 8-->4
- vpadd.s16 \arg3, \arg3, \arg3
- vpadd.s16 \arg3, \arg3, \arg3
-// }
-.endm
-
-.macro HDM_QUANT_2x2_TOTAL_16BITS arg0, arg1, arg2
-// { // input: src_d[0]~[3], working_d, dst_d
- vshr.s64 \arg1, \arg0, #32
- vadd.s16 \arg2, \arg0, \arg1 // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
- vsub.s16 \arg1, \arg0, \arg1 // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];
- vtrn.s16 \arg2, \arg1
- vtrn.s32 \arg2, \arg1
-// }
-.endm
-
-.macro IHDM_4x4_TOTAL_16BITS arg0, arg1, arg2
-// { // input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2
- vshr.s64 \arg1, \arg0, #32
- vadd.s16 \arg2, \arg0, \arg1 // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];
- vsub.s16 \arg1, \arg0, \arg1 // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];
- vtrn.s16 \arg2, \arg1
- vrev32.16 \arg1, \arg1
- vtrn.s32 \arg2, \arg1 // [0] = rs[0] + rs[2];[1] = rs[0] - rs[2];[2] = rs[1] - rs[3];[3] = rs[1] + rs[3];
-
- vrev64.16 \arg1, \arg2
- vadd.s16 \arg0, \arg2, \arg1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];
- vsub.s16 \arg1, \arg2, \arg1
- vrev32.16 \arg1, \arg1 // [0] = rs[1] - rs[2];[1] = rs[0] - rs[3];
- vtrn.s32 \arg0, \arg1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];[2] = rs[1] - rs[2];[3] = rs[0] - rs[3];
-// }
-.endm
-
-.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP arg0, arg1, arg2, arg3, arg4, arg5
-// { // input: pred_d[0]/[1](output), dct_q0/1, working_q0/1;
- vmovl.u8 \arg4,\arg0
- vmovl.u8 \arg5,\arg1
- vadd.s16 \arg4,\arg2
- vadd.s16 \arg5,\arg3
- vqmovun.s16 \arg0,\arg4
- vqmovun.s16 \arg1,\arg5
-// }
-.endm
-
-.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-// { // input: src_d[0]~[3], output: e_d[0]~[3];
- vadd.s16 \arg4, \arg0, \arg2 //int16 e[i][0] = src[0] + src[2];
- vsub.s16 \arg5, \arg0, \arg2 //int16 e[i][1] = src[0] - src[2];
- vshr.s16 \arg6, \arg1, #1
- vshr.s16 \arg7, \arg3, #1
- vsub.s16 \arg6, \arg6, \arg3 //int16 e[i][2] = (src[1]>>1)-src[3];
- vadd.s16 \arg7, \arg1, \arg7 //int16 e[i][3] = src[1] + (src[3]>>1);
-// }
-.endm
-
-.macro TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used
-// { // output: f_q[0]~[3], input: e_q[0]~[3];
- vadd.s16 \arg0, \arg4, \arg7 //int16 f[i][0] = e[i][0] + e[i][3];
- vadd.s16 \arg1, \arg5, \arg6 //int16 f[i][1] = e[i][1] + e[i][2];
- vsub.s16 \arg2, \arg5, \arg6 //int16 f[i][2] = e[i][1] - e[i][2];
- vsub.s16 \arg3, \arg4, \arg7 //int16 f[i][3] = e[i][0] - e[i][3];
-// }
-.endm
-
-
-.macro ROW_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-// { // input: src_d[0]~[3], output: e_q[0]~[3];
- vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2];
- vsubl.s16 \arg5, \arg0, \arg2 //int32 e[i][1] = src[0] - src[2];
- vsubl.s16 \arg6, \arg1, \arg3 //int32 e[i][2] = src[1] - src[3];
- vaddl.s16 \arg7, \arg1, \arg3 //int32 e[i][3] = src[1] + src[3];
-// }
-.endm
-
-.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
-// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: \arg8 \arg9
- vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2];
- vsubl.s16 \arg5, \arg0, \arg2 //int32 e[i][1] = src[0] - src[2];
- vshr.s16 \arg8, \arg1, #1
- vshr.s16 \arg9, \arg3, #1
- vsubl.s16 \arg6, \arg8, \arg3 //int32 e[i][2] = (src[1]>>1)-src[3];
- vaddl.s16 \arg7, \arg1, \arg9 //int32 e[i][3] = src[1] + (src[3]>>1);
-// }
-.endm
-
-.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used
-// { // output: f_q[0]~[3], input: e_q[0]~[3];
- vadd.s32 \arg0, \arg4, \arg7 //int16 f[i][0] = e[i][0] + e[i][3];
- vadd.s32 \arg1, \arg5, \arg6 //int16 f[i][1] = e[i][1] + e[i][2];
- vsub.s32 \arg2, \arg5, \arg6 //int16 f[i][2] = e[i][1] - e[i][2];
- vsub.s32 \arg3, \arg4, \arg7 //int16 f[i][3] = e[i][0] - e[i][3];
-// }
-.endm
-
-.macro COL_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-// { // input: src_q[0]~[3], output: e_q[0]~[3];
- vadd.s32 \arg4, \arg0, \arg2 //int32 e[0][j] = f[0][j] + f[2][j];
- vsub.s32 \arg5, \arg0, \arg2 //int32 e[1][j] = f[0][j] - f[2][j];
- vsub.s32 \arg6, \arg1, \arg3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
- vadd.s32 \arg7, \arg1, \arg3 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-// }
-.endm
-
-.macro COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-// { // input: src_q[0]~[3], output: e_q[0]~[3];
- vadd.s32 \arg4, \arg0, \arg2 //int32 e[0][j] = f[0][j] + f[2][j];
- vsub.s32 \arg5, \arg0, \arg2 //int32 e[1][j] = f[0][j] - f[2][j];
- vshr.s32 \arg6, \arg1, #1
- vshr.s32 \arg7, \arg3, #1
- vsub.s32 \arg6, \arg6, \arg3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
- vadd.s32 \arg7, \arg1, \arg7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-// }
-.endm
-#endif
-
-
-WELS_ASM_FUNC_BEGIN WelsCopy8x8_neon
-
- LORD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3
-
- STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1
-
- LORD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3
-
- STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1
-
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN WelsCopy16x16_neon
-
- LORD_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
-
- STORE_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
-
- LORD_ALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r2, r3
-
- STORE_ALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r0, r1
-
- LORD_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
-
- STORE_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
-
- LORD_ALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r2, r3
-
- STORE_ALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r0, r1
-
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN WelsCopy16x16NotAligned_neon
-
- LORD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
-
- STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
-
- LORD_UNALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r2, r3
-
- STORE_UNALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r0, r1
-
- LORD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
-
- STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
-
- LORD_UNALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r2, r3
-
- STORE_UNALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r0, r1
-
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN WelsCopy16x8NotAligned_neon
-
- LORD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
-
- STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
-
- LORD_UNALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r2, r3
-
- STORE_UNALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r0, r1
-
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN WelsCopy8x16_neon
-
- LORD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3
-
- STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1
-
- LORD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3
-
- STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1
-
- LORD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3
-
- STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1
-
- LORD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3
-
- STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1
-
-WELS_ASM_FUNC_END
-
-
-
-WELS_ASM_FUNC_BEGIN WelsDctT4_neon
- push {r4}
- ldr r4, [sp, #4]
-
- LOAD_4x4_DATA_FOR_DCT d4, d5, d6, d7, r1, r2, r3, r4
-
- vsubl.u8 q0, d4, d6
- vsubl.u8 q1, d5, d7
- vtrn.s32 q0, q1
- vswp d1, d2
-
- // horizontal transform
- DCT_ROW_TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
-
- // transform element
- MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3
-
- // vertical transform
- DCT_ROW_TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
-
- // transform element
- MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3
-
- vst1.s16 {q0, q1}, [r0]!
-
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN WelsDctFourT4_neon
- push {r4}
- ldr r4, [sp, #4]
-
- LOAD_8x8_DATA_FOR_DCT d8, d9, d10, d11, d12, d13, d14, d15, r1, r3
-
- vsubl.u8 q0, d8, d12
- vsubl.u8 q1, d9, d13
- vsubl.u8 q2, d10, d14
- vsubl.u8 q3, d11, d15
- MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3
-
- // horizontal transform
- DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7
-
- // transform element
- MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3
-
- // vertical transform
- DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7
-
- vswp d1, d2
- vswp d5, d6
- vswp q1, q2
- vst1.s16 {q0, q1}, [r0]!
- vst1.s16 {q2, q3}, [r0]!
-
- ////////////////
- LOAD_8x8_DATA_FOR_DCT d8, d9, d10, d11, d12, d13, d14, d15, r1, r3
-
- vsubl.u8 q0, d8, d12
- vsubl.u8 q1, d9, d13
- vsubl.u8 q2, d10, d14
- vsubl.u8 q3, d11, d15
- MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3
-
- // horizontal transform
- DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7
-
- // transform element
- MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3
-
- // vertical transform
- DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7
-
- vswp d1, d2
- vswp d5, d6
- vswp q1, q2
- vst1.s16 {q0, q1}, [r0]!
- vst1.s16 {q2, q3}, [r0]!
-
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN WelsQuant4x4_neon
- vld1.s16 {q2}, [r1]
- vld1.s16 {q0, q1}, [r0]
- vld1.s16 {q3}, [r2]
-
- vmov q4, q2
-
- NEWQUANT_COEF_EACH_16BITS q0, q2, d4, d5, d6, d7, q5, q6, q7
- vst1.s16 {q2}, [r0]!
-
- NEWQUANT_COEF_EACH_16BITS q1, q4, d8, d9, d6, d7, q5, q6, q7
- vst1.s16 {q4}, [r0]!
-
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN WelsQuant4x4Dc_neon
-
- vld1.s16 {q0, q1}, [r0]
- vdup.s16 q2, r1 // even ff range [0, 768]
- vdup.s16 q3, r2
-
- vmov q4, q2
-
- NEWQUANT_COEF_EACH_16BITS q0, q2, d4, d5, d6, d7, q5, q6, q7
- vst1.s16 {q2}, [r0]!
-
- NEWQUANT_COEF_EACH_16BITS q1, q4, d8, d9, d6, d7, q5, q6, q7
- vst1.s16 {q4}, [r0]!
-
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN WelsQuantFour4x4_neon
- vld1.s16 {q2}, [r1]
- vld1.s16 {q3}, [r2]
- mov r1, r0
-
- vld1.s16 {q0, q1}, [r0]!
- vmov q4, q2
- NEWQUANT_COEF_EACH_16BITS q0, q4, d8, d9, d6, d7, q5, q6, q7
- vst1.s16 {q4}, [r1]!
- vmov q4, q2
- NEWQUANT_COEF_EACH_16BITS q1, q4, d8, d9, d6, d7, q5, q6, q7
- vst1.s16 {q4}, [r1]!
-
- vld1.s16 {q0, q1}, [r0]!
- vmov q4, q2
- NEWQUANT_COEF_EACH_16BITS q0, q4, d8, d9, d6, d7, q5, q6, q7
- vst1.s16 {q4}, [r1]!
- vmov q4, q2
- NEWQUANT_COEF_EACH_16BITS q1, q4, d8, d9, d6, d7, q5, q6, q7
- vst1.s16 {q4}, [r1]!
-
- vld1.s16 {q0, q1}, [r0]!
- vmov q4, q2
- NEWQUANT_COEF_EACH_16BITS q0, q4, d8, d9, d6, d7, q5, q6, q7
- vst1.s16 {q4}, [r1]!
- vmov q4, q2
- NEWQUANT_COEF_EACH_16BITS q1, q4, d8, d9, d6, d7, q5, q6, q7
- vst1.s16 {q4}, [r1]!
-
- vld1.s16 {q0, q1}, [r0]!
- vmov q4, q2
- NEWQUANT_COEF_EACH_16BITS q0, q4, d8, d9, d6, d7, q5, q6, q7
- vst1.s16 {q4}, [r1]!
- vmov q4, q2
- NEWQUANT_COEF_EACH_16BITS q1, q4, d8, d9, d6, d7, q5, q6, q7
- vst1.s16 {q4}, [r1]!
-
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN WelsQuantFour4x4Max_neon
- vld1.s16 {q2}, [r1]
- vld1.s16 {q3}, [r2]
- mov r1, r0
-
- vld1.s16 {q0, q1}, [r0]!
- vmov q4, q2
- NEWQUANT_COEF_EACH_16BITS_MAX q0, q4, d8, d9, d6, d7, q5, q6, q7, d18
- vst1.s16 {q4}, [r1]!
- vmov q8, q2
- NEWQUANT_COEF_EACH_16BITS_MAX q1, q8, d16, d17, d6, d7, q5, q6, q7, d20
- vst1.s16 {q8}, [r1]! // then 1st 16 elem in d18 & d20
-
- vld1.s16 {q0, q1}, [r0]!
- vmov q4, q2
- NEWQUANT_COEF_EACH_16BITS_MAX q0, q4, d8, d9, d6, d7, q5, q6, q7, d19
- vst1.s16 {q4}, [r1]!
- vmov q8, q2
- NEWQUANT_COEF_EACH_16BITS_MAX q1, q8, d16, d17, d6, d7, q5, q6, q7, d21
- vst1.s16 {q8}, [r1]! // then 2nd 16 elem in d19 & d21
-
- SELECT_MAX_IN_ABS_COEF q9, q10, q0, d0, d1
- vst1.s32 {d0[0]}, [r3]!
-
- ///////////
- vld1.s16 {q0, q1}, [r0]!
- vmov q4, q2
- NEWQUANT_COEF_EACH_16BITS_MAX q0, q4, d8, d9, d6, d7, q5, q6, q7, d18
- vst1.s16 {q4}, [r1]!
- vmov q8, q2
- NEWQUANT_COEF_EACH_16BITS_MAX q1, q8, d16, d17, d6, d7, q5, q6, q7, d20
- vst1.s16 {q8}, [r1]! // then 3rd 16 elem in d18 & d20
-
- vld1.s16 {q0, q1}, [r0]!
- vmov q4, q2
- NEWQUANT_COEF_EACH_16BITS_MAX q0, q4, d8, d9, d6, d7, q5, q6, q7, d19
- vst1.s16 {q4}, [r1]!
- vmov q8, q2
- NEWQUANT_COEF_EACH_16BITS_MAX q1, q8, d16, d17, d6, d7, q5, q6, q7, d21
- vst1.s16 {q8}, [r1]! // then 4th 16 elem in d19 & d21
-
- SELECT_MAX_IN_ABS_COEF q9, q10, q0, d0, d1
- vst1.s32 {d0[0]}, [r3]!
-
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN WelsHadamardT4Dc_neon
- push {r2,r3}
- mov r2, #64 // 2*16*sizeof(int16_t)
- add r3, r1, #32
-
- vld1.s16 {d0}, [r1], r2
- vld1.s16 {d1}, [r3], r2
- vld1.s16 {d4}, [r1], r2
- vld1.s16 {d5}, [r3], r2
- vld1.s16 {d2}, [r1], r2
- vld1.s16 {d3}, [r3], r2
- vld1.s16 {d6}, [r1], r2
- vld1.s16 {d7}, [r3], r2
- vtrn.16 q0, q2 // d0[0 4], d1[1 5]
- vtrn.16 q1, q3 // d2[2 6], d3[3 7]
-
- vld1.s16 {d8}, [r1], r2
- vld1.s16 {d9}, [r3], r2
- vld1.s16 {d12}, [r1], r2
- vld1.s16 {d13}, [r3], r2
- vld1.s16 {d10}, [r1], r2
- vld1.s16 {d11}, [r3], r2
- vld1.s16 {d14}, [r1], r2
- vld1.s16 {d15}, [r3], r2
- vtrn.16 q4, q6 // d8[08 12], d9[09 13]
- vtrn.16 q5, q7 //d10[10 14],d11[11 15]
-
- vtrn.32 q0, q4 // d0 [0 4 08 12] = dct[idx], d1[1 5 09 13] = dct[idx+16]
- vtrn.32 q1, q5 // d2 [2 6 10 14] = dct[idx+64], d3[3 7 11 15] = dct[idx+80]
-
- ROW_TRANSFORM_0_STEP d0, d1, d3, d2, q4, q7, q6, q5
-
- TRANSFORM_4BYTES q0, q1, q3, q2, q4, q7, q6, q5
-
- // transform element 32bits
- vtrn.s32 q0, q1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
- vtrn.s32 q2, q3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
- vswp d1, d4 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
- vswp d3, d6 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
-
- COL_TRANSFORM_0_STEP q0, q1, q3, q2, q4, q7, q6, q5
-
- TRANSFORM_4BYTES q0, q1, q3, q2, q4, q7, q6, q5
-
- vrshrn.s32 d8, q0, #1
- vrshrn.s32 d9, q1, #1
- vrshrn.s32 d10, q2, #1
- vrshrn.s32 d11, q3, #1
- vst1.16 {q4, q5}, [r0] //store
-
- pop {r2,r3}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN WelsHadamardQuant2x2_neon
-
- vdup.s16 d1, r1 //ff
- vdup.s16 d2, r2 //mf
- veor d3, d3
-
- mov r1, #32
- mov r2, r0
-
- vld1.s16 {d0[0]}, [r0], r1 //rs[00]
- vst1.s16 {d3[0]}, [r2], r1 //rs[00]=0
- vld1.s16 {d0[1]}, [r0], r1 //rs[16]
- vst1.s16 {d3[0]}, [r2], r1 //rs[16]=0
- vld1.s16 {d0[2]}, [r0], r1 //rs[32]
- vst1.s16 {d3[0]}, [r2], r1 //rs[32]=0
- vld1.s16 {d0[3]}, [r0], r1 //rs[48]
- vst1.s16 {d3[0]}, [r2], r1 //rs[48]=0
-
- HDM_QUANT_2x2_TOTAL_16BITS d0, d4, d5 // output d5
-
- HDM_QUANT_2x2_TOTAL_16BITS d5, d4, d0 // output d0
-
- QUANT_DUALWORD_COEF_EACH_16BITS d0, d1, d2, d3, q2
-
- vst1.s16 d1, [r3] // store to dct
- ldr r2, [sp, #0]
- vst1.s16 d1, [r2] // store to block
-
- mov r1, #1
- vdup.s16 d3, r1
- DC_ZERO_COUNT_IN_DUALWORD d1, d0, d3
-
- vmov r0, r1, d0
- and r0, #0x07 // range [0~4]
- rsb r0, #4
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN WelsHadamardQuant2x2SkipKernel_neon
-
- vdup.s16 d3, r1
- mov r1, #32
- vld1.s16 {d0[0]}, [r0], r1 //rs[00]
- vld1.s16 {d0[1]}, [r0], r1 //rs[16]
- vld1.s16 {d0[2]}, [r0], r1 //rs[32]
- vld1.s16 {d0[3]}, [r0], r1 //rs[48]
-
- HDM_QUANT_2x2_TOTAL_16BITS d0, d1, d2 // output d2
-
- HDM_QUANT_2x2_TOTAL_16BITS d2, d1, d0 // output d0
-
- vabs.s16 d1, d0
- vcgt.s16 d1, d1, d3 // abs(dct[i])>threshold;
- vmov r0, r1, d1
- orr r0, r1
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN WelsGetNoneZeroCount_neon
- push {r1}
- vld1.s16 {q0, q1}, [r0]
- vmov.s16 q8, #1
-
- ZERO_COUNT_IN_2_QUARWORD q0, q1, q8, d0, d1, d2, d3
- vmov r0, r1, d0
- and r0, #0x1F // range [0~16]
- rsb r0, #16
- pop {r1}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN WelsDequant4x4_neon
- vld1.s16 {q0, q1}, [r0]
- vld1.u16 {q2}, [r1]
-
- vmul.s16 q4, q0, q2
- vmul.s16 q5, q1, q2
-
- vst1.s16 {q4, q5}, [r0]
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN WelsDequantFour4x4_neon
- vld1.u16 {q8}, [r1]
- mov r1, r0
- vld1.s16 {q0, q1}, [r0]!
- vld1.s16 {q2, q3}, [r0]!
- vmul.s16 q0, q0, q8
- vld1.s16 {q4, q5}, [r0]!
- vmul.s16 q1, q1, q8
- vld1.s16 {q6, q7}, [r0]!
-
- vst1.s16 {q0, q1}, [r1]!
-
- vmul.s16 q2, q2, q8
- vmul.s16 q3, q3, q8
- vmul.s16 q4, q4, q8
- vst1.s16 {q2, q3}, [r1]!
-
- vmul.s16 q5, q5, q8
- vmul.s16 q6, q6, q8
- vmul.s16 q7, q7, q8
- vst1.s16 {q4, q5}, [r1]!
- vst1.s16 {q6, q7}, [r1]!
-
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN WelsDequantIHadamard4x4_neon
-
- vld1.s16 {q0, q1}, [r0]
- vdup.s16 q4, r1
-
- IHDM_4x4_TOTAL_16BITS q0, q2, q3
- IHDM_4x4_TOTAL_16BITS q1, q2, q3
-
- MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3
-
- IHDM_4x4_TOTAL_16BITS q0, q2, q3
- vmul.s16 q0, q4
-
- IHDM_4x4_TOTAL_16BITS q1, q2, q3
- vmul.s16 q1, q4
-
- MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3
- vst1.s16 {q0, q1}, [r0]
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN WelsIDctT4Rec_neon
- vld1.u32 {d14[0]}, [r2], r3
- push {r4}
- ldr r4, [sp, #4]
- vld1.u32 {d14[1]}, [r2], r3
-
- vld4.s16 {d0, d1, d2, d3}, [r4] // cost 3 cycles!
- vld1.u32 {d15[0]}, [r2], r3
- vld1.u32 {d15[1]}, [r2], r3 // q7 is pred
-
- ROW_TRANSFORM_1_STEP_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
-
- TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
-
- MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3
-
- ROW_TRANSFORM_1_STEP_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
-
- TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
- vrshr.s16 d0, d0, #6
- vrshr.s16 d1, d1, #6
- vrshr.s16 d2, d2, #6
- vrshr.s16 d3, d3, #6
-
- //after rounding 6, clip into [0, 255]
- vmovl.u8 q2,d14
- vadd.s16 q0,q2
- vqmovun.s16 d14,q0
- vst1.32 {d14[0]},[r0],r1
- vst1.32 {d14[1]},[r0],r1
-
- vmovl.u8 q2,d15
- vadd.s16 q1,q2
- vqmovun.s16 d15,q1
- vst1.32 {d15[0]},[r0],r1
- vst1.32 {d15[1]},[r0]
-
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN WelsIDctFourT4Rec_neon
-
- vld1.u64 {d16}, [r2], r3
- push {r4}
- ldr r4, [sp, #4]
- vld1.u64 {d17}, [r2], r3
-
- vld4.s16 {d0, d1, d2, d3}, [r4]! // cost 3 cycles!
- vld1.u64 {d18}, [r2], r3
- vld1.u64 {d19}, [r2], r3
- vld4.s16 {d4, d5, d6, d7}, [r4]! // cost 3 cycles!
- vswp d1, d4
- vswp d3, d6
- vswp q1, q2 // q0~q3
-
- ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7
-
- TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7
-
- MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3
-
- ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7
-
- TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7
- vrshr.s16 q0, q0, #6
- vrshr.s16 q1, q1, #6
- vrshr.s16 q2, q2, #6
- vrshr.s16 q3, q3, #6
-
- //after rounding 6, clip into [0, 255]
- vmovl.u8 q4,d16
- vadd.s16 q0,q4
- vqmovun.s16 d16,q0
- vst1.u8 {d16},[r0],r1
-
- vmovl.u8 q4,d17
- vadd.s16 q1,q4
- vqmovun.s16 d17,q1
- vst1.u8 {d17},[r0],r1
-
- vmovl.u8 q4,d18
- vadd.s16 q2,q4
- vqmovun.s16 d18,q2
- vst1.u8 {d18},[r0],r1
-
- vmovl.u8 q4,d19
- vadd.s16 q3,q4
- vqmovun.s16 d19,q3
- vst1.u8 {d19},[r0],r1
-
- vld1.u64 {d16}, [r2], r3
- vld1.u64 {d17}, [r2], r3
-
- vld4.s16 {d0, d1, d2, d3}, [r4]! // cost 3 cycles!
- vld1.u64 {d18}, [r2], r3
- vld1.u64 {d19}, [r2], r3
- vld4.s16 {d4, d5, d6, d7}, [r4]! // cost 3 cycles!
- vswp d1, d4
- vswp d3, d6
- vswp q1, q2 // q0~q3
-
- ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7
-
- TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7
-
- MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3
-
- ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7
-
- TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7
- vrshr.s16 q0, q0, #6
- vrshr.s16 q1, q1, #6
- vrshr.s16 q2, q2, #6
- vrshr.s16 q3, q3, #6
-
- //after rounding 6, clip into [0, 255]
- vmovl.u8 q4,d16
- vadd.s16 q0,q4
- vqmovun.s16 d16,q0
- vst1.u8 {d16},[r0],r1
-
- vmovl.u8 q4,d17
- vadd.s16 q1,q4
- vqmovun.s16 d17,q1
- vst1.u8 {d17},[r0],r1
-
- vmovl.u8 q4,d18
- vadd.s16 q2,q4
- vqmovun.s16 d18,q2
- vst1.u8 {d18},[r0],r1
-
- vmovl.u8 q4,d19
- vadd.s16 q3,q4
- vqmovun.s16 d19,q3
- vst1.u8 {d19},[r0],r1
-
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN WelsIDctRecI16x16Dc_neon
- push {r4}
- ldr r4, [sp, #4]
-
- vld1.s16 {q8,q9}, [r4]
- vrshr.s16 q8, q8, #6
- vrshr.s16 q9, q9, #6
-
- vdup.s16 d20, d16[0]
- vdup.s16 d21, d16[1]
- vdup.s16 d22, d16[2]
- vdup.s16 d23, d16[3]
-
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
-
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
-
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
-
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
-
- vdup.s16 d20, d17[0]
- vdup.s16 d21, d17[1]
- vdup.s16 d22, d17[2]
- vdup.s16 d23, d17[3]
-
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
-
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
-
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
-
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
-
- vdup.s16 d20, d18[0]
- vdup.s16 d21, d18[1]
- vdup.s16 d22, d18[2]
- vdup.s16 d23, d18[3]
-
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
-
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
-
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
-
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
-
- vdup.s16 d20, d19[0]
- vdup.s16 d21, d19[1]
- vdup.s16 d22, d19[2]
- vdup.s16 d23, d19[3]
-
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
-
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
-
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
-
- vld1.u8 {q0}, [r2], r3
- MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
- vst1.u8 {q0}, [r0], r1
-
- pop {r4}
-WELS_ASM_FUNC_END
-#endif
+/*!
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifdef HAVE_NEON
+.text
+#include "arm_arch_common_macro.S"
+
+#ifdef APPLE_IOS
+.macro LORD_ALIGNED_DATA_WITH_STRIDE
+// { // input: $0~$3, src*, src_stride
+ vld1.64 {$0}, [$4,:128], $5
+ vld1.64 {$1}, [$4,:128], $5
+ vld1.64 {$2}, [$4,:128], $5
+ vld1.64 {$3}, [$4,:128], $5
+// }
+.endm
+
+.macro STORE_ALIGNED_DATA_WITH_STRIDE
+// { // input: $0~$3, dst*, dst_stride
+ vst1.64 {$0}, [$4,:128], $5
+ vst1.64 {$1}, [$4,:128], $5
+ vst1.64 {$2}, [$4,:128], $5
+ vst1.64 {$3}, [$4,:128], $5
+// }
+.endm
+
+.macro LORD_UNALIGNED_DATA_WITH_STRIDE
+// { // input: $0~$3, src*, src_stride
+ vld1.64 {$0}, [$4], $5
+ vld1.64 {$1}, [$4], $5
+ vld1.64 {$2}, [$4], $5
+ vld1.64 {$3}, [$4], $5
+// }
+.endm
+
+.macro STORE_UNALIGNED_DATA_WITH_STRIDE
+// { // input: $0~$3, dst*, dst_stride
+ vst1.64 {$0}, [$4], $5
+ vst1.64 {$1}, [$4], $5
+ vst1.64 {$2}, [$4], $5
+ vst1.64 {$3}, [$4], $5
+// }
+.endm
+
+.macro LOAD_4x4_DATA_FOR_DCT
+// { // input: $0~$3, src1*, src1_stride, src2*, src2_stride
+ vld2.16 {$0[0],$1[0]}, [$4], $5
+ vld2.16 {$2[0],$3[0]}, [$6], $7
+ vld2.16 {$0[1],$1[1]}, [$4], $5
+ vld2.16 {$2[1],$3[1]}, [$6], $7
+
+ vld2.16 {$0[2],$1[2]}, [$4], $5
+ vld2.16 {$2[2],$3[2]}, [$6], $7
+ vld2.16 {$0[3],$1[3]}, [$4], $5
+ vld2.16 {$2[3],$3[3]}, [$6], $7
+// }
+.endm
+
+.macro LOAD_8x8_DATA_FOR_DCT
+// { // input: $0~$7, src1*, src2*; untouched r2:src1_stride & r4:src2_stride
+ vld1.64 {$0}, [$8], r2
+ vld1.64 {$4}, [$9], r4
+ vld1.64 {$1}, [$8], r2
+ vld1.64 {$5}, [$9], r4
+
+ vld1.64 {$2}, [$8], r2
+ vld1.64 {$6}, [$9], r4
+ vld1.64 {$3}, [$8], r2
+ vld1.64 {$7}, [$9], r4
+// }
+.endm
+
+.macro DCT_ROW_TRANSFORM_TOTAL_16BITS
+// { // input: src_d[0]~[3], working: [4]~[7]
+ vadd.s16 $4, $0, $3 //int16 s[0] = data[i] + data[i3];
+ vsub.s16 $7, $0, $3 //int16 s[3] = data[i] - data[i3];
+ vadd.s16 $5, $1, $2 //int16 s[1] = data[i1] + data[i2];
+ vsub.s16 $6, $1, $2 //int16 s[2] = data[i1] - data[i2];
+
+ vadd.s16 $0, $4, $5 //int16 dct[i ] = s[0] + s[1];
+ vsub.s16 $2, $4, $5 //int16 dct[i2] = s[0] - s[1];
+ vshl.s16 $1, $7, #1
+ vshl.s16 $3, $6, #1
+ vadd.s16 $1, $1, $6 //int16 dct[i1] = (s[3] << 1) + s[2];
+ vsub.s16 $3, $7, $3 //int16 dct[i3] = s[3] - (s[2] << 1);
+// }
+.endm
+
+.macro MATRIX_TRANSFORM_EACH_16BITS
+// { // input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
+ vtrn.s16 $0, $1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
+ vtrn.s16 $2, $3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
+ vtrn.32 $0, $2 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
+ vtrn.32 $1, $3 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
+// }
+.endm
+
+.macro NEWQUANT_COEF_EACH_16BITS // if coef <= 0, output -coef; else output coef
+// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0, mf_d1
+ veor.s16 $6, $6 // init 0 , and keep 0;
+ vaba.s16 $1, $0, $6 // f + abs(coef - 0)
+ vmull.s16 $7, $2, $4
+ vmull.s16 $8, $3, $5
+ vshr.s32 $7, #16
+ vshr.s32 $8, #16
+ vmovn.s32 $2, $7
+ vmovn.s32 $3, $8
+
+ vcgt.s16 $7, $0, #0 // if true, location of coef == 11111111
+    vbif.s16   $6, $1, $7      // if (x<=0) select the value for negation; else keep 0 untouched
+ vshl.s16 $6, #1
+ vsub.s16 $1, $1, $6 // if x > 0, -= 0; else x-= 2x
+// }
+.endm
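+// Per 16-bit lane, NEWQUANT_COEF_EACH_16BITS (and the _MAX variant below) computes
+//   out = ((|coef| + ff) * mf) >> 16
+// and then restores the sign of the input coefficient: non-positive lanes are
+// negated by the vcgt/vbif/vsub sequence at the end of the macro.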
+
+.macro NEWQUANT_COEF_EACH_16BITS_MAX // if coef <= 0, output -coef; else output coef
+// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0(max), mf_d1
+ veor.s16 $6, $6 // init 0 , and keep 0;
+ vaba.s16 $1, $0, $6 // f + abs(coef - 0)
+ vmull.s16 $7, $2, $4
+ vmull.s16 $8, $3, $5
+ vshr.s32 $7, #16
+ vshr.s32 $8, #16
+ vmovn.s32 $2, $7
+ vmovn.s32 $3, $8
+
+ vcgt.s16 $7, $0, #0 // if true, location of coef == 11111111
+    vbif.s16   $6, $1, $7      // if (x<=0) select the value for negation; else keep 0 untouched
+ vshl.s16 $6, #1
+ vmax.s16 $9, $2, $3
+ vsub.s16 $1, $1, $6 // if x > 0, -= 0; else x-= 2x
+// }
+.endm
+
+.macro QUANT_DUALWORD_COEF_EACH_16BITS // if coef <= 0, output -coef; else output coef
+// { // input: coef, ff (dst), mf , working_d (all 0), working_q
+ vaba.s16 $1, $0, $3 // f + abs(coef - 0)
+ vmull.s16 $4, $1, $2 // *= mf
+ vshr.s32 $4, #16
+ vmovn.s32 $1, $4 // >> 16
+
+ vcgt.s16 $2, $0, #0 // if true, location of coef == 11111111
+    vbif.s16   $3, $1, $2      // if (x<=0) select the value for negation; else keep 0 untouched
+ vshl.s16 $3, #1
+ vsub.s16 $1, $1, $3 // if x > 0, -= 0; else x-= 2x
+// }
+.endm
+
+.macro DC_ZERO_COUNT_IN_DUALWORD
+// { // input: coef, dst_d, working_d (all 0x01)
+ vceq.s16 $1, $0, #0
+ vand.s16 $1, $2
+ vpadd.s16 $1, $1, $1
+ vpadd.s16 $1, $1, $1
+// }
+.endm
+
+.macro SELECT_MAX_IN_ABS_COEF
+// { // input: coef_0, coef_1, max_q (identical to the two d-regs that follow), output: max_d0, max_d1
+ vmax.s16 $2, $0, $1 // max 1st in $3 & max 2nd in $4
+ vpmax.s16 $3, $3, $4 // max 1st in $3[0][1] & max 2nd in $3[2][3]
+ vpmax.s16 $3, $3, $4 // max 1st in $3[0][1]
+// }
+.endm
+
+.macro ZERO_COUNT_IN_2_QUARWORD
+// { // input: coef_0 (identical to $3 $4), coef_1 (identical to $5 $6), mask_q
+ vceq.s16 $0, #0
+ vceq.s16 $1, #0
+ vand.s16 $0, $2
+ vand.s16 $1, $2
+
+ vpadd.s16 $3, $3, $5
+ vpadd.s16 $4, $4, $6
+ vpadd.s16 $3, $3, $4 // 8-->4
+ vpadd.s16 $3, $3, $3
+ vpadd.s16 $3, $3, $3
+// }
+.endm
+
+.macro HDM_QUANT_2x2_TOTAL_16BITS
+// { // input: src_d[0]~[3], working_d, dst_d
+ vshr.s64 $1, $0, #32
+ vadd.s16 $2, $0, $1 // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
+ vsub.s16 $1, $0, $1 // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];
+ vtrn.s16 $2, $1
+ vtrn.s32 $2, $1
+// }
+.endm
+
+.macro IHDM_4x4_TOTAL_16BITS
+// { // input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2
+ vshr.s64 $1, $0, #32
+ vadd.s16 $2, $0, $1 // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];
+ vsub.s16 $1, $0, $1 // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];
+ vtrn.s16 $2, $1
+ vrev32.16 $1, $1
+ vtrn.s32 $2, $1 // [0] = rs[0] + rs[2];[1] = rs[0] - rs[2];[2] = rs[1] - rs[3];[3] = rs[1] + rs[3];
+
+ vrev64.16 $1, $2
+ vadd.s16 $0, $2, $1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];
+ vsub.s16 $1, $2, $1
+ vrev32.16 $1, $1 // [0] = rs[1] - rs[2];[1] = rs[0] - rs[3];
+ vtrn.s32 $0, $1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];[2] = rs[1] - rs[2];[3] = rs[0] - rs[3];
+// }
+.endm
+
+.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP
+// { // input: pred_d[0]/[1](output), dct_q0/1, working_q0/1;
+ vmovl.u8 $4,$0
+ vmovl.u8 $5,$1
+ vadd.s16 $4,$2
+ vadd.s16 $5,$3
+ vqmovun.s16 $0,$4
+ vqmovun.s16 $1,$5
+// }
+.endm
+
+.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS
+// { // input: src_d[0]~[3], output: e_d[0]~[3];
+ vadd.s16 $4, $0, $2 //int16 e[i][0] = src[0] + src[2];
+ vsub.s16 $5, $0, $2 //int16 e[i][1] = src[0] - src[2];
+ vshr.s16 $6, $1, #1
+ vshr.s16 $7, $3, #1
+ vsub.s16 $6, $6, $3 //int16 e[i][2] = (src[1]>>1)-src[3];
+ vadd.s16 $7, $1, $7 //int16 e[i][3] = src[1] + (src[3]>>1);
+// }
+.endm
+
+.macro TRANSFORM_TOTAL_16BITS // both row & col transform used
+// { // output: f_q[0]~[3], input: e_q[0]~[3];
+ vadd.s16 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3];
+ vadd.s16 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2];
+ vsub.s16 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2];
+ vsub.s16 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3];
+// }
+.endm
+
+
+.macro ROW_TRANSFORM_0_STEP
+// { // input: src_d[0]~[3], output: e_q[0]~[3];
+ vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2];
+ vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2];
+ vsubl.s16 $6, $1, $3 //int32 e[i][2] = src[1] - src[3];
+ vaddl.s16 $7, $1, $3 //int32 e[i][3] = src[1] + src[3];
+// }
+.endm
+
+.macro ROW_TRANSFORM_1_STEP
+// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
+ vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2];
+ vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2];
+ vshr.s16 $8, $1, #1
+ vshr.s16 $9, $3, #1
+ vsubl.s16 $6, $8, $3 //int32 e[i][2] = (src[1]>>1)-src[3];
+ vaddl.s16 $7, $1, $9 //int32 e[i][3] = src[1] + (src[3]>>1);
+// }
+.endm
+
+.macro TRANSFORM_4BYTES // both row & col transform used
+// { // output: f_q[0]~[3], input: e_q[0]~[3];
+ vadd.s32 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3];
+ vadd.s32 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2];
+ vsub.s32 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2];
+ vsub.s32 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3];
+// }
+.endm
+
+.macro COL_TRANSFORM_0_STEP
+// { // input: src_q[0]~[3], output: e_q[0]~[3];
+ vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j];
+ vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j];
+ vsub.s32 $6, $1, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
+ vadd.s32 $7, $1, $3 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
+// }
+.endm
+
+.macro COL_TRANSFORM_1_STEP
+// { // input: src_q[0]~[3], output: e_q[0]~[3];
+ vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j];
+ vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j];
+ vshr.s32 $6, $1, #1
+ vshr.s32 $7, $3, #1
+ vsub.s32 $6, $6, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
+ vadd.s32 $7, $1, $7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
+// }
+.endm
+#else
+.macro LORD_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
+// { // input: \arg0~\arg3, src*, src_stride
+ vld1.64 {\arg0}, [\arg4,:128], \arg5
+ vld1.64 {\arg1}, [\arg4,:128], \arg5
+ vld1.64 {\arg2}, [\arg4,:128], \arg5
+ vld1.64 {\arg3}, [\arg4,:128], \arg5
+// }
+.endm
+
+.macro STORE_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
+// { // input: \arg0~\arg3, dst*, dst_stride
+ vst1.64 {\arg0}, [\arg4,:128], \arg5
+ vst1.64 {\arg1}, [\arg4,:128], \arg5
+ vst1.64 {\arg2}, [\arg4,:128], \arg5
+ vst1.64 {\arg3}, [\arg4,:128], \arg5
+// }
+.endm
+
+.macro LORD_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
+// { // input: \arg0~\arg3, src*, src_stride
+ vld1.64 {\arg0}, [\arg4], \arg5
+ vld1.64 {\arg1}, [\arg4], \arg5
+ vld1.64 {\arg2}, [\arg4], \arg5
+ vld1.64 {\arg3}, [\arg4], \arg5
+// }
+.endm
+
+.macro STORE_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
+// { // input: \arg0~\arg3, dst*, dst_stride
+ vst1.64 {\arg0}, [\arg4], \arg5
+ vst1.64 {\arg1}, [\arg4], \arg5
+ vst1.64 {\arg2}, [\arg4], \arg5
+ vst1.64 {\arg3}, [\arg4], \arg5
+// }
+.endm
+
+.macro LOAD_4x4_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+// { // input: \arg0~\arg3, src1*, src1_stride, src2*, src2_stride
+ vld2.16 {\arg0[0],\arg1[0]}, [\arg4], \arg5
+ vld2.16 {\arg2[0],\arg3[0]}, [\arg6], \arg7
+ vld2.16 {\arg0[1],\arg1[1]}, [\arg4], \arg5
+ vld2.16 {\arg2[1],\arg3[1]}, [\arg6], \arg7
+
+ vld2.16 {\arg0[2],\arg1[2]}, [\arg4], \arg5
+ vld2.16 {\arg2[2],\arg3[2]}, [\arg6], \arg7
+ vld2.16 {\arg0[3],\arg1[3]}, [\arg4], \arg5
+ vld2.16 {\arg2[3],\arg3[3]}, [\arg6], \arg7
+// }
+.endm
+
+.macro LOAD_8x8_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
+// { // input: \arg0~\arg7, src1*, src2*; untouched r2:src1_stride & r4:src2_stride
+ vld1.64 {\arg0}, [\arg8], r2
+ vld1.64 {\arg4}, [\arg9], r4
+ vld1.64 {\arg1}, [\arg8], r2
+ vld1.64 {\arg5}, [\arg9], r4
+
+ vld1.64 {\arg2}, [\arg8], r2
+ vld1.64 {\arg6}, [\arg9], r4
+ vld1.64 {\arg3}, [\arg8], r2
+ vld1.64 {\arg7}, [\arg9], r4
+// }
+.endm
+
+.macro DCT_ROW_TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+// { // input: src_d[0]~[3], working: [4]~[7]
+ vadd.s16 \arg4, \arg0, \arg3 //int16 s[0] = data[i] + data[i3];
+ vsub.s16 \arg7, \arg0, \arg3 //int16 s[3] = data[i] - data[i3];
+ vadd.s16 \arg5, \arg1, \arg2 //int16 s[1] = data[i1] + data[i2];
+ vsub.s16 \arg6, \arg1, \arg2 //int16 s[2] = data[i1] - data[i2];
+
+ vadd.s16 \arg0, \arg4, \arg5 //int16 dct[i ] = s[0] + s[1];
+ vsub.s16 \arg2, \arg4, \arg5 //int16 dct[i2] = s[0] - s[1];
+ vshl.s16 \arg1, \arg7, #1
+ vshl.s16 \arg3, \arg6, #1
+ vadd.s16 \arg1, \arg1, \arg6 //int16 dct[i1] = (s[3] << 1) + s[2];
+ vsub.s16 \arg3, \arg7, \arg3 //int16 dct[i3] = s[3] - (s[2] << 1);
+// }
+.endm
+
+.macro MATRIX_TRANSFORM_EACH_16BITS arg0, arg1, arg2, arg3
+// { // input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
+ vtrn.s16 \arg0, \arg1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
+ vtrn.s16 \arg2, \arg3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
+ vtrn.32 \arg0, \arg2 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
+ vtrn.32 \arg1, \arg3 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
+// }
+.endm
+
+.macro NEWQUANT_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0, mf_d1
+ veor.s16 \arg6, \arg6 // init 0 , and keep 0;
+ vaba.s16 \arg1, \arg0, \arg6 // f + abs(coef - 0)
+ vmull.s16 \arg7, \arg2, \arg4
+ vmull.s16 \arg8, \arg3, \arg5
+ vshr.s32 \arg7, #16
+ vshr.s32 \arg8, #16
+ vmovn.s32 \arg2, \arg7
+ vmovn.s32 \arg3, \arg8
+
+ vcgt.s16 \arg7, \arg0, #0 // if true, location of coef == 11111111
+    vbif.s16   \arg6, \arg1, \arg7 // if (x<=0) select the value for negation; else keep 0 untouched
+ vshl.s16 \arg6, #1
+ vsub.s16 \arg1, \arg1, \arg6 // if x > 0, -= 0; else x-= 2x
+// }
+.endm
+
+.macro NEWQUANT_COEF_EACH_16BITS_MAX arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
+// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0(max), mf_d1
+ veor.s16 \arg6, \arg6 // init 0 , and keep 0;
+ vaba.s16 \arg1, \arg0, \arg6 // f + abs(coef - 0)
+ vmull.s16 \arg7, \arg2, \arg4
+ vmull.s16 \arg8, \arg3, \arg5
+ vshr.s32 \arg7, #16
+ vshr.s32 \arg8, #16
+ vmovn.s32 \arg2, \arg7
+ vmovn.s32 \arg3, \arg8
+
+ vcgt.s16 \arg7, \arg0, #0 // if true, location of coef == 11111111
+    vbif.s16   \arg6, \arg1, \arg7 // if (x<=0) select the value for negation; else keep 0 untouched
+ vshl.s16 \arg6, #1
+ vmax.s16 \arg9, \arg2, \arg3
+ vsub.s16 \arg1, \arg1, \arg6 // if x > 0, -= 0; else x-= 2x
+// }
+.endm
+
+.macro QUANT_DUALWORD_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4
+// { // input: coef, ff (dst), mf , working_d (all 0), working_q
+ vaba.s16 \arg1, \arg0, \arg3 // f + abs(coef - 0)
+ vmull.s16 \arg4, \arg1, \arg2 // *= mf
+ vshr.s32 \arg4, #16
+ vmovn.s32 \arg1, \arg4 // >> 16
+
+ vcgt.s16 \arg2, \arg0, #0 // if true, location of coef == 11111111
+    vbif.s16   \arg3, \arg1, \arg2 // if (x<=0) select the value for negation; else keep 0 untouched
+ vshl.s16 \arg3, #1
+ vsub.s16 \arg1, \arg1, \arg3 // if x > 0, -= 0; else x-= 2x
+// }
+.endm
+
+.macro DC_ZERO_COUNT_IN_DUALWORD arg0, arg1, arg2
+// { // input: coef, dst_d, working_d (all 0x01)
+ vceq.s16 \arg1, \arg0, #0
+ vand.s16 \arg1, \arg2
+ vpadd.s16 \arg1, \arg1, \arg1
+ vpadd.s16 \arg1, \arg1, \arg1
+// }
+.endm
+
+.macro SELECT_MAX_IN_ABS_COEF arg0, arg1, arg2, arg3, arg4
+// { // input: coef_0, coef_1, max_q (identical to the two d-regs that follow), output: max_d0, max_d1
+ vmax.s16 \arg2, \arg0, \arg1 // max 1st in \arg3 & max 2nd in \arg4
+ vpmax.s16 \arg3, \arg3, \arg4 // max 1st in \arg3[0][1] & max 2nd in \arg3[2][3]
+ vpmax.s16 \arg3, \arg3, \arg4 // max 1st in \arg3[0][1]
+// }
+.endm
+
+.macro ZERO_COUNT_IN_2_QUARWORD arg0, arg1, arg2, arg3, arg4, arg5, arg6
+// { // input: coef_0 (identical to \arg3 \arg4), coef_1 (identical to \arg5 \arg6), mask_q
+ vceq.s16 \arg0, #0
+ vceq.s16 \arg1, #0
+ vand.s16 \arg0, \arg2
+ vand.s16 \arg1, \arg2
+
+ vpadd.s16 \arg3, \arg3, \arg5
+ vpadd.s16 \arg4, \arg4, \arg6
+ vpadd.s16 \arg3, \arg3, \arg4 // 8-->4
+ vpadd.s16 \arg3, \arg3, \arg3
+ vpadd.s16 \arg3, \arg3, \arg3
+// }
+.endm
+
+.macro HDM_QUANT_2x2_TOTAL_16BITS arg0, arg1, arg2
+// { // input: src_d[0]~[3], working_d, dst_d
+ vshr.s64 \arg1, \arg0, #32
+ vadd.s16 \arg2, \arg0, \arg1 // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
+ vsub.s16 \arg1, \arg0, \arg1 // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];
+ vtrn.s16 \arg2, \arg1
+ vtrn.s32 \arg2, \arg1
+// }
+.endm
+
+.macro IHDM_4x4_TOTAL_16BITS arg0, arg1, arg2
+// { // input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2
+ vshr.s64 \arg1, \arg0, #32
+ vadd.s16 \arg2, \arg0, \arg1 // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];
+ vsub.s16 \arg1, \arg0, \arg1 // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];
+ vtrn.s16 \arg2, \arg1
+ vrev32.16 \arg1, \arg1
+ vtrn.s32 \arg2, \arg1 // [0] = rs[0] + rs[2];[1] = rs[0] - rs[2];[2] = rs[1] - rs[3];[3] = rs[1] + rs[3];
+
+ vrev64.16 \arg1, \arg2
+ vadd.s16 \arg0, \arg2, \arg1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];
+ vsub.s16 \arg1, \arg2, \arg1
+ vrev32.16 \arg1, \arg1 // [0] = rs[1] - rs[2];[1] = rs[0] - rs[3];
+ vtrn.s32 \arg0, \arg1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];[2] = rs[1] - rs[2];[3] = rs[0] - rs[3];
+// }
+.endm
+
+.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP arg0, arg1, arg2, arg3, arg4, arg5
+// { // input: pred_d[0]/[1](output), dct_q0/1, working_q0/1;
+ vmovl.u8 \arg4,\arg0
+ vmovl.u8 \arg5,\arg1
+ vadd.s16 \arg4,\arg2
+ vadd.s16 \arg5,\arg3
+ vqmovun.s16 \arg0,\arg4
+ vqmovun.s16 \arg1,\arg5
+// }
+.endm
+
+.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+// { // input: src_d[0]~[3], output: e_d[0]~[3];
+ vadd.s16 \arg4, \arg0, \arg2 //int16 e[i][0] = src[0] + src[2];
+ vsub.s16 \arg5, \arg0, \arg2 //int16 e[i][1] = src[0] - src[2];
+ vshr.s16 \arg6, \arg1, #1
+ vshr.s16 \arg7, \arg3, #1
+ vsub.s16 \arg6, \arg6, \arg3 //int16 e[i][2] = (src[1]>>1)-src[3];
+ vadd.s16 \arg7, \arg1, \arg7 //int16 e[i][3] = src[1] + (src[3]>>1);
+// }
+.endm
+
+.macro TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used
+// { // output: f_q[0]~[3], input: e_q[0]~[3];
+ vadd.s16 \arg0, \arg4, \arg7 //int16 f[i][0] = e[i][0] + e[i][3];
+ vadd.s16 \arg1, \arg5, \arg6 //int16 f[i][1] = e[i][1] + e[i][2];
+ vsub.s16 \arg2, \arg5, \arg6 //int16 f[i][2] = e[i][1] - e[i][2];
+ vsub.s16 \arg3, \arg4, \arg7 //int16 f[i][3] = e[i][0] - e[i][3];
+// }
+.endm
+
+
+.macro ROW_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+// { // input: src_d[0]~[3], output: e_q[0]~[3];
+ vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2];
+ vsubl.s16 \arg5, \arg0, \arg2 //int32 e[i][1] = src[0] - src[2];
+ vsubl.s16 \arg6, \arg1, \arg3 //int32 e[i][2] = src[1] - src[3];
+ vaddl.s16 \arg7, \arg1, \arg3 //int32 e[i][3] = src[1] + src[3];
+// }
+.endm
+
+.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
+// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: \arg8 \arg9
+ vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2];
+ vsubl.s16 \arg5, \arg0, \arg2 //int32 e[i][1] = src[0] - src[2];
+ vshr.s16 \arg8, \arg1, #1
+ vshr.s16 \arg9, \arg3, #1
+ vsubl.s16 \arg6, \arg8, \arg3 //int32 e[i][2] = (src[1]>>1)-src[3];
+ vaddl.s16 \arg7, \arg1, \arg9 //int32 e[i][3] = src[1] + (src[3]>>1);
+// }
+.endm
+
+.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used
+// { // output: f_q[0]~[3], input: e_q[0]~[3];
+ vadd.s32 \arg0, \arg4, \arg7 //int16 f[i][0] = e[i][0] + e[i][3];
+ vadd.s32 \arg1, \arg5, \arg6 //int16 f[i][1] = e[i][1] + e[i][2];
+ vsub.s32 \arg2, \arg5, \arg6 //int16 f[i][2] = e[i][1] - e[i][2];
+ vsub.s32 \arg3, \arg4, \arg7 //int16 f[i][3] = e[i][0] - e[i][3];
+// }
+.endm
+
+.macro COL_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+// { // input: src_q[0]~[3], output: e_q[0]~[3];
+ vadd.s32 \arg4, \arg0, \arg2 //int32 e[0][j] = f[0][j] + f[2][j];
+ vsub.s32 \arg5, \arg0, \arg2 //int32 e[1][j] = f[0][j] - f[2][j];
+ vsub.s32 \arg6, \arg1, \arg3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
+ vadd.s32 \arg7, \arg1, \arg3 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
+// }
+.endm
+
+.macro COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+// { // input: src_q[0]~[3], output: e_q[0]~[3];
+ vadd.s32 \arg4, \arg0, \arg2 //int32 e[0][j] = f[0][j] + f[2][j];
+ vsub.s32 \arg5, \arg0, \arg2 //int32 e[1][j] = f[0][j] - f[2][j];
+ vshr.s32 \arg6, \arg1, #1
+ vshr.s32 \arg7, \arg3, #1
+ vsub.s32 \arg6, \arg6, \arg3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
+ vadd.s32 \arg7, \arg1, \arg7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
+// }
+.endm
+#endif
+
+
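+// Block copy routines (8x8, 16x16, 16x8, 8x16). Register usage below:
+// r0 = dst, r1 = dst_stride, r2 = src, r3 = src_stride.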
+WELS_ASM_FUNC_BEGIN WelsCopy8x8_neon
+
+ LORD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3
+
+ STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1
+
+ LORD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3
+
+ STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1
+
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN WelsCopy16x16_neon
+
+ LORD_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
+
+ STORE_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
+
+ LORD_ALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r2, r3
+
+ STORE_ALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r0, r1
+
+ LORD_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
+
+ STORE_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
+
+ LORD_ALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r2, r3
+
+ STORE_ALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r0, r1
+
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN WelsCopy16x16NotAligned_neon
+
+ LORD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
+
+ STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
+
+ LORD_UNALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r2, r3
+
+ STORE_UNALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r0, r1
+
+ LORD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
+
+ STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
+
+ LORD_UNALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r2, r3
+
+ STORE_UNALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r0, r1
+
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN WelsCopy16x8NotAligned_neon
+
+ LORD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
+
+ STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
+
+ LORD_UNALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r2, r3
+
+ STORE_UNALIGNED_DATA_WITH_STRIDE q4, q5, q6, q7, r0, r1
+
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN WelsCopy8x16_neon
+
+ LORD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3
+
+ STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1
+
+ LORD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3
+
+ STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1
+
+ LORD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3
+
+ STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1
+
+ LORD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3
+
+ STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1
+
+WELS_ASM_FUNC_END
+
+
+
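+// 4x4 forward transform of a residual: loads one 4x4 block from [r1] (stride r2)
+// and one from [r3] (stride = 5th argument on the stack), subtracts them, applies
+// the core transform to rows and columns, and stores 16 coefficients to [r0].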
+WELS_ASM_FUNC_BEGIN WelsDctT4_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ LOAD_4x4_DATA_FOR_DCT d4, d5, d6, d7, r1, r2, r3, r4
+
+ vsubl.u8 q0, d4, d6
+ vsubl.u8 q1, d5, d7
+ vtrn.s32 q0, q1
+ vswp d1, d2
+
+ // horizontal transform
+ DCT_ROW_TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
+
+ // transform element
+ MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3
+
+ // vertical transform
+ DCT_ROW_TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
+
+ // transform element
+ MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3
+
+ vst1.s16 {q0, q1}, [r0]!
+
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
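+// Same forward transform applied to an 8x8 area as four 4x4 blocks, processed in
+// two passes of four rows each; sources come from [r1] (stride r2) and [r3]
+// (stride on the stack), coefficients are written to [r0] block by block.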
+WELS_ASM_FUNC_BEGIN WelsDctFourT4_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ LOAD_8x8_DATA_FOR_DCT d8, d9, d10, d11, d12, d13, d14, d15, r1, r3
+
+ vsubl.u8 q0, d8, d12
+ vsubl.u8 q1, d9, d13
+ vsubl.u8 q2, d10, d14
+ vsubl.u8 q3, d11, d15
+ MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3
+
+ // horizontal transform
+ DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7
+
+ // transform element
+ MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3
+
+ // vertical transform
+ DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7
+
+ vswp d1, d2
+ vswp d5, d6
+ vswp q1, q2
+ vst1.s16 {q0, q1}, [r0]!
+ vst1.s16 {q2, q3}, [r0]!
+
+ ////////////////
+ LOAD_8x8_DATA_FOR_DCT d8, d9, d10, d11, d12, d13, d14, d15, r1, r3
+
+ vsubl.u8 q0, d8, d12
+ vsubl.u8 q1, d9, d13
+ vsubl.u8 q2, d10, d14
+ vsubl.u8 q3, d11, d15
+ MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3
+
+ // horizontal transform
+ DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7
+
+ // transform element
+ MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3
+
+ // vertical transform
+ DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7
+
+ vswp d1, d2
+ vswp d5, d6
+ vswp q1, q2
+ vst1.s16 {q0, q1}, [r0]!
+ vst1.s16 {q2, q3}, [r0]!
+
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
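+// Quantizes the 16 coefficients at [r0] in place, using the ff table at [r1] and
+// the mf table at [r2] (per-lane formula: see the NEWQUANT_COEF_EACH_16BITS note).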
+WELS_ASM_FUNC_BEGIN WelsQuant4x4_neon
+ vld1.s16 {q2}, [r1]
+ vld1.s16 {q0, q1}, [r0]
+ vld1.s16 {q3}, [r2]
+
+ vmov q4, q2
+
+ NEWQUANT_COEF_EACH_16BITS q0, q2, d4, d5, d6, d7, q5, q6, q7
+ vst1.s16 {q2}, [r0]!
+
+ NEWQUANT_COEF_EACH_16BITS q1, q4, d8, d9, d6, d7, q5, q6, q7
+ vst1.s16 {q4}, [r0]!
+
+WELS_ASM_FUNC_END
+
+
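+// DC variant of the 4x4 quantizer: ff and mf are scalars in r1/r2, broadcast to
+// every lane instead of being loaded from tables.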
+WELS_ASM_FUNC_BEGIN WelsQuant4x4Dc_neon
+
+ vld1.s16 {q0, q1}, [r0]
+ vdup.s16 q2, r1 // even ff range [0, 768]
+ vdup.s16 q3, r2
+
+ vmov q4, q2
+
+ NEWQUANT_COEF_EACH_16BITS q0, q2, d4, d5, d6, d7, q5, q6, q7
+ vst1.s16 {q2}, [r0]!
+
+ NEWQUANT_COEF_EACH_16BITS q1, q4, d8, d9, d6, d7, q5, q6, q7
+ vst1.s16 {q4}, [r0]!
+
+WELS_ASM_FUNC_END
+
+
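+// Quantizes four consecutive 4x4 blocks (64 coefficients) at [r0] in place,
+// sharing a single ff/mf table pair from [r1]/[r2].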
+WELS_ASM_FUNC_BEGIN WelsQuantFour4x4_neon
+ vld1.s16 {q2}, [r1]
+ vld1.s16 {q3}, [r2]
+ mov r1, r0
+
+ vld1.s16 {q0, q1}, [r0]!
+ vmov q4, q2
+ NEWQUANT_COEF_EACH_16BITS q0, q4, d8, d9, d6, d7, q5, q6, q7
+ vst1.s16 {q4}, [r1]!
+ vmov q4, q2
+ NEWQUANT_COEF_EACH_16BITS q1, q4, d8, d9, d6, d7, q5, q6, q7
+ vst1.s16 {q4}, [r1]!
+
+ vld1.s16 {q0, q1}, [r0]!
+ vmov q4, q2
+ NEWQUANT_COEF_EACH_16BITS q0, q4, d8, d9, d6, d7, q5, q6, q7
+ vst1.s16 {q4}, [r1]!
+ vmov q4, q2
+ NEWQUANT_COEF_EACH_16BITS q1, q4, d8, d9, d6, d7, q5, q6, q7
+ vst1.s16 {q4}, [r1]!
+
+ vld1.s16 {q0, q1}, [r0]!
+ vmov q4, q2
+ NEWQUANT_COEF_EACH_16BITS q0, q4, d8, d9, d6, d7, q5, q6, q7
+ vst1.s16 {q4}, [r1]!
+ vmov q4, q2
+ NEWQUANT_COEF_EACH_16BITS q1, q4, d8, d9, d6, d7, q5, q6, q7
+ vst1.s16 {q4}, [r1]!
+
+ vld1.s16 {q0, q1}, [r0]!
+ vmov q4, q2
+ NEWQUANT_COEF_EACH_16BITS q0, q4, d8, d9, d6, d7, q5, q6, q7
+ vst1.s16 {q4}, [r1]!
+ vmov q4, q2
+ NEWQUANT_COEF_EACH_16BITS q1, q4, d8, d9, d6, d7, q5, q6, q7
+ vst1.s16 {q4}, [r1]!
+
+WELS_ASM_FUNC_END
+
+
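+// As WelsQuantFour4x4_neon, but also tracks the largest quantized magnitude of
+// each 4x4 block and stores the per-block maxima (two int16 at a time) to [r3].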
+WELS_ASM_FUNC_BEGIN WelsQuantFour4x4Max_neon
+ vld1.s16 {q2}, [r1]
+ vld1.s16 {q3}, [r2]
+ mov r1, r0
+
+ vld1.s16 {q0, q1}, [r0]!
+ vmov q4, q2
+ NEWQUANT_COEF_EACH_16BITS_MAX q0, q4, d8, d9, d6, d7, q5, q6, q7, d18
+ vst1.s16 {q4}, [r1]!
+ vmov q8, q2
+ NEWQUANT_COEF_EACH_16BITS_MAX q1, q8, d16, d17, d6, d7, q5, q6, q7, d20
+ vst1.s16 {q8}, [r1]! // then 1st 16 elem in d18 & d20
+
+ vld1.s16 {q0, q1}, [r0]!
+ vmov q4, q2
+ NEWQUANT_COEF_EACH_16BITS_MAX q0, q4, d8, d9, d6, d7, q5, q6, q7, d19
+ vst1.s16 {q4}, [r1]!
+ vmov q8, q2
+ NEWQUANT_COEF_EACH_16BITS_MAX q1, q8, d16, d17, d6, d7, q5, q6, q7, d21
+ vst1.s16 {q8}, [r1]! // then 2nd 16 elem in d19 & d21
+
+ SELECT_MAX_IN_ABS_COEF q9, q10, q0, d0, d1
+ vst1.s32 {d0[0]}, [r3]!
+
+ ///////////
+ vld1.s16 {q0, q1}, [r0]!
+ vmov q4, q2
+ NEWQUANT_COEF_EACH_16BITS_MAX q0, q4, d8, d9, d6, d7, q5, q6, q7, d18
+ vst1.s16 {q4}, [r1]!
+ vmov q8, q2
+ NEWQUANT_COEF_EACH_16BITS_MAX q1, q8, d16, d17, d6, d7, q5, q6, q7, d20
+ vst1.s16 {q8}, [r1]! // then 3rd 16 elem in d18 & d20
+
+ vld1.s16 {q0, q1}, [r0]!
+ vmov q4, q2
+ NEWQUANT_COEF_EACH_16BITS_MAX q0, q4, d8, d9, d6, d7, q5, q6, q7, d19
+ vst1.s16 {q4}, [r1]!
+ vmov q8, q2
+ NEWQUANT_COEF_EACH_16BITS_MAX q1, q8, d16, d17, d6, d7, q5, q6, q7, d21
+ vst1.s16 {q8}, [r1]! // then 4th 16 elem in d19 & d21
+
+ SELECT_MAX_IN_ABS_COEF q9, q10, q0, d0, d1
+ vst1.s32 {d0[0]}, [r3]!
+
+WELS_ASM_FUNC_END
+
+
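+// Collects one coefficient from the start of each of the 16 4x4 blocks in the
+// DCT buffer at [r1] (blocks laid out 32 bytes apart), applies the 4x4 Hadamard
+// transform with a final rounding shift right by 1, and stores 16 results to [r0].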
+WELS_ASM_FUNC_BEGIN WelsHadamardT4Dc_neon
+ push {r2,r3}
+ mov r2, #64 // 2*16*sizeof(int16_t)
+ add r3, r1, #32
+
+ vld1.s16 {d0}, [r1], r2
+ vld1.s16 {d1}, [r3], r2
+ vld1.s16 {d4}, [r1], r2
+ vld1.s16 {d5}, [r3], r2
+ vld1.s16 {d2}, [r1], r2
+ vld1.s16 {d3}, [r3], r2
+ vld1.s16 {d6}, [r1], r2
+ vld1.s16 {d7}, [r3], r2
+ vtrn.16 q0, q2 // d0[0 4], d1[1 5]
+ vtrn.16 q1, q3 // d2[2 6], d3[3 7]
+
+ vld1.s16 {d8}, [r1], r2
+ vld1.s16 {d9}, [r3], r2
+ vld1.s16 {d12}, [r1], r2
+ vld1.s16 {d13}, [r3], r2
+ vld1.s16 {d10}, [r1], r2
+ vld1.s16 {d11}, [r3], r2
+ vld1.s16 {d14}, [r1], r2
+ vld1.s16 {d15}, [r3], r2
+ vtrn.16 q4, q6 // d8[08 12], d9[09 13]
+ vtrn.16 q5, q7 //d10[10 14],d11[11 15]
+
+ vtrn.32 q0, q4 // d0 [0 4 08 12] = dct[idx], d1[1 5 09 13] = dct[idx+16]
+ vtrn.32 q1, q5 // d2 [2 6 10 14] = dct[idx+64], d3[3 7 11 15] = dct[idx+80]
+
+ ROW_TRANSFORM_0_STEP d0, d1, d3, d2, q4, q7, q6, q5
+
+ TRANSFORM_4BYTES q0, q1, q3, q2, q4, q7, q6, q5
+
+ // transform element 32bits
+ vtrn.s32 q0, q1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
+ vtrn.s32 q2, q3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
+ vswp d1, d4 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
+ vswp d3, d6 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
+
+ COL_TRANSFORM_0_STEP q0, q1, q3, q2, q4, q7, q6, q5
+
+ TRANSFORM_4BYTES q0, q1, q3, q2, q4, q7, q6, q5
+
+ vrshrn.s32 d8, q0, #1
+ vrshrn.s32 d9, q1, #1
+ vrshrn.s32 d10, q2, #1
+ vrshrn.s32 d11, q3, #1
+ vst1.16 {q4, q5}, [r0] //store
+
+ pop {r2,r3}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN WelsHadamardQuant2x2_neon
+
+ vdup.s16 d1, r1 //ff
+ vdup.s16 d2, r2 //mf
+ veor d3, d3
+
+ mov r1, #32
+ mov r2, r0
+
+ vld1.s16 {d0[0]}, [r0], r1 //rs[00]
+ vst1.s16 {d3[0]}, [r2], r1 //rs[00]=0
+ vld1.s16 {d0[1]}, [r0], r1 //rs[16]
+ vst1.s16 {d3[0]}, [r2], r1 //rs[16]=0
+ vld1.s16 {d0[2]}, [r0], r1 //rs[32]
+ vst1.s16 {d3[0]}, [r2], r1 //rs[32]=0
+ vld1.s16 {d0[3]}, [r0], r1 //rs[48]
+ vst1.s16 {d3[0]}, [r2], r1 //rs[48]=0
+
+ HDM_QUANT_2x2_TOTAL_16BITS d0, d4, d5 // output d5
+
+ HDM_QUANT_2x2_TOTAL_16BITS d5, d4, d0 // output d0
+
+ QUANT_DUALWORD_COEF_EACH_16BITS d0, d1, d2, d3, q2
+
+ vst1.s16 d1, [r3] // store to dct
+ ldr r2, [sp, #0]
+ vst1.s16 d1, [r2] // store to block
+
+ mov r1, #1
+ vdup.s16 d3, r1
+ DC_ZERO_COUNT_IN_DUALWORD d1, d0, d3
+
+ vmov r0, r1, d0
+ and r0, #0x07 // range [0~4]
+ rsb r0, #4
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN WelsHadamardQuant2x2SkipKernel_neon
+
+ vdup.s16 d3, r1
+ mov r1, #32
+ vld1.s16 {d0[0]}, [r0], r1 //rs[00]
+ vld1.s16 {d0[1]}, [r0], r1 //rs[16]
+ vld1.s16 {d0[2]}, [r0], r1 //rs[32]
+ vld1.s16 {d0[3]}, [r0], r1 //rs[48]
+
+ HDM_QUANT_2x2_TOTAL_16BITS d0, d1, d2 // output d2
+
+ HDM_QUANT_2x2_TOTAL_16BITS d2, d1, d0 // output d0
+
+ vabs.s16 d1, d0
+ vcgt.s16 d1, d1, d3 // abs(dct[i])>threshold;
+ vmov r0, r1, d1
+ orr r0, r1
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN WelsGetNoneZeroCount_neon
+ push {r1}
+ vld1.s16 {q0, q1}, [r0]
+ vmov.s16 q8, #1
+
+ ZERO_COUNT_IN_2_QUARWORD q0, q1, q8, d0, d1, d2, d3
+ vmov r0, r1, d0
+ and r0, #0x1F // range [0~16]
+ rsb r0, #16
+ pop {r1}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN WelsDequant4x4_neon
+ vld1.s16 {q0, q1}, [r0]
+ vld1.u16 {q2}, [r1]
+
+ vmul.s16 q4, q0, q2
+ vmul.s16 q5, q1, q2
+
+ vst1.s16 {q4, q5}, [r0]
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN WelsDequantFour4x4_neon
+ vld1.u16 {q8}, [r1]
+ mov r1, r0
+ vld1.s16 {q0, q1}, [r0]!
+ vld1.s16 {q2, q3}, [r0]!
+ vmul.s16 q0, q0, q8
+ vld1.s16 {q4, q5}, [r0]!
+ vmul.s16 q1, q1, q8
+ vld1.s16 {q6, q7}, [r0]!
+
+ vst1.s16 {q0, q1}, [r1]!
+
+ vmul.s16 q2, q2, q8
+ vmul.s16 q3, q3, q8
+ vmul.s16 q4, q4, q8
+ vst1.s16 {q2, q3}, [r1]!
+
+ vmul.s16 q5, q5, q8
+ vmul.s16 q6, q6, q8
+ vmul.s16 q7, q7, q8
+ vst1.s16 {q4, q5}, [r1]!
+ vst1.s16 {q6, q7}, [r1]!
+
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN WelsDequantIHadamard4x4_neon
+
+ vld1.s16 {q0, q1}, [r0]
+ vdup.s16 q4, r1
+
+ IHDM_4x4_TOTAL_16BITS q0, q2, q3
+ IHDM_4x4_TOTAL_16BITS q1, q2, q3
+
+ MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3
+
+ IHDM_4x4_TOTAL_16BITS q0, q2, q3
+ vmul.s16 q0, q4
+
+ IHDM_4x4_TOTAL_16BITS q1, q2, q3
+ vmul.s16 q1, q4
+
+ MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3
+ vst1.s16 {q0, q1}, [r0]
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN WelsIDctT4Rec_neon
+ vld1.u32 {d14[0]}, [r2], r3
+ push {r4}
+ ldr r4, [sp, #4]
+ vld1.u32 {d14[1]}, [r2], r3
+
+ vld4.s16 {d0, d1, d2, d3}, [r4] // cost 3 cycles!
+ vld1.u32 {d15[0]}, [r2], r3
+ vld1.u32 {d15[1]}, [r2], r3 // q7 is pred
+
+ ROW_TRANSFORM_1_STEP_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
+
+ TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
+
+ MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3
+
+ ROW_TRANSFORM_1_STEP_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
+
+ TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
+ vrshr.s16 d0, d0, #6
+ vrshr.s16 d1, d1, #6
+ vrshr.s16 d2, d2, #6
+ vrshr.s16 d3, d3, #6
+
+ //after the rounding shift by 6, add the prediction and clip into [0, 255]
+ vmovl.u8 q2,d14
+ vadd.s16 q0,q2
+ vqmovun.s16 d14,q0
+ vst1.32 {d14[0]},[r0],r1
+ vst1.32 {d14[1]},[r0],r1
+
+ vmovl.u8 q2,d15
+ vadd.s16 q1,q2
+ vqmovun.s16 d15,q1
+ vst1.32 {d15[0]},[r0],r1
+ vst1.32 {d15[1]},[r0]
+
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN WelsIDctFourT4Rec_neon
+
+ vld1.u64 {d16}, [r2], r3
+ push {r4}
+ ldr r4, [sp, #4]
+ vld1.u64 {d17}, [r2], r3
+
+ vld4.s16 {d0, d1, d2, d3}, [r4]! // cost 3 cycles!
+ vld1.u64 {d18}, [r2], r3
+ vld1.u64 {d19}, [r2], r3
+ vld4.s16 {d4, d5, d6, d7}, [r4]! // cost 3 cycles!
+ vswp d1, d4
+ vswp d3, d6
+ vswp q1, q2 // q0~q3
+
+ ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7
+
+ TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7
+
+ MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3
+
+ ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7
+
+ TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7
+ vrshr.s16 q0, q0, #6
+ vrshr.s16 q1, q1, #6
+ vrshr.s16 q2, q2, #6
+ vrshr.s16 q3, q3, #6
+
+ //after the rounding shift by 6, add the prediction and clip into [0, 255]
+ vmovl.u8 q4,d16
+ vadd.s16 q0,q4
+ vqmovun.s16 d16,q0
+ vst1.u8 {d16},[r0],r1
+
+ vmovl.u8 q4,d17
+ vadd.s16 q1,q4
+ vqmovun.s16 d17,q1
+ vst1.u8 {d17},[r0],r1
+
+ vmovl.u8 q4,d18
+ vadd.s16 q2,q4
+ vqmovun.s16 d18,q2
+ vst1.u8 {d18},[r0],r1
+
+ vmovl.u8 q4,d19
+ vadd.s16 q3,q4
+ vqmovun.s16 d19,q3
+ vst1.u8 {d19},[r0],r1
+
+ vld1.u64 {d16}, [r2], r3
+ vld1.u64 {d17}, [r2], r3
+
+ vld4.s16 {d0, d1, d2, d3}, [r4]! // cost 3 cycles!
+ vld1.u64 {d18}, [r2], r3
+ vld1.u64 {d19}, [r2], r3
+ vld4.s16 {d4, d5, d6, d7}, [r4]! // cost 3 cycles!
+ vswp d1, d4
+ vswp d3, d6
+ vswp q1, q2 // q0~q3
+
+ ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7
+
+ TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7
+
+ MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3
+
+ ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7
+
+ TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q4, q5, q6, q7
+ vrshr.s16 q0, q0, #6
+ vrshr.s16 q1, q1, #6
+ vrshr.s16 q2, q2, #6
+ vrshr.s16 q3, q3, #6
+
+ //after the rounding shift by 6, add the prediction and clip into [0, 255]
+ vmovl.u8 q4,d16
+ vadd.s16 q0,q4
+ vqmovun.s16 d16,q0
+ vst1.u8 {d16},[r0],r1
+
+ vmovl.u8 q4,d17
+ vadd.s16 q1,q4
+ vqmovun.s16 d17,q1
+ vst1.u8 {d17},[r0],r1
+
+ vmovl.u8 q4,d18
+ vadd.s16 q2,q4
+ vqmovun.s16 d18,q2
+ vst1.u8 {d18},[r0],r1
+
+ vmovl.u8 q4,d19
+ vadd.s16 q3,q4
+ vqmovun.s16 d19,q3
+ vst1.u8 {d19},[r0],r1
+
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN WelsIDctRecI16x16Dc_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ vld1.s16 {q8,q9}, [r4]
+ vrshr.s16 q8, q8, #6
+ vrshr.s16 q9, q9, #6
+
+ vdup.s16 d20, d16[0]
+ vdup.s16 d21, d16[1]
+ vdup.s16 d22, d16[2]
+ vdup.s16 d23, d16[3]
+
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
+
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
+
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
+
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
+
+ vdup.s16 d20, d17[0]
+ vdup.s16 d21, d17[1]
+ vdup.s16 d22, d17[2]
+ vdup.s16 d23, d17[3]
+
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
+
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
+
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
+
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
+
+ vdup.s16 d20, d18[0]
+ vdup.s16 d21, d18[1]
+ vdup.s16 d22, d18[2]
+ vdup.s16 d23, d18[3]
+
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
+
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
+
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
+
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
+
+ vdup.s16 d20, d19[0]
+ vdup.s16 d21, d19[1]
+ vdup.s16 d22, d19[2]
+ vdup.s16 d23, d19[3]
+
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
+
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
+
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
+
+ vld1.u8 {q0}, [r2], r3
+ MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
+ vst1.u8 {q0}, [r0], r1
+
+ pop {r4}
+WELS_ASM_FUNC_END
+#endif
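
Editor's note: as a reference for the WelsDequant4x4_neon / WelsDequantFour4x4_neon routines added above, here is a scalar sketch of their data flow. The asm loads a single 8-entry factor vector (q2 / q8) and multiplies every pair of coefficient registers by it, so the dequant table effectively wraps every 8 coefficients. The helper name and loop form are illustrative only, not the project's _c implementation.

    #include <stdint.h>

    // Illustrative scalar equivalent: dequantize a 4x4 block of 16-bit
    // coefficients in place using an 8-entry factor table, mirroring how the
    // NEON code applies one q-register of factors to both halves of the block.
    static void Dequant4x4_sketch (int16_t* pDct, const uint16_t* kpMF) {
      for (int i = 0; i < 16; ++i)
        pDct[i] = (int16_t) (pDct[i] * kpMF[i & 7]);   // table wraps every 8 coeffs
    }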
--- a/codec/encoder/core/inc/sample.h
+++ b/codec/encoder/core/inc/sample.h
@@ -110,6 +110,33 @@
#endif//X86_ASM
+#if defined (HAVE_NEON)
+
+int32_t WelsSampleSad4x4_neon (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSad16x16_neon (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSad16x8_neon (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSad8x16_neon (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSad8x8_neon (uint8_t*, int32_t, uint8_t*, int32_t);
+
+void WelsSampleSadFour16x16_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
+void WelsSampleSadFour16x8_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
+void WelsSampleSadFour8x16_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
+void WelsSampleSadFour8x8_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
+void WelsSampleSadFour4x4_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
+
+int32_t WelsSampleSatd8x8_neon (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSatd16x8_neon (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSatd8x16_neon (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSatd16x16_neon (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSatd4x4_neon (uint8_t*, int32_t, uint8_t*, int32_t);
+
+int32_t WelsIntra16x16Combined3Satd_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*);
+int32_t WelsIntra16x16Combined3Sad_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*);
+int32_t WelsIntra8x8Combined3Satd_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*, uint8_t*, uint8_t*);
+int32_t WelsIntra8x8Combined3Sad_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*, uint8_t*, uint8_t*);
+int32_t WelsIntra4x4Combined3Satd_neon (uint8_t*, int32_t, uint8_t*, int32_t, uint8_t*, int32_t*, int32_t, int32_t, int32_t);
+
+#endif
#if defined(__cplusplus)
}
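
Editor's note: the NEON prototypes above keep the existing (pointer, stride, pointer, stride) SAD/SATD calling convention. As a sketch of the contract the 4x4 SAD entry is expected to meet (illustrative scalar code, not the project's WelsSampleSad4x4_c):

    #include <stdint.h>
    #include <stdlib.h>

    // Sum of absolute differences over a 4x4 block; each pointer advances by
    // its own stride per row. Names are illustrative.
    static int32_t SampleSad4x4_sketch (uint8_t* pSrc, int32_t iSrcStride,
                                        uint8_t* pRef, int32_t iRefStride) {
      int32_t iSad = 0;
      for (int32_t y = 0; y < 4; ++y) {
        for (int32_t x = 0; x < 4; ++x)
          iSad += abs ((int32_t) pSrc[x] - (int32_t) pRef[x]);
        pSrc += iSrcStride;
        pRef += iRefStride;
      }
      return iSad;
    }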
--- a/codec/encoder/core/src/sample.cpp
+++ b/codec/encoder/core/src/sample.cpp
@@ -482,6 +482,33 @@
#endif //(X86_ASM)
+#if defined (HAVE_NEON)
+ if (uiCpuFlag & WELS_CPU_NEON) {
+ pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_4x4 ] = WelsSampleSad4x4_neon;
+ pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16] = WelsSampleSad16x16_neon;
+ pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x8 ] = WelsSampleSad16x8_neon;
+ pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x16] = WelsSampleSad8x16_neon;
+ pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8] = WelsSampleSad8x8_neon;
+
+ pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x16] = WelsSampleSadFour16x16_neon;
+ pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x8] = WelsSampleSadFour16x8_neon;
+ pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x16] = WelsSampleSadFour8x16_neon;
+ pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x8] = WelsSampleSadFour8x8_neon;
+ pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_4x4] = WelsSampleSadFour4x4_neon;
+
+ pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x4 ] = WelsSampleSatd4x4_neon;
+ pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x8 ] = WelsSampleSatd8x8_neon;
+ pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16 ] = WelsSampleSatd8x16_neon;
+ pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x8 ] = WelsSampleSatd16x8_neon;
+ pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] = WelsSampleSatd16x16_neon;
+
+ pFuncList->sSampleDealingFuncs.pfIntra4x4Combined3Satd = WelsIntra4x4Combined3Satd_neon;
+ pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Satd = WelsIntra8x8Combined3Satd_neon;
+ pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Sad = WelsIntra8x8Combined3Sad_neon;
+ pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Satd = WelsIntra16x16Combined3Satd_neon;
+ pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Sad = WelsIntra16x16Combined3Sad_neon;
+ }
+#endif
}
} // namespace WelsSVCEnc
--- a/codec/processing/src/adaptivequantization/AdaptiveQuantization.cpp
+++ b/codec/processing/src/adaptivequantization/AdaptiveQuantization.cpp
@@ -231,6 +231,11 @@
pfVar = SampleVariance16x16_sse2;
}
#endif
+#ifdef HAVE_NEON
+ if (iCpuFlag & WELS_CPU_NEON) {
+ pfVar = SampleVariance16x16_neon;
+ }
+#endif
}
void SampleVariance16x16_c (uint8_t* pRefY, int32_t iRefStride, uint8_t* pSrcY, int32_t iSrcStride,
--- a/codec/processing/src/adaptivequantization/AdaptiveQuantization.h
+++ b/codec/processing/src/adaptivequantization/AdaptiveQuantization.h
@@ -62,6 +62,11 @@
WELSVP_EXTERN_C_END
#endif
+#ifdef HAVE_NEON
+WELSVP_EXTERN_C_BEGIN
+VarFunc SampleVariance16x16_neon;
+WELSVP_EXTERN_C_END
+#endif
class CAdaptiveQuantization : public IStrategy {
public:
--- a/codec/processing/src/arm/adaptive_quantization.S
+++ b/codec/processing/src/arm/adaptive_quantization.S
@@ -35,7 +35,7 @@
#include "arm_arch_common_macro.S"
#ifdef APPLE_IOS
-.macro SQR_ADD_16BYTES
+.macro SQR_ADD_16BYTES
vmull.u8 q3, $0, $0
vmull.u8 q8, $1, $1
vpadal.u16 $2, q3
@@ -51,23 +51,23 @@
#endif
-WELS_ASM_FUNC_BEGIN pixel_var_16x16_neon
+WELS_ASM_FUNC_BEGIN SampleVariance16x16_neon
stmdb sp!, {r4}
vld1.8 {q15}, [r0], r1 //save the ref data (16bytes)
vld1.8 {q14}, [r2], r3 //save the src data (16bytes)
-
-
- vabd.u8 q13, q14, q15
+
+
+ vabd.u8 q13, q14, q15
vmull.u8 q12, d27, d27
vmull.u8 q11, d26, d26
vaddl.u16 q12, d24, d25
vpadal.u16 q12, q11 //sqr
- vaddl.u8 q13, d26, d27 //sum
-
+ vaddl.u8 q13, d26, d27 //sum
+
vaddl.u8 q10, d28, d29 //sum_cur
-
+
vmull.u8 q9, d29, d29
vmull.u8 q8, d28, d28
vaddl.u16 q9, d18, d19 //sqr_cur
@@ -78,35 +78,35 @@
vld1.8 {q0}, [r0], r1 //save the ref data (16bytes)
vld1.8 {q1}, [r2], r3 //save the src data (16bytes)
-
+
vabd.u8 q2, q0, q1
-
+
//q10 save sum_cur
vpadal.u8 q10, q1
//q12 save sqr
SQR_ADD_16BYTES d4, d5, q12
-
+
//q13 save sum
vpadal.u8 q13, q2
subs r4, #1
-
- //q9 save sqr_cur
- SQR_ADD_16BYTES d2, d3, q9
-
- bne pixel_var_16x16_loop0
-
+
+ //q9 save sqr_cur
+ SQR_ADD_16BYTES d2, d3, q9
+
+ bne pixel_var_16x16_loop0
+
vadd.u16 d0, d26, d27 //sum
- vadd.u16 d1, d20, d21 //sum_cur
+ vadd.u16 d1, d20, d21 //sum_cur
vpaddl.u16 q0, q0
vadd.u32 d2, d24, d25 //sqr
vadd.u32 d3, d18, d19 //sqr_cur
vpadd.u32 d0, d0, d1
vpadd.u32 d1, d2, d3
-
+
ldr r4, [sp, #4]
-
+
vshr.u32 q0, q0, #8
vmul.u32 d0, d0
vsub.u32 d0, d1, d0
@@ -117,4 +117,4 @@
WELS_ASM_FUNC_END
-#endif
\ No newline at end of file
+#endif
--- a/codec/processing/src/arm/down_sample_neon.S
+++ b/codec/processing/src/arm/down_sample_neon.S
@@ -35,29 +35,29 @@
#include "arm_arch_common_macro.S"
-WELS_ASM_FUNC_BEGIN comp_ds_bilinear_neon
+WELS_ASM_FUNC_BEGIN DyadicBilinearDownsampler_neon
stmdb sp!, {r4-r8, lr}
-
+
//Get the width and height
ldr r4, [sp, #24] //src_width
ldr r5, [sp, #28] //src_height
-
+
//Initialize the register
mov r6, r2
mov r8, r0
mov lr, #0
- lsr r5, #1
-
+ lsr r5, #1
+
//Save the tailer for the unasigned size
mla r7, r1, r5, r0
vld1.32 {q15}, [r7]
-
+
add r7, r2, r3
//processing a colume data
-comp_ds_bilinear_loop0:
+comp_ds_bilinear_loop0:
vld1.8 {q0,q1}, [r2]!
- vld1.8 {q2,q3}, [r7]!
+ vld1.8 {q2,q3}, [r7]!
vpaddl.u8 q0, q0
vpaddl.u8 q1, q1
vpaddl.u8 q2, q2
@@ -70,9 +70,9 @@
vrhadd.u16 q1, q3
vmovn.u16 d0, q0
vmovn.u16 d1, q1
- vst1.32 {q0}, [r0]!
+ vst1.32 {q0}, [r0]!
add lr, #32
-
+
cmp lr, r4
movcs lr, #0
addcs r6, r3, lsl #1
@@ -82,10 +82,10 @@
movcs r0, r8
subscs r5, #1
bne comp_ds_bilinear_loop0
-
+
//restore the tailer for the unasigned size
vst1.32 {q15}, [r0]
-
+
ldmia sp!, {r4-r8,lr}
WELS_ASM_FUNC_END
@@ -96,29 +96,29 @@
//Get the width and height
ldr r4, [sp, #20] //src_width
ldr r5, [sp, #24] //src_height
-
+
//Get the difference
- sub lr, r3, r4
+ sub lr, r3, r4
sub r1, r1, r4, lsr #1
-
+
lsr r5, #1
-
+
//processing a colume data
-comp_ds_bilinear_w_x8_loop0:
-
+comp_ds_bilinear_w_x8_loop0:
+
lsr r6, r4, #3
add r7, r2, r3
//processing a line data
comp_ds_bilinear_w_x8_loop1:
-
+
vld1.8 {d0}, [r2]!
- vld1.8 {d1}, [r7]!
+ vld1.8 {d1}, [r7]!
vpaddl.u8 q0, q0
vrshr.u16 q0, #1
vrhadd.u16 d0, d1
-
+
vmovn.u16 d0, q0
- vst1.32 {d0[0]}, [r0]!
+ vst1.32 {d0[0]}, [r0]!
subs r6, #1
bne comp_ds_bilinear_w_x8_loop1
@@ -126,7 +126,7 @@
add r0, r1
subs r5, #1
bne comp_ds_bilinear_w_x8_loop0
-
+
ldmia sp!, {r4-r7,lr}
WELS_ASM_FUNC_END
@@ -137,31 +137,31 @@
//Get the width and height
ldr r4, [sp, #20] //src_width
ldr r5, [sp, #24] //src_height
-
+
//Get the difference
- sub lr, r3, r4
+ sub lr, r3, r4
sub r1, r1, r4, lsr #1
-
+
lsr r5, #1
-
+
//processing a colume data
-comp_ds_bilinear_w_x16_loop0:
-
+comp_ds_bilinear_w_x16_loop0:
+
lsr r6, r4, #4
add r7, r2, r3
//processing a line data
comp_ds_bilinear_w_x16_loop1:
-
+
vld1.8 {q0}, [r2]!
- vld1.8 {q1}, [r7]!
+ vld1.8 {q1}, [r7]!
vpaddl.u8 q0, q0
vpaddl.u8 q1, q1
vrshr.u16 q0, #1
vrshr.u16 q1, #1
vrhadd.u16 q0, q1
-
+
vmovn.u16 d0, q0
- vst1.32 {d0}, [r0]!
+ vst1.32 {d0}, [r0]!
subs r6, #1
bne comp_ds_bilinear_w_x16_loop1
@@ -169,34 +169,34 @@
add r0, r1
subs r5, #1
bne comp_ds_bilinear_w_x16_loop0
-
+
ldmia sp!, {r4-r7,lr}
WELS_ASM_FUNC_END
-WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x32_neon
+WELS_ASM_FUNC_BEGIN DyadicBilinearDownsamplerWidthx32_neon
stmdb sp!, {r4-r7, lr}
//Get the width and height
ldr r4, [sp, #20] //src_width
ldr r5, [sp, #24] //src_height
-
+
//Get the difference
- sub lr, r3, r4
+ sub lr, r3, r4
sub r1, r1, r4, lsr #1
-
+
lsr r5, #1
-
+
//processing a colume data
-comp_ds_bilinear_w_x32_loop0:
-
+comp_ds_bilinear_w_x32_loop0:
+
lsr r6, r4, #5
add r7, r2, r3
//processing a line data
comp_ds_bilinear_w_x32_loop1:
-
+
vld1.8 {q0,q1}, [r2]!
- vld1.8 {q2,q3}, [r7]!
+ vld1.8 {q2,q3}, [r7]!
vpaddl.u8 q0, q0
vpaddl.u8 q1, q1
vpaddl.u8 q2, q2
@@ -207,10 +207,10 @@
vrshr.u16 q3, #1
vrhadd.u16 q0, q2
vrhadd.u16 q1, q3
-
+
vmovn.u16 d0, q0
vmovn.u16 d1, q1
- vst1.32 {q0}, [r0]!
+ vst1.32 {q0}, [r0]!
subs r6, #1
bne comp_ds_bilinear_w_x32_loop1
@@ -218,14 +218,14 @@
add r0, r1
subs r5, #1
bne comp_ds_bilinear_w_x32_loop0
-
+
ldmia sp!, {r4-r7,lr}
WELS_ASM_FUNC_END
-WELS_ASM_FUNC_BEGIN general_ds_bilinear_accurate_neon
+WELS_ASM_FUNC_BEGIN GeneralBilinearAccurateDownsampler_neon
stmdb sp!, {r4-r12, lr}
-
+
//Get the data from stack
ldr r4, [sp, #40] //the addr of src
ldr r5, [sp, #44] //the value of src_stride
@@ -245,11 +245,11 @@
and r9, r7, r10 // r9 vinc(scaleY mod 32767)
mov r11, #-1
mul r11, r9 // r11 -vinc
-
+
vdup.s16 d2, r9
vdup.s16 d3, r11
vext.8 d5, d3, d2, #4 // vinc vinc -vinc -vinc
-
+
mov r11, #0x40000000
mov r12, #0x4000
sub r12, #1
@@ -261,13 +261,13 @@
sub r11, #1
vdup.s16 d9, r11
vext.8 d7, d9, d8, #4 //init v 16384 16384 16383 16383
-
- veor q14, q14
- sub r1, r2 // stride - width
+
+ veor q14, q14
+ sub r1, r2 // stride - width
mov r8, #16384 // yInverse
sub r3, #1
-
-_HEIGHT:
+
+_HEIGHT:
ldr r4, [sp, #40] //the addr of src
mov r11, r8
lsr r11, #15
@@ -275,8 +275,8 @@
add r11, r4 // get current row address
mov r12, r11
add r12, r5
-
- mov r9, #16384 // xInverse
+
+ mov r9, #16384 // xInverse
sub r10, r2, #1
vmov.s16 d6, d1
@@ -288,8 +288,8 @@
add r4, r12,lr
vld2.8 {d28[4],d29[4]}, [r4] //q14: 000d000b000c000a;
vzip.32 d28, d29 //q14: 000d000c000b000a;
-
- vmull.u16 q13, d6, d7 //q13: init u * init v
+
+ vmull.u16 q13, d6, d7 //q13: init u * init v
vmull.u32 q12, d26,d28
vmlal.u32 q12, d27,d29
vqadd.u64 d24, d24,d25
@@ -296,13 +296,13 @@
vrshr.u64 d24, #30
vst1.8 {d24[0]}, [r0]!
- add r9, r6
+ add r9, r6
vadd.u16 d6, d0 // inc u
vshl.u16 d6, #1
vshr.u16 d6, #1
subs r10, #1
bne _WIDTH
-
+
WIDTH_END:
lsr r9, #15
add r4,r11,r9
@@ -317,26 +317,26 @@
subs r3, #1
bne _HEIGHT
-LAST_ROW:
+LAST_ROW:
ldr r4, [sp, #40] //the addr of src
lsr r8, #15
mul r8, r5
- add r4, r8 // get current row address
+ add r4, r8 // get current row address
mov r9, #16384
_LAST_ROW_WIDTH:
mov r11, r9
lsr r11, #15
-
+
add r3, r4,r11
vld1.8 {d0[0]}, [r3]
- vst1.8 {d0[0]}, [r0]
- add r0, #1
- add r9, r6
+ vst1.8 {d0[0]}, [r0]
+ add r0, #1
+ add r9, r6
subs r2, #1
bne _LAST_ROW_WIDTH
-
+
ldmia sp!, {r4-r12, lr}
WELS_ASM_FUNC_END
-#endif
\ No newline at end of file
+#endif
--- a/codec/processing/src/arm/pixel_sad_neon.S
+++ b/codec/processing/src/arm/pixel_sad_neon.S
@@ -35,24 +35,24 @@
#include "arm_arch_common_macro.S"
-WELS_ASM_FUNC_BEGIN pixel_sad_8x8_neon
+WELS_ASM_FUNC_BEGIN WelsSampleSad8x8_neon
stmdb sp!, {lr}
//Loading a horizontal line data (8 bytes)
- vld1.8 {d0}, [r0], r1
+ vld1.8 {d0}, [r0], r1
vld1.8 {d1}, [r2], r3
-
+
//Do the SAD for 8 bytes
vabdl.u8 q1, d0, d1
-
+
mov lr, #7
pixel_sad_8x8_loop0:
//Loading a horizontal line data (8 bytes)
- vld1.8 {d0}, [r0], r1
+ vld1.8 {d0}, [r0], r1
vld1.8 {d1}, [r2], r3
subs lr, #1
-
+
//Do the SAD for 8 bytes
vabal.u8 q1, d0, d1
bne pixel_sad_8x8_loop0
@@ -65,4 +65,4 @@
ldmia sp!, {lr}
WELS_ASM_FUNC_END
-#endif
\ No newline at end of file
+#endif
--- a/codec/processing/src/arm/vaa_calc_neon.S
+++ b/codec/processing/src/arm/vaa_calc_neon.S
@@ -36,7 +36,7 @@
#ifdef APPLE_IOS
-.macro ABS_SUB_SUM_16BYTES
+.macro ABS_SUB_SUM_16BYTES
vld1.32 {q15}, [$0], $2
vld1.32 {q14}, [$1], $2
vabal.u8 $3, d30, d28
@@ -43,22 +43,22 @@
vabal.u8 $4, d31, d29
.endm
-.macro ABS_SUB_SUM_8x16BYTES
+.macro ABS_SUB_SUM_8x16BYTES
vld1.32 {q15}, [$0], $2
vld1.32 {q14}, [$1], $2
vabdl.u8 $3, d30, d28
vabdl.u8 $4, d31, d29
-
+
ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
- ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
+ ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
.endm
-.macro SAD_8X16BITS
+.macro SAD_8X16BITS
vadd.u16 d31, $0, $1
vpaddl.u16 d31, d31
vpaddl.u32 $2, d31
@@ -73,19 +73,19 @@
vabal.u8 \arg4, d31, d29
.endm
-.macro ABS_SUB_SUM_8x16BYTES arg0, arg1, arg2, arg3, arg4
+.macro ABS_SUB_SUM_8x16BYTES arg0, arg1, arg2, arg3, arg4
vld1.32 {q15}, [\arg0], \arg2
vld1.32 {q14}, [\arg1], \arg2
vabdl.u8 \arg3, d30, d28
vabdl.u8 \arg4, d31, d29
-
+
ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
- ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
+ ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
.endm
.macro SAD_8X16BITS arg0, arg1, arg2
@@ -96,67 +96,67 @@
#endif
-WELS_ASM_FUNC_BEGIN vaa_calc_sad_neon
+WELS_ASM_FUNC_BEGIN VAACalcSad_neon
- stmdb sp!, {r4-r8}
-
+ stmdb sp!, {r4-r8}
+
ldr r4, [sp, #20] //load pic_stride
ldr r5, [sp, #28] //load psad8x8
-
+
//Initial the Q4 register for save the "psadframe"
vmov.s64 q4, #0
-
+
//Get the jump distance to use on loop codes
lsl r8, r4, #4
sub r7, r8, #16 //R7 keep the 16*pic_stride-16
sub r8, r2 //R8 keep the 16*pic_stride-pic_width
-
+
vaa_calc_sad_loop0:
//R6 keep the pic_width
mov r6, r2
-
-vaa_calc_sad_loop1:
+vaa_calc_sad_loop1:
+
//Process the 16x16 bytes
ABS_SUB_SUM_8x16BYTES r0, r1, r4, q0, q1
ABS_SUB_SUM_8x16BYTES r0, r1, r4, q2, q3
-
+
//Do the SAD
SAD_8X16BITS d0, d1, d0
SAD_8X16BITS d2, d3, d1
SAD_8X16BITS d4, d5, d2
- SAD_8X16BITS d6, d7, d3
-
+ SAD_8X16BITS d6, d7, d3
+
//Write to "psad8x8" buffer
- vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r5]!
-
+ vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r5]!
+
//Adjust the input address
sub r0, r7
sub r1, r7
-
+
subs r6, #16
-
- //Save to calculate "psadframe"
+
+ //Save to calculate "psadframe"
vadd.u32 q0, q1
vadd.u32 q4, q0
-
+
bne vaa_calc_sad_loop1
-
+
//Adjust the input address
add r0, r8
add r1, r8
-
+
subs r3, #16
- bne vaa_calc_sad_loop0
-
+ bne vaa_calc_sad_loop0
+
ldr r6, [sp, #24] //load psadframe
vadd.u32 d8, d9
vst1.32 {d8[0]}, [r6]
-
+
ldmia sp!, {r4-r8}
-
+
WELS_ASM_FUNC_END
@@ -164,12 +164,12 @@
.macro SAD_SD_MAD_16BYTES
vld1.32 {q0}, [$0], $2
vld1.32 {q1}, [$1], $2
-
+
vpadal.u8 $3, q0
vpadal.u8 $4, q1
-
- vabd.u8 q0, q0, q1
- vmax.u8 $5, q0
+
+ vabd.u8 q0, q0, q1
+ vmax.u8 $5, q0
vpadal.u8 $6, q0
.endm
@@ -177,13 +177,13 @@
vld1.32 {q0}, [$0], $2
vld1.32 {q1}, [$1], $2
- vpaddl.u8 q2, q0
+ vpaddl.u8 q2, q0
vpaddl.u8 q3, q1
-
- vabd.u8 $3, q0, q1
+
+ vabd.u8 $3, q0, q1
vpaddl.u8 $4, $3 //abs_diff
-
+
SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
@@ -191,7 +191,7 @@
SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
-
+
vsub.u16 $5, q2, q3
.endm
@@ -203,18 +203,18 @@
vpaddl.u16 $3, $3
vpaddl.u32 $3, $3
vpaddl.s16 $4, $4
- vpaddl.s32 $4, $4
+ vpaddl.s32 $4, $4
.endm
#else
-.macro SAD_SD_MAD_16BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6
+.macro SAD_SD_MAD_16BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6
vld1.32 {q0}, [\arg0], \arg2
vld1.32 {q1}, [\arg1], \arg2
-
+
vpadal.u8 \arg3, q0
vpadal.u8 \arg4, q1
-
- vabd.u8 q0, q0, q1
- vmax.u8 \arg5, q0
+
+ vabd.u8 q0, q0, q1
+ vmax.u8 \arg5, q0
vpadal.u8 \arg6, q0
.endm
@@ -222,13 +222,13 @@
vld1.32 {q0}, [\arg0], \arg2
vld1.32 {q1}, [\arg1], \arg2
- vpaddl.u8 q2, q0
+ vpaddl.u8 q2, q0
vpaddl.u8 q3, q1
-
- vabd.u8 \arg3, q0, q1
+
+ vabd.u8 \arg3, q0, q1
vpaddl.u8 \arg4, \arg3 //abs_diff
-
+
SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4
SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4
SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4
@@ -236,7 +236,7 @@
SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4
SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4
SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4
-
+
vsub.u16 \arg5, q2, q3
.endm
@@ -248,69 +248,69 @@
vpaddl.u16 \arg3, \arg3
vpaddl.u32 \arg3, \arg3
vpaddl.s16 \arg4, \arg4
- vpaddl.s32 \arg4, \arg4
+ vpaddl.s32 \arg4, \arg4
.endm
#endif
-WELS_ASM_FUNC_BEGIN vaa_calc_sad_bgd_neon
+WELS_ASM_FUNC_BEGIN VAACalcSadBgd_neon
stmdb sp!, {r4-r10}
-
+
ldr r4, [sp, #28] //load pic_stride
ldr r5, [sp, #36] //load psad8x8
ldr r6, [sp, #40] //load psd8x8
ldr r7, [sp, #44] //load pmad8x8
-
+
//Initial the Q4 register for save the "psadframe"
vmov.s64 q15, #0
-
+
//Get the jump distance to use on loop codes
lsl r10, r4, #4
sub r9, r10, #16 //R9 keep the 16*pic_stride-16
sub r10, r2 //R10 keep the 16*pic_stride-pic_width
-
+
vaa_calc_sad_bgd_loop0:
//R6 keep the pic_width
mov r8, r2
-
-vaa_calc_sad_bgd_loop1:
+vaa_calc_sad_bgd_loop1:
+
//Process the 16x16 bytes pmad psad psd
SAD_SD_MAD_8x16BYTES r0, r1, r4, q13, q11, q9
SAD_SD_MAD_8x16BYTES r0, r1, r4, q14, q12, q10
-
+
SAD_SD_MAD_CALC d26, d27, d16, q11, q9
- SAD_SD_MAD_CALC d28, d29, d17, q12, q10
+ SAD_SD_MAD_CALC d28, d29, d17, q12, q10
//Write to "psad8x8" buffer
- vst4.32 {d22[0],d23[0],d24[0],d25[0]}, [r5]!
+ vst4.32 {d22[0],d23[0],d24[0],d25[0]}, [r5]!
//Adjust the input address
sub r0, r9
sub r1, r9
//Write to "psd8x8" buffer
- vst4.32 {d18[0],d19[0],d20[0],d21[0]}, [r6]!
+ vst4.32 {d18[0],d19[0],d20[0],d21[0]}, [r6]!
subs r8, #16
//Write to "pmad8x8" buffer
- vst2.16 {d16[0],d17[0]}, [r7]!
- //Save to calculate "psadframe"
+ vst2.16 {d16[0],d17[0]}, [r7]!
+ //Save to calculate "psadframe"
vadd.u32 q11, q12
vadd.u32 q15, q11
-
+
bne vaa_calc_sad_bgd_loop1
-
+
//Adjust the input address
add r0, r10
add r1, r10
-
+
subs r3, #16
- bne vaa_calc_sad_bgd_loop0
-
+ bne vaa_calc_sad_bgd_loop0
+
ldr r8, [sp, #32] //load psadframe
vadd.u32 d30, d31
- vst1.32 {d30[0]}, [r8]
+ vst1.32 {d30[0]}, [r8]
ldmia sp!, {r4-r10}
-
+
WELS_ASM_FUNC_END
@@ -318,7 +318,7 @@
.macro SSD_MUL_SUM_16BYTES_RESET
vmull.u8 $3, $0, $0
vpaddl.u16 $2, $3
-
+
vmull.u8 $3, $1, $1
vpadal.u16 $2, $3
.endm
@@ -326,7 +326,7 @@
.macro SSD_MUL_SUM_16BYTES
vmull.u8 $3, $0, $0
vpadal.u16 $2, $3
-
+
vmull.u8 $3, $1, $1
vpadal.u16 $2, $3
.endm
@@ -333,21 +333,21 @@
.macro SAD_SSD_BGD_16
vld1.8 {q0}, [$0], $2 //load cur_row
-
+
vpadal.u8 q3, q0 //add cur_row together
vpadal.u8 q4, q1 //add ref_row together
-
+
vabd.u8 q2, q0, q1 //abs_diff
-
+
vmax.u8 q5, q2 //l_mad for 16 bytes reset for every 8x16
-
+
vpadal.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16
-
+
SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16
-
+
vld1.8 {q1}, [$1], $2 //load ref_row
vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
-
+
SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm
@@ -354,20 +354,20 @@
//the last row of a 16x16 block
.macro SAD_SSD_BGD_16_end
vld1.8 {q0}, [$0], $1 //load cur_row
-
+
vpadal.u8 q3, q0 //add cur_row together
vpadal.u8 q4, q1 //add ref_row together
-
+
vabd.u8 q2, q0, q1 //abs_diff
-
+
vmax.u8 q5, q2 //l_mad for 16 bytes reset for every 8x16
-
+
vpadal.u8 $2, q2 //l_sad for 16 bytes reset for every 8x16
-
+
SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16
-
+
vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
-
+
SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm
@@ -374,23 +374,23 @@
//for the begin of a 8x16 block, use some instructions to reset the register
.macro SAD_SSD_BGD_16_RESET_8x8
vld1.8 {q0}, [$0], $2 //load cur_row
-
+
vpaddl.u8 q3, q0 //add cur_row together
vpaddl.u8 q4, q1 //add ref_row together
-
+
vabd.u8 q2, q0, q1 //abs_diff
-
+
vmov q5,q2 //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
-
+
vpaddl.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16
-
-
+
+
SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16
-
+
vld1.8 {q1}, [$1], $2 //load ref_row
-
+
vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
-
+
SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm
@@ -398,18 +398,18 @@
.macro SAD_SSD_BGD_16_RESET_16x16
vld1.8 {q0}, [$0], $2 //load cur_row
vld1.8 {q1}, [$1], $2 //load ref_row
-
+
vpaddl.u8 q3, q0 //add cur_row together
vpaddl.u8 q4, q1 //add ref_row together
-
+
vabd.u8 q2, q0, q1 //abs_diff
-
+
vmov q5,q2 //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
-
+
vpaddl.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16
-
+
SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16
-
+
vld1.8 {q1}, [$1], $2 //load ref_row
vpaddl.u8 q9, q0 //q9 for l_sum reset for every 16x16
@@ -419,24 +419,24 @@
//for each 8x16 block
.macro SAD_SSD_BGD_CALC_8x16
-
+
vpmax.u8 d10, d10, d11 //4 numbers
vpmax.u8 d10, d10, d10 //2 numbers
vpmax.u8 d10, d10, d10 //1 number1
-
+
vmov $0, d10 //d26 d27 keeps the l_mad
-
+
//p_sd8x8 fix me
- vpaddl.u16 q3, q3
+ vpaddl.u16 q3, q3
vpaddl.u16 q4, q4
-
+
vsub.i32 $1, q3, q4
vpaddl.u32 $1, $1
-
+
//psad8x8
vpaddl.u16 $2, $2
vpaddl.u32 $2, $2
-
+
//psadframe
vadd.i32 q12, $2
.endm
@@ -451,9 +451,9 @@
SAD_SSD_BGD_16 $0, $1, $2, q6
SAD_SSD_BGD_16 $0, $1, $2, q6
SAD_SSD_BGD_16 $0, $1, $2, q6
-
+
SAD_SSD_BGD_CALC_8x16 d26, q14, q6
-
+
//for another 8x16
SAD_SSD_BGD_16_RESET_8x8 $0, $1, $2, q7
SAD_SSD_BGD_16 $0, $1, $2, q7
@@ -463,20 +463,20 @@
SAD_SSD_BGD_16 $0, $1, $2, q7
SAD_SSD_BGD_16 $0, $1, $2, q7
SAD_SSD_BGD_16_end $0, $2, q7
-
+
SAD_SSD_BGD_CALC_8x16 d27, q15, q7
.endm
-.macro SSD_SAD_SD_MAD_PADDL
+.macro SSD_SAD_SD_MAD_PADDL
vpaddl.s16 $0, $0
- vpaddl.s32 $0, $0
- vadd.i32 $1, $1, $2
+ vpaddl.s32 $0, $0
+ vadd.i32 $1, $1, $2
.endm
#else
.macro SSD_MUL_SUM_16BYTES_RESET arg0, arg1, arg2, arg3
vmull.u8 \arg3, \arg0, \arg0
vpaddl.u16 \arg2, \arg3
-
+
vmull.u8 \arg3, \arg1, \arg1
vpadal.u16 \arg2, \arg3
.endm
@@ -484,7 +484,7 @@
.macro SSD_MUL_SUM_16BYTES arg0, arg1, arg2, arg3
vmull.u8 \arg3, \arg0, \arg0
vpadal.u16 \arg2, \arg3
-
+
vmull.u8 \arg3, \arg1, \arg1
vpadal.u16 \arg2, \arg3
.endm
@@ -491,21 +491,21 @@
.macro SAD_SSD_BGD_16 arg0, arg1, arg2, arg3
vld1.8 {q0}, [\arg0], \arg2 //load cur_row
-
+
vpadal.u8 q3, q0 //add cur_row together
vpadal.u8 q4, q1 //add ref_row together
-
+
vabd.u8 q2, q0, q1 //abs_diff
-
+
vmax.u8 q5, q2 //l_mad for 16 bytes reset for every 8x16
-
+
vpadal.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16
-
+
SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16
-
+
vld1.8 {q1}, [\arg1], \arg2 //load ref_row
vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
-
+
SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm
@@ -512,20 +512,20 @@
//the last row of a 16x16 block
.macro SAD_SSD_BGD_16_end arg0, arg1, arg2
vld1.8 {q0}, [\arg0], \arg1 //load cur_row
-
+
vpadal.u8 q3, q0 //add cur_row together
vpadal.u8 q4, q1 //add ref_row together
-
+
vabd.u8 q2, q0, q1 //abs_diff
-
+
vmax.u8 q5, q2 //l_mad for 16 bytes reset for every 8x16
-
+
vpadal.u8 \arg2, q2 //l_sad for 16 bytes reset for every 8x16
-
+
SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16
-
+
vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
-
+
SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm
@@ -532,23 +532,23 @@
//for the begin of a 8x16 block, use some instructions to reset the register
.macro SAD_SSD_BGD_16_RESET_8x8 arg0, arg1, arg2, arg3
vld1.8 {q0}, [\arg0], \arg2 //load cur_row
-
+
vpaddl.u8 q3, q0 //add cur_row together
vpaddl.u8 q4, q1 //add ref_row together
-
+
vabd.u8 q2, q0, q1 //abs_diff
-
+
vmov q5,q2 //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
-
+
vpaddl.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16
-
-
+
+
SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16
-
+
vld1.8 {q1}, [\arg1], \arg2 //load ref_row
-
+
vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
-
+
SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm
@@ -556,18 +556,18 @@
.macro SAD_SSD_BGD_16_RESET_16x16 arg0, arg1, arg2, arg3
vld1.8 {q0}, [\arg0], \arg2 //load cur_row
vld1.8 {q1}, [\arg1], \arg2 //load ref_row
-
+
vpaddl.u8 q3, q0 //add cur_row together
vpaddl.u8 q4, q1 //add ref_row together
-
+
vabd.u8 q2, q0, q1 //abs_diff
-
+
vmov q5,q2 //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
-
+
vpaddl.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16
-
+
SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16
-
+
vld1.8 {q1}, [\arg1], \arg2 //load ref_row
vpaddl.u8 q9, q0 //q9 for l_sum reset for every 16x16
@@ -577,24 +577,24 @@
//for each 8x16 block
.macro SAD_SSD_BGD_CALC_8x16 arg0, arg1, arg2
-
+
vpmax.u8 d10, d10, d11 //4 numbers
vpmax.u8 d10, d10, d10 //2 numbers
vpmax.u8 d10, d10, d10 //1 number1
-
+
vmov \arg0, d10 //d26 d27 keeps the l_mad
-
+
//p_sd8x8
- vpaddl.u16 q3, q3
+ vpaddl.u16 q3, q3
vpaddl.u16 q4, q4
-
+
vsub.i32 \arg1, q3, q4
vpaddl.u32 \arg1, \arg1
-
+
//psad8x8
vpaddl.u16 \arg2, \arg2
vpaddl.u32 \arg2, \arg2
-
+
//psadframe
vadd.i32 q12, \arg2
.endm
@@ -609,9 +609,9 @@
SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
-
+
SAD_SSD_BGD_CALC_8x16 d26, q14, q6
-
+
//for another 8x16
SAD_SSD_BGD_16_RESET_8x8 \arg0, \arg1, \arg2, q7
SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
@@ -621,30 +621,30 @@
SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
SAD_SSD_BGD_16_end \arg0, \arg2, q7
-
+
SAD_SSD_BGD_CALC_8x16 d27, q15, q7
.endm
-.macro SSD_SAD_SD_MAD_PADDL arg0, arg1, arg2
+.macro SSD_SAD_SD_MAD_PADDL arg0, arg1, arg2
vpaddl.s16 \arg0, \arg0
- vpaddl.s32 \arg0, \arg0
- vadd.i32 \arg1, \arg1, \arg2
+ vpaddl.s32 \arg0, \arg0
+ vadd.i32 \arg1, \arg1, \arg2
.endm
#endif
-WELS_ASM_FUNC_BEGIN vaa_calc_sad_ssd_bgd_neon
+WELS_ASM_FUNC_BEGIN VAACalcSadSsdBgd_neon
stmdb sp!, {r0-r12, r14}
-
+
ldr r4, [sp, #56] //r4 keeps the pic_stride
-
+
sub r5, r4, #1
lsl r5, r5, #4 //r5 keeps the little step
-
+
lsl r6, r4, #4
sub r6, r2, r6 //r6 keeps the big step
-
-
+
+
ldr r8, [sp, #64]//psad8x8
ldr r9, [sp, #68]//psum16x16
ldr r10, [sp, #72]//psqsum16x16
@@ -651,62 +651,62 @@
ldr r11, [sp, #76]//psqdiff16x16
ldr r12, [sp, #80]//p_sd8x8
ldr r14, [sp, #84]//p_mad8x8
-
+
vmov.i8 q12, #0
-
+
vaa_calc_sad_ssd_bgd_height_loop:
mov r7, r2
vaa_calc_sad_ssd_bgd_width_loop:
-
+
//l_sd q14&q15, l_mad q13, l_sad q6 & q7, l_sqdiff q8, l_sum q9, l_sqsum q10
SAD_SSD_BGD_16x16 r0,r1,r4
-
+
//psad8x8
vst4.32 {d12[0], d13[0], d14[0], d15[0]}, [r8]!
-
+
sub r0, r0, r5 //jump to next 16x16
sub r1, r1, r5 //jump to next 16x16
-
+
//p_sd8x8
vst4.32 {d28[0], d29[0],d30[0], d31[0]}, [r12]!
//p_mad8x8
vst2.16 {d26[0], d27[0]}, [r14]!
-
+
//psqdiff16x16
- vpaddl.s32 q8, q8
+ vpaddl.s32 q8, q8
vadd.i32 d16, d16, d17
-
+
vst1.32 {d16[0]}, [r11]! //psqdiff16x16
-
+
//psum16x16
SSD_SAD_SD_MAD_PADDL q9, d18, d19
vst1.32 {d18[0]}, [r9]! //psum16x16
//psqsum16x16
- vpaddl.s32 q10, q10
- vadd.i32 d20, d20, d21
+ vpaddl.s32 q10, q10
+ vadd.i32 d20, d20, d21
vst1.32 {d20[0]}, [r10]! //psqsum16x16
-
+
subs r7, #16
-
+
bne vaa_calc_sad_ssd_bgd_width_loop
-
+
sub r0, r0, r6 //jump to next 16 x width
sub r1, r1, r6 //jump to next 16 x width
-
+
subs r3, #16
bne vaa_calc_sad_ssd_bgd_height_loop
-
+
//psadframe
ldr r7, [sp, #60]//psadframe
-
+
vadd.i32 d24, d24, d25
vst1.32 {d24[0]}, [r7]
-
+
ldmia sp!, {r0-r12, r14}
-
+
WELS_ASM_FUNC_END
@@ -713,33 +713,33 @@
#ifdef APPLE_IOS
.macro SAD_VAR_16
vld1.8 {q0}, [$0], $2 //load cur_row
-
+
vpadal.u8 q3, q0 //add cur_row together
vpadal.u8 q4, q1 //add ref_row together
-
+
vabd.u8 q2, q0, q1 //abs_diff
-
+
vpadal.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16
-
+
vld1.8 {q1}, [$1], $2
-
+
vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
-
+
SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm
.macro SAD_VAR_16_END
vld1.8 {q0}, [$0], $1 //load cur_row
-
+
vpadal.u8 q3, q0 //add cur_row together
vpadal.u8 q4, q1 //add ref_row together
-
+
vabd.u8 q2, q0, q1 //abs_diff
-
+
vpadal.u8 $2, q2 //l_sad for 16 bytes reset for every 8x16
-
+
vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
-
+
SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm
@@ -746,35 +746,35 @@
.macro SAD_VAR_16_RESET_16x16
vld1.8 {q0}, [$0], $2 //load cur_row
vld1.8 {q1}, [$1], $2
-
+
vpaddl.u8 q3, q0 //add cur_row together
vpaddl.u8 q4, q1 //add ref_row together
-
+
vabd.u8 q2, q0, q1 //abs_diff
-
+
vpaddl.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16
-
+
vld1.8 {q1}, [$1], $2
-
+
vpaddl.u8 q9, q0 //q9 for l_sum reset for every 16x16
-
+
SSD_MUL_SUM_16BYTES_RESET d0,d1, q10, q11
.endm
.macro SAD_VAR_16_RESET_8x8
vld1.8 {q0}, [$0], $2 //load cur_row
-
+
vpaddl.u8 q3, q0 //add cur_row together
vpaddl.u8 q4, q1 //add ref_row together
-
+
vabd.u8 q2, q0, q1 //abs_diff
-
+
vpaddl.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16
-
+
vld1.8 {q1}, [$1], $2
-
+
vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
-
+
SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm
@@ -788,7 +788,7 @@
SAD_VAR_16 $0, $1, $2, q6
SAD_VAR_16 $0, $1, $2, q6
SAD_VAR_16 $0, $1, $2, q6
-
+
vpaddl.u16 q6, q6
vpaddl.u32 q6, q6
vadd.i32 q12, q6
@@ -802,42 +802,42 @@
SAD_VAR_16 $0, $1, $2, q7
SAD_VAR_16 $0, $1, $2, q7
SAD_VAR_16_END $0, $2, q7
-
+
vpaddl.u16 q7, q7
vpaddl.u32 q7, q7
-
+
vadd.i32 q12, q7
.endm
#else
.macro SAD_VAR_16 arg0, arg1, arg2, arg3
vld1.8 {q0}, [\arg0], \arg2 //load cur_row
-
+
vpadal.u8 q3, q0 //add cur_row together
vpadal.u8 q4, q1 //add ref_row together
-
+
vabd.u8 q2, q0, q1 //abs_diff
-
+
vpadal.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16
-
+
vld1.8 {q1}, [\arg1], \arg2
-
+
vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
-
+
SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm
.macro SAD_VAR_16_END arg0, arg1, arg2
vld1.8 {q0}, [\arg0], \arg1 //load cur_row
-
+
vpadal.u8 q3, q0 //add cur_row together
vpadal.u8 q4, q1 //add ref_row together
-
+
vabd.u8 q2, q0, q1 //abs_diff
-
+
vpadal.u8 \arg2, q2 //l_sad for 16 bytes reset for every 8x16
-
+
vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
-
+
SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm
@@ -845,35 +845,35 @@
.macro SAD_VAR_16_RESET_16x16 arg0, arg1, arg2, arg3
vld1.8 {q0}, [\arg0], \arg2 //load cur_row
vld1.8 {q1}, [\arg1], \arg2
-
+
vpaddl.u8 q3, q0 //add cur_row together
vpaddl.u8 q4, q1 //add ref_row together
-
+
vabd.u8 q2, q0, q1 //abs_diff
-
+
vpaddl.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16
-
+
vld1.8 {q1}, [\arg1], \arg2
-
+
vpaddl.u8 q9, q0 //q9 for l_sum reset for every 16x16
-
+
SSD_MUL_SUM_16BYTES_RESET d0,d1, q10, q11
.endm
.macro SAD_VAR_16_RESET_8x8 arg0, arg1, arg2, arg3
vld1.8 {q0}, [\arg0], \arg2 //load cur_row
-
+
vpaddl.u8 q3, q0 //add cur_row together
vpaddl.u8 q4, q1 //add ref_row together
-
+
vabd.u8 q2, q0, q1 //abs_diff
-
+
vpaddl.u8 \arg3, q2 //l_sad for 16 bytes reset for every 8x16
-
+
vld1.8 {q1}, [\arg1], \arg2
-
+
vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
-
+
SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm
@@ -887,7 +887,7 @@
SAD_VAR_16 \arg0, \arg1, \arg2, q6
SAD_VAR_16 \arg0, \arg1, \arg2, q6
SAD_VAR_16 \arg0, \arg1, \arg2, q6
-
+
vpaddl.u16 q6, q6
vpaddl.u32 q6, q6
vadd.i32 q12, q6
@@ -901,26 +901,26 @@
SAD_VAR_16 \arg0, \arg1, \arg2, q7
SAD_VAR_16 \arg0, \arg1, \arg2, q7
SAD_VAR_16_END \arg0, \arg2, q7
-
+
vpaddl.u16 q7, q7
vpaddl.u32 q7, q7
-
+
vadd.i32 q12, q7
.endm
#endif
-WELS_ASM_FUNC_BEGIN vaa_calc_sad_var_neon
+WELS_ASM_FUNC_BEGIN VAACalcSadVar_neon
stmdb sp!, {r4-r11}
-
+
ldr r4, [sp, #32] //r4 keeps the pic_stride
-
+
sub r5, r4, #1
lsl r5, r5, #4 //r5 keeps the little step
-
+
lsl r6, r4, #4
sub r6, r2, r6 //r6 keeps the big step
-
+
ldr r7, [sp, #36] //psadframe
ldr r8, [sp, #40] //psad8x8
ldr r9, [sp, #44] //psum16x16
@@ -936,25 +936,25 @@
SAD_VAR_16x16 r0,r1,r4
//psad8x8
vst4.32 {d12[0], d13[0], d14[0], d15[0]}, [r8]!
-
+
sub r0, r0, r5 //jump to next 16x16
sub r1, r1, r5 //jump to next 16x16
-
+
//psum16x16
SSD_SAD_SD_MAD_PADDL q9, d18, d19
vst1.32 {d18[0]}, [r9]! //psum16x16
-
+
//psqsum16x16
- vpaddl.s32 q10, q10
+ vpaddl.s32 q10, q10
subs r11, #16
- vadd.i32 d20, d20, d21
+ vadd.i32 d20, d20, d21
vst1.32 {d20[0]}, [r10]! //psqsum16x16
-
+
bne vaa_calc_sad_var_width_loop
-
+
sub r0, r0, r6 //jump to next 16 x width
sub r1, r1, r6 //jump to next 16 x width
-
+
subs r3, #16
bne vaa_calc_sad_var_height_loop
@@ -968,25 +968,25 @@
#ifdef APPLE_IOS
.macro SAD_SSD_16
SAD_VAR_16 $0, $1, $2, $3
-
+
SSD_MUL_SUM_16BYTES d4,d5,q8, q11
.endm
.macro SAD_SSD_16_END
SAD_VAR_16_END $0, $1, $2, $3
-
+
SSD_MUL_SUM_16BYTES d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16
.endm
.macro SAD_SSD_16_RESET_16x16
SAD_VAR_16_RESET_16x16 $0, $1, $2, $3
-
+
SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16
.endm
.macro SAD_SSD_16_RESET_8x8
SAD_VAR_16_RESET_8x8 $0, $1, $2, $3
-
+
SSD_MUL_SUM_16BYTES d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16
.endm
@@ -1000,7 +1000,7 @@
SAD_SSD_16 $0, $1, $2, q6
SAD_SSD_16 $0, $1, $2, q6
SAD_SSD_16 $0, $1, $2, q6
-
+
vpaddl.u16 q6, q6
vpaddl.u32 q6, q6
vadd.i32 q12, q6
@@ -1014,34 +1014,34 @@
SAD_SSD_16 $0, $1, $2, q7
SAD_SSD_16 $0, $1, $2, q7
SAD_SSD_16_END $0, $2, q7
-
+
vpaddl.u16 q7, q7
vpaddl.u32 q7, q7
-
+
vadd.i32 q12, q7
.endm
#else
.macro SAD_SSD_16 arg0, arg1, arg2, arg3
SAD_VAR_16 \arg0, \arg1, \arg2, \arg3
-
+
SSD_MUL_SUM_16BYTES d4,d5,q8, q11
.endm
.macro SAD_SSD_16_END arg0, arg1, arg2, arg3
SAD_VAR_16_END \arg0, \arg1, \arg2, \arg3
-
+
SSD_MUL_SUM_16BYTES d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16
.endm
.macro SAD_SSD_16_RESET_16x16 arg0, arg1, arg2, arg3
SAD_VAR_16_RESET_16x16 \arg0, \arg1, \arg2, \arg3
-
+
SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16
.endm
.macro SAD_SSD_16_RESET_8x8 arg0, arg1, arg2, arg3
SAD_VAR_16_RESET_8x8 \arg0, \arg1, \arg2, \arg3
-
+
SSD_MUL_SUM_16BYTES d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16
.endm
@@ -1055,7 +1055,7 @@
SAD_SSD_16 \arg0, \arg1, \arg2, q6
SAD_SSD_16 \arg0, \arg1, \arg2, q6
SAD_SSD_16 \arg0, \arg1, \arg2, q6
-
+
vpaddl.u16 q6, q6
vpaddl.u32 q6, q6
vadd.i32 q12, q6
@@ -1069,26 +1069,26 @@
SAD_SSD_16 \arg0, \arg1, \arg2, q7
SAD_SSD_16 \arg0, \arg1, \arg2, q7
SAD_SSD_16_END \arg0, \arg2, q7
-
+
vpaddl.u16 q7, q7
vpaddl.u32 q7, q7
-
+
vadd.i32 q12, q7
.endm
#endif
-WELS_ASM_FUNC_BEGIN vaa_calc_sad_ssd_neon
+WELS_ASM_FUNC_BEGIN VAACalcSadSsd_neon
stmdb sp!, {r4-r12}
ldr r4, [sp, #36] //r4 keeps the pic_stride
-
+
sub r5, r4, #1
lsl r5, r5, #4 //r5 keeps the little step
-
+
lsl r6, r4, #4
sub r6, r2, r6 //r6 keeps the big step
-
+
ldr r7, [sp, #40] //psadframe
ldr r8, [sp, #44] //psad8x8
ldr r9, [sp, #48] //psum16x16
@@ -1105,32 +1105,32 @@
SAD_SSD_16x16 r0,r1,r4
//psad8x8
vst4.32 {d12[0], d13[0], d14[0], d15[0]}, [r8]!
-
+
sub r0, r0, r5 //jump to next 16x16
sub r1, r1, r5 //jump to next 16x16
-
+
//psum16x16
vpaddl.s16 q9, q9
- vpaddl.s32 q9, q9
+ vpaddl.s32 q9, q9
vadd.i32 d18, d18, d19
vst1.32 {d18[0]}, [r9]! //psum16x16
//psqsum16x16
- vpaddl.s32 q10, q10
- vadd.i32 d20, d20, d21
+ vpaddl.s32 q10, q10
+ vadd.i32 d20, d20, d21
vst1.32 {d20[0]}, [r10]! //psqsum16x16
-
+
//psqdiff16x16
- vpaddl.s32 q8, q8
+ vpaddl.s32 q8, q8
vadd.i32 d16, d16, d17
subs r12, #16
vst1.32 {d16[0]}, [r11]! //psqdiff16x16
-
+
bne vaa_calc_sad_ssd_width_loop
-
+
sub r0, r0, r6 //jump to next 16 x width
sub r1, r1, r6 //jump to next 16 x width
-
+
subs r3, #16
bne vaa_calc_sad_ssd_height_loop
@@ -1140,4 +1140,4 @@
ldmia sp!, {r4-r12}
WELS_ASM_FUNC_END
-#endif
\ No newline at end of file
+#endif
--- a/codec/processing/src/downsample/downsample.cpp
+++ b/codec/processing/src/downsample/downsample.cpp
@@ -75,6 +75,16 @@
}
#endif//X86_ASM
+#if defined(HAVE_NEON)
+ if (iCpuFlag & WELS_CPU_NEON) {
+ sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_neon;
+ sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsampler_neon;
+ sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsampler_neon;
+ sDownsampleFunc.pfHalfAverage[3] = DyadicBilinearDownsampler_neon;
+ sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_neon;
+ sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearAccurateDownsamplerWrap_neon;
+ }
+#endif
}
EResult CDownsampling::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pDstPixMap) {
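
Editor's note: the pfHalfAverage entries registered above perform a dyadic (2:1) downscale. A scalar sketch of the kernel the NEON routines implement, with the same two-stage rounding as the vpaddl / vrshr #1 / vrhadd sequence in down_sample_neon.S (function name and signature below are illustrative):

    #include <stdint.h>

    // Halve width and height: each output pixel is the average of a 2x2 source
    // block, rounded in two stages (pairwise add + rounded shift per row, then
    // a rounded halving add between the two rows), matching the NEON code.
    static void DyadicBilinearDownsample_sketch (uint8_t* pDst, int32_t iDstStride,
                                                 uint8_t* pSrc, int32_t iSrcStride,
                                                 int32_t iSrcWidth, int32_t iSrcHeight) {
      for (int32_t y = 0; y < iSrcHeight / 2; ++y) {
        for (int32_t x = 0; x < iSrcWidth / 2; ++x) {
          int32_t iTop = (pSrc[2 * x] + pSrc[2 * x + 1] + 1) >> 1;
          int32_t iBot = (pSrc[iSrcStride + 2 * x] + pSrc[iSrcStride + 2 * x + 1] + 1) >> 1;
          pDst[x] = (uint8_t) ((iTop + iBot + 1) >> 1);
        }
        pSrc += 2 * iSrcStride;
        pDst += iDstStride;
      }
    }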
--- a/codec/processing/src/downsample/downsample.h
+++ b/codec/processing/src/downsample/downsample.h
@@ -103,7 +103,20 @@
WELSVP_EXTERN_C_END
#endif
+#ifdef HAVE_NEON
+WELSVP_EXTERN_C_BEGIN
+// iSrcWidth no limitation
+HalveDownsampleFunc DyadicBilinearDownsampler_neon;
+// iSrcWidth = x32 pixels
+HalveDownsampleFunc DyadicBilinearDownsamplerWidthx32_neon;
+GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_neon;
+
+void GeneralBilinearAccurateDownsampler_neon( uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight,
+ uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY);
+
+WELSVP_EXTERN_C_END
+#endif
class CDownsampling : public IStrategy {
--- a/codec/processing/src/downsample/downsamplefuncs.cpp
+++ b/codec/processing/src/downsample/downsamplefuncs.cpp
@@ -229,4 +229,14 @@
//}
#endif //X86_ASM
+#ifdef HAVE_NEON
+void GeneralBilinearAccurateDownsamplerWrap_neon(uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight,
+ uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
+ const int32_t kiScaleBit = 15;
+ const uint32_t kuiScale = (1 << kiScaleBit);
+ uint32_t uiScalex = (uint32_t)((float)kiSrcWidth / (float)kiDstWidth * kuiScale);
+ uint32_t uiScaley = (uint32_t)((float)kiSrcHeight / (float)kiDstHeight * kuiScale);
+ GeneralBilinearAccurateDownsampler_neon(pDst, kiDstStride, kiDstWidth, kiDstHeight, pSrc, kiSrcStride, uiScalex, uiScaley);
+}
+#endif
WELSVP_NAMESPACE_END
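
Editor's note: the wrapper above converts the source/destination size ratios into Q15 fixed-point scale factors before calling the NEON kernel. A quick worked example of that computation, using made-up dimensions:

    #include <stdint.h>
    #include <stdio.h>

    // Illustrative only: 1280x720 -> 640x360 is a 2.0x step in both
    // directions, i.e. 2.0 * (1 << 15) = 65536 in Q15.
    int main() {
      const int32_t kiScaleBit = 15;
      const uint32_t kuiScale = (1 << kiScaleBit);                    // 32768
      int32_t iSrcW = 1280, iSrcH = 720, iDstW = 640, iDstH = 360;    // made-up sizes
      uint32_t uiScaleX = (uint32_t) ((float) iSrcW / (float) iDstW * kuiScale);
      uint32_t uiScaleY = (uint32_t) ((float) iSrcH / (float) iDstH * kuiScale);
      printf ("uiScaleX=%u uiScaleY=%u\n", uiScaleX, uiScaleY);       // 65536 65536
      return 0;
    }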
--- a/codec/processing/src/scenechangedetection/SceneChangeDetection.cpp
+++ b/codec/processing/src/scenechangedetection/SceneChangeDetection.cpp
@@ -130,6 +130,12 @@
pfSad = WelsSampleSad8x8_sse21;
}
#endif
+
+#ifdef HAVE_NEON
+ if (iCpuFlag & WELS_CPU_NEON) {
+ pfSad = WelsSampleSad8x8_neon;
+ }
+#endif
}
--- a/codec/processing/src/scenechangedetection/SceneChangeDetectionCommon.h
+++ b/codec/processing/src/scenechangedetection/SceneChangeDetectionCommon.h
@@ -60,6 +60,12 @@
WELSVP_EXTERN_C_END
#endif
+#ifdef HAVE_NEON
+WELSVP_EXTERN_C_BEGIN
+SadFunc WelsSampleSad8x8_neon;
+WELSVP_EXTERN_C_END
+#endif
+
WELSVP_NAMESPACE_END
#endif
--- a/codec/processing/src/vaacalc/vaacalculation.cpp
+++ b/codec/processing/src/vaacalc/vaacalculation.cpp
@@ -65,6 +65,15 @@
sVaaFuncs.pfVAACalcSadVar = VAACalcSadVar_sse2;
}
#endif//X86_ASM
+#ifdef HAVE_NEON
+ if ((iCpuFlag & WELS_CPU_NEON) == WELS_CPU_NEON) {
+ sVaaFuncs.pfVAACalcSad = VAACalcSad_neon;
+ sVaaFuncs.pfVAACalcSadBgd = VAACalcSadBgd_neon;
+ sVaaFuncs.pfVAACalcSadSsd = VAACalcSadSsd_neon;
+ sVaaFuncs.pfVAACalcSadSsdBgd = VAACalcSadSsdBgd_neon;
+ sVaaFuncs.pfVAACalcSadVar = VAACalcSadVar_neon;
+ }
+#endif//HAVE_NEON
}
EResult CVAACalculation::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
--- a/codec/processing/src/vaacalc/vaacalculation.h
+++ b/codec/processing/src/vaacalc/vaacalculation.h
@@ -103,6 +103,16 @@
WELSVP_EXTERN_C_END
#endif
+#ifdef HAVE_NEON
+WELSVP_EXTERN_C_BEGIN
+VAACalcSadBgdFunc VAACalcSadBgd_neon;
+VAACalcSadSsdBgdFunc VAACalcSadSsdBgd_neon;
+VAACalcSadFunc VAACalcSad_neon;
+VAACalcSadVarFunc VAACalcSadVar_neon;
+VAACalcSadSsdFunc VAACalcSadSsd_neon;
+WELSVP_EXTERN_C_END
+#endif
+
class CVAACalculation : public IStrategy {
public:
CVAACalculation (int32_t iCpuFlag);