ref: 71467f948ae79fb5ea2ffe48ca07cf72a36d45a3
parent: 14f5518e6a5f4e4a84de04045d749bd0e1119e5a
author: Licai Guo <[email protected]>
date: Fri Mar 7 07:18:58 EST 2014
mv mc_neon.S to common,add MC arm code to encoder
--- a/codec/build/iOS/common/common.xcodeproj/project.pbxproj
+++ b/codec/build/iOS/common/common.xcodeproj/project.pbxproj
@@ -8,6 +8,7 @@
/* Begin PBXBuildFile section */
4C34067D18C5C94C00DFA14A /* expand_picture_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34067C18C5C94C00DFA14A /* expand_picture_neon.S */; };
+ 4C34069B18C96CB000DFA14A /* mc_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34069A18C96CB000DFA14A /* mc_neon.S */; };
4CE443D918B722CD0017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE443D818B722CD0017DF25 /* Foundation.framework */; };
4CE443E718B722CD0017DF25 /* XCTest.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE443E618B722CD0017DF25 /* XCTest.framework */; };
4CE443E818B722CD0017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE443D818B722CD0017DF25 /* Foundation.framework */; };
@@ -48,6 +49,7 @@
/* Begin PBXFileReference section */
4C34067C18C5C94C00DFA14A /* expand_picture_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = expand_picture_neon.S; sourceTree = "<group>"; };
+ 4C34069A18C96CB000DFA14A /* mc_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = mc_neon.S; sourceTree = "<group>"; };
4CE443D518B722CD0017DF25 /* libcommon.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libcommon.a; sourceTree = BUILT_PRODUCTS_DIR; };
4CE443D818B722CD0017DF25 /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; };
4CE443E518B722CD0017DF25 /* commonTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = commonTests.xctest; sourceTree = BUILT_PRODUCTS_DIR; };
@@ -150,6 +152,7 @@
4CE4472F18BC61650017DF25 /* common */ = {
isa = PBXGroup;
children = (
+ 4C34069A18C96CB000DFA14A /* mc_neon.S */,
4C34067C18C5C94C00DFA14A /* expand_picture_neon.S */,
4CE447BE18C085900017DF25 /* arm_arch_common_macro.S */,
4CE447BC18C085320017DF25 /* deblocking_neon.S */,
@@ -256,6 +259,7 @@
isa = PBXSourcesBuildPhase;
buildActionMask = 2147483647;
files = (
+ 4C34069B18C96CB000DFA14A /* mc_neon.S in Sources */,
4CE447BF18C085900017DF25 /* arm_arch_common_macro.S in Sources */,
4CE4475018BC61650017DF25 /* deblocking_common.cpp in Sources */,
4CE4474C18BC61650017DF25 /* cpu.cpp in Sources */,
--- a/codec/build/iOS/dec/welsdec/welsdec.xcodeproj/project.pbxproj
+++ b/codec/build/iOS/dec/welsdec/welsdec.xcodeproj/project.pbxproj
@@ -38,7 +38,6 @@
4CE4469F18BC5EAB0017DF25 /* welsDecoderExt.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4468518BC5EAB0017DF25 /* welsDecoderExt.cpp */; };
4CE447AC18BC6BE90017DF25 /* block_add_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447A718BC6BE90017DF25 /* block_add_neon.S */; };
4CE447AE18BC6BE90017DF25 /* intra_pred_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447A918BC6BE90017DF25 /* intra_pred_neon.S */; };
- 4CE447AF18BC6BE90017DF25 /* mc_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447AA18BC6BE90017DF25 /* mc_neon.S */; };
/* End PBXBuildFile section */
/* Begin PBXContainerItemProxy section */
@@ -132,7 +131,6 @@
4CE4468518BC5EAB0017DF25 /* welsDecoderExt.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = welsDecoderExt.cpp; sourceTree = "<group>"; };
4CE447A718BC6BE90017DF25 /* block_add_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = block_add_neon.S; sourceTree = "<group>"; };
4CE447A918BC6BE90017DF25 /* intra_pred_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = intra_pred_neon.S; sourceTree = "<group>"; };
- 4CE447AA18BC6BE90017DF25 /* mc_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = mc_neon.S; sourceTree = "<group>"; };
/* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */
@@ -325,7 +323,6 @@
children = (
4CE447A718BC6BE90017DF25 /* block_add_neon.S */,
4CE447A918BC6BE90017DF25 /* intra_pred_neon.S */,
- 4CE447AA18BC6BE90017DF25 /* mc_neon.S */,
);
path = arm;
sourceTree = "<group>";
@@ -411,7 +408,6 @@
isa = PBXSourcesBuildPhase;
buildActionMask = 2147483647;
files = (
- 4CE447AF18BC6BE90017DF25 /* mc_neon.S in Sources */,
4CE4469B18BC5EAB0017DF25 /* pic_queue.cpp in Sources */,
4CE4469F18BC5EAB0017DF25 /* welsDecoderExt.cpp in Sources */,
4CE4469318BC5EAB0017DF25 /* fmo.cpp in Sources */,
--- a/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj
+++ b/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj
@@ -9,7 +9,6 @@
/* Begin PBXBuildFile section */
4C34066D18C57D0400DFA14A /* intra_pred_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066618C57D0400DFA14A /* intra_pred_neon.S */; };
4C34066E18C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066718C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S */; };
- 4C34066F18C57D0400DFA14A /* mc_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066818C57D0400DFA14A /* mc_neon.S */; };
4C34067018C57D0400DFA14A /* memory_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066918C57D0400DFA14A /* memory_neon.S */; };
4C34067118C57D0400DFA14A /* pixel_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066A18C57D0400DFA14A /* pixel_neon.S */; };
4C34067218C57D0400DFA14A /* reconstruct_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066B18C57D0400DFA14A /* reconstruct_neon.S */; };
@@ -79,7 +78,6 @@
/* Begin PBXFileReference section */
4C34066618C57D0400DFA14A /* intra_pred_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = intra_pred_neon.S; sourceTree = "<group>"; };
4C34066718C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = intra_pred_sad_3_opt_neon.S; sourceTree = "<group>"; };
- 4C34066818C57D0400DFA14A /* mc_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = mc_neon.S; sourceTree = "<group>"; };
4C34066918C57D0400DFA14A /* memory_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = memory_neon.S; sourceTree = "<group>"; };
4C34066A18C57D0400DFA14A /* pixel_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = pixel_neon.S; sourceTree = "<group>"; };
4C34066B18C57D0400DFA14A /* reconstruct_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = reconstruct_neon.S; sourceTree = "<group>"; };
@@ -206,7 +204,6 @@
children = (
4C34066618C57D0400DFA14A /* intra_pred_neon.S */,
4C34066718C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S */,
- 4C34066818C57D0400DFA14A /* mc_neon.S */,
4C34066918C57D0400DFA14A /* memory_neon.S */,
4C34066A18C57D0400DFA14A /* pixel_neon.S */,
4C34066B18C57D0400DFA14A /* reconstruct_neon.S */,
@@ -492,7 +489,6 @@
4CE4472A18BC605C0017DF25 /* utils.cpp in Sources */,
4CE4471018BC605C0017DF25 /* decode_mb_aux.cpp in Sources */,
4CE4472018BC605C0017DF25 /* sample.cpp in Sources */,
- 4C34066F18C57D0400DFA14A /* mc_neon.S in Sources */,
4CE4472D18BC605C0017DF25 /* welsCodecTrace.cpp in Sources */,
4CE4471318BC605C0017DF25 /* encoder_data_tables.cpp in Sources */,
4C34067118C57D0400DFA14A /* pixel_neon.S in Sources */,
--- a/codec/common/mc_common.h
+++ b/codec/common/mc_common.h
@@ -82,6 +82,18 @@
void McHorVer22WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer22WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer22WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+
+void PixStrideAvgWidthEq16_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA, const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
+void PixStrideAvgWidthEq8_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA, const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
+
+void McHorVer20Width17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// width+1
+void McHorVer20Width9_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// width+1
+
+void McHorVer02Height17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// height+1
+void McHorVer02Height9_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// height+1
+
+void McHorVer22Width17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);//width+1&&height+1
+void McHorVer22Width9_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);//width+1&&height+1
#endif
#if defined(X86_ASM)
--- /dev/null
+++ b/codec/common/mc_neon.S
@@ -1,0 +1,2200 @@
+/*!
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifdef HAVE_NEON
+.text
+#include "arm_arch_common_macro.S"
+
+#ifdef APPLE_IOS
+.macro AVERAGE_TWO_8BITS
+// { // input:dst_d, src_d A and B; working: q13
+ vaddl.u8 q13, $2, $1
+ vrshrn.u16 $0, q13, #1
+// }
+.endm
+
+.macro FILTER_6TAG_8BITS
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
+ vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
+ vaddl.u8 q13, $2, $3 //src[0]+src[1]
+ vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles
+ vaddl.u8 q13, $1, $4 //src[-1]+src[2]
+ vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
+ vqrshrun.s16 $6, q12, #5
+// }
+.endm
+
+.macro FILTER_SINGLE_TAG_8BITS // when width=17/9, used
+// { // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2},
+ vrev64.8 $2, $0 // X[5][4][3][2][1][0]O
+ vaddl.u8 $3, $0, $2 // each 16bits, *[50][41][32][23][14][05]*
+ vmul.s16 $0, $2, $1 // 0+1*[50]-5*[41]+20[32]
+ vpadd.s16 $0, $0, $0
+ vpadd.s16 $0, $0, $0
+ vqrshrun.s16 $0, $4, #5
+// }
+.endm
+
+.macro FILTER_6TAG_8BITS_AVERAGE_WITH_0
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
+ vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
+ vaddl.u8 q13, $2, $3 //src[0]+src[1]
+ vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles
+ vaddl.u8 q13, $1, $4 //src[-1]+src[2]
+ vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
+ vqrshrun.s16 $6, q12, #5
+ vaddl.u8 q13, $2, $6
+ vrshrn.u16 $6, q13, #1
+// }
+.endm
+
+.macro FILTER_6TAG_8BITS_AVERAGE_WITH_1
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
+ vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
+ vaddl.u8 q13, $2, $3 //src[0]+src[1]
+ vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles
+ vaddl.u8 q13, $1, $4 //src[-1]+src[2]
+ vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
+ vqrshrun.s16 $6, q12, #5
+ vaddl.u8 q13, $3, $6
+ vrshrn.u16 $6, q13, #1
+// }
+.endm
+
+.macro FILTER_6TAG_8BITS_TO_16BITS
+// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:q13
+ vaddl.u8 $6, $0, $5 //dst_q=src[-2]+src[3]
+ vaddl.u8 q13, $2, $3 //src[0]+src[1]
+ vmla.u16 $6, q13, $7 //dst_q += 20*(src[0]+src[1]), 2 cycles
+ vaddl.u8 q13, $1, $4 //src[-1]+src[2]
+ vmls.s16 $6, q13, $8 //dst_q -= 5*(src[-1]+src[2]), 2 cycles
+// }
+.endm
+
+.macro FILTER_3_IN_16BITS_TO_8BITS
+// { // input:a, b, c, dst_d;
+ vsub.s16 $0, $0, $1 //a-b
+ vshr.s16 $0, $0, #2 //(a-b)/4
+ vsub.s16 $0, $0, $1 //(a-b)/4-b
+ vadd.s16 $0, $0, $2 //(a-b)/4-b+c
+ vshr.s16 $0, $0, #2 //((a-b)/4-b+c)/4
+ vadd.s16 $0, $0, $2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+ vqrshrun.s16 $3, $0, #6 //(+32)>>6
+// }
+.endm
+
+.macro UNPACK_2_16BITS_TO_ABC
+// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
+ vext.16 $4, $0, $1, #2 //src[0]
+ vext.16 $3, $0, $1, #3 //src[1]
+ vadd.s16 $4, $3 //c=src[0]+src[1]
+
+ vext.16 $3, $0, $1, #1 //src[-1]
+ vext.16 $2, $0, $1, #4 //src[2]
+ vadd.s16 $3, $2 //b=src[-1]+src[2]
+
+ vext.16 $2, $0, $1, #5 //src[3]
+ vadd.s16 $2, $0 //a=src[-2]+src[3]
+// }
+.endm
+
+.macro UNPACK_1_IN_8x16BITS_TO_8BITS
+// { // each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
+ vext.16 $3, $3, $3, #7 // 0x????, [0][1][2][3][4][5],
+ vrev64.16 $1, $1
+ vadd.u16 $2, $1 // C[2+3],B[1+4],A[0+5],
+ vshr.s64 $1, $2, #16
+ vshr.s64 $0, $2, #32 // Output: C $2, B $1, A $0
+
+ vsub.s16 $0, $0, $1 //a-b
+ vshr.s16 $0, $0, #2 //(a-b)/4
+ vsub.s16 $0, $0, $1 //(a-b)/4-b
+ vadd.s16 $0, $0, $2 //(a-b)/4-b+c
+ vshr.s16 $0, $0, #2 //((a-b)/4-b+c)/4
+ vadd.s16 $1, $0, $2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+ vqrshrun.s16 $0, $3, #6 //(+32)>>6
+// }
+.endm
+#else
+.macro AVERAGE_TWO_8BITS arg0, arg1, arg2
+// { // input:dst_d, src_d A and B; working: q13
+ vaddl.u8 q13, \arg2, \arg1
+ vrshrn.u16 \arg0, q13, #1
+// }
+.endm
+
+.macro FILTER_6TAG_8BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
+ vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3]
+ vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
+ vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles
+ vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
+ vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
+ vqrshrun.s16 \arg6, q12, #5
+// }
+.endm
+
+.macro FILTER_SINGLE_TAG_8BITS arg0, arg1,arg2, arg3, arg4,arg5 // when width=17/9, used
+// { // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2}
+ vrev64.8 \arg2, \arg0 // X[5][4][3][2][1][0]O
+ vaddl.u8 \arg3, \arg0, \arg2 // each 16bits, *[50][41][32][23][14][05]*
+ vmul.s16 \arg0, \arg2, \arg1 // 0+1*[50]-5*[41]+20[32]
+ vpadd.s16 \arg0, \arg0, \arg0
+ vpadd.s16 \arg0, \arg0, \arg0
+ vqrshrun.s16 \arg0, \arg4, #5
+// }
+.endm
+
+.macro FILTER_6TAG_8BITS_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
+ vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3]
+ vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
+ vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles
+ vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
+ vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
+ vqrshrun.s16 \arg6, q12, #5
+ vaddl.u8 q13, \arg2, \arg6
+ vrshrn.u16 \arg6, q13, #1
+// }
+.endm
+
+.macro FILTER_6TAG_8BITS_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
+ vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3]
+ vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
+ vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles
+ vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
+ vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
+ vqrshrun.s16 \arg6, q12, #5
+ vaddl.u8 q13, \arg3, \arg6
+ vrshrn.u16 \arg6, q13, #1
+// }
+.endm
+
+.macro FILTER_6TAG_8BITS_TO_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:q13
+ vaddl.u8 \arg6, \arg0, \arg5 //dst_q=src[-2]+src[3]
+ vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
+ vmla.u16 \arg6, q13, \arg7 //dst_q += 20*(src[0]+src[1]), 2 cycles
+ vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
+ vmls.s16 \arg6, q13, \arg8 //dst_q -= 5*(src[-1]+src[2]), 2 cycles
+// }
+.endm
+
+.macro FILTER_3_IN_16BITS_TO_8BITS arg0, arg1, arg2, arg3
+// { // input:a, b, c, dst_d;
+ vsub.s16 \arg0, \arg0, \arg1 //a-b
+ vshr.s16 \arg0, \arg0, #2 //(a-b)/4
+ vsub.s16 \arg0, \arg0, \arg1 //(a-b)/4-b
+ vadd.s16 \arg0, \arg0, \arg2 //(a-b)/4-b+c
+ vshr.s16 \arg0, \arg0, #2 //((a-b)/4-b+c)/4
+ vadd.s16 \arg0, \arg0, \arg2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+ vqrshrun.s16 \arg3, \arg0, #6 //(+32)>>6
+// }
+.endm
+
+.macro UNPACK_2_16BITS_TO_ABC arg0, arg1, arg2, arg3, arg4
+// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
+ vext.16 \arg4, \arg0, \arg1, #2 //src[0]
+ vext.16 \arg3, \arg0, \arg1, #3 //src[1]
+ vadd.s16 \arg4, \arg3 //c=src[0]+src[1]
+
+ vext.16 \arg3, \arg0, \arg1, #1 //src[-1]
+ vext.16 \arg2, \arg0, \arg1, #4 //src[2]
+ vadd.s16 \arg3,\arg2 //b=src[-1]+src[2]
+
+ vext.16 \arg2, \arg0, \arg1, #5 //src[3]
+ vadd.s16 \arg2, \arg0 //a=src[-2]+src[3]
+// }
+.endm
+
+.macro UNPACK_1_IN_8x16BITS_TO_8BITS arg0, arg1,arg2, arg3
+// { // each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
+ vext.16 \arg3, \arg3, \arg3, #7 // 0x????, [0][1][2][3][4][5]
+ vrev64.16 \arg1, \arg1
+ vadd.u16 \arg2, \arg1 // C[2+3],B[1+4],A[0+5]
+ vshr.s64 \arg1, \arg2, #16
+ vshr.s64 \arg0, \arg2, #32 // Output: C \arg2, B \arg1, A \arg0
+
+ vsub.s16 \arg0, \arg0, \arg1 //a-b
+ vshr.s16 \arg0, \arg0, #2 //(a-b)/4
+ vsub.s16 \arg0, \arg0, \arg1 //(a-b)/4-b
+ vadd.s16 \arg0, \arg0, \arg2 //(a-b)/4-b+c
+ vshr.s16 \arg0, \arg0, #2 //((a-b)/4-b+c)/4
+ vadd.s16 \arg1, \arg0, \arg2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+ vqrshrun.s16 \arg0, \arg3, #6 //(+32)>>6
+// }
+.endm
+#endif
+
+WELS_ASM_FUNC_BEGIN McHorVer20WidthEq16_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
+
+w16_h_mc_luma_loop:
+ vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2]
+ pld [r0]
+ pld [r0, #16]
+
+ vext.8 q2, q0, q1, #1 //q2=src[-1]
+ vext.8 q3, q0, q1, #2 //q3=src[0]
+ vext.8 q4, q0, q1, #3 //q4=src[1]
+ vext.8 q5, q0, q1, #4 //q5=src[2]
+ vext.8 q6, q0, q1, #5 //q6=src[3]
+
+ FILTER_6TAG_8BITS d0, d4, d6, d8, d10, d12, d2, q14, q15
+
+ FILTER_6TAG_8BITS d1, d5, d7, d9, d11, d13, d3, q14, q15
+
+ sub r4, #1
+ vst1.u8 {d2, d3}, [r2], r3 //write 16Byte
+
+ cmp r4, #0
+ bne w16_h_mc_luma_loop
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer20WidthEq8_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
+
+w8_h_mc_luma_loop:
+ vld1.u8 {d0,d1}, [r0], r1 //only use 13(8+5); q0=src[-2]
+ pld [r0]
+
+ vext.8 d2, d0, d1, #1 //d2=src[-1]
+ vext.8 d3, d0, d1, #2 //d3=src[0]
+ vext.8 d4, d0, d1, #3 //d4=src[1]
+ vext.8 d5, d0, d1, #4 //d5=src[2]
+ vext.8 d6, d0, d1, #5 //d6=src[3]
+
+ FILTER_6TAG_8BITS d0, d2, d3, d4, d5, d6, d1, q14, q15
+
+ sub r4, #1
+ vst1.u8 {d1}, [r2], r3
+
+ cmp r4, #0
+ bne w8_h_mc_luma_loop
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer20WidthEq4_neon
+ push {r4, r5, r6}
+ ldr r6, [sp, #12]
+
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
+
+w4_h_mc_luma_loop:
+ vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5]
+ pld [r0]
+ vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5]
+ pld [r0]
+
+ vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6]
+ vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6]
+ vext.8 q3, q2, q2, #1 //src[0:6 *]
+ vext.8 q4, q2, q2, #2 //src[1:6 * *]
+
+ vtrn.32 q3, q4 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
+ vtrn.32 d6, d7 //d6:[0:3]; d7[1:4]
+ vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5]
+ vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6]
+
+ FILTER_6TAG_8BITS d0, d4, d6, d7, d2, d5, d1, q14, q15
+
+ vmov r4, r5, d1
+ str r4, [r2], r3
+ str r5, [r2], r3
+
+ sub r6, #2
+ cmp r6, #0
+ bne w4_h_mc_luma_loop
+
+ pop {r4, r5, r6}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer10WidthEq16_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
+
+w16_xy_10_mc_luma_loop:
+ vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2]
+ pld [r0]
+ pld [r0, #16]
+
+ vext.8 q2, q0, q1, #1 //q2=src[-1]
+ vext.8 q3, q0, q1, #2 //q3=src[0]
+ vext.8 q4, q0, q1, #3 //q4=src[1]
+ vext.8 q5, q0, q1, #4 //q5=src[2]
+ vext.8 q6, q0, q1, #5 //q6=src[3]
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d4, d6, d8, d10, d12, d2, q14, q15
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d5, d7, d9, d11, d13, d3, q14, q15
+
+ sub r4, #1
+ vst1.u8 {d2, d3}, [r2], r3 //write 16Byte
+
+ cmp r4, #0
+ bne w16_xy_10_mc_luma_loop
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer10WidthEq8_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
+
+w8_xy_10_mc_luma_loop:
+ vld1.u8 {d0,d1}, [r0], r1 //only use 13(8+5); q0=src[-2]
+ pld [r0]
+
+ vext.8 d2, d0, d1, #1 //d2=src[-1]
+ vext.8 d3, d0, d1, #2 //d3=src[0]
+ vext.8 d4, d0, d1, #3 //d4=src[1]
+ vext.8 d5, d0, d1, #4 //d5=src[2]
+ vext.8 d6, d0, d1, #5 //d6=src[3]
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d3, d4, d5, d6, d1, q14, q15
+
+ sub r4, #1
+ vst1.u8 {d1}, [r2], r3
+
+ cmp r4, #0
+ bne w8_xy_10_mc_luma_loop
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer10WidthEq4_neon
+ push {r4, r5, r6}
+ ldr r6, [sp, #12]
+
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
+
+w4_xy_10_mc_luma_loop:
+ vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5]
+ pld [r0]
+ vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5]
+ pld [r0]
+
+ vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6]
+ vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6]
+ vext.8 q3, q2, q2, #1 //src[0:6 *]
+ vext.8 q4, q2, q2, #2 //src[1:6 * *]
+
+ vtrn.32 q3, q4 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
+ vtrn.32 d6, d7 //d6:[0:3]; d7[1:4]
+ vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5]
+ vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6]
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d4, d6, d7, d2, d5, d1, q14, q15
+
+ vmov r4, r5, d1
+ str r4, [r2], r3
+ str r5, [r2], r3
+
+ sub r6, #2
+ cmp r6, #0
+ bne w4_xy_10_mc_luma_loop
+
+ pop {r4, r5, r6}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer30WidthEq16_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
+
+w16_xy_30_mc_luma_loop:
+ vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2]
+ pld [r0]
+ pld [r0, #16]
+
+ vext.8 q2, q0, q1, #1 //q2=src[-1]
+ vext.8 q3, q0, q1, #2 //q3=src[0]
+ vext.8 q4, q0, q1, #3 //q4=src[1]
+ vext.8 q5, q0, q1, #4 //q5=src[2]
+ vext.8 q6, q0, q1, #5 //q6=src[3]
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d4, d6, d8, d10, d12, d2, q14, q15
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d5, d7, d9, d11, d13, d3, q14, q15
+
+ sub r4, #1
+ vst1.u8 {d2, d3}, [r2], r3 //write 16Byte
+
+ cmp r4, #0
+ bne w16_xy_30_mc_luma_loop
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer30WidthEq8_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
+
+w8_xy_30_mc_luma_loop:
+ vld1.u8 {d0,d1}, [r0], r1 //only use 13(8+5); q0=src[-2]
+ pld [r0]
+
+ vext.8 d2, d0, d1, #1 //d2=src[-1]
+ vext.8 d3, d0, d1, #2 //d3=src[0]
+ vext.8 d4, d0, d1, #3 //d4=src[1]
+ vext.8 d5, d0, d1, #4 //d5=src[2]
+ vext.8 d6, d0, d1, #5 //d6=src[3]
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d3, d4, d5, d6, d1, q14, q15
+
+ sub r4, #1
+ vst1.u8 {d1}, [r2], r3
+
+ cmp r4, #0
+ bne w8_xy_30_mc_luma_loop
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer30WidthEq4_neon
+ push {r4, r5, r6}
+ ldr r6, [sp, #12]
+
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
+
+w4_xy_30_mc_luma_loop:
+ vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5]
+ pld [r0]
+ vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5]
+ pld [r0]
+
+ vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6]
+ vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6]
+ vext.8 q3, q2, q2, #1 //src[0:6 *]
+ vext.8 q4, q2, q2, #2 //src[1:6 * *]
+
+ vtrn.32 q3, q4 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
+ vtrn.32 d6, d7 //d6:[0:3]; d7[1:4]
+ vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5]
+ vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6]
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d4, d6, d7, d2, d5, d1, q14, q15
+
+ vmov r4, r5, d1
+ str r4, [r2], r3
+ str r5, [r2], r3
+
+ sub r6, #2
+ cmp r6, #0
+ bne w4_xy_30_mc_luma_loop
+
+ pop {r4, r5, r6}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer01WidthEq16_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {q0}, [r0], r1 //q0=src[-2]
+ vld1.u8 {q1}, [r0], r1 //q1=src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ vld1.u8 {q2}, [r0], r1 //q2=src[0]
+ vld1.u8 {q3}, [r0], r1 //q3=src[1]
+ vld1.u8 {q4}, [r0], r1 //q4=src[2]
+
+w16_xy_01_luma_loop:
+
+ vld1.u8 {q5}, [r0], r1 //q5=src[3]
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d4, d6, d8, d10, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d3, d5, d7, d9, d11, d13, q14, q15
+ vld1.u8 {q0}, [r0], r1 //read 2nd row
+ vst1.u8 {q6}, [r2], r3 //write 1st 16Byte
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d4, d6, d8, d10, d0, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d5, d7, d9, d11, d1, d13, q14, q15
+ vld1.u8 {q1}, [r0], r1 //read 3rd row
+ vst1.u8 {q6}, [r2], r3 //write 2nd 16Byte
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d4, d6, d8, d10, d0, d2, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d5, d7, d9, d11, d1, d3, d13, q14, q15
+ vld1.u8 {q2}, [r0], r1 //read 4th row
+ vst1.u8 {q6}, [r2], r3 //write 3rd 16Byte
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d6, d8, d10, d0, d2, d4, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d7, d9, d11, d1, d3, d5, d13, q14, q15
+ vld1.u8 {q3}, [r0], r1 //read 5th row
+ vst1.u8 {q6}, [r2], r3 //write 4th 16Byte
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d8, d10, d0, d2, d4, d6, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d9, d11, d1, d3, d5, d7, d13, q14, q15
+ vld1.u8 {q4}, [r0], r1 //read 6th row
+ vst1.u8 {q6}, [r2], r3 //write 5th 16Byte
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d10, d0, d2, d4, d6, d8, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d11, d1, d3, d5, d7, d9, d13, q14, q15
+ vld1.u8 {q5}, [r0], r1 //read 7th row
+ vst1.u8 {q6}, [r2], r3 //write 6th 16Byte
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d4, d6, d8, d10, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d3, d5, d7, d9, d11, d13, q14, q15
+ vld1.u8 {q0}, [r0], r1 //read 8th row
+ vst1.u8 {q6}, [r2], r3 //write 7th 16Byte
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d4, d6, d8, d10, d0, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d5, d7, d9, d11, d1, d13, q14, q15
+ vst1.u8 {q6}, [r2], r3 //write 8th 16Byte
+
+ //q2, q3, q4, q5, q0 --> q0~q4
+ vswp q0, q4
+ vswp q0, q2
+ vmov q1, q3
+ vmov q3, q5 //q0~q4
+
+ sub r4, #8
+ cmp r4, #0
+ bne w16_xy_01_luma_loop
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer01WidthEq8_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {d0}, [r0], r1 //d0=src[-2]
+ vld1.u8 {d1}, [r0], r1 //d1=src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ vld1.u8 {d2}, [r0], r1 //d2=src[0]
+ vld1.u8 {d3}, [r0], r1 //d3=src[1]
+
+ vld1.u8 {d4}, [r0], r1 //d4=src[2]
+ vld1.u8 {d5}, [r0], r1 //d5=src[3]
+
+w8_xy_01_mc_luma_loop:
+
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d1, d2, d3, d4, d5, d12, q14, q15
+ vld1.u8 {d0}, [r0], r1 //read 2nd row
+ vst1.u8 {d12}, [r2], r3 //write 1st 8Byte
+
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d2, d3, d4, d5, d0, d12, q14, q15
+ vld1.u8 {d1}, [r0], r1 //read 3rd row
+ vst1.u8 {d12}, [r2], r3 //write 2nd 8Byte
+
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d3, d4, d5, d0, d1, d12, q14, q15
+ vld1.u8 {d2}, [r0], r1 //read 4th row
+ vst1.u8 {d12}, [r2], r3 //write 3rd 8Byte
+
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d4, d5, d0, d1, d2, d12, q14, q15
+ vld1.u8 {d3}, [r0], r1 //read 5th row
+ vst1.u8 {d12}, [r2], r3 //write 4th 8Byte
+
+ //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
+ vswp q0, q2
+ vswp q1, q2
+
+ sub r4, #4
+ cmp r4, #0
+ bne w8_xy_01_mc_luma_loop
+
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer01WidthEq4_neon
+ push {r4, r5, r6, r7}
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ ldr r4, [r0], r1 //r4=src[-2]
+ ldr r5, [r0], r1 //r5=src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ ldr r6, [r0], r1 //r6=src[0]
+ ldr r7, [r0], r1 //r7=src[1]
+
+ vmov d0, r4, r5
+ vmov d1, r5, r6
+ vmov d2, r6, r7
+
+ ldr r4, [r0], r1 //r4=src[2]
+ vmov d3, r7, r4
+ ldr r7, [sp, #16]
+
+w4_xy_01_mc_luma_loop:
+
+// pld [r0]
+ //using reserving r4
+ ldr r5, [r0], r1 //r5=src[3]
+ ldr r6, [r0], r1 //r6=src[0]
+ vmov d4, r4, r5
+ vmov d5, r5, r6 //reserved r6
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d1, d2, d3, d4, d5, d12, q14, q15
+ vmov r4, r5, d12
+ str r4, [r2], r3 //write 1st 4Byte
+ str r5, [r2], r3 //write 2nd 4Byte
+
+ ldr r5, [r0], r1 //r5=src[1]
+ ldr r4, [r0], r1 //r4=src[2]
+ vmov d0, r6, r5
+ vmov d1, r5, r4 //reserved r4
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d3, d4, d5, d0, d1, d12, q14, q15
+ vmov r5, r6, d12
+ str r5, [r2], r3 //write 3rd 4Byte
+ str r6, [r2], r3 //write 4th 4Byte
+
+ //d4, d5, d0, d1 --> d0, d1, d2, d3
+ vmov q1, q0
+ vmov q0, q2
+
+ sub r7, #4
+ cmp r7, #0
+ bne w4_xy_01_mc_luma_loop
+
+ pop {r4, r5, r6, r7}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer03WidthEq16_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {q0}, [r0], r1 //q0=src[-2]
+ vld1.u8 {q1}, [r0], r1 //q1=src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ vld1.u8 {q2}, [r0], r1 //q2=src[0]
+ vld1.u8 {q3}, [r0], r1 //q3=src[1]
+ vld1.u8 {q4}, [r0], r1 //q4=src[2]
+
+w16_xy_03_luma_loop:
+
+ vld1.u8 {q5}, [r0], r1 //q5=src[3]
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d4, d6, d8, d10, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d3, d5, d7, d9, d11, d13, q14, q15
+ vld1.u8 {q0}, [r0], r1 //read 2nd row
+ vst1.u8 {q6}, [r2], r3 //write 1st 16Byte
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d4, d6, d8, d10, d0, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d5, d7, d9, d11, d1, d13, q14, q15
+ vld1.u8 {q1}, [r0], r1 //read 3rd row
+ vst1.u8 {q6}, [r2], r3 //write 2nd 16Byte
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d4, d6, d8, d10, d0, d2, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d5, d7, d9, d11, d1, d3, d13, q14, q15
+ vld1.u8 {q2}, [r0], r1 //read 4th row
+ vst1.u8 {q6}, [r2], r3 //write 3rd 16Byte
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d6, d8, d10, d0, d2, d4, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d7, d9, d11, d1, d3, d5, d13, q14, q15
+ vld1.u8 {q3}, [r0], r1 //read 5th row
+ vst1.u8 {q6}, [r2], r3 //write 4th 16Byte
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d8, d10, d0, d2, d4, d6, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d9, d11, d1, d3, d5, d7, d13, q14, q15
+ vld1.u8 {q4}, [r0], r1 //read 6th row
+ vst1.u8 {q6}, [r2], r3 //write 5th 16Byte
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d10, d0, d2, d4, d6, d8, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d11, d1, d3, d5, d7, d9, d13, q14, q15
+ vld1.u8 {q5}, [r0], r1 //read 7th row
+ vst1.u8 {q6}, [r2], r3 //write 6th 16Byte
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d4, d6, d8, d10, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d3, d5, d7, d9, d11, d13, q14, q15
+ vld1.u8 {q0}, [r0], r1 //read 8th row
+ vst1.u8 {q6}, [r2], r3 //write 7th 16Byte
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d4, d6, d8, d10, d0, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d5, d7, d9, d11, d1, d13, q14, q15
+ vst1.u8 {q6}, [r2], r3 //write 8th 16Byte
+
+ //q2, q3, q4, q5, q0 --> q0~q4
+ vswp q0, q4
+ vswp q0, q2
+ vmov q1, q3
+ vmov q3, q5 //q0~q4
+
+ sub r4, #8
+ cmp r4, #0
+ bne w16_xy_03_luma_loop
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer03WidthEq8_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {d0}, [r0], r1 //d0=src[-2]
+ vld1.u8 {d1}, [r0], r1 //d1=src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ vld1.u8 {d2}, [r0], r1 //d2=src[0]
+ vld1.u8 {d3}, [r0], r1 //d3=src[1]
+
+ vld1.u8 {d4}, [r0], r1 //d4=src[2]
+ vld1.u8 {d5}, [r0], r1 //d5=src[3]
+
+w8_xy_03_mc_luma_loop:
+
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d1, d2, d3, d4, d5, d12, q14, q15
+ vld1.u8 {d0}, [r0], r1 //read 2nd row
+ vst1.u8 {d12}, [r2], r3 //write 1st 8Byte
+
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d2, d3, d4, d5, d0, d12, q14, q15
+ vld1.u8 {d1}, [r0], r1 //read 3rd row
+ vst1.u8 {d12}, [r2], r3 //write 2nd 8Byte
+
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d3, d4, d5, d0, d1, d12, q14, q15
+ vld1.u8 {d2}, [r0], r1 //read 4th row
+ vst1.u8 {d12}, [r2], r3 //write 3rd 8Byte
+
+ pld [r0]
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d4, d5, d0, d1, d2, d12, q14, q15
+ vld1.u8 {d3}, [r0], r1 //read 5th row
+ vst1.u8 {d12}, [r2], r3 //write 4th 8Byte
+
+ //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
+ vswp q0, q2
+ vswp q1, q2
+
+ sub r4, #4
+ cmp r4, #0
+ bne w8_xy_03_mc_luma_loop
+
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer03WidthEq4_neon
+ push {r4, r5, r6, r7}
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ ldr r4, [r0], r1 //r4=src[-2]
+ ldr r5, [r0], r1 //r5=src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ ldr r6, [r0], r1 //r6=src[0]
+ ldr r7, [r0], r1 //r7=src[1]
+
+ vmov d0, r4, r5
+ vmov d1, r5, r6
+ vmov d2, r6, r7
+
+ ldr r4, [r0], r1 //r4=src[2]
+ vmov d3, r7, r4
+ ldr r7, [sp, #16]
+
+w4_xy_03_mc_luma_loop:
+
+// pld [r0]
+ //using reserving r4
+ ldr r5, [r0], r1 //r5=src[3]
+ ldr r6, [r0], r1 //r6=src[0]
+ vmov d4, r4, r5
+ vmov d5, r5, r6 //reserved r6
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d1, d2, d3, d4, d5, d12, q14, q15
+ vmov r4, r5, d12
+ str r4, [r2], r3 //write 1st 4Byte
+ str r5, [r2], r3 //write 2nd 4Byte
+
+ ldr r5, [r0], r1 //r5=src[1]
+ ldr r4, [r0], r1 //r4=src[2]
+ vmov d0, r6, r5
+ vmov d1, r5, r4 //reserved r4
+
+ FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d3, d4, d5, d0, d1, d12, q14, q15
+ vmov r5, r6, d12
+ str r5, [r2], r3 //write 3rd 4Byte
+ str r6, [r2], r3 //write 4th 4Byte
+
+ //d4, d5, d0, d1 --> d0, d1, d2, d3
+ vmov q1, q0
+ vmov q0, q2
+
+ sub r7, #4
+ cmp r7, #0
+ bne w4_xy_03_mc_luma_loop
+
+ pop {r4, r5, r6, r7}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer02WidthEq16_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {q0}, [r0], r1 //q0=src[-2]
+ vld1.u8 {q1}, [r0], r1 //q1=src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ vld1.u8 {q2}, [r0], r1 //q2=src[0]
+ vld1.u8 {q3}, [r0], r1 //q3=src[1]
+ vld1.u8 {q4}, [r0], r1 //q4=src[2]
+
+w16_v_mc_luma_loop:
+
+ vld1.u8 {q5}, [r0], r1 //q5=src[3]
+
+ FILTER_6TAG_8BITS d0, d2, d4, d6, d8, d10, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d1, d3, d5, d7, d9, d11, d13, q14, q15
+ vld1.u8 {q0}, [r0], r1 //read 2nd row
+ vst1.u8 {q6}, [r2], r3 //write 1st 16Byte
+
+ FILTER_6TAG_8BITS d2, d4, d6, d8, d10, d0, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d3, d5, d7, d9, d11, d1, d13, q14, q15
+ vld1.u8 {q1}, [r0], r1 //read 3rd row
+ vst1.u8 {q6}, [r2], r3 //write 2nd 16Byte
+
+ FILTER_6TAG_8BITS d4, d6, d8, d10, d0, d2, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d5, d7, d9, d11, d1, d3, d13, q14, q15
+ vld1.u8 {q2}, [r0], r1 //read 4th row
+ vst1.u8 {q6}, [r2], r3 //write 3rd 16Byte
+
+ FILTER_6TAG_8BITS d6, d8, d10, d0, d2, d4, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d7, d9, d11, d1, d3, d5, d13, q14, q15
+ vld1.u8 {q3}, [r0], r1 //read 5th row
+ vst1.u8 {q6}, [r2], r3 //write 4th 16Byte
+
+ FILTER_6TAG_8BITS d8, d10, d0, d2, d4, d6, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d9, d11, d1, d3, d5, d7, d13, q14, q15
+ vld1.u8 {q4}, [r0], r1 //read 6th row
+ vst1.u8 {q6}, [r2], r3 //write 5th 16Byte
+
+ FILTER_6TAG_8BITS d10, d0, d2, d4, d6, d8, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d11, d1, d3, d5, d7, d9, d13, q14, q15
+ vld1.u8 {q5}, [r0], r1 //read 7th row
+ vst1.u8 {q6}, [r2], r3 //write 6th 16Byte
+
+ FILTER_6TAG_8BITS d0, d2, d4, d6, d8, d10, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d1, d3, d5, d7, d9, d11, d13, q14, q15
+ vld1.u8 {q0}, [r0], r1 //read 8th row
+ vst1.u8 {q6}, [r2], r3 //write 7th 16Byte
+
+ FILTER_6TAG_8BITS d2, d4, d6, d8, d10, d0, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d3, d5, d7, d9, d11, d1, d13, q14, q15
+ vst1.u8 {q6}, [r2], r3 //write 8th 16Byte
+
+ //q2, q3, q4, q5, q0 --> q0~q4
+ vswp q0, q4
+ vswp q0, q2
+ vmov q1, q3
+ vmov q3, q5 //q0~q4
+
+ sub r4, #8
+ cmp r4, #0
+ bne w16_v_mc_luma_loop
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer02WidthEq8_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {d0}, [r0], r1 //d0=src[-2]
+ vld1.u8 {d1}, [r0], r1 //d1=src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ vld1.u8 {d2}, [r0], r1 //d2=src[0]
+ vld1.u8 {d3}, [r0], r1 //d3=src[1]
+
+ vld1.u8 {d4}, [r0], r1 //d4=src[2]
+ vld1.u8 {d5}, [r0], r1 //d5=src[3]
+
+w8_v_mc_luma_loop:
+
+ pld [r0]
+ FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d12, q14, q15
+ vld1.u8 {d0}, [r0], r1 //read 2nd row
+ vst1.u8 {d12}, [r2], r3 //write 1st 8Byte
+
+ pld [r0]
+ FILTER_6TAG_8BITS d1, d2, d3, d4, d5, d0, d12, q14, q15
+ vld1.u8 {d1}, [r0], r1 //read 3rd row
+ vst1.u8 {d12}, [r2], r3 //write 2nd 8Byte
+
+ pld [r0]
+ FILTER_6TAG_8BITS d2, d3, d4, d5, d0, d1, d12, q14, q15
+ vld1.u8 {d2}, [r0], r1 //read 4th row
+ vst1.u8 {d12}, [r2], r3 //write 3rd 8Byte
+
+ pld [r0]
+ FILTER_6TAG_8BITS d3, d4, d5, d0, d1, d2, d12, q14, q15
+ vld1.u8 {d3}, [r0], r1 //read 5th row
+ vst1.u8 {d12}, [r2], r3 //write 4th 8Byte
+
+ //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
+ vswp q0, q2
+ vswp q1, q2
+
+ sub r4, #4
+ cmp r4, #0
+ bne w8_v_mc_luma_loop
+
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer02WidthEq4_neon
+ push {r4, r5, r6, r7}
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ ldr r4, [r0], r1 //r4=src[-2]
+ ldr r5, [r0], r1 //r5=src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ ldr r6, [r0], r1 //r6=src[0]
+ ldr r7, [r0], r1 //r7=src[1]
+
+ vmov d0, r4, r5
+ vmov d1, r5, r6
+ vmov d2, r6, r7
+
+ ldr r4, [r0], r1 //r4=src[2]
+ vmov d3, r7, r4
+ ldr r7, [sp, #16]
+
+w4_v_mc_luma_loop:
+
+// pld [r0]
+ //using reserving r4
+ ldr r5, [r0], r1 //r5=src[3]
+ ldr r6, [r0], r1 //r6=src[0]
+ vmov d4, r4, r5
+ vmov d5, r5, r6 //reserved r6
+
+ FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d12, q14, q15
+ vmov r4, r5, d12
+ str r4, [r2], r3 //write 1st 4Byte
+ str r5, [r2], r3 //write 2nd 4Byte
+
+ ldr r5, [r0], r1 //r5=src[1]
+ ldr r4, [r0], r1 //r4=src[2]
+ vmov d0, r6, r5
+ vmov d1, r5, r4 //reserved r4
+
+ FILTER_6TAG_8BITS d2, d3, d4, d5, d0, d1, d12, q14, q15
+ vmov r5, r6, d12
+ str r5, [r2], r3 //write 3rd 4Byte
+ str r6, [r2], r3 //write 4th 4Byte
+
+ //d4, d5, d0, d1 --> d0, d1, d2, d3
+ vmov q1, q0
+ vmov q0, q2
+
+ sub r7, #4
+ cmp r7, #0
+ bne w4_v_mc_luma_loop
+
+ pop {r4, r5, r6, r7}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer22WidthEq16_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, #2 //src[-2]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
+ pld [r0]
+ pld [r0, r1]
+
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {d0-d2}, [r0], r1 //use 21(16+5), =src[-2]
+ vld1.u8 {d3-d5}, [r0], r1 //use 21(16+5), =src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+
+ vld1.u8 {d6-d8}, [r0], r1 //use 21(16+5), =src[0]
+ vld1.u8 {d9-d11}, [r0], r1 //use 21(16+5), =src[1]
+ pld [r0]
+ pld [r0, r1]
+ vld1.u8 {d12-d14}, [r0], r1 //use 21(16+5), =src[2]
+
+w16_hv_mc_luma_loop:
+
+ vld1.u8 {d15-d17}, [r0], r1 //use 21(16+5), =src[3]
+ //the 1st row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0]
+
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 5 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1]
+ vst1.u8 {q0}, [r2], r3 //write 16Byte
+
+
+ vld1.u8 {d0-d2}, [r0], r1 //read 2nd row
+ //the 2nd row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d3, d6, d9, d12, d15, d0, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d4, d7,d10, d13, d16, d1,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3 //output to d3
+
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d5, d8,d11, d14, d17, d2,q11, q14, q15 // only 5 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4 //output to d4
+
+ vst1.u8 {d3, d4}, [r2], r3 //write 16Byte
+
+ vld1.u8 {d3-d5}, [r0], r1 //read 3rd row
+ //the 3rd row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d6, d9, d12, d15, d0, d3, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d7,d10, d13, d16, d1, d4,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6 //output to d6
+
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d8,d11, d14, d17, d2, d5,q11, q14, q15 // only 5 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7 //output to d7
+ vst1.u8 {d6, d7}, [r2], r3 //write 16Byte
+
+ vld1.u8 {d6-d8}, [r0], r1 //read 4th row
+ //the 4th row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d9, d12, d15, d0, d3, d6, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d10, d13, d16, d1, d4, d7,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9 //output to d9
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d11, d14, d17, d2, d5, d8,q11, q14, q15 // only 5 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10 //output to d10
+ vst1.u8 {d9, d10}, [r2], r3 //write 16Byte
+
+ //d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14
+ vswp q0, q6
+ vswp q6, q3
+ vmov q5, q2
+ vmov q2, q8
+
+ vmov d20,d8
+ vmov q4, q1
+ vmov q1, q7
+ vmov d14,d20
+
+ sub r4, #4
+ cmp r4, #0
+ bne w16_hv_mc_luma_loop
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer22WidthEq8_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, #2 //src[-2]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
+ pld [r0]
+ pld [r0, r1]
+
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {q0}, [r0], r1 //use 13(8+5), =src[-2]
+ vld1.u8 {q1}, [r0], r1 //use 13(8+5), =src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+
+ vld1.u8 {q2}, [r0], r1 //use 13(8+5), =src[0]
+ vld1.u8 {q3}, [r0], r1 //use 13(8+5), =src[1]
+ pld [r0]
+ pld [r0, r1]
+ vld1.u8 {q4}, [r0], r1 //use 13(8+5), =src[2]
+
+w8_hv_mc_luma_loop:
+
+ vld1.u8 {q5}, [r0], r1 //use 13(8+5), =src[3]
+ //the 1st row
+ pld [r0]
+ // vertical filtered into q6/q7
+ FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d10, q6, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d11, q7, q14, q15 // 5 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0]
+ vst1.u8 d12, [r2], r3 //write 8Byte
+
+ vld1.u8 {q0}, [r0], r1 //read 2nd row
+ //the 2nd row
+ pld [r0]
+ // vertical filtered into q6/q7
+ FILTER_6TAG_8BITS_TO_16BITS d2, d4, d6, d8, d10, d0, q6, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d3, d5, d7, d9, d11, d1, q7, q14, q15 // 5 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0]
+ vst1.u8 d12, [r2], r3 //write 8Byte
+
+ vld1.u8 {q1}, [r0], r1 //read 3rd row
+ //the 3rd row
+ pld [r0]
+ // vertical filtered into q6/q7
+ FILTER_6TAG_8BITS_TO_16BITS d4, d6, d8, d10, d0, d2, q6, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d5, d7, d9, d11, d1, d3, q7, q14, q15 // 5 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0]
+ vst1.u8 d12, [r2], r3 //write 8Byte
+
+ vld1.u8 {q2}, [r0], r1 //read 4th row
+ //the 4th row
+ pld [r0]
+ // vertical filtered into q6/q7
+ FILTER_6TAG_8BITS_TO_16BITS d6, d8, d10, d0, d2, d4, q6, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d7, d9, d11, d1, d3, d5, q7, q14, q15 // 5 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0]
+ vst1.u8 d12, [r2], r3 //write 8Byte
+
+ //q4~q5, q0~q2, --> q0~q4
+ vswp q0, q4
+ vswp q2, q4
+ vmov q3, q1
+ vmov q1, q5
+
+ sub r4, #4
+ cmp r4, #0
+ bne w8_hv_mc_luma_loop
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer22WidthEq4_neon
+ push {r4 ,r5, r6}
+ ldr r6, [sp, #12]
+
+ sub r0, #2 //src[-2]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
+ pld [r0]
+ pld [r0, r1]
+
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {q0}, [r0], r1 //use 9(4+5), =src[-2]
+ vld1.u8 {q1}, [r0], r1 //use 9(4+5), =src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+
+ vld1.u8 {q2}, [r0], r1 //use 9(4+5), =src[0]
+ vld1.u8 {q3}, [r0], r1 //use 9(4+5), =src[1]
+ pld [r0]
+ pld [r0, r1]
+ vld1.u8 {q4}, [r0], r1 //use 9(4+5), =src[2]
+
+w4_hv_mc_luma_loop:
+
+ vld1.u8 {q5}, [r0], r1 //use 9(4+5), =src[3]
+ vld1.u8 {q6}, [r0], r1 //use 9(4+5), =src[4]
+
+ //the 1st&2nd row
+ pld [r0]
+ pld [r0, r1]
+ // vertical filtered
+ FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d10, q7, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d11, q8, q14, q15 // 1 avail
+
+ FILTER_6TAG_8BITS_TO_16BITS d2, d4, d6, d8,d10, d12, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d3, d5, d7, d9,d11, d13,q10, q14, q15 // 1 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q7, q8, q11, q12, q13 //4 avail
+ UNPACK_2_16BITS_TO_ABC q9,q10, q0, q7, q8 //4 avail
+
+ vmov d23, d0
+ vmov d25, d14
+ vmov d27, d16
+
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22 //output to q11[0]
+ vmov r4 ,r5, d22
+ str r4, [r2], r3 //write 4Byte
+ str r5, [r2], r3 //write 4Byte
+
+ //the 3rd&4th row
+ vld1.u8 {q0}, [r0], r1 //use 9(4+5), =src[3]
+ vld1.u8 {q1}, [r0], r1 //use 9(4+5), =src[4]
+ pld [r0]
+ pld [r0, r1]
+ // vertical filtered
+ FILTER_6TAG_8BITS_TO_16BITS d4, d6, d8, d10, d12, d0, q7, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d5, d7, d9, d11, d13, d1, q8, q14, q15 // 1 avail
+
+ FILTER_6TAG_8BITS_TO_16BITS d6, d8,d10, d12, d0, d2, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d7, d9,d11, d13, d1, d3,q10, q14, q15 // 1 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q7, q8, q11, q12, q13 //4 avail
+ UNPACK_2_16BITS_TO_ABC q9,q10, q2, q7, q8 //4 avail
+
+ vmov d23, d4
+ vmov d25, d14
+ vmov d27, d16
+
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22 //output to q11[0]
+ vmov r4 ,r5, d22
+ str r4, [r2], r3 //write 4Byte
+ str r5, [r2], r3 //write 4Byte
+
+ //q4~q6, q0~q1, --> q0~q4
+ vswp q4, q0
+ vmov q3, q4
+ vmov q4, q1
+ vmov q1, q5
+ vmov q2, q6
+
+ sub r6, #4
+ cmp r6, #0
+ bne w4_hv_mc_luma_loop
+
+ pop {r4, r5, r6}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McCopyWidthEq16_neon
+ push {r4}
+ ldr r4, [sp, #4]
+w16_copy_loop:
+ vld1.u8 {q0}, [r0], r1
+ sub r4, #2
+ vld1.u8 {q1}, [r0], r1
+ vst1.u8 {q0}, [r2], r3
+ cmp r4, #0
+ vst1.u8 {q1}, [r2], r3
+ bne w16_copy_loop
+
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McCopyWidthEq8_neon
+ push {r4}
+ ldr r4, [sp, #4]
+w8_copy_loop:
+ vld1.u8 {d0}, [r0], r1
+ vld1.u8 {d1}, [r0], r1
+ vst1.u8 {d0}, [r2], r3
+ vst1.u8 {d1}, [r2], r3
+ sub r4, #2
+ cmp r4, #0
+ bne w8_copy_loop
+
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McCopyWidthEq4_neon
+ push {r4, r5, r6}
+ ldr r4, [sp, #12]
+w4_copy_loop:
+ ldr r5, [r0], r1
+ ldr r6, [r0], r1
+ str r5, [r2], r3
+ str r6, [r2], r3
+
+ sub r4, #2
+ cmp r4, #0
+ bne w4_copy_loop
+
+ pop {r4, r5, r6}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN PixelAvgWidthEq16_neon
+ push {r4}
+ ldr r4, [sp, #4]
+w16_pix_avg_loop:
+ vld1.u8 {q0}, [r2]!
+ vld1.u8 {q1}, [r3]!
+ vld1.u8 {q2}, [r2]!
+ vld1.u8 {q3}, [r3]!
+
+ vld1.u8 {q4}, [r2]!
+ vld1.u8 {q5}, [r3]!
+ vld1.u8 {q6}, [r2]!
+ vld1.u8 {q7}, [r3]!
+
+ AVERAGE_TWO_8BITS d0, d0, d2
+ AVERAGE_TWO_8BITS d1, d1, d3
+ vst1.u8 {q0}, [r0], r1
+
+ AVERAGE_TWO_8BITS d4, d4, d6
+ AVERAGE_TWO_8BITS d5, d5, d7
+ vst1.u8 {q2}, [r0], r1
+
+ AVERAGE_TWO_8BITS d8, d8, d10
+ AVERAGE_TWO_8BITS d9, d9, d11
+ vst1.u8 {q4}, [r0], r1
+
+ AVERAGE_TWO_8BITS d12, d12, d14
+ AVERAGE_TWO_8BITS d13, d13, d15
+ vst1.u8 {q6}, [r0], r1
+
+ sub r4, #4
+ cmp r4, #0
+ bne w16_pix_avg_loop
+
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN PixelAvgWidthEq8_neon
+ push {r4, r5}
+ ldr r4, [sp, #8]
+ mov r5, #16
+w8_pix_avg_loop:
+
+ vld1.u8 {d0}, [r2], r5
+ vld1.u8 {d2}, [r3], r5
+ vld1.u8 {d1}, [r2], r5
+ vld1.u8 {d3}, [r3], r5
+
+ AVERAGE_TWO_8BITS d0, d0, d2
+ AVERAGE_TWO_8BITS d1, d1, d3
+ vst1.u8 {d0}, [r0], r1
+ vst1.u8 {d1}, [r0], r1
+
+ vld1.u8 {d4}, [r2], r5
+ vld1.u8 {d6}, [r3], r5
+ vld1.u8 {d5}, [r2], r5
+ vld1.u8 {d7}, [r3], r5
+
+ AVERAGE_TWO_8BITS d4, d4, d6
+ AVERAGE_TWO_8BITS d5, d5, d7
+ vst1.u8 {d4}, [r0], r1
+ vst1.u8 {d5}, [r0], r1
+
+ sub r4, #4
+ cmp r4, #0
+ bne w8_pix_avg_loop
+
+ pop {r4, r5}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN PixelAvgWidthEq4_neon
+ push {r4-r8}
+ ldr r4, [sp, #20]
+w4_pix_avg_loop:
+
+ ldr r5, [r2]
+ ldr r6, [r2, #16]
+ ldr r7, [r3]
+ ldr r8, [r3, #16]
+ add r2, #32
+ add r3, #32
+
+ vmov d0, r5, r6
+ vmov d1, r7, r8
+ AVERAGE_TWO_8BITS d0, d0, d1
+ vmov r5, r6, d0
+
+ str r5, [r0], r1
+ str r6, [r0], r1
+
+ sub r4, #2
+ cmp r4, #0
+ bne w4_pix_avg_loop
+
+ pop {r4-r8}
+WELS_ASM_FUNC_END
+
+WELS_ASM_FUNC_BEGIN McChromaWidthEq8_neon
+ push {r4, r5}
+ ldr r4, [sp, #8]
+ ldr r5, [sp, #12]
+// normal case: {cA*src[x] + cB*src[x+1]} + {cC*src[x+stride] + cD*srcp[x+stride+1]}
+// we can opti it by adding vert only/ hori only cases, to be continue
+ vld1.u8 {d31}, [r4] //load A/B/C/D
+ vld1.u8 {q0}, [r0], r1 //src[x]
+
+ vdup.u8 d28, d31[0] //A
+ vdup.u8 d29, d31[1] //B
+ vdup.u8 d30, d31[2] //C
+ vdup.u8 d31, d31[3] //D
+
+ vext.u8 d1, d0, d1, #1 //src[x+1]
+
+w8_mc_chroma_loop: // each two pxl row
+ vld1.u8 {q1}, [r0], r1 //src[x+stride]
+ vld1.u8 {q2}, [r0], r1 //src[x+2*stride]
+ vext.u8 d3, d2, d3, #1 //src[x+stride+1]
+ vext.u8 d5, d4, d5, #1 //src[x+2*stride+1]
+
+ vmull.u8 q3, d0, d28 //(src[x] * A)
+ vmlal.u8 q3, d1, d29 //+=(src[x+1] * B)
+ vmlal.u8 q3, d2, d30 //+=(src[x+stride] * C)
+ vmlal.u8 q3, d3, d31 //+=(src[x+stride+1] * D)
+ vrshrn.u16 d6, q3, #6
+ vst1.u8 d6, [r2], r3
+
+ vmull.u8 q3, d2, d28 //(src[x] * A)
+ vmlal.u8 q3, d3, d29 //+=(src[x+1] * B)
+ vmlal.u8 q3, d4, d30 //+=(src[x+stride] * C)
+ vmlal.u8 q3, d5, d31 //+=(src[x+stride+1] * D)
+ vrshrn.u16 d6, q3, #6
+ vst1.u8 d6, [r2], r3
+
+ vmov q0, q2
+ sub r5, #2
+ cmp r5, #0
+ bne w8_mc_chroma_loop
+
+ pop {r4, r5}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McChromaWidthEq4_neon
+
+ push {r4, r5, r6}
+ ldr r4, [sp, #12]
+ ldr r6, [sp, #16]
+// normal case: {cA*src[x] + cB*src[x+1]} + {cC*src[x+stride] + cD*srcp[x+stride+1]}
+// we can opti it by adding vert only/ hori only cases, to be continue
+ vld1.u8 {d31}, [r4] //load A/B/C/D
+
+ vdup.u8 d28, d31[0] //A
+ vdup.u8 d29, d31[1] //B
+ vdup.u8 d30, d31[2] //C
+ vdup.u8 d31, d31[3] //D
+
+w4_mc_chroma_loop: // each two pxl row
+ vld1.u8 {d0}, [r0], r1 //a::src[x]
+ vld1.u8 {d2}, [r0], r1 //b::src[x+stride]
+ vld1.u8 {d4}, [r0] //c::src[x+2*stride]
+
+ vshr.u64 d1, d0, #8
+ vshr.u64 d3, d2, #8
+ vshr.u64 d5, d4, #8
+
+ vmov q3, q1 //b::[0:7]+b::[1~8]
+ vtrn.32 q0, q1 //d0{a::[0:3]+b::[0:3]}; d1{a::[1:4]+b::[1:4]}
+ vtrn.32 q3, q2 //d6{b::[0:3]+c::[0:3]}; d7{b::[1:4]+c::[1:4]}
+
+ vmull.u8 q1, d0, d28 //(src[x] * A)
+ vmlal.u8 q1, d1, d29 //+=(src[x+1] * B)
+ vmlal.u8 q1, d6, d30 //+=(src[x+stride] * C)
+ vmlal.u8 q1, d7, d31 //+=(src[x+stride+1] * D)
+
+ vrshrn.u16 d2, q1, #6
+ vmov r4, r5, d2
+ str r4, [r2], r3
+ str r5, [r2], r3
+
+ sub r6, #2
+ cmp r6, #0
+ bne w4_mc_chroma_loop
+
+ pop {r4, r5, r6}
+WELS_ASM_FUNC_END
+
+WELS_ASM_FUNC_BEGIN McHorVer20Width17_neon
+ push {r4-r5}
+ mov r4, #20
+ mov r5, #1
+ sub r4, r4, r4, lsl #(16-2)
+ lsl r5, #16
+ ror r4, #16
+ vmov d3, r5, r4 // 0x0014FFFB00010000
+
+ sub r3, #16
+ ldr r4, [sp, #8]
+
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
+
+w17_h_mc_luma_loop:
+ vld1.u8 {d0,d1,d2}, [r0], r1 //only use 22(17+5); q0=src[-2]
+
+ vext.8 q2, q0, q1, #1 //q2=src[-1]
+ vext.8 q3, q0, q1, #2 //q3=src[0]
+ vext.8 q4, q0, q1, #3 //q4=src[1]
+ vext.8 q5, q0, q1, #4 //q5=src[2]
+ vext.8 q6, q0, q1, #5 //q6=src[3]
+
+ FILTER_6TAG_8BITS d0, d4, d6, d8, d10, d12, d14, q14, q15
+
+ FILTER_6TAG_8BITS d1, d5, d7, d9, d11, d13, d15, q14, q15
+
+ vst1.u8 {d14, d15}, [r2]! //write [0:15] Byte
+
+ vsli.64 d2, d2, #8 // [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X
+ FILTER_SINGLE_TAG_8BITS d2, d3, d14, q7, q1
+
+ vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte
+
+ sub r4, #1
+ cmp r4, #0
+ bne w17_h_mc_luma_loop
+ pop {r4-r5}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer20Width9_neon
+ push {r4-r5}
+ mov r4, #20
+ mov r5, #1
+ sub r4, r4, r4, lsl #(16-2)
+ lsl r5, #16
+ ror r4, #16
+ vmov d7, r5, r4 // 0x0014FFFB00010000
+
+ sub r3, #8
+ ldr r4, [sp, #8]
+
+ sub r0, #2
+ vmov.u16 q14, #0x0014 // 20
+ vshr.u16 q15, q14, #2 // 5
+
+w9_h_mc_luma_loop:
+ vld1.u8 {d0,d1}, [r0], r1 //only use 14(9+5); q0=src[-2]
+ pld [r0]
+
+ vext.8 d2, d0, d1, #1 //d2=src[-1]
+ vext.8 d3, d0, d1, #2 //d3=src[0]
+ vext.8 d4, d0, d1, #3 //d4=src[1]
+ vext.8 d5, d0, d1, #4 //d5=src[2]
+ vext.8 d6, d0, d1, #5 //d6=src[3]
+
+ FILTER_6TAG_8BITS d0, d2, d3, d4, d5, d6, d8, q14, q15
+
+ sub r4, #1
+ vst1.u8 {d8}, [r2]! //write [0:7] Byte
+
+ vsli.64 d2, d1, #8 // [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X
+ FILTER_SINGLE_TAG_8BITS d2, d7, d14, q7, q1
+ vst1.u8 {d2[0]}, [r2], r3 //write 8th Byte
+
+ cmp r4, #0
+ bne w9_h_mc_luma_loop
+ pop {r4-r5}
+WELS_ASM_FUNC_END
+
+
+ WELS_ASM_FUNC_BEGIN McHorVer02Height17_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {q0}, [r0], r1 //q0=src[-2]
+ vld1.u8 {q1}, [r0], r1 //q1=src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ vld1.u8 {q2}, [r0], r1 //q2=src[0]
+ vld1.u8 {q3}, [r0], r1 //q3=src[1]
+ vld1.u8 {q4}, [r0], r1 //q4=src[2]
+
+w17_v_mc_luma_loop:
+
+ vld1.u8 {q5}, [r0], r1 //q5=src[3]
+
+ FILTER_6TAG_8BITS d0, d2, d4, d6, d8, d10, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d1, d3, d5, d7, d9, d11, d13, q14, q15
+ vld1.u8 {q0}, [r0], r1 //read 2nd row
+ vst1.u8 {q6}, [r2], r3 //write 1st 16Byte
+
+ FILTER_6TAG_8BITS d2, d4, d6, d8, d10, d0, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d3, d5, d7, d9, d11, d1, d13, q14, q15
+ vld1.u8 {q1}, [r0], r1 //read 3rd row
+ vst1.u8 {q6}, [r2], r3 //write 2nd 16Byte
+
+ FILTER_6TAG_8BITS d4, d6, d8, d10, d0, d2, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d5, d7, d9, d11, d1, d3, d13, q14, q15
+ vld1.u8 {q2}, [r0], r1 //read 4th row
+ vst1.u8 {q6}, [r2], r3 //write 3rd 16Byte
+
+ FILTER_6TAG_8BITS d6, d8, d10, d0, d2, d4, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d7, d9, d11, d1, d3, d5, d13, q14, q15
+ vld1.u8 {q3}, [r0], r1 //read 5th row
+ vst1.u8 {q6}, [r2], r3 //write 4th 16Byte
+
+ FILTER_6TAG_8BITS d8, d10, d0, d2, d4, d6, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d9, d11, d1, d3, d5, d7, d13, q14, q15
+ vld1.u8 {q4}, [r0], r1 //read 6th row
+ vst1.u8 {q6}, [r2], r3 //write 5th 16Byte
+
+ FILTER_6TAG_8BITS d10, d0, d2, d4, d6, d8, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d11, d1, d3, d5, d7, d9, d13, q14, q15
+ vld1.u8 {q5}, [r0], r1 //read 7th row
+ vst1.u8 {q6}, [r2], r3 //write 6th 16Byte
+
+ FILTER_6TAG_8BITS d0, d2, d4, d6, d8, d10, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d1, d3, d5, d7, d9, d11, d13, q14, q15
+ vld1.u8 {q0}, [r0], r1 //read 8th row
+ vst1.u8 {q6}, [r2], r3 //write 7th 16Byte
+
+ FILTER_6TAG_8BITS d2, d4, d6, d8, d10, d0, d12, q14, q15
+ pld [r0]
+ FILTER_6TAG_8BITS d3, d5, d7, d9, d11, d1, d13, q14, q15
+ vst1.u8 {q6}, [r2], r3 //write 8th 16Byte
+
+ //q2, q3, q4, q5, q0 --> q0~q4
+ vswp q0, q4
+ vswp q0, q2
+ vmov q1, q3
+ vmov q3, q5 //q0~q4
+
+ sub r4, #8
+ cmp r4, #1
+ bne w17_v_mc_luma_loop
+ // the last 16Bytes
+ vld1.u8 {q5}, [r0], r1 //q5=src[3]
+ FILTER_6TAG_8BITS d0, d2, d4, d6, d8, d10, d12, q14, q15
+ FILTER_6TAG_8BITS d1, d3, d5, d7, d9, d11, d13, q14, q15
+ vst1.u8 {q6}, [r2], r3 //write 1st 16Byte
+
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer02Height9_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
+ pld [r0]
+ pld [r0, r1]
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {d0}, [r0], r1 //d0=src[-2]
+ vld1.u8 {d1}, [r0], r1 //d1=src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+ vld1.u8 {d2}, [r0], r1 //d2=src[0]
+ vld1.u8 {d3}, [r0], r1 //d3=src[1]
+
+ vld1.u8 {d4}, [r0], r1 //d4=src[2]
+ vld1.u8 {d5}, [r0], r1 //d5=src[3]
+
+w9_v_mc_luma_loop:
+
+ pld [r0]
+ FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d12, q14, q15
+ vld1.u8 {d0}, [r0], r1 //read 2nd row
+ vst1.u8 {d12}, [r2], r3 //write 1st 8Byte
+
+ pld [r0]
+ FILTER_6TAG_8BITS d1, d2, d3, d4, d5, d0, d12, q14, q15
+ vld1.u8 {d1}, [r0], r1 //read 3rd row
+ vst1.u8 {d12}, [r2], r3 //write 2nd 8Byte
+
+ pld [r0]
+ FILTER_6TAG_8BITS d2, d3, d4, d5, d0, d1, d12, q14, q15
+ vld1.u8 {d2}, [r0], r1 //read 4th row
+ vst1.u8 {d12}, [r2], r3 //write 3rd 8Byte
+
+ pld [r0]
+ FILTER_6TAG_8BITS d3, d4, d5, d0, d1, d2, d12, q14, q15
+ vld1.u8 {d3}, [r0], r1 //read 5th row
+ vst1.u8 {d12}, [r2], r3 //write 4th 8Byte
+
+ //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
+ vswp q0, q2
+ vswp q1, q2
+
+ sub r4, #4
+ cmp r4, #1
+ bne w9_v_mc_luma_loop
+
+ FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d12, q14, q15
+ vst1.u8 {d12}, [r2], r3 //write last 8Byte
+
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer22Width17_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, #2 //src[-2]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
+ pld [r0]
+ pld [r0, r1]
+
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {d0-d2}, [r0], r1 //use 21(17+5), =src[-2]
+ vld1.u8 {d3-d5}, [r0], r1 //use 21(17+5), =src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+
+ vld1.u8 {d6-d8}, [r0], r1 //use 21(17+5), =src[0]
+ vld1.u8 {d9-d11}, [r0], r1 //use 21(17+5), =src[1]
+ pld [r0]
+ pld [r0, r1]
+ vld1.u8 {d12-d14}, [r0], r1 //use 21(17+5), =src[2]
+ sub r3, #16
+
+w17_hv_mc_luma_loop:
+
+ vld1.u8 {d15-d17}, [r0], r1 //use 21(17+5), =src[3]
+ //the 1st row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0]
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1]
+ vst1.u8 {d0, d1}, [r2]! //write 16Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d2, d22, d23, q11 //output to d2[0]
+ vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte
+
+ vld1.u8 {d0-d2}, [r0], r1 //read 2nd row
+ //the 2nd row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d3, d6, d9, d12, d15, d0, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d4, d7,d10, d13, d16, d1,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3 //output to d3
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d5, d8,d11, d14, d17, d2,q11, q14, q15 // only 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4 //output to d4
+ vst1.u8 {d3, d4}, [r2]! //write 16Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d5, d22, d23, q11 //output to d5[0]
+ vst1.u8 {d5[0]}, [r2], r3 //write 16th Byte
+
+ vld1.u8 {d3-d5}, [r0], r1 //read 3rd row
+ //the 3rd row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d6, d9, d12, d15, d0, d3, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d7,d10, d13, d16, d1, d4,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6 //output to d6
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d8,d11, d14, d17, d2, d5,q11, q14, q15 // only 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7 //output to d7
+ vst1.u8 {d6, d7}, [r2]! //write 16Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d8, d22, d23, q11 //output to d8[0]
+ vst1.u8 {d8[0]}, [r2], r3 //write 16th Byte
+
+ vld1.u8 {d6-d8}, [r0], r1 //read 4th row
+ //the 4th row
+ pld [r0]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d9, d12, d15, d0, d3, d6, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d10, d13, d16, d1, d4, d7,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9 //output to d9
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d11, d14, d17, d2, d5, d8,q11, q14, q15 // only 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10 //output to d10
+ vst1.u8 {d9, d10}, [r2], r3 //write 16Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d11, d22, d23, q11 //output to d11[0]
+ vst1.u8 {d11[0]}, [r2], r3 //write 16th Byte
+
+ //d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14
+ vswp q0, q6
+ vswp q6, q3
+ vmov q5, q2
+ vmov q2, q8
+
+ vmov d20,d8
+ vmov q4, q1
+ vmov q1, q7
+ vmov d14,d20
+
+ sub r4, #4
+ cmp r4, #1
+ bne w17_hv_mc_luma_loop
+ //the last row
+ vld1.u8 {d15-d17}, [r0], r1 //use 21(17+5), =src[3]
+ // vertical filtered into q9/q10
+ FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0]
+ // vertical filtered into q10/q11
+ FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1]
+ vst1.u8 {q0}, [r2]! //write 16Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d2, d22, d23, q11 //output to d2[0]
+ vst1.u8 {d2[0]}, [r2], r3 //write 16th Byte
+
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer22Width9_neon
+ push {r4}
+ ldr r4, [sp, #4]
+
+ sub r0, #2 //src[-2]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
+ pld [r0]
+ pld [r0, r1]
+
+ vmov.u16 q14, #0x0014 // 20
+ vld1.u8 {q0}, [r0], r1 //use 14(9+5), =src[-2]
+ vld1.u8 {q1}, [r0], r1 //use 14(9+5), =src[-1]
+
+ pld [r0]
+ pld [r0, r1]
+ vshr.u16 q15, q14, #2 // 5
+
+ vld1.u8 {q2}, [r0], r1 //use 14(9+5), =src[0]
+ vld1.u8 {q3}, [r0], r1 //use 14(9+5), =src[1]
+ pld [r0]
+ pld [r0, r1]
+ vld1.u8 {q4}, [r0], r1 //use 14(9+5), =src[2]
+ sub r3, #8
+
+w9_hv_mc_luma_loop:
+
+ vld1.u8 {q5}, [r0], r1 //use 14(9+5), =src[3]
+ //the 1st row
+ pld [r0]
+ // vertical filtered into q6/q7
+ FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d10, q6, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d11, q7, q14, q15 // 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0]
+ vst1.u8 d12, [r2]! //write 8Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d13, d14, d15, q7 //output to d13[0]
+ vst1.u8 {d13[0]}, [r2], r3 //write 8th Byte
+
+ vld1.u8 {q0}, [r0], r1 //read 2nd row
+ //the 2nd row
+ pld [r0]
+ // vertical filtered into q6/q7
+ FILTER_6TAG_8BITS_TO_16BITS d2, d4, d6, d8, d10, d0, q6, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d3, d5, d7, d9, d11, d1, q7, q14, q15 // 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0]
+ vst1.u8 d12, [r2]! //write 8Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d13, d14, d15, q7 //output to d13[0]
+ vst1.u8 {d13[0]}, [r2], r3 //write 8th Byte
+
+ vld1.u8 {q1}, [r0], r1 //read 3rd row
+ //the 3rd row
+ pld [r0]
+ // vertical filtered into q6/q7
+ FILTER_6TAG_8BITS_TO_16BITS d4, d6, d8, d10, d0, d2, q6, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d5, d7, d9, d11, d1, d3, q7, q14, q15 // 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0]
+ vst1.u8 d12, [r2]! //write 8Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d13, d14, d15, q7 //output to d13[0]
+ vst1.u8 {d13[0]}, [r2], r3 //write 8th Byte
+
+ vld1.u8 {q2}, [r0], r1 //read 4th row
+ //the 4th row
+ pld [r0]
+ // vertical filtered into q6/q7
+ FILTER_6TAG_8BITS_TO_16BITS d6, d8, d10, d0, d2, d4, q6, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d7, d9, d11, d1, d3, d5, q7, q14, q15 // 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0]
+ vst1.u8 d12, [r2]! //write 8Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d13, d14, d15, q7 //output to d13[0]
+ vst1.u8 {d13[0]}, [r2], r3 //write 8th Byte
+
+ //q4~q5, q0~q2, --> q0~q4
+ vswp q0, q4
+ vswp q2, q4
+ vmov q3, q1
+ vmov q1, q5
+
+ sub r4, #4
+ cmp r4, #1
+ bne w9_hv_mc_luma_loop
+ //the last row
+ vld1.u8 {q5}, [r0], r1 //use 14(9+5), =src[3]
+ // vertical filtered into q6/q7
+ FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d10, q6, q14, q15 // 8 avail
+ FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d11, q7, q14, q15 // 6 avail
+ // horizon filtered
+ UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13
+ FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0]
+ vst1.u8 d12, [r2]! //write 8Byte
+ UNPACK_1_IN_8x16BITS_TO_8BITS d13, d14, d15, q7 //output to d13[0]
+ vst1.u8 {d13[0]}, [r2], r3 //write 8th Byte
+ pop {r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN PixStrideAvgWidthEq16_neon
+ push {r4, r5, r6}
+ ldr r4, [sp, #12]
+ ldr r5, [sp, #16]
+ ldr r6, [sp, #20]
+
+enc_w16_pix_avg_loop:
+ vld1.u8 {q0}, [r2], r3
+ vld1.u8 {q1}, [r4], r5
+ vld1.u8 {q2}, [r2], r3
+ vld1.u8 {q3}, [r4], r5
+
+ vld1.u8 {q4}, [r2], r3
+ vld1.u8 {q5}, [r4], r5
+ vld1.u8 {q6}, [r2], r3
+ vld1.u8 {q7}, [r4], r5
+
+ AVERAGE_TWO_8BITS d0, d0, d2
+ AVERAGE_TWO_8BITS d1, d1, d3
+ vst1.u8 {q0}, [r0], r1
+
+ AVERAGE_TWO_8BITS d4, d4, d6
+ AVERAGE_TWO_8BITS d5, d5, d7
+ vst1.u8 {q2}, [r0], r1
+
+ AVERAGE_TWO_8BITS d8, d8, d10
+ AVERAGE_TWO_8BITS d9, d9, d11
+ vst1.u8 {q4}, [r0], r1
+
+ AVERAGE_TWO_8BITS d12, d12, d14
+ AVERAGE_TWO_8BITS d13, d13, d15
+ vst1.u8 {q6}, [r0], r1
+
+ sub r6, #4
+ cmp r6, #0
+ bne enc_w16_pix_avg_loop
+
+ pop {r4, r5, r6}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN PixStrideAvgWidthEq8_neon
+ push {r4, r5, r6}
+ ldr r4, [sp, #12]
+ ldr r5, [sp, #16]
+ ldr r6, [sp, #20]
+enc_w8_pix_avg_loop:
+
+ vld1.u8 {d0}, [r2], r3
+ vld1.u8 {d2}, [r4], r5
+ vld1.u8 {d1}, [r2], r3
+ vld1.u8 {d3}, [r4], r5
+
+ AVERAGE_TWO_8BITS d0, d0, d2
+ AVERAGE_TWO_8BITS d1, d1, d3
+ vst1.u8 {d0}, [r0], r1
+ vst1.u8 {d1}, [r0], r1
+
+ vld1.u8 {d4}, [r2], r3
+ vld1.u8 {d6}, [r4], r5
+ vld1.u8 {d5}, [r2], r3
+ vld1.u8 {d7}, [r4], r5
+
+ AVERAGE_TWO_8BITS d4, d4, d6
+ AVERAGE_TWO_8BITS d5, d5, d7
+ vst1.u8 {d4}, [r0], r1
+ vst1.u8 {d5}, [r0], r1
+
+ sub r6, #4
+ cmp r6, #0
+ bne enc_w8_pix_avg_loop
+
+ pop {r4, r5, r6}
+WELS_ASM_FUNC_END
+
+#endif
--- a/codec/decoder/core/arm/mc_neon.S
+++ /dev/null
@@ -1,1602 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#ifdef HAVE_NEON
-.text
-#include "arm_arch_common_macro.S"
-
-#ifdef APPLE_IOS
-.macro AVERAGE_TWO_8BITS
-// { // input:dst_d, src_d A and B; working: q13
- vaddl.u8 q13, $2, $1
- vrshrn.u16 $0, q13, #1
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
- vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
- vaddl.u8 q13, $2, $3 //src[0]+src[1]
- vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, $1, $4 //src[-1]+src[2]
- vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
- vqrshrun.s16 $6, q12, #5
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS_AVERAGE_WITH_0
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
- vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
- vaddl.u8 q13, $2, $3 //src[0]+src[1]
- vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, $1, $4 //src[-1]+src[2]
- vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
- vqrshrun.s16 $6, q12, #5
- vaddl.u8 q13, $2, $6
- vrshrn.u16 $6, q13, #1
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS_AVERAGE_WITH_1
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
- vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
- vaddl.u8 q13, $2, $3 //src[0]+src[1]
- vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, $1, $4 //src[-1]+src[2]
- vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
- vqrshrun.s16 $6, q12, #5
- vaddl.u8 q13, $3, $6
- vrshrn.u16 $6, q13, #1
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS_TO_16BITS
-// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:q13
- vaddl.u8 $6, $0, $5 //dst_q=src[-2]+src[3]
- vaddl.u8 q13, $2, $3 //src[0]+src[1]
- vmla.u16 $6, q13, $7 //dst_q += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, $1, $4 //src[-1]+src[2]
- vmls.s16 $6, q13, $8 //dst_q -= 5*(src[-1]+src[2]), 2 cycles
-// }
-.endm
-
-.macro FILTER_3_IN_16BITS_TO_8BITS
-// { // input:a, b, c, dst_d;
- vsub.s16 $0, $0, $1 //a-b
- vshr.s16 $0, $0, #2 //(a-b)/4
- vsub.s16 $0, $0, $1 //(a-b)/4-b
- vadd.s16 $0, $0, $2 //(a-b)/4-b+c
- vshr.s16 $0, $0, #2 //((a-b)/4-b+c)/4
- vadd.s16 $0, $0, $2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- vqrshrun.s16 $3, $0, #6 //(+32)>>6
-// }
-.endm
-
-.macro UNPACK_2_16BITS_TO_ABC
-// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
- vext.16 $4, $0, $1, #2 //src[0]
- vext.16 $3, $0, $1, #3 //src[1]
- vadd.s16 $4, $3 //c=src[0]+src[1]
-
- vext.16 $3, $0, $1, #1 //src[-1]
- vext.16 $2, $0, $1, #4 //src[2]
- vadd.s16 $3, $2 //b=src[-1]+src[2]
-
- vext.16 $2, $0, $1, #5 //src[3]
- vadd.s16 $2, $0 //a=src[-2]+src[3]
-// }
-.endm
-#else
-.macro AVERAGE_TWO_8BITS arg0, arg1, arg2
-// { // input:dst_d, src_d A and B; working: q13
- vaddl.u8 q13, \arg2, \arg1
- vrshrn.u16 \arg0, q13, #1
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
- vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3]
- vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
- vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
- vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
- vqrshrun.s16 \arg6, q12, #5
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
- vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3]
- vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
- vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
- vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
- vqrshrun.s16 \arg6, q12, #5
- vaddl.u8 q13, \arg2, \arg6
- vrshrn.u16 \arg6, q13, #1
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
- vaddl.u8 q12, \arg0, \arg5 //q12=src[-2]+src[3]
- vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
- vmla.u16 q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
- vmls.s16 q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
- vqrshrun.s16 \arg6, q12, #5
- vaddl.u8 q13, \arg3, \arg6
- vrshrn.u16 \arg6, q13, #1
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS_TO_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:q13
- vaddl.u8 \arg6, \arg0, \arg5 //dst_q=src[-2]+src[3]
- vaddl.u8 q13, \arg2, \arg3 //src[0]+src[1]
- vmla.u16 \arg6, q13, \arg7 //dst_q += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, \arg1, \arg4 //src[-1]+src[2]
- vmls.s16 \arg6, q13, \arg8 //dst_q -= 5*(src[-1]+src[2]), 2 cycles
-// }
-.endm
-
-.macro FILTER_3_IN_16BITS_TO_8BITS arg0, arg1, arg2, arg3
-// { // input:a, b, c, dst_d;
- vsub.s16 \arg0, \arg0, \arg1 //a-b
- vshr.s16 \arg0, \arg0, #2 //(a-b)/4
- vsub.s16 \arg0, \arg0, \arg1 //(a-b)/4-b
- vadd.s16 \arg0, \arg0, \arg2 //(a-b)/4-b+c
- vshr.s16 \arg0, \arg0, #2 //((a-b)/4-b+c)/4
- vadd.s16 \arg0, \arg0, \arg2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- vqrshrun.s16 \arg3, \arg0, #6 //(+32)>>6
-// }
-.endm
-
-.macro UNPACK_2_16BITS_TO_ABC arg0, arg1, arg2, arg3, arg4
-// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
- vext.16 \arg4, \arg0, \arg1, #2 //src[0]
- vext.16 \arg3, \arg0, \arg1, #3 //src[1]
- vadd.s16 \arg4, \arg3 //c=src[0]+src[1]
-
- vext.16 \arg3, \arg0, \arg1, #1 //src[-1]
- vext.16 \arg2, \arg0, \arg1, #4 //src[2]
- vadd.s16 \arg3,\arg2 //b=src[-1]+src[2]
-
- vext.16 \arg2, \arg0, \arg1, #5 //src[3]
- vadd.s16 \arg2, \arg0 //a=src[-2]+src[3]
-// }
-.endm
-#endif
-
-WELS_ASM_FUNC_BEGIN McHorVer20WidthEq16_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
-
-w16_h_mc_luma_loop:
- vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2]
- pld [r0]
- pld [r0, #16]
-
- vext.8 q2, q0, q1, #1 //q2=src[-1]
- vext.8 q3, q0, q1, #2 //q3=src[0]
- vext.8 q4, q0, q1, #3 //q4=src[1]
- vext.8 q5, q0, q1, #4 //q5=src[2]
- vext.8 q6, q0, q1, #5 //q6=src[3]
-
- FILTER_6TAG_8BITS d0, d4, d6, d8, d10, d12, d2, q14, q15
-
- FILTER_6TAG_8BITS d1, d5, d7, d9, d11, d13, d3, q14, q15
-
- sub r4, #1
- vst1.u8 {d2, d3}, [r2], r3 //write 16Byte
-
- cmp r4, #0
- bne w16_h_mc_luma_loop
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer20WidthEq8_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
-
-w8_h_mc_luma_loop:
- vld1.u8 {d0,d1}, [r0], r1 //only use 13(8+5); q0=src[-2]
- pld [r0]
-
- vext.8 d2, d0, d1, #1 //d2=src[-1]
- vext.8 d3, d0, d1, #2 //d3=src[0]
- vext.8 d4, d0, d1, #3 //d4=src[1]
- vext.8 d5, d0, d1, #4 //d5=src[2]
- vext.8 d6, d0, d1, #5 //d6=src[3]
-
- FILTER_6TAG_8BITS d0, d2, d3, d4, d5, d6, d1, q14, q15
-
- sub r4, #1
- vst1.u8 {d1}, [r2], r3
-
- cmp r4, #0
- bne w8_h_mc_luma_loop
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer20WidthEq4_neon
- push {r4, r5, r6}
- ldr r6, [sp, #12]
-
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
-
-w4_h_mc_luma_loop:
- vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5]
- pld [r0]
- vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5]
- pld [r0]
-
- vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6]
- vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6]
- vext.8 q3, q2, q2, #1 //src[0:6 *]
- vext.8 q4, q2, q2, #2 //src[1:6 * *]
-
- vtrn.32 q3, q4 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
- vtrn.32 d6, d7 //d6:[0:3]; d7[1:4]
- vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5]
- vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6]
-
- FILTER_6TAG_8BITS d0, d4, d6, d7, d2, d5, d1, q14, q15
-
- vmov r4, r5, d1
- str r4, [r2], r3
- str r5, [r2], r3
-
- sub r6, #2
- cmp r6, #0
- bne w4_h_mc_luma_loop
-
- pop {r4, r5, r6}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer10WidthEq16_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
-
-w16_xy_10_mc_luma_loop:
- vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2]
- pld [r0]
- pld [r0, #16]
-
- vext.8 q2, q0, q1, #1 //q2=src[-1]
- vext.8 q3, q0, q1, #2 //q3=src[0]
- vext.8 q4, q0, q1, #3 //q4=src[1]
- vext.8 q5, q0, q1, #4 //q5=src[2]
- vext.8 q6, q0, q1, #5 //q6=src[3]
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d4, d6, d8, d10, d12, d2, q14, q15
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d5, d7, d9, d11, d13, d3, q14, q15
-
- sub r4, #1
- vst1.u8 {d2, d3}, [r2], r3 //write 16Byte
-
- cmp r4, #0
- bne w16_xy_10_mc_luma_loop
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer10WidthEq8_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
-
-w8_xy_10_mc_luma_loop:
- vld1.u8 {d0,d1}, [r0], r1 //only use 13(8+5); q0=src[-2]
- pld [r0]
-
- vext.8 d2, d0, d1, #1 //d2=src[-1]
- vext.8 d3, d0, d1, #2 //d3=src[0]
- vext.8 d4, d0, d1, #3 //d4=src[1]
- vext.8 d5, d0, d1, #4 //d5=src[2]
- vext.8 d6, d0, d1, #5 //d6=src[3]
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d3, d4, d5, d6, d1, q14, q15
-
- sub r4, #1
- vst1.u8 {d1}, [r2], r3
-
- cmp r4, #0
- bne w8_xy_10_mc_luma_loop
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer10WidthEq4_neon
- push {r4, r5, r6}
- ldr r6, [sp, #12]
-
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
-
-w4_xy_10_mc_luma_loop:
- vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5]
- pld [r0]
- vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5]
- pld [r0]
-
- vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6]
- vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6]
- vext.8 q3, q2, q2, #1 //src[0:6 *]
- vext.8 q4, q2, q2, #2 //src[1:6 * *]
-
- vtrn.32 q3, q4 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
- vtrn.32 d6, d7 //d6:[0:3]; d7[1:4]
- vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5]
- vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6]
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d4, d6, d7, d2, d5, d1, q14, q15
-
- vmov r4, r5, d1
- str r4, [r2], r3
- str r5, [r2], r3
-
- sub r6, #2
- cmp r6, #0
- bne w4_xy_10_mc_luma_loop
-
- pop {r4, r5, r6}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer30WidthEq16_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
-
-w16_xy_30_mc_luma_loop:
- vld1.u8 {d0,d1,d2}, [r0], r1 //only use 21(16+5); q0=src[-2]
- pld [r0]
- pld [r0, #16]
-
- vext.8 q2, q0, q1, #1 //q2=src[-1]
- vext.8 q3, q0, q1, #2 //q3=src[0]
- vext.8 q4, q0, q1, #3 //q4=src[1]
- vext.8 q5, q0, q1, #4 //q5=src[2]
- vext.8 q6, q0, q1, #5 //q6=src[3]
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d4, d6, d8, d10, d12, d2, q14, q15
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d5, d7, d9, d11, d13, d3, q14, q15
-
- sub r4, #1
- vst1.u8 {d2, d3}, [r2], r3 //write 16Byte
-
- cmp r4, #0
- bne w16_xy_30_mc_luma_loop
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer30WidthEq8_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
-
-w8_xy_30_mc_luma_loop:
- vld1.u8 {d0,d1}, [r0], r1 //only use 13(8+5); q0=src[-2]
- pld [r0]
-
- vext.8 d2, d0, d1, #1 //d2=src[-1]
- vext.8 d3, d0, d1, #2 //d3=src[0]
- vext.8 d4, d0, d1, #3 //d4=src[1]
- vext.8 d5, d0, d1, #4 //d5=src[2]
- vext.8 d6, d0, d1, #5 //d6=src[3]
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d3, d4, d5, d6, d1, q14, q15
-
- sub r4, #1
- vst1.u8 {d1}, [r2], r3
-
- cmp r4, #0
- bne w8_xy_30_mc_luma_loop
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer30WidthEq4_neon
- push {r4, r5, r6}
- ldr r6, [sp, #12]
-
- sub r0, #2
- vmov.u16 q14, #0x0014 // 20
- vshr.u16 q15, q14, #2 // 5
-
-w4_xy_30_mc_luma_loop:
- vld1.u8 {d0, d1}, [r0], r1 //only use 9(4+5);d0: 1st row src[-2:5]
- pld [r0]
- vld1.u8 {d2, d3}, [r0], r1 //d2: 2nd row src[-2:5]
- pld [r0]
-
- vext.8 d4, d0, d1, #1 //d4: 1st row src[-1:6]
- vext.8 d5, d2, d3, #1 //d5: 2nd row src[-1:6]
- vext.8 q3, q2, q2, #1 //src[0:6 *]
- vext.8 q4, q2, q2, #2 //src[1:6 * *]
-
- vtrn.32 q3, q4 //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
- vtrn.32 d6, d7 //d6:[0:3]; d7[1:4]
- vtrn.32 d0, d2 //d0:[-2:1]; d2[2:5]
- vtrn.32 d4, d5 //d4:[-1:2]; d5[3:6]
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d4, d6, d7, d2, d5, d1, q14, q15
-
- vmov r4, r5, d1
- str r4, [r2], r3
- str r5, [r2], r3
-
- sub r6, #2
- cmp r6, #0
- bne w4_xy_30_mc_luma_loop
-
- pop {r4, r5, r6}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer01WidthEq16_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {q0}, [r0], r1 //q0=src[-2]
- vld1.u8 {q1}, [r0], r1 //q1=src[-1]
-
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- vld1.u8 {q2}, [r0], r1 //q2=src[0]
- vld1.u8 {q3}, [r0], r1 //q3=src[1]
- vld1.u8 {q4}, [r0], r1 //q4=src[2]
-
-w16_xy_01_luma_loop:
-
- vld1.u8 {q5}, [r0], r1 //q5=src[3]
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d4, d6, d8, d10, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d3, d5, d7, d9, d11, d13, q14, q15
- vld1.u8 {q0}, [r0], r1 //read 2nd row
- vst1.u8 {q6}, [r2], r3 //write 1st 16Byte
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d4, d6, d8, d10, d0, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d5, d7, d9, d11, d1, d13, q14, q15
- vld1.u8 {q1}, [r0], r1 //read 3rd row
- vst1.u8 {q6}, [r2], r3 //write 2nd 16Byte
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d4, d6, d8, d10, d0, d2, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d5, d7, d9, d11, d1, d3, d13, q14, q15
- vld1.u8 {q2}, [r0], r1 //read 4th row
- vst1.u8 {q6}, [r2], r3 //write 3rd 16Byte
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d6, d8, d10, d0, d2, d4, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d7, d9, d11, d1, d3, d5, d13, q14, q15
- vld1.u8 {q3}, [r0], r1 //read 5th row
- vst1.u8 {q6}, [r2], r3 //write 4th 16Byte
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d8, d10, d0, d2, d4, d6, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d9, d11, d1, d3, d5, d7, d13, q14, q15
- vld1.u8 {q4}, [r0], r1 //read 6th row
- vst1.u8 {q6}, [r2], r3 //write 5th 16Byte
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d10, d0, d2, d4, d6, d8, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d11, d1, d3, d5, d7, d9, d13, q14, q15
- vld1.u8 {q5}, [r0], r1 //read 7th row
- vst1.u8 {q6}, [r2], r3 //write 6th 16Byte
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d2, d4, d6, d8, d10, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d3, d5, d7, d9, d11, d13, q14, q15
- vld1.u8 {q0}, [r0], r1 //read 8th row
- vst1.u8 {q6}, [r2], r3 //write 7th 16Byte
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d4, d6, d8, d10, d0, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d5, d7, d9, d11, d1, d13, q14, q15
- vst1.u8 {q6}, [r2], r3 //write 8th 16Byte
-
- //q2, q3, q4, q5, q0 --> q0~q4
- vswp q0, q4
- vswp q0, q2
- vmov q1, q3
- vmov q3, q5 //q0~q4
-
- sub r4, #8
- cmp r4, #0
- bne w16_xy_01_luma_loop
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer01WidthEq8_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {d0}, [r0], r1 //d0=src[-2]
- vld1.u8 {d1}, [r0], r1 //d1=src[-1]
-
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- vld1.u8 {d2}, [r0], r1 //d2=src[0]
- vld1.u8 {d3}, [r0], r1 //d3=src[1]
-
- vld1.u8 {d4}, [r0], r1 //d4=src[2]
- vld1.u8 {d5}, [r0], r1 //d5=src[3]
-
-w8_xy_01_mc_luma_loop:
-
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d1, d2, d3, d4, d5, d12, q14, q15
- vld1.u8 {d0}, [r0], r1 //read 2nd row
- vst1.u8 {d12}, [r2], r3 //write 1st 8Byte
-
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d1, d2, d3, d4, d5, d0, d12, q14, q15
- vld1.u8 {d1}, [r0], r1 //read 3rd row
- vst1.u8 {d12}, [r2], r3 //write 2nd 8Byte
-
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d3, d4, d5, d0, d1, d12, q14, q15
- vld1.u8 {d2}, [r0], r1 //read 4th row
- vst1.u8 {d12}, [r2], r3 //write 3rd 8Byte
-
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d3, d4, d5, d0, d1, d2, d12, q14, q15
- vld1.u8 {d3}, [r0], r1 //read 5th row
- vst1.u8 {d12}, [r2], r3 //write 4th 8Byte
-
- //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
- vswp q0, q2
- vswp q1, q2
-
- sub r4, #4
- cmp r4, #0
- bne w8_xy_01_mc_luma_loop
-
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer01WidthEq4_neon
- push {r4, r5, r6, r7}
- sub r0, r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- ldr r4, [r0], r1 //r4=src[-2]
- ldr r5, [r0], r1 //r5=src[-1]
-
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- ldr r6, [r0], r1 //r6=src[0]
- ldr r7, [r0], r1 //r7=src[1]
-
- vmov d0, r4, r5
- vmov d1, r5, r6
- vmov d2, r6, r7
-
- ldr r4, [r0], r1 //r4=src[2]
- vmov d3, r7, r4
- ldr r7, [sp, #16]
-
-w4_xy_01_mc_luma_loop:
-
-// pld [r0]
- //using reserving r4
- ldr r5, [r0], r1 //r5=src[3]
- ldr r6, [r0], r1 //r6=src[0]
- vmov d4, r4, r5
- vmov d5, r5, r6 //reserved r6
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d0, d1, d2, d3, d4, d5, d12, q14, q15
- vmov r4, r5, d12
- str r4, [r2], r3 //write 1st 4Byte
- str r5, [r2], r3 //write 2nd 4Byte
-
- ldr r5, [r0], r1 //r5=src[1]
- ldr r4, [r0], r1 //r4=src[2]
- vmov d0, r6, r5
- vmov d1, r5, r4 //reserved r4
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_0 d2, d3, d4, d5, d0, d1, d12, q14, q15
- vmov r5, r6, d12
- str r5, [r2], r3 //write 3rd 4Byte
- str r6, [r2], r3 //write 4th 4Byte
-
- //d4, d5, d0, d1 --> d0, d1, d2, d3
- vmov q1, q0
- vmov q0, q2
-
- sub r7, #4
- cmp r7, #0
- bne w4_xy_01_mc_luma_loop
-
- pop {r4, r5, r6, r7}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer03WidthEq16_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {q0}, [r0], r1 //q0=src[-2]
- vld1.u8 {q1}, [r0], r1 //q1=src[-1]
-
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- vld1.u8 {q2}, [r0], r1 //q2=src[0]
- vld1.u8 {q3}, [r0], r1 //q3=src[1]
- vld1.u8 {q4}, [r0], r1 //q4=src[2]
-
-w16_xy_03_luma_loop:
-
- vld1.u8 {q5}, [r0], r1 //q5=src[3]
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d4, d6, d8, d10, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d3, d5, d7, d9, d11, d13, q14, q15
- vld1.u8 {q0}, [r0], r1 //read 2nd row
- vst1.u8 {q6}, [r2], r3 //write 1st 16Byte
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d4, d6, d8, d10, d0, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d5, d7, d9, d11, d1, d13, q14, q15
- vld1.u8 {q1}, [r0], r1 //read 3rd row
- vst1.u8 {q6}, [r2], r3 //write 2nd 16Byte
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d4, d6, d8, d10, d0, d2, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d5, d7, d9, d11, d1, d3, d13, q14, q15
- vld1.u8 {q2}, [r0], r1 //read 4th row
- vst1.u8 {q6}, [r2], r3 //write 3rd 16Byte
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d6, d8, d10, d0, d2, d4, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d7, d9, d11, d1, d3, d5, d13, q14, q15
- vld1.u8 {q3}, [r0], r1 //read 5th row
- vst1.u8 {q6}, [r2], r3 //write 4th 16Byte
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d8, d10, d0, d2, d4, d6, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d9, d11, d1, d3, d5, d7, d13, q14, q15
- vld1.u8 {q4}, [r0], r1 //read 6th row
- vst1.u8 {q6}, [r2], r3 //write 5th 16Byte
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d10, d0, d2, d4, d6, d8, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d11, d1, d3, d5, d7, d9, d13, q14, q15
- vld1.u8 {q5}, [r0], r1 //read 7th row
- vst1.u8 {q6}, [r2], r3 //write 6th 16Byte
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d2, d4, d6, d8, d10, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d3, d5, d7, d9, d11, d13, q14, q15
- vld1.u8 {q0}, [r0], r1 //read 8th row
- vst1.u8 {q6}, [r2], r3 //write 7th 16Byte
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d4, d6, d8, d10, d0, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d5, d7, d9, d11, d1, d13, q14, q15
- vst1.u8 {q6}, [r2], r3 //write 8th 16Byte
-
- //q2, q3, q4, q5, q0 --> q0~q4
- vswp q0, q4
- vswp q0, q2
- vmov q1, q3
- vmov q3, q5 //q0~q4
-
- sub r4, #8
- cmp r4, #0
- bne w16_xy_03_luma_loop
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer03WidthEq8_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {d0}, [r0], r1 //d0=src[-2]
- vld1.u8 {d1}, [r0], r1 //d1=src[-1]
-
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- vld1.u8 {d2}, [r0], r1 //d2=src[0]
- vld1.u8 {d3}, [r0], r1 //d3=src[1]
-
- vld1.u8 {d4}, [r0], r1 //d4=src[2]
- vld1.u8 {d5}, [r0], r1 //d5=src[3]
-
-w8_xy_03_mc_luma_loop:
-
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d1, d2, d3, d4, d5, d12, q14, q15
- vld1.u8 {d0}, [r0], r1 //read 2nd row
- vst1.u8 {d12}, [r2], r3 //write 1st 8Byte
-
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d1, d2, d3, d4, d5, d0, d12, q14, q15
- vld1.u8 {d1}, [r0], r1 //read 3rd row
- vst1.u8 {d12}, [r2], r3 //write 2nd 8Byte
-
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d3, d4, d5, d0, d1, d12, q14, q15
- vld1.u8 {d2}, [r0], r1 //read 4th row
- vst1.u8 {d12}, [r2], r3 //write 3rd 8Byte
-
- pld [r0]
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d3, d4, d5, d0, d1, d2, d12, q14, q15
- vld1.u8 {d3}, [r0], r1 //read 5th row
- vst1.u8 {d12}, [r2], r3 //write 4th 8Byte
-
- //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
- vswp q0, q2
- vswp q1, q2
-
- sub r4, #4
- cmp r4, #0
- bne w8_xy_03_mc_luma_loop
-
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer03WidthEq4_neon
- push {r4, r5, r6, r7}
- sub r0, r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- ldr r4, [r0], r1 //r4=src[-2]
- ldr r5, [r0], r1 //r5=src[-1]
-
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- ldr r6, [r0], r1 //r6=src[0]
- ldr r7, [r0], r1 //r7=src[1]
-
- vmov d0, r4, r5
- vmov d1, r5, r6
- vmov d2, r6, r7
-
- ldr r4, [r0], r1 //r4=src[2]
- vmov d3, r7, r4
- ldr r7, [sp, #16]
-
-w4_xy_03_mc_luma_loop:
-
-// pld [r0]
- //using reserving r4
- ldr r5, [r0], r1 //r5=src[3]
- ldr r6, [r0], r1 //r6=src[0]
- vmov d4, r4, r5
- vmov d5, r5, r6 //reserved r6
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d0, d1, d2, d3, d4, d5, d12, q14, q15
- vmov r4, r5, d12
- str r4, [r2], r3 //write 1st 4Byte
- str r5, [r2], r3 //write 2nd 4Byte
-
- ldr r5, [r0], r1 //r5=src[1]
- ldr r4, [r0], r1 //r4=src[2]
- vmov d0, r6, r5
- vmov d1, r5, r4 //reserved r4
-
- FILTER_6TAG_8BITS_AVERAGE_WITH_1 d2, d3, d4, d5, d0, d1, d12, q14, q15
- vmov r5, r6, d12
- str r5, [r2], r3 //write 3rd 4Byte
- str r6, [r2], r3 //write 4th 4Byte
-
- //d4, d5, d0, d1 --> d0, d1, d2, d3
- vmov q1, q0
- vmov q0, q2
-
- sub r7, #4
- cmp r7, #0
- bne w4_xy_03_mc_luma_loop
-
- pop {r4, r5, r6, r7}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer02WidthEq16_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {q0}, [r0], r1 //q0=src[-2]
- vld1.u8 {q1}, [r0], r1 //q1=src[-1]
-
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- vld1.u8 {q2}, [r0], r1 //q2=src[0]
- vld1.u8 {q3}, [r0], r1 //q3=src[1]
- vld1.u8 {q4}, [r0], r1 //q4=src[2]
-
-w16_v_mc_luma_loop:
-
- vld1.u8 {q5}, [r0], r1 //q5=src[3]
-
- FILTER_6TAG_8BITS d0, d2, d4, d6, d8, d10, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d1, d3, d5, d7, d9, d11, d13, q14, q15
- vld1.u8 {q0}, [r0], r1 //read 2nd row
- vst1.u8 {q6}, [r2], r3 //write 1st 16Byte
-
- FILTER_6TAG_8BITS d2, d4, d6, d8, d10, d0, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d3, d5, d7, d9, d11, d1, d13, q14, q15
- vld1.u8 {q1}, [r0], r1 //read 3rd row
- vst1.u8 {q6}, [r2], r3 //write 2nd 16Byte
-
- FILTER_6TAG_8BITS d4, d6, d8, d10, d0, d2, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d5, d7, d9, d11, d1, d3, d13, q14, q15
- vld1.u8 {q2}, [r0], r1 //read 4th row
- vst1.u8 {q6}, [r2], r3 //write 3rd 16Byte
-
- FILTER_6TAG_8BITS d6, d8, d10, d0, d2, d4, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d7, d9, d11, d1, d3, d5, d13, q14, q15
- vld1.u8 {q3}, [r0], r1 //read 5th row
- vst1.u8 {q6}, [r2], r3 //write 4th 16Byte
-
- FILTER_6TAG_8BITS d8, d10, d0, d2, d4, d6, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d9, d11, d1, d3, d5, d7, d13, q14, q15
- vld1.u8 {q4}, [r0], r1 //read 6th row
- vst1.u8 {q6}, [r2], r3 //write 5th 16Byte
-
- FILTER_6TAG_8BITS d10, d0, d2, d4, d6, d8, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d11, d1, d3, d5, d7, d9, d13, q14, q15
- vld1.u8 {q5}, [r0], r1 //read 7th row
- vst1.u8 {q6}, [r2], r3 //write 6th 16Byte
-
- FILTER_6TAG_8BITS d0, d2, d4, d6, d8, d10, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d1, d3, d5, d7, d9, d11, d13, q14, q15
- vld1.u8 {q0}, [r0], r1 //read 8th row
- vst1.u8 {q6}, [r2], r3 //write 7th 16Byte
-
- FILTER_6TAG_8BITS d2, d4, d6, d8, d10, d0, d12, q14, q15
- pld [r0]
- FILTER_6TAG_8BITS d3, d5, d7, d9, d11, d1, d13, q14, q15
- vst1.u8 {q6}, [r2], r3 //write 8th 16Byte
-
- //q2, q3, q4, q5, q0 --> q0~q4
- vswp q0, q4
- vswp q0, q2
- vmov q1, q3
- vmov q3, q5 //q0~q4
-
- sub r4, #8
- cmp r4, #0
- bne w16_v_mc_luma_loop
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer02WidthEq8_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {d0}, [r0], r1 //d0=src[-2]
- vld1.u8 {d1}, [r0], r1 //d1=src[-1]
-
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- vld1.u8 {d2}, [r0], r1 //d2=src[0]
- vld1.u8 {d3}, [r0], r1 //d3=src[1]
-
- vld1.u8 {d4}, [r0], r1 //d4=src[2]
- vld1.u8 {d5}, [r0], r1 //d5=src[3]
-
-w8_v_mc_luma_loop:
-
- pld [r0]
- FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d12, q14, q15
- vld1.u8 {d0}, [r0], r1 //read 2nd row
- vst1.u8 {d12}, [r2], r3 //write 1st 8Byte
-
- pld [r0]
- FILTER_6TAG_8BITS d1, d2, d3, d4, d5, d0, d12, q14, q15
- vld1.u8 {d1}, [r0], r1 //read 3rd row
- vst1.u8 {d12}, [r2], r3 //write 2nd 8Byte
-
- pld [r0]
- FILTER_6TAG_8BITS d2, d3, d4, d5, d0, d1, d12, q14, q15
- vld1.u8 {d2}, [r0], r1 //read 4th row
- vst1.u8 {d12}, [r2], r3 //write 3rd 8Byte
-
- pld [r0]
- FILTER_6TAG_8BITS d3, d4, d5, d0, d1, d2, d12, q14, q15
- vld1.u8 {d3}, [r0], r1 //read 5th row
- vst1.u8 {d12}, [r2], r3 //write 4th 8Byte
-
- //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
- vswp q0, q2
- vswp q1, q2
-
- sub r4, #4
- cmp r4, #0
- bne w8_v_mc_luma_loop
-
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer02WidthEq4_neon
- push {r4, r5, r6, r7}
- sub r0, r0, r1, lsl #1 //src[-2*src_stride]
- pld [r0]
- pld [r0, r1]
- vmov.u16 q14, #0x0014 // 20
- ldr r4, [r0], r1 //r4=src[-2]
- ldr r5, [r0], r1 //r5=src[-1]
-
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
- ldr r6, [r0], r1 //r6=src[0]
- ldr r7, [r0], r1 //r7=src[1]
-
- vmov d0, r4, r5
- vmov d1, r5, r6
- vmov d2, r6, r7
-
- ldr r4, [r0], r1 //r4=src[2]
- vmov d3, r7, r4
- ldr r7, [sp, #16]
-
-w4_v_mc_luma_loop:
-
-// pld [r0]
- //using reserving r4
- ldr r5, [r0], r1 //r5=src[3]
- ldr r6, [r0], r1 //r6=src[0]
- vmov d4, r4, r5
- vmov d5, r5, r6 //reserved r6
-
- FILTER_6TAG_8BITS d0, d1, d2, d3, d4, d5, d12, q14, q15
- vmov r4, r5, d12
- str r4, [r2], r3 //write 1st 4Byte
- str r5, [r2], r3 //write 2nd 4Byte
-
- ldr r5, [r0], r1 //r5=src[1]
- ldr r4, [r0], r1 //r4=src[2]
- vmov d0, r6, r5
- vmov d1, r5, r4 //reserved r4
-
- FILTER_6TAG_8BITS d2, d3, d4, d5, d0, d1, d12, q14, q15
- vmov r5, r6, d12
- str r5, [r2], r3 //write 3rd 4Byte
- str r6, [r2], r3 //write 4th 4Byte
-
- //d4, d5, d0, d1 --> d0, d1, d2, d3
- vmov q1, q0
- vmov q0, q2
-
- sub r7, #4
- cmp r7, #0
- bne w4_v_mc_luma_loop
-
- pop {r4, r5, r6, r7}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer22WidthEq16_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, #2 //src[-2]
- sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
- pld [r0]
- pld [r0, r1]
-
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {d0-d2}, [r0], r1 //use 21(16+5), =src[-2]
- vld1.u8 {d3-d5}, [r0], r1 //use 21(16+5), =src[-1]
-
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
-
- vld1.u8 {d6-d8}, [r0], r1 //use 21(16+5), =src[0]
- vld1.u8 {d9-d11}, [r0], r1 //use 21(16+5), =src[1]
- pld [r0]
- pld [r0, r1]
- vld1.u8 {d12-d14}, [r0], r1 //use 21(16+5), =src[2]
-
-w16_hv_mc_luma_loop:
-
- vld1.u8 {d15-d17}, [r0], r1 //use 21(16+5), =src[3]
- //the 1st row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d0, d3, d6, d9, d12, d15, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d1, d4, d7,d10, d13, d16,q10, q14, q15 // 8 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0 //output to q0[0]
-
- // vertical filtered into q10/q11
- FILTER_6TAG_8BITS_TO_16BITS d2, d5, d8,d11, d14, d17,q11, q14, q15 // only 5 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1 //output to q0[1]
- vst1.u8 {q0}, [r2], r3 //write 16Byte
-
-
- vld1.u8 {d0-d2}, [r0], r1 //read 2nd row
- //the 2nd row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d3, d6, d9, d12, d15, d0, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d4, d7,d10, d13, d16, d1,q10, q14, q15 // 8 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3 //output to d3
-
- // vertical filtered into q10/q11
- FILTER_6TAG_8BITS_TO_16BITS d5, d8,d11, d14, d17, d2,q11, q14, q15 // only 5 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4 //output to d4
-
- vst1.u8 {d3, d4}, [r2], r3 //write 16Byte
-
- vld1.u8 {d3-d5}, [r0], r1 //read 3rd row
- //the 3rd row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d6, d9, d12, d15, d0, d3, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d7,d10, d13, d16, d1, d4,q10, q14, q15 // 8 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6 //output to d6
-
- // vertical filtered into q10/q11
- FILTER_6TAG_8BITS_TO_16BITS d8,d11, d14, d17, d2, d5,q11, q14, q15 // only 5 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7 //output to d7
- vst1.u8 {d6, d7}, [r2], r3 //write 16Byte
-
- vld1.u8 {d6-d8}, [r0], r1 //read 4th row
- //the 4th row
- pld [r0]
- // vertical filtered into q9/q10
- FILTER_6TAG_8BITS_TO_16BITS d9, d12, d15, d0, d3, d6, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d10, d13, d16, d1, d4, d7,q10, q14, q15 // 8 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q9, q10, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9 //output to d9
- // vertical filtered into q10/q11
- FILTER_6TAG_8BITS_TO_16BITS d11, d14, d17, d2, d5, d8,q11, q14, q15 // only 5 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q10, q11, q9, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10 //output to d10
- vst1.u8 {d9, d10}, [r2], r3 //write 16Byte
-
- //d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14
- vswp q0, q6
- vswp q6, q3
- vmov q5, q2
- vmov q2, q8
-
- vmov d20,d8
- vmov q4, q1
- vmov q1, q7
- vmov d14,d20
-
- sub r4, #4
- cmp r4, #0
- bne w16_hv_mc_luma_loop
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer22WidthEq8_neon
- push {r4}
- ldr r4, [sp, #4]
-
- sub r0, #2 //src[-2]
- sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
- pld [r0]
- pld [r0, r1]
-
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {q0}, [r0], r1 //use 13(8+5), =src[-2]
- vld1.u8 {q1}, [r0], r1 //use 13(8+5), =src[-1]
-
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
-
- vld1.u8 {q2}, [r0], r1 //use 13(8+5), =src[0]
- vld1.u8 {q3}, [r0], r1 //use 13(8+5), =src[1]
- pld [r0]
- pld [r0, r1]
- vld1.u8 {q4}, [r0], r1 //use 13(8+5), =src[2]
-
-w8_hv_mc_luma_loop:
-
- vld1.u8 {q5}, [r0], r1 //use 13(8+5), =src[3]
- //the 1st row
- pld [r0]
- // vertical filtered into q6/q7
- FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d10, q6, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d11, q7, q14, q15 // 5 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0]
- vst1.u8 d12, [r2], r3 //write 8Byte
-
- vld1.u8 {q0}, [r0], r1 //read 2nd row
- //the 2nd row
- pld [r0]
- // vertical filtered into q6/q7
- FILTER_6TAG_8BITS_TO_16BITS d2, d4, d6, d8, d10, d0, q6, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d3, d5, d7, d9, d11, d1, q7, q14, q15 // 5 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0]
- vst1.u8 d12, [r2], r3 //write 8Byte
-
- vld1.u8 {q1}, [r0], r1 //read 3rd row
- //the 3rd row
- pld [r0]
- // vertical filtered into q6/q7
- FILTER_6TAG_8BITS_TO_16BITS d4, d6, d8, d10, d0, d2, q6, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d5, d7, d9, d11, d1, d3, q7, q14, q15 // 5 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0]
- vst1.u8 d12, [r2], r3 //write 8Byte
-
- vld1.u8 {q2}, [r0], r1 //read 4th row
- //the 4th row
- pld [r0]
- // vertical filtered into q6/q7
- FILTER_6TAG_8BITS_TO_16BITS d6, d8, d10, d0, d2, d4, q6, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d7, d9, d11, d1, d3, d5, q7, q14, q15 // 5 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q6, q7, q11, q12, q13
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d12 //output to q6[0]
- vst1.u8 d12, [r2], r3 //write 8Byte
-
- //q4~q5, q0~q2, --> q0~q4
- vswp q0, q4
- vswp q2, q4
- vmov q3, q1
- vmov q1, q5
-
- sub r4, #4
- cmp r4, #0
- bne w8_hv_mc_luma_loop
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer22WidthEq4_neon
- push {r4 ,r5, r6}
- ldr r6, [sp, #12]
-
- sub r0, #2 //src[-2]
- sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
- pld [r0]
- pld [r0, r1]
-
- vmov.u16 q14, #0x0014 // 20
- vld1.u8 {q0}, [r0], r1 //use 9(4+5), =src[-2]
- vld1.u8 {q1}, [r0], r1 //use 9(4+5), =src[-1]
-
- pld [r0]
- pld [r0, r1]
- vshr.u16 q15, q14, #2 // 5
-
- vld1.u8 {q2}, [r0], r1 //use 9(4+5), =src[0]
- vld1.u8 {q3}, [r0], r1 //use 9(4+5), =src[1]
- pld [r0]
- pld [r0, r1]
- vld1.u8 {q4}, [r0], r1 //use 9(4+5), =src[2]
-
-w4_hv_mc_luma_loop:
-
- vld1.u8 {q5}, [r0], r1 //use 9(4+5), =src[3]
- vld1.u8 {q6}, [r0], r1 //use 9(4+5), =src[4]
-
- //the 1st&2nd row
- pld [r0]
- pld [r0, r1]
- // vertical filtered
- FILTER_6TAG_8BITS_TO_16BITS d0, d2, d4, d6, d8, d10, q7, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d1, d3, d5, d7, d9, d11, q8, q14, q15 // 1 avail
-
- FILTER_6TAG_8BITS_TO_16BITS d2, d4, d6, d8,d10, d12, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d3, d5, d7, d9,d11, d13,q10, q14, q15 // 1 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q7, q8, q11, q12, q13 //4 avail
- UNPACK_2_16BITS_TO_ABC q9,q10, q0, q7, q8 //4 avail
-
- vmov d23, d0
- vmov d25, d14
- vmov d27, d16
-
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22 //output to q11[0]
- vmov r4 ,r5, d22
- str r4, [r2], r3 //write 4Byte
- str r5, [r2], r3 //write 4Byte
-
- //the 3rd&4th row
- vld1.u8 {q0}, [r0], r1 //use 9(4+5), =src[3]
- vld1.u8 {q1}, [r0], r1 //use 9(4+5), =src[4]
- pld [r0]
- pld [r0, r1]
- // vertical filtered
- FILTER_6TAG_8BITS_TO_16BITS d4, d6, d8, d10, d12, d0, q7, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d5, d7, d9, d11, d13, d1, q8, q14, q15 // 1 avail
-
- FILTER_6TAG_8BITS_TO_16BITS d6, d8,d10, d12, d0, d2, q9, q14, q15 // 8 avail
- FILTER_6TAG_8BITS_TO_16BITS d7, d9,d11, d13, d1, d3,q10, q14, q15 // 1 avail
- // horizon filtered
- UNPACK_2_16BITS_TO_ABC q7, q8, q11, q12, q13 //4 avail
- UNPACK_2_16BITS_TO_ABC q9,q10, q2, q7, q8 //4 avail
-
- vmov d23, d4
- vmov d25, d14
- vmov d27, d16
-
- FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22 //output to q11[0]
- vmov r4 ,r5, d22
- str r4, [r2], r3 //write 4Byte
- str r5, [r2], r3 //write 4Byte
-
- //q4~q6, q0~q1, --> q0~q4
- vswp q4, q0
- vmov q3, q4
- vmov q4, q1
- vmov q1, q5
- vmov q2, q6
-
- sub r6, #4
- cmp r6, #0
- bne w4_hv_mc_luma_loop
-
- pop {r4, r5, r6}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McCopyWidthEq16_neon
- push {r4}
- ldr r4, [sp, #4]
-w16_copy_loop:
- vld1.u8 {q0}, [r0], r1
- sub r4, #2
- vld1.u8 {q1}, [r0], r1
- vst1.u8 {q0}, [r2], r3
- cmp r4, #0
- vst1.u8 {q1}, [r2], r3
- bne w16_copy_loop
-
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McCopyWidthEq8_neon
- push {r4}
- ldr r4, [sp, #4]
-w8_copy_loop:
- vld1.u8 {d0}, [r0], r1
- vld1.u8 {d1}, [r0], r1
- vst1.u8 {d0}, [r2], r3
- vst1.u8 {d1}, [r2], r3
- sub r4, #2
- cmp r4, #0
- bne w8_copy_loop
-
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McCopyWidthEq4_neon
- push {r4, r5, r6}
- ldr r4, [sp, #12]
-w4_copy_loop:
- ldr r5, [r0], r1
- ldr r6, [r0], r1
- str r5, [r2], r3
- str r6, [r2], r3
-
- sub r4, #2
- cmp r4, #0
- bne w4_copy_loop
-
- pop {r4, r5, r6}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN PixelAvgWidthEq16_neon
- push {r4}
- ldr r4, [sp, #4]
-w16_pix_avg_loop:
- vld1.u8 {q0}, [r2]!
- vld1.u8 {q1}, [r3]!
- vld1.u8 {q2}, [r2]!
- vld1.u8 {q3}, [r3]!
-
- vld1.u8 {q4}, [r2]!
- vld1.u8 {q5}, [r3]!
- vld1.u8 {q6}, [r2]!
- vld1.u8 {q7}, [r3]!
-
- AVERAGE_TWO_8BITS d0, d0, d2
- AVERAGE_TWO_8BITS d1, d1, d3
- vst1.u8 {q0}, [r0], r1
-
- AVERAGE_TWO_8BITS d4, d4, d6
- AVERAGE_TWO_8BITS d5, d5, d7
- vst1.u8 {q2}, [r0], r1
-
- AVERAGE_TWO_8BITS d8, d8, d10
- AVERAGE_TWO_8BITS d9, d9, d11
- vst1.u8 {q4}, [r0], r1
-
- AVERAGE_TWO_8BITS d12, d12, d14
- AVERAGE_TWO_8BITS d13, d13, d15
- vst1.u8 {q6}, [r0], r1
-
- sub r4, #4
- cmp r4, #0
- bne w16_pix_avg_loop
-
- pop {r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN PixelAvgWidthEq8_neon
- push {r4, r5}
- ldr r4, [sp, #8]
- mov r5, #16
-w8_pix_avg_loop:
-
- vld1.u8 {d0}, [r2], r5
- vld1.u8 {d2}, [r3], r5
- vld1.u8 {d1}, [r2], r5
- vld1.u8 {d3}, [r3], r5
-
- AVERAGE_TWO_8BITS d0, d0, d2
- AVERAGE_TWO_8BITS d1, d1, d3
- vst1.u8 {d0}, [r0], r1
- vst1.u8 {d1}, [r0], r1
-
- vld1.u8 {d4}, [r2], r5
- vld1.u8 {d6}, [r3], r5
- vld1.u8 {d5}, [r2], r5
- vld1.u8 {d7}, [r3], r5
-
- AVERAGE_TWO_8BITS d4, d4, d6
- AVERAGE_TWO_8BITS d5, d5, d7
- vst1.u8 {d4}, [r0], r1
- vst1.u8 {d5}, [r0], r1
-
- sub r4, #4
- cmp r4, #0
- bne w8_pix_avg_loop
-
- pop {r4, r5}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN PixelAvgWidthEq4_neon
- push {r4-r8}
- ldr r4, [sp, #20]
-w4_pix_avg_loop:
-
- ldr r5, [r2]
- ldr r6, [r2, #16]
- ldr r7, [r3]
- ldr r8, [r3, #16]
- add r2, #32
- add r3, #32
-
- vmov d0, r5, r6
- vmov d1, r7, r8
- AVERAGE_TWO_8BITS d0, d0, d1
- vmov r5, r6, d0
-
- str r5, [r0], r1
- str r6, [r0], r1
-
- sub r4, #2
- cmp r4, #0
- bne w4_pix_avg_loop
-
- pop {r4-r8}
-WELS_ASM_FUNC_END
-
-WELS_ASM_FUNC_BEGIN McChromaWidthEq8_neon
- push {r4, r5}
- ldr r4, [sp, #8]
- ldr r5, [sp, #12]
-// normal case: {cA*src[x] + cB*src[x+1]} + {cC*src[x+stride] + cD*srcp[x+stride+1]}
-// we can opti it by adding vert only/ hori only cases, to be continue
- vld1.u8 {d31}, [r4] //load A/B/C/D
- vld1.u8 {q0}, [r0], r1 //src[x]
-
- vdup.u8 d28, d31[0] //A
- vdup.u8 d29, d31[1] //B
- vdup.u8 d30, d31[2] //C
- vdup.u8 d31, d31[3] //D
-
- vext.u8 d1, d0, d1, #1 //src[x+1]
-
-w8_mc_chroma_loop: // each two pxl row
- vld1.u8 {q1}, [r0], r1 //src[x+stride]
- vld1.u8 {q2}, [r0], r1 //src[x+2*stride]
- vext.u8 d3, d2, d3, #1 //src[x+stride+1]
- vext.u8 d5, d4, d5, #1 //src[x+2*stride+1]
-
- vmull.u8 q3, d0, d28 //(src[x] * A)
- vmlal.u8 q3, d1, d29 //+=(src[x+1] * B)
- vmlal.u8 q3, d2, d30 //+=(src[x+stride] * C)
- vmlal.u8 q3, d3, d31 //+=(src[x+stride+1] * D)
- vrshrn.u16 d6, q3, #6
- vst1.u8 d6, [r2], r3
-
- vmull.u8 q3, d2, d28 //(src[x] * A)
- vmlal.u8 q3, d3, d29 //+=(src[x+1] * B)
- vmlal.u8 q3, d4, d30 //+=(src[x+stride] * C)
- vmlal.u8 q3, d5, d31 //+=(src[x+stride+1] * D)
- vrshrn.u16 d6, q3, #6
- vst1.u8 d6, [r2], r3
-
- vmov q0, q2
- sub r5, #2
- cmp r5, #0
- bne w8_mc_chroma_loop
-
- pop {r4, r5}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McChromaWidthEq4_neon
-
- push {r4, r5, r6}
- ldr r4, [sp, #12]
- ldr r6, [sp, #16]
-// normal case: {cA*src[x] + cB*src[x+1]} + {cC*src[x+stride] + cD*srcp[x+stride+1]}
-// we can opti it by adding vert only/ hori only cases, to be continue
- vld1.u8 {d31}, [r4] //load A/B/C/D
-
- vdup.u8 d28, d31[0] //A
- vdup.u8 d29, d31[1] //B
- vdup.u8 d30, d31[2] //C
- vdup.u8 d31, d31[3] //D
-
-w4_mc_chroma_loop: // each two pxl row
- vld1.u8 {d0}, [r0], r1 //a::src[x]
- vld1.u8 {d2}, [r0], r1 //b::src[x+stride]
- vld1.u8 {d4}, [r0] //c::src[x+2*stride]
-
- vshr.u64 d1, d0, #8
- vshr.u64 d3, d2, #8
- vshr.u64 d5, d4, #8
-
- vmov q3, q1 //b::[0:7]+b::[1~8]
- vtrn.32 q0, q1 //d0{a::[0:3]+b::[0:3]}; d1{a::[1:4]+b::[1:4]}
- vtrn.32 q3, q2 //d6{b::[0:3]+c::[0:3]}; d7{b::[1:4]+c::[1:4]}
-
- vmull.u8 q1, d0, d28 //(src[x] * A)
- vmlal.u8 q1, d1, d29 //+=(src[x+1] * B)
- vmlal.u8 q1, d6, d30 //+=(src[x+stride] * C)
- vmlal.u8 q1, d7, d31 //+=(src[x+stride+1] * D)
-
- vrshrn.u16 d2, q1, #6
- vmov r4, r5, d2
- str r4, [r2], r3
- str r5, [r2], r3
-
- sub r6, #2
- cmp r6, #0
- bne w4_mc_chroma_loop
-
- pop {r4, r5, r6}
-WELS_ASM_FUNC_END
-#endif
--- a/codec/encoder/core/src/mc.cpp
+++ b/codec/encoder/core/src/mc.cpp
@@ -480,6 +480,99 @@
}
#endif //X86_ASM
+
+ //***************************************************************************//
+ // NEON implementation //
+ //***************************************************************************//
+#if defined(HAVE_NEON)
+void McHorVer20Width9Or17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iWidth, int32_t iHeight) {
+ if (iWidth == 17)
+ McHorVer20Width17_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+ else //if (iWidth == 9)
+ McHorVer20Width9_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+}
+void McHorVer02Height9Or17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iWidth, int32_t iHeight){
+ if (iWidth == 16)
+ McHorVer02Height17_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+ else //if (iWidth == 8)
+ McHorVer02Height9_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+}
+void McHorVer22Width9Or17Height9Or17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iWidth, int32_t iHeight){
+ if (iWidth == 17)
+ McHorVer22Width17_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+ else //if (iWidth == 9)
+ McHorVer22Width9_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+}
+void EncMcHorVer11_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
+ ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
+ McHorVer20WidthEq16_neon(pSrc, iSrcStride, pTmp, 16, iHeight);
+ McHorVer02WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
+ PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
+}
+void EncMcHorVer12_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
+ ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
+ McHorVer02WidthEq16_neon(pSrc, iSrcStride, pTmp, 16, iHeight);
+ McHorVer22WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
+ PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
+}
+void EncMcHorVer13_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
+ ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
+ McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pTmp, 16, iHeight);
+ McHorVer02WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
+ PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
+}
+void EncMcHorVer21_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
+ ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
+ McHorVer20WidthEq16_neon(pSrc, iSrcStride, pTmp, 16, iHeight);
+ McHorVer22WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
+ PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
+}
+void EncMcHorVer23_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
+ ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
+ McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pTmp, 16, iHeight);
+ McHorVer22WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
+ PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
+}
+void EncMcHorVer31_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
+ ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
+ McHorVer20WidthEq16_neon(pSrc, iSrcStride, pTmp, 16, iHeight);
+ McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, &pTmp[256], 16, iHeight);
+ PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
+}
+void EncMcHorVer32_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
+ ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
+ McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, pTmp, 16, iHeight);
+ McHorVer22WidthEq16_neon(pSrc, iSrcStride, &pTmp[256], 16, iHeight);
+ PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
+}
+void EncMcHorVer33_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
+ ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
+ McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pTmp, 16, iHeight);
+ McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, &pTmp[256], 16, iHeight);
+ PixelAvgWidthEq16_neon(pDst, iDstStride, pTmp, &pTmp[256], iHeight);
+}
+void EncMcChroma_neon(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride,
+ SMVUnitXY sMv, int32_t iWidth, int32_t iHeight) {
+ const int32_t kiD8x = sMv.iMvX&0x07;
+ const int32_t kiD8y = sMv.iMvY&0x07;
+ if (0 == kiD8x && 0 == kiD8y) {
+ if(8 == iWidth)
+ McCopyWidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+ else // iWidth == 4
+ McCopyWidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+ }
+ else {
+ if(8 == iWidth)
+ McChromaWidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiD8y][kiD8x]), iHeight);
+ else //if(4 == iWidth)
+ McChromaWidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiD8y][kiD8x]), iHeight);
+ }
+}
+#endif
+
typedef void (*PixelAvgFunc) (uint8_t*, int32_t, const uint8_t*, int32_t, const uint8_t*, int32_t, int32_t);
void WelsInitMcFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
static PixelAvgFunc pfPixAvgFunc[2] = {PixelAvgWidthEq8_c, PixelAvgWidthEq16_c};
@@ -498,6 +591,14 @@
McHorVer03WidthEq16, McHorVer13WidthEq16, McHorVer23WidthEq16, McHorVer33WidthEq16
};
#endif
+#if defined(HAVE_NEON)
+ static PWelsLumaQuarpelMcFunc pWelsMcFuncWidthEq16_neon[16] = { //[x][y]
+ McCopyWidthEq16_neon, McHorVer10WidthEq16_neon, McHorVer20WidthEq16_neon, McHorVer30WidthEq16_neon,
+ McHorVer01WidthEq16_neon, EncMcHorVer11_neon, EncMcHorVer21_neon, EncMcHorVer31_neon,
+ McHorVer02WidthEq16_neon, EncMcHorVer12_neon, McHorVer22WidthEq16_neon, EncMcHorVer32_neon,
+ McHorVer03WidthEq16_neon, EncMcHorVer13_neon, EncMcHorVer23_neon, EncMcHorVer33_neon
+ };
+#endif
pFuncList->sMcFuncs.pfLumaHalfpelHor = McHorVer20_c;
pFuncList->sMcFuncs.pfLumaHalfpelVer = McHorVer02_c;
@@ -538,5 +639,17 @@
}
#endif //(X86_ASM)
+
+#if defined(HAVE_NEON)
+ if (uiCpuFlag & WELS_CPU_NEON) {
+ pFuncList->sMcFuncs.pfLumaQuarpelMc = pWelsMcFuncWidthEq16_neon;
+ pFuncList->sMcFuncs.pfChromaMc = EncMcChroma_neon;
+ pFuncList->sMcFuncs.pfSampleAveraging[0] = PixStrideAvgWidthEq8_neon;
+ pFuncList->sMcFuncs.pfSampleAveraging[1] = PixStrideAvgWidthEq16_neon;
+ pFuncList->sMcFuncs.pfLumaHalfpelHor = McHorVer20Width9Or17_neon;//iWidth+1:8/16
+ pFuncList->sMcFuncs.pfLumaHalfpelVer = McHorVer02Height9Or17_neon;//heigh+1:8/16
+ pFuncList->sMcFuncs.pfLumaHalfpelCen = McHorVer22Width9Or17Height9Or17_neon;//iWidth+1/heigh+1
+ }
+#endif
}
}