shithub: openh264

ref: 0e0c8b556937d2f9d9d8994f75156c2555210c5a
parent: eb889c95e0f1389af99ba4534f8cc19a4de0198b
author: dongzhang <[email protected]>
date: Wed Jun 4 07:04:25 EDT 2014

add arm64 deblocking code and unit test code

--- a/codec/build/iOS/common/common.xcodeproj/project.pbxproj
+++ b/codec/build/iOS/common/common.xcodeproj/project.pbxproj
@@ -22,6 +22,7 @@
 		F0B204F918FD23BF005DA23F /* copy_mb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F0B204F818FD23BF005DA23F /* copy_mb.cpp */; };
 		F556A8241906673900E156A8 /* arm_arch64_common_macro.S in Sources */ = {isa = PBXBuildFile; fileRef = F556A8221906673900E156A8 /* arm_arch64_common_macro.S */; };
 		F556A8251906673900E156A8 /* expand_picture_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = F556A8231906673900E156A8 /* expand_picture_aarch64_neon.S */; };
+		F5AC94FF193EB7D800F58154 /* deblocking_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = F5AC94FE193EB7D800F58154 /* deblocking_aarch64_neon.S */; };
 		F5B8D82D190757290037849A /* mc_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = F5B8D82C190757290037849A /* mc_aarch64_neon.S */; };
 		FAABAA1818E9354A00D4186F /* sad_common.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FAABAA1718E9354A00D4186F /* sad_common.cpp */; };
 /* End PBXBuildFile section */
@@ -69,6 +70,7 @@
 		F0B204F818FD23BF005DA23F /* copy_mb.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = copy_mb.cpp; sourceTree = "<group>"; };
 		F556A8221906673900E156A8 /* arm_arch64_common_macro.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = arm_arch64_common_macro.S; path = arm64/arm_arch64_common_macro.S; sourceTree = "<group>"; };
 		F556A8231906673900E156A8 /* expand_picture_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = expand_picture_aarch64_neon.S; path = arm64/expand_picture_aarch64_neon.S; sourceTree = "<group>"; };
+		F5AC94FE193EB7D800F58154 /* deblocking_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = deblocking_aarch64_neon.S; path = arm64/deblocking_aarch64_neon.S; sourceTree = "<group>"; };
 		F5B8D82C190757290037849A /* mc_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = mc_aarch64_neon.S; path = arm64/mc_aarch64_neon.S; sourceTree = "<group>"; };
 		FAABAA1618E9353F00D4186F /* sad_common.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sad_common.h; sourceTree = "<group>"; };
 		FAABAA1718E9354A00D4186F /* sad_common.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sad_common.cpp; sourceTree = "<group>"; };
@@ -175,6 +177,7 @@
 		F556A81D1906669F00E156A8 /* arm64 */ = {
 			isa = PBXGroup;
 			children = (
+				F5AC94FE193EB7D800F58154 /* deblocking_aarch64_neon.S */,
 				F5B8D82C190757290037849A /* mc_aarch64_neon.S */,
 				F556A8221906673900E156A8 /* arm_arch64_common_macro.S */,
 				F556A8231906673900E156A8 /* expand_picture_aarch64_neon.S */,
@@ -236,6 +239,7 @@
 				F5B8D82D190757290037849A /* mc_aarch64_neon.S in Sources */,
 				4C3406C918D96EA600DFA14A /* arm_arch_common_macro.S in Sources */,
 				F556A8241906673900E156A8 /* arm_arch64_common_macro.S in Sources */,
+				F5AC94FF193EB7D800F58154 /* deblocking_aarch64_neon.S in Sources */,
 				4C3406CE18D96EA600DFA14A /* crt_util_safe_x.cpp in Sources */,
 				4C3406CF18D96EA600DFA14A /* deblocking_common.cpp in Sources */,
 				4C3406D018D96EA600DFA14A /* logging.cpp in Sources */,
--- a/codec/build/iOS/dec/welsdec/welsdec.xcodeproj/project.pbxproj
+++ b/codec/build/iOS/dec/welsdec/welsdec.xcodeproj/project.pbxproj
@@ -460,6 +460,7 @@
 					"$(SRCROOT)/../../../../common/arm",
 				);
 				IPHONEOS_DEPLOYMENT_TARGET = 6.1;
+				ONLY_ACTIVE_ARCH = NO;
 				OTHER_LDFLAGS = "-ObjC";
 				PRODUCT_NAME = "$(TARGET_NAME)";
 				SKIP_INSTALL = YES;
@@ -494,6 +495,7 @@
 					"$(SRCROOT)/../../../../common/arm",
 				);
 				IPHONEOS_DEPLOYMENT_TARGET = 6.1;
+				ONLY_ACTIVE_ARCH = NO;
 				OTHER_LDFLAGS = "-ObjC";
 				PRODUCT_NAME = "$(TARGET_NAME)";
 				SKIP_INSTALL = YES;
--- /dev/null
+++ b/codec/common/arm64/deblocking_aarch64_neon.S
@@ -1,0 +1,1116 @@
+/*!
+* \copy
+*     Copyright (c)  2013, Cisco Systems
+*     All rights reserved.
+
+*     Redistribution and use in source and binary forms, with or without
+*     modification, are permitted provided that the following conditions
+*     are met:
+
+*        * Redistributions of source code must retain the above copyright
+*          notice, this list of conditions and the following disclaimer.
+
+*        * Redistributions in binary form must reproduce the above copyright
+*          notice, this list of conditions and the following disclaimer in
+*          the documentation and/or other materials provided with the
+*          distribution.
+
+*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+*     POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifdef HAVE_NEON_AARCH64
+.text
+
+#include "arm_arch64_common_macro.S"
+#ifdef __APPLE__
+
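+// Edge-activity mask per H.264 deblocking: all-ones bytes where
+// |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta.
+// in: $0=p1, $1=p0, $2=q0, $3=q1, $4=alpha (clobbered), $5=beta; out: $6 = mask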
+.macro	MASK_MATRIX
+  uabd    $6.16b, $1.16b, $2.16b
+  cmhi    $6.16b, $4.16b, $6.16b
+
+  uabd    $4.16b, $0.16b, $1.16b
+  cmhi    $4.16b, $5.16b, $4.16b
+  and     $6.16b, $6.16b, $4.16b
+
+  uabd    $4.16b, $3.16b, $2.16b
+  cmhi    $4.16b, $5.16b, $4.16b
+  and     $6.16b, $6.16b, $4.16b
+.endm
+
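+// p1' = p1 + clip3(-tc0, tc0, ((p2 + ((p0+q0+1)>>1)) >> 1) - p1), masked by |p2-p0| < beta and by $7.
+// in: $0=p2, $1=p1, $2=p0, $3=q0, $4=beta, $5=-tc0, $6=tc0, $7=flag; out: $8=p1', $9=(|p2-p0|<beta) as 0/1
+// (reversed p/q arguments yield q1'; $9 feeds the tc extension in the callers)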
+.macro	DIFF_LUMA_LT4_P1_Q1 //(Use Tmp v23, v24)
+  //v0, v1, v2, v3, v17(beta), v18(-Tc0), v6(Tc0), v7(flag), v19, v20
+  urhadd	$8.16b, $2.16b, $3.16b
+  uhadd   $8.16b, $0.16b, $8.16b
+  usubl   $9.8h, $8.8b, $1.8b
+  sqxtn   $9.8b, $9.8h
+  usubl2  $8.8h, $8.16b, $1.16b
+  sqxtn2  $9.16b, $8.8h
+  smax    $8.16b, $9.16b, $5.16b
+//
+  smin	$8.16b, $8.16b, $6.16b
+  uabd	$9.16b, $0.16b, $2.16b
+  cmhi	$9.16b, $4.16b, $9.16b
+  and     $8.16b, $8.16b, $9.16b
+  and     $8.16b, $8.16b, $7.16b
+  add     $8.16b, $1.16b, $8.16b
+  abs     $9.16b, $9.16b
+.endm
+
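+// delta = sat8((((q0-p0) << 2) + (p1-q1) + 4) >> 3); _1 covers the low 8 bytes, _2 the high 8.
+// in: $0=p1, $1=p0, $2=q0, $3=q1; out: $4 = delta; $5 and $6 are scratch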
+.macro	DIFF_LUMA_LT4_P0_Q0_1
+  usubl	$5.8h, $0.8b, $3.8b
+  usubl	$6.8h, $2.8b, $1.8b
+  shl     $6.8h, $6.8h, #2
+  add     $5.8h, $5.8h, $6.8h
+  sqrshrn  $4.8b, $5.8h, #3
+.endm
+
+.macro	DIFF_LUMA_LT4_P0_Q0_2
+  usubl2	$5.8h, $0.16b, $3.16b
+  usubl2	$6.8h, $2.16b, $1.16b
+  shl     $6.8h, $6.8h, #2
+  add     $5.8h, $5.8h, $6.8h
+  sqrshrn2  $4.16b, $5.8h, #3
+.endm
+
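+// Split the signed delta $0 into $1 = max(delta, 0) and $0 = max(-delta, 0),
+// so p0/q0 can be updated with saturating uqadd/uqsub.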
+.macro	EXTRACT_DELTA_INTO_TWO_PART
+  cmge	$1.16b, $0.16b, #0
+  and     $1.16b, $0.16b, $1.16b
+  sub     $0.16b, $1.16b, $0.16b
+.endm
+
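+// Strong (bS=4) luma filter on the p side, low 8 bytes (_2 mirrors it for the high 8):
+//   $0 <- p1' = (p2 + p1 + p0 + q0 + 2) >> 2
+//   $7 <- p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
+//   $6 <- (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3 where the mask in $6 is set,
+//         else (2*p1 + p0 + q1 + 2) >> 2
+// in: $0=p3, $1=p2, $2=p1, $3=p0, $4=q0, $5=q1; $8 and $9 are scratch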
+.macro	DIFF_LUMA_EQ4_P2P1P0_1
+  uaddl $8.8h, $1.8b, $2.8b
+  uaddl $9.8h, $3.8b, $4.8b
+  add   $9.8h, $9.8h, $8.8h
+
+  uaddl $8.8h, $0.8b, $1.8b
+  shl   $8.8h, $8.8h, #1
+  add   $8.8h, $9.8h, $8.8h
+
+  rshrn	$0.8b, $9.8h, #2
+  rshrn	$7.8b, $8.8h, #3
+  shl     $9.8h, $9.8h, #1
+  usubl   $8.8h, $5.8b, $1.8b
+  add     $9.8h, $8.8h, $9.8h
+
+  uaddl	$8.8h, $2.8b, $5.8b
+  uaddw	$8.8h, $8.8h, $2.8b
+  uaddw	$8.8h, $8.8h, $3.8b
+
+  rshrn	$9.8b, $9.8h, #3
+  rshrn	$8.8b, $8.8h, #2
+  bsl		$6.8b, $9.8b, $8.8b
+.endm
+
+.macro	DIFF_LUMA_EQ4_P2P1P0_2
+  uaddl2 $8.8h, $1.16b, $2.16b
+  uaddl2 $9.8h, $3.16b, $4.16b
+  add   $9.8h, $9.8h, $8.8h
+
+  uaddl2 $8.8h, $0.16b, $1.16b
+  shl   $8.8h, $8.8h, #1
+  add   $8.8h, $9.8h, $8.8h
+
+  rshrn2	$0.16b, $9.8h, #2
+  rshrn2	$7.16b, $8.8h, #3
+  shl     $9.8h, $9.8h, #1
+  usubl2   $8.8h, $5.16b, $1.16b
+  add     $9.8h, $8.8h, $9.8h
+
+  uaddl2	$8.8h, $2.16b, $5.16b
+  uaddw2	$8.8h, $8.8h, $2.16b
+  uaddw2	$8.8h, $8.8h, $3.16b
+
+  rshrn2	$9.16b, $9.8h, #3
+  rshrn2	$8.16b, $8.8h, #2
+  bsl		$6.16b, $9.16b, $8.16b
+.endm
+
+
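+// Strong (bS=4) chroma filter, low 8 bytes (_2 mirrors it for the high 8):
+//   $6 <- p0' = (2*p1 + p0 + q1 + 2) >> 2,  $7 <- q0' = (2*q1 + q0 + p1 + 2) >> 2
+// in: $0=p1, $1=p0, $2=q0, $3=q1; $4 and $5 are scratch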
+.macro	DIFF_CHROMA_EQ4_P0Q0_1
+  uaddl $4.8h, $0.8b, $3.8b
+  shl   $4.8h, $4.8h, #1
+  usubl $5.8h, $1.8b, $3.8b
+  add   $5.8h, $5.8h, $4.8h
+  rshrn $6.8b, $5.8h, #2
+  usubl $5.8h, $2.8b, $0.8b
+  add   $5.8h, $5.8h, $4.8h
+  rshrn $7.8b, $5.8h, #2
+.endm
+
+.macro	DIFF_CHROMA_EQ4_P0Q0_2
+  uaddl2 $4.8h, $0.16b, $3.16b
+  shl   $4.8h, $4.8h, #1
+  usubl2 $5.8h, $1.16b, $3.16b
+  add   $5.8h, $5.8h, $4.8h
+  rshrn2 $6.16b, $5.8h, #2
+  usubl2 $5.8h, $2.16b, $0.16b
+  add   $5.8h, $5.8h, $4.8h
+  rshrn2 $7.16b, $5.8h, #2
+.endm
+
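+// Byte-wise select: $3 = $2 ? $0 : $1 (the mask $2 is copied first so it survives the bsl)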
+.macro	DIFF_LUMA_EQ4_MASK
+  mov.16b	$3, $2
+  bsl	$3.16b, $0.16b, $1.16b
+.endm
+
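+// The lane-indexed ld3/ld4/st3/st4 macros below move one pixel row across the edge per call;
+// the horizontal filters use them to gather/scatter the transposed 16-row working set.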
+.macro	LOAD_LUMA_DATA_3
+  ld3	{$0.b, $1.b, $2.b} [$6], [x2], x1
+  ld3	{$3.b, $4.b, $5.b} [$6], [x0], x1
+.endm
+
+.macro	LOAD_LUMA_DATA_4
+  ld4	{$0.b, $1.b, $2.b, $3.b} [$8], [x3], x1
+  ld4	{$4.b, $5.b, $6.b, $7.b} [$8], [x0], x1
+.endm
+
+.macro	STORE_LUMA_DATA_4
+  st4	{$0.b, $1.b, $2.b, $3.b} [$4], [x0], x1
+  st4	{$0.b, $1.b, $2.b, $3.b} [$5], [x2], x1
+.endm
+
+.macro	STORE_LUMA_DATA_3
+  st3 {$0.b, $1.b, $2.b} [$6], [x3], x1
+  st3	{$3.b, $4.b, $5.b} [$6], [x0], x1
+.endm
+
+.macro	LOAD_CHROMA_DATA_4
+  ld4	{$0.b, $1.b, $2.b, $3.b} [$5], [$4], x2
+.endm
+
+.macro	STORE_CHROMA_DATA_2
+  st2	{$0.b, $1.b} [$3], [$2], x2
+.endm
+
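+// Branch to $3 when all 16 bytes of mask $0 are zero (nothing to filter); $1 and $2 are scratch GPRs.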
+.macro	ZERO_JUMP_END
+  mov $1, $0.d[0]
+  mov $2, $0.d[1]
+  orr $1, $1, $2
+  cbz $1, $3
+.endm
+
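+// Gather the non-zero-count flags of the current MB ($0), pulling in the bottom row / right column
+// of the top/left neighbor when available (bit 1 / bit 0 of $1), then sum each 4x4 block's nzc with
+// that of its upper neighbor into $3 and with its left neighbor into $4; $2 is the MB stride.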
+.macro BS_NZC_CHECK
+  ld1 {v0.16b}, [$0]
+  //Arrange the input data --- TOP
+  ands     x6, $1, #2
+  cbz      x6, bs_nzc_check_jump0
+  sub      x6, $0, $2, lsl #4
+  sub      x6, x6, $2, lsl #3
+  add      x6, x6, #12
+  ld1      {v1.s} [3], [x6]
+
+bs_nzc_check_jump0:
+  ext.16b  v1, v1, v0, #12
+  add      $3.16b, v0.16b, v1.16b
+
+  // Arrange the input data --- LEFT
+  ands     x6, $1, #1
+  cbz      x6, bs_nzc_check_jump1
+
+  sub      x6, $0, #21
+  add      x7, x6, #4
+  ld1      {v1.b} [12], [x6]
+  add      x6, x7, #4
+  ld1      {v1.b} [13], [x7]
+  add      x7, x6, #4
+  ld1      {v1.b} [14], [x6]
+  ld1      {v1.b} [15], [x7]
+
+bs_nzc_check_jump1:
+  ins      v2.d[0], v0.d[1]
+  zip1     v0.16b, v0.16b, v2.16b
+  ins      v2.d[0], v0.d[1]
+  zip1     v0.16b, v0.16b, v2.16b
+  ext.16b  v1, v1, v0, #12
+  add      $4.16b, v0.16b, v1.16b
+.endm
+
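+// Flag a 4x4 edge (all-ones bytes in the output) where any MV component differs by >= 4
+// (one integer pel in quarter-pel units) across the edge; clobbers $0 and v20-v23.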
+.macro BS_COMPARE_MV //in: $0,$1(const),$2(const),$3(const),$4(const); out:$5
+  mov   w6, #4
+  sabd  v20.8h, $0.8h, $1.8h
+  sabd  v21.8h, $1.8h, $2.8h
+  dup   $0.8h, w6
+  sabd  v22.8h, $2.8h, $3.8h
+  sabd  v23.8h, $3.8h, $4.8h
+
+  cmge  v20.8h, v20.8h, $0.8h
+  cmge  v21.8h, v21.8h, $0.8h
+  cmge  v22.8h, v22.8h, $0.8h
+  cmge  v23.8h, v23.8h, $0.8h
+
+  addp v20.8h, v20.8h, v21.8h
+  addp v21.8h, v22.8h, v23.8h
+
+  addhn  $5.8b, v20.8h, v20.8h
+  addhn2  $5.16b, v21.8h, v21.8h
+.endm
+
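+// Load the current MB's 16 MVs from $0 plus the neighbor MVs selected by $1, then run
+// BS_COMPARE_MV for the top direction (out $3) and, after transposing to columns, the
+// left direction (out $4); $2 is the MB stride, $5/$6 are scratch.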
+.macro BS_MV_CHECK
+  ldp q0, q1, [$0], #32
+  ldp q2, q3, [$0]
+  sub $0, $0, #32
+  // Arrange the input data --- TOP
+  ands     x6, $1, #2
+  cbz     x6, bs_mv_check_jump0
+  sub      x6, $0, $2, lsl #6
+  add      x6, x6, #48
+  ld1      {v4.16b}, [x6]
+bs_mv_check_jump0:
+  BS_COMPARE_MV  v4, v0, v1, v2, v3, $3
+  // Arrange the input data --- LEFT
+  ands     x6, $1, #1
+  cbz      x6, bs_mv_check_jump1
+  sub      x6, $0, #52
+  add      x7, x6, #16
+  ld1      {v4.s} [0], [x6]
+  add      x6, x7, #16
+  ld1      {v4.s} [1], [x7]
+  add      x7, x6, #16
+  ld1      {v4.s} [2], [x6]
+  ld1      {v4.s} [3], [x7]
+bs_mv_check_jump1:
+  zip1  $5.4s, v0.4s, v2.4s
+  zip2  $6.4s, v0.4s, v2.4s
+  zip1  v0.4s, v1.4s, v3.4s
+  zip2  v2.4s, v1.4s, v3.4s
+  zip2  v1.4s, $5.4s, v0.4s
+  zip1  v0.4s, $5.4s, v0.4s
+  zip2  v3.4s, $6.4s, v2.4s
+  zip1  v2.4s, $6.4s, v2.4s
+  BS_COMPARE_MV  v4, v0, v1, v2, v3, $4
+.endm
+
+#else
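+// GNU as versions of the macros above: identical bodies, written with named
+// instead of numbered macro arguments.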
+
+.macro	MASK_MATRIX arg0, arg1, arg2, arg3, arg4, arg5, arg6
+  uabd    \arg6.16b, \arg1.16b, \arg2.16b
+  cmhi    \arg6.16b, \arg4.16b, \arg6.16b
+
+  uabd    \arg4.16b, \arg0.16b, \arg1.16b
+  cmhi    \arg4.16b, \arg5.16b, \arg4.16b
+  and     \arg6.16b, \arg6.16b, \arg4.16b
+
+  uabd    \arg4.16b, \arg3.16b, \arg2.16b
+  cmhi    \arg4.16b, \arg5.16b, \arg4.16b
+  and     \arg6.16b, \arg6.16b, \arg4.16b
+.endm
+
+.macro	DIFF_LUMA_LT4_P1_Q1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
+  //v0, v1, v2, v3, v17(beta), v18(-Tc0), v6(Tc0), v7(flag), v19, v20
+  urhadd	\arg8.16b, \arg2.16b, \arg3.16b
+  uhadd   \arg8.16b, \arg0.16b, \arg8.16b
+  usubl   \arg9.8h, \arg8.8b, \arg1.8b
+  sqxtn   \arg9.8b, \arg9.8h
+  usubl2  \arg8.8h, \arg8.16b, \arg1.16b
+  sqxtn2  \arg9.16b, \arg8.8h
+  smax    \arg8.16b, \arg9.16b, \arg5.16b
+  //
+  smin	\arg8.16b, \arg8.16b, \arg6.16b
+  uabd	\arg9.16b, \arg0.16b, \arg2.16b
+  cmhi	\arg9.16b, \arg4.16b, \arg9.16b
+  and     \arg8.16b, \arg8.16b, \arg9.16b
+  and     \arg8.16b, \arg8.16b, \arg7.16b
+  add     \arg8.16b, \arg1.16b, \arg8.16b
+  abs     \arg9.16b, \arg9.16b
+.endm
+
+.macro	DIFF_LUMA_LT4_P0_Q0_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6
+  usubl	\arg5.8h, \arg0.8b, \arg3.8b
+  usubl	\arg6.8h, \arg2.8b, \arg1.8b
+  shl     \arg6.8h, \arg6.8h, #2
+  add     \arg5.8h, \arg5.8h, \arg6.8h
+  sqrshrn  \arg4.8b, \arg5.8h, #3
+.endm
+
+.macro	DIFF_LUMA_LT4_P0_Q0_2 arg0, arg1, arg2, arg3, arg4, arg5, arg6
+  usubl2	\arg5.8h, \arg0.16b, \arg3.16b
+  usubl2	\arg6.8h, \arg2.16b, \arg1.16b
+  shl     \arg6.8h, \arg6.8h, #2
+  add     \arg5.8h, \arg5.8h, \arg6.8h
+  sqrshrn2  \arg4.16b, \arg5.8h, #3
+.endm
+
+.macro	EXTRACT_DELTA_INTO_TWO_PART arg0, arg1
+  cmge	\arg1.16b, \arg0.16b, #0
+  and     \arg1.16b, \arg0.16b, \arg1.16b
+  sub     \arg0.16b, \arg1.16b, \arg0.16b
+.endm
+
+.macro	DIFF_LUMA_EQ4_P2P1P0_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
+  uaddl \arg8.8h, \arg1.8b, \arg2.8b
+  uaddl \arg9.8h, \arg3.8b, \arg4.8b
+  add   \arg9.8h, \arg9.8h, \arg8.8h
+
+  uaddl \arg8.8h, \arg0.8b, \arg1.8b
+  shl   \arg8.8h, \arg8.8h, #1
+  add   \arg8.8h, \arg9.8h, \arg8.8h
+
+  rshrn	\arg0.8b, \arg9.8h, #2
+  rshrn	\arg7.8b, \arg8.8h, #3
+  shl     \arg9.8h, \arg9.8h, #1
+  usubl   \arg8.8h, \arg5.8b, \arg1.8b
+  add     \arg9.8h, \arg8.8h, \arg9.8h
+
+  uaddl	\arg8.8h, \arg2.8b, \arg5.8b
+  uaddw	\arg8.8h, \arg8.8h, \arg2.8b
+  uaddw	\arg8.8h, \arg8.8h, \arg3.8b
+
+  rshrn	\arg9.8b, \arg9.8h, #3
+  rshrn	\arg8.8b, \arg8.8h, #2
+  bsl		\arg6.8b, \arg9.8b, \arg8.8b
+.endm
+
+.macro	DIFF_LUMA_EQ4_P2P1P0_2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
+  uaddl2 \arg8.8h, \arg1.16b, \arg2.16b
+  uaddl2 \arg9.8h, \arg3.16b, \arg4.16b
+  add   \arg9.8h, \arg9.8h, \arg8.8h
+
+  uaddl2 \arg8.8h, \arg0.16b, \arg1.16b
+  shl   \arg8.8h, \arg8.8h, #1
+  add   \arg8.8h, \arg9.8h, \arg8.8h
+
+  rshrn2	\arg0.16b, \arg9.8h, #2
+  rshrn2	\arg7.16b, \arg8.8h, #3
+  shl     \arg9.8h, \arg9.8h, #1
+  usubl2   \arg8.8h, \arg5.16b, \arg1.16b
+  add     \arg9.8h, \arg8.8h, \arg9.8h
+
+  uaddl2	\arg8.8h, \arg2.16b, \arg5.16b
+  uaddw2	\arg8.8h, \arg8.8h, \arg2.16b
+  uaddw2	\arg8.8h, \arg8.8h, \arg3.16b
+
+  rshrn2	\arg9.16b, \arg9.8h, #3
+  rshrn2	\arg8.16b, \arg8.8h, #2
+  bsl		\arg6.16b, \arg9.16b, \arg8.16b
+.endm
+
+
+.macro	DIFF_CHROMA_EQ4_P0Q0_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+  uaddl \arg4.8h, \arg0.8b, \arg3.8b
+  shl   \arg4.8h, \arg4.8h, #1
+  usubl \arg5.8h, \arg1.8b, \arg3.8b
+  add   \arg5.8h, \arg5.8h, \arg4.8h
+  rshrn \arg6.8b, \arg5.8h, #2
+  usubl \arg5.8h, \arg2.8b, \arg0.8b
+  add   \arg5.8h, \arg5.8h, \arg4.8h
+  rshrn \arg7.8b, \arg5.8h, #2
+.endm
+
+.macro	DIFF_CHROMA_EQ4_P0Q0_2  arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+  uaddl2 \arg4.8h, \arg0.16b, \arg3.16b
+  shl   \arg4.8h, \arg4.8h, #1
+  usubl2 \arg5.8h, \arg1.16b, \arg3.16b
+  add   \arg5.8h, \arg5.8h, \arg4.8h
+  rshrn2 \arg6.16b, \arg5.8h, #2
+  usubl2 \arg5.8h, \arg2.16b, \arg0.16b
+  add   \arg5.8h, \arg5.8h, \arg4.8h
+  rshrn2 \arg7.16b, \arg5.8h, #2
+.endm
+
+.macro	DIFF_LUMA_EQ4_MASK arg0, arg1, arg2, arg3
+  mov.16b	\arg3, \arg2
+  bsl	\arg3.16b, \arg0.16b, \arg1.16b
+.endm
+
+.macro	LOAD_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
+  ld3	{\arg0.b, \arg1.b, \arg2.b} [\arg6], [x2], x1
+  ld3	{\arg3.b, \arg4.b, \arg5.b} [\arg6], [x0], x1
+.endm
+
+.macro	LOAD_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+  ld4	{\arg0.b, \arg1.b, \arg2.b, \arg3.b} [\arg8], [x3], x1
+  ld4	{\arg4.b, \arg5.b, \arg6.b, \arg7.b} [\arg8], [x0], x1
+.endm
+
+.macro	STORE_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5
+  st4	{\arg0.b, \arg1.b, \arg2.b, \arg3.b} [\arg4], [x0], x1
+  st4	{\arg0.b, \arg1.b, \arg2.b, \arg3.b} [\arg5], [x2], x1
+.endm
+
+.macro	STORE_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
+  st3   {\arg0.b, \arg1.b, \arg2.b} [\arg6], [x3], x1
+  st3	{\arg3.b, \arg4.b, \arg5.b} [\arg6], [x0], x1
+.endm
+
+.macro	LOAD_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5
+  ld4	{\arg0.b, \arg1.b, \arg2.b, \arg3.b} [\arg5], [\arg4], x2
+.endm
+
+.macro	STORE_CHROMA_DATA_2 arg0, arg1, arg2, arg3
+  st2	{\arg0.b, \arg1.b} [\arg3], [\arg2], x2
+.endm
+
+.macro	ZERO_JUMP_END arg0, arg1, arg2, arg3
+  mov \arg1, \arg0.d[0]
+  mov \arg2, \arg0.d[1]
+  orr \arg1, \arg1, \arg2
+  cbz \arg1, \arg3
+.endm
+
+.macro BS_NZC_CHECK arg0, arg1, arg2, arg3, arg4
+  ld1 {v0.16b}, [\arg0]
+  //Arrange the input data --- TOP
+  ands     x6, \arg1, #2
+  cbz      x6, bs_nzc_check_jump0
+  sub      x6, \arg0, \arg2, lsl #4
+  sub      x6, x6, \arg2, lsl #3
+  add      x6, x6, #12
+  ld1      {v1.s} [3], [x6]
+
+bs_nzc_check_jump0:
+  ext.16b  v1, v1, v0, #12
+  add      \arg3.16b, v0.16b, v1.16b
+
+  // Arrange the input data --- LEFT
+  ands     x6, \arg1, #1
+  cbz      x6, bs_nzc_check_jump1
+
+  sub      x6, \arg0, #21
+  add      x7, x6, #4
+  ld1      {v1.b} [12], [x6]
+  add      x6, x7, #4
+  ld1      {v1.b} [13], [x7]
+  add      x7, x6, #4
+  ld1      {v1.b} [14], [x6]
+  ld1      {v1.b} [15], [x7]
+
+bs_nzc_check_jump1:
+  ins      v2.d[0], v0.d[1]
+  zip1     v0.16b, v0.16b, v2.16b
+  ins      v2.d[0], v0.d[1]
+  zip1     v0.16b, v0.16b, v2.16b
+  ext.16b  v1, v1, v0, #12
+  add      \arg4.16b, v0.16b, v1.16b
+.endm
+
+.macro BS_COMPARE_MV arg0, arg1, arg2, arg3, arg4, arg5
+  //in: \arg0,\arg1(const),\arg2(const),\arg3(const),\arg4(const); out:\arg5
+  mov   w6, #4
+  sabd  v20.8h, \arg0.8h, \arg1.8h
+  sabd  v21.8h, \arg1.8h, \arg2.8h
+  dup   \arg0.8h, w6
+  sabd  v22.8h, \arg2.8h, \arg3.8h
+  sabd  v23.8h, \arg3.8h, \arg4.8h
+
+  cmge  v20.8h, v20.8h, \arg0.8h
+  cmge  v21.8h, v21.8h, \arg0.8h
+  cmge  v22.8h, v22.8h, \arg0.8h
+  cmge  v23.8h, v23.8h, \arg0.8h
+
+  addp v20.8h, v20.8h, v21.8h
+  addp v21.8h, v22.8h, v23.8h
+
+  addhn  \arg5.8b, v20.8h, v20.8h
+  addhn2  \arg5.16b, v21.8h, v21.8h
+.endm
+
+.macro BS_MV_CHECK arg0, arg1, arg2, arg3, arg4, arg5, arg6
+  ldp q0, q1, [\arg0], #32
+  ldp q2, q3, [\arg0]
+  sub \arg0, \arg0, #32
+  // Arrange the input data --- TOP
+  ands     x6, \arg1, #2
+  cbz     x6, bs_mv_check_jump0
+  sub      x6, \arg0, \arg2, lsl #6
+  add      x6, x6, #48
+  ld1      {v4.16b}, [x6]
+bs_mv_check_jump0:
+  BS_COMPARE_MV  v4, v0, v1, v2, v3, \arg3
+  // Arrange the input data --- LEFT
+  ands     x6, \arg1, #1
+  cbz      x6, bs_mv_check_jump1
+  sub      x6, \arg0, #52
+  add      x7, x6, #16
+  ld1      {v4.s} [0], [x6]
+  add      x6, x7, #16
+  ld1      {v4.s} [1], [x7]
+  add      x7, x6, #16
+  ld1      {v4.s} [2], [x6]
+  ld1      {v4.s} [3], [x7]
+bs_mv_check_jump1:
+  zip1  \arg5.4s, v0.4s, v2.4s
+  zip2  \arg6.4s, v0.4s, v2.4s
+  zip1  v0.4s, v1.4s, v3.4s
+  zip2  v2.4s, v1.4s, v3.4s
+  zip2  v1.4s, \arg5.4s, v0.4s
+  zip1  v0.4s, \arg5.4s, v0.4s
+  zip2  v3.4s, \arg6.4s, v2.4s
+  zip1  v2.4s, \arg6.4s, v2.4s
+  BS_COMPARE_MV  v4, v0, v1, v2, v3, \arg4
+.endm
+#endif
+
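+// Clamp the 24 non-zero-count bytes at x0 to 0/1 in place (cmeq/mvn/abs turns any non-zero value into 1).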
+WELS_ASM_ARCH64_FUNC_BEGIN WelsNonZeroCount_AArch64_neon
+  ld1 {v0.8b, v1.8b, v2.8b}, [x0]
+  ins v0.d[1], v1.d[0]
+  uzp1 v0.2d, v0.2d, v1.2d
+  cmeq v0.16b, v0.16b, #0
+  cmeq v2.8b, v2.8b, #0
+  mvn v0.16b, v0.16b
+  mvn v2.8b, v2.8b
+  abs v0.16b, v0.16b
+  abs v2.8b, v2.8b
+  ins v1.d[0], v0.d[1]
+  st1 {v0.8b, v1.8b, v2.8b}, [x0]
+WELS_ASM_ARCH64_FUNC_END
+
+
+WELS_ASM_ARCH64_FUNC_BEGIN DeblockLumaLt4V_AArch64_neon //uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* tc
+  dup v16.16b, w2 //alpha
+  dup v17.16b, w3 //beta
+  add x2, x1, x1, lsl #1
+  sub x2, x0, x2
+  movi v23.16b, #128
+  ld1 {v0.16b}, [x2], x1
+  ld1 {v1.16b}, [x2], x1
+  ld1 {v2.16b}, [x2]
+  ld1 {v3.16b}, [x0], x1
+  ld1 {v4.16b}, [x0], x1
+  ld1 {v5.16b}, [x0]
+  sub	x2, x2, x1
+  ld4r {v18.8b, v19.8b, v20.8b, v21.8b}, [x4]
+  trn1 v18.2s, v18.2s, v19.2s
+  trn1 v20.2s, v20.2s, v21.2s
+  trn1 v6.2d, v18.2d, v20.2d // iTc0: 0000, 1111, 2222, 3333
+  cmge v7.16b, v6.16b, #0 // iTc0 Flag
+
+  MASK_MATRIX	v1, v2, v3, v4, v16, v17, v18
+  and	v7.16b, v7.16b, v18.16b // need filter flag
+
+  ZERO_JUMP_END v7, x3, x4, DeblockLumaLt4V_AArch64_neon_end
+
+  eor	v18.16b, v18.16b, v18.16b
+  sub v18.16b, v18.16b, v6.16b // -iTc0: 0000, 1111, 2222, 3333
+
+  DIFF_LUMA_LT4_P1_Q1	v0, v1, v2, v3, v17, v18, v6, v7, v19, v20
+  st1	{v19.16b}, [x2], x1
+
+  DIFF_LUMA_LT4_P1_Q1	v5, v4, v3, v2, v17, v18, v6, v7, v21, v22
+
+  abs	v20.16b, v20.16b
+  abs	v22.16b, v22.16b
+  add	v6.16b, v6.16b, v20.16b
+  add	v6.16b, v6.16b, v22.16b
+  eor	v18.16b, v18.16b, v18.16b
+  sub	v18.16b, v18.16b, v6.16b
+
+  DIFF_LUMA_LT4_P0_Q0_1	v1, v2, v3, v4, v19, v20, v22
+  DIFF_LUMA_LT4_P0_Q0_2	v1, v2, v3, v4, v19, v20, v22
+
+  smax	v19.16b, v19.16b, v18.16b
+  smin	v19.16b, v19.16b, v6.16b
+  and     v19.16b, v19.16b, v7.16b
+
+  EXTRACT_DELTA_INTO_TWO_PART	v19, v20
+  uqadd	v2.16b, v2.16b, v20.16b
+  uqsub	v2.16b, v2.16b, v19.16b
+  st1     {v2.16b}, [x2], x1
+  uqsub	v3.16b, v3.16b, v20.16b
+  uqadd	v3.16b, v3.16b, v19.16b
+  st1     {v3.16b}, [x2], x1
+  st1     {v21.16b}, [x2]
+DeblockLumaLt4V_AArch64_neon_end:
+WELS_ASM_ARCH64_FUNC_END
+
+
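+// bS=4 vertical luma edge: rows v0..v3 = p3..p0 above the edge, v4..v7 = q0..q3 below it.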
+WELS_ASM_ARCH64_FUNC_BEGIN DeblockLumaEq4V_AArch64_neon
+  dup     v16.16b, w2 //alpha
+  dup     v17.16b, w3 //beta
+  sub     x3, x0, x1, lsl #2
+
+  ld1     {v0.16b}, [x3], x1
+  ld1     {v4.16b}, [x0], x1
+  ld1     {v1.16b}, [x3], x1
+  ld1     {v5.16b}, [x0], x1
+  ld1     {v2.16b}, [x3], x1
+  ld1     {v6.16b}, [x0], x1
+  ld1     {v3.16b}, [x3]
+  ld1     {v7.16b}, [x0]
+
+  sub     x3, x3, x1, lsl #1
+  MASK_MATRIX	v2, v3, v4, v5, v16, v17, v18
+  lsr		w2, w2, #2
+  add		w2, w2, #2
+  dup     v16.16b, w2 //((alpha >> 2) + 2)
+  uabd	v19.16b, v3.16b, v4.16b
+  cmhi	v20.16b, v16.16b, v19.16b //iDetaP0Q0 < ((iAlpha >> 2) + 2)
+
+  uabd 	v21.16b, v1.16b, v3.16b
+  cmhi	v21.16b, v17.16b, v21.16b //bDetaP2P0
+  and     v21.16b, v21.16b, v20.16b //(iDetaP0Q0 < ((iAlpha >> 2) + 2))&&bDetaP2P0
+
+  uabd	v22.16b, v6.16b, v4.16b
+  cmhi	v22.16b, v17.16b, v22.16b //bDetaQ2Q0
+  and     v22.16b, v22.16b, v20.16b //(iDetaP0Q0 < ((iAlpha >> 2) + 2))&&bDetaQ2Q0
+  and     v20.16b, v20.16b, v18.16b //(iDetaP0Q0 < iAlpha) && bDetaP1P0 && bDetaQ1Q0&&(iDetaP0Q0 < ((iAlpha >> 2) + 2))
+
+  mov.16b v23, v21
+  mov.16b v24, v21
+
+  mov.16b v25, v0
+  DIFF_LUMA_EQ4_P2P1P0_1		v0, v1, v2, v3, v4, v5, v23, v19, v17, v16
+  DIFF_LUMA_EQ4_P2P1P0_2		v25, v1, v2, v3, v4, v5, v24, v19, v17, v16
+  ins v0.d[1], v25.d[1]
+  ins v23.d[1], v24.d[1]
+  and	v21.16b, v20.16b, v21.16b
+  DIFF_LUMA_EQ4_MASK	v19, v1, v21, v17
+  st1	{v17.16b}, [x3], x1
+  DIFF_LUMA_EQ4_MASK	v0, v2, v21, v17
+  st1	{v17.16b}, [x3], x1
+  DIFF_LUMA_EQ4_MASK	v23, v3, v18, v17
+  st1	{v17.16b}, [x3], x1
+
+
+  mov.16b v23, v22
+  mov.16b v24, v22
+  mov.16b v25, v7
+  DIFF_LUMA_EQ4_P2P1P0_1		v7, v6, v5, v4, v3, v2, v23, v19, v17, v16
+  DIFF_LUMA_EQ4_P2P1P0_2		v25, v6, v5, v4, v3, v2, v24, v19, v17, v16
+  ins v7.d[1], v25.d[1]
+  ins v23.d[1], v24.d[1]
+  and	v22.16b, v20.16b, v22.16b
+  DIFF_LUMA_EQ4_MASK	v23, v4, v18, v17
+  st1	{v17.16b}, [x3], x1
+  DIFF_LUMA_EQ4_MASK	v7, v5, v22, v17
+  st1	{v17.16b}, [x3], x1
+  DIFF_LUMA_EQ4_MASK	v19, v6, v22, v17
+  st1	{v17.16b}, [x3], x1
+DeblockLumaEq4V_AArch64_neon_end:
+WELS_ASM_ARCH64_FUNC_END
+
+
+WELS_ASM_ARCH64_FUNC_BEGIN DeblockLumaLt4H_AArch64_neon //uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* tc
+  dup v16.16b, w2 //alpha
+  dup v17.16b, w3 //beta
+  sub x2, x0, #3
+  movi v23.16b, #128
+
+  LOAD_LUMA_DATA_3		v0, v1, v2, v3, v4, v5, 0
+  LOAD_LUMA_DATA_3		v0, v1, v2, v3, v4, v5, 1
+  LOAD_LUMA_DATA_3		v0, v1, v2, v3, v4, v5, 2
+  LOAD_LUMA_DATA_3		v0, v1, v2, v3, v4, v5, 3
+  LOAD_LUMA_DATA_3		v0, v1, v2, v3, v4, v5, 4
+  LOAD_LUMA_DATA_3		v0, v1, v2, v3, v4, v5, 5
+  LOAD_LUMA_DATA_3		v0, v1, v2, v3, v4, v5, 6
+  LOAD_LUMA_DATA_3		v0, v1, v2, v3, v4, v5, 7
+
+  LOAD_LUMA_DATA_3		v0, v1, v2, v3, v4, v5, 8
+  LOAD_LUMA_DATA_3		v0, v1, v2, v3, v4, v5, 9
+  LOAD_LUMA_DATA_3		v0, v1, v2, v3, v4, v5, 10
+  LOAD_LUMA_DATA_3		v0, v1, v2, v3, v4, v5, 11
+  LOAD_LUMA_DATA_3		v0, v1, v2, v3, v4, v5, 12
+  LOAD_LUMA_DATA_3		v0, v1, v2, v3, v4, v5, 13
+  LOAD_LUMA_DATA_3		v0, v1, v2, v3, v4, v5, 14
+  LOAD_LUMA_DATA_3		v0, v1, v2, v3, v4, v5, 15
+
+  sub x0, x0, x1, lsl #4
+
+  ld4r {v18.8b, v19.8b, v20.8b, v21.8b}, [x4]
+  trn1 v18.2s, v18.2s, v19.2s
+  trn1 v20.2s, v20.2s, v21.2s
+  trn1 v6.2d, v18.2d, v20.2d // iTc0: 0000, 1111, 2222, 3333
+  cmge v7.16b, v6.16b, #0 // iTc0 Flag
+
+  MASK_MATRIX	v1, v2, v3, v4, v16, v17, v18
+  and	v7.16b, v7.16b, v18.16b // need filter flag
+
+  ZERO_JUMP_END v7, x3, x4, DeblockLumaLt4H_AArch64_neon_end
+
+  eor	v18.16b, v18.16b, v18.16b
+  sub v18.16b, v18.16b, v6.16b // -iTc0: 0000, 1111, 2222, 3333
+
+  DIFF_LUMA_LT4_P1_Q1	v0, v1, v2, v3, v17, v18, v6, v7, v19, v20 //Use Tmp v23,v24
+  mov.16b v25, v19
+
+  DIFF_LUMA_LT4_P1_Q1	v5, v4, v3, v2, v17, v18, v6, v7, v21, v22 //Use Tmp v23,v24
+
+  abs	v20.16b, v20.16b
+  abs	v22.16b, v22.16b
+  add	v6.16b, v6.16b, v20.16b
+  add	v6.16b, v6.16b, v22.16b
+  eor	v18.16b, v18.16b, v18.16b
+  sub	v18.16b, v18.16b, v6.16b
+
+  DIFF_LUMA_LT4_P0_Q0_1	v1, v2, v3, v4, v19, v20, v22
+  DIFF_LUMA_LT4_P0_Q0_2	v1, v2, v3, v4, v19, v20, v22
+
+  smax	v19.16b, v19.16b, v18.16b
+  smin	v19.16b, v19.16b, v6.16b
+  and     v19.16b, v19.16b, v7.16b
+
+  EXTRACT_DELTA_INTO_TWO_PART	v19, v20
+  uqadd	v2.16b, v2.16b, v20.16b
+  uqsub	v2.16b, v2.16b, v19.16b
+  mov.16b v26, v2
+  uqsub	v3.16b, v3.16b, v20.16b
+  uqadd	v3.16b, v3.16b, v19.16b
+  mov.16b v27, v3
+  mov.16b v28, v21
+
+  sub	x0, x0, #2
+  add	x2, x0, x1
+  lsl	x1, x1, #1
+
+  STORE_LUMA_DATA_4		v25, v26, v27, v28, 0, 1
+  STORE_LUMA_DATA_4		v25, v26, v27, v28, 2, 3
+  STORE_LUMA_DATA_4		v25, v26, v27, v28, 4, 5
+  STORE_LUMA_DATA_4		v25, v26, v27, v28, 6, 7
+
+  STORE_LUMA_DATA_4		v25, v26, v27, v28, 8, 9
+  STORE_LUMA_DATA_4		v25, v26, v27, v28, 10, 11
+  STORE_LUMA_DATA_4		v25, v26, v27, v28, 12, 13
+  STORE_LUMA_DATA_4		v25, v26, v27, v28, 14, 15
+DeblockLumaLt4H_AArch64_neon_end:
+WELS_ASM_ARCH64_FUNC_END
+
+
+WELS_ASM_ARCH64_FUNC_BEGIN DeblockLumaEq4H_AArch64_neon
+  dup     v16.16b, w2 //alpha
+  dup     v17.16b, w3 //beta
+  sub     x3, x0, #4
+
+  LOAD_LUMA_DATA_4		v0, v1, v2, v3, v4, v5, v6, v7, 0
+  LOAD_LUMA_DATA_4		v0, v1, v2, v3, v4, v5, v6, v7, 1
+  LOAD_LUMA_DATA_4		v0, v1, v2, v3, v4, v5, v6, v7, 2
+  LOAD_LUMA_DATA_4		v0, v1, v2, v3, v4, v5, v6, v7, 3
+  LOAD_LUMA_DATA_4		v0, v1, v2, v3, v4, v5, v6, v7, 4
+  LOAD_LUMA_DATA_4		v0, v1, v2, v3, v4, v5, v6, v7, 5
+  LOAD_LUMA_DATA_4		v0, v1, v2, v3, v4, v5, v6, v7, 6
+  LOAD_LUMA_DATA_4		v0, v1, v2, v3, v4, v5, v6, v7, 7
+
+  LOAD_LUMA_DATA_4		v0, v1, v2, v3, v4, v5, v6, v7, 8
+  LOAD_LUMA_DATA_4		v0, v1, v2, v3, v4, v5, v6, v7, 9
+  LOAD_LUMA_DATA_4		v0, v1, v2, v3, v4, v5, v6, v7, 10
+  LOAD_LUMA_DATA_4		v0, v1, v2, v3, v4, v5, v6, v7, 11
+  LOAD_LUMA_DATA_4		v0, v1, v2, v3, v4, v5, v6, v7, 12
+  LOAD_LUMA_DATA_4		v0, v1, v2, v3, v4, v5, v6, v7, 13
+  LOAD_LUMA_DATA_4		v0, v1, v2, v3, v4, v5, v6, v7, 14
+  LOAD_LUMA_DATA_4		v0, v1, v2, v3, v4, v5, v6, v7, 15
+
+  sub x0, x0, x1, lsl #4
+  sub x3, x0, #3
+  MASK_MATRIX	v2, v3, v4, v5, v16, v17, v18
+
+  ZERO_JUMP_END v18, x4, x5, DeblockLumaEq4H_AArch64_neon_end
+
+  lsr		w2, w2, #2
+  add		w2, w2, #2
+  dup     v16.16b, w2 //((alpha >> 2) + 2)
+  uabd	v19.16b, v3.16b, v4.16b
+  cmhi	v20.16b, v16.16b, v19.16b //iDetaP0Q0 < ((iAlpha >> 2) + 2)
+
+  uabd	v21.16b, v1.16b, v3.16b
+  cmhi	v21.16b, v17.16b, v21.16b //bDetaP2P0
+  and     v21.16b, v21.16b, v20.16b //(iDetaP0Q0 < ((iAlpha >> 2) + 2))&&bDetaP2P0
+
+  uabd	v22.16b, v6.16b, v4.16b
+  cmhi	v22.16b, v17.16b, v22.16b //bDetaQ2Q0
+  and     v22.16b, v22.16b, v20.16b //(iDetaP0Q0 < ((iAlpha >> 2) + 2))&&bDetaQ2Q0
+  and     v20.16b, v20.16b, v18.16b //(iDetaP0Q0 < iAlpha) && bDetaP1P0 && bDetaQ1Q0&&(iDetaP0Q0 < ((iAlpha >> 2) + 2))
+
+  mov.16b v23, v21
+  mov.16b v24, v21
+
+  mov.16b v25, v0
+  DIFF_LUMA_EQ4_P2P1P0_1		v0, v1, v2, v3, v4, v5, v23, v19, v17, v16
+  DIFF_LUMA_EQ4_P2P1P0_2		v25, v1, v2, v3, v4, v5, v24, v19, v17, v16
+  ins v0.d[1], v25.d[1]
+  ins v23.d[1], v24.d[1]
+  and	v21.16b, v20.16b, v21.16b
+  DIFF_LUMA_EQ4_MASK	v19, v1, v21, v17
+  mov.16b v26, v17
+  DIFF_LUMA_EQ4_MASK	v0, v2, v21, v17
+  mov.16b v27, v17
+  DIFF_LUMA_EQ4_MASK	v23, v3, v18, v17
+  mov.16b v28, v17
+
+
+  mov.16b v23, v22
+  mov.16b v24, v22
+  mov.16b v25, v7
+  DIFF_LUMA_EQ4_P2P1P0_1		v7, v6, v5, v4, v3, v2, v23, v19, v17, v16
+  DIFF_LUMA_EQ4_P2P1P0_2		v25, v6, v5, v4, v3, v2, v24, v19, v17, v16
+  ins v7.d[1], v25.d[1]
+  ins v23.d[1], v24.d[1]
+  and	v22.16b, v20.16b, v22.16b
+  DIFF_LUMA_EQ4_MASK	v23, v4, v18, v17
+  mov.16b v29, v17
+  DIFF_LUMA_EQ4_MASK	v7, v5, v22, v17
+  mov.16b v30, v17
+  DIFF_LUMA_EQ4_MASK	v19, v6, v22, v17
+  mov.16b v31, v17
+
+  STORE_LUMA_DATA_3		v26, v27, v28, v29, v30, v31, 0
+  STORE_LUMA_DATA_3		v26, v27, v28, v29, v30, v31, 1
+  STORE_LUMA_DATA_3		v26, v27, v28, v29, v30, v31, 2
+  STORE_LUMA_DATA_3		v26, v27, v28, v29, v30, v31, 3
+  STORE_LUMA_DATA_3		v26, v27, v28, v29, v30, v31, 4
+  STORE_LUMA_DATA_3		v26, v27, v28, v29, v30, v31, 5
+  STORE_LUMA_DATA_3		v26, v27, v28, v29, v30, v31, 6
+  STORE_LUMA_DATA_3		v26, v27, v28, v29, v30, v31, 7
+  STORE_LUMA_DATA_3		v26, v27, v28, v29, v30, v31, 8
+  STORE_LUMA_DATA_3		v26, v27, v28, v29, v30, v31, 9
+  STORE_LUMA_DATA_3		v26, v27, v28, v29, v30, v31, 10
+  STORE_LUMA_DATA_3		v26, v27, v28, v29, v30, v31, 11
+  STORE_LUMA_DATA_3		v26, v27, v28, v29, v30, v31, 12
+  STORE_LUMA_DATA_3		v26, v27, v28, v29, v30, v31, 13
+  STORE_LUMA_DATA_3		v26, v27, v28, v29, v30, v31, 14
+  STORE_LUMA_DATA_3		v26, v27, v28, v29, v30, v31, 15
+DeblockLumaEq4H_AArch64_neon_end:
+WELS_ASM_ARCH64_FUNC_END
+
+
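+// The chroma filters below keep Cb in the low 8 bytes of each vector and Cr in the high 8 bytes.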
+WELS_ASM_ARCH64_FUNC_BEGIN DeblockChromaLt4V_AArch64_neon //uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iAlpha, int32_t iBeta, int8_t* pTc
+  dup v16.16b, w3 //alpha
+  dup v17.16b, w4 //beta
+  lsl x3, x2, #1
+  sub x6, x0, x3 //pPixCb-2*Stride
+  sub x7, x1, x3 //pPixCr-2*Stride
+
+  ld1 {v0.d} [0], [x6], x2
+  ld1 {v1.d} [0], [x6]
+  ld1 {v2.d} [0], [x0], x2
+  ld1 {v3.d} [0], [x0]
+  ld1 {v0.d} [1], [x7], x2
+  ld1 {v1.d} [1], [x7]
+  ld1 {v2.d} [1], [x1], x2
+  ld1 {v3.d} [1], [x1]
+
+  ld4r {v18.8b, v19.8b, v20.8b, v21.8b}, [x5]
+  trn1 v18.4h, v18.4h, v19.4h //0011,0011,
+  trn1 v20.4h, v20.4h, v21.4h //2233,2233
+  zip1 v6.4s, v18.4s, v20.4s //iTc0: 0011,2233,0011,2233
+  cmgt v7.16b, v6.16b, #0 // iTc0 Flag
+
+  MASK_MATRIX	v0, v1, v2, v3, v16, v17, v18
+  and	v7.16b, v7.16b, v18.16b // need filter flag
+
+  ZERO_JUMP_END v7, x4, x5, DeblockChromaLt4V_AArch64_neon_end
+
+  eor	v18.16b, v18.16b, v18.16b
+  sub v18.16b, v18.16b, v6.16b //-iTc0: 0011,2233,0011,2233
+
+  DIFF_LUMA_LT4_P0_Q0_1	v0, v1, v2, v3, v19, v20, v22
+  DIFF_LUMA_LT4_P0_Q0_2	v0, v1, v2, v3, v19, v20, v22
+
+  smax	v19.16b, v19.16b, v18.16b
+  smin	v19.16b, v19.16b, v6.16b
+  and     v19.16b, v19.16b, v7.16b
+
+  EXTRACT_DELTA_INTO_TWO_PART	v19, v20
+  uqadd	v1.16b, v1.16b, v20.16b
+  uqsub	v1.16b, v1.16b, v19.16b
+  st1     {v1.d} [0], [x6], x2
+  st1     {v1.d} [1], [x7], x2
+  uqsub	v2.16b, v2.16b, v20.16b
+  uqadd	v2.16b, v2.16b, v19.16b
+  st1     {v2.d} [0], [x6]
+  st1     {v2.d} [1], [x7]
+DeblockChromaLt4V_AArch64_neon_end:
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN DeblockChromaLt4H_AArch64_neon //uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iAlpha, int32_t iBeta, int8_t* pTc
+  dup v16.16b, w3 //alpha
+  dup v17.16b, w4 //beta
+  sub x6, x0, #2 //pPixCb-2
+  sub x7, x1, #2 //pPixCr-2
+
+  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x6, 0
+  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x6, 1
+  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x6, 2
+  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x6, 3
+  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x6, 4
+  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x6, 5
+  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x6, 6
+  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x6, 7
+
+  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x7, 8
+  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x7, 9
+  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x7, 10
+  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x7, 11
+  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x7, 12
+  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x7, 13
+  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x7, 14
+  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x7, 15
+
+  sub x0, x0, #1
+  sub x1, x1, #1
+
+  ld4r {v18.8b, v19.8b, v20.8b, v21.8b}, [x5]
+  trn1 v18.4h, v18.4h, v19.4h //0011,0011,
+  trn1 v20.4h, v20.4h, v21.4h //2233,2233
+  zip1 v6.4s, v18.4s, v20.4s //iTc0: 0011,2233,0011,2233
+  cmgt v7.16b, v6.16b, #0 // iTc0 Flag
+
+  MASK_MATRIX	v0, v1, v2, v3, v16, v17, v18
+  and	v7.16b, v7.16b, v18.16b // need filter flag
+
+  ZERO_JUMP_END v7, x4, x5, DeblockChromaLt4H_AArch64_neon_end
+  eor	v18.16b, v18.16b, v18.16b
+  sub v18.16b, v18.16b, v6.16b //-iTc0: 0011,2233,0011,2233
+
+  DIFF_LUMA_LT4_P0_Q0_1	v0, v1, v2, v3, v19, v20, v22
+  DIFF_LUMA_LT4_P0_Q0_2	v0, v1, v2, v3, v19, v20, v22
+
+  smax	v19.16b, v19.16b, v18.16b
+  smin	v19.16b, v19.16b, v6.16b
+  and     v19.16b, v19.16b, v7.16b
+
+  EXTRACT_DELTA_INTO_TWO_PART	v19, v20
+  uqadd	v1.16b, v1.16b, v20.16b
+  uqsub	v1.16b, v1.16b, v19.16b
+  uqsub	v2.16b, v2.16b, v20.16b
+  uqadd	v2.16b, v2.16b, v19.16b
+
+  STORE_CHROMA_DATA_2 v1, v2, x0, 0
+  STORE_CHROMA_DATA_2 v1, v2, x0, 1
+  STORE_CHROMA_DATA_2 v1, v2, x0, 2
+  STORE_CHROMA_DATA_2 v1, v2, x0, 3
+  STORE_CHROMA_DATA_2 v1, v2, x0, 4
+  STORE_CHROMA_DATA_2 v1, v2, x0, 5
+  STORE_CHROMA_DATA_2 v1, v2, x0, 6
+  STORE_CHROMA_DATA_2 v1, v2, x0, 7
+
+  STORE_CHROMA_DATA_2 v1, v2, x1, 8
+  STORE_CHROMA_DATA_2 v1, v2, x1, 9
+  STORE_CHROMA_DATA_2 v1, v2, x1, 10
+  STORE_CHROMA_DATA_2 v1, v2, x1, 11
+  STORE_CHROMA_DATA_2 v1, v2, x1, 12
+  STORE_CHROMA_DATA_2 v1, v2, x1, 13
+  STORE_CHROMA_DATA_2 v1, v2, x1, 14
+  STORE_CHROMA_DATA_2 v1, v2, x1, 15
+DeblockChromaLt4H_AArch64_neon_end:
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN DeblockChromaEq4V_AArch64_neon //uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iAlpha, int32_t iBeta
+  dup v16.16b, w3 //alpha
+  dup v17.16b, w4 //beta
+  lsl x3, x2, #1
+  sub x6, x0, x3 //pPixCb-2*Stride
+  sub x7, x1, x3 //pPixCr-2*Stride
+
+  ld1 {v0.d} [0], [x6], x2
+  ld1 {v1.d} [0], [x6]
+  ld1 {v2.d} [0], [x0], x2
+  ld1 {v3.d} [0], [x0]
+  ld1 {v0.d} [1], [x7], x2
+  ld1 {v1.d} [1], [x7]
+  ld1 {v2.d} [1], [x1], x2
+  ld1 {v3.d} [1], [x1]
+
+  MASK_MATRIX	v0, v1, v2, v3, v16, v17, v7
+
+  ZERO_JUMP_END v7, x3, x4, DeblockChromaEq4V_AArch64_neon_end
+
+  DIFF_CHROMA_EQ4_P0Q0_1 v0, v1, v2, v3, v18, v19, v20, v21
+  DIFF_CHROMA_EQ4_P0Q0_2 v0, v1, v2, v3, v18, v19, v20, v21
+
+  mov.16b v6, v7
+  bsl v6.16b, v20.16b, v1.16b
+  bsl v7.16b, v21.16b, v2.16b
+
+  st1     {v6.d} [0], [x6], x2
+  st1     {v6.d} [1], [x7], x2
+
+  st1     {v7.d} [0], [x6]
+  st1     {v7.d} [1], [x7]
+DeblockChromaEq4V_AArch64_neon_end:
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN DeblockChromaEq4H_AArch64_neon //uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iAlpha, int32_t iBeta
+  dup v16.16b, w3 //alpha
+  dup v17.16b, w4 //beta
+
+  sub x6, x0, #2 //pPixCb-2
+  sub x7, x1, #2 //pPixCr-2
+
+  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x6, 0
+  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x6, 1
+  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x6, 2
+  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x6, 3
+  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x6, 4
+  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x6, 5
+  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x6, 6
+  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x6, 7
+
+  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x7, 8
+  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x7, 9
+  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x7, 10
+  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x7, 11
+  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x7, 12
+  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x7, 13
+  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x7, 14
+  LOAD_CHROMA_DATA_4		v0, v1, v2, v3, x7, 15
+  sub x0, x0, #1
+  sub x1, x1, #1
+
+  MASK_MATRIX	v0, v1, v2, v3, v16, v17, v7
+
+  ZERO_JUMP_END v7, x3, x4, DeblockChromaEq4H_AArch64_neon_end
+
+  DIFF_CHROMA_EQ4_P0Q0_1 v0, v1, v2, v3, v18, v19, v20, v21
+  DIFF_CHROMA_EQ4_P0Q0_2 v0, v1, v2, v3, v18, v19, v20, v21
+
+  mov.16b v6, v7
+  bsl v6.16b, v20.16b, v1.16b
+  bsl v7.16b, v21.16b, v2.16b
+
+  STORE_CHROMA_DATA_2 v6, v7, x0, 0
+  STORE_CHROMA_DATA_2 v6, v7, x0, 1
+  STORE_CHROMA_DATA_2 v6, v7, x0, 2
+  STORE_CHROMA_DATA_2 v6, v7, x0, 3
+  STORE_CHROMA_DATA_2 v6, v7, x0, 4
+  STORE_CHROMA_DATA_2 v6, v7, x0, 5
+  STORE_CHROMA_DATA_2 v6, v7, x0, 6
+  STORE_CHROMA_DATA_2 v6, v7, x0, 7
+
+  STORE_CHROMA_DATA_2 v6, v7, x1, 8
+  STORE_CHROMA_DATA_2 v6, v7, x1, 9
+  STORE_CHROMA_DATA_2 v6, v7, x1, 10
+  STORE_CHROMA_DATA_2 v6, v7, x1, 11
+  STORE_CHROMA_DATA_2 v6, v7, x1, 12
+  STORE_CHROMA_DATA_2 v6, v7, x1, 13
+  STORE_CHROMA_DATA_2 v6, v7, x1, 14
+  STORE_CHROMA_DATA_2 v6, v7, x1, 15
+DeblockChromaEq4H_AArch64_neon_end:
+WELS_ASM_ARCH64_FUNC_END
+
+
+WELS_ASM_ARCH64_FUNC_BEGIN DeblockingBSCalcEnc_AArch64_neon
+  // Checking the nzc status
+  BS_NZC_CHECK x0, x2, x3, v16, v17 //v16,v17 save the nzc status
+  // For checking bS[I] = 2
+  movi     v0.16b, #0
+  cmgt     v16.16b, v16.16b, v0.16b
+  cmgt     v17.16b, v17.16b, v0.16b
+  movi     v0.16b, #2
+
+  and  v16.16b, v16.16b, v0.16b //v16 save the nzc check result all the time --- for dir is top
+  and  v17.16b, v17.16b, v0.16b //v17 save the nzc check result all the time --- for dir is left
+
+  // Checking the mv status
+  BS_MV_CHECK x1, x2, x3, v18, v19, v5 , v6 //v18, v19 save the mv status
+  // For checking bS[I] = 1
+  movi   v0.16b, #1
+  and  v18.16b, v18.16b, v0.16b //v18 save the nzc check result all the time --- for dir is top
+  and  v19.16b, v19.16b, v0.16b //v19 save the nzc check result all the time --- for dir is left
+  // Check bS[I] is '1' or '2'
+  umax v1.16b, v18.16b, v16.16b
+  umax v0.16b, v19.16b, v17.16b
+  st1 {v0.16b, v1.16b}, [x4]
+WELS_ASM_ARCH64_FUNC_END
+
+
+#endif
--- a/codec/common/inc/deblocking_common.h
+++ b/codec/common/inc/deblocking_common.h
@@ -50,6 +50,18 @@
 void DeblockChromaEq4H_neon (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
 #endif
 
+#if defined(HAVE_NEON_AARCH64)
+void DeblockLumaLt4V_AArch64_neon (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
+void DeblockLumaEq4V_AArch64_neon (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockLumaLt4H_AArch64_neon (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
+void DeblockLumaEq4H_AArch64_neon (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockChromaLt4V_AArch64_neon (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
+                                 int8_t* pTC);
+void DeblockChromaEq4V_AArch64_neon (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockChromaLt4H_AArch64_neon (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
+                                 int8_t* pTC);
+void DeblockChromaEq4H_AArch64_neon (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+#endif
 #if defined(__cplusplus)
 }
 #endif//__cplusplus
--- a/codec/common/targets.mk
+++ b/codec/common/targets.mk
@@ -39,6 +39,7 @@
 
 ifeq ($(ASM_ARCH), arm64)
 COMMON_ASM_ARM64_SRCS=\
+	$(COMMON_SRCDIR)/arm64/deblocking_aarch64_neon.S\
 	$(COMMON_SRCDIR)/arm64/expand_picture_aarch64_neon.S\
 	$(COMMON_SRCDIR)/arm64/mc_aarch64_neon.S\
 
--- a/codec/decoder/core/src/deblocking.cpp
+++ b/codec/decoder/core/src/deblocking.cpp
@@ -732,6 +732,20 @@
     pFunc->pfChromaDeblockingEQ4Hor      = DeblockChromaEq4H_neon;
   }
 #endif
+
+#if defined(HAVE_NEON_AARCH64)
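+  // AArch64 NEON deblocking routines, selected with the same WELS_CPU_NEON flag as the 32-bit path above.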
+  if (iCpu & WELS_CPU_NEON) {
+    pFunc->pfLumaDeblockingLT4Ver       = DeblockLumaLt4V_AArch64_neon;
+    pFunc->pfLumaDeblockingEQ4Ver       = DeblockLumaEq4V_AArch64_neon;
+    pFunc->pfLumaDeblockingLT4Hor       = DeblockLumaLt4H_AArch64_neon;
+    pFunc->pfLumaDeblockingEQ4Hor       = DeblockLumaEq4H_AArch64_neon;
+
+    pFunc->pfChromaDeblockingLT4Ver     = DeblockChromaLt4V_AArch64_neon;
+    pFunc->pfChromaDeblockingEQ4Ver     = DeblockChromaEq4V_AArch64_neon;
+    pFunc->pfChromaDeblockingLT4Hor     = DeblockChromaLt4H_AArch64_neon;
+    pFunc->pfChromaDeblockingEQ4Hor     = DeblockChromaEq4H_AArch64_neon;
+  }
+#endif
 }
 
 } // namespace WelsDec
--- a/codec/encoder/core/inc/deblocking.h
+++ b/codec/encoder/core/inc/deblocking.h
@@ -69,6 +69,10 @@
 void DeblockingBSCalcEnc_neon (int8_t* pNzc, SMVUnitXY* pMv, int32_t iBoundryFlag, int32_t iMbStride,
                                uint8_t (*pBS)[4][4]);
 #endif
+#if defined(HAVE_NEON_AARCH64)
+void WelsNonZeroCount_AArch64_neon (int8_t* pNonZeroCount);
+void DeblockingBSCalcEnc_AArch64_neon (int8_t* pNzc, SMVUnitXY* pMv, int32_t iBoundryFlag, int32_t iMbStride,
+                                       uint8_t (*pBS)[4][4]);
+#endif
 #if defined(__cplusplus)
 }
 #endif//__cplusplus
--- a/codec/encoder/core/src/deblocking.cpp
+++ b/codec/encoder/core/src/deblocking.cpp
@@ -573,6 +573,27 @@
 }
 #endif
 
+#if defined(HAVE_NEON_AARCH64) && defined(SINGLE_REF_FRAME)
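+// Mirrors DeblockingBSCalc_c below, with the per-edge nzc/MV boundary-strength core done in NEON.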
+void DeblockingBSCalc_AArch64_neon (SWelsFuncPtrList* pFunc, SMB* pCurMb, uint8_t uiBS[2][4][4], Mb_Type uiCurMbType,
+                                    int32_t iMbStride, int32_t iLeftFlag, int32_t iTopFlag) {
+  DeblockingBSCalcEnc_AArch64_neon (pCurMb->pNonZeroCount, pCurMb->sMv, pCurMb->uiNeighborAvail, iMbStride, uiBS);
+  if (iLeftFlag) {
+    if (IS_INTRA ((pCurMb - 1)->uiMbType)) {
+      * (uint32_t*)uiBS[0][0] = 0x04040404;
+    }
+  } else {
+    * (uint32_t*)uiBS[0][0] = 0;
+  }
+  if (iTopFlag) {
+    if (IS_INTRA ((pCurMb - iMbStride)->uiMbType)) {
+      * (uint32_t*)uiBS[1][0] = 0x04040404;
+    }
+  } else {
+    * (uint32_t*)uiBS[1][0] = 0;
+  }
+}
+#endif
+
 void DeblockingBSCalc_c (SWelsFuncPtrList* pFunc, SMB* pCurMb, uint8_t uiBS[2][4][4], Mb_Type uiCurMbType,
                          int32_t iMbStride, int32_t iLeftFlag, int32_t iTopFlag) {
   if (iLeftFlag) {
@@ -765,6 +786,11 @@
     *pfSetNZCZero = WelsNonZeroCount_neon;
   }
 #endif
+#ifdef HAVE_NEON_AARCH64
+  if (iCpu & WELS_CPU_NEON) {
+    *pfSetNZCZero = WelsNonZeroCount_AArch64_neon;
+  }
+#endif
 }
 
 void  DeblockingInit (DeblockingFunc*   pFunc,  int32_t iCpu) {
@@ -810,6 +836,24 @@
     pFunc->pfDeblockingBSCalc           = DeblockingBSCalc_neon;
 #endif
   }
+#endif
+
+#if defined(HAVE_NEON_AARCH64)
+  if (iCpu & WELS_CPU_NEON) {
+    pFunc->pfLumaDeblockingLT4Ver       = DeblockLumaLt4V_AArch64_neon;
+    pFunc->pfLumaDeblockingEQ4Ver       = DeblockLumaEq4V_AArch64_neon;
+    pFunc->pfLumaDeblockingLT4Hor       = DeblockLumaLt4H_AArch64_neon;
+    pFunc->pfLumaDeblockingEQ4Hor       = DeblockLumaEq4H_AArch64_neon;
+
+    pFunc->pfChromaDeblockingLT4Ver     = DeblockChromaLt4V_AArch64_neon;
+    pFunc->pfChromaDeblockingEQ4Ver     = DeblockChromaEq4V_AArch64_neon;
+    pFunc->pfChromaDeblockingLT4Hor     = DeblockChromaLt4H_AArch64_neon;
+    pFunc->pfChromaDeblockingEQ4Hor     = DeblockChromaEq4H_AArch64_neon;
+
+#if defined(SINGLE_REF_FRAME)
+    pFunc->pfDeblockingBSCalc           = DeblockingBSCalc_AArch64_neon;
+#endif
+  }
 #endif
 }
 
--- a/test/decoder/DecUT_Deblock.cpp
+++ b/test/decoder/DecUT_Deblock.cpp
@@ -127,3 +127,20 @@
 GENERATE_CHROMA_UT (ChromaEq4V_neon, DeblockChromaEq4V_neon_wrap, DeblockChromaEq4V_c_wrap, WELS_CPU_NEON, 0)
 GENERATE_CHROMA_UT (ChromaEq4H_neon, DeblockChromaEq4H_neon_wrap, DeblockChromaEq4H_c_wrap, WELS_CPU_NEON, 1)
 #endif
+
+#if defined(HAVE_NEON_AARCH64)
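+// Run each AArch64 NEON routine against its C reference; the Eq4 variants are wrapped to the common UT signature.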
+WRAP_LUMA_FUNC (DeblockLumaEq4V_AArch64_neon)
+WRAP_LUMA_FUNC (DeblockLumaEq4H_AArch64_neon)
+WRAP_CHROMA_FUNC (DeblockChromaEq4V_AArch64_neon)
+WRAP_CHROMA_FUNC (DeblockChromaEq4H_AArch64_neon)
+
+GENERATE_LUMA_UT (LumaLt4V_AArch64_neon, DeblockLumaLt4V_AArch64_neon, DeblockLumaLt4V_c, WELS_CPU_NEON, 0)
+GENERATE_LUMA_UT (LumaLt4H_AArch64_neon, DeblockLumaLt4H_AArch64_neon, DeblockLumaLt4H_c, WELS_CPU_NEON, 1)
+GENERATE_LUMA_UT (LumaEq4V_AArch64_neon, DeblockLumaEq4V_AArch64_neon_wrap, DeblockLumaEq4V_c_wrap, WELS_CPU_NEON, 0)
+GENERATE_LUMA_UT (LumaEq4H_AArch64_neon, DeblockLumaEq4H_AArch64_neon_wrap, DeblockLumaEq4H_c_wrap, WELS_CPU_NEON, 1)
+
+GENERATE_CHROMA_UT (ChromaLt4V_AArch64_neon, DeblockChromaLt4V_AArch64_neon, DeblockChromaLt4V_c, WELS_CPU_NEON, 0)
+GENERATE_CHROMA_UT (ChromaLt4H_AArch64_neon, DeblockChromaLt4H_AArch64_neon, DeblockChromaLt4H_c, WELS_CPU_NEON, 1)
+GENERATE_CHROMA_UT (ChromaEq4V_AArch64_neon, DeblockChromaEq4V_AArch64_neon_wrap, DeblockChromaEq4V_c_wrap, WELS_CPU_NEON, 0)
+GENERATE_CHROMA_UT (ChromaEq4H_AArch64_neon, DeblockChromaEq4H_AArch64_neon_wrap, DeblockChromaEq4H_c_wrap, WELS_CPU_NEON, 1)
+#endif