shithub: openh264

Download patch

ref: fa0eee24702064ca17e301c726118570699b24e2
parent: 777a3951aaf653da34c924fe56f76ca5956cf773
parent: cfc2b95f5928bb4bae6a456110d17cd55766ca6e
author: huili2 <[email protected]>
date: Sat Jun 21 03:03:24 EDT 2014

Merge pull request #990 from zhilwang/arm64-intrapred

Add Arm64 neon Intra-pred code

--- a/codec/build/iOS/dec/welsdec/welsdec.xcodeproj/project.pbxproj
+++ b/codec/build/iOS/dec/welsdec/welsdec.xcodeproj/project.pbxproj
@@ -7,6 +7,7 @@
 	objects = {
 
 /* Begin PBXBuildFile section */
+		4CBC1B81194AC4E100214D9E /* intra_pred_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CBC1B80194AC4E100214D9E /* intra_pred_aarch64_neon.S */; };
 		4CE4427D18B6FC360017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE4427C18B6FC360017DF25 /* Foundation.framework */; };
 		4CE4468A18BC5EAB0017DF25 /* au_parser.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4466718BC5EAA0017DF25 /* au_parser.cpp */; };
 		4CE4468B18BC5EAB0017DF25 /* bit_stream.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4466818BC5EAA0017DF25 /* bit_stream.cpp */; };
@@ -48,6 +49,7 @@
 /* End PBXCopyFilesBuildPhase section */
 
 /* Begin PBXFileReference section */
+		4CBC1B80194AC4E100214D9E /* intra_pred_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = intra_pred_aarch64_neon.S; path = arm64/intra_pred_aarch64_neon.S; sourceTree = "<group>"; };
 		4CE4427918B6FC360017DF25 /* libwelsdec.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libwelsdec.a; sourceTree = BUILT_PRODUCTS_DIR; };
 		4CE4427C18B6FC360017DF25 /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; };
 		4CE4428D18B6FC360017DF25 /* UIKit.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = UIKit.framework; path = Library/Frameworks/UIKit.framework; sourceTree = DEVELOPER_DIR; };
@@ -127,6 +129,14 @@
 /* End PBXFrameworksBuildPhase section */
 
 /* Begin PBXGroup section */
+		4CBC1B7F194AC4A400214D9E /* arm64 */ = {
+			isa = PBXGroup;
+			children = (
+				4CBC1B80194AC4E100214D9E /* intra_pred_aarch64_neon.S */,
+			);
+			name = arm64;
+			sourceTree = "<group>";
+		};
 		4CE4427018B6FC360017DF25 = {
 			isa = PBXGroup;
 			children = (
@@ -166,6 +176,7 @@
 		4CE4463F18BC5EAA0017DF25 /* core */ = {
 			isa = PBXGroup;
 			children = (
+				4CBC1B7F194AC4A400214D9E /* arm64 */,
 				4CE447A518BC6BE90017DF25 /* arm */,
 				4CE4464418BC5EAA0017DF25 /* inc */,
 				4CE4466618BC5EAA0017DF25 /* src */,
@@ -343,6 +354,7 @@
 				4CE4469418BC5EAB0017DF25 /* get_intra_predictor.cpp in Sources */,
 				9AED66561946A1DE009A3567 /* welsCodecTrace.cpp in Sources */,
 				F0B204FC18FD23D8005DA23F /* error_concealment.cpp in Sources */,
+				4CBC1B81194AC4E100214D9E /* intra_pred_aarch64_neon.S in Sources */,
 				4CE4469018BC5EAB0017DF25 /* decoder_core.cpp in Sources */,
 				4CE447AE18BC6BE90017DF25 /* intra_pred_neon.S in Sources */,
 				4CE4469618BC5EAB0017DF25 /* mc.cpp in Sources */,
@@ -458,6 +470,7 @@
 					"$(SRCROOT)/../../../../common/inc",
 					"$(SRCROOT)/../../../../api/svc",
 					"$(SRCROOT)/../../../../common/arm",
+					"$(SRCROOT)/../../../../common/arm64",
 				);
 				IPHONEOS_DEPLOYMENT_TARGET = 6.1;
 				ONLY_ACTIVE_ARCH = NO;
@@ -493,6 +506,7 @@
 					"$(SRCROOT)/../../../../common/inc",
 					"$(SRCROOT)/../../../../api/svc",
 					"$(SRCROOT)/../../../../common/arm",
+					"$(SRCROOT)/../../../../common/arm64",
 				);
 				IPHONEOS_DEPLOYMENT_TARGET = 6.1;
 				ONLY_ACTIVE_ARCH = NO;
--- a/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj
+++ b/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj
@@ -12,7 +12,8 @@
 		4C34067018C57D0400DFA14A /* memory_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066918C57D0400DFA14A /* memory_neon.S */; };
 		4C34067118C57D0400DFA14A /* pixel_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066A18C57D0400DFA14A /* pixel_neon.S */; };
 		4C34067218C57D0400DFA14A /* reconstruct_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34066B18C57D0400DFA14A /* reconstruct_neon.S */; };
-		4CB8F2B419235FC5005D6386 /* pixel_neon_aarch64.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CB8F2B319235FC5005D6386 /* pixel_neon_aarch64.S */; };
+		4CB8F2B419235FC5005D6386 /* pixel_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CB8F2B319235FC5005D6386 /* pixel_aarch64_neon.S */; };
+		4CBC1B83194ACBB400214D9E /* intra_pred_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CBC1B82194ACBB400214D9E /* intra_pred_aarch64_neon.S */; };
 		4CE4431518B6FFA00017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE4431418B6FFA00017DF25 /* Foundation.framework */; };
 		4CE4470E18BC605C0017DF25 /* au_set.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE446DD18BC605C0017DF25 /* au_set.cpp */; };
 		4CE4470F18BC605C0017DF25 /* deblocking.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE446DE18BC605C0017DF25 /* deblocking.cpp */; };
@@ -65,7 +66,8 @@
 		4C34066918C57D0400DFA14A /* memory_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = memory_neon.S; sourceTree = "<group>"; };
 		4C34066A18C57D0400DFA14A /* pixel_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = pixel_neon.S; sourceTree = "<group>"; };
 		4C34066B18C57D0400DFA14A /* reconstruct_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = reconstruct_neon.S; sourceTree = "<group>"; };
-		4CB8F2B319235FC5005D6386 /* pixel_neon_aarch64.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = pixel_neon_aarch64.S; path = arm64/pixel_neon_aarch64.S; sourceTree = "<group>"; };
+		4CB8F2B319235FC5005D6386 /* pixel_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = pixel_aarch64_neon.S; path = arm64/pixel_aarch64_neon.S; sourceTree = "<group>"; };
+		4CBC1B82194ACBB400214D9E /* intra_pred_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = intra_pred_aarch64_neon.S; path = arm64/intra_pred_aarch64_neon.S; sourceTree = "<group>"; };
 		4CDBFB9D18E5068D0025A767 /* wels_transpose_matrix.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = wels_transpose_matrix.h; sourceTree = "<group>"; };
 		4CE4431118B6FFA00017DF25 /* libwelsenc.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libwelsenc.a; sourceTree = BUILT_PRODUCTS_DIR; };
 		4CE4431418B6FFA00017DF25 /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; };
@@ -180,7 +182,8 @@
 		4CB8F2B219235FAC005D6386 /* arm64 */ = {
 			isa = PBXGroup;
 			children = (
-				4CB8F2B319235FC5005D6386 /* pixel_neon_aarch64.S */,
+				4CBC1B82194ACBB400214D9E /* intra_pred_aarch64_neon.S */,
+				4CB8F2B319235FC5005D6386 /* pixel_aarch64_neon.S */,
 			);
 			name = arm64;
 			sourceTree = "<group>";
@@ -421,6 +424,7 @@
 				4C34066E18C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S in Sources */,
 				4CE4472B18BC605C0017DF25 /* wels_preprocess.cpp in Sources */,
 				4CE4470E18BC605C0017DF25 /* au_set.cpp in Sources */,
+				4CBC1B83194ACBB400214D9E /* intra_pred_aarch64_neon.S in Sources */,
 				4CE4471718BC605C0017DF25 /* mc.cpp in Sources */,
 				4CE4472918BC605C0017DF25 /* svc_set_mb_syn_cavlc.cpp in Sources */,
 				4CE4471818BC605C0017DF25 /* md.cpp in Sources */,
@@ -428,7 +432,7 @@
 				4CE4471918BC605C0017DF25 /* memory_align.cpp in Sources */,
 				4CE4472418BC605C0017DF25 /* svc_enc_slice_segment.cpp in Sources */,
 				4CE4472318BC605C0017DF25 /* svc_base_layer_md.cpp in Sources */,
-				4CB8F2B419235FC5005D6386 /* pixel_neon_aarch64.S in Sources */,
+				4CB8F2B419235FC5005D6386 /* pixel_aarch64_neon.S in Sources */,
 				4CE4471E18BC605C0017DF25 /* ratectl.cpp in Sources */,
 				4C34066D18C57D0400DFA14A /* intra_pred_neon.S in Sources */,
 				4CE4471C18BC605C0017DF25 /* picture_handle.cpp in Sources */,
--- /dev/null
+++ b/codec/decoder/core/arm64/intra_pred_aarch64_neon.S
@@ -1,0 +1,525 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifdef HAVE_NEON_AARCH64
+.text
+#include "arm_arch64_common_macro.S"
+
+// for Luma 4x4
+WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredH_AArch64_neon // 4x4 horizontal: x0 = pPred, w1 = kiStride
+    sub     x9, x0, #1                  // x9 -> pixel immediately left of row 0
+    sxtw    x1, w1                      // stride as a signed 64-bit post-index
+.rept 4
+    ld1r    {v16.8b}, [x9], x1          // broadcast the left neighbour of this row, go one row down
+    st1     {v16.s}[0], [x0], x1        // write 4 copies as the predicted row, go one row down
+.endr
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredDc_AArch64_neon // 4x4 DC from 4 top + 4 left neighbours; x0 = pPred, w1 = kiStride
+    sxtw    x1, w1                  // x1 = stride sign-extended to 64 bits
+    sub     x2, x0, x1              // x2 -> row above the block
+    sub     x3, x0, #1              // x3 -> column left of the block
+    ldr     s0, [x2]                // v0.b[0..3] = four top neighbours
+    ld1     {v0.b}[4], [x3], x1     // v0.b[4..7] = four left neighbours, one per row
+    ld1     {v0.b}[5], [x3], x1
+    ld1     {v0.b}[6], [x3], x1
+    ld1     {v0.b}[7], [x3]
+    uaddlv  h0, v0.8b               // h0 = sum of the 8 neighbours
+    uqrshrn b0, h0, #3              // b0 = (sum + 4) >> 3
+    dup     v0.8b, v0.b[0]          // replicate the DC value across the vector
+.rept 4
+    st1     {v0.S}[0], [x0], x1     // store 4 DC bytes per row, 4 rows
+.endr
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredDcTop_AArch64_neon // 4x4 DC from the 4 top neighbours only; x0 = pPred, w1 = kiStride
+    sxtw    x1, w1                  // x1 = stride sign-extended to 64 bits
+    sub     x2, x0, x1              // x2 -> row above the block
+    sub     v0.8b, v0.8b, v0.8b     // clear v0 so lanes 4..7 add nothing to the sum
+    ldr     s0, [x2]                // v0.b[0..3] = four top neighbours
+    uaddlv  h0, v0.8b               // h0 = sum of the top neighbours
+    uqrshrn v0.8b, v0.8h, #2        // v0.b[0] = (sum + 2) >> 2
+    dup     v0.8b, v0.b[0]          // replicate the DC value across the vector
+.rept 4
+    st1     {v0.S}[0], [x0], x1     // store 4 DC bytes per row, 4 rows
+.endr
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredDDL_AArch64_neon // 4x4 diagonal-down-left from 8 top pixels; x0 = pPred, w1 = kiStride
+    sxtw    x1, w1                      // x1 = stride sign-extended to 64 bits
+    sub     x2, x0, x1                  // x2 -> row above the block
+    ld1     {v0.8b}, [x2]               // v0 = t0..t7 (top and top-right neighbours)
+    dup     v1.8b, v0.b[7]              // v1 = t7 in every lane, used as padding
+    ext     v2.8b, v0.8b, v1.8b, #1     // v2 = t1..t7,t7
+    ext     v3.8b, v0.8b, v1.8b, #2     // v3 = t2..t7,t7,t7
+    ushll   v2.8h, v2.8b, #1            // v2 = 2*t[i+1], widened to 16 bits
+    uaddl   v1.8h, v3.8b, v0.8b         // v1 = t[i] + t[i+2]
+    add     v1.8h, v1.8h, v2.8h         // v1 = t[i] + 2*t[i+1] + t[i+2]
+    uqrshrn v1.8b, v1.8h, #2            // v1.b[i] = (t[i] + 2*t[i+1] + t[i+2] + 2) >> 2
+    st1     {v1.S}[0], [x0], x1         // row 0 = v1.b[0..3]
+    ext     v0.8b, v1.8b, v2.8b, #1     // low 4 bytes = v1.b[1..4]; bytes taken from v2 are never stored
+    st1     {v0.S}[0], [x0], x1         // row 1
+    ext     v0.8b, v1.8b, v2.8b, #2     // low 4 bytes = v1.b[2..5]
+    st1     {v0.S}[0], [x0], x1         // row 2
+    ext     v0.8b, v1.8b, v2.8b, #3     // low 4 bytes = v1.b[3..6]
+    st1     {v0.S}[0], [x0]             // row 3
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredDDLTop_AArch64_neon // diagonal-down-left using only t0..t3, with t3 standing in for t4..t7
+    sxtw    x1, w1                      // x1 = stride sign-extended to 64 bits
+    sub     x2, x0, x1                  // x2 -> row above the block
+    ld1     {v0.8b}, [x2]               // reads 8 bytes; only v0.b[0..3] (t0..t3) are kept
+    dup     v1.8b, v0.b[3]              // v1 = t3 in every lane
+    mov     v0.S[1], v1.S[0]            // v0 = t0,t1,t2,t3,t3,t3,t3,t3
+    ext     v2.8b, v0.8b, v1.8b, #1     // v2 = v0 shifted by one byte, padded with t3
+    ext     v3.8b, v0.8b, v1.8b, #2     // v3 = v0 shifted by two bytes, padded with t3
+    ushll   v2.8h, v2.8b, #1            // v2 = 2*t[i+1], widened to 16 bits
+    uaddl   v1.8h, v3.8b, v0.8b         // v1 = t[i] + t[i+2]
+    add     v1.8h, v1.8h, v2.8h         // v1 = t[i] + 2*t[i+1] + t[i+2]
+    uqrshrn v1.8b, v1.8h, #2            // v1.b[i] = (t[i] + 2*t[i+1] + t[i+2] + 2) >> 2
+    st1     {v1.S}[0], [x0], x1         // row 0 = v1.b[0..3]
+    ext     v0.8b, v1.8b, v2.8b, #1     // low 4 bytes = v1.b[1..4]; bytes taken from v2 are never stored
+    st1     {v0.S}[0], [x0], x1         // row 1
+    ext     v0.8b, v1.8b, v2.8b, #2     // low 4 bytes = v1.b[2..5]
+    st1     {v0.S}[0], [x0], x1         // row 2
+    ext     v0.8b, v1.8b, v2.8b, #3     // low 4 bytes = v1.b[3..6]
+    st1     {v0.S}[0], [x0]             // row 3
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredVL_AArch64_neon // 4x4 vertical-left from 8 top pixels; x0 = pPred, w1 = kiStride
+    sxtw    x1, w1                      // x1 = stride sign-extended to 64 bits
+    sub     x2, x0, x1                  // x2 -> row above the block
+    ld1     {v0.8b}, [x2]               // v0 = t0..t7
+    ext     v1.8b, v0.8b, v0.8b, #1     // v1 = t1..t7,t0 (the wrapped lane never reaches a stored byte)
+    uaddl   v1.8h, v1.8b, v0.8b         // v1.h[i] = t[i] + t[i+1]
+    uqrshrn v0.8b, v1.8h, #1            // v0.b[i] = (t[i] + t[i+1] + 1) >> 1 : VL0, VL1, VL2, VL3, VL4, ...
+    ext     v2.16b, v1.16b, v1.16b, #2  // v2.h[i] = v1.h[i+1] = t[i+1] + t[i+2]
+    add     v1.8h, v2.8h, v1.8h         // v1.h[i] = t[i] + 2*t[i+1] + t[i+2]
+    uqrshrn v1.8b, v1.8h, #2            // v1.b[i] = (t[i] + 2*t[i+1] + t[i+2] + 2) >> 2 : VL5, VL6, VL7, VL8, VL9
+    st1     {v0.s}[0], [x0], x1         // row 0 = two-tap values 0..3
+    st1     {v1.s}[0], [x0], x1         // row 1 = three-tap values 0..3
+    ext     v3.8b, v0.8b, v0.8b, #1     // two-tap values shifted left by one pixel
+    ext     v2.8b, v1.8b, v1.8b, #1     // three-tap values shifted left by one pixel
+    st1     {v3.s}[0], [x0], x1         // row 2 = two-tap values 1..4
+    st1     {v2.s}[0], [x0]             // row 3 = three-tap values 1..4
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredVLTop_AArch64_neon // vertical-left using only t0..t3, with t3 standing in for t4..t7
+    sxtw    x1, w1                      // x1 = stride sign-extended to 64 bits
+    sub     x2, x0, x1                  // x2 -> row above the block
+    ld1     {v0.8b}, [x2]               // reads 8 bytes; only v0.b[0..3] (t0..t3) are kept
+    dup     v1.8b, v0.b[3]              // v1 = t3 in every lane
+    mov     v0.s[1], v1.s[0]            // v0 = t0,t1,t2,t3,t3,t3,t3,t3
+    ext     v1.8b, v0.8b, v0.8b, #1     // v1 = v0 rotated by one byte (the wrapped lane never reaches a stored byte)
+    uaddl   v1.8h, v1.8b, v0.8b         // v1.h[i] = t[i] + t[i+1]
+    uqrshrn v0.8b, v1.8h, #1            // v0.b[i] = (t[i] + t[i+1] + 1) >> 1 : VL0, VL1, VL2, VL3, VL4, ...
+    ext     v2.16b, v1.16b, v1.16b, #2  // v2.h[i] = v1.h[i+1] = t[i+1] + t[i+2]
+    add     v1.8h, v2.8h, v1.8h         // v1.h[i] = t[i] + 2*t[i+1] + t[i+2]
+    uqrshrn v1.8b, v1.8h, #2            // v1.b[i] = (t[i] + 2*t[i+1] + t[i+2] + 2) >> 2 : VL5, VL6, VL7, VL8, VL9
+    st1     {v0.s}[0], [x0], x1         // row 0 = two-tap values 0..3
+    st1     {v1.s}[0], [x0], x1         // row 1 = three-tap values 0..3
+    ext     v3.8b, v0.8b, v0.8b, #1     // two-tap values shifted left by one pixel
+    ext     v2.8b, v1.8b, v1.8b, #1     // three-tap values shifted left by one pixel
+    st1     {v3.s}[0], [x0], x1         // row 2 = two-tap values 1..4
+    st1     {v2.s}[0], [x0]             // row 3 = three-tap values 1..4
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredVR_AArch64_neon // 4x4 vertical-right from top, top-left and left neighbours; x0 = pPred, w1 = kiStride
+    sxtw    x1, w1                  // x1 = stride sign-extended to 64 bits
+    sub     x2, x0, x1              // x2 -> row above the block
+    ld1     {v0.s}[1], [x2]         // v0.b[4..7] = t0..t3
+    sub     x2, x2, #1              // x2 -> top-left corner pixel
+    ld1     {v0.b}[3], [x2], x1     // lt, then walk down the left column
+    ld1     {v0.b}[2], [x2], x1     // l0
+    ld1     {v0.b}[1], [x2], x1     // l1
+    ld1     {v0.b}[0], [x2]         // v0.8b l2, l1, l0, lt, t0, t1, t2, t3
+
+    ext     v1.8b, v0.8b, v0.8b, #7 // v1.b[i] = v0.b[i-1] (lane 0 wraps around and is unused)
+    uaddl   v2.8h, v1.8b, v0.8b     //v2:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3}
+    ext     v1.16b, v2.16b, v2.16b, #14 // v1.h[i] = v2.h[i-1]
+    add     v3.8h, v2.8h, v1.8h     //v3:{X,X,L2+2*L1+L0,L1+2*L0+LT,L0+2*LT+T0,LT+2*T0+T1,T0+2*T1+T2,T1+2*T2+T3}
+
+    uqrshrn v3.8b, v3.8h, #2        // three-tap values: (sum + 2) >> 2
+    uqrshrn v2.8b, v2.8h, #1        // two-tap values: (sum + 1) >> 1
+
+    st1     {v2.s}[1], [x0], x1     // row 0 = two-tap lanes 4..7
+    st1     {v3.s}[1], [x0], x1     // row 1 = three-tap lanes 4..7
+
+    ext     v2.8b, v2.8b, v2.8b, #7 // shift row 0 right by one pixel
+    ins     v2.b[4], v3.b[3]        // leftmost pixel = three-tap (L1+2*L0+LT)
+    st1     {v2.s}[1], [x0], x1     // row 2
+
+    ext     v3.8b, v3.8b, v3.8b, #7 // shift row 1 right by one pixel
+    ins     v3.b[4], v3.b[3]        // leftmost pixel = three-tap (L2+2*L1+L0), now sitting in lane 3
+    st1     {v3.s}[1], [x0], x1     // row 3 (the post-increment of x0 is not used afterwards)
+
+WELS_ASM_ARCH64_FUNC_END
+
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredHU_AArch64_neon // 4x4 horizontal-up from the 4 left neighbours; x0 = pPred, w1 = kiStride
+    sxtw    x1, w1                  // x1 = stride sign-extended to 64 bits
+    sub     x2, x0, #1              // x2 -> left neighbour of row 0 (L0)
+    mov     x3, #3
+    mul     x3, x3, x1              // x3 = 3 * stride
+    add     x3, x3, x2              // x3 -> left neighbour of row 3 (L3)
+    ld1r    {v0.8b}, [x3]           // every lane = L3
+    ld1     {v0.b}[4], [x2], x1     // L0
+    ld1     {v0.b}[5], [x2], x1     // L1
+    ld1     {v0.b}[6], [x2], x1     //d0:{L3,L3,L3,L3,L0,L1,L2,L3}
+
+    ext     v1.8b, v0.8b, v0.8b, #1 // v1:{L3,L3,L3,L0,L1,L2,L3,L3}
+    uaddl   v2.8h, v0.8b, v1.8b     //v2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}
+    ext     v3.16b, v2.16b, v2.16b, #2 // v3.h[i] = v2.h[i+1]
+    add     v3.8h, v3.8h, v2.8h     //v3 lanes 4..6: three-tap sums for HU1, HU3, HU5; lane 7 = 4*L3
+
+    uqrshrn v2.8b, v2.8h, #1 // lanes 4..7: HU0, HU2, HU4, L3
+    uqrshrn v3.8b, v3.8h, #2 // lanes 4..7: HU1, HU3, HU5, L3
+    zip2    v3.8b, v2.8b, v3.8b // HU0, HU1, HU2, HU3, HU4, HU5, L3, L3
+    mov     v3.h[3], v0.h[0] // v3.8b is hu0, hu1, hu2, hu3, hu4, hu5, l3, l3
+    ext     v2.8b, v3.8b, v0.8b, #2 // v2 low 4 bytes = hu2, hu3, hu4, hu5
+    st1     {v3.s}[0], [x0], x1     // row 0 = hu0..hu3
+    st1     {v2.s}[0], [x0], x1     // row 1 = hu2..hu5
+    st1     {v3.s}[1], [x0], x1     // row 2 = hu4, hu5, l3, l3
+    st1     {v0.s}[0], [x0]         // row 3 = l3 x 4
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI4x4LumaPredHD_AArch64_neon // 4x4 horizontal-down from left, top-left and top neighbours; x0 = pPred, w1 = kiStride
+    sxtw    x1, w1                  // x1 = stride sign-extended to 64 bits
+    sub     x2, x0, #1
+    sub     x2, x2, x1 // x2 points to top left
+    ld1     {v0.s}[1], [x2], x1     // v0.b[4..7] = lt, t0, t1, t2; x2 -> l0
+    ld1     {v0.b}[3], [x2], x1     // l0
+    ld1     {v0.b}[2], [x2], x1     // l1
+    ld1     {v0.b}[1], [x2], x1     // l2
+    ld1     {v0.b}[0], [x2]         // v0.8b: l3, l2, l1, l0, lt, t0, t1, t2
+    ext     v1.8b, v0.8b, v0.8b, #1 // v1.8b: l2, l1, l0, lt, t0, t1, t2, l3
+    uaddl   v2.8h, v0.8b, v1.8b     // v2.h[i] = v0.b[i] + v0.b[i+1] (lane 7 wraps and is unused)
+    ext     v3.16b, v2.16b, v2.16b, #2 // v3.h[i] = v2.h[i+1]
+    add     v3.8h, v3.8h, v2.8h     // v3.h[i] = v0.b[i] + 2*v0.b[i+1] + v0.b[i+2]
+    uqrshrn v2.8b, v2.8h, #1  // hd8, hd6, hd4, hd0, xxx
+    uqrshrn v3.8b, v3.8h, #2 // hd9, hd7, hd5, hd1, hd2, hd3
+    zip1    v2.8b, v2.8b, v3.8b // hd8, hd9, hd6, hd7, hd4, hd5, hd0, hd1
+    mov     v1.h[0], v3.h[2]        // v1.b[0..1] = hd2, hd3
+    ext     v3.8b, v2.8b, v1.8b, #6 // low 4 bytes = hd0, hd1, hd2, hd3
+    st1     {v3.s}[0], [x0], x1     // row 0
+    st1     {v2.s}[1], [x0], x1     // row 1 = hd4, hd5, hd0, hd1
+    ext     v3.8b, v2.8b, v1.8b, #2 // low 4 bytes = hd6, hd7, hd4, hd5
+    st1     {v3.s}[0], [x0], x1     // row 2
+    st1     {v2.s}[0], [x0]         // row 3 = hd8, hd9, hd6, hd7
+WELS_ASM_ARCH64_FUNC_END
+
+// for Chroma 8x8
+WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderIChromaPredV_AArch64_neon // 8x8 vertical: x0 = pPred, w1 = kiStride
+    sxtw    x1, w1                      // stride as a signed 64-bit offset
+    sub     x9, x0, x1                  // x9 -> row directly above the block
+    ld1     {v16.8b}, [x9]              // v16 = the 8 top neighbours
+.rept   8
+    st1     {v16.8b}, [x0], x1          // copy them into the current row, go one row down
+.endr
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderIChromaPredH_AArch64_neon // 8x8 horizontal: x0 = pPred, w1 = kiStride
+    sub     x9, x0, #1                  // x9 -> pixel immediately left of row 0
+    sxtw    x1, w1                      // stride as a signed 64-bit post-index
+.rept 8
+    ld1r    {v16.8b}, [x9], x1          // broadcast the left neighbour of this row, go one row down
+    st1     {v16.8b}, [x0], x1          // write 8 copies as the predicted row, go one row down
+.endr
+WELS_ASM_ARCH64_FUNC_END
+
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderIChromaPredDc_AArch64_neon // 8x8 DC, one value per 4x4 quadrant, from 8 top + 8 left neighbours
+    sxtw    x1, w1                  // x1 = stride sign-extended to 64 bits
+    sub     x2, x0, x1              // x2 -> row above the block
+    sub     x3, x0, #1              // x3 -> column left of the block
+    ld1     {v0.8b}, [x2]           // v0.b[0..7] = top neighbours
+    ld1     {v0.b}[8], [x3], x1     // v0.b[8..15] = left neighbours, one per row
+    ld1     {v0.b}[9], [x3], x1
+    ld1     {v0.b}[10], [x3], x1
+    ld1     {v0.b}[11], [x3], x1
+    ld1     {v0.b}[12], [x3], x1
+    ld1     {v0.b}[13], [x3], x1
+    ld1     {v0.b}[14], [x3], x1
+    ld1     {v0.b}[15], [x3]
+
+    uaddlp  v1.8h, v0.16b           // pairwise sums of adjacent bytes
+    uaddlp  v2.4s, v1.8h            // v2.4s = {top0-3, top4-7, left0-3, left4-7} sums
+    ins     v3.d[0], v2.d[1]        // v3.2s = {left0-3, left4-7}
+    add     v3.2s, v2.2s, v3.2s     // v3.2s = {top0-3 + left0-3, top4-7 + left4-7}
+    urshr   v2.4s, v2.4s, #2        // four-sample means: (sum + 2) >> 2
+    urshr   v3.2s, v3.2s, #3        // eight-sample means: (sum + 4) >> 3
+
+    dup     v0.8b, v3.b[0]          // top-left quadrant: mean of top0-3 and left0-3
+    dup     v1.8b, v2.b[4]          // top-right quadrant: mean of top4-7
+    dup     v2.8b, v2.b[12]         // bottom-left quadrant: mean of left4-7
+    dup     v3.8b, v3.b[4]          // bottom-right quadrant: mean of top4-7 and left4-7
+    ins     v0.s[1], v1.s[0]        // v0 = upper row pattern {TL x4, TR x4}
+    ins     v2.s[1], v3.s[0]        // v2 = lower row pattern {BL x4, BR x4}
+.rept 4
+    st1     {v0.8b}, [x0], x1       // rows 0..3
+.endr
+.rept 4
+    st1     {v2.8b}, [x0], x1       // rows 4..7
+.endr
+
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderIChromaPredDcTop_AArch64_neon // 8x8 DC from the 8 top neighbours only, one value per 4-pixel column half
+    sxtw    x1, w1                  // x1 = stride sign-extended to 64 bits
+    sub     x2, x0, x1              // x2 -> row above the block
+    ld1     {v0.8b}, [x2]           // v0 = top0..top7
+    uaddlp  v0.4h, v0.8b            // pairwise sums: top0+1, top2+3, top4+5, top6+7
+    addp    v0.8h, v0.8h, v0.8h     // v0.h[0] = sum top0-3, v0.h[1] = sum top4-7
+    dup     v1.8h, v0.h[0]          // left-half sum in every lane
+    dup     v2.8h, v0.h[1]          // right-half sum in every lane
+    mov     v1.D[1], v2.D[0]        // v1 = {left sum x4, right sum x4}
+    uqrshrn  v1.8b, v1.8h, #2       // (sum + 2) >> 2 per lane
+.rept 8
+    st1     {v1.8b}, [x0], x1       // same 8-byte pattern for all 8 rows
+.endr
+WELS_ASM_ARCH64_FUNC_END
+
+.p2align 4 // 2^4 = 16-byte alignment; a plain .align N on AArch64 is a power-of-two exponent, so N = 16 would pad to 64 KiB
+intra_1_to_4: .short 17*1, 17*2, 17*3, 17*4, 17*1, 17*2, 17*3, 17*4 // per-lane multipliers 17*(i+1), loaded as one q register by the chroma plane predictor
+intra_m3_to_p4: .short -3, -2, -1, 0, 1, 2, 3, 4 // column offsets x-3 for x = 0..7, loaded as one q register by the chroma plane predictor
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderIChromaPredPlane_AArch64_neon // 8x8 plane prediction: pred = clip((a + b*(x-3) + c*(y-3) + 16) >> 5)
+    sxtw    x1, w1                  // x1 = stride sign-extended to 64 bits
+    sub     x2, x0, x1
+    sub     x2, x2, #1              // x2 -> top-left corner, walks along the top row
+    mov     x3, x2                  // x3 -> top-left corner, walks down the left column
+    // load pTop[2-i] and pLeft[(2-i)*kiStride]
+    ld1     {v1.b}[3], [x2], #1     // top[-1]
+    ld1     {v1.b}[2], [x2], #1     // top[0]
+    ld1     {v1.b}[1], [x2], #1     // top[1]
+    ld1     {v1.b}[0], [x2], #1     // top[2]
+    ld1     {v1.b}[7], [x3], x1     // left[-1] (the corner again)
+    ld1     {v1.b}[6], [x3], x1     // left[0]
+    ld1     {v1.b}[5], [x3], x1     // left[1]
+    ld1     {v1.b}[4], [x3], x1     // left[2]
+    add     x2, x2, #1              // skip top[3]
+    add     x3, x3, x1              // skip left[3]
+    // load pTop[4+i] and pLeft[(4+i)*kiStride]
+    ld1     {v0.b}[0], [x2], #1     // top[4]
+    ld1     {v0.b}[1], [x2], #1     // top[5]
+    ld1     {v0.b}[2], [x2], #1     // top[6]
+    ld1     {v0.b}[3], [x2], #1     // top[7]
+    ld1     {v0.b}[4], [x3], x1     // left[4]
+    ld1     {v0.b}[5], [x3], x1     // left[5]
+    ld1     {v0.b}[6], [x3], x1     // left[6]
+    ld1     {v0.b}[7], [x3], x1     // left[7]
+
+    uxtl    v1.8h, v1.8b            // widen both sample vectors to 16 bits
+    uxtl    v0.8h, v0.8b
+    ldr     q2, intra_1_to_4        // v2 = 17*(i+1) multipliers for the top and left halves
+    ldr     q3, intra_m3_to_p4      // v3 = {-3..4}, i.e. x-3 for each output column
+    dup     v4.8h, v0.h[3]          // top[7]
+    dup     v5.8h, v0.h[7]          // left[7]
+    add     v4.8h, v4.8h, v5.8h     // top[7] + left[7]
+    sub     v0.8h, v0.8h, v1.8h     // lane i: far sample minus mirrored near sample
+    shl     v4.8h, v4.8h, #4 // v4.8h is a
+    mul     v0.8h, v0.8h, v2.8h // v0.h[0-3] is H, v0.h[4-7] is V
+    saddlp  v0.4s, v0.8h
+    addp    v0.4s, v0.4s, v0.4s // v0.s[0] is H, v0.s[1] is V
+    sqrshrn v0.4h, v0.4s, #5        // b and c: (weighted sum + 16) >> 5
+    dup     v1.8h, v0.h[0]      // v1.8h is b
+    dup     v0.8h, v0.h[1]      // v0.8h is c
+    mla     v4.8h, v1.8h, v3.8h     // v4 = a + b*(x-3)
+    mla     v4.8h, v0.8h, v3.h[0]   // v4 += c*(-3), the row 0 term
+    sqrshrun v1.8b, v4.8h, #5       // (v4 + 16) >> 5, saturated to 0..255
+    st1     {v1.8b}, [x0], x1       // row 0
+.rept 7
+    add     v4.8h, v4.8h, v0.8h     // next row: add c
+    sqrshrun v1.8b, v4.8h, #5
+    st1     {v1.8b}, [x0], x1       // rows 1..7
+.endr
+WELS_ASM_ARCH64_FUNC_END
+
+//for Luma 16x16
+WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredV_AArch64_neon // 16x16 vertical: x0 = pPred, w1 = kiStride
+    sxtw    x1, w1                      // stride as a signed 64-bit offset
+    sub     x9, x0, x1                  // x9 -> row directly above the block
+    ld1     {v16.16b}, [x9]             // v16 = the 16 top neighbours
+.rept 16
+    st1     {v16.16b}, [x0], x1         // copy them into the current row, go one row down
+.endr
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredH_AArch64_neon // 16x16 horizontal: x0 = pPred, w1 = kiStride
+    sub     x9, x0, #1                  // x9 -> pixel immediately left of row 0
+    sxtw    x1, w1                      // stride as a signed 64-bit post-index
+.rept 16
+    ld1r    {v16.16b}, [x9], x1         // broadcast the left neighbour of this row, go one row down
+    st1     {v16.16b}, [x0], x1         // write 16 copies as the predicted row, go one row down
+.endr
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredDc_AArch64_neon // 16x16 DC from 16 top + 16 left neighbours; x0 = pPred, w1 = kiStride
+    sxtw    x1, w1                  // x1 = stride sign-extended to 64 bits
+    sub     x2, x0, x1              // x2 -> row above the block
+    sub     x3, x0, #1              // x3 -> column left of the block
+    ld1     {v0.16b}, [x2]          // v0 = 16 top neighbours
+    ld1     {v1.b}[0], [x3], x1     // v1 = 16 left neighbours, gathered one per row
+    ld1     {v1.b}[1], [x3], x1
+    ld1     {v1.b}[2], [x3], x1
+    ld1     {v1.b}[3], [x3], x1
+    ld1     {v1.b}[4], [x3], x1
+    ld1     {v1.b}[5], [x3], x1
+    ld1     {v1.b}[6], [x3], x1
+    ld1     {v1.b}[7], [x3], x1
+    ld1     {v1.b}[8], [x3], x1
+    ld1     {v1.b}[9], [x3], x1
+    ld1     {v1.b}[10], [x3], x1
+    ld1     {v1.b}[11], [x3], x1
+    ld1     {v1.b}[12], [x3], x1
+    ld1     {v1.b}[13], [x3], x1
+    ld1     {v1.b}[14], [x3], x1
+    ld1     {v1.b}[15], [x3]
+    // reduce instruction
+    uaddlv    h0, v0.16b            // h0 = sum of the top row
+    uaddlv    h1, v1.16b            // h1 = sum of the left column
+    add       v0.8h, v0.8h, v1.8h   // v0.h[0] = total of all 32 neighbours
+    uqrshrn    b0, h0, #5           // b0 = (total + 16) >> 5
+    dup       v0.16b, v0.b[0]       // replicate the DC value across the vector
+.rept 16
+    st1     {v0.16b}, [x0], x1      // fill all 16 rows
+.endr
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredDcTop_AArch64_neon // 16x16 DC from the 16 top neighbours only; x0 = pPred, w1 = kiStride
+    sxtw    x1, w1                  // x1 = stride sign-extended to 64 bits
+    sub     x2, x0, x1              // x2 -> row above the block
+    ld1     {v0.16b}, [x2]          // v0 = 16 top neighbours
+    // reduce instruction
+    uaddlv    h0, v0.16b            // h0 = sum of the top row
+    uqrshrn    v0.8b, v0.8h, 4      // v0.b[0] = (sum + 8) >> 4
+    dup       v0.16b, v0.b[0]       // replicate the DC value across the vector
+.rept 16
+    st1     {v0.16b}, [x0], x1      // fill all 16 rows
+.endr
+WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredDcLeft_AArch64_neon // 16x16 DC from the 16 left neighbours only; x0 = pPred, w1 = kiStride
+    sxtw    x1, w1                  // x1 = stride sign-extended to 64 bits
+    sub     x3, x0, #1              // x3 -> column left of the block
+    ld1     {v1.b}[0], [x3], x1     // v1 = 16 left neighbours, gathered one per row
+    ld1     {v1.b}[1], [x3], x1
+    ld1     {v1.b}[2], [x3], x1
+    ld1     {v1.b}[3], [x3], x1
+    ld1     {v1.b}[4], [x3], x1
+    ld1     {v1.b}[5], [x3], x1
+    ld1     {v1.b}[6], [x3], x1
+    ld1     {v1.b}[7], [x3], x1
+    ld1     {v1.b}[8], [x3], x1
+    ld1     {v1.b}[9], [x3], x1
+    ld1     {v1.b}[10], [x3], x1
+    ld1     {v1.b}[11], [x3], x1
+    ld1     {v1.b}[12], [x3], x1
+    ld1     {v1.b}[13], [x3], x1
+    ld1     {v1.b}[14], [x3], x1
+    ld1     {v1.b}[15], [x3]
+    // reduce instruction
+    uaddlv    h1, v1.16b            // h1 = sum of the left column
+    uqrshrn    v0.8b, v1.8h, #4     // v0.b[0] = (sum + 8) >> 4
+    dup       v0.16b, v0.b[0]       // replicate the DC value across the vector
+.rept 16
+    st1     {v0.16b}, [x0], x1      // fill all 16 rows
+.endr
+WELS_ASM_ARCH64_FUNC_END
+
+
+.p2align 4 // 2^4 = 16-byte alignment; a plain .align N on AArch64 is a power-of-two exponent, so N = 16 would pad to 64 KiB
+intra_1_to_8: .short 5, 10, 15, 20, 25, 30, 35, 40 // per-lane multipliers 5*(i+1), loaded as one q register by the 16x16 plane predictor
+intra_m7_to_p8: .short -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8 // column offsets x-7 for x = 0..15, loaded as two q registers (second one at +16 bytes)
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredPlane_AArch64_neon // 16x16 plane prediction: pred = clip((a + b*(x-7) + c*(y-7) + 16) >> 5)
+    sxtw    x1, w1                  // x1 = stride sign-extended to 64 bits
+    sub     x2, x0, x1
+    sub     x2, x2, #1              // x2 -> top-left corner, later walks down the left column
+    mov     x3, x2                  // x3 -> top-left corner, used for the top row
+    ld1     {v0.8b}, [x3] // v0 low 8 bit in top(reverse order)
+    add     x3, x3, #9              // x3 -> top[8]
+    rev64   v0.8b, v0.8b  // reverse v0
+    ld1     {v1.8b}, [x3] // v1 high 8 bit in top
+    uxtl    v0.8h, v0.8b  // extend to 16 bit integer
+    uxtl    v1.8h, v1.8b  // extend to 16 bit integer
+    ld1     {v2.b}[7], [x2], x1     // left[-1] (the corner), stored in reverse lane order
+    ld1     {v2.b}[6], [x2], x1
+    ld1     {v2.b}[5], [x2], x1
+    ld1     {v2.b}[4], [x2], x1
+    ld1     {v2.b}[3], [x2], x1
+    ld1     {v2.b}[2], [x2], x1
+    ld1     {v2.b}[1], [x2], x1
+    ld1     {v2.b}[0], [x2], x1 // v2.8b low 8 bit in left
+    add     x2, x2, x1              // skip left[7]
+    ld1     {v3.b}[0], [x2], x1     // left[8]
+    ld1     {v3.b}[1], [x2], x1
+    ld1     {v3.b}[2], [x2], x1
+    ld1     {v3.b}[3], [x2], x1
+    ld1     {v3.b}[4], [x2], x1
+    ld1     {v3.b}[5], [x2], x1
+    ld1     {v3.b}[6], [x2], x1
+    ld1     {v3.b}[7], [x2]      // v3.8b high 8bit in left
+    uxtl    v2.8h, v2.8b
+    uxtl    v3.8h, v3.8b
+    sub     v0.8h, v1.8h, v0.8h     // lane i: top[8+i] - top[6-i]
+    sub     v2.8h, v3.8h, v2.8h     // lane i: left[8+i] - left[6-i]
+    ldr     q4, intra_1_to_8        // v4 = 5*(i+1) multipliers
+    mul     v0.8h, v0.8h, v4.8h
+    mul     v2.8h, v2.8h, v4.8h
+    saddlv  s0, v0.8h               // s0 = weighted horizontal gradient sum
+    saddlv  s2, v2.8h               // s2 = weighted vertical gradient sum
+    add     v1.8h, v1.8h, v3.8h     // lane 7 = top[15] + left[15]
+    sqrshrn v0.4h, v0.4S, #6  // b is in v0.h[0]
+    sqrshrn v2.4h, v2.4S, #6  // c is in v2.h[0]
+    shl     v1.8h, v1.8h, #4   // a is in v1.h[7]
+    ldr     q4, intra_m7_to_p8      // v4 = {-7..0}, x-7 for columns 0..7
+    ldr     q5, intra_m7_to_p8 + 16 // v5 = {1..8}, x-7 for columns 8..15
+    dup     v1.8h, v1.h[7]          // v1 = a in every lane
+    dup     v3.8h, v1.h[7]          // v3 = a in every lane
+    mla     v1.8h, v4.8h, v0.h[0]   // v1 = a + b*(x-7), columns 0..7
+    mla     v3.8h, v5.8h, v0.h[0]   // v3 = a + b*(x-7), columns 8..15
+    dup     v2.8h, v2.h[0] // v2.8h is [cccccccc]
+    mla     v1.8h, v2.8h, v4.h[0]   // add c*(-7), the row 0 term
+    mla     v3.8h, v2.8h, v4.h[0]
+    sqrshrun v4.8b, v1.8h, #5       // (value + 16) >> 5, saturated to 0..255
+    sqrshrun2 v4.16b, v3.8h, #5
+    st1     {v4.16b}, [x0], x1      // row 0
+.rept 15
+    add     v1.8h, v1.8h, v2.8h     // next row: add c
+    add     v3.8h, v3.8h, v2.8h
+    sqrshrun v4.8b, v1.8h, #5
+    sqrshrun2 v4.16b, v3.8h, #5
+    st1     {v4.16b}, [x0], x1      // rows 1..15
+.endr
+WELS_ASM_ARCH64_FUNC_END
+#endif
\ No newline at end of file
--- a/codec/decoder/core/inc/get_intra_predictor.h
+++ b/codec/decoder/core/inc/get_intra_predictor.h
@@ -128,6 +128,31 @@
 void WelsDecoderIChromaPredPlane_neon (uint8_t* pPred, const int32_t kiStride);
 #endif//HAVE_NEON
 
+#if defined(HAVE_NEON_AARCH64) // AArch64 NEON intra predictors from intra_pred_aarch64_neon.S; 16x16 luma first
+void WelsDecoderI16x16LumaPredV_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderI16x16LumaPredH_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderI16x16LumaPredDc_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderI16x16LumaPredPlane_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderI16x16LumaPredDcTop_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderI16x16LumaPredDcLeft_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
+// 4x4 luma predictors
+void WelsDecoderI4x4LumaPredH_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderI4x4LumaPredDDL_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderI4x4LumaPredDDLTop_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderI4x4LumaPredVL_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderI4x4LumaPredVLTop_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderI4x4LumaPredVR_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderI4x4LumaPredHU_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderI4x4LumaPredHD_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderI4x4LumaPredDc_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderI4x4LumaPredDcTop_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
+// 8x8 chroma predictors
+void WelsDecoderIChromaPredV_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderIChromaPredH_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderIChromaPredDc_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderIChromaPredPlane_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderIChromaPredDcTop_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
+#endif//HAVE_NEON_AARCH64
 #if defined(__cplusplus)
 }
 #endif//__cplusplus
--- a/codec/decoder/core/src/decoder.cpp
+++ b/codec/decoder/core/src/decoder.cpp
@@ -672,6 +672,35 @@
   }
 #endif//HAVE_NEON
 
+#if defined(HAVE_NEON_AARCH64)
+  if (pCtx->uiCpuFlag & WELS_CPU_NEON) {
+    //pCtx->pIdctResAddPredFunc	= IdctResAddPred_neon;
+
+    pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC] = WelsDecoderI16x16LumaPredDc_AArch64_neon;
+    pCtx->pGetI16x16LumaPredFunc[I16_PRED_P]  = WelsDecoderI16x16LumaPredPlane_AArch64_neon;
+    pCtx->pGetI16x16LumaPredFunc[I16_PRED_H]  = WelsDecoderI16x16LumaPredH_AArch64_neon;
+    pCtx->pGetI16x16LumaPredFunc[I16_PRED_V]  = WelsDecoderI16x16LumaPredV_AArch64_neon;
+    pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC_L]  = WelsDecoderI16x16LumaPredDcLeft_AArch64_neon;
+    pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC_T]  = WelsDecoderI16x16LumaPredDcTop_AArch64_neon;
+
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_H    ] = WelsDecoderI4x4LumaPredH_AArch64_neon;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDL  ] = WelsDecoderI4x4LumaPredDDL_AArch64_neon;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDL_TOP] = WelsDecoderI4x4LumaPredDDLTop_AArch64_neon;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_VL   ] = WelsDecoderI4x4LumaPredVL_AArch64_neon;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_VL_TOP ] = WelsDecoderI4x4LumaPredVLTop_AArch64_neon;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_VR   ] = WelsDecoderI4x4LumaPredVR_AArch64_neon;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_HU   ] = WelsDecoderI4x4LumaPredHU_AArch64_neon;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_HD   ] = WelsDecoderI4x4LumaPredHD_AArch64_neon;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_DC   ] = WelsDecoderI4x4LumaPredDc_AArch64_neon;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_DC_T   ] = WelsDecoderI4x4LumaPredDcTop_AArch64_neon;
+
+    pCtx->pGetIChromaPredFunc[C_PRED_H]       = WelsDecoderIChromaPredH_AArch64_neon;
+    pCtx->pGetIChromaPredFunc[C_PRED_V]       = WelsDecoderIChromaPredV_AArch64_neon;
+    pCtx->pGetIChromaPredFunc[C_PRED_P ]      = WelsDecoderIChromaPredPlane_AArch64_neon;
+    pCtx->pGetIChromaPredFunc[C_PRED_DC]      = WelsDecoderIChromaPredDc_AArch64_neon;
+    pCtx->pGetIChromaPredFunc[C_PRED_DC_T]      = WelsDecoderIChromaPredDcTop_AArch64_neon;
+  }
+#endif//HAVE_NEON_AARCH64
 
 #if defined(X86_ASM)
   if (pCtx->uiCpuFlag & WELS_CPU_MMXEXT) {
--- a/codec/decoder/targets.mk
+++ b/codec/decoder/targets.mk
@@ -39,6 +39,13 @@
 DECODER_OBJS += $(DECODER_ASM_ARM_SRCS:.S=.$(OBJ))
 endif
 
+ifeq ($(ASM_ARCH), arm64)
+DECODER_ASM_ARM64_SRCS=\
+	$(DECODER_SRCDIR)/core/arm64/intra_pred_aarch64_neon.S\
+
+DECODER_OBJS += $(DECODER_ASM_ARM64_SRCS:.S=.$(OBJ))
+endif
+
 OBJS += $(DECODER_OBJS)
 $(DECODER_SRCDIR)/%.$(OBJ): $(DECODER_SRCDIR)/%.cpp
 	$(QUIET_CXX)$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(DECODER_CFLAGS) $(DECODER_INCLUDES) -c $(CXX_O) $<
--- /dev/null
+++ b/codec/encoder/core/arm64/intra_pred_aarch64_neon.S
@@ -1,0 +1,504 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifdef HAVE_NEON_AARCH64
+.text
+#include "arm_arch64_common_macro.S"
+
+// for Luma 4x4
+// Horizontal 4x4 luma prediction: output row r is filled with the byte found
+// at [x1 - 1 + r * stride].
+//   x0: destination, written as 4 consecutive 4-byte rows
+//   x1: reference block pointer, x2: reference stride
+// NOTE(review): assumes the stride is passed as a 32-bit int in w2, so it is
+// sign-extended before being used as an address offset - confirm the prototype.
+WELS_ASM_ARCH64_FUNC_BEGIN WelsI4x4LumaPredH_AArch64_neon
+    sxtw    x2, w2
+    sub     x3, x1, #1              // x3 -> pixel left of the current row
+.rept 4
+    ld1r    {v0.8b}, [x3], x2       // broadcast the left pixel, step to next row
+    st1     {v0.S}[0], [x0], 4
+.endr
+WELS_ASM_ARCH64_FUNC_END
+
+// DC 4x4 luma prediction: every output byte is the rounded mean of the four
+// pixels above the block and the four pixels to its left.
+//   x0: destination (4 consecutive 4-byte rows), x1: reference, x2: stride
+// NOTE(review): assumes the stride is passed as a 32-bit int in w2, so it is
+// sign-extended before being used as an address offset - confirm the prototype.
+WELS_ASM_ARCH64_FUNC_BEGIN WelsI4x4LumaPredDc_AArch64_neon
+    sxtw    x2, w2
+    sub     x3, x1, x2              // x3 -> row above the block
+    sub     x4, x1, #1              // x4 -> column left of the block
+    ldr     s0, [x3]                // lanes 0-3: top pixels
+    ld1     {v0.b}[4], [x4], x2     // lanes 4-7: left pixels
+    ld1     {v0.b}[5], [x4], x2
+    ld1     {v0.b}[6], [x4], x2
+    ld1     {v0.b}[7], [x4]
+    uaddlv  h0, v0.8b               // sum of the eight neighbours
+    uqrshrn b0, h0, #3              // (sum + 4) >> 3
+    dup     v0.8b, v0.b[0]
+.rept 4
+    st1     {v0.S}[0], [x0], 4
+.endr
+WELS_ASM_ARCH64_FUNC_END
+
+// DC 4x4 luma prediction using only the four pixels above the block:
+// every output byte is (t0 + t1 + t2 + t3 + 2) >> 2.
+//   x0: destination (4 consecutive 4-byte rows), x1: reference, x2: stride
+// NOTE(review): assumes the stride is passed as a 32-bit int in w2, so it is
+// sign-extended before being used as an address offset - confirm the prototype.
+WELS_ASM_ARCH64_FUNC_BEGIN WelsI4x4LumaPredDcTop_AArch64_neon
+    sxtw    x2, w2
+    sub     x3, x1, x2              // x3 -> row above the block
+    ldr     s0, [x3]                // scalar load zero-fills lanes 4-15, no explicit clear needed
+    uaddlv  h0, v0.8b
+    uqrshrn v0.8b, v0.8h, #2
+    dup     v0.8b, v0.b[0]
+.rept 4
+    st1     {v0.S}[0], [x0], 4
+.endr
+WELS_ASM_ARCH64_FUNC_END
+
+// Diagonal-down-left 4x4 luma prediction from the eight pixels t0..t7 above
+// and above-right of the block. Filtered value i is
+// (t[i] + 2*t[i+1] + t[i+2] + 2) >> 2 with t7 replicated past the end;
+// output row r holds filtered values r .. r+3.
+//   x0: destination (4 consecutive 4-byte rows), x1: reference, x2: stride
+// NOTE(review): assumes the stride is passed as a 32-bit int in w2, so it is
+// sign-extended before being used as an address offset - confirm the prototype.
+WELS_ASM_ARCH64_FUNC_BEGIN WelsI4x4LumaPredDDL_AArch64_neon
+    sxtw    x2, w2
+    sub     x3, x1, x2
+    ld1     {v0.8b}, [x3]               // t0..t7
+    dup     v1.8b, v0.b[7]              // padding: t7 replicated
+    ext     v2.8b, v0.8b, v1.8b, #1     // t[i+1]
+    ext     v3.8b, v0.8b, v1.8b, #2     // t[i+2]
+    ushll   v2.8h, v2.8b, #1            // 2*t[i+1]
+    uaddl   v1.8h, v3.8b, v0.8b         // t[i] + t[i+2]
+    add     v1.8h, v1.8h, v2.8h
+    uqrshrn v1.8b, v1.8h, #2
+    st1     {v1.S}[0], [x0], 4
+    // Only bytes taken from v1 reach the stores below; v2 is just ext filler.
+    ext     v0.8b, v1.8b, v2.8b, #1
+    st1     {v0.S}[0], [x0], 4
+    ext     v0.8b, v1.8b, v2.8b, #2
+    st1     {v0.S}[0], [x0], 4
+    ext     v0.8b, v1.8b, v2.8b, #3
+    st1     {v0.S}[0], [x0]
+WELS_ASM_ARCH64_FUNC_END
+
+// Diagonal-down-left 4x4 luma prediction that uses only t0..t3 from the row
+// above: the four above-right positions are overwritten with copies of t3
+// before the same (a + 2b + c + 2) >> 2 filter as the full DDL routine.
+//   x0: destination (4 consecutive 4-byte rows), x1: reference, x2: stride
+// NOTE(review): assumes the stride is passed as a 32-bit int in w2, so it is
+// sign-extended before being used as an address offset - confirm the prototype.
+WELS_ASM_ARCH64_FUNC_BEGIN WelsI4x4LumaPredDDLTop_AArch64_neon
+    sxtw    x2, w2
+    sub     x3, x1, x2
+    ld1     {v0.8b}, [x3]
+    dup     v1.8b, v0.b[3]
+    mov     v0.S[1], v1.S[0]            // lanes 4-7 <- t3
+    ext     v2.8b, v0.8b, v1.8b, #1     // t[i+1]
+    ext     v3.8b, v0.8b, v1.8b, #2     // t[i+2]
+    ushll   v2.8h, v2.8b, #1
+    uaddl   v1.8h, v3.8b, v0.8b
+    add     v1.8h, v1.8h, v2.8h
+    uqrshrn v1.8b, v1.8h, #2
+    st1     {v1.S}[0], [x0], 4
+    // Only bytes taken from v1 reach the stores below; v2 is just ext filler.
+    ext     v0.8b, v1.8b, v2.8b, #1
+    st1     {v0.S}[0], [x0], 4
+    ext     v0.8b, v1.8b, v2.8b, #2
+    st1     {v0.S}[0], [x0], 4
+    ext     v0.8b, v1.8b, v2.8b, #3
+    st1     {v0.S}[0], [x0]
+WELS_ASM_ARCH64_FUNC_END
+
+// Vertical-left 4x4 luma prediction from the eight pixels t0..t7 above and
+// above-right of the block. Rows 0 and 2 use the 2-tap average
+// (t[i] + t[i+1] + 1) >> 1, rows 1 and 3 the 3-tap filter
+// (t[i] + 2*t[i+1] + t[i+2] + 2) >> 2; rows 2 and 3 start one pixel further right.
+//   x0: destination (4 consecutive 4-byte rows), x1: reference, x2: stride
+// NOTE(review): assumes the stride is passed as a 32-bit int in w2, so it is
+// sign-extended before being used as an address offset - confirm the prototype.
+WELS_ASM_ARCH64_FUNC_BEGIN WelsI4x4LumaPredVL_AArch64_neon
+    sxtw    x2, w2
+    sub     x3, x1, x2
+    ld1     {v0.8b}, [x3]
+    ext     v1.8b, v0.8b, v0.8b, #1
+    uaddl   v1.8h, v1.8b, v0.8b         // t[i] + t[i+1]
+    uqrshrn v0.8b, v1.8h, #1            // 2-tap averages
+    ext     v2.16b, v1.16b, v1.16b, #2  // pair sums shifted by one lane
+    add     v1.8h, v2.8h, v1.8h         // t[i] + 2*t[i+1] + t[i+2]
+    uqrshrn v1.8b, v1.8h, #2            // 3-tap filtered values
+    st1     {v0.s}[0], [x0], 4          // row 0
+    st1     {v1.s}[0], [x0], 4          // row 1
+    ext     v3.8b, v0.8b, v0.8b, #1
+    ext     v2.8b, v1.8b, v1.8b, #1
+    st1     {v3.s}[0], [x0], 4          // row 2
+    st1     {v2.s}[0], [x0]             // row 3
+WELS_ASM_ARCH64_FUNC_END
+
+// Vertical-left 4x4 luma prediction that uses only t0..t3 from the row above:
+// lanes 4-7 are overwritten with copies of t3, then the same 2-tap / 3-tap
+// filtering as the full VL routine is applied.
+//   x0: destination (4 consecutive 4-byte rows), x1: reference, x2: stride
+// NOTE(review): assumes the stride is passed as a 32-bit int in w2, so it is
+// sign-extended before being used as an address offset - confirm the prototype.
+WELS_ASM_ARCH64_FUNC_BEGIN WelsI4x4LumaPredVLTop_AArch64_neon
+    sxtw    x2, w2
+    sub     x3, x1, x2
+    ld1     {v0.8b}, [x3]
+    dup     v1.8b, v0.b[3]
+    mov     v0.s[1], v1.s[0]            // lanes 4-7 <- t3
+    ext     v1.8b, v0.8b, v0.8b, #1
+    uaddl   v1.8h, v1.8b, v0.8b         // t[i] + t[i+1]
+    uqrshrn v0.8b, v1.8h, #1            // 2-tap averages
+    ext     v2.16b, v1.16b, v1.16b, #2
+    add     v1.8h, v2.8h, v1.8h         // t[i] + 2*t[i+1] + t[i+2]
+    uqrshrn v1.8b, v1.8h, #2            // 3-tap filtered values
+    st1     {v0.s}[0], [x0], 4          // row 0
+    st1     {v1.s}[0], [x0], 4          // row 1
+    ext     v3.8b, v0.8b, v0.8b, #1
+    ext     v2.8b, v1.8b, v1.8b, #1
+    st1     {v3.s}[0], [x0], 4          // row 2
+    st1     {v2.s}[0], [x0]             // row 3
+WELS_ASM_ARCH64_FUNC_END
+
+// Vertical-right 4x4 luma prediction from the top row (t0..t3), the top-left
+// corner (lt) and the left column (l0..l2).
+//   x0: destination (4 consecutive 4-byte rows), x1: reference, x2: stride
+// NOTE(review): assumes the stride is passed as a 32-bit int in w2, so it is
+// sign-extended before being used as an address offset - confirm the prototype.
+WELS_ASM_ARCH64_FUNC_BEGIN WelsI4x4LumaPredVR_AArch64_neon
+    sxtw    x2, w2
+    sub     x3, x1, x2
+    ld1     {v0.s}[1], [x3]         // lanes 4-7: t0..t3
+    sub     x3, x3, #1              // x3 -> top-left corner
+    ld1     {v0.b}[3], [x3], x2
+    ld1     {v0.b}[2], [x3], x2
+    ld1     {v0.b}[1], [x3], x2
+    ld1     {v0.b}[0], [x3]         // v0.8b: l2, l1, l0, lt, t0, t1, t2, t3
+
+    ext     v1.8b, v0.8b, v0.8b, #7
+    uaddl   v2.8h, v1.8b, v0.8b     // v2: {x, l2+l1, l1+l0, l0+lt, lt+t0, t0+t1, t1+t2, t2+t3}
+    ext     v1.16b, v2.16b, v2.16b, #14
+    add     v3.8h, v2.8h, v1.8h     // v3: {x, x, l2+2*l1+l0, l1+2*l0+lt, ..., t1+2*t2+t3}
+
+    uqrshrn v3.8b, v3.8h, #2        // 3-tap filtered values
+    uqrshrn v2.8b, v2.8h, #1        // 2-tap averages
+
+    st1     {v2.s}[1], [x0], 4      // row 0: averages over lt..t3
+    st1     {v3.s}[1], [x0], 4      // row 1: filtered values over l0..t3
+
+    ext     v2.8b, v2.8b, v2.8b, #7 // row 0 shifted right by one pixel ...
+    ins     v2.b[4], v3.b[3]        // ... with filter(l1, l0, lt) in column 0
+    st1     {v2.s}[1], [x0], 4
+
+    ext     v3.8b, v3.8b, v3.8b, #7 // row 1 shifted right by one pixel ...
+    ins     v3.b[4], v3.b[3]        // ... with filter(l2, l1, l0) in column 0
+    st1     {v3.s}[1], [x0]
+
+WELS_ASM_ARCH64_FUNC_END
+
+
+// Horizontal-up 4x4 luma prediction from the left column l0..l3.
+// HU0/2/4 are 2-tap averages, HU1/3/5 are 3-tap filtered values; positions
+// past the end are filled with l3.
+//   x0: destination (4 consecutive 4-byte rows), x1: reference, x2: stride
+// NOTE(review): assumes the stride is passed as a 32-bit int in w2, so it is
+// sign-extended before being used as an address offset - confirm the prototype.
+WELS_ASM_ARCH64_FUNC_BEGIN WelsI4x4LumaPredHU_AArch64_neon
+    sxtw    x2, w2
+    sub     x3, x1, #1              // x3 -> l0
+    mov     x4, #3
+    mul     x4, x4, x2
+    add     x4, x4, x3              // x4 -> l3 (three rows below l0)
+    ld1r    {v0.8b}, [x4]
+    ld1     {v0.b}[4], [x3], x2
+    ld1     {v0.b}[5], [x3], x2
+    ld1     {v0.b}[6], [x3], x2     // v0.8b: {l3, l3, l3, l3, l0, l1, l2, l3}
+
+    ext     v1.8b, v0.8b, v0.8b, #1
+    uaddl   v2.8h, v0.8b, v1.8b     // v2: {2*l3, 2*l3, 2*l3, l3+l0, l0+l1, l1+l2, l2+l3, 2*l3}
+    ext     v3.16b, v2.16b, v2.16b, #2
+    add     v3.8h, v3.8h, v2.8h     // v3 lanes 4-6: l0+2*l1+l2, l1+2*l2+l3, l2+3*l3
+
+    uqrshrn v2.8b, v2.8h, #1        // lanes 4-6: HU0, HU2, HU4
+    uqrshrn v3.8b, v3.8h, #2        // lanes 4-6: HU1, HU3, HU5
+    zip2    v3.8b, v2.8b, v3.8b     // HU0, HU1, HU2, HU3, HU4, HU5, x, x
+    mov     v3.h[3], v0.h[0]        // v3.8b: HU0, HU1, HU2, HU3, HU4, HU5, l3, l3
+    ext     v2.8b, v3.8b, v0.8b, #2 // HU2, HU3, HU4, HU5, ...
+    st1     {v3.s}[0], [x0], 4      // row 0: HU0..HU3
+    st1     {v2.s}[0], [x0], 4      // row 1: HU2..HU5
+    st1     {v3.s}[1], [x0], 4      // row 2: HU4, HU5, l3, l3
+    st1     {v0.s}[0], [x0]         // row 3: l3 x 4
+WELS_ASM_ARCH64_FUNC_END
+
+// Horizontal-down 4x4 luma prediction from the left column (l0..l3), the
+// top-left corner (lt) and the first three top pixels (t0..t2).
+//   x0: destination (4 consecutive 4-byte rows), x1: reference, x2: stride
+// NOTE(review): assumes the stride is passed as a 32-bit int in w2, so it is
+// sign-extended before being used as an address offset - confirm the prototype.
+WELS_ASM_ARCH64_FUNC_BEGIN WelsI4x4LumaPredHD_AArch64_neon
+    sxtw    x2, w2
+    sub     x3, x1, #1
+    sub     x3, x3, x2              // x3 points to the top-left corner
+    ld1     {v0.s}[1], [x3], x2     // lanes 4-7: lt, t0, t1, t2
+    ld1     {v0.b}[3], [x3], x2
+    ld1     {v0.b}[2], [x3], x2
+    ld1     {v0.b}[1], [x3], x2
+    ld1     {v0.b}[0], [x3]         // v0.8b: l3, l2, l1, l0, lt, t0, t1, t2
+    ext     v1.8b, v0.8b, v0.8b, #1 // v1.8b: l2, l1, l0, lt, t0, t1, t2, l3
+    uaddl   v2.8h, v0.8b, v1.8b     // neighbouring pair sums
+    ext     v3.16b, v2.16b, v2.16b, #2
+    add     v3.8h, v3.8h, v2.8h     // a + 2*b + c over three neighbours
+    uqrshrn v2.8b, v2.8h, #1        // lanes 0-3: 2-tap averages l3/l2 .. l0/lt
+    uqrshrn v3.8b, v3.8h, #2        // lanes 0-5: 3-tap values l3,l2,l1 .. t0,t1,t2
+    zip1    v2.8b, v2.8b, v3.8b     // interleave averages with filtered values
+    mov     v1.h[0], v3.h[2]        // the two filtered values centred on t0 and t1
+    ext     v3.8b, v2.8b, v1.8b, #6
+    st1     {v3.s}[0], [x0], 4      // row 0
+    st1     {v2.s}[1], [x0], 4      // row 1
+    ext     v3.8b, v2.8b, v1.8b, #2
+    st1     {v3.s}[0], [x0], 4      // row 2
+    st1     {v2.s}[0], [x0]         // row 3
+WELS_ASM_ARCH64_FUNC_END
+
+// for Chroma 8x8
+// Vertical 8x8 chroma prediction: the 8 pixels of the row above the block are
+// copied into each of the 8 output rows.
+//   x0: destination (8 consecutive 8-byte rows), x1: reference, x2: stride
+// NOTE(review): assumes the stride is passed as a 32-bit int in w2, so it is
+// sign-extended before being used as an address offset - confirm the prototype.
+WELS_ASM_ARCH64_FUNC_BEGIN WelsIChromaPredV_AArch64_neon
+    sxtw    x2, w2
+    sub     x3, x1, x2              // x3 -> row above the block
+    ld1     {v0.8b}, [x3]
+.rept   8
+    st1     {v0.8b}, [x0], 8
+.endr
+WELS_ASM_ARCH64_FUNC_END
+
+// Horizontal 8x8 chroma prediction: output row r is filled with the byte
+// found at [x1 - 1 + r * stride].
+//   x0: destination (8 consecutive 8-byte rows), x1: reference, x2: stride
+// NOTE(review): assumes the stride is passed as a 32-bit int in w2, so it is
+// sign-extended before being used as an address offset - confirm the prototype.
+WELS_ASM_ARCH64_FUNC_BEGIN WelsIChromaPredH_AArch64_neon
+    sxtw    x2, w2
+    sub     x3, x1, #1              // x3 -> pixel left of the current row
+.rept 8
+    ld1r    {v0.8b}, [x3], x2
+    st1     {v0.8b}, [x0], 8
+.endr
+WELS_ASM_ARCH64_FUNC_END
+
+
+// DC 8x8 chroma prediction, computed per 4x4 quadrant:
+//   top-left     : mean of top[0..3] and left[0..3]   (rounded, >> 3)
+//   top-right    : mean of top[4..7]                  (rounded, >> 2)
+//   bottom-left  : mean of left[4..7]                 (rounded, >> 2)
+//   bottom-right : mean of top[4..7] and left[4..7]   (rounded, >> 3)
+//   x0: destination (8 consecutive 8-byte rows), x1: reference, x2: stride
+// NOTE(review): assumes the stride is passed as a 32-bit int in w2, so it is
+// sign-extended before being used as an address offset - confirm the prototype.
+WELS_ASM_ARCH64_FUNC_BEGIN WelsIChromaPredDc_AArch64_neon
+    sxtw    x2, w2
+    sub     x3, x1, x2
+    sub     x4, x1, #1
+    ld1     {v0.8b}, [x3]           // lanes 0-7: top row
+    ld1     {v0.b}[8], [x4], x2     // lanes 8-15: left column
+    ld1     {v0.b}[9], [x4], x2
+    ld1     {v0.b}[10], [x4], x2
+    ld1     {v0.b}[11], [x4], x2
+    ld1     {v0.b}[12], [x4], x2
+    ld1     {v0.b}[13], [x4], x2
+    ld1     {v0.b}[14], [x4], x2
+    ld1     {v0.b}[15], [x4]
+
+    uaddlp  v1.8h, v0.16b
+    uaddlp  v2.4s, v1.8h            // v2.s: {top0-3, top4-7, left0-3, left4-7} sums
+    ins     v3.d[0], v2.d[1]
+    add     v3.2s, v2.2s, v3.2s     // v3.s: {top0-3+left0-3, top4-7+left4-7}
+    urshr   v2.4s, v2.4s, #2
+    urshr   v3.2s, v3.2s, #3
+
+    dup     v0.8b, v3.b[0]          // top-left quadrant value
+    dup     v1.8b, v2.b[4]          // top-right quadrant value
+    dup     v2.8b, v2.b[12]         // bottom-left quadrant value
+    dup     v3.8b, v3.b[4]          // bottom-right quadrant value
+    ins     v0.s[1], v1.s[0]
+    ins     v2.s[1], v3.s[0]
+.rept 4
+    st1     {v0.8b}, [x0], 8
+.endr
+.rept 4
+    st1     {v2.8b}, [x0], 8
+.endr
+
+WELS_ASM_ARCH64_FUNC_END
+
+// DC 8x8 chroma prediction using only the row above the block: the left four
+// columns get the rounded mean of top[0..3], the right four columns the
+// rounded mean of top[4..7]; all 8 rows are identical.
+//   x0: destination (8 consecutive 8-byte rows), x1: reference, x2: stride
+// NOTE(review): assumes the stride is passed as a 32-bit int in w2, so it is
+// sign-extended before being used as an address offset - confirm the prototype.
+WELS_ASM_ARCH64_FUNC_BEGIN WelsIChromaPredDcTop_AArch64_neon
+    sxtw    x2, w2
+    sub     x3, x1, x2
+    ld1     {v0.8b}, [x3]
+    uaddlp  v0.4h, v0.8b
+    addp    v0.8h, v0.8h, v0.8h     // h[0] = sum top[0..3], h[1] = sum top[4..7]
+    dup     v1.8h, v0.h[0]
+    dup     v2.8h, v0.h[1]
+    mov     v1.D[1], v2.D[0]        // {s0, s0, s0, s0, s1, s1, s1, s1}
+    uqrshrn  v1.8b, v1.8h, #2
+.rept 8
+    st1     {v1.8b}, [x0], 8
+.endr
+WELS_ASM_ARCH64_FUNC_END
+
+// Constant tables read with `ldr q` by the 8x8 chroma plane predictor.
+// 16-byte alignment is requested with .p2align: on this target the operand of
+// .align is a power-of-two exponent, so `.align 16` asks for 64 KiB alignment.
+.p2align 4
+intra_1_to_4: .short 17*1, 17*2, 17*3, 17*4, 17*1, 17*2, 17*3, 17*4
+intra_m3_to_p4: .short -3, -2, -1, 0, 1, 2, 3, 4
+
+// Plane 8x8 chroma prediction.
+//   H = sum_{i=0..3} (i+1) * (top[4+i]  - top[2-i])
+//   V = sum_{i=0..3} (i+1) * (left[4+i] - left[2-i])
+//   a = (top[7] + left[7]) << 4,  b = (17*H + 16) >> 5,  c = (17*V + 16) >> 5
+//   pred[y][x] = saturate_u8((a + b*(x-3) + c*(y-3) + 16) >> 5)
+// top[-1] / left[-1] both denote the top-left corner pixel.
+//   x0: destination (8 consecutive 8-byte rows), x1: reference, x2: stride
+// NOTE(review): assumes the stride is passed as a 32-bit int in w2, so it is
+// sign-extended before being used as an address offset - confirm the prototype.
+WELS_ASM_ARCH64_FUNC_BEGIN WelsIChromaPredPlane_AArch64_neon
+    sxtw    x2, w2
+    sub     x3, x1, x2
+    sub     x3, x3, #1              // x3 -> top-left corner, walks along the top row
+    mov     x4, x3                  // x4 -> top-left corner, walks down the left column
+    // load pTop[2-i] and pLeft[(2-i)*kiStride]
+    ld1     {v1.b}[3], [x3], #1
+    ld1     {v1.b}[2], [x3], #1
+    ld1     {v1.b}[1], [x3], #1
+    ld1     {v1.b}[0], [x3], #1
+    ld1     {v1.b}[7], [x4], x2
+    ld1     {v1.b}[6], [x4], x2
+    ld1     {v1.b}[5], [x4], x2
+    ld1     {v1.b}[4], [x4], x2
+    add     x3, x3, #1              // skip pTop[3]
+    add     x4, x4, x2              // skip pLeft[3*kiStride]
+    // load pTop[4+i] and pLeft[(4+i)*kiStride]
+    ld1     {v0.b}[0], [x3], #1
+    ld1     {v0.b}[1], [x3], #1
+    ld1     {v0.b}[2], [x3], #1
+    ld1     {v0.b}[3], [x3], #1
+    ld1     {v0.b}[4], [x4], x2
+    ld1     {v0.b}[5], [x4], x2
+    ld1     {v0.b}[6], [x4], x2
+    ld1     {v0.b}[7], [x4], x2
+
+    uxtl    v1.8h, v1.8b
+    uxtl    v0.8h, v0.8b
+    ldr     q2, intra_1_to_4
+    ldr     q3, intra_m3_to_p4
+    dup     v4.8h, v0.h[3]          // pTop[7]
+    dup     v5.8h, v0.h[7]          // pLeft[7*kiStride]
+    add     v4.8h, v4.8h, v5.8h
+    sub     v0.8h, v0.8h, v1.8h
+    shl     v4.8h, v4.8h, #4        // v4.8h is a
+    mul     v0.8h, v0.8h, v2.8h     // weighted terms: h[0-3] for H, h[4-7] for V
+    saddlp  v0.4s, v0.8h
+    addp    v0.4s, v0.4s, v0.4s     // v0.s[0] is 17*H, v0.s[1] is 17*V
+    sqrshrn v0.4h, v0.4s, #5
+    dup     v1.8h, v0.h[0]          // v1.8h is b
+    dup     v0.8h, v0.h[1]          // v0.8h is c
+    mla     v4.8h, v1.8h, v3.8h     // a + b*(x-3)
+    mla     v4.8h, v0.8h, v3.h[0]   // ... + c*(-3): value for row 0
+    sqrshrun v1.8b, v4.8h, #5
+    st1     {v1.8b}, [x0], 8
+.rept 7
+    add     v4.8h, v4.8h, v0.8h     // next row: add c
+    sqrshrun v1.8b, v4.8h, #5
+    st1     {v1.8b}, [x0], 8
+.endr
+WELS_ASM_ARCH64_FUNC_END
+
+//for Luma 16x16
+// Vertical 16x16 luma prediction: the 16 pixels of the row above the block
+// are copied into each of the 16 output rows.
+//   x0: destination (16 consecutive 16-byte rows), x1: reference, x2: stride
+// NOTE(review): assumes the stride is passed as a 32-bit int in w2, so it is
+// sign-extended before being used as an address offset - confirm the prototype.
+WELS_ASM_ARCH64_FUNC_BEGIN WelsI16x16LumaPredV_AArch64_neon
+    sxtw    x2, w2
+    sub     x3, x1, x2              // x3 -> row above the block
+    ld1     {v0.16b}, [x3]
+.rept 16
+    st1     {v0.16b}, [x0], 16
+.endr
+WELS_ASM_ARCH64_FUNC_END
+
+// Horizontal 16x16 luma prediction: output row r is filled with the byte
+// found at [x1 - 1 + r * stride].
+//   x0: destination (16 consecutive 16-byte rows), x1: reference, x2: stride
+// NOTE(review): assumes the stride is passed as a 32-bit int in w2, so it is
+// sign-extended before being used as an address offset - confirm the prototype.
+WELS_ASM_ARCH64_FUNC_BEGIN WelsI16x16LumaPredH_AArch64_neon
+    sxtw    x2, w2
+    sub     x3, x1, #1              // x3 -> pixel left of the current row
+.rept 16
+    ld1r    {v0.16b}, [x3], x2
+    st1     {v0.16b}, [x0], 16
+.endr
+WELS_ASM_ARCH64_FUNC_END
+
+// DC 16x16 luma prediction: every output byte is the rounded mean of the 16
+// pixels above the block and the 16 pixels to its left, (sum + 16) >> 5.
+//   x0: destination (16 consecutive 16-byte rows), x1: reference, x2: stride
+// NOTE(review): assumes the stride is passed as a 32-bit int in w2, so it is
+// sign-extended before being used as an address offset - confirm the prototype.
+WELS_ASM_ARCH64_FUNC_BEGIN WelsI16x16LumaPredDc_AArch64_neon
+    sxtw    x2, w2
+    sub     x3, x1, x2              // x3 -> row above the block
+    sub     x4, x1, #1              // x4 -> column left of the block
+    ld1     {v0.16b}, [x3]          // v0: top row
+    ld1     {v1.b}[0], [x4], x2     // v1: left column, one byte per row
+    ld1     {v1.b}[1], [x4], x2
+    ld1     {v1.b}[2], [x4], x2
+    ld1     {v1.b}[3], [x4], x2
+    ld1     {v1.b}[4], [x4], x2
+    ld1     {v1.b}[5], [x4], x2
+    ld1     {v1.b}[6], [x4], x2
+    ld1     {v1.b}[7], [x4], x2
+    ld1     {v1.b}[8], [x4], x2
+    ld1     {v1.b}[9], [x4], x2
+    ld1     {v1.b}[10], [x4], x2
+    ld1     {v1.b}[11], [x4], x2
+    ld1     {v1.b}[12], [x4], x2
+    ld1     {v1.b}[13], [x4], x2
+    ld1     {v1.b}[14], [x4], x2
+    ld1     {v1.b}[15], [x4]
+    // horizontal reductions of both vectors, then combine
+    uaddlv    h0, v0.16b
+    uaddlv    h1, v1.16b
+    add       v0.8h, v0.8h, v1.8h
+    uqrshrn    b0, h0, #5
+    dup       v0.16b, v0.b[0]
+.rept 16
+    st1     {v0.16b}, [x0], 16
+.endr
+WELS_ASM_ARCH64_FUNC_END
+
+// DC 16x16 luma prediction using only the 16 pixels above the block:
+// every output byte is (sum + 8) >> 4.
+//   x0: destination (16 consecutive 16-byte rows), x1: reference, x2: stride
+// NOTE(review): assumes the stride is passed as a 32-bit int in w2, so it is
+// sign-extended before being used as an address offset - confirm the prototype.
+WELS_ASM_ARCH64_FUNC_BEGIN WelsI16x16LumaPredDcTop_AArch64_neon
+    sxtw    x2, w2
+    sub     x3, x1, x2              // x3 -> row above the block
+    ld1     {v0.16b}, [x3]
+    // horizontal reduction
+    uaddlv    h0, v0.16b
+    uqrshrn    v0.8b, v0.8h, 4
+    dup       v0.16b, v0.b[0]
+.rept 16
+    st1     {v0.16b}, [x0], 16
+.endr
+WELS_ASM_ARCH64_FUNC_END
+
+// DC 16x16 luma prediction using only the 16 pixels left of the block:
+// every output byte is (sum + 8) >> 4.
+//   x0: destination (16 consecutive 16-byte rows), x1: reference, x2: stride
+// NOTE(review): assumes the stride is passed as a 32-bit int in w2, so it is
+// sign-extended before being used as an address offset - confirm the prototype.
+WELS_ASM_ARCH64_FUNC_BEGIN WelsI16x16LumaPredDcLeft_AArch64_neon
+    sxtw    x2, w2
+    sub     x3, x1, #1              // x3 -> column left of the block
+    ld1     {v1.b}[0], [x3], x2
+    ld1     {v1.b}[1], [x3], x2
+    ld1     {v1.b}[2], [x3], x2
+    ld1     {v1.b}[3], [x3], x2
+    ld1     {v1.b}[4], [x3], x2
+    ld1     {v1.b}[5], [x3], x2
+    ld1     {v1.b}[6], [x3], x2
+    ld1     {v1.b}[7], [x3], x2
+    ld1     {v1.b}[8], [x3], x2
+    ld1     {v1.b}[9], [x3], x2
+    ld1     {v1.b}[10], [x3], x2
+    ld1     {v1.b}[11], [x3], x2
+    ld1     {v1.b}[12], [x3], x2
+    ld1     {v1.b}[13], [x3], x2
+    ld1     {v1.b}[14], [x3], x2
+    ld1     {v1.b}[15], [x3]
+    // horizontal reduction
+    uaddlv    h1, v1.16b
+    uqrshrn    v0.8b, v1.8h, #4
+    dup       v0.16b, v0.b[0]
+.rept 16
+    st1     {v0.16b}, [x0], 16
+.endr
+WELS_ASM_ARCH64_FUNC_END
+
+
+// Constant tables read with `ldr q` by the 16x16 luma plane predictor.
+// 16-byte alignment is requested with .p2align: on this target the operand of
+// .align is a power-of-two exponent, so `.align 16` asks for 64 KiB alignment.
+.p2align 4
+intra_1_to_8: .short 5, 10, 15, 20, 25, 30, 35, 40
+intra_m7_to_p8: .short -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8
+
+// Plane 16x16 luma prediction.
+//   H = sum_{i=0..7} (i+1) * (top[8+i]  - top[6-i])
+//   V = sum_{i=0..7} (i+1) * (left[8+i] - left[6-i])
+//   a = (top[15] + left[15]) << 4,  b = (5*H + 32) >> 6,  c = (5*V + 32) >> 6
+//   pred[y][x] = saturate_u8((a + b*(x-7) + c*(y-7) + 16) >> 5)
+// top[-1] / left[-1] both denote the top-left corner pixel.
+//   x0: destination (16 consecutive 16-byte rows), x1: reference, x2: stride
+// NOTE(review): assumes the stride is passed as a 32-bit int in w2, so it is
+// sign-extended before being used as an address offset - confirm the prototype.
+WELS_ASM_ARCH64_FUNC_BEGIN WelsI16x16LumaPredPlane_AArch64_neon
+    sxtw    x2, w2
+    sub     x3, x1, x2
+    sub     x3, x3, #1              // x3 -> top-left corner
+    mov     x4, x3
+    ld1     {v0.8b}, [x4]           // top[-1..6]
+    add     x4, x4, #9
+    rev64   v0.8b, v0.8b            // v0: top[6], top[5], ..., top[-1]
+    ld1     {v1.8b}, [x4]           // v1: top[8..15]
+    uxtl    v0.8h, v0.8b            // widen to 16 bits
+    uxtl    v1.8h, v1.8b
+    ld1     {v2.b}[7], [x3], x2
+    ld1     {v2.b}[6], [x3], x2
+    ld1     {v2.b}[5], [x3], x2
+    ld1     {v2.b}[4], [x3], x2
+    ld1     {v2.b}[3], [x3], x2
+    ld1     {v2.b}[2], [x3], x2
+    ld1     {v2.b}[1], [x3], x2
+    ld1     {v2.b}[0], [x3], x2     // v2: left[6], left[5], ..., left[-1]
+    add     x3, x3, x2              // skip left[7]
+    ld1     {v3.b}[0], [x3], x2
+    ld1     {v3.b}[1], [x3], x2
+    ld1     {v3.b}[2], [x3], x2
+    ld1     {v3.b}[3], [x3], x2
+    ld1     {v3.b}[4], [x3], x2
+    ld1     {v3.b}[5], [x3], x2
+    ld1     {v3.b}[6], [x3], x2
+    ld1     {v3.b}[7], [x3]         // v3: left[8..15]
+    uxtl    v2.8h, v2.8b
+    uxtl    v3.8h, v3.8b
+    sub     v0.8h, v1.8h, v0.8h     // top[8+i]  - top[6-i]
+    sub     v2.8h, v3.8h, v2.8h     // left[8+i] - left[6-i]
+    ldr     q4, intra_1_to_8
+    mul     v0.8h, v0.8h, v4.8h
+    mul     v2.8h, v2.8h, v4.8h
+    saddlv  s0, v0.8h               // 5*H
+    saddlv  s2, v2.8h               // 5*V
+    add     v1.8h, v1.8h, v3.8h
+    sqrshrn v0.4h, v0.4S, #6        // b is in v0.h[0]
+    sqrshrn v2.4h, v2.4S, #6        // c is in v2.h[0]
+    shl     v1.8h, v1.8h, #4        // a is in v1.h[7]
+    ldr     q4, intra_m7_to_p8      // -7 .. 0
+    ldr     q5, intra_m7_to_p8 + 16 //  1 .. 8
+    dup     v1.8h, v1.h[7]
+    dup     v3.8h, v1.h[7]
+    mla     v1.8h, v4.8h, v0.h[0]   // a + b*(x-7) for x = 0..7
+    mla     v3.8h, v5.8h, v0.h[0]   // a + b*(x-7) for x = 8..15
+    dup     v2.8h, v2.h[0]          // v2.8h is [cccccccc]
+    mla     v1.8h, v2.8h, v4.h[0]   // ... + c*(-7): values for row 0
+    mla     v3.8h, v2.8h, v4.h[0]
+    sqrshrun v4.8b, v1.8h, #5
+    sqrshrun2 v4.16b, v3.8h, #5
+    st1     {v4.16b}, [x0], 16
+.rept 15
+    add     v1.8h, v1.8h, v2.8h     // next row: add c
+    add     v3.8h, v3.8h, v2.8h
+    sqrshrun v4.8b, v1.8h, #5
+    sqrshrun2 v4.16b, v3.8h, #5
+    st1     {v4.16b}, [x0], 16
+.endr
+WELS_ASM_ARCH64_FUNC_END
+#endif
\ No newline at end of file
--- /dev/null
+++ b/codec/encoder/core/arm64/pixel_aarch64_neon.S
@@ -1,0 +1,706 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifdef HAVE_NEON_AARCH64
+.text
+#include "arm_arch64_common_macro.S"
+
+// Sums the eight 16-bit accumulator lanes of v2 and moves the 32-bit total
+// into w0.
+.macro CALC_AND_STORE_SAD
+    saddlv  s2, v2.8h
+    fmov    w0, s2
+.endm
+
+// Reduces each of the accumulators v28, v29, v30 and v31 to a 32-bit sum and
+// stores the four sums, in that order, as consecutive words at [x4].
+.macro CALC_AND_STORE_SAD_FOUR
+    saddlv  s28, v28.8h
+    saddlv  s29, v29.8h
+    saddlv  s30, v30.8h
+    saddlv  s31, v31.8h
+    st4     {v28.s, v29.s, v30.s, v31.s}[0], [x4]
+.endm
+
+// Loads eight 8-byte rows from [x0] into v0..v7, advancing x0 by x1 after
+// every row (x0 ends up eight rows further on).
+.macro LOAD_8X8_1
+    ld1     {v0.8b}, [x0], x1
+    ld1     {v1.8b}, [x0], x1
+    ld1     {v2.8b}, [x0], x1
+    ld1     {v3.8b}, [x0], x1
+    ld1     {v4.8b}, [x0], x1
+    ld1     {v5.8b}, [x0], x1
+    ld1     {v6.8b}, [x0], x1
+    ld1     {v7.8b}, [x0], x1
+.endm
+
+// Loads eight 16-byte rows from [x0] into v0..v7, advancing x0 by x1 after
+// every row (x0 ends up eight rows further on).
+.macro LOAD_16X8_1
+    ld1     {v0.16b}, [x0], x1
+    ld1     {v1.16b}, [x0], x1
+    ld1     {v2.16b}, [x0], x1
+    ld1     {v3.16b}, [x0], x1
+    ld1     {v4.16b}, [x0], x1
+    ld1     {v5.16b}, [x0], x1
+    ld1     {v6.16b}, [x0], x1
+    ld1     {v7.16b}, [x0], x1
+.endm
+
+#ifdef __APPLE__
+// Apple assembler flavour of the SadFour helper macros: arguments are
+// positional ($0, $1). The #else branch holds the same macros with named
+// arguments.
+
+// $0: address register. Loads eight 8-byte rows into v16..v23, advancing $0
+// by x3 after every row.
+.macro LOAD_8X8_2
+    ld1     {v16.8b}, [$0], x3
+    ld1     {v17.8b}, [$0], x3
+    ld1     {v18.8b}, [$0], x3
+    ld1     {v19.8b}, [$0], x3
+    ld1     {v20.8b}, [$0], x3
+    ld1     {v21.8b}, [$0], x3
+    ld1     {v22.8b}, [$0], x3
+    ld1     {v23.8b}, [$0], x3
+.endm
+
+// $0: 16-bit accumulator vector, $1: `d` or `a`, spliced into the first
+// mnemonic (uabdl starts a new sum, uabal adds to the existing one).
+// Accumulates |v0..v7 - v16..v23| row by row.
+.macro CALC_ABS_8X8_1
+    uab$1l  $0, v0.8b, v16.8b
+    uabal   $0, v1.8b, v17.8b
+    uabal   $0, v2.8b, v18.8b
+    uabal   $0, v3.8b, v19.8b
+    uabal   $0, v4.8b, v20.8b
+    uabal   $0, v5.8b, v21.8b
+    uabal   $0, v6.8b, v22.8b
+    uabal   $0, v7.8b, v23.8b
+.endm
+
+// $0: `d` or `a` as above. Accumulates |v0..v7 - v18..v25| into v29, i.e.
+// against the rows two below those used by CALC_ABS_8X8_1.
+.macro CALC_ABS_8X8_2
+    uab$0l  v29.8h, v0.8b, v18.8b
+    uabal   v29.8h, v1.8b, v19.8b
+    uabal   v29.8h, v2.8b, v20.8b
+    uabal   v29.8h, v3.8b, v21.8b
+    uabal   v29.8h, v4.8b, v22.8b
+    uabal   v29.8h, v5.8b, v23.8b
+    uabal   v29.8h, v6.8b, v24.8b
+    uabal   v29.8h, v7.8b, v25.8b
+.endm
+
+// $0: address register. Loads eight 16-byte rows into v16..v23, advancing $0
+// by x3 after every row.
+.macro LOAD_16X8_2
+    ld1     {v16.16b}, [$0], x3
+    ld1     {v17.16b}, [$0], x3
+    ld1     {v18.16b}, [$0], x3
+    ld1     {v19.16b}, [$0], x3
+    ld1     {v20.16b}, [$0], x3
+    ld1     {v21.16b}, [$0], x3
+    ld1     {v22.16b}, [$0], x3
+    ld1     {v23.16b}, [$0], x3
+.endm
+
+// 16-pixel-wide variant of CALC_ABS_8X8_1: both halves of every row are
+// accumulated into $0 (uabal / uabal2).
+.macro CALC_ABS_16X8_1
+    uab$1l  $0, v0.8b, v16.8b
+    uabal2  $0, v0.16b,v16.16b
+    uabal   $0, v1.8b, v17.8b
+    uabal2  $0, v1.16b,v17.16b
+    uabal   $0, v2.8b, v18.8b
+    uabal2  $0, v2.16b,v18.16b
+    uabal   $0, v3.8b, v19.8b
+    uabal2  $0, v3.16b,v19.16b
+    uabal   $0, v4.8b, v20.8b
+    uabal2  $0, v4.16b,v20.16b
+    uabal   $0, v5.8b, v21.8b
+    uabal2  $0, v5.16b,v21.16b
+    uabal   $0, v6.8b, v22.8b
+    uabal2  $0, v6.16b,v22.16b
+    uabal   $0, v7.8b, v23.8b
+    uabal2  $0, v7.16b,v23.16b
+.endm
+
+// 16-pixel-wide variant of CALC_ABS_8X8_2: accumulates |v0..v7 - v18..v25|
+// into v29.
+.macro CALC_ABS_16X8_2
+    uab$0l  v29.8h, v0.8b, v18.8b
+    uabal2  v29.8h, v0.16b,v18.16b
+    uabal   v29.8h, v1.8b, v19.8b
+    uabal2  v29.8h, v1.16b,v19.16b
+    uabal   v29.8h, v2.8b, v20.8b
+    uabal2  v29.8h, v2.16b,v20.16b
+    uabal   v29.8h, v3.8b, v21.8b
+    uabal2  v29.8h, v3.16b,v21.16b
+    uabal   v29.8h, v4.8b, v22.8b
+    uabal2  v29.8h, v4.16b,v22.16b
+    uabal   v29.8h, v5.8b, v23.8b
+    uabal2  v29.8h, v5.16b,v23.16b
+    uabal   v29.8h, v6.8b, v24.8b
+    uabal2  v29.8h, v6.16b,v24.16b
+    uabal   v29.8h, v7.8b, v25.8b
+    uabal2  v29.8h, v7.16b,v25.16b
+.endm
+#else
+// Named-argument flavour of the SadFour helper macros (used when __APPLE__
+// is not defined); the #ifdef branch holds the positional-argument versions.
+
+// arg0: address register. Loads eight 8-byte rows into v16..v23, advancing
+// arg0 by x3 after every row.
+.macro LOAD_8X8_2 arg0
+    ld1     {v16.8b}, [\arg0], x3
+    ld1     {v17.8b}, [\arg0], x3
+    ld1     {v18.8b}, [\arg0], x3
+    ld1     {v19.8b}, [\arg0], x3
+    ld1     {v20.8b}, [\arg0], x3
+    ld1     {v21.8b}, [\arg0], x3
+    ld1     {v22.8b}, [\arg0], x3
+    ld1     {v23.8b}, [\arg0], x3
+.endm
+
+// arg0: 16-bit accumulator vector, arg1: `d` or `a`, spliced into the first
+// mnemonic (uabdl starts a new sum, uabal adds to the existing one).
+// Accumulates |v0..v7 - v16..v23| row by row.
+.macro CALC_ABS_8X8_1 arg0, arg1
+    uab\arg1\()l    \arg0, v0.8b, v16.8b
+    uabal   \arg0, v1.8b, v17.8b
+    uabal   \arg0, v2.8b, v18.8b
+    uabal   \arg0, v3.8b, v19.8b
+    uabal   \arg0, v4.8b, v20.8b
+    uabal   \arg0, v5.8b, v21.8b
+    uabal   \arg0, v6.8b, v22.8b
+    uabal   \arg0, v7.8b, v23.8b
+.endm
+
+// arg0: `d` or `a` as above. Accumulates |v0..v7 - v18..v25| into v29, i.e.
+// against the rows two below those used by CALC_ABS_8X8_1.
+.macro CALC_ABS_8X8_2 arg0
+    uab\arg0\()l    v29.8h, v0.8b, v18.8b
+    uabal   v29.8h, v1.8b, v19.8b
+    uabal   v29.8h, v2.8b, v20.8b
+    uabal   v29.8h, v3.8b, v21.8b
+    uabal   v29.8h, v4.8b, v22.8b
+    uabal   v29.8h, v5.8b, v23.8b
+    uabal   v29.8h, v6.8b, v24.8b
+    uabal   v29.8h, v7.8b, v25.8b
+.endm
+
+// arg0: address register. Loads eight 16-byte rows into v16..v23, advancing
+// arg0 by x3 after every row.
+.macro LOAD_16X8_2 arg0
+    ld1     {v16.16b}, [\arg0], x3
+    ld1     {v17.16b}, [\arg0], x3
+    ld1     {v18.16b}, [\arg0], x3
+    ld1     {v19.16b}, [\arg0], x3
+    ld1     {v20.16b}, [\arg0], x3
+    ld1     {v21.16b}, [\arg0], x3
+    ld1     {v22.16b}, [\arg0], x3
+    ld1     {v23.16b}, [\arg0], x3
+.endm
+
+// 16-pixel-wide variant of CALC_ABS_8X8_1: both halves of every row are
+// accumulated into arg0 (uabal / uabal2).
+.macro CALC_ABS_16X8_1 arg0, arg1
+    uab\arg1\()l  \arg0, v0.8b, v16.8b
+    uabal2  \arg0, v0.16b,v16.16b
+    uabal   \arg0, v1.8b, v17.8b
+    uabal2  \arg0, v1.16b,v17.16b
+    uabal   \arg0, v2.8b, v18.8b
+    uabal2  \arg0, v2.16b,v18.16b
+    uabal   \arg0, v3.8b, v19.8b
+    uabal2  \arg0, v3.16b,v19.16b
+    uabal   \arg0, v4.8b, v20.8b
+    uabal2  \arg0, v4.16b,v20.16b
+    uabal   \arg0, v5.8b, v21.8b
+    uabal2  \arg0, v5.16b,v21.16b
+    uabal   \arg0, v6.8b, v22.8b
+    uabal2  \arg0, v6.16b,v22.16b
+    uabal   \arg0, v7.8b, v23.8b
+    uabal2  \arg0, v7.16b,v23.16b
+.endm
+
+// 16-pixel-wide variant of CALC_ABS_8X8_2: accumulates |v0..v7 - v18..v25|
+// into v29.
+.macro CALC_ABS_16X8_2 arg0
+    uab\arg0\()l  v29.8h, v0.8b, v18.8b
+    uabal2  v29.8h, v0.16b,v18.16b
+    uabal   v29.8h, v1.8b, v19.8b
+    uabal2  v29.8h, v1.16b,v19.16b
+    uabal   v29.8h, v2.8b, v20.8b
+    uabal2  v29.8h, v2.16b,v20.16b
+    uabal   v29.8h, v3.8b, v21.8b
+    uabal2  v29.8h, v3.16b,v21.16b
+    uabal   v29.8h, v4.8b, v22.8b
+    uabal2  v29.8h, v4.16b,v22.16b
+    uabal   v29.8h, v5.8b, v23.8b
+    uabal2  v29.8h, v5.16b,v23.16b
+    uabal   v29.8h, v6.8b, v24.8b
+    uabal2  v29.8h, v6.16b,v24.16b
+    uabal   v29.8h, v7.8b, v25.8b
+    uabal2  v29.8h, v7.16b,v25.16b
+.endm
+#endif
+
+// Sum of absolute differences over a 4x4 block.
+//   x0 / w1: first block and its stride, x2 / w3: second block and its stride
+//   (both strides are sign-extended to 64 bits first); result is left in w0.
+WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSad4x4_AArch64_neon
+    sxtw    x1, w1
+    sxtw    x3, w3
+    ld1     {v0.s}[0], [x0], x1
+    ld1     {v1.s}[0], [x2], x3
+    uabdl   v2.8h, v0.8b, v1.8b
+.rept 3
+    ld1     {v0.s}[0], [x0], x1
+    ld1     {v1.s}[0], [x2], x3
+    uabal   v2.8h, v0.8b, v1.8b
+.endr
+    saddlv  s2, v2.4h           // only lanes 0-3 come from the loaded 4-byte rows
+    fmov    w0, s2
+WELS_ASM_ARCH64_FUNC_END
+
+// Sum of absolute differences over an 8x8 block (8 rows of 8 bytes).
+//   x0 / w1: first block and its stride, x2 / w3: second block and its stride;
+//   the total accumulated in v2 is reduced into w0 by CALC_AND_STORE_SAD.
+WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSad8x8_AArch64_neon
+    sxtw    x1, w1
+    sxtw    x3, w3
+    ld1     {v0.8b}, [x0], x1
+    ld1     {v1.8b}, [x2], x3
+    uabdl   v2.8h, v0.8b, v1.8b
+.rept 7
+    ld1     {v0.8b}, [x0], x1
+    ld1     {v1.8b}, [x2], x3
+    uabal   v2.8h, v0.8b, v1.8b
+.endr
+    CALC_AND_STORE_SAD
+WELS_ASM_ARCH64_FUNC_END
+
+// Sum of absolute differences over an 8-wide, 16-row block.
+//   x0 / w1: first block and its stride, x2 / w3: second block and its stride;
+//   the total accumulated in v2 is reduced into w0 by CALC_AND_STORE_SAD.
+WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSad8x16_AArch64_neon
+    sxtw    x1, w1
+    sxtw    x3, w3
+    ld1     {v0.8b}, [x0], x1
+    ld1     {v1.8b}, [x2], x3
+    uabdl   v2.8h, v0.8b, v1.8b
+.rept 15
+    ld1     {v0.8b}, [x0], x1
+    ld1     {v1.8b}, [x2], x3
+    uabal   v2.8h, v0.8b, v1.8b
+.endr
+    CALC_AND_STORE_SAD
+WELS_ASM_ARCH64_FUNC_END
+
+// Sum of absolute differences over a 16-wide, 8-row block; the low and high
+// halves of every row are both accumulated into v2 (uabal / uabal2).
+//   x0 / w1: first block and its stride, x2 / w3: second block and its stride;
+//   result is reduced into w0 by CALC_AND_STORE_SAD.
+WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSad16x8_AArch64_neon
+    sxtw    x1, w1
+    sxtw    x3, w3
+    ld1     {v0.16b}, [x0], x1
+    ld1     {v1.16b}, [x2], x3
+    uabdl   v2.8h, v0.8b, v1.8b
+    uabal2  v2.8h, v0.16b, v1.16b
+.rept 7
+    ld1     {v0.16b}, [x0], x1
+    ld1     {v1.16b}, [x2], x3
+    uabal   v2.8h, v0.8b, v1.8b
+    uabal2  v2.8h, v0.16b, v1.16b
+.endr
+    CALC_AND_STORE_SAD
+WELS_ASM_ARCH64_FUNC_END
+
+// Sum of absolute differences over a 16x16 block; the low and high halves of
+// every row are both accumulated into v2 (uabal / uabal2).
+//   x0 / w1: first block and its stride, x2 / w3: second block and its stride;
+//   result is reduced into w0 by CALC_AND_STORE_SAD.
+WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSad16x16_AArch64_neon
+    sxtw    x1, w1
+    sxtw    x3, w3
+    ld1     {v0.16b}, [x0], x1
+    ld1     {v1.16b}, [x2], x3
+    uabdl   v2.8h, v0.8b, v1.8b
+    uabal2  v2.8h, v0.16b, v1.16b
+.rept 15
+    ld1     {v0.16b}, [x0], x1
+    ld1     {v1.16b}, [x2], x3
+    uabal   v2.8h, v0.8b, v1.8b
+    uabal2  v2.8h, v0.16b, v1.16b
+.endr
+    CALC_AND_STORE_SAD
+WELS_ASM_ARCH64_FUNC_END
+
+// Four 4x4 SADs of the block at x0 (stride w1) against the block at x2
+// (stride w3) displaced by one row up, one row down, one pixel left and one
+// pixel right. The four sums are stored in that order at [x4].
+WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSadFour4x4_AArch64_neon
+    sxtw    x1, w1
+    sxtw    x3, w3
+    ld1     {v0.s}[0], [x0], x1
+    ld1     {v0.s}[1], [x0], x1
+    ld1     {v1.s}[0], [x0], x1
+    ld1     {v1.s}[1], [x0]         // v0: source rows 0-1, v1: source rows 2-3
+    sub     x0, x2, x3              // x0 -> reference row -1
+    ld1     {v2.s}[0], [x0], x3
+    ld1     {v2.s}[1], [x0], x3
+    ld1     {v3.s}[0], [x0], x3
+    ld1     {v3.s}[1], [x0], x3
+    ld1     {v4.s}[0], [x0], x3
+    ld1     {v4.s}[1], [x0], x3     // v2: rows -1,0  v3: rows 1,2  v4: rows 3,4
+
+    uabdl   v28.8h, v0.8b, v2.8b    // displaced one row up
+    uabal   v28.8h, v1.8b, v3.8b
+
+    uabdl   v29.8h, v0.8b, v3.8b    // displaced one row down
+    uabal   v29.8h, v1.8b, v4.8b
+
+    sub     x0, x2, #1              // displaced one pixel left
+    ld1     {v2.s}[0], [x0], x3
+    ld1     {v2.s}[1], [x0], x3
+    ld1     {v3.s}[0], [x0], x3
+    ld1     {v3.s}[1], [x0]
+    uabdl   v30.8h, v0.8b, v2.8b
+    uabal   v30.8h, v1.8b, v3.8b
+
+    add     x0, x2, #1              // displaced one pixel right
+    ld1     {v2.s}[0], [x0], x3
+    ld1     {v2.s}[1], [x0], x3
+    ld1     {v3.s}[0], [x0], x3
+    ld1     {v3.s}[1], [x0]
+    uabdl   v31.8h, v0.8b, v2.8b
+    uabal   v31.8h, v1.8b, v3.8b
+
+    CALC_AND_STORE_SAD_FOUR
+WELS_ASM_ARCH64_FUNC_END
+
+// Four 8x8 SADs of the block at x0 (stride w1) against the block at x2
+// (stride w3) displaced by one row up, one row down, one pixel left and one
+// pixel right. The four sums are stored in that order at [x4].
+WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSadFour8x8_AArch64_neon
+    sxtw    x1, w1
+    sxtw    x3, w3
+    LOAD_8X8_1                      // source rows 0-7 -> v0..v7
+    sub     x0, x2, x3              // x0 -> reference row -1
+    LOAD_8X8_2 x0                   // reference rows -1..6 -> v16..v23
+    ld1     {v24.8b}, [x0], x3      // reference row 7
+    ld1     {v25.8b}, [x0]          // reference row 8
+
+    CALC_ABS_8X8_1 v28.8h, d        // displaced one row up   (rows -1..6)
+    CALC_ABS_8X8_2 d                // displaced one row down (rows 1..8), into v29
+
+    sub     x0, x2, #1              // displaced one pixel left
+    LOAD_8X8_2 x0
+    CALC_ABS_8X8_1 v30.8h, d
+
+    add     x0, x2, #1              // displaced one pixel right
+    LOAD_8X8_2 x0
+    CALC_ABS_8X8_1 v31.8h, d
+
+    CALC_AND_STORE_SAD_FOUR
+WELS_ASM_ARCH64_FUNC_END
+
+// Four 8-wide, 16-row SADs of the block at x0 (stride w1) against the block
+// at x2 (stride w3) displaced by one row up, one row down, one pixel left and
+// one pixel right, stored in that order at [x4]. The block is processed as
+// two 8-row halves; x5, x6 and x7 keep the up/down, left and right reference
+// pointers alive between the halves.
+WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSadFour8x16_AArch64_neon
+    sxtw    x1, w1
+    sxtw    x3, w3
+    LOAD_8X8_1                      // source rows 0-7
+    sub     x5, x2, x3              // x5 -> reference row -1
+    LOAD_8X8_2 x5
+    ld1     {v24.8b}, [x5], x3
+    ld1     {v25.8b}, [x5], x3      // x5 now two rows past what the next half needs
+
+    CALC_ABS_8X8_1 v28.8h, d        // up, first half (starts the sum)
+    CALC_ABS_8X8_2 d                // down, first half
+
+    sub     x6, x2, #1
+    LOAD_8X8_2 x6
+    CALC_ABS_8X8_1 v30.8h, d        // left, first half
+
+    add     x7, x2, #1
+    LOAD_8X8_2 x7
+    CALC_ABS_8X8_1 v31.8h, d        // right, first half
+
+    LOAD_8X8_1                      // source rows 8-15
+    sub     x5, x5, x3
+    sub     x5, x5, x3              // step back to reference row 7
+    LOAD_8X8_2 x5
+    ld1     {v24.8b}, [x5], x3
+    ld1     {v25.8b}, [x5]
+
+    CALC_ABS_8X8_1 v28.8h, a        // second half: accumulate onto the first
+    CALC_ABS_8X8_2 a
+
+    LOAD_8X8_2 x6
+    CALC_ABS_8X8_1 v30.8h, a
+
+    LOAD_8X8_2 x7
+    CALC_ABS_8X8_1 v31.8h, a
+
+    CALC_AND_STORE_SAD_FOUR
+WELS_ASM_ARCH64_FUNC_END
+
+// Four 16-wide, 8-row SADs of the block at x0 (stride w1) against the block
+// at x2 (stride w3) displaced by one row up, one row down, one pixel left and
+// one pixel right. The four sums are stored in that order at [x4].
+WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSadFour16x8_AArch64_neon
+    sxtw    x1, w1
+    sxtw    x3, w3
+    LOAD_16X8_1                     // source rows 0-7 -> v0..v7
+    sub     x0, x2, x3              // x0 -> reference row -1
+    LOAD_16X8_2 x0                  // reference rows -1..6 -> v16..v23
+    ld1     {v24.16b}, [x0], x3     // reference row 7
+    ld1     {v25.16b}, [x0]         // reference row 8
+
+    CALC_ABS_16X8_1 v28.8h, d       // displaced one row up
+    CALC_ABS_16X8_2 d               // displaced one row down, into v29
+
+    sub     x0, x2, #1              // displaced one pixel left
+    LOAD_16X8_2 x0
+    CALC_ABS_16X8_1 v30.8h, d
+
+    add     x0, x2, #1              // displaced one pixel right
+    LOAD_16X8_2 x0
+    CALC_ABS_16X8_1 v31.8h, d
+
+    CALC_AND_STORE_SAD_FOUR
+WELS_ASM_ARCH64_FUNC_END
+
+// Four 16x16 SADs of the block at x0 (stride w1) against the block at x2
+// (stride w3) displaced by one row up, one row down, one pixel left and one
+// pixel right, stored in that order at [x4]. The block is processed as two
+// 8-row halves; x5, x6 and x7 keep the up/down, left and right reference
+// pointers alive between the halves.
+WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSadFour16x16_AArch64_neon
+    sxtw    x1, w1
+    sxtw    x3, w3
+
+    LOAD_16X8_1                     // source rows 0-7
+    sub     x5, x2, x3              // x5 -> reference row -1
+    LOAD_16X8_2 x5
+    ld1     {v24.16b}, [x5], x3
+    ld1     {v25.16b}, [x5], x3     // x5 now two rows past what the next half needs
+
+    CALC_ABS_16X8_1 v28.8h, d       // up, first half (starts the sum)
+    CALC_ABS_16X8_2 d               // down, first half
+
+    sub     x6, x2, #1
+    LOAD_16X8_2 x6
+    CALC_ABS_16X8_1 v30.8h, d       // left, first half
+
+    add     x7, x2, #1
+    LOAD_16X8_2 x7
+    CALC_ABS_16X8_1 v31.8h, d       // right, first half
+
+    LOAD_16X8_1                     // source rows 8-15
+    sub     x5, x5, x3
+    sub     x5, x5, x3              // step back to reference row 7
+    LOAD_16X8_2 x5
+    ld1     {v24.16b}, [x5], x3
+    ld1     {v25.16b}, [x5]
+
+    CALC_ABS_16X8_1 v28.8h, a       // second half: accumulate onto the first
+    CALC_ABS_16X8_2 a
+
+    LOAD_16X8_2 x6
+    CALC_ABS_16X8_1 v30.8h, a
+
+    LOAD_16X8_2 x7
+    CALC_ABS_16X8_1 v31.8h, a
+
+    CALC_AND_STORE_SAD_FOUR
+WELS_ASM_ARCH64_FUNC_END
+
+// 4x4 transformed-difference cost: the 16 pixel differences between the block
+// at x0 (stride w1) and the block at x2 (stride w3) go through add/sub
+// butterfly stages, the absolute values of the results are summed, and
+// (sum + 1) >> 1 is left in w0. x4 is used as scratch.
+WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSatd4x4_AArch64_neon
+    sxtw    x1, w1
+    sxtw    x3, w3
+    ld1     {v0.s}[0], [x0], x1
+    ld1     {v0.s}[1], [x0], x1
+    ld1     {v1.s}[0], [x0], x1
+    ld1     {v1.s}[1], [x0]
+
+    ld1     {v2.s}[0], [x2], x3
+    ld1     {v2.s}[1], [x2], x3
+    ld1     {v3.s}[0], [x2], x3
+    ld1     {v3.s}[1], [x2]
+    usubl   v4.8h, v0.8b, v2.8b //{0,1,2,3,4,5,6,7}
+    usubl   v5.8h, v1.8b, v3.8b //{8,9,10,11,12,13,14,15}
+
+    //Do the vertical transform
+    add     v6.8h, v4.8h, v5.8h //{0,4,8,12,1,5,9,13}
+    sub     v7.8h, v4.8h, v5.8h //{2,6,10,14,3,7,11,15}
+    mov     x4,      v6.d[1]    // swap v6.d[1] with v7.d[0] through x4
+    mov     v6.d[1], v7.d[0]
+    ins     v7.d[0], x4
+    add     v4.8h, v6.8h, v7.8h
+    sub     v5.8h, v6.8h, v7.8h
+
+    //Do the horizontal transform
+    trn1    v6.4s, v4.4s, v5.4s
+    trn2    v7.4s, v4.4s, v5.4s
+    add     v4.8h, v6.8h, v7.8h
+    sub     v5.8h, v6.8h, v7.8h
+    trn1    v6.8h, v4.8h, v5.8h
+    trn2    v7.8h, v4.8h, v5.8h
+    add     v4.8h, v6.8h, v7.8h
+    abs     v4.8h, v4.8h
+    saba    v4.8h, v6.8h, v7.8h // v4 = |v6 + v7| + |v6 - v7|
+    uaddlv  s4, v4.8h
+    fmov    w0, s4
+    add     w0, w0, #1
+    lsr     w0, w0, #1
+
+WELS_ASM_ARCH64_FUNC_END
+
+// Processes four rows of 8 pixels for the 8-wide SATD routines.
+// Loads rows from [x0] (stride x1) and [x2] (stride x3), advancing both
+// pointers by four rows, widens the differences to 16 bits and runs them
+// through add/sub butterfly stages across rows and across lanes. The partial
+// results are left in v0 and v1 for the caller to accumulate.
+// Writes v0-v7, v16-v19 and v25-v28.
+.macro SATD_8x4
+    ld1     {v0.8b}, [x0], x1
+    ld1     {v1.8b}, [x2], x3
+    ld1     {v2.8b}, [x0], x1
+    usubl   v16.8h,  v0.8b, v1.8b
+
+    ld1     {v3.8b}, [x2], x3
+    usubl   v17.8h,  v2.8b, v3.8b
+    ld1     {v4.8b}, [x0], x1
+    ld1     {v5.8b}, [x2], x3
+
+    add     v25.8h,  v16.8h, v17.8h
+    usubl   v18.8h,  v4.8b,  v5.8b
+
+    ld1     {v6.8b}, [x0], x1
+    ld1     {v7.8b}, [x2], x3
+
+    usubl   v19.8h,  v6.8b,  v7.8b
+    sub     v26.8h,  v16.8h, v17.8h
+
+    add     v27.8h,  v18.8h, v19.8h
+    sub     v28.8h,  v18.8h, v19.8h
+
+    add     v0.8h,  v25.8h, v27.8h
+    sub     v1.8h,  v25.8h, v27.8h
+
+    add     v2.8h,  v26.8h, v28.8h
+    sub     v3.8h,  v26.8h, v28.8h
+
+    trn1    v4.8h, v0.8h, v1.8h
+    trn2    v5.8h, v0.8h, v1.8h
+    trn1    v6.8h, v2.8h, v3.8h
+    trn2    v7.8h, v2.8h, v3.8h
+
+    add     v16.8h, v4.8h, v5.8h
+    sabd    v17.8h, v4.8h, v5.8h
+    abs     v16.8h, v16.8h
+    add     v18.8h, v6.8h, v7.8h
+    sabd    v19.8h, v6.8h, v7.8h
+    abs     v18.8h, v18.8h
+
+    trn1    v4.4s, v16.4s, v17.4s
+    trn2    v5.4s, v16.4s, v17.4s
+    trn1    v6.4s, v18.4s, v19.4s
+    trn2    v7.4s, v18.4s, v19.4s
+
+    smax    v0.8h, v4.8h, v5.8h
+    smax    v1.8h, v6.8h, v7.8h
+.endm
+
+// Processes four rows of 16 pixels for the 16-wide SATD routines.
+// Loads rows from [x0] (stride x1) and [x2] (stride x3), advancing both
+// pointers by four rows; the low halves of the rows are widened into
+// v16-v19 and the high halves into v24-v27, then both go through the same
+// add/sub butterfly stages. The partial results are left in v0 and v2 for
+// the caller to accumulate.
+// Writes v0-v7, v16-v19 and v24-v27.
+.macro SATD_16x4
+    ld1     {v0.16b}, [x0], x1
+    ld1     {v1.16b}, [x2], x3
+    ld1     {v2.16b}, [x0], x1
+    usubl   v16.8h,  v0.8b, v1.8b
+    usubl2  v24.8h,  v0.16b, v1.16b
+
+    ld1     {v3.16b}, [x2], x3
+    usubl   v17.8h,  v2.8b, v3.8b
+    usubl2  v25.8h,  v2.16b, v3.16b
+
+    ld1     {v4.16b}, [x0], x1
+    ld1     {v5.16b}, [x2], x3
+    usubl   v18.8h,  v4.8b, v5.8b
+    usubl2  v26.8h,  v4.16b, v5.16b
+
+    ld1     {v6.16b}, [x0], x1
+    ld1     {v7.16b}, [x2], x3
+    usubl   v19.8h,  v6.8b, v7.8b
+    usubl2  v27.8h,  v6.16b, v7.16b
+
+    add     v0.8h,  v16.8h, v17.8h
+    sub     v1.8h,  v16.8h, v17.8h
+    add     v2.8h,  v18.8h, v19.8h
+    sub     v3.8h,  v18.8h, v19.8h
+
+    add     v4.8h,  v24.8h, v25.8h
+    sub     v5.8h,  v24.8h, v25.8h
+    add     v6.8h,  v26.8h, v27.8h
+    sub     v7.8h,  v26.8h, v27.8h
+
+    add     v16.8h,  v0.8h, v2.8h
+    sub     v18.8h,  v0.8h, v2.8h
+    add     v17.8h,  v4.8h, v6.8h
+    sub     v19.8h,  v4.8h, v6.8h
+
+    add     v0.8h,  v1.8h, v3.8h
+    sub     v2.8h,  v1.8h, v3.8h
+    add     v1.8h,  v5.8h, v7.8h
+    sub     v3.8h,  v5.8h, v7.8h
+
+    trn1    v4.8h, v16.8h, v18.8h
+    trn2    v6.8h, v16.8h, v18.8h
+    trn1    v5.8h, v17.8h, v19.8h
+    trn2    v7.8h, v17.8h, v19.8h
+
+    add     v16.8h, v4.8h, v6.8h
+    sabd    v18.8h, v4.8h, v6.8h
+    add     v17.8h, v5.8h, v7.8h
+    sabd    v19.8h, v5.8h, v7.8h
+    abs     v16.8h, v16.8h
+    abs     v17.8h, v17.8h
+
+    trn1    v4.8h, v0.8h, v2.8h
+    trn2    v6.8h, v0.8h, v2.8h
+    trn1    v5.8h, v1.8h, v3.8h
+    trn2    v7.8h, v1.8h, v3.8h
+
+    add     v0.8h, v4.8h, v6.8h
+    sabd    v2.8h, v4.8h, v6.8h
+    add     v1.8h, v5.8h, v7.8h
+    sabd    v3.8h, v5.8h, v7.8h
+    abs     v0.8h, v0.8h
+    abs     v1.8h, v1.8h
+
+    trn1    v4.4s, v16.4s, v18.4s
+    trn2    v6.4s, v16.4s, v18.4s
+    trn1    v5.4s, v17.4s, v19.4s
+    trn2    v7.4s, v17.4s, v19.4s
+
+    trn1    v16.4s, v0.4s, v2.4s
+    trn2    v18.4s, v0.4s, v2.4s
+    trn1    v17.4s, v1.4s, v3.4s
+    trn2    v19.4s, v1.4s, v3.4s
+
+    smax    v0.8h, v4.8h, v6.8h
+    smax    v1.8h, v5.8h, v7.8h
+    smax    v2.8h, v16.8h, v18.8h
+    smax    v3.8h, v17.8h, v19.8h
+    add     v0.8h, v0.8h, v1.8h
+    add     v2.8h, v2.8h, v3.8h
+.endm
+
+// 16x16 SATD: runs SATD_16x4 four times (the macro advances x0 and x2 by four
+// rows each time), accumulates its v0 / v2 results in v31 and returns the
+// lane sum in w0.
+//   x0 / w1: first block and its stride, x2 / w3: second block and its stride.
+WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSatd16x16_AArch64_neon
+    sxtw    x1, w1
+    sxtw    x3, w3
+    SATD_16x4
+    add     v31.8h, v0.8h, v2.8h
+.rept 3
+    SATD_16x4
+    add     v31.8h, v31.8h, v0.8h
+    add     v31.8h, v31.8h, v2.8h
+.endr
+    uaddlv  s4, v31.8h
+    fmov    w0, s4
+WELS_ASM_ARCH64_FUNC_END
+
+// 16-wide, 8-row SATD: runs SATD_16x4 twice, accumulates its v0 / v2 results
+// in v31 and returns the lane sum in w0.
+//   x0 / w1: first block and its stride, x2 / w3: second block and its stride.
+WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSatd16x8_AArch64_neon
+    sxtw    x1, w1
+    sxtw    x3, w3
+    SATD_16x4
+    add     v31.8h, v0.8h, v2.8h
+
+    SATD_16x4
+    add     v31.8h, v31.8h, v0.8h
+    add     v31.8h, v31.8h, v2.8h
+
+    uaddlv  s4, v31.8h
+    fmov    w0, s4
+WELS_ASM_ARCH64_FUNC_END
+
+// 8-wide, 16-row SATD: runs SATD_8x4 four times, accumulates its v0 / v1
+// results in v31 and returns the lane sum in w0.
+//   x0 / w1: first block and its stride, x2 / w3: second block and its stride.
+WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSatd8x16_AArch64_neon
+    sxtw    x1, w1
+    sxtw    x3, w3
+    SATD_8x4
+    add     v31.8h, v0.8h, v1.8h
+.rept 3
+    SATD_8x4
+    add     v31.8h, v31.8h, v0.8h
+    add     v31.8h, v31.8h, v1.8h
+.endr
+    uaddlv  s4, v31.8h
+    fmov    w0, s4
+WELS_ASM_ARCH64_FUNC_END
+
+// 8x8 SATD: runs SATD_8x4 twice, accumulates its v0 / v1 results in v31 and
+// returns the lane sum in w0.
+//   x0 / w1: first block and its stride, x2 / w3: second block and its stride.
+WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSatd8x8_AArch64_neon
+    sxtw    x1, w1
+    sxtw    x3, w3
+    SATD_8x4
+    add     v31.8h, v0.8h, v1.8h
+
+    SATD_8x4
+    add     v31.8h, v31.8h, v0.8h
+    add     v31.8h, v31.8h, v1.8h
+    uaddlv  s4, v31.8h
+    fmov    w0, s4
+WELS_ASM_ARCH64_FUNC_END
+#endif
\ No newline at end of file
--- a/codec/encoder/core/arm64/pixel_neon_aarch64.S
+++ /dev/null
@@ -1,706 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#ifdef HAVE_NEON_AARCH64
-.text
-#include "arm_arch64_common_macro.S"
-
-.macro CALC_AND_STORE_SAD
-    saddlv  s2, v2.8h
-    fmov    w0, s2
-.endm
-
-.macro CALC_AND_STORE_SAD_FOUR
-    saddlv  s28, v28.8h
-    saddlv  s29, v29.8h
-    saddlv  s30, v30.8h
-    saddlv  s31, v31.8h
-    st4     {v28.s, v29.s, v30.s, v31.s}[0], [x4]
-.endm
-
-.macro LOAD_8X8_1
-    ld1     {v0.8b}, [x0], x1
-    ld1     {v1.8b}, [x0], x1
-    ld1     {v2.8b}, [x0], x1
-    ld1     {v3.8b}, [x0], x1
-    ld1     {v4.8b}, [x0], x1
-    ld1     {v5.8b}, [x0], x1
-    ld1     {v6.8b}, [x0], x1
-    ld1     {v7.8b}, [x0], x1
-.endm
-
-.macro LOAD_16X8_1
-    ld1     {v0.16b}, [x0], x1
-    ld1     {v1.16b}, [x0], x1
-    ld1     {v2.16b}, [x0], x1
-    ld1     {v3.16b}, [x0], x1
-    ld1     {v4.16b}, [x0], x1
-    ld1     {v5.16b}, [x0], x1
-    ld1     {v6.16b}, [x0], x1
-    ld1     {v7.16b}, [x0], x1
-.endm
-
-#ifdef __APPLE__
-.macro LOAD_8X8_2
-    ld1     {v16.8b}, [$0], x3
-    ld1     {v17.8b}, [$0], x3
-    ld1     {v18.8b}, [$0], x3
-    ld1     {v19.8b}, [$0], x3
-    ld1     {v20.8b}, [$0], x3
-    ld1     {v21.8b}, [$0], x3
-    ld1     {v22.8b}, [$0], x3
-    ld1     {v23.8b}, [$0], x3
-.endm
-
-.macro CALC_ABS_8X8_1
-    uab$1l  $0, v0.8b, v16.8b
-    uabal   $0, v1.8b, v17.8b
-    uabal   $0, v2.8b, v18.8b
-    uabal   $0, v3.8b, v19.8b
-    uabal   $0, v4.8b, v20.8b
-    uabal   $0, v5.8b, v21.8b
-    uabal   $0, v6.8b, v22.8b
-    uabal   $0, v7.8b, v23.8b
-.endm
-
-.macro CALC_ABS_8X8_2
-    uab$0l  v29.8h, v0.8b, v18.8b
-    uabal   v29.8h, v1.8b, v19.8b
-    uabal   v29.8h, v2.8b, v20.8b
-    uabal   v29.8h, v3.8b, v21.8b
-    uabal   v29.8h, v4.8b, v22.8b
-    uabal   v29.8h, v5.8b, v23.8b
-    uabal   v29.8h, v6.8b, v24.8b
-    uabal   v29.8h, v7.8b, v25.8b
-.endm
-
-.macro LOAD_16X8_2
-    ld1     {v16.16b}, [$0], x3
-    ld1     {v17.16b}, [$0], x3
-    ld1     {v18.16b}, [$0], x3
-    ld1     {v19.16b}, [$0], x3
-    ld1     {v20.16b}, [$0], x3
-    ld1     {v21.16b}, [$0], x3
-    ld1     {v22.16b}, [$0], x3
-    ld1     {v23.16b}, [$0], x3
-.endm
-
-.macro CALC_ABS_16X8_1
-    uab$1l  $0, v0.8b, v16.8b
-    uabal2  $0, v0.16b,v16.16b
-    uabal   $0, v1.8b, v17.8b
-    uabal2  $0, v1.16b,v17.16b
-    uabal   $0, v2.8b, v18.8b
-    uabal2  $0, v2.16b,v18.16b
-    uabal   $0, v3.8b, v19.8b
-    uabal2  $0, v3.16b,v19.16b
-    uabal   $0, v4.8b, v20.8b
-    uabal2  $0, v4.16b,v20.16b
-    uabal   $0, v5.8b, v21.8b
-    uabal2  $0, v5.16b,v21.16b
-    uabal   $0, v6.8b, v22.8b
-    uabal2  $0, v6.16b,v22.16b
-    uabal   $0, v7.8b, v23.8b
-    uabal2  $0, v7.16b,v23.16b
-.endm
-
-.macro CALC_ABS_16X8_2
-    uab$0l  v29.8h, v0.8b, v18.8b
-    uabal2  v29.8h, v0.16b,v18.16b
-    uabal   v29.8h, v1.8b, v19.8b
-    uabal2  v29.8h, v1.16b,v19.16b
-    uabal   v29.8h, v2.8b, v20.8b
-    uabal2  v29.8h, v2.16b,v20.16b
-    uabal   v29.8h, v3.8b, v21.8b
-    uabal2  v29.8h, v3.16b,v21.16b
-    uabal   v29.8h, v4.8b, v22.8b
-    uabal2  v29.8h, v4.16b,v22.16b
-    uabal   v29.8h, v5.8b, v23.8b
-    uabal2  v29.8h, v5.16b,v23.16b
-    uabal   v29.8h, v6.8b, v24.8b
-    uabal2  v29.8h, v6.16b,v24.16b
-    uabal   v29.8h, v7.8b, v25.8b
-    uabal2  v29.8h, v7.16b,v25.16b
-.endm
-#else
-.macro LOAD_8X8_2 arg0
-    ld1     {v16.8b}, [\arg0], x3
-    ld1     {v17.8b}, [\arg0], x3
-    ld1     {v18.8b}, [\arg0], x3
-    ld1     {v19.8b}, [\arg0], x3
-    ld1     {v20.8b}, [\arg0], x3
-    ld1     {v21.8b}, [\arg0], x3
-    ld1     {v22.8b}, [\arg0], x3
-    ld1     {v23.8b}, [\arg0], x3
-.endm
-
-.macro CALC_ABS_8X8_1 arg0, arg1
-    uab\arg1\()l    \arg0, v0.8b, v16.8b
-    uabal   \arg0, v1.8b, v17.8b
-    uabal   \arg0, v2.8b, v18.8b
-    uabal   \arg0, v3.8b, v19.8b
-    uabal   \arg0, v4.8b, v20.8b
-    uabal   \arg0, v5.8b, v21.8b
-    uabal   \arg0, v6.8b, v22.8b
-    uabal   \arg0, v7.8b, v23.8b
-.endm
-
-.macro CALC_ABS_8X8_2 arg0
-    uab\arg0\()l    v29.8h, v0.8b, v18.8b
-    uabal   v29.8h, v1.8b, v19.8b
-    uabal   v29.8h, v2.8b, v20.8b
-    uabal   v29.8h, v3.8b, v21.8b
-    uabal   v29.8h, v4.8b, v22.8b
-    uabal   v29.8h, v5.8b, v23.8b
-    uabal   v29.8h, v6.8b, v24.8b
-    uabal   v29.8h, v7.8b, v25.8b
-.endm
-
-.macro LOAD_16X8_2 arg0
-    ld1     {v16.16b}, [\arg0], x3
-    ld1     {v17.16b}, [\arg0], x3
-    ld1     {v18.16b}, [\arg0], x3
-    ld1     {v19.16b}, [\arg0], x3
-    ld1     {v20.16b}, [\arg0], x3
-    ld1     {v21.16b}, [\arg0], x3
-    ld1     {v22.16b}, [\arg0], x3
-    ld1     {v23.16b}, [\arg0], x3
-.endm
-
-.macro CALC_ABS_16X8_1 arg0, arg1
-    uab\arg1\()l  \arg0, v0.8b, v16.8b
-    uabal2  \arg0, v0.16b,v16.16b
-    uabal   \arg0, v1.8b, v17.8b
-    uabal2  \arg0, v1.16b,v17.16b
-    uabal   \arg0, v2.8b, v18.8b
-    uabal2  \arg0, v2.16b,v18.16b
-    uabal   \arg0, v3.8b, v19.8b
-    uabal2  \arg0, v3.16b,v19.16b
-    uabal   \arg0, v4.8b, v20.8b
-    uabal2  \arg0, v4.16b,v20.16b
-    uabal   \arg0, v5.8b, v21.8b
-    uabal2  \arg0, v5.16b,v21.16b
-    uabal   \arg0, v6.8b, v22.8b
-    uabal2  \arg0, v6.16b,v22.16b
-    uabal   \arg0, v7.8b, v23.8b
-    uabal2  \arg0, v7.16b,v23.16b
-.endm
-
-.macro CALC_ABS_16X8_2 arg0
-    uab\arg0\()l  v29.8h, v0.8b, v18.8b
-    uabal2  v29.8h, v0.16b,v18.16b
-    uabal   v29.8h, v1.8b, v19.8b
-    uabal2  v29.8h, v1.16b,v19.16b
-    uabal   v29.8h, v2.8b, v20.8b
-    uabal2  v29.8h, v2.16b,v20.16b
-    uabal   v29.8h, v3.8b, v21.8b
-    uabal2  v29.8h, v3.16b,v21.16b
-    uabal   v29.8h, v4.8b, v22.8b
-    uabal2  v29.8h, v4.16b,v22.16b
-    uabal   v29.8h, v5.8b, v23.8b
-    uabal2  v29.8h, v5.16b,v23.16b
-    uabal   v29.8h, v6.8b, v24.8b
-    uabal2  v29.8h, v6.16b,v24.16b
-    uabal   v29.8h, v7.8b, v25.8b
-    uabal2  v29.8h, v7.16b,v25.16b
-.endm
-#endif
-
-WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSad4x4_AArch64_neon
-    sxtw    x1, w1
-    sxtw    x3, w3
-    ld1     {v0.s}[0], [x0], x1
-    ld1     {v1.s}[0], [x2], x3
-    uabdl   v2.8h, v0.8b, v1.8b
-.rept 3
-    ld1     {v0.s}[0], [x0], x1
-    ld1     {v1.s}[0], [x2], x3
-    uabal   v2.8h, v0.8b, v1.8b
-.endr
-    saddlv  s2, v2.4h
-    fmov    w0, s2
-WELS_ASM_ARCH64_FUNC_END
-
-WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSad8x8_AArch64_neon
-    sxtw    x1, w1
-    sxtw    x3, w3
-    ld1     {v0.8b}, [x0], x1
-    ld1     {v1.8b}, [x2], x3
-    uabdl   v2.8h, v0.8b, v1.8b
-.rept 7
-    ld1     {v0.8b}, [x0], x1
-    ld1     {v1.8b}, [x2], x3
-    uabal   v2.8h, v0.8b, v1.8b
-.endr
-    CALC_AND_STORE_SAD
-WELS_ASM_ARCH64_FUNC_END
-
-WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSad8x16_AArch64_neon
-    sxtw    x1, w1
-    sxtw    x3, w3
-    ld1     {v0.8b}, [x0], x1
-    ld1     {v1.8b}, [x2], x3
-    uabdl   v2.8h, v0.8b, v1.8b
-.rept 15
-    ld1     {v0.8b}, [x0], x1
-    ld1     {v1.8b}, [x2], x3
-    uabal   v2.8h, v0.8b, v1.8b
-.endr
-    CALC_AND_STORE_SAD
-WELS_ASM_ARCH64_FUNC_END
-
-WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSad16x8_AArch64_neon
-    sxtw    x1, w1
-    sxtw    x3, w3
-    ld1     {v0.16b}, [x0], x1
-    ld1     {v1.16b}, [x2], x3
-    uabdl   v2.8h, v0.8b, v1.8b
-    uabal2  v2.8h, v0.16b, v1.16b
-.rept 7
-    ld1     {v0.16b}, [x0], x1
-    ld1     {v1.16b}, [x2], x3
-    uabal   v2.8h, v0.8b, v1.8b
-    uabal2  v2.8h, v0.16b, v1.16b
-.endr
-    CALC_AND_STORE_SAD
-WELS_ASM_ARCH64_FUNC_END
-
-WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSad16x16_AArch64_neon
-    sxtw    x1, w1
-    sxtw    x3, w3
-    ld1     {v0.16b}, [x0], x1
-    ld1     {v1.16b}, [x2], x3
-    uabdl   v2.8h, v0.8b, v1.8b
-    uabal2  v2.8h, v0.16b, v1.16b
-.rept 15
-    ld1     {v0.16b}, [x0], x1
-    ld1     {v1.16b}, [x2], x3
-    uabal   v2.8h, v0.8b, v1.8b
-    uabal2  v2.8h, v0.16b, v1.16b
-.endr
-    CALC_AND_STORE_SAD
-WELS_ASM_ARCH64_FUNC_END
-
-WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSadFour4x4_AArch64_neon
-    sxtw    x1, w1
-    sxtw    x3, w3
-    ld1     {v0.s}[0], [x0], x1
-    ld1     {v0.s}[1], [x0], x1
-    ld1     {v1.s}[0], [x0], x1
-    ld1     {v1.s}[1], [x0]
-    sub     x0, x2, x3
-    ld1     {v2.s}[0], [x0], x3
-    ld1     {v2.s}[1], [x0], x3
-    ld1     {v3.s}[0], [x0], x3
-    ld1     {v3.s}[1], [x0], x3
-    ld1     {v4.s}[0], [x0], x3
-    ld1     {v4.s}[1], [x0], x3
-
-    uabdl   v28.8h, v0.8b, v2.8b
-    uabal   v28.8h, v1.8b, v3.8b
-
-    uabdl   v29.8h, v0.8b, v3.8b
-    uabal   v29.8h, v1.8b, v4.8b
-
-    sub     x0, x2, #1
-    ld1     {v2.s}[0], [x0], x3
-    ld1     {v2.s}[1], [x0], x3
-    ld1     {v3.s}[0], [x0], x3
-    ld1     {v3.s}[1], [x0]
-    uabdl   v30.8h, v0.8b, v2.8b
-    uabal   v30.8h, v1.8b, v3.8b
-
-    add     x0, x2, #1
-    ld1     {v2.s}[0], [x0], x3
-    ld1     {v2.s}[1], [x0], x3
-    ld1     {v3.s}[0], [x0], x3
-    ld1     {v3.s}[1], [x0]
-    uabdl   v31.8h, v0.8b, v2.8b
-    uabal   v31.8h, v1.8b, v3.8b
-
-    CALC_AND_STORE_SAD_FOUR
-WELS_ASM_ARCH64_FUNC_END
-
-WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSadFour8x8_AArch64_neon
-    sxtw    x1, w1
-    sxtw    x3, w3
-    LOAD_8X8_1
-    sub     x0, x2, x3
-    LOAD_8X8_2 x0
-    ld1     {v24.8b}, [x0], x3
-    ld1     {v25.8b}, [x0]
-
-    CALC_ABS_8X8_1 v28.8h, d
-    CALC_ABS_8X8_2 d
-
-    sub     x0, x2, #1
-    LOAD_8X8_2 x0
-    CALC_ABS_8X8_1 v30.8h, d
-
-    add     x0, x2, #1
-    LOAD_8X8_2 x0
-    CALC_ABS_8X8_1 v31.8h, d
-
-    CALC_AND_STORE_SAD_FOUR
-WELS_ASM_ARCH64_FUNC_END
-
-WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSadFour8x16_AArch64_neon
-    sxtw    x1, w1
-    sxtw    x3, w3
-    LOAD_8X8_1
-    sub     x5, x2, x3
-    LOAD_8X8_2 x5
-    ld1     {v24.8b}, [x5], x3
-    ld1     {v25.8b}, [x5], x3
-
-    CALC_ABS_8X8_1 v28.8h, d
-    CALC_ABS_8X8_2 d
-
-    sub     x6, x2, #1
-    LOAD_8X8_2 x6
-    CALC_ABS_8X8_1 v30.8h, d
-
-    add     x7, x2, #1
-    LOAD_8X8_2 x7
-    CALC_ABS_8X8_1 v31.8h, d
-
-    LOAD_8X8_1
-    sub     x5, x5, x3
-    sub     x5, x5, x3
-    LOAD_8X8_2 x5
-    ld1     {v24.8b}, [x5], x3
-    ld1     {v25.8b}, [x5]
-
-    CALC_ABS_8X8_1 v28.8h, a
-    CALC_ABS_8X8_2 a
-
-    LOAD_8X8_2 x6
-    CALC_ABS_8X8_1 v30.8h, a
-
-    LOAD_8X8_2 x7
-    CALC_ABS_8X8_1 v31.8h, a
-
-    CALC_AND_STORE_SAD_FOUR
-WELS_ASM_ARCH64_FUNC_END
-
-WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSadFour16x8_AArch64_neon
-    sxtw    x1, w1
-    sxtw    x3, w3
-    LOAD_16X8_1
-    sub     x0, x2, x3
-    LOAD_16X8_2 x0
-    ld1     {v24.16b}, [x0], x3
-    ld1     {v25.16b}, [x0]
-
-    CALC_ABS_16X8_1 v28.8h, d
-    CALC_ABS_16X8_2 d
-
-    sub     x0, x2, #1
-    LOAD_16X8_2 x0
-    CALC_ABS_16X8_1 v30.8h, d
-
-    add     x0, x2, #1
-    LOAD_16X8_2 x0
-    CALC_ABS_16X8_1 v31.8h, d
-
-    CALC_AND_STORE_SAD_FOUR
-WELS_ASM_ARCH64_FUNC_END
-
-WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSadFour16x16_AArch64_neon
-    sxtw    x1, w1
-    sxtw    x3, w3
-
-    LOAD_16X8_1
-    sub     x5, x2, x3
-    LOAD_16X8_2 x5
-    ld1     {v24.16b}, [x5], x3
-    ld1     {v25.16b}, [x5], x3
-
-    CALC_ABS_16X8_1 v28.8h, d
-    CALC_ABS_16X8_2 d
-
-    sub     x6, x2, #1
-    LOAD_16X8_2 x6
-    CALC_ABS_16X8_1 v30.8h, d
-
-    add     x7, x2, #1
-    LOAD_16X8_2 x7
-    CALC_ABS_16X8_1 v31.8h, d
-
-    LOAD_16X8_1
-    sub     x5, x5, x3
-    sub     x5, x5, x3
-    LOAD_16X8_2 x5
-    ld1     {v24.16b}, [x5], x3
-    ld1     {v25.16b}, [x5]
-
-    CALC_ABS_16X8_1 v28.8h, a
-    CALC_ABS_16X8_2 a
-
-    LOAD_16X8_2 x6
-    CALC_ABS_16X8_1 v30.8h, a
-
-    LOAD_16X8_2 x7
-    CALC_ABS_16X8_1 v31.8h, a
-
-    CALC_AND_STORE_SAD_FOUR
-WELS_ASM_ARCH64_FUNC_END
-
-WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSatd4x4_AArch64_neon
-    sxtw    x1, w1
-    sxtw    x3, w3
-    ld1     {v0.s}[0], [x0], x1
-    ld1     {v0.s}[1], [x0], x1
-    ld1     {v1.s}[0], [x0], x1
-    ld1     {v1.s}[1], [x0]
-
-    ld1     {v2.s}[0], [x2], x3
-    ld1     {v2.s}[1], [x2], x3
-    ld1     {v3.s}[0], [x2], x3
-    ld1     {v3.s}[1], [x2]
-    usubl   v4.8h, v0.8b, v2.8b //{0,1,2,3,4,5,6,7}
-    usubl   v5.8h, v1.8b, v3.8b //{8,9,10,11,12,13,14,15}
-
-    //Do the vertical transform
-    add     v6.8h, v4.8h, v5.8h //{0,4,8,12,1,5,9,13}
-    sub     v7.8h, v4.8h, v5.8h //{2,6,10,14,3,7,11,15}
-    mov     x4,      v6.d[1]
-    mov     v6.d[1], v7.d[0]
-    ins     v7.d[0], x4
-    add     v4.8h, v6.8h, v7.8h
-    sub     v5.8h, v6.8h, v7.8h
-
-    //Do the horizontal transform
-    trn1    v6.4s, v4.4s, v5.4s
-    trn2    v7.4s, v4.4s, v5.4s
-    add     v4.8h, v6.8h, v7.8h
-    sub     v5.8h, v6.8h, v7.8h
-    trn1    v6.8h, v4.8h, v5.8h
-    trn2    v7.8h, v4.8h, v5.8h
-    add     v4.8h, v6.8h, v7.8h
-    abs     v4.8h, v4.8h
-    saba    v4.8h, v6.8h, v7.8h
-    uaddlv  s4, v4.8h
-    fmov    w0, s4
-    add     w0, w0, #1
-    lsr     w0, w0, #1
-
-WELS_ASM_ARCH64_FUNC_END
-
-.macro SATD_8x4
-    ld1     {v0.8b}, [x0], x1
-    ld1     {v1.8b}, [x2], x3
-    ld1     {v2.8b}, [x0], x1
-    usubl   v16.8h,  v0.8b, v1.8b
-
-    ld1     {v3.8b}, [x2], x3
-    usubl   v17.8h,  v2.8b, v3.8b
-    ld1     {v4.8b}, [x0], x1
-    ld1     {v5.8b}, [x2], x3
-
-    add     v25.8h,  v16.8h, v17.8h
-    usubl   v18.8h,  v4.8b,  v5.8b
-
-    ld1     {v6.8b}, [x0], x1
-    ld1     {v7.8b}, [x2], x3
-
-    usubl   v19.8h,  v6.8b,  v7.8b
-    sub     v26.8h,  v16.8h, v17.8h
-
-    add     v27.8h,  v18.8h, v19.8h
-    sub     v28.8h,  v18.8h, v19.8h
-
-    add     v0.8h,  v25.8h, v27.8h
-    sub     v1.8h,  v25.8h, v27.8h
-
-    add     v2.8h,  v26.8h, v28.8h
-    sub     v3.8h,  v26.8h, v28.8h
-
-    trn1    v4.8h, v0.8h, v1.8h
-    trn2    v5.8h, v0.8h, v1.8h
-    trn1    v6.8h, v2.8h, v3.8h
-    trn2    v7.8h, v2.8h, v3.8h
-
-    add     v16.8h, v4.8h, v5.8h
-    sabd    v17.8h, v4.8h, v5.8h
-    abs     v16.8h, v16.8h
-    add     v18.8h, v6.8h, v7.8h
-    sabd    v19.8h, v6.8h, v7.8h
-    abs     v18.8h, v18.8h
-
-    trn1    v4.4s, v16.4s, v17.4s
-    trn2    v5.4s, v16.4s, v17.4s
-    trn1    v6.4s, v18.4s, v19.4s
-    trn2    v7.4s, v18.4s, v19.4s
-
-    smax    v0.8h, v4.8h, v5.8h
-    smax    v1.8h, v6.8h, v7.8h
-.endm
-
-.macro SATD_16x4
-    ld1     {v0.16b}, [x0], x1
-    ld1     {v1.16b}, [x2], x3
-    ld1     {v2.16b}, [x0], x1
-    usubl   v16.8h,  v0.8b, v1.8b
-    usubl2  v24.8h,  v0.16b, v1.16b
-
-    ld1     {v3.16b}, [x2], x3
-    usubl   v17.8h,  v2.8b, v3.8b
-    usubl2  v25.8h,  v2.16b, v3.16b
-
-    ld1     {v4.16b}, [x0], x1
-    ld1     {v5.16b}, [x2], x3
-    usubl   v18.8h,  v4.8b, v5.8b
-    usubl2  v26.8h,  v4.16b, v5.16b
-
-    ld1     {v6.16b}, [x0], x1
-    ld1     {v7.16b}, [x2], x3
-    usubl   v19.8h,  v6.8b, v7.8b
-    usubl2  v27.8h,  v6.16b, v7.16b
-
-    add     v0.8h,  v16.8h, v17.8h
-    sub     v1.8h,  v16.8h, v17.8h
-    add     v2.8h,  v18.8h, v19.8h
-    sub     v3.8h,  v18.8h, v19.8h
-
-    add     v4.8h,  v24.8h, v25.8h
-    sub     v5.8h,  v24.8h, v25.8h
-    add     v6.8h,  v26.8h, v27.8h
-    sub     v7.8h,  v26.8h, v27.8h
-
-    add     v16.8h,  v0.8h, v2.8h
-    sub     v18.8h,  v0.8h, v2.8h
-    add     v17.8h,  v4.8h, v6.8h
-    sub     v19.8h,  v4.8h, v6.8h
-
-    add     v0.8h,  v1.8h, v3.8h
-    sub     v2.8h,  v1.8h, v3.8h
-    add     v1.8h,  v5.8h, v7.8h
-    sub     v3.8h,  v5.8h, v7.8h
-
-    trn1    v4.8h, v16.8h, v18.8h
-    trn2    v6.8h, v16.8h, v18.8h
-    trn1    v5.8h, v17.8h, v19.8h
-    trn2    v7.8h, v17.8h, v19.8h
-
-    add     v16.8h, v4.8h, v6.8h
-    sabd    v18.8h, v4.8h, v6.8h
-    add     v17.8h, v5.8h, v7.8h
-    sabd    v19.8h, v5.8h, v7.8h
-    abs     v16.8h, v16.8h
-    abs     v17.8h, v17.8h
-
-    trn1    v4.8h, v0.8h, v2.8h
-    trn2    v6.8h, v0.8h, v2.8h
-    trn1    v5.8h, v1.8h, v3.8h
-    trn2    v7.8h, v1.8h, v3.8h
-
-    add     v0.8h, v4.8h, v6.8h
-    sabd    v2.8h, v4.8h, v6.8h
-    add     v1.8h, v5.8h, v7.8h
-    sabd    v3.8h, v5.8h, v7.8h
-    abs     v0.8h, v0.8h
-    abs     v1.8h, v1.8h
-
-    trn1    v4.4s, v16.4s, v18.4s
-    trn2    v6.4s, v16.4s, v18.4s
-    trn1    v5.4s, v17.4s, v19.4s
-    trn2    v7.4s, v17.4s, v19.4s
-
-    trn1    v16.4s, v0.4s, v2.4s
-    trn2    v18.4s, v0.4s, v2.4s
-    trn1    v17.4s, v1.4s, v3.4s
-    trn2    v19.4s, v1.4s, v3.4s
-
-    smax    v0.8h, v4.8h, v6.8h
-    smax    v1.8h, v5.8h, v7.8h
-    smax    v2.8h, v16.8h, v18.8h
-    smax    v3.8h, v17.8h, v19.8h
-    add     v0.8h, v0.8h, v1.8h
-    add     v2.8h, v2.8h, v3.8h
-.endm
-
-WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSatd16x16_AArch64_neon
-    sxtw    x1, w1
-    sxtw    x3, w3
-    SATD_16x4
-    add     v31.8h, v0.8h, v2.8h
-.rept 3
-    SATD_16x4
-    add     v31.8h, v31.8h, v0.8h
-    add     v31.8h, v31.8h, v2.8h
-.endr
-    uaddlv  s4, v31.8h
-    fmov    w0, s4
-WELS_ASM_ARCH64_FUNC_END
-
-WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSatd16x8_AArch64_neon
-    sxtw    x1, w1
-    sxtw    x3, w3
-    SATD_16x4
-    add     v31.8h, v0.8h, v2.8h
-
-    SATD_16x4
-    add     v31.8h, v31.8h, v0.8h
-    add     v31.8h, v31.8h, v2.8h
-
-    uaddlv  s4, v31.8h
-    fmov    w0, s4
-WELS_ASM_ARCH64_FUNC_END
-
-WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSatd8x16_AArch64_neon
-    sxtw    x1, w1
-    sxtw    x3, w3
-    SATD_8x4
-    add     v31.8h, v0.8h, v1.8h
-.rept 3
-    SATD_8x4
-    add     v31.8h, v31.8h, v0.8h
-    add     v31.8h, v31.8h, v1.8h
-.endr
-    uaddlv  s4, v31.8h
-    fmov    w0, s4
-WELS_ASM_ARCH64_FUNC_END
-
-WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSatd8x8_AArch64_neon
-    sxtw    x1, w1
-    sxtw    x3, w3
-    SATD_8x4
-    add     v31.8h, v0.8h, v1.8h
-
-    SATD_8x4
-    add     v31.8h, v31.8h, v0.8h
-    add     v31.8h, v31.8h, v1.8h
-    uaddlv  s4, v31.8h
-    fmov    w0, s4
-WELS_ASM_ARCH64_FUNC_END
-#endif
\ No newline at end of file
--- a/codec/encoder/core/inc/get_intra_predictor.h
+++ b/codec/encoder/core/inc/get_intra_predictor.h
@@ -135,6 +135,32 @@
 void WelsIChromaPredDc_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
 void WelsIChromaPredPlane_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
 #endif//HAVE_NEON
+
+#if defined(HAVE_NEON_AARCH64) // prototypes for the AArch64 NEON predictors; get_intra_predictor.cpp installs them when WELS_CPU_NEON is set
+void WelsI16x16LumaPredV_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); // 16x16 luma group: entries of pfGetLumaI16x16Pred[]
+void WelsI16x16LumaPredH_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI16x16LumaPredDc_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI16x16LumaPredPlane_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI16x16LumaPredDcTop_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI16x16LumaPredDcLeft_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+
+void WelsI4x4LumaPredH_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); // 4x4 luma group: entries of pfGetLumaI4x4Pred[]
+void WelsI4x4LumaPredDDL_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI4x4LumaPredDDLTop_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI4x4LumaPredVL_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI4x4LumaPredVLTop_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI4x4LumaPredVR_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI4x4LumaPredHU_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI4x4LumaPredHD_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI4x4LumaPredDc_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI4x4LumaPredDcTop_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+
+void WelsIChromaPredV_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); // chroma group: entries of pfGetChromaPred[]
+void WelsIChromaPredH_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsIChromaPredDc_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsIChromaPredPlane_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsIChromaPredDcTop_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+#endif//HAVE_NEON_AARCH64
 #if defined(__cplusplus)
 }
 #endif//__cplusplus
--- a/codec/encoder/core/src/get_intra_predictor.cpp
+++ b/codec/encoder/core/src/get_intra_predictor.cpp
@@ -732,6 +732,34 @@
   }
 #endif
 
+#if defined(HAVE_NEON_AARCH64)
+  if (kuiCpuFlag & WELS_CPU_NEON) {
+    pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC] = WelsI16x16LumaPredDc_AArch64_neon;
+    pFuncList->pfGetLumaI16x16Pred[I16_PRED_P]  = WelsI16x16LumaPredPlane_AArch64_neon;
+    pFuncList->pfGetLumaI16x16Pred[I16_PRED_H]  = WelsI16x16LumaPredH_AArch64_neon;
+    pFuncList->pfGetLumaI16x16Pred[I16_PRED_V]  = WelsI16x16LumaPredV_AArch64_neon;
+    pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC_L]  = WelsI16x16LumaPredDcLeft_AArch64_neon;
+    pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC_T]  = WelsI16x16LumaPredDcTop_AArch64_neon;
+
+    pFuncList->pfGetLumaI4x4Pred[I4_PRED_H    ] = WelsI4x4LumaPredH_AArch64_neon;
+    pFuncList->pfGetLumaI4x4Pred[I4_PRED_DDL  ] = WelsI4x4LumaPredDDL_AArch64_neon;
+    pFuncList->pfGetLumaI4x4Pred[I4_PRED_DDL_TOP] = WelsI4x4LumaPredDDLTop_AArch64_neon;
+    pFuncList->pfGetLumaI4x4Pred[I4_PRED_VL   ] = WelsI4x4LumaPredVL_AArch64_neon;
+    pFuncList->pfGetLumaI4x4Pred[I4_PRED_VL_TOP ] = WelsI4x4LumaPredVLTop_AArch64_neon;
+    pFuncList->pfGetLumaI4x4Pred[I4_PRED_VR   ] = WelsI4x4LumaPredVR_AArch64_neon;
+    pFuncList->pfGetLumaI4x4Pred[I4_PRED_HU   ] = WelsI4x4LumaPredHU_AArch64_neon;
+    pFuncList->pfGetLumaI4x4Pred[I4_PRED_HD   ] = WelsI4x4LumaPredHD_AArch64_neon;
+    pFuncList->pfGetLumaI4x4Pred[I4_PRED_DC   ] = WelsI4x4LumaPredDc_AArch64_neon;
+    pFuncList->pfGetLumaI4x4Pred[I4_PRED_DC_T   ] = WelsI4x4LumaPredDcTop_AArch64_neon;
+
+    pFuncList->pfGetChromaPred[C_PRED_H]       = WelsIChromaPredH_AArch64_neon;
+    pFuncList->pfGetChromaPred[C_PRED_V]       = WelsIChromaPredV_AArch64_neon;
+    pFuncList->pfGetChromaPred[C_PRED_P ]      = WelsIChromaPredPlane_AArch64_neon;
+    pFuncList->pfGetChromaPred[C_PRED_DC]      = WelsIChromaPredDc_AArch64_neon;
+    pFuncList->pfGetChromaPred[C_PRED_DC_T]      = WelsIChromaPredDcTop_AArch64_neon;
+  }
+#endif//HAVE_NEON_AARCH64
+
 #ifdef X86_ASM
   if (kuiCpuFlag & WELS_CPU_MMXEXT) {
     pFuncList->pfGetLumaI4x4Pred[I4_PRED_DDR] = WelsI4x4LumaPredDDR_mmx;
--- a/codec/encoder/targets.mk
+++ b/codec/encoder/targets.mk
@@ -59,7 +59,8 @@
 
 ifeq ($(ASM_ARCH), arm64)
 ENCODER_ASM_ARM64_SRCS=\
-	$(ENCODER_SRCDIR)/core/arm64/pixel_neon_aarch64.S\
+	$(ENCODER_SRCDIR)/core/arm64/intra_pred_aarch64_neon.S\
+	$(ENCODER_SRCDIR)/core/arm64/pixel_aarch64_neon.S\
 
 ENCODER_OBJS += $(ENCODER_ASM_ARM64_SRCS:.S=.$(OBJ))
 endif