shithub: openh264

Download patch

ref: f0c6c2b318f7f048f83aac1bd42b5cbac8bdfb2a
parent: 594fc4fe7b9018f181714cd2a3e9d17bd96a785b
parent: a4cecd80048e187033fc4ee084abb4d6c7a79e66
author: ruil2 <[email protected]>
date: Fri Mar 7 10:59:23 EST 2014

Merge branch 'master' of https://github.com/cisco/openh264 into encoder_update

--- a/codec/build/iOS/common/common.xcodeproj/project.pbxproj
+++ b/codec/build/iOS/common/common.xcodeproj/project.pbxproj
@@ -7,7 +7,7 @@
 	objects = {
 
 /* Begin PBXBuildFile section */
-		4C34067D18C5C94C00DFA14A /* expand_picture.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34067C18C5C94C00DFA14A /* expand_picture.S */; };
+		4C34067D18C5C94C00DFA14A /* expand_picture_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34067C18C5C94C00DFA14A /* expand_picture_neon.S */; };
 		4CE443D918B722CD0017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE443D818B722CD0017DF25 /* Foundation.framework */; };
 		4CE443E718B722CD0017DF25 /* XCTest.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE443E618B722CD0017DF25 /* XCTest.framework */; };
 		4CE443E818B722CD0017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE443D818B722CD0017DF25 /* Foundation.framework */; };
@@ -47,7 +47,7 @@
 /* End PBXCopyFilesBuildPhase section */
 
 /* Begin PBXFileReference section */
-		4C34067C18C5C94C00DFA14A /* expand_picture.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = expand_picture.S; sourceTree = "<group>"; };
+		4C34067C18C5C94C00DFA14A /* expand_picture_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = expand_picture_neon.S; sourceTree = "<group>"; };
 		4CE443D518B722CD0017DF25 /* libcommon.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libcommon.a; sourceTree = BUILT_PRODUCTS_DIR; };
 		4CE443D818B722CD0017DF25 /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; };
 		4CE443E518B722CD0017DF25 /* commonTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = commonTests.xctest; sourceTree = BUILT_PRODUCTS_DIR; };
@@ -150,7 +150,7 @@
 		4CE4472F18BC61650017DF25 /* common */ = {
 			isa = PBXGroup;
 			children = (
-				4C34067C18C5C94C00DFA14A /* expand_picture.S */,
+				4C34067C18C5C94C00DFA14A /* expand_picture_neon.S */,
 				4CE447BE18C085900017DF25 /* arm_arch_common_macro.S */,
 				4CE447BC18C085320017DF25 /* deblocking_neon.S */,
 				4CE4473118BC61650017DF25 /* cpu.cpp */,
@@ -260,7 +260,7 @@
 				4CE4475018BC61650017DF25 /* deblocking_common.cpp in Sources */,
 				4CE4474C18BC61650017DF25 /* cpu.cpp in Sources */,
 				4CE4475218BC61650017DF25 /* logging.cpp in Sources */,
-				4C34067D18C5C94C00DFA14A /* expand_picture.S in Sources */,
+				4C34067D18C5C94C00DFA14A /* expand_picture_neon.S in Sources */,
 				4CE447BD18C085320017DF25 /* deblocking_neon.S in Sources */,
 				4CE4475818BC61650017DF25 /* WelsThreadLib.cpp in Sources */,
 				4CE4474E18BC61650017DF25 /* crt_util_safe_x.cpp in Sources */,
--- a/codec/common/arm_arch_common_macro.S
+++ b/codec/common/arm_arch_common_macro.S
@@ -44,6 +44,8 @@
 .endm
 #else
 
+.syntax unified
+
 .macro WELS_ASM_FUNC_BEGIN funcName
 .align 2
 .arm
--- a/codec/common/deblocking_neon.S
+++ b/codec/common/deblocking_neon.S
@@ -910,7 +910,7 @@
     beq      bs_nzc_check_jump0
 
     sub      r6, \arg0, \arg2, lsl #4
-    sub      r6, \arg2, lsl #3
+    sub      r6, r6, \arg2, lsl #3
     add      r6, #12
     vld1.32  d3[1], [r6]
 
--- a/codec/common/expand_picture.S
+++ /dev/null
@@ -1,137 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#ifdef HAVE_NEON
-.text
-#include "arm_arch_common_macro.S"
-
-
-WELS_ASM_FUNC_BEGIN ExpandPictureLuma_neon
-    stmdb sp!, {r4-r8}
-	//Save the dst
-	mov r7, r0
-	mov r8, r3
-
-	add r4, r7, r2
-	sub r4, #1
-    //For the left and right expand
-_expand_picture_luma_loop2:
-	sub r5, r7, #32
-	add r6, r4, #1
-
-	vld1.8 {d0[], d1[]}, [r7], r1
-	vld1.8 {d2[], d3[]}, [r4], r1
-
-	vst1.8 {q0}, [r5]!
-	vst1.8 {q0}, [r5]
-	vst1.8 {q1}, [r6]!
-	vst1.8 {q1}, [r6]
-	subs r8, #1
-	bne	_expand_picture_luma_loop2
-
-	//for the top and bottom expand
-	add r2, #64
-	sub r0, #32
-	mla r4, r1, r3, r0
-	sub r4, r1
-_expand_picture_luma_loop0:
-	mov r5, #32
-    mls r5, r5, r1, r0
-	add r6, r4, r1
-	vld1.8 {q0}, [r0]!
-	vld1.8 {q1}, [r4]!
-
-	mov r8, #32
-_expand_picture_luma_loop1:
-	vst1.8 {q0}, [r5], r1
-	vst1.8 {q1}, [r6], r1
-	subs r8, #1
-    bne _expand_picture_luma_loop1
-
-	subs r2, #16
-	bne	_expand_picture_luma_loop0
-
-    //vldreq.32 d0, [r0]
-
-	ldmia sp!, {r4-r8}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN ExpandPictureChroma_neon
-    stmdb sp!, {r4-r8}
-	//Save the dst
-	mov r7, r0
-	mov r8, r3
-
-	add r4, r7, r2
-	sub r4, #1
-    //For the left and right expand
-_expand_picture_chroma_loop2:
-	sub r5, r7, #16
-	add r6, r4, #1
-
-	vld1.8 {d0[], d1[]}, [r7], r1
-	vld1.8 {d2[], d3[]}, [r4], r1
-
-	vst1.8 {q0}, [r5]
-	vst1.8 {q1}, [r6]
-	subs r8, #1
-	bne	_expand_picture_chroma_loop2
-
-	//for the top and bottom expand
-	add r2, #32
-	sub r0, #16
-	mla r4, r1, r3, r0
-	sub r4, r1
-_expand_picture_chroma_loop0:
-	mov r5, #16
-    mls r5, r5, r1, r0
-	add r6, r4, r1
-	vld1.8 {q0}, [r0]!
-	vld1.8 {q1}, [r4]!
-
-	mov r8, #16
-_expand_picture_chroma_loop1:
-	vst1.8 {q0}, [r5], r1
-	vst1.8 {q1}, [r6], r1
-	subs r8, #1
-    bne _expand_picture_chroma_loop1
-
-	subs r2, #16
-	bne	_expand_picture_chroma_loop0
-
-    //vldreq.32 d0, [r0]
-
-	ldmia sp!, {r4-r8}
-WELS_ASM_FUNC_END
-
-#endif
--- /dev/null
+++ b/codec/common/expand_picture_neon.S
@@ -1,0 +1,137 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifdef HAVE_NEON
+.text
+#include "arm_arch_common_macro.S"
+
+
+WELS_ASM_FUNC_BEGIN ExpandPictureLuma_neon
+    stmdb sp!, {r4-r8}		//r4-r8 are used as scratch below; restored by the ldmia at the end
+	//Pad the picture borders. As used below: r0 = first pixel, r1 = row stride, r2 = row width, r3 = row count (C prototype not visible here, confirm)
+	mov r7, r0		//r7 = address of the leftmost pixel of the current row
+	mov r8, r3		//r8 = rows still to process
+
+	add r4, r7, r2
+	sub r4, #1		//r4 = address of the rightmost pixel of the current row
+    //Left and right borders: replicate the edge pixels of every row 32 bytes outward
+_expand_picture_luma_loop2:
+	sub r5, r7, #32		//r5 = store address, 32 bytes before the row start
+	add r6, r4, #1		//r6 = store address, first byte after the row end
+
+	vld1.8 {d0[], d1[]}, [r7], r1	//q0 = leftmost pixel in all 16 lanes; r7 += r1
+	vld1.8 {d2[], d3[]}, [r4], r1	//q1 = rightmost pixel in all 16 lanes; r4 += r1
+
+	vst1.8 {q0}, [r5]!		//left side: 2 stores of 16 bytes
+	vst1.8 {q0}, [r5]
+	vst1.8 {q1}, [r6]!		//right side: 2 stores of 16 bytes
+	vst1.8 {q1}, [r6]
+	subs r8, #1
+	bne	_expand_picture_luma_loop2
+
+	//Top and bottom borders: copy the first and last rows (side borders included) 32 rows outward, 16 columns per pass
+	add r2, #64		//bytes per row to copy = width + 2 x 32
+	sub r0, #32		//r0 = start of the first row including its left border
+	mla r4, r1, r3, r0
+	sub r4, r1		//r4 = r0 + (r3 - 1) x r1, start of the last row
+_expand_picture_luma_loop0:
+	mov r5, #32
+    mls r5, r5, r1, r0		//r5 = r0 - 32 x r1, 32 rows above the first row
+	add r6, r4, r1		//r6 = one row below the last row
+	vld1.8 {q0}, [r0]!		//q0 = 16 bytes of the first row; r0 += 16
+	vld1.8 {q1}, [r4]!		//q1 = 16 bytes of the last row; r4 += 16
+
+	mov r8, #32		//32 rows are written above and 32 below
+_expand_picture_luma_loop1:
+	vst1.8 {q0}, [r5], r1		//one row of the top border; r5 += r1
+	vst1.8 {q1}, [r6], r1		//one row of the bottom border; r6 += r1
+	subs r8, #1
+    bne _expand_picture_luma_loop1
+
+	subs r2, #16		//next 16 columns; the loop ends only when r2 reaches exactly 0, so r2 must be a multiple of 16
+	bne	_expand_picture_luma_loop0
+
+    //vldreq.32 d0, [r0]
+
+	ldmia sp!, {r4-r8}		//restore the registers saved on entry
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN ExpandPictureChroma_neon
+    stmdb sp!, {r4-r8}		//r4-r8 are used as scratch below; restored by the ldmia at the end
+	//Pad the picture borders. As used below: r0 = first pixel, r1 = row stride, r2 = row width, r3 = row count (C prototype not visible here, confirm)
+	mov r7, r0		//r7 = address of the leftmost pixel of the current row
+	mov r8, r3		//r8 = rows still to process
+
+	add r4, r7, r2
+	sub r4, #1		//r4 = address of the rightmost pixel of the current row
+    //Left and right borders: replicate the edge pixels of every row 16 bytes outward
+_expand_picture_chroma_loop2:
+	sub r5, r7, #16		//r5 = store address, 16 bytes before the row start
+	add r6, r4, #1		//r6 = store address, first byte after the row end
+
+	vld1.8 {d0[], d1[]}, [r7], r1	//q0 = leftmost pixel in all 16 lanes; r7 += r1
+	vld1.8 {d2[], d3[]}, [r4], r1	//q1 = rightmost pixel in all 16 lanes; r4 += r1
+
+	vst1.8 {q0}, [r5]		//left side: 1 store of 16 bytes
+	vst1.8 {q1}, [r6]		//right side: 1 store of 16 bytes
+	subs r8, #1
+	bne	_expand_picture_chroma_loop2
+
+	//Top and bottom borders: copy the first and last rows (side borders included) 16 rows outward, 16 columns per pass
+	add r2, #32		//bytes per row to copy = width + 2 x 16
+	sub r0, #16		//r0 = start of the first row including its left border
+	mla r4, r1, r3, r0
+	sub r4, r1		//r4 = r0 + (r3 - 1) x r1, start of the last row
+_expand_picture_chroma_loop0:
+	mov r5, #16
+    mls r5, r5, r1, r0		//r5 = r0 - 16 x r1, 16 rows above the first row
+	add r6, r4, r1		//r6 = one row below the last row
+	vld1.8 {q0}, [r0]!		//q0 = 16 bytes of the first row; r0 += 16
+	vld1.8 {q1}, [r4]!		//q1 = 16 bytes of the last row; r4 += 16
+
+	mov r8, #16		//16 rows are written above and 16 below
+_expand_picture_chroma_loop1:
+	vst1.8 {q0}, [r5], r1		//one row of the top border; r5 += r1
+	vst1.8 {q1}, [r6], r1		//one row of the bottom border; r6 += r1
+	subs r8, #1
+    bne _expand_picture_chroma_loop1
+
+	subs r2, #16		//next 16 columns; the loop ends only when r2 reaches exactly 0, so r2 must be a multiple of 16
+	bne	_expand_picture_chroma_loop0
+
+    //vldreq.32 d0, [r0]
+
+	ldmia sp!, {r4-r8}		//restore the registers saved on entry
+WELS_ASM_FUNC_END
+
+#endif
--- a/codec/common/targets.mk
+++ b/codec/common/targets.mk
@@ -25,6 +25,7 @@
 ifeq ($(ASM_ARCH), arm)
 COMMON_ASM_S_SRCS=\
 	$(COMMON_SRCDIR)/deblocking_neon.S\
+	$(COMMON_SRCDIR)/expand_picture_neon.S\
 
 COMMON_OBJS += $(COMMON_ASM_S_SRCS:.S=.o)
 endif
--- a/codec/decoder/core/arm/mc_neon.S
+++ b/codec/decoder/core/arm/mc_neon.S
@@ -499,7 +499,7 @@
 	push		{r4}
 	ldr			r4, [sp, #4]
 
-	sub			r0, r1, lsl #1		//src[-2*src_stride]
+	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
 	pld			[r0]
 	pld			[r0, r1]
 	vmov.u16	q14, #0x0014			// 20
@@ -581,7 +581,7 @@
 	push		{r4}
 	ldr			r4, [sp, #4]
 
-	sub			r0, r1, lsl #1		//src[-2*src_stride]
+	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
 	pld			[r0]
 	pld			[r0, r1]
 	vmov.u16	q14, #0x0014			// 20
@@ -633,7 +633,7 @@
 
 WELS_ASM_FUNC_BEGIN McHorVer01WidthEq4_neon
 	push		{r4, r5, r6, r7}
-	sub			r0, r1, lsl #1		//src[-2*src_stride]
+	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
 	pld			[r0]
 	pld			[r0, r1]
 	vmov.u16	q14, #0x0014			// 20
@@ -694,7 +694,7 @@
 	push		{r4}
 	ldr			r4, [sp, #4]
 
-	sub			r0, r1, lsl #1		//src[-2*src_stride]
+	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
 	pld			[r0]
 	pld			[r0, r1]
 	vmov.u16	q14, #0x0014			// 20
@@ -776,7 +776,7 @@
 	push		{r4}
 	ldr			r4, [sp, #4]
 
-	sub			r0, r1, lsl #1		//src[-2*src_stride]
+	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
 	pld			[r0]
 	pld			[r0, r1]
 	vmov.u16	q14, #0x0014			// 20
@@ -828,7 +828,7 @@
 
 WELS_ASM_FUNC_BEGIN McHorVer03WidthEq4_neon
 	push		{r4, r5, r6, r7}
-	sub			r0, r1, lsl #1		//src[-2*src_stride]
+	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
 	pld			[r0]
 	pld			[r0, r1]
 	vmov.u16	q14, #0x0014			// 20
@@ -889,7 +889,7 @@
 	push		{r4}
 	ldr			r4, [sp, #4]
 
-	sub			r0, r1, lsl #1		//src[-2*src_stride]
+	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
 	pld			[r0]
 	pld			[r0, r1]
 	vmov.u16	q14, #0x0014			// 20
@@ -971,7 +971,7 @@
 	push		{r4}
 	ldr			r4, [sp, #4]
 
-	sub			r0, r1, lsl #1		//src[-2*src_stride]
+	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
 	pld			[r0]
 	pld			[r0, r1]
 	vmov.u16	q14, #0x0014			// 20
@@ -1023,7 +1023,7 @@
 
 WELS_ASM_FUNC_BEGIN McHorVer02WidthEq4_neon
 	push		{r4, r5, r6, r7}
-	sub			r0, r1, lsl #1		//src[-2*src_stride]
+	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
 	pld			[r0]
 	pld			[r0, r1]
 	vmov.u16	q14, #0x0014			// 20
@@ -1085,7 +1085,7 @@
 	ldr			r4, [sp, #4]
 
 	sub			r0, #2					//src[-2]
-	sub			r0, r1, lsl #1		//src[-2*src_stride-2]
+	sub			r0, r0, r1, lsl #1		//src[-2*src_stride-2]
 	pld			[r0]
 	pld			[r0, r1]
 
@@ -1197,7 +1197,7 @@
 	ldr			r4, [sp, #4]
 
 	sub			r0, #2				//src[-2]
-	sub			r0, r1, lsl #1	//src[-2*src_stride-2]
+	sub			r0, r0, r1, lsl #1	//src[-2*src_stride-2]
 	pld			[r0]
 	pld			[r0, r1]
 
@@ -1279,7 +1279,7 @@
 	ldr			r6, [sp, #12]
 
 	sub			r0, #2				//src[-2]
-	sub			r0, r1, lsl #1	//src[-2*src_stride-2]
+	sub			r0, r0, r1, lsl #1	//src[-2*src_stride-2]
 	pld			[r0]
 	pld			[r0, r1]
 
--- a/codec/encoder/core/arm/intra_pred_sad_3_opt_neon.S
+++ b/codec/encoder/core/arm/intra_pred_sad_3_opt_neon.S
@@ -272,7 +272,7 @@
 	vpaddl.u16 d16, d16
 	vpaddl.u32 d16, d16
 	vmov.u32   r1, d16[0]
-	add  r1, r6, lsl #1
+	add  r1, r1, r6, lsl #1
 
 	//vadd.u16   d20, d21
 	vrshr.u16  d17, #1
@@ -279,7 +279,7 @@
 	vpaddl.u16 d17, d17
 	vpaddl.u32 d17, d17
 	vmov.u32   r2, d17[0]
-	add  r2, r6, lsl #1
+	add  r2, r2, r6, lsl #1
 
     mov r4, #0
     cmp r1, r0
@@ -362,13 +362,13 @@
 	vpaddl.u16 d22, d22
 	vpaddl.u32 d22, d22
 	vmov.u32   r1, d22[0]
-	add  r1, r6, lsl #1
+	add  r1, r1, r6, lsl #1
 
 	vadd.u16   d20, d21
 	vpaddl.u16 d20, d20
 	vpaddl.u32 d20, d20
 	vmov.u32   r2, d20[0]
-	add  r2, r6, lsl #1
+	add  r2, r2, r6, lsl #1
 
     mov r4, #0
     cmp r1, r0
@@ -506,13 +506,13 @@
 	vpaddl.u16 d22, d22
 	vpaddl.u32 d22, d22
 	vmov.u32   r0, d22[0]
-	add  r0, r6, lsl #1
+	add  r0, r0, r6, lsl #1
 
 	vadd.u16   d20, d21
 	vpaddl.u16 d20, d20
 	vpaddl.u32 d20, d20
 	vmov.u32   r1, d20[0]
-	add  r1, r6, lsl #1
+	add  r1, r1, r6, lsl #1
 
 	vadd.u16   d18, d19
 	vpaddl.u16 d18, d18
@@ -644,13 +644,13 @@
 	vpaddl.u16 d11, d11
 	vpaddl.u32 d11, d11
 	vmov.u32   lr, d11[0]
-	add  lr, r6, lsl #1
+	add  lr, lr, r6, lsl #1
 
 	vrshr.u16  d10, #1
 	vpaddl.u16 d10, d10
 	vpaddl.u32 d10, d10
 	vmov.u32   r3, d10[0]
-	add  r3, r6, lsl #1
+	add  r3, r3, r6, lsl #1
 
 	vrshr.u16  d28, #1
 	vpaddl.u16 d28, d28
--- a/codec/encoder/core/arm/mc_neon.S
+++ b/codec/encoder/core/arm/mc_neon.S
@@ -612,7 +612,7 @@
 	push		{r4}
 	ldr			r4, [sp, #4]
 
-	sub			r0, r1, lsl #1		//src[-2*src_stride]
+	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
 	pld			[r0]
 	pld			[r0, r1]
 	vmov.u16	q14, #0x0014			// 20
@@ -694,7 +694,7 @@
 	push		{r4}
 	ldr			r4, [sp, #4]
 
-	sub			r0, r1, lsl #1		//src[-2*src_stride]
+	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
 	pld			[r0]
 	pld			[r0, r1]
 	vmov.u16	q14, #0x0014			// 20
@@ -746,7 +746,7 @@
 
 WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_xy_01_neon
 	push		{r4, r5, r6, r7}
-	sub			r0, r1, lsl #1		//src[-2*src_stride]
+	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
 	pld			[r0]
 	pld			[r0, r1]
 	vmov.u16	q14, #0x0014			// 20
@@ -806,7 +806,7 @@
 	push		{r4}
 	ldr			r4, [sp, #4]
 
-	sub			r0, r1, lsl #1		//src[-2*src_stride]
+	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
 	pld			[r0]
 	pld			[r0, r1]
 	vmov.u16	q14, #0x0014			// 20
@@ -888,7 +888,7 @@
 	push		{r4}
 	ldr			r4, [sp, #4]
 
-	sub			r0, r1, lsl #1		//src[-2*src_stride]
+	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
 	pld			[r0]
 	pld			[r0, r1]
 	vmov.u16	q14, #0x0014			// 20
@@ -939,7 +939,7 @@
 
     WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_xy_03_neon
 	push		{r4, r5, r6, r7}
-	sub			r0, r1, lsl #1		//src[-2*src_stride]
+	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
 	pld			[r0]
 	pld			[r0, r1]
 	vmov.u16	q14, #0x0014			// 20
@@ -999,7 +999,7 @@
 	push		{r4}
 	ldr			r4, [sp, #4]
 
-	sub			r0, r1, lsl #1		//src[-2*src_stride]
+	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
 	pld			[r0]
 	pld			[r0, r1]
 	vmov.u16	q14, #0x0014			// 20
@@ -1080,7 +1080,7 @@
 	push		{r4}
 	ldr			r4, [sp, #4]
 
-	sub			r0, r1, lsl #1		//src[-2*src_stride]
+	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
 	pld			[r0]
 	pld			[r0, r1]
 	vmov.u16	q14, #0x0014			// 20
@@ -1168,7 +1168,7 @@
 	push		{r4}
 	ldr			r4, [sp, #4]
 
-	sub			r0, r1, lsl #1		//src[-2*src_stride]
+	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
 	pld			[r0]
 	pld			[r0, r1]
 	vmov.u16	q14, #0x0014			// 20
@@ -1223,7 +1223,7 @@
 
 WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_v_neon
 	push		{r4, r5, r6, r7}
-	sub			r0, r1, lsl #1		//src[-2*src_stride]
+	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
 	pld			[r0]
 	pld			[r0, r1]
 	vmov.u16	q14, #0x0014			// 20
@@ -1285,7 +1285,7 @@
 	ldr			r4, [sp, #4]
 
 	sub			r0, #2					//src[-2]
-	sub			r0, r1, lsl #1		//src[-2*src_stride-2]
+	sub			r0, r0, r1, lsl #1		//src[-2*src_stride-2]
 	pld			[r0]
 	pld			[r0, r1]
 
@@ -1393,7 +1393,7 @@
 	ldr			r4, [sp, #4]
 
 	sub			r0, #2					//src[-2]
-	sub			r0, r1, lsl #1		//src[-2*src_stride-2]
+	sub			r0, r0, r1, lsl #1		//src[-2*src_stride-2]
 	pld			[r0]
 	pld			[r0, r1]
 
@@ -1526,7 +1526,7 @@
 	ldr			r4, [sp, #4]
 
 	sub			r0, #2				//src[-2]
-	sub			r0, r1, lsl #1	//src[-2*src_stride-2]
+	sub			r0, r0, r1, lsl #1	//src[-2*src_stride-2]
 	pld			[r0]
 	pld			[r0, r1]
 
@@ -1628,7 +1628,7 @@
 	ldr			r6, [sp, #12]
 
 	sub			r0, #2				//src[-2]
-	sub			r0, r1, lsl #1	//src[-2*src_stride-2]
+	sub			r0, r0, r1, lsl #1	//src[-2*src_stride-2]
 	pld			[r0]
 	pld			[r0, r1]
 
--- a/codec/encoder/core/arm/memory_neon.S
+++ b/codec/encoder/core/arm/memory_neon.S
@@ -48,12 +48,12 @@
     vst1.64 {q0}, [r0]!
     vst1.64 {q0}, [r0]!
     bne mem_zero_loop
-WELS_ASM_FUNC_END
+    bx lr
 
 mem_zero_32_neon_start:
     vst1.64 {q0}, [r0]!
     vst1.64 {q0}, [r0]!
-WELS_ASM_FUNC_END
+    bx lr
 
 mem_zero_24_neon_start:
     vst1.64 {q0}, [r0]!
--- a/codec/encoder/core/inc/wels_preprocess.h
+++ b/codec/encoder/core/inc/wels_preprocess.h
@@ -121,7 +121,6 @@
 
  private:
   int32_t SingleLayerPreprocess (sWelsEncCtx* pEncCtx, const SSourcePicture* kpSrc, Scaled_Picture* m_sScaledPicture);
-  int32_t MultiLayerPreprocess (sWelsEncCtx* pEncCtx, const SSourcePicture* kpSrcPic);
 
   void	BilateralDenoising (SPicture* pSrc, const int32_t iWidth, const int32_t iHeight);
   bool  DetectSceneChange (SPicture* pCurPicture, SPicture* pRefPicture);
@@ -150,7 +149,6 @@
   CWelsLib*        m_pEncLib;
   sWelsEncCtx*     m_pEncCtx;
   bool             m_bInitDone;
-  bool             m_bOfficialBranch;
   /* For Downsampling & VAA I420 based source pictures */
   SPicture*        m_pSpatialPic[MAX_DEPENDENCY_LAYER][MAX_TEMPORAL_LEVEL + 1 +
       LONG_TERM_REF_NUM];	// need memory requirement with total number of (log2(uiGopSize)+1+1+long_term_ref_num)
--- a/codec/encoder/core/src/wels_preprocess.cpp
+++ b/codec/encoder/core/src/wels_preprocess.cpp
@@ -204,7 +204,6 @@
   m_pInterfaceVp = NULL;
   m_pEncLib = NULL;
   m_bInitDone = false;
-  m_bOfficialBranch  = false;
   m_pEncCtx = pEncCtx;
   memset (&m_sScaledPicture, 0, sizeof (m_sScaledPicture));
   memset (m_pSpatialPic, 0, sizeof(m_pSpatialPic));
@@ -316,13 +315,6 @@
     if (WelsPreprocessReset (pCtx) != 0)
       return -1;
 
-    m_bOfficialBranch  = (iNumDependencyLayer != 1);
-    if ( iNumDependencyLayer == 1 ) {
-      if (pSvcParam->sDependencyLayers[0].iFrameWidth != kpSrcPic->iPicWidth ||
-          pSvcParam->sDependencyLayers[0].iFrameHeight != kpSrcPic->iPicHeight) {
-        m_bOfficialBranch = true;
-      }
-    }
     m_bInitDone = true;
   }
 
@@ -333,12 +325,7 @@
   if (pSvcParam->uiIntraPeriod)
     pCtx->pVaa->bIdrPeriodFlag = (1 + pCtx->iFrameIndex >= (int32_t)pSvcParam->uiIntraPeriod) ? true : false;
 
-  if (m_bOfficialBranch) {	// Perform Down Sampling potentially due to application
-    iSpatialNum	= SingleLayerPreprocess (pCtx, kpSrcPic, &m_sScaledPicture);
-  } else { // for console each spatial pictures are available there
-    iSpatialNum = 1;
-    MultiLayerPreprocess (pCtx, kpSrcPic);
-  }
+  iSpatialNum = SingleLayerPreprocess (pCtx, kpSrcPic, &m_sScaledPicture);
 
   return iSpatialNum;
 }
@@ -518,46 +505,6 @@
   return iSpatialNum;
 }
 
-int32_t CWelsPreProcess::MultiLayerPreprocess (sWelsEncCtx* pCtx, const SSourcePicture* kpSrcPic) {
-  SWelsSvcCodingParam* pSvcParam	= pCtx->pSvcParam;
-  const SSourcePicture* pSrc			= NULL;
-  SPicture* pDstPic						= NULL;
-  const int32_t iSpatialLayersCfgCount =
-    pSvcParam->iSpatialLayerNum;	// count number of spatial layers to be encoded in cfg
-  int32_t j							= -1;
-
-  // do not clear j, just let it continue to save complexity
-  do {
-    ++ j;
-    if (pSvcParam->sDependencyLayers[j].iFrameWidth == kpSrcPic->iPicWidth &&
-        pSvcParam->sDependencyLayers[j].iFrameHeight == kpSrcPic->iPicHeight) {
-      break;
-    }
-  } while (j < iSpatialLayersCfgCount);
-
-  assert (j < iSpatialLayersCfgCount);
-  pDstPic = m_pSpatialPic[j][m_uiSpatialLayersInTemporal[j] - 1];
-
-  WelsUpdateSpatialIdxMap (pCtx, 0, pDstPic, j);
-
-  WelsMoveMemoryWrapper (pSvcParam, pDstPic, kpSrcPic, kpSrcPic->iPicWidth, kpSrcPic->iPicHeight);
-
-  if (pSvcParam->bEnableDenoise)
-    BilateralDenoising (pDstPic, kpSrcPic->iPicWidth, kpSrcPic->iPicHeight);
-
-  m_pLastSpatialPicture[j][1]	= pDstPic;
-
-  if (pSvcParam->bEnableSceneChangeDetect && (1 == pSvcParam->iSpatialLayerNum)
-      && !pCtx->pVaa->bIdrPeriodFlag && !pCtx->bEncCurFrmAsIdrFlag) {
-    SPicture* pRef = pCtx->pLtr[0].bReceivedT0LostFlag ?
-                     m_pSpatialPic[0][m_uiSpatialLayersInTemporal[0] + pCtx->pVaa->uiValidLongTermPicIdx] :
-                     m_pLastSpatialPicture[0][0];
-
-    pCtx->pVaa->bSceneChangeFlag = DetectSceneChange (pDstPic, pRef);
-  }
-
-  return 0;
-}
 
 /*!
  * \brief	Whether input picture need be scaled?
--- a/codec/encoder/targets.mk
+++ b/codec/encoder/targets.mk
@@ -47,6 +47,18 @@
 ENCODER_OBJS += $(ENCODER_ASM_SRCS:.asm=.o)
 endif
 
+ifeq ($(ASM_ARCH), arm)
+ENCODER_ASM_S_SRCS=\
+	$(ENCODER_SRCDIR)/core/arm/intra_pred_neon.S\
+	$(ENCODER_SRCDIR)/core/arm/intra_pred_sad_3_opt_neon.S\
+	$(ENCODER_SRCDIR)/core/arm/mc_neon.S\
+	$(ENCODER_SRCDIR)/core/arm/memory_neon.S\
+	$(ENCODER_SRCDIR)/core/arm/pixel_neon.S\
+	$(ENCODER_SRCDIR)/core/arm/reconstruct_neon.S\
+
+ENCODER_OBJS += $(ENCODER_ASM_S_SRCS:.S=.o)
+endif
+
 OBJS += $(ENCODER_OBJS)
 $(ENCODER_SRCDIR)/%.o: $(ENCODER_SRCDIR)/%.cpp
 	$(QUIET_CXX)$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(ENCODER_CFLAGS) $(ENCODER_INCLUDES) -c $(CXX_O) $<
@@ -53,6 +65,9 @@
 
 $(ENCODER_SRCDIR)/%.o: $(ENCODER_SRCDIR)/%.asm
 	$(QUIET_ASM)$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(ENCODER_ASMFLAGS) $(ENCODER_ASM_INCLUDES) -o $@ $<
+
+$(ENCODER_SRCDIR)/%.o: $(ENCODER_SRCDIR)/%.S
+	$(QUIET_CCAS)$(CCAS) $(CFLAGS) $(ASMFLAGS) $(INCLUDES) $(ENCODER_CFLAGS) $(ENCODER_INCLUDES) -c -o $@ $<
 
 $(LIBPREFIX)encoder.$(LIBSUFFIX): $(ENCODER_OBJS)
 	$(QUIET)rm -f $@
--- a/codec/processing/src/arm/down_sample_neon.S
+++ b/codec/processing/src/arm/down_sample_neon.S
@@ -75,7 +75,7 @@
 
 	cmp	lr,	r4
 	movcs	lr,	#0
-	addcs	r6,	r3,	lsl	#1
+	addcs	r6,	r6,	r3,	lsl	#1
 	movcs	r2,	r6
 	addcs	r7,	r2,	r3
 	addcs	r8,	r1
--- a/codec/processing/src/arm/pixel_sad_neon.S
+++ b/codec/processing/src/arm/pixel_sad_neon.S
@@ -35,7 +35,7 @@
 #include "arm_arch_common_macro.S"
 
 
-WELS_ASM_FUNC_BEGIN WelsSampleSad8x8_neon
+WELS_ASM_FUNC_BEGIN WelsProcessingSampleSad8x8_neon
     stmdb sp!, {lr}
 	//Loading a horizontal line data (8 bytes)
 	vld1.8 {d0}, [r0], r1
--- a/codec/processing/src/scenechangedetection/SceneChangeDetection.cpp
+++ b/codec/processing/src/scenechangedetection/SceneChangeDetection.cpp
@@ -133,7 +133,7 @@
 
 #ifdef HAVE_NEON
   if (iCpuFlag & WELS_CPU_NEON) {
-    pfSad = WelsSampleSad8x8_neon;
+    pfSad = WelsProcessingSampleSad8x8_neon;
   }
 #endif
 }
--- a/codec/processing/src/scenechangedetection/SceneChangeDetectionCommon.h
+++ b/codec/processing/src/scenechangedetection/SceneChangeDetectionCommon.h
@@ -62,7 +62,7 @@
 
 #ifdef HAVE_NEON
 WELSVP_EXTERN_C_BEGIN
-SadFunc      WelsSampleSad8x8_neon;
+SadFunc      WelsProcessingSampleSad8x8_neon;
 WELSVP_EXTERN_C_END
 #endif
 
--- a/codec/processing/targets.mk
+++ b/codec/processing/targets.mk
@@ -29,6 +29,16 @@
 PROCESSING_OBJS += $(PROCESSING_ASM_SRCS:.asm=.o)
 endif
 
+ifeq ($(ASM_ARCH), arm)
+PROCESSING_ASM_S_SRCS=\
+	$(PROCESSING_SRCDIR)/src/arm/adaptive_quantization.S\
+	$(PROCESSING_SRCDIR)/src/arm/down_sample_neon.S\
+	$(PROCESSING_SRCDIR)/src/arm/pixel_sad_neon.S\
+	$(PROCESSING_SRCDIR)/src/arm/vaa_calc_neon.S\
+
+PROCESSING_OBJS += $(PROCESSING_ASM_S_SRCS:.S=.o)
+endif
+
 OBJS += $(PROCESSING_OBJS)
 $(PROCESSING_SRCDIR)/%.o: $(PROCESSING_SRCDIR)/%.cpp
 	$(QUIET_CXX)$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c $(CXX_O) $<
@@ -35,6 +45,9 @@
 
 $(PROCESSING_SRCDIR)/%.o: $(PROCESSING_SRCDIR)/%.asm
 	$(QUIET_ASM)$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(PROCESSING_ASMFLAGS) $(PROCESSING_ASM_INCLUDES) -o $@ $<
+
+$(PROCESSING_SRCDIR)/%.o: $(PROCESSING_SRCDIR)/%.S
+	$(QUIET_CCAS)$(CCAS) $(CFLAGS) $(ASMFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $@ $<
 
 $(LIBPREFIX)processing.$(LIBSUFFIX): $(PROCESSING_OBJS)
 	$(QUIET)rm -f $@