ref: f0c6c2b318f7f048f83aac1bd42b5cbac8bdfb2a
parent: 594fc4fe7b9018f181714cd2a3e9d17bd96a785b
parent: a4cecd80048e187033fc4ee084abb4d6c7a79e66
author: ruil2 <[email protected]>
date: Fri Mar 7 10:59:23 EST 2014
Merge branch 'master' of https://github.com/cisco/openh264 into encoder_update
--- a/codec/build/iOS/common/common.xcodeproj/project.pbxproj
+++ b/codec/build/iOS/common/common.xcodeproj/project.pbxproj
@@ -7,7 +7,7 @@
objects = {
/* Begin PBXBuildFile section */
- 4C34067D18C5C94C00DFA14A /* expand_picture.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34067C18C5C94C00DFA14A /* expand_picture.S */; };
+ 4C34067D18C5C94C00DFA14A /* expand_picture_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34067C18C5C94C00DFA14A /* expand_picture_neon.S */; };
4CE443D918B722CD0017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE443D818B722CD0017DF25 /* Foundation.framework */; };
4CE443E718B722CD0017DF25 /* XCTest.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE443E618B722CD0017DF25 /* XCTest.framework */; };
4CE443E818B722CD0017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE443D818B722CD0017DF25 /* Foundation.framework */; };
@@ -47,7 +47,7 @@
/* End PBXCopyFilesBuildPhase section */
/* Begin PBXFileReference section */
- 4C34067C18C5C94C00DFA14A /* expand_picture.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = expand_picture.S; sourceTree = "<group>"; };
+ 4C34067C18C5C94C00DFA14A /* expand_picture_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = expand_picture_neon.S; sourceTree = "<group>"; };
4CE443D518B722CD0017DF25 /* libcommon.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libcommon.a; sourceTree = BUILT_PRODUCTS_DIR; };
4CE443D818B722CD0017DF25 /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; };
4CE443E518B722CD0017DF25 /* commonTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = commonTests.xctest; sourceTree = BUILT_PRODUCTS_DIR; };
@@ -150,7 +150,7 @@
4CE4472F18BC61650017DF25 /* common */ = {
isa = PBXGroup;
children = (
- 4C34067C18C5C94C00DFA14A /* expand_picture.S */,
+ 4C34067C18C5C94C00DFA14A /* expand_picture_neon.S */,
4CE447BE18C085900017DF25 /* arm_arch_common_macro.S */,
4CE447BC18C085320017DF25 /* deblocking_neon.S */,
4CE4473118BC61650017DF25 /* cpu.cpp */,
@@ -260,7 +260,7 @@
4CE4475018BC61650017DF25 /* deblocking_common.cpp in Sources */,
4CE4474C18BC61650017DF25 /* cpu.cpp in Sources */,
4CE4475218BC61650017DF25 /* logging.cpp in Sources */,
- 4C34067D18C5C94C00DFA14A /* expand_picture.S in Sources */,
+ 4C34067D18C5C94C00DFA14A /* expand_picture_neon.S in Sources */,
4CE447BD18C085320017DF25 /* deblocking_neon.S in Sources */,
4CE4475818BC61650017DF25 /* WelsThreadLib.cpp in Sources */,
4CE4474E18BC61650017DF25 /* crt_util_safe_x.cpp in Sources */,
--- a/codec/common/arm_arch_common_macro.S
+++ b/codec/common/arm_arch_common_macro.S
@@ -44,6 +44,8 @@
.endm
#else
+.syntax unified
+
.macro WELS_ASM_FUNC_BEGIN funcName
.align 2
.arm
--- a/codec/common/deblocking_neon.S
+++ b/codec/common/deblocking_neon.S
@@ -910,7 +910,7 @@
beq bs_nzc_check_jump0
sub r6, \arg0, \arg2, lsl #4
- sub r6, \arg2, lsl #3
+ sub r6, r6, \arg2, lsl #3
add r6, #12
vld1.32 d3[1], [r6]
--- a/codec/common/expand_picture.S
+++ /dev/null
@@ -1,137 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#ifdef HAVE_NEON
-.text
-#include "arm_arch_common_macro.S"
-
-
-WELS_ASM_FUNC_BEGIN ExpandPictureLuma_neon
- stmdb sp!, {r4-r8}
- //Save the dst
- mov r7, r0
- mov r8, r3
-
- add r4, r7, r2
- sub r4, #1
- //For the left and right expand
-_expand_picture_luma_loop2:
- sub r5, r7, #32
- add r6, r4, #1
-
- vld1.8 {d0[], d1[]}, [r7], r1
- vld1.8 {d2[], d3[]}, [r4], r1
-
- vst1.8 {q0}, [r5]!
- vst1.8 {q0}, [r5]
- vst1.8 {q1}, [r6]!
- vst1.8 {q1}, [r6]
- subs r8, #1
- bne _expand_picture_luma_loop2
-
- //for the top and bottom expand
- add r2, #64
- sub r0, #32
- mla r4, r1, r3, r0
- sub r4, r1
-_expand_picture_luma_loop0:
- mov r5, #32
- mls r5, r5, r1, r0
- add r6, r4, r1
- vld1.8 {q0}, [r0]!
- vld1.8 {q1}, [r4]!
-
- mov r8, #32
-_expand_picture_luma_loop1:
- vst1.8 {q0}, [r5], r1
- vst1.8 {q1}, [r6], r1
- subs r8, #1
- bne _expand_picture_luma_loop1
-
- subs r2, #16
- bne _expand_picture_luma_loop0
-
- //vldreq.32 d0, [r0]
-
- ldmia sp!, {r4-r8}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN ExpandPictureChroma_neon
- stmdb sp!, {r4-r8}
- //Save the dst
- mov r7, r0
- mov r8, r3
-
- add r4, r7, r2
- sub r4, #1
- //For the left and right expand
-_expand_picture_chroma_loop2:
- sub r5, r7, #16
- add r6, r4, #1
-
- vld1.8 {d0[], d1[]}, [r7], r1
- vld1.8 {d2[], d3[]}, [r4], r1
-
- vst1.8 {q0}, [r5]
- vst1.8 {q1}, [r6]
- subs r8, #1
- bne _expand_picture_chroma_loop2
-
- //for the top and bottom expand
- add r2, #32
- sub r0, #16
- mla r4, r1, r3, r0
- sub r4, r1
-_expand_picture_chroma_loop0:
- mov r5, #16
- mls r5, r5, r1, r0
- add r6, r4, r1
- vld1.8 {q0}, [r0]!
- vld1.8 {q1}, [r4]!
-
- mov r8, #16
-_expand_picture_chroma_loop1:
- vst1.8 {q0}, [r5], r1
- vst1.8 {q1}, [r6], r1
- subs r8, #1
- bne _expand_picture_chroma_loop1
-
- subs r2, #16
- bne _expand_picture_chroma_loop0
-
- //vldreq.32 d0, [r0]
-
- ldmia sp!, {r4-r8}
-WELS_ASM_FUNC_END
-
-#endif
--- /dev/null
+++ b/codec/common/expand_picture_neon.S
@@ -1,0 +1,137 @@
+/*!
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifdef HAVE_NEON
+.text
+#include "arm_arch_common_macro.S"
+
+
+WELS_ASM_FUNC_BEGIN ExpandPictureLuma_neon
+ stmdb sp!, {r4-r8} // Purpose: replicate the edge bytes of the picture at r0 into a 32-byte / 32-row border on all four sides, in place
+ //Save the dst. Inputs as used below: r0 = dst (first byte of the first row), r1 = byte step between rows, r2 = row width in bytes, r3 = row count; r4-r8 are scratch (saved above, restored at the end)
+ mov r7, r0 // r7 = left-edge byte of the current row
+ mov r8, r3 // r8 = rows still to pad
+
+ add r4, r7, r2
+ sub r4, #1 // r4 = dst + width - 1 = right-edge byte of the current row
+ //For the left and right expand: every row gets 32 copies of its first byte on the left and 32 copies of its last byte on the right
+_expand_picture_luma_loop2:
+ sub r5, r7, #32 // r5 = 32 bytes before the start of the row
+ add r6, r4, #1 // r6 = first byte after the end of the row
+
+ vld1.8 {d0[], d1[]}, [r7], r1 // q0 = left-edge byte in all 16 lanes; r7 steps to the next row
+ vld1.8 {d2[], d3[]}, [r4], r1 // q1 = right-edge byte in all 16 lanes; r4 steps to the next row
+
+ vst1.8 {q0}, [r5]! // left border, bytes 0-15
+ vst1.8 {q0}, [r5] // left border, bytes 16-31
+ vst1.8 {q1}, [r6]! // right border, bytes 0-15
+ vst1.8 {q1}, [r6] // right border, bytes 16-31
+ subs r8, #1
+ bne _expand_picture_luma_loop2 // bottom-tested loop: the body always runs at least once, so a row count of 0 is not handled
+
+ //for the top and bottom expand: copy the first and last padded rows into the 32 rows above and below, one 16-byte column strip at a time
+ add r2, #64 // r2 = padded row width = width + 2 * 32; must be a multiple of 16 for the loop0 exit test below
+ sub r0, #32 // r0 = start of the padded first row
+ mla r4, r1, r3, r0 // r4 = r0 + step * rows
+ sub r4, r1 // r4 = start of the padded last row
+_expand_picture_luma_loop0:
+ mov r5, #32
+ mls r5, r5, r1, r0 // r5 = r0 - 32 * step = same column, 32 rows above the first row
+ add r6, r4, r1 // r6 = same column, first row below the last row
+ vld1.8 {q0}, [r0]! // q0 = 16 bytes of the first row; r0 advances to the next strip
+ vld1.8 {q1}, [r4]! // q1 = 16 bytes of the last row; r4 advances to the next strip
+
+ mov r8, #32 // 32 border rows to write per strip
+_expand_picture_luma_loop1:
+ vst1.8 {q0}, [r5], r1 // one row of the top border, then step down one row
+ vst1.8 {q1}, [r6], r1 // one row of the bottom border, then step down one row
+ subs r8, #1
+ bne _expand_picture_luma_loop1
+
+ subs r2, #16 // one 16-byte strip finished
+ bne _expand_picture_luma_loop0 // exits only when r2 reaches exactly 0
+
+ //vldreq.32 d0, [r0]
+
+ ldmia sp!, {r4-r8} // restore the registers saved on entry
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN ExpandPictureChroma_neon
+ stmdb sp!, {r4-r8} // Purpose: replicate the edge bytes of the picture at r0 into a 16-byte / 16-row border on all four sides, in place
+ //Save the dst. Inputs as used below: r0 = dst (first byte of the first row), r1 = byte step between rows, r2 = row width in bytes, r3 = row count; r4-r8 are scratch (saved above, restored at the end)
+ mov r7, r0 // r7 = left-edge byte of the current row
+ mov r8, r3 // r8 = rows still to pad
+
+ add r4, r7, r2
+ sub r4, #1 // r4 = dst + width - 1 = right-edge byte of the current row
+ //For the left and right expand: every row gets 16 copies of its first byte on the left and 16 copies of its last byte on the right
+_expand_picture_chroma_loop2:
+ sub r5, r7, #16 // r5 = 16 bytes before the start of the row
+ add r6, r4, #1 // r6 = first byte after the end of the row
+
+ vld1.8 {d0[], d1[]}, [r7], r1 // q0 = left-edge byte in all 16 lanes; r7 steps to the next row
+ vld1.8 {d2[], d3[]}, [r4], r1 // q1 = right-edge byte in all 16 lanes; r4 steps to the next row
+
+ vst1.8 {q0}, [r5] // left border, 16 bytes
+ vst1.8 {q1}, [r6] // right border, 16 bytes
+ subs r8, #1
+ bne _expand_picture_chroma_loop2 // bottom-tested loop: the body always runs at least once, so a row count of 0 is not handled
+
+ //for the top and bottom expand: copy the first and last padded rows into the 16 rows above and below, one 16-byte column strip at a time
+ add r2, #32 // r2 = padded row width = width + 2 * 16; must be a multiple of 16 for the loop0 exit test below
+ sub r0, #16 // r0 = start of the padded first row
+ mla r4, r1, r3, r0 // r4 = r0 + step * rows
+ sub r4, r1 // r4 = start of the padded last row
+_expand_picture_chroma_loop0:
+ mov r5, #16
+ mls r5, r5, r1, r0 // r5 = r0 - 16 * step = same column, 16 rows above the first row
+ add r6, r4, r1 // r6 = same column, first row below the last row
+ vld1.8 {q0}, [r0]! // q0 = 16 bytes of the first row; r0 advances to the next strip
+ vld1.8 {q1}, [r4]! // q1 = 16 bytes of the last row; r4 advances to the next strip
+
+ mov r8, #16 // 16 border rows to write per strip
+_expand_picture_chroma_loop1:
+ vst1.8 {q0}, [r5], r1 // one row of the top border, then step down one row
+ vst1.8 {q1}, [r6], r1 // one row of the bottom border, then step down one row
+ subs r8, #1
+ bne _expand_picture_chroma_loop1
+
+ subs r2, #16 // one 16-byte strip finished
+ bne _expand_picture_chroma_loop0 // exits only when r2 reaches exactly 0
+
+ //vldreq.32 d0, [r0]
+
+ ldmia sp!, {r4-r8} // restore the registers saved on entry
+WELS_ASM_FUNC_END
+
+#endif
--- a/codec/common/targets.mk
+++ b/codec/common/targets.mk
@@ -25,6 +25,7 @@
ifeq ($(ASM_ARCH), arm)
COMMON_ASM_S_SRCS=\
$(COMMON_SRCDIR)/deblocking_neon.S\
+ $(COMMON_SRCDIR)/expand_picture_neon.S\
COMMON_OBJS += $(COMMON_ASM_S_SRCS:.S=.o)
endif
--- a/codec/decoder/core/arm/mc_neon.S
+++ b/codec/decoder/core/arm/mc_neon.S
@@ -499,7 +499,7 @@
push {r4}
ldr r4, [sp, #4]
- sub r0, r1, lsl #1 //src[-2*src_stride]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
pld [r0]
pld [r0, r1]
vmov.u16 q14, #0x0014 // 20
@@ -581,7 +581,7 @@
push {r4}
ldr r4, [sp, #4]
- sub r0, r1, lsl #1 //src[-2*src_stride]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
pld [r0]
pld [r0, r1]
vmov.u16 q14, #0x0014 // 20
@@ -633,7 +633,7 @@
WELS_ASM_FUNC_BEGIN McHorVer01WidthEq4_neon
push {r4, r5, r6, r7}
- sub r0, r1, lsl #1 //src[-2*src_stride]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
pld [r0]
pld [r0, r1]
vmov.u16 q14, #0x0014 // 20
@@ -694,7 +694,7 @@
push {r4}
ldr r4, [sp, #4]
- sub r0, r1, lsl #1 //src[-2*src_stride]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
pld [r0]
pld [r0, r1]
vmov.u16 q14, #0x0014 // 20
@@ -776,7 +776,7 @@
push {r4}
ldr r4, [sp, #4]
- sub r0, r1, lsl #1 //src[-2*src_stride]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
pld [r0]
pld [r0, r1]
vmov.u16 q14, #0x0014 // 20
@@ -828,7 +828,7 @@
WELS_ASM_FUNC_BEGIN McHorVer03WidthEq4_neon
push {r4, r5, r6, r7}
- sub r0, r1, lsl #1 //src[-2*src_stride]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
pld [r0]
pld [r0, r1]
vmov.u16 q14, #0x0014 // 20
@@ -889,7 +889,7 @@
push {r4}
ldr r4, [sp, #4]
- sub r0, r1, lsl #1 //src[-2*src_stride]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
pld [r0]
pld [r0, r1]
vmov.u16 q14, #0x0014 // 20
@@ -971,7 +971,7 @@
push {r4}
ldr r4, [sp, #4]
- sub r0, r1, lsl #1 //src[-2*src_stride]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
pld [r0]
pld [r0, r1]
vmov.u16 q14, #0x0014 // 20
@@ -1023,7 +1023,7 @@
WELS_ASM_FUNC_BEGIN McHorVer02WidthEq4_neon
push {r4, r5, r6, r7}
- sub r0, r1, lsl #1 //src[-2*src_stride]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
pld [r0]
pld [r0, r1]
vmov.u16 q14, #0x0014 // 20
@@ -1085,7 +1085,7 @@
ldr r4, [sp, #4]
sub r0, #2 //src[-2]
- sub r0, r1, lsl #1 //src[-2*src_stride-2]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
pld [r0]
pld [r0, r1]
@@ -1197,7 +1197,7 @@
ldr r4, [sp, #4]
sub r0, #2 //src[-2]
- sub r0, r1, lsl #1 //src[-2*src_stride-2]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
pld [r0]
pld [r0, r1]
@@ -1279,7 +1279,7 @@
ldr r6, [sp, #12]
sub r0, #2 //src[-2]
- sub r0, r1, lsl #1 //src[-2*src_stride-2]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
pld [r0]
pld [r0, r1]
--- a/codec/encoder/core/arm/intra_pred_sad_3_opt_neon.S
+++ b/codec/encoder/core/arm/intra_pred_sad_3_opt_neon.S
@@ -272,7 +272,7 @@
vpaddl.u16 d16, d16
vpaddl.u32 d16, d16
vmov.u32 r1, d16[0]
- add r1, r6, lsl #1
+ add r1, r1, r6, lsl #1
//vadd.u16 d20, d21
vrshr.u16 d17, #1
@@ -279,7 +279,7 @@
vpaddl.u16 d17, d17
vpaddl.u32 d17, d17
vmov.u32 r2, d17[0]
- add r2, r6, lsl #1
+ add r2, r2, r6, lsl #1
mov r4, #0
cmp r1, r0
@@ -362,13 +362,13 @@
vpaddl.u16 d22, d22
vpaddl.u32 d22, d22
vmov.u32 r1, d22[0]
- add r1, r6, lsl #1
+ add r1, r1, r6, lsl #1
vadd.u16 d20, d21
vpaddl.u16 d20, d20
vpaddl.u32 d20, d20
vmov.u32 r2, d20[0]
- add r2, r6, lsl #1
+ add r2, r2, r6, lsl #1
mov r4, #0
cmp r1, r0
@@ -506,13 +506,13 @@
vpaddl.u16 d22, d22
vpaddl.u32 d22, d22
vmov.u32 r0, d22[0]
- add r0, r6, lsl #1
+ add r0, r0, r6, lsl #1
vadd.u16 d20, d21
vpaddl.u16 d20, d20
vpaddl.u32 d20, d20
vmov.u32 r1, d20[0]
- add r1, r6, lsl #1
+ add r1, r1, r6, lsl #1
vadd.u16 d18, d19
vpaddl.u16 d18, d18
@@ -644,13 +644,13 @@
vpaddl.u16 d11, d11
vpaddl.u32 d11, d11
vmov.u32 lr, d11[0]
- add lr, r6, lsl #1
+ add lr, lr, r6, lsl #1
vrshr.u16 d10, #1
vpaddl.u16 d10, d10
vpaddl.u32 d10, d10
vmov.u32 r3, d10[0]
- add r3, r6, lsl #1
+ add r3, r3, r6, lsl #1
vrshr.u16 d28, #1
vpaddl.u16 d28, d28
--- a/codec/encoder/core/arm/mc_neon.S
+++ b/codec/encoder/core/arm/mc_neon.S
@@ -612,7 +612,7 @@
push {r4}
ldr r4, [sp, #4]
- sub r0, r1, lsl #1 //src[-2*src_stride]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
pld [r0]
pld [r0, r1]
vmov.u16 q14, #0x0014 // 20
@@ -694,7 +694,7 @@
push {r4}
ldr r4, [sp, #4]
- sub r0, r1, lsl #1 //src[-2*src_stride]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
pld [r0]
pld [r0, r1]
vmov.u16 q14, #0x0014 // 20
@@ -746,7 +746,7 @@
WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_xy_01_neon
push {r4, r5, r6, r7}
- sub r0, r1, lsl #1 //src[-2*src_stride]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
pld [r0]
pld [r0, r1]
vmov.u16 q14, #0x0014 // 20
@@ -806,7 +806,7 @@
push {r4}
ldr r4, [sp, #4]
- sub r0, r1, lsl #1 //src[-2*src_stride]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
pld [r0]
pld [r0, r1]
vmov.u16 q14, #0x0014 // 20
@@ -888,7 +888,7 @@
push {r4}
ldr r4, [sp, #4]
- sub r0, r1, lsl #1 //src[-2*src_stride]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
pld [r0]
pld [r0, r1]
vmov.u16 q14, #0x0014 // 20
@@ -939,7 +939,7 @@
WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_xy_03_neon
push {r4, r5, r6, r7}
- sub r0, r1, lsl #1 //src[-2*src_stride]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
pld [r0]
pld [r0, r1]
vmov.u16 q14, #0x0014 // 20
@@ -999,7 +999,7 @@
push {r4}
ldr r4, [sp, #4]
- sub r0, r1, lsl #1 //src[-2*src_stride]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
pld [r0]
pld [r0, r1]
vmov.u16 q14, #0x0014 // 20
@@ -1080,7 +1080,7 @@
push {r4}
ldr r4, [sp, #4]
- sub r0, r1, lsl #1 //src[-2*src_stride]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
pld [r0]
pld [r0, r1]
vmov.u16 q14, #0x0014 // 20
@@ -1168,7 +1168,7 @@
push {r4}
ldr r4, [sp, #4]
- sub r0, r1, lsl #1 //src[-2*src_stride]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
pld [r0]
pld [r0, r1]
vmov.u16 q14, #0x0014 // 20
@@ -1223,7 +1223,7 @@
WELS_ASM_FUNC_BEGIN enc_mc_luma_w4_v_neon
push {r4, r5, r6, r7}
- sub r0, r1, lsl #1 //src[-2*src_stride]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride]
pld [r0]
pld [r0, r1]
vmov.u16 q14, #0x0014 // 20
@@ -1285,7 +1285,7 @@
ldr r4, [sp, #4]
sub r0, #2 //src[-2]
- sub r0, r1, lsl #1 //src[-2*src_stride-2]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
pld [r0]
pld [r0, r1]
@@ -1393,7 +1393,7 @@
ldr r4, [sp, #4]
sub r0, #2 //src[-2]
- sub r0, r1, lsl #1 //src[-2*src_stride-2]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
pld [r0]
pld [r0, r1]
@@ -1526,7 +1526,7 @@
ldr r4, [sp, #4]
sub r0, #2 //src[-2]
- sub r0, r1, lsl #1 //src[-2*src_stride-2]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
pld [r0]
pld [r0, r1]
@@ -1628,7 +1628,7 @@
ldr r6, [sp, #12]
sub r0, #2 //src[-2]
- sub r0, r1, lsl #1 //src[-2*src_stride-2]
+ sub r0, r0, r1, lsl #1 //src[-2*src_stride-2]
pld [r0]
pld [r0, r1]
--- a/codec/encoder/core/arm/memory_neon.S
+++ b/codec/encoder/core/arm/memory_neon.S
@@ -48,12 +48,12 @@
vst1.64 {q0}, [r0]!
vst1.64 {q0}, [r0]!
bne mem_zero_loop
-WELS_ASM_FUNC_END
+ bx lr
mem_zero_32_neon_start:
vst1.64 {q0}, [r0]!
vst1.64 {q0}, [r0]!
-WELS_ASM_FUNC_END
+ bx lr
mem_zero_24_neon_start:
vst1.64 {q0}, [r0]!
--- a/codec/encoder/core/inc/wels_preprocess.h
+++ b/codec/encoder/core/inc/wels_preprocess.h
@@ -121,7 +121,6 @@
private:
int32_t SingleLayerPreprocess (sWelsEncCtx* pEncCtx, const SSourcePicture* kpSrc, Scaled_Picture* m_sScaledPicture);
- int32_t MultiLayerPreprocess (sWelsEncCtx* pEncCtx, const SSourcePicture* kpSrcPic);
void BilateralDenoising (SPicture* pSrc, const int32_t iWidth, const int32_t iHeight);
bool DetectSceneChange (SPicture* pCurPicture, SPicture* pRefPicture);
@@ -150,7 +149,6 @@
CWelsLib* m_pEncLib;
sWelsEncCtx* m_pEncCtx;
bool m_bInitDone;
- bool m_bOfficialBranch;
/* For Downsampling & VAA I420 based source pictures */
SPicture* m_pSpatialPic[MAX_DEPENDENCY_LAYER][MAX_TEMPORAL_LEVEL + 1 +
LONG_TERM_REF_NUM]; // need memory requirement with total number of (log2(uiGopSize)+1+1+long_term_ref_num)
--- a/codec/encoder/core/src/wels_preprocess.cpp
+++ b/codec/encoder/core/src/wels_preprocess.cpp
@@ -204,7 +204,6 @@
m_pInterfaceVp = NULL;
m_pEncLib = NULL;
m_bInitDone = false;
- m_bOfficialBranch = false;
m_pEncCtx = pEncCtx;
memset (&m_sScaledPicture, 0, sizeof (m_sScaledPicture));
memset (m_pSpatialPic, 0, sizeof(m_pSpatialPic));
@@ -316,13 +315,6 @@
if (WelsPreprocessReset (pCtx) != 0)
return -1;
- m_bOfficialBranch = (iNumDependencyLayer != 1);
- if ( iNumDependencyLayer == 1 ) {
- if (pSvcParam->sDependencyLayers[0].iFrameWidth != kpSrcPic->iPicWidth ||
- pSvcParam->sDependencyLayers[0].iFrameHeight != kpSrcPic->iPicHeight) {
- m_bOfficialBranch = true;
- }
- }
m_bInitDone = true;
}
@@ -333,12 +325,7 @@
if (pSvcParam->uiIntraPeriod)
pCtx->pVaa->bIdrPeriodFlag = (1 + pCtx->iFrameIndex >= (int32_t)pSvcParam->uiIntraPeriod) ? true : false;
- if (m_bOfficialBranch) { // Perform Down Sampling potentially due to application
- iSpatialNum = SingleLayerPreprocess (pCtx, kpSrcPic, &m_sScaledPicture);
- } else { // for console each spatial pictures are available there
- iSpatialNum = 1;
- MultiLayerPreprocess (pCtx, kpSrcPic);
- }
+ iSpatialNum = SingleLayerPreprocess (pCtx, kpSrcPic, &m_sScaledPicture);
return iSpatialNum;
}
@@ -518,46 +505,6 @@
return iSpatialNum;
}
-int32_t CWelsPreProcess::MultiLayerPreprocess (sWelsEncCtx* pCtx, const SSourcePicture* kpSrcPic) {
- SWelsSvcCodingParam* pSvcParam = pCtx->pSvcParam;
- const SSourcePicture* pSrc = NULL;
- SPicture* pDstPic = NULL;
- const int32_t iSpatialLayersCfgCount =
- pSvcParam->iSpatialLayerNum; // count number of spatial layers to be encoded in cfg
- int32_t j = -1;
-
- // do not clear j, just let it continue to save complexity
- do {
- ++ j;
- if (pSvcParam->sDependencyLayers[j].iFrameWidth == kpSrcPic->iPicWidth &&
- pSvcParam->sDependencyLayers[j].iFrameHeight == kpSrcPic->iPicHeight) {
- break;
- }
- } while (j < iSpatialLayersCfgCount);
-
- assert (j < iSpatialLayersCfgCount);
- pDstPic = m_pSpatialPic[j][m_uiSpatialLayersInTemporal[j] - 1];
-
- WelsUpdateSpatialIdxMap (pCtx, 0, pDstPic, j);
-
- WelsMoveMemoryWrapper (pSvcParam, pDstPic, kpSrcPic, kpSrcPic->iPicWidth, kpSrcPic->iPicHeight);
-
- if (pSvcParam->bEnableDenoise)
- BilateralDenoising (pDstPic, kpSrcPic->iPicWidth, kpSrcPic->iPicHeight);
-
- m_pLastSpatialPicture[j][1] = pDstPic;
-
- if (pSvcParam->bEnableSceneChangeDetect && (1 == pSvcParam->iSpatialLayerNum)
- && !pCtx->pVaa->bIdrPeriodFlag && !pCtx->bEncCurFrmAsIdrFlag) {
- SPicture* pRef = pCtx->pLtr[0].bReceivedT0LostFlag ?
- m_pSpatialPic[0][m_uiSpatialLayersInTemporal[0] + pCtx->pVaa->uiValidLongTermPicIdx] :
- m_pLastSpatialPicture[0][0];
-
- pCtx->pVaa->bSceneChangeFlag = DetectSceneChange (pDstPic, pRef);
- }
-
- return 0;
-}
/*!
* \brief Whether input picture need be scaled?
--- a/codec/encoder/targets.mk
+++ b/codec/encoder/targets.mk
@@ -47,6 +47,18 @@
ENCODER_OBJS += $(ENCODER_ASM_SRCS:.asm=.o)
endif
+ifeq ($(ASM_ARCH), arm) # ARM only: add one .o per listed .S source to ENCODER_OBJS; the blank line after the list ends the backslash continuation
+ENCODER_ASM_S_SRCS=\
+ $(ENCODER_SRCDIR)/core/arm/intra_pred_neon.S\
+ $(ENCODER_SRCDIR)/core/arm/intra_pred_sad_3_opt_neon.S\
+ $(ENCODER_SRCDIR)/core/arm/mc_neon.S\
+ $(ENCODER_SRCDIR)/core/arm/memory_neon.S\
+ $(ENCODER_SRCDIR)/core/arm/pixel_neon.S\
+ $(ENCODER_SRCDIR)/core/arm/reconstruct_neon.S\
+
+ENCODER_OBJS += $(ENCODER_ASM_S_SRCS:.S=.o)
+endif # ASM_ARCH is arm
+
OBJS += $(ENCODER_OBJS)
$(ENCODER_SRCDIR)/%.o: $(ENCODER_SRCDIR)/%.cpp
$(QUIET_CXX)$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(ENCODER_CFLAGS) $(ENCODER_INCLUDES) -c $(CXX_O) $<
@@ -53,6 +65,9 @@
$(ENCODER_SRCDIR)/%.o: $(ENCODER_SRCDIR)/%.asm
$(QUIET_ASM)$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(ENCODER_ASMFLAGS) $(ENCODER_ASM_INCLUDES) -o $@ $<
+
+$(ENCODER_SRCDIR)/%.o: $(ENCODER_SRCDIR)/%.S # build each encoder .S source with CCAS, passing CFLAGS and ASMFLAGS plus the common and encoder include paths
+ $(QUIET_CCAS)$(CCAS) $(CFLAGS) $(ASMFLAGS) $(INCLUDES) $(ENCODER_CFLAGS) $(ENCODER_INCLUDES) -c -o $@ $<
$(LIBPREFIX)encoder.$(LIBSUFFIX): $(ENCODER_OBJS)
$(QUIET)rm -f $@
--- a/codec/processing/src/arm/down_sample_neon.S
+++ b/codec/processing/src/arm/down_sample_neon.S
@@ -75,7 +75,7 @@
cmp lr, r4
movcs lr, #0
- addcs r6, r3, lsl #1
+ addcs r6, r6, r3, lsl #1
movcs r2, r6
addcs r7, r2, r3
addcs r8, r1
--- a/codec/processing/src/arm/pixel_sad_neon.S
+++ b/codec/processing/src/arm/pixel_sad_neon.S
@@ -35,7 +35,7 @@
#include "arm_arch_common_macro.S"
-WELS_ASM_FUNC_BEGIN WelsSampleSad8x8_neon
+WELS_ASM_FUNC_BEGIN WelsProcessingSampleSad8x8_neon
stmdb sp!, {lr}
//Loading a horizontal line data (8 bytes)
vld1.8 {d0}, [r0], r1
--- a/codec/processing/src/scenechangedetection/SceneChangeDetection.cpp
+++ b/codec/processing/src/scenechangedetection/SceneChangeDetection.cpp
@@ -133,7 +133,7 @@
#ifdef HAVE_NEON
if (iCpuFlag & WELS_CPU_NEON) {
- pfSad = WelsSampleSad8x8_neon;
+ pfSad = WelsProcessingSampleSad8x8_neon;
}
#endif
}
--- a/codec/processing/src/scenechangedetection/SceneChangeDetectionCommon.h
+++ b/codec/processing/src/scenechangedetection/SceneChangeDetectionCommon.h
@@ -62,7 +62,7 @@
#ifdef HAVE_NEON
WELSVP_EXTERN_C_BEGIN
-SadFunc WelsSampleSad8x8_neon;
+SadFunc WelsProcessingSampleSad8x8_neon;
WELSVP_EXTERN_C_END
#endif
--- a/codec/processing/targets.mk
+++ b/codec/processing/targets.mk
@@ -29,6 +29,16 @@
PROCESSING_OBJS += $(PROCESSING_ASM_SRCS:.asm=.o)
endif
+ifeq ($(ASM_ARCH), arm) # ARM only: add one .o per listed .S source to PROCESSING_OBJS; the blank line after the list ends the backslash continuation
+PROCESSING_ASM_S_SRCS=\
+ $(PROCESSING_SRCDIR)/src/arm/adaptive_quantization.S\
+ $(PROCESSING_SRCDIR)/src/arm/down_sample_neon.S\
+ $(PROCESSING_SRCDIR)/src/arm/pixel_sad_neon.S\
+ $(PROCESSING_SRCDIR)/src/arm/vaa_calc_neon.S\
+
+PROCESSING_OBJS += $(PROCESSING_ASM_S_SRCS:.S=.o)
+endif # ASM_ARCH is arm
+
OBJS += $(PROCESSING_OBJS)
$(PROCESSING_SRCDIR)/%.o: $(PROCESSING_SRCDIR)/%.cpp
$(QUIET_CXX)$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c $(CXX_O) $<
@@ -35,6 +45,9 @@
$(PROCESSING_SRCDIR)/%.o: $(PROCESSING_SRCDIR)/%.asm
$(QUIET_ASM)$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(PROCESSING_ASMFLAGS) $(PROCESSING_ASM_INCLUDES) -o $@ $<
+
+$(PROCESSING_SRCDIR)/%.o: $(PROCESSING_SRCDIR)/%.S # build each processing .S source with CCAS, passing CFLAGS and ASMFLAGS plus the common and processing include paths
+ $(QUIET_CCAS)$(CCAS) $(CFLAGS) $(ASMFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $@ $<
$(LIBPREFIX)processing.$(LIBSUFFIX): $(PROCESSING_OBJS)
$(QUIET)rm -f $@