ref: 720f8dcc525c2fef52518080e6e26b353a535abf
parent: b9477cdb942bfed223a2fa6cde5e3198d31756d0
author: Martin Storsjö <[email protected]>
date: Tue Jun 17 06:10:50 EDT 2014
Fix building the deblocking aarch64 assembly with GNU binutils
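
GNU binutils' gas requires the documented \() separator to mark the end
of a macro argument name when it is immediately followed by more text;
it does not expand \arg0.16b as the argument \arg0 followed by the .16b
suffix. Clang's integrated assembler accepts the shorter form, which is
presumably why these macros assembled before. A minimal sketch of the
pattern, using a hypothetical macro rather than code from this file:

    .macro SHIFT_LEFT_ONE arg0
        shl \arg0\().16b, \arg0\().16b, #1  // \() ends the argument name before .16b
    .endm

    SHIFT_LEFT_ONE v0   // expands to: shl v0.16b, v0.16b, #1

The change below applies this spelling mechanically to every macro
argument reference that carries a register arrangement suffix.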
--- a/codec/common/arm64/deblocking_aarch64_neon.S
+++ b/codec/common/arm64/deblocking_aarch64_neon.S
@@ -295,166 +295,166 @@
#else
.macro MASK_MATRIX arg0, arg1, arg2, arg3, arg4, arg5, arg6
- uabd \arg6.16b, \arg1.16b, \arg2.16b
- cmhi \arg6.16b, \arg4.16b, \arg6.16b
+ uabd \arg6\().16b, \arg1\().16b, \arg2\().16b
+ cmhi \arg6\().16b, \arg4\().16b, \arg6\().16b
- uabd \arg4.16b, \arg0.16b, \arg1.16b
- cmhi \arg4.16b, \arg5.16b, \arg4.16b
- and \arg6.16b, \arg6.16b, \arg4.16b
+ uabd \arg4\().16b, \arg0\().16b, \arg1\().16b
+ cmhi \arg4\().16b, \arg5\().16b, \arg4\().16b
+ and \arg6\().16b, \arg6\().16b, \arg4\().16b
- uabd \arg4.16b, \arg3.16b, \arg2.16b
- cmhi \arg4.16b, \arg5.16b, \arg4.16b
- and \arg6.16b, \arg6.16b, \arg4.16b
+ uabd \arg4\().16b, \arg3\().16b, \arg2\().16b
+ cmhi \arg4\().16b, \arg5\().16b, \arg4\().16b
+ and \arg6\().16b, \arg6\().16b, \arg4\().16b
.endm
.macro DIFF_LUMA_LT4_P1_Q1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
//v0, v1, v2, v3, v17(beta), v18(-Tc0), v6(Tc0), v7(flag), v19, v20
- urhadd \arg8.16b, \arg2.16b, \arg3.16b
- uhadd \arg8.16b, \arg0.16b, \arg8.16b
- usubl \arg9.8h, \arg8.8b, \arg1.8b
- sqxtn \arg9.8b, \arg9.8h
- usubl2 \arg8.8h, \arg8.16b, \arg1.16b
- sqxtn2 \arg9.16b, \arg8.8h
- smax \arg8.16b, \arg9.16b, \arg5.16b
+ urhadd \arg8\().16b, \arg2\().16b, \arg3\().16b
+ uhadd \arg8\().16b, \arg0\().16b, \arg8\().16b
+ usubl \arg9\().8h, \arg8\().8b, \arg1\().8b
+ sqxtn \arg9\().8b, \arg9\().8h
+ usubl2 \arg8\().8h, \arg8\().16b, \arg1\().16b
+ sqxtn2 \arg9\().16b, \arg8\().8h
+ smax \arg8\().16b, \arg9\().16b, \arg5\().16b
//
- smin \arg8.16b, \arg8.16b, \arg6.16b
- uabd \arg9.16b, \arg0.16b, \arg2.16b
- cmhi \arg9.16b, \arg4.16b, \arg9.16b
- and \arg8.16b, \arg8.16b, \arg9.16b
- and \arg8.16b, \arg8.16b, \arg7.16b
- add \arg8.16b, \arg1.16b, \arg8.16b
- abs \arg9.16b, \arg9.16b
+ smin \arg8\().16b, \arg8\().16b, \arg6\().16b
+ uabd \arg9\().16b, \arg0\().16b, \arg2\().16b
+ cmhi \arg9\().16b, \arg4\().16b, \arg9\().16b
+ and \arg8\().16b, \arg8\().16b, \arg9\().16b
+ and \arg8\().16b, \arg8\().16b, \arg7\().16b
+ add \arg8\().16b, \arg1\().16b, \arg8\().16b
+ abs \arg9\().16b, \arg9\().16b
.endm
.macro DIFF_LUMA_LT4_P0_Q0_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6
- usubl \arg5.8h, \arg0.8b, \arg3.8b
- usubl \arg6.8h, \arg2.8b, \arg1.8b
- shl \arg6.8h, \arg6.8h, #2
- add \arg5.8h, \arg5.8h, \arg6.8h
- sqrshrn \arg4.8b, \arg5.8h, #3
+ usubl \arg5\().8h, \arg0\().8b, \arg3\().8b
+ usubl \arg6\().8h, \arg2\().8b, \arg1\().8b
+ shl \arg6\().8h, \arg6\().8h, #2
+ add \arg5\().8h, \arg5\().8h, \arg6\().8h
+ sqrshrn \arg4\().8b, \arg5\().8h, #3
.endm
.macro DIFF_LUMA_LT4_P0_Q0_2 arg0, arg1, arg2, arg3, arg4, arg5, arg6
- usubl2 \arg5.8h, \arg0.16b, \arg3.16b
- usubl2 \arg6.8h, \arg2.16b, \arg1.16b
- shl \arg6.8h, \arg6.8h, #2
- add \arg5.8h, \arg5.8h, \arg6.8h
- sqrshrn2 \arg4.16b, \arg5.8h, #3
+ usubl2 \arg5\().8h, \arg0\().16b, \arg3\().16b
+ usubl2 \arg6\().8h, \arg2\().16b, \arg1\().16b
+ shl \arg6\().8h, \arg6\().8h, #2
+ add \arg5\().8h, \arg5\().8h, \arg6\().8h
+ sqrshrn2 \arg4\().16b, \arg5\().8h, #3
.endm
.macro EXTRACT_DELTA_INTO_TWO_PART arg0, arg1
- cmge \arg1.16b, \arg0.16b, #0
- and \arg1.16b, \arg0.16b, \arg1.16b
- sub \arg0.16b, \arg1.16b, \arg0.16b
+ cmge \arg1\().16b, \arg0\().16b, #0
+ and \arg1\().16b, \arg0\().16b, \arg1\().16b
+ sub \arg0\().16b, \arg1\().16b, \arg0\().16b
.endm
.macro DIFF_LUMA_EQ4_P2P1P0_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
- uaddl \arg8.8h, \arg1.8b, \arg2.8b
- uaddl \arg9.8h, \arg3.8b, \arg4.8b
- add \arg9.8h, \arg9.8h, \arg8.8h
+ uaddl \arg8\().8h, \arg1\().8b, \arg2\().8b
+ uaddl \arg9\().8h, \arg3\().8b, \arg4\().8b
+ add \arg9\().8h, \arg9\().8h, \arg8\().8h
- uaddl \arg8.8h, \arg0.8b, \arg1.8b
- shl \arg8.8h, \arg8.8h, #1
- add \arg8.8h, \arg9.8h, \arg8.8h
+ uaddl \arg8\().8h, \arg0\().8b, \arg1\().8b
+ shl \arg8\().8h, \arg8\().8h, #1
+ add \arg8\().8h, \arg9\().8h, \arg8\().8h
- rshrn \arg0.8b, \arg9.8h, #2
- rshrn \arg7.8b, \arg8.8h, #3
- shl \arg9.8h, \arg9.8h, #1
- usubl \arg8.8h, \arg5.8b, \arg1.8b
- add \arg9.8h, \arg8.8h, \arg9.8h
+ rshrn \arg0\().8b, \arg9\().8h, #2
+ rshrn \arg7\().8b, \arg8\().8h, #3
+ shl \arg9\().8h, \arg9\().8h, #1
+ usubl \arg8\().8h, \arg5\().8b, \arg1\().8b
+ add \arg9\().8h, \arg8\().8h, \arg9\().8h
- uaddl \arg8.8h, \arg2.8b, \arg5.8b
- uaddw \arg8.8h, \arg8.8h, \arg2.8b
- uaddw \arg8.8h, \arg8.8h, \arg3.8b
+ uaddl \arg8\().8h, \arg2\().8b, \arg5\().8b
+ uaddw \arg8\().8h, \arg8\().8h, \arg2\().8b
+ uaddw \arg8\().8h, \arg8\().8h, \arg3\().8b
- rshrn \arg9.8b, \arg9.8h, #3
- rshrn \arg8.8b, \arg8.8h, #2
- bsl \arg6.8b, \arg9.8b, \arg8.8b
+ rshrn \arg9\().8b, \arg9\().8h, #3
+ rshrn \arg8\().8b, \arg8\().8h, #2
+ bsl \arg6\().8b, \arg9\().8b, \arg8\().8b
.endm
.macro DIFF_LUMA_EQ4_P2P1P0_2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
- uaddl2 \arg8.8h, \arg1.16b, \arg2.16b
- uaddl2 \arg9.8h, \arg3.16b, \arg4.16b
- add \arg9.8h, \arg9.8h, \arg8.8h
+ uaddl2 \arg8\().8h, \arg1\().16b, \arg2\().16b
+ uaddl2 \arg9\().8h, \arg3\().16b, \arg4\().16b
+ add \arg9\().8h, \arg9\().8h, \arg8\().8h
- uaddl2 \arg8.8h, \arg0.16b, \arg1.16b
- shl \arg8.8h, \arg8.8h, #1
- add \arg8.8h, \arg9.8h, \arg8.8h
+ uaddl2 \arg8\().8h, \arg0\().16b, \arg1\().16b
+ shl \arg8\().8h, \arg8\().8h, #1
+ add \arg8\().8h, \arg9\().8h, \arg8\().8h
- rshrn2 \arg0.16b, \arg9.8h, #2
- rshrn2 \arg7.16b, \arg8.8h, #3
- shl \arg9.8h, \arg9.8h, #1
- usubl2 \arg8.8h, \arg5.16b, \arg1.16b
- add \arg9.8h, \arg8.8h, \arg9.8h
+ rshrn2 \arg0\().16b, \arg9\().8h, #2
+ rshrn2 \arg7\().16b, \arg8\().8h, #3
+ shl \arg9\().8h, \arg9\().8h, #1
+ usubl2 \arg8\().8h, \arg5\().16b, \arg1\().16b
+ add \arg9\().8h, \arg8\().8h, \arg9\().8h
- uaddl2 \arg8.8h, \arg2.16b, \arg5.16b
- uaddw2 \arg8.8h, \arg8.8h, \arg2.16b
- uaddw2 \arg8.8h, \arg8.8h, \arg3.16b
+ uaddl2 \arg8\().8h, \arg2\().16b, \arg5\().16b
+ uaddw2 \arg8\().8h, \arg8\().8h, \arg2\().16b
+ uaddw2 \arg8\().8h, \arg8\().8h, \arg3\().16b
- rshrn2 \arg9.16b, \arg9.8h, #3
- rshrn2 \arg8.16b, \arg8.8h, #2
- bsl \arg6.16b, \arg9.16b, \arg8.16b
+ rshrn2 \arg9\().16b, \arg9\().8h, #3
+ rshrn2 \arg8\().16b, \arg8\().8h, #2
+ bsl \arg6\().16b, \arg9\().16b, \arg8\().16b
.endm
.macro DIFF_CHROMA_EQ4_P0Q0_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
- uaddl \arg4.8h, \arg0.8b, \arg3.8b
- shl \arg4.8h, \arg4.8h, #1
- usubl \arg5.8h, \arg1.8b, \arg3.8b
- add \arg5.8h, \arg5.8h, \arg4.8h
- rshrn \arg6.8b, \arg5.8h, #2
- usubl \arg5.8h, \arg2.8b, \arg0.8b
- add \arg5.8h, \arg5.8h, \arg4.8h
- rshrn \arg7.8b, \arg5.8h, #2
+ uaddl \arg4\().8h, \arg0\().8b, \arg3\().8b
+ shl \arg4\().8h, \arg4\().8h, #1
+ usubl \arg5\().8h, \arg1\().8b, \arg3\().8b
+ add \arg5\().8h, \arg5\().8h, \arg4\().8h
+ rshrn \arg6\().8b, \arg5\().8h, #2
+ usubl \arg5\().8h, \arg2\().8b, \arg0\().8b
+ add \arg5\().8h, \arg5\().8h, \arg4\().8h
+ rshrn \arg7\().8b, \arg5\().8h, #2
.endm
.macro DIFF_CHROMA_EQ4_P0Q0_2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
- uaddl2 \arg4.8h, \arg0.16b, \arg3.16b
- shl \arg4.8h, \arg4.8h, #1
- usubl2 \arg5.8h, \arg1.16b, \arg3.16b
- add \arg5.8h, \arg5.8h, \arg4.8h
- rshrn2 \arg6.16b, \arg5.8h, #2
- usubl2 \arg5.8h, \arg2.16b, \arg0.16b
- add \arg5.8h, \arg5.8h, \arg4.8h
- rshrn2 \arg7.16b, \arg5.8h, #2
+ uaddl2 \arg4\().8h, \arg0\().16b, \arg3\().16b
+ shl \arg4\().8h, \arg4\().8h, #1
+ usubl2 \arg5\().8h, \arg1\().16b, \arg3\().16b
+ add \arg5\().8h, \arg5\().8h, \arg4\().8h
+ rshrn2 \arg6\().16b, \arg5\().8h, #2
+ usubl2 \arg5\().8h, \arg2\().16b, \arg0\().16b
+ add \arg5\().8h, \arg5\().8h, \arg4\().8h
+ rshrn2 \arg7\().16b, \arg5\().8h, #2
.endm
.macro DIFF_LUMA_EQ4_MASK arg0, arg1, arg2, arg3
mov.16b \arg3, \arg2
- bsl \arg3.16b, \arg0.16b, \arg1.16b
+ bsl \arg3\().16b, \arg0\().16b, \arg1\().16b
.endm
.macro LOAD_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
- ld3 {\arg0.b, \arg1.b, \arg2.b} [\arg6], [x2], x1
- ld3 {\arg3.b, \arg4.b, \arg5.b} [\arg6], [x0], x1
+ ld3 {\arg0\().b, \arg1\().b, \arg2\().b} [\arg6], [x2], x1
+ ld3 {\arg3\().b, \arg4\().b, \arg5\().b} [\arg6], [x0], x1
.endm
.macro LOAD_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
- ld4 {\arg0.b, \arg1.b, \arg2.b, \arg3.b} [\arg8], [x3], x1
- ld4 {\arg4.b, \arg5.b, \arg6.b, \arg7.b} [\arg8], [x0], x1
+ ld4 {\arg0\().b, \arg1\().b, \arg2\().b, \arg3\().b} [\arg8], [x3], x1
+ ld4 {\arg4\().b, \arg5\().b, \arg6\().b, \arg7\().b} [\arg8], [x0], x1
.endm
.macro STORE_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5
- st4 {\arg0.b, \arg1.b, \arg2.b, \arg3.b} [\arg4], [x0], x1
- st4 {\arg0.b, \arg1.b, \arg2.b, \arg3.b} [\arg5], [x2], x1
+ st4 {\arg0\().b, \arg1\().b, \arg2\().b, \arg3\().b} [\arg4], [x0], x1
+ st4 {\arg0\().b, \arg1\().b, \arg2\().b, \arg3\().b} [\arg5], [x2], x1
.endm
.macro STORE_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
- st3 {\arg0.b, \arg1.b, \arg2.b} [\arg6], [x3], x1
- st3 {\arg3.b, \arg4.b, \arg5.b} [\arg6], [x0], x1
+ st3 {\arg0\().b, \arg1\().b, \arg2\().b} [\arg6], [x3], x1
+ st3 {\arg3\().b, \arg4\().b, \arg5\().b} [\arg6], [x0], x1
.endm
.macro LOAD_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5
- ld4 {\arg0.b, \arg1.b, \arg2.b, \arg3.b} [\arg5], [\arg4], x2
+ ld4 {\arg0\().b, \arg1\().b, \arg2\().b, \arg3\().b} [\arg5], [\arg4], x2
.endm
.macro STORE_CHROMA_DATA_2 arg0, arg1, arg2, arg3
- st2 {\arg0.b, \arg1.b} [\arg3], [\arg2], x2
+ st2 {\arg0\().b, \arg1\().b} [\arg3], [\arg2], x2
.endm
.macro ZERO_JUMP_END arg0, arg1, arg2, arg3
- mov \arg1, \arg0.d[0]
- mov \arg2, \arg0.d[1]
+ mov \arg1, \arg0\().d[0]
+ mov \arg2, \arg0\().d[1]
orr \arg1, \arg1, \arg2
cbz \arg1, \arg3
.endm
@@ -471,7 +471,7 @@
bs_nzc_check_jump0:
ext.16b v1, v1, v0, #12
- add \arg3.16b, v0.16b, v1.16b
+ add \arg3\().16b, v0.16b, v1.16b
// Arrange the input data --- LEFT
ands x6, \arg1, #1
@@ -492,28 +492,28 @@
ins v2.d[0], v0.d[1]
zip1 v0.16b, v0.16b, v2.16b
ext.16b v1, v1, v0, #12
- add \arg4.16b, v0.16b, v1.16b
+ add \arg4\().16b, v0.16b, v1.16b
.endm
.macro BS_COMPARE_MV arg0, arg1, arg2, arg3, arg4, arg5
//in: \arg0,\arg1(const),\arg2(const),\arg3(const),\arg4(const); out:\arg5
mov w6, #4
- sabd v20.8h, \arg0.8h, \arg1.8h
- sabd v21.8h, \arg1.8h, \arg2.8h
- dup \arg0.8h, w6
- sabd v22.8h, \arg2.8h, \arg3.8h
- sabd v23.8h, \arg3.8h, \arg4.8h
+ sabd v20.8h, \arg0\().8h, \arg1\().8h
+ sabd v21.8h, \arg1\().8h, \arg2\().8h
+ dup \arg0\().8h, w6
+ sabd v22.8h, \arg2\().8h, \arg3\().8h
+ sabd v23.8h, \arg3\().8h, \arg4\().8h
- cmge v20.8h, v20.8h, \arg0.8h
- cmge v21.8h, v21.8h, \arg0.8h
- cmge v22.8h, v22.8h, \arg0.8h
- cmge v23.8h, v23.8h, \arg0.8h
+ cmge v20.8h, v20.8h, \arg0\().8h
+ cmge v21.8h, v21.8h, \arg0\().8h
+ cmge v22.8h, v22.8h, \arg0\().8h
+ cmge v23.8h, v23.8h, \arg0\().8h
addp v20.8h, v20.8h, v21.8h
addp v21.8h, v22.8h, v23.8h
- addhn \arg5.8b, v20.8h, v20.8h
- addhn2 \arg5.16b, v21.8h, v21.8h
+ addhn \arg5\().8b, v20.8h, v20.8h
+ addhn2 \arg5\().16b, v21.8h, v21.8h
.endm
.macro BS_MV_CHECK arg0, arg1, arg2, arg3, arg4, arg5, arg6
@@ -540,14 +540,14 @@
ld1 {v4.s} [2], [x6]
ld1 {v4.s} [3], [x7]
bs_mv_check_jump1:
- zip1 \arg5.4s, v0.4s, v2.4s
- zip2 \arg6.4s, v0.4s, v2.4s
+ zip1 \arg5\().4s, v0.4s, v2.4s
+ zip2 \arg6\().4s, v0.4s, v2.4s
zip1 v0.4s, v1.4s, v3.4s
zip2 v2.4s, v1.4s, v3.4s
- zip2 v1.4s, \arg5.4s, v0.4s
- zip1 v0.4s, \arg5.4s, v0.4s
- zip2 v3.4s, \arg6.4s, v2.4s
- zip1 v2.4s, \arg6.4s, v2.4s
+ zip2 v1.4s, \arg5\().4s, v0.4s
+ zip1 v0.4s, \arg5\().4s, v0.4s
+ zip2 v3.4s, \arg6\().4s, v2.4s
+ zip1 v2.4s, \arg6\().4s, v2.4s
BS_COMPARE_MV v4, v0, v1, v2, v3, \arg4
.endm
#endif