ref: 8bbcd3f7a7518808032be82260fd1ebb02337d2d
parent: 556780b7556b5be83ab49d75b23f74434f848132
author: Martin Storsjö <[email protected]>
date: Sun Apr 7 20:11:03 EDT 2019
arm: Add a _neon suffix to all internal functions. This eases disambiguating these functions when looking at perf profiles.
--- a/src/arm/32/mc.S
+++ b/src/arm/32/mc.S
@@ -218,7 +218,7 @@
// This has got the same signature as the put_8tap functions,
// assumes that the caller has loaded the h argument into r5,
// and assumes that r8 is set to (clz(w)-24).
-function put
+function put_neon
adr r9, L(put_tbl)
ldr r8, [r9, r8, lsl #2]
add r9, r9, r8
@@ -309,7 +309,7 @@
// This has got the same signature as the put_8tap functions,
// assumes that the caller has loaded the h argument into r4,
// and assumes that r8 is set to (clz(w)-24), and r7 to w*2.
-function prep
+function prep_neon
adr r9, L(prep_tbl)
ldr r8, [r9, r8, lsl #2]
add r9, r9, r8
@@ -660,7 +660,7 @@
push {r4-r11,lr}
movw r8, \type_h
movw r9, \type_v
- b \op\()_8tap
+ b \op\()_8tap_neon
endfunc
.endm
@@ -680,7 +680,7 @@
make_8tap_fn \type, sharp_regular, SHARP, REGULAR
make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH
-function \type\()_8tap
+function \type\()_8tap_neon
ldrd r4, r5, [sp, #36]
ldrd r6, r7, [sp, #44]
movw r10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0)
@@ -699,7 +699,7 @@
bne L(\type\()_8tap_h)
tst \my, #(0x7f << 14)
bne L(\type\()_8tap_v)
- b \type
+ b \type\()_neon
L(\type\()_8tap_h):
cmp \w, #4
@@ -1831,7 +1831,7 @@
bne L(\type\()_bilin_h)
cmp \my, #0
bne L(\type\()_bilin_v)
- b \type
+ b \type\()_neon
L(\type\()_bilin_h):
cmp \my, #0
--- a/src/arm/64/mc.S
+++ b/src/arm/64/mc.S
@@ -236,7 +236,7 @@
// This has got the same signature as the put_8tap functions,
// and assumes that x8 is set to (clz(w)-24).
-function put
+function put_neon
adr x9, L(put_tbl)
ldrh w8, [x9, x8, lsl #1]
sub x9, x9, w8, uxtw
@@ -331,7 +331,7 @@
// This has got the same signature as the prep_8tap functions,
// and assumes that x8 is set to (clz(w)-24), and x7 to w*2.
-function prep
+function prep_neon
adr x9, L(prep_tbl)
ldrh w8, [x9, x8, lsl #1]
sub x9, x9, w8, uxtw
@@ -703,7 +703,7 @@
function \op\()_8tap_\type\()_8bpc_neon, export=1
mov x8, \type_h
mov x9, \type_v
- b \op\()_8tap
+ b \op\()_8tap\()_neon
endfunc
.endm
@@ -723,7 +723,7 @@
make_8tap_fn \type, sharp_regular, SHARP, REGULAR
make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH
-function \type\()_8tap
+function \type\()_8tap_neon
mov w10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0)
mul \mx, \mx, w10
mul \my, \my, w10
@@ -741,7 +741,7 @@
b.ne L(\type\()_8tap_h)
tst \my, #(0x7f << 14)
b.ne L(\type\()_8tap_v)
- b \type
+ b \type\()_neon
L(\type\()_8tap_h):
cmp \w, #4
@@ -1826,7 +1826,7 @@
sub w8, w8, #24
cbnz \mx, L(\type\()_bilin_h)
cbnz \my, L(\type\()_bilin_v)
- b \type
+ b \type\()_neon
L(\type\()_bilin_h):
cbnz \my, L(\type\()_bilin_hv)
@@ -2335,7 +2335,7 @@
add \src, \src, \inc
.endm
-function warp_filter_horz
+function warp_filter_horz_neon
add w12, w5, #512
ld1 {v16.8b, v17.8b}, [x2], x3
@@ -2431,24 +2431,24 @@
lsl x1, x1, #1
.endif
- bl warp_filter_horz
+ bl warp_filter_horz_neon
mov v24.16b, v16.16b
- bl warp_filter_horz
+ bl warp_filter_horz_neon
mov v25.16b, v16.16b
- bl warp_filter_horz
+ bl warp_filter_horz_neon
mov v26.16b, v16.16b
- bl warp_filter_horz
+ bl warp_filter_horz_neon
mov v27.16b, v16.16b
- bl warp_filter_horz
+ bl warp_filter_horz_neon
mov v28.16b, v16.16b
- bl warp_filter_horz
+ bl warp_filter_horz_neon
mov v29.16b, v16.16b
- bl warp_filter_horz
+ bl warp_filter_horz_neon
mov v30.16b, v16.16b
1:
add w14, w6, #512
- bl warp_filter_horz
+ bl warp_filter_horz_neon
mov v31.16b, v16.16b
load_filter_row d0, w14, w9