shithub: dav1d

--- a/src/arm/32/mc.S

+++ b/src/arm/32/mc.S

@@ -218,7 +218,7 @@

 // This has got the same signature as the put_8tap functions,

 // assumes that the caller has loaded the h argument into r5,

 // and assumes that r8 is set to (clz(w)-24).

-function put

+function put_neon

         adr             r9,  L(put_tbl)

         ldr             r8,  [r9, r8, lsl #2]

         add             r9,  r9,  r8

@@ -309,7 +309,7 @@

 // This has got the same signature as the put_8tap functions,

 // assumes that the caller has loaded the h argument into r4,

 // and assumes that r8 is set to (clz(w)-24), and r7 to w*2.

-function prep

+function prep_neon

         adr             r9,  L(prep_tbl)

         ldr             r8,  [r9, r8, lsl #2]

         add             r9,  r9,  r8

@@ -660,7 +660,7 @@

         push            {r4-r11,lr}

         movw            r8,  \type_h

         movw            r9,  \type_v

-        b               \op\()_8tap

+        b               \op\()_8tap_neon

 endfunc

 .endm

@@ -680,7 +680,7 @@

 make_8tap_fn \type, sharp_regular,  SHARP,   REGULAR

 make_8tap_fn \type, sharp_smooth,   SHARP,   SMOOTH

-function \type\()_8tap

+function \type\()_8tap_neon

         ldrd            r4,  r5,  [sp, #36]

         ldrd            r6,  r7,  [sp, #44]

         movw            r10,  #0x4081  // (1 << 14) | (1 << 7) | (1 << 0)

@@ -699,7 +699,7 @@

         bne             L(\type\()_8tap_h)

         tst             \my, #(0x7f << 14)

         bne             L(\type\()_8tap_v)

-        b               \type

+        b               \type\()_neon

 L(\type\()_8tap_h):

         cmp             \w,  #4

@@ -1831,7 +1831,7 @@

         bne             L(\type\()_bilin_h)

         cmp             \my, #0

         bne             L(\type\()_bilin_v)

-        b               \type

+        b               \type\()_neon

 L(\type\()_bilin_h):

         cmp             \my, #0

--- a/src/arm/64/mc.S

+++ b/src/arm/64/mc.S

@@ -236,7 +236,7 @@

 // This has got the same signature as the put_8tap functions,

 // and assumes that x8 is set to (clz(w)-24).

-function put

+function put_neon

         adr             x9,  L(put_tbl)

         ldrh            w8,  [x9, x8, lsl #1]

         sub             x9,  x9,  w8, uxtw

@@ -331,7 +331,7 @@

 // This has got the same signature as the prep_8tap functions,

 // and assumes that x8 is set to (clz(w)-24), and x7 to w*2.

-function prep

+function prep_neon

         adr             x9,  L(prep_tbl)

         ldrh            w8,  [x9, x8, lsl #1]

         sub             x9,  x9,  w8, uxtw

@@ -703,7 +703,7 @@

 function \op\()_8tap_\type\()_8bpc_neon, export=1

         mov             x8,  \type_h

         mov             x9,  \type_v

-        b               \op\()_8tap

+        b               \op\()_8tap\()_neon

 endfunc

 .endm

@@ -723,7 +723,7 @@

 make_8tap_fn \type, sharp_regular,  SHARP,   REGULAR

 make_8tap_fn \type, sharp_smooth,   SHARP,   SMOOTH

-function \type\()_8tap

+function \type\()_8tap_neon

         mov             w10,  #0x4081  // (1 << 14) | (1 << 7) | (1 << 0)

         mul             \mx,  \mx, w10

         mul             \my,  \my, w10

@@ -741,7 +741,7 @@

         b.ne            L(\type\()_8tap_h)

         tst             \my, #(0x7f << 14)

         b.ne            L(\type\()_8tap_v)

-        b               \type

+        b               \type\()_neon

 L(\type\()_8tap_h):

         cmp             \w,  #4

@@ -1826,7 +1826,7 @@

         sub             w8,  w8,  #24

         cbnz            \mx, L(\type\()_bilin_h)

         cbnz            \my, L(\type\()_bilin_v)

-        b               \type

+        b               \type\()_neon

 L(\type\()_bilin_h):

         cbnz            \my, L(\type\()_bilin_hv)

@@ -2335,7 +2335,7 @@

         add             \src, \src, \inc

 .endm

-function warp_filter_horz

+function warp_filter_horz_neon

         add             w12, w5,  #512

         ld1             {v16.8b, v17.8b}, [x2], x3

@@ -2431,24 +2431,24 @@

         lsl             x1,  x1,  #1

 .endif

-        bl              warp_filter_horz

+        bl              warp_filter_horz_neon

         mov             v24.16b, v16.16b

-        bl              warp_filter_horz

+        bl              warp_filter_horz_neon

         mov             v25.16b, v16.16b

-        bl              warp_filter_horz

+        bl              warp_filter_horz_neon

         mov             v26.16b, v16.16b

-        bl              warp_filter_horz

+        bl              warp_filter_horz_neon

         mov             v27.16b, v16.16b

-        bl              warp_filter_horz

+        bl              warp_filter_horz_neon

         mov             v28.16b, v16.16b

-        bl              warp_filter_horz

+        bl              warp_filter_horz_neon

         mov             v29.16b, v16.16b

-        bl              warp_filter_horz

+        bl              warp_filter_horz_neon

         mov             v30.16b, v16.16b

1:

         add             w14, w6,  #512

-        bl              warp_filter_horz

+        bl              warp_filter_horz_neon

         mov             v31.16b, v16.16b

         load_filter_row d0, w14, w9