ref: 4aa0363a342613b41034911ba5a5d285bf9c7335
parent: 842b2074110741fce556856f622c99fd6dcc810b
author: Martin Storsjö <[email protected]>
date: Fri Oct 26 20:54:36 EDT 2018
arm64: mc: Implement 8tap and bilin functions These functions have been tuned against Cortex A53 and Snapdragon 835. The bilin functions have mainly been written with code size in mind, as they aren't used much in practice. Relative speedups for the actual filtering functions (that don't just do a plain copy) are around 4-15x, some over 20x. This is in comparison with GCC 5.4 with autovectorization disabled; the actual real-world speedup against autovectorized C code is around 4-10x. Relative speedups measured with checkasm: Cortex A53 Snapdragon 835 mc_8tap_regular_w2_0_8bpc_neon: 6.96 5.28 mc_8tap_regular_w2_h_8bpc_neon: 5.16 4.35 mc_8tap_regular_w2_hv_8bpc_neon: 5.37 4.98 mc_8tap_regular_w2_v_8bpc_neon: 6.35 4.85 mc_8tap_regular_w4_0_8bpc_neon: 6.78 5.73 mc_8tap_regular_w4_h_8bpc_neon: 8.40 6.60 mc_8tap_regular_w4_hv_8bpc_neon: 7.23 7.10 mc_8tap_regular_w4_v_8bpc_neon: 9.06 7.76 mc_8tap_regular_w8_0_8bpc_neon: 6.96 5.55 mc_8tap_regular_w8_h_8bpc_neon: 10.36 6.88 mc_8tap_regular_w8_hv_8bpc_neon: 9.49 6.86 mc_8tap_regular_w8_v_8bpc_neon: 12.06 9.61 mc_8tap_regular_w16_0_8bpc_neon: 6.68 4.51 mc_8tap_regular_w16_h_8bpc_neon: 12.30 7.77 mc_8tap_regular_w16_hv_8bpc_neon: 9.50 6.68 mc_8tap_regular_w16_v_8bpc_neon: 12.93 9.68 mc_8tap_regular_w32_0_8bpc_neon: 3.91 2.93 mc_8tap_regular_w32_h_8bpc_neon: 13.06 7.89 mc_8tap_regular_w32_hv_8bpc_neon: 9.37 6.70 mc_8tap_regular_w32_v_8bpc_neon: 12.88 9.49 mc_8tap_regular_w64_0_8bpc_neon: 2.89 1.68 mc_8tap_regular_w64_h_8bpc_neon: 13.48 8.00 mc_8tap_regular_w64_hv_8bpc_neon: 9.23 6.53 mc_8tap_regular_w64_v_8bpc_neon: 13.11 9.68 mc_8tap_regular_w128_0_8bpc_neon: 1.89 1.24 mc_8tap_regular_w128_h_8bpc_neon: 13.58 7.98 mc_8tap_regular_w128_hv_8bpc_neon: 8.86 6.53 mc_8tap_regular_w128_v_8bpc_neon: 12.46 9.63 mc_bilinear_w2_0_8bpc_neon: 7.02 5.40 mc_bilinear_w2_h_8bpc_neon: 3.65 3.14 mc_bilinear_w2_hv_8bpc_neon: 4.36 4.84 mc_bilinear_w2_v_8bpc_neon: 5.22 4.28 mc_bilinear_w4_0_8bpc_neon: 6.87 5.99 mc_bilinear_w4_h_8bpc_neon: 6.50 8.61 
mc_bilinear_w4_hv_8bpc_neon: 7.70 7.99 mc_bilinear_w4_v_8bpc_neon: 7.04 9.10 mc_bilinear_w8_0_8bpc_neon: 7.03 5.70 mc_bilinear_w8_h_8bpc_neon: 11.30 15.14 mc_bilinear_w8_hv_8bpc_neon: 15.74 13.50 mc_bilinear_w8_v_8bpc_neon: 13.40 17.54 mc_bilinear_w16_0_8bpc_neon: 6.75 4.48 mc_bilinear_w16_h_8bpc_neon: 17.02 13.95 mc_bilinear_w16_hv_8bpc_neon: 17.37 13.78 mc_bilinear_w16_v_8bpc_neon: 23.69 22.98 mc_bilinear_w32_0_8bpc_neon: 3.88 3.18 mc_bilinear_w32_h_8bpc_neon: 18.80 14.97 mc_bilinear_w32_hv_8bpc_neon: 17.74 14.02 mc_bilinear_w32_v_8bpc_neon: 24.46 23.04 mc_bilinear_w64_0_8bpc_neon: 2.87 1.66 mc_bilinear_w64_h_8bpc_neon: 19.54 16.02 mc_bilinear_w64_hv_8bpc_neon: 17.80 14.32 mc_bilinear_w64_v_8bpc_neon: 24.79 23.63 mc_bilinear_w128_0_8bpc_neon: 2.13 1.23 mc_bilinear_w128_h_8bpc_neon: 19.89 16.24 mc_bilinear_w128_hv_8bpc_neon: 17.55 14.15 mc_bilinear_w128_v_8bpc_neon: 24.45 23.54 mct_8tap_regular_w4_0_8bpc_neon: 5.56 5.51 mct_8tap_regular_w4_h_8bpc_neon: 7.48 5.80 mct_8tap_regular_w4_hv_8bpc_neon: 7.27 7.09 mct_8tap_regular_w4_v_8bpc_neon: 7.80 6.84 mct_8tap_regular_w8_0_8bpc_neon: 9.54 9.25 mct_8tap_regular_w8_h_8bpc_neon: 9.08 6.55 mct_8tap_regular_w8_hv_8bpc_neon: 9.16 6.30 mct_8tap_regular_w8_v_8bpc_neon: 10.79 8.66 mct_8tap_regular_w16_0_8bpc_neon: 15.35 10.50 mct_8tap_regular_w16_h_8bpc_neon: 10.18 6.76 mct_8tap_regular_w16_hv_8bpc_neon: 9.17 6.11 mct_8tap_regular_w16_v_8bpc_neon: 11.52 8.72 mct_8tap_regular_w32_0_8bpc_neon: 15.82 10.09 mct_8tap_regular_w32_h_8bpc_neon: 10.75 6.85 mct_8tap_regular_w32_hv_8bpc_neon: 9.00 6.22 mct_8tap_regular_w32_v_8bpc_neon: 11.58 8.67 mct_8tap_regular_w64_0_8bpc_neon: 15.28 9.68 mct_8tap_regular_w64_h_8bpc_neon: 10.93 6.96 mct_8tap_regular_w64_hv_8bpc_neon: 8.81 6.53 mct_8tap_regular_w64_v_8bpc_neon: 11.42 8.73 mct_8tap_regular_w128_0_8bpc_neon: 14.41 7.67 mct_8tap_regular_w128_h_8bpc_neon: 10.92 6.96 mct_8tap_regular_w128_hv_8bpc_neon: 8.56 6.51 mct_8tap_regular_w128_v_8bpc_neon: 11.16 8.70 mct_bilinear_w4_0_8bpc_neon: 5.66 
5.77 mct_bilinear_w4_h_8bpc_neon: 5.16 6.40 mct_bilinear_w4_hv_8bpc_neon: 6.86 6.82 mct_bilinear_w4_v_8bpc_neon: 4.75 6.09 mct_bilinear_w8_0_8bpc_neon: 9.78 10.00 mct_bilinear_w8_h_8bpc_neon: 8.98 11.37 mct_bilinear_w8_hv_8bpc_neon: 14.42 10.83 mct_bilinear_w8_v_8bpc_neon: 9.12 11.62 mct_bilinear_w16_0_8bpc_neon: 15.59 10.76 mct_bilinear_w16_h_8bpc_neon: 11.98 8.77 mct_bilinear_w16_hv_8bpc_neon: 15.83 10.73 mct_bilinear_w16_v_8bpc_neon: 14.70 14.60 mct_bilinear_w32_0_8bpc_neon: 15.89 10.32 mct_bilinear_w32_h_8bpc_neon: 13.47 9.07 mct_bilinear_w32_hv_8bpc_neon: 16.01 10.95 mct_bilinear_w32_v_8bpc_neon: 14.85 14.16 mct_bilinear_w64_0_8bpc_neon: 15.36 10.51 mct_bilinear_w64_h_8bpc_neon: 14.00 9.61 mct_bilinear_w64_hv_8bpc_neon: 15.82 11.27 mct_bilinear_w64_v_8bpc_neon: 14.61 14.76 mct_bilinear_w128_0_8bpc_neon: 14.41 7.92 mct_bilinear_w128_h_8bpc_neon: 13.31 9.58 mct_bilinear_w128_hv_8bpc_neon: 14.07 11.18 mct_bilinear_w128_v_8bpc_neon: 11.57 14.42
--- a/src/arm/64/mc.S
+++ b/src/arm/64/mc.S
@@ -1,6 +1,7 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Janne Grunau
+ * Copyright © 2018, Martin Storsjo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -26,6 +27,7 @@
*/
#include "src/arm/asm.S"
+#include "src/arm/64/util.S"
.macro avg dst, t0, t1
ld1 {\t0\().8h}, [x2], 16
@@ -230,3 +232,2102 @@
bidir_fn avg
bidir_fn w_avg
bidir_fn mask
+
+
+// This has got the same signature as the put_8tap functions,
+// and assumes that x8 is set to (clz(w)-24).
+function put
+ adr x9, L(put_tbl) // x9 = address of the offset table below
+ ldrh w8, [x9, x8, lsl #1] // fetch the 16-bit entry selected by x8
+ sub x9, x9, w8, uxtw // entries are (table - label), so this yields the label
+ br x9 // jump to the copy loop for this width
+
+2: // 2 bytes per row, two rows per iteration
+ ld1 {v0.h}[0], [x2], x3 // read a row at x2, then advance x2 by x3
+ ld1 {v1.h}[0], [x2], x3
+ subs w5, w5, #2 // w5 = rows left; the flags feed the b.gt below
+ st1 {v0.h}[0], [x0], x1 // write a row at x0, then advance x0 by x1
+ st1 {v1.h}[0], [x0], x1
+ b.gt 2b
+ ret
+4: // 4 bytes per row, two rows per iteration
+ ld1 {v0.s}[0], [x2], x3
+ ld1 {v1.s}[0], [x2], x3
+ subs w5, w5, #2
+ st1 {v0.s}[0], [x0], x1
+ st1 {v1.s}[0], [x0], x1
+ b.gt 4b
+ ret
+8: // 8 bytes per row, two rows per iteration
+ ld1 {v0.8b}, [x2], x3
+ ld1 {v1.8b}, [x2], x3
+ subs w5, w5, #2
+ st1 {v0.8b}, [x0], x1
+ st1 {v1.8b}, [x0], x1
+ b.gt 8b
+ ret
+160: // one-time setup for the 16 byte wide loop
+ add x8, x0, x1 // x8 = second output row
+ lsl x1, x1, #1 // output pointers now step two rows at a time
+ add x9, x2, x3 // x9 = second input row
+ lsl x3, x3, #1 // input pointers now step two rows at a time
+16: // 16 bytes per row, two rows per iteration
+ ld1 {v0.16b}, [x2], x3 // even rows go through x2/x0
+ ld1 {v1.16b}, [x9], x3 // odd rows go through x9/x8
+ subs w5, w5, #2
+ st1 {v0.16b}, [x0], x1
+ st1 {v1.16b}, [x8], x1
+ b.gt 16b
+ ret
+32: // 32 bytes per row via x register pairs, one row per iteration
+ ldp x6, x7, [x2]
+ ldp x8, x9, [x2, #16]
+ stp x6, x7, [x0]
+ subs w5, w5, #1
+ stp x8, x9, [x0, #16]
+ add x2, x2, x3 // next input row
+ add x0, x0, x1 // next output row
+ b.gt 32b
+ ret
+64: // 64 bytes per row via x register pairs, one row per iteration
+ ldp x6, x7, [x2]
+ ldp x8, x9, [x2, #16]
+ stp x6, x7, [x0]
+ ldp x10, x11, [x2, #32]
+ stp x8, x9, [x0, #16]
+ subs w5, w5, #1
+ ldp x12, x13, [x2, #48]
+ stp x10, x11, [x0, #32]
+ stp x12, x13, [x0, #48]
+ add x2, x2, x3
+ add x0, x0, x1
+ b.gt 64b
+ ret
+128: // 128 bytes per row via q register pairs, one row per iteration
+ ldp q0, q1, [x2]
+ ldp q2, q3, [x2, #32]
+ stp q0, q1, [x0]
+ ldp q4, q5, [x2, #64]
+ stp q2, q3, [x0, #32]
+ ldp q6, q7, [x2, #96]
+ subs w5, w5, #1
+ stp q4, q5, [x0, #64]
+ stp q6, q7, [x0, #96]
+ add x2, x2, x3
+ add x0, x0, x1
+ b.gt 128b
+ ret
+
+L(put_tbl): // 16-bit backward offsets, indexed by x8
+ .hword L(put_tbl) - 128b // index 0
+ .hword L(put_tbl) - 64b // index 1
+ .hword L(put_tbl) - 32b // index 2
+ .hword L(put_tbl) - 160b // index 3, enters through the 16 wide setup
+ .hword L(put_tbl) - 8b // index 4
+ .hword L(put_tbl) - 4b // index 5
+ .hword L(put_tbl) - 2b // index 6
+endfunc
+
+
+// This has got the same signature as the prep_8tap functions,
+// and assumes that x8 is set to (clz(w)-24), and x7 to w*2.
+function prep
+ adr x9, L(prep_tbl) // x9 = address of the offset table below
+ ldrh w8, [x9, x8, lsl #1] // fetch the 16-bit entry selected by x8
+ sub x9, x9, w8, uxtw // entries are (table - label), so this yields the label
+ br x9 // jump to the loop for this width
+
+4: // 4 input bytes per row, two rows per iteration
+ ld1 {v0.s}[0], [x1], x2 // read a row at x1, then advance x1 by x2
+ ld1 {v1.s}[0], [x1], x2
+ subs w4, w4, #2 // w4 = rows left; the flags feed the b.gt below
+ ushll v0.8h, v0.8b, #4 // widen bytes to 16 bit and shift left by 4
+ ushll v1.8h, v1.8b, #4
+ st1 {v0.4h, v1.4h}, [x0], #16 // both rows are written back to back at x0
+ b.gt 4b
+ ret
+8: // 8 input bytes per row, two rows per iteration
+ ld1 {v0.8b}, [x1], x2
+ ld1 {v1.8b}, [x1], x2
+ subs w4, w4, #2
+ ushll v0.8h, v0.8b, #4
+ ushll v1.8h, v1.8b, #4
+ st1 {v0.8h, v1.8h}, [x0], #32
+ b.gt 8b
+ ret
+160: // one-time setup for the 16 wide loop
+ add x9, x1, x2 // x9 = second input row
+ lsl x2, x2, #1 // input pointers now step two rows at a time
+16: // 16 input bytes per row, two rows per iteration
+ ld1 {v0.16b}, [x1], x2
+ ld1 {v1.16b}, [x9], x2
+ subs w4, w4, #2
+ ushll v4.8h, v0.8b, #4 // low 8 bytes of the first row
+ ushll2 v5.8h, v0.16b, #4 // high 8 bytes of the first row
+ ushll v6.8h, v1.8b, #4
+ ushll2 v7.8h, v1.16b, #4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 // two widened rows, contiguous
+ b.gt 16b
+ ret
+320: // one-time setup for the 32 wide loop
+ add x8, x0, w3, uxtw // x8 = x0 + w3; NOTE(review): presumably w3 is w, so x8 is 32 bytes in - confirm
+32: // 32 input bytes per row, two rows per iteration
+ ld1 {v0.16b, v1.16b}, [x1], x2
+ subs w4, w4, #2
+ ushll v4.8h, v0.8b, #4
+ ushll2 v5.8h, v0.16b, #4
+ ld1 {v2.16b, v3.16b}, [x1], x2
+ ushll v6.8h, v1.8b, #4
+ ushll2 v7.8h, v1.16b, #4
+ ushll v16.8h, v2.8b, #4
+ st1 {v4.8h, v5.8h}, [x0], x7 // x0 and x8 both advance by x7 per store
+ ushll2 v17.8h, v2.16b, #4
+ st1 {v6.8h, v7.8h}, [x8], x7
+ ushll v18.8h, v3.8b, #4
+ st1 {v16.8h, v17.8h}, [x0], x7
+ ushll2 v19.8h, v3.16b, #4
+ st1 {v18.8h, v19.8h}, [x8], x7
+ b.gt 32b
+ ret
+640: // one-time setup for the 64 wide loop
+ add x8, x0, #32 // x8 trails x0 by one 32 byte store
+ mov x6, #64 // both output pointers step 64 bytes per store
+64: // 64 input bytes per row, one row per iteration
+ ldp q0, q1, [x1]
+ subs w4, w4, #1
+ ushll v4.8h, v0.8b, #4
+ ushll2 v5.8h, v0.16b, #4
+ ldp q2, q3, [x1, #32]
+ ushll v6.8h, v1.8b, #4
+ ushll2 v7.8h, v1.16b, #4
+ add x1, x1, x2 // next input row
+ ushll v16.8h, v2.8b, #4
+ st1 {v4.8h, v5.8h}, [x0], x6 // output bytes 0-31 of the row
+ ushll2 v17.8h, v2.16b, #4
+ ushll v18.8h, v3.8b, #4
+ st1 {v6.8h, v7.8h}, [x8], x6 // output bytes 32-63
+ ushll2 v19.8h, v3.16b, #4
+ st1 {v16.8h, v17.8h}, [x0], x6 // output bytes 64-95
+ st1 {v18.8h, v19.8h}, [x8], x6 // output bytes 96-127
+ b.gt 64b
+ ret
+1280: // one-time setup for the 128 wide loop
+ add x8, x0, #64 // x8 trails x0 by one 64 byte store
+ mov x6, #128 // both output pointers step 128 bytes per store
+128: // 128 input bytes per row, one row per iteration
+ ldp q0, q1, [x1]
+ ldp q2, q3, [x1, #32]
+ ushll v16.8h, v0.8b, #4
+ ushll2 v17.8h, v0.16b, #4
+ ushll v18.8h, v1.8b, #4
+ ushll2 v19.8h, v1.16b, #4
+ ushll v20.8h, v2.8b, #4
+ ushll2 v21.8h, v2.16b, #4
+ ldp q4, q5, [x1, #64]
+ st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x6 // output bytes 0-63 of the row
+ ushll v22.8h, v3.8b, #4
+ ushll2 v23.8h, v3.16b, #4
+ ushll v24.8h, v4.8b, #4
+ ushll2 v25.8h, v4.16b, #4
+ ushll v26.8h, v5.8b, #4
+ ushll2 v27.8h, v5.16b, #4
+ ldp q6, q7, [x1, #96]
+ st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x8], x6 // output bytes 64-127
+ ushll v28.8h, v6.8b, #4
+ ushll2 v29.8h, v6.16b, #4
+ ushll v30.8h, v7.8b, #4
+ ushll2 v31.8h, v7.16b, #4
+ subs w4, w4, #1
+ add x1, x1, x2 // next input row
+ st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], x6 // output bytes 128-191
+ st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x6 // output bytes 192-255
+ b.gt 128b
+ ret
+
+L(prep_tbl): // 16-bit backward offsets, indexed by x8; six entries only
+ .hword L(prep_tbl) - 1280b // index 0
+ .hword L(prep_tbl) - 640b // index 1
+ .hword L(prep_tbl) - 320b // index 2
+ .hword L(prep_tbl) - 160b // index 3
+ .hword L(prep_tbl) - 8b // index 4
+ .hword L(prep_tbl) - 4b // index 5
+endfunc
+
+
+.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 // load lane 0 (element size wd) of 2 to 7 registers
+ ld1 {\d0\wd}[0], [\s0], \strd // even numbered destinations read from s0
+ ld1 {\d1\wd}[0], [\s1], \strd // odd numbered destinations read from s1
+.ifnb \d2 // d2 and d3 are loaded as a pair when d2 is given
+ ld1 {\d2\wd}[0], [\s0], \strd
+ ld1 {\d3\wd}[0], [\s1], \strd
+.endif
+.ifnb \d4 // d4, d5 and d6 are each optional on their own
+ ld1 {\d4\wd}[0], [\s0], \strd
+.endif
+.ifnb \d5
+ ld1 {\d5\wd}[0], [\s1], \strd
+.endif
+.ifnb \d6
+ ld1 {\d6\wd}[0], [\s0], \strd
+.endif
+.endm
+.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 // load 2 to 7 whole registers of arrangement wd
+ ld1 {\d0\wd}, [\s0], \strd // even numbered destinations read from s0
+ ld1 {\d1\wd}, [\s1], \strd // odd numbered destinations read from s1
+.ifnb \d2 // d2 and d3 are loaded as a pair when d2 is given
+ ld1 {\d2\wd}, [\s0], \strd
+ ld1 {\d3\wd}, [\s1], \strd
+.endif
+.ifnb \d4 // d4, d5 and d6 are each optional on their own
+ ld1 {\d4\wd}, [\s0], \strd
+.endif
+.ifnb \d5
+ ld1 {\d5\wd}, [\s1], \strd
+.endif
+.ifnb \d6
+ ld1 {\d6\wd}, [\s0], \strd
+.endif
+.endm
+.macro load_h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 // load_slice with 16 bit elements
+ load_slice \s0, \s1, \strd, .h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 // load_slice with 32 bit elements
+ load_slice \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_8b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 // load_reg with 8 byte registers
+ load_reg \s0, \s1, \strd, .8b, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_16b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 // load_reg with 16 byte registers
+ load_reg \s0, \s1, \strd, .16b, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro interleave_1 wd, r0, r1, r2, r3, r4 // put element 0 of the following register into element 1 of each register
+ trn1 \r0\wd, \r0\wd, \r1\wd // r0 = { r0[0], r1[0], ... }
+ trn1 \r1\wd, \r1\wd, \r2\wd // r1 = { r1[0], r2[0], ... }; r2 is only read here
+.ifnb \r3 // when r3 is given, r4 must be given as well
+ trn1 \r2\wd, \r2\wd, \r3\wd
+ trn1 \r3\wd, \r3\wd, \r4\wd // r4 is only read, never written
+.endif
+.endm
+.macro interleave_1_h r0, r1, r2, r3, r4 // interleave_1 on 16 bit elements
+ interleave_1 .4h, \r0, \r1, \r2, \r3, \r4
+.endm
+.macro interleave_1_s r0, r1, r2, r3, r4 // interleave_1 on 32 bit elements
+ interleave_1 .2s, \r0, \r1, \r2, \r3, \r4
+.endm
+.macro interleave_2 wd, r0, r1, r2, r3, r4, r5 // put element 0 of the register two places ahead into element 1
+ trn1 \r0\wd, \r0\wd, \r2\wd // r0 = { r0[0], r2[0], ... }
+ trn1 \r1\wd, \r1\wd, \r3\wd
+ trn1 \r2\wd, \r2\wd, \r4\wd // r4 and r5 are only read, never written
+ trn1 \r3\wd, \r3\wd, \r5\wd
+.endm
+.macro interleave_2_s r0, r1, r2, r3, r4, r5 // interleave_2 on 32 bit elements
+ interleave_2 .2s, \r0, \r1, \r2, \r3, \r4, \r5
+.endm
+.macro uxtl_b r0, r1, r2, r3, r4, r5, r6 // zero-extend the low 8 bytes of 2 to 7 registers to 16 bit, in place
+ uxtl \r0\().8h, \r0\().8b
+ uxtl \r1\().8h, \r1\().8b
+.ifnb \r2 // r2 and r3 are handled as a pair when r2 is given
+ uxtl \r2\().8h, \r2\().8b
+ uxtl \r3\().8h, \r3\().8b
+.endif
+.ifnb \r4 // r4, r5 and r6 are each optional on their own
+ uxtl \r4\().8h, \r4\().8b
+.endif
+.ifnb \r5
+ uxtl \r5\().8h, \r5\().8b
+.endif
+.ifnb \r6
+ uxtl \r6\().8h, \r6\().8b
+.endif
+.endm
+.macro mul_mla_4 d, s0, s1, s2, s3, wd // 4 tap sum: d = s0*c0 + s1*c1 + s2*c2 + s3*c3, with c0-c3 = v0.h[0..3]
+ mul \d\wd, \s0\wd, v0.h[0] // the first tap initialises the accumulator
+ mla \d\wd, \s1\wd, v0.h[1]
+ mla \d\wd, \s2\wd, v0.h[2]
+ mla \d\wd, \s3\wd, v0.h[3]
+.endm
+.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8 // two 8 tap sums with coefficients v0.h[0..7]; d0 uses s0-s7, d1 uses s1-s8
+ mul \d0\().8h, \s0\().8h, v0.h[0] // the two accumulators are interleaved instruction by instruction
+ mul \d1\().8h, \s1\().8h, v0.h[0]
+ mla \d0\().8h, \s1\().8h, v0.h[1]
+ mla \d1\().8h, \s2\().8h, v0.h[1]
+ mla \d0\().8h, \s2\().8h, v0.h[2]
+ mla \d1\().8h, \s3\().8h, v0.h[2]
+ mla \d0\().8h, \s3\().8h, v0.h[3]
+ mla \d1\().8h, \s4\().8h, v0.h[3]
+ mla \d0\().8h, \s4\().8h, v0.h[4]
+ mla \d1\().8h, \s5\().8h, v0.h[4]
+ mla \d0\().8h, \s5\().8h, v0.h[5]
+ mla \d1\().8h, \s6\().8h, v0.h[5]
+ mla \d0\().8h, \s6\().8h, v0.h[6]
+ mla \d1\().8h, \s7\().8h, v0.h[6]
+ mla \d0\().8h, \s7\().8h, v0.h[7]
+ mla \d1\().8h, \s8\().8h, v0.h[7]
+.endm
+.macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9 // two 8 tap sums with coefficients v0.h[0..7]; d0 uses s0-s7, d1 uses s2-s9
+ mul \d0\().8h, \s0\().8h, v0.h[0] // the two accumulators are interleaved instruction by instruction
+ mul \d1\().8h, \s2\().8h, v0.h[0]
+ mla \d0\().8h, \s1\().8h, v0.h[1]
+ mla \d1\().8h, \s3\().8h, v0.h[1]
+ mla \d0\().8h, \s2\().8h, v0.h[2]
+ mla \d1\().8h, \s4\().8h, v0.h[2]
+ mla \d0\().8h, \s3\().8h, v0.h[3]
+ mla \d1\().8h, \s5\().8h, v0.h[3]
+ mla \d0\().8h, \s4\().8h, v0.h[4]
+ mla \d1\().8h, \s6\().8h, v0.h[4]
+ mla \d0\().8h, \s5\().8h, v0.h[5]
+ mla \d1\().8h, \s7\().8h, v0.h[5]
+ mla \d0\().8h, \s6\().8h, v0.h[6]
+ mla \d1\().8h, \s8\().8h, v0.h[6]
+ mla \d0\().8h, \s7\().8h, v0.h[7]
+ mla \d1\().8h, \s9\().8h, v0.h[7]
+.endm
+.macro mul_mla_8_4 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11 // two 8 tap sums with coefficients v0.h[0..7]; d0 uses s0-s7, d1 uses s4-s11
+ mul \d0\().8h, \s0\().8h, v0.h[0] // the two accumulators are interleaved instruction by instruction
+ mul \d1\().8h, \s4\().8h, v0.h[0]
+ mla \d0\().8h, \s1\().8h, v0.h[1]
+ mla \d1\().8h, \s5\().8h, v0.h[1]
+ mla \d0\().8h, \s2\().8h, v0.h[2]
+ mla \d1\().8h, \s6\().8h, v0.h[2]
+ mla \d0\().8h, \s3\().8h, v0.h[3]
+ mla \d1\().8h, \s7\().8h, v0.h[3]
+ mla \d0\().8h, \s4\().8h, v0.h[4]
+ mla \d1\().8h, \s8\().8h, v0.h[4]
+ mla \d0\().8h, \s5\().8h, v0.h[5]
+ mla \d1\().8h, \s9\().8h, v0.h[5]
+ mla \d0\().8h, \s6\().8h, v0.h[6]
+ mla \d1\().8h, \s10\().8h, v0.h[6]
+ mla \d0\().8h, \s7\().8h, v0.h[7]
+ mla \d1\().8h, \s11\().8h, v0.h[7]
+.endm
+.macro sqrshrun_b shift, r0, r1, r2, r3 // in place: rounding shift right of signed 16 bit lanes, saturated to unsigned bytes
+ sqrshrun \r0\().8b, \r0\().8h, #\shift
+.ifnb \r1 // r1 is optional on its own
+ sqrshrun \r1\().8b, \r1\().8h, #\shift
+.endif
+.ifnb \r2 // r2 and r3 are handled as a pair when r2 is given
+ sqrshrun \r2\().8b, \r2\().8h, #\shift
+ sqrshrun \r3\().8b, \r3\().8h, #\shift
+.endif
+.endm
+.macro srshr_h shift, r0, r1, r2, r3 // in place: signed rounding shift right of all eight 16 bit lanes
+ srshr \r0\().8h, \r0\().8h, #\shift
+.ifnb \r1 // r1 is optional on its own
+ srshr \r1\().8h, \r1\().8h, #\shift
+.endif
+.ifnb \r2 // r2 and r3 are handled as a pair when r2 is given
+ srshr \r2\().8h, \r2\().8h, #\shift
+ srshr \r3\().8h, \r3\().8h, #\shift
+.endif
+.endm
+.macro st_h strd, reg, lanes // store 16 bit lanes of reg alternately through x0 and x8, each advancing by strd
+ st1 {\reg\().h}[0], [x0], \strd // even lanes go to x0
+ st1 {\reg\().h}[1], [x8], \strd // odd lanes go to x8
+.if \lanes > 2 // lanes 2 and 3 are stored only when more than two lanes are requested
+ st1 {\reg\().h}[2], [x0], \strd
+ st1 {\reg\().h}[3], [x8], \strd
+.endif
+.endm
+.macro st_s strd, r0, r1, r2, r3 // store 32 bit lanes 0 and 1 through x0 and x8; r2 and r3 are accepted but unused
+ st1 {\r0\().s}[0], [x0], \strd // lane 0 goes to x0
+ st1 {\r0\().s}[1], [x8], \strd // lane 1 goes to x8
+.ifnb \r1 // r1 is optional
+ st1 {\r1\().s}[0], [x0], \strd
+ st1 {\r1\().s}[1], [x8], \strd
+.endif
+.endm
+.macro st_d strd, r0, r1, r2, r3 // store 64 bit lanes 0 and 1 through x0 and x8; r2 and r3 are accepted but unused
+ st1 {\r0\().d}[0], [x0], \strd // lane 0 goes to x0
+ st1 {\r0\().d}[1], [x8], \strd // lane 1 goes to x8
+.ifnb \r1 // r1 is optional
+ st1 {\r1\().d}[0], [x0], \strd
+ st1 {\r1\().d}[1], [x8], \strd
+.endif
+.endm
+.macro shift_store_4 type, strd, r0, r1, r2, r3 // final shift and store for the 4 wide paths
+.ifc \type, put // put: narrow to bytes with a rounding shift by 6, store 4 bytes per lane
+ sqrshrun_b 6, \r0, \r1, \r2, \r3
+ st_s \strd, \r0, \r1, \r2, \r3
+.else // otherwise: keep 16 bit values, rounding shift by 2, store 8 bytes per lane
+ srshr_h 2, \r0, \r1, \r2, \r3
+ st_d \strd, \r0, \r1, \r2, \r3
+.endif
+.endm
+.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7 // store 2, 4 or 8 whole registers alternately through x0 and x8
+ st1 {\r0\wd}, [x0], \strd // even numbered registers go to x0
+ st1 {\r1\wd}, [x8], \strd // odd numbered registers go to x8
+.ifnb \r2 // r2 and r3 are stored as a pair when r2 is given
+ st1 {\r2\wd}, [x0], \strd
+ st1 {\r3\wd}, [x8], \strd
+.endif
+.ifnb \r4 // r4 to r7 are stored as a group when r4 is given
+ st1 {\r4\wd}, [x0], \strd
+ st1 {\r5\wd}, [x8], \strd
+ st1 {\r6\wd}, [x0], \strd
+ st1 {\r7\wd}, [x8], \strd
+.endif
+.endm
+.macro st_8b strd, r0, r1, r2, r3, r4, r5, r6, r7 // st_reg with 8 byte registers
+ st_reg \strd, .8b, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
+.endm
+.macro st_16b strd, r0, r1, r2, r3, r4, r5, r6, r7 // st_reg with 16 byte registers
+ st_reg \strd, .16b, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
+.endm
+.macro shift_store_8 type, strd, r0, r1, r2, r3 // final shift and store for the 8 wide paths
+.ifc \type, put // put: narrow to bytes with a rounding shift by 6, store 8 bytes per register
+ sqrshrun_b 6, \r0, \r1, \r2, \r3
+ st_8b \strd, \r0, \r1, \r2, \r3
+.else // otherwise: keep 16 bit values, rounding shift by 2, store 16 bytes per register
+ srshr_h 2, \r0, \r1, \r2, \r3
+ st_16b \strd, \r0, \r1, \r2, \r3
+.endif
+.endm
+.macro shift_store_16 type, strd, r0, r1, r2, r3 // final shift and store for the 16 wide path; all four registers are required
+.ifc \type, put // put: narrow with a rounding shift by 6, packing two registers into one
+ sqrshrun \r0\().8b, \r0\().8h, #6 // low 8 bytes of r0 come from r0
+ sqrshrun2 \r0\().16b, \r1\().8h, #6 // high 8 bytes of r0 come from r1
+ sqrshrun \r2\().8b, \r2\().8h, #6
+ sqrshrun2 \r2\().16b, \r3\().8h, #6
+ st_16b \strd, \r0, \r2 // r0 goes through x0, r2 through x8
+.else // otherwise: keep 16 bit values, rounding shift by 2
+ srshr_h 2, \r0, \r1, \r2, \r3
+ st1 {\r0\().8h, \r1\().8h}, [x0], \strd // r0 and r1 are written as 32 bytes through x0
+ st1 {\r2\().8h, \r3\().8h}, [x8], \strd // r2 and r3 are written as 32 bytes through x8
+.endif
+.endm
+
+.macro make_8tap_fn op, type, type_h, type_v // emit one exported entry point that tail-branches into the shared 8tap body
+function \op\()_8tap_\type\()_8bpc_neon, export=1
+ mov x8, \type_h // x8 = filter type constant for the horizontal direction
+ mov x9, \type_v // x9 = filter type constant for the vertical direction
+ b \op\()_8tap // plain branch, so the shared body returns to the original caller
+endfunc
+.endm
+
+// Two 7-bit fields (bits 7-13 and bits 0-6) that get added to mx and my. No spaces in these expressions, due to gas-preprocessor.
+#define REGULAR ((0*15<<7)|3*15)
+#define SMOOTH ((1*15<<7)|4*15)
+#define SHARP ((2*15<<7)|3*15)
+
+.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv
+make_8tap_fn \type, regular, REGULAR, REGULAR
+make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
+make_8tap_fn \type, regular_sharp, REGULAR, SHARP
+make_8tap_fn \type, smooth, SMOOTH, SMOOTH
+make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR
+make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP
+make_8tap_fn \type, sharp, SHARP, SHARP
+make_8tap_fn \type, sharp_regular, SHARP, REGULAR
+make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH
+
+function \type\()_8tap
+ mov w10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0)
+ mul \mx, \mx, w10
+ mul \my, \my, w10
+ add \mx, \mx, w8 // mx, 8tap_h, 4tap_h
+ add \my, \my, w9 // my, 8tap_v, 4tap_v
+.ifc \type, prep
+ uxtw \d_strd, \w
+ lsl \d_strd, \d_strd, #1
+.endif
+
+ clz w8, \w
+ tst \mx, #(0x7f << 14)
+ sub w8, w8, #24
+ movrel x10, X(mc_subpel_filters), -8
+ b.ne L(\type\()_8tap_h)
+ tst \my, #(0x7f << 14)
+ b.ne L(\type\()_8tap_v)
+ b \type
+
+L(\type\()_8tap_h):
+ cmp \w, #4
+ ubfm w9, \mx, #7, #13
+ and \mx, \mx, #0x7f
+ b.le 4f
+ mov \mx, w9
+4:
+ tst \my, #(0x7f << 14)
+ add \xmx, x10, \mx, uxtw #3
+ b.ne L(\type\()_8tap_hv)
+
+ adr x9, L(\type\()_8tap_h_tbl)
+ ldrh w8, [x9, x8, lsl #1]
+ sub x9, x9, w8, uxtw
+ br x9
+
+20: // 2xN h
+.ifc \type, put
+ add \xmx, \xmx, #2
+ ld1 {v0.s}[0], [\xmx]
+ sub \src, \src, #1
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ sxtl v0.8h, v0.8b
+2:
+ ld1 {v4.8b}, [\src], \s_strd
+ ld1 {v6.8b}, [\sr2], \s_strd
+ uxtl v4.8h, v4.8b
+ uxtl v6.8h, v6.8b
+ ext v5.16b, v4.16b, v4.16b, #2
+ ext v7.16b, v6.16b, v6.16b, #2
+ subs \h, \h, #2
+ trn1 v3.2s, v4.2s, v6.2s
+ trn2 v6.2s, v4.2s, v6.2s
+ trn1 v4.2s, v5.2s, v7.2s
+ trn2 v7.2s, v5.2s, v7.2s
+ mul v3.4h, v3.4h, v0.h[0]
+ mla v3.4h, v4.4h, v0.h[1]
+ mla v3.4h, v6.4h, v0.h[2]
+ mla v3.4h, v7.4h, v0.h[3]
+ srshr v3.4h, v3.4h, #2
+ sqrshrun v3.8b, v3.8h, #4
+ st1 {v3.h}[0], [\dst], \d_strd
+ st1 {v3.h}[1], [\ds2], \d_strd
+ b.gt 2b
+ ret
+.endif
+
+40: // 4xN h
+ add \xmx, \xmx, #2
+ ld1 {v0.s}[0], [\xmx]
+ sub \src, \src, #1
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ sxtl v0.8h, v0.8b
+4:
+ ld1 {v16.8b}, [\src], \s_strd
+ ld1 {v20.8b}, [\sr2], \s_strd
+ uxtl v16.8h, v16.8b
+ uxtl v20.8h, v20.8b
+ ext v17.16b, v16.16b, v16.16b, #2
+ ext v18.16b, v16.16b, v16.16b, #4
+ ext v19.16b, v16.16b, v16.16b, #6
+ ext v21.16b, v20.16b, v20.16b, #2
+ ext v22.16b, v20.16b, v20.16b, #4
+ ext v23.16b, v20.16b, v20.16b, #6
+ subs \h, \h, #2
+ mul v16.4h, v16.4h, v0.h[0]
+ mla v16.4h, v17.4h, v0.h[1]
+ mla v16.4h, v18.4h, v0.h[2]
+ mla v16.4h, v19.4h, v0.h[3]
+ mul v20.4h, v20.4h, v0.h[0]
+ mla v20.4h, v21.4h, v0.h[1]
+ mla v20.4h, v22.4h, v0.h[2]
+ mla v20.4h, v23.4h, v0.h[3]
+ srshr v16.4h, v16.4h, #2
+ srshr v20.4h, v20.4h, #2
+.ifc \type, put
+ sqrshrun v16.8b, v16.8h, #4
+ sqrshrun v20.8b, v20.8h, #4
+ st1 {v16.s}[0], [\dst], \d_strd
+ st1 {v20.s}[0], [\ds2], \d_strd
+.else
+ st1 {v16.4h}, [\dst], \d_strd
+ st1 {v20.4h}, [\ds2], \d_strd
+.endif
+ b.gt 4b
+ ret
+
+80: // 8xN h
+ ld1 {v0.8b}, [\xmx]
+ sub \src, \src, #3
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ sxtl v0.8h, v0.8b
+8:
+ ld1 {v16.8b, v17.8b}, [\src], \s_strd
+ ld1 {v20.8b, v21.8b}, [\sr2], \s_strd
+ uxtl v16.8h, v16.8b
+ uxtl v17.8h, v17.8b
+ uxtl v20.8h, v20.8b
+ uxtl v21.8h, v21.8b
+
+ mul v18.8h, v16.8h, v0.h[0]
+ mul v22.8h, v20.8h, v0.h[0]
+.irpc i, 1234567
+ ext v19.16b, v16.16b, v17.16b, #(2*\i)
+ ext v23.16b, v20.16b, v21.16b, #(2*\i)
+ mla v18.8h, v19.8h, v0.h[\i]
+ mla v22.8h, v23.8h, v0.h[\i]
+.endr
+ subs \h, \h, #2
+ srshr v18.8h, v18.8h, #2
+ srshr v22.8h, v22.8h, #2
+.ifc \type, put
+ sqrshrun v18.8b, v18.8h, #4
+ sqrshrun v22.8b, v22.8h, #4
+ st1 {v18.8b}, [\dst], \d_strd
+ st1 {v22.8b}, [\ds2], \d_strd
+.else
+ st1 {v18.8h}, [\dst], \d_strd
+ st1 {v22.8h}, [\ds2], \d_strd
+.endif
+ b.gt 8b
+ ret
+160:
+320:
+640:
+1280: // 16xN, 32xN, ... h
+ ld1 {v0.8b}, [\xmx]
+ sub \src, \src, #3
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ sxtl v0.8h, v0.8b
+
+ sub \s_strd, \s_strd, \w, uxtw
+ sub \s_strd, \s_strd, #8
+.ifc \type, put
+ lsl \d_strd, \d_strd, #1
+ sub \d_strd, \d_strd, \w, uxtw
+.endif
+161:
+ ld1 {v16.8b, v17.8b, v18.8b}, [\src], #24
+ ld1 {v20.8b, v21.8b, v22.8b}, [\sr2], #24
+ mov \mx, \w
+ uxtl v16.8h, v16.8b
+ uxtl v17.8h, v17.8b
+ uxtl v18.8h, v18.8b
+ uxtl v20.8h, v20.8b
+ uxtl v21.8h, v21.8b
+ uxtl v22.8h, v22.8b
+
+16:
+ mul v24.8h, v16.8h, v0.h[0]
+ mul v25.8h, v17.8h, v0.h[0]
+ mul v26.8h, v20.8h, v0.h[0]
+ mul v27.8h, v21.8h, v0.h[0]
+.irpc i, 1234567
+ ext v28.16b, v16.16b, v17.16b, #(2*\i)
+ ext v29.16b, v17.16b, v18.16b, #(2*\i)
+ ext v30.16b, v20.16b, v21.16b, #(2*\i)
+ ext v31.16b, v21.16b, v22.16b, #(2*\i)
+ mla v24.8h, v28.8h, v0.h[\i]
+ mla v25.8h, v29.8h, v0.h[\i]
+ mla v26.8h, v30.8h, v0.h[\i]
+ mla v27.8h, v31.8h, v0.h[\i]
+.endr
+ srshr v24.8h, v24.8h, #2
+ srshr v25.8h, v25.8h, #2
+ srshr v26.8h, v26.8h, #2
+ srshr v27.8h, v27.8h, #2
+ subs \mx, \mx, #16
+.ifc \type, put
+ sqrshrun v24.8b, v24.8h, #4
+ sqrshrun2 v24.16b, v25.8h, #4
+ sqrshrun v26.8b, v26.8h, #4
+ sqrshrun2 v26.16b, v27.8h, #4
+ st1 {v24.16b}, [\dst], #16
+ st1 {v26.16b}, [\ds2], #16
+.else
+ st1 {v24.8h, v25.8h}, [\dst], #32
+ st1 {v26.8h, v27.8h}, [\ds2], #32
+.endif
+ b.le 9f
+
+ mov v16.16b, v18.16b
+ mov v20.16b, v22.16b
+ ld1 {v17.8b, v18.8b}, [\src], #16
+ ld1 {v21.8b, v22.8b}, [\sr2], #16
+ uxtl v17.8h, v17.8b
+ uxtl v18.8h, v18.8b
+ uxtl v21.8h, v21.8b
+ uxtl v22.8h, v22.8b
+ b 16b
+
+9:
+ add \dst, \dst, \d_strd
+ add \ds2, \ds2, \d_strd
+ add \src, \src, \s_strd
+ add \sr2, \sr2, \s_strd
+
+ subs \h, \h, #2
+ b.gt 161b
+ ret
+
+L(\type\()_8tap_h_tbl):
+ .hword L(\type\()_8tap_h_tbl) - 1280b
+ .hword L(\type\()_8tap_h_tbl) - 640b
+ .hword L(\type\()_8tap_h_tbl) - 320b
+ .hword L(\type\()_8tap_h_tbl) - 160b
+ .hword L(\type\()_8tap_h_tbl) - 80b
+ .hword L(\type\()_8tap_h_tbl) - 40b
+ .hword L(\type\()_8tap_h_tbl) - 20b
+ .hword 0
+
+
+L(\type\()_8tap_v):
+ cmp \h, #4
+ ubfm w9, \my, #7, #13
+ and \my, \my, #0x7f
+ b.le 4f
+ mov \my, w9
+4:
+ add \xmy, x10, \my, uxtw #3
+
+ adr x9, L(\type\()_8tap_v_tbl)
+ ldrh w8, [x9, x8, lsl #1]
+ sub x9, x9, w8, uxtw
+ br x9
+
+20: // 2xN v
+.ifc \type, put
+ b.gt 28f
+
+ cmp \h, #2
+ add \xmy, \xmy, #2
+ ld1 {v0.s}[0], [\xmy]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+
+ // 2x2 v
+ load_h \src, \sr2, \s_strd, v1, v2, v3, v4, v5
+ interleave_1_h v1, v2, v3, v4, v5
+ b.gt 24f
+ uxtl_b v1, v2, v3, v4
+ mul_mla_4 v6, v1, v2, v3, v4, .4h
+ sqrshrun_b 6, v6
+ st_h \d_strd, v6, 2
+ ret
+
+24: // 2x4 v
+ load_h \sr2, \src, \s_strd, v6, v7
+ interleave_1_h v5, v6, v7
+ interleave_2_s v1, v2, v3, v4, v5, v6
+ uxtl_b v1, v2, v3, v4
+ mul_mla_4 v6, v1, v2, v3, v4, .8h
+ sqrshrun_b 6, v6
+ st_h \d_strd, v6, 4
+ ret
+
+28: // 2x8, 2x16 v
+ ld1 {v0.8b}, [\xmy]
+ sub \sr2, \src, \s_strd, lsl #1
+ add \ds2, \dst, \d_strd
+ sub \src, \sr2, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ sxtl v0.8h, v0.8b
+
+ load_h \src, \sr2, \s_strd, v1, v2, v3, v4, v5, v6, v7
+ interleave_1_h v1, v2, v3, v4, v5
+ interleave_1_h v5, v6, v7
+ interleave_2_s v1, v2, v3, v4, v5, v6
+ uxtl_b v1, v2, v3, v4
+216:
+ subs \h, \h, #8
+ load_h \sr2, \src, \s_strd, v16, v17, v18, v19
+ load_h \sr2, \src, \s_strd, v20, v21, v22, v23
+ interleave_1_h v7, v16, v17, v18, v19
+ interleave_1_h v19, v20, v21, v22, v23
+ interleave_2_s v5, v6, v7, v16, v17, v18
+ interleave_2_s v17, v18, v19, v20, v21, v22
+ uxtl_b v5, v6, v7, v16
+ uxtl_b v17, v18, v19, v20
+ mul_mla_8_4 v30, v31, v1, v2, v3, v4, v5, v6, v7, v16, v17, v18, v19, v20
+ sqrshrun_b 6, v30, v31
+ st_h \d_strd, v30, 4
+ st_h \d_strd, v31, 4
+ b.le 0f
+ mov v1.16b, v17.16b
+ mov v2.16b, v18.16b
+ mov v3.16b, v19.16b
+ mov v4.16b, v20.16b
+ mov v5.16b, v21.16b
+ mov v6.16b, v22.16b
+ mov v7.16b, v23.16b
+ b 216b
+0:
+ ret
+.endif
+
+40:
+ b.gt 480f
+
+ // 4x2, 4x4 v
+ cmp \h, #2
+ add \xmy, \xmy, #2
+ ld1 {v0.s}[0], [\xmy]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+
+ load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5
+ interleave_1_s v1, v2, v3, v4, v5
+ uxtl_b v1, v2, v3, v4
+ mul_mla_4 v6, v1, v2, v3, v4, .8h
+ shift_store_4 \type, \d_strd, v6
+ b.le 0f
+ load_s \sr2, \src, \s_strd, v6, v7
+ interleave_1_s v5, v6, v7
+ uxtl_b v5, v6
+ mul_mla_4 v7, v3, v4, v5, v6, .8h
+ shift_store_4 \type, \d_strd, v7
+0:
+ ret
+
+480: // 4x8, 4x16 v
+ ld1 {v0.8b}, [\xmy]
+ sub \sr2, \src, \s_strd, lsl #1
+ add \ds2, \dst, \d_strd
+ sub \src, \sr2, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+
+ load_s \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
+ interleave_1_s v16, v17, v18
+ interleave_1_s v18, v19, v20, v21, v22
+ uxtl_b v16, v17
+ uxtl_b v18, v19, v20, v21
+
+48:
+ subs \h, \h, #4
+ load_s \sr2, \src, \s_strd, v23, v24, v25, v26
+ interleave_1_s v22, v23, v24, v25, v26
+ uxtl_b v22, v23, v24, v25
+ mul_mla_8_2 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
+ shift_store_4 \type, \d_strd, v1, v2
+ b.le 0f
+ subs \h, \h, #4
+ load_s \sr2, \src, \s_strd, v27, v16, v17, v18
+ interleave_1_s v26, v27, v16, v17, v18
+ uxtl_b v26, v27, v16, v17
+ mul_mla_8_2 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16, v17
+ shift_store_4 \type, \d_strd, v1, v2
+ b.le 0f
+ subs \h, \h, #4
+ load_s \sr2, \src, \s_strd, v19, v20, v21, v22
+ interleave_1_s v18, v19, v20, v21, v22
+ uxtl_b v18, v19, v20, v21
+ mul_mla_8_2 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21
+ shift_store_4 \type, \d_strd, v1, v2
+ b 48b
+0:
+ ret
+
+80:
+ b.gt 880f
+
+ // 8x2, 8x4 v
+ cmp \h, #2
+ add \xmy, \xmy, #2
+ ld1 {v0.s}[0], [\xmy]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+
+ load_8b \src, \sr2, \s_strd, v1, v2, v3, v4, v5
+ uxtl_b v1, v2, v3, v4, v5
+ mul_mla_4 v6, v1, v2, v3, v4, .8h
+ mul_mla_4 v7, v2, v3, v4, v5, .8h
+ shift_store_8 \type, \d_strd, v6, v7
+ b.le 0f
+ load_8b \sr2, \src, \s_strd, v6, v7
+ uxtl_b v6, v7
+ mul_mla_4 v1, v3, v4, v5, v6, .8h
+ mul_mla_4 v2, v4, v5, v6, v7, .8h
+ shift_store_8 \type, \d_strd, v1, v2
+0:
+ ret
+
+880: // 8x8, 8x16, 8x32 v
+1680: // 16x8, 16x16, ...
+320: // 32x8, 32x16, ...
+640:
+1280:
+ ld1 {v0.8b}, [\xmy]
+ sub \src, \src, \s_strd
+ sub \src, \src, \s_strd, lsl #1
+ sxtl v0.8h, v0.8b
+ mov \my, \h
+168:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ load_8b \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
+ uxtl_b v16, v17, v18, v19, v20, v21, v22
+
+88:
+ subs \h, \h, #2
+ load_8b \sr2, \src, \s_strd, v23, v24
+ uxtl_b v23, v24
+ mul_mla_8_1 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24
+ shift_store_8 \type, \d_strd, v1, v2
+ b.le 9f
+ subs \h, \h, #2
+ load_8b \sr2, \src, \s_strd, v25, v26
+ uxtl_b v25, v26
+ mul_mla_8_1 v3, v4, v18, v19, v20, v21, v22, v23, v24, v25, v26
+ shift_store_8 \type, \d_strd, v3, v4
+ b.le 9f
+ subs \h, \h, #4
+ load_8b \sr2, \src, \s_strd, v27, v16, v17, v18
+ uxtl_b v27, v16, v17, v18
+ mul_mla_8_1 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16
+ mul_mla_8_1 v3, v4, v22, v23, v24, v25, v26, v27, v16, v17, v18
+ shift_store_8 \type, \d_strd, v1, v2, v3, v4
+ b.le 9f
+ subs \h, \h, #4
+ load_8b \sr2, \src, \s_strd, v19, v20, v21, v22
+ uxtl_b v19, v20, v21, v22
+ mul_mla_8_1 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20
+ mul_mla_8_1 v3, v4, v26, v27, v16, v17, v18, v19, v20, v21, v22
+ shift_store_8 \type, \d_strd, v1, v2, v3, v4
+ b.gt 88b
+9:
+ subs \w, \w, #8
+ b.le 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ msub \src, \s_strd, \xmy, \src
+ msub \dst, \d_strd, \xmy, \dst
+ sub \src, \src, \s_strd, lsl #3
+ mov \h, \my
+ add \src, \src, #8
+.ifc \type, put
+ add \dst, \dst, #8
+.else
+ add \dst, \dst, #16
+.endif
+ b 168b
+0:
+ ret
+
+160:
+ b.gt 1680b
+
+ // 16x4 v
+ add \xmy, \xmy, #2
+ ld1 {v0.s}[0], [\xmy]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+
+ cmp \h, #2
+ load_16b \src, \sr2, \s_strd, v1, v2, v3, v4, v5
+ uxtl v16.8h, v1.8b
+ uxtl v17.8h, v2.8b
+ uxtl v18.8h, v3.8b
+ uxtl v19.8h, v4.8b
+ uxtl v20.8h, v5.8b
+ uxtl2 v23.8h, v1.16b
+ uxtl2 v24.8h, v2.16b
+ uxtl2 v25.8h, v3.16b
+ uxtl2 v26.8h, v4.16b
+ uxtl2 v27.8h, v5.16b
+ mul_mla_4 v1, v16, v17, v18, v19, .8h
+ mul_mla_4 v16, v17, v18, v19, v20, .8h
+ mul_mla_4 v2, v23, v24, v25, v26, .8h
+ mul_mla_4 v17, v24, v25, v26, v27, .8h
+ shift_store_16 \type, \d_strd, v1, v2, v16, v17
+ b.le 0f
+ load_16b \sr2, \src, \s_strd, v6, v7
+ uxtl v21.8h, v6.8b
+ uxtl v22.8h, v7.8b
+ uxtl2 v28.8h, v6.16b
+ uxtl2 v29.8h, v7.16b
+ mul_mla_4 v1, v18, v19, v20, v21, .8h
+ mul_mla_4 v3, v19, v20, v21, v22, .8h
+ mul_mla_4 v2, v25, v26, v27, v28, .8h
+ mul_mla_4 v4, v26, v27, v28, v29, .8h
+ shift_store_16 \type, \d_strd, v1, v2, v3, v4
+0:
+ ret
+
+L(\type\()_8tap_v_tbl):
+ .hword L(\type\()_8tap_v_tbl) - 1280b
+ .hword L(\type\()_8tap_v_tbl) - 640b
+ .hword L(\type\()_8tap_v_tbl) - 320b
+ .hword L(\type\()_8tap_v_tbl) - 160b
+ .hword L(\type\()_8tap_v_tbl) - 80b
+ .hword L(\type\()_8tap_v_tbl) - 40b
+ .hword L(\type\()_8tap_v_tbl) - 20b
+ .hword 0
+
+L(\type\()_8tap_hv):
+ cmp \h, #4
+ ubfm w9, \my, #7, #13
+ and \my, \my, #0x7f
+ b.le 4f
+ mov \my, w9
+4:
+ add \xmy, x10, \my, uxtw #3
+
+ adr x9, L(\type\()_8tap_hv_tbl)
+ ldrh w8, [x9, x8, lsl #1]
+ sub x9, x9, w8, uxtw
+ br x9
+
+20:
+.ifc \type, put
+ add \xmx, \xmx, #2
+ ld1 {v0.s}[0], [\xmx]
+ b.gt 280f
+ add \xmy, \xmy, #2
+ ld1 {v1.s}[0], [\xmy]
+
+ // 2x2, 2x4 hv
+ sub \sr2, \src, #1
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30
+
+ ld1 {v28.8b}, [\src], \s_strd
+ uxtl v28.8h, v28.8b
+ ext v29.16b, v28.16b, v28.16b, #2
+ mul v28.4h, v28.4h, v0.4h
+ mul v29.4h, v29.4h, v0.4h
+ addv h28, v28.4h
+ addv h29, v29.4h
+ trn1 v16.4h, v28.4h, v29.4h
+ srshr v16.4h, v16.4h, #2
+ bl L(\type\()_8tap_filter_2)
+
+ trn1 v16.2s, v16.2s, v28.2s
+ trn1 v17.2s, v28.2s, v30.2s
+ mov v18.8b, v30.8b
+
+2:
+ bl L(\type\()_8tap_filter_2)
+
+ trn1 v18.2s, v18.2s, v28.2s
+ trn1 v19.2s, v28.2s, v30.2s
+ smull v2.4s, v16.4h, v1.h[0]
+ smlal v2.4s, v17.4h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal v2.4s, v19.4h, v1.h[3]
+
+ sqrshrn v2.4h, v2.4s, #\shift_hv
+ sqxtun v2.8b, v2.8h
+ subs \h, \h, #2
+ st1 {v2.h}[0], [\dst], \d_strd
+ st1 {v2.h}[1], [\ds2], \d_strd
+ b.le 0f
+ mov v16.8b, v18.8b
+ mov v17.8b, v19.8b
+ mov v18.8b, v30.8b
+ b 2b
+
+280: // 2x8, 2x16, 2x32 hv
+ ld1 {v1.8b}, [\xmy]
+ sub \src, \src, #1
+ sub \sr2, \src, \s_strd, lsl #1
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30
+
+ ld1 {v28.8b}, [\src], \s_strd
+ uxtl v28.8h, v28.8b
+ ext v29.16b, v28.16b, v28.16b, #2
+ mul v28.4h, v28.4h, v0.4h
+ mul v29.4h, v29.4h, v0.4h
+ addv h28, v28.4h
+ addv h29, v29.4h
+ trn1 v16.4h, v28.4h, v29.4h
+ srshr v16.4h, v16.4h, #2
+
+ bl L(\type\()_8tap_filter_2)
+ trn1 v16.2s, v16.2s, v28.2s
+ trn1 v17.2s, v28.2s, v30.2s
+ mov v18.8b, v30.8b
+ bl L(\type\()_8tap_filter_2)
+ trn1 v18.2s, v18.2s, v28.2s
+ trn1 v19.2s, v28.2s, v30.2s
+ mov v20.8b, v30.8b
+ bl L(\type\()_8tap_filter_2)
+ trn1 v20.2s, v20.2s, v28.2s
+ trn1 v21.2s, v28.2s, v30.2s
+ mov v22.8b, v30.8b
+
+28:
+ bl L(\type\()_8tap_filter_2)
+ trn1 v22.2s, v22.2s, v28.2s
+ trn1 v23.2s, v28.2s, v30.2s
+ smull v2.4s, v16.4h, v1.h[0]
+ smlal v2.4s, v17.4h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal v2.4s, v19.4h, v1.h[3]
+ smlal v2.4s, v20.4h, v1.h[4]
+ smlal v2.4s, v21.4h, v1.h[5]
+ smlal v2.4s, v22.4h, v1.h[6]
+ smlal v2.4s, v23.4h, v1.h[7]
+
+ sqrshrn v2.4h, v2.4s, #\shift_hv
+ sqxtun v2.8b, v2.8h
+ subs \h, \h, #2
+ st1 {v2.h}[0], [\dst], \d_strd
+ st1 {v2.h}[1], [\ds2], \d_strd
+ b.le 0f
+ mov v16.8b, v18.8b
+ mov v17.8b, v19.8b
+ mov v18.8b, v20.8b
+ mov v19.8b, v21.8b
+ mov v20.8b, v22.8b
+ mov v21.8b, v23.8b
+ mov v22.8b, v30.8b
+ b 28b
+
+0:
+ br x15
+
+L(\type\()_8tap_filter_2):
+ ld1 {v28.8b}, [\sr2], \s_strd
+ ld1 {v30.8b}, [\src], \s_strd
+ uxtl v28.8h, v28.8b
+ uxtl v30.8h, v30.8b
+ ext v29.16b, v28.16b, v28.16b, #2
+ ext v31.16b, v30.16b, v30.16b, #2
+ trn1 v27.2s, v28.2s, v30.2s
+ trn2 v30.2s, v28.2s, v30.2s
+ trn1 v28.2s, v29.2s, v31.2s
+ trn2 v31.2s, v29.2s, v31.2s
+ mul v27.4h, v27.4h, v0.h[0]
+ mla v27.4h, v28.4h, v0.h[1]
+ mla v27.4h, v30.4h, v0.h[2]
+ mla v27.4h, v31.4h, v0.h[3]
+ srshr v28.4h, v27.4h, #2
+ trn2 v30.2s, v28.2s, v28.2s
+ ret
+.endif
+
+40:
+ add \xmx, \xmx, #2
+ ld1 {v0.s}[0], [\xmx]
+ b.gt 480f
+ add \xmy, \xmy, #2
+ ld1 {v1.s}[0], [\xmy]
+ sub \sr2, \src, #1
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30
+
+ // 4x2, 4x4 hv
+ ld1 {v26.8b}, [\src], \s_strd
+ uxtl v26.8h, v26.8b
+ ext v28.16b, v26.16b, v26.16b, #2
+ ext v29.16b, v26.16b, v26.16b, #4
+ ext v30.16b, v26.16b, v26.16b, #6
+ mul v31.4h, v26.4h, v0.h[0]
+ mla v31.4h, v28.4h, v0.h[1]
+ mla v31.4h, v29.4h, v0.h[2]
+ mla v31.4h, v30.4h, v0.h[3]
+ srshr v16.4h, v31.4h, #2
+
+ bl L(\type\()_8tap_filter_4)
+ mov v17.8b, v28.8b
+ mov v18.8b, v29.8b
+
+4:
+ smull v2.4s, v16.4h, v1.h[0]
+ bl L(\type\()_8tap_filter_4)
+ smull v3.4s, v17.4h, v1.h[0]
+ smlal v2.4s, v17.4h, v1.h[1]
+ smlal v3.4s, v18.4h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal v3.4s, v28.4h, v1.h[2]
+ smlal v2.4s, v28.4h, v1.h[3]
+ smlal v3.4s, v29.4h, v1.h[3]
+ sqrshrn v2.4h, v2.4s, #\shift_hv
+ sqrshrn v3.4h, v3.4s, #\shift_hv
+ subs \h, \h, #2
+.ifc \type, put
+ sqxtun v2.8b, v2.8h
+ sqxtun v3.8b, v3.8h
+ st1 {v2.s}[0], [\dst], \d_strd
+ st1 {v3.s}[0], [\ds2], \d_strd
+.else
+ st1 {v2.4h}, [\dst], \d_strd
+ st1 {v3.4h}, [\ds2], \d_strd
+.endif
+ b.le 0f
+ mov v16.16b, v18.16b
+ mov v17.16b, v28.16b
+ mov v18.16b, v29.16b
+ b 4b
+
+480: // 4x8, 4x16, 4x32 hv
+ ld1 {v1.8b}, [\xmy]
+ sub \src, \src, #1
+ sub \sr2, \src, \s_strd, lsl #1
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30
+
+ ld1 {v26.8b}, [\src], \s_strd
+ uxtl v26.8h, v26.8b
+ ext v28.16b, v26.16b, v26.16b, #2
+ ext v29.16b, v26.16b, v26.16b, #4
+ ext v30.16b, v26.16b, v26.16b, #6
+ mul v31.4h, v26.4h, v0.h[0]
+ mla v31.4h, v28.4h, v0.h[1]
+ mla v31.4h, v29.4h, v0.h[2]
+ mla v31.4h, v30.4h, v0.h[3]
+ srshr v16.4h, v31.4h, #2
+
+ bl L(\type\()_8tap_filter_4)
+ mov v17.8b, v28.8b
+ mov v18.8b, v29.8b
+ bl L(\type\()_8tap_filter_4)
+ mov v19.8b, v28.8b
+ mov v20.8b, v29.8b
+ bl L(\type\()_8tap_filter_4)
+ mov v21.8b, v28.8b
+ mov v22.8b, v29.8b
+
+48:
+ smull v2.4s, v16.4h, v1.h[0]
+ bl L(\type\()_8tap_filter_4)
+ smull v3.4s, v17.4h, v1.h[0]
+ smlal v2.4s, v17.4h, v1.h[1]
+ smlal v3.4s, v18.4h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal v3.4s, v19.4h, v1.h[2]
+ smlal v2.4s, v19.4h, v1.h[3]
+ smlal v3.4s, v20.4h, v1.h[3]
+ smlal v2.4s, v20.4h, v1.h[4]
+ smlal v3.4s, v21.4h, v1.h[4]
+ smlal v2.4s, v21.4h, v1.h[5]
+ smlal v3.4s, v22.4h, v1.h[5]
+ smlal v2.4s, v22.4h, v1.h[6]
+ smlal v3.4s, v28.4h, v1.h[6]
+ smlal v2.4s, v28.4h, v1.h[7]
+ smlal v3.4s, v29.4h, v1.h[7]
+ sqrshrn v2.4h, v2.4s, #\shift_hv
+ sqrshrn v3.4h, v3.4s, #\shift_hv
+ subs \h, \h, #2
+.ifc \type, put
+ sqxtun v2.8b, v2.8h
+ sqxtun v3.8b, v3.8h
+ st1 {v2.s}[0], [\dst], \d_strd
+ st1 {v3.s}[0], [\ds2], \d_strd
+.else
+ st1 {v2.4h}, [\dst], \d_strd
+ st1 {v3.4h}, [\ds2], \d_strd
+.endif
+ b.le 0f
+ mov v16.8b, v18.8b
+ mov v17.8b, v19.8b
+ mov v18.8b, v20.8b
+ mov v19.8b, v21.8b
+ mov v20.8b, v22.8b
+ mov v21.8b, v28.8b
+ mov v22.8b, v29.8b
+ b 48b
+0:
+ br x15
+
+L(\type\()_8tap_filter_4):
+ ld1 {v26.8b}, [\sr2], \s_strd
+ ld1 {v27.8b}, [\src], \s_strd
+ uxtl v26.8h, v26.8b
+ uxtl v27.8h, v27.8b
+ ext v28.16b, v26.16b, v26.16b, #2
+ ext v29.16b, v26.16b, v26.16b, #4
+ ext v30.16b, v26.16b, v26.16b, #6
+ mul v31.4h, v26.4h, v0.h[0]
+ mla v31.4h, v28.4h, v0.h[1]
+ mla v31.4h, v29.4h, v0.h[2]
+ mla v31.4h, v30.4h, v0.h[3]
+ ext v28.16b, v27.16b, v27.16b, #2
+ ext v29.16b, v27.16b, v27.16b, #4
+ ext v30.16b, v27.16b, v27.16b, #6
+ mul v27.4h, v27.4h, v0.h[0]
+ mla v27.4h, v28.4h, v0.h[1]
+ mla v27.4h, v29.4h, v0.h[2]
+ mla v27.4h, v30.4h, v0.h[3]
+ srshr v28.4h, v31.4h, #2
+ srshr v29.4h, v27.4h, #2
+ ret
+
+80:
+160:
+320:
+ b.gt 880f
+ add \xmy, \xmy, #2
+ ld1 {v0.8b}, [\xmx]
+ ld1 {v1.s}[0], [\xmy]
+ sub \src, \src, #3
+ sub \src, \src, \s_strd
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30
+ mov \my, \h
+
+164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+
+ ld1 {v28.8b, v29.8b}, [\src], \s_strd
+ uxtl v28.8h, v28.8b
+ uxtl v29.8h, v29.8b
+ mul v24.8h, v28.8h, v0.h[0]
+.irpc i, 1234567
+ ext v26.16b, v28.16b, v29.16b, #(2*\i)
+ mla v24.8h, v26.8h, v0.h[\i]
+.endr
+ srshr v16.8h, v24.8h, #2
+
+ bl L(\type\()_8tap_filter_8)
+ mov v17.16b, v24.16b
+ mov v18.16b, v25.16b
+
+8:
+ smull v2.4s, v16.4h, v1.h[0]
+ smull2 v3.4s, v16.8h, v1.h[0]
+ bl L(\type\()_8tap_filter_8)
+ smull v4.4s, v17.4h, v1.h[0]
+ smull2 v5.4s, v17.8h, v1.h[0]
+ smlal v2.4s, v17.4h, v1.h[1]
+ smlal2 v3.4s, v17.8h, v1.h[1]
+ smlal v4.4s, v18.4h, v1.h[1]
+ smlal2 v5.4s, v18.8h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal2 v3.4s, v18.8h, v1.h[2]
+ smlal v4.4s, v24.4h, v1.h[2]
+ smlal2 v5.4s, v24.8h, v1.h[2]
+ smlal v2.4s, v24.4h, v1.h[3]
+ smlal2 v3.4s, v24.8h, v1.h[3]
+ smlal v4.4s, v25.4h, v1.h[3]
+ smlal2 v5.4s, v25.8h, v1.h[3]
+ sqrshrn v2.4h, v2.4s, #\shift_hv
+ sqrshrn2 v2.8h, v3.4s, #\shift_hv
+ sqrshrn v4.4h, v4.4s, #\shift_hv
+ sqrshrn2 v4.8h, v5.4s, #\shift_hv
+ subs \h, \h, #2
+.ifc \type, put
+ sqxtun v2.8b, v2.8h
+ sqxtun v4.8b, v4.8h
+ st1 {v2.8b}, [\dst], \d_strd
+ st1 {v4.8b}, [\ds2], \d_strd
+.else
+ st1 {v2.8h}, [\dst], \d_strd
+ st1 {v4.8h}, [\ds2], \d_strd
+.endif
+ b.le 9f
+ mov v16.16b, v18.16b
+ mov v17.16b, v24.16b
+ mov v18.16b, v25.16b
+ b 8b
+9:
+ subs \w, \w, #8
+ b.le 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ msub \src, \s_strd, \xmy, \src
+ msub \dst, \d_strd, \xmy, \dst
+ sub \src, \src, \s_strd, lsl #2
+ mov \h, \my
+ add \src, \src, #8
+.ifc \type, put
+ add \dst, \dst, #8
+.else
+ add \dst, \dst, #16
+.endif
+ b 164b
+
+880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
+640:
+1280:
+ ld1 {v0.8b}, [\xmx]
+ ld1 {v1.8b}, [\xmy]
+ sub \src, \src, #3
+ sub \src, \src, \s_strd
+ sub \src, \src, \s_strd, lsl #1
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30
+ mov \my, \h
+
+168:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+
+ ld1 {v28.8b, v29.8b}, [\src], \s_strd
+ uxtl v28.8h, v28.8b
+ uxtl v29.8h, v29.8b
+ mul v24.8h, v28.8h, v0.h[0]
+.irpc i, 1234567
+ ext v26.16b, v28.16b, v29.16b, #(2*\i)
+ mla v24.8h, v26.8h, v0.h[\i]
+.endr
+ srshr v16.8h, v24.8h, #2
+
+ bl L(\type\()_8tap_filter_8)
+ mov v17.16b, v24.16b
+ mov v18.16b, v25.16b
+ bl L(\type\()_8tap_filter_8)
+ mov v19.16b, v24.16b
+ mov v20.16b, v25.16b
+ bl L(\type\()_8tap_filter_8)
+ mov v21.16b, v24.16b
+ mov v22.16b, v25.16b
+
+88:
+ smull v2.4s, v16.4h, v1.h[0]
+ smull2 v3.4s, v16.8h, v1.h[0]
+ bl L(\type\()_8tap_filter_8)
+ smull v4.4s, v17.4h, v1.h[0]
+ smull2 v5.4s, v17.8h, v1.h[0]
+ smlal v2.4s, v17.4h, v1.h[1]
+ smlal2 v3.4s, v17.8h, v1.h[1]
+ smlal v4.4s, v18.4h, v1.h[1]
+ smlal2 v5.4s, v18.8h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal2 v3.4s, v18.8h, v1.h[2]
+ smlal v4.4s, v19.4h, v1.h[2]
+ smlal2 v5.4s, v19.8h, v1.h[2]
+ smlal v2.4s, v19.4h, v1.h[3]
+ smlal2 v3.4s, v19.8h, v1.h[3]
+ smlal v4.4s, v20.4h, v1.h[3]
+ smlal2 v5.4s, v20.8h, v1.h[3]
+ smlal v2.4s, v20.4h, v1.h[4]
+ smlal2 v3.4s, v20.8h, v1.h[4]
+ smlal v4.4s, v21.4h, v1.h[4]
+ smlal2 v5.4s, v21.8h, v1.h[4]
+ smlal v2.4s, v21.4h, v1.h[5]
+ smlal2 v3.4s, v21.8h, v1.h[5]
+ smlal v4.4s, v22.4h, v1.h[5]
+ smlal2 v5.4s, v22.8h, v1.h[5]
+ smlal v2.4s, v22.4h, v1.h[6]
+ smlal2 v3.4s, v22.8h, v1.h[6]
+ smlal v4.4s, v24.4h, v1.h[6]
+ smlal2 v5.4s, v24.8h, v1.h[6]
+ smlal v2.4s, v24.4h, v1.h[7]
+ smlal2 v3.4s, v24.8h, v1.h[7]
+ smlal v4.4s, v25.4h, v1.h[7]
+ smlal2 v5.4s, v25.8h, v1.h[7]
+ sqrshrn v2.4h, v2.4s, #\shift_hv
+ sqrshrn2 v2.8h, v3.4s, #\shift_hv
+ sqrshrn v4.4h, v4.4s, #\shift_hv
+ sqrshrn2 v4.8h, v5.4s, #\shift_hv
+ subs \h, \h, #2
+.ifc \type, put
+ sqxtun v2.8b, v2.8h
+ sqxtun v4.8b, v4.8h
+ st1 {v2.8b}, [\dst], \d_strd
+ st1 {v4.8b}, [\ds2], \d_strd
+.else
+ st1 {v2.8h}, [\dst], \d_strd
+ st1 {v4.8h}, [\ds2], \d_strd
+.endif
+ b.le 9f
+ mov v16.16b, v18.16b
+ mov v17.16b, v19.16b
+ mov v18.16b, v20.16b
+ mov v19.16b, v21.16b
+ mov v20.16b, v22.16b
+ mov v21.16b, v24.16b
+ mov v22.16b, v25.16b
+ b 88b
+9:
+ subs \w, \w, #8
+ b.le 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ msub \src, \s_strd, \xmy, \src
+ msub \dst, \d_strd, \xmy, \dst
+ sub \src, \src, \s_strd, lsl #3
+ mov \h, \my
+ add \src, \src, #8
+.ifc \type, put
+ add \dst, \dst, #8
+.else
+ add \dst, \dst, #16
+.endif
+ b 168b
+0:
+ br x15
+
+L(\type\()_8tap_filter_8):
+ ld1 {v28.8b, v29.8b}, [\sr2], \s_strd
+ ld1 {v30.8b, v31.8b}, [\src], \s_strd
+ uxtl v28.8h, v28.8b
+ uxtl v29.8h, v29.8b
+ uxtl v30.8h, v30.8b
+ uxtl v31.8h, v31.8b
+ mul v24.8h, v28.8h, v0.h[0]
+ mul v25.8h, v30.8h, v0.h[0]
+.irpc i, 1234567
+ ext v26.16b, v28.16b, v29.16b, #(2*\i)
+ ext v27.16b, v30.16b, v31.16b, #(2*\i)
+ mla v24.8h, v26.8h, v0.h[\i]
+ mla v25.8h, v27.8h, v0.h[\i]
+.endr
+ srshr v24.8h, v24.8h, #2
+ srshr v25.8h, v25.8h, #2
+ ret
+
+L(\type\()_8tap_hv_tbl):
+ .hword L(\type\()_8tap_hv_tbl) - 1280b
+ .hword L(\type\()_8tap_hv_tbl) - 640b
+ .hword L(\type\()_8tap_hv_tbl) - 320b
+ .hword L(\type\()_8tap_hv_tbl) - 160b
+ .hword L(\type\()_8tap_hv_tbl) - 80b
+ .hword L(\type\()_8tap_hv_tbl) - 40b
+ .hword L(\type\()_8tap_hv_tbl) - 20b
+ .hword 0
+endfunc
+
+
+function \type\()_bilin_8bpc_neon, export=1 // bilinear filter entry: picks copy / h / v / hv from mx and my, then dispatches on width
+ dup v1.16b, \mx // v1 = mx in every byte: weight of the pixel to the right
+ dup v3.16b, \my // v3 = my in every byte: weight of the row below
+ mov w9, #16
+ sub w8, w9, \mx // w8 = 16 - mx
+ sub w9, w9, \my // w9 = 16 - my
+ dup v0.16b, w8 // v0 = 16 - mx: weight of the current pixel
+ dup v2.16b, w9 // v2 = 16 - my: weight of the current row
+.ifc \type, prep
+ uxtw \d_strd, \w // prep: d_strd is overwritten here with ...
+ lsl \d_strd, \d_strd, #1 // ... w * 2 bytes (the prep paths below store 16-bit elements)
+.endif
+
+ clz w8, \w
+ sub w8, w8, #24 // jump table index = clz(w) - 24: 0 for w == 128 up to 6 for w == 2 (assumes w is a power of two in that range - TODO confirm with callers)
+ cbnz \mx, L(\type\()_bilin_h) // mx != 0: horizontal filtering is needed
+ cbnz \my, L(\type\()_bilin_v) // mx == 0 and my != 0: vertical only
+ b \type // mx == my == 0: tail-branch to the symbol named by type (defined outside this chunk, presumably the unfiltered path)
+
+L(\type\()_bilin_h):
+ cbnz \my, L(\type\()_bilin_hv) // both fractions non-zero: 2D path
+
+ adr x9, L(\type\()_bilin_h_tbl) // the table holds 16-bit distances from the table back to each width label
+ ldrh w8, [x9, x8, lsl #1]
+ sub x9, x9, w8, uxtw // target = table address - stored distance
+ br x9
+
+20: // 2xN h
+.ifc \type, put
+ add \ds2, \dst, \d_strd // ds2 / sr2 point at the second row of each row pair
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1 // two rows are handled per iteration, so step by two strides
+ lsl \s_strd, \s_strd, #1
+2:
+ ld1 {v4.s}[0], [\src], \s_strd
+ ld1 {v6.s}[0], [\sr2], \s_strd
+ ext v5.8b, v4.8b, v4.8b, #1 // same row shifted by one byte: src[x+1]
+ ext v7.8b, v6.8b, v6.8b, #1
+ trn1 v4.4h, v4.4h, v6.4h // pack the 2 pixels of both rows next to each other
+ trn1 v5.4h, v5.4h, v7.4h
+ subs \h, \h, #2
+ umull v4.8h, v4.8b, v0.8b // (16 - mx) * src[x]
+ umlal v4.8h, v5.8b, v1.8b // + mx * src[x+1]
+ uqrshrn v4.8b, v4.8h, #4 // rounding shift by 4 and narrow back to 8 bit
+ st1 {v4.h}[0], [\dst], \d_strd // halfword 0 = first row, halfword 1 = second row
+ st1 {v4.h}[1], [\ds2], \d_strd
+ b.gt 2b
+ ret
+.endif
+
+40: // 4xN h
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+4:
+ ld1 {v4.8b}, [\src], \s_strd
+ ld1 {v6.8b}, [\sr2], \s_strd
+ ext v5.8b, v4.8b, v4.8b, #1 // src[x+1]
+ ext v7.8b, v6.8b, v6.8b, #1
+ trn1 v4.2s, v4.2s, v6.2s // low 4 bytes = first row, high 4 bytes = second row
+ trn1 v5.2s, v5.2s, v7.2s
+ subs \h, \h, #2
+ umull v4.8h, v4.8b, v0.8b
+ umlal v4.8h, v5.8b, v1.8b
+.ifc \type, put
+ uqrshrn v4.8b, v4.8h, #4
+ st1 {v4.s}[0], [\dst], \d_strd
+ st1 {v4.s}[1], [\ds2], \d_strd
+.else
+ st1 {v4.d}[0], [\dst], \d_strd // prep: the unshifted 16-bit sums are stored as they are
+ st1 {v4.d}[1], [\ds2], \d_strd
+.endif
+ b.gt 4b
+ ret
+
+80: // 8xN h
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+8:
+ ld1 {v4.16b}, [\src], \s_strd // 16 bytes are loaded; the low 8 lanes of v4 and v5 feed the multiplies
+ ld1 {v6.16b}, [\sr2], \s_strd
+ ext v5.16b, v4.16b, v4.16b, #1 // src[x+1]
+ ext v7.16b, v6.16b, v6.16b, #1
+ subs \h, \h, #2
+ umull v4.8h, v4.8b, v0.8b
+ umull v6.8h, v6.8b, v0.8b
+ umlal v4.8h, v5.8b, v1.8b
+ umlal v6.8h, v7.8b, v1.8b
+.ifc \type, put
+ uqrshrn v4.8b, v4.8h, #4
+ uqrshrn v6.8b, v6.8h, #4
+ st1 {v4.8b}, [\dst], \d_strd
+ st1 {v6.8b}, [\ds2], \d_strd
+.else
+ st1 {v4.8h}, [\dst], \d_strd
+ st1 {v6.8h}, [\ds2], \d_strd
+.endif
+ b.gt 8b
+ ret
+160:
+320:
+640:
+1280: // 16xN, 32xN, ... h
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+
+ sub \s_strd, \s_strd, \w, uxtw // the column loop post-increments src by w + 8 bytes per row ...
+ sub \s_strd, \s_strd, #8 // ... so s_strd becomes the remaining step down to the row two lines below
+.ifc \type, put // prep keeps d_strd = w * 2 from the entry code: its column loop already moves dst by w * 2 bytes
+ lsl \d_strd, \d_strd, #1
+ sub \d_strd, \d_strd, \w, uxtw // put: the column loop moves dst by w bytes, keep only the remainder of two strides
+.endif
+161:
+ ld1 {v16.d}[1], [\src], #8 // first 8 bytes of the row go into the upper half so that ext #8 below starts at them
+ ld1 {v20.d}[1], [\sr2], #8
+ mov \mx, \w // mx is free (weights live in v0/v1): reuse it as the column counter
+
+16:
+ ld1 {v18.16b}, [\src], #16
+ ld1 {v22.16b}, [\sr2], #16
+ ext v17.16b, v16.16b, v18.16b, #8 // 16 pixels src[x .. x+15]
+ ext v19.16b, v16.16b, v18.16b, #9 // 16 pixels src[x+1 .. x+16]
+ ext v21.16b, v20.16b, v22.16b, #8
+ ext v23.16b, v20.16b, v22.16b, #9
+ umull v16.8h, v17.8b, v0.8b
+ umull2 v17.8h, v17.16b, v0.16b
+ umull v20.8h, v21.8b, v0.8b
+ umull2 v21.8h, v21.16b, v0.16b
+ umlal v16.8h, v19.8b, v1.8b
+ umlal2 v17.8h, v19.16b, v1.16b
+ umlal v20.8h, v23.8b, v1.8b
+ umlal2 v21.8h, v23.16b, v1.16b
+ subs \mx, \mx, #16
+.ifc \type, put
+ uqrshrn v16.8b, v16.8h, #4
+ uqrshrn2 v16.16b, v17.8h, #4
+ uqrshrn v20.8b, v20.8h, #4
+ uqrshrn2 v20.16b, v21.8h, #4
+ st1 {v16.16b}, [\dst], #16
+ st1 {v20.16b}, [\ds2], #16
+.else
+ st1 {v16.8h, v17.8h}, [\dst], #32
+ st1 {v20.8h, v21.8h}, [\ds2], #32
+.endif
+ b.le 9f
+
+ mov v16.16b, v18.16b // carry the most recently loaded bytes over to the next 16 columns
+ mov v20.16b, v22.16b
+ b 16b
+
+9:
+ add \dst, \dst, \d_strd // move all four pointers on to the next pair of rows
+ add \ds2, \ds2, \d_strd
+ add \src, \src, \s_strd
+ add \sr2, \sr2, \s_strd
+
+ subs \h, \h, #2
+ b.gt 161b
+ ret
+
+L(\type\()_bilin_h_tbl):
+ .hword L(\type\()_bilin_h_tbl) - 1280b
+ .hword L(\type\()_bilin_h_tbl) - 640b
+ .hword L(\type\()_bilin_h_tbl) - 320b
+ .hword L(\type\()_bilin_h_tbl) - 160b
+ .hword L(\type\()_bilin_h_tbl) - 80b
+ .hword L(\type\()_bilin_h_tbl) - 40b
+ .hword L(\type\()_bilin_h_tbl) - 20b
+ .hword 0
+
+
+L(\type\()_bilin_v):
+ cmp \h, #4 // NOTE(review): no v path visible below consumes these flags (2xN compares against 2 again) - confirm whether this is needed
+ adr x9, L(\type\()_bilin_v_tbl)
+ ldrh w8, [x9, x8, lsl #1]
+ sub x9, x9, w8, uxtw
+ br x9
+
+20: // 2xN v
+.ifc \type, put
+ cmp \h, #2 // consumed by the b.gt below; the add / lsl / ld1 in between leave the flags alone
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ // 2x2 v
+ ld1 {v16.h}[0], [\src], \s_strd // row 0
+ b.gt 24f // h > 2
+ ld1 {v17.h}[0], [\sr2], \s_strd // row 1
+ ld1 {v18.h}[0], [\src], \s_strd // row 2
+ trn1 v16.4h, v16.4h, v17.4h // v16 = {row 0, row 1}
+ trn1 v17.4h, v17.4h, v18.4h // v17 = {row 1, row 2}
+ umull v4.8h, v16.8b, v2.8b // (16 - my) * current row
+ umlal v4.8h, v17.8b, v3.8b // + my * row below
+ uqrshrn v4.8b, v4.8h, #4
+ st1 {v4.h}[0], [\dst]
+ st1 {v4.h}[1], [\ds2]
+ ret
+24: // 2x4, 2x8, ... v
+ ld1 {v17.h}[0], [\sr2], \s_strd
+ ld1 {v18.h}[0], [\src], \s_strd
+ ld1 {v19.h}[0], [\sr2], \s_strd
+ ld1 {v20.h}[0], [\src], \s_strd
+ trn1 v16.4h, v16.4h, v17.4h // build adjacent row pairs {n, n+1} ...
+ trn1 v17.4h, v17.4h, v18.4h
+ trn1 v18.4h, v18.4h, v19.4h
+ trn1 v19.4h, v19.4h, v20.4h
+ trn1 v16.2s, v16.2s, v18.2s // ... v16 = rows 0..3 (current rows)
+ trn1 v17.2s, v17.2s, v19.2s // ... v17 = rows 1..4 (rows below)
+ umull v4.8h, v16.8b, v2.8b
+ umlal v4.8h, v17.8b, v3.8b
+ subs \h, \h, #4
+ uqrshrn v4.8b, v4.8h, #4
+ st1 {v4.h}[0], [\dst], \d_strd // output rows alternate between dst and ds2
+ st1 {v4.h}[1], [\ds2], \d_strd
+ st1 {v4.h}[2], [\dst], \d_strd
+ st1 {v4.h}[3], [\ds2], \d_strd
+ b.le 0f
+ mov v16.8b, v20.8b // last loaded row becomes row 0 of the next group of four
+ b 24b
+0:
+ ret
+.endif
+
+40: // 4xN v
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ ld1 {v16.s}[0], [\src], \s_strd // row 0
+4:
+ ld1 {v17.s}[0], [\sr2], \s_strd
+ ld1 {v18.s}[0], [\src], \s_strd
+ trn1 v16.2s, v16.2s, v17.2s // v16 = {row n, row n+1}
+ trn1 v17.2s, v17.2s, v18.2s // v17 = {row n+1, row n+2}
+ umull v4.8h, v16.8b, v2.8b
+ umlal v4.8h, v17.8b, v3.8b
+ subs \h, \h, #2
+.ifc \type, put
+ uqrshrn v4.8b, v4.8h, #4
+ st1 {v4.s}[0], [\dst], \d_strd
+ st1 {v4.s}[1], [\ds2], \d_strd
+.else
+ st1 {v4.d}[0], [\dst], \d_strd
+ st1 {v4.d}[1], [\ds2], \d_strd
+.endif
+ b.le 0f
+ mov v16.8b, v18.8b // reuse the last loaded row as the top row of the next pair
+ b 4b
+0:
+ ret
+
+80: // 8xN v
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ ld1 {v16.8b}, [\src], \s_strd // row 0
+8:
+ ld1 {v17.8b}, [\sr2], \s_strd
+ ld1 {v18.8b}, [\src], \s_strd
+ umull v4.8h, v16.8b, v2.8b // first output row: rows n and n+1
+ umull v5.8h, v17.8b, v2.8b // second output row: rows n+1 and n+2
+ umlal v4.8h, v17.8b, v3.8b
+ umlal v5.8h, v18.8b, v3.8b
+ subs \h, \h, #2
+.ifc \type, put
+ uqrshrn v4.8b, v4.8h, #4
+ uqrshrn v5.8b, v5.8h, #4
+ st1 {v4.8b}, [\dst], \d_strd
+ st1 {v5.8b}, [\ds2], \d_strd
+.else
+ st1 {v4.8h}, [\dst], \d_strd
+ st1 {v5.8h}, [\ds2], \d_strd
+.endif
+ b.le 0f
+ mov v16.8b, v18.8b
+ b 8b
+0:
+ ret
+
+160: // 16xN, 32xN, ...
+320:
+640:
+1280:
+ mov \my, \h // my is free (weights live in v2/v3): keep the full height for the per-column restart
+1:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ ld1 {v16.16b}, [\src], \s_strd // row 0 of this 16 pixel wide column
+2:
+ ld1 {v17.16b}, [\sr2], \s_strd
+ ld1 {v18.16b}, [\src], \s_strd
+ umull v4.8h, v16.8b, v2.8b
+ umull2 v5.8h, v16.16b, v2.16b
+ umull v6.8h, v17.8b, v2.8b
+ umull2 v7.8h, v17.16b, v2.16b
+ umlal v4.8h, v17.8b, v3.8b
+ umlal2 v5.8h, v17.16b, v3.16b
+ umlal v6.8h, v18.8b, v3.8b
+ umlal2 v7.8h, v18.16b, v3.16b
+ subs \h, \h, #2
+.ifc \type, put
+ uqrshrn v4.8b, v4.8h, #4
+ uqrshrn2 v4.16b, v5.8h, #4
+ uqrshrn v6.8b, v6.8h, #4
+ uqrshrn2 v6.16b, v7.8h, #4
+ st1 {v4.16b}, [\dst], \d_strd
+ st1 {v6.16b}, [\ds2], \d_strd
+.else
+ st1 {v4.8h, v5.8h}, [\dst], \d_strd
+ st1 {v6.8h, v7.8h}, [\ds2], \d_strd
+.endif
+ b.le 9f
+ mov v16.16b, v18.16b
+ b 2b
+9:
+ subs \w, \w, #16 // one 16 pixel column finished
+ b.le 0f
+ asr \s_strd, \s_strd, #1 // undo the stride doubling done at 1: (it is redone after the branch back)
+ asr \d_strd, \d_strd, #1
+ msub \src, \s_strd, \xmy, \src // rewind src by the saved height ...
+ msub \dst, \d_strd, \xmy, \dst // dst advanced exactly h rows
+ sub \src, \src, \s_strd, lsl #1 // ... plus 2 more rows: src advanced h + 2 rows in total
+ mov \h, \my
+ add \src, \src, #16 // step to the next 16 columns
+.ifc \type, put
+ add \dst, \dst, #16
+.else
+ add \dst, \dst, #32 // prep: 16 elements of 2 bytes each
+.endif
+ b 1b
+0:
+ ret
+
+L(\type\()_bilin_v_tbl):
+ .hword L(\type\()_bilin_v_tbl) - 1280b
+ .hword L(\type\()_bilin_v_tbl) - 640b
+ .hword L(\type\()_bilin_v_tbl) - 320b
+ .hword L(\type\()_bilin_v_tbl) - 160b
+ .hword L(\type\()_bilin_v_tbl) - 80b
+ .hword L(\type\()_bilin_v_tbl) - 40b
+ .hword L(\type\()_bilin_v_tbl) - 20b
+ .hword 0
+
+L(\type\()_bilin_hv):
+ uxtl v2.8h, v2.8b // widen the vertical weights: the vertical pass multiplies 16-bit horizontal sums
+ uxtl v3.8h, v3.8b
+ adr x9, L(\type\()_bilin_hv_tbl)
+ ldrh w8, [x9, x8, lsl #1]
+ sub x9, x9, w8, uxtw
+ br x9
+
+20: // 2xN hv
+.ifc \type, put
+ add \sr2, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ ld1 {v28.s}[0], [\src], \s_strd // row 0: horizontal pass only, kept unshifted in v16
+ ext v29.8b, v28.8b, v28.8b, #1
+ umull v16.8h, v28.8b, v0.8b
+ umlal v16.8h, v29.8b, v1.8b
+
+2:
+ ld1 {v28.s}[0], [\sr2], \s_strd
+ ld1 {v30.s}[0], [\src], \s_strd
+ ext v29.8b, v28.8b, v28.8b, #1
+ ext v31.8b, v30.8b, v30.8b, #1
+ trn1 v28.4h, v28.4h, v30.4h // both new rows side by side
+ trn1 v29.4h, v29.4h, v31.4h
+ umull v17.8h, v28.8b, v0.8b // v17 = horizontal sums of {row n+1, row n+2}
+ umlal v17.8h, v29.8b, v1.8b
+
+ trn1 v16.2s, v16.2s, v17.2s // v16 = {row n, row n+1}
+
+ mul v4.4h, v16.4h, v2.4h // vertical pass on the 16-bit horizontal sums
+ mla v4.4h, v17.4h, v3.4h
+ uqrshrn v4.8b, v4.8h, #8 // both passes scale by 16, so round and shift by 8 in total
+ subs \h, \h, #2
+ st1 {v4.h}[0], [\dst], \d_strd
+ st1 {v4.h}[1], [\ds2], \d_strd
+ b.le 0f
+ trn2 v16.2s, v17.2s, v17.2s // second new row becomes the top row of the next pair
+ b 2b
+0:
+ ret
+.endif
+
+40: // 4xN hv
+ add \sr2, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ ld1 {v28.8b}, [\src], \s_strd // row 0: horizontal pass only
+ ext v29.8b, v28.8b, v28.8b, #1
+ umull v16.8h, v28.8b, v0.8b
+ umlal v16.8h, v29.8b, v1.8b
+
+4:
+ ld1 {v28.8b}, [\sr2], \s_strd
+ ld1 {v30.8b}, [\src], \s_strd
+ ext v29.8b, v28.8b, v28.8b, #1
+ ext v31.8b, v30.8b, v30.8b, #1
+ trn1 v28.2s, v28.2s, v30.2s
+ trn1 v29.2s, v29.2s, v31.2s
+ umull v17.8h, v28.8b, v0.8b // v17 = horizontal sums of {row n+1, row n+2}
+ umlal v17.8h, v29.8b, v1.8b
+
+ trn1 v16.2d, v16.2d, v17.2d // v16 = {row n, row n+1}
+
+ mul v4.8h, v16.8h, v2.8h
+ mla v4.8h, v17.8h, v3.8h
+ subs \h, \h, #2
+.ifc \type, put
+ uqrshrn v4.8b, v4.8h, #8
+ st1 {v4.s}[0], [\dst], \d_strd
+ st1 {v4.s}[1], [\ds2], \d_strd
+.else
+ urshr v4.8h, v4.8h, #4 // prep: rounding shift by 4 only, the result stays 16 bit
+ st1 {v4.d}[0], [\dst], \d_strd
+ st1 {v4.d}[1], [\ds2], \d_strd
+.endif
+ b.le 0f
+ trn2 v16.2d, v17.2d, v17.2d // second new row becomes the top row of the next pair
+ b 4b
+0:
+ ret
+
+80: // 8xN, 16xN, ... hv
+160:
+320:
+640:
+1280:
+ mov \my, \h // keep the full height for the per-column restart (weights already live in v2/v3)
+
+1:
+ add \sr2, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ ld1 {v28.16b}, [\src], \s_strd // row 0 of this 8 pixel wide column: horizontal pass only
+ ext v29.16b, v28.16b, v28.16b, #1
+ umull v16.8h, v28.8b, v0.8b
+ umlal v16.8h, v29.8b, v1.8b
+
+2:
+ ld1 {v28.16b}, [\sr2], \s_strd
+ ld1 {v30.16b}, [\src], \s_strd
+ ext v29.16b, v28.16b, v28.16b, #1
+ ext v31.16b, v30.16b, v30.16b, #1
+ umull v17.8h, v28.8b, v0.8b // horizontal sums of row n+1
+ umlal v17.8h, v29.8b, v1.8b
+ umull v18.8h, v30.8b, v0.8b // horizontal sums of row n+2
+ umlal v18.8h, v31.8b, v1.8b
+
+ mul v4.8h, v16.8h, v2.8h // first output row: rows n and n+1
+ mla v4.8h, v17.8h, v3.8h
+ mul v5.8h, v17.8h, v2.8h // second output row: rows n+1 and n+2
+ mla v5.8h, v18.8h, v3.8h
+ subs \h, \h, #2
+.ifc \type, put
+ uqrshrn v4.8b, v4.8h, #8
+ uqrshrn v5.8b, v5.8h, #8
+ st1 {v4.8b}, [\dst], \d_strd
+ st1 {v5.8b}, [\ds2], \d_strd
+.else
+ urshr v4.8h, v4.8h, #4
+ urshr v5.8h, v5.8h, #4
+ st1 {v4.8h}, [\dst], \d_strd
+ st1 {v5.8h}, [\ds2], \d_strd
+.endif
+ b.le 9f
+ mov v16.16b, v18.16b
+ b 2b
+9:
+ subs \w, \w, #8 // one 8 pixel column finished
+ b.le 0f
+ asr \s_strd, \s_strd, #1 // undo the stride doubling done at 1:
+ asr \d_strd, \d_strd, #1
+ msub \src, \s_strd, \xmy, \src // rewind src by the saved height ...
+ msub \dst, \d_strd, \xmy, \dst
+ sub \src, \src, \s_strd, lsl #1 // ... plus 2 more rows: src advanced h + 2 rows in total
+ mov \h, \my
+ add \src, \src, #8 // step to the next 8 columns
+.ifc \type, put
+ add \dst, \dst, #8
+.else
+ add \dst, \dst, #16 // prep: 8 elements of 2 bytes each
+.endif
+ b 1b
+0:
+ ret
+
+L(\type\()_bilin_hv_tbl):
+ .hword L(\type\()_bilin_hv_tbl) - 1280b
+ .hword L(\type\()_bilin_hv_tbl) - 640b
+ .hword L(\type\()_bilin_hv_tbl) - 320b
+ .hword L(\type\()_bilin_hv_tbl) - 160b
+ .hword L(\type\()_bilin_hv_tbl) - 80b
+ .hword L(\type\()_bilin_hv_tbl) - 40b
+ .hword L(\type\()_bilin_hv_tbl) - 20b
+ .hword 0
+endfunc
+.endm
+
+filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10 // put instance: operands are the registers bound to the macro parameters (macro header is outside this chunk); trailing 10 is presumably shift_hv - TODO confirm
+filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6 // prep instance: same macro with a different register binding; trailing 6 is presumably shift_hv - TODO confirm
--- a/src/arm/asm.S
+++ b/src/arm/asm.S
@@ -129,4 +129,6 @@
#define L(x) .L ## x
#endif
+#define X(x) CONCAT(EXTERN, x) /* x combined with EXTERN through CONCAT; both are defined outside this hunk, presumably EXTERN is the exported symbol prefix - TODO confirm */
+
#endif /* __DAV1D_SRC_ARM_ASM_S__ */
--- a/src/arm/mc_init_tmpl.c
+++ b/src/arm/mc_init_tmpl.c
@@ -30,16 +30,66 @@
#include "src/mc.h"
#include "src/cpu.h"
+decl_mc_fn(dav1d_put_8tap_regular_8bpc_neon); // NEON put functions, one per 2D filter combination; the init function below stores them in c->mc[] under ARCH_AARCH64 only
+decl_mc_fn(dav1d_put_8tap_regular_smooth_8bpc_neon);
+decl_mc_fn(dav1d_put_8tap_regular_sharp_8bpc_neon);
+decl_mc_fn(dav1d_put_8tap_smooth_8bpc_neon);
+decl_mc_fn(dav1d_put_8tap_smooth_regular_8bpc_neon);
+decl_mc_fn(dav1d_put_8tap_smooth_sharp_8bpc_neon);
+decl_mc_fn(dav1d_put_8tap_sharp_8bpc_neon);
+decl_mc_fn(dav1d_put_8tap_sharp_regular_8bpc_neon);
+decl_mc_fn(dav1d_put_8tap_sharp_smooth_8bpc_neon);
+decl_mc_fn(dav1d_put_bilin_8bpc_neon);
+
+decl_mct_fn(dav1d_prep_8tap_regular_8bpc_neon); // NEON prep functions, one per 2D filter combination; the init function below stores them in c->mct[] under ARCH_AARCH64 only
+decl_mct_fn(dav1d_prep_8tap_regular_smooth_8bpc_neon);
+decl_mct_fn(dav1d_prep_8tap_regular_sharp_8bpc_neon);
+decl_mct_fn(dav1d_prep_8tap_smooth_8bpc_neon);
+decl_mct_fn(dav1d_prep_8tap_smooth_regular_8bpc_neon);
+decl_mct_fn(dav1d_prep_8tap_smooth_sharp_8bpc_neon);
+decl_mct_fn(dav1d_prep_8tap_sharp_8bpc_neon);
+decl_mct_fn(dav1d_prep_8tap_sharp_regular_8bpc_neon);
+decl_mct_fn(dav1d_prep_8tap_sharp_smooth_8bpc_neon);
+decl_mct_fn(dav1d_prep_bilin_8bpc_neon);
+
decl_avg_fn(dav1d_avg_8bpc_neon);
decl_w_avg_fn(dav1d_w_avg_8bpc_neon);
decl_mask_fn(dav1d_mask_8bpc_neon);
void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {
+#define init_mc_fn(type, name, suffix) \
+ c->mc[type] = dav1d_put_##name##_8bpc_##suffix
+#define init_mct_fn(type, name, suffix) \
+ c->mct[type] = dav1d_prep_##name##_8bpc_##suffix
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
#if BITDEPTH == 8
+#if ARCH_AARCH64
+ init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, neon);
+ init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon);
+ init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon);
+ init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon);
+ init_mc_fn (FILTER_2D_8TAP_SMOOTH, 8tap_smooth, neon);
+ init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, neon);
+ init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, neon);
+ init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, neon);
+ init_mc_fn (FILTER_2D_8TAP_SHARP, 8tap_sharp, neon);
+ init_mc_fn (FILTER_2D_BILINEAR, bilin, neon);
+
+ init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, neon);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, neon);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, neon);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, neon);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, neon);
+ init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, neon);
+ init_mct_fn(FILTER_2D_BILINEAR, bilin, neon);
+#endif
+
c->avg = dav1d_avg_8bpc_neon;
c->w_avg = dav1d_w_avg_8bpc_neon;
c->mask = dav1d_mask_8bpc_neon;