ref: fe44861b1d7730d888aa5af36fbb108b578f22e0
parent: c851c65c048e1580d7b365523f9386a14ada6149
author: Martin Storsjö <[email protected]>
date: Tue Feb 4 06:50:38 EST 2020
arm64: mc: NEON implementation of put/prep 8tap/bilin for 16 bpc

Examples of checkasm benchmarks:
                                 Cortex A53     A72     A73
mc_8tap_regular_w8_0_16bpc_neon:       96.8    49.6    62.5
mc_8tap_regular_w8_h_16bpc_neon:      570.3   388.0   467.2
mc_8tap_regular_w8_hv_16bpc_neon:    1035.8   776.7   891.3
mc_8tap_regular_w8_v_16bpc_neon:      400.6   285.0   278.3
mc_bilinear_w8_0_16bpc_neon:           90.0    44.8    57.8
mc_bilinear_w8_h_16bpc_neon:          191.2   158.7   156.4
mc_bilinear_w8_hv_16bpc_neon:         295.9   234.6   244.9
mc_bilinear_w8_v_16bpc_neon:          147.2    98.7    89.2
mct_8tap_regular_w8_0_16bpc_neon:     139.4    78.7    84.9
mct_8tap_regular_w8_h_16bpc_neon:     612.5   396.8   479.1
mct_8tap_regular_w8_hv_16bpc_neon:   1112.4   814.6   963.2
mct_8tap_regular_w8_v_16bpc_neon:     461.8   370.8   353.4
mct_bilinear_w8_0_16bpc_neon:         135.6    76.2    80.5
mct_bilinear_w8_h_16bpc_neon:         211.3   159.4   141.7
mct_bilinear_w8_hv_16bpc_neon:        325.7   237.2   227.2
mct_bilinear_w8_v_16bpc_neon:         180.7   135.9   129.5

For comparison, the corresponding numbers for 8 bpc:

mc_8tap_regular_w8_0_8bpc_neon:        78.6    41.0    39.5
mc_8tap_regular_w8_h_8bpc_neon:       371.2   299.6   348.3
mc_8tap_regular_w8_hv_8bpc_neon:      817.1   675.0   726.5
mc_8tap_regular_w8_v_8bpc_neon:       243.7   260.4   253.0
mc_bilinear_w8_0_8bpc_neon:            74.8    35.4    36.1
mc_bilinear_w8_h_8bpc_neon:           179.9    69.9    79.2
mc_bilinear_w8_hv_8bpc_neon:          210.8   132.4   144.8
mc_bilinear_w8_v_8bpc_neon:           141.6    64.9    65.4
mct_8tap_regular_w8_0_8bpc_neon:      101.7    54.4    59.5
mct_8tap_regular_w8_h_8bpc_neon:      391.3   329.1   358.3
mct_8tap_regular_w8_hv_8bpc_neon:     880.4   754.9   829.4
mct_8tap_regular_w8_v_8bpc_neon:      270.8   300.8   277.4
mct_bilinear_w8_0_8bpc_neon:           97.6    54.0    55.4
mct_bilinear_w8_h_8bpc_neon:          173.3    73.5    79.5
mct_bilinear_w8_hv_8bpc_neon:         228.3   163.0   174.0
mct_bilinear_w8_v_8bpc_neon:          128.9    72.5    63.3
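
As a rough scalar model of the scaling these 16 bpc paths apply, the
horizontal-only 8tap case can be sketched in C as below: put rounds twice
and clips to the pixel range, while prep rounds once and stores a
bias-subtracted intermediate. This is only an illustration following the
in-line comments in the assembly; the helper names and layout are invented
for the sketch and are not dav1d's actual C code.

    #include <stdint.h>

    #define PREP_BIAS 8192 /* dav1d's 16 bpc prep bias (defined elsewhere in the sources) */

    static inline int rshift_rnd(int v, int sh) { /* rounding right shift */
        return (v + (1 << (sh - 1))) >> sh;
    }

    static inline int clip(int v, int max) {
        return v < 0 ? 0 : v > max ? max : v;
    }

    /* put: filter, round by 6-intermediate_bits, round by intermediate_bits, clip. */
    static uint16_t put_8tap_h_sample(const int8_t fh[8], const uint16_t *src,
                                      int bitdepth_max)
    {
        const int intermediate_bits = __builtin_clz(bitdepth_max) - 18; /* 4 for 10 bpc */
        int sum = 0;
        for (int i = 0; i < 8; i++)
            sum += fh[i] * src[i];
        const int mid = rshift_rnd(sum, 6 - intermediate_bits);
        return (uint16_t)clip(rshift_rnd(mid, intermediate_bits), bitdepth_max);
    }

    /* prep: filter, round by 6-intermediate_bits, subtract PREP_BIAS. */
    static int16_t prep_8tap_h_sample(const int8_t fh[8], const uint16_t *src,
                                      int bitdepth_max)
    {
        const int intermediate_bits = __builtin_clz(bitdepth_max) - 18;
        int sum = 0;
        for (int i = 0; i < 8; i++)
            sum += fh[i] * src[i];
        return (int16_t)(rshift_rnd(sum, 6 - intermediate_bits) - PREP_BIAS);
    }

The vertical and hv paths follow the same scheme; in the hv case the second
pass is shifted down by 6+intermediate_bits for put and by a plain 6 for
prep, as noted in the comments below.
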
--- a/src/arm/64/mc16.S
+++ b/src/arm/64/mc16.S
@@ -239,3 +239,2198 @@
bidir_fn avg, w6
bidir_fn w_avg, w7
bidir_fn mask, w7
+
+
+// This has got the same signature as the put_8tap functions,
+// and assumes that x9 is set to (clz(w)-24).
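+// The jump table at L(put_tbl) holds 16-bit offsets backwards from the
+// table label; the entry selected by x9 is loaded, subtracted from the
+// table address and branched to.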
+function put_neon
+ adr x10, L(put_tbl)
+ ldrh w9, [x10, x9, lsl #1]
+ sub x10, x10, w9, uxtw
+ br x10
+
+2:
+ ld1 {v0.s}[0], [x2], x3
+ ld1 {v1.s}[0], [x2], x3
+ subs w5, w5, #2
+ st1 {v0.s}[0], [x0], x1
+ st1 {v1.s}[0], [x0], x1
+ b.gt 2b
+ ret
+4:
+ ld1 {v0.8b}, [x2], x3
+ ld1 {v1.8b}, [x2], x3
+ subs w5, w5, #2
+ st1 {v0.8b}, [x0], x1
+ st1 {v1.8b}, [x0], x1
+ b.gt 4b
+ ret
+80:
+ add x8, x0, x1
+ lsl x1, x1, #1
+ add x9, x2, x3
+ lsl x3, x3, #1
+8:
+ ld1 {v0.16b}, [x2], x3
+ ld1 {v1.16b}, [x9], x3
+ subs w5, w5, #2
+ st1 {v0.16b}, [x0], x1
+ st1 {v1.16b}, [x8], x1
+ b.gt 8b
+ ret
+16:
+ ldp x6, x7, [x2]
+ ldp x8, x9, [x2, #16]
+ stp x6, x7, [x0]
+ subs w5, w5, #1
+ stp x8, x9, [x0, #16]
+ add x2, x2, x3
+ add x0, x0, x1
+ b.gt 16b
+ ret
+32:
+ ldp x6, x7, [x2]
+ ldp x8, x9, [x2, #16]
+ stp x6, x7, [x0]
+ ldp x10, x11, [x2, #32]
+ stp x8, x9, [x0, #16]
+ subs w5, w5, #1
+ ldp x12, x13, [x2, #48]
+ stp x10, x11, [x0, #32]
+ stp x12, x13, [x0, #48]
+ add x2, x2, x3
+ add x0, x0, x1
+ b.gt 32b
+ ret
+64:
+ ldp q0, q1, [x2]
+ ldp q2, q3, [x2, #32]
+ stp q0, q1, [x0]
+ ldp q4, q5, [x2, #64]
+ stp q2, q3, [x0, #32]
+ ldp q6, q7, [x2, #96]
+ subs w5, w5, #1
+ stp q4, q5, [x0, #64]
+ stp q6, q7, [x0, #96]
+ add x2, x2, x3
+ add x0, x0, x1
+ b.gt 64b
+ ret
+128:
+ ldp q0, q1, [x2]
+ ldp q2, q3, [x2, #32]
+ stp q0, q1, [x0]
+ ldp q4, q5, [x2, #64]
+ stp q2, q3, [x0, #32]
+ ldp q6, q7, [x2, #96]
+ subs w5, w5, #1
+ stp q4, q5, [x0, #64]
+ ldp q16, q17, [x2, #128]
+ stp q6, q7, [x0, #96]
+ ldp q18, q19, [x2, #160]
+ stp q16, q17, [x0, #128]
+ ldp q20, q21, [x2, #192]
+ stp q18, q19, [x0, #160]
+ ldp q22, q23, [x2, #224]
+ stp q20, q21, [x0, #192]
+ stp q22, q23, [x0, #224]
+ add x2, x2, x3
+ add x0, x0, x1
+ b.gt 128b
+ ret
+
+L(put_tbl):
+ .hword L(put_tbl) - 128b
+ .hword L(put_tbl) - 64b
+ .hword L(put_tbl) - 32b
+ .hword L(put_tbl) - 16b
+ .hword L(put_tbl) - 80b
+ .hword L(put_tbl) - 4b
+ .hword L(put_tbl) - 2b
+endfunc
+
+
+// This has got the same signature as the prep_8tap functions,
+// and assumes that x9 is set to (clz(w)-24), w7 to intermediate_bits and
+// x8 to w*2.
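+// With no filtering, prep just shifts each pixel left by intermediate_bits
+// and subtracts PREP_BIAS before storing to the intermediate buffer.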
+function prep_neon
+ adr x10, L(prep_tbl)
+ ldrh w9, [x10, x9, lsl #1]
+ dup v31.8h, w7 // intermediate_bits
+ movi v30.8h, #(PREP_BIAS >> 8), lsl #8
+ sub x10, x10, w9, uxtw
+ br x10
+
+40:
+ add x9, x1, x2
+ lsl x2, x2, #1
+4:
+ ld1 {v0.d}[0], [x1], x2
+ ld1 {v0.d}[1], [x9], x2
+ subs w4, w4, #2
+ sshl v0.8h, v0.8h, v31.8h
+ sub v0.8h, v0.8h, v30.8h
+ st1 {v0.8h}, [x0], #16
+ b.gt 4b
+ ret
+80:
+ add x9, x1, x2
+ lsl x2, x2, #1
+8:
+ ld1 {v0.8h}, [x1], x2
+ ld1 {v1.8h}, [x9], x2
+ subs w4, w4, #2
+ sshl v0.8h, v0.8h, v31.8h
+ sshl v1.8h, v1.8h, v31.8h
+ sub v0.8h, v0.8h, v30.8h
+ sub v1.8h, v1.8h, v30.8h
+ st1 {v0.8h, v1.8h}, [x0], #32
+ b.gt 8b
+ ret
+16:
+ ldp q0, q1, [x1]
+ add x1, x1, x2
+ sshl v0.8h, v0.8h, v31.8h
+ ldp q2, q3, [x1]
+ add x1, x1, x2
+ subs w4, w4, #2
+ sshl v1.8h, v1.8h, v31.8h
+ sshl v2.8h, v2.8h, v31.8h
+ sshl v3.8h, v3.8h, v31.8h
+ sub v0.8h, v0.8h, v30.8h
+ sub v1.8h, v1.8h, v30.8h
+ sub v2.8h, v2.8h, v30.8h
+ sub v3.8h, v3.8h, v30.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ b.gt 16b
+ ret
+32:
+ ldp q0, q1, [x1]
+ sshl v0.8h, v0.8h, v31.8h
+ ldp q2, q3, [x1, #32]
+ add x1, x1, x2
+ sshl v1.8h, v1.8h, v31.8h
+ sshl v2.8h, v2.8h, v31.8h
+ sshl v3.8h, v3.8h, v31.8h
+ subs w4, w4, #1
+ sub v0.8h, v0.8h, v30.8h
+ sub v1.8h, v1.8h, v30.8h
+ sub v2.8h, v2.8h, v30.8h
+ sub v3.8h, v3.8h, v30.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ b.gt 32b
+ ret
+64:
+ ldp q0, q1, [x1]
+ subs w4, w4, #1
+ sshl v0.8h, v0.8h, v31.8h
+ ldp q2, q3, [x1, #32]
+ sshl v1.8h, v1.8h, v31.8h
+ ldp q4, q5, [x1, #64]
+ sshl v2.8h, v2.8h, v31.8h
+ sshl v3.8h, v3.8h, v31.8h
+ ldp q6, q7, [x1, #96]
+ add x1, x1, x2
+ sshl v4.8h, v4.8h, v31.8h
+ sshl v5.8h, v5.8h, v31.8h
+ sshl v6.8h, v6.8h, v31.8h
+ sshl v7.8h, v7.8h, v31.8h
+ sub v0.8h, v0.8h, v30.8h
+ sub v1.8h, v1.8h, v30.8h
+ sub v2.8h, v2.8h, v30.8h
+ sub v3.8h, v3.8h, v30.8h
+ stp q0, q1, [x0]
+ sub v4.8h, v4.8h, v30.8h
+ sub v5.8h, v5.8h, v30.8h
+ stp q2, q3, [x0, #32]
+ sub v6.8h, v6.8h, v30.8h
+ sub v7.8h, v7.8h, v30.8h
+ stp q4, q5, [x0, #64]
+ stp q6, q7, [x0, #96]
+ add x0, x0, x8
+ b.gt 64b
+ ret
+128:
+ ldp q0, q1, [x1]
+ subs w4, w4, #1
+ sshl v0.8h, v0.8h, v31.8h
+ ldp q2, q3, [x1, #32]
+ sshl v1.8h, v1.8h, v31.8h
+ ldp q4, q5, [x1, #64]
+ sshl v2.8h, v2.8h, v31.8h
+ sshl v3.8h, v3.8h, v31.8h
+ ldp q6, q7, [x1, #96]
+ sshl v4.8h, v4.8h, v31.8h
+ sshl v5.8h, v5.8h, v31.8h
+ ldp q16, q17, [x1, #128]
+ sshl v6.8h, v6.8h, v31.8h
+ sshl v7.8h, v7.8h, v31.8h
+ ldp q18, q19, [x1, #160]
+ sshl v16.8h, v16.8h, v31.8h
+ sshl v17.8h, v17.8h, v31.8h
+ ldp q20, q21, [x1, #192]
+ sshl v18.8h, v18.8h, v31.8h
+ sshl v19.8h, v19.8h, v31.8h
+ ldp q22, q23, [x1, #224]
+ add x1, x1, x2
+ sshl v20.8h, v20.8h, v31.8h
+ sshl v21.8h, v21.8h, v31.8h
+ sshl v22.8h, v22.8h, v31.8h
+ sshl v23.8h, v23.8h, v31.8h
+ sub v0.8h, v0.8h, v30.8h
+ sub v1.8h, v1.8h, v30.8h
+ sub v2.8h, v2.8h, v30.8h
+ sub v3.8h, v3.8h, v30.8h
+ stp q0, q1, [x0]
+ sub v4.8h, v4.8h, v30.8h
+ sub v5.8h, v5.8h, v30.8h
+ stp q2, q3, [x0, #32]
+ sub v6.8h, v6.8h, v30.8h
+ sub v7.8h, v7.8h, v30.8h
+ stp q4, q5, [x0, #64]
+ sub v16.8h, v16.8h, v30.8h
+ sub v17.8h, v17.8h, v30.8h
+ stp q6, q7, [x0, #96]
+ sub v18.8h, v18.8h, v30.8h
+ sub v19.8h, v19.8h, v30.8h
+ stp q16, q17, [x0, #128]
+ sub v20.8h, v20.8h, v30.8h
+ sub v21.8h, v21.8h, v30.8h
+ stp q18, q19, [x0, #160]
+ sub v22.8h, v22.8h, v30.8h
+ sub v23.8h, v23.8h, v30.8h
+ stp q20, q21, [x0, #192]
+ stp q22, q23, [x0, #224]
+ add x0, x0, x8
+ b.gt 128b
+ ret
+
+L(prep_tbl):
+ .hword L(prep_tbl) - 128b
+ .hword L(prep_tbl) - 64b
+ .hword L(prep_tbl) - 32b
+ .hword L(prep_tbl) - 16b
+ .hword L(prep_tbl) - 80b
+ .hword L(prep_tbl) - 40b
+endfunc
+
+
+.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
+ ld1 {\d0\wd}[0], [\s0], \strd
+ ld1 {\d1\wd}[0], [\s1], \strd
+.ifnb \d2
+ ld1 {\d2\wd}[0], [\s0], \strd
+ ld1 {\d3\wd}[0], [\s1], \strd
+.endif
+.ifnb \d4
+ ld1 {\d4\wd}[0], [\s0], \strd
+.endif
+.ifnb \d5
+ ld1 {\d5\wd}[0], [\s1], \strd
+.endif
+.ifnb \d6
+ ld1 {\d6\wd}[0], [\s0], \strd
+.endif
+.endm
+.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
+ ld1 {\d0\wd}, [\s0], \strd
+ ld1 {\d1\wd}, [\s1], \strd
+.ifnb \d2
+ ld1 {\d2\wd}, [\s0], \strd
+ ld1 {\d3\wd}, [\s1], \strd
+.endif
+.ifnb \d4
+ ld1 {\d4\wd}, [\s0], \strd
+.endif
+.ifnb \d5
+ ld1 {\d5\wd}, [\s1], \strd
+.endif
+.ifnb \d6
+ ld1 {\d6\wd}, [\s0], \strd
+.endif
+.endm
+.macro load_regpair s0, s1, strd, wd, d0, d1, d2, d3, d4, d5
+ ld1 {\d0\wd, \d1\wd}, [\s0], \strd
+.ifnb \d2
+ ld1 {\d2\wd, \d3\wd}, [\s1], \strd
+.endif
+.ifnb \d4
+ ld1 {\d4\wd, \d5\wd}, [\s0], \strd
+.endif
+.endm
+.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+ load_slice \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_4h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+ load_reg \s0, \s1, \strd, .4h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_8h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+ load_reg \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_16h s0, s1, strd, d0, d1, d2, d3, d4, d5
+ load_regpair \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5
+.endm
+.macro interleave_1 wd, r0, r1, r2, r3, r4
+ trn1 \r0\wd, \r0\wd, \r1\wd
+ trn1 \r1\wd, \r1\wd, \r2\wd
+.ifnb \r3
+ trn1 \r2\wd, \r2\wd, \r3\wd
+ trn1 \r3\wd, \r3\wd, \r4\wd
+.endif
+.endm
+.macro interleave_1_s r0, r1, r2, r3, r4
+ interleave_1 .2s, \r0, \r1, \r2, \r3, \r4
+.endm
+.macro umin_h c, wd, r0, r1, r2, r3
+ umin \r0\wd, \r0\wd, \c\wd
+.ifnb \r1
+ umin \r1\wd, \r1\wd, \c\wd
+.endif
+.ifnb \r2
+ umin \r2\wd, \r2\wd, \c\wd
+ umin \r3\wd, \r3\wd, \c\wd
+.endif
+.endm
+.macro sub_h c, wd, r0, r1, r2, r3
+ sub \r0\wd, \r0\wd, \c\wd
+.ifnb \r1
+ sub \r1\wd, \r1\wd, \c\wd
+.endif
+.ifnb \r2
+ sub \r2\wd, \r2\wd, \c\wd
+ sub \r3\wd, \r3\wd, \c\wd
+.endif
+.endm
+.macro smull_smlal_4 d, s0, s1, s2, s3
+ smull \d\().4s, \s0\().4h, v0.h[0]
+ smlal \d\().4s, \s1\().4h, v0.h[1]
+ smlal \d\().4s, \s2\().4h, v0.h[2]
+ smlal \d\().4s, \s3\().4h, v0.h[3]
+.endm
+.macro smull2_smlal2_4 d, s0, s1, s2, s3
+ smull2 \d\().4s, \s0\().8h, v0.h[0]
+ smlal2 \d\().4s, \s1\().8h, v0.h[1]
+ smlal2 \d\().4s, \s2\().8h, v0.h[2]
+ smlal2 \d\().4s, \s3\().8h, v0.h[3]
+.endm
+.macro smull_smlal_8 d, s0, s1, s2, s3, s4, s5, s6, s7
+ smull \d\().4s, \s0\().4h, v0.h[0]
+ smlal \d\().4s, \s1\().4h, v0.h[1]
+ smlal \d\().4s, \s2\().4h, v0.h[2]
+ smlal \d\().4s, \s3\().4h, v0.h[3]
+ smlal \d\().4s, \s4\().4h, v0.h[4]
+ smlal \d\().4s, \s5\().4h, v0.h[5]
+ smlal \d\().4s, \s6\().4h, v0.h[6]
+ smlal \d\().4s, \s7\().4h, v0.h[7]
+.endm
+.macro smull2_smlal2_8 d, s0, s1, s2, s3, s4, s5, s6, s7
+ smull2 \d\().4s, \s0\().8h, v0.h[0]
+ smlal2 \d\().4s, \s1\().8h, v0.h[1]
+ smlal2 \d\().4s, \s2\().8h, v0.h[2]
+ smlal2 \d\().4s, \s3\().8h, v0.h[3]
+ smlal2 \d\().4s, \s4\().8h, v0.h[4]
+ smlal2 \d\().4s, \s5\().8h, v0.h[5]
+ smlal2 \d\().4s, \s6\().8h, v0.h[6]
+ smlal2 \d\().4s, \s7\().8h, v0.h[7]
+.endm
+.macro sqrshrun_h shift, r0, r1, r2, r3
+ sqrshrun \r0\().4h, \r0\().4s, #\shift
+.ifnb \r1
+ sqrshrun2 \r0\().8h, \r1\().4s, #\shift
+.endif
+.ifnb \r2
+ sqrshrun \r2\().4h, \r2\().4s, #\shift
+ sqrshrun2 \r2\().8h, \r3\().4s, #\shift
+.endif
+.endm
+.macro xtn_h r0, r1, r2, r3
+ xtn \r0\().4h, \r0\().4s
+ xtn2 \r0\().8h, \r1\().4s
+.ifnb \r2
+ xtn \r2\().4h, \r2\().4s
+ xtn2 \r2\().8h, \r3\().4s
+.endif
+.endm
+.macro srshl_s shift, r0, r1, r2, r3
+ srshl \r0\().4s, \r0\().4s, \shift\().4s
+ srshl \r1\().4s, \r1\().4s, \shift\().4s
+.ifnb \r2
+ srshl \r2\().4s, \r2\().4s, \shift\().4s
+ srshl \r3\().4s, \r3\().4s, \shift\().4s
+.endif
+.endm
+.macro st_s strd, reg, lanes
+ st1 {\reg\().s}[0], [x0], \strd
+ st1 {\reg\().s}[1], [x9], \strd
+.if \lanes > 2
+ st1 {\reg\().s}[2], [x0], \strd
+ st1 {\reg\().s}[3], [x9], \strd
+.endif
+.endm
+.macro st_d strd, r0, r1
+ st1 {\r0\().d}[0], [x0], \strd
+ st1 {\r0\().d}[1], [x9], \strd
+.ifnb \r1
+ st1 {\r1\().d}[0], [x0], \strd
+ st1 {\r1\().d}[1], [x9], \strd
+.endif
+.endm
+.macro shift_store_4 type, strd, r0, r1, r2, r3
+.ifc \type, put
+ sqrshrun_h 6, \r0, \r1, \r2, \r3
+ umin_h v31, .8h, \r0, \r2
+.else
+ srshl_s v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits)
+ xtn_h \r0, \r1, \r2, \r3
+ sub_h v29, .8h, \r0, \r2 // PREP_BIAS
+.endif
+ st_d \strd, \r0, \r2
+.endm
+.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7
+ st1 {\r0\wd}, [x0], \strd
+ st1 {\r1\wd}, [x9], \strd
+.ifnb \r2
+ st1 {\r2\wd}, [x0], \strd
+ st1 {\r3\wd}, [x9], \strd
+.endif
+.ifnb \r4
+ st1 {\r4\wd}, [x0], \strd
+ st1 {\r5\wd}, [x9], \strd
+ st1 {\r6\wd}, [x0], \strd
+ st1 {\r7\wd}, [x9], \strd
+.endif
+.endm
+.macro st_8h strd, r0, r1, r2, r3, r4, r5, r6, r7
+ st_reg \strd, .8h, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
+.endm
+.macro shift_store_8 type, strd, r0, r1, r2, r3
+.ifc \type, put
+ sqrshrun_h 6, \r0, \r1, \r2, \r3
+ umin_h v31, .8h, \r0, \r2
+.else
+ srshl_s v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits)
+ xtn_h \r0, \r1, \r2, \r3
+ sub_h v29, .8h, \r0, \r2 // PREP_BIAS
+.endif
+ st_8h \strd, \r0, \r2
+.endm
+.macro shift_store_16 type, strd, dst, r0, r1, r2, r3
+.ifc \type, put
+ sqrshrun_h 6, \r0, \r1, \r2, \r3
+ umin \r0\().8h, \r0\().8h, v31.8h
+ umin \r1\().8h, \r2\().8h, v31.8h
+.else
+ srshl_s v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits)
+ xtn_h \r0, \r1, \r2, \r3
+ sub \r0\().8h, \r0\().8h, v29.8h
+ sub \r1\().8h, \r2\().8h, v29.8h
+.endif
+ st1 {\r0\().8h, \r1\().8h}, [\dst], \strd
+.endm
+
+.macro make_8tap_fn op, type, type_h, type_v
+function \op\()_8tap_\type\()_16bpc_neon, export=1
+ mov w9, \type_h
+ mov w10, \type_v
+ b \op\()_8tap_neon
+endfunc
+.endm
+
+// No spaces in these expressions, due to gas-preprocessor.
+#define REGULAR ((0*15<<7)|3*15)
+#define SMOOTH ((1*15<<7)|4*15)
+#define SHARP ((2*15<<7)|3*15)
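+// Each constant packs two mc_subpel_filters set indices: bits 7-13 hold the
+// set used when the filtered dimension is > 4, bits 0-6 the 4-tap set used
+// when it is <= 4 (each set is 15 entries of 8 coefficients).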
+
+.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2
+make_8tap_fn \type, regular, REGULAR, REGULAR
+make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
+make_8tap_fn \type, regular_sharp, REGULAR, SHARP
+make_8tap_fn \type, smooth, SMOOTH, SMOOTH
+make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR
+make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP
+make_8tap_fn \type, sharp, SHARP, SHARP
+make_8tap_fn \type, sharp_regular, SHARP, REGULAR
+make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH
+
+function \type\()_8tap_neon
+.ifc \bdmax, w8
+ ldr w8, [sp]
+.endif
+ mov w11, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0)
+ mul \mx, \mx, w11
+ mul \my, \my, w11
+ add \mx, \mx, w9 // mx, 8tap_h, 4tap_h
+ add \my, \my, w10 // my, 8tap_v, 4tap_v
+.ifc \type, prep
+ uxtw \d_strd, \w
+ lsl \d_strd, \d_strd, #1
+.endif
+
+ dup v31.8h, \bdmax // bitdepth_max
+ clz \bdmax, \bdmax
+ clz w9, \w
+ sub \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18
+ mov w12, #6
+ tst \mx, #(0x7f << 14)
+ sub w9, w9, #24
+ add w13, w12, \bdmax // 6 + intermediate_bits
+ sub w12, w12, \bdmax // 6 - intermediate_bits
+ movrel x11, X(mc_subpel_filters), -8
+ b.ne L(\type\()_8tap_h)
+ tst \my, #(0x7f << 14)
+ b.ne L(\type\()_8tap_v)
+ b \type\()_neon
+
+L(\type\()_8tap_h):
+ cmp \w, #4
+ ubfx w10, \mx, #7, #7
+ and \mx, \mx, #0x7f
+ b.le 4f
+ mov \mx, w10
+4:
+ tst \my, #(0x7f << 14)
+ add \xmx, x11, \mx, uxtw #3
+ b.ne L(\type\()_8tap_hv)
+
+ adr x10, L(\type\()_8tap_h_tbl)
+ dup v30.4s, w12 // 6 - intermediate_bits
+ ldrh w9, [x10, x9, lsl #1]
+ neg v30.4s, v30.4s // -(6-intermediate_bits)
+.ifc \type, put
+ dup v29.8h, \bdmax // intermediate_bits
+.else
+ movi v28.8h, #(PREP_BIAS >> 8), lsl #8
+.endif
+ sub x10, x10, w9, uxtw
+.ifc \type, put
+ neg v29.8h, v29.8h // -intermediate_bits
+.endif
+ br x10
+
+20: // 2xN h
+.ifc \type, put
+ add \xmx, \xmx, #2
+ ld1 {v0.s}[0], [\xmx]
+ sub \src, \src, #2
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ sxtl v0.8h, v0.8b
+2:
+ ld1 {v4.8h}, [\src], \s_strd
+ ld1 {v6.8h}, [\sr2], \s_strd
+ ext v5.16b, v4.16b, v4.16b, #2
+ ext v7.16b, v6.16b, v6.16b, #2
+ subs \h, \h, #2
+ trn1 v3.2s, v4.2s, v6.2s
+ trn2 v6.2s, v4.2s, v6.2s
+ trn1 v4.2s, v5.2s, v7.2s
+ trn2 v7.2s, v5.2s, v7.2s
+ smull v3.4s, v3.4h, v0.h[0]
+ smlal v3.4s, v4.4h, v0.h[1]
+ smlal v3.4s, v6.4h, v0.h[2]
+ smlal v3.4s, v7.4h, v0.h[3]
+ srshl v3.4s, v3.4s, v30.4s // -(6-intermediate_bits)
+ sqxtun v3.4h, v3.4s
+ srshl v3.4h, v3.4h, v29.4h // -intermediate_bits
+ umin v3.4h, v3.4h, v31.4h
+ st1 {v3.s}[0], [\dst], \d_strd
+ st1 {v3.s}[1], [\ds2], \d_strd
+ b.gt 2b
+ ret
+.endif
+
+40: // 4xN h
+ add \xmx, \xmx, #2
+ ld1 {v0.s}[0], [\xmx]
+ sub \src, \src, #2
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ sxtl v0.8h, v0.8b
+4:
+ ld1 {v16.8h}, [\src], \s_strd
+ ld1 {v20.8h}, [\sr2], \s_strd
+ ext v17.16b, v16.16b, v16.16b, #2
+ ext v18.16b, v16.16b, v16.16b, #4
+ ext v19.16b, v16.16b, v16.16b, #6
+ ext v21.16b, v20.16b, v20.16b, #2
+ ext v22.16b, v20.16b, v20.16b, #4
+ ext v23.16b, v20.16b, v20.16b, #6
+ subs \h, \h, #2
+ smull v16.4s, v16.4h, v0.h[0]
+ smlal v16.4s, v17.4h, v0.h[1]
+ smlal v16.4s, v18.4h, v0.h[2]
+ smlal v16.4s, v19.4h, v0.h[3]
+ smull v20.4s, v20.4h, v0.h[0]
+ smlal v20.4s, v21.4h, v0.h[1]
+ smlal v20.4s, v22.4h, v0.h[2]
+ smlal v20.4s, v23.4h, v0.h[3]
+ srshl v16.4s, v16.4s, v30.4s // -(6-intermediate_bits)
+ srshl v20.4s, v20.4s, v30.4s // -(6-intermediate_bits)
+.ifc \type, put
+ sqxtun v16.4h, v16.4s
+ sqxtun2 v16.8h, v20.4s
+ srshl v16.8h, v16.8h, v29.8h // -intermediate_bits
+ umin v16.8h, v16.8h, v31.8h
+.else
+ xtn v16.4h, v16.4s
+ xtn2 v16.8h, v20.4s
+ sub v16.8h, v16.8h, v28.8h // PREP_BIAS
+.endif
+ st1 {v16.d}[0], [\dst], \d_strd
+ st1 {v16.d}[1], [\ds2], \d_strd
+ b.gt 4b
+ ret
+
+80:
+160:
+320:
+640:
+1280: // 8xN, 16xN, 32xN, ... h
+ ld1 {v0.8b}, [\xmx]
+ sub \src, \src, #6
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ sxtl v0.8h, v0.8b
+
+ sub \s_strd, \s_strd, \w, uxtw #1
+ sub \s_strd, \s_strd, #16
+.ifc \type, put
+ lsl \d_strd, \d_strd, #1
+ sub \d_strd, \d_strd, \w, uxtw #1
+.endif
+81:
+ ld1 {v16.8h, v17.8h}, [\src], #32
+ ld1 {v20.8h, v21.8h}, [\sr2], #32
+ mov \mx, \w
+
+8:
+ smull v18.4s, v16.4h, v0.h[0]
+ smull2 v19.4s, v16.8h, v0.h[0]
+ smull v22.4s, v20.4h, v0.h[0]
+ smull2 v23.4s, v20.8h, v0.h[0]
+.irpc i, 1234567
+ ext v24.16b, v16.16b, v17.16b, #(2*\i)
+ ext v25.16b, v20.16b, v21.16b, #(2*\i)
+ smlal v18.4s, v24.4h, v0.h[\i]
+ smlal2 v19.4s, v24.8h, v0.h[\i]
+ smlal v22.4s, v25.4h, v0.h[\i]
+ smlal2 v23.4s, v25.8h, v0.h[\i]
+.endr
+ subs \mx, \mx, #8
+ srshl v18.4s, v18.4s, v30.4s // -(6-intermediate_bits)
+ srshl v19.4s, v19.4s, v30.4s // -(6-intermediate_bits)
+ srshl v22.4s, v22.4s, v30.4s // -(6-intermediate_bits)
+ srshl v23.4s, v23.4s, v30.4s // -(6-intermediate_bits)
+.ifc \type, put
+ sqxtun v18.4h, v18.4s
+ sqxtun2 v18.8h, v19.4s
+ sqxtun v22.4h, v22.4s
+ sqxtun2 v22.8h, v23.4s
+ srshl v18.8h, v18.8h, v29.8h // -intermediate_bits
+ srshl v22.8h, v22.8h, v29.8h // -intermediate_bits
+ umin v18.8h, v18.8h, v31.8h
+ umin v22.8h, v22.8h, v31.8h
+.else
+ xtn v18.4h, v18.4s
+ xtn2 v18.8h, v19.4s
+ xtn v22.4h, v22.4s
+ xtn2 v22.8h, v23.4s
+ sub v18.8h, v18.8h, v28.8h // PREP_BIAS
+ sub v22.8h, v22.8h, v28.8h // PREP_BIAS
+.endif
+ st1 {v18.8h}, [\dst], #16
+ st1 {v22.8h}, [\ds2], #16
+ b.le 9f
+
+ mov v16.16b, v17.16b
+ mov v20.16b, v21.16b
+ ld1 {v17.8h}, [\src], #16
+ ld1 {v21.8h}, [\sr2], #16
+ b 8b
+
+9:
+ add \dst, \dst, \d_strd
+ add \ds2, \ds2, \d_strd
+ add \src, \src, \s_strd
+ add \sr2, \sr2, \s_strd
+
+ subs \h, \h, #2
+ b.gt 81b
+ ret
+
+L(\type\()_8tap_h_tbl):
+ .hword L(\type\()_8tap_h_tbl) - 1280b
+ .hword L(\type\()_8tap_h_tbl) - 640b
+ .hword L(\type\()_8tap_h_tbl) - 320b
+ .hword L(\type\()_8tap_h_tbl) - 160b
+ .hword L(\type\()_8tap_h_tbl) - 80b
+ .hword L(\type\()_8tap_h_tbl) - 40b
+ .hword L(\type\()_8tap_h_tbl) - 20b
+ .hword 0
+
+
+L(\type\()_8tap_v):
+ cmp \h, #4
+ ubfx w10, \my, #7, #7
+ and \my, \my, #0x7f
+ b.le 4f
+ mov \my, w10
+4:
+ add \xmy, x11, \my, uxtw #3
+
+.ifc \type, prep
+ dup v30.4s, w12 // 6 - intermediate_bits
+ movi v29.8h, #(PREP_BIAS >> 8), lsl #8
+.endif
+ adr x10, L(\type\()_8tap_v_tbl)
+ ldrh w9, [x10, x9, lsl #1]
+.ifc \type, prep
+ neg v30.4s, v30.4s // -(6-intermediate_bits)
+.endif
+ sub x10, x10, w9, uxtw
+ br x10
+
+20: // 2xN v
+.ifc \type, put
+ b.gt 28f
+
+ cmp \h, #2
+ add \xmy, \xmy, #2
+ ld1 {v0.s}[0], [\xmy]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+
+ // 2x2 v
+ load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5
+ interleave_1_s v1, v2, v3, v4, v5
+ b.gt 24f
+ smull_smlal_4 v6, v1, v2, v3, v4
+ sqrshrun_h 6, v6
+ umin_h v31, .8h, v6
+ st_s \d_strd, v6, 2
+ ret
+
+24: // 2x4 v
+ load_s \sr2, \src, \s_strd, v6, v7
+ interleave_1_s v5, v6, v7
+ smull_smlal_4 v16, v1, v2, v3, v4
+ smull_smlal_4 v17, v3, v4, v5, v6
+ sqrshrun_h 6, v16, v17
+ umin_h v31, .8h, v16
+ st_s \d_strd, v16, 4
+ ret
+
+28: // 2x8, 2x16 v
+ ld1 {v0.8b}, [\xmy]
+ sub \sr2, \src, \s_strd, lsl #1
+ add \ds2, \dst, \d_strd
+ sub \src, \sr2, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+ sxtl v0.8h, v0.8b
+
+ load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5, v6, v7
+ interleave_1_s v1, v2, v3, v4, v5
+ interleave_1_s v5, v6, v7
+216:
+ subs \h, \h, #8
+ load_s \sr2, \src, \s_strd, v16, v17, v18, v19
+ load_s \sr2, \src, \s_strd, v20, v21, v22, v23
+ interleave_1_s v7, v16, v17, v18, v19
+ interleave_1_s v19, v20, v21, v22, v23
+ smull_smlal_8 v24, v1, v2, v3, v4, v5, v6, v7, v16
+ smull_smlal_8 v25, v3, v4, v5, v6, v7, v16, v17, v18
+ smull_smlal_8 v26, v5, v6, v7, v16, v17, v18, v19, v20
+ smull_smlal_8 v27, v7, v16, v17, v18, v19, v20, v21, v22
+ sqrshrun_h 6, v24, v25, v26, v27
+ umin_h v31, .8h, v24, v26
+ st_s \d_strd, v24, 4
+ st_s \d_strd, v26, 4
+ b.le 0f
+ mov v1.16b, v17.16b
+ mov v2.16b, v18.16b
+ mov v3.16b, v19.16b
+ mov v4.16b, v20.16b
+ mov v5.16b, v21.16b
+ mov v6.16b, v22.16b
+ mov v7.16b, v23.16b
+ b 216b
+0:
+ ret
+.endif
+
+40:
+ b.gt 480f
+
+ // 4x2, 4x4 v
+ cmp \h, #2
+ add \xmy, \xmy, #2
+ ld1 {v0.s}[0], [\xmy]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+
+ load_4h \src, \sr2, \s_strd, v1, v2, v3, v4, v5
+ smull_smlal_4 v6, v1, v2, v3, v4
+ smull_smlal_4 v7, v2, v3, v4, v5
+ shift_store_4 \type, \d_strd, v6, v7
+ b.le 0f
+ load_4h \sr2, \src, \s_strd, v6, v7
+ smull_smlal_4 v1, v3, v4, v5, v6
+ smull_smlal_4 v2, v4, v5, v6, v7
+ shift_store_4 \type, \d_strd, v1, v2
+0:
+ ret
+
+480: // 4x8, 4x16 v
+ ld1 {v0.8b}, [\xmy]
+ sub \sr2, \src, \s_strd, lsl #1
+ add \ds2, \dst, \d_strd
+ sub \src, \sr2, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+
+ load_4h \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
+
+48:
+ subs \h, \h, #4
+ load_4h \sr2, \src, \s_strd, v23, v24, v25, v26
+ smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23
+ smull_smlal_8 v2, v17, v18, v19, v20, v21, v22, v23, v24
+ smull_smlal_8 v3, v18, v19, v20, v21, v22, v23, v24, v25
+ smull_smlal_8 v4, v19, v20, v21, v22, v23, v24, v25, v26
+ shift_store_4 \type, \d_strd, v1, v2, v3, v4
+ b.le 0f
+ mov v16.8b, v20.8b
+ mov v17.8b, v21.8b
+ mov v18.8b, v22.8b
+ mov v19.8b, v23.8b
+ mov v20.8b, v24.8b
+ mov v21.8b, v25.8b
+ mov v22.8b, v26.8b
+ b 48b
+0:
+ ret
+
+80:
+ b.gt 880f
+
+ // 8x2, 8x4 v
+ cmp \h, #2
+ add \xmy, \xmy, #2
+ ld1 {v0.s}[0], [\xmy]
+ sub \src, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+
+ load_8h \src, \sr2, \s_strd, v1, v2, v3, v4, v5
+ smull_smlal_4 v16, v1, v2, v3, v4
+ smull2_smlal2_4 v17, v1, v2, v3, v4
+ smull_smlal_4 v18, v2, v3, v4, v5
+ smull2_smlal2_4 v19, v2, v3, v4, v5
+ shift_store_8 \type, \d_strd, v16, v17, v18, v19
+ b.le 0f
+ load_8h \sr2, \src, \s_strd, v6, v7
+ smull_smlal_4 v16, v3, v4, v5, v6
+ smull2_smlal2_4 v17, v3, v4, v5, v6
+ smull_smlal_4 v18, v4, v5, v6, v7
+ smull2_smlal2_4 v19, v4, v5, v6, v7
+ shift_store_8 \type, \d_strd, v16, v17, v18, v19
+0:
+ ret
+
+880: // 8x6, 8x8, 8x16, 8x32 v
+1680: // 16x8, 16x16, ...
+320: // 32x8, 32x16, ...
+640:
+1280:
+ ld1 {v0.8b}, [\xmy]
+ sub \src, \src, \s_strd
+ sub \src, \src, \s_strd, lsl #1
+ sxtl v0.8h, v0.8b
+ mov \my, \h
+168:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ load_8h \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
+
+88:
+ subs \h, \h, #2
+ load_8h \sr2, \src, \s_strd, v23, v24
+ smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23
+ smull2_smlal2_8 v2, v16, v17, v18, v19, v20, v21, v22, v23
+ smull_smlal_8 v3, v17, v18, v19, v20, v21, v22, v23, v24
+ smull2_smlal2_8 v4, v17, v18, v19, v20, v21, v22, v23, v24
+ shift_store_8 \type, \d_strd, v1, v2, v3, v4
+ b.le 9f
+ subs \h, \h, #2
+ load_8h \sr2, \src, \s_strd, v25, v26
+ smull_smlal_8 v1, v18, v19, v20, v21, v22, v23, v24, v25
+ smull2_smlal2_8 v2, v18, v19, v20, v21, v22, v23, v24, v25
+ smull_smlal_8 v3, v19, v20, v21, v22, v23, v24, v25, v26
+ smull2_smlal2_8 v4, v19, v20, v21, v22, v23, v24, v25, v26
+ shift_store_8 \type, \d_strd, v1, v2, v3, v4
+ b.le 9f
+ mov v16.16b, v20.16b
+ mov v17.16b, v21.16b
+ mov v18.16b, v22.16b
+ mov v19.16b, v23.16b
+ mov v20.16b, v24.16b
+ mov v21.16b, v25.16b
+ mov v22.16b, v26.16b
+ b 88b
+9:
+ subs \w, \w, #8
+ b.le 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ msub \src, \s_strd, \xmy, \src
+ msub \dst, \d_strd, \xmy, \dst
+ sub \src, \src, \s_strd, lsl #3
+ mov \h, \my
+ add \src, \src, #16
+ add \dst, \dst, #16
+ b 168b
+0:
+ ret
+
+160:
+ b.gt 1680b
+
+ // 16x2, 16x4 v
+ add \xmy, \xmy, #2
+ ld1 {v0.s}[0], [\xmy]
+ sub \src, \src, \s_strd
+ sxtl v0.8h, v0.8b
+
+ load_16h \src, \src, \s_strd, v16, v17, v18, v19, v20, v21
+16:
+ load_16h \src, \src, \s_strd, v22, v23
+ subs \h, \h, #1
+ smull_smlal_4 v1, v16, v18, v20, v22
+ smull2_smlal2_4 v2, v16, v18, v20, v22
+ smull_smlal_4 v3, v17, v19, v21, v23
+ smull2_smlal2_4 v4, v17, v19, v21, v23
+ shift_store_16 \type, \d_strd, x0, v1, v2, v3, v4
+ b.le 0f
+ mov v16.16b, v18.16b
+ mov v17.16b, v19.16b
+ mov v18.16b, v20.16b
+ mov v19.16b, v21.16b
+ mov v20.16b, v22.16b
+ mov v21.16b, v23.16b
+ b 16b
+0:
+ ret
+
+L(\type\()_8tap_v_tbl):
+ .hword L(\type\()_8tap_v_tbl) - 1280b
+ .hword L(\type\()_8tap_v_tbl) - 640b
+ .hword L(\type\()_8tap_v_tbl) - 320b
+ .hword L(\type\()_8tap_v_tbl) - 160b
+ .hword L(\type\()_8tap_v_tbl) - 80b
+ .hword L(\type\()_8tap_v_tbl) - 40b
+ .hword L(\type\()_8tap_v_tbl) - 20b
+ .hword 0
+
+L(\type\()_8tap_hv):
+ cmp \h, #4
+ ubfx w10, \my, #7, #7
+ and \my, \my, #0x7f
+ b.le 4f
+ mov \my, w10
+4:
+ add \xmy, x11, \my, uxtw #3
+
+ adr x10, L(\type\()_8tap_hv_tbl)
+ dup v30.4s, w12 // 6 - intermediate_bits
+ ldrh w9, [x10, x9, lsl #1]
+ neg v30.4s, v30.4s // -(6-intermediate_bits)
+.ifc \type, put
+ dup v29.4s, w13 // 6 + intermediate_bits
+.else
+ movi v29.8h, #(PREP_BIAS >> 8), lsl #8
+.endif
+ sub x10, x10, w9, uxtw
+.ifc \type, put
+ neg v29.4s, v29.4s // -(6+intermediate_bits)
+.endif
+ br x10
+
+20:
+.ifc \type, put
+ add \xmx, \xmx, #2
+ ld1 {v0.s}[0], [\xmx]
+ b.gt 280f
+ add \xmy, \xmy, #2
+ ld1 {v1.s}[0], [\xmy]
+
+ // 2x2, 2x4 hv
+ sub \sr2, \src, #2
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30
+ sxtl v1.4s, v1.4h
+
+ ld1 {v27.8h}, [\src], \s_strd
+ ext v28.16b, v27.16b, v27.16b, #2
+ smull v27.4s, v27.4h, v0.4h
+ smull v28.4s, v28.4h, v0.4h
+ addp v27.4s, v27.4s, v28.4s
+ addp v16.4s, v27.4s, v27.4s
+ srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits)
+ bl L(\type\()_8tap_filter_2)
+
+ trn1 v16.2d, v16.2d, v24.2d
+ mov v17.16b, v24.16b
+
+2:
+ bl L(\type\()_8tap_filter_2)
+
+ ext v18.16b, v17.16b, v24.16b, #8
+ mov v19.16b, v24.16b
+ mul v2.4s, v16.4s, v1.s[0]
+ mla v2.4s, v17.4s, v1.s[1]
+ mla v2.4s, v18.4s, v1.s[2]
+ mla v2.4s, v19.4s, v1.s[3]
+
+ srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits)
+ sqxtun v2.4h, v2.4s
+ umin v2.4h, v2.4h, v31.4h
+ subs \h, \h, #2
+ st1 {v2.s}[0], [\dst], \d_strd
+ st1 {v2.s}[1], [\ds2], \d_strd
+ b.le 0f
+ mov v16.16b, v18.16b
+ mov v17.16b, v19.16b
+ b 2b
+
+280: // 2x8, 2x16, 2x32 hv
+ ld1 {v1.8b}, [\xmy]
+ sub \src, \src, #2
+ sub \sr2, \src, \s_strd, lsl #1
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30
+ sxtl2 v2.4s, v1.8h
+ sxtl v1.4s, v1.4h
+
+ ld1 {v27.8h}, [\src], \s_strd
+ ext v28.16b, v27.16b, v27.16b, #2
+ smull v27.4s, v27.4h, v0.4h
+ smull v28.4s, v28.4h, v0.4h
+ addp v27.4s, v27.4s, v28.4s
+ addp v16.4s, v27.4s, v27.4s
+ srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits)
+
+ bl L(\type\()_8tap_filter_2)
+ trn1 v16.2d, v16.2d, v24.2d
+ mov v17.16b, v24.16b
+ bl L(\type\()_8tap_filter_2)
+ ext v18.16b, v17.16b, v24.16b, #8
+ mov v19.16b, v24.16b
+ bl L(\type\()_8tap_filter_2)
+ ext v20.16b, v19.16b, v24.16b, #8
+ mov v21.16b, v24.16b
+
+28:
+ bl L(\type\()_8tap_filter_2)
+ ext v22.16b, v21.16b, v24.16b, #8
+ mov v23.16b, v24.16b
+ mul v3.4s, v16.4s, v1.s[0]
+ mla v3.4s, v17.4s, v1.s[1]
+ mla v3.4s, v18.4s, v1.s[2]
+ mla v3.4s, v19.4s, v1.s[3]
+ mla v3.4s, v20.4s, v2.s[0]
+ mla v3.4s, v21.4s, v2.s[1]
+ mla v3.4s, v22.4s, v2.s[2]
+ mla v3.4s, v23.4s, v2.s[3]
+
+ srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
+ sqxtun v3.4h, v3.4s
+ umin v3.4h, v3.4h, v31.4h
+ subs \h, \h, #2
+ st1 {v3.s}[0], [\dst], \d_strd
+ st1 {v3.s}[1], [\ds2], \d_strd
+ b.le 0f
+ mov v16.16b, v18.16b
+ mov v17.16b, v19.16b
+ mov v18.16b, v20.16b
+ mov v19.16b, v21.16b
+ mov v20.16b, v22.16b
+ mov v21.16b, v23.16b
+ b 28b
+
+0:
+ br x15
+
+L(\type\()_8tap_filter_2):
+ ld1 {v25.8h}, [\sr2], \s_strd
+ ld1 {v27.8h}, [\src], \s_strd
+ ext v26.16b, v25.16b, v25.16b, #2
+ ext v28.16b, v27.16b, v27.16b, #2
+ trn1 v24.2s, v25.2s, v27.2s
+ trn2 v27.2s, v25.2s, v27.2s
+ trn1 v25.2s, v26.2s, v28.2s
+ trn2 v28.2s, v26.2s, v28.2s
+ smull v24.4s, v24.4h, v0.h[0]
+ smlal v24.4s, v25.4h, v0.h[1]
+ smlal v24.4s, v27.4h, v0.h[2]
+ smlal v24.4s, v28.4h, v0.h[3]
+ srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
+ ret
+.endif
+
+40:
+ add \xmx, \xmx, #2
+ ld1 {v0.s}[0], [\xmx]
+ b.gt 480f
+ add \xmy, \xmy, #2
+ ld1 {v1.s}[0], [\xmy]
+ sub \sr2, \src, #2
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30
+
+ // 4x2, 4x4 hv
+ ld1 {v25.8h}, [\src], \s_strd
+ ext v26.16b, v25.16b, v25.16b, #2
+ ext v27.16b, v25.16b, v25.16b, #4
+ ext v28.16b, v25.16b, v25.16b, #6
+ smull v25.4s, v25.4h, v0.h[0]
+ smlal v25.4s, v26.4h, v0.h[1]
+ smlal v25.4s, v27.4h, v0.h[2]
+ smlal v25.4s, v28.4h, v0.h[3]
+ srshl v16.4s, v25.4s, v30.4s // -(6-intermediate_bits)
+ // The intermediates from the horizontal pass fit in 16 bit without
+ // any bias; we could just as well keep them as .4s, but narrowing
+ // them to .4h gives a significant speedup on out of order cores
+ // (at the cost of a smaller slowdown on in-order cores such as A53).
+ xtn v16.4h, v16.4s
+
+ bl L(\type\()_8tap_filter_4)
+ mov v17.8b, v24.8b
+ mov v18.8b, v25.8b
+
+4:
+ bl L(\type\()_8tap_filter_4)
+ smull v2.4s, v16.4h, v1.h[0]
+ smlal v2.4s, v17.4h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal v2.4s, v24.4h, v1.h[3]
+ smull v3.4s, v17.4h, v1.h[0]
+ smlal v3.4s, v18.4h, v1.h[1]
+ smlal v3.4s, v24.4h, v1.h[2]
+ smlal v3.4s, v25.4h, v1.h[3]
+.ifc \type, put
+ srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits)
+ srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
+ sqxtun v2.4h, v2.4s
+ sqxtun2 v2.8h, v3.4s
+ umin v2.8h, v2.8h, v31.8h
+.else
+ rshrn v2.4h, v2.4s, #6
+ rshrn2 v2.8h, v3.4s, #6
+ sub v2.8h, v2.8h, v29.8h // PREP_BIAS
+.endif
+ subs \h, \h, #2
+
+ st1 {v2.d}[0], [\dst], \d_strd
+ st1 {v2.d}[1], [\ds2], \d_strd
+ b.le 0f
+ mov v16.8b, v18.8b
+ mov v17.8b, v24.8b
+ mov v18.8b, v25.8b
+ b 4b
+
+480: // 4x8, 4x16, 4x32 hv
+ ld1 {v1.8b}, [\xmy]
+ sub \src, \src, #2
+ sub \sr2, \src, \s_strd, lsl #1
+ sub \src, \sr2, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30
+
+ ld1 {v25.8h}, [\src], \s_strd
+ ext v26.16b, v25.16b, v25.16b, #2
+ ext v27.16b, v25.16b, v25.16b, #4
+ ext v28.16b, v25.16b, v25.16b, #6
+ smull v25.4s, v25.4h, v0.h[0]
+ smlal v25.4s, v26.4h, v0.h[1]
+ smlal v25.4s, v27.4h, v0.h[2]
+ smlal v25.4s, v28.4h, v0.h[3]
+ srshl v16.4s, v25.4s, v30.4s // -(6-intermediate_bits)
+ // The intermediates from the horizontal pass fit in 16 bit without
+ // any bias; we could just as well keep them as .4s, but narrowing
+ // them to .4h gives a significant speedup on out of order cores
+ // (at the cost of a smaller slowdown on in-order cores such as A53).
+ xtn v16.4h, v16.4s
+
+ bl L(\type\()_8tap_filter_4)
+ mov v17.8b, v24.8b
+ mov v18.8b, v25.8b
+ bl L(\type\()_8tap_filter_4)
+ mov v19.8b, v24.8b
+ mov v20.8b, v25.8b
+ bl L(\type\()_8tap_filter_4)
+ mov v21.8b, v24.8b
+ mov v22.8b, v25.8b
+
+48:
+ bl L(\type\()_8tap_filter_4)
+ smull v3.4s, v16.4h, v1.h[0]
+ smlal v3.4s, v17.4h, v1.h[1]
+ smlal v3.4s, v18.4h, v1.h[2]
+ smlal v3.4s, v19.4h, v1.h[3]
+ smlal v3.4s, v20.4h, v1.h[4]
+ smlal v3.4s, v21.4h, v1.h[5]
+ smlal v3.4s, v22.4h, v1.h[6]
+ smlal v3.4s, v24.4h, v1.h[7]
+ smull v4.4s, v17.4h, v1.h[0]
+ smlal v4.4s, v18.4h, v1.h[1]
+ smlal v4.4s, v19.4h, v1.h[2]
+ smlal v4.4s, v20.4h, v1.h[3]
+ smlal v4.4s, v21.4h, v1.h[4]
+ smlal v4.4s, v22.4h, v1.h[5]
+ smlal v4.4s, v24.4h, v1.h[6]
+ smlal v4.4s, v25.4h, v1.h[7]
+.ifc \type, put
+ srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
+ srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits)
+ sqxtun v3.4h, v3.4s
+ sqxtun2 v3.8h, v4.4s
+ umin v3.8h, v3.8h, v31.8h
+.else
+ rshrn v3.4h, v3.4s, #6
+ rshrn2 v3.8h, v4.4s, #6
+ sub v3.8h, v3.8h, v29.8h // PREP_BIAS
+.endif
+ subs \h, \h, #2
+ st1 {v3.d}[0], [\dst], \d_strd
+ st1 {v3.d}[1], [\ds2], \d_strd
+ b.le 0f
+ mov v16.8b, v18.8b
+ mov v17.8b, v19.8b
+ mov v18.8b, v20.8b
+ mov v19.8b, v21.8b
+ mov v20.8b, v22.8b
+ mov v21.8b, v24.8b
+ mov v22.8b, v25.8b
+ b 48b
+0:
+ br x15
+
+L(\type\()_8tap_filter_4):
+ ld1 {v24.8h}, [\sr2], \s_strd
+ ld1 {v25.8h}, [\src], \s_strd
+ ext v26.16b, v24.16b, v24.16b, #2
+ ext v27.16b, v24.16b, v24.16b, #4
+ ext v28.16b, v24.16b, v24.16b, #6
+ smull v24.4s, v24.4h, v0.h[0]
+ smlal v24.4s, v26.4h, v0.h[1]
+ smlal v24.4s, v27.4h, v0.h[2]
+ smlal v24.4s, v28.4h, v0.h[3]
+ ext v26.16b, v25.16b, v25.16b, #2
+ ext v27.16b, v25.16b, v25.16b, #4
+ ext v28.16b, v25.16b, v25.16b, #6
+ smull v25.4s, v25.4h, v0.h[0]
+ smlal v25.4s, v26.4h, v0.h[1]
+ smlal v25.4s, v27.4h, v0.h[2]
+ smlal v25.4s, v28.4h, v0.h[3]
+ srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
+ srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
+ xtn v24.4h, v24.4s
+ xtn v25.4h, v25.4s
+ ret
+
+80:
+160:
+320:
+ b.gt 880f
+ add \xmy, \xmy, #2
+ ld1 {v0.8b}, [\xmx]
+ ld1 {v1.s}[0], [\xmy]
+ sub \src, \src, #6
+ sub \src, \src, \s_strd
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30
+ mov \my, \h
+
+164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+
+ ld1 {v27.8h, v28.8h}, [\src], \s_strd
+ smull v24.4s, v27.4h, v0.h[0]
+ smull2 v25.4s, v27.8h, v0.h[0]
+.irpc i, 1234567
+ ext v26.16b, v27.16b, v28.16b, #(2*\i)
+ smlal v24.4s, v26.4h, v0.h[\i]
+ smlal2 v25.4s, v26.8h, v0.h[\i]
+.endr
+ srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
+ srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
+ // The intermediates from the horizontal pass fit in 16 bit without
+ // any bias; we could just as well keep them as .4s, but narrowing
+ // them to .4h gives a significant speedup on out of order cores
+ // (at the cost of a smaller slowdown on in-order cores such as A53),
+ // and conserves register space (no need to clobber v8-v15).
+ xtn v16.4h, v24.4s
+ xtn2 v16.8h, v25.4s
+
+ bl L(\type\()_8tap_filter_8)
+ mov v17.16b, v23.16b
+ mov v18.16b, v24.16b
+
+8:
+ smull v2.4s, v16.4h, v1.h[0]
+ smull2 v3.4s, v16.8h, v1.h[0]
+ bl L(\type\()_8tap_filter_8)
+ smull v4.4s, v17.4h, v1.h[0]
+ smull2 v5.4s, v17.8h, v1.h[0]
+ smlal v2.4s, v17.4h, v1.h[1]
+ smlal2 v3.4s, v17.8h, v1.h[1]
+ smlal v4.4s, v18.4h, v1.h[1]
+ smlal2 v5.4s, v18.8h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal2 v3.4s, v18.8h, v1.h[2]
+ smlal v4.4s, v23.4h, v1.h[2]
+ smlal2 v5.4s, v23.8h, v1.h[2]
+ smlal v2.4s, v23.4h, v1.h[3]
+ smlal2 v3.4s, v23.8h, v1.h[3]
+ smlal v4.4s, v24.4h, v1.h[3]
+ smlal2 v5.4s, v24.8h, v1.h[3]
+.ifc \type, put
+ srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits)
+ srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
+ srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits)
+ srshl v5.4s, v5.4s, v29.4s // -(6+intermediate_bits)
+ sqxtun v2.4h, v2.4s
+ sqxtun2 v2.8h, v3.4s
+ sqxtun v3.4h, v4.4s
+ sqxtun2 v3.8h, v5.4s
+ umin v2.8h, v2.8h, v31.8h
+ umin v3.8h, v3.8h, v31.8h
+.else
+ rshrn v2.4h, v2.4s, #6
+ rshrn2 v2.8h, v3.4s, #6
+ rshrn v3.4h, v4.4s, #6
+ rshrn2 v3.8h, v5.4s, #6
+ sub v2.8h, v2.8h, v29.8h // PREP_BIAS
+ sub v3.8h, v3.8h, v29.8h // PREP_BIAS
+.endif
+ subs \h, \h, #2
+ st1 {v2.8h}, [\dst], \d_strd
+ st1 {v3.8h}, [\ds2], \d_strd
+ b.le 9f
+ mov v16.16b, v18.16b
+ mov v17.16b, v23.16b
+ mov v18.16b, v24.16b
+ b 8b
+9:
+ subs \w, \w, #8
+ b.le 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ msub \src, \s_strd, \xmy, \src
+ msub \dst, \d_strd, \xmy, \dst
+ sub \src, \src, \s_strd, lsl #2
+ mov \h, \my
+ add \src, \src, #16
+ add \dst, \dst, #16
+ b 164b
+
+880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
+640:
+1280:
+ ld1 {v0.8b}, [\xmx]
+ ld1 {v1.8b}, [\xmy]
+ sub \src, \src, #6
+ sub \src, \src, \s_strd
+ sub \src, \src, \s_strd, lsl #1
+ sxtl v0.8h, v0.8b
+ sxtl v1.8h, v1.8b
+ mov x15, x30
+ mov \my, \h
+
+168:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+
+ ld1 {v27.8h, v28.8h}, [\src], \s_strd
+ smull v24.4s, v27.4h, v0.h[0]
+ smull2 v25.4s, v27.8h, v0.h[0]
+.irpc i, 1234567
+ ext v26.16b, v27.16b, v28.16b, #(2*\i)
+ smlal v24.4s, v26.4h, v0.h[\i]
+ smlal2 v25.4s, v26.8h, v0.h[\i]
+.endr
+ srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
+ srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
+ // The intermediates from the horizontal pass fit in 16 bit without
+ // any bias; we could just as well keep them as .4s, but narrowing
+ // them to .4h gives a significant speedup on out of order cores
+ // (at the cost of a smaller slowdown on in-order cores such as A53),
+ // and conserves register space (no need to clobber v8-v15).
+ xtn v16.4h, v24.4s
+ xtn2 v16.8h, v25.4s
+
+ bl L(\type\()_8tap_filter_8)
+ mov v17.16b, v23.16b
+ mov v18.16b, v24.16b
+ bl L(\type\()_8tap_filter_8)
+ mov v19.16b, v23.16b
+ mov v20.16b, v24.16b
+ bl L(\type\()_8tap_filter_8)
+ mov v21.16b, v23.16b
+ mov v22.16b, v24.16b
+
+88:
+ smull v2.4s, v16.4h, v1.h[0]
+ smull2 v3.4s, v16.8h, v1.h[0]
+ bl L(\type\()_8tap_filter_8)
+ smull v4.4s, v17.4h, v1.h[0]
+ smull2 v5.4s, v17.8h, v1.h[0]
+ smlal v2.4s, v17.4h, v1.h[1]
+ smlal2 v3.4s, v17.8h, v1.h[1]
+ smlal v4.4s, v18.4h, v1.h[1]
+ smlal2 v5.4s, v18.8h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal2 v3.4s, v18.8h, v1.h[2]
+ smlal v4.4s, v19.4h, v1.h[2]
+ smlal2 v5.4s, v19.8h, v1.h[2]
+ smlal v2.4s, v19.4h, v1.h[3]
+ smlal2 v3.4s, v19.8h, v1.h[3]
+ smlal v4.4s, v20.4h, v1.h[3]
+ smlal2 v5.4s, v20.8h, v1.h[3]
+ smlal v2.4s, v20.4h, v1.h[4]
+ smlal2 v3.4s, v20.8h, v1.h[4]
+ smlal v4.4s, v21.4h, v1.h[4]
+ smlal2 v5.4s, v21.8h, v1.h[4]
+ smlal v2.4s, v21.4h, v1.h[5]
+ smlal2 v3.4s, v21.8h, v1.h[5]
+ smlal v4.4s, v22.4h, v1.h[5]
+ smlal2 v5.4s, v22.8h, v1.h[5]
+ smlal v2.4s, v22.4h, v1.h[6]
+ smlal2 v3.4s, v22.8h, v1.h[6]
+ smlal v4.4s, v23.4h, v1.h[6]
+ smlal2 v5.4s, v23.8h, v1.h[6]
+ smlal v2.4s, v23.4h, v1.h[7]
+ smlal2 v3.4s, v23.8h, v1.h[7]
+ smlal v4.4s, v24.4h, v1.h[7]
+ smlal2 v5.4s, v24.8h, v1.h[7]
+.ifc \type, put
+ srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits)
+ srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
+ srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits)
+ srshl v5.4s, v5.4s, v29.4s // -(6+intermediate_bits)
+ sqxtun v2.4h, v2.4s
+ sqxtun2 v2.8h, v3.4s
+ sqxtun v3.4h, v4.4s
+ sqxtun2 v3.8h, v5.4s
+ umin v2.8h, v2.8h, v31.8h
+ umin v3.8h, v3.8h, v31.8h
+.else
+ rshrn v2.4h, v2.4s, #6
+ rshrn2 v2.8h, v3.4s, #6
+ rshrn v3.4h, v4.4s, #6
+ rshrn2 v3.8h, v5.4s, #6
+ sub v2.8h, v2.8h, v29.8h // PREP_BIAS
+ sub v3.8h, v3.8h, v29.8h // PREP_BIAS
+.endif
+ subs \h, \h, #2
+ st1 {v2.8h}, [\dst], \d_strd
+ st1 {v3.8h}, [\ds2], \d_strd
+ b.le 9f
+ mov v16.16b, v18.16b
+ mov v17.16b, v19.16b
+ mov v18.16b, v20.16b
+ mov v19.16b, v21.16b
+ mov v20.16b, v22.16b
+ mov v21.16b, v23.16b
+ mov v22.16b, v24.16b
+ b 88b
+9:
+ subs \w, \w, #8
+ b.le 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ msub \src, \s_strd, \xmy, \src
+ msub \dst, \d_strd, \xmy, \dst
+ sub \src, \src, \s_strd, lsl #3
+ mov \h, \my
+ add \src, \src, #16
+ add \dst, \dst, #16
+ b 168b
+0:
+ br x15
+
+L(\type\()_8tap_filter_8):
+ ld1 {v4.8h, v5.8h}, [\sr2], \s_strd
+ ld1 {v6.8h, v7.8h}, [\src], \s_strd
+ smull v25.4s, v4.4h, v0.h[0]
+ smull2 v26.4s, v4.8h, v0.h[0]
+ smull v27.4s, v6.4h, v0.h[0]
+ smull2 v28.4s, v6.8h, v0.h[0]
+.irpc i, 1234567
+ ext v23.16b, v4.16b, v5.16b, #(2*\i)
+ ext v24.16b, v6.16b, v7.16b, #(2*\i)
+ smlal v25.4s, v23.4h, v0.h[\i]
+ smlal2 v26.4s, v23.8h, v0.h[\i]
+ smlal v27.4s, v24.4h, v0.h[\i]
+ smlal2 v28.4s, v24.8h, v0.h[\i]
+.endr
+ srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits)
+ srshl v26.4s, v26.4s, v30.4s // -(6-intermediate_bits)
+ srshl v27.4s, v27.4s, v30.4s // -(6-intermediate_bits)
+ srshl v28.4s, v28.4s, v30.4s // -(6-intermediate_bits)
+ xtn v23.4h, v25.4s
+ xtn2 v23.8h, v26.4s
+ xtn v24.4h, v27.4s
+ xtn2 v24.8h, v28.4s
+ ret
+
+L(\type\()_8tap_hv_tbl):
+ .hword L(\type\()_8tap_hv_tbl) - 1280b
+ .hword L(\type\()_8tap_hv_tbl) - 640b
+ .hword L(\type\()_8tap_hv_tbl) - 320b
+ .hword L(\type\()_8tap_hv_tbl) - 160b
+ .hword L(\type\()_8tap_hv_tbl) - 80b
+ .hword L(\type\()_8tap_hv_tbl) - 40b
+ .hword L(\type\()_8tap_hv_tbl) - 20b
+ .hword 0
+endfunc
+
+
+function \type\()_bilin_16bpc_neon, export=1
+.ifc \bdmax, w8
+ ldr w8, [sp]
+.endif
+ dup v1.8h, \mx
+ dup v3.8h, \my
+ mov w10, #16
+ sub w9, w10, \mx
+ sub w10, w10, \my
+ dup v0.8h, w9
+ dup v2.8h, w10
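+// v0/v1 hold 16-mx/mx (horizontal weights), v2/v3 hold 16-my/my (vertical)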
+.ifc \type, prep
+ uxtw \d_strd, \w
+ lsl \d_strd, \d_strd, #1
+.endif
+
+ clz \bdmax, \bdmax // bitdepth_max
+ clz w9, \w
+ sub \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18
+ mov w11, #4
+ sub w9, w9, #24
+ sub w11, w11, \bdmax // 4 - intermediate_bits
+ add w12, \bdmax, #4 // 4 + intermediate_bits
+ cbnz \mx, L(\type\()_bilin_h)
+ cbnz \my, L(\type\()_bilin_v)
+ b \type\()_neon
+
+L(\type\()_bilin_h):
+ cbnz \my, L(\type\()_bilin_hv)
+
+ adr x10, L(\type\()_bilin_h_tbl)
+ dup v31.8h, w11 // 4 - intermediate_bits
+ ldrh w9, [x10, x9, lsl #1]
+ neg v31.8h, v31.8h // -(4-intermediate_bits)
+.ifc \type, put
+ dup v30.8h, \bdmax // intermediate_bits
+.else
+ movi v29.8h, #(PREP_BIAS >> 8), lsl #8
+.endif
+ sub x10, x10, w9, uxtw
+.ifc \type, put
+ neg v30.8h, v30.8h // -intermediate_bits
+.endif
+ br x10
+
+20: // 2xN h
+.ifc \type, put
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+2:
+ ld1 {v4.4h}, [\src], \s_strd
+ ld1 {v6.4h}, [\sr2], \s_strd
+ ext v5.8b, v4.8b, v4.8b, #2
+ ext v7.8b, v6.8b, v6.8b, #2
+ trn1 v4.2s, v4.2s, v6.2s
+ trn1 v5.2s, v5.2s, v7.2s
+ subs \h, \h, #2
+ mul v4.4h, v4.4h, v0.4h
+ mla v4.4h, v5.4h, v1.4h
+ urshl v4.4h, v4.4h, v31.4h
+ urshl v4.4h, v4.4h, v30.4h
+ st1 {v4.s}[0], [\dst], \d_strd
+ st1 {v4.s}[1], [\ds2], \d_strd
+ b.gt 2b
+ ret
+.endif
+
+40: // 4xN h
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+4:
+ ld1 {v4.8h}, [\src], \s_strd
+ ld1 {v6.8h}, [\sr2], \s_strd
+ ext v5.16b, v4.16b, v4.16b, #2
+ ext v7.16b, v6.16b, v6.16b, #2
+ trn1 v4.2d, v4.2d, v6.2d
+ trn1 v5.2d, v5.2d, v7.2d
+ subs \h, \h, #2
+ mul v4.8h, v4.8h, v0.8h
+ mla v4.8h, v5.8h, v1.8h
+ urshl v4.8h, v4.8h, v31.8h
+.ifc \type, put
+ urshl v4.8h, v4.8h, v30.8h
+.else
+ sub v4.8h, v4.8h, v29.8h
+.endif
+ st1 {v4.d}[0], [\dst], \d_strd
+ st1 {v4.d}[1], [\ds2], \d_strd
+ b.gt 4b
+ ret
+
+80: // 8xN h
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \d_strd, \d_strd, #1
+ lsl \s_strd, \s_strd, #1
+8:
+ ldr h5, [\src, #16]
+ ldr h7, [\sr2, #16]
+ ld1 {v4.8h}, [\src], \s_strd
+ ld1 {v6.8h}, [\sr2], \s_strd
+ ext v5.16b, v4.16b, v5.16b, #2
+ ext v7.16b, v6.16b, v7.16b, #2
+ subs \h, \h, #2
+ mul v4.8h, v4.8h, v0.8h
+ mla v4.8h, v5.8h, v1.8h
+ mul v6.8h, v6.8h, v0.8h
+ mla v6.8h, v7.8h, v1.8h
+ urshl v4.8h, v4.8h, v31.8h
+ urshl v6.8h, v6.8h, v31.8h
+.ifc \type, put
+ urshl v4.8h, v4.8h, v30.8h
+ urshl v6.8h, v6.8h, v30.8h
+.else
+ sub v4.8h, v4.8h, v29.8h
+ sub v6.8h, v6.8h, v29.8h
+.endif
+ st1 {v4.8h}, [\dst], \d_strd
+ st1 {v6.8h}, [\ds2], \d_strd
+ b.gt 8b
+ ret
+160:
+320:
+640:
+1280: // 16xN, 32xN, ... h
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+
+ sub \s_strd, \s_strd, \w, uxtw #1
+ sub \s_strd, \s_strd, #16
+.ifc \type, put
+ lsl \d_strd, \d_strd, #1
+ sub \d_strd, \d_strd, \w, uxtw #1
+.endif
+161:
+ ld1 {v16.8h}, [\src], #16
+ ld1 {v21.8h}, [\sr2], #16
+ mov \mx, \w
+
+16:
+ ld1 {v17.8h, v18.8h}, [\src], #32
+ ld1 {v22.8h, v23.8h}, [\sr2], #32
+ ext v19.16b, v16.16b, v17.16b, #2
+ ext v20.16b, v17.16b, v18.16b, #2
+ ext v24.16b, v21.16b, v22.16b, #2
+ ext v25.16b, v22.16b, v23.16b, #2
+ mul v16.8h, v16.8h, v0.8h
+ mla v16.8h, v19.8h, v1.8h
+ mul v17.8h, v17.8h, v0.8h
+ mla v17.8h, v20.8h, v1.8h
+ mul v21.8h, v21.8h, v0.8h
+ mla v21.8h, v24.8h, v1.8h
+ mul v22.8h, v22.8h, v0.8h
+ mla v22.8h, v25.8h, v1.8h
+ urshl v16.8h, v16.8h, v31.8h
+ urshl v17.8h, v17.8h, v31.8h
+ urshl v21.8h, v21.8h, v31.8h
+ urshl v22.8h, v22.8h, v31.8h
+ subs \mx, \mx, #16
+.ifc \type, put
+ urshl v16.8h, v16.8h, v30.8h
+ urshl v17.8h, v17.8h, v30.8h
+ urshl v21.8h, v21.8h, v30.8h
+ urshl v22.8h, v22.8h, v30.8h
+.else
+ sub v16.8h, v16.8h, v29.8h
+ sub v17.8h, v17.8h, v29.8h
+ sub v21.8h, v21.8h, v29.8h
+ sub v22.8h, v22.8h, v29.8h
+.endif
+ st1 {v16.8h, v17.8h}, [\dst], #32
+ st1 {v21.8h, v22.8h}, [\ds2], #32
+ b.le 9f
+
+ mov v16.16b, v18.16b
+ mov v21.16b, v23.16b
+ b 16b
+
+9:
+ add \dst, \dst, \d_strd
+ add \ds2, \ds2, \d_strd
+ add \src, \src, \s_strd
+ add \sr2, \sr2, \s_strd
+
+ subs \h, \h, #2
+ b.gt 161b
+ ret
+
+L(\type\()_bilin_h_tbl):
+ .hword L(\type\()_bilin_h_tbl) - 1280b
+ .hword L(\type\()_bilin_h_tbl) - 640b
+ .hword L(\type\()_bilin_h_tbl) - 320b
+ .hword L(\type\()_bilin_h_tbl) - 160b
+ .hword L(\type\()_bilin_h_tbl) - 80b
+ .hword L(\type\()_bilin_h_tbl) - 40b
+ .hword L(\type\()_bilin_h_tbl) - 20b
+ .hword 0
+
+
+L(\type\()_bilin_v):
+ cmp \h, #4
+ adr x10, L(\type\()_bilin_v_tbl)
+.ifc \type, prep
+ dup v31.8h, w11 // 4 - intermediate_bits
+.endif
+ ldrh w9, [x10, x9, lsl #1]
+.ifc \type, prep
+ movi v29.8h, #(PREP_BIAS >> 8), lsl #8
+ neg v31.8h, v31.8h // -(4-intermediate_bits)
+.endif
+ sub x10, x10, w9, uxtw
+ br x10
+
+20: // 2xN v
+.ifc \type, put
+ cmp \h, #2
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ // 2x2 v
+ ld1 {v16.s}[0], [\src], \s_strd
+ b.gt 24f
+ ld1 {v17.s}[0], [\sr2], \s_strd
+ ld1 {v18.s}[0], [\src], \s_strd
+ trn1 v16.2s, v16.2s, v17.2s
+ trn1 v17.2s, v17.2s, v18.2s
+ mul v4.4h, v16.4h, v2.4h
+ mla v4.4h, v17.4h, v3.4h
+ urshr v4.8h, v4.8h, #4
+ st1 {v4.s}[0], [\dst]
+ st1 {v4.s}[1], [\ds2]
+ ret
+24: // 2x4, 2x8, ... v
+ ld1 {v17.s}[0], [\sr2], \s_strd
+ ld1 {v18.s}[0], [\src], \s_strd
+ ld1 {v19.s}[0], [\sr2], \s_strd
+ ld1 {v20.s}[0], [\src], \s_strd
+ trn1 v16.2s, v16.2s, v17.2s
+ trn1 v17.2s, v17.2s, v18.2s
+ trn1 v18.2s, v18.2s, v19.2s
+ trn1 v19.2s, v19.2s, v20.2s
+ trn1 v16.2d, v16.2d, v18.2d
+ trn1 v17.2d, v17.2d, v19.2d
+ mul v4.8h, v16.8h, v2.8h
+ mla v4.8h, v17.8h, v3.8h
+ subs \h, \h, #4
+ urshr v4.8h, v4.8h, #4
+ st1 {v4.s}[0], [\dst], \d_strd
+ st1 {v4.s}[1], [\ds2], \d_strd
+ st1 {v4.s}[2], [\dst], \d_strd
+ st1 {v4.s}[3], [\ds2], \d_strd
+ b.le 0f
+ mov v16.8b, v20.8b
+ b 24b
+0:
+ ret
+.endif
+
+40: // 4xN v
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ ld1 {v16.4h}, [\src], \s_strd
+4:
+ ld1 {v17.4h}, [\sr2], \s_strd
+ ld1 {v18.4h}, [\src], \s_strd
+ trn1 v16.2d, v16.2d, v17.2d
+ trn1 v17.2d, v17.2d, v18.2d
+ mul v4.8h, v16.8h, v2.8h
+ mla v4.8h, v17.8h, v3.8h
+ subs \h, \h, #2
+.ifc \type, put
+ urshr v4.8h, v4.8h, #4
+.else
+ urshl v4.8h, v4.8h, v31.8h
+ sub v4.8h, v4.8h, v29.8h
+.endif
+ st1 {v4.d}[0], [\dst], \d_strd
+ st1 {v4.d}[1], [\ds2], \d_strd
+ b.le 0f
+ mov v16.8b, v18.8b
+ b 4b
+0:
+ ret
+
+80: // 8xN v
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+ ld1 {v16.8h}, [\src], \s_strd
+8:
+ ld1 {v17.8h}, [\sr2], \s_strd
+ ld1 {v18.8h}, [\src], \s_strd
+ mul v4.8h, v16.8h, v2.8h
+ mla v4.8h, v17.8h, v3.8h
+ mul v5.8h, v17.8h, v2.8h
+ mla v5.8h, v18.8h, v3.8h
+ subs \h, \h, #2
+.ifc \type, put
+ urshr v4.8h, v4.8h, #4
+ urshr v5.8h, v5.8h, #4
+.else
+ urshl v4.8h, v4.8h, v31.8h
+ urshl v5.8h, v5.8h, v31.8h
+ sub v4.8h, v4.8h, v29.8h
+ sub v5.8h, v5.8h, v29.8h
+.endif
+ st1 {v4.8h}, [\dst], \d_strd
+ st1 {v5.8h}, [\ds2], \d_strd
+ b.le 0f
+ mov v16.16b, v18.16b
+ b 8b
+0:
+ ret
+
+160: // 16xN, 32xN, ...
+320:
+640:
+1280:
+ mov \my, \h
+1:
+ add \ds2, \dst, \d_strd
+ add \sr2, \src, \s_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ ld1 {v16.8h, v17.8h}, [\src], \s_strd
+2:
+ ld1 {v18.8h, v19.8h}, [\sr2], \s_strd
+ ld1 {v20.8h, v21.8h}, [\src], \s_strd
+ mul v4.8h, v16.8h, v2.8h
+ mla v4.8h, v18.8h, v3.8h
+ mul v5.8h, v17.8h, v2.8h
+ mla v5.8h, v19.8h, v3.8h
+ mul v6.8h, v18.8h, v2.8h
+ mla v6.8h, v20.8h, v3.8h
+ mul v7.8h, v19.8h, v2.8h
+ mla v7.8h, v21.8h, v3.8h
+ subs \h, \h, #2
+.ifc \type, put
+ urshr v4.8h, v4.8h, #4
+ urshr v5.8h, v5.8h, #4
+ urshr v6.8h, v6.8h, #4
+ urshr v7.8h, v7.8h, #4
+.else
+ urshl v4.8h, v4.8h, v31.8h
+ urshl v5.8h, v5.8h, v31.8h
+ urshl v6.8h, v6.8h, v31.8h
+ urshl v7.8h, v7.8h, v31.8h
+ sub v4.8h, v4.8h, v29.8h
+ sub v5.8h, v5.8h, v29.8h
+ sub v6.8h, v6.8h, v29.8h
+ sub v7.8h, v7.8h, v29.8h
+.endif
+ st1 {v4.8h, v5.8h}, [\dst], \d_strd
+ st1 {v6.8h, v7.8h}, [\ds2], \d_strd
+ b.le 9f
+ mov v16.16b, v20.16b
+ mov v17.16b, v21.16b
+ b 2b
+9:
+ subs \w, \w, #16
+ b.le 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ msub \src, \s_strd, \xmy, \src
+ msub \dst, \d_strd, \xmy, \dst
+ sub \src, \src, \s_strd, lsl #1
+ mov \h, \my
+ add \src, \src, #32
+ add \dst, \dst, #32
+ b 1b
+0:
+ ret
+
+L(\type\()_bilin_v_tbl):
+ .hword L(\type\()_bilin_v_tbl) - 1280b
+ .hword L(\type\()_bilin_v_tbl) - 640b
+ .hword L(\type\()_bilin_v_tbl) - 320b
+ .hword L(\type\()_bilin_v_tbl) - 160b
+ .hword L(\type\()_bilin_v_tbl) - 80b
+ .hword L(\type\()_bilin_v_tbl) - 40b
+ .hword L(\type\()_bilin_v_tbl) - 20b
+ .hword 0
+
+L(\type\()_bilin_hv):
+ adr x10, L(\type\()_bilin_hv_tbl)
+ dup v31.8h, w11 // 4 - intermediate_bits
+ ldrh w9, [x10, x9, lsl #1]
+ neg v31.8h, v31.8h // -(4-intermediate_bits)
+.ifc \type, put
+ dup v30.4s, w12 // 4 + intermediate_bits
+.else
+ movi v29.8h, #(PREP_BIAS >> 8), lsl #8
+.endif
+ sub x10, x10, w9, uxtw
+.ifc \type, put
+ neg v30.4s, v30.4s // -(4+intermediate_bits)
+.endif
+ br x10
+
+20: // 2xN hv
+.ifc \type, put
+ add \sr2, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ ld1 {v20.4h}, [\src], \s_strd
+ ext v21.8b, v20.8b, v20.8b, #2
+ mul v16.4h, v20.4h, v0.4h
+ mla v16.4h, v21.4h, v1.4h
+ urshl v16.4h, v16.4h, v31.4h
+
+2:
+ ld1 {v22.4h}, [\sr2], \s_strd
+ ld1 {v24.4h}, [\src], \s_strd
+ ext v23.8b, v22.8b, v22.8b, #2
+ ext v25.8b, v24.8b, v24.8b, #2
+ trn1 v22.2s, v22.2s, v24.2s
+ trn1 v23.2s, v23.2s, v25.2s
+ mul v17.4h, v22.4h, v0.4h
+ mla v17.4h, v23.4h, v1.4h
+ urshl v17.4h, v17.4h, v31.4h
+
+ trn1 v16.2s, v16.2s, v17.2s
+
+ umull v4.4s, v16.4h, v2.4h
+ umlal v4.4s, v17.4h, v3.4h
+ urshl v4.4s, v4.4s, v30.4s
+ xtn v4.4h, v4.4s
+ subs \h, \h, #2
+ st1 {v4.s}[0], [\dst], \d_strd
+ st1 {v4.s}[1], [\ds2], \d_strd
+ b.le 0f
+ trn2 v16.2s, v17.2s, v17.2s
+ b 2b
+0:
+ ret
+.endif
+
+40: // 4xN hv
+ add \sr2, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ ld1 {v20.8h}, [\src], \s_strd
+ ext v21.16b, v20.16b, v20.16b, #2
+ mul v16.4h, v20.4h, v0.4h
+ mla v16.4h, v21.4h, v1.4h
+ urshl v16.4h, v16.4h, v31.4h
+
+4:
+ ld1 {v22.8h}, [\sr2], \s_strd
+ ld1 {v24.8h}, [\src], \s_strd
+ ext v23.16b, v22.16b, v22.16b, #2
+ ext v25.16b, v24.16b, v24.16b, #2
+ trn1 v22.2d, v22.2d, v24.2d
+ trn1 v23.2d, v23.2d, v25.2d
+ mul v17.8h, v22.8h, v0.8h
+ mla v17.8h, v23.8h, v1.8h
+ urshl v17.8h, v17.8h, v31.8h
+
+ trn1 v16.2d, v16.2d, v17.2d
+
+ umull v4.4s, v16.4h, v2.4h
+ umlal v4.4s, v17.4h, v3.4h
+ umull2 v5.4s, v16.8h, v2.8h
+ umlal2 v5.4s, v17.8h, v3.8h
+.ifc \type, put
+ urshl v4.4s, v4.4s, v30.4s
+ urshl v5.4s, v5.4s, v30.4s
+ xtn v4.4h, v4.4s
+ xtn2 v4.8h, v5.4s
+.else
+ rshrn v4.4h, v4.4s, #4
+ rshrn2 v4.8h, v5.4s, #4
+ sub v4.8h, v4.8h, v29.8h
+.endif
+ subs \h, \h, #2
+ st1 {v4.d}[0], [\dst], \d_strd
+ st1 {v4.d}[1], [\ds2], \d_strd
+ b.le 0f
+ trn2 v16.2d, v17.2d, v17.2d
+ b 4b
+0:
+ ret
+
+80: // 8xN, 16xN, ... hv
+160:
+320:
+640:
+1280:
+ mov \my, \h
+
+1:
+ add \sr2, \src, \s_strd
+ add \ds2, \dst, \d_strd
+ lsl \s_strd, \s_strd, #1
+ lsl \d_strd, \d_strd, #1
+
+ ldr h21, [\src, #16]
+ ld1 {v20.8h}, [\src], \s_strd
+ ext v21.16b, v20.16b, v21.16b, #2
+ mul v16.8h, v20.8h, v0.8h
+ mla v16.8h, v21.8h, v1.8h
+ urshl v16.8h, v16.8h, v31.8h
+
+2:
+ ldr h23, [\sr2, #16]
+ ld1 {v22.8h}, [\sr2], \s_strd
+ ldr h25, [\src, #16]
+ ld1 {v24.8h}, [\src], \s_strd
+ ext v23.16b, v22.16b, v23.16b, #2
+ ext v25.16b, v24.16b, v25.16b, #2
+ mul v17.8h, v22.8h, v0.8h
+ mla v17.8h, v23.8h, v1.8h
+ mul v18.8h, v24.8h, v0.8h
+ mla v18.8h, v25.8h, v1.8h
+ urshl v17.8h, v17.8h, v31.8h
+ urshl v18.8h, v18.8h, v31.8h
+
+ umull v4.4s, v16.4h, v2.4h
+ umlal v4.4s, v17.4h, v3.4h
+ umull2 v5.4s, v16.8h, v2.8h
+ umlal2 v5.4s, v17.8h, v3.8h
+ umull v6.4s, v17.4h, v2.4h
+ umlal v6.4s, v18.4h, v3.4h
+ umull2 v7.4s, v17.8h, v2.8h
+ umlal2 v7.4s, v18.8h, v3.8h
+.ifc \type, put
+ urshl v4.4s, v4.4s, v30.4s
+ urshl v5.4s, v5.4s, v30.4s
+ urshl v6.4s, v6.4s, v30.4s
+ urshl v7.4s, v7.4s, v30.4s
+ xtn v4.4h, v4.4s
+ xtn2 v4.8h, v5.4s
+ xtn v5.4h, v6.4s
+ xtn2 v5.8h, v7.4s
+.else
+ rshrn v4.4h, v4.4s, #4
+ rshrn2 v4.8h, v5.4s, #4
+ rshrn v5.4h, v6.4s, #4
+ rshrn2 v5.8h, v7.4s, #4
+ sub v4.8h, v4.8h, v29.8h
+ sub v5.8h, v5.8h, v29.8h
+.endif
+ subs \h, \h, #2
+ st1 {v4.8h}, [\dst], \d_strd
+ st1 {v5.8h}, [\ds2], \d_strd
+ b.le 9f
+ mov v16.16b, v18.16b
+ b 2b
+9:
+ subs \w, \w, #8
+ b.le 0f
+ asr \s_strd, \s_strd, #1
+ asr \d_strd, \d_strd, #1
+ msub \src, \s_strd, \xmy, \src
+ msub \dst, \d_strd, \xmy, \dst
+ sub \src, \src, \s_strd, lsl #1
+ mov \h, \my
+ add \src, \src, #16
+ add \dst, \dst, #16
+ b 1b
+0:
+ ret
+
+L(\type\()_bilin_hv_tbl):
+ .hword L(\type\()_bilin_hv_tbl) - 1280b
+ .hword L(\type\()_bilin_hv_tbl) - 640b
+ .hword L(\type\()_bilin_hv_tbl) - 320b
+ .hword L(\type\()_bilin_hv_tbl) - 160b
+ .hword L(\type\()_bilin_hv_tbl) - 80b
+ .hword L(\type\()_bilin_hv_tbl) - 40b
+ .hword L(\type\()_bilin_hv_tbl) - 20b
+ .hword 0
+endfunc
+.endm
+
+filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10
+filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10
--- a/src/arm/mc_init_tmpl.c
+++ b/src/arm/mc_init_tmpl.c
@@ -75,7 +75,7 @@
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
-#if BITDEPTH == 8
+#if BITDEPTH == 8 || ARCH_AARCH64
init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, neon);
init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon);
init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon);
@@ -97,9 +97,7 @@
init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, neon);
init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, neon);
init_mct_fn(FILTER_2D_BILINEAR, bilin, neon);
-#endif
-#if BITDEPTH == 8 || ARCH_AARCH64
c->avg = BF(dav1d_avg, neon);
c->w_avg = BF(dav1d_w_avg, neon);
c->mask = BF(dav1d_mask, neon);