ref: 41a58e644010c51d0c1a0b4380bc974295718a6f
parent: 59c31e7737f4b03047acd851102597c15e185d33
author: Martin Storsjö <[email protected]>
date: Sun Mar 15 19:57:30 EDT 2020
arm64: ipred: Add NEON implementation of ipred for 16 bpc The FILTER_PRED function is templated and has two separate instantations for 10 and 12 bit separately. (They're switched between using a runtime check on entry to the function.)
--- /dev/null
+++ b/src/arm/64/ipred16.S
@@ -1,0 +1,2834 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// void ipred_dc_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height,
+// const int bitdepth_max);
+function ipred_dc_128_16bpc_neon, export=1
+ ldr w8, [sp]
+ clz w3, w3
+ adr x5, L(ipred_dc_128_tbl)
+ sub w3, w3, #25
+ ldrh w3, [x5, w3, uxtw #1]
+ dup v0.8h, w8
+ sub x5, x5, w3, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ urshr v0.8h, v0.8h, #1
+ br x5
+4:
+ st1 {v0.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ b.gt 4b
+ ret
+8:
+ st1 {v0.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ b.gt 8b
+ ret
+160:
+ mov v1.16b, v0.16b
+16:
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v0.8h, v1.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v0.8h, v1.8h}, [x6], x1
+ b.gt 16b
+ ret
+320:
+ mov v1.16b, v0.16b
+ mov v2.16b, v0.16b
+ mov v3.16b, v0.16b
+32:
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ b.gt 32b
+ ret
+640:
+ mov v1.16b, v0.16b
+ mov v2.16b, v0.16b
+ mov v3.16b, v0.16b
+ sub x1, x1, #64
+64:
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ b.gt 64b
+ ret
+
+L(ipred_dc_128_tbl):
+ .hword L(ipred_dc_128_tbl) - 640b
+ .hword L(ipred_dc_128_tbl) - 320b
+ .hword L(ipred_dc_128_tbl) - 160b
+ .hword L(ipred_dc_128_tbl) - 8b
+ .hword L(ipred_dc_128_tbl) - 4b
+endfunc
+
+// void ipred_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_v_16bpc_neon, export=1
+ clz w3, w3
+ adr x5, L(ipred_v_tbl)
+ sub w3, w3, #25
+ ldrh w3, [x5, w3, uxtw #1]
+ add x2, x2, #2
+ sub x5, x5, w3, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+40:
+ ld1 {v0.4h}, [x2]
+4:
+ st1 {v0.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ b.gt 4b
+ ret
+80:
+ ld1 {v0.8h}, [x2]
+8:
+ st1 {v0.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ b.gt 8b
+ ret
+160:
+ ld1 {v0.8h, v1.8h}, [x2]
+16:
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v0.8h, v1.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v0.8h, v1.8h}, [x6], x1
+ b.gt 16b
+ ret
+320:
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
+32:
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ b.gt 32b
+ ret
+640:
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
+ sub x1, x1, #64
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
+64:
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
+ b.gt 64b
+ ret
+
+L(ipred_v_tbl):
+ .hword L(ipred_v_tbl) - 640b
+ .hword L(ipred_v_tbl) - 320b
+ .hword L(ipred_v_tbl) - 160b
+ .hword L(ipred_v_tbl) - 80b
+ .hword L(ipred_v_tbl) - 40b
+endfunc
+
+// void ipred_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_h_16bpc_neon, export=1
+ clz w3, w3
+ adr x5, L(ipred_h_tbl)
+ sub w3, w3, #25
+ ldrh w3, [x5, w3, uxtw #1]
+ sub x2, x2, #8
+ sub x5, x5, w3, uxtw
+ mov x7, #-8
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+4:
+ ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
+ st1 {v3.4h}, [x0], x1
+ st1 {v2.4h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v1.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ b.gt 4b
+ ret
+8:
+ ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
+ st1 {v3.8h}, [x0], x1
+ st1 {v2.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v1.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ b.gt 8b
+ ret
+16:
+ ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
+ str q3, [x0, #16]
+ str q2, [x6, #16]
+ st1 {v3.8h}, [x0], x1
+ st1 {v2.8h}, [x6], x1
+ subs w4, w4, #4
+ str q1, [x0, #16]
+ str q0, [x6, #16]
+ st1 {v1.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ b.gt 16b
+ ret
+32:
+ ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
+ str q3, [x0, #16]
+ str q2, [x6, #16]
+ stp q3, q3, [x0, #32]
+ stp q2, q2, [x6, #32]
+ st1 {v3.8h}, [x0], x1
+ st1 {v2.8h}, [x6], x1
+ subs w4, w4, #4
+ str q1, [x0, #16]
+ str q0, [x6, #16]
+ stp q1, q1, [x0, #32]
+ stp q0, q0, [x6, #32]
+ st1 {v1.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ b.gt 32b
+ ret
+64:
+ ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
+ str q3, [x0, #16]
+ str q2, [x6, #16]
+ stp q3, q3, [x0, #32]
+ stp q2, q2, [x6, #32]
+ stp q3, q3, [x0, #64]
+ stp q2, q2, [x6, #64]
+ stp q3, q3, [x0, #96]
+ stp q2, q2, [x6, #96]
+ st1 {v3.8h}, [x0], x1
+ st1 {v2.8h}, [x6], x1
+ subs w4, w4, #4
+ str q1, [x0, #16]
+ str q0, [x6, #16]
+ stp q1, q1, [x0, #32]
+ stp q0, q0, [x6, #32]
+ stp q1, q1, [x0, #64]
+ stp q0, q0, [x6, #64]
+ stp q1, q1, [x0, #96]
+ stp q0, q0, [x6, #96]
+ st1 {v1.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ b.gt 64b
+ ret
+
+L(ipred_h_tbl):
+ .hword L(ipred_h_tbl) - 64b
+ .hword L(ipred_h_tbl) - 32b
+ .hword L(ipred_h_tbl) - 16b
+ .hword L(ipred_h_tbl) - 8b
+ .hword L(ipred_h_tbl) - 4b
+endfunc
+
+// void ipred_dc_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_top_16bpc_neon, export=1
+ clz w3, w3
+ adr x5, L(ipred_dc_top_tbl)
+ sub w3, w3, #25
+ ldrh w3, [x5, w3, uxtw #1]
+ add x2, x2, #2
+ sub x5, x5, w3, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+40:
+ ld1 {v0.4h}, [x2]
+ addv h0, v0.4h
+ urshr v0.4h, v0.4h, #2
+ dup v0.4h, v0.h[0]
+4:
+ st1 {v0.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ b.gt 4b
+ ret
+80:
+ ld1 {v0.8h}, [x2]
+ addv h0, v0.8h
+ urshr v0.4h, v0.4h, #3
+ dup v0.8h, v0.h[0]
+8:
+ st1 {v0.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ b.gt 8b
+ ret
+160:
+ ld1 {v0.8h, v1.8h}, [x2]
+ addp v0.8h, v0.8h, v1.8h
+ addv h0, v0.8h
+ urshr v2.4h, v0.4h, #4
+ dup v0.8h, v2.h[0]
+ dup v1.8h, v2.h[0]
+16:
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v0.8h, v1.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v0.8h, v1.8h}, [x6], x1
+ b.gt 16b
+ ret
+320:
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
+ addp v0.8h, v0.8h, v1.8h
+ addp v2.8h, v2.8h, v3.8h
+ addp v0.8h, v0.8h, v2.8h
+ uaddlv s0, v0.8h
+ rshrn v4.4h, v0.4s, #5
+ dup v0.8h, v4.h[0]
+ dup v1.8h, v4.h[0]
+ dup v2.8h, v4.h[0]
+ dup v3.8h, v4.h[0]
+32:
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ b.gt 32b
+ ret
+640:
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
+ addp v0.8h, v0.8h, v1.8h
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v6.8h, v6.8h, v7.8h
+ addp v0.8h, v0.8h, v2.8h
+ addp v4.8h, v4.8h, v6.8h
+ addp v0.8h, v0.8h, v4.8h
+ uaddlv s0, v0.8h
+ rshrn v4.4h, v0.4s, #6
+ sub x1, x1, #64
+ dup v0.8h, v4.h[0]
+ dup v1.8h, v4.h[0]
+ dup v2.8h, v4.h[0]
+ dup v3.8h, v4.h[0]
+64:
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ b.gt 64b
+ ret
+
+L(ipred_dc_top_tbl):
+ .hword L(ipred_dc_top_tbl) - 640b
+ .hword L(ipred_dc_top_tbl) - 320b
+ .hword L(ipred_dc_top_tbl) - 160b
+ .hword L(ipred_dc_top_tbl) - 80b
+ .hword L(ipred_dc_top_tbl) - 40b
+endfunc
+
+// void ipred_dc_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_left_16bpc_neon, export=1
+ sub x2, x2, w4, uxtw #1
+ clz w3, w3
+ clz w7, w4
+ adr x5, L(ipred_dc_left_tbl)
+ sub w3, w3, #20 // 25 leading bits, minus table offset 5
+ sub w7, w7, #25
+ ldrh w3, [x5, w3, uxtw #1]
+ ldrh w7, [x5, w7, uxtw #1]
+ sub x3, x5, w3, uxtw
+ sub x5, x5, w7, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+
+L(ipred_dc_left_h4):
+ ld1 {v0.4h}, [x2]
+ addv h0, v0.4h
+ urshr v0.4h, v0.4h, #2
+ dup v0.8h, v0.h[0]
+ br x3
+L(ipred_dc_left_w4):
+ st1 {v0.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ b.gt L(ipred_dc_left_w4)
+ ret
+
+L(ipred_dc_left_h8):
+ ld1 {v0.8h}, [x2]
+ addv h0, v0.8h
+ urshr v0.4h, v0.4h, #3
+ dup v0.8h, v0.h[0]
+ br x3
+L(ipred_dc_left_w8):
+ st1 {v0.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ b.gt L(ipred_dc_left_w8)
+ ret
+
+L(ipred_dc_left_h16):
+ ld1 {v0.8h, v1.8h}, [x2]
+ addp v0.8h, v0.8h, v1.8h
+ addv h0, v0.8h
+ urshr v2.4h, v0.4h, #4
+ dup v0.8h, v2.h[0]
+ dup v1.8h, v2.h[0]
+ br x3
+L(ipred_dc_left_w16):
+ mov v1.16b, v0.16b
+1:
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v0.8h, v1.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v0.8h, v1.8h}, [x6], x1
+ b.gt 1b
+ ret
+
+L(ipred_dc_left_h32):
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
+ addp v0.8h, v0.8h, v1.8h
+ addp v2.8h, v2.8h, v3.8h
+ addp v0.8h, v0.8h, v2.8h
+ uaddlp v0.4s, v0.8h
+ addv s0, v0.4s
+ rshrn v4.4h, v0.4s, #5
+ dup v0.8h, v4.h[0]
+ br x3
+L(ipred_dc_left_w32):
+ mov v1.16b, v0.16b
+ mov v2.16b, v0.16b
+ mov v3.16b, v0.16b
+1:
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ b.gt 1b
+ ret
+
+L(ipred_dc_left_h64):
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
+ addp v0.8h, v0.8h, v1.8h
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v6.8h, v6.8h, v7.8h
+ addp v0.8h, v0.8h, v2.8h
+ addp v4.8h, v4.8h, v6.8h
+ addp v0.8h, v0.8h, v4.8h
+ uaddlv s0, v0.8h
+ rshrn v4.4h, v0.4s, #6
+ dup v0.8h, v4.h[0]
+ br x3
+L(ipred_dc_left_w64):
+ mov v1.16b, v0.16b
+ mov v2.16b, v0.16b
+ mov v3.16b, v0.16b
+ sub x1, x1, #64
+1:
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ b.gt 1b
+ ret
+
+L(ipred_dc_left_tbl):
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8)
+ .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4)
+endfunc
+
+// void ipred_dc_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_dc_16bpc_neon, export=1
+ sub x2, x2, w4, uxtw #1
+ add w7, w3, w4 // width + height
+ clz w3, w3
+ clz w6, w4
+ dup v16.4s, w7 // width + height
+ adr x5, L(ipred_dc_tbl)
+ rbit w7, w7 // rbit(width + height)
+ sub w3, w3, #20 // 25 leading bits, minus table offset 5
+ sub w6, w6, #25
+ clz w7, w7 // ctz(width + height)
+ ldrh w3, [x5, w3, uxtw #1]
+ ldrh w6, [x5, w6, uxtw #1]
+ neg w7, w7 // -ctz(width + height)
+ sub x3, x5, w3, uxtw
+ sub x5, x5, w6, uxtw
+ ushr v16.4s, v16.4s, #1 // (width + height) >> 1
+ dup v17.4s, w7 // -ctz(width + height)
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+
+L(ipred_dc_h4):
+ ld1 {v0.4h}, [x2], #8
+ uaddlv s0, v0.4h
+ br x3
+L(ipred_dc_w4):
+ add x2, x2, #2
+ ld1 {v1.4h}, [x2]
+ add v0.2s, v0.2s, v16.2s
+ uaddlv s1, v1.4h
+ cmp w4, #4
+ add v0.2s, v0.2s, v1.2s
+ ushl v0.2s, v0.2s, v17.2s
+ b.eq 1f
+ // h = 8/16
+ cmp w4, #16
+ mov w16, #0x6667
+ mov w17, #0xAAAB
+ csel w16, w16, w17, eq
+ dup v16.2s, w16
+ mul v0.2s, v0.2s, v16.2s
+ ushr v0.2s, v0.2s, #17
+1:
+ dup v0.4h, v0.h[0]
+2:
+ st1 {v0.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.4h}, [x0], x1
+ st1 {v0.4h}, [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_h8):
+ ld1 {v0.8h}, [x2], #16
+ uaddlv s0, v0.8h
+ br x3
+L(ipred_dc_w8):
+ add x2, x2, #2
+ ld1 {v1.8h}, [x2]
+ add v0.2s, v0.2s, v16.2s
+ uaddlv s1, v1.8h
+ cmp w4, #8
+ add v0.2s, v0.2s, v1.2s
+ ushl v0.2s, v0.2s, v17.2s
+ b.eq 1f
+ // h = 4/16/32
+ cmp w4, #32
+ mov w16, #0x6667
+ mov w17, #0xAAAB
+ csel w16, w16, w17, eq
+ dup v16.2s, w16
+ mul v0.2s, v0.2s, v16.2s
+ ushr v0.2s, v0.2s, #17
+1:
+ dup v0.8h, v0.h[0]
+2:
+ st1 {v0.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h}, [x0], x1
+ st1 {v0.8h}, [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_h16):
+ ld1 {v0.8h, v1.8h}, [x2], #32
+ addp v0.8h, v0.8h, v1.8h
+ uaddlv s0, v0.8h
+ br x3
+L(ipred_dc_w16):
+ add x2, x2, #2
+ ld1 {v1.8h, v2.8h}, [x2]
+ add v0.2s, v0.2s, v16.2s
+ addp v1.8h, v1.8h, v2.8h
+ uaddlv s1, v1.8h
+ cmp w4, #16
+ add v0.2s, v0.2s, v1.2s
+ ushl v4.2s, v0.2s, v17.2s
+ b.eq 1f
+ // h = 4/8/32/64
+ tst w4, #(32+16+8) // 16 added to make a consecutive bitmask
+ mov w16, #0x6667
+ mov w17, #0xAAAB
+ csel w16, w16, w17, eq
+ dup v16.2s, w16
+ mul v4.2s, v4.2s, v16.2s
+ ushr v4.2s, v4.2s, #17
+1:
+ dup v0.8h, v4.h[0]
+ dup v1.8h, v4.h[0]
+2:
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v0.8h, v1.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h}, [x0], x1
+ st1 {v0.8h, v1.8h}, [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_h32):
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
+ addp v0.8h, v0.8h, v1.8h
+ addp v2.8h, v2.8h, v3.8h
+ addp v0.8h, v0.8h, v2.8h
+ uaddlv s0, v0.8h
+ br x3
+L(ipred_dc_w32):
+ add x2, x2, #2
+ ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2]
+ add v0.2s, v0.2s, v16.2s
+ addp v1.8h, v1.8h, v2.8h
+ addp v3.8h, v3.8h, v4.8h
+ addp v1.8h, v1.8h, v3.8h
+ uaddlv s1, v1.8h
+ cmp w4, #32
+ add v0.2s, v0.2s, v1.2s
+ ushl v4.2s, v0.2s, v17.2s
+ b.eq 1f
+ // h = 8/16/64
+ cmp w4, #8
+ mov w16, #0x6667
+ mov w17, #0xAAAB
+ csel w16, w16, w17, eq
+ dup v16.2s, w16
+ mul v4.2s, v4.2s, v16.2s
+ ushr v4.2s, v4.2s, #17
+1:
+ dup v0.8h, v4.h[0]
+ dup v1.8h, v4.h[0]
+ dup v2.8h, v4.h[0]
+ dup v3.8h, v4.h[0]
+2:
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_h64):
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
+ addp v0.8h, v0.8h, v1.8h
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v6.8h, v6.8h, v7.8h
+ addp v0.8h, v0.8h, v2.8h
+ addp v4.8h, v4.8h, v6.8h
+ addp v0.8h, v0.8h, v4.8h
+ uaddlv s0, v0.8h
+ br x3
+L(ipred_dc_w64):
+ add x2, x2, #2
+ ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2], #64
+ add v0.2s, v0.2s, v16.2s
+ addp v1.8h, v1.8h, v2.8h
+ ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2]
+ addp v3.8h, v3.8h, v4.8h
+ addp v20.8h, v20.8h, v21.8h
+ addp v22.8h, v22.8h, v23.8h
+ addp v1.8h, v1.8h, v3.8h
+ addp v20.8h, v20.8h, v22.8h
+ addp v1.8h, v1.8h, v20.8h
+ uaddlv s1, v1.8h
+ cmp w4, #64
+ add v0.2s, v0.2s, v1.2s
+ ushl v4.2s, v0.2s, v17.2s
+ b.eq 1f
+ // h = 16/32
+ cmp w4, #16
+ mov w16, #0x6667
+ mov w17, #0xAAAB
+ csel w16, w16, w17, eq
+ dup v16.2s, w16
+ mul v4.2s, v4.2s, v16.2s
+ ushr v4.2s, v4.2s, #17
+1:
+ sub x1, x1, #64
+ dup v0.8h, v4.h[0]
+ dup v1.8h, v4.h[0]
+ dup v2.8h, v4.h[0]
+ dup v3.8h, v4.h[0]
+2:
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+ b.gt 2b
+ ret
+
+L(ipred_dc_tbl):
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h64)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h32)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h16)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h8)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_h4)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w64)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w32)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w16)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w8)
+ .hword L(ipred_dc_tbl) - L(ipred_dc_w4)
+endfunc
+
+// void ipred_paeth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_paeth_16bpc_neon, export=1
+ clz w9, w3
+ adr x5, L(ipred_paeth_tbl)
+ sub w9, w9, #25
+ ldrh w9, [x5, w9, uxtw #1]
+ ld1r {v4.8h}, [x2]
+ add x8, x2, #2
+ sub x2, x2, #8
+ sub x5, x5, w9, uxtw
+ mov x7, #-8
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+40:
+ ld1r {v5.2d}, [x8]
+ sub v6.8h, v5.8h, v4.8h // top - topleft
+4:
+ ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7
+ zip1 v0.2d, v0.2d, v1.2d
+ zip1 v2.2d, v2.2d, v3.2d
+ add v16.8h, v6.8h, v0.8h // base
+ add v17.8h, v6.8h, v2.8h
+ sabd v20.8h, v5.8h, v16.8h // tdiff
+ sabd v21.8h, v5.8h, v17.8h
+ sabd v22.8h, v4.8h, v16.8h // tldiff
+ sabd v23.8h, v4.8h, v17.8h
+ sabd v16.8h, v0.8h, v16.8h // ldiff
+ sabd v17.8h, v2.8h, v17.8h
+ umin v18.8h, v20.8h, v22.8h // min(tdiff, tldiff)
+ umin v19.8h, v21.8h, v23.8h
+ cmge v20.8h, v22.8h, v20.8h // tldiff >= tdiff
+ cmge v21.8h, v23.8h, v21.8h
+ cmge v16.8h, v18.8h, v16.8h // min(tdiff, tldiff) >= ldiff
+ cmge v17.8h, v19.8h, v17.8h
+ bsl v21.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft
+ bsl v20.16b, v5.16b, v4.16b
+ bit v21.16b, v2.16b, v17.16b // ldiff <= min ? left : ...
+ bit v20.16b, v0.16b, v16.16b
+ st1 {v21.d}[1], [x0], x1
+ st1 {v21.d}[0], [x6], x1
+ subs w4, w4, #4
+ st1 {v20.d}[1], [x0], x1
+ st1 {v20.d}[0], [x6], x1
+ b.gt 4b
+ ret
+80:
+160:
+320:
+640:
+ ld1 {v5.8h}, [x8], #16
+ mov w9, w3
+ // Set up pointers for four rows in parallel; x0, x6, x5, x10
+ add x5, x0, x1
+ add x10, x6, x1
+ lsl x1, x1, #1
+ sub x1, x1, w3, uxtw #1
+1:
+ ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7
+2:
+ sub v6.8h, v5.8h, v4.8h // top - topleft
+ add v16.8h, v6.8h, v0.8h // base
+ add v17.8h, v6.8h, v1.8h
+ add v18.8h, v6.8h, v2.8h
+ add v19.8h, v6.8h, v3.8h
+ sabd v20.8h, v5.8h, v16.8h // tdiff
+ sabd v21.8h, v5.8h, v17.8h
+ sabd v22.8h, v5.8h, v18.8h
+ sabd v23.8h, v5.8h, v19.8h
+ sabd v24.8h, v4.8h, v16.8h // tldiff
+ sabd v25.8h, v4.8h, v17.8h
+ sabd v26.8h, v4.8h, v18.8h
+ sabd v27.8h, v4.8h, v19.8h
+ sabd v16.8h, v0.8h, v16.8h // ldiff
+ sabd v17.8h, v1.8h, v17.8h
+ sabd v18.8h, v2.8h, v18.8h
+ sabd v19.8h, v3.8h, v19.8h
+ umin v28.8h, v20.8h, v24.8h // min(tdiff, tldiff)
+ umin v29.8h, v21.8h, v25.8h
+ umin v30.8h, v22.8h, v26.8h
+ umin v31.8h, v23.8h, v27.8h
+ cmge v20.8h, v24.8h, v20.8h // tldiff >= tdiff
+ cmge v21.8h, v25.8h, v21.8h
+ cmge v22.8h, v26.8h, v22.8h
+ cmge v23.8h, v27.8h, v23.8h
+ cmge v16.8h, v28.8h, v16.8h // min(tdiff, tldiff) >= ldiff
+ cmge v17.8h, v29.8h, v17.8h
+ cmge v18.8h, v30.8h, v18.8h
+ cmge v19.8h, v31.8h, v19.8h
+ bsl v23.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft
+ bsl v22.16b, v5.16b, v4.16b
+ bsl v21.16b, v5.16b, v4.16b
+ bsl v20.16b, v5.16b, v4.16b
+ bit v23.16b, v3.16b, v19.16b // ldiff <= min ? left : ...
+ bit v22.16b, v2.16b, v18.16b
+ bit v21.16b, v1.16b, v17.16b
+ bit v20.16b, v0.16b, v16.16b
+ st1 {v23.8h}, [x0], #16
+ st1 {v22.8h}, [x6], #16
+ subs w3, w3, #8
+ st1 {v21.8h}, [x5], #16
+ st1 {v20.8h}, [x10], #16
+ b.le 8f
+ ld1 {v5.8h}, [x8], #16
+ b 2b
+8:
+ subs w4, w4, #4
+ b.le 9f
+ // End of horizontal loop, move pointers to next four rows
+ sub x8, x8, w9, uxtw #1
+ add x0, x0, x1
+ add x6, x6, x1
+ // Load the top row as early as possible
+ ld1 {v5.8h}, [x8], #16
+ add x5, x5, x1
+ add x10, x10, x1
+ mov w3, w9
+ b 1b
+9:
+ ret
+
+L(ipred_paeth_tbl):
+ .hword L(ipred_paeth_tbl) - 640b
+ .hword L(ipred_paeth_tbl) - 320b
+ .hword L(ipred_paeth_tbl) - 160b
+ .hword L(ipred_paeth_tbl) - 80b
+ .hword L(ipred_paeth_tbl) - 40b
+endfunc
+
+// void ipred_smooth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_smooth_16bpc_neon, export=1
+ movrel x10, X(sm_weights)
+ add x11, x10, w4, uxtw
+ add x10, x10, w3, uxtw
+ clz w9, w3
+ adr x5, L(ipred_smooth_tbl)
+ sub x12, x2, w4, uxtw #1
+ sub w9, w9, #25
+ ldrh w9, [x5, w9, uxtw #1]
+ ld1r {v4.8h}, [x12] // bottom
+ add x8, x2, #2
+ sub x5, x5, w9, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+40:
+ sub x2, x2, #8
+ mov x7, #-8
+ ld1r {v6.2d}, [x8] // top
+ ld1r {v7.2s}, [x10] // weights_hor
+ dup v5.8h, v6.h[3] // right
+ sub v6.8h, v6.8h, v4.8h // top-bottom
+ uxtl v7.8h, v7.8b // weights_hor
+ add v31.4h, v4.4h, v5.4h // bottom+right
+4:
+ ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 // left
+ ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver
+ ushll v20.4s, v31.4h, #8 // (bottom+right)*256
+ ushll v21.4s, v31.4h, #8
+ ushll v22.4s, v31.4h, #8
+ ushll v23.4s, v31.4h, #8
+ zip1 v1.2d, v1.2d, v0.2d // left, flipped
+ zip1 v0.2d, v3.2d, v2.2d
+ zip1 v16.2s, v16.2s, v17.2s // weights_ver
+ zip1 v18.2s, v18.2s, v19.2s
+ sub v0.8h, v0.8h, v5.8h // left-right
+ sub v1.8h, v1.8h, v5.8h
+ uxtl v16.8h, v16.8b // weights_ver
+ uxtl v18.8h, v18.8b
+ smlal v20.4s, v0.4h, v7.4h // += (left-right)*weights_hor
+ smlal2 v21.4s, v0.8h, v7.8h
+ smlal v22.4s, v1.4h, v7.4h
+ smlal2 v23.4s, v1.8h, v7.8h
+ smlal v20.4s, v6.4h, v16.4h // += (top-bottom)*weights_ver
+ smlal2 v21.4s, v6.8h, v16.8h
+ smlal v22.4s, v6.4h, v18.4h
+ smlal2 v23.4s, v6.8h, v18.8h
+ rshrn v20.4h, v20.4s, #9
+ rshrn v21.4h, v21.4s, #9
+ rshrn v22.4h, v22.4s, #9
+ rshrn v23.4h, v23.4s, #9
+ st1 {v20.4h}, [x0], x1
+ st1 {v21.4h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v22.4h}, [x0], x1
+ st1 {v23.4h}, [x6], x1
+ b.gt 4b
+ ret
+80:
+ sub x2, x2, #8
+ mov x7, #-8
+ ld1 {v6.8h}, [x8] // top
+ ld1 {v7.8b}, [x10] // weights_hor
+ dup v5.8h, v6.h[7] // right
+ sub v6.8h, v6.8h, v4.8h // top-bottom
+ uxtl v7.8h, v7.8b // weights_hor
+ add v31.4h, v4.4h, v5.4h // bottom+right
+8:
+ ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left
+ ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver
+ ushll v20.4s, v31.4h, #8 // (bottom+right)*256
+ ushll v21.4s, v31.4h, #8
+ ushll v22.4s, v31.4h, #8
+ ushll v23.4s, v31.4h, #8
+ ushll v24.4s, v31.4h, #8
+ ushll v25.4s, v31.4h, #8
+ ushll v26.4s, v31.4h, #8
+ ushll v27.4s, v31.4h, #8
+ sub v0.8h, v0.8h, v5.8h // left-right
+ sub v1.8h, v1.8h, v5.8h
+ sub v2.8h, v2.8h, v5.8h
+ sub v3.8h, v3.8h, v5.8h
+ uxtl v16.8h, v16.8b // weights_ver
+ uxtl v17.8h, v17.8b
+ uxtl v18.8h, v18.8b
+ uxtl v19.8h, v19.8b
+ smlal v20.4s, v3.4h, v7.4h // += (left-right)*weights_hor
+ smlal2 v21.4s, v3.8h, v7.8h // (left flipped)
+ smlal v22.4s, v2.4h, v7.4h
+ smlal2 v23.4s, v2.8h, v7.8h
+ smlal v24.4s, v1.4h, v7.4h
+ smlal2 v25.4s, v1.8h, v7.8h
+ smlal v26.4s, v0.4h, v7.4h
+ smlal2 v27.4s, v0.8h, v7.8h
+ smlal v20.4s, v6.4h, v16.4h // += (top-bottom)*weights_ver
+ smlal2 v21.4s, v6.8h, v16.8h
+ smlal v22.4s, v6.4h, v17.4h
+ smlal2 v23.4s, v6.8h, v17.8h
+ smlal v24.4s, v6.4h, v18.4h
+ smlal2 v25.4s, v6.8h, v18.8h
+ smlal v26.4s, v6.4h, v19.4h
+ smlal2 v27.4s, v6.8h, v19.8h
+ rshrn v20.4h, v20.4s, #9
+ rshrn2 v20.8h, v21.4s, #9
+ rshrn v21.4h, v22.4s, #9
+ rshrn2 v21.8h, v23.4s, #9
+ rshrn v22.4h, v24.4s, #9
+ rshrn2 v22.8h, v25.4s, #9
+ rshrn v23.4h, v26.4s, #9
+ rshrn2 v23.8h, v27.4s, #9
+ st1 {v20.8h}, [x0], x1
+ st1 {v21.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v22.8h}, [x0], x1
+ st1 {v23.8h}, [x6], x1
+ b.gt 8b
+ ret
+160:
+320:
+640:
+ add x12, x2, w3, uxtw #1
+ sub x1, x1, w3, uxtw #1
+ ld1r {v5.8h}, [x12] // right
+ sub x2, x2, #4
+ mov x7, #-4
+ mov w9, w3
+ add v31.4h, v4.4h, v5.4h // bottom+right
+
+1:
+ ld2r {v0.8h, v1.8h}, [x2], x7 // left
+ ld2r {v16.8b, v17.8b}, [x11], #2 // weights_ver
+ sub v0.8h, v0.8h, v5.8h // left-right
+ sub v1.8h, v1.8h, v5.8h
+ uxtl v16.8h, v16.8b // weights_ver
+ uxtl v17.8h, v17.8b
+2:
+ ld1 {v7.16b}, [x10], #16 // weights_hor
+ ld1 {v2.8h, v3.8h}, [x8], #32 // top
+ ushll v20.4s, v31.4h, #8 // (bottom+right)*256
+ ushll v21.4s, v31.4h, #8
+ ushll v22.4s, v31.4h, #8
+ ushll v23.4s, v31.4h, #8
+ ushll v24.4s, v31.4h, #8
+ ushll v25.4s, v31.4h, #8
+ ushll v26.4s, v31.4h, #8
+ ushll v27.4s, v31.4h, #8
+ uxtl v6.8h, v7.8b // weights_hor
+ uxtl2 v7.8h, v7.16b
+ sub v2.8h, v2.8h, v4.8h // top-bottom
+ sub v3.8h, v3.8h, v4.8h
+ smlal v20.4s, v1.4h, v6.4h // += (left-right)*weights_hor
+ smlal2 v21.4s, v1.8h, v6.8h // (left flipped)
+ smlal v22.4s, v1.4h, v7.4h
+ smlal2 v23.4s, v1.8h, v7.8h
+ smlal v24.4s, v0.4h, v6.4h
+ smlal2 v25.4s, v0.8h, v6.8h
+ smlal v26.4s, v0.4h, v7.4h
+ smlal2 v27.4s, v0.8h, v7.8h
+ smlal v20.4s, v2.4h, v16.4h // += (top-bottom)*weights_ver
+ smlal2 v21.4s, v2.8h, v16.8h
+ smlal v22.4s, v3.4h, v16.4h
+ smlal2 v23.4s, v3.8h, v16.8h
+ smlal v24.4s, v2.4h, v17.4h
+ smlal2 v25.4s, v2.8h, v17.8h
+ smlal v26.4s, v3.4h, v17.4h
+ smlal2 v27.4s, v3.8h, v17.8h
+ rshrn v20.4h, v20.4s, #9
+ rshrn2 v20.8h, v21.4s, #9
+ rshrn v21.4h, v22.4s, #9
+ rshrn2 v21.8h, v23.4s, #9
+ rshrn v22.4h, v24.4s, #9
+ rshrn2 v22.8h, v25.4s, #9
+ rshrn v23.4h, v26.4s, #9
+ rshrn2 v23.8h, v27.4s, #9
+ subs w3, w3, #16
+ st1 {v20.8h, v21.8h}, [x0], #32
+ st1 {v22.8h, v23.8h}, [x6], #32
+ b.gt 2b
+ subs w4, w4, #2
+ b.le 9f
+ sub x8, x8, w9, uxtw #1
+ sub x10, x10, w9, uxtw
+ add x0, x0, x1
+ add x6, x6, x1
+ mov w3, w9
+ b 1b
+9:
+ ret
+
+L(ipred_smooth_tbl):
+ .hword L(ipred_smooth_tbl) - 640b
+ .hword L(ipred_smooth_tbl) - 320b
+ .hword L(ipred_smooth_tbl) - 160b
+ .hword L(ipred_smooth_tbl) - 80b
+ .hword L(ipred_smooth_tbl) - 40b
+endfunc
+
+// void ipred_smooth_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_smooth_v_16bpc_neon, export=1
+ movrel x7, X(sm_weights)
+ add x7, x7, w4, uxtw
+ clz w9, w3
+ adr x5, L(ipred_smooth_v_tbl)
+ sub x8, x2, w4, uxtw #1
+ sub w9, w9, #25
+ ldrh w9, [x5, w9, uxtw #1]
+ ld1r {v4.8h}, [x8] // bottom
+ add x2, x2, #2
+ sub x5, x5, w9, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+40:
+ ld1r {v6.2d}, [x2] // top
+ sub v6.8h, v6.8h, v4.8h // top-bottom
+4:
+ ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
+ zip1 v16.2s, v16.2s, v17.2s // weights_ver
+ zip1 v18.2s, v18.2s, v19.2s
+ ushll v16.8h, v16.8b, #7 // weights_ver << 7
+ ushll v18.8h, v18.8b, #7
+ sqrdmulh v20.8h, v6.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8
+ sqrdmulh v21.8h, v6.8h, v18.8h
+ add v20.8h, v20.8h, v4.8h
+ add v21.8h, v21.8h, v4.8h
+ st1 {v20.d}[0], [x0], x1
+ st1 {v20.d}[1], [x6], x1
+ subs w4, w4, #4
+ st1 {v21.d}[0], [x0], x1
+ st1 {v21.d}[1], [x6], x1
+ b.gt 4b
+ ret
+80:
+ ld1 {v6.8h}, [x2] // top
+ sub v6.8h, v6.8h, v4.8h // top-bottom
+8:
+ ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
+ ushll v16.8h, v16.8b, #7 // weights_ver << 7
+ ushll v17.8h, v17.8b, #7
+ ushll v18.8h, v18.8b, #7
+ ushll v19.8h, v19.8b, #7
+ sqrdmulh v20.8h, v6.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8
+ sqrdmulh v21.8h, v6.8h, v17.8h
+ sqrdmulh v22.8h, v6.8h, v18.8h
+ sqrdmulh v23.8h, v6.8h, v19.8h
+ add v20.8h, v20.8h, v4.8h
+ add v21.8h, v21.8h, v4.8h
+ add v22.8h, v22.8h, v4.8h
+ add v23.8h, v23.8h, v4.8h
+ st1 {v20.8h}, [x0], x1
+ st1 {v21.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v22.8h}, [x0], x1
+ st1 {v23.8h}, [x6], x1
+ b.gt 8b
+ ret
+160:
+320:
+640:
+ // Set up pointers for four rows in parallel; x0, x6, x5, x8
+ add x5, x0, x1
+ add x8, x6, x1
+ lsl x1, x1, #1
+ sub x1, x1, w3, uxtw #1
+ mov w9, w3
+
+1:
+ ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
+ ushll v16.8h, v16.8b, #7 // weights_ver << 7
+ ushll v17.8h, v17.8b, #7
+ ushll v18.8h, v18.8b, #7
+ ushll v19.8h, v19.8b, #7
+2:
+ ld1 {v2.8h, v3.8h}, [x2], #32 // top
+ sub v2.8h, v2.8h, v4.8h // top-bottom
+ sub v3.8h, v3.8h, v4.8h
+ sqrdmulh v20.8h, v2.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8
+ sqrdmulh v21.8h, v3.8h, v16.8h
+ sqrdmulh v22.8h, v2.8h, v17.8h
+ sqrdmulh v23.8h, v3.8h, v17.8h
+ sqrdmulh v24.8h, v2.8h, v18.8h
+ sqrdmulh v25.8h, v3.8h, v18.8h
+ sqrdmulh v26.8h, v2.8h, v19.8h
+ sqrdmulh v27.8h, v3.8h, v19.8h
+ add v20.8h, v20.8h, v4.8h
+ add v21.8h, v21.8h, v4.8h
+ add v22.8h, v22.8h, v4.8h
+ add v23.8h, v23.8h, v4.8h
+ add v24.8h, v24.8h, v4.8h
+ add v25.8h, v25.8h, v4.8h
+ add v26.8h, v26.8h, v4.8h
+ add v27.8h, v27.8h, v4.8h
+ subs w3, w3, #16
+ st1 {v20.8h, v21.8h}, [x0], #32
+ st1 {v22.8h, v23.8h}, [x6], #32
+ st1 {v24.8h, v25.8h}, [x5], #32
+ st1 {v26.8h, v27.8h}, [x8], #32
+ b.gt 2b
+ subs w4, w4, #4
+ b.le 9f
+ sub x2, x2, w9, uxtw #1
+ add x0, x0, x1
+ add x6, x6, x1
+ add x5, x5, x1
+ add x8, x8, x1
+ mov w3, w9
+ b 1b
+9:
+ ret
+
+L(ipred_smooth_v_tbl):
+ .hword L(ipred_smooth_v_tbl) - 640b
+ .hword L(ipred_smooth_v_tbl) - 320b
+ .hword L(ipred_smooth_v_tbl) - 160b
+ .hword L(ipred_smooth_v_tbl) - 80b
+ .hword L(ipred_smooth_v_tbl) - 40b
+endfunc
+
+// void ipred_smooth_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int a,
+// const int max_width, const int max_height);
+function ipred_smooth_h_16bpc_neon, export=1
+ movrel x8, X(sm_weights)
+ add x8, x8, w3, uxtw
+ clz w9, w3
+ adr x5, L(ipred_smooth_h_tbl)
+ add x12, x2, w3, uxtw #1
+ sub w9, w9, #25
+ ldrh w9, [x5, w9, uxtw #1]
+ ld1r {v5.8h}, [x12] // right
+ sub x5, x5, w9, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ br x5
+40:
+ ld1r {v7.2s}, [x8] // weights_hor
+ sub x2, x2, #8
+ mov x7, #-8
+ ushll v7.8h, v7.8b, #7 // weights_hor << 7
+4:
+ ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 // left
+ zip1 v1.2d, v1.2d, v0.2d // left, flipped
+ zip1 v0.2d, v3.2d, v2.2d
+ sub v0.8h, v0.8h, v5.8h // left-right
+ sub v1.8h, v1.8h, v5.8h
+ sqrdmulh v20.8h, v0.8h, v7.8h // ((left-right)*weights_hor + 128) >> 8
+ sqrdmulh v21.8h, v1.8h, v7.8h
+ add v20.8h, v20.8h, v5.8h
+ add v21.8h, v21.8h, v5.8h
+ st1 {v20.d}[0], [x0], x1
+ st1 {v20.d}[1], [x6], x1
+ subs w4, w4, #4
+ st1 {v21.d}[0], [x0], x1
+ st1 {v21.d}[1], [x6], x1
+ b.gt 4b
+ ret
+80:
+ ld1 {v7.8b}, [x8] // weights_hor
+ sub x2, x2, #8
+ mov x7, #-8
+ ushll v7.8h, v7.8b, #7 // weights_hor << 7
+8:
+ ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left
+ sub v3.8h, v3.8h, v5.8h // left-right
+ sub v2.8h, v2.8h, v5.8h
+ sub v1.8h, v1.8h, v5.8h
+ sub v0.8h, v0.8h, v5.8h
+ sqrdmulh v20.8h, v3.8h, v7.8h // ((left-right)*weights_hor + 128) >> 8
+ sqrdmulh v21.8h, v2.8h, v7.8h // (left flipped)
+ sqrdmulh v22.8h, v1.8h, v7.8h
+ sqrdmulh v23.8h, v0.8h, v7.8h
+ add v20.8h, v20.8h, v5.8h
+ add v21.8h, v21.8h, v5.8h
+ add v22.8h, v22.8h, v5.8h
+ add v23.8h, v23.8h, v5.8h
+ st1 {v20.8h}, [x0], x1
+ st1 {v21.8h}, [x6], x1
+ subs w4, w4, #4
+ st1 {v22.8h}, [x0], x1
+ st1 {v23.8h}, [x6], x1
+ b.gt 8b
+ ret
+160:
+320:
+640:
+ sub x2, x2, #8
+ mov x7, #-8
+ // Set up pointers for four rows in parallel; x0, x6, x5, x10
+ add x5, x0, x1
+ add x10, x6, x1
+ lsl x1, x1, #1
+ sub x1, x1, w3, uxtw #1
+ mov w9, w3
+
+1:
+ ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left
+ sub v0.8h, v0.8h, v5.8h // left-right
+ sub v1.8h, v1.8h, v5.8h
+ sub v2.8h, v2.8h, v5.8h
+ sub v3.8h, v3.8h, v5.8h
+2:
+ ld1 {v7.16b}, [x8], #16 // weights_hor
+ ushll v6.8h, v7.8b, #7 // weights_hor << 7
+ ushll2 v7.8h, v7.16b, #7
+ sqrdmulh v20.8h, v3.8h, v6.8h // ((left-right)*weights_hor + 128) >> 8
+ sqrdmulh v21.8h, v3.8h, v7.8h // (left flipped)
+ sqrdmulh v22.8h, v2.8h, v6.8h
+ sqrdmulh v23.8h, v2.8h, v7.8h
+ sqrdmulh v24.8h, v1.8h, v6.8h
+ sqrdmulh v25.8h, v1.8h, v7.8h
+ sqrdmulh v26.8h, v0.8h, v6.8h
+ sqrdmulh v27.8h, v0.8h, v7.8h
+ add v20.8h, v20.8h, v5.8h
+ add v21.8h, v21.8h, v5.8h
+ add v22.8h, v22.8h, v5.8h
+ add v23.8h, v23.8h, v5.8h
+ add v24.8h, v24.8h, v5.8h
+ add v25.8h, v25.8h, v5.8h
+ add v26.8h, v26.8h, v5.8h
+ add v27.8h, v27.8h, v5.8h
+ subs w3, w3, #16
+ st1 {v20.8h, v21.8h}, [x0], #32
+ st1 {v22.8h, v23.8h}, [x6], #32
+ st1 {v24.8h, v25.8h}, [x5], #32
+ st1 {v26.8h, v27.8h}, [x10], #32
+ b.gt 2b
+ subs w4, w4, #4
+ b.le 9f
+ sub x8, x8, w9, uxtw
+ add x0, x0, x1
+ add x6, x6, x1
+ add x5, x5, x1
+ add x10, x10, x1
+ mov w3, w9
+ b 1b
+9:
+ ret
+
+L(ipred_smooth_h_tbl):
+ .hword L(ipred_smooth_h_tbl) - 640b
+ .hword L(ipred_smooth_h_tbl) - 320b
+ .hword L(ipred_smooth_h_tbl) - 160b
+ .hword L(ipred_smooth_h_tbl) - 80b
+ .hword L(ipred_smooth_h_tbl) - 40b
+endfunc
+
+// void ipred_filter_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height, const int filt_idx,
+// const int max_width, const int max_height,
+// const int bitdepth_max);
+.macro filter_fn bpc
+function ipred_filter_\bpc\()bpc_neon
+ and w5, w5, #511
+ movrel x6, X(filter_intra_taps)
+ lsl w5, w5, #6
+ add x6, x6, w5, uxtw
+ ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
+ clz w9, w3
+ adr x5, L(ipred_filter\bpc\()_tbl)
+ ld1 {v20.8b, v21.8b, v22.8b}, [x6]
+ sub w9, w9, #26
+ ldrh w9, [x5, w9, uxtw #1]
+ sxtl v16.8h, v16.8b
+ sxtl v17.8h, v17.8b
+ sub x5, x5, w9, uxtw
+ sxtl v18.8h, v18.8b
+ sxtl v19.8h, v19.8b
+ add x6, x0, x1
+ lsl x1, x1, #1
+ sxtl v20.8h, v20.8b
+ sxtl v21.8h, v21.8b
+ sxtl v22.8h, v22.8b
+ dup v31.8h, w8
+ movi v30.8h, #0
+ br x5
+40:
+ ldur d0, [x2, #2] // top (0-3)
+ sub x2, x2, #4
+ mov x7, #-4
+4:
+ ld1 {v1.4h}, [x2], x7 // left (0-1) + topleft (2)
+.if \bpc == 10
+ mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1)
+ mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2)
+ mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3)
+ mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4)
+ mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0)
+ mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5)
+ mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6)
+ srshr v2.8h, v2.8h, #4
+ smax v2.8h, v2.8h, v30.8h
+.else
+ smull v2.4s, v17.4h, v0.h[0] // p1(top[0]) * filter(1)
+ smlal v2.4s, v18.4h, v0.h[1] // p2(top[1]) * filter(2)
+ smlal v2.4s, v19.4h, v0.h[2] // p3(top[2]) * filter(3)
+ smlal v2.4s, v20.4h, v0.h[3] // p4(top[3]) * filter(4)
+ smlal v2.4s, v16.4h, v1.h[2] // p0(topleft) * filter(0)
+ smlal v2.4s, v21.4h, v1.h[1] // p5(left[0]) * filter(5)
+ smlal v2.4s, v22.4h, v1.h[0] // p6(left[1]) * filter(6)
+ smull2 v3.4s, v17.8h, v0.h[0] // p1(top[0]) * filter(1)
+ smlal2 v3.4s, v18.8h, v0.h[1] // p2(top[1]) * filter(2)
+ smlal2 v3.4s, v19.8h, v0.h[2] // p3(top[2]) * filter(3)
+ smlal2 v3.4s, v20.8h, v0.h[3] // p4(top[3]) * filter(4)
+ smlal2 v3.4s, v16.8h, v1.h[2] // p0(topleft) * filter(0)
+ smlal2 v3.4s, v21.8h, v1.h[1] // p5(left[0]) * filter(5)
+ smlal2 v3.4s, v22.8h, v1.h[0] // p6(left[1]) * filter(6)
+ sqrshrun v2.4h, v2.4s, #4
+ sqrshrun2 v2.8h, v3.4s, #4
+.endif
+ smin v2.8h, v2.8h, v31.8h
+ subs w4, w4, #2
+ st1 {v2.d}[0], [x0], x1
+ uxtl v0.8h, v2.8b
+ ext v0.16b, v2.16b, v2.16b, #8 // move top from [4-7] to [0-3]
+ st1 {v2.d}[1], [x6], x1
+ b.gt 4b
+ ret
+80:
+ ldur q0, [x2, #2] // top (0-7)
+ sub x2, x2, #4
+ mov x7, #-4
+8:
+ ld1 {v1.4h}, [x2], x7 // left (0-1) + topleft (2)
+.if \bpc == 10
+ mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1)
+ mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2)
+ mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3)
+ mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4)
+ mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0)
+ mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5)
+ mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6)
+ mul v3.8h, v17.8h, v0.h[4] // p1(top[0]) * filter(1)
+ mla v3.8h, v18.8h, v0.h[5] // p2(top[1]) * filter(2)
+ mla v3.8h, v19.8h, v0.h[6] // p3(top[2]) * filter(3)
+ srshr v2.8h, v2.8h, #4
+ smax v2.8h, v2.8h, v30.8h
+ smin v2.8h, v2.8h, v31.8h
+ mla v3.8h, v20.8h, v0.h[7] // p4(top[3]) * filter(4)
+ mla v3.8h, v16.8h, v0.h[3] // p0(topleft) * filter(0)
+ mla v3.8h, v21.8h, v2.h[3] // p5(left[0]) * filter(5)
+ mla v3.8h, v22.8h, v2.h[7] // p6(left[1]) * filter(6)
+ srshr v3.8h, v3.8h, #4
+ smax v3.8h, v3.8h, v30.8h
+.else
+ smull v2.4s, v17.4h, v0.h[0] // p1(top[0]) * filter(1)
+ smlal v2.4s, v18.4h, v0.h[1] // p2(top[1]) * filter(2)
+ smlal v2.4s, v19.4h, v0.h[2] // p3(top[2]) * filter(3)
+ smlal v2.4s, v20.4h, v0.h[3] // p4(top[3]) * filter(4)
+ smlal v2.4s, v16.4h, v1.h[2] // p0(topleft) * filter(0)
+ smlal v2.4s, v21.4h, v1.h[1] // p5(left[0]) * filter(5)
+ smlal v2.4s, v22.4h, v1.h[0] // p6(left[1]) * filter(6)
+ smull2 v3.4s, v17.8h, v0.h[0] // p1(top[0]) * filter(1)
+ smlal2 v3.4s, v18.8h, v0.h[1] // p2(top[1]) * filter(2)
+ smlal2 v3.4s, v19.8h, v0.h[2] // p3(top[2]) * filter(3)
+ smlal2 v3.4s, v20.8h, v0.h[3] // p4(top[3]) * filter(4)
+ smlal2 v3.4s, v16.8h, v1.h[2] // p0(topleft) * filter(0)
+ smlal2 v3.4s, v21.8h, v1.h[1] // p5(left[0]) * filter(5)
+ smlal2 v3.4s, v22.8h, v1.h[0] // p6(left[1]) * filter(6)
+ smull v4.4s, v17.4h, v0.h[4] // p1(top[0]) * filter(1)
+ smlal v4.4s, v18.4h, v0.h[5] // p2(top[1]) * filter(2)
+ smlal v4.4s, v19.4h, v0.h[6] // p3(top[2]) * filter(3)
+ sqrshrun v2.4h, v2.4s, #4
+ sqrshrun2 v2.8h, v3.4s, #4
+ smin v2.8h, v2.8h, v31.8h
+ smlal v4.4s, v20.4h, v0.h[7] // p4(top[3]) * filter(4)
+ smlal v4.4s, v16.4h, v0.h[3] // p0(topleft) * filter(0)
+ smlal v4.4s, v21.4h, v2.h[3] // p5(left[0]) * filter(5)
+ smlal v4.4s, v22.4h, v2.h[7] // p6(left[1]) * filter(6)
+ smull2 v5.4s, v17.8h, v0.h[4] // p1(top[0]) * filter(1)
+ smlal2 v5.4s, v18.8h, v0.h[5] // p2(top[1]) * filter(2)
+ smlal2 v5.4s, v19.8h, v0.h[6] // p3(top[2]) * filter(3)
+ smlal2 v5.4s, v20.8h, v0.h[7] // p4(top[3]) * filter(4)
+ smlal2 v5.4s, v16.8h, v0.h[3] // p0(topleft) * filter(0)
+ smlal2 v5.4s, v21.8h, v2.h[3] // p5(left[0]) * filter(5)
+ smlal2 v5.4s, v22.8h, v2.h[7] // p6(left[1]) * filter(6)
+ sqrshrun v3.4h, v4.4s, #4
+ sqrshrun2 v3.8h, v5.4s, #4
+.endif
+ smin v3.8h, v3.8h, v31.8h
+ subs w4, w4, #2
+ st2 {v2.d, v3.d}[0], [x0], x1
+ zip2 v0.2d, v2.2d, v3.2d
+ st2 {v2.d, v3.d}[1], [x6], x1
+ b.gt 8b
+ ret
+160:
+320:
+ add x8, x2, #2
+ sub x2, x2, #4
+ mov x7, #-4
+ sub x1, x1, w3, uxtw #1
+ mov w9, w3
+
+1:
+ ld1 {v0.4h}, [x2], x7 // left (0-1) + topleft (2)
+2:
+ ld1 {v1.8h, v2.8h}, [x8], #32 // top(0-15)
+.if \bpc == 10
+ mul v3.8h, v16.8h, v0.h[2] // p0(topleft) * filter(0)
+ mla v3.8h, v21.8h, v0.h[1] // p5(left[0]) * filter(5)
+ mla v3.8h, v22.8h, v0.h[0] // p6(left[1]) * filter(6)
+ mla v3.8h, v17.8h, v1.h[0] // p1(top[0]) * filter(1)
+ mla v3.8h, v18.8h, v1.h[1] // p2(top[1]) * filter(2)
+ mla v3.8h, v19.8h, v1.h[2] // p3(top[2]) * filter(3)
+ mla v3.8h, v20.8h, v1.h[3] // p4(top[3]) * filter(4)
+
+ mul v4.8h, v17.8h, v1.h[4] // p1(top[0]) * filter(1)
+ mla v4.8h, v18.8h, v1.h[5] // p2(top[1]) * filter(2)
+ mla v4.8h, v19.8h, v1.h[6] // p3(top[2]) * filter(3)
+ srshr v3.8h, v3.8h, #4
+ smax v3.8h, v3.8h, v30.8h
+ smin v3.8h, v3.8h, v31.8h
+ mla v4.8h, v20.8h, v1.h[7] // p4(top[3]) * filter(4)
+ mla v4.8h, v16.8h, v1.h[3] // p0(topleft) * filter(0)
+ mla v4.8h, v21.8h, v3.h[3] // p5(left[0]) * filter(5)
+ mla v4.8h, v22.8h, v3.h[7] // p6(left[1]) * filter(6)
+
+ mul v5.8h, v17.8h, v2.h[0] // p1(top[0]) * filter(1)
+ mla v5.8h, v18.8h, v2.h[1] // p2(top[1]) * filter(2)
+ mla v5.8h, v19.8h, v2.h[2] // p3(top[2]) * filter(3)
+ srshr v4.8h, v4.8h, #4
+ smax v4.8h, v4.8h, v30.8h
+ smin v4.8h, v4.8h, v31.8h
+ mla v5.8h, v20.8h, v2.h[3] // p4(top[3]) * filter(4)
+ mla v5.8h, v16.8h, v1.h[7] // p0(topleft) * filter(0)
+ mla v5.8h, v21.8h, v4.h[3] // p5(left[0]) * filter(5)
+ mla v5.8h, v22.8h, v4.h[7] // p6(left[1]) * filter(6)
+
+ mul v6.8h, v17.8h, v2.h[4] // p1(top[0]) * filter(1)
+ mla v6.8h, v18.8h, v2.h[5] // p2(top[1]) * filter(2)
+ mla v6.8h, v19.8h, v2.h[6] // p3(top[2]) * filter(3)
+ srshr v5.8h, v5.8h, #4
+ smax v5.8h, v5.8h, v30.8h
+ smin v5.8h, v5.8h, v31.8h
+ mla v6.8h, v20.8h, v2.h[7] // p4(top[3]) * filter(4)
+ mla v6.8h, v16.8h, v2.h[3] // p0(topleft) * filter(0)
+ mla v6.8h, v21.8h, v5.h[3] // p5(left[0]) * filter(5)
+ mla v6.8h, v22.8h, v5.h[7] // p6(left[1]) * filter(6)
+
+ subs w3, w3, #16
+ srshr v6.8h, v6.8h, #4
+ smax v6.8h, v6.8h, v30.8h
+.else
+ smull v3.4s, v16.4h, v0.h[2] // p0(topleft) * filter(0)
+ smlal v3.4s, v21.4h, v0.h[1] // p5(left[0]) * filter(5)
+ smlal v3.4s, v22.4h, v0.h[0] // p6(left[1]) * filter(6)
+ smlal v3.4s, v17.4h, v1.h[0] // p1(top[0]) * filter(1)
+ smlal v3.4s, v18.4h, v1.h[1] // p2(top[1]) * filter(2)
+ smlal v3.4s, v19.4h, v1.h[2] // p3(top[2]) * filter(3)
+ smlal v3.4s, v20.4h, v1.h[3] // p4(top[3]) * filter(4)
+ smull2 v4.4s, v16.8h, v0.h[2] // p0(topleft) * filter(0)
+ smlal2 v4.4s, v21.8h, v0.h[1] // p5(left[0]) * filter(5)
+ smlal2 v4.4s, v22.8h, v0.h[0] // p6(left[1]) * filter(6)
+ smlal2 v4.4s, v17.8h, v1.h[0] // p1(top[0]) * filter(1)
+ smlal2 v4.4s, v18.8h, v1.h[1] // p2(top[1]) * filter(2)
+ smlal2 v4.4s, v19.8h, v1.h[2] // p3(top[2]) * filter(3)
+ smlal2 v4.4s, v20.8h, v1.h[3] // p4(top[3]) * filter(4)
+
+ smull v5.4s, v17.4h, v1.h[4] // p1(top[0]) * filter(1)
+ smlal v5.4s, v18.4h, v1.h[5] // p2(top[1]) * filter(2)
+ smlal v5.4s, v19.4h, v1.h[6] // p3(top[2]) * filter(3)
+ sqrshrun v3.4h, v3.4s, #4
+ sqrshrun2 v3.8h, v4.4s, #4
+ smin v3.8h, v3.8h, v31.8h
+ smlal v5.4s, v20.4h, v1.h[7] // p4(top[3]) * filter(4)
+ smlal v5.4s, v16.4h, v1.h[3] // p0(topleft) * filter(0)
+ smlal v5.4s, v21.4h, v3.h[3] // p5(left[0]) * filter(5)
+ smlal v5.4s, v22.4h, v3.h[7] // p6(left[1]) * filter(6)
+ smull2 v6.4s, v17.8h, v1.h[4] // p1(top[0]) * filter(1)
+ smlal2 v6.4s, v18.8h, v1.h[5] // p2(top[1]) * filter(2)
+ smlal2 v6.4s, v19.8h, v1.h[6] // p3(top[2]) * filter(3)
+ smlal2 v6.4s, v20.8h, v1.h[7] // p4(top[3]) * filter(4)
+ smlal2 v6.4s, v16.8h, v1.h[3] // p0(topleft) * filter(0)
+ smlal2 v6.4s, v21.8h, v3.h[3] // p5(left[0]) * filter(5)
+ smlal2 v6.4s, v22.8h, v3.h[7] // p6(left[1]) * filter(6)
+
+ smull v24.4s, v17.4h, v2.h[0] // p1(top[0]) * filter(1)
+ smlal v24.4s, v18.4h, v2.h[1] // p2(top[1]) * filter(2)
+ smlal v24.4s, v19.4h, v2.h[2] // p3(top[2]) * filter(3)
+ sqrshrun v4.4h, v5.4s, #4
+ sqrshrun2 v4.8h, v6.4s, #4
+ smin v4.8h, v4.8h, v31.8h
+ smlal v24.4s, v20.4h, v2.h[3] // p4(top[3]) * filter(4)
+ smlal v24.4s, v16.4h, v1.h[7] // p0(topleft) * filter(0)
+ smlal v24.4s, v21.4h, v4.h[3] // p5(left[0]) * filter(5)
+ smlal v24.4s, v22.4h, v4.h[7] // p6(left[1]) * filter(6)
+ smull2 v25.4s, v17.8h, v2.h[0] // p1(top[0]) * filter(1)
+ smlal2 v25.4s, v18.8h, v2.h[1] // p2(top[1]) * filter(2)
+ smlal2 v25.4s, v19.8h, v2.h[2] // p3(top[2]) * filter(3)
+ smlal2 v25.4s, v20.8h, v2.h[3] // p4(top[3]) * filter(4)
+ smlal2 v25.4s, v16.8h, v1.h[7] // p0(topleft) * filter(0)
+ smlal2 v25.4s, v21.8h, v4.h[3] // p5(left[0]) * filter(5)
+ smlal2 v25.4s, v22.8h, v4.h[7] // p6(left[1]) * filter(6)
+
+ smull v26.4s, v17.4h, v2.h[4] // p1(top[0]) * filter(1)
+ smlal v26.4s, v18.4h, v2.h[5] // p2(top[1]) * filter(2)
+ smlal v26.4s, v19.4h, v2.h[6] // p3(top[2]) * filter(3)
+ sqrshrun v5.4h, v24.4s, #4
+ sqrshrun2 v5.8h, v25.4s, #4
+ smin v5.8h, v5.8h, v31.8h
+ smlal v26.4s, v20.4h, v2.h[7] // p4(top[3]) * filter(4)
+ smlal v26.4s, v16.4h, v2.h[3] // p0(topleft) * filter(0)
+ smlal v26.4s, v21.4h, v5.h[3] // p5(left[0]) * filter(5)
+ smlal v26.4s, v22.4h, v5.h[7] // p6(left[1]) * filter(6)
+ smull2 v27.4s, v17.8h, v2.h[4] // p1(top[0]) * filter(1)
+ smlal2 v27.4s, v18.8h, v2.h[5] // p2(top[1]) * filter(2)
+ smlal2 v27.4s, v19.8h, v2.h[6] // p3(top[2]) * filter(3)
+ smlal2 v27.4s, v20.8h, v2.h[7] // p4(top[3]) * filter(4)
+ smlal2 v27.4s, v16.8h, v2.h[3] // p0(topleft) * filter(0)
+ smlal2 v27.4s, v21.8h, v5.h[3] // p5(left[0]) * filter(5)
+ smlal2 v27.4s, v22.8h, v5.h[7] // p6(left[1]) * filter(6)
+
+ subs w3, w3, #16
+ sqrshrun v6.4h, v26.4s, #4
+ sqrshrun2 v6.8h, v27.4s, #4
+.endif
+ smin v6.8h, v6.8h, v31.8h
+
+ ins v0.h[2], v2.h[7]
+ st4 {v3.d, v4.d, v5.d, v6.d}[0], [x0], #32
+ ins v0.h[0], v6.h[7]
+ st4 {v3.d, v4.d, v5.d, v6.d}[1], [x6], #32
+ ins v0.h[1], v6.h[3]
+ b.gt 2b
+ subs w4, w4, #2
+ b.le 9f
+ sub x8, x6, w9, uxtw #1
+ add x0, x0, x1
+ add x6, x6, x1
+ mov w3, w9
+ b 1b
+9:
+ ret
+
+L(ipred_filter\bpc\()_tbl):
+ .hword L(ipred_filter\bpc\()_tbl) - 320b
+ .hword L(ipred_filter\bpc\()_tbl) - 160b
+ .hword L(ipred_filter\bpc\()_tbl) - 80b
+ .hword L(ipred_filter\bpc\()_tbl) - 40b
+endfunc
+.endm
+
+filter_fn 10
+filter_fn 12
+
+function ipred_filter_16bpc_neon, export=1
+ ldr w8, [sp]
+ cmp w8, 0x3ff
+ b.le ipred_filter_10bpc_neon
+ b ipred_filter_12bpc_neon
+endfunc
+
+// void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const uint16_t *const pal, const uint8_t *idx,
+// const int w, const int h);
+function pal_pred_16bpc_neon, export=1
+ ld1 {v30.8h}, [x2]
+ clz w9, w4
+ adr x6, L(pal_pred_tbl)
+ sub w9, w9, #25
+ ldrh w9, [x6, w9, uxtw #1]
+ movi v31.8h, #1, lsl #8
+ sub x6, x6, w9, uxtw
+ br x6
+40:
+ add x2, x0, x1
+ lsl x1, x1, #1
+4:
+ ld1 {v1.16b}, [x3], #16
+ subs w5, w5, #4
+ // Restructure v1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ...
+ add v1.16b, v1.16b, v1.16b
+ zip1 v0.16b, v1.16b, v1.16b
+ zip2 v1.16b, v1.16b, v1.16b
+ add v0.8h, v0.8h, v31.8h
+ add v1.8h, v1.8h, v31.8h
+ tbl v0.16b, {v30.16b}, v0.16b
+ st1 {v0.d}[0], [x0], x1
+ tbl v1.16b, {v30.16b}, v1.16b
+ st1 {v0.d}[1], [x2], x1
+ st1 {v1.d}[0], [x0], x1
+ st1 {v1.d}[1], [x2], x1
+ b.gt 4b
+ ret
+80:
+ add x2, x0, x1
+ lsl x1, x1, #1
+8:
+ ld1 {v2.16b, v3.16b}, [x3], #32
+ subs w5, w5, #4
+ add v2.16b, v2.16b, v2.16b
+ add v3.16b, v3.16b, v3.16b
+ zip1 v0.16b, v2.16b, v2.16b
+ zip2 v1.16b, v2.16b, v2.16b
+ zip1 v2.16b, v3.16b, v3.16b
+ zip2 v3.16b, v3.16b, v3.16b
+ add v0.8h, v0.8h, v31.8h
+ add v1.8h, v1.8h, v31.8h
+ add v2.8h, v2.8h, v31.8h
+ add v3.8h, v3.8h, v31.8h
+ tbl v0.16b, {v30.16b}, v0.16b
+ tbl v1.16b, {v30.16b}, v1.16b
+ st1 {v0.8h}, [x0], x1
+ tbl v2.16b, {v30.16b}, v2.16b
+ st1 {v1.8h}, [x2], x1
+ tbl v3.16b, {v30.16b}, v3.16b
+ st1 {v2.8h}, [x0], x1
+ st1 {v3.8h}, [x2], x1
+ b.gt 8b
+ ret
+160:
+ add x2, x0, x1
+ lsl x1, x1, #1
+16:
+ ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64
+ subs w5, w5, #4
+ add v4.16b, v4.16b, v4.16b
+ add v5.16b, v5.16b, v5.16b
+ add v6.16b, v6.16b, v6.16b
+ add v7.16b, v7.16b, v7.16b
+ zip1 v0.16b, v4.16b, v4.16b
+ zip2 v1.16b, v4.16b, v4.16b
+ zip1 v2.16b, v5.16b, v5.16b
+ zip2 v3.16b, v5.16b, v5.16b
+ zip1 v4.16b, v6.16b, v6.16b
+ zip2 v5.16b, v6.16b, v6.16b
+ zip1 v6.16b, v7.16b, v7.16b
+ zip2 v7.16b, v7.16b, v7.16b
+ add v0.8h, v0.8h, v31.8h
+ add v1.8h, v1.8h, v31.8h
+ add v2.8h, v2.8h, v31.8h
+ add v3.8h, v3.8h, v31.8h
+ add v4.8h, v4.8h, v31.8h
+ tbl v0.16b, {v30.16b}, v0.16b
+ add v5.8h, v5.8h, v31.8h
+ tbl v1.16b, {v30.16b}, v1.16b
+ add v6.8h, v6.8h, v31.8h
+ tbl v2.16b, {v30.16b}, v2.16b
+ add v7.8h, v7.8h, v31.8h
+ tbl v3.16b, {v30.16b}, v3.16b
+ tbl v4.16b, {v30.16b}, v4.16b
+ tbl v5.16b, {v30.16b}, v5.16b
+ st1 {v0.8h, v1.8h}, [x0], x1
+ tbl v6.16b, {v30.16b}, v6.16b
+ st1 {v2.8h, v3.8h}, [x2], x1
+ tbl v7.16b, {v30.16b}, v7.16b
+ st1 {v4.8h, v5.8h}, [x0], x1
+ st1 {v6.8h, v7.8h}, [x2], x1
+ b.gt 16b
+ ret
+320:
+ add x2, x0, x1
+ lsl x1, x1, #1
+32:
+ ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64
+ subs w5, w5, #2
+ add v4.16b, v4.16b, v4.16b
+ add v5.16b, v5.16b, v5.16b
+ add v6.16b, v6.16b, v6.16b
+ add v7.16b, v7.16b, v7.16b
+ zip1 v0.16b, v4.16b, v4.16b
+ zip2 v1.16b, v4.16b, v4.16b
+ zip1 v2.16b, v5.16b, v5.16b
+ zip2 v3.16b, v5.16b, v5.16b
+ zip1 v4.16b, v6.16b, v6.16b
+ zip2 v5.16b, v6.16b, v6.16b
+ zip1 v6.16b, v7.16b, v7.16b
+ zip2 v7.16b, v7.16b, v7.16b
+ add v0.8h, v0.8h, v31.8h
+ add v1.8h, v1.8h, v31.8h
+ add v2.8h, v2.8h, v31.8h
+ add v3.8h, v3.8h, v31.8h
+ add v4.8h, v4.8h, v31.8h
+ tbl v0.16b, {v30.16b}, v0.16b
+ add v5.8h, v5.8h, v31.8h
+ tbl v1.16b, {v30.16b}, v1.16b
+ add v6.8h, v6.8h, v31.8h
+ tbl v2.16b, {v30.16b}, v2.16b
+ add v7.8h, v7.8h, v31.8h
+ tbl v3.16b, {v30.16b}, v3.16b
+ tbl v4.16b, {v30.16b}, v4.16b
+ tbl v5.16b, {v30.16b}, v5.16b
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ tbl v6.16b, {v30.16b}, v6.16b
+ tbl v7.16b, {v30.16b}, v7.16b
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1
+ b.gt 32b
+ ret
+640:
+ add x2, x0, #64
+64:
+ ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64
+ subs w5, w5, #1
+ add v4.16b, v4.16b, v4.16b
+ add v5.16b, v5.16b, v5.16b
+ add v6.16b, v6.16b, v6.16b
+ add v7.16b, v7.16b, v7.16b
+ zip1 v0.16b, v4.16b, v4.16b
+ zip2 v1.16b, v4.16b, v4.16b
+ zip1 v2.16b, v5.16b, v5.16b
+ zip2 v3.16b, v5.16b, v5.16b
+ zip1 v4.16b, v6.16b, v6.16b
+ zip2 v5.16b, v6.16b, v6.16b
+ zip1 v6.16b, v7.16b, v7.16b
+ zip2 v7.16b, v7.16b, v7.16b
+ add v0.8h, v0.8h, v31.8h
+ add v1.8h, v1.8h, v31.8h
+ add v2.8h, v2.8h, v31.8h
+ add v3.8h, v3.8h, v31.8h
+ add v4.8h, v4.8h, v31.8h
+ tbl v0.16b, {v30.16b}, v0.16b
+ add v5.8h, v5.8h, v31.8h
+ tbl v1.16b, {v30.16b}, v1.16b
+ add v6.8h, v6.8h, v31.8h
+ tbl v2.16b, {v30.16b}, v2.16b
+ add v7.8h, v7.8h, v31.8h
+ tbl v3.16b, {v30.16b}, v3.16b
+ tbl v4.16b, {v30.16b}, v4.16b
+ tbl v5.16b, {v30.16b}, v5.16b
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ tbl v6.16b, {v30.16b}, v6.16b
+ tbl v7.16b, {v30.16b}, v7.16b
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1
+ b.gt 64b
+ ret
+
+L(pal_pred_tbl):
+ .hword L(pal_pred_tbl) - 640b
+ .hword L(pal_pred_tbl) - 320b
+ .hword L(pal_pred_tbl) - 160b
+ .hword L(pal_pred_tbl) - 80b
+ .hword L(pal_pred_tbl) - 40b
+endfunc
+
+// void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha,
+// const int bitdepth_max);
+function ipred_cfl_128_16bpc_neon, export=1
+ dup v31.8h, w7 // bitdepth_max
+ clz w9, w3
+ adr x7, L(ipred_cfl_128_tbl)
+ sub w9, w9, #26
+ ldrh w9, [x7, w9, uxtw #1]
+ urshr v0.8h, v31.8h, #1
+ dup v1.8h, w6 // alpha
+ sub x7, x7, w9, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ movi v30.8h, #0
+ br x7
+L(ipred_cfl_splat_w4):
+ ld1 {v4.8h, v5.8h}, [x5], #32
+ subs w4, w4, #4
+ smull v2.4s, v4.4h, v1.4h // diff = ac * alpha
+ smull2 v3.4s, v4.8h, v1.8h
+ smull v4.4s, v5.4h, v1.4h
+ smull2 v5.4s, v5.8h, v1.8h
+ sshr v16.4s, v2.4s, #31 // sign = diff >> 31
+ sshr v17.4s, v3.4s, #31
+ sshr v18.4s, v4.4s, #31
+ sshr v19.4s, v5.4s, #31
+ add v2.4s, v2.4s, v16.4s // diff + sign
+ add v3.4s, v3.4s, v17.4s
+ add v4.4s, v4.4s, v18.4s
+ add v5.4s, v5.4s, v19.4s
+ rshrn v2.4h, v2.4s, #6 // (diff + sign + 32) >> 6 = apply_sign()
+ rshrn2 v2.8h, v3.4s, #6
+ rshrn v3.4h, v4.4s, #6
+ rshrn2 v3.8h, v5.4s, #6
+ add v2.8h, v2.8h, v0.8h // dc + apply_sign()
+ add v3.8h, v3.8h, v0.8h
+ smax v2.8h, v2.8h, v30.8h
+ smax v3.8h, v3.8h, v30.8h
+ smin v2.8h, v2.8h, v31.8h
+ smin v3.8h, v3.8h, v31.8h
+ st1 {v2.d}[0], [x0], x1
+ st1 {v2.d}[1], [x6], x1
+ st1 {v3.d}[0], [x0], x1
+ st1 {v3.d}[1], [x6], x1
+ b.gt L(ipred_cfl_splat_w4)
+ ret
+L(ipred_cfl_splat_w8):
+ ld1 {v4.8h, v5.8h}, [x5], #32
+ subs w4, w4, #2
+ smull v2.4s, v4.4h, v1.4h // diff = ac * alpha
+ smull2 v3.4s, v4.8h, v1.8h
+ smull v4.4s, v5.4h, v1.4h
+ smull2 v5.4s, v5.8h, v1.8h
+ sshr v16.4s, v2.4s, #31 // sign = diff >> 31
+ sshr v17.4s, v3.4s, #31
+ sshr v18.4s, v4.4s, #31
+ sshr v19.4s, v5.4s, #31
+ add v2.4s, v2.4s, v16.4s // diff + sign
+ add v3.4s, v3.4s, v17.4s
+ add v4.4s, v4.4s, v18.4s
+ add v5.4s, v5.4s, v19.4s
+ rshrn v2.4h, v2.4s, #6 // (diff + sign + 32) >> 6 = apply_sign()
+ rshrn2 v2.8h, v3.4s, #6
+ rshrn v3.4h, v4.4s, #6
+ rshrn2 v3.8h, v5.4s, #6
+ add v2.8h, v2.8h, v0.8h // dc + apply_sign()
+ add v3.8h, v3.8h, v0.8h
+ smax v2.8h, v2.8h, v30.8h
+ smax v3.8h, v3.8h, v30.8h
+ smin v2.8h, v2.8h, v31.8h
+ smin v3.8h, v3.8h, v31.8h
+ st1 {v2.8h}, [x0], x1
+ st1 {v3.8h}, [x6], x1
+ b.gt L(ipred_cfl_splat_w8)
+ ret
+L(ipred_cfl_splat_w16):
+ add x7, x5, w3, uxtw #1
+ sub x1, x1, w3, uxtw #1
+ mov w9, w3
+1:
+ ld1 {v2.8h, v3.8h}, [x5], #32
+ ld1 {v4.8h, v5.8h}, [x7], #32
+ subs w3, w3, #16
+ smull v16.4s, v2.4h, v1.4h // diff = ac * alpha
+ smull2 v17.4s, v2.8h, v1.8h
+ smull v18.4s, v3.4h, v1.4h
+ smull2 v19.4s, v3.8h, v1.8h
+ smull v2.4s, v4.4h, v1.4h
+ smull2 v3.4s, v4.8h, v1.8h
+ smull v4.4s, v5.4h, v1.4h
+ smull2 v5.4s, v5.8h, v1.8h
+ sshr v20.4s, v16.4s, #31 // sign = diff >> 31
+ sshr v21.4s, v17.4s, #31
+ sshr v22.4s, v18.4s, #31
+ sshr v23.4s, v19.4s, #31
+ sshr v24.4s, v2.4s, #31
+ sshr v25.4s, v3.4s, #31
+ sshr v26.4s, v4.4s, #31
+ sshr v27.4s, v5.4s, #31
+ add v16.4s, v16.4s, v20.4s // diff + sign
+ add v17.4s, v17.4s, v21.4s
+ add v18.4s, v18.4s, v22.4s
+ add v19.4s, v19.4s, v23.4s
+ add v2.4s, v2.4s, v24.4s
+ add v3.4s, v3.4s, v25.4s
+ add v4.4s, v4.4s, v26.4s
+ add v5.4s, v5.4s, v27.4s
+ rshrn v16.4h, v16.4s, #6 // (diff + sign + 32) >> 6 = apply_sign()
+ rshrn2 v16.8h, v17.4s, #6
+ rshrn v17.4h, v18.4s, #6
+ rshrn2 v17.8h, v19.4s, #6
+ rshrn v6.4h, v2.4s, #6
+ rshrn2 v6.8h, v3.4s, #6
+ rshrn v7.4h, v4.4s, #6
+ rshrn2 v7.8h, v5.4s, #6
+ add v2.8h, v16.8h, v0.8h // dc + apply_sign()
+ add v3.8h, v17.8h, v0.8h
+ add v4.8h, v6.8h, v0.8h
+ add v5.8h, v7.8h, v0.8h
+ smax v2.8h, v2.8h, v30.8h
+ smax v3.8h, v3.8h, v30.8h
+ smax v4.8h, v4.8h, v30.8h
+ smax v5.8h, v5.8h, v30.8h
+ smin v2.8h, v2.8h, v31.8h
+ smin v3.8h, v3.8h, v31.8h
+ smin v4.8h, v4.8h, v31.8h
+ smin v5.8h, v5.8h, v31.8h
+ st1 {v2.8h, v3.8h}, [x0], #32
+ st1 {v4.8h, v5.8h}, [x6], #32
+ b.gt 1b
+ subs w4, w4, #2
+ add x5, x5, w9, uxtw #1
+ add x7, x7, w9, uxtw #1
+ add x0, x0, x1
+ add x6, x6, x1
+ mov w3, w9
+ b.gt 1b
+ ret
+
+L(ipred_cfl_128_tbl):
+L(ipred_cfl_splat_tbl):
+ .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
+ .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
+ .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8)
+ .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4)
+endfunc
+
+// void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha,
+// const int bitdepth_max);
+function ipred_cfl_top_16bpc_neon, export=1
+ dup v31.8h, w7 // bitdepth_max
+ clz w9, w3
+ adr x7, L(ipred_cfl_top_tbl)
+ sub w9, w9, #26
+ ldrh w9, [x7, w9, uxtw #1]
+ dup v1.8h, w6 // alpha
+ add x2, x2, #2
+ sub x7, x7, w9, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ movi v30.8h, #0
+ br x7
+4:
+ ld1 {v0.4h}, [x2]
+ addv h0, v0.4h
+ urshr v0.4h, v0.4h, #2
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w4)
+8:
+ ld1 {v0.8h}, [x2]
+ addv h0, v0.8h
+ urshr v0.4h, v0.4h, #3
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w8)
+16:
+ ld1 {v2.8h, v3.8h}, [x2]
+ addp v0.8h, v2.8h, v3.8h
+ addv h0, v0.8h
+ urshr v0.4h, v0.4h, #4
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w16)
+32:
+ ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v0.8h, v2.8h, v4.8h
+ uaddlv s0, v0.8h
+ rshrn v0.4h, v0.4s, #5
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w16)
+
+L(ipred_cfl_top_tbl):
+ .hword L(ipred_cfl_top_tbl) - 32b
+ .hword L(ipred_cfl_top_tbl) - 16b
+ .hword L(ipred_cfl_top_tbl) - 8b
+ .hword L(ipred_cfl_top_tbl) - 4b
+endfunc
+
+// void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha,
+// const int bitdepth_max);
+function ipred_cfl_left_16bpc_neon, export=1
+ dup v31.8h, w7 // bitdepth_max
+ sub x2, x2, w4, uxtw #1
+ clz w9, w3
+ clz w8, w4
+ adr x10, L(ipred_cfl_splat_tbl)
+ adr x7, L(ipred_cfl_left_tbl)
+ sub w9, w9, #26
+ sub w8, w8, #26
+ ldrh w9, [x10, w9, uxtw #1]
+ ldrh w8, [x7, w8, uxtw #1]
+ dup v1.8h, w6 // alpha
+ sub x9, x10, w9, uxtw
+ sub x7, x7, w8, uxtw
+ add x6, x0, x1
+ lsl x1, x1, #1
+ movi v30.8h, #0
+ br x7
+
+L(ipred_cfl_left_h4):
+ ld1 {v0.4h}, [x2]
+ addv h0, v0.4h
+ urshr v0.4h, v0.4h, #2
+ dup v0.8h, v0.h[0]
+ br x9
+
+L(ipred_cfl_left_h8):
+ ld1 {v0.8h}, [x2]
+ addv h0, v0.8h
+ urshr v0.4h, v0.4h, #3
+ dup v0.8h, v0.h[0]
+ br x9
+
+L(ipred_cfl_left_h16):
+ ld1 {v2.8h, v3.8h}, [x2]
+ addp v0.8h, v2.8h, v3.8h
+ addv h0, v0.8h
+ urshr v0.4h, v0.4h, #4
+ dup v0.8h, v0.h[0]
+ br x9
+
+L(ipred_cfl_left_h32):
+ ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v0.8h, v2.8h, v4.8h
+ uaddlv s0, v0.8h
+ rshrn v0.4h, v0.4s, #5
+ dup v0.8h, v0.h[0]
+ br x9
+
+L(ipred_cfl_left_tbl):
+ .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32)
+ .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16)
+ .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8)
+ .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4)
+endfunc
+
+// void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *const topleft,
+// const int width, const int height,
+// const int16_t *ac, const int alpha,
+// const int bitdepth_max);
+function ipred_cfl_16bpc_neon, export=1
+ dup v31.8h, w7 // bitdepth_max
+ sub x2, x2, w4, uxtw #1
+ add w8, w3, w4 // width + height
+ dup v1.8h, w6 // alpha
+ clz w9, w3
+ clz w6, w4
+ dup v16.4s, w8 // width + height
+ adr x7, L(ipred_cfl_tbl)
+ rbit w8, w8 // rbit(width + height)
+ sub w9, w9, #22 // 22 leading bits, minus table offset 4
+ sub w6, w6, #26
+ clz w8, w8 // ctz(width + height)
+ ldrh w9, [x7, w9, uxtw #1]
+ ldrh w6, [x7, w6, uxtw #1]
+ neg w8, w8 // -ctz(width + height)
+ sub x9, x7, w9, uxtw
+ sub x7, x7, w6, uxtw
+ ushr v16.4s, v16.4s, #1 // (width + height) >> 1
+ dup v17.4s, w8 // -ctz(width + height)
+ add x6, x0, x1
+ lsl x1, x1, #1
+ movi v30.8h, #0
+ br x7
+
+L(ipred_cfl_h4):
+ ld1 {v0.4h}, [x2], #8
+ uaddlv s0, v0.4h
+ br x9
+L(ipred_cfl_w4):
+ add x2, x2, #2
+ ld1 {v2.4h}, [x2]
+ add v0.2s, v0.2s, v16.2s
+ uaddlv s2, v2.4h
+ cmp w4, #4
+ add v0.2s, v0.2s, v2.2s
+ ushl v0.2s, v0.2s, v17.2s
+ b.eq 1f
+ // h = 8/16
+ cmp w4, #16
+ mov w16, #0x6667
+ mov w17, #0xAAAB
+ csel w16, w16, w17, eq
+ dup v16.2s, w16
+ mul v0.2s, v0.2s, v16.2s
+ ushr v0.2s, v0.2s, #17
+1:
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w4)
+
+L(ipred_cfl_h8):
+ ld1 {v0.8h}, [x2], #16
+ uaddlv s0, v0.8h
+ br x9
+L(ipred_cfl_w8):
+ add x2, x2, #2
+ ld1 {v2.8h}, [x2]
+ add v0.2s, v0.2s, v16.2s
+ uaddlv s2, v2.8h
+ cmp w4, #8
+ add v0.2s, v0.2s, v2.2s
+ ushl v0.2s, v0.2s, v17.2s
+ b.eq 1f
+ // h = 4/16/32
+ cmp w4, #32
+ mov w16, #0x6667
+ mov w17, #0xAAAB
+ csel w16, w16, w17, eq
+ dup v16.2s, w16
+ mul v0.2s, v0.2s, v16.2s
+ ushr v0.2s, v0.2s, #17
+1:
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w8)
+
+L(ipred_cfl_h16):
+ ld1 {v2.8h, v3.8h}, [x2], #32
+ addp v0.8h, v2.8h, v3.8h
+ uaddlv s0, v0.8h
+ br x9
+L(ipred_cfl_w16):
+ add x2, x2, #2
+ ld1 {v2.8h, v3.8h}, [x2]
+ add v0.2s, v0.2s, v16.2s
+ addp v2.8h, v2.8h, v3.8h
+ uaddlv s2, v2.8h
+ cmp w4, #16
+ add v0.2s, v0.2s, v2.2s
+ ushl v0.2s, v0.2s, v17.2s
+ b.eq 1f
+ // h = 4/8/32
+ tst w4, #(32+16+8) // 16 added to make a consecutive bitmask
+ mov w16, #0x6667
+ mov w17, #0xAAAB
+ csel w16, w16, w17, eq
+ dup v16.2s, w16
+ mul v0.2s, v0.2s, v16.2s
+ ushr v0.2s, v0.2s, #17
+1:
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w16)
+
+L(ipred_cfl_h32):
+ ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2], #64
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v0.8h, v2.8h, v4.8h
+ uaddlv s0, v0.8h
+ br x9
+L(ipred_cfl_w32):
+ add x2, x2, #2
+ ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
+ add v0.4s, v0.4s, v16.4s
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v2.8h, v2.8h, v4.8h
+ cmp w4, #32
+ uaddlv s2, v2.8h
+ add v0.2s, v0.2s, v2.2s
+ ushl v0.2s, v0.2s, v17.2s
+ b.eq 1f
+ // h = 8/16
+ cmp w4, #8
+ mov w16, #0x6667
+ mov w17, #0xAAAB
+ csel w16, w16, w17, eq
+ dup v16.2s, w16
+ mul v0.2s, v0.2s, v16.2s
+ ushr v0.2s, v0.2s, #17
+1:
+ dup v0.8h, v0.h[0]
+ b L(ipred_cfl_splat_w16)
+
+L(ipred_cfl_tbl):
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8)
+ .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4)
+endfunc
+
+// void cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
+function ipred_cfl_ac_420_16bpc_neon, export=1
+ clz w8, w5
+ lsl w4, w4, #2
+ adr x7, L(ipred_cfl_ac_420_tbl)
+ sub w8, w8, #27
+ ldrh w8, [x7, w8, uxtw #1]
+ movi v24.4s, #0
+ movi v25.4s, #0
+ movi v26.4s, #0
+ movi v27.4s, #0
+ sub x7, x7, w8, uxtw
+ sub w8, w6, w4 // height - h_pad
+ rbit w9, w5 // rbit(width)
+ rbit w10, w6 // rbit(height)
+ clz w9, w9 // ctz(width)
+ clz w10, w10 // ctz(height)
+ add w9, w9, w10 // log2sz
+ add x10, x1, x2
+ dup v31.4s, w9
+ lsl x2, x2, #1
+ neg v31.4s, v31.4s // -log2sz
+ br x7
+
+L(ipred_cfl_ac_420_w4):
+1: // Copy and subsample input
+ ld1 {v0.8h}, [x1], x2
+ ld1 {v1.8h}, [x10], x2
+ ld1 {v2.8h}, [x1], x2
+ ld1 {v3.8h}, [x10], x2
+ addp v0.8h, v0.8h, v2.8h
+ addp v1.8h, v1.8h, v3.8h
+ add v0.8h, v0.8h, v1.8h
+ shl v0.8h, v0.8h, #1
+ subs w8, w8, #2
+ st1 {v0.8h}, [x0], #16
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ b.gt 1b
+ trn2 v1.2d, v0.2d, v0.2d
+ trn2 v0.2d, v0.2d, v0.2d
+L(ipred_cfl_ac_420_w4_hpad):
+ cbz w4, 3f
+2: // Vertical padding (h_pad > 0)
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h}, [x0], #32
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ b.gt 2b
+3:
+L(ipred_cfl_ac_420_w4_calc_subtract_dc):
+ // Aggregate the sums
+ add v24.4s, v24.4s, v25.4s
+ add v26.4s, v26.4s, v27.4s
+ add v0.4s, v24.4s, v26.4s
+ addv s0, v0.4s // sum
+ sub x0, x0, w6, uxtw #3
+ urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz
+ dup v4.8h, v4.h[0]
+6: // Subtract dc from ac
+ ld1 {v0.8h, v1.8h}, [x0]
+ subs w6, w6, #4
+ sub v0.8h, v0.8h, v4.8h
+ sub v1.8h, v1.8h, v4.8h
+ st1 {v0.8h, v1.8h}, [x0], #32
+ b.gt 6b
+ ret
+
+L(ipred_cfl_ac_420_w8):
+ cbnz w3, L(ipred_cfl_ac_420_w8_wpad)
+1: // Copy and subsample input, without padding
+ ld1 {v0.8h, v1.8h}, [x1], x2
+ ld1 {v2.8h, v3.8h}, [x10], x2
+ ld1 {v4.8h, v5.8h}, [x1], x2
+ addp v0.8h, v0.8h, v1.8h
+ ld1 {v6.8h, v7.8h}, [x10], x2
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v6.8h, v6.8h, v7.8h
+ add v0.8h, v0.8h, v2.8h
+ add v4.8h, v4.8h, v6.8h
+ shl v0.8h, v0.8h, #1
+ shl v1.8h, v4.8h, #1
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h}, [x0], #32
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ b.gt 1b
+ mov v0.16b, v1.16b
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_420_w8_wpad):
+1: // Copy and subsample input, padding 4
+ ld1 {v0.8h}, [x1], x2
+ ld1 {v1.8h}, [x10], x2
+ ld1 {v2.8h}, [x1], x2
+ ld1 {v3.8h}, [x10], x2
+ addp v0.8h, v0.8h, v2.8h
+ addp v1.8h, v1.8h, v3.8h
+ add v0.8h, v0.8h, v1.8h
+ shl v0.8h, v0.8h, #1
+ dup v1.4h, v0.h[3]
+ dup v3.4h, v0.h[7]
+ trn2 v2.2d, v0.2d, v0.2d
+ subs w8, w8, #2
+ st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw v25.4s, v25.4s, v1.4h
+ uaddw v26.4s, v26.4s, v2.4h
+ uaddw v27.4s, v27.4s, v3.4h
+ b.gt 1b
+ trn1 v0.2d, v2.2d, v3.2d
+ trn1 v1.2d, v2.2d, v3.2d
+
+L(ipred_cfl_ac_420_w8_hpad):
+ cbz w4, 3f
+2: // Vertical padding (h_pad > 0)
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h}, [x0], #32
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ st1 {v0.8h, v1.8h}, [x0], #32
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ b.gt 2b
+3:
+
+ // Double the height and reuse the w4 summing/subtracting
+ lsl w6, w6, #1
+ lsl w9, w9, #1
+ b L(ipred_cfl_ac_420_w4_calc_subtract_dc)
+
+L(ipred_cfl_ac_420_w16):
+ adr x7, L(ipred_cfl_ac_420_w16_tbl)
+ ldrh w3, [x7, w3, uxtw #1]
+ sub x7, x7, w3, uxtw
+ br x7
+
+L(ipred_cfl_ac_420_w16_wpad0):
+1: // Copy and subsample input, without padding
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2
+ addp v0.8h, v0.8h, v1.8h
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v6.8h, v6.8h, v7.8h
+ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x1], x2
+ add v0.8h, v0.8h, v4.8h
+ ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x10], x2
+ add v2.8h, v2.8h, v6.8h
+ addp v16.8h, v16.8h, v17.8h
+ addp v18.8h, v18.8h, v19.8h
+ addp v20.8h, v20.8h, v21.8h
+ addp v22.8h, v22.8h, v23.8h
+ add v16.8h, v16.8h, v20.8h
+ add v18.8h, v18.8h, v22.8h
+ shl v0.8h, v0.8h, #1
+ shl v1.8h, v2.8h, #1
+ shl v2.8h, v16.8h, #1
+ shl v3.8h, v18.8h, #1
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad1):
+1: // Copy and subsample input, padding 4
+ ldr q2, [x1, #32]
+ ld1 {v0.8h, v1.8h}, [x1], x2
+ ldr q5, [x10, #32]
+ ld1 {v3.8h, v4.8h}, [x10], x2
+ addp v2.8h, v2.8h, v2.8h
+ addp v0.8h, v0.8h, v1.8h
+ addp v5.8h, v5.8h, v5.8h
+ addp v3.8h, v3.8h, v4.8h
+ ldr q18, [x1, #32]
+ add v2.4h, v2.4h, v5.4h
+ ld1 {v16.8h, v17.8h}, [x1], x2
+ add v0.8h, v0.8h, v3.8h
+ ldr q21, [x10, #32]
+ ld1 {v19.8h, v20.8h}, [x10], x2
+ addp v18.8h, v18.8h, v18.8h
+ addp v16.8h, v16.8h, v17.8h
+ addp v21.8h, v21.8h, v21.8h
+ addp v19.8h, v19.8h, v20.8h
+ add v18.4h, v18.4h, v21.4h
+ add v16.8h, v16.8h, v19.8h
+ shl v1.4h, v2.4h, #1
+ shl v0.8h, v0.8h, #1
+ shl v3.4h, v18.4h, #1
+ shl v2.8h, v16.8h, #1
+ dup v4.4h, v1.h[3]
+ dup v5.4h, v3.h[3]
+ trn1 v1.2d, v1.2d, v4.2d
+ trn1 v3.2d, v3.2d, v5.2d
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad2):
+1: // Copy and subsample input, padding 8
+ ld1 {v0.8h, v1.8h}, [x1], x2
+ ld1 {v2.8h, v3.8h}, [x10], x2
+ ld1 {v4.8h, v5.8h}, [x1], x2
+ addp v0.8h, v0.8h, v1.8h
+ ld1 {v6.8h, v7.8h}, [x10], x2
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v6.8h, v6.8h, v7.8h
+ add v0.8h, v0.8h, v2.8h
+ add v4.8h, v4.8h, v6.8h
+ shl v0.8h, v0.8h, #1
+ shl v2.8h, v4.8h, #1
+ dup v1.8h, v0.h[7]
+ dup v3.8h, v2.h[7]
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad3):
+1: // Copy and subsample input, padding 12
+ ld1 {v0.8h}, [x1], x2
+ ld1 {v2.8h}, [x10], x2
+ ld1 {v4.8h}, [x1], x2
+ ld1 {v6.8h}, [x10], x2
+ addp v0.8h, v0.8h, v4.8h
+ addp v2.8h, v2.8h, v6.8h
+ add v0.8h, v0.8h, v2.8h
+ shl v0.8h, v0.8h, #1
+ dup v1.8h, v0.h[3]
+ dup v3.8h, v0.h[7]
+ trn2 v2.2d, v0.2d, v3.2d
+ trn1 v0.2d, v0.2d, v1.2d
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_hpad):
+ cbz w4, 3f
+2: // Vertical padding (h_pad > 0)
+ subs w4, w4, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 2b
+3:
+
+ // Quadruple the height and reuse the w4 summing/subtracting
+ lsl w6, w6, #2
+ lsl w9, w9, #2
+ b L(ipred_cfl_ac_420_w4_calc_subtract_dc)
+
+L(ipred_cfl_ac_420_tbl):
+ .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16)
+ .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8)
+ .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4)
+ .hword 0
+
+L(ipred_cfl_ac_420_w16_tbl):
+ .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0)
+ .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1)
+ .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2)
+ .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3)
+endfunc
+
+// void cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx,
+// const ptrdiff_t stride, const int w_pad,
+// const int h_pad, const int cw, const int ch);
+function ipred_cfl_ac_422_16bpc_neon, export=1
+ clz w8, w5
+ lsl w4, w4, #2
+ adr x7, L(ipred_cfl_ac_422_tbl)
+ sub w8, w8, #27
+ ldrh w8, [x7, w8, uxtw #1]
+ movi v24.4s, #0
+ movi v25.4s, #0
+ movi v26.4s, #0
+ movi v27.4s, #0
+ sub x7, x7, w8, uxtw
+ sub w8, w6, w4 // height - h_pad
+ rbit w9, w5 // rbit(width)
+ rbit w10, w6 // rbit(height)
+ clz w9, w9 // ctz(width)
+ clz w10, w10 // ctz(height)
+ add w9, w9, w10 // log2sz
+ add x10, x1, x2
+ dup v31.4s, w9
+ lsl x2, x2, #1
+ neg v31.4s, v31.4s // -log2sz
+ br x7
+
+L(ipred_cfl_ac_422_w4):
+1: // Copy and subsample input
+ ld1 {v0.8h}, [x1], x2
+ ld1 {v1.8h}, [x10], x2
+ ld1 {v2.8h}, [x1], x2
+ ld1 {v3.8h}, [x10], x2
+ addp v0.8h, v0.8h, v1.8h
+ addp v2.8h, v2.8h, v3.8h
+ shl v0.8h, v0.8h, #2
+ shl v1.8h, v2.8h, #2
+ subs w8, w8, #4
+ st1 {v0.8h, v1.8h}, [x0], #32
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ b.gt 1b
+ trn2 v0.2d, v1.2d, v1.2d
+ trn2 v1.2d, v1.2d, v1.2d
+ b L(ipred_cfl_ac_420_w4_hpad)
+
+L(ipred_cfl_ac_422_w8):
+ cbnz w3, L(ipred_cfl_ac_422_w8_wpad)
+1: // Copy and subsample input, without padding
+ ld1 {v0.8h, v1.8h}, [x1], x2
+ ld1 {v2.8h, v3.8h}, [x10], x2
+ ld1 {v4.8h, v5.8h}, [x1], x2
+ addp v0.8h, v0.8h, v1.8h
+ ld1 {v6.8h, v7.8h}, [x10], x2
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v6.8h, v6.8h, v7.8h
+ shl v0.8h, v0.8h, #2
+ shl v1.8h, v2.8h, #2
+ shl v2.8h, v4.8h, #2
+ shl v3.8h, v6.8h, #2
+ subs w8, w8, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v3.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_422_w8_wpad):
+1: // Copy and subsample input, padding 4
+ ld1 {v0.8h}, [x1], x2
+ ld1 {v1.8h}, [x10], x2
+ ld1 {v2.8h}, [x1], x2
+ ld1 {v3.8h}, [x10], x2
+ addp v0.8h, v0.8h, v1.8h
+ addp v2.8h, v2.8h, v3.8h
+ shl v0.8h, v0.8h, #2
+ shl v2.8h, v2.8h, #2
+ dup v4.4h, v0.h[3]
+ dup v5.8h, v0.h[7]
+ dup v6.4h, v2.h[3]
+ dup v7.8h, v2.h[7]
+ trn2 v1.2d, v0.2d, v5.2d
+ trn1 v0.2d, v0.2d, v4.2d
+ trn2 v3.2d, v2.2d, v7.2d
+ trn1 v2.2d, v2.2d, v6.2d
+ subs w8, w8, #4
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v3.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_422_w16):
+ adr x7, L(ipred_cfl_ac_422_w16_tbl)
+ ldrh w3, [x7, w3, uxtw #1]
+ sub x7, x7, w3, uxtw
+ br x7
+
+L(ipred_cfl_ac_422_w16_wpad0):
+1: // Copy and subsample input, without padding
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2
+ addp v0.8h, v0.8h, v1.8h
+ addp v2.8h, v2.8h, v3.8h
+ addp v4.8h, v4.8h, v5.8h
+ addp v6.8h, v6.8h, v7.8h
+ shl v0.8h, v0.8h, #2
+ shl v1.8h, v2.8h, #2
+ shl v2.8h, v4.8h, #2
+ shl v3.8h, v6.8h, #2
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad1):
+1: // Copy and subsample input, padding 4
+ ldr q2, [x1, #32]
+ ld1 {v0.8h, v1.8h}, [x1], x2
+ ldr q6, [x10, #32]
+ ld1 {v4.8h, v5.8h}, [x10], x2
+ addp v2.8h, v2.8h, v2.8h
+ addp v0.8h, v0.8h, v1.8h
+ addp v6.8h, v6.8h, v6.8h
+ addp v4.8h, v4.8h, v5.8h
+ shl v1.4h, v2.4h, #2
+ shl v0.8h, v0.8h, #2
+ shl v3.4h, v6.4h, #2
+ shl v2.8h, v4.8h, #2
+ dup v4.4h, v1.h[3]
+ dup v5.4h, v3.h[3]
+ trn1 v1.2d, v1.2d, v4.2d
+ trn1 v3.2d, v3.2d, v5.2d
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad2):
+1: // Copy and subsample input, padding 8
+ ld1 {v0.8h, v1.8h}, [x1], x2
+ ld1 {v2.8h, v3.8h}, [x10], x2
+ addp v0.8h, v0.8h, v1.8h
+ addp v2.8h, v2.8h, v3.8h
+ shl v0.8h, v0.8h, #2
+ shl v2.8h, v2.8h, #2
+ dup v1.8h, v0.h[7]
+ dup v3.8h, v2.h[7]
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad3):
+1: // Copy and subsample input, padding 12
+ ld1 {v0.8h}, [x1], x2
+ ld1 {v2.8h}, [x10], x2
+ addp v0.8h, v0.8h, v0.8h
+ addp v2.8h, v2.8h, v2.8h
+ shl v0.4h, v0.4h, #2
+ shl v2.4h, v2.4h, #2
+ dup v1.8h, v0.h[3]
+ dup v3.8h, v2.h[3]
+ trn1 v0.2d, v0.2d, v1.2d
+ trn1 v2.2d, v2.2d, v3.2d
+ subs w8, w8, #2
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ uaddw v24.4s, v24.4s, v0.4h
+ uaddw2 v25.4s, v25.4s, v0.8h
+ uaddw v26.4s, v26.4s, v1.4h
+ uaddw2 v27.4s, v27.4s, v1.8h
+ uaddw v24.4s, v24.4s, v2.4h
+ uaddw2 v25.4s, v25.4s, v2.8h
+ uaddw v26.4s, v26.4s, v3.4h
+ uaddw2 v27.4s, v27.4s, v3.8h
+ b.gt 1b
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ b L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_tbl):
+ .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16)
+ .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8)
+ .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4)
+ .hword 0
+
+L(ipred_cfl_ac_422_w16_tbl):
+ .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0)
+ .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1)
+ .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2)
+ .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3)
+endfunc
--- a/src/arm/ipred_init_tmpl.c
+++ b/src/arm/ipred_init_tmpl.c
@@ -54,7 +54,7 @@
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
-#if BITDEPTH == 8
+#if BITDEPTH == 8 || ARCH_AARCH64
c->intra_pred[DC_PRED] = BF(dav1d_ipred_dc, neon);
c->intra_pred[DC_128_PRED] = BF(dav1d_ipred_dc_128, neon);
c->intra_pred[TOP_DC_PRED] = BF(dav1d_ipred_dc_top, neon);
--- a/src/meson.build
+++ b/src/meson.build
@@ -120,6 +120,7 @@
if dav1d_bitdepths.contains('16')
libdav1d_sources += files(
'arm/64/cdef16.S',
+ 'arm/64/ipred16.S',
'arm/64/loopfilter16.S',
'arm/64/looprestoration16.S',
'arm/64/mc16.S',