shithub: dav1d

ref: 41a58e644010c51d0c1a0b4380bc974295718a6f
parent: 59c31e7737f4b03047acd851102597c15e185d33
author: Martin Storsjö <[email protected]>
date: Sun Mar 15 19:57:30 EDT 2020

arm64: ipred: Add NEON implementation of ipred for 16 bpc

The FILTER_PRED function is templated, with separate instantiations
for 10 and 12 bit. (They're switched between using a runtime check on
entry to the function.)
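
As a minimal sketch of that dispatch pattern (hypothetical names and a
trimmed-down signature, not dav1d's actual glue code):

    #include <stddef.h>
    #include <stdint.h>

    /* The filter kernel is expanded once per bitdepth; the exported
     * 16 bpc entry point picks an instantiation at runtime. */
    static void filter_10bpc(uint16_t *dst, ptrdiff_t stride) { (void)dst; (void)stride; }
    static void filter_12bpc(uint16_t *dst, ptrdiff_t stride) { (void)dst; (void)stride; }

    void ipred_filter_16bpc(uint16_t *dst, ptrdiff_t stride,
                            const int bitdepth_max)
    {
        /* 10-bit streams have bitdepth_max == 0x3ff (1023). */
        if (bitdepth_max == 0x3ff)
            filter_10bpc(dst, stride);
        else
            filter_12bpc(dst, stride);
    }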

--- /dev/null
+++ b/src/arm/64/ipred16.S
@@ -1,0 +1,2834 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// void ipred_dc_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                              const pixel *const topleft,
+//                              const int width, const int height, const int a,
+//                              const int max_width, const int max_height,
+//                              const int bitdepth_max);
+function ipred_dc_128_16bpc_neon, export=1
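+        // Fill the block with the mid value (bitdepth_max + 1) >> 1;
+        // w8 is bitdepth_max (the 9th argument, passed on the stack),
+        // and clz(width) - 25 indexes the width jump table below.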
+        ldr             w8,  [sp]
+        clz             w3,  w3
+        adr             x5,  L(ipred_dc_128_tbl)
+        sub             w3,  w3,  #25
+        ldrh            w3,  [x5, w3, uxtw #1]
+        dup             v0.8h,   w8
+        sub             x5,  x5,  w3, uxtw
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        urshr           v0.8h,   v0.8h,  #1
+        br              x5
+4:
+        st1             {v0.4h},  [x0], x1
+        st1             {v0.4h},  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.4h},  [x0], x1
+        st1             {v0.4h},  [x6], x1
+        b.gt            4b
+        ret
+8:
+        st1             {v0.8h},  [x0], x1
+        st1             {v0.8h},  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h},  [x0], x1
+        st1             {v0.8h},  [x6], x1
+        b.gt            8b
+        ret
+160:
+        mov             v1.16b,  v0.16b
+16:
+        st1             {v0.8h, v1.8h}, [x0], x1
+        st1             {v0.8h, v1.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h}, [x0], x1
+        st1             {v0.8h, v1.8h}, [x6], x1
+        b.gt            16b
+        ret
+320:
+        mov             v1.16b,  v0.16b
+        mov             v2.16b,  v0.16b
+        mov             v3.16b,  v0.16b
+32:
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        b.gt            32b
+        ret
+640:
+        mov             v1.16b,  v0.16b
+        mov             v2.16b,  v0.16b
+        mov             v3.16b,  v0.16b
+        sub             x1,  x1,  #64
+64:
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        b.gt            64b
+        ret
+
+L(ipred_dc_128_tbl):
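+        // 16-bit offsets from this label back to each width's code path,
+        // ordered w=64 down to w=4.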
+        .hword L(ipred_dc_128_tbl) - 640b
+        .hword L(ipred_dc_128_tbl) - 320b
+        .hword L(ipred_dc_128_tbl) - 160b
+        .hword L(ipred_dc_128_tbl) -   8b
+        .hword L(ipred_dc_128_tbl) -   4b
+endfunc
+
+// void ipred_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                         const pixel *const topleft,
+//                         const int width, const int height, const int a,
+//                         const int max_width, const int max_height);
+function ipred_v_16bpc_neon, export=1
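+        // Vertical prediction: copy the row above the block into every
+        // output row.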
+        clz             w3,  w3
+        adr             x5,  L(ipred_v_tbl)
+        sub             w3,  w3,  #25
+        ldrh            w3,  [x5, w3, uxtw #1]
+        add             x2,  x2,  #2
+        sub             x5,  x5,  w3, uxtw
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        br              x5
+40:
+        ld1             {v0.4h},  [x2]
+4:
+        st1             {v0.4h},  [x0], x1
+        st1             {v0.4h},  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.4h},  [x0], x1
+        st1             {v0.4h},  [x6], x1
+        b.gt            4b
+        ret
+80:
+        ld1             {v0.8h},  [x2]
+8:
+        st1             {v0.8h},  [x0], x1
+        st1             {v0.8h},  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h},  [x0], x1
+        st1             {v0.8h},  [x6], x1
+        b.gt            8b
+        ret
+160:
+        ld1             {v0.8h, v1.8h}, [x2]
+16:
+        st1             {v0.8h, v1.8h}, [x0], x1
+        st1             {v0.8h, v1.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h}, [x0], x1
+        st1             {v0.8h, v1.8h}, [x6], x1
+        b.gt            16b
+        ret
+320:
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
+32:
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        b.gt            32b
+        ret
+640:
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
+        sub             x1,  x1,  #64
+        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
+64:
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
+        b.gt            64b
+        ret
+
+L(ipred_v_tbl):
+        .hword L(ipred_v_tbl) - 640b
+        .hword L(ipred_v_tbl) - 320b
+        .hword L(ipred_v_tbl) - 160b
+        .hword L(ipred_v_tbl) -  80b
+        .hword L(ipred_v_tbl) -  40b
+endfunc
+
+// void ipred_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                         const pixel *const topleft,
+//                         const int width, const int height, const int a,
+//                         const int max_width, const int max_height);
+function ipred_h_16bpc_neon, export=1
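+        // Horizontal prediction: broadcast each left-edge pixel across
+        // its row, reading four left pixels per ld4r and stepping down
+        // the block (backwards in memory, hence x7 = -8).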
+        clz             w3,  w3
+        adr             x5,  L(ipred_h_tbl)
+        sub             w3,  w3,  #25
+        ldrh            w3,  [x5, w3, uxtw #1]
+        sub             x2,  x2,  #8
+        sub             x5,  x5,  w3, uxtw
+        mov             x7,  #-8
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        br              x5
+4:
+        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
+        st1             {v3.4h},  [x0], x1
+        st1             {v2.4h},  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v1.4h},  [x0], x1
+        st1             {v0.4h},  [x6], x1
+        b.gt            4b
+        ret
+8:
+        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
+        st1             {v3.8h},  [x0], x1
+        st1             {v2.8h},  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v1.8h},  [x0], x1
+        st1             {v0.8h},  [x6], x1
+        b.gt            8b
+        ret
+16:
+        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
+        str             q3,  [x0, #16]
+        str             q2,  [x6, #16]
+        st1             {v3.8h}, [x0], x1
+        st1             {v2.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        str             q1,  [x0, #16]
+        str             q0,  [x6, #16]
+        st1             {v1.8h}, [x0], x1
+        st1             {v0.8h}, [x6], x1
+        b.gt            16b
+        ret
+32:
+        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
+        str             q3,  [x0, #16]
+        str             q2,  [x6, #16]
+        stp             q3,  q3,  [x0, #32]
+        stp             q2,  q2,  [x6, #32]
+        st1             {v3.8h}, [x0], x1
+        st1             {v2.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        str             q1,  [x0, #16]
+        str             q0,  [x6, #16]
+        stp             q1,  q1,  [x0, #32]
+        stp             q0,  q0,  [x6, #32]
+        st1             {v1.8h}, [x0], x1
+        st1             {v0.8h}, [x6], x1
+        b.gt            32b
+        ret
+64:
+        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
+        str             q3,  [x0, #16]
+        str             q2,  [x6, #16]
+        stp             q3,  q3,  [x0, #32]
+        stp             q2,  q2,  [x6, #32]
+        stp             q3,  q3,  [x0, #64]
+        stp             q2,  q2,  [x6, #64]
+        stp             q3,  q3,  [x0, #96]
+        stp             q2,  q2,  [x6, #96]
+        st1             {v3.8h}, [x0], x1
+        st1             {v2.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        str             q1,  [x0, #16]
+        str             q0,  [x6, #16]
+        stp             q1,  q1,  [x0, #32]
+        stp             q0,  q0,  [x6, #32]
+        stp             q1,  q1,  [x0, #64]
+        stp             q0,  q0,  [x6, #64]
+        stp             q1,  q1,  [x0, #96]
+        stp             q0,  q0,  [x6, #96]
+        st1             {v1.8h}, [x0], x1
+        st1             {v0.8h}, [x6], x1
+        b.gt            64b
+        ret
+
+L(ipred_h_tbl):
+        .hword L(ipred_h_tbl) - 64b
+        .hword L(ipred_h_tbl) - 32b
+        .hword L(ipred_h_tbl) - 16b
+        .hword L(ipred_h_tbl) -  8b
+        .hword L(ipred_h_tbl) -  4b
+endfunc
+
+// void ipred_dc_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                              const pixel *const topleft,
+//                              const int width, const int height, const int a,
+//                              const int max_width, const int max_height);
+function ipred_dc_top_16bpc_neon, export=1
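+        // Average the top row (horizontal adds, then a rounding shift by
+        // log2(width)) and broadcast it to the whole block.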
+        clz             w3,  w3
+        adr             x5,  L(ipred_dc_top_tbl)
+        sub             w3,  w3,  #25
+        ldrh            w3,  [x5, w3, uxtw #1]
+        add             x2,  x2,  #2
+        sub             x5,  x5,  w3, uxtw
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        br              x5
+40:
+        ld1             {v0.4h},  [x2]
+        addv            h0,      v0.4h
+        urshr           v0.4h,   v0.4h,   #2
+        dup             v0.4h,   v0.h[0]
+4:
+        st1             {v0.4h},  [x0], x1
+        st1             {v0.4h},  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.4h},  [x0], x1
+        st1             {v0.4h},  [x6], x1
+        b.gt            4b
+        ret
+80:
+        ld1             {v0.8h},  [x2]
+        addv            h0,      v0.8h
+        urshr           v0.4h,   v0.4h,   #3
+        dup             v0.8h,   v0.h[0]
+8:
+        st1             {v0.8h},  [x0], x1
+        st1             {v0.8h},  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h},  [x0], x1
+        st1             {v0.8h},  [x6], x1
+        b.gt            8b
+        ret
+160:
+        ld1             {v0.8h, v1.8h}, [x2]
+        addp            v0.8h,   v0.8h,   v1.8h
+        addv            h0,      v0.8h
+        urshr           v2.4h,   v0.4h,   #4
+        dup             v0.8h,   v2.h[0]
+        dup             v1.8h,   v2.h[0]
+16:
+        st1             {v0.8h, v1.8h}, [x0], x1
+        st1             {v0.8h, v1.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h}, [x0], x1
+        st1             {v0.8h, v1.8h}, [x6], x1
+        b.gt            16b
+        ret
+320:
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
+        addp            v0.8h,   v0.8h,   v1.8h
+        addp            v2.8h,   v2.8h,   v3.8h
+        addp            v0.8h,   v0.8h,   v2.8h
+        uaddlv          s0,      v0.8h
+        rshrn           v4.4h,   v0.4s,   #5
+        dup             v0.8h,   v4.h[0]
+        dup             v1.8h,   v4.h[0]
+        dup             v2.8h,   v4.h[0]
+        dup             v3.8h,   v4.h[0]
+32:
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        b.gt            32b
+        ret
+640:
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
+        addp            v0.8h,   v0.8h,   v1.8h
+        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
+        addp            v2.8h,   v2.8h,   v3.8h
+        addp            v4.8h,   v4.8h,   v5.8h
+        addp            v6.8h,   v6.8h,   v7.8h
+        addp            v0.8h,   v0.8h,   v2.8h
+        addp            v4.8h,   v4.8h,   v6.8h
+        addp            v0.8h,   v0.8h,   v4.8h
+        uaddlv          s0,      v0.8h
+        rshrn           v4.4h,   v0.4s,   #6
+        sub             x1,  x1,  #64
+        dup             v0.8h,   v4.h[0]
+        dup             v1.8h,   v4.h[0]
+        dup             v2.8h,   v4.h[0]
+        dup             v3.8h,   v4.h[0]
+64:
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        b.gt            64b
+        ret
+
+L(ipred_dc_top_tbl):
+        .hword L(ipred_dc_top_tbl) - 640b
+        .hword L(ipred_dc_top_tbl) - 320b
+        .hword L(ipred_dc_top_tbl) - 160b
+        .hword L(ipred_dc_top_tbl) -  80b
+        .hword L(ipred_dc_top_tbl) -  40b
+endfunc
+
+// void ipred_dc_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                               const pixel *const topleft,
+//                               const int width, const int height, const int a,
+//                               const int max_width, const int max_height);
+function ipred_dc_left_16bpc_neon, export=1
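+        // Double dispatch: br x5 selects a height handler that averages
+        // the left edge, which then branches via x3 to the width-specific
+        // store loop.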
+        sub             x2,  x2,  w4, uxtw #1
+        clz             w3,  w3
+        clz             w7,  w4
+        adr             x5,  L(ipred_dc_left_tbl)
+        sub             w3,  w3,  #20 // 25 leading bits, minus table offset 5
+        sub             w7,  w7,  #25
+        ldrh            w3,  [x5, w3, uxtw #1]
+        ldrh            w7,  [x5, w7, uxtw #1]
+        sub             x3,  x5,  w3, uxtw
+        sub             x5,  x5,  w7, uxtw
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        br              x5
+
+L(ipred_dc_left_h4):
+        ld1             {v0.4h},  [x2]
+        addv            h0,      v0.4h
+        urshr           v0.4h,   v0.4h,   #2
+        dup             v0.8h,   v0.h[0]
+        br              x3
+L(ipred_dc_left_w4):
+        st1             {v0.4h},  [x0], x1
+        st1             {v0.4h},  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.4h},  [x0], x1
+        st1             {v0.4h},  [x6], x1
+        b.gt            L(ipred_dc_left_w4)
+        ret
+
+L(ipred_dc_left_h8):
+        ld1             {v0.8h},  [x2]
+        addv            h0,      v0.8h
+        urshr           v0.4h,   v0.4h,   #3
+        dup             v0.8h,   v0.h[0]
+        br              x3
+L(ipred_dc_left_w8):
+        st1             {v0.8h},  [x0], x1
+        st1             {v0.8h},  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h},  [x0], x1
+        st1             {v0.8h},  [x6], x1
+        b.gt            L(ipred_dc_left_w8)
+        ret
+
+L(ipred_dc_left_h16):
+        ld1             {v0.8h, v1.8h}, [x2]
+        addp            v0.8h,   v0.8h,   v1.8h
+        addv            h0,      v0.8h
+        urshr           v2.4h,   v0.4h,   #4
+        dup             v0.8h,   v2.h[0]
+        dup             v1.8h,   v2.h[0]
+        br              x3
+L(ipred_dc_left_w16):
+        mov             v1.16b,  v0.16b
+1:
+        st1             {v0.8h, v1.8h}, [x0], x1
+        st1             {v0.8h, v1.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h}, [x0], x1
+        st1             {v0.8h, v1.8h}, [x6], x1
+        b.gt            1b
+        ret
+
+L(ipred_dc_left_h32):
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
+        addp            v0.8h,   v0.8h,   v1.8h
+        addp            v2.8h,   v2.8h,   v3.8h
+        addp            v0.8h,   v0.8h,   v2.8h
+        uaddlp          v0.4s,   v0.8h
+        addv            s0,      v0.4s
+        rshrn           v4.4h,   v0.4s,   #5
+        dup             v0.8h,   v4.h[0]
+        br              x3
+L(ipred_dc_left_w32):
+        mov             v1.16b,  v0.16b
+        mov             v2.16b,  v0.16b
+        mov             v3.16b,  v0.16b
+1:
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        b.gt            1b
+        ret
+
+L(ipred_dc_left_h64):
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
+        addp            v0.8h,   v0.8h,   v1.8h
+        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
+        addp            v2.8h,   v2.8h,   v3.8h
+        addp            v4.8h,   v4.8h,   v5.8h
+        addp            v6.8h,   v6.8h,   v7.8h
+        addp            v0.8h,   v0.8h,   v2.8h
+        addp            v4.8h,   v4.8h,   v6.8h
+        addp            v0.8h,   v0.8h,   v4.8h
+        uaddlv          s0,      v0.8h
+        rshrn           v4.4h,   v0.4s,   #6
+        dup             v0.8h,   v4.h[0]
+        br              x3
+L(ipred_dc_left_w64):
+        mov             v1.16b,  v0.16b
+        mov             v2.16b,  v0.16b
+        mov             v3.16b,  v0.16b
+        sub             x1,  x1,  #64
+1:
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        b.gt            1b
+        ret
+
+L(ipred_dc_left_tbl):
+        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64)
+        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32)
+        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16)
+        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8)
+        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4)
+        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64)
+        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32)
+        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16)
+        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8)
+        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4)
+endfunc
+
+// void ipred_dc_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                          const pixel *const topleft,
+//                          const int width, const int height, const int a,
+//                          const int max_width, const int max_height);
+function ipred_dc_16bpc_neon, export=1
+        sub             x2,  x2,  w4, uxtw #1
+        add             w7,  w3,  w4             // width + height
+        clz             w3,  w3
+        clz             w6,  w4
+        dup             v16.4s, w7               // width + height
+        adr             x5,  L(ipred_dc_tbl)
+        rbit            w7,  w7                  // rbit(width + height)
+        sub             w3,  w3,  #20            // 25 leading bits, minus table offset 5
+        sub             w6,  w6,  #25
+        clz             w7,  w7                  // ctz(width + height)
+        ldrh            w3,  [x5, w3, uxtw #1]
+        ldrh            w6,  [x5, w6, uxtw #1]
+        neg             w7,  w7                  // -ctz(width + height)
+        sub             x3,  x5,  w3, uxtw
+        sub             x5,  x5,  w6, uxtw
+        ushr            v16.4s,  v16.4s,  #1     // (width + height) >> 1
+        dup             v17.4s,  w7              // -ctz(width + height)
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        br              x5
+
+L(ipred_dc_h4):
+        ld1             {v0.4h},  [x2], #8
+        uaddlv          s0,      v0.4h
+        br              x3
+L(ipred_dc_w4):
+        add             x2,  x2,  #2
+        ld1             {v1.4h},  [x2]
+        add             v0.2s,   v0.2s,   v16.2s
+        uaddlv          s1,      v1.4h
+        cmp             w4,  #4
+        add             v0.2s,   v0.2s,   v1.2s
+        ushl            v0.2s,   v0.2s,   v17.2s
+        b.eq            1f
+        // h = 8/16
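+        // width + height is 12 or 20 here: the ushl above handled the
+        // power-of-two factor, so finish the divide with a Q17 reciprocal
+        // (0xAAAB ~= 1/3, 0x6667 ~= 1/5) and a shift by 17.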
+        cmp             w4,  #16
+        mov             w16, #0x6667
+        mov             w17, #0xAAAB
+        csel            w16, w16, w17, eq
+        dup             v16.2s,  w16
+        mul             v0.2s,   v0.2s,   v16.2s
+        ushr            v0.2s,   v0.2s,   #17
+1:
+        dup             v0.4h,   v0.h[0]
+2:
+        st1             {v0.4h},  [x0], x1
+        st1             {v0.4h},  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.4h},  [x0], x1
+        st1             {v0.4h},  [x6], x1
+        b.gt            2b
+        ret
+
+L(ipred_dc_h8):
+        ld1             {v0.8h},  [x2], #16
+        uaddlv          s0,      v0.8h
+        br              x3
+L(ipred_dc_w8):
+        add             x2,  x2,  #2
+        ld1             {v1.8h},  [x2]
+        add             v0.2s,   v0.2s,   v16.2s
+        uaddlv          s1,      v1.8h
+        cmp             w4,  #8
+        add             v0.2s,   v0.2s,   v1.2s
+        ushl            v0.2s,   v0.2s,   v17.2s
+        b.eq            1f
+        // h = 4/16/32
+        cmp             w4,  #32
+        mov             w16, #0x6667
+        mov             w17, #0xAAAB
+        csel            w16, w16, w17, eq
+        dup             v16.2s,  w16
+        mul             v0.2s,   v0.2s,   v16.2s
+        ushr            v0.2s,   v0.2s,   #17
+1:
+        dup             v0.8h,   v0.h[0]
+2:
+        st1             {v0.8h},  [x0], x1
+        st1             {v0.8h},  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h},  [x0], x1
+        st1             {v0.8h},  [x6], x1
+        b.gt            2b
+        ret
+
+L(ipred_dc_h16):
+        ld1             {v0.8h, v1.8h}, [x2], #32
+        addp            v0.8h,   v0.8h,   v1.8h
+        uaddlv          s0,      v0.8h
+        br              x3
+L(ipred_dc_w16):
+        add             x2,  x2,  #2
+        ld1             {v1.8h, v2.8h}, [x2]
+        add             v0.2s,   v0.2s,   v16.2s
+        addp            v1.8h,   v1.8h,   v2.8h
+        uaddlv          s1,      v1.8h
+        cmp             w4,  #16
+        add             v0.2s,   v0.2s,   v1.2s
+        ushl            v4.2s,   v0.2s,   v17.2s
+        b.eq            1f
+        // h = 4/8/32/64
+        tst             w4,  #(32+16+8) // 16 added to make a consecutive bitmask
+        mov             w16, #0x6667
+        mov             w17, #0xAAAB
+        csel            w16, w16, w17, eq
+        dup             v16.2s,  w16
+        mul             v4.2s,   v4.2s,   v16.2s
+        ushr            v4.2s,   v4.2s,   #17
+1:
+        dup             v0.8h,   v4.h[0]
+        dup             v1.8h,   v4.h[0]
+2:
+        st1             {v0.8h, v1.8h}, [x0], x1
+        st1             {v0.8h, v1.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h}, [x0], x1
+        st1             {v0.8h, v1.8h}, [x6], x1
+        b.gt            2b
+        ret
+
+L(ipred_dc_h32):
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
+        addp            v0.8h,   v0.8h,   v1.8h
+        addp            v2.8h,   v2.8h,   v3.8h
+        addp            v0.8h,   v0.8h,   v2.8h
+        uaddlv          s0,      v0.8h
+        br              x3
+L(ipred_dc_w32):
+        add             x2,  x2,  #2
+        ld1             {v1.8h, v2.8h, v3.8h, v4.8h}, [x2]
+        add             v0.2s,   v0.2s,   v16.2s
+        addp            v1.8h,   v1.8h,   v2.8h
+        addp            v3.8h,   v3.8h,   v4.8h
+        addp            v1.8h,   v1.8h,   v3.8h
+        uaddlv          s1,      v1.8h
+        cmp             w4,  #32
+        add             v0.2s,   v0.2s,   v1.2s
+        ushl            v4.2s,   v0.2s,   v17.2s
+        b.eq            1f
+        // h = 8/16/64
+        cmp             w4,  #8
+        mov             w16, #0x6667
+        mov             w17, #0xAAAB
+        csel            w16, w16, w17, eq
+        dup             v16.2s,  w16
+        mul             v4.2s,   v4.2s,   v16.2s
+        ushr            v4.2s,   v4.2s,   #17
+1:
+        dup             v0.8h,   v4.h[0]
+        dup             v1.8h,   v4.h[0]
+        dup             v2.8h,   v4.h[0]
+        dup             v3.8h,   v4.h[0]
+2:
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        b.gt            2b
+        ret
+
+L(ipred_dc_h64):
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
+        addp            v0.8h,   v0.8h,   v1.8h
+        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
+        addp            v2.8h,   v2.8h,   v3.8h
+        addp            v4.8h,   v4.8h,   v5.8h
+        addp            v6.8h,   v6.8h,   v7.8h
+        addp            v0.8h,   v0.8h,   v2.8h
+        addp            v4.8h,   v4.8h,   v6.8h
+        addp            v0.8h,   v0.8h,   v4.8h
+        uaddlv          s0,      v0.8h
+        br              x3
+L(ipred_dc_w64):
+        add             x2,  x2,  #2
+        ld1             {v1.8h, v2.8h, v3.8h, v4.8h}, [x2], #64
+        add             v0.2s,   v0.2s,   v16.2s
+        addp            v1.8h,   v1.8h,   v2.8h
+        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x2]
+        addp            v3.8h,   v3.8h,   v4.8h
+        addp            v20.8h,  v20.8h,  v21.8h
+        addp            v22.8h,  v22.8h,  v23.8h
+        addp            v1.8h,   v1.8h,   v3.8h
+        addp            v20.8h,  v20.8h,  v22.8h
+        addp            v1.8h,   v1.8h,   v20.8h
+        uaddlv          s1,      v1.8h
+        cmp             w4,  #64
+        add             v0.2s,   v0.2s,   v1.2s
+        ushl            v4.2s,   v0.2s,   v17.2s
+        b.eq            1f
+        // h = 16/32
+        cmp             w4,  #16
+        mov             w16, #0x6667
+        mov             w17, #0xAAAB
+        csel            w16, w16, w17, eq
+        dup             v16.2s,  w16
+        mul             v4.2s,   v4.2s,   v16.2s
+        ushr            v4.2s,   v4.2s,   #17
+1:
+        sub             x1,  x1,  #64
+        dup             v0.8h,   v4.h[0]
+        dup             v1.8h,   v4.h[0]
+        dup             v2.8h,   v4.h[0]
+        dup             v3.8h,   v4.h[0]
+2:
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        b.gt            2b
+        ret
+
+L(ipred_dc_tbl):
+        .hword L(ipred_dc_tbl) - L(ipred_dc_h64)
+        .hword L(ipred_dc_tbl) - L(ipred_dc_h32)
+        .hword L(ipred_dc_tbl) - L(ipred_dc_h16)
+        .hword L(ipred_dc_tbl) - L(ipred_dc_h8)
+        .hword L(ipred_dc_tbl) - L(ipred_dc_h4)
+        .hword L(ipred_dc_tbl) - L(ipred_dc_w64)
+        .hword L(ipred_dc_tbl) - L(ipred_dc_w32)
+        .hword L(ipred_dc_tbl) - L(ipred_dc_w16)
+        .hword L(ipred_dc_tbl) - L(ipred_dc_w8)
+        .hword L(ipred_dc_tbl) - L(ipred_dc_w4)
+endfunc
+
+// void ipred_paeth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                             const pixel *const topleft,
+//                             const int width, const int height, const int a,
+//                             const int max_width, const int max_height);
+function ipred_paeth_16bpc_neon, export=1
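+        // Paeth prediction: per pixel, pick whichever of left, top and
+        // topleft is closest to base = left + top - topleft.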
+        clz             w9,  w3
+        adr             x5,  L(ipred_paeth_tbl)
+        sub             w9,  w9,  #25
+        ldrh            w9,  [x5, w9, uxtw #1]
+        ld1r            {v4.8h},  [x2]
+        add             x8,  x2,  #2
+        sub             x2,  x2,  #8
+        sub             x5,  x5,  w9, uxtw
+        mov             x7,  #-8
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        br              x5
+40:
+        ld1r            {v5.2d},  [x8]
+        sub             v6.8h,   v5.8h,   v4.8h   // top - topleft
+4:
+        ld4r            {v0.4h, v1.4h, v2.4h, v3.4h},  [x2], x7
+        zip1            v0.2d,   v0.2d,   v1.2d
+        zip1            v2.2d,   v2.2d,   v3.2d
+        add             v16.8h,  v6.8h,   v0.8h   // base
+        add             v17.8h,  v6.8h,   v2.8h
+        sabd            v20.8h,  v5.8h,   v16.8h  // tdiff
+        sabd            v21.8h,  v5.8h,   v17.8h
+        sabd            v22.8h,  v4.8h,   v16.8h  // tldiff
+        sabd            v23.8h,  v4.8h,   v17.8h
+        sabd            v16.8h,  v0.8h,   v16.8h  // ldiff
+        sabd            v17.8h,  v2.8h,   v17.8h
+        umin            v18.8h,  v20.8h,  v22.8h  // min(tdiff, tldiff)
+        umin            v19.8h,  v21.8h,  v23.8h
+        cmge            v20.8h,  v22.8h,  v20.8h  // tldiff >= tdiff
+        cmge            v21.8h,  v23.8h,  v21.8h
+        cmge            v16.8h,  v18.8h,  v16.8h  // min(tdiff, tldiff) >= ldiff
+        cmge            v17.8h,  v19.8h,  v17.8h
+        bsl             v21.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
+        bsl             v20.16b, v5.16b,  v4.16b
+        bit             v21.16b, v2.16b,  v17.16b // ldiff <= min ? left : ...
+        bit             v20.16b, v0.16b,  v16.16b
+        st1             {v21.d}[1], [x0], x1
+        st1             {v21.d}[0], [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v20.d}[1], [x0], x1
+        st1             {v20.d}[0], [x6], x1
+        b.gt            4b
+        ret
+80:
+160:
+320:
+640:
+        ld1             {v5.8h},  [x8], #16
+        mov             w9,  w3
+        // Set up pointers for four rows in parallel; x0, x6, x5, x10
+        add             x5,  x0,  x1
+        add             x10, x6,  x1
+        lsl             x1,  x1,  #1
+        sub             x1,  x1,  w3, uxtw #1
+1:
+        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
+2:
+        sub             v6.8h,   v5.8h,   v4.8h   // top - topleft
+        add             v16.8h,  v6.8h,   v0.8h   // base
+        add             v17.8h,  v6.8h,   v1.8h
+        add             v18.8h,  v6.8h,   v2.8h
+        add             v19.8h,  v6.8h,   v3.8h
+        sabd            v20.8h,  v5.8h,   v16.8h  // tdiff
+        sabd            v21.8h,  v5.8h,   v17.8h
+        sabd            v22.8h,  v5.8h,   v18.8h
+        sabd            v23.8h,  v5.8h,   v19.8h
+        sabd            v24.8h,  v4.8h,   v16.8h  // tldiff
+        sabd            v25.8h,  v4.8h,   v17.8h
+        sabd            v26.8h,  v4.8h,   v18.8h
+        sabd            v27.8h,  v4.8h,   v19.8h
+        sabd            v16.8h,  v0.8h,   v16.8h  // ldiff
+        sabd            v17.8h,  v1.8h,   v17.8h
+        sabd            v18.8h,  v2.8h,   v18.8h
+        sabd            v19.8h,  v3.8h,   v19.8h
+        umin            v28.8h,  v20.8h,  v24.8h  // min(tdiff, tldiff)
+        umin            v29.8h,  v21.8h,  v25.8h
+        umin            v30.8h,  v22.8h,  v26.8h
+        umin            v31.8h,  v23.8h,  v27.8h
+        cmge            v20.8h,  v24.8h,  v20.8h  // tldiff >= tdiff
+        cmge            v21.8h,  v25.8h,  v21.8h
+        cmge            v22.8h,  v26.8h,  v22.8h
+        cmge            v23.8h,  v27.8h,  v23.8h
+        cmge            v16.8h,  v28.8h,  v16.8h  // min(tdiff, tldiff) >= ldiff
+        cmge            v17.8h,  v29.8h,  v17.8h
+        cmge            v18.8h,  v30.8h,  v18.8h
+        cmge            v19.8h,  v31.8h,  v19.8h
+        bsl             v23.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
+        bsl             v22.16b, v5.16b,  v4.16b
+        bsl             v21.16b, v5.16b,  v4.16b
+        bsl             v20.16b, v5.16b,  v4.16b
+        bit             v23.16b, v3.16b,  v19.16b // ldiff <= min ? left : ...
+        bit             v22.16b, v2.16b,  v18.16b
+        bit             v21.16b, v1.16b,  v17.16b
+        bit             v20.16b, v0.16b,  v16.16b
+        st1             {v23.8h}, [x0], #16
+        st1             {v22.8h}, [x6], #16
+        subs            w3,  w3,  #8
+        st1             {v21.8h}, [x5], #16
+        st1             {v20.8h}, [x10], #16
+        b.le            8f
+        ld1             {v5.8h},  [x8], #16
+        b               2b
+8:
+        subs            w4,  w4,  #4
+        b.le            9f
+        // End of horizontal loop, move pointers to next four rows
+        sub             x8,  x8,  w9, uxtw #1
+        add             x0,  x0,  x1
+        add             x6,  x6,  x1
+        // Load the top row as early as possible
+        ld1             {v5.8h},  [x8], #16
+        add             x5,  x5,  x1
+        add             x10, x10, x1
+        mov             w3,  w9
+        b               1b
+9:
+        ret
+
+L(ipred_paeth_tbl):
+        .hword L(ipred_paeth_tbl) - 640b
+        .hword L(ipred_paeth_tbl) - 320b
+        .hword L(ipred_paeth_tbl) - 160b
+        .hword L(ipred_paeth_tbl) -  80b
+        .hword L(ipred_paeth_tbl) -  40b
+endfunc
+
+// void ipred_smooth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                              const pixel *const topleft,
+//                              const int width, const int height, const int a,
+//                              const int max_width, const int max_height);
+function ipred_smooth_16bpc_neon, export=1
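+        // Smooth prediction: the average of a vertical blend (top..bottom)
+        // and a horizontal blend (left..right), each weighted to 256 via
+        // sm_weights, rounded and narrowed with rshrn #9.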
+        movrel          x10, X(sm_weights)
+        add             x11, x10, w4, uxtw
+        add             x10, x10, w3, uxtw
+        clz             w9,  w3
+        adr             x5,  L(ipred_smooth_tbl)
+        sub             x12, x2,  w4, uxtw #1
+        sub             w9,  w9,  #25
+        ldrh            w9,  [x5, w9, uxtw #1]
+        ld1r            {v4.8h},  [x12] // bottom
+        add             x8,  x2,  #2
+        sub             x5,  x5,  w9, uxtw
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        br              x5
+40:
+        sub             x2,  x2,  #8
+        mov             x7,  #-8
+        ld1r            {v6.2d}, [x8]             // top
+        ld1r            {v7.2s}, [x10]            // weights_hor
+        dup             v5.8h,   v6.h[3]          // right
+        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
+        uxtl            v7.8h,   v7.8b            // weights_hor
+        add             v31.4h,  v4.4h,   v5.4h   // bottom+right
+4:
+        ld4r            {v0.4h, v1.4h, v2.4h, v3.4h},  [x2], x7 // left
+        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x11], #4 // weights_ver
+        ushll           v20.4s,  v31.4h,  #8      // (bottom+right)*256
+        ushll           v21.4s,  v31.4h,  #8
+        ushll           v22.4s,  v31.4h,  #8
+        ushll           v23.4s,  v31.4h,  #8
+        zip1            v1.2d,   v1.2d,   v0.2d   // left, flipped
+        zip1            v0.2d,   v3.2d,   v2.2d
+        zip1            v16.2s,  v16.2s,  v17.2s  // weights_ver
+        zip1            v18.2s,  v18.2s,  v19.2s
+        sub             v0.8h,   v0.8h,   v5.8h   // left-right
+        sub             v1.8h,   v1.8h,   v5.8h
+        uxtl            v16.8h,  v16.8b           // weights_ver
+        uxtl            v18.8h,  v18.8b
+        smlal           v20.4s,  v0.4h,   v7.4h   // += (left-right)*weights_hor
+        smlal2          v21.4s,  v0.8h,   v7.8h
+        smlal           v22.4s,  v1.4h,   v7.4h
+        smlal2          v23.4s,  v1.8h,   v7.8h
+        smlal           v20.4s,  v6.4h,   v16.4h  // += (top-bottom)*weights_ver
+        smlal2          v21.4s,  v6.8h,   v16.8h
+        smlal           v22.4s,  v6.4h,   v18.4h
+        smlal2          v23.4s,  v6.8h,   v18.8h
+        rshrn           v20.4h,  v20.4s,  #9
+        rshrn           v21.4h,  v21.4s,  #9
+        rshrn           v22.4h,  v22.4s,  #9
+        rshrn           v23.4h,  v23.4s,  #9
+        st1             {v20.4h}, [x0], x1
+        st1             {v21.4h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v22.4h}, [x0], x1
+        st1             {v23.4h}, [x6], x1
+        b.gt            4b
+        ret
+80:
+        sub             x2,  x2,  #8
+        mov             x7,  #-8
+        ld1             {v6.8h}, [x8]             // top
+        ld1             {v7.8b}, [x10]            // weights_hor
+        dup             v5.8h,   v6.h[7]          // right
+        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
+        uxtl            v7.8h,   v7.8b            // weights_hor
+        add             v31.4h,  v4.4h,   v5.4h   // bottom+right
+8:
+        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7 // left
+        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x11], #4 // weights_ver
+        ushll           v20.4s,  v31.4h,  #8      // (bottom+right)*256
+        ushll           v21.4s,  v31.4h,  #8
+        ushll           v22.4s,  v31.4h,  #8
+        ushll           v23.4s,  v31.4h,  #8
+        ushll           v24.4s,  v31.4h,  #8
+        ushll           v25.4s,  v31.4h,  #8
+        ushll           v26.4s,  v31.4h,  #8
+        ushll           v27.4s,  v31.4h,  #8
+        sub             v0.8h,   v0.8h,   v5.8h   // left-right
+        sub             v1.8h,   v1.8h,   v5.8h
+        sub             v2.8h,   v2.8h,   v5.8h
+        sub             v3.8h,   v3.8h,   v5.8h
+        uxtl            v16.8h,  v16.8b           // weights_ver
+        uxtl            v17.8h,  v17.8b
+        uxtl            v18.8h,  v18.8b
+        uxtl            v19.8h,  v19.8b
+        smlal           v20.4s,  v3.4h,   v7.4h   // += (left-right)*weights_hor
+        smlal2          v21.4s,  v3.8h,   v7.8h   // (left flipped)
+        smlal           v22.4s,  v2.4h,   v7.4h
+        smlal2          v23.4s,  v2.8h,   v7.8h
+        smlal           v24.4s,  v1.4h,   v7.4h
+        smlal2          v25.4s,  v1.8h,   v7.8h
+        smlal           v26.4s,  v0.4h,   v7.4h
+        smlal2          v27.4s,  v0.8h,   v7.8h
+        smlal           v20.4s,  v6.4h,   v16.4h  // += (top-bottom)*weights_ver
+        smlal2          v21.4s,  v6.8h,   v16.8h
+        smlal           v22.4s,  v6.4h,   v17.4h
+        smlal2          v23.4s,  v6.8h,   v17.8h
+        smlal           v24.4s,  v6.4h,   v18.4h
+        smlal2          v25.4s,  v6.8h,   v18.8h
+        smlal           v26.4s,  v6.4h,   v19.4h
+        smlal2          v27.4s,  v6.8h,   v19.8h
+        rshrn           v20.4h,  v20.4s,  #9
+        rshrn2          v20.8h,  v21.4s,  #9
+        rshrn           v21.4h,  v22.4s,  #9
+        rshrn2          v21.8h,  v23.4s,  #9
+        rshrn           v22.4h,  v24.4s,  #9
+        rshrn2          v22.8h,  v25.4s,  #9
+        rshrn           v23.4h,  v26.4s,  #9
+        rshrn2          v23.8h,  v27.4s,  #9
+        st1             {v20.8h}, [x0], x1
+        st1             {v21.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v22.8h}, [x0], x1
+        st1             {v23.8h}, [x6], x1
+        b.gt            8b
+        ret
+160:
+320:
+640:
+        add             x12, x2,  w3, uxtw #1
+        sub             x1,  x1,  w3, uxtw #1
+        ld1r            {v5.8h}, [x12]            // right
+        sub             x2,  x2,  #4
+        mov             x7,  #-4
+        mov             w9,  w3
+        add             v31.4h,  v4.4h,   v5.4h   // bottom+right
+
+1:
+        ld2r            {v0.8h, v1.8h},   [x2],  x7 // left
+        ld2r            {v16.8b, v17.8b}, [x11], #2 // weights_ver
+        sub             v0.8h,   v0.8h,   v5.8h   // left-right
+        sub             v1.8h,   v1.8h,   v5.8h
+        uxtl            v16.8h,  v16.8b           // weights_ver
+        uxtl            v17.8h,  v17.8b
+2:
+        ld1             {v7.16b}, [x10],  #16     // weights_hor
+        ld1             {v2.8h, v3.8h}, [x8], #32 // top
+        ushll           v20.4s,  v31.4h,  #8      // (bottom+right)*256
+        ushll           v21.4s,  v31.4h,  #8
+        ushll           v22.4s,  v31.4h,  #8
+        ushll           v23.4s,  v31.4h,  #8
+        ushll           v24.4s,  v31.4h,  #8
+        ushll           v25.4s,  v31.4h,  #8
+        ushll           v26.4s,  v31.4h,  #8
+        ushll           v27.4s,  v31.4h,  #8
+        uxtl            v6.8h,   v7.8b            // weights_hor
+        uxtl2           v7.8h,   v7.16b
+        sub             v2.8h,   v2.8h,   v4.8h   // top-bottom
+        sub             v3.8h,   v3.8h,   v4.8h
+        smlal           v20.4s,  v1.4h,   v6.4h   // += (left-right)*weights_hor
+        smlal2          v21.4s,  v1.8h,   v6.8h   // (left flipped)
+        smlal           v22.4s,  v1.4h,   v7.4h
+        smlal2          v23.4s,  v1.8h,   v7.8h
+        smlal           v24.4s,  v0.4h,   v6.4h
+        smlal2          v25.4s,  v0.8h,   v6.8h
+        smlal           v26.4s,  v0.4h,   v7.4h
+        smlal2          v27.4s,  v0.8h,   v7.8h
+        smlal           v20.4s,  v2.4h,   v16.4h  // += (top-bottom)*weights_ver
+        smlal2          v21.4s,  v2.8h,   v16.8h
+        smlal           v22.4s,  v3.4h,   v16.4h
+        smlal2          v23.4s,  v3.8h,   v16.8h
+        smlal           v24.4s,  v2.4h,   v17.4h
+        smlal2          v25.4s,  v2.8h,   v17.8h
+        smlal           v26.4s,  v3.4h,   v17.4h
+        smlal2          v27.4s,  v3.8h,   v17.8h
+        rshrn           v20.4h,  v20.4s,  #9
+        rshrn2          v20.8h,  v21.4s,  #9
+        rshrn           v21.4h,  v22.4s,  #9
+        rshrn2          v21.8h,  v23.4s,  #9
+        rshrn           v22.4h,  v24.4s,  #9
+        rshrn2          v22.8h,  v25.4s,  #9
+        rshrn           v23.4h,  v26.4s,  #9
+        rshrn2          v23.8h,  v27.4s,  #9
+        subs            w3,  w3,  #16
+        st1             {v20.8h, v21.8h}, [x0], #32
+        st1             {v22.8h, v23.8h}, [x6], #32
+        b.gt            2b
+        subs            w4,  w4,  #2
+        b.le            9f
+        sub             x8,  x8,  w9, uxtw #1
+        sub             x10, x10, w9, uxtw
+        add             x0,  x0,  x1
+        add             x6,  x6,  x1
+        mov             w3,  w9
+        b               1b
+9:
+        ret
+
+L(ipred_smooth_tbl):
+        .hword L(ipred_smooth_tbl) - 640b
+        .hword L(ipred_smooth_tbl) - 320b
+        .hword L(ipred_smooth_tbl) - 160b
+        .hword L(ipred_smooth_tbl) -  80b
+        .hword L(ipred_smooth_tbl) -  40b
+endfunc
+
+// void ipred_smooth_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                                const pixel *const topleft,
+//                                const int width, const int height, const int a,
+//                                const int max_width, const int max_height);
+function ipred_smooth_v_16bpc_neon, export=1
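+        // smooth_v: pred = bottom + (((top - bottom) * w_ver + 128) >> 8);
+        // the weights are shifted left by 7 so that sqrdmulh yields the
+        // weighted term directly.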
+        movrel          x7,  X(sm_weights)
+        add             x7,  x7,  w4, uxtw
+        clz             w9,  w3
+        adr             x5,  L(ipred_smooth_v_tbl)
+        sub             x8,  x2,  w4, uxtw #1
+        sub             w9,  w9,  #25
+        ldrh            w9,  [x5, w9, uxtw #1]
+        ld1r            {v4.8h},  [x8] // bottom
+        add             x2,  x2,  #2
+        sub             x5,  x5,  w9, uxtw
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        br              x5
+40:
+        ld1r            {v6.2d}, [x2]             // top
+        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
+4:
+        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
+        zip1            v16.2s,  v16.2s,  v17.2s  // weights_ver
+        zip1            v18.2s,  v18.2s,  v19.2s
+        ushll           v16.8h,  v16.8b,  #7      // weights_ver << 7
+        ushll           v18.8h,  v18.8b,  #7
+        sqrdmulh        v20.8h,  v6.8h,   v16.8h  // ((top-bottom)*weights_ver + 128) >> 8
+        sqrdmulh        v21.8h,  v6.8h,   v18.8h
+        add             v20.8h,  v20.8h,  v4.8h
+        add             v21.8h,  v21.8h,  v4.8h
+        st1             {v20.d}[0], [x0], x1
+        st1             {v20.d}[1], [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v21.d}[0], [x0], x1
+        st1             {v21.d}[1], [x6], x1
+        b.gt            4b
+        ret
+80:
+        ld1             {v6.8h}, [x2]             // top
+        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
+8:
+        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
+        ushll           v16.8h,  v16.8b,  #7      // weights_ver << 7
+        ushll           v17.8h,  v17.8b,  #7
+        ushll           v18.8h,  v18.8b,  #7
+        ushll           v19.8h,  v19.8b,  #7
+        sqrdmulh        v20.8h,  v6.8h,   v16.8h  // ((top-bottom)*weights_ver + 128) >> 8
+        sqrdmulh        v21.8h,  v6.8h,   v17.8h
+        sqrdmulh        v22.8h,  v6.8h,   v18.8h
+        sqrdmulh        v23.8h,  v6.8h,   v19.8h
+        add             v20.8h,  v20.8h,  v4.8h
+        add             v21.8h,  v21.8h,  v4.8h
+        add             v22.8h,  v22.8h,  v4.8h
+        add             v23.8h,  v23.8h,  v4.8h
+        st1             {v20.8h}, [x0], x1
+        st1             {v21.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v22.8h}, [x0], x1
+        st1             {v23.8h}, [x6], x1
+        b.gt            8b
+        ret
+160:
+320:
+640:
+        // Set up pointers for four rows in parallel; x0, x6, x5, x8
+        add             x5,  x0,  x1
+        add             x8,  x6,  x1
+        lsl             x1,  x1,  #1
+        sub             x1,  x1,  w3, uxtw #1
+        mov             w9,  w3
+
+1:
+        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
+        ushll           v16.8h,  v16.8b,  #7      // weights_ver << 7
+        ushll           v17.8h,  v17.8b,  #7
+        ushll           v18.8h,  v18.8b,  #7
+        ushll           v19.8h,  v19.8b,  #7
+2:
+        ld1             {v2.8h, v3.8h}, [x2], #32 // top
+        sub             v2.8h,   v2.8h,   v4.8h   // top-bottom
+        sub             v3.8h,   v3.8h,   v4.8h
+        sqrdmulh        v20.8h,  v2.8h,   v16.8h  // ((top-bottom)*weights_ver + 128) >> 8
+        sqrdmulh        v21.8h,  v3.8h,   v16.8h
+        sqrdmulh        v22.8h,  v2.8h,   v17.8h
+        sqrdmulh        v23.8h,  v3.8h,   v17.8h
+        sqrdmulh        v24.8h,  v2.8h,   v18.8h
+        sqrdmulh        v25.8h,  v3.8h,   v18.8h
+        sqrdmulh        v26.8h,  v2.8h,   v19.8h
+        sqrdmulh        v27.8h,  v3.8h,   v19.8h
+        add             v20.8h,  v20.8h,  v4.8h
+        add             v21.8h,  v21.8h,  v4.8h
+        add             v22.8h,  v22.8h,  v4.8h
+        add             v23.8h,  v23.8h,  v4.8h
+        add             v24.8h,  v24.8h,  v4.8h
+        add             v25.8h,  v25.8h,  v4.8h
+        add             v26.8h,  v26.8h,  v4.8h
+        add             v27.8h,  v27.8h,  v4.8h
+        subs            w3,  w3,  #16
+        st1             {v20.8h, v21.8h}, [x0], #32
+        st1             {v22.8h, v23.8h}, [x6], #32
+        st1             {v24.8h, v25.8h}, [x5], #32
+        st1             {v26.8h, v27.8h}, [x8], #32
+        b.gt            2b
+        subs            w4,  w4,  #4
+        b.le            9f
+        sub             x2,  x2,  w9, uxtw #1
+        add             x0,  x0,  x1
+        add             x6,  x6,  x1
+        add             x5,  x5,  x1
+        add             x8,  x8,  x1
+        mov             w3,  w9
+        b               1b
+9:
+        ret
+
+L(ipred_smooth_v_tbl):
+        .hword L(ipred_smooth_v_tbl) - 640b
+        .hword L(ipred_smooth_v_tbl) - 320b
+        .hword L(ipred_smooth_v_tbl) - 160b
+        .hword L(ipred_smooth_v_tbl) -  80b
+        .hword L(ipred_smooth_v_tbl) -  40b
+endfunc
+
+// void ipred_smooth_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                                const pixel *const topleft,
+//                                const int width, const int height, const int a,
+//                                const int max_width, const int max_height);
+function ipred_smooth_h_16bpc_neon, export=1
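+        // smooth_h: pred = right + (((left - right) * w_hor + 128) >> 8),
+        // using the same sqrdmulh-with-(weight << 7) trick as smooth_v.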
+        movrel          x8,  X(sm_weights)
+        add             x8,  x8,  w3, uxtw
+        clz             w9,  w3
+        adr             x5,  L(ipred_smooth_h_tbl)
+        add             x12, x2,  w3, uxtw #1
+        sub             w9,  w9,  #25
+        ldrh            w9,  [x5, w9, uxtw #1]
+        ld1r            {v5.8h},  [x12] // right
+        sub             x5,  x5,  w9, uxtw
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        br              x5
+40:
+        ld1r            {v7.2s}, [x8]             // weights_hor
+        sub             x2,  x2,  #8
+        mov             x7,  #-8
+        ushll           v7.8h,   v7.8b,   #7      // weights_hor << 7
+4:
+        ld4r            {v0.4h, v1.4h, v2.4h, v3.4h},  [x2], x7 // left
+        zip1            v1.2d,   v1.2d,   v0.2d   // left, flipped
+        zip1            v0.2d,   v3.2d,   v2.2d
+        sub             v0.8h,   v0.8h,   v5.8h   // left-right
+        sub             v1.8h,   v1.8h,   v5.8h
+        sqrdmulh        v20.8h,  v0.8h,   v7.8h   // ((left-right)*weights_hor + 128) >> 8
+        sqrdmulh        v21.8h,  v1.8h,   v7.8h
+        add             v20.8h,  v20.8h,  v5.8h
+        add             v21.8h,  v21.8h,  v5.8h
+        st1             {v20.d}[0], [x0], x1
+        st1             {v20.d}[1], [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v21.d}[0], [x0], x1
+        st1             {v21.d}[1], [x6], x1
+        b.gt            4b
+        ret
+80:
+        ld1             {v7.8b}, [x8]             // weights_hor
+        sub             x2,  x2,  #8
+        mov             x7,  #-8
+        ushll           v7.8h,   v7.8b,   #7      // weights_hor << 7
+8:
+        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7 // left
+        sub             v3.8h,   v3.8h,   v5.8h   // left-right
+        sub             v2.8h,   v2.8h,   v5.8h
+        sub             v1.8h,   v1.8h,   v5.8h
+        sub             v0.8h,   v0.8h,   v5.8h
+        sqrdmulh        v20.8h,  v3.8h,   v7.8h   // ((left-right)*weights_hor + 128) >> 8
+        sqrdmulh        v21.8h,  v2.8h,   v7.8h   // (left flipped)
+        sqrdmulh        v22.8h,  v1.8h,   v7.8h
+        sqrdmulh        v23.8h,  v0.8h,   v7.8h
+        add             v20.8h,  v20.8h,  v5.8h
+        add             v21.8h,  v21.8h,  v5.8h
+        add             v22.8h,  v22.8h,  v5.8h
+        add             v23.8h,  v23.8h,  v5.8h
+        st1             {v20.8h}, [x0], x1
+        st1             {v21.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v22.8h}, [x0], x1
+        st1             {v23.8h}, [x6], x1
+        b.gt            8b
+        ret
+160:
+320:
+640:
+        sub             x2,  x2,  #8
+        mov             x7,  #-8
+        // Set up pointers for four rows in parallel; x0, x6, x5, x10
+        add             x5,  x0,  x1
+        add             x10, x6,  x1
+        lsl             x1,  x1,  #1
+        sub             x1,  x1,  w3, uxtw #1
+        mov             w9,  w3
+
+1:
+        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},   [x2],  x7 // left
+        sub             v0.8h,   v0.8h,   v5.8h   // left-right
+        sub             v1.8h,   v1.8h,   v5.8h
+        sub             v2.8h,   v2.8h,   v5.8h
+        sub             v3.8h,   v3.8h,   v5.8h
+2:
+        ld1             {v7.16b}, [x8],   #16     // weights_hor
+        ushll           v6.8h,   v7.8b,   #7      // weights_hor << 7
+        ushll2          v7.8h,   v7.16b,  #7
+        sqrdmulh        v20.8h,  v3.8h,   v6.8h   // ((left-right)*weights_hor + 128) >> 8
+        sqrdmulh        v21.8h,  v3.8h,   v7.8h   // (left flipped)
+        sqrdmulh        v22.8h,  v2.8h,   v6.8h
+        sqrdmulh        v23.8h,  v2.8h,   v7.8h
+        sqrdmulh        v24.8h,  v1.8h,   v6.8h
+        sqrdmulh        v25.8h,  v1.8h,   v7.8h
+        sqrdmulh        v26.8h,  v0.8h,   v6.8h
+        sqrdmulh        v27.8h,  v0.8h,   v7.8h
+        add             v20.8h,  v20.8h,  v5.8h
+        add             v21.8h,  v21.8h,  v5.8h
+        add             v22.8h,  v22.8h,  v5.8h
+        add             v23.8h,  v23.8h,  v5.8h
+        add             v24.8h,  v24.8h,  v5.8h
+        add             v25.8h,  v25.8h,  v5.8h
+        add             v26.8h,  v26.8h,  v5.8h
+        add             v27.8h,  v27.8h,  v5.8h
+        subs            w3,  w3,  #16
+        st1             {v20.8h, v21.8h}, [x0],  #32
+        st1             {v22.8h, v23.8h}, [x6],  #32
+        st1             {v24.8h, v25.8h}, [x5],  #32
+        st1             {v26.8h, v27.8h}, [x10], #32
+        b.gt            2b
+        subs            w4,  w4,  #4
+        b.le            9f
+        sub             x8,  x8,  w9, uxtw
+        add             x0,  x0,  x1
+        add             x6,  x6,  x1
+        add             x5,  x5,  x1
+        add             x10, x10, x1
+        mov             w3,  w9
+        b               1b
+9:
+        ret
+
+L(ipred_smooth_h_tbl):
+        .hword L(ipred_smooth_h_tbl) - 640b
+        .hword L(ipred_smooth_h_tbl) - 320b
+        .hword L(ipred_smooth_h_tbl) - 160b
+        .hword L(ipred_smooth_h_tbl) -  80b
+        .hword L(ipred_smooth_h_tbl) -  40b
+endfunc
+
+// void ipred_filter_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                              const pixel *const topleft,
+//                              const int width, const int height, const int filt_idx,
+//                              const int max_width, const int max_height,
+//                              const int bitdepth_max);
+.macro filter_fn bpc
+function ipred_filter_\bpc\()bpc_neon
+        and             w5,  w5,  #511
+        movrel          x6,  X(filter_intra_taps)
+        lsl             w5,  w5,  #6
+        add             x6,  x6,  w5, uxtw
+        ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
+        clz             w9,  w3
+        adr             x5,  L(ipred_filter\bpc\()_tbl)
+        ld1             {v20.8b, v21.8b, v22.8b}, [x6]
+        sub             w9,  w9,  #26
+        ldrh            w9,  [x5, w9, uxtw #1]
+        sxtl            v16.8h,  v16.8b
+        sxtl            v17.8h,  v17.8b
+        sub             x5,  x5,  w9, uxtw
+        sxtl            v18.8h,  v18.8b
+        sxtl            v19.8h,  v19.8b
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        sxtl            v20.8h,  v20.8b
+        sxtl            v21.8h,  v21.8b
+        sxtl            v22.8h,  v22.8b
+        dup             v31.8h,  w8
+        movi            v30.8h,  #0
+        br              x5
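+        // Each 2-row, 4-column output block is the 7-tap filter applied
+        // to {topleft, top[0..3], left[0..1]}: (sum + 8) >> 4, clipped
+        // to [0, bitdepth_max]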
+40:
+        ldur            d0,  [x2, #2]             // top (0-3)
+        sub             x2,  x2,  #4
+        mov             x7,  #-4
+4:
+        ld1             {v1.4h}, [x2], x7         // left (0-1) + topleft (2)
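+        // For 10 bpc the tap products and sums fit in 16 bits, so plain
+        // mul/mla can be used; 12 bpc needs the widening smull/smlal below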
+.if \bpc == 10
+        mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
+        mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
+        mla             v2.8h,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
+        mla             v2.8h,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
+        mla             v2.8h,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
+        mla             v2.8h,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
+        mla             v2.8h,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
+        srshr           v2.8h,   v2.8h,   #4
+        smax            v2.8h,   v2.8h,   v30.8h
+.else
+        smull           v2.4s,   v17.4h,  v0.h[0] // p1(top[0]) * filter(1)
+        smlal           v2.4s,   v18.4h,  v0.h[1] // p2(top[1]) * filter(2)
+        smlal           v2.4s,   v19.4h,  v0.h[2] // p3(top[2]) * filter(3)
+        smlal           v2.4s,   v20.4h,  v0.h[3] // p4(top[3]) * filter(4)
+        smlal           v2.4s,   v16.4h,  v1.h[2] // p0(topleft) * filter(0)
+        smlal           v2.4s,   v21.4h,  v1.h[1] // p5(left[0]) * filter(5)
+        smlal           v2.4s,   v22.4h,  v1.h[0] // p6(left[1]) * filter(6)
+        smull2          v3.4s,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
+        smlal2          v3.4s,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
+        smlal2          v3.4s,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
+        smlal2          v3.4s,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
+        smlal2          v3.4s,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
+        smlal2          v3.4s,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
+        smlal2          v3.4s,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
+        sqrshrun        v2.4h,   v2.4s,   #4
+        sqrshrun2       v2.8h,   v3.4s,   #4
+.endif
+        smin            v2.8h,   v2.8h,   v31.8h
+        subs            w4,  w4,  #2
+        st1             {v2.d}[0], [x0], x1
+        ext             v0.16b,  v2.16b,  v2.16b, #8 // move top from [4-7] to [0-3]
+        st1             {v2.d}[1], [x6], x1
+        b.gt            4b
+        ret
+80:
+        ldur            q0,  [x2, #2]             // top (0-7)
+        sub             x2,  x2,  #4
+        mov             x7,  #-4
+8:
+        ld1             {v1.4h}, [x2], x7         // left (0-1) + topleft (2)
+.if \bpc == 10
+        mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
+        mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
+        mla             v2.8h,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
+        mla             v2.8h,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
+        mla             v2.8h,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
+        mla             v2.8h,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
+        mla             v2.8h,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
+        mul             v3.8h,   v17.8h,  v0.h[4] // p1(top[0]) * filter(1)
+        mla             v3.8h,   v18.8h,  v0.h[5] // p2(top[1]) * filter(2)
+        mla             v3.8h,   v19.8h,  v0.h[6] // p3(top[2]) * filter(3)
+        srshr           v2.8h,   v2.8h,   #4
+        smax            v2.8h,   v2.8h,   v30.8h
+        smin            v2.8h,   v2.8h,   v31.8h
+        mla             v3.8h,   v20.8h,  v0.h[7] // p4(top[3]) * filter(4)
+        mla             v3.8h,   v16.8h,  v0.h[3] // p0(topleft) * filter(0)
+        mla             v3.8h,   v21.8h,  v2.h[3] // p5(left[0]) * filter(5)
+        mla             v3.8h,   v22.8h,  v2.h[7] // p6(left[1]) * filter(6)
+        srshr           v3.8h,   v3.8h,   #4
+        smax            v3.8h,   v3.8h,   v30.8h
+.else
+        smull           v2.4s,   v17.4h,  v0.h[0] // p1(top[0]) * filter(1)
+        smlal           v2.4s,   v18.4h,  v0.h[1] // p2(top[1]) * filter(2)
+        smlal           v2.4s,   v19.4h,  v0.h[2] // p3(top[2]) * filter(3)
+        smlal           v2.4s,   v20.4h,  v0.h[3] // p4(top[3]) * filter(4)
+        smlal           v2.4s,   v16.4h,  v1.h[2] // p0(topleft) * filter(0)
+        smlal           v2.4s,   v21.4h,  v1.h[1] // p5(left[0]) * filter(5)
+        smlal           v2.4s,   v22.4h,  v1.h[0] // p6(left[1]) * filter(6)
+        smull2          v3.4s,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
+        smlal2          v3.4s,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
+        smlal2          v3.4s,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
+        smlal2          v3.4s,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
+        smlal2          v3.4s,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
+        smlal2          v3.4s,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
+        smlal2          v3.4s,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
+        smull           v4.4s,   v17.4h,  v0.h[4] // p1(top[0]) * filter(1)
+        smlal           v4.4s,   v18.4h,  v0.h[5] // p2(top[1]) * filter(2)
+        smlal           v4.4s,   v19.4h,  v0.h[6] // p3(top[2]) * filter(3)
+        sqrshrun        v2.4h,   v2.4s,   #4
+        sqrshrun2       v2.8h,   v3.4s,   #4
+        smin            v2.8h,   v2.8h,   v31.8h
+        smlal           v4.4s,   v20.4h,  v0.h[7] // p4(top[3]) * filter(4)
+        smlal           v4.4s,   v16.4h,  v0.h[3] // p0(topleft) * filter(0)
+        smlal           v4.4s,   v21.4h,  v2.h[3] // p5(left[0]) * filter(5)
+        smlal           v4.4s,   v22.4h,  v2.h[7] // p6(left[1]) * filter(6)
+        smull2          v5.4s,   v17.8h,  v0.h[4] // p1(top[0]) * filter(1)
+        smlal2          v5.4s,   v18.8h,  v0.h[5] // p2(top[1]) * filter(2)
+        smlal2          v5.4s,   v19.8h,  v0.h[6] // p3(top[2]) * filter(3)
+        smlal2          v5.4s,   v20.8h,  v0.h[7] // p4(top[3]) * filter(4)
+        smlal2          v5.4s,   v16.8h,  v0.h[3] // p0(topleft) * filter(0)
+        smlal2          v5.4s,   v21.8h,  v2.h[3] // p5(left[0]) * filter(5)
+        smlal2          v5.4s,   v22.8h,  v2.h[7] // p6(left[1]) * filter(6)
+        sqrshrun        v3.4h,   v4.4s,   #4
+        sqrshrun2       v3.8h,   v5.4s,   #4
+.endif
+        smin            v3.8h,   v3.8h,   v31.8h
+        subs            w4,  w4,  #2
+        st2             {v2.d, v3.d}[0], [x0], x1
+        zip2            v0.2d,   v2.2d,   v3.2d
+        st2             {v2.d, v3.d}[1], [x6], x1
+        b.gt            8b
+        ret
+160:
+320:
+        add             x8,  x2,  #2
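+        // x8 walks the top row; x2 steps down the left edge,
+        // two rows per iteration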
+        sub             x2,  x2,  #4
+        mov             x7,  #-4
+        sub             x1,  x1,  w3, uxtw #1
+        mov             w9,  w3
+
+1:
+        ld1             {v0.4h}, [x2], x7         // left (0-1) + topleft (2)
+2:
+        ld1             {v1.8h, v2.8h}, [x8], #32 // top(0-15)
+.if \bpc == 10
+        mul             v3.8h,   v16.8h,  v0.h[2] // p0(topleft) * filter(0)
+        mla             v3.8h,   v21.8h,  v0.h[1] // p5(left[0]) * filter(5)
+        mla             v3.8h,   v22.8h,  v0.h[0] // p6(left[1]) * filter(6)
+        mla             v3.8h,   v17.8h,  v1.h[0] // p1(top[0]) * filter(1)
+        mla             v3.8h,   v18.8h,  v1.h[1] // p2(top[1]) * filter(2)
+        mla             v3.8h,   v19.8h,  v1.h[2] // p3(top[2]) * filter(3)
+        mla             v3.8h,   v20.8h,  v1.h[3] // p4(top[3]) * filter(4)
+
+        mul             v4.8h,   v17.8h,  v1.h[4] // p1(top[0]) * filter(1)
+        mla             v4.8h,   v18.8h,  v1.h[5] // p2(top[1]) * filter(2)
+        mla             v4.8h,   v19.8h,  v1.h[6] // p3(top[2]) * filter(3)
+        srshr           v3.8h,   v3.8h,   #4
+        smax            v3.8h,   v3.8h,   v30.8h
+        smin            v3.8h,   v3.8h,   v31.8h
+        mla             v4.8h,   v20.8h,  v1.h[7] // p4(top[3]) * filter(4)
+        mla             v4.8h,   v16.8h,  v1.h[3] // p0(topleft) * filter(0)
+        mla             v4.8h,   v21.8h,  v3.h[3] // p5(left[0]) * filter(5)
+        mla             v4.8h,   v22.8h,  v3.h[7] // p6(left[1]) * filter(6)
+
+        mul             v5.8h,   v17.8h,  v2.h[0] // p1(top[0]) * filter(1)
+        mla             v5.8h,   v18.8h,  v2.h[1] // p2(top[1]) * filter(2)
+        mla             v5.8h,   v19.8h,  v2.h[2] // p3(top[2]) * filter(3)
+        srshr           v4.8h,   v4.8h,   #4
+        smax            v4.8h,   v4.8h,   v30.8h
+        smin            v4.8h,   v4.8h,   v31.8h
+        mla             v5.8h,   v20.8h,  v2.h[3] // p4(top[3]) * filter(4)
+        mla             v5.8h,   v16.8h,  v1.h[7] // p0(topleft) * filter(0)
+        mla             v5.8h,   v21.8h,  v4.h[3] // p5(left[0]) * filter(5)
+        mla             v5.8h,   v22.8h,  v4.h[7] // p6(left[1]) * filter(6)
+
+        mul             v6.8h,   v17.8h,  v2.h[4] // p1(top[0]) * filter(1)
+        mla             v6.8h,   v18.8h,  v2.h[5] // p2(top[1]) * filter(2)
+        mla             v6.8h,   v19.8h,  v2.h[6] // p3(top[2]) * filter(3)
+        srshr           v5.8h,   v5.8h,   #4
+        smax            v5.8h,   v5.8h,   v30.8h
+        smin            v5.8h,   v5.8h,   v31.8h
+        mla             v6.8h,   v20.8h,  v2.h[7] // p4(top[3]) * filter(4)
+        mla             v6.8h,   v16.8h,  v2.h[3] // p0(topleft) * filter(0)
+        mla             v6.8h,   v21.8h,  v5.h[3] // p5(left[0]) * filter(5)
+        mla             v6.8h,   v22.8h,  v5.h[7] // p6(left[1]) * filter(6)
+
+        subs            w3,  w3,  #16
+        srshr           v6.8h,   v6.8h,   #4
+        smax            v6.8h,   v6.8h,   v30.8h
+.else
+        smull           v3.4s,   v16.4h,  v0.h[2] // p0(topleft) * filter(0)
+        smlal           v3.4s,   v21.4h,  v0.h[1] // p5(left[0]) * filter(5)
+        smlal           v3.4s,   v22.4h,  v0.h[0] // p6(left[1]) * filter(6)
+        smlal           v3.4s,   v17.4h,  v1.h[0] // p1(top[0]) * filter(1)
+        smlal           v3.4s,   v18.4h,  v1.h[1] // p2(top[1]) * filter(2)
+        smlal           v3.4s,   v19.4h,  v1.h[2] // p3(top[2]) * filter(3)
+        smlal           v3.4s,   v20.4h,  v1.h[3] // p4(top[3]) * filter(4)
+        smull2          v4.4s,   v16.8h,  v0.h[2] // p0(topleft) * filter(0)
+        smlal2          v4.4s,   v21.8h,  v0.h[1] // p5(left[0]) * filter(5)
+        smlal2          v4.4s,   v22.8h,  v0.h[0] // p6(left[1]) * filter(6)
+        smlal2          v4.4s,   v17.8h,  v1.h[0] // p1(top[0]) * filter(1)
+        smlal2          v4.4s,   v18.8h,  v1.h[1] // p2(top[1]) * filter(2)
+        smlal2          v4.4s,   v19.8h,  v1.h[2] // p3(top[2]) * filter(3)
+        smlal2          v4.4s,   v20.8h,  v1.h[3] // p4(top[3]) * filter(4)
+
+        smull           v5.4s,   v17.4h,  v1.h[4] // p1(top[0]) * filter(1)
+        smlal           v5.4s,   v18.4h,  v1.h[5] // p2(top[1]) * filter(2)
+        smlal           v5.4s,   v19.4h,  v1.h[6] // p3(top[2]) * filter(3)
+        sqrshrun        v3.4h,   v3.4s,   #4
+        sqrshrun2       v3.8h,   v4.4s,   #4
+        smin            v3.8h,   v3.8h,   v31.8h
+        smlal           v5.4s,   v20.4h,  v1.h[7] // p4(top[3]) * filter(4)
+        smlal           v5.4s,   v16.4h,  v1.h[3] // p0(topleft) * filter(0)
+        smlal           v5.4s,   v21.4h,  v3.h[3] // p5(left[0]) * filter(5)
+        smlal           v5.4s,   v22.4h,  v3.h[7] // p6(left[1]) * filter(6)
+        smull2          v6.4s,   v17.8h,  v1.h[4] // p1(top[0]) * filter(1)
+        smlal2          v6.4s,   v18.8h,  v1.h[5] // p2(top[1]) * filter(2)
+        smlal2          v6.4s,   v19.8h,  v1.h[6] // p3(top[2]) * filter(3)
+        smlal2          v6.4s,   v20.8h,  v1.h[7] // p4(top[3]) * filter(4)
+        smlal2          v6.4s,   v16.8h,  v1.h[3] // p0(topleft) * filter(0)
+        smlal2          v6.4s,   v21.8h,  v3.h[3] // p5(left[0]) * filter(5)
+        smlal2          v6.4s,   v22.8h,  v3.h[7] // p6(left[1]) * filter(6)
+
+        smull           v24.4s,  v17.4h,  v2.h[0] // p1(top[0]) * filter(1)
+        smlal           v24.4s,  v18.4h,  v2.h[1] // p2(top[1]) * filter(2)
+        smlal           v24.4s,  v19.4h,  v2.h[2] // p3(top[2]) * filter(3)
+        sqrshrun        v4.4h,   v5.4s,   #4
+        sqrshrun2       v4.8h,   v6.4s,   #4
+        smin            v4.8h,   v4.8h,   v31.8h
+        smlal           v24.4s,  v20.4h,  v2.h[3] // p4(top[3]) * filter(4)
+        smlal           v24.4s,  v16.4h,  v1.h[7] // p0(topleft) * filter(0)
+        smlal           v24.4s,  v21.4h,  v4.h[3] // p5(left[0]) * filter(5)
+        smlal           v24.4s,  v22.4h,  v4.h[7] // p6(left[1]) * filter(6)
+        smull2          v25.4s,  v17.8h,  v2.h[0] // p1(top[0]) * filter(1)
+        smlal2          v25.4s,  v18.8h,  v2.h[1] // p2(top[1]) * filter(2)
+        smlal2          v25.4s,  v19.8h,  v2.h[2] // p3(top[2]) * filter(3)
+        smlal2          v25.4s,  v20.8h,  v2.h[3] // p4(top[3]) * filter(4)
+        smlal2          v25.4s,  v16.8h,  v1.h[7] // p0(topleft) * filter(0)
+        smlal2          v25.4s,  v21.8h,  v4.h[3] // p5(left[0]) * filter(5)
+        smlal2          v25.4s,  v22.8h,  v4.h[7] // p6(left[1]) * filter(6)
+
+        smull           v26.4s,  v17.4h,  v2.h[4] // p1(top[0]) * filter(1)
+        smlal           v26.4s,  v18.4h,  v2.h[5] // p2(top[1]) * filter(2)
+        smlal           v26.4s,  v19.4h,  v2.h[6] // p3(top[2]) * filter(3)
+        sqrshrun        v5.4h,   v24.4s,  #4
+        sqrshrun2       v5.8h,   v25.4s,  #4
+        smin            v5.8h,   v5.8h,   v31.8h
+        smlal           v26.4s,  v20.4h,  v2.h[7] // p4(top[3]) * filter(4)
+        smlal           v26.4s,  v16.4h,  v2.h[3] // p0(topleft) * filter(0)
+        smlal           v26.4s,  v21.4h,  v5.h[3] // p5(left[0]) * filter(5)
+        smlal           v26.4s,  v22.4h,  v5.h[7] // p6(left[1]) * filter(6)
+        smull2          v27.4s,  v17.8h,  v2.h[4] // p1(top[0]) * filter(1)
+        smlal2          v27.4s,  v18.8h,  v2.h[5] // p2(top[1]) * filter(2)
+        smlal2          v27.4s,  v19.8h,  v2.h[6] // p3(top[2]) * filter(3)
+        smlal2          v27.4s,  v20.8h,  v2.h[7] // p4(top[3]) * filter(4)
+        smlal2          v27.4s,  v16.8h,  v2.h[3] // p0(topleft) * filter(0)
+        smlal2          v27.4s,  v21.8h,  v5.h[3] // p5(left[0]) * filter(5)
+        smlal2          v27.4s,  v22.8h,  v5.h[7] // p6(left[1]) * filter(6)
+
+        subs            w3,  w3,  #16
+        sqrshrun        v6.4h,   v26.4s,  #4
+        sqrshrun2       v6.8h,   v27.4s,  #4
+.endif
+        smin            v6.8h,   v6.8h,   v31.8h
+
+        ins             v0.h[2], v2.h[7]
+        st4             {v3.d, v4.d, v5.d, v6.d}[0], [x0], #32
+        ins             v0.h[0], v6.h[7]
+        st4             {v3.d, v4.d, v5.d, v6.d}[1], [x6], #32
+        ins             v0.h[1], v6.h[3]
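+        // v0 now holds the topleft (top[15]) and the two left pixels
+        // (the last output column) for the next 16 columns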
+        b.gt            2b
+        subs            w4,  w4,  #2
+        b.le            9f
+        sub             x8,  x6,  w9, uxtw #1
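+        // the second row just written becomes the top row of the
+        // next two rows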
+        add             x0,  x0,  x1
+        add             x6,  x6,  x1
+        mov             w3,  w9
+        b               1b
+9:
+        ret
+
+L(ipred_filter\bpc\()_tbl):
+        .hword L(ipred_filter\bpc\()_tbl) - 320b
+        .hword L(ipred_filter\bpc\()_tbl) - 160b
+        .hword L(ipred_filter\bpc\()_tbl) -  80b
+        .hword L(ipred_filter\bpc\()_tbl) -  40b
+endfunc
+.endm
+
+filter_fn 10
+filter_fn 12
+
+function ipred_filter_16bpc_neon, export=1
+        ldr             w8,  [sp]
+        cmp             w8,  #0x3ff   // bitdepth_max is 0x3ff for 10 bpc content
+        b.le            ipred_filter_10bpc_neon
+        b               ipred_filter_12bpc_neon
+endfunc
+
+// void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                          const uint16_t *const pal, const uint8_t *idx,
+//                          const int w, const int h);
+function pal_pred_16bpc_neon, export=1
+        ld1             {v30.8h}, [x2]
+        clz             w9,  w4
+        adr             x6,  L(pal_pred_tbl)
+        sub             w9,  w9,  #25
+        ldrh            w9,  [x6, w9, uxtw #1]
+        movi            v31.8h,  #1, lsl #8
+        sub             x6,  x6,  w9, uxtw
+        br              x6
+40:
+        add             x2,  x0,  x1
+        lsl             x1,  x1,  #1
+4:
+        ld1             {v1.16b}, [x3], #16
+        subs            w5,  w5,  #4
+        // Restructure v1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ...
+        add             v1.16b,  v1.16b,  v1.16b
+        zip1            v0.16b,  v1.16b,  v1.16b
+        zip2            v1.16b,  v1.16b,  v1.16b
+        add             v0.8h,   v0.8h,   v31.8h
+        add             v1.8h,   v1.8h,   v31.8h
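+        // adding 0x0100 per 16 bit lane turns the duplicated pair
+        // (2*i, 2*i) into (2*i, 2*i+1), the byte indices of 16 bit
+        // palette entry i for the tbl lookups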
+        tbl             v0.16b, {v30.16b}, v0.16b
+        st1             {v0.d}[0], [x0], x1
+        tbl             v1.16b, {v30.16b}, v1.16b
+        st1             {v0.d}[1], [x2], x1
+        st1             {v1.d}[0], [x0], x1
+        st1             {v1.d}[1], [x2], x1
+        b.gt            4b
+        ret
+80:
+        add             x2,  x0,  x1
+        lsl             x1,  x1,  #1
+8:
+        ld1             {v2.16b, v3.16b}, [x3], #32
+        subs            w5,  w5,  #4
+        add             v2.16b,  v2.16b,  v2.16b
+        add             v3.16b,  v3.16b,  v3.16b
+        zip1            v0.16b,  v2.16b,  v2.16b
+        zip2            v1.16b,  v2.16b,  v2.16b
+        zip1            v2.16b,  v3.16b,  v3.16b
+        zip2            v3.16b,  v3.16b,  v3.16b
+        add             v0.8h,   v0.8h,   v31.8h
+        add             v1.8h,   v1.8h,   v31.8h
+        add             v2.8h,   v2.8h,   v31.8h
+        add             v3.8h,   v3.8h,   v31.8h
+        tbl             v0.16b, {v30.16b}, v0.16b
+        tbl             v1.16b, {v30.16b}, v1.16b
+        st1             {v0.8h}, [x0], x1
+        tbl             v2.16b, {v30.16b}, v2.16b
+        st1             {v1.8h}, [x2], x1
+        tbl             v3.16b, {v30.16b}, v3.16b
+        st1             {v2.8h}, [x0], x1
+        st1             {v3.8h}, [x2], x1
+        b.gt            8b
+        ret
+160:
+        add             x2,  x0,  x1
+        lsl             x1,  x1,  #1
+16:
+        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64
+        subs            w5,  w5,  #4
+        add             v4.16b,  v4.16b,  v4.16b
+        add             v5.16b,  v5.16b,  v5.16b
+        add             v6.16b,  v6.16b,  v6.16b
+        add             v7.16b,  v7.16b,  v7.16b
+        zip1            v0.16b,  v4.16b,  v4.16b
+        zip2            v1.16b,  v4.16b,  v4.16b
+        zip1            v2.16b,  v5.16b,  v5.16b
+        zip2            v3.16b,  v5.16b,  v5.16b
+        zip1            v4.16b,  v6.16b,  v6.16b
+        zip2            v5.16b,  v6.16b,  v6.16b
+        zip1            v6.16b,  v7.16b,  v7.16b
+        zip2            v7.16b,  v7.16b,  v7.16b
+        add             v0.8h,   v0.8h,   v31.8h
+        add             v1.8h,   v1.8h,   v31.8h
+        add             v2.8h,   v2.8h,   v31.8h
+        add             v3.8h,   v3.8h,   v31.8h
+        add             v4.8h,   v4.8h,   v31.8h
+        tbl             v0.16b, {v30.16b}, v0.16b
+        add             v5.8h,   v5.8h,   v31.8h
+        tbl             v1.16b, {v30.16b}, v1.16b
+        add             v6.8h,   v6.8h,   v31.8h
+        tbl             v2.16b, {v30.16b}, v2.16b
+        add             v7.8h,   v7.8h,   v31.8h
+        tbl             v3.16b, {v30.16b}, v3.16b
+        tbl             v4.16b, {v30.16b}, v4.16b
+        tbl             v5.16b, {v30.16b}, v5.16b
+        st1             {v0.8h, v1.8h}, [x0], x1
+        tbl             v6.16b, {v30.16b}, v6.16b
+        st1             {v2.8h, v3.8h}, [x2], x1
+        tbl             v7.16b, {v30.16b}, v7.16b
+        st1             {v4.8h, v5.8h}, [x0], x1
+        st1             {v6.8h, v7.8h}, [x2], x1
+        b.gt            16b
+        ret
+320:
+        add             x2,  x0,  x1
+        lsl             x1,  x1,  #1
+32:
+        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64
+        subs            w5,  w5,  #2
+        add             v4.16b,  v4.16b,  v4.16b
+        add             v5.16b,  v5.16b,  v5.16b
+        add             v6.16b,  v6.16b,  v6.16b
+        add             v7.16b,  v7.16b,  v7.16b
+        zip1            v0.16b,  v4.16b,  v4.16b
+        zip2            v1.16b,  v4.16b,  v4.16b
+        zip1            v2.16b,  v5.16b,  v5.16b
+        zip2            v3.16b,  v5.16b,  v5.16b
+        zip1            v4.16b,  v6.16b,  v6.16b
+        zip2            v5.16b,  v6.16b,  v6.16b
+        zip1            v6.16b,  v7.16b,  v7.16b
+        zip2            v7.16b,  v7.16b,  v7.16b
+        add             v0.8h,   v0.8h,   v31.8h
+        add             v1.8h,   v1.8h,   v31.8h
+        add             v2.8h,   v2.8h,   v31.8h
+        add             v3.8h,   v3.8h,   v31.8h
+        add             v4.8h,   v4.8h,   v31.8h
+        tbl             v0.16b, {v30.16b}, v0.16b
+        add             v5.8h,   v5.8h,   v31.8h
+        tbl             v1.16b, {v30.16b}, v1.16b
+        add             v6.8h,   v6.8h,   v31.8h
+        tbl             v2.16b, {v30.16b}, v2.16b
+        add             v7.8h,   v7.8h,   v31.8h
+        tbl             v3.16b, {v30.16b}, v3.16b
+        tbl             v4.16b, {v30.16b}, v4.16b
+        tbl             v5.16b, {v30.16b}, v5.16b
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        tbl             v6.16b, {v30.16b}, v6.16b
+        tbl             v7.16b, {v30.16b}, v7.16b
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1
+        b.gt            32b
+        ret
+640:
+        add             x2,  x0,  #64
+64:
+        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64
+        subs            w5,  w5,  #1
+        add             v4.16b,  v4.16b,  v4.16b
+        add             v5.16b,  v5.16b,  v5.16b
+        add             v6.16b,  v6.16b,  v6.16b
+        add             v7.16b,  v7.16b,  v7.16b
+        zip1            v0.16b,  v4.16b,  v4.16b
+        zip2            v1.16b,  v4.16b,  v4.16b
+        zip1            v2.16b,  v5.16b,  v5.16b
+        zip2            v3.16b,  v5.16b,  v5.16b
+        zip1            v4.16b,  v6.16b,  v6.16b
+        zip2            v5.16b,  v6.16b,  v6.16b
+        zip1            v6.16b,  v7.16b,  v7.16b
+        zip2            v7.16b,  v7.16b,  v7.16b
+        add             v0.8h,   v0.8h,   v31.8h
+        add             v1.8h,   v1.8h,   v31.8h
+        add             v2.8h,   v2.8h,   v31.8h
+        add             v3.8h,   v3.8h,   v31.8h
+        add             v4.8h,   v4.8h,   v31.8h
+        tbl             v0.16b, {v30.16b}, v0.16b
+        add             v5.8h,   v5.8h,   v31.8h
+        tbl             v1.16b, {v30.16b}, v1.16b
+        add             v6.8h,   v6.8h,   v31.8h
+        tbl             v2.16b, {v30.16b}, v2.16b
+        add             v7.8h,   v7.8h,   v31.8h
+        tbl             v3.16b, {v30.16b}, v3.16b
+        tbl             v4.16b, {v30.16b}, v4.16b
+        tbl             v5.16b, {v30.16b}, v5.16b
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        tbl             v6.16b, {v30.16b}, v6.16b
+        tbl             v7.16b, {v30.16b}, v7.16b
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1
+        b.gt            64b
+        ret
+
+L(pal_pred_tbl):
+        .hword L(pal_pred_tbl) - 640b
+        .hword L(pal_pred_tbl) - 320b
+        .hword L(pal_pred_tbl) - 160b
+        .hword L(pal_pred_tbl) -  80b
+        .hword L(pal_pred_tbl) -  40b
+endfunc
+
+// void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                               const pixel *const topleft,
+//                               const int width, const int height,
+//                               const int16_t *ac, const int alpha,
+//                               const int bitdepth_max);
+function ipred_cfl_128_16bpc_neon, export=1
+        dup             v31.8h,  w7   // bitdepth_max
+        clz             w9,  w3
+        adr             x7,  L(ipred_cfl_128_tbl)
+        sub             w9,  w9,  #26
+        ldrh            w9,  [x7, w9, uxtw #1]
+        urshr           v0.8h,   v31.8h,  #1
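+        // dc = (bitdepth_max + 1) >> 1, the midpoint of the pixel range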
+        dup             v1.8h,   w6   // alpha
+        sub             x7,  x7,  w9, uxtw
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        movi            v30.8h,  #0
+        br              x7
+L(ipred_cfl_splat_w4):
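+        // diff = ac*alpha is rounded towards zero: adding the sign bit
+        // before the rounding shift is equivalent to the C
+        // apply_sign((abs(diff) + 32) >> 6, diff)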
+        ld1             {v4.8h, v5.8h}, [x5], #32
+        subs            w4,  w4,  #4
+        smull           v2.4s,   v4.4h,   v1.4h  // diff = ac * alpha
+        smull2          v3.4s,   v4.8h,   v1.8h
+        smull           v4.4s,   v5.4h,   v1.4h
+        smull2          v5.4s,   v5.8h,   v1.8h
+        sshr            v16.4s,  v2.4s,   #31    // sign = diff >> 31
+        sshr            v17.4s,  v3.4s,   #31
+        sshr            v18.4s,  v4.4s,   #31
+        sshr            v19.4s,  v5.4s,   #31
+        add             v2.4s,   v2.4s,   v16.4s // diff + sign
+        add             v3.4s,   v3.4s,   v17.4s
+        add             v4.4s,   v4.4s,   v18.4s
+        add             v5.4s,   v5.4s,   v19.4s
+        rshrn           v2.4h,   v2.4s,   #6     // (diff + sign + 32) >> 6 = apply_sign()
+        rshrn2          v2.8h,   v3.4s,   #6
+        rshrn           v3.4h,   v4.4s,   #6
+        rshrn2          v3.8h,   v5.4s,   #6
+        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
+        add             v3.8h,   v3.8h,   v0.8h
+        smax            v2.8h,   v2.8h,   v30.8h
+        smax            v3.8h,   v3.8h,   v30.8h
+        smin            v2.8h,   v2.8h,   v31.8h
+        smin            v3.8h,   v3.8h,   v31.8h
+        st1             {v2.d}[0],  [x0], x1
+        st1             {v2.d}[1],  [x6], x1
+        st1             {v3.d}[0],  [x0], x1
+        st1             {v3.d}[1],  [x6], x1
+        b.gt            L(ipred_cfl_splat_w4)
+        ret
+L(ipred_cfl_splat_w8):
+        ld1             {v4.8h, v5.8h}, [x5], #32
+        subs            w4,  w4,  #2
+        smull           v2.4s,   v4.4h,   v1.4h  // diff = ac * alpha
+        smull2          v3.4s,   v4.8h,   v1.8h
+        smull           v4.4s,   v5.4h,   v1.4h
+        smull2          v5.4s,   v5.8h,   v1.8h
+        sshr            v16.4s,  v2.4s,   #31    // sign = diff >> 31
+        sshr            v17.4s,  v3.4s,   #31
+        sshr            v18.4s,  v4.4s,   #31
+        sshr            v19.4s,  v5.4s,   #31
+        add             v2.4s,   v2.4s,   v16.4s // diff + sign
+        add             v3.4s,   v3.4s,   v17.4s
+        add             v4.4s,   v4.4s,   v18.4s
+        add             v5.4s,   v5.4s,   v19.4s
+        rshrn           v2.4h,   v2.4s,   #6     // (diff + sign + 32) >> 6 = apply_sign()
+        rshrn2          v2.8h,   v3.4s,   #6
+        rshrn           v3.4h,   v4.4s,   #6
+        rshrn2          v3.8h,   v5.4s,   #6
+        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
+        add             v3.8h,   v3.8h,   v0.8h
+        smax            v2.8h,   v2.8h,   v30.8h
+        smax            v3.8h,   v3.8h,   v30.8h
+        smin            v2.8h,   v2.8h,   v31.8h
+        smin            v3.8h,   v3.8h,   v31.8h
+        st1             {v2.8h},  [x0], x1
+        st1             {v3.8h},  [x6], x1
+        b.gt            L(ipred_cfl_splat_w8)
+        ret
+L(ipred_cfl_splat_w16):
+        add             x7,  x5,  w3, uxtw #1
+        sub             x1,  x1,  w3, uxtw #1
+        mov             w9,  w3
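+        // x5/x7 (and x0/x6) walk two rows in parallel, 16 pixels per
+        // iteration of the inner loop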
+1:
+        ld1             {v2.8h, v3.8h}, [x5], #32
+        ld1             {v4.8h, v5.8h}, [x7], #32
+        subs            w3,  w3,  #16
+        smull           v16.4s,  v2.4h,   v1.4h  // diff = ac * alpha
+        smull2          v17.4s,  v2.8h,   v1.8h
+        smull           v18.4s,  v3.4h,   v1.4h
+        smull2          v19.4s,  v3.8h,   v1.8h
+        smull           v2.4s,   v4.4h,   v1.4h
+        smull2          v3.4s,   v4.8h,   v1.8h
+        smull           v4.4s,   v5.4h,   v1.4h
+        smull2          v5.4s,   v5.8h,   v1.8h
+        sshr            v20.4s,  v16.4s,  #31    // sign = diff >> 31
+        sshr            v21.4s,  v17.4s,  #31
+        sshr            v22.4s,  v18.4s,  #31
+        sshr            v23.4s,  v19.4s,  #31
+        sshr            v24.4s,  v2.4s,   #31
+        sshr            v25.4s,  v3.4s,   #31
+        sshr            v26.4s,  v4.4s,   #31
+        sshr            v27.4s,  v5.4s,   #31
+        add             v16.4s,  v16.4s,  v20.4s // diff + sign
+        add             v17.4s,  v17.4s,  v21.4s
+        add             v18.4s,  v18.4s,  v22.4s
+        add             v19.4s,  v19.4s,  v23.4s
+        add             v2.4s,   v2.4s,   v24.4s
+        add             v3.4s,   v3.4s,   v25.4s
+        add             v4.4s,   v4.4s,   v26.4s
+        add             v5.4s,   v5.4s,   v27.4s
+        rshrn           v16.4h,  v16.4s,  #6     // (diff + sign + 32) >> 6 = apply_sign()
+        rshrn2          v16.8h,  v17.4s,  #6
+        rshrn           v17.4h,  v18.4s,  #6
+        rshrn2          v17.8h,  v19.4s,  #6
+        rshrn           v6.4h,   v2.4s,   #6
+        rshrn2          v6.8h,   v3.4s,   #6
+        rshrn           v7.4h,   v4.4s,   #6
+        rshrn2          v7.8h,   v5.4s,   #6
+        add             v2.8h,   v16.8h,  v0.8h  // dc + apply_sign()
+        add             v3.8h,   v17.8h,  v0.8h
+        add             v4.8h,   v6.8h,   v0.8h
+        add             v5.8h,   v7.8h,   v0.8h
+        smax            v2.8h,   v2.8h,   v30.8h
+        smax            v3.8h,   v3.8h,   v30.8h
+        smax            v4.8h,   v4.8h,   v30.8h
+        smax            v5.8h,   v5.8h,   v30.8h
+        smin            v2.8h,   v2.8h,   v31.8h
+        smin            v3.8h,   v3.8h,   v31.8h
+        smin            v4.8h,   v4.8h,   v31.8h
+        smin            v5.8h,   v5.8h,   v31.8h
+        st1             {v2.8h, v3.8h},  [x0], #32
+        st1             {v4.8h, v5.8h},  [x6], #32
+        b.gt            1b
+        subs            w4,  w4,  #2
+        add             x5,  x5,  w9, uxtw #1
+        add             x7,  x7,  w9, uxtw #1
+        add             x0,  x0,  x1
+        add             x6,  x6,  x1
+        mov             w3,  w9
+        b.gt            1b
+        ret
+
+L(ipred_cfl_128_tbl):
+L(ipred_cfl_splat_tbl):
+        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
+        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
+        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8)
+        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4)
+endfunc
+
+// void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                               const pixel *const topleft,
+//                               const int width, const int height,
+//                               const int16_t *ac, const int alpha,
+//                               const int bitdepth_max);
+function ipred_cfl_top_16bpc_neon, export=1
+        dup             v31.8h,  w7   // bitdepth_max
+        clz             w9,  w3
+        adr             x7,  L(ipred_cfl_top_tbl)
+        sub             w9,  w9,  #26
+        ldrh            w9,  [x7, w9, uxtw #1]
+        dup             v1.8h,   w6   // alpha
+        add             x2,  x2,  #2
+        sub             x7,  x7,  w9, uxtw
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        movi            v30.8h,  #0
+        br              x7
+4:
+        ld1             {v0.4h},  [x2]
+        addv            h0,      v0.4h
+        urshr           v0.4h,   v0.4h,   #2
+        dup             v0.8h,   v0.h[0]
+        b               L(ipred_cfl_splat_w4)
+8:
+        ld1             {v0.8h},  [x2]
+        addv            h0,      v0.8h
+        urshr           v0.4h,   v0.4h,   #3
+        dup             v0.8h,   v0.h[0]
+        b               L(ipred_cfl_splat_w8)
+16:
+        ld1             {v2.8h, v3.8h}, [x2]
+        addp            v0.8h,   v2.8h,   v3.8h
+        addv            h0,      v0.8h
+        urshr           v0.4h,   v0.4h,   #4
+        dup             v0.8h,   v0.h[0]
+        b               L(ipred_cfl_splat_w16)
+32:
+        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
+        addp            v2.8h,   v2.8h,   v3.8h
+        addp            v4.8h,   v4.8h,   v5.8h
+        addp            v0.8h,   v2.8h,   v4.8h
+        uaddlv          s0,      v0.8h
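+        // a 32 pixel sum can exceed 16 bits, hence the widening uaddlv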
+        rshrn           v0.4h,   v0.4s,   #5
+        dup             v0.8h,   v0.h[0]
+        b               L(ipred_cfl_splat_w16)
+
+L(ipred_cfl_top_tbl):
+        .hword L(ipred_cfl_top_tbl) - 32b
+        .hword L(ipred_cfl_top_tbl) - 16b
+        .hword L(ipred_cfl_top_tbl) -  8b
+        .hword L(ipred_cfl_top_tbl) -  4b
+endfunc
+
+// void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                                const pixel *const topleft,
+//                                const int width, const int height,
+//                                const int16_t *ac, const int alpha,
+//                                const int bitdepth_max);
+function ipred_cfl_left_16bpc_neon, export=1
+        dup             v31.8h,  w7   // bitdepth_max
+        sub             x2,  x2,  w4, uxtw #1
+        clz             w9,  w3
+        clz             w8,  w4
+        adr             x10, L(ipred_cfl_splat_tbl)
+        adr             x7,  L(ipred_cfl_left_tbl)
+        sub             w9,  w9,  #26
+        sub             w8,  w8,  #26
+        ldrh            w9,  [x10, w9, uxtw #1]
+        ldrh            w8,  [x7,  w8, uxtw #1]
+        dup             v1.8h,   w6   // alpha
+        sub             x9,  x10, w9, uxtw
+        sub             x7,  x7,  w8, uxtw
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        movi            v30.8h,  #0
+        br              x7
+
+L(ipred_cfl_left_h4):
+        ld1             {v0.4h},  [x2]
+        addv            h0,      v0.4h
+        urshr           v0.4h,   v0.4h,   #2
+        dup             v0.8h,   v0.h[0]
+        br              x9
+
+L(ipred_cfl_left_h8):
+        ld1             {v0.8h},  [x2]
+        addv            h0,      v0.8h
+        urshr           v0.4h,   v0.4h,   #3
+        dup             v0.8h,   v0.h[0]
+        br              x9
+
+L(ipred_cfl_left_h16):
+        ld1             {v2.8h, v3.8h}, [x2]
+        addp            v0.8h,   v2.8h,   v3.8h
+        addv            h0,      v0.8h
+        urshr           v0.4h,   v0.4h,   #4
+        dup             v0.8h,   v0.h[0]
+        br              x9
+
+L(ipred_cfl_left_h32):
+        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
+        addp            v2.8h,   v2.8h,   v3.8h
+        addp            v4.8h,   v4.8h,   v5.8h
+        addp            v0.8h,   v2.8h,   v4.8h
+        uaddlv          s0,      v0.8h
+        rshrn           v0.4h,   v0.4s,   #5
+        dup             v0.8h,   v0.h[0]
+        br              x9
+
+L(ipred_cfl_left_tbl):
+        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32)
+        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16)
+        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8)
+        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4)
+endfunc
+
+// void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                           const pixel *const topleft,
+//                           const int width, const int height,
+//                           const int16_t *ac, const int alpha,
+//                           const int bitdepth_max);
+function ipred_cfl_16bpc_neon, export=1
+        dup             v31.8h,  w7              // bitdepth_max
+        sub             x2,  x2,  w4, uxtw #1
+        add             w8,  w3,  w4             // width + height
+        dup             v1.8h,   w6              // alpha
+        clz             w9,  w3
+        clz             w6,  w4
+        dup             v16.4s, w8               // width + height
+        adr             x7,  L(ipred_cfl_tbl)
+        rbit            w8,  w8                  // rbit(width + height)
+        sub             w9,  w9,  #22            // 26 leading bits, minus table offset 4
+        sub             w6,  w6,  #26
+        clz             w8,  w8                  // ctz(width + height)
+        ldrh            w9,  [x7, w9, uxtw #1]
+        ldrh            w6,  [x7, w6, uxtw #1]
+        neg             w8,  w8                  // -ctz(width + height)
+        sub             x9,  x7,  w9, uxtw
+        sub             x7,  x7,  w6, uxtw
+        ushr            v16.4s,  v16.4s,  #1     // (width + height) >> 1
+        dup             v17.4s,  w8              // -ctz(width + height)
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        movi            v30.8h,  #0
+        br              x7
+
+L(ipred_cfl_h4):
+        ld1             {v0.4h},  [x2], #8
+        uaddlv          s0,      v0.4h
+        br              x9
+L(ipred_cfl_w4):
+        add             x2,  x2,  #2
+        ld1             {v2.4h},  [x2]
+        add             v0.2s,   v0.2s,   v16.2s
+        uaddlv          s2,      v2.4h
+        cmp             w4,  #4
+        add             v0.2s,   v0.2s,   v2.2s
+        ushl            v0.2s,   v0.2s,   v17.2s
+        b.eq            1f
+        // h = 8/16
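+        // 0x6667 = round(2^17/5) and 0xAAAB = round(2^17/3); multiplying
+        // and shifting right by 17 divides the power-of-two-reduced sum
+        // by the remaining factor of 5 or 3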
+        cmp             w4,  #16
+        mov             w16, #0x6667
+        mov             w17, #0xAAAB
+        csel            w16, w16, w17, eq
+        dup             v16.2s,  w16
+        mul             v0.2s,   v0.2s,   v16.2s
+        ushr            v0.2s,   v0.2s,   #17
+1:
+        dup             v0.8h,   v0.h[0]
+        b               L(ipred_cfl_splat_w4)
+
+L(ipred_cfl_h8):
+        ld1             {v0.8h},  [x2], #16
+        uaddlv          s0,      v0.8h
+        br              x9
+L(ipred_cfl_w8):
+        add             x2,  x2,  #2
+        ld1             {v2.8h},  [x2]
+        add             v0.2s,   v0.2s,   v16.2s
+        uaddlv          s2,      v2.8h
+        cmp             w4,  #8
+        add             v0.2s,   v0.2s,   v2.2s
+        ushl            v0.2s,   v0.2s,   v17.2s
+        b.eq            1f
+        // h = 4/16/32
+        cmp             w4,  #32
+        mov             w16, #0x6667
+        mov             w17, #0xAAAB
+        csel            w16, w16, w17, eq
+        dup             v16.2s,  w16
+        mul             v0.2s,   v0.2s,   v16.2s
+        ushr            v0.2s,   v0.2s,   #17
+1:
+        dup             v0.8h,   v0.h[0]
+        b               L(ipred_cfl_splat_w8)
+
+L(ipred_cfl_h16):
+        ld1             {v2.8h, v3.8h}, [x2], #32
+        addp            v0.8h,   v2.8h,   v3.8h
+        uaddlv          s0,      v0.8h
+        br              x9
+L(ipred_cfl_w16):
+        add             x2,  x2,  #2
+        ld1             {v2.8h, v3.8h}, [x2]
+        add             v0.2s,   v0.2s,   v16.2s
+        addp            v2.8h,   v2.8h,   v3.8h
+        uaddlv          s2,      v2.8h
+        cmp             w4,  #16
+        add             v0.2s,   v0.2s,   v2.2s
+        ushl            v0.2s,   v0.2s,   v17.2s
+        b.eq            1f
+        // h = 4/8/32
+        tst             w4,  #(32+16+8) // 16 added to make a consecutive bitmask
+        mov             w16, #0x6667
+        mov             w17, #0xAAAB
+        csel            w16, w16, w17, eq
+        dup             v16.2s,  w16
+        mul             v0.2s,   v0.2s,   v16.2s
+        ushr            v0.2s,   v0.2s,   #17
+1:
+        dup             v0.8h,   v0.h[0]
+        b               L(ipred_cfl_splat_w16)
+
+L(ipred_cfl_h32):
+        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2], #64
+        addp            v2.8h,   v2.8h,   v3.8h
+        addp            v4.8h,   v4.8h,   v5.8h
+        addp            v0.8h,   v2.8h,   v4.8h
+        uaddlv          s0,      v0.8h
+        br              x9
+L(ipred_cfl_w32):
+        add             x2,  x2,  #2
+        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
+        add             v0.4s,   v0.4s,   v16.4s
+        addp            v2.8h,   v2.8h,   v3.8h
+        addp            v4.8h,   v4.8h,   v5.8h
+        addp            v2.8h,   v2.8h,   v4.8h
+        cmp             w4,  #32
+        uaddlv          s2,      v2.8h
+        add             v0.2s,   v0.2s,   v2.2s
+        ushl            v0.2s,   v0.2s,   v17.2s
+        b.eq            1f
+        // h = 8/16
+        cmp             w4,  #8
+        mov             w16, #0x6667
+        mov             w17, #0xAAAB
+        csel            w16, w16, w17, eq
+        dup             v16.2s,  w16
+        mul             v0.2s,   v0.2s,   v16.2s
+        ushr            v0.2s,   v0.2s,   #17
+1:
+        dup             v0.8h,   v0.h[0]
+        b               L(ipred_cfl_splat_w16)
+
+L(ipred_cfl_tbl):
+        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32)
+        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16)
+        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8)
+        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4)
+        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32)
+        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16)
+        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8)
+        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4)
+endfunc
+
+// void cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx,
+//                            const ptrdiff_t stride, const int w_pad,
+//                            const int h_pad, const int cw, const int ch);
+function ipred_cfl_ac_420_16bpc_neon, export=1
+        clz             w8,  w5
+        lsl             w4,  w4,  #2
+        adr             x7,  L(ipred_cfl_ac_420_tbl)
+        sub             w8,  w8,  #27
+        ldrh            w8,  [x7, w8, uxtw #1]
+        movi            v24.4s,  #0
+        movi            v25.4s,  #0
+        movi            v26.4s,  #0
+        movi            v27.4s,  #0
+        sub             x7,  x7,  w8, uxtw
+        sub             w8,  w6,  w4         // height - h_pad
+        rbit            w9,  w5              // rbit(width)
+        rbit            w10, w6              // rbit(height)
+        clz             w9,  w9              // ctz(width)
+        clz             w10, w10             // ctz(height)
+        add             w9,  w9,  w10        // log2sz
+        add             x10, x1,  x2
+        dup             v31.4s,  w9
+        lsl             x2,  x2,  #1
+        neg             v31.4s,  v31.4s      // -log2sz
+        br              x7
+
+L(ipred_cfl_ac_420_w4):
+1:      // Copy and subsample input
+        ld1             {v0.8h}, [x1],  x2
+        ld1             {v1.8h}, [x10], x2
+        ld1             {v2.8h}, [x1],  x2
+        ld1             {v3.8h}, [x10], x2
+        addp            v0.8h,   v0.8h,   v2.8h
+        addp            v1.8h,   v1.8h,   v3.8h
+        add             v0.8h,   v0.8h,   v1.8h
+        shl             v0.8h,   v0.8h,   #1
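+        // the 2x2 sum is scaled by 2 so that every subsampling mode
+        // produces ac values weighted by 8x the source pixels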
+        subs            w8,  w8,  #2
+        st1             {v0.8h}, [x0], #16
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        b.gt            1b
+        trn2            v1.2d,   v0.2d,   v0.2d
+        trn2            v0.2d,   v0.2d,   v0.2d
+L(ipred_cfl_ac_420_w4_hpad):
+        cbz             w4,  3f
+2:      // Vertical padding (h_pad > 0)
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h}, [x0], #32
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        b.gt            2b
+3:
+L(ipred_cfl_ac_420_w4_calc_subtract_dc):
+        // Aggregate the sums
+        add             v24.4s,  v24.4s,  v25.4s
+        add             v26.4s,  v26.4s,  v27.4s
+        add             v0.4s,   v24.4s,  v26.4s
+        addv            s0,  v0.4s                // sum
+        sub             x0,  x0,  w6, uxtw #3
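+        // rewind x0 to the start of the ac buffer (w6 rows of 4
+        // coefficients; w6 is pre-scaled by the w8/w16 cases)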
+        urshl           v4.2s,   v0.2s,   v31.2s  // (sum + (1 << (log2sz - 1)))  >>= log2sz
+        dup             v4.8h,   v4.h[0]
+6:      // Subtract dc from ac
+        ld1             {v0.8h, v1.8h}, [x0]
+        subs            w6,  w6,  #4
+        sub             v0.8h,   v0.8h,   v4.8h
+        sub             v1.8h,   v1.8h,   v4.8h
+        st1             {v0.8h, v1.8h}, [x0], #32
+        b.gt            6b
+        ret
+
+L(ipred_cfl_ac_420_w8):
+        cbnz            w3,  L(ipred_cfl_ac_420_w8_wpad)
+1:      // Copy and subsample input, without padding
+        ld1             {v0.8h, v1.8h}, [x1],  x2
+        ld1             {v2.8h, v3.8h}, [x10], x2
+        ld1             {v4.8h, v5.8h}, [x1],  x2
+        addp            v0.8h,   v0.8h,   v1.8h
+        ld1             {v6.8h, v7.8h}, [x10], x2
+        addp            v2.8h,   v2.8h,   v3.8h
+        addp            v4.8h,   v4.8h,   v5.8h
+        addp            v6.8h,   v6.8h,   v7.8h
+        add             v0.8h,   v0.8h,   v2.8h
+        add             v4.8h,   v4.8h,   v6.8h
+        shl             v0.8h,   v0.8h,   #1
+        shl             v1.8h,   v4.8h,   #1
+        subs            w8,  w8,  #2
+        st1             {v0.8h, v1.8h}, [x0], #32
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        b.gt            1b
+        mov             v0.16b,  v1.16b
+        b               L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_420_w8_wpad):
+1:      // Copy and subsample input, padding 4
+        ld1             {v0.8h}, [x1],  x2
+        ld1             {v1.8h}, [x10], x2
+        ld1             {v2.8h}, [x1],  x2
+        ld1             {v3.8h}, [x10], x2
+        addp            v0.8h,   v0.8h,   v2.8h
+        addp            v1.8h,   v1.8h,   v3.8h
+        add             v0.8h,   v0.8h,   v1.8h
+        shl             v0.8h,   v0.8h,   #1
+        dup             v1.4h,   v0.h[3]
+        dup             v3.4h,   v0.h[7]
+        trn2            v2.2d,   v0.2d,   v0.2d
+        subs            w8,  w8,  #2
+        st1             {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw           v25.4s,  v25.4s,  v1.4h
+        uaddw           v26.4s,  v26.4s,  v2.4h
+        uaddw           v27.4s,  v27.4s,  v3.4h
+        b.gt            1b
+        trn1            v0.2d,   v2.2d,   v3.2d
+        trn1            v1.2d,   v2.2d,   v3.2d
+
+L(ipred_cfl_ac_420_w8_hpad):
+        cbz             w4,  3f
+2:      // Vertical padding (h_pad > 0)
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h}, [x0], #32
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        st1             {v0.8h, v1.8h}, [x0], #32
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        b.gt            2b
+3:
+
+        // Double the height and reuse the w4 summing/subtracting
+        lsl             w6,  w6,  #1
+        b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)
+
+L(ipred_cfl_ac_420_w16):
+        adr             x7,  L(ipred_cfl_ac_420_w16_tbl)
+        ldrh            w3,  [x7, w3, uxtw #1]
+        sub             x7,  x7,  w3, uxtw
+        br              x7
+
+L(ipred_cfl_ac_420_w16_wpad0):
+1:      // Copy and subsample input, without padding
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x1],  x2
+        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2
+        addp            v0.8h,   v0.8h,   v1.8h
+        addp            v2.8h,   v2.8h,   v3.8h
+        addp            v4.8h,   v4.8h,   v5.8h
+        addp            v6.8h,   v6.8h,   v7.8h
+        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x1],  x2
+        add             v0.8h,   v0.8h,   v4.8h
+        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x10], x2
+        add             v2.8h,   v2.8h,   v6.8h
+        addp            v16.8h,  v16.8h,  v17.8h
+        addp            v18.8h,  v18.8h,  v19.8h
+        addp            v20.8h,  v20.8h,  v21.8h
+        addp            v22.8h,  v22.8h,  v23.8h
+        add             v16.8h,  v16.8h,  v20.8h
+        add             v18.8h,  v18.8h,  v22.8h
+        shl             v0.8h,   v0.8h,   #1
+        shl             v1.8h,   v2.8h,   #1
+        shl             v2.8h,   v16.8h,  #1
+        shl             v3.8h,   v18.8h,  #1
+        subs            w8,  w8,  #2
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v2.16b
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad1):
+1:      // Copy and subsample input, padding 4
+        ldr             q2,  [x1,  #32]
+        ld1             {v0.8h, v1.8h}, [x1],  x2
+        ldr             q5,  [x10, #32]
+        ld1             {v3.8h, v4.8h}, [x10], x2
+        addp            v2.8h,   v2.8h,   v2.8h
+        addp            v0.8h,   v0.8h,   v1.8h
+        addp            v5.8h,   v5.8h,   v5.8h
+        addp            v3.8h,   v3.8h,   v4.8h
+        ldr             q18, [x1,  #32]
+        add             v2.4h,   v2.4h,   v5.4h
+        ld1             {v16.8h, v17.8h}, [x1],  x2
+        add             v0.8h,   v0.8h,   v3.8h
+        ldr             q21, [x10, #32]
+        ld1             {v19.8h, v20.8h}, [x10], x2
+        addp            v18.8h,  v18.8h,  v18.8h
+        addp            v16.8h,  v16.8h,  v17.8h
+        addp            v21.8h,  v21.8h,  v21.8h
+        addp            v19.8h,  v19.8h,  v20.8h
+        add             v18.4h,  v18.4h,  v21.4h
+        add             v16.8h,  v16.8h,  v19.8h
+        shl             v1.4h,   v2.4h,   #1
+        shl             v0.8h,   v0.8h,   #1
+        shl             v3.4h,   v18.4h,  #1
+        shl             v2.8h,   v16.8h,  #1
+        dup             v4.4h,   v1.h[3]
+        dup             v5.4h,   v3.h[3]
+        trn1            v1.2d,   v1.2d,   v4.2d
+        trn1            v3.2d,   v3.2d,   v5.2d
+        subs            w8,  w8,  #2
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v2.16b
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad2):
+1:      // Copy and subsample input, padding 8
+        ld1             {v0.8h, v1.8h}, [x1],  x2
+        ld1             {v2.8h, v3.8h}, [x10], x2
+        ld1             {v4.8h, v5.8h}, [x1],  x2
+        addp            v0.8h,   v0.8h,   v1.8h
+        ld1             {v6.8h, v7.8h}, [x10], x2
+        addp            v2.8h,   v2.8h,   v3.8h
+        addp            v4.8h,   v4.8h,   v5.8h
+        addp            v6.8h,   v6.8h,   v7.8h
+        add             v0.8h,   v0.8h,   v2.8h
+        add             v4.8h,   v4.8h,   v6.8h
+        shl             v0.8h,   v0.8h,   #1
+        shl             v2.8h,   v4.8h,   #1
+        dup             v1.8h,   v0.h[7]
+        dup             v3.8h,   v2.h[7]
+        subs            w8,  w8,  #2
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v2.16b
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad3):
+1:      // Copy and subsample input, padding 12
+        ld1             {v0.8h}, [x1],  x2
+        ld1             {v2.8h}, [x10], x2
+        ld1             {v4.8h}, [x1],  x2
+        ld1             {v6.8h}, [x10], x2
+        addp            v0.8h,   v0.8h,   v4.8h
+        addp            v2.8h,   v2.8h,   v6.8h
+        add             v0.8h,   v0.8h,   v2.8h
+        shl             v0.8h,   v0.8h,   #1
+        dup             v1.8h,   v0.h[3]
+        dup             v3.8h,   v0.h[7]
+        trn2            v2.2d,   v0.2d,   v3.2d
+        trn1            v0.2d,   v0.2d,   v1.2d
+        subs            w8,  w8,  #2
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v2.16b
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_hpad):
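+        // v0-v3 all hold the last output row here; replicate it over the
+        // h_pad rows, four rows per iteration.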
+        cbz             w4,  3f
+2:      // Vertical padding (h_pad > 0)
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        b.gt            2b
+3:
+
+        // Quadruple the height and reuse the w4 summing/subtracting
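+        // code: a w16 buffer has the same memory layout as a w4 buffer
+        // with four times as many rows.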
+        lsl             w6,  w6,  #2
+        lsl             w9,  w9,  #2
+        b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)
+
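+// The jump tables store offsets back from the table base; the dispatch
+// code subtracts the loaded .hword from the table address and branches.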
+L(ipred_cfl_ac_420_tbl):
+        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16)
+        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8)
+        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4)
+        .hword 0
+
+L(ipred_cfl_ac_420_w16_tbl):
+        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0)
+        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1)
+        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2)
+        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3)
+endfunc
+
+// void ipred_cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx,
+//                                  const ptrdiff_t stride, const int w_pad,
+//                                  const int h_pad, const int cw, const int ch);
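+// 4:2:2 only subsamples the luma horizontally, so the vertical padding and
+// final mean subtraction can reuse the 4:2:0 code paths.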
+function ipred_cfl_ac_422_16bpc_neon, export=1
+        clz             w8,  w5
+        lsl             w4,  w4,  #2
+        adr             x7,  L(ipred_cfl_ac_422_tbl)
+        sub             w8,  w8,  #27
+        ldrh            w8,  [x7, w8, uxtw #1]
+        movi            v24.4s,  #0
+        movi            v25.4s,  #0
+        movi            v26.4s,  #0
+        movi            v27.4s,  #0
+        sub             x7,  x7,  w8, uxtw
+        sub             w8,  w6,  w4         // height - h_pad
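+        // AArch64 has no ctz instruction: compute ctz(x) as clz(rbit(x)).
+        // v31 = -log2sz is used later as a negative (i.e. right) shift
+        // amount when averaging the accumulated sum.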
+        rbit            w9,  w5              // rbit(width)
+        rbit            w10, w6              // rbit(height)
+        clz             w9,  w9              // ctz(width)
+        clz             w10, w10             // ctz(height)
+        add             w9,  w9,  w10        // log2sz
+        add             x10, x1,  x2
+        dup             v31.4s,  w9
+        lsl             x2,  x2,  #1
+        neg             v31.4s,  v31.4s      // -log2sz
+        br              x7
+
+L(ipred_cfl_ac_422_w4):
+1:      // Copy and subsample input
+        ld1             {v0.8h}, [x1],  x2
+        ld1             {v1.8h}, [x10], x2
+        ld1             {v2.8h}, [x1],  x2
+        ld1             {v3.8h}, [x10], x2
+        addp            v0.8h,   v0.8h,   v1.8h
+        addp            v2.8h,   v2.8h,   v3.8h
+        shl             v0.8h,   v0.8h,   #2
+        shl             v1.8h,   v2.8h,   #2
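+        // Each output is (a + b) << 2, i.e. 8x the average of two
+        // horizontal neighbours; the same scale as the 420 code's
+        // (a + b + c + d) << 1.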
+        subs            w8,  w8,  #4
+        st1             {v0.8h, v1.8h}, [x0], #32
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        b.gt            1b
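+        // Duplicate the last output row into both v0 and v1 for the
+        // vertical padding code.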
+        trn2            v0.2d,   v1.2d,   v1.2d
+        trn2            v1.2d,   v1.2d,   v1.2d
+        b               L(ipred_cfl_ac_420_w4_hpad)
+
+L(ipred_cfl_ac_422_w8):
+        cbnz            w3,  L(ipred_cfl_ac_422_w8_wpad)
+1:      // Copy and subsample input, without padding
+        ld1             {v0.8h, v1.8h}, [x1],  x2
+        ld1             {v2.8h, v3.8h}, [x10], x2
+        ld1             {v4.8h, v5.8h}, [x1],  x2
+        addp            v0.8h,   v0.8h,   v1.8h
+        ld1             {v6.8h, v7.8h}, [x10], x2
+        addp            v2.8h,   v2.8h,   v3.8h
+        addp            v4.8h,   v4.8h,   v5.8h
+        addp            v6.8h,   v6.8h,   v7.8h
+        shl             v0.8h,   v0.8h,   #2
+        shl             v1.8h,   v2.8h,   #2
+        shl             v2.8h,   v4.8h,   #2
+        shl             v3.8h,   v6.8h,   #2
+        subs            w8,  w8,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v3.16b
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_422_w8_wpad):
+1:      // Copy and subsample input, padding 4
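+        // Four real outputs per row; trn1/trn2 assemble each row as the
+        // four outputs followed by four copies of the last one.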
+        ld1             {v0.8h}, [x1],  x2
+        ld1             {v1.8h}, [x10], x2
+        ld1             {v2.8h}, [x1],  x2
+        ld1             {v3.8h}, [x10], x2
+        addp            v0.8h,   v0.8h,   v1.8h
+        addp            v2.8h,   v2.8h,   v3.8h
+        shl             v0.8h,   v0.8h,   #2
+        shl             v2.8h,   v2.8h,   #2
+        dup             v4.4h,   v0.h[3]
+        dup             v5.8h,   v0.h[7]
+        dup             v6.4h,   v2.h[3]
+        dup             v7.8h,   v2.h[7]
+        trn2            v1.2d,   v0.2d,   v5.2d
+        trn1            v0.2d,   v0.2d,   v4.2d
+        trn2            v3.2d,   v2.2d,   v7.2d
+        trn1            v2.2d,   v2.2d,   v6.2d
+        subs            w8,  w8,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v3.16b
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_422_w16):
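+        // Dispatch on w_pad (0-3, in units of four output pixels).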
+        adr             x7,  L(ipred_cfl_ac_422_w16_tbl)
+        ldrh            w3,  [x7, w3, uxtw #1]
+        sub             x7,  x7,  w3, uxtw
+        br              x7
+
+L(ipred_cfl_ac_422_w16_wpad0):
+1:      // Copy and subsample input, without padding
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x1],  x2
+        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2
+        addp            v0.8h,   v0.8h,   v1.8h
+        addp            v2.8h,   v2.8h,   v3.8h
+        addp            v4.8h,   v4.8h,   v5.8h
+        addp            v6.8h,   v6.8h,   v7.8h
+        shl             v0.8h,   v0.8h,   #2
+        shl             v1.8h,   v2.8h,   #2
+        shl             v2.8h,   v4.8h,   #2
+        shl             v3.8h,   v6.8h,   #2
+        subs            w8,  w8,  #2
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v2.16b
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad1):
+1:      // Copy and subsample input, padding 4
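+        // Load the third 8-pixel chunk with a plain ldr, so the row
+        // pointer is only advanced by the stride once.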
+        ldr             q2,  [x1,  #32]
+        ld1             {v0.8h, v1.8h}, [x1],  x2
+        ldr             q6,  [x10, #32]
+        ld1             {v4.8h, v5.8h}, [x10], x2
+        addp            v2.8h,   v2.8h,   v2.8h
+        addp            v0.8h,   v0.8h,   v1.8h
+        addp            v6.8h,   v6.8h,   v6.8h
+        addp            v4.8h,   v4.8h,   v5.8h
+        shl             v1.4h,   v2.4h,   #2
+        shl             v0.8h,   v0.8h,   #2
+        shl             v3.4h,   v6.4h,   #2
+        shl             v2.8h,   v4.8h,   #2
+        dup             v4.4h,   v1.h[3]
+        dup             v5.4h,   v3.h[3]
+        trn1            v1.2d,   v1.2d,   v4.2d
+        trn1            v3.2d,   v3.2d,   v5.2d
+        subs            w8,  w8,  #2
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v2.16b
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad2):
+1:      // Copy and subsample input, padding 8
+        ld1             {v0.8h, v1.8h}, [x1],  x2
+        ld1             {v2.8h, v3.8h}, [x10], x2
+        addp            v0.8h,   v0.8h,   v1.8h
+        addp            v2.8h,   v2.8h,   v3.8h
+        shl             v0.8h,   v0.8h,   #2
+        shl             v2.8h,   v2.8h,   #2
+        dup             v1.8h,   v0.h[7]
+        dup             v3.8h,   v2.h[7]
+        subs            w8,  w8,  #2
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v2.16b
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad3):
+1:      // Copy and subsample input, padding 12
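+        // Only four real outputs per row; replicate the last one across
+        // the remaining twelve.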
+        ld1             {v0.8h}, [x1],  x2
+        ld1             {v2.8h}, [x10], x2
+        addp            v0.8h,   v0.8h,   v0.8h
+        addp            v2.8h,   v2.8h,   v2.8h
+        shl             v0.4h,   v0.4h,   #2
+        shl             v2.4h,   v2.4h,   #2
+        dup             v1.8h,   v0.h[3]
+        dup             v3.8h,   v2.h[3]
+        trn1            v0.2d,   v0.2d,   v1.2d
+        trn1            v2.2d,   v2.2d,   v3.2d
+        subs            w8,  w8,  #2
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v2.16b
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_tbl):
+        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16)
+        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8)
+        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4)
+        .hword 0
+
+L(ipred_cfl_ac_422_w16_tbl):
+        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0)
+        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1)
+        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2)
+        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3)
+endfunc
--- a/src/arm/ipred_init_tmpl.c
+++ b/src/arm/ipred_init_tmpl.c
@@ -54,7 +54,7 @@
 
     if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
 
-#if BITDEPTH == 8
+#if BITDEPTH == 8 || ARCH_AARCH64
     c->intra_pred[DC_PRED]       = BF(dav1d_ipred_dc, neon);
     c->intra_pred[DC_128_PRED]   = BF(dav1d_ipred_dc_128, neon);
     c->intra_pred[TOP_DC_PRED]   = BF(dav1d_ipred_dc_top, neon);
--- a/src/meson.build
+++ b/src/meson.build
@@ -120,6 +120,7 @@
             if dav1d_bitdepths.contains('16')
                 libdav1d_sources += files(
                     'arm/64/cdef16.S',
+                    'arm/64/ipred16.S',
                     'arm/64/loopfilter16.S',
                     'arm/64/looprestoration16.S',
                     'arm/64/mc16.S',