ref: a0678eac0ee1a67ff871247b551e42fa448591b7
dir: /src/arm/64/itx16.S/
/****************************************************************************** * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2020, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/

#include "src/arm/asm.S"
#include "util.S"

// The exported functions in this file have got the following signature:
// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob,
//                int bitdepth_max);

// Most of the functions use the following register layout:
// x0-x3   external parameters
// x4      function pointer to first transform
// x5      function pointer to second transform
// x6      output parameter for helper function
// x7      input parameter for helper function
// x8      input stride for helper function
// x9-x12  scratch variables for helper functions
// x13     pointer to list of eob thresholds
// x14     return pointer for helper function
// x15     return pointer for main function

// The SIMD registers most often use the following layout:
// v0-v1   multiplication coefficients
// v2-v7   scratch registers
// v8-v15  unused
// v16-v31 inputs/outputs of transforms

// Coefficient table shared by the 4/8/16/32 point inverse DCTs. The second
// word of the first row is 2896 pre-scaled for use with sqrdmulh.
const idct_coeffs, align=4
        // idct4
        .int            2896, 2896*8*(1<<16), 1567, 3784
        // idct8
        .int            799, 4017, 3406, 2276
        // idct16
        .int            401, 4076, 3166, 2598
        .int            1931, 3612, 3920, 1189
        // idct32
        .int            201, 4091, 3035, 2751
        .int            1751, 3703, 3857, 1380
        .int            995, 3973, 3513, 2106
        .int            2440, 3290, 4052, 601
endconst

// Coefficient table for the 64 point inverse DCT; entries written as
// N*8*(1<<16) are pre-scaled, the remaining rows are plain values.
const idct64_coeffs, align=4
        .int            101*8*(1<<16), 4095*8*(1<<16), 2967*8*(1<<16), -2824*8*(1<<16)
        .int            1660*8*(1<<16), 3745*8*(1<<16), 3822*8*(1<<16), -1474*8*(1<<16)
        .int            4076, 401, 4017, 799

        .int            4036*8*(1<<16), -700*8*(1<<16), 2359*8*(1<<16), 3349*8*(1<<16)
        .int            3461*8*(1<<16), -2191*8*(1<<16), 897*8*(1<<16), 3996*8*(1<<16)
        .int            -3166, -2598, -799, -4017

        .int            501*8*(1<<16), 4065*8*(1<<16), 3229*8*(1<<16), -2520*8*(1<<16)
        .int            2019*8*(1<<16), 3564*8*(1<<16), 3948*8*(1<<16), -1092*8*(1<<16)
        .int            3612, 1931, 2276, 3406

        .int            4085*8*(1<<16), -301*8*(1<<16), 2675*8*(1<<16), 3102*8*(1<<16)
        .int            3659*8*(1<<16), -1842*8*(1<<16), 1285*8*(1<<16), 3889*8*(1<<16)
        .int            -3920, -1189, -3406, -2276
endconst

// Coefficients loaded by iadst_4x4.
const iadst4_coeffs, align=4
        .int            1321, 3803, 2482, 3344
endconst

// Coefficients loaded by iadst_8; the last row duplicates the idct4 row
// (without the pre-scaled entry) so one pointer walk covers both stages.
const iadst8_coeffs, align=4
        .int            4076, 401, 3612, 1931
        .int            2598, 3166, 1189, 3920
        // idct_coeffs
        .int            2896, 0, 1567, 3784
endconst

// Coefficients loaded by the first stage of iadst_16.
const iadst16_coeffs, align=4
        .int            4091, 201, 3973, 995
        .int            3703, 1751, 3290, 2440
        .int            2751, 3035, 2106, 3513
        .int            1380, 3857, 601, 4052
endconst

// \d = \s0 * \c0 + \s1 * \c1, per 32 bit lane.
.macro mul_mla d, s0, s1, c0, c1
        mul             \d\().4s, \s0\().4s, \c0
        mla             \d\().4s, \s1\().4s, \c1
.endm

// \d = \s0 * \c0 - \s1 * \c1, per 32 bit lane.
.macro mul_mls d, s0, s1, c0, c1
        mul             \d\().4s, \s0\().4s, \c0
        mls             \d\().4s, \s1\().4s, \c1
.endm

// Multiply \r0-\r3 (and \r4-\r7 if given) in place by \c using sqrdmulh.
// \sz is the arrangement suffix appended to each register name.
.macro scale_input sz, c, r0, r1, r2, r3, r4, r5, r6, r7
        sqrdmulh        \r0\sz, \r0\sz, \c
        sqrdmulh        \r1\sz, \r1\sz, \c
        sqrdmulh        \r2\sz, \r2\sz, \c
        sqrdmulh        \r3\sz, \r3\sz, \c
.ifnb \r4
        sqrdmulh        \r4\sz, \r4\sz, \c
        sqrdmulh        \r5\sz, \r5\sz, \c
        sqrdmulh        \r6\sz, \r6\sz, \c
        sqrdmulh        \r7\sz, \r7\sz, \c
.endif
.endm

// One slot of a software pipelined "load dst row, round, add, clamp, store"
// sequence. Each stage is emitted only if its argument is non-blank, so
// callers can interleave stages belonging to different rows.
// v6 holds the lower clamp bound and v7 the upper one; x1 is the row stride.
.macro load_add_store load, shift, addsrc, adddst, max, min, store, dst, src, shiftbits=4
.ifnb \load
        ld1             {\load}, [\src], x1
.endif
.ifnb \shift
        srshr           \shift, \shift, #\shiftbits
.endif
.ifnb \addsrc
        sqadd           \adddst, \adddst, \addsrc
.endif
.ifnb \max
        smax            \max, \max, v6.8h
.endif
.ifnb \min
        smin            \min, \min, v7.8h
.endif
.ifnb \store
        st1             {\store}, [\dst], x1
.endif
.endm

// Add the 16 rows of 8 coefficients in v16-v31 (rounded >> 4) to the pixels
// at \dst, clamping to [0, 0x3ff]. \src is a scratch register used as the
// read pointer. Clobbers v2-v7.
.macro load_add_store_8x16 dst, src
        mov             \src, \dst
        movi            v6.8h, #0
        mvni            v7.8h, #0xfc, lsl #8 // 0x3ff
        load_add_store  v2.8h, v16.8h, , , , , , \dst, \src
        load_add_store  v3.8h, v17.8h, , , , , , \dst, \src
        load_add_store  v4.8h, v18.8h, v2.8h, v16.8h, , , , \dst, \src
        load_add_store  v5.8h, v19.8h, v3.8h, v17.8h, v16.8h, , , \dst, \src
        load_add_store  v2.8h, v20.8h, v4.8h, v18.8h, v17.8h, v16.8h, , \dst, \src
        load_add_store  v3.8h, v21.8h, v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src
        load_add_store  v4.8h, v22.8h, v2.8h, v20.8h, v19.8h, v18.8h, v17.8h, \dst, \src
        load_add_store  v5.8h, v23.8h, v3.8h, v21.8h, v20.8h, v19.8h, v18.8h, \dst, \src
        load_add_store  v2.8h, v24.8h, v4.8h, v22.8h, v21.8h, v20.8h, v19.8h, \dst, \src
        load_add_store  v3.8h, v25.8h, v5.8h, v23.8h, v22.8h, v21.8h, v20.8h, \dst, \src
        load_add_store  v4.8h, v26.8h, v2.8h, v24.8h, v23.8h, v22.8h, v21.8h, \dst, \src
        load_add_store  v5.8h, v27.8h, v3.8h, v25.8h, v24.8h, v23.8h, v22.8h, \dst, \src
        load_add_store  v2.8h, v28.8h, v4.8h, v26.8h, v25.8h, v24.8h, v23.8h, \dst, \src
        load_add_store  v3.8h, v29.8h, v5.8h, v27.8h, v26.8h, v25.8h, v24.8h, \dst, \src
        load_add_store  v4.8h, v30.8h, v2.8h, v28.8h, v27.8h, v26.8h, v25.8h, \dst, \src
        load_add_store  v5.8h, v31.8h, v3.8h, v29.8h, v28.8h, v27.8h, v26.8h, \dst, \src
        load_add_store  , , v4.8h, v30.8h, v29.8h, v28.8h, v27.8h, \dst, \src
        load_add_store  , , v5.8h, v31.8h, v30.8h, v29.8h, v28.8h, \dst, \src
        load_add_store  , , , , v31.8h, v30.8h, v29.8h, \dst, \src
        load_add_store  , , , , , v31.8h, v30.8h, \dst, \src
        load_add_store  , , , , , , v31.8h, \dst, \src
.endm

// Same as load_add_store_8x16 but for the 8 rows in v16-v23, with a
// configurable rounding shift.
.macro load_add_store_8x8 dst, src, shiftbits=4
        mov             \src, \dst
        movi            v6.8h, #0
        mvni            v7.8h, #0xfc, lsl #8 // 0x3ff
        load_add_store  v2.8h, v16.8h, , , , , , \dst, \src, \shiftbits
        load_add_store  v3.8h, v17.8h, , , , , , \dst, \src, \shiftbits
        load_add_store  v4.8h, v18.8h, v2.8h, v16.8h, , , , \dst, \src, \shiftbits
        load_add_store  v5.8h, v19.8h, v3.8h, v17.8h, v16.8h, , , \dst, \src, \shiftbits
        load_add_store  v2.8h, v20.8h, v4.8h, v18.8h, v17.8h, v16.8h, , \dst, \src, \shiftbits
        load_add_store  v3.8h, v21.8h, v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits
        load_add_store  v4.8h, v22.8h, v2.8h, v20.8h, v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits
        load_add_store  v5.8h, v23.8h, v3.8h, v21.8h, v20.8h, v19.8h, v18.8h, \dst, \src, \shiftbits
        load_add_store  , , v4.8h, v22.8h, v21.8h, v20.8h, v19.8h, \dst, \src, \shiftbits
        load_add_store  , , v5.8h, v23.8h, v22.8h, v21.8h, v20.8h, \dst, \src, \shiftbits
        load_add_store  , , , , v23.8h, v22.8h, v21.8h, \dst, \src, \shiftbits
        load_add_store  , , , , , v23.8h, v22.8h, \dst, \src, \shiftbits
        load_add_store  , , , , , , v23.8h, \dst, \src, \shiftbits
.endm

// Same as load_add_store_8x8 but for the 4 rows in v16-v19.
.macro load_add_store_8x4 dst, src, shiftbits=4
        mov             \src, \dst
        movi            v6.8h, #0
        mvni            v7.8h, #0xfc, lsl #8 // 0x3ff
        load_add_store  v2.8h, v16.8h, , , , , , \dst, \src, \shiftbits
        load_add_store  v3.8h, v17.8h, , , , , , \dst, \src, \shiftbits
        load_add_store  v4.8h, v18.8h, v2.8h, v16.8h, , , , \dst, \src, \shiftbits
        load_add_store  v5.8h, v19.8h, v3.8h, v17.8h, v16.8h, , , \dst, \src, \shiftbits
        load_add_store  , , v4.8h, v18.8h, v17.8h, v16.8h, , \dst, \src, \shiftbits
        load_add_store  , , v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits
        load_add_store  , , , , v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits
        load_add_store  , , , , , v19.8h, v18.8h, \dst, \src, \shiftbits
        load_add_store  , , , , , , v19.8h, \dst, \src, \shiftbits
.endm

// Pipeline slot for 4 pixel wide blocks: handles two 64 bit rows per
// invocation. \inssrc/\insdst merge two 4h rows into one 8h register before
// the rounding shift (fixed at 4). Blank arguments disable their stage.
.macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, max, min, store, dst, src
.ifnb \load
        ld1             {\load}[0], [\src], x1
.endif
.ifnb \inssrc
        ins             \insdst\().d[1], \inssrc\().d[0]
.endif
.ifnb \shift
        srshr           \shift, \shift, #4
.endif
.ifnb \load
        ld1             {\load}[1], [\src], x1
.endif
.ifnb \addsrc
        sqadd           \adddst, \adddst, \addsrc
.endif
.ifnb \store
        st1             {\store}[0], [\dst], x1
.endif
.ifnb \max
        smax            \max, \max, v6.8h
.endif
.ifnb \min
        smin            \min, \min, v7.8h
.endif
.ifnb \store
        st1             {\store}[1], [\dst], x1
.endif
.endm

// Add the 16 rows of 4 coefficients held in the low halves of v16-v31 to the
// pixels at \dst, clamping to [0, 0x3ff]. Clobbers v0-v3, v6, v7 and the
// upper halves of the even numbered input registers.
.macro load_add_store_4x16 dst, src
        mov             \src, \dst
        movi            v6.8h, #0
        mvni            v7.8h, #0xfc, lsl #8 // 0x3ff
        load_add_store4 v0.d, v17, v16, , , , , , , \dst, \src
        load_add_store4 v1.d, v19, v18, , , , , , , \dst, \src
        load_add_store4 v2.d, v21, v20, v16.8h, , , , , , \dst, \src
        load_add_store4 v3.d, v23, v22, v18.8h, v0.8h, v16.8h, , , , \dst, \src
        load_add_store4 v0.d, v25, v24, v20.8h, v1.8h, v18.8h, v16.8h, , , \dst, \src
        load_add_store4 v1.d, v27, v26, v22.8h, v2.8h, v20.8h, v18.8h, v16.8h, , \dst, \src
        load_add_store4 v2.d, v29, v28, v24.8h, v3.8h, v22.8h, v20.8h, v18.8h, v16.d, \dst, \src
        load_add_store4 v3.d, v31, v30, v26.8h, v0.8h, v24.8h, v22.8h, v20.8h, v18.d, \dst, \src
        load_add_store4 , , , v28.8h, v1.8h, v26.8h, v24.8h, v22.8h, v20.d, \dst, \src
        load_add_store4 , , , v30.8h, v2.8h, v28.8h, v26.8h, v24.8h, v22.d, \dst, \src
        load_add_store4 , , , , v3.8h, v30.8h, v28.8h, v26.8h, v24.d, \dst, \src
        load_add_store4 , , , , , , v30.8h, v28.8h, v26.d, \dst, \src
        load_add_store4 , , , , , , , v30.8h, v28.d, \dst, \src
        load_add_store4 , , , , , , , , v30.d, \dst, \src
.endm

// Same as load_add_store_4x16 but for the 8 rows in v16-v23.
.macro load_add_store_4x8 dst, src
        mov             \src, \dst
        movi            v6.8h, #0
        mvni            v7.8h, #0xfc, lsl #8 // 0x3ff
        load_add_store4 v0.d, v17, v16, , , , , , , \dst, \src
        load_add_store4 v1.d, v19, v18, , , , , , , \dst, \src
        load_add_store4 v2.d, v21, v20, v16.8h, , , , , , \dst, \src
        load_add_store4 v3.d, v23, v22, v18.8h, v0.8h, v16.8h, , , , \dst, \src
        load_add_store4 , , , v20.8h, v1.8h, v18.8h, v16.8h, , , \dst, \src
        load_add_store4 , , , v22.8h, v2.8h, v20.8h, v18.8h, v16.8h, , \dst, \src
        load_add_store4 , , , , v3.8h, v22.8h, v20.8h, v18.8h, v16.d, \dst, \src
        load_add_store4 , , , , , , v22.8h, v20.8h, v18.d, \dst, \src
        load_add_store4 , , , , , , , v22.8h, v20.d, \dst, \src
        load_add_store4 , , , , , , , , v22.d, \dst, \src
.endm

// DC-only shortcut for dct_dct. If w3 (eob) is zero, the single coefficient
// at [x2] is scaled, cleared in memory, broadcast into v16.8h and control
// tail-branches to idct_dc_w\w\()_neon with w4 = \h. Otherwise execution
// continues after the macro (label 1). Rectangular 2:1 / 1:2 sizes get one
// extra 2896 scaling step. Clobbers w16, v0, v20.
.macro idct_dc w, h, shift
        cbnz            w3, 1f
        movz            w16, #2896*8, lsl #16
        ld1r            {v16.4s}, [x2]
        dup             v0.2s, w16
        sqrdmulh        v20.4s, v16.4s, v0.s[0]
        str             wzr, [x2]
.if (\w == 2*\h) || (2*\w == \h)
        sqrdmulh        v20.4s, v20.4s, v0.s[0]
.endif
.if \shift > 0
        sqrshrn         v16.4h, v20.4s, #\shift
        sqrshrn2        v16.8h, v20.4s, #\shift
.else
        sqxtn           v16.4h, v20.4s
        sqxtn2          v16.8h, v20.4s
.endif
        sqrdmulh        v16.8h, v16.8h, v0.h[1]
        srshr           v16.8h, v16.8h, #4
        mov             w4, #\h
        b               idct_dc_w\w\()_neon
1:
.endm

// Add the DC value in v16.8h to a 4 pixel wide block and clamp to
// [0, 0x3ff]. x0 = dst, x1 = stride, w4 = rows (processed 4 per iteration).
function idct_dc_w4_neon
        movi            v30.8h, #0
        mvni            v31.8h, #0xfc, lsl #8 // 0x3ff
1:
        ld1             {v0.d}[0], [x0], x1
        ld1             {v0.d}[1], [x0], x1
        ld1             {v1.d}[0], [x0], x1
        subs            w4, w4, #4
        ld1             {v1.d}[1], [x0], x1
        sqadd           v0.8h, v0.8h, v16.8h
        sub             x0, x0, x1, lsl #2      // rewind to the first loaded row
        sqadd           v1.8h, v1.8h, v16.8h
        smax            v0.8h, v0.8h, v30.8h
        smax            v1.8h, v1.8h, v30.8h
        smin            v0.8h, v0.8h, v31.8h
        st1             {v0.d}[0], [x0], x1
        smin            v1.8h, v1.8h, v31.8h
        st1             {v0.d}[1], [x0], x1
        st1             {v1.d}[0], [x0], x1
        st1             {v1.d}[1], [x0], x1
        b.gt            1b
        ret
endfunc

// As idct_dc_w4_neon, for 8 pixel wide blocks (4 rows per iteration).
function idct_dc_w8_neon
        movi            v30.8h, #0
        mvni            v31.8h, #0xfc, lsl #8 // 0x3ff
1:
        ld1             {v0.8h}, [x0], x1
        subs            w4, w4, #4
        ld1             {v1.8h}, [x0], x1
        sqadd           v0.8h, v0.8h, v16.8h
        ld1             {v2.8h}, [x0], x1
        sqadd           v1.8h, v1.8h, v16.8h
        ld1             {v3.8h}, [x0], x1
        sqadd           v2.8h, v2.8h, v16.8h
        sqadd           v3.8h, v3.8h, v16.8h
        sub             x0, x0, x1, lsl #2      // rewind to the first loaded row
        smax            v0.8h, v0.8h, v30.8h
        smax            v1.8h, v1.8h, v30.8h
        smax            v2.8h, v2.8h, v30.8h
        smax            v3.8h, v3.8h, v30.8h
        smin            v0.8h, v0.8h, v31.8h
        smin            v1.8h, v1.8h, v31.8h
        st1             {v0.8h}, [x0], x1
        smin            v2.8h, v2.8h, v31.8h
        st1             {v1.8h}, [x0], x1
        smin            v3.8h, v3.8h, v31.8h
        st1             {v2.8h}, [x0], x1
        st1             {v3.8h}, [x0], x1
        b.gt            1b
        ret
endfunc

// As idct_dc_w4_neon, for 16 pixel wide blocks (2 rows per iteration).
function idct_dc_w16_neon
        movi            v30.8h, #0
        mvni            v31.8h, #0xfc, lsl #8 // 0x3ff
1:
        ld1             {v0.8h, v1.8h}, [x0], x1
        subs            w4, w4, #2
        ld1             {v2.8h, v3.8h}, [x0], x1
        sqadd           v0.8h, v0.8h, v16.8h
        sqadd           v1.8h, v1.8h, v16.8h
        sub             x0, x0, x1, lsl #1      // rewind to the first loaded row
        sqadd           v2.8h, v2.8h, v16.8h
        sqadd           v3.8h, v3.8h, v16.8h
        smax            v0.8h, v0.8h, v30.8h
        smax            v1.8h, v1.8h, v30.8h
        smax            v2.8h, v2.8h, v30.8h
        smax            v3.8h, v3.8h, v30.8h
        smin            v0.8h, v0.8h, v31.8h
        smin            v1.8h, v1.8h, v31.8h
        smin            v2.8h, v2.8h, v31.8h
        st1             {v0.8h, v1.8h}, [x0], x1
        smin            v3.8h, v3.8h, v31.8h
        st1             {v2.8h, v3.8h}, [x0], x1
        b.gt            1b
        ret
endfunc

// As idct_dc_w4_neon, for 32 pixel wide blocks (1 row per iteration).
function idct_dc_w32_neon
        movi            v30.8h, #0
        mvni            v31.8h, #0xfc, lsl #8 // 0x3ff
1:
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
        subs            w4, w4, #1
        sqadd           v0.8h, v0.8h, v16.8h
        sqadd           v1.8h, v1.8h, v16.8h
        sqadd           v2.8h, v2.8h, v16.8h
        sqadd           v3.8h, v3.8h, v16.8h
        smax            v0.8h, v0.8h, v30.8h
        smax            v1.8h, v1.8h, v30.8h
        smax            v2.8h, v2.8h, v30.8h
        smax            v3.8h, v3.8h, v30.8h
        smin            v0.8h, v0.8h, v31.8h
        smin            v1.8h, v1.8h, v31.8h
        smin            v2.8h, v2.8h, v31.8h
        smin            v3.8h, v3.8h, v31.8h
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        b.gt            1b
        ret
endfunc

// As idct_dc_w4_neon, for 64 pixel wide blocks (1 row per iteration).
// x1 is reduced by 64 up front because the first store of each row already
// advances x0 by 64 bytes.
function idct_dc_w64_neon
        movi            v30.8h, #0
        mvni            v31.8h, #0xfc, lsl #8 // 0x3ff
        sub             x1, x1, #64
1:
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        subs            w4, w4, #1
        sqadd           v0.8h, v0.8h, v16.8h
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0]
        sqadd           v1.8h, v1.8h, v16.8h
        sub             x0, x0, #64
        sqadd           v2.8h, v2.8h, v16.8h
        sqadd           v3.8h, v3.8h, v16.8h
        sqadd           v4.8h, v4.8h, v16.8h
        sqadd           v5.8h, v5.8h, v16.8h
        sqadd           v6.8h, v6.8h, v16.8h
        sqadd           v7.8h, v7.8h, v16.8h
        smax            v0.8h, v0.8h, v30.8h
        smax            v1.8h, v1.8h, v30.8h
        smax            v2.8h, v2.8h, v30.8h
        smax            v3.8h, v3.8h, v30.8h
        smax            v4.8h, v4.8h, v30.8h
        smax            v5.8h, v5.8h, v30.8h
        smax            v6.8h, v6.8h, v30.8h
        smax            v7.8h, v7.8h, v30.8h
        smin            v0.8h, v0.8h, v31.8h
        smin            v1.8h, v1.8h, v31.8h
        smin            v2.8h, v2.8h, v31.8h
        smin            v3.8h, v3.8h, v31.8h
        smin            v4.8h, v4.8h, v31.8h
        smin            v5.8h, v5.8h, v31.8h
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        smin            v6.8h, v6.8h, v31.8h
        smin            v7.8h, v7.8h, v31.8h
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
        b.gt            1b
        ret
endfunc

// One 4 point inverse Walsh-Hadamard pass on v16-v19 (in place).
// Clobbers v20, v21.
.macro iwht4
        add             v16.4s, v16.4s, v17.4s
        sub             v21.4s, v18.4s, v19.4s
        sub             v20.4s, v16.4s, v21.4s
        sshr            v20.4s, v20.4s, #1
        sub             v18.4s, v20.4s, v17.4s
        sub             v17.4s, v20.4s, v19.4s
        add             v19.4s, v21.4s, v18.4s
        sub             v16.4s, v16.4s, v17.4s
.endm

// 4 point inverse DCT on \r0-\r3 (in place, 32 bit lanes).
// Expects the idct4 coefficients in v0. Clobbers v2-v4, v6, v7.
.macro idct_4 r0, r1, r2, r3
        mul_mla         v6, \r1, \r3, v0.s[3], v0.s[2]
        mul_mls         v4, \r1, \r3, v0.s[2], v0.s[3]
        mul_mla         v2, \r0, \r2, v0.s[0], v0.s[0]
        mul_mls         v3, \r0, \r2, v0.s[0], v0.s[0]
        srshr           v6.4s, v6.4s, #12
        srshr           v7.4s, v4.4s, #12
        srshr           v2.4s, v2.4s, #12
        srshr           v3.4s, v3.4s, #12
        sqadd           \r0\().4s, v2.4s, v6.4s
        sqsub           \r3\().4s, v2.4s, v6.4s
        sqadd           \r1\().4s, v3.4s, v7.4s
        sqsub           \r2\().4s, v3.4s, v7.4s
.endm

// 4 point inverse DCT of v16-v19. Clobbers x16, v0, v2-v4, v6, v7.
function inv_dct_4s_x4_neon
        movrel          x16, idct_coeffs
        ld1             {v0.4s}, [x16]
        idct_4          v16, v17, v18, v19
        ret
endfunc

// 4 point inverse ADST: inputs v16-v19, outputs \o0-\o3.
// Clobbers x16, v0, v3-v5, v7.
.macro iadst_4x4 o0, o1, o2, o3
        movrel          x16, iadst4_coeffs
        ld1             {v0.4s}, [x16]

        sub             v3.4s, v16.4s, v18.4s
        mul             v4.4s, v16.4s, v0.s[0]
        mla             v4.4s, v18.4s, v0.s[1]
        mla             v4.4s, v19.4s, v0.s[2]
        mul             v7.4s, v17.4s, v0.s[3]
        add             v3.4s, v3.4s, v19.4s
        mul             v5.4s, v16.4s, v0.s[2]
        mls             v5.4s, v18.4s, v0.s[0]
        mls             v5.4s, v19.4s, v0.s[1]

        add             \o3\().4s, v4.4s, v5.4s
        mul             \o2\().4s, v3.4s, v0.s[3]
        add             \o0\().4s, v4.4s, v7.4s
        add             \o1\().4s, v5.4s, v7.4s
        sub             \o3\().4s, \o3\().4s, v7.4s

        srshr           \o0\().4s, \o0\().4s, #12
        srshr           \o2\().4s, \o2\().4s, #12
        srshr           \o1\().4s, \o1\().4s, #12
        srshr           \o3\().4s, \o3\().4s, #12
.endm

// 4 point inverse ADST of v16-v19, in place.
function inv_adst_4s_x4_neon
        iadst_4x4       v16, v17, v18, v19
        ret
endfunc

// 4 point inverse ADST of v16-v19 with the outputs in reversed order.
function inv_flipadst_4s_x4_neon
        iadst_4x4       v19, v18, v17, v16
        ret
endfunc

// 4 point identity transform of v16-v19: x += sqrdmulh(x, (5793-4096)<<19).
// Clobbers w16, v0, v4-v7.
function inv_identity_4s_x4_neon
        movz            w16, #(5793-4096)*8, lsl #16
        dup             v0.2s, w16
        sqrdmulh        v4.4s, v16.4s, v0.s[0]
        sqrdmulh        v5.4s, v17.4s, v0.s[0]
        sqrdmulh        v6.4s, v18.4s, v0.s[0]
        sqrdmulh        v7.4s, v19.4s, v0.s[0]
        sqadd           v16.4s, v16.4s, v4.4s
        sqadd           v17.4s, v17.4s, v5.4s
        sqadd           v18.4s, v18.4s, v6.4s
        sqadd           v19.4s, v19.4s, v7.4s
        ret
endfunc

// Exported 4x4 WHT+WHT. Loads and clears the 16 coefficients at x2, runs
// iwht4 twice with a transpose in between, then joins the shared add/clamp/
// store tail in inv_txfm_add_4x4_neon (no rounding shift on this path).
function inv_txfm_add_wht_wht_4x4_16bpc_neon, export=1
        mov             x15, x30
        movi            v30.4s, #0
        movi            v31.4s, #0
        ld1             {v16.4s,v17.4s,v18.4s,v19.4s}, [x2]
        st1             {v30.4s, v31.4s}, [x2], #32

        sshr            v16.4s, v16.4s, #2
        sshr            v17.4s, v17.4s, #2
        sshr            v18.4s, v18.4s, #2
        sshr            v19.4s, v19.4s, #2

        iwht4

        st1             {v30.4s, v31.4s}, [x2], #32
        transpose_4x4s  v16, v17, v18, v19, v20, v21, v22, v23

        iwht4

        ld1             {v0.d}[0], [x0], x1
        sqxtn           v16.4h, v16.4s
        ld1             {v0.d}[1], [x0], x1
        sqxtn2          v16.8h, v17.4s
        ld1             {v1.d}[0], [x0], x1
        sqxtn           v18.4h, v18.4s
        ld1             {v1.d}[1], [x0], x1
        sqxtn2          v18.8h, v19.4s

        b               L(itx_4x4_end)
endfunc

// Shared 4x4 body. x4 = first pass (32 bit lanes), x5 = second pass (16 bit
// lanes), x15 = caller's return address. L(itx_4x4_end) is also entered
// directly by the WHT and DC-only paths with v16/v18 = residual rows,
// v0/v1 = destination rows and v30 = 0.
function inv_txfm_add_4x4_neon
        movi            v30.4s, #0
        movi            v31.4s, #0
        ld1             {v16.4s,v17.4s,v18.4s,v19.4s}, [x2]
        st1             {v30.4s, v31.4s}, [x2], #32

        blr             x4

        st1             {v30.4s, v31.4s}, [x2], #32
        sqxtn           v16.4h, v16.4s
        sqxtn           v17.4h, v17.4s
        sqxtn           v18.4h, v18.4s
        sqxtn           v19.4h, v19.4s
        transpose_4x4h  v16, v17, v18, v19, v20, v21, v22, v23

        blr             x5

        ld1             {v0.d}[0], [x0], x1
        ld1             {v0.d}[1], [x0], x1
        ins             v16.d[1], v17.d[0]
        ins             v18.d[1], v19.d[0]
        ld1             {v1.d}[0], [x0], x1
        ld1             {v1.d}[1], [x0], x1
        srshr           v16.8h, v16.8h, #4
        srshr           v18.8h, v18.8h, #4

L(itx_4x4_end):
        mvni            v31.8h, #0xfc, lsl #8 // 0x3ff
        sub             x0, x0, x1, lsl #2      // rewind dst by the 4 rows loaded
        sqadd           v16.8h, v16.8h, v0.8h
        sqadd           v18.8h, v18.8h, v1.8h
        smax            v16.8h, v16.8h, v30.8h
        smax            v18.8h, v18.8h, v30.8h
        smin            v16.8h, v16.8h, v31.8h
        st1             {v16.d}[0], [x0], x1
        smin            v18.8h, v18.8h, v31.8h
        st1             {v16.d}[1], [x0], x1
        st1             {v18.d}[0], [x0], x1
        st1             {v18.d}[1], [x0], x1

        // Return through the link register saved in x15; ret (rather than
        // br) encodes this as a function return.
        ret             x15
endfunc

// Emit the exported 4x4 entry point for a transform pair. dct_dct gets an
// inlined DC-only fast path taken when w3 (eob) is zero.
.macro def_fn_4x4 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_16bpc_neon, export=1
        mov             x15, x30

.ifc \txfm1\()_\txfm2, dct_dct
        cbnz            w3, 1f
        movz            w16, #2896*8, lsl #16
        ld1r            {v16.4s}, [x2]
        dup             v4.2s, w16
        str             wzr, [x2]
        sqrdmulh        v16.4s, v16.4s, v4.s[0]
        ld1             {v0.d}[0], [x0], x1
        sqxtn           v20.4h, v16.4s
        sqxtn2          v20.8h, v16.4s
        ld1             {v0.d}[1], [x0], x1
        sqrdmulh        v20.8h, v20.8h, v4.h[1]
        ld1             {v1.d}[0], [x0], x1
        srshr           v16.8h, v20.8h, #4
        ld1             {v1.d}[1], [x0], x1
        srshr           v18.8h, v20.8h, #4
        movi            v30.8h, #0
        b               L(itx_4x4_end)
1:
.endif
        adr             x4, inv_\txfm1\()_4s_x4_neon
        movrel          x5, X(inv_\txfm2\()_4h_x4_neon)
        b               inv_txfm_add_4x4_neon
endfunc
.endm

def_fn_4x4 dct, dct
def_fn_4x4 identity, identity
def_fn_4x4 dct, adst
def_fn_4x4 dct, flipadst
def_fn_4x4 dct, identity
def_fn_4x4 adst, dct
def_fn_4x4 adst, adst
def_fn_4x4 adst, flipadst
def_fn_4x4 flipadst, dct
def_fn_4x4 flipadst, adst
def_fn_4x4 flipadst, flipadst
def_fn_4x4 identity, dct
def_fn_4x4 adst, identity
def_fn_4x4 flipadst, identity
def_fn_4x4 identity, adst
def_fn_4x4 identity, flipadst

// 8 point inverse DCT on \r0-\r7 (in place, 32 bit lanes).
// Expects idct4 coefficients in v0 and idct8 coefficients in v1.
// Clobbers v2-v7.
.macro idct_8 r0, r1, r2, r3, r4, r5, r6, r7
        idct_4          \r0, \r2, \r4, \r6

        mul_mls         v2, \r1, \r7, v1.s[0], v1.s[1] // -> t4a
        mul_mla         v4, \r1, \r7, v1.s[1], v1.s[0] // -> t7a
        mul_mls         v6, \r5, \r3, v1.s[2], v1.s[3] // -> t5a
        mul_mla         v7, \r5, \r3, v1.s[3], v1.s[2] // -> t6a
        srshr           \r1\().4s, v2.4s, #12 // t4a
        srshr           \r7\().4s, v4.4s, #12 // t7a
        srshr           \r3\().4s, v6.4s, #12 // t5a
        srshr           \r5\().4s, v7.4s, #12 // t6a

        sqadd           v2.4s, \r1\().4s, \r3\().4s // t4
        sqsub           \r1\().4s, \r1\().4s, \r3\().4s // t5a
        sqadd           v3.4s, \r7\().4s, \r5\().4s // t7
        sqsub           \r3\().4s, \r7\().4s, \r5\().4s // t6a

        mul_mls         v4, \r3, \r1, v0.s[0], v0.s[0] // -> t5
        mul_mla         v6, \r3, \r1, v0.s[0], v0.s[0] // -> t6
        srshr           v4.4s, v4.4s, #12 // t5
        srshr           v5.4s, v6.4s, #12 // t6

        sqsub           \r7\().4s, \r0\().4s, v3.4s // out7
        sqadd           \r0\().4s, \r0\().4s, v3.4s // out0
        sqadd           \r1\().4s, \r2\().4s, v5.4s // out1
        sqsub           v6.4s, \r2\().4s, v5.4s // out6
        sqadd           \r2\().4s, \r4\().4s, v4.4s // out2
        sqsub           \r5\().4s, \r4\().4s, v4.4s // out5
        sqadd           \r3\().4s, \r6\().4s, v2.4s // out3
        sqsub           \r4\().4s, \r6\().4s, v2.4s // out4
        mov             \r6\().16b, v6.16b // out6
.endm

// 8 point inverse DCT of v16-v23. Clobbers x16, v0-v7.
function inv_dct_4s_x8_neon
        movrel          x16, idct_coeffs
        ld1             {v0.4s, v1.4s}, [x16]
        idct_8          v16, v17, v18, v19, v20, v21, v22, v23
        ret
endfunc

// 8 point inverse ADST: inputs v16-v23, outputs \o0-\o7 (which must be a
// permutation of v16-v23). Clobbers x16, v0-v7.
.macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7
        movrel          x16, iadst8_coeffs
        ld1             {v0.4s, v1.4s}, [x16], #32

        mul_mla         v2, v23, v16, v0.s[0], v0.s[1]
        mul_mls         v4, v23, v16, v0.s[1], v0.s[0]
        mul_mla         v6, v21, v18, v0.s[2], v0.s[3]
        srshr           v16.4s, v2.4s, #12 // t0a
        srshr           v23.4s, v4.4s, #12 // t1a
        mul_mls         v2, v21, v18, v0.s[3], v0.s[2]
        mul_mla         v4, v19, v20, v1.s[0], v1.s[1]
        srshr           v18.4s, v6.4s, #12 // t2a
        srshr           v21.4s, v2.4s, #12 // t3a
        mul_mls         v6, v19, v20, v1.s[1], v1.s[0]
        mul_mla         v2, v17, v22, v1.s[2], v1.s[3]
        srshr           v20.4s, v4.4s, #12 // t4a
        srshr           v19.4s, v6.4s, #12 // t5a
        mul_mls         v4, v17, v22, v1.s[3], v1.s[2]
        srshr           v22.4s, v2.4s, #12 // t6a
        srshr           v17.4s, v4.4s, #12 // t7a

        ld1             {v0.4s}, [x16]  // third table row: 2896, 0, 1567, 3784

        sqadd           v2.4s, v16.4s, v20.4s // t0
        sqsub           v3.4s, v16.4s, v20.4s // t4
        sqadd           v4.4s, v23.4s, v19.4s // t1
        sqsub           v5.4s, v23.4s, v19.4s // t5
        sqadd           v6.4s, v18.4s, v22.4s // t2
        sqsub           v7.4s, v18.4s, v22.4s // t6
        sqadd           v18.4s, v21.4s, v17.4s // t3
        sqsub           v19.4s, v21.4s, v17.4s // t7

        mul_mla         v16, v3, v5, v0.s[3], v0.s[2]
        mul_mls         v20, v3, v5, v0.s[2], v0.s[3]
        mul_mls         v22, v19, v7, v0.s[3], v0.s[2]

        srshr           v3.4s, v16.4s, #12 // t4a
        srshr           v5.4s, v20.4s, #12 // t5a

        mul_mla         v16, v19, v7, v0.s[2], v0.s[3]

        srshr           v7.4s, v22.4s, #12 // t6a
        srshr           v19.4s, v16.4s, #12 // t7a

        sqadd           \o0\().4s, v2.4s, v6.4s // out0
        sqsub           v2.4s, v2.4s, v6.4s // t2
        sqadd           \o7\().4s, v4.4s, v18.4s // out7
        sqsub           v4.4s, v4.4s, v18.4s // t3
        sqneg           \o7\().4s, \o7\().4s // out7

        sqadd           \o1\().4s, v3.4s, v7.4s // out1
        sqsub           v3.4s, v3.4s, v7.4s // t6
        sqadd           \o6\().4s, v5.4s, v19.4s // out6
        sqsub           v5.4s, v5.4s, v19.4s // t7
        sqneg           \o1\().4s, \o1\().4s // out1

        mul_mla         v18, v2, v4, v0.s[0], v0.s[0] // -> out3 (v19 or v20)
        mul_mls         v6, v2, v4, v0.s[0], v0.s[0] // -> out4 (v20 or v19)
        mul_mls         v20, v3, v5, v0.s[0], v0.s[0] // -> out5 (v21 or v18)
        srshr           v2.4s, v18.4s, #12 // out3
        mul_mla         v18, v3, v5, v0.s[0], v0.s[0] // -> out2 (v18 or v21)
        srshr           v3.4s, v20.4s, #12 // out5
        srshr           \o2\().4s, v18.4s, #12 // out2 (v18 or v21)
        srshr           \o4\().4s, v6.4s, #12 // out4 (v20 or v19)

        sqneg           \o3\().4s, v2.4s // out3
        sqneg           \o5\().4s, v3.4s // out5
.endm

// 8 point inverse ADST of v16-v23, in place.
function inv_adst_4s_x8_neon
        iadst_8         v16, v17, v18, v19, v20, v21, v22, v23
        ret
endfunc

// 8 point inverse ADST of v16-v23 with the outputs in reversed order.
function inv_flipadst_4s_x8_neon
        iadst_8         v23, v22, v21, v20, v19, v18, v17, v16
        ret
endfunc

// 8 point identity transform: saturating doubling of v16-v23.
function inv_identity_4s_x8_neon
        sqshl           v16.4s, v16.4s, #1
        sqshl           v17.4s, v17.4s, #1
        sqshl           v18.4s, v18.4s, #1
        sqshl           v19.4s, v19.4s, #1
        sqshl           v20.4s, v20.4s, #1
        sqshl           v21.4s, v21.4s, #1
        sqshl           v22.4s, v22.4s, #1
        sqshl           v23.4s, v23.4s, #1
        ret
endfunc

// Shared 8x8 body. x4 = first pass, x5 = second pass, w13 = eob threshold,
// x15 = caller's return address. The first pass runs on two groups of four
// 32 bit columns; when w3 (eob) < w13 the group starting at x2+16 is skipped
// and its result (v24-v27) is zeroed instead.
function inv_txfm_add_8x8_neon
        movi            v31.4s, #0

        cmp             w3, w13
        mov             x11, #32
        b.lt            1f

        add             x6, x2, #16
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
        ld1             {\i}, [x6]
        st1             {v31.4s}, [x6], x11     // clear coefficients as they are consumed
.endr

        blr             x4

        sqrshrn         v24.4h, v16.4s, #1
        sqrshrn         v25.4h, v17.4s, #1
        sqrshrn         v26.4h, v18.4s, #1
        sqrshrn         v27.4h, v19.4s, #1
        sqrshrn2        v24.8h, v20.4s, #1
        sqrshrn2        v25.8h, v21.4s, #1
        sqrshrn2        v26.8h, v22.4s, #1
        sqrshrn2        v27.8h, v23.4s, #1

        transpose_4x8h  v24, v25, v26, v27, v2, v3, v4, v5

        b               2f

1:
.irp i, v24.8h, v25.8h, v26.8h, v27.8h
        movi            \i, #0
.endr

2:

.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
        ld1             {\i}, [x2]
        st1             {v31.4s}, [x2], x11
.endr

        blr             x4

        sqrshrn         v16.4h, v16.4s, #1
        sqrshrn         v17.4h, v17.4s, #1
        sqrshrn         v18.4h, v18.4s, #1
        sqrshrn         v19.4h, v19.4s, #1
        sqrshrn2        v16.8h, v20.4s, #1
        sqrshrn2        v17.8h, v21.4s, #1
        sqrshrn2        v18.8h, v22.4s, #1
        sqrshrn2        v19.8h, v23.4s, #1

        transpose_4x8h  v16, v17, v18, v19, v20, v21, v22, v23

        mov             v20.16b, v24.16b
        mov             v21.16b, v25.16b
        mov             v22.16b, v26.16b
        mov             v23.16b, v27.16b

        blr             x5

        load_add_store_8x8 x0, x7
        ret             x15
endfunc

// Emit the exported 8x8 entry point for a transform pair; \eob_half is the
// eob threshold passed on in w13.
.macro def_fn_8x8 txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_16bpc_neon, export=1
        mov             x15, x30

.ifc \txfm1\()_\txfm2, dct_dct
        idct_dc         8, 8, 1
.endif
        movrel          x5, X(inv_\txfm2\()_8h_x8_neon)
        mov             w13, #\eob_half
        adr             x4, inv_\txfm1\()_4s_x8_neon
        b               inv_txfm_add_8x8_neon
endfunc
.endm

def_fn_8x8 dct, dct, 10
def_fn_8x8 identity, identity, 10
def_fn_8x8 dct, adst, 10
def_fn_8x8 dct, flipadst, 10
def_fn_8x8 dct, identity, 4
def_fn_8x8 adst, dct, 10
def_fn_8x8 adst, adst, 10
def_fn_8x8 adst, flipadst, 10
def_fn_8x8 flipadst, dct, 10
def_fn_8x8 flipadst, adst, 10
def_fn_8x8 flipadst, flipadst, 10
def_fn_8x8 identity, dct, 4
def_fn_8x8 adst, identity, 4
def_fn_8x8 flipadst, identity, 4
def_fn_8x8 identity, adst, 4
def_fn_8x8 identity, flipadst, 4

// Shared 8x4 body. x4 = first pass, x5 = second pass, x15 = caller's return
// address. Loads and clears 32 coefficients at x2, applies the 2896 input
// scaling, and narrows/transposes between the two passes.
function inv_txfm_add_8x4_neon
        movi            v28.4s, #0
        movi            v29.4s, #0
        movi            v30.4s, #0
        movi            v31.4s, #0
        ld1             {v16.4s,v17.4s,v18.4s,v19.4s}, [x2]
        st1             {v28.4s,v29.4s,v30.4s,v31.4s}, [x2], #64
        movz            w16, #2896*8, lsl #16
        dup             v0.2s, w16
        ld1             {v20.4s,v21.4s,v22.4s,v23.4s}, [x2]
        st1             {v28.4s,v29.4s,v30.4s,v31.4s}, [x2]

        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23

        blr             x4

        sqxtn           v16.4h, v16.4s
        sqxtn           v17.4h, v17.4s
        sqxtn           v18.4h, v18.4s
        sqxtn           v19.4h, v19.4s
        sqxtn           v20.4h, v20.4s
        sqxtn           v21.4h, v21.4s
        sqxtn           v22.4h, v22.4s
        sqxtn           v23.4h, v23.4s

        transpose_4x4h  v16, v17, v18, v19, v4, v5, v6, v7
        transpose_4x4h  v20, v21, v22, v23, v4, v5, v6, v7
        ins             v16.d[1], v20.d[0]
        ins             v17.d[1], v21.d[0]
        ins             v18.d[1], v22.d[0]
        ins             v19.d[1], v23.d[0]

        blr             x5

        load_add_store_8x4 x0, x7
        ret             x15
endfunc

// Shared 4x8 body. x4 = first pass, x5 = second pass, w13 = eob threshold,
// x15 = caller's return address. When w3 (eob) < w13 the first pass on the
// group starting at x2+16 is skipped and v20-v23 are zeroed instead.
function inv_txfm_add_4x8_neon
        movz            w16, #2896*8, lsl #16
        movi            v31.4s, #0
        dup             v30.2s, w16

        cmp             w3, w13
        mov             x11, #32
        b.lt            1f

        add             x6, x2, #16
.irp i, v16.4s, v17.4s, v18.4s, v19.4s
        ld1             {\i}, [x6]
        st1             {v31.4s}, [x6], x11
.endr
        scale_input     .4s, v30.s[0], v16, v17, v18, v19
        blr             x4
        sqxtn           v20.4h, v16.4s
        sqxtn           v21.4h, v17.4s
        sqxtn           v22.4h, v18.4s
        sqxtn           v23.4h, v19.4s
        transpose_4x4h  v20, v21, v22, v23, v4, v5, v6, v7

        b               2f

1:
.irp i, v20, v21, v22, v23
        movi            \i\().4h, #0
.endr

2:

.irp i, v16.4s, v17.4s, v18.4s, v19.4s
        ld1             {\i}, [x2]
        st1             {v31.4s}, [x2], x11
.endr
        scale_input     .4s, v30.s[0], v16, v17, v18, v19
        blr             x4
        sqxtn           v16.4h, v16.4s
        sqxtn           v17.4h, v17.4s
        sqxtn           v18.4h, v18.4s
        sqxtn           v19.4h, v19.4s
        transpose_4x4h  v16, v17, v18, v19, v4, v5, v6, v7

        blr             x5

        load_add_store_4x8 x0, x7
        ret             x15
endfunc

// Emit the exported 4x8 / 8x4 entry point for a transform pair. The eob
// threshold in w13 is only set up for \w == 4, the only body that reads it.
.macro def_fn_48 w, h, txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
        mov             x15, x30

.ifc \txfm1\()_\txfm2, dct_dct
        idct_dc         \w, \h, 0
.endif
        adr             x4, inv_\txfm1\()_4s_x\w\()_neon
.if \w == 4
        mov             w13, #\eob_half
.endif
        movrel          x5, X(inv_\txfm2\()_\w\()h_x\h\()_neon)
        b               inv_txfm_add_\w\()x\h\()_neon
endfunc
.endm

// Instantiate all 16 transform combinations for one \w x \h size.
.macro def_fns_48 w, h
def_fn_48 \w, \h, dct, dct, 13
def_fn_48 \w, \h, identity, identity, 13
def_fn_48 \w, \h, dct, adst, 13
def_fn_48 \w, \h, dct, flipadst, 13
def_fn_48 \w, \h, dct, identity, 4
def_fn_48 \w, \h, adst, dct, 13
def_fn_48 \w, \h, adst, adst, 13
def_fn_48 \w, \h, adst, flipadst, 13
def_fn_48 \w, \h, flipadst, dct, 13
def_fn_48 \w, \h, flipadst, adst, 13
def_fn_48 \w, \h, flipadst, flipadst, 13
def_fn_48 \w, \h, identity, dct, 16
def_fn_48 \w, \h, adst, identity, 4
def_fn_48 \w, \h, flipadst, identity, 4
def_fn_48 \w, \h, identity, adst, 16
def_fn_48 \w, \h, identity, flipadst, 16
.endm

def_fns_48 4, 8
def_fns_48 8, 4

// 16 point inverse DCT of v16-v31 (in place, 32 bit lanes).
// Clobbers x16, v0-v7.
function inv_dct_4s_x16_neon
        movrel          x16, idct_coeffs
        ld1             {v0.4s, v1.4s}, [x16], #32

        idct_8          v16, v18, v20, v22, v24, v26, v28, v30

        ld1             {v0.4s, v1.4s}, [x16]   // idct16 rows
        sub             x16, x16, #32           // back to the idct4 row for the reload below

        mul_mls         v2, v17, v31, v0.s[0], v0.s[1] // -> t8a
        mul_mla         v4, v17, v31, v0.s[1], v0.s[0] // -> t15a
        mul_mls         v6, v25, v23, v0.s[2], v0.s[3] // -> t9a
        srshr           v17.4s, v2.4s, #12 // t8a
        srshr           v31.4s, v4.4s, #12 // t15a
        mul_mla         v2, v25, v23, v0.s[3], v0.s[2] // -> t14a
        mul_mls         v4, v21, v27, v1.s[0], v1.s[1] // -> t10a
        srshr           v23.4s, v6.4s, #12 // t9a
        srshr           v25.4s, v2.4s, #12 // t14a
        mul_mla         v6, v21, v27, v1.s[1], v1.s[0] // -> t13a
        mul_mls         v2, v29, v19, v1.s[2], v1.s[3] // -> t11a
        srshr           v21.4s, v4.4s, #12 // t10a
        srshr           v27.4s, v6.4s, #12 // t13a
        mul_mla         v4, v29, v19, v1.s[3], v1.s[2] // -> t12a
        srshr           v19.4s, v2.4s, #12 // t11a
        srshr           v29.4s, v4.4s, #12 // t12a

        ld1             {v0.4s}, [x16]

        sqsub           v2.4s, v17.4s, v23.4s // t9
        sqadd           v17.4s, v17.4s, v23.4s // t8
        sqsub           v3.4s, v31.4s, v25.4s // t14
        sqadd           v31.4s, v31.4s, v25.4s // t15
        sqsub           v23.4s, v19.4s, v21.4s // t10
        sqadd           v19.4s, v19.4s, v21.4s // t11
        sqadd           v25.4s, v29.4s, v27.4s // t12
        sqsub           v29.4s, v29.4s, v27.4s // t13

        mul_mls         v4, v3, v2, v0.s[2], v0.s[3] // -> t9a
        mul_mla         v6, v3, v2, v0.s[3], v0.s[2] // -> t14a
        srshr           v21.4s, v4.4s, #12 // t9a
        srshr           v27.4s, v6.4s, #12 // t14a

        mul_mls         v4, v29, v23, v0.s[2], v0.s[3] // -> t13a
        mul_mla         v6, v29, v23, v0.s[3], v0.s[2] // -> t10a
        srshr           v29.4s, v4.4s, #12 // t13a
        neg             v6.4s, v6.4s
        srshr           v23.4s, v6.4s, #12 // t10a

        sqsub           v2.4s, v17.4s, v19.4s // t11a
        sqadd           v17.4s, v17.4s, v19.4s // t8a
        sqsub           v3.4s, v31.4s, v25.4s // t12a
        sqadd           v31.4s, v31.4s, v25.4s // t15a
        sqadd           v19.4s, v21.4s, v23.4s // t9
        sqsub           v21.4s, v21.4s, v23.4s // t10
        sqsub           v25.4s, v27.4s, v29.4s // t13
        sqadd           v27.4s, v27.4s, v29.4s // t14

        mul_mls         v4, v3, v2, v0.s[0], v0.s[0] // -> t11
        mul_mla         v6, v3, v2, v0.s[0], v0.s[0] // -> t12
        mul_mls         v2, v25, v21, v0.s[0], v0.s[0] // -> t10a

        srshr           v4.4s, v4.4s, #12 // t11
        srshr           v5.4s, v6.4s, #12 // t12
        mul_mla         v6, v25, v21, v0.s[0], v0.s[0] // -> t13a
        srshr           v2.4s, v2.4s, #12 // t10a
        srshr           v3.4s, v6.4s, #12 // t13a

        sqadd           v6.4s, v16.4s, v31.4s // out0
        sqsub           v31.4s, v16.4s, v31.4s // out15
        mov             v16.16b, v6.16b
        sqadd           v23.4s, v30.4s, v17.4s // out7
        sqsub           v7.4s, v30.4s, v17.4s // out8
        sqadd           v17.4s, v18.4s, v27.4s // out1
        sqsub           v30.4s, v18.4s, v27.4s // out14
        sqadd           v18.4s, v20.4s, v3.4s // out2
        sqsub           v29.4s, v20.4s, v3.4s // out13
        sqadd           v3.4s, v28.4s, v19.4s // out6
        sqsub           v25.4s, v28.4s, v19.4s // out9
        sqadd           v19.4s, v22.4s, v5.4s // out3
        sqsub           v28.4s, v22.4s, v5.4s // out12
        sqadd           v20.4s, v24.4s, v4.4s // out4
        sqsub           v27.4s, v24.4s, v4.4s // out11
        sqadd           v21.4s, v26.4s, v2.4s // out5
        sqsub           v26.4s, v26.4s, v2.4s // out10
        mov             v24.16b, v7.16b
        mov             v22.16b, v3.16b

        ret
endfunc

// 16 point inverse ADST: inputs v16-v31, outputs \o0-\o15 (v16..v31 in
// forward or reversed order, as used by the two callers below).
// Clobbers x16, v0-v7.
.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15
        movrel          x16, iadst16_coeffs
        ld1             {v0.4s, v1.4s}, [x16], #32

        mul_mla         v2, v31, v16, v0.s[0], v0.s[1] // -> t0
        mul_mls         v4, v31, v16, v0.s[1], v0.s[0] // -> t1
        mul_mla         v6, v29, v18, v0.s[2], v0.s[3] // -> t2
        srshr           v16.4s, v2.4s, #12 // t0
        srshr           v31.4s, v4.4s, #12 // t1
        mul_mls         v2, v29, v18, v0.s[3], v0.s[2] // -> t3
        mul_mla         v4, v27, v20, v1.s[0], v1.s[1] // -> t4
        srshr           v18.4s, v6.4s, #12 // t2
        srshr           v29.4s, v2.4s, #12 // t3
        mul_mls         v6, v27, v20, v1.s[1], v1.s[0] // -> t5
        mul_mla         v2, v25, v22, v1.s[2], v1.s[3] // -> t6
        srshr           v20.4s, v4.4s, #12 // t4
        srshr           v27.4s, v6.4s, #12 // t5
        mul_mls         v4, v25, v22, v1.s[3], v1.s[2] // -> t7
        ld1             {v0.4s, v1.4s}, [x16]
        movrel          x16, idct_coeffs
        mul_mla         v6, v23, v24, v0.s[0], v0.s[1] // -> t8
        srshr           v22.4s, v2.4s, #12 // t6
        srshr           v25.4s, v4.4s, #12 // t7
        mul_mls         v2, v23, v24, v0.s[1], v0.s[0] // -> t9
        mul_mla         v4, v21, v26, v0.s[2], v0.s[3] // -> t10
        srshr           v23.4s, v6.4s, #12 // t8
        srshr           v24.4s, v2.4s, #12 // t9
        mul_mls         v6, v21, v26, v0.s[3], v0.s[2] // -> t11
        mul_mla         v2, v19, v28, v1.s[0], v1.s[1] // -> t12
        srshr           v21.4s, v4.4s, #12 // t10
        srshr           v26.4s, v6.4s, #12 // t11
        mul_mls         v4, v19, v28, v1.s[1], v1.s[0] // -> t13
        mul_mla         v6, v17, v30, v1.s[2], v1.s[3] // -> t14
        srshr           v19.4s, v2.4s, #12 // t12
        srshr           v28.4s, v4.4s, #12 // t13
        mul_mls         v2, v17, v30, v1.s[3], v1.s[2] // -> t15
        srshr           v17.4s, v6.4s, #12 // t14
        srshr           v30.4s, v2.4s, #12 // t15

        ld1             {v0.4s, v1.4s}, [x16]   // idct4 + idct8 rows of idct_coeffs

        sqsub           v2.4s, v16.4s, v23.4s // t8a
        sqadd           v16.4s, v16.4s, v23.4s // t0a
        sqsub           v3.4s, v31.4s, v24.4s // t9a
        sqadd           v31.4s, v31.4s, v24.4s // t1a
        sqadd           v23.4s, v18.4s, v21.4s // t2a
        sqsub           v18.4s, v18.4s, v21.4s // t10a
        sqadd           v24.4s, v29.4s, v26.4s // t3a
        sqsub           v29.4s, v29.4s, v26.4s // t11a
        sqadd           v21.4s, v20.4s, v19.4s // t4a
        sqsub           v20.4s, v20.4s, v19.4s // t12a
        sqadd           v26.4s, v27.4s, v28.4s // t5a
        sqsub           v27.4s, v27.4s, v28.4s // t13a
        sqadd           v19.4s, v22.4s, v17.4s // t6a
        sqsub           v22.4s, v22.4s, v17.4s // t14a
        sqadd           v28.4s, v25.4s, v30.4s // t7a
        sqsub           v25.4s, v25.4s, v30.4s // t15a

        mul_mla         v4, v2, v3, v1.s[1], v1.s[0] // -> t8
        mul_mls         v6, v2, v3, v1.s[0], v1.s[1] // -> t9
        mul_mla         v2, v18, v29, v1.s[3], v1.s[2] // -> t10
        srshr           v17.4s, v4.4s, #12 // t8
        srshr           v30.4s, v6.4s, #12 // t9
        mul_mls         v4, v18, v29, v1.s[2], v1.s[3] // -> t11
        mul_mls         v6, v27, v20, v1.s[1], v1.s[0] // -> t12
        srshr           v18.4s, v2.4s, #12 // t10
        srshr           v29.4s, v4.4s, #12 // t11
        mul_mla         v2, v27, v20, v1.s[0], v1.s[1] // -> t13
        mul_mls         v4, v25, v22, v1.s[3], v1.s[2] // -> t14
        srshr           v27.4s, v6.4s, #12 // t12
        srshr           v20.4s, v2.4s, #12 // t13
        mul_mla         v6, v25, v22, v1.s[2], v1.s[3] // -> t15
        srshr           v25.4s, v4.4s, #12 // t14
        srshr           v22.4s, v6.4s, #12 // t15

        sqsub           v2.4s, v16.4s, v21.4s // t4
        sqadd           v16.4s, v16.4s, v21.4s // t0
        sqsub           v3.4s, v31.4s, v26.4s // t5
        sqadd           v31.4s, v31.4s, v26.4s // t1
        sqadd           v21.4s, v23.4s, v19.4s // t2
        sqsub           v23.4s, v23.4s, v19.4s // t6
        sqadd           v26.4s, v24.4s, v28.4s // t3
        sqsub           v24.4s, v24.4s, v28.4s // t7
        sqadd           v19.4s, v17.4s, v27.4s // t8a
        sqsub           v17.4s, v17.4s, v27.4s // t12a
        sqadd           v28.4s, v30.4s, v20.4s // t9a
        sqsub           v30.4s, v30.4s, v20.4s // t13a
        sqadd           v27.4s, v18.4s, v25.4s // t10a
        sqsub           v18.4s, v18.4s, v25.4s // t14a
        sqadd           v20.4s, v29.4s, v22.4s // t11a
        sqsub           v29.4s, v29.4s, v22.4s // t15a

        mul_mla         v4, v2, v3, v0.s[3], v0.s[2] // -> t4a
        mul_mls         v6, v2, v3, v0.s[2], v0.s[3] // -> t5a
        mul_mls         v2, v24, v23, v0.s[3], v0.s[2] // -> t6a
        srshr           v22.4s, v4.4s, #12 // t4a
        srshr           v25.4s, v6.4s, #12 // t5a
        mul_mla         v4, v24, v23, v0.s[2], v0.s[3] // -> t7a
        mul_mla         v6, v17, v30, v0.s[3], v0.s[2] // -> t12
        srshr           v24.4s, v2.4s, #12 // t6a
        srshr           v23.4s, v4.4s, #12 // t7a
        mul_mls         v2, v17, v30, v0.s[2], v0.s[3] // -> t13
        mul_mls         v4, v29, v18, v0.s[3], v0.s[2] // -> t14
        srshr           v17.4s, v6.4s, #12 // t12
        mul_mla         v6, v29, v18, v0.s[2], v0.s[3] // -> t15
        srshr           v29.4s, v2.4s, #12 // t13
        srshr           v30.4s, v4.4s, #12 // t14
        srshr           v18.4s, v6.4s, #12 // t15

        sqsub           v2.4s, v16.4s, v21.4s // t2a
.ifc \o0, v16
        sqadd           \o0\().4s, v16.4s, v21.4s // out0
        sqsub           v21.4s, v31.4s, v26.4s // t3a
        sqadd           \o15\().4s, v31.4s, v26.4s // out15
.else
        // \o0 aliases v31 here, which is still needed as an input below,
        // so out0 is staged in v4 first.
        sqadd           v4.4s, v16.4s, v21.4s // out0
        sqsub           v21.4s, v31.4s, v26.4s // t3a
        sqadd           \o15\().4s, v31.4s, v26.4s // out15
        mov             \o0\().16b, v4.16b
.endif
        sqneg           \o15\().4s, \o15\().4s // out15

        sqsub           v3.4s, v29.4s, v18.4s // t15a
        sqadd           \o13\().4s, v29.4s, v18.4s // out13
        sqadd           \o2\().4s, v17.4s, v30.4s // out2
        sqsub           v26.4s, v17.4s, v30.4s // t14a
        sqneg           \o13\().4s, \o13\().4s // out13

        sqadd           \o1\().4s, v19.4s, v27.4s // out1
        sqsub           v27.4s, v19.4s, v27.4s // t10
        sqadd           \o14\().4s, v28.4s, v20.4s // out14
        sqsub           v20.4s, v28.4s, v20.4s // t11
        sqneg           \o1\().4s, \o1\().4s // out1

        sqadd           \o3\().4s, v22.4s, v24.4s // out3
        sqsub           v22.4s, v22.4s, v24.4s // t6
        sqadd           \o12\().4s, v25.4s, v23.4s // out12
        sqsub           v23.4s, v25.4s, v23.4s // t7
        sqneg           \o3\().4s, \o3\().4s // out3

        mul_mls         v24, v2, v21, v0.s[0], v0.s[0] // -> out8 (v24 or v23)
        mul_mla         v4, v2, v21, v0.s[0], v0.s[0] // -> out7 (v23 or v24)
        mul_mla         v6, v26, v3, v0.s[0], v0.s[0] // -> out5 (v21 or v26)

        srshr           v24.4s, v24.4s, #12 // out8
        srshr           v4.4s, v4.4s, #12 // out7
        srshr           v5.4s, v6.4s, #12 // out5
        mul_mls         v6, v26, v3, v0.s[0], v0.s[0] // -> out10 (v26 or v21)
        mul_mla         v2, v22, v23, v0.s[0], v0.s[0] // -> out4 (v20 or v27)
        srshr           v26.4s, v6.4s, #12 // out10

        mul_mls         v6, v22, v23, v0.s[0], v0.s[0] // -> out11 (v27 or v20)
        mul_mla         v22, v27, v20, v0.s[0], v0.s[0] // -> out6 (v22 or v25)
        mul_mls         v21, v27, v20, v0.s[0], v0.s[0] // -> out9 (v25 or v22)

        srshr           \o4\().4s, v2.4s, #12 // out4
        srshr           v6.4s, v6.4s, #12 // out11
        srshr           v7.4s, v21.4s, #12 // out9
        srshr           \o6\().4s, v22.4s, #12 // out6

.ifc \o8, v23
        mov             \o8\().16b, v24.16b
        mov             \o10\().16b, v26.16b
.endif

        sqneg           \o7\().4s, v4.4s // out7
        sqneg           \o5\().4s, v5.4s // out5
        sqneg           \o11\().4s, v6.4s // out11
        sqneg           \o9\().4s, v7.4s // out9
.endm

// 16 point inverse ADST of v16-v31, in place.
function inv_adst_4s_x16_neon
        iadst_16        v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
        ret
endfunc

// 16 point inverse ADST of v16-v31 with the outputs in reversed order.
function inv_flipadst_4s_x16_neon
        iadst_16        v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16
        ret
endfunc

// 16 point identity transform of v16-v31:
// x = 2*x + sqrdmulh(x, 2*(5793-4096)<<19). Clobbers w16, v0, v2.
function inv_identity_4s_x16_neon
        movz            w16, #2*(5793-4096)*8, lsl #16
        dup             v0.2s, w16
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        sqrdmulh        v2.4s, v\i\().4s, v0.s[0]
        sqadd           v\i\().4s, v\i\().4s, v\i\().4s
        sqadd           v\i\().4s, v\i\().4s, v2.4s
.endr
        ret
endfunc

// In-place variant on v16-v31: x += (sqrdmulh(x, \c) + 1) >> 1.
// Clobbers v3.
.macro identity_4x16_shift1 c
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
        sqrdmulh        v3.4s, \i, \c
        srshr           v3.4s, v3.4s, #1
        sqadd           \i, \i, v3.4s
.endr
.endm

// In-place variant on v16-v31: x = 2*x + sqrdmulh(x, \c). Clobbers v3.
.macro identity_4x16 c
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
        sqrdmulh        v3.4s, \i, \c
        sqadd           \i, \i, \i
        sqadd           \i, \i, v3.4s
.endr
.endm

// Define inv_txfm_horz\suffix\()_16x4_neon: first pass helper for 16 wide
// transforms. Loads (and clears) 16 vectors of 4 coefficients from x7 with
// stride x8, optionally scales them by 2896 (\scale), calls the transform in
// x4, narrows with a rounding shift of \shift, transposes, and stores the
// result as 16 bit values at x6 (advanced by 128 bytes). x14 holds the
// return address since x30 is overwritten by the blr.
.macro def_horz_16 scale=0, shift=2, suffix
function inv_txfm_horz\suffix\()_16x4_neon
        mov             x14, x30
        movi            v7.4s, #0
.if \scale
        movz            w16, #2896*8, lsl #16
        dup             v0.2s, w16
.endif
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
        ld1             {\i}, [x7]
        st1             {v7.4s}, [x7], x8
.endr
.if \scale
        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
        scale_input     .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
.endif
        blr             x4
        sqrshrn         v16.4h, v16.4s, #\shift
        sqrshrn         v17.4h, v17.4s, #\shift
        sqrshrn         v18.4h, v18.4s, #\shift
        sqrshrn         v19.4h, v19.4s, #\shift
        sqrshrn2        v16.8h, v20.4s, #\shift
        sqrshrn2        v17.8h, v21.4s, #\shift
        sqrshrn2        v18.8h, v22.4s, #\shift
        sqrshrn2        v19.8h, v23.4s, #\shift
        sqrshrn         v20.4h, v24.4s, #\shift
        sqrshrn         v21.4h, v25.4s, #\shift
        sqrshrn         v22.4h, v26.4s, #\shift
        sqrshrn         v23.4h, v27.4s, #\shift
        sqrshrn2        v20.8h, v28.4s, #\shift
        sqrshrn2        v21.8h, v29.4s, #\shift
        sqrshrn2        v22.8h, v30.4s, #\shift
        sqrshrn2        v23.8h, v31.4s, #\shift
        transpose_4x8h  v16, v17, v18, v19, v4, v5, v6, v7
        transpose_4x8h  v20, v21, v22, v23, v4, v5, v6, v7

.irp i, v16.8h, v20.8h, v17.8h, v21.8h, v18.8h, v22.8h, v19.8h, v23.8h
        st1             {\i}, [x6], #16
.endr

        ret             x14
endfunc
.endm
// Instantiate the 16x4 first-pass helpers generated by def_horz_16:
// the plain variant narrows with a rounding shift of 2, the _scale variant
// runs scale_input on the coefficients first and narrows with a shift of 1.
def_horz_16 scale=0, shift=2
def_horz_16 scale=1, shift=1, suffix=_scale

// Second-pass helper for one 8 pixel wide, 16 row strip.
//   x5   second transform, called indirectly
//   x6   passed as first operand to load_add_store_8x16 (destination)
//   x7   input rows: 16 loads of 8 x int16, post-incremented by x8 each
//   x8   input stride in bytes
//   x14  keeps the return address while x30 is overwritten by blr
// load_add_store_8x16 is a macro defined outside this chunk.
function inv_txfm_add_vert_8x16_neon
        mov             x14, x30
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        ld1             {v\i\().8h}, [x7], x8
.endr
        blr             x5
        load_add_store_8x16 x6, x7
        br              x14
endfunc

// Shared 16x16 driver, tail-called from the def_fn_16x16 entry points.
//   x0   destination, x2 coefficients, w3 eob
//   x4/x5  first/second transform
//   x13  pointer to a list of 16 bit eob thresholds
//   x15  keeps the return address
// A 512 byte stack buffer (16 x 16 x int16) holds the first-pass output.
function inv_txfm_add_16x16_neon
        mov             x15, x30
        sub             sp,  sp,  #512
        ldrh            w12, [x13], #2
        // First pass, four coefficient groups (i = 0, 4, 8, 12). For i > 0
        // the group is only processed if eob >= the current threshold;
        // otherwise jump to 1: with w8 = number of rows still to be zeroed.
.irp i, 0, 4, 8, 12
        add             x6,  sp,  #(\i*16*2)
.if \i > 0
        mov             w8,  #(16 - \i)
        cmp             w3,  w12
        b.lt            1f
.if \i < 12
        ldrh            w12, [x13], #2
.endif
.endif
        add             x7,  x2,  #(\i*4)
        mov             x8,  #16*4
        bl              inv_txfm_horz_16x4_neon
.endr
        b               3f
1:
        movi            v4.8h,   #0
        movi            v5.8h,   #0
        movi            v6.8h,   #0
        movi            v7.8h,   #0
2:
        // Zero the rest of the buffer: 2 x 64 bytes (= 4 rows of 16 x int16)
        // per iteration, w8 counts the remaining rows.
        subs            w8,  w8,  #4
.rept 2
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b
3:
        // Second pass: two 8 column strips, buffer row stride 32 bytes.
.irp i, 0, 8
        add             x6,  x0,  #(\i*2)
        add             x7,  sp,  #(\i*2)
        mov             x8,  #32
        bl              inv_txfm_add_vert_8x16_neon
.endr

        add             sp,  sp,  #512
        br              x15
endfunc

// eob thresholds consumed by inv_txfm_add_16x16_neon, one per 4-wide
// coefficient group after the first (that function never loads the
// fourth entry).
const eob_16x16
        .short 10, 36, 78, 256
endconst

const eob_16x16_identity
        .short 4, 8, 12, 256
endconst

// Emit the exported 16x16 entry point for one (txfm1, txfm2) pair.
// Sets x4/x5 to the first/second transform and x13 to the eob table, then
// tail-calls inv_txfm_add_16x16_neon. When exactly one of the transforms is
// identity, eob_16x16_identity is used; otherwise eob_16x16.
// idct_dc (defined outside this chunk) is only expanded for dct_dct.
// The eob_half argument is accepted but not referenced in the macro body.
.macro def_fn_16x16 txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_16bpc_neon, export=1
.ifc \txfm1\()_\txfm2, dct_dct
        idct_dc         16,  16,  2
.endif
        adr             x4,  inv_\txfm1\()_4s_x16_neon
        movrel          x5,  X(inv_\txfm2\()_8h_x16_neon)
.ifc \txfm1, identity
.ifc \txfm2, identity
        movrel          x13, eob_16x16
.else
        movrel          x13, eob_16x16_identity
.endif
.else
.ifc \txfm2, identity
        movrel          x13, eob_16x16_identity
.else
        movrel          x13, eob_16x16
.endif
.endif
        b               inv_txfm_add_16x16_neon
endfunc
.endm

def_fn_16x16 dct, dct, 36
def_fn_16x16 identity, identity, 36
def_fn_16x16 dct, adst, 36
def_fn_16x16 dct, flipadst, 36
def_fn_16x16 dct, identity, 8
def_fn_16x16 adst, dct, 36
def_fn_16x16 adst, adst, 36
def_fn_16x16 adst, flipadst, 36
def_fn_16x16 flipadst, dct, 36
def_fn_16x16 flipadst, adst, 36
def_fn_16x16 flipadst, flipadst, 36
def_fn_16x16 identity, dct, 8

// Shared 16x4 driver.
//   x0 destination, x2 coefficients (read contiguously and zeroed),
//   x4/x5 first/second transform, x15 keeps the return address.
// The first transform runs once on v16-v31; its output is narrowed to
// int16 (saturating, rounding shift 1) and handed to the second transform
// in two 8 column halves, written at x0 and x0 + 16 bytes.
function inv_txfm_add_16x4_neon
        mov             x15, x30
        movi            v4.4s,   #0

.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
        ld1             {\i}, [x2]
        st1             {v4.4s}, [x2], #16
.endr

        blr             x4

        sqrshrn         v16.4h,  v16.4s,  #1
        sqrshrn         v17.4h,  v17.4s,  #1
        sqrshrn         v18.4h,  v18.4s,  #1
        sqrshrn         v19.4h,  v19.4s,  #1
        sqrshrn2        v16.8h,  v20.4s,  #1
        sqrshrn2        v17.8h,  v21.4s,  #1
        sqrshrn2        v18.8h,  v22.4s,  #1
        sqrshrn2        v19.8h,  v23.4s,  #1
        transpose_4x8h  v16, v17, v18, v19, v2,  v3,  v4,  v5
        blr             x5
        mov             x6,  x0
        load_add_store_8x4 x6, x7

        // Right half: v24-v31 still hold the first-pass output.
        sqrshrn         v16.4h,  v24.4s,  #1
        sqrshrn         v17.4h,  v25.4s,  #1
        sqrshrn         v18.4h,  v26.4s,  #1
        sqrshrn         v19.4h,  v27.4s,  #1
        sqrshrn2        v16.8h,  v28.4s,  #1
        sqrshrn2        v17.8h,  v29.4s,  #1
        sqrshrn2        v18.8h,  v30.4s,  #1
        sqrshrn2        v19.8h,  v31.4s,  #1
        transpose_4x8h  v16, v17, v18, v19, v2,  v3,  v4,  v5
        blr             x5
        add             x6,  x0,  #16
        load_add_store_8x4 x6, x7

        br              x15
endfunc

// Shared 4x16 driver.
//   x0 destination, x2 coefficients, w3 eob,
//   x4/x5 first/second transform, x13 eob thresholds (entries at byte
//   offsets 4, 2, 0 are read, in that order), x15 keeps the return address.
// The coefficients are handled as four 4x4 groups at byte offsets 48, 32,
// 16 and 0 from x2, each read with a 64 byte stride (x11) and zeroed while
// loading. A group at offset 48/32/16 is only transformed if eob is at
// least the matching threshold; otherwise its output registers are zeroed.
// First-pass output ends up in v16-v19, v20-v23, v24-v27, v28-v31 as 4h.
//
// The narrowing after the first pass uses sqrshrn (saturating) like the
// 16x4, 16x8 and 8x16 drivers in this file; the previous plain rshrn would
// wrap around for intermediates that do not fit in int16 after the shift.
function inv_txfm_add_4x16_neon
        ldrh            w12, [x13, #4]
        mov             x15, x30

        mov             x11, #64

        cmp             w3,  w12
        ldrh            w12, [x13, #2]
        b.lt            1f

        add             x6,  x2,  #48
        movi            v2.4s,   #0
.irp i, v16.4s, v17.4s, v18.4s, v19.4s
        ld1             {\i}, [x6]
        st1             {v2.4s}, [x6], x11
.endr
        blr             x4
        sqrshrn         v28.4h,  v16.4s,  #1
        sqrshrn         v29.4h,  v17.4s,  #1
        sqrshrn         v30.4h,  v18.4s,  #1
        sqrshrn         v31.4h,  v19.4s,  #1
        transpose_4x4h  v28, v29, v30, v31, v4,  v5,  v6,  v7

        b               2f
1:
.irp i, v28.4h, v29.4h, v30.4h, v31.4h
        movi            \i,  #0
.endr
2:
        cmp             w3,  w12
        ldrh            w12, [x13, #0]
        b.lt            1f

        add             x6,  x2,  #32
        movi            v2.4s,   #0
.irp i, v16.4s, v17.4s, v18.4s, v19.4s
        ld1             {\i}, [x6]
        st1             {v2.4s}, [x6], x11
.endr
        blr             x4
        sqrshrn         v24.4h,  v16.4s,  #1
        sqrshrn         v25.4h,  v17.4s,  #1
        sqrshrn         v26.4h,  v18.4s,  #1
        sqrshrn         v27.4h,  v19.4s,  #1
        transpose_4x4h  v24, v25, v26, v27, v4,  v5,  v6,  v7

        b               2f
1:
.irp i, v24.4h, v25.4h, v26.4h, v27.4h
        movi            \i,  #0
.endr
2:
        cmp             w3,  w12
        b.lt            1f

        add             x6,  x2,  #16
        movi            v2.4s,   #0
.irp i, v16.4s, v17.4s, v18.4s, v19.4s
        ld1             {\i}, [x6]
        st1             {v2.4s}, [x6], x11
.endr
        blr             x4
        sqrshrn         v20.4h,  v16.4s,  #1
        sqrshrn         v21.4h,  v17.4s,  #1
        sqrshrn         v22.4h,  v18.4s,  #1
        sqrshrn         v23.4h,  v19.4s,  #1
        transpose_4x4h  v20, v21, v22, v23, v4,  v5,  v6,  v7

        b               2f
1:
.irp i, v20.4h, v21.4h, v22.4h, v23.4h
        movi            \i,  #0
.endr
2:
        // The group at offset 0 is always transformed.
        movi            v2.4s,   #0
.irp i, v16.4s, v17.4s, v18.4s, v19.4s
        ld1             {\i}, [x2]
        st1             {v2.4s}, [x2], x11
.endr
        blr             x4
        sqrshrn         v16.4h,  v16.4s,  #1
        sqrshrn         v17.4h,  v17.4s,  #1
        sqrshrn         v18.4h,  v18.4s,  #1
        sqrshrn         v19.4h,  v19.4s,  #1
        transpose_4x8h  v16, v17, v18, v19, v4,  v5,  v6,  v7

        blr             x5

        load_add_store_4x16 x0, x6

        br              x15
endfunc

// eob threshold tables for inv_txfm_add_4x16_neon; def_fn_416 picks one
// depending on which of the two transforms is identity.
const eob_4x16
        .short 13, 29, 45, 64
endconst

const eob_4x16_identity1
        .short 16, 32, 48, 64
endconst

const eob_4x16_identity2
        .short 4, 8, 12, 64
endconst

// Emit the exported \w x \h entry point (4x16 or 16x4) for one transform
// pair: load x4/x5 with the first/second transform and tail-call the shared
// driver. Only the 4x16 case sets x13:
//   identity first, non-identity second -> eob_4x16_identity1
//   non-identity first, identity second -> eob_4x16_identity2
//   anything else                       -> eob_4x16
.macro def_fn_416 w, h, txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
.ifc \txfm1\()_\txfm2, dct_dct
        idct_dc         \w,  \h,  1
.endif
.if \w == 4
        adr             x4,  inv_\txfm1\()_4s_x\w\()_neon
        movrel          x5,  X(inv_\txfm2\()_4h_x\h\()_neon)
.ifc \txfm1, identity
.ifc \txfm2, identity
        movrel          x13, eob_4x16
.else
        movrel          x13, eob_4x16_identity1
.endif
.else
.ifc \txfm2, identity
        movrel          x13, eob_4x16_identity2
.else
        movrel          x13, eob_4x16
.endif
.endif
.else
        adr             x4,  inv_\txfm1\()_4s_x\w\()_neon
        movrel          x5,  X(inv_\txfm2\()_8h_x\h\()_neon)
.endif
        b               inv_txfm_add_\w\()x\h\()_neon
endfunc
.endm

// Instantiate def_fn_416 for all 16 transform pairs of one block size.
.macro def_fns_416 w, h
def_fn_416 \w, \h, dct, dct
def_fn_416 \w, \h, identity, identity
def_fn_416 \w, \h, dct, adst
def_fn_416 \w, \h, dct, flipadst
def_fn_416 \w, \h, dct, identity
def_fn_416 \w, \h, adst, dct
def_fn_416 \w, \h, adst, adst
def_fn_416 \w, \h, adst, flipadst
def_fn_416 \w, \h, flipadst, dct
def_fn_416 \w, \h, flipadst, adst
def_fn_416 \w, \h, flipadst, flipadst
def_fn_416 \w, \h, identity, dct
def_fn_416 \w, \h, adst, identity
def_fn_416 \w, \h, flipadst, identity
def_fn_416 \w, \h, identity, adst
def_fn_416 \w, \h, identity, flipadst
.endm

def_fns_416 4, 16
def_fns_416 16, 4

// Shared 16x8 driver; the body continues past this point. x15 keeps the
// return address; the 0x40 byte frame opened here is for d8-d15.
function inv_txfm_add_16x8_neon
        mov             x15, x30
        stp             d8,  d9,  [sp, #-0x40]!
// Continuation of inv_txfm_add_16x8_neon (its first two instructions, which
// save x30 in x15 and open the 0x40 byte frame with d8/d9, precede this
// point). d10-d15 are saved as well because v8-v15 are used below to carry
// first-pass results across the indirect calls.
//   x0 destination, x2 coefficients, x4/x5 first/second transform.
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]
        stp             d14, d15, [sp, #0x30]

        movi            v4.4s,   #0
        // Constant handed to scale_input as v0.s[0]. scale_input is defined
        // outside this chunk; presumably a multiply by 2896/4096 (~1/sqrt(2)).
        movz            w16, #2896*8, lsl #16
        dup             v0.2s,   w16

        mov             x11, #32

        // First, the coefficients starting 16 bytes into the buffer:
        // 16 loads with a 32 byte stride, zeroing each location after reading.
        add             x6,  x2,  #16
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
        ld1             {\i}, [x6]
        st1             {v4.4s}, [x6], x11
.endr

        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
        scale_input     .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
        blr             x4

        // Narrow to int16 (saturating, rounding shift 1) into v8-v15 so the
        // result is kept while v16-v31 are reused for the next load.
        sqrshrn         v8.4h,   v16.4s,  #1
        sqrshrn         v9.4h,   v17.4s,  #1
        sqrshrn         v10.4h,  v18.4s,  #1
        sqrshrn         v11.4h,  v19.4s,  #1
        sqrshrn2        v8.8h,   v20.4s,  #1
        sqrshrn2        v9.8h,   v21.4s,  #1
        sqrshrn2        v10.8h,  v22.4s,  #1
        sqrshrn2        v11.8h,  v23.4s,  #1
        sqrshrn         v12.4h,  v24.4s,  #1
        sqrshrn         v13.4h,  v25.4s,  #1
        sqrshrn         v14.4h,  v26.4s,  #1
        sqrshrn         v15.4h,  v27.4s,  #1
        sqrshrn2        v12.8h,  v28.4s,  #1
        sqrshrn2        v13.8h,  v29.4s,  #1
        sqrshrn2        v14.8h,  v30.4s,  #1
        sqrshrn2        v15.8h,  v31.4s,  #1

        transpose_4x8h  v8,  v9,  v10, v11, v2,  v3,  v4,  v5
        transpose_4x8h  v12, v13, v14, v15, v2,  v3,  v4,  v5

        // Reload the scale constant and the zero register before the second
        // load (presumably because the call through x4 and the transposes
        // above may have overwritten v0 and v4).
        movz            w16, #2896*8, lsl #16
        dup             v0.2s,   w16
        movi            v4.4s,   #0
        // Now the coefficients starting at x2 itself, same stride.
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
        ld1             {\i}, [x2]
        st1             {v4.4s}, [x2], x11
.endr

        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
        scale_input     .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
        blr             x4

        sqrshrn         v16.4h,  v16.4s,  #1
        sqrshrn         v17.4h,  v17.4s,  #1
        sqrshrn         v18.4h,  v18.4s,  #1
        sqrshrn         v19.4h,  v19.4s,  #1
        sqrshrn2        v16.8h,  v20.4s,  #1
        sqrshrn2        v17.8h,  v21.4s,  #1
        sqrshrn2        v18.8h,  v22.4s,  #1
        sqrshrn2        v19.8h,  v23.4s,  #1

        // v16-v19 (second load) + v20-v23 (saved v8-v11 from the first load)
        // form the input of the first second-pass call.
        mov             v20.16b, v8.16b
        mov             v21.16b, v9.16b
        mov             v22.16b, v10.16b
        mov             v23.16b, v11.16b

        transpose_4x8h  v16, v17, v18, v19, v2,  v3,  v4,  v5

        // Narrow the other half of the second load into the now free v8-v11.
        sqrshrn         v8.4h,   v24.4s,  #1
        sqrshrn         v9.4h,   v25.4s,  #1
        sqrshrn         v10.4h,  v26.4s,  #1
        sqrshrn         v11.4h,  v27.4s,  #1
        sqrshrn2        v8.8h,   v28.4s,  #1
        sqrshrn2        v9.8h,   v29.4s,  #1
        sqrshrn2        v10.8h,  v30.4s,  #1
        sqrshrn2        v11.8h,  v31.4s,  #1

        transpose_4x8h  v8,  v9,  v10, v11, v2,  v3,  v4,  v5

        blr             x5

        // First 8 columns go to x0.
        mov             x6,  x0
        load_add_store_8x8 x6, x7

        // Second call: v8-v11 and v12-v15 become the input in v16-v23.
        mov             v16.16b, v8.16b
        mov             v17.16b, v9.16b
        mov             v18.16b, v10.16b
        mov             v19.16b, v11.16b
        mov             v20.16b, v12.16b
        mov             v21.16b, v13.16b
        mov             v22.16b, v14.16b
        mov             v23.16b, v15.16b

        blr             x5

        // Remaining 8 columns go to x0 + 16 bytes.
        add             x0,  x0,  #16
        load_add_store_8x8 x0, x7

        ldp             d14, d15, [sp, #0x30]
        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x40
        br              x15
endfunc

// Shared 8x16 driver.
//   x0 destination, x2 coefficients, w3 eob,
//   x4/x5 first/second transform, x13 eob thresholds (entries at byte
//   offsets 4, 2, 0 are read, in that order), x15 keeps the return address.
// The coefficients are handled as four groups at byte offsets 48, 32, 16
// and 0 from x2, each 8 loads with a 64 byte stride (x11), zeroed while
// loading. A group at offset 48/32/16 is only transformed when eob is at
// least the matching threshold; otherwise its result registers are zeroed.
// Results are collected as 8h vectors in v28-v31, v24-v27, v8-v11 (moved to
// v20-v23 at the end) and v16-v19 before the second transform runs.
// d8-d11 are saved because v8-v11 are used as temporary storage.
function inv_txfm_add_8x16_neon
        mov             x15, x30
        stp             d8,  d9,  [sp, #-0x20]!
        stp             d10, d11, [sp, #0x10]
        ldrh            w12, [x13, #4]

        mov             x11, #64

        cmp             w3,  w12
        ldrh            w12, [x13, #2]
        b.lt            1f

        add             x6,  x2,  #48
        movi            v4.4s,   #0
        movz            w16, #2896*8, lsl #16
        dup             v0.2s,   w16
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
        ld1             {\i}, [x6]
        st1             {v4.4s}, [x6], x11
.endr
        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
        blr             x4

        sqrshrn         v28.4h,  v16.4s,  #1
        sqrshrn         v29.4h,  v17.4s,  #1
        sqrshrn         v30.4h,  v18.4s,  #1
        sqrshrn         v31.4h,  v19.4s,  #1
        sqrshrn2        v28.8h,  v20.4s,  #1
        sqrshrn2        v29.8h,  v21.4s,  #1
        sqrshrn2        v30.8h,  v22.4s,  #1
        sqrshrn2        v31.8h,  v23.4s,  #1
        transpose_4x8h  v28, v29, v30, v31, v2,  v3,  v4,  v5

        b               2f

1:
        // eob below the threshold: this group contributes zeros.
.irp i, v28.8h, v29.8h, v30.8h, v31.8h
        movi            \i,  #0
.endr

2:
        cmp             w3,  w12
        ldrh            w12, [x13, #0]
        b.lt            1f

        add             x6,  x2,  #32
        movi            v4.4s,   #0
        movz            w16, #2896*8, lsl #16
        dup             v0.2s,   w16
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
        ld1             {\i}, [x6]
        st1             {v4.4s}, [x6], x11
.endr
        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
        blr             x4

        sqrshrn         v24.4h,  v16.4s,  #1
        sqrshrn         v25.4h,  v17.4s,  #1
        sqrshrn         v26.4h,  v18.4s,  #1
        sqrshrn         v27.4h,  v19.4s,  #1
        sqrshrn2        v24.8h,  v20.4s,  #1
        sqrshrn2        v25.8h,  v21.4s,  #1
        sqrshrn2        v26.8h,  v22.4s,  #1
        sqrshrn2        v27.8h,  v23.4s,  #1
        transpose_4x8h  v24, v25, v26, v27, v2,  v3,  v4,  v5

        b               2f

1:
.irp i, v24.8h, v25.8h, v26.8h, v27.8h
        movi            \i,  #0
.endr

2:
        cmp             w3,  w12
        b.lt            1f

        add             x6,  x2,  #16
        movi            v4.4s,   #0
        movz            w16, #2896*8, lsl #16
        dup             v0.2s,   w16
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
        ld1             {\i}, [x6]
        st1             {v4.4s}, [x6], x11
.endr
        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
        blr             x4

        // Kept in v8-v11 for now: v16-v23 are needed for the last group.
        sqrshrn         v8.4h,   v16.4s,  #1
        sqrshrn         v9.4h,   v17.4s,  #1
        sqrshrn         v10.4h,  v18.4s,  #1
        sqrshrn         v11.4h,  v19.4s,  #1
        sqrshrn2        v8.8h,   v20.4s,  #1
        sqrshrn2        v9.8h,   v21.4s,  #1
        sqrshrn2        v10.8h,  v22.4s,  #1
        sqrshrn2        v11.8h,  v23.4s,  #1
        transpose_4x8h  v8,  v9,  v10, v11, v2,  v3,  v4,  v5

        b               2f

1:
.irp i, v8.8h, v9.8h, v10.8h, v11.8h
        movi            \i,  #0
.endr

2:
        // The group at offset 0 is always transformed.
        movi            v4.4s,   #0
        movz            w16, #2896*8, lsl #16
        dup             v0.2s,   w16
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
        ld1             {\i}, [x2]
        st1             {v4.4s}, [x2], x11
.endr
        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
        blr             x4

        sqrshrn         v16.4h,  v16.4s,  #1
        sqrshrn         v17.4h,  v17.4s,  #1
        sqrshrn         v18.4h,  v18.4s,  #1
        sqrshrn         v19.4h,  v19.4s,  #1
        sqrshrn2        v16.8h,  v20.4s,  #1
        sqrshrn2        v17.8h,  v21.4s,  #1
        sqrshrn2        v18.8h,  v22.4s,  #1
        sqrshrn2        v19.8h,  v23.4s,  #1
        transpose_4x8h  v16, v17, v18, v19, v2,  v3,  v4,  v5

        // Move the parked group into place: v16-v31 now hold all 16 rows.
        mov             v20.16b, v8.16b
        mov             v21.16b, v9.16b
        mov             v22.16b, v10.16b
        mov             v23.16b, v11.16b

        blr             x5

        load_add_store_8x16 x0, x6

        ldp             d10, d11, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x20

        br              x15
endfunc

// eob threshold tables for inv_txfm_add_8x16_neon; def_fn_816 picks one
// depending on which of the two transforms is identity.
const eob_8x16
        .short 10, 43, 75, 128
endconst

const eob_8x16_identity1
        .short 4, 64, 96, 128
endconst

const eob_8x16_identity2
        .short 4, 8, 12, 128
endconst

// Emit the exported \w x \h entry point (8x16 or 16x8) for one transform
// pair: load x4/x5 with the first/second transform and tail-call the shared
// driver. Only the 8x16 case sets x13 (inv_txfm_add_16x8_neon does not read
// it):
//   identity first, non-identity second -> eob_8x16_identity1
//   non-identity first, identity second -> eob_8x16_identity2
//   anything else                       -> eob_8x16
// idct_dc (defined outside this chunk) is only expanded for dct_dct.
.macro def_fn_816 w, h, txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
.ifc \txfm1\()_\txfm2, dct_dct
        idct_dc         \w,  \h,  1
.endif
        adr             x4,  inv_\txfm1\()_4s_x\w\()_neon
        movrel          x5,  X(inv_\txfm2\()_8h_x\h\()_neon)
.if \w == 8
.ifc \txfm1, identity
.ifc \txfm2, identity
        movrel          x13, eob_8x16
.else
        movrel          x13, eob_8x16_identity1
.endif
.else
.ifc \txfm2, identity
        movrel          x13, eob_8x16_identity2
.else
        movrel          x13, eob_8x16
.endif
.endif
.endif
        b               inv_txfm_add_\w\()x\h\()_neon
endfunc
.endm

// Instantiate def_fn_816 for a list of transform pairs of one block size
// (the list continues past this point).
.macro def_fns_816 w, h
def_fn_816 \w, \h, dct, dct
def_fn_816 \w, \h, identity, identity
def_fn_816 \w, \h, dct, adst
def_fn_816 \w, \h, dct, flipadst
def_fn_816 \w, \h, dct, identity
def_fn_816 \w, \h, adst, dct
def_fn_816 \w, \h, adst, adst def_fn_816 \w, \h, adst, flipadst def_fn_816 \w, \h, flipadst, dct def_fn_816 \w, \h, flipadst, adst def_fn_816 \w, \h, flipadst, flipadst def_fn_816 \w, \h, identity, dct def_fn_816 \w, \h, adst, identity def_fn_816 \w, \h, flipadst, identity def_fn_816 \w, \h, identity, adst def_fn_816 \w, \h, identity, flipadst .endm def_fns_816 8, 16 def_fns_816 16, 8 function inv_dct32_odd_4s_x16_neon movrel x16, idct_coeffs, 4*16 ld1 {v0.4s, v1.4s}, [x16], #32 mul_mls v2, v16, v31, v0.s[0], v0.s[1] // -> t16a mul_mla v4, v16, v31, v0.s[1], v0.s[0] // -> t31a mul_mls v6, v24, v23, v0.s[2], v0.s[3] // -> t17a srshr v16.4s, v2.4s, #12 // t16a srshr v31.4s, v4.4s, #12 // t31a mul_mla v2, v24, v23, v0.s[3], v0.s[2] // -> t30a mul_mls v4, v20, v27, v1.s[0], v1.s[1] // -> t18a srshr v24.4s, v6.4s, #12 // t17a srshr v23.4s, v2.4s, #12 // t30a mul_mla v6, v20, v27, v1.s[1], v1.s[0] // -> t29a mul_mls v2, v28, v19, v1.s[2], v1.s[3] // -> t19a srshr v20.4s, v4.4s, #12 // t18a srshr v27.4s, v6.4s, #12 // t29a mul_mla v4, v28, v19, v1.s[3], v1.s[2] // -> t28a ld1 {v0.4s, v1.4s}, [x16] sub x16, x16, #4*24 mul_mls v6, v18, v29, v0.s[0], v0.s[1] // -> t20a srshr v28.4s, v2.4s, #12 // t19a srshr v19.4s, v4.4s, #12 // t28a mul_mla v2, v18, v29, v0.s[1], v0.s[0] // -> t27a mul_mls v4, v26, v21, v0.s[2], v0.s[3] // -> t21a srshr v18.4s, v6.4s, #12 // t20a srshr v29.4s, v2.4s, #12 // t27a mul_mla v6, v26, v21, v0.s[3], v0.s[2] // -> t26a mul_mls v2, v22, v25, v1.s[0], v1.s[1] // -> t22a srshr v26.4s, v4.4s, #12 // t21a srshr v21.4s, v6.4s, #12 // t26a mul_mla v4, v22, v25, v1.s[1], v1.s[0] // -> t25a mul_mls v6, v30, v17, v1.s[2], v1.s[3] // -> t23a srshr v22.4s, v2.4s, #12 // t22a srshr v25.4s, v4.4s, #12 // t25a mul_mla v2, v30, v17, v1.s[3], v1.s[2] // -> t24a srshr v30.4s, v6.4s, #12 // t23a srshr v17.4s, v2.4s, #12 // t24a ld1 {v0.4s, v1.4s}, [x16] sqsub v2.4s, v16.4s, v24.4s // t17 sqadd v16.4s, v16.4s, v24.4s // t16 sqsub v3.4s, v31.4s, v23.4s // t30 sqadd 
v31.4s, v31.4s, v23.4s // t31 sqsub v24.4s, v28.4s, v20.4s // t18 sqadd v28.4s, v28.4s, v20.4s // t19 sqadd v23.4s, v18.4s, v26.4s // t20 sqsub v18.4s, v18.4s, v26.4s // t21 sqsub v20.4s, v30.4s, v22.4s // t22 sqadd v30.4s, v30.4s, v22.4s // t23 sqadd v26.4s, v17.4s, v25.4s // t24 sqsub v17.4s, v17.4s, v25.4s // t25 sqsub v22.4s, v29.4s, v21.4s // t26 sqadd v29.4s, v29.4s, v21.4s // t27 sqadd v25.4s, v19.4s, v27.4s // t28 sqsub v19.4s, v19.4s, v27.4s // t29 mul_mls v4, v3, v2, v1.s[0], v1.s[1] // -> t17a mul_mla v6, v3, v2, v1.s[1], v1.s[0] // -> t30a mul_mla v2, v19, v24, v1.s[1], v1.s[0] // -> t18a srshr v21.4s, v4.4s, #12 // t17a srshr v27.4s, v6.4s, #12 // t30a neg v2.4s, v2.4s // -> t18a mul_mls v4, v19, v24, v1.s[0], v1.s[1] // -> t29a mul_mls v6, v22, v18, v1.s[2], v1.s[3] // -> t21a srshr v19.4s, v2.4s, #12 // t18a srshr v24.4s, v4.4s, #12 // t29a mul_mla v2, v22, v18, v1.s[3], v1.s[2] // -> t26a mul_mla v4, v17, v20, v1.s[3], v1.s[2] // -> t22a srshr v22.4s, v6.4s, #12 // t21a srshr v18.4s, v2.4s, #12 // t26a neg v4.4s, v4.4s // -> t22a mul_mls v6, v17, v20, v1.s[2], v1.s[3] // -> t25a srshr v17.4s, v4.4s, #12 // t22a srshr v20.4s, v6.4s, #12 // t25a sqsub v2.4s, v27.4s, v24.4s // t29 sqadd v27.4s, v27.4s, v24.4s // t30 sqsub v3.4s, v21.4s, v19.4s // t18 sqadd v21.4s, v21.4s, v19.4s // t17 sqsub v24.4s, v16.4s, v28.4s // t19a sqadd v16.4s, v16.4s, v28.4s // t16a sqsub v19.4s, v30.4s, v23.4s // t20a sqadd v30.4s, v30.4s, v23.4s // t23a sqsub v28.4s, v17.4s, v22.4s // t21 sqadd v17.4s, v17.4s, v22.4s // t22 sqadd v23.4s, v26.4s, v29.4s // t24a sqsub v26.4s, v26.4s, v29.4s // t27a sqadd v22.4s, v20.4s, v18.4s // t25 sqsub v20.4s, v20.4s, v18.4s // t26 sqsub v29.4s, v31.4s, v25.4s // t28a sqadd v31.4s, v31.4s, v25.4s // t31a mul_mls v4, v2, v3, v0.s[2], v0.s[3] // -> t18a mul_mla v6, v2, v3, v0.s[3], v0.s[2] // -> t29a mul_mls v2, v29, v24, v0.s[2], v0.s[3] // -> t19 srshr v18.4s, v4.4s, #12 // t18a srshr v25.4s, v6.4s, #12 // t29a mul_mla v4, v29, v24, 
v0.s[3], v0.s[2] // -> t28 mul_mla v6, v26, v19, v0.s[3], v0.s[2] // -> t20 srshr v29.4s, v2.4s, #12 // t19 srshr v24.4s, v4.4s, #12 // t28 neg v6.4s, v6.4s // -> t20 mul_mls v2, v26, v19, v0.s[2], v0.s[3] // -> t27 mul_mla v4, v20, v28, v0.s[3], v0.s[2] // -> t21a srshr v26.4s, v6.4s, #12 // t20 srshr v19.4s, v2.4s, #12 // t27 neg v4.4s, v4.4s // -> t21a mul_mls v6, v20, v28, v0.s[2], v0.s[3] // -> t26a srshr v20.4s, v4.4s, #12 // t21a srshr v28.4s, v6.4s, #12 // t26a sqsub v2.4s, v16.4s, v30.4s // t23 sqadd v16.4s, v16.4s, v30.4s // t16 = out16 sqsub v3.4s, v31.4s, v23.4s // t24 sqadd v31.4s, v31.4s, v23.4s // t31 = out31 sqsub v23.4s, v21.4s, v17.4s // t22a sqadd v17.4s, v21.4s, v17.4s // t17a = out17 sqadd v30.4s, v27.4s, v22.4s // t30a = out30 sqsub v21.4s, v27.4s, v22.4s // t25a sqsub v27.4s, v18.4s, v20.4s // t21 sqadd v18.4s, v18.4s, v20.4s // t18 = out18 sqadd v4.4s, v29.4s, v26.4s // t19a = out19 sqsub v26.4s, v29.4s, v26.4s // t20a sqadd v29.4s, v25.4s, v28.4s // t29 = out29 sqsub v25.4s, v25.4s, v28.4s // t26 sqadd v28.4s, v24.4s, v19.4s // t28a = out28 sqsub v24.4s, v24.4s, v19.4s // t27a mov v19.16b, v4.16b // out19 mul_mls v4, v24, v26, v0.s[0], v0.s[0] // -> t20 mul_mla v6, v24, v26, v0.s[0], v0.s[0] // -> t27 srshr v20.4s, v4.4s, #12 // t20 srshr v22.4s, v6.4s, #12 // t27 mul_mla v4, v25, v27, v0.s[0], v0.s[0] // -> t26a mul_mls v6, v25, v27, v0.s[0], v0.s[0] // -> t21a mov v27.16b, v22.16b // t27 srshr v26.4s, v4.4s, #12 // t26a mul_mls v24, v21, v23, v0.s[0], v0.s[0] // -> t22 mul_mla v4, v21, v23, v0.s[0], v0.s[0] // -> t25 srshr v21.4s, v6.4s, #12 // t21a srshr v22.4s, v24.4s, #12 // t22 srshr v25.4s, v4.4s, #12 // t25 mul_mls v4, v3, v2, v0.s[0], v0.s[0] // -> t23a mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t24a srshr v23.4s, v4.4s, #12 // t23a srshr v24.4s, v6.4s, #12 // t24a ret endfunc .macro def_horz_32 scale=0, shift=2, suffix function inv_txfm_horz\suffix\()_dct_32x4_neon mov x14, x30 movi v7.4s, #0 lsl x8, x8, #1 .if \scale movz w16, 
#2896*8, lsl #16 dup v0.2s, w16 .endif .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s ld1 {\i}, [x7] st1 {v7.4s}, [x7], x8 .endr sub x7, x7, x8, lsl #4 add x7, x7, x8, lsr #1 .if \scale scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31 .endif bl inv_dct_4s_x16_neon transpose_4x4s v16, v17, v18, v19, v2, v3, v4, v5 transpose_4x4s v20, v21, v22, v23, v2, v3, v4, v5 transpose_4x4s v24, v25, v26, v27, v2, v3, v4, v5 transpose_4x4s v28, v29, v30, v31, v2, v3, v4, v5 .macro store1 r0, r1, r2, r3 st1 {\r0}, [x6], #16 st1 {\r1}, [x6], #16 st1 {\r2}, [x6], #16 st1 {\r3}, [x6], #16 .endm store1 v16.4s, v20.4s, v24.4s, v28.4s store1 v17.4s, v21.4s, v25.4s, v29.4s store1 v18.4s, v22.4s, v26.4s, v30.4s store1 v19.4s, v23.4s, v27.4s, v31.4s .purgem store1 sub x6, x6, #64*4 movi v7.4s, #0 .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s ld1 {\i}, [x7] st1 {v7.4s}, [x7], x8 .endr .if \scale // This relies on the fact that the idct also leaves the right coeff in v0.s[1] scale_input .4s, v0.s[1], v16, v17, v18, v19, v20, v21, v22, v23 scale_input .4s, v0.s[1], v24, v25, v26, v27, v28, v29, v30, v31 .endif bl inv_dct32_odd_4s_x16_neon transpose_4x4s v31, v30, v29, v28, v2, v3, v4, v5 transpose_4x4s v27, v26, v25, v24, v2, v3, v4, v5 transpose_4x4s v23, v22, v21, v20, v2, v3, v4, v5 transpose_4x4s v19, v18, v17, v16, v2, v3, v4, v5 .macro store2 r0, r1, r2, r3, shift ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x6] sqsub v4.4s, v0.4s, \r0 sqadd v0.4s, v0.4s, \r0 sqsub v5.4s, v1.4s, \r1 sqadd v1.4s, v1.4s, \r1 sqsub v6.4s, v2.4s, \r2 sqadd v2.4s, v2.4s, \r2 sqsub v7.4s, v3.4s, \r3 sqadd v3.4s, v3.4s, \r3 sqrshrn v0.4h, v0.4s, #\shift sqrshrn2 v0.8h, v1.4s, #\shift sqrshrn v1.4h, v2.4s, #\shift sqrshrn2 v1.8h, v3.4s, #\shift sqrshrn v2.4h, v7.4s, 
#\shift sqrshrn2 v2.8h, v6.4s, #\shift sqrshrn v3.4h, v5.4s, #\shift sqrshrn2 v3.8h, v4.4s, #\shift st1 {v0.8h, v1.8h}, [x6], #32 rev64 v2.8h, v2.8h rev64 v3.8h, v3.8h st1 {v2.8h, v3.8h}, [x6], #32 .endm store2 v31.4s, v27.4s, v23.4s, v19.4s, \shift store2 v30.4s, v26.4s, v22.4s, v18.4s, \shift store2 v29.4s, v25.4s, v21.4s, v17.4s, \shift store2 v28.4s, v24.4s, v20.4s, v16.4s, \shift .purgem store2 br x14 endfunc .endm def_horz_32 scale=0, shift=2 def_horz_32 scale=1, shift=1, suffix=_scale function inv_txfm_add_vert_dct_8x32_neon mov x14, x30 lsl x8, x8, #1 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 ld1 {v\i\().8h}, [x7], x8 .endr sub x7, x7, x8, lsl #4 bl X(inv_dct_8h_x16_neon) .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 st1 {v\i\().8h}, [x7], x8 .endr sub x7, x7, x8, lsl #4 add x7, x7, x8, lsr #1 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 ld1 {v\i\().8h}, [x7], x8 .endr sub x7, x7, x8, lsl #4 sub x7, x7, x8, lsr #1 bl X(inv_dct32_odd_8h_x16_neon) neg x9, x8 mov x10, x6 movi v0.8h, #0 mvni v1.8h, #0xfc, lsl #8 // 0x3ff .macro combine r0, r1, r2, r3, op, stride ld1 {v5.8h}, [x7], \stride ld1 {v2.8h}, [x10], x1 ld1 {v6.8h}, [x7], \stride ld1 {v3.8h}, [x10], x1 \op v5.8h, v5.8h, \r0 ld1 {v7.8h}, [x7], \stride ld1 {v4.8h}, [x10], x1 srshr v5.8h, v5.8h, #4 \op v6.8h, v6.8h, \r1 sqadd v5.8h, v5.8h, v2.8h srshr v6.8h, v6.8h, #4 \op v7.8h, v7.8h, \r2 smax v2.8h, v5.8h, v0.8h ld1 {v5.8h}, [x7], \stride sqadd v6.8h, v6.8h, v3.8h smin v2.8h, v2.8h, v1.8h srshr v7.8h, v7.8h, #4 \op v5.8h, v5.8h, \r3 st1 {v2.8h}, [x6], x1 ld1 {v2.8h}, [x10], x1 smax v3.8h, v6.8h, v0.8h sqadd v7.8h, v7.8h, v4.8h smin v3.8h, v3.8h, v1.8h srshr v5.8h, v5.8h, #4 st1 {v3.8h}, [x6], x1 smax v4.8h, v7.8h, v0.8h sqadd v5.8h, v5.8h, v2.8h smin v4.8h, v4.8h, v1.8h st1 {v4.8h}, [x6], x1 smax v2.8h, v5.8h, v0.8h smin v2.8h, v2.8h, v1.8h st1 {v2.8h}, [x6], x1 .endm combine v31.8h, v30.8h, v29.8h, v28.8h, sqadd, x8 combine 
v27.8h, v26.8h, v25.8h, v24.8h, sqadd, x8 combine v23.8h, v22.8h, v21.8h, v20.8h, sqadd, x8 combine v19.8h, v18.8h, v17.8h, v16.8h, sqadd, x8 sub x7, x7, x8 combine v16.8h, v17.8h, v18.8h, v19.8h, sqsub, x9 combine v20.8h, v21.8h, v22.8h, v23.8h, sqsub, x9 combine v24.8h, v25.8h, v26.8h, v27.8h, sqsub, x9 combine v28.8h, v29.8h, v30.8h, v31.8h, sqsub, x9 .purgem combine br x14 endfunc const eob_32x32 .short 10, 36, 78, 136, 210, 300, 406, 1024 endconst const eob_16x32 .short 10, 36, 78, 151, 215, 279, 343, 512 endconst const eob_16x32_shortside .short 10, 36, 78, 512 endconst const eob_8x32 .short 10, 43, 75, 107, 139, 171, 203, 256 endconst function inv_txfm_add_identity_identity_32x32_16bpc_neon, export=1 movi v0.8h, #0 movi v1.8h, #0 movrel x13, eob_32x32, 2 mov x8, #4*32 1: mov w9, #0 movrel x12, eob_32x32, 2 2: add w9, w9, #8 ld1 {v16.4s, v17.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v18.4s, v19.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v20.4s, v21.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v22.4s, v23.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v24.4s, v25.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v26.4s, v27.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v28.4s, v29.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v30.4s, v31.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 sqxtn v16.4h, v16.4s sqxtn2 v16.8h, v17.4s sqxtn v17.4h, v18.4s sqxtn2 v17.8h, v19.4s sqxtn v18.4h, v20.4s sqxtn2 v18.8h, v21.4s sqxtn v19.4h, v22.4s sqxtn2 v19.8h, v23.4s sqxtn v20.4h, v24.4s sqxtn2 v20.8h, v25.4s sqxtn v21.4h, v26.4s sqxtn2 v21.8h, v27.4s sqxtn v22.4h, v28.4s sqxtn2 v22.8h, v29.4s sqxtn v23.4h, v30.4s sqxtn2 v23.8h, v31.4s transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 load_add_store_8x8 x0, x7, shiftbits=2 ldrh w11, [x12], #4 sub x0, x0, x1, lsl #3 add x0, x0, #2*8 cmp w3, w11 b.ge 2b ldrh w11, [x13], #4 cmp w3, w11 b.lt 9f sub x0, x0, w9, uxtw #1 add x0, x0, x1, lsl #3 msub x2, x8, x9, x2 add x2, x2, #4*8 b 1b 9: ret endfunc .macro shift_16_regs op, shift .irp i, 
v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s \op \i, \i, #\shift .endr .endm .macro def_identity_1632 w, h, wshort, hshort function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1 movz w16, #2896*8, lsl #16 movz w17, #2*(5793-4096)*8, lsl #16 movi v0.4s, #0 movi v1.4s, #0 movrel x13, eob_16x32\hshort, 2 mov x8, #4*\h 1: mov w9, #0 movrel x12, eob_16x32\wshort, 2 2: add w9, w9, #8 ld1 {v16.4s, v17.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 dup v2.2s, w16 ld1 {v18.4s, v19.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 mov v2.s[1], w17 ld1 {v20.4s, v21.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v22.4s, v23.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v24.4s, v25.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v26.4s, v27.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v28.4s, v29.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v30.4s, v31.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 scale_input .4s, v2.s[0], v16, v17, v18, v19, v20, v21, v22, v23 scale_input .4s, v2.s[0], v24, v25, v26, v27, v28, v29, v30, v31 .if \w == 16 // 16x32 identity_4x16_shift1 v2.s[1] .else // 32x16 shift_16_regs sqshl, 1 identity_4x16 v2.s[1] .endif sqxtn v16.4h, v16.4s sqxtn2 v16.8h, v17.4s sqxtn v17.4h, v18.4s sqxtn2 v17.8h, v19.4s sqxtn v18.4h, v20.4s sqxtn2 v18.8h, v21.4s sqxtn v19.4h, v22.4s sqxtn2 v19.8h, v23.4s sqxtn v20.4h, v24.4s sqxtn2 v20.8h, v25.4s sqxtn v21.4h, v26.4s sqxtn2 v21.8h, v27.4s sqxtn v22.4h, v28.4s sqxtn2 v22.8h, v29.4s sqxtn v23.4h, v30.4s sqxtn2 v23.8h, v31.4s transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 .if \w == 16 load_add_store_8x8 x0, x7, shiftbits=2 .else load_add_store_8x8 x0, x7, shiftbits=4 .endif ldrh w11, [x12], #4 sub x0, x0, x1, lsl #3 add x0, x0, #16 cmp w3, w11 b.ge 2b ldrh w11, [x13], #4 cmp w3, w11 b.lt 9f sub x0, x0, w9, uxtw #1 add x0, x0, x1, lsl #3 msub x2, x8, x9, x2 add x2, x2, #4*8 b 1b 9: ret endfunc .endm def_identity_1632 16, 32, _shortside, 
def_identity_1632 32, 16, , _shortside

// Emits inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon.
//   x0 = dst, x1 = dst stride, x2 = coeff, w3 = eob
// The block is handled as 8x8 tiles: eight rows of eight 32-bit coefficients
// are read from x2 (row stride 4*\h bytes) and cleared in place, narrowed to
// 16 bit (rounding shift by 1 when \w == 8, plain saturation otherwise),
// transposed and added to dst by load_add_store_8x8. Tiles are processed
// until eob drops below the current threshold from eob_8x32.
.macro def_identity_832 w, h
function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1
        movi            v0.4s, #0
        movi            v1.4s, #0
        // Working on 8x8 blocks, read every other entry from eob_8x32
        // (the trailing 2 is presumably a byte offset applied by movrel,
        // i.e. start at the second entry - confirm against asm.S).
        movrel          x13, eob_8x32, 2

        mov             w8, #4*\h       // coefficient row stride in bytes
1:
        // Post-increment by 4 bytes: skip one 16-bit threshold per tile.
        ldrh            w12, [x13], #4
        ld1             {v16.4s, v17.4s}, [x2]
        st1             {v0.4s, v1.4s}, [x2], x8
        ld1             {v18.4s, v19.4s}, [x2]
        st1             {v0.4s, v1.4s}, [x2], x8
        ld1             {v20.4s, v21.4s}, [x2]
        st1             {v0.4s, v1.4s}, [x2], x8
        ld1             {v22.4s, v23.4s}, [x2]
        st1             {v0.4s, v1.4s}, [x2], x8
        ld1             {v24.4s, v25.4s}, [x2]
        st1             {v0.4s, v1.4s}, [x2], x8
        ld1             {v26.4s, v27.4s}, [x2]
        st1             {v0.4s, v1.4s}, [x2], x8
        ld1             {v28.4s, v29.4s}, [x2]
        st1             {v0.4s, v1.4s}, [x2], x8
        ld1             {v30.4s, v31.4s}, [x2]
        st1             {v0.4s, v1.4s}, [x2], x8

.if \w == 8
        sqrshrn         v16.4h, v16.4s, #1
        sqrshrn2        v16.8h, v17.4s, #1
        sqrshrn         v17.4h, v18.4s, #1
        sqrshrn2        v17.8h, v19.4s, #1
        sqrshrn         v18.4h, v20.4s, #1
        sqrshrn2        v18.8h, v21.4s, #1
        sqrshrn         v19.4h, v22.4s, #1
        sqrshrn2        v19.8h, v23.4s, #1
        sqrshrn         v20.4h, v24.4s, #1
        sqrshrn2        v20.8h, v25.4s, #1
        sqrshrn         v21.4h, v26.4s, #1
        sqrshrn2        v21.8h, v27.4s, #1
        sqrshrn         v22.4h, v28.4s, #1
        sqrshrn2        v22.8h, v29.4s, #1
        sqrshrn         v23.4h, v30.4s, #1
        sqrshrn2        v23.8h, v31.4s, #1
.else
        sqxtn           v16.4h, v16.4s
        sqxtn2          v16.8h, v17.4s
        sqxtn           v17.4h, v18.4s
        sqxtn2          v17.8h, v19.4s
        sqxtn           v18.4h, v20.4s
        sqxtn2          v18.8h, v21.4s
        sqxtn           v19.4h, v22.4s
        sqxtn2          v19.8h, v23.4s
        sqxtn           v20.4h, v24.4s
        sqxtn2          v20.8h, v25.4s
        sqxtn           v21.4h, v26.4s
        sqxtn2          v21.8h, v27.4s
        sqxtn           v22.4h, v28.4s
        sqxtn2          v22.8h, v29.4s
        sqxtn           v23.4h, v30.4s
        sqxtn2          v23.8h, v31.4s
.endif

        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5

        // The flags set here are consumed by the b.lt below, i.e. they have
        // to survive the load_add_store_8x8 expansion.
        cmp             w3, w12
.if \w == 8
        load_add_store_8x8 x0, x7, shiftbits=2
.else
        load_add_store_8x8 x0, x7, shiftbits=3
.endif

        b.lt            9f
.if \w == 8
        // Rewind the 8 coefficient rows just read, step 8 coefficients right.
        sub             x2, x2, x8, lsl #3
        add             x2, x2, #4*8
.else
        // Rewind 8 dst rows, step 8 pixels (2 bytes each) right.
        sub             x0, x0, x1, lsl #3
        add             x0, x0, #2*8
.endif
        b               1b

9:
        ret
endfunc
.endm

def_identity_832 8, 32
def_identity_832 32, 8

// void inv_txfm_add_dct_dct_32x32_16bpc_neon(dst, dst_stride, coeff, eob, ...)
// First pass: inv_txfm_horz_dct_32x4_neon is run on up to eight groups of
// four lines, writing 16-bit intermediates to a 2048 byte stack buffer. Once
// eob is below the current eob_32x32 threshold the remaining part of the
// buffer is zero-filled instead. Second pass: four 8 column wide strips
// through inv_txfm_add_vert_dct_8x32_neon.
function inv_txfm_add_dct_dct_32x32_16bpc_neon, export=1
        idct_dc         32, 32, 2

        mov             x15, x30
        sub             sp, sp, #2048
        movrel          x13, eob_32x32
        ldrh            w12, [x13], #2

.irp i, 0, 4, 8, 12, 16, 20, 24, 28
        add             x6, sp, #(\i*32*2)
.if \i > 0
        mov             w8, #(32 - \i)  // lines left to zero-fill at 1:
        cmp             w3, w12
        b.lt            1f
.if \i < 28
        ldrh            w12, [x13], #2
.endif
.endif
        add             x7, x2, #(\i*4)
        mov             x8, #32*4
        bl              inv_txfm_horz_dct_32x4_neon
.endr
        b               3f

1:
        movi            v4.8h, #0
        movi            v5.8h, #0
        movi            v6.8h, #0
        movi            v7.8h, #0
2:
        // 4 * 64 bytes = four 32 entry lines of 16-bit data per iteration.
        subs            w8, w8, #4
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:
.irp i, 0, 8, 16, 24
        add             x6, x0, #(\i*2)
        add             x7, sp, #(\i*2)
        mov             x8, #32*2
        bl              inv_txfm_add_vert_dct_8x32_neon
.endr

        add             sp, sp, #2048
        ret             x15
endfunc

// void inv_txfm_add_dct_dct_16x32_16bpc_neon(dst, dst_stride, coeff, eob, ...)
// Same two pass structure with a 1024 byte stack buffer; the first pass calls
// inv_txfm_horz_scale_16x4_neon with x4 = inv_dct_4s_x16_neon.
function inv_txfm_add_dct_dct_16x32_16bpc_neon, export=1
        idct_dc         16, 32, 1

        mov             x15, x30
        sub             sp, sp, #1024
        movrel          x13, eob_16x32
        ldrh            w12, [x13], #2
        adr             x4, inv_dct_4s_x16_neon

.irp i, 0, 4, 8, 12, 16, 20, 24, 28
        add             x6, sp, #(\i*16*2)
        add             x7, x2, #(\i*4)
.if \i > 0
        mov             w8, #(32 - \i)  // lines left to zero-fill at 1:
        cmp             w3, w12
        b.lt            1f
.if \i < 28
        ldrh            w12, [x13], #2
.endif
.endif
        mov             x8, #4*32
        bl              inv_txfm_horz_scale_16x4_neon
.endr
        b               3f

1:
        movi            v4.8h, #0
        movi            v5.8h, #0
        movi            v6.8h, #0
        movi            v7.8h, #0
2:
        // 2 * 64 bytes = four 16 entry lines of 16-bit data per iteration.
        subs            w8, w8, #4
.rept 2
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:
.irp i, 0, 8
        add             x6, x0, #(\i*2)
        add             x7, sp, #(\i*2)
        mov             x8, #16*2
        bl              inv_txfm_add_vert_dct_8x32_neon
.endr

        add             sp, sp, #1024
        ret             x15
endfunc

// void inv_txfm_add_dct_dct_32x16_16bpc_neon(dst, dst_stride, coeff, eob, ...)
// First pass through inv_txfm_horz_scale_dct_32x4_neon, second pass through
// inv_txfm_add_vert_8x16_neon with x5 = inv_dct_8h_x16_neon.
function inv_txfm_add_dct_dct_32x16_16bpc_neon, export=1
        idct_dc         32, 16, 1

        mov             x15, x30
        sub             sp, sp, #1024

        movrel          x13, eob_16x32
        movrel          x5, X(inv_dct_8h_x16_neon)
        ldrh            w12, [x13], #2

.irp i, 0, 4, 8, 12
        add             x6, sp, #(\i*32*2)
        add             x7, x2, #(\i*4)
.if \i > 0
        mov             w8, #(16 - \i)  // lines left to zero-fill at 1:
        cmp             w3, w12
        b.lt            1f
        ldrh            w12, [x13], #2
.endif
        mov             x8, #4*16
        bl              inv_txfm_horz_scale_dct_32x4_neon
.endr
        b               3f

1:
        movi            v4.8h, #0
        movi            v5.8h, #0
        movi            v6.8h, #0
        movi            v7.8h, #0
2:
        subs            w8, w8, #4
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:
.irp i, 0, 8, 16, 24
        add             x6, x0, #(\i*2)
        add             x7, sp, #(\i*2)
        mov             x8, #32*2
        bl              inv_txfm_add_vert_8x16_neon
.endr

        add             sp, sp, #1024
        ret             x15
endfunc

// void inv_txfm_add_dct_dct_8x32_16bpc_neon(dst, dst_stride, coeff, eob, ...)
// The first pass is done inline: eight vectors of four coefficients are
// loaded (and cleared) per iteration, run through inv_dct_4s_x8_neon,
// narrowed with a rounding shift by 2, transposed and stored to a 512 byte
// stack buffer. w9 counts the lines not yet produced; whatever is left when
// eob falls below the eob_8x32 threshold is zero-filled.
function inv_txfm_add_dct_dct_8x32_16bpc_neon, export=1
        idct_dc         8, 32, 2

        mov             x15, x30
        sub             sp, sp, #512

        movrel          x13, eob_8x32

        movi            v28.4s, #0
        mov             x8, #4*32
        mov             w9, #32
        mov             x6, sp
        mov             x7, x2
1:
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        ld1             {v\i\().4s}, [x7]
        st1             {v28.4s}, [x7], x8
.endr
        ldrh            w12, [x13], #2
        sub             w9, w9, #4
        // Back to the first of the 8 vectors, then 4 coefficients further.
        sub             x7, x7, x8, lsl #3
        add             x7, x7, #4*4

        bl              inv_dct_4s_x8_neon

        sqrshrn         v16.4h, v16.4s, #2
        sqrshrn         v17.4h, v17.4s, #2
        sqrshrn         v18.4h, v18.4s, #2
        sqrshrn         v19.4h, v19.4s, #2
        sqrshrn2        v16.8h, v20.4s, #2
        sqrshrn2        v17.8h, v21.4s, #2
        sqrshrn2        v18.8h, v22.4s, #2
        sqrshrn2        v19.8h, v23.4s, #2

        transpose_4x8h  v16, v17, v18, v19, v2, v3, v4, v5

        cmp             w3, w12
        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64

        b.ge            1b
        cbz             w9, 3f

        // v28 is still zero from above.
        movi            v29.8h, #0
        movi            v30.8h, #0
        movi            v31.8h, #0
2:
        subs            w9, w9, #4
        st1             {v28.8h,v29.8h,v30.8h,v31.8h}, [x6], #64
        b.gt            2b

3:
        mov             x6, x0
        mov             x7, sp
        mov             x8, #8*2
        bl              inv_txfm_add_vert_dct_8x32_neon

        add             sp, sp, #512
        ret             x15
endfunc

// void inv_txfm_add_dct_dct_32x8_16bpc_neon(dst, dst_stride, coeff, eob, ...)
// First pass: two calls to inv_txfm_horz_dct_32x4_neon, the second one
// replaced by a 256 byte zero-fill when eob < 10. Second pass: four 8x8
// tiles through inv_dct_8h_x8_neon and load_add_store_8x8.
function inv_txfm_add_dct_dct_32x8_16bpc_neon, export=1
        idct_dc         32, 8, 2

        mov             x15, x30
        sub             sp, sp, #512

.irp i, 0, 4
        add             x6, sp, #(\i*32*2)
        add             x7, x2, #(\i*4)
.if \i > 0
        cmp             w3, #10
        b.lt            1f
.endif
        mov             x8, #8*4
        bl              inv_txfm_horz_dct_32x4_neon
.endr
        b               2f

1:
        movi            v4.8h, #0
        movi            v5.8h, #0
        movi            v6.8h, #0
        movi            v7.8h, #0
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr

2:
        mov             x8, #2*32
        mov             w9, #0          // column offset in pixels
1:
        add             x6, x0, x9, lsl #1
        add             x7, sp, x9, lsl #1 // #(\i*2)

.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        ld1             {v\i\().8h}, [x7], x8
.endr
        add             w9, w9, #8

        bl              X(inv_dct_8h_x8_neon)

        // The flags set here are consumed by the b.lt after the macro.
        cmp             w9, #32

        load_add_store_8x8 x6, x7

        b.lt            1b

        add             sp, sp, #512
        ret             x15
endfunc

// First stage of the 64 point DCT odd half for one group of four inputs.
//   v16-v19  inputs
//   x17      coefficient pointer; 48 bytes are consumed (post-incremented)
//   x6       output pointer; eight vectors (128 bytes) are stored
function inv_dct64_step1_neon
        // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
        // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
        // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
        // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a

        ld1             {v0.4s, v1.4s}, [x17], #32

        sqrdmulh        v23.4s, v16.4s, v0.s[1]         // t63a
        sqrdmulh        v16.4s, v16.4s, v0.s[0]         // t32a
        sqrdmulh        v22.4s, v17.4s, v0.s[2]         // t62a
        sqrdmulh        v17.4s, v17.4s, v0.s[3]         // t33a
        sqrdmulh        v21.4s, v18.4s, v1.s[1]         // t61a
        sqrdmulh        v18.4s, v18.4s, v1.s[0]         // t34a
        sqrdmulh        v20.4s, v19.4s, v1.s[2]         // t60a
        sqrdmulh        v19.4s, v19.4s, v1.s[3]         // t35a

        ld1             {v0.4s}, [x17], #16

        sqadd           v24.4s, v16.4s, v17.4s          // t32
        sqsub           v25.4s, v16.4s, v17.4s          // t33
        sqsub           v26.4s, v19.4s, v18.4s          // t34
        sqadd           v27.4s, v19.4s, v18.4s          // t35
        sqadd           v28.4s, v20.4s, v21.4s          // t60
        sqsub           v29.4s, v20.4s, v21.4s          // t61
        sqsub           v30.4s, v23.4s, v22.4s          // t62
        sqadd           v31.4s, v23.4s, v22.4s          // t63

        mul_mla         v2, v29, v26, v0.s[0], v0.s[1]  // -> t34a
        mul_mls         v4, v29, v26, v0.s[1], v0.s[0]  // -> t61a
        neg             v2.4s, v2.4s                    // t34a
        mul_mls         v6, v30, v25, v0.s[1], v0.s[0]  // -> t33a
        srshr           v26.4s, v2.4s, #12              // t34a
        mul_mla         v2, v30, v25, v0.s[0], v0.s[1]  // -> t62a
        srshr           v29.4s, v4.4s, #12              // t61a
        srshr           v25.4s, v6.4s, #12              // t33a
        srshr           v30.4s, v2.4s, #12              // t62a

        sqadd           v16.4s, v24.4s, v27.4s          // t32a
        sqsub           v19.4s, v24.4s, v27.4s          // t35a
        sqadd           v17.4s, v25.4s, v26.4s          // t33
        sqsub           v18.4s, v25.4s, v26.4s          // t34
        sqsub           v20.4s, v31.4s, v28.4s          // t60a
        sqadd           v23.4s, v31.4s, v28.4s          // t63a
        sqsub           v21.4s, v30.4s, v29.4s          // t61
        sqadd           v22.4s, v30.4s, v29.4s          // t62

        mul_mla         v2, v21, v18, v0.s[2], v0.s[3]  // -> t61a
        mul_mls         v4, v21, v18, v0.s[3], v0.s[2]  // -> t34a
        mul_mla         v6, v20, v19, v0.s[2], v0.s[3]  // -> t60
        srshr           v21.4s, v2.4s, #12              // t61a
        srshr           v18.4s, v4.4s, #12              // t34a
        mul_mls         v2, v20, v19, v0.s[3], v0.s[2]  // -> t35
        srshr           v20.4s, v6.4s, #12              // t60
        srshr           v19.4s, v2.4s, #12              // t35
        st1             {v16.4s, v17.4s, v18.4s, v19.4s}, [x6], #64
        st1             {v20.4s, v21.4s, v22.4s, v23.4s}, [x6], #64

        ret
endfunc

// Second stage of the 64 point DCT odd half, done in place on the output of
// the four inv_dct64_step1_neon calls.
//   x6  walks upwards, x9 walks downwards, 16 bytes per iteration; the loop
//       ends once x6 >= x9.
function inv_dct64_step2_neon
        movrel          x16, idct_coeffs
        ld1             {v0.4s}, [x16]
1:
        // t32a/33/34a/35/60/61a/62/63a
        // t56a/57/58a/59/36/37a/38/39a
        // t40a/41/42a/43/52/53a/54/55a
        // t48a/49/50a/51/44/45a/46/47a
        ldr             q16, [x6, #4*4*0]               // t32a
        ldr             q17, [x9, #4*4*8]               // t39a
        ldr             q18, [x9, #4*4*0]               // t63a
        ldr             q19, [x6, #4*4*8]               // t56a
        ldr             q20, [x6, #4*4*16]              // t40a
        ldr             q21, [x9, #4*4*24]              // t47a
        ldr             q22, [x9, #4*4*16]              // t55a
        ldr             q23, [x6, #4*4*24]              // t48a

        sqadd           v24.4s, v16.4s, v17.4s          // t32
        sqsub           v25.4s, v16.4s, v17.4s          // t39
        sqadd           v26.4s, v18.4s, v19.4s          // t63
        sqsub           v27.4s, v18.4s, v19.4s          // t56
        sqsub           v28.4s, v21.4s, v20.4s          // t40
        sqadd           v29.4s, v21.4s, v20.4s          // t47
        sqadd           v30.4s, v23.4s, v22.4s          // t48
        sqsub           v31.4s, v23.4s, v22.4s          // t55

        mul_mla         v2, v27, v25, v0.s[3], v0.s[2]  // -> t56a
        mul_mls         v4, v27, v25, v0.s[2], v0.s[3]  // -> t39a
        mul_mla         v6, v31, v28, v0.s[3], v0.s[2]  // -> t40a
        srshr           v25.4s, v2.4s, #12              // t56a
        srshr           v27.4s, v4.4s, #12              // t39a
        neg             v6.4s, v6.4s                    // t40a
        mul_mls         v2, v31, v28, v0.s[2], v0.s[3]  // -> t55a
        srshr           v31.4s, v6.4s, #12              // t40a
        srshr           v28.4s, v2.4s, #12              // t55a

        sqadd           v16.4s, v24.4s, v29.4s          // t32a
        sqsub           v19.4s, v24.4s, v29.4s          // t47a
        sqadd           v17.4s, v27.4s, v31.4s          // t39
        sqsub           v18.4s, v27.4s, v31.4s          // t40
        sqsub           v20.4s, v26.4s, v30.4s          // t48a
        sqadd           v23.4s, v26.4s, v30.4s          // t63a
        sqsub           v21.4s, v25.4s, v28.4s          // t55
        sqadd           v22.4s, v25.4s, v28.4s          // t56

        mul_mls         v2, v21, v18, v0.s[0], v0.s[0]  // -> t40a
        mul_mla         v4, v21, v18, v0.s[0], v0.s[0]  // -> t55a
        mul_mls         v6, v20, v19, v0.s[0], v0.s[0]  // -> t47
        srshr           v18.4s, v2.4s, #12              // t40a
        srshr           v21.4s, v4.4s, #12              // t55a
        mul_mla         v2, v20, v19, v0.s[0], v0.s[0]  // -> t48
        srshr           v19.4s, v6.4s, #12              // t47
        srshr           v20.4s, v2.4s, #12              // t48

        str             q16, [x6, #4*4*0]               // t32a
        str             q17, [x9, #4*4*0]               // t39
        str             q18, [x6, #4*4*8]               // t40a
        str             q19, [x9, #4*4*8]               // t47
        str             q20, [x6, #4*4*16]              // t48
        str             q21, [x9, #4*4*16]              // t55a
        str             q22, [x6, #4*4*24]              // t56
        str             q23, [x9, #4*4*24]              // t63a

        add             x6, x6, #4*4
        sub             x9, x9, #4*4
        cmp             x6, x9
        b.lt            1b
        ret
endfunc

// Load v16-v23 from \src, advancing by \strd after each vector. If \clear is
// nonzero, each source vector is overwritten with \zero after being read.
.macro load8 src, strd, zero, clear
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
.if \clear
        ld1             {\i}, [\src]
        st1             {\zero}, [\src], \strd
.else
        ld1             {\i}, [\src], \strd
.endif
.endr
.endm

// Store v16-v31 contiguously to \dst (post-incremented by 256 bytes in total).
.macro store16 dst
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
        st1             {\i}, [\dst], #16
.endr
.endm

// Zero v24-v31.
.macro clear_upper8
.irp i, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
        movi            \i, #0
.endr
.endm

// The *_if macros below emit their instruction only when \cond is nonzero at
// assembly time.
.macro movi_if reg, val, cond
.if \cond
        movi            \reg, \val
.endif
.endm

// \gpr = \val << 16, then broadcast into \reg.
.macro movz16dup_if reg, gpr, val, cond
.if \cond
        movz            \gpr, \val, lsl #16
        dup             \reg, \gpr
.endif
.endm

.macro st1_if regs, dst, cond
.if \cond
        st1             \regs, \dst
.endif
.endm

.macro str_if reg, dst, cond
.if \cond
        str             \reg, \dst
.endif
.endm

// For register-offset addressing: the comma inside "[base, off]" splits the
// operand into the \dst and \dstoff arguments, which are rejoined here.
.macro stroff_if reg, dst, dstoff, cond
.if \cond
        str             \reg, \dst, \dstoff
.endif
.endm

.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7
.if \cond
        scale_input     .4s, \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
.endif
.endm

// Emits inv_txfm_dct\suffix\()_4s_x64_neon, a 64 point DCT on four lanes of
// 32-bit data.
//   x7   input pointer
//   x8   input stride (shifted left by 2 on entry, halved again further down)
//   sp   the result is written starting at sp, so the caller must have
//        reserved the space (the callers in this file reserve 64*4*4 bytes)
//   x14  holds the return address
// \clear: zero every input vector after reading it.
// \scale: pre-scale the inputs by 2896*8<<16 via scale_input.
// The even half is computed with inv_dct_4s_x16_neon and
// inv_dct32_odd_4s_x16_neon and combined by store_addsub, the odd half with
// four inv_dct64_step1_neon calls followed by inv_dct64_step2_neon.
.macro def_dct64_func suffix, clear=0, scale=0
function inv_txfm_dct\suffix\()_4s_x64_neon
        mov             x14, x30
        mov             x6, sp
        lsl             x8, x8, #2

        movz16dup_if    v0.2s, w16, #2896*8, \scale
        movi_if         v7.4s, #0, \clear
        load8           x7, x8, v7.4s, \clear
        clear_upper8
        sub             x7, x7, x8, lsl #3
        add             x7, x7, x8, lsr #1
        scale_if        \scale, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23

        bl              inv_dct_4s_x16_neon

        store16         x6

        movz16dup_if    v0.2s, w16, #2896*8, \scale
        movi_if         v7.4s, #0, \clear
        load8           x7, x8, v7.4s, \clear
        clear_upper8
        sub             x7, x7, x8, lsl #3
        lsr             x8, x8, #1
        sub             x7, x7, x8, lsr #1
        scale_if        \scale, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23

        bl              inv_dct32_odd_4s_x16_neon

        // x6 reads/writes the 16 stored vectors front to back, x10 writes the
        // following 16 vectors back to front (x9 = -16).
        add             x10, x6, #16*15
        sub             x6, x6, #16*16

        mov             x9, #-16

.macro store_addsub r0, r1, r2, r3
        ld1             {v2.4s}, [x6], #16
        ld1             {v3.4s}, [x6], #16
        sqadd           v6.4s, v2.4s, \r0
        sqsub           \r0, v2.4s, \r0
        ld1             {v4.4s}, [x6], #16
        sqadd           v7.4s, v3.4s, \r1
        sqsub           \r1, v3.4s, \r1
        ld1             {v5.4s}, [x6], #16
        sqadd           v2.4s, v4.4s, \r2
        sub             x6, x6, #16*4
        sqsub           \r2, v4.4s, \r2
        st1             {v6.4s}, [x6], #16
        st1             {\r0}, [x10], x9
        sqadd           v3.4s, v5.4s, \r3
        sqsub           \r3, v5.4s, \r3
        st1             {v7.4s}, [x6], #16
        st1             {\r1}, [x10], x9
        st1             {v2.4s}, [x6], #16
        st1             {\r2}, [x10], x9
        st1             {v3.4s}, [x6], #16
        st1             {\r3}, [x10], x9
.endm
        store_addsub    v31.4s, v30.4s, v29.4s, v28.4s
        store_addsub    v27.4s, v26.4s, v25.4s, v24.4s
        store_addsub    v23.4s, v22.4s, v21.4s, v20.4s
        store_addsub    v19.4s, v18.4s, v17.4s, v16.4s
.purgem store_addsub

        add             x6, x6, #4*4*16

        movrel          x17, idct64_coeffs
        movz16dup_if    v0.2s, w16, #2896*8, \scale
        movi_if         v7.4s, #0, \clear
        add             x9, x7, x8, lsl #4              // offset 16
        add             x10, x7, x8, lsl #3             // offset 8
        sub             x9, x9, x8                      // offset 15
        sub             x11, x10, x8                    // offset 7
        ld1             {v16.4s}, [x7]                  // in1  (offset 0)
        ld1             {v17.4s}, [x9]                  // in31 (offset 15)
        ld1             {v18.4s}, [x10]                 // in17 (offset 8)
        ld1             {v19.4s}, [x11]                 // in15 (offset 7)
        st1_if          {v7.4s}, [x7], \clear
        st1_if          {v7.4s}, [x9], \clear
        st1_if          {v7.4s}, [x10], \clear
        st1_if          {v7.4s}, [x11], \clear
        scale_if        \scale, v0.s[0], v16, v17, v18, v19
        bl              inv_dct64_step1_neon
        movz16dup_if    v0.2s, w16, #2896*8, \scale
        movi_if         v7.4s, #0, \clear
        add             x7, x7, x8, lsl #2              // offset 4
        sub             x9, x9, x8, lsl #2              // offset 11
        sub             x10, x7, x8                     // offset 3
        add             x11, x9, x8                     // offset 12
        ld1             {v16.4s}, [x10]                 // in7  (offset 3)
        ld1             {v17.4s}, [x11]                 // in25 (offset 12)
        ld1             {v18.4s}, [x9]                  // in23 (offset 11)
        ld1             {v19.4s}, [x7]                  // in9  (offset 4)
        st1_if          {v7.4s}, [x7], \clear
        st1_if          {v7.4s}, [x9], \clear
        st1_if          {v7.4s}, [x10], \clear
        st1_if          {v7.4s}, [x11], \clear
        scale_if        \scale, v0.s[0], v16, v17, v18, v19
        bl              inv_dct64_step1_neon
        movz16dup_if    v0.2s, w16, #2896*8, \scale
        movi_if         v7.4s, #0, \clear
        sub             x10, x10, x8, lsl #1            // offset 1
        sub             x9, x9, x8, lsl #1              // offset 9
        add             x7, x7, x8                      // offset 5
        add             x11, x11, x8                    // offset 13
        ldr             q16, [x10, x8]                  // in5  (offset 2)
        ldr             q17, [x11]                      // in27 (offset 13)
        ldr             q18, [x9, x8]                   // in21 (offset 10)
        ldr             q19, [x7]                       // in11 (offset 5)
        stroff_if       q7, [x10, x8], \clear
        str_if          q7, [x11], \clear
        stroff_if       q7, [x9, x8], \clear
        str_if          q7, [x7], \clear
        scale_if        \scale, v0.s[0], v16, v17, v18, v19
        bl              inv_dct64_step1_neon
        movz16dup_if    v0.2s, w16, #2896*8, \scale
        movi_if         v7.4s, #0, \clear
        ldr             q16, [x10]                      // in3  (offset 1)
        ldr             q17, [x11, x8]                  // in29 (offset 14)
        ldr             q18, [x9]                       // in19 (offset 9)
        ldr             q19, [x7, x8]                   // in13 (offset 6)
        str_if          q7, [x10], \clear
        stroff_if       q7, [x11, x8], \clear
        str_if          q7, [x9], \clear
        stroff_if       q7, [x7, x8], \clear
        scale_if        \scale, v0.s[0], v16, v17, v18, v19
        bl              inv_dct64_step1_neon

        // Rewind over the 32 vectors written by the four step1 calls.
        sub             x6, x6, #4*4*32
        add             x9, x6, #4*4*7

        bl              inv_dct64_step2_neon

        ret             x14
endfunc
.endm

def_dct64_func _clear, clear=1
def_dct64_func _clear_scale, clear=1, scale=1

// Final butterfly, shift, narrow and transposed store for a 64x4 slice.
//   sp   64 vectors of 4x32-bit data
//   x6   output pointer, 16-bit data, 2*64 bytes per line
//   w12  shift passed to srshl (the callers pass -1 or -2, i.e. a rounding
//        right shift)
//   x14  holds the return address
// x7 walks the input upwards, x8 downwards; x6 fills each output line from
// the left and x9 from the right (lane order reversed with rev64).
function inv_txfm_horz_dct_64x4_neon
        mov             x14, x30

        mov             x7, sp
        add             x8, sp, #4*4*(64 - 4)
        add             x9, x6, #2*56
        mov             x10, #2*64
        mov             x11, #-4*4*4

        dup             v7.4s, w12
1:
        ld1             {v16.4s, v17.4s, v18.4s, v19.4s}, [x7], #64
        ld1             {v28.4s, v29.4s, v30.4s, v31.4s}, [x8], x11
        ld1             {v20.4s, v21.4s, v22.4s, v23.4s}, [x7], #64
        ld1             {v24.4s, v25.4s, v26.4s, v27.4s}, [x8], x11
        transpose_4x4s  v16, v17, v18, v19, v2, v3, v4, v5
        transpose_4x4s  v20, v21, v22, v23, v2, v3, v4, v5
        transpose_4x4s  v31, v30, v29, v28, v2, v3, v4, v5
        transpose_4x4s  v27, v26, v25, v24, v2, v3, v4, v5

.macro store_addsub src0, src1, src2, src3
        sqsub           v1.4s, \src0, \src1
        sqadd           v0.4s, \src0, \src1
        sqsub           v3.4s, \src2, \src3
        srshl           v1.4s, v1.4s, v7.4s
        sqadd           v2.4s, \src2, \src3
        srshl           v3.4s, v3.4s, v7.4s
        srshl           v0.4s, v0.4s, v7.4s
        srshl           v2.4s, v2.4s, v7.4s
        sqxtn           v3.4h, v3.4s
        sqxtn2          v3.8h, v1.4s
        sqxtn           v0.4h, v0.4s
        sqxtn2          v0.8h, v2.4s
        rev64           v3.8h, v3.8h
        st1             {v0.8h}, [x6], x10
        st1             {v3.8h}, [x9], x10
.endm
        store_addsub    v16.4s, v31.4s, v20.4s, v27.4s
        store_addsub    v17.4s, v30.4s, v21.4s, v26.4s
        store_addsub    v18.4s, v29.4s, v22.4s, v25.4s
        store_addsub    v19.4s, v28.4s, v23.4s, v24.4s
.purgem store_addsub
        // Back up the four lines just written; x6 moves 8 entries right,
        // x9 moves 8 entries left.
        sub             x6, x6, x10, lsl #2
        sub             x9, x9, x10, lsl #2
        add             x6, x6, #16
        sub             x9, x9, #16

        cmp             x7, x8
        b.lt            1b
        ret             x14
endfunc

// Final butterfly of the vertical 64 point DCT for an 8 pixel wide strip,
// with rounding shift by 4 and add to the destination.
//   sp   64 vectors of 8x16-bit data
//   x6   dst pointer, x1 dst stride
//   x14  holds the return address
// x6 walks down from the first row and x9 up from row 63. Results are clamped
// to [0, 0x3ff].
// NOTE(review): the upper clamp is a fixed 10 bit maximum; bitdepth_max is
// not consulted in this function.
function inv_txfm_add_vert_dct_8x64_neon
        mov             x14, x30

        mov             x7, sp
        add             x8, sp, #2*8*(64 - 4)
        add             x9, x6, x1, lsl #6
        sub             x9, x9, x1
        neg             x10, x1
        mov             x11, #-2*8*4

        // Clamp limits; nothing in the loop below writes v6 or v7.
        movi            v6.8h, #0
        mvni            v7.8h, #0xfc, lsl #8            // 0x3ff

1:
        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], #64
        ld1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11
        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
        ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11

.macro add_dest_addsub src0, src1, src2, src3
        ld1             {v0.8h}, [x6], x1
        ld1             {v1.8h}, [x9], x10
        sqadd           v4.8h, \src0, \src1
        ld1             {v2.8h}, [x6]
        sqsub           \src0, \src0, \src1
        ld1             {v3.8h}, [x9]
        sqadd           v5.8h, \src2, \src3
        sqsub           \src2, \src2, \src3
        sub             x6, x6, x1
        sub             x9, x9, x10
        srshr           v4.8h, v4.8h, #4
        srshr           v5.8h, v5.8h, #4
        srshr           \src0, \src0, #4
        sqadd           v0.8h, v0.8h, v4.8h
        srshr           \src2, \src2, #4
        sqadd           v1.8h, v1.8h, \src0
        sqadd           v2.8h, v2.8h, v5.8h
        smax            v0.8h, v0.8h, v6.8h
        sqadd           v3.8h, v3.8h, \src2
        smax            v1.8h, v1.8h, v6.8h
        smin            v0.8h, v0.8h, v7.8h
        smax            v2.8h, v2.8h, v6.8h
        smin            v1.8h, v1.8h, v7.8h
        st1             {v0.8h}, [x6], x1
        smax            v3.8h, v3.8h, v6.8h
        smin            v2.8h, v2.8h, v7.8h
        st1             {v1.8h}, [x9], x10
        smin            v3.8h, v3.8h, v7.8h
        st1             {v2.8h}, [x6], x1
        st1             {v3.8h}, [x9], x10
.endm
        add_dest_addsub v16.8h, v31.8h, v17.8h, v30.8h
        add_dest_addsub v18.8h, v29.8h, v19.8h, v28.8h
        add_dest_addsub v20.8h, v27.8h, v21.8h, v26.8h
        add_dest_addsub v22.8h, v25.8h, v23.8h, v24.8h
.purgem add_dest_addsub
        cmp             x7, x8
        b.lt            1b

        ret             x14
endfunc

// Subtract the assembly-time constant \space from sp.
// _WIN32: for more than 4096 bytes, sp - 4096 is loaded from first (via x16)
// before sp is moved the rest of the way, so that the intermediate page is
// touched. Only a single such probe is emitted.
// Elsewhere: amounts of 4096 or more are split into a multiple of 4096 and
// a remainder, each subtracted with its own instruction.
.macro sub_sp space
#ifdef _WIN32
.if \space > 4096
        sub             x16, sp, #4096
        ldr             xzr, [x16]
        sub             sp, x16, #(\space - 4096)
.else
        sub             sp, sp, #\space
.endif
#else
.if \space >= 4096
        sub             sp, sp, #(\space)/4096*4096
.if (\space % 4096) != 0
        sub             sp, sp, #(\space)%4096
.endif
.else
        sub             sp, sp, #\space
.endif
#endif
.endm

// void inv_txfm_add_dct_dct_64x64_16bpc_neon(dst, dst_stride, coeff, eob, ...)
// Stack layout: 64*4*4 bytes of scratch at sp for the 64 point first pass,
// followed by a 64*32*2 byte intermediate buffer at x5. Only 32 lines are
// produced by the first pass; lines past the eob threshold are zero-filled.
function inv_txfm_add_dct_dct_64x64_16bpc_neon, export=1
        idct_dc         64, 64, 2

        mov             x15, x30

        sub_sp          64*32*2+64*4*4
        add             x5, sp, #64*4*4

        movrel          x13, eob_32x32

.irp i, 0, 4, 8, 12, 16, 20, 24, 28
        add             x6, x5, #(\i*64*2)
.if \i > 0
        mov             w8, #(32 - \i)  // lines left to zero-fill at 1:
        cmp             w3, w12
        b.lt            1f
.endif
        add             x7, x2, #(\i*4)
        mov             x8, #32*4
        mov             x12, #-2                        // shift
        bl              inv_txfm_dct_clear_4s_x64_neon
        add             x6, x5, #(\i*64*2)
        bl              inv_txfm_horz_dct_64x4_neon
        // x12 held the shift during the calls; now load the next threshold.
.if \i < 28
        ldrh            w12, [x13], #2
.endif
.endr
        b               3f

1:
        movi            v4.8h, #0
        movi            v5.8h, #0
        movi            v6.8h, #0
        movi            v7.8h, #0
2:
        // 4 * 64 bytes = two 64 entry lines of 16-bit data per iteration.
        subs            w8, w8, #2
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:
.irp i, 0, 8, 16, 24, 32, 40, 48, 56
        add             x7, x5, #(\i*2)
        mov             x8, #64*2
        bl              X(inv_txfm_dct_8h_x64_neon)
        add             x6, x0, #(\i*2)
        bl              inv_txfm_add_vert_dct_8x64_neon
.endr

        add             sp, x5, #64*32*2
        ret             x15
endfunc

// void inv_txfm_add_dct_dct_64x32_16bpc_neon(dst, dst_stride, coeff, eob, ...)
// Like the 64x64 case, but the first pass uses the scaling variant with a
// shift of -1, and the second pass is inv_txfm_add_vert_dct_8x32_neon.
function inv_txfm_add_dct_dct_64x32_16bpc_neon, export=1
        idct_dc         64, 32, 1

        mov             x15, x30

        sub_sp          64*32*2+64*4*4
        add             x5, sp, #64*4*4

        movrel          x13, eob_32x32

.irp i, 0, 4, 8, 12, 16, 20, 24, 28
        add             x6, x5, #(\i*64*2)
.if \i > 0
        mov             w8, #(32 - \i)  // lines left to zero-fill at 1:
        cmp             w3, w12
        b.lt            1f
.endif
        add             x7, x2, #(\i*4)
        mov             x8, #32*4
        mov             x12, #-1                        // shift
        bl              inv_txfm_dct_clear_scale_4s_x64_neon
        add             x6, x5, #(\i*64*2)
        bl              inv_txfm_horz_dct_64x4_neon
.if \i < 28
        ldrh            w12, [x13], #2
.endif
.endr
        b               3f

1:
        movi            v4.8h, #0
        movi            v5.8h, #0
        movi            v6.8h, #0
        movi            v7.8h, #0
2:
        subs            w8, w8, #2
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:
.irp i, 0, 8, 16, 24, 32, 40, 48, 56
        add             x6, x0, #(\i*2)
        add             x7, x5, #(\i*2)
        mov             x8, #64*2
        bl              inv_txfm_add_vert_dct_8x32_neon
.endr

        add             sp, x5, #64*32*2
        ret             x15
endfunc

// void inv_txfm_add_dct_dct_32x64_16bpc_neon(dst, dst_stride, coeff, eob, ...)
// Stack layout: 64*8*2 bytes of scratch at sp for the 64 point second pass,
// followed by a 32*32*2 byte intermediate buffer at x5.
function inv_txfm_add_dct_dct_32x64_16bpc_neon, export=1
        idct_dc         32, 64, 1

        mov             x15, x30

        sub_sp          32*32*2+64*8*2
        add             x5, sp, #64*8*2

        movrel          x13, eob_32x32
        ldrh            w12, [x13], #2

.irp i, 0, 4, 8, 12, 16, 20, 24, 28
        add             x6, x5, #(\i*32*2)
.if \i > 0
        mov             w8, #(32 - \i)  // lines left to zero-fill at 1:
        cmp             w3, w12
        b.lt            1f
        ldrh            w12, [x13], #2
.endif
        add             x7, x2, #(\i*4)
        mov             x8, #32*4
        bl              inv_txfm_horz_scale_dct_32x4_neon
.endr
        b               3f

1:
        movi            v4.8h, #0
        movi            v5.8h, #0
        movi            v6.8h, #0
        movi            v7.8h, #0
2:
        subs            w8, w8, #4
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:
.irp i, 0, 8, 16, 24
        add             x7, x5, #(\i*2)
        mov             x8, #32*2
        bl              X(inv_txfm_dct_8h_x64_neon)
        add             x6, x0, #(\i*2)
        bl              inv_txfm_add_vert_dct_8x64_neon
.endr

        add             sp, x5, #32*32*2
        ret             x15
endfunc

// void inv_txfm_add_dct_dct_64x16_16bpc_neon(dst, dst_stride, coeff, eob, ...)
// x4 (not x5) holds the intermediate buffer here, since x5 is needed for the
// second pass function pointer.
function inv_txfm_add_dct_dct_64x16_16bpc_neon, export=1
        idct_dc         64, 16, 2

        mov             x15, x30

        sub_sp          64*16*2+64*4*4
        add             x4, sp, #64*4*4

        movrel          x13, eob_16x32

.irp i, 0, 4, 8, 12
        add             x6, x4, #(\i*64*2)
.if \i > 0
        mov             w8, #(16 - \i)  // lines left to zero-fill at 1:
        cmp             w3, w12
        b.lt            1f
.endif
        add             x7, x2, #(\i*4)
        mov             x8, #16*4
        mov             x12, #-2                        // shift
        bl              inv_txfm_dct_clear_4s_x64_neon
        add             x6, x4, #(\i*64*2)
        bl              inv_txfm_horz_dct_64x4_neon
.if \i < 12
        ldrh            w12, [x13], #2
.endif
.endr
        b               3f

1:
        movi            v4.8h, #0
        movi            v5.8h, #0
        movi            v6.8h, #0
        movi            v7.8h, #0
2:
        subs            w8, w8, #2
.rept 4
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:
        movrel          x5, X(inv_dct_8h_x16_neon)
.irp i, 0, 8, 16, 24, 32, 40, 48, 56
        add             x6, x0, #(\i*2)
        add             x7, x4, #(\i*2)
        mov             x8, #64*2
        bl              inv_txfm_add_vert_8x16_neon
.endr

        add             sp, x4, #64*16*2
        ret             x15
endfunc

// void inv_txfm_add_dct_dct_16x64_16bpc_neon(dst, dst_stride, coeff, eob, ...)
// First pass through inv_txfm_horz_16x4_neon with x4 = inv_dct_4s_x16_neon,
// second pass as two 8 column wide 64 point strips.
function inv_txfm_add_dct_dct_16x64_16bpc_neon, export=1
        idct_dc         16, 64, 2

        mov             x15, x30

        sub_sp          16*32*2+64*8*2
        add             x5, sp, #64*8*2

        movrel          x13, eob_16x32
        ldrh            w12, [x13], #2

        adr             x4, inv_dct_4s_x16_neon
.irp i, 0, 4, 8, 12, 16, 20, 24, 28
        add             x6, x5, #(\i*16*2)
.if \i > 0
        mov             w8, #(32 - \i)  // lines left to zero-fill at 1:
        cmp             w3, w12
        b.lt            1f
        ldrh            w12, [x13], #2
.endif
        add             x7, x2, #(\i*4)
        mov             x8, #32*4
        bl              inv_txfm_horz_16x4_neon
.endr
        b               3f

1:
        movi            v4.8h, #0
        movi            v5.8h, #0
        movi            v6.8h, #0
        movi            v7.8h, #0
2:
        subs            w8, w8, #4
.rept 2
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
.endr
        b.gt            2b

3:
.irp i, 0, 8
        add             x7, x5, #(\i*2)
        mov             x8, #16*2
        bl              X(inv_txfm_dct_8h_x64_neon)
        add             x6, x0, #(\i*2)
        bl              inv_txfm_add_vert_dct_8x64_neon
.endr

        add             sp, x5, #16*32*2
        ret             x15
endfunc