ref: 80aa7823fbbfe5e3b8c1aeba2dad9234f5225d30
parent: f57189e30080c6d5a0389533e722f6f2bac20272
author: Martin Storsjö <[email protected]>
date: Thu Aug 27 19:35:43 EDT 2020
arm32: mc: NEON implementation of avg/mask/w_avg for 16 bpc

                       Cortex A7       A8       A9      A53      A72      A73
avg_w4_16bpc_neon:         131.4     81.8    117.3    111.0     50.9     58.8
avg_w8_16bpc_neon:         291.9    173.1    293.1    230.9    114.7    128.8
avg_w16_16bpc_neon:        803.3    480.1    821.4    645.8    345.7    384.9
avg_w32_16bpc_neon:       3350.0   1833.1   3188.1   2343.5   1343.9   1500.6
avg_w64_16bpc_neon:       8185.9   4390.6  10448.2   6078.8   3303.6   3466.7
avg_w128_16bpc_neon:     22384.3  10901.2  33721.9  16782.7   8165.1   8416.5
w_avg_w4_16bpc_neon:       251.3    165.8    203.9    158.3     99.6    106.9
w_avg_w8_16bpc_neon:       638.4    427.8    555.7    365.1    283.2    277.4
w_avg_w16_16bpc_neon:     1912.3   1257.5   1623.4   1056.5    879.5    841.8
w_avg_w32_16bpc_neon:     7461.3   4889.6   6383.8   3966.3   3286.8   3296.8
w_avg_w64_16bpc_neon:    18689.3  11698.1  18487.3  10134.1   8156.2   7939.5
w_avg_w128_16bpc_neon:   48776.6  28989.0  53203.3  26004.1  20055.2  20049.4
mask_w4_16bpc_neon:        298.6    189.2    242.3    191.6    115.2    129.6
mask_w8_16bpc_neon:        768.6    501.5    646.1    432.4    302.9    326.8
mask_w16_16bpc_neon:      2320.5   1480.9   1873.0   1270.2    932.2    976.1
mask_w32_16bpc_neon:      9412.0   5791.9   7348.5   4875.1   3896.4   3821.1
mask_w64_16bpc_neon:     23385.9  13875.6  21383.8  12235.9   9469.2   9160.2
mask_w128_16bpc_neon:    60466.4  34762.6  61055.9  31214.0  23299.0  23324.5

For comparison, the corresponding numbers for the existing arm64 implementation:

                      Cortex A53      A72      A73
avg_w4_16bpc_neon:          78.0     38.5     50.0
avg_w8_16bpc_neon:         198.3    105.4    117.8
avg_w16_16bpc_neon:        614.9    339.9    376.7
avg_w32_16bpc_neon:       2313.8   1391.1   1487.7
avg_w64_16bpc_neon:       5733.3   3269.1   3648.4
avg_w128_16bpc_neon:     15105.9   8143.5   8970.4
w_avg_w4_16bpc_neon:       119.2     87.7     92.9
w_avg_w8_16bpc_neon:       322.9    252.3    263.5
w_avg_w16_16bpc_neon:     1016.8    794.0    828.6
w_avg_w32_16bpc_neon:     3910.9   3159.6   3308.3
w_avg_w64_16bpc_neon:     9499.6   7933.9   8026.5
w_avg_w128_16bpc_neon:   24508.3  19502.0  20389.8
mask_w4_16bpc_neon:        138.9     98.7    106.7
mask_w8_16bpc_neon:        375.5    301.1    302.7
mask_w16_16bpc_neon:      1217.2   1064.6    954.4
mask_w32_16bpc_neon:      4821.0   4018.4   3825.7
mask_w64_16bpc_neon:     12262.7   9471.3   9169.7
mask_w128_16bpc_neon:    31356.6  22657.6  23324.5
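
For reference, a rough scalar model of what the three functions compute at
16 bpc (illustrative only; the helper and parameter names below are made up
for this sketch and are not dav1d identifiers). tmp1/tmp2 are the "prep"
intermediates, which carry a -PREP_BIAS offset, and
intermediate_bits = clz(bitdepth_max) - 18:

    #define PREP_BIAS 8192

    static inline int clip(const int v, const int lo, const int hi) {
        return v < lo ? lo : v > hi ? hi : v;
    }

    /* avg: plain average of the two intermediates */
    static int avg_px(const int t1, const int t2, const int ib, const int bd_max) {
        const int rnd = (1 << ib) + 2 * PREP_BIAS;
        return clip((t1 + t2 + rnd) >> (ib + 1), 0, bd_max);
    }

    /* w_avg: weighted average with a 1/16 denominator */
    static int w_avg_px(const int t1, const int t2, const int weight,
                        const int ib, const int bd_max) {
        const int rnd = (8 << ib) + 16 * PREP_BIAS;
        return clip((t1 * weight + t2 * (16 - weight) + rnd) >> (ib + 4), 0, bd_max);
    }

    /* mask: per-pixel blend, m in 0..64 */
    static int mask_px(const int t1, const int t2, const int m,
                       const int ib, const int bd_max) {
        const int rnd = (32 << ib) + 64 * PREP_BIAS;
        return clip((t1 * m + t2 * (64 - m) + rnd) >> (ib + 6), 0, bd_max);
    }

The assembly reaches the same results in a different order: w_avg and mask are
computed as tmp2 plus the (tmp2 - tmp1) difference scaled by a negated
weight/mask, which is why q4 and the mask are negated before the
multiplications, while avg folds the rounding and bias removal into a single
constant and relies on saturating adds/subtracts instead of an explicit clamp.
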
--- /dev/null
+++ b/src/arm/32/mc16.S
@@ -1,0 +1,274 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+#define PREP_BIAS 8192
+
+.macro avg d0, d00, d01, d1, d10, d11
+ vld1.16 {q0, q1}, [r2, :128]! // tmp1
+ vld1.16 {q2, q3}, [r3, :128]! // tmp2
+ vqadd.s16 q0, q0, q2
+ vqadd.s16 q1, q1, q3
+ vmax.s16 q0, q0, q12 // -2*PREP_BIAS - (1 << intermediate_bits)
+ vmax.s16 q1, q1, q12 // -2*PREP_BIAS - (1 << intermediate_bits)
+ vqsub.s16 q0, q0, q12 // -2*PREP_BIAS - (1 << intermediate_bits)
+ vqsub.s16 q1, q1, q12 // -2*PREP_BIAS - (1 << intermediate_bits)
+ vshl.s16 \d0, q0, q13 // -(intermediate_bits+1)
+ vshl.s16 \d1, q1, q13 // -(intermediate_bits+1)
+.endm
+
+.macro w_avg d0, d00, d01, d1, d10, d11
+ vld1.16 {q0, q1}, [r2, :128]! // tmp1
+ vld1.16 {q2, q3}, [r3, :128]! // tmp2
+ // This difference requires a 17 bit range, and all bits are
+ // significant for the following multiplication.
+ vsubl.s16 \d0, d4, d0
+ vsubl.s16 q0, d5, d1
+ vsubl.s16 \d1, d6, d2
+ vsubl.s16 q1, d7, d3
+ vmul.s32 \d0, \d0, q4
+ vmul.s32 q0, q0, q4
+ vmul.s32 \d1, \d1, q4
+ vmul.s32 q1, q1, q4
+ vshr.s32 \d0, \d0, #4
+ vshr.s32 q0, q0, #4
+ vshr.s32 \d1, \d1, #4
+ vshr.s32 q1, q1, #4
+ vaddw.s16 \d0, \d0, d4
+ vaddw.s16 q0, q0, d5
+ vaddw.s16 \d1, \d1, d6
+ vaddw.s16 q1, q1, d7
+ vmovn.i32 \d00, \d0
+ vmovn.i32 \d01, q0
+ vmovn.i32 \d10, \d1
+ vmovn.i32 \d11, q1
+ vrshl.s16 \d0, \d0, q13 // -intermediate_bits
+ vrshl.s16 \d1, \d1, q13 // -intermediate_bits
+ vadd.s16 \d0, \d0, q12 // PREP_BIAS >> intermediate_bits
+ vadd.s16 \d1, \d1, q12 // PREP_BIAS >> intermediate_bits
+ vmin.s16 \d0, \d0, q15 // bitdepth_max
+ vmin.s16 \d1, \d1, q15 // bitdepth_max
+ vmax.s16 \d0, \d0, q14 // 0
+ vmax.s16 \d1, \d1, q14 // 0
+.endm
+
+.macro mask d0, d00, d01, d1, d10, d11
+ vld1.8 {q7}, [r6, :128]! // mask, 0..64 per pixel
+ vld1.16 {q0, q1}, [r2, :128]! // tmp1
+ vneg.s8 q7, q7 // -mask
+ vld1.16 {q2, q3}, [r3, :128]! // tmp2
+ vmovl.s8 q6, d14 // -mask, widened to 16 bit
+ vmovl.s8 q7, d15
+ vmovl.s16 q4, d12 // -mask, widened to 32 bit
+ vmovl.s16 q5, d13
+ vmovl.s16 q6, d14
+ vmovl.s16 q7, d15
+ vsubl.s16 \d0, d4, d0
+ vsubl.s16 q0, d5, d1
+ vsubl.s16 \d1, d6, d2
+ vsubl.s16 q1, d7, d3
+ vmul.s32 \d0, \d0, q4
+ vmul.s32 q0, q0, q5
+ vmul.s32 \d1, \d1, q6
+ vmul.s32 q1, q1, q7
+ vshr.s32 \d0, \d0, #6
+ vshr.s32 q0, q0, #6
+ vshr.s32 \d1, \d1, #6
+ vshr.s32 q1, q1, #6
+ vaddw.s16 \d0, \d0, d4
+ vaddw.s16 q0, q0, d5
+ vaddw.s16 \d1, \d1, d6
+ vaddw.s16 q1, q1, d7
+ vmovn.i32 \d00, \d0
+ vmovn.i32 \d01, q0
+ vmovn.i32 \d10, \d1
+ vmovn.i32 \d11, q1
+ vrshl.s16 \d0, \d0, q13 // -intermediate_bits
+ vrshl.s16 \d1, \d1, q13 // -intermediate_bits
+ vadd.s16 \d0, \d0, q12 // PREP_BIAS >> intermediate_bits
+ vadd.s16 \d1, \d1, q12 // PREP_BIAS >> intermediate_bits
+ vmin.s16 \d0, \d0, q15 // bitdepth_max
+ vmin.s16 \d1, \d1, q15 // bitdepth_max
+ vmax.s16 \d0, \d0, q14 // 0
+ vmax.s16 \d1, \d1, q14 // 0
+.endm
+
+.macro bidir_fn type, bdmax
+function \type\()_16bpc_neon, export=1
+ push {r4-r7,lr}
+ ldr r4, [sp, #20] // w
+ ldr r5, [sp, #24] // h
+ ldr r6, [sp, #28] // bitdepth_max (avg), weight (w_avg) or mask (mask)
+ clz r4, r4 // clz(w), used to index the jump table below
+.ifnc \type, avg
+ ldr r7, [sp, #32]
+ vmov.i16 q14, #0
+ vdup.16 q15, r7 // bitdepth_max
+.endif
+.ifc \type, w_avg
+ vpush {q4}
+.endif
+.ifc \type, mask
+ vpush {q4-q7}
+.endif
+ clz r7, \bdmax
+ sub r7, r7, #18 // intermediate_bits = clz(bitdepth_max) - 18
+.ifc \type, avg
+ mov lr, #1
+ movw r12, #2*PREP_BIAS
+ lsl lr, lr, r7 // 1 << intermediate_bits
+ neg r12, r12 // -2*PREP_BIAS
+ add r7, r7, #1
+ sub r12, r12, lr // -2*PREP_BIAS - (1 << intermediate_bits)
+ neg r7, r7 // -(intermediate_bits+1)
+ vdup.16 q12, r12 // -2*PREP_BIAS - (1 << intermediate_bits)
+ vdup.16 q13, r7 // -(intermediate_bits+1)
+.else
+ mov r12, #PREP_BIAS
+ lsr r12, r12, r7 // PREP_BIAS >> intermediate_bits
+ neg r7, r7 // -intermediate_bits
+ vdup.16 q12, r12 // PREP_BIAS >> intermediate_bits
+ vdup.16 q13, r7 // -intermediate_bits
+.endif
+.ifc \type, w_avg
+ vdup.32 q4, r6 // weight
+ vneg.s32 q4, q4 // -weight, matching the (tmp2 - tmp1) difference in the macro
+.endif
+ adr r7, L(\type\()_tbl)
+ sub r4, r4, #24 // jump table index: 0 (w=128) .. 5 (w=4)
+ \type q8, d16, d17, q9, d18, d19
+ ldr r4, [r7, r4, lsl #2]
+ add r7, r7, r4
+ bx r7
+
+ .align 2
+L(\type\()_tbl):
+ .word 1280f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 640f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 320f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 160f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 80f - L(\type\()_tbl) + CONFIG_THUMB
+ .word 40f - L(\type\()_tbl) + CONFIG_THUMB
+
+40:
+ add r7, r0, r1
+ lsl r1, r1, #1
+4:
+ subs r5, r5, #4
+ vst1.16 {d16}, [r0, :64], r1
+ vst1.16 {d17}, [r7, :64], r1
+ vst1.16 {d18}, [r0, :64], r1
+ vst1.16 {d19}, [r7, :64], r1
+ ble 0f
+ \type q8, d16, d17, q9, d18, d19
+ b 4b
+80:
+ add r7, r0, r1
+ lsl r1, r1, #1
+8:
+ vst1.16 {q8}, [r0, :128], r1
+ subs r5, r5, #2
+ vst1.16 {q9}, [r7, :128], r1
+ ble 0f
+ \type q8, d16, d17, q9, d18, d19
+ b 8b
+160:
+16:
+ \type q10, d20, d21, q11, d22, d23
+ vst1.16 {q8, q9}, [r0, :128], r1
+ subs r5, r5, #2
+ vst1.16 {q10, q11}, [r0, :128], r1
+ ble 0f
+ \type q8, d16, d17, q9, d18, d19
+ b 16b
+320:
+ add r7, r0, #32
+32:
+ \type q10, d20, d21, q11, d22, d23
+ vst1.16 {q8, q9}, [r0, :128], r1
+ subs r5, r5, #1
+ vst1.16 {q10, q11}, [r7, :128], r1
+ ble 0f
+ \type q8, d16, d17, q9, d18, d19
+ b 32b
+640:
+ add r7, r0, #32
+ mov r12, #64
+ sub r1, r1, #64
+64:
+ \type q10, d20, d21, q11, d22, d23
+ vst1.16 {q8, q9}, [r0, :128], r12
+ \type q8, d16, d17, q9, d18, d19
+ vst1.16 {q10, q11}, [r7, :128], r12
+ \type q10, d20, d21, q11, d22, d23
+ vst1.16 {q8, q9}, [r0, :128], r1
+ subs r5, r5, #1
+ vst1.16 {q10, q11}, [r7, :128], r1
+ ble 0f
+ \type q8, d16, d17, q9, d18, d19
+ b 64b
+1280:
+ add r7, r0, #32
+ mov r12, #64
+ sub r1, r1, #192
+128:
+ \type q10, d20, d21, q11, d22, d23
+ vst1.16 {q8, q9}, [r0, :128], r12
+ \type q8, d16, d17, q9, d18, d19
+ vst1.16 {q10, q11}, [r7, :128], r12
+ \type q10, d20, d21, q11, d22, d23
+ vst1.16 {q8, q9}, [r0, :128], r12
+ \type q8, d16, d17, q9, d18, d19
+ vst1.16 {q10, q11}, [r7, :128], r12
+ \type q10, d20, d21, q11, d22, d23
+ vst1.16 {q8, q9}, [r0, :128], r12
+ \type q8, d16, d17, q9, d18, d19
+ vst1.16 {q10, q11}, [r7, :128], r12
+ \type q10, d20, d21, q11, d22, d23
+ vst1.16 {q8, q9}, [r0, :128], r1
+ subs r5, r5, #1
+ vst1.16 {q10, q11}, [r7, :128], r1
+ ble 0f
+ \type q8, d16, d17, q9, d18, d19
+ b 128b
+0:
+.ifc \type, mask
+ vpop {q4-q7}
+.endif
+.ifc \type, w_avg
+ vpop {q4}
+.endif
+ pop {r4-r7,pc}
+endfunc
+.endm
+
+bidir_fn avg, r6
+bidir_fn w_avg, r7
+bidir_fn mask, r7
--- a/src/arm/mc_init_tmpl.c
+++ b/src/arm/mc_init_tmpl.c
@@ -99,10 +99,12 @@
init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, neon);
init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, neon);
init_mct_fn(FILTER_2D_BILINEAR, bilin, neon);
+#endif
c->avg = BF(dav1d_avg, neon);
c->w_avg = BF(dav1d_w_avg, neon);
c->mask = BF(dav1d_mask, neon);
+#if BITDEPTH == 8 || ARCH_AARCH64
c->blend = BF(dav1d_blend, neon);
c->blend_h = BF(dav1d_blend_h, neon);
c->blend_v = BF(dav1d_blend_v, neon);
--- a/src/meson.build
+++ b/src/meson.build
@@ -147,6 +147,7 @@
if dav1d_bitdepths.contains('16')
libdav1d_sources += files(
+ 'arm/32/mc16.S',
)
endif
endif