shithub: dav1d

Download patch

ref: 2e8a3a219b0c886ccbda94f4cf97e43d23be5052
parent: 84f938ecbee168b54c8d118cec1680ba37b3de91
author: Martin Storsjö <[email protected]>
date: Tue May 14 06:23:00 EDT 2019

arm64: msac: Add handwritten versions of msac_decode_bool functions

GCC                     Cortex A53   A72   A73
msac_decode_bool_c:           29.9  17.9  23.2
msac_decode_bool_neon:        27.4  15.3  20.4
msac_decode_bool_adapt_c:     49.2  26.8  31.0
msac_decode_bool_adapt_neon:  38.2  22.2  25.4
msac_decode_bool_equi_c:      26.6  16.8  19.4
msac_decode_bool_equi_neon:   23.9  13.7  15.7

Clang                   Cortex A53   A72   A73
msac_decode_bool_c:           28.0  16.4  23.1
msac_decode_bool_neon:        26.9  14.6  21.0
msac_decode_bool_adapt_c:     46.8  25.1  31.4
msac_decode_bool_adapt_neon:  36.2  19.0  26.2
msac_decode_bool_equi_c:      23.7  13.4  18.8
msac_decode_bool_equi_neon:   23.7  11.3  14.2

This is as fast as, or faster than, what either GCC or Clang
produces.

--- a/src/arm/64/msac.S
+++ b/src/arm/64/msac.S
@@ -215,6 +215,7 @@
         eor             w5,  w5,  #16          // d = clz(rng) ^ 16
         mvn             x7,  x7                // ~dif
         add             x7,  x7,  x3, lsl #48  // ~dif + (v << 48)
+L(renorm2):
         lsl             w4,  w4,  w5           // rng << d
         subs            w6,  w6,  w5           // cnt -= d
         lsl             x7,  x7,  x5           // (~dif + (v << 48)) << d
@@ -277,4 +278,87 @@
 function msac_decode_symbol_adapt16_neon, export=1
         decode_update   .8h, .16b, 16
         b               L(renorm)
+endfunc
+
+function msac_decode_bool_equi_neon, export=1
+        ldp             w5,  w6,  [x0, #RNG]   // w5 = rng, w6 = cnt (RNG + CNT)
+        sub             sp,  sp,  #48          // NOTE(review): no matching add here; presumably undone by the shared renorm tail - confirm
+        ldr             x7,  [x0, #DIF]        // x7 = dif
+        bic             w4,  w5,  #0xff        // r &= 0xff00
+        add             w4,  w4,  #8           // + 8: this is 2*v, halved to v below
+        subs            x8,  x7,  x4, lsl #47  // dif - vw (vw = 2*v << 47); sets the flags used below
+        lsr             w4,  w4,  #1           // v
+        sub             w5,  w5,  w4           // r - v
+        cset            w15, lo                // w15 = (dif < vw), from the subs above
+        csel            w4,  w5,  w4,  hs      // if (ret) v = r - v;  (hs: dif >= vw)
+        csel            x7,  x8,  x7,  hs      // if (ret) dif = dif - vw;

+        clz             w5,  w4                // clz(rng)
+        mvn             x7,  x7                // ~dif
+        eor             w5,  w5,  #16          // d = clz(rng) ^ 16
+        b               L(renorm2)             // join the shared code at L(renorm2), starting with rng << d
+endfunc
+
+function msac_decode_bool_neon, export=1
+        ldp             w5,  w6,  [x0, #RNG]   // w5 = rng, w6 = cnt (RNG + CNT)
+        sub             sp,  sp,  #48          // NOTE(review): no matching add here; presumably undone by the shared renorm tail - confirm
+        ldr             x7,  [x0, #DIF]        // x7 = dif
+        lsr             w4,  w5,  #8           // r >> 8
+        bic             w1,  w1,  #0x3f        // f &= ~63
+        mul             w4,  w4,  w1           // (r >> 8) * f
+        lsr             w4,  w4,  #7           // ((r >> 8) * f) >> 7
+        add             w4,  w4,  #4           // v
+        subs            x8,  x7,  x4, lsl #48  // dif - vw (vw = v << 48); sets the flags used below
+        sub             w5,  w5,  w4           // r - v
+        cset            w15, lo                // w15 = (dif < vw), from the subs above
+        csel            w4,  w5,  w4,  hs      // if (ret) v = r - v;  (hs: dif >= vw)
+        csel            x7,  x8,  x7,  hs      // if (ret) dif = dif - vw;

+        clz             w5,  w4                // clz(rng)
+        mvn             x7,  x7                // ~dif
+        eor             w5,  w5,  #16          // d = clz(rng) ^ 16
+        b               L(renorm2)             // join the shared code at L(renorm2), starting with rng << d
+endfunc
+
+function msac_decode_bool_adapt_neon, export=1
+        ldr             w9,  [x1]              // cdf[0-1]: cdf[0] in bits 0-15, cdf[1] in bits 16-31
+        ldp             w5,  w6,  [x0, #RNG]   // w5 = rng, w6 = cnt (RNG + CNT)
+        sub             sp,  sp,  #48          // NOTE(review): no matching add here; presumably undone by the shared renorm tail - confirm
+        ldr             x7,  [x0, #DIF]        // x7 = dif
+        lsr             w4,  w5,  #8           // r >> 8
+        and             w2,  w9,  #0xffc0      // f = cdf[0] & ~63
+        mul             w4,  w4,  w2           // (r >> 8) * f
+        lsr             w4,  w4,  #7           // ((r >> 8) * f) >> 7
+        add             w4,  w4,  #4           // v
+        subs            x8,  x7,  x4, lsl #48  // dif - vw (vw = v << 48); sets the flags used below
+        sub             w5,  w5,  w4           // r - v
+        cset            w15, lo                // w15 = bit = (dif < vw), from the subs above
+        csel            w4,  w5,  w4,  hs      // if (ret) v = r - v;  (hs: dif >= vw)
+        csel            x7,  x8,  x7,  hs      // if (ret) dif = dif - vw;

+        ldr             w10, [x0, #ALLOW_UPDATE_CDF] // flag tested by the cbz below

+        clz             w5,  w4                // clz(rng)
+        mvn             x7,  x7                // ~dif
+        eor             w5,  w5,  #16          // d = clz(rng) ^ 16

+        cbz             w10, L(renorm2)        // flag is zero: skip the cdf update entirely

+        lsr             w2,  w9,  #16          // count = cdf[1]
+        and             w9,  w9,  #0xffff      // cdf[0]

+        sub             w3,  w2,  w2, lsr #5   // count - (count >= 32)
+        lsr             w2,  w2,  #4           // count >> 4
+        add             w10, w3,  #1           // count + (count < 32)
+        add             w2,  w2,  #4           // rate = (count >> 4) | 4 (add equals | while count >> 4 < 4)

+        sub             w9,  w9,  w15          // cdf[0] -= bit
+        sub             w11, w9,  w15, lsl #15 // {cdf[0], cdf[0] - 32769}
+        asr             w11, w11, w2           // {cdf[0], cdf[0] - 32769} >> rate
+        sub             w9,  w9,  w11          // cdf[0]

+        strh            w9,  [x1]              // store updated cdf[0]
+        strh            w10, [x1, #2]          // store updated count into cdf[1]

+        b               L(renorm2)             // join the shared code at L(renorm2), starting with rng << d
 endfunc
--- a/src/msac.h
+++ b/src/msac.h
@@ -61,12 +61,15 @@
                                               size_t n_symbols);
 unsigned dav1d_msac_decode_symbol_adapt16_neon(MsacContext *s, uint16_t *cdf,
                                                size_t n_symbols);
+unsigned dav1d_msac_decode_bool_adapt_neon(MsacContext *s, uint16_t *cdf);
+unsigned dav1d_msac_decode_bool_equi_neon(MsacContext *s);
+unsigned dav1d_msac_decode_bool_neon(MsacContext *s, unsigned f);
 #define dav1d_msac_decode_symbol_adapt4  dav1d_msac_decode_symbol_adapt4_neon
 #define dav1d_msac_decode_symbol_adapt8  dav1d_msac_decode_symbol_adapt8_neon
 #define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_neon
-#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_c
-#define dav1d_msac_decode_bool_equi      dav1d_msac_decode_bool_equi_c
-#define dav1d_msac_decode_bool           dav1d_msac_decode_bool_c
+#define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_neon
+#define dav1d_msac_decode_bool_equi      dav1d_msac_decode_bool_equi_neon
+#define dav1d_msac_decode_bool           dav1d_msac_decode_bool_neon
 #elif ARCH_X86_64 && HAVE_ASM
 unsigned dav1d_msac_decode_symbol_adapt4_sse2(MsacContext *s, uint16_t *cdf,
                                               size_t n_symbols);
--- a/tests/checkasm/msac.c
+++ b/tests/checkasm/msac.c
@@ -171,6 +171,9 @@
         c.symbol_adapt4  = dav1d_msac_decode_symbol_adapt4_neon;
         c.symbol_adapt8  = dav1d_msac_decode_symbol_adapt8_neon;
         c.symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_neon;
+        c.bool_adapt     = dav1d_msac_decode_bool_adapt_neon;
+        c.bool_equi      = dav1d_msac_decode_bool_equi_neon;
+        c.bool           = dav1d_msac_decode_bool_neon;
     }
 #elif ARCH_X86_64 && HAVE_ASM
     if (dav1d_get_cpu_flags() & DAV1D_X86_CPU_FLAG_SSE2) {