ref: 2e8a3a219b0c886ccbda94f4cf97e43d23be5052
parent: 84f938ecbee168b54c8d118cec1680ba37b3de91
author: Martin Storsjö <[email protected]>
date: Tue May 14 06:23:00 EDT 2019
arm64: msac: Add handwritten versions of msac_decode_bool functions

GCC                           Cortex A53    A72    A73
msac_decode_bool_c:                 29.9   17.9   23.2
msac_decode_bool_neon:              27.4   15.3   20.4
msac_decode_bool_adapt_c:           49.2   26.8   31.0
msac_decode_bool_adapt_neon:        38.2   22.2   25.4
msac_decode_bool_equi_c:            26.6   16.8   19.4
msac_decode_bool_equi_neon:         23.9   13.7   15.7

Clang                         Cortex A53    A72    A73
msac_decode_bool_c:                 28.0   16.4   23.1
msac_decode_bool_neon:              26.9   14.6   21.0
msac_decode_bool_adapt_c:           46.8   25.1   31.4
msac_decode_bool_adapt_neon:        36.2   19.0   26.2
msac_decode_bool_equi_c:            23.7   13.4   18.8
msac_decode_bool_equi_neon:         23.7   11.3   14.2

This is as fast as, or faster than, what either GCC or Clang produces.
--- a/src/arm/64/msac.S
+++ b/src/arm/64/msac.S
@@ -215,6 +215,7 @@
eor w5, w5, #16 // d = clz(rng) ^ 16
mvn x7, x7 // ~dif
add x7, x7, x3, lsl #48 // ~dif + (v << 48)
+L(renorm2):
lsl w4, w4, w5 // rng << d
subs w6, w6, w5 // cnt -= d
lsl x7, x7, x5 // (~dif + (v << 48)) << d
@@ -277,4 +278,87 @@
function msac_decode_symbol_adapt16_neon, export=1
decode_update .8h, .16b, 16 // expands the decode_update macro (defined outside this hunk) with .8h/.16b arrangements and 16; presumably the 16-symbol variant
b L(renorm) // continue in the shared L(renorm) code instead of returning here
+endfunc
+
+function msac_decode_bool_equi_neon, export=1 // x0 = MsacContext *s; decode one bool with the split point v = ((r >> 8) << 7) + 4
+ ldp w5, w6, [x0, #RNG] // w5 = rng (r), w6 = cnt (RNG + CNT)
+ sub sp, sp, #48 // reserve 48 bytes of stack; presumably released by the shared renorm tail (not visible in this hunk)
+ ldr x7, [x0, #DIF] // x7 = dif
+ bic w4, w5, #0xff // w4 = r & ~0xff (r & 0xff00 for a 16-bit r); w5 keeps r
+ add w4, w4, #8 // w4 = 2*v, so that the halving below yields v
+ subs x8, x7, x4, lsl #47 // dif - vw, with vw = 2*v << 47 = v << 48; sets the flags used below
+ lsr w4, w4, #1 // v
+ sub w5, w5, w4 // r - v
+ cset w15, lo // w15 = (dif < vw), i.e. !ret
+ csel w4, w5, w4, hs // if (ret) v = r - v; (ret = dif >= vw)
+ csel x7, x8, x7, hs // if (ret) dif = dif - vw;
+
+ clz w5, w4 // clz(rng)
+ mvn x7, x7 // ~dif
+ eor w5, w5, #16 // d = clz(rng) ^ 16
+ b L(renorm2) // join the shared renorm code with w4 = rng, w5 = d, w6 = cnt, x7 = ~dif
+endfunc
+
+function msac_decode_bool_neon, export=1 // x0 = MsacContext *s, w1 = f; decode one bool with the split point v scaled by f
+ ldp w5, w6, [x0, #RNG] // w5 = rng (r), w6 = cnt (RNG + CNT)
+ sub sp, sp, #48 // reserve 48 bytes of stack; presumably released by the shared renorm tail (not visible in this hunk)
+ ldr x7, [x0, #DIF] // x7 = dif
+ lsr w4, w5, #8 // r >> 8
+ bic w1, w1, #0x3f // f &= ~63
+ mul w4, w4, w1 // (r >> 8) * f
+ lsr w4, w4, #7 // ((r >> 8) * f) >> 7
+ add w4, w4, #4 // v
+ subs x8, x7, x4, lsl #48 // dif - vw, with vw = v << 48; sets the flags used below
+ sub w5, w5, w4 // r - v
+ cset w15, lo // w15 = (dif < vw), i.e. !ret
+ csel w4, w5, w4, hs // if (ret) v = r - v; (ret = dif >= vw)
+ csel x7, x8, x7, hs // if (ret) dif = dif - vw;
+
+ clz w5, w4 // clz(rng)
+ mvn x7, x7 // ~dif
+ eor w5, w5, #16 // d = clz(rng) ^ 16
+ b L(renorm2) // join the shared renorm code with w4 = rng, w5 = d, w6 = cnt, x7 = ~dif
+endfunc
+
+function msac_decode_bool_adapt_neon, export=1 // x0 = MsacContext *s, x1 = uint16_t *cdf; decode one bool from cdf[0], then optionally adapt cdf[0]/cdf[1]
+ ldr w9, [x1] // cdf[0-1]: cdf[0] in bits 0-15, cdf[1] in bits 16-31
+ ldp w5, w6, [x0, #RNG] // w5 = rng (r), w6 = cnt (RNG + CNT)
+ sub sp, sp, #48 // reserve 48 bytes of stack; presumably released by the shared renorm tail (not visible in this hunk)
+ ldr x7, [x0, #DIF] // x7 = dif
+ lsr w4, w5, #8 // r >> 8
+ and w2, w9, #0xffc0 // f = cdf[0] & ~63 (the mask also drops cdf[1] from the high half)
+ mul w4, w4, w2 // (r >> 8) * f
+ lsr w4, w4, #7 // ((r >> 8) * f) >> 7
+ add w4, w4, #4 // v
+ subs x8, x7, x4, lsl #48 // dif - vw, with vw = v << 48; sets the flags used below
+ sub w5, w5, w4 // r - v
+ cset w15, lo // w15 = (dif < vw), i.e. !ret; used as "bit" in the CDF update below
+ csel w4, w5, w4, hs // if (ret) v = r - v; (ret = dif >= vw)
+ csel x7, x8, x7, hs // if (ret) dif = dif - vw;
+
+ ldr w10, [x0, #ALLOW_UPDATE_CDF] // w10 = the context's ALLOW_UPDATE_CDF field
+
+ clz w5, w4 // clz(rng)
+ mvn x7, x7 // ~dif
+ eor w5, w5, #16 // d = clz(rng) ^ 16
+
+ cbz w10, L(renorm2) // CDF updates disabled: skip the adaptation and renormalise directly
+
+ lsr w2, w9, #16 // count = cdf[1]
+ and w9, w9, #0xffff // cdf[0]
+
+ sub w3, w2, w2, lsr #5 // count - (count >= 32)
+ lsr w2, w2, #4 // count >> 4
+ add w10, w3, #1 // count + (count < 32)
+ add w2, w2, #4 // rate = (count >> 4) | 4
+
+ sub w9, w9, w15 // cdf[0] -= bit
+ sub w11, w9, w15, lsl #15 // {cdf[0], cdf[0] - 32769}
+ asr w11, w11, w2 // {cdf[0], cdf[0] - 32769} >> rate
+ sub w9, w9, w11 // cdf[0]
+
+ strh w9, [x1] // store the adapted cdf[0]
+ strh w10, [x1, #2] // store the new count into cdf[1]
+
+ b L(renorm2) // join the shared renorm code with w4 = rng, w5 = d, w6 = cnt, x7 = ~dif
endfunc
--- a/src/msac.h
+++ b/src/msac.h
@@ -61,12 +61,15 @@
size_t n_symbols);
unsigned dav1d_msac_decode_symbol_adapt16_neon(MsacContext *s, uint16_t *cdf,
size_t n_symbols);
+unsigned dav1d_msac_decode_bool_adapt_neon(MsacContext *s, uint16_t *cdf); // asm (src/arm/64/msac.S): decodes one bool from cdf[0]; rewrites cdf[0]/cdf[1] when the context's ALLOW_UPDATE_CDF field is non-zero
+unsigned dav1d_msac_decode_bool_equi_neon(MsacContext *s); // asm (src/arm/64/msac.S): decodes one bool with the split point derived from rng only
+unsigned dav1d_msac_decode_bool_neon(MsacContext *s, unsigned f); // asm (src/arm/64/msac.S): decodes one bool; the low 6 bits of f are masked off
#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_neon
#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_neon
#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_neon
-#define dav1d_msac_decode_bool_adapt dav1d_msac_decode_bool_adapt_c
-#define dav1d_msac_decode_bool_equi dav1d_msac_decode_bool_equi_c
-#define dav1d_msac_decode_bool dav1d_msac_decode_bool_c
+#define dav1d_msac_decode_bool_adapt dav1d_msac_decode_bool_adapt_neon
+#define dav1d_msac_decode_bool_equi dav1d_msac_decode_bool_equi_neon
+#define dav1d_msac_decode_bool dav1d_msac_decode_bool_neon
#elif ARCH_X86_64 && HAVE_ASM
unsigned dav1d_msac_decode_symbol_adapt4_sse2(MsacContext *s, uint16_t *cdf,
size_t n_symbols);
--- a/tests/checkasm/msac.c
+++ b/tests/checkasm/msac.c
@@ -171,6 +171,9 @@
c.symbol_adapt4 = dav1d_msac_decode_symbol_adapt4_neon;
c.symbol_adapt8 = dav1d_msac_decode_symbol_adapt8_neon;
c.symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_neon;
+ c.bool_adapt = dav1d_msac_decode_bool_adapt_neon;
+ c.bool_equi = dav1d_msac_decode_bool_equi_neon;
+ c.bool = dav1d_msac_decode_bool_neon;
}
#elif ARCH_X86_64 && HAVE_ASM
if (dav1d_get_cpu_flags() & DAV1D_X86_CPU_FLAG_SSE2) {