shithub: dav1d

ref: a75ee78bd998f393b9deada2859cf5f8ebbafe75
parent: ea74e3d513206fcdda4316f3f1303df47b890d48
author: Victorien Le Couviour--Tuffet <[email protected]>
date: Mon Jun 15 09:46:55 EDT 2020

x86: Add put/prep_bilin_scaled AVX2 asm

Bilin scaled being very rarely used, add a new table entry to
mc_subpel_filters, and jump to the put/prep_8tap_scaled code.

AVX2 performance is obviously the same as for the 8tap code; the
speedup is much smaller though, as the C code is a true bilinear
codepath that gets auto-vectorized. Yet the AVX2 code is always faster.

--- a/src/tables.c
+++ b/src/tables.c
@@ -442,7 +442,7 @@
       0
 };
 
-const int8_t ALIGN(dav1d_mc_subpel_filters[5][15][8], 8) = {
+const int8_t ALIGN(dav1d_mc_subpel_filters[5+ARCH_X86_64][15][8], 8) = {
     [DAV1D_FILTER_8TAP_REGULAR] = {
         {   0,   1,  -3,  63,   4,  -1,   0,   0 },
         {   0,   1,  -5,  61,   9,  -2,   0,   0 },
@@ -524,6 +524,27 @@
         {   0,   0,   2,  20,  31,  11,   0,   0 },
         {   0,   0,   2,  18,  31,  13,   0,   0 },
         {   0,   0,   1,  17,  31,  15,   0,   0 }
+#if ARCH_X86_64
+    /* Bilin scaled being very rarely used, add a new table entry
+     * and use the put/prep_8tap_scaled code, thus acting as a
+     * scaled bilinear filter. */
+    }, [5] = {
+        {   0,   0,   0,  60,   4,   0,   0,   0 },
+        {   0,   0,   0,  56,   8,   0,   0,   0 },
+        {   0,   0,   0,  52,  12,   0,   0,   0 },
+        {   0,   0,   0,  48,  16,   0,   0,   0 },
+        {   0,   0,   0,  44,  20,   0,   0,   0 },
+        {   0,   0,   0,  40,  24,   0,   0,   0 },
+        {   0,   0,   0,  36,  28,   0,   0,   0 },
+        {   0,   0,   0,  32,  32,   0,   0,   0 },
+        {   0,   0,   0,  28,  36,   0,   0,   0 },
+        {   0,   0,   0,  24,  40,   0,   0,   0 },
+        {   0,   0,   0,  20,  44,   0,   0,   0 },
+        {   0,   0,   0,  16,  48,   0,   0,   0 },
+        {   0,   0,   0,  12,  52,   0,   0,   0 },
+        {   0,   0,   0,   8,  56,   0,   0,   0 },
+        {   0,   0,   0,   4,  60,   0,   0,   0 }
+#endif
     }
 };
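
As the in-table comment says, the extra row only exists on x86-64
builds (ARCH_X86_64 bumps the first dimension to 6). Like every other
subpel filter row shown above, the new rows sum to 64, which is what
lets the shared 7-bit scaled-MC normalization be reused unchanged. A
small standalone check of that invariant (not dav1d code):

/* Each bilinear row { 0, 0, 0, 64-4m, 4m, 0, 0, 0 } is 64-normalized,
 * matching the existing 7-bit subpel filters. */
#include <assert.h>
#include <stdio.h>

int main(void) {
    for (int m = 1; m < 16; m++) {
        const int row[8] = { 0, 0, 0, 64 - 4 * m, 4 * m, 0, 0, 0 };
        int sum = 0;
        for (int k = 0; k < 8; k++)
            sum += row[k];
        assert(sum == 64);
    }
    printf("all 15 bilinear rows sum to 64\n");
    return 0;
}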
 
--- a/src/tables.h
+++ b/src/tables.h
@@ -110,7 +110,7 @@
 extern const int16_t dav1d_sgr_params[16][4];
 extern const uint8_t dav1d_sgr_x_by_x[256];
 
-extern const int8_t dav1d_mc_subpel_filters[5][15][8];
+extern const int8_t dav1d_mc_subpel_filters[5+ARCH_X86_64][15][8];
 extern const int8_t dav1d_mc_warp_filter[193][8];
 extern const int8_t dav1d_resize_filter[64][8];
 
--- a/src/x86/mc.asm
+++ b/src/x86/mc.asm
@@ -5719,12 +5719,21 @@
 %undef isprep
 %endmacro
 
+%macro BILIN_SCALED_FN 1
+cglobal %1_bilin_scaled
+    mov                 t0d, (5*15 << 16) | 5*15
+    mov                 t1d, t0d
+    jmp mangle(private_prefix %+ _%1_8tap_scaled %+ SUFFIX)
+%endmacro
+%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
+%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
+
 %if WIN64
 DECLARE_REG_TMP 6, 5
 %else
 DECLARE_REG_TMP 6, 8
 %endif
-%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
+BILIN_SCALED_FN put
 PUT_8TAP_SCALED_FN regular,        REGULAR, REGULAR
 PUT_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP
 PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
@@ -5741,7 +5750,7 @@
 %else
 DECLARE_REG_TMP 6, 7
 %endif
-%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
+BILIN_SCALED_FN prep
 PREP_8TAP_SCALED_FN regular,        REGULAR, REGULAR
 PREP_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP
 PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
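
A note on the packed constant: 5*15 = 75 is the flat offset of the new
row [5] when dav1d_mc_subpel_filters is viewed as groups of 15 entries,
and the same value sits in both 16-bit halves, presumably selecting the
bilinear row for both the horizontal and the vertical filter before
jumping into the shared 8tap_scaled code (following the scheme of the
existing 8tap wrappers, which is an assumption here). A standalone
sketch of the arithmetic (not dav1d code):

/* Decode the packed selector loaded into t0d/t1d. Assumption: each
 * 16-bit half is an offset in 15-entry units into the subpel table. */
#include <stdio.h>

int main(void) {
    const unsigned packed = (5 * 15 << 16) | 5 * 15; /* value in t0d/t1d */
    const unsigned h = packed & 0xffff;              /* 75 */
    const unsigned v = packed >> 16;                 /* 75 */
    printf("packed=0x%08x  h_row=%u  v_row=%u\n", packed, h / 15, v / 15);
    return 0;
}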
--- a/src/x86/mc_init_tmpl.c
+++ b/src/x86/mc_init_tmpl.c
@@ -99,6 +99,7 @@
 decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_avx2);
 decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_regular_avx2);
 decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_smooth_avx2);
+decl_mc_scaled_fn(dav1d_put_bilin_scaled_avx2);
 
 decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_avx2);
 decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_smooth_avx2);
@@ -109,6 +110,7 @@
 decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_avx2);
 decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_regular_avx2);
 decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_smooth_avx2);
+decl_mct_scaled_fn(dav1d_prep_bilin_scaled_avx2);
 
 decl_avg_fn(dav1d_avg_avx512icl);
 decl_avg_fn(dav1d_avg_avx2);
@@ -264,6 +266,7 @@
     init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_scaled_sharp_regular,  avx2);
     init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_scaled_sharp_smooth,   avx2);
     init_mc_scaled_fn(FILTER_2D_8TAP_SHARP,          8tap_scaled_sharp,          avx2);
+    init_mc_scaled_fn(FILTER_2D_BILINEAR,            bilin_scaled,               avx2);
 
     init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR,        8tap_scaled_regular,        avx2);
     init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2);
@@ -274,6 +277,7 @@
     init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_scaled_sharp_regular,  avx2);
     init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_scaled_sharp_smooth,   avx2);
     init_mct_scaled_fn(FILTER_2D_8TAP_SHARP,          8tap_scaled_sharp,          avx2);
+    init_mct_scaled_fn(FILTER_2D_BILINEAR,            bilin_scaled,               avx2);
 
     c->avg = dav1d_avg_avx2;
     c->w_avg = dav1d_w_avg_avx2;