shithub: dav1d

Download patch

ref: bb505b3e7bbd23e67531ce82059845b2d83cdbe9
parent: 7e693a1d26779acf8e21378a898c25ab35f2bcac
author: Henrik Gramner <[email protected]>
date: Fri Oct 5 11:33:35 EDT 2018

Intra prediction DSP refactoring

Also remove some redundant clips in DC pred.

--- a/src/decode.c
+++ b/src/decode.c
@@ -2713,7 +2713,7 @@
     const int bd_idx = (f->seq_hdr.bpc - 8) >> 1;
     f->dsp = &c->dsp[bd_idx];
 
-    if (!f->dsp->ipred.intra_pred[TX_4X4][DC_PRED]) {
+    if (!f->dsp->ipred.intra_pred[DC_PRED]) {
         Dav1dDSPContext *const dsp = &c->dsp[bd_idx];
 
         switch (f->seq_hdr.bpc) {
--- a/src/ipred.c
+++ b/src/ipred.c
@@ -35,45 +35,25 @@
 #include "common/intops.h"
 
 #include "src/ipred.h"
+#include "src/tables.h"
 
-#define sz_grid(l_fn) \
-l_fn( 4,  4) \
-l_fn( 4,  8) \
-l_fn( 4, 16) \
-l_fn( 8,  4) \
-l_fn( 8,  8) \
-l_fn( 8, 16) \
-l_fn( 8, 32) \
-l_fn(16,  4) \
-l_fn(16,  8) \
-l_fn(16, 16) \
-l_fn(16, 32) \
-l_fn(16, 64) \
-l_fn(32,  8) \
-l_fn(32, 16) \
-l_fn(32, 32) \
-l_fn(32, 64) \
-l_fn(64, 16) \
-l_fn(64, 32) \
-l_fn(64, 64)
-
 static NOINLINE void
-splat_dc_c(pixel *dst, const ptrdiff_t stride,
-           const int w, const int h, const unsigned dc)
+splat_dc(pixel *dst, const ptrdiff_t stride,
+         const int width, const int height, const unsigned dc)
 {
     assert(dc <= (1 << BITDEPTH) - 1);
 #if BITDEPTH == 8
-    if (w > 4) {
+    if (width > 4) {
         const uint64_t dcN = dc * 0x0101010101010101ULL;
-        for (int y = 0; y < h; y++) {
-            for (int x = 0; x < w; x += sizeof(dcN))
+        for (int y = 0; y < height; y++) {
+            for (int x = 0; x < width; x += sizeof(dcN))
                 *((uint64_t *) &dst[x]) = dcN;
             dst += PXSTRIDE(stride);
         }
     } else {
         const unsigned dcN = dc * 0x01010101U;
-        for (int y = 0; y < h; y++) {
-            for (int x = 0; x < w; x += sizeof(dcN))
+        for (int y = 0; y < height; y++) {
+            for (int x = 0; x < width; x += sizeof(dcN))
                 *((unsigned *) &dst[x]) = dcN;
             dst += PXSTRIDE(stride);
         }
@@ -80,8 +60,8 @@
     }
 #else
     const uint64_t dcN = dc * 0x0001000100010001ULL;
-    for (int y = 0; y < h; y++) {
-        for (int x = 0; x < w; x += sizeof(dcN) >> 1)
+    for (int y = 0; y < height; y++) {
+        for (int x = 0; x < width; x += sizeof(dcN) >> 1)
             *((uint64_t *) &dst[x]) = dcN;
         dst += PXSTRIDE(stride);
     }
@@ -88,52 +68,28 @@
 #endif
 }
 
-#define dc_lfn(w, h, dir, dc_gen) \
-static void dc##dir##_##w##x##h##_c(pixel *dst, const ptrdiff_t stride, \
-                                    const pixel *const topleft, const int a) \
-{ \
-    dc_gen; \
-    splat_dc_c(dst, stride, w, h, dc); \
+static void ipred_dc_top_c(pixel *dst, const ptrdiff_t stride,
+                           const pixel *const topleft,
+                           const int width, const int height, const int a)
+{
+    unsigned dc = width >> 1;
+    for (int i = 0; i < width; i++)
+       dc += topleft[1 + i];
+
+    splat_dc(dst, stride, width, height, dc >> ctz(width));
 }
 
-#define dc1d_lfns(width, height, sh1, sh2) \
-dc_lfn(width, height, top, unsigned dc = width >> 1; \
-                           for (int i = 0; i < width; i++) \
-                               dc += topleft[1 + i]; \
-                           dc >>= sh1) \
-dc_lfn(width, height, left, unsigned dc = height >> 1; \
-                            for (int i = 0; i < height; i++) \
-                                dc += topleft[-(1 + i)]; \
-                            dc >>= sh2)
+static void ipred_dc_left_c(pixel *dst, const ptrdiff_t stride,
+                            const pixel *const topleft,
+                            const int width, const int height, const int a)
+{
+    unsigned dc = height >> 1;
+    for (int i = 0; i < height; i++)
+       dc += topleft[-(1 + i)];
 
-dc1d_lfns( 4,  4, 2, 2)
-dc1d_lfns( 4,  8, 2, 3)
-dc1d_lfns( 4, 16, 2, 4)
-dc1d_lfns( 8,  4, 3, 2)
-dc1d_lfns( 8,  8, 3, 3)
-dc1d_lfns( 8, 16, 3, 4)
-dc1d_lfns( 8, 32, 3, 5)
-dc1d_lfns(16,  4, 4, 2)
-dc1d_lfns(16,  8, 4, 3)
-dc1d_lfns(16, 16, 4, 4)
-dc1d_lfns(16, 32, 4, 5)
-dc1d_lfns(16, 64, 4, 6)
-dc1d_lfns(32,  8, 5, 3)
-dc1d_lfns(32, 16, 5, 4)
-dc1d_lfns(32, 32, 5, 5)
-dc1d_lfns(32, 64, 5, 6)
-dc1d_lfns(64, 16, 6, 4)
-dc1d_lfns(64, 32, 6, 5)
-dc1d_lfns(64, 64, 6, 6)
+    splat_dc(dst, stride, width, height, dc >> ctz(height));
+}
 
-#define dc2d_lfn(width, height, dc_gen) \
-dc_lfn(width, height,, unsigned dc = (width + height) >> 1; \
-                       for (int i = 0; i < width; i++) \
-                           dc += topleft[i + 1]; \
-                       for (int i = 0; i < height; i++) \
-                           dc += topleft[-(i + 1)]; \
-                       dc_gen)
-
 #if BITDEPTH == 8
 #define MULTIPLIER_1x2 0x5556
 #define MULTIPLIER_1x4 0x3334
@@ -144,38 +100,40 @@
 #define BASE_SHIFT 17
 #endif
 
-dc2d_lfn( 4,  4, dc >>= 3)
-dc2d_lfn( 4,  8, dc = iclip_pixel(MULTIPLIER_1x2 * (dc >> 2) >> BASE_SHIFT))
-dc2d_lfn( 4, 16, dc = iclip_pixel(MULTIPLIER_1x4 * (dc >> 2) >> BASE_SHIFT))
-dc2d_lfn( 8,  4, dc = iclip_pixel(MULTIPLIER_1x2 * (dc >> 2) >> BASE_SHIFT))
-dc2d_lfn( 8,  8, dc >>= 4)
-dc2d_lfn( 8, 16, dc = iclip_pixel(MULTIPLIER_1x2 * (dc >> 3) >> BASE_SHIFT))
-dc2d_lfn( 8, 32, dc = iclip_pixel(MULTIPLIER_1x4 * (dc >> 3) >> BASE_SHIFT))
-dc2d_lfn(16,  4, dc = iclip_pixel(MULTIPLIER_1x4 * (dc >> 2) >> BASE_SHIFT))
-dc2d_lfn(16,  8, dc = iclip_pixel(MULTIPLIER_1x2 * (dc >> 3) >> BASE_SHIFT))
-dc2d_lfn(16, 16, dc >>= 5)
-dc2d_lfn(16, 32, dc = iclip_pixel(MULTIPLIER_1x2 * (dc >> 4) >> BASE_SHIFT))
-dc2d_lfn(16, 64, dc = iclip_pixel(MULTIPLIER_1x4 * (dc >> 4) >> BASE_SHIFT))
-dc2d_lfn(32,  8, dc = iclip_pixel(MULTIPLIER_1x4 * (dc >> 3) >> BASE_SHIFT))
-dc2d_lfn(32, 16, dc = iclip_pixel(MULTIPLIER_1x2 * (dc >> 4) >> BASE_SHIFT))
-dc2d_lfn(32, 32, dc >>= 6)
-dc2d_lfn(32, 64, dc = iclip_pixel(MULTIPLIER_1x2 * (dc >> 5) >> BASE_SHIFT))
-dc2d_lfn(64, 16, dc = iclip_pixel(MULTIPLIER_1x4 * (dc >> 4) >> BASE_SHIFT))
-dc2d_lfn(64, 32, dc = iclip_pixel(MULTIPLIER_1x2 * (dc >> 5) >> BASE_SHIFT))
-dc2d_lfn(64, 64, dc >>= 7)
+static void ipred_dc_c(pixel *dst, const ptrdiff_t stride,
+                       const pixel *const topleft,
+                       const int width, const int height, const int a)
+{
+    unsigned dc = (width + height) >> 1;
+    for (int i = 0; i < width; i++)
+       dc += topleft[i + 1];
+    for (int i = 0; i < height; i++)
+       dc += topleft[-(i + 1)];
+    dc >>= ctz(width + height);
 
+    if (width != height) {
+        dc *= (width > height * 2 || height > width * 2) ? MULTIPLIER_1x4 :
+                                                           MULTIPLIER_1x2;
+        dc >>= BASE_SHIFT;
+    }
+
+    splat_dc(dst, stride, width, height, dc);
+}
+
 #undef MULTIPLIER_1x2
 #undef MULTIPLIER_1x4
 #undef BASE_SHIFT
 
-#define dc128_lfn(width, height) \
-dc_lfn(width, height, 128, const unsigned dc = (1 << BITDEPTH) >> 1)
+static void ipred_dc_128_c(pixel *dst, const ptrdiff_t stride,
+                           const pixel *const topleft,
+                           const int width, const int height, const int a)
+{
+    splat_dc(dst, stride, width, height, 1 << (BITDEPTH - 1));
+}
 
-sz_grid(dc128_lfn)
-
-static NOINLINE void
-v_c(pixel *dst, const ptrdiff_t stride,
-    const pixel *const topleft, const int width, const int height)
+static void ipred_v_c(pixel *dst, const ptrdiff_t stride,
+                      const pixel *const topleft,
+                      const int width, const int height, const int a)
 {
     for (int y = 0; y < height; y++) {
         pixel_copy(dst, topleft + 1, width);
@@ -183,18 +141,9 @@
     }
 }
 
-#define v_lfn(width, height) \
-static void v_##width##x##height##_##c(pixel *dst, const ptrdiff_t stride, \
-                                       const pixel *const topleft, const int a) \
-{ \
-    v_c(dst, stride, topleft, width, height); \
-}
-
-sz_grid(v_lfn)
-
-static NOINLINE void
-h_c(pixel *dst, const ptrdiff_t stride,
-    const pixel *const topleft, const int width, const int height)
+static void ipred_h_c(pixel *dst, const ptrdiff_t stride,
+                      const pixel *const topleft,
+                      const int width, const int height, const int a)
 {
     for (int y = 0; y < height; y++) {
         pixel_set(dst, topleft[-(1 + y)], width);
@@ -202,18 +151,9 @@
     }
 }
 
-#define h_lfn(width, height) \
-static void h_##width##x##height##_c(pixel *dst, const ptrdiff_t stride, \
-                                     const pixel *const topleft, const int a) \
-{ \
-    h_c(dst, stride, topleft, width, height); \
-}
-
-sz_grid(h_lfn)
-
-static NOINLINE void
-paeth_c(pixel *dst, const ptrdiff_t stride, const pixel *const tl_ptr,
-        const int width, const int height)
+static void ipred_paeth_c(pixel *dst, const ptrdiff_t stride,
+                          const pixel *const tl_ptr,
+                          const int width, const int height, const int a)
 {
     const int topleft = tl_ptr[0];
     for (int y = 0; y < height; y++) {
@@ -232,43 +172,12 @@
     }
 }
 
-#define paeth_lfn(width, height) \
-static void paeth_##width##x##height##_c(pixel *dst, const ptrdiff_t stride, \
-                                         const pixel *const topleft, \
-                                         const int a) \
-{ \
-    paeth_c(dst, stride, topleft, width, height); \
-}
-
-sz_grid(paeth_lfn)
-
-static const uint8_t sm_weight_arrays[] = {
-    // Unused, because we always offset by bs, which is at least 2.
-    0, 0,
-    // bs = 2
-    255, 128,
-    // bs = 4
-    255, 149, 85, 64,
-    // bs = 8
-    255, 197, 146, 105, 73, 50, 37, 32,
-    // bs = 16
-    255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
-    // bs = 32
-    255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
-    66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
-    // bs = 64
-    255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
-    150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73, 69,
-    65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16, 15,
-    13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4,
-};
-
-static NOINLINE void
-smooth_c(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
-         const int width, const int height)
+static void ipred_smooth_c(pixel *dst, const ptrdiff_t stride,
+                           const pixel *const topleft,
+                           const int width, const int height, const int a)
 {
-    const uint8_t *const weights_hor = &sm_weight_arrays[width];
-    const uint8_t *const weights_ver = &sm_weight_arrays[height];
+    const uint8_t *const weights_hor = &dav1d_sm_weights[width];
+    const uint8_t *const weights_ver = &dav1d_sm_weights[height];
     const int right = topleft[width], bottom = topleft[-height];
 
     for (int y = 0; y < height; y++) {
@@ -283,21 +192,11 @@
     }
 }
 
-#define smooth_lfn(width, height) \
-static void smooth_##width##x##height##_c(pixel *dst, const ptrdiff_t stride, \
-                                          const pixel *const topleft, \
-                                          const int a) \
-{ \
-    smooth_c(dst, stride, topleft, width, height); \
-}
-
-sz_grid(smooth_lfn)
-
-static NOINLINE void
-smooth_v_c(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
-           const int width, const int height)
+static void ipred_smooth_v_c(pixel *dst, const ptrdiff_t stride,
+                             const pixel *const topleft,
+                             const int width, const int height, const int a)
 {
-    const uint8_t *const weights_ver = &sm_weight_arrays[height];
+    const uint8_t *const weights_ver = &dav1d_sm_weights[height];
     const int bottom = topleft[-height];
 
     for (int y = 0; y < height; y++) {
@@ -310,21 +209,11 @@
     }
 }
 
-#define smooth_v_lfn(width, height) \
-static void smooth_v_##width##x##height##_c(pixel *dst, const ptrdiff_t stride, \
-                                            const pixel *const topleft, \
-                                            const int a) \
-{ \
-    smooth_v_c(dst, stride, topleft, width, height); \
-}
-
-sz_grid(smooth_v_lfn)
-
-static NOINLINE void
-smooth_h_c(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
-           const int width, const int height)
+static void ipred_smooth_h_c(pixel *dst, const ptrdiff_t stride,
+                             const pixel *const topleft,
+                             const int width, const int height, const int a)
 {
-    const uint8_t *const weights_hor = &sm_weight_arrays[width];
+    const uint8_t *const weights_hor = &dav1d_sm_weights[width];
     const int right = topleft[width];
 
     for (int y = 0; y < height; y++) {
@@ -337,50 +226,6 @@
     }
 }
 
-#define smooth_h_lfn(width, height) \
-static void smooth_h_##width##x##height##_c(pixel *dst, const ptrdiff_t stride, \
-                                            const pixel *const topleft, \
-                                            const int a) \
-{ \
-    smooth_h_c(dst, stride, topleft, width, height); \
-}
-
-sz_grid(smooth_h_lfn)
-
-static const int16_t dr_intra_derivative[90] = {
-  // More evenly spread out angles and limited to 10-bit
-  // Values that are 0 will never be used
-  //                    Approx angle
-  0,    0, 0,        //
-  1023, 0, 0,        // 3, ...
-  547,  0, 0,        // 6, ...
-  372,  0, 0, 0, 0,  // 9, ...
-  273,  0, 0,        // 14, ...
-  215,  0, 0,        // 17, ...
-  178,  0, 0,        // 20, ...
-  151,  0, 0,        // 23, ... (113 & 203 are base angles)
-  132,  0, 0,        // 26, ...
-  116,  0, 0,        // 29, ...
-  102,  0, 0, 0,     // 32, ...
-  90,   0, 0,        // 36, ...
-  80,   0, 0,        // 39, ...
-  71,   0, 0,        // 42, ...
-  64,   0, 0,        // 45, ... (45 & 135 are base angles)
-  57,   0, 0,        // 48, ...
-  51,   0, 0,        // 51, ...
-  45,   0, 0, 0,     // 54, ...
-  40,   0, 0,        // 58, ...
-  35,   0, 0,        // 61, ...
-  31,   0, 0,        // 64, ...
-  27,   0, 0,        // 67, ... (67 & 157 are base angles)
-  23,   0, 0,        // 70, ...
-  19,   0, 0,        // 73, ...
-  15,   0, 0, 0, 0,  // 76, ...
-  11,   0, 0,        // 81, ...
-  7,    0, 0,        // 84, ...
-  3,    0, 0,        // 87, ...
-};
-
 static int get_filter_strength(const unsigned blk_wh, const unsigned d,
                                const int type)
 {
@@ -421,11 +266,10 @@
     return strength;
 }
 
-static void filter_edge(pixel *const out, const int sz,
-                        const pixel *const in, const int from, const int to,
-                        const unsigned strength)
+static void filter_edge(pixel *const out, const int sz, const pixel *const in,
+                        const int from, const int to, const unsigned strength)
 {
-    const uint8_t kernel[3][5] = {
+    static const uint8_t kernel[3][5] = {
         { 0, 4, 8, 4, 0 },
         { 0, 5, 6, 5, 0 },
         { 2, 4, 4, 4, 2 }
@@ -448,7 +292,7 @@
 static void upsample_edge(pixel *const out, const int hsz,
                           const pixel *const in, const int from, const int to)
 {
-    const int8_t kernel[4] = { -1, 9, 9, -1 };
+    static const int8_t kernel[4] = { -1, 9, 9, -1 };
     int i;
     for (i = 0; i < hsz - 1; i++) {
         out[i * 2] = in[iclip(i, from, to - 1)];
@@ -461,14 +305,14 @@
     out[i * 2] = in[iclip(i, from, to - 1)];
 }
 
-static NOINLINE void
-z1_c(pixel *dst, const ptrdiff_t stride, const pixel *const topleft_in,
-     int angle, const int width, const int height)
+static void ipred_z1_c(pixel *dst, const ptrdiff_t stride,
+                       const pixel *const topleft_in,
+                       const int width, const int height, int angle)
 {
     const int is_sm = angle >> 9;
     angle &= 511;
     assert(angle < 90);
-    const int dx = dr_intra_derivative[angle];
+    const int dx = dav1d_dr_intra_derivative[angle];
     pixel top_out[(64 + 64) * 2];
     const pixel *top;
     int max_base_x;
@@ -513,25 +357,15 @@
     }
 }
 
-#define z1_lfn(width, height) \
-static void z1_##width##x##height##_c(pixel *dst, const ptrdiff_t stride, \
-                                      const pixel *const topleft, \
-                                      const int angle) \
-{ \
-    z1_c(dst, stride, topleft, angle, width, height); \
-}
-
-sz_grid(z1_lfn)
-
-static NOINLINE void
-z2_c(pixel *dst, const ptrdiff_t stride, const pixel *const topleft_in,
-     int angle, const int width, const int height)
+static void ipred_z2_c(pixel *dst, const ptrdiff_t stride,
+                       const pixel *const topleft_in,
+                       const int width, const int height, int angle)
 {
     const int is_sm = angle >> 9;
     angle &= 511;
     assert(angle > 90 && angle < 180);
-    const int dy = dr_intra_derivative[angle - 90];
-    const int dx = dr_intra_derivative[180 - angle];
+    const int dy = dav1d_dr_intra_derivative[angle - 90];
+    const int dx = dav1d_dr_intra_derivative[180 - angle];
     const int upsample_left = get_upsample(width + height, 180 - angle, is_sm);
     const int upsample_above = get_upsample(width + height, angle - 90, is_sm);
     pixel edge[64 * 2 + 64 * 2 + 1];
@@ -594,24 +428,14 @@
     }
 }
 
-#define z2_lfn(width, height) \
-static void z2_##width##x##height##_c(pixel *dst, const ptrdiff_t stride, \
-                                      const pixel *const topleft, \
-                                      const int angle) \
-{ \
-    z2_c(dst, stride, topleft, angle, width, height); \
-}
-
-sz_grid(z2_lfn)
-
-static NOINLINE void
-z3_c(pixel *dst, const ptrdiff_t stride, const pixel *const topleft_in,
-     int angle, const int width, const int height)
+static void ipred_z3_c(pixel *dst, const ptrdiff_t stride,
+                       const pixel *const topleft_in,
+                       const int width, const int height, int angle)
 {
     const int is_sm = angle >> 9;
     angle &= 511;
     assert(angle > 180);
-    const int dy = dr_intra_derivative[270 - angle];
+    const int dy = dav1d_dr_intra_derivative[270 - angle];
     pixel left_out[(64 + 64) * 2];
     const pixel *left;
     int max_base_y;
@@ -659,74 +483,15 @@
     }
 }
 
-#define z3_lfn(width, height) \
-static void z3_##width##x##height##_c(pixel *dst, const ptrdiff_t stride, \
-                                      const pixel *const topleft, \
-                                      const int angle) \
-{ \
-    z3_c(dst, stride, topleft, angle, width, height); \
-}
-
-sz_grid(z3_lfn)
-
-static const int8_t av1_filter_intra_taps[5][8][8] = {
-    {
-        { -6, 10,  0,  0,  0, 12,  0, 0 },
-        { -5,  2, 10,  0,  0,  9,  0, 0 },
-        { -3,  1,  1, 10,  0,  7,  0, 0 },
-        { -3,  1,  1,  2, 10,  5,  0, 0 },
-        { -4,  6,  0,  0,  0,  2, 12, 0 },
-        { -3,  2,  6,  0,  0,  2,  9, 0 },
-        { -3,  2,  2,  6,  0,  2,  7, 0 },
-        { -3,  1,  2,  2,  6,  3,  5, 0 },
-    }, {
-        { -10, 16,  0,  0,  0, 10,  0, 0 },
-        {  -6,  0, 16,  0,  0,  6,  0, 0 },
-        {  -4,  0,  0, 16,  0,  4,  0, 0 },
-        {  -2,  0,  0,  0, 16,  2,  0, 0 },
-        { -10, 16,  0,  0,  0,  0, 10, 0 },
-        {  -6,  0, 16,  0,  0,  0,  6, 0 },
-        {  -4,  0,  0, 16,  0,  0,  4, 0 },
-        {  -2,  0,  0,  0, 16,  0,  2, 0 },
-    }, {
-        { -8, 8, 0, 0, 0, 16,  0, 0 },
-        { -8, 0, 8, 0, 0, 16,  0, 0 },
-        { -8, 0, 0, 8, 0, 16,  0, 0 },
-        { -8, 0, 0, 0, 8, 16,  0, 0 },
-        { -4, 4, 0, 0, 0,  0, 16, 0 },
-        { -4, 0, 4, 0, 0,  0, 16, 0 },
-        { -4, 0, 0, 4, 0,  0, 16, 0 },
-        { -4, 0, 0, 0, 4,  0, 16, 0 },
-    }, {
-        { -2, 8, 0, 0, 0, 10,  0, 0 },
-        { -1, 3, 8, 0, 0,  6,  0, 0 },
-        { -1, 2, 3, 8, 0,  4,  0, 0 },
-        {  0, 1, 2, 3, 8,  2,  0, 0 },
-        { -1, 4, 0, 0, 0,  3, 10, 0 },
-        { -1, 3, 4, 0, 0,  4,  6, 0 },
-        { -1, 2, 3, 4, 0,  4,  4, 0 },
-        { -1, 2, 2, 3, 4,  3,  3, 0 },
-    }, {
-        { -12, 14,  0,  0,  0, 14,  0, 0 },
-        { -10,  0, 14,  0,  0, 12,  0, 0 },
-        {  -9,  0,  0, 14,  0, 11,  0, 0 },
-        {  -8,  0,  0,  0, 14, 10,  0, 0 },
-        { -10, 12,  0,  0,  0,  0, 14, 0 },
-        {  -9,  1, 12,  0,  0,  0, 12, 0 },
-        {  -8,  0,  0, 12,  0,  1, 11, 0 },
-        {  -7,  0,  0,  1, 12,  1,  9, 0 },
-    },
-};
-
-static NOINLINE void
-filter_intra_c(pixel *dst, const ptrdiff_t stride,
-               const pixel *const topleft_in,
-               int filt_idx, const int width, const int height)
+/* Up to 32x32 only */
+static void ipred_filter_c(pixel *dst, const ptrdiff_t stride,
+                           const pixel *const topleft_in,
+                           const int width, const int height, int filt_idx)
 {
     filt_idx &= 511;
     assert(filt_idx < 5);
 
-    const int8_t (*const filter)[8] = av1_filter_intra_taps[filt_idx];
+    const int8_t (*const filter)[8] = dav1d_filter_intra_taps[filt_idx];
     int x, y;
     ptrdiff_t left_stride;
     const pixel *left, *topleft, *top;
@@ -764,30 +529,6 @@
     }
 }
 
-#define filter_lfn(width, height) \
-static void filter_##width##x##height##_c(pixel *const dst, \
-                                          const ptrdiff_t stride, \
-                                          const pixel *const topleft, \
-                                          const int filt_idx) \
-{ \
-    filter_intra_c(dst, stride, topleft, filt_idx, width, height); \
-}
-
-filter_lfn( 4,  4)
-filter_lfn( 8,  4)
-filter_lfn(16,  4)
-filter_lfn( 4,  8)
-filter_lfn( 8,  8)
-filter_lfn(16,  8)
-filter_lfn(32,  8)
-filter_lfn( 4, 16)
-filter_lfn( 8, 16)
-filter_lfn(16, 16)
-filter_lfn(32, 16)
-filter_lfn( 8, 32)
-filter_lfn(16, 32)
-filter_lfn(32, 32)
-
 static NOINLINE void
 cfl_ac_c(int16_t *ac, const pixel *ypx, const ptrdiff_t stride,
          const int w_pad, const int h_pad, const int width, const int height,
@@ -956,57 +697,20 @@
 }
 
 void bitfn(dav1d_intra_pred_dsp_init)(Dav1dIntraPredDSPContext *const c) {
-#define assign_lfn(w, h, p1, p2, pfx) \
-    c->intra_pred[pfx##TX_##w##X##h][p1##_PRED] = p2##_##w##x##h##_c
-#define assign_fns(p1, p2) \
-    assign_lfn( 4,  4, p1, p2,); \
-    assign_lfn( 4,  8, p1, p2, R); \
-    assign_lfn( 4, 16, p1, p2, R); \
-    assign_lfn( 8,  4, p1, p2, R); \
-    assign_lfn( 8,  8, p1, p2,); \
-    assign_lfn( 8, 16, p1, p2, R); \
-    assign_lfn( 8, 32, p1, p2, R); \
-    assign_lfn(16,  4, p1, p2, R); \
-    assign_lfn(16,  8, p1, p2, R); \
-    assign_lfn(16, 16, p1, p2,); \
-    assign_lfn(16, 32, p1, p2, R); \
-    assign_lfn(16, 64, p1, p2, R); \
-    assign_lfn(32,  8, p1, p2, R); \
-    assign_lfn(32, 16, p1, p2, R); \
-    assign_lfn(32, 32, p1, p2,); \
-    assign_lfn(32, 64, p1, p2, R); \
-    assign_lfn(64, 16, p1, p2, R); \
-    assign_lfn(64, 32, p1, p2, R); \
-    assign_lfn(64, 64, p1, p2,); \
-
-    assign_fns(DC, dc);
-    assign_fns(DC_128, dc128);
-    assign_fns(TOP_DC,  dctop);
-    assign_fns(LEFT_DC, dcleft);
-    assign_fns(HOR, h);
-    assign_fns(VERT, v);
-    assign_fns(PAETH, paeth);
-    assign_fns(SMOOTH, smooth);
-    assign_fns(SMOOTH_V, smooth_v);
-    assign_fns(SMOOTH_H, smooth_h);
-    assign_fns(Z1, z1);
-    assign_fns(Z2, z2);
-    assign_fns(Z3, z3);
-
-    assign_lfn( 4,  4, FILTER, filter,);
-    assign_lfn( 8,  4, FILTER, filter, R);
-    assign_lfn(16,  4, FILTER, filter, R);
-    assign_lfn( 4,  8, FILTER, filter, R);
-    assign_lfn( 8,  8, FILTER, filter,);
-    assign_lfn(16,  8, FILTER, filter, R);
-    assign_lfn(32,  8, FILTER, filter, R);
-    assign_lfn( 4, 16, FILTER, filter, R);
-    assign_lfn( 8, 16, FILTER, filter, R);
-    assign_lfn(16, 16, FILTER, filter,);
-    assign_lfn(32, 16, FILTER, filter, R);
-    assign_lfn( 8, 32, FILTER, filter, R);
-    assign_lfn(16, 32, FILTER, filter, R);
-    assign_lfn(32, 32, FILTER, filter,);
+    c->intra_pred[DC_PRED      ] = ipred_dc_c;
+    c->intra_pred[DC_128_PRED  ] = ipred_dc_128_c;
+    c->intra_pred[TOP_DC_PRED  ] = ipred_dc_top_c;
+    c->intra_pred[LEFT_DC_PRED ] = ipred_dc_left_c;
+    c->intra_pred[HOR_PRED     ] = ipred_h_c;
+    c->intra_pred[VERT_PRED    ] = ipred_v_c;
+    c->intra_pred[PAETH_PRED   ] = ipred_paeth_c;
+    c->intra_pred[SMOOTH_PRED  ] = ipred_smooth_c;
+    c->intra_pred[SMOOTH_V_PRED] = ipred_smooth_v_c;
+    c->intra_pred[SMOOTH_H_PRED] = ipred_smooth_h_c;
+    c->intra_pred[Z1_PRED      ] = ipred_z1_c;
+    c->intra_pred[Z2_PRED      ] = ipred_z2_c;
+    c->intra_pred[Z3_PRED      ] = ipred_z3_c;
+    c->intra_pred[FILTER_PRED  ] = ipred_filter_c;
 
     // cfl functions are split per chroma subsampling type
     c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][ TX_4X4  ] = cfl_ac_8x8_to_4x4_c;
--- a/src/ipred.h
+++ b/src/ipred.h
@@ -40,7 +40,8 @@
  *   see ipred_prepare.h for more detailed documentation.
  */
 #define decl_angular_ipred_fn(name) \
-void (name)(pixel *dst, ptrdiff_t stride, const pixel *topleft, int angle)
+void (name)(pixel *dst, ptrdiff_t stride, const pixel *topleft, \
+            int width, int height, int angle)
 typedef decl_angular_ipred_fn(*angular_ipred_fn);
 
 /*
@@ -84,7 +85,7 @@
 typedef decl_pal_pred_fn(*pal_pred_fn);
 
 typedef struct Dav1dIntraPredDSPContext {
-    angular_ipred_fn intra_pred[N_RECT_TX_SIZES][N_IMPL_INTRA_PRED_MODES];
+    angular_ipred_fn intra_pred[N_IMPL_INTRA_PRED_MODES];
 
     // chroma-from-luma
     cfl_ac_fn cfl_ac[3 /* 420, 422, 444 */][N_RECT_TX_SIZES /* chroma tx size */];
--- a/src/recon.c
+++ b/src/recon.c
@@ -767,8 +767,9 @@
                                                           f->cur.p.stride[0], top_sb_edge,
                                                           b->y_mode, &angle,
                                                           t_dim->w, t_dim->h, edge);
-                    dsp->ipred.intra_pred[b->tx][m](dst, f->cur.p.stride[0],
-                                                    edge, angle | sm_fl);
+                    dsp->ipred.intra_pred[m](dst, f->cur.p.stride[0], edge,
+                                             t_dim->w * 4, t_dim->h * 4,
+                                             angle | sm_fl);
 
                     if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
                         hex_dump(edge - t_dim->h * 4, t_dim->h * 4,
@@ -869,7 +870,9 @@
                                                           top_sb_edge, DC_PRED, &angle,
                                                           cfl_uv_t_dim->w,
                                                           cfl_uv_t_dim->h, edge);
-                    dsp->ipred.intra_pred[cfl_uvtx][m](uv_dst[pl], stride, edge, 0);
+                    dsp->ipred.intra_pred[m](uv_dst[pl], stride, edge,
+                                             cfl_uv_t_dim->w * 4,
+                                             cfl_uv_t_dim->h * 4, 0);
                 }
                 const int furthest_r =
                     ((cw4 << ss_hor) + t_dim->w - 1) & ~(t_dim->w - 1);
@@ -981,8 +984,10 @@
                                                               top_sb_edge, b->uv_mode,
                                                               &angle, uv_t_dim->w,
                                                               uv_t_dim->h, edge);
-                        dsp->ipred.intra_pred[b->uvtx][m](dst, stride,
-                                                          edge, angle | sm_uv_fl);
+                        dsp->ipred.intra_pred[m](dst, stride, edge,
+                                                 uv_t_dim->w * 4,
+                                                 uv_t_dim->h * 4,
+                                                 angle | sm_uv_fl);
                         if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
                             hex_dump(edge - uv_t_dim->h * 4, uv_t_dim->h * 4,
                                      uv_t_dim->h * 4, 2, "l");
@@ -1100,7 +1105,6 @@
                 obmc(t, dst, f->cur.p.stride[0], b_dim, 0, bx4, by4, w4, h4);
         }
         if (b->interintra_type) {
-            const enum RectTxfmSize ii_tx = dav1d_max_txfm_size_for_bs[bs][0];
             pixel tl_edge_px[65], *const tl_edge = &tl_edge_px[32];
             enum IntraPredMode m = b->interintra_mode == II_SMOOTH_PRED ?
                                    SMOOTH_PRED : b->interintra_mode;
@@ -1117,7 +1121,8 @@
                                                   ts->tiling.col_end, ts->tiling.row_end,
                                                   0, dst, f->cur.p.stride[0], top_sb_edge,
                                                   m, &angle, bw4, bh4, tl_edge);
-            dsp->ipred.intra_pred[ii_tx][m](tmp, 4 * bw4 * sizeof(pixel), tl_edge, 0);
+            dsp->ipred.intra_pred[m](tmp, 4 * bw4 * sizeof(pixel),
+                                     tl_edge, bw4 * 4, bh4 * 4, 0);
             const uint8_t *const ii_mask =
                 b->interintra_type == INTER_INTRA_BLEND ?
                      dav1d_ii_masks[bs][0][b->interintra_mode] :
@@ -1210,8 +1215,6 @@
                 // FIXME for 8x32 with 4:2:2 subsampling, this probably does
                 // the wrong thing since it will select 4x16, not 4x32, as a
                 // transform size...
-                const enum RectTxfmSize ii_tx =
-                    dav1d_max_txfm_size_for_bs[bs][f->cur.p.p.layout];
                 const uint8_t *const ii_mask =
                     b->interintra_type == INTER_INTRA_BLEND ?
                          dav1d_ii_masks[bs][chr_layout_idx][b->interintra_mode] :
@@ -1242,7 +1245,8 @@
                                                           0, uvdst, f->cur.p.stride[1],
                                                           top_sb_edge, m,
                                                           &angle, cbw4, cbh4, tl_edge);
-                    dsp->ipred.intra_pred[ii_tx][m](tmp, cbw4 * 4 * sizeof(pixel), tl_edge, 0);
+                    dsp->ipred.intra_pred[m](tmp, cbw4 * 4 * sizeof(pixel),
+                                             tl_edge, cbw4 * 4, cbh4 * 4, 0);
                     dsp->mc.blend(uvdst, f->cur.p.stride[1], tmp, cbw4 * 4 * sizeof(pixel),
                                   cbw4 * 4, cbh4 * 4, ii_mask, cbw4 * 4);
                 }
--- a/src/tables.c
+++ b/src/tables.c
@@ -719,3 +719,113 @@
     // dummy (replicate row index 191)
     { 0, 0, 0,   0,   2, 127, - 1, 0 },
 };
+
+const uint8_t dav1d_sm_weights[128] = {
+    // Unused, because we always offset by bs, which is at least 2.
+      0,   0,
+    // bs = 2
+    255, 128,
+    // bs = 4
+    255, 149,  85,  64,
+    // bs = 8
+    255, 197, 146, 105,  73,  50,  37,  32,
+    // bs = 16
+    255, 225, 196, 170, 145, 123, 102,  84,
+     68,  54,  43,  33,  26,  20,  17,  16,
+    // bs = 32
+    255, 240, 225, 210, 196, 182, 169, 157,
+    145, 133, 122, 111, 101,  92,  83,  74,
+     66,  59,  52,  45,  39,  34,  29,  25,
+     21,  17,  14,  12,  10,   9,   8,   8,
+    // bs = 64
+    255, 248, 240, 233, 225, 218, 210, 203,
+    196, 189, 182, 176, 169, 163, 156, 150,
+    144, 138, 133, 127, 121, 116, 111, 106,
+    101,  96,  91,  86,  82,  77,  73,  69,
+     65,  61,  57,  54,  50,  47,  44,  41,
+     38,  35,  32,  29,  27,  25,  22,  20,
+     18,  16,  15,  13,  12,  10,   9,   8,
+      7,   6,   6,   5,   5,   4,   4,   4
+};
+
+const int16_t dav1d_dr_intra_derivative[90] = {
+    // More evenly spread out angles and limited to 10-bit
+    // Values that are 0 will never be used
+       0, 0, 0,       // Approx angle
+    1023, 0, 0,       // 3, ...
+     547, 0, 0,       // 6, ...
+     372, 0, 0, 0, 0, // 9, ...
+     273, 0, 0,       // 14, ...
+     215, 0, 0,       // 17, ...
+     178, 0, 0,       // 20, ...
+     151, 0, 0,       // 23, ... (113 & 203 are base angles)
+     132, 0, 0,       // 26, ...
+     116, 0, 0,       // 29, ...
+     102, 0, 0, 0,    // 32, ...
+      90, 0, 0,       // 36, ...
+      80, 0, 0,       // 39, ...
+      71, 0, 0,       // 42, ...
+      64, 0, 0,       // 45, ... (45 & 135 are base angles)
+      57, 0, 0,       // 48, ...
+      51, 0, 0,       // 51, ...
+      45, 0, 0, 0,    // 54, ...
+      40, 0, 0,       // 58, ...
+      35, 0, 0,       // 61, ...
+      31, 0, 0,       // 64, ...
+      27, 0, 0,       // 67, ... (67 & 157 are base angles)
+      23, 0, 0,       // 70, ...
+      19, 0, 0,       // 73, ...
+      15, 0, 0, 0, 0, // 76, ...
+      11, 0, 0,       // 81, ...
+       7, 0, 0,       // 84, ...
+       3, 0, 0,       // 87, ...
+};
+
+const int8_t dav1d_filter_intra_taps[5][8][8] = {
+    {
+        {  -6, 10,  0,  0,  0, 12,  0,  0 },
+        {  -5,  2, 10,  0,  0,  9,  0,  0 },
+        {  -3,  1,  1, 10,  0,  7,  0,  0 },
+        {  -3,  1,  1,  2, 10,  5,  0,  0 },
+        {  -4,  6,  0,  0,  0,  2, 12,  0 },
+        {  -3,  2,  6,  0,  0,  2,  9,  0 },
+        {  -3,  2,  2,  6,  0,  2,  7,  0 },
+        {  -3,  1,  2,  2,  6,  3,  5,  0 },
+    }, {
+        { -10, 16,  0,  0,  0, 10,  0,  0 },
+        {  -6,  0, 16,  0,  0,  6,  0,  0 },
+        {  -4,  0,  0, 16,  0,  4,  0,  0 },
+        {  -2,  0,  0,  0, 16,  2,  0,  0 },
+        { -10, 16,  0,  0,  0,  0, 10,  0 },
+        {  -6,  0, 16,  0,  0,  0,  6,  0 },
+        {  -4,  0,  0, 16,  0,  0,  4,  0 },
+        {  -2,  0,  0,  0, 16,  0,  2,  0 },
+    }, {
+        {  -8,  8,  0,  0,  0, 16,  0,  0 },
+        {  -8,  0,  8,  0,  0, 16,  0,  0 },
+        {  -8,  0,  0,  8,  0, 16,  0,  0 },
+        {  -8,  0,  0,  0,  8, 16,  0,  0 },
+        {  -4,  4,  0,  0,  0,  0, 16,  0 },
+        {  -4,  0,  4,  0,  0,  0, 16,  0 },
+        {  -4,  0,  0,  4,  0,  0, 16,  0 },
+        {  -4,  0,  0,  0,  4,  0, 16,  0 },
+    }, {
+        {  -2,  8,  0,  0,  0, 10,  0,  0 },
+        {  -1,  3,  8,  0,  0,  6,  0,  0 },
+        {  -1,  2,  3,  8,  0,  4,  0,  0 },
+        {   0,  1,  2,  3,  8,  2,  0,  0 },
+        {  -1,  4,  0,  0,  0,  3, 10,  0 },
+        {  -1,  3,  4,  0,  0,  4,  6,  0 },
+        {  -1,  2,  3,  4,  0,  4,  4,  0 },
+        {  -1,  2,  2,  3,  4,  3,  3,  0 },
+    }, {
+        { -12, 14,  0,  0,  0, 14,  0,  0 },
+        { -10,  0, 14,  0,  0, 12,  0,  0 },
+        {  -9,  0,  0, 14,  0, 11,  0,  0 },
+        {  -8,  0,  0,  0, 14, 10,  0,  0 },
+        { -10, 12,  0,  0,  0,  0, 14,  0 },
+        {  -9,  1, 12,  0,  0,  0, 12,  0 },
+        {  -8,  0,  0, 12,  0,  1, 11,  0 },
+        {  -7,  0,  0,  1, 12,  1,  9,  0 },
+    }
+};
--- a/src/tables.h
+++ b/src/tables.h
@@ -113,4 +113,8 @@
 extern const int8_t dav1d_mc_subpel_filters[5][15][8];
 extern const int8_t dav1d_mc_warp_filter[][8];
 
+extern const uint8_t dav1d_sm_weights[128];
+extern const int16_t dav1d_dr_intra_derivative[90];
+extern const int8_t dav1d_filter_intra_taps[5][8][8];
+
 #endif /* __DAV1D_SRC_TABLES_H__ */