shithub: dav1d

Download patch

ref: 46e2a2d0cc451e1d6bb929f80088f8a7b8940dd0
parent: 367d785a4e70b3e43eee234b3c745b047e3fbd40
author: Marvin Scholz <[email protected]>
date: Thu Oct 25 12:45:12 EDT 2018

Build: Add suffix to templated BITDEPTH files

Fix #96

--- a/src/cdef.c
+++ /dev/null
@@ -1,298 +1,0 @@
-/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- *    list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "config.h"
-
-#include <assert.h>
-#include <stdlib.h>
-
-#include "common/intops.h"
-
-#include "src/cdef.h"
-
-static const int8_t cdef_directions4[8 /* dir */][2 /* pass */] = {
-    { -1 * 8 + 1, -2 * 8 + 2 },
-    {  0 * 8 + 1, -1 * 8 + 2 },
-    {  0 * 8 + 1,  0 * 8 + 2 },
-    {  0 * 8 + 1,  1 * 8 + 2 },
-    {  1 * 8 + 1,  2 * 8 + 2 },
-    {  1 * 8 + 0,  2 * 8 + 1 },
-    {  1 * 8 + 0,  2 * 8 + 0 },
-    {  1 * 8 + 0,  2 * 8 - 1 }
-};
-
-static const int8_t cdef_directions8[8 /* dir */][2 /* pass */] = {
-    { -1 * 16 + 1, -2 * 16 + 2 },
-    {  0 * 16 + 1, -1 * 16 + 2 },
-    {  0 * 16 + 1,  0 * 16 + 2 },
-    {  0 * 16 + 1,  1 * 16 + 2 },
-    {  1 * 16 + 1,  2 * 16 + 2 },
-    {  1 * 16 + 0,  2 * 16 + 1 },
-    {  1 * 16 + 0,  2 * 16 + 0 },
-    {  1 * 16 + 0,  2 * 16 - 1 }
-};
-static const uint8_t cdef_pri_taps[2][2] = { { 4, 2 }, { 3, 3 } };
-static const uint8_t cdef_sec_taps[2][2] = { { 2, 1 }, { 2, 1 } };
-
-static inline int constrain(const int diff, const int threshold,
-                            const int damping)
-{
-    if (!threshold) return 0;
-    const int shift = imax(0, damping - ulog2(threshold));
-    return apply_sign(imin(abs(diff), imax(0, threshold - (abs(diff) >> shift))),
-                      diff);
-}
-
-/*
- * <code partially copied from libaom>
- */
-
-#define CDEF_VERY_LARGE (30000)
-
-static void fill(uint16_t *tmp, const ptrdiff_t stride,
-                 const int w, const int h)
-{
-    for (int y = 0; y < h; y++) {
-        for (int x = 0; x < w; x++)
-            tmp[x] = CDEF_VERY_LARGE;
-        tmp += stride;
-    }
-}
-
-/* Smooth in the direction detected. */
-static void cdef_filter_block_c(pixel *const dst, const ptrdiff_t dst_stride,
-                                /*const*/ pixel *const top[2],
-                                const int w, const int h, const int pri_strength,
-                                const int sec_strength, const int dir,
-                                const int damping, const enum CdefEdgeFlags edges)
-{
-    const ptrdiff_t tmp_stride = 16 >> (w == 4);
-    assert((w == 4 || w == 8) && (h == 4 || h == 8));
-    uint16_t tmp[192];  // 16*12 is the maximum value of tmp_stride * (h + 4)
-    uint16_t *tmp2 = tmp + 2 * tmp_stride + 2;
-    const uint8_t *const pri_taps = cdef_pri_taps[(pri_strength >> (BITDEPTH - 8)) & 1];
-    const uint8_t *const sec_taps = cdef_sec_taps[(pri_strength >> (BITDEPTH - 8)) & 1];
-    const int8_t (*cdef_directions)[2];
-
-    assert(w == 4 || w == 8);
-    cdef_directions = w == 4 ? cdef_directions4 : cdef_directions8;
-
-    // fill extended input buffer
-    int x_start = -2, x_end = w + 2, y_start = -2, y_end = h + 2;
-    if (!(edges & HAVE_TOP)) {
-        fill(tmp, tmp_stride, w + 4, 2);
-        y_start = 0;
-    }
-    if (!(edges & HAVE_BOTTOM)) {
-        fill(tmp + (h + 2) * tmp_stride, tmp_stride, w + 4, 2);
-        y_end -= 2;
-    }
-    if (!(edges & HAVE_LEFT)) {
-        fill(tmp + (2 + y_start) * tmp_stride, tmp_stride, 2, y_end - y_start);
-        x_start = 0;
-    }
-    if (!(edges & HAVE_RIGHT)) {
-        fill(tmp + (2 + y_start) * tmp_stride + w + 2, tmp_stride,
-             2, y_end - y_start);
-        x_end -= 2;
-    }
-    for (int y = y_start; y < 0; y++)
-        for (int x = x_start; x < x_end; x++)
-            tmp2[y * tmp_stride + x] = top[y & 1][x];
-    for (int y = 0; y < y_end; y++)
-        for (int x = x_start; x < x_end; x++)
-            tmp2[y * tmp_stride + x] = dst[y * PXSTRIDE(dst_stride) + x];
-
-    // run actual filter
-    for (int y = 0; y < h; y++) {
-        for (int x = 0; x < w; x++) {
-            int sum = 0;
-            const int px = dst[y * PXSTRIDE(dst_stride) + x];
-            int max = px, min = px;
-            for (int k = 0; k < 2; k++) {
-                const int8_t off1 = cdef_directions[dir][k];
-                const int p0 = tmp2[y * tmp_stride + x + off1];
-                const int p1 = tmp2[y * tmp_stride + x - off1];
-                sum += pri_taps[k] * constrain(p0 - px, pri_strength, damping);
-                sum += pri_taps[k] * constrain(p1 - px, pri_strength, damping);
-                if (p0 != CDEF_VERY_LARGE) max = imax(p0, max);
-                if (p1 != CDEF_VERY_LARGE) max = imax(p1, max);
-                min = imin(p0, min);
-                min = imin(p1, min);
-                const int8_t off2 = cdef_directions[(dir + 2) & 7][k];
-                const int s0 = tmp2[y * tmp_stride + x + off2];
-                const int s1 = tmp2[y * tmp_stride + x - off2];
-                const int8_t off3 = cdef_directions[(dir + 6) & 7][k];
-                const int s2 = tmp2[y * tmp_stride + x + off3];
-                const int s3 = tmp2[y * tmp_stride + x - off3];
-                if (s0 != CDEF_VERY_LARGE) max = imax(s0, max);
-                if (s1 != CDEF_VERY_LARGE) max = imax(s1, max);
-                if (s2 != CDEF_VERY_LARGE) max = imax(s2, max);
-                if (s3 != CDEF_VERY_LARGE) max = imax(s3, max);
-                min = imin(s0, min);
-                min = imin(s1, min);
-                min = imin(s2, min);
-                min = imin(s3, min);
-                sum += sec_taps[k] * constrain(s0 - px, sec_strength, damping);
-                sum += sec_taps[k] * constrain(s1 - px, sec_strength, damping);
-                sum += sec_taps[k] * constrain(s2 - px, sec_strength, damping);
-                sum += sec_taps[k] * constrain(s3 - px, sec_strength, damping);
-            }
-            dst[y * PXSTRIDE(dst_stride) + x] =
-                iclip(px + ((8 + sum - (sum < 0)) >> 4), min, max);
-        }
-    }
-}
-
-/*
- * </code partially copied from libaom>
- */
-
-#define cdef_fn(w, h) \
-static void cdef_filter_block_##w##x##h##_c(pixel *const dst, \
-                                            const ptrdiff_t stride, \
-                                            /*const*/ pixel *const top[2], \
-                                            const int pri_strength, \
-                                            const int sec_strength, \
-                                            const int dir, \
-                                            const int damping, \
-                                            const enum CdefEdgeFlags edges) \
-{ \
-    cdef_filter_block_c(dst, stride, top, w, h, pri_strength, sec_strength, \
-                        dir, damping, edges); \
-}
-
-cdef_fn(4, 4);
-cdef_fn(4, 8);
-cdef_fn(8, 8);
-
-/*
- * <code copied from libaom>
- */
-
-/* Detect direction. 0 means 45-degree up-right, 2 is horizontal, and so on.
-   The search minimizes the weighted variance along all the lines in a
-   particular direction, i.e. the squared error between the input and a
-   "predicted" block where each pixel is replaced by the average along a line
-   in a particular direction. Since each direction have the same sum(x^2) term,
-   that term is never computed. See Section 2, step 2, of:
-   http://jmvalin.ca/notes/intra_paint.pdf */
-static const uint16_t div_table[] = {
-    0, 840, 420, 280, 210, 168, 140, 120, 105
-};
-static int cdef_find_dir_c(const pixel *img, const ptrdiff_t stride,
-                           unsigned *const var)
-{
-    int i;
-    int32_t cost[8] = { 0 };
-    int partial[8][15] = { { 0 } };
-    int32_t best_cost = 0;
-    int best_dir = 0;
-    /* Instead of dividing by n between 2 and 8, we multiply by 3*5*7*8/n.
-     The output is then 840 times larger, but we don't care for finding
-     the max. */
-    for (i = 0; i < 8; i++) {
-        int j;
-        for (j = 0; j < 8; j++) {
-            int x;
-            /* We subtract 128 here to reduce the maximum range of the squared
-             partial sums. */
-            x = (img[i * PXSTRIDE(stride) + j] >> (BITDEPTH - 8)) - 128;
-            partial[0][i + j] += x;
-            partial[1][i + j / 2] += x;
-            partial[2][i] += x;
-            partial[3][3 + i - j / 2] += x;
-            partial[4][7 + i - j] += x;
-            partial[5][3 - i / 2 + j] += x;
-            partial[6][j] += x;
-            partial[7][i / 2 + j] += x;
-        }
-    }
-    for (i = 0; i < 8; i++) {
-        cost[2] += partial[2][i] * partial[2][i];
-        cost[6] += partial[6][i] * partial[6][i];
-    }
-    cost[2] *= div_table[8];
-    cost[6] *= div_table[8];
-    for (i = 0; i < 7; i++) {
-        cost[0] += (partial[0][i] * partial[0][i] +
-                    partial[0][14 - i] * partial[0][14 - i]) *
-                   div_table[i + 1];
-        cost[4] += (partial[4][i] * partial[4][i] +
-                    partial[4][14 - i] * partial[4][14 - i]) *
-                   div_table[i + 1];
-    }
-    cost[0] += partial[0][7] * partial[0][7] * div_table[8];
-    cost[4] += partial[4][7] * partial[4][7] * div_table[8];
-    for (i = 1; i < 8; i += 2) {
-        int j;
-        for (j = 0; j < 4 + 1; j++) {
-            cost[i] += partial[i][3 + j] * partial[i][3 + j];
-        }
-        cost[i] *= div_table[8];
-        for (j = 0; j < 4 - 1; j++) {
-            cost[i] += (partial[i][j] * partial[i][j] +
-                        partial[i][10 - j] * partial[i][10 - j]) *
-                       div_table[2 * j + 2];
-        }
-    }
-    for (i = 0; i < 8; i++) {
-        if (cost[i] > best_cost) {
-            best_cost = cost[i];
-            best_dir = i;
-        }
-    }
-    /* Difference between the optimal variance and the variance along the
-     orthogonal direction. Again, the sum(x^2) terms cancel out. */
-    *var = best_cost - cost[(best_dir + 4) & 7];
-    /* We'd normally divide by 840, but dividing by 1024 is close enough
-     for what we're going to do with this. */
-    *var >>= 10;
-    return best_dir;
-}
-
-/*
- * </code copied from libaom>
- */
-
-void bitfn(dav1d_cdef_dsp_init)(Dav1dCdefDSPContext *const c) {
-    c->dir = cdef_find_dir_c;
-    c->fb[0] = cdef_filter_block_8x8_c;
-    c->fb[1] = cdef_filter_block_4x8_c;
-    c->fb[2] = cdef_filter_block_4x4_c;
-}
--- a/src/cdef_apply.c
+++ /dev/null
@@ -1,237 +1,0 @@
-/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- *    list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "config.h"
-
-#include <string.h>
-
-#include "common/intops.h"
-
-#include "src/cdef_apply.h"
-
-static void backup2lines(pixel *const dst[3][2],
-                         /*const*/ pixel *const src[3],
-                         const ptrdiff_t src_stride[2], int y_off, int w,
-                         const enum Dav1dPixelLayout layout)
-{
-    pixel_copy(dst[0][0], src[0] + (y_off - 2) * PXSTRIDE(src_stride[0]), w);
-    pixel_copy(dst[0][1], src[0] + (y_off - 1) * PXSTRIDE(src_stride[0]), w);
-
-    if (layout == DAV1D_PIXEL_LAYOUT_I400) return;
-    const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
-    const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
-
-    w >>= ss_hor;
-    y_off >>= ss_ver;
-    pixel_copy(dst[1][0], src[1] + (y_off - 2) * PXSTRIDE(src_stride[1]), w);
-    pixel_copy(dst[1][1], src[1] + (y_off - 1) * PXSTRIDE(src_stride[1]), w);
-    pixel_copy(dst[2][0], src[2] + (y_off - 2) * PXSTRIDE(src_stride[1]), w);
-    pixel_copy(dst[2][1], src[2] + (y_off - 1) * PXSTRIDE(src_stride[1]), w);
-}
-
-static void backup2x8(pixel dst[3][8][2],
-                      /*const*/ pixel *const src[3],
-                      const ptrdiff_t src_stride[2], int x_off,
-                      const enum Dav1dPixelLayout layout)
-{
-    for (int y = 0, y_off = 0; y < 8; y++, y_off += PXSTRIDE(src_stride[0]))
-        pixel_copy(dst[0][y], &src[0][y_off + x_off - 2], 2);
-
-    if (layout == DAV1D_PIXEL_LAYOUT_I400) return;
-    const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
-    const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
-
-    x_off >>= ss_hor;
-    for (int y = 0, y_off = 0; y < (8 >> ss_ver); y++, y_off += PXSTRIDE(src_stride[1])) {
-        pixel_copy(dst[1][y], &src[1][y_off + x_off - 2], 2);
-        pixel_copy(dst[2][y], &src[2][y_off + x_off - 2], 2);
-    }
-}
-
-static void restore2x8(pixel *const dst[3],
-                       const ptrdiff_t dst_stride[2],
-                       const pixel src[3][8][2], const enum Dav1dPixelLayout layout)
-{
-    for (int y = 0, y_off = 0; y < 8; y++, y_off += PXSTRIDE(dst_stride[0]))
-        pixel_copy(&dst[0][y_off - 2], src[0][y], 2);
-
-    if (layout == DAV1D_PIXEL_LAYOUT_I400) return;
-    const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
-
-    for (int y = 0, y_off = 0; y < (8 >> ss_ver); y++, y_off += PXSTRIDE(dst_stride[1])) {
-        pixel_copy(&dst[1][y_off - 2], src[1][y], 2);
-        pixel_copy(&dst[2][y_off - 2], src[2][y], 2);
-    }
-}
-
-static int adjust_strength(const int strength, const unsigned var) {
-    if (!var) return 0;
-    const int i = var >> 6 ? imin(ulog2(var >> 6), 12) : 0;
-    return (strength * (4 + i) + 8) >> 4;
-}
-
-void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
-                             pixel *const p[3],
-                             const Av1Filter *const lflvl,
-                             const int by_start, const int by_end)
-{
-    const Dav1dDSPContext *const dsp = f->dsp;
-    enum CdefEdgeFlags edges = HAVE_BOTTOM | (by_start > 0 ? HAVE_TOP : 0);
-    pixel *ptrs[3] = { p[0], p[1], p[2] };
-    const int sbsz = 16;
-    const int sb64w = f->sb128w << 1;
-    const int damping = f->frame_hdr.cdef.damping + BITDEPTH - 8;
-    const enum Dav1dPixelLayout layout = f->cur.p.p.layout;
-    const int uv_idx = DAV1D_PIXEL_LAYOUT_I444 - layout;
-    const int has_chroma = layout != DAV1D_PIXEL_LAYOUT_I400;
-    const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
-    const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
-
-    // FIXME a design improvement that could be made here is to keep a set of
-    // flags for each block position on whether the block was filtered; if not,
-    // the backup of pre-filter data is empty, and the restore is therefore
-    // unnecessary as well.
-
-    for (int by = by_start; by < by_end; by += 2, edges |= HAVE_TOP) {
-        const int tf = f->lf.top_pre_cdef_toggle;
-        if (by + 2 >= f->bh) edges &= ~HAVE_BOTTOM;
-
-        if (edges & HAVE_BOTTOM) {
-            // backup pre-filter data for next iteration
-            backup2lines(f->lf.cdef_line_ptr[!tf], ptrs, f->cur.p.stride,
-                         8, f->bw * 4, layout);
-        }
-
-        pixel lr_bak[2 /* idx */][3 /* plane */][8 /* y */][2 /* x */];
-        pixel *iptrs[3] = { ptrs[0], ptrs[1], ptrs[2] };
-        edges &= ~HAVE_LEFT;
-        edges |= HAVE_RIGHT;
-        for (int sbx = 0, last_skip = 1; sbx < sb64w; sbx++, edges |= HAVE_LEFT) {
-            const int sb128x = sbx >>1;
-            const int sb64_idx = ((by & sbsz) >> 3) + (sbx & 1);
-            const int cdef_idx = lflvl[sb128x].cdef_idx[sb64_idx];
-            if (cdef_idx == -1 ||
-                (!f->frame_hdr.cdef.y_strength[cdef_idx] &&
-                 !f->frame_hdr.cdef.uv_strength[cdef_idx]))
-            {
-                last_skip = 1;
-                goto next_sb;
-            }
-
-            const int y_lvl = f->frame_hdr.cdef.y_strength[cdef_idx];
-            const int uv_lvl = f->frame_hdr.cdef.uv_strength[cdef_idx];
-            pixel *bptrs[3] = { iptrs[0], iptrs[1], iptrs[2] };
-            for (int bx = sbx * sbsz; bx < imin((sbx + 1) * sbsz, f->bw);
-                 bx += 2, edges |= HAVE_LEFT)
-            {
-                if (bx + 2 >= f->bw) edges &= ~HAVE_RIGHT;
-
-                // check if this 8x8 block had any coded coefficients; if not,
-                // go to the next block
-                const unsigned bx_mask = 3U << (bx & 14);
-                const int by_idx = by & 30, bx_idx = (bx & 16) >> 4;
-                if (!((lflvl[sb128x].noskip_mask[by_idx + 0][bx_idx] |
-                       lflvl[sb128x].noskip_mask[by_idx + 1][bx_idx]) & bx_mask))
-                {
-                    last_skip = 1;
-                    goto next_b;
-                }
-
-                if (!last_skip) {
-                    // backup post-filter data (will be restored at the end)
-                    backup2x8(lr_bak[1], bptrs, f->cur.p.stride, 0, layout);
-
-                    // restore pre-filter data from last iteration
-                    restore2x8(bptrs, f->cur.p.stride, lr_bak[0], layout);
-                }
-                if (edges & HAVE_RIGHT) {
-                    // backup pre-filter data for next iteration
-                    backup2x8(lr_bak[0], bptrs, f->cur.p.stride, 8, layout);
-                }
-
-                // the actual filter
-                const int y_pri_lvl = (y_lvl >> 2) << (BITDEPTH - 8);
-                int y_sec_lvl = y_lvl & 3;
-                y_sec_lvl += y_sec_lvl == 3;
-                y_sec_lvl <<= BITDEPTH - 8;
-                const int uv_pri_lvl = (uv_lvl >> 2) << (BITDEPTH - 8);
-                int uv_sec_lvl = uv_lvl & 3;
-                uv_sec_lvl += uv_sec_lvl == 3;
-                uv_sec_lvl <<= BITDEPTH - 8;
-                unsigned variance;
-                const int dir = dsp->cdef.dir(bptrs[0], f->cur.p.stride[0],
-                                              &variance);
-                if (y_lvl) {
-                    dsp->cdef.fb[0](bptrs[0], f->cur.p.stride[0],
-                                    (pixel *const [2]) {
-                                        &f->lf.cdef_line_ptr[tf][0][0][bx * 4],
-                                        &f->lf.cdef_line_ptr[tf][0][1][bx * 4],
-                                    },
-                                    adjust_strength(y_pri_lvl, variance),
-                                    y_sec_lvl, y_pri_lvl ? dir : 0,
-                                    damping, edges);
-                }
-                if (uv_lvl && has_chroma) {
-                    const int uvdir =
-                        f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I422 ? dir :
-                        ((uint8_t[]) { 7, 0, 2, 4, 5, 6, 6, 6 })[dir];
-                    for (int pl = 1; pl <= 2; pl++) {
-                        dsp->cdef.fb[uv_idx](bptrs[pl], f->cur.p.stride[1],
-                                             (pixel *const [2]) {
-                                                 &f->lf.cdef_line_ptr[tf][pl][0][bx * 4 >> ss_hor],
-                                                 &f->lf.cdef_line_ptr[tf][pl][1][bx * 4 >> ss_hor],
-                                             },
-                                             uv_pri_lvl, uv_sec_lvl,
-                                             uv_pri_lvl ? uvdir : 0,
-                                             damping - 1, edges);
-                    }
-                }
-
-                if (!last_skip) {
-                    // restore post-filter data from the beginning of this loop
-                    restore2x8(bptrs, f->cur.p.stride, lr_bak[1], layout);
-                }
-                last_skip = 0;
-
-            next_b:
-                bptrs[0] += 8;
-                bptrs[1] += 8 >> ss_hor;
-                bptrs[2] += 8 >> ss_hor;
-            }
-
-        next_sb:
-            iptrs[0] += sbsz * 4;
-            iptrs[1] += sbsz * 4 >> ss_hor;
-            iptrs[2] += sbsz * 4 >> ss_hor;
-        }
-
-        ptrs[0] += 8 * PXSTRIDE(f->cur.p.stride[0]);
-        ptrs[1] += 8 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver;
-        ptrs[2] += 8 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver;
-        f->lf.top_pre_cdef_toggle ^= 1;
-    }
-}
--- /dev/null
+++ b/src/cdef_apply_tmpl.c
@@ -1,0 +1,237 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <string.h>
+
+#include "common/intops.h"
+
+#include "src/cdef_apply.h"
+
+static void backup2lines(pixel *const dst[3][2],
+                         /*const*/ pixel *const src[3],
+                         const ptrdiff_t src_stride[2], int y_off, int w,
+                         const enum Dav1dPixelLayout layout)
+{   // save the two rows just above row y_off of each plane into dst[pl][0..1]
+    pixel_copy(dst[0][0], src[0] + (y_off - 2) * PXSTRIDE(src_stride[0]), w);
+    pixel_copy(dst[0][1], src[0] + (y_off - 1) * PXSTRIDE(src_stride[0]), w);
+
+    if (layout == DAV1D_PIXEL_LAYOUT_I400) return; // no chroma planes
+    const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
+
+    w >>= ss_hor; // halve width / row offset where chroma is subsampled
+    y_off >>= ss_ver; // both chroma planes use src_stride[1]
+    pixel_copy(dst[1][0], src[1] + (y_off - 2) * PXSTRIDE(src_stride[1]), w);
+    pixel_copy(dst[1][1], src[1] + (y_off - 1) * PXSTRIDE(src_stride[1]), w);
+    pixel_copy(dst[2][0], src[2] + (y_off - 2) * PXSTRIDE(src_stride[1]), w);
+    pixel_copy(dst[2][1], src[2] + (y_off - 1) * PXSTRIDE(src_stride[1]), w);
+}
+
+static void backup2x8(pixel dst[3][8][2],
+                      /*const*/ pixel *const src[3],
+                      const ptrdiff_t src_stride[2], int x_off,
+                      const enum Dav1dPixelLayout layout)
+{   // save the 2 pixels left of column x_off, for up to 8 rows of each plane
+    for (int y = 0, y_off = 0; y < 8; y++, y_off += PXSTRIDE(src_stride[0]))
+        pixel_copy(dst[0][y], &src[0][y_off + x_off - 2], 2);
+
+    if (layout == DAV1D_PIXEL_LAYOUT_I400) return; // no chroma planes
+    const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
+
+    x_off >>= ss_hor; // chroma column; chroma row count is 8 >> ss_ver
+    for (int y = 0, y_off = 0; y < (8 >> ss_ver); y++, y_off += PXSTRIDE(src_stride[1])) {
+        pixel_copy(dst[1][y], &src[1][y_off + x_off - 2], 2);
+        pixel_copy(dst[2][y], &src[2][y_off + x_off - 2], 2);
+    }
+}
+
+static void restore2x8(pixel *const dst[3],
+                       const ptrdiff_t dst_stride[2],
+                       const pixel src[3][8][2], const enum Dav1dPixelLayout layout)
+{   // write a saved 2-pixel column back just left of each plane pointer
+    for (int y = 0, y_off = 0; y < 8; y++, y_off += PXSTRIDE(dst_stride[0]))
+        pixel_copy(&dst[0][y_off - 2], src[0][y], 2);
+
+    if (layout == DAV1D_PIXEL_LAYOUT_I400) return; // no chroma planes
+    const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420; // chroma rows: 8 >> ss_ver
+
+    for (int y = 0, y_off = 0; y < (8 >> ss_ver); y++, y_off += PXSTRIDE(dst_stride[1])) {
+        pixel_copy(&dst[1][y_off - 2], src[1][y], 2);
+        pixel_copy(&dst[2][y_off - 2], src[2][y], 2);
+    }
+}
+
+static int adjust_strength(const int strength, const unsigned var) {
+    if (!var) return 0; // zero variance yields zero strength
+    const int i = var >> 6 ? imin(ulog2(var >> 6), 12) : 0; // 0 when var < 64
+    return (strength * (4 + i) + 8) >> 4; // strength * (4 + i) / 16; +8 rounds
+}
+
+void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
+                             pixel *const p[3],
+                             const Av1Filter *const lflvl,
+                             const int by_start, const int by_end)
+{   // run CDEF over block rows [by_start, by_end), 8 pixel rows per step
+    const Dav1dDSPContext *const dsp = f->dsp;
+    enum CdefEdgeFlags edges = HAVE_BOTTOM | (by_start > 0 ? HAVE_TOP : 0);
+    pixel *ptrs[3] = { p[0], p[1], p[2] };
+    const int sbsz = 16; // 64 luma pixels, counted in 4-pixel units
+    const int sb64w = f->sb128w << 1;
+    const int damping = f->frame_hdr.cdef.damping + BITDEPTH - 8;
+    const enum Dav1dPixelLayout layout = f->cur.p.p.layout;
+    const int uv_idx = DAV1D_PIXEL_LAYOUT_I444 - layout; // chroma entry in cdef.fb[]
+    const int has_chroma = layout != DAV1D_PIXEL_LAYOUT_I400;
+    const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
+
+    // FIXME a design improvement that could be made here is to keep a set of
+    // flags for each block position on whether the block was filtered; if not,
+    // the backup of pre-filter data is empty, and the restore is therefore
+    // unnecessary as well.
+
+    for (int by = by_start; by < by_end; by += 2, edges |= HAVE_TOP) {
+        const int tf = f->lf.top_pre_cdef_toggle; // buffer with this row's top lines
+        if (by + 2 >= f->bh) edges &= ~HAVE_BOTTOM;
+
+        if (edges & HAVE_BOTTOM) {
+            // backup this row's last 2 pre-filter lines: next iteration's top
+            backup2lines(f->lf.cdef_line_ptr[!tf], ptrs, f->cur.p.stride,
+                         8, f->bw * 4, layout);
+        }
+
+        pixel lr_bak[2 /* idx */][3 /* plane */][8 /* y */][2 /* x */];
+        pixel *iptrs[3] = { ptrs[0], ptrs[1], ptrs[2] };
+        edges &= ~HAVE_LEFT; // first block of a row has no left neighbour
+        edges |= HAVE_RIGHT;
+        for (int sbx = 0, last_skip = 1; sbx < sb64w; sbx++, edges |= HAVE_LEFT) {
+            const int sb128x = sbx >>1;
+            const int sb64_idx = ((by & sbsz) >> 3) + (sbx & 1);
+            const int cdef_idx = lflvl[sb128x].cdef_idx[sb64_idx];
+            if (cdef_idx == -1 ||
+                (!f->frame_hdr.cdef.y_strength[cdef_idx] &&
+                 !f->frame_hdr.cdef.uv_strength[cdef_idx]))
+            {   // no index, or both strengths zero: skip this superblock
+                last_skip = 1;
+                goto next_sb;
+            }
+
+            const int y_lvl = f->frame_hdr.cdef.y_strength[cdef_idx];
+            const int uv_lvl = f->frame_hdr.cdef.uv_strength[cdef_idx];
+            pixel *bptrs[3] = { iptrs[0], iptrs[1], iptrs[2] };
+            for (int bx = sbx * sbsz; bx < imin((sbx + 1) * sbsz, f->bw);
+                 bx += 2, edges |= HAVE_LEFT)
+            {
+                if (bx + 2 >= f->bw) edges &= ~HAVE_RIGHT;
+
+                // check if this 8x8 block had any coded coefficients; if not,
+                // go to the next block
+                const unsigned bx_mask = 3U << (bx & 14);
+                const int by_idx = by & 30, bx_idx = (bx & 16) >> 4;
+                if (!((lflvl[sb128x].noskip_mask[by_idx + 0][bx_idx] |
+                       lflvl[sb128x].noskip_mask[by_idx + 1][bx_idx]) & bx_mask))
+                {
+                    last_skip = 1;
+                    goto next_b;
+                }
+
+                if (!last_skip) {
+                    // backup post-filter data (will be restored at the end)
+                    backup2x8(lr_bak[1], bptrs, f->cur.p.stride, 0, layout);
+
+                    // restore pre-filter data from last iteration
+                    restore2x8(bptrs, f->cur.p.stride, lr_bak[0], layout);
+                }
+                if (edges & HAVE_RIGHT) {
+                    // backup pre-filter data for next iteration
+                    backup2x8(lr_bak[0], bptrs, f->cur.p.stride, 8, layout);
+                }
+
+                // the actual filter: level = primary << 2 | secondary
+                const int y_pri_lvl = (y_lvl >> 2) << (BITDEPTH - 8);
+                int y_sec_lvl = y_lvl & 3;
+                y_sec_lvl += y_sec_lvl == 3; // 3 maps to 4
+                y_sec_lvl <<= BITDEPTH - 8;
+                const int uv_pri_lvl = (uv_lvl >> 2) << (BITDEPTH - 8);
+                int uv_sec_lvl = uv_lvl & 3;
+                uv_sec_lvl += uv_sec_lvl == 3; // 3 maps to 4
+                uv_sec_lvl <<= BITDEPTH - 8;
+                unsigned variance; // written by cdef.dir()
+                const int dir = dsp->cdef.dir(bptrs[0], f->cur.p.stride[0],
+                                              &variance);
+                if (y_lvl) {
+                    dsp->cdef.fb[0](bptrs[0], f->cur.p.stride[0],
+                                    (pixel *const [2]) {
+                                        &f->lf.cdef_line_ptr[tf][0][0][bx * 4],
+                                        &f->lf.cdef_line_ptr[tf][0][1][bx * 4],
+                                    },
+                                    adjust_strength(y_pri_lvl, variance),
+                                    y_sec_lvl, y_pri_lvl ? dir : 0,
+                                    damping, edges);
+                }
+                if (uv_lvl && has_chroma) {
+                    const int uvdir =
+                        f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I422 ? dir :
+                        ((uint8_t[]) { 7, 0, 2, 4, 5, 6, 6, 6 })[dir]; // I422 remap
+                    for (int pl = 1; pl <= 2; pl++) {
+                        dsp->cdef.fb[uv_idx](bptrs[pl], f->cur.p.stride[1],
+                                             (pixel *const [2]) {
+                                                 &f->lf.cdef_line_ptr[tf][pl][0][bx * 4 >> ss_hor],
+                                                 &f->lf.cdef_line_ptr[tf][pl][1][bx * 4 >> ss_hor],
+                                             },
+                                             uv_pri_lvl, uv_sec_lvl,
+                                             uv_pri_lvl ? uvdir : 0,
+                                             damping - 1, edges); // chroma damping is one less
+                    }
+                }
+
+                if (!last_skip) {
+                    // restore post-filter data from the beginning of this loop
+                    restore2x8(bptrs, f->cur.p.stride, lr_bak[1], layout);
+                }
+                last_skip = 0; // this block was filtered
+
+            next_b:
+                bptrs[0] += 8;
+                bptrs[1] += 8 >> ss_hor;
+                bptrs[2] += 8 >> ss_hor;
+            }
+
+        next_sb:
+            iptrs[0] += sbsz * 4;
+            iptrs[1] += sbsz * 4 >> ss_hor;
+            iptrs[2] += sbsz * 4 >> ss_hor;
+        }
+
+        ptrs[0] += 8 * PXSTRIDE(f->cur.p.stride[0]);
+        ptrs[1] += 8 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver;
+        ptrs[2] += 8 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver;
+        f->lf.top_pre_cdef_toggle ^= 1; // swap the two top-line buffers
+    }
+}
--- /dev/null
+++ b/src/cdef_tmpl.c
@@ -1,0 +1,298 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config.h"
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "common/intops.h"
+
+#include "src/cdef.h"
+
+static const int8_t cdef_directions4[8 /* dir */][2 /* pass */] = {
+    { -1 * 8 + 1, -2 * 8 + 2 },  // each entry is dy * 8 + dx, an offset into the tmp buffer
+    {  0 * 8 + 1, -1 * 8 + 2 },  // 8 is tmp_stride in cdef_filter_block_c when w == 4
+    {  0 * 8 + 1,  0 * 8 + 2 },  // dir 2: dy == 0 for both passes
+    {  0 * 8 + 1,  1 * 8 + 2 },
+    {  1 * 8 + 1,  2 * 8 + 2 },
+    {  1 * 8 + 0,  2 * 8 + 1 },
+    {  1 * 8 + 0,  2 * 8 + 0 },  // dir 6: dx == 0 for both passes
+    {  1 * 8 + 0,  2 * 8 - 1 }   // the filter reads each offset with both signs
+};
+
+static const int8_t cdef_directions8[8 /* dir */][2 /* pass */] = {
+    { -1 * 16 + 1, -2 * 16 + 2 },  // same (dy, dx) pairs as cdef_directions4,
+    {  0 * 16 + 1, -1 * 16 + 2 },  // encoded as dy * 16 + dx
+    {  0 * 16 + 1,  0 * 16 + 2 },  // 16 is tmp_stride in cdef_filter_block_c when w == 8
+    {  0 * 16 + 1,  1 * 16 + 2 },
+    {  1 * 16 + 1,  2 * 16 + 2 },
+    {  1 * 16 + 0,  2 * 16 + 1 },
+    {  1 * 16 + 0,  2 * 16 + 0 },
+    {  1 * 16 + 0,  2 * 16 - 1 }
+};
+static const uint8_t cdef_pri_taps[2][2] = { { 4, 2 }, { 3, 3 } };  // row: bit 0 of pri_strength >> (BITDEPTH - 8); column: pass k
+static const uint8_t cdef_sec_taps[2][2] = { { 2, 1 }, { 2, 1 } };  // both rows are equal, so the row index has no effect
+
+static inline int constrain(const int diff, const int threshold,  // limits a neighbour-minus-pixel difference
+                            const int damping)
+{
+    if (!threshold) return 0;  // zero strength: this tap contributes nothing
+    const int shift = imax(0, damping - ulog2(threshold));  // imax keeps the shift count non-negative
+    return apply_sign(imin(abs(diff), imax(0, threshold - (abs(diff) >> shift))),  // magnitude <= |diff|, and 0 once |diff| >> shift reaches threshold
+                      diff);  // NOTE(review): apply_sign (common/intops.h) presumably gives the magnitude diff's sign - confirm
+}
+
+/*
+ * <code partially copied from libaom>
+ */
+
+#define CDEF_VERY_LARGE (30000)  // marker stored by fill() for unavailable pixels; the filter skips it when tracking max
+
+static void fill(uint16_t *tmp, const ptrdiff_t stride,  // writes CDEF_VERY_LARGE into a w x h region starting at tmp
+                 const int w, const int h)
+{
+    for (int y = 0; y < h; y++) {
+        for (int x = 0; x < w; x++)
+            tmp[x] = CDEF_VERY_LARGE;
+        tmp += stride;  // stride is counted in uint16_t elements, one row per step
+    }
+}
+
+/* Smooth in the direction detected. */
+static void cdef_filter_block_c(pixel *const dst, const ptrdiff_t dst_stride,  // dst: w x h block, filtered in place
+                                /*const*/ pixel *const top[2],  // the two rows above dst; read only when HAVE_TOP is set
+                                const int w, const int h, const int pri_strength,
+                                const int sec_strength, const int dir,
+                                const int damping, const enum CdefEdgeFlags edges)  // edges: which neighbours may be read
+{
+    const ptrdiff_t tmp_stride = 16 >> (w == 4);  // 8 for w == 4, 16 for w == 8
+    assert((w == 4 || w == 8) && (h == 4 || h == 8));
+    uint16_t tmp[192];  // 16*12 is the maximum value of tmp_stride * (h + 4)
+    uint16_t *tmp2 = tmp + 2 * tmp_stride + 2;  // tmp2[0] is block pixel (0, 0), inside a 2-pixel border
+    const uint8_t *const pri_taps = cdef_pri_taps[(pri_strength >> (BITDEPTH - 8)) & 1];
+    const uint8_t *const sec_taps = cdef_sec_taps[(pri_strength >> (BITDEPTH - 8)) & 1];  // indexed by pri_strength too; harmless, both rows are equal
+    const int8_t (*cdef_directions)[2];
+
+    assert(w == 4 || w == 8);  // repeats the width check already asserted above
+    cdef_directions = w == 4 ? cdef_directions4 : cdef_directions8;  // table whose row stride matches tmp_stride
+
+    // fill extended input buffer
+    int x_start = -2, x_end = w + 2, y_start = -2, y_end = h + 2;  // copy window; shrunk below for each missing edge
+    if (!(edges & HAVE_TOP)) {
+        fill(tmp, tmp_stride, w + 4, 2);  // pad the two rows above the block
+        y_start = 0;
+    }
+    if (!(edges & HAVE_BOTTOM)) {
+        fill(tmp + (h + 2) * tmp_stride, tmp_stride, w + 4, 2);  // pad the two rows below the block
+        y_end -= 2;
+    }
+    if (!(edges & HAVE_LEFT)) {
+        fill(tmp + (2 + y_start) * tmp_stride, tmp_stride, 2, y_end - y_start);  // pad the two left columns of the remaining rows
+        x_start = 0;
+    }
+    if (!(edges & HAVE_RIGHT)) {
+        fill(tmp + (2 + y_start) * tmp_stride + w + 2, tmp_stride,  // pad the two right columns of the remaining rows
+             2, y_end - y_start);
+        x_end -= 2;
+    }
+    for (int y = y_start; y < 0; y++)  // rows above the block; runs zero times when y_start == 0
+        for (int x = x_start; x < x_end; x++)
+            tmp2[y * tmp_stride + x] = top[y & 1][x];  // y == -2 reads top[0], y == -1 reads top[1]
+    for (int y = 0; y < y_end; y++)  // block rows, plus two rows below dst when HAVE_BOTTOM is set
+        for (int x = x_start; x < x_end; x++)
+            tmp2[y * tmp_stride + x] = dst[y * PXSTRIDE(dst_stride) + x];
+
+    // run actual filter
+    for (int y = 0; y < h; y++) {
+        for (int x = 0; x < w; x++) {
+            int sum = 0;  // weighted sum of constrained neighbour differences
+            const int px = dst[y * PXSTRIDE(dst_stride) + x];
+            int max = px, min = px;  // range of px and its taps, used to clamp the output
+            for (int k = 0; k < 2; k++) {
+                const int8_t off1 = cdef_directions[dir][k];  // primary taps lie along dir
+                const int p0 = tmp2[y * tmp_stride + x + off1];
+                const int p1 = tmp2[y * tmp_stride + x - off1];
+                sum += pri_taps[k] * constrain(p0 - px, pri_strength, damping);
+                sum += pri_taps[k] * constrain(p1 - px, pri_strength, damping);
+                if (p0 != CDEF_VERY_LARGE) max = imax(p0, max);  // padding must not become max
+                if (p1 != CDEF_VERY_LARGE) max = imax(p1, max);
+                min = imin(p0, min);
+                min = imin(p1, min);
+                const int8_t off2 = cdef_directions[(dir + 2) & 7][k];  // secondary taps: direction dir + 2 (mod 8)
+                const int s0 = tmp2[y * tmp_stride + x + off2];
+                const int s1 = tmp2[y * tmp_stride + x - off2];
+                const int8_t off3 = cdef_directions[(dir + 6) & 7][k];  // and direction dir - 2 (mod 8)
+                const int s2 = tmp2[y * tmp_stride + x + off3];
+                const int s3 = tmp2[y * tmp_stride + x - off3];
+                if (s0 != CDEF_VERY_LARGE) max = imax(s0, max);
+                if (s1 != CDEF_VERY_LARGE) max = imax(s1, max);
+                if (s2 != CDEF_VERY_LARGE) max = imax(s2, max);
+                if (s3 != CDEF_VERY_LARGE) max = imax(s3, max);
+                min = imin(s0, min);
+                min = imin(s1, min);
+                min = imin(s2, min);
+                min = imin(s3, min);
+                sum += sec_taps[k] * constrain(s0 - px, sec_strength, damping);
+                sum += sec_taps[k] * constrain(s1 - px, sec_strength, damping);
+                sum += sec_taps[k] * constrain(s2 - px, sec_strength, damping);
+                sum += sec_taps[k] * constrain(s3 - px, sec_strength, damping);
+            }
+            dst[y * PXSTRIDE(dst_stride) + x] =  // written in place; neighbours are read from the tmp copy
+                iclip(px + ((8 + sum - (sum < 0)) >> 4), min, max);  // px + rounded sum / 16; iclip presumably clamps to [min, max] - confirm
+        }
+    }
+}
+
+/*
+ * </code partially copied from libaom>
+ */
+
+#define cdef_fn(w, h) /* defines cdef_filter_block_WxH_c, which forwards to cdef_filter_block_c with w and h fixed */ \
+static void cdef_filter_block_##w##x##h##_c(pixel *const dst, \
+                                            const ptrdiff_t stride, \
+                                            /*const*/ pixel *const top[2], \
+                                            const int pri_strength, \
+                                            const int sec_strength, \
+                                            const int dir, \
+                                            const int damping, \
+                                            const enum CdefEdgeFlags edges) \
+{ \
+    cdef_filter_block_c(dst, stride, top, w, h, pri_strength, sec_strength, \
+                        dir, damping, edges); \
+}
+
+cdef_fn(4, 4);  // cdef_filter_block_4x4_c
+cdef_fn(4, 8);  // cdef_filter_block_4x8_c
+cdef_fn(8, 8);  // cdef_filter_block_8x8_c
+
+/*
+ * <code copied from libaom>
+ */
+
+/* Detect direction. 0 means 45-degree up-right, 2 is horizontal, and so on.
+   The search minimizes the weighted variance along all the lines in a
+   particular direction, i.e. the squared error between the input and a
+   "predicted" block where each pixel is replaced by the average along a line
+   in a particular direction. Since each direction has the same sum(x^2) term,
+   that term is never computed. See Section 2, step 2, of:
+   http://jmvalin.ca/notes/intra_paint.pdf */
+static const uint16_t div_table[] = {  // div_table[n] == 840 / n for n = 1..8
+    0, 840, 420, 280, 210, 168, 140, 120, 105  // index 0 is never read by cdef_find_dir_c
+};
+static int cdef_find_dir_c(const pixel *img, const ptrdiff_t stride,  // reads an 8x8 block at img; returns the best direction, 0..7
+                           unsigned *const var)  // out: (best cost - cost of direction best + 4) >> 10
+{
+    int i;
+    int32_t cost[8] = { 0 };  // one score per direction; the largest wins
+    int partial[8][15] = { { 0 } };  // partial[d][l]: sum of pixels on line l of direction d
+    int32_t best_cost = 0;
+    int best_dir = 0;  // stays 0 if no cost is positive
+    /* Instead of dividing by n between 2 and 8, we multiply by 3*5*7*8/n.
+     The output is then 840 times larger, but we don't care for finding
+     the max. */
+    for (i = 0; i < 8; i++) {  // i: row
+        int j;
+        for (j = 0; j < 8; j++) {  // j: column
+            int x;
+            /* We subtract 128 here to reduce the maximum range of the squared
+             partial sums. */
+            x = (img[i * PXSTRIDE(stride) + j] >> (BITDEPTH - 8)) - 128;  // shift scales the sample down by BITDEPTH - 8 bits first
+            partial[0][i + j] += x;
+            partial[1][i + j / 2] += x;
+            partial[2][i] += x;  // one sum per row
+            partial[3][3 + i - j / 2] += x;
+            partial[4][7 + i - j] += x;
+            partial[5][3 - i / 2 + j] += x;
+            partial[6][j] += x;  // one sum per column
+            partial[7][i / 2 + j] += x;
+        }
+    }
+    for (i = 0; i < 8; i++) {  // directions 2 and 6: 8 lines of 8 pixels each
+        cost[2] += partial[2][i] * partial[2][i];
+        cost[6] += partial[6][i] * partial[6][i];
+    }
+    cost[2] *= div_table[8];
+    cost[6] *= div_table[8];
+    for (i = 0; i < 7; i++) {  // directions 0 and 4: lines i and 14 - i both hold i + 1 pixels
+        cost[0] += (partial[0][i] * partial[0][i] +
+                    partial[0][14 - i] * partial[0][14 - i]) *
+                   div_table[i + 1];
+        cost[4] += (partial[4][i] * partial[4][i] +
+                    partial[4][14 - i] * partial[4][14 - i]) *
+                   div_table[i + 1];
+    }
+    cost[0] += partial[0][7] * partial[0][7] * div_table[8];  // middle line, index 7, holds 8 pixels
+    cost[4] += partial[4][7] * partial[4][7] * div_table[8];
+    for (i = 1; i < 8; i += 2) {  // odd directions 1, 3, 5, 7
+        int j;
+        for (j = 0; j < 4 + 1; j++) {  // lines 3..7 share the weight div_table[8]
+            cost[i] += partial[i][3 + j] * partial[i][3 + j];
+        }
+        cost[i] *= div_table[8];
+        for (j = 0; j < 4 - 1; j++) {  // line pairs (0, 10), (1, 9), (2, 8) use div_table[2], [4], [6]
+            cost[i] += (partial[i][j] * partial[i][j] +
+                        partial[i][10 - j] * partial[i][10 - j]) *
+                       div_table[2 * j + 2];
+        }
+    }
+    for (i = 0; i < 8; i++) {
+        if (cost[i] > best_cost) {  // strict >, so ties keep the lowest direction index
+            best_cost = cost[i];
+            best_dir = i;
+        }
+    }
+    /* Difference between the optimal variance and the variance along the
+     orthogonal direction. Again, the sum(x^2) terms cancel out. */
+    *var = best_cost - cost[(best_dir + 4) & 7];  // (best_dir + 4) & 7 wraps within 0..7
+    /* We'd normally divide by 840, but dividing by 1024 is close enough
+     for what we're going to do with this. */
+    *var >>= 10;
+    return best_dir;
+}
+
+/*
+ * </code copied from libaom>
+ */
+
+void bitfn(dav1d_cdef_dsp_init)(Dav1dCdefDSPContext *const c) {  // points c at the C functions defined above
+    c->dir = cdef_find_dir_c;
+    c->fb[0] = cdef_filter_block_8x8_c;  // fb index 0: 8x8
+    c->fb[1] = cdef_filter_block_4x8_c;  // fb index 1: 4x8
+    c->fb[2] = cdef_filter_block_4x4_c;  // fb index 2: 4x4
+}
--- a/src/ipred.c
+++ /dev/null
@@ -1,757 +1,0 @@
-/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- *    list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "config.h"
-
-#include <assert.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "common/attributes.h"
-#include "common/intops.h"
-
-#include "src/ipred.h"
-#include "src/tables.h"
-
-static NOINLINE void
-splat_dc(pixel *dst, const ptrdiff_t stride,
-         const int width, const int height, const unsigned dc)
-{
-    assert(dc <= (1 << BITDEPTH) - 1);
-#if BITDEPTH == 8
-    if (width > 4) {
-        const uint64_t dcN = dc * 0x0101010101010101ULL;
-        for (int y = 0; y < height; y++) {
-            for (int x = 0; x < width; x += sizeof(dcN))
-                *((uint64_t *) &dst[x]) = dcN;
-            dst += PXSTRIDE(stride);
-        }
-    } else {
-        const unsigned dcN = dc * 0x01010101U;
-        for (int y = 0; y < height; y++) {
-            for (int x = 0; x < width; x += sizeof(dcN))
-                *((unsigned *) &dst[x]) = dcN;
-            dst += PXSTRIDE(stride);
-        }
-    }
-#else
-    const uint64_t dcN = dc * 0x0001000100010001ULL;
-    for (int y = 0; y < height; y++) {
-        for (int x = 0; x < width; x += sizeof(dcN) >> 1)
-            *((uint64_t *) &dst[x]) = dcN;
-        dst += PXSTRIDE(stride);
-    }
-#endif
-}
-
-static NOINLINE void
-cfl_pred(pixel *dst, const ptrdiff_t stride,
-         const int width, const int height, const unsigned dc,
-         const int16_t *ac, const int alpha)
-{
-    for (int y = 0; y < height; y++) {
-        for (int x = 0; x < width; x++) {
-            const int diff = alpha * ac[x];
-            dst[x] = iclip_pixel(dc + apply_sign((abs(diff) + 32) >> 6, diff));
-        }
-        ac += width;
-        dst += PXSTRIDE(stride);
-    }
-}
-
-static unsigned dc_gen_top(const pixel *const topleft, const int width)
-{
-    unsigned dc = width >> 1;
-    for (int i = 0; i < width; i++)
-       dc += topleft[1 + i];
-    return dc >> ctz(width);
-}
-
-static void ipred_dc_top_c(pixel *dst, const ptrdiff_t stride,
-                           const pixel *const topleft,
-                           const int width, const int height, const int a)
-{
-    splat_dc(dst, stride, width, height, dc_gen_top(topleft, width));
-}
-
-static void ipred_cfl_top_c(pixel *dst, const ptrdiff_t stride,
-                            const pixel *const topleft,
-                            const int width, const int height,
-                            const int16_t *ac, const int alpha)
-{
-    cfl_pred(dst, stride, width, height, dc_gen_top(topleft, width), ac, alpha);
-}
-
-static unsigned dc_gen_left(const pixel *const topleft, const int height)
-{
-    unsigned dc = height >> 1;
-    for (int i = 0; i < height; i++)
-       dc += topleft[-(1 + i)];
-    return dc >> ctz(height);
-}
-
-static void ipred_dc_left_c(pixel *dst, const ptrdiff_t stride,
-                            const pixel *const topleft,
-                            const int width, const int height, const int a)
-{
-    splat_dc(dst, stride, width, height, dc_gen_left(topleft, height));
-}
-
-static void ipred_cfl_left_c(pixel *dst, const ptrdiff_t stride,
-                             const pixel *const topleft,
-                             const int width, const int height,
-                             const int16_t *ac, const int alpha)
-{
-    unsigned dc = dc_gen_left(topleft, height);
-    cfl_pred(dst, stride, width, height, dc, ac, alpha);
-}
-
-#if BITDEPTH == 8
-#define MULTIPLIER_1x2 0x5556
-#define MULTIPLIER_1x4 0x3334
-#define BASE_SHIFT 16
-#else
-#define MULTIPLIER_1x2 0xAAAB
-#define MULTIPLIER_1x4 0x6667
-#define BASE_SHIFT 17
-#endif
-
-static unsigned
-dc_gen(const pixel *const topleft, const int width, const int height)
-{
-    unsigned dc = (width + height) >> 1;
-    for (int i = 0; i < width; i++)
-       dc += topleft[i + 1];
-    for (int i = 0; i < height; i++)
-       dc += topleft[-(i + 1)];
-    dc >>= ctz(width + height);
-
-    if (width != height) {
-        dc *= (width > height * 2 || height > width * 2) ? MULTIPLIER_1x4 :
-                                                           MULTIPLIER_1x2;
-        dc >>= BASE_SHIFT;
-    }
-    return dc;
-}
-
-static void ipred_dc_c(pixel *dst, const ptrdiff_t stride,
-                       const pixel *const topleft,
-                       const int width, const int height, const int a)
-{
-    splat_dc(dst, stride, width, height, dc_gen(topleft, width, height));
-}
-
-static void ipred_cfl_c(pixel *dst, const ptrdiff_t stride,
-                        const pixel *const topleft,
-                        const int width, const int height,
-                        const int16_t *ac, const int alpha)
-{
-    unsigned dc = dc_gen(topleft, width, height);
-    cfl_pred(dst, stride, width, height, dc, ac, alpha);
-}
-
-#undef MULTIPLIER_1x2
-#undef MULTIPLIER_1x4
-#undef BASE_SHIFT
-
-static void ipred_dc_128_c(pixel *dst, const ptrdiff_t stride,
-                           const pixel *const topleft,
-                           const int width, const int height, const int a)
-{
-    splat_dc(dst, stride, width, height, 1 << (BITDEPTH - 1));
-}
-
-static void ipred_cfl_128_c(pixel *dst, const ptrdiff_t stride,
-                            const pixel *const topleft,
-                            const int width, const int height,
-                            const int16_t *ac, const int alpha)
-{
-    cfl_pred(dst, stride, width, height, 1 << (BITDEPTH - 1), ac, alpha);
-}
-
-static void ipred_v_c(pixel *dst, const ptrdiff_t stride,
-                      const pixel *const topleft,
-                      const int width, const int height, const int a)
-{
-    for (int y = 0; y < height; y++) {
-        pixel_copy(dst, topleft + 1, width);
-        dst += PXSTRIDE(stride);
-    }
-}
-
-static void ipred_h_c(pixel *dst, const ptrdiff_t stride,
-                      const pixel *const topleft,
-                      const int width, const int height, const int a)
-{
-    for (int y = 0; y < height; y++) {
-        pixel_set(dst, topleft[-(1 + y)], width);
-        dst += PXSTRIDE(stride);
-    }
-}
-
-static void ipred_paeth_c(pixel *dst, const ptrdiff_t stride,
-                          const pixel *const tl_ptr,
-                          const int width, const int height, const int a)
-{
-    const int topleft = tl_ptr[0];
-    for (int y = 0; y < height; y++) {
-        const int left = tl_ptr[-(y + 1)];
-        for (int x = 0; x < width; x++) {
-            const int top = tl_ptr[1 + x];
-            const int base = left + top - topleft;
-            const int ldiff = abs(left - base);
-            const int tdiff = abs(top - base);
-            const int tldiff = abs(topleft - base);
-
-            dst[x] = ldiff <= tdiff && ldiff <= tldiff ? left :
-                     tdiff <= tldiff ? top : topleft;
-        }
-        dst += PXSTRIDE(stride);
-    }
-}
-
-static void ipred_smooth_c(pixel *dst, const ptrdiff_t stride,
-                           const pixel *const topleft,
-                           const int width, const int height, const int a)
-{
-    const uint8_t *const weights_hor = &dav1d_sm_weights[width];
-    const uint8_t *const weights_ver = &dav1d_sm_weights[height];
-    const int right = topleft[width], bottom = topleft[-height];
-
-    for (int y = 0; y < height; y++) {
-        for (int x = 0; x < width; x++) {
-            const int pred = weights_ver[y]  * topleft[1 + x] +
-                      (256 - weights_ver[y]) * bottom +
-                             weights_hor[x]  * topleft[-(1 + y)] +
-                      (256 - weights_hor[x]) * right;
-            dst[x] = (pred + 256) >> 9;
-        }
-        dst += PXSTRIDE(stride);
-    }
-}
-
-static void ipred_smooth_v_c(pixel *dst, const ptrdiff_t stride,
-                             const pixel *const topleft,
-                             const int width, const int height, const int a)
-{
-    const uint8_t *const weights_ver = &dav1d_sm_weights[height];
-    const int bottom = topleft[-height];
-
-    for (int y = 0; y < height; y++) {
-        for (int x = 0; x < width; x++) {
-            const int pred = weights_ver[y]  * topleft[1 + x] +
-                      (256 - weights_ver[y]) * bottom;
-            dst[x] = (pred + 128) >> 8;
-        }
-        dst += PXSTRIDE(stride);
-    }
-}
-
-static void ipred_smooth_h_c(pixel *dst, const ptrdiff_t stride,
-                             const pixel *const topleft,
-                             const int width, const int height, const int a)
-{
-    const uint8_t *const weights_hor = &dav1d_sm_weights[width];
-    const int right = topleft[width];
-
-    for (int y = 0; y < height; y++) {
-        for (int x = 0; x < width; x++) {
-            const int pred = weights_hor[x]  * topleft[-(y + 1)] +
-                      (256 - weights_hor[x]) * right;
-            dst[x] = (pred + 128) >> 8;
-        }
-        dst += PXSTRIDE(stride);
-    }
-}
-
-static int get_filter_strength(const unsigned blk_wh, const unsigned d,
-                               const int type)
-{
-    int strength = 0;
-
-    if (type == 0) {
-        if (blk_wh <= 8) {
-            if (d >= 56) strength = 1;
-        } else if (blk_wh <= 12) {
-            if (d >= 40) strength = 1;
-        } else if (blk_wh <= 16) {
-            if (d >= 40) strength = 1;
-        } else if (blk_wh <= 24) {
-            if (d >= 8) strength = 1;
-            if (d >= 16) strength = 2;
-            if (d >= 32) strength = 3;
-        } else if (blk_wh <= 32) {
-            if (d >= 1) strength = 1;
-            if (d >= 4) strength = 2;
-            if (d >= 32) strength = 3;
-        } else {
-            if (d >= 1) strength = 3;
-        }
-    } else {
-        if (blk_wh <= 8) {
-            if (d >= 40) strength = 1;
-            if (d >= 64) strength = 2;
-        } else if (blk_wh <= 16) {
-            if (d >= 20) strength = 1;
-            if (d >= 48) strength = 2;
-        } else if (blk_wh <= 24) {
-            if (d >= 4) strength = 3;
-        } else {
-            if (d >= 1) strength = 3;
-        }
-    }
-
-    return strength;
-}
-
-static void filter_edge(pixel *const out, const int sz, const pixel *const in,
-                        const int from, const int to, const unsigned strength)
-{
-    static const uint8_t kernel[3][5] = {
-        { 0, 4, 8, 4, 0 },
-        { 0, 5, 6, 5, 0 },
-        { 2, 4, 4, 4, 2 }
-    };
-
-    assert(strength > 0);
-    for (int i = 0; i < sz; i++) {
-        int s = 0;
-        for (int j = 0; j < 5; j++)
-            s += in[iclip(i - 2 + j, from, to - 1)] * kernel[strength - 1][j];
-        out[i] = (s + 8) >> 4;
-    }
-}
-
-static int get_upsample(const int blk_wh, const unsigned d, const int type) {
-    if (d >= 40) return 0;
-    return type ? (blk_wh <= 8) : (blk_wh <= 16);
-}
-
-static void upsample_edge(pixel *const out, const int hsz,
-                          const pixel *const in, const int from, const int to)
-{
-    static const int8_t kernel[4] = { -1, 9, 9, -1 };
-    int i;
-    for (i = 0; i < hsz - 1; i++) {
-        out[i * 2] = in[iclip(i, from, to - 1)];
-
-        int s = 0;
-        for (int j = 0; j < 4; j++)
-            s += in[iclip(i + j - 1, from, to - 1)] * kernel[j];
-        out[i * 2 + 1] = iclip_pixel((s + 8) >> 4);
-    }
-    out[i * 2] = in[iclip(i, from, to - 1)];
-}
-
-static void ipred_z1_c(pixel *dst, const ptrdiff_t stride,
-                       const pixel *const topleft_in,
-                       const int width, const int height, int angle)
-{
-    const int is_sm = angle >> 9;
-    angle &= 511;
-    assert(angle < 90);
-    const int dx = dav1d_dr_intra_derivative[angle];
-    pixel top_out[(64 + 64) * 2];
-    const pixel *top;
-    int max_base_x;
-    const int upsample_above = get_upsample(width + height, 90 - angle, is_sm);
-    if (upsample_above) {
-        upsample_edge(top_out, width + height,
-                      &topleft_in[1], -1, width + imin(width, height));
-        top = top_out;
-        max_base_x = 2 * (width + height) - 2;
-    } else {
-        const int filter_strength =
-            get_filter_strength(width + height, 90 - angle, is_sm);
-
-        if (filter_strength) {
-            filter_edge(top_out, width + height,
-                        &topleft_in[1], -1, width + imin(width, height),
-                        filter_strength);
-            top = top_out;
-            max_base_x = width + height - 1;
-        } else {
-            top = &topleft_in[1];
-            max_base_x = width + imin(width, height) - 1;
-        }
-    }
-    const int frac_bits = 6 - upsample_above;
-    const int base_inc = 1 << upsample_above;
-    for (int y = 0, xpos = dx; y < height;
-         y++, dst += PXSTRIDE(stride), xpos += dx)
-    {
-        int base = xpos >> frac_bits;
-        const int frac = ((xpos << upsample_above) & 0x3F) >> 1;
-
-        for (int x = 0; x < width; x++, base += base_inc) {
-            if (base < max_base_x) {
-                const int v = top[base] * (32 - frac) + top[base + 1] * frac;
-                dst[x] = iclip_pixel((v + 16) >> 5);
-            } else {
-                pixel_set(&dst[x], top[max_base_x], width - x);
-                break;
-            }
-        }
-    }
-}
-
-static void ipred_z2_c(pixel *dst, const ptrdiff_t stride,
-                       const pixel *const topleft_in,
-                       const int width, const int height, int angle)
-{
-    const int is_sm = angle >> 9;
-    angle &= 511;
-    assert(angle > 90 && angle < 180);
-    const int dy = dav1d_dr_intra_derivative[angle - 90];
-    const int dx = dav1d_dr_intra_derivative[180 - angle];
-    const int upsample_left = get_upsample(width + height, 180 - angle, is_sm);
-    const int upsample_above = get_upsample(width + height, angle - 90, is_sm);
-    pixel edge[64 * 2 + 64 * 2 + 1];
-    pixel *const topleft = &edge[height * 2];
-
-    if (upsample_above) {
-        upsample_edge(topleft, width + 1, topleft_in, 0, width + 1);
-    } else {
-        const int filter_strength =
-            get_filter_strength(width + height, angle - 90, is_sm);
-
-        if (filter_strength) {
-            filter_edge(&topleft[1], width, &topleft_in[1], -1, width,
-                        filter_strength);
-        } else {
-            pixel_copy(&topleft[1], &topleft_in[1], width);
-        }
-    }
-    if (upsample_left) {
-        upsample_edge(edge, height + 1, &topleft_in[-height], 0, height + 1);
-    } else {
-        const int filter_strength =
-            get_filter_strength(width + height, 180 - angle, is_sm);
-
-        if (filter_strength) {
-            filter_edge(&topleft[-height], height, &topleft_in[-height],
-                        0, height + 1, filter_strength);
-        } else {
-            pixel_copy(&topleft[-height], &topleft_in[-height], height);
-        }
-    }
-    *topleft = *topleft_in;
-
-    const int min_base_x = -(1 << upsample_above);
-    const int frac_bits_y = 6 - upsample_left, frac_bits_x = 6 - upsample_above;
-    const int base_inc_x = 1 << upsample_above;
-    const pixel *const left = &topleft[-(1 << upsample_left)];
-    const pixel *const top = &topleft[1 << upsample_above];
-    for (int y = 0, xpos = -dx; y < height;
-         y++, xpos -= dx, dst += PXSTRIDE(stride))
-    {
-        int base_x = xpos >> frac_bits_x;
-        const int frac_x = ((xpos * (1 << upsample_above)) & 0x3F) >> 1;
-
-        for (int x = 0, ypos = (y << 6) - dy; x < width;
-             x++, base_x += base_inc_x, ypos -= dy)
-        {
-            int v;
-
-            if (base_x >= min_base_x) {
-                v = top[base_x] * (32 - frac_x) + top[base_x + 1] * frac_x;
-            } else {
-                const int base_y = ypos >> frac_bits_y;
-                assert(base_y >= -(1 << upsample_left));
-                const int frac_y = ((ypos * (1 << upsample_left)) & 0x3F) >> 1;
-                v = left[-base_y] * (32 - frac_y) + left[-(base_y + 1)] * frac_y;
-            }
-            dst[x] = iclip_pixel((v + 16) >> 5);
-        }
-    }
-}
-
-static void ipred_z3_c(pixel *dst, const ptrdiff_t stride,
-                       const pixel *const topleft_in,
-                       const int width, const int height, int angle)
-{
-    const int is_sm = angle >> 9;
-    angle &= 511;
-    assert(angle > 180);
-    const int dy = dav1d_dr_intra_derivative[270 - angle];
-    pixel left_out[(64 + 64) * 2];
-    const pixel *left;
-    int max_base_y;
-    const int upsample_left = get_upsample(width + height, angle - 180, is_sm);
-    if (upsample_left) {
-        upsample_edge(left_out, width + height,
-                      &topleft_in[-(width + height)],
-                      imax(width - height, 0), width + height + 1);
-        left = &left_out[2 * (width + height) - 2];
-        max_base_y = 2 * (width + height) - 2;
-    } else {
-        const int filter_strength =
-            get_filter_strength(width + height, angle - 180, is_sm);
-
-        if (filter_strength) {
-            filter_edge(left_out, width + height,
-                        &topleft_in[-(width + height)],
-                        imax(width - height, 0), width + height + 1,
-                        filter_strength);
-            left = &left_out[width + height - 1];
-            max_base_y = width + height - 1;
-        } else {
-            left = &topleft_in[-1];
-            max_base_y = height + imin(width, height) - 1;
-        }
-    }
-    const int frac_bits = 6 - upsample_left;
-    const int base_inc = 1 << upsample_left;
-    for (int x = 0, ypos = dy; x < width; x++, ypos += dy) {
-        int base = ypos >> frac_bits;
-        const int frac = ((ypos << upsample_left) & 0x3F) >> 1;
-
-        for (int y = 0; y < height; y++, base += base_inc) {
-            if (base < max_base_y) {
-                const int v = left[-base] * (32 - frac) +
-                              left[-(base + 1)] * frac;
-                dst[y * PXSTRIDE(stride) + x] = iclip_pixel((v + 16) >> 5);
-            } else {
-                do {
-                    dst[y * PXSTRIDE(stride) + x] = left[-max_base_y];
-                } while (++y < height);
-                break;
-            }
-        }
-    }
-}
-
-/* Up to 32x32 only */
-static void ipred_filter_c(pixel *dst, const ptrdiff_t stride,
-                           const pixel *const topleft_in,
-                           const int width, const int height, int filt_idx)
-{
-    filt_idx &= 511;
-    assert(filt_idx < 5);
-
-    const int8_t *const filter = dav1d_filter_intra_taps[filt_idx];
-    int x, y;
-    ptrdiff_t left_stride;
-    const pixel *left, *topleft, *top;
-
-    top = &topleft_in[1];
-    for (y = 0; y < height; y += 2) {
-        topleft = &topleft_in[-y];
-        left = &topleft[-1];
-        left_stride = -1;
-        for (x = 0; x < width; x += 4) {
-            const int p0 = *topleft;
-            const int p1 = top[0], p2 = top[1], p3 = top[2], p4 = top[3];
-            const int p5 = left[0 * left_stride], p6 = left[1 * left_stride];
-            pixel *ptr = &dst[x];
-            const int8_t *flt_ptr = filter;
-
-            for (int yy = 0; yy < 2; yy++) {
-                for (int xx = 0; xx < 4; xx++, flt_ptr += 2) {
-                    int acc = flt_ptr[ 0] * p0 + flt_ptr[ 1] * p1 +
-                              flt_ptr[16] * p2 + flt_ptr[17] * p3 +
-                              flt_ptr[32] * p4 + flt_ptr[33] * p5 +
-                              flt_ptr[48] * p6;
-                    ptr[xx] = iclip_pixel((acc + 8) >> 4);
-                }
-                ptr += PXSTRIDE(stride);
-            }
-            left = &dst[x + 4 - 1];
-            left_stride = PXSTRIDE(stride);
-            top += 4;
-            topleft = &top[-1];
-        }
-        top = &dst[PXSTRIDE(stride)];
-        dst = &dst[PXSTRIDE(stride) * 2];
-    }
-}
-
-static NOINLINE void
-cfl_ac_c(int16_t *ac, const pixel *ypx, const ptrdiff_t stride,
-         const int w_pad, const int h_pad, const int width, const int height,
-         const int ss_hor, const int ss_ver, const int log2sz)
-{
-    int y, x;
-    int16_t *const ac_orig = ac;
-
-    assert(w_pad >= 0 && w_pad * 4 < width);
-    assert(h_pad >= 0 && h_pad * 4 < height);
-
-    for (y = 0; y < height - 4 * h_pad; y++) {
-        for (x = 0; x < width - 4 * w_pad; x++) {
-            int ac_sum = ypx[x << ss_hor];
-            if (ss_hor) ac_sum += ypx[x * 2 + 1];
-            if (ss_ver) {
-                ac_sum += ypx[(x << ss_hor) + PXSTRIDE(stride)];
-                if (ss_hor) ac_sum += ypx[x * 2 + 1 + PXSTRIDE(stride)];
-            }
-            ac[x] = ac_sum << (1 + !ss_ver + !ss_hor);
-        }
-        for (; x < width; x++)
-            ac[x] = ac[x - 1];
-        ac += width;
-        ypx += PXSTRIDE(stride) << ss_ver;
-    }
-    for (; y < height; y++) {
-        memcpy(ac, &ac[-width], width * sizeof(*ac));
-        ac += width;
-    }
-
-    int sum = (1 << log2sz) >> 1;
-    for (ac = ac_orig, y = 0; y < height; y++) {
-        for (x = 0; x < width; x++)
-            sum += ac[x];
-        ac += width;
-    }
-    sum >>= log2sz;
-
-    // subtract DC
-    for (ac = ac_orig, y = 0; y < height; y++) {
-        for (x = 0; x < width; x++)
-            ac[x] -= sum;
-        ac += width;
-    }
-}
-
-#define cfl_ac_fn(lw, lh, cw, ch, ss_hor, ss_ver, log2sz) \
-static void cfl_ac_##lw##x##lh##_to_##cw##x##ch##_c(int16_t *const ac, \
-                                                    const pixel *const ypx, \
-                                                    const ptrdiff_t stride, \
-                                                    const int w_pad, \
-                                                    const int h_pad) \
-{ \
-    cfl_ac_c(ac, ypx, stride, w_pad, h_pad, cw, ch, ss_hor, ss_ver, log2sz); \
-}
-
-cfl_ac_fn( 8,  8,  4,  4, 1, 1, 4)
-cfl_ac_fn( 8, 16,  4,  8, 1, 1, 5)
-cfl_ac_fn( 8, 32,  4, 16, 1, 1, 6)
-cfl_ac_fn(16,  8,  8,  4, 1, 1, 5)
-cfl_ac_fn(16, 16,  8,  8, 1, 1, 6)
-cfl_ac_fn(16, 32,  8, 16, 1, 1, 7)
-cfl_ac_fn(32,  8, 16,  4, 1, 1, 6)
-cfl_ac_fn(32, 16, 16,  8, 1, 1, 7)
-cfl_ac_fn(32, 32, 16, 16, 1, 1, 8)
-
-cfl_ac_fn( 8,  4,  4,  4, 1, 0, 4)
-cfl_ac_fn( 8,  8,  4,  8, 1, 0, 5)
-cfl_ac_fn(16,  4,  8,  4, 1, 0, 5)
-cfl_ac_fn(16,  8,  8,  8, 1, 0, 6)
-cfl_ac_fn(16, 16,  8, 16, 1, 0, 7)
-cfl_ac_fn(32,  8, 16,  8, 1, 0, 7)
-cfl_ac_fn(32, 16, 16, 16, 1, 0, 8)
-cfl_ac_fn(32, 32, 16, 32, 1, 0, 9)
-
-cfl_ac_fn( 4,  4,  4,  4, 0, 0, 4)
-cfl_ac_fn( 4,  8,  4,  8, 0, 0, 5)
-cfl_ac_fn( 4, 16,  4, 16, 0, 0, 6)
-cfl_ac_fn( 8,  4,  8,  4, 0, 0, 5)
-cfl_ac_fn( 8,  8,  8,  8, 0, 0, 6)
-cfl_ac_fn( 8, 16,  8, 16, 0, 0, 7)
-cfl_ac_fn( 8, 32,  8, 32, 0, 0, 8)
-cfl_ac_fn(16,  4, 16,  4, 0, 0, 6)
-cfl_ac_fn(16,  8, 16,  8, 0, 0, 7)
-cfl_ac_fn(16, 16, 16, 16, 0, 0, 8)
-cfl_ac_fn(16, 32, 16, 32, 0, 0, 9)
-cfl_ac_fn(32,  8, 32,  8, 0, 0, 8)
-cfl_ac_fn(32, 16, 32, 16, 0, 0, 9)
-cfl_ac_fn(32, 32, 32, 32, 0, 0, 10)
-
-static void pal_pred_c(pixel *dst, const ptrdiff_t stride,
-                       const uint16_t *const pal, const uint8_t *idx,
-                       const int w, const int h)
-{
-    for (int y = 0; y < h; y++) {
-        for (int x = 0; x < w; x++)
-            dst[x] = pal[idx[x]];
-        idx += w;
-        dst += PXSTRIDE(stride);
-    }
-}
-
-void bitfn(dav1d_intra_pred_dsp_init)(Dav1dIntraPredDSPContext *const c) {
-    c->intra_pred[DC_PRED      ] = ipred_dc_c;
-    c->intra_pred[DC_128_PRED  ] = ipred_dc_128_c;
-    c->intra_pred[TOP_DC_PRED  ] = ipred_dc_top_c;
-    c->intra_pred[LEFT_DC_PRED ] = ipred_dc_left_c;
-    c->intra_pred[HOR_PRED     ] = ipred_h_c;
-    c->intra_pred[VERT_PRED    ] = ipred_v_c;
-    c->intra_pred[PAETH_PRED   ] = ipred_paeth_c;
-    c->intra_pred[SMOOTH_PRED  ] = ipred_smooth_c;
-    c->intra_pred[SMOOTH_V_PRED] = ipred_smooth_v_c;
-    c->intra_pred[SMOOTH_H_PRED] = ipred_smooth_h_c;
-    c->intra_pred[Z1_PRED      ] = ipred_z1_c;
-    c->intra_pred[Z2_PRED      ] = ipred_z2_c;
-    c->intra_pred[Z3_PRED      ] = ipred_z3_c;
-    c->intra_pred[FILTER_PRED  ] = ipred_filter_c;
-
-    // cfl functions are split per chroma subsampling type
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][ TX_4X4  ] = cfl_ac_8x8_to_4x4_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][RTX_4X8  ] = cfl_ac_8x16_to_4x8_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][RTX_4X16 ] = cfl_ac_8x32_to_4x16_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][RTX_8X4  ] = cfl_ac_16x8_to_8x4_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][ TX_8X8  ] = cfl_ac_16x16_to_8x8_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][RTX_8X16 ] = cfl_ac_16x32_to_8x16_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][RTX_16X4 ] = cfl_ac_32x8_to_16x4_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][RTX_16X8 ] = cfl_ac_32x16_to_16x8_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][ TX_16X16] = cfl_ac_32x32_to_16x16_c;
-
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][ TX_4X4  ] = cfl_ac_8x4_to_4x4_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][RTX_4X8  ] = cfl_ac_8x8_to_4x8_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][RTX_8X4  ] = cfl_ac_16x4_to_8x4_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][ TX_8X8  ] = cfl_ac_16x8_to_8x8_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][RTX_8X16 ] = cfl_ac_16x16_to_8x16_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][RTX_16X8 ] = cfl_ac_32x8_to_16x8_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][ TX_16X16] = cfl_ac_32x16_to_16x16_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][RTX_16X32] = cfl_ac_32x32_to_16x32_c;
-
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][ TX_4X4  ] = cfl_ac_4x4_to_4x4_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_4X8  ] = cfl_ac_4x8_to_4x8_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_4X16 ] = cfl_ac_4x16_to_4x16_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_8X4  ] = cfl_ac_8x4_to_8x4_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][ TX_8X8  ] = cfl_ac_8x8_to_8x8_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_8X16 ] = cfl_ac_8x16_to_8x16_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_8X32 ] = cfl_ac_8x32_to_8x32_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_16X4 ] = cfl_ac_16x4_to_16x4_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_16X8 ] = cfl_ac_16x8_to_16x8_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][ TX_16X16] = cfl_ac_16x16_to_16x16_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_16X32] = cfl_ac_16x32_to_16x32_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_32X8 ] = cfl_ac_32x8_to_32x8_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_32X16] = cfl_ac_32x16_to_32x16_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][ TX_32X32] = cfl_ac_32x32_to_32x32_c;
-
-    c->cfl_pred[DC_PRED     ] = ipred_cfl_c;
-    c->cfl_pred[DC_128_PRED ] = ipred_cfl_128_c;
-    c->cfl_pred[TOP_DC_PRED ] = ipred_cfl_top_c;
-    c->cfl_pred[LEFT_DC_PRED] = ipred_cfl_left_c;
-
-    c->pal_pred = pal_pred_c;
-
-#if HAVE_ASM && ARCH_X86
-    bitfn(dav1d_intra_pred_dsp_init_x86)(c);
-#endif
-}
--- a/src/ipred_prepare.c
+++ /dev/null
@@ -1,209 +1,0 @@
-/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- *    list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "config.h"
-
-#include <assert.h>
-#include <stdint.h>
-#include <string.h>
-
-#include "common/intops.h"
-
-#include "src/ipred_prepare.h"
-
-static const uint8_t av1_mode_conv[N_INTRA_PRED_MODES]
-                                  [2 /* have_left */][2 /* have_top */] =
-{
-    [DC_PRED]    = { { DC_128_PRED,  TOP_DC_PRED },
-                     { LEFT_DC_PRED, DC_PRED     } },
-    [PAETH_PRED] = { { DC_128_PRED,  VERT_PRED   },
-                     { HOR_PRED,     PAETH_PRED  } },
-};
-
-static const uint8_t av1_mode_to_angle_map[8] = {
-    90, 180, 45, 135, 113, 157, 203, 67
-};
-
-static const struct {
-    uint8_t needs_left:1;
-    uint8_t needs_top:1;
-    uint8_t needs_topleft:1;
-    uint8_t needs_topright:1;
-    uint8_t needs_bottomleft:1;
-} av1_intra_prediction_edges[N_IMPL_INTRA_PRED_MODES] = {
-    [DC_PRED]       = { .needs_top  = 1, .needs_left = 1 },
-    [VERT_PRED]     = { .needs_top  = 1 },
-    [HOR_PRED]      = { .needs_left = 1 },
-    [LEFT_DC_PRED]  = { .needs_left = 1 },
-    [TOP_DC_PRED]   = { .needs_top  = 1 },
-    [DC_128_PRED]   = { 0 },
-    [Z1_PRED]       = { .needs_top = 1, .needs_topright = 1,
-                        .needs_topleft = 1 },
-    [Z2_PRED]       = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
-    [Z3_PRED]       = { .needs_left = 1, .needs_bottomleft = 1,
-                        .needs_topleft = 1 },
-    [SMOOTH_PRED]   = { .needs_left = 1, .needs_top = 1 },
-    [SMOOTH_V_PRED] = { .needs_left = 1, .needs_top = 1 },
-    [SMOOTH_H_PRED] = { .needs_left = 1, .needs_top = 1 },
-    [PAETH_PRED]    = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
-    [FILTER_PRED]   = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
-};
-
-enum IntraPredMode
-bytefn(dav1d_prepare_intra_edges)(const int x, const int have_left,
-                                  const int y, const int have_top,
-                                  const int w, const int h,
-                                  const enum EdgeFlags edge_flags,
-                                  const pixel *const dst,
-                                  const ptrdiff_t stride,
-                                  const pixel *prefilter_toplevel_sb_edge,
-                                  enum IntraPredMode mode, int *const angle,
-                                  const int tw, const int th,
-                                  pixel *const topleft_out)
-{
-    assert(y < h && x < w);
-
-    switch (mode) {
-    case VERT_PRED:
-    case HOR_PRED:
-    case DIAG_DOWN_LEFT_PRED:
-    case DIAG_DOWN_RIGHT_PRED:
-    case VERT_RIGHT_PRED:
-    case HOR_DOWN_PRED:
-    case HOR_UP_PRED:
-    case VERT_LEFT_PRED: {
-        *angle = av1_mode_to_angle_map[mode - VERT_PRED] + 3 * *angle;
-
-        if (*angle < 90) {
-            mode = have_top ? Z1_PRED : VERT_PRED;
-        } else if (*angle == 90) {
-            mode = VERT_PRED;
-        } else if (*angle < 180) {
-            mode = Z2_PRED;
-        } else if (*angle == 180) {
-            mode = HOR_PRED;
-        } else {
-            mode = have_left ? Z3_PRED : HOR_PRED;
-        }
-        break;
-    }
-    case DC_PRED:
-    case PAETH_PRED:
-        mode = av1_mode_conv[mode][have_left][have_top];
-        break;
-    default:
-        break;
-    }
-
-    const pixel *dst_top;
-    if (have_top &&
-        (av1_intra_prediction_edges[mode].needs_top ||
-         av1_intra_prediction_edges[mode].needs_topleft ||
-         (av1_intra_prediction_edges[mode].needs_left && !have_left)))
-    {
-        if (prefilter_toplevel_sb_edge) {
-            dst_top = &prefilter_toplevel_sb_edge[x * 4];
-        } else {
-            dst_top = &dst[-PXSTRIDE(stride)];
-        }
-    }
-
-    if (av1_intra_prediction_edges[mode].needs_left) {
-        const int sz = th << 2;
-        pixel *const left = &topleft_out[-sz];
-
-        if (have_left) {
-            const int px_have = imin(sz, (h - y) << 2);
-
-            for (int i = 0; i < px_have; i++)
-                left[sz - 1 - i] = dst[PXSTRIDE(stride) * i - 1];
-            if (px_have < sz)
-                pixel_set(left, left[sz - px_have], sz - px_have);
-        } else {
-            pixel_set(left, have_top ? *dst_top : ((1 << BITDEPTH) >> 1) + 1, sz);
-        }
-
-        if (av1_intra_prediction_edges[mode].needs_bottomleft) {
-            const int have_bottomleft = (!have_left || y + th >= h) ? 0 :
-                                        (edge_flags & EDGE_I444_LEFT_HAS_BOTTOM);
-
-            if (have_bottomleft) {
-                const int px_have = imin(sz, (h - y - th) << 2);
-
-                for (int i = 0; i < px_have; i++)
-                    left[-(i + 1)] = dst[(sz + i) * PXSTRIDE(stride) - 1];
-                if (px_have < sz)
-                    pixel_set(left - sz, left[-px_have], sz - px_have);
-            } else {
-                pixel_set(left - sz, left[0], sz);
-            }
-        }
-    }
-
-    if (av1_intra_prediction_edges[mode].needs_top) {
-        const int sz = tw << 2;
-        pixel *const top = &topleft_out[1];
-
-        if (have_top) {
-            const int px_have = imin(sz, (w - x) << 2);
-            pixel_copy(top, dst_top, px_have);
-            if (px_have < sz)
-                pixel_set(top + px_have, top[px_have - 1], sz - px_have);
-        } else {
-            pixel_set(top, have_left ? dst[-1] : ((1 << BITDEPTH) >> 1) - 1, sz);
-        }
-
-        if (av1_intra_prediction_edges[mode].needs_topright) {
-            const int have_topright = (!have_top || x + tw >= w) ? 0 :
-                                      (edge_flags & EDGE_I444_TOP_HAS_RIGHT);
-
-            if (have_topright) {
-                const int px_have = imin(sz, (w - x - tw) << 2);
-
-                pixel_copy(top + sz, &dst_top[sz], px_have);
-                if (px_have < sz)
-                    pixel_set(top + sz + px_have, top[sz + px_have - 1],
-                              sz - px_have);
-            } else {
-                pixel_set(top + sz, top[sz - 1], sz);
-            }
-        }
-    }
-
-    if (av1_intra_prediction_edges[mode].needs_topleft) {
-        if (have_left) {
-            *topleft_out = have_top ? dst_top[-1] : dst[-1];
-        } else {
-            *topleft_out = have_top ? *dst_top : (1 << BITDEPTH) >> 1;
-        }
-        if (mode == Z2_PRED && tw + th >= 6)
-            *topleft_out = (topleft_out[-1] * 5 + topleft_out[0] * 6 +
-                            topleft_out[1] * 5 + 8) >> 4;
-    }
-
-    return mode;
-}
--- /dev/null
+++ b/src/ipred_prepare_tmpl.c
@@ -1,0 +1,209 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <assert.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "common/intops.h"
+
+#include "src/ipred_prepare.h"
+
+static const uint8_t av1_mode_conv[N_INTRA_PRED_MODES]
+                                  [2 /* have_left */][2 /* have_top */] =
+{
+    [DC_PRED]    = { { DC_128_PRED,  TOP_DC_PRED },
+                     { LEFT_DC_PRED, DC_PRED     } },
+    [PAETH_PRED] = { { DC_128_PRED,  VERT_PRED   },
+                     { HOR_PRED,     PAETH_PRED  } },
+};
+
+static const uint8_t av1_mode_to_angle_map[8] = {
+    90, 180, 45, 135, 113, 157, 203, 67
+};
+
+static const struct {
+    uint8_t needs_left:1;
+    uint8_t needs_top:1;
+    uint8_t needs_topleft:1;
+    uint8_t needs_topright:1;
+    uint8_t needs_bottomleft:1;
+} av1_intra_prediction_edges[N_IMPL_INTRA_PRED_MODES] = {
+    [DC_PRED]       = { .needs_top  = 1, .needs_left = 1 },
+    [VERT_PRED]     = { .needs_top  = 1 },
+    [HOR_PRED]      = { .needs_left = 1 },
+    [LEFT_DC_PRED]  = { .needs_left = 1 },
+    [TOP_DC_PRED]   = { .needs_top  = 1 },
+    [DC_128_PRED]   = { 0 },
+    [Z1_PRED]       = { .needs_top = 1, .needs_topright = 1,
+                        .needs_topleft = 1 },
+    [Z2_PRED]       = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
+    [Z3_PRED]       = { .needs_left = 1, .needs_bottomleft = 1,
+                        .needs_topleft = 1 },
+    [SMOOTH_PRED]   = { .needs_left = 1, .needs_top = 1 },
+    [SMOOTH_V_PRED] = { .needs_left = 1, .needs_top = 1 },
+    [SMOOTH_H_PRED] = { .needs_left = 1, .needs_top = 1 },
+    [PAETH_PRED]    = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
+    [FILTER_PRED]   = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
+};
+
+enum IntraPredMode
+bytefn(dav1d_prepare_intra_edges)(const int x, const int have_left,
+                                  const int y, const int have_top,
+                                  const int w, const int h,
+                                  const enum EdgeFlags edge_flags,
+                                  const pixel *const dst,
+                                  const ptrdiff_t stride,
+                                  const pixel *prefilter_toplevel_sb_edge,
+                                  enum IntraPredMode mode, int *const angle,
+                                  const int tw, const int th,
+                                  pixel *const topleft_out)
+{
+    assert(y < h && x < w);
+
+    switch (mode) {
+    case VERT_PRED:
+    case HOR_PRED:
+    case DIAG_DOWN_LEFT_PRED:
+    case DIAG_DOWN_RIGHT_PRED:
+    case VERT_RIGHT_PRED:
+    case HOR_DOWN_PRED:
+    case HOR_UP_PRED:
+    case VERT_LEFT_PRED: {
+        *angle = av1_mode_to_angle_map[mode - VERT_PRED] + 3 * *angle;
+
+        if (*angle < 90) {
+            mode = have_top ? Z1_PRED : VERT_PRED;
+        } else if (*angle == 90) {
+            mode = VERT_PRED;
+        } else if (*angle < 180) {
+            mode = Z2_PRED;
+        } else if (*angle == 180) {
+            mode = HOR_PRED;
+        } else {
+            mode = have_left ? Z3_PRED : HOR_PRED;
+        }
+        break;
+    }
+    case DC_PRED:
+    case PAETH_PRED:
+        mode = av1_mode_conv[mode][have_left][have_top];
+        break;
+    default:
+        break;
+    }
+
+    const pixel *dst_top;
+    if (have_top &&
+        (av1_intra_prediction_edges[mode].needs_top ||
+         av1_intra_prediction_edges[mode].needs_topleft ||
+         (av1_intra_prediction_edges[mode].needs_left && !have_left)))
+    {
+        if (prefilter_toplevel_sb_edge) {
+            dst_top = &prefilter_toplevel_sb_edge[x * 4];
+        } else {
+            dst_top = &dst[-PXSTRIDE(stride)];
+        }
+    }
+
+    if (av1_intra_prediction_edges[mode].needs_left) {
+        const int sz = th << 2;
+        pixel *const left = &topleft_out[-sz];
+
+        if (have_left) {
+            const int px_have = imin(sz, (h - y) << 2);
+
+            for (int i = 0; i < px_have; i++)
+                left[sz - 1 - i] = dst[PXSTRIDE(stride) * i - 1];
+            if (px_have < sz)
+                pixel_set(left, left[sz - px_have], sz - px_have);
+        } else {
+            pixel_set(left, have_top ? *dst_top : ((1 << BITDEPTH) >> 1) + 1, sz);
+        }
+
+        if (av1_intra_prediction_edges[mode].needs_bottomleft) {
+            const int have_bottomleft = (!have_left || y + th >= h) ? 0 :
+                                        (edge_flags & EDGE_I444_LEFT_HAS_BOTTOM);
+
+            if (have_bottomleft) {
+                const int px_have = imin(sz, (h - y - th) << 2);
+
+                for (int i = 0; i < px_have; i++)
+                    left[-(i + 1)] = dst[(sz + i) * PXSTRIDE(stride) - 1];
+                if (px_have < sz)
+                    pixel_set(left - sz, left[-px_have], sz - px_have);
+            } else {
+                pixel_set(left - sz, left[0], sz);
+            }
+        }
+    }
+
+    if (av1_intra_prediction_edges[mode].needs_top) {
+        const int sz = tw << 2;
+        pixel *const top = &topleft_out[1];
+
+        if (have_top) {
+            const int px_have = imin(sz, (w - x) << 2);
+            pixel_copy(top, dst_top, px_have);
+            if (px_have < sz)
+                pixel_set(top + px_have, top[px_have - 1], sz - px_have);
+        } else {
+            pixel_set(top, have_left ? dst[-1] : ((1 << BITDEPTH) >> 1) - 1, sz);
+        }
+
+        if (av1_intra_prediction_edges[mode].needs_topright) {
+            const int have_topright = (!have_top || x + tw >= w) ? 0 :
+                                      (edge_flags & EDGE_I444_TOP_HAS_RIGHT);
+
+            if (have_topright) {
+                const int px_have = imin(sz, (w - x - tw) << 2);
+
+                pixel_copy(top + sz, &dst_top[sz], px_have);
+                if (px_have < sz)
+                    pixel_set(top + sz + px_have, top[sz + px_have - 1],
+                              sz - px_have);
+            } else {
+                pixel_set(top + sz, top[sz - 1], sz);
+            }
+        }
+    }
+
+    if (av1_intra_prediction_edges[mode].needs_topleft) {
+        if (have_left) {
+            *topleft_out = have_top ? dst_top[-1] : dst[-1];
+        } else {
+            *topleft_out = have_top ? *dst_top : (1 << BITDEPTH) >> 1;
+        }
+        if (mode == Z2_PRED && tw + th >= 6)
+            *topleft_out = (topleft_out[-1] * 5 + topleft_out[0] * 6 +
+                            topleft_out[1] * 5 + 8) >> 4;
+    }
+
+    return mode;
+}
--- /dev/null
+++ b/src/ipred_tmpl.c
@@ -1,0 +1,757 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "common/attributes.h"
+#include "common/intops.h"
+
+#include "src/ipred.h"
+#include "src/tables.h"
+
+static NOINLINE void
+splat_dc(pixel *dst, const ptrdiff_t stride,
+         const int width, const int height, const unsigned dc)
+{
+    assert(dc <= (1 << BITDEPTH) - 1);
+#if BITDEPTH == 8
+    if (width > 4) {
+        const uint64_t dcN = dc * 0x0101010101010101ULL;
+        for (int y = 0; y < height; y++) {
+            for (int x = 0; x < width; x += sizeof(dcN))
+                *((uint64_t *) &dst[x]) = dcN;
+            dst += PXSTRIDE(stride);
+        }
+    } else {
+        const unsigned dcN = dc * 0x01010101U;
+        for (int y = 0; y < height; y++) {
+            for (int x = 0; x < width; x += sizeof(dcN))
+                *((unsigned *) &dst[x]) = dcN;
+            dst += PXSTRIDE(stride);
+        }
+    }
+#else
+    const uint64_t dcN = dc * 0x0001000100010001ULL;
+    for (int y = 0; y < height; y++) {
+        for (int x = 0; x < width; x += sizeof(dcN) >> 1)
+            *((uint64_t *) &dst[x]) = dcN;
+        dst += PXSTRIDE(stride);
+    }
+#endif
+}
+
+static NOINLINE void
+cfl_pred(pixel *dst, const ptrdiff_t stride,
+         const int width, const int height, const unsigned dc,
+         const int16_t *ac, const int alpha)
+{
+    for (int y = 0; y < height; y++) {
+        for (int x = 0; x < width; x++) {
+            const int diff = alpha * ac[x];
+            dst[x] = iclip_pixel(dc + apply_sign((abs(diff) + 32) >> 6, diff));
+        }
+        ac += width;
+        dst += PXSTRIDE(stride);
+    }
+}
+
+static unsigned dc_gen_top(const pixel *const topleft, const int width)
+{
+    unsigned dc = width >> 1;
+    for (int i = 0; i < width; i++)
+       dc += topleft[1 + i];
+    return dc >> ctz(width);
+}
+
+static void ipred_dc_top_c(pixel *dst, const ptrdiff_t stride,
+                           const pixel *const topleft,
+                           const int width, const int height, const int a)
+{
+    splat_dc(dst, stride, width, height, dc_gen_top(topleft, width));
+}
+
+static void ipred_cfl_top_c(pixel *dst, const ptrdiff_t stride,
+                            const pixel *const topleft,
+                            const int width, const int height,
+                            const int16_t *ac, const int alpha)
+{
+    cfl_pred(dst, stride, width, height, dc_gen_top(topleft, width), ac, alpha);
+}
+
+static unsigned dc_gen_left(const pixel *const topleft, const int height)
+{
+    unsigned dc = height >> 1;
+    for (int i = 0; i < height; i++)
+       dc += topleft[-(1 + i)];
+    return dc >> ctz(height);
+}
+
+static void ipred_dc_left_c(pixel *dst, const ptrdiff_t stride,
+                            const pixel *const topleft,
+                            const int width, const int height, const int a)
+{
+    splat_dc(dst, stride, width, height, dc_gen_left(topleft, height));
+}
+
+static void ipred_cfl_left_c(pixel *dst, const ptrdiff_t stride,
+                             const pixel *const topleft,
+                             const int width, const int height,
+                             const int16_t *ac, const int alpha)
+{
+    unsigned dc = dc_gen_left(topleft, height);
+    cfl_pred(dst, stride, width, height, dc, ac, alpha);
+}
+
+#if BITDEPTH == 8
+#define MULTIPLIER_1x2 0x5556
+#define MULTIPLIER_1x4 0x3334
+#define BASE_SHIFT 16
+#else
+#define MULTIPLIER_1x2 0xAAAB
+#define MULTIPLIER_1x4 0x6667
+#define BASE_SHIFT 17
+#endif
+
+static unsigned
+dc_gen(const pixel *const topleft, const int width, const int height)
+{
+    unsigned dc = (width + height) >> 1;
+    for (int i = 0; i < width; i++)
+       dc += topleft[i + 1];
+    for (int i = 0; i < height; i++)
+       dc += topleft[-(i + 1)];
+    dc >>= ctz(width + height);
+
+    if (width != height) {
+        dc *= (width > height * 2 || height > width * 2) ? MULTIPLIER_1x4 :
+                                                           MULTIPLIER_1x2;
+        dc >>= BASE_SHIFT;
+    }
+    return dc;
+}
+
+static void ipred_dc_c(pixel *dst, const ptrdiff_t stride,
+                       const pixel *const topleft,
+                       const int width, const int height, const int a)
+{
+    splat_dc(dst, stride, width, height, dc_gen(topleft, width, height));
+}
+
+static void ipred_cfl_c(pixel *dst, const ptrdiff_t stride,
+                        const pixel *const topleft,
+                        const int width, const int height,
+                        const int16_t *ac, const int alpha)
+{
+    unsigned dc = dc_gen(topleft, width, height);
+    cfl_pred(dst, stride, width, height, dc, ac, alpha);
+}
+
+#undef MULTIPLIER_1x2
+#undef MULTIPLIER_1x4
+#undef BASE_SHIFT
+
+static void ipred_dc_128_c(pixel *dst, const ptrdiff_t stride,
+                           const pixel *const topleft,
+                           const int width, const int height, const int a)
+{
+    splat_dc(dst, stride, width, height, 1 << (BITDEPTH - 1));
+}
+
+static void ipred_cfl_128_c(pixel *dst, const ptrdiff_t stride,
+                            const pixel *const topleft,
+                            const int width, const int height,
+                            const int16_t *ac, const int alpha)
+{
+    cfl_pred(dst, stride, width, height, 1 << (BITDEPTH - 1), ac, alpha);
+}
+
+static void ipred_v_c(pixel *dst, const ptrdiff_t stride,
+                      const pixel *const topleft,
+                      const int width, const int height, const int a)
+{
+    for (int y = 0; y < height; y++) {
+        pixel_copy(dst, topleft + 1, width);
+        dst += PXSTRIDE(stride);
+    }
+}
+/* HOR_PRED: row y is pixel_set() to the single edge pixel topleft[-(1 + y)]. `a` is unused. */
+static void ipred_h_c(pixel *dst, const ptrdiff_t stride,
+                      const pixel *const topleft,
+                      const int width, const int height, const int a)
+{
+    for (int y = 0; y < height; y++) {
+        pixel_set(dst, topleft[-(1 + y)], width); // left edge is read at negative offsets
+        dst += PXSTRIDE(stride);
+    }
+}
+/* PAETH_PRED: per pixel, output whichever of left/top/top-left is closest to left + top - topleft. */
+static void ipred_paeth_c(pixel *dst, const ptrdiff_t stride,
+                          const pixel *const tl_ptr,
+                          const int width, const int height, const int a)
+{
+    const int topleft = tl_ptr[0];
+    for (int y = 0; y < height; y++) {
+        const int left = tl_ptr[-(y + 1)]; // left edge pixel for this row
+        for (int x = 0; x < width; x++) {
+            const int top = tl_ptr[1 + x]; // top edge pixel for this column
+            const int base = left + top - topleft;
+            const int ldiff = abs(left - base);
+            const int tdiff = abs(top - base);
+            const int tldiff = abs(topleft - base);
+
+            dst[x] = ldiff <= tdiff && ldiff <= tldiff ? left : // ties go to left first, then top
+                     tdiff <= tldiff ? top : topleft;
+        }
+        dst += PXSTRIDE(stride);
+    }
+}
+/* SMOOTH_PRED: sum of a top/bottom blend and a left/right blend, each pair weighted w and 256 - w. */
+static void ipred_smooth_c(pixel *dst, const ptrdiff_t stride,
+                           const pixel *const topleft,
+                           const int width, const int height, const int a)
+{
+    const uint8_t *const weights_hor = &dav1d_sm_weights[width]; // weights for a size start at index == size
+    const uint8_t *const weights_ver = &dav1d_sm_weights[height];
+    const int right = topleft[width], bottom = topleft[-height]; // last top pixel, last left pixel
+
+    for (int y = 0; y < height; y++) {
+        for (int x = 0; x < width; x++) {
+            const int pred = weights_ver[y]  * topleft[1 + x] +
+                      (256 - weights_ver[y]) * bottom +
+                             weights_hor[x]  * topleft[-(1 + y)] +
+                      (256 - weights_hor[x]) * right;
+            dst[x] = (pred + 256) >> 9; // weights total 512: round, then divide
+        }
+        dst += PXSTRIDE(stride);
+    }
+}
+/* SMOOTH_V_PRED: per row, blend the top edge pixel with topleft[-height] using weight w and 256 - w. */
+static void ipred_smooth_v_c(pixel *dst, const ptrdiff_t stride,
+                             const pixel *const topleft,
+                             const int width, const int height, const int a)
+{
+    const uint8_t *const weights_ver = &dav1d_sm_weights[height]; // weights for a size start at index == size
+    const int bottom = topleft[-height]; // last left pixel
+
+    for (int y = 0; y < height; y++) {
+        for (int x = 0; x < width; x++) {
+            const int pred = weights_ver[y]  * topleft[1 + x] +
+                      (256 - weights_ver[y]) * bottom;
+            dst[x] = (pred + 128) >> 8; // weights total 256: round, then divide
+        }
+        dst += PXSTRIDE(stride);
+    }
+}
+/* SMOOTH_H_PRED: per column, blend the left edge pixel with topleft[width] using weight w and 256 - w. */
+static void ipred_smooth_h_c(pixel *dst, const ptrdiff_t stride,
+                             const pixel *const topleft,
+                             const int width, const int height, const int a)
+{
+    const uint8_t *const weights_hor = &dav1d_sm_weights[width]; // weights for a size start at index == size
+    const int right = topleft[width]; // last top pixel
+
+    for (int y = 0; y < height; y++) {
+        for (int x = 0; x < width; x++) {
+            const int pred = weights_hor[x]  * topleft[-(y + 1)] +
+                      (256 - weights_hor[x]) * right;
+            dst[x] = (pred + 128) >> 8; // weights total 256: round, then divide
+        }
+        dst += PXSTRIDE(stride);
+    }
+}
+/* Edge filter strength (0 = no filtering, up to 3) chosen from blk_wh, the angle delta d and `type`. */
+static int get_filter_strength(const unsigned blk_wh, const unsigned d,
+                               const int type)
+{
+    int strength = 0;
+
+    if (type == 0) { // callers in this file pass width + height as blk_wh and is_sm as type
+        if (blk_wh <= 8) {
+            if (d >= 56) strength = 1;
+        } else if (blk_wh <= 12) {
+            if (d >= 40) strength = 1;
+        } else if (blk_wh <= 16) {
+            if (d >= 40) strength = 1;
+        } else if (blk_wh <= 24) {
+            if (d >= 8) strength = 1; // later tests overwrite earlier ones, so the largest match wins
+            if (d >= 16) strength = 2;
+            if (d >= 32) strength = 3;
+        } else if (blk_wh <= 32) {
+            if (d >= 1) strength = 1;
+            if (d >= 4) strength = 2;
+            if (d >= 32) strength = 3;
+        } else {
+            if (d >= 1) strength = 3;
+        }
+    } else {
+        if (blk_wh <= 8) {
+            if (d >= 40) strength = 1;
+            if (d >= 64) strength = 2;
+        } else if (blk_wh <= 16) {
+            if (d >= 20) strength = 1;
+            if (d >= 48) strength = 2;
+        } else if (blk_wh <= 24) {
+            if (d >= 4) strength = 3;
+        } else {
+            if (d >= 1) strength = 3;
+        }
+    }
+
+    return strength;
+}
+/* Writes `sz` filtered pixels to out using the 5-tap kernel for `strength` (1..3); `in` indices go through iclip(). */
+static void filter_edge(pixel *const out, const int sz, const pixel *const in,
+                        const int from, const int to, const unsigned strength)
+{
+    static const uint8_t kernel[3][5] = { // one row per strength; every row sums to 16
+        { 0, 4, 8, 4, 0 },
+        { 0, 5, 6, 5, 0 },
+        { 2, 4, 4, 4, 2 }
+    };
+
+    assert(strength > 0); // kernel[] is indexed with strength - 1
+    for (int i = 0; i < sz; i++) {
+        int s = 0;
+        for (int j = 0; j < 5; j++) // taps centred on i; index limited to [from, to - 1] via iclip()
+            s += in[iclip(i - 2 + j, from, to - 1)] * kernel[strength - 1][j];
+        out[i] = (s + 8) >> 4; // round, then divide by the kernel sum
+    }
+}
+/* Returns 1 when d < 40 and blk_wh is small enough (<= 8 if type is nonzero, else <= 16), otherwise 0. */
+static int get_upsample(const int blk_wh, const unsigned d, const int type) {
+    if (d >= 40) return 0;
+    return type ? (blk_wh <= 8) : (blk_wh <= 16);
+}
+/* 2x upsample: writes 2 * hsz - 1 pixels; even slots copy `in`, odd slots apply the (-1, 9, 9, -1) / 16 taps. */
+static void upsample_edge(pixel *const out, const int hsz,
+                          const pixel *const in, const int from, const int to)
+{
+    static const int8_t kernel[4] = { -1, 9, 9, -1 }; // sums to 16
+    int i;
+    for (i = 0; i < hsz - 1; i++) {
+        out[i * 2] = in[iclip(i, from, to - 1)];
+
+        int s = 0;
+        for (int j = 0; j < 4; j++) // taps cover in[i - 1 .. i + 2], indices passed through iclip()
+            s += in[iclip(i + j - 1, from, to - 1)] * kernel[j];
+        out[i * 2 + 1] = iclip_pixel((s + 8) >> 4); // negative taps can overshoot, hence iclip_pixel()
+    }
+    out[i * 2] = in[iclip(i, from, to - 1)]; // final even slot has no interpolated successor
+}
+/* Z1_PRED (angle < 90): directional prediction that reads only the top edge, optionally upsampled or filtered. */
+static void ipred_z1_c(pixel *dst, const ptrdiff_t stride,
+                       const pixel *const topleft_in,
+                       const int width, const int height, int angle)
+{
+    const int is_sm = angle >> 9; // bits above the low 9 are forwarded as `type`
+    angle &= 511; // low 9 bits hold the angle itself
+    assert(angle < 90);
+    const int dx = dav1d_dr_intra_derivative[angle]; // x step per row, 1/64 pixel units when not upsampled
+    pixel top_out[(64 + 64) * 2]; // scratch for the upsampled or filtered edge
+    const pixel *top;
+    int max_base_x; // from this index on, the row is padded with top[max_base_x]
+    const int upsample_above = get_upsample(width + height, 90 - angle, is_sm);
+    if (upsample_above) {
+        upsample_edge(top_out, width + height,
+                      &topleft_in[1], -1, width + imin(width, height));
+        top = top_out;
+        max_base_x = 2 * (width + height) - 2; // last slot upsample_edge() wrote
+    } else {
+        const int filter_strength =
+            get_filter_strength(width + height, 90 - angle, is_sm);
+
+        if (filter_strength) {
+            filter_edge(top_out, width + height,
+                        &topleft_in[1], -1, width + imin(width, height),
+                        filter_strength);
+            top = top_out;
+            max_base_x = width + height - 1;
+        } else {
+            top = &topleft_in[1]; // use the caller's edge directly
+            max_base_x = width + imin(width, height) - 1;
+        }
+    }
+    const int frac_bits = 6 - upsample_above;
+    const int base_inc = 1 << upsample_above; // upsampled edge has two slots per pixel
+    for (int y = 0, xpos = dx; y < height;
+         y++, dst += PXSTRIDE(stride), xpos += dx)
+    {
+        int base = xpos >> frac_bits;
+        const int frac = ((xpos << upsample_above) & 0x3F) >> 1; // interpolation weight, 0..31
+
+        for (int x = 0; x < width; x++, base += base_inc) {
+            if (base < max_base_x) {
+                const int v = top[base] * (32 - frac) + top[base + 1] * frac; // linear interpolation
+                dst[x] = iclip_pixel((v + 16) >> 5);
+            } else {
+                pixel_set(&dst[x], top[max_base_x], width - x); // remainder of the row
+                break;
+            }
+        }
+    }
+}
+/* Z2_PRED (90 < angle < 180): directional prediction that reads the top or the left edge, chosen per pixel. */
+static void ipred_z2_c(pixel *dst, const ptrdiff_t stride,
+                       const pixel *const topleft_in,
+                       const int width, const int height, int angle)
+{
+    const int is_sm = angle >> 9; // bits above the low 9 are forwarded as `type`
+    angle &= 511; // low 9 bits hold the angle itself
+    assert(angle > 90 && angle < 180);
+    const int dy = dav1d_dr_intra_derivative[angle - 90];
+    const int dx = dav1d_dr_intra_derivative[180 - angle];
+    const int upsample_left = get_upsample(width + height, 180 - angle, is_sm);
+    const int upsample_above = get_upsample(width + height, angle - 90, is_sm);
+    pixel edge[64 * 2 + 64 * 2 + 1]; // local edge: left part, corner, top part
+    pixel *const topleft = &edge[height * 2]; // corner; 2 * height slots below it for the left edge
+
+    if (upsample_above) {
+        upsample_edge(topleft, width + 1, topleft_in, 0, width + 1);
+    } else {
+        const int filter_strength =
+            get_filter_strength(width + height, angle - 90, is_sm);
+
+        if (filter_strength) {
+            filter_edge(&topleft[1], width, &topleft_in[1], -1, width,
+                        filter_strength);
+        } else {
+            pixel_copy(&topleft[1], &topleft_in[1], width);
+        }
+    }
+    if (upsample_left) {
+        upsample_edge(edge, height + 1, &topleft_in[-height], 0, height + 1);
+    } else {
+        const int filter_strength =
+            get_filter_strength(width + height, 180 - angle, is_sm);
+
+        if (filter_strength) {
+            filter_edge(&topleft[-height], height, &topleft_in[-height],
+                        0, height + 1, filter_strength);
+        } else {
+            pixel_copy(&topleft[-height], &topleft_in[-height], height);
+        }
+    }
+    *topleft = *topleft_in; // corner pixel is taken from the input as-is
+
+    const int min_base_x = -(1 << upsample_above); // smallest base_x still served by the top edge
+    const int frac_bits_y = 6 - upsample_left, frac_bits_x = 6 - upsample_above;
+    const int base_inc_x = 1 << upsample_above;
+    const pixel *const left = &topleft[-(1 << upsample_left)]; // indexed downwards: left[-k]
+    const pixel *const top = &topleft[1 << upsample_above];
+    for (int y = 0, xpos = -dx; y < height;
+         y++, xpos -= dx, dst += PXSTRIDE(stride))
+    {
+        int base_x = xpos >> frac_bits_x;
+        const int frac_x = ((xpos * (1 << upsample_above)) & 0x3F) >> 1; // weight, 0..31
+
+        for (int x = 0, ypos = (y << 6) - dy; x < width;
+             x++, base_x += base_inc_x, ypos -= dy)
+        {
+            int v;
+
+            if (base_x >= min_base_x) { // interpolate along the top edge
+                v = top[base_x] * (32 - frac_x) + top[base_x + 1] * frac_x;
+            } else { // otherwise interpolate along the left edge
+                const int base_y = ypos >> frac_bits_y;
+                assert(base_y >= -(1 << upsample_left));
+                const int frac_y = ((ypos * (1 << upsample_left)) & 0x3F) >> 1;
+                v = left[-base_y] * (32 - frac_y) + left[-(base_y + 1)] * frac_y;
+            }
+            dst[x] = iclip_pixel((v + 16) >> 5);
+        }
+    }
+}
+/* Z3_PRED (angle > 180): directional prediction that reads only the left edge, optionally upsampled or filtered. */
+static void ipred_z3_c(pixel *dst, const ptrdiff_t stride,
+                       const pixel *const topleft_in,
+                       const int width, const int height, int angle)
+{
+    const int is_sm = angle >> 9; // bits above the low 9 are forwarded as `type`
+    angle &= 511; // low 9 bits hold the angle itself
+    assert(angle > 180);
+    const int dy = dav1d_dr_intra_derivative[270 - angle]; // y step per column
+    pixel left_out[(64 + 64) * 2]; // scratch for the upsampled or filtered edge
+    const pixel *left; // indexed downwards: left[-k]
+    int max_base_y; // from this index on, the column is padded with left[-max_base_y]
+    const int upsample_left = get_upsample(width + height, angle - 180, is_sm);
+    if (upsample_left) {
+        upsample_edge(left_out, width + height,
+                      &topleft_in[-(width + height)],
+                      imax(width - height, 0), width + height + 1);
+        left = &left_out[2 * (width + height) - 2]; // last slot upsample_edge() wrote
+        max_base_y = 2 * (width + height) - 2;
+    } else {
+        const int filter_strength =
+            get_filter_strength(width + height, angle - 180, is_sm);
+
+        if (filter_strength) {
+            filter_edge(left_out, width + height,
+                        &topleft_in[-(width + height)],
+                        imax(width - height, 0), width + height + 1,
+                        filter_strength);
+            left = &left_out[width + height - 1]; // last slot filter_edge() wrote
+            max_base_y = width + height - 1;
+        } else {
+            left = &topleft_in[-1]; // use the caller's edge directly
+            max_base_y = height + imin(width, height) - 1;
+        }
+    }
+    const int frac_bits = 6 - upsample_left;
+    const int base_inc = 1 << upsample_left; // upsampled edge has two slots per pixel
+    for (int x = 0, ypos = dy; x < width; x++, ypos += dy) { // output is produced column by column
+        int base = ypos >> frac_bits;
+        const int frac = ((ypos << upsample_left) & 0x3F) >> 1; // interpolation weight, 0..31
+
+        for (int y = 0; y < height; y++, base += base_inc) {
+            if (base < max_base_y) {
+                const int v = left[-base] * (32 - frac) +
+                              left[-(base + 1)] * frac;
+                dst[y * PXSTRIDE(stride) + x] = iclip_pixel((v + 16) >> 5);
+            } else {
+                do { // remainder of the column
+                    dst[y * PXSTRIDE(stride) + x] = left[-max_base_y];
+                } while (++y < height);
+                break;
+            }
+        }
+    }
+}
+
+/* FILTER_PRED, up to 32x32 only: predicts 4x2 sub-blocks, each from 7 neighbouring pixels (p0..p6). */
+static void ipred_filter_c(pixel *dst, const ptrdiff_t stride,
+                           const pixel *const topleft_in,
+                           const int width, const int height, int filt_idx)
+{
+    filt_idx &= 511; // only the low 9 bits select the filter
+    assert(filt_idx < 5);
+
+    const int8_t *const filter = dav1d_filter_intra_taps[filt_idx];
+    int x, y;
+    ptrdiff_t left_stride;
+    const pixel *left, *topleft, *top;
+
+    top = &topleft_in[1];
+    for (y = 0; y < height; y += 2) {
+        topleft = &topleft_in[-y];
+        left = &topleft[-1];
+        left_stride = -1; // first sub-block of a row pair reads its two left pixels from the edge array
+        for (x = 0; x < width; x += 4) {
+            const int p0 = *topleft;
+            const int p1 = top[0], p2 = top[1], p3 = top[2], p4 = top[3];
+            const int p5 = left[0 * left_stride], p6 = left[1 * left_stride];
+            pixel *ptr = &dst[x];
+            const int8_t *flt_ptr = filter;
+
+            for (int yy = 0; yy < 2; yy++) {
+                for (int xx = 0; xx < 4; xx++, flt_ptr += 2) { // flt_ptr advances 2 entries per output pixel
+                    int acc = flt_ptr[ 0] * p0 + flt_ptr[ 1] * p1 +
+                              flt_ptr[16] * p2 + flt_ptr[17] * p3 +
+                              flt_ptr[32] * p4 + flt_ptr[33] * p5 +
+                              flt_ptr[48] * p6;
+                    ptr[xx] = iclip_pixel((acc + 8) >> 4);
+                }
+                ptr += PXSTRIDE(stride);
+            }
+            left = &dst[x + 4 - 1]; // next sub-block's left is the last column just written
+            left_stride = PXSTRIDE(stride);
+            top += 4;
+            topleft = &top[-1];
+        }
+        top = &dst[PXSTRIDE(stride)]; // second row just written is the next row pair's top
+        dst = &dst[PXSTRIDE(stride) * 2];
+    }
+}
+/* Fills ac[width * height] from the luma at ypx: (sub)sample, replicate into the padding, subtract the mean. */
+static NOINLINE void
+cfl_ac_c(int16_t *ac, const pixel *ypx, const ptrdiff_t stride,
+         const int w_pad, const int h_pad, const int width, const int height,
+         const int ss_hor, const int ss_ver, const int log2sz)
+{
+    int y, x;
+    int16_t *const ac_orig = ac;
+
+    assert(w_pad >= 0 && w_pad * 4 < width); // padding counts units of 4 output pixels
+    assert(h_pad >= 0 && h_pad * 4 < height);
+
+    for (y = 0; y < height - 4 * h_pad; y++) {
+        for (x = 0; x < width - 4 * w_pad; x++) {
+            int ac_sum = ypx[x << ss_hor];
+            if (ss_hor) ac_sum += ypx[x * 2 + 1];
+            if (ss_ver) {
+                ac_sum += ypx[(x << ss_hor) + PXSTRIDE(stride)];
+                if (ss_hor) ac_sum += ypx[x * 2 + 1 + PXSTRIDE(stride)];
+            }
+            ac[x] = ac_sum << (1 + !ss_ver + !ss_hor); // 1, 2 or 4 summed samples all become 8x their average
+        }
+        for (; x < width; x++)
+            ac[x] = ac[x - 1]; // right padding repeats the last computed value
+        ac += width;
+        ypx += PXSTRIDE(stride) << ss_ver;
+    }
+    for (; y < height; y++) {
+        memcpy(ac, &ac[-width], width * sizeof(*ac)); // bottom padding repeats the previous row
+        ac += width;
+    }
+
+    int sum = (1 << log2sz) >> 1; // rounding term for the mean
+    for (ac = ac_orig, y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            sum += ac[x];
+        ac += width;
+    }
+    sum >>= log2sz; // the wrappers in this file pass log2sz == log2(width * height)
+
+    // subtract DC
+    for (ac = ac_orig, y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            ac[x] -= sum;
+        ac += width;
+    }
+}
+/* Defines cfl_ac_<lw>x<lh>_to_<cw>x<ch>_c(): cfl_ac_c() with a fixed cw x ch output, subsampling and log2sz. */
+#define cfl_ac_fn(lw, lh, cw, ch, ss_hor, ss_ver, log2sz) \
+static void cfl_ac_##lw##x##lh##_to_##cw##x##ch##_c(int16_t *const ac, \
+                                                    const pixel *const ypx, \
+                                                    const ptrdiff_t stride, \
+                                                    const int w_pad, \
+                                                    const int h_pad) \
+{ \
+    cfl_ac_c(ac, ypx, stride, w_pad, h_pad, cw, ch, ss_hor, ss_ver, log2sz); \
+}
+/* ss_hor = 1, ss_ver = 1: the set installed for DAV1D_PIXEL_LAYOUT_I420; log2sz is log2(cw * ch) */
+cfl_ac_fn( 8,  8,  4,  4, 1, 1, 4)
+cfl_ac_fn( 8, 16,  4,  8, 1, 1, 5)
+cfl_ac_fn( 8, 32,  4, 16, 1, 1, 6)
+cfl_ac_fn(16,  8,  8,  4, 1, 1, 5)
+cfl_ac_fn(16, 16,  8,  8, 1, 1, 6)
+cfl_ac_fn(16, 32,  8, 16, 1, 1, 7)
+cfl_ac_fn(32,  8, 16,  4, 1, 1, 6)
+cfl_ac_fn(32, 16, 16,  8, 1, 1, 7)
+cfl_ac_fn(32, 32, 16, 16, 1, 1, 8)
+/* ss_hor = 1, ss_ver = 0: the set installed for DAV1D_PIXEL_LAYOUT_I422 */
+cfl_ac_fn( 8,  4,  4,  4, 1, 0, 4)
+cfl_ac_fn( 8,  8,  4,  8, 1, 0, 5)
+cfl_ac_fn(16,  4,  8,  4, 1, 0, 5)
+cfl_ac_fn(16,  8,  8,  8, 1, 0, 6)
+cfl_ac_fn(16, 16,  8, 16, 1, 0, 7)
+cfl_ac_fn(32,  8, 16,  8, 1, 0, 7)
+cfl_ac_fn(32, 16, 16, 16, 1, 0, 8)
+cfl_ac_fn(32, 32, 16, 32, 1, 0, 9)
+/* ss_hor = 0, ss_ver = 0: the set installed for DAV1D_PIXEL_LAYOUT_I444 */
+cfl_ac_fn( 4,  4,  4,  4, 0, 0, 4)
+cfl_ac_fn( 4,  8,  4,  8, 0, 0, 5)
+cfl_ac_fn( 4, 16,  4, 16, 0, 0, 6)
+cfl_ac_fn( 8,  4,  8,  4, 0, 0, 5)
+cfl_ac_fn( 8,  8,  8,  8, 0, 0, 6)
+cfl_ac_fn( 8, 16,  8, 16, 0, 0, 7)
+cfl_ac_fn( 8, 32,  8, 32, 0, 0, 8)
+cfl_ac_fn(16,  4, 16,  4, 0, 0, 6)
+cfl_ac_fn(16,  8, 16,  8, 0, 0, 7)
+cfl_ac_fn(16, 16, 16, 16, 0, 0, 8)
+cfl_ac_fn(16, 32, 16, 32, 0, 0, 9)
+cfl_ac_fn(32,  8, 32,  8, 0, 0, 8)
+cfl_ac_fn(32, 16, 32, 16, 0, 0, 9)
+cfl_ac_fn(32, 32, 32, 32, 0, 0, 10)
+/* Palette prediction: every output pixel is pal[idx[x]]; idx is read as h rows of w entries. */
+static void pal_pred_c(pixel *dst, const ptrdiff_t stride,
+                       const uint16_t *const pal, const uint8_t *idx,
+                       const int w, const int h)
+{
+    for (int y = 0; y < h; y++) {
+        for (int x = 0; x < w; x++)
+            dst[x] = pal[idx[x]];
+        idx += w; // index rows are packed with pitch w, unlike dst
+        dst += PXSTRIDE(stride);
+    }
+}
+/* Fills the intra prediction DSP context with the C implementations from this file, then runs the x86 init if built. */
+void bitfn(dav1d_intra_pred_dsp_init)(Dav1dIntraPredDSPContext *const c) {
+    c->intra_pred[DC_PRED      ] = ipred_dc_c;
+    c->intra_pred[DC_128_PRED  ] = ipred_dc_128_c;
+    c->intra_pred[TOP_DC_PRED  ] = ipred_dc_top_c;
+    c->intra_pred[LEFT_DC_PRED ] = ipred_dc_left_c;
+    c->intra_pred[HOR_PRED     ] = ipred_h_c;
+    c->intra_pred[VERT_PRED    ] = ipred_v_c;
+    c->intra_pred[PAETH_PRED   ] = ipred_paeth_c;
+    c->intra_pred[SMOOTH_PRED  ] = ipred_smooth_c;
+    c->intra_pred[SMOOTH_V_PRED] = ipred_smooth_v_c;
+    c->intra_pred[SMOOTH_H_PRED] = ipred_smooth_h_c;
+    c->intra_pred[Z1_PRED      ] = ipred_z1_c;
+    c->intra_pred[Z2_PRED      ] = ipred_z2_c;
+    c->intra_pred[Z3_PRED      ] = ipred_z3_c;
+    c->intra_pred[FILTER_PRED  ] = ipred_filter_c;
+
+    // cfl functions are split per chroma subsampling type
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][ TX_4X4  ] = cfl_ac_8x8_to_4x4_c; // first index is layout - 1
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][RTX_4X8  ] = cfl_ac_8x16_to_4x8_c; // second index is the output size
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][RTX_4X16 ] = cfl_ac_8x32_to_4x16_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][RTX_8X4  ] = cfl_ac_16x8_to_8x4_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][ TX_8X8  ] = cfl_ac_16x16_to_8x8_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][RTX_8X16 ] = cfl_ac_16x32_to_8x16_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][RTX_16X4 ] = cfl_ac_32x8_to_16x4_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][RTX_16X8 ] = cfl_ac_32x16_to_16x8_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][ TX_16X16] = cfl_ac_32x32_to_16x16_c;
+
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][ TX_4X4  ] = cfl_ac_8x4_to_4x4_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][RTX_4X8  ] = cfl_ac_8x8_to_4x8_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][RTX_8X4  ] = cfl_ac_16x4_to_8x4_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][ TX_8X8  ] = cfl_ac_16x8_to_8x8_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][RTX_8X16 ] = cfl_ac_16x16_to_8x16_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][RTX_16X8 ] = cfl_ac_32x8_to_16x8_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][ TX_16X16] = cfl_ac_32x16_to_16x16_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][RTX_16X32] = cfl_ac_32x32_to_16x32_c;
+
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][ TX_4X4  ] = cfl_ac_4x4_to_4x4_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_4X8  ] = cfl_ac_4x8_to_4x8_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_4X16 ] = cfl_ac_4x16_to_4x16_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_8X4  ] = cfl_ac_8x4_to_8x4_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][ TX_8X8  ] = cfl_ac_8x8_to_8x8_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_8X16 ] = cfl_ac_8x16_to_8x16_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_8X32 ] = cfl_ac_8x32_to_8x32_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_16X4 ] = cfl_ac_16x4_to_16x4_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_16X8 ] = cfl_ac_16x8_to_16x8_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][ TX_16X16] = cfl_ac_16x16_to_16x16_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_16X32] = cfl_ac_16x32_to_16x32_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_32X8 ] = cfl_ac_32x8_to_32x8_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_32X16] = cfl_ac_32x16_to_32x16_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][ TX_32X32] = cfl_ac_32x32_to_32x32_c;
+
+    c->cfl_pred[DC_PRED     ] = ipred_cfl_c; // cfl_pred[] reuses the DC-mode enumerators as its index
+    c->cfl_pred[DC_128_PRED ] = ipred_cfl_128_c;
+    c->cfl_pred[TOP_DC_PRED ] = ipred_cfl_top_c;
+    c->cfl_pred[LEFT_DC_PRED] = ipred_cfl_left_c;
+
+    c->pal_pred = pal_pred_c;
+
+#if HAVE_ASM && ARCH_X86
+    bitfn(dav1d_intra_pred_dsp_init_x86)(c); // runs last on the same context; presumably installs asm versions
+#endif
+}
--- a/src/itx.c
+++ /dev/null
@@ -1,233 +1,0 @@
-/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- *    list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "config.h"
-
-#include <assert.h>
-#include <stddef.h>
-#include <stdint.h>
-#include <string.h>
-
-#include "common/attributes.h"
-#include "common/intops.h"
-
-#include "src/itx.h"
-
-#include "src/itx_1d.c"
-
-typedef void (*itx_1d_fn)(const coef *in, ptrdiff_t in_s,
-                          coef *out, ptrdiff_t out_s);
-
-static void NOINLINE
-inv_txfm_add_c(pixel *dst, const ptrdiff_t stride,
-               coef *const coeff, const int eob,
-               const int w, const int h, const int shift1, const int shift2,
-               const itx_1d_fn first_1d_fn, const itx_1d_fn second_1d_fn)
-{
-    int i, j;
-    const ptrdiff_t sh = imin(h, 32), sw = imin(w, 32);
-    assert((h >= 4 || h <= 64) && (w >= 4 || w <= 64));
-    // Maximum value for h and w is 64
-    coef tmp[4096 /* w * h */], out[64 /* h */], in_mem[64 /* w */];
-    const int is_rect2 = w * 2 == h || h * 2 == w;
-
-    if (w != sw) memset(&in_mem[sw], 0, (w - sw) * sizeof(*in_mem));
-    const int rnd1 = (1 << shift1) >> 1;
-    for (i = 0; i < sh; i++) {
-        if (w != sw || is_rect2) {
-            for (j = 0; j < sw; j++) {
-                in_mem[j] = coeff[i + j * sh];
-                if (is_rect2)
-                    in_mem[j] = (in_mem[j] * 2896 + 2048) >> 12;
-            }
-            first_1d_fn(in_mem, 1, &tmp[i * w], 1);
-        } else {
-            first_1d_fn(&coeff[i], sh, &tmp[i * w], 1);
-        }
-        for (j = 0; j < w; j++)
-            tmp[i * w + j] = (tmp[i * w + j] + (rnd1)) >> shift1;
-    }
-
-    if (h != sh) memset(&tmp[sh * w], 0, w * (h - sh) * sizeof(*tmp));
-    const int rnd2 = (1 << shift2) >> 1;
-    for (i = 0; i < w; i++) {
-        second_1d_fn(&tmp[i], w, out, 1);
-        for (j = 0; j < h; j++)
-            dst[i + j * PXSTRIDE(stride)] =
-                iclip_pixel(dst[i + j * PXSTRIDE(stride)] +
-                            ((out[j] + (rnd2)) >> shift2));
-    }
-    memset(coeff, 0, sizeof(*coeff) * sh * sw);
-}
-
-#define inv_txfm_fn(type1, type2, w, h, shift1, shift2) \
-static void \
-inv_txfm_add_##type1##_##type2##_##w##x##h##_c(pixel *dst, \
-                                               const ptrdiff_t stride, \
-                                               coef *const coeff, \
-                                               const int eob) \
-{ \
-    inv_txfm_add_c(dst, stride, coeff, eob, w, h, shift1, shift2, \
-                   inv_##type1##w##_1d, inv_##type2##h##_1d); \
-}
-
-#define inv_txfm_fn64(w, h, shift1, shift2) \
-inv_txfm_fn(dct, dct, w, h, shift1, shift2)
-
-#define inv_txfm_fn32(w, h, shift1, shift2) \
-inv_txfm_fn64(w, h, shift1, shift2) \
-inv_txfm_fn(identity, identity, w, h, shift1, shift2)
-
-#define inv_txfm_fn16(w, h, shift1, shift2) \
-inv_txfm_fn32(w, h, shift1, shift2) \
-inv_txfm_fn(adst,     dct,      w, h, shift1, shift2) \
-inv_txfm_fn(dct,      adst,     w, h, shift1, shift2) \
-inv_txfm_fn(adst,     adst,     w, h, shift1, shift2) \
-inv_txfm_fn(dct,      flipadst, w, h, shift1, shift2) \
-inv_txfm_fn(flipadst, dct,      w, h, shift1, shift2) \
-inv_txfm_fn(adst,     flipadst, w, h, shift1, shift2) \
-inv_txfm_fn(flipadst, adst,     w, h, shift1, shift2) \
-inv_txfm_fn(flipadst, flipadst, w, h, shift1, shift2) \
-inv_txfm_fn(identity, dct,      w, h, shift1, shift2) \
-inv_txfm_fn(dct,      identity, w, h, shift1, shift2) \
-
-#define inv_txfm_fn84(w, h, shift1, shift2) \
-inv_txfm_fn16(w, h, shift1, shift2) \
-inv_txfm_fn(identity, flipadst, w, h, shift1, shift2) \
-inv_txfm_fn(flipadst, identity, w, h, shift1, shift2) \
-inv_txfm_fn(identity, adst,     w, h, shift1, shift2) \
-inv_txfm_fn(adst,     identity, w, h, shift1, shift2) \
-
-inv_txfm_fn84( 4,  4, 0, 4)
-inv_txfm_fn84( 4,  8, 0, 4)
-inv_txfm_fn84( 4, 16, 1, 4)
-inv_txfm_fn84( 8,  4, 0, 4)
-inv_txfm_fn84( 8,  8, 1, 4)
-inv_txfm_fn84( 8, 16, 1, 4)
-inv_txfm_fn32( 8, 32, 2, 4)
-inv_txfm_fn84(16,  4, 1, 4)
-inv_txfm_fn84(16,  8, 1, 4)
-inv_txfm_fn16(16, 16, 2, 4)
-inv_txfm_fn32(16, 32, 1, 4)
-inv_txfm_fn64(16, 64, 2, 4)
-inv_txfm_fn32(32,  8, 2, 4)
-inv_txfm_fn32(32, 16, 1, 4)
-inv_txfm_fn32(32, 32, 2, 4)
-inv_txfm_fn64(32, 64, 1, 4)
-inv_txfm_fn64(64, 16, 2, 4)
-inv_txfm_fn64(64, 32, 1, 4)
-inv_txfm_fn64(64, 64, 2, 4)
-
-static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride,
-                                       coef *const coeff, const int eob)
-{
-    int i, j;
-    coef tmp[4 * 4], out[4];
-
-    for (i = 0; i < 4; i++)
-        inv_wht4_1d(&coeff[i], 4, &tmp[i * 4], 1, 0);
-
-    for (i = 0; i < 4; i++) {
-        inv_wht4_1d(&tmp[i], 4, out, 1, 1);
-        for (j = 0; j < 4; j++)
-            dst[i + j * PXSTRIDE(stride)] =
-                iclip_pixel(dst[i + j * PXSTRIDE(stride)] + out[j]);
-    }
-    memset(coeff, 0, sizeof(*coeff) * 4 * 4);
-}
-
-void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c) {
-#define assign_itx_all_fn64(w, h, pfx) \
-    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
-        inv_txfm_add_dct_dct_##w##x##h##_c
-
-#define assign_itx_all_fn32(w, h, pfx) \
-    assign_itx_all_fn64(w, h, pfx); \
-    c->itxfm_add[pfx##TX_##w##X##h][IDTX] = \
-        inv_txfm_add_identity_identity_##w##x##h##_c
-
-#define assign_itx_all_fn16(w, h, pfx) \
-    assign_itx_all_fn32(w, h, pfx); \
-    c->itxfm_add[pfx##TX_##w##X##h][DCT_ADST ] = \
-        inv_txfm_add_adst_dct_##w##x##h##_c; \
-    c->itxfm_add[pfx##TX_##w##X##h][ADST_DCT ] = \
-        inv_txfm_add_dct_adst_##w##x##h##_c; \
-    c->itxfm_add[pfx##TX_##w##X##h][ADST_ADST] = \
-        inv_txfm_add_adst_adst_##w##x##h##_c; \
-    c->itxfm_add[pfx##TX_##w##X##h][ADST_FLIPADST] = \
-        inv_txfm_add_flipadst_adst_##w##x##h##_c; \
-    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_ADST] = \
-        inv_txfm_add_adst_flipadst_##w##x##h##_c; \
-    c->itxfm_add[pfx##TX_##w##X##h][DCT_FLIPADST] = \
-        inv_txfm_add_flipadst_dct_##w##x##h##_c; \
-    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_DCT] = \
-        inv_txfm_add_dct_flipadst_##w##x##h##_c; \
-    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_FLIPADST] = \
-        inv_txfm_add_flipadst_flipadst_##w##x##h##_c; \
-    c->itxfm_add[pfx##TX_##w##X##h][H_DCT] = \
-        inv_txfm_add_dct_identity_##w##x##h##_c; \
-    c->itxfm_add[pfx##TX_##w##X##h][V_DCT] = \
-        inv_txfm_add_identity_dct_##w##x##h##_c
-
-#define assign_itx_all_fn84(w, h, pfx) \
-    assign_itx_all_fn16(w, h, pfx); \
-    c->itxfm_add[pfx##TX_##w##X##h][H_FLIPADST] = \
-        inv_txfm_add_flipadst_identity_##w##x##h##_c; \
-    c->itxfm_add[pfx##TX_##w##X##h][V_FLIPADST] = \
-        inv_txfm_add_identity_flipadst_##w##x##h##_c; \
-    c->itxfm_add[pfx##TX_##w##X##h][H_ADST] = \
-        inv_txfm_add_adst_identity_##w##x##h##_c; \
-    c->itxfm_add[pfx##TX_##w##X##h][V_ADST] = \
-        inv_txfm_add_identity_adst_##w##x##h##_c; \
-
-    memset(c, 0, sizeof(*c)); /* Zero unused function pointer elements. */
-
-    c->itxfm_add[TX_4X4][WHT_WHT] = inv_txfm_add_wht_wht_4x4_c;
-    assign_itx_all_fn84( 4,  4, );
-    assign_itx_all_fn84( 4,  8, R);
-    assign_itx_all_fn84( 4, 16, R);
-    assign_itx_all_fn84( 8,  4, R);
-    assign_itx_all_fn84( 8,  8, );
-    assign_itx_all_fn84( 8, 16, R);
-    assign_itx_all_fn32( 8, 32, R);
-    assign_itx_all_fn84(16,  4, R);
-    assign_itx_all_fn84(16,  8, R);
-    assign_itx_all_fn16(16, 16, );
-    assign_itx_all_fn32(16, 32, R);
-    assign_itx_all_fn64(16, 64, R);
-    assign_itx_all_fn32(32,  8, R);
-    assign_itx_all_fn32(32, 16, R);
-    assign_itx_all_fn32(32, 32, );
-    assign_itx_all_fn64(32, 64, R);
-    assign_itx_all_fn64(64, 16, R);
-    assign_itx_all_fn64(64, 32, R);
-    assign_itx_all_fn64(64, 64, );
-
-#if HAVE_ASM && ARCH_X86
-    bitfn(dav1d_itx_dsp_init_x86)(c);
-#endif
-}
--- /dev/null
+++ b/src/itx_tmpl.c
@@ -1,0 +1,233 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "common/attributes.h"
+#include "common/intops.h"
+
+#include "src/itx.h"
+
+#include "src/itx_1d.c"
+
+typedef void (*itx_1d_fn)(const coef *in, ptrdiff_t in_s,
+                          coef *out, ptrdiff_t out_s);
+
+// Shared 2-D inverse transform: adds the w x h result to dst (eob is unused).
+static void NOINLINE inv_txfm_add_c(pixel *dst, const ptrdiff_t stride,
+    coef *const coeff, const int eob, const int w, const int h,
+    const int shift1, const int shift2,
+    const itx_1d_fn first_1d_fn, const itx_1d_fn second_1d_fn)
+{
+    int i, j;
+    const ptrdiff_t sh = imin(h, 32), sw = imin(w, 32);
+    assert((h >= 4 && h <= 64) && (w >= 4 && w <= 64));
+    // Scratch is sized for the 64x64 maximum that the assert above checks.
+    coef tmp[4096 /* w * h */], out[64 /* h */], in_mem[64 /* w */];
+    const int is_rect2 = w * 2 == h || h * 2 == w;
+    // Pass 1: first_1d_fn on coeff[i + j * sh] for each i < sh, into tmp row i.
+    if (w != sw) memset(&in_mem[sw], 0, (w - sw) * sizeof(*in_mem));
+    const int rnd1 = (1 << shift1) >> 1;
+    for (i = 0; i < sh; i++) {
+        if (w != sw || is_rect2) {
+            for (j = 0; j < sw; j++) {
+                in_mem[j] = coeff[i + j * sh];
+                if (is_rect2) // 2:1 blocks: scale by 2896/4096 (~1/sqrt(2))
+                    in_mem[j] = (in_mem[j] * 2896 + 2048) >> 12;
+            }
+            first_1d_fn(in_mem, 1, &tmp[i * w], 1);
+        } else {
+            first_1d_fn(&coeff[i], sh, &tmp[i * w], 1);
+        }
+        for (j = 0; j < w; j++)
+            tmp[i * w + j] = (tmp[i * w + j] + (rnd1)) >> shift1;
+    }
+    // Pass 2: second_1d_fn down each tmp column, rounded by shift2, added to dst.
+    if (h != sh) memset(&tmp[sh * w], 0, w * (h - sh) * sizeof(*tmp));
+    const int rnd2 = (1 << shift2) >> 1;
+    for (i = 0; i < w; i++) {
+        second_1d_fn(&tmp[i], w, out, 1);
+        for (j = 0; j < h; j++)
+            dst[i + j * PXSTRIDE(stride)] =
+                iclip_pixel(dst[i + j * PXSTRIDE(stride)] +
+                            ((out[j] + (rnd2)) >> shift2));
+    }
+    memset(coeff, 0, sizeof(*coeff) * sh * sw);
+}
+
+#define inv_txfm_fn(type1, type2, w, h, shift1, shift2) \
+static void \
+inv_txfm_add_##type1##_##type2##_##w##x##h##_c(pixel *dst, \
+                                               const ptrdiff_t stride, \
+                                               coef *const coeff, \
+                                               const int eob) \
+{ \
+    inv_txfm_add_c(dst, stride, coeff, eob, w, h, shift1, shift2, \
+                   inv_##type1##w##_1d, inv_##type2##h##_1d); \
+}
+// inv_txfm_fn (above) defines one wrapper around inv_txfm_add_c; fn64 = dct/dct only.
+#define inv_txfm_fn64(w, h, shift1, shift2) \
+inv_txfm_fn(dct, dct, w, h, shift1, shift2)
+// fn32 = fn64 plus identity/identity.
+#define inv_txfm_fn32(w, h, shift1, shift2) \
+inv_txfm_fn64(w, h, shift1, shift2) \
+inv_txfm_fn(identity, identity, w, h, shift1, shift2)
+// fn16 = fn32 plus the dct/adst/flipadst pairs and identity paired with dct.
+#define inv_txfm_fn16(w, h, shift1, shift2) \
+inv_txfm_fn32(w, h, shift1, shift2) \
+inv_txfm_fn(adst,     dct,      w, h, shift1, shift2) \
+inv_txfm_fn(dct,      adst,     w, h, shift1, shift2) \
+inv_txfm_fn(adst,     adst,     w, h, shift1, shift2) \
+inv_txfm_fn(dct,      flipadst, w, h, shift1, shift2) \
+inv_txfm_fn(flipadst, dct,      w, h, shift1, shift2) \
+inv_txfm_fn(adst,     flipadst, w, h, shift1, shift2) \
+inv_txfm_fn(flipadst, adst,     w, h, shift1, shift2) \
+inv_txfm_fn(flipadst, flipadst, w, h, shift1, shift2) \
+inv_txfm_fn(identity, dct,      w, h, shift1, shift2) \
+inv_txfm_fn(dct,      identity, w, h, shift1, shift2) \
+
+#define inv_txfm_fn84(w, h, shift1, shift2) \
+inv_txfm_fn16(w, h, shift1, shift2) \
+inv_txfm_fn(identity, flipadst, w, h, shift1, shift2) \
+inv_txfm_fn(flipadst, identity, w, h, shift1, shift2) \
+inv_txfm_fn(identity, adst,     w, h, shift1, shift2) \
+inv_txfm_fn(adst,     identity, w, h, shift1, shift2) \
+
+inv_txfm_fn84( 4,  4, 0, 4) // fn84 = fn16 plus identity paired with adst/flipadst
+inv_txfm_fn84( 4,  8, 0, 4)
+inv_txfm_fn84( 4, 16, 1, 4)
+inv_txfm_fn84( 8,  4, 0, 4)
+inv_txfm_fn84( 8,  8, 1, 4)
+inv_txfm_fn84( 8, 16, 1, 4)
+inv_txfm_fn32( 8, 32, 2, 4)
+inv_txfm_fn84(16,  4, 1, 4)
+inv_txfm_fn84(16,  8, 1, 4)
+inv_txfm_fn16(16, 16, 2, 4)
+inv_txfm_fn32(16, 32, 1, 4)
+inv_txfm_fn64(16, 64, 2, 4)
+inv_txfm_fn32(32,  8, 2, 4)
+inv_txfm_fn32(32, 16, 1, 4)
+inv_txfm_fn32(32, 32, 2, 4)
+inv_txfm_fn64(32, 64, 1, 4)
+inv_txfm_fn64(64, 16, 2, 4)
+inv_txfm_fn64(64, 32, 1, 4)
+inv_txfm_fn64(64, 64, 2, 4)
+
+// 4x4 WHT: two inv_wht4_1d passes, result added to dst with clipping (eob unused).
+static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride,
+                                       coef *const coeff, const int eob)
+{
+    const ptrdiff_t px_stride = PXSTRIDE(stride);
+    coef tmp[4 * 4], col[4];
+    // Pass 1: inv_wht4_1d on &coeff[n] for n = 0..3, into tmp (last argument 0).
+    for (int n = 0; n < 4; n++)
+        inv_wht4_1d(&coeff[n], 4, &tmp[n * 4], 1, 0);
+    // Pass 2: inv_wht4_1d on &tmp[x] (last argument 1), then accumulate into dst.
+    for (int x = 0; x < 4; x++) {
+        inv_wht4_1d(&tmp[x], 4, col, 1, 1);
+        for (int y = 0; y < 4; y++)
+            dst[x + y * px_stride] = iclip_pixel(dst[x + y * px_stride] + col[y]);
+    }
+    memset(coeff, 0, sizeof(*coeff) * 4 * 4);
+}
+
+void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c) {
+#define assign_itx_all_fn64(w, h, pfx) \
+    c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
+        inv_txfm_add_dct_dct_##w##x##h##_c
+    // fn32: everything in fn64 (DCT_DCT) plus IDTX.
+#define assign_itx_all_fn32(w, h, pfx) \
+    assign_itx_all_fn64(w, h, pfx); \
+    c->itxfm_add[pfx##TX_##w##X##h][IDTX] = \
+        inv_txfm_add_identity_identity_##w##x##h##_c
+    // fn16: adds the DCT/ADST/FLIPADST pairs, H_DCT and V_DCT (e.g. DCT_ADST -> adst_dct).
+#define assign_itx_all_fn16(w, h, pfx) \
+    assign_itx_all_fn32(w, h, pfx); \
+    c->itxfm_add[pfx##TX_##w##X##h][DCT_ADST ] = \
+        inv_txfm_add_adst_dct_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][ADST_DCT ] = \
+        inv_txfm_add_dct_adst_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][ADST_ADST] = \
+        inv_txfm_add_adst_adst_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][ADST_FLIPADST] = \
+        inv_txfm_add_flipadst_adst_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_ADST] = \
+        inv_txfm_add_adst_flipadst_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][DCT_FLIPADST] = \
+        inv_txfm_add_flipadst_dct_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_DCT] = \
+        inv_txfm_add_dct_flipadst_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_FLIPADST] = \
+        inv_txfm_add_flipadst_flipadst_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][H_DCT] = \
+        inv_txfm_add_dct_identity_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][V_DCT] = \
+        inv_txfm_add_identity_dct_##w##x##h##_c
+    // fn84: fn16 plus H_FLIPADST, V_FLIPADST, H_ADST and V_ADST.
+#define assign_itx_all_fn84(w, h, pfx) \
+    assign_itx_all_fn16(w, h, pfx); \
+    c->itxfm_add[pfx##TX_##w##X##h][H_FLIPADST] = \
+        inv_txfm_add_flipadst_identity_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][V_FLIPADST] = \
+        inv_txfm_add_identity_flipadst_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][H_ADST] = \
+        inv_txfm_add_adst_identity_##w##x##h##_c; \
+    c->itxfm_add[pfx##TX_##w##X##h][V_ADST] = \
+        inv_txfm_add_identity_adst_##w##x##h##_c; \
+
+    memset(c, 0, sizeof(*c)); /* Zero unused function pointer elements. */
+    // Install the C functions; an empty pfx selects TX_<w>X<h>, R selects RTX_<w>X<h>.
+    c->itxfm_add[TX_4X4][WHT_WHT] = inv_txfm_add_wht_wht_4x4_c;
+    assign_itx_all_fn84( 4,  4, );
+    assign_itx_all_fn84( 4,  8, R);
+    assign_itx_all_fn84( 4, 16, R);
+    assign_itx_all_fn84( 8,  4, R);
+    assign_itx_all_fn84( 8,  8, );
+    assign_itx_all_fn84( 8, 16, R);
+    assign_itx_all_fn32( 8, 32, R);
+    assign_itx_all_fn84(16,  4, R);
+    assign_itx_all_fn84(16,  8, R);
+    assign_itx_all_fn16(16, 16, );
+    assign_itx_all_fn32(16, 32, R);
+    assign_itx_all_fn64(16, 64, R);
+    assign_itx_all_fn32(32,  8, R);
+    assign_itx_all_fn32(32, 16, R);
+    assign_itx_all_fn32(32, 32, );
+    assign_itx_all_fn64(32, 64, R);
+    assign_itx_all_fn64(64, 16, R);
+    assign_itx_all_fn64(64, 32, R);
+    assign_itx_all_fn64(64, 64, );
+    // Called last, with the table filled in above.
+#if HAVE_ASM && ARCH_X86
+    bitfn(dav1d_itx_dsp_init_x86)(c);
+#endif
+}
--- a/src/lf_apply.c
+++ /dev/null
@@ -1,306 +1,0 @@
-/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- *    list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "config.h"
-
-#include <assert.h>
-#include <string.h>
-
-#include "common/intops.h"
-
-#include "src/lf_apply.h"
-
-static inline void filter_plane_cols_y(const Dav1dFrameContext *const f,
-                                       const int have_left,
-                                       const uint8_t (*lvl)[4],
-                                       const ptrdiff_t b4_stride,
-                                       const uint16_t (*const mask)[3][2],
-                                       pixel *dst, const ptrdiff_t ls,
-                                       const int w,
-                                       const int starty4, const int endy4)
-{
-    const Dav1dDSPContext *const dsp = f->dsp;
-
-    // filter edges between columns (e.g. block1 | block2)
-    for (int x = 0; x < w; x++) {
-        if (!have_left && !x) continue;
-        uint32_t hmask[4];
-        if (!starty4) {
-            hmask[0] = mask[x][0][0];
-            hmask[1] = mask[x][1][0];
-            hmask[2] = mask[x][2][0];
-            if (endy4 > 16) {
-                hmask[0] |= mask[x][0][1] << 16;
-                hmask[1] |= mask[x][1][1] << 16;
-                hmask[2] |= mask[x][2][1] << 16;
-            }
-        } else {
-            hmask[0] = mask[x][0][1];
-            hmask[1] = mask[x][1][1];
-            hmask[2] = mask[x][2][1];
-        }
-        hmask[3] = 0;
-        dsp->lf.loop_filter_sb[0][0](&dst[x * 4], ls, hmask,
-                                     (const uint8_t(*)[4]) &lvl[x][0], b4_stride,
-                                     &f->lf.lim_lut, endy4 - starty4);
-    }
-}
-
-static inline void filter_plane_rows_y(const Dav1dFrameContext *const f,
-                                       const int have_top,
-                                       const uint8_t (*lvl)[4],
-                                       const ptrdiff_t b4_stride,
-                                       const uint16_t (*const mask)[3][2],
-                                       pixel *dst, const ptrdiff_t ls,
-                                       const int w,
-                                       const int starty4, const int endy4)
-{
-    const Dav1dDSPContext *const dsp = f->dsp;
-
-    //                                 block1
-    // filter edges between rows (e.g. ------)
-    //                                 block2
-    for (int y = starty4; y < endy4;
-         y++, dst += 4 * PXSTRIDE(ls), lvl += b4_stride)
-    {
-        if (!have_top && !y) continue;
-        const uint32_t vmask[4] = {
-            mask[y][0][0] | (mask[y][0][1] << 16),
-            mask[y][1][0] | (mask[y][1][1] << 16),
-            mask[y][2][0] | (mask[y][2][1] << 16),
-            0,
-        };
-        dsp->lf.loop_filter_sb[0][1](dst, ls, vmask,
-                                     (const uint8_t(*)[4]) &lvl[0][1], b4_stride,
-                                     &f->lf.lim_lut, w);
-    }
-}
-
-static inline void filter_plane_cols_uv(const Dav1dFrameContext *const f,
-                                        const int have_left,
-                                        const uint8_t (*lvl)[4],
-                                        const ptrdiff_t b4_stride,
-                                        const uint16_t (*const mask)[2][2],
-                                        pixel *const u, pixel *const v,
-                                        const ptrdiff_t ls, const int w,
-                                        const int starty4, const int endy4,
-                                        const int ss_ver)
-{
-    const Dav1dDSPContext *const dsp = f->dsp;
-
-    // filter edges between columns (e.g. block1 | block2)
-    for (int x = 0; x < w; x++) {
-        if (!have_left && !x) continue;
-        uint32_t hmask[3];
-        if (!starty4) {
-            hmask[0] = mask[x][0][0];
-            hmask[1] = mask[x][1][0];
-            if (endy4 > (16 >> ss_ver)) {
-                hmask[0] |= mask[x][0][1] << (16 >> ss_ver);
-                hmask[1] |= mask[x][1][1] << (16 >> ss_ver);
-            }
-        } else {
-            hmask[0] = mask[x][0][1];
-            hmask[1] = mask[x][1][1];
-        }
-        hmask[2] = 0;
-        dsp->lf.loop_filter_sb[1][0](&u[x * 4], ls, hmask,
-                                     (const uint8_t(*)[4]) &lvl[x][2], b4_stride,
-                                     &f->lf.lim_lut, endy4 - starty4);
-        dsp->lf.loop_filter_sb[1][0](&v[x * 4], ls, hmask,
-                                     (const uint8_t(*)[4]) &lvl[x][3], b4_stride,
-                                     &f->lf.lim_lut, endy4 - starty4);
-    }
-}
-
-static inline void filter_plane_rows_uv(const Dav1dFrameContext *const f,
-                                        const int have_top,
-                                        const uint8_t (*lvl)[4],
-                                        const ptrdiff_t b4_stride,
-                                        const uint16_t (*const mask)[2][2],
-                                        pixel *const u, pixel *const v,
-                                        const ptrdiff_t ls, const int w,
-                                        const int starty4, const int endy4,
-                                        const int ss_hor)
-{
-    const Dav1dDSPContext *const dsp = f->dsp;
-    ptrdiff_t off_l = 0;
-
-    //                                 block1
-    // filter edges between rows (e.g. ------)
-    //                                 block2
-    for (int y = starty4; y < endy4;
-         y++, off_l += 4 * PXSTRIDE(ls), lvl += b4_stride)
-    {
-        if (!have_top && !y) continue;
-        const uint32_t vmask[3] = {
-            mask[y][0][0] | (mask[y][0][1] << (16 >> ss_hor)),
-            mask[y][1][0] | (mask[y][1][1] << (16 >> ss_hor)),
-            0,
-        };
-        dsp->lf.loop_filter_sb[1][1](&u[off_l], ls, vmask,
-                                     (const uint8_t(*)[4]) &lvl[0][2], b4_stride,
-                                     &f->lf.lim_lut, w);
-        dsp->lf.loop_filter_sb[1][1](&v[off_l], ls, vmask,
-                                     (const uint8_t(*)[4]) &lvl[0][3], b4_stride,
-                                     &f->lf.lim_lut, w);
-    }
-}
-
-void bytefn(dav1d_loopfilter_sbrow)(const Dav1dFrameContext *const f,
-                                    pixel *const p[3], Av1Filter *const lflvl,
-                                    int sby, const int start_of_tile_row)
-{
-    int x, have_left;
-    // Don't filter outside the frame
-    const int hy4 = (f->cur.p.p.h + 3) >> 2;
-    const int have_top = sby > 0;
-    const int is_sb64 = !f->seq_hdr.sb128;
-    const int starty4 = (sby & is_sb64) << 4;
-    const int sbsz = 32 >> is_sb64;
-    const int sbl2 = 5 - is_sb64;
-    const int halign = (f->bh + 31) & ~31;
-    const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
-    const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
-    const int vmask = 16 >> ss_ver, hmask = 16 >> ss_hor;
-    const unsigned vmax = 1 << vmask, hmax = 1 << hmask;
-    const unsigned endy4 = starty4 + imin(hy4 - sby * sbsz, sbsz);
-    const unsigned uv_endy4 = (endy4 + ss_ver) >> ss_ver;
-
-    // fix lpf strength at tile col boundaries
-    const uint8_t *lpf_y = &f->lf.tx_lpf_right_edge[0][sby << sbl2];
-    const uint8_t *lpf_uv = &f->lf.tx_lpf_right_edge[1][sby << (sbl2 - ss_ver)];
-    for (int tile_col = 1;; tile_col++) {
-        x = f->frame_hdr.tiling.col_start_sb[tile_col];
-        if ((x << sbl2) >= f->bw) break;
-        const int bx4 = x & is_sb64 ? 16 : 0, cbx4 = bx4 >> ss_hor;
-        x >>= is_sb64;
-
-        uint16_t (*const y_hmask)[2] = lflvl[x].filter_y[0][bx4];
-        for (unsigned y = starty4, mask = 1 << y; y < endy4; y++, mask <<= 1) {
-            const int sidx = mask >= 0x10000;
-            const unsigned smask = mask >> (sidx << 4);
-            const int idx = 2 * !!(y_hmask[2][sidx] & smask) +
-                                !!(y_hmask[1][sidx] & smask);
-            y_hmask[2][sidx] &= ~smask;
-            y_hmask[1][sidx] &= ~smask;
-            y_hmask[0][sidx] &= ~smask;
-            y_hmask[imin(idx, lpf_y[y - starty4])][sidx] |= smask;
-        }
-
-        if (f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
-            uint16_t (*const uv_hmask)[2] = lflvl[x].filter_uv[0][cbx4];
-            for (unsigned y = starty4 >> ss_ver, uv_mask = 1 << y; y < uv_endy4;
-                 y++, uv_mask <<= 1)
-            {
-                const int sidx = uv_mask >= vmax;
-                const unsigned smask = uv_mask >> (sidx << (4 - ss_ver));
-                const int idx = !!(uv_hmask[1][sidx] & smask);
-                uv_hmask[1][sidx] &= ~smask;
-                uv_hmask[0][sidx] &= ~smask;
-                uv_hmask[imin(idx, lpf_uv[y - (starty4 >> ss_ver)])][sidx] |= smask;
-            }
-        }
-        lpf_y  += halign;
-        lpf_uv += halign >> ss_ver;
-    }
-
-    // fix lpf strength at tile row boundaries
-    if (start_of_tile_row) {
-        const BlockContext *a;
-        for (x = 0, a = &f->a[f->sb128w * (start_of_tile_row - 1)];
-             x < f->sb128w; x++, a++)
-        {
-            uint16_t (*const y_vmask)[2] = lflvl[x].filter_y[1][starty4];
-            for (unsigned mask = 1, i = 0; i < 32; mask <<= 1, i++) {
-                const int sidx = mask >= 0x10000;
-                const unsigned smask = mask >> (sidx << 4);
-                const int idx = 2 * !!(y_vmask[2][sidx] & smask) +
-                                    !!(y_vmask[1][sidx] & smask);
-                y_vmask[2][sidx] &= ~smask;
-                y_vmask[1][sidx] &= ~smask;
-                y_vmask[0][sidx] &= ~smask;
-                y_vmask[imin(idx, a->tx_lpf_y[i])][sidx] |= smask;
-            }
-
-            if (f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
-                uint16_t (*const uv_vmask)[2] = lflvl[x].filter_uv[1][starty4 >> ss_ver];
-                for (unsigned uv_mask = 1, i = 0; i < (32U >> ss_hor); uv_mask <<= 1, i++) {
-                    const int sidx = uv_mask >= hmax;
-                    const unsigned smask = uv_mask >> (sidx << (4 - ss_hor));
-                    const int idx = !!(uv_vmask[1][sidx] & smask);
-                    uv_vmask[1][sidx] &= ~smask;
-                    uv_vmask[0][sidx] &= ~smask;
-                    uv_vmask[imin(idx, a->tx_lpf_uv[i])][sidx] |= smask;
-                }
-            }
-        }
-    }
-
-    pixel *ptr;
-    uint8_t (*level_ptr)[4] = f->lf.level + f->b4_stride * sby * sbsz;
-    for (ptr = p[0], have_left = 0, x = 0; x < f->sb128w;
-         x++, have_left = 1, ptr += 128, level_ptr += 32)
-    {
-        filter_plane_cols_y(f, have_left, level_ptr, f->b4_stride,
-                            lflvl[x].filter_y[0], ptr, f->cur.p.stride[0],
-                            imin(32, f->bw - x * 32), starty4, endy4);
-    }
-
-    level_ptr = f->lf.level + f->b4_stride * sby * sbsz;
-    for (ptr = p[0], x = 0; x < f->sb128w; x++, ptr += 128, level_ptr += 32) {
-        filter_plane_rows_y(f, have_top, level_ptr, f->b4_stride,
-                            lflvl[x].filter_y[1], ptr, f->cur.p.stride[0],
-                            imin(32, f->bw - x * 32), starty4, endy4);
-    }
-
-    if (!f->frame_hdr.loopfilter.level_u && !f->frame_hdr.loopfilter.level_v)
-        return;
-
-    ptrdiff_t uv_off;
-    level_ptr = f->lf.level + f->b4_stride * (sby * sbsz >> ss_ver);
-    for (uv_off = 0, have_left = 0, x = 0; x < f->sb128w;
-         x++, have_left = 1, uv_off += 128 >> ss_hor, level_ptr += 32 >> ss_hor)
-    {
-        filter_plane_cols_uv(f, have_left, level_ptr, f->b4_stride,
-                             lflvl[x].filter_uv[0],
-                             &p[1][uv_off], &p[2][uv_off], f->cur.p.stride[1],
-                             (imin(32, f->bw - x * 32) + ss_hor) >> ss_hor,
-                             starty4 >> ss_ver, uv_endy4, ss_ver);
-    }
-
-    level_ptr = f->lf.level + f->b4_stride * (sby * sbsz >> ss_ver);
-    for (uv_off = 0, x = 0; x < f->sb128w;
-         x++, uv_off += 128 >> ss_hor, level_ptr += 32 >> ss_hor)
-    {
-        filter_plane_rows_uv(f, have_top, level_ptr, f->b4_stride,
-                             lflvl[x].filter_uv[1],
-                             &p[1][uv_off], &p[2][uv_off], f->cur.p.stride[1],
-                             (imin(32, f->bw - x * 32) + ss_hor) >> ss_hor,
-                             starty4 >> ss_ver, uv_endy4, ss_hor);
-    }
-}
--- /dev/null
+++ b/src/lf_apply_tmpl.c
@@ -1,0 +1,306 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "common/intops.h"
+
+#include "src/lf_apply.h"
+
+// Y plane: calls dsp->lf.loop_filter_sb[0][0] at each column edge x < w (4 pixels
+// apart in dst); the x == 0 edge is skipped when have_left is zero.
+static inline void
+filter_plane_cols_y(const Dav1dFrameContext *const f, const int have_left,
+                    const uint8_t (*lvl)[4], const ptrdiff_t b4_stride,
+                    const uint16_t (*const mask)[3][2],
+                    pixel *dst, const ptrdiff_t ls, const int w,
+                    const int starty4, const int endy4)
+{
+    const Dav1dDSPContext *const dsp = f->dsp;
+    // filter edges between columns (e.g. block1 | block2); the uint16_t masks are
+    // cast to unsigned before "<< 16" so the shift cannot overflow a promoted int
+    for (int x = 0; x < w; x++) {
+        if (!have_left && !x) continue;
+        uint32_t hmask[4];
+        if (!starty4) {
+            hmask[0] = mask[x][0][0];
+            hmask[1] = mask[x][1][0];
+            hmask[2] = mask[x][2][0];
+            if (endy4 > 16) {
+                hmask[0] |= (unsigned) mask[x][0][1] << 16;
+                hmask[1] |= (unsigned) mask[x][1][1] << 16;
+                hmask[2] |= (unsigned) mask[x][2][1] << 16;
+            }
+        } else {
+            hmask[0] = mask[x][0][1];
+            hmask[1] = mask[x][1][1];
+            hmask[2] = mask[x][2][1];
+        }
+        hmask[3] = 0;
+        dsp->lf.loop_filter_sb[0][0](&dst[x * 4], ls, hmask,
+                                     (const uint8_t(*)[4]) &lvl[x][0], b4_stride,
+                                     &f->lf.lim_lut, endy4 - starty4);
+    }
+}
+
+// Y plane: calls dsp->lf.loop_filter_sb[0][1] once per row edge y in
+// [starty4, endy4); the y == 0 edge is skipped when have_top is zero.
+static inline void
+filter_plane_rows_y(const Dav1dFrameContext *const f, const int have_top,
+                    const uint8_t (*lvl)[4], const ptrdiff_t b4_stride,
+                    const uint16_t (*const mask)[3][2],
+                    pixel *dst, const ptrdiff_t ls, const int w,
+                    const int starty4, const int endy4)
+{
+    const Dav1dDSPContext *const dsp = f->dsp;
+    // mask[y][n][1] is cast to unsigned so "<< 16" cannot overflow a promoted int
+    //                                 block1
+    // filter edges between rows (e.g. ------)
+    //                                 block2
+    for (int y = starty4; y < endy4;
+         y++, dst += 4 * PXSTRIDE(ls), lvl += b4_stride)
+    {
+        if (!have_top && !y) continue;
+        const uint32_t vmask[4] = {
+            mask[y][0][0] | ((unsigned) mask[y][0][1] << 16),
+            mask[y][1][0] | ((unsigned) mask[y][1][1] << 16),
+            mask[y][2][0] | ((unsigned) mask[y][2][1] << 16),
+            0,
+        };
+        dsp->lf.loop_filter_sb[0][1](dst, ls, vmask,
+                                     (const uint8_t(*)[4]) &lvl[0][1], b4_stride,
+                                     &f->lf.lim_lut, w);
+    }
+}
+
+// U and V planes: calls dsp->lf.loop_filter_sb[1][0] on both planes at each column
+// edge x < w; the x == 0 edge is skipped when have_left is zero.
+static inline void
+filter_plane_cols_uv(const Dav1dFrameContext *const f, const int have_left,
+                     const uint8_t (*lvl)[4], const ptrdiff_t b4_stride,
+                     const uint16_t (*const mask)[2][2],
+                     pixel *const u, pixel *const v,
+                     const ptrdiff_t ls, const int w,
+                     const int starty4, const int endy4, const int ss_ver)
+{
+    const Dav1dDSPContext *const dsp = f->dsp;
+    // filter edges between columns (e.g. block1 | block2); masks are cast to
+    // unsigned before shifting so a 16-bit shift cannot overflow a promoted int
+    for (int x = 0; x < w; x++) {
+        if (!have_left && !x) continue;
+        uint32_t hmask[3];
+        if (!starty4) {
+            hmask[0] = mask[x][0][0];
+            hmask[1] = mask[x][1][0];
+            if (endy4 > (16 >> ss_ver)) {
+                hmask[0] |= (unsigned) mask[x][0][1] << (16 >> ss_ver);
+                hmask[1] |= (unsigned) mask[x][1][1] << (16 >> ss_ver);
+            }
+        } else {
+            hmask[0] = mask[x][0][1];
+            hmask[1] = mask[x][1][1];
+        }
+        hmask[2] = 0;
+        dsp->lf.loop_filter_sb[1][0](&u[x * 4], ls, hmask,
+                                     (const uint8_t(*)[4]) &lvl[x][2], b4_stride,
+                                     &f->lf.lim_lut, endy4 - starty4);
+        dsp->lf.loop_filter_sb[1][0](&v[x * 4], ls, hmask,
+                                     (const uint8_t(*)[4]) &lvl[x][3], b4_stride,
+                                     &f->lf.lim_lut, endy4 - starty4);
+    }
+}
+
+// U and V planes: calls dsp->lf.loop_filter_sb[1][1] on both planes once per row
+// edge y in [starty4, endy4); the y == 0 edge is skipped when have_top is zero.
+static inline void
+filter_plane_rows_uv(const Dav1dFrameContext *const f, const int have_top,
+                     const uint8_t (*lvl)[4], const ptrdiff_t b4_stride,
+                     const uint16_t (*const mask)[2][2],
+                     pixel *const u, pixel *const v,
+                     const ptrdiff_t ls, const int w,
+                     const int starty4, const int endy4, const int ss_hor)
+{
+    const Dav1dDSPContext *const dsp = f->dsp;
+    ptrdiff_t off_l = 0;
+    // masks are cast to unsigned so a 16-bit shift cannot overflow a promoted int
+    //                                 block1
+    // filter edges between rows (e.g. ------)
+    //                                 block2
+    for (int y = starty4; y < endy4;
+         y++, off_l += 4 * PXSTRIDE(ls), lvl += b4_stride)
+    {
+        if (!have_top && !y) continue;
+        const uint32_t vmask[3] = {
+            mask[y][0][0] | ((unsigned) mask[y][0][1] << (16 >> ss_hor)),
+            mask[y][1][0] | ((unsigned) mask[y][1][1] << (16 >> ss_hor)),
+            0,
+        };
+        dsp->lf.loop_filter_sb[1][1](&u[off_l], ls, vmask,
+                                     (const uint8_t(*)[4]) &lvl[0][2], b4_stride,
+                                     &f->lf.lim_lut, w);
+        dsp->lf.loop_filter_sb[1][1](&v[off_l], ls, vmask,
+                                     (const uint8_t(*)[4]) &lvl[0][3], b4_stride,
+                                     &f->lf.lim_lut, w);
+    }
+}
+
+void bytefn(dav1d_loopfilter_sbrow)(const Dav1dFrameContext *const f, // Loop-filters one superblock row: caps edge masks at tile boundaries, then filters luma and chroma
+                                    pixel *const p[3], Av1Filter *const lflvl, // p[0] feeds the luma filters, p[1]/p[2] the chroma filters; lflvl[] masks are modified in place
+                                    int sby, const int start_of_tile_row) // sby: superblock row index; start_of_tile_row: when nonzero, (value - 1) selects the f->a row used below
+{
+    int x, have_left;
+    // Don't filter outside the frame
+    const int hy4 = (f->cur.p.p.h + 3) >> 2; // frame height in 4-pixel units, rounded up
+    const int have_top = sby > 0; // the first superblock row has nothing above it
+    const int is_sb64 = !f->seq_hdr.sb128;
+    const int starty4 = (sby & is_sb64) << 4; // 16 for odd rows when is_sb64, otherwise 0
+    const int sbsz = 32 >> is_sb64; // superblock height in 4-pixel units: 16 (sb64) or 32 (sb128)
+    const int sbl2 = 5 - is_sb64; // log2(sbsz)
+    const int halign = (f->bh + 31) & ~31; // f->bh rounded up to a multiple of 32
+    const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420; // 1 only for I420
+    const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444; // 1 for every layout except I444
+    const int vmask = 16 >> ss_ver, hmask = 16 >> ss_hor; // chroma mask bits held in word [0] before spilling into word [1]
+    const unsigned vmax = 1 << vmask, hmax = 1 << hmask; // first chroma mask bit value that belongs to word [1]
+    const unsigned endy4 = starty4 + imin(hy4 - sby * sbsz, sbsz); // row range clipped to the frame bottom
+    const unsigned uv_endy4 = (endy4 + ss_ver) >> ss_ver; // chroma end row, rounded up when ss_ver is set
+
+    // fix lpf strength at tile col boundaries
+    const uint8_t *lpf_y = &f->lf.tx_lpf_right_edge[0][sby << sbl2]; // per-row caps for the luma mask index
+    const uint8_t *lpf_uv = &f->lf.tx_lpf_right_edge[1][sby << (sbl2 - ss_ver)]; // per-row caps for the chroma mask index
+    for (int tile_col = 1;; tile_col++) { // starts at column 1; terminated only by the break below
+        x = f->frame_hdr.tiling.col_start_sb[tile_col];
+        if ((x << sbl2) >= f->bw) break; // tile start is at or beyond f->bw
+        const int bx4 = x & is_sb64 ? 16 : 0, cbx4 = bx4 >> ss_hor; // column inside the 128-pixel mask unit: 16 for odd sb64 columns
+        x >>= is_sb64; // from here on x indexes lflvl[]
+
+        uint16_t (*const y_hmask)[2] = lflvl[x].filter_y[0][bx4];
+        for (unsigned y = starty4, mask = 1 << y; y < endy4; y++, mask <<= 1) { // one mask bit per 4-pixel row
+            const int sidx = mask >= 0x10000; // bits 16..31 live in word [1]
+            const unsigned smask = mask >> (sidx << 4); // bit position inside the selected 16-bit word
+            const int idx = 2 * !!(y_hmask[2][sidx] & smask) + // 2 if the bit is set in mask [2] ...
+                                !!(y_hmask[1][sidx] & smask); // ... plus 1 if it is set in mask [1]
+            y_hmask[2][sidx] &= ~smask; // clear the bit from all three masks
+            y_hmask[1][sidx] &= ~smask;
+            y_hmask[0][sidx] &= ~smask;
+            y_hmask[imin(idx, lpf_y[y - starty4])][sidx] |= smask; // set it again at the capped index
+        }
+
+        if (f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I400) { // I400 skips the chroma masks
+            uint16_t (*const uv_hmask)[2] = lflvl[x].filter_uv[0][cbx4];
+            for (unsigned y = starty4 >> ss_ver, uv_mask = 1 << y; y < uv_endy4;
+                 y++, uv_mask <<= 1)
+            {
+                const int sidx = uv_mask >= vmax; // word [1] once the bit reaches vmax
+                const unsigned smask = uv_mask >> (sidx << (4 - ss_ver)); // shifts by vmask bits when sidx is 1
+                const int idx = !!(uv_hmask[1][sidx] & smask); // chroma only has mask indices 0 and 1
+                uv_hmask[1][sidx] &= ~smask;
+                uv_hmask[0][sidx] &= ~smask;
+                uv_hmask[imin(idx, lpf_uv[y - (starty4 >> ss_ver)])][sidx] |= smask; // set at the capped index
+            }
+        }
+        lpf_y  += halign; // advance both cap tables for the next tile column
+        lpf_uv += halign >> ss_ver;
+    }
+
+    // fix lpf strength at tile row boundaries
+    if (start_of_tile_row) {
+        const BlockContext *a;
+        for (x = 0, a = &f->a[f->sb128w * (start_of_tile_row - 1)]; // context row selected by start_of_tile_row - 1
+             x < f->sb128w; x++, a++)
+        {
+            uint16_t (*const y_vmask)[2] = lflvl[x].filter_y[1][starty4]; // masks for the first row of this superblock row
+            for (unsigned mask = 1, i = 0; i < 32; mask <<= 1, i++) { // 32 mask bits, one per 4-pixel column
+                const int sidx = mask >= 0x10000; // bits 16..31 live in word [1]
+                const unsigned smask = mask >> (sidx << 4);
+                const int idx = 2 * !!(y_vmask[2][sidx] & smask) + // same index computation as the tile-column pass above
+                                    !!(y_vmask[1][sidx] & smask);
+                y_vmask[2][sidx] &= ~smask;
+                y_vmask[1][sidx] &= ~smask;
+                y_vmask[0][sidx] &= ~smask;
+                y_vmask[imin(idx, a->tx_lpf_y[i])][sidx] |= smask; // cap comes from a->tx_lpf_y
+            }
+
+            if (f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
+                uint16_t (*const uv_vmask)[2] = lflvl[x].filter_uv[1][starty4 >> ss_ver];
+                for (unsigned uv_mask = 1, i = 0; i < (32U >> ss_hor); uv_mask <<= 1, i++) { // 32 or 16 chroma columns
+                    const int sidx = uv_mask >= hmax; // word [1] once the bit reaches hmax
+                    const unsigned smask = uv_mask >> (sidx << (4 - ss_hor)); // shifts by hmask bits when sidx is 1
+                    const int idx = !!(uv_vmask[1][sidx] & smask);
+                    uv_vmask[1][sidx] &= ~smask;
+                    uv_vmask[0][sidx] &= ~smask;
+                    uv_vmask[imin(idx, a->tx_lpf_uv[i])][sidx] |= smask; // cap comes from a->tx_lpf_uv
+                }
+            }
+        }
+    }
+
+    pixel *ptr;
+    uint8_t (*level_ptr)[4] = f->lf.level + f->b4_stride * sby * sbsz; // first level entry of this superblock row
+    for (ptr = p[0], have_left = 0, x = 0; x < f->sb128w; // luma pass 1: filter_plane_cols_y, have_left is 0 only for x == 0
+         x++, have_left = 1, ptr += 128, level_ptr += 32) // each step advances 128 pixels and 32 level entries
+    {
+        filter_plane_cols_y(f, have_left, level_ptr, f->b4_stride,
+                            lflvl[x].filter_y[0], ptr, f->cur.p.stride[0],
+                            imin(32, f->bw - x * 32), starty4, endy4); // width is capped at 32 and at what remains of f->bw
+    }
+
+    level_ptr = f->lf.level + f->b4_stride * sby * sbsz; // rewind for the second luma pass
+    for (ptr = p[0], x = 0; x < f->sb128w; x++, ptr += 128, level_ptr += 32) { // luma pass 2: filter_plane_rows_y
+        filter_plane_rows_y(f, have_top, level_ptr, f->b4_stride,
+                            lflvl[x].filter_y[1], ptr, f->cur.p.stride[0],
+                            imin(32, f->bw - x * 32), starty4, endy4);
+    }
+
+    if (!f->frame_hdr.loopfilter.level_u && !f->frame_hdr.loopfilter.level_v) // both chroma levels are zero: skip the chroma passes
+        return;
+
+    ptrdiff_t uv_off;
+    level_ptr = f->lf.level + f->b4_stride * (sby * sbsz >> ss_ver); // level row offset shifted down by ss_ver
+    for (uv_off = 0, have_left = 0, x = 0; x < f->sb128w; // chroma pass 1: filter_plane_cols_uv on both p[1] and p[2]
+         x++, have_left = 1, uv_off += 128 >> ss_hor, level_ptr += 32 >> ss_hor) // step sizes shrink by ss_hor
+    {
+        filter_plane_cols_uv(f, have_left, level_ptr, f->b4_stride,
+                             lflvl[x].filter_uv[0],
+                             &p[1][uv_off], &p[2][uv_off], f->cur.p.stride[1],
+                             (imin(32, f->bw - x * 32) + ss_hor) >> ss_hor, // chroma width, rounded up
+                             starty4 >> ss_ver, uv_endy4, ss_ver);
+    }
+
+    level_ptr = f->lf.level + f->b4_stride * (sby * sbsz >> ss_ver); // rewind for the second chroma pass
+    for (uv_off = 0, x = 0; x < f->sb128w; // chroma pass 2: filter_plane_rows_uv
+         x++, uv_off += 128 >> ss_hor, level_ptr += 32 >> ss_hor)
+    {
+        filter_plane_rows_uv(f, have_top, level_ptr, f->b4_stride,
+                             lflvl[x].filter_uv[1],
+                             &p[1][uv_off], &p[2][uv_off], f->cur.p.stride[1],
+                             (imin(32, f->bw - x * 32) + ss_hor) >> ss_hor,
+                             starty4 >> ss_ver, uv_endy4, ss_hor);
+    }
+}
--- a/src/loopfilter.c
+++ /dev/null
@@ -1,246 +1,0 @@
-/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- *    list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "config.h"
-
-#include <stdlib.h>
-
-#include "common/attributes.h"
-#include "common/intops.h"
-
-#include "src/loopfilter.h"
-
-static NOINLINE void
-loop_filter(pixel *dst, int E, int I, int H,
-            const ptrdiff_t stridea, const ptrdiff_t strideb, const int wd)
-{
-    const int F = 1 << (BITDEPTH - 8);
-    E <<= BITDEPTH - 8;
-    I <<= BITDEPTH - 8;
-    H <<= BITDEPTH - 8;
-
-    for (int i = 0; i < 4; i++, dst += stridea) {
-        int p6, p5, p4, p3, p2;
-        int p1 = dst[strideb * -2], p0 = dst[strideb * -1];
-        int q0 = dst[strideb * +0], q1 = dst[strideb * +1];
-        int q2, q3, q4, q5, q6;
-        int fm, flat8out, flat8in;
-
-        fm = abs(p1 - p0) <= I && abs(q1 - q0) <= I &&
-             abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E;
-
-        if (wd > 4) {
-            p2 = dst[strideb * -3];
-            q2 = dst[strideb * +2];
-
-            fm &= abs(p2 - p1) <= I && abs(q2 - q1) <= I;
-
-            if (wd > 6) {
-                p3 = dst[strideb * -4];
-                q3 = dst[strideb * +3];
-
-                fm &= abs(p3 - p2) <= I && abs(q3 - q2) <= I;
-            }
-        }
-        if (!fm) continue;
-
-        if (wd >= 16) {
-            p6 = dst[strideb * -7];
-            p5 = dst[strideb * -6];
-            p4 = dst[strideb * -5];
-            q4 = dst[strideb * +4];
-            q5 = dst[strideb * +5];
-            q6 = dst[strideb * +6];
-
-            flat8out = abs(p6 - p0) <= F && abs(p5 - p0) <= F &&
-                       abs(p4 - p0) <= F && abs(q4 - q0) <= F &&
-                       abs(q5 - q0) <= F && abs(q6 - q0) <= F;
-        }
-
-        if (wd >= 6)
-            flat8in = abs(p2 - p0) <= F && abs(p1 - p0) <= F &&
-                      abs(q1 - q0) <= F && abs(q2 - q0) <= F;
-
-        if (wd >= 8)
-            flat8in &= abs(p3 - p0) <= F && abs(q3 - q0) <= F;
-
-        if (wd >= 16 && (flat8out & flat8in)) {
-            dst[strideb * -6] = (p6 + p6 + p6 + p6 + p6 + p6 * 2 + p5 * 2 +
-                                 p4 * 2 + p3 + p2 + p1 + p0 + q0 + 8) >> 4;
-            dst[strideb * -5] = (p6 + p6 + p6 + p6 + p6 + p5 * 2 + p4 * 2 +
-                                 p3 * 2 + p2 + p1 + p0 + q0 + q1 + 8) >> 4;
-            dst[strideb * -4] = (p6 + p6 + p6 + p6 + p5 + p4 * 2 + p3 * 2 +
-                                 p2 * 2 + p1 + p0 + q0 + q1 + q2 + 8) >> 4;
-            dst[strideb * -3] = (p6 + p6 + p6 + p5 + p4 + p3 * 2 + p2 * 2 +
-                                 p1 * 2 + p0 + q0 + q1 + q2 + q3 + 8) >> 4;
-            dst[strideb * -2] = (p6 + p6 + p5 + p4 + p3 + p2 * 2 + p1 * 2 +
-                                 p0 * 2 + q0 + q1 + q2 + q3 + q4 + 8) >> 4;
-            dst[strideb * -1] = (p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 +
-                                 q0 * 2 + q1 + q2 + q3 + q4 + q5 + 8) >> 4;
-            dst[strideb * +0] = (p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 +
-                                 q1 * 2 + q2 + q3 + q4 + q5 + q6 + 8) >> 4;
-            dst[strideb * +1] = (p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 +
-                                 q2 * 2 + q3 + q4 + q5 + q6 + q6 + 8) >> 4;
-            dst[strideb * +2] = (p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 +
-                                 q3 * 2 + q4 + q5 + q6 + q6 + q6 + 8) >> 4;
-            dst[strideb * +3] = (p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 +
-                                 q4 * 2 + q5 + q6 + q6 + q6 + q6 + 8) >> 4;
-            dst[strideb * +4] = (p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 +
-                                 q5 * 2 + q6 + q6 + q6 + q6 + q6 + 8) >> 4;
-            dst[strideb * +5] = (p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 +
-                                 q6 * 2 + q6 + q6 + q6 + q6 + q6 + 8) >> 4;
-        } else if (wd >= 8 && flat8in) {
-            dst[strideb * -3] = (p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3;
-            dst[strideb * -2] = (p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3;
-            dst[strideb * -1] = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3;
-            dst[strideb * +0] = (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3;
-            dst[strideb * +1] = (p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3 + 4) >> 3;
-            dst[strideb * +2] = (p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3 + 4) >> 3;
-        } else if (wd == 6 && flat8in) {
-            dst[strideb * -2] = (p2 + 2 * p2 + 2 * p1 + 2 * p0 + q0 + 4) >> 3;
-            dst[strideb * -1] = (p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3;
-            dst[strideb * +0] = (p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3;
-            dst[strideb * +1] = (p0 + 2 * q0 + 2 * q1 + 2 * q2 + q2 + 4) >> 3;
-        } else {
-            const int hev = abs(p1 - p0) > H || abs(q1 - q0) > H;
-
-#define iclip_diff(v) iclip(v, -128 * (1 << (BITDEPTH - 8)), \
-                                128 * (1 << (BITDEPTH - 8)) - 1)
-
-            if (hev) {
-                int f = iclip_diff(p1 - q1), f1, f2;
-                f = iclip_diff(3 * (q0 - p0) + f);
-
-                f1 = imin(f + 4, (128 << (BITDEPTH - 8)) - 1) >> 3;
-                f2 = imin(f + 3, (128 << (BITDEPTH - 8)) - 1) >> 3;
-
-                dst[strideb * -1] = iclip_pixel(p0 + f2);
-                dst[strideb * +0] = iclip_pixel(q0 - f1);
-            } else {
-                int f = iclip_diff(3 * (q0 - p0)), f1, f2;
-
-                f1 = imin(f + 4, (128 << (BITDEPTH - 8)) - 1) >> 3;
-                f2 = imin(f + 3, (128 << (BITDEPTH - 8)) - 1) >> 3;
-
-                dst[strideb * -1] = iclip_pixel(p0 + f2);
-                dst[strideb * +0] = iclip_pixel(q0 - f1);
-
-                f = (f1 + 1) >> 1;
-                dst[strideb * -2] = iclip_pixel(p1 + f);
-                dst[strideb * +1] = iclip_pixel(q1 - f);
-            }
-#undef iclip_diff
-        }
-    }
-}
-
-static void loop_filter_h_sb128y_c(pixel *dst, const ptrdiff_t stride,
-                                   const uint32_t *const vmask,
-                                   const uint8_t (*l)[4], ptrdiff_t b4_stride,
-                                   const Av1FilterLUT *lut, const int h)
-{
-    const unsigned vm = (vmask[0] | vmask[1] | vmask[2]) & ((1ULL << h) - 1);
-    for (unsigned y = 1; vm & ~(y - 1);
-         y <<= 1, dst += 4 * PXSTRIDE(stride), l += b4_stride)
-    {
-        if (vm & y) {
-            const int L = l[0][0] ? l[0][0] : l[-1][0];
-            if (!L) continue;
-            const int H = L >> 4;
-            const int E = lut->e[L], I = lut->i[L];
-            const int idx = (vmask[2] & y) ? 2 : !!(vmask[1] & y);
-            loop_filter(dst, E, I, H, PXSTRIDE(stride), 1, 4 << idx);
-        }
-    }
-}
-
-static void loop_filter_v_sb128y_c(pixel *dst, const ptrdiff_t stride,
-                                   const uint32_t *const vmask,
-                                   const uint8_t (*l)[4], ptrdiff_t b4_stride,
-                                   const Av1FilterLUT *lut, const int w)
-{
-    const unsigned vm = vmask[0] | vmask[1] | vmask[2];
-    for (unsigned x = 1; vm & ~(x - 1); x <<= 1, dst += 4, l++) {
-        if (vm & x) {
-            const int L = l[0][0] ? l[0][0] : l[-b4_stride][0];
-            if (!L) continue;
-            const int H = L >> 4;
-            const int E = lut->e[L], I = lut->i[L];
-            const int idx = (vmask[2] & x) ? 2 : !!(vmask[1] & x);
-            loop_filter(dst, E, I, H, 1, PXSTRIDE(stride), 4 << idx);
-        }
-    }
-}
-
-static void loop_filter_h_sb128uv_c(pixel *dst, const ptrdiff_t stride,
-                                    const uint32_t *const vmask,
-                                    const uint8_t (*l)[4], ptrdiff_t b4_stride,
-                                    const Av1FilterLUT *lut, const int h)
-{
-    const unsigned vm = (vmask[0] | vmask[1]) & ((1ULL << h) - 1);
-    for (unsigned y = 1; vm & ~(y - 1);
-         y <<= 1, dst += 4 * PXSTRIDE(stride), l += b4_stride)
-    {
-        if (vm & y) {
-            const int L = l[0][0] ? l[0][0] : l[-1][0];
-            if (!L) continue;
-            const int H = L >> 4;
-            const int E = lut->e[L], I = lut->i[L];
-            const int idx = !!(vmask[1] & y);
-            loop_filter(dst, E, I, H, PXSTRIDE(stride), 1, 4 + 2 * idx);
-        }
-    }
-}
-
-static void loop_filter_v_sb128uv_c(pixel *dst, const ptrdiff_t stride,
-                                    const uint32_t *const vmask,
-                                    const uint8_t (*l)[4], ptrdiff_t b4_stride,
-                                    const Av1FilterLUT *lut, const int w)
-{
-    const unsigned vm = vmask[0] | vmask[1];
-    for (unsigned x = 1; vm & ~(x - 1); x <<= 1, dst += 4, l++) {
-        if (vm & x) {
-            const int L = l[0][0] ? l[0][0] : l[-b4_stride][0];
-            if (!L) continue;
-            const int H = L >> 4;
-            const int E = lut->e[L], I = lut->i[L];
-            const int idx = !!(vmask[1] & x);
-            loop_filter(dst, E, I, H, 1, PXSTRIDE(stride), 4 + 2 * idx);
-        }
-    }
-}
-
-void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c) {
-    c->loop_filter_sb[0][0] = loop_filter_h_sb128y_c;
-    c->loop_filter_sb[0][1] = loop_filter_v_sb128y_c;
-    c->loop_filter_sb[1][0] = loop_filter_h_sb128uv_c;
-    c->loop_filter_sb[1][1] = loop_filter_v_sb128uv_c;
-
-#if HAVE_ASM && ARCH_X86
-    bitfn(dav1d_loop_filter_dsp_init_x86)(c);
-#endif
-}
--- /dev/null
+++ b/src/loopfilter_tmpl.c
@@ -1,0 +1,246 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdlib.h>
+
+#include "common/attributes.h"
+#include "common/intops.h"
+
+#include "src/loopfilter.h"
+
+static NOINLINE void // Filters 4 consecutive lines across one edge; wd selects the filter width (callers in this file pass 4, 6, 8 or 16)
+loop_filter(pixel *dst, int E, int I, int H, // dst: first sample after the edge (q0); E, I, H: thresholds, scaled up by bit depth below
+            const ptrdiff_t stridea, const ptrdiff_t strideb, const int wd) // stridea: step between the 4 lines; strideb: step across the edge
+{
+    const int F = 1 << (BITDEPTH - 8); // flatness threshold: 1 at 8 bits, scaled with bit depth
+    E <<= BITDEPTH - 8;
+    I <<= BITDEPTH - 8;
+    H <<= BITDEPTH - 8;
+
+    for (int i = 0; i < 4; i++, dst += stridea) {
+        int p6, p5, p4, p3, p2;
+        int p1 = dst[strideb * -2], p0 = dst[strideb * -1]; // p*: samples before the edge (negative offsets)
+        int q0 = dst[strideb * +0], q1 = dst[strideb * +1]; // q*: samples from dst onwards
+        int q2, q3, q4, q5, q6;
+        int fm, flat8out, flat8in; // flat8in is assigned only when wd >= 6, flat8out only when wd >= 16; reads below are guarded the same way
+
+        fm = abs(p1 - p0) <= I && abs(q1 - q0) <= I && // fm: nonzero when this line passes the I and E threshold tests
+             abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E;
+
+        if (wd > 4) { // wider filters also test the p2/q2 steps
+            p2 = dst[strideb * -3];
+            q2 = dst[strideb * +2];
+
+            fm &= abs(p2 - p1) <= I && abs(q2 - q1) <= I;
+
+            if (wd > 6) { // wd 8 and 16 also test the p3/q3 steps
+                p3 = dst[strideb * -4];
+                q3 = dst[strideb * +3];
+
+                fm &= abs(p3 - p2) <= I && abs(q3 - q2) <= I;
+            }
+        }
+        if (!fm) continue; // thresholds exceeded: leave this line unmodified
+
+        if (wd >= 16) {
+            p6 = dst[strideb * -7];
+            p5 = dst[strideb * -6];
+            p4 = dst[strideb * -5];
+            q4 = dst[strideb * +4];
+            q5 = dst[strideb * +5];
+            q6 = dst[strideb * +6];
+
+            flat8out = abs(p6 - p0) <= F && abs(p5 - p0) <= F && // outer taps are within F of p0/q0
+                       abs(p4 - p0) <= F && abs(q4 - q0) <= F &&
+                       abs(q5 - q0) <= F && abs(q6 - q0) <= F;
+        }
+
+        if (wd >= 6)
+            flat8in = abs(p2 - p0) <= F && abs(p1 - p0) <= F && // inner taps are within F of p0/q0
+                      abs(q1 - q0) <= F && abs(q2 - q0) <= F;
+
+        if (wd >= 8)
+            flat8in &= abs(p3 - p0) <= F && abs(q3 - q0) <= F;
+
+        if (wd >= 16 && (flat8out & flat8in)) { // widest filter: rewrites 6 samples on each side, weights sum to 16
+            dst[strideb * -6] = (p6 + p6 + p6 + p6 + p6 + p6 * 2 + p5 * 2 +
+                                 p4 * 2 + p3 + p2 + p1 + p0 + q0 + 8) >> 4;
+            dst[strideb * -5] = (p6 + p6 + p6 + p6 + p6 + p5 * 2 + p4 * 2 +
+                                 p3 * 2 + p2 + p1 + p0 + q0 + q1 + 8) >> 4;
+            dst[strideb * -4] = (p6 + p6 + p6 + p6 + p5 + p4 * 2 + p3 * 2 +
+                                 p2 * 2 + p1 + p0 + q0 + q1 + q2 + 8) >> 4;
+            dst[strideb * -3] = (p6 + p6 + p6 + p5 + p4 + p3 * 2 + p2 * 2 +
+                                 p1 * 2 + p0 + q0 + q1 + q2 + q3 + 8) >> 4;
+            dst[strideb * -2] = (p6 + p6 + p5 + p4 + p3 + p2 * 2 + p1 * 2 +
+                                 p0 * 2 + q0 + q1 + q2 + q3 + q4 + 8) >> 4;
+            dst[strideb * -1] = (p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 +
+                                 q0 * 2 + q1 + q2 + q3 + q4 + q5 + 8) >> 4;
+            dst[strideb * +0] = (p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 +
+                                 q1 * 2 + q2 + q3 + q4 + q5 + q6 + 8) >> 4;
+            dst[strideb * +1] = (p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 +
+                                 q2 * 2 + q3 + q4 + q5 + q6 + q6 + 8) >> 4;
+            dst[strideb * +2] = (p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 +
+                                 q3 * 2 + q4 + q5 + q6 + q6 + q6 + 8) >> 4;
+            dst[strideb * +3] = (p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 +
+                                 q4 * 2 + q5 + q6 + q6 + q6 + q6 + 8) >> 4;
+            dst[strideb * +4] = (p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 +
+                                 q5 * 2 + q6 + q6 + q6 + q6 + q6 + 8) >> 4;
+            dst[strideb * +5] = (p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 +
+                                 q6 * 2 + q6 + q6 + q6 + q6 + q6 + 8) >> 4;
+        } else if (wd >= 8 && flat8in) { // rewrites 3 samples on each side, weights sum to 8
+            dst[strideb * -3] = (p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3;
+            dst[strideb * -2] = (p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3;
+            dst[strideb * -1] = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3;
+            dst[strideb * +0] = (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3;
+            dst[strideb * +1] = (p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3 + 4) >> 3;
+            dst[strideb * +2] = (p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3 + 4) >> 3;
+        } else if (wd == 6 && flat8in) { // rewrites 2 samples on each side, weights sum to 8
+            dst[strideb * -2] = (p2 + 2 * p2 + 2 * p1 + 2 * p0 + q0 + 4) >> 3;
+            dst[strideb * -1] = (p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3;
+            dst[strideb * +0] = (p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3;
+            dst[strideb * +1] = (p0 + 2 * q0 + 2 * q1 + 2 * q2 + q2 + 4) >> 3;
+        } else { // narrow filter: clamped delta applied to the samples next to the edge
+            const int hev = abs(p1 - p0) > H || abs(q1 - q0) > H; // set when either side's first step exceeds H; iclip_diff below clamps to [-128, 127] scaled by bit depth
+
+#define iclip_diff(v) iclip(v, -128 * (1 << (BITDEPTH - 8)), \
+                                128 * (1 << (BITDEPTH - 8)) - 1)
+
+            if (hev) { // only p0 and q0 are modified in this branch
+                int f = iclip_diff(p1 - q1), f1, f2;
+                f = iclip_diff(3 * (q0 - p0) + f);
+
+                f1 = imin(f + 4, (128 << (BITDEPTH - 8)) - 1) >> 3; // rounding offsets 4 and 3 give the q-side and p-side deltas
+                f2 = imin(f + 3, (128 << (BITDEPTH - 8)) - 1) >> 3;
+
+                dst[strideb * -1] = iclip_pixel(p0 + f2);
+                dst[strideb * +0] = iclip_pixel(q0 - f1);
+            } else { // p0/q0 and p1/q1 are modified in this branch
+                int f = iclip_diff(3 * (q0 - p0)), f1, f2;
+
+                f1 = imin(f + 4, (128 << (BITDEPTH - 8)) - 1) >> 3;
+                f2 = imin(f + 3, (128 << (BITDEPTH - 8)) - 1) >> 3;
+
+                dst[strideb * -1] = iclip_pixel(p0 + f2);
+                dst[strideb * +0] = iclip_pixel(q0 - f1);
+
+                f = (f1 + 1) >> 1; // half of f1, rounded, applied to the second samples
+                dst[strideb * -2] = iclip_pixel(p1 + f);
+                dst[strideb * +1] = iclip_pixel(q1 - f);
+            }
+#undef iclip_diff
+        }
+    }
+}
+
+static void loop_filter_h_sb128y_c(pixel *dst, const ptrdiff_t stride, // Walks the mask bits downwards and calls loop_filter() with strideb == 1 for each set bit
+                                   const uint32_t *const vmask, // three masks; vmask[0..2] select filter widths 4, 8 and 16
+                                   const uint8_t (*l)[4], ptrdiff_t b4_stride, // l: level entries, advanced by b4_stride per mask bit
+                                   const Av1FilterLUT *lut, const int h) // lut: supplies E and I per level; h: mask bits at or above h are ignored
+{
+    const unsigned vm = (vmask[0] | vmask[1] | vmask[2]) & ((1ULL << h) - 1); // union of all masks, clipped to the low h bits
+    for (unsigned y = 1; vm & ~(y - 1); // y is a single-bit cursor; loop while vm has a bit at or above it
+         y <<= 1, dst += 4 * PXSTRIDE(stride), l += b4_stride) // each mask bit covers 4 pixel rows
+    {
+        if (vm & y) {
+            const int L = l[0][0] ? l[0][0] : l[-1][0]; // fall back to the previous entry when this one is 0 (presumably the left neighbour)
+            if (!L) continue; // level 0: nothing to filter
+            const int H = L >> 4;
+            const int E = lut->e[L], I = lut->i[L];
+            const int idx = (vmask[2] & y) ? 2 : !!(vmask[1] & y); // highest-numbered mask containing this bit
+            loop_filter(dst, E, I, H, PXSTRIDE(stride), 1, 4 << idx); // wd = 4, 8 or 16
+        }
+    }
+}
+
+static void loop_filter_v_sb128y_c(pixel *dst, const ptrdiff_t stride, // Walks the mask bits rightwards and calls loop_filter() with strideb == row stride for each set bit
+                                   const uint32_t *const vmask, // three masks; vmask[0..2] select filter widths 4, 8 and 16
+                                   const uint8_t (*l)[4], ptrdiff_t b4_stride, // l: level entries, advanced by one per mask bit
+                                   const Av1FilterLUT *lut, const int w) // lut: supplies E and I per level; w is not read in this C version
+{
+    const unsigned vm = vmask[0] | vmask[1] | vmask[2]; // union of all masks (not clipped, unlike the h variant)
+    for (unsigned x = 1; vm & ~(x - 1); x <<= 1, dst += 4, l++) { // x is a single-bit cursor; each bit covers 4 pixel columns
+        if (vm & x) {
+            const int L = l[0][0] ? l[0][0] : l[-b4_stride][0]; // fall back to the entry one b4_stride earlier when this one is 0 (presumably the block above)
+            if (!L) continue; // level 0: nothing to filter
+            const int H = L >> 4;
+            const int E = lut->e[L], I = lut->i[L];
+            const int idx = (vmask[2] & x) ? 2 : !!(vmask[1] & x); // highest-numbered mask containing this bit
+            loop_filter(dst, E, I, H, 1, PXSTRIDE(stride), 4 << idx); // wd = 4, 8 or 16
+        }
+    }
+}
+
+static void loop_filter_h_sb128uv_c(pixel *dst, const ptrdiff_t stride, // Two-mask variant of the h walker: calls loop_filter() with strideb == 1 and wd 4 or 6
+                                    const uint32_t *const vmask, // only vmask[0] and vmask[1] are read
+                                    const uint8_t (*l)[4], ptrdiff_t b4_stride, // l: level entries, advanced by b4_stride per mask bit
+                                    const Av1FilterLUT *lut, const int h) // lut: supplies E and I per level; h: mask bits at or above h are ignored
+{
+    const unsigned vm = (vmask[0] | vmask[1]) & ((1ULL << h) - 1); // union of both masks, clipped to the low h bits
+    for (unsigned y = 1; vm & ~(y - 1); // y is a single-bit cursor; loop while vm has a bit at or above it
+         y <<= 1, dst += 4 * PXSTRIDE(stride), l += b4_stride) // each mask bit covers 4 pixel rows
+    {
+        if (vm & y) {
+            const int L = l[0][0] ? l[0][0] : l[-1][0]; // fall back to the previous entry when this one is 0 (presumably the left neighbour)
+            if (!L) continue; // level 0: nothing to filter
+            const int H = L >> 4;
+            const int E = lut->e[L], I = lut->i[L];
+            const int idx = !!(vmask[1] & y); // 1 when the bit is in vmask[1]
+            loop_filter(dst, E, I, H, PXSTRIDE(stride), 1, 4 + 2 * idx); // wd = 4 or 6
+        }
+    }
+}
+
+static void loop_filter_v_sb128uv_c(pixel *dst, const ptrdiff_t stride, // Two-mask variant of the v walker: calls loop_filter() with strideb == row stride and wd 4 or 6
+                                    const uint32_t *const vmask, // only vmask[0] and vmask[1] are read
+                                    const uint8_t (*l)[4], ptrdiff_t b4_stride, // l: level entries, advanced by one per mask bit
+                                    const Av1FilterLUT *lut, const int w) // lut: supplies E and I per level; w is not read in this C version
+{
+    const unsigned vm = vmask[0] | vmask[1]; // union of both masks (not clipped, unlike the h variant)
+    for (unsigned x = 1; vm & ~(x - 1); x <<= 1, dst += 4, l++) { // x is a single-bit cursor; each bit covers 4 pixel columns
+        if (vm & x) {
+            const int L = l[0][0] ? l[0][0] : l[-b4_stride][0]; // fall back to the entry one b4_stride earlier when this one is 0 (presumably the block above)
+            if (!L) continue; // level 0: nothing to filter
+            const int H = L >> 4;
+            const int E = lut->e[L], I = lut->i[L];
+            const int idx = !!(vmask[1] & x); // 1 when the bit is in vmask[1]
+            loop_filter(dst, E, I, H, 1, PXSTRIDE(stride), 4 + 2 * idx); // wd = 4 or 6
+        }
+    }
+}
+
+void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c) { // Fills c->loop_filter_sb with the C implementations, then lets x86 asm init run when built in
+    c->loop_filter_sb[0][0] = loop_filter_h_sb128y_c; // first index: 0 = three-mask (y) variants, 1 = two-mask (uv) variants
+    c->loop_filter_sb[0][1] = loop_filter_v_sb128y_c; // second index: 0 = h variant, 1 = v variant
+    c->loop_filter_sb[1][0] = loop_filter_h_sb128uv_c;
+    c->loop_filter_sb[1][1] = loop_filter_v_sb128uv_c;
+
+#if HAVE_ASM && ARCH_X86
+    bitfn(dav1d_loop_filter_dsp_init_x86)(c); // called after the C assignments, so it can presumably replace entries
+#endif
+}
--- a/src/looprestoration.c
+++ /dev/null
@@ -1,577 +1,0 @@
-/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- *    list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "config.h"
-
-#include <stdlib.h>
-
-#include "common/intops.h"
-
-#include "src/looprestoration.h"
-#include "src/tables.h"
-
-// 256 * 1.5 + 3 + 3 = 390
-#define REST_UNIT_STRIDE (390)
-
-// TODO Reuse p when no padding is needed (add and remove lpf pixels in p)
-// TODO Chroma only requires 2 rows of padding.
-static void padding(pixel *dst, const pixel *p, const ptrdiff_t p_stride,
-                    const pixel (*left)[4],
-                    const pixel *lpf, const ptrdiff_t lpf_stride,
-                    int unit_w, const int stripe_h, const enum LrEdgeFlags edges)
-{
-    const int have_left = !!(edges & LR_HAVE_LEFT);
-    const int have_right = !!(edges & LR_HAVE_RIGHT);
-
-    // Copy more pixels if we don't have to pad them
-    unit_w += 3 * have_left + 3 * have_right;
-    pixel *dst_l = dst + 3 * !have_left;
-    p -= 3 * have_left;
-    lpf -= 3 * have_left;
-
-    if (edges & LR_HAVE_TOP) {
-        // Copy previous loop filtered rows
-        const pixel *const above_1 = lpf;
-        const pixel *const above_2 = above_1 + PXSTRIDE(lpf_stride);
-        pixel_copy(dst_l, above_1, unit_w);
-        pixel_copy(dst_l + REST_UNIT_STRIDE, above_1, unit_w);
-        pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, above_2, unit_w);
-    } else {
-        // Pad with first row
-        pixel_copy(dst_l, p, unit_w);
-        pixel_copy(dst_l + REST_UNIT_STRIDE, p, unit_w);
-        pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, p, unit_w);
-        if (have_left) {
-            pixel_copy(dst_l, &left[0][1], 3);
-            pixel_copy(dst_l + REST_UNIT_STRIDE, &left[0][1], 3);
-            pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, &left[0][1], 3);
-        }
-    }
-
-    pixel *dst_tl = dst_l + 3 * REST_UNIT_STRIDE;
-    if (edges & LR_HAVE_BOTTOM) {
-        // Copy next loop filtered rows
-        const pixel *const below_1 = lpf + 6 * PXSTRIDE(lpf_stride);
-        const pixel *const below_2 = below_1 + PXSTRIDE(lpf_stride);
-        pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, below_1, unit_w);
-        pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, below_2, unit_w);
-        pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, below_2, unit_w);
-    } else {
-        // Pad with last row
-        const pixel *const src = p + (stripe_h - 1) * PXSTRIDE(p_stride);
-        pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, src, unit_w);
-        pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, src, unit_w);
-        pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, src, unit_w);
-        if (have_left) {
-            pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
-            pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
-            pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
-        }
-    }
-
-    // Inner UNIT_WxSTRIPE_H
-    for (int j = 0; j < stripe_h; j++) {
-        pixel_copy(dst_tl + 3 * have_left, p + 3 * have_left, unit_w - 3 * have_left);
-        dst_tl += REST_UNIT_STRIDE;
-        p += PXSTRIDE(p_stride);
-    }
-
-    if (!have_right) {
-        pixel *pad = dst_l + unit_w;
-        pixel *row_last = &dst_l[unit_w - 1];
-        // Pad 3x(STRIPE_H+6) with last column
-        for (int j = 0; j < stripe_h + 6; j++) {
-            pixel_set(pad, *row_last, 3);
-            pad += REST_UNIT_STRIDE;
-            row_last += REST_UNIT_STRIDE;
-        }
-    }
-
-    if (!have_left) {
-        // Pad 3x(STRIPE_H+6) with first column
-        for (int j = 0; j < stripe_h + 6; j++) {
-            pixel_set(dst, *dst_l, 3);
-            dst += REST_UNIT_STRIDE;
-            dst_l += REST_UNIT_STRIDE;
-        }
-    } else {
-        dst += 3 * REST_UNIT_STRIDE;
-        for (int j = 0; j < stripe_h; j++) {
-            pixel_copy(dst, &left[j][1], 3);
-            dst += REST_UNIT_STRIDE;
-        }
-    }
-}
-
-// FIXME Could split into luma and chroma specific functions,
-// (since first and last tops are always 0 for chroma)
-// FIXME Could implement a version that requires less temporary memory
-// (should be possible to implement with only 6 rows of temp storage)
-static void wiener_c(pixel *p, const ptrdiff_t p_stride,
-                     const pixel (*const left)[4],
-                     const pixel *lpf, const ptrdiff_t lpf_stride,
-                     const int w, const int h,
-                     const int16_t filterh[7], const int16_t filterv[7],
-                     const enum LrEdgeFlags edges)
-{
-    // Wiener filtering is applied to a maximum stripe height of 64 + 3 pixels
-    // of padding above and below
-    pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
-    pixel *tmp_ptr = tmp;
-
-    padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges);
-
-    // Values stored between horizontal and vertical filtering don't
-    // fit in a uint8_t.
-    uint16_t hor[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
-    uint16_t *hor_ptr = hor;
-
-    const int round_bits_h = 3 + (BITDEPTH == 12) * 2;
-    const int rounding_off_h = 1 << (round_bits_h - 1);
-    const int clip_limit = 1 << ((BITDEPTH) + 1 + 7 - round_bits_h);
-    for (int j = 0; j < h + 6; j++) {
-        for (int i = 0; i < w; i++) {
-            int sum = (tmp_ptr[i + 3] << 7) + (1 << (BITDEPTH + 6));
-
-            for (int k = 0; k < 7; k++) {
-                sum += tmp_ptr[i + k] * filterh[k];
-            }
-
-            hor_ptr[i] =
-                iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
-        }
-        tmp_ptr += REST_UNIT_STRIDE;
-        hor_ptr += REST_UNIT_STRIDE;
-    }
-
-    const int round_bits_v = 11 - (BITDEPTH == 12) * 2;
-    const int rounding_off_v = 1 << (round_bits_v - 1);
-    const int round_offset = 1 << (BITDEPTH + (round_bits_v - 1));
-    for (int i = 0; i < w; i++) {
-        for (int j = 0; j < h; j++) {
-            int sum = (hor[(j + 3) * REST_UNIT_STRIDE + i] << 7) - round_offset;
-
-            for (int k = 0; k < 7; k++) {
-                sum += hor[(j + k) * REST_UNIT_STRIDE + i] * filterv[k];
-            }
-
-            p[j * PXSTRIDE(p_stride) + i] =
-                iclip_pixel((sum + rounding_off_v) >> round_bits_v);
-        }
-    }
-}
-
-// Sum over a 3x3 area
-// The dst and src pointers are positioned 3 pixels above and 3 pixels to the
-// left of the top left corner. However, the self guided filter only needs 1
-// pixel above and one pixel to the left. As for the pixels below and to the
-// right they must be computed in the sums, but don't need to be stored.
-//
-// Example for a 4x4 block:
-//      x x x x x x x x x x
-//      x c c c c c c c c x
-//      x i s s s s s s i x
-//      x i s s s s s s i x
-//      x i s s s s s s i x
-//      x i s s s s s s i x
-//      x i s s s s s s i x
-//      x i s s s s s s i x
-//      x c c c c c c c c x
-//      x x x x x x x x x x
-//
-// s: Pixel summed and stored
-// i: Pixel summed and stored (between loops)
-// c: Pixel summed not stored
-// x: Pixel not summed not stored
-static void boxsum3(coef *dst, const pixel *src, const int w, const int h) {
-    // We skip the first row, as it is never used
-    src += REST_UNIT_STRIDE;
-    dst += REST_UNIT_STRIDE;
-
-    // We skip the first and last columns, as they are never used
-    for (int x = 1; x < w - 1; x++) {
-        coef *ds = dst + x;
-        const pixel *s = src + x;
-        int a = s[0], b = s[REST_UNIT_STRIDE];
-
-        // We skip the first 2 rows, as they are skipped in the next loop and
-        // we don't need the last 2 row as it is skipped in the next loop
-        for (int y = 2; y < h - 2; y++) {
-            s += REST_UNIT_STRIDE;
-            const int c = s[REST_UNIT_STRIDE];
-            ds += REST_UNIT_STRIDE;
-            *ds = a + b + c;
-            a = b;
-            b = c;
-        }
-     }
-
-    // We skip the first 2 rows as they are never read
-    dst += REST_UNIT_STRIDE;
-    // We skip the last 2 rows as it is never read
-    for (int y = 2; y < h - 2; y++) {
-        int a = dst[1], b = dst[2];
-
-        // We don't store the first column as it is never read and
-        // we don't store the last 2 columns as they are never read
-        for (int x = 2; x < w - 2; x++) {
-            const int c = dst[x + 1];
-            dst[x] = a + b + c;
-            a = b;
-            b = c;
-        }
-        dst += REST_UNIT_STRIDE;
-    }
-}
-
-// Sum over a 5x5 area
-// The dst and src pointers are positioned 3 pixels above and 3 pixels to the
-// left of the top left corner. However, the self guided filter only needs 1
-// pixel above and one pixel to the left. As for the pixels below and to the
-// right they must be computed in the sums, but don't need to be stored.
-//
-// Example for a 4x4 block:
-//      c c c c c c c c c c
-//      c c c c c c c c c c
-//      i i s s s s s s i i
-//      i i s s s s s s i i
-//      i i s s s s s s i i
-//      i i s s s s s s i i
-//      i i s s s s s s i i
-//      i i s s s s s s i i
-//      c c c c c c c c c c
-//      c c c c c c c c c c
-//
-// s: Pixel summed and stored
-// i: Pixel summed and stored (between loops)
-// c: Pixel summed not stored
-// x: Pixel not summed not stored
-static void boxsum5(coef *dst, const pixel *const src, const int w, const int h) {
-    // We skip the first row, as it is never used
-    dst += REST_UNIT_STRIDE;
-
-    for (int x = 0; x < w; x++) {
-        coef *ds = dst + x;
-        const pixel *s = src + 3 * REST_UNIT_STRIDE + x;
-        int a = s[-3 * REST_UNIT_STRIDE];
-        int b = s[-2 * REST_UNIT_STRIDE];
-        int c = s[-1 * REST_UNIT_STRIDE];
-        int d = s[0];
-
-        // We skip the first 2 rows, as they are skipped in the next loop and
-        // we don't need the last 2 row as it is skipped in the next loop
-        for (int y = 2; y < h - 2; y++) {
-            s += REST_UNIT_STRIDE;
-            const int e = *s;
-            ds += REST_UNIT_STRIDE;
-            *ds = a + b + c + d + e;
-            a = b;
-            b = c;
-            c = d;
-            d = e;
-        }
-    }
-
-    // We skip the first 2 rows as they are never read
-    dst += REST_UNIT_STRIDE;
-    for (int y = 2; y < h - 2; y++) {
-        int a = dst[0];
-        int b = dst[1];
-        int c = dst[2];
-        int d = dst[3];
-
-        for (int x = 2; x < w - 2; x++) {
-            const int e = dst[x + 2];
-            dst[x] = a + b + c + d + e;
-            a = b;
-            b = c;
-            c = d;
-            d = e;
-        }
-        dst += REST_UNIT_STRIDE;
-    }
-}
-
-// See boxsum3 function comments for details on row and column skipping
-static void boxsum3sqr(int32_t *dst, const pixel *src, const int w, const int h) {
-    // We skip the first row, as it is never used
-    src += REST_UNIT_STRIDE;
-    dst += REST_UNIT_STRIDE;
-
-    // We skip the first and last columns, as they are never used
-    for (int x = 1; x < w - 1; x++) {
-        int *ds = dst + x;
-        const pixel *s = src + x;
-        int a = s[0] * s[0];
-        int b = s[REST_UNIT_STRIDE] * s[REST_UNIT_STRIDE];
-
-        // We skip the first row, as it is skipped in the next loop and
-        // we don't need the last row as it is skipped in the next loop
-        for (int y = 2; y < h - 2; y++) {
-            s += REST_UNIT_STRIDE;
-            const int c = s[REST_UNIT_STRIDE] * s[REST_UNIT_STRIDE];
-            ds += REST_UNIT_STRIDE;
-            *ds = a + b + c;
-            a = b;
-            b = c;
-        }
-     }
-
-    // We skip the first row as it is never read
-    dst += REST_UNIT_STRIDE;
-    // We skip the last row as it is never read
-    for (int y = 2; y < h - 2; y++) {
-        int a = dst[1], b = dst[2];
-
-        // We don't store the first column as it is never read and
-        // we don't store the last 2 columns as they are never read
-        for (int x = 2; x < w - 2; x++) {
-            const int c = dst[x + 1];
-            dst[x] = a + b + c;
-            a = b;
-            b = c;
-        }
-        dst += REST_UNIT_STRIDE;
-    }
-}
-
-// See boxsum5 function comments for details on row and column skipping
-static void boxsum5sqr(int32_t *dst, const pixel *const src, const int w,
-                       const int h)
-{
-    // We skip the first row, as it is never used
-    dst += REST_UNIT_STRIDE;
-
-    for (int x = 0; x < w; x++) {
-        int *ds = dst + x;
-        const pixel *s = src + 3 * REST_UNIT_STRIDE + x;
-        int a = s[-3 * REST_UNIT_STRIDE] * s[-3 * REST_UNIT_STRIDE];
-        int b = s[-2 * REST_UNIT_STRIDE] * s[-2 * REST_UNIT_STRIDE];
-        int c = s[-1 * REST_UNIT_STRIDE] * s[-1 * REST_UNIT_STRIDE];
-        int d = s[0] * s[0];
-
-        // We skip the first 2 rows, as they are skipped in the next loop and
-        // we don't need the last 2 row as it is skipped in the next loop
-        for (int y = 2; y < h - 2; y++) {
-            s += REST_UNIT_STRIDE;
-            const int e = s[0] * s[0];
-            ds += REST_UNIT_STRIDE;
-            *ds = a + b + c + d + e;
-            a = b;
-            b = c;
-            c = d;
-            d = e;
-        }
-    }
-
-    // We skip the first 2 rows as they are never read
-    dst += REST_UNIT_STRIDE;
-    for (int y = 2; y < h - 2; y++) {
-        int a = dst[0];
-        int b = dst[1];
-        int c = dst[2];
-        int d = dst[3];
-
-        for (int x = 2; x < w - 2; x++) {
-            const int e = dst[x + 2];
-            dst[x] = a + b + c + d + e;
-            a = b;
-            b = c;
-            c = d;
-            d = e;
-        }
-        dst += REST_UNIT_STRIDE;
-    }
-}
-
-static void selfguided_filter(int32_t *dst, const pixel *src,
-                              const ptrdiff_t src_stride, const int w,
-                              const int h, const int n, const int s)
-{
-    // Selfguided filter is applied to a maximum stripe height of 64 + 3 pixels
-    // of padding above and below
-    int32_t A_[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
-    int32_t *A = A_ + 3 * REST_UNIT_STRIDE + 3;
-    // By inverting A and B after the boxsums, B can be of size coef instead
-    // of int32_t
-    coef B_[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
-    coef *B = B_ + 3 * REST_UNIT_STRIDE + 3;
-
-    const int step = (n == 25) + 1;
-    if (n == 25) {
-        boxsum5(B_, src, w + 6, h + 6);
-        boxsum5sqr(A_, src, w + 6, h + 6);
-    } else {
-        boxsum3(B_, src, w + 6, h + 6);
-        boxsum3sqr(A_, src, w + 6, h + 6);
-    }
-
-    int32_t *AA = A - REST_UNIT_STRIDE;
-    coef *BB = B - REST_UNIT_STRIDE;
-    for (int j = -1; j < h + 1; j+= step) {
-        for (int i = -1; i < w + 1; i++) {
-            const int a =
-                (AA[i] + (1 << (2 * (BITDEPTH - 8)) >> 1)) >> (2 * (BITDEPTH - 8));
-            const int b =
-                (BB[i] + (1 << (BITDEPTH - 8) >> 1)) >> (BITDEPTH - 8);
-
-            const uint32_t p = (a * n >= b * b) * (a * n - b * b);
-            const uint32_t z = (p * s + (1 << 19)) >> 20;
-
-            const int x = dav1d_sgr_x_by_xplus1[imin(z, 255)];
-            // This is where we invert A and B, so that B is of size coef.
-            AA[i] = (((1 << 8) - x) * BB[i] * dav1d_sgr_one_by_x[n - 1] + (1 << 11)) >> 12;
-            BB[i] = x;
-        }
-        AA += step * REST_UNIT_STRIDE;
-        BB += step * REST_UNIT_STRIDE;
-    }
-
-    src += 3 * REST_UNIT_STRIDE + 3;
-    if (n == 25) {
-        int j = 0;
-#define SIX_NEIGHBORS(P, i)\
-    ((P[i - REST_UNIT_STRIDE]     + P[i + REST_UNIT_STRIDE]) * 6 +   \
-     (P[i - 1 - REST_UNIT_STRIDE] + P[i - 1 + REST_UNIT_STRIDE] +    \
-      P[i + 1 - REST_UNIT_STRIDE] + P[i + 1 + REST_UNIT_STRIDE]) * 5)
-        for (; j < h - 1; j+=2) {
-            for (int i = 0; i < w; i++) {
-                const int32_t a = SIX_NEIGHBORS(B, i);
-                const int32_t b = SIX_NEIGHBORS(A, i);
-                dst[i] = (a * src[i] + b + (1 << 8)) >> 9;
-            }
-            dst += 384 /* Maximum restoration width is 384 (256 * 1.5) */;
-            src += REST_UNIT_STRIDE;
-            B += REST_UNIT_STRIDE;
-            A += REST_UNIT_STRIDE;
-            for (int i = 0; i < w; i++) {
-                const int32_t a = B[i] * 6 + (B[i - 1] + B[i + 1]) * 5;
-                const int32_t b = A[i] * 6 + (A[i - 1] + A[i + 1]) * 5;
-                dst[i] = (a * src[i] + b + (1 << 7)) >> 8;
-            }
-            dst += 384 /* Maximum restoration width is 384 (256 * 1.5) */;
-            src += REST_UNIT_STRIDE;
-            B += REST_UNIT_STRIDE;
-            A += REST_UNIT_STRIDE;
-        }
-        if (j + 1 == h) { // Last row, when number of rows is odd
-            for (int i = 0; i < w; i++) {
-                const int32_t a = SIX_NEIGHBORS(B, i);
-                const int32_t b = SIX_NEIGHBORS(A, i);
-                dst[i] = (a * src[i] + b + (1 << 8)) >> 9;
-            }
-        }
-#undef SIX_NEIGHBORS
-    } else {
-#define EIGHT_NEIGHBORS(P, i)\
-    ((P[i] + P[i - 1] + P[i + 1] + P[i - REST_UNIT_STRIDE] + P[i + REST_UNIT_STRIDE]) * 4 + \
-     (P[i - 1 - REST_UNIT_STRIDE] + P[i - 1 + REST_UNIT_STRIDE] +                           \
-      P[i + 1 - REST_UNIT_STRIDE] + P[i + 1 + REST_UNIT_STRIDE]) * 3)
-        for (int j = 0; j < h; j++) {
-            for (int i = 0; i < w; i++) {
-                const int32_t a = EIGHT_NEIGHBORS(B, i);
-                const int32_t b = EIGHT_NEIGHBORS(A, i);
-                dst[i] = (a * src[i] + b + (1 << 8)) >> 9;
-            }
-            dst += 384;
-            src += REST_UNIT_STRIDE;
-            B += REST_UNIT_STRIDE;
-            A += REST_UNIT_STRIDE;
-        }
-    }
-#undef NINE_NEIGHBORS
-}
-
-static void selfguided_c(pixel *p, const ptrdiff_t p_stride,
-                         const pixel (*const left)[4],
-                         const pixel *lpf, const ptrdiff_t lpf_stride,
-                         const int w, const int h, const int sgr_idx,
-                         const int16_t sgr_w[2], const enum LrEdgeFlags edges)
-{
-    // Selfguided filter is applied to a maximum stripe height of 64 + 3 pixels
-    // of padding above and below
-    pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
-
-    padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges);
-
-    // Selfguided filter outputs to a maximum stripe height of 64 and a
-    // maximum restoration width of 384 (256 * 1.5)
-    int32_t dst[64 * 384];
-
-    // both r1 and r0 can't be zero
-    if (!dav1d_sgr_params[sgr_idx][0]) {
-        const int s1 = dav1d_sgr_params[sgr_idx][3];
-        selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 9, s1);
-        const int w1 = (1 << 7) - sgr_w[1];
-        for (int j = 0; j < h; j++) {
-            for (int i = 0; i < w; i++) {
-                const int32_t u = (p[i] << 4);
-                const int32_t v = (u << 7) + w1 * (dst[j * 384 + i] - u);
-                p[i] = iclip_pixel((v + (1 << 10)) >> 11);
-            }
-            p += PXSTRIDE(p_stride);
-        }
-    } else if (!dav1d_sgr_params[sgr_idx][1]) {
-        const int s0 = dav1d_sgr_params[sgr_idx][2];
-        selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 25, s0);
-        const int w0 = sgr_w[0];
-        for (int j = 0; j < h; j++) {
-            for (int i = 0; i < w; i++) {
-                const int32_t u = (p[i] << 4);
-                const int32_t v = (u << 7) + w0 * (dst[j * 384 + i] - u);
-                p[i] = iclip_pixel((v + (1 << 10)) >> 11);
-            }
-            p += PXSTRIDE(p_stride);
-        }
-    } else {
-        int32_t dst1[64 * 384];
-        const int s0 = dav1d_sgr_params[sgr_idx][2];
-        const int s1 = dav1d_sgr_params[sgr_idx][3];
-        const int w0 = sgr_w[0];
-        const int w1 = (1 << 7) - w0 - sgr_w[1];
-        selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 25, s0);
-        selfguided_filter(dst1, tmp, REST_UNIT_STRIDE, w, h, 9, s1);
-        for (int j = 0; j < h; j++) {
-            for (int i = 0; i < w; i++) {
-                const int32_t u = (p[i] << 4);
-                const int32_t v = (u << 7) + w0 * (dst[j * 384 + i] - u) +
-                                  w1 * (dst1[j * 384 + i] - u);
-                p[i] = iclip_pixel((v + (1 << 10)) >> 11);
-            }
-            p += PXSTRIDE(p_stride);
-        }
-    }
-}
-
-void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c) {
-    c->wiener = wiener_c;
-    c->selfguided = selfguided_c;
-
-#if HAVE_ASM && ARCH_X86 && BITDEPTH == 8
-    bitfn(dav1d_loop_restoration_dsp_init_x86)(c);
-#endif
-}
--- /dev/null
+++ b/src/looprestoration_tmpl.c
@@ -1,0 +1,577 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdlib.h>
+
+#include "common/intops.h"
+
+#include "src/looprestoration.h"
+#include "src/tables.h"
+
+// 256 * 1.5 + 3 + 3 = 390
+#define REST_UNIT_STRIDE (390)
+
+// TODO Reuse p when no padding is needed (add and remove lpf pixels in p)
+// TODO Chroma only requires 2 rows of padding.
+static void padding(pixel *dst, const pixel *p, const ptrdiff_t p_stride,
+                    const pixel (*left)[4],
+                    const pixel *lpf, const ptrdiff_t lpf_stride,
+                    int unit_w, const int stripe_h, const enum LrEdgeFlags edges)
+{   // Fills dst (row pitch REST_UNIT_STRIDE) with the stripe plus a 3-pixel border.
+    const int have_left = !!(edges & LR_HAVE_LEFT);
+    const int have_right = !!(edges & LR_HAVE_RIGHT);
+
+    // Widen the row copies by 3 pixels per existing side; missing sides are padded below
+    unit_w += 3 * have_left + 3 * have_right;
+    pixel *dst_l = dst + 3 * !have_left; // first column the row copies write to
+    p -= 3 * have_left;
+    lpf -= 3 * have_left;
+
+    if (edges & LR_HAVE_TOP) {
+        // Top border from lpf: its first row fills rows 0 and 1, its second row fills row 2
+        const pixel *const above_1 = lpf;
+        const pixel *const above_2 = above_1 + PXSTRIDE(lpf_stride);
+        pixel_copy(dst_l, above_1, unit_w);
+        pixel_copy(dst_l + REST_UNIT_STRIDE, above_1, unit_w);
+        pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, above_2, unit_w);
+    } else {
+        // No top edge: repeat the first row of p in all three border rows
+        pixel_copy(dst_l, p, unit_w);
+        pixel_copy(dst_l + REST_UNIT_STRIDE, p, unit_w);
+        pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, p, unit_w);
+        if (have_left) { // overwrite the first 3 pixels with left[0][1..3]
+            pixel_copy(dst_l, &left[0][1], 3);
+            pixel_copy(dst_l + REST_UNIT_STRIDE, &left[0][1], 3);
+            pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, &left[0][1], 3);
+        }
+    }
+
+    pixel *dst_tl = dst_l + 3 * REST_UNIT_STRIDE; // row 3: first row of the stripe itself
+    if (edges & LR_HAVE_BOTTOM) {
+        // Bottom border from lpf rows 6 and 7: row 6 is used once, row 7 twice
+        const pixel *const below_1 = lpf + 6 * PXSTRIDE(lpf_stride);
+        const pixel *const below_2 = below_1 + PXSTRIDE(lpf_stride);
+        pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, below_1, unit_w);
+        pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, below_2, unit_w);
+        pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, below_2, unit_w);
+    } else {
+        // No bottom edge: repeat the last row of p in all three border rows
+        const pixel *const src = p + (stripe_h - 1) * PXSTRIDE(p_stride);
+        pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, src, unit_w);
+        pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, src, unit_w);
+        pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, src, unit_w);
+        if (have_left) { // overwrite the first 3 pixels with left[stripe_h - 1][1..3]
+            pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+            pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+            pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+        }
+    }
+
+    // Inner UNIT_WxSTRIPE_H; with have_left the first 3 columns are filled from left[] below
+    for (int j = 0; j < stripe_h; j++) {
+        pixel_copy(dst_tl + 3 * have_left, p + 3 * have_left, unit_w - 3 * have_left);
+        dst_tl += REST_UNIT_STRIDE;
+        p += PXSTRIDE(p_stride);
+    }
+
+    if (!have_right) {
+        pixel *pad = dst_l + unit_w;
+        pixel *row_last = &dst_l[unit_w - 1]; // last pixel written by the row copies
+        // Pad 3x(STRIPE_H+6) with last column
+        for (int j = 0; j < stripe_h + 6; j++) {
+            pixel_set(pad, *row_last, 3);
+            pad += REST_UNIT_STRIDE;
+            row_last += REST_UNIT_STRIDE;
+        }
+    }
+
+    if (!have_left) {
+        // Pad 3x(STRIPE_H+6) with first column
+        for (int j = 0; j < stripe_h + 6; j++) {
+            pixel_set(dst, *dst_l, 3);
+            dst += REST_UNIT_STRIDE;
+            dst_l += REST_UNIT_STRIDE;
+        }
+    } else {
+        dst += 3 * REST_UNIT_STRIDE; // skip the 3 top border rows
+        for (int j = 0; j < stripe_h; j++) {
+            pixel_copy(dst, &left[j][1], 3); // left[j][1..3] become the 3 leftmost pixels
+            dst += REST_UNIT_STRIDE;
+        }
+    }
+}
+
+// FIXME Could split into luma and chroma specific functions,
+// (since first and last tops are always 0 for chroma)
+// FIXME Could implement a version that requires less temporary memory
+// (should be possible to implement with only 6 rows of temp storage)
+static void wiener_c(pixel *p, const ptrdiff_t p_stride,
+                     const pixel (*const left)[4],
+                     const pixel *lpf, const ptrdiff_t lpf_stride,
+                     const int w, const int h,
+                     const int16_t filterh[7], const int16_t filterv[7],
+                     const enum LrEdgeFlags edges)
+{
+    // Wiener filtering is applied to a maximum stripe height of 64 + 3 pixels
+    // of padding above and below
+    pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
+    pixel *tmp_ptr = tmp;
+
+    padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges); // tmp = padded copy of p
+
+    // Values stored between horizontal and vertical filtering don't
+    // fit in a uint8_t.
+    uint16_t hor[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
+    uint16_t *hor_ptr = hor;
+
+    const int round_bits_h = 3 + (BITDEPTH == 12) * 2; // 5 when BITDEPTH is 12, else 3
+    const int rounding_off_h = 1 << (round_bits_h - 1);
+    const int clip_limit = 1 << ((BITDEPTH) + 1 + 7 - round_bits_h);
+    for (int j = 0; j < h + 6; j++) { // horizontal pass over all h + 6 padded rows
+        for (int i = 0; i < w; i++) {
+            int sum = (tmp_ptr[i + 3] << 7) + (1 << (BITDEPTH + 6)); // i + 3: centre of 7 taps
+
+            for (int k = 0; k < 7; k++) {
+                sum += tmp_ptr[i + k] * filterh[k];
+            }
+
+            hor_ptr[i] =
+                iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
+        }
+        tmp_ptr += REST_UNIT_STRIDE;
+        hor_ptr += REST_UNIT_STRIDE;
+    }
+
+    const int round_bits_v = 11 - (BITDEPTH == 12) * 2; // 9 when BITDEPTH is 12, else 11
+    const int rounding_off_v = 1 << (round_bits_v - 1);
+    const int round_offset = 1 << (BITDEPTH + (round_bits_v - 1));
+    for (int i = 0; i < w; i++) { // vertical pass; results are written back into p
+        for (int j = 0; j < h; j++) {
+            int sum = (hor[(j + 3) * REST_UNIT_STRIDE + i] << 7) - round_offset; // centre row
+
+            for (int k = 0; k < 7; k++) {
+                sum += hor[(j + k) * REST_UNIT_STRIDE + i] * filterv[k];
+            }
+
+            p[j * PXSTRIDE(p_stride) + i] =
+                iclip_pixel((sum + rounding_off_v) >> round_bits_v);
+        }
+    }
+}
+
+// Sum over a 3x3 area
+// The dst and src pointers are positioned 3 pixels above and 3 pixels to the
+// left of the top left corner. However, the self guided filter only needs 1
+// pixel above and one pixel to the left. As for the pixels below and to the
+// right they must be computed in the sums, but don't need to be stored.
+//
+// Example for a 4x4 block:
+//      x x x x x x x x x x
+//      x c c c c c c c c x
+//      x i s s s s s s i x
+//      x i s s s s s s i x
+//      x i s s s s s s i x
+//      x i s s s s s s i x
+//      x i s s s s s s i x
+//      x i s s s s s s i x
+//      x c c c c c c c c x
+//      x x x x x x x x x x
+//
+// s: Pixel summed and stored
+// i: Pixel summed and stored (between loops)
+// c: Pixel summed not stored
+// x: Pixel not summed not stored
+static void boxsum3(coef *dst, const pixel *src, const int w, const int h) {
+    // We skip the first row, as it is never used
+    src += REST_UNIT_STRIDE;
+    dst += REST_UNIT_STRIDE;
+
+    // Vertical pass. We skip the first and last columns, as they are never used
+    for (int x = 1; x < w - 1; x++) {
+        coef *ds = dst + x;
+        const pixel *s = src + x;
+        int a = s[0], b = s[REST_UNIT_STRIDE]; // the two rows above the next one read
+
+        // We skip the first 2 rows, as they are skipped in the next loop, and
+        // we don't need the last 2 rows, as they are skipped in the next loop
+        for (int y = 2; y < h - 2; y++) {
+            s += REST_UNIT_STRIDE;
+            const int c = s[REST_UNIT_STRIDE];
+            ds += REST_UNIT_STRIDE;
+            *ds = a + b + c; // sum of three vertically adjacent source pixels
+            a = b;
+            b = c;
+        }
+     }
+
+    // Horizontal pass, in place on dst. We skip the first 2 rows as they are never read
+    dst += REST_UNIT_STRIDE;
+    // We skip the last 2 rows as they are never read
+    for (int y = 2; y < h - 2; y++) {
+        int a = dst[1], b = dst[2]; // cached so overwriting dst[x] does not disturb the sum
+
+        // We don't store the first column as it is never read and
+        // we don't store the last 2 columns as they are never read
+        for (int x = 2; x < w - 2; x++) {
+            const int c = dst[x + 1];
+            dst[x] = a + b + c; // sum of the column sums at x - 1, x and x + 1
+            a = b;
+            b = c;
+        }
+        dst += REST_UNIT_STRIDE;
+    }
+}
+
+// Sum over a 5x5 area
+// The dst and src pointers are positioned 3 pixels above and 3 pixels to the
+// left of the top left corner. However, the self guided filter only needs 1
+// pixel above and one pixel to the left. As for the pixels below and to the
+// right they must be computed in the sums, but don't need to be stored.
+//
+// Example for a 4x4 block:
+//      c c c c c c c c c c
+//      c c c c c c c c c c
+//      i i s s s s s s i i
+//      i i s s s s s s i i
+//      i i s s s s s s i i
+//      i i s s s s s s i i
+//      i i s s s s s s i i
+//      i i s s s s s s i i
+//      c c c c c c c c c c
+//      c c c c c c c c c c
+//
+// s: Pixel summed and stored
+// i: Pixel summed and stored (between loops)
+// c: Pixel summed not stored
+// x: Pixel not summed not stored
+static void boxsum5(coef *dst, const pixel *const src, const int w, const int h) {
+    // Skip the first row of dst, as it is never used
+    dst += REST_UNIT_STRIDE;
+
+    for (int x = 0; x < w; x++) {
+        coef *ds = dst + x;
+        const pixel *s = src + 3 * REST_UNIT_STRIDE + x;
+        int a = s[-3 * REST_UNIT_STRIDE];
+        int b = s[-2 * REST_UNIT_STRIDE];
+        int c = s[-1 * REST_UNIT_STRIDE];
+        int d = s[0];
+
+        // Vertical pass: each stored value sums 5 vertically adjacent pixels.
+        // The first 2 and last 2 rows are skipped, as in the next loop
+        for (int y = 2; y < h - 2; y++) {
+            s += REST_UNIT_STRIDE;
+            const int e = *s;
+            ds += REST_UNIT_STRIDE;
+            *ds = a + b + c + d + e;
+            a = b;
+            b = c;
+            c = d;
+            d = e;
+        }
+    }
+
+    // Horizontal pass, in place on dst. The first 2 rows are never read
+    dst += REST_UNIT_STRIDE;
+    for (int y = 2; y < h - 2; y++) {
+        int a = dst[0];
+        int b = dst[1];
+        int c = dst[2];
+        int d = dst[3];
+
+        for (int x = 2; x < w - 2; x++) {
+            const int e = dst[x + 2];
+            dst[x] = a + b + c + d + e;
+            a = b;
+            b = c;
+            c = d;
+            d = e;
+        }
+        dst += REST_UNIT_STRIDE;
+    }
+}
+
+// See boxsum3 function comments for details on row and column skipping
+static void boxsum3sqr(int32_t *dst, const pixel *src, const int w, const int h) {
+    // Skip the first row of src and dst, as it is never used
+    src += REST_UNIT_STRIDE;
+    dst += REST_UNIT_STRIDE;
+
+    // Vertical pass. The first and last columns are skipped, as they are never used
+    for (int x = 1; x < w - 1; x++) {
+        int *ds = dst + x;
+        const pixel *s = src + x;
+        int a = s[0] * s[0];
+        int b = s[REST_UNIT_STRIDE] * s[REST_UNIT_STRIDE];
+
+        // Each stored value sums the squares of a pixel and its neighbours above
+        // and below. The first 2 and last 2 rows are skipped, as in the next loop
+        for (int y = 2; y < h - 2; y++) {
+            s += REST_UNIT_STRIDE;
+            const int c = s[REST_UNIT_STRIDE] * s[REST_UNIT_STRIDE];
+            ds += REST_UNIT_STRIDE;
+            *ds = a + b + c;
+            a = b;
+            b = c;
+        }
+     }
+
+    // Horizontal pass, in place on dst. The first 2 rows are never read
+    dst += REST_UNIT_STRIDE;
+    // The last 2 rows are skipped as they are never read
+    for (int y = 2; y < h - 2; y++) {
+        int a = dst[1], b = dst[2];
+
+        // This pass does not store the columns below 2 nor the last 2
+        // columns, as they are never read
+        for (int x = 2; x < w - 2; x++) {
+            const int c = dst[x + 1];
+            dst[x] = a + b + c;
+            a = b;
+            b = c;
+        }
+        dst += REST_UNIT_STRIDE;
+    }
+}
+
+// See boxsum5 function comments for details on row and column skipping
+static void boxsum5sqr(int32_t *dst, const pixel *const src, const int w,
+                       const int h)
+{
+    // Skip the first row of dst, as it is never used
+    dst += REST_UNIT_STRIDE;
+
+    for (int x = 0; x < w; x++) {
+        int *ds = dst + x;
+        const pixel *s = src + 3 * REST_UNIT_STRIDE + x;
+        int a = s[-3 * REST_UNIT_STRIDE] * s[-3 * REST_UNIT_STRIDE];
+        int b = s[-2 * REST_UNIT_STRIDE] * s[-2 * REST_UNIT_STRIDE];
+        int c = s[-1 * REST_UNIT_STRIDE] * s[-1 * REST_UNIT_STRIDE];
+        int d = s[0] * s[0];
+
+        // Vertical pass: each stored value sums 5 vertically adjacent squares.
+        // The first 2 and last 2 rows are skipped, as in the next loop
+        for (int y = 2; y < h - 2; y++) {
+            s += REST_UNIT_STRIDE;
+            const int e = s[0] * s[0];
+            ds += REST_UNIT_STRIDE;
+            *ds = a + b + c + d + e;
+            a = b;
+            b = c;
+            c = d;
+            d = e;
+        }
+    }
+
+    // Horizontal pass, in place on dst. The first 2 rows are never read
+    dst += REST_UNIT_STRIDE;
+    for (int y = 2; y < h - 2; y++) {
+        int a = dst[0];
+        int b = dst[1];
+        int c = dst[2];
+        int d = dst[3];
+
+        for (int x = 2; x < w - 2; x++) {
+            const int e = dst[x + 2];
+            dst[x] = a + b + c + d + e;
+            a = b;
+            b = c;
+            c = d;
+            d = e;
+        }
+        dst += REST_UNIT_STRIDE;
+    }
+}
+
+static void selfguided_filter(int32_t *dst, const pixel *src,
+                              const ptrdiff_t src_stride, const int w,
+                              const int h, const int n, const int s)
+{
+    // Self-guided filter of a w x h block; n selects the box (25: 5x5, else 3x3).
+    // Stripes are at most 64 rows high, plus 3 rows of padding above and below
+    int32_t A_[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
+    int32_t *A = A_ + 3 * REST_UNIT_STRIDE + 3;
+    // By inverting A and B after the boxsums, B can be of size coef instead
+    // of int32_t
+    coef B_[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
+    coef *B = B_ + 3 * REST_UNIT_STRIDE + 3;
+
+    const int step = (n == 25) + 1; // the 5x5 case only computes every other row
+    if (n == 25) {
+        boxsum5(B_, src, w + 6, h + 6);
+        boxsum5sqr(A_, src, w + 6, h + 6);
+    } else {
+        boxsum3(B_, src, w + 6, h + 6);
+        boxsum3sqr(A_, src, w + 6, h + 6);
+    }
+
+    int32_t *AA = A - REST_UNIT_STRIDE;
+    coef *BB = B - REST_UNIT_STRIDE;
+    for (int j = -1; j < h + 1; j+= step) {
+        for (int i = -1; i < w + 1; i++) {
+            const int a =
+                (AA[i] + (1 << (2 * (BITDEPTH - 8)) >> 1)) >> (2 * (BITDEPTH - 8));
+            const int b =
+                (BB[i] + (1 << (BITDEPTH - 8) >> 1)) >> (BITDEPTH - 8);
+
+            const uint32_t p = (a * n >= b * b) * (a * n - b * b); // 0 if negative
+            const uint32_t z = (p * s + (1 << 19)) >> 20;
+
+            const int x = dav1d_sgr_x_by_xplus1[imin(z, 255)];
+            // This is where we invert A and B, so that B is of size coef.
+            AA[i] = (((1 << 8) - x) * BB[i] * dav1d_sgr_one_by_x[n - 1] + (1 << 11)) >> 12;
+            BB[i] = x;
+        }
+        AA += step * REST_UNIT_STRIDE;
+        BB += step * REST_UNIT_STRIDE;
+    }
+
+    src += 3 * REST_UNIT_STRIDE + 3;
+    if (n == 25) {
+        int j = 0;
+#define SIX_NEIGHBORS(P, i)\
+    ((P[i - REST_UNIT_STRIDE]     + P[i + REST_UNIT_STRIDE]) * 6 +   \
+     (P[i - 1 - REST_UNIT_STRIDE] + P[i - 1 + REST_UNIT_STRIDE] +    \
+      P[i + 1 - REST_UNIT_STRIDE] + P[i + 1 + REST_UNIT_STRIDE]) * 5)
+        for (; j < h - 1; j+=2) {
+            for (int i = 0; i < w; i++) {
+                const int32_t a = SIX_NEIGHBORS(B, i);
+                const int32_t b = SIX_NEIGHBORS(A, i);
+                dst[i] = (a * src[i] + b + (1 << 8)) >> 9;
+            }
+            dst += 384 /* Maximum restoration width is 384 (256 * 1.5) */;
+            src += REST_UNIT_STRIDE;
+            B += REST_UNIT_STRIDE;
+            A += REST_UNIT_STRIDE;
+            for (int i = 0; i < w; i++) {
+                const int32_t a = B[i] * 6 + (B[i - 1] + B[i + 1]) * 5;
+                const int32_t b = A[i] * 6 + (A[i - 1] + A[i + 1]) * 5;
+                dst[i] = (a * src[i] + b + (1 << 7)) >> 8;
+            }
+            dst += 384 /* Maximum restoration width is 384 (256 * 1.5) */;
+            src += REST_UNIT_STRIDE;
+            B += REST_UNIT_STRIDE;
+            A += REST_UNIT_STRIDE;
+        }
+        if (j + 1 == h) { // Last row, when number of rows is odd
+            for (int i = 0; i < w; i++) {
+                const int32_t a = SIX_NEIGHBORS(B, i);
+                const int32_t b = SIX_NEIGHBORS(A, i);
+                dst[i] = (a * src[i] + b + (1 << 8)) >> 9;
+            }
+        }
+#undef SIX_NEIGHBORS
+    } else {
+#define EIGHT_NEIGHBORS(P, i)\
+    ((P[i] + P[i - 1] + P[i + 1] + P[i - REST_UNIT_STRIDE] + P[i + REST_UNIT_STRIDE]) * 4 + \
+     (P[i - 1 - REST_UNIT_STRIDE] + P[i - 1 + REST_UNIT_STRIDE] +                           \
+      P[i + 1 - REST_UNIT_STRIDE] + P[i + 1 + REST_UNIT_STRIDE]) * 3)
+        for (int j = 0; j < h; j++) {
+            for (int i = 0; i < w; i++) {
+                const int32_t a = EIGHT_NEIGHBORS(B, i);
+                const int32_t b = EIGHT_NEIGHBORS(A, i);
+                dst[i] = (a * src[i] + b + (1 << 8)) >> 9;
+            }
+            dst += 384;
+            src += REST_UNIT_STRIDE;
+            B += REST_UNIT_STRIDE;
+            A += REST_UNIT_STRIDE;
+        }
+    }
+#undef EIGHT_NEIGHBORS
+}
+
+static void selfguided_c(pixel *p, const ptrdiff_t p_stride,
+                         const pixel (*const left)[4],
+                         const pixel *lpf, const ptrdiff_t lpf_stride,
+                         const int w, const int h, const int sgr_idx,
+                         const int16_t sgr_w[2], const enum LrEdgeFlags edges)
+{
+    // Padded copy of the stripe being filtered: at most 64 rows, plus 3 rows
+    // of padding above and below
+    pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
+
+    padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges);
+
+    // Selfguided filter outputs to a maximum stripe height of 64 and a
+    // maximum restoration width of 384 (256 * 1.5)
+    int32_t dst[64 * 384];
+
+    // r0 and r1 cannot both be zero, so at least one filter pass always runs
+    if (!dav1d_sgr_params[sgr_idx][0]) { // 3x3 filter only
+        const int s1 = dav1d_sgr_params[sgr_idx][3];
+        selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 9, s1);
+        const int w1 = (1 << 7) - sgr_w[1];
+        for (int j = 0; j < h; j++) {
+            for (int i = 0; i < w; i++) {
+                const int32_t u = (p[i] << 4);
+                const int32_t v = (u << 7) + w1 * (dst[j * 384 + i] - u);
+                p[i] = iclip_pixel((v + (1 << 10)) >> 11);
+            }
+            p += PXSTRIDE(p_stride);
+        }
+    } else if (!dav1d_sgr_params[sgr_idx][1]) { // 5x5 filter only
+        const int s0 = dav1d_sgr_params[sgr_idx][2];
+        selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 25, s0);
+        const int w0 = sgr_w[0];
+        for (int j = 0; j < h; j++) {
+            for (int i = 0; i < w; i++) {
+                const int32_t u = (p[i] << 4);
+                const int32_t v = (u << 7) + w0 * (dst[j * 384 + i] - u);
+                p[i] = iclip_pixel((v + (1 << 10)) >> 11);
+            }
+            p += PXSTRIDE(p_stride);
+        }
+    } else { // both filters, blended with weights w0 and w1
+        int32_t dst1[64 * 384];
+        const int s0 = dav1d_sgr_params[sgr_idx][2];
+        const int s1 = dav1d_sgr_params[sgr_idx][3];
+        const int w0 = sgr_w[0];
+        const int w1 = (1 << 7) - w0 - sgr_w[1];
+        selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 25, s0);
+        selfguided_filter(dst1, tmp, REST_UNIT_STRIDE, w, h, 9, s1);
+        for (int j = 0; j < h; j++) {
+            for (int i = 0; i < w; i++) {
+                const int32_t u = (p[i] << 4);
+                const int32_t v = (u << 7) + w0 * (dst[j * 384 + i] - u) +
+                                  w1 * (dst1[j * 384 + i] - u);
+                p[i] = iclip_pixel((v + (1 << 10)) >> 11);
+            }
+            p += PXSTRIDE(p_stride);
+        }
+    }
+}
+
+void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c) {
+    c->wiener = wiener_c; // defaults: the C implementations from this file
+    c->selfguided = selfguided_c;
+
+#if HAVE_ASM && ARCH_X86 && BITDEPTH == 8 /* x86 hook: 8-bit asm builds only */
+    bitfn(dav1d_loop_restoration_dsp_init_x86)(c);
+#endif
+}
--- a/src/lr_apply.c
+++ /dev/null
@@ -1,296 +1,0 @@
-/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- *    list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "config.h"
-
-#include <stdio.h>
-
-#include "common/intops.h"
-
-#include "src/lr_apply.h"
-
-
-enum LrRestorePlanes {
-    LR_RESTORE_Y = 1 << 0,
-    LR_RESTORE_U = 1 << 1,
-    LR_RESTORE_V = 1 << 2,
-};
-
-// The loop filter buffer stores 12 rows of pixels. A superblock block will
-// contain at most 2 stripes. Each stripe requires 4 rows pixels (2 above
-// and 2 below) the final 4 rows are used to swap the bottom of the last
-// stripe with the top of the next super block row.
-static void backup_lpf(pixel *dst, ptrdiff_t dst_stride,
-                       const pixel *src, ptrdiff_t src_stride,
-                       const int ss_ver, const int sb128,
-                       int row, const int row_h, const int w)
-{
-    src_stride = PXSTRIDE(src_stride);
-    dst_stride = PXSTRIDE(dst_stride);
-
-    // The first stripe of the frame is shorter by 8 luma pixel rows.
-    int stripe_h = (64 - 8 * !row) >> ss_ver;
-
-    if (row) {
-        const int top = 4 << sb128;
-        // Copy the top part of the stored loop filtered pixels from the
-        // previous sb row needed above the first stripe of this sb row.
-        pixel_copy(&dst[dst_stride *  0], &dst[dst_stride *  top], w);
-        pixel_copy(&dst[dst_stride *  1], &dst[dst_stride * (top + 1)], w);
-        pixel_copy(&dst[dst_stride *  2], &dst[dst_stride * (top + 2)], w);
-        pixel_copy(&dst[dst_stride *  3], &dst[dst_stride * (top + 3)], w);
-    }
-
-    dst += 4 * dst_stride;
-    src += (stripe_h - 2) * src_stride;
-
-    for (; row + stripe_h <= row_h; row += stripe_h) {
-        for (int i = 0; i < 4; i++) {
-            pixel_copy(dst, src, w);
-            dst += dst_stride;
-            src += src_stride;
-        }
-        stripe_h = 64 >> ss_ver;
-        src += (stripe_h - 4) * src_stride;
-    }
-}
-
-void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f,
-                               /*const*/ pixel *const src[3], const int sby)
-{
-    const ptrdiff_t offset = 8 * !!sby;
-    const ptrdiff_t *const src_stride = f->cur.p.stride;
-
-    // TODO Also check block level restore type to reduce copying.
-    const int restore_planes =
-        ((f->frame_hdr.restoration.type[0] != RESTORATION_NONE) << 0) +
-        ((f->frame_hdr.restoration.type[1] != RESTORATION_NONE) << 1) +
-        ((f->frame_hdr.restoration.type[2] != RESTORATION_NONE) << 2);
-
-    if (restore_planes & LR_RESTORE_Y) {
-        const int h = f->bh << 2;
-        const int w = f->bw << 2;
-        const int row_h = imin((sby + 1) << (6 + f->seq_hdr.sb128), h);
-        const int y_stripe = (sby << (6 + f->seq_hdr.sb128)) - offset;
-        backup_lpf(f->lf.lr_lpf_line_ptr[0], sizeof(pixel) * f->b4_stride * 4,
-                   src[0] - offset * PXSTRIDE(src_stride[0]), src_stride[0],
-                   0, f->seq_hdr.sb128, y_stripe, row_h, w);
-    }
-    if (restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) {
-        const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
-        const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
-        const int h = f->bh << (2 - ss_ver);
-        const int w = f->bw << (2 - ss_hor);
-        const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr.sb128), h);
-        const ptrdiff_t offset_uv = offset >> ss_ver;
-        const int y_stripe =
-            (sby << ((6 - ss_ver) + f->seq_hdr.sb128)) - offset_uv;
-
-        if (restore_planes & LR_RESTORE_U) {
-            backup_lpf(f->lf.lr_lpf_line_ptr[1], sizeof(pixel) * f->b4_stride * 4,
-                       src[1] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1],
-                       ss_ver, f->seq_hdr.sb128, y_stripe, row_h, w);
-        }
-        if (restore_planes & LR_RESTORE_V) {
-            backup_lpf(f->lf.lr_lpf_line_ptr[2], sizeof(pixel) * f->b4_stride * 4,
-                       src[2] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1],
-                       ss_ver, f->seq_hdr.sb128, y_stripe, row_h, w);
-        }
-    }
-}
-
-
-static void lr_stripe(const Dav1dFrameContext *const f, pixel *p,
-                      const pixel (*left)[4], int x, int y,
-                      const int plane, const int unit_w, const int row_h,
-                      const Av1RestorationUnit *const lr, enum LrEdgeFlags edges)
-{
-    const Dav1dDSPContext *const dsp = f->dsp;
-    const int chroma = !!plane;
-    const int ss_ver = chroma & (f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420);
-    const int sbrow_has_bottom = (edges & LR_HAVE_BOTTOM);
-    const pixel *lpf = f->lf.lr_lpf_line_ptr[plane] + x;
-    const ptrdiff_t p_stride = f->cur.p.stride[chroma];
-    const ptrdiff_t lpf_stride = sizeof(pixel) * f->b4_stride * 4;
-
-    // The first stripe of the frame is shorter by 8 luma pixel rows.
-    int stripe_h = imin((64 - 8 * !y) >> ss_ver, row_h - y);
-
-    // FIXME [8] might be easier for SIMD
-    int16_t filterh[7], filterv[7];
-    if (lr->type == RESTORATION_WIENER) {
-        filterh[0] = filterh[6] = lr->filter_h[0];
-        filterh[1] = filterh[5] = lr->filter_h[1];
-        filterh[2] = filterh[4] = lr->filter_h[2];
-        filterh[3] = -((filterh[0] + filterh[1] + filterh[2]) * 2);
-
-        filterv[0] = filterv[6] = lr->filter_v[0];
-        filterv[1] = filterv[5] = lr->filter_v[1];
-        filterv[2] = filterv[4] = lr->filter_v[2];
-        filterv[3] = -((filterv[0] + filterv[1] + filterv[2]) * 2);
-    }
-
-    while (y + stripe_h <= row_h) {
-        // TODO Look into getting rid of the this if
-        if (y + stripe_h == row_h) {
-            edges &= ~LR_HAVE_BOTTOM;
-        } else {
-            edges |= LR_HAVE_BOTTOM;
-        }
-        if (lr->type == RESTORATION_WIENER) {
-            dsp->lr.wiener(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
-                           filterh, filterv, edges);
-        } else {
-            assert(lr->type == RESTORATION_SGRPROJ);
-            dsp->lr.selfguided(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
-                               lr->sgr_idx, lr->sgr_weights, edges);
-        }
-
-        left += stripe_h;
-        y += stripe_h;
-        if (y + stripe_h > row_h && sbrow_has_bottom) break;
-        p += stripe_h * PXSTRIDE(p_stride);
-        edges |= LR_HAVE_TOP;
-        stripe_h = imin(64 >> ss_ver, row_h - y);
-        if (stripe_h == 0) break;
-        lpf += 4 * PXSTRIDE(lpf_stride);
-    }
-}
-
-static void backup4xU(pixel (*dst)[4], const pixel *src, const ptrdiff_t src_stride,
-                      int u)
-{
-    for (; u > 0; u--, dst++, src += PXSTRIDE(src_stride))
-        pixel_copy(dst, src, 4);
-}
-
-static void lr_sbrow(const Dav1dFrameContext *const f, pixel *p, const int y,
-                     const int w, const int h, const int row_h, const int plane)
-{
-    const int chroma = !!plane;
-    const int ss_ver = chroma & (f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420);
-    const int ss_hor = chroma & (f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444);
-    const ptrdiff_t p_stride = f->cur.p.stride[chroma];
-
-    const int unit_size_log2 = f->frame_hdr.restoration.unit_size[!!plane];
-    const int unit_size = 1 << unit_size_log2;
-    const int half_unit_size = unit_size >> 1;
-    const int max_unit_size = unit_size + half_unit_size;
-
-    // Y coordinate of the sbrow (y is 8 luma pixel rows above row_y)
-    const int row_y = y + ((8 >> ss_ver) * !!y);
-
-    // FIXME This is an ugly hack to lookup the proper AV1Filter unit for
-    // chroma planes. Question: For Multithreaded decoding, is it better
-    // to store the chroma LR information with collocated Luma information?
-    // In other words. For a chroma restoration unit locate at 128,128 and
-    // with a 4:2:0 chroma subsampling, do we store the filter information at
-    // the AV1Filter unit located at (128,128) or (256,256)
-    // TODO Support chroma subsampling.
-    const int shift_ver = 7 - ss_ver;
-    const int shift_hor = 7 - ss_hor;
-
-    int ruy = (row_y >> unit_size_log2);
-    // Merge last restoration unit if its height is < half_unit_size
-    if (ruy > 0) ruy -= (ruy << unit_size_log2) + half_unit_size > h;
-
-    // The first stripe of the frame is shorter by 8 luma pixel rows.
-    const int filter_h =
-        imin(((1 << (6 + f->seq_hdr.sb128)) - 8 * !y) >> ss_ver, h - y);
-
-    pixel pre_lr_border[2][128 + 8 /* maximum sbrow height is 128 + 8 rows offset */][4];
-
-    int unit_w = unit_size, bit = 0;
-
-    enum LrEdgeFlags edges = (y > 0 ? LR_HAVE_TOP : 0) |
-                             (row_h < h ? LR_HAVE_BOTTOM : 0);
-
-    for (int x = 0, rux = 0; x < w; x+= unit_w, rux++, edges |= LR_HAVE_LEFT, bit ^= 1) {
-        // TODO Clean up this if statement.
-        if (x + max_unit_size > w) {
-            unit_w = w - x;
-            edges &= ~LR_HAVE_RIGHT;
-        } else {
-            edges |= LR_HAVE_RIGHT;
-        }
-
-        // Based on the position of the restoration unit, find the corresponding
-        // AV1Filter unit.
-        const int unit_idx = ((ruy & 16) >> 3) + ((rux & 16) >> 4);
-        const Av1RestorationUnit *const lr =
-            &f->lf.mask[(((ruy << (unit_size_log2)) >> shift_ver) * f->sb128w) +
-                        (x >> shift_hor)].lr[plane][unit_idx];
-
-        // FIXME Don't backup if the next restoration unit is RESTORE_NONE
-        // This also requires not restoring in the same conditions.
-        if (edges & LR_HAVE_RIGHT) {
-            backup4xU(pre_lr_border[bit], p + unit_w - 4, p_stride, filter_h);
-        }
-        if (lr->type != RESTORATION_NONE) {
-            lr_stripe(f, p, pre_lr_border[!bit], x, y, plane, unit_w, row_h, lr, edges);
-        }
-        p += unit_w;
-    }
-}
-
-void bytefn(dav1d_lr_sbrow)(Dav1dFrameContext *const f, pixel *const dst[3],
-                            const int sby)
-{
-    const ptrdiff_t offset_y = 8 * !!sby;
-    const ptrdiff_t *const dst_stride = f->cur.p.stride;
-
-    const int restore_planes =
-        ((f->frame_hdr.restoration.type[0] != RESTORATION_NONE) << 0) +
-        ((f->frame_hdr.restoration.type[1] != RESTORATION_NONE) << 1) +
-        ((f->frame_hdr.restoration.type[2] != RESTORATION_NONE) << 2);
-
-    if (restore_planes & LR_RESTORE_Y) {
-        const int h = f->cur.p.p.h;
-        const int w = f->cur.p.p.w;
-        const int row_h = imin((sby + 1) << (6 + f->seq_hdr.sb128), h);
-        const int y_stripe = (sby << (6 + f->seq_hdr.sb128)) - offset_y;
-        lr_sbrow(f, dst[0] - offset_y * PXSTRIDE(dst_stride[0]), y_stripe, w,
-                 h, row_h, 0);
-    }
-    if (restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) {
-        const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
-        const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
-        const int h = (f->cur.p.p.h + ss_ver) >> ss_ver;
-        const int w = (f->cur.p.p.w + ss_hor) >> ss_hor;
-        const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr.sb128), h);
-        const ptrdiff_t offset_uv = offset_y >> ss_ver;
-        const int y_stripe =
-            (sby << ((6 - ss_ver) + f->seq_hdr.sb128)) - offset_uv;
-        if (restore_planes & LR_RESTORE_U)
-            lr_sbrow(f, dst[1] - offset_uv * PXSTRIDE(dst_stride[1]), y_stripe,
-                     w, h, row_h, 1);
-
-        if (restore_planes & LR_RESTORE_V)
-            lr_sbrow(f, dst[2] - offset_uv * PXSTRIDE(dst_stride[1]), y_stripe,
-                     w, h, row_h, 2);
-    }
-}
--- /dev/null
+++ b/src/lr_apply_tmpl.c
@@ -1,0 +1,296 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdio.h>
+
+#include "common/intops.h"
+
+#include "src/lr_apply.h"
+
+
+enum LrRestorePlanes { // bit flags tested against the restore_planes mask
+    LR_RESTORE_Y = 1 << 0, // plane 0 has a restoration type other than NONE
+    LR_RESTORE_U = 1 << 1, // plane 1 has a restoration type other than NONE
+    LR_RESTORE_V = 1 << 2, // plane 2 has a restoration type other than NONE
+};
+
+// The loop filter buffer stores 12 rows of pixels. A superblock block will
+// contain at most 2 stripes. Each stripe requires 4 rows pixels (2 above
+// and 2 below) the final 4 rows are used to swap the bottom of the last
+// stripe with the top of the next super block row.
+static void backup_lpf(pixel *dst, ptrdiff_t dst_stride,
+                       const pixel *src, ptrdiff_t src_stride,
+                       const int ss_ver, const int sb128,
+                       int row, const int row_h, const int w)
+{
+    src_stride = PXSTRIDE(src_stride);
+    dst_stride = PXSTRIDE(dst_stride);
+
+    // The first stripe of the frame is shorter by 8 luma pixel rows.
+    int stripe_h = (64 - 8 * !row) >> ss_ver;
+
+    if (row) {
+        const int top = 4 << sb128;
+        // Move the 4 rows stored at offset top (saved for the previous sb row)
+        // to the start of dst, where they sit above this sb row's first stripe.
+        pixel_copy(&dst[dst_stride *  0], &dst[dst_stride *  top], w);
+        pixel_copy(&dst[dst_stride *  1], &dst[dst_stride * (top + 1)], w);
+        pixel_copy(&dst[dst_stride *  2], &dst[dst_stride * (top + 2)], w);
+        pixel_copy(&dst[dst_stride *  3], &dst[dst_stride * (top + 3)], w);
+    }
+
+    dst += 4 * dst_stride; // the first 4 dst rows are the ones handled above
+    src += (stripe_h - 2) * src_stride; // 2 rows above the stripe's bottom edge
+
+    for (; row + stripe_h <= row_h; row += stripe_h) {
+        for (int i = 0; i < 4; i++) { // 4 rows straddling the stripe boundary
+            pixel_copy(dst, src, w);
+            dst += dst_stride;
+            src += src_stride;
+        }
+        stripe_h = 64 >> ss_ver;
+        src += (stripe_h - 4) * src_stride;
+    }
+}
+
+void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f,
+                               /*const*/ pixel *const src[3], const int sby)
+{
+    const ptrdiff_t offset = 8 * !!sby; // 8 rows for every sb row but the first
+    const ptrdiff_t *const src_stride = f->cur.p.stride;
+
+    // TODO Also check block level restore type to reduce copying.
+    const int restore_planes =
+        ((f->frame_hdr.restoration.type[0] != RESTORATION_NONE) << 0) +
+        ((f->frame_hdr.restoration.type[1] != RESTORATION_NONE) << 1) +
+        ((f->frame_hdr.restoration.type[2] != RESTORATION_NONE) << 2);
+
+    if (restore_planes & LR_RESTORE_Y) {
+        const int h = f->bh << 2;
+        const int w = f->bw << 2;
+        const int row_h = imin((sby + 1) << (6 + f->seq_hdr.sb128), h);
+        const int y_stripe = (sby << (6 + f->seq_hdr.sb128)) - offset;
+        backup_lpf(f->lf.lr_lpf_line_ptr[0], sizeof(pixel) * f->b4_stride * 4,
+                   src[0] - offset * PXSTRIDE(src_stride[0]), src_stride[0],
+                   0, f->seq_hdr.sb128, y_stripe, row_h, w);
+    }
+    if (restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) {
+        const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+        const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+        const int h = f->bh << (2 - ss_ver);
+        const int w = f->bw << (2 - ss_hor);
+        const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr.sb128), h);
+        const ptrdiff_t offset_uv = offset >> ss_ver; // offset scaled by ss_ver
+        const int y_stripe =
+            (sby << ((6 - ss_ver) + f->seq_hdr.sb128)) - offset_uv;
+
+        if (restore_planes & LR_RESTORE_U) {
+            backup_lpf(f->lf.lr_lpf_line_ptr[1], sizeof(pixel) * f->b4_stride * 4,
+                       src[1] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1],
+                       ss_ver, f->seq_hdr.sb128, y_stripe, row_h, w);
+        }
+        if (restore_planes & LR_RESTORE_V) { // plane 2 is read with stride[1] too
+            backup_lpf(f->lf.lr_lpf_line_ptr[2], sizeof(pixel) * f->b4_stride * 4,
+                       src[2] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1],
+                       ss_ver, f->seq_hdr.sb128, y_stripe, row_h, w);
+        }
+    }
+}
+
+
+static void lr_stripe(const Dav1dFrameContext *const f, pixel *p,
+                      const pixel (*left)[4], int x, int y,
+                      const int plane, const int unit_w, const int row_h,
+                      const Av1RestorationUnit *const lr, enum LrEdgeFlags edges)
+{
+    const Dav1dDSPContext *const dsp = f->dsp;
+    const int chroma = !!plane;
+    const int ss_ver = chroma & (f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420);
+    const int sbrow_has_bottom = (edges & LR_HAVE_BOTTOM);
+    const pixel *lpf = f->lf.lr_lpf_line_ptr[plane] + x;
+    const ptrdiff_t p_stride = f->cur.p.stride[chroma];
+    const ptrdiff_t lpf_stride = sizeof(pixel) * f->b4_stride * 4;
+
+    // The first stripe of the frame is shorter by 8 luma pixel rows.
+    int stripe_h = imin((64 - 8 * !y) >> ss_ver, row_h - y);
+
+    // FIXME [8] might be easier for SIMD
+    int16_t filterh[7], filterv[7];
+    if (lr->type == RESTORATION_WIENER) { // 3 stored taps -> symmetric 7 taps
+        filterh[0] = filterh[6] = lr->filter_h[0];
+        filterh[1] = filterh[5] = lr->filter_h[1];
+        filterh[2] = filterh[4] = lr->filter_h[2];
+        filterh[3] = -((filterh[0] + filterh[1] + filterh[2]) * 2);
+
+        filterv[0] = filterv[6] = lr->filter_v[0];
+        filterv[1] = filterv[5] = lr->filter_v[1];
+        filterv[2] = filterv[4] = lr->filter_v[2];
+        filterv[3] = -((filterv[0] + filterv[1] + filterv[2]) * 2);
+    }
+
+    while (y + stripe_h <= row_h) {
+        // TODO Look into getting rid of this if
+        if (y + stripe_h == row_h) {
+            edges &= ~LR_HAVE_BOTTOM;
+        } else {
+            edges |= LR_HAVE_BOTTOM;
+        }
+        if (lr->type == RESTORATION_WIENER) {
+            dsp->lr.wiener(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
+                           filterh, filterv, edges);
+        } else {
+            assert(lr->type == RESTORATION_SGRPROJ);
+            dsp->lr.selfguided(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
+                               lr->sgr_idx, lr->sgr_weights, edges);
+        }
+
+        left += stripe_h;
+        y += stripe_h;
+        if (y + stripe_h > row_h && sbrow_has_bottom) break;
+        p += stripe_h * PXSTRIDE(p_stride);
+        edges |= LR_HAVE_TOP; // every stripe after the first has rows above it
+        stripe_h = imin(64 >> ss_ver, row_h - y);
+        if (stripe_h == 0) break;
+        lpf += 4 * PXSTRIDE(lpf_stride); // advance 4 lpf rows per stripe
+    }
+}
+
+static void backup4xU(pixel (*dst)[4], const pixel *src, const ptrdiff_t src_stride,
+                      int u)
+{   // For each of u rows, pixel_copy() 4 pixels from src into the next dst entry; u <= 0 does nothing.
+    for (; u > 0; u--, dst++, src += PXSTRIDE(src_stride))
+        pixel_copy(dst, src, 4);
+}
+
+static void lr_sbrow(const Dav1dFrameContext *const f, pixel *p, const int y,
+                     const int w, const int h, const int row_h, const int plane)
+{   // Walks one plane left to right in restoration units, filtering every unit whose type is not RESTORATION_NONE.
+    const int chroma = !!plane;
+    const int ss_ver = chroma & (f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420);
+    const int ss_hor = chroma & (f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444);
+    const ptrdiff_t p_stride = f->cur.p.stride[chroma];
+
+    const int unit_size_log2 = f->frame_hdr.restoration.unit_size[!!plane];
+    const int unit_size = 1 << unit_size_log2;
+    const int half_unit_size = unit_size >> 1;
+    const int max_unit_size = unit_size + half_unit_size; // 1.5 units: with less than this left in w, the current unit takes it all
+
+    // Y coordinate of the sbrow (y is 8 luma pixel rows above row_y)
+    const int row_y = y + ((8 >> ss_ver) * !!y);
+
+    // FIXME This is an ugly hack to look up the proper AV1Filter unit for
+    // chroma planes. Question: for multithreaded decoding, is it better
+    // to store the chroma LR information with collocated luma information?
+    // In other words, for a chroma restoration unit located at (128,128) and
+    // with a 4:2:0 chroma subsampling, do we store the filter information at
+    // the AV1Filter unit located at (128,128) or (256,256)?
+    // TODO Support chroma subsampling.
+    const int shift_ver = 7 - ss_ver;
+    const int shift_hor = 7 - ss_hor;
+
+    int ruy = (row_y >> unit_size_log2);
+    // Merge last restoration unit if its height is < half_unit_size
+    if (ruy > 0) ruy -= (ruy << unit_size_log2) + half_unit_size > h;
+
+    // The first stripe of the frame is shorter by 8 luma pixel rows.
+    const int filter_h =
+        imin(((1 << (6 + f->seq_hdr.sb128)) - 8 * !y) >> ss_ver, h - y);
+
+    pixel pre_lr_border[2][128 + 8 /* maximum sbrow height is 128 + 8 rows offset */][4];
+
+    int unit_w = unit_size, bit = 0; // bit: index of the pre_lr_border half written in the current iteration
+
+    enum LrEdgeFlags edges = (y > 0 ? LR_HAVE_TOP : 0) |
+                             (row_h < h ? LR_HAVE_BOTTOM : 0);
+
+    for (int x = 0, rux = 0; x < w; x+= unit_w, rux++, edges |= LR_HAVE_LEFT, bit ^= 1) {
+        // TODO Clean up this if statement.
+        if (x + max_unit_size > w) {
+            unit_w = w - x;
+            edges &= ~LR_HAVE_RIGHT;
+        } else {
+            edges |= LR_HAVE_RIGHT;
+        }
+
+        // Based on the position of the restoration unit, find the corresponding
+        // AV1Filter unit.
+        const int unit_idx = ((ruy & 16) >> 3) + ((rux & 16) >> 4);
+        const Av1RestorationUnit *const lr =
+            &f->lf.mask[(((ruy << (unit_size_log2)) >> shift_ver) * f->sb128w) +
+                        (x >> shift_hor)].lr[plane][unit_idx];
+
+        // FIXME Don't backup if the next restoration unit is RESTORATION_NONE
+        // This also requires not restoring in the same conditions.
+        if (edges & LR_HAVE_RIGHT) {
+            backup4xU(pre_lr_border[bit], p + unit_w - 4, p_stride, filter_h); // save this unit's last 4 columns before it is filtered
+        }
+        if (lr->type != RESTORATION_NONE) {
+            lr_stripe(f, p, pre_lr_border[!bit], x, y, plane, unit_w, row_h, lr, edges); // left = the half written one iteration earlier, if any
+        }
+        p += unit_w;
+    }
+}
+
+void bytefn(dav1d_lr_sbrow)(Dav1dFrameContext *const f, pixel *const dst[3],
+                            const int sby)
+{   // Runs lr_sbrow() for superblock row sby on each plane whose frame-header restoration type is not RESTORATION_NONE.
+    const ptrdiff_t offset_y = 8 * !!sby; // 8 rows for every sbrow except the first (sby == 0)
+    const ptrdiff_t *const dst_stride = f->cur.p.stride;
+
+    const int restore_planes = // bit n is set when restoration.type[n] != RESTORATION_NONE
+        ((f->frame_hdr.restoration.type[0] != RESTORATION_NONE) << 0) +
+        ((f->frame_hdr.restoration.type[1] != RESTORATION_NONE) << 1) +
+        ((f->frame_hdr.restoration.type[2] != RESTORATION_NONE) << 2);
+
+    if (restore_planes & LR_RESTORE_Y) {
+        const int h = f->cur.p.p.h;
+        const int w = f->cur.p.p.w;
+        const int row_h = imin((sby + 1) << (6 + f->seq_hdr.sb128), h); // end row of this sbrow, clamped to the plane height
+        const int y_stripe = (sby << (6 + f->seq_hdr.sb128)) - offset_y; // start row, moved up by offset_y
+        lr_sbrow(f, dst[0] - offset_y * PXSTRIDE(dst_stride[0]), y_stripe, w,
+                 h, row_h, 0);
+    }
+    if (restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) {
+        const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+        const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+        const int h = (f->cur.p.p.h + ss_ver) >> ss_ver; // rounds up when ss_ver is 1
+        const int w = (f->cur.p.p.w + ss_hor) >> ss_hor; // rounds up when ss_hor is 1
+        const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr.sb128), h);
+        const ptrdiff_t offset_uv = offset_y >> ss_ver;
+        const int y_stripe =
+            (sby << ((6 - ss_ver) + f->seq_hdr.sb128)) - offset_uv;
+        if (restore_planes & LR_RESTORE_U)
+            lr_sbrow(f, dst[1] - offset_uv * PXSTRIDE(dst_stride[1]), y_stripe,
+                     w, h, row_h, 1);
+
+        if (restore_planes & LR_RESTORE_V)
+            lr_sbrow(f, dst[2] - offset_uv * PXSTRIDE(dst_stride[1]), y_stripe, // V is addressed with dst_stride[1], same as U
+                     w, h, row_h, 2);
+    }
+}
--- a/src/mc.c
+++ /dev/null
@@ -1,542 +1,0 @@
-/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- *    list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "config.h"
-
-#include <stdlib.h>
-#include <string.h>
-
-#include "common/attributes.h"
-#include "common/intops.h"
-
-#include "src/mc.h"
-#include "src/tables.h"
-
-static NOINLINE void
-put_c(pixel *dst, const ptrdiff_t dst_stride,
-      const pixel *src, const ptrdiff_t src_stride, const int w, int h)
-{
-    do {
-        pixel_copy(dst, src, w);
-
-        dst += dst_stride;
-        src += src_stride;
-    } while (--h);
-}
-
-static NOINLINE void
-prep_c(coef *tmp, const pixel *src, const ptrdiff_t src_stride,
-       const int w, int h)
-{
-    do {
-        for (int x = 0; x < w; x++)
-            tmp[x] = src[x] << 4;
-
-        tmp += w;
-        src += src_stride;
-    } while (--h);
-}
-
-#define FILTER_8TAP(src, x, F, stride) \
-    (F[0] * src[x + -3 * stride] + \
-     F[1] * src[x + -2 * stride] + \
-     F[2] * src[x + -1 * stride] + \
-     F[3] * src[x + +0 * stride] + \
-     F[4] * src[x + +1 * stride] + \
-     F[5] * src[x + +2 * stride] + \
-     F[6] * src[x + +3 * stride] + \
-     F[7] * src[x + +4 * stride])
-
-#define FILTER_8TAP_RND(src, x, F, stride, sh) \
-    ((FILTER_8TAP(src, x, F, stride) + ((1 << sh) >> 1)) >> sh)
-
-#define FILTER_8TAP_CLIP(src, x, F, stride, sh) \
-    iclip_pixel(FILTER_8TAP_RND(src, x, F, stride, sh))
-
-#define GET_FILTERS() \
-    const int8_t *const fh = !mx ? NULL : w > 4 ? \
-        dav1d_mc_subpel_filters[filter_type & 3][mx - 1] : \
-        dav1d_mc_subpel_filters[3 + (filter_type & 1)][mx - 1]; \
-    const int8_t *const fv = !my ? NULL : h > 4 ? \
-        dav1d_mc_subpel_filters[filter_type >> 2][my - 1] : \
-        dav1d_mc_subpel_filters[3 + ((filter_type >> 2) & 1)][my - 1]; \
-
-static NOINLINE void
-put_8tap_c(pixel *dst, ptrdiff_t dst_stride,
-           const pixel *src, ptrdiff_t src_stride,
-           const int w, int h, const int mx, const int my,
-           const int filter_type)
-{
-    GET_FILTERS();
-    dst_stride = PXSTRIDE(dst_stride);
-    src_stride = PXSTRIDE(src_stride);
-
-    if (fh) {
-        if (fv) {
-            int tmp_h = h + 7;
-            coef mid[128 * 135], *mid_ptr = mid;
-
-            src -= src_stride * 3;
-            do {
-                for (int x = 0; x < w; x++)
-                    mid_ptr[x] = FILTER_8TAP_RND(src, x, fh, 1, 2);
-
-                mid_ptr += 128;
-                src += src_stride;
-            } while (--tmp_h);
-
-            mid_ptr = mid + 128 * 3;
-            do {
-                for (int x = 0; x < w; x++)
-                    dst[x] = FILTER_8TAP_CLIP(mid_ptr, x, fv, 128, 10);
-
-                mid_ptr += 128;
-                dst += dst_stride;
-            } while (--h);
-        } else {
-            do {
-                for (int x = 0; x < w; x++) {
-                    const int px = FILTER_8TAP_RND(src, x, fh, 1, 2);
-                    dst[x] = iclip_pixel((px + 8) >> 4);
-                }
-
-                dst += dst_stride;
-                src += src_stride;
-            } while (--h);
-        }
-    } else if (fv) {
-        do {
-            for (int x = 0; x < w; x++)
-                dst[x] = FILTER_8TAP_CLIP(src, x, fv, src_stride, 6);
-
-            dst += dst_stride;
-            src += src_stride;
-        } while (--h);
-    } else
-        put_c(dst, dst_stride, src, src_stride, w, h);
-}
-
-static NOINLINE void
-prep_8tap_c(coef *tmp, const pixel *src, ptrdiff_t src_stride,
-            const int w, int h, const int mx, const int my,
-            const int filter_type)
-{
-    GET_FILTERS();
-    src_stride = PXSTRIDE(src_stride);
-
-    if (fh) {
-        if (fv) {
-            int tmp_h = h + 7;
-            coef mid[128 * 135], *mid_ptr = mid;
-
-            src -= src_stride * 3;
-            do {
-                for (int x = 0; x < w; x++)
-                    mid_ptr[x] = FILTER_8TAP_RND(src, x, fh, 1, 2);
-
-                mid_ptr += 128;
-                src += src_stride;
-            } while (--tmp_h);
-
-            mid_ptr = mid + 128 * 3;
-            do {
-                for (int x = 0; x < w; x++)
-                    tmp[x] = FILTER_8TAP_RND(mid_ptr, x, fv, 128, 6);
-
-                mid_ptr += 128;
-                tmp += w;
-            } while (--h);
-        } else {
-            do {
-                for (int x = 0; x < w; x++)
-                    tmp[x] = FILTER_8TAP_RND(src, x, fh, 1, 2);
-
-                tmp += w;
-                src += src_stride;
-            } while (--h);
-        }
-    } else if (fv) {
-        do {
-            for (int x = 0; x < w; x++)
-                tmp[x] = FILTER_8TAP_RND(src, x, fv, src_stride, 2);
-
-            tmp += w;
-            src += src_stride;
-        } while (--h);
-    } else
-        prep_c(tmp, src, src_stride, w, h);
-}
-
-#define filter_fns(type, type_h, type_v) \
-static void put_8tap_##type##_c(pixel *const dst, \
-                                const ptrdiff_t dst_stride, \
-                                const pixel *const src, \
-                                const ptrdiff_t src_stride, \
-                                const int w, const int h, \
-                                const int mx, const int my) \
-{ \
-    put_8tap_c(dst, dst_stride, src, src_stride, w, h, mx, my, \
-               type_h | (type_v << 2)); \
-} \
-static void prep_8tap_##type##_c(coef *const tmp, \
-                                 const pixel *const src, \
-                                 const ptrdiff_t src_stride, \
-                                 const int w, const int h, \
-                                 const int mx, const int my) \
-{ \
-    prep_8tap_c(tmp, src, src_stride, w, h, mx, my, \
-                type_h | (type_v << 2)); \
-}
-
-filter_fns(regular,        FILTER_8TAP_REGULAR, FILTER_8TAP_REGULAR)
-filter_fns(regular_sharp,  FILTER_8TAP_REGULAR, FILTER_8TAP_SHARP)
-filter_fns(regular_smooth, FILTER_8TAP_REGULAR, FILTER_8TAP_SMOOTH)
-filter_fns(smooth,         FILTER_8TAP_SMOOTH,  FILTER_8TAP_SMOOTH)
-filter_fns(smooth_regular, FILTER_8TAP_SMOOTH,  FILTER_8TAP_REGULAR)
-filter_fns(smooth_sharp,   FILTER_8TAP_SMOOTH,  FILTER_8TAP_SHARP)
-filter_fns(sharp,          FILTER_8TAP_SHARP,   FILTER_8TAP_SHARP)
-filter_fns(sharp_regular,  FILTER_8TAP_SHARP,   FILTER_8TAP_REGULAR)
-filter_fns(sharp_smooth,   FILTER_8TAP_SHARP,   FILTER_8TAP_SMOOTH)
-
-#define FILTER_BILIN(src, x, mxy, stride) \
-    (16 * src[x] + (mxy * (src[x + stride] - src[x])))
-
-#define FILTER_BILIN_RND(src, x, mxy, stride, sh) \
-    ((FILTER_BILIN(src, x, mxy, stride) + ((1 << sh) >> 1)) >> sh)
-
-#define FILTER_BILIN_CLIP(src, x, mxy, stride, sh) \
-    iclip_pixel(FILTER_BILIN_RND(src, x, mxy, stride, sh))
-
-static void put_bilin_c(pixel *dst, ptrdiff_t dst_stride,
-                        const pixel *src, ptrdiff_t src_stride,
-                        const int w, int h, const int mx, const int my)
-{
-    dst_stride = PXSTRIDE(dst_stride);
-    src_stride = PXSTRIDE(src_stride);
-
-    if (mx) {
-        if (my) {
-            coef mid[128 * 129], *mid_ptr = mid;
-            int tmp_h = h + 1;
-
-            do {
-                for (int x = 0; x < w; x++)
-                    mid_ptr[x] = FILTER_BILIN(src, x, mx, 1);
-
-                mid_ptr += 128;
-                src += src_stride;
-            } while (--tmp_h);
-
-            mid_ptr = mid;
-            do {
-                for (int x = 0; x < w; x++)
-                    dst[x] = FILTER_BILIN_CLIP(mid_ptr, x, my, 128, 8);
-
-                mid_ptr += 128;
-                dst += dst_stride;
-            } while (--h);
-        } else {
-            do {
-                for (int x = 0; x < w; x++)
-                    dst[x] = FILTER_BILIN_CLIP(src, x, mx, 1, 4);
-
-                dst += dst_stride;
-                src += src_stride;
-            } while (--h);
-        }
-    } else if (my) {
-        do {
-            for (int x = 0; x < w; x++)
-                dst[x] = FILTER_BILIN_CLIP(src, x, my, src_stride, 4);
-
-            dst += dst_stride;
-            src += src_stride;
-        } while (--h);
-    } else
-        put_c(dst, dst_stride, src, src_stride, w, h);
-}
-
-static void prep_bilin_c(coef *tmp,
-                         const pixel *src, ptrdiff_t src_stride,
-                         const int w, int h, const int mx, const int my)
-{
-    src_stride = PXSTRIDE(src_stride);
-
-    if (mx) {
-        if (my) {
-            coef mid[128 * 129], *mid_ptr = mid;
-            int tmp_h = h + 1;
-
-            do {
-                for (int x = 0; x < w; x++)
-                    mid_ptr[x] = FILTER_BILIN(src, x, mx, 1);
-
-                mid_ptr += 128;
-                src += src_stride;
-            } while (--tmp_h);
-
-            mid_ptr = mid;
-            do {
-                for (int x = 0; x < w; x++)
-                    tmp[x] = FILTER_BILIN_RND(mid_ptr, x, my, 128, 4);
-
-                mid_ptr += 128;
-                tmp += w;
-            } while (--h);
-        } else {
-            do {
-                for (int x = 0; x < w; x++)
-                    tmp[x] = FILTER_BILIN(src, x, mx, 1);
-
-                tmp += w;
-                src += src_stride;
-            } while (--h);
-        }
-    } else if (my) {
-        do {
-            for (int x = 0; x < w; x++)
-                tmp[x] = FILTER_BILIN(src, x, my, src_stride);
-
-            tmp += w;
-            src += src_stride;
-        } while (--h);
-    } else
-        prep_c(tmp, src, src_stride, w, h);
-}
-
-static void avg_c(pixel *dst, const ptrdiff_t dst_stride,
-                  const coef *tmp1, const coef *tmp2, const int w, int h)
-{
-    do {
-        for (int x = 0; x < w; x++)
-            dst[x] = iclip_pixel((tmp1[x] + tmp2[x] + 16) >> 5);
-
-        tmp1 += w;
-        tmp2 += w;
-        dst += PXSTRIDE(dst_stride);
-    } while (--h);
-}
-
-static void w_avg_c(pixel *dst, const ptrdiff_t dst_stride,
-                    const coef *tmp1, const coef *tmp2, const int w, int h,
-                    const int weight)
-{
-    do {
-        for (int x = 0; x < w; x++)
-            dst[x] = iclip_pixel((tmp1[x] * weight +
-                                  tmp2[x] * (16 - weight) + 128) >> 8);
-
-        tmp1 += w;
-        tmp2 += w;
-        dst += PXSTRIDE(dst_stride);
-    } while (--h);
-}
-
-static void mask_c(pixel *dst, const ptrdiff_t dst_stride,
-                   const coef *tmp1, const coef *tmp2, const int w, int h,
-                   const uint8_t *mask)
-{
-    do {
-        for (int x = 0; x < w; x++)
-            dst[x] = iclip_pixel((tmp1[x] * mask[x] +
-                                  tmp2[x] * (64 - mask[x]) + 512) >> 10);
-
-        tmp1 += w;
-        tmp2 += w;
-        mask += w;
-        dst += PXSTRIDE(dst_stride);
-    } while (--h);
-}
-
-static void blend_c(pixel *dst, const ptrdiff_t dst_stride,
-                    const pixel *tmp, const ptrdiff_t tmp_stride,
-                    const int w, const int h,
-                    const uint8_t *mask, const ptrdiff_t m_stride)
-{
-    for (int y = 0; y < h; y++) {
-        for (int x = 0; x < w; x++) {
-#define blend_px(a, b, m) (((a * (64 - m) + b * m) + 32) >> 6)
-            dst[x] = blend_px(dst[x], tmp[x], mask[m_stride == 1 ? 0 : x]);
-        }
-        dst += PXSTRIDE(dst_stride);
-        tmp += PXSTRIDE(tmp_stride);
-        mask += m_stride;
-    }
-}
-
-static void w_mask_c(pixel *dst, const ptrdiff_t dst_stride,
-                     const coef *tmp1, const coef *tmp2, const int w, int h,
-                     uint8_t *mask, const int sign,
-                     const int ss_hor, const int ss_ver)
-{
-    // store mask at 2x2 resolution, i.e. store 2x1 sum for even rows,
-    // and then load this intermediate to calculate final value for odd rows
-    const int rnd = 8 << (BITDEPTH - 8);
-    do {
-        for (int x = 0; x < w; x++) {
-            const int m = imin(38 + ((abs(tmp1[x] - tmp2[x]) + rnd) >> BITDEPTH), 64);
-            dst[x] = iclip_pixel((tmp1[x] * m +
-                                  tmp2[x] * (64 - m) + 512) >> 10);
-
-            if (ss_hor) {
-                x++;
-
-                const int n = imin(38 + ((abs(tmp1[x] - tmp2[x]) + rnd) >> BITDEPTH), 64);
-                dst[x] = iclip_pixel((tmp1[x] * n +
-                                      tmp2[x] * (64 - n) + 512) >> 10);
-
-                if (h & ss_ver) {
-                    mask[x >> 1] = (m + n + mask[x >> 1] + 2 - sign) >> 2;
-                } else if (ss_ver) {
-                    mask[x >> 1] = m + n;
-                } else {
-                    mask[x >> 1] = (m + n + 1 - sign) >> 1;
-                }
-            } else {
-                mask[x] = m;
-            }
-        }
-
-        tmp1 += w;
-        tmp2 += w;
-        dst += PXSTRIDE(dst_stride);
-        if (!ss_ver || (h & 1)) mask += w >> ss_hor;
-    } while (--h);
-}
-
-#define w_mask_fns(ssn, ss_hor, ss_ver) \
-static void w_mask_##ssn##_c(pixel *const dst, const ptrdiff_t dst_stride, \
-                             const coef *const tmp1, const coef *const tmp2, \
-                             const int w, const int h, uint8_t *mask, \
-                             const int sign) \
-{ \
-    w_mask_c(dst, dst_stride, tmp1, tmp2, w, h, mask, sign, ss_hor, ss_ver); \
-}
-
-w_mask_fns(444, 0, 0);
-w_mask_fns(422, 1, 0);
-w_mask_fns(420, 1, 1);
-
-#undef w_mask_fns
-
-static void warp_affine_8x8_c(pixel *dst, const ptrdiff_t dst_stride,
-                              const pixel *src, const ptrdiff_t src_stride,
-                              const int16_t *const abcd, int mx, int my)
-{
-    coef mid[15 * 8], *mid_ptr = mid;
-
-    src -= 3 * PXSTRIDE(src_stride);
-    for (int y = 0; y < 15; y++, mx += abcd[1]) {
-        for (int x = 0, tmx = mx; x < 8; x++, tmx += abcd[0]) {
-            const int8_t *const filter =
-                dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)];
-
-            mid_ptr[x] = FILTER_8TAP_RND(src, x, filter, 1, 3);
-        }
-        src += PXSTRIDE(src_stride);
-        mid_ptr += 8;
-    }
-
-    mid_ptr = &mid[3 * 8];
-    for (int y = 0; y < 8; y++, my += abcd[3]) {
-        for (int x = 0, tmy = my; x < 8; x++, tmy += abcd[2]) {
-            const int8_t *const filter =
-                dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)];
-
-            dst[x] = FILTER_8TAP_CLIP(mid_ptr, x, filter, 8, 11);
-        }
-        mid_ptr += 8;
-        dst += PXSTRIDE(dst_stride);
-    }
-}
-
-static void warp_affine_8x8t_c(coef *tmp, const ptrdiff_t tmp_stride,
-                               const pixel *src, const ptrdiff_t src_stride,
-                               const int16_t *const abcd, int mx, int my)
-{
-    coef mid[15 * 8], *mid_ptr = mid;
-
-    src -= 3 * PXSTRIDE(src_stride);
-    for (int y = 0; y < 15; y++, mx += abcd[1]) {
-        for (int x = 0, tmx = mx; x < 8; x++, tmx += abcd[0]) {
-            const int8_t *const filter =
-                dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)];
-
-            mid_ptr[x] = FILTER_8TAP_RND(src, x, filter, 1, 3);
-        }
-        src += PXSTRIDE(src_stride);
-        mid_ptr += 8;
-    }
-
-    mid_ptr = &mid[3 * 8];
-    for (int y = 0; y < 8; y++, my += abcd[3]) {
-        for (int x = 0, tmy = my; x < 8; x++, tmy += abcd[2]) {
-            const int8_t *const filter =
-                dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)];
-
-            tmp[x] = FILTER_8TAP_RND(mid_ptr, x, filter, 8, 7);
-        }
-        mid_ptr += 8;
-        tmp += tmp_stride;
-    }
-}
-
-void bitfn(dav1d_mc_dsp_init)(Dav1dMCDSPContext *const c) {
-#define init_mc_fns(type, name) do { \
-    c->mc [type] = put_##name##_c; \
-    c->mct[type] = prep_##name##_c; \
-} while (0)
-
-    init_mc_fns(FILTER_2D_8TAP_REGULAR,        8tap_regular);
-    init_mc_fns(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth);
-    init_mc_fns(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp);
-    init_mc_fns(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular);
-    init_mc_fns(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth);
-    init_mc_fns(FILTER_2D_8TAP_SHARP,          8tap_sharp);
-    init_mc_fns(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular);
-    init_mc_fns(FILTER_2D_8TAP_SMOOTH,         8tap_smooth);
-    init_mc_fns(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp);
-    init_mc_fns(FILTER_2D_BILINEAR,            bilin);
-
-    c->avg      = avg_c;
-    c->w_avg    = w_avg_c;
-    c->mask     = mask_c;
-    c->blend    = blend_c;
-    c->w_mask[0] = w_mask_444_c;
-    c->w_mask[1] = w_mask_422_c;
-    c->w_mask[2] = w_mask_420_c;
-    c->warp8x8  = warp_affine_8x8_c;
-    c->warp8x8t = warp_affine_8x8t_c;
-
-#if HAVE_ASM
-#if ARCH_AARCH64 || ARCH_ARM
-    bitfn(dav1d_mc_dsp_init_arm)(c);
-#elif ARCH_X86
-    bitfn(dav1d_mc_dsp_init_x86)(c);
-#endif
-#endif
-}
--- /dev/null
+++ b/src/mc_tmpl.c
@@ -1,0 +1,542 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "common/attributes.h"
+#include "common/intops.h"
+
+#include "src/mc.h"
+#include "src/tables.h"
+
+static NOINLINE void
+put_c(pixel *dst, const ptrdiff_t dst_stride,
+      const pixel *src, const ptrdiff_t src_stride, const int w, int h)
+{   // pixel_copy()s w pixels per row for h rows from src to dst.
+    do {
+        pixel_copy(dst, src, w);
+
+        dst += dst_stride; // strides are added to the pixel pointers as-is; no PXSTRIDE conversion in this function
+        src += src_stride;
+    } while (--h); // the body runs before the test, so h must be >= 1 on entry
+}
+
+static NOINLINE void
+prep_c(coef *tmp, const pixel *src, const ptrdiff_t src_stride,
+       const int w, int h)
+{   // Stores every pixel of an h-row, w-column block into tmp as src[x] << 4, output rows packed w apart.
+    do {
+        for (int x = 0; x < w; x++)
+            tmp[x] = src[x] << 4;
+
+        tmp += w;
+        src += src_stride; // added to the pixel pointer as-is; no PXSTRIDE conversion in this function
+    } while (--h); // the body runs before the test, so h must be >= 1 on entry
+}
+
+#define FILTER_8TAP(src, x, F, stride) /* sum of F[k] * src[x + (k - 3) * stride] for k = 0..7 */ \
+    (F[0] * src[x + -3 * stride] + \
+     F[1] * src[x + -2 * stride] + \
+     F[2] * src[x + -1 * stride] + \
+     F[3] * src[x + +0 * stride] + \
+     F[4] * src[x + +1 * stride] + \
+     F[5] * src[x + +2 * stride] + \
+     F[6] * src[x + +3 * stride] + \
+     F[7] * src[x + +4 * stride])
+
+#define FILTER_8TAP_RND(src, x, F, stride, sh) /* FILTER_8TAP plus (1 << sh) >> 1, then shifted right by sh */ \
+    ((FILTER_8TAP(src, x, F, stride) + ((1 << sh) >> 1)) >> sh)
+
+#define FILTER_8TAP_CLIP(src, x, F, stride, sh) /* FILTER_8TAP_RND result passed through iclip_pixel() */ \
+    iclip_pixel(FILTER_8TAP_RND(src, x, F, stride, sh))
+
+#define GET_FILTERS() /* declares fh and fv in the expanding scope; reads mx, my, w, h and filter_type from it */ \
+    const int8_t *const fh = !mx ? NULL : w > 4 ? \
+        dav1d_mc_subpel_filters[filter_type & 3][mx - 1] : \
+        dav1d_mc_subpel_filters[3 + (filter_type & 1)][mx - 1]; \
+    const int8_t *const fv = !my ? NULL : h > 4 ? \
+        dav1d_mc_subpel_filters[filter_type >> 2][my - 1] : \
+        dav1d_mc_subpel_filters[3 + ((filter_type >> 2) & 1)][my - 1]; \
+
+static NOINLINE void
+put_8tap_c(pixel *dst, ptrdiff_t dst_stride,
+           const pixel *src, ptrdiff_t src_stride,
+           const int w, int h, const int mx, const int my,
+           const int filter_type)
+{   // 8-tap put: filters src horizontally (mx != 0), vertically (my != 0), both, or copies it, writing pixels to dst.
+    GET_FILTERS(); // declares fh / fv; each is NULL when mx / my is 0
+    dst_stride = PXSTRIDE(dst_stride);
+    src_stride = PXSTRIDE(src_stride);
+
+    if (fh) {
+        if (fv) {
+            int tmp_h = h + 7; // h output rows plus the 3 rows above and 4 below that the vertical taps read
+            coef mid[128 * 135], *mid_ptr = mid; // horizontal-pass results, rows 128 entries apart
+
+            src -= src_stride * 3; // start 3 rows above the block
+            do {
+                for (int x = 0; x < w; x++)
+                    mid_ptr[x] = FILTER_8TAP_RND(src, x, fh, 1, 2);
+
+                mid_ptr += 128;
+                src += src_stride;
+            } while (--tmp_h);
+
+            mid_ptr = mid + 128 * 3; // mid row that lines up with the first output row
+            do {
+                for (int x = 0; x < w; x++)
+                    dst[x] = FILTER_8TAP_CLIP(mid_ptr, x, fv, 128, 10);
+
+                mid_ptr += 128;
+                dst += dst_stride;
+            } while (--h);
+        } else {
+            do {
+                for (int x = 0; x < w; x++) {
+                    const int px = FILTER_8TAP_RND(src, x, fh, 1, 2);
+                    dst[x] = iclip_pixel((px + 8) >> 4); // second rounding shift, applied after the shift by 2 above
+                }
+
+                dst += dst_stride;
+                src += src_stride;
+            } while (--h);
+        }
+    } else if (fv) {
+        do {
+            for (int x = 0; x < w; x++)
+                dst[x] = FILTER_8TAP_CLIP(src, x, fv, src_stride, 6);
+
+            dst += dst_stride;
+            src += src_stride;
+        } while (--h);
+    } else
+        put_c(dst, dst_stride, src, src_stride, w, h); // mx == 0 and my == 0; strides were already run through PXSTRIDE above
+}
+
+static NOINLINE void
+prep_8tap_c(coef *tmp, const pixel *src, ptrdiff_t src_stride,
+            const int w, int h, const int mx, const int my,
+            const int filter_type)
+{   // 8-tap prep: same filter selection as put_8tap_c, but writes coef values to tmp with rows packed w apart.
+    GET_FILTERS(); // declares fh / fv; each is NULL when mx / my is 0
+    src_stride = PXSTRIDE(src_stride);
+
+    if (fh) {
+        if (fv) {
+            int tmp_h = h + 7; // h output rows plus the 3 rows above and 4 below that the vertical taps read
+            coef mid[128 * 135], *mid_ptr = mid; // horizontal-pass results, rows 128 entries apart
+
+            src -= src_stride * 3; // start 3 rows above the block
+            do {
+                for (int x = 0; x < w; x++)
+                    mid_ptr[x] = FILTER_8TAP_RND(src, x, fh, 1, 2);
+
+                mid_ptr += 128;
+                src += src_stride;
+            } while (--tmp_h);
+
+            mid_ptr = mid + 128 * 3; // mid row that lines up with the first output row
+            do {
+                for (int x = 0; x < w; x++)
+                    tmp[x] = FILTER_8TAP_RND(mid_ptr, x, fv, 128, 6);
+
+                mid_ptr += 128;
+                tmp += w;
+            } while (--h);
+        } else {
+            do {
+                for (int x = 0; x < w; x++)
+                    tmp[x] = FILTER_8TAP_RND(src, x, fh, 1, 2);
+
+                tmp += w;
+                src += src_stride;
+            } while (--h);
+        }
+    } else if (fv) {
+        do {
+            for (int x = 0; x < w; x++)
+                tmp[x] = FILTER_8TAP_RND(src, x, fv, src_stride, 2);
+
+            tmp += w;
+            src += src_stride;
+        } while (--h);
+    } else
+        prep_c(tmp, src, src_stride, w, h); // mx == 0 and my == 0; src_stride was already run through PXSTRIDE above
+}
+
+#define filter_fns(type, type_h, type_v) /* emits put_8tap_<type>_c and prep_8tap_<type>_c, forwarding type_h | (type_v << 2) as filter_type */ \
+static void put_8tap_##type##_c(pixel *const dst, \
+                                const ptrdiff_t dst_stride, \
+                                const pixel *const src, \
+                                const ptrdiff_t src_stride, \
+                                const int w, const int h, \
+                                const int mx, const int my) \
+{ \
+    put_8tap_c(dst, dst_stride, src, src_stride, w, h, mx, my, \
+               type_h | (type_v << 2)); \
+} \
+static void prep_8tap_##type##_c(coef *const tmp, \
+                                 const pixel *const src, \
+                                 const ptrdiff_t src_stride, \
+                                 const int w, const int h, \
+                                 const int mx, const int my) \
+{ \
+    prep_8tap_c(tmp, src, src_stride, w, h, mx, my, \
+                type_h | (type_v << 2)); \
+}
+
+filter_fns(regular,        FILTER_8TAP_REGULAR, FILTER_8TAP_REGULAR) // arguments: name suffix, horizontal type, vertical type
+filter_fns(regular_sharp,  FILTER_8TAP_REGULAR, FILTER_8TAP_SHARP)
+filter_fns(regular_smooth, FILTER_8TAP_REGULAR, FILTER_8TAP_SMOOTH)
+filter_fns(smooth,         FILTER_8TAP_SMOOTH,  FILTER_8TAP_SMOOTH)
+filter_fns(smooth_regular, FILTER_8TAP_SMOOTH,  FILTER_8TAP_REGULAR)
+filter_fns(smooth_sharp,   FILTER_8TAP_SMOOTH,  FILTER_8TAP_SHARP)
+filter_fns(sharp,          FILTER_8TAP_SHARP,   FILTER_8TAP_SHARP)
+filter_fns(sharp_regular,  FILTER_8TAP_SHARP,   FILTER_8TAP_REGULAR)
+filter_fns(sharp_smooth,   FILTER_8TAP_SHARP,   FILTER_8TAP_SMOOTH)
+
+#define FILTER_BILIN(src, x, mxy, stride) \
+    (16 * src[x] + (mxy * (src[x + stride] - src[x])))
+
+#define FILTER_BILIN_RND(src, x, mxy, stride, sh) \
+    ((FILTER_BILIN(src, x, mxy, stride) + ((1 << sh) >> 1)) >> sh)
+
+#define FILTER_BILIN_CLIP(src, x, mxy, stride, sh) \
+    iclip_pixel(FILTER_BILIN_RND(src, x, mxy, stride, sh))
+
+static void put_bilin_c(pixel *dst, ptrdiff_t dst_stride,
+                        const pixel *src, ptrdiff_t src_stride,
+                        const int w, int h, const int mx, const int my)
+{
+    dst_stride = PXSTRIDE(dst_stride);
+    src_stride = PXSTRIDE(src_stride);
+
+    if (mx) {
+        if (my) {
+            coef mid[128 * 129], *mid_ptr = mid;
+            int tmp_h = h + 1;
+
+            do {
+                for (int x = 0; x < w; x++)
+                    mid_ptr[x] = FILTER_BILIN(src, x, mx, 1);
+
+                mid_ptr += 128;
+                src += src_stride;
+            } while (--tmp_h);
+
+            mid_ptr = mid;
+            do {
+                for (int x = 0; x < w; x++)
+                    dst[x] = FILTER_BILIN_CLIP(mid_ptr, x, my, 128, 8);
+
+                mid_ptr += 128;
+                dst += dst_stride;
+            } while (--h);
+        } else {
+            do {
+                for (int x = 0; x < w; x++)
+                    dst[x] = FILTER_BILIN_CLIP(src, x, mx, 1, 4);
+
+                dst += dst_stride;
+                src += src_stride;
+            } while (--h);
+        }
+    } else if (my) {
+        do {
+            for (int x = 0; x < w; x++)
+                dst[x] = FILTER_BILIN_CLIP(src, x, my, src_stride, 4);
+
+            dst += dst_stride;
+            src += src_stride;
+        } while (--h);
+    } else
+        put_c(dst, dst_stride, src, src_stride, w, h);
+}
+
+static void prep_bilin_c(coef *tmp,
+                         const pixel *src, ptrdiff_t src_stride,
+                         const int w, int h, const int mx, const int my)
+{
+    src_stride = PXSTRIDE(src_stride);
+
+    if (mx) {
+        if (my) {
+            coef mid[128 * 129], *mid_ptr = mid;
+            int tmp_h = h + 1;
+
+            do {
+                for (int x = 0; x < w; x++)
+                    mid_ptr[x] = FILTER_BILIN(src, x, mx, 1);
+
+                mid_ptr += 128;
+                src += src_stride;
+            } while (--tmp_h);
+
+            mid_ptr = mid;
+            do {
+                for (int x = 0; x < w; x++)
+                    tmp[x] = FILTER_BILIN_RND(mid_ptr, x, my, 128, 4);
+
+                mid_ptr += 128;
+                tmp += w;
+            } while (--h);
+        } else {
+            do {
+                for (int x = 0; x < w; x++)
+                    tmp[x] = FILTER_BILIN(src, x, mx, 1);
+
+                tmp += w;
+                src += src_stride;
+            } while (--h);
+        }
+    } else if (my) {
+        do {
+            for (int x = 0; x < w; x++)
+                tmp[x] = FILTER_BILIN(src, x, my, src_stride);
+
+            tmp += w;
+            src += src_stride;
+        } while (--h);
+    } else
+        prep_c(tmp, src, src_stride, w, h);
+}
+
+static void avg_c(pixel *dst, const ptrdiff_t dst_stride,
+                  const coef *tmp1, const coef *tmp2, const int w, int h)
+{
+    do {
+        for (int x = 0; x < w; x++)
+            dst[x] = iclip_pixel((tmp1[x] + tmp2[x] + 16) >> 5);
+
+        tmp1 += w;
+        tmp2 += w;
+        dst += PXSTRIDE(dst_stride);
+    } while (--h);
+}
+
+static void w_avg_c(pixel *dst, const ptrdiff_t dst_stride,
+                    const coef *tmp1, const coef *tmp2, const int w, int h,
+                    const int weight)
+{
+    do {
+        for (int x = 0; x < w; x++)
+            dst[x] = iclip_pixel((tmp1[x] * weight +
+                                  tmp2[x] * (16 - weight) + 128) >> 8);
+
+        tmp1 += w;
+        tmp2 += w;
+        dst += PXSTRIDE(dst_stride);
+    } while (--h);
+}
+
+static void mask_c(pixel *dst, const ptrdiff_t dst_stride,
+                   const coef *tmp1, const coef *tmp2, const int w, int h,
+                   const uint8_t *mask)
+{
+    do {
+        for (int x = 0; x < w; x++)
+            dst[x] = iclip_pixel((tmp1[x] * mask[x] +
+                                  tmp2[x] * (64 - mask[x]) + 512) >> 10);
+
+        tmp1 += w;
+        tmp2 += w;
+        mask += w;
+        dst += PXSTRIDE(dst_stride);
+    } while (--h);
+}
+
+static void blend_c(pixel *dst, const ptrdiff_t dst_stride,
+                    const pixel *tmp, const ptrdiff_t tmp_stride,
+                    const int w, const int h,
+                    const uint8_t *mask, const ptrdiff_t m_stride)
+{
+    for (int y = 0; y < h; y++) {
+        for (int x = 0; x < w; x++) {
+#define blend_px(a, b, m) (((a * (64 - m) + b * m) + 32) >> 6)
+            dst[x] = blend_px(dst[x], tmp[x], mask[m_stride == 1 ? 0 : x]);
+        }
+        dst += PXSTRIDE(dst_stride);
+        tmp += PXSTRIDE(tmp_stride);
+        mask += m_stride;
+    }
+}
+
+static void w_mask_c(pixel *dst, const ptrdiff_t dst_stride,
+                     const coef *tmp1, const coef *tmp2, const int w, int h,
+                     uint8_t *mask, const int sign,
+                     const int ss_hor, const int ss_ver)
+{
+    // store mask at 2x2 resolution, i.e. store 2x1 sum for even rows,
+    // and then load this intermediate to calculate final value for odd rows
+    const int rnd = 8 << (BITDEPTH - 8);
+    do {
+        for (int x = 0; x < w; x++) {
+            const int m = imin(38 + ((abs(tmp1[x] - tmp2[x]) + rnd) >> BITDEPTH), 64);
+            dst[x] = iclip_pixel((tmp1[x] * m +
+                                  tmp2[x] * (64 - m) + 512) >> 10);
+
+            if (ss_hor) {
+                x++;
+
+                const int n = imin(38 + ((abs(tmp1[x] - tmp2[x]) + rnd) >> BITDEPTH), 64);
+                dst[x] = iclip_pixel((tmp1[x] * n +
+                                      tmp2[x] * (64 - n) + 512) >> 10);
+
+                if (h & ss_ver) {
+                    mask[x >> 1] = (m + n + mask[x >> 1] + 2 - sign) >> 2;
+                } else if (ss_ver) {
+                    mask[x >> 1] = m + n;
+                } else {
+                    mask[x >> 1] = (m + n + 1 - sign) >> 1;
+                }
+            } else {
+                mask[x] = m;
+            }
+        }
+
+        tmp1 += w;
+        tmp2 += w;
+        dst += PXSTRIDE(dst_stride);
+        if (!ss_ver || (h & 1)) mask += w >> ss_hor;
+    } while (--h);
+}
+
+#define w_mask_fns(ssn, ss_hor, ss_ver) \
+static void w_mask_##ssn##_c(pixel *const dst, const ptrdiff_t dst_stride, \
+                             const coef *const tmp1, const coef *const tmp2, \
+                             const int w, const int h, uint8_t *mask, \
+                             const int sign) \
+{ \
+    w_mask_c(dst, dst_stride, tmp1, tmp2, w, h, mask, sign, ss_hor, ss_ver); \
+}
+
+w_mask_fns(444, 0, 0);
+w_mask_fns(422, 1, 0);
+w_mask_fns(420, 1, 1);
+
+#undef w_mask_fns
+
+static void warp_affine_8x8_c(pixel *dst, const ptrdiff_t dst_stride,
+                              const pixel *src, const ptrdiff_t src_stride,
+                              const int16_t *const abcd, int mx, int my)
+{
+    coef mid[15 * 8], *mid_ptr = mid;
+
+    src -= 3 * PXSTRIDE(src_stride);
+    for (int y = 0; y < 15; y++, mx += abcd[1]) {
+        for (int x = 0, tmx = mx; x < 8; x++, tmx += abcd[0]) {
+            const int8_t *const filter =
+                dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)];
+
+            mid_ptr[x] = FILTER_8TAP_RND(src, x, filter, 1, 3);
+        }
+        src += PXSTRIDE(src_stride);
+        mid_ptr += 8;
+    }
+
+    mid_ptr = &mid[3 * 8];
+    for (int y = 0; y < 8; y++, my += abcd[3]) {
+        for (int x = 0, tmy = my; x < 8; x++, tmy += abcd[2]) {
+            const int8_t *const filter =
+                dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)];
+
+            dst[x] = FILTER_8TAP_CLIP(mid_ptr, x, filter, 8, 11);
+        }
+        mid_ptr += 8;
+        dst += PXSTRIDE(dst_stride);
+    }
+}
+
+static void warp_affine_8x8t_c(coef *tmp, const ptrdiff_t tmp_stride,
+                               const pixel *src, const ptrdiff_t src_stride,
+                               const int16_t *const abcd, int mx, int my)
+{
+    coef mid[15 * 8], *mid_ptr = mid;
+
+    src -= 3 * PXSTRIDE(src_stride);
+    for (int y = 0; y < 15; y++, mx += abcd[1]) {
+        for (int x = 0, tmx = mx; x < 8; x++, tmx += abcd[0]) {
+            const int8_t *const filter =
+                dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)];
+
+            mid_ptr[x] = FILTER_8TAP_RND(src, x, filter, 1, 3);
+        }
+        src += PXSTRIDE(src_stride);
+        mid_ptr += 8;
+    }
+
+    mid_ptr = &mid[3 * 8];
+    for (int y = 0; y < 8; y++, my += abcd[3]) {
+        for (int x = 0, tmy = my; x < 8; x++, tmy += abcd[2]) {
+            const int8_t *const filter =
+                dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)];
+
+            tmp[x] = FILTER_8TAP_RND(mid_ptr, x, filter, 8, 7);
+        }
+        mid_ptr += 8;
+        tmp += tmp_stride;
+    }
+}
+
+void bitfn(dav1d_mc_dsp_init)(Dav1dMCDSPContext *const c) {
+#define init_mc_fns(type, name) do { \
+    c->mc [type] = put_##name##_c; \
+    c->mct[type] = prep_##name##_c; \
+} while (0)
+
+    init_mc_fns(FILTER_2D_8TAP_REGULAR,        8tap_regular);
+    init_mc_fns(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth);
+    init_mc_fns(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp);
+    init_mc_fns(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular);
+    init_mc_fns(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth);
+    init_mc_fns(FILTER_2D_8TAP_SHARP,          8tap_sharp);
+    init_mc_fns(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular);
+    init_mc_fns(FILTER_2D_8TAP_SMOOTH,         8tap_smooth);
+    init_mc_fns(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp);
+    init_mc_fns(FILTER_2D_BILINEAR,            bilin);
+
+    c->avg      = avg_c;
+    c->w_avg    = w_avg_c;
+    c->mask     = mask_c;
+    c->blend    = blend_c;
+    c->w_mask[0] = w_mask_444_c;
+    c->w_mask[1] = w_mask_422_c;
+    c->w_mask[2] = w_mask_420_c;
+    c->warp8x8  = warp_affine_8x8_c;
+    c->warp8x8t = warp_affine_8x8t_c;
+
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+    bitfn(dav1d_mc_dsp_init_arm)(c);
+#elif ARCH_X86
+    bitfn(dav1d_mc_dsp_init_x86)(c);
+#endif
+#endif
+}
--- a/src/meson.build
+++ b/src/meson.build
@@ -52,17 +52,17 @@
 # These files are compiled for each bitdepth with
 # `BITDEPTH` defined to the currently built bitdepth.
 libdav1d_tmpl_sources = files(
-    'ipred.c',
-    'itx.c',
-    'ipred_prepare.c',
-    'lf_apply.c',
-    'loopfilter.c',
-    'mc.c',
-    'cdef_apply.c',
-    'cdef.c',
-    'lr_apply.c',
-    'looprestoration.c',
-    'recon.c'
+    'ipred_tmpl.c',
+    'itx_tmpl.c',
+    'ipred_prepare_tmpl.c',
+    'lf_apply_tmpl.c',
+    'loopfilter_tmpl.c',
+    'mc_tmpl.c',
+    'cdef_apply_tmpl.c',
+    'cdef_tmpl.c',
+    'lr_apply_tmpl.c',
+    'looprestoration_tmpl.c',
+    'recon_tmpl.c'
 )
 
 # libdav1d entrypoint source files
--- a/src/recon.c
+++ /dev/null
@@ -1,1518 +1,0 @@
-/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- *    list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "config.h"
-
-#include <string.h>
-#include <stdio.h>
-
-#include "common/attributes.h"
-#include "common/bitdepth.h"
-#include "common/dump.h"
-#include "common/intops.h"
-#include "common/mem.h"
-
-#include "src/cdef_apply.h"
-#include "src/ipred_prepare.h"
-#include "src/lf_apply.h"
-#include "src/lr_apply.h"
-#include "src/recon.h"
-#include "src/scan.h"
-#include "src/tables.h"
-#include "src/wedge.h"
-
-static unsigned read_golomb(MsacContext *const msac) {
-    int len = 0;
-    unsigned val = 1;
-
-    while (!msac_decode_bool(msac, 128 << 7) && len < 32) len++;
-    while (len--) val = (val << 1) | msac_decode_bool(msac, 128 << 7);
-
-    return val - 1;
-}
-
-static int decode_coefs(Dav1dTileContext *const t,
-                        uint8_t *const a, uint8_t *const l,
-                        const enum RectTxfmSize tx, const enum BlockSize bs,
-                        const Av1Block *const b, const int intra,
-                        const int plane, coef *cf,
-                        enum TxfmType *const txtp, uint8_t *res_ctx)
-{
-    Dav1dTileState *const ts = t->ts;
-    const int chroma = !!plane;
-    const Dav1dFrameContext *const f = t->f;
-    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
-    const int dbg = DEBUG_BLOCK_INFO && plane && 0;
-
-    if (dbg) printf("Start: r=%d\n", ts->msac.rng);
-
-    // does this block have any non-zero coefficients
-    const int sctx = get_coef_skip_ctx(t_dim, bs, a, l, chroma, f->cur.p.p.layout);
-    const int all_skip =
-        msac_decode_bool_adapt(&ts->msac, ts->cdf.coef.skip[t_dim->ctx][sctx]);
-    if (dbg)
-    printf("Post-non-zero[%d][%d][%d]: r=%d\n",
-           t_dim->ctx, sctx, all_skip, ts->msac.rng);
-    if (all_skip) {
-        *res_ctx = 0x40;
-        *txtp = f->frame_hdr.segmentation.lossless[b->seg_id] ? WHT_WHT :
-                                                                DCT_DCT;
-        return -1;
-    }
-
-    // transform type (chroma: derived, luma: explicitly coded)
-    if (chroma) {
-        if (intra) {
-            *txtp = get_uv_intra_txtp(b->uv_mode, tx, &f->frame_hdr, b->seg_id);
-        } else {
-            const enum TxfmType y_txtp = *txtp;
-            *txtp = get_uv_inter_txtp(t_dim, y_txtp, &f->frame_hdr, b->seg_id);
-        }
-    } else {
-        const enum TxfmTypeSet set = get_ext_txtp_set(tx, !intra,
-                                                      &f->frame_hdr, b->seg_id);
-        const unsigned set_cnt = dav1d_tx_type_count[set];
-        unsigned idx;
-        if (set_cnt == 1) {
-            idx = 0;
-        } else {
-            const int set_idx = dav1d_tx_type_set_index[!intra][set];
-            const enum IntraPredMode y_mode_nofilt = b->y_mode == FILTER_PRED ?
-                dav1d_filter_mode_to_y_mode[b->y_angle] : b->y_mode;
-            uint16_t *const txtp_cdf = intra ?
-                       ts->cdf.m.txtp_intra[set_idx][t_dim->min][y_mode_nofilt] :
-                       ts->cdf.m.txtp_inter[set_idx][t_dim->min];
-            idx = msac_decode_symbol_adapt(&ts->msac, txtp_cdf, set_cnt);
-            if (dbg)
-            printf("Post-txtp[%d->%d][%d->%d][%d][%d->%d]: r=%d\n",
-                   set, set_idx, tx, t_dim->min, b->intra ? (int)y_mode_nofilt : -1,
-                   idx, dav1d_tx_types_per_set[set][idx], ts->msac.rng);
-        }
-        *txtp = dav1d_tx_types_per_set[set][idx];
-    }
-
-    // find end-of-block (eob)
-    int eob_bin;
-    const int tx2dszctx = imin(t_dim->lw, TX_32X32) + imin(t_dim->lh, TX_32X32);
-    const enum TxClass tx_class = dav1d_tx_type_class[*txtp];
-    const int is_1d = tx_class != TX_CLASS_2D;
-    switch (tx2dszctx) {
-#define case_sz(sz, bin) \
-    case sz: { \
-        uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_##bin[chroma][is_1d]; \
-        eob_bin = msac_decode_symbol_adapt(&ts->msac, eob_bin_cdf, 5 + sz); \
-        break; \
-    }
-    case_sz(0,   16);
-    case_sz(1,   32);
-    case_sz(2,   64);
-    case_sz(3,  128);
-    case_sz(4,  256);
-    case_sz(5,  512);
-    case_sz(6, 1024);
-#undef case_sz
-    }
-    if (dbg)
-    printf("Post-eob_bin_%d[%d][%d][%d]: r=%d\n",
-           16 << tx2dszctx, chroma, is_1d, eob_bin, ts->msac.rng);
-    int eob;
-    if (eob_bin > 1) {
-        eob = 1 << (eob_bin - 1);
-        uint16_t *const eob_hi_bit_cdf =
-            ts->cdf.coef.eob_hi_bit[t_dim->ctx][chroma][eob_bin];
-        const int eob_hi_bit = msac_decode_bool_adapt(&ts->msac, eob_hi_bit_cdf);
-        if (dbg)
-        printf("Post-eob_hi_bit[%d][%d][%d][%d]: r=%d\n",
-               t_dim->ctx, chroma, eob_bin, eob_hi_bit, ts->msac.rng);
-        unsigned mask = eob >> 1;
-        if (eob_hi_bit) eob |= mask;
-        for (mask >>= 1; mask; mask >>= 1) {
-            const int eob_bit = msac_decode_bool(&ts->msac, 128 << 7);
-            if (eob_bit) eob |= mask;
-        }
-        if (dbg)
-        printf("Post-eob[%d]: r=%d\n", eob, ts->msac.rng);
-    } else {
-        eob = eob_bin;
-    }
-
-    // base tokens
-    uint16_t (*const br_cdf)[5] =
-        ts->cdf.coef.br_tok[imin(t_dim->ctx, 3)][chroma];
-    const int16_t *const scan = dav1d_scans[tx][tx_class];
-    uint8_t levels[36 * 36];
-    ptrdiff_t stride = 4 * (imin(t_dim->h, 8) + 1);
-    memset(levels, 0, stride * 4 * (imin(t_dim->w, 8) + 1));
-    const int shift = 2 + imin(t_dim->lh, 3), mask = 4 * imin(t_dim->h, 8) - 1;
-    unsigned cul_level = 0;
-    for (int i = eob, is_last = 1; i >= 0; i--, is_last = 0) {
-        const int rc = scan[i], x = rc >> shift, y = rc & mask;
-
-        // lo tok
-        const int ctx = get_coef_nz_ctx(levels, i, rc, is_last, tx, tx_class);
-        uint16_t *const lo_cdf = is_last ?
-            ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma][ctx] :
-            ts->cdf.coef.base_tok[t_dim->ctx][chroma][ctx];
-        int tok = msac_decode_symbol_adapt(&ts->msac, lo_cdf,
-                                           4 - is_last) + is_last;
-        if (dbg)
-        printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n",
-               t_dim->ctx, chroma, ctx, i, rc, tok, ts->msac.rng);
-        if (!tok) continue;
-
-        // hi tok
-        if (tok == 3) {
-            const int br_ctx = get_br_ctx(levels, rc, tx, tx_class);
-            do {
-                const int tok_br =
-                    msac_decode_symbol_adapt(&ts->msac, br_cdf[br_ctx], 4);
-                if (dbg)
-                printf("Post-hi_tok[%d][%d][%d][%d=%d=%d->%d]: r=%d\n",
-                       imin(t_dim->ctx, 3), chroma, br_ctx,
-                       i, rc, tok_br, tok, ts->msac.rng);
-                tok += tok_br;
-                if (tok_br < 3) break;
-            } while (tok < 15);
-        }
-
-        levels[x * stride + y] = cf[rc] = tok;
-    }
-
-    // residual and sign
-    int dc_sign = 1;
-    const uint16_t *const dq_tbl = ts->dq[b->seg_id][plane];
-    const uint8_t *const qm_tbl = f->qm[is_1d || *txtp == IDTX][tx][plane];
-    const int dq_shift = imax(0, t_dim->ctx - 2);
-    for (int i = 0; i <= eob; i++) {
-        const int rc = scan[i];
-        int tok = cf[rc];
-        if (!tok) continue;
-        int dq;
-
-        // sign
-        int sign;
-        if (i == 0) {
-            const int dc_sign_ctx = get_dc_sign_ctx(t_dim, a, l);
-            uint16_t *const dc_sign_cdf =
-                ts->cdf.coef.dc_sign[chroma][dc_sign_ctx];
-            sign = msac_decode_bool_adapt(&ts->msac, dc_sign_cdf);
-            if (dbg)
-            printf("Post-dc_sign[%d][%d][%d]: r=%d\n",
-                   chroma, dc_sign_ctx, sign, ts->msac.rng);
-            dc_sign = sign ? 0 : 2;
-            dq = (dq_tbl[0] * qm_tbl[0] + 16) >> 5;
-        } else {
-            sign = msac_decode_bool(&ts->msac, 128 << 7);
-            if (dbg)
-            printf("Post-sign[%d=%d=%d]: r=%d\n", i, rc, sign, ts->msac.rng);
-            dq = (dq_tbl[1] * qm_tbl[rc] + 16) >> 5;
-        }
-
-        // residual
-        if (tok == 15) {
-            tok += read_golomb(&ts->msac);
-            if (dbg)
-            printf("Post-residual[%d=%d=%d->%d]: r=%d\n",
-                   i, rc, tok - 15, tok, ts->msac.rng);
-        }
-
-        // dequant
-        cul_level += tok;
-        tok *= dq;
-        tok >>= dq_shift;
-        cf[rc] = sign ? -tok : tok;
-    }
-
-    // context
-    *res_ctx = imin(cul_level, 63) | (dc_sign << 6);
-
-    return eob;
-}
-
-static void read_coef_tree(Dav1dTileContext *const t,
-                           const enum BlockSize bs, const Av1Block *const b,
-                           const enum RectTxfmSize ytx, const int depth,
-                           const uint16_t *const tx_split,
-                           const int x_off, const int y_off, pixel *dst)
-{
-    const Dav1dFrameContext *const f = t->f;
-    Dav1dTileState *const ts = t->ts;
-    const Dav1dDSPContext *const dsp = f->dsp;
-    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[ytx];
-    const int txw = t_dim->w, txh = t_dim->h;
-
-    if (depth < 2 && tx_split[depth] & (1 << (y_off * 4 + x_off))) {
-        const enum RectTxfmSize sub = t_dim->sub;
-        const TxfmInfo *const sub_t_dim = &dav1d_txfm_dimensions[sub];
-        const int txsw = sub_t_dim->w, txsh = sub_t_dim->h;
-
-        read_coef_tree(t, bs, b, sub, depth + 1, tx_split,
-                       x_off * 2 + 0, y_off * 2 + 0, dst);
-        t->bx += txsw;
-        if (txw >= txh && t->bx < f->bw)
-            read_coef_tree(t, bs, b, sub, depth + 1, tx_split, x_off * 2 + 1,
-                           y_off * 2 + 0, dst ? &dst[4 * txsw] : NULL);
-        t->bx -= txsw;
-        t->by += txsh;
-        if (txh >= txw && t->by < f->bh) {
-            if (dst)
-                dst += 4 * txsh * PXSTRIDE(f->cur.p.stride[0]);
-            read_coef_tree(t, bs, b, sub, depth + 1, tx_split,
-                           x_off * 2 + 0, y_off * 2 + 1, dst);
-            t->bx += txsw;
-            if (txw >= txh && t->bx < f->bw)
-                read_coef_tree(t, bs, b, sub, depth + 1, tx_split, x_off * 2 + 1,
-                               y_off * 2 + 1, dst ? &dst[4 * txsw] : NULL);
-            t->bx -= txsw;
-        }
-        t->by -= txsh;
-    } else {
-        const int bx4 = t->bx & 31, by4 = t->by & 31;
-        enum TxfmType txtp;
-        uint8_t cf_ctx;
-        int eob;
-        coef *cf;
-        struct CodedBlockInfo *cbi;
-
-        if (f->frame_thread.pass) {
-            cf = ts->frame_thread.cf;
-            ts->frame_thread.cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
-            cbi = &f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
-        } else {
-            cf = t->cf;
-        }
-        if (f->frame_thread.pass != 2) {
-            eob = decode_coefs(t, &t->a->lcoef[bx4], &t->l.lcoef[by4],
-                               ytx, bs, b, 0, 0, cf, &txtp, &cf_ctx);
-            if (DEBUG_BLOCK_INFO)
-                printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
-                       ytx, txtp, eob, ts->msac.rng);
-            memset(&t->a->lcoef[bx4], cf_ctx, imin(txw, f->bw - t->bx));
-            memset(&t->l.lcoef[by4], cf_ctx, imin(txh, f->bh - t->by));
-            for (int y = 0; y < txh; y++)
-                memset(&t->txtp_map[(by4 + y) * 32 + bx4], txtp, txw);
-            if (f->frame_thread.pass == 1) {
-                cbi->eob[0] = eob;
-                cbi->txtp[0] = txtp;
-            }
-        } else {
-            eob = cbi->eob[0];
-            txtp = cbi->txtp[0];
-        }
-        if (!(f->frame_thread.pass & 1)) {
-            assert(dst);
-            if (eob >= 0) {
-                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
-                    coef_dump(cf, imin(t_dim->h, 8) * 4, imin(t_dim->w, 8) * 4, 3, "dq");
-                dsp->itx.itxfm_add[ytx][txtp](dst, f->cur.p.stride[0], cf, eob);
-                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
-                    hex_dump(dst, f->cur.p.stride[0], t_dim->w * 4, t_dim->h * 4, "recon");
-            }
-        }
-    }
-}
-
-void bytefn(dav1d_read_coef_blocks)(Dav1dTileContext *const t,
-                                    const enum BlockSize bs, const Av1Block *const b)
-{
-    const Dav1dFrameContext *const f = t->f;
-    const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
-    const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
-    const int bx4 = t->bx & 31, by4 = t->by & 31;
-    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
-    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
-    const int bw4 = b_dim[0], bh4 = b_dim[1];
-    const int cbw4 = (bw4 + 1) >> ss_hor, cbh4 = (bh4 + 1) >> ss_ver;
-    const int has_chroma = f->seq_hdr.layout != DAV1D_PIXEL_LAYOUT_I400 &&
-                           (bw4 > ss_hor || t->bx & 1) &&
-                           (bh4 > ss_ver || t->by & 1);
-
-    if (b->skip) {
-        memset(&t->a->lcoef[bx4], 0x40, bw4);
-        memset(&t->l.lcoef[by4], 0x40, bh4);
-        if (has_chroma) for (int pl = 0; pl < 2; pl++) {
-            memset(&t->a->ccoef[pl][cbx4], 0x40, cbw4);
-            memset(&t->l.ccoef[pl][cby4], 0x40, cbh4);
-        }
-        return;
-    }
-
-    Dav1dTileState *const ts = t->ts;
-    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
-    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
-    assert(f->frame_thread.pass == 1);
-    assert(!b->skip);
-    const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
-    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->intra ? b->tx : b->max_ytx];
-
-    for (int init_y = 0; init_y < h4; init_y += 16) {
-        for (int init_x = 0; init_x < w4; init_x += 16) {
-            const int sub_h4 = imin(h4, 16 + init_y);
-            const int sub_w4 = imin(w4, init_x + 16);
-            int y_off = !!init_y, y, x;
-            for (y = init_y, t->by += init_y; y < sub_h4;
-                 y += t_dim->h, t->by += t_dim->h, y_off++)
-            {
-                struct CodedBlockInfo *const cbi =
-                    &f->frame_thread.cbi[t->by * f->b4_stride];
-                int x_off = !!init_x;
-                for (x = init_x, t->bx += init_x; x < sub_w4;
-                     x += t_dim->w, t->bx += t_dim->w, x_off++)
-                {
-                    if (!b->intra) {
-                        read_coef_tree(t, bs, b, b->max_ytx, 0, b->tx_split,
-                                       x_off, y_off, NULL);
-                    } else {
-                        uint8_t cf_ctx = 0x40;
-                        enum TxfmType txtp;
-                        const int eob = cbi[t->bx].eob[0] =
-                            decode_coefs(t, &t->a->lcoef[bx4 + x],
-                                         &t->l.lcoef[by4 + y], b->tx, bs, b, 1,
-                                         0, ts->frame_thread.cf, &txtp, &cf_ctx);
-                        if (DEBUG_BLOCK_INFO)
-                            printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
-                                   b->tx, txtp, eob, ts->msac.rng);
-                        cbi[t->bx].txtp[0] = txtp;
-                        ts->frame_thread.cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
-                        memset(&t->a->lcoef[bx4 + x], cf_ctx,
-                               imin(t_dim->w, f->bw - t->bx));
-                        memset(&t->l.lcoef[by4 + y], cf_ctx,
-                               imin(t_dim->h, f->bh - t->by));
-                    }
-                }
-                t->bx -= x;
-            }
-            t->by -= y;
-
-            if (!has_chroma) continue;
-
-            const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);
-            const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor);
-            for (int pl = 0; pl < 2; pl++) {
-                for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
-                     y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
-                {
-                    struct CodedBlockInfo *const cbi =
-                        &f->frame_thread.cbi[t->by * f->b4_stride];
-                    for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
-                         x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
-                    {
-                        uint8_t cf_ctx = 0x40;
-                        enum TxfmType txtp;
-                        if (!b->intra)
-                            txtp = t->txtp_map[(by4 + (y << ss_ver)) * 32 +
-                                                bx4 + (x << ss_hor)];
-                        const int eob = cbi[t->bx].eob[1 + pl] =
-                            decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
-                                         &t->l.ccoef[pl][cby4 + y], b->uvtx, bs,
-                                         b, b->intra, 1 + pl, ts->frame_thread.cf,
-                                         &txtp, &cf_ctx);
-                        if (DEBUG_BLOCK_INFO)
-                            printf("Post-uv-cf-blk[pl=%d,tx=%d,"
-                                   "txtp=%d,eob=%d]: r=%d\n",
-                                   pl, b->uvtx, txtp, eob, ts->msac.rng);
-                        cbi[t->bx].txtp[1 + pl] = txtp;
-                        ts->frame_thread.cf += uv_t_dim->w * uv_t_dim->h * 16;
-                        memset(&t->a->ccoef[pl][cbx4 + x], cf_ctx,
-                               imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor));
-                        memset(&t->l.ccoef[pl][cby4 + y], cf_ctx,
-                               imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver));
-                    }
-                    t->bx -= x << ss_hor;
-                }
-                t->by -= y << ss_ver;
-            }
-        }
-    }
-}
-
-static void emu_edge(pixel *dst, const ptrdiff_t dst_stride,
-                     const pixel *ref, const ptrdiff_t ref_stride,
-                     const int bw, const int bh,
-                     const int iw, const int ih,
-                     const int x, const int y)
-{
-    // find offset in reference of visible block to copy
-    ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) + iclip(x, 0, iw - 1);
-
-    // number of pixels to extend (left, right, top, bottom)
-    const int left_ext = iclip(-x, 0, bw - 1);
-    const int right_ext = iclip(x + bw - iw, 0, bw - 1);
-    assert(left_ext + right_ext < bw);
-    const int top_ext = iclip(-y, 0, bh - 1);
-    const int bottom_ext = iclip(y + bh - ih, 0, bh - 1);
-    assert(top_ext + bottom_ext < bh);
-
-    // copy visible portion first
-    pixel *blk = dst + top_ext * PXSTRIDE(dst_stride);
-    const int center_w = bw - left_ext - right_ext;
-    const int center_h = bh - top_ext - bottom_ext;
-    for (int y = 0; y < center_h; y++) {
-        pixel_copy(blk + left_ext, ref, center_w);
-        // extend left edge for this line
-        if (left_ext)
-            pixel_set(blk, blk[left_ext], left_ext);
-        // extend right edge for this line
-        if (right_ext)
-            pixel_set(blk + left_ext + center_w, blk[left_ext + center_w - 1],
-                      right_ext);
-        ref += PXSTRIDE(ref_stride);
-        blk += PXSTRIDE(dst_stride);
-    }
-
-    // copy top
-    blk = dst + top_ext * PXSTRIDE(dst_stride);
-    for (int y = 0; y < top_ext; y++) {
-        pixel_copy(dst, blk, bw);
-        dst += PXSTRIDE(dst_stride);
-    }
-
-    // copy bottom
-    dst += center_h * PXSTRIDE(dst_stride);
-    for (int y = 0; y < bottom_ext; y++) {
-        pixel_copy(dst, &dst[-PXSTRIDE(dst_stride)], bw);
-        dst += PXSTRIDE(dst_stride);
-    }
-}
-
-static void mc(Dav1dTileContext *const t,
-               pixel *const dst8, coef *const dst16, const ptrdiff_t dst_stride,
-               const int bw4, const int bh4,
-               const int bx, const int by, const int pl,
-               const mv mv, const Dav1dThreadPicture *const refp,
-               const enum Filter2d filter_2d)
-{
-    assert((dst8 != NULL) ^ (dst16 != NULL));
-    const Dav1dFrameContext *const f = t->f;
-    const int ss_ver = !!pl && f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
-    const int ss_hor = !!pl && f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
-    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
-    const int mvx = mv.x, mvy = mv.y;
-    const int mx = mvx & (15 >> !ss_hor), my = mvy & (15 >> !ss_ver);
-    const int dx = bx * h_mul + (mvx >> (3 + ss_hor));
-    const int dy = by * v_mul + (mvy >> (3 + ss_ver));
-    ptrdiff_t ref_stride = refp->p.stride[!!pl];
-    const pixel *ref;
-    int w, h;
-
-    if (refp != &f->cur) { // i.e. not for intrabc
-        dav1d_thread_picture_wait(refp, dy + bh4 * v_mul + !!my * 4,
-                                  PLANE_TYPE_Y + !!pl);
-        w = (f->cur.p.p.w + ss_hor) >> ss_hor;
-        h = (f->cur.p.p.h + ss_ver) >> ss_ver;
-    } else {
-        w = f->bw * 4 >> ss_hor;
-        h = f->bh * 4 >> ss_ver;
-    }
-    if (dx < !!mx * 3 || dy < !!my * 3 ||
-        dx + bw4 * h_mul + !!mx * 4 > w ||
-        dy + bh4 * v_mul + !!my * 4 > h)
-    {
-        emu_edge(t->emu_edge, 160 * sizeof(pixel), refp->p.data[pl], ref_stride,
-                 bw4 * h_mul + !!mx * 7, bh4 * v_mul + !!my * 7, w, h,
-                 dx - !!mx * 3, dy - !!my * 3);
-        ref = &t->emu_edge[160 * !!my * 3 + !!mx * 3];
-        ref_stride = 160 * sizeof(pixel);
-    } else {
-        ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx;
-    }
-
-    if (dst8 != NULL) {
-        f->dsp->mc.mc[filter_2d](dst8, dst_stride, ref, ref_stride, bw4 * h_mul,
-                                 bh4 * v_mul, mx << !ss_hor, my << !ss_ver);
-    } else {
-        f->dsp->mc.mct[filter_2d](dst16, ref, ref_stride, bw4 * h_mul,
-                                  bh4 * v_mul, mx << !ss_hor, my << !ss_ver);
-    }
-}
-
-static void obmc(Dav1dTileContext *const t,
-                 pixel *const dst, const ptrdiff_t dst_stride,
-                 const uint8_t *const b_dim, const int pl,
-                 const int bx4, const int by4, const int w4, const int h4)
-{
-    assert(!(t->bx & 1) && !(t->by & 1));
-    const Dav1dFrameContext *const f = t->f;
-    const refmvs *const r = &f->mvs[t->by * f->b4_stride + t->bx];
-    pixel *const lap = t->scratch.lap;
-    static const uint8_t obmc_mask_2[2] = { 19,  0 };
-    static const uint8_t obmc_mask_4[4] = { 25, 14,  5,  0 };
-    static const uint8_t obmc_mask_8[8] = { 28, 22, 16, 11,  7,  3,  0,  0 };
-    static const uint8_t obmc_mask_16[16] = { 30, 27, 24, 21, 18, 15, 12, 10,
-                                               8,  6,  4,  3,  0,  0,  0,  0 };
-    static const uint8_t obmc_mask_32[32] = { 31, 29, 28, 26, 24, 23, 21, 20,
-                                              19, 17, 16, 14, 13, 12, 11,  9,
-                                               8,  7,  6,  5,  4,  4,  3,  2,
-                                               0,  0,  0,  0,  0,  0,  0,  0 };
-    static const uint8_t *const obmc_masks[] = {
-        obmc_mask_2, obmc_mask_4, obmc_mask_8, obmc_mask_16, obmc_mask_32
-    };
-    const int ss_ver = !!pl && f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
-    const int ss_hor = !!pl && f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
-    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
-
-    if (t->by > t->ts->tiling.row_start &&
-        (!pl || b_dim[0] * h_mul + b_dim[1] * v_mul >= 16))
-    {
-        for (int i = 0, x = 0; x < w4 && i < imin(b_dim[2], 4); ) {
-            // only odd blocks are considered for overlap handling, hence +1
-            const refmvs *const a_r = &r[x - f->b4_stride + 1];
-            const uint8_t *const a_b_dim =
-                dav1d_block_dimensions[sbtype_to_bs[a_r->sb_type]];
-
-            if (a_r->ref[0] > 0) {
-                mc(t, lap, NULL, 128 * sizeof(pixel),
-                   iclip(a_b_dim[0], 2, b_dim[0]), imin(b_dim[1], 16) >> 1,
-                   t->bx + x, t->by, pl, a_r->mv[0],
-                   &f->refp[a_r->ref[0] - 1],
-                   dav1d_filter_2d[t->a->filter[1][bx4 + x + 1]][t->a->filter[0][bx4 + x + 1]]);
-                f->dsp->mc.blend(&dst[x * h_mul], dst_stride,
-                                 lap, 128 * sizeof(pixel),
-                                 h_mul * iclip(a_b_dim[0], 2, b_dim[0]),
-                                 v_mul * imin(b_dim[1], 16) >> 1,
-                                 obmc_masks[imin(b_dim[3], 4) - ss_ver], 1);
-                i++;
-            }
-            x += imax(a_b_dim[0], 2);
-        }
-    }
-
-    if (t->bx > t->ts->tiling.col_start)
-        for (int i = 0, y = 0; y < h4 && i < imin(b_dim[3], 4); ) {
-            // only odd blocks are considered for overlap handling, hence +1
-            const refmvs *const l_r = &r[(y + 1) * f->b4_stride - 1];
-            const uint8_t *const l_b_dim =
-                dav1d_block_dimensions[sbtype_to_bs[l_r->sb_type]];
-
-            if (l_r->ref[0] > 0) {
-                mc(t, lap, NULL, 32 * sizeof(pixel),
-                   imin(b_dim[0], 16) >> 1,
-                   iclip(l_b_dim[1], 2, b_dim[1]),
-                   t->bx, t->by + y, pl, l_r->mv[0],
-                   &f->refp[l_r->ref[0] - 1],
-                   dav1d_filter_2d[t->l.filter[1][by4 + y + 1]][t->l.filter[0][by4 + y + 1]]);
-                f->dsp->mc.blend(&dst[y * v_mul * PXSTRIDE(dst_stride)], dst_stride,
-                                 lap, 32 * sizeof(pixel),
-                                 h_mul * imin(b_dim[0], 16) >> 1,
-                                 v_mul * iclip(l_b_dim[1], 2, b_dim[1]),
-                                 obmc_masks[imin(b_dim[2], 4) - ss_hor], 0);
-                i++;
-            }
-            y += imax(l_b_dim[1], 2);
-        }
-}
-
-static void warp_affine(Dav1dTileContext *const t,
-                        pixel *dst8, coef *dst16, const ptrdiff_t dstride,
-                        const uint8_t *const b_dim, const int pl,
-                        const Dav1dThreadPicture *const refp,
-                        const WarpedMotionParams *const wmp)
-{
-    assert((dst8 != NULL) ^ (dst16 != NULL));
-    const Dav1dFrameContext *const f = t->f;
-    const Dav1dDSPContext *const dsp = f->dsp;
-    const int ss_ver = !!pl && f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
-    const int ss_hor = !!pl && f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
-    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
-    assert(!((b_dim[0] * h_mul) & 7) && !((b_dim[1] * v_mul) & 7));
-    const int32_t *const mat = wmp->matrix;
-    const int width = (f->cur.p.p.w + ss_hor) >> ss_hor;
-    const int height = (f->cur.p.p.h + ss_ver) >> ss_ver;
-
-    for (int y = 0; y < b_dim[1] * v_mul; y += 8) {
-        for (int x = 0; x < b_dim[0] * h_mul; x += 8) {
-            // calculate transformation relative to center of 8x8 block in
-            // luma pixel units
-            const int src_x = t->bx * 4 + ((x + 4) << ss_hor);
-            const int src_y = t->by * 4 + ((y + 4) << ss_ver);
-            const int mvx = (mat[2] * src_x + mat[3] * src_y + mat[0]) >> ss_hor;
-            const int mvy = (mat[4] * src_x + mat[5] * src_y + mat[1]) >> ss_ver;
-
-            const int dx = (mvx >> 16) - 4;
-            const int mx = ((mvx & 0xffff) - wmp->alpha * 4 -
-                                             wmp->beta  * 7) & ~0x3f;
-            const int dy = (mvy >> 16) - 4;
-            const int my = ((mvy & 0xffff) - wmp->gamma * 4 -
-                                             wmp->delta * 4) & ~0x3f;
-
-            const pixel *ref_ptr;
-            ptrdiff_t ref_stride = refp->p.stride[!!pl];
-
-            dav1d_thread_picture_wait(refp, dy + 4 + 8,
-                                      PLANE_TYPE_Y + !!pl);
-            if (dx < 3 || dx + 8 + 4 > width || dy < 3 || dy + 8 + 4 > height) {
-                emu_edge(t->emu_edge, 160 * sizeof(pixel), refp->p.data[pl],
-                         ref_stride, 15, 15, width, height, dx - 3, dy - 3);
-                ref_ptr = &t->emu_edge[160 * 3 + 3];
-                ref_stride = 160 * sizeof(pixel);
-            } else {
-                ref_ptr = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx;
-            }
-            if (dst16 != NULL)
-                dsp->mc.warp8x8t(&dst16[x], dstride, ref_ptr, ref_stride,
-                                 wmp->abcd, mx, my);
-            else
-                dsp->mc.warp8x8(&dst8[x], dstride, ref_ptr, ref_stride,
-                                wmp->abcd, mx, my);
-        }
-        if (dst8) dst8  += 8 * PXSTRIDE(dstride);
-        else      dst16 += 8 * dstride;
-    }
-}
-
-void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize bs,
-                                 const enum EdgeFlags intra_edge_flags,
-                                 const Av1Block *const b)
-{
-    Dav1dTileState *const ts = t->ts;
-    const Dav1dFrameContext *const f = t->f;
-    const Dav1dDSPContext *const dsp = f->dsp;
-    const int bx4 = t->bx & 31, by4 = t->by & 31;
-    const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
-    const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
-    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
-    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
-    const int bw4 = b_dim[0], bh4 = b_dim[1];
-    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
-    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
-    const int has_chroma = f->seq_hdr.layout != DAV1D_PIXEL_LAYOUT_I400 &&
-                           (bw4 > ss_hor || t->bx & 1) &&
-                           (bh4 > ss_ver || t->by & 1);
-    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->tx];
-    const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
-
-    // coefficient coding
-    ALIGN_STK_32(pixel, edge_buf, 257,);
-    pixel *const edge = edge_buf + 128;
-    const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
-
-    for (int init_y = 0; init_y < h4; init_y += 16) {
-        for (int init_x = 0; init_x < w4; init_x += 16) {
-            if (b->pal_sz[0]) {
-                pixel *dst = ((pixel *) f->cur.p.data[0]) +
-                             4 * (t->by * PXSTRIDE(f->cur.p.stride[0]) + t->bx);
-                const uint8_t *pal_idx;
-                if (f->frame_thread.pass) {
-                    pal_idx = ts->frame_thread.pal_idx;
-                    ts->frame_thread.pal_idx += bw4 * bh4 * 16;
-                } else {
-                    pal_idx = t->scratch.pal_idx;
-                }
-                const uint16_t *const pal = f->frame_thread.pass ?
-                    f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
-                                        ((t->bx >> 1) + (t->by & 1))][0] : t->pal[0];
-                f->dsp->ipred.pal_pred(dst, f->cur.p.stride[0], pal,
-                                       pal_idx, bw4 * 4, bh4 * 4);
-                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
-                    hex_dump(dst, PXSTRIDE(f->cur.p.stride[0]),
-                             bw4 * 4, bh4 * 4, "y-pal-pred");
-            }
-
-            const int sm_fl = sm_flag(t->a, bx4) | sm_flag(&t->l, by4);
-            const int sb_has_tr = init_x + 16 < w4 ? 1 : init_y ? 0 :
-                              intra_edge_flags & EDGE_I444_TOP_HAS_RIGHT;
-            const int sb_has_bl = init_x ? 0 : init_y + 16 < h4 ? 1 :
-                              intra_edge_flags & EDGE_I444_LEFT_HAS_BOTTOM;
-            int y, x;
-            const int sub_h4 = imin(h4, 16 + init_y);
-            const int sub_w4 = imin(w4, init_x + 16);
-            for (y = init_y, t->by += init_y; y < sub_h4;
-                 y += t_dim->h, t->by += t_dim->h)
-            {
-                pixel *dst = ((pixel *) f->cur.p.data[0]) +
-                               4 * (t->by * PXSTRIDE(f->cur.p.stride[0]) +
-                                    t->bx + init_x);
-                for (x = init_x, t->bx += init_x; x < sub_w4;
-                     x += t_dim->w, t->bx += t_dim->w)
-                {
-                    if (b->pal_sz[0]) goto skip_y_pred;
-
-                    int angle = b->y_angle;
-                    const enum EdgeFlags edge_flags =
-                        (((y > init_y || !sb_has_tr) && (x + t_dim->w >= sub_w4)) ?
-                             0 : EDGE_I444_TOP_HAS_RIGHT) |
-                        ((x > init_x || (!sb_has_bl && y + t_dim->h >= sub_h4)) ?
-                             0 : EDGE_I444_LEFT_HAS_BOTTOM);
-                    const pixel *top_sb_edge = NULL;
-                    if (!(t->by & (f->sb_step - 1))) {
-                        top_sb_edge = f->ipred_edge[0];
-                        const int sby = t->by >> f->sb_shift;
-                        top_sb_edge += f->sb128w * 128 * (sby - 1);
-                    }
-                    const enum IntraPredMode m =
-                        bytefn(dav1d_prepare_intra_edges)(t->bx,
-                                                          t->bx > ts->tiling.col_start,
-                                                          t->by,
-                                                          t->by > ts->tiling.row_start,
-                                                          ts->tiling.col_end,
-                                                          ts->tiling.row_end,
-                                                          edge_flags, dst,
-                                                          f->cur.p.stride[0], top_sb_edge,
-                                                          b->y_mode, &angle,
-                                                          t_dim->w, t_dim->h, edge);
-                    dsp->ipred.intra_pred[m](dst, f->cur.p.stride[0], edge,
-                                             t_dim->w * 4, t_dim->h * 4,
-                                             angle | sm_fl);
-
-                    if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
-                        hex_dump(edge - t_dim->h * 4, t_dim->h * 4,
-                                 t_dim->h * 4, 2, "l");
-                        hex_dump(edge, 0, 1, 1, "tl");
-                        hex_dump(edge + 1, t_dim->w * 4,
-                                 t_dim->w * 4, 2, "t");
-                        hex_dump(dst, f->cur.p.stride[0],
-                                 t_dim->w * 4, t_dim->h * 4, "y-intra-pred");
-                    }
-
-                skip_y_pred: {}
-                    if (!b->skip) {
-                        coef *cf;
-                        int eob;
-                        enum TxfmType txtp;
-                        if (f->frame_thread.pass) {
-                            cf = ts->frame_thread.cf;
-                            ts->frame_thread.cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
-                            const struct CodedBlockInfo *const cbi =
-                                &f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
-                            eob = cbi->eob[0];
-                            txtp = cbi->txtp[0];
-                        } else {
-                            uint8_t cf_ctx;
-                            cf = t->cf;
-                            eob = decode_coefs(t, &t->a->lcoef[bx4 + x],
-                                               &t->l.lcoef[by4 + y], b->tx, bs,
-                                               b, 1, 0, cf, &txtp, &cf_ctx);
-                            if (DEBUG_BLOCK_INFO)
-                                printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
-                                       b->tx, txtp, eob, ts->msac.rng);
-                            memset(&t->a->lcoef[bx4 + x], cf_ctx,
-                                   imin(t_dim->w, f->bw - t->bx));
-                            memset(&t->l.lcoef[by4 + y], cf_ctx,
-                                   imin(t_dim->h, f->bh - t->by));
-                        }
-                        if (eob >= 0) {
-                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
-                                coef_dump(cf, imin(t_dim->h, 8) * 4,
-                                          imin(t_dim->w, 8) * 4, 3, "dq");
-                            dsp->itx.itxfm_add[b->tx]
-                                              [txtp](dst,
-                                                     f->cur.p.stride[0],
-                                                     cf, eob);
-                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
-                                hex_dump(dst, f->cur.p.stride[0],
-                                         t_dim->w * 4, t_dim->h * 4, "recon");
-                        }
-                    } else if (!f->frame_thread.pass) {
-                        memset(&t->a->lcoef[bx4 + x], 0x40, t_dim->w);
-                        memset(&t->l.lcoef[by4 + y], 0x40, t_dim->h);
-                    }
-                    dst += 4 * t_dim->w;
-                }
-                t->bx -= x;
-            }
-            t->by -= y;
-
-            if (!has_chroma) continue;
-
-            const ptrdiff_t stride = f->cur.p.stride[1];
-
-            if (b->uv_mode == CFL_PRED) {
-                assert(!init_x && !init_y);
-
-                int16_t *const ac = t->scratch.ac;
-                pixel *y_src = ((pixel *) f->cur.p.data[0]) + 4 * (t->bx & ~ss_hor) +
-                                 4 * (t->by & ~ss_ver) * PXSTRIDE(f->cur.p.stride[0]);
-                const ptrdiff_t uv_off = 4 * ((t->bx >> ss_hor) +
-                                              (t->by >> ss_ver) * PXSTRIDE(stride));
-                pixel *const uv_dst[2] = { ((pixel *) f->cur.p.data[1]) + uv_off,
-                                           ((pixel *) f->cur.p.data[2]) + uv_off };
-
-                const int furthest_r =
-                    ((cw4 << ss_hor) + t_dim->w - 1) & ~(t_dim->w - 1);
-                const int furthest_b =
-                    ((ch4 << ss_ver) + t_dim->h - 1) & ~(t_dim->h - 1);
-                dsp->ipred.cfl_ac[f->cur.p.p.layout - 1]
-                                 [b->uvtx](ac, y_src, f->cur.p.stride[0],
-                                           cbw4 - (furthest_r >> ss_hor),
-                                           cbh4 - (furthest_b >> ss_ver));
-                for (int pl = 0; pl < 2; pl++) {
-                    if (!b->cfl_alpha[pl]) continue;
-                    int angle = 0;
-                    const pixel *top_sb_edge = NULL;
-                    if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {
-                        top_sb_edge = f->ipred_edge[pl + 1];
-                        const int sby = t->by >> f->sb_shift;
-                        top_sb_edge += f->sb128w * 128 * (sby - 1);
-                    }
-                    const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;
-                    const int xstart = ts->tiling.col_start >> ss_hor;
-                    const int ystart = ts->tiling.row_start >> ss_ver;
-                    const enum IntraPredMode m =
-                        bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
-                                                          ypos, ypos > ystart,
-                                                          ts->tiling.col_end >> ss_hor,
-                                                          ts->tiling.row_end >> ss_ver,
-                                                          0, uv_dst[pl], stride,
-                                                          top_sb_edge, DC_PRED, &angle,
-                                                          uv_t_dim->w,
-                                                          uv_t_dim->h, edge);
-                    dsp->ipred.cfl_pred[m](uv_dst[pl], stride, edge,
-                                           uv_t_dim->w * 4,
-                                           uv_t_dim->h * 4,
-                                           ac, b->cfl_alpha[pl]);
-                }
-                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
-                    ac_dump(ac, 4*cbw4, 4*cbh4, "ac");
-                    hex_dump(uv_dst[0], stride, cbw4 * 4, cbh4 * 4, "u-cfl-pred");
-                    hex_dump(uv_dst[1], stride, cbw4 * 4, cbh4 * 4, "v-cfl-pred");
-                }
-            } else if (b->pal_sz[1]) {
-                ptrdiff_t uv_dstoff = 4 * ((t->bx >> ss_hor) +
-                                           (t->by >> ss_ver) * PXSTRIDE(f->cur.p.stride[1]));
-                const uint8_t *pal_idx;
-                if (f->frame_thread.pass) {
-                    pal_idx = ts->frame_thread.pal_idx;
-                    ts->frame_thread.pal_idx += cbw4 * cbh4 * 16;
-                } else {
-                    pal_idx = &t->scratch.pal_idx[bw4 * bh4 * 16];
-                }
-                const uint16_t *const pal_u = f->frame_thread.pass ?
-                    f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
-                                        ((t->bx >> 1) + (t->by & 1))][1] : t->pal[1];
-                f->dsp->ipred.pal_pred(((pixel *) f->cur.p.data[1]) + uv_dstoff,
-                                       f->cur.p.stride[1], pal_u,
-                                       pal_idx, cbw4 * 4, cbh4 * 4);
-                const uint16_t *const pal_v = f->frame_thread.pass ?
-                    f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
-                                        ((t->bx >> 1) + (t->by & 1))][2] : t->pal[2];
-                f->dsp->ipred.pal_pred(((pixel *) f->cur.p.data[2]) + uv_dstoff,
-                                       f->cur.p.stride[1], pal_v,
-                                       pal_idx, cbw4 * 4, cbh4 * 4);
-                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
-                    hex_dump(((pixel *) f->cur.p.data[1]) + uv_dstoff,
-                             PXSTRIDE(f->cur.p.stride[1]),
-                             cbw4 * 4, cbh4 * 4, "u-pal-pred");
-                    hex_dump(((pixel *) f->cur.p.data[2]) + uv_dstoff,
-                             PXSTRIDE(f->cur.p.stride[1]),
-                             cbw4 * 4, cbh4 * 4, "v-pal-pred");
-                }
-            }
-
-            const int sm_uv_fl = sm_uv_flag(t->a, cbx4) |
-                                 sm_uv_flag(&t->l, cby4);
-            const int uv_sb_has_tr =
-                ((init_x + 16) >> ss_hor) < cw4 ? 1 : init_y ? 0 :
-                intra_edge_flags & (EDGE_I420_TOP_HAS_RIGHT >> (f->cur.p.p.layout - 1));
-            const int uv_sb_has_bl =
-                init_x ? 0 : ((init_y + 16) >> ss_ver) < ch4 ? 1 :
-                intra_edge_flags & (EDGE_I420_LEFT_HAS_BOTTOM >> (f->cur.p.p.layout - 1));
-            const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);
-            const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor);
-            for (int pl = 0; pl < 2; pl++) {
-                for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
-                     y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
-                {
-                    pixel *dst = ((pixel *) f->cur.p.data[1 + pl]) +
-                                   4 * ((t->by >> ss_ver) * PXSTRIDE(stride) +
-                                        ((t->bx + init_x) >> ss_hor));
-                    for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
-                         x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
-                    {
-                        if ((b->uv_mode == CFL_PRED && b->cfl_alpha[pl]) ||
-                            b->pal_sz[1])
-                        {
-                            goto skip_uv_pred;
-                        }
-
-                        int angle = b->uv_angle;
-                        // this probably looks weird because we're using
-                        // luma flags in a chroma loop, but that's because
-                        // prepare_intra_edges() expects luma flags as input
-                        const enum EdgeFlags edge_flags =
-                            (((y > (init_y >> ss_ver) || !uv_sb_has_tr) &&
-                              (x + uv_t_dim->w >= sub_cw4)) ?
-                                 0 : EDGE_I444_TOP_HAS_RIGHT) |
-                            ((x > (init_x >> ss_hor) ||
-                              (!uv_sb_has_bl && y + uv_t_dim->h >= sub_ch4)) ?
-                                 0 : EDGE_I444_LEFT_HAS_BOTTOM);
-                        const pixel *top_sb_edge = NULL;
-                        if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {
-                            top_sb_edge = f->ipred_edge[1 + pl];
-                            const int sby = t->by >> f->sb_shift;
-                            top_sb_edge += f->sb128w * 128 * (sby - 1);
-                        }
-                        const enum IntraPredMode uv_mode =
-                             b->uv_mode == CFL_PRED ? DC_PRED : b->uv_mode;
-                        const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;
-                        const int xstart = ts->tiling.col_start >> ss_hor;
-                        const int ystart = ts->tiling.row_start >> ss_ver;
-                        const enum IntraPredMode m =
-                            bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
-                                                              ypos, ypos > ystart,
-                                                              ts->tiling.col_end >> ss_hor,
-                                                              ts->tiling.row_end >> ss_ver,
-                                                              edge_flags, dst, stride,
-                                                              top_sb_edge, uv_mode,
-                                                              &angle, uv_t_dim->w,
-                                                              uv_t_dim->h, edge);
-                        dsp->ipred.intra_pred[m](dst, stride, edge,
-                                                 uv_t_dim->w * 4,
-                                                 uv_t_dim->h * 4,
-                                                 angle | sm_uv_fl);
-                        if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
-                            hex_dump(edge - uv_t_dim->h * 4, uv_t_dim->h * 4,
-                                     uv_t_dim->h * 4, 2, "l");
-                            hex_dump(edge, 0, 1, 1, "tl");
-                            hex_dump(edge + 1, uv_t_dim->w * 4,
-                                     uv_t_dim->w * 4, 2, "t");
-                            hex_dump(dst, stride, uv_t_dim->w * 4,
-                                     uv_t_dim->h * 4, pl ? "v-intra-pred" : "u-intra-pred");
-                        }
-
-                    skip_uv_pred: {}
-                        if (!b->skip) {
-                            enum TxfmType txtp;
-                            int eob;
-                            coef *cf;
-                            if (f->frame_thread.pass) {
-                                cf = ts->frame_thread.cf;
-                                ts->frame_thread.cf += uv_t_dim->w * uv_t_dim->h * 16;
-                                const struct CodedBlockInfo *const cbi =
-                                    &f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
-                                eob = cbi->eob[pl + 1];
-                                txtp = cbi->txtp[pl + 1];
-                            } else {
-                                uint8_t cf_ctx;
-                                cf = t->cf;
-                                eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
-                                                   &t->l.ccoef[pl][cby4 + y],
-                                                   b->uvtx, bs, b, 1, 1 + pl, cf,
-                                                   &txtp, &cf_ctx);
-                                if (DEBUG_BLOCK_INFO)
-                                    printf("Post-uv-cf-blk[pl=%d,tx=%d,"
-                                           "txtp=%d,eob=%d]: r=%d [x=%d,cbx4=%d]\n",
-                                           pl, b->uvtx, txtp, eob, ts->msac.rng, x, cbx4);
-                                memset(&t->a->ccoef[pl][cbx4 + x], cf_ctx,
-                                       imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor));
-                                memset(&t->l.ccoef[pl][cby4 + y], cf_ctx,
-                                       imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver));
-                            }
-                            if (eob >= 0) {
-                                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
-                                    coef_dump(cf, uv_t_dim->h * 4,
-                                              uv_t_dim->w * 4, 3, "dq");
-                                dsp->itx.itxfm_add[b->uvtx]
-                                                  [txtp](dst, stride,
-                                                         cf, eob);
-                                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
-                                    hex_dump(dst, stride, uv_t_dim->w * 4,
-                                             uv_t_dim->h * 4, "recon");
-                            }
-                        } else if (!f->frame_thread.pass) {
-                            memset(&t->a->ccoef[pl][cbx4 + x], 0x40, uv_t_dim->w);
-                            memset(&t->l.ccoef[pl][cby4 + y], 0x40, uv_t_dim->h);
-                        }
-                        dst += uv_t_dim->w * 4;
-                    }
-                    t->bx -= x << ss_hor;
-                }
-                t->by -= y << ss_ver;
-            }
-        }
-    }
-}
-
-void bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize bs,
-                                 const Av1Block *const b)
-{
-    Dav1dTileState *const ts = t->ts;
-    const Dav1dFrameContext *const f = t->f;
-    const Dav1dDSPContext *const dsp = f->dsp;
-    const int bx4 = t->bx & 31, by4 = t->by & 31;
-    const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
-    const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
-    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
-    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
-    const int bw4 = b_dim[0], bh4 = b_dim[1];
-    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
-    const int has_chroma = f->seq_hdr.layout != DAV1D_PIXEL_LAYOUT_I400 &&
-                           (bw4 > ss_hor || t->bx & 1) &&
-                           (bh4 > ss_ver || t->by & 1);
-    const int chr_layout_idx = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I400 ? 0 :
-                               DAV1D_PIXEL_LAYOUT_I444 - f->cur.p.p.layout;
-
-    // prediction
-    const int cbh4 = (bh4 + ss_ver) >> ss_ver, cbw4 = (bw4 + ss_hor) >> ss_hor;
-    pixel *dst = ((pixel *) f->cur.p.data[0]) +
-        4 * (t->by * PXSTRIDE(f->cur.p.stride[0]) + t->bx);
-    const ptrdiff_t uvdstoff =
-        4 * ((t->bx >> ss_hor) + (t->by >> ss_ver) * PXSTRIDE(f->cur.p.stride[1]));
-    if (!(f->frame_hdr.frame_type & 1)) {
-        // intrabc
-        mc(t, dst, NULL, f->cur.p.stride[0],
-           bw4, bh4, t->bx, t->by, 0, b->mv[0], &f->cur, FILTER_2D_BILINEAR);
-        if (has_chroma) for (int pl = 1; pl < 3; pl++)
-            mc(t, ((pixel *) f->cur.p.data[pl]) + uvdstoff, NULL, f->cur.p.stride[1],
-               bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
-               t->bx & ~ss_hor, t->by & ~ss_ver,
-               pl, b->mv[0], &f->cur, FILTER_2D_BILINEAR);
-    } else if (b->comp_type == COMP_INTER_NONE) {
-        const Dav1dThreadPicture *const refp = &f->refp[b->ref[0]];
-        const enum Filter2d filter_2d = b->filter2d;
-
-        if (imin(bw4, bh4) > 1 && !f->frame_hdr.force_integer_mv &&
-            ((b->inter_mode == GLOBALMV &&
-              f->frame_hdr.gmv[b->ref[0]].type > WM_TYPE_TRANSLATION) ||
-             (b->motion_mode == MM_WARP &&
-              t->warpmv.type > WM_TYPE_TRANSLATION)))
-        {
-            warp_affine(t, dst, NULL, f->cur.p.stride[0], b_dim, 0, refp,
-                        b->motion_mode == MM_WARP ? &t->warpmv :
-                            &f->frame_hdr.gmv[b->ref[0]]);
-        } else {
-            mc(t, dst, NULL, f->cur.p.stride[0],
-               bw4, bh4, t->bx, t->by, 0, b->mv[0], refp, filter_2d);
-            if (b->motion_mode == MM_OBMC)
-                obmc(t, dst, f->cur.p.stride[0], b_dim, 0, bx4, by4, w4, h4);
-        }
-        if (b->interintra_type) {
-            ALIGN_STK_32(pixel, tl_edge_buf, 65,);
-            pixel *const tl_edge = tl_edge_buf + 32;
-            enum IntraPredMode m = b->interintra_mode == II_SMOOTH_PRED ?
-                                   SMOOTH_PRED : b->interintra_mode;
-            pixel *const tmp = t->scratch.interintra;
-            int angle = 0;
-            const pixel *top_sb_edge = NULL;
-            if (!(t->by & (f->sb_step - 1))) {
-                top_sb_edge = f->ipred_edge[0];
-                const int sby = t->by >> f->sb_shift;
-                top_sb_edge += f->sb128w * 128 * (sby - 1);
-            }
-            m = bytefn(dav1d_prepare_intra_edges)(t->bx, t->bx > ts->tiling.col_start,
-                                                  t->by, t->by > ts->tiling.row_start,
-                                                  ts->tiling.col_end, ts->tiling.row_end,
-                                                  0, dst, f->cur.p.stride[0], top_sb_edge,
-                                                  m, &angle, bw4, bh4, tl_edge);
-            dsp->ipred.intra_pred[m](tmp, 4 * bw4 * sizeof(pixel),
-                                     tl_edge, bw4 * 4, bh4 * 4, 0);
-            const uint8_t *const ii_mask =
-                b->interintra_type == INTER_INTRA_BLEND ?
-                     dav1d_ii_masks[bs][0][b->interintra_mode] :
-                     dav1d_wedge_masks[bs][0][0][b->wedge_idx];
-            dsp->mc.blend(dst, f->cur.p.stride[0], tmp, bw4 * 4 * sizeof(pixel),
-                          bw4 * 4, bh4 * 4, ii_mask, bw4 * 4);
-        }
-
-        if (!has_chroma) goto skip_inter_chroma_pred;
-
-        // sub8x8 derivation
-        int is_sub8x8 = bw4 == ss_hor || bh4 == ss_ver;
-        refmvs *r;
-        if (is_sub8x8) {
-            assert(ss_hor == 1);
-            r = &f->mvs[t->by * f->b4_stride + t->bx];
-            if (bw4 == 1) is_sub8x8 &= r[-1].ref[0] > 0;
-            if (bh4 == ss_ver) is_sub8x8 &= r[-f->b4_stride].ref[0] > 0;
-            if (bw4 == 1 && bh4 == ss_ver)
-                is_sub8x8 &= r[-(1 + f->b4_stride)].ref[0] > 0;
-        }
-
-        // chroma prediction
-        if (is_sub8x8) {
-            assert(ss_hor == 1);
-            int h_off = 0, v_off = 0;
-            if (bw4 == 1 && bh4 == ss_ver) {
-                for (int pl = 0; pl < 2; pl++)
-                    mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff,
-                       NULL, f->cur.p.stride[1],
-                       bw4, bh4, t->bx - 1, t->by - 1, 1 + pl,
-                       r[-(f->b4_stride + 1)].mv[0],
-                       &f->refp[r[-(f->b4_stride + 1)].ref[0] - 1],
-                       f->frame_thread.pass != 2 ? t->tl_4x4_filter :
-                           f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx - 1].filter2d);
-                v_off = 2 * PXSTRIDE(f->cur.p.stride[1]);
-                h_off = 2;
-            }
-            if (bw4 == 1) {
-                const enum Filter2d left_filter_2d =
-                    dav1d_filter_2d[t->l.filter[1][by4]][t->l.filter[0][by4]];
-                for (int pl = 0; pl < 2; pl++)
-                    mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff + v_off, NULL,
-                       f->cur.p.stride[1], bw4, bh4, t->bx - 1,
-                       t->by, 1 + pl, r[-1].mv[0], &f->refp[r[-1].ref[0] - 1],
-                       f->frame_thread.pass != 2 ? left_filter_2d :
-                           f->frame_thread.b[(t->by * f->b4_stride) + t->bx - 1].filter2d);
-                h_off = 2;
-            }
-            if (bh4 == ss_ver) {
-                const enum Filter2d top_filter_2d =
-                    dav1d_filter_2d[t->a->filter[1][bx4]][t->a->filter[0][bx4]];
-                for (int pl = 0; pl < 2; pl++)
-                    mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff + h_off, NULL,
-                       f->cur.p.stride[1], bw4, bh4, t->bx, t->by - 1,
-                       1 + pl, r[-f->b4_stride].mv[0],
-                       &f->refp[r[-f->b4_stride].ref[0] - 1],
-                       f->frame_thread.pass != 2 ? top_filter_2d :
-                           f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx].filter2d);
-                v_off = 2 * PXSTRIDE(f->cur.p.stride[1]);
-            }
-            for (int pl = 0; pl < 2; pl++)
-                mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff + h_off + v_off, NULL, f->cur.p.stride[1],
-                   bw4, bh4, t->bx, t->by, 1 + pl, b->mv[0], refp, filter_2d);
-        } else {
-            if (imin(cbw4, cbh4) > 1 && !f->frame_hdr.force_integer_mv &&
-                ((b->inter_mode == GLOBALMV &&
-                  f->frame_hdr.gmv[b->ref[0]].type > WM_TYPE_TRANSLATION) ||
-                 (b->motion_mode == MM_WARP &&
-                  t->warpmv.type > WM_TYPE_TRANSLATION)))
-            {
-                for (int pl = 0; pl < 2; pl++)
-                    warp_affine(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff, NULL,
-                                f->cur.p.stride[1], b_dim, 1 + pl, refp,
-                                b->motion_mode == MM_WARP ? &t->warpmv :
-                                    &f->frame_hdr.gmv[b->ref[0]]);
-            } else {
-                for (int pl = 0; pl < 2; pl++) {
-                    mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff,
-                       NULL, f->cur.p.stride[1],
-                       bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
-                       t->bx & ~ss_hor, t->by & ~ss_ver,
-                       1 + pl, b->mv[0], refp, filter_2d);
-                    if (b->motion_mode == MM_OBMC)
-                        obmc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff,
-                             f->cur.p.stride[1], b_dim, 1 + pl, bx4, by4, w4, h4);
-                }
-            }
-            if (b->interintra_type) {
-                // FIXME for 8x32 with 4:2:2 subsampling, this probably does
-                // the wrong thing since it will select 4x16, not 4x32, as a
-                // transform size...
-                const uint8_t *const ii_mask =
-                    b->interintra_type == INTER_INTRA_BLEND ?
-                         dav1d_ii_masks[bs][chr_layout_idx][b->interintra_mode] :
-                         dav1d_wedge_masks[bs][chr_layout_idx][0][b->wedge_idx];
-
-                for (int pl = 0; pl < 2; pl++) {
-                    pixel *const tmp = t->scratch.interintra;
-                    pixel tl_edge_px[65], *const tl_edge = &tl_edge_px[32];
-                    enum IntraPredMode m =
-                        b->interintra_mode == II_SMOOTH_PRED ?
-                        SMOOTH_PRED : b->interintra_mode;
-                    int angle = 0;
-                    pixel *const uvdst = ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff;
-                    const pixel *top_sb_edge = NULL;
-                    if (!(t->by & (f->sb_step - 1))) {
-                        top_sb_edge = f->ipred_edge[pl + 1];
-                        const int sby = t->by >> f->sb_shift;
-                        top_sb_edge += f->sb128w * 128 * (sby - 1);
-                    }
-                    m = bytefn(dav1d_prepare_intra_edges)(t->bx >> ss_hor,
-                                                          (t->bx >> ss_hor) >
-                                                              (ts->tiling.col_start >> ss_hor),
-                                                          t->by >> ss_ver,
-                                                          (t->by >> ss_ver) >
-                                                              (ts->tiling.row_start >> ss_ver),
-                                                          ts->tiling.col_end >> ss_hor,
-                                                          ts->tiling.row_end >> ss_ver,
-                                                          0, uvdst, f->cur.p.stride[1],
-                                                          top_sb_edge, m,
-                                                          &angle, cbw4, cbh4, tl_edge);
-                    dsp->ipred.intra_pred[m](tmp, cbw4 * 4 * sizeof(pixel),
-                                             tl_edge, cbw4 * 4, cbh4 * 4, 0);
-                    dsp->mc.blend(uvdst, f->cur.p.stride[1], tmp, cbw4 * 4 * sizeof(pixel),
-                                  cbw4 * 4, cbh4 * 4, ii_mask, cbw4 * 4);
-                }
-            }
-        }
-
-    skip_inter_chroma_pred: {}
-        t->tl_4x4_filter = filter_2d;
-    } else {
-        const enum Filter2d filter_2d = b->filter2d;
-        // Maximum super block size is 128x128
-        coef (*tmp)[128 * 128] = (coef (*)[128 * 128]) t->scratch.compinter;
-        int jnt_weight;
-        uint8_t *const seg_mask = t->scratch_seg_mask;
-        const uint8_t *mask;
-
-        for (int i = 0; i < 2; i++) {
-            const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
-
-            if (b->inter_mode == GLOBALMV_GLOBALMV && !f->frame_hdr.force_integer_mv &&
-                f->frame_hdr.gmv[b->ref[i]].type > WM_TYPE_TRANSLATION)
-            {
-                warp_affine(t, NULL, tmp[i], bw4 * 4, b_dim, 0, refp,
-                            &f->frame_hdr.gmv[b->ref[i]]);
-            } else {
-                mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by, 0,
-                   b->mv[i], refp, filter_2d);
-            }
-        }
-        switch (b->comp_type) {
-        case COMP_INTER_AVG:
-            dsp->mc.avg(dst, f->cur.p.stride[0], tmp[0], tmp[1],
-                        bw4 * 4, bh4 * 4);
-            break;
-        case COMP_INTER_WEIGHTED_AVG:
-            jnt_weight = f->jnt_weights[b->ref[0]][b->ref[1]];
-            dsp->mc.w_avg(dst, f->cur.p.stride[0], tmp[0], tmp[1],
-                          bw4 * 4, bh4 * 4, jnt_weight);
-            break;
-        case COMP_INTER_SEG:
-            dsp->mc.w_mask[chr_layout_idx](dst, f->cur.p.stride[0],
-                                           tmp[b->mask_sign], tmp[!b->mask_sign],
-                                           bw4 * 4, bh4 * 4, seg_mask, b->mask_sign);
-            mask = seg_mask;
-            break;
-        case COMP_INTER_WEDGE:
-            mask = dav1d_wedge_masks[bs][0][0][b->wedge_idx];
-            dsp->mc.mask(dst, f->cur.p.stride[0],
-                         tmp[b->mask_sign], tmp[!b->mask_sign],
-                         bw4 * 4, bh4 * 4, mask);
-            if (has_chroma)
-                mask = dav1d_wedge_masks[bs][chr_layout_idx][b->mask_sign][b->wedge_idx];
-            break;
-        }
-
-        // chroma
-        if (has_chroma) for (int pl = 0; pl < 2; pl++) {
-            for (int i = 0; i < 2; i++) {
-                const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
-                if (b->inter_mode == GLOBALMV_GLOBALMV &&
-                    imin(cbw4, cbh4) > 1 && !f->frame_hdr.force_integer_mv &&
-                    f->frame_hdr.gmv[b->ref[i]].type > WM_TYPE_TRANSLATION)
-                {
-                    warp_affine(t, NULL, tmp[i], bw4 * 2, b_dim, 1 + pl,
-                                refp, &f->frame_hdr.gmv[b->ref[i]]);
-                } else {
-                    mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by,
-                       1 + pl, b->mv[i], refp, filter_2d);
-                }
-            }
-            pixel *const uvdst = ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff;
-            switch (b->comp_type) {
-            case COMP_INTER_AVG:
-                dsp->mc.avg(uvdst, f->cur.p.stride[1], tmp[0], tmp[1],
-                            bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver);
-                break;
-            case COMP_INTER_WEIGHTED_AVG:
-                dsp->mc.w_avg(uvdst, f->cur.p.stride[1], tmp[0], tmp[1],
-                              bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, jnt_weight);
-                break;
-            case COMP_INTER_WEDGE:
-            case COMP_INTER_SEG:
-                dsp->mc.mask(uvdst, f->cur.p.stride[1],
-                             tmp[b->mask_sign], tmp[!b->mask_sign],
-                             bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, mask);
-                break;
-            }
-        }
-    }
-
-    if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
-        hex_dump(dst, f->cur.p.stride[0], b_dim[0] * 4, b_dim[1] * 4, "y-pred");
-        if (has_chroma) {
-            hex_dump(&((pixel *) f->cur.p.data[1])[uvdstoff], f->cur.p.stride[1],
-                     cbw4 * 4, cbh4 * 4, "u-pred");
-            hex_dump(&((pixel *) f->cur.p.data[2])[uvdstoff], f->cur.p.stride[1],
-                     cbw4 * 4, cbh4 * 4, "v-pred");
-        }
-    }
-
-    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
-
-    if (b->skip) {
-        // reset coef contexts
-        memset(&t->a->lcoef[bx4], 0x40, w4);
-        memset(&t->l.lcoef[by4], 0x40, h4);
-        if (has_chroma) {
-            memset(&t->a->ccoef[0][cbx4], 0x40, cw4);
-            memset(&t->l.ccoef[0][cby4], 0x40, ch4);
-            memset(&t->a->ccoef[1][cbx4], 0x40, cw4);
-            memset(&t->l.ccoef[1][cby4], 0x40, ch4);
-        }
-        return;
-    }
-
-    const TxfmInfo *const uvtx = &dav1d_txfm_dimensions[b->uvtx];
-    const TxfmInfo *const ytx = &dav1d_txfm_dimensions[b->max_ytx];
-
-    for (int init_y = 0; init_y < bh4; init_y += 16) {
-        for (int init_x = 0; init_x < bw4; init_x += 16) {
-            // coefficient coding & inverse transforms
-            int y_off = !!init_y, y;
-            dst += PXSTRIDE(f->cur.p.stride[0]) * 4 * init_y;
-            for (y = init_y, t->by += init_y; y < imin(h4, init_y + 16);
-                 y += ytx->h, y_off++)
-            {
-                int x, x_off = !!init_x;
-                for (x = init_x, t->bx += init_x; x < imin(w4, init_x + 16);
-                     x += ytx->w, x_off++)
-                {
-                    read_coef_tree(t, bs, b, b->max_ytx, 0, b->tx_split,
-                                   x_off, y_off, &dst[x * 4]);
-                    t->bx += ytx->w;
-                }
-                dst += PXSTRIDE(f->cur.p.stride[0]) * 4 * ytx->h;
-                t->bx -= x;
-                t->by += ytx->h;
-            }
-            dst -= PXSTRIDE(f->cur.p.stride[0]) * 4 * y;
-            t->by -= y;
-
-            // chroma coefs and inverse transform
-            if (has_chroma) for (int pl = 0; pl < 2; pl++) {
-                pixel *uvdst = ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff +
-                    (PXSTRIDE(f->cur.p.stride[1]) * init_y * 4 >> ss_ver);
-                for (y = init_y >> ss_ver, t->by += init_y;
-                     y < imin(ch4, (init_y + 16) >> ss_ver); y += uvtx->h)
-                {
-                    int x;
-                    for (x = init_x >> ss_hor, t->bx += init_x;
-                         x < imin(cw4, (init_x + 16) >> ss_hor); x += uvtx->w)
-                    {
-                        coef *cf;
-                        int eob;
-                        enum TxfmType txtp;
-                        if (f->frame_thread.pass) {
-                            cf = ts->frame_thread.cf;
-                            ts->frame_thread.cf += uvtx->w * uvtx->h * 16;
-                            const struct CodedBlockInfo *const cbi =
-                                &f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
-                            eob = cbi->eob[1 + pl];
-                            txtp = cbi->txtp[1 + pl];
-                        } else {
-                            uint8_t cf_ctx;
-                            cf = t->cf;
-                            txtp = t->txtp_map[(by4 + (y << ss_ver)) * 32 +
-                                                bx4 + (x << ss_hor)];
-                            eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
-                                               &t->l.ccoef[pl][cby4 + y],
-                                               b->uvtx, bs, b, 0, 1 + pl,
-                                               cf, &txtp, &cf_ctx);
-                            if (DEBUG_BLOCK_INFO)
-                                printf("Post-uv-cf-blk[pl=%d,tx=%d,"
-                                       "txtp=%d,eob=%d]: r=%d\n",
-                                       pl, b->uvtx, txtp, eob, ts->msac.rng);
-                            memset(&t->a->ccoef[pl][cbx4 + x], cf_ctx,
-                                   imin(uvtx->w, (f->bw - t->bx + ss_hor) >> ss_hor));
-                            memset(&t->l.ccoef[pl][cby4 + y], cf_ctx,
-                                   imin(uvtx->h, (f->bh - t->by + ss_ver) >> ss_ver));
-                        }
-                        if (eob >= 0) {
-                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
-                                coef_dump(cf, uvtx->h * 4, uvtx->w * 4, 3, "dq");
-                            dsp->itx.itxfm_add[b->uvtx]
-                                              [txtp](&uvdst[4 * x],
-                                                     f->cur.p.stride[1],
-                                                     cf, eob);
-                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
-                                hex_dump(&uvdst[4 * x], f->cur.p.stride[1],
-                                         uvtx->w * 4, uvtx->h * 4, "recon");
-                        }
-                        t->bx += uvtx->w << ss_hor;
-                    }
-                    uvdst += PXSTRIDE(f->cur.p.stride[1]) * 4 * uvtx->h;
-                    t->bx -= x << ss_hor;
-                    t->by += uvtx->h << ss_ver;
-                }
-                t->by -= y << ss_ver;
-            }
-        }
-    }
-}
-
-void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
-    const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
-    const int sbsz = f->sb_step, sbh = f->sbh;
-
-    if (f->frame_hdr.loopfilter.level_y[0] ||
-        f->frame_hdr.loopfilter.level_y[1])
-    {
-        int start_of_tile_row = 0;
-        if (f->frame_hdr.tiling.row_start_sb[f->lf.tile_row] == sby)
-            start_of_tile_row = f->lf.tile_row++;
-        bytefn(dav1d_loopfilter_sbrow)(f, f->lf.p, f->lf.mask_ptr, sby,
-                                       start_of_tile_row);
-    }
-
-    if (f->seq_hdr.restoration) {
-        // Store loop filtered pixels required by loop restoration
-        bytefn(dav1d_lr_copy_lpf)(f, f->lf.p, sby);
-    }
-    if (f->seq_hdr.cdef) {
-        if (sby) {
-            pixel *p_up[3] = {
-                f->lf.p[0] - 8 * PXSTRIDE(f->cur.p.stride[0]),
-                f->lf.p[1] - (8 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver),
-                f->lf.p[2] - (8 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver),
-            };
-            bytefn(dav1d_cdef_brow)(f, p_up, f->lf.prev_mask_ptr,
-                                    sby * sbsz - 2, sby * sbsz);
-        }
-        const int n_blks = sbsz - 2 * (sby + 1 < sbh);
-        bytefn(dav1d_cdef_brow)(f, f->lf.p, f->lf.mask_ptr, sby * sbsz,
-                                imin(sby * sbsz + n_blks, f->bh));
-    }
-    if (f->seq_hdr.restoration) {
-        bytefn(dav1d_lr_sbrow)(f, f->lf.p, sby);
-    }
-
-    f->lf.p[0] += sbsz * 4 * PXSTRIDE(f->cur.p.stride[0]);
-    f->lf.p[1] += sbsz * 4 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver;
-    f->lf.p[2] += sbsz * 4 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver;
-    f->lf.prev_mask_ptr = f->lf.mask_ptr;
-    if ((sby & 1) || f->seq_hdr.sb128) {
-        f->lf.mask_ptr += f->sb128w;
-    }
-}
-
-void bytefn(dav1d_backup_ipred_edge)(Dav1dTileContext *const t) {
-    const Dav1dFrameContext *const f = t->f;
-    Dav1dTileState *const ts = t->ts;
-    const int sby = t->by >> f->sb_shift;
-    const int sby_off = f->sb128w * 128 * sby;
-    const int x_off = ts->tiling.col_start;
-
-    const pixel *const y =
-        ((const pixel *) f->cur.p.data[0]) + x_off * 4 +
-                    ((t->by + f->sb_step) * 4 - 1) * PXSTRIDE(f->cur.p.stride[0]);
-    pixel_copy(&f->ipred_edge[0][sby_off + x_off * 4], y,
-               4 * (ts->tiling.col_end - x_off));
-
-    if (f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
-        const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
-        const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
-
-        const ptrdiff_t uv_off = (x_off * 4 >> ss_hor) +
-            (((t->by + f->sb_step) * 4 >> ss_ver) - 1) * PXSTRIDE(f->cur.p.stride[1]);
-        for (int pl = 1; pl <= 2; pl++)
-            pixel_copy(&f->ipred_edge[pl][sby_off + (x_off * 4 >> ss_hor)],
-                       &((const pixel *) f->cur.p.data[pl])[uv_off],
-                       4 * (ts->tiling.col_end - x_off) >> ss_hor);
-    }
-}
--- /dev/null
+++ b/src/recon_tmpl.c
@@ -1,0 +1,1518 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <string.h>
+#include <stdio.h>
+
+#include "common/attributes.h"
+#include "common/bitdepth.h"
+#include "common/dump.h"
+#include "common/intops.h"
+#include "common/mem.h"
+
+#include "src/cdef_apply.h"
+#include "src/ipred_prepare.h"
+#include "src/lf_apply.h"
+#include "src/lr_apply.h"
+#include "src/recon.h"
+#include "src/scan.h"
+#include "src/tables.h"
+#include "src/wedge.h"
+
+static unsigned read_golomb(MsacContext *const msac) {
+    // Prefix: count 0 bits until a 1 bit is decoded or 32 zeros were counted.
+    int n_bits = 0;
+    while (!msac_decode_bool(msac, 128 << 7) && n_bits < 32) n_bits++;
+    // Suffix: n_bits further bits are shifted in below an implicit leading 1.
+    unsigned v = 1;
+    for (; n_bits > 0; n_bits--) v = (v << 1) | msac_decode_bool(msac, 128 << 7);
+    return v - 1;
+}
+
+static int decode_coefs(Dav1dTileContext *const t,
+                        uint8_t *const a, uint8_t *const l, // context bytes read by the skip/dc-sign ctx helpers
+                        const enum RectTxfmSize tx, const enum BlockSize bs,
+                        const Av1Block *const b, const int intra,
+                        const int plane, coef *cf, // cf: coefficient buffer, indexed by scan[] values
+                        enum TxfmType *const txtp, uint8_t *res_ctx) // txtp: in/out, res_ctx: out
+{ // Returns -1 when the block is signalled as all-zero, otherwise the decoded eob.
+    Dav1dTileState *const ts = t->ts;
+    const int chroma = !!plane;
+    const Dav1dFrameContext *const f = t->f;
+    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
+    const int dbg = DEBUG_BLOCK_INFO && plane && 0; // trailing "&& 0" keeps the printf tracing below disabled
+
+    if (dbg) printf("Start: r=%d\n", ts->msac.rng);
+
+    // all-zero flag: when set, nothing else is decoded for this transform block
+    const int sctx = get_coef_skip_ctx(t_dim, bs, a, l, chroma, f->cur.p.p.layout);
+    const int all_skip =
+        msac_decode_bool_adapt(&ts->msac, ts->cdf.coef.skip[t_dim->ctx][sctx]);
+    if (dbg)
+    printf("Post-non-zero[%d][%d][%d]: r=%d\n",
+           t_dim->ctx, sctx, all_skip, ts->msac.rng);
+    if (all_skip) {
+        *res_ctx = 0x40; // same layout as the final res_ctx below: level 0, dc_sign 1 (1 << 6)
+        *txtp = f->frame_hdr.segmentation.lossless[b->seg_id] ? WHT_WHT :
+                                                                DCT_DCT;
+        return -1;
+    }
+
+    // transform type (chroma: derived; luma: decoded unless the set holds a single type)
+    if (chroma) {
+        if (intra) {
+            *txtp = get_uv_intra_txtp(b->uv_mode, tx, &f->frame_hdr, b->seg_id);
+        } else {
+            const enum TxfmType y_txtp = *txtp; // inter chroma: *txtp is read as the luma type on entry
+            *txtp = get_uv_inter_txtp(t_dim, y_txtp, &f->frame_hdr, b->seg_id);
+        }
+    } else {
+        const enum TxfmTypeSet set = get_ext_txtp_set(tx, !intra,
+                                                      &f->frame_hdr, b->seg_id);
+        const unsigned set_cnt = dav1d_tx_type_count[set];
+        unsigned idx;
+        if (set_cnt == 1) { // single candidate: no symbol is read
+            idx = 0;
+        } else {
+            const int set_idx = dav1d_tx_type_set_index[!intra][set];
+            const enum IntraPredMode y_mode_nofilt = b->y_mode == FILTER_PRED ?
+                dav1d_filter_mode_to_y_mode[b->y_angle] : b->y_mode;
+            uint16_t *const txtp_cdf = intra ?
+                       ts->cdf.m.txtp_intra[set_idx][t_dim->min][y_mode_nofilt] :
+                       ts->cdf.m.txtp_inter[set_idx][t_dim->min];
+            idx = msac_decode_symbol_adapt(&ts->msac, txtp_cdf, set_cnt);
+            if (dbg)
+            printf("Post-txtp[%d->%d][%d->%d][%d][%d->%d]: r=%d\n",
+                   set, set_idx, tx, t_dim->min, b->intra ? (int)y_mode_nofilt : -1,
+                   idx, dav1d_tx_types_per_set[set][idx], ts->msac.rng);
+        }
+        *txtp = dav1d_tx_types_per_set[set][idx];
+    }
+
+    // find end-of-block (eob): a size-dependent eob_bin symbol first, then refinement bits
+    int eob_bin;
+    const int tx2dszctx = imin(t_dim->lw, TX_32X32) + imin(t_dim->lh, TX_32X32);
+    const enum TxClass tx_class = dav1d_tx_type_class[*txtp];
+    const int is_1d = tx_class != TX_CLASS_2D;
+    switch (tx2dszctx) { // NOTE(review): cases cover 0..6 with no default; eob_bin stays unset otherwise
+#define case_sz(sz, bin) \
+    case sz: { \
+        uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_##bin[chroma][is_1d]; \
+        eob_bin = msac_decode_symbol_adapt(&ts->msac, eob_bin_cdf, 5 + sz); \
+        break; \
+    }
+    case_sz(0,   16);
+    case_sz(1,   32);
+    case_sz(2,   64);
+    case_sz(3,  128);
+    case_sz(4,  256);
+    case_sz(5,  512);
+    case_sz(6, 1024);
+#undef case_sz
+    }
+    if (dbg)
+    printf("Post-eob_bin_%d[%d][%d][%d]: r=%d\n",
+           16 << tx2dszctx, chroma, is_1d, eob_bin, ts->msac.rng);
+    int eob;
+    if (eob_bin > 1) { // eob starts at 1 << (eob_bin - 1); the lower bits are decoded below
+        eob = 1 << (eob_bin - 1);
+        uint16_t *const eob_hi_bit_cdf =
+            ts->cdf.coef.eob_hi_bit[t_dim->ctx][chroma][eob_bin];
+        const int eob_hi_bit = msac_decode_bool_adapt(&ts->msac, eob_hi_bit_cdf);
+        if (dbg)
+        printf("Post-eob_hi_bit[%d][%d][%d][%d]: r=%d\n",
+               t_dim->ctx, chroma, eob_bin, eob_hi_bit, ts->msac.rng);
+        unsigned mask = eob >> 1; // the next-lower bit is the adaptively coded eob_hi_bit
+        if (eob_hi_bit) eob |= mask;
+        for (mask >>= 1; mask; mask >>= 1) { // remaining bits: one msac_decode_bool each
+            const int eob_bit = msac_decode_bool(&ts->msac, 128 << 7);
+            if (eob_bit) eob |= mask;
+        }
+        if (dbg)
+        printf("Post-eob[%d]: r=%d\n", eob, ts->msac.rng);
+    } else {
+        eob = eob_bin;
+    }
+
+    // base tokens, decoded from the eob position back down to position 0
+    uint16_t (*const br_cdf)[5] =
+        ts->cdf.coef.br_tok[imin(t_dim->ctx, 3)][chroma];
+    const int16_t *const scan = dav1d_scans[tx][tx_class];
+    uint8_t levels[36 * 36]; // token magnitudes at [x * stride + y], passed to the ctx helpers
+    ptrdiff_t stride = 4 * (imin(t_dim->h, 8) + 1);
+    memset(levels, 0, stride * 4 * (imin(t_dim->w, 8) + 1)); // clears only the part sized for this transform
+    const int shift = 2 + imin(t_dim->lh, 3), mask = 4 * imin(t_dim->h, 8) - 1; // rc -> x (high bits), y (low bits)
+    unsigned cul_level = 0;
+    for (int i = eob, is_last = 1; i >= 0; i--, is_last = 0) { // is_last is set only for i == eob
+        const int rc = scan[i], x = rc >> shift, y = rc & mask;
+
+        // lo tok: the eob position uses its own cdf with one symbol fewer, and 1 is added to the result
+        const int ctx = get_coef_nz_ctx(levels, i, rc, is_last, tx, tx_class);
+        uint16_t *const lo_cdf = is_last ?
+            ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma][ctx] :
+            ts->cdf.coef.base_tok[t_dim->ctx][chroma][ctx];
+        int tok = msac_decode_symbol_adapt(&ts->msac, lo_cdf,
+                                           4 - is_last) + is_last;
+        if (dbg)
+        printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n",
+               t_dim->ctx, chroma, ctx, i, rc, tok, ts->msac.rng);
+        if (!tok) continue; // zero token: neither levels[] nor cf[rc] is written
+
+        // hi tok: keep adding br symbols while each one equals 3 and tok is still below 15
+        if (tok == 3) {
+            const int br_ctx = get_br_ctx(levels, rc, tx, tx_class);
+            do {
+                const int tok_br =
+                    msac_decode_symbol_adapt(&ts->msac, br_cdf[br_ctx], 4);
+                if (dbg)
+                printf("Post-hi_tok[%d][%d][%d][%d=%d=%d->%d]: r=%d\n",
+                       imin(t_dim->ctx, 3), chroma, br_ctx,
+                       i, rc, tok_br, tok, ts->msac.rng);
+                tok += tok_br;
+                if (tok_br < 3) break;
+            } while (tok < 15);
+        }
+
+        levels[x * stride + y] = cf[rc] = tok; // cf holds the raw token until the second pass rewrites it
+    }
+
+    // residual and sign, in forward scan order
+    int dc_sign = 1; // stays 1 when position 0 carries a zero token
+    const uint16_t *const dq_tbl = ts->dq[b->seg_id][plane];
+    const uint8_t *const qm_tbl = f->qm[is_1d || *txtp == IDTX][tx][plane];
+    const int dq_shift = imax(0, t_dim->ctx - 2);
+    for (int i = 0; i <= eob; i++) {
+        const int rc = scan[i];
+        int tok = cf[rc]; // NOTE(review): zero-token positions were not written above; presumably cf is zeroed on entry — confirm
+        if (!tok) continue;
+        int dq;
+
+        // sign: position 0 uses an adaptive cdf and dq_tbl[0]; all others a plain bool and dq_tbl[1]
+        int sign;
+        if (i == 0) {
+            const int dc_sign_ctx = get_dc_sign_ctx(t_dim, a, l);
+            uint16_t *const dc_sign_cdf =
+                ts->cdf.coef.dc_sign[chroma][dc_sign_ctx];
+            sign = msac_decode_bool_adapt(&ts->msac, dc_sign_cdf);
+            if (dbg)
+            printf("Post-dc_sign[%d][%d][%d]: r=%d\n",
+                   chroma, dc_sign_ctx, sign, ts->msac.rng);
+            dc_sign = sign ? 0 : 2; // 0 when the sign bit is set, 2 when it is clear
+            dq = (dq_tbl[0] * qm_tbl[0] + 16) >> 5;
+        } else {
+            sign = msac_decode_bool(&ts->msac, 128 << 7);
+            if (dbg)
+            printf("Post-sign[%d=%d=%d]: r=%d\n", i, rc, sign, ts->msac.rng);
+            dq = (dq_tbl[1] * qm_tbl[rc] + 16) >> 5;
+        }
+
+        // residual: a token of 15 is extended by the value read_golomb() returns
+        if (tok == 15) {
+            tok += read_golomb(&ts->msac);
+            if (dbg)
+            printf("Post-residual[%d=%d=%d->%d]: r=%d\n",
+                   i, rc, tok - 15, tok, ts->msac.rng);
+        }
+
+        // dequant
+        cul_level += tok; // accumulated before the dequantizer is applied
+        tok *= dq;
+        tok >>= dq_shift;
+        cf[rc] = sign ? -tok : tok;
+    }
+
+    // context byte for the caller: cumulative level capped at 63 in the low 6 bits, dc_sign above them
+    *res_ctx = imin(cul_level, 63) | (dc_sign << 6);
+
+    return eob;
+}
+
+/*
+ * Recursively walk the luma transform tree of a block and process each leaf
+ * transform block: decode its coefficients and/or add its inverse transform
+ * to dst, depending on f->frame_thread.pass.
+ *
+ * ytx is the transform size at the current node and depth the recursion
+ * depth; a node is split into its t_dim->sub size when depth < 2 and bit
+ * (y_off * 4 + x_off) of tx_split[depth] is set. t->bx/t->by are moved to
+ * each sub-block while recursing and restored before returning.
+ * dst points at the node's top-left luma pixel, or is NULL when the caller
+ * wants no reconstruction (dav1d_read_coef_blocks passes NULL).
+ */
+static void read_coef_tree(Dav1dTileContext *const t,
+                           const enum BlockSize bs, const Av1Block *const b,
+                           const enum RectTxfmSize ytx, const int depth,
+                           const uint16_t *const tx_split,
+                           const int x_off, const int y_off, pixel *dst)
+{
+    const Dav1dFrameContext *const f = t->f;
+    Dav1dTileState *const ts = t->ts;
+    const Dav1dDSPContext *const dsp = f->dsp;
+    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[ytx];
+    const int txw = t_dim->w, txh = t_dim->h;
+
+    if (depth < 2 && tx_split[depth] & (1 << (y_off * 4 + x_off))) {
+        // split node: visit up to four sub-blocks in raster order
+        const enum RectTxfmSize sub = t_dim->sub;
+        const TxfmInfo *const sub_t_dim = &dav1d_txfm_dimensions[sub];
+        const int txsw = sub_t_dim->w, txsh = sub_t_dim->h;
+
+        read_coef_tree(t, bs, b, sub, depth + 1, tx_split,
+                       x_off * 2 + 0, y_off * 2 + 0, dst);
+        t->bx += txsw;
+        // a right neighbour is visited only when the parent is at least as
+        // wide as it is tall and t->bx is still below f->bw
+        if (txw >= txh && t->bx < f->bw)
+            read_coef_tree(t, bs, b, sub, depth + 1, tx_split, x_off * 2 + 1,
+                           y_off * 2 + 0, dst ? &dst[4 * txsw] : NULL);
+        t->bx -= txsw;
+        t->by += txsh;
+        // likewise, a bottom row is visited only when the parent is at least
+        // as tall as it is wide and t->by is still below f->bh
+        if (txh >= txw && t->by < f->bh) {
+            if (dst)
+                dst += 4 * txsh * PXSTRIDE(f->cur.p.stride[0]);
+            read_coef_tree(t, bs, b, sub, depth + 1, tx_split,
+                           x_off * 2 + 0, y_off * 2 + 1, dst);
+            t->bx += txsw;
+            if (txw >= txh && t->bx < f->bw)
+                read_coef_tree(t, bs, b, sub, depth + 1, tx_split, x_off * 2 + 1,
+                               y_off * 2 + 1, dst ? &dst[4 * txsw] : NULL);
+            t->bx -= txsw;
+        }
+        t->by -= txsh;
+    } else {
+        // leaf transform block
+        const int bx4 = t->bx & 31, by4 = t->by & 31;
+        enum TxfmType txtp;
+        uint8_t cf_ctx;
+        int eob;
+        coef *cf;
+        // only assigned (and only used) when f->frame_thread.pass is non-zero
+        struct CodedBlockInfo *cbi;
+
+        if (f->frame_thread.pass) {
+            // frame threading: coefficients live in the per-tile cf buffer,
+            // which is advanced past this block, and eob/txtp are kept in
+            // the per-position cbi array
+            cf = ts->frame_thread.cf;
+            ts->frame_thread.cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
+            cbi = &f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
+        } else {
+            cf = t->cf;
+        }
+        if (f->frame_thread.pass != 2) {
+            // pass 0 or 1: read the coefficients from the bitstream
+            eob = decode_coefs(t, &t->a->lcoef[bx4], &t->l.lcoef[by4],
+                               ytx, bs, b, 0, 0, cf, &txtp, &cf_ctx);
+            if (DEBUG_BLOCK_INFO)
+                printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
+                       ytx, txtp, eob, ts->msac.rng);
+            // store the resulting context above and to the left; the spans
+            // are limited so they do not extend past f->bw / f->bh
+            memset(&t->a->lcoef[bx4], cf_ctx, imin(txw, f->bw - t->bx));
+            memset(&t->l.lcoef[by4], cf_ctx, imin(txh, f->bh - t->by));
+            // remember the transform type for every unit covered by this
+            // block (read back for chroma in dav1d_read_coef_blocks)
+            for (int y = 0; y < txh; y++)
+                memset(&t->txtp_map[(by4 + y) * 32 + bx4], txtp, txw);
+            if (f->frame_thread.pass == 1) {
+                cbi->eob[0] = eob;
+                cbi->txtp[0] = txtp;
+            }
+        } else {
+            // pass 2: reuse what pass 1 stored
+            eob = cbi->eob[0];
+            txtp = cbi->txtp[0];
+        }
+        if (!(f->frame_thread.pass & 1)) {
+            // pass 0 or 2: add the inverse transform to the picture; an eob
+            // below zero means there is nothing to add
+            assert(dst);
+            if (eob >= 0) {
+                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+                    coef_dump(cf, imin(t_dim->h, 8) * 4, imin(t_dim->w, 8) * 4, 3, "dq");
+                dsp->itx.itxfm_add[ytx][txtp](dst, f->cur.p.stride[0], cf, eob);
+                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+                    hex_dump(dst, f->cur.p.stride[0], t_dim->w * 4, t_dim->h * 4, "recon");
+            }
+        }
+    }
+}
+
+/*
+ * Frame-threading pass 1 entry point: read all luma and chroma coefficients
+ * of block b (size bs) at position t->bx/t->by without reconstructing any
+ * pixels.
+ *
+ * Coefficients are written to ts->frame_thread.cf (which is advanced per
+ * transform block), and each transform block's eob and transform type are
+ * stored in f->frame_thread.cbi for later use. The above/left coefficient
+ * contexts in t->a / t->l are updated along the way. For skipped blocks only
+ * the contexts are written (with 0x40) and nothing is read.
+ * t->bx/t->by are modified while iterating and restored before returning.
+ */
+void bytefn(dav1d_read_coef_blocks)(Dav1dTileContext *const t,
+                                    const enum BlockSize bs, const Av1Block *const b)
+{
+    const Dav1dFrameContext *const f = t->f;
+    const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+    const int bx4 = t->bx & 31, by4 = t->by & 31;
+    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
+    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+    const int bw4 = b_dim[0], bh4 = b_dim[1];
+    // Chroma block size: round up only when the plane is subsampled. An
+    // unconditional "+ 1" would make the span one unit too wide when
+    // ss_hor/ss_ver is 0 and write past the block's own context entries.
+    const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
+    const int has_chroma = f->seq_hdr.layout != DAV1D_PIXEL_LAYOUT_I400 &&
+                           (bw4 > ss_hor || t->bx & 1) &&
+                           (bh4 > ss_ver || t->by & 1);
+
+    if (b->skip) {
+        // nothing is coded: just reset the contexts covered by this block
+        memset(&t->a->lcoef[bx4], 0x40, bw4);
+        memset(&t->l.lcoef[by4], 0x40, bh4);
+        if (has_chroma) for (int pl = 0; pl < 2; pl++) {
+            memset(&t->a->ccoef[pl][cbx4], 0x40, cbw4);
+            memset(&t->l.ccoef[pl][cby4], 0x40, cbh4);
+        }
+        return;
+    }
+
+    Dav1dTileState *const ts = t->ts;
+    // block size limited to what lies below f->bw / f->bh
+    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
+    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
+    assert(f->frame_thread.pass == 1);
+    assert(!b->skip);
+    const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
+    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->intra ? b->tx : b->max_ytx];
+
+    // the block is processed in sub-areas of at most 16x16 units; within
+    // each sub-area all luma transform blocks come first, then U, then V
+    for (int init_y = 0; init_y < h4; init_y += 16) {
+        for (int init_x = 0; init_x < w4; init_x += 16) {
+            const int sub_h4 = imin(h4, 16 + init_y);
+            const int sub_w4 = imin(w4, init_x + 16);
+            int y_off = !!init_y, y, x;
+            for (y = init_y, t->by += init_y; y < sub_h4;
+                 y += t_dim->h, t->by += t_dim->h, y_off++)
+            {
+                struct CodedBlockInfo *const cbi =
+                    &f->frame_thread.cbi[t->by * f->b4_stride];
+                int x_off = !!init_x;
+                for (x = init_x, t->bx += init_x; x < sub_w4;
+                     x += t_dim->w, t->bx += t_dim->w, x_off++)
+                {
+                    if (!b->intra) {
+                        // non-intra luma may be split further; dst is NULL
+                        // because this pass does not reconstruct
+                        read_coef_tree(t, bs, b, b->max_ytx, 0, b->tx_split,
+                                       x_off, y_off, NULL);
+                    } else {
+                        uint8_t cf_ctx = 0x40;
+                        enum TxfmType txtp;
+                        const int eob = cbi[t->bx].eob[0] =
+                            decode_coefs(t, &t->a->lcoef[bx4 + x],
+                                         &t->l.lcoef[by4 + y], b->tx, bs, b, 1,
+                                         0, ts->frame_thread.cf, &txtp, &cf_ctx);
+                        if (DEBUG_BLOCK_INFO)
+                            printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
+                                   b->tx, txtp, eob, ts->msac.rng);
+                        cbi[t->bx].txtp[0] = txtp;
+                        ts->frame_thread.cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
+                        memset(&t->a->lcoef[bx4 + x], cf_ctx,
+                               imin(t_dim->w, f->bw - t->bx));
+                        memset(&t->l.lcoef[by4 + y], cf_ctx,
+                               imin(t_dim->h, f->bh - t->by));
+                    }
+                }
+                // undo the horizontal advance of the inner loop
+                t->bx -= x;
+            }
+            // undo the vertical advance of the luma loop
+            t->by -= y;
+
+            if (!has_chroma) continue;
+
+            const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);
+            const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor);
+            for (int pl = 0; pl < 2; pl++) {
+                // x/y count chroma units while t->bx/t->by stay in luma
+                // units, hence the shifts by ss_hor/ss_ver
+                for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
+                     y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
+                {
+                    struct CodedBlockInfo *const cbi =
+                        &f->frame_thread.cbi[t->by * f->b4_stride];
+                    for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
+                         x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
+                    {
+                        uint8_t cf_ctx = 0x40;
+                        enum TxfmType txtp;
+                        // for non-intra blocks txtp is seeded from the luma
+                        // transform type recorded at the co-located position
+                        if (!b->intra)
+                            txtp = t->txtp_map[(by4 + (y << ss_ver)) * 32 +
+                                                bx4 + (x << ss_hor)];
+                        const int eob = cbi[t->bx].eob[1 + pl] =
+                            decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
+                                         &t->l.ccoef[pl][cby4 + y], b->uvtx, bs,
+                                         b, b->intra, 1 + pl, ts->frame_thread.cf,
+                                         &txtp, &cf_ctx);
+                        if (DEBUG_BLOCK_INFO)
+                            printf("Post-uv-cf-blk[pl=%d,tx=%d,"
+                                   "txtp=%d,eob=%d]: r=%d\n",
+                                   pl, b->uvtx, txtp, eob, ts->msac.rng);
+                        cbi[t->bx].txtp[1 + pl] = txtp;
+                        ts->frame_thread.cf += uv_t_dim->w * uv_t_dim->h * 16;
+                        memset(&t->a->ccoef[pl][cbx4 + x], cf_ctx,
+                               imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor));
+                        memset(&t->l.ccoef[pl][cby4 + y], cf_ctx,
+                               imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver));
+                    }
+                    t->bx -= x << ss_hor;
+                }
+                t->by -= y << ss_ver;
+            }
+        }
+    }
+}
+
+/*
+ * Fill the bw x bh block at dst from the iw x ih reference plane ref, where
+ * (x, y) is the block's top-left position inside the reference and may lie
+ * partly outside of it. The part of the block that overlaps the reference is
+ * copied; everything else is produced by repeating the nearest copied
+ * column (left/right) or row (top/bottom).
+ */
+static void emu_edge(pixel *dst, const ptrdiff_t dst_stride,
+                     const pixel *ref, const ptrdiff_t ref_stride,
+                     const int bw, const int bh,
+                     const int iw, const int ih,
+                     const int x, const int y)
+{
+    const ptrdiff_t ref_px_stride = PXSTRIDE(ref_stride);
+    const ptrdiff_t dst_px_stride = PXSTRIDE(dst_stride);
+
+    // first reference sample that ends up inside the block
+    const pixel *src = ref + (iclip(y, 0, ih - 1) * ref_px_stride +
+                              iclip(x, 0, iw - 1));
+
+    // how many columns/rows have to be synthesized on each side
+    const int left_ext = iclip(-x, 0, bw - 1);
+    const int right_ext = iclip(x + bw - iw, 0, bw - 1);
+    assert(left_ext + right_ext < bw);
+    const int top_ext = iclip(-y, 0, bh - 1);
+    const int bottom_ext = iclip(y + bh - ih, 0, bh - 1);
+    assert(top_ext + bottom_ext < bh);
+
+    const int center_w = bw - left_ext - right_ext;
+    const int center_h = bh - top_ext - bottom_ext;
+
+    // rows that overlap the reference: copy, then pad sideways
+    pixel *const first_row = dst + top_ext * dst_px_stride;
+    pixel *row = first_row;
+    for (int n = center_h; n > 0; n--) {
+        pixel_copy(row + left_ext, src, center_w);
+        if (left_ext)
+            pixel_set(row, row[left_ext], left_ext);
+        if (right_ext) {
+            pixel *const right = row + left_ext + center_w;
+            pixel_set(right, right[-1], right_ext);
+        }
+        src += ref_px_stride;
+        row += dst_px_stride;
+    }
+
+    // rows above: every one is a copy of the first finished row
+    pixel *above = dst;
+    for (int n = top_ext; n > 0; n--) {
+        pixel_copy(above, first_row, bw);
+        above += dst_px_stride;
+    }
+
+    // rows below: each one repeats the row directly above it
+    pixel *below = dst + (top_ext + center_h) * dst_px_stride;
+    for (int n = bottom_ext; n > 0; n--) {
+        pixel_copy(below, below - dst_px_stride, bw);
+        below += dst_px_stride;
+    }
+}
+
+/*
+ * Motion-compensate one block of plane pl from reference picture refp.
+ *
+ * bw4/bh4 and bx/by are given in 4-pixel luma units and are scaled by the
+ * plane's subsampling here. Exactly one of dst8 (pixel output, uses
+ * dst_stride) and dst16 (intermediate output) must be non-NULL. When the
+ * source area including filter taps is not fully inside the reference,
+ * it is first assembled in t->emu_edge via emu_edge().
+ */
+static void mc(Dav1dTileContext *const t,
+               pixel *const dst8, coef *const dst16, const ptrdiff_t dst_stride,
+               const int bw4, const int bh4,
+               const int bx, const int by, const int pl,
+               const mv mv, const Dav1dThreadPicture *const refp,
+               const enum Filter2d filter_2d)
+{
+    assert((dst8 != NULL) ^ (dst16 != NULL));
+    const Dav1dFrameContext *const f = t->f;
+    const int is_chroma = !!pl;
+    const int ss_ver = is_chroma && f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = is_chroma && f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
+    const int blk_w = bw4 * h_mul, blk_h = bh4 * v_mul;
+
+    // split the vector into a sub-sample phase and a whole-sample offset
+    const int mvx = mv.x, mvy = mv.y;
+    const int mx = mvx & (15 >> !ss_hor), my = mvy & (15 >> !ss_ver);
+    const int frac_x = !!mx, frac_y = !!my;
+    const int dx = bx * h_mul + (mvx >> (3 + ss_hor));
+    const int dy = by * v_mul + (mvy >> (3 + ss_ver));
+
+    // usable size of the reference plane
+    int w, h;
+    if (refp == &f->cur) {
+        // the current picture is the reference (intrabc)
+        w = f->bw * 4 >> ss_hor;
+        h = f->bh * 4 >> ss_ver;
+    } else {
+        dav1d_thread_picture_wait(refp, dy + blk_h + frac_y * 4,
+                                  PLANE_TYPE_Y + is_chroma);
+        w = (f->cur.p.p.w + ss_hor) >> ss_hor;
+        h = (f->cur.p.p.h + ss_ver) >> ss_ver;
+    }
+
+    // with a fractional phase the filter reads 3 samples before and 4 after
+    const int out_of_frame = dx < frac_x * 3 || dy < frac_y * 3 ||
+                             dx + blk_w + frac_x * 4 > w ||
+                             dy + blk_h + frac_y * 4 > h;
+
+    ptrdiff_t ref_stride = refp->p.stride[is_chroma];
+    const pixel *ref;
+    if (out_of_frame) {
+        emu_edge(t->emu_edge, 160 * sizeof(pixel), refp->p.data[pl], ref_stride,
+                 blk_w + frac_x * 7, blk_h + frac_y * 7, w, h,
+                 dx - frac_x * 3, dy - frac_y * 3);
+        ref = &t->emu_edge[160 * frac_y * 3 + frac_x * 3];
+        ref_stride = 160 * sizeof(pixel);
+    } else {
+        ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx;
+    }
+
+    if (dst8 == NULL) {
+        f->dsp->mc.mct[filter_2d](dst16, ref, ref_stride, blk_w,
+                                  blk_h, mx << !ss_hor, my << !ss_ver);
+    } else {
+        f->dsp->mc.mc[filter_2d](dst8, dst_stride, ref, ref_stride, blk_w,
+                                 blk_h, mx << !ss_hor, my << !ss_ver);
+    }
+}
+
+/*
+ * Overlapped block motion compensation for plane pl of the block at
+ * t->bx/t->by (both must be even): predictions made with the motion vectors
+ * of the blocks above and to the left are rendered into t->scratch.lap and
+ * blended into dst with the obmc_mask_* weights.
+ *
+ * b_dim is the dav1d_block_dimensions entry of the current block, bx4/by4
+ * index the above/left filter context arrays, and w4/h4 bound how far along
+ * the top/left edge neighbours are visited.
+ */
+static void obmc(Dav1dTileContext *const t,
+                 pixel *const dst, const ptrdiff_t dst_stride,
+                 const uint8_t *const b_dim, const int pl,
+                 const int bx4, const int by4, const int w4, const int h4)
+{
+    assert(!(t->bx & 1) && !(t->by & 1));
+    const Dav1dFrameContext *const f = t->f;
+    const refmvs *const r = &f->mvs[t->by * f->b4_stride + t->bx];
+    pixel *const lap = t->scratch.lap;
+    static const uint8_t obmc_mask_2[2] = { 19,  0 };
+    static const uint8_t obmc_mask_4[4] = { 25, 14,  5,  0 };
+    static const uint8_t obmc_mask_8[8] = { 28, 22, 16, 11,  7,  3,  0,  0 };
+    static const uint8_t obmc_mask_16[16] = { 30, 27, 24, 21, 18, 15, 12, 10,
+                                               8,  6,  4,  3,  0,  0,  0,  0 };
+    static const uint8_t obmc_mask_32[32] = { 31, 29, 28, 26, 24, 23, 21, 20,
+                                              19, 17, 16, 14, 13, 12, 11,  9,
+                                               8,  7,  6,  5,  4,  4,  3,  2,
+                                               0,  0,  0,  0,  0,  0,  0,  0 };
+    static const uint8_t *const obmc_masks[] = {
+        obmc_mask_2, obmc_mask_4, obmc_mask_8, obmc_mask_16, obmc_mask_32
+    };
+    const int ss_ver = !!pl && f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = !!pl && f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
+
+    // neighbours above (skipped on the first row of the tile, and for
+    // chroma when the block is too small)
+    if (t->by > t->ts->tiling.row_start &&
+        (!pl || b_dim[0] * h_mul + b_dim[1] * v_mul >= 16))
+    {
+        const int max_above = imin(b_dim[2], 4);
+        int n_done = 0, x = 0;
+        while (x < w4 && n_done < max_above) {
+            // only odd blocks are considered for overlap handling, hence +1
+            const refmvs *const a_r = &r[x - f->b4_stride + 1];
+            const uint8_t *const a_b_dim =
+                dav1d_block_dimensions[sbtype_to_bs[a_r->sb_type]];
+
+            if (a_r->ref[0] > 0) {
+                // overlap is as wide as the neighbour (clipped to this
+                // block) and half this block's height, at most 8 units
+                const int ow4 = iclip(a_b_dim[0], 2, b_dim[0]);
+                const int oh4 = imin(b_dim[1], 16) >> 1;
+                const int ctx = bx4 + x + 1;
+
+                mc(t, lap, NULL, 128 * sizeof(pixel),
+                   ow4, oh4,
+                   t->bx + x, t->by, pl, a_r->mv[0],
+                   &f->refp[a_r->ref[0] - 1],
+                   dav1d_filter_2d[t->a->filter[1][ctx]][t->a->filter[0][ctx]]);
+                f->dsp->mc.blend(&dst[x * h_mul], dst_stride,
+                                 lap, 128 * sizeof(pixel),
+                                 h_mul * ow4,
+                                 (v_mul * imin(b_dim[1], 16)) >> 1,
+                                 obmc_masks[imin(b_dim[3], 4) - ss_ver], 1);
+                n_done++;
+            }
+            x += imax(a_b_dim[0], 2);
+        }
+    }
+
+    // neighbours to the left (skipped on the first column of the tile)
+    if (t->bx > t->ts->tiling.col_start) {
+        const int max_left = imin(b_dim[3], 4);
+        int n_done = 0, y = 0;
+        while (y < h4 && n_done < max_left) {
+            // only odd blocks are considered for overlap handling, hence +1
+            const refmvs *const l_r = &r[(y + 1) * f->b4_stride - 1];
+            const uint8_t *const l_b_dim =
+                dav1d_block_dimensions[sbtype_to_bs[l_r->sb_type]];
+
+            if (l_r->ref[0] > 0) {
+                // overlap is half this block's width, at most 8 units, and
+                // as tall as the neighbour (clipped to this block)
+                const int ow4 = imin(b_dim[0], 16) >> 1;
+                const int oh4 = iclip(l_b_dim[1], 2, b_dim[1]);
+                const int ctx = by4 + y + 1;
+
+                mc(t, lap, NULL, 32 * sizeof(pixel),
+                   ow4,
+                   oh4,
+                   t->bx, t->by + y, pl, l_r->mv[0],
+                   &f->refp[l_r->ref[0] - 1],
+                   dav1d_filter_2d[t->l.filter[1][ctx]][t->l.filter[0][ctx]]);
+                f->dsp->mc.blend(&dst[y * v_mul * PXSTRIDE(dst_stride)], dst_stride,
+                                 lap, 32 * sizeof(pixel),
+                                 (h_mul * imin(b_dim[0], 16)) >> 1,
+                                 v_mul * oh4,
+                                 obmc_masks[imin(b_dim[2], 4) - ss_hor], 0);
+                n_done++;
+            }
+            y += imax(l_b_dim[1], 2);
+        }
+    }
+}
+
+/*
+ * Warped (affine) motion compensation for plane pl of the block at
+ * t->bx/t->by, processed in 8x8 tiles.
+ *
+ * For every 8x8 tile the centre is mapped through wmp->matrix to obtain a
+ * source position in refp; the tile is then rendered with the warp8x8
+ * (dst8, pixel output) or warp8x8t (dst16, intermediate output) DSP
+ * function. Exactly one of dst8/dst16 must be non-NULL, and the block's
+ * width and height in this plane must be multiples of 8. Source areas that
+ * reach outside the reference are assembled in t->emu_edge first.
+ */
+static void warp_affine(Dav1dTileContext *const t,
+                        pixel *dst8, coef *dst16, const ptrdiff_t dstride,
+                        const uint8_t *const b_dim, const int pl,
+                        const Dav1dThreadPicture *const refp,
+                        const WarpedMotionParams *const wmp)
+{
+    assert((dst8 != NULL) ^ (dst16 != NULL));
+    const Dav1dFrameContext *const f = t->f;
+    const Dav1dDSPContext *const dsp = f->dsp;
+    const int ss_ver = !!pl && f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = !!pl && f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
+    assert(!((b_dim[0] * h_mul) & 7) && !((b_dim[1] * v_mul) & 7));
+    const int32_t *const mat = wmp->matrix;
+    const int width = (f->cur.p.p.w + ss_hor) >> ss_hor;
+    const int height = (f->cur.p.p.h + ss_ver) >> ss_ver;
+
+    for (int y = 0; y < b_dim[1] * v_mul; y += 8) {
+        for (int x = 0; x < b_dim[0] * h_mul; x += 8) {
+            // calculate transformation relative to center of 8x8 block in
+            // luma pixel units
+            const int src_x = t->bx * 4 + ((x + 4) << ss_hor);
+            const int src_y = t->by * 4 + ((y + 4) << ss_ver);
+            // The products are formed in 64 bits: matrix entries carry 16
+            // fractional bits, so multiplying them by a pixel coordinate in
+            // plain int can overflow (undefined behaviour) for large
+            // coordinates or matrix values.
+            const int64_t mvx = ((int64_t) mat[2] * src_x +
+                                 (int64_t) mat[3] * src_y + mat[0]) >> ss_hor;
+            const int64_t mvy = ((int64_t) mat[4] * src_x +
+                                 (int64_t) mat[5] * src_y + mat[1]) >> ss_ver;
+
+            // integer sample position (moved 4 samples up/left) and the
+            // 16-bit fractional phase, adjusted by the shear terms and
+            // rounded down to a multiple of 64
+            const int dx = (int) (mvx >> 16) - 4;
+            const int mx = ((int) (mvx & 0xffff) - wmp->alpha * 4 -
+                                                   wmp->beta  * 7) & ~0x3f;
+            const int dy = (int) (mvy >> 16) - 4;
+            const int my = ((int) (mvy & 0xffff) - wmp->gamma * 4 -
+                                                   wmp->delta * 4) & ~0x3f;
+
+            const pixel *ref_ptr;
+            ptrdiff_t ref_stride = refp->p.stride[!!pl];
+
+            dav1d_thread_picture_wait(refp, dy + 4 + 8,
+                                      PLANE_TYPE_Y + !!pl);
+            // the filter reads a 15x15 area starting 3 samples before
+            // (dx, dy); emulate the edge if any of it is out of bounds
+            if (dx < 3 || dx + 8 + 4 > width || dy < 3 || dy + 8 + 4 > height) {
+                emu_edge(t->emu_edge, 160 * sizeof(pixel), refp->p.data[pl],
+                         ref_stride, 15, 15, width, height, dx - 3, dy - 3);
+                ref_ptr = &t->emu_edge[160 * 3 + 3];
+                ref_stride = 160 * sizeof(pixel);
+            } else {
+                ref_ptr = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx;
+            }
+            if (dst16 != NULL)
+                dsp->mc.warp8x8t(&dst16[x], dstride, ref_ptr, ref_stride,
+                                 wmp->abcd, mx, my);
+            else
+                dsp->mc.warp8x8(&dst8[x], dstride, ref_ptr, ref_stride,
+                                wmp->abcd, mx, my);
+        }
+        // advance the active destination by 8 rows; dstride is converted
+        // with PXSTRIDE for dst8 and used as-is for dst16
+        if (dst8) dst8  += 8 * PXSTRIDE(dstride);
+        else      dst16 += 8 * dstride;
+    }
+}
+
+void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize bs,
+                                 const enum EdgeFlags intra_edge_flags,
+                                 const Av1Block *const b)
+{
+    Dav1dTileState *const ts = t->ts;
+    const Dav1dFrameContext *const f = t->f;
+    const Dav1dDSPContext *const dsp = f->dsp;
+    const int bx4 = t->bx & 31, by4 = t->by & 31;
+    const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
+    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+    const int bw4 = b_dim[0], bh4 = b_dim[1];
+    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
+    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
+    const int has_chroma = f->seq_hdr.layout != DAV1D_PIXEL_LAYOUT_I400 &&
+                           (bw4 > ss_hor || t->bx & 1) &&
+                           (bh4 > ss_ver || t->by & 1);
+    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->tx];
+    const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
+
+    // coefficient coding
+    ALIGN_STK_32(pixel, edge_buf, 257,);
+    pixel *const edge = edge_buf + 128;
+    const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
+
+    for (int init_y = 0; init_y < h4; init_y += 16) {
+        for (int init_x = 0; init_x < w4; init_x += 16) {
+            if (b->pal_sz[0]) {
+                pixel *dst = ((pixel *) f->cur.p.data[0]) +
+                             4 * (t->by * PXSTRIDE(f->cur.p.stride[0]) + t->bx);
+                const uint8_t *pal_idx;
+                if (f->frame_thread.pass) {
+                    pal_idx = ts->frame_thread.pal_idx;
+                    ts->frame_thread.pal_idx += bw4 * bh4 * 16;
+                } else {
+                    pal_idx = t->scratch.pal_idx;
+                }
+                const uint16_t *const pal = f->frame_thread.pass ?
+                    f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
+                                        ((t->bx >> 1) + (t->by & 1))][0] : t->pal[0];
+                f->dsp->ipred.pal_pred(dst, f->cur.p.stride[0], pal,
+                                       pal_idx, bw4 * 4, bh4 * 4);
+                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+                    hex_dump(dst, PXSTRIDE(f->cur.p.stride[0]),
+                             bw4 * 4, bh4 * 4, "y-pal-pred");
+            }
+
+            const int sm_fl = sm_flag(t->a, bx4) | sm_flag(&t->l, by4);
+            const int sb_has_tr = init_x + 16 < w4 ? 1 : init_y ? 0 :
+                              intra_edge_flags & EDGE_I444_TOP_HAS_RIGHT;
+            const int sb_has_bl = init_x ? 0 : init_y + 16 < h4 ? 1 :
+                              intra_edge_flags & EDGE_I444_LEFT_HAS_BOTTOM;
+            int y, x;
+            const int sub_h4 = imin(h4, 16 + init_y);
+            const int sub_w4 = imin(w4, init_x + 16);
+            for (y = init_y, t->by += init_y; y < sub_h4;
+                 y += t_dim->h, t->by += t_dim->h)
+            {
+                pixel *dst = ((pixel *) f->cur.p.data[0]) +
+                               4 * (t->by * PXSTRIDE(f->cur.p.stride[0]) +
+                                    t->bx + init_x);
+                for (x = init_x, t->bx += init_x; x < sub_w4;
+                     x += t_dim->w, t->bx += t_dim->w)
+                {
+                    if (b->pal_sz[0]) goto skip_y_pred;
+
+                    int angle = b->y_angle;
+                    const enum EdgeFlags edge_flags =
+                        (((y > init_y || !sb_has_tr) && (x + t_dim->w >= sub_w4)) ?
+                             0 : EDGE_I444_TOP_HAS_RIGHT) |
+                        ((x > init_x || (!sb_has_bl && y + t_dim->h >= sub_h4)) ?
+                             0 : EDGE_I444_LEFT_HAS_BOTTOM);
+                    const pixel *top_sb_edge = NULL;
+                    if (!(t->by & (f->sb_step - 1))) {
+                        top_sb_edge = f->ipred_edge[0];
+                        const int sby = t->by >> f->sb_shift;
+                        top_sb_edge += f->sb128w * 128 * (sby - 1);
+                    }
+                    const enum IntraPredMode m =
+                        bytefn(dav1d_prepare_intra_edges)(t->bx,
+                                                          t->bx > ts->tiling.col_start,
+                                                          t->by,
+                                                          t->by > ts->tiling.row_start,
+                                                          ts->tiling.col_end,
+                                                          ts->tiling.row_end,
+                                                          edge_flags, dst,
+                                                          f->cur.p.stride[0], top_sb_edge,
+                                                          b->y_mode, &angle,
+                                                          t_dim->w, t_dim->h, edge);
+                    dsp->ipred.intra_pred[m](dst, f->cur.p.stride[0], edge,
+                                             t_dim->w * 4, t_dim->h * 4,
+                                             angle | sm_fl);
+
+                    if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
+                        hex_dump(edge - t_dim->h * 4, t_dim->h * 4,
+                                 t_dim->h * 4, 2, "l");
+                        hex_dump(edge, 0, 1, 1, "tl");
+                        hex_dump(edge + 1, t_dim->w * 4,
+                                 t_dim->w * 4, 2, "t");
+                        hex_dump(dst, f->cur.p.stride[0],
+                                 t_dim->w * 4, t_dim->h * 4, "y-intra-pred");
+                    }
+
+                skip_y_pred: {}
+                    if (!b->skip) {
+                        coef *cf;
+                        int eob;
+                        enum TxfmType txtp;
+                        if (f->frame_thread.pass) {
+                            cf = ts->frame_thread.cf;
+                            ts->frame_thread.cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
+                            const struct CodedBlockInfo *const cbi =
+                                &f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
+                            eob = cbi->eob[0];
+                            txtp = cbi->txtp[0];
+                        } else {
+                            uint8_t cf_ctx;
+                            cf = t->cf;
+                            eob = decode_coefs(t, &t->a->lcoef[bx4 + x],
+                                               &t->l.lcoef[by4 + y], b->tx, bs,
+                                               b, 1, 0, cf, &txtp, &cf_ctx);
+                            if (DEBUG_BLOCK_INFO)
+                                printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
+                                       b->tx, txtp, eob, ts->msac.rng);
+                            memset(&t->a->lcoef[bx4 + x], cf_ctx,
+                                   imin(t_dim->w, f->bw - t->bx));
+                            memset(&t->l.lcoef[by4 + y], cf_ctx,
+                                   imin(t_dim->h, f->bh - t->by));
+                        }
+                        if (eob >= 0) {
+                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+                                coef_dump(cf, imin(t_dim->h, 8) * 4,
+                                          imin(t_dim->w, 8) * 4, 3, "dq");
+                            dsp->itx.itxfm_add[b->tx]
+                                              [txtp](dst,
+                                                     f->cur.p.stride[0],
+                                                     cf, eob);
+                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+                                hex_dump(dst, f->cur.p.stride[0],
+                                         t_dim->w * 4, t_dim->h * 4, "recon");
+                        }
+                    } else if (!f->frame_thread.pass) {
+                        memset(&t->a->lcoef[bx4 + x], 0x40, t_dim->w);
+                        memset(&t->l.lcoef[by4 + y], 0x40, t_dim->h);
+                    }
+                    dst += 4 * t_dim->w;
+                }
+                t->bx -= x;
+            }
+            t->by -= y;
+
+            if (!has_chroma) continue;
+
+            const ptrdiff_t stride = f->cur.p.stride[1];
+
+            if (b->uv_mode == CFL_PRED) {
+                assert(!init_x && !init_y);
+
+                int16_t *const ac = t->scratch.ac;
+                pixel *y_src = ((pixel *) f->cur.p.data[0]) + 4 * (t->bx & ~ss_hor) +
+                                 4 * (t->by & ~ss_ver) * PXSTRIDE(f->cur.p.stride[0]);
+                const ptrdiff_t uv_off = 4 * ((t->bx >> ss_hor) +
+                                              (t->by >> ss_ver) * PXSTRIDE(stride));
+                pixel *const uv_dst[2] = { ((pixel *) f->cur.p.data[1]) + uv_off,
+                                           ((pixel *) f->cur.p.data[2]) + uv_off };
+
+                const int furthest_r =
+                    ((cw4 << ss_hor) + t_dim->w - 1) & ~(t_dim->w - 1);
+                const int furthest_b =
+                    ((ch4 << ss_ver) + t_dim->h - 1) & ~(t_dim->h - 1);
+                dsp->ipred.cfl_ac[f->cur.p.p.layout - 1]
+                                 [b->uvtx](ac, y_src, f->cur.p.stride[0],
+                                           cbw4 - (furthest_r >> ss_hor),
+                                           cbh4 - (furthest_b >> ss_ver));
+                for (int pl = 0; pl < 2; pl++) {
+                    if (!b->cfl_alpha[pl]) continue;
+                    int angle = 0;
+                    const pixel *top_sb_edge = NULL;
+                    if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {
+                        top_sb_edge = f->ipred_edge[pl + 1];
+                        const int sby = t->by >> f->sb_shift;
+                        top_sb_edge += f->sb128w * 128 * (sby - 1);
+                    }
+                    const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;
+                    const int xstart = ts->tiling.col_start >> ss_hor;
+                    const int ystart = ts->tiling.row_start >> ss_ver;
+                    const enum IntraPredMode m =
+                        bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
+                                                          ypos, ypos > ystart,
+                                                          ts->tiling.col_end >> ss_hor,
+                                                          ts->tiling.row_end >> ss_ver,
+                                                          0, uv_dst[pl], stride,
+                                                          top_sb_edge, DC_PRED, &angle,
+                                                          uv_t_dim->w,
+                                                          uv_t_dim->h, edge);
+                    dsp->ipred.cfl_pred[m](uv_dst[pl], stride, edge,
+                                           uv_t_dim->w * 4,
+                                           uv_t_dim->h * 4,
+                                           ac, b->cfl_alpha[pl]);
+                }
+                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
+                    ac_dump(ac, 4*cbw4, 4*cbh4, "ac");
+                    hex_dump(uv_dst[0], stride, cbw4 * 4, cbh4 * 4, "u-cfl-pred");
+                    hex_dump(uv_dst[1], stride, cbw4 * 4, cbh4 * 4, "v-cfl-pred");
+                }
+            } else if (b->pal_sz[1]) {
+                ptrdiff_t uv_dstoff = 4 * ((t->bx >> ss_hor) +
+                                           (t->by >> ss_ver) * PXSTRIDE(f->cur.p.stride[1]));
+                const uint8_t *pal_idx;
+                if (f->frame_thread.pass) {
+                    pal_idx = ts->frame_thread.pal_idx;
+                    ts->frame_thread.pal_idx += cbw4 * cbh4 * 16;
+                } else {
+                    pal_idx = &t->scratch.pal_idx[bw4 * bh4 * 16];
+                }
+                const uint16_t *const pal_u = f->frame_thread.pass ?
+                    f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
+                                        ((t->bx >> 1) + (t->by & 1))][1] : t->pal[1];
+                f->dsp->ipred.pal_pred(((pixel *) f->cur.p.data[1]) + uv_dstoff,
+                                       f->cur.p.stride[1], pal_u,
+                                       pal_idx, cbw4 * 4, cbh4 * 4);
+                const uint16_t *const pal_v = f->frame_thread.pass ?
+                    f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
+                                        ((t->bx >> 1) + (t->by & 1))][2] : t->pal[2];
+                f->dsp->ipred.pal_pred(((pixel *) f->cur.p.data[2]) + uv_dstoff,
+                                       f->cur.p.stride[1], pal_v,
+                                       pal_idx, cbw4 * 4, cbh4 * 4);
+                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
+                    hex_dump(((pixel *) f->cur.p.data[1]) + uv_dstoff,
+                             PXSTRIDE(f->cur.p.stride[1]),
+                             cbw4 * 4, cbh4 * 4, "u-pal-pred");
+                    hex_dump(((pixel *) f->cur.p.data[2]) + uv_dstoff,
+                             PXSTRIDE(f->cur.p.stride[1]),
+                             cbw4 * 4, cbh4 * 4, "v-pal-pred");
+                }
+            }
+
+            const int sm_uv_fl = sm_uv_flag(t->a, cbx4) |
+                                 sm_uv_flag(&t->l, cby4);
+            const int uv_sb_has_tr =
+                ((init_x + 16) >> ss_hor) < cw4 ? 1 : init_y ? 0 :
+                intra_edge_flags & (EDGE_I420_TOP_HAS_RIGHT >> (f->cur.p.p.layout - 1));
+            const int uv_sb_has_bl =
+                init_x ? 0 : ((init_y + 16) >> ss_ver) < ch4 ? 1 :
+                intra_edge_flags & (EDGE_I420_LEFT_HAS_BOTTOM >> (f->cur.p.p.layout - 1));
+            const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);
+            const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor);
+            for (int pl = 0; pl < 2; pl++) {
+                for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
+                     y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
+                {
+                    pixel *dst = ((pixel *) f->cur.p.data[1 + pl]) +
+                                   4 * ((t->by >> ss_ver) * PXSTRIDE(stride) +
+                                        ((t->bx + init_x) >> ss_hor));
+                    for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
+                         x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
+                    {
+                        if ((b->uv_mode == CFL_PRED && b->cfl_alpha[pl]) ||
+                            b->pal_sz[1])
+                        {
+                            goto skip_uv_pred;
+                        }
+
+                        int angle = b->uv_angle;
+                        // this probably looks weird because we're using
+                        // luma flags in a chroma loop, but that's because
+                        // prepare_intra_edges() expects luma flags as input
+                        const enum EdgeFlags edge_flags =
+                            (((y > (init_y >> ss_ver) || !uv_sb_has_tr) &&
+                              (x + uv_t_dim->w >= sub_cw4)) ?
+                                 0 : EDGE_I444_TOP_HAS_RIGHT) |
+                            ((x > (init_x >> ss_hor) ||
+                              (!uv_sb_has_bl && y + uv_t_dim->h >= sub_ch4)) ?
+                                 0 : EDGE_I444_LEFT_HAS_BOTTOM);
+                        const pixel *top_sb_edge = NULL;
+                        if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {
+                            top_sb_edge = f->ipred_edge[1 + pl];
+                            const int sby = t->by >> f->sb_shift;
+                            top_sb_edge += f->sb128w * 128 * (sby - 1);
+                        }
+                        const enum IntraPredMode uv_mode =
+                             b->uv_mode == CFL_PRED ? DC_PRED : b->uv_mode;
+                        const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;
+                        const int xstart = ts->tiling.col_start >> ss_hor;
+                        const int ystart = ts->tiling.row_start >> ss_ver;
+                        const enum IntraPredMode m =
+                            bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
+                                                              ypos, ypos > ystart,
+                                                              ts->tiling.col_end >> ss_hor,
+                                                              ts->tiling.row_end >> ss_ver,
+                                                              edge_flags, dst, stride,
+                                                              top_sb_edge, uv_mode,
+                                                              &angle, uv_t_dim->w,
+                                                              uv_t_dim->h, edge);
+                        dsp->ipred.intra_pred[m](dst, stride, edge,
+                                                 uv_t_dim->w * 4,
+                                                 uv_t_dim->h * 4,
+                                                 angle | sm_uv_fl);
+                        if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
+                            hex_dump(edge - uv_t_dim->h * 4, uv_t_dim->h * 4,
+                                     uv_t_dim->h * 4, 2, "l");
+                            hex_dump(edge, 0, 1, 1, "tl");
+                            hex_dump(edge + 1, uv_t_dim->w * 4,
+                                     uv_t_dim->w * 4, 2, "t");
+                            hex_dump(dst, stride, uv_t_dim->w * 4,
+                                     uv_t_dim->h * 4, pl ? "v-intra-pred" : "u-intra-pred");
+                        }
+
+                    skip_uv_pred: {}
+                        if (!b->skip) {
+                            enum TxfmType txtp;
+                            int eob;
+                            coef *cf;
+                            if (f->frame_thread.pass) {
+                                cf = ts->frame_thread.cf;
+                                ts->frame_thread.cf += uv_t_dim->w * uv_t_dim->h * 16;
+                                const struct CodedBlockInfo *const cbi =
+                                    &f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
+                                eob = cbi->eob[pl + 1];
+                                txtp = cbi->txtp[pl + 1];
+                            } else {
+                                uint8_t cf_ctx;
+                                cf = t->cf;
+                                eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
+                                                   &t->l.ccoef[pl][cby4 + y],
+                                                   b->uvtx, bs, b, 1, 1 + pl, cf,
+                                                   &txtp, &cf_ctx);
+                                if (DEBUG_BLOCK_INFO)
+                                    printf("Post-uv-cf-blk[pl=%d,tx=%d,"
+                                           "txtp=%d,eob=%d]: r=%d [x=%d,cbx4=%d]\n",
+                                           pl, b->uvtx, txtp, eob, ts->msac.rng, x, cbx4);
+                                memset(&t->a->ccoef[pl][cbx4 + x], cf_ctx,
+                                       imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor));
+                                memset(&t->l.ccoef[pl][cby4 + y], cf_ctx,
+                                       imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver));
+                            }
+                            if (eob >= 0) {
+                                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+                                    coef_dump(cf, uv_t_dim->h * 4,
+                                              uv_t_dim->w * 4, 3, "dq");
+                                dsp->itx.itxfm_add[b->uvtx]
+                                                  [txtp](dst, stride,
+                                                         cf, eob);
+                                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+                                    hex_dump(dst, stride, uv_t_dim->w * 4,
+                                             uv_t_dim->h * 4, "recon");
+                            }
+                        } else if (!f->frame_thread.pass) {
+                            memset(&t->a->ccoef[pl][cbx4 + x], 0x40, uv_t_dim->w);
+                            memset(&t->l.ccoef[pl][cby4 + y], 0x40, uv_t_dim->h);
+                        }
+                        dst += uv_t_dim->w * 4;
+                    }
+                    t->bx -= x << ss_hor;
+                }
+                t->by -= y << ss_ver;
+            }
+        }
+    }
+}
+
+/*
+ * Reconstruct one inter-predicted block: write the prediction for luma and
+ * (when present) both chroma planes into the current picture, then, unless
+ * b->skip is set, decode the residual coefficients and add the inverse
+ * transforms on top of the prediction.
+ *
+ * t:  tile context. t->bx/t->by hold the block position in units of 4
+ *     pixels; they are advanced temporarily inside the residual loops and
+ *     restored before returning.
+ * bs: block size, used to index dav1d_block_dimensions[] and the
+ *     inter-intra/wedge mask tables.
+ * b:  per-block info (motion vectors, references, compound type, transform
+ *     sizes, skip flag, ...).
+ *
+ * Three prediction paths exist: intrabc (taken when the low bit of
+ * frame_type is clear), single-reference prediction (COMP_INTER_NONE,
+ * optionally with warp, OBMC and inter-intra blending), and two-reference
+ * compound prediction.
+ */
+void bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize bs,
+                                 const Av1Block *const b)
+{
+    Dav1dTileState *const ts = t->ts;
+    const Dav1dFrameContext *const f = t->f;
+    const Dav1dDSPContext *const dsp = f->dsp;
+    const int bx4 = t->bx & 31, by4 = t->by & 31;
+    const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
+    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+    const int bw4 = b_dim[0], bh4 = b_dim[1];
+    // block size clipped to the frame boundary
+    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
+    const int has_chroma = f->seq_hdr.layout != DAV1D_PIXEL_LAYOUT_I400 &&
+                           (bw4 > ss_hor || t->bx & 1) &&
+                           (bh4 > ss_ver || t->by & 1);
+    const int chr_layout_idx = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I400 ? 0 :
+                               DAV1D_PIXEL_LAYOUT_I444 - f->cur.p.p.layout;
+
+    // prediction
+    const int cbh4 = (bh4 + ss_ver) >> ss_ver, cbw4 = (bw4 + ss_hor) >> ss_hor;
+    pixel *dst = ((pixel *) f->cur.p.data[0]) +
+        4 * (t->by * PXSTRIDE(f->cur.p.stride[0]) + t->bx);
+    const ptrdiff_t uvdstoff =
+        4 * ((t->bx >> ss_hor) + (t->by >> ss_ver) * PXSTRIDE(f->cur.p.stride[1]));
+    if (!(f->frame_hdr.frame_type & 1)) {
+        // intrabc
+        mc(t, dst, NULL, f->cur.p.stride[0],
+           bw4, bh4, t->bx, t->by, 0, b->mv[0], &f->cur, FILTER_2D_BILINEAR);
+        if (has_chroma) for (int pl = 1; pl < 3; pl++)
+            mc(t, ((pixel *) f->cur.p.data[pl]) + uvdstoff, NULL, f->cur.p.stride[1],
+               bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
+               t->bx & ~ss_hor, t->by & ~ss_ver,
+               pl, b->mv[0], &f->cur, FILTER_2D_BILINEAR);
+    } else if (b->comp_type == COMP_INTER_NONE) {
+        const Dav1dThreadPicture *const refp = &f->refp[b->ref[0]];
+        const enum Filter2d filter_2d = b->filter2d;
+
+        // luma: warped prediction if a non-translational global or local
+        // warp model applies, plain motion compensation otherwise
+        if (imin(bw4, bh4) > 1 && !f->frame_hdr.force_integer_mv &&
+            ((b->inter_mode == GLOBALMV &&
+              f->frame_hdr.gmv[b->ref[0]].type > WM_TYPE_TRANSLATION) ||
+             (b->motion_mode == MM_WARP &&
+              t->warpmv.type > WM_TYPE_TRANSLATION)))
+        {
+            warp_affine(t, dst, NULL, f->cur.p.stride[0], b_dim, 0, refp,
+                        b->motion_mode == MM_WARP ? &t->warpmv :
+                            &f->frame_hdr.gmv[b->ref[0]]);
+        } else {
+            mc(t, dst, NULL, f->cur.p.stride[0],
+               bw4, bh4, t->bx, t->by, 0, b->mv[0], refp, filter_2d);
+            if (b->motion_mode == MM_OBMC)
+                obmc(t, dst, f->cur.p.stride[0], b_dim, 0, bx4, by4, w4, h4);
+        }
+        if (b->interintra_type) {
+            // luma inter-intra: build an intra prediction in scratch memory
+            // and blend it over the inter prediction using ii_mask
+            ALIGN_STK_32(pixel, tl_edge_buf, 65,);
+            pixel *const tl_edge = tl_edge_buf + 32;
+            enum IntraPredMode m = b->interintra_mode == II_SMOOTH_PRED ?
+                                   SMOOTH_PRED : b->interintra_mode;
+            pixel *const tmp = t->scratch.interintra;
+            int angle = 0;
+            const pixel *top_sb_edge = NULL;
+            // on the first block row of a superblock, the top edge comes
+            // from the saved f->ipred_edge[] row of the superblock row above
+            if (!(t->by & (f->sb_step - 1))) {
+                top_sb_edge = f->ipred_edge[0];
+                const int sby = t->by >> f->sb_shift;
+                top_sb_edge += f->sb128w * 128 * (sby - 1);
+            }
+            m = bytefn(dav1d_prepare_intra_edges)(t->bx, t->bx > ts->tiling.col_start,
+                                                  t->by, t->by > ts->tiling.row_start,
+                                                  ts->tiling.col_end, ts->tiling.row_end,
+                                                  0, dst, f->cur.p.stride[0], top_sb_edge,
+                                                  m, &angle, bw4, bh4, tl_edge);
+            dsp->ipred.intra_pred[m](tmp, 4 * bw4 * sizeof(pixel),
+                                     tl_edge, bw4 * 4, bh4 * 4, 0);
+            const uint8_t *const ii_mask =
+                b->interintra_type == INTER_INTRA_BLEND ?
+                     dav1d_ii_masks[bs][0][b->interintra_mode] :
+                     dav1d_wedge_masks[bs][0][0][b->wedge_idx];
+            dsp->mc.blend(dst, f->cur.p.stride[0], tmp, bw4 * 4 * sizeof(pixel),
+                          bw4 * 4, bh4 * 4, ii_mask, bw4 * 4);
+        }
+
+        if (!has_chroma) goto skip_inter_chroma_pred;
+
+        // sub8x8 derivation: only kept if every neighbour that contributes
+        // to the shared chroma block has ref[0] > 0
+        int is_sub8x8 = bw4 == ss_hor || bh4 == ss_ver;
+        refmvs *r;
+        if (is_sub8x8) {
+            assert(ss_hor == 1);
+            r = &f->mvs[t->by * f->b4_stride + t->bx];
+            if (bw4 == 1) is_sub8x8 &= r[-1].ref[0] > 0;
+            if (bh4 == ss_ver) is_sub8x8 &= r[-f->b4_stride].ref[0] > 0;
+            if (bw4 == 1 && bh4 == ss_ver)
+                is_sub8x8 &= r[-(1 + f->b4_stride)].ref[0] > 0;
+        }
+
+        // chroma prediction
+        if (is_sub8x8) {
+            // each quadrant of the chroma block is predicted with the motion
+            // vector, reference and filter of the luma block covering it:
+            // top-left, left, top, and finally the current block
+            assert(ss_hor == 1);
+            int h_off = 0, v_off = 0;
+            if (bw4 == 1 && bh4 == ss_ver) {
+                for (int pl = 0; pl < 2; pl++)
+                    mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff,
+                       NULL, f->cur.p.stride[1],
+                       bw4, bh4, t->bx - 1, t->by - 1, 1 + pl,
+                       r[-(f->b4_stride + 1)].mv[0],
+                       &f->refp[r[-(f->b4_stride + 1)].ref[0] - 1],
+                       f->frame_thread.pass != 2 ? t->tl_4x4_filter :
+                           f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx - 1].filter2d);
+                v_off = 2 * PXSTRIDE(f->cur.p.stride[1]);
+                h_off = 2;
+            }
+            if (bw4 == 1) {
+                const enum Filter2d left_filter_2d =
+                    dav1d_filter_2d[t->l.filter[1][by4]][t->l.filter[0][by4]];
+                for (int pl = 0; pl < 2; pl++)
+                    mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff + v_off, NULL,
+                       f->cur.p.stride[1], bw4, bh4, t->bx - 1,
+                       t->by, 1 + pl, r[-1].mv[0], &f->refp[r[-1].ref[0] - 1],
+                       f->frame_thread.pass != 2 ? left_filter_2d :
+                           f->frame_thread.b[(t->by * f->b4_stride) + t->bx - 1].filter2d);
+                h_off = 2;
+            }
+            if (bh4 == ss_ver) {
+                const enum Filter2d top_filter_2d =
+                    dav1d_filter_2d[t->a->filter[1][bx4]][t->a->filter[0][bx4]];
+                for (int pl = 0; pl < 2; pl++)
+                    mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff + h_off, NULL,
+                       f->cur.p.stride[1], bw4, bh4, t->bx, t->by - 1,
+                       1 + pl, r[-f->b4_stride].mv[0],
+                       &f->refp[r[-f->b4_stride].ref[0] - 1],
+                       f->frame_thread.pass != 2 ? top_filter_2d :
+                           f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx].filter2d);
+                v_off = 2 * PXSTRIDE(f->cur.p.stride[1]);
+            }
+            for (int pl = 0; pl < 2; pl++)
+                mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff + h_off + v_off, NULL, f->cur.p.stride[1],
+                   bw4, bh4, t->bx, t->by, 1 + pl, b->mv[0], refp, filter_2d);
+        } else {
+            if (imin(cbw4, cbh4) > 1 && !f->frame_hdr.force_integer_mv &&
+                ((b->inter_mode == GLOBALMV &&
+                  f->frame_hdr.gmv[b->ref[0]].type > WM_TYPE_TRANSLATION) ||
+                 (b->motion_mode == MM_WARP &&
+                  t->warpmv.type > WM_TYPE_TRANSLATION)))
+            {
+                for (int pl = 0; pl < 2; pl++)
+                    warp_affine(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff, NULL,
+                                f->cur.p.stride[1], b_dim, 1 + pl, refp,
+                                b->motion_mode == MM_WARP ? &t->warpmv :
+                                    &f->frame_hdr.gmv[b->ref[0]]);
+            } else {
+                for (int pl = 0; pl < 2; pl++) {
+                    mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff,
+                       NULL, f->cur.p.stride[1],
+                       bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
+                       t->bx & ~ss_hor, t->by & ~ss_ver,
+                       1 + pl, b->mv[0], refp, filter_2d);
+                    if (b->motion_mode == MM_OBMC)
+                        obmc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff,
+                             f->cur.p.stride[1], b_dim, 1 + pl, bx4, by4, w4, h4);
+                }
+            }
+            if (b->interintra_type) {
+                // FIXME for 8x32 with 4:2:2 subsampling, this probably does
+                // the wrong thing since it will select 4x16, not 4x32, as a
+                // transform size...
+                const uint8_t *const ii_mask =
+                    b->interintra_type == INTER_INTRA_BLEND ?
+                         dav1d_ii_masks[bs][chr_layout_idx][b->interintra_mode] :
+                         dav1d_wedge_masks[bs][chr_layout_idx][0][b->wedge_idx];
+
+                for (int pl = 0; pl < 2; pl++) {
+                    pixel *const tmp = t->scratch.interintra;
+                    pixel tl_edge_px[65], *const tl_edge = &tl_edge_px[32];
+                    enum IntraPredMode m =
+                        b->interintra_mode == II_SMOOTH_PRED ?
+                        SMOOTH_PRED : b->interintra_mode;
+                    int angle = 0;
+                    pixel *const uvdst = ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff;
+                    const pixel *top_sb_edge = NULL;
+                    if (!(t->by & (f->sb_step - 1))) {
+                        top_sb_edge = f->ipred_edge[pl + 1];
+                        const int sby = t->by >> f->sb_shift;
+                        top_sb_edge += f->sb128w * 128 * (sby - 1);
+                    }
+                    m = bytefn(dav1d_prepare_intra_edges)(t->bx >> ss_hor,
+                                                          (t->bx >> ss_hor) >
+                                                              (ts->tiling.col_start >> ss_hor),
+                                                          t->by >> ss_ver,
+                                                          (t->by >> ss_ver) >
+                                                              (ts->tiling.row_start >> ss_ver),
+                                                          ts->tiling.col_end >> ss_hor,
+                                                          ts->tiling.row_end >> ss_ver,
+                                                          0, uvdst, f->cur.p.stride[1],
+                                                          top_sb_edge, m,
+                                                          &angle, cbw4, cbh4, tl_edge);
+                    dsp->ipred.intra_pred[m](tmp, cbw4 * 4 * sizeof(pixel),
+                                             tl_edge, cbw4 * 4, cbh4 * 4, 0);
+                    dsp->mc.blend(uvdst, f->cur.p.stride[1], tmp, cbw4 * 4 * sizeof(pixel),
+                                  cbw4 * 4, cbh4 * 4, ii_mask, cbw4 * 4);
+                }
+            }
+        }
+
+    skip_inter_chroma_pred: {}
+        // remembered for a later sub8x8 block that needs the filter of its
+        // top-left neighbour (see the t->tl_4x4_filter use above)
+        t->tl_4x4_filter = filter_2d;
+    } else {
+        // compound prediction: both references are predicted into the
+        // intermediate buffers tmp[0]/tmp[1] and then combined into dst
+        const enum Filter2d filter_2d = b->filter2d;
+        // Maximum super block size is 128x128
+        coef (*tmp)[128 * 128] = (coef (*)[128 * 128]) t->scratch.compinter;
+        int jnt_weight;
+        uint8_t *const seg_mask = t->scratch_seg_mask;
+        const uint8_t *mask;
+
+        for (int i = 0; i < 2; i++) {
+            const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
+
+            if (b->inter_mode == GLOBALMV_GLOBALMV && !f->frame_hdr.force_integer_mv &&
+                f->frame_hdr.gmv[b->ref[i]].type > WM_TYPE_TRANSLATION)
+            {
+                warp_affine(t, NULL, tmp[i], bw4 * 4, b_dim, 0, refp,
+                            &f->frame_hdr.gmv[b->ref[i]]);
+            } else {
+                mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by, 0,
+                   b->mv[i], refp, filter_2d);
+            }
+        }
+        // jnt_weight and mask are only assigned in the cases that also read
+        // them in the chroma switch further down
+        switch (b->comp_type) {
+        case COMP_INTER_AVG:
+            dsp->mc.avg(dst, f->cur.p.stride[0], tmp[0], tmp[1],
+                        bw4 * 4, bh4 * 4);
+            break;
+        case COMP_INTER_WEIGHTED_AVG:
+            jnt_weight = f->jnt_weights[b->ref[0]][b->ref[1]];
+            dsp->mc.w_avg(dst, f->cur.p.stride[0], tmp[0], tmp[1],
+                          bw4 * 4, bh4 * 4, jnt_weight);
+            break;
+        case COMP_INTER_SEG:
+            dsp->mc.w_mask[chr_layout_idx](dst, f->cur.p.stride[0],
+                                           tmp[b->mask_sign], tmp[!b->mask_sign],
+                                           bw4 * 4, bh4 * 4, seg_mask, b->mask_sign);
+            mask = seg_mask;
+            break;
+        case COMP_INTER_WEDGE:
+            mask = dav1d_wedge_masks[bs][0][0][b->wedge_idx];
+            dsp->mc.mask(dst, f->cur.p.stride[0],
+                         tmp[b->mask_sign], tmp[!b->mask_sign],
+                         bw4 * 4, bh4 * 4, mask);
+            if (has_chroma)
+                mask = dav1d_wedge_masks[bs][chr_layout_idx][b->mask_sign][b->wedge_idx];
+            break;
+        }
+
+        // chroma
+        if (has_chroma) for (int pl = 0; pl < 2; pl++) {
+            for (int i = 0; i < 2; i++) {
+                const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
+                if (b->inter_mode == GLOBALMV_GLOBALMV &&
+                    imin(cbw4, cbh4) > 1 && !f->frame_hdr.force_integer_mv &&
+                    f->frame_hdr.gmv[b->ref[i]].type > WM_TYPE_TRANSLATION)
+                {
+                    // The tmp stride must equal the chroma block width that
+                    // the blend calls below use (bw4 * 4 >> ss_hor), just as
+                    // the luma call above passes the full width bw4 * 4.
+                    // A fixed bw4 * 2 is only right when chroma is
+                    // horizontally subsampled and is too small for 4:4:4.
+                    warp_affine(t, NULL, tmp[i], bw4 * 4 >> ss_hor, b_dim, 1 + pl,
+                                refp, &f->frame_hdr.gmv[b->ref[i]]);
+                } else {
+                    mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by,
+                       1 + pl, b->mv[i], refp, filter_2d);
+                }
+            }
+            pixel *const uvdst = ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff;
+            switch (b->comp_type) {
+            case COMP_INTER_AVG:
+                dsp->mc.avg(uvdst, f->cur.p.stride[1], tmp[0], tmp[1],
+                            bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver);
+                break;
+            case COMP_INTER_WEIGHTED_AVG:
+                dsp->mc.w_avg(uvdst, f->cur.p.stride[1], tmp[0], tmp[1],
+                              bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, jnt_weight);
+                break;
+            case COMP_INTER_WEDGE:
+            case COMP_INTER_SEG:
+                dsp->mc.mask(uvdst, f->cur.p.stride[1],
+                             tmp[b->mask_sign], tmp[!b->mask_sign],
+                             bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, mask);
+                break;
+            }
+        }
+    }
+
+    if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
+        hex_dump(dst, f->cur.p.stride[0], b_dim[0] * 4, b_dim[1] * 4, "y-pred");
+        if (has_chroma) {
+            hex_dump(&((pixel *) f->cur.p.data[1])[uvdstoff], f->cur.p.stride[1],
+                     cbw4 * 4, cbh4 * 4, "u-pred");
+            hex_dump(&((pixel *) f->cur.p.data[2])[uvdstoff], f->cur.p.stride[1],
+                     cbw4 * 4, cbh4 * 4, "v-pred");
+        }
+    }
+
+    // chroma block size clipped to the frame boundary
+    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
+
+    if (b->skip) {
+        // reset coef contexts
+        memset(&t->a->lcoef[bx4], 0x40, w4);
+        memset(&t->l.lcoef[by4], 0x40, h4);
+        if (has_chroma) {
+            memset(&t->a->ccoef[0][cbx4], 0x40, cw4);
+            memset(&t->l.ccoef[0][cby4], 0x40, ch4);
+            memset(&t->a->ccoef[1][cbx4], 0x40, cw4);
+            memset(&t->l.ccoef[1][cby4], 0x40, ch4);
+        }
+        return;
+    }
+
+    const TxfmInfo *const uvtx = &dav1d_txfm_dimensions[b->uvtx];
+    const TxfmInfo *const ytx = &dav1d_txfm_dimensions[b->max_ytx];
+
+    // the residual is processed in chunks of 16x16 block units (64x64
+    // pixels): all luma transforms of a chunk first, then both chroma planes
+    for (int init_y = 0; init_y < bh4; init_y += 16) {
+        for (int init_x = 0; init_x < bw4; init_x += 16) {
+            // coefficient coding & inverse transforms
+            int y_off = !!init_y, y;
+            dst += PXSTRIDE(f->cur.p.stride[0]) * 4 * init_y;
+            for (y = init_y, t->by += init_y; y < imin(h4, init_y + 16);
+                 y += ytx->h, y_off++)
+            {
+                int x, x_off = !!init_x;
+                for (x = init_x, t->bx += init_x; x < imin(w4, init_x + 16);
+                     x += ytx->w, x_off++)
+                {
+                    read_coef_tree(t, bs, b, b->max_ytx, 0, b->tx_split,
+                                   x_off, y_off, &dst[x * 4]);
+                    t->bx += ytx->w;
+                }
+                dst += PXSTRIDE(f->cur.p.stride[0]) * 4 * ytx->h;
+                t->bx -= x;
+                t->by += ytx->h;
+            }
+            // undo the temporary advances so dst/t->by point at the block
+            // origin again
+            dst -= PXSTRIDE(f->cur.p.stride[0]) * 4 * y;
+            t->by -= y;
+
+            // chroma coefs and inverse transform
+            if (has_chroma) for (int pl = 0; pl < 2; pl++) {
+                pixel *uvdst = ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff +
+                    (PXSTRIDE(f->cur.p.stride[1]) * init_y * 4 >> ss_ver);
+                for (y = init_y >> ss_ver, t->by += init_y;
+                     y < imin(ch4, (init_y + 16) >> ss_ver); y += uvtx->h)
+                {
+                    int x;
+                    for (x = init_x >> ss_hor, t->bx += init_x;
+                         x < imin(cw4, (init_x + 16) >> ss_hor); x += uvtx->w)
+                    {
+                        coef *cf;
+                        int eob;
+                        enum TxfmType txtp;
+                        if (f->frame_thread.pass) {
+                            // frame-threading pass: coefficients, eob and
+                            // transform type are read back from the
+                            // per-frame buffers instead of the bitstream
+                            cf = ts->frame_thread.cf;
+                            ts->frame_thread.cf += uvtx->w * uvtx->h * 16;
+                            const struct CodedBlockInfo *const cbi =
+                                &f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
+                            eob = cbi->eob[1 + pl];
+                            txtp = cbi->txtp[1 + pl];
+                        } else {
+                            uint8_t cf_ctx;
+                            cf = t->cf;
+                            txtp = t->txtp_map[(by4 + (y << ss_ver)) * 32 +
+                                                bx4 + (x << ss_hor)];
+                            eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
+                                               &t->l.ccoef[pl][cby4 + y],
+                                               b->uvtx, bs, b, 0, 1 + pl,
+                                               cf, &txtp, &cf_ctx);
+                            if (DEBUG_BLOCK_INFO)
+                                printf("Post-uv-cf-blk[pl=%d,tx=%d,"
+                                       "txtp=%d,eob=%d]: r=%d\n",
+                                       pl, b->uvtx, txtp, eob, ts->msac.rng);
+                            // context updates are clipped to the frame edge
+                            memset(&t->a->ccoef[pl][cbx4 + x], cf_ctx,
+                                   imin(uvtx->w, (f->bw - t->bx + ss_hor) >> ss_hor));
+                            memset(&t->l.ccoef[pl][cby4 + y], cf_ctx,
+                                   imin(uvtx->h, (f->bh - t->by + ss_ver) >> ss_ver));
+                        }
+                        if (eob >= 0) {
+                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+                                coef_dump(cf, uvtx->h * 4, uvtx->w * 4, 3, "dq");
+                            dsp->itx.itxfm_add[b->uvtx]
+                                              [txtp](&uvdst[4 * x],
+                                                     f->cur.p.stride[1],
+                                                     cf, eob);
+                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+                                hex_dump(&uvdst[4 * x], f->cur.p.stride[1],
+                                         uvtx->w * 4, uvtx->h * 4, "recon");
+                        }
+                        t->bx += uvtx->w << ss_hor;
+                    }
+                    uvdst += PXSTRIDE(f->cur.p.stride[1]) * 4 * uvtx->h;
+                    t->bx -= x << ss_hor;
+                    t->by += uvtx->h << ss_ver;
+                }
+                t->by -= y << ss_ver;
+            }
+        }
+    }
+}
+
+/*
+ * Apply the post-filters to superblock row sby of frame f, each only when
+ * enabled: loop filter, then CDEF, then loop restoration. Afterwards the
+ * per-frame filter state in f->lf (plane pointers and mask pointers) is
+ * advanced to the next superblock row.
+ */
+void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
+    const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int sb_step = f->sb_step;
+    const int first_row = sby * sb_step;
+    const ptrdiff_t y_stride = PXSTRIDE(f->cur.p.stride[0]);
+    const ptrdiff_t uv_stride = PXSTRIDE(f->cur.p.stride[1]);
+
+    const int lf_enabled = f->frame_hdr.loopfilter.level_y[0] ||
+                           f->frame_hdr.loopfilter.level_y[1];
+    if (lf_enabled) {
+        // When this row opens a tile row, pass on the tile row index as it
+        // was before the increment (so 0 for the very first tile row).
+        const int at_tile_row_start =
+            f->frame_hdr.tiling.row_start_sb[f->lf.tile_row] == sby;
+        const int start_of_tile_row = at_tile_row_start ? f->lf.tile_row++ : 0;
+        bytefn(dav1d_loopfilter_sbrow)(f, f->lf.p, f->lf.mask_ptr, sby,
+                                       start_of_tile_row);
+    }
+
+    // Store loop filtered pixels required by loop restoration
+    if (f->seq_hdr.restoration)
+        bytefn(dav1d_lr_copy_lpf)(f, f->lf.p, sby);
+
+    if (f->seq_hdr.cdef) {
+        if (sby) {
+            // Finish the two block rows (8 luma pixel rows) that were left
+            // out at the bottom of the previous superblock row.
+            const ptrdiff_t uv_back = (8 * uv_stride) >> ss_ver;
+            pixel *p_up[3] = {
+                f->lf.p[0] - 8 * y_stride,
+                f->lf.p[1] - uv_back,
+                f->lf.p[2] - uv_back,
+            };
+            bytefn(dav1d_cdef_brow)(f, p_up, f->lf.prev_mask_ptr,
+                                    first_row - 2, first_row);
+        }
+        // Every superblock row except the last stops two block rows short.
+        const int is_last_sbrow = sby + 1 >= f->sbh;
+        const int end_row =
+            first_row + (is_last_sbrow ? sb_step : sb_step - 2);
+        bytefn(dav1d_cdef_brow)(f, f->lf.p, f->lf.mask_ptr, first_row,
+                                imin(end_row, f->bh));
+    }
+
+    if (f->seq_hdr.restoration)
+        bytefn(dav1d_lr_sbrow)(f, f->lf.p, sby);
+
+    // Step the plane pointers down by one superblock row.
+    const ptrdiff_t uv_advance = (sb_step * 4 * uv_stride) >> ss_ver;
+    f->lf.p[0] += sb_step * 4 * y_stride;
+    for (int pl = 1; pl <= 2; pl++)
+        f->lf.p[pl] += uv_advance;
+
+    // The mask pointer moves on every row for 128x128 superblocks and on
+    // every odd row otherwise.
+    f->lf.prev_mask_ptr = f->lf.mask_ptr;
+    if ((sby & 1) || f->seq_hdr.sb128)
+        f->lf.mask_ptr += f->sb128w;
+}
+
+/*
+ * Copy the last pixel row of the current superblock row, restricted to the
+ * columns of the current tile, from each plane of the current picture into
+ * f->ipred_edge[]. The intra edge preparation code reads these saved rows
+ * back as the top edge for the first block row of the next superblock row.
+ */
+void bytefn(dav1d_backup_ipred_edge)(Dav1dTileContext *const t) {
+    const Dav1dFrameContext *const f = t->f;
+    const Dav1dTileState *const ts = t->ts;
+    const int col_start = ts->tiling.col_start;
+    const int n_cols = ts->tiling.col_end - col_start;
+    // start of this superblock row's slot inside each ipred_edge[] buffer
+    const int edge_row_off = f->sb128w * 128 * (t->by >> f->sb_shift);
+    // first block row of the next superblock row
+    const int next_sb_by = t->by + f->sb_step;
+
+    // luma
+    const pixel *const y_plane = (const pixel *) f->cur.p.data[0];
+    const ptrdiff_t y_row_off =
+        (next_sb_by * 4 - 1) * PXSTRIDE(f->cur.p.stride[0]);
+    pixel_copy(&f->ipred_edge[0][edge_row_off + col_start * 4],
+               &y_plane[col_start * 4 + y_row_off],
+               4 * n_cols);
+
+    if (f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I400)
+        return;
+
+    // chroma: same row/column selection, scaled by the subsampling factors
+    const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+    const int uv_col_start = col_start * 4 >> ss_hor;
+    const ptrdiff_t uv_src_off = uv_col_start +
+        ((next_sb_by * 4 >> ss_ver) - 1) * PXSTRIDE(f->cur.p.stride[1]);
+
+    for (int pl = 1; pl <= 2; pl++) {
+        const pixel *const uv_plane = (const pixel *) f->cur.p.data[pl];
+        pixel_copy(&f->ipred_edge[pl][edge_row_off + uv_col_start],
+                   &uv_plane[uv_src_off],
+                   4 * n_cols >> ss_hor);
+    }
+}