shithub: dav1d

ref: 2e3fd4d6c9fe649a032218093e50e55577528dde
parent: dfa24594cbd5c2e8dee253abf7bd04edc3bb0612
parent: 802790f181a30f02d93aa83ae364f81b341c9b4a
author: Sigrid Haflínudóttir <[email protected]>
date: Mon Dec 14 14:22:44 EST 2020

Merge remote-tracking branch 'upstream/master'

--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -464,9 +464,12 @@
                       -Dtestdata_tests=true
                       -Dlogging=false
                       -Db_sanitize=address
-                      -Denable_asm=false
         - ninja -C build
-        - cd build && time meson test -v --setup=sanitizer
+        - cd build
+        - exit_code=0
+        - time meson test -v --setup=sanitizer --test-args "--cpumask 0" || exit_code=$((exit_code + $?))
+        - time meson test -v --setup=sanitizer --test-args "--cpumask 0xff" || exit_code=$((exit_code + $?))
+        - if [ $exit_code -ne 0 ]; then exit $exit_code; fi
 
 test-debian-msan:
     extends:
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,18 @@
+Changes for 0.8.0 'Eurasian hobby':
+-----------------------------------
+
+0.8.0 is a major update for dav1d:
+ - Improved performance by using a picture buffer pool;
+   the improvements can reach 10% in some cases on Windows.
+ - Support for Apple ARM Silicon
+ - ARM32 optimizations for 8bit bitdepth for ipred paeth, smooth, cfl
+ - ARM32 optimizations for 10/12/16bit bitdepth for mc_avg/mask/w_avg,
+   put/prep 8tap/bilin, wiener and CDEF filters
+ - ARM64 optimizations for cfl_ac 444 for all bitdepths
+ - x86 optimizations for MC 8-tap, mc_scaled in AVX2
+ - x86 optimizations for CDEF in SSE and {put/prep}_{8tap/bilin} in SSSE3
+
+
 Changes for 0.7.1 'Frigatebird':
 ------------------------------
 
--- a/THANKS.md
+++ b/THANKS.md
@@ -16,13 +16,16 @@
 
 And all the dav1d Authors (git shortlog -sn), including:
 
-Janne Grunau, Ronald S. Bultje, Martin Storsjö, Henrik Gramner, James Almer,
-Marvin Scholz, Luc Trudeau, Jean-Baptiste Kempf, Victorien Le Couviour--Tuffet,
-David Michael Barr, Hugo Beauzée-Luyssen, Steve Lhomme, Nathan E. Egge,
-Francois Cartegnie, Konstantin Pavlov, Liwei Wang, Xuefeng Jiang,
-Derek Buitenhuis, Raphaël Zumer, Niklas Haas, Michael Bradshaw, Kyle Siefring,
-Raphael Zumer, Boyuan Xiao, Thierry Foucu, Matthias Dressel, Thomas Daede,
-Rupert Swarbrick, Jan Beich, Dale Curtis, SmilingWolf, Tristan Laurent,
-Vittorio Giovara, Rostislav Pehlivanov, Shiz, skal, Steinar Midtskogen,
-Luca Barbato, Justin Bull, Jean-Yves Avenard, Timo Gurr, Fred Barbier,
-Anisse Astier, Pablo Stebler, Nicolas Frattaroli, Mark Shuttleworth.
+Martin Storsjö, Janne Grunau, Henrik Gramner, Ronald S. Bultje, James Almer,
+Marvin Scholz, Luc Trudeau, Victorien Le Couviour--Tuffet, Jean-Baptiste Kempf,
+Hugo Beauzée-Luyssen, Matthias Dressel, Konstantin Pavlov, David Michael Barr,
+Steve Lhomme, Niklas Haas, B Krishnan Iyer, Francois Cartegnie, Liwei Wang,
+Nathan E. Egge, Derek Buitenhuis, Michael Bradshaw, Raphaël Zumer,
+Xuefeng Jiang, Luca Barbato, Jan Beich, Wan-Teh Chang, Justin Bull, Boyuan Xiao,
+Dale Curtis, Kyle Siefring, Raphael Zumer, Rupert Swarbrick, Thierry Foucu,
+Thomas Daede, Colin Lee, Emmanuel Gil Peyrot, Lynne, Michail Alvanos,
+Nico Weber, SmilingWolf, Tristan Laurent, Vittorio Giovara, Anisse Astier,
+Dmitriy Sychov, Ewout ter Hoeven, Fred Barbier, Jean-Yves Avenard,
+Mark Shuttleworth, Matthieu Bouron, Nicolas Frattaroli, Pablo Stebler,
+Rostislav Pehlivanov, Shiz, Steinar Midtskogen, Sylvestre Ledru, Timo Gurr,
+Tristan Matthews, Xavier Claessens, Xu Guangxin, kossh1 and skal.
--- a/include/common/mem.h
+++ /dev/null
@@ -1,84 +1,0 @@
-/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- *    list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef DAV1D_COMMON_MEM_H
-#define DAV1D_COMMON_MEM_H
-
-#include <stdlib.h>
-
-#if defined(HAVE_ALIGNED_MALLOC) || defined(HAVE_MEMALIGN)
-#include <malloc.h>
-#endif
-
-#include "common/attributes.h"
-
-/*
- * Allocate align-byte aligned memory. The return value can be released
- * by calling the dav1d_free_aligned() function.
- */
-static inline void *dav1d_alloc_aligned(size_t sz, size_t align) {
-    assert(!(align & (align - 1)));
-#ifdef HAVE_POSIX_MEMALIGN
-    void *ptr;
-    if (posix_memalign(&ptr, align, sz)) return NULL;
-    return ptr;
-#elif defined(HAVE_ALIGNED_MALLOC)
-    return _aligned_malloc(sz, align);
-#elif defined(HAVE_MEMALIGN)
-    return memalign(align, sz);
-#else
-#error Missing aligned alloc implementation
-#endif
-}
-
-static inline void dav1d_free_aligned(void* ptr) {
-#ifdef HAVE_POSIX_MEMALIGN
-    free(ptr);
-#elif defined(HAVE_ALIGNED_MALLOC)
-    _aligned_free(ptr);
-#elif defined(HAVE_MEMALIGN)
-    free(ptr);
-#endif
-}
-
-static inline void dav1d_freep_aligned(void* ptr) {
-    void **mem = (void **) ptr;
-    if (*mem) {
-        dav1d_free_aligned(*mem);
-        *mem = NULL;
-    }
-}
-
-static inline void freep(void *ptr) {
-    void **mem = (void **) ptr;
-    if (*mem) {
-        free(*mem);
-        *mem = NULL;
-    }
-}
-
-#endif /* DAV1D_COMMON_MEM_H */
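The header removed above declared dav1d's aligned-allocation helpers. As a hedged illustration (not code from this patch; the buffer size and 64-byte alignment are chosen only for the example, and it assumes the helpers remain visible through their relocated header), a caller pairs them like this:

    #include <stddef.h>
    #include <stdint.h>

    /* Illustrative only: relies on dav1d_alloc_aligned()/dav1d_freep_aligned()
     * as shown in the removed header above. */
    static int16_t *alloc_scratch(const size_t n_coefs) {
        /* 64-byte alignment keeps the buffer cache-line and SIMD friendly. */
        int16_t *const buf = dav1d_alloc_aligned(n_coefs * sizeof(*buf), 64);
        return buf; /* NULL on allocation failure */
    }

    static void release_scratch(int16_t **const buf) {
        dav1d_freep_aligned(buf); /* frees *buf and resets it to NULL */
    }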
--- a/include/dav1d/meson.build
+++ b/include/dav1d/meson.build
@@ -31,13 +31,13 @@
                                   output: 'version.h',
                                   configuration: version_h_data)
 
-dav1d_api_headers = files(
+dav1d_api_headers = [
     'common.h',
     'data.h',
     'dav1d.h',
     'headers.h',
     'picture.h',
-    )
+]
 
 # install headers
 install_headers(dav1d_api_headers,
--- a/meson.build
+++ b/meson.build
@@ -23,12 +23,12 @@
 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 project('dav1d', ['c'],
-    version: '0.7.1',
+    version: '0.8.0',
     default_options: ['c_std=c99',
                       'warning_level=2',
                       'buildtype=release',
                       'b_ndebug=if-release'],
-    meson_version: '>= 0.47.0')
+    meson_version: '>= 0.49.0')
 
 dav1d_soname_version       = '5.0.0'
 dav1d_api_version_array    = dav1d_soname_version.split('.')
@@ -118,6 +118,17 @@
     thread_compat_dep = declare_dependency(sources : files('src/win32/thread.c'))
 
     rt_dependency = []
+
+    rc_version_array = meson.project_version().split('.')
+    winmod = import('windows')
+    rc_data = configuration_data()
+    rc_data.set('PROJECT_VERSION_MAJOR', rc_version_array[0])
+    rc_data.set('PROJECT_VERSION_MINOR', rc_version_array[1])
+    rc_data.set('PROJECT_VERSION_REVISION', rc_version_array[2])
+    rc_data.set('API_VERSION_MAJOR', dav1d_api_version_major)
+    rc_data.set('API_VERSION_MINOR', dav1d_api_version_minor)
+    rc_data.set('API_VERSION_REVISION', dav1d_api_version_revision)
+    rc_data.set('COPYRIGHT_YEARS', '2020')
 else
     thread_dependency = dependency('threads')
     thread_compat_dep = []
@@ -227,7 +238,7 @@
 # Compiler flags that should be set,
 # but which are not an error and are silently tolerated
 # when the compiler does not support them
-if cc.get_id() != 'msvc'
+if cc.get_argument_syntax() != 'msvc'
     optional_arguments += [
       '-Wundef',
       '-Werror=vla',
@@ -426,6 +437,28 @@
         ])
 endif
 
+use_gaspp = false
+if (is_asm_enabled and
+    (host_machine.cpu_family() == 'aarch64' or
+     host_machine.cpu_family().startswith('arm')) and
+    cc.get_argument_syntax() == 'msvc')
+    gaspp = find_program('gas-preprocessor.pl')
+    use_gaspp = true
+    gaspp_gen = generator(gaspp,
+        output: '@[email protected]',
+        arguments: [
+            '-as-type', 'armasm',
+            '-arch', host_machine.cpu_family(),
+            '--',
+            host_machine.cpu_family() == 'aarch64' ? 'armasm64' : 'armasm',
+            '-nologo',
+            '-I@0@'.format(dav1d_src_root),
+            '-I@0@/'.format(meson.current_build_dir()),
+            '@INPUT@',
+            '-c',
+            '-o', '@OUTPUT@'
+        ])
+endif
 
 # Generate config.h
 config_h_target = configure_file(output: 'config.h', configuration: cdata)
--- a/src/arm/32/cdef.S
+++ b/src/arm/32/cdef.S
@@ -27,6 +27,7 @@
 
 #include "src/arm/asm.S"
 #include "util.S"
+#include "cdef_tmpl.S"
 
 // n1 = s0/d0
 // w1 = d0/q0
@@ -190,11 +191,9 @@
         beq             1f
         // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
 0:
-        ldrh            r12, [r3], #2
-        vldr            \n1, [r1]
-        vdup.16         d2,  r12
+        vld1.16         {d2[]}, [r3, :16]!
         ldrh            r12, [r1, #\w]
-        add             r1,  r1,  r2
+        load_n_incr     d0,  r1,  r2,  \w
         subs            r5,  r5,  #1
         vmov.16         d2[1], r12
         vmovl.u8        q0,  d0
@@ -207,9 +206,8 @@
         b               3f
 1:
         // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
-        ldrh            r12, [r3], #2
+        vld1.16         {d2[]}, [r3, :16]!
         load_n_incr     d0,  r1,  r2,  \w
-        vdup.16         d2,  r12
         subs            r5,  r5,  #1
         vmovl.u8        q0,  d0
         vmovl.u8        q1,  d2
@@ -327,231 +325,13 @@
 padding_func_edged 8, 16, d0, 64
 padding_func_edged 4, 8,  s0, 32
 
-.macro dir_table w, stride
-const directions\w
-        .byte           -1 * \stride + 1, -2 * \stride + 2
-        .byte            0 * \stride + 1, -1 * \stride + 2
-        .byte            0 * \stride + 1,  0 * \stride + 2
-        .byte            0 * \stride + 1,  1 * \stride + 2
-        .byte            1 * \stride + 1,  2 * \stride + 2
-        .byte            1 * \stride + 0,  2 * \stride + 1
-        .byte            1 * \stride + 0,  2 * \stride + 0
-        .byte            1 * \stride + 0,  2 * \stride - 1
-// Repeated, to avoid & 7
-        .byte           -1 * \stride + 1, -2 * \stride + 2
-        .byte            0 * \stride + 1, -1 * \stride + 2
-        .byte            0 * \stride + 1,  0 * \stride + 2
-        .byte            0 * \stride + 1,  1 * \stride + 2
-        .byte            1 * \stride + 1,  2 * \stride + 2
-        .byte            1 * \stride + 0,  2 * \stride + 1
-endconst
-.endm
+tables
 
-dir_table 8, 16
-dir_table 4, 8
+filter 8, 8
+filter 4, 8
 
-const pri_taps
-        .byte           4, 2, 3, 3
-endconst
+find_dir 8
 
-.macro load_px d11, d12, d21, d22, w
-.if \w == 8
-        add             r6,  r2,  r9, lsl #1 // x + off
-        sub             r9,  r2,  r9, lsl #1 // x - off
-        vld1.16         {\d11,\d12}, [r6]    // p0
-        vld1.16         {\d21,\d22}, [r9]    // p1
-.else
-        add             r6,  r2,  r9, lsl #1 // x + off
-        sub             r9,  r2,  r9, lsl #1 // x - off
-        vld1.16         {\d11}, [r6]         // p0
-        add             r6,  r6,  #2*8       // += stride
-        vld1.16         {\d21}, [r9]         // p1
-        add             r9,  r9,  #2*8       // += stride
-        vld1.16         {\d12}, [r6]         // p0
-        vld1.16         {\d22}, [r9]         // p1
-.endif
-.endm
-.macro handle_pixel s1, s2, thresh_vec, shift, tap, min
-.if \min
-        vmin.u16        q2,  q2,  \s1
-        vmax.s16        q3,  q3,  \s1
-        vmin.u16        q2,  q2,  \s2
-        vmax.s16        q3,  q3,  \s2
-.endif
-        vabd.u16        q8,  q0,  \s1        // abs(diff)
-        vabd.u16        q11, q0,  \s2        // abs(diff)
-        vshl.u16        q9,  q8,  \shift     // abs(diff) >> shift
-        vshl.u16        q12, q11, \shift     // abs(diff) >> shift
-        vqsub.u16       q9,  \thresh_vec, q9 // clip = imax(0, threshold - (abs(diff) >> shift))
-        vqsub.u16       q12, \thresh_vec, q12// clip = imax(0, threshold - (abs(diff) >> shift))
-        vsub.i16        q10, \s1, q0         // diff = p0 - px
-        vsub.i16        q13, \s2, q0         // diff = p1 - px
-        vneg.s16        q8,  q9              // -clip
-        vneg.s16        q11, q12             // -clip
-        vmin.s16        q10, q10, q9         // imin(diff, clip)
-        vmin.s16        q13, q13, q12        // imin(diff, clip)
-        vdup.16         q9,  \tap            // taps[k]
-        vmax.s16        q10, q10, q8         // constrain() = imax(imin(diff, clip), -clip)
-        vmax.s16        q13, q13, q11        // constrain() = imax(imin(diff, clip), -clip)
-        vmla.i16        q1,  q10, q9         // sum += taps[k] * constrain()
-        vmla.i16        q1,  q13, q9         // sum += taps[k] * constrain()
-.endm
-
-// void dav1d_cdef_filterX_8bpc_neon(pixel *dst, ptrdiff_t dst_stride,
-//                                   const uint16_t *tmp, int pri_strength,
-//                                   int sec_strength, int dir, int damping,
-//                                   int h, size_t edges);
-.macro filter_func w, pri, sec, min, suffix
-function cdef_filter\w\suffix\()_neon
-        cmp             r8,  #0xf
-        beq             cdef_filter\w\suffix\()_edged_neon
-.if \pri
-        movrel_local    r8,  pri_taps
-        and             r9,  r3,  #1
-        add             r8,  r8,  r9, lsl #1
-.endif
-        movrel_local    r9,  directions\w
-        add             r5,  r9,  r5, lsl #1
-        vmov.u16        d17, #15
-        vdup.16         d16, r6              // damping
-
-.if \pri
-        vdup.16         q5,  r3              // threshold
-.endif
-.if \sec
-        vdup.16         q7,  r4              // threshold
-.endif
-        vmov.16         d8[0], r3
-        vmov.16         d8[1], r4
-        vclz.i16        d8,  d8              // clz(threshold)
-        vsub.i16        d8,  d17, d8         // ulog2(threshold)
-        vqsub.u16       d8,  d16, d8         // shift = imax(0, damping - ulog2(threshold))
-        vneg.s16        d8,  d8              // -shift
-.if \sec
-        vdup.16         q6,  d8[1]
-.endif
-.if \pri
-        vdup.16         q4,  d8[0]
-.endif
-
-1:
-.if \w == 8
-        vld1.16         {q0},  [r2, :128]    // px
-.else
-        add             r12, r2,  #2*8
-        vld1.16         {d0},  [r2,  :64]    // px
-        vld1.16         {d1},  [r12, :64]    // px
-.endif
-
-        vmov.u16        q1,  #0              // sum
-.if \min
-        vmov.u16        q2,  q0              // min
-        vmov.u16        q3,  q0              // max
-.endif
-
-        // Instead of loading sec_taps 2, 1 from memory, just set it
-        // to 2 initially and decrease for the second round.
-        // This is also used as loop counter.
-        mov             lr,  #2              // sec_taps[0]
-
-2:
-.if \pri
-        ldrsb           r9,  [r5]            // off1
-
-        load_px         d28, d29, d30, d31, \w
-.endif
-
-.if \sec
-        add             r5,  r5,  #4         // +2*2
-        ldrsb           r9,  [r5]            // off2
-.endif
-
-.if \pri
-        ldrb            r12, [r8]            // *pri_taps
-
-        handle_pixel    q14, q15, q5,  q4,  r12, \min
-.endif
-
-.if \sec
-        load_px         d28, d29, d30, d31, \w
-
-        add             r5,  r5,  #8         // +2*4
-        ldrsb           r9,  [r5]            // off3
-
-        handle_pixel    q14, q15, q7,  q6,  lr, \min
-
-        load_px         d28, d29, d30, d31, \w
-
-        handle_pixel    q14, q15, q7,  q6,  lr, \min
-
-        sub             r5,  r5,  #11        // r5 -= 2*(2+4); r5 += 1;
-.else
-        add             r5,  r5,  #1         // r5 += 1
-.endif
-        subs            lr,  lr,  #1         // sec_tap-- (value)
-.if \pri
-        add             r8,  r8,  #1         // pri_taps++ (pointer)
-.endif
-        bne             2b
-
-        vshr.s16        q14, q1,  #15        // -(sum < 0)
-        vadd.i16        q1,  q1,  q14        // sum - (sum < 0)
-        vrshr.s16       q1,  q1,  #4         // (8 + sum - (sum < 0)) >> 4
-        vadd.i16        q0,  q0,  q1         // px + (8 + sum ...) >> 4
-.if \min
-        vmin.s16        q0,  q0,  q3
-        vmax.s16        q0,  q0,  q2         // iclip(px + .., min, max)
-.endif
-        vmovn.u16       d0,  q0
-.if \w == 8
-        add             r2,  r2,  #2*16      // tmp += tmp_stride
-        subs            r7,  r7,  #1         // h--
-        vst1.8          {d0}, [r0, :64], r1
-.else
-        vst1.32         {d0[0]}, [r0, :32], r1
-        add             r2,  r2,  #2*16      // tmp += 2*tmp_stride
-        subs            r7,  r7,  #2         // h -= 2
-        vst1.32         {d0[1]}, [r0, :32], r1
-.endif
-
-        // Reset pri_taps and directions back to the original point
-        sub             r5,  r5,  #2
-.if \pri
-        sub             r8,  r8,  #2
-.endif
-
-        bgt             1b
-        vpop            {q4-q7}
-        pop             {r4-r9,pc}
-endfunc
-.endm
-
-.macro filter w
-filter_func \w, pri=1, sec=0, min=0, suffix=_pri
-filter_func \w, pri=0, sec=1, min=0, suffix=_sec
-filter_func \w, pri=1, sec=1, min=1, suffix=_pri_sec
-
-function cdef_filter\w\()_8bpc_neon, export=1
-        push            {r4-r9,lr}
-        vpush           {q4-q7}
-        ldrd            r4,  r5,  [sp, #92]
-        ldrd            r6,  r7,  [sp, #100]
-        ldr             r8,  [sp, #108]
-        cmp             r3,  #0 // pri_strength
-        bne             1f
-        b               cdef_filter\w\()_sec_neon // only sec
-1:
-        cmp             r4,  #0 // sec_strength
-        bne             1f
-        b               cdef_filter\w\()_pri_neon // only pri
-1:
-        b               cdef_filter\w\()_pri_sec_neon // both pri and sec
-endfunc
-.endm
-
-filter 8
-filter 4
-
 .macro load_px_8 d11, d12, d21, d22, w
 .if \w == 8
         add             r6,  r2,  r9         // x + off
@@ -756,219 +536,3 @@
 
 filter_8 8
 filter_8 4
-
-const div_table, align=4
-        .short         840, 420, 280, 210, 168, 140, 120, 105
-endconst
-
-const alt_fact, align=4
-        .short         420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0
-endconst
-
-// int dav1d_cdef_find_dir_8bpc_neon(const pixel *img, const ptrdiff_t stride,
-//                                   unsigned *const var)
-function cdef_find_dir_8bpc_neon, export=1
-        push            {lr}
-        vpush           {q4-q7}
-        sub             sp,  sp,  #32          // cost
-        mov             r3,  #8
-        vmov.u16        q1,  #0                // q0-q1   sum_diag[0]
-        vmov.u16        q3,  #0                // q2-q3   sum_diag[1]
-        vmov.u16        q5,  #0                // q4-q5   sum_hv[0-1]
-        vmov.u16        q8,  #0                // q6,d16  sum_alt[0]
-                                               // q7,d17  sum_alt[1]
-        vmov.u16        q9,  #0                // q9,d22  sum_alt[2]
-        vmov.u16        q11, #0
-        vmov.u16        q10, #0                // q10,d23 sum_alt[3]
-
-
-.irpc i, 01234567
-        vld1.8          {d30}, [r0, :64], r1
-        vmov.u8         d31, #128
-        vsubl.u8        q15, d30, d31          // img[x] - 128
-        vmov.u16        q14, #0
-
-.if \i == 0
-        vmov            q0,  q15               // sum_diag[0]
-.else
-        vext.8          q12, q14, q15, #(16-2*\i)
-        vext.8          q13, q15, q14, #(16-2*\i)
-        vadd.i16        q0,  q0,  q12          // sum_diag[0]
-        vadd.i16        q1,  q1,  q13          // sum_diag[0]
-.endif
-        vrev64.16       q13, q15
-        vswp            d26, d27               // [-x]
-.if \i == 0
-        vmov            q2,  q13               // sum_diag[1]
-.else
-        vext.8          q12, q14, q13, #(16-2*\i)
-        vext.8          q13, q13, q14, #(16-2*\i)
-        vadd.i16        q2,  q2,  q12          // sum_diag[1]
-        vadd.i16        q3,  q3,  q13          // sum_diag[1]
-.endif
-
-        vpadd.u16       d26, d30, d31          // [(x >> 1)]
-        vmov.u16        d27, #0
-        vpadd.u16       d24, d26, d28
-        vpadd.u16       d24, d24, d28          // [y]
-        vmov.u16        r12, d24[0]
-        vadd.i16        q5,  q5,  q15          // sum_hv[1]
-.if \i < 4
-        vmov.16         d8[\i],   r12          // sum_hv[0]
-.else
-        vmov.16         d9[\i-4], r12          // sum_hv[0]
-.endif
-
-.if \i == 0
-        vmov.u16        q6,  q13               // sum_alt[0]
-.else
-        vext.8          q12, q14, q13, #(16-2*\i)
-        vext.8          q14, q13, q14, #(16-2*\i)
-        vadd.i16        q6,  q6,  q12          // sum_alt[0]
-        vadd.i16        d16, d16, d28          // sum_alt[0]
-.endif
-        vrev64.16       d26, d26               // [-(x >> 1)]
-        vmov.u16        q14, #0
-.if \i == 0
-        vmov            q7,  q13               // sum_alt[1]
-.else
-        vext.8          q12, q14, q13, #(16-2*\i)
-        vext.8          q13, q13, q14, #(16-2*\i)
-        vadd.i16        q7,  q7,  q12          // sum_alt[1]
-        vadd.i16        d17, d17, d26          // sum_alt[1]
-.endif
-
-.if \i < 6
-        vext.8          q12, q14, q15, #(16-2*(3-(\i/2)))
-        vext.8          q13, q15, q14, #(16-2*(3-(\i/2)))
-        vadd.i16        q9,  q9,  q12          // sum_alt[2]
-        vadd.i16        d22, d22, d26          // sum_alt[2]
-.else
-        vadd.i16        q9,  q9,  q15          // sum_alt[2]
-.endif
-.if \i == 0
-        vmov            q10, q15               // sum_alt[3]
-.elseif \i == 1
-        vadd.i16        q10, q10, q15          // sum_alt[3]
-.else
-        vext.8          q12, q14, q15, #(16-2*(\i/2))
-        vext.8          q13, q15, q14, #(16-2*(\i/2))
-        vadd.i16        q10, q10, q12          // sum_alt[3]
-        vadd.i16        d23, d23, d26          // sum_alt[3]
-.endif
-.endr
-
-        vmov.u32        q15, #105
-
-        vmull.s16       q12, d8,  d8           // sum_hv[0]*sum_hv[0]
-        vmlal.s16       q12, d9,  d9
-        vmull.s16       q13, d10, d10          // sum_hv[1]*sum_hv[1]
-        vmlal.s16       q13, d11, d11
-        vadd.s32        d8,  d24, d25
-        vadd.s32        d9,  d26, d27
-        vpadd.s32       d8,  d8,  d9           // cost[2,6] (s16, s17)
-        vmul.i32        d8,  d8,  d30          // cost[2,6] *= 105
-
-        vrev64.16       q1,  q1
-        vrev64.16       q3,  q3
-        vext.8          q1,  q1,  q1,  #10     // sum_diag[0][14-n]
-        vext.8          q3,  q3,  q3,  #10     // sum_diag[1][14-n]
-
-        vstr            s16, [sp, #2*4]        // cost[2]
-        vstr            s17, [sp, #6*4]        // cost[6]
-
-        movrel_local    r12, div_table
-        vld1.16         {q14}, [r12, :128]
-
-        vmull.s16       q5,  d0,  d0           // sum_diag[0]*sum_diag[0]
-        vmull.s16       q12, d1,  d1
-        vmlal.s16       q5,  d2,  d2
-        vmlal.s16       q12, d3,  d3
-        vmull.s16       q0,  d4,  d4           // sum_diag[1]*sum_diag[1]
-        vmull.s16       q1,  d5,  d5
-        vmlal.s16       q0,  d6,  d6
-        vmlal.s16       q1,  d7,  d7
-        vmovl.u16       q13, d28               // div_table
-        vmovl.u16       q14, d29
-        vmul.i32        q5,  q5,  q13          // cost[0]
-        vmla.i32        q5,  q12, q14
-        vmul.i32        q0,  q0,  q13          // cost[4]
-        vmla.i32        q0,  q1,  q14
-        vadd.i32        d10, d10, d11
-        vadd.i32        d0,  d0,  d1
-        vpadd.i32       d0,  d10, d0           // cost[0,4] = s0,s1
-
-        movrel_local    r12, alt_fact
-        vld1.16         {d29, d30, d31}, [r12, :64] // div_table[2*m+1] + 105
-
-        vstr            s0,  [sp, #0*4]        // cost[0]
-        vstr            s1,  [sp, #4*4]        // cost[4]
-
-        vmovl.u16       q13, d29               // div_table[2*m+1] + 105
-        vmovl.u16       q14, d30
-        vmovl.u16       q15, d31
-
-.macro cost_alt dest, s1, s2, s3, s4, s5, s6
-        vmull.s16       q1,  \s1, \s1          // sum_alt[n]*sum_alt[n]
-        vmull.s16       q2,  \s2, \s2
-        vmull.s16       q3,  \s3, \s3
-        vmull.s16       q5,  \s4, \s4          // sum_alt[n]*sum_alt[n]
-        vmull.s16       q12, \s5, \s5
-        vmull.s16       q6,  \s6, \s6          // q6 overlaps the first \s1-\s2 here
-        vmul.i32        q1,  q1,  q13          // sum_alt[n]^2*fact
-        vmla.i32        q1,  q2,  q14
-        vmla.i32        q1,  q3,  q15
-        vmul.i32        q5,  q5,  q13          // sum_alt[n]^2*fact
-        vmla.i32        q5,  q12, q14
-        vmla.i32        q5,  q6,  q15
-        vadd.i32        d2,  d2,  d3
-        vadd.i32        d3,  d10, d11
-        vpadd.i32       \dest, d2, d3          // *cost_ptr
-.endm
-        cost_alt        d14, d12, d13, d16, d14, d15, d17 // cost[1], cost[3]
-        cost_alt        d15, d18, d19, d22, d20, d21, d23 // cost[5], cost[7]
-        vstr            s28, [sp, #1*4]        // cost[1]
-        vstr            s29, [sp, #3*4]        // cost[3]
-
-        mov             r0,  #0                // best_dir
-        vmov.32         r1,  d0[0]             // best_cost
-        mov             r3,  #1                // n
-
-        vstr            s30, [sp, #5*4]        // cost[5]
-        vstr            s31, [sp, #7*4]        // cost[7]
-
-        vmov.32         r12, d14[0]
-
-.macro find_best s1, s2, s3
-.ifnb \s2
-        vmov.32         lr,  \s2
-.endif
-        cmp             r12, r1                // cost[n] > best_cost
-        itt             gt
-        movgt           r0,  r3                // best_dir = n
-        movgt           r1,  r12               // best_cost = cost[n]
-.ifnb \s2
-        add             r3,  r3,  #1           // n++
-        cmp             lr,  r1                // cost[n] > best_cost
-        vmov.32         r12, \s3
-        itt             gt
-        movgt           r0,  r3                // best_dir = n
-        movgt           r1,  lr                // best_cost = cost[n]
-        add             r3,  r3,  #1           // n++
-.endif
-.endm
-        find_best       d14[0], d8[0], d14[1]
-        find_best       d14[1], d0[1], d15[0]
-        find_best       d15[0], d8[1], d15[1]
-        find_best       d15[1]
-
-        eor             r3,  r0,  #4           // best_dir ^4
-        ldr             r12, [sp, r3, lsl #2]
-        sub             r1,  r1,  r12          // best_cost - cost[best_dir ^ 4]
-        lsr             r1,  r1,  #10
-        str             r1,  [r2]              // *var
-
-        add             sp,  sp,  #32
-        vpop            {q4-q7}
-        pop             {pc}
-endfunc
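The function deleted here (re-added below as a bit-depth template in cdef_tmpl.S) ends by picking the strongest direction. A hedged C sketch of that selection step, reconstructed from the asm comments (best_dir, best_cost, cost[best_dir ^ 4], final shift by 10):

    #include <stdint.h>

    /* Pick the direction with the highest cost, then report the contrast
     * against the orthogonal direction, scaled down by 10 bits. */
    static int cdef_select_dir(const uint32_t cost[8], unsigned *const var) {
        int best_dir = 0;
        uint32_t best_cost = cost[0];
        for (int n = 1; n < 8; n++) {
            if (cost[n] > best_cost) {
                best_dir  = n;        /* best_dir = n */
                best_cost = cost[n];  /* best_cost = cost[n] */
            }
        }
        *var = (best_cost - cost[best_dir ^ 4]) >> 10;
        return best_dir;
    }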
--- /dev/null
+++ b/src/arm/32/cdef16.S
@@ -1,0 +1,232 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+#include "cdef_tmpl.S"
+
+// r1 = d0/q0
+// r2 = d2/q1
+.macro pad_top_bot_16 s1, s2, w, stride, r1, r2, align, ret
+        tst             r6,  #1 // CDEF_HAVE_LEFT
+        beq             2f
+        // CDEF_HAVE_LEFT
+        tst             r6,  #2 // CDEF_HAVE_RIGHT
+        beq             1f
+        // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+        vldr            s8,  [\s1, #-4]
+        vld1.16         {\r1}, [\s1, :\align]
+        vldr            s9,  [\s1, #2*\w]
+        vldr            s10, [\s2, #-4]
+        vld1.16         {\r2}, [\s2, :\align]
+        vldr            s11, [\s2, #2*\w]
+        vstr            s8,  [r0, #-4]
+        vst1.16         {\r1}, [r0, :\align]
+        vstr            s9,  [r0, #2*\w]
+        add             r0,  r0,  #2*\stride
+        vstr            s10, [r0, #-4]
+        vst1.16         {\r2}, [r0, :\align]
+        vstr            s11, [r0, #2*\w]
+.if \ret
+        pop             {r4-r7,pc}
+.else
+        add             r0,  r0,  #2*\stride
+        b               3f
+.endif
+
+1:
+        // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+        vldr            s8,  [\s1, #-4]
+        vld1.16         {\r1}, [\s1, :\align]
+        vldr            s9,  [\s2, #-4]
+        vld1.16         {\r2}, [\s2, :\align]
+        vstr            s8,  [r0, #-4]
+        vst1.16         {\r1}, [r0, :\align]
+        vstr            s12, [r0, #2*\w]
+        add             r0,  r0,  #2*\stride
+        vstr            s9,  [r0, #-4]
+        vst1.16         {\r2}, [r0, :\align]
+        vstr            s12, [r0, #2*\w]
+.if \ret
+        pop             {r4-r7,pc}
+.else
+        add             r0,  r0,  #2*\stride
+        b               3f
+.endif
+
+2:
+        // !CDEF_HAVE_LEFT
+        tst             r6,  #2 // CDEF_HAVE_RIGHT
+        beq             1f
+        // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+        vld1.16         {\r1}, [\s1, :\align]
+        vldr            s8,  [\s1, #2*\w]
+        vld1.16         {\r2}, [\s2, :\align]
+        vldr            s9,  [\s2, #2*\w]
+        vstr            s12, [r0, #-4]
+        vst1.16         {\r1}, [r0, :\align]
+        vstr            s8,  [r0, #2*\w]
+        add             r0,  r0,  #2*\stride
+        vstr            s12, [r0, #-4]
+        vst1.16         {\r2}, [r0, :\align]
+        vstr            s9,  [r0, #2*\w]
+.if \ret
+        pop             {r4-r7,pc}
+.else
+        add             r0,  r0,  #2*\stride
+        b               3f
+.endif
+
+1:
+        // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+        vld1.16         {\r1}, [\s1, :\align]
+        vld1.16         {\r2}, [\s2, :\align]
+        vstr            s12, [r0, #-4]
+        vst1.16         {\r1}, [r0, :\align]
+        vstr            s12, [r0, #2*\w]
+        add             r0,  r0,  #2*\stride
+        vstr            s12, [r0, #-4]
+        vst1.16         {\r2}, [r0, :\align]
+        vstr            s12, [r0, #2*\w]
+.if \ret
+        pop             {r4-r7,pc}
+.else
+        add             r0,  r0,  #2*\stride
+.endif
+3:
+.endm
+
+// void dav1d_cdef_paddingX_16bpc_neon(uint16_t *tmp, const pixel *src,
+//                                     ptrdiff_t src_stride, const pixel (*left)[2],
+//                                     const pixel *const top, int h,
+//                                     enum CdefEdgeFlags edges);
+
+// r1 = d0/q0
+// r2 = d2/q1
+.macro padding_func_16 w, stride, r1, r2, align
+function cdef_padding\w\()_16bpc_neon, export=1
+        push            {r4-r7,lr}
+        ldrd            r4,  r5,  [sp, #20]
+        ldr             r6,  [sp, #28]
+        vmov.i16        q3,  #0x8000
+        tst             r6,  #4 // CDEF_HAVE_TOP
+        bne             1f
+        // !CDEF_HAVE_TOP
+        sub             r12, r0,  #2*(2*\stride+2)
+        vmov.i16        q2,  #0x8000
+        vst1.16         {q2,q3}, [r12]!
+.if \w == 8
+        vst1.16         {q2,q3}, [r12]!
+.endif
+        b               3f
+1:
+        // CDEF_HAVE_TOP
+        add             r7,  r4,  r2
+        sub             r0,  r0,  #2*(2*\stride)
+        pad_top_bot_16  r4,  r7,  \w, \stride, \r1, \r2, \align, 0
+
+        // Middle section
+3:
+        tst             r6,  #1 // CDEF_HAVE_LEFT
+        beq             2f
+        // CDEF_HAVE_LEFT
+        tst             r6,  #2 // CDEF_HAVE_RIGHT
+        beq             1f
+        // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+0:
+        vld1.32         {d2[]}, [r3, :32]!
+        vldr            s5,  [r1, #2*\w]
+        vld1.16         {\r1}, [r1, :\align], r2
+        subs            r5,  r5,  #1
+        vstr            s4,  [r0, #-4]
+        vst1.16         {\r1}, [r0, :\align]
+        vstr            s5,  [r0, #2*\w]
+        add             r0,  r0,  #2*\stride
+        bgt             0b
+        b               3f
+1:
+        // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+        vld1.32         {d2[]}, [r3, :32]!
+        vld1.16         {\r1}, [r1, :\align], r2
+        subs            r5,  r5,  #1
+        vstr            s4,  [r0, #-4]
+        vst1.16         {\r1}, [r0, :\align]
+        vstr            s12, [r0, #2*\w]
+        add             r0,  r0,  #2*\stride
+        bgt             1b
+        b               3f
+2:
+        tst             r6,  #2 // CDEF_HAVE_RIGHT
+        beq             1f
+        // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+0:
+        vldr            s4,  [r1, #2*\w]
+        vld1.16         {\r1}, [r1, :\align], r2
+        subs            r5,  r5,  #1
+        vstr            s12, [r0, #-4]
+        vst1.16         {\r1}, [r0, :\align]
+        vstr            s4,  [r0, #2*\w]
+        add             r0,  r0,  #2*\stride
+        bgt             0b
+        b               3f
+1:
+        // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+        vld1.16         {\r1}, [r1, :\align], r2
+        subs            r5,  r5,  #1
+        vstr            s12, [r0, #-4]
+        vst1.16         {\r1}, [r0, :\align]
+        vstr            s12, [r0, #2*\w]
+        add             r0,  r0,  #2*\stride
+        bgt             1b
+
+3:
+        tst             r6,  #8 // CDEF_HAVE_BOTTOM
+        bne             1f
+        // !CDEF_HAVE_BOTTOM
+        sub             r12, r0,  #4
+        vmov.i16        q2,  #0x8000
+        vst1.16         {q2,q3}, [r12]!
+.if \w == 8
+        vst1.16         {q2,q3}, [r12]!
+.endif
+        pop             {r4-r7,pc}
+1:
+        // CDEF_HAVE_BOTTOM
+        add             r7,  r1,  r2
+        pad_top_bot_16  r1,  r7,  \w, \stride, \r1, \r2, \align, 1
+endfunc
+.endm
+
+padding_func_16 8, 16, q0, q1, 128
+padding_func_16 4, 8,  d0, d2, 64
+
+tables
+
+filter 8, 16
+filter 4, 16
+
+find_dir 16
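The new 16 bpc padding functions mirror the 8 bpc ones: positions that are unavailable according to the edge flags are filled with the 0x8000 sentinel so the filter can recognise missing pixels. A conceptual, hedged C sketch of that edge handling (the asm pre-fills a temporary buffer row by row rather than testing per pixel):

    #include <stddef.h>
    #include <stdint.h>

    /* The flag bits match the tst #1/#2/#4/#8 checks in the asm above. */
    enum CdefEdgeFlags {
        CDEF_HAVE_LEFT   = 1 << 0,
        CDEF_HAVE_RIGHT  = 1 << 1,
        CDEF_HAVE_TOP    = 1 << 2,
        CDEF_HAVE_BOTTOM = 1 << 3
    };

    /* Conceptual sketch only: any pixel outside the available edges reads
     * back as the 0x8000 sentinel instead of real picture data. */
    static uint16_t padded_px(const uint16_t *const src, const int x, const int y,
                              const int w, const int h, const ptrdiff_t stride,
                              const enum CdefEdgeFlags edges) {
        if ((x < 0  && !(edges & CDEF_HAVE_LEFT))  ||
            (x >= w && !(edges & CDEF_HAVE_RIGHT)) ||
            (y < 0  && !(edges & CDEF_HAVE_TOP))   ||
            (y >= h && !(edges & CDEF_HAVE_BOTTOM)))
            return 0x8000;
        return src[y * stride + x];
    }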
--- /dev/null
+++ b/src/arm/32/cdef_tmpl.S
@@ -1,0 +1,515 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+.macro dir_table w, stride
+const directions\w
+        .byte           -1 * \stride + 1, -2 * \stride + 2
+        .byte            0 * \stride + 1, -1 * \stride + 2
+        .byte            0 * \stride + 1,  0 * \stride + 2
+        .byte            0 * \stride + 1,  1 * \stride + 2
+        .byte            1 * \stride + 1,  2 * \stride + 2
+        .byte            1 * \stride + 0,  2 * \stride + 1
+        .byte            1 * \stride + 0,  2 * \stride + 0
+        .byte            1 * \stride + 0,  2 * \stride - 1
+// Repeated, to avoid & 7
+        .byte           -1 * \stride + 1, -2 * \stride + 2
+        .byte            0 * \stride + 1, -1 * \stride + 2
+        .byte            0 * \stride + 1,  0 * \stride + 2
+        .byte            0 * \stride + 1,  1 * \stride + 2
+        .byte            1 * \stride + 1,  2 * \stride + 2
+        .byte            1 * \stride + 0,  2 * \stride + 1
+endconst
+.endm
+
+.macro tables
+dir_table 8, 16
+dir_table 4, 8
+
+const pri_taps
+        .byte           4, 2, 3, 3
+endconst
+.endm
+
+.macro load_px d11, d12, d21, d22, w
+.if \w == 8
+        add             r6,  r2,  r9, lsl #1 // x + off
+        sub             r9,  r2,  r9, lsl #1 // x - off
+        vld1.16         {\d11,\d12}, [r6]    // p0
+        vld1.16         {\d21,\d22}, [r9]    // p1
+.else
+        add             r6,  r2,  r9, lsl #1 // x + off
+        sub             r9,  r2,  r9, lsl #1 // x - off
+        vld1.16         {\d11}, [r6]         // p0
+        add             r6,  r6,  #2*8       // += stride
+        vld1.16         {\d21}, [r9]         // p1
+        add             r9,  r9,  #2*8       // += stride
+        vld1.16         {\d12}, [r6]         // p0
+        vld1.16         {\d22}, [r9]         // p1
+.endif
+.endm
+.macro handle_pixel s1, s2, thresh_vec, shift, tap, min
+.if \min
+        vmin.u16        q2,  q2,  \s1
+        vmax.s16        q3,  q3,  \s1
+        vmin.u16        q2,  q2,  \s2
+        vmax.s16        q3,  q3,  \s2
+.endif
+        vabd.u16        q8,  q0,  \s1        // abs(diff)
+        vabd.u16        q11, q0,  \s2        // abs(diff)
+        vshl.u16        q9,  q8,  \shift     // abs(diff) >> shift
+        vshl.u16        q12, q11, \shift     // abs(diff) >> shift
+        vqsub.u16       q9,  \thresh_vec, q9 // clip = imax(0, threshold - (abs(diff) >> shift))
+        vqsub.u16       q12, \thresh_vec, q12// clip = imax(0, threshold - (abs(diff) >> shift))
+        vsub.i16        q10, \s1, q0         // diff = p0 - px
+        vsub.i16        q13, \s2, q0         // diff = p1 - px
+        vneg.s16        q8,  q9              // -clip
+        vneg.s16        q11, q12             // -clip
+        vmin.s16        q10, q10, q9         // imin(diff, clip)
+        vmin.s16        q13, q13, q12        // imin(diff, clip)
+        vdup.16         q9,  \tap            // taps[k]
+        vmax.s16        q10, q10, q8         // constrain() = imax(imin(diff, clip), -clip)
+        vmax.s16        q13, q13, q11        // constrain() = imax(imin(diff, clip), -clip)
+        vmla.i16        q1,  q10, q9         // sum += taps[k] * constrain()
+        vmla.i16        q1,  q13, q9         // sum += taps[k] * constrain()
+.endm
+
+// void dav1d_cdef_filterX_Ybpc_neon(pixel *dst, ptrdiff_t dst_stride,
+//                                   const uint16_t *tmp, int pri_strength,
+//                                   int sec_strength, int dir, int damping,
+//                                   int h, size_t edges);
+.macro filter_func w, bpc, pri, sec, min, suffix
+function cdef_filter\w\suffix\()_\bpc\()bpc_neon
+.if \bpc == 8
+        cmp             r8,  #0xf
+        beq             cdef_filter\w\suffix\()_edged_neon
+.endif
+.if \pri
+.if \bpc == 16
+        clz             r9,  r9
+        sub             r9,  r9,  #24        // -bitdepth_min_8
+        neg             r9,  r9              // bitdepth_min_8
+.endif
+        movrel_local    r8,  pri_taps
+.if \bpc == 16
+        lsr             r9,  r3,  r9         // pri_strength >> bitdepth_min_8
+        and             r9,  r9,  #1         // (pri_strength >> bitdepth_min_8) & 1
+.else
+        and             r9,  r3,  #1
+.endif
+        add             r8,  r8,  r9, lsl #1
+.endif
+        movrel_local    r9,  directions\w
+        add             r5,  r9,  r5, lsl #1
+        vmov.u16        d17, #15
+        vdup.16         d16, r6              // damping
+
+.if \pri
+        vdup.16         q5,  r3              // threshold
+.endif
+.if \sec
+        vdup.16         q7,  r4              // threshold
+.endif
+        vmov.16         d8[0], r3
+        vmov.16         d8[1], r4
+        vclz.i16        d8,  d8              // clz(threshold)
+        vsub.i16        d8,  d17, d8         // ulog2(threshold)
+        vqsub.u16       d8,  d16, d8         // shift = imax(0, damping - ulog2(threshold))
+        vneg.s16        d8,  d8              // -shift
+.if \sec
+        vdup.16         q6,  d8[1]
+.endif
+.if \pri
+        vdup.16         q4,  d8[0]
+.endif
+
+1:
+.if \w == 8
+        vld1.16         {q0},  [r2, :128]    // px
+.else
+        add             r12, r2,  #2*8
+        vld1.16         {d0},  [r2,  :64]    // px
+        vld1.16         {d1},  [r12, :64]    // px
+.endif
+
+        vmov.u16        q1,  #0              // sum
+.if \min
+        vmov.u16        q2,  q0              // min
+        vmov.u16        q3,  q0              // max
+.endif
+
+        // Instead of loading sec_taps 2, 1 from memory, just set it
+        // to 2 initially and decrease for the second round.
+        // This is also used as loop counter.
+        mov             lr,  #2              // sec_taps[0]
+
+2:
+.if \pri
+        ldrsb           r9,  [r5]            // off1
+
+        load_px         d28, d29, d30, d31, \w
+.endif
+
+.if \sec
+        add             r5,  r5,  #4         // +2*2
+        ldrsb           r9,  [r5]            // off2
+.endif
+
+.if \pri
+        ldrb            r12, [r8]            // *pri_taps
+
+        handle_pixel    q14, q15, q5,  q4,  r12, \min
+.endif
+
+.if \sec
+        load_px         d28, d29, d30, d31, \w
+
+        add             r5,  r5,  #8         // +2*4
+        ldrsb           r9,  [r5]            // off3
+
+        handle_pixel    q14, q15, q7,  q6,  lr, \min
+
+        load_px         d28, d29, d30, d31, \w
+
+        handle_pixel    q14, q15, q7,  q6,  lr, \min
+
+        sub             r5,  r5,  #11        // r5 -= 2*(2+4); r5 += 1;
+.else
+        add             r5,  r5,  #1         // r5 += 1
+.endif
+        subs            lr,  lr,  #1         // sec_tap-- (value)
+.if \pri
+        add             r8,  r8,  #1         // pri_taps++ (pointer)
+.endif
+        bne             2b
+
+        vshr.s16        q14, q1,  #15        // -(sum < 0)
+        vadd.i16        q1,  q1,  q14        // sum - (sum < 0)
+        vrshr.s16       q1,  q1,  #4         // (8 + sum - (sum < 0)) >> 4
+        vadd.i16        q0,  q0,  q1         // px + (8 + sum ...) >> 4
+.if \min
+        vmin.s16        q0,  q0,  q3
+        vmax.s16        q0,  q0,  q2         // iclip(px + .., min, max)
+.endif
+.if \bpc == 8
+        vmovn.u16       d0,  q0
+.endif
+.if \w == 8
+        add             r2,  r2,  #2*16      // tmp += tmp_stride
+        subs            r7,  r7,  #1         // h--
+.if \bpc == 8
+        vst1.8          {d0}, [r0, :64], r1
+.else
+        vst1.16         {q0}, [r0, :128], r1
+.endif
+.else
+.if \bpc == 8
+        vst1.32         {d0[0]}, [r0, :32], r1
+.else
+        vst1.16         {d0},    [r0, :64], r1
+.endif
+        add             r2,  r2,  #2*16      // tmp += 2*tmp_stride
+        subs            r7,  r7,  #2         // h -= 2
+.if \bpc == 8
+        vst1.32         {d0[1]}, [r0, :32], r1
+.else
+        vst1.16         {d1},    [r0, :64], r1
+.endif
+.endif
+
+        // Reset pri_taps and directions back to the original point
+        sub             r5,  r5,  #2
+.if \pri
+        sub             r8,  r8,  #2
+.endif
+
+        bgt             1b
+        vpop            {q4-q7}
+        pop             {r4-r9,pc}
+endfunc
+.endm
+
+.macro filter w, bpc
+filter_func \w, \bpc, pri=1, sec=0, min=0, suffix=_pri
+filter_func \w, \bpc, pri=0, sec=1, min=0, suffix=_sec
+filter_func \w, \bpc, pri=1, sec=1, min=1, suffix=_pri_sec
+
+function cdef_filter\w\()_\bpc\()bpc_neon, export=1
+        push            {r4-r9,lr}
+        vpush           {q4-q7}
+        ldrd            r4,  r5,  [sp, #92]
+        ldrd            r6,  r7,  [sp, #100]
+.if \bpc == 16
+        ldrd            r8,  r9,  [sp, #108]
+.else
+        ldr             r8,  [sp, #108]
+.endif
+        cmp             r3,  #0 // pri_strength
+        bne             1f
+        b               cdef_filter\w\()_sec_\bpc\()bpc_neon // only sec
+1:
+        cmp             r4,  #0 // sec_strength
+        bne             1f
+        b               cdef_filter\w\()_pri_\bpc\()bpc_neon // only pri
+1:
+        b               cdef_filter\w\()_pri_sec_\bpc\()bpc_neon // both pri and sec
+endfunc
+.endm
+
+const div_table, align=4
+        .short         840, 420, 280, 210, 168, 140, 120, 105
+endconst
+
+const alt_fact, align=4
+        .short         420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0
+endconst
+
+.macro cost_alt dest, s1, s2, s3, s4, s5, s6
+        vmull.s16       q1,  \s1, \s1          // sum_alt[n]*sum_alt[n]
+        vmull.s16       q2,  \s2, \s2
+        vmull.s16       q3,  \s3, \s3
+        vmull.s16       q5,  \s4, \s4          // sum_alt[n]*sum_alt[n]
+        vmull.s16       q12, \s5, \s5
+        vmull.s16       q6,  \s6, \s6          // q6 overlaps the first \s1-\s2 here
+        vmul.i32        q1,  q1,  q13          // sum_alt[n]^2*fact
+        vmla.i32        q1,  q2,  q14
+        vmla.i32        q1,  q3,  q15
+        vmul.i32        q5,  q5,  q13          // sum_alt[n]^2*fact
+        vmla.i32        q5,  q12, q14
+        vmla.i32        q5,  q6,  q15
+        vadd.i32        d2,  d2,  d3
+        vadd.i32        d3,  d10, d11
+        vpadd.i32       \dest, d2, d3          // *cost_ptr
+.endm
+
+.macro find_best s1, s2, s3
+.ifnb \s2
+        vmov.32         lr,  \s2
+.endif
+        cmp             r12, r1                // cost[n] > best_cost
+        itt             gt
+        movgt           r0,  r3                // best_dir = n
+        movgt           r1,  r12               // best_cost = cost[n]
+.ifnb \s2
+        add             r3,  r3,  #1           // n++
+        cmp             lr,  r1                // cost[n] > best_cost
+        vmov.32         r12, \s3
+        itt             gt
+        movgt           r0,  r3                // best_dir = n
+        movgt           r1,  lr                // best_cost = cost[n]
+        add             r3,  r3,  #1           // n++
+.endif
+.endm
+
+// int dav1d_cdef_find_dir_Xbpc_neon(const pixel *img, const ptrdiff_t stride,
+//                                   unsigned *const var)
+.macro find_dir bpc
+function cdef_find_dir_\bpc\()bpc_neon, export=1
+        push            {lr}
+        vpush           {q4-q7}
+.if \bpc == 16
+        clz             r3,  r3                // clz(bitdepth_max)
+        sub             lr,  r3,  #24          // -bitdepth_min_8
+.endif
+        sub             sp,  sp,  #32          // cost
+        mov             r3,  #8
+        vmov.u16        q1,  #0                // q0-q1   sum_diag[0]
+        vmov.u16        q3,  #0                // q2-q3   sum_diag[1]
+        vmov.u16        q5,  #0                // q4-q5   sum_hv[0-1]
+        vmov.u16        q8,  #0                // q6,d16  sum_alt[0]
+                                               // q7,d17  sum_alt[1]
+        vmov.u16        q9,  #0                // q9,d22  sum_alt[2]
+        vmov.u16        q11, #0
+        vmov.u16        q10, #0                // q10,d23 sum_alt[3]
+
+
+.irpc i, 01234567
+.if \bpc == 8
+        vld1.8          {d30}, [r0, :64], r1
+        vmov.u8         d31, #128
+        vsubl.u8        q15, d30, d31          // img[x] - 128
+.else
+        vld1.16         {q15}, [r0, :128], r1
+        vdup.16         q14, lr                // -bitdepth_min_8
+        vshl.u16        q15, q15, q14
+        vmov.u16        q14, #128
+        vsub.i16        q15, q15, q14          // img[x] - 128
+.endif
+        vmov.u16        q14, #0
+
+.if \i == 0
+        vmov            q0,  q15               // sum_diag[0]
+.else
+        vext.8          q12, q14, q15, #(16-2*\i)
+        vext.8          q13, q15, q14, #(16-2*\i)
+        vadd.i16        q0,  q0,  q12          // sum_diag[0]
+        vadd.i16        q1,  q1,  q13          // sum_diag[0]
+.endif
+        vrev64.16       q13, q15
+        vswp            d26, d27               // [-x]
+.if \i == 0
+        vmov            q2,  q13               // sum_diag[1]
+.else
+        vext.8          q12, q14, q13, #(16-2*\i)
+        vext.8          q13, q13, q14, #(16-2*\i)
+        vadd.i16        q2,  q2,  q12          // sum_diag[1]
+        vadd.i16        q3,  q3,  q13          // sum_diag[1]
+.endif
+
+        vpadd.u16       d26, d30, d31          // [(x >> 1)]
+        vmov.u16        d27, #0
+        vpadd.u16       d24, d26, d28
+        vpadd.u16       d24, d24, d28          // [y]
+        vmov.u16        r12, d24[0]
+        vadd.i16        q5,  q5,  q15          // sum_hv[1]
+.if \i < 4
+        vmov.16         d8[\i],   r12          // sum_hv[0]
+.else
+        vmov.16         d9[\i-4], r12          // sum_hv[0]
+.endif
+
+.if \i == 0
+        vmov.u16        q6,  q13               // sum_alt[0]
+.else
+        vext.8          q12, q14, q13, #(16-2*\i)
+        vext.8          q14, q13, q14, #(16-2*\i)
+        vadd.i16        q6,  q6,  q12          // sum_alt[0]
+        vadd.i16        d16, d16, d28          // sum_alt[0]
+.endif
+        vrev64.16       d26, d26               // [-(x >> 1)]
+        vmov.u16        q14, #0
+.if \i == 0
+        vmov            q7,  q13               // sum_alt[1]
+.else
+        vext.8          q12, q14, q13, #(16-2*\i)
+        vext.8          q13, q13, q14, #(16-2*\i)
+        vadd.i16        q7,  q7,  q12          // sum_alt[1]
+        vadd.i16        d17, d17, d26          // sum_alt[1]
+.endif
+
+.if \i < 6
+        vext.8          q12, q14, q15, #(16-2*(3-(\i/2)))
+        vext.8          q13, q15, q14, #(16-2*(3-(\i/2)))
+        vadd.i16        q9,  q9,  q12          // sum_alt[2]
+        vadd.i16        d22, d22, d26          // sum_alt[2]
+.else
+        vadd.i16        q9,  q9,  q15          // sum_alt[2]
+.endif
+.if \i == 0
+        vmov            q10, q15               // sum_alt[3]
+.elseif \i == 1
+        vadd.i16        q10, q10, q15          // sum_alt[3]
+.else
+        vext.8          q12, q14, q15, #(16-2*(\i/2))
+        vext.8          q13, q15, q14, #(16-2*(\i/2))
+        vadd.i16        q10, q10, q12          // sum_alt[3]
+        vadd.i16        d23, d23, d26          // sum_alt[3]
+.endif
+.endr
+
+        vmov.u32        q15, #105
+
+        vmull.s16       q12, d8,  d8           // sum_hv[0]*sum_hv[0]
+        vmlal.s16       q12, d9,  d9
+        vmull.s16       q13, d10, d10          // sum_hv[1]*sum_hv[1]
+        vmlal.s16       q13, d11, d11
+        vadd.s32        d8,  d24, d25
+        vadd.s32        d9,  d26, d27
+        vpadd.s32       d8,  d8,  d9           // cost[2,6] (s16, s17)
+        vmul.i32        d8,  d8,  d30          // cost[2,6] *= 105
+
+        vrev64.16       q1,  q1
+        vrev64.16       q3,  q3
+        vext.8          q1,  q1,  q1,  #10     // sum_diag[0][14-n]
+        vext.8          q3,  q3,  q3,  #10     // sum_diag[1][14-n]
+
+        vstr            s16, [sp, #2*4]        // cost[2]
+        vstr            s17, [sp, #6*4]        // cost[6]
+
+        movrel_local    r12, div_table
+        vld1.16         {q14}, [r12, :128]
+
+        vmull.s16       q5,  d0,  d0           // sum_diag[0]*sum_diag[0]
+        vmull.s16       q12, d1,  d1
+        vmlal.s16       q5,  d2,  d2
+        vmlal.s16       q12, d3,  d3
+        vmull.s16       q0,  d4,  d4           // sum_diag[1]*sum_diag[1]
+        vmull.s16       q1,  d5,  d5
+        vmlal.s16       q0,  d6,  d6
+        vmlal.s16       q1,  d7,  d7
+        vmovl.u16       q13, d28               // div_table
+        vmovl.u16       q14, d29
+        vmul.i32        q5,  q5,  q13          // cost[0]
+        vmla.i32        q5,  q12, q14
+        vmul.i32        q0,  q0,  q13          // cost[4]
+        vmla.i32        q0,  q1,  q14
+        vadd.i32        d10, d10, d11
+        vadd.i32        d0,  d0,  d1
+        vpadd.i32       d0,  d10, d0           // cost[0,4] = s0,s1
+
+        movrel_local    r12, alt_fact
+        vld1.16         {d29, d30, d31}, [r12, :64] // div_table[2*m+1] + 105
+
+        vstr            s0,  [sp, #0*4]        // cost[0]
+        vstr            s1,  [sp, #4*4]        // cost[4]
+
+        vmovl.u16       q13, d29               // div_table[2*m+1] + 105
+        vmovl.u16       q14, d30
+        vmovl.u16       q15, d31
+
+        cost_alt        d14, d12, d13, d16, d14, d15, d17 // cost[1], cost[3]
+        cost_alt        d15, d18, d19, d22, d20, d21, d23 // cost[5], cost[7]
+        vstr            s28, [sp, #1*4]        // cost[1]
+        vstr            s29, [sp, #3*4]        // cost[3]
+
+        mov             r0,  #0                // best_dir
+        vmov.32         r1,  d0[0]             // best_cost
+        mov             r3,  #1                // n
+
+        vstr            s30, [sp, #5*4]        // cost[5]
+        vstr            s31, [sp, #7*4]        // cost[7]
+
+        vmov.32         r12, d14[0]
+
+        find_best       d14[0], d8[0], d14[1]
+        find_best       d14[1], d0[1], d15[0]
+        find_best       d15[0], d8[1], d15[1]
+        find_best       d15[1]
+
+        eor             r3,  r0,  #4           // best_dir ^4
+        ldr             r12, [sp, r3, lsl #2]
+        sub             r1,  r1,  r12          // best_cost - cost[best_dir ^ 4]
+        lsr             r1,  r1,  #10
+        str             r1,  [r2]              // *var
+
+        add             sp,  sp,  #32
+        vpop            {q4-q7}
+        pop             {pc}
+endfunc
+.endm
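The handle_pixel macro in this new template implements the per-tap CDEF update spelled out in its own comments; in C terms (a sketch matching those comments, not code from the patch):

    #include <stdlib.h>

    static inline int imax(const int a, const int b) { return a > b ? a : b; }
    static inline int imin(const int a, const int b) { return a < b ? a : b; }

    /* Per tap, following the macro comments:
     *   clip      = imax(0, threshold - (abs(diff) >> shift))
     *   constrain = imax(imin(diff, clip), -clip)
     *   sum      += taps[k] * constrain(p - px, threshold, shift)
     */
    static inline int constrain(const int diff, const int threshold, const int shift) {
        const int clip = imax(0, threshold - (abs(diff) >> shift));
        return imax(imin(diff, clip), -clip);
    }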
--- a/src/arm/32/loopfilter.S
+++ b/src/arm/32/loopfilter.S
@@ -515,7 +515,7 @@
         lpf_8_wd8
 
         sub             r10, r0,  r1, lsl #1
-        sub             r10, r10,  r1
+        sub             r10, r10, r1
         vst1.8          {d21}, [r10, :64], r1 // p2
         vst1.8          {d24}, [r0,  :64], r1 // q0
         vst1.8          {d22}, [r10, :64], r1 // p1
@@ -783,11 +783,11 @@
         vld1.8          {d6[]}, [r5]   // sharp[1]
         sub             r5,  r5,  #8
         vbif            d1,  d0,  d3   // if (!l[0][0]) L = l[offset][0]
+        vtst.32         d2,  d1,  d2   // L != 0
         vmul.i32        d1,  d1,  d4   // L
 .ifc \type, y
         vdup.32         d15, r2        // vmask[2]
 .endif
-        vtst.32         d2,  d1,  d2   // L != 0
         vdup.32         d14, r7        // vmask[1]
         vmov            r10, r11, d2
         orrs            r10, r10, r11
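The reordering in the hunk above makes the "L != 0" test operate on the freshly selected filter level before it is scaled. Expressed as a hedged C sketch of the logic described by the comments:

    #include <stdint.h>

    /* The level L is selected first, the zero test is made on that selected
     * value, and only then is L expanded/scaled for the filter decision. */
    static inline int select_filter_level(const uint8_t l00, const uint8_t loff) {
        const int L = l00 ? l00 : loff; /* if (!l[0][0]) L = l[offset][0] */
        if (!L) return 0;               /* L != 0 checked before any scaling */
        return L;
    }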
--- /dev/null
+++ b/src/arm/32/loopfilter16.S
@@ -1,0 +1,860 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
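+// loop_filter generates lpf_4_wd4/6/8/16_neon, each filtering a 4-pixel
+// slice of one edge: p6..p0 in d17-d23 and q0..q6 in d24-d30 (fewer
+// registers for the narrower filters), with the E, I and H thresholds
+// expected in d10, d11 and d12.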
+.macro loop_filter wd
+function lpf_4_wd\wd\()_neon
+        vabd.u16        d0,  d22, d23 // abs(p1 - p0)
+        vabd.u16        d1,  d25, d24 // abs(q1 - q0)
+        vabd.u16        d2,  d23, d24 // abs(p0 - q0)
+        vabd.u16        d3,  d22, d25 // abs(p1 - q1)
+.if \wd >= 6
+        vabd.u16        d4,  d21, d22 // abs(p2 - p1)
+        vabd.u16        d5,  d26, d25 // abs(q2 - q1)
+.endif
+.if \wd >= 8
+        vabd.u16        d6,  d20, d21 // abs(p3 - p2)
+        vabd.u16        d7,  d27, d26 // abs(q3 - q2)
+.endif
+.if \wd >= 6
+        vmax.u16        d4,  d4,  d5
+.endif
+        vqadd.u16       d2,  d2,  d2  // abs(p0 - q0) * 2
+.if \wd >= 8
+        vmax.u16        d6,  d6,  d7
+.endif
+        vshr.u16        d3,  d3,  #1
+.if \wd >= 8
+        vmax.u16        d4,  d4,  d6
+.endif
+        vmax.u16        d0,  d0,  d1  // max(abs(p1 - p0), abs(q1 - q0))
+        vqadd.u16       d2,  d2,  d3  // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
+.if \wd >= 6
+        vmax.u16        d4,  d0,  d4
+        vcge.u16        d1,  d11, d4  // max(abs(p1 - p0), abs(q1 - q0), abs(), abs(), ...) <= I
+.else
+        vcge.u16        d1,  d11, d0  // max(abs(p1 - p0), abs(q1 - q0)) <= I
+.endif
+        vcge.u16        d2,  d10, d2  // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E
+        vand            d1,  d1,  d2  // fm && wd >= 4 (implicit)
+.if \wd >= 6
+        vmov            d14, d1       // fm && wd > 4 (implicit)
+.endif
+.if \wd >= 16
+        vmov            d15, d1       // fm && wd == 16 (implicit)
+.endif
+
+        vmov            r10, r11, d1
+        orrs            r10, r10, r11
+        beq             9f            // if (!fm || wd < 4) return;
+
+.if \wd >= 6
+        vmov.i16        d10, #1
+        vabd.u16        d2,  d21, d23 // abs(p2 - p0)
+        vabd.u16        d3,  d22, d23 // abs(p1 - p0)
+        vabd.u16        d4,  d25, d24 // abs(q1 - q0)
+        vabd.u16        d5,  d26, d24 // abs(q2 - q0)
+        vdup.16         d9,  r9       // bitdepth_min_8
+.if \wd >= 8
+        vabd.u16        d6,  d20, d23 // abs(p3 - p0)
+        vabd.u16        d7,  d27, d24 // abs(q3 - q0)
+.endif
+        vmax.u16        d2,  d2,  d3
+        vmax.u16        d4,  d4,  d5
+.if \wd >= 8
+        vmax.u16        d6,  d6,  d7
+.endif
+        vmax.u16        d2,  d2,  d4
+        vshl.u16        d10, d10, d9  // F = 1 << bitdepth_min_8
+.if \wd >= 8
+        vmax.u16        d2,  d2,  d6
+.endif
+
+.if \wd == 16
+        vabd.u16        d3,  d17, d23 // abs(p6 - p0)
+        vabd.u16        d4,  d18, d23 // abs(p5 - p0)
+        vabd.u16        d5,  d19, d23 // abs(p4 - p0)
+.endif
+        vcge.u16        d2,  d10, d2  // flat8in
+.if \wd == 16
+        vabd.u16        d6,  d28, d24 // abs(q4 - q0)
+        vabd.u16        d7,  d29, d24 // abs(q5 - q0)
+        vabd.u16        d8,  d30, d24 // abs(q6 - q0)
+.endif
+        vand            d14, d2,  d14 // flat8in && fm && wd > 4
+        vbic            d1,  d1,  d14 // fm && wd >= 4 && !flat8in
+.if \wd == 16
+        vmax.u16        d3,  d3,  d4
+        vmax.u16        d5,  d5,  d6
+.endif
+        vmov            r10, r11, d1
+.if \wd == 16
+        vmax.u16        d7,  d7,  d8
+        vmax.u16        d3,  d3,  d5
+        vmax.u16        d3,  d3,  d7
+        vcge.u16        d3,  d10, d3  // flat8out
+.endif
+        orrs            r10, r10, r11
+.if \wd == 16
+        vand            d15, d15, d3  // flat8out && fm && wd == 16
+        vand            d15, d15, d14 // flat8out && flat8in && fm && wd == 16
+        vbic            d14, d14, d15 // flat8in && fm && wd >= 4 && !flat8out
+.endif
+        beq             1f            // skip wd == 4 case
+.endif
+
+        vdup.16         d3,  r8       // bitdepth_max
+        vsub.u16        d2,  d22, d25 // p1 - q1
+        vshr.u16        d3,  d3,  #1  // 128 << bitdepth_min_8 - 1
+        vcgt.u16        d0,  d0,  d12 // hev
+        vmvn            d9,  d3       // - 128 * (1 << bitdepth_min_8)
+        vmin.s16        d2,  d2,  d3  // iclip_diff(p1 - q1)
+        vmax.s16        d2,  d2,  d9  // iclip_diff(p1 - q1)
+        vand            d4,  d2,  d0  // if (hev) iclip_diff(p1 - q1)
+        vsub.u16        d2,  d24, d23
+        vmov.i16        d6,  #3
+        vbic            d0,  d1,  d0  // (fm && wd >= 4 && !hev)
+        vmul.i16        d2,  d2,  d6
+        vmov.i16        d6,  #4
+        vadd.i16        d2,  d2,  d4
+        vmin.s16        d2,  d2,  d3  // f = iclip_diff()
+        vmov.i16        d7,  #3
+        vmax.s16        d2,  d2,  d9  // f = iclip_diff()
+        vqadd.s16       d4,  d6,  d2  // f + 4
+        vqadd.s16       d5,  d7,  d2  // f + 3
+        vmin.s16        d4,  d4,  d3  // imin(f + 4, 128 << bitdepth_min_8 - 1)
+        vmin.s16        d5,  d5,  d3  // imin(f + 3, 128 << bitdepth_min_8 - 1)
+        vshr.s16        d4,  d4,  #3  // f1
+        vshr.s16        d5,  d5,  #3  // f2
+        vmov.i16        d9,  #0
+        vdup.16         d3,  r8       // bitdepth_max
+        vqadd.s16       d2,  d23, d5  // p0 + f2
+        vqsub.s16       d6,  d24, d4  // q0 - f1
+        vrshr.s16       d4,  d4,  #1  // (f1 + 1) >> 1
+        vmin.s16        d2,  d2,  d3  // out p0 = iclip_pixel()
+        vmin.s16        d6,  d6,  d3  // out q0 = iclip_pixel()
+        vmax.s16        d2,  d2,  d9  // out p0 = iclip_pixel()
+        vmax.s16        d6,  d6,  d9  // out q0 = iclip_pixel()
+        vbit            d23, d2,  d1  // if (fm && wd >= 4)
+        vbit            d24, d6,  d1  // if (fm && wd >= 4)
+        vqadd.s16       d2,  d22, d4  // p1 + f
+        vqsub.s16       d6,  d25, d4  // q1 - f
+        vmin.s16        d2,  d2,  d3  // out p1 = iclip_pixel()
+        vmin.s16        d6,  d6,  d3  // out q1 = iclip_pixel()
+        vmax.s16        d2,  d2,  d9  // out p1 = iclip_pixel()
+        vmax.s16        d6,  d6,  d9  // out q1 = iclip_pixel()
+        vbit            d22, d2,  d0  // if (fm && wd >= 4 && !hev)
+        vbit            d25, d6,  d0  // if (fm && wd >= 4 && !hev)
+1:
+
+.if \wd == 6
+        vmov            r10, r11, d14
+        orrs            r10, r10, r11
+        beq             2f            // skip if there's no flat8in
+
+        vadd.i16        d0,  d21, d21 // p2 * 2
+        vadd.i16        d2,  d21, d22 // p2 + p1
+        vadd.i16        d4,  d22, d23 // p1 + p0
+        vadd.i16        d6,  d23, d24 // p0 + q0
+        vadd.i16        d8,  d0,  d2
+        vadd.i16        d10, d4,  d6
+        vadd.i16        d12, d24, d25 // q0 + q1
+        vadd.i16        d8,  d8,  d10
+        vsub.i16        d12, d12, d0
+        vadd.i16        d10, d25, d26 // q1 + q2
+        vrshr.u16       d0,  d8,  #3  // out p1
+
+        vadd.i16        d8,  d8,  d12
+        vsub.i16        d10, d10, d2
+        vadd.i16        d12, d26, d26 // q2 + q2
+        vrshr.u16       d1,  d8,  #3  // out p0
+
+        vadd.i16        d8,  d8,  d10
+        vsub.i16        d12, d12, d4
+        vrshr.u16       d2,  d8,  #3  // out q0
+
+        vbit            d22, d0,  d14 // p1 if (flat8in)
+        vadd.i16        d8,  d8,  d12
+        vbit            d23, d1,  d14 // p0 if (flat8in)
+        vrshr.u16       d3,  d8,  #3  // out q1
+        vbit            d24, d2,  d14 // q0 if (flat8in)
+        vbit            d25, d3,  d14 // q1 if (flat8in)
+.elseif \wd >= 8
+        vmov            r10, r11, d14
+        orrs            r10, r10, r11
+.if \wd == 8
+        beq             8f            // skip if there's no flat8in
+.else
+        beq             2f            // skip if there's no flat8in
+.endif
+
+        vadd.i16        d0,  d20, d21 // p3 + p2
+        vadd.i16        d2,  d22, d25 // p1 + q1
+        vadd.i16        d4,  d20, d22 // p3 + p1
+        vadd.i16        d6,  d23, d26 // p0 + q2
+        vadd.i16        d8,  d0,  d0  // 2 * (p3 + p2)
+        vadd.i16        d9,  d23, d24 // p0 + q0
+        vadd.i16        d8,  d8,  d4  // + p3 + p1
+        vsub.i16        d2,  d2,  d0  // p1 + q1 - p3 - p2
+        vadd.i16        d8,  d8,  d9  // + p0 + q0
+        vsub.i16        d6,  d6,  d4  // p0 + q2 - p3 - p1
+        vrshr.u16       d10, d8,  #3  // out p2
+
+        vadd.i16        d8,  d8,  d2
+        vadd.i16        d0,  d20, d23 // p3 + p0
+        vadd.i16        d2,  d24, d27 // q0 + q3
+        vrshr.u16       d11, d8,  #3  // out p1
+
+        vadd.i16        d8,  d8,  d6
+        vsub.i16        d2,  d2,  d0  // q0 + q3 - p3 - p0
+        vadd.i16        d4,  d21, d24 // p2 + q0
+        vadd.i16        d6,  d25, d27 // q1 + q3
+        vrshr.u16       d12, d8,  #3  // out p0
+
+        vadd.i16        d8,  d8,  d2
+        vsub.i16        d6,  d6,  d4  // q1 + q3 - p2 - q0
+        vadd.i16        d0,  d22, d25 // p1 + q1
+        vadd.i16        d2,  d26, d27 // q2 + q3
+        vrshr.u16       d13, d8,  #3  // out q0
+
+        vadd.i16        d8,  d8,  d6
+        vsub.i16        d2,  d2,  d0  // q2 + q3 - p1 - q1
+        vrshr.u16       d0,  d8,  #3  // out q1
+
+        vadd.i16        d8,  d8,  d2
+
+        vbit            d21, d10, d14
+        vbit            d22, d11, d14
+        vbit            d23, d12, d14
+        vrshr.u16       d1,  d8,  #3  // out q2
+        vbit            d24, d13, d14
+        vbit            d25, d0,  d14
+        vbit            d26, d1,  d14
+.endif
+2:
+.if \wd == 16
+        vmov            r10, r11, d15
+        orrs            r10, r10, r11
+        bne             1f            // check if flat8out is needed
+        vmov            r10, r11, d14
+        orrs            r10, r10, r11
+        beq             8f            // if there was no flat8in, just write the inner 4 pixels
+        b               7f            // if flat8in was used, write the inner 6 pixels
+1:
+
+        vadd.i16        d2,  d17, d17 // p6 + p6
+        vadd.i16        d4,  d17, d18 // p6 + p5
+        vadd.i16        d6,  d17, d19 // p6 + p4
+        vadd.i16        d8,  d17, d20 // p6 + p3
+        vadd.i16        d12, d2,  d4
+        vadd.i16        d10, d6,  d8
+        vadd.i16        d6,  d17, d21 // p6 + p2
+        vadd.i16        d12, d12, d10
+        vadd.i16        d8,  d17, d22 // p6 + p1
+        vadd.i16        d10, d18, d23 // p5 + p0
+        vadd.i16        d6,  d6,  d8
+        vadd.i16        d8,  d19, d24 // p4 + q0
+        vadd.i16        d12, d12, d6
+        vadd.i16        d10, d10, d8
+        vadd.i16        d6,  d20, d25 // p3 + q1
+        vadd.i16        d12, d12, d10
+        vsub.i16        d6,  d6,  d2
+        vadd.i16        d2,  d21, d26 // p2 + q2
+        vrshr.u16       d0,  d12, #4  // out p5
+        vadd.i16        d12, d12, d6  // - (p6 + p6) + (p3 + q1)
+        vsub.i16        d2,  d2,  d4
+        vadd.i16        d4,  d22, d27 // p1 + q3
+        vadd.i16        d6,  d17, d19 // p6 + p4
+        vrshr.u16       d1,  d12, #4  // out p4
+        vadd.i16        d12, d12, d2  // - (p6 + p5) + (p2 + q2)
+        vsub.i16        d4,  d4,  d6
+        vadd.i16        d6,  d23, d28 // p0 + q4
+        vadd.i16        d8,  d17, d20 // p6 + p3
+        vrshr.u16       d2,  d12, #4  // out p3
+        vadd.i16        d12, d12, d4  // - (p6 + p4) + (p1 + q3)
+        vsub.i16        d6,  d6,  d8
+        vadd.i16        d8,  d24, d29 // q0 + q5
+        vadd.i16        d4,  d17, d21 // p6 + p2
+        vrshr.u16       d3,  d12, #4  // out p2
+        vadd.i16        d12, d12, d6  // - (p6 + p3) + (p0 + q4)
+        vsub.i16        d8,  d8,  d4
+        vadd.i16        d6,  d25, d30 // q1 + q6
+        vadd.i16        d10, d17, d22 // p6 + p1
+        vrshr.u16       d4,  d12, #4  // out p1
+        vadd.i16        d12, d12, d8  // - (p6 + p2) + (q0 + q5)
+        vsub.i16        d6,  d6,  d10
+        vadd.i16        d8,  d26, d30 // q2 + q6
+        vbif            d0,  d18, d15 // out p5
+        vadd.i16        d10, d18, d23 // p5 + p0
+        vrshr.u16       d5,  d12, #4  // out p0
+        vadd.i16        d12, d12, d6  // - (p6 + p1) + (q1 + q6)
+        vsub.i16        d8,  d8,  d10
+        vadd.i16        d10, d27, d30 // q3 + q6
+        vbif            d1,  d19, d15 // out p4
+        vadd.i16        d18, d19, d24 // p4 + q0
+        vrshr.u16       d6,  d12, #4  // out q0
+        vadd.i16        d12, d12, d8  // - (p5 + p0) + (q2 + q6)
+        vsub.i16        d10, d10, d18
+        vadd.i16        d8,  d28, d30 // q4 + q6
+        vbif            d2,  d20, d15 // out p3
+        vadd.i16        d18, d20, d25 // p3 + q1
+        vrshr.u16       d7,  d12, #4  // out q1
+        vadd.i16        d12, d12, d10 // - (p4 + q0) + (q3 + q6)
+        vsub.i16        d18, d8,  d18
+        vadd.i16        d10, d29, d30 // q5 + q6
+        vbif            d3,  d21, d15 // out p2
+        vadd.i16        d20, d21, d26 // p2 + q2
+        vrshr.u16       d8,  d12, #4  // out q2
+        vadd.i16        d12, d12, d18 // - (p3 + q1) + (q4 + q6)
+        vsub.i16        d10, d10, d20
+        vadd.i16        d18, d30, d30 // q6 + q6
+        vbif            d4,  d22, d15 // out p1
+        vadd.i16        d20, d22, d27 // p1 + q3
+        vrshr.u16       d9,  d12, #4  // out q3
+        vadd.i16        d12, d12, d10 // - (p2 + q2) + (q5 + q6)
+        vsub.i16        d18, d18, d20
+        vbif            d5,  d23, d15 // out p0
+        vrshr.u16       d10, d12, #4  // out q4
+        vadd.i16        d12, d12, d18 // - (p1 + q3) + (q6 + q6)
+        vrshr.u16       d11, d12, #4  // out q5
+        vbif            d6,  d24, d15 // out q0
+        vbif            d7,  d25, d15 // out q1
+        vbif            d8,  d26, d15 // out q2
+        vbif            d9,  d27, d15 // out q3
+        vbif            d10, d28, d15 // out q4
+        vbif            d11, d29, d15 // out q5
+.endif
+
+        bx              lr
+.if \wd == 16
+7:
+        // Return to a shorter epilogue, writing only the inner 6 pixels
+        bx              r6
+.endif
+.if \wd >= 8
+8:
+        // Return to a shorter epilogue, writing only the inner 4 pixels
+        bx              r7
+.endif
+9:
+        // Return directly without writing back any pixels
+        bx              r12
+endfunc
+.endm
+
+loop_filter 16
+loop_filter 8
+loop_filter 6
+loop_filter 4
+
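+// The lpf_4_wd* wrappers below set up r6/r7 with the addresses of the local
+// labels 7: and 8: at the call site; lpf_4_wd8/16_neon return through them
+// when only the inner 6 or 4 pixels need to be written back.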
+.macro lpf_4_wd16
+        adr             r6,  7f + CONFIG_THUMB
+        adr             r7,  8f + CONFIG_THUMB
+        bl              lpf_4_wd16_neon
+.endm
+
+.macro lpf_4_wd8
+        adr             r7,  8f + CONFIG_THUMB
+        bl              lpf_4_wd8_neon
+.endm
+
+.macro lpf_4_wd6
+        bl              lpf_4_wd6_neon
+.endm
+
+.macro lpf_4_wd4
+        bl              lpf_4_wd4_neon
+.endm
+
+function lpf_v_4_4_neon
+        mov             r12, lr
+        sub             r10, r0,  r1, lsl #1
+        vld1.16         {d22}, [r10, :64], r1 // p1
+        vld1.16         {d24}, [r0,  :64], r1 // q0
+        vld1.16         {d23}, [r10, :64], r1 // p0
+        vld1.16         {d25}, [r0,  :64], r1 // q1
+        sub             r0,  r0,  r1, lsl #1
+
+        lpf_4_wd4
+
+        sub             r10, r0,  r1, lsl #1
+        vst1.16         {d22}, [r10, :64], r1 // p1
+        vst1.16         {d24}, [r0,  :64], r1 // q0
+        vst1.16         {d23}, [r10, :64], r1 // p0
+        vst1.16         {d25}, [r0,  :64], r1 // q1
+        sub             r0,  r0,  r1, lsl #1
+        bx              r12
+endfunc
+
+function lpf_h_4_4_neon
+        mov             r12, lr
+        sub             r10, r0,  #4
+        add             r0,  r10, r1, lsl #1
+        vld1.16         {d22}, [r10], r1
+        vld1.16         {d24}, [r0],  r1
+        vld1.16         {d23}, [r10], r1
+        vld1.16         {d25}, [r0],  r1
+        add             r0,  r0,  #4
+
+        transpose_4x4h  q11, q12, d22, d23, d24, d25
+
+        lpf_4_wd4
+
+        sub             r10, r0,  r1, lsl #2
+        sub             r10, r10, #4
+        transpose_4x4h  q11, q12, d22, d23, d24, d25
+        add             r0,  r10, r1, lsl #1
+
+        vst1.16         {d22}, [r10], r1
+        vst1.16         {d24}, [r0],  r1
+        vst1.16         {d23}, [r10], r1
+        vst1.16         {d25}, [r0],  r1
+        add             r0,  r0,  #4
+        bx              r12
+endfunc
+
+function lpf_v_6_4_neon
+        mov             r12, lr
+        sub             r10, r0,  r1, lsl #1
+        sub             r10, r10, r1
+        vld1.16         {d21}, [r10, :64], r1 // p2
+        vld1.16         {d24}, [r0,  :64], r1 // q0
+        vld1.16         {d22}, [r10, :64], r1 // p1
+        vld1.16         {d25}, [r0,  :64], r1 // q1
+        vld1.16         {d23}, [r10, :64], r1 // p0
+        vld1.16         {d26}, [r0,  :64], r1 // q2
+        sub             r0,  r0,  r1, lsl #1
+        sub             r0,  r0,  r1
+
+        lpf_4_wd6
+
+        sub             r10, r0,  r1, lsl #1
+        vst1.16         {d22}, [r10, :64], r1 // p1
+        vst1.16         {d24}, [r0,  :64], r1 // q0
+        vst1.16         {d23}, [r10, :64], r1 // p0
+        vst1.16         {d25}, [r0,  :64], r1 // q1
+        sub             r0,  r0,  r1, lsl #1
+        bx              r12
+endfunc
+
+function lpf_h_6_4_neon
+        mov             r12, lr
+        sub             r10, r0,  #8
+        vld1.16         {d20}, [r10, :64], r1
+        vld1.16         {d24}, [r0,  :64], r1
+        vld1.16         {d21}, [r10, :64], r1
+        vld1.16         {d25}, [r0,  :64], r1
+        vld1.16         {d22}, [r10, :64], r1
+        vld1.16         {d26}, [r0,  :64], r1
+        vld1.16         {d23}, [r10, :64], r1
+        vld1.16         {d27}, [r0,  :64], r1
+
+        transpose_4x4h  q10, q11, d20, d21, d22, d23
+        transpose_4x4h  q12, q13, d24, d25, d26, d27
+
+        lpf_4_wd6
+
+        sub             r0,  r0,  #4
+        transpose_4x4h  q11, q12, d22, d23, d24, d25
+        sub             r10, r0,  r1, lsl #2
+        sub             r0,  r0,  r1, lsl #1
+
+        vst1.16         {d22}, [r10], r1
+        vst1.16         {d24}, [r0],  r1
+        vst1.16         {d23}, [r10], r1
+        vst1.16         {d25}, [r0],  r1
+        add             r0,  r0,  #4
+        bx              r12
+endfunc
+
+function lpf_v_8_4_neon
+        mov             r12, lr
+        sub             r10, r0,  r1, lsl #2
+        vld1.16         {d20}, [r10, :64], r1 // p3
+        vld1.16         {d24}, [r0,  :64], r1 // q0
+        vld1.16         {d21}, [r10, :64], r1 // p2
+        vld1.16         {d25}, [r0,  :64], r1 // q1
+        vld1.16         {d22}, [r10, :64], r1 // p1
+        vld1.16         {d26}, [r0,  :64], r1 // q2
+        vld1.16         {d23}, [r10, :64], r1 // p0
+        vld1.16         {d27}, [r0,  :64], r1 // q3
+        sub             r0,  r0,  r1, lsl #2
+
+        lpf_4_wd8
+
+        sub             r10, r0,  r1, lsl #1
+        sub             r10, r10, r1
+        vst1.16         {d21}, [r10, :64], r1 // p2
+        vst1.16         {d24}, [r0,  :64], r1 // q0
+        vst1.16         {d22}, [r10, :64], r1 // p1
+        vst1.16         {d25}, [r0,  :64], r1 // q1
+        vst1.16         {d23}, [r10, :64], r1 // p0
+        vst1.16         {d26}, [r0,  :64], r1 // q2
+        sub             r0,  r0,  r1, lsl #1
+        sub             r0,  r0,  r1
+        bx              r12
+
+8:
+        sub             r10, r0,  r1, lsl #1
+        vst1.16         {d22}, [r10, :64], r1 // p1
+        vst1.16         {d24}, [r0,  :64], r1 // q0
+        vst1.16         {d23}, [r10, :64], r1 // p0
+        vst1.16         {d25}, [r0,  :64], r1 // q1
+        sub             r0,  r0,  r1, lsl #1
+        bx              r12
+endfunc
+
+function lpf_h_8_4_neon
+        mov             r12, lr
+        sub             r10, r0,  #8
+        vld1.16         {d20}, [r10, :64], r1
+        vld1.16         {d24}, [r0,  :64], r1
+        vld1.16         {d21}, [r10, :64], r1
+        vld1.16         {d25}, [r0,  :64], r1
+        vld1.16         {d22}, [r10, :64], r1
+        vld1.16         {d26}, [r0,  :64], r1
+        vld1.16         {d23}, [r10, :64], r1
+        vld1.16         {d27}, [r0,  :64], r1
+
+        transpose_4x4h  q10, q11, d20, d21, d22, d23
+        transpose_4x4h  q12, q13, d24, d25, d26, d27
+
+        lpf_4_wd8
+
+        sub             r0,  r0,  r1, lsl #2
+        transpose_4x4h  q10, q11, d20, d21, d22, d23
+        transpose_4x4h  q12, q13, d24, d25, d26, d27
+        sub             r10, r0,  #8
+
+        vst1.16         {d20}, [r10, :64], r1
+        vst1.16         {d24}, [r0,  :64], r1
+        vst1.16         {d21}, [r10, :64], r1
+        vst1.16         {d25}, [r0,  :64], r1
+        vst1.16         {d22}, [r10, :64], r1
+        vst1.16         {d26}, [r0,  :64], r1
+        vst1.16         {d23}, [r10, :64], r1
+        vst1.16         {d27}, [r0,  :64], r1
+        bx              r12
+8:
+        sub             r0,  r0,  #4
+        transpose_4x4h  q11, q12, d22, d23, d24, d25
+        sub             r10, r0,  r1, lsl #2
+        sub             r0,  r0,  r1, lsl #1
+
+        vst1.16         {d22}, [r10], r1
+        vst1.16         {d24}, [r0],  r1
+        vst1.16         {d23}, [r10], r1
+        vst1.16         {d25}, [r0],  r1
+        add             r0,  r0,  #4
+        bx              r12
+endfunc
+
+function lpf_v_16_4_neon
+        mov             r12, lr
+
+        sub             r10, r0,  r1, lsl #3
+        add             r10, r10, r1
+        vld1.16         {d17}, [r10, :64], r1 // p6
+        vld1.16         {d24}, [r0,  :64], r1 // q0
+        vld1.16         {d18}, [r10, :64], r1 // p5
+        vld1.16         {d25}, [r0,  :64], r1 // q1
+        vld1.16         {d19}, [r10, :64], r1 // p4
+        vld1.16         {d26}, [r0,  :64], r1 // q2
+        vld1.16         {d20}, [r10, :64], r1 // p3
+        vld1.16         {d27}, [r0,  :64], r1 // q3
+        vld1.16         {d21}, [r10, :64], r1 // p2
+        vld1.16         {d28}, [r0,  :64], r1 // q4
+        vld1.16         {d22}, [r10, :64], r1 // p1
+        vld1.16         {d29}, [r0,  :64], r1 // q5
+        vld1.16         {d23}, [r10, :64], r1 // p0
+        vld1.16         {d30}, [r0,  :64], r1 // q6
+        sub             r0,  r0,  r1, lsl #3
+        add             r0,  r0,  r1
+
+        lpf_4_wd16
+
+        sub             r10, r0,  r1, lsl #2
+        sub             r10, r10, r1, lsl #1
+        vst1.16         {d0},  [r10, :64], r1 // p5
+        vst1.16         {d6},  [r0,  :64], r1 // q0
+        vst1.16         {d1},  [r10, :64], r1 // p4
+        vst1.16         {d7},  [r0,  :64], r1 // q1
+        vst1.16         {d2},  [r10, :64], r1 // p3
+        vst1.16         {d8},  [r0,  :64], r1 // q2
+        vst1.16         {d3},  [r10, :64], r1 // p2
+        vst1.16         {d9},  [r0,  :64], r1 // q3
+        vst1.16         {d4},  [r10, :64], r1 // p1
+        vst1.16         {d10}, [r0,  :64], r1 // q4
+        vst1.16         {d5},  [r10, :64], r1 // p0
+        vst1.16         {d11}, [r0,  :64], r1 // q5
+        sub             r0,  r0,  r1, lsl #2
+        sub             r0,  r0,  r1, lsl #1
+        bx              r12
+7:
+        sub             r10, r0,  r1
+        sub             r10, r10, r1, lsl #1
+        vst1.16         {d21}, [r10, :64], r1 // p2
+        vst1.16         {d24}, [r0,  :64], r1 // q0
+        vst1.16         {d22}, [r10, :64], r1 // p1
+        vst1.16         {d25}, [r0,  :64], r1 // q1
+        vst1.16         {d23}, [r10, :64], r1 // p0
+        vst1.16         {d26}, [r0,  :64], r1 // q2
+        sub             r0,  r0,  r1, lsl #1
+        sub             r0,  r0,  r1
+        bx              r12
+
+8:
+        sub             r10, r0,  r1, lsl #1
+        vst1.16         {d22}, [r10, :64], r1 // p1
+        vst1.16         {d24}, [r0,  :64], r1 // q0
+        vst1.16         {d23}, [r10, :64], r1 // p0
+        vst1.16         {d25}, [r0,  :64], r1 // q1
+        sub             r0,  r0,  r1, lsl #1
+        bx              r12
+endfunc
+
+function lpf_h_16_4_neon
+        mov             r12, lr
+        sub             r10, r0,  #16
+        sub             r0,  r0,  #8
+        vld1.16         {d16}, [r10, :64], r1
+        vld1.16         {d20}, [r0,  :64], r1
+        vld1.16         {d17}, [r10, :64], r1
+        vld1.16         {d21}, [r0,  :64], r1
+        vld1.16         {d18}, [r10, :64], r1
+        vld1.16         {d22}, [r0,  :64], r1
+        vld1.16         {d19}, [r10, :64], r1
+        vld1.16         {d23}, [r0,  :64], r1
+        sub             r10, r10, r1, lsl #2
+        sub             r0,  r0,  r1, lsl #2
+        add             r10, r10, #16
+        add             r0,  r0,  #16
+        vld1.16         {d24}, [r10, :64], r1
+        vld1.16         {d28}, [r0,  :64], r1
+        vld1.16         {d25}, [r10, :64], r1
+        vld1.16         {d29}, [r0,  :64], r1
+        vld1.16         {d26}, [r10, :64], r1
+        vld1.16         {d30}, [r0,  :64], r1
+        vld1.16         {d27}, [r10, :64], r1
+        vld1.16         {d31}, [r0,  :64], r1
+        sub             r0,  r0,  #8
+
+        transpose_4x4h  q8,  q9,  d16, d17, d18, d19
+        transpose_4x4h  q10, q11, d20, d21, d22, d23
+        transpose_4x4h  q12, q13, d24, d25, d26, d27
+        transpose_4x4h  q14, q15, d28, d29, d30, d31
+
+        lpf_4_wd16
+
+        sub             r0,  r0,  r1, lsl #2
+        transpose_4x4h  q8,  q0,  d16, d17, d0,  d1
+        transpose_4x4h  q1,  q2,  d2,  d3,  d4,  d5
+        transpose_4x4h  q3,  q4,  d6,  d7,  d8,  d9
+        transpose_4x4h  q5,  q15, d10, d11, d30, d31
+        sub             r10, r0,  #16
+        sub             r0,  r0,  #8
+
+        vst1.16         {d16}, [r10, :64], r1
+        vst1.16         {d2},  [r0,  :64], r1
+        vst1.16         {d17}, [r10, :64], r1
+        vst1.16         {d3},  [r0,  :64], r1
+        vst1.16         {d0},  [r10, :64], r1
+        vst1.16         {d4},  [r0,  :64], r1
+        vst1.16         {d1},  [r10, :64], r1
+        vst1.16         {d5},  [r0,  :64], r1
+        sub             r10, r10, r1, lsl #2
+        sub             r0,  r0,  r1, lsl #2
+        add             r10, r10, #16
+        add             r0,  r0,  #16
+        vst1.16         {d6},  [r10, :64], r1
+        vst1.16         {d10}, [r0,  :64], r1
+        vst1.16         {d7},  [r10, :64], r1
+        vst1.16         {d11}, [r0,  :64], r1
+        vst1.16         {d8},  [r10, :64], r1
+        vst1.16         {d30}, [r0,  :64], r1
+        vst1.16         {d9},  [r10, :64], r1
+        vst1.16         {d31}, [r0,  :64], r1
+        sub             r0,  r0,  #8
+
+        bx              r12
+
+7:
+        sub             r0,  r0,  r1, lsl #2
+        transpose_4x4h  q10, q11, d20, d21, d22, d23
+        transpose_4x4h  q12, q13, d24, d25, d26, d27
+        sub             r10, r0,  #8
+
+        vst1.16         {d20}, [r10, :64], r1
+        vst1.16         {d24}, [r0,  :64], r1
+        vst1.16         {d21}, [r10, :64], r1
+        vst1.16         {d25}, [r0,  :64], r1
+        vst1.16         {d22}, [r10, :64], r1
+        vst1.16         {d26}, [r0,  :64], r1
+        vst1.16         {d23}, [r10, :64], r1
+        vst1.16         {d27}, [r0,  :64], r1
+        bx              r12
+8:
+        sub             r0,  r0,  #4
+        transpose_4x4h  q11, q12, d22, d23, d24, d25
+        sub             r10, r0,  r1, lsl #2
+        sub             r0,  r0,  r1, lsl #1
+
+        vst1.16         {d22}, [r10], r1
+        vst1.16         {d24}, [r0],  r1
+        vst1.16         {d23}, [r10], r1
+        vst1.16         {d25}, [r0],  r1
+        add             r0,  r0,  #4
+        bx              r12
+endfunc
+
+// void dav1d_lpf_v_sb_y_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                                  const uint32_t *const vmask,
+//                                  const uint8_t (*l)[4], ptrdiff_t b4_stride,
+//                                  const Av1FilterLUT *lut, const int w,
+//                                  const int bitdepth_max)
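+// The macro below generates the h direction and uv plane variants from the
+// same code; all four combinations are instantiated at the end of this file.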
+
+.macro lpf_func dir, type
+function lpf_\dir\()_sb_\type\()_16bpc_neon, export=1
+        push            {r4-r11,lr}
+        vpush           {q4-q7}
+        ldrd            r4,  r5,  [sp, #100]
+        ldr             r8,  [sp,  #112] // bitdepth_max; the 'w' parameter isn't loaded
+        sub             sp,  sp,  #8
+        clz             r9,  r8
+        rsb             r9,  r9,  #24  // bitdepth_min_8
+        ldrd            r6,  r7,  [r2] // vmask[0], vmask[1]
+.ifc \type, y
+        ldr             r2,  [r2, #8]  // vmask[2]
+.endif
+        add             r5,  r5,  #128 // Move to sharp part of lut
+.ifc \type, y
+        orr             r7,  r7,  r2   // vmask[1] |= vmask[2]
+.endif
+.ifc \dir, v
+        sub             r4,  r3,  r4, lsl #2
+.else
+        sub             r3,  r3,  #4
+        lsl             r4,  r4,  #2
+.endif
+        orr             r6,  r6,  r7   // vmask[0] |= vmask[1]
+
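+        // Each iteration filters one 4-pixel unit along the edge and
+        // consumes one bit of vmask[0]/vmask[1] (and of vmask[2] for luma).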
+1:
+        tst             r6,  #0x01
+        strd            r6,  r7,  [sp]
+.ifc \dir, v
+        ldrb            r10, [r4], #4
+        ldrb            r11, [r3], #4
+.else
+        ldrb            r10, [r3]
+        ldrb            r11, [r3, #4]
+        add             r3,  r3,  r4
+.endif
+        beq             7f             // if (!(vm & bits)) continue;
+
+        orrs            r12, r10, r11
+        vdup.16         d31, r9        // bitdepth_min_8
+        beq             7f             // if (!(l[0][0] | l[offset][0])) continue;
+        cmp             r11, #0        // Check for nonzero values in l[0][0]
+        ldrb            r6,  [r5], #8  // sharp[0]
+        it              eq
+        moveq           r11, r10       // if (!l[0][0]) L = l[offset][0]
+        ldrb            r12, [r5]      // sharp[1]
+        lsr             r6,  r11, r6   // L >> sharp[0]
+        sub             r5,  r5,  #8
+        cmp             r12, r6
+        lsr             r10, r11, #4   // H
+        add             r11, r11, #2   // L + 2
+        it              lt
+        movlt           r6,  r12       // imin(L >> sharp[0], sharp[1])
+        add             r11, r11, r11  // 2*(L + 2)
+        cmp             r6,  #1
+        lsl             r10, r10, r9   // H << bitdepth_min_8
+        it              lt
+        movlt           r6,  #1        // imax(imin(), 1) = limit = I
+        vdup.16         d12, r10       // H << bitdepth_min_8
+        add             r11, r11, r6   // 2*(L + 2) + limit = E
+        lsl             r6,  r6,  r9   // I << bitdepth_min_8
+        lsl             r11, r11, r9   // E << bitdepth_min_8
+        vdup.16         d11, r6        // I << bitdepth_min_8
+        vdup.16         d10, r11       // E << bitdepth_min_8
+
+.ifc \type, y
+        tst             r2,  #0x01
+        beq             2f
+        // wd16
+        bl              lpf_\dir\()_16_4_neon
+        b               8f
+2:
+.endif
+        tst             r7,  #0x01
+        beq             3f
+.ifc \type, y
+        // wd8
+        bl              lpf_\dir\()_8_4_neon
+.else
+        // wd6
+        bl              lpf_\dir\()_6_4_neon
+.endif
+        b               8f
+3:
+        // wd4
+        bl              lpf_\dir\()_4_4_neon
+.ifc \dir, h
+        b               8f
+7:
+        // For dir h, the functions above increment r0.
+        // If the whole function is skipped, increment it here instead.
+        add             r0,  r0,  r1,  lsl #2
+.else
+7:
+.endif
+8:
+        ldrd            r6,  r7,  [sp]
+.ifc \type, y
+        lsr             r2,  r2,  #1   // vmask[2] >>= 1
+.endif
+.ifc \dir, v
+        add             r0,  r0,  #8
+.else
+        // For dir h, r0 is returned incremented
+.endif
+        lsrs            r6,  r6,  #1   // vmask[0] >>= 1
+        lsr             r7,  r7,  #1   // vmask[1] >>= 1
+        bne             1b
+
+        add             sp,  sp,  #8
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+endfunc
+.endm
+
+lpf_func v, y
+lpf_func h, y
+lpf_func v, uv
+lpf_func h, uv
--- a/src/arm/32/looprestoration.S
+++ b/src/arm/32/looprestoration.S
@@ -30,7 +30,7 @@
 
 // void dav1d_wiener_filter_h_8bpc_neon(int16_t *dst, const pixel (*left)[4],
 //                                      const pixel *src, ptrdiff_t stride,
-//                                      const int16_t fh[7], const intptr_t w,
+//                                      const int16_t fh[8], intptr_t w,
 //                                      int h, enum LrEdgeFlags edges);
 function wiener_filter_h_8bpc_neon, export=1
         push            {r4-r11,lr}
@@ -38,10 +38,10 @@
         ldrd            r4,  r5,  [sp, #52]
         ldrd            r6,  r7,  [sp, #60]
         mov             r8,  r5
-        vld1.16         {q0},  [r4]
+        vld1.16         {q0},  [r4, :128]
         movw            r9,  #(1 << 14) - (1 << 2)
-        vdup.16         q14,  r9
-        vmov.s16        q15,  #2048
+        vdup.16         q14, r9
+        vmov.s16        q15, #2048
         // Calculate mid_stride
         add             r10, r5,  #7
         bic             r10, r10, #7
@@ -108,8 +108,8 @@
 0:
         // !LR_HAVE_LEFT, fill q1 with the leftmost byte
         // and shift q2 to have 3x the first byte at the front.
-        vdup.8          q1, d4[0]
-        vdup.8          q8, d18[0]
+        vdup.8          q1,  d4[0]
+        vdup.8          q8,  d18[0]
         // Move r2 back to account for the last 3 bytes we loaded before,
         // which we shifted out.
         sub             r2,  r2,  #3
@@ -127,7 +127,7 @@
         bne             4f
         // If we'll need to pad the right edge, load that byte to pad with
         // here since we can find it pretty easily from here.
-        sub             r9,  r5, #14
+        sub             r9,  r5,  #14
         ldrb            r11, [r2, r9]
         ldrb            r9,  [lr, r9]
         // Fill q12/q13 with the right padding pixel
@@ -144,7 +144,6 @@
         b               6f
 
 4:      // Loop horizontally
-.macro filter_8
         // This is tuned as some sort of compromise between Cortex A7, A8,
         // A9 and A53.
         vmul.s16        q3,  q1,  d0[0]
@@ -187,8 +186,6 @@
         vshr.s16        q10, q10, #3
         vadd.s16        q3,  q3,  q15
         vadd.s16        q10, q10, q15
-.endm
-        filter_8
         vst1.16         {q3},  [r0,  :128]!
         vst1.16         {q10}, [r12, :128]!
 
@@ -206,50 +203,43 @@
 
 5:      // Filter 4 pixels, 7 <= w < 11
 .macro filter_4
+        vext.8          d20, d2,  d3,  #2
+        vext.8          d21, d2,  d3,  #4
+        vext.8          d22, d2,  d3,  #6
+        vext.8          d23, d3,  d4,  #2
+        vext.8          d8,  d3,  d4,  #4
         vmul.s16        d6,  d2,  d0[0]
-        vext.8          q10, q1,  q2,  #2
-        vext.8          q11, q1,  q2,  #4
         vmla.s16        d6,  d20, d0[1]
-        vmla.s16        d6,  d22, d0[2]
-        vext.8          q10, q1,  q2,  #6
-        vext.8          q11, q1,  q2,  #8
-        vmla.s16        d6,  d20, d0[3]
-        vmla.s16        d6,  d22, d1[0]
-        vext.8          q10, q1,  q2,  #10
-        vext.8          q11, q1,  q2,  #12
-        vmla.s16        d6,  d20, d1[1]
-        vmla.s16        d6,  d22, d1[2]
+        vmla.s16        d6,  d21, d0[2]
+        vmla.s16        d6,  d22, d0[3]
+        vmla.s16        d6,  d3,  d1[0]
+        vmla.s16        d6,  d23, d1[1]
+        vmla.s16        d6,  d8,  d1[2]
 
-        vmul.s16        d20, d16, d0[0]
-        vext.8          q11, q8,  q9,  #2
-        vext.8          q4,  q8,  q9,  #4
-        vmla.s16        d20, d22, d0[1]
-        vmla.s16        d20, d8,  d0[2]
-        vext.8          q11, q8,  q9,  #6
-        vext.8          q4,  q8,  q9,  #8
-        vmla.s16        d20, d22, d0[3]
-        vmla.s16        d20, d8,  d1[0]
-        vext.8          q11, q8,  q9,  #10
-        vext.8          q4,  q8,  q9,  #12
-        vmla.s16        d20, d22, d1[1]
-        vmla.s16        d20, d8,  d1[2]
+        vext.8          d20, d16, d17, #2
+        vext.8          d21, d16, d17, #4
+        vext.8          d22, d16, d17, #6
+        vext.8          d23, d17, d18, #2
+        vext.8          d8,  d17, d18, #4
+        vmul.s16        d7,  d16, d0[0]
+        vmla.s16        d7,  d20, d0[1]
+        vmla.s16        d7,  d21, d0[2]
+        vmla.s16        d7,  d22, d0[3]
+        vmla.s16        d7,  d17, d1[0]
+        vmla.s16        d7,  d23, d1[1]
+        vmla.s16        d7,  d8,  d1[2]
 
-        vext.8          q11, q1,  q2,  #6
-        vshl.s16        d22, d22, #7
-        vsub.s16        d22, d22, d28
-        vqadd.s16       d6,  d6,  d22
-        vext.8          q11, q8,  q9,  #6
-        vshl.s16        d22, d22, #7
-        vsub.s16        d22, d22, d28
-        vqadd.s16       d20, d20, d22
-        vshr.s16        d6,  d6,  #3
-        vshr.s16        d20, d20, #3
-        vadd.s16        d6,  d6,  d30
-        vadd.s16        d20, d20, d30
+        vext.8          d22, d2,  d3,  #6
+        vext.8          d23, d16, d17, #6
+        vshl.s16        q11, q11, #7
+        vsub.s16        q11, q11, q14
+        vqadd.s16       q3,  q3,  q11
+        vshr.s16        q3,  q3,  #3
+        vadd.s16        q3,  q3,  q15
 .endm
         filter_4
         vst1.16         {d6},  [r0,  :64]!
-        vst1.16         {d20}, [r12, :64]!
+        vst1.16         {d7},  [r12, :64]!
 
         subs            r5,  r5,  #4 // 3 <= w < 7
         vext.8          q1,  q1,  q2,  #8
@@ -323,7 +313,7 @@
         // w >= 4, filter 4 pixels
         filter_4
         vst1.16         {d6},  [r0,  :64]!
-        vst1.16         {d20}, [r12, :64]!
+        vst1.16         {d7},  [r12, :64]!
         subs            r5,  r5,  #4 // 0 <= w < 4
         vext.8          q1,  q1,  q2,  #8
         vext.8          q8,  q8,  q9,  #8
@@ -338,11 +328,11 @@
         vdup.16         d25, d16[3]
         vpadd.s16       d6,  d6,  d6
         vtrn.16         d24, d25
-        vshl.s16        d24, d24,  #7
-        vsub.s16        d24, d24,  d28
-        vqadd.s16       d6,  d6,   d24
-        vshr.s16        d6,  d6,   #3
-        vadd.s16        d6,  d6,   d30
+        vshl.s16        d24, d24, #7
+        vsub.s16        d24, d24, d28
+        vqadd.s16       d6,  d6,  d24
+        vshr.s16        d6,  d6,  #3
+        vadd.s16        d6,  d6,  d30
         vst1.s16        {d6[0]}, [r0,  :16]!
         vst1.s16        {d6[1]}, [r12, :16]!
         subs            r5,  r5,  #1
@@ -363,13 +353,12 @@
 0:
         vpop            {q4}
         pop             {r4-r11,pc}
-.purgem filter_8
 .purgem filter_4
 endfunc
 
 // void dav1d_wiener_filter_v_8bpc_neon(pixel *dst, ptrdiff_t stride,
 //                                      const int16_t *mid, int w, int h,
-//                                      const int16_t fv[7], enum LrEdgeFlags edges,
+//                                      const int16_t fv[8], enum LrEdgeFlags edges,
 //                                      ptrdiff_t mid_stride);
 function wiener_filter_v_8bpc_neon, export=1
         push            {r4-r7,lr}
@@ -376,11 +365,7 @@
         ldrd            r4,  r5,  [sp, #20]
         ldrd            r6,  r7,  [sp, #28]
         mov             lr,  r4
-        vmov.s16        q1,  #0
-        mov             r12, #128
-        vld1.16         {q0},  [r5]
-        vmov.s16        d2[3], r12
-        vadd.s16        q0,  q0,  q1
+        vld1.16         {q0},  [r5, :128]
 
         // Calculate the number of rows to move back when looping vertically
         mov             r12, r4
@@ -422,22 +407,22 @@
         // Interleaving the mul/mla chains actually hurts performance
         // significantly on Cortex A53, thus keeping mul/mla tightly
         // chained like this.
-        vmull.s16       q2,  d16,  d0[0]
-        vmlal.s16       q2,  d18,  d0[1]
-        vmlal.s16       q2,  d20,  d0[2]
-        vmlal.s16       q2,  d22,  d0[3]
-        vmlal.s16       q2,  d24,  d1[0]
-        vmlal.s16       q2,  d26,  d1[1]
-        vmlal.s16       q2,  d28,  d1[2]
-        vmull.s16       q3,  d17,  d0[0]
-        vmlal.s16       q3,  d19,  d0[1]
-        vmlal.s16       q3,  d21,  d0[2]
-        vmlal.s16       q3,  d23,  d0[3]
-        vmlal.s16       q3,  d25,  d1[0]
-        vmlal.s16       q3,  d27,  d1[1]
-        vmlal.s16       q3,  d29,  d1[2]
-        vqrshrun.s32    d4,  q2,   #11
-        vqrshrun.s32    d5,  q3,   #11
+        vmull.s16       q2,  d16, d0[0]
+        vmlal.s16       q2,  d18, d0[1]
+        vmlal.s16       q2,  d20, d0[2]
+        vmlal.s16       q2,  d22, d0[3]
+        vmlal.s16       q2,  d24, d1[0]
+        vmlal.s16       q2,  d26, d1[1]
+        vmlal.s16       q2,  d28, d1[2]
+        vmull.s16       q3,  d17, d0[0]
+        vmlal.s16       q3,  d19, d0[1]
+        vmlal.s16       q3,  d21, d0[2]
+        vmlal.s16       q3,  d23, d0[3]
+        vmlal.s16       q3,  d25, d1[0]
+        vmlal.s16       q3,  d27, d1[1]
+        vmlal.s16       q3,  d29, d1[2]
+        vqrshrun.s32    d4,  q2,  #11
+        vqrshrun.s32    d5,  q3,  #11
         vqmovun.s16     d4,  q2
         vst1.8          {d4}, [r0], r1
 .if \compare
@@ -473,7 +458,7 @@
 52:     // 2 rows in total, q11 already loaded, load q12 with content data
         // and 2 rows of edge.
         vld1.16         {q14}, [r2, :128], r7
-        vmov            q15,  q14
+        vmov            q15, q14
         b               8f
 53:
         // 3 rows in total, q11 already loaded, load q12 and q13 with content
@@ -615,8 +600,8 @@
         asr             r1,  r1,  #1
 22:
         subs            r4,  r4,  #1
-        vld1.16         {d0[]},  [r2]!
-        vst1.16         {d0[0]}, [r0], r1
+        vld1.16         {d0[]},  [r2, :16]!
+        vst1.16         {d0[0]}, [r0, :16], r1
         bgt             22b
 0:
         pop             {r4,pc}
@@ -644,8 +629,8 @@
         ble             0f
         b               42b
 41:
-        vld1.32         {d0[]},  [r2]
-        vst1.32         {d0[0]}, [r0]
+        vld1.32         {d0[]},  [r2, :32]
+        vst1.32         {d0[0]}, [r0, :32]
 0:
         pop             {r4,pc}
 
@@ -687,6 +672,8 @@
 
 #define SUM_STRIDE (384+16)
 
+#include "looprestoration_tmpl.S"
+
 // void dav1d_sgr_box3_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
 //                                 const pixel (*left)[4],
 //                                 const pixel *src, const ptrdiff_t stride,
@@ -785,7 +772,7 @@
         bne             4f
         // If we'll need to pad the right edge, load that byte to pad with
         // here since we can find it pretty easily from here.
-        sub             lr,  r5, #(2 + 16 - 2 + 1)
+        sub             lr,  r5,  #(2 + 16 - 2 + 1)
         ldrb            r11, [r3,  lr]
         ldrb            lr,  [r12, lr]
         // Fill q14/q15 with the right padding pixel
@@ -1058,7 +1045,7 @@
         bne             4f
         // If we'll need to pad the right edge, load that byte to pad with
         // here since we can find it pretty easily from here.
-        sub             lr,  r5, #(2 + 16 - 3 + 1)
+        sub             lr,  r5,  #(2 + 16 - 3 + 1)
         ldrb            r11, [r3,  lr]
         ldrb            lr,  [r12, lr]
         // Fill q14/q15 with the right padding pixel
@@ -1100,7 +1087,7 @@
         vaddl_u16_n     q12, q13, d2,  d3,  d16, d17, \w
         vaddl_u16_n     q8,  q9,  d18, d19, d20, d21, \w
         vaddw_u16_n     q12, q13, d22, d23, \w
-        vadd_i32_n      q12, q13, q8,  q9, \w
+        vadd_i32_n      q12, q13, q8,  q9,  \w
         vext.8          q8,  q5,  q6,  #2
         vext.8          q9,  q5,  q6,  #4
         vext.8          q10, q5,  q6,  #6
@@ -1152,7 +1139,7 @@
 
 6:      // Pad the right edge and produce the last few pixels.
         // w < 7, w+1 pixels valid in q0/q4
-        sub             lr,   r5,  #1
+        sub             lr,  r5,  #1
         // lr = pixels valid - 2
         adr             r11, L(box5_variable_shift_tbl)
         ldr             lr,  [r11, lr, lsl #2]
@@ -1249,862 +1236,4 @@
 .purgem add5
 endfunc
 
-// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
-//                            const int w, const int h,
-//                            const enum LrEdgeFlags edges);
-function sgr_box3_v_neon, export=1
-        push            {r4-r9,lr}
-        ldr             r4,  [sp, #28]
-        add             r12, r3,  #2 // Number of output rows to move back
-        mov             lr,  r3      // Number of input rows to move back
-        add             r2,  r2,  #2 // Actual summed width
-        mov             r7,       #(4*SUM_STRIDE) // sumsq stride
-        mov             r8,       #(2*SUM_STRIDE) // sum stride
-        sub             r0,  r0,  #(4*SUM_STRIDE) // sumsq -= stride
-        sub             r1,  r1,  #(2*SUM_STRIDE) // sum   -= stride
-
-        tst             r4,  #4 // LR_HAVE_TOP
-        beq             0f
-        // If have top, read from row -2.
-        sub             r5,  r0,  #(4*SUM_STRIDE)
-        sub             r6,  r1,  #(2*SUM_STRIDE)
-        add             lr,  lr,  #2
-        b               1f
-0:
-        // !LR_HAVE_TOP
-        // If we don't have top, read from row 0 even if
-        // we start writing to row -1.
-        add             r5,  r0,  #(4*SUM_STRIDE)
-        add             r6,  r1,  #(2*SUM_STRIDE)
-1:
-
-        tst             r4,  #8 // LR_HAVE_BOTTOM
-        beq             1f
-        // LR_HAVE_BOTTOM
-        add             r3,  r3,  #2  // Sum all h+2 lines with the main loop
-        add             lr,  lr,  #2
-1:
-        mov             r9,  r3       // Backup of h for next loops
-
-1:
-        // Start of horizontal loop; start one vertical filter slice.
-        // Start loading rows into q8-q13 and q0-q2 taking top
-        // padding into consideration.
-        tst             r4,  #4 // LR_HAVE_TOP
-        vld1.32         {q8,  q9},  [r5, :128], r7
-        vld1.16         {q0},       [r6, :128], r8
-        beq             2f
-        // LR_HAVE_TOP
-        vld1.32         {q10, q11}, [r5, :128], r7
-        vld1.16         {q1},       [r6, :128], r8
-        vld1.32         {q12, q13}, [r5, :128], r7
-        vld1.16         {q2},       [r6, :128], r8
-        b               3f
-2:      // !LR_HAVE_TOP
-        vmov            q10, q8
-        vmov            q11, q9
-        vmov            q1,  q0
-        vmov            q12, q8
-        vmov            q13, q9
-        vmov            q2,  q0
-
-3:
-        subs            r3,  r3,  #1
-.macro add3
-        vadd.i32        q8,  q8,  q10
-        vadd.i32        q9,  q9,  q11
-        vadd.i16        q0,  q0,  q1
-        vadd.i32        q8,  q8,  q12
-        vadd.i32        q9,  q9,  q13
-        vadd.i16        q0,  q0,  q2
-        vst1.32         {q8, q9}, [r0, :128], r7
-        vst1.16         {q0},     [r1, :128], r8
-.endm
-        add3
-        vmov            q8,  q10
-        vmov            q9,  q11
-        vmov            q0,  q1
-        vmov            q10, q12
-        vmov            q11, q13
-        vmov            q1,  q2
-        ble             4f
-        vld1.32         {q12, q13}, [r5, :128], r7
-        vld1.16         {q2},       [r6, :128], r8
-        b               3b
-
-4:
-        tst             r4,  #8 // LR_HAVE_BOTTOM
-        bne             5f
-        // !LR_HAVE_BOTTOM
-        // Produce two more rows, extending the already loaded rows.
-        add3
-        vmov            q8,  q10
-        vmov            q9,  q11
-        vmov            q0,  q1
-        add3
-
-5:      // End of one vertical slice.
-        subs            r2,  r2,  #8
-        ble             0f
-        // Move pointers back up to the top and loop horizontally.
-        // Input pointers
-        mls             r5,  r7,  lr,  r5
-        mls             r6,  r8,  lr,  r6
-        // Output pointers
-        mls             r0,  r7,  r12, r0
-        mls             r1,  r8,  r12, r1
-        add             r0,  r0,  #32
-        add             r1,  r1,  #16
-        add             r5,  r5,  #32
-        add             r6,  r6,  #16
-        mov             r3,  r9
-        b               1b
-
-0:
-        pop             {r4-r9,pc}
-.purgem add3
-endfunc
-
-// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
-//                            const int w, const int h,
-//                            const enum LrEdgeFlags edges);
-function sgr_box5_v_neon, export=1
-        push            {r4-r9,lr}
-        vpush           {q5-q7}
-        ldr             r4,  [sp, #76]
-        add             r12, r3,  #2 // Number of output rows to move back
-        mov             lr,  r3      // Number of input rows to move back
-        add             r2,  r2,  #8 // Actual summed width
-        mov             r7,       #(4*SUM_STRIDE) // sumsq stride
-        mov             r8,       #(2*SUM_STRIDE) // sum stride
-        sub             r0,  r0,  #(4*SUM_STRIDE) // sumsq -= stride
-        sub             r1,  r1,  #(2*SUM_STRIDE) // sum   -= stride
-
-        tst             r4,  #4 // LR_HAVE_TOP
-        beq             0f
-        // If have top, read from row -2.
-        sub             r5,  r0,  #(4*SUM_STRIDE)
-        sub             r6,  r1,  #(2*SUM_STRIDE)
-        add             lr,  lr,  #2
-        b               1f
-0:
-        // !LR_HAVE_TOP
-        // If we don't have top, read from row 0 even if
-        // we start writing to row -1.
-        add             r5,  r0,  #(4*SUM_STRIDE)
-        add             r6,  r1,  #(2*SUM_STRIDE)
-1:
-
-        tst             r4,  #8 // LR_HAVE_BOTTOM
-        beq             0f
-        // LR_HAVE_BOTTOM
-        add             r3,  r3,  #2  // Handle h+2 lines with the main loop
-        add             lr,  lr,  #2
-        b               1f
-0:
-        // !LR_HAVE_BOTTOM
-        sub             r3,  r3,  #1  // Handle h-1 lines with the main loop
-1:
-        mov             r9,  r3       // Backup of h for next loops
-
-1:
-        // Start of horizontal loop; start one vertical filter slice.
-        // Start loading rows into q6-q15 and q0-q3,q5 taking top
-        // padding into consideration.
-        tst             r4,  #4 // LR_HAVE_TOP
-        vld1.32         {q6,  q7},  [r5, :128], r7
-        vld1.16         {q0},       [r6, :128], r8
-        beq             2f
-        // LR_HAVE_TOP
-        vld1.32         {q10, q11}, [r5, :128], r7
-        vld1.16         {q2},       [r6, :128], r8
-        vmov            q8,  q6
-        vmov            q9,  q7
-        vmov            q1,  q0
-        vld1.32         {q12, q13}, [r5, :128], r7
-        vld1.16         {q3},       [r6, :128], r8
-        b               3f
-2:      // !LR_HAVE_TOP
-        vmov            q8,  q6
-        vmov            q9,  q7
-        vmov            q1,  q0
-        vmov            q10, q6
-        vmov            q11, q7
-        vmov            q2,  q0
-        vmov            q12, q6
-        vmov            q13, q7
-        vmov            q3,  q0
-
-3:
-        cmp             r3,  #0
-        beq             4f
-        vld1.32         {q14, q15}, [r5, :128], r7
-        vld1.16         {q5},       [r6, :128], r8
-
-3:
-        // Start of vertical loop
-        subs            r3,  r3,  #2
-.macro add5
-        vadd.i32        q6,  q6,  q8
-        vadd.i32        q7,  q7,  q9
-        vadd.i16        q0,  q0,  q1
-        vadd.i32        q6,  q6,  q10
-        vadd.i32        q7,  q7,  q11
-        vadd.i16        q0,  q0,  q2
-        vadd.i32        q6,  q6,  q12
-        vadd.i32        q7,  q7,  q13
-        vadd.i16        q0,  q0,  q3
-        vadd.i32        q6,  q6,  q14
-        vadd.i32        q7,  q7,  q15
-        vadd.i16        q0,  q0,  q5
-        vst1.32         {q6, q7}, [r0, :128], r7
-        vst1.16         {q0},     [r1, :128], r8
-.endm
-        add5
-.macro shift2
-        vmov            q6,  q10
-        vmov            q7,  q11
-        vmov            q0,  q2
-        vmov            q8,  q12
-        vmov            q9,  q13
-        vmov            q1,  q3
-        vmov            q10, q14
-        vmov            q11, q15
-        vmov            q2,  q5
-.endm
-        shift2
-        add             r0,  r0,  r7
-        add             r1,  r1,  r8
-        ble             5f
-        vld1.32         {q12, q13}, [r5, :128], r7
-        vld1.16         {q3},       [r6, :128], r8
-        vld1.32         {q14, q15}, [r5, :128], r7
-        vld1.16         {q5},       [r6, :128], r8
-        b               3b
-
-4:
-        // h == 1, !LR_HAVE_BOTTOM.
-        // Pad the last row with the only content row, and add.
-        vmov            q14, q12
-        vmov            q15, q13
-        vmov            q5,  q3
-        add5
-        shift2
-        add             r0,  r0,  r7
-        add             r1,  r1,  r8
-        add5
-        b               6f
-
-5:
-        tst             r4,  #8 // LR_HAVE_BOTTOM
-        bne             6f
-        // !LR_HAVE_BOTTOM
-        cmp             r3,  #0
-        bne             5f
-        // The intended three edge rows left; output the one at h-2 and
-        // the past edge one at h.
-        vld1.32         {q12, q13}, [r5, :128], r7
-        vld1.16         {q3},       [r6, :128], r8
-        // Pad the past-edge row from the last content row.
-        vmov            q14, q12
-        vmov            q15, q13
-        vmov            q5,  q3
-        add5
-        shift2
-        add             r0,  r0,  r7
-        add             r1,  r1,  r8
-        // The last two rows are already padded properly here.
-        add5
-        b               6f
-
-5:
-        // r3 == -1, two rows left, output one.
-        // Pad the last two rows from the mid one.
-        vmov            q12, q10
-        vmov            q13, q11
-        vmov            q3,  q2
-        vmov            q14, q10
-        vmov            q15, q11
-        vmov            q5,  q2
-        add5
-        add             r0,  r0,  r7
-        add             r1,  r1,  r8
-        b               6f
-
-6:      // End of one vertical slice.
-        subs            r2,  r2,  #8
-        ble             0f
-        // Move pointers back up to the top and loop horizontally.
-        // Input pointers
-        mls             r5,  r7,  lr,  r5
-        mls             r6,  r8,  lr,  r6
-        // Output pointers
-        mls             r0,  r7,  r12, r0
-        mls             r1,  r8,  r12, r1
-        add             r0,  r0,  #32
-        add             r1,  r1,  #16
-        add             r5,  r5,  #32
-        add             r6,  r6,  #16
-        mov             r3,  r9
-        b               1b
-
-0:
-        vpop            {q5-q7}
-        pop             {r4-r9,pc}
-.purgem add5
-endfunc
-
-// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
-//                              const int w, const int h, const int strength);
-// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
-//                              const int w, const int h, const int strength);
-function sgr_calc_ab1_neon, export=1
-        push            {r4-r5,lr}
-        vpush           {q4-q7}
-        ldr             r4,  [sp, #76]
-        add             r3,  r3,  #2   // h += 2
-        vmov.i32        q15, #9        // n
-        movw            r5,  #455
-        mov             lr,  #SUM_STRIDE
-        b               sgr_calc_ab_neon
-endfunc
-
-function sgr_calc_ab2_neon, export=1
-        push            {r4-r5,lr}
-        vpush           {q4-q7}
-        ldr             r4,  [sp, #76]
-        add             r3,  r3,  #3   // h += 3
-        asr             r3,  r3,  #1   // h /= 2
-        vmov.i32        q15, #25       // n
-        mov             r5,  #164
-        mov             lr,  #(2*SUM_STRIDE)
-endfunc
-
-function sgr_calc_ab_neon
-        movrel          r12, X(sgr_x_by_x)
-        vld1.8          {q8, q9}, [r12, :128]!
-        vmov.i8         q11, #5
-        vmov.i8         d10, #55       // idx of last 5
-        vld1.8          {q10},    [r12, :128]
-        vmov.i8         d11, #72       // idx of last 4
-        vmov.i8         d12, #101      // idx of last 3
-        vmov.i8         d13, #169      // idx of last 2
-        vmov.i8         d14, #254      // idx of last 1
-        vmov.i8         d15, #32       // elements consumed in first vtbl
-        add             r2,  r2,  #2   // w += 2
-        add             r12, r2,  #7
-        bic             r12, r12, #7   // aligned w
-        sub             r12, lr,  r12  // increment between rows
-        vmov.i16        q13, #256
-        vdup.32         q12, r4
-        vdup.32         q14, r5        // one_by_x
-        sub             r0,  r0,  #(4*(SUM_STRIDE))
-        sub             r1,  r1,  #(2*(SUM_STRIDE))
-        mov             r4,  r2        // backup of w
-        vsub.i8         q8,  q8,  q11
-        vsub.i8         q9,  q9,  q11
-        vsub.i8         q10, q10, q11
-1:
-        subs            r2,  r2,  #8
-        vld1.32         {q0, q1}, [r0, :128] // a
-        vld1.16         {q2},     [r1, :128] // b
-        vmul.i32        q0,  q0,  q15  // a * n
-        vmul.i32        q1,  q1,  q15  // a * n
-        vmull.u16       q3,  d4,  d4   // b * b
-        vmull.u16       q4,  d5,  d5   // b * b
-        vqsub.u32       q0,  q0,  q3   // imax(a * n - b * b, 0)
-        vqsub.u32       q1,  q1,  q4   // imax(a * n - b * b, 0)
-        vmul.i32        q0,  q0,  q12  // p * s
-        vmul.i32        q1,  q1,  q12  // p * s
-        vqshrn.u32      d0,  q0,  #16
-        vqshrn.u32      d1,  q1,  #16
-        vqrshrn.u16     d0,  q0,  #4   // imin(z, 255)
-
-        vcgt.u8         d2,  d0,  d10  // = -1 if sgr_x_by_x[d0] < 5
-        vcgt.u8         d3,  d0,  d11  // = -1 if sgr_x_by_x[d0] < 4
-        vtbl.8          d1,  {q8, q9}, d0
-        vcgt.u8         d6,  d0,  d12  // = -1 if sgr_x_by_x[d0] < 3
-        vsub.i8         d9,  d0,  d15  // indices for vtbx
-        vcgt.u8         d7,  d0,  d13  // = -1 if sgr_x_by_x[d0] < 2
-        vadd.i8         d2,  d2,  d3
-        vtbx.8          d1,  {q10}, d9
-        vcgt.u8         d8,  d0,  d14  // = -1 if sgr_x_by_x[d0] < 1
-        vadd.i8         d6,  d6,  d7
-        vadd.i8         d8,  d8,  d22
-        vadd.i8         d2,  d2,  d6
-        vadd.i8         d1,  d1,  d8
-        vadd.i8         d1,  d1,  d2
-        vmovl.u8        q0,  d1        // x
-
-        vmull.u16       q1,  d0,  d4   // x * BB[i]
-        vmull.u16       q2,  d1,  d5   // x * BB[i]
-        vmul.i32        q1,  q1,  q14  // x * BB[i] * sgr_one_by_x
-        vmul.i32        q2,  q2,  q14  // x * BB[i] * sgr_one_by_x
-        vrshr.s32       q1,  q1,  #12  // AA[i]
-        vrshr.s32       q2,  q2,  #12  // AA[i]
-        vsub.i16        q0,  q13, q0   // 256 - x
-
-        vst1.32         {q1, q2}, [r0, :128]!
-        vst1.16         {q0},     [r1, :128]!
-        bgt             1b
-
-        subs            r3,  r3,  #1
-        ble             0f
-        add             r0,  r0,  r12, lsl #2
-        add             r1,  r1,  r12, lsl #1
-        mov             r2,  r4
-        b               1b
-0:
-        vpop            {q4-q7}
-        pop             {r4-r5,pc}
-endfunc
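
The two entry points above differ only in the box size: ab1 uses n = 9 and one_by_x = 455, ab2 uses n = 25 and one_by_x = 164, and both continue into the shared sgr_calc_ab_neon body. Pieced together from the inline comments, the per-element math is roughly the following scalar C (a hedged sketch, not dav1d's reference code; x_by_x stands for the dav1d_sgr_x_by_x table loaded via movrel, and the NEON code realises the z clamp as two saturating narrowing shifts and the table lookup as vtbl/vtbx with range fixups):

    #include <stdint.h>

    /* Illustrative sketch of the per-element math in sgr_calc_ab_neon above. */
    static inline void calc_ab_one(int32_t *a /* AA[i] */, int16_t *b /* BB[i] */,
                                   int n, int strength, int one_by_x,
                                   const uint8_t x_by_x[256]) {
        const uint32_t aa = (uint32_t)*a, bb = (uint16_t)*b;
        const uint32_t p = aa * n > bb * bb ? aa * n - bb * bb : 0; // imax(a*n - b*b, 0)
        uint32_t z = (p * (uint32_t)strength + (1 << 19)) >> 20;    // p * s, then the #16/#4 shifts
        if (z > 255) z = 255;                                       // imin(z, 255)
        const int x = x_by_x[z];
        *a = (int32_t)((x * (int)bb * one_by_x + (1 << 11)) >> 12); // x * BB[i] * sgr_one_by_x -> AA[i]
        *b = (int16_t)(256 - x);                                    // 256 - x
    }
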
-
-#define FILTER_OUT_STRIDE 384
-
-// void dav1d_sgr_finish_filter1_8bpc_neon(int16_t *tmp,
-//                                         const pixel *src, const ptrdiff_t stride,
-//                                         const int32_t *a, const int16_t *b,
-//                                         const int w, const int h);
-function sgr_finish_filter1_8bpc_neon, export=1
-        push            {r4-r11,lr}
-        vpush           {q4-q7}
-        ldrd            r4,  r5,  [sp, #100]
-        ldr             r6,  [sp, #108]
-        sub             r7,  r3,  #(4*SUM_STRIDE)
-        add             r8,  r3,  #(4*SUM_STRIDE)
-        sub             r9,  r4,  #(2*SUM_STRIDE)
-        add             r10, r4,  #(2*SUM_STRIDE)
-        mov             r11, #SUM_STRIDE
-        mov             r12, #FILTER_OUT_STRIDE
-        add             lr,  r5,  #3
-        bic             lr,  lr,  #3 // Aligned width
-        sub             r2,  r2,  lr
-        sub             r12, r12, lr
-        sub             r11, r11, lr
-        sub             r11, r11, #4 // We read 4 extra elements from both a and b
-        mov             lr,  r5
-        vmov.i16        q14, #3
-        vmov.i32        q15, #3
-1:
-        vld1.16         {q0},       [r9]!
-        vld1.16         {q1},       [r4]!
-        vld1.16         {q2},       [r10]!
-        vld1.32         {q8,  q9},  [r7]!
-        vld1.32         {q10, q11}, [r3]!
-        vld1.32         {q12, q13}, [r8]!
-
-2:
-        subs            r5,  r5,  #4
-        vext.8          d6,  d0,  d1,  #2  // -stride
-        vext.8          d7,  d2,  d3,  #2  // 0
-        vext.8          d8,  d4,  d5,  #2  // +stride
-        vext.8          d9,  d0,  d1,  #4  // +1-stride
-        vext.8          d10, d2,  d3,  #4  // +1
-        vext.8          d11, d4,  d5,  #4  // +1+stride
-        vadd.i16        d2,  d2,  d6       // -1, -stride
-        vadd.i16        d7,  d7,  d8       // 0, +stride
-        vadd.i16        d0,  d0,  d9       // -1-stride, +1-stride
-        vadd.i16        d2,  d2,  d7
-        vadd.i16        d4,  d4,  d11      // -1+stride, +1+stride
-        vadd.i16        d2,  d2,  d10      // +1
-        vadd.i16        d0,  d0,  d4
-
-        vext.8          q3,  q8,  q9,  #4  // -stride
-        vshl.i16        d2,  d2,  #2
-        vext.8          q4,  q8,  q9,  #8  // +1-stride
-        vext.8          q5,  q10, q11, #4  // 0
-        vext.8          q6,  q10, q11, #8  // +1
-        vmla.i16        d2,  d0,  d28      // * 3 -> a
-        vadd.i32        q3,  q3,  q10      // -stride, -1
-        vadd.i32        q8,  q8,  q4       // -1-stride, +1-stride
-        vadd.i32        q5,  q5,  q6       // 0, +1
-        vadd.i32        q8,  q8,  q12      // -1+stride
-        vadd.i32        q3,  q3,  q5
-        vext.8          q7,  q12, q13, #4  // +stride
-        vext.8          q10, q12, q13, #8  // +1+stride
-        vld1.32         {d24[0]}, [r1]!    // src
-        vadd.i32        q3,  q3,  q7       // +stride
-        vadd.i32        q8,  q8,  q10      // +1+stride
-        vshl.i32        q3,  q3,  #2
-        vmla.i32        q3,  q8,  q15      // * 3 -> b
-        vmovl.u8        q12, d24           // src
-        vmov            d0,  d1
-        vmlal.u16       q3,  d2,  d24      // b + a * src
-        vmov            d2,  d3
-        vrshrn.i32      d6,  q3,  #9
-        vmov            d4,  d5
-        vst1.16         {d6}, [r0]!
-
-        ble             3f
-        vmov            q8,  q9
-        vmov            q10, q11
-        vmov            q12, q13
-        vld1.16         {d1},  [r9]!
-        vld1.16         {d3},  [r4]!
-        vld1.16         {d5},  [r10]!
-        vld1.32         {q9},  [r7]!
-        vld1.32         {q11}, [r3]!
-        vld1.32         {q13}, [r8]!
-        b               2b
-
-3:
-        subs            r6,  r6,  #1
-        ble             0f
-        mov             r5,  lr
-        add             r0,  r0,  r12, lsl #1
-        add             r1,  r1,  r2
-        add             r3,  r3,  r11, lsl #2
-        add             r7,  r7,  r11, lsl #2
-        add             r8,  r8,  r11, lsl #2
-        add             r4,  r4,  r11, lsl #1
-        add             r9,  r9,  r11, lsl #1
-        add             r10, r10, r11, lsl #1
-        b               1b
-0:
-        vpop            {q4-q7}
-        pop             {r4-r11,pc}
-endfunc
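
Following the comments in the loop above, finish_filter1 combines the filtered (a, b) planes with the source: the centre and its four horizontal/vertical neighbours get weight 4 (the vshl #2) and the four diagonals weight 3, on both the int16 and the int32 plane, and the result is (b + a*src + (1 << 8)) >> 9. A rough scalar restatement of one output row under those assumptions (not dav1d's C reference; sum_stride corresponds to SUM_STRIDE and the A/B buffers are assumed padded by one row/column on each side):

    #include <stdint.h>

    static void finish_filter1_row(int16_t *tmp, const uint8_t *src,
                                   const int32_t *A, const int16_t *B,
                                   int w, int sum_stride) {
        for (int x = 0; x < w; x++) {
            const int a = 4 * (B[x - sum_stride] + B[x - 1] + B[x] +
                               B[x + 1] + B[x + sum_stride]) +
                          3 * (B[x - sum_stride - 1] + B[x - sum_stride + 1] +
                               B[x + sum_stride - 1] + B[x + sum_stride + 1]); // * 3 -> a
            const int b = 4 * (A[x - sum_stride] + A[x - 1] + A[x] +
                               A[x + 1] + A[x + sum_stride]) +
                          3 * (A[x - sum_stride - 1] + A[x - sum_stride + 1] +
                               A[x + sum_stride - 1] + A[x + sum_stride + 1]); // * 3 -> b
            tmp[x] = (int16_t)((b + a * src[x] + (1 << 8)) >> 9);              // vrshrn #9
        }
    }
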
-
-// void dav1d_sgr_finish_filter2_8bpc_neon(int16_t *tmp,
-//                                         const pixel *src, const ptrdiff_t stride,
-//                                         const int32_t *a, const int16_t *b,
-//                                         const int w, const int h);
-function sgr_finish_filter2_8bpc_neon, export=1
-        push            {r4-r11,lr}
-        vpush           {q4-q7}
-        ldrd            r4,  r5,  [sp, #100]
-        ldr             r6,  [sp, #108]
-        add             r7,  r3,  #(4*(SUM_STRIDE))
-        sub             r3,  r3,  #(4*(SUM_STRIDE))
-        add             r8,  r4,  #(2*(SUM_STRIDE))
-        sub             r4,  r4,  #(2*(SUM_STRIDE))
-        mov             r9,  #(2*SUM_STRIDE)
-        mov             r10, #FILTER_OUT_STRIDE
-        add             r11, r5,  #7
-        bic             r11, r11, #7 // Aligned width
-        sub             r2,  r2,  r11
-        sub             r10, r10, r11
-        sub             r9,  r9,  r11
-        sub             r9,  r9,  #4 // We read 4 extra elements from a
-        sub             r12, r9,  #4 // We read 8 extra elements from b
-        mov             lr,  r5
-
-1:
-        vld1.16         {q0,  q1},  [r4]!
-        vld1.16         {q2,  q3},  [r8]!
-        vld1.32         {q8,  q9},  [r3]!
-        vld1.32         {q11, q12}, [r7]!
-        vld1.32         {q10},      [r3]!
-        vld1.32         {q13},      [r7]!
-
-2:
-        vmov.i16        q14, #5
-        vmov.i16        q15, #6
-        subs            r5,  r5,  #8
-        vext.8          q4,  q0,  q1,  #4  // +1-stride
-        vext.8          q5,  q2,  q3,  #4  // +1+stride
-        vext.8          q6,  q0,  q1,  #2  // -stride
-        vext.8          q7,  q2,  q3,  #2  // +stride
-        vadd.i16        q0,  q0,  q4       // -1-stride, +1-stride
-        vadd.i16        q5,  q2,  q5       // -1+stride, +1+stride
-        vadd.i16        q2,  q6,  q7       // -stride, +stride
-        vadd.i16        q0,  q0,  q5
-
-        vext.8          q4,  q8,  q9,  #8  // +1-stride
-        vext.8          q5,  q9,  q10, #8
-        vext.8          q6,  q11, q12, #8  // +1+stride
-        vext.8          q7,  q12, q13, #8
-        vmul.i16        q0,  q0,  q14      // * 5
-        vmla.i16        q0,  q2,  q15      // * 6
-        vadd.i32        q4,  q4,  q8       // -1-stride, +1-stride
-        vadd.i32        q5,  q5,  q9
-        vadd.i32        q6,  q6,  q11      // -1+stride, +1+stride
-        vadd.i32        q7,  q7,  q12
-        vadd.i32        q4,  q4,  q6
-        vadd.i32        q5,  q5,  q7
-        vext.8          q6,  q8,  q9,  #4  // -stride
-        vext.8          q7,  q9,  q10, #4
-        vext.8          q8,  q11, q12, #4  // +stride
-        vext.8          q11, q12, q13, #4
-
-        vld1.8          {d4}, [r1]!
-
-        vmov.i32        q14, #5
-        vmov.i32        q15, #6
-
-        vadd.i32        q6,  q6,  q8       // -stride, +stride
-        vadd.i32        q7,  q7,  q11
-        vmul.i32        q4,  q4,  q14      // * 5
-        vmla.i32        q4,  q6,  q15      // * 6
-        vmul.i32        q5,  q5,  q14      // * 5
-        vmla.i32        q5,  q7,  q15      // * 6
-
-        vmovl.u8        q2,  d4
-        vmlal.u16       q4,  d0,  d4       // b + a * src
-        vmlal.u16       q5,  d1,  d5       // b + a * src
-        vmov            q0,  q1
-        vrshrn.i32      d8,  q4,  #9
-        vrshrn.i32      d9,  q5,  #9
-        vmov            q2,  q3
-        vst1.16         {q4}, [r0]!
-
-        ble             3f
-        vmov            q8,  q10
-        vmov            q11, q13
-        vld1.16         {q1},       [r4]!
-        vld1.16         {q3},       [r8]!
-        vld1.32         {q9,  q10}, [r3]!
-        vld1.32         {q12, q13}, [r7]!
-        b               2b
-
-3:
-        subs            r6,  r6,  #1
-        ble             0f
-        mov             r5,  lr
-        add             r0,  r0,  r10, lsl #1
-        add             r1,  r1,  r2
-        add             r3,  r3,  r9,  lsl #2
-        add             r7,  r7,  r9,  lsl #2
-        add             r4,  r4,  r12, lsl #1
-        add             r8,  r8,  r12, lsl #1
-
-        vld1.32         {q8, q9}, [r3]!
-        vld1.16         {q0, q1}, [r4]!
-        vld1.32         {q10},    [r3]!
-
-        vmov.i16        q12, #5
-        vmov.i16        q13, #6
-
-4:
-        subs            r5,  r5,  #8
-        vext.8          q3,  q0,  q1,  #4  // +1
-        vext.8          q2,  q0,  q1,  #2  // 0
-        vadd.i16        q0,  q0,  q3       // -1, +1
-
-        vext.8          q4,  q8,  q9,  #4  // 0
-        vext.8          q5,  q9,  q10, #4
-        vext.8          q6,  q8,  q9,  #8  // +1
-        vext.8          q7,  q9,  q10, #8
-        vmul.i16        q2,  q2,  q13      // * 6
-        vmla.i16        q2,  q0,  q12      // * 5 -> a
-        vld1.8          {d22}, [r1]!
-        vadd.i32        q8,  q8,  q6       // -1, +1
-        vadd.i32        q9,  q9,  q7
-        vmovl.u8        q11, d22
-        vmul.i32        q4,  q4,  q15      // * 6
-        vmla.i32        q4,  q8,  q14      // * 5 -> b
-        vmul.i32        q5,  q5,  q15      // * 6
-        vmla.i32        q5,  q9,  q14      // * 5 -> b
-
-        vmlal.u16       q4,  d4,  d22      // b + a * src
-        vmlal.u16       q5,  d5,  d23
-        vmov            q0,  q1
-        vrshrn.i32      d8,  q4,  #8
-        vrshrn.i32      d9,  q5,  #8
-        vmov            q8,  q10
-        vst1.16         {q4}, [r0]!
-
-        ble             5f
-        vld1.16         {q1},      [r4]!
-        vld1.32         {q9, q10}, [r3]!
-        b               4b
-
-5:
-        subs            r6,  r6,  #1
-        ble             0f
-        mov             r5,  lr
-        sub             r3,  r3,  r11, lsl #2 // Rewind r3/r4 to where they started
-        sub             r4,  r4,  r11, lsl #1
-        add             r0,  r0,  r10, lsl #1
-        add             r1,  r1,  r2
-        sub             r3,  r3,  #16
-        sub             r4,  r4,  #16
-        b               1b
-0:
-        vpop            {q4-q7}
-        pop             {r4-r11,pc}
-endfunc
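
finish_filter2 is the box5 counterpart and alternates between two row types, matching the two inner loops above: rows with an (a, b) row directly above and below use weights 6 (vertical neighbours) and 5 (diagonals) with a >> 9, while the in-between rows (label 4) use a single (a, b) row with weights 6 (centre) and 5 (left/right) and a >> 8. A hedged scalar sketch of both row types (not the dav1d C reference; which neighbouring (a, b) row the in-between pass reads is decided by the pointer bookkeeping at label 3 and is left to the caller here):

    #include <stdint.h>

    static void finish_filter2_row(int16_t *tmp, const uint8_t *src,
                                   const int32_t *A, const int16_t *B,
                                   int w, int sum_stride) {
        for (int x = 0; x < w; x++) {
            const int a = 6 * (B[x - sum_stride] + B[x + sum_stride]) +
                          5 * (B[x - sum_stride - 1] + B[x - sum_stride + 1] +
                               B[x + sum_stride - 1] + B[x + sum_stride + 1]);
            const int b = 6 * (A[x - sum_stride] + A[x + sum_stride]) +
                          5 * (A[x - sum_stride - 1] + A[x - sum_stride + 1] +
                               A[x + sum_stride - 1] + A[x + sum_stride + 1]);
            tmp[x] = (int16_t)((b + a * src[x] + (1 << 8)) >> 9);     // vrshrn #9
        }
    }

    static void finish_filter2_between_row(int16_t *tmp, const uint8_t *src,
                                           const int32_t *A, const int16_t *B, int w) {
        for (int x = 0; x < w; x++) {
            const int a = 6 * B[x] + 5 * (B[x - 1] + B[x + 1]);       // * 5 -> a
            const int b = 6 * A[x] + 5 * (A[x - 1] + A[x + 1]);       // * 5 -> b
            tmp[x] = (int16_t)((b + a * src[x] + (1 << 7)) >> 8);     // vrshrn #8
        }
    }
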
-
-// void dav1d_sgr_weighted1_8bpc_neon(pixel *dst, const ptrdiff_t dst_stride,
-//                                    const pixel *src, const ptrdiff_t src_stride,
-//                                    const int16_t *t1, const int w, const int h,
-//                                    const int wt);
-function sgr_weighted1_8bpc_neon, export=1
-        push            {r4-r9,lr}
-        ldrd            r4,  r5,  [sp, #28]
-        ldrd            r6,  r7,  [sp, #36]
-        ldr             r8,  [sp, #44]
-        vdup.16         d31, r7
-        cmp             r6,  #2
-        add             r9,  r0,  r1
-        add             r12, r2,  r3
-        add             lr,  r4,  #2*FILTER_OUT_STRIDE
-        mov             r7,  #(4*FILTER_OUT_STRIDE)
-        lsl             r1,  r1,  #1
-        lsl             r3,  r3,  #1
-        add             r8,  r5,  #7
-        bic             r8,  r8,  #7 // Aligned width
-        sub             r1,  r1,  r8
-        sub             r3,  r3,  r8
-        sub             r7,  r7,  r8, lsl #1
-        mov             r8,  r5
-        blt             2f
-1:
-        vld1.8          {d0},  [r2]!
-        vld1.8          {d16}, [r12]!
-        vld1.16         {q1},  [r4]!
-        vld1.16         {q9},  [lr]!
-        subs            r5,  r5,  #8
-        vshll.u8        q0,  d0,  #4     // u
-        vshll.u8        q8,  d16, #4     // u
-        vsub.i16        q1,  q1,  q0     // t1 - u
-        vsub.i16        q9,  q9,  q8     // t1 - u
-        vshll.u16       q2,  d0,  #7     // u << 7
-        vshll.u16       q3,  d1,  #7     // u << 7
-        vshll.u16       q10, d16, #7     // u << 7
-        vshll.u16       q11, d17, #7     // u << 7
-        vmlal.s16       q2,  d2,  d31    // v
-        vmlal.s16       q3,  d3,  d31    // v
-        vmlal.s16       q10, d18, d31    // v
-        vmlal.s16       q11, d19, d31    // v
-        vrshrn.i32      d4,  q2,  #11
-        vrshrn.i32      d5,  q3,  #11
-        vrshrn.i32      d20, q10, #11
-        vrshrn.i32      d21, q11, #11
-        vqmovun.s16     d4,  q2
-        vqmovun.s16     d20, q10
-        vst1.8          {d4},  [r0]!
-        vst1.8          {d20}, [r9]!
-        bgt             1b
-
-        sub             r6,  r6,  #2
-        cmp             r6,  #1
-        blt             0f
-        mov             r5,  r8
-        add             r0,  r0,  r1
-        add             r9,  r9,  r1
-        add             r2,  r2,  r3
-        add             r12, r12, r3
-        add             r4,  r4,  r7
-        add             lr,  lr,  r7
-        beq             2f
-        b               1b
-
-2:
-        vld1.8          {d0}, [r2]!
-        vld1.16         {q1}, [r4]!
-        subs            r5,  r5,  #8
-        vshll.u8        q0,  d0,  #4     // u
-        vsub.i16        q1,  q1,  q0     // t1 - u
-        vshll.u16       q2,  d0,  #7     // u << 7
-        vshll.u16       q3,  d1,  #7     // u << 7
-        vmlal.s16       q2,  d2,  d31    // v
-        vmlal.s16       q3,  d3,  d31    // v
-        vrshrn.i32      d4,  q2,  #11
-        vrshrn.i32      d5,  q3,  #11
-        vqmovun.s16     d2,  q2
-        vst1.8          {d2}, [r0]!
-        bgt             2b
-0:
-        pop             {r4-r9,pc}
-endfunc
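
The weighted blend above follows directly from its comments: the source pixel is scaled to u = src << 4, the filtered value t1 is blended in as v = (u << 7) + wt * (t1 - u), and the result is rounded by 11 bits and clamped to 8 bits. As plain C (an illustrative restatement, not dav1d's implementation):

    #include <stdint.h>

    static inline uint8_t sgr_weighted1_px(uint8_t src, int16_t t1, int wt) {
        const int u = src << 4;                        // u
        const int v = (u << 7) + wt * (t1 - u);        // v
        const int d = (v + (1 << 10)) >> 11;           // vrshrn #11
        return d < 0 ? 0 : d > 255 ? 255 : (uint8_t)d; // vqmovun.s16
    }
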
-
-// void dav1d_sgr_weighted2_8bpc_neon(pixel *dst, const ptrdiff_t stride,
-//                                    const pixel *src, const ptrdiff_t src_stride,
-//                                    const int16_t *t1, const int16_t *t2,
-//                                    const int w, const int h,
-//                                    const int16_t wt[2]);
-function sgr_weighted2_8bpc_neon, export=1
-        push            {r4-r11,lr}
-        ldrd            r4,  r5,  [sp, #36]
-        ldrd            r6,  r7,  [sp, #44]
-        ldr             r8,  [sp, #52]
-        cmp             r7,  #2
-        add             r10, r0,  r1
-        add             r11, r2,  r3
-        add             r12, r4,  #2*FILTER_OUT_STRIDE
-        add             lr,  r5,  #2*FILTER_OUT_STRIDE
-        vld2.16         {d30[], d31[]}, [r8] // wt[0], wt[1]
-        mov             r8,  #4*FILTER_OUT_STRIDE
-        lsl             r1,  r1,  #1
-        lsl             r3,  r3,  #1
-        add             r9,  r6,  #7
-        bic             r9,  r9,  #7 // Aligned width
-        sub             r1,  r1,  r9
-        sub             r3,  r3,  r9
-        sub             r8,  r8,  r9, lsl #1
-        mov             r9,  r6
-        blt             2f
-1:
-        vld1.8          {d0},  [r2]!
-        vld1.8          {d16}, [r11]!
-        vld1.16         {q1},  [r4]!
-        vld1.16         {q9},  [r12]!
-        vld1.16         {q2},  [r5]!
-        vld1.16         {q10}, [lr]!
-        subs            r6,  r6,  #8
-        vshll.u8        q0,  d0,  #4     // u
-        vshll.u8        q8,  d16, #4     // u
-        vsub.i16        q1,  q1,  q0     // t1 - u
-        vsub.i16        q2,  q2,  q0     // t2 - u
-        vsub.i16        q9,  q9,  q8     // t1 - u
-        vsub.i16        q10, q10, q8     // t2 - u
-        vshll.u16       q3,  d0,  #7     // u << 7
-        vshll.u16       q0,  d1,  #7     // u << 7
-        vshll.u16       q11, d16, #7     // u << 7
-        vshll.u16       q8,  d17, #7     // u << 7
-        vmlal.s16       q3,  d2,  d30    // wt[0] * (t1 - u)
-        vmlal.s16       q3,  d4,  d31    // wt[1] * (t2 - u)
-        vmlal.s16       q0,  d3,  d30    // wt[0] * (t1 - u)
-        vmlal.s16       q0,  d5,  d31    // wt[1] * (t2 - u)
-        vmlal.s16       q11, d18, d30    // wt[0] * (t1 - u)
-        vmlal.s16       q11, d20, d31    // wt[1] * (t2 - u)
-        vmlal.s16       q8,  d19, d30    // wt[0] * (t1 - u)
-        vmlal.s16       q8,  d21, d31    // wt[1] * (t2 - u)
-        vrshrn.i32      d6,  q3,  #11
-        vrshrn.i32      d7,  q0,  #11
-        vrshrn.i32      d22, q11, #11
-        vrshrn.i32      d23, q8,  #11
-        vqmovun.s16     d6,  q3
-        vqmovun.s16     d22, q11
-        vst1.8          {d6},  [r0]!
-        vst1.8          {d22}, [r10]!
-        bgt             1b
-
-        subs            r7,  r7,  #2
-        cmp             r7,  #1
-        blt             0f
-        mov             r6,  r9
-        add             r0,  r0,  r1
-        add             r10, r10, r1
-        add             r2,  r2,  r3
-        add             r11, r11, r3
-        add             r4,  r4,  r8
-        add             r12, r12, r8
-        add             r5,  r5,  r8
-        add             lr,  lr,  r8
-        beq             2f
-        b               1b
-
-2:
-        vld1.8          {d0}, [r2]!
-        vld1.16         {q1}, [r4]!
-        vld1.16         {q2}, [r5]!
-        subs            r6,  r6,  #8
-        vshll.u8        q0,  d0,  #4     // u
-        vsub.i16        q1,  q1,  q0     // t1 - u
-        vsub.i16        q2,  q2,  q0     // t2 - u
-        vshll.u16       q3,  d0,  #7     // u << 7
-        vshll.u16       q0,  d1,  #7     // u << 7
-        vmlal.s16       q3,  d2,  d30    // wt[0] * (t1 - u)
-        vmlal.s16       q3,  d4,  d31    // wt[1] * (t2 - u)
-        vmlal.s16       q0,  d3,  d30    // wt[0] * (t1 - u)
-        vmlal.s16       q0,  d5,  d31    // wt[1] * (t2 - u)
-        vrshrn.i32      d6,  q3,  #11
-        vrshrn.i32      d7,  q0,  #11
-        vqmovun.s16     d6,  q3
-        vst1.8          {d6}, [r0]!
-        bgt             2b
-0:
-        pop             {r4-r11,pc}
-endfunc
+sgr_funcs 8
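
sgr_weighted2 is the two-pass variant of the blend above, mixing both self-guided outputs with a weight pair; per its comments, v = (u << 7) + wt[0] * (t1 - u) + wt[1] * (t2 - u) with the same rounding and clamping. A short C restatement (illustrative only):

    #include <stdint.h>

    static inline uint8_t sgr_weighted2_px(uint8_t src, int16_t t1, int16_t t2,
                                           const int16_t wt[2]) {
        const int u = src << 4;                                       // u
        const int v = (u << 7) + wt[0] * (t1 - u) + wt[1] * (t2 - u); // v
        const int d = (v + (1 << 10)) >> 11;
        return d < 0 ? 0 : d > 255 ? 255 : (uint8_t)d;
    }

The replacement line "sgr_funcs 8" presumably instantiates these SGR routines from the shared looprestoration_tmpl.S template for 8 bpc, matching the #include added in the new 16 bpc file below.
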
--- /dev/null
+++ b/src/arm/32/looprestoration16.S
@@ -1,0 +1,1270 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// void dav1d_wiener_filter_h_16bpc_neon(int16_t *dst, const pixel (*left)[4],
+//                                       const pixel *src, ptrdiff_t stride,
+//                                       const int16_t fh[7], const intptr_t w,
+//                                       int h, enum LrEdgeFlags edges,
+//                                       const int bitdepth_max);
+function wiener_filter_h_16bpc_neon, export=1
+        push            {r4-r11,lr}
+        vpush           {q4-q7}
+        ldrd            r4,  r5,  [sp, #100]
+        ldrd            r6,  r7,  [sp, #108]
+        ldr             r8,       [sp, #116] // bitdepth_max
+        vld1.16         {q0}, [r4, :128]
+        clz             r8,  r8
+        vmov.i32        q14, #1
+        sub             r9,  r8,  #38  // -(bitdepth + 6)
+        sub             r8,  r8,  #25  // -round_bits_h
+        neg             r9,  r9        // bitdepth + 6
+        vdup.32         q1,  r9
+        vdup.32         q13, r8        // -round_bits_h
+        vmov.i16        q15, #8192
+        vshl.u32        q14, q14, q1   // 1 << (bitdepth + 6)
+        mov             r8,  r5
+        // Calculate mid_stride
+        add             r10, r5,  #7
+        bic             r10, r10, #7
+        lsl             r10, r10, #1
+
+        // Clear the last unused element of q0, to allow filtering a single
+        // pixel with one plain vmul+vpadd.
+        mov             r12, #0
+        vmov.16         d1[3], r12
+
+        // Set up pointers for reading/writing alternate rows
+        add             r12, r0,  r10
+        lsl             r10, r10, #1
+        add             lr,  r2,  r3
+        lsl             r3,  r3,  #1
+
+        // Subtract the width from mid_stride
+        sub             r10, r10, r5, lsl #1
+
+        // For w >= 8, we read ((w+5)&~7)+8 pixels; for w < 8 we read 16 pixels.
+        cmp             r5,  #8
+        add             r11, r5,  #13
+        bic             r11, r11, #7
+        bge             1f
+        mov             r11, #16
+1:
+        sub             r3,  r3,  r11, lsl #1
+
+        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+        tst             r7,  #1 // LR_HAVE_LEFT
+        beq             2f
+        // LR_HAVE_LEFT
+        cmp             r1,  #0
+        bne             0f
+        // left == NULL
+        sub             r2,  r2,  #6
+        sub             lr,  lr,  #6
+        b               1f
+0:      // LR_HAVE_LEFT, left != NULL
+2:      // !LR_HAVE_LEFT, increase the stride.
+        // For this case we don't read the left 3 pixels from the src pointer,
+        // but shift it as if we had done that.
+        add             r3,  r3,  #6
+
+
+1:      // Loop vertically
+        vld1.16         {q2, q3}, [r2]!
+        vld1.16         {q4, q5}, [lr]!
+
+        tst             r7,  #1 // LR_HAVE_LEFT
+        beq             0f
+        cmp             r1,  #0
+        beq             2f
+        // LR_HAVE_LEFT, left != NULL
+        vld1.16         {d3},  [r1]!
+        // Move r2/lr back to account for the last 3 pixels we loaded earlier,
+        // which we'll shift out.
+        sub             r2,  r2,  #6
+        sub             lr,  lr,  #6
+        vld1.16         {d13}, [r1]!
+        vext.8          q3,  q2,  q3,  #10
+        vext.8          q2,  q1,  q2,  #10
+        vext.8          q5,  q4,  q5,  #10
+        vext.8          q4,  q6,  q4,  #10
+        b               2f
+0:
+        // !LR_HAVE_LEFT, fill q1 with the leftmost pixel
+        // and shift q2/q3 to have 3x the first pixel at the front.
+        vdup.16         q1,  d4[0]
+        vdup.16         q6,  d8[0]
+        // Move r2 back to account for the last 3 pixels we loaded before,
+        // which we shifted out.
+        sub             r2,  r2,  #6
+        sub             lr,  lr,  #6
+        vext.8          q3,  q2,  q3,  #10
+        vext.8          q2,  q1,  q2,  #10
+        vext.8          q5,  q4,  q5,  #10
+        vext.8          q4,  q6,  q4,  #10
+
+2:
+
+        tst             r7,  #2 // LR_HAVE_RIGHT
+        bne             4f
+        // If we'll need to pad the right edge, load that pixel to pad with
+        // here since we can find it pretty easily from here.
+        sub             r9,  r5,  #14
+        lsl             r9,  r9,  #1
+        ldrh            r11, [r2, r9]
+        ldrh            r9,  [lr, r9]
+        // Fill q11/q12 with the right padding pixel
+        vdup.16         q11, r11
+        vdup.16         q12, r9
+3:      // !LR_HAVE_RIGHT
+        // If we'll have to pad the right edge we need to quit early here.
+        cmp             r5,  #11
+        bge             4f   // If w >= 11, all used input pixels are valid
+        cmp             r5,  #7
+        bge             5f   // If w >= 7, we can filter 4 pixels
+        b               6f
+
+4:      // Loop horizontally
+        vext.8          q8,  q2,  q3,  #2
+        vext.8          q9,  q2,  q3,  #4
+        vext.8          q10, q2,  q3,  #6
+        vmull.s16       q6,  d4,  d0[0]
+        vmlal.s16       q6,  d16, d0[1]
+        vmlal.s16       q6,  d18, d0[2]
+        vmlal.s16       q6,  d20, d0[3]
+        vmull.s16       q7,  d5,  d0[0]
+        vmlal.s16       q7,  d17, d0[1]
+        vmlal.s16       q7,  d19, d0[2]
+        vmlal.s16       q7,  d21, d0[3]
+        vext.8          q8,  q2,  q3,  #8
+        vext.8          q9,  q2,  q3,  #10
+        vext.8          q10, q2,  q3,  #12
+        vmlal.s16       q6,  d16, d1[0]
+        vmlal.s16       q6,  d18, d1[1]
+        vmlal.s16       q6,  d20, d1[2]
+        vmlal.s16       q7,  d17, d1[0]
+        vmlal.s16       q7,  d19, d1[1]
+        vmlal.s16       q7,  d21, d1[2]
+        vext.8          q2,  q4,  q5,  #2
+        vext.8          q10, q4,  q5,  #6
+        vmull.s16       q8,  d8,  d0[0]
+        vmlal.s16       q8,  d4,  d0[1]
+        vmlal.s16       q8,  d20, d0[3]
+        vmull.s16       q9,  d9,  d0[0]
+        vmlal.s16       q9,  d5,  d0[1]
+        vmlal.s16       q9,  d21, d0[3]
+        vext.8          q2,  q4,  q5,  #4
+        vext.8          q10, q4,  q5,  #8
+        vmlal.s16       q8,  d4,  d0[2]
+        vmlal.s16       q8,  d20, d1[0]
+        vmlal.s16       q9,  d5,  d0[2]
+        vmlal.s16       q9,  d21, d1[0]
+        vext.8          q2,  q4,  q5,  #10
+        vext.8          q10, q4,  q5,  #12
+        vmlal.s16       q8,  d4,  d1[1]
+        vmlal.s16       q8,  d20, d1[2]
+        vmlal.s16       q9,  d5,  d1[1]
+        vmlal.s16       q9,  d21, d1[2]
+
+        vmvn.i16        q10, #0x8000 // 0x7fff = (1 << 15) - 1
+        vadd.i32        q6,  q6,  q14
+        vadd.i32        q7,  q7,  q14
+        vadd.i32        q8,  q8,  q14
+        vadd.i32        q9,  q9,  q14
+        vrshl.s32       q6,  q6,  q13
+        vrshl.s32       q7,  q7,  q13
+        vrshl.s32       q8,  q8,  q13
+        vrshl.s32       q9,  q9,  q13
+        vqmovun.s32     d12, q6
+        vqmovun.s32     d13, q7
+        vqmovun.s32     d14, q8
+        vqmovun.s32     d15, q9
+        vmin.u16        q6,  q6,  q10
+        vmin.u16        q7,  q7,  q10
+        vsub.i16        q6,  q6,  q15
+        vsub.i16        q7,  q7,  q15
+        vst1.16         {q6}, [r0,  :128]!
+        vst1.16         {q7}, [r12, :128]!
+
+        subs            r5,  r5,  #8
+        ble             9f
+        tst             r7,  #2 // LR_HAVE_RIGHT
+        vmov            q2,  q3
+        vmov            q4,  q5
+        vld1.16         {q3}, [r2]!
+        vld1.16         {q5}, [lr]!
+        bne             4b // If we don't need to pad, just keep filtering.
+        b               3b // If we need to pad, check how many pixels we have left.
+
+5:      // Filter 4 pixels, 7 <= w < 11
+.macro filter_4
+        vext.8          d18, d4,  d5,  #6
+        vext.8          d16, d4,  d5,  #2
+        vext.8          d17, d4,  d5,  #4
+        vext.8          d19, d5,  d6,  #2
+        vext.8          d20, d5,  d6,  #4
+        vmull.s16       q6,  d4,  d0[0]
+        vmlal.s16       q6,  d16, d0[1]
+        vmlal.s16       q6,  d17, d0[2]
+        vmlal.s16       q6,  d18, d0[3]
+        vmlal.s16       q6,  d5,  d1[0]
+        vmlal.s16       q6,  d19, d1[1]
+        vmlal.s16       q6,  d20, d1[2]
+
+        vext.8          d18, d8,  d9,  #6
+        vext.8          d16, d8,  d9,  #2
+        vext.8          d17, d8,  d9,  #4
+        vext.8          d19, d9,  d10, #2
+        vext.8          d20, d9,  d10, #4
+        vmull.s16       q7,  d8,  d0[0]
+        vmlal.s16       q7,  d16, d0[1]
+        vmlal.s16       q7,  d17, d0[2]
+        vmlal.s16       q7,  d18, d0[3]
+        vmlal.s16       q7,  d9,  d1[0]
+        vmlal.s16       q7,  d19, d1[1]
+        vmlal.s16       q7,  d20, d1[2]
+
+        vmvn.i16        q10, #0x8000 // 0x7fff = (1 << 15) - 1
+        vadd.i32        q6,  q6,  q14
+        vadd.i32        q7,  q7,  q14
+        vrshl.s32       q6,  q6,  q13
+        vrshl.s32       q7,  q7,  q13
+        vqmovun.s32     d12, q6
+        vqmovun.s32     d13, q7
+        vmin.u16        q6,  q6,  q10
+        vsub.i16        q6,  q6,  q15
+.endm
+        filter_4
+        vst1.16         {d12}, [r0,  :64]!
+        vst1.16         {d13}, [r12, :64]!
+
+        subs            r5,  r5,  #4 // 3 <= w < 7
+        vext.8          q2,  q2,  q3,  #8
+        vext.8          q3,  q3,  q3,  #8
+        vext.8          q4,  q4,  q5,  #8
+        vext.8          q5,  q5,  q5,  #8
+
+6:      // Pad the right edge and filter the last few pixels.
+        // w < 7, w+3 pixels valid in q2-q3
+        cmp             r5,  #5
+        blt             7f
+        bgt             8f
+        // w == 5, 8 pixels valid in q2, q3 invalid
+        vmov            q3,  q11
+        vmov            q5,  q12
+        b               88f
+
+7:      // 1 <= w < 5, 4-7 pixels valid in q2
+        sub             r9,  r5,  #1
+        // r9 = (pixels valid - 4)
+        adr             r11, L(variable_shift_tbl)
+        ldr             r9,  [r11, r9, lsl #2]
+        add             r11, r11, r9
+        vmov            q3,  q11
+        vmov            q5,  q12
+        bx              r11
+
+        .align 2
+L(variable_shift_tbl):
+        .word 44f - L(variable_shift_tbl) + CONFIG_THUMB
+        .word 55f - L(variable_shift_tbl) + CONFIG_THUMB
+        .word 66f - L(variable_shift_tbl) + CONFIG_THUMB
+        .word 77f - L(variable_shift_tbl) + CONFIG_THUMB
+
+44:     // 4 pixels valid in q2/q4, fill the high half with padding.
+        vmov            d5,  d6
+        vmov            d9,  d10
+        b               88f
+        // Shift q2 right, shifting out invalid pixels,
+        // shift q2 left to the original offset, shifting in padding pixels.
+55:     // 5 pixels valid
+        vext.8          q2,  q2,  q2,  #10
+        vext.8          q2,  q2,  q3,  #6
+        vext.8          q4,  q4,  q4,  #10
+        vext.8          q4,  q4,  q5,  #6
+        b               88f
+66:     // 6 pixels valid
+        vext.8          q2,  q2,  q2,  #12
+        vext.8          q2,  q2,  q3,  #4
+        vext.8          q4,  q4,  q4,  #12
+        vext.8          q4,  q4,  q5,  #4
+        b               88f
+77:     // 7 pixels valid
+        vext.8          q2,  q2,  q2,  #14
+        vext.8          q2,  q2,  q3,  #2
+        vext.8          q4,  q4,  q4,  #14
+        vext.8          q4,  q4,  q5,  #2
+        b               88f
+
+8:      // w > 5, w == 6, 9 pixels valid in q2-q3, 1 pixel valid in q3
+        vext.8          q3,  q3,  q3,  #2
+        vext.8          q3,  q3,  q11, #14
+        vext.8          q5,  q5,  q5,  #2
+        vext.8          q5,  q5,  q12, #14
+
+88:
+        // w < 7, q2-q3 padded properly
+        cmp             r5,  #4
+        blt             888f
+
+        // w >= 4, filter 4 pixels
+        filter_4
+        vst1.16         {d12}, [r0,  :64]!
+        vst1.16         {d13}, [r12, :64]!
+        subs            r5,  r5,  #4 // 0 <= w < 4
+        vext.8          q2,  q2,  q3,  #8
+        vext.8          q4,  q4,  q5,  #8
+        beq             9f
+888:    // 1 <= w < 4, filter 1 pixel at a time
+        vmull.s16       q6,  d4,  d0
+        vmull.s16       q7,  d5,  d1
+        vmull.s16       q8,  d8,  d0
+        vmull.s16       q9,  d9,  d1
+        vadd.i32        q6,  q7
+        vadd.i32        q8,  q9
+        vpadd.i32       d12, d12, d13
+        vpadd.i32       d13, d16, d17
+        vpadd.i32       d12, d12, d13
+        vadd.i32        d12, d12, d28
+        vmvn.i16        d20, #0x8000 // 0x7fff = (1 << 15) - 1
+        vrshl.s32       d12, d12, d26
+        vqmovun.s32     d12, q6
+        vmin.u16        d12, d12, d20
+        vsub.i16        d12, d12, d30
+        vst1.16         {d12[0]}, [r0,  :16]!
+        vst1.16         {d12[1]}, [r12, :16]!
+        subs            r5,  r5,  #1
+        vext.8          q2,  q2,  q3,  #2
+        vext.8          q4,  q4,  q5,  #2
+        bgt             888b
+
+9:
+        subs            r6,  r6,  #2
+        ble             0f
+        // Jump to the next row and loop horizontally
+        add             r0,  r0,  r10
+        add             r12, r12, r10
+        add             r2,  r2,  r3
+        add             lr,  lr,  r3
+        mov             r5,  r8
+        b               1b
+0:
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+.purgem filter_4
+endfunc
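
From the register setup at the top of the function, the horizontal Wiener pass for 10/12-bit works per pixel as follows: a bias of 1 << (bitdepth + 6) is added to the 7-tap sum, the sum is rounded down by round_bits_h = bitdepth - 7 (3 at 10 bpc, 5 at 12 bpc), clamped to [0, 0x7fff], and re-centred by subtracting 8192 so the vertical pass can work on signed 16-bit data. A hedged per-pixel sketch with the edge handling omitted (not dav1d's C reference; src is assumed to have 3 valid pixels on each side):

    #include <stdint.h>

    static inline int16_t wiener_h_px(const uint16_t *src, const int16_t fh[7],
                                      int bitdepth /* 10 or 12 */) {
        const int round_bits_h = bitdepth - 7;          // -round_bits_h in q13
        int32_t sum = 1 << (bitdepth + 6);              // bias in q14
        for (int k = 0; k < 7; k++)
            sum += fh[k] * src[k - 3];                  // the vmull/vmlal chain
        int32_t v = (sum + (1 << (round_bits_h - 1))) >> round_bits_h; // vrshl.s32
        if (v < 0) v = 0;                               // vqmovun.s32
        if (v > 0x7fff) v = 0x7fff;                     // vmin with 0x7fff
        return (int16_t)(v - 8192);                     // vsub of q15 (#8192)
    }
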
+
+// void dav1d_wiener_filter_v_16bpc_neon(pixel *dst, ptrdiff_t stride,
+//                                       const int16_t *mid, int w, int h,
+//                                       const int16_t fv[7], enum LrEdgeFlags edges,
+//                                       ptrdiff_t mid_stride, const int bitdepth_max);
+function wiener_filter_v_16bpc_neon, export=1
+        push            {r4-r7,lr}
+        vpush           {q4-q5}
+        ldrd            r4,  r5,  [sp, #52]
+        ldrd            r6,  r7,  [sp, #60]
+        ldr             lr,       [sp, #68] // bitdepth_max
+        vld1.16         {q0},  [r5, :128]
+        vdup.16         q5,  lr
+        clz             lr,  lr
+        sub             lr,  lr,  #11   // round_bits_v
+        vdup.32         q4,  lr
+        mov             lr,  r4
+        vneg.s32        q4,  q4         // -round_bits_v
+
+        // Calculate the number of rows to move back when looping vertically
+        mov             r12, r4
+        tst             r6,  #4 // LR_HAVE_TOP
+        beq             0f
+        sub             r2,  r2,  r7, lsl #1
+        add             r12, r12, #2
+0:
+        tst             r6,  #8 // LR_HAVE_BOTTOM
+        beq             1f
+        add             r12, r12, #2
+
+1:      // Start of horizontal loop; start one vertical filter slice.
+        // Load rows into q8-q11 and pad properly.
+        tst             r6,  #4 // LR_HAVE_TOP
+        vld1.16         {q8},  [r2, :128], r7
+        beq             2f
+        // LR_HAVE_TOP
+        vld1.16         {q10}, [r2, :128], r7
+        vmov            q9,  q8
+        vld1.16         {q11}, [r2, :128], r7
+        b               3f
+2:      // !LR_HAVE_TOP
+        vmov            q9,  q8
+        vmov            q10, q8
+        vmov            q11, q8
+
+3:
+        cmp             r4,  #4
+        blt             5f
+        // Start filtering normally; fill in q12-q14 with unique rows.
+        vld1.16         {q12}, [r2, :128], r7
+        vld1.16         {q13}, [r2, :128], r7
+        vld1.16         {q14}, [r2, :128], r7
+
+4:
+.macro filter compare
+        subs            r4,  r4,  #1
+        // Interleaving the mul/mla chains actually hurts performance
+        // significantly on Cortex A53, thus keeping mul/mla tightly
+        // chained like this.
+        vmull.s16       q2,  d16, d0[0]
+        vmlal.s16       q2,  d18, d0[1]
+        vmlal.s16       q2,  d20, d0[2]
+        vmlal.s16       q2,  d22, d0[3]
+        vmlal.s16       q2,  d24, d1[0]
+        vmlal.s16       q2,  d26, d1[1]
+        vmlal.s16       q2,  d28, d1[2]
+        vmull.s16       q3,  d17, d0[0]
+        vmlal.s16       q3,  d19, d0[1]
+        vmlal.s16       q3,  d21, d0[2]
+        vmlal.s16       q3,  d23, d0[3]
+        vmlal.s16       q3,  d25, d1[0]
+        vmlal.s16       q3,  d27, d1[1]
+        vmlal.s16       q3,  d29, d1[2]
+        vrshl.s32       q2,  q2,  q4    // round_bits_v
+        vrshl.s32       q3,  q3,  q4
+        vqmovun.s32     d4,  q2
+        vqmovun.s32     d5,  q3
+        vmin.u16        q2,  q2,  q5    // bitdepth_max
+        vst1.16         {q2}, [r0], r1
+.if \compare
+        cmp             r4,  #4
+.else
+        ble             9f
+.endif
+        vmov            q8,  q9
+        vmov            q9,  q10
+        vmov            q10, q11
+        vmov            q11, q12
+        vmov            q12, q13
+        vmov            q13, q14
+.endm
+        filter          1
+        blt             7f
+        vld1.16         {q14}, [r2, :128], r7
+        b               4b
+
+5:      // Less than 4 rows in total; not all of q12-q13 are filled yet.
+        tst             r6,  #8 // LR_HAVE_BOTTOM
+        beq             6f
+        // LR_HAVE_BOTTOM
+        cmp             r4,  #2
+        // We load at least 2 rows in all cases.
+        vld1.16         {q12}, [r2, :128], r7
+        vld1.16         {q13}, [r2, :128], r7
+        bgt             53f // 3 rows in total
+        beq             52f // 2 rows in total
+51:     // 1 row in total, q11 already loaded, load edge into q12-q14.
+        vmov            q14, q13
+        b               8f
+52:     // 2 rows in total, q11 already loaded, load q12 with content data
+        // and 2 rows of edge.
+        vld1.16         {q14}, [r2, :128], r7
+        vmov            q15, q14
+        b               8f
+53:
+        // 3 rows in total, q11 already loaded, load q12 and q13 with content
+        // and 2 rows of edge.
+        vld1.16         {q14}, [r2, :128], r7
+        vld1.16         {q15}, [r2, :128], r7
+        vmov            q1,  q15
+        b               8f
+
+6:
+        // !LR_HAVE_BOTTOM
+        cmp             r4,  #2
+        bgt             63f // 3 rows in total
+        beq             62f // 2 rows in total
+61:     // 1 row in total, q11 already loaded, pad that into q12-q14.
+        vmov            q12, q11
+        vmov            q13, q11
+        vmov            q14, q11
+        b               8f
+62:     // 2 rows in total, q11 already loaded, load q12 and pad that into q12-q15.
+        vld1.16         {q12}, [r2, :128], r7
+        vmov            q13, q12
+        vmov            q14, q12
+        vmov            q15, q12
+        b               8f
+63:
+        // 3 rows in total, q11 already loaded, load q12 and q13 and pad q13 into q14-q15,q1.
+        vld1.16         {q12}, [r2, :128], r7
+        vld1.16         {q13}, [r2, :128], r7
+        vmov            q14, q13
+        vmov            q15, q13
+        vmov            q1,  q13
+        b               8f
+
+7:
+        // All registers up to q13 are filled already, 3 valid rows left.
+        // < 4 valid rows left; fill in padding and filter the last
+        // few rows.
+        tst             r6,  #8 // LR_HAVE_BOTTOM
+        beq             71f
+        // LR_HAVE_BOTTOM; load 2 rows of edge.
+        vld1.16         {q14}, [r2, :128], r7
+        vld1.16         {q15}, [r2, :128], r7
+        vmov            q1,  q15
+        b               8f
+71:
+        // !LR_HAVE_BOTTOM, pad 3 rows
+        vmov            q14, q13
+        vmov            q15, q13
+        vmov            q1,  q13
+
+8:      // At this point, all registers up to q14-q15,q1 are loaded with
+        // edge/padding (depending on how many rows are left).
+        filter          0 // This branches to 9f when done
+        vmov            q14, q15
+        vmov            q15, q1
+        b               8b
+
+9:      // End of one vertical slice.
+        subs            r3,  r3,  #8
+        ble             0f
+        // Move pointers back up to the top and loop horizontally.
+        mls             r0,  r1,  lr,  r0
+        mls             r2,  r7,  r12, r2
+        add             r0,  r0,  #16
+        add             r2,  r2,  #16
+        mov             r4,  lr
+        b               1b
+
+0:
+        vpop            {q4-q5}
+        pop             {r4-r7,pc}
+.purgem filter
+endfunc
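
The vertical pass is a plain 7-tap column filter over the intermediate buffer produced by the horizontal pass (the 1 << (bitdepth + 6) bias added there amounts to +8192 after its shift, which the -8192 subtraction removes again): the sum is rounded by round_bits_v = 21 - bitdepth (the clz(bitdepth_max) - 11 above, i.e. 11 at 10 bpc, 9 at 12 bpc) and clamped to [0, bitdepth_max]. A per-pixel sketch with the row padding omitted (illustrative only, not the dav1d C reference):

    #include <stddef.h>
    #include <stdint.h>

    static inline uint16_t wiener_v_px(const int16_t *mid, ptrdiff_t mid_stride,
                                       const int16_t fv[7], int bitdepth /* 10 or 12 */) {
        const int bitdepth_max = (1 << bitdepth) - 1;
        const int round_bits_v = 21 - bitdepth;  // 11 (10 bpc) or 9 (12 bpc)
        int32_t sum = 0;
        for (int k = 0; k < 7; k++)              // mid_stride is in elements here
            sum += fv[k] * mid[(k - 3) * mid_stride];
        int32_t v = (sum + (1 << (round_bits_v - 1))) >> round_bits_v; // vrshl.s32
        if (v < 0) v = 0;                        // vqmovun.s32
        if (v > bitdepth_max) v = bitdepth_max;  // vmin with q5
        return (uint16_t)v;
    }
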
+
+// void dav1d_copy_narrow_16bpc_neon(pixel *dst, ptrdiff_t stride,
+//                                   const pixel *src, int w, int h);
+function copy_narrow_16bpc_neon, export=1
+        push            {r4,lr}
+        ldr             r4,  [sp, #8]
+        adr             r12, L(copy_narrow_tbl)
+        ldr             r3,  [r12, r3, lsl #2]
+        add             r12, r12, r3
+        bx              r12
+
+        .align 2
+L(copy_narrow_tbl):
+        .word 0
+        .word 10f - L(copy_narrow_tbl) + CONFIG_THUMB
+        .word 20f - L(copy_narrow_tbl) + CONFIG_THUMB
+        .word 30f - L(copy_narrow_tbl) + CONFIG_THUMB
+        .word 40f - L(copy_narrow_tbl) + CONFIG_THUMB
+        .word 50f - L(copy_narrow_tbl) + CONFIG_THUMB
+        .word 60f - L(copy_narrow_tbl) + CONFIG_THUMB
+        .word 70f - L(copy_narrow_tbl) + CONFIG_THUMB
+
+10:
+        add             r3,  r0,  r1
+        lsl             r1,  r1,  #1
+18:
+        subs            r4,  r4,  #8
+        blt             110f
+        vld1.16         {q0}, [r2, :128]!
+        vst1.16         {d0[0]}, [r0, :16], r1
+        vst1.16         {d0[1]}, [r3, :16], r1
+        vst1.16         {d0[2]}, [r0, :16], r1
+        vst1.16         {d0[3]}, [r3, :16], r1
+        vst1.16         {d1[0]}, [r0, :16], r1
+        vst1.16         {d1[1]}, [r3, :16], r1
+        vst1.16         {d1[2]}, [r0, :16], r1
+        vst1.16         {d1[3]}, [r3, :16], r1
+        ble             0f
+        b               18b
+110:
+        add             r4,  r4,  #8
+        asr             r1,  r1,  #1
+11:
+        subs            r4,  r4,  #1
+        vld1.16         {d0[]},  [r2]!
+        vst1.16         {d0[0]}, [r0], r1
+        bgt             11b
+0:
+        pop             {r4,pc}
+
+20:
+        add             r3,  r0,  r1
+        lsl             r1,  r1,  #1
+24:
+        subs            r4,  r4,  #4
+        blt             210f
+        vld1.32         {q0}, [r2, :128]!
+        vst1.32         {d0[0]}, [r0, :32], r1
+        vst1.32         {d0[1]}, [r3, :32], r1
+        vst1.32         {d1[0]}, [r0, :32], r1
+        vst1.32         {d1[1]}, [r3, :32], r1
+        ble             0f
+        b               24b
+210:
+        add             r4,  r4,  #4
+        asr             r1,  r1,  #1
+22:
+        subs            r4,  r4,  #1
+        vld1.32         {d0[]},  [r2, :32]!
+        vst1.32         {d0[0]}, [r0, :32], r1
+        bgt             22b
+0:
+        pop             {r4,pc}
+
+30:
+        ldr             r3,  [r2]
+        ldrh            r12, [r2, #4]
+        add             r2,  r2,  #6
+        subs            r4,  r4,  #1
+        str             r3,  [r0]
+        strh            r12, [r0, #4]
+        add             r0,  r0,  r1
+        bgt             30b
+        pop             {r4,pc}
+
+40:
+        add             r3,  r0,  r1
+        lsl             r1,  r1,  #1
+42:
+        subs            r4,  r4,  #2
+        blt             41f
+        vld1.16         {q0}, [r2, :128]!
+        vst1.16         {d0}, [r0, :64], r1
+        vst1.16         {d1}, [r3, :64], r1
+        ble             0f
+        b               42b
+41:
+        vld1.16         {d0}, [r2, :64]
+        vst1.16         {d0}, [r0, :64]
+0:
+        pop             {r4,pc}
+
+50:
+        vld1.16         {d0}, [r2]
+        ldrh            r12, [r2, #8]
+        add             r2,  r2,  #10
+        subs            r4,  r4,  #1
+        vst1.16         {d0}, [r0]
+        strh            r12, [r0, #8]
+        add             r0,  r0,  r1
+        bgt             50b
+        pop             {r4,pc}
+
+60:
+        vld1.16         {d0}, [r2]
+        ldr             r12, [r2, #8]
+        add             r2,  r2,  #12
+        subs            r4,  r4,  #1
+        vst1.16         {d0}, [r0]
+        str             r12, [r0, #8]
+        add             r0,  r0,  r1
+        bgt             60b
+        pop             {r4,pc}
+
+70:
+        vld1.16         {d0}, [r2]
+        ldr             r12, [r2, #8]
+        ldrh            lr,  [r2, #12]
+        add             r2,  r2,  #14
+        subs            r4,  r4,  #1
+        vst1.16         {d0}, [r0]
+        str             r12, [r0, #8]
+        strh            lr,  [r0, #12]
+        add             r0,  r0,  r1
+        bgt             70b
+        pop             {r4,pc}
+endfunc
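
copy_narrow handles the leftover columns that the wide filters above don't cover: it copies an h-row block of width w (1..7) of 16-bit pixels from a tightly packed source into dst with a byte stride, dispatching on w through L(copy_narrow_tbl). The scalar equivalent is simply (illustrative sketch):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void copy_narrow_sketch(uint16_t *dst, ptrdiff_t stride,
                                   const uint16_t *src, int w, int h) {
        for (int y = 0; y < h; y++) {
            memcpy(dst, src, w * sizeof(*src));          // w is 1..7
            src += w;                                    // packed source rows
            dst = (uint16_t *)((uint8_t *)dst + stride); // byte stride
        }
    }
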
+
+#define SUM_STRIDE (384+16)
+
+#include "looprestoration_tmpl.S"
+
+// void dav1d_sgr_box3_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
+//                                  const pixel (*left)[4],
+//                                  const pixel *src, const ptrdiff_t stride,
+//                                  const int w, const int h,
+//                                  const enum LrEdgeFlags edges);
+function sgr_box3_h_16bpc_neon, export=1
+        push            {r4-r11,lr}
+        vpush           {q4-q7}
+        ldrd            r4,  r5,  [sp, #100]
+        ldrd            r6,  r7,  [sp, #108]
+        add             r5,  r5,  #2 // w += 2
+
+        // Set up pointers for reading/writing alternate rows
+        add             r10, r0,  #(4*SUM_STRIDE)   // sumsq
+        add             r11, r1,  #(2*SUM_STRIDE)   // sum
+        add             r12, r3,  r4                // src
+        lsl             r4,  r4,  #1
+        mov             r9,       #(2*2*SUM_STRIDE) // double sum stride
+
+        // Subtract the aligned width from the output stride.
+        // With LR_HAVE_RIGHT, align to 8, without it, align to 4.
+        tst             r7,  #2 // LR_HAVE_RIGHT
+        bne             0f
+        // !LR_HAVE_RIGHT
+        add             lr,  r5,  #3
+        bic             lr,  lr,  #3
+        b               1f
+0:
+        add             lr,  r5,  #7
+        bic             lr,  lr,  #7
+1:
+        sub             r9,  r9,  lr, lsl #1
+
+        // Store the width for the vertical loop
+        mov             r8,  r5
+
+        // Subtract the number of pixels read from the input from the stride
+        add             lr,  r5,  #14
+        bic             lr,  lr,  #7
+        sub             r4,  r4,  lr, lsl #1
+
+        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+        tst             r7,  #1 // LR_HAVE_LEFT
+        beq             2f
+        // LR_HAVE_LEFT
+        cmp             r2,  #0
+        bne             0f
+        // left == NULL
+        sub             r3,  r3,  #4
+        sub             r12, r12, #4
+        b               1f
+0:      // LR_HAVE_LEFT, left != NULL
+2:      // !LR_HAVE_LEFT, increase the stride.
+        // For this case we don't read the left 2 pixels from the src pointer,
+        // but shift it as if we had done that.
+        add             r4,  r4,  #4
+
+
+1:      // Loop vertically
+        vld1.16         {q0, q1}, [r3]!
+        vld1.16         {q4, q5}, [r12]!
+
+        tst             r7,  #1 // LR_HAVE_LEFT
+        beq             0f
+        cmp             r2,  #0
+        beq             2f
+        // LR_HAVE_LEFT, left != NULL
+        vld1.16         {d5}, [r2]!
+        // Move r3/r12 back to account for the last 2 pixels we loaded earlier,
+        // which we'll shift out.
+        sub             r3,  r3,  #4
+        sub             r12, r12, #4
+        vld1.16         {d13}, [r2]!
+        vext.8          q1,  q0,  q1,  #12
+        vext.8          q0,  q2,  q0,  #12
+        vext.8          q5,  q4,  q5,  #12
+        vext.8          q4,  q6,  q4,  #12
+        b               2f
+0:
+        // !LR_HAVE_LEFT, fill q2 with the leftmost pixel
+        // and shift q0 to have 2x the first pixel at the front.
+        vdup.16         q2,  d0[0]
+        vdup.16         q6,  d8[0]
+        // Move r3 back to account for the last 2 pixels we loaded before,
+        // which we shifted out.
+        sub             r3,  r3,  #4
+        sub             r12, r12, #4
+        vext.8          q1,  q0,  q1,  #12
+        vext.8          q0,  q2,  q0,  #12
+        vext.8          q5,  q4,  q5,  #12
+        vext.8          q4,  q6,  q4,  #12
+
+2:
+        tst             r7,  #2 // LR_HAVE_RIGHT
+        bne             4f
+        // If we'll need to pad the right edge, load that pixel to pad with
+        // here since we can find it pretty easily from here.
+        sub             lr,  r5,  #(2 + 16 - 2 + 1)
+        lsl             lr,  lr,  #1
+        ldrh            r11, [r3,  lr]
+        ldrh            lr,  [r12, lr]
+        // Fill q14/q15 with the right padding pixel
+        vdup.16         q14, r11
+        vdup.16         q15, lr
+        // Restore r11 after using it for a temporary value
+        add             r11, r1,  #(2*SUM_STRIDE)
+3:      // !LR_HAVE_RIGHT
+        // If we'll have to pad the right edge we need to quit early here.
+        cmp             r5,  #10
+        bge             4f   // If w >= 10, all used input pixels are valid
+        cmp             r5,  #6
+        bge             5f   // If w >= 6, we can filter 4 pixels
+        b               6f
+
+4:      // Loop horizontally
+.macro add3 w
+.if \w > 4
+        vext.8          q8,  q0,  q1,  #2
+        vext.8          q10, q4,  q5,  #2
+        vext.8          q9,  q0,  q1,  #4
+        vext.8          q11, q4,  q5,  #4
+        vadd.i16        q2,  q0,  q8
+        vadd.i16        q3,  q4,  q10
+        vadd.i16        q2,  q2,  q9
+        vadd.i16        q3,  q3,  q11
+.else
+        vext.8          d16, d0,  d1,  #2
+        vext.8          d20, d8,  d9,  #2
+        vext.8          d18, d0,  d1,  #4
+        vext.8          d22, d8,  d9,  #4
+        vadd.i16        d4,  d0,  d16
+        vadd.i16        d6,  d8,  d20
+        vadd.i16        d4,  d4,  d18
+        vadd.i16        d6,  d6,  d22
+.endif
+
+        vmull.u16       q6,  d0,  d0
+        vmlal.u16       q6,  d16, d16
+        vmlal.u16       q6,  d18, d18
+        vmull.u16       q12, d8,  d8
+        vmlal.u16       q12, d20, d20
+        vmlal.u16       q12, d22, d22
+.if \w > 4
+        vmull.u16       q7,  d1,  d1
+        vmlal.u16       q7,  d17, d17
+        vmlal.u16       q7,  d19, d19
+        vmull.u16       q13, d9,  d9
+        vmlal.u16       q13, d21, d21
+        vmlal.u16       q13, d23, d23
+.endif
+.endm
+        add3            8
+        vst1.16         {q2},       [r1,  :128]!
+        vst1.16         {q3},       [r11, :128]!
+        vst1.32         {q6,  q7},  [r0,  :128]!
+        vst1.32         {q12, q13}, [r10, :128]!
+
+        subs            r5,  r5,  #8
+        ble             9f
+        tst             r7,  #2 // LR_HAVE_RIGHT
+        vmov            q0,  q1
+        vmov            q4,  q5
+        vld1.16         {q1}, [r3]!
+        vld1.16         {q5}, [r12]!
+
+        bne             4b // If we don't need to pad, just keep summing.
+        b               3b // If we need to pad, check how many pixels we have left.
+
+5:      // Produce 4 pixels, 6 <= w < 10
+        add3            4
+        vst1.16         {d4},  [r1,  :64]!
+        vst1.16         {d6},  [r11, :64]!
+        vst1.32         {q6},  [r0,  :128]!
+        vst1.32         {q12}, [r10, :128]!
+
+        subs            r5,  r5,  #4 // 2 <= w < 6
+        vext.8          q0,  q0,  q1,  #8
+        vext.8          q4,  q4,  q5,  #8
+
+6:      // Pad the right edge and produce the last few pixels.
+        // 2 <= w < 6, 2-5 pixels valid in q0
+        sub             lr,  r5,  #2
+        // lr = (pixels valid - 2)
+        adr             r11, L(box3_variable_shift_tbl)
+        ldr             lr,  [r11, lr, lsl #2]
+        add             r11, r11, lr
+        bx              r11
+
+        .align 2
+L(box3_variable_shift_tbl):
+        .word 22f - L(box3_variable_shift_tbl) + CONFIG_THUMB
+        .word 33f - L(box3_variable_shift_tbl) + CONFIG_THUMB
+        .word 44f - L(box3_variable_shift_tbl) + CONFIG_THUMB
+        .word 55f - L(box3_variable_shift_tbl) + CONFIG_THUMB
+
+        // Shift q0 right, shifting out invalid pixels,
+        // shift q0 left to the original offset, shifting in padding pixels.
+22:     // 2 pixels valid
+        vext.8          q0,  q0,  q0,  #4
+        vext.8          q4,  q4,  q4,  #4
+        vext.8          q0,  q0,  q14, #12
+        vext.8          q4,  q4,  q15, #12
+        b               88f
+33:     // 3 pixels valid
+        vext.8          q0,  q0,  q0,  #6
+        vext.8          q4,  q4,  q4,  #6
+        vext.8          q0,  q0,  q14, #10
+        vext.8          q4,  q4,  q15, #10
+        b               88f
+44:     // 4 pixels valid
+        vmov            d1,  d28
+        vmov            d9,  d30
+        b               88f
+55:     // 5 pixels valid
+        vext.8          q0,  q0,  q0,  #10
+        vext.8          q4,  q4,  q4,  #10
+        vext.8          q0,  q0,  q14, #6
+        vext.8          q4,  q4,  q15, #6
+
+88:
+        // Restore r11 after using it for a temporary value above
+        add             r11, r1,  #(2*SUM_STRIDE)
+
+        add3            4
+        subs            r5,  r5,  #4
+        vst1.16         {d4},  [r1,  :64]!
+        vst1.16         {d6},  [r11, :64]!
+        vst1.32         {q6},  [r0,  :128]!
+        vst1.32         {q12}, [r10, :128]!
+        ble             9f
+        vext.8          q0,  q0,  q0,  #8
+        vext.8          q4,  q4,  q4,  #8
+        // Only one pixel still needed, but do a normal 4-pixel
+        // addition anyway
+        add3            4
+        vst1.16         {d4},  [r1,  :64]!
+        vst1.16         {d6},  [r11, :64]!
+        vst1.32         {q6},  [r0,  :128]!
+        vst1.32         {q12}, [r10, :128]!
+
+9:
+        subs            r6,  r6,  #2
+        ble             0f
+        // Jump to the next row and loop horizontally
+        add             r0,  r0,  r9, lsl #1
+        add             r10, r10, r9, lsl #1
+        add             r1,  r1,  r9
+        add             r11, r11, r9
+        add             r3,  r3,  r4
+        add             r12, r12, r4
+        mov             r5,  r8
+        b               1b
+0:
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+.purgem add3
+endfunc
+
+// void dav1d_sgr_box5_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
+//                                  const pixel (*left)[4],
+//                                  const pixel *src, const ptrdiff_t stride,
+//                                  const int w, const int h,
+//                                  const enum LrEdgeFlags edges);
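+// Same structure as sgr_box3_h_16bpc above, but each output sums five
+// consecutive pixels (and squared pixels) instead of three.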
+function sgr_box5_h_16bpc_neon, export=1
+        push            {r4-r11,lr}
+        vpush           {q4-q7}
+        ldrd            r4,  r5,  [sp, #100]
+        ldrd            r6,  r7,  [sp, #108]
+        add             r5,  r5,  #2 // w += 2
+
+        // Set up pointers for reading/writing alternate rows
+        add             r10, r0,  #(4*SUM_STRIDE)   // sumsq
+        add             r11, r1,  #(2*SUM_STRIDE)   // sum
+        add             r12, r3,  r4                // src
+        lsl             r4,  r4,  #1
+        mov             r9,       #(2*2*SUM_STRIDE) // double sum stride
+
+        // Subtract the aligned width from the output stride.
+        // With LR_HAVE_RIGHT, align to 8, without it, align to 4.
+        // Subtract the number of pixels read from the input from the stride.
+        tst             r7,  #2 // LR_HAVE_RIGHT
+        bne             0f
+        // !LR_HAVE_RIGHT
+        add             lr,  r5,  #3
+        bic             lr,  lr,  #3
+        add             r8,  r5,  #13
+        b               1f
+0:
+        add             lr,  r5,  #7
+        bic             lr,  lr,  #7
+        add             r8,  r5,  #15
+1:
+        sub             r9,  r9,  lr, lsl #1
+        bic             r8,  r8,  #7
+        sub             r4,  r4,  r8, lsl #1
+
+        // Store the width for the vertical loop
+        mov             r8,  r5
+
+        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT with left == NULL
+        tst             r7,  #1 // LR_HAVE_LEFT
+        beq             2f
+        // LR_HAVE_LEFT
+        cmp             r2,  #0
+        bne             0f
+        // left == NULL
+        sub             r3,  r3,  #6
+        sub             r12, r12, #6
+        b               1f
+0:      // LR_HAVE_LEFT, left != NULL
+2:      // !LR_HAVE_LEFT, increase the stride.
+        // For this case we don't read the left 3 pixels from the src pointer,
+        // but shift it as if we had done that.
+        add             r4,  r4,  #6
+
+1:      // Loop vertically
+        vld1.16         {q0, q1}, [r3]!
+        vld1.16         {q4, q5}, [r12]!
+
+        tst             r7,  #1 // LR_HAVE_LEFT
+        beq             0f
+        cmp             r2,  #0
+        beq             2f
+        // LR_HAVE_LEFT, left != NULL
+        vld1.16         {d5}, [r2]!
+        // Move r3/r12 back to account for the last 3 pixels we loaded earlier,
+        // which we'll shift out.
+        sub             r3,  r3,  #6
+        sub             r12, r12, #6
+        vld1.16         {d13}, [r2]!
+        vext.8          q1,  q0,  q1,  #10
+        vext.8          q0,  q2,  q0,  #10
+        vext.8          q5,  q4,  q5,  #10
+        vext.8          q4,  q6,  q4,  #10
+        b               2f
+0:
+        // !LR_HAVE_LEFT, fill q2/q6 with the leftmost pixel
+        // and shift q0/q4 to have 3x the first pixel at the front.
+        vdup.16         q2,  d0[0]
+        vdup.16         q6,  d8[0]
+        // Move r3/r12 back to account for the last 3 pixels we loaded before,
+        // which we shifted out.
+        sub             r3,  r3,  #6
+        sub             r12, r12, #6
+        vext.8          q1,  q0,  q1,  #10
+        vext.8          q0,  q2,  q0,  #10
+        vext.8          q5,  q4,  q5,  #10
+        vext.8          q4,  q6,  q4,  #10
+
+2:
+        tst             r7,  #2 // LR_HAVE_RIGHT
+        bne             4f
+        // If we'll need to pad the right edge, load that pixel to pad with
+        // here since we can find it pretty easily from here.
+        sub             lr,  r5,  #(2 + 16 - 3 + 1)
+        lsl             lr,  lr,  #1
+        ldrh            r11, [r3,  lr]
+        ldrh            lr,  [r12, lr]
+        // Fill q14/q15 with the right padding pixel
+        vdup.16         q14, r11
+        vdup.16         q15, lr
+        // Restore r11 after using it for a temporary value
+        add             r11, r1,  #(2*SUM_STRIDE)
+3:      // !LR_HAVE_RIGHT
+        // If we'll have to pad the right edge we need to quit early here.
+        cmp             r5,  #11
+        bge             4f   // If w >= 11, all used input pixels are valid
+        cmp             r5,  #7
+        bge             5f   // If w >= 7, we can produce 4 pixels
+        b               6f
+
+4:      // Loop horizontally
+.macro add5 w
+.if \w > 4
+        vext.8          q8,  q0,  q1,  #2
+        vext.8          q10, q4,  q5,  #2
+        vext.8          q9,  q0,  q1,  #4
+        vext.8          q11, q4,  q5,  #4
+        vadd.i16        q2,  q0,  q8
+        vadd.i16        q3,  q4,  q10
+        vadd.i16        q2,  q2,  q9
+        vadd.i16        q3,  q3,  q11
+.else
+        vext.8          d16, d0,  d1,  #2
+        vext.8          d20, d8,  d9,  #2
+        vext.8          d18, d0,  d1,  #4
+        vext.8          d22, d8,  d9,  #4
+        vadd.i16        d4,  d0,  d16
+        vadd.i16        d6,  d8,  d20
+        vadd.i16        d4,  d4,  d18
+        vadd.i16        d6,  d6,  d22
+.endif
+
+        vmull.u16       q6,  d0,  d0
+        vmlal.u16       q6,  d16, d16
+        vmlal.u16       q6,  d18, d18
+        vmull.u16       q12, d8,  d8
+        vmlal.u16       q12, d20, d20
+        vmlal.u16       q12, d22, d22
+.if \w > 4
+        vmull.u16       q7,  d1,  d1
+        vmlal.u16       q7,  d17, d17
+        vmlal.u16       q7,  d19, d19
+        vmull.u16       q13, d9,  d9
+        vmlal.u16       q13, d21, d21
+        vmlal.u16       q13, d23, d23
+.endif
+
+.if \w > 4
+        vext.8          q8,  q0,  q1,  #6
+        vext.8          q10, q4,  q5,  #6
+        vext.8          q9,  q0,  q1,  #8
+        vext.8          q11, q4,  q5,  #8
+        vadd.i16        q2,  q2,  q8
+        vadd.i16        q3,  q3,  q10
+        vadd.i16        q2,  q2,  q9
+        vadd.i16        q3,  q3,  q11
+.else
+        vext.8          d16, d0,  d1,  #6
+        // d18 would be equal to d1; using d1 instead
+        vext.8          d20, d8,  d9,  #6
+        // d22 would be equal to d9; using d9 instead
+        vadd.i16        d4,  d4,  d16
+        vadd.i16        d6,  d6,  d20
+        vadd.i16        d4,  d4,  d1
+        vadd.i16        d6,  d6,  d9
+.endif
+
+        vmlal.u16       q6,  d16, d16
+        vmlal.u16       q6,  d1,  d1
+        vmlal.u16       q12, d20, d20
+        vmlal.u16       q12, d9,  d9
+.if \w > 4
+        vmlal.u16       q7,  d17, d17
+        vmlal.u16       q7,  d19, d19
+        vmlal.u16       q13, d21, d21
+        vmlal.u16       q13, d23, d23
+.endif
+.endm
+        add5            8
+        vst1.16         {q2},       [r1,  :128]!
+        vst1.16         {q3},       [r11, :128]!
+        vst1.32         {q6,  q7},  [r0,  :128]!
+        vst1.32         {q12, q13}, [r10, :128]!
+
+        subs            r5,  r5,  #8
+        ble             9f
+        tst             r7,  #2 // LR_HAVE_RIGHT
+        vmov            q0,  q1
+        vmov            q4,  q5
+        vld1.16         {q1}, [r3]!
+        vld1.16         {q5}, [r12]!
+        bne             4b // If we don't need to pad, just keep summing.
+        b               3b // If we need to pad, check how many pixels we have left.
+
+5:      // Produce 4 pixels, 7 <= w < 11
+        add5            4
+        vst1.16         {d4},  [r1,  :64]!
+        vst1.16         {d6},  [r11, :64]!
+        vst1.32         {q6},  [r0,  :128]!
+        vst1.32         {q12}, [r10, :128]!
+
+        subs            r5,  r5,  #4 // 3 <= w < 7
+        vext.8          q0,  q0,  q1,  #8
+        vext.8          q4,  q4,  q5,  #8
+
+6:      // Pad the right edge and produce the last few pixels.
+        // w < 7, w+1 pixels valid in q0/q4
+        sub             lr,  r5,  #1
+        // lr = pixels valid - 2
+        adr             r11, L(box5_variable_shift_tbl)
+        ldr             lr,  [r11, lr, lsl #2]
+        vmov            q1,  q14
+        vmov            q5,  q15
+        add             r11, r11, lr
+        bx              r11
+
+        .align 2
+L(box5_variable_shift_tbl):
+        .word 22f - L(box5_variable_shift_tbl) + CONFIG_THUMB
+        .word 33f - L(box5_variable_shift_tbl) + CONFIG_THUMB
+        .word 44f - L(box5_variable_shift_tbl) + CONFIG_THUMB
+        .word 55f - L(box5_variable_shift_tbl) + CONFIG_THUMB
+        .word 66f - L(box5_variable_shift_tbl) + CONFIG_THUMB
+        .word 77f - L(box5_variable_shift_tbl) + CONFIG_THUMB
+
+        // Shift q0/q4 right, shifting out invalid pixels,
+        // shift q0/q4 left to the original offset, shifting in padding pixels.
+22:     // 2 pixels valid
+        vext.8          q0,  q0,  q0,  #4
+        vext.8          q4,  q4,  q4,  #4
+        vext.8          q0,  q0,  q14, #12
+        vext.8          q4,  q4,  q15, #12
+        b               88f
+33:     // 3 pixels valid
+        vext.8          q0,  q0,  q0,  #6
+        vext.8          q4,  q4,  q4,  #6
+        vext.8          q0,  q0,  q14, #10
+        vext.8          q4,  q4,  q15, #10
+        b               88f
+44:     // 4 pixels valid
+        vmov            d1,  d28
+        vmov            d9,  d30
+        b               88f
+55:     // 5 pixels valid
+        vext.8          q0,  q0,  q0,  #10
+        vext.8          q4,  q4,  q4,  #10
+        vext.8          q0,  q0,  q14, #6
+        vext.8          q4,  q4,  q15, #6
+        b               88f
+66:     // 6 pixels valid
+        vext.8          q0,  q0,  q0,  #12
+        vext.8          q4,  q4,  q4,  #12
+        vext.8          q0,  q0,  q14, #4
+        vext.8          q4,  q4,  q15, #4
+        b               88f
+77:     // 7 pixels valid
+        vext.8          q0,  q0,  q0,  #14
+        vext.8          q4,  q4,  q4,  #14
+        vext.8          q0,  q0,  q14, #2
+        vext.8          q4,  q4,  q15, #2
+
+88:
+        // Restore r11 after using it for a temporary value above
+        add             r11, r1,  #(2*SUM_STRIDE)
+
+        add5            4
+        subs            r5,  r5,  #4
+        vst1.16         {d4},  [r1,  :64]!
+        vst1.16         {d6},  [r11, :64]!
+        vst1.32         {q6},  [r0,  :128]!
+        vst1.32         {q12}, [r10, :128]!
+        ble             9f
+        vext.8          q0,  q0,  q1,  #8
+        vext.8          q4,  q4,  q5,  #8
+        add5            4
+        vst1.16         {d4},  [r1,  :64]!
+        vst1.16         {d6},  [r11, :64]!
+        vst1.32         {q6},  [r0,  :128]!
+        vst1.32         {q12}, [r10, :128]!
+
+9:
+        subs            r6,  r6,  #2
+        ble             0f
+        // Jump to the next row and loop horizontally
+        add             r0,  r0,  r9, lsl #1
+        add             r10, r10, r9, lsl #1
+        add             r1,  r1,  r9
+        add             r11, r11, r9
+        add             r3,  r3,  r4
+        add             r12, r12, r4
+        mov             r5,  r8
+        b               1b
+0:
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+.purgem add5
+endfunc
+
+sgr_funcs 16
--- /dev/null
+++ b/src/arm/32/looprestoration_common.S
@@ -1,0 +1,453 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+#define SUM_STRIDE (384+16)
+
+// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
+//                            const int w, const int h,
+//                            const enum LrEdgeFlags edges);
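+// Each output is the sum of three vertically adjacent rows of the
+// horizontal box sums; rows missing at the top/bottom are padded from
+// the nearest valid row according to the edges flags.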
+function sgr_box3_v_neon, export=1
+        push            {r4-r9,lr}
+        ldr             r4,  [sp, #28]
+        add             r12, r3,  #2 // Number of output rows to move back
+        mov             lr,  r3      // Number of input rows to move back
+        add             r2,  r2,  #2 // Actual summed width
+        mov             r7,       #(4*SUM_STRIDE) // sumsq stride
+        mov             r8,       #(2*SUM_STRIDE) // sum stride
+        sub             r0,  r0,  #(4*SUM_STRIDE) // sumsq -= stride
+        sub             r1,  r1,  #(2*SUM_STRIDE) // sum   -= stride
+
+        tst             r4,  #4 // LR_HAVE_TOP
+        beq             0f
+        // If we have top, read from row -2.
+        sub             r5,  r0,  #(4*SUM_STRIDE)
+        sub             r6,  r1,  #(2*SUM_STRIDE)
+        add             lr,  lr,  #2
+        b               1f
+0:
+        // !LR_HAVE_TOP
+        // If we don't have top, read from row 0 even if
+        // we start writing to row -1.
+        add             r5,  r0,  #(4*SUM_STRIDE)
+        add             r6,  r1,  #(2*SUM_STRIDE)
+1:
+
+        tst             r4,  #8 // LR_HAVE_BOTTOM
+        beq             1f
+        // LR_HAVE_BOTTOM
+        add             r3,  r3,  #2  // Sum all h+2 lines with the main loop
+        add             lr,  lr,  #2
+1:
+        mov             r9,  r3       // Backup of h for next loops
+
+1:
+        // Start of horizontal loop; start one vertical filter slice.
+        // Start loading rows into q8-q13 and q0-q2 taking top
+        // padding into consideration.
+        tst             r4,  #4 // LR_HAVE_TOP
+        vld1.32         {q8,  q9},  [r5, :128], r7
+        vld1.16         {q0},       [r6, :128], r8
+        beq             2f
+        // LR_HAVE_TOP
+        vld1.32         {q10, q11}, [r5, :128], r7
+        vld1.16         {q1},       [r6, :128], r8
+        vld1.32         {q12, q13}, [r5, :128], r7
+        vld1.16         {q2},       [r6, :128], r8
+        b               3f
+2:      // !LR_HAVE_TOP
+        vmov            q10, q8
+        vmov            q11, q9
+        vmov            q1,  q0
+        vmov            q12, q8
+        vmov            q13, q9
+        vmov            q2,  q0
+
+3:
+        subs            r3,  r3,  #1
+.macro add3
+        vadd.i32        q8,  q8,  q10
+        vadd.i32        q9,  q9,  q11
+        vadd.i16        q0,  q0,  q1
+        vadd.i32        q8,  q8,  q12
+        vadd.i32        q9,  q9,  q13
+        vadd.i16        q0,  q0,  q2
+        vst1.32         {q8, q9}, [r0, :128], r7
+        vst1.16         {q0},     [r1, :128], r8
+.endm
+        add3
+        vmov            q8,  q10
+        vmov            q9,  q11
+        vmov            q0,  q1
+        vmov            q10, q12
+        vmov            q11, q13
+        vmov            q1,  q2
+        ble             4f
+        vld1.32         {q12, q13}, [r5, :128], r7
+        vld1.16         {q2},       [r6, :128], r8
+        b               3b
+
+4:
+        tst             r4,  #8 // LR_HAVE_BOTTOM
+        bne             5f
+        // !LR_HAVE_BOTTOM
+        // Produce two more rows, extending the already loaded rows.
+        add3
+        vmov            q8,  q10
+        vmov            q9,  q11
+        vmov            q0,  q1
+        add3
+
+5:      // End of one vertical slice.
+        subs            r2,  r2,  #8
+        ble             0f
+        // Move pointers back up to the top and loop horizontally.
+        // Input pointers
+        mls             r5,  r7,  lr,  r5
+        mls             r6,  r8,  lr,  r6
+        // Output pointers
+        mls             r0,  r7,  r12, r0
+        mls             r1,  r8,  r12, r1
+        add             r0,  r0,  #32
+        add             r1,  r1,  #16
+        add             r5,  r5,  #32
+        add             r6,  r6,  #16
+        mov             r3,  r9
+        b               1b
+
+0:
+        pop             {r4-r9,pc}
+.purgem add3
+endfunc
+
+// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
+//                            const int w, const int h,
+//                            const enum LrEdgeFlags edges);
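+// Like sgr_box3_v above, but summing five rows per output; the input is
+// stepped two rows at a time and only every second output row is written
+// (the output pointers still advance over the skipped rows).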
+function sgr_box5_v_neon, export=1
+        push            {r4-r9,lr}
+        vpush           {q5-q7}
+        ldr             r4,  [sp, #76]
+        add             r12, r3,  #2 // Number of output rows to move back
+        mov             lr,  r3      // Number of input rows to move back
+        add             r2,  r2,  #8 // Actual summed width
+        mov             r7,       #(4*SUM_STRIDE) // sumsq stride
+        mov             r8,       #(2*SUM_STRIDE) // sum stride
+        sub             r0,  r0,  #(4*SUM_STRIDE) // sumsq -= stride
+        sub             r1,  r1,  #(2*SUM_STRIDE) // sum   -= stride
+
+        tst             r4,  #4 // LR_HAVE_TOP
+        beq             0f
+        // If we have top, read from row -2.
+        sub             r5,  r0,  #(4*SUM_STRIDE)
+        sub             r6,  r1,  #(2*SUM_STRIDE)
+        add             lr,  lr,  #2
+        b               1f
+0:
+        // !LR_HAVE_TOP
+        // If we don't have top, read from row 0 even if
+        // we start writing to row -1.
+        add             r5,  r0,  #(4*SUM_STRIDE)
+        add             r6,  r1,  #(2*SUM_STRIDE)
+1:
+
+        tst             r4,  #8 // LR_HAVE_BOTTOM
+        beq             0f
+        // LR_HAVE_BOTTOM
+        add             r3,  r3,  #2  // Handle h+2 lines with the main loop
+        add             lr,  lr,  #2
+        b               1f
+0:
+        // !LR_HAVE_BOTTOM
+        sub             r3,  r3,  #1  // Handle h-1 lines with the main loop
+1:
+        mov             r9,  r3       // Backup of h for next loops
+
+1:
+        // Start of horizontal loop; start one vertical filter slice.
+        // Start loading rows into q6-q15 and q0-q3,q5 taking top
+        // padding into consideration.
+        tst             r4,  #4 // LR_HAVE_TOP
+        vld1.32         {q6,  q7},  [r5, :128], r7
+        vld1.16         {q0},       [r6, :128], r8
+        beq             2f
+        // LR_HAVE_TOP
+        vld1.32         {q10, q11}, [r5, :128], r7
+        vld1.16         {q2},       [r6, :128], r8
+        vmov            q8,  q6
+        vmov            q9,  q7
+        vmov            q1,  q0
+        vld1.32         {q12, q13}, [r5, :128], r7
+        vld1.16         {q3},       [r6, :128], r8
+        b               3f
+2:      // !LR_HAVE_TOP
+        vmov            q8,  q6
+        vmov            q9,  q7
+        vmov            q1,  q0
+        vmov            q10, q6
+        vmov            q11, q7
+        vmov            q2,  q0
+        vmov            q12, q6
+        vmov            q13, q7
+        vmov            q3,  q0
+
+3:
+        cmp             r3,  #0
+        beq             4f
+        vld1.32         {q14, q15}, [r5, :128], r7
+        vld1.16         {q5},       [r6, :128], r8
+
+3:
+        // Start of vertical loop
+        subs            r3,  r3,  #2
+.macro add5
+        vadd.i32        q6,  q6,  q8
+        vadd.i32        q7,  q7,  q9
+        vadd.i16        q0,  q0,  q1
+        vadd.i32        q6,  q6,  q10
+        vadd.i32        q7,  q7,  q11
+        vadd.i16        q0,  q0,  q2
+        vadd.i32        q6,  q6,  q12
+        vadd.i32        q7,  q7,  q13
+        vadd.i16        q0,  q0,  q3
+        vadd.i32        q6,  q6,  q14
+        vadd.i32        q7,  q7,  q15
+        vadd.i16        q0,  q0,  q5
+        vst1.32         {q6, q7}, [r0, :128], r7
+        vst1.16         {q0},     [r1, :128], r8
+.endm
+        add5
+.macro shift2
+        vmov            q6,  q10
+        vmov            q7,  q11
+        vmov            q0,  q2
+        vmov            q8,  q12
+        vmov            q9,  q13
+        vmov            q1,  q3
+        vmov            q10, q14
+        vmov            q11, q15
+        vmov            q2,  q5
+.endm
+        shift2
+        add             r0,  r0,  r7
+        add             r1,  r1,  r8
+        ble             5f
+        vld1.32         {q12, q13}, [r5, :128], r7
+        vld1.16         {q3},       [r6, :128], r8
+        vld1.32         {q14, q15}, [r5, :128], r7
+        vld1.16         {q5},       [r6, :128], r8
+        b               3b
+
+4:
+        // h == 1, !LR_HAVE_BOTTOM.
+        // Pad the last row with the only content row, and add.
+        vmov            q14, q12
+        vmov            q15, q13
+        vmov            q5,  q3
+        add5
+        shift2
+        add             r0,  r0,  r7
+        add             r1,  r1,  r8
+        add5
+        b               6f
+
+5:
+        tst             r4,  #8 // LR_HAVE_BOTTOM
+        bne             6f
+        // !LR_HAVE_BOTTOM
+        cmp             r3,  #0
+        bne             5f
+        // Three edge rows left, as intended; output the one at h-2 and
+        // the past-edge one at h.
+        vld1.32         {q12, q13}, [r5, :128], r7
+        vld1.16         {q3},       [r6, :128], r8
+        // Pad the past-edge row from the last content row.
+        vmov            q14, q12
+        vmov            q15, q13
+        vmov            q5,  q3
+        add5
+        shift2
+        add             r0,  r0,  r7
+        add             r1,  r1,  r8
+        // The last two rows are already padded properly here.
+        add5
+        b               6f
+
+5:
+        // r3 == -1, two rows left, output one.
+        // Pad the last two rows from the mid one.
+        vmov            q12, q10
+        vmov            q13, q11
+        vmov            q3,  q2
+        vmov            q14, q10
+        vmov            q15, q11
+        vmov            q5,  q2
+        add5
+        add             r0,  r0,  r7
+        add             r1,  r1,  r8
+        b               6f
+
+6:      // End of one vertical slice.
+        subs            r2,  r2,  #8
+        ble             0f
+        // Move pointers back up to the top and loop horizontally.
+        // Input pointers
+        mls             r5,  r7,  lr,  r5
+        mls             r6,  r8,  lr,  r6
+        // Output pointers
+        mls             r0,  r7,  r12, r0
+        mls             r1,  r8,  r12, r1
+        add             r0,  r0,  #32
+        add             r1,  r1,  #16
+        add             r5,  r5,  #32
+        add             r6,  r6,  #16
+        mov             r3,  r9
+        b               1b
+
+0:
+        vpop            {q5-q7}
+        pop             {r4-r9,pc}
+.purgem add5
+endfunc
+
+// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
+//                              const int w, const int h, const int strength,
+//                              const int bitdepth_max);
+// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
+//                              const int w, const int h, const int strength,
+//                              const int bitdepth_max);
+function sgr_calc_ab1_neon, export=1
+        push            {r4-r7,lr}
+        vpush           {q4-q7}
+        ldrd            r4,  r5,  [sp, #84]
+        add             r3,  r3,  #2   // h += 2
+        clz             r6,  r5
+        vmov.i32        q15, #9        // n
+        movw            r5,  #455
+        mov             lr,  #SUM_STRIDE
+        b               sgr_calc_ab_neon
+endfunc
+
+function sgr_calc_ab2_neon, export=1
+        push            {r4-r7,lr}
+        vpush           {q4-q7}
+        ldrd            r4,  r5,  [sp, #84]
+        add             r3,  r3,  #3   // h += 3
+        clz             r6,  r5
+        asr             r3,  r3,  #1   // h /= 2
+        vmov.i32        q15, #25       // n
+        mov             r5,  #164
+        mov             lr,  #(2*SUM_STRIDE)
+endfunc
+
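+// Shared tail of the two entry points above: for each element this
+// computes p = imax(a * n - b * b, 0) and z = imin((p * s) >> 20, 255)
+// (with rounding), looks up x = sgr_x_by_x[z], then stores
+// x * b * one_by_x (rounded >> 12) back into the a buffer and 256 - x
+// into the b buffer.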
+function sgr_calc_ab_neon
+        movrel          r12, X(sgr_x_by_x)
+        sub             r6,  r6,  #24  // -bitdepth_min_8
+        vld1.8          {q8, q9}, [r12, :128]!
+        add             r7,  r6,  r6   // -2*bitdepth_min_8
+        vmov.i8         q11, #5
+        vmov.i8         d10, #55       // idx of last 5
+        vld1.8          {q10},    [r12, :128]
+        vmov.i8         d11, #72       // idx of last 4
+        vmov.i8         d12, #101      // idx of last 3
+        vmov.i8         d13, #169      // idx of last 2
+        vmov.i8         d14, #254      // idx of last 1
+        vmov.i8         d15, #32       // elements consumed in first vtbl
+        add             r2,  r2,  #2   // w += 2
+        add             r12, r2,  #7
+        bic             r12, r12, #7   // aligned w
+        sub             r12, lr,  r12  // increment between rows
+        vdup.32         q12, r4
+        sub             r0,  r0,  #(4*(SUM_STRIDE))
+        sub             r1,  r1,  #(2*(SUM_STRIDE))
+        mov             r4,  r2        // backup of w
+        vsub.i8         q8,  q8,  q11
+        vsub.i8         q9,  q9,  q11
+        vsub.i8         q10, q10, q11
+1:
+        vld1.32         {q0, q1}, [r0, :128] // a
+        vld1.16         {q2},     [r1, :128] // b
+        vdup.32         q13, r7        // -2*bitdepth_min_8
+        vdup.16         q14, r6        // -bitdepth_min_8
+        subs            r2,  r2,  #8
+        vrshl.s32       q0,  q0,  q13
+        vrshl.s32       q1,  q1,  q13
+        vrshl.s16       q4,  q2,  q14
+        vmul.i32        q0,  q0,  q15  // a * n
+        vmul.i32        q1,  q1,  q15  // a * n
+        vmull.u16       q3,  d8,  d8   // b * b
+        vmull.u16       q4,  d9,  d9   // b * b
+        vqsub.u32       q0,  q0,  q3   // imax(a * n - b * b, 0)
+        vqsub.u32       q1,  q1,  q4   // imax(a * n - b * b, 0)
+        vmul.i32        q0,  q0,  q12  // p * s
+        vmul.i32        q1,  q1,  q12  // p * s
+        vqshrn.u32      d0,  q0,  #16
+        vqshrn.u32      d1,  q1,  #16
+        vqrshrn.u16     d0,  q0,  #4   // imin(z, 255)
+
+        vcgt.u8         d2,  d0,  d10  // = -1 if sgr_x_by_x[d0] < 5
+        vcgt.u8         d3,  d0,  d11  // = -1 if sgr_x_by_x[d0] < 4
+        vtbl.8          d1,  {q8, q9}, d0
+        vcgt.u8         d6,  d0,  d12  // = -1 if sgr_x_by_x[d0] < 3
+        vsub.i8         d9,  d0,  d15  // indices for vtbx
+        vcgt.u8         d7,  d0,  d13  // = -1 if sgr_x_by_x[d0] < 2
+        vadd.i8         d2,  d2,  d3
+        vtbx.8          d1,  {q10}, d9
+        vcgt.u8         d8,  d0,  d14  // = -1 if sgr_x_by_x[d0] < 1
+        vadd.i8         d6,  d6,  d7
+        vadd.i8         d8,  d8,  d22
+        vadd.i8         d2,  d2,  d6
+        vadd.i8         d1,  d1,  d8
+        vadd.i8         d1,  d1,  d2
+        vmovl.u8        q0,  d1        // x
+
+        vmov.i16        q13, #256
+        vdup.32         q14, r5        // one_by_x
+
+        vmull.u16       q1,  d0,  d4   // x * BB[i]
+        vmull.u16       q2,  d1,  d5   // x * BB[i]
+        vmul.i32        q1,  q1,  q14  // x * BB[i] * sgr_one_by_x
+        vmul.i32        q2,  q2,  q14  // x * BB[i] * sgr_one_by_x
+        vrshr.s32       q1,  q1,  #12  // AA[i]
+        vrshr.s32       q2,  q2,  #12  // AA[i]
+        vsub.i16        q0,  q13, q0   // 256 - x
+
+        vst1.32         {q1, q2}, [r0, :128]!
+        vst1.16         {q0},     [r1, :128]!
+        bgt             1b
+
+        subs            r3,  r3,  #1
+        ble             0f
+        add             r0,  r0,  r12, lsl #2
+        add             r1,  r1,  r12, lsl #1
+        mov             r2,  r4
+        b               1b
+0:
+        vpop            {q4-q7}
+        pop             {r4-r7,pc}
+endfunc
--- /dev/null
+++ b/src/arm/32/looprestoration_tmpl.S
@@ -1,0 +1,600 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+
+#define FILTER_OUT_STRIDE 384
+
+.macro sgr_funcs bpc
+// void dav1d_sgr_finish_filter1_Xbpc_neon(int16_t *tmp,
+//                                         const pixel *src, const ptrdiff_t stride,
+//                                         const int32_t *a, const int16_t *b,
+//                                         const int w, const int h);
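+// Weights the 3x3 neighbourhood of the a (32-bit) and b (16-bit) planes
+// with 4 on the centre cross and 3 on the diagonals, then outputs
+// weighted_a + weighted_b * src with a rounding shift right by 9.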
+function sgr_finish_filter1_\bpc\()bpc_neon, export=1
+        push            {r4-r11,lr}
+        vpush           {q4-q7}
+        ldrd            r4,  r5,  [sp, #100]
+        ldr             r6,  [sp, #108]
+        sub             r7,  r3,  #(4*SUM_STRIDE)
+        add             r8,  r3,  #(4*SUM_STRIDE)
+        sub             r9,  r4,  #(2*SUM_STRIDE)
+        add             r10, r4,  #(2*SUM_STRIDE)
+        mov             r11, #SUM_STRIDE
+        mov             r12, #FILTER_OUT_STRIDE
+        add             lr,  r5,  #3
+        bic             lr,  lr,  #3 // Aligned width
+.if \bpc == 8
+        sub             r2,  r2,  lr
+.else
+        sub             r2,  r2,  lr, lsl #1
+.endif
+        sub             r12, r12, lr
+        sub             r11, r11, lr
+        sub             r11, r11, #4 // We read 4 extra elements from both a and b
+        mov             lr,  r5
+        vmov.i16        q14, #3
+        vmov.i32        q15, #3
+1:
+        vld1.16         {q0},       [r9,  :128]!
+        vld1.16         {q1},       [r4,  :128]!
+        vld1.16         {q2},       [r10, :128]!
+        vld1.32         {q8,  q9},  [r7,  :128]!
+        vld1.32         {q10, q11}, [r3,  :128]!
+        vld1.32         {q12, q13}, [r8,  :128]!
+
+2:
+        subs            r5,  r5,  #4
+        vext.8          d6,  d0,  d1,  #2  // -stride
+        vext.8          d7,  d2,  d3,  #2  // 0
+        vext.8          d8,  d4,  d5,  #2  // +stride
+        vext.8          d9,  d0,  d1,  #4  // +1-stride
+        vext.8          d10, d2,  d3,  #4  // +1
+        vext.8          d11, d4,  d5,  #4  // +1+stride
+        vadd.i16        d2,  d2,  d6       // -1, -stride
+        vadd.i16        d7,  d7,  d8       // 0, +stride
+        vadd.i16        d0,  d0,  d9       // -1-stride, +1-stride
+        vadd.i16        d2,  d2,  d7
+        vadd.i16        d4,  d4,  d11      // -1+stride, +1+stride
+        vadd.i16        d2,  d2,  d10      // +1
+        vadd.i16        d0,  d0,  d4
+
+        vext.8          q3,  q8,  q9,  #4  // -stride
+        vshl.i16        d2,  d2,  #2
+        vext.8          q4,  q8,  q9,  #8  // +1-stride
+        vext.8          q5,  q10, q11, #4  // 0
+        vext.8          q6,  q10, q11, #8  // +1
+        vmla.i16        d2,  d0,  d28      // * 3 -> a
+        vadd.i32        q3,  q3,  q10      // -stride, -1
+        vadd.i32        q8,  q8,  q4       // -1-stride, +1-stride
+        vadd.i32        q5,  q5,  q6       // 0, +1
+        vadd.i32        q8,  q8,  q12      // -1+stride
+        vadd.i32        q3,  q3,  q5
+        vext.8          q7,  q12, q13, #4  // +stride
+        vext.8          q10, q12, q13, #8  // +1+stride
+.if \bpc == 8
+        vld1.32         {d24[0]}, [r1, :32]! // src
+.else
+        vld1.16         {d24}, [r1, :64]!    // src
+.endif
+        vadd.i32        q3,  q3,  q7       // +stride
+        vadd.i32        q8,  q8,  q10      // +1+stride
+        vshl.i32        q3,  q3,  #2
+        vmla.i32        q3,  q8,  q15      // * 3 -> b
+.if \bpc == 8
+        vmovl.u8        q12, d24           // src
+.endif
+        vmov            d0,  d1
+        vmlal.u16       q3,  d2,  d24      // b + a * src
+        vmov            d2,  d3
+        vrshrn.i32      d6,  q3,  #9
+        vmov            d4,  d5
+        vst1.16         {d6}, [r0]!
+
+        ble             3f
+        vmov            q8,  q9
+        vmov            q10, q11
+        vmov            q12, q13
+        vld1.16         {d1},  [r9,  :64]!
+        vld1.16         {d3},  [r4,  :64]!
+        vld1.16         {d5},  [r10, :64]!
+        vld1.32         {q9},  [r7,  :128]!
+        vld1.32         {q11}, [r3,  :128]!
+        vld1.32         {q13}, [r8,  :128]!
+        b               2b
+
+3:
+        subs            r6,  r6,  #1
+        ble             0f
+        mov             r5,  lr
+        add             r0,  r0,  r12, lsl #1
+        add             r1,  r1,  r2
+        add             r3,  r3,  r11, lsl #2
+        add             r7,  r7,  r11, lsl #2
+        add             r8,  r8,  r11, lsl #2
+        add             r4,  r4,  r11, lsl #1
+        add             r9,  r9,  r11, lsl #1
+        add             r10, r10, r11, lsl #1
+        b               1b
+0:
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+endfunc
+
+// void dav1d_sgr_finish_filter2_Xbpc_neon(int16_t *tmp,
+//                                         const pixel *src, const ptrdiff_t stride,
+//                                         const int32_t *a, const int16_t *b,
+//                                         const int w, const int h);
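+// For one output row the sums from the rows above and below are combined
+// (diagonal neighbours weighted 5, vertical neighbours 6, rounded >> 9);
+// for the following row only a single row of sums is used (horizontal
+// neighbours weighted 5, the centre 6, rounded >> 8).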
+function sgr_finish_filter2_\bpc\()bpc_neon, export=1
+        push            {r4-r11,lr}
+        vpush           {q4-q7}
+        ldrd            r4,  r5,  [sp, #100]
+        ldr             r6,  [sp, #108]
+        add             r7,  r3,  #(4*(SUM_STRIDE))
+        sub             r3,  r3,  #(4*(SUM_STRIDE))
+        add             r8,  r4,  #(2*(SUM_STRIDE))
+        sub             r4,  r4,  #(2*(SUM_STRIDE))
+        mov             r9,  #(2*SUM_STRIDE)
+        mov             r10, #FILTER_OUT_STRIDE
+        add             r11, r5,  #7
+        bic             r11, r11, #7 // Aligned width
+.if \bpc == 8
+        sub             r2,  r2,  r11
+.else
+        sub             r2,  r2,  r11, lsl #1
+.endif
+        sub             r10, r10, r11
+        sub             r9,  r9,  r11
+        sub             r9,  r9,  #4 // We read 4 extra elements from a
+        sub             r12, r9,  #4 // We read 8 extra elements from b
+        mov             lr,  r5
+
+1:
+        vld1.16         {q0,  q1},  [r4, :128]!
+        vld1.16         {q2,  q3},  [r8, :128]!
+        vld1.32         {q8,  q9},  [r3, :128]!
+        vld1.32         {q11, q12}, [r7, :128]!
+        vld1.32         {q10},      [r3, :128]!
+        vld1.32         {q13},      [r7, :128]!
+
+2:
+        vmov.i16        q14, #5
+        vmov.i16        q15, #6
+        subs            r5,  r5,  #8
+        vext.8          q4,  q0,  q1,  #4  // +1-stride
+        vext.8          q5,  q2,  q3,  #4  // +1+stride
+        vext.8          q6,  q0,  q1,  #2  // -stride
+        vext.8          q7,  q2,  q3,  #2  // +stride
+        vadd.i16        q0,  q0,  q4       // -1-stride, +1-stride
+        vadd.i16        q5,  q2,  q5       // -1+stride, +1+stride
+        vadd.i16        q2,  q6,  q7       // -stride, +stride
+        vadd.i16        q0,  q0,  q5
+
+        vext.8          q4,  q8,  q9,  #8  // +1-stride
+        vext.8          q5,  q9,  q10, #8
+        vext.8          q6,  q11, q12, #8  // +1+stride
+        vext.8          q7,  q12, q13, #8
+        vmul.i16        q0,  q0,  q14      // * 5
+        vmla.i16        q0,  q2,  q15      // * 6
+        vadd.i32        q4,  q4,  q8       // -1-stride, +1-stride
+        vadd.i32        q5,  q5,  q9
+        vadd.i32        q6,  q6,  q11      // -1+stride, +1+stride
+        vadd.i32        q7,  q7,  q12
+        vadd.i32        q4,  q4,  q6
+        vadd.i32        q5,  q5,  q7
+        vext.8          q6,  q8,  q9,  #4  // -stride
+        vext.8          q7,  q9,  q10, #4
+        vext.8          q8,  q11, q12, #4  // +stride
+        vext.8          q11, q12, q13, #4
+
+.if \bpc == 8
+        vld1.8          {d4}, [r1, :64]!
+.else
+        vld1.16         {q2}, [r1, :128]!
+.endif
+
+        vmov.i32        q14, #5
+        vmov.i32        q15, #6
+
+        vadd.i32        q6,  q6,  q8       // -stride, +stride
+        vadd.i32        q7,  q7,  q11
+        vmul.i32        q4,  q4,  q14      // * 5
+        vmla.i32        q4,  q6,  q15      // * 6
+        vmul.i32        q5,  q5,  q14      // * 5
+        vmla.i32        q5,  q7,  q15      // * 6
+
+.if \bpc == 8
+        vmovl.u8        q2,  d4
+.endif
+        vmlal.u16       q4,  d0,  d4       // b + a * src
+        vmlal.u16       q5,  d1,  d5       // b + a * src
+        vmov            q0,  q1
+        vrshrn.i32      d8,  q4,  #9
+        vrshrn.i32      d9,  q5,  #9
+        vmov            q2,  q3
+        vst1.16         {q4}, [r0, :128]!
+
+        ble             3f
+        vmov            q8,  q10
+        vmov            q11, q13
+        vld1.16         {q1},       [r4, :128]!
+        vld1.16         {q3},       [r8, :128]!
+        vld1.32         {q9,  q10}, [r3, :128]!
+        vld1.32         {q12, q13}, [r7, :128]!
+        b               2b
+
+3:
+        subs            r6,  r6,  #1
+        ble             0f
+        mov             r5,  lr
+        add             r0,  r0,  r10, lsl #1
+        add             r1,  r1,  r2
+        add             r3,  r3,  r9,  lsl #2
+        add             r7,  r7,  r9,  lsl #2
+        add             r4,  r4,  r12, lsl #1
+        add             r8,  r8,  r12, lsl #1
+
+        vld1.32         {q8, q9}, [r3, :128]!
+        vld1.16         {q0, q1}, [r4, :128]!
+        vld1.32         {q10},    [r3, :128]!
+
+        vmov.i16        q12, #5
+        vmov.i16        q13, #6
+
+4:
+        subs            r5,  r5,  #8
+        vext.8          q3,  q0,  q1,  #4  // +1
+        vext.8          q2,  q0,  q1,  #2  // 0
+        vadd.i16        q0,  q0,  q3       // -1, +1
+
+        vext.8          q4,  q8,  q9,  #4  // 0
+        vext.8          q5,  q9,  q10, #4
+        vext.8          q6,  q8,  q9,  #8  // +1
+        vext.8          q7,  q9,  q10, #8
+        vmul.i16        q2,  q2,  q13      // * 6
+        vmla.i16        q2,  q0,  q12      // * 5 -> a
+.if \bpc == 8
+        vld1.8          {d22}, [r1, :64]!
+.else
+        vld1.16         {q11}, [r1, :128]!
+.endif
+        vadd.i32        q8,  q8,  q6       // -1, +1
+        vadd.i32        q9,  q9,  q7
+.if \bpc == 8
+        vmovl.u8        q11, d22
+.endif
+        vmul.i32        q4,  q4,  q15      // * 6
+        vmla.i32        q4,  q8,  q14      // * 5 -> b
+        vmul.i32        q5,  q5,  q15      // * 6
+        vmla.i32        q5,  q9,  q14      // * 5 -> b
+
+        vmlal.u16       q4,  d4,  d22      // b + a * src
+        vmlal.u16       q5,  d5,  d23
+        vmov            q0,  q1
+        vrshrn.i32      d8,  q4,  #8
+        vrshrn.i32      d9,  q5,  #8
+        vmov            q8,  q10
+        vst1.16         {q4}, [r0, :128]!
+
+        ble             5f
+        vld1.16         {q1},      [r4, :128]!
+        vld1.32         {q9, q10}, [r3, :128]!
+        b               4b
+
+5:
+        subs            r6,  r6,  #1
+        ble             0f
+        mov             r5,  lr
+        sub             r3,  r3,  r11, lsl #2 // Rewind r3/r4 to where they started
+        sub             r4,  r4,  r11, lsl #1
+        add             r0,  r0,  r10, lsl #1
+        add             r1,  r1,  r2
+        sub             r3,  r3,  #16
+        sub             r4,  r4,  #16
+        b               1b
+0:
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+endfunc
+
+// void dav1d_sgr_weighted1_Xbpc_neon(pixel *dst, const ptrdiff_t dst_stride,
+//                                    const pixel *src, const ptrdiff_t src_stride,
+//                                    const int16_t *t1, const int w, const int h,
+//                                    const int wt, const int bitdepth_max);
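+// Blends the filtered plane t1 with the source: with u = src << 4, the
+// output is ((u << 7) + wt * (t1 - u)) with a rounding shift right by 11,
+// clamped to the pixel range.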
+function sgr_weighted1_\bpc\()bpc_neon, export=1
+        push            {r4-r9,lr}
+        ldrd            r4,  r5,  [sp, #28]
+        ldrd            r6,  r7,  [sp, #36]
+.if \bpc == 16
+        ldr             r8,  [sp, #44]
+.endif
+        vdup.16         d31, r7
+        cmp             r6,  #2
+.if \bpc == 16
+        vdup.16         q14, r8
+.endif
+        add             r9,  r0,  r1
+        add             r12, r2,  r3
+        add             lr,  r4,  #2*FILTER_OUT_STRIDE
+        mov             r7,  #(4*FILTER_OUT_STRIDE)
+        lsl             r1,  r1,  #1
+        lsl             r3,  r3,  #1
+        add             r8,  r5,  #7
+        bic             r8,  r8,  #7 // Aligned width
+.if \bpc == 8
+        sub             r1,  r1,  r8
+        sub             r3,  r3,  r8
+.else
+        sub             r1,  r1,  r8, lsl #1
+        sub             r3,  r3,  r8, lsl #1
+.endif
+        sub             r7,  r7,  r8, lsl #1
+        mov             r8,  r5
+        blt             2f
+1:
+.if \bpc == 8
+        vld1.8          {d0},  [r2,  :64]!
+        vld1.8          {d16}, [r12, :64]!
+.else
+        vld1.16         {q0},  [r2,  :128]!
+        vld1.16         {q8},  [r12, :128]!
+.endif
+        vld1.16         {q1},  [r4,  :128]!
+        vld1.16         {q9},  [lr,  :128]!
+        subs            r5,  r5,  #8
+.if \bpc == 8
+        vshll.u8        q0,  d0,  #4     // u
+        vshll.u8        q8,  d16, #4     // u
+.else
+        vshl.i16        q0,  q0,  #4     // u
+        vshl.i16        q8,  q8,  #4     // u
+.endif
+        vsub.i16        q1,  q1,  q0     // t1 - u
+        vsub.i16        q9,  q9,  q8     // t1 - u
+        vshll.u16       q2,  d0,  #7     // u << 7
+        vshll.u16       q3,  d1,  #7     // u << 7
+        vshll.u16       q10, d16, #7     // u << 7
+        vshll.u16       q11, d17, #7     // u << 7
+        vmlal.s16       q2,  d2,  d31    // v
+        vmlal.s16       q3,  d3,  d31    // v
+        vmlal.s16       q10, d18, d31    // v
+        vmlal.s16       q11, d19, d31    // v
+.if \bpc == 8
+        vrshrn.i32      d4,  q2,  #11
+        vrshrn.i32      d5,  q3,  #11
+        vrshrn.i32      d20, q10, #11
+        vrshrn.i32      d21, q11, #11
+        vqmovun.s16     d4,  q2
+        vqmovun.s16     d20, q10
+        vst1.8          {d4},  [r0]!
+        vst1.8          {d20}, [r9]!
+.else
+        vqrshrun.s32    d4,  q2,  #11
+        vqrshrun.s32    d5,  q3,  #11
+        vqrshrun.s32    d20, q10, #11
+        vqrshrun.s32    d21, q11, #11
+        vmin.u16        q2,  q2,  q14
+        vmin.u16        q10, q10, q14
+        vst1.16         {q2},  [r0]!
+        vst1.16         {q10}, [r9]!
+.endif
+        bgt             1b
+
+        sub             r6,  r6,  #2
+        cmp             r6,  #1
+        blt             0f
+        mov             r5,  r8
+        add             r0,  r0,  r1
+        add             r9,  r9,  r1
+        add             r2,  r2,  r3
+        add             r12, r12, r3
+        add             r4,  r4,  r7
+        add             lr,  lr,  r7
+        beq             2f
+        b               1b
+
+2:
+.if \bpc == 8
+        vld1.8          {d0}, [r2, :64]!
+.else
+        vld1.16         {q0}, [r2, :128]!
+.endif
+        vld1.16         {q1}, [r4, :128]!
+        subs            r5,  r5,  #8
+.if \bpc == 8
+        vshll.u8        q0,  d0,  #4     // u
+.else
+        vshl.i16        q0,  q0,  #4     // u
+.endif
+        vsub.i16        q1,  q1,  q0     // t1 - u
+        vshll.u16       q2,  d0,  #7     // u << 7
+        vshll.u16       q3,  d1,  #7     // u << 7
+        vmlal.s16       q2,  d2,  d31    // v
+        vmlal.s16       q3,  d3,  d31    // v
+.if \bpc == 8
+        vrshrn.i32      d4,  q2,  #11
+        vrshrn.i32      d5,  q3,  #11
+        vqmovun.s16     d2,  q2
+        vst1.8          {d2}, [r0]!
+.else
+        vqrshrun.s32    d4,  q2,  #11
+        vqrshrun.s32    d5,  q3,  #11
+        vmin.u16        q2,  q2,  q14
+        vst1.16         {q2}, [r0]!
+.endif
+        bgt             2b
+0:
+        pop             {r4-r9,pc}
+endfunc
+
+// void dav1d_sgr_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride,
+//                                    const pixel *src, const ptrdiff_t src_stride,
+//                                    const int16_t *t1, const int16_t *t2,
+//                                    const int w, const int h,
+//                                    const int16_t wt[2], const int bitdepth_max);
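+// As sgr_weighted1 above, but blending two filtered planes: the output is
+// ((u << 7) + wt[0] * (t1 - u) + wt[1] * (t2 - u)) with a rounding shift
+// right by 11, where u = src << 4, clamped to the pixel range.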
+function sgr_weighted2_\bpc\()bpc_neon, export=1
+        push            {r4-r11,lr}
+        ldrd            r4,  r5,  [sp, #36]
+        ldrd            r6,  r7,  [sp, #44]
+.if \bpc == 8
+        ldr             r8,  [sp, #52]
+.else
+        ldrd            r8,  r9,  [sp, #52]
+.endif
+        cmp             r7,  #2
+        add             r10, r0,  r1
+        add             r11, r2,  r3
+        add             r12, r4,  #2*FILTER_OUT_STRIDE
+        add             lr,  r5,  #2*FILTER_OUT_STRIDE
+        vld2.16         {d30[], d31[]}, [r8] // wt[0], wt[1]
+.if \bpc == 16
+        vdup.16         q14, r9
+.endif
+        mov             r8,  #4*FILTER_OUT_STRIDE
+        lsl             r1,  r1,  #1
+        lsl             r3,  r3,  #1
+        add             r9,  r6,  #7
+        bic             r9,  r9,  #7 // Aligned width
+.if \bpc == 8
+        sub             r1,  r1,  r9
+        sub             r3,  r3,  r9
+.else
+        sub             r1,  r1,  r9, lsl #1
+        sub             r3,  r3,  r9, lsl #1
+.endif
+        sub             r8,  r8,  r9, lsl #1
+        mov             r9,  r6
+        blt             2f
+1:
+.if \bpc == 8
+        vld1.8          {d0},  [r2,  :64]!
+        vld1.8          {d16}, [r11, :64]!
+.else
+        vld1.16         {q0},  [r2,  :128]!
+        vld1.16         {q8},  [r11, :128]!
+.endif
+        vld1.16         {q1},  [r4,  :128]!
+        vld1.16         {q9},  [r12, :128]!
+        vld1.16         {q2},  [r5,  :128]!
+        vld1.16         {q10}, [lr,  :128]!
+        subs            r6,  r6,  #8
+.if \bpc == 8
+        vshll.u8        q0,  d0,  #4     // u
+        vshll.u8        q8,  d16, #4     // u
+.else
+        vshl.i16        q0,  q0,  #4     // u
+        vshl.i16        q8,  q8,  #4     // u
+.endif
+        vsub.i16        q1,  q1,  q0     // t1 - u
+        vsub.i16        q2,  q2,  q0     // t2 - u
+        vsub.i16        q9,  q9,  q8     // t1 - u
+        vsub.i16        q10, q10, q8     // t2 - u
+        vshll.u16       q3,  d0,  #7     // u << 7
+        vshll.u16       q0,  d1,  #7     // u << 7
+        vshll.u16       q11, d16, #7     // u << 7
+        vshll.u16       q8,  d17, #7     // u << 7
+        vmlal.s16       q3,  d2,  d30    // wt[0] * (t1 - u)
+        vmlal.s16       q3,  d4,  d31    // wt[1] * (t2 - u)
+        vmlal.s16       q0,  d3,  d30    // wt[0] * (t1 - u)
+        vmlal.s16       q0,  d5,  d31    // wt[1] * (t2 - u)
+        vmlal.s16       q11, d18, d30    // wt[0] * (t1 - u)
+        vmlal.s16       q11, d20, d31    // wt[1] * (t2 - u)
+        vmlal.s16       q8,  d19, d30    // wt[0] * (t1 - u)
+        vmlal.s16       q8,  d21, d31    // wt[1] * (t2 - u)
+.if \bpc == 8
+        vrshrn.i32      d6,  q3,  #11
+        vrshrn.i32      d7,  q0,  #11
+        vrshrn.i32      d22, q11, #11
+        vrshrn.i32      d23, q8,  #11
+        vqmovun.s16     d6,  q3
+        vqmovun.s16     d22, q11
+        vst1.8          {d6},  [r0]!
+        vst1.8          {d22}, [r10]!
+.else
+        vqrshrun.s32    d6,  q3,  #11
+        vqrshrun.s32    d7,  q0,  #11
+        vqrshrun.s32    d22, q11, #11
+        vqrshrun.s32    d23, q8,  #11
+        vmin.u16        q3,  q3,  q14
+        vmin.u16        q11, q11, q14
+        vst1.16         {q3},  [r0]!
+        vst1.16         {q11}, [r10]!
+.endif
+        bgt             1b
+
+        subs            r7,  r7,  #2
+        cmp             r7,  #1
+        blt             0f
+        mov             r6,  r9
+        add             r0,  r0,  r1
+        add             r10, r10, r1
+        add             r2,  r2,  r3
+        add             r11, r11, r3
+        add             r4,  r4,  r8
+        add             r12, r12, r8
+        add             r5,  r5,  r8
+        add             lr,  lr,  r8
+        beq             2f
+        b               1b
+
+2:
+.if \bpc == 8
+        vld1.8          {d0}, [r2, :64]!
+.else
+        vld1.16         {q0}, [r2, :128]!
+.endif
+        vld1.16         {q1}, [r4, :128]!
+        vld1.16         {q2}, [r5, :128]!
+        subs            r6,  r6,  #8
+.if \bpc == 8
+        vshll.u8        q0,  d0,  #4     // u
+.else
+        vshl.i16        q0,  q0,  #4     // u
+.endif
+        vsub.i16        q1,  q1,  q0     // t1 - u
+        vsub.i16        q2,  q2,  q0     // t2 - u
+        vshll.u16       q3,  d0,  #7     // u << 7
+        vshll.u16       q0,  d1,  #7     // u << 7
+        vmlal.s16       q3,  d2,  d30    // wt[0] * (t1 - u)
+        vmlal.s16       q3,  d4,  d31    // wt[1] * (t2 - u)
+        vmlal.s16       q0,  d3,  d30    // wt[0] * (t1 - u)
+        vmlal.s16       q0,  d5,  d31    // wt[1] * (t2 - u)
+.if \bpc == 8
+        vrshrn.i32      d6,  q3,  #11
+        vrshrn.i32      d7,  q0,  #11
+        vqmovun.s16     d6,  q3
+        vst1.8          {d6}, [r0]!
+.else
+        vqrshrun.s32    d6,  q3,  #11
+        vqrshrun.s32    d7,  q0,  #11
+        vmin.u16        q3,  q3,  q14
+        vst1.16         {q3}, [r0]!
+.endif
+        bgt             2b
+0:
+        pop             {r4-r11,pc}
+endfunc
+.endm
--- a/src/arm/32/mc.S
+++ b/src/arm/32/mc.S
@@ -1403,12 +1403,12 @@
         vld1.8          {d24}, [\sr2], \s_strd
         vmovl.u8        q8,  d16
         vmovl.u8        q12, d24
-        vext.8          q9,  q8,  q8,  #2
-        vext.8          q10, q8,  q8,  #4
-        vext.8          q11, q8,  q8,  #6
-        vext.8          q13, q12, q12, #2
-        vext.8          q14, q12, q12, #4
-        vext.8          q15, q12, q12, #6
+        vext.8          d18, d16, d17, #2
+        vext.8          d20, d16, d17, #4
+        vext.8          d22, d16, d17, #6
+        vext.8          d26, d24, d25, #2
+        vext.8          d28, d24, d25, #4
+        vext.8          d30, d24, d25, #6
         subs            \h,  \h,  #2
         vmul.s16        d4,  d16, d0[0]
         vmla.s16        d4,  d18, d0[1]
@@ -1431,7 +1431,7 @@
         pop             {r4-r11,pc}
 
 80:     // 8xN h
-        vld1.8          {d0}, [\mx]
+        vld1.8          {d0}, [\mx, :64]
         sub             \src,  \src,  #3
         add             \ds2,  \dst,  \d_strd
         add             \sr2,  \src,  \s_strd
@@ -1482,7 +1482,7 @@
         // one temporary for vext in the loop. That's slower on A7 and A53,
         // (but surprisingly, marginally faster on A8 and A73).
         vpush           {q4-q6}
-        vld1.8          {d0}, [\mx]
+        vld1.8          {d0}, [\mx, :64]
         sub             \src,  \src,  #3
         add             \ds2,  \dst,  \d_strd
         add             \sr2,  \src,  \s_strd
@@ -1629,7 +1629,7 @@
 
 28:     // 2x8, 2x16 v
         vpush           {q4-q7}
-        vld1.8          {d0}, [\my]
+        vld1.8          {d0}, [\my, :64]
         sub             \sr2,  \src,  \s_strd, lsl #1
         add             \ds2,  \dst,  \d_strd
         sub             \src,  \sr2,  \s_strd
@@ -1709,7 +1709,7 @@
 
 480:    // 4x8, 4x16 v
         vpush           {q4}
-        vld1.8          {d0}, [\my]
+        vld1.8          {d0}, [\my, :64]
         sub             \sr2, \src, \s_strd, lsl #1
         add             \ds2, \dst, \d_strd
         sub             \src, \sr2, \s_strd
@@ -1782,7 +1782,7 @@
 640:
 1280:
         vpush           {q4}
-        vld1.8          {d0}, [\my]
+        vld1.8          {d0}, [\my, :64]
         sub             \src, \src, \s_strd
         sub             \src, \src, \s_strd, lsl #1
         vmovl.s8        q0,  d0
@@ -1951,11 +1951,10 @@
         bl              L(\type\()_8tap_filter_2)
 
         vext.8          d18, d17, d26, #4
-        vmov            d19, d26
         vmull.s16       q2,  d16, d2[0]
         vmlal.s16       q2,  d17, d2[1]
         vmlal.s16       q2,  d18, d2[2]
-        vmlal.s16       q2,  d19, d2[3]
+        vmlal.s16       q2,  d26, d2[3]
 
         vqrshrn.s32     d4,  q2,  #\shift_hv
         vqmovun.s16     d4,  q2
@@ -1964,11 +1963,11 @@
         vst1.16         {d4[1]}, [\ds2, :16], \d_strd
         ble             0f
         vmov            d16, d18
-        vmov            d17, d19
+        vmov            d17, d26
         b               2b
 
 280:    // 2x8, 2x16, 2x32 hv
-        vld1.8          {d2},  [\my]
+        vld1.8          {d2},  [\my, :64]
         sub             \src, \src, #1
         sub             \sr2, \src, \s_strd, lsl #1
         sub             \src, \sr2, \s_strd
@@ -2001,7 +2000,6 @@
 28:
         bl              L(\type\()_8tap_filter_2)
         vext.8          d22, d21, d26, #4
-        vmov            d23, d26
         vmull.s16       q2,  d16, d2[0]
         vmlal.s16       q2,  d17, d2[1]
         vmlal.s16       q2,  d18, d2[2]
@@ -2009,7 +2007,7 @@
         vmlal.s16       q2,  d20, d3[0]
         vmlal.s16       q2,  d21, d3[1]
         vmlal.s16       q2,  d22, d3[2]
-        vmlal.s16       q2,  d23, d3[3]
+        vmlal.s16       q2,  d26, d3[3]
 
         vqrshrn.s32     d4,  q2,  #\shift_hv
         vqmovun.s16     d4,  q2
@@ -2022,7 +2020,7 @@
         vmov            d18, d20
         vmov            d19, d21
         vmov            d20, d22
-        vmov            d21, d23
+        vmov            d21, d26
         b               28b
 
 0:
@@ -2108,7 +2106,7 @@
         b               4b
 
 480:    // 4x8, 4x16, 4x32 hv
-        vld1.8          {d2},  [\my]
+        vld1.8          {d2},  [\my, :64]
         sub             \src, \src, #1
         sub             \sr2, \src, \s_strd, lsl #1
         sub             \src, \sr2, \s_strd
@@ -2211,7 +2209,7 @@
         bgt             880f
         vpush           {q4-q7}
         add             \my,  \my,  #2
-        vld1.8          {d0},  [\mx]
+        vld1.8          {d0},  [\mx, :64]
         vld1.32         {d2[]},  [\my]
         sub             \src,  \src,  #3
         sub             \src,  \src,  \s_strd
@@ -2301,8 +2299,8 @@
 640:
 1280:
         vpush           {q4-q7}
-        vld1.8          {d0},  [\mx]
-        vld1.8          {d2},  [\my]
+        vld1.8          {d0},  [\mx, :64]
+        vld1.8          {d2},  [\my, :64]
         sub             \src,  \src,  #3
         sub             \src,  \src,  \s_strd
         sub             \src,  \src,  \s_strd, lsl #1
--- a/src/arm/32/mc16.S
+++ b/src/arm/32/mc16.S
@@ -272,3 +272,2463 @@
 bidir_fn avg, r6
 bidir_fn w_avg, r7
 bidir_fn mask, r7
+
+
+// This has got the same signature as the put_8tap functions,
+// and assumes that r9 is set to (clz(w)-24).
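+// (The jump table below is indexed by that value, so entry 0 handles w == 128
+// and entry 6 handles w == 2.)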
+function put_neon
+        adr             r10, L(put_tbl)
+        ldr             r9,  [r10, r9, lsl #2]
+        add             r10, r10, r9
+        bx              r10
+
+        .align 2
+L(put_tbl):
+        .word 1280f - L(put_tbl) + CONFIG_THUMB
+        .word 640f  - L(put_tbl) + CONFIG_THUMB
+        .word 320f  - L(put_tbl) + CONFIG_THUMB
+        .word 16f   - L(put_tbl) + CONFIG_THUMB
+        .word 80f   - L(put_tbl) + CONFIG_THUMB
+        .word 4f    - L(put_tbl) + CONFIG_THUMB
+        .word 2f    - L(put_tbl) + CONFIG_THUMB
+
+2:
+        vld1.32         {d0[]}, [r2], r3
+        vld1.32         {d1[]}, [r2], r3
+        subs            r5,  r5,  #2
+        vst1.32         {d0[0]}, [r0, :32], r1
+        vst1.32         {d1[1]}, [r0, :32], r1
+        bgt             2b
+        pop             {r4-r11,pc}
+4:
+        vld1.16         {d0}, [r2], r3
+        vld1.16         {d1}, [r2], r3
+        subs            r5,  r5,  #2
+        vst1.16         {d0}, [r0, :64], r1
+        vst1.16         {d1}, [r0, :64], r1
+        bgt             4b
+        pop             {r4-r11,pc}
+80:
+        add             r8,  r0,  r1
+        lsl             r1,  r1,  #1
+        add             r9,  r2,  r3
+        lsl             r3,  r3,  #1
+8:
+        vld1.16         {q0}, [r2], r3
+        vld1.16         {q1}, [r9], r3
+        subs            r5,  r5,  #2
+        vst1.16         {q0}, [r0, :128], r1
+        vst1.16         {q1}, [r8, :128], r1
+        bgt             8b
+        pop             {r4-r11,pc}
+16:
+        vld1.16         {q0,  q1},  [r2], r3
+        subs            r5,  r5,  #1
+        vst1.16         {q0,  q1},  [r0, :128], r1
+        bgt             16b
+        pop             {r4-r11,pc}
+320:
+        sub             r1,  r1,  #32
+        sub             r3,  r3,  #32
+32:
+        vld1.16         {q0,  q1},  [r2]!
+        vst1.16         {q0,  q1},  [r0, :128]!
+        vld1.16         {q2,  q3},  [r2], r3
+        subs            r5,  r5,  #1
+        vst1.16         {q2,  q3},  [r0, :128], r1
+        bgt             32b
+        pop             {r4-r11,pc}
+640:
+        sub             r1,  r1,  #96
+        sub             r3,  r3,  #96
+64:
+        vld1.16         {q8,  q9},  [r2]!
+        vst1.16         {q8,  q9},  [r0, :128]!
+        vld1.16         {q10, q11}, [r2]!
+        vst1.16         {q10, q11}, [r0, :128]!
+        vld1.16         {q12, q13}, [r2]!
+        vst1.16         {q12, q13}, [r0, :128]!
+        vld1.16         {q14, q15}, [r2], r3
+        subs            r5,  r5,  #1
+        vst1.16         {q14, q15}, [r0, :128], r1
+        bgt             64b
+        pop             {r4-r11,pc}
+1280:
+        sub             r1,  r1,  #224
+        sub             r3,  r3,  #224
+128:
+        vld1.16         {q8,  q9},  [r2]!
+        vst1.16         {q8,  q9},  [r0, :128]!
+        vld1.16         {q10, q11}, [r2]!
+        vst1.16         {q10, q11}, [r0, :128]!
+        vld1.16         {q12, q13}, [r2]!
+        vst1.16         {q12, q13}, [r0, :128]!
+        vld1.16         {q14, q15}, [r2]!
+        vst1.16         {q14, q15}, [r0, :128]!
+        vld1.16         {q8,  q9},  [r2]!
+        vst1.16         {q8,  q9},  [r0, :128]!
+        vld1.16         {q10, q11}, [r2]!
+        vst1.16         {q10, q11}, [r0, :128]!
+        vld1.16         {q12, q13}, [r2]!
+        vst1.16         {q12, q13}, [r0, :128]!
+        vld1.16         {q14, q15}, [r2], r3
+        subs            r5,  r5,  #1
+        vst1.16         {q14, q15}, [r0, :128], r1
+        bgt             128b
+        pop             {r4-r11,pc}
+endfunc
+
+// This has got the same signature as the prep_8tap functions,
+// and assumes that r9 is set to (clz(w)-24), r7 to intermediate_bits and
+// r8 to w*2.
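+// (Each output row is emitted as (pixel << intermediate_bits) - PREP_BIAS and
+// stored contiguously, i.e. with a stride of w*2 bytes.)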
+function prep_neon
+        adr             r10, L(prep_tbl)
+        ldr             r9,  [r10, r9, lsl #2]
+        vdup.16         q15, r7   // intermediate_bits
+        vmov.i16        q14, #PREP_BIAS
+        add             r10, r10, r9
+        bx              r10
+
+        .align 2
+L(prep_tbl):
+        .word 1280f - L(prep_tbl) + CONFIG_THUMB
+        .word 640f  - L(prep_tbl) + CONFIG_THUMB
+        .word 320f  - L(prep_tbl) + CONFIG_THUMB
+        .word 16f   - L(prep_tbl) + CONFIG_THUMB
+        .word 80f   - L(prep_tbl) + CONFIG_THUMB
+        .word 40f   - L(prep_tbl) + CONFIG_THUMB
+
+40:
+        add             r9,  r1,  r2
+        lsl             r2,  r2,  #1
+4:
+        vld1.16         {d0}, [r1], r2
+        vld1.16         {d1}, [r9], r2
+        subs            r4,  r4,  #2
+        vshl.s16        q0,  q0,  q15
+        vsub.i16        q0,  q0,  q14
+        vst1.16         {q0}, [r0, :128]!
+        bgt             4b
+        pop             {r4-r11,pc}
+80:
+        add             r9,  r1,  r2
+        lsl             r2,  r2,  #1
+8:
+        vld1.16         {q0}, [r1], r2
+        vld1.16         {q1}, [r9], r2
+        subs            r4,  r4,  #2
+        vshl.s16        q0,  q0,  q15
+        vshl.s16        q1,  q1,  q15
+        vsub.i16        q0,  q0,  q14
+        vsub.i16        q1,  q1,  q14
+        vst1.16         {q0, q1}, [r0, :128]!
+        bgt             8b
+        pop             {r4-r11,pc}
+16:
+        vld1.16         {q0, q1}, [r1], r2
+        vshl.s16        q0,  q0,  q15
+        vld1.16         {q2, q3}, [r1], r2
+        subs            r4,  r4,  #2
+        vshl.s16        q1,  q1,  q15
+        vshl.s16        q2,  q2,  q15
+        vshl.s16        q3,  q3,  q15
+        vsub.i16        q0,  q0,  q14
+        vsub.i16        q1,  q1,  q14
+        vsub.i16        q2,  q2,  q14
+        vst1.16         {q0, q1}, [r0, :128]!
+        vsub.i16        q3,  q3,  q14
+        vst1.16         {q2, q3}, [r0, :128]!
+        bgt             16b
+        pop             {r4-r11,pc}
+320:
+        sub             r2,  r2,  #32
+32:
+        vld1.16         {q0, q1}, [r1]!
+        subs            r4,  r4,  #1
+        vshl.s16        q0,  q0,  q15
+        vld1.16         {q2, q3}, [r1], r2
+        vshl.s16        q1,  q1,  q15
+        vshl.s16        q2,  q2,  q15
+        vshl.s16        q3,  q3,  q15
+        vsub.i16        q0,  q0,  q14
+        vsub.i16        q1,  q1,  q14
+        vsub.i16        q2,  q2,  q14
+        vst1.16         {q0, q1}, [r0, :128]!
+        vsub.i16        q3,  q3,  q14
+        vst1.16         {q2, q3}, [r0, :128]!
+        bgt             32b
+        pop             {r4-r11,pc}
+640:
+        sub             r2,  r2,  #96
+64:
+        vld1.16         {q0,  q1},  [r1]!
+        subs            r4,  r4,  #1
+        vshl.s16        q0,  q0,  q15
+        vld1.16         {q2,  q3},  [r1]!
+        vshl.s16        q1,  q1,  q15
+        vld1.16         {q8,  q9},  [r1]!
+        vshl.s16        q2,  q2,  q15
+        vld1.16         {q10, q11}, [r1], r2
+        vshl.s16        q3,  q3,  q15
+        vshl.s16        q8,  q8,  q15
+        vshl.s16        q9,  q9,  q15
+        vshl.s16        q10, q10, q15
+        vshl.s16        q11, q11, q15
+        vsub.i16        q0,  q0,  q14
+        vsub.i16        q1,  q1,  q14
+        vsub.i16        q2,  q2,  q14
+        vsub.i16        q3,  q3,  q14
+        vsub.i16        q8,  q8,  q14
+        vst1.16         {q0,  q1},  [r0, :128]!
+        vsub.i16        q9,  q9,  q14
+        vst1.16         {q2,  q3},  [r0, :128]!
+        vsub.i16        q10, q10, q14
+        vst1.16         {q8,  q9},  [r0, :128]!
+        vsub.i16        q11, q11, q14
+        vst1.16         {q10, q11}, [r0, :128]!
+        bgt             64b
+        pop             {r4-r11,pc}
+1280:
+        sub             r2,  r2,  #224
+128:
+        vld1.16         {q0,  q1},  [r1]!
+        subs            r4,  r4,  #1
+        vshl.s16        q0,  q0,  q15
+        vld1.16         {q2,  q3},  [r1]!
+        vshl.s16        q1,  q1,  q15
+        vld1.16         {q8,  q9},  [r1]!
+        vshl.s16        q2,  q2,  q15
+        vld1.16         {q10, q11}, [r1]!
+        vshl.s16        q3,  q3,  q15
+        vshl.s16        q8,  q8,  q15
+        vshl.s16        q9,  q9,  q15
+        vshl.s16        q10, q10, q15
+        vshl.s16        q11, q11, q15
+        vsub.i16        q0,  q0,  q14
+        vsub.i16        q1,  q1,  q14
+        vsub.i16        q2,  q2,  q14
+        vsub.i16        q3,  q3,  q14
+        vsub.i16        q8,  q8,  q14
+        vst1.16         {q0,  q1},  [r0, :128]!
+        vld1.16         {q0,  q1},  [r1]!
+        vsub.i16        q9,  q9,  q14
+        vsub.i16        q10, q10, q14
+        vst1.16         {q2,  q3},  [r0, :128]!
+        vld1.16         {q2,  q3},  [r1]!
+        vsub.i16        q11, q11, q14
+        vshl.s16        q0,  q0,  q15
+        vst1.16         {q8,  q9},  [r0, :128]!
+        vld1.16         {q8,  q9},  [r1]!
+        vshl.s16        q1,  q1,  q15
+        vshl.s16        q2,  q2,  q15
+        vst1.16         {q10, q11}, [r0, :128]!
+        vld1.16         {q10, q11}, [r1], r2
+        vshl.s16        q3,  q3,  q15
+        vshl.s16        q8,  q8,  q15
+        vshl.s16        q9,  q9,  q15
+        vshl.s16        q10, q10, q15
+        vshl.s16        q11, q11, q15
+        vsub.i16        q0,  q0,  q14
+        vsub.i16        q1,  q1,  q14
+        vsub.i16        q2,  q2,  q14
+        vsub.i16        q3,  q3,  q14
+        vsub.i16        q8,  q8,  q14
+        vst1.16         {q0,  q1},  [r0, :128]!
+        vsub.i16        q9,  q9,  q14
+        vst1.16         {q2,  q3},  [r0, :128]!
+        vsub.i16        q10, q10, q14
+        vst1.16         {q8,  q9},  [r0, :128]!
+        vsub.i16        q11, q11, q14
+        vst1.16         {q10, q11}, [r0, :128]!
+        bgt             128b
+        pop             {r4-r11,pc}
+endfunc
+
+.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
+        vld1.\wd        {\d0[]}, [\s0], \strd
+        vld1.\wd        {\d1[]}, [\s1], \strd
+.ifnb \d2
+        vld1.\wd        {\d2[]}, [\s0], \strd
+        vld1.\wd        {\d3[]}, [\s1], \strd
+.endif
+.ifnb \d4
+        vld1.\wd        {\d4[]}, [\s0], \strd
+.endif
+.ifnb \d5
+        vld1.\wd        {\d5[]}, [\s1], \strd
+.endif
+.ifnb \d6
+        vld1.\wd        {\d6[]}, [\s0], \strd
+.endif
+.endm
+.macro load_reg s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+        vld1.16         {\d0}, [\s0], \strd
+        vld1.16         {\d1}, [\s1], \strd
+.ifnb \d2
+        vld1.16         {\d2}, [\s0], \strd
+        vld1.16         {\d3}, [\s1], \strd
+.endif
+.ifnb \d4
+        vld1.16         {\d4}, [\s0], \strd
+.endif
+.ifnb \d5
+        vld1.16         {\d5}, [\s1], \strd
+.endif
+.ifnb \d6
+        vld1.16         {\d6}, [\s0], \strd
+.endif
+.endm
+.macro load_regpair s0, s1, strd, d0, d1, d2, d3, d4, d5
+        vld1.16         {\d0, \d1}, [\s0], \strd
+.ifnb \d2
+        vld1.16         {\d2, \d3}, [\s1], \strd
+.endif
+.ifnb \d4
+        vld1.16         {\d4, \d5}, [\s0], \strd
+.endif
+.endm
+.macro load_32 s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+        load_slice      \s0, \s1, \strd, 32, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_16s16 s0, s1, strd, d0, d1, d2, d3, d4, d5
+        load_regpair    \s0, \s1, \strd, \d0, \d1, \d2, \d3, \d4, \d5
+.endm
+.macro interleave_1_32 r0, r1, r2, r3, r4
+        vext.8          \r0, \r0, \r1, #4
+        vext.8          \r1, \r1, \r2, #4
+.ifnb \r3
+        vext.8          \r2, \r2, \r3, #4
+        vext.8          \r3, \r3, \r4, #4
+.endif
+.endm
+.macro vmin_u16 c, r0, r1, r2, r3
+        vmin.u16        \r0, \r0, \c
+.ifnb \r1
+        vmin.u16        \r1, \r1, \c
+.endif
+.ifnb \r2
+        vmin.u16        \r2, \r2, \c
+        vmin.u16        \r3, \r3, \c
+.endif
+.endm
+.macro vsub_i16 c, r0, r1, r2, r3
+        vsub.i16        \r0, \r0, \c
+.ifnb \r1
+        vsub.i16        \r1, \r1, \c
+.endif
+.ifnb \r2
+        vsub.i16        \r2, \r2, \c
+        vsub.i16        \r3, \r3, \c
+.endif
+.endm
+.macro vmull_vmlal_4 d, s0, s1, s2, s3
+        vmull.s16       \d,  \s0, d0[0]
+        vmlal.s16       \d,  \s1, d0[1]
+        vmlal.s16       \d,  \s2, d0[2]
+        vmlal.s16       \d,  \s3, d0[3]
+.endm
+.macro vmull_vmlal_8 d, s0, s1, s2, s3, s4, s5, s6, s7
+        vmull.s16       \d,  \s0, d0[0]
+        vmlal.s16       \d,  \s1, d0[1]
+        vmlal.s16       \d,  \s2, d0[2]
+        vmlal.s16       \d,  \s3, d0[3]
+        vmlal.s16       \d,  \s4, d1[0]
+        vmlal.s16       \d,  \s5, d1[1]
+        vmlal.s16       \d,  \s6, d1[2]
+        vmlal.s16       \d,  \s7, d1[3]
+.endm
+.macro vqrshrun_s32 shift, q0, d0, q1, d1, q2, d2, q3, d3
+        vqrshrun.s32    \d0, \q0, #\shift
+.ifnb \q1
+        vqrshrun.s32    \d1, \q1, #\shift
+.endif
+.ifnb \q2
+        vqrshrun.s32    \d2, \q2, #\shift
+        vqrshrun.s32    \d3, \q3, #\shift
+.endif
+.endm
+.macro vmovn_i32 q0, d0, q1, d1, q2, d2, q3, d3
+        vmovn.i32       \d0, \q0
+.ifnb \q1
+        vmovn.i32       \d1, \q1
+.endif
+.ifnb \q2
+        vmovn.i32       \d2, \q2
+        vmovn.i32       \d3, \q3
+.endif
+.endm
+.macro vrshl_s32 shift, r0, r1, r2, r3
+        vrshl.s32       \r0, \r0, \shift
+        vrshl.s32       \r1, \r1, \shift
+.ifnb \r2
+        vrshl.s32       \r2, \r2, \shift
+        vrshl.s32       \r3, \r3, \shift
+.endif
+.endm
+.macro vst1_32 strd, r0, r1
+        vst1.32         {\r0[0]}, [r0, :32], \strd
+        vst1.32         {\r0[1]}, [r9, :32], \strd
+.ifnb \r1
+        vst1.32         {\r1[0]}, [r0, :32], \strd
+        vst1.32         {\r1[1]}, [r9, :32], \strd
+.endif
+.endm
+.macro vst1_reg strd, align, r0, r1, r2, r3, r4, r5, r6, r7
+        vst1.16         {\r0}, [r0, \align], \strd
+        vst1.16         {\r1}, [r9, \align], \strd
+.ifnb \r2
+        vst1.16         {\r2}, [r0, \align], \strd
+        vst1.16         {\r3}, [r9, \align], \strd
+.endif
+.ifnb \r4
+        vst1.16         {\r4}, [r0, \align], \strd
+        vst1.16         {\r5}, [r9, \align], \strd
+        vst1.16         {\r6}, [r0, \align], \strd
+        vst1.16         {\r7}, [r9, \align], \strd
+.endif
+.endm
+.macro finalize type, q0, q1, d0, d1, q2, q3, d2, d3
+.ifc \type, put
+        vqrshrun_s32    6,   \q0, \d0, \q1, \d1, \q2, \d2, \q3, \d3
+        vmin_u16        q15, \q0, \q1
+.else
+        vrshl_s32       q14, \q0, \q1, \q2, \q3 // -(6-intermediate_bits)
+        vmovn_i32       \q0, \d0, \q1, \d1, \q2, \d2, \q3, \d3
+        vsub_i16        q15, \q0, \q1           // PREP_BIAS
+.endif
+.endm
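+// The shift_store_* helpers below finish one block of vertical filtering:
+// for put, saturate/narrow with a rounding shift by 6 and clamp against
+// bitdepth_max (q15); for prep, rounding-shift right by 6-intermediate_bits
+// (q14 holds the negated amount), narrow and subtract PREP_BIAS (q15).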
+.macro shift_store_4 type, strd, q0, q1, d0, d1, q2, q3, d2, d3
+        finalize        \type, \q0, \q1, \d0, \d1, \q2, \q3, \d2, \d3
+        vst1_reg        \strd, :64, \d0, \d1, \d2, \d3
+.endm
+.macro shift_store_8 type, strd, q0, q1, d0, d1, q2, q3, d2, d3
+        finalize        \type, \q0, \q1, \d0, \d1, \q2, \q3, \d2, \d3
+        vst1_reg        \strd, :128, \q0, \q1
+.endm
+.macro shift_store_16 type, strd, q0, q1, d0, d1, q2, q3, d2, d3
+        finalize        \type, \q0, \q1, \d0, \d1, \q2, \q3, \d2, \d3
+        vst1.16         {\q0, \q1}, [r0, :128], \strd
+.endm
+
+.macro make_8tap_fn op, type, type_h, type_v
+function \op\()_8tap_\type\()_16bpc_neon, export=1
+        push            {r4-r11,lr}
+        movw            r9,  \type_h
+        movw            r10, \type_v
+        b               \op\()_8tap_neon
+endfunc
+.endm
+
+// No spaces in these expressions, due to gas-preprocessor.
+#define REGULAR ((0*15<<7)|3*15)
+#define SMOOTH  ((1*15<<7)|4*15)
+#define SHARP   ((2*15<<7)|3*15)
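+// Each constant packs two base row offsets into mc_subpel_filters: the 8-tap
+// filter set in the upper 7-bit field and the 4-tap set (chosen for w <= 4
+// horizontally, h <= 4 vertically) in the lower 7 bits.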
+
+.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, my, bdmax, ds2, sr2
+make_8tap_fn \type, regular,        REGULAR, REGULAR
+make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
+make_8tap_fn \type, regular_sharp,  REGULAR, SHARP
+make_8tap_fn \type, smooth,         SMOOTH,  SMOOTH
+make_8tap_fn \type, smooth_regular, SMOOTH,  REGULAR
+make_8tap_fn \type, smooth_sharp,   SMOOTH,  SHARP
+make_8tap_fn \type, sharp,          SHARP,   SHARP
+make_8tap_fn \type, sharp_regular,  SHARP,   REGULAR
+make_8tap_fn \type, sharp_smooth,   SHARP,   SMOOTH
+
+function \type\()_8tap_neon
+        ldrd            r4,  r5,  [sp, #36]
+        ldrd            r6,  r7,  [sp, #44]
+.ifc \bdmax, r8
+        ldr             r8,  [sp, #52]
+.endif
+        movw            r11, #0x4081  // (1 << 14) | (1 << 7) | (1 << 0)
+        mul             \mx, \mx, r11
+        mul             \my, \my, r11
+        add             \mx, \mx, r9  // mx, 8tap_h, 4tap_h
+        add             \my, \my, r10 // my, 8tap_v, 4tap_v
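+        // \mx/\my now pack the 4-tap filter row in bits 0-6, the 8-tap row in
+        // bits 7-13, and the raw subpel offset in bits 14 and up (only used
+        // below to test whether any filtering is needed in that direction).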
+
+.ifc \type, prep
+        lsl             \d_strd, \w, #1
+.endif
+
+        vdup.16         q15, \bdmax            // bitdepth_max
+        clz             \bdmax,  \bdmax
+        clz             r9,  \w
+        sub             \bdmax,  \bdmax,  #18  // intermediate_bits = clz(bitdepth_max) - 18
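+        // e.g. bitdepth_max 1023 (10 bpc): clz = 22, intermediate_bits = 4;
+        //      bitdepth_max 4095 (12 bpc): clz = 20, intermediate_bits = 2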
+        tst             \mx, #(0x7f << 14)
+        sub             r9,  r9,  #24
+        add             lr,  \bdmax, #6        // 6 + intermediate_bits
+        rsb             r12, \bdmax, #6        // 6 - intermediate_bits
+        movrel          r11, X(mc_subpel_filters), -8
+        bne             L(\type\()_8tap_h)
+        tst             \my, #(0x7f << 14)
+        bne             L(\type\()_8tap_v)
+        b               \type\()_neon
+
+L(\type\()_8tap_h):
+        cmp             \w,  #4
+        ubfx            r10, \mx, #7,  #7
+        and             \mx, \mx, #0x7f
+        it              gt
+        movgt           \mx, r10
+        tst             \my, #(0x7f << 14)
+        add             \mx, r11, \mx, lsl #3
+        bne             L(\type\()_8tap_hv)
+
+        adr             r10, L(\type\()_8tap_h_tbl)
+        vdup.32         q14, r12           // 6 - intermediate_bits
+        ldr             r9,  [r10, r9, lsl #2]
+        vneg.s32        q14, q14           // -(6-intermediate_bits)
+.ifc \type, put
+        vdup.16         q13, \bdmax        // intermediate_bits
+.else
+        vmov.i16        q13, #PREP_BIAS
+.endif
+        add             r10, r10, r9
+.ifc \type, put
+        vneg.s16        q13, q13           // -intermediate_bits
+.endif
+        bx              r10
+
+        .align 2
+L(\type\()_8tap_h_tbl):
+        .word 1280f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+        .word 640f  - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+        .word 320f  - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+        .word 160f  - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+        .word 80f   - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+        .word 40f   - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+        .word 20f   - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+
+20:     // 2xN h
+.ifc \type, put
+        add             \mx, \mx, #2
+        vld1.32         {d0[]}, [\mx]
+        sub             \src,  \src,  #2
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \d_strd,  \d_strd,  #1
+        lsl             \s_strd,  \s_strd,  #1
+        vmovl.s8        q0,  d0
+2:
+        vld1.16         {q2}, [\src], \s_strd
+        vld1.16         {q3}, [\sr2], \s_strd
+        vext.8          d5,  d4,  d5,  #2
+        vext.8          d7,  d6,  d7,  #2
+        subs            \h,  \h,  #2
+        vtrn.32         d4,  d6
+        vtrn.32         d5,  d7
+        vmull.s16       q1,  d4,  d0[0]
+        vmlal.s16       q1,  d5,  d0[1]
+        vmlal.s16       q1,  d6,  d0[2]
+        vmlal.s16       q1,  d7,  d0[3]
+        vrshl.s32       q1,  q1,  q14 // -(6-intermediate_bits)
+        vqmovun.s32     d2,  q1
+        vrshl.s16       d2,  d2,  d26 // -intermediate_bits
+        vmin.u16        d2,  d2,  d30
+        vst1.32         {d2[0]}, [\dst, :32], \d_strd
+        vst1.32         {d2[1]}, [\ds2, :32], \d_strd
+        bgt             2b
+        pop             {r4-r11,pc}
+.endif
+
+40:     // 4xN h
+        add             \mx, \mx, #2
+        vld1.32         {d0[]}, [\mx]
+        sub             \src,  \src,  #2
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \d_strd,  \d_strd,  #1
+        lsl             \s_strd,  \s_strd,  #1
+        vmovl.s8        q0,  d0
+4:
+        vld1.16         {q8},  [\src], \s_strd
+        vld1.16         {q11}, [\sr2], \s_strd
+        vext.8          d18, d16, d17, #2
+        vext.8          d19, d16, d17, #4
+        vext.8          d20, d16, d17, #6
+        vext.8          d24, d22, d23, #2
+        vext.8          d25, d22, d23, #4
+        vext.8          d21, d22, d23, #6
+        subs            \h,  \h,  #2
+        vmull.s16       q2,  d16, d0[0]
+        vmlal.s16       q2,  d18, d0[1]
+        vmlal.s16       q2,  d19, d0[2]
+        vmlal.s16       q2,  d20, d0[3]
+        vmull.s16       q3,  d22, d0[0]
+        vmlal.s16       q3,  d24, d0[1]
+        vmlal.s16       q3,  d25, d0[2]
+        vmlal.s16       q3,  d21, d0[3]
+        vrshl.s32       q2,  q2,  q14 // -(6-intermediate_bits)
+        vrshl.s32       q3,  q3,  q14 // -(6-intermediate_bits)
+.ifc \type, put
+        vqmovun.s32     d4,  q2
+        vqmovun.s32     d5,  q3
+        vrshl.s16       q2,  q2,  q13 // -intermediate_bits
+        vmin.u16        q2,  q2,  q15
+.else
+        vmovn.s32       d4,  q2
+        vmovn.s32       d5,  q3
+        vsub.i16        q2,  q2,  q13 // PREP_BIAS
+.endif
+        vst1.16         {d4}, [\dst, :64], \d_strd
+        vst1.16         {d5}, [\ds2, :64], \d_strd
+        bgt             4b
+        pop             {r4-r11,pc}
+
+80:
+160:
+320:
+640:
+1280:   // 8xN, 16xN, 32xN, ... h
+        vpush           {q4-q5}
+        vld1.8          {d0}, [\mx, :64]
+        sub             \src,  \src,  #6
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \s_strd,  \s_strd,  #1
+        vmovl.s8        q0,  d0
+
+        sub             \s_strd,  \s_strd,  \w, lsl #1
+        sub             \s_strd,  \s_strd,  #16
+.ifc \type, put
+        lsl             \d_strd,  \d_strd,  #1
+        sub             \d_strd,  \d_strd,  \w, lsl #1
+.endif
+81:
+        vld1.16         {q8,  q9},  [\src]!
+        vld1.16         {q10, q11}, [\sr2]!
+        mov             \mx, \w
+
+8:
+        vmull.s16       q1,  d16, d0[0]
+        vmull.s16       q2,  d17, d0[0]
+        vmull.s16       q3,  d20, d0[0]
+        vmull.s16       q4,  d21, d0[0]
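+        // The 7 remaining taps are formed by vext-shifting the 16-pixel source
+        // window by 2*i bytes (one pixel per tap).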
+.irpc i, 1234567
+        vext.8          q12, q8,  q9,  #(2*\i)
+        vext.8          q5,  q10, q11, #(2*\i)
+.if \i < 4
+        vmlal.s16       q1,  d24, d0[\i]
+        vmlal.s16       q2,  d25, d0[\i]
+        vmlal.s16       q3,  d10, d0[\i]
+        vmlal.s16       q4,  d11, d0[\i]
+.else
+        vmlal.s16       q1,  d24, d1[\i-4]
+        vmlal.s16       q2,  d25, d1[\i-4]
+        vmlal.s16       q3,  d10, d1[\i-4]
+        vmlal.s16       q4,  d11, d1[\i-4]
+.endif
+.endr
+        subs            \mx, \mx, #8
+        vrshl.s32       q1,  q1,  q14 // -(6-intermediate_bits)
+        vrshl.s32       q2,  q2,  q14 // -(6-intermediate_bits)
+        vrshl.s32       q3,  q3,  q14 // -(6-intermediate_bits)
+        vrshl.s32       q4,  q4,  q14 // -(6-intermediate_bits)
+.ifc \type, put
+        vqmovun.s32     d2,  q1
+        vqmovun.s32     d3,  q2
+        vqmovun.s32     d4,  q3
+        vqmovun.s32     d5,  q4
+        vrshl.s16       q1,  q1,  q13 // -intermediate_bits
+        vrshl.s16       q2,  q2,  q13 // -intermediate_bits
+        vmin.u16        q1,  q1,  q15
+        vmin.u16        q2,  q2,  q15
+.else
+        vmovn.s32       d2,  q1
+        vmovn.s32       d3,  q2
+        vmovn.s32       d4,  q3
+        vmovn.s32       d5,  q4
+        vsub.i16        q1,  q1,  q13 // PREP_BIAS
+        vsub.i16        q2,  q2,  q13 // PREP_BIAS
+.endif
+        vst1.16         {q1}, [\dst, :128]!
+        vst1.16         {q2}, [\ds2, :128]!
+        ble             9f
+
+        vmov            q8,  q9
+        vmov            q10, q11
+        vld1.16         {q9},  [\src]!
+        vld1.16         {q11}, [\sr2]!
+        b               8b
+
+9:
+        add             \dst,  \dst,  \d_strd
+        add             \ds2,  \ds2,  \d_strd
+        add             \src,  \src,  \s_strd
+        add             \sr2,  \sr2,  \s_strd
+
+        subs            \h,  \h,  #2
+        bgt             81b
+        vpop            {q4-q5}
+        pop             {r4-r11,pc}
+
+
+L(\type\()_8tap_v):
+        cmp             \h,  #4
+        ubfx            r10, \my, #7,  #7
+        and             \my, \my, #0x7f
+        it              gt
+        movgt           \my, r10
+        add             \my, r11, \my, lsl #3
+
+.ifc \type, prep
+        vdup.32         q14, r12        // 6 - intermediate_bits
+        vmov.i16        q15, #PREP_BIAS
+.endif
+        adr             r10, L(\type\()_8tap_v_tbl)
+        ldr             r9,  [r10, r9, lsl #2]
+.ifc \type, prep
+        vneg.s32        q14, q14        // -(6-intermediate_bits)
+.endif
+        add             r10, r10, r9
+        bx              r10
+
+        .align 2
+L(\type\()_8tap_v_tbl):
+        .word 1280f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+        .word 640f  - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+        .word 320f  - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+        .word 160f  - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+        .word 80f   - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+        .word 40f   - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+        .word 20f   - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+
+20:     // 2xN v
+.ifc \type, put
+        bgt             28f
+
+        cmp             \h,  #2
+        add             \my, \my, #2
+        vld1.32         {d0[]}, [\my]
+        sub             \src,  \src,  \s_strd
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \s_strd,  \s_strd,  #1
+        lsl             \d_strd,  \d_strd,  #1
+        vmovl.s8        q0,  d0
+
+        // 2x2 v
+        load_32         \src, \sr2, \s_strd, d1, d2, d3, d4, d5
+        interleave_1_32 d1,  d2,  d3,  d4,  d5
+        bgt             24f
+        vmull_vmlal_4   q8,  d1,  d2,  d3,  d4
+        vqrshrun_s32    6,   q8,  d16
+        vmin_u16        d30, d16
+        vst1_32         \d_strd,  d16
+        pop             {r4-r11,pc}
+
+24:     // 2x4 v
+        load_32         \sr2, \src, \s_strd, d6, d7
+        interleave_1_32 d5,  d6,  d7
+        vmull_vmlal_4   q8,  d1,  d2,  d3,  d4
+        vmull_vmlal_4   q9,  d3,  d4,  d5,  d6
+        vqrshrun_s32    6,   q8,  d16, q9,  d17
+        vmin_u16        q15, q8
+        vst1_32         \d_strd,  d16, d17
+        pop             {r4-r11,pc}
+
+28:     // 2x8, 2x16 v
+        vld1.8          {d0}, [\my, :64]
+        sub             \sr2,  \src,  \s_strd, lsl #1
+        add             \ds2,  \dst,  \d_strd
+        sub             \src,  \sr2,  \s_strd
+        lsl             \d_strd,  \d_strd,  #1
+        lsl             \s_strd,  \s_strd,  #1
+        vmovl.s8        q0,  d0
+
+        load_32         \src, \sr2, \s_strd, d2, d3, d4, d5, d6, d7, d16
+        interleave_1_32 d2,  d3,  d4,  d5,  d6
+        interleave_1_32 d6,  d7,  d16
+216:
+        subs            \h,  \h,  #8
+        load_32         \sr2, \src, \s_strd, d17, d18, d19, d20
+        load_32         \sr2, \src, \s_strd, d21, d22, d23, d24
+        interleave_1_32 d16, d17, d18, d19, d20
+        interleave_1_32 d20, d21, d22, d23, d24
+        vmull_vmlal_8   q13, d2,  d3,  d4,  d5,  d6,  d7,  d16, d17
+        vmull_vmlal_8   q1,  d4,  d5,  d6,  d7,  d16, d17, d18, d19
+        vmull_vmlal_8   q2,  d6,  d7,  d16, d17, d18, d19, d20, d21
+        vmull_vmlal_8   q3,  d16, d17, d18, d19, d20, d21, d22, d23
+        vqrshrun_s32    6,   q13, d26, q1,  d27, q2,  d2,  q3,  d3
+        vmin_u16        q15, q13, q1
+        vst1_32         \d_strd,  d26, d27
+        vst1_32         \d_strd,  d2,  d3
+        ble             0f
+        vmov            q1,  q9
+        vmov            q2,  q10
+        vmov            q3,  q11
+        vmov            d16, d24
+        b               216b
+0:
+        pop             {r4-r11,pc}
+.endif
+
+40:
+        bgt             480f
+
+        // 4x2, 4x4 v
+        cmp             \h,  #2
+        add             \my, \my, #2
+        vld1.32         {d0[]}, [\my]
+        sub             \src, \src, \s_strd
+        add             \ds2, \dst, \d_strd
+        add             \sr2, \src, \s_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        vmovl.s8        q0,  d0
+
+        load_reg        \src, \sr2, \s_strd, d1, d2, d3, d4, d5
+        vmull_vmlal_4   q8,  d1,  d2,  d3,  d4
+        vmull_vmlal_4   q9,  d2,  d3,  d4,  d5
+        shift_store_4   \type, \d_strd, q8, q9, d16, d17
+        ble             0f
+        load_reg        \sr2, \src, \s_strd, d6, d7
+        vmull_vmlal_4   q8,  d3,  d4,  d5,  d6
+        vmull_vmlal_4   q9,  d4,  d5,  d6,  d7
+        shift_store_4   \type, \d_strd, q8, q9, d16, d17
+0:
+        pop             {r4-r11,pc}
+
+480:    // 4x8, 4x16 v
+        vld1.8          {d0}, [\my, :64]
+        sub             \sr2, \src, \s_strd, lsl #1
+        add             \ds2, \dst, \d_strd
+        sub             \src, \sr2, \s_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        vmovl.s8        q0,  d0
+
+        load_reg        \src, \sr2, \s_strd, d16, d17, d18, d19, d20, d21, d22
+
+48:
+        subs            \h,  \h,  #4
+        load_reg        \sr2, \src, \s_strd, d23, d24, d25, d26
+        vmull_vmlal_8   q1,  d16, d17, d18, d19, d20, d21, d22, d23
+        vmull_vmlal_8   q2,  d17, d18, d19, d20, d21, d22, d23, d24
+        vmull_vmlal_8   q3,  d18, d19, d20, d21, d22, d23, d24, d25
+        vmull_vmlal_8   q8,  d19, d20, d21, d22, d23, d24, d25, d26
+        shift_store_4   \type, \d_strd, q1, q2, d2, d3, q3, q8, d4, d5
+        ble             0f
+        vmov            q8,  q10
+        vmov            q9,  q11
+        vmov            q10, q12
+        vmov            d22, d26
+        b               48b
+0:
+        pop             {r4-r11,pc}
+
+80:
+        bgt             880f
+
+        // 8x2, 8x4 v
+        cmp             \h,  #2
+        add             \my, \my, #2
+        vld1.32         {d0[]}, [\my]
+        sub             \src, \src, \s_strd
+        add             \ds2, \dst, \d_strd
+        add             \sr2, \src, \s_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        vmovl.s8        q0,  d0
+
+        load_reg        \src, \sr2, \s_strd, q1, q2, q3, q8, q9
+        vmull_vmlal_4   q10, d2,  d4,  d6,  d16
+        vmull_vmlal_4   q11, d3,  d5,  d7,  d17
+        vmull_vmlal_4   q12, d4,  d6,  d16, d18
+        vmull_vmlal_4   q13, d5,  d7,  d17, d19
+        shift_store_8   \type, \d_strd, q10, q11, d20, d21, q12, q13, d22, d23
+        ble             0f
+        load_reg        \sr2, \src, \s_strd, q10, q11
+        vmull_vmlal_4   q1,  d6,  d16, d18, d20
+        vmull_vmlal_4   q2,  d7,  d17, d19, d21
+        vmull_vmlal_4   q12, d16, d18, d20, d22
+        vmull_vmlal_4   q13, d17, d19, d21, d23
+        shift_store_8   \type, \d_strd, q1, q2, d2, d3, q12, q13, d4, d5
+0:
+        pop             {r4-r11,pc}
+
+880:    // 8x6, 8x8, 8x16, 8x32 v
+1680:   // 16x8, 16x16, ...
+320:    // 32x8, 32x16, ...
+640:
+1280:
+        vpush           {q4-q7}
+        vld1.8          {d0}, [\my, :64]
+        sub             \src, \src, \s_strd
+        sub             \src, \src, \s_strd, lsl #1
+        vmovl.s8        q0,  d0
+        mov             \my, \h
+168:
+        add             \ds2, \dst, \d_strd
+        add             \sr2, \src, \s_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+
+        load_reg        \src, \sr2, \s_strd, q5, q6, q7, q8, q9, q10, q11
+
+88:
+        subs            \h,  \h,  #2
+        load_reg        \sr2, \src, \s_strd, q12, q13
+        vmull_vmlal_8   q1,  d10, d12, d14, d16, d18, d20, d22, d24
+        vmull_vmlal_8   q2,  d11, d13, d15, d17, d19, d21, d23, d25
+        vmull_vmlal_8   q3,  d12, d14, d16, d18, d20, d22, d24, d26
+        vmull_vmlal_8   q4,  d13, d15, d17, d19, d21, d23, d25, d27
+        shift_store_8   \type, \d_strd, q1, q2,  d2,  d3,  q3,  q4,  d4,  d5
+        ble             9f
+        subs            \h,  \h,  #2
+        load_reg        \sr2, \src, \s_strd, q1,  q2
+        vmull_vmlal_8   q3,  d14, d16, d18, d20, d22, d24, d26, d2
+        vmull_vmlal_8   q4,  d15, d17, d19, d21, d23, d25, d27, d3
+        vmull_vmlal_8   q5,  d16, d18, d20, d22, d24, d26, d2,  d4
+        vmull_vmlal_8   q6,  d17, d19, d21, d23, d25, d27, d3,  d5
+        shift_store_8   \type, \d_strd, q3, q4,  d6,  d7,  q5,  q6,  d8,  d9
+        ble             9f
+        vmov            q5,  q9
+        vmov            q6,  q10
+        vmov            q7,  q11
+        vmov            q8,  q12
+        vmov            q9,  q13
+        vmov            q10, q1
+        vmov            q11, q2
+        b               88b
+9:
+        subs            \w,  \w,  #8
+        ble             0f
+        asr             \s_strd, \s_strd, #1
+        asr             \d_strd, \d_strd, #1
+        mls             \src, \s_strd, \my, \src
+        mls             \dst, \d_strd, \my, \dst
+        sub             \src, \src, \s_strd, lsl #3
+        mov             \h,  \my
+        add             \src, \src, #16
+        add             \dst, \dst, #16
+        b               168b
+0:
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+
+160:
+        bgt             1680b
+
+        // 16x2, 16x4 v
+        vpush           {q6-q7}
+        add             \my, \my, #2
+        vld1.32         {d0[]}, [\my]
+        sub             \src, \src, \s_strd
+        vmovl.s8        q0,  d0
+
+        load_16s16      \src, \src, \s_strd, q6,  q7,  q8,  q9, q10, q11
+16:
+        load_16s16      \src, \src, \s_strd, q12, q13
+        subs            \h,  \h,  #1
+        vmull_vmlal_4   q1,  d12, d16, d20, d24
+        vmull_vmlal_4   q2,  d13, d17, d21, d25
+        vmull_vmlal_4   q3,  d14, d18, d22, d26
+        vmull_vmlal_4   q6,  d15, d19, d23, d27
+        shift_store_16  \type, \d_strd, q1, q2, d2, d3, q3, q6, d4, d5
+        ble             0f
+        vmov            q6,  q8
+        vmov            q7,  q9
+        vmov            q8,  q10
+        vmov            q9,  q11
+        vmov            q10, q12
+        vmov            q11, q13
+        b               16b
+0:
+        vpop            {q6-q7}
+        pop             {r4-r11,pc}
+
+
+L(\type\()_8tap_hv):
+        cmp             \h,  #4
+        ubfx            r10, \my, #7,  #7
+        and             \my, \my, #0x7f
+        it              gt
+        movgt           \my, r10
+4:
+        add             \my, r11, \my, lsl #3
+
+        adr             r10, L(\type\()_8tap_hv_tbl)
+        neg             r12, r12           // -(6-intermediate_bits)
+        ldr             r9,  [r10, r9, lsl #2]
+        vdup.32         q14, r12           // -(6-intermediate_bits)
+.ifc \type, put
+        neg             r8,  lr            // -(6+intermediate_bits)
+.else
+        vmov.i16        q13, #PREP_BIAS
+.endif
+        add             r10, r10, r9
+.ifc \type, put
+        vdup.32         q13, r8            // -(6+intermediate_bits)
+.endif
+        bx              r10
+
+        .align 2
+L(\type\()_8tap_hv_tbl):
+        .word 1280f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+        .word 640f  - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+        .word 320f  - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+        .word 160f  - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+        .word 80f   - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+        .word 40f   - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+        .word 20f   - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+
+20:
+.ifc \type, put
+        add             \mx, \mx, #2
+        vld1.32         {d0[]}, [\mx]
+        bgt             280f
+        add             \my, \my, #2
+        vld1.32         {d2[]}, [\my]
+
+        // 2x2, 2x4 hv
+        sub             \sr2, \src, #2
+        sub             \src, \sr2, \s_strd
+        add             \ds2, \dst, \d_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        vmovl.s8        q0,  d0
+        vmovl.s8        q1,  d2
+
+        vld1.16         {q11}, [\src], \s_strd
+        vext.8          d24, d22, d23, #2
+        vmull.s16       q11, d22, d0
+        vmull.s16       q12, d24, d0
+        vpadd.s32       d22, d22, d23
+        vpadd.s32       d23, d24, d25
+        vpadd.s32       d22, d22, d23
+        vrshl.s32       d16, d22, d28 // -(6-intermediate_bits)
+        vmovn.i32       d16, q8
+        bl              L(\type\()_8tap_filter_2)
+
+        vext.8          d16, d16, d16, #4
+        vext.8          d16, d16, d24, #4
+        vmov            d17, d24
+
+2:
+        bl              L(\type\()_8tap_filter_2)
+
+        vext.8          d18, d17, d24, #4
+        vmull.s16       q2,  d16, d2[0]
+        vmlal.s16       q2,  d17, d2[1]
+        vmlal.s16       q2,  d18, d2[2]
+        vmlal.s16       q2,  d24, d2[3]
+
+        vrshl.s32       q2,  q2,  q13 // -(6+intermediate_bits)
+        vqmovun.s32     d4,  q2
+        vmin.u16        d4,  d4,  d30
+        subs            \h,  \h,  #2
+        vst1.32         {d4[0]}, [\dst, :32], \d_strd
+        vst1.32         {d4[1]}, [\ds2, :32], \d_strd
+        ble             0f
+        vmov            d16, d18
+        vmov            d17, d24
+        b               2b
+
+280:    // 2x8, 2x16, 2x32 hv
+        vld1.8          {d2},  [\my, :64]
+        sub             \src, \src, #2
+        sub             \sr2, \src, \s_strd, lsl #1
+        sub             \src, \sr2, \s_strd
+        add             \ds2, \dst, \d_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        vmovl.s8        q0,  d0
+        vmovl.s8        q1,  d2
+
+        vld1.16         {q11}, [\src], \s_strd
+        vext.8          d24, d22, d23, #2
+        vmull.s16       q11, d22, d0
+        vmull.s16       q12, d24, d0
+        vpadd.s32       d22, d22, d23
+        vpadd.s32       d23, d24, d25
+        vpadd.s32       d22, d22, d23
+        vrshl.s32       d16, d22, d28 // -(6-intermediate_bits)
+        vmovn.i32       d16, q8
+
+        bl              L(\type\()_8tap_filter_2)
+
+        vext.8          d16, d16, d16, #4
+        vext.8          d16, d16, d24, #4
+        vmov            d17, d24
+        bl              L(\type\()_8tap_filter_2)
+        vext.8          d18, d17, d24, #4
+        vmov            d19, d24
+        bl              L(\type\()_8tap_filter_2)
+        vext.8          d20, d19, d24, #4
+        vmov            d21, d24
+
+28:
+        bl              L(\type\()_8tap_filter_2)
+        vext.8          d22, d21, d24, #4
+        vmull.s16       q3,  d16, d2[0]
+        vmlal.s16       q3,  d17, d2[1]
+        vmlal.s16       q3,  d18, d2[2]
+        vmlal.s16       q3,  d19, d2[3]
+        vmlal.s16       q3,  d20, d3[0]
+        vmlal.s16       q3,  d21, d3[1]
+        vmlal.s16       q3,  d22, d3[2]
+        vmlal.s16       q3,  d24, d3[3]
+
+        vrshl.s32       q3,  q3,  q13 // -(6+intermediate_bits)
+        vqmovun.s32     d6,  q3
+        vmin.u16        d6,  d6,  d30
+        subs            \h,  \h,  #2
+        vst1.32         {d6[0]}, [\dst, :32], \d_strd
+        vst1.32         {d6[1]}, [\ds2, :32], \d_strd
+        ble             0f
+        vmov            q8,  q9
+        vmov            q9,  q10
+        vmov            d20, d22
+        vmov            d21, d24
+        b               28b
+0:
+        pop             {r4-r11,pc}
+
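+// Horizontally filters one pair of rows (one from \sr2, one from \src) for the
+// 2xN hv paths; the four filtered values (two per row) are returned in d24.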
+L(\type\()_8tap_filter_2):
+        vld1.16         {q11}, [\sr2], \s_strd
+        vld1.16         {q12}, [\src], \s_strd
+        vext.8          d23, d22, d23, #2
+        vext.8          d25, d24, d25, #2
+        vtrn.32         q11, q12
+        vmull.s16       q3,  d22, d0[0]
+        vmlal.s16       q3,  d23, d0[1]
+        vmlal.s16       q3,  d24, d0[2]
+        vmlal.s16       q3,  d25, d0[3]
+        vrshl.s32       q3,  q3,  q14 // -(6-intermediate_bits)
+        vmovn.i32       d24, q3
+        bx              lr
+.endif
+
+40:
+        add             \mx, \mx, #2
+        vld1.32         {d0[]}, [\mx]
+        bgt             480f
+        add             \my, \my, #2
+        vld1.32         {d2[]}, [\my]
+        sub             \sr2, \src, #2
+        sub             \src, \sr2, \s_strd
+        add             \ds2, \dst, \d_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        vmovl.s8        q0,  d0
+        vmovl.s8        q1,  d2
+
+        // 4x2, 4x4 hv
+        vld1.16         {q11}, [\src], \s_strd
+        vext.8          d24, d22, d23, #2
+        vext.8          d25, d22, d23, #4
+        vext.8          d23, d22, d23, #6
+        vmull.s16       q10, d22, d0[0]
+        vmlal.s16       q10, d24, d0[1]
+        vmlal.s16       q10, d25, d0[2]
+        vmlal.s16       q10, d23, d0[3]
+        vrshl.s32       q10, q10, q14 // -(6-intermediate_bits)
+        vmovn.i32       d17, q10
+
+        bl              L(\type\()_8tap_filter_4)
+        vmov            q9,  q12
+
+4:
+        bl              L(\type\()_8tap_filter_4)
+        vmull.s16       q2,  d17, d2[0]
+        vmlal.s16       q2,  d18, d2[1]
+        vmlal.s16       q2,  d19, d2[2]
+        vmlal.s16       q2,  d24, d2[3]
+        vmull.s16       q3,  d18, d2[0]
+        vmlal.s16       q3,  d19, d2[1]
+        vmlal.s16       q3,  d24, d2[2]
+        vmlal.s16       q3,  d25, d2[3]
+.ifc \type, put
+        vrshl.s32       q2,  q2,  q13 // -(6+intermediate_bits)
+        vrshl.s32       q3,  q3,  q13 // -(6+intermediate_bits)
+        vqmovun.s32     d4,  q2
+        vqmovun.s32     d5,  q3
+        vmin.u16        q2,  q2,  q15
+.else
+        vrshrn.i32      d4,  q2,  #6
+        vrshrn.i32      d5,  q3,  #6
+        vsub.i16        q2,  q2,  q13 // PREP_BIAS
+.endif
+        subs            \h,  \h,  #2
+
+        vst1.16         {d4}, [\dst, :64], \d_strd
+        vst1.16         {d5}, [\ds2, :64], \d_strd
+        ble             0f
+        vmov            d17, d19
+        vmov            q9,  q12
+        b               4b
+0:
+        pop             {r4-r11,pc}
+
+480:    // 4x8, 4x16, 4x32 hv
+        vpush           {d13-d15}
+        vld1.8          {d2},  [\my, :64]
+        sub             \src, \src, #2
+        sub             \sr2, \src, \s_strd, lsl #1
+        sub             \src, \sr2, \s_strd
+        add             \ds2, \dst, \d_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        vmovl.s8        q0,  d0
+        vmovl.s8        q1,  d2
+
+        vld1.16         {q11}, [\src], \s_strd
+        vext.8          d24, d22, d23, #2
+        vext.8          d25, d22, d23, #4
+        vext.8          d23, d22, d23, #6
+        vmull.s16       q10, d22, d0[0]
+        vmlal.s16       q10, d24, d0[1]
+        vmlal.s16       q10, d25, d0[2]
+        vmlal.s16       q10, d23, d0[3]
+        vrshl.s32       q10, q10, q14 // -(6-intermediate_bits)
+        vmovn.i32       d13, q10
+
+        bl              L(\type\()_8tap_filter_4)
+        vmov            q7,  q12
+        bl              L(\type\()_8tap_filter_4)
+        vmov            q8,  q12
+        bl              L(\type\()_8tap_filter_4)
+        vmov            q9,  q12
+
+48:
+        bl              L(\type\()_8tap_filter_4)
+        vmull.s16       q2,  d13, d2[0]
+        vmlal.s16       q2,  d14, d2[1]
+        vmlal.s16       q2,  d15, d2[2]
+        vmlal.s16       q2,  d16, d2[3]
+        vmlal.s16       q2,  d17, d3[0]
+        vmlal.s16       q2,  d18, d3[1]
+        vmlal.s16       q2,  d19, d3[2]
+        vmlal.s16       q2,  d24, d3[3]
+        vmull.s16       q3,  d14, d2[0]
+        vmlal.s16       q3,  d15, d2[1]
+        vmlal.s16       q3,  d16, d2[2]
+        vmlal.s16       q3,  d17, d2[3]
+        vmlal.s16       q3,  d18, d3[0]
+        vmlal.s16       q3,  d19, d3[1]
+        vmlal.s16       q3,  d24, d3[2]
+        vmlal.s16       q3,  d25, d3[3]
+.ifc \type, put
+        vrshl.s32       q2,  q2,  q13 // -(6+intermediate_bits)
+        vrshl.s32       q3,  q3,  q13 // -(6+intermediate_bits)
+        vqmovun.s32     d4,  q2
+        vqmovun.s32     d5,  q3
+        vmin.u16        q2,  q2,  q15
+.else
+        vrshrn.i32      d4,  q2,  #6
+        vrshrn.i32      d5,  q3,  #6
+        vsub.i16        q2,  q2,  q13 // PREP_BIAS
+.endif
+        subs            \h,  \h,  #2
+        vst1.16         {d4}, [\dst, :64], \d_strd
+        vst1.16         {d5}, [\ds2, :64], \d_strd
+        ble             0f
+        vmov            d13, d15
+        vmov            q7,  q8
+        vmov            q8,  q9
+        vmov            q9,  q12
+        b               48b
+0:
+        vpop            {d13-d15}
+        pop             {r4-r11,pc}
+
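+// Horizontally filters 4 pixels from one row of \sr2 and one of \src; the two
+// results are returned in d24 and d25.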
+L(\type\()_8tap_filter_4):
+        vld1.16         {q10}, [\sr2], \s_strd
+        vld1.16         {q11}, [\src], \s_strd
+        vext.8          d24, d20, d21, #2
+        vext.8          d25, d20, d21, #4
+        vext.8          d21, d20, d21, #6
+        vmull.s16       q3,  d20, d0[0]
+        vmlal.s16       q3,  d24, d0[1]
+        vmlal.s16       q3,  d25, d0[2]
+        vmlal.s16       q3,  d21, d0[3]
+        vext.8          d24, d22, d23, #2
+        vext.8          d25, d22, d23, #4
+        vext.8          d23, d22, d23, #6
+        vmull.s16       q10, d22, d0[0]
+        vmlal.s16       q10, d24, d0[1]
+        vmlal.s16       q10, d25, d0[2]
+        vmlal.s16       q10, d23, d0[3]
+        vrshl.s32       q3,  q3,  q14 // -(6-intermediate_bits)
+        vrshl.s32       q10, q10, q14 // -(6-intermediate_bits)
+        vmovn.i32       d24, q3
+        vmovn.i32       d25, q10
+        bx              lr
+
+80:
+160:
+320:
+        bgt             880f
+        add             \my, \my, #2
+        vld1.8          {d0},  [\mx, :64]
+        vld1.32         {d2[]}, [\my]
+        sub             \src,  \src,  #6
+        sub             \src,  \src,  \s_strd
+        vmovl.s8        q0,  d0
+        vmovl.s8        q1,  d2
+        mov             \my, \h
+
+164:    // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
+        add             \ds2, \dst, \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \d_strd, \d_strd, #1
+        lsl             \s_strd, \s_strd, #1
+
+        vld1.16         {q11, q12}, [\src], \s_strd
+        vmull.s16       q2,  d22, d0[0]
+        vmull.s16       q3,  d23, d0[0]
+        vdup.32         q14, r12      // -(6-intermediate_bits)
+.irpc i, 1234567
+        vext.8          q10, q11, q12, #(2*\i)
+.if \i < 4
+        vmlal.s16       q2,  d20, d0[\i]
+        vmlal.s16       q3,  d21, d0[\i]
+.else
+        vmlal.s16       q2,  d20, d1[\i - 4]
+        vmlal.s16       q3,  d21, d1[\i - 4]
+.endif
+.endr
+        vrshl.s32       q2,  q2,  q14 // -(6-intermediate_bits)
+        vrshl.s32       q3,  q3,  q14 // -(6-intermediate_bits)
+        vmovn.i32       d16, q2
+        vmovn.i32       d17, q3
+
+        bl              L(\type\()_8tap_filter_8)
+        vmov            q9,  q11
+        vmov            q10, q12
+
+8:
+        bl              L(\type\()_8tap_filter_8)
+        vmull.s16       q2,  d16, d2[0]
+        vmull.s16       q3,  d17, d2[0]
+        vmull.s16       q13, d18, d2[0]
+        vmull.s16       q14, d19, d2[0]
+.ifc \type, put
+        vdup.32         q8,  r8      // -(6+intermediate_bits)
+.endif
+        vmlal.s16       q2,  d18, d2[1]
+        vmlal.s16       q3,  d19, d2[1]
+        vmlal.s16       q13, d20, d2[1]
+        vmlal.s16       q14, d21, d2[1]
+        vmlal.s16       q2,  d20, d2[2]
+        vmlal.s16       q3,  d21, d2[2]
+        vmlal.s16       q13, d22, d2[2]
+        vmlal.s16       q14, d23, d2[2]
+        vmlal.s16       q2,  d22, d2[3]
+        vmlal.s16       q3,  d23, d2[3]
+        vmlal.s16       q13, d24, d2[3]
+        vmlal.s16       q14, d25, d2[3]
+.ifc \type, put
+        vdup.16         q9,  \bdmax  // bitdepth_max
+        vrshl.s32       q2,  q2,  q8 // -(6+intermediate_bits)
+        vrshl.s32       q3,  q3,  q8 // -(6+intermediate_bits)
+        vrshl.s32       q13, q13, q8 // -(6+intermediate_bits)
+        vrshl.s32       q14, q14, q8 // -(6+intermediate_bits)
+        vqmovun.s32     d4,  q2
+        vqmovun.s32     d5,  q3
+        vqmovun.s32     d6,  q13
+        vqmovun.s32     d7,  q14
+        vmin.u16        q2,  q2,  q15
+        vmin.u16        q3,  q3,  q15
+.else
+        vmov.i16        q9,  #PREP_BIAS
+        vrshrn.i32      d4,  q2,  #6
+        vrshrn.i32      d5,  q3,  #6
+        vrshrn.i32      d6,  q13, #6
+        vrshrn.i32      d7,  q14, #6
+        vsub.i16        q2,  q2,  q9 // PREP_BIAS
+        vsub.i16        q3,  q3,  q9 // PREP_BIAS
+.endif
+        subs            \h,  \h,  #2
+        vst1.16         {q2}, [\dst, :128], \d_strd
+        vst1.16         {q3}, [\ds2, :128], \d_strd
+        ble             9f
+        vmov            q8,  q10
+        vmov            q9,  q11
+        vmov            q10, q12
+        b               8b
+9:
+        subs            \w,  \w,  #8
+        ble             0f
+        asr             \s_strd,  \s_strd,  #1
+        asr             \d_strd,  \d_strd,  #1
+        mls             \src,  \s_strd,  \my,  \src
+        mls             \dst,  \d_strd,  \my,  \dst
+        sub             \src,  \src,  \s_strd,  lsl #2
+        mov             \h,  \my
+        add             \src,  \src,  #16
+        add             \dst,  \dst,  #16
+        b               164b
+0:
+        pop             {r4-r11,pc}
+
+880:    // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
+640:
+1280:
+        vpush           {q4-q7}
+        vld1.8          {d0}, [\mx, :64]
+        vld1.8          {d2}, [\my, :64]
+        sub             \src,  \src,  #6
+        sub             \src,  \src,  \s_strd
+        sub             \src,  \src,  \s_strd, lsl #1
+        vmovl.s8        q0,  d0
+        vmovl.s8        q1,  d2
+        mov             \my, \h
+
+168:
+        add             \ds2, \dst, \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \d_strd, \d_strd, #1
+        lsl             \s_strd, \s_strd, #1
+
+        vld1.16         {q11, q12}, [\src], \s_strd
+        vmull.s16       q2,  d22, d0[0]
+        vmull.s16       q3,  d23, d0[0]
+        vdup.32         q14, r12      // -(6-intermediate_bits)
+.irpc i, 1234567
+        vext.8          q10, q11, q12, #(2*\i)
+.if \i < 4
+        vmlal.s16       q2,  d20, d0[\i]
+        vmlal.s16       q3,  d21, d0[\i]
+.else
+        vmlal.s16       q2,  d20, d1[\i - 4]
+        vmlal.s16       q3,  d21, d1[\i - 4]
+.endif
+.endr
+        vrshl.s32       q2,  q2,  q14 // -(6-intermediate_bits)
+        vrshl.s32       q3,  q3,  q14 // -(6-intermediate_bits)
+        vmovn.i32       d8,  q2
+        vmovn.i32       d9,  q3
+
+        bl              L(\type\()_8tap_filter_8)
+        vmov            q5,  q11
+        vmov            q6,  q12
+        bl              L(\type\()_8tap_filter_8)
+        vmov            q7,  q11
+        vmov            q8,  q12
+        bl              L(\type\()_8tap_filter_8)
+        vmov            q9,  q11
+        vmov            q10, q12
+
+88:
+        bl              L(\type\()_8tap_filter_8)
+        vmull.s16       q2,  d8,  d2[0]
+        vmull.s16       q3,  d9,  d2[0]
+        vmull.s16       q13, d10, d2[0]
+        vmull.s16       q14, d11, d2[0]
+.ifc \type, put
+        vdup.32         q4,  r8      // -(6+intermediate_bits)
+.endif
+        vmlal.s16       q2,  d10, d2[1]
+        vmlal.s16       q3,  d11, d2[1]
+        vmlal.s16       q13, d12, d2[1]
+        vmlal.s16       q14, d13, d2[1]
+        vmlal.s16       q2,  d12, d2[2]
+        vmlal.s16       q3,  d13, d2[2]
+        vmlal.s16       q13, d14, d2[2]
+        vmlal.s16       q14, d15, d2[2]
+        vmlal.s16       q2,  d14, d2[3]
+        vmlal.s16       q3,  d15, d2[3]
+        vmlal.s16       q13, d16, d2[3]
+        vmlal.s16       q14, d17, d2[3]
+        vmlal.s16       q2,  d16, d3[0]
+        vmlal.s16       q3,  d17, d3[0]
+        vmlal.s16       q13, d18, d3[0]
+        vmlal.s16       q14, d19, d3[0]
+        vmlal.s16       q2,  d18, d3[1]
+        vmlal.s16       q3,  d19, d3[1]
+        vmlal.s16       q13, d20, d3[1]
+        vmlal.s16       q14, d21, d3[1]
+        vmlal.s16       q2,  d20, d3[2]
+        vmlal.s16       q3,  d21, d3[2]
+        vmlal.s16       q13, d22, d3[2]
+        vmlal.s16       q14, d23, d3[2]
+        vmlal.s16       q2,  d22, d3[3]
+        vmlal.s16       q3,  d23, d3[3]
+        vmlal.s16       q13, d24, d3[3]
+        vmlal.s16       q14, d25, d3[3]
+.ifc \type, put
+        vrshl.s32       q2,  q2,  q4 // -(6+intermediate_bits)
+        vrshl.s32       q3,  q3,  q4 // -(6+intermediate_bits)
+        vrshl.s32       q13, q13, q4 // -(6+intermediate_bits)
+        vrshl.s32       q14, q14, q4 // -(6+intermediate_bits)
+        vqmovun.s32     d4,  q2
+        vqmovun.s32     d5,  q3
+        vqmovun.s32     d6,  q13
+        vqmovun.s32     d7,  q14
+        vmin.u16        q2,  q2,  q15
+        vmin.u16        q3,  q3,  q15
+.else
+        vmov.i16        q5,  #PREP_BIAS
+        vrshrn.i32      d4,  q2,  #6
+        vrshrn.i32      d5,  q3,  #6
+        vrshrn.i32      d6,  q13, #6
+        vrshrn.i32      d7,  q14, #6
+        vsub.i16        q2,  q2,  q5 // PREP_BIAS
+        vsub.i16        q3,  q3,  q5 // PREP_BIAS
+.endif
+        subs            \h,  \h,  #2
+        vst1.16         {q2}, [\dst, :128], \d_strd
+        vst1.16         {q3}, [\ds2, :128], \d_strd
+        ble             9f
+        vmov            q4,  q6
+        vmov            q5,  q7
+        vmov            q6,  q8
+        vmov            q7,  q9
+        vmov            q8,  q10
+        vmov            q9,  q11
+        vmov            q10, q12
+        b               88b
+9:
+        subs            \w,  \w,  #8
+        ble             0f
+        asr             \s_strd,  \s_strd,  #1
+        asr             \d_strd,  \d_strd,  #1
+        mls             \src,  \s_strd,  \my,  \src
+        mls             \dst,  \d_strd,  \my,  \dst
+        sub             \src,  \src,  \s_strd,  lsl #3
+        mov             \h,  \my
+        add             \src,  \src,  #16
+        add             \dst,  \dst,  #16
+        b               168b
+0:
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+
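+// Horizontally filter two rows (one from \sr2, then one from \src) with the
+// 8-tap filter in d0/d1, rounding by -(6-intermediate_bits) and narrowing
+// to 16 bits; the results are returned in q11 (first row) and q12 (second row).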
+L(\type\()_8tap_filter_8):
+        vld1.16         {q13, q14}, [\sr2], \s_strd
+        vmull.s16       q2,  d26, d0[0]
+        vmull.s16       q3,  d27, d0[0]
+.irpc i, 1234567
+        vext.8          q12, q13, q14, #(2*\i)
+.if \i < 4
+        vmlal.s16       q2,  d24, d0[\i]
+        vmlal.s16       q3,  d25, d0[\i]
+.else
+        vmlal.s16       q2,  d24, d1[\i - 4]
+        vmlal.s16       q3,  d25, d1[\i - 4]
+.endif
+.endr
+        vdup.32         q12, r12      // -(6-intermediate_bits)
+        vld1.16         {q13, q14}, [\src], \s_strd
+        vrshl.s32       q2,  q2,  q12 // -(6-intermediate_bits)
+        vrshl.s32       q3,  q3,  q12 // -(6-intermediate_bits)
+        vmovn.i32       d4,  q2
+        vmovn.i32       d5,  q3
+
+        vmull.s16       q3,  d26, d0[0]
+        vmull.s16       q11, d27, d0[0]
+.irpc i, 1234567
+        vext.8          q12, q13, q14, #(2*\i)
+.if \i < 4
+        vmlal.s16       q3,  d24, d0[\i]
+        vmlal.s16       q11, d25, d0[\i]
+.else
+        vmlal.s16       q3,  d24, d1[\i - 4]
+        vmlal.s16       q11, d25, d1[\i - 4]
+.endif
+.endr
+        vdup.32         q13, r12      // -(6-intermediate_bits)
+        vrshl.s32       q3,  q3,  q13 // -(6-intermediate_bits)
+        vrshl.s32       q11, q11, q13 // -(6-intermediate_bits)
+
+        vmovn.i32       d24, q3
+        vmovn.i32       d25, q11
+        vmov            q11, q2
+        bx              lr
+endfunc
+
+function \type\()_bilin_16bpc_neon, export=1
+        push            {r4-r11,lr}
+        ldrd            r4,  r5,  [sp, #36]
+        ldrd            r6,  r7,  [sp, #44]
+.ifc \bdmax, r8
+        ldr             r8,  [sp, #52]
+.endif
+        vdup.16         q1,  \mx
+        vdup.16         q3,  \my
+        rsb             r9,  \mx, #16
+        rsb             r10, \my, #16
+        vdup.16         q0,  r9
+        vdup.16         q2,  r10
+.ifc \type, prep
+        lsl             \d_strd, \w, #1
+.endif
+        clz             \bdmax,   \bdmax       // bitdepth_max
+        clz             r9,  \w
+        sub             \bdmax,   \bdmax,  #18 // intermediate_bits = clz(bitdepth_max) - 18
+        cmp             \mx, #0
+        sub             r9,  r9,  #24
+        rsb             r11, \bdmax, #4        // 4 - intermediate_bits
+        add             r12, \bdmax, #4        // 4 + intermediate_bits
+        bne             L(\type\()_bilin_h)
+        cmp             \my, #0
+        bne             L(\type\()_bilin_v)
+        b               \type\()_neon
+
+L(\type\()_bilin_h):
+        cmp             \my, #0
+        bne             L(\type\()_bilin_hv)
+
+        adr             r10, L(\type\()_bilin_h_tbl)
+        vdup.16         q15, r11               // 4 - intermediate_bits
+        ldr             r9,  [r10, r9, lsl #2]
+        vneg.s16        q15, q15               // -(4-intermediate_bits)
+.ifc \type, put
+        vdup.16         q14, \bdmax            // intermediate_bits
+.else
+        vmov.i16        q14, #PREP_BIAS
+.endif
+        add             r10, r10, r9
+.ifc \type, put
+        vneg.s16        q14, q14               // -intermediate_bits
+.endif
+        bx              r10
+
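+// Jump table indexed by clz(w)-24 (r9): w == 128 selects the first entry
+// (1280f), w == 2 the last (20f).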
+        .align 2
+L(\type\()_bilin_h_tbl):
+        .word 1280f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+        .word 640f  - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+        .word 320f  - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+        .word 160f  - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+        .word 80f   - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+        .word 40f   - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+        .word 20f   - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+
+20:     // 2xN h
+.ifc \type, put
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \d_strd,  \d_strd,  #1
+        lsl             \s_strd,  \s_strd,  #1
+2:
+        vld1.16         {d16}, [\src], \s_strd
+        vld1.16         {d18}, [\sr2], \s_strd
+        vext.8          d17, d16, d16, #2
+        vext.8          d19, d18, d18, #2
+        vtrn.32         d16, d18
+        vtrn.32         d17, d19
+        subs            \h,  \h,  #2
+        vmul.i16        d16, d16, d0
+        vmla.i16        d16, d17, d2
+        vrshl.u16       d16, d16, d30
+        vrshl.u16       d16, d16, d28
+        vst1.32         {d16[0]}, [\dst, :32], \d_strd
+        vst1.32         {d16[1]}, [\ds2, :32], \d_strd
+        bgt             2b
+        pop             {r4-r11,pc}
+.endif
+
+40:     // 4xN h
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \d_strd,  \d_strd,  #1
+        lsl             \s_strd,  \s_strd,  #1
+4:
+        vld1.16         {q8},  [\src], \s_strd
+        vld1.16         {q10}, [\sr2], \s_strd
+        vext.8          q9,  q8,  q8,  #2
+        vext.8          q11, q10, q10, #2
+        vmov            d17, d20
+        vmov            d19, d22
+        subs            \h,  \h,  #2
+        vmul.i16        q8,  q8,  q0
+        vmla.i16        q8,  q9,  q1
+        vrshl.u16       q8,  q8,  q15
+.ifc \type, put
+        vrshl.u16       q8,  q8,  q14
+.else
+        vsub.i16        q8,  q8,  q14
+.endif
+        vst1.16         {d16}, [\dst, :64], \d_strd
+        vst1.16         {d17}, [\ds2, :64], \d_strd
+        bgt             4b
+        pop             {r4-r11,pc}
+
+80:     // 8xN h
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \d_strd,  \d_strd,  #1
+        lsl             \s_strd,  \s_strd,  #1
+8:
+        vld1.16         {d16, d17, d18}, [\src], \s_strd
+        vld1.16         {d20, d21, d22}, [\sr2], \s_strd
+        vext.8          q9,  q8,  q9,  #2
+        vext.8          q11, q10, q11, #2
+        subs            \h,  \h,  #2
+        vmul.i16        q8,  q8,  q0
+        vmla.i16        q8,  q9,  q1
+        vmul.i16        q10, q10, q0
+        vmla.i16        q10, q11, q1
+        vrshl.u16       q8,  q8,  q15
+        vrshl.u16       q10, q10, q15
+.ifc \type, put
+        vrshl.u16       q8,  q8,  q14
+        vrshl.u16       q10, q10, q14
+.else
+        vsub.i16        q8,  q8,  q14
+        vsub.i16        q10, q10, q14
+.endif
+        vst1.16         {q8},  [\dst, :128], \d_strd
+        vst1.16         {q10}, [\ds2, :128], \d_strd
+        bgt             8b
+        pop             {r4-r11,pc}
+160:
+320:
+640:
+1280:   // 16xN, 32xN, ... h
+        vpush           {q4-q7}
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \s_strd,  \s_strd,  #1
+
+        sub             \s_strd,  \s_strd,  \w, lsl #1
+        sub             \s_strd,  \s_strd,  #16
+.ifc \type, put
+        lsl             \d_strd,  \d_strd,  #1
+        sub             \d_strd,  \d_strd,  \w, lsl #1
+.endif
+161:
+        vld1.16         {q4}, [\src]!
+        vld1.16         {q9}, [\sr2]!
+        mov             \mx, \w
+
+16:
+        vld1.16         {q5,  q6},  [\src]!
+        vld1.16         {q10, q11}, [\sr2]!
+        vext.8          q7,  q4,  q5,  #2
+        vext.8          q8,  q5,  q6,  #2
+        vext.8          q12, q9,  q10, #2
+        vext.8          q13, q10, q11, #2
+        vmul.i16        q4,  q4,  q0
+        vmla.i16        q4,  q7,  q1
+        vmul.i16        q5,  q5,  q0
+        vmla.i16        q5,  q8,  q1
+        vmul.i16        q9,  q9,  q0
+        vmla.i16        q9,  q12, q1
+        vmul.i16        q10, q10, q0
+        vmla.i16        q10, q13, q1
+        vrshl.u16       q4,  q4,  q15
+        vrshl.u16       q5,  q5,  q15
+        vrshl.u16       q9,  q9,  q15
+        vrshl.u16       q10, q10, q15
+        subs            \mx, \mx, #16
+.ifc \type, put
+        vrshl.u16       q4,  q4,  q14
+        vrshl.u16       q5,  q5,  q14
+        vrshl.u16       q9,  q9,  q14
+        vrshl.u16       q10, q10, q14
+.else
+        vsub.i16        q4,  q4,  q14
+        vsub.i16        q5,  q5,  q14
+        vsub.i16        q9,  q9,  q14
+        vsub.i16        q10, q10, q14
+.endif
+        vst1.16         {q4, q5},  [\dst, :128]!
+        vst1.16         {q9, q10}, [\ds2, :128]!
+        ble             9f
+
+        vmov            q4,  q6
+        vmov            q9,  q11
+        b               16b
+
+9:
+        add             \dst,  \dst,  \d_strd
+        add             \ds2,  \ds2,  \d_strd
+        add             \src,  \src,  \s_strd
+        add             \sr2,  \sr2,  \s_strd
+
+        subs            \h,  \h,  #2
+        bgt             161b
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+
+
+L(\type\()_bilin_v):
+        cmp             \h,  #4
+        adr             r10, L(\type\()_bilin_v_tbl)
+.ifc \type, prep
+        vdup.16         q15, r11      // 4 - intermediate_bits
+.endif
+        ldr             r9,  [r10, r9, lsl #2]
+.ifc \type, prep
+        vmov.i16        q14, #PREP_BIAS
+        vneg.s16        q15, q15      // -(4-intermediate_bits)
+.endif
+        add             r10, r10, r9
+        bx              r10
+
+        .align 2
+L(\type\()_bilin_v_tbl):
+        .word 1280f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+        .word 640f  - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+        .word 320f  - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+        .word 160f  - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+        .word 80f   - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+        .word 40f   - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+        .word 20f   - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+
+20:     // 2xN v
+.ifc \type, put
+        cmp             \h,  #2
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \s_strd,  \s_strd,  #1
+        lsl             \d_strd,  \d_strd,  #1
+
+        // 2x2 v
+        vld1.32         {d16[]}, [\src], \s_strd
+        bgt             24f
+        vld1.32         {d17[]}, [\sr2], \s_strd
+        vld1.32         {d18[]}, [\src], \s_strd
+        vext.8          d16, d16, d17, #4
+        vext.8          d17, d17, d18, #4
+        vmul.i16        d16, d16, d4
+        vmla.i16        d16, d17, d6
+        vrshr.u16       d16, d16, #4
+        vst1.32         {d16[0]}, [\dst, :32]
+        vst1.32         {d16[1]}, [\ds2, :32]
+        pop             {r4-r11,pc}
+24:     // 2x4, 2x8, ... v
+        vld1.32         {d17[]}, [\sr2], \s_strd
+        vld1.32         {d18[]}, [\src], \s_strd
+        vld1.32         {d19[]}, [\sr2], \s_strd
+        vld1.32         {d20[]}, [\src], \s_strd
+        vext.8          d16, d16, d17, #4
+        vext.8          d17, d17, d18, #4
+        vext.8          d18, d18, d19, #4
+        vext.8          d19, d19, d20, #4
+        vswp            d17, d18
+        vmul.i16        q8,  q8,  q2
+        vmla.i16        q8,  q9,  q3
+        subs            \h,  \h,  #4
+        vrshr.u16       q8,  q8,  #4
+        vst1.32         {d16[0]}, [\dst, :32], \d_strd
+        vst1.32         {d16[1]}, [\ds2, :32], \d_strd
+        vst1.32         {d17[0]}, [\dst, :32], \d_strd
+        vst1.32         {d17[1]}, [\ds2, :32], \d_strd
+        ble             0f
+        vmov            d16, d20
+        b               24b
+0:
+        pop             {r4-r11,pc}
+.endif
+
+40:     // 4xN v
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \s_strd,  \s_strd,  #1
+        lsl             \d_strd,  \d_strd,  #1
+        vld1.16         {d16}, [\src], \s_strd
+4:
+        vld1.16         {d17}, [\sr2], \s_strd
+        vld1.16         {d19}, [\src], \s_strd
+        vmov            d18, d17
+        vmul.i16        q8,  q8,  q2
+        vmla.i16        q8,  q9,  q3
+        subs            \h,  \h,  #2
+.ifc \type, put
+        vrshr.u16       q8,  q8,  #4
+.else
+        vrshl.u16       q8,  q8,  q15
+        vsub.i16        q8,  q8,  q14
+.endif
+        vst1.16         {d16}, [\dst, :64], \d_strd
+        vst1.16         {d17}, [\ds2, :64], \d_strd
+        ble             0f
+        vmov            d16, d19
+        b               4b
+0:
+        pop             {r4-r11,pc}
+
+80:     // 8xN v
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \s_strd,  \s_strd,  #1
+        lsl             \d_strd,  \d_strd,  #1
+        vld1.16         {q8},  [\src], \s_strd
+8:
+        vld1.16         {q9},  [\sr2], \s_strd
+        vld1.16         {q10}, [\src], \s_strd
+        vmul.i16        q8,  q8,  q2
+        vmla.i16        q8,  q9,  q3
+        vmul.i16        q9,  q9,  q2
+        vmla.i16        q9,  q10, q3
+        subs            \h,  \h,  #2
+.ifc \type, put
+        vrshr.u16       q8,  q8,  #4
+        vrshr.u16       q9,  q9,  #4
+.else
+        vrshl.u16       q8,  q8,  q15
+        vrshl.u16       q9,  q9,  q15
+        vsub.i16        q8,  q8,  q14
+        vsub.i16        q9,  q9,  q14
+.endif
+        vst1.16         {q8}, [\dst, :128], \d_strd
+        vst1.16         {q9}, [\ds2, :128], \d_strd
+        ble             0f
+        vmov            q8,  q10
+        b               8b
+0:
+        pop             {r4-r11,pc}
+
+160:    // 16xN, 32xN, ...
+320:
+640:
+1280:
+        mov             \my, \h
+1:
+        add             \ds2, \dst, \d_strd
+        add             \sr2, \src, \s_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+
+        vld1.16         {q8,  q9},  [\src], \s_strd
+2:
+        vld1.16         {q10, q11}, [\sr2], \s_strd
+        vld1.16         {q12, q13}, [\src], \s_strd
+        vmul.i16        q8,  q8,  q2
+        vmla.i16        q8,  q10, q3
+        vmul.i16        q9,  q9,  q2
+        vmla.i16        q9,  q11, q3
+        vmul.i16        q10, q10, q2
+        vmla.i16        q10, q12, q3
+        vmul.i16        q11, q11, q2
+        vmla.i16        q11, q13, q3
+        subs            \h,  \h,  #2
+.ifc \type, put
+        vrshr.u16       q8,  q8,  #4
+        vrshr.u16       q9,  q9,  #4
+        vrshr.u16       q10, q10, #4
+        vrshr.u16       q11, q11, #4
+.else
+        vrshl.u16       q8,  q8,  q15
+        vrshl.u16       q9,  q9,  q15
+        vrshl.u16       q10, q10, q15
+        vrshl.u16       q11, q11, q15
+        vsub.i16        q8,  q8,  q14
+        vsub.i16        q9,  q9,  q14
+        vsub.i16        q10, q10, q14
+        vsub.i16        q11, q11, q14
+.endif
+        vst1.16         {q8,  q9},  [\dst, :128], \d_strd
+        vst1.16         {q10, q11}, [\ds2, :128], \d_strd
+        ble             9f
+        vmov            q8,  q12
+        vmov            q9,  q13
+        b               2b
+9:
+        subs            \w,  \w,  #16
+        ble             0f
+        asr             \s_strd, \s_strd, #1
+        asr             \d_strd, \d_strd, #1
+        mls             \src, \s_strd, \my, \src
+        mls             \dst, \d_strd, \my, \dst
+        sub             \src, \src, \s_strd, lsl #1
+        mov             \h,  \my
+        add             \src, \src, #32
+        add             \dst, \dst, #32
+        b               1b
+0:
+        pop             {r4-r11,pc}
+
+L(\type\()_bilin_hv):
+        adr             r10, L(\type\()_bilin_hv_tbl)
+        vdup.16         q15, r11          // 4 - intermediate_bits
+        ldr             r9,  [r10, r9, lsl #2]
+        vneg.s16        q15, q15          // -(4-intermediate_bits)
+.ifc \type, put
+        vdup.32         q14, r12          // 4 + intermediate_bits
+.else
+        vmov.i16        q14, #PREP_BIAS
+.endif
+        add             r10, r10, r9
+.ifc \type, put
+        vneg.s32        q14, q14          // -(4+intermediate_bits)
+.endif
+        bx              r10
+
+        .align 2
+L(\type\()_bilin_hv_tbl):
+        .word 1280f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+        .word 640f  - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+        .word 320f  - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+        .word 160f  - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+        .word 80f   - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+        .word 40f   - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+        .word 20f   - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+
+20:     // 2xN hv
+.ifc \type, put
+        add             \sr2, \src, \s_strd
+        add             \ds2, \dst, \d_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+
+        vld1.16         {d20}, [\src], \s_strd
+        vext.8          d21, d20, d20, #2
+        vmul.i16        d16, d20, d0
+        vmla.i16        d16, d21, d2
+        vrshl.u16       d16, d16, d30
+        vext.8          d16, d16, d16, #4
+
+2:
+        vld1.16         {d20}, [\sr2], \s_strd
+        vld1.16         {d22}, [\src], \s_strd
+        vext.8          d21, d20, d20, #2
+        vext.8          d23, d22, d22, #2
+        vtrn.32         d20, d22
+        vtrn.32         d21, d23
+        vmul.i16        d18, d20, d0
+        vmla.i16        d18, d21, d2
+        vrshl.u16       d18, d18, d30
+
+        vext.8          d16, d16, d18, #4
+
+        vmull.u16       q8,  d16, d4
+        vmlal.u16       q8,  d18, d6
+        vrshl.u32       q8,  q8,  q14
+        vmovn.i32       d16, q8
+        subs            \h,  \h,  #2
+        vst1.32         {d16[0]}, [\dst, :32], \d_strd
+        vst1.32         {d16[1]}, [\ds2, :32], \d_strd
+        ble             0f
+        vmov            d16, d18
+        b               2b
+0:
+        pop             {r4-r11,pc}
+.endif
+
+40:     // 4xN hv
+        add             \sr2, \src, \s_strd
+        add             \ds2, \dst, \d_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+
+        vld1.16         {q10}, [\src], \s_strd
+        vext.8          d21, d20, d21, #2
+        vmul.i16        d16, d20, d0
+        vmla.i16        d16, d21, d2
+        vrshl.u16       d16, d16, d30
+
+4:
+        vld1.16         {q10}, [\sr2], \s_strd
+        vld1.16         {q11}, [\src], \s_strd
+        vext.8          d21, d20, d21, #2
+        vext.8          d23, d22, d23, #2
+        vswp            d21, d22
+        vmul.i16        q9,  q10, q0
+        vmla.i16        q9,  q11, q1
+        vrshl.u16       q9,  q9,  q15
+
+        vmull.u16       q10, d16, d4
+        vmlal.u16       q10, d18, d6
+        vmull.u16       q11, d18, d4
+        vmlal.u16       q11, d19, d6
+.ifc \type, put
+        vrshl.u32       q10, q10, q14
+        vrshl.u32       q11, q11, q14
+        vmovn.i32       d20, q10
+        vmovn.i32       d21, q11
+.else
+        vrshrn.i32      d20, q10, #4
+        vrshrn.i32      d21, q11, #4
+        vsub.i16        q10, q10, q14
+.endif
+        subs            \h,  \h,  #2
+        vst1.16         {d20}, [\dst, :64], \d_strd
+        vst1.16         {d21}, [\ds2, :64], \d_strd
+        ble             0f
+        vmov            d16, d19
+        b               4b
+0:
+        pop             {r4-r11,pc}
+
+80:     // 8xN, 16xN, ... hv
+160:
+320:
+640:
+1280:
+        mov             \my, \h
+
+1:
+        add             \sr2, \src, \s_strd
+        add             \ds2, \dst, \d_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+
+        vld1.16         {d20, d21, d22}, [\src], \s_strd
+        vext.8          q11, q10, q11, #2
+        vmul.i16        q8,  q10, q0
+        vmla.i16        q8,  q11, q1
+        vrshl.u16       q8,  q8,  q15
+
+2:
+        vld1.16         {d20, d21, d22}, [\sr2], \s_strd
+        vld1.16         {d24, d25, d26}, [\src], \s_strd
+        vext.8          q11, q10, q11, #2
+        vext.8          q13, q12, q13, #2
+        vmul.i16        q9,  q10, q0
+        vmla.i16        q9,  q11, q1
+        vmul.i16        q10, q12, q0
+        vmla.i16        q10, q13, q1
+        vrshl.u16       q9,  q9,  q15
+        vrshl.u16       q10, q10, q15
+
+        vmull.u16       q11, d16, d4
+        vmlal.u16       q11, d18, d6
+        vmull.u16       q12, d17, d4
+        vmlal.u16       q12, d19, d6
+        vmull.u16       q8,  d18, d4
+        vmlal.u16       q8,  d20, d6
+        vmull.u16       q9,  d19, d4
+        vmlal.u16       q9,  d21, d6
+.ifc \type, put
+        vrshl.u32       q11, q11, q14
+        vrshl.u32       q12, q12, q14
+        vrshl.u32       q8,  q8,  q14
+        vrshl.u32       q9,  q9,  q14
+        vmovn.i32       d22, q11
+        vmovn.i32       d23, q12
+        vmovn.i32       d16, q8
+        vmovn.i32       d17, q9
+.else
+        vrshrn.i32      d22, q11, #4
+        vrshrn.i32      d23, q12, #4
+        vrshrn.i32      d16, q8,  #4
+        vrshrn.i32      d17, q9,  #4
+        vsub.i16        q11, q11, q14
+        vsub.i16        q8,  q8,  q14
+.endif
+        subs            \h,  \h,  #2
+        vst1.16         {q11}, [\dst, :128], \d_strd
+        vst1.16         {q8},  [\ds2, :128], \d_strd
+        ble             9f
+        vmov            q8,  q10
+        b               2b
+9:
+        subs            \w,  \w,  #8
+        ble             0f
+        asr             \s_strd,  \s_strd,  #1
+        asr             \d_strd,  \d_strd,  #1
+        mls             \src,  \s_strd,  \my,  \src
+        mls             \dst,  \d_strd,  \my,  \dst
+        sub             \src,  \src,  \s_strd,  lsl #1
+        mov             \h,  \my
+        add             \src,  \src,  #16
+        add             \dst,  \dst,  #16
+        b               1b
+0:
+        pop             {r4-r11,pc}
+endfunc
+.endm
+
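+// Instantiate the put and prep variants of the 8-tap and bilinear MC
+// functions; put writes pixels, while prep writes 16-bit intermediates with
+// PREP_BIAS subtracted and derives its output stride from the block width.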
+filter_fn put,  r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10
+filter_fn prep, r0, r8, r1, r2, r3, r4, r5, r6, r7, r9, r10
+
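+// Helpers for the warp filter: load_filter_ptr computes the address of the
+// 8-tap filter for the current position, r12 = r11 + (\src >> 10)*8 (r11
+// points into the mc_warp_filter table); load_filter_coef loads those 8
+// coefficients and advances \src by \inc.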
+.macro load_filter_ptr src
+        asr             r12, \src, #10
+        add             r12, r11, r12, lsl #3
+.endm
+
+.macro load_filter_coef dst, src, inc
+        vld1.8          {\dst}, [r12, :64]
+        add             \src, \src, \inc
+.endm
+
+.macro load_filter_row dst, src, inc
+        load_filter_ptr \src
+        load_filter_coef \dst, \src, \inc
+.endm
+
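+// Horizontally filter one row of 8 pixels for the warp filter; each pixel
+// uses its own 8-tap filter selected from the position in r5, which steps
+// by abcd[0] (r7) per pixel and by abcd[1] (r8) per row. The 32-bit sums,
+// rounded by -(7 - intermediate_bits), are returned in q4 (pixels 0-3) and
+// q5 (pixels 4-7).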
+function warp_filter_horz_neon
+        load_filter_ptr r5                  // filter 0
+        vld1.16         {q6,q7}, [r2], r3
+
+        load_filter_coef d0, r5,  r7        // filter 0
+        load_filter_row d2,  r5,  r7        // filter 1
+        vmovl.s8        q0,  d0             // filter 0
+        vext.8          q3,  q6,  q7,  #2*1 // filter 1 pixels
+        vmovl.s8        q1,  d2             // filter 1
+
+        vmull.s16       q4,  d12, d0        // filter 0 output (0-3)
+        vmull.s16       q5,  d13, d1        // filter 0 output (4-7)
+
+        load_filter_ptr r5                  // filter 2
+
+        vmull.s16       q2,  d6,  d2        // filter 1 output (0-3)
+        vmull.s16       q3,  d7,  d3        // filter 1 output (4-7)
+
+        load_filter_coef d0, r5,  r7        // filter 2
+
+        vpadd.i32       d8,  d8,  d9        // half pixel 0 (2x32)
+        vpadd.i32       d9,  d10, d11       // half pixel 0 (2x32)
+
+        load_filter_ptr r5                  // filter 3
+
+        vpadd.i32       d4,  d4,  d5        // half pixel 1 (2x32)
+        vpadd.i32       d5,  d6,  d7        // half pixel 1 (2x32)
+
+        vmovl.s8        q0,  d0             // filter 2
+        vext.8          q3,  q6,  q7,  #2*2 // filter 2 pixels
+
+        vpadd.i32       d8,  d8,  d9        // pixel 0 (2x32)
+        vpadd.i32       d9,  d4,  d5        // pixel 1 (2x32)
+
+        load_filter_coef d2, r5,  r7        // filter 3
+
+        vmull.s16       q2,  d6,  d0        // filter 2 output (0-3)
+        vmull.s16       q3,  d7,  d1        // filter 2 output (4-7)
+
+        load_filter_ptr r5                  // filter 4
+
+        vpadd.i32       d8,  d8,  d9        // pixel 0,1
+
+        vpadd.i32       d9,  d4,  d5        // half pixel 2 (2x32)
+        vpadd.i32       d10, d6,  d7        // half pixel 2 (2x32)
+
+        vmovl.s8        q1,  d2             // filter 3
+        vext.8          q3,  q6,  q7,  #2*3 // filter 3 pixels
+
+        load_filter_coef d0, r5,  r7        // filter 4
+
+        vpadd.i32       d9,  d9,  d10       // pixel 2 (2x32)
+
+        vmull.s16       q2,  d6,  d2        // filter 3 output (0-3)
+        vmull.s16       q3,  d7,  d3        // filter 3 output (4-7)
+
+        vmovl.s8        q0,  d0             // filter 4
+        load_filter_ptr r5                  // filter 5
+
+        vpadd.i32       d10, d4,  d5        // half pixel 3 (2x32)
+        vpadd.i32       d11, d6,  d7        // half pixel 3 (2x32)
+
+        vext.8          q3,  q6,  q7,  #2*4 // filter 4 pixels
+        load_filter_coef d2, r5,  r7        // filter 5
+
+        vpadd.i32       d10, d10, d11       // pixel 3 (2x32)
+
+        vpadd.i32       d9,  d9,  d10       // pixel 2,3
+
+        vmull.s16       q2,  d6,  d0        // filter 4 output (0-3)
+        vmull.s16       q3,  d7,  d1        // filter 4 output (4-7)
+
+        vmovl.s8        q1,  d2             // filter 5
+        load_filter_ptr r5                  // filter 6
+
+        vpadd.i32       d10, d4,  d5        // half pixel 4 (2x32)
+        vpadd.i32       d11, d6,  d7        // half pixel 4 (2x32)
+
+        vext.8          q3,  q6,  q7,  #2*5 // filter 5 pixels
+        load_filter_coef d0, r5,  r7        // filter 6
+
+        vpadd.i32       d10, d10, d11       // pixel 4 (2x32)
+
+        vmull.s16       q2,  d6,  d2        // filter 5 output (0-3)
+        vmull.s16       q3,  d7,  d3        // filter 5 output (4-7)
+
+        vmovl.s8        q0,  d0             // filter 6
+        load_filter_ptr r5                  // filter 7
+
+        vpadd.i32       d4,  d4,  d5        // half pixel 5 (2x32)
+        vpadd.i32       d5,  d6,  d7        // half pixel 5 (2x32)
+
+        vext.8          q3,  q6,  q7,  #2*6 // filter 6 pixels
+        load_filter_coef d2, r5,  r7        // filter 7
+
+        vpadd.i32       d11, d4,  d5        // pixel 5 (2x32)
+
+        vmull.s16       q2,  d6,  d0        // filter 6 output (0-3)
+        vmull.s16       q3,  d7,  d1        // filter 6 output (4-7)
+
+        vmovl.s8        q1,  d2             // filter 7
+
+        vpadd.i32       d10, d10, d11       // pixel 4,5
+
+        vpadd.i32       d4,  d4,  d5        // half pixel 6 (2x32)
+        vpadd.i32       d5,  d6,  d7        // half pixel 6 (2x32)
+
+        vext.8          q3,  q6,  q7,  #2*7 // filter 7 pixels
+
+        vpadd.i32       d11, d4,  d5        // pixel 6 (2x32)
+
+        vmull.s16       q2,  d6,  d2        // filter 7 output (0-3)
+        vmull.s16       q3,  d7,  d3        // filter 7 output (4-7)
+
+        vld1.32         {d14[],d15[]}, [sp] // -(7 - intermediate_bits)
+
+        vpadd.i32       d4,  d4,  d5        // half pixel 7 (2x32)
+        vpadd.i32       d5,  d6,  d7        // half pixel 7 (2x32)
+
+        sub             r5,  r5,  r7, lsl #3
+
+        vpadd.i32       d4,  d4,  d5        // pixel 7 (2x32)
+
+        add             r5,  r5,  r8
+
+        vpadd.i32       d11, d11, d4        // pixel 6,7
+
+        vrshl.s32       q4,  q4,  q7        // -(7 - intermediate_bits)
+        vrshl.s32       q5,  q5,  q7        // -(7 - intermediate_bits)
+
+        bx              lr
+endfunc
+
+// void dav1d_warp_affine_8x8_16bpc_neon(
+//         pixel *dst, const ptrdiff_t dst_stride,
+//         const pixel *src, const ptrdiff_t src_stride,
+//         const int16_t *const abcd, int mx, int my,
+//         const int bitdepth_max)
+.macro warp t
+function warp_affine_8x8\t\()_16bpc_neon, export=1
+        push            {r4-r11,lr}
+        vpush           {q4-q7}
+        ldrd            r4,  r5,  [sp, #100]
+        ldrd            r6,  r7,  [sp, #108]
+        sub             sp,  sp,  #8
+
+        clz             r7,  r7
+                                      // intermediate_bits = clz(bitdepth_max) - 18
+.ifb \t
+        sub             r8,  r7,  #11 // 7 + intermediate_bits = clz(bitdepth_max) - 18 + 7
+.endif
+        sub             r7,  r7,  #25 // -(7 - intermediate_bits)
+.ifb \t
+        neg             r8,  r8       // -(7 + intermediate_bits)
+.endif
+        str             r7,  [sp]     // spill -(7 - intermediate_bits) on stack
+.ifb \t
+        str             r8,  [sp, #4] // spill -(7 + intermediate_bits) on stack
+.endif
+
+        ldrd            r8,  r9,  [r4]
+        sxth            r7,  r8
+        asr             r8,  r8, #16
+        asr             r4,  r9, #16
+        sxth            r9,  r9
+        mov             r10, #8
+        sub             r2,  r2,  r3, lsl #1
+        sub             r2,  r2,  r3
+        sub             r2,  r2,  #6
+        movrel          r11, X(mc_warp_filter), 64*8
+.ifnb \t
+        lsl             r1,  r1,  #1
+.endif
+        add             r5,  r5,  #512
+        add             r6,  r6,  #512
+
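+        // The vertical 8-tap filter needs 8+7 = 15 horizontally filtered
+        // rows for the 8 output rows: filter 7 rows up front, then one more
+        // per output row in the loop below.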
+        bl              warp_filter_horz_neon
+        vmovn.i32       d16, q4
+        vmovn.i32       d17, q5
+        bl              warp_filter_horz_neon
+        vmovn.i32       d18, q4
+        vmovn.i32       d19, q5
+        bl              warp_filter_horz_neon
+        vmovn.i32       d20, q4
+        vmovn.i32       d21, q5
+        bl              warp_filter_horz_neon
+        vmovn.i32       d22, q4
+        vmovn.i32       d23, q5
+        bl              warp_filter_horz_neon
+        vmovn.i32       d24, q4
+        vmovn.i32       d25, q5
+        bl              warp_filter_horz_neon
+        vmovn.i32       d26, q4
+        vmovn.i32       d27, q5
+        bl              warp_filter_horz_neon
+        vmovn.i32       d28, q4
+        vmovn.i32       d29, q5
+
+1:
+        bl              warp_filter_horz_neon
+        vmovn.i32       d30, q4
+        vmovn.i32       d31, q5
+
+        load_filter_row d8,  r6,  r9
+        load_filter_row d9,  r6,  r9
+        load_filter_row d10, r6,  r9
+        load_filter_row d11, r6,  r9
+        load_filter_row d12, r6,  r9
+        load_filter_row d13, r6,  r9
+        load_filter_row d14, r6,  r9
+        load_filter_row d15, r6,  r9
+        transpose_8x8b  q4,  q5,  q6,  q7,  d8,  d9,  d10, d11, d12, d13, d14, d15
+        vmovl.s8        q1,  d8
+        vmovl.s8        q2,  d9
+        vmovl.s8        q3,  d10
+        vmovl.s8        q4,  d11
+        vmovl.s8        q5,  d12
+        vmovl.s8        q6,  d13
+
+        sub             r6,  r6,  r9, lsl #3
+
+        // This ordering of vmull/vmlal is highly beneficial for
+        // Cortex A8/A9/A53 here, but harmful for Cortex A7.
+        vmull.s16       q0,  d16,  d2
+        vmlal.s16       q0,  d18,  d4
+        vmlal.s16       q0,  d20,  d6
+        vmlal.s16       q0,  d22,  d8
+        vmlal.s16       q0,  d24,  d10
+        vmlal.s16       q0,  d26,  d12
+        vmull.s16       q1,  d17,  d3
+        vmlal.s16       q1,  d19,  d5
+        vmlal.s16       q1,  d21,  d7
+        vmlal.s16       q1,  d23,  d9
+        vmlal.s16       q1,  d25,  d11
+        vmlal.s16       q1,  d27,  d13
+
+        vmovl.s8        q2,  d14
+        vmovl.s8        q3,  d15
+
+        vmlal.s16       q0,  d28,  d4
+        vmlal.s16       q0,  d30,  d6
+        vmlal.s16       q1,  d29,  d5
+        vmlal.s16       q1,  d31,  d7
+
+.ifb \t
+        ldr             lr,  [sp, #4]   // -(7 + intermediate_bits)
+        ldr             r12, [sp, #120] // bitdepth_max
+        vdup.32         q2,  lr         // -(7 + intermediate_bits)
+        vdup.16         q3,  r12        // bitdepth_max
+.endif
+
+        vmov            q8,  q9
+        vmov            q9,  q10
+.ifb \t
+        vrshl.s32       q0,  q0,  q2    // -(7 + intermediate_bits)
+        vrshl.s32       q1,  q1,  q2    // -(7 + intermediate_bits)
+.else
+        vrshrn.s32      d0,  q0,  #7
+        vrshrn.s32      d1,  q1,  #7
+        vmov.i16        q3,  #PREP_BIAS
+.endif
+        vmov            q10, q11
+.ifb \t
+        vqmovun.s32     d0,  q0
+        vqmovun.s32     d1,  q1
+.else
+        vsub.i16        q0,  q0,  q3    // PREP_BIAS
+.endif
+        vmov            q11, q12
+        vmov            q12, q13
+.ifb \t
+        vmin.u16        q0,  q0,  q3    // bitdepth_max
+.endif
+        vmov            q13, q14
+        vmov            q14, q15
+        subs            r10, r10, #1
+        vst1.16         {q0}, [r0, :128], r1
+
+        add             r6,  r6,  r4
+        bgt             1b
+
+        add             sp,  sp,  #8
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+endfunc
+.endm
+
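+// warp_affine_8x8_16bpc_neon writes pixels clamped to bitdepth_max, while
+// warp_affine_8x8t_16bpc_neon writes 16-bit intermediates with PREP_BIAS
+// subtracted (the tmp/compound variant).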
+warp
+warp t
--- a/src/arm/64/cdef_tmpl.S
+++ b/src/arm/64/cdef_tmpl.S
@@ -107,7 +107,7 @@
 .macro filter_func w, bpc, pri, sec, min, suffix
 function cdef_filter\w\suffix\()_\bpc\()bpc_neon
 .if \bpc == 8
-        ldr             w8,  [sp]                   // bitdepth_max
+        ldr             w8,  [sp]                   // edges
         cmp             w8,  #0xf
         b.eq            cdef_filter\w\suffix\()_edged_8bpc_neon
 .endif
--- a/src/arm/64/loopfilter.S
+++ b/src/arm/64/loopfilter.S
@@ -1034,11 +1034,11 @@
         ld1r            {v6.16b}, [x5]            // sharp[1]
         sub             x5,  x5,  #8
         bif             v1.16b,  v0.16b,  v3.16b  // if (!l[0][0]) L = l[offset][0]
+        cmtst           v2.4s,   v1.4s,   v2.4s   // L != 0
         mul             v1.4s,   v1.4s,   v4.4s   // L
 .ifc \type, y
         dup             v15.4s,  w2               // vmask[2]
 .endif
-        cmtst           v2.4s,   v1.4s,   v2.4s   // L != 0
         dup             v14.4s,  w7               // vmask[1]
         mov             x16, v2.d[0]
         mov             x17, v2.d[1]
--- a/src/arm/64/loopfilter16.S
+++ b/src/arm/64/loopfilter16.S
@@ -785,7 +785,7 @@
         orr             w6,  w6,  w7             // vmask[0] |= vmask[1]
 
 1:
-        tst             w6,  #0x0f
+        tst             w6,  #0x03
 .ifc \dir, v
         ld1             {v0.8b}, [x4], #8
         ld1             {v1.8b}, [x3], #8
@@ -808,11 +808,11 @@
         ld1r            {v6.8b}, [x5]             // sharp[1]
         sub             x5,  x5,  #8
         bif             v1.8b,   v0.8b,   v3.8b   // if (!l[0][0]) L = l[offset][0]
+        cmtst           v2.2s,   v1.2s,   v2.2s   // L != 0
         mul             v1.2s,   v1.2s,   v4.2s   // L
 .ifc \type, y
         dup             v15.2s,  w2               // vmask[2]
 .endif
-        cmtst           v2.2s,   v1.2s,   v2.2s   // L != 0
         dup             v14.2s,  w7               // vmask[1]
         mov             x16, v2.d[0]
         cmp             x16, #0
@@ -847,7 +847,7 @@
         ushl            v10.8h,  v10.8h,  v31.8h
 
 .ifc \type, y
-        tst             w2,  #0x0f
+        tst             w2,  #0x03
         b.eq            2f
         // wd16
         bl              lpf_\dir\()_16_8_neon
@@ -854,7 +854,7 @@
         b               8f
 2:
 .endif
-        tst             w7,  #0x0f
+        tst             w7,  #0x03
         b.eq            3f
 .ifc \type, y
         // wd8
--- a/src/arm/64/looprestoration.S
+++ b/src/arm/64/looprestoration.S
@@ -30,7 +30,7 @@
 
 // void dav1d_wiener_filter_h_8bpc_neon(int16_t *dst, const pixel (*left)[4],
 //                                      const pixel *src, ptrdiff_t stride,
-//                                      const int16_t fh[7], const intptr_t w,
+//                                      const int16_t fh[8], intptr_t w,
 //                                      int h, enum LrEdgeFlags edges);
 function wiener_filter_h_8bpc_neon, export=1
         mov             w8,  w5
@@ -308,13 +308,11 @@
 
 // void dav1d_wiener_filter_v_8bpc_neon(pixel *dst, ptrdiff_t stride,
 //                                      const int16_t *mid, int w, int h,
-//                                      const int16_t fv[7], enum LrEdgeFlags edges,
+//                                      const int16_t fv[8], enum LrEdgeFlags edges,
 //                                      ptrdiff_t mid_stride);
 function wiener_filter_v_8bpc_neon, export=1
         mov             w8,  w4
         ld1             {v0.8h},  [x5]
-        movi            v1.8h, #128
-        add             v1.8h,  v1.8h,  v0.8h
 
         // Calculate the number of rows to move back when looping vertically
         mov             w11, w4
@@ -359,7 +357,7 @@
         smull           v2.4s,  v16.4h,  v0.h[0]
         smlal           v2.4s,  v17.4h,  v0.h[1]
         smlal           v2.4s,  v18.4h,  v0.h[2]
-        smlal           v2.4s,  v19.4h,  v1.h[3]
+        smlal           v2.4s,  v19.4h,  v0.h[3]
         smlal           v2.4s,  v20.4h,  v0.h[4]
         smlal           v2.4s,  v21.4h,  v0.h[5]
         smlal           v2.4s,  v22.4h,  v0.h[6]
@@ -366,7 +364,7 @@
         smull2          v3.4s,  v16.8h,  v0.h[0]
         smlal2          v3.4s,  v17.8h,  v0.h[1]
         smlal2          v3.4s,  v18.8h,  v0.h[2]
-        smlal2          v3.4s,  v19.8h,  v1.h[3]
+        smlal2          v3.4s,  v19.8h,  v0.h[3]
         smlal2          v3.4s,  v20.8h,  v0.h[4]
         smlal2          v3.4s,  v21.8h,  v0.h[5]
         smlal2          v3.4s,  v22.8h,  v0.h[6]
--- a/src/arm/64/looprestoration16.S
+++ b/src/arm/64/looprestoration16.S
@@ -126,7 +126,7 @@
 
         tst             w7,  #2 // LR_HAVE_RIGHT
         b.ne            4f
-        // If we'll need to pad the right edge, load that byte to pad with
+        // If we'll need to pad the right edge, load that pixel to pad with
         // here since we can find it pretty easily from here.
         sub             w9,  w5,  #14
         ldr             h27, [x2,  w9, sxtw #1]
@@ -143,12 +143,6 @@
         b               6f
 
 4:      // Loop horizontally
-.macro ushll_sz d0, d1, src, shift, wd
-        ushll           \d0\().4s,  \src\().4h,  \shift
-.ifc \wd, .8h
-        ushll2          \d1\().4s,  \src\().8h,  \shift
-.endif
-.endm
 .macro add_sz d0, d1, s0, s1, c, wd
         add             \d0\().4s,  \s0\().4s,   \c\().4s
 .ifc \wd, .8h
@@ -178,8 +172,7 @@
         ext             v19.16b, v2.16b,  v3.16b, #8
         ext             v20.16b, v2.16b,  v3.16b, #10
         ext             v21.16b, v2.16b,  v3.16b, #12
-        ushll_sz        v6,  v7,  v18, #7, \wd
-        smlal           v6.4s,   v2.4h,   v0.h[0]
+        smull           v6.4s,   v2.4h,   v0.h[0]
         smlal           v6.4s,   v16.4h,  v0.h[1]
         smlal           v6.4s,   v17.4h,  v0.h[2]
         smlal           v6.4s,   v18.4h,  v0.h[3]
@@ -187,7 +180,7 @@
         smlal           v6.4s,   v20.4h,  v0.h[5]
         smlal           v6.4s,   v21.4h,  v0.h[6]
 .ifc \wd, .8h
-        smlal2          v7.4s,   v2.8h,   v0.h[0]
+        smull2          v7.4s,   v2.8h,   v0.h[0]
         smlal2          v7.4s,   v16.8h,  v0.h[1]
         smlal2          v7.4s,   v17.8h,  v0.h[2]
         smlal2          v7.4s,   v18.8h,  v0.h[3]
@@ -201,8 +194,7 @@
         ext             v22.16b, v4.16b,  v5.16b, #8
         ext             v23.16b, v4.16b,  v5.16b, #10
         ext             v24.16b, v4.16b,  v5.16b, #12
-        ushll_sz        v16, v17, v21, #7, \wd
-        smlal           v16.4s,  v4.4h,   v0.h[0]
+        smull           v16.4s,  v4.4h,   v0.h[0]
         smlal           v16.4s,  v19.4h,  v0.h[1]
         smlal           v16.4s,  v20.4h,  v0.h[2]
         smlal           v16.4s,  v21.4h,  v0.h[3]
@@ -210,7 +202,7 @@
         smlal           v16.4s,  v23.4h,  v0.h[5]
         smlal           v16.4s,  v24.4h,  v0.h[6]
 .ifc \wd, .8h
-        smlal2          v17.4s,  v4.8h,   v0.h[0]
+        smull2          v17.4s,  v4.8h,   v0.h[0]
         smlal2          v17.4s,  v19.8h,  v0.h[1]
         smlal2          v17.4s,  v20.8h,  v0.h[2]
         smlal2          v17.4s,  v21.8h,  v0.h[3]
@@ -329,14 +321,10 @@
         add             v16.4s,  v16.4s,  v17.4s
         addv            s6,      v6.4s
         addv            s7,      v16.4s
-        dup             v16.4h,  v2.h[3]
-        ins             v16.h[1], v4.h[3]
         ins             v6.s[1], v7.s[0]
         mvni            v24.4h,  #0x80, lsl #8 // 0x7fff = (1 << 15) - 1
-        ushll           v16.4s,  v16.4h,  #7
-        add             v6.4s,   v6.4s,   v30.4s
-        add             v6.4s,   v6.4s,   v16.4s
-        srshl           v6.4s,   v6.4s,   v29.4s
+        add             v6.2s,   v6.2s,   v30.2s
+        srshl           v6.2s,   v6.2s,   v29.2s
         sqxtun          v6.4h,   v6.4s
         umin            v6.4h,   v6.4h,   v24.4h
         sub             v6.4h,   v6.4h,   v31.4h
@@ -371,9 +359,7 @@
         ld1             {v0.8h},  [x5]
         dup             v31.8h,  w8
         clz             w8,  w8
-        movi            v1.8h,   #128
         sub             w8,  w8,  #11   // round_bits_v
-        add             v1.8h,   v1.8h,   v0.8h
         dup             v30.4s,  w8
         mov             w8,  w4
         neg             v30.4s,  v30.4s // -round_bits_v
@@ -421,7 +407,7 @@
         smull           v2.4s,  v16.4h,  v0.h[0]
         smlal           v2.4s,  v17.4h,  v0.h[1]
         smlal           v2.4s,  v18.4h,  v0.h[2]
-        smlal           v2.4s,  v19.4h,  v1.h[3]
+        smlal           v2.4s,  v19.4h,  v0.h[3]
         smlal           v2.4s,  v20.4h,  v0.h[4]
         smlal           v2.4s,  v21.4h,  v0.h[5]
         smlal           v2.4s,  v22.4h,  v0.h[6]
@@ -428,7 +414,7 @@
         smull2          v3.4s,  v16.8h,  v0.h[0]
         smlal2          v3.4s,  v17.8h,  v0.h[1]
         smlal2          v3.4s,  v18.8h,  v0.h[2]
-        smlal2          v3.4s,  v19.8h,  v1.h[3]
+        smlal2          v3.4s,  v19.8h,  v0.h[3]
         smlal2          v3.4s,  v20.8h,  v0.h[4]
         smlal2          v3.4s,  v21.8h,  v0.h[5]
         smlal2          v3.4s,  v22.8h,  v0.h[6]
@@ -770,16 +756,9 @@
         ext             v16.16b, v18.16b, v16.16b, #12
 
 2:
-        umull           v2.4s,   v0.4h,   v0.4h
-        umull2          v3.4s,   v0.8h,   v0.8h
-        umull           v4.4s,   v1.4h,   v1.4h
-        umull           v18.4s,  v16.4h,  v16.4h
-        umull2          v19.4s,  v16.8h,  v16.8h
-        umull           v20.4s,  v17.4h,  v17.4h
-
         tst             w7,  #2 // LR_HAVE_RIGHT
         b.ne            4f
-        // If we'll need to pad the right edge, load that byte to pad with
+        // If we'll need to pad the right edge, load that pixel to pad with
         // here since we can find it pretty easily from here.
         sub             w13, w5, #(2 + 16 - 2 + 1)
         ldr             h30, [x3,  w13, sxtw #1]
@@ -796,41 +775,33 @@
         b               6f
 
 4:      // Loop horizontally
-.macro ext_n            dst1, dst2, src1, src2, src3, n, w
-        ext             \dst1,  \src1,  \src2,  \n
+.macro add3 w, wd
+        ext             v26.16b, v0.16b,  v1.16b,  #2
+        ext             v28.16b, v16.16b, v17.16b, #2
+        ext             v27.16b, v0.16b,  v1.16b,  #4
+        ext             v29.16b, v16.16b, v17.16b, #4
+
+        add             v6\wd,   v0\wd,   v26\wd
+        umull           v22.4s,  v0.4h,   v0.4h
+        umlal           v22.4s,  v26.4h,  v26.4h
+        umlal           v22.4s,  v27.4h,  v27.4h
+        add             v7\wd,   v16\wd,  v28\wd
+        umull           v24.4s,  v16.4h,  v16.4h
+        umlal           v24.4s,  v28.4h,  v28.4h
+        umlal           v24.4s,  v29.4h,  v29.4h
+        add             v6\wd,   v6\wd,   v27\wd
 .if \w > 4
-        ext             \dst2,  \src2,  \src3,  \n
+        umull2          v23.4s,  v0.8h,   v0.8h
+        umlal2          v23.4s,  v26.8h,  v26.8h
+        umlal2          v23.4s,  v27.8h,  v27.8h
 .endif
-.endm
-.macro add_n            dst1, dst2, src1, src2, src3, src4, w
-        add             \dst1,  \src1,  \src3
+        add             v7\wd,   v7\wd,   v29\wd
 .if \w > 4
-        add             \dst2,  \src2,  \src4
+        umull2          v25.4s,  v16.8h,  v16.8h
+        umlal2          v25.4s,  v28.8h,  v28.8h
+        umlal2          v25.4s,  v29.8h,  v29.8h
 .endif
 .endm
-
-.macro add3 w, wd
-        ext             v24.16b, v0.16b,  v1.16b,  #2
-        ext             v25.16b, v0.16b,  v1.16b,  #4
-        ext             v26.16b, v16.16b, v17.16b, #2
-        ext             v27.16b, v16.16b, v17.16b, #4
-        add             v6\wd,   v0\wd,   v24\wd
-        add             v7\wd,   v16\wd,  v26\wd
-        add             v6\wd,   v6\wd,   v25\wd
-        add             v7\wd,   v7\wd,   v27\wd
-
-        ext_n           v24.16b, v25.16b, v2.16b,  v3.16b,  v4.16b,  #4, \w
-        ext_n           v26.16b, v27.16b, v2.16b,  v3.16b,  v4.16b,  #8, \w
-
-        add_n           v22.4s,  v23.4s,  v2.4s,   v3.4s,   v24.4s,  v25.4s,  \w
-        add_n           v22.4s,  v23.4s,  v22.4s,  v23.4s,  v26.4s,  v27.4s,  \w
-
-        ext_n           v24.16b, v25.16b, v18.16b, v19.16b, v20.16b, #4, \w
-        ext_n           v26.16b, v27.16b, v18.16b, v19.16b, v20.16b, #8, \w
-
-        add_n           v24.4s,  v25.4s,  v18.4s,  v19.4s,  v24.4s,  v25.4s,  \w
-        add_n           v24.4s,  v25.4s,  v24.4s,  v25.4s,  v26.4s,  v27.4s,  \w
-.endm
         add3            8, .8h
         st1             {v6.8h},         [x1],  #16
         st1             {v7.8h},         [x11], #16
@@ -844,12 +815,6 @@
         mov             v16.16b, v17.16b
         ld1             {v1.8h},  [x3],  #16
         ld1             {v17.8h}, [x12], #16
-        mov             v2.16b,  v4.16b
-        umull2          v3.4s,   v0.8h,   v0.8h
-        umull           v4.4s,   v1.4h,   v1.4h
-        mov             v18.16b, v20.16b
-        umull2          v19.4s,  v16.8h,  v16.8h
-        umull           v20.4s,  v17.4h,  v17.4h
 
         b.ne            4b // If we don't need to pad, just keep summing.
         b               3b // If we need to pad, check how many pixels we have left.
@@ -907,11 +872,6 @@
         .hword L(box3_variable_shift_tbl) - 55b
 
 88:
-        umull           v2.4s,   v0.4h,   v0.4h
-        umull2          v3.4s,   v0.8h,   v0.8h
-        umull           v18.4s,  v16.4h,  v16.4h
-        umull2          v19.4s,  v16.8h,  v16.8h
-
         add3            4, .4h
         subs            w5,  w5,  #4
         st1             {v6.4h},  [x1],  #8
@@ -921,10 +881,6 @@
         b.le            9f
         ext             v0.16b,  v0.16b,  v0.16b,  #8
         ext             v16.16b, v16.16b, v16.16b, #8
-        mov             v2.16b,  v3.16b
-        mov             v3.16b,  v4.16b
-        mov             v18.16b, v19.16b
-        mov             v19.16b, v20.16b
         // Only one needed pixel left, but do a normal 4 pixel
         // addition anyway
         add3            4, .4h
@@ -1026,7 +982,7 @@
         // and shift v0/v1 to have 3x the first pixel at the front.
         dup             v2.8h,  v0.h[0]
         dup             v18.8h, v16.h[0]
-        // Move x3 back to account for the last 6 bytes we loaded before,
+        // Move x3 back to account for the last 3 pixels we loaded before,
         // which we shifted out.
         sub             x3,  x3,  #6
         sub             x12, x12, #6
@@ -1036,16 +992,9 @@
         ext             v16.16b, v18.16b, v16.16b, #10
 
 2:
-        umull           v2.4s,   v0.4h,   v0.4h
-        umull2          v3.4s,   v0.8h,   v0.8h
-        umull           v4.4s,   v1.4h,   v1.4h
-        umull           v18.4s,  v16.4h,  v16.4h
-        umull2          v19.4s,  v16.8h,  v16.8h
-        umull           v20.4s,  v17.4h,  v17.4h
-
         tst             w7,  #2 // LR_HAVE_RIGHT
         b.ne            4f
-        // If we'll need to pad the right edge, load that byte to pad with
+        // If we'll need to pad the right edge, load that pixel to pad with
         // here since we can find it pretty easily from here.
         sub             w13, w5, #(2 + 16 - 3 + 1)
         ldr             h30, [x3,  w13, sxtw #1]
@@ -1063,43 +1012,53 @@
 
 4:      // Loop horizontally
 .macro add5 w, wd
-        ext             v24.16b, v0.16b,  v1.16b,  #2
-        ext             v25.16b, v0.16b,  v1.16b,  #4
-        ext             v26.16b, v0.16b,  v1.16b,  #6
-        ext             v27.16b, v0.16b,  v1.16b,  #8
+        ext             v26.16b, v0.16b,  v1.16b,  #2
+        ext             v28.16b, v16.16b, v17.16b, #2
+        ext             v27.16b, v0.16b,  v1.16b,  #4
+        ext             v29.16b, v16.16b, v17.16b, #4
 
-        add             v6\wd,   v0\wd,   v24\wd
-        add             v25\wd,  v25\wd,  v26\wd
+        add             v6\wd,   v0\wd,   v26\wd
+        umull           v22.4s,  v0.4h,   v0.4h
+        umlal           v22.4s,  v26.4h,  v26.4h
+        umlal           v22.4s,  v27.4h,  v27.4h
+        add             v7\wd,   v16\wd,  v28\wd
+        umull           v24.4s,  v16.4h,  v16.4h
+        umlal           v24.4s,  v28.4h,  v28.4h
+        umlal           v24.4s,  v29.4h,  v29.4h
         add             v6\wd,   v6\wd,   v27\wd
+.if \w > 4
+        umull2          v23.4s,  v0.8h,   v0.8h
+        umlal2          v23.4s,  v26.8h,  v26.8h
+        umlal2          v23.4s,  v27.8h,  v27.8h
+.endif
+        add             v7\wd,   v7\wd,   v29\wd
+.if \w > 4
+        umull2          v25.4s,  v16.8h,  v16.8h
+        umlal2          v25.4s,  v28.8h,  v28.8h
+        umlal2          v25.4s,  v29.8h,  v29.8h
+.endif
 
-        ext             v26.16b, v16.16b, v17.16b, #2
-        ext             v27.16b, v16.16b, v17.16b, #4
+        ext             v26.16b, v0.16b,  v1.16b,  #6
         ext             v28.16b, v16.16b, v17.16b, #6
+        ext             v27.16b, v0.16b,  v1.16b,  #8
         ext             v29.16b, v16.16b, v17.16b, #8
 
-        add             v7\wd,   v16\wd,  v26\wd
-        add             v27\wd,  v27\wd,  v28\wd
+        add             v6\wd,   v6\wd,   v26\wd
+        umlal           v22.4s,  v26.4h,  v26.4h
+        umlal           v22.4s,  v27.4h,  v27.4h
+        add             v7\wd,   v7\wd,   v28\wd
+        umlal           v24.4s,  v28.4h,  v28.4h
+        umlal           v24.4s,  v29.4h,  v29.4h
+        add             v6\wd,   v6\wd,   v27\wd
+.if \w > 4
+        umlal2          v23.4s,  v26.8h,  v26.8h
+        umlal2          v23.4s,  v27.8h,  v27.8h
+.endif
         add             v7\wd,   v7\wd,   v29\wd
-        add             v6\wd,   v6\wd,   v25\wd
-        add             v7\wd,   v7\wd,   v27\wd
-
-        ext_n           v24.16b, v25.16b, v2.16b,  v3.16b,  v4.16b,  #4,  \w
-        ext_n           v26.16b, v27.16b, v2.16b,  v3.16b,  v4.16b,  #8,  \w
-        ext_n           v28.16b, v29.16b, v2.16b,  v3.16b,  v4.16b,  #12, \w
-
-        add_n           v22.4s,  v23.4s,  v2.4s,   v3.4s,   v24.4s,  v25.4s,  \w
-        add_n           v26.4s,  v27.4s,  v26.4s,  v27.4s,  v28.4s,  v29.4s,  \w
-        add_n           v22.4s,  v23.4s,  v22.4s,  v23.4s,  v3.4s,   v4.4s,   \w
-        add_n           v22.4s,  v23.4s,  v22.4s,  v23.4s,  v26.4s,  v27.4s,  \w
-
-        ext_n           v24.16b, v25.16b, v18.16b, v19.16b, v20.16b, #4,  \w
-        ext_n           v26.16b, v27.16b, v18.16b, v19.16b, v20.16b, #8,  \w
-        ext_n           v28.16b, v29.16b, v18.16b, v19.16b, v20.16b, #12, \w
-
-        add_n           v24.4s,  v25.4s,  v18.4s,  v19.4s,  v24.4s,  v25.4s,  \w
-        add_n           v26.4s,  v27.4s,  v26.4s,  v27.4s,  v28.4s,  v29.4s,  \w
-        add_n           v24.4s,  v25.4s,  v24.4s,  v25.4s,  v19.4s,  v20.4s,  \w
-        add_n           v24.4s,  v25.4s,  v24.4s,  v25.4s,  v26.4s,  v27.4s,  \w
+.if \w > 4
+        umlal2          v25.4s,  v28.8h,  v28.8h
+        umlal2          v25.4s,  v29.8h,  v29.8h
+.endif
 .endm
         add5            8, .8h
         st1             {v6.8h},         [x1],  #16
@@ -1114,12 +1073,6 @@
         mov             v16.16b, v17.16b
         ld1             {v1.8h},  [x3],  #16
         ld1             {v17.8h}, [x12], #16
-        mov             v2.16b,  v4.16b
-        umull2          v3.4s,   v0.8h,   v0.8h
-        umull           v4.4s,   v1.4h,   v1.4h
-        mov             v18.16b, v20.16b
-        umull2          v19.4s,  v16.8h,  v16.8h
-        umull           v20.4s,  v17.4h,  v17.4h
 
         b.ne            4b // If we don't need to pad, just keep summing.
         b               3b // If we need to pad, check how many pixels we have left.
@@ -1193,13 +1146,6 @@
         .hword L(box5_variable_shift_tbl) - 77b
 
 88:
-        umull           v2.4s,   v0.4h,   v0.4h
-        umull2          v3.4s,   v0.8h,   v0.8h
-        umull           v4.4s,   v1.4h,   v1.4h
-        umull           v18.4s,  v16.4h,  v16.4h
-        umull2          v19.4s,  v16.8h,  v16.8h
-        umull           v20.4s,  v17.4h,  v17.4h
-
         add5            4, .4h
         subs            w5,  w5,  #4
         st1             {v6.4h},  [x1],  #8
@@ -1209,10 +1155,6 @@
         b.le            9f
         ext             v0.16b,  v0.16b,  v1.16b,  #8
         ext             v16.16b, v16.16b, v17.16b, #8
-        mov             v2.16b,  v3.16b
-        mov             v3.16b,  v4.16b
-        mov             v18.16b, v19.16b
-        mov             v19.16b, v20.16b
         add5            4, .4h
         st1             {v6.4h},  [x1],  #8
         st1             {v7.4h},  [x11], #8
--- a/src/arm/64/looprestoration_tmpl.S
+++ b/src/arm/64/looprestoration_tmpl.S
@@ -454,7 +454,7 @@
 //                                    const pixel *src, const ptrdiff_t src_stride,
 //                                    const int16_t *t1, const int16_t *t2,
 //                                    const int w, const int h,
-//                                    const int16_t wt[2]);
+//                                    const int16_t wt[2], const int bitdepth_max);
 function sgr_weighted2_\bpc\()bpc_neon, export=1
 .if \bpc == 8
         ldr             x8,  [sp]
--- a/src/arm/64/mc.S
+++ b/src/arm/64/mc.S
@@ -1906,11 +1906,10 @@
         bl              L(\type\()_8tap_filter_2)
 
         ext             v18.8b, v17.8b, v28.8b, #4
-        mov             v19.8b, v28.8b
         smull           v2.4s,  v16.4h, v1.h[0]
         smlal           v2.4s,  v17.4h, v1.h[1]
         smlal           v2.4s,  v18.4h, v1.h[2]
-        smlal           v2.4s,  v19.4h, v1.h[3]
+        smlal           v2.4s,  v28.4h, v1.h[3]
 
         sqrshrn         v2.4h,  v2.4s,  #\shift_hv
         sqxtun          v2.8b,  v2.8h
@@ -1919,7 +1918,7 @@
         st1             {v2.h}[1], [\ds2], \d_strd
         b.le            0f
         mov             v16.8b, v18.8b
-        mov             v17.8b, v19.8b
+        mov             v17.8b, v28.8b
         b               2b
 
 280:    // 2x8, 2x16, 2x32 hv
@@ -1956,7 +1955,6 @@
 28:
         bl              L(\type\()_8tap_filter_2)
         ext             v22.8b, v21.8b, v28.8b, #4
-        mov             v23.8b, v28.8b
         smull           v2.4s,  v16.4h, v1.h[0]
         smlal           v2.4s,  v17.4h, v1.h[1]
         smlal           v2.4s,  v18.4h, v1.h[2]
@@ -1964,7 +1962,7 @@
         smlal           v2.4s,  v20.4h, v1.h[4]
         smlal           v2.4s,  v21.4h, v1.h[5]
         smlal           v2.4s,  v22.4h, v1.h[6]
-        smlal           v2.4s,  v23.4h, v1.h[7]
+        smlal           v2.4s,  v28.4h, v1.h[7]
 
         sqrshrn         v2.4h,  v2.4s,  #\shift_hv
         sqxtun          v2.8b,  v2.8h
@@ -1977,7 +1975,7 @@
         mov             v18.8b, v20.8b
         mov             v19.8b, v21.8b
         mov             v20.8b, v22.8b
-        mov             v21.8b, v23.8b
+        mov             v21.8b, v28.8b
         b               28b
 
 0:
--- a/src/arm/64/mc16.S
+++ b/src/arm/64/mc16.S
@@ -1004,11 +1004,11 @@
         b.gt            2b
         ret
 4:
-        ld1             {v0.8b}, [x2], x3
-        ld1             {v1.8b}, [x2], x3
+        ld1             {v0.4h}, [x2], x3
+        ld1             {v1.4h}, [x2], x3
         subs            w5,  w5,  #2
-        st1             {v0.8b}, [x0], x1
-        st1             {v1.8b}, [x0], x1
+        st1             {v0.4h}, [x0], x1
+        st1             {v1.4h}, [x0], x1
         b.gt            4b
         ret
 80:
@@ -1017,11 +1017,11 @@
         add             x9,  x2,  x3
         lsl             x3,  x3,  #1
 8:
-        ld1             {v0.16b}, [x2], x3
-        ld1             {v1.16b}, [x9], x3
+        ld1             {v0.8h}, [x2], x3
+        ld1             {v1.8h}, [x9], x3
         subs            w5,  w5,  #2
-        st1             {v0.16b}, [x0], x1
-        st1             {v1.16b}, [x8], x1
+        st1             {v0.8h}, [x0], x1
+        st1             {v1.8h}, [x8], x1
         b.gt            8b
         ret
 16:
@@ -2039,7 +2039,6 @@
         sxtl            v0.8h,   v0.8b
         sxtl            v1.8h,   v1.8b
         mov             x15, x30
-        sxtl            v1.4s,   v1.4h
 
         ld1             {v27.8h}, [\src], \s_strd
         ext             v28.16b, v27.16b, v27.16b, #2
@@ -2049,19 +2048,23 @@
         addp            v16.4s,  v27.4s,  v27.4s
         srshl           v16.2s,  v16.2s,  v30.2s // -(6-intermediate_bits)
         bl              L(\type\()_8tap_filter_2)
+        // The intermediates from the horizontal pass fit in 16 bits without
+        // any bias; we could just as well keep them as .4s, but narrowing
+        // them to .4h gives a significant speedup on out-of-order cores
+        // (at the cost of a smaller slowdown on in-order cores such as A53).
+        xtn             v16.4h,  v16.4s
 
-        trn1            v16.2d,  v16.2d,  v24.2d
-        mov             v17.16b, v24.16b
+        trn1            v16.2s,  v16.2s,  v24.2s
+        mov             v17.8b,  v24.8b
 
 2:
         bl              L(\type\()_8tap_filter_2)
 
-        ext             v18.16b, v17.16b, v24.16b, #8
-        mov             v19.16b, v24.16b
-        mul             v2.4s,   v16.4s,  v1.s[0]
-        mla             v2.4s,   v17.4s,  v1.s[1]
-        mla             v2.4s,   v18.4s,  v1.s[2]
-        mla             v2.4s,   v19.4s,  v1.s[3]
+        ext             v18.8b,  v17.8b,  v24.8b,  #4
+        smull           v2.4s,   v16.4h,  v1.h[0]
+        smlal           v2.4s,   v17.4h,  v1.h[1]
+        smlal           v2.4s,   v18.4h,  v1.h[2]
+        smlal           v2.4s,   v24.4h,  v1.h[3]
 
         srshl           v2.4s,   v2.4s,   v29.4s // -(6+intermediate_bits)
         sqxtun          v2.4h,   v2.4s
@@ -2070,8 +2073,8 @@
         st1             {v2.s}[0], [\dst], \d_strd
         st1             {v2.s}[1], [\ds2], \d_strd
         b.le            0f
-        mov             v16.16b, v18.16b
-        mov             v17.16b, v19.16b
+        mov             v16.8b,  v18.8b
+        mov             v17.8b,  v24.8b
         b               2b
 
 280:    // 2x8, 2x16, 2x32 hv
@@ -2085,8 +2088,6 @@
         sxtl            v0.8h,   v0.8b
         sxtl            v1.8h,   v1.8b
         mov             x15, x30
-        sxtl2           v2.4s,   v1.8h
-        sxtl            v1.4s,   v1.4h
 
         ld1             {v27.8h}, [\src], \s_strd
         ext             v28.16b, v27.16b, v27.16b, #2
@@ -2095,29 +2096,33 @@
         addp            v27.4s,  v27.4s,  v28.4s
         addp            v16.4s,  v27.4s,  v27.4s
         srshl           v16.2s,  v16.2s,  v30.2s // -(6-intermediate_bits)
+        // The intermediates from the horizontal pass fit in 16 bits without
+        // any bias; we could just as well keep them as .4s, but narrowing
+        // them to .4h gives a significant speedup on out-of-order cores
+        // (at the cost of a smaller slowdown on in-order cores such as A53).
 
         bl              L(\type\()_8tap_filter_2)
-        trn1            v16.2d,  v16.2d,  v24.2d
-        mov             v17.16b, v24.16b
+        xtn             v16.4h,  v16.4s
+        trn1            v16.2s,  v16.2s,  v24.2s
+        mov             v17.8b,  v24.8b
         bl              L(\type\()_8tap_filter_2)
-        ext             v18.16b, v17.16b, v24.16b, #8
-        mov             v19.16b, v24.16b
+        ext             v18.8b,  v17.8b,  v24.8b,  #4
+        mov             v19.8b,  v24.8b
         bl              L(\type\()_8tap_filter_2)
-        ext             v20.16b, v19.16b, v24.16b, #8
-        mov             v21.16b, v24.16b
+        ext             v20.8b,  v19.8b,  v24.8b,  #4
+        mov             v21.8b,  v24.8b
 
 28:
         bl              L(\type\()_8tap_filter_2)
-        ext             v22.16b, v21.16b, v24.16b, #8
-        mov             v23.16b, v24.16b
-        mul             v3.4s,   v16.4s,  v1.s[0]
-        mla             v3.4s,   v17.4s,  v1.s[1]
-        mla             v3.4s,   v18.4s,  v1.s[2]
-        mla             v3.4s,   v19.4s,  v1.s[3]
-        mla             v3.4s,   v20.4s,  v2.s[0]
-        mla             v3.4s,   v21.4s,  v2.s[1]
-        mla             v3.4s,   v22.4s,  v2.s[2]
-        mla             v3.4s,   v23.4s,  v2.s[3]
+        ext             v22.8b,  v21.8b,  v24.8b,  #4
+        smull           v3.4s,   v16.4h,  v1.h[0]
+        smlal           v3.4s,   v17.4h,  v1.h[1]
+        smlal           v3.4s,   v18.4h,  v1.h[2]
+        smlal           v3.4s,   v19.4h,  v1.h[3]
+        smlal           v3.4s,   v20.4h,  v1.h[4]
+        smlal           v3.4s,   v21.4h,  v1.h[5]
+        smlal           v3.4s,   v22.4h,  v1.h[6]
+        smlal           v3.4s,   v24.4h,  v1.h[7]
 
         srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
         sqxtun          v3.4h,   v3.4s
@@ -2126,12 +2131,12 @@
         st1             {v3.s}[0], [\dst], \d_strd
         st1             {v3.s}[1], [\ds2], \d_strd
         b.le            0f
-        mov             v16.16b, v18.16b
-        mov             v17.16b, v19.16b
-        mov             v18.16b, v20.16b
-        mov             v19.16b, v21.16b
-        mov             v20.16b, v22.16b
-        mov             v21.16b, v23.16b
+        mov             v16.8b,  v18.8b
+        mov             v17.8b,  v19.8b
+        mov             v18.8b,  v20.8b
+        mov             v19.8b,  v21.8b
+        mov             v20.8b,  v22.8b
+        mov             v21.8b,  v24.8b
         b               28b
 
 0:
@@ -2151,6 +2156,7 @@
         smlal           v24.4s,  v27.4h,  v0.h[2]
         smlal           v24.4s,  v28.4h,  v0.h[3]
         srshl           v24.4s,  v24.4s,  v30.4s // -(6-intermediate_bits)
+        xtn             v24.4h,  v24.4s
         ret
 .endif
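
The comment above about keeping the horizontal-pass intermediates in 16 bits has a simple scalar analogue. A minimal C sketch (the function name and the 4-tap setup are illustrative, not taken from the source): the intermediates are stored as int16_t, matching the .4h registers, and are only widened inside the multiply-accumulate, which is what smull/smlal do in the vertical pass.

#include <stdint.h>

/* Scalar model of the narrowed 2xN HV vertical pass: 16-bit intermediates,
 * widening multiply-accumulate into a 32-bit sum. */
int32_t vfilter4_narrow(const int16_t mid[4], const int16_t coef[4]) {
    int32_t sum = 0;
    for (int i = 0; i < 4; i++)
        sum += (int32_t)mid[i] * coef[i]; /* smull/smlal equivalent */
    return sum; /* the asm then applies the srshl + sqxtun steps */
}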
 
--- a/src/arm/cdef_init_tmpl.c
+++ b/src/arm/cdef_init_tmpl.c
@@ -27,7 +27,6 @@
 #include "src/cpu.h"
 #include "src/cdef.h"
 
-#if BITDEPTH == 8 || ARCH_AARCH64
 decl_cdef_dir_fn(BF(dav1d_cdef_find_dir, neon));
 
 void BF(dav1d_cdef_padding4, neon)(uint16_t *tmp, const pixel *src,
@@ -72,7 +71,6 @@
 DEFINE_FILTER(8, 8, 16)
 DEFINE_FILTER(4, 8, 8)
 DEFINE_FILTER(4, 4, 8)
-#endif
 
 
 COLD void bitfn(dav1d_cdef_dsp_init_arm)(Dav1dCdefDSPContext *const c) {
@@ -80,10 +78,8 @@
 
     if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
 
-#if BITDEPTH == 8 || ARCH_AARCH64
     c->dir = BF(dav1d_cdef_find_dir, neon);
     c->fb[0] = cdef_filter_8x8_neon;
     c->fb[1] = cdef_filter_4x8_neon;
     c->fb[2] = cdef_filter_4x4_neon;
-#endif
 }
--- a/src/arm/loopfilter_init_tmpl.c
+++ b/src/arm/loopfilter_init_tmpl.c
@@ -38,10 +38,8 @@
 
     if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
 
-#if BITDEPTH == 8 || ARCH_AARCH64
     c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, neon);
     c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, neon);
     c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, neon);
     c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, neon);
-#endif
 }
--- a/src/arm/looprestoration_init_tmpl.c
+++ b/src/arm/looprestoration_init_tmpl.c
@@ -29,7 +29,6 @@
 #include "src/looprestoration.h"
 #include "src/tables.h"
 
-#if BITDEPTH == 8 || ARCH_AARCH64
 // The 8bpc version calculates things slightly differently than the reference
 // C version. That version calculates roughly this:
 // int16_t sum = 0;
@@ -46,12 +45,11 @@
 // 1 << (bitdepth + 6 - round_bits_h).
 void BF(dav1d_wiener_filter_h, neon)(int16_t *dst, const pixel (*left)[4],
                                      const pixel *src, ptrdiff_t stride,
-                                     const int16_t fh[7], const intptr_t w,
+                                     const int16_t fh[8], intptr_t w,
                                      int h, enum LrEdgeFlags edges
                                      HIGHBD_DECL_SUFFIX);
 // This calculates things slightly differently than the reference C version.
 // This version calculates roughly this:
-// fv[3] += 128;
 // int32_t sum = 0;
 // for (int i = 0; i < 7; i++)
 //     sum += mid[idx] * fv[i];
@@ -59,7 +57,7 @@
 // This function assumes that the width is a multiple of 8.
 void BF(dav1d_wiener_filter_v, neon)(pixel *dst, ptrdiff_t stride,
                                      const int16_t *mid, int w, int h,
-                                     const int16_t fv[7], enum LrEdgeFlags edges,
+                                     const int16_t fv[8], enum LrEdgeFlags edges,
                                      ptrdiff_t mid_stride HIGHBD_DECL_SUFFIX);
 void BF(dav1d_copy_narrow, neon)(pixel *dst, ptrdiff_t stride,
                                  const pixel *src, int w, int h);
@@ -67,9 +65,9 @@
 static void wiener_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
                                const pixel (*const left)[4],
                                const pixel *lpf, const ptrdiff_t lpf_stride,
-                               const int w, const int h, const int16_t fh[7],
-                               const int16_t fv[7], const enum LrEdgeFlags edges
-                               HIGHBD_DECL_SUFFIX)
+                               const int w, const int h,
+                               const int16_t filter[2][8],
+                               const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
 {
     ALIGN_STK_16(int16_t, mid, 68 * 384,);
     int mid_stride = (w + 7) & ~7;
@@ -76,20 +74,21 @@
 
     // Horizontal filter
     BF(dav1d_wiener_filter_h, neon)(&mid[2 * mid_stride], left, dst, dst_stride,
-                                    fh, w, h, edges HIGHBD_TAIL_SUFFIX);
+                                    filter[0], w, h, edges HIGHBD_TAIL_SUFFIX);
     if (edges & LR_HAVE_TOP)
         BF(dav1d_wiener_filter_h, neon)(mid, NULL, lpf, lpf_stride,
-                                        fh, w, 2, edges HIGHBD_TAIL_SUFFIX);
+                                        filter[0], w, 2, edges
+                                        HIGHBD_TAIL_SUFFIX);
     if (edges & LR_HAVE_BOTTOM)
         BF(dav1d_wiener_filter_h, neon)(&mid[(2 + h) * mid_stride], NULL,
                                         lpf + 6 * PXSTRIDE(lpf_stride),
-                                        lpf_stride, fh, w, 2, edges
+                                        lpf_stride, filter[0], w, 2, edges
                                         HIGHBD_TAIL_SUFFIX);
 
     // Vertical filter
     if (w >= 8)
         BF(dav1d_wiener_filter_v, neon)(dst, dst_stride, &mid[2*mid_stride],
-                                        w & ~7, h, fv, edges,
+                                        w & ~7, h, filter[1], edges,
                                         mid_stride * sizeof(*mid)
                                         HIGHBD_TAIL_SUFFIX);
     if (w & 7) {
@@ -98,7 +97,7 @@
         ALIGN_STK_16(pixel, tmp, 64 * 8,);
         BF(dav1d_wiener_filter_v, neon)(tmp, (w & 7) * sizeof(pixel),
                                         &mid[2*mid_stride + (w & ~7)],
-                                        w & 7, h, fv, edges,
+                                        w & 7, h, filter[1], edges,
                                         mid_stride * sizeof(*mid)
                                         HIGHBD_TAIL_SUFFIX);
         BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, tmp, w & 7, h);
@@ -283,7 +282,6 @@
         }
     }
 }
-#endif // BITDEPTH == 8
 
 COLD void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPContext *const c, int bpc) {
     const unsigned flags = dav1d_get_cpu_flags();
@@ -290,9 +288,7 @@
 
     if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
 
-#if BITDEPTH == 8 || ARCH_AARCH64
-    c->wiener = wiener_filter_neon;
+    c->wiener[0] = c->wiener[1] = wiener_filter_neon;
     if (bpc <= 10)
         c->selfguided = sgr_filter_neon;
-#endif
 }
--- a/src/arm/mc_init_tmpl.c
+++ b/src/arm/mc_init_tmpl.c
@@ -77,7 +77,6 @@
 
     if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
 
-#if BITDEPTH == 8 || ARCH_AARCH64
     init_mc_fn (FILTER_2D_8TAP_REGULAR,        8tap_regular,        neon);
     init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon);
     init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  neon);
@@ -99,7 +98,6 @@
     init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   neon);
     init_mct_fn(FILTER_2D_8TAP_SHARP,          8tap_sharp,          neon);
     init_mct_fn(FILTER_2D_BILINEAR,            bilin,               neon);
-#endif
 
     c->avg = BF(dav1d_avg, neon);
     c->w_avg = BF(dav1d_w_avg, neon);
@@ -111,8 +109,10 @@
     c->w_mask[0] = BF(dav1d_w_mask_444, neon);
     c->w_mask[1] = BF(dav1d_w_mask_422, neon);
     c->w_mask[2] = BF(dav1d_w_mask_420, neon);
+#endif
     c->warp8x8 = BF(dav1d_warp_affine_8x8, neon);
     c->warp8x8t = BF(dav1d_warp_affine_8x8t, neon);
+#if BITDEPTH == 8 || ARCH_AARCH64
     c->emu_edge = BF(dav1d_emu_edge, neon);
 #endif
 }
--- a/src/cdf.c
+++ b/src/cdf.c
@@ -29,10 +29,7 @@
 
 #include <string.h>
 
-#include "src/thread.h"
-#include "common/intops.h"
-
-#include "src/cdf.h"
+#include "src/internal.h"
 #include "src/tables.h"
 
 #define CDF1(x) (32768-(x))
@@ -4098,11 +4095,11 @@
     }
 }
 
-int dav1d_cdf_thread_alloc(CdfThreadContext *const cdf,
+int dav1d_cdf_thread_alloc(Dav1dContext *const c, CdfThreadContext *const cdf,
                            struct thread_data *const t)
 {
-    cdf->ref = dav1d_ref_create(sizeof(CdfContext) +
-                                (t != NULL) * sizeof(atomic_uint));
+    cdf->ref = dav1d_ref_create_using_pool(c->cdf_pool,
+                                           sizeof(CdfContext) + sizeof(atomic_uint));
     if (!cdf->ref) return DAV1D_ERR(ENOMEM);
     cdf->data.cdf = cdf->ref->data;
     if (t) {
--- a/src/cdf.h
+++ b/src/cdf.h
@@ -140,7 +140,8 @@
 } CdfThreadContext;
 
 void dav1d_cdf_thread_init_static(CdfThreadContext *cdf, int qidx);
-int dav1d_cdf_thread_alloc(CdfThreadContext *cdf, struct thread_data *t);
+int dav1d_cdf_thread_alloc(Dav1dContext *c, CdfThreadContext *cdf,
+                           struct thread_data *t);
 void dav1d_cdf_thread_copy(CdfContext *dst, const CdfThreadContext *src);
 void dav1d_cdf_thread_ref(CdfThreadContext *dst, CdfThreadContext *src);
 void dav1d_cdf_thread_unref(CdfThreadContext *cdf);
--- a/src/data.c
+++ b/src/data.c
@@ -43,6 +43,7 @@
 uint8_t *dav1d_data_create_internal(Dav1dData *const buf, const size_t sz) {
     validate_input_or_ret(buf != NULL, NULL);
 
+    if (sz > SIZE_MAX / 2) return NULL;
     buf->ref = dav1d_ref_create(sz);
     if (!buf->ref) return NULL;
     buf->data = buf->ref->const_data;
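
A rough illustration of why capping sz is useful here (the 64-byte overhead below is a placeholder, not the actual dav1d_ref_create() bookkeeping): size_t arithmetic wraps silently, so an enormous request plus any header or alignment overhead could otherwise end up asking the allocator for a tiny buffer.

#include <stdint.h>
#include <stdio.h>

int main(void) {
    size_t sz = SIZE_MAX - 8;           /* absurdly large request */
    size_t padded = sz + 64;            /* wraps around: far smaller than sz */
    printf("requested %zu, would allocate %zu\n", sz, padded);
    return 0;
}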
--- a/src/decode.c
+++ b/src/decode.c
@@ -38,7 +38,6 @@
 #include "dav1d/data.h"
 
 #include "common/intops.h"
-#include "common/mem.h"
 
 #include "src/ctx.h"
 #include "src/decode.h"
@@ -2681,7 +2680,7 @@
                    sizeof(*f->tile_thread.titsati_index_rows) *
                        (f->frame_hdr->tiling.rows + 1)))
         {
-            for (int tile_row = 0, tile_idx = 0;
+            for (int tile_row = 0, task_idx = 0;
                  tile_row < f->frame_hdr->tiling.rows; tile_row++)
             {
                 for (int sby = f->frame_hdr->tiling.row_start_sb[tile_row];
@@ -2688,10 +2687,10 @@
                      sby < f->frame_hdr->tiling.row_start_sb[tile_row + 1]; sby++)
                 {
                     for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols;
-                         tile_col++, tile_idx++)
+                         tile_col++, task_idx++)
                     {
-                        f->tile_thread.task_idx_to_sby_and_tile_idx[tile_idx][0] = sby;
-                        f->tile_thread.task_idx_to_sby_and_tile_idx[tile_idx][1] =
+                        f->tile_thread.task_idx_to_sby_and_tile_idx[task_idx][0] = sby;
+                        f->tile_thread.task_idx_to_sby_and_tile_idx[task_idx][1] =
                             tile_row * f->frame_hdr->tiling.cols + tile_col;
                     }
                 }
@@ -3105,7 +3104,7 @@
                                                           4 * (t->by + f->sb_step),
                                                           PLANE_TYPE_BLOCK))
                             {
-                                return 1;
+                                goto error;
                             }
                         dav1d_refmvs_load_tmvs(&f->rf, tile_row,
                                                0, f->bw >> 1, t->by >> 1, by_end);
@@ -3401,7 +3400,7 @@
         dav1d_cdf_thread_ref(&f->in_cdf, &c->cdf[pri_ref]);
     }
     if (f->frame_hdr->refresh_context) {
-        res = dav1d_cdf_thread_alloc(&f->out_cdf, c->n_fc > 1 ? &f->frame_thread.td : NULL);
+        res = dav1d_cdf_thread_alloc(c, &f->out_cdf, c->n_fc > 1 ? &f->frame_thread.td : NULL);
         if (res < 0) goto error;
     }
 
@@ -3466,8 +3465,8 @@
 
     // ref_mvs
     if ((f->frame_hdr->frame_type & 1) || f->frame_hdr->allow_intrabc) {
-        f->mvs_ref = dav1d_ref_create(f->sb128h * 16 * (f->b4_stride >> 1) *
-                                      sizeof(*f->mvs));
+        f->mvs_ref = dav1d_ref_create_using_pool(c->refmvs_pool,
+            sizeof(*f->mvs) * f->sb128h * 16 * (f->b4_stride >> 1));
         if (!f->mvs_ref) {
             res = DAV1D_ERR(ENOMEM);
             goto error;
@@ -3530,7 +3529,8 @@
             // We're updating an existing map, but need somewhere to
             // put the new values. Allocate them here (the data
             // actually gets set elsewhere)
-            f->cur_segmap_ref = dav1d_ref_create(f->b4_stride * 32 * f->sb128h);
+            f->cur_segmap_ref = dav1d_ref_create_using_pool(c->segmap_pool,
+                sizeof(*f->cur_segmap) * f->b4_stride * 32 * f->sb128h);
             if (!f->cur_segmap_ref) {
                 dav1d_ref_dec(&f->prev_segmap_ref);
                 res = DAV1D_ERR(ENOMEM);
@@ -3545,13 +3545,14 @@
             f->cur_segmap = f->prev_segmap_ref->data;
         } else {
             // We need to make a new map. Allocate one here and zero it out.
-            f->cur_segmap_ref = dav1d_ref_create(f->b4_stride * 32 * f->sb128h);
+            const size_t segmap_size = sizeof(*f->cur_segmap) * f->b4_stride * 32 * f->sb128h;
+            f->cur_segmap_ref = dav1d_ref_create_using_pool(c->segmap_pool, segmap_size);
             if (!f->cur_segmap_ref) {
                 res = DAV1D_ERR(ENOMEM);
                 goto error;
             }
             f->cur_segmap = f->cur_segmap_ref->data;
-            memset(f->cur_segmap_ref->data, 0, f->b4_stride * 32 * f->sb128h);
+            memset(f->cur_segmap, 0, segmap_size);
         }
     } else {
         f->cur_segmap = NULL;
--- a/src/internal.h
+++ b/src/internal.h
@@ -82,8 +82,10 @@
     int n_tile_data_alloc;
     int n_tile_data;
     int n_tiles;
+    Dav1dMemPool *seq_hdr_pool;
     Dav1dRef *seq_hdr_ref;
     Dav1dSequenceHeader *seq_hdr;
+    Dav1dMemPool *frame_hdr_pool;
     Dav1dRef *frame_hdr_ref;
     Dav1dFrameHeader *frame_hdr;
 
@@ -107,6 +109,8 @@
     } frame_thread;
 
     // reference/entropy state
+    Dav1dMemPool *segmap_pool;
+    Dav1dMemPool *refmvs_pool;
     struct {
         Dav1dThreadPicture p;
         Dav1dRef *segmap;
@@ -113,6 +117,7 @@
         Dav1dRef *refmvs;
         unsigned refpoc[7];
     } refs[8];
+    Dav1dMemPool *cdf_pool;
     CdfThreadContext cdf[8];
 
     Dav1dDSPContext dsp[3 /* 8, 10, 12 bits/component */];
@@ -135,6 +140,8 @@
     int drain;
 
     Dav1dLogger logger;
+
+    Dav1dMemPool *picture_pool;
 };
 
 struct Dav1dFrameContext {
--- a/src/lf_mask.h
+++ b/src/lf_mask.h
@@ -41,8 +41,8 @@
 
 typedef struct Av1RestorationUnit {
     enum Dav1dRestorationType type;
-    int16_t filter_h[3];
-    int16_t filter_v[3];
+    int8_t filter_h[3];
+    int8_t filter_v[3];
     uint8_t sgr_idx;
     int16_t sgr_weights[2];
 } Av1RestorationUnit;
--- a/src/lib.c
+++ b/src/lib.c
@@ -35,7 +35,9 @@
 #include <dlfcn.h>
 #endif
 
-#include "common/mem.h"
+#include "dav1d/dav1d.h"
+#include "dav1d/data.h"
+
 #include "common/validate.h"
 
 #include "src/cpu.h"
@@ -126,6 +128,19 @@
     c->all_layers = s->all_layers;
     c->frame_size_limit = s->frame_size_limit;
 
+    if (dav1d_mem_pool_init(&c->seq_hdr_pool) ||
+        dav1d_mem_pool_init(&c->frame_hdr_pool) ||
+        dav1d_mem_pool_init(&c->segmap_pool) ||
+        dav1d_mem_pool_init(&c->refmvs_pool) ||
+        dav1d_mem_pool_init(&c->cdf_pool))
+    {
+        goto error;
+    }
+    if (c->allocator.alloc_picture_callback == dav1d_default_picture_alloc) {
+        if (dav1d_mem_pool_init(&c->picture_pool)) goto error;
+        c->allocator.cookie = c->picture_pool;
+    }
+
     /* On 32-bit systems extremely large frame sizes can cause overflows in
      * dav1d_decode_frame() malloc size calculations. Prevent that from occurring
      * by enforcing a maximum frame size limit, chosen to roughly correspond to
@@ -568,6 +583,13 @@
     dav1d_ref_dec(&c->mastering_display_ref);
     dav1d_ref_dec(&c->content_light_ref);
     dav1d_ref_dec(&c->itut_t35_ref);
+
+    dav1d_mem_pool_end(c->seq_hdr_pool);
+    dav1d_mem_pool_end(c->frame_hdr_pool);
+    dav1d_mem_pool_end(c->segmap_pool);
+    dav1d_mem_pool_end(c->refmvs_pool);
+    dav1d_mem_pool_end(c->cdf_pool);
+    dav1d_mem_pool_end(c->picture_pool);
 
     dav1d_freep_aligned(c_out);
 }
--- a/src/looprestoration.h
+++ b/src/looprestoration.h
@@ -54,9 +54,8 @@
 void (name)(pixel *dst, ptrdiff_t dst_stride, \
             const_left_pixel_row left, \
             const pixel *lpf, ptrdiff_t lpf_stride, \
-            int w, int h, const int16_t filterh[7], \
-            const int16_t filterv[7], enum LrEdgeFlags edges \
-            HIGHBD_DECL_SUFFIX)
+            int w, int h, const int16_t filter[2][8], \
+            enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
 typedef decl_wiener_filter_fn(*wienerfilter_fn);
 
 #define decl_selfguided_filter_fn(name) \
@@ -68,7 +67,7 @@
 typedef decl_selfguided_filter_fn(*selfguided_fn);
 
 typedef struct Dav1dLoopRestorationDSPContext {
-    wienerfilter_fn wiener;
+    wienerfilter_fn wiener[2]; /* 7-tap, 5-tap */
     selfguided_fn selfguided;
 } Dav1dLoopRestorationDSPContext;
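
A minimal sketch of how the new two-entry table is meant to be indexed, mirroring the lr_apply_tmpl.c change later in this patch; the helper name and the stand-in types are made up so the fragment compiles on its own, the real types come from this header.

#include <stdint.h>

/* Stand-ins for wienerfilter_fn / Dav1dLoopRestorationDSPContext above. */
typedef void (*wiener_fn_sketch)(void);
typedef struct { wiener_fn_sketch wiener[2]; } lr_dsp_sketch;

static wiener_fn_sketch pick_wiener(const lr_dsp_sketch *const dsp,
                                    const int16_t filter[2][8])
{
    /* index 0: full 7-tap filter; index 1: 5-tap, i.e. both outermost
     * horizontal and vertical taps are zero */
    return dsp->wiener[!(filter[0][0] | filter[1][0])];
}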
 
--- a/src/looprestoration_tmpl.c
+++ b/src/looprestoration_tmpl.c
@@ -135,7 +135,7 @@
                      const pixel (*const left)[4],
                      const pixel *lpf, const ptrdiff_t lpf_stride,
                      const int w, const int h,
-                     const int16_t filterh[7], const int16_t filterv[7],
+                     const int16_t filter[2][8],
                      const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
 {
     // Wiener filtering is applied to a maximum stripe height of 64 + 3 pixels
@@ -156,10 +156,13 @@
     const int clip_limit = 1 << (bitdepth + 1 + 7 - round_bits_h);
     for (int j = 0; j < h + 6; j++) {
         for (int i = 0; i < w; i++) {
-            int sum = (tmp_ptr[i + 3] << 7) + (1 << (bitdepth + 6));
+            int sum = (1 << (bitdepth + 6));
+#if BITDEPTH == 8
+            sum += tmp_ptr[i + 3] * 128;
+#endif
 
             for (int k = 0; k < 7; k++) {
-                sum += tmp_ptr[i + k] * filterh[k];
+                sum += tmp_ptr[i + k] * filter[0][k];
             }
 
             hor_ptr[i] =
@@ -174,10 +177,10 @@
     const int round_offset = 1 << (bitdepth + (round_bits_v - 1));
     for (int j = 0; j < h; j++) {
         for (int i = 0; i < w; i++) {
-            int sum = (hor[(j + 3) * REST_UNIT_STRIDE + i] << 7) - round_offset;
+            int sum = -round_offset;
 
             for (int k = 0; k < 7; k++) {
-                sum += hor[(j + k) * REST_UNIT_STRIDE + i] * filterv[k];
+                sum += hor[(j + k) * REST_UNIT_STRIDE + i] * filter[1][k];
             }
 
             p[j * PXSTRIDE(p_stride) + i] =
@@ -208,16 +211,19 @@
 // i: Pixel summed and stored (between loops)
 // c: Pixel summed not stored
 // x: Pixel not summed not stored
-static void boxsum3(coef *dst, const pixel *src, const int w, const int h) {
+static void boxsum3(int32_t *sumsq, coef *sum, const pixel *src,
+                    const int w, const int h)
+{
     // We skip the first row, as it is never used
     src += REST_UNIT_STRIDE;
-    dst += REST_UNIT_STRIDE;
 
     // We skip the first and last columns, as they are never used
     for (int x = 1; x < w - 1; x++) {
-        coef *ds = dst + x;
+        coef *sum_v = sum + x;
+        int32_t *sumsq_v = sumsq + x;
         const pixel *s = src + x;
-        int a = s[0], b = s[REST_UNIT_STRIDE];
+        int a = s[0], a2 = a * a;
+        int b = s[REST_UNIT_STRIDE], b2 = b * b;
 
         // We skip the first 2 rows, as they are skipped in the next loop and
         // we don't need the last 2 rows as they are skipped in the next loop
@@ -224,28 +230,39 @@
         for (int y = 2; y < h - 2; y++) {
             s += REST_UNIT_STRIDE;
             const int c = s[REST_UNIT_STRIDE];
-            ds += REST_UNIT_STRIDE;
-            *ds = a + b + c;
+            const int c2 = c * c;
+            sum_v += REST_UNIT_STRIDE;
+            sumsq_v += REST_UNIT_STRIDE;
+            *sum_v = a + b + c;
+            *sumsq_v = a2 + b2 + c2;
             a = b;
+            a2 = b2;
             b = c;
+            b2 = c2;
         }
      }
 
-    // We skip the first 2 rows as they are never read
-    dst += REST_UNIT_STRIDE;
+    // We skip the first row as it is never read
+    sum += REST_UNIT_STRIDE;
+    sumsq += REST_UNIT_STRIDE;
     // We skip the last 2 rows as they are never read
     for (int y = 2; y < h - 2; y++) {
-        int a = dst[1], b = dst[2];
+        int a = sum[1], a2 = sumsq[1];
+        int b = sum[2], b2 = sumsq[2];
 
         // We don't store the first column as it is never read and
         // we don't store the last 2 columns as they are never read
         for (int x = 2; x < w - 2; x++) {
-            const int c = dst[x + 1];
-            dst[x] = a + b + c;
+            const int c = sum[x + 1], c2 = sumsq[x + 1];
+            sum[x] = a + b + c;
+            sumsq[x] = a2 + b2 + c2;
             a = b;
+            a2 = b2;
             b = c;
+            b2 = c2;
         }
-        dst += REST_UNIT_STRIDE;
+        sum += REST_UNIT_STRIDE;
+        sumsq += REST_UNIT_STRIDE;
     }
 }
 
@@ -271,142 +288,63 @@
 // i: Pixel summed and stored (between loops)
 // c: Pixel summed not stored
 // x: Pixel not summed not stored
-static void boxsum5(coef *dst, const pixel *const src, const int w, const int h) {
-    // We skip the first row, as it is never used
-    dst += REST_UNIT_STRIDE;
-
+static void boxsum5(int32_t *sumsq, coef *sum, const pixel *const src,
+                    const int w, const int h)
+{
     for (int x = 0; x < w; x++) {
-        coef *ds = dst + x;
+        coef *sum_v = sum + x;
+        int32_t *sumsq_v = sumsq + x;
         const pixel *s = src + 3 * REST_UNIT_STRIDE + x;
-        int a = s[-3 * REST_UNIT_STRIDE];
-        int b = s[-2 * REST_UNIT_STRIDE];
-        int c = s[-1 * REST_UNIT_STRIDE];
-        int d = s[0];
+        int a = s[-3 * REST_UNIT_STRIDE], a2 = a * a;
+        int b = s[-2 * REST_UNIT_STRIDE], b2 = b * b;
+        int c = s[-1 * REST_UNIT_STRIDE], c2 = c * c;
+        int d = s[0], d2 = d * d;
 
         // We skip the first 2 rows, as they are skipped in the next loop and
         // we don't need the last 2 rows as they are skipped in the next loop
         for (int y = 2; y < h - 2; y++) {
             s += REST_UNIT_STRIDE;
-            const int e = *s;
-            ds += REST_UNIT_STRIDE;
-            *ds = a + b + c + d + e;
+            const int e = *s, e2 = e * e;
+            sum_v += REST_UNIT_STRIDE;
+            sumsq_v += REST_UNIT_STRIDE;
+            *sum_v = a + b + c + d + e;
+            *sumsq_v = a2 + b2 + c2 + d2 + e2;
             a = b;
             b = c;
             c = d;
             d = e;
+            a2 = b2;
+            b2 = c2;
+            c2 = d2;
+            d2 = e2;
         }
     }
 
-    // We skip the first 2 rows as they are never read
-    dst += REST_UNIT_STRIDE;
-    for (int y = 2; y < h - 2; y++) {
-        int a = dst[0];
-        int b = dst[1];
-        int c = dst[2];
-        int d = dst[3];
-
-        for (int x = 2; x < w - 2; x++) {
-            const int e = dst[x + 2];
-            dst[x] = a + b + c + d + e;
-            a = b;
-            b = c;
-            c = d;
-            d = e;
-        }
-        dst += REST_UNIT_STRIDE;
-    }
-}
-
-// See boxsum3 function comments for details on row and column skipping
-static void boxsum3sqr(int32_t *dst, const pixel *src, const int w, const int h) {
-    // We skip the first row, as it is never used
-    src += REST_UNIT_STRIDE;
-    dst += REST_UNIT_STRIDE;
-
-    // We skip the first and last columns, as they are never used
-    for (int x = 1; x < w - 1; x++) {
-        int32_t *ds = dst + x;
-        const pixel *s = src + x;
-        int a = s[0] * s[0];
-        int b = s[REST_UNIT_STRIDE] * s[REST_UNIT_STRIDE];
-
-        // We skip the first row, as it is skipped in the next loop and
-        // we don't need the last row as it is skipped in the next loop
-        for (int y = 2; y < h - 2; y++) {
-            s += REST_UNIT_STRIDE;
-            const int c = s[REST_UNIT_STRIDE] * s[REST_UNIT_STRIDE];
-            ds += REST_UNIT_STRIDE;
-            *ds = a + b + c;
-            a = b;
-            b = c;
-        }
-     }
-
     // We skip the first row as it is never read
-    dst += REST_UNIT_STRIDE;
-    // We skip the last row as it is never read
+    sum += REST_UNIT_STRIDE;
+    sumsq += REST_UNIT_STRIDE;
     for (int y = 2; y < h - 2; y++) {
-        int a = dst[1], b = dst[2];
+        int a = sum[0], a2 = sumsq[0];
+        int b = sum[1], b2 = sumsq[1];
+        int c = sum[2], c2 = sumsq[2];
+        int d = sum[3], d2 = sumsq[3];
 
-        // We don't store the first column as it is never read and
-        // we don't store the last 2 columns as they are never read
         for (int x = 2; x < w - 2; x++) {
-            const int c = dst[x + 1];
-            dst[x] = a + b + c;
+            const int e = sum[x + 2], e2 = sumsq[x + 2];
+            sum[x] = a + b + c + d + e;
+            sumsq[x] = a2 + b2 + c2 + d2 + e2;
             a = b;
             b = c;
-        }
-        dst += REST_UNIT_STRIDE;
-    }
-}
-
-// See boxsum5 function comments for details on row and column skipping
-static void boxsum5sqr(int32_t *dst, const pixel *const src, const int w,
-                       const int h)
-{
-    // We skip the first row, as it is never used
-    dst += REST_UNIT_STRIDE;
-
-    for (int x = 0; x < w; x++) {
-        int32_t *ds = dst + x;
-        const pixel *s = src + 3 * REST_UNIT_STRIDE + x;
-        int a = s[-3 * REST_UNIT_STRIDE] * s[-3 * REST_UNIT_STRIDE];
-        int b = s[-2 * REST_UNIT_STRIDE] * s[-2 * REST_UNIT_STRIDE];
-        int c = s[-1 * REST_UNIT_STRIDE] * s[-1 * REST_UNIT_STRIDE];
-        int d = s[0] * s[0];
-
-        // We skip the first 2 rows, as they are skipped in the next loop and
-        // we don't need the last 2 row as it is skipped in the next loop
-        for (int y = 2; y < h - 2; y++) {
-            s += REST_UNIT_STRIDE;
-            const int e = s[0] * s[0];
-            ds += REST_UNIT_STRIDE;
-            *ds = a + b + c + d + e;
-            a = b;
-            b = c;
             c = d;
             d = e;
+            a2 = b2;
+            b2 = c2;
+            c2 = d2;
+            d2 = e2;
         }
+        sum += REST_UNIT_STRIDE;
+        sumsq += REST_UNIT_STRIDE;
     }
-
-    // We skip the first 2 rows as they are never read
-    dst += REST_UNIT_STRIDE;
-    for (int y = 2; y < h - 2; y++) {
-        int a = dst[0];
-        int b = dst[1];
-        int c = dst[2];
-        int d = dst[3];
-
-        for (int x = 2; x < w - 2; x++) {
-            const int e = dst[x + 2];
-            dst[x] = a + b + c + d + e;
-            a = b;
-            b = c;
-            c = d;
-            d = e;
-        }
-        dst += REST_UNIT_STRIDE;
-    }
 }
 
 static void selfguided_filter(coef *dst, const pixel *src,
@@ -418,21 +356,18 @@
 
     // Selfguided filter is applied to a maximum stripe height of 64 + 3 pixels
     // of padding above and below
-    int32_t A_[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
-    int32_t *A = A_ + 3 * REST_UNIT_STRIDE + 3;
+    int32_t sumsq[68 /*(64 + 2 + 2)*/ * REST_UNIT_STRIDE];
+    int32_t *A = sumsq + 2 * REST_UNIT_STRIDE + 3;
     // By inverting A and B after the boxsums, B can be of size coef instead
     // of int32_t
-    coef B_[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
-    coef *B = B_ + 3 * REST_UNIT_STRIDE + 3;
+    coef sum[68 /*(64 + 2 + 2)*/ * REST_UNIT_STRIDE];
+    coef *B = sum + 2 * REST_UNIT_STRIDE + 3;
 
     const int step = (n == 25) + 1;
-    if (n == 25) {
-        boxsum5(B_, src, w + 6, h + 6);
-        boxsum5sqr(A_, src, w + 6, h + 6);
-    } else {
-        boxsum3(B_, src, w + 6, h + 6);
-        boxsum3sqr(A_, src, w + 6, h + 6);
-    }
+    if (n == 25)
+        boxsum5(sumsq, sum, src, w + 6, h + 6);
+    else
+        boxsum3(sumsq, sum, src, w + 6, h + 6);
     const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
 
     int32_t *AA = A - REST_UNIT_STRIDE;
@@ -574,7 +509,7 @@
 }
 
 COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c, int bpc) {
-    c->wiener = wiener_c;
+    c->wiener[0] = c->wiener[1] = wiener_c;
     c->selfguided = selfguided_c;
 
 #if HAVE_ASM
--- a/src/lr_apply_tmpl.c
+++ b/src/lr_apply_tmpl.c
@@ -164,28 +164,36 @@
     // The first stripe of the frame is shorter by 8 luma pixel rows.
     int stripe_h = imin((64 - 8 * !y) >> ss_ver, row_h - y);
 
-    // FIXME [8] might be easier for SIMD
-    int16_t filterh[7], filterv[7];
+    ALIGN_STK_16(int16_t, filter, 2, [8]);
+    wienerfilter_fn wiener_fn = NULL;
     if (lr->type == DAV1D_RESTORATION_WIENER) {
-        filterh[0] = filterh[6] = lr->filter_h[0];
-        filterh[1] = filterh[5] = lr->filter_h[1];
-        filterh[2] = filterh[4] = lr->filter_h[2];
-        filterh[3] = -((filterh[0] + filterh[1] + filterh[2]) * 2);
+        filter[0][0] = filter[0][6] = lr->filter_h[0];
+        filter[0][1] = filter[0][5] = lr->filter_h[1];
+        filter[0][2] = filter[0][4] = lr->filter_h[2];
+        filter[0][3] = -(filter[0][0] + filter[0][1] + filter[0][2]) * 2;
+#if BITDEPTH != 8
+        /* For 8-bit SIMD it's beneficial to handle the +128 separately
+         * in order to avoid overflows. */
+        filter[0][3] += 128;
+#endif
 
-        filterv[0] = filterv[6] = lr->filter_v[0];
-        filterv[1] = filterv[5] = lr->filter_v[1];
-        filterv[2] = filterv[4] = lr->filter_v[2];
-        filterv[3] = -((filterv[0] + filterv[1] + filterv[2]) * 2);
+        filter[1][0] = filter[1][6] = lr->filter_v[0];
+        filter[1][1] = filter[1][5] = lr->filter_v[1];
+        filter[1][2] = filter[1][4] = lr->filter_v[2];
+        filter[1][3] = 128 - (filter[1][0] + filter[1][1] + filter[1][2]) * 2;
+
+        wiener_fn = dsp->lr.wiener[!(filter[0][0] | filter[1][0])];
+    } else {
+        assert(lr->type == DAV1D_RESTORATION_SGRPROJ);
     }
 
     while (y + stripe_h <= row_h) {
         // Change HAVE_BOTTOM bit in edges to (y + stripe_h != row_h)
         edges ^= (-(y + stripe_h != row_h) ^ edges) & LR_HAVE_BOTTOM;
-        if (lr->type == DAV1D_RESTORATION_WIENER) {
-            dsp->lr.wiener(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
-                           filterh, filterv, edges HIGHBD_CALL_SUFFIX);
+        if (wiener_fn) {
+            wiener_fn(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
+                      filter, edges HIGHBD_CALL_SUFFIX);
         } else {
-            assert(lr->type == DAV1D_RESTORATION_SGRPROJ);
             dsp->lr.selfguided(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
                                lr->sgr_idx, lr->sgr_weights, edges HIGHBD_CALL_SUFFIX);
         }
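
The filter[0][3]/filter[1][3] setup above folds the reference code's separate centre-pixel << 7 term into the middle tap (except for the 8-bit horizontal pass, where wiener_c keeps the * 128 outside the taps to avoid 16-bit overflows in SIMD). A small self-contained check of that equivalence; the tap and pixel values are arbitrary.

#include <assert.h>
#include <stdint.h>

int main(void) {
    const int16_t f[3] = { 3, -7, 15 };              /* arbitrary outer taps */
    const int16_t mid = -(f[0] + f[1] + f[2]) * 2;   /* DC tap before folding */
    const int16_t pix[7] = { 10, 20, 30, 40, 50, 60, 70 };

    /* Reference form: explicit centre-pixel * 128 term, unmodified taps. */
    const int16_t taps_old[7] = { f[0], f[1], f[2], mid, f[2], f[1], f[0] };
    int old_sum = pix[3] * 128;
    for (int k = 0; k < 7; k++) old_sum += pix[k] * taps_old[k];

    /* Folded form: the 128 is merged into the middle tap. */
    const int16_t taps_new[7] =
        { f[0], f[1], f[2], (int16_t)(mid + 128), f[2], f[1], f[0] };
    int new_sum = 0;
    for (int k = 0; k < 7; k++) new_sum += pix[k] * taps_new[k];

    assert(old_sum == new_sum);
    return 0;
}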
--- /dev/null
+++ b/src/mem.c
@@ -1,0 +1,119 @@
+/*
+ * Copyright © 2020, VideoLAN and dav1d authors
+ * Copyright © 2020, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdint.h>
+
+#include "src/internal.h"
+
+static COLD void mem_pool_destroy(Dav1dMemPool *const pool) {
+    pthread_mutex_destroy(&pool->lock);
+    free(pool);
+}
+
+void dav1d_mem_pool_push(Dav1dMemPool *const pool, Dav1dMemPoolBuffer *const buf) {
+    pthread_mutex_lock(&pool->lock);
+    const int ref_cnt = --pool->ref_cnt;
+    if (!pool->end) {
+        buf->next = pool->buf;
+        pool->buf = buf;
+        pthread_mutex_unlock(&pool->lock);
+        assert(ref_cnt > 0);
+    } else {
+        pthread_mutex_unlock(&pool->lock);
+        dav1d_free_aligned(buf->data);
+        if (!ref_cnt) mem_pool_destroy(pool);
+    }
+}
+
+Dav1dMemPoolBuffer *dav1d_mem_pool_pop(Dav1dMemPool *const pool, const size_t size) {
+    assert(!(size & (sizeof(void*) - 1)));
+    pthread_mutex_lock(&pool->lock);
+    Dav1dMemPoolBuffer *buf = pool->buf;
+    pool->ref_cnt++;
+    uint8_t *data;
+    if (buf) {
+        pool->buf = buf->next;
+        pthread_mutex_unlock(&pool->lock);
+        data = buf->data;
+        if ((uintptr_t)buf - (uintptr_t)data != size) {
+            /* Reallocate if the size has changed */
+            dav1d_free_aligned(data);
+            goto alloc;
+        }
+    } else {
+        pthread_mutex_unlock(&pool->lock);
+alloc:
+        data = dav1d_alloc_aligned(size + sizeof(Dav1dMemPoolBuffer), 64);
+        if (!data) {
+            pthread_mutex_lock(&pool->lock);
+            const int ref_cnt = --pool->ref_cnt;
+            pthread_mutex_unlock(&pool->lock);
+            if (!ref_cnt) mem_pool_destroy(pool);
+            return NULL;
+        }
+        buf = (Dav1dMemPoolBuffer*)(data + size);
+        buf->data = data;
+    }
+
+    return buf;
+}
+
+COLD int dav1d_mem_pool_init(Dav1dMemPool **const ppool) {
+    Dav1dMemPool *const pool = malloc(sizeof(Dav1dMemPool));
+    if (pool) {
+        if (!pthread_mutex_init(&pool->lock, NULL)) {
+            pool->buf = NULL;
+            pool->ref_cnt = 1;
+            pool->end = 0;
+            *ppool = pool;
+            return 0;
+        }
+        free(pool);
+    }
+    *ppool = NULL;
+    return DAV1D_ERR(ENOMEM);
+}
+
+COLD void dav1d_mem_pool_end(Dav1dMemPool *const pool) {
+    if (pool) {
+        pthread_mutex_lock(&pool->lock);
+        Dav1dMemPoolBuffer *buf = pool->buf;
+        const int ref_cnt = --pool->ref_cnt;
+        pool->buf = NULL;
+        pool->end = 1;
+        pthread_mutex_unlock(&pool->lock);
+
+        while (buf) {
+            void *const data = buf->data;
+            buf = buf->next;
+            dav1d_free_aligned(data);
+        }
+        if (!ref_cnt) mem_pool_destroy(pool);
+    }
+}
--- /dev/null
+++ b/src/mem.h
@@ -1,0 +1,103 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_MEM_H
+#define DAV1D_SRC_MEM_H
+
+#include <stdlib.h>
+
+#if defined(HAVE_ALIGNED_MALLOC) || defined(HAVE_MEMALIGN)
+#include <malloc.h>
+#endif
+
+#include "common/attributes.h"
+
+#include "src/thread.h"
+
+typedef struct Dav1dMemPoolBuffer {
+    void *data;
+    struct Dav1dMemPoolBuffer *next;
+} Dav1dMemPoolBuffer;
+
+typedef struct Dav1dMemPool {
+    pthread_mutex_t lock;
+    Dav1dMemPoolBuffer *buf;
+    int ref_cnt;
+    int end;
+} Dav1dMemPool;
+
+void dav1d_mem_pool_push(Dav1dMemPool *pool, Dav1dMemPoolBuffer *buf);
+Dav1dMemPoolBuffer *dav1d_mem_pool_pop(Dav1dMemPool *pool, size_t size);
+int dav1d_mem_pool_init(Dav1dMemPool **pool);
+void dav1d_mem_pool_end(Dav1dMemPool *pool);
+
+/*
+ * Allocate align-byte aligned memory. The return value can be released
+ * by calling the dav1d_free_aligned() function.
+ */
+static inline void *dav1d_alloc_aligned(size_t sz, size_t align) {
+    assert(!(align & (align - 1)));
+#ifdef HAVE_POSIX_MEMALIGN
+    void *ptr;
+    if (posix_memalign(&ptr, align, sz)) return NULL;
+    return ptr;
+#elif defined(HAVE_ALIGNED_MALLOC)
+    return _aligned_malloc(sz, align);
+#elif defined(HAVE_MEMALIGN)
+    return memalign(align, sz);
+#else
+#error Missing aligned alloc implementation
+#endif
+}
+
+static inline void dav1d_free_aligned(void* ptr) {
+#ifdef HAVE_POSIX_MEMALIGN
+    free(ptr);
+#elif defined(HAVE_ALIGNED_MALLOC)
+    _aligned_free(ptr);
+#elif defined(HAVE_MEMALIGN)
+    free(ptr);
+#endif
+}
+
+static inline void dav1d_freep_aligned(void* ptr) {
+    void **mem = (void **) ptr;
+    if (*mem) {
+        dav1d_free_aligned(*mem);
+        *mem = NULL;
+    }
+}
+
+static inline void freep(void *ptr) {
+    void **mem = (void **) ptr;
+    if (*mem) {
+        free(*mem);
+        *mem = NULL;
+    }
+}
+
+#endif /* DAV1D_SRC_MEM_H */
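
A minimal usage sketch of the pool API declared above, roughly following how lib.c initializes the pools and how the *_using_pool allocations elsewhere in this patch recycle fixed-size buffers; the 4096-byte payload and the loop are illustrative only, and the snippet assumes it is compiled inside the dav1d tree.

#include "src/mem.h"

static int pool_demo(void) {
    Dav1dMemPool *pool;
    if (dav1d_mem_pool_init(&pool)) return -1;     /* DAV1D_ERR(ENOMEM) */

    for (int i = 0; i < 4; i++) {
        /* Same size every time, so after the first iteration the buffer is
         * recycled instead of being reallocated. */
        Dav1dMemPoolBuffer *const buf = dav1d_mem_pool_pop(pool, 4096);
        if (!buf) break;
        /* ... use buf->data: 4096 bytes, 64-byte aligned ... */
        dav1d_mem_pool_push(pool, buf);            /* hand it back */
    }

    dav1d_mem_pool_end(pool);                      /* frees the cached buffers */
    return 0;
}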
--- a/src/meson.build
+++ b/src/meson.build
@@ -38,6 +38,7 @@
     'itx_1d.c',
     'lf_mask.c',
     'log.c',
+    'mem.c',
     'msac.c',
     'obu.c',
     'picture.c',
@@ -82,7 +83,7 @@
 )
 
 # ASM specific sources
-libdav1d_nasm_objs = []
+libdav1d_asm_objs = []
 # Arch-specific flags
 arch_flags = []
 if is_asm_enabled
@@ -102,7 +103,7 @@
         )
         if (host_machine.cpu_family() == 'aarch64' or
             host_machine.cpu() == 'arm64')
-            libdav1d_sources += files(
+            libdav1d_sources_asm = files(
                 # itx.S is used for both 8 and 16 bpc.
                 'arm/64/itx.S',
                 'arm/64/looprestoration_common.S',
@@ -110,7 +111,7 @@
             )
 
             if dav1d_bitdepths.contains('8')
-                libdav1d_sources += files(
+                libdav1d_sources_asm += files(
                     'arm/64/cdef.S',
                     'arm/64/ipred.S',
                     'arm/64/loopfilter.S',
@@ -120,7 +121,7 @@
             endif
 
             if dav1d_bitdepths.contains('16')
-                libdav1d_sources += files(
+                libdav1d_sources_asm += files(
                     'arm/64/cdef16.S',
                     'arm/64/ipred16.S',
                     'arm/64/itx16.S',
@@ -130,12 +131,13 @@
                 )
             endif
         elif host_machine.cpu_family().startswith('arm')
-            libdav1d_sources += files(
+            libdav1d_sources_asm = files(
+                'arm/32/looprestoration_common.S',
                 'arm/32/msac.S',
             )
 
             if dav1d_bitdepths.contains('8')
-                libdav1d_sources += files(
+                libdav1d_sources_asm += files(
                     'arm/32/cdef.S',
                     'arm/32/ipred.S',
                     'arm/32/itx.S',
@@ -146,11 +148,20 @@
             endif
 
             if dav1d_bitdepths.contains('16')
-                libdav1d_sources += files(
+                libdav1d_sources_asm += files(
+                    'arm/32/cdef16.S',
+                    'arm/32/loopfilter16.S',
+                    'arm/32/looprestoration16.S',
                     'arm/32/mc16.S',
                 )
             endif
         endif
+
+        if use_gaspp
+            libdav1d_asm_objs = gaspp_gen.process(libdav1d_sources_asm)
+        else
+            libdav1d_sources += libdav1d_sources_asm
+        endif
     elif host_machine.cpu_family().startswith('x86')
 
         libdav1d_sources += files(
@@ -190,7 +201,7 @@
                 'x86/ipred_ssse3.asm',
                 'x86/itx_ssse3.asm',
                 'x86/loopfilter_ssse3.asm',
-                'x86/looprestoration_ssse3.asm',
+                'x86/looprestoration_sse.asm',
                 'x86/mc_sse.asm',
             )
         endif
@@ -201,7 +212,7 @@
         endif
 
         # Compile the ASM sources with NASM
-        libdav1d_nasm_objs = nasm_gen.process(libdav1d_sources_asm)
+        libdav1d_asm_objs = nasm_gen.process(libdav1d_sources_asm)
     elif host_machine.cpu() == 'ppc64le'
         arch_flags = ['-maltivec', '-mvsx']
         libdav1d_sources += files(
@@ -223,17 +234,6 @@
 #
 
 if host_machine.system() == 'windows' and get_option('default_library') != 'static'
-    rc_version_array = meson.project_version().split('.')
-    winmod = import('windows')
-    rc_data = configuration_data()
-    rc_data.set('PROJECT_VERSION_MAJOR', rc_version_array[0])
-    rc_data.set('PROJECT_VERSION_MINOR', rc_version_array[1])
-    rc_data.set('PROJECT_VERSION_REVISION', rc_version_array[2])
-    rc_data.set('API_VERSION_MAJOR', dav1d_api_version_major)
-    rc_data.set('API_VERSION_MINOR', dav1d_api_version_minor)
-    rc_data.set('API_VERSION_REVISION', dav1d_api_version_revision)
-    rc_data.set('COPYRIGHT_YEARS', '2019')
-
     rc_file = configure_file(
         input : 'dav1d.rc.in',
         output : 'dav1d.rc',
@@ -302,7 +302,7 @@
 
 libdav1d = library('dav1d',
     libdav1d_sources,
-    libdav1d_nasm_objs,
+    libdav1d_asm_objs,
     libdav1d_rc_obj,
 
     objects : [
--- a/src/msac.c
+++ b/src/msac.c
@@ -101,17 +101,17 @@
 }
 
 int dav1d_msac_decode_subexp(MsacContext *const s, const int ref,
-                             const int n, const unsigned k)
+                             const int n, unsigned k)
 {
-    int i = 0;
-    int a = 0;
-    int b = k;
-    while ((2 << b) < n) {
-        if (!dav1d_msac_decode_bool_equi(s)) break;
-        b = k + i++;
-        a = (1 << b);
+    assert(n >> k == 8);
+
+    unsigned a = 0;
+    if (dav1d_msac_decode_bool_equi(s)) {
+        if (dav1d_msac_decode_bool_equi(s))
+            k += dav1d_msac_decode_bool_equi(s) + 1;
+        a = 1 << k;
     }
-    const unsigned v = dav1d_msac_decode_bools(s, b) + a;
+    const unsigned v = dav1d_msac_decode_bools(s, k) + a;
     return ref * 2 <= n ? inv_recenter(ref, v) :
                           n - 1 - inv_recenter(n - 1 - ref, v);
 }
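
The rewritten decoder relies on n >> k == 8, which bounds the removed while loop to at most three equiprobable bits. A small standalone check (k = 1 is an arbitrary value satisfying that assumption) that the new branch structure consumes the same number of bits and produces the same (k, a) pair as the old loop for every possible bit prefix.

#include <assert.h>

int main(void) {
    const int k = 1, n = 8 << k;                   /* any k with n >> k == 8 */
    for (int bits = 0; bits < 8; bits++) {
        const int seq[3] = { bits & 1, (bits >> 1) & 1, (bits >> 2) & 1 };

        /* Old structure: the removed while loop. */
        int i = 0, a_old = 0, b_old = k, pos = 0;
        while ((2 << b_old) < n) {
            if (!seq[pos++]) break;
            b_old = k + i++;
            a_old = 1 << b_old;
        }

        /* New structure: the nested ifs added above. */
        int k_new = k, a_new = 0, pos2 = 0;
        if (seq[pos2++]) {
            if (seq[pos2++])
                k_new += seq[pos2++] + 1;
            a_new = 1 << k_new;
        }

        assert(b_old == k_new && a_old == a_new && pos == pos2);
    }
    return 0;
}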
--- a/src/obu.c
+++ b/src/obu.c
@@ -57,7 +57,7 @@
     hdr->profile = dav1d_get_bits(gb, 3);
     if (hdr->profile > 2) goto error;
 #if DEBUG_SEQ_HDR
-    printf("SEQHDR: post-profile: off=%ld\n",
+    printf("SEQHDR: post-profile: off=%u\n",
            dav1d_get_bits_pos(gb) - init_bit_pos);
 #endif
 
@@ -65,7 +65,7 @@
     hdr->reduced_still_picture_header = dav1d_get_bits(gb, 1);
     if (hdr->reduced_still_picture_header && !hdr->still_picture) goto error;
 #if DEBUG_SEQ_HDR
-    printf("SEQHDR: post-stillpicture_flags: off=%ld\n",
+    printf("SEQHDR: post-stillpicture_flags: off=%u\n",
            dav1d_get_bits_pos(gb) - init_bit_pos);
 #endif
 
@@ -104,7 +104,7 @@
             hdr->decoder_model_info_present = 0;
         }
 #if DEBUG_SEQ_HDR
-        printf("SEQHDR: post-timinginfo: off=%ld\n",
+        printf("SEQHDR: post-timinginfo: off=%u\n",
                dav1d_get_bits_pos(gb) - init_bit_pos);
 #endif
 
@@ -114,6 +114,8 @@
             struct Dav1dSequenceHeaderOperatingPoint *const op =
                 &hdr->operating_points[i];
             op->idc = dav1d_get_bits(gb, 12);
+            if (op->idc && (!(op->idc & 0xff) || !(op->idc & 0xf00)))
+                goto error;
             op->major_level = 2 + dav1d_get_bits(gb, 3);
             op->minor_level = dav1d_get_bits(gb, 2);
             op->tier = op->major_level > 3 ? dav1d_get_bits(gb, 1) : 0;
@@ -138,7 +140,7 @@
             c->operating_point < hdr->num_operating_points ? c->operating_point : 0;
         c->operating_point_idc = hdr->operating_points[op_idx].idc;
 #if DEBUG_SEQ_HDR
-        printf("SEQHDR: post-operating-points: off=%ld\n",
+        printf("SEQHDR: post-operating-points: off=%u\n",
                dav1d_get_bits_pos(gb) - init_bit_pos);
 #endif
     }
@@ -148,7 +150,7 @@
     hdr->max_width = dav1d_get_bits(gb, hdr->width_n_bits) + 1;
     hdr->max_height = dav1d_get_bits(gb, hdr->height_n_bits) + 1;
 #if DEBUG_SEQ_HDR
-    printf("SEQHDR: post-size: off=%ld\n",
+    printf("SEQHDR: post-size: off=%u\n",
            dav1d_get_bits_pos(gb) - init_bit_pos);
 #endif
     hdr->frame_id_numbers_present =
@@ -158,7 +160,7 @@
         hdr->frame_id_n_bits = dav1d_get_bits(gb, 3) + hdr->delta_frame_id_n_bits + 1;
     }
 #if DEBUG_SEQ_HDR
-    printf("SEQHDR: post-frame-id-numbers-present: off=%ld\n",
+    printf("SEQHDR: post-frame-id-numbers-present: off=%u\n",
            dav1d_get_bits_pos(gb) - init_bit_pos);
 #endif
 
@@ -192,7 +194,7 @@
         }
         hdr->screen_content_tools = dav1d_get_bits(gb, 1) ? DAV1D_ADAPTIVE : dav1d_get_bits(gb, 1);
     #if DEBUG_SEQ_HDR
-        printf("SEQHDR: post-screentools: off=%ld\n",
+        printf("SEQHDR: post-screentools: off=%u\n",
                dav1d_get_bits_pos(gb) - init_bit_pos);
     #endif
         hdr->force_integer_mv = hdr->screen_content_tools ?
@@ -204,7 +206,7 @@
     hdr->cdef = dav1d_get_bits(gb, 1);
     hdr->restoration = dav1d_get_bits(gb, 1);
 #if DEBUG_SEQ_HDR
-    printf("SEQHDR: post-featurebits: off=%ld\n",
+    printf("SEQHDR: post-featurebits: off=%u\n",
            dav1d_get_bits_pos(gb) - init_bit_pos);
 #endif
 
@@ -264,13 +266,13 @@
     }
     hdr->separate_uv_delta_q = !hdr->monochrome && dav1d_get_bits(gb, 1);
 #if DEBUG_SEQ_HDR
-    printf("SEQHDR: post-colorinfo: off=%ld\n",
+    printf("SEQHDR: post-colorinfo: off=%u\n",
            dav1d_get_bits_pos(gb) - init_bit_pos);
 #endif
 
     hdr->film_grain_present = dav1d_get_bits(gb, 1);
 #if DEBUG_SEQ_HDR
-    printf("SEQHDR: post-filmgrain: off=%ld\n",
+    printf("SEQHDR: post-filmgrain: off=%u\n",
            dav1d_get_bits_pos(gb) - init_bit_pos);
 #endif
 
@@ -367,7 +369,7 @@
     hdr->show_existing_frame =
         !seqhdr->reduced_still_picture_header && dav1d_get_bits(gb, 1);
 #if DEBUG_FRAME_HDR
-    printf("HDR: post-show_existing_frame: off=%ld\n",
+    printf("HDR: post-show_existing_frame: off=%td\n",
            (gb->ptr - init_ptr) * 8 - gb->bits_left);
 #endif
     if (hdr->show_existing_frame) {
@@ -374,8 +376,11 @@
         hdr->existing_frame_idx = dav1d_get_bits(gb, 3);
         if (seqhdr->decoder_model_info_present && !seqhdr->equal_picture_interval)
             hdr->frame_presentation_delay = dav1d_get_bits(gb, seqhdr->frame_presentation_delay_length);
-        if (seqhdr->frame_id_numbers_present)
+        if (seqhdr->frame_id_numbers_present) {
             hdr->frame_id = dav1d_get_bits(gb, seqhdr->frame_id_n_bits);
+            Dav1dFrameHeader *const ref_frame_hdr = c->refs[hdr->existing_frame_idx].p.p.frame_hdr;
+            if (!ref_frame_hdr || ref_frame_hdr->frame_id != hdr->frame_id) return DAV1D_ERR(EINVAL);
+        }
         return 0;
     }
 
@@ -391,7 +396,7 @@
         hdr->frame_type == DAV1D_FRAME_TYPE_SWITCH ||
         seqhdr->reduced_still_picture_header || dav1d_get_bits(gb, 1);
 #if DEBUG_FRAME_HDR
-    printf("HDR: post-frametype_bits: off=%ld\n",
+    printf("HDR: post-frametype_bits: off=%td\n",
            (gb->ptr - init_ptr) * 8 - gb->bits_left);
 #endif
     hdr->disable_cdf_update = dav1d_get_bits(gb, 1);
@@ -412,7 +417,7 @@
     hdr->frame_size_override = seqhdr->reduced_still_picture_header ? 0 :
                                hdr->frame_type == DAV1D_FRAME_TYPE_SWITCH ? 1 : dav1d_get_bits(gb, 1);
 #if DEBUG_FRAME_HDR
-    printf("HDR: post-frame_size_override_flag: off=%ld\n",
+    printf("HDR: post-frame_size_override_flag: off=%td\n",
            (gb->ptr - init_ptr) * 8 - gb->bits_left);
 #endif
     hdr->frame_offset = seqhdr->order_hint ?
@@ -550,8 +555,12 @@
         for (int i = 0; i < 7; i++) {
             if (!hdr->frame_ref_short_signaling)
                 hdr->refidx[i] = dav1d_get_bits(gb, 3);
-            if (seqhdr->frame_id_numbers_present)
-                dav1d_get_bits(gb, seqhdr->delta_frame_id_n_bits);
+            if (seqhdr->frame_id_numbers_present) {
+                const int delta_ref_frame_id_minus_1 = dav1d_get_bits(gb, seqhdr->delta_frame_id_n_bits);
+                const int ref_frame_id = (hdr->frame_id + (1 << seqhdr->frame_id_n_bits) - delta_ref_frame_id_minus_1 - 1) & ((1 << seqhdr->frame_id_n_bits) - 1);
+                Dav1dFrameHeader *const ref_frame_hdr = c->refs[hdr->refidx[i]].p.p.frame_hdr;
+                if (!ref_frame_hdr || ref_frame_hdr->frame_id != ref_frame_id) goto error;
+            }
         }
         const int use_ref = !hdr->error_resilient_mode &&
                             hdr->frame_size_override;
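
The added check reconstructs the expected reference frame_id with masked modular arithmetic, so wraparound of the frame_id counter is handled without signed underflow. A worked example with made-up numbers (8-bit ids, current frame_id 3, delta_ref_frame_id_minus_1 4, i.e. an id delta of 5).

#include <assert.h>

int main(void) {
    const int frame_id_n_bits = 8;
    const int frame_id = 3, delta_minus_1 = 4;
    const int ref_frame_id = (frame_id + (1 << frame_id_n_bits) -
                              delta_minus_1 - 1) & ((1 << frame_id_n_bits) - 1);
    assert(ref_frame_id == 254);   /* 3 - 5 modulo 256 */
    return 0;
}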
@@ -565,7 +574,7 @@
             hdr->frame_type & 1 && dav1d_get_bits(gb, 1);
     }
 #if DEBUG_FRAME_HDR
-    printf("HDR: post-frametype-specific-bits: off=%ld\n",
+    printf("HDR: post-frametype-specific-bits: off=%td\n",
            (gb->ptr - init_ptr) * 8 - gb->bits_left);
 #endif
 
@@ -572,7 +581,7 @@
     hdr->refresh_context = !seqhdr->reduced_still_picture_header &&
                            !hdr->disable_cdf_update && !dav1d_get_bits(gb, 1);
 #if DEBUG_FRAME_HDR
-    printf("HDR: post-refresh_context: off=%ld\n",
+    printf("HDR: post-refresh_context: off=%td\n",
            (gb->ptr - init_ptr) * 8 - gb->bits_left);
 #endif
 
@@ -646,7 +655,7 @@
         hdr->tiling.n_bytes = hdr->tiling.update = 0;
     }
 #if DEBUG_FRAME_HDR
-    printf("HDR: post-tiling: off=%ld\n",
+    printf("HDR: post-tiling: off=%td\n",
            (gb->ptr - init_ptr) * 8 - gb->bits_left);
 #endif
 
@@ -669,7 +678,7 @@
         }
     }
 #if DEBUG_FRAME_HDR
-    printf("HDR: post-quant: off=%ld\n",
+    printf("HDR: post-quant: off=%td\n",
            (gb->ptr - init_ptr) * 8 - gb->bits_left);
 #endif
     hdr->quant.qm = dav1d_get_bits(gb, 1);
@@ -681,7 +690,7 @@
                                           hdr->quant.qm_u;
     }
 #if DEBUG_FRAME_HDR
-    printf("HDR: post-qm: off=%ld\n",
+    printf("HDR: post-qm: off=%td\n",
            (gb->ptr - init_ptr) * 8 - gb->bits_left);
 #endif
 
@@ -766,7 +775,7 @@
             hdr->segmentation.seg_data.d[i].ref = -1;
     }
 #if DEBUG_FRAME_HDR
-    printf("HDR: post-segmentation: off=%ld\n",
+    printf("HDR: post-segmentation: off=%td\n",
            (gb->ptr - init_ptr) * 8 - gb->bits_left);
 #endif
 
@@ -778,7 +787,7 @@
     hdr->delta.lf.res_log2 = hdr->delta.lf.present ? dav1d_get_bits(gb, 2) : 0;
     hdr->delta.lf.multi = hdr->delta.lf.present ? dav1d_get_bits(gb, 1) : 0;
 #if DEBUG_FRAME_HDR
-    printf("HDR: post-delta_q_lf_flags: off=%ld\n",
+    printf("HDR: post-delta_q_lf_flags: off=%td\n",
            (gb->ptr - init_ptr) * 8 - gb->bits_left);
 #endif
 
@@ -838,7 +847,7 @@
         }
     }
 #if DEBUG_FRAME_HDR
-    printf("HDR: post-lpf: off=%ld\n",
+    printf("HDR: post-lpf: off=%td\n",
            (gb->ptr - init_ptr) * 8 - gb->bits_left);
 #endif
 
@@ -857,7 +866,7 @@
         hdr->cdef.uv_strength[0] = 0;
     }
 #if DEBUG_FRAME_HDR
-    printf("HDR: post-cdef: off=%ld\n",
+    printf("HDR: post-cdef: off=%td\n",
            (gb->ptr - init_ptr) * 8 - gb->bits_left);
 #endif
 
@@ -899,7 +908,7 @@
         hdr->restoration.type[2] = DAV1D_RESTORATION_NONE;
     }
 #if DEBUG_FRAME_HDR
-    printf("HDR: post-restoration: off=%ld\n",
+    printf("HDR: post-restoration: off=%td\n",
            (gb->ptr - init_ptr) * 8 - gb->bits_left);
 #endif
 
@@ -906,12 +915,12 @@
     hdr->txfm_mode = hdr->all_lossless ? DAV1D_TX_4X4_ONLY :
                      dav1d_get_bits(gb, 1) ? DAV1D_TX_SWITCHABLE : DAV1D_TX_LARGEST;
 #if DEBUG_FRAME_HDR
-    printf("HDR: post-txfmmode: off=%ld\n",
+    printf("HDR: post-txfmmode: off=%td\n",
            (gb->ptr - init_ptr) * 8 - gb->bits_left);
 #endif
     hdr->switchable_comp_refs = hdr->frame_type & 1 ? dav1d_get_bits(gb, 1) : 0;
 #if DEBUG_FRAME_HDR
-    printf("HDR: post-refmode: off=%ld\n",
+    printf("HDR: post-refmode: off=%td\n",
            (gb->ptr - init_ptr) * 8 - gb->bits_left);
 #endif
     hdr->skip_mode_allowed = 0;
@@ -972,18 +981,18 @@
     }
     hdr->skip_mode_enabled = hdr->skip_mode_allowed ? dav1d_get_bits(gb, 1) : 0;
 #if DEBUG_FRAME_HDR
-    printf("HDR: post-extskip: off=%ld\n",
+    printf("HDR: post-extskip: off=%td\n",
            (gb->ptr - init_ptr) * 8 - gb->bits_left);
 #endif
     hdr->warp_motion = !hdr->error_resilient_mode && hdr->frame_type & 1 &&
         seqhdr->warped_motion && dav1d_get_bits(gb, 1);
 #if DEBUG_FRAME_HDR
-    printf("HDR: post-warpmotionbit: off=%ld\n",
+    printf("HDR: post-warpmotionbit: off=%td\n",
            (gb->ptr - init_ptr) * 8 - gb->bits_left);
 #endif
     hdr->reduced_txtp_set = dav1d_get_bits(gb, 1);
 #if DEBUG_FRAME_HDR
-    printf("HDR: post-reducedtxtpset: off=%ld\n",
+    printf("HDR: post-reducedtxtpset: off=%td\n",
            (gb->ptr - init_ptr) * 8 - gb->bits_left);
 #endif
 
@@ -1037,7 +1046,7 @@
         }
     }
 #if DEBUG_FRAME_HDR
-    printf("HDR: post-gmv: off=%ld\n",
+    printf("HDR: post-gmv: off=%td\n",
            (gb->ptr - init_ptr) * 8 - gb->bits_left);
 #endif
 
@@ -1121,7 +1130,7 @@
         memset(&hdr->film_grain.data, 0, sizeof(hdr->film_grain.data));
     }
 #if DEBUG_FRAME_HDR
-    printf("HDR: post-filmgrain: off=%ld\n",
+    printf("HDR: post-filmgrain: off=%td\n",
            (gb->ptr - init_ptr) * 8 - gb->bits_left);
 #endif
 
@@ -1227,7 +1236,8 @@
 
     switch (type) {
     case DAV1D_OBU_SEQ_HDR: {
-        Dav1dRef *ref = dav1d_ref_create(sizeof(Dav1dSequenceHeader));
+        Dav1dRef *ref = dav1d_ref_create_using_pool(c->seq_hdr_pool,
+                                                    sizeof(Dav1dSequenceHeader));
         if (!ref) return DAV1D_ERR(ENOMEM);
         Dav1dSequenceHeader *seq_hdr = ref->data;
         memset(seq_hdr, 0, sizeof(*seq_hdr));
@@ -1273,7 +1283,8 @@
         if (global) break;
         if (!c->seq_hdr) goto error;
         if (!c->frame_hdr_ref) {
-            c->frame_hdr_ref = dav1d_ref_create(sizeof(Dav1dFrameHeader));
+            c->frame_hdr_ref = dav1d_ref_create_using_pool(c->frame_hdr_pool,
+                                                           sizeof(Dav1dFrameHeader));
             if (!c->frame_hdr_ref) return DAV1D_ERR(ENOMEM);
         }
 #ifndef NDEBUG
@@ -1366,6 +1377,10 @@
         break;
     }
     case DAV1D_OBU_METADATA: {
+#define DEBUG_OBU_METADATA 0
+#if DEBUG_OBU_METADATA
+        const uint8_t *const init_ptr = gb.ptr;
+#endif
         // obu metadata type field
         const enum ObuMetaType meta_type = dav1d_get_uleb128(&gb);
         const int meta_type_len = (dav1d_get_bits_pos(&gb) - init_bit_pos) >> 3;
@@ -1378,7 +1393,17 @@
             Dav1dContentLightLevel *const content_light = ref->data;
 
             content_light->max_content_light_level = dav1d_get_bits(&gb, 16);
+#if DEBUG_OBU_METADATA
+            printf("CLLOBU: max-content-light-level: %d [off=%td]\n",
+                   content_light->max_content_light_level,
+                   (gb.ptr - init_ptr) * 8 - gb.bits_left);
+#endif
             content_light->max_frame_average_light_level = dav1d_get_bits(&gb, 16);
+#if DEBUG_OBU_METADATA
+            printf("CLLOBU: max-frame-average-light-level: %d [off=%td]\n",
+                   content_light->max_frame_average_light_level,
+                   (gb.ptr - init_ptr) * 8 - gb.bits_left);
+#endif
 
             // Skip the trailing bit, align to the next byte boundary and check for overrun.
             dav1d_get_bits(&gb, 1);
@@ -1401,13 +1426,37 @@
             for (int i = 0; i < 3; i++) {
                 mastering_display->primaries[i][0] = dav1d_get_bits(&gb, 16);
                 mastering_display->primaries[i][1] = dav1d_get_bits(&gb, 16);
+#if DEBUG_OBU_METADATA
+                printf("MDCVOBU: primaries[%d]: (%d, %d) [off=%td]\n", i,
+                       mastering_display->primaries[i][0],
+                       mastering_display->primaries[i][1],
+                       (gb.ptr - init_ptr) * 8 - gb.bits_left);
+#endif
             }
             mastering_display->white_point[0] = dav1d_get_bits(&gb, 16);
+#if DEBUG_OBU_METADATA
+            printf("MDCVOBU: white-point-x: %d [off=%td]\n",
+                   mastering_display->white_point[0],
+                   (gb.ptr - init_ptr) * 8 - gb.bits_left);
+#endif
             mastering_display->white_point[1] = dav1d_get_bits(&gb, 16);
-
+#if DEBUG_OBU_METADATA
+            printf("MDCVOBU: white-point-y: %d [off=%td]\n",
+                   mastering_display->white_point[1],
+                   (gb.ptr - init_ptr) * 8 - gb.bits_left);
+#endif
             mastering_display->max_luminance = dav1d_get_bits(&gb, 32);
+#if DEBUG_OBU_METADATA
+            printf("MDCVOBU: max-luminance: %d [off=%td]\n",
+                   mastering_display->max_luminance,
+                   (gb.ptr - init_ptr) * 8 - gb.bits_left);
+#endif
             mastering_display->min_luminance = dav1d_get_bits(&gb, 32);
-
+#if DEBUG_OBU_METADATA
+            printf("MDCVOBU: min-luminance: %d [off=%td]\n",
+                   mastering_display->min_luminance,
+                   (gb.ptr - init_ptr) * 8 - gb.bits_left);
+#endif
             // Skip the trailing bit, align to the next byte boundary and check for overrun.
             dav1d_get_bits(&gb, 1);
             dav1d_bytealign_get_bits(&gb);
--- a/src/picture.c
+++ b/src/picture.c
@@ -36,7 +36,6 @@
 #include <string.h>
 
 #include "common/intops.h"
-#include "common/mem.h"
 #include "common/validate.h"
 
 #include "src/internal.h"
@@ -47,7 +46,7 @@
 #include "src/thread_task.h"
 
 int dav1d_default_picture_alloc(Dav1dPicture *const p, void *const cookie) {
-    assert(cookie == NULL);
+    assert(sizeof(Dav1dMemPoolBuffer) <= DAV1D_PICTURE_ALIGNMENT);
     const int hbd = p->p.bpc > 8;
     const int aligned_w = (p->p.w + 127) & ~127;
     const int aligned_h = (p->p.h + 127) & ~127;
@@ -69,27 +68,24 @@
     p->stride[1] = uv_stride;
     const size_t y_sz = y_stride * aligned_h;
     const size_t uv_sz = uv_stride * (aligned_h >> ss_ver);
-    const size_t pic_size = y_sz + 2 * uv_sz + DAV1D_PICTURE_ALIGNMENT;
-    uint8_t *const data = dav1d_alloc_aligned(pic_size, DAV1D_PICTURE_ALIGNMENT);
-    if (!data) return DAV1D_ERR(ENOMEM);
+    const size_t pic_size = y_sz + 2 * uv_sz;
 
+    Dav1dMemPoolBuffer *const buf = dav1d_mem_pool_pop(cookie, pic_size +
+                                                       DAV1D_PICTURE_ALIGNMENT -
+                                                       sizeof(Dav1dMemPoolBuffer));
+    if (!buf) return DAV1D_ERR(ENOMEM);
+    p->allocator_data = buf;
+
+    uint8_t *const data = buf->data;
     p->data[0] = data;
     p->data[1] = has_chroma ? data + y_sz : NULL;
     p->data[2] = has_chroma ? data + y_sz + uv_sz : NULL;
 
-#ifndef NDEBUG /* safety check */
-    p->allocator_data = data;
-#endif
-
     return 0;
 }
 
 void dav1d_default_picture_release(Dav1dPicture *const p, void *const cookie) {
-    assert(cookie == NULL);
-#ifndef NDEBUG /* safety check */
-    assert(p->allocator_data == p->data[0]);
-#endif
-    dav1d_free_aligned(p->data[0]);
+    dav1d_mem_pool_push(cookie, p->allocator_data);
 }
 
 struct pic_ctx_context {
--- a/src/picture.h
+++ b/src/picture.h
@@ -52,6 +52,11 @@
     atomic_uint *progress;
 } Dav1dThreadPicture;
 
+typedef struct Dav1dPictureBuffer {
+    void *data;
+    struct Dav1dPictureBuffer *next;
+} Dav1dPictureBuffer;
+
 /*
  * Allocate a picture with custom border size.
  */
--- a/src/ppc/looprestoration_init_tmpl.c
+++ b/src/ppc/looprestoration_init_tmpl.c
@@ -49,7 +49,7 @@
 
 static void wiener_filter_h_vsx(int32_t *hor_ptr,
                                 uint8_t *tmp_ptr,
-                                const int16_t filterh[7],
+                                const int16_t filterh[8],
                                 const int w, const int h)
 {
     static const i32x4 zerov = vec_splats(0);
@@ -149,14 +149,10 @@
 } while (0)
 
 #define LOAD_AND_APPLY_FILTER_V(sumpixelv, hor) do { \
-    i32x4 v_1 = (i32x4) vec_ld( 0, &hor[(j + 3) * REST_UNIT_STRIDE + i]); \
-    i32x4 v_2 = (i32x4) vec_ld(16, &hor[(j + 3) * REST_UNIT_STRIDE + i]); \
-    i32x4 v_3 = (i32x4) vec_ld(32, &hor[(j + 3) * REST_UNIT_STRIDE + i]); \
-    i32x4 v_4 = (i32x4) vec_ld(48, &hor[(j + 3) * REST_UNIT_STRIDE + i]); \
-    i32x4 sum1 = -round_offset_vec; \
-    i32x4 sum2 = -round_offset_vec; \
-    i32x4 sum3 = -round_offset_vec; \
-    i32x4 sum4 = -round_offset_vec; \
+    i32x4 sum1 = round_vec; \
+    i32x4 sum2 = round_vec; \
+    i32x4 sum3 = round_vec; \
+    i32x4 sum4 = round_vec; \
     APPLY_FILTER_V(0, filterv0); \
     APPLY_FILTER_V(1, filterv1); \
     APPLY_FILTER_V(2, filterv2); \
@@ -164,31 +160,25 @@
     APPLY_FILTER_V(4, filterv4); \
     APPLY_FILTER_V(5, filterv5); \
     APPLY_FILTER_V(6, filterv6); \
-    sum1 = (v_1 << seven_vec) + sum1 + rounding_off_vec; \
-    sum2 = (v_2 << seven_vec) + sum2 + rounding_off_vec; \
-    sum3 = (v_3 << seven_vec) + sum3 + rounding_off_vec; \
-    sum4 = (v_4 << seven_vec) + sum4 + rounding_off_vec; \
     sum1 = sum1 >> round_bits_vec; \
     sum2 = sum2 >> round_bits_vec; \
     sum3 = sum3 >> round_bits_vec; \
     sum4 = sum4 >> round_bits_vec; \
-    i16x8 sum_short_packed_1 = (i16x8) vec_pack( sum1, sum2 ); \
-    i16x8 sum_short_packed_2 = (i16x8) vec_pack( sum3, sum4 ); \
+    i16x8 sum_short_packed_1 = (i16x8) vec_pack(sum1, sum2); \
+    i16x8 sum_short_packed_2 = (i16x8) vec_pack(sum3, sum4); \
     sum_short_packed_1 = iclip_u8_vec(sum_short_packed_1); \
     sum_short_packed_2 = iclip_u8_vec(sum_short_packed_2); \
-    sum_pixel = (u8x16) vec_pack(sum_short_packed_1, sum_short_packed_2 ); \
+    sum_pixel = (u8x16) vec_pack(sum_short_packed_1, sum_short_packed_2); \
 } while (0)
 
 static inline void wiener_filter_v_vsx(uint8_t *p,
                                        const ptrdiff_t p_stride,
                                        const int32_t *hor,
-                                       const int16_t filterv[7],
+                                       const int16_t filterv[8],
                                        const int w, const int h)
 {
     static const i32x4 round_bits_vec = vec_splats(11);
-    static const i32x4 rounding_off_vec = vec_splats(1 << 10);
-    static const i32x4 round_offset_vec = vec_splats(1 << 18);
-    static const i32x4 seven_vec = vec_splats(7);
+    static const i32x4 round_vec = vec_splats((1 << 10) - (1 << 18));
 
     i32x4 filterv0 =  vec_splats((int32_t) filterv[0]);
     i32x4 filterv1 =  vec_splats((int32_t) filterv[1]);
@@ -319,8 +309,7 @@
                               const uint8_t *lpf,
                               const ptrdiff_t lpf_stride,
                               const int w, const int h,
-                              const int16_t filterh[7],
-                              const int16_t filterv[7],
+                              const int16_t filter[2][8],
                               const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
 {
     // Wiener filtering is applied to a maximum stripe height of 64 + 3 pixels
@@ -329,8 +318,8 @@
     padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges);
     ALIGN_STK_16(int32_t, hor, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE + 64,);
 
-    wiener_filter_h_vsx(hor, tmp, filterh, w, h);
-    wiener_filter_v_vsx(p, p_stride, hor, filterv, w, h);
+    wiener_filter_h_vsx(hor, tmp, filter[0], w, h);
+    wiener_filter_v_vsx(p, p_stride, hor, filter[1], w, h);
 
 }
 #endif
@@ -343,7 +332,7 @@
     if (!(flags & DAV1D_PPC_CPU_FLAG_VSX)) return;
 
 #if BITDEPTH == 8
-    c->wiener = wiener_filter_vsx;
+    c->wiener[0] = c->wiener[1] = wiener_filter_vsx;
 #endif
 }
 
--- a/src/recon_tmpl.c
+++ b/src/recon_tmpl.c
@@ -36,7 +36,6 @@
 #include "common/bitdepth.h"
 #include "common/dump.h"
 #include "common/intops.h"
-#include "common/mem.h"
 
 #include "src/cdef_apply.h"
 #include "src/ctx.h"
--- a/src/ref.c
+++ b/src/ref.c
@@ -27,8 +27,6 @@
 
 #include "config.h"
 
-#include "common/mem.h"
-
 #include "src/ref.h"
 
 static void default_free_callback(const uint8_t *const data, void *const user_data) {
@@ -36,19 +34,43 @@
     dav1d_free_aligned(user_data);
 }
 
-Dav1dRef *dav1d_ref_create(const size_t size) {
-    void *data = dav1d_alloc_aligned(size, 32);
+Dav1dRef *dav1d_ref_create(size_t size) {
+    size = (size + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
+
+    uint8_t *const data = dav1d_alloc_aligned(size + sizeof(Dav1dRef), 64);
     if (!data) return NULL;
 
-    Dav1dRef *const res = dav1d_ref_wrap(data, default_free_callback, data);
-    if (res)
-        res->data = data;
-    else
-        dav1d_free_aligned(data);
+    Dav1dRef *const res = (Dav1dRef*)(data + size);
+    res->const_data = res->user_data = res->data = data;
+    atomic_init(&res->ref_cnt, 1);
+    res->free_ref = 0;
+    res->free_callback = default_free_callback;
 
     return res;
 }
 
+static void pool_free_callback(const uint8_t *const data, void *const user_data) {
+    dav1d_mem_pool_push((Dav1dMemPool*)data, user_data);
+}
+
+Dav1dRef *dav1d_ref_create_using_pool(Dav1dMemPool *const pool, size_t size) {
+    size = (size + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
+
+    Dav1dMemPoolBuffer *const buf =
+        dav1d_mem_pool_pop(pool, size + sizeof(Dav1dRef));
+    if (!buf) return NULL;
+
+    Dav1dRef *const res = &((Dav1dRef*)buf)[-1];
+    res->data = buf->data;
+    res->const_data = pool;
+    atomic_init(&res->ref_cnt, 1);
+    res->free_ref = 0;
+    res->free_callback = pool_free_callback;
+    res->user_data = buf;
+
+    return res;
+}
+
 Dav1dRef *dav1d_ref_wrap(const uint8_t *const ptr,
                          void (*free_callback)(const uint8_t *data, void *user_data),
                          void *const user_data)
@@ -59,6 +81,7 @@
     res->data = NULL;
     res->const_data = ptr;
     atomic_init(&res->ref_cnt, 1);
+    res->free_ref = 1;
     res->free_callback = free_callback;
     res->user_data = user_data;
 
@@ -76,8 +99,9 @@
     if (!ref) return;
 
     if (atomic_fetch_sub(&ref->ref_cnt, 1) == 1) {
+        const int free_ref = ref->free_ref;
         ref->free_callback(ref->const_data, ref->user_data);
-        free(ref);
+        if (free_ref) free(ref);
     }
     *pref = NULL;
 }
--- a/src/ref.h
+++ b/src/ref.h
@@ -30,6 +30,9 @@
 
 #include "dav1d/dav1d.h"
 
+#include "src/mem.h"
+#include "src/thread.h"
+
 #include <stdatomic.h>
 #include <stddef.h>
 
@@ -37,11 +40,13 @@
     void *data;
     const void *const_data;
     atomic_int ref_cnt;
+    int free_ref;
     void (*free_callback)(const uint8_t *data, void *user_data);
     void *user_data;
 };
 
 Dav1dRef *dav1d_ref_create(size_t size);
+Dav1dRef *dav1d_ref_create_using_pool(Dav1dMemPool *pool, size_t size);
 Dav1dRef *dav1d_ref_wrap(const uint8_t *ptr,
                          void (*free_callback)(const uint8_t *data, void *user_data),
                          void *user_data);
--- a/src/x86/looprestoration.asm
+++ b/src/x86/looprestoration.asm
@@ -29,21 +29,25 @@
 %if ARCH_X86_64
 
 SECTION_RODATA 32
+
+wiener_shufA:  db  1,  7,  2,  8,  3,  9,  4, 10,  5, 11,  6, 12,  7, 13,  8, 14
+wiener_shufB:  db  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10
+wiener_shufC:  db  6,  5,  7,  6,  8,  7,  9,  8, 10,  9, 11, 10, 12, 11, 13, 12
+wiener_shufD:  db  4, -1,  5, -1,  6, -1,  7, -1,  8, -1,  9, -1, 10, -1, 11, -1
+wiener_l_shuf: db  4,  4,  4,  4,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
+pb_0to31:      db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
+               db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
 pb_right_ext_mask: times 32 db 0xff
                    times 32 db 0
-pb_14x0_1_2: times 14 db 0
-             db 1, 2
-pb_0_to_15_min_n: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13
-                  db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14
-pb_15: times 16 db 15
-pw_16: times 2 dw 16
-pw_256: times 2 dw 256
-pw_2048: times 2 dw 2048
-pw_16380: times 2 dw 16380
-pw_0_128: dw 0, 128
-pw_5_6: dw 5, 6
-pd_6: dd 6
-pd_1024: dd 1024
+
+pb_3:          times 4 db 3
+pb_m5:         times 4 db -5
+pw_16:         times 2 dw 16
+pw_256:        times 2 dw 256
+pw_2056:       times 2 dw 2056
+pw_m16380:     times 2 dw -16380
+pw_5_6:        dw 5, 6
+pd_1024:       dd 1024
 pd_0xf0080029: dd 0xf0080029
 pd_0xf00801c7: dd 0xf00801c7
 
@@ -51,279 +55,662 @@
 
 SECTION .text
 
-INIT_YMM avx2
-cglobal wiener_filter_h, 5, 12, 16, dst, left, src, stride, fh, w, h, edge
-    mov        edged, edgem
-    vpbroadcastb m15, [fhq+0]
-    movifnidn     wd, wm
-    vpbroadcastb m14, [fhq+2]
-    mov           hd, hm
-    vpbroadcastb m13, [fhq+4]
-    vpbroadcastw m12, [fhq+6]
-    vpbroadcastd m11, [pw_2048]
-    vpbroadcastd m10, [pw_16380]
-    lea          r11, [pb_right_ext_mask]
+%macro REPX 2-*
+    %xdefine %%f(x) %1
+%rep %0 - 1
+    %rotate 1
+    %%f(%1)
+%endrep
+%endmacro
 
-    DEFINE_ARGS dst, left, src, stride, x, w, h, edge, srcptr, dstptr, xlim
+DECLARE_REG_TMP 4, 9, 7, 11, 12, 13, 14 ; wiener ring buffer pointers
 
-    ; if (edge & has_right) align_w_to_32
-    ; else w -= 32, and use that as limit in x loop
-    test       edgeb, 2 ; has_right
-    jnz .align
-    mov        xlimq, -3
-    jmp .loop
-.align:
-    add           wd, 31
-    and           wd, ~31
-    xor        xlimd, xlimd
-
-    ; main y loop for vertical filter
-.loop:
-    mov      srcptrq, srcq
-    mov      dstptrq, dstq
-    lea           xq, [wq+xlimq]
-
-    ; load left edge pixels
-    test       edgeb, 1 ; have_left
-    jz .emu_left
-    test       leftq, leftq ; left == NULL for the edge-extended bottom/top
-    jz .load_left_combined
-    movd         xm0, [leftq]
-    add        leftq, 4
-    pinsrd       xm0, [srcq], 1
-    pslldq       xm0, 9
-    jmp .left_load_done
-.load_left_combined:
-    movq         xm0, [srcq-3]
-    pslldq       xm0, 10
-    jmp .left_load_done
-.emu_left:
-    movd         xm0, [srcq]
-    pshufb       xm0, [pb_14x0_1_2]
-
-    ; load right edge pixels
-.left_load_done:
-    cmp           xd, 32
-    jg .main_load
-    test          xd, xd
-    jg .load_and_splat
-    je .splat_right
-
-    ; for very small images (w=[1-2]), edge-extend the original cache,
-    ; ugly, but only runs in very odd cases
-    add           wd, wd
-    pshufb       xm0, [r11-pb_right_ext_mask+pb_0_to_15_min_n+wq*8-16]
-    shr           wd, 1
-
-    ; main x loop, mostly this starts in .main_load
-.splat_right:
-    ; no need to load new pixels, just extend them from the (possibly previously
-    ; extended) previous load into m0
-    pshufb       xm1, xm0, [pb_15]
-    jmp .main_loop
-.load_and_splat:
-    ; load new pixels and extend edge for right-most
-    movu          m1, [srcptrq+3]
-    sub          r11, xq
-    movu          m2, [r11-pb_right_ext_mask+pb_right_ext_mask+32]
-    add          r11, xq
-    vpbroadcastb  m3, [srcptrq+2+xq]
-    pand          m1, m2
-    pandn         m3, m2, m3
-    por           m1, m3
-    jmp .main_loop
-.main_load:
-    ; load subsequent line
-    movu          m1, [srcptrq+3]
+INIT_YMM avx2
+cglobal wiener_filter7, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
+                                               lpf_stride, w, edge, flt, h
+    mov           fltq, fltmp
+    mov          edged, r8m
+    mov             wd, wm
+    mov             hd, r6m
+    vbroadcasti128  m6, [wiener_shufA]
+    vpbroadcastb   m11, [fltq+ 0] ; x0 x0
+    vbroadcasti128  m7, [wiener_shufB]
+    vpbroadcastd   m12, [fltq+ 2]
+    vbroadcasti128  m8, [wiener_shufC]
+    packsswb       m12, m12       ; x1 x2
+    vpbroadcastw   m13, [fltq+ 6] ; x3
+    vbroadcasti128  m9, [wiener_shufD]
+    add           lpfq, wq
+    vpbroadcastd   m10, [pw_m16380]
+    lea             t1, [rsp+wq*2+16]
+    vpbroadcastd   m14, [fltq+16] ; y0 y1
+    add           dstq, wq
+    vpbroadcastd   m15, [fltq+20] ; y2 y3
+    neg             wq
+    test         edgeb, 4 ; LR_HAVE_TOP
+    jz .no_top
+    call .h_top
+    add           lpfq, lpf_strideq
+    mov             t6, t1
+    mov             t5, t1
+    add             t1, 384*2
+    call .h_top
+    lea             r7, [lpfq+lpf_strideq*4]
+    mov           lpfq, dstq
+    mov             t4, t1
+    add             t1, 384*2
+    mov      [rsp+8*1], lpf_strideq
+    add             r7, lpf_strideq
+    mov      [rsp+8*0], r7 ; below
+    call .h
+    mov             t3, t1
+    mov             t2, t1
+    dec             hd
+    jz .v1
+    add           lpfq, dst_strideq
+    add             t1, 384*2
+    call .h
+    mov             t2, t1
+    dec             hd
+    jz .v2
+    add           lpfq, dst_strideq
+    add             t1, 384*2
+    call .h
+    dec             hd
+    jz .v3
+.main:
+    lea             t0, [t1+384*2]
 .main_loop:
-    vinserti128   m0, xm1, 1
-
-    palignr       m2, m1, m0, 10
-    palignr       m3, m1, m0, 11
-    palignr       m4, m1, m0, 12
-    palignr       m5, m1, m0, 13
-    palignr       m6, m1, m0, 14
-    palignr       m7, m1, m0, 15
-
-    punpcklbw     m0, m2, m1
-    punpckhbw     m2, m1
-    punpcklbw     m8, m3, m7
-    punpckhbw     m3, m7
-    punpcklbw     m7, m4, m6
-    punpckhbw     m4, m6
-    pxor          m9, m9
-    punpcklbw     m6, m5, m9
-    punpckhbw     m5, m9
-
-    pmaddubsw     m0, m15
-    pmaddubsw     m2, m15
-    pmaddubsw     m8, m14
-    pmaddubsw     m3, m14
-    pmaddubsw     m7, m13
-    pmaddubsw     m4, m13
-    paddw         m0, m8
-    paddw         m2, m3
-    psllw         m8, m6, 7
-    psllw         m3, m5, 7
-    psubw         m8, m10
-    psubw         m3, m10
-    pmullw        m6, m12
-    pmullw        m5, m12
-    paddw         m0, m7
-    paddw         m2, m4
-    paddw         m0, m6
-    paddw         m2, m5
-    ; for a signed overflow to happen we need filter and pixels as follow:
-    ; filter => -5,-23,-17,90,-17,-23,-5
-    ; pixels => 255,255,255,0,255,255,255 or 0,0,0,255,0,0,0
-    ; m0 would fall in the range [-59A6;+59A6] = [A65A;59A6]
-    ; m8 would fall in the range [-3FFC;+3F84] = [C004;3F84]
-    ;  32-bit arithmetic m0+m8 = [-99A2;+992A] = [FFFF665E;992A]
-    ; => signed 16-bit overflow occurs
-    paddsw        m0, m8  ; paddsw clips this range to [-8000;+7FFF]
-    paddsw        m2, m3
-    psraw         m0, 3   ; shift changes the range to [-1000;+FFF]
-    psraw         m2, 3
-    paddw         m0, m11 ; adding back 800 (removed in m8) changes the
-    paddw         m2, m11 ; range to [-800;+17FF] as defined in the spec
-    mova   [dstptrq], xm0 ; (note that adding another 800 would give us
-    mova [dstptrq+16], xm2;  the same range as in the C code => [0;1FFF])
-    vextracti128 [dstptrq+32], m0, 1
-    vextracti128 [dstptrq+48], m2, 1
-    vextracti128 xm0, m1, 1
-    add      srcptrq, 32
-    add      dstptrq, 64
-    sub           xq, 32
-    cmp           xd, 32
-    jg .main_load
-    test          xd, xd
-    jg .load_and_splat
-    cmp           xd, xlimd
-    jg .splat_right
+    call .hv
+    dec             hd
+    jnz .main_loop
+    test         edgeb, 8 ; LR_HAVE_BOTTOM
+    jz .v3
+    mov           lpfq, [rsp+8*0]
+    call .hv_bottom
+    add           lpfq, [rsp+8*1]
+    call .hv_bottom
+.v1:
+    call .v
+    RET
+.no_top:
+    lea             r7, [lpfq+lpf_strideq*4]
+    mov           lpfq, dstq
+    mov      [rsp+8*1], lpf_strideq
+    lea             r7, [r7+lpf_strideq*2]
+    mov      [rsp+8*0], r7
+    call .h
+    mov             t6, t1
+    mov             t5, t1
+    mov             t4, t1
+    mov             t3, t1
+    mov             t2, t1
+    dec             hd
+    jz .v1
+    add           lpfq, dst_strideq
+    add             t1, 384*2
+    call .h
+    mov             t2, t1
+    dec             hd
+    jz .v2
+    add           lpfq, dst_strideq
+    add             t1, 384*2
+    call .h
+    dec             hd
+    jz .v3
+    lea             t0, [t1+384*2]
+    call .hv
+    dec             hd
+    jz .v3
+    add             t0, 384*8
+    call .hv
+    dec             hd
+    jnz .main
+.v3:
+    call .v
+.v2:
+    call .v
+    jmp .v1
+.extend_right:
+    movd           xm2, r10d
+    vpbroadcastd    m0, [pb_3]
+    vpbroadcastd    m1, [pb_m5]
+    vpbroadcastb    m2, xm2
+    movu            m3, [pb_0to31]
+    psubb           m0, m2
+    psubb           m1, m2
+    pminub          m0, m3
+    pminub          m1, m3
+    pshufb          m4, m0
+    pshufb          m5, m1
+    ret
+.h:
+    mov            r10, wq
+    test         edgeb, 1 ; LR_HAVE_LEFT
+    jz .h_extend_left
+    movd           xm4, [leftq]
+    vpblendd        m4, [lpfq+r10-4], 0xfe
+    add          leftq, 4
+    jmp .h_main
+.h_extend_left:
+    vbroadcasti128  m5, [lpfq+r10] ; avoid accessing memory located
+    mova            m4, [lpfq+r10] ; before the start of the buffer
+    palignr         m4, m5, 12
+    pshufb          m4, [wiener_l_shuf]
+    jmp .h_main
+.h_top:
+    mov            r10, wq
+    movu            m4, [lpfq+r10-4]
+    test         edgeb, 1 ; LR_HAVE_LEFT
+    jnz .h_main
+    pshufb          m4, [wiener_l_shuf]
+    jmp .h_main
+.h_loop:
+    movu            m4, [lpfq+r10-4]
+.h_main:
+    movu            m5, [lpfq+r10+4]
+    test         edgeb, 2 ; LR_HAVE_RIGHT
+    jnz .h_have_right
+    cmp           r10d, -34
+    jl .h_have_right
+    call .extend_right
+.h_have_right:
+    pshufb          m0, m4, m6
+    pmaddubsw       m0, m11
+    pshufb          m1, m5, m6
+    pmaddubsw       m1, m11
+    pshufb          m2, m4, m7
+    pmaddubsw       m2, m12
+    pshufb          m3, m5, m7
+    pmaddubsw       m3, m12
+    paddw           m0, m2
+    pshufb          m2, m4, m8
+    pmaddubsw       m2, m12
+    paddw           m1, m3
+    pshufb          m3, m5, m8
+    pmaddubsw       m3, m12
+    pshufb          m4, m9
+    paddw           m0, m2
+    pmullw          m2, m4, m13
+    pshufb          m5, m9
+    paddw           m1, m3
+    pmullw          m3, m5, m13
+    psllw           m4, 7
+    psllw           m5, 7
+    paddw           m4, m10
+    paddw           m5, m10
+    paddw           m0, m2
+    vpbroadcastd    m2, [pw_2056]
+    paddw           m1, m3
+    paddsw          m0, m4
+    paddsw          m1, m5
+    psraw           m0, 3
+    psraw           m1, 3
+    paddw           m0, m2
+    paddw           m1, m2
+    mova [t1+r10*2+ 0], m0
+    mova [t1+r10*2+32], m1
+    add            r10, 32
+    jl .h_loop
+    ret
+ALIGN function_align
+.hv:
+    add           lpfq, dst_strideq
+    mov            r10, wq
+    test         edgeb, 1 ; LR_HAVE_LEFT
+    jz .hv_extend_left
+    movd           xm4, [leftq]
+    vpblendd        m4, [lpfq+r10-4], 0xfe
+    add          leftq, 4
+    jmp .hv_main
+.hv_extend_left:
+    movu            m4, [lpfq+r10-4]
+    pshufb          m4, [wiener_l_shuf]
+    jmp .hv_main
+.hv_bottom:
+    mov            r10, wq
+    test         edgeb, 1 ; LR_HAVE_LEFT
+    jz .hv_extend_left
+.hv_loop:
+    movu            m4, [lpfq+r10-4]
+.hv_main:
+    movu            m5, [lpfq+r10+4]
+    test         edgeb, 2 ; LR_HAVE_RIGHT
+    jnz .hv_have_right
+    cmp           r10d, -34
+    jl .hv_have_right
+    call .extend_right
+.hv_have_right:
+    pshufb          m0, m4, m6
+    pmaddubsw       m0, m11
+    pshufb          m1, m5, m6
+    pmaddubsw       m1, m11
+    pshufb          m2, m4, m7
+    pmaddubsw       m2, m12
+    pshufb          m3, m5, m7
+    pmaddubsw       m3, m12
+    paddw           m0, m2
+    pshufb          m2, m4, m8
+    pmaddubsw       m2, m12
+    paddw           m1, m3
+    pshufb          m3, m5, m8
+    pmaddubsw       m3, m12
+    pshufb          m4, m9
+    paddw           m0, m2
+    pmullw          m2, m4, m13
+    pshufb          m5, m9
+    paddw           m1, m3
+    pmullw          m3, m5, m13
+    psllw           m4, 7
+    psllw           m5, 7
+    paddw           m4, m10
+    paddw           m5, m10
+    paddw           m0, m2
+    paddw           m1, m3
+    mova            m2, [t4+r10*2]
+    paddw           m2, [t2+r10*2]
+    mova            m3, [t3+r10*2]
+    paddsw          m0, m4
+    vpbroadcastd    m4, [pw_2056]
+    paddsw          m1, m5
+    mova            m5, [t5+r10*2]
+    paddw           m5, [t1+r10*2]
+    psraw           m0, 3
+    psraw           m1, 3
+    paddw           m0, m4
+    paddw           m1, m4
+    paddw           m4, m0, [t6+r10*2]
+    mova    [t0+r10*2], m0
+    punpcklwd       m0, m2, m3
+    pmaddwd         m0, m15
+    punpckhwd       m2, m3
+    pmaddwd         m2, m15
+    punpcklwd       m3, m4, m5
+    pmaddwd         m3, m14
+    punpckhwd       m4, m5
+    pmaddwd         m4, m14
+    paddd           m0, m3
+    paddd           m4, m2
+    mova            m2, [t4+r10*2+32]
+    paddw           m2, [t2+r10*2+32]
+    mova            m3, [t3+r10*2+32]
+    mova            m5, [t5+r10*2+32]
+    paddw           m5, [t1+r10*2+32]
+    psrad           m0, 11
+    psrad           m4, 11
+    packssdw        m0, m4
+    paddw           m4, m1, [t6+r10*2+32]
+    mova [t0+r10*2+32], m1
+    punpcklwd       m1, m2, m3
+    pmaddwd         m1, m15
+    punpckhwd       m2, m3
+    pmaddwd         m2, m15
+    punpcklwd       m3, m4, m5
+    pmaddwd         m3, m14
+    punpckhwd       m4, m5
+    pmaddwd         m4, m14
+    paddd           m1, m3
+    paddd           m2, m4
+    psrad           m1, 11
+    psrad           m2, 11
+    packssdw        m1, m2
+    packuswb        m0, m1
+    mova    [dstq+r10], m0
+    add            r10, 32
+    jl .hv_loop
+    mov             t6, t5
+    mov             t5, t4
+    mov             t4, t3
+    mov             t3, t2
+    mov             t2, t1
+    mov             t1, t0
+    mov             t0, t6
+    add           dstq, dst_strideq
+    ret
+.v:
+    mov            r10, wq
+.v_loop:
+    mova            m2, [t4+r10*2+ 0]
+    paddw           m2, [t2+r10*2+ 0]
+    mova            m4, [t3+r10*2+ 0]
+    mova            m6, [t1+r10*2+ 0]
+    paddw           m8, m6, [t6+r10*2+ 0]
+    paddw           m6, [t5+r10*2+ 0]
+    mova            m3, [t4+r10*2+32]
+    paddw           m3, [t2+r10*2+32]
+    mova            m5, [t3+r10*2+32]
+    mova            m7, [t1+r10*2+32]
+    paddw           m9, m7, [t6+r10*2+32]
+    paddw           m7, [t5+r10*2+32]
+    punpcklwd       m0, m2, m4
+    pmaddwd         m0, m15
+    punpckhwd       m2, m4
+    pmaddwd         m2, m15
+    punpcklwd       m4, m8, m6
+    pmaddwd         m4, m14
+    punpckhwd       m6, m8, m6
+    pmaddwd         m6, m14
+    punpcklwd       m1, m3, m5
+    pmaddwd         m1, m15
+    punpckhwd       m3, m5
+    pmaddwd         m3, m15
+    punpcklwd       m5, m9, m7
+    pmaddwd         m5, m14
+    punpckhwd       m7, m9, m7
+    pmaddwd         m7, m14
+    paddd           m0, m4
+    paddd           m2, m6
+    paddd           m1, m5
+    paddd           m3, m7
+    REPX {psrad x, 11}, m0, m2, m1, m3
+    packssdw        m0, m2
+    packssdw        m1, m3
+    packuswb        m0, m1
+    mova    [dstq+r10], m0
+    add            r10, 32
+    jl .v_loop
+    mov             t6, t5
+    mov             t5, t4
+    mov             t4, t3
+    mov             t3, t2
+    mov             t2, t1
+    add           dstq, dst_strideq
+    ret
 
-    add         srcq, strideq
-    add         dstq, 384*2
-    dec           hd
-    jg .loop
+cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
+                                             lpf_stride, w, edge, flt, h
+    mov           fltq, fltmp
+    mov          edged, r8m
+    mov             wd, wm
+    mov             hd, r6m
+    vbroadcasti128  m6, [wiener_shufB]
+    vpbroadcastd   m12, [fltq+ 2]
+    vbroadcasti128  m7, [wiener_shufC]
+    packsswb       m12, m12       ; x1 x2
+    vpbroadcastw   m13, [fltq+ 6] ; x3
+    vbroadcasti128  m8, [wiener_shufD]
+    add           lpfq, wq
+    vpbroadcastd    m9, [pw_m16380]
+    vpbroadcastd   m10, [pw_2056]
+    lea             t1, [rsp+wq*2+16]
+    mova           m11, [wiener_l_shuf]
+    vpbroadcastd   m14, [fltq+16] ; __ y1
+    add           dstq, wq
+    vpbroadcastd   m15, [fltq+20] ; y2 y3
+    neg             wq
+    test         edgeb, 4 ; LR_HAVE_TOP
+    jz .no_top
+    call .h_top
+    add           lpfq, lpf_strideq
+    mov             t4, t1
+    add             t1, 384*2
+    call .h_top
+    lea             r7, [lpfq+lpf_strideq*4]
+    mov           lpfq, dstq
+    mov             t3, t1
+    add             t1, 384*2
+    mov      [rsp+8*1], lpf_strideq
+    add             r7, lpf_strideq
+    mov      [rsp+8*0], r7 ; below
+    call .h
+    mov             t2, t1
+    dec             hd
+    jz .v1
+    add           lpfq, dst_strideq
+    add             t1, 384*2
+    call .h
+    dec             hd
+    jz .v2
+.main:
+    mov             t0, t4
+.main_loop:
+    call .hv
+    dec             hd
+    jnz .main_loop
+    test         edgeb, 8 ; LR_HAVE_BOTTOM
+    jz .v2
+    mov           lpfq, [rsp+8*0]
+    call .hv_bottom
+    add           lpfq, [rsp+8*1]
+    call .hv_bottom
+.end:
     RET
+.no_top:
+    lea             r7, [lpfq+lpf_strideq*4]
+    mov           lpfq, dstq
+    mov      [rsp+8*1], lpf_strideq
+    lea             r7, [r7+lpf_strideq*2]
+    mov      [rsp+8*0], r7
+    call .h
+    mov             t4, t1
+    mov             t3, t1
+    mov             t2, t1
+    dec             hd
+    jz .v1
+    add           lpfq, dst_strideq
+    add             t1, 384*2
+    call .h
+    dec             hd
+    jz .v2
+    lea             t0, [t1+384*2]
+    call .hv
+    dec             hd
+    jz .v2
+    add             t0, 384*6
+    call .hv
+    dec             hd
+    jnz .main
+.v2:
+    call .v
+    mov             t4, t3
+    mov             t3, t2
+    mov             t2, t1
+    add           dstq, dst_strideq
+.v1:
+    call .v
+    jmp .end
+.h:
+    mov            r10, wq
+    test         edgeb, 1 ; LR_HAVE_LEFT
+    jz .h_extend_left
+    movd           xm4, [leftq]
+    vpblendd        m4, [lpfq+r10-4], 0xfe
+    add          leftq, 4
+    jmp .h_main
+.h_extend_left:
+    vbroadcasti128  m5, [lpfq+r10] ; avoid accessing memory located
+    mova            m4, [lpfq+r10] ; before the start of the buffer
+    palignr         m4, m5, 12
+    pshufb          m4, m11
+    jmp .h_main
+.h_top:
+    mov            r10, wq
+    movu            m4, [lpfq+r10-4]
+    test         edgeb, 1 ; LR_HAVE_LEFT
+    jnz .h_main
+    pshufb          m4, m11
+    jmp .h_main
+.h_loop:
+    movu            m4, [lpfq+r10-4]
+.h_main:
+    movu            m5, [lpfq+r10+4]
+    test         edgeb, 2 ; LR_HAVE_RIGHT
+    jnz .h_have_right
+    cmp           r10d, -33
+    jl .h_have_right
+    call mangle(private_prefix %+ _wiener_filter7_avx2).extend_right
+.h_have_right:
+    pshufb          m0, m4, m6
+    pmaddubsw       m0, m12
+    pshufb          m1, m5, m6
+    pmaddubsw       m1, m12
+    pshufb          m2, m4, m7
+    pmaddubsw       m2, m12
+    pshufb          m3, m5, m7
+    pmaddubsw       m3, m12
+    pshufb          m4, m8
+    paddw           m0, m2
+    pmullw          m2, m4, m13
+    pshufb          m5, m8
+    paddw           m1, m3
+    pmullw          m3, m5, m13
+    psllw           m4, 7
+    psllw           m5, 7
+    paddw           m4, m9
+    paddw           m5, m9
+    paddw           m0, m2
+    paddw           m1, m3
+    paddsw          m0, m4
+    paddsw          m1, m5
+    psraw           m0, 3
+    psraw           m1, 3
+    paddw           m0, m10
+    paddw           m1, m10
+    mova [t1+r10*2+ 0], m0
+    mova [t1+r10*2+32], m1
+    add            r10, 32
+    jl .h_loop
+    ret
+ALIGN function_align
+.hv:
+    add           lpfq, dst_strideq
+    mov            r10, wq
+    test         edgeb, 1 ; LR_HAVE_LEFT
+    jz .hv_extend_left
+    movd           xm4, [leftq]
+    vpblendd        m4, [lpfq+r10-4], 0xfe
+    add          leftq, 4
+    jmp .hv_main
+.hv_extend_left:
+    movu            m4, [lpfq+r10-4]
+    pshufb          m4, m11
+    jmp .hv_main
+.hv_bottom:
+    mov            r10, wq
+    test         edgeb, 1 ; LR_HAVE_LEFT
+    jz .hv_extend_left
+.hv_loop:
+    movu            m4, [lpfq+r10-4]
+.hv_main:
+    movu            m5, [lpfq+r10+4]
+    test         edgeb, 2 ; LR_HAVE_RIGHT
+    jnz .hv_have_right
+    cmp           r10d, -33
+    jl .hv_have_right
+    call mangle(private_prefix %+ _wiener_filter7_avx2).extend_right
+.hv_have_right:
+    pshufb          m0, m4, m6
+    pmaddubsw       m0, m12
+    pshufb          m1, m5, m6
+    pmaddubsw       m1, m12
+    pshufb          m2, m4, m7
+    pmaddubsw       m2, m12
+    pshufb          m3, m5, m7
+    pmaddubsw       m3, m12
+    pshufb          m4, m8
+    paddw           m0, m2
+    pmullw          m2, m4, m13
+    pshufb          m5, m8
+    paddw           m1, m3
+    pmullw          m3, m5, m13
+    psllw           m4, 7
+    psllw           m5, 7
+    paddw           m4, m9
+    paddw           m5, m9
+    paddw           m0, m2
+    paddw           m1, m3
+    mova            m2, [t3+r10*2]
+    paddw           m2, [t1+r10*2]
+    mova            m3, [t2+r10*2]
+    paddsw          m0, m4
+    paddsw          m1, m5
+    psraw           m0, 3
+    psraw           m1, 3
+    paddw           m0, m10
+    paddw           m1, m10
+    paddw           m4, m0, [t4+r10*2]
+    mova    [t0+r10*2], m0
+    punpcklwd       m0, m2, m3
+    pmaddwd         m0, m15
+    punpckhwd       m2, m3
+    pmaddwd         m2, m15
+    punpcklwd       m3, m4, m4
+    pmaddwd         m3, m14
+    punpckhwd       m4, m4
+    pmaddwd         m4, m14
+    paddd           m0, m3
+    paddd           m4, m2
+    mova            m2, [t3+r10*2+32]
+    paddw           m2, [t1+r10*2+32]
+    mova            m3, [t2+r10*2+32]
+    psrad           m0, 11
+    psrad           m4, 11
+    packssdw        m0, m4
+    paddw           m4, m1, [t4+r10*2+32]
+    mova [t0+r10*2+32], m1
+    punpcklwd       m1, m2, m3
+    pmaddwd         m1, m15
+    punpckhwd       m2, m3
+    pmaddwd         m2, m15
+    punpcklwd       m3, m4, m4
+    pmaddwd         m3, m14
+    punpckhwd       m4, m4
+    pmaddwd         m4, m14
+    paddd           m1, m3
+    paddd           m2, m4
+    psrad           m1, 11
+    psrad           m2, 11
+    packssdw        m1, m2
+    packuswb        m0, m1
+    mova    [dstq+r10], m0
+    add            r10, 32
+    jl .hv_loop
+    mov             t4, t3
+    mov             t3, t2
+    mov             t2, t1
+    mov             t1, t0
+    mov             t0, t4
+    add           dstq, dst_strideq
+    ret
+.v:
+    mov            r10, wq
+    psrld          m13, m14, 16 ; y1 __
+.v_loop:
+    mova            m6, [t1+r10*2+ 0]
+    paddw           m2, m6, [t3+r10*2+ 0]
+    mova            m4, [t2+r10*2+ 0]
+    mova            m7, [t1+r10*2+32]
+    paddw           m3, m7, [t3+r10*2+32]
+    mova            m5, [t2+r10*2+32]
+    paddw           m6, [t4+r10*2+ 0]
+    paddw           m7, [t4+r10*2+32]
+    punpcklwd       m0, m2, m4
+    pmaddwd         m0, m15
+    punpckhwd       m2, m4
+    pmaddwd         m2, m15
+    punpcklwd       m1, m3, m5
+    pmaddwd         m1, m15
+    punpckhwd       m3, m5
+    pmaddwd         m3, m15
+    punpcklwd       m5, m7, m6
+    pmaddwd         m4, m5, m14
+    punpckhwd       m7, m6
+    pmaddwd         m6, m7, m14
+    pmaddwd         m5, m13
+    pmaddwd         m7, m13
+    paddd           m0, m4
+    paddd           m2, m6
+    paddd           m1, m5
+    paddd           m3, m7
+    REPX {psrad x, 11}, m0, m2, m1, m3
+    packssdw        m0, m2
+    packssdw        m1, m3
+    packuswb        m0, m1
+    mova    [dstq+r10], m0
+    add            r10, 32
+    jl .v_loop
+    ret
 
-cglobal wiener_filter_v, 4, 10, 13, dst, stride, mid, w, h, fv, edge
-    movifnidn    fvq, fvmp
-    mov        edged, edgem
-    movifnidn     hd, hm
-    vpbroadcastd m10, [fvq]
-    vpbroadcastd m11, [fvq+4]
-    vpbroadcastd  m0, [pw_0_128]
-    vpbroadcastd m12, [pd_1024]
-
-    DEFINE_ARGS dst, stride, mid, w, h, ylim, edge, y, mptr, dstptr
-    rorx       ylimd, edged, 2
-    paddw        m11, m0
-    and        ylimd, 2 ; have_bottom
-    sub        ylimd, 3
-
-    ; main x loop for vertical filter, does one column of 16 pixels
-.loop_x:
-    mova          m3, [midq] ; middle line
-
-    ; load top pixels
-    test       edgeb, 4 ; have_top
-    jz .emu_top
-    mova          m0, [midq-384*4]
-    mova          m2, [midq-384*2]
-    mova          m1, m0
-    jmp .load_bottom_pixels
-.emu_top:
-    mova          m0, m3
-    mova          m1, m3
-    mova          m2, m3
-
-    ; load bottom pixels
-.load_bottom_pixels:
-    mov           yd, hd
-    mov        mptrq, midq
-    mov      dstptrq, dstq
-    add           yd, ylimd
-    jg .load_threelines
-
-    ; the remainder here is somewhat messy but only runs in very weird
-    ; circumstances at the bottom of the image in very small blocks (h=[1-3]),
-    ; so performance is not terribly important here...
-    je .load_twolines
-    cmp           yd, -1
-    je .load_oneline
-    ; h == 1 case
-    mova          m5, m3
-    mova          m4, m3
-    mova          m6, m3
-    jmp .loop
-.load_oneline:
-    ; h == 2 case
-    mova          m4, [midq+384*2]
-    mova          m5, m4
-    mova          m6, m4
-    jmp .loop
-.load_twolines:
-    ; h == 3 case
-    mova          m4, [midq+384*2]
-    mova          m5, [midq+384*4]
-    mova          m6, m5
-    jmp .loop
-.load_threelines:
-    ; h > 3 case
-    mova          m4, [midq+384*2]
-    mova          m5, [midq+384*4]
-    ; third line loaded in main loop below
-
-    ; main y loop for vertical filter
-.loop_load:
-    ; load one line into m6. if that pixel is no longer available, do
-    ; nothing, since m6 still has the data from the previous line in it. We
-    ; try to structure the loop so that the common case is evaluated fastest
-    mova          m6, [mptrq+384*6]
-.loop:
-    paddw         m0, m6
-    paddw         m7, m1, m5
-    paddw         m8, m2, m4
-    punpcklwd     m9, m0, m7
-    punpckhwd     m0, m7
-    punpcklwd     m7, m8, m3
-    punpckhwd     m8, m3
-    pmaddwd       m9, m10
-    pmaddwd       m0, m10
-    pmaddwd       m7, m11
-    pmaddwd       m8, m11
-    add        mptrq, 384*2
-    paddd         m7, m9
-    paddd         m0, m8
-    paddd         m7, m12
-    paddd         m0, m12
-    psrad         m7, 11
-    psrad         m0, 11
-    packssdw      m7, m0
-    vextracti128 xm0, m7, 1
-    packuswb     xm7, xm0
-    mova   [dstptrq], xm7
-    ; shift pixels one position
-    mova          m0, m1
-    mova          m1, m2
-    mova          m2, m3
-    mova          m3, m4
-    mova          m4, m5
-    mova          m5, m6
-    add      dstptrq, strideq
-    dec           yd
-    jg .loop_load
-    ; for the bottom pixels, continue using m6 (as extended edge)
-    cmp           yd, ylimd
-    jg .loop
-    add         midq, 32
-    add         dstq, 16
-    sub           wd, 16
-    jg .loop_x
-    RET
-
-INIT_YMM avx2
 cglobal sgr_box3_h, 5, 11, 7, sumsq, sum, left, src, stride, w, h, edge, x, xlim
     mov        xlimd, edgem
     movifnidn     wd, wm
--- a/src/x86/looprestoration_init_tmpl.c
+++ b/src/x86/looprestoration_init_tmpl.c
@@ -31,52 +31,19 @@
 #include "common/intops.h"
 #include "src/tables.h"
 
-// Future potential optimizations:
-// - special chroma versions which don't filter [0]/[6];
-// - running filter_h_avx2 transposed (one col of 32 pixels per iteration, top
-//   to bottom) instead of scanline-ordered should be faster since then the
-//   if (have_left) and similar conditions run only once instead of per line;
-// - filter_v_avx2 currently runs 16 pixels per iteration, it should be possible
-//   to run 32 (like filter_h_avx2), and then all vpermqs can go;
-// - maybe split out the top/bottom filter_h_avx2 from the main body filter_h_avx2,
-//   since then the have_left condition can be inlined;
-// - consider having the wrapper (wiener_filter_${ext}) also in hand-written
-//   assembly, so the setup overhead is minimized.
-
 #define WIENER_FILTER(ext) \
-\
-void dav1d_wiener_filter_h_##ext(int16_t *dst, const pixel (*left)[4], \
-                                 const pixel *src, ptrdiff_t stride, \
-                                 const int16_t fh[7], const intptr_t w, \
-                                 int h, enum LrEdgeFlags edges); \
-void dav1d_wiener_filter_v_##ext(pixel *dst, ptrdiff_t stride, \
-                                 const int16_t *mid, int w, int h, \
-                                 const int16_t fv[7], enum LrEdgeFlags edges); \
-\
-static void wiener_filter_##ext(pixel *const dst, const ptrdiff_t dst_stride, \
-                                const pixel (*const left)[4], \
-                                const pixel *lpf, const ptrdiff_t lpf_stride, \
-                                const int w, const int h, const int16_t fh[7], \
-                                const int16_t fv[7], const enum LrEdgeFlags edges) \
-{ \
-    ALIGN_STK_32(int16_t, mid, 68 * 384,); \
-\
-    /* horizontal filter */ \
-    dav1d_wiener_filter_h_##ext(&mid[2 * 384], left, dst, dst_stride, \
-                               fh, w, h, edges); \
-    if (edges & LR_HAVE_TOP) \
-        dav1d_wiener_filter_h_##ext(mid, NULL, lpf, lpf_stride, \
-                                   fh, w, 2, edges); \
-    if (edges & LR_HAVE_BOTTOM) \
-        dav1d_wiener_filter_h_##ext(&mid[(2 + h) * 384], NULL, \
-                                   lpf + 6 * PXSTRIDE(lpf_stride), lpf_stride, \
-                                   fh, w, 2, edges); \
-\
-    dav1d_wiener_filter_v_##ext(dst, dst_stride, &mid[2*384], w, h, fv, edges); \
-}
+void dav1d_wiener_filter7_##ext(pixel *const dst, ptrdiff_t dst_stride, \
+                                const pixel (*left)[4], const pixel *lpf, \
+                                ptrdiff_t lpf_stride, int w, int h, \
+                                const int16_t filter[2][8], \
+                                enum LrEdgeFlags edges); \
+void dav1d_wiener_filter5_##ext(pixel *const dst, ptrdiff_t dst_stride, \
+                                const pixel (*left)[4], const pixel *lpf, \
+                                ptrdiff_t lpf_stride, int w, int h, \
+                                const int16_t filter[2][8], \
+                                enum LrEdgeFlags edges);
 
 #define SGR_FILTER(ext) \
-\
 void dav1d_sgr_box3_h_##ext(int32_t *sumsq, int16_t *sum, \
                             const pixel (*left)[4], \
                             const pixel *src, const ptrdiff_t stride, \
@@ -199,15 +166,13 @@
     } \
 }
 
-#define DEF_LR_FILTERS(ext) \
-WIENER_FILTER(ext) \
-SGR_FILTER(ext)
-
 #if BITDEPTH == 8
 WIENER_FILTER(sse2)
-DEF_LR_FILTERS(ssse3)
+WIENER_FILTER(ssse3)
+SGR_FILTER(ssse3)
 # if ARCH_X86_64
-DEF_LR_FILTERS(avx2)
+WIENER_FILTER(avx2)
+SGR_FILTER(avx2)
 # endif
 #endif
 
@@ -216,18 +181,21 @@
 
     if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
 #if BITDEPTH == 8
-    c->wiener = wiener_filter_sse2;
+    c->wiener[0] = dav1d_wiener_filter7_sse2;
+    c->wiener[1] = dav1d_wiener_filter5_sse2;
 #endif
 
     if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
 #if BITDEPTH == 8
-    c->wiener = wiener_filter_ssse3;
+    c->wiener[0] = dav1d_wiener_filter7_ssse3;
+    c->wiener[1] = dav1d_wiener_filter5_ssse3;
     c->selfguided = sgr_filter_ssse3;
 #endif
 
     if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
 #if BITDEPTH == 8 && ARCH_X86_64
-    c->wiener = wiener_filter_avx2;
+    c->wiener[0] = dav1d_wiener_filter7_avx2;
+    c->wiener[1] = dav1d_wiener_filter5_avx2;
     c->selfguided = sgr_filter_avx2;
 #endif
 }
--- /dev/null
+++ b/src/x86/looprestoration_sse.asm
@@ -1,0 +1,2448 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; Copyright © 2018, VideoLabs
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 16
+
+wiener_init:   db  6,  7,  6,  7,  6,  7,  6,  7,  0,  0,  0,  0,  2,  4,  2,  4
+wiener_shufA:  db  1,  7,  2,  8,  3,  9,  4, 10,  5, 11,  6, 12,  7, 13,  8, 14
+wiener_shufB:  db  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10
+wiener_shufC:  db  6,  5,  7,  6,  8,  7,  9,  8, 10,  9, 11, 10, 12, 11, 13, 12
+wiener_shufD:  db  4, -1,  5, -1,  6, -1,  7, -1,  8, -1,  9, -1, 10, -1, 11, -1
+wiener_l_shuf: db  0,  0,  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
+pb_unpcklwdw:  db  0,  1,  0,  1,  4,  5,  4,  5,  8,  9,  8,  9, 12, 13, 12, 13
+
+pb_right_ext_mask: times 24 db 0xff
+                   times 8 db 0
+pb_0:          times 16 db 0
+pb_3:          times 16 db 3
+pb_15:         times 16 db 15
+pb_0_1:        times 8 db 0, 1
+pb_14_15:      times 8 db 14, 15
+pw_1:          times 8 dw 1
+pw_16:         times 8 dw 16
+pw_128:        times 8 dw 128
+pw_256:        times 8 dw 256
+pw_2048:       times 8 dw 2048
+pw_2056:       times 8 dw 2056
+pw_m16380:     times 8 dw -16380
+pw_5_6:        times 4 dw 5, 6
+pd_1024:       times 4 dd 1024
+%if ARCH_X86_32
+pd_512:        times 4 dd 512
+pd_2048:       times 4 dd 2048
+%endif
+pd_0xF0080029: times 4 dd 0xF0080029
+pd_0xF00801C7: times 4 dd 0XF00801C7
+
+cextern sgr_x_by_x
+
+SECTION .text
+
+%if ARCH_X86_32
+ %define PIC_base_offset $$
+
+ %macro SETUP_PIC 1-3 1,0 ; PIC_reg, save_PIC_reg, restore_PIC_reg
+  %assign pic_reg_stk_off 4
+  %xdefine PIC_reg %1
+  %if %2 == 1
+    mov        [esp], %1
+  %endif
+    LEA      PIC_reg, PIC_base_offset
+  %if %3 == 1
+    XCHG_PIC_REG
+  %endif
+ %endmacro
+
+ %macro XCHG_PIC_REG 0
+    mov [esp+pic_reg_stk_off], PIC_reg
+    %assign pic_reg_stk_off (pic_reg_stk_off+4) % 8
+    mov PIC_reg, [esp+pic_reg_stk_off]
+ %endmacro
+
+ %define PIC_sym(sym)   (PIC_reg+(sym)-PIC_base_offset)
+
+%else
+ %macro XCHG_PIC_REG 0
+ %endmacro
+
+ %define PIC_sym(sym)   (sym)
+%endif
+
+%macro WIENER 0
+%if ARCH_X86_64
+DECLARE_REG_TMP 4, 10, 7, 11, 12, 13, 14 ; ring buffer pointers
+cglobal wiener_filter7, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
+                                               lpf_stride, w, edge, flt, h, x
+    %define base 0
+    mov           fltq, fltmp
+    mov          edged, r8m
+    mov             wd, wm
+    mov             hd, r6m
+    movq           m14, [fltq]
+    add           lpfq, wq
+    lea             t1, [rsp+wq*2+16]
+    mova           m15, [pw_2056]
+    add           dstq, wq
+    movq            m7, [fltq+16]
+    neg             wq
+%if cpuflag(ssse3)
+    pshufb         m14, [wiener_init]
+    mova            m8, [wiener_shufA]
+    pshufd         m12, m14, q2222  ; x0 x0
+    mova            m9, [wiener_shufB]
+    pshufd         m13, m14, q3333  ; x1 x2
+    mova           m10, [wiener_shufC]
+    punpcklqdq     m14, m14         ; x3
+    mova           m11, [wiener_shufD]
+%else
+    mova           m10, [pw_m16380]
+    punpcklwd      m14, m14
+    pshufd         m11, m14, q0000 ; x0
+    pshufd         m12, m14, q1111 ; x1
+    pshufd         m13, m14, q2222 ; x2
+    pshufd         m14, m14, q3333 ; x3
+%endif
+%else
+DECLARE_REG_TMP 4, 0, _, 5
+%if cpuflag(ssse3)
+    %define m10         [base+wiener_shufC]
+    %define m11         [base+wiener_shufD]
+    %define stk_off     96
+%else
+    %define m10         [base+pw_m16380]
+    %define m11         [stk+96]
+    %define stk_off     112
+%endif
+cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride
+    %define base        r6-pb_right_ext_mask-21
+    %define stk         esp
+    %define dstq        leftq
+    %define edgeb       byte edged
+    %define edged       [stk+ 8]
+    %define dstmp       [stk+12]
+    %define hd    dword [stk+16]
+    %define wq          [stk+20]
+    %define dst_strideq [stk+24]
+    %define leftmp      [stk+28]
+    %define t2          [stk+32]
+    %define t4          [stk+36]
+    %define t5          [stk+40]
+    %define t6          [stk+44]
+    %define m8          [base+wiener_shufA]
+    %define m9          [base+wiener_shufB]
+    %define m12         [stk+48]
+    %define m13         [stk+64]
+    %define m14         [stk+80]
+    %define m15         [base+pw_2056]
+    mov             r1, r7m ; flt
+    mov             r0, r0m ; dst
+    mov             r5, r5m ; w
+    mov           lpfq, lpfm
+    mov             r2, r8m ; edge
+    mov             r4, r6m ; h
+    movq            m3, [r1+ 0]
+    movq            m7, [r1+16]
+    add             r0, r5
+    mov             r1, r1m ; dst_stride
+    add           lpfq, r5
+    mov          edged, r2
+    mov             r2, r2m ; left
+    mov          dstmp, r0
+    lea             t1, [rsp+r5*2+stk_off]
+    mov             hd, r4
+    neg             r5
+    mov    lpf_strideq, lpf_stridem
+    LEA             r6, pb_right_ext_mask+21
+    mov             wq, r5
+    mov    dst_strideq, r1
+    mov         leftmp, r2
+%if cpuflag(ssse3)
+    pshufb          m3, [base+wiener_init]
+    pshufd          m1, m3, q2222
+    pshufd          m2, m3, q3333
+    punpcklqdq      m3, m3
+%else
+    punpcklwd       m3, m3
+    pshufd          m0, m3, q0000
+    pshufd          m1, m3, q1111
+    pshufd          m2, m3, q2222
+    pshufd          m3, m3, q3333
+    mova           m11, m0
+%endif
+    mova           m12, m1
+    mova           m13, m2
+    mova           m14, m3
+%endif
+    pshufd          m6, m7, q0000 ; y0 y1
+    pshufd          m7, m7, q1111 ; y2 y3
+    test         edgeb, 4 ; LR_HAVE_TOP
+    jz .no_top
+    call .h_top
+    add           lpfq, lpf_strideq
+    mov             t6, t1
+    mov             t5, t1
+    add             t1, 384*2
+    call .h_top
+    lea             t3, [lpfq+lpf_strideq*4]
+    mov           lpfq, dstmp
+    mov [rsp+gprsize*1], lpf_strideq
+    add             t3, lpf_strideq
+    mov [rsp+gprsize*0], t3 ; below
+    mov             t4, t1
+    add             t1, 384*2
+    call .h
+    mov             t3, t1
+    mov             t2, t1
+    dec             hd
+    jz .v1
+    add           lpfq, dst_strideq
+    add             t1, 384*2
+    call .h
+    mov             t2, t1
+    dec             hd
+    jz .v2
+    add           lpfq, dst_strideq
+    add             t1, 384*2
+    call .h
+    dec             hd
+    jz .v3
+.main:
+    lea             t0, [t1+384*2]
+.main_loop:
+    call .hv
+    dec             hd
+    jnz .main_loop
+    test         edgeb, 8 ; LR_HAVE_BOTTOM
+    jz .v3
+    mov           lpfq, [rsp+gprsize*0]
+    call .hv_bottom
+    add           lpfq, [rsp+gprsize*1]
+    call .hv_bottom
+.v1:
+    call mangle(private_prefix %+ _wiener_filter7_ssse3).v
+    RET
+.no_top:
+    lea             t3, [lpfq+lpf_strideq*4]
+    mov           lpfq, dstmp
+    mov [rsp+gprsize*1], lpf_strideq
+    lea             t3, [t3+lpf_strideq*2]
+    mov [rsp+gprsize*0], t3
+    call .h
+    mov             t6, t1
+    mov             t5, t1
+    mov             t4, t1
+    mov             t3, t1
+    mov             t2, t1
+    dec             hd
+    jz .v1
+    add           lpfq, dst_strideq
+    add             t1, 384*2
+    call .h
+    mov             t2, t1
+    dec             hd
+    jz .v2
+    add           lpfq, dst_strideq
+    add             t1, 384*2
+    call .h
+    dec             hd
+    jz .v3
+    lea             t0, [t1+384*2]
+    call .hv
+    dec             hd
+    jz .v3
+    add             t0, 384*8
+    call .hv
+    dec             hd
+    jnz .main
+.v3:
+    call mangle(private_prefix %+ _wiener_filter7_ssse3).v
+.v2:
+    call mangle(private_prefix %+ _wiener_filter7_ssse3).v
+    jmp .v1
+.extend_right:
+    movd            m2, [lpfq-4]
+%if ARCH_X86_64
+    push            r0
+    lea             r0, [pb_right_ext_mask+21]
+    movu            m0, [r0+xq+0]
+    movu            m1, [r0+xq+8]
+    pop             r0
+%else
+    movu            m0, [r6+xq+0]
+    movu            m1, [r6+xq+8]
+%endif
+%if cpuflag(ssse3)
+    pshufb          m2, [base+pb_3]
+%else
+    punpcklbw       m2, m2
+    pshuflw         m2, m2, q3333
+    punpcklqdq      m2, m2
+%endif
+    pand            m4, m0
+    pand            m5, m1
+    pandn           m0, m2
+    pandn           m1, m2
+    por             m4, m0
+    por             m5, m1
+    ret
+.h:
+    %define stk esp+4 ; offset due to call
+    mov             xq, wq
+    test         edgeb, 1 ; LR_HAVE_LEFT
+    jz .h_extend_left
+    movifnidn    leftq, leftmp
+    mova            m4, [lpfq+xq]
+    movd            m5, [leftq]
+    add          leftq, 4
+    pslldq          m4, 4
+    por             m4, m5
+    movifnidn   leftmp, leftq
+    jmp .h_main
+.h_extend_left:
+%if cpuflag(ssse3)
+    mova            m4, [lpfq+xq]
+    pshufb          m4, [base+wiener_l_shuf]
+%else
+    mova            m5, [lpfq+xq]
+    pshufd          m4, m5, q2103
+    punpcklbw       m5, m5
+    punpcklwd       m5, m5
+    movss           m4, m5
+%endif
+    jmp .h_main
+.h_top:
+    mov             xq, wq
+    test         edgeb, 1 ; LR_HAVE_LEFT
+    jz .h_extend_left
+.h_loop:
+    movu            m4, [lpfq+xq-4]
+.h_main:
+    movu            m5, [lpfq+xq+4]
+    test         edgeb, 2 ; LR_HAVE_RIGHT
+    jnz .h_have_right
+    cmp             xd, -18
+    jl .h_have_right
+    call .extend_right
+.h_have_right:
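+; %%h7: horizontal 7-tap pass over 16 output pixels; the ssse3 path pairs the
+; taps for pmaddubsw while the sse2 path unpacks to words and uses pmullw,
+; and both add the pw_m16380 bias before the saturating add (keeping the
+; intermediate sums within signed 16-bit range)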
+%macro %%h7 0
+%if cpuflag(ssse3)
+    pshufb          m0, m4, m8
+    pmaddubsw       m0, m12
+    pshufb          m1, m5, m8
+    pmaddubsw       m1, m12
+    pshufb          m2, m4, m9
+    pmaddubsw       m2, m13
+    pshufb          m3, m5, m9
+    pmaddubsw       m3, m13
+    paddw           m0, m2
+    pshufb          m2, m4, m10
+    pmaddubsw       m2, m13
+    paddw           m1, m3
+    pshufb          m3, m5, m10
+    pmaddubsw       m3, m13
+    pshufb          m4, m11
+    paddw           m0, m2
+    pmullw          m2, m14, m4
+    pshufb          m5, m11
+    paddw           m1, m3
+    pmullw          m3, m14, m5
+    psllw           m4, 7
+    psllw           m5, 7
+    paddw           m0, m2
+    mova            m2, [base+pw_m16380]
+    paddw           m1, m3
+    paddw           m4, m2
+    paddw           m5, m2
+    paddsw          m0, m4
+    paddsw          m1, m5
+%else
+    psrldq          m0, m4, 1
+    pslldq          m1, m4, 1
+    pxor            m3, m3
+    punpcklbw       m0, m3
+    punpckhbw       m1, m3
+    paddw           m0, m1
+    pmullw          m0, m11
+    psrldq          m1, m4, 2
+    pslldq          m2, m4, 2
+    punpcklbw       m1, m3
+    punpckhbw       m2, m3
+    paddw           m1, m2
+    pmullw          m1, m12
+    paddw           m0, m1
+    pshufd          m2, m4, q0321
+    punpcklbw       m2, m3
+    pmullw          m1, m14, m2
+    paddw           m0, m1
+    psrldq          m1, m4, 3
+    pslldq          m4, 3
+    punpcklbw       m1, m3
+    punpckhbw       m4, m3
+    paddw           m1, m4
+    pmullw          m1, m13
+    paddw           m0, m1
+    psllw           m2, 7
+    paddw           m2, m10
+    paddsw          m0, m2
+    psrldq          m1, m5, 1
+    pslldq          m2, m5, 1
+    punpcklbw       m1, m3
+    punpckhbw       m2, m3
+    paddw           m1, m2
+    pmullw          m1, m11
+    psrldq          m2, m5, 2
+    pslldq          m4, m5, 2
+    punpcklbw       m2, m3
+    punpckhbw       m4, m3
+    paddw           m2, m4
+    pmullw          m2, m12
+    paddw           m1, m2
+    pshufd          m4, m5, q0321
+    punpcklbw       m4, m3
+    pmullw          m2, m14, m4
+    paddw           m1, m2
+    psrldq          m2, m5, 3
+    pslldq          m5, 3
+    punpcklbw       m2, m3
+    punpckhbw       m5, m3
+    paddw           m2, m5
+    pmullw          m2, m13
+    paddw           m1, m2
+    psllw           m4, 7
+    paddw           m4, m10
+    paddsw          m1, m4
+%endif
+%endmacro
+    %%h7
+    psraw           m0, 3
+    psraw           m1, 3
+    paddw           m0, m15
+    paddw           m1, m15
+    mova  [t1+xq*2+ 0], m0
+    mova  [t1+xq*2+16], m1
+    add             xq, 16
+    jl .h_loop
+    ret
+ALIGN function_align
+.hv:
+    add           lpfq, dst_strideq
+    mov             xq, wq
+    test         edgeb, 1 ; LR_HAVE_LEFT
+    jz .hv_extend_left
+    movifnidn    leftq, leftmp
+    mova            m4, [lpfq+xq]
+    movd            m5, [leftq]
+    add          leftq, 4
+    pslldq          m4, 4
+    por             m4, m5
+    movifnidn   leftmp, leftq
+    jmp .hv_main
+.hv_extend_left:
+%if cpuflag(ssse3)
+    mova            m4, [lpfq+xq]
+    pshufb          m4, [base+wiener_l_shuf]
+%else
+    mova            m5, [lpfq+xq]
+    pshufd          m4, m5, q2103
+    punpcklbw       m5, m5
+    punpcklwd       m5, m5
+    movss           m4, m5
+%endif
+    jmp .hv_main
+.hv_bottom:
+    mov             xq, wq
+    test         edgeb, 1 ; LR_HAVE_LEFT
+    jz .hv_extend_left
+.hv_loop:
+    movu            m4, [lpfq+xq-4]
+.hv_main:
+    movu            m5, [lpfq+xq+4]
+    test         edgeb, 2 ; LR_HAVE_RIGHT
+    jnz .hv_have_right
+    cmp             xd, -18
+    jl .hv_have_right
+    call .extend_right
+.hv_have_right:
+    %%h7
+%if ARCH_X86_64
+    mova            m2, [t4+xq*2]
+    paddw           m2, [t2+xq*2]
+%else
+    mov             r2, t4
+    mova            m2, [r2+xq*2]
+    mov             r2, t2
+    paddw           m2, [r2+xq*2]
+    mov             r2, t5
+%endif
+    mova            m3, [t3+xq*2]
+%if ARCH_X86_64
+    mova            m5, [t5+xq*2]
+%else
+    mova            m5, [r2+xq*2]
+    mov             r2, t6
+%endif
+    paddw           m5, [t1+xq*2]
+    psraw           m0, 3
+    psraw           m1, 3
+    paddw           m0, m15
+    paddw           m1, m15
+%if ARCH_X86_64
+    paddw           m4, m0, [t6+xq*2]
+%else
+    paddw           m4, m0, [r2+xq*2]
+    mov             r2, t4
+%endif
+    mova     [t0+xq*2], m0
+    punpcklwd       m0, m2, m3
+    pmaddwd         m0, m7
+    punpckhwd       m2, m3
+    pmaddwd         m2, m7
+    punpcklwd       m3, m4, m5
+    pmaddwd         m3, m6
+    punpckhwd       m4, m5
+    pmaddwd         m4, m6
+    paddd           m0, m3
+    mova            m3, [t3+xq*2+16]
+    paddd           m4, m2
+%if ARCH_X86_64
+    mova            m2, [t4+xq*2+16]
+    paddw           m2, [t2+xq*2+16]
+    mova            m5, [t5+xq*2+16]
+%else
+    mova            m2, [r2+xq*2+16]
+    mov             r2, t2
+    paddw           m2, [r2+xq*2+16]
+    mov             r2, t5
+    mova            m5, [r2+xq*2+16]
+    mov             r2, t6
+%endif
+    paddw           m5, [t1+xq*2+16]
+    psrad           m0, 11
+    psrad           m4, 11
+    packssdw        m0, m4
+%if ARCH_X86_64
+    paddw           m4, m1, [t6+xq*2+16]
+%else
+    paddw           m4, m1, [r2+xq*2+16]
+    mov           dstq, dstmp
+%endif
+    mova  [t0+xq*2+16], m1
+    punpcklwd       m1, m2, m3
+    pmaddwd         m1, m7
+    punpckhwd       m2, m3
+    pmaddwd         m2, m7
+    punpcklwd       m3, m4, m5
+    pmaddwd         m3, m6
+    punpckhwd       m4, m5
+    pmaddwd         m4, m6
+    paddd           m1, m3
+    paddd           m2, m4
+    psrad           m1, 11
+    psrad           m2, 11
+    packssdw        m1, m2
+    packuswb        m0, m1
+    mova     [dstq+xq], m0
+    add             xq, 16
+    jl .hv_loop
+    add           dstq, dst_strideq
+%if ARCH_X86_64
+    mov             t6, t5
+    mov             t5, t4
+    mov             t4, t3
+    mov             t3, t2
+    mov             t2, t1
+    mov             t1, t0
+    mov             t0, t6
+%else
+    mov          dstmp, dstq
+    mov             r1, t5
+    mov             r2, t4
+    mov             t6, r1
+    mov             t5, r2
+    mov             t4, t3
+    mov             t3, t2
+    mov             t2, t1
+    mov             t1, t0
+    mov             t0, r1
+%endif
+    ret
+%if cpuflag(ssse3) ; identical in sse2 and ssse3, so share code
+.v:
+    mov             xq, wq
+.v_loop:
+%if ARCH_X86_64
+    mova            m1, [t4+xq*2]
+    paddw           m1, [t2+xq*2]
+%else
+    mov             r2, t4
+    mova            m1, [r2+xq*2]
+    mov             r2, t2
+    paddw           m1, [r2+xq*2]
+    mov             r2, t6
+%endif
+    mova            m2, [t3+xq*2]
+    mova            m4, [t1+xq*2]
+%if ARCH_X86_64
+    paddw           m3, m4, [t6+xq*2]
+    paddw           m4, [t5+xq*2]
+%else
+    paddw           m3, m4, [r2+xq*2]
+    mov             r2, t5
+    paddw           m4, [r2+xq*2]
+    mov             r2, t4
+%endif
+    punpcklwd       m0, m1, m2
+    pmaddwd         m0, m7
+    punpckhwd       m1, m2
+    pmaddwd         m1, m7
+    punpcklwd       m2, m3, m4
+    pmaddwd         m2, m6
+    punpckhwd       m3, m4
+    pmaddwd         m3, m6
+    paddd           m0, m2
+    paddd           m1, m3
+%if ARCH_X86_64
+    mova            m2, [t4+xq*2+16]
+    paddw           m2, [t2+xq*2+16]
+%else
+    mova            m2, [r2+xq*2+16]
+    mov             r2, t2
+    paddw           m2, [r2+xq*2+16]
+    mov             r2, t6
+%endif
+    mova            m3, [t3+xq*2+16]
+    mova            m5, [t1+xq*2+16]
+%if ARCH_X86_64
+    paddw           m4, m5, [t6+xq*2+16]
+    paddw           m5, [t5+xq*2+16]
+%else
+    paddw           m4, m5, [r2+xq*2+16]
+    mov             r2, t5
+    paddw           m5, [r2+xq*2+16]
+    movifnidn     dstq, dstmp
+%endif
+    psrad           m0, 11
+    psrad           m1, 11
+    packssdw        m0, m1
+    punpcklwd       m1, m2, m3
+    pmaddwd         m1, m7
+    punpckhwd       m2, m3
+    pmaddwd         m2, m7
+    punpcklwd       m3, m4, m5
+    pmaddwd         m3, m6
+    punpckhwd       m4, m5
+    pmaddwd         m4, m6
+    paddd           m1, m3
+    paddd           m2, m4
+    psrad           m1, 11
+    psrad           m2, 11
+    packssdw        m1, m2
+    packuswb        m0, m1
+    mova     [dstq+xq], m0
+    add             xq, 16
+    jl .v_loop
+    add           dstq, dst_strideq
+%if ARCH_X86_64
+    mov             t6, t5
+    mov             t5, t4
+%else
+    mov          dstmp, dstq
+    mov             r1, t5
+    mov             r2, t4
+    mov             t6, r1
+    mov             t5, r2
+%endif
+    mov             t4, t3
+    mov             t3, t2
+    mov             t2, t1
+    ret
+%endif
+
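+; wiener_filter5: 5-tap variant of wiener_filter7 above, with two fewer ring
+; buffer rows (t1-t4 instead of t1-t6) and the outermost coefficients unused,
+; hence the "__ y1" layout of the vertical taps below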
+%if ARCH_X86_64
+cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
+                                             lpf_stride, w, edge, flt, h, x
+    mov           fltq, fltmp
+    mov          edged, r8m
+    mov             wd, wm
+    mov             hd, r6m
+    movq           m14, [fltq]
+    add           lpfq, wq
+    mova            m8, [pw_m16380]
+    lea             t1, [rsp+wq*2+16]
+    mova           m15, [pw_2056]
+    add           dstq, wq
+    movq            m7, [fltq+16]
+    neg             wq
+%if cpuflag(ssse3)
+    pshufb         m14, [wiener_init]
+    mova            m9, [wiener_shufB]
+    pshufd         m13, m14, q3333  ; x1 x2
+    mova           m10, [wiener_shufC]
+    punpcklqdq     m14, m14         ; x3
+    mova           m11, [wiener_shufD]
+    mova           m12, [wiener_l_shuf]
+%else
+    punpcklwd      m14, m14
+    pshufd         m11, m14, q1111 ; x1
+    pshufd         m13, m14, q2222 ; x2
+    pshufd         m14, m14, q3333 ; x3
+%endif
+%else
+%if cpuflag(ssse3)
+    %define stk_off     80
+%else
+    %define m11         [stk+80]
+    %define stk_off     96
+%endif
+cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride
+    %define stk         esp
+    %define leftmp      [stk+28]
+    %define m8          [base+pw_m16380]
+    %define m12         [base+wiener_l_shuf]
+    %define m14         [stk+48]
+    mov             r1, r7m ; flt
+    mov             r0, r0m ; dst
+    mov             r5, r5m ; w
+    mov           lpfq, lpfm
+    mov             r2, r8m ; edge
+    mov             r4, r6m ; h
+    movq            m2, [r1+ 0]
+    movq            m7, [r1+16]
+    add             r0, r5
+    mov             r1, r1m ; dst_stride
+    add           lpfq, r5
+    mov          edged, r2
+    mov             r2, r2m ; left
+    mov          dstmp, r0
+    lea             t1, [rsp+r5*2+stk_off]
+    mov             hd, r4
+    neg             r5
+    mov    lpf_strideq, lpf_stridem
+    LEA             r6, pb_right_ext_mask+21
+    mov             wq, r5
+    mov    dst_strideq, r1
+    mov         leftmp, r2
+%if cpuflag(ssse3)
+    pshufb          m2, [base+wiener_init]
+    pshufd          m1, m2, q3333
+    punpcklqdq      m2, m2
+%else
+    punpcklwd       m2, m2
+    pshufd          m0, m2, q1111
+    pshufd          m1, m2, q2222
+    pshufd          m2, m2, q3333
+    mova           m11, m0
+%endif
+    mova           m13, m1
+    mova           m14, m2
+%endif
+    pshufd          m6, m7, q0000 ; __ y1
+    pshufd          m7, m7, q1111 ; y2 y3
+    test         edgeb, 4 ; LR_HAVE_TOP
+    jz .no_top
+    call .h_top
+    add           lpfq, lpf_strideq
+    mov             t4, t1
+    add             t1, 384*2
+    call .h_top
+    lea             xq, [lpfq+lpf_strideq*4]
+    mov           lpfq, dstmp
+    mov             t3, t1
+    add             t1, 384*2
+    mov [rsp+gprsize*1], lpf_strideq
+    add             xq, lpf_strideq
+    mov [rsp+gprsize*0], xq ; below
+    call .h
+    mov             t2, t1
+    dec             hd
+    jz .v1
+    add           lpfq, dst_strideq
+    add             t1, 384*2
+    call .h
+    dec             hd
+    jz .v2
+.main:
+    mov             t0, t4
+.main_loop:
+    call .hv
+    dec             hd
+    jnz .main_loop
+    test         edgeb, 8 ; LR_HAVE_BOTTOM
+    jz .v2
+    mov           lpfq, [rsp+gprsize*0]
+    call .hv_bottom
+    add           lpfq, [rsp+gprsize*1]
+    call .hv_bottom
+.end:
+    RET
+.no_top:
+    lea             t3, [lpfq+lpf_strideq*4]
+    mov           lpfq, dstmp
+    mov [rsp+gprsize*1], lpf_strideq
+    lea             t3, [t3+lpf_strideq*2]
+    mov [rsp+gprsize*0], t3
+    call .h
+    mov             t4, t1
+    mov             t3, t1
+    mov             t2, t1
+    dec             hd
+    jz .v1
+    add           lpfq, dst_strideq
+    add             t1, 384*2
+    call .h
+    dec             hd
+    jz .v2
+    lea             t0, [t1+384*2]
+    call .hv
+    dec             hd
+    jz .v2
+    add             t0, 384*6
+    call .hv
+    dec             hd
+    jnz .main
+.v2:
+    call mangle(private_prefix %+ _wiener_filter5_ssse3).v
+    add           dstq, dst_strideq
+    mov             t4, t3
+    mov             t3, t2
+    mov             t2, t1
+    movifnidn    dstmp, dstq
+.v1:
+    call mangle(private_prefix %+ _wiener_filter5_ssse3).v
+    jmp .end
+.h:
+    %define stk esp+4
+    mov             xq, wq
+    test         edgeb, 1 ; LR_HAVE_LEFT
+    jz .h_extend_left
+    movifnidn    leftq, leftmp
+    mova            m4, [lpfq+xq]
+    movd            m5, [leftq]
+    add          leftq, 4
+    pslldq          m4, 4
+    por             m4, m5
+    movifnidn   leftmp, leftq
+    jmp .h_main
+.h_extend_left:
+%if cpuflag(ssse3)
+    mova            m4, [lpfq+xq]
+    pshufb          m4, m12
+%else
+    mova            m5, [lpfq+xq]
+    pshufd          m4, m5, q2103
+    punpcklbw       m5, m5
+    punpcklwd       m5, m5
+    movss           m4, m5
+%endif
+    jmp .h_main
+.h_top:
+    mov             xq, wq
+    test         edgeb, 1 ; LR_HAVE_LEFT
+    jz .h_extend_left
+.h_loop:
+    movu            m4, [lpfq+xq-4]
+.h_main:
+    movu            m5, [lpfq+xq+4]
+    test         edgeb, 2 ; LR_HAVE_RIGHT
+    jnz .h_have_right
+    cmp             xd, -17
+    jl .h_have_right
+    call mangle(private_prefix %+ _wiener_filter7 %+ SUFFIX).extend_right
+.h_have_right:
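+; %%h5: as %%h7 but without the outermost (x0) horizontal taps; the vertical
+; pass likewise skips y0 (see the "__ y1" shuffle above)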
+%macro %%h5 0
+%if cpuflag(ssse3)
+    pshufb          m0, m4, m9
+    pmaddubsw       m0, m13
+    pshufb          m1, m5, m9
+    pmaddubsw       m1, m13
+    pshufb          m2, m4, m10
+    pmaddubsw       m2, m13
+    pshufb          m3, m5, m10
+    pmaddubsw       m3, m13
+    pshufb          m4, m11
+    paddw           m0, m2
+    pmullw          m2, m14, m4
+    pshufb          m5, m11
+    paddw           m1, m3
+    pmullw          m3, m14, m5
+    psllw           m4, 7
+    psllw           m5, 7
+    paddw           m4, m8
+    paddw           m5, m8
+    paddw           m0, m2
+    paddw           m1, m3
+    paddsw          m0, m4
+    paddsw          m1, m5
+%else
+    psrldq          m0, m4, 2
+    pslldq          m1, m4, 2
+    pxor            m3, m3
+    punpcklbw       m0, m3
+    punpckhbw       m1, m3
+    paddw           m0, m1
+    pmullw          m0, m11
+    pshufd          m2, m4, q0321
+    punpcklbw       m2, m3
+    pmullw          m1, m14, m2
+    paddw           m0, m1
+    psrldq          m1, m4, 3
+    pslldq          m4, 3
+    punpcklbw       m1, m3
+    punpckhbw       m4, m3
+    paddw           m1, m4
+    pmullw          m1, m13
+    paddw           m0, m1
+    psllw           m2, 7
+    paddw           m2, m8
+    paddsw          m0, m2
+    psrldq          m1, m5, 2
+    pslldq          m4, m5, 2
+    punpcklbw       m1, m3
+    punpckhbw       m4, m3
+    paddw           m1, m4
+    pmullw          m1, m11
+    pshufd          m4, m5, q0321
+    punpcklbw       m4, m3
+    pmullw          m2, m14, m4
+    paddw           m1, m2
+    psrldq          m2, m5, 3
+    pslldq          m5, 3
+    punpcklbw       m2, m3
+    punpckhbw       m5, m3
+    paddw           m2, m5
+    pmullw          m2, m13
+    paddw           m1, m2
+    psllw           m4, 7
+    paddw           m4, m8
+    paddsw          m1, m4
+%endif
+%endmacro
+    %%h5
+    psraw           m0, 3
+    psraw           m1, 3
+    paddw           m0, m15
+    paddw           m1, m15
+    mova  [t1+xq*2+ 0], m0
+    mova  [t1+xq*2+16], m1
+    add             xq, 16
+    jl .h_loop
+    ret
+ALIGN function_align
+.hv:
+    add           lpfq, dst_strideq
+    mov             xq, wq
+    test         edgeb, 1 ; LR_HAVE_LEFT
+    jz .hv_extend_left
+    movifnidn    leftq, leftmp
+    mova            m4, [lpfq+xq]
+    movd            m5, [leftq]
+    add          leftq, 4
+    pslldq          m4, 4
+    por             m4, m5
+    movifnidn   leftmp, leftq
+    jmp .hv_main
+.hv_extend_left:
+%if cpuflag(ssse3)
+    mova            m4, [lpfq+xq]
+    pshufb          m4, m12
+%else
+    mova            m5, [lpfq+xq]
+    pshufd          m4, m5, q2103
+    punpcklbw       m5, m5
+    punpcklwd       m5, m5
+    movss           m4, m5
+%endif
+    jmp .hv_main
+.hv_bottom:
+    mov             xq, wq
+    test         edgeb, 1 ; LR_HAVE_LEFT
+    jz .hv_extend_left
+.hv_loop:
+    movu            m4, [lpfq+xq-4]
+.hv_main:
+    movu            m5, [lpfq+xq+4]
+    test         edgeb, 2 ; LR_HAVE_RIGHT
+    jnz .hv_have_right
+    cmp             xd, -17
+    jl .hv_have_right
+    call mangle(private_prefix %+ _wiener_filter7 %+ SUFFIX).extend_right
+.hv_have_right:
+    %%h5
+    mova            m2, [t3+xq*2]
+    paddw           m2, [t1+xq*2]
+    psraw           m0, 3
+    psraw           m1, 3
+    paddw           m0, m15
+    paddw           m1, m15
+%if ARCH_X86_64
+    mova            m3, [t2+xq*2]
+    paddw           m4, m0, [t4+xq*2]
+%else
+    mov             r2, t2
+    mova            m3, [r2+xq*2]
+    mov             r2, t4
+    paddw           m4, m0, [r2+xq*2]
+%endif
+    mova     [t0+xq*2], m0
+    punpcklwd       m0, m2, m3
+    pmaddwd         m0, m7
+    punpckhwd       m2, m3
+    pmaddwd         m2, m7
+    punpcklwd       m3, m4, m4
+    pmaddwd         m3, m6
+    punpckhwd       m4, m4
+    pmaddwd         m4, m6
+    paddd           m0, m3
+    paddd           m4, m2
+    mova            m2, [t3+xq*2+16]
+    paddw           m2, [t1+xq*2+16]
+    psrad           m0, 11
+    psrad           m4, 11
+    packssdw        m0, m4
+%if ARCH_X86_64
+    mova            m3, [t2+xq*2+16]
+    paddw           m4, m1, [t4+xq*2+16]
+%else
+    paddw           m4, m1, [r2+xq*2+16]
+    mov             r2, t2
+    mova            m3, [r2+xq*2+16]
+    mov           dstq, dstmp
+%endif
+    mova  [t0+xq*2+16], m1
+    punpcklwd       m1, m2, m3
+    pmaddwd         m1, m7
+    punpckhwd       m2, m3
+    pmaddwd         m2, m7
+    punpcklwd       m3, m4, m4
+    pmaddwd         m3, m6
+    punpckhwd       m4, m4
+    pmaddwd         m4, m6
+    paddd           m1, m3
+    paddd           m2, m4
+    psrad           m1, 11
+    psrad           m2, 11
+    packssdw        m1, m2
+    packuswb        m0, m1
+    mova     [dstq+xq], m0
+    add             xq, 16
+    jl .hv_loop
+    add           dstq, dst_strideq
+    mov             t4, t3
+    mov             t3, t2
+    mov             t2, t1
+    mov             t1, t0
+    mov             t0, t4
+    movifnidn    dstmp, dstq
+    ret
+%if cpuflag(ssse3)
+.v:
+    mov             xq, wq
+.v_loop:
+    mova            m3, [t1+xq*2]
+    paddw           m1, m3, [t3+xq*2]
+%if ARCH_X86_64
+    mova            m2, [t2+xq*2]
+    paddw           m3, [t4+xq*2]
+%else
+    mov             r2, t2
+    mova            m2, [r2+xq*2]
+    mov             r2, t4
+    paddw           m3, [r2+xq*2]
+%endif
+    punpcklwd       m0, m1, m2
+    pmaddwd         m0, m7
+    punpckhwd       m1, m2
+    pmaddwd         m1, m7
+    punpcklwd       m2, m3
+    pmaddwd         m2, m6
+    punpckhwd       m3, m3
+    pmaddwd         m3, m6
+    paddd           m0, m2
+    paddd           m1, m3
+    mova            m4, [t1+xq*2+16]
+    paddw           m2, m4, [t3+xq*2+16]
+%if ARCH_X86_64
+    mova            m3, [t2+xq*2+16]
+    paddw           m4, [t4+xq*2+16]
+%else
+    paddw           m4, [r2+xq*2+16]
+    mov             r2, t2
+    mova            m3, [r2+xq*2+16]
+    mov           dstq, dstmp
+%endif
+    psrad           m0, 11
+    psrad           m1, 11
+    packssdw        m0, m1
+    punpcklwd       m1, m2, m3
+    pmaddwd         m1, m7
+    punpckhwd       m2, m3
+    pmaddwd         m2, m7
+    punpcklwd       m3, m4
+    pmaddwd         m3, m6
+    punpckhwd       m4, m4
+    pmaddwd         m4, m6
+    paddd           m1, m3
+    paddd           m2, m4
+    psrad           m1, 11
+    psrad           m2, 11
+    packssdw        m1, m2
+    packuswb        m0, m1
+    mova     [dstq+xq], m0
+    add             xq, 16
+    jl .v_loop
+    ret
+%endif
+%endmacro
+
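+; WIENER is expanded once per instruction set below; the ssse3 build uses
+; pshufb + pmaddubsw for the horizontal taps, the sse2 build falls back to
+; unpack + pmullw, and both share a single copy of the vertical pass (.v),
+; which only needs sse2 and is therefore emitted in the ssse3 expansion only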
+INIT_XMM sse2
+WIENER
+
+INIT_XMM ssse3
+WIENER
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;      self-guided     ;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;
+
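+; MULLD emulates a 32-bit low multiply (pmulld needs SSE4.1) from 16-bit ops:
+; with a 16-bit multiplier replicated into every word lane of %2, the low 32
+; bits of each dword product are pmullw(%1,%2) + (pmulhuw(%1,%2) << 16)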
+%macro MULLD 2
+    pmulhuw       m5, %1, %2
+    pmullw        %1, %2
+    pslld         m5, 16
+    paddd         %1, m5
+%endmacro
+
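+; GATHERDD emulates a dword gather from the sgr_x_by_x table (no SIMD gather
+; below AVX2): each index is extracted to a GPR with movd/pextrw and the
+; looked-up entries are inserted into %1 and m5 with pinsrw, then merged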
+%macro GATHERDD 2
+    mova          m5, m7
+    movd         r6d, %2
+ %if ARCH_X86_64
+    movd          %1, [r5+r6]
+    pextrw       r6d, %2, 2
+    pinsrw        m5, [r5+r6+2], 3
+    pextrw       r6d, %2, 4
+    pinsrw        %1, [r5+r6+2], 5
+    pextrw       r6d, %2, 6
+    pinsrw        m5, [r5+r6+2], 7
+ %else
+    movd          %1, [PIC_sym(sgr_x_by_x-0xF03)+r6]
+    pextrw       r6d, %2, 2
+    pinsrw        m5, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 3
+    pextrw       r6d, %2, 4
+    pinsrw        %1, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 5
+    pextrw       r6d, %2, 6
+    pinsrw        m5, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 7
+ %endif
+    por           %1, m5
+%endmacro
+
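+; sgr_box3_h: horizontal pass of the 3x3 box sums; for each pixel it stores
+; the 3-wide sum and sum of squares into the intermediate buffers (row pitch
+; 384+16), with left/right edge extension per the edge flags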
+%if ARCH_X86_64
+cglobal sgr_box3_h, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
+    mov        xlimd, edgem
+    movifnidn     xd, xm
+    mov           hd, hm
+    mov        edged, xlimd
+    and        xlimd, 2                             ; have_right
+    add           xd, xlimd
+    xor        xlimd, 2                             ; 2*!have_right
+%else
+cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
+ %define wq     r0m
+ %define xlimd  r1m
+ %define hd     hmp
+ %define edgeb  byte edgem
+
+    mov           r6, edgem
+    and           r6, 2                             ; have_right
+    add           xd, r6
+    xor           r6, 2                             ; 2*!have_right
+    mov        xlimd, r6
+    SETUP_PIC     r6, 0
+%endif
+
+    jnz .no_right
+    add           xd, 7
+    and           xd, ~7
+.no_right:
+    pxor          m1, m1
+    lea         srcq, [srcq+xq]
+    lea         sumq, [sumq+xq*2-2]
+    lea       sumsqq, [sumsqq+xq*4-4]
+    neg           xq
+    mov           wq, xq
+%if ARCH_X86_64
+    lea          r10, [pb_right_ext_mask+24]
+%endif
+.loop_y:
+    mov           xq, wq
+
+    ; load left
+    test       edgeb, 1                             ; have_left
+    jz .no_left
+    test       leftq, leftq
+    jz .load_left_from_main
+    movd          m0, [leftq]
+    pslldq        m0, 12
+    add        leftq, 4
+    jmp .expand_x
+.no_left:
+    movd          m0, [srcq+xq]
+    pshufb        m0, [PIC_sym(pb_0)]
+    jmp .expand_x
+.load_left_from_main:
+    movd          m0, [srcq+xq-2]
+    pslldq        m0, 14
+.expand_x:
+    punpckhbw    xm0, xm1
+
+    ; at this point m0 contains the left two px in its highest words
+    cmp           xd, -8
+    jle .loop_x
+.partial_load_and_extend:
+    movd          m3, [srcq-4]
+    pshufb        m3, [PIC_sym(pb_3)]
+    movq          m2, [srcq+xq]
+    punpcklbw     m2, m1
+    punpcklbw     m3, m1
+%if ARCH_X86_64
+    movu          m4, [r10+xq*2]
+%else
+    movu          m4, [PIC_sym(pb_right_ext_mask)+xd*2+24]
+%endif
+    pand          m2, m4
+    pandn         m4, m3
+    por           m2, m4
+    jmp .loop_x_noload
+.right_extend:
+    pshufb        m2, m0, [PIC_sym(pb_14_15)]
+    jmp .loop_x_noload
+
+.loop_x:
+    movq          m2, [srcq+xq]
+    punpcklbw     m2, m1
+.loop_x_noload:
+    palignr       m3, m2, m0, 12
+    palignr       m4, m2, m0, 14
+
+    punpcklwd     m5, m3, m2
+    punpckhwd     m6, m3, m2
+    paddw         m3, m4
+    punpcklwd     m7, m4, m1
+    punpckhwd     m4, m1
+    pmaddwd       m5, m5
+    pmaddwd       m6, m6
+    pmaddwd       m7, m7
+    pmaddwd       m4, m4
+    paddd         m5, m7
+    paddd         m6, m4
+    paddw         m3, m2
+    movu [sumq+xq*2], m3
+    movu [sumsqq+xq*4+ 0], m5
+    movu [sumsqq+xq*4+16], m6
+
+    mova          m0, m2
+    add           xq, 8
+
+    ; if x <= -8 we can reload more pixels
+    ; else if x < 0 we reload and extend (this implies have_right=0)
+    ; else if x < xlimd we extend from previous load (this implies have_right=0)
+    ; else we are done
+
+    cmp           xd, -8
+    jle .loop_x
+    test          xd, xd
+    jl .partial_load_and_extend
+    cmp           xd, xlimd
+    jl .right_extend
+
+    add       sumsqq, (384+16)*4
+    add         sumq, (384+16)*2
+    add         srcq, strideq
+    dec           hd
+    jg .loop_y
+    RET
+
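+; sgr_box3_v: vertical pass; each output row becomes the sum of three
+; consecutive rows of the horizontal sums/sums of squares, written back in
+; place, reusing the edge row when have_top/have_bottom is not set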
+%if ARCH_X86_64
+cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim
+    movifnidn  edged, edgem
+%else
+cglobal sgr_box3_v, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y
+ %define sumsq_baseq dword [esp+0]
+ %define sum_baseq   dword [esp+4]
+ %define ylimd       dword [esp+8]
+ %define m8          [esp+12]
+    mov        edged, r4m
+    mov           hd, r3m
+%endif
+    mov           xq, -2
+%if ARCH_X86_64
+    mov        ylimd, edged
+    and        ylimd, 8                             ; have_bottom
+    shr        ylimd, 2
+    sub        ylimd, 2                             ; -2 if have_bottom=0, else 0
+    mov  sumsq_baseq, sumsqq
+    mov    sum_baseq, sumq
+.loop_x:
+    mov       sumsqq, sumsq_baseq
+    mov         sumq, sum_baseq
+    lea           yd, [hq+ylimq+2]
+%else
+    mov           yd, edged
+    and           yd, 8                             ; have_bottom
+    shr           yd, 2
+    sub           yd, 2                             ; -2 if have_bottom=0, else 0
+    mov  sumsq_baseq, sumsqq
+    mov    sum_baseq, sumq
+    mov        ylimd, yd
+.loop_x:
+    mov       sumsqd, sumsq_baseq
+    mov         sumd, sum_baseq
+    lea           yd, [hq+2]
+    add           yd, ylimd
+%endif
+    lea       sumsqq, [sumsqq+xq*4+4-(384+16)*4]
+    lea         sumq, [sumq+xq*2+2-(384+16)*2]
+    test       edgeb, 4                             ; have_top
+    jnz .load_top
+    movu          m0, [sumsqq+(384+16)*4*1]
+    movu          m1, [sumsqq+(384+16)*4*1+16]
+    mova          m2, m0
+    mova          m3, m1
+    mova          m4, m0
+    mova          m5, m1
+    movu          m6, [sumq+(384+16)*2*1]
+    mova          m7, m6
+    mova          m8, m6
+    jmp .loop_y_noload
+.load_top:
+    movu          m0, [sumsqq-(384+16)*4*1]      ; l2sq [left]
+    movu          m1, [sumsqq-(384+16)*4*1+16]   ; l2sq [right]
+    movu          m2, [sumsqq-(384+16)*4*0]      ; l1sq [left]
+    movu          m3, [sumsqq-(384+16)*4*0+16]   ; l1sq [right]
+    movu          m6, [sumq-(384+16)*2*1]        ; l2
+    movu          m7, [sumq-(384+16)*2*0]        ; l1
+.loop_y:
+%if ARCH_X86_64
+    movu          m8, [sumq+(384+16)*2*1]        ; l0
+%else
+    movu          m4, [sumq+(384+16)*2*1]        ; l0
+    mova          m8, m4
+%endif
+    movu          m4, [sumsqq+(384+16)*4*1]      ; l0sq [left]
+    movu          m5, [sumsqq+(384+16)*4*1+16]   ; l0sq [right]
+.loop_y_noload:
+    paddd         m0, m2
+    paddd         m1, m3
+    paddw         m6, m7
+    paddd         m0, m4
+    paddd         m1, m5
+    paddw         m6, m8
+    movu [sumsqq+ 0], m0
+    movu [sumsqq+16], m1
+    movu      [sumq], m6
+
+    ; shift position down by one
+    mova          m0, m2
+    mova          m1, m3
+    mova          m2, m4
+    mova          m3, m5
+    mova          m6, m7
+    mova          m7, m8
+    add       sumsqq, (384+16)*4
+    add         sumq, (384+16)*2
+    dec           yd
+    jg .loop_y
+    cmp           yd, ylimd
+    jg .loop_y_noload
+    add           xd, 8
+    cmp           xd, wd
+    jl .loop_x
+    RET
+
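+; sgr_calc_ab1: converts the 3x3 box sums into the per-pixel a/b pair:
+; p = 9*sumsq - sum^2 is scaled by the strength, reduced to an index z,
+; mapped through sgr_x_by_x, and the results are stored back over the input
+; buffers, two rows per iteration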
+cglobal sgr_calc_ab1, 4, 7, 12, a, b, w, h, s
+    movifnidn     sd, sm
+    sub           aq, (384+16-1)*4
+    sub           bq, (384+16-1)*2
+    add           hd, 2
+%if ARCH_X86_64
+    LEA           r5, sgr_x_by_x-0xF03
+%else
+    SETUP_PIC r5, 0
+%endif
+    movd          m6, sd
+    pshuflw       m6, m6, q0000
+    punpcklqdq    m6, m6
+    pxor          m7, m7
+    DEFINE_ARGS a, b, w, h, x
+%if ARCH_X86_64
+    mova          m8, [pd_0xF00801C7]
+    mova          m9, [pw_256]
+    psrld        m10, m9, 13                        ; pd_2048
+    mova         m11, [pb_unpcklwdw]
+%else
+ %define m8     [PIC_sym(pd_0xF00801C7)]
+ %define m9     [PIC_sym(pw_256)]
+ %define m10    [PIC_sym(pd_2048)]
+ %define m11    [PIC_sym(pb_unpcklwdw)]
+%endif
+.loop_y:
+    mov           xq, -2
+.loop_x:
+    movq          m0, [bq+xq*2]
+    movq          m1, [bq+xq*2+(384+16)*2]
+    punpcklwd     m0, m7
+    punpcklwd     m1, m7
+    movu          m2, [aq+xq*4]
+    movu          m3, [aq+xq*4+(384+16)*4]
+    pslld         m4, m2, 3
+    pslld         m5, m3, 3
+    paddd         m2, m4                            ; aa * 9
+    paddd         m3, m5
+    pmaddwd       m4, m0, m0
+    pmaddwd       m5, m1, m1
+    pmaddwd       m0, m8
+    pmaddwd       m1, m8
+    psubd         m2, m4                            ; p = aa * 9 - bb * bb
+    psubd         m3, m5
+    MULLD         m2, m6
+    MULLD         m3, m6
+    paddusw       m2, m8
+    paddusw       m3, m8
+    psrld         m2, 20                            ; z
+    psrld         m3, 20
+    GATHERDD      m4, m2                            ; xx
+    GATHERDD      m2, m3
+    psrld         m4, 24
+    psrld         m2, 24
+    packssdw      m3, m4, m2
+    pshufb        m4, m11
+    MULLD         m0, m4
+    pshufb        m2, m11
+    MULLD         m1, m2
+    psubw         m5, m9, m3
+    paddd         m0, m10
+    paddd         m1, m10
+    psrld         m0, 12
+    psrld         m1, 12
+    movq   [bq+xq*2], m5
+    psrldq        m5, 8
+    movq [bq+xq*2+(384+16)*2], m5
+    movu   [aq+xq*4], m0
+    movu [aq+xq*4+(384+16)*4], m1
+    add           xd, 4
+    cmp           xd, wd
+    jl .loop_x
+    add           aq, (384+16)*4*2
+    add           bq, (384+16)*2*2
+    sub           hd, 2
+    jg .loop_y
+    RET
+
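+; sgr_finish_filter1: weights a/b over each 3x3 neighbourhood (effectively 4x
+; for the centre cross and 3x for the diagonals, via the *4-minus-corners
+; trick below) against the source pixels and writes the intermediate filtered
+; plane t, 8 columns per outer iteration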
+%if ARCH_X86_64
+cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \
+                                       tmp_base, src_base, a_base, b_base, x, y
+    movifnidn     wd, wm
+    mov           hd, hm
+    mova         m15, [pw_16]
+    mov    tmp_baseq, tq
+    mov    src_baseq, srcq
+    mov      a_baseq, aq
+    mov      b_baseq, bq
+    xor           xd, xd
+%else
+cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y
+ %define tmp_baseq  [esp+8]
+ %define src_baseq  [esp+12]
+ %define a_baseq    [esp+16]
+ %define b_baseq    [esp+20]
+ %define wd         [esp+24]
+ %define hd         [esp+28]
+    mov    tmp_baseq, tq
+    mov    src_baseq, srcq
+    mov      a_baseq, aq
+    mov      b_baseq, bq
+    mov           wd, xd
+    mov           hd, yd
+    xor           xd, xd
+    SETUP_PIC yd, 1, 1
+    jmp .loop_start
+%endif
+
+.loop_x:
+    mov           tq, tmp_baseq
+    mov         srcq, src_baseq
+    mov           aq, a_baseq
+    mov           bq, b_baseq
+%if ARCH_X86_32
+.loop_start:
+    movu          m0, [bq+xq*2-(384+16)*2-2]
+    movu          m2, [bq+xq*2-(384+16)*2+2]
+    mova          m1, [bq+xq*2-(384+16)*2]          ; b:top
+    paddw         m0, m2                            ; b:tl+tr
+    movu          m2, [bq+xq*2-2]
+    movu          m3, [bq+xq*2+2]
+    paddw         m1, [bq+xq*2]                     ; b:top+ctr
+    paddw         m2, m3                            ; b:l+r
+    mova  [esp+0x80], m0
+    mova  [esp+0x70], m1
+    mova  [esp+0x60], m2
+%endif
+    movu          m0, [aq+xq*4-(384+16)*4-4]
+    movu          m2, [aq+xq*4-(384+16)*4+4]
+    mova          m1, [aq+xq*4-(384+16)*4]          ; a:top [first half]
+    paddd         m0, m2                            ; a:tl+tr [first half]
+    movu          m2, [aq+xq*4-(384+16)*4-4+16]
+    movu          m4, [aq+xq*4-(384+16)*4+4+16]
+    mova          m3, [aq+xq*4-(384+16)*4+16]       ; a:top [second half]
+    paddd         m2, m4                            ; a:tl+tr [second half]
+    movu          m4, [aq+xq*4-4]
+    movu          m5, [aq+xq*4+4]
+    paddd         m1, [aq+xq*4]                     ; a:top+ctr [first half]
+    paddd         m4, m5                            ; a:l+r [first half]
+    movu          m5, [aq+xq*4+16-4]
+    movu          m6, [aq+xq*4+16+4]
+    paddd         m3, [aq+xq*4+16]                  ; a:top+ctr [second half]
+    paddd         m5, m6                            ; a:l+r [second half]
+%if ARCH_X86_64
+    movu          m6, [bq+xq*2-(384+16)*2-2]
+    movu          m8, [bq+xq*2-(384+16)*2+2]
+    mova          m7, [bq+xq*2-(384+16)*2]          ; b:top
+    paddw         m6, m8                            ; b:tl+tr
+    movu          m8, [bq+xq*2-2]
+    movu          m9, [bq+xq*2+2]
+    paddw         m7, [bq+xq*2]                     ; b:top+ctr
+    paddw         m8, m9                            ; b:l+r
+%endif
+
+    lea           tq, [tq+xq*2]
+    lea         srcq, [srcq+xq*1]
+    lea           aq, [aq+xq*4+(384+16)*4]
+    lea           bq, [bq+xq*2+(384+16)*2]
+    mov           yd, hd
+.loop_y:
+%if ARCH_X86_64
+    movu          m9, [bq-2]
+    movu         m10, [bq+2]
+    paddw         m7, [bq]                          ; b:top+ctr+bottom
+    paddw         m9, m10                           ; b:bl+br
+    paddw        m10, m7, m8                        ; b:top+ctr+bottom+l+r
+    paddw         m6, m9                            ; b:tl+tr+bl+br
+    psubw         m7, [bq-(384+16)*2*2]             ; b:ctr+bottom
+    paddw        m10, m6
+    psllw        m10, 2
+    psubw        m10, m6                            ; aa
+    pxor         m14, m14
+    movq         m12, [srcq]
+    punpcklbw    m12, m14
+    punpcklwd     m6, m10, m15
+    punpckhwd    m10, m15
+    punpcklwd    m13, m12, m15
+    punpckhwd    m12, m15
+    pmaddwd       m6, m13                           ; aa*src[x]+256 [first half]
+    pmaddwd      m10, m12                           ; aa*src[x]+256 [second half]
+%else
+    paddd         m1, [aq]                          ; a:top+ctr+bottom [first half]
+    paddd         m3, [aq+16]                       ; a:top+ctr+bottom [second half]
+    mova  [esp+0x50], m1
+    mova  [esp+0x40], m3
+    mova  [esp+0x30], m4
+    movu          m6, [aq-4]
+    movu          m7, [aq+4]
+    paddd         m1, m4                            ; a:top+ctr+bottom+l+r [first half]
+    paddd         m3, m5                            ; a:top+ctr+bottom+l+r [second half]
+    paddd         m6, m7                            ; a:bl+br [first half]
+    movu          m7, [aq+16-4]
+    movu          m4, [aq+16+4]
+    paddd         m7, m4                            ; a:bl+br [second half]
+    paddd         m0, m6                            ; a:tl+tr+bl+br [first half]
+    paddd         m2, m7                            ; a:tl+tr+bl+br [second half]
+    paddd         m1, m0
+    paddd         m3, m2
+    pslld         m1, 2
+    pslld         m3, 2
+    psubd         m1, m0                            ; bb [first half]
+    psubd         m3, m2                            ; bb [second half]
+%endif
+
+%if ARCH_X86_64
+    movu         m11, [aq-4]
+    movu         m12, [aq+4]
+    paddd         m1, [aq]                          ; a:top+ctr+bottom [first half]
+    paddd        m11, m12                           ; a:bl+br [first half]
+    movu         m12, [aq+16-4]
+    movu         m13, [aq+16+4]
+    paddd         m3, [aq+16]                       ; a:top+ctr+bottom [second half]
+    paddd        m12, m13                           ; a:bl+br [second half]
+    paddd        m13, m1, m4                        ; a:top+ctr+bottom+l+r [first half]
+    paddd        m14, m3, m5                        ; a:top+ctr+bottom+l+r [second half]
+    paddd         m0, m11                           ; a:tl+tr+bl+br [first half]
+    paddd         m2, m12                           ; a:tl+tr+bl+br [second half]
+    paddd        m13, m0
+    paddd        m14, m2
+    pslld        m13, 2
+    pslld        m14, 2
+    psubd        m13, m0                            ; bb [first half]
+    psubd        m14, m2                            ; bb [second half]
+    psubd         m1, [aq-(384+16)*4*2]             ; a:ctr+bottom [first half]
+    psubd         m3, [aq-(384+16)*4*2+16]          ; a:ctr+bottom [second half]
+%else
+    mova          m4, [esp+0x80]
+    mova  [esp+0x80], m5
+    mova          m5, [esp+0x70]
+    mova  [esp+0x70], m6
+    mova          m6, [esp+0x60]
+    mova  [esp+0x60], m7
+    mova  [esp+0x20], m1
+    movu          m7, [bq-2]
+    movu          m1, [bq+2]
+    paddw         m5, [bq]                          ; b:top+ctr+bottom
+    paddw         m7, m1
+    paddw         m1, m5, m6                        ; b:top+ctr+bottom+l+r
+    paddw         m4, m7                            ; b:tl+tr+bl+br
+    psubw         m5, [bq-(384+16)*2*2]             ; b:ctr+bottom
+    paddw         m1, m4
+    psllw         m1, 2
+    psubw         m1, m4                            ; aa
+    movq          m0, [srcq]
+    XCHG_PIC_REG
+    punpcklbw     m0, [PIC_sym(pb_0)]
+    punpcklwd     m4, m1, [PIC_sym(pw_16)]
+    punpckhwd     m1, [PIC_sym(pw_16)]
+    punpcklwd     m2, m0, [PIC_sym(pw_16)]
+    punpckhwd     m0, [PIC_sym(pw_16)]
+    XCHG_PIC_REG
+    pmaddwd       m4, m2                            ; aa*src[x]+256 [first half]
+    pmaddwd       m1, m0                            ; aa*src[x]+256 [second half]
+%endif
+
+%if ARCH_X86_64
+    paddd         m6, m13
+    paddd        m10, m14
+    psrad         m6, 9
+    psrad        m10, 9
+    packssdw      m6, m10
+    mova        [tq], m6
+%else
+    paddd         m4, [esp+0x20]
+    paddd         m1, m3
+    psrad         m4, 9
+    psrad         m1, 9
+    packssdw      m4, m1
+    mova        [tq], m4
+%endif
+
+    ; shift to next row
+%if ARCH_X86_64
+    mova          m0, m4
+    mova          m2, m5
+    mova          m4, m11
+    mova          m5, m12
+    mova          m6, m8
+    mova          m8, m9
+%else
+    mova          m1, [esp+0x50]
+    mova          m3, [esp+0x40]
+    mova          m0, [esp+0x30]
+    mova          m2, [esp+0x80]
+    mova          m4, [esp+0x70]
+    mova  [esp+0x70], m5
+    mova          m5, [esp+0x60]
+    mova  [esp+0x80], m6
+    mova  [esp+0x60], m7
+    psubd         m1, [aq-(384+16)*4*2]             ; a:ctr+bottom [first half]
+    psubd         m3, [aq-(384+16)*4*2+16]          ; a:ctr+bottom [second half]
+%endif
+
+    add         srcq, strideq
+    add           aq, (384+16)*4
+    add           bq, (384+16)*2
+    add           tq, 384*2
+    dec           yd
+    jg .loop_y
+    add           xd, 8
+    cmp           xd, wd
+    jl .loop_x
+    RET
+
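+; sgr_weighted1: final blend of the filtered plane with the source, roughly
+; dst = src + (((t - 16*src) * wt) >> 11) with rounding, implemented as a
+; pmulhrsw against the weight pre-shifted left by 4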
+cglobal sgr_weighted1, 4, 7, 8, dst, stride, t, w, h, wt
+    movifnidn     hd, hm
+%if ARCH_X86_32
+    SETUP_PIC r6, 0
+%endif
+    movd          m0, wtm
+    pshufb        m0, [PIC_sym(pb_0_1)]
+    psllw         m0, 4
+    pxor          m7, m7
+    DEFINE_ARGS dst, stride, t, w, h, idx
+.loop_y:
+    xor         idxd, idxd
+.loop_x:
+    mova          m1, [tq+idxq*2+ 0]
+    mova          m4, [tq+idxq*2+16]
+    mova          m5, [dstq+idxq]
+    punpcklbw     m2, m5, m7
+    punpckhbw     m5, m7
+    psllw         m3, m2, 4
+    psllw         m6, m5, 4
+    psubw         m1, m3
+    psubw         m4, m6
+    pmulhrsw      m1, m0
+    pmulhrsw      m4, m0
+    paddw         m1, m2
+    paddw         m4, m5
+    packuswb      m1, m4
+    mova [dstq+idxq], m1
+    add         idxd, 16
+    cmp         idxd, wd
+    jl .loop_x
+    add         dstq, strideq
+    add           tq, 384 * 2
+    dec           hd
+    jg .loop_y
+    RET
+
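+; sgr_box5_h / sgr_box5_v: same as the box3 pair above but over a 5-wide /
+; 5-tall window; the 32-bit build of the vertical pass runs separate sumsq
+; and sum loops to stay within the eight available XMM registers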
+%if ARCH_X86_64
+cglobal sgr_box5_h, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xlim
+    mov        edged, edgem
+    movifnidn     wd, wm
+    mov           hd, hm
+    mova         m10, [pb_0]
+    mova         m11, [pb_0_1]
+%else
+cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
+ %define edgeb      byte edgem
+ %define wd         xd
+ %define wq         wd
+ %define wm         r5m
+ %define strideq    r4m
+    SUB          esp, 8
+    SETUP_PIC sumsqd, 1, 1
+
+ %define m10    [PIC_sym(pb_0)]
+ %define m11    [PIC_sym(pb_0_1)]
+%endif
+
+    test       edgeb, 2                             ; have_right
+    jz .no_right
+    xor        xlimd, xlimd
+    add           wd, 2
+    add           wd, 15
+    and           wd, ~15
+    jmp .right_done
+.no_right:
+    mov        xlimd, 3
+    dec           wd
+.right_done:
+    pxor          m1, m1
+    lea         srcq, [srcq+wq+1]
+    lea         sumq, [sumq+wq*2-2]
+    lea       sumsqq, [sumsqq+wq*4-4]
+    neg           wq
+%if ARCH_X86_64
+    lea          r10, [pb_right_ext_mask+24]
+%else
+    mov           wm, xd
+ %define wq wm
+%endif
+
+.loop_y:
+    mov           xq, wq
+    ; load left
+    test       edgeb, 1                             ; have_left
+    jz .no_left
+    test       leftq, leftq
+    jz .load_left_from_main
+    movd          m0, [leftq]
+    movd          m2, [srcq+xq-1]
+    pslldq        m2, 4
+    por           m0, m2
+    pslldq        m0, 11
+    add        leftq, 4
+    jmp .expand_x
+.no_left:
+    movd          m0, [srcq+xq-1]
+    XCHG_PIC_REG
+    pshufb        m0, m10
+    XCHG_PIC_REG
+    jmp .expand_x
+.load_left_from_main:
+    movd          m0, [srcq+xq-4]
+    pslldq        m0, 12
+.expand_x:
+    punpckhbw     m0, m1
+
+    ; at this point m0 contains the left two px in its highest words
+    cmp           xd, -8
+    jle .loop_x
+    test          xd, xd
+    jge .right_extend
+.partial_load_and_extend:
+    XCHG_PIC_REG
+    movd          m3, [srcq-1]
+    movq          m2, [srcq+xq]
+    pshufb        m3, m10
+    punpcklbw     m3, m1
+    punpcklbw     m2, m1
+%if ARCH_X86_64
+    movu          m4, [r10+xq*2]
+%else
+    movu          m4, [PIC_sym(pb_right_ext_mask)+xd*2+24]
+    XCHG_PIC_REG
+%endif
+    pand          m2, m4
+    pandn         m4, m3
+    por           m2, m4
+    jmp .loop_x_noload
+.right_extend:
+    psrldq        m2, m0, 14
+    XCHG_PIC_REG
+    pshufb        m2, m11
+    XCHG_PIC_REG
+    jmp .loop_x_noload
+
+.loop_x:
+    movq          m2, [srcq+xq]
+    punpcklbw     m2, m1
+.loop_x_noload:
+    palignr       m3, m2, m0, 8
+    palignr       m4, m2, m0, 10
+    palignr       m5, m2, m0, 12
+    palignr       m6, m2, m0, 14
+
+%if ARCH_X86_64
+    paddw         m0, m3, m2
+    punpcklwd     m7, m3, m2
+    punpckhwd     m3, m2
+    paddw         m0, m4
+    punpcklwd     m8, m4, m5
+    punpckhwd     m4, m5
+    paddw         m0, m5
+    punpcklwd     m9, m6, m1
+    punpckhwd     m5, m6, m1
+    paddw         m0, m6
+    pmaddwd       m7, m7
+    pmaddwd       m3, m3
+    pmaddwd       m8, m8
+    pmaddwd       m4, m4
+    pmaddwd       m9, m9
+    pmaddwd       m5, m5
+    paddd         m7, m8
+    paddd         m3, m4
+    paddd         m7, m9
+    paddd         m3, m5
+    movu [sumq+xq*2], m0
+    movu [sumsqq+xq*4+ 0], m7
+    movu [sumsqq+xq*4+16], m3
+%else
+    paddw         m0, m3, m2
+    paddw         m0, m4
+    paddw         m0, m5
+    paddw         m0, m6
+    movu [sumq+xq*2], m0
+    punpcklwd     m7, m3, m2
+    punpckhwd     m3, m2
+    punpcklwd     m0, m4, m5
+    punpckhwd     m4, m5
+    punpckhwd     m5, m6, m1
+    pmaddwd       m7, m7
+    pmaddwd       m3, m3
+    pmaddwd       m0, m0
+    pmaddwd       m4, m4
+    pmaddwd       m5, m5
+    paddd         m7, m0
+    paddd         m3, m4
+    paddd         m3, m5
+    punpcklwd     m0, m6, m1
+    pmaddwd       m0, m0
+    paddd         m7, m0
+    movu [sumsqq+xq*4+ 0], m7
+    movu [sumsqq+xq*4+16], m3
+%endif
+
+    mova          m0, m2
+    add           xq, 8
+
+    ; if x <= -8 we can reload more pixels
+    ; else if x < 0 we reload and extend (this implies have_right=0)
+    ; else if x < xlimd we extend from previous load (this implies have_right=0)
+    ; else we are done
+
+    cmp           xd, -8
+    jle .loop_x
+    test          xd, xd
+    jl .partial_load_and_extend
+    cmp           xd, xlimd
+    jl .right_extend
+
+    add         srcq, strideq
+    add       sumsqq, (384+16)*4
+    add         sumq, (384+16)*2
+    dec           hd
+    jg .loop_y
+%if ARCH_X86_32
+    ADD          esp, 8
+%endif
+    RET
+
+%if ARCH_X86_64
+cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
+    movifnidn  edged, edgem
+    mov        ylimd, edged
+%else
+cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr
+ %define wm     [esp+0]
+ %define hm     [esp+4]
+ %define edgem  [esp+8]
+    mov           wm, xd
+    mov           hm, yd
+    mov        edgem, ylimd
+%endif
+
+    and        ylimd, 8                             ; have_bottom
+    shr        ylimd, 2
+    sub        ylimd, 3                             ; -3 if have_bottom=0, else -1
+    mov           xq, -2
+%if ARCH_X86_64
+.loop_x:
+    lea           yd, [hd+ylimd+2]
+    lea   sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
+    lea     sum_ptrq, [  sumq+xq*2+2-(384+16)*2]
+    test       edgeb, 4                             ; have_top
+    jnz .load_top
+    movu          m0, [sumsq_ptrq+(384+16)*4*1]
+    movu          m1, [sumsq_ptrq+(384+16)*4*1+16]
+    mova          m2, m0
+    mova          m3, m1
+    mova          m4, m0
+    mova          m5, m1
+    mova          m6, m0
+    mova          m7, m1
+    movu         m10, [sum_ptrq+(384+16)*2*1]
+    mova         m11, m10
+    mova         m12, m10
+    mova         m13, m10
+    jmp .loop_y_second_load
+.load_top:
+    movu          m0, [sumsq_ptrq-(384+16)*4*1]      ; l3/4sq [left]
+    movu          m1, [sumsq_ptrq-(384+16)*4*1+16]   ; l3/4sq [right]
+    movu          m4, [sumsq_ptrq-(384+16)*4*0]      ; l2sq [left]
+    movu          m5, [sumsq_ptrq-(384+16)*4*0+16]   ; l2sq [right]
+    mova          m2, m0
+    mova          m3, m1
+    movu         m10, [sum_ptrq-(384+16)*2*1]        ; l3/4
+    movu         m12, [sum_ptrq-(384+16)*2*0]        ; l2
+    mova         m11, m10
+.loop_y:
+    movu          m6, [sumsq_ptrq+(384+16)*4*1]      ; l1sq [left]
+    movu          m7, [sumsq_ptrq+(384+16)*4*1+16]   ; l1sq [right]
+    movu         m13, [sum_ptrq+(384+16)*2*1]        ; l1
+.loop_y_second_load:
+    test          yd, yd
+    jle .emulate_second_load
+    movu          m8, [sumsq_ptrq+(384+16)*4*2]      ; l0sq [left]
+    movu          m9, [sumsq_ptrq+(384+16)*4*2+16]   ; l0sq [right]
+    movu         m14, [sum_ptrq+(384+16)*2*2]        ; l0
+.loop_y_noload:
+    paddd         m0, m2
+    paddd         m1, m3
+    paddw        m10, m11
+    paddd         m0, m4
+    paddd         m1, m5
+    paddw        m10, m12
+    paddd         m0, m6
+    paddd         m1, m7
+    paddw        m10, m13
+    paddd         m0, m8
+    paddd         m1, m9
+    paddw        m10, m14
+    movu [sumsq_ptrq+ 0], m0
+    movu [sumsq_ptrq+16], m1
+    movu  [sum_ptrq], m10
+
+    ; shift position down by one
+    mova          m0, m4
+    mova          m1, m5
+    mova          m2, m6
+    mova          m3, m7
+    mova          m4, m8
+    mova          m5, m9
+    mova         m10, m12
+    mova         m11, m13
+    mova         m12, m14
+    add   sumsq_ptrq, (384+16)*4*2
+    add     sum_ptrq, (384+16)*2*2
+    sub           yd, 2
+    jge .loop_y
+    ; l1 = l0
+    mova          m6, m8
+    mova          m7, m9
+    mova         m13, m14
+    cmp           yd, ylimd
+    jg .loop_y_noload
+    add           xd, 8
+    cmp           xd, wd
+    jl .loop_x
+    RET
+.emulate_second_load:
+    mova          m8, m6
+    mova          m9, m7
+    mova         m14, m13
+    jmp .loop_y_noload
+%else
+.sumsq_loop_x:
+    lea           yd, [ylimd+2]
+    add           yd, hm
+    lea   sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
+    test  byte edgem, 4                             ; have_top
+    jnz .sumsq_load_top
+    movu          m0, [sumsq_ptrq+(384+16)*4*1]
+    movu          m1, [sumsq_ptrq+(384+16)*4*1+16]
+    mova          m4, m0
+    mova          m5, m1
+    mova          m6, m0
+    mova          m7, m1
+    mova  [esp+0x1c], m0
+    mova  [esp+0x0c], m1
+    jmp .sumsq_loop_y_second_load
+.sumsq_load_top:
+    movu          m0, [sumsq_ptrq-(384+16)*4*1]      ; l3/4sq [left]
+    movu          m1, [sumsq_ptrq-(384+16)*4*1+16]   ; l3/4sq [right]
+    movu          m4, [sumsq_ptrq-(384+16)*4*0]      ; l2sq [left]
+    movu          m5, [sumsq_ptrq-(384+16)*4*0+16]   ; l2sq [right]
+    mova  [esp+0x1c], m0
+    mova  [esp+0x0c], m1
+.sumsq_loop_y:
+    movu          m6, [sumsq_ptrq+(384+16)*4*1]      ; l1sq [left]
+    movu          m7, [sumsq_ptrq+(384+16)*4*1+16]   ; l1sq [right]
+.sumsq_loop_y_second_load:
+    test          yd, yd
+    jle .sumsq_emulate_second_load
+    movu          m2, [sumsq_ptrq+(384+16)*4*2]      ; l0sq [left]
+    movu          m3, [sumsq_ptrq+(384+16)*4*2+16]   ; l0sq [right]
+.sumsq_loop_y_noload:
+    paddd         m0, [esp+0x1c]
+    paddd         m1, [esp+0x0c]
+    paddd         m0, m4
+    paddd         m1, m5
+    paddd         m0, m6
+    paddd         m1, m7
+    paddd         m0, m2
+    paddd         m1, m3
+    movu [sumsq_ptrq+ 0], m0
+    movu [sumsq_ptrq+16], m1
+
+    ; shift position down by one
+    mova          m0, m4
+    mova          m1, m5
+    mova          m4, m2
+    mova          m5, m3
+    mova  [esp+0x1c], m6
+    mova  [esp+0x0c], m7
+    add   sumsq_ptrq, (384+16)*4*2
+    sub           yd, 2
+    jge .sumsq_loop_y
+    ; l1 = l0
+    mova          m6, m2
+    mova          m7, m3
+    cmp           yd, ylimd
+    jg .sumsq_loop_y_noload
+    add           xd, 8
+    cmp           xd, wm
+    jl .sumsq_loop_x
+
+    mov           xd, -2
+.sum_loop_x:
+    lea           yd, [ylimd+2]
+    add           yd, hm
+    lea     sum_ptrq, [sumq+xq*2+2-(384+16)*2]
+    test  byte edgem, 4                             ; have_top
+    jnz .sum_load_top
+    movu          m0, [sum_ptrq+(384+16)*2*1]
+    mova          m1, m0
+    mova          m2, m0
+    mova          m3, m0
+    jmp .sum_loop_y_second_load
+.sum_load_top:
+    movu          m0, [sum_ptrq-(384+16)*2*1]        ; l3/4
+    movu          m2, [sum_ptrq-(384+16)*2*0]        ; l2
+    mova          m1, m0
+.sum_loop_y:
+    movu          m3, [sum_ptrq+(384+16)*2*1]        ; l1
+.sum_loop_y_second_load:
+    test          yd, yd
+    jle .sum_emulate_second_load
+    movu          m4, [sum_ptrq+(384+16)*2*2]        ; l0
+.sum_loop_y_noload:
+    paddw         m0, m1
+    paddw         m0, m2
+    paddw         m0, m3
+    paddw         m0, m4
+    movu  [sum_ptrq], m0
+
+    ; shift position down by one
+    mova          m0, m2
+    mova          m1, m3
+    mova          m2, m4
+    add     sum_ptrq, (384+16)*2*2
+    sub           yd, 2
+    jge .sum_loop_y
+    ; l1 = l0
+    mova          m3, m4
+    cmp           yd, ylimd
+    jg .sum_loop_y_noload
+    add           xd, 8
+    cmp           xd, wm
+    jl .sum_loop_x
+    RET
+.sumsq_emulate_second_load:
+    mova          m2, m6
+    mova          m3, m7
+    jmp .sumsq_loop_y_noload
+.sum_emulate_second_load:
+    mova          m4, m3
+    jmp .sum_loop_y_noload
+%endif
+
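+; sgr_calc_ab2: as sgr_calc_ab1 but for the 5x5 box (p = 25*sumsq - sum^2),
+; computed on every other row only, which is why the a/b pointers advance by
+; two rows per iteration while a single row is written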
+cglobal sgr_calc_ab2, 4, 7, 11, a, b, w, h, s
+    movifnidn     sd, sm
+    sub           aq, (384+16-1)*4
+    sub           bq, (384+16-1)*2
+    add           hd, 2
+%if ARCH_X86_64
+    LEA           r5, sgr_x_by_x-0xF03
+%else
+    SETUP_PIC r5, 0
+%endif
+    movd          m6, sd
+    pshuflw       m6, m6, q0000
+    punpcklqdq    m6, m6
+    pxor          m7, m7
+    DEFINE_ARGS a, b, w, h, x
+%if ARCH_X86_64
+    mova          m8, [pd_0xF0080029]
+    mova          m9, [pw_256]
+    psrld        m10, m9, 15                        ; pd_512
+%else
+ %define m8     [PIC_sym(pd_0xF0080029)]
+ %define m9     [PIC_sym(pw_256)]
+ %define m10    [PIC_sym(pd_512)]
+%endif
+.loop_y:
+    mov           xq, -2
+.loop_x:
+    movq          m0, [bq+xq*2+0]
+    movq          m1, [bq+xq*2+8]
+    punpcklwd     m0, m7
+    punpcklwd     m1, m7
+    movu          m2, [aq+xq*4+ 0]
+    movu          m3, [aq+xq*4+16]
+    pslld         m4, m2, 3                         ; aa * 8
+    pslld         m5, m3, 3
+    paddd         m2, m4                            ; aa * 9
+    paddd         m3, m5
+    paddd         m4, m4                            ; aa * 16
+    paddd         m5, m5
+    paddd         m2, m4                            ; aa * 25
+    paddd         m3, m5
+    pmaddwd       m4, m0, m0
+    pmaddwd       m5, m1, m1
+    psubd         m2, m4                            ; p = aa * 25 - bb * bb
+    psubd         m3, m5
+    MULLD         m2, m6
+    MULLD         m3, m6
+    paddusw       m2, m8
+    paddusw       m3, m8
+    psrld         m2, 20                            ; z
+    psrld         m3, 20
+    GATHERDD      m4, m2                            ; xx
+    GATHERDD      m2, m3
+    psrld         m4, 24
+    psrld         m2, 24
+    packssdw      m3, m4, m2
+    pmullw        m4, m8
+    pmullw        m2, m8
+    psubw         m5, m9, m3
+    pmaddwd       m0, m4
+    pmaddwd       m1, m2
+    paddd         m0, m10
+    paddd         m1, m10
+    psrld         m0, 10
+    psrld         m1, 10
+    movu   [bq+xq*2], m5
+    movu [aq+xq*4+ 0], m0
+    movu [aq+xq*4+16], m1
+    add           xd, 8
+    cmp           xd, wd
+    jl .loop_x
+    add           aq, (384+16)*4*2
+    add           bq, (384+16)*2*2
+    sub           hd, 2
+    jg .loop_y
+    RET
+
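+; radius-2 (5x5) neighbourhood pass: the a/b coefficients are combined with
+; their horizontal/vertical neighbours using 5/6 weighting (pw_5_6) and
+; applied to two source rows per iteration, producing the intermediate
+; output in t.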
+%if ARCH_X86_64
+cglobal sgr_finish_filter2, 5, 13, 14, t, src, stride, a, b, w, h, \
+                                       tmp_base, src_base, a_base, b_base, x, y
+    movifnidn     wd, wm
+    mov           hd, hm
+    mov    tmp_baseq, tq
+    mov    src_baseq, srcq
+    mov      a_baseq, aq
+    mov      b_baseq, bq
+    mova          m9, [pw_5_6]
+    mova         m12, [pw_256]
+    psrlw        m10, m12, 8                    ; pw_1
+    psrlw        m11, m12, 1                    ; pw_128
+    pxor         m13, m13
+%else
+cglobal sgr_finish_filter2, 6, 7, 8, t, src, stride, a, b, x, y
+ %define tmp_baseq  r0m
+ %define src_baseq  r1m
+ %define a_baseq    r3m
+ %define b_baseq    r4m
+ %define wd         r5m
+ %define hd         r6m
+
+    SUB          esp, 8
+    SETUP_PIC yd
+
+ %define m8     m5
+ %define m9     [PIC_sym(pw_5_6)]
+ %define m10    [PIC_sym(pw_1)]
+ %define m11    [PIC_sym(pw_128)]
+ %define m12    [PIC_sym(pw_256)]
+ %define m13    m0
+%endif
+    xor           xd, xd
+.loop_x:
+    mov           tq, tmp_baseq
+    mov         srcq, src_baseq
+    mov           aq, a_baseq
+    mov           bq, b_baseq
+    movu          m0, [aq+xq*4-(384+16)*4-4]
+    mova          m1, [aq+xq*4-(384+16)*4]
+    movu          m2, [aq+xq*4-(384+16)*4+4]
+    movu          m3, [aq+xq*4-(384+16)*4-4+16]
+    mova          m4, [aq+xq*4-(384+16)*4+16]
+    movu          m5, [aq+xq*4-(384+16)*4+4+16]
+    paddd         m0, m2
+    paddd         m3, m5
+    paddd         m0, m1
+    paddd         m3, m4
+    pslld         m2, m0, 2
+    pslld         m5, m3, 2
+    paddd         m2, m0
+    paddd         m5, m3
+    paddd         m0, m2, m1                    ; prev_odd_b [first half]
+    paddd         m1, m5, m4                    ; prev_odd_b [second half]
+    movu          m3, [bq+xq*2-(384+16)*2-2]
+    mova          m4, [bq+xq*2-(384+16)*2]
+    movu          m5, [bq+xq*2-(384+16)*2+2]
+    paddw         m3, m5
+    punpcklwd     m5, m3, m4
+    punpckhwd     m3, m4
+    pmaddwd       m5, m9
+    pmaddwd       m3, m9
+    mova          m2, m5
+    packssdw      m2, m3                        ; prev_odd_a
+    lea           tq, [tq+xq*2]
+    lea         srcq, [srcq+xq*1]
+    lea           aq, [aq+xq*4+(384+16)*4]
+    lea           bq, [bq+xq*2+(384+16)*2]
+%if ARCH_X86_32
+    mov        [esp], PIC_reg
+%endif
+    mov           yd, hd
+    XCHG_PIC_REG
+.loop_y:
+    movu          m3, [aq-4]
+    mova          m4, [aq]
+    movu          m5, [aq+4]
+    paddd         m3, m5
+    paddd         m3, m4
+    pslld         m5, m3, 2
+    paddd         m5, m3
+    paddd         m5, m4                        ; cur_odd_b [first half]
+    movu          m3, [aq+16-4]
+    mova          m6, [aq+16]
+    movu          m7, [aq+16+4]
+    paddd         m3, m7
+    paddd         m3, m6
+    pslld         m7, m3, 2
+    paddd         m7, m3
+    paddd         m4, m7, m6                    ; cur_odd_b [second half]
+    movu          m3, [bq-2]
+    mova          m6, [bq]
+    movu          m7, [bq+2]
+    paddw         m3, m7
+    punpcklwd     m7, m3, m6
+    punpckhwd     m3, m6
+    pmaddwd       m7, m9
+    pmaddwd       m3, m9
+    packssdw      m6, m7, m3                    ; cur_odd_a
+
+    paddd         m0, m5                        ; cur_even_b [first half]
+    paddd         m1, m4                        ; cur_even_b [second half]
+    paddw         m2, m6                        ; cur_even_a
+
+    movq          m3, [srcq]
+%if ARCH_X86_64
+    punpcklbw     m3, m13
+%else
+    mova        [td], m5
+    pxor          m7, m7
+    punpcklbw     m3, m7
+%endif
+    punpcklwd     m7, m3, m10
+    punpckhwd     m3, m10
+    punpcklwd     m8, m2, m12
+    punpckhwd     m2, m12
+    pmaddwd       m7, m8
+    pmaddwd       m3, m2
+    paddd         m7, m0
+    paddd         m3, m1
+    psrad         m7, 9
+    psrad         m3, 9
+
+%if ARCH_X86_32
+    pxor         m13, m13
+%endif
+    movq          m8, [srcq+strideq]
+    punpcklbw     m8, m13
+    punpcklwd     m0, m8, m10
+    punpckhwd     m8, m10
+    punpcklwd     m1, m6, m11
+    punpckhwd     m2, m6, m11
+    pmaddwd       m0, m1
+    pmaddwd       m8, m2
+%if ARCH_X86_64
+    paddd         m0, m5
+%else
+    paddd         m0, [td]
+%endif
+    paddd         m8, m4
+    psrad         m0, 8
+    psrad         m8, 8
+
+    packssdw      m7, m3
+    packssdw      m0, m8
+%if ARCH_X86_32
+    mova          m5, [td]
+%endif
+    mova [tq+384*2*0], m7
+    mova [tq+384*2*1], m0
+
+    mova          m0, m5
+    mova          m1, m4
+    mova          m2, m6
+    add           aq, (384+16)*4*2
+    add           bq, (384+16)*2*2
+    add           tq, 384*2*2
+    lea         srcq, [srcq+strideq*2]
+%if ARCH_X86_64
+    sub           yd, 2
+%else
+    sub dword [esp+4], 2
+%endif
+    jg .loop_y
+    add           xd, 8
+    cmp           xd, wd
+    jl .loop_x
+%if ARCH_X86_32
+    ADD          esp, 8
+%endif
+    RET
+
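+; final blend of the two intermediate planes with the pair of 16-bit weights
+; packed into wt:
+; dst = clamp_u8(src + ((wt0*(t1 - 16*src) + wt1*(t2 - 16*src) + 1024) >> 11))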
+%undef t2
+cglobal sgr_weighted2, 4, 7, 12, dst, stride, t1, t2, w, h, wt
+    movifnidn     wd, wm
+    movd          m0, wtm
+%if ARCH_X86_64
+    movifnidn     hd, hm
+    mova         m10, [pd_1024]
+    pxor         m11, m11
+%else
+    SETUP_PIC     hd, 0
+ %define m10    [PIC_sym(pd_1024)]
+ %define m11    m7
+%endif
+    pshufd        m0, m0, 0
+    DEFINE_ARGS dst, stride, t1, t2, w, h, idx
+%if ARCH_X86_32
+ %define hd     hmp
+%endif
+
+.loop_y:
+    xor         idxd, idxd
+.loop_x:
+    mova          m1, [t1q+idxq*2+ 0]
+    mova          m2, [t1q+idxq*2+16]
+    mova          m3, [t2q+idxq*2+ 0]
+    mova          m4, [t2q+idxq*2+16]
+    mova          m6, [dstq+idxq]
+%if ARCH_X86_32
+    pxor          m11, m11
+%endif
+    punpcklbw     m5, m6, m11
+    punpckhbw     m6, m11
+    psllw         m7, m5, 4
+    psubw         m1, m7
+    psubw         m3, m7
+    psllw         m7, m6, 4
+    psubw         m2, m7
+    psubw         m4, m7
+    punpcklwd     m7, m1, m3
+    punpckhwd     m1, m3
+    punpcklwd     m3, m2, m4
+    punpckhwd     m2, m4
+    pmaddwd       m7, m0
+    pmaddwd       m1, m0
+    pmaddwd       m3, m0
+    pmaddwd       m2, m0
+    paddd         m7, m10
+    paddd         m1, m10
+    paddd         m3, m10
+    paddd         m2, m10
+    psrad         m7, 11
+    psrad         m1, 11
+    psrad         m3, 11
+    psrad         m2, 11
+    packssdw      m7, m1
+    packssdw      m3, m2
+    paddw         m7, m5
+    paddw         m3, m6
+    packuswb      m7, m3
+    mova [dstq+idxq], m7
+    add         idxd, 16
+    cmp         idxd, wd
+    jl .loop_x
+    add         dstq, strideq
+    add          t1q, 384 * 2
+    add          t2q, 384 * 2
+    dec           hd
+    jg .loop_y
+    RET
--- a/src/x86/looprestoration_ssse3.asm
+++ /dev/null
@@ -1,1953 +1,0 @@
-; Copyright © 2018, VideoLAN and dav1d authors
-; Copyright © 2018, Two Orioles, LLC
-; Copyright © 2018, VideoLabs
-; All rights reserved.
-;
-; Redistribution and use in source and binary forms, with or without
-; modification, are permitted provided that the following conditions are met:
-;
-; 1. Redistributions of source code must retain the above copyright notice, this
-;    list of conditions and the following disclaimer.
-;
-; 2. Redistributions in binary form must reproduce the above copyright notice,
-;    this list of conditions and the following disclaimer in the documentation
-;    and/or other materials provided with the distribution.
-;
-; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-%include "config.asm"
-%include "ext/x86/x86inc.asm"
-
-SECTION_RODATA 16
-
-pb_right_ext_mask: times 16 db 0xff
-                   times 16 db 0
-pb_14x0_1_2: times 14 db 0
-             db 1, 2
-pb_0_to_15_min_n: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13
-                  db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14
-pb_unpcklwdw: db 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13
-pb_0: times 16 db 0
-pb_2: times 16 db 2
-pb_3: times 16 db 3
-pb_4: times 16 db 4
-pb_15: times 16 db 15
-pb_0_1: times 8 db 0, 1
-pb_6_7: times 8 db 6, 7
-pb_14_15: times 8 db 14, 15
-pw_1: times 8 dw 1
-pw_16: times 8 dw 16
-pw_128: times 8 dw 128
-pw_255: times 8 dw 255
-pw_256: times 8 dw 256
-pw_2048: times 8 dw 2048
-pw_16380: times 8 dw 16380
-pw_5_6: times 4 dw 5, 6
-pw_0_128: times 4 dw 0, 128
-pd_1024: times 4 dd 1024
-%if ARCH_X86_32
-pd_256: times 4 dd 256
-pd_512: times 4 dd 512
-pd_2048: times 4 dd 2048
-%endif
-pd_0xF0080029: times 4 dd 0xF0080029
-pd_0xF00801C7: times 4 dd 0XF00801C7
-
-cextern sgr_x_by_x
-
-SECTION .text
-
-%if ARCH_X86_32
- %define PIC_base_offset $$
-
- %macro SETUP_PIC 1-3 1,0 ; PIC_reg, save_PIC_reg, restore_PIC_reg
-  %assign pic_reg_stk_off 4
-  %xdefine PIC_reg %1
-  %if %2 == 1
-    mov        [esp], %1
-  %endif
-    LEA      PIC_reg, PIC_base_offset
-  %if %3 == 1
-    XCHG_PIC_REG
-  %endif
- %endmacro
-
- %macro XCHG_PIC_REG 0
-    mov [esp+pic_reg_stk_off], PIC_reg
-    %assign pic_reg_stk_off (pic_reg_stk_off+4) % 8
-    mov PIC_reg, [esp+pic_reg_stk_off]
- %endmacro
-
- %define PIC_sym(sym)   (PIC_reg+(sym)-PIC_base_offset)
-
-%else
- %macro XCHG_PIC_REG 0
- %endmacro
-
- %define PIC_sym(sym)   (sym)
-%endif
-
-%macro PALIGNR 4 ; dst, src1, src2, shift
- %if cpuflag(ssse3)
-    palignr       %1, %2, %3, %4
- %else
-  %assign %%i regnumof%+%1 + 1
-  %define %%tmp m %+ %%i
-    psrldq        %1, %3, %4
-    pslldq     %%tmp, %2, 16-%4
-    por           %1, %%tmp
- %endif
-%endmacro
-
-%macro PMADDUBSW 5 ; dst, src, zero, tmp, reset_zero
- %if cpuflag(ssse3)
-    pmaddubsw     %1, %2
- %else
-  %if %5 == 1
-    pxor          %3, %3
-  %endif
-    punpckhbw     %4, %1, %3
-    punpcklbw     %1, %3
-    pmaddwd       %4, %2
-    pmaddwd       %1, %2
-    packssdw      %1, %4
- %endif
-%endmacro
-
-;;;;;;;;;;;;;;;;;;;;;;
-;;      wiener      ;;
-;;;;;;;;;;;;;;;;;;;;;;
-
-%macro WIENER_H 0
-%if ARCH_X86_64
-cglobal wiener_filter_h, 5, 15, 16, dst, left, src, stride, fh, w, h, edge
-    mov        edged, edgem
-    movifnidn     wd, wm
-    mov           hd, hm
-%else
-cglobal wiener_filter_h, 5, 7, 8, -84, dst, left, src, stride, fh, w, h, edge
-    mov           r5, edgem
-    mov     [esp+12], r5
-    mov           wd, wm
-    mov           hd, hm
-    SETUP_PIC hd
- %define m15 m0
- %define m14 m1
- %define m13 m2
- %define m12 m3
-%endif
-
-    movq         m15, [fhq]
-%if cpuflag(ssse3)
-    pshufb       m12, m15, [PIC_sym(pb_6_7)]
-    pshufb       m13, m15, [PIC_sym(pb_4)]
-    pshufb       m14, m15, [PIC_sym(pb_2)]
-    pshufb       m15, m15, [PIC_sym(pb_0)]
-%else
-    pshuflw      m12, m15, q3333
-    punpcklbw    m15, m15
-    pshufhw      m13, m15, q0000
-    pshuflw      m14, m15, q2222
-    pshuflw      m15, m15, q0000
-    punpcklqdq   m12, m12
-    punpckhqdq   m13, m13
-    punpcklqdq   m14, m14
-    punpcklqdq   m15, m15
-    psraw        m13, 8
-    psraw        m14, 8
-    psraw        m15, 8
-%endif
-
-%if ARCH_X86_64
-    mova         m11, [pw_2048]
-    mova         m10, [pw_16380]
-    lea          r11, [pb_right_ext_mask]
-
-    DEFINE_ARGS dst, left, src, stride, x, w, h, edge, srcptr, dstptr, xlim
-%else
- %define m10    [PIC_sym(pw_16380)]
- %define m11    [PIC_sym(pw_2048)]
- %define m12    [esp+0x14]
- %define m13    [esp+0x24]
- %define m14    [esp+0x34]
- %define m15    [esp+0x44]
-    mova         m12, m3
-    mova         m13, m2
-    mova         m14, m1
-    mova         m15, m0
-
-    DEFINE_ARGS dst, left, src, stride, x, w, h, edge
- %define srcptrq    srcq
- %define dstptrq    dstq
- %define hd         dword [esp+ 0]
- %define edgeb      byte  [esp+12]
- %define xlimd      dword [esp+16]
-%endif
-
-    ; if (edge & has_right) align_w_to_16
-    ; else w -= 3, and use that as limit in x loop
-    test       edgeb, 2 ; has_right
-    jnz .align
-    mov        xlimd, -3
-    jmp .loop
-.align:
-    add           wd, 15
-    and           wd, ~15
-%if ARCH_X86_64
-    xor        xlimd, xlimd
-%else
-    mov        xlimd, 0
-%endif
-
-    ; main y loop for horizontal filter
-.loop:
-%if ARCH_X86_64
-    mov      srcptrq, srcq
-    mov      dstptrq, dstq
-    lea           xd, [wq+xlimq]
-%else
-    mov      [esp+8], srcq
-    mov      [esp+4], dstq
-    mov           xd, xlimd
-    add           xd, wd
-%endif
-
-    ; load left edge pixels
-    test       edgeb, 1 ; have_left
-    jz .emu_left
-    test       leftq, leftq ; left == NULL for the edge-extended bottom/top
-    jz .load_left_combined
-    movd          m0, [leftq]
-    movd          m1, [srcq]
-    punpckldq     m0, m1
-    pslldq        m0, 9
-    add        leftq, 4
-    jmp .left_load_done
-.load_left_combined:
-    movq          m0, [srcq-3]
-    pslldq        m0, 10
-    jmp .left_load_done
-.emu_left:
-    movd          m0, [srcq]
-%if cpuflag(ssse3)
-    pshufb        m0, [PIC_sym(pb_14x0_1_2)]
-%else
-    pslldq        m1, m0, 13
-    punpcklbw     m0, m0
-    pshuflw       m0, m0, q0000
-    punpcklqdq    m0, m0
-    psrldq        m0, 2
-    por           m0, m1
-%endif
-
-    ; load right edge pixels
-.left_load_done:
-    cmp           xd, 16
-    jg .main_load
-    test          xd, xd
-    jg .load_and_splat
-    je .splat_right
-
-    ; for very small images (w=[1-2]), edge-extend the original cache,
-    ; ugly, but only runs in very odd cases
-%if cpuflag(ssse3)
-    add           wd, wd
- %if ARCH_X86_64
-    pshufb        m0, [r11-pb_right_ext_mask+pb_0_to_15_min_n+wq*8-16]
- %else
-    pshufb        m0, [PIC_sym(pb_0_to_15_min_n)+wq*8-16]
- %endif
-    shr           wd, 1
-%else
-    shl           wd, 4
-    pcmpeqd       m2, m2
-    movd          m3, wd
-    psrldq        m2, 2
-    punpckhbw     m1, m0, m0
-    pshufhw       m1, m1, q1122
-    psllq         m1, m3
-    pand          m0, m2
-    pandn         m2, m1
-    por           m0, m2
-    shr           wd, 4
-%endif
-
-    ; main x loop, mostly this starts in .main_load
-.splat_right:
-    ; no need to load new pixels, just extend them from the (possibly previously
-    ; extended) previous load into m0
-%if cpuflag(ssse3)
-    pshufb        m1, m0, [PIC_sym(pb_15)]
-%else
-    punpckhbw     m1, m0, m0
-    pshufhw       m1, m1, q3333
-    punpckhqdq    m1, m1
-%endif
-    jmp .main_loop
-.load_and_splat:
-    ; load new pixels and extend edge for right-most
-    movu          m1, [srcptrq+3]
-%if ARCH_X86_64
-    sub          r11, xq
-    movu          m2, [r11+16]
-    add          r11, xq
-%else
-    sub      PIC_reg, xd
-    movu          m2, [PIC_sym(pb_right_ext_mask)+16]
-    add      PIC_reg, xd
-%endif
-    movd          m3, [srcptrq+2+xq]
-%if cpuflag(ssse3)
-    pshufb        m3, [PIC_sym(pb_0)]
-%else
-    punpcklbw     m3, m3
-    pshuflw       m3, m3, q0000
-    punpcklqdq    m3, m3
-%endif
-    pand          m1, m2
-    pxor          m2, [PIC_sym(pb_right_ext_mask)]
-    pand          m3, m2
-    pxor          m2, [PIC_sym(pb_right_ext_mask)]
-    por           m1, m3
-    jmp .main_loop
-.main_load:
-    ; load subsequent line
-    movu          m1, [srcptrq+3]
-.main_loop:
-%if ARCH_X86_64
-    PALIGNR       m2, m1, m0, 10
-    PALIGNR       m3, m1, m0, 11
-    PALIGNR       m4, m1, m0, 12
-    PALIGNR       m5, m1, m0, 13
-    PALIGNR       m6, m1, m0, 14
-    PALIGNR       m7, m1, m0, 15
-
-    punpcklbw     m0, m2, m1
-    punpckhbw     m2, m1
-    punpcklbw     m8, m3, m7
-    punpckhbw     m3, m7
-    punpcklbw     m7, m4, m6
-    punpckhbw     m4, m6
-    PMADDUBSW     m0, m15, m6, m9, 1
-    PMADDUBSW     m2, m15, m6, m9, 0
-    PMADDUBSW     m8, m14, m6, m9, 0
-    PMADDUBSW     m3, m14, m6, m9, 0
-    PMADDUBSW     m7, m13, m6, m9, 0
-    PMADDUBSW     m4, m13, m6, m9, 0
-    paddw         m0, m8
-    paddw         m2, m3
- %if cpuflag(ssse3)
-    pxor          m6, m6
- %endif
-    punpcklbw     m3, m5, m6
-    punpckhbw     m5, m6
-    psllw         m8, m3, 7
-    psllw         m6, m5, 7
-    psubw         m8, m10
-    psubw         m6, m10
-    pmullw        m3, m12
-    pmullw        m5, m12
-    paddw         m0, m7
-    paddw         m2, m4
-    paddw         m0, m3
-    paddw         m2, m5
-    paddsw        m0, m8 ; see the avx2 for an explanation
-    paddsw        m2, m6 ; of how the clipping works here
-    psraw         m0, 3
-    psraw         m2, 3
-    paddw         m0, m11
-    paddw         m2, m11
-    mova [dstptrq+ 0], m0
-    mova [dstptrq+16], m2
-%else
-    PALIGNR       m2, m1, m0, 10
-    punpcklbw     m3, m2, m1
-    punpckhbw     m2, m1
-    PMADDUBSW     m3, m15, m4, m5, 1
-    PMADDUBSW     m2, m15, m4, m5, 0
-    PALIGNR       m4, m1, m0, 11
-    PALIGNR       m5, m1, m0, 15
-    punpcklbw     m6, m4, m5
-    punpckhbw     m4, m5
-    PMADDUBSW     m6, m14, m5, m7, 1
-    PMADDUBSW     m4, m14, m5, m7, 0
-    paddw         m3, m6
-    paddw         m2, m4
-    PALIGNR       m4, m1, m0, 12
-    PALIGNR       m5, m1, m0, 14
-    punpcklbw     m6, m4, m5
-    punpckhbw     m4, m5
-    PMADDUBSW     m6, m13, m5, m7, 1
-    PMADDUBSW     m4, m13, m5, m7, 0
-    paddw         m3, m6
-    paddw         m2, m4
-    PALIGNR       m6, m1, m0, 13
- %if cpuflag(ssse3)
-    pxor          m5, m5
- %endif
-    punpcklbw     m4, m6, m5
-    punpckhbw     m6, m5
-    psllw         m5, m4, 7
-    psllw         m7, m6, 7
-    psubw         m5, m10
-    psubw         m7, m10
-    pmullw        m4, m12
-    pmullw        m6, m12
-    paddw         m3, m4
-    paddw         m2, m6
-    paddsw        m3, m5
-    paddsw        m2, m7
-    psraw         m3, 3
-    psraw         m2, 3
-    paddw         m3, m11
-    paddw         m2, m11
-    mova [dstptrq+ 0], m3
-    mova [dstptrq+16], m2
-%endif
-
-    mova          m0, m1
-    add      srcptrq, 16
-    add      dstptrq, 32
-    sub           xd, 16
-    cmp           xd, 16
-    jg .main_load
-    test          xd, xd
-    jg .load_and_splat
-    cmp           xd, xlimd
-    jg .splat_right
-
-%if ARCH_X86_32
-    mov         srcq, [esp+8]
-    mov         dstq, [esp+4]
-%endif
-    add         srcq, strideq
-    add         dstq, 384*2
-    dec           hd
-    jg .loop
-    RET
-%endmacro
-
-%macro WIENER_V 0
-%if ARCH_X86_64
-cglobal wiener_filter_v, 4, 10, 16, dst, stride, mid, w, h, fv, edge
-    mov        edged, edgem
-    movifnidn    fvq, fvmp
-    movifnidn     hd, hm
-    movq         m15, [fvq]
-    pshufd       m14, m15, q1111
-    pshufd       m15, m15, q0000
-    paddw        m14, [pw_0_128]
-    mova         m12, [pd_1024]
-
-    DEFINE_ARGS dst, stride, mid, w, h, y, edge, ylim, mptr, dstptr
-
-    mov        ylimd, edged
-    and        ylimd, 8 ; have_bottom
-    shr        ylimd, 2
-    sub        ylimd, 3
-%else
-cglobal wiener_filter_v, 5, 7, 8, -96, dst, stride, mid, w, h, fv, edge
- %define ylimd [esp+12]
-
-    mov          r5d, edgem
-    and          r5d, 8
-    shr          r5d, 2
-    sub          r5d, 3
-    mov        ylimd, r5d
-    mov          fvq, fvmp
-    mov        edged, edgem
-
-    SETUP_PIC edged
-
-    movq          m0, [fvq]
-    pshufd        m1, m0, q1111
-    pshufd        m0, m0, q0000
-    paddw         m1, [PIC_sym(pw_0_128)]
-    mova  [esp+0x50], m0
-    mova  [esp+0x40], m1
-
-    DEFINE_ARGS dst, stride, mid, w, h, y, edge
- %define mptrq      midq
- %define dstptrq    dstq
- %define edgeb      byte [esp]
-%endif
-
-    ; main x loop for vertical filter, does one column of 16 pixels
-.loop_x:
-    mova          m3, [midq] ; middle line
-
-    ; load top pixels
-    test       edgeb, 4 ; have_top
-    jz .emu_top
-    mova          m0, [midq-384*4]
-    mova          m2, [midq-384*2]
-    mova          m1, m0
-    jmp .load_bottom_pixels
-.emu_top:
-    mova          m0, m3
-    mova          m1, m3
-    mova          m2, m3
-
-    ; load bottom pixels
-.load_bottom_pixels:
-    mov           yd, hd
-%if ARCH_X86_64
-    mov        mptrq, midq
-    mov      dstptrq, dstq
-    add           yd, ylimd
-%else
-    mov      [esp+8], midq
-    mov      [esp+4], dstq
-    add           yd, ylimd
-%endif
-    jg .load_threelines
-
-    ; the remainder here is somewhat messy but only runs in very weird
-    ; circumstances at the bottom of the image in very small blocks (h=[1-3]),
-    ; so performance is not terribly important here...
-    je .load_twolines
-    cmp           yd, -1
-    je .load_oneline
-    ; h == 1 case
-    mova          m5, m3
-    mova          m4, m3
-    mova          m6, m3
-    jmp .loop
-.load_oneline:
-    ; h == 2 case
-    mova          m4, [midq+384*2]
-    mova          m5, m4
-    mova          m6, m4
-    jmp .loop
-.load_twolines:
-    ; h == 3 case
-    mova          m4, [midq+384*2]
-    mova          m5, [midq+384*4]
-    mova          m6, m5
-    jmp .loop
-.load_threelines:
-    ; h > 3 case
-    mova          m4, [midq+384*2]
-    mova          m5, [midq+384*4]
-    ; third line loaded in main loop below
-
-    ; main y loop for vertical filter
-.loop_load:
-    ; load one line into m6. if that pixel is no longer available, do
-    ; nothing, since m6 still has the data from the previous line in it. We
-    ; try to structure the loop so that the common case is evaluated fastest
-    mova          m6, [mptrq+384*6]
-.loop:
-%if ARCH_X86_64
-    paddw         m7, m0, m6
-    paddw         m8, m1, m5
-    paddw         m9, m2, m4
-    punpcklwd    m10, m7, m8
-    punpckhwd     m7, m8
-    punpcklwd    m11, m9, m3
-    punpckhwd     m9, m3
-    pmaddwd      m10, m15
-    pmaddwd       m7, m15
-    pmaddwd      m11, m14
-    pmaddwd       m9, m14
-    paddd        m10, m12
-    paddd         m7, m12
-    paddd        m10, m11
-    paddd         m7, m9
-    psrad        m10, 11
-    psrad         m7, 11
-    packssdw     m10, m7
-    packuswb     m10, m10
-    movq   [dstptrq], m10
-%else
-    mova  [esp+0x30], m1
-    mova  [esp+0x20], m2
-    mova  [esp+0x10], m3
-    paddw         m0, m6
-    paddw         m1, m5
-    paddw         m2, m4
-    punpcklwd     m7, m2, m3
-    punpckhwd     m2, m3
-    punpcklwd     m3, m0, m1
-    punpckhwd     m0, m1
-    mova          m1, [esp+0x50]
-    pmaddwd       m3, m1
-    pmaddwd       m0, m1
-    mova          m1, [esp+0x40]
-    pmaddwd       m7, m1
-    pmaddwd       m2, m1
-    paddd         m3, [PIC_sym(pd_1024)]
-    paddd         m0, [PIC_sym(pd_1024)]
-    paddd         m3, m7
-    paddd         m0, m2
-    psrad         m3, 11
-    psrad         m0, 11
-    packssdw      m3, m0
-    packuswb      m3, m3
-    movq      [dstq], m3
-    mova          m1, [esp+0x30]
-    mova          m2, [esp+0x20]
-    mova          m3, [esp+0x10]
-%endif
-    ; shift pixels one position
-    mova          m0, m1
-    mova          m1, m2
-    mova          m2, m3
-    mova          m3, m4
-    mova          m4, m5
-    mova          m5, m6
-    add        mptrq, 384*2
-    add      dstptrq, strideq
-    dec           yd
-    jg .loop_load
-    ; for the bottom pixels, continue using m6 (as extended edge)
-    cmp           yd, ylimd
-    jg .loop
-
-%if ARCH_X86_32
-    mov         midq, [esp+8]
-    mov         dstq, [esp+4]
-%endif
-    add         midq, 16
-    add         dstq, 8
-    sub           wd, 8
-    jg .loop_x
-    RET
-%endmacro
-
-INIT_XMM sse2
-WIENER_H
-WIENER_V
-
-INIT_XMM ssse3
-WIENER_H
-WIENER_V
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;      self-guided     ;;
-;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-%macro MULLD 2
-    pmulhuw       m5, %1, %2
-    pmullw        %1, %2
-    pslld         m5, 16
-    paddd         %1, m5
-%endmacro
-
-%macro GATHERDD 2
-    mova          m5, m7
-    movd         r6d, %2
- %if ARCH_X86_64
-    movd          %1, [r5+r6]
-    pextrw       r6d, %2, 2
-    pinsrw        m5, [r5+r6+2], 3
-    pextrw       r6d, %2, 4
-    pinsrw        %1, [r5+r6+2], 5
-    pextrw       r6d, %2, 6
-    pinsrw        m5, [r5+r6+2], 7
- %else
-    movd          %1, [PIC_sym(sgr_x_by_x-0xF03)+r6]
-    pextrw       r6d, %2, 2
-    pinsrw        m5, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 3
-    pextrw       r6d, %2, 4
-    pinsrw        %1, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 5
-    pextrw       r6d, %2, 6
-    pinsrw        m5, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 7
- %endif
-    por           %1, m5
-%endmacro
-
-%if ARCH_X86_64
-cglobal sgr_box3_h, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
-    mov        xlimd, edgem
-    movifnidn     xd, xm
-    mov           hd, hm
-    mov        edged, xlimd
-    and        xlimd, 2                             ; have_right
-    add           xd, xlimd
-    xor        xlimd, 2                             ; 2*!have_right
-%else
-cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
- %define wq     r0m
- %define xlimd  r1m
- %define hd     hmp
- %define edgeb  byte edgem
-
-    mov           r6, edgem
-    and           r6, 2                             ; have_right
-    add           xd, r6
-    xor           r6, 2                             ; 2*!have_right
-    mov        xlimd, r6
-    SETUP_PIC     r6, 0
-%endif
-
-    jnz .no_right
-    add           xd, 7
-    and           xd, ~7
-.no_right:
-    pxor          m1, m1
-    lea         srcq, [srcq+xq]
-    lea         sumq, [sumq+xq*2-2]
-    lea       sumsqq, [sumsqq+xq*4-4]
-    neg           xq
-    mov           wq, xq
-%if ARCH_X86_64
-    lea          r10, [pb_right_ext_mask+16]
-%endif
-.loop_y:
-    mov           xq, wq
-
-    ; load left
-    test       edgeb, 1                             ; have_left
-    jz .no_left
-    test       leftq, leftq
-    jz .load_left_from_main
-    movd          m0, [leftq]
-    pslldq        m0, 12
-    add        leftq, 4
-    jmp .expand_x
-.no_left:
-    movd          m0, [srcq+xq]
-    pshufb        m0, [PIC_sym(pb_0)]
-    jmp .expand_x
-.load_left_from_main:
-    movd          m0, [srcq+xq-2]
-    pslldq        m0, 14
-.expand_x:
-    punpckhbw    xm0, xm1
-
-    ; when we reach this, m0 contains left two px in highest words
-    cmp           xd, -8
-    jle .loop_x
-.partial_load_and_extend:
-    movd          m3, [srcq-4]
-    pshufb        m3, [PIC_sym(pb_3)]
-    movq          m2, [srcq+xq]
-    punpcklbw     m2, m1
-    punpcklbw     m3, m1
-%if ARCH_X86_64
-    movu          m4, [r10+xq*2]
-%else
-    movu          m4, [PIC_sym(pb_right_ext_mask+16)+xd*2]
-%endif
-    pand          m2, m4
-    pandn         m4, m3
-    por           m2, m4
-    jmp .loop_x_noload
-.right_extend:
-    pshufb        m2, m0, [PIC_sym(pb_14_15)]
-    jmp .loop_x_noload
-
-.loop_x:
-    movq          m2, [srcq+xq]
-    punpcklbw     m2, m1
-.loop_x_noload:
-    palignr       m3, m2, m0, 12
-    palignr       m4, m2, m0, 14
-
-    punpcklwd     m5, m3, m2
-    punpckhwd     m6, m3, m2
-    paddw         m3, m4
-    punpcklwd     m7, m4, m1
-    punpckhwd     m4, m1
-    pmaddwd       m5, m5
-    pmaddwd       m6, m6
-    pmaddwd       m7, m7
-    pmaddwd       m4, m4
-    paddd         m5, m7
-    paddd         m6, m4
-    paddw         m3, m2
-    movu [sumq+xq*2], m3
-    movu [sumsqq+xq*4+ 0], m5
-    movu [sumsqq+xq*4+16], m6
-
-    mova          m0, m2
-    add           xq, 8
-
-    ; if x <= -8 we can reload more pixels
-    ; else if x < 0 we reload and extend (this implies have_right=0)
-    ; else if x < xlimd we extend from previous load (this implies have_right=0)
-    ; else we are done
-
-    cmp           xd, -8
-    jle .loop_x
-    test          xd, xd
-    jl .partial_load_and_extend
-    cmp           xd, xlimd
-    jl .right_extend
-
-    add       sumsqq, (384+16)*4
-    add         sumq, (384+16)*2
-    add         srcq, strideq
-    dec           hd
-    jg .loop_y
-    RET
-
-%if ARCH_X86_64
-cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim
-    movifnidn  edged, edgem
-%else
-cglobal sgr_box3_v, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y
- %define sumsq_baseq dword [esp+0]
- %define sum_baseq   dword [esp+4]
- %define ylimd       dword [esp+8]
- %define m8          [esp+12]
-    mov        edged, r4m
-    mov           hd, r3m
-%endif
-    mov           xq, -2
-%if ARCH_X86_64
-    mov        ylimd, edged
-    and        ylimd, 8                             ; have_bottom
-    shr        ylimd, 2
-    sub        ylimd, 2                             ; -2 if have_bottom=0, else 0
-    mov  sumsq_baseq, sumsqq
-    mov    sum_baseq, sumq
-.loop_x:
-    mov       sumsqq, sumsq_baseq
-    mov         sumq, sum_baseq
-    lea           yd, [hq+ylimq+2]
-%else
-    mov           yd, edged
-    and           yd, 8                             ; have_bottom
-    shr           yd, 2
-    sub           yd, 2                             ; -2 if have_bottom=0, else 0
-    mov  sumsq_baseq, sumsqq
-    mov    sum_baseq, sumq
-    mov        ylimd, yd
-.loop_x:
-    mov       sumsqd, sumsq_baseq
-    mov         sumd, sum_baseq
-    lea           yd, [hq+2]
-    add           yd, ylimd
-%endif
-    lea       sumsqq, [sumsqq+xq*4+4-(384+16)*4]
-    lea         sumq, [sumq+xq*2+2-(384+16)*2]
-    test       edgeb, 4                             ; have_top
-    jnz .load_top
-    movu          m0, [sumsqq+(384+16)*4*1]
-    movu          m1, [sumsqq+(384+16)*4*1+16]
-    mova          m2, m0
-    mova          m3, m1
-    mova          m4, m0
-    mova          m5, m1
-    movu          m6, [sumq+(384+16)*2*1]
-    mova          m7, m6
-    mova          m8, m6
-    jmp .loop_y_noload
-.load_top:
-    movu          m0, [sumsqq-(384+16)*4*1]      ; l2sq [left]
-    movu          m1, [sumsqq-(384+16)*4*1+16]   ; l2sq [right]
-    movu          m2, [sumsqq-(384+16)*4*0]      ; l1sq [left]
-    movu          m3, [sumsqq-(384+16)*4*0+16]   ; l1sq [right]
-    movu          m6, [sumq-(384+16)*2*1]        ; l2
-    movu          m7, [sumq-(384+16)*2*0]        ; l1
-.loop_y:
-%if ARCH_X86_64
-    movu          m8, [sumq+(384+16)*2*1]        ; l0
-%else
-    movu          m4, [sumq+(384+16)*2*1]        ; l0
-    mova          m8, m4
-%endif
-    movu          m4, [sumsqq+(384+16)*4*1]      ; l0sq [left]
-    movu          m5, [sumsqq+(384+16)*4*1+16]   ; l0sq [right]
-.loop_y_noload:
-    paddd         m0, m2
-    paddd         m1, m3
-    paddw         m6, m7
-    paddd         m0, m4
-    paddd         m1, m5
-    paddw         m6, m8
-    movu [sumsqq+ 0], m0
-    movu [sumsqq+16], m1
-    movu      [sumq], m6
-
-    ; shift position down by one
-    mova          m0, m2
-    mova          m1, m3
-    mova          m2, m4
-    mova          m3, m5
-    mova          m6, m7
-    mova          m7, m8
-    add       sumsqq, (384+16)*4
-    add         sumq, (384+16)*2
-    dec           yd
-    jg .loop_y
-    cmp           yd, ylimd
-    jg .loop_y_noload
-    add           xd, 8
-    cmp           xd, wd
-    jl .loop_x
-    RET
-
-cglobal sgr_calc_ab1, 4, 7, 12, a, b, w, h, s
-    movifnidn     sd, sm
-    sub           aq, (384+16-1)*4
-    sub           bq, (384+16-1)*2
-    add           hd, 2
-%if ARCH_X86_64
-    LEA           r5, sgr_x_by_x-0xF03
-%else
-    SETUP_PIC r5, 0
-%endif
-    movd          m6, sd
-    pshuflw       m6, m6, q0000
-    punpcklqdq    m6, m6
-    pxor          m7, m7
-    DEFINE_ARGS a, b, w, h, x
-%if ARCH_X86_64
-    mova          m8, [pd_0xF00801C7]
-    mova          m9, [pw_256]
-    psrld        m10, m9, 13                        ; pd_2048
-    mova         m11, [pb_unpcklwdw]
-%else
- %define m8     [PIC_sym(pd_0xF00801C7)]
- %define m9     [PIC_sym(pw_256)]
- %define m10    [PIC_sym(pd_2048)]
- %define m11    [PIC_sym(pb_unpcklwdw)]
-%endif
-.loop_y:
-    mov           xq, -2
-.loop_x:
-    movq          m0, [bq+xq*2]
-    movq          m1, [bq+xq*2+(384+16)*2]
-    punpcklwd     m0, m7
-    punpcklwd     m1, m7
-    movu          m2, [aq+xq*4]
-    movu          m3, [aq+xq*4+(384+16)*4]
-    pslld         m4, m2, 3
-    pslld         m5, m3, 3
-    paddd         m2, m4                            ; aa * 9
-    paddd         m3, m5
-    pmaddwd       m4, m0, m0
-    pmaddwd       m5, m1, m1
-    pmaddwd       m0, m8
-    pmaddwd       m1, m8
-    psubd         m2, m4                            ; p = aa * 9 - bb * bb
-    psubd         m3, m5
-    MULLD         m2, m6
-    MULLD         m3, m6
-    paddusw       m2, m8
-    paddusw       m3, m8
-    psrld         m2, 20                            ; z
-    psrld         m3, 20
-    GATHERDD      m4, m2                            ; xx
-    GATHERDD      m2, m3
-    psrld         m4, 24
-    psrld         m2, 24
-    packssdw      m3, m4, m2
-    pshufb        m4, m11
-    MULLD         m0, m4
-    pshufb        m2, m11
-    MULLD         m1, m2
-    psubw         m5, m9, m3
-    paddd         m0, m10
-    paddd         m1, m10
-    psrld         m0, 12
-    psrld         m1, 12
-    movq   [bq+xq*2], m5
-    psrldq        m5, 8
-    movq [bq+xq*2+(384+16)*2], m5
-    movu   [aq+xq*4], m0
-    movu [aq+xq*4+(384+16)*4], m1
-    add           xd, 4
-    cmp           xd, wd
-    jl .loop_x
-    add           aq, (384+16)*4*2
-    add           bq, (384+16)*2*2
-    sub           hd, 2
-    jg .loop_y
-    RET
-
-%if ARCH_X86_64
-cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \
-                                       tmp_base, src_base, a_base, b_base, x, y
-    movifnidn     wd, wm
-    mov           hd, hm
-    mova         m15, [pw_16]
-    mov    tmp_baseq, tq
-    mov    src_baseq, srcq
-    mov      a_baseq, aq
-    mov      b_baseq, bq
-    xor           xd, xd
-%else
-cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y
- %define tmp_baseq  [esp+8]
- %define src_baseq  [esp+12]
- %define a_baseq    [esp+16]
- %define b_baseq    [esp+20]
- %define wd         [esp+24]
- %define hd         [esp+28]
-    mov    tmp_baseq, tq
-    mov    src_baseq, srcq
-    mov      a_baseq, aq
-    mov      b_baseq, bq
-    mov           wd, xd
-    mov           hd, yd
-    xor           xd, xd
-    SETUP_PIC yd, 1, 1
-    jmp .loop_start
-%endif
-
-.loop_x:
-    mov           tq, tmp_baseq
-    mov         srcq, src_baseq
-    mov           aq, a_baseq
-    mov           bq, b_baseq
-%if ARCH_X86_32
-.loop_start:
-    movu          m0, [bq+xq*2-(384+16)*2-2]
-    movu          m2, [bq+xq*2-(384+16)*2+2]
-    mova          m1, [bq+xq*2-(384+16)*2]          ; b:top
-    paddw         m0, m2                            ; b:tl+tr
-    movu          m2, [bq+xq*2-2]
-    movu          m3, [bq+xq*2+2]
-    paddw         m1, [bq+xq*2]                     ; b:top+ctr
-    paddw         m2, m3                            ; b:l+r
-    mova  [esp+0x80], m0
-    mova  [esp+0x70], m1
-    mova  [esp+0x60], m2
-%endif
-    movu          m0, [aq+xq*4-(384+16)*4-4]
-    movu          m2, [aq+xq*4-(384+16)*4+4]
-    mova          m1, [aq+xq*4-(384+16)*4]          ; a:top [first half]
-    paddd         m0, m2                            ; a:tl+tr [first half]
-    movu          m2, [aq+xq*4-(384+16)*4-4+16]
-    movu          m4, [aq+xq*4-(384+16)*4+4+16]
-    mova          m3, [aq+xq*4-(384+16)*4+16]       ; a:top [second half]
-    paddd         m2, m4                            ; a:tl+tr [second half]
-    movu          m4, [aq+xq*4-4]
-    movu          m5, [aq+xq*4+4]
-    paddd         m1, [aq+xq*4]                     ; a:top+ctr [first half]
-    paddd         m4, m5                            ; a:l+r [first half]
-    movu          m5, [aq+xq*4+16-4]
-    movu          m6, [aq+xq*4+16+4]
-    paddd         m3, [aq+xq*4+16]                  ; a:top+ctr [second half]
-    paddd         m5, m6                            ; a:l+r [second half]
-%if ARCH_X86_64
-    movu          m6, [bq+xq*2-(384+16)*2-2]
-    movu          m8, [bq+xq*2-(384+16)*2+2]
-    mova          m7, [bq+xq*2-(384+16)*2]          ; b:top
-    paddw         m6, m8                            ; b:tl+tr
-    movu          m8, [bq+xq*2-2]
-    movu          m9, [bq+xq*2+2]
-    paddw         m7, [bq+xq*2]                     ; b:top+ctr
-    paddw         m8, m9                            ; b:l+r
-%endif
-
-    lea           tq, [tq+xq*2]
-    lea         srcq, [srcq+xq*1]
-    lea           aq, [aq+xq*4+(384+16)*4]
-    lea           bq, [bq+xq*2+(384+16)*2]
-    mov           yd, hd
-.loop_y:
-%if ARCH_X86_64
-    movu          m9, [bq-2]
-    movu         m10, [bq+2]
-    paddw         m7, [bq]                          ; b:top+ctr+bottom
-    paddw         m9, m10                           ; b:bl+br
-    paddw        m10, m7, m8                        ; b:top+ctr+bottom+l+r
-    paddw         m6, m9                            ; b:tl+tr+bl+br
-    psubw         m7, [bq-(384+16)*2*2]             ; b:ctr+bottom
-    paddw        m10, m6
-    psllw        m10, 2
-    psubw        m10, m6                            ; aa
-    pxor         m14, m14
-    movq         m12, [srcq]
-    punpcklbw    m12, m14
-    punpcklwd     m6, m10, m15
-    punpckhwd    m10, m15
-    punpcklwd    m13, m12, m15
-    punpckhwd    m12, m15
-    pmaddwd       m6, m13                           ; aa*src[x]+256 [first half]
-    pmaddwd      m10, m12                           ; aa*src[x]+256 [second half]
-%else
-    paddd         m1, [aq]                          ; a:top+ctr+bottom [first half]
-    paddd         m3, [aq+16]                       ; a:top+ctr+bottom [second half]
-    mova  [esp+0x50], m1
-    mova  [esp+0x40], m3
-    mova  [esp+0x30], m4
-    movu          m6, [aq-4]
-    movu          m7, [aq+4]
-    paddd         m1, m4                            ; a:top+ctr+bottom+l+r [first half]
-    paddd         m3, m5                            ; a:top+ctr+bottom+l+r [second half]
-    paddd         m6, m7                            ; a:bl+br [first half]
-    movu          m7, [aq+16-4]
-    movu          m4, [aq+16+4]
-    paddd         m7, m4                            ; a:bl+br [second half]
-    paddd         m0, m6                            ; a:tl+tr+bl+br [first half]
-    paddd         m2, m7                            ; a:tl+tr+bl+br [second half]
-    paddd         m1, m0
-    paddd         m3, m2
-    pslld         m1, 2
-    pslld         m3, 2
-    psubd         m1, m0                            ; bb [first half]
-    psubd         m3, m2                            ; bb [second half]
-%endif
-
-%if ARCH_X86_64
-    movu         m11, [aq-4]
-    movu         m12, [aq+4]
-    paddd         m1, [aq]                          ; a:top+ctr+bottom [first half]
-    paddd        m11, m12                           ; a:bl+br [first half]
-    movu         m12, [aq+16-4]
-    movu         m13, [aq+16+4]
-    paddd         m3, [aq+16]                       ; a:top+ctr+bottom [second half]
-    paddd        m12, m13                           ; a:bl+br [second half]
-    paddd        m13, m1, m4                        ; a:top+ctr+bottom+l+r [first half]
-    paddd        m14, m3, m5                        ; a:top+ctr+bottom+l+r [second half]
-    paddd         m0, m11                           ; a:tl+tr+bl+br [first half]
-    paddd         m2, m12                           ; a:tl+tr+bl+br [second half]
-    paddd        m13, m0
-    paddd        m14, m2
-    pslld        m13, 2
-    pslld        m14, 2
-    psubd        m13, m0                            ; bb [first half]
-    psubd        m14, m2                            ; bb [second half]
-    psubd         m1, [aq-(384+16)*4*2]             ; a:ctr+bottom [first half]
-    psubd         m3, [aq-(384+16)*4*2+16]          ; a:ctr+bottom [second half]
-%else
-    mova          m4, [esp+0x80]
-    mova  [esp+0x80], m5
-    mova          m5, [esp+0x70]
-    mova  [esp+0x70], m6
-    mova          m6, [esp+0x60]
-    mova  [esp+0x60], m7
-    mova  [esp+0x20], m1
-    movu          m7, [bq-2]
-    movu          m1, [bq+2]
-    paddw         m5, [bq]                          ; b:top+ctr+bottom
-    paddw         m7, m1
-    paddw         m1, m5, m6                        ; b:top+ctr+bottom+l+r
-    paddw         m4, m7                            ; b:tl+tr+bl+br
-    psubw         m5, [bq-(384+16)*2*2]             ; b:ctr+bottom
-    paddw         m1, m4
-    psllw         m1, 2
-    psubw         m1, m4                            ; aa
-    movq          m0, [srcq]
-    XCHG_PIC_REG
-    punpcklbw     m0, [PIC_sym(pb_right_ext_mask)+16]
-    punpcklwd     m4, m1, [PIC_sym(pw_16)]
-    punpckhwd     m1, [PIC_sym(pw_16)]
-    punpcklwd     m2, m0, [PIC_sym(pw_16)]
-    punpckhwd     m0, [PIC_sym(pw_16)]
-    XCHG_PIC_REG
-    pmaddwd       m4, m2                            ; aa*src[x]+256 [first half]
-    pmaddwd       m1, m0                            ; aa*src[x]+256 [second half]
-%endif
-
-%if ARCH_X86_64
-    paddd         m6, m13
-    paddd        m10, m14
-    psrad         m6, 9
-    psrad        m10, 9
-    packssdw      m6, m10
-    mova        [tq], m6
-%else
-    paddd         m4, [esp+0x20]
-    paddd         m1, m3
-    psrad         m4, 9
-    psrad         m1, 9
-    packssdw      m4, m1
-    mova        [tq], m4
-%endif
-
-    ; shift to next row
-%if ARCH_X86_64
-    mova          m0, m4
-    mova          m2, m5
-    mova          m4, m11
-    mova          m5, m12
-    mova          m6, m8
-    mova          m8, m9
-%else
-    mova          m1, [esp+0x50]
-    mova          m3, [esp+0x40]
-    mova          m0, [esp+0x30]
-    mova          m2, [esp+0x80]
-    mova          m4, [esp+0x70]
-    mova  [esp+0x70], m5
-    mova          m5, [esp+0x60]
-    mova  [esp+0x80], m6
-    mova  [esp+0x60], m7
-    psubd         m1, [aq-(384+16)*4*2]             ; a:ctr+bottom [first half]
-    psubd         m3, [aq-(384+16)*4*2+16]          ; a:ctr+bottom [second half]
-%endif
-
-    add         srcq, strideq
-    add           aq, (384+16)*4
-    add           bq, (384+16)*2
-    add           tq, 384*2
-    dec           yd
-    jg .loop_y
-    add           xd, 8
-    cmp           xd, wd
-    jl .loop_x
-    RET
-
-cglobal sgr_weighted1, 4, 7, 8, dst, stride, t, w, h, wt
-    movifnidn     hd, hm
-%if ARCH_X86_32
-    SETUP_PIC r6, 0
-%endif
-    movd          m0, wtm
-    pshufb        m0, [PIC_sym(pb_0_1)]
-    psllw         m0, 4
-    pxor          m7, m7
-    DEFINE_ARGS dst, stride, t, w, h, idx
-.loop_y:
-    xor         idxd, idxd
-.loop_x:
-    mova          m1, [tq+idxq*2+ 0]
-    mova          m4, [tq+idxq*2+16]
-    mova          m5, [dstq+idxq]
-    punpcklbw     m2, m5, m7
-    punpckhbw     m5, m7
-    psllw         m3, m2, 4
-    psllw         m6, m5, 4
-    psubw         m1, m3
-    psubw         m4, m6
-    pmulhrsw      m1, m0
-    pmulhrsw      m4, m0
-    paddw         m1, m2
-    paddw         m4, m5
-    packuswb      m1, m4
-    mova [dstq+idxq], m1
-    add         idxd, 16
-    cmp         idxd, wd
-    jl .loop_x
-    add         dstq, strideq
-    add           tq, 384 * 2
-    dec           hd
-    jg .loop_y
-    RET
-
-%if ARCH_X86_64
-cglobal sgr_box5_h, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xlim
-    mov        edged, edgem
-    movifnidn     wd, wm
-    mov           hd, hm
-    mova         m10, [pb_0]
-    mova         m11, [pb_0_1]
-%else
-cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
- %define edgeb      byte edgem
- %define wd         xd
- %define wq         wd
- %define wm         r5m
- %define strideq    r4m
-    SUB          esp, 8
-    SETUP_PIC sumsqd, 1, 1
-
- %define m10    [PIC_sym(pb_0)]
- %define m11    [PIC_sym(pb_0_1)]
-%endif
-
-    test       edgeb, 2                             ; have_right
-    jz .no_right
-    xor        xlimd, xlimd
-    add           wd, 2
-    add           wd, 15
-    and           wd, ~15
-    jmp .right_done
-.no_right:
-    mov        xlimd, 3
-    dec           wd
-.right_done:
-    pxor          m1, m1
-    lea         srcq, [srcq+wq+1]
-    lea         sumq, [sumq+wq*2-2]
-    lea       sumsqq, [sumsqq+wq*4-4]
-    neg           wq
-%if ARCH_X86_64
-    lea          r10, [pb_right_ext_mask+16]
-%else
-    mov           wm, xd
- %define wq wm
-%endif
-
-.loop_y:
-    mov           xq, wq
-    ; load left
-    test       edgeb, 1                             ; have_left
-    jz .no_left
-    test       leftq, leftq
-    jz .load_left_from_main
-    movd          m0, [leftq]
-    movd          m2, [srcq+xq-1]
-    pslldq        m2, 4
-    por           m0, m2
-    pslldq        m0, 11
-    add        leftq, 4
-    jmp .expand_x
-.no_left:
-    movd          m0, [srcq+xq-1]
-    XCHG_PIC_REG
-    pshufb        m0, m10
-    XCHG_PIC_REG
-    jmp .expand_x
-.load_left_from_main:
-    movd          m0, [srcq+xq-4]
-    pslldq        m0, 12
-.expand_x:
-    punpckhbw     m0, m1
-
-    ; when we reach this, m0 contains left two px in highest words
-    cmp           xd, -8
-    jle .loop_x
-    test          xd, xd
-    jge .right_extend
-.partial_load_and_extend:
-    XCHG_PIC_REG
-    movd          m3, [srcq-1]
-    movq          m2, [srcq+xq]
-    pshufb        m3, m10
-    punpcklbw     m3, m1
-    punpcklbw     m2, m1
-%if ARCH_X86_64
-    movu          m4, [r10+xq*2]
-%else
-    movu          m4, [PIC_sym(pb_right_ext_mask+16)+xd*2]
-    XCHG_PIC_REG
-%endif
-    pand          m2, m4
-    pandn         m4, m3
-    por           m2, m4
-    jmp .loop_x_noload
-.right_extend:
-    psrldq        m2, m0, 14
-    XCHG_PIC_REG
-    pshufb        m2, m11
-    XCHG_PIC_REG
-    jmp .loop_x_noload
-
-.loop_x:
-    movq          m2, [srcq+xq]
-    punpcklbw     m2, m1
-.loop_x_noload:
-    palignr       m3, m2, m0, 8
-    palignr       m4, m2, m0, 10
-    palignr       m5, m2, m0, 12
-    palignr       m6, m2, m0, 14
-
-%if ARCH_X86_64
-    paddw         m0, m3, m2
-    punpcklwd     m7, m3, m2
-    punpckhwd     m3, m2
-    paddw         m0, m4
-    punpcklwd     m8, m4, m5
-    punpckhwd     m4, m5
-    paddw         m0, m5
-    punpcklwd     m9, m6, m1
-    punpckhwd     m5, m6, m1
-    paddw         m0, m6
-    pmaddwd       m7, m7
-    pmaddwd       m3, m3
-    pmaddwd       m8, m8
-    pmaddwd       m4, m4
-    pmaddwd       m9, m9
-    pmaddwd       m5, m5
-    paddd         m7, m8
-    paddd         m3, m4
-    paddd         m7, m9
-    paddd         m3, m5
-    movu [sumq+xq*2], m0
-    movu [sumsqq+xq*4+ 0], m7
-    movu [sumsqq+xq*4+16], m3
-%else
-    paddw         m0, m3, m2
-    paddw         m0, m4
-    paddw         m0, m5
-    paddw         m0, m6
-    movu [sumq+xq*2], m0
-    punpcklwd     m7, m3, m2
-    punpckhwd     m3, m2
-    punpcklwd     m0, m4, m5
-    punpckhwd     m4, m5
-    punpckhwd     m5, m6, m1
-    pmaddwd       m7, m7
-    pmaddwd       m3, m3
-    pmaddwd       m0, m0
-    pmaddwd       m4, m4
-    pmaddwd       m5, m5
-    paddd         m7, m0
-    paddd         m3, m4
-    paddd         m3, m5
-    punpcklwd     m0, m6, m1
-    pmaddwd       m0, m0
-    paddd         m7, m0
-    movu [sumsqq+xq*4+ 0], m7
-    movu [sumsqq+xq*4+16], m3
-%endif
-
-    mova          m0, m2
-    add           xq, 8
-
-    ; if x <= -8 we can reload more pixels
-    ; else if x < 0 we reload and extend (this implies have_right=0)
-    ; else if x < xlimd we extend from previous load (this implies have_right=0)
-    ; else we are done
-
-    cmp           xd, -8
-    jle .loop_x
-    test          xd, xd
-    jl .partial_load_and_extend
-    cmp           xd, xlimd
-    jl .right_extend
-
-    add         srcq, strideq
-    add       sumsqq, (384+16)*4
-    add         sumq, (384+16)*2
-    dec           hd
-    jg .loop_y
-%if ARCH_X86_32
-    ADD          esp, 8
-%endif
-    RET
-
-%if ARCH_X86_64
-cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
-    movifnidn  edged, edgem
-    mov        ylimd, edged
-%else
-cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr
- %define wm     [esp+0]
- %define hm     [esp+4]
- %define edgem  [esp+8]
-    mov           wm, xd
-    mov           hm, yd
-    mov        edgem, ylimd
-%endif
-
-    and        ylimd, 8                             ; have_bottom
-    shr        ylimd, 2
-    sub        ylimd, 3                             ; -3 if have_bottom=0, else -1
-    mov           xq, -2
-%if ARCH_X86_64
-.loop_x:
-    lea           yd, [hd+ylimd+2]
-    lea   sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
-    lea     sum_ptrq, [  sumq+xq*2+2-(384+16)*2]
-    test       edgeb, 4                             ; have_top
-    jnz .load_top
-    movu          m0, [sumsq_ptrq+(384+16)*4*1]
-    movu          m1, [sumsq_ptrq+(384+16)*4*1+16]
-    mova          m2, m0
-    mova          m3, m1
-    mova          m4, m0
-    mova          m5, m1
-    mova          m6, m0
-    mova          m7, m1
-    movu         m10, [sum_ptrq+(384+16)*2*1]
-    mova         m11, m10
-    mova         m12, m10
-    mova         m13, m10
-    jmp .loop_y_second_load
-.load_top:
-    movu          m0, [sumsq_ptrq-(384+16)*4*1]      ; l3/4sq [left]
-    movu          m1, [sumsq_ptrq-(384+16)*4*1+16]   ; l3/4sq [right]
-    movu          m4, [sumsq_ptrq-(384+16)*4*0]      ; l2sq [left]
-    movu          m5, [sumsq_ptrq-(384+16)*4*0+16]   ; l2sq [right]
-    mova          m2, m0
-    mova          m3, m1
-    movu         m10, [sum_ptrq-(384+16)*2*1]        ; l3/4
-    movu         m12, [sum_ptrq-(384+16)*2*0]        ; l2
-    mova         m11, m10
-.loop_y:
-    movu          m6, [sumsq_ptrq+(384+16)*4*1]      ; l1sq [left]
-    movu          m7, [sumsq_ptrq+(384+16)*4*1+16]   ; l1sq [right]
-    movu         m13, [sum_ptrq+(384+16)*2*1]        ; l1
-.loop_y_second_load:
-    test          yd, yd
-    jle .emulate_second_load
-    movu          m8, [sumsq_ptrq+(384+16)*4*2]      ; l0sq [left]
-    movu          m9, [sumsq_ptrq+(384+16)*4*2+16]   ; l0sq [right]
-    movu         m14, [sum_ptrq+(384+16)*2*2]        ; l0
-.loop_y_noload:
-    paddd         m0, m2
-    paddd         m1, m3
-    paddw        m10, m11
-    paddd         m0, m4
-    paddd         m1, m5
-    paddw        m10, m12
-    paddd         m0, m6
-    paddd         m1, m7
-    paddw        m10, m13
-    paddd         m0, m8
-    paddd         m1, m9
-    paddw        m10, m14
-    movu [sumsq_ptrq+ 0], m0
-    movu [sumsq_ptrq+16], m1
-    movu  [sum_ptrq], m10
-
-    ; shift position down by one
-    mova          m0, m4
-    mova          m1, m5
-    mova          m2, m6
-    mova          m3, m7
-    mova          m4, m8
-    mova          m5, m9
-    mova         m10, m12
-    mova         m11, m13
-    mova         m12, m14
-    add   sumsq_ptrq, (384+16)*4*2
-    add     sum_ptrq, (384+16)*2*2
-    sub           yd, 2
-    jge .loop_y
-    ; l1 = l0
-    mova          m6, m8
-    mova          m7, m9
-    mova         m13, m14
-    cmp           yd, ylimd
-    jg .loop_y_noload
-    add           xd, 8
-    cmp           xd, wd
-    jl .loop_x
-    RET
-.emulate_second_load:
-    mova          m8, m6
-    mova          m9, m7
-    mova         m14, m13
-    jmp .loop_y_noload
-%else
-.sumsq_loop_x:
-    lea           yd, [ylimd+2]
-    add           yd, hm
-    lea   sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
-    test  byte edgem, 4                             ; have_top
-    jnz .sumsq_load_top
-    movu          m0, [sumsq_ptrq+(384+16)*4*1]
-    movu          m1, [sumsq_ptrq+(384+16)*4*1+16]
-    mova          m4, m0
-    mova          m5, m1
-    mova          m6, m0
-    mova          m7, m1
-    mova  [esp+0x1c], m0
-    mova  [esp+0x0c], m1
-    jmp .sumsq_loop_y_second_load
-.sumsq_load_top:
-    movu          m0, [sumsq_ptrq-(384+16)*4*1]      ; l3/4sq [left]
-    movu          m1, [sumsq_ptrq-(384+16)*4*1+16]   ; l3/4sq [right]
-    movu          m4, [sumsq_ptrq-(384+16)*4*0]      ; l2sq [left]
-    movu          m5, [sumsq_ptrq-(384+16)*4*0+16]   ; l2sq [right]
-    mova  [esp+0x1c], m0
-    mova  [esp+0x0c], m1
-.sumsq_loop_y:
-    movu          m6, [sumsq_ptrq+(384+16)*4*1]      ; l1sq [left]
-    movu          m7, [sumsq_ptrq+(384+16)*4*1+16]   ; l1sq [right]
-.sumsq_loop_y_second_load:
-    test          yd, yd
-    jle .sumsq_emulate_second_load
-    movu          m2, [sumsq_ptrq+(384+16)*4*2]      ; l0sq [left]
-    movu          m3, [sumsq_ptrq+(384+16)*4*2+16]   ; l0sq [right]
-.sumsq_loop_y_noload:
-    paddd         m0, [esp+0x1c]
-    paddd         m1, [esp+0x0c]
-    paddd         m0, m4
-    paddd         m1, m5
-    paddd         m0, m6
-    paddd         m1, m7
-    paddd         m0, m2
-    paddd         m1, m3
-    movu [sumsq_ptrq+ 0], m0
-    movu [sumsq_ptrq+16], m1
-
-    ; shift position down by one
-    mova          m0, m4
-    mova          m1, m5
-    mova          m4, m2
-    mova          m5, m3
-    mova  [esp+0x1c], m6
-    mova  [esp+0x0c], m7
-    add   sumsq_ptrq, (384+16)*4*2
-    sub           yd, 2
-    jge .sumsq_loop_y
-    ; l1 = l0
-    mova          m6, m2
-    mova          m7, m3
-    cmp           yd, ylimd
-    jg .sumsq_loop_y_noload
-    add           xd, 8
-    cmp           xd, wm
-    jl .sumsq_loop_x
-
-    mov           xd, -2
-.sum_loop_x:
-    lea           yd, [ylimd+2]
-    add           yd, hm
-    lea     sum_ptrq, [sumq+xq*2+2-(384+16)*2]
-    test  byte edgem, 4                             ; have_top
-    jnz .sum_load_top
-    movu          m0, [sum_ptrq+(384+16)*2*1]
-    mova          m1, m0
-    mova          m2, m0
-    mova          m3, m0
-    jmp .sum_loop_y_second_load
-.sum_load_top:
-    movu          m0, [sum_ptrq-(384+16)*2*1]        ; l3/4
-    movu          m2, [sum_ptrq-(384+16)*2*0]        ; l2
-    mova          m1, m0
-.sum_loop_y:
-    movu          m3, [sum_ptrq+(384+16)*2*1]        ; l1
-.sum_loop_y_second_load:
-    test          yd, yd
-    jle .sum_emulate_second_load
-    movu          m4, [sum_ptrq+(384+16)*2*2]        ; l0
-.sum_loop_y_noload:
-    paddw         m0, m1
-    paddw         m0, m2
-    paddw         m0, m3
-    paddw         m0, m4
-    movu  [sum_ptrq], m0
-
-    ; shift position down by one
-    mova          m0, m2
-    mova          m1, m3
-    mova          m2, m4
-    add     sum_ptrq, (384+16)*2*2
-    sub           yd, 2
-    jge .sum_loop_y
-    ; l1 = l0
-    mova          m3, m4
-    cmp           yd, ylimd
-    jg .sum_loop_y_noload
-    add           xd, 8
-    cmp           xd, wm
-    jl .sum_loop_x
-    RET
-.sumsq_emulate_second_load:
-    mova          m2, m6
-    mova          m3, m7
-    jmp .sumsq_loop_y_noload
-.sum_emulate_second_load:
-    mova          m4, m3
-    jmp .sum_loop_y_noload
-%endif
-
-cglobal sgr_calc_ab2, 4, 7, 11, a, b, w, h, s
-    movifnidn     sd, sm
-    sub           aq, (384+16-1)*4
-    sub           bq, (384+16-1)*2
-    add           hd, 2
-%if ARCH_X86_64
-    LEA           r5, sgr_x_by_x-0xF03
-%else
-    SETUP_PIC r5, 0
-%endif
-    movd          m6, sd
-    pshuflw       m6, m6, q0000
-    punpcklqdq    m6, m6
-    pxor          m7, m7
-    DEFINE_ARGS a, b, w, h, x
-%if ARCH_X86_64
-    mova          m8, [pd_0xF0080029]
-    mova          m9, [pw_256]
-    psrld        m10, m9, 15                        ; pd_512
-%else
- %define m8     [PIC_sym(pd_0xF0080029)]
- %define m9     [PIC_sym(pw_256)]
- %define m10    [PIC_sym(pd_512)]
-%endif
-.loop_y:
-    mov           xq, -2
-.loop_x:
-    movq          m0, [bq+xq*2+0]
-    movq          m1, [bq+xq*2+8]
-    punpcklwd     m0, m7
-    punpcklwd     m1, m7
-    movu          m2, [aq+xq*4+ 0]
-    movu          m3, [aq+xq*4+16]
-    pslld         m4, m2, 3                         ; aa * 8
-    pslld         m5, m3, 3
-    paddd         m2, m4                            ; aa * 9
-    paddd         m3, m5
-    paddd         m4, m4                            ; aa * 16
-    paddd         m5, m5
-    paddd         m2, m4                            ; aa * 25
-    paddd         m3, m5
-    pmaddwd       m4, m0, m0
-    pmaddwd       m5, m1, m1
-    psubd         m2, m4                            ; p = aa * 25 - bb * bb
-    psubd         m3, m5
-    MULLD         m2, m6
-    MULLD         m3, m6
-    paddusw       m2, m8
-    paddusw       m3, m8
-    psrld         m2, 20                            ; z
-    psrld         m3, 20
-    GATHERDD      m4, m2                            ; xx
-    GATHERDD      m2, m3
-    psrld         m4, 24
-    psrld         m2, 24
-    packssdw      m3, m4, m2
-    pmullw        m4, m8
-    pmullw        m2, m8
-    psubw         m5, m9, m3
-    pmaddwd       m0, m4
-    pmaddwd       m1, m2
-    paddd         m0, m10
-    paddd         m1, m10
-    psrld         m0, 10
-    psrld         m1, 10
-    movu   [bq+xq*2], m5
-    movu [aq+xq*4+ 0], m0
-    movu [aq+xq*4+16], m1
-    add           xd, 8
-    cmp           xd, wd
-    jl .loop_x
-    add           aq, (384+16)*4*2
-    add           bq, (384+16)*2*2
-    sub           hd, 2
-    jg .loop_y
-    RET
-
-%if ARCH_X86_64
-cglobal sgr_finish_filter2, 5, 13, 14, t, src, stride, a, b, w, h, \
-                                       tmp_base, src_base, a_base, b_base, x, y
-    movifnidn     wd, wm
-    mov           hd, hm
-    mov    tmp_baseq, tq
-    mov    src_baseq, srcq
-    mov      a_baseq, aq
-    mov      b_baseq, bq
-    mova          m9, [pw_5_6]
-    mova         m12, [pw_256]
-    psrlw        m10, m12, 8                    ; pw_1
-    psrlw        m11, m12, 1                    ; pw_128
-    pxor         m13, m13
-%else
-cglobal sgr_finish_filter2, 6, 7, 8, t, src, stride, a, b, x, y
- %define tmp_baseq  r0m
- %define src_baseq  r1m
- %define a_baseq    r3m
- %define b_baseq    r4m
- %define wd         r5m
- %define hd         r6m
-
-    SUB          esp, 8
-    SETUP_PIC yd
-
- %define m8     m5
- %define m9     [PIC_sym(pw_5_6)]
- %define m10    [PIC_sym(pw_1)]
- %define m11    [PIC_sym(pw_128)]
- %define m12    [PIC_sym(pw_256)]
- %define m13    m0
-%endif
-    xor           xd, xd
-.loop_x:
-    mov           tq, tmp_baseq
-    mov         srcq, src_baseq
-    mov           aq, a_baseq
-    mov           bq, b_baseq
-    movu          m0, [aq+xq*4-(384+16)*4-4]
-    mova          m1, [aq+xq*4-(384+16)*4]
-    movu          m2, [aq+xq*4-(384+16)*4+4]
-    movu          m3, [aq+xq*4-(384+16)*4-4+16]
-    mova          m4, [aq+xq*4-(384+16)*4+16]
-    movu          m5, [aq+xq*4-(384+16)*4+4+16]
-    paddd         m0, m2
-    paddd         m3, m5
-    paddd         m0, m1
-    paddd         m3, m4
-    pslld         m2, m0, 2
-    pslld         m5, m3, 2
-    paddd         m2, m0
-    paddd         m5, m3
-    paddd         m0, m2, m1                    ; prev_odd_b [first half]
-    paddd         m1, m5, m4                    ; prev_odd_b [second half]
-    movu          m3, [bq+xq*2-(384+16)*2-2]
-    mova          m4, [bq+xq*2-(384+16)*2]
-    movu          m5, [bq+xq*2-(384+16)*2+2]
-    paddw         m3, m5
-    punpcklwd     m5, m3, m4
-    punpckhwd     m3, m4
-    pmaddwd       m5, m9
-    pmaddwd       m3, m9
-    mova          m2, m5
-    packssdw      m2, m3                        ; prev_odd_a
-    lea           tq, [tq+xq*2]
-    lea         srcq, [srcq+xq*1]
-    lea           aq, [aq+xq*4+(384+16)*4]
-    lea           bq, [bq+xq*2+(384+16)*2]
-%if ARCH_X86_32
-    mov        [esp], PIC_reg
-%endif
-    mov           yd, hd
-    XCHG_PIC_REG
-.loop_y:
-    movu          m3, [aq-4]
-    mova          m4, [aq]
-    movu          m5, [aq+4]
-    paddd         m3, m5
-    paddd         m3, m4
-    pslld         m5, m3, 2
-    paddd         m5, m3
-    paddd         m5, m4                        ; cur_odd_b [first half]
-    movu          m3, [aq+16-4]
-    mova          m6, [aq+16]
-    movu          m7, [aq+16+4]
-    paddd         m3, m7
-    paddd         m3, m6
-    pslld         m7, m3, 2
-    paddd         m7, m3
-    paddd         m4, m7, m6                    ; cur_odd_b [second half]
-    movu          m3, [bq-2]
-    mova          m6, [bq]
-    movu          m7, [bq+2]
-    paddw         m3, m7
-    punpcklwd     m7, m3, m6
-    punpckhwd     m3, m6
-    pmaddwd       m7, m9
-    pmaddwd       m3, m9
-    packssdw      m6, m7, m3                    ; cur_odd_a
-
-    paddd         m0, m5                        ; cur_even_b [first half]
-    paddd         m1, m4                        ; cur_even_b [second half]
-    paddw         m2, m6                        ; cur_even_a
-
-    movq          m3, [srcq]
-%if ARCH_X86_64
-    punpcklbw     m3, m13
-%else
-    mova        [td], m5
-    pxor          m7, m7
-    punpcklbw     m3, m7
-%endif
-    punpcklwd     m7, m3, m10
-    punpckhwd     m3, m10
-    punpcklwd     m8, m2, m12
-    punpckhwd     m2, m12
-    pmaddwd       m7, m8
-    pmaddwd       m3, m2
-    paddd         m7, m0
-    paddd         m3, m1
-    psrad         m7, 9
-    psrad         m3, 9
-
-%if ARCH_X86_32
-    pxor         m13, m13
-%endif
-    movq          m8, [srcq+strideq]
-    punpcklbw     m8, m13
-    punpcklwd     m0, m8, m10
-    punpckhwd     m8, m10
-    punpcklwd     m1, m6, m11
-    punpckhwd     m2, m6, m11
-    pmaddwd       m0, m1
-    pmaddwd       m8, m2
-%if ARCH_X86_64
-    paddd         m0, m5
-%else
-    paddd         m0, [td]
-%endif
-    paddd         m8, m4
-    psrad         m0, 8
-    psrad         m8, 8
-
-    packssdw      m7, m3
-    packssdw      m0, m8
-%if ARCH_X86_32
-    mova          m5, [td]
-%endif
-    mova [tq+384*2*0], m7
-    mova [tq+384*2*1], m0
-
-    mova          m0, m5
-    mova          m1, m4
-    mova          m2, m6
-    add           aq, (384+16)*4*2
-    add           bq, (384+16)*2*2
-    add           tq, 384*2*2
-    lea         srcq, [srcq+strideq*2]
-%if ARCH_X86_64
-    sub           yd, 2
-%else
-    sub dword [esp+4], 2
-%endif
-    jg .loop_y
-    add           xd, 8
-    cmp           xd, wd
-    jl .loop_x
-%if ARCH_X86_32
-    ADD          esp, 8
-%endif
-    RET
-
-cglobal sgr_weighted2, 4, 7, 12, dst, stride, t1, t2, w, h, wt
-    movifnidn     wd, wm
-    movd          m0, wtm
-%if ARCH_X86_64
-    movifnidn     hd, hm
-    mova         m10, [pd_1024]
-    pxor         m11, m11
-%else
-    SETUP_PIC     hd, 0
- %define m10    [PIC_sym(pd_1024)]
- %define m11    m7
-%endif
-    pshufd        m0, m0, 0
-    DEFINE_ARGS dst, stride, t1, t2, w, h, idx
-%if ARCH_X86_32
- %define hd     hmp
-%endif
-
-.loop_y:
-    xor         idxd, idxd
-.loop_x:
-    mova          m1, [t1q+idxq*2+ 0]
-    mova          m2, [t1q+idxq*2+16]
-    mova          m3, [t2q+idxq*2+ 0]
-    mova          m4, [t2q+idxq*2+16]
-    mova          m6, [dstq+idxq]
-%if ARCH_X86_32
-    pxor          m11, m11
-%endif
-    punpcklbw     m5, m6, m11
-    punpckhbw     m6, m11
-    psllw         m7, m5, 4
-    psubw         m1, m7
-    psubw         m3, m7
-    psllw         m7, m6, 4
-    psubw         m2, m7
-    psubw         m4, m7
-    punpcklwd     m7, m1, m3
-    punpckhwd     m1, m3
-    punpcklwd     m3, m2, m4
-    punpckhwd     m2, m4
-    pmaddwd       m7, m0
-    pmaddwd       m1, m0
-    pmaddwd       m3, m0
-    pmaddwd       m2, m0
-    paddd         m7, m10
-    paddd         m1, m10
-    paddd         m3, m10
-    paddd         m2, m10
-    psrad         m7, 11
-    psrad         m1, 11
-    psrad         m3, 11
-    psrad         m2, 11
-    packssdw      m7, m1
-    packssdw      m3, m2
-    paddw         m7, m5
-    paddw         m3, m6
-    packuswb      m7, m3
-    mova [dstq+idxq], m7
-    add         idxd, 16
-    cmp         idxd, wd
-    jl .loop_x
-    add         dstq, strideq
-    add          t1q, 384 * 2
-    add          t2q, 384 * 2
-    dec           hd
-    jg .loop_y
-    RET
--- a/src/x86/mc_avx2.asm
+++ b/src/x86/mc_avx2.asm
@@ -59,8 +59,8 @@
 subpel_v_shuf4: db  0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
 subpel_s_shuf2: db  0,  1,  2,  3,  0,  1,  2,  3,  8,  9, 10, 11,  8,  9, 10, 11
 subpel_s_shuf8: db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
-bilin_h_shuf4:  db  1,  0,  2,  1,  3,  2,  4,  3,  9,  8, 10,  9, 11, 10, 12, 11
-bilin_h_shuf8:  db  1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7
+bilin_h_shuf4:  db  0,  1,  1,  2,  2,  3,  3,  4,  8,  9,  9, 10, 10, 11, 11, 12
+bilin_h_shuf8:  db  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8
 bilin_v_shuf4:  db  4,  0,  5,  1,  6,  2,  7,  3,  8,  4,  9,  5, 10,  6, 11,  7
 deint_shuf4:    db  0,  4,  1,  5,  2,  6,  3,  7,  4,  8,  5,  9,  6, 10,  7, 11
 blend_shuf:     db  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
@@ -76,6 +76,7 @@
 
 pb_64:   times 4 db 64
 pw_m256: times 2 dw -256
+pw_15:   times 2 dw 15
 pw_32:   times 2 dw 32
 pw_34:   times 2 dw 34
 pw_258:  times 2 dw 258
@@ -201,10 +202,9 @@
 SECTION .text
 
 INIT_XMM avx2
-DECLARE_REG_TMP 4, 6, 7
 cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
     movifnidn          mxyd, r6m ; mx
-    lea                  t2, [put_avx2]
+    lea                  r7, [put_avx2]
     tzcnt                wd, wm
     movifnidn            hd, hm
     test               mxyd, mxyd
@@ -213,35 +213,35 @@
     test               mxyd, mxyd
     jnz .v
 .put:
-    movzx                wd, word [t2+wq*2+table_offset(put,)]
-    add                  wq, t2
+    movzx                wd, word [r7+wq*2+table_offset(put,)]
+    add                  wq, r7
     jmp                  wq
 .put_w2:
-    movzx               t0d, word [srcq+ssq*0]
-    movzx               t1d, word [srcq+ssq*1]
+    movzx               r6d, word [srcq+ssq*0]
+    movzx               r7d, word [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
-    mov        [dstq+dsq*0], t0w
-    mov        [dstq+dsq*1], t1w
+    mov        [dstq+dsq*0], r6w
+    mov        [dstq+dsq*1], r7w
     lea                dstq, [dstq+dsq*2]
     sub                  hd, 2
     jg .put_w2
     RET
 .put_w4:
-    mov                 t0d, [srcq+ssq*0]
-    mov                 t1d, [srcq+ssq*1]
+    mov                 r6d, [srcq+ssq*0]
+    mov                 r7d, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
-    mov        [dstq+dsq*0], t0d
-    mov        [dstq+dsq*1], t1d
+    mov        [dstq+dsq*0], r6d
+    mov        [dstq+dsq*1], r7d
     lea                dstq, [dstq+dsq*2]
     sub                  hd, 2
     jg .put_w4
     RET
 .put_w8:
-    mov                  t0, [srcq+ssq*0]
-    mov                  t1, [srcq+ssq*1]
+    mov                  r6, [srcq+ssq*0]
+    mov                  r7, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
-    mov        [dstq+dsq*0], t0
-    mov        [dstq+dsq*1], t1
+    mov        [dstq+dsq*0], r6
+    mov        [dstq+dsq*1], r7
     lea                dstq, [dstq+dsq*2]
     sub                  hd, 2
     jg .put_w8
@@ -298,17 +298,17 @@
 .h:
     ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
     ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
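     ; e.g. mx = 8, src[x] = 10, src[x + 1] = 20:
     ; (16*10 + 8*(20 - 10) + 8) >> 4 = 248 >> 4 = 15, the midpoint of 10 and 20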
-    imul               mxyd, 0xff01
+    imul               mxyd, 255
     vbroadcasti128       m4, [bilin_h_shuf8]
-    add                mxyd, 16 << 8
+    add                mxyd, 16
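+    ; mx*255 + 16 packs into the byte pair (16 - mx, mx), which pmaddubsw
+    ; pairs with (src[x], src[x + 1]) as arranged by bilin_h_shuf8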
     movd                xm5, mxyd
     mov                mxyd, r7m ; my
     vpbroadcastw         m5, xm5
     test               mxyd, mxyd
     jnz .hv
-    movzx                wd, word [t2+wq*2+table_offset(put, _bilin_h)]
+    movzx                wd, word [r7+wq*2+table_offset(put, _bilin_h)]
     vpbroadcastd         m3, [pw_2048]
-    add                  wq, t2
+    add                  wq, r7
     jmp                  wq
 .h_w2:
     movd                xm0, [srcq+ssq*0]
@@ -419,10 +419,10 @@
     jg .h_w64
     RET
 .h_w128:
-    mov                  t1, -32*3
+    mov                  r6, -32*3
 .h_w128_loop:
-    movu                 m0, [srcq+t1+32*3+8*0]
-    movu                 m1, [srcq+t1+32*3+8*1]
+    movu                 m0, [srcq+r6+32*3+8*0]
+    movu                 m1, [srcq+r6+32*3+8*1]
     pshufb               m0, m4
     pshufb               m1, m4
     pmaddubsw            m0, m5
@@ -430,8 +430,8 @@
     pmulhrsw             m0, m3
     pmulhrsw             m1, m3
     packuswb             m0, m1
-    mova     [dstq+t1+32*3], m0
-    add                  t1, 32
+    mova     [dstq+r6+32*3], m0
+    add                  r6, 32
     jle .h_w128_loop
     add                srcq, ssq
     add                dstq, dsq
@@ -439,11 +439,11 @@
     jg .h_w128
     RET
 .v:
-    movzx                wd, word [t2+wq*2+table_offset(put, _bilin_v)]
-    imul               mxyd, 0xff01
+    movzx                wd, word [r7+wq*2+table_offset(put, _bilin_v)]
+    imul               mxyd, 255
     vpbroadcastd         m5, [pw_2048]
-    add                mxyd, 16 << 8
-    add                  wq, t2
+    add                mxyd, 16
+    add                  wq, r7
     movd                xm4, mxyd
     vpbroadcastw         m4, xm4
     jmp                  wq
@@ -454,7 +454,7 @@
     lea                srcq,      [srcq+ssq*2]
     pinsrw              xm0, xm1, [srcq+ssq*0], 0 ; 2 1
     pshuflw             xm1, xm1, q2301           ; 1 0
-    punpcklbw           xm1, xm0, xm1
+    punpcklbw           xm1, xm0
     pmaddubsw           xm1, xm4
     pmulhrsw            xm1, xm5
     packuswb            xm1, xm1
@@ -467,11 +467,11 @@
 .v_w4:
     movd                xm0, [srcq+ssq*0]
 .v_w4_loop:
-    vpbroadcastd        xm1, [srcq+ssq*1]
+    vpbroadcastd        xm2, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
-    vpblendd            xm2, xm1, xm0, 0x01 ; 0 1
+    vpblendd            xm1, xm2, xm0, 0x01 ; 0 1
     vpbroadcastd        xm0, [srcq+ssq*0]
-    vpblendd            xm1, xm0, 0x02      ; 1 2
+    vpblendd            xm2, xm0, 0x02      ; 1 2
     punpcklbw           xm1, xm2
     pmaddubsw           xm1, xm4
     pmulhrsw            xm1, xm5
@@ -485,11 +485,11 @@
 .v_w8:
     movq                xm0, [srcq+ssq*0]
 .v_w8_loop:
-    movq                xm3, [srcq+ssq*1]
+    movq                xm2, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
-    punpcklbw           xm1, xm3, xm0
+    punpcklbw           xm1, xm0, xm2
     movq                xm0, [srcq+ssq*0]
-    punpcklbw           xm2, xm0, xm3
+    punpcklbw           xm2, xm0
     pmaddubsw           xm1, xm4
     pmaddubsw           xm2, xm4
     pmulhrsw            xm1, xm5
@@ -504,11 +504,11 @@
 .v_w16:
     movu                xm0, [srcq+ssq*0]
 .v_w16_loop:
-    vbroadcasti128       m2, [srcq+ssq*1]
+    vbroadcasti128       m3, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
-    vpblendd             m3, m2, m0, 0x0f ; 0 1
+    vpblendd             m2, m3, m0, 0x0f ; 0 1
     vbroadcasti128       m0, [srcq+ssq*0]
-    vpblendd             m2, m0, 0xf0     ; 1 2
+    vpblendd             m3, m0, 0xf0     ; 1 2
     punpcklbw            m1, m2, m3
     punpckhbw            m2, m3
     pmaddubsw            m1, m4
@@ -528,8 +528,8 @@
 %%loop:
     movu                 m3, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
-    punpcklbw            m1, m3, m0
-    punpckhbw            m2, m3, m0
+    punpcklbw            m1, m0, m3
+    punpckhbw            m2, m0, m3
     movu                 m0, [srcq+ssq*0]
     pmaddubsw            m1, m4
     pmaddubsw            m2, m4
@@ -536,15 +536,15 @@
     pmulhrsw             m1, m5
     pmulhrsw             m2, m5
     packuswb             m1, m2
-    mova       [dstq+dsq*0], m1
-    punpcklbw            m1, m0, m3
-    punpckhbw            m2, m0, m3
-    pmaddubsw            m1, m4
+    punpcklbw            m2, m3, m0
+    punpckhbw            m3, m0
     pmaddubsw            m2, m4
-    pmulhrsw             m1, m5
+    pmaddubsw            m3, m4
     pmulhrsw             m2, m5
-    packuswb             m1, m2
-    mova       [dstq+dsq*1], m1
+    pmulhrsw             m3, m5
+    packuswb             m2, m3
+    mova       [dstq+dsq*0], m1
+    mova       [dstq+dsq*1], m2
     lea                dstq, [dstq+dsq*2]
     sub                  hd, 2
     jg %%loop
@@ -557,8 +557,8 @@
 .v_w64_loop:
     add                srcq, ssq
     movu                 m3, [srcq+32*0]
-    punpcklbw            m2, m3, m0
-    punpckhbw            m0, m3, m0
+    punpcklbw            m2, m0, m3
+    punpckhbw            m0, m3
     pmaddubsw            m2, m4
     pmaddubsw            m0, m4
     pmulhrsw             m2, m5
@@ -567,8 +567,8 @@
     mova                 m0, m3
     movu                 m3, [srcq+32*1]
     mova        [dstq+32*0], m2
-    punpcklbw            m2, m3, m1
-    punpckhbw            m1, m3, m1
+    punpcklbw            m2, m1, m3
+    punpckhbw            m1, m3
     pmaddubsw            m2, m4
     pmaddubsw            m1, m4
     pmulhrsw             m2, m5
@@ -581,28 +581,29 @@
     jg .v_w64_loop
     RET
 .v_w128:
-    mov                  t0, dstq
-    mov                  t1, srcq
-    lea                 t2d, [hq+(3<<8)]
+    lea                 r6d, [hq+(3<<8)]
+    mov                  r4, srcq
+    mov                  r7, dstq
 .v_w128_loop:
     PUT_BILIN_V_W32
-    movzx                hd, t2b
-    add                  t0, 32
-    add                  t1, 32
-    mov                dstq, t0
-    mov                srcq, t1
-    sub                 t2d, 1<<8
+    add                  r4, 32
+    add                  r7, 32
+    movzx                hd, r6b
+    mov                srcq, r4
+    mov                dstq, r7
+    sub                 r6d, 1<<8
     jg .v_w128_loop
     RET
 .hv:
     ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
     ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
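     ; the two forms are exactly equal: dropping the *16 and splitting the
     ; >> 8 into two >> 4 steps does not change the rounding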
-    movzx                wd, word [t2+wq*2+table_offset(put, _bilin_hv)]
+    movzx                wd, word [r7+wq*2+table_offset(put, _bilin_hv)]
     WIN64_SPILL_XMM       8
     shl                mxyd, 11 ; can't shift by 12 due to signed overflow
-    vpbroadcastd         m7, [pw_2048]
+    vpbroadcastd         m7, [pw_15]
     movd                xm6, mxyd
-    add                  wq, t2
+    add                  wq, r7
+    paddb                m5, m5
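+    ; doubling the h coefficients keeps the h output even, so the pavgw with
+    ; pw_15 below yields (h >> 1) + 8, folding in the +8 rounding bias for free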
     vpbroadcastw         m6, xm6
     jmp                  wq
 .hv_w2:
@@ -618,10 +619,10 @@
     shufps              xm2, xm0, xm1, q1032 ; 0 _ 1 _
     mova                xm0, xm1
     psubw               xm1, xm2
-    paddw               xm1, xm1
     pmulhw              xm1, xm6
+    pavgw               xm2, xm7
     paddw               xm1, xm2
-    pmulhrsw            xm1, xm7
+    psrlw               xm1, 4
     packuswb            xm1, xm1
     pextrw     [dstq+dsq*0], xm1, 0
     pextrw     [dstq+dsq*1], xm1, 2
@@ -643,10 +644,10 @@
     shufps              xm2, xm0, xm1, q1032 ; 0 1
     mova                xm0, xm1
     psubw               xm1, xm2
-    paddw               xm1, xm1
     pmulhw              xm1, xm6
+    pavgw               xm2, xm7
     paddw               xm1, xm2
-    pmulhrsw            xm1, xm7
+    psrlw               xm1, 4
     packuswb            xm1, xm1
     movd       [dstq+dsq*0], xm1
     pextrd     [dstq+dsq*1], xm1, 1
@@ -667,10 +668,10 @@
     vperm2i128           m2, m0, m1, 0x21 ; 0 1
     mova                 m0, m1
     psubw                m1, m2
-    paddw                m1, m1
     pmulhw               m1, m6
+    pavgw                m2, m7
     paddw                m1, m2
-    pmulhrsw             m1, m7
+    psrlw                m1, 4
     vextracti128        xm2, m1, 1
     packuswb            xm1, xm2
     movq       [dstq+dsq*0], xm1
@@ -694,16 +695,16 @@
     pshufb               m3, m4
     pmaddubsw            m2, m5
     psubw                m1, m2, m0
-    paddw                m1, m1
     pmulhw               m1, m6
+    pavgw                m0, m7
     paddw                m1, m0
     pmaddubsw            m0, m3, m5
     psubw                m3, m0, m2
-    paddw                m3, m3
     pmulhw               m3, m6
+    pavgw                m2, m7
     paddw                m3, m2
-    pmulhrsw             m1, m7
-    pmulhrsw             m3, m7
+    psrlw                m1, 4
+    psrlw                m3, 4
     packuswb             m1, m3
     vpermq               m1, m1, q3120
     mova         [dstq+dsq*0], xm1
@@ -712,19 +713,21 @@
     sub                  hd, 2
     jg .hv_w16_loop
     RET
+.hv_w128:
+    lea                 r6d, [hq+(3<<16)]
+    jmp .hv_w32_start
+.hv_w64:
+    lea                 r6d, [hq+(1<<16)]
+.hv_w32_start:
+    mov                  r4, srcq
+    mov                  r7, dstq
 .hv_w32:
-    xor                 t2d, t2d
-.hv_w32gt:
-    mov                  t0, dstq
-    mov                  t1, srcq
 %if WIN64
     movaps              r4m, xmm8
 %endif
 .hv_w32_loop0:
     movu                 m0, [srcq+8*0]
-    vinserti128          m0, [srcq+8*2], 1
     movu                 m1, [srcq+8*1]
-    vinserti128          m1, [srcq+8*3], 1
     pshufb               m0, m4
     pshufb               m1, m4
     pmaddubsw            m0, m5
@@ -731,53 +734,44 @@
     pmaddubsw            m1, m5
 .hv_w32_loop:
     add                srcq, ssq
-    movu                xm2, [srcq+8*1]
-    vinserti128          m2, [srcq+8*3], 1
+    movu                 m2, [srcq+8*0]
+    movu                 m3, [srcq+8*1]
     pshufb               m2, m4
+    pshufb               m3, m4
     pmaddubsw            m2, m5
-    psubw                m3, m2, m1
-    paddw                m3, m3
-    pmulhw               m3, m6
-    paddw                m3, m1
-    mova                 m1, m2
-    pmulhrsw             m8, m3, m7
-    movu                xm2, [srcq+8*0]
-    vinserti128          m2, [srcq+8*2], 1
-    pshufb               m2, m4
-    pmaddubsw            m2, m5
-    psubw                m3, m2, m0
-    paddw                m3, m3
-    pmulhw               m3, m6
-    paddw                m3, m0
+    pmaddubsw            m3, m5
+    psubw                m8, m2, m0
+    pmulhw               m8, m6
+    pavgw                m0, m7
+    paddw                m8, m0
     mova                 m0, m2
-    pmulhrsw             m3, m7
-    packuswb             m3, m8
-    mova             [dstq], m3
+    psubw                m2, m3, m1
+    pmulhw               m2, m6
+    pavgw                m1, m7
+    paddw                m2, m1
+    mova                 m1, m3
+    psrlw                m8, 4
+    psrlw                m2, 4
+    packuswb             m8, m2
+    mova             [dstq], m8
     add                dstq, dsq
     dec                  hd
     jg .hv_w32_loop
-    movzx                hd, t2b
-    add                  t0, 32
-    add                  t1, 32
-    mov                dstq, t0
-    mov                srcq, t1
-    sub                 t2d, 1<<8
+    add                  r4, 32
+    add                  r7, 32
+    movzx                hd, r6b
+    mov                srcq, r4
+    mov                dstq, r7
+    sub                 r6d, 1<<16
     jg .hv_w32_loop0
 %if WIN64
     movaps             xmm8, r4m
 %endif
     RET
-.hv_w64:
-    lea                 t2d, [hq+(1<<8)]
-    jmp .hv_w32gt
-.hv_w128:
-    lea                 t2d, [hq+(3<<8)]
-    jmp .hv_w32gt
 
-DECLARE_REG_TMP 3, 5, 6
 cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
     movifnidn          mxyd, r5m ; mx
-    lea                  t2, [prep%+SUFFIX]
+    lea                  r6, [prep%+SUFFIX]
     tzcnt                wd, wm
     movifnidn            hd, hm
     test               mxyd, mxyd
@@ -786,8 +780,8 @@
     test               mxyd, mxyd
     jnz .v
 .prep:
-    movzx                wd, word [t2+wq*2+table_offset(prep,)]
-    add                  wq, t2
+    movzx                wd, word [r6+wq*2+table_offset(prep,)]
+    add                  wq, r6
     lea            stride3q, [strideq*3]
     jmp                  wq
 .prep_w4:
@@ -906,16 +900,16 @@
 .h:
     ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
     ; = (16 - mx) * src[x] + mx * src[x + 1]
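     ; no +8 rounding or >> 4 here: the weights sum to 16 and prep keeps the
     ; result at that 16x scale in the int16 tmp buffer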
-    imul               mxyd, 0xff01
+    imul               mxyd, 255
     vbroadcasti128       m4, [bilin_h_shuf8]
-    add                mxyd, 16 << 8
+    add                mxyd, 16
     movd                xm5, mxyd
     mov                mxyd, r6m ; my
     vpbroadcastw         m5, xm5
     test               mxyd, mxyd
     jnz .hv
-    movzx                wd, word [t2+wq*2+table_offset(prep, _bilin_h)]
-    add                  wq, t2
+    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
+    add                  wq, r6
     lea            stride3q, [strideq*3]
     jmp                  wq
 .h_w4:
@@ -1079,10 +1073,10 @@
     RET
 .v:
     WIN64_SPILL_XMM       7
-    movzx                wd, word [t2+wq*2+table_offset(prep, _bilin_v)]
-    imul               mxyd, 0xff01
-    add                mxyd, 16 << 8
-    add                  wq, t2
+    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
+    imul               mxyd, 255
+    add                mxyd, 16
+    add                  wq, r6
     lea            stride3q, [strideq*3]
     movd                xm6, mxyd
     vpbroadcastw         m6, xm6
@@ -1100,9 +1094,9 @@
     vpblendd             m2, m1, m0, 0xa0 ; 0 2 2 4
     vpblendd             m1, m3, 0xaa     ; 0 1 2 3
     vpblendd             m2, m3, 0x55     ; 1 2 3 4
-    punpcklbw            m2, m1
-    pmaddubsw            m2, m6
-    mova             [tmpq], m2
+    punpcklbw            m1, m2
+    pmaddubsw            m1, m6
+    mova             [tmpq], m1
     add                tmpq, 32
     sub                  hd, 4
     jg .v_w4_loop
@@ -1116,15 +1110,15 @@
     lea                srcq, [srcq+strideq*4]
     vpblendd             m1, m0, 0x03     ; 0 2 2 2
     vpbroadcastq         m0, [srcq+strideq*0]
-    vpblendd             m3, m2, 0x33     ; 1 3 1 3
-    vpblendd             m2, m1, m3, 0x0f ; 1 3 2 2
-    vpblendd             m1, m3, 0xf0     ; 0 2 1 3
-    vpblendd             m2, m0, 0xc0     ; 1 3 2 4
-    punpcklbw            m3, m2, m1
-    punpckhbw            m2, m1
-    pmaddubsw            m3, m6
+    vpblendd             m2, m3, 0xcc     ; 1 3 1 3
+    vpblendd             m3, m2, m1, 0xf0 ; 1 3 2 2
+    vpblendd             m2, m1, 0x0f     ; 0 2 1 3
+    vpblendd             m3, m0, 0xc0     ; 1 3 2 4
+    punpcklbw            m1, m2, m3
+    punpckhbw            m2, m3
+    pmaddubsw            m1, m6
     pmaddubsw            m2, m6
-    mova        [tmpq+32*0], m3
+    mova        [tmpq+32*0], m1
     mova        [tmpq+32*1], m2
     add                tmpq, 32*2
     sub                  hd, 4
@@ -1133,25 +1127,25 @@
 .v_w16:
     vbroadcasti128       m0, [srcq+strideq*0]
 .v_w16_loop:
-    vbroadcasti128       m1, [srcq+strideq*2]
-    vbroadcasti128       m2, [srcq+strideq*1]
+    vbroadcasti128       m1, [srcq+strideq*1]
+    vbroadcasti128       m2, [srcq+strideq*2]
     vbroadcasti128       m3, [srcq+stride3q ]
     lea                srcq, [srcq+strideq*4]
-    shufpd               m4, m0, m1, 0x0c ; 0 2  ; 0l2l 0h2h
+    shufpd               m4, m0, m2, 0x0c ; 0 2
     vbroadcasti128       m0, [srcq+strideq*0]
-    shufpd               m2, m2, m3, 0x0c ; 1 3  ; 1l3l 1h3h
-    shufpd               m1, m1, m0, 0x0c ; 2 4  ; 2l4l 2h4h
-    punpcklbw            m3, m2, m4
+    shufpd               m1, m3, 0x0c     ; 1 3
+    shufpd               m2, m0, 0x0c     ; 2 4
+    punpcklbw            m3, m4, m1
     punpcklbw            m5, m1, m2
+    punpckhbw            m4, m1
     punpckhbw            m1, m2
-    punpckhbw            m2, m4
     pmaddubsw            m3, m6
     pmaddubsw            m5, m6
-    pmaddubsw            m2, m6
+    pmaddubsw            m4, m6
     pmaddubsw            m1, m6
     mova        [tmpq+32*0], m3
     mova        [tmpq+32*1], m5
-    mova        [tmpq+32*2], m2
+    mova        [tmpq+32*2], m4
     mova        [tmpq+32*3], m1
     add                tmpq, 32*4
     sub                  hd, 4
@@ -1164,32 +1158,32 @@
     vpermq               m2, [srcq+strideq*2], q3120
     vpermq               m3, [srcq+stride3q ], q3120
     lea                srcq, [srcq+strideq*4]
-    punpcklbw            m4, m1, m0
-    punpckhbw            m5, m1, m0
+    punpcklbw            m4, m0, m1
+    punpckhbw            m5, m0, m1
     vpermq               m0, [srcq+strideq*0], q3120
     pmaddubsw            m4, m6
     pmaddubsw            m5, m6
     mova        [tmpq+32*0], m4
     mova        [tmpq+32*1], m5
-    punpcklbw            m4, m2, m1
-    punpckhbw            m5, m2, m1
+    punpcklbw            m4, m1, m2
+    punpckhbw            m1, m2
     pmaddubsw            m4, m6
+    pmaddubsw            m1, m6
+    punpcklbw            m5, m2, m3
+    punpckhbw            m2, m3
     pmaddubsw            m5, m6
+    pmaddubsw            m2, m6
     mova        [tmpq+32*2], m4
-    mova        [tmpq+32*3], m5
+    mova        [tmpq+32*3], m1
     add                tmpq, 32*8
-    punpcklbw            m4, m3, m2
-    punpckhbw            m5, m3, m2
-    punpcklbw            m1, m0, m3
-    punpckhbw            m2, m0, m3
-    pmaddubsw            m4, m6
-    pmaddubsw            m5, m6
+    punpcklbw            m1, m3, m0
+    punpckhbw            m3, m0
     pmaddubsw            m1, m6
-    pmaddubsw            m2, m6
-    mova        [tmpq-32*4], m4
-    mova        [tmpq-32*3], m5
+    pmaddubsw            m3, m6
+    mova        [tmpq-32*4], m5
+    mova        [tmpq-32*3], m2
     mova        [tmpq-32*2], m1
-    mova        [tmpq-32*1], m2
+    mova        [tmpq-32*1], m3
     sub                  hd, 4
     jg .v_w32_loop
     RET
@@ -1200,14 +1194,14 @@
     vpermq               m2, [srcq+strideq*1+32*0], q3120
     vpermq               m3, [srcq+strideq*1+32*1], q3120
     lea                srcq, [srcq+strideq*2]
-    punpcklbw            m4, m2, m0
-    punpckhbw            m5, m2, m0
+    punpcklbw            m4, m0, m2
+    punpckhbw            m0, m2
     pmaddubsw            m4, m6
-    pmaddubsw            m5, m6
+    pmaddubsw            m0, m6
     mova        [tmpq+32*0], m4
-    mova        [tmpq+32*1], m5
-    punpcklbw            m4, m3, m1
-    punpckhbw            m5, m3, m1
+    mova        [tmpq+32*1], m0
+    punpcklbw            m4, m1, m3
+    punpckhbw            m5, m1, m3
     vpermq               m0, [srcq+strideq*0+32*0], q3120
     vpermq               m1, [srcq+strideq*0+32*1], q3120
     pmaddubsw            m4, m6
@@ -1215,52 +1209,52 @@
     mova        [tmpq+32*2], m4
     mova        [tmpq+32*3], m5
     add                tmpq, 32*8
-    punpcklbw            m4, m0, m2
-    punpckhbw            m5, m0, m2
-    punpcklbw            m2, m1, m3
-    punpckhbw            m3, m1, m3
+    punpcklbw            m4, m2, m0
+    punpckhbw            m2, m0
+    punpcklbw            m5, m3, m1
+    punpckhbw            m3, m1
     pmaddubsw            m4, m6
-    pmaddubsw            m5, m6
     pmaddubsw            m2, m6
+    pmaddubsw            m5, m6
     pmaddubsw            m3, m6
     mova        [tmpq-32*4], m4
-    mova        [tmpq-32*3], m5
-    mova        [tmpq-32*2], m2
+    mova        [tmpq-32*3], m2
+    mova        [tmpq-32*2], m5
     mova        [tmpq-32*1], m3
     sub                  hd, 2
     jg .v_w64_loop
     RET
 .v_w128:
-    mov                  t0, tmpq
-    mov                  t1, srcq
-    lea                 t2d, [hq+(3<<8)]
+    lea                 r6d, [hq+(3<<8)]
+    mov                  r3, srcq
+    mov                  r5, tmpq
 .v_w128_loop0:
     vpermq               m0, [srcq+strideq*0], q3120
 .v_w128_loop:
     vpermq               m1, [srcq+strideq*1], q3120
     lea                srcq, [srcq+strideq*2]
-    punpcklbw            m2, m1, m0
-    punpckhbw            m3, m1, m0
+    punpcklbw            m2, m0, m1
+    punpckhbw            m3, m0, m1
     vpermq               m0, [srcq+strideq*0], q3120
-    punpcklbw            m4, m0, m1
-    punpckhbw            m5, m0, m1
     pmaddubsw            m2, m6
     pmaddubsw            m3, m6
+    punpcklbw            m4, m1, m0
+    punpckhbw            m1, m0
     pmaddubsw            m4, m6
-    pmaddubsw            m5, m6
+    pmaddubsw            m1, m6
     mova        [tmpq+32*0], m2
     mova        [tmpq+32*1], m3
     mova        [tmpq+32*8], m4
-    mova        [tmpq+32*9], m5
+    mova        [tmpq+32*9], m1
     add                tmpq, 32*16
     sub                  hd, 2
     jg .v_w128_loop
-    movzx                hd, t2b
-    add                  t0, 64
-    add                  t1, 32
-    mov                tmpq, t0
-    mov                srcq, t1
-    sub                 t2d, 1<<8
+    add                  r3, 32
+    add                  r5, 64
+    movzx                hd, r6b
+    mov                srcq, r3
+    mov                tmpq, r5
+    sub                 r6d, 1<<8
     jg .v_w128_loop0
     RET
 .hv:
@@ -1268,11 +1262,11 @@
     ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
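     ; pmulhrsw with (my << 11) computes exactly ((my * diff) + 8) >> 4, so the
     ; rounded vertical delta is just added back onto the horizontal result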
     %assign stack_offset stack_offset - stack_size_padded
     WIN64_SPILL_XMM       7
-    movzx                wd, word [t2+wq*2+table_offset(prep, _bilin_hv)]
+    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
     shl                mxyd, 11
     movd                xm6, mxyd
     vpbroadcastw         m6, xm6
-    add                  wq, t2
+    add                  wq, r6
     lea            stride3q, [strideq*3]
     jmp                  wq
 .hv_w4:
@@ -1388,10 +1382,19 @@
     dec                  hd
     jg .hv_w32_loop
     RET
+.hv_w128:
+    lea                 r3d, [hq+(7<<8)]
+    mov                 r6d, 256
+    jmp .hv_w64_start
 .hv_w64:
-    mov                  t0, tmpq
-    mov                  t1, srcq
-    lea                 t2d, [hq+(3<<8)]
+    lea                 r3d, [hq+(3<<8)]
+    mov                 r6d, 128
+.hv_w64_start:
+%if WIN64
+    PUSH                 r7
+%endif
+    mov                  r5, srcq
+    mov                  r7, tmpq
 .hv_w64_loop0:
     movu                xm0, [srcq+strideq*0+8*0]
     vinserti128          m0, [srcq+strideq*0+8*1], 1
@@ -1413,57 +1416,22 @@
     psubw                m2, m0, m1
     pmulhrsw             m2, m6
     paddw                m2, m1
-    mova        [tmpq+32*0], m3
-    add                tmpq, 32*8
-    mova        [tmpq-32*4], m2
+    mova        [tmpq+r6*0], m3
+    mova        [tmpq+r6*1], m2
+    lea                tmpq, [tmpq+r6*2]
     sub                  hd, 2
     jg .hv_w64_loop
-    movzx                hd, t2b
-    add                  t0, 32
-    add                  t1, 16
-    mov                tmpq, t0
-    mov                srcq, t1
-    sub                 t2d, 1<<8
+    add                  r5, 16
+    add                  r7, 32
+    movzx                hd, r3b
+    mov                srcq, r5
+    mov                tmpq, r7
+    sub                 r3d, 1<<8
     jg .hv_w64_loop0
+%if WIN64
+    POP                  r7
+%endif
     RET
-.hv_w128:
-    mov                  t0, tmpq
-    mov                  t1, srcq
-    lea                 t2d, [hq+(7<<8)]
-.hv_w128_loop0:
-    movu                xm0, [srcq+strideq*0+8*0]
-    vinserti128          m0, [srcq+strideq*0+8*1], 1
-    pshufb               m0, m4
-    pmaddubsw            m0, m5
-.hv_w128_loop:
-    movu                xm1, [srcq+strideq*1+8*0]
-    vinserti128          m1, [srcq+strideq*1+8*1], 1
-    lea                srcq, [srcq+strideq*2]
-    movu                xm2, [srcq+strideq*0+8*0]
-    vinserti128          m2, [srcq+strideq*0+8*1], 1
-    pshufb               m1, m4
-    pshufb               m2, m4
-    pmaddubsw            m1, m5
-    psubw                m3, m1, m0
-    pmulhrsw             m3, m6
-    paddw                m3, m0
-    pmaddubsw            m0, m2, m5
-    psubw                m2, m0, m1
-    pmulhrsw             m2, m6
-    paddw                m2, m1
-    mova        [tmpq+32*0], m3
-    mova        [tmpq+32*8], m2
-    add                tmpq, 32*16
-    sub                  hd, 2
-    jg .hv_w128_loop
-    movzx                hd, t2b
-    add                  t0, 32
-    add                  t1, 16
-    mov                tmpq, t0
-    mov                srcq, t1
-    sub                 t2d, 1<<8
-    jg .hv_w128_loop0
-    RET
 
 ; int8_t subpel_filters[5][15][8]
 %assign FILTER_REGULAR (0*15 << 16) | 3*15
@@ -1676,12 +1644,12 @@
     movd                xm2, [srcq+ssq*0]
     pinsrw              xm2, [srcq+ssq*1], 2
     pinsrw              xm2, [srcq+ssq*2], 4
-    pinsrw              xm2, [srcq+ss3q ], 6 ; 0 1 2 3
-    lea                srcq, [srcq+ssq*4]
-    movd                xm3, [srcq+ssq*0]
-    vpbroadcastd        xm1, [srcq+ssq*1]
-    vpbroadcastd        xm0, [srcq+ssq*2]
     add                srcq, ss3q
+    pinsrw              xm2, [srcq+ssq*0], 6 ; 0 1 2 3
+    movd                xm3, [srcq+ssq*1]
+    vpbroadcastd        xm1, [srcq+ssq*2]
+    add                srcq, ss3q
+    vpbroadcastd        xm0, [srcq+ssq*0]
     vpblendd            xm3, xm1, 0x02       ; 4 5
     vpblendd            xm1, xm0, 0x02       ; 5 6
     palignr             xm4, xm3, xm2, 4     ; 1 2 3 4
@@ -1696,10 +1664,10 @@
     mova                xm2, xm3
     pmaddubsw           xm3, xm10            ; a2 b2
     paddw               xm5, xm3
-    vpbroadcastd        xm4, [srcq+ssq*0]
-    vpblendd            xm3, xm0, xm4, 0x02  ; 6 7
-    vpbroadcastd        xm0, [srcq+ssq*1]
+    vpbroadcastd        xm4, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
+    vpblendd            xm3, xm0, xm4, 0x02  ; 6 7
+    vpbroadcastd        xm0, [srcq+ssq*0]
     vpblendd            xm4, xm0, 0x02       ; 7 8
     punpcklbw           xm3, xm4             ; 67 78
     pmaddubsw           xm4, xm3, xm11       ; a3 b3
@@ -1716,12 +1684,12 @@
     movd                xm2, [srcq+ssq*0]
     pinsrd              xm2, [srcq+ssq*1], 1
     pinsrd              xm2, [srcq+ssq*2], 2
-    pinsrd              xm2, [srcq+ss3q ], 3 ; 0 1 2 3
-    lea                srcq, [srcq+ssq*4]
-    movd                xm3, [srcq+ssq*0]
-    vpbroadcastd        xm1, [srcq+ssq*1]
-    vpbroadcastd        xm0, [srcq+ssq*2]
     add                srcq, ss3q
+    pinsrd              xm2, [srcq+ssq*0], 3 ; 0 1 2 3
+    movd                xm3, [srcq+ssq*1]
+    vpbroadcastd        xm1, [srcq+ssq*2]
+    add                srcq, ss3q
+    vpbroadcastd        xm0, [srcq+ssq*0]
     vpblendd            xm3, xm1, 0x02       ; 4 5
     vpblendd            xm1, xm0, 0x02       ; 5 6
     palignr             xm4, xm3, xm2, 4     ; 1 2 3 4
@@ -1736,10 +1704,10 @@
     mova                xm2, xm3
     pmaddubsw           xm3, xm10            ; a2 b2
     paddw               xm5, xm3
-    vpbroadcastd        xm4, [srcq+ssq*0]
-    vpblendd            xm3, xm0, xm4, 0x02  ; 6 7
-    vpbroadcastd        xm0, [srcq+ssq*1]
+    vpbroadcastd        xm4, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
+    vpblendd            xm3, xm0, xm4, 0x02  ; 6 7
+    vpbroadcastd        xm0, [srcq+ssq*0]
     vpblendd            xm4, xm0, 0x02       ; 7 8
     punpcklbw           xm3, xm4             ; 67 78
     pmaddubsw           xm4, xm3, xm11       ; a3 b3
@@ -1756,12 +1724,12 @@
     movq                xm1, [srcq+ssq*0]
     vpbroadcastq         m4, [srcq+ssq*1]
     vpbroadcastq         m2, [srcq+ssq*2]
-    vpbroadcastq         m5, [srcq+ss3q ]
-    lea                srcq, [srcq+ssq*4]
-    vpbroadcastq         m3, [srcq+ssq*0]
-    vpbroadcastq         m6, [srcq+ssq*1]
-    vpbroadcastq         m0, [srcq+ssq*2]
     add                srcq, ss3q
+    vpbroadcastq         m5, [srcq+ssq*0]
+    vpbroadcastq         m3, [srcq+ssq*1]
+    vpbroadcastq         m6, [srcq+ssq*2]
+    add                srcq, ss3q
+    vpbroadcastq         m0, [srcq+ssq*0]
     vpblendd             m1, m4, 0x30
     vpblendd             m4, m2, 0x30
     punpcklbw            m1, m4      ; 01 12
@@ -1772,6 +1740,8 @@
     vpblendd             m6, m0, 0x30
     punpcklbw            m3, m6      ; 45 56
 .v_w8_loop:
+    vpbroadcastq         m4, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
     pmaddubsw            m5, m1, m8  ; a0 b0
     mova                 m1, m2
     pmaddubsw            m2, m9      ; a1 b1
@@ -1779,10 +1749,8 @@
     mova                 m2, m3
     pmaddubsw            m3, m10     ; a2 b2
     paddw                m5, m3
-    vpbroadcastq         m4, [srcq+ssq*0]
     vpblendd             m3, m0, m4, 0x30
-    vpbroadcastq         m0, [srcq+ssq*1]
-    lea                srcq, [srcq+ssq*2]
+    vpbroadcastq         m0, [srcq+ssq*0]
     vpblendd             m4, m0, 0x30
     punpcklbw            m3, m4      ; 67 78
     pmaddubsw            m4, m3, m11 ; a3 b3
@@ -1800,30 +1768,28 @@
 .v_w32:
 .v_w64:
 .v_w128:
-    lea                 r6d, [wq-16]
-    mov                  r4, dstq
-    mov                  r7, srcq
-    shl                 r6d, 4
-    mov                 r6b, hb
+    lea                 r6d, [wq*8-128]
+    mov                  r4, srcq
+    mov                  r7, dstq
+    lea                 r6d, [hq+r6*2]
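+    ; r6d = h + ((w - 16) << 4): h in the low byte, the bits above it count
+    ; the remaining 16-pixel wide column passes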
 .v_w16_loop0:
     vbroadcasti128       m4, [srcq+ssq*0]
     vbroadcasti128       m5, [srcq+ssq*1]
-    lea                srcq, [srcq+ssq*2]
-    vbroadcasti128       m0, [srcq+ssq*1]
-    vbroadcasti128       m6, [srcq+ssq*0]
-    lea                srcq, [srcq+ssq*2]
-    vbroadcasti128       m1, [srcq+ssq*0]
-    vbroadcasti128       m2, [srcq+ssq*1]
-    lea                srcq, [srcq+ssq*2]
+    vbroadcasti128       m6, [srcq+ssq*2]
+    add                srcq, ss3q
+    vbroadcasti128       m0, [srcq+ssq*0]
+    vbroadcasti128       m1, [srcq+ssq*1]
+    vbroadcasti128       m2, [srcq+ssq*2]
+    add                srcq, ss3q
     vbroadcasti128       m3, [srcq+ssq*0]
-    shufpd               m4, m4, m0, 0x0c
-    shufpd               m5, m5, m1, 0x0c
+    shufpd               m4, m0, 0x0c
+    shufpd               m5, m1, 0x0c
     punpcklbw            m1, m4, m5 ; 01
     punpckhbw            m4, m5     ; 34
-    shufpd               m6, m6, m2, 0x0c
+    shufpd               m6, m2, 0x0c
     punpcklbw            m2, m5, m6 ; 12
     punpckhbw            m5, m6     ; 45
-    shufpd               m0, m0, m3, 0x0c
+    shufpd               m0, m3, 0x0c
     punpcklbw            m3, m6, m0 ; 23
     punpckhbw            m6, m0     ; 56
 .v_w16_loop:
@@ -1861,11 +1827,11 @@
     lea                dstq, [dstq+dsq*2]
     sub                  hd, 2
     jg .v_w16_loop
-    movzx                hd, r6b
     add                  r4, 16
     add                  r7, 16
-    mov                dstq, r4
-    mov                srcq, r7
+    movzx                hd, r6b
+    mov                srcq, r4
+    mov                dstq, r7
     sub                 r6d, 1<<8
     jg .v_w16_loop0
     RET
@@ -1898,12 +1864,12 @@
     movq                xm2, [srcq+ssq*0]
     movhps              xm2, [srcq+ssq*1]
     movq                xm0, [srcq+ssq*2]
-    movhps              xm0, [srcq+ss3q ]
-    lea                srcq, [srcq+ssq*4]
-    vpbroadcastq         m3, [srcq+ssq*0]
-    vpbroadcastq         m4, [srcq+ssq*1]
-    vpbroadcastq         m1, [srcq+ssq*2]
     add                srcq, ss3q
+    movhps              xm0, [srcq+ssq*0]
+    vpbroadcastq         m3, [srcq+ssq*1]
+    vpbroadcastq         m4, [srcq+ssq*2]
+    add                srcq, ss3q
+    vpbroadcastq         m1, [srcq+ssq*0]
     vpblendd             m2, m3, 0x30
     vpblendd             m0, m1, 0x30
     vpblendd             m2, m4, 0xc0
@@ -1920,6 +1886,11 @@
     pshufd              xm0, xm3, q2121
     punpcklwd           xm3, xm0       ; 45 56
 .hv_w2_loop:
+    movq                xm4, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    movhps              xm4, [srcq+ssq*0]
+    pshufb              xm4, xm6
+    pmaddubsw           xm4, xm7
     pmaddwd             xm5, xm1, xm10 ; a0 b0
     mova                xm1, xm2
     pmaddwd             xm2, xm11      ; a1 b1
@@ -1926,14 +1897,9 @@
     paddd               xm5, xm2
     mova                xm2, xm3
     pmaddwd             xm3, xm12      ; a2 b2
-    paddd               xm5, xm3
-    movq                xm4, [srcq+ssq*0]
-    movhps              xm4, [srcq+ssq*1]
-    lea                srcq, [srcq+ssq*2]
-    pshufb              xm4, xm6
-    pmaddubsw           xm4, xm7
     phaddw              xm4, xm4
     pmulhrsw            xm4, xm8
+    paddd               xm5, xm3
     palignr             xm3, xm4, xm0, 12
     mova                xm0, xm4
     punpcklwd           xm3, xm0       ; 67 78
@@ -1954,13 +1920,13 @@
     vpbroadcastq         m2, [srcq+ssq*0]
     vpbroadcastq         m4, [srcq+ssq*1]
     vpbroadcastq         m0, [srcq+ssq*2]
-    vpbroadcastq         m5, [srcq+ss3q ]
-    lea                srcq, [srcq+ssq*4]
-    vpbroadcastq         m3, [srcq+ssq*0]
+    add                srcq, ss3q
+    vpbroadcastq         m5, [srcq+ssq*0]
+    vpbroadcastq         m3, [srcq+ssq*1]
     vpblendd             m2, m4, 0xcc ; 0 1
-    vpbroadcastq         m4, [srcq+ssq*1]
-    vpbroadcastq         m1, [srcq+ssq*2]
+    vpbroadcastq         m4, [srcq+ssq*2]
     add                srcq, ss3q
+    vpbroadcastq         m1, [srcq+ssq*0]
     vpblendd             m0, m5, 0xcc ; 2 3
     vpblendd             m3, m4, 0xcc ; 4 5
     pshufb               m2, m6
@@ -1981,6 +1947,8 @@
     pshufd               m0, m3, q2121
     punpcklwd            m3, m0       ; 45 56
 .hv_w4_loop:
+    vpbroadcastq         m4, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
     pmaddwd              m5, m1, m10  ; a0 b0
     mova                 m1, m2
     pmaddwd              m2, m11      ; a1 b1
@@ -1988,9 +1956,7 @@
     mova                 m2, m3
     pmaddwd              m3, m12      ; a2 b2
     paddd                m5, m3
-    vpbroadcastq         m4, [srcq+ssq*0]
-    vpbroadcastq         m3, [srcq+ssq*1]
-    lea                srcq, [srcq+ssq*2]
+    vpbroadcastq         m3, [srcq+ssq*0]
     vpblendd             m4, m3, 0xcc ; 7 8
     pshufb               m4, m6
     pmaddubsw            m4, m7
@@ -2031,25 +1997,23 @@
     pshufd              m13, m0, q1111
     pshufd              m14, m0, q2222
     pshufd              m15, m0, q3333
-    lea                 r6d, [wq-8]
-    mov                  r4, dstq
-    mov                  r7, srcq
-    shl                 r6d, 5
-    mov                 r6b, hb
+    lea                 r6d, [wq*8-64]
+    mov                  r4, srcq
+    mov                  r7, dstq
+    lea                 r6d, [hq+r6*4]
 .hv_w8_loop0:
     vbroadcasti128       m7, [subpel_h_shufA]
-    vbroadcasti128       m8, [subpel_h_shufB]
-    vbroadcasti128       m9, [subpel_h_shufC]
     movu                xm4, [srcq+ssq*0]
+    vbroadcasti128       m8, [subpel_h_shufB]
     movu                xm5, [srcq+ssq*1]
-    lea                srcq, [srcq+ssq*2]
-    movu                xm6, [srcq+ssq*0]
-    vbroadcasti128       m0, [srcq+ssq*1]
-    lea                srcq, [srcq+ssq*2]
+    vbroadcasti128       m9, [subpel_h_shufC]
+    movu                xm6, [srcq+ssq*2]
+    add                srcq, ss3q
+    vbroadcasti128       m0, [srcq+ssq*0]
     vpblendd             m4, m0, 0xf0        ; 0 3
-    vinserti128          m5, [srcq+ssq*0], 1 ; 1 4
-    vinserti128          m6, [srcq+ssq*1], 1 ; 2 5
-    lea                srcq, [srcq+ssq*2]
+    vinserti128          m5, [srcq+ssq*1], 1 ; 1 4
+    vinserti128          m6, [srcq+ssq*2], 1 ; 2 5
+    add                srcq, ss3q
     vinserti128          m0, [srcq+ssq*0], 1 ; 3 6
 %macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3]
     pshufb               %3, %1, %6
@@ -2130,11 +2094,11 @@
     lea                dstq, [dstq+dsq*2]
     sub                  hd, 2
     jg .hv_w8_loop
-    movzx                hd, r6b
     add                  r4, 8
     add                  r7, 8
-    mov                dstq, r4
-    mov                srcq, r7
+    movzx                hd, r6b
+    mov                srcq, r4
+    mov                dstq, r7
     sub                 r6d, 1<<8
     jg .hv_w8_loop0
     RET
@@ -2153,48 +2117,6 @@
     pmulhrsw             m0, m4
 %endmacro
 
-%macro PREP_8TAP_V_W4 5 ; round, weights
-    movd                xm0, [srcq+strideq*0]
-    vpbroadcastd         m1, [srcq+strideq*2]
-    vpbroadcastd        xm2, [srcq+strideq*1]
-    vpbroadcastd         m3, [srcq+stride3q ]
-    lea                srcq, [srcq+strideq*4]
-    vpblendd             m1, m0, 0x01     ; 0 2 2 _   2 _ _ _
-    vpblendd             m3, m2, 0x03     ; 1 1 3 3   3 3 _ _
-    vpbroadcastd         m0, [srcq+strideq*0]
-    vpbroadcastd         m2, [srcq+strideq*1]
-    vpblendd             m1, m0, 0x68     ; 0 2 2 4   2 4 4 _
-    vpbroadcastd         m0, [srcq+strideq*2]
-    vbroadcasti128       m5, [deint_shuf4]
-    vpblendd             m3, m2, 0xc0     ; 1 1 3 3   3 3 5 5
-    vpblendd             m2, m3, m1, 0x55 ; 0 1 2 3   2 3 4 5
-    vpblendd             m3, m1, 0xaa     ; 1 2 3 4   3 4 5 _
-    punpcklbw            m1, m2, m3       ; 01  12    23  34
-    vpblendd             m3, m0, 0x80     ; 1 2 3 4   3 4 5 6
-    punpckhbw            m2, m3           ; 23  34    45  56
-.v_w4_loop:
-    pinsrd              xm0, [srcq+stride3q ], 1
-    lea                srcq, [srcq+strideq*4]
-    vpbroadcastd         m3, [srcq+strideq*0]
-    vpbroadcastd         m4, [srcq+strideq*1]
-    vpblendd             m3, m4, 0x20     ; _ _ 8 _   8 9 _ _
-    vpblendd             m3, m0, 0x03     ; 6 7 8 _   8 9 _ _
-    vpbroadcastd         m0, [srcq+strideq*2]
-    vpblendd             m3, m0, 0x40     ; 6 7 8 _   8 9 a _
-    pshufb               m3, m5           ; 67  78    89  9a
-    pmaddubsw            m4, m1, m%2
-    vperm2i128           m1, m2, m3, 0x21 ; 45  56    67  78
-    pmaddubsw            m2, m%3
-    paddw                m4, m2
-    mova                 m2, m3
-    pmaddubsw            m3, m%5
-    paddw                m3, m4
-    pmaddubsw            m4, m1, m%4
-    paddw                m3, m4
-    pmulhrsw             m3, m%1
-    mova             [tmpq], m3
-%endmacro
-
 %if WIN64
 DECLARE_REG_TMP 6, 4
 %else
@@ -2347,7 +2269,45 @@
     jg .v_w16
     je .v_w8
 .v_w4:
-    PREP_8TAP_V_W4 7, 8, 9, 10, 11
+    movd                xm0, [srcq+strideq*0]
+    vpbroadcastd         m1, [srcq+strideq*2]
+    vpbroadcastd        xm2, [srcq+strideq*1]
+    add                srcq, stride3q
+    vpbroadcastd         m3, [srcq+strideq*0]
+    vpblendd             m1, m0, 0x01     ; 0 2 2 _   2 _ _ _
+    vpblendd             m3, m2, 0x03     ; 1 1 3 3   3 3 _ _
+    vpbroadcastd         m0, [srcq+strideq*1]
+    vpbroadcastd         m2, [srcq+strideq*2]
+    vpblendd             m1, m0, 0x68     ; 0 2 2 4   2 4 4 _
+    vpbroadcastd         m0, [srcq+stride3q ]
+    vbroadcasti128       m5, [deint_shuf4]
+    vpblendd             m3, m2, 0xc0     ; 1 1 3 3   3 3 5 5
+    vpblendd             m2, m3, m1, 0x55 ; 0 1 2 3   2 3 4 5
+    vpblendd             m3, m1, 0xaa     ; 1 2 3 4   3 4 5 _
+    punpcklbw            m1, m2, m3       ; 01  12    23  34
+    vpblendd             m3, m0, 0x80     ; 1 2 3 4   3 4 5 6
+    punpckhbw            m2, m3           ; 23  34    45  56
+.v_w4_loop:
+    lea                srcq, [srcq+strideq*4]
+    pinsrd              xm0, [srcq+strideq*0], 1
+    vpbroadcastd         m3, [srcq+strideq*1]
+    vpbroadcastd         m4, [srcq+strideq*2]
+    vpblendd             m3, m0, 0x03     ; 6 7 8 _   8 _ _ _
+    vpbroadcastd         m0, [srcq+stride3q ]
+    vpblendd             m3, m4, 0x20     ; 6 7 8 _   8 9 _ _
+    vpblendd             m3, m0, 0x40     ; 6 7 8 _   8 9 a _
+    pshufb               m3, m5           ; 67  78    89  9a
+    pmaddubsw            m4, m1, m8
+    vperm2i128           m1, m2, m3, 0x21 ; 45  56    67  78
+    pmaddubsw            m2, m9
+    paddw                m4, m2
+    mova                 m2, m3
+    pmaddubsw            m3, m11
+    paddw                m3, m4
+    pmaddubsw            m4, m1, m10
+    paddw                m3, m4
+    pmulhrsw             m3, m7
+    mova             [tmpq], m3
     add                tmpq, 32
     sub                  hd, 4
     jg .v_w4_loop
@@ -2406,11 +2366,10 @@
     jg .v_w8_loop
     RET
 .v_w16:
-    lea                 r6d, [wq-16]
-    mov                  r5, tmpq
-    mov                  r7, srcq
-    shl                 r6d, 4
-    mov                 r6b, hb
+    add                  wd, wd
+    mov                  r5, srcq
+    mov                  r7, tmpq
+    lea                 r6d, [hq+wq*8-256]
 .v_w16_loop0:
     vbroadcasti128       m4, [srcq+strideq*0]
     vbroadcasti128       m5, [srcq+strideq*1]
@@ -2461,15 +2420,15 @@
     pmulhrsw            m14, m7
     pmulhrsw            m15, m7
     mova        [tmpq+wq*0], m14
-    mova        [tmpq+wq*2], m15
-    lea                tmpq, [tmpq+wq*4]
+    mova        [tmpq+wq*1], m15
+    lea                tmpq, [tmpq+wq*2]
     sub                  hd, 2
     jg .v_w16_loop
+    add                  r5, 16
+    add                  r7, 32
     movzx                hd, r6b
-    add                  r5, 32
-    add                  r7, 16
-    mov                tmpq, r5
-    mov                srcq, r7
+    mov                srcq, r5
+    mov                tmpq, r7
     sub                 r6d, 1<<8
     jg .v_w16_loop0
     RET
@@ -2557,8 +2516,8 @@
     vpbroadcastq         m2, [srcq+stride3q ]
     lea                srcq, [srcq+strideq*4]
     paddd                m6, m4
-    paddd                m5, m3
     vpbroadcastq         m4, [srcq+strideq*0]
+    paddd                m5, m3
     vpbroadcastq         m3, [srcq+strideq*1]
     vpblendd             m2, m4, 0xcc
     vpbroadcastq         m4, [srcq+strideq*2]
@@ -2591,18 +2550,17 @@
     jg .hv_w4_loop
     RET
 .hv_w8:
-    lea                 r6d, [wq-8]
-    mov                  r5, tmpq
-    mov                  r7, srcq
-    shl                 r6d, 5
-    mov                 r6b, hb
+    lea                 r6d, [wq*8-64]
+    mov                  r5, srcq
+    mov                  r7, tmpq
+    lea                 r6d, [hq+r6*4]
 .hv_w8_loop0:
     vbroadcasti128       m7, [subpel_h_shufA]
-    vbroadcasti128       m8, [subpel_h_shufB]
-    vbroadcasti128       m9, [subpel_h_shufC]
     movu                xm4, [srcq+strideq*0]
+    vbroadcasti128       m8, [subpel_h_shufB]
     movu                xm5, [srcq+strideq*1]
     lea                srcq, [srcq+strideq*2]
+    vbroadcasti128       m9, [subpel_h_shufC]
     movu                xm6, [srcq+strideq*0]
     vbroadcasti128       m0, [srcq+strideq*1]
     lea                srcq, [srcq+strideq*2]
@@ -2676,11 +2634,11 @@
     lea                tmpq, [tmpq+wq*4]
     sub                  hd, 2
     jg .hv_w8_loop
+    add                  r5, 8
+    add                  r7, 16
     movzx                hd, r6b
-    add                  r5, 16
-    add                  r7, 8
-    mov                tmpq, r5
-    mov                srcq, r7
+    mov                srcq, r5
+    mov                tmpq, r7
     sub                 r6d, 1<<8
     jg .hv_w8_loop0
     RET
--- a/src/x86/mc_sse.asm
+++ b/src/x86/mc_sse.asm
@@ -57,8 +57,8 @@
 subpel_h_shufC: db 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
 subpel_s_shuf2: db 0,  1,  2,  3,  0,  1,  2,  3,  8,  9, 10, 11,  8,  9, 10, 11
 subpel_s_shuf8: db 0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
-bilin_h_shuf4:  db 1,  0,  2,  1,  3,  2,  4,  3,  9,  8, 10,  9, 11, 10, 12, 11
-bilin_h_shuf8:  db 1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7
+bilin_h_shuf4:  db 0,  1,  1,  2,  2,  3,  3,  4,  8,  9,  9, 10, 10, 11, 11, 12
+bilin_h_shuf8:  db 0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8
 unpckw:         db 0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
 
 pb_8x0_8x8: times 8 db 0
@@ -77,6 +77,7 @@
 pw_1:     times 8 dw 1
 pw_2:     times 8 dw 2
 pw_8:     times 8 dw 8
+pw_15:    times 8 dw 15
 pw_26:    times 8 dw 26
 pw_34:    times 8 dw 34
 pw_512:   times 8 dw 512
@@ -220,16 +221,18 @@
  DECLARE_REG_TMP 7
  %define base 0
 %endif
-;
+
 %macro RESTORE_DSQ_32 1
  %if ARCH_X86_32
    mov                  %1, dsm ; restore dsq
  %endif
 %endmacro
-;
-cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy, bak
+
+cglobal put_bilin, 1, 8, 0, dst, ds, src, ss, w, h, mxy
     movifnidn          mxyd, r6m ; mx
     LEA                  t0, put_ssse3
+    movifnidn          srcq, srcmp
+    movifnidn           ssq, ssmp
     tzcnt                wd, wm
     mov                  hd, hm
     test               mxyd, mxyd
@@ -335,20 +338,19 @@
 .h:
     ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
     ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
-    imul               mxyd, 0xff01
+    imul               mxyd, 0x00ff00ff
     mova                 m4, [base+bilin_h_shuf8]
     mova                 m0, [base+bilin_h_shuf4]
-    add                mxyd, 16 << 8
+    add                mxyd, 0x00100010
     movd                 m5, mxyd
     mov                mxyd, r7m ; my
-    pshuflw              m5, m5, q0000
-    punpcklqdq           m5, m5
+    pshufd               m5, m5, q0000
     test               mxyd, mxyd
     jnz .hv
     movzx                wd, word [t0+wq*2+table_offset(put, _bilin_h)]
     mova                 m3, [base+pw_2048]
     add                  wq, t0
-    RESTORE_DSQ_32       t0
+    movifnidn           dsq, dsmp
     jmp                  wq
 .h_w2:
     pshufd               m4, m4, q3120 ; m4 = {1, 0, 2, 1, 5, 4, 6, 5}
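For reference, the horizontal bilin rounding described in the comments above, together with the packed coefficients this patch now builds with imul mxyd, 0x00ff00ff / add mxyd, 0x00100010, can be modelled by the scalar C sketch below (the helper name and signature are illustrative only, not dav1d API):

#include <stdint.h>

/* Scalar model of the put_bilin horizontal tap (8bpc, 1 <= mx <= 15):
 *   dst[x] = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
 * The SSSE3 path feeds pmaddubsw the byte pair {16 - mx, mx}; that is what
 * mx * 0x00ff00ff + 0x00100010 packs into each 16-bit half, since
 * 255 * mx + 16 == (mx << 8) + (16 - mx). */
static inline uint8_t bilin_h_scalar(const uint8_t *src, int x, int mx)
{
    return (uint8_t)(((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4);
}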
@@ -485,14 +487,13 @@
     RET
 .v:
     movzx                wd, word [t0+wq*2+table_offset(put, _bilin_v)]
-    imul               mxyd, 0xff01
+    imul               mxyd, 0x00ff00ff
     mova                 m5, [base+pw_2048]
-    add                mxyd, 16 << 8
+    add                mxyd, 0x00100010
     add                  wq, t0
     movd                 m4, mxyd
-    pshuflw              m4, m4, q0000
-    punpcklqdq           m4, m4
-    RESTORE_DSQ_32       t0
+    pshufd               m4, m4, q0000
+    movifnidn           dsq, dsmp
     jmp                  wq
 .v_w2:
     movd                 m0, [srcq+ssq*0]
@@ -499,9 +500,9 @@
 .v_w2_loop:
     pinsrw               m0, [srcq+ssq*1], 1 ; 0 1
     lea                srcq, [srcq+ssq*2]
-    pshuflw              m2, m0, q2301
+    pshuflw              m1, m0, q2301
     pinsrw               m0, [srcq+ssq*0], 0 ; 2 1
-    punpcklbw            m1, m0, m2
+    punpcklbw            m1, m0
     pmaddubsw            m1, m4
     pmulhrsw             m1, m5
     packuswb             m1, m1
@@ -516,11 +517,12 @@
 .v_w4:
     movd                 m0, [srcq+ssq*0]
 .v_w4_loop:
-    movd                 m1, [srcq+ssq*1]
+    movd                 m2, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
-    punpckldq            m2, m0, m1 ; 0 1
+    mova                 m1, m0
     movd                 m0, [srcq+ssq*0]
-    punpckldq            m1, m0  ; 1 2
+    punpckldq            m1, m2 ; 0 1
+    punpckldq            m2, m0 ; 1 2
     punpcklbw            m1, m2
     pmaddubsw            m1, m4
     pmulhrsw             m1, m5
@@ -536,11 +538,12 @@
 .v_w8:
     movq                 m0, [srcq+ssq*0]
 .v_w8_loop:
-    movq                 m3, [srcq+ssq*1]
+    movq                 m2, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
-    punpcklbw            m1, m3, m0
+    mova                 m1, m0
     movq                 m0, [srcq+ssq*0]
-    punpcklbw            m2, m0, m3
+    punpcklbw            m1, m2
+    punpcklbw            m2, m0
     pmaddubsw            m1, m4
     pmaddubsw            m2, m4
     pmulhrsw             m1, m5
@@ -552,66 +555,69 @@
     sub                  hd, 2
     jg .v_w8_loop
     RET
-    ;
 %macro PUT_BILIN_V_W16 0
     movu                 m0, [srcq+ssq*0]
 %%loop:
     movu                 m3, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
-    punpcklbw            m1, m3, m0
-    punpckhbw            m2, m3, m0
+    mova                 m1, m0
+    mova                 m2, m0
     movu                 m0, [srcq+ssq*0]
+    punpcklbw            m1, m3
+    punpckhbw            m2, m3
     pmaddubsw            m1, m4
     pmaddubsw            m2, m4
     pmulhrsw             m1, m5
     pmulhrsw             m2, m5
     packuswb             m1, m2
-    mova       [dstq+dsq*0], m1
-    punpcklbw            m1, m0, m3
-    punpckhbw            m2, m0, m3
-    pmaddubsw            m1, m4
+    punpcklbw            m2, m3, m0
+    punpckhbw            m3, m0
     pmaddubsw            m2, m4
-    pmulhrsw             m1, m5
+    pmaddubsw            m3, m4
     pmulhrsw             m2, m5
-    packuswb             m1, m2
-    mova       [dstq+dsq*1], m1
+    pmulhrsw             m3, m5
+    packuswb             m2, m3
+    mova       [dstq+dsq*0], m1
+    mova       [dstq+dsq*1], m2
     lea                dstq, [dstq+dsq*2]
     sub                  hd, 2
     jg %%loop
 %endmacro
-    ;
 .v_w16:
     PUT_BILIN_V_W16
     RET
+.v_w128:
+    lea                 r6d, [hq+(7<<16)]
+    jmp .v_w16gt
+.v_w64:
+    lea                 r6d, [hq+(3<<16)]
+    jmp .v_w16gt
+.v_w32:
+    lea                 r6d, [hq+(1<<16)]
 .v_w16gt:
-    mov                  r4, dstq
-    mov                  r6, srcq
+    mov                  r4, srcq
+%if ARCH_X86_64
+    mov                  r7, dstq
+%endif
 .v_w16gt_loop:
-%if ARCH_X86_32
-    mov                bakm, t0q
-    RESTORE_DSQ_32       t0
     PUT_BILIN_V_W16
-    mov                 t0q, bakm
+%if ARCH_X86_64
+    add                  r4, 16
+    add                  r7, 16
+    movzx                hd, r6b
+    mov                srcq, r4
+    mov                dstq, r7
 %else
-    PUT_BILIN_V_W16
+    mov                dstq, dstmp
+    add                  r4, 16
+    movzx                hd, r6w
+    add                dstq, 16
+    mov                srcq, r4
+    mov               dstmp, dstq
 %endif
-    mov                  hw, t0w
-    add                  r4, mmsize
-    add                  r6, mmsize
-    mov                dstq, r4
-    mov                srcq, r6
-    sub                 t0d, 1<<16
+    sub                 r6d, 1<<16
     jg .v_w16gt
     RET
-.v_w32:
-    lea                 t0d, [hq+(1<<16)]
-    jmp .v_w16gt
-.v_w64:
-    lea                 t0d, [hq+(3<<16)]
-    jmp .v_w16gt
-.v_w128:
-    lea                 t0d, [hq+(7<<16)]
-    jmp .v_w16gt
 .hv:
     ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
     ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
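A scalar C sketch of the two-pass filter those two comment lines describe (names are illustrative only; the real code keeps 16-scaled intermediates in SIMD registers and, after this patch, rounds with pavgw/psrlw instead of pmulhrsw):

#include <stddef.h>
#include <stdint.h>

/* Scalar model of put_bilin hv (8bpc): the horizontal pass keeps a 4-bit
 * fraction, the vertical pass blends two such rows and rounds with +128 >> 8.
 * In the SIMD code my is applied as a (my << 11) multiplier for pmulhw,
 * since shifting by 12 would overflow a signed 16-bit word (see the
 * "can't shift by 12" comment below). */
static uint8_t bilin_hv_scalar(const uint8_t *src, ptrdiff_t stride,
                               int x, int mx, int my)
{
    const int h0 = (16 - mx) * src[x]          + mx * src[x + 1];
    const int h1 = (16 - mx) * src[stride + x] + mx * src[stride + x + 1];
    return (uint8_t)((16 * h0 + my * (h1 - h0) + 128) >> 8);
}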
@@ -618,32 +624,33 @@
     movzx                wd, word [t0+wq*2+table_offset(put, _bilin_hv)]
     WIN64_SPILL_XMM       8
     shl                mxyd, 11 ; can't shift by 12 due to signed overflow
-    mova                 m7, [base+pw_2048]
+    mova                 m7, [base+pw_15]
     movd                 m6, mxyd
     add                  wq, t0
     pshuflw              m6, m6, q0000
+    paddb                m5, m5
     punpcklqdq           m6, m6
     jmp                  wq
 .hv_w2:
     RESTORE_DSQ_32       t0
     movd                 m0, [srcq+ssq*0]
-    pshufd               m0, m0, q0000      ; src[x - src_stride]
+    punpckldq            m0, m0
     pshufb               m0, m4
     pmaddubsw            m0, m5
 .hv_w2_loop:
-    movd                 m1, [srcq+ssq*1]   ; src[x]
+    movd                 m1, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
-    movhps               m1, [srcq+ssq*0]   ; src[x + src_stride]
-    pshufd               m1, m1, q3120
+    movd                 m2, [srcq+ssq*0]
+    punpckldq            m1, m2
     pshufb               m1, m4
     pmaddubsw            m1, m5             ; 1 _ 2 _
     shufps               m2, m0, m1, q1032  ; 0 _ 1 _
     mova                 m0, m1
-    psubw                m1, m2   ; src[x + src_stride] - src[x]
-    paddw                m1, m1
-    pmulhw               m1, m6   ; (my * (src[x + src_stride] - src[x])
-    paddw                m1, m2   ; src[x] + (my * (src[x + src_stride] - src[x])
-    pmulhrsw             m1, m7
+    psubw                m1, m2   ; 2 * (src[x + src_stride] - src[x])
+    pmulhw               m1, m6   ; (my * (src[x + src_stride] - src[x])) >> 4
+    pavgw                m2, m7   ; src[x] + 8
+    paddw                m1, m2   ; src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8
+    psrlw                m1, 4
     packuswb             m1, m1
 %if ARCH_X86_64
     movq                 r6, m1
@@ -660,8 +667,8 @@
     RET
 .hv_w4:
     mova                 m4, [base+bilin_h_shuf4]
-    RESTORE_DSQ_32       t0
     movddup             xm0, [srcq+ssq*0]
+    movifnidn           dsq, dsmp
     pshufb               m0, m4
     pmaddubsw            m0, m5
 .hv_w4_loop:
@@ -669,14 +676,14 @@
     lea                srcq, [srcq+ssq*2]
     movhps               m1, [srcq+ssq*0]
     pshufb               m1, m4
-    pmaddubsw            m1, m5           ; 1 2
+    pmaddubsw            m1, m5            ; 1 2
     shufps               m2, m0, m1, q1032 ; 0 1
     mova                 m0, m1
     psubw                m1, m2
-    paddw                m1, m1
     pmulhw               m1, m6
+    pavgw                m2, m7
     paddw                m1, m2
-    pmulhrsw             m1, m7
+    psrlw                m1, 4
     packuswb             m1, m1
     movd       [dstq+dsq*0], m1
     psrlq                m1, 32
@@ -686,28 +693,28 @@
     jg .hv_w4_loop
     RET
 .hv_w8:
-    RESTORE_DSQ_32       t0
-    movu                 m0, [srcq+ssq*0+8*0]
+    movu                 m0, [srcq+ssq*0]
+    movifnidn           dsq, dsmp
     pshufb               m0, m4
     pmaddubsw            m0, m5
 .hv_w8_loop:
-    movu                 m2, [srcq+ssq*1+8*0]
+    movu                 m2, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
     pshufb               m2, m4
     pmaddubsw            m2, m5
     psubw                m1, m2, m0
-    paddw                m1, m1
     pmulhw               m1, m6
+    pavgw                m0, m7
     paddw                m1, m0
-    movu                 m0, [srcq+ssq*0+8*0]
+    movu                 m0, [srcq+ssq*0]
     pshufb               m0, m4
     pmaddubsw            m0, m5
     psubw                m3, m0, m2
-    paddw                m3, m3
     pmulhw               m3, m6
+    pavgw                m2, m7
     paddw                m3, m2
-    pmulhrsw             m1, m7
-    pmulhrsw             m3, m7
+    psrlw                m1, 4
+    psrlw                m3, 4
     packuswb             m1, m3
     movq       [dstq+dsq*0], m1
     movhps     [dstq+dsq*1], m1
@@ -715,27 +722,34 @@
     sub                  hd, 2
     jg .hv_w8_loop
     RET
+.hv_w128:
+    lea                 r6d, [hq+(7<<16)]
+    jmp .hv_w16_start
+.hv_w64:
+    lea                 r6d, [hq+(3<<16)]
+    jmp .hv_w16_start
+.hv_w32:
+    lea                 r6d, [hq+(1<<16)]
+.hv_w16_start:
+    mov                  r4, srcq
+%if ARCH_X86_32
+    %define m8 [dstq]
+%else
+    mov                  r7, dstq
+%endif
 .hv_w16:
-    xor                 t0d, t0d
-.hv_w16gt:
-    mov                  r4, dstq
-    mov                  r6, srcq
- %if WIN64
-    movaps              r4m, xmm8
- %endif
+    movifnidn           dsq, dsmp
+%if WIN64
+    movaps              r4m, m8
+%endif
 .hv_w16_loop0:
-    movu                 m0,     [srcq+8*0]
-    movu                 m1,     [srcq+8*1]
+    movu                 m0, [srcq+8*0]
+    movu                 m1, [srcq+8*1]
     pshufb               m0, m4
     pshufb               m1, m4
     pmaddubsw            m0, m5
     pmaddubsw            m1, m5
 .hv_w16_loop:
-%if ARCH_X86_32
- %define m0tmp [dstq]
-%else
- %define m0tmp m8
-%endif
     add                srcq, ssq
     movu                 m2, [srcq+8*0]
     movu                 m3, [srcq+8*1]
@@ -743,62 +757,51 @@
     pshufb               m3, m4
     pmaddubsw            m2, m5
     pmaddubsw            m3, m5
-    mova              m0tmp, m2
+    mova                 m8, m2
     psubw                m2, m0
-    paddw                m2, m2
     pmulhw               m2, m6
+    pavgw                m0, m7
     paddw                m2, m0
     mova                 m0, m3
     psubw                m3, m1
-    paddw                m3, m3
     pmulhw               m3, m6
+    pavgw                m1, m7
     paddw                m3, m1
     mova                 m1, m0
-    mova                 m0, m0tmp
-    pmulhrsw             m2, m7
-    pmulhrsw             m3, m7
+    mova                 m0, m8
+    psrlw                m2, 4
+    psrlw                m3, 4
     packuswb             m2, m3
     mova             [dstq], m2
     add                dstq, dsmp
     dec                  hd
     jg .hv_w16_loop
-    movzx                hd, t0w
-    add                  r4, mmsize
-    add                  r6, mmsize
-    mov                dstq, r4
-    mov                srcq, r6
-    sub                 t0d, 1<<16
-    jg .hv_w16_loop0
- %if WIN64
-    movaps             xmm8, r4m
- %endif
+%if ARCH_X86_32
+    mov                dstq, dstm
+    add                  r4, 16
+    movzx                hd, r6w
+    add                dstq, 16
+    mov                srcq, r4
+    mov                dstm, dstq
+%else
+    add                  r4, 16
+    add                  r7, 16
+    movzx                hd, r6b
+    mov                srcq, r4
+    mov                dstq, r7
+%endif
+    sub                 r6d, 1<<16
+    jg .hv_w16_loop0
+%if WIN64
+    movaps               m8, r4m
+%endif
     RET
-.hv_w32:
-    lea                 t0d, [hq+(1<<16)]
-    jmp .hv_w16gt
-.hv_w64:
-    lea                 t0d, [hq+(3<<16)]
-    jmp .hv_w16gt
-.hv_w128:
-    lea                 t0d, [hq+(7<<16)]
-    jmp .hv_w16gt
 
-%macro PSHUFB_0X1X 1-2 ; dst[, src]
- %if cpuflag(ssse3)
-    pshufb               %1, %2
- %else
-    punpcklbw            %1, %1
-    psraw                %1, 8
-    pshufd               %1, %1, q0000
- %endif
-%endmacro
-
 %macro PSHUFB_BILIN_H8 2 ; dst, src
  %if cpuflag(ssse3)
     pshufb               %1, %2
  %else
-    mova                 %2, %1
-    psrldq               %1, 1
+    psrldq               %2, %1, 1
     punpcklbw            %1, %2
  %endif
 %endmacro
@@ -807,8 +810,7 @@
  %if cpuflag(ssse3)
     pshufb               %1, %2
  %else
-    mova                 %2, %1
-    psrldq               %1, 1
+    psrldq               %2, %1, 1
     punpckhbw            %3, %1, %2
     punpcklbw            %1, %2
     punpcklqdq           %1, %3
@@ -845,17 +847,15 @@
 %endmacro
 
 %macro PREP_BILIN 0
-
-DECLARE_REG_TMP 3, 5, 6
 %if ARCH_X86_32
- %define base        t2-prep%+SUFFIX
+    %define base r6-prep%+SUFFIX
 %else
- %define base        0
+    %define base 0
 %endif
 
 cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
     movifnidn          mxyd, r5m ; mx
-    LEA                  t2, prep%+SUFFIX
+    LEA                  r6, prep%+SUFFIX
     tzcnt                wd, wm
     movifnidn            hd, hm
     test               mxyd, mxyd
@@ -865,11 +865,12 @@
     jnz .v
 .prep:
 %if notcpuflag(ssse3)
-    add                  t2, prep_ssse3 - prep_sse2
+    add                  r6, prep_ssse3 - prep_sse2
     jmp prep_ssse3
 %else
-    movzx                wd, word [t2+wq*2+table_offset(prep,)]
-    add                  wq, t2
+    movzx                wd, word [r6+wq*2+table_offset(prep,)]
+    pxor                 m4, m4
+    add                  wq, r6
     lea            stride3q, [strideq*3]
     jmp                  wq
 .prep_w4:
@@ -877,17 +878,16 @@
     movd                 m1, [srcq+strideq*1]
     movd                 m2, [srcq+strideq*2]
     movd                 m3, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
     punpckldq            m0, m1
     punpckldq            m2, m3
-    lea                srcq, [srcq+strideq*4]
-    pxor                 m1, m1
-    punpcklbw            m0, m1
-    punpcklbw            m2, m1
+    punpcklbw            m0, m4
+    punpcklbw            m2, m4
     psllw                m0, 4
     psllw                m2, 4
-    mova    [tmpq+mmsize*0], m0
-    mova    [tmpq+mmsize*1], m2
-    add                tmpq, 32
+    mova        [tmpq+16*0], m0
+    mova        [tmpq+16*1], m2
+    add                tmpq, 16*2
     sub                  hd, 4
     jg .prep_w4
     RET
@@ -897,7 +897,6 @@
     movq                 m2, [srcq+strideq*2]
     movq                 m3, [srcq+stride3q ]
     lea                srcq, [srcq+strideq*4]
-    pxor                 m4, m4
     punpcklbw            m0, m4
     punpcklbw            m1, m4
     punpcklbw            m2, m4
@@ -915,16 +914,13 @@
     jg .prep_w8
     RET
 .prep_w16:
-    movq                 m0, [srcq+strideq*0+8*0]
-    movq                 m1, [srcq+strideq*0+8*1]
-    movq                 m2, [srcq+strideq*1+8*0]
-    movq                 m3, [srcq+strideq*1+8*1]
+    movu                 m1, [srcq+strideq*0]
+    movu                 m3, [srcq+strideq*1]
     lea                srcq, [srcq+strideq*2]
-    pxor                 m4, m4
-    punpcklbw            m0, m4
-    punpcklbw            m1, m4
-    punpcklbw            m2, m4
-    punpcklbw            m3, m4
+    punpcklbw            m0, m1, m4
+    punpckhbw            m1, m4
+    punpcklbw            m2, m3, m4
+    punpckhbw            m3, m4
     psllw                m0, 4
     psllw                m1, 4
     psllw                m2, 4
@@ -937,27 +933,25 @@
     sub                  hd, 2
     jg .prep_w16
     RET
-.prep_w32:
-    mov                 t2d, 1
-    jmp .prep_w32_vloop
-.prep_w64:
-    mov                 t2d, 2
-    jmp .prep_w32_vloop
 .prep_w128:
-    mov                 t2d, 4
+    mov                  r3, -128
+    jmp .prep_w32_start
+.prep_w64:
+    mov                  r3, -64
+    jmp .prep_w32_start
+.prep_w32:
+    mov                  r3, -32
+.prep_w32_start:
+    sub                srcq, r3
 .prep_w32_vloop:
-    mov                 t1q, srcq
-    mov                 r3d, t2d
+    mov                  r6, r3
 .prep_w32_hloop:
-    movq                 m0, [t1q+8*0]
-    movq                 m1, [t1q+8*1]
-    movq                 m2, [t1q+8*2]
-    movq                 m3, [t1q+8*3]
-    pxor                 m4, m4
-    punpcklbw            m0, m4
-    punpcklbw            m1, m4
-    punpcklbw            m2, m4
-    punpcklbw            m3, m4
+    movu                 m1, [srcq+r6+16*0]
+    movu                 m3, [srcq+r6+16*1]
+    punpcklbw            m0, m1, m4
+    punpckhbw            m1, m4
+    punpcklbw            m2, m3, m4
+    punpckhbw            m3, m4
     psllw                m0, 4
     psllw                m1, 4
     psllw                m2, 4
@@ -967,10 +961,9 @@
     mova        [tmpq+16*2], m2
     mova        [tmpq+16*3], m3
     add                tmpq, 16*4
-    add                 t1q, 32
-    dec                 r3d
-    jg .prep_w32_hloop
-    lea                srcq, [srcq+strideq]
+    add                  r6, 32
+    jl .prep_w32_hloop
+    add                srcq, strideq
     dec                  hd
     jg .prep_w32_vloop
     RET
@@ -978,40 +971,31 @@
 .h:
     ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
     ; = (16 - mx) * src[x] + mx * src[x + 1]
-    imul               mxyd, 0xff01
 %if cpuflag(ssse3)
+    imul               mxyd, 0x00ff00ff
     mova                 m4, [base+bilin_h_shuf8]
+    add                mxyd, 0x00100010
+%else
+    imul               mxyd, 0xffff
+    add                mxyd, 16
 %endif
-    add                mxyd, 16 << 8
     movd                 m5, mxyd
     mov                mxyd, r6m ; my
-%if cpuflag(ssse3)
-    pshuflw              m5, m5, q0000
-    punpcklqdq           m5, m5
-%else
-    PSHUFB_0X1X          m5
-%endif
+    pshufd               m5, m5, q0000
     test               mxyd, mxyd
     jnz .hv
-%if ARCH_X86_32
-    mov                  t1, t2 ; save base reg for w4
-%endif
-    movzx                wd, word [t2+wq*2+table_offset(prep, _bilin_h)]
+    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
 %if notcpuflag(ssse3)
     WIN64_SPILL_XMM 8
     pxor                 m6, m6
 %endif
-    add                  wq, t2
-    lea            stride3q, [strideq*3]
+    add                  wq, r6
     jmp                  wq
 .h_w4:
 %if cpuflag(ssse3)
- %if ARCH_X86_32
-    mova                 m4, [t1-prep_ssse3+bilin_h_shuf4]
- %else
-    mova                 m4, [bilin_h_shuf4]
- %endif
+    mova                 m4, [base+bilin_h_shuf4]
 %endif
+    lea            stride3q, [strideq*3]
 .h_w4_loop:
     movq                 m0, [srcq+strideq*0]
     movhps               m0, [srcq+strideq*1]
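The prep_bilin horizontal path above (see the formula comments at the top of the .h block) stores unrounded, 16-scaled intermediates rather than pixels; a minimal scalar sketch, with an illustrative helper name:

#include <stdint.h>

/* Scalar model of prep_bilin's horizontal pass (8bpc): no rounding or
 * downshift here, the 16-scaled value is kept as a signed 16-bit
 * intermediate. With no filtering the plain .prep_w* paths above reduce to
 * tmp[x] = src[x] << 4 (the psllw 4 stores). */
static inline int16_t prep_bilin_h_scalar(const uint8_t *src, int x, int mx)
{
    return (int16_t)((16 - mx) * src[x] + mx * src[x + 1]);
}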
@@ -1029,6 +1013,8 @@
     jg .h_w4_loop
     RET
 .h_w8:
+    lea            stride3q, [strideq*3]
+.h_w8_loop:
     movu                 m0, [srcq+strideq*0]
     movu                 m1, [srcq+strideq*1]
     movu                 m2, [srcq+strideq*2]
@@ -1048,7 +1034,7 @@
     mova        [tmpq+16*3], m3
     add                tmpq, 16*4
     sub                  hd, 4
-    jg .h_w8
+    jg .h_w8_loop
     RET
 .h_w16:
     movu                 m0, [srcq+strideq*0+8*0]
@@ -1072,22 +1058,23 @@
     sub                  hd, 2
     jg .h_w16
     RET
-.h_w32:
-    mov                 t2d, 1 << 0
-    jmp .h_w32_vloop
-.h_w64:
-    mov                 t2d, 1 << 1
-    jmp .h_w32_vloop
 .h_w128:
-    mov                 t2d, 1 << 3
+    mov                  r3, -128
+    jmp .h_w32_start
+.h_w64:
+    mov                  r3, -64
+    jmp .h_w32_start
+.h_w32:
+    mov                  r3, -32
+.h_w32_start:
+    sub                srcq, r3
 .h_w32_vloop:
-    mov                 t1q, srcq
-    mov                 r3d, t2d
+    mov                  r6, r3
 .h_w32_hloop:
-    movu                 m0, [t1q+8*0]
-    movu                 m1, [t1q+8*1]
-    movu                 m2, [t1q+8*2]
-    movu                 m3, [t1q+8*3]
+    movu                 m0, [srcq+r6+8*0]
+    movu                 m1, [srcq+r6+8*1]
+    movu                 m2, [srcq+r6+8*2]
+    movu                 m3, [srcq+r6+8*3]
     PSHUFB_BILIN_H8      m0, m4
     PSHUFB_BILIN_H8      m1, m4
     PSHUFB_BILIN_H8      m2, m4
@@ -1101,11 +1088,10 @@
     mova        [tmpq+16*2], m2
     mova        [tmpq+16*3], m3
     add                tmpq, 16*4
-    add                 t1q, 32
-    shr                 r3d, 1
-    jnz .h_w32_hloop
-    lea                srcq, [srcq+strideq]
-    sub                  hd, 1
+    add                  r6, 32
+    jl .h_w32_hloop
+    add                srcq, strideq
+    dec                  hd
     jg .h_w32_vloop
     RET
 .v:
@@ -1113,19 +1099,19 @@
  %assign stack_offset stack_offset - stack_size_padded
     WIN64_SPILL_XMM 8
 %endif
-    movzx                wd, word [t2+wq*2+table_offset(prep, _bilin_v)]
-    imul               mxyd, 0xff01
-    add                mxyd, 16 << 8
-    add                  wq, t2
-    lea            stride3q, [strideq*3]
-    movd                 m5, mxyd
+    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
 %if cpuflag(ssse3)
-    pshuflw              m5, m5, q0000
-    punpcklqdq           m5, m5
+    imul               mxyd, 0x00ff00ff
+    add                mxyd, 0x00100010
 %else
-    PSHUFB_0X1X          m5
+    imul               mxyd, 0xffff
     pxor                 m6, m6
+    add                mxyd, 16
 %endif
+    add                  wq, r6
+    lea            stride3q, [strideq*3]
+    movd                 m5, mxyd
+    pshufd               m5, m5, q0000
     jmp                  wq
 .v_w4:
     movd                 m0, [srcq+strideq*0]
@@ -1134,20 +1120,18 @@
     movd                 m2, [srcq+strideq*2]
     movd                 m3, [srcq+stride3q ]
     lea                srcq, [srcq+strideq*4]
-    punpcklwd            m0, m1  ; 0 1 _ _
-    punpcklwd            m1, m2  ; 1 2 _ _
-    punpcklbw            m1, m0
-    PMADDUBSW            m1, m5, m6, m7, 0
-    pshufd               m1, m1, q3120
-    mova        [tmpq+16*0], m1
+    punpckldq            m0, m1
+    punpckldq            m1, m2
+    punpcklbw            m0, m1 ; 01 12
+    PMADDUBSW            m0, m5, m6, m7, 0
+    mova        [tmpq+16*0], m0
     movd                 m0, [srcq+strideq*0]
-    punpcklwd            m2, m3  ; 2 3 _ _
-    punpcklwd            m3, m0  ; 3 4 _ _
-    punpcklbw            m3, m2
-    PMADDUBSW            m3, m5, m6, m7, 0
-    pshufd               m3, m3, q3120
-    mova        [tmpq+16*1], m3
-    add                tmpq, 32
+    punpckldq            m2, m3
+    punpckldq            m3, m0
+    punpcklbw            m2, m3 ; 23 34
+    PMADDUBSW            m2, m5, m6, m7, 0
+    mova        [tmpq+16*1], m2
+    add                tmpq, 16*2
     sub                  hd, 4
     jg .v_w4_loop
     RET
@@ -1154,26 +1138,23 @@
 .v_w8:
     movq                 m0, [srcq+strideq*0]
 .v_w8_loop:
-    movq                 m1, [srcq+strideq*2]
-    movq                 m2, [srcq+strideq*1]
+    movq                 m1, [srcq+strideq*1]
+    movq                 m2, [srcq+strideq*2]
     movq                 m3, [srcq+stride3q ]
     lea                srcq, [srcq+strideq*4]
-    shufpd               m4, m0, m1, 0x0c       ; 0 2
+    punpcklbw            m0, m1 ; 01
+    punpcklbw            m1, m2 ; 12
+    PMADDUBSW            m0, m5, m6, m7, 0
+    PMADDUBSW            m1, m5, m6, m7, 0
+    mova        [tmpq+16*0], m0
     movq                 m0, [srcq+strideq*0]
-    shufpd               m2, m3, 0x0c           ; 1 3
-    shufpd               m1, m0, 0x0c           ; 2 4
-    punpcklbw            m3, m2, m4
+    punpcklbw            m2, m3 ; 23
+    punpcklbw            m3, m0 ; 34
+    PMADDUBSW            m2, m5, m6, m7, 0
+    mova        [tmpq+16*1], m1
     PMADDUBSW            m3, m5, m6, m7, 0
-    mova        [tmpq+16*0], m3
-    punpckhbw            m3, m2, m4
-    PMADDUBSW            m3, m5, m6, m7, 0
-    mova        [tmpq+16*2], m3
-    punpcklbw            m3, m1, m2
-    punpckhbw            m1, m2
-    PMADDUBSW            m3, m5, m6, m7, 0
-    PMADDUBSW            m1, m5, m6, m7, 0
-    mova        [tmpq+16*1], m3
-    mova        [tmpq+16*3], m1
+    mova        [tmpq+16*2], m2
+    mova        [tmpq+16*3], m3
     add                tmpq, 16*4
     sub                  hd, 4
     jg .v_w8_loop
@@ -1183,48 +1164,48 @@
 .v_w16_loop:
     movu                 m1, [srcq+strideq*1]
     movu                 m2, [srcq+strideq*2]
-    punpcklbw            m3, m1, m0
-    punpckhbw            m4, m1, m0
-    PMADDUBSW            m3, m5, m6, m7, 0
-    PMADDUBSW            m4, m5, m6, m7, 0
-    mova        [tmpq+16*0], m3
-    mova        [tmpq+16*1], m4
-    punpcklbw            m3, m2, m1
-    punpckhbw            m4, m2, m1
-    PMADDUBSW            m3, m5, m6, m7, 0
-    PMADDUBSW            m4, m5, m6, m7, 0
-    mova        [tmpq+16*2], m3
-    mova        [tmpq+16*3], m4
     movu                 m3, [srcq+stride3q ]
     lea                srcq, [srcq+strideq*4]
+    punpcklbw            m4, m0, m1
+    punpckhbw            m0, m1
+    PMADDUBSW            m4, m5, m6, m7, 0
+    PMADDUBSW            m0, m5, m6, m7, 0
+    mova        [tmpq+16*0], m4
+    punpcklbw            m4, m1, m2
+    punpckhbw            m1, m2
+    PMADDUBSW            m4, m5, m6, m7, 0
+    mova        [tmpq+16*1], m0
     movu                 m0, [srcq+strideq*0]
-    add                tmpq, 16*8
-    punpcklbw            m1, m3, m2
-    punpckhbw            m4, m3, m2
     PMADDUBSW            m1, m5, m6, m7, 0
+    mova        [tmpq+16*2], m4
+    punpcklbw            m4, m2, m3
+    punpckhbw            m2, m3
     PMADDUBSW            m4, m5, m6, m7, 0
-    mova        [tmpq-16*4], m1
-    mova        [tmpq-16*3], m4
-    punpcklbw            m1, m0, m3
-    punpckhbw            m2, m0, m3
-    PMADDUBSW            m1, m5, m6, m7, 0
+    mova        [tmpq+16*3], m1
     PMADDUBSW            m2, m5, m6, m7, 0
-    mova        [tmpq-16*2], m1
-    mova        [tmpq-16*1], m2
+    mova        [tmpq+16*4], m4
+    punpcklbw            m4, m3, m0
+    punpckhbw            m3, m0
+    PMADDUBSW            m4, m5, m6, m7, 0
+    mova        [tmpq+16*5], m2
+    PMADDUBSW            m3, m5, m6, m7, 0
+    mova        [tmpq+16*6], m4
+    mova        [tmpq+16*7], m3
+    add                tmpq, 16*8
     sub                  hd, 4
     jg .v_w16_loop
     RET
-.v_w32:
-    lea                 t2d, [hq+(0<<16)]
-    mov                 t0d, 64
+.v_w128:
+    lea                 r3d, [hq+(3<<8)]
+    mov                 r6d, 256
     jmp .v_w32_start
 .v_w64:
-    lea                 t2d, [hq+(1<<16)]
-    mov                 t0d, 128
+    lea                 r3d, [hq+(1<<8)]
+    mov                 r6d, 128
     jmp .v_w32_start
-.v_w128:
-    lea                 t2d, [hq+(3<<16)]
-    mov                 t0d, 256
+.v_w32:
+    xor                 r3d, r3d
+    mov                 r6d, 64
 .v_w32_start:
 %if ARCH_X86_64
  %if WIN64
@@ -1232,7 +1213,7 @@
  %endif
     mov                  r7, tmpq
 %endif
-    mov                  t1, srcq
+    mov                  r5, srcq
 .v_w32_hloop:
     movu                 m0, [srcq+strideq*0+16*0]
     movu                 m1, [srcq+strideq*0+16*1]
@@ -1240,48 +1221,48 @@
     movu                 m2, [srcq+strideq*1+16*0]
     movu                 m3, [srcq+strideq*1+16*1]
     lea                srcq, [srcq+strideq*2]
-    punpcklbw            m4, m2, m0
+    punpcklbw            m4, m0, m2
+    punpckhbw            m0, m2
     PMADDUBSW            m4, m5, m6, m7, 0
+    PMADDUBSW            m0, m5, m6, m7, 0
     mova        [tmpq+16*0], m4
-    punpckhbw            m4, m2, m0
+    mova        [tmpq+16*1], m0
+    movu                 m0, [srcq+strideq*0+16*0]
+    punpcklbw            m4, m1, m3
+    punpckhbw            m1, m3
     PMADDUBSW            m4, m5, m6, m7, 0
-    mova        [tmpq+16*1], m4
-    punpcklbw            m4, m3, m1
-    PMADDUBSW            m4, m5, m6, m7, 0
+    PMADDUBSW            m1, m5, m6, m7, 0
     mova        [tmpq+16*2], m4
-    punpckhbw            m4, m3, m1
-    PMADDUBSW            m4, m5, m6, m7, 0
-    mova        [tmpq+16*3], m4
-    add                tmpq, t0q
-    movu                 m0, [srcq+strideq*0+16*0]
+    mova        [tmpq+16*3], m1
     movu                 m1, [srcq+strideq*0+16*1]
-    punpcklbw            m4, m0, m2
+    add                tmpq, r6
+    punpcklbw            m4, m2, m0
+    punpckhbw            m2, m0
     PMADDUBSW            m4, m5, m6, m7, 0
+    PMADDUBSW            m2, m5, m6, m7, 0
     mova        [tmpq+16*0], m4
-    punpckhbw            m4, m0, m2
+    mova        [tmpq+16*1], m2
+    punpcklbw            m4, m3, m1
+    punpckhbw            m3, m1
     PMADDUBSW            m4, m5, m6, m7, 0
-    mova        [tmpq+16*1], m4
-    punpcklbw            m4, m1, m3
-    PMADDUBSW            m4, m5, m6, m7, 0
+    PMADDUBSW            m3, m5, m6, m7, 0
     mova        [tmpq+16*2], m4
-    punpckhbw            m4, m1, m3
-    PMADDUBSW            m4, m5, m6, m7, 0
-    mova        [tmpq+16*3], m4
-    add                tmpq, t0q
+    mova        [tmpq+16*3], m3
+    add                tmpq, r6
     sub                  hd, 2
     jg .v_w32_vloop
-    movzx                hd, t2w
-    add                  t1, 32
-    mov                srcq, t1
+    add                  r5, 32
+    movzx                hd, r3b
+    mov                srcq, r5
 %if ARCH_X86_64
-    add                  r7, 2*16*2
+    add                  r7, 16*4
     mov                tmpq, r7
 %else
     mov                tmpq, tmpmp
-    add                tmpq, 2*16*2
+    add                tmpq, 16*4
     mov               tmpmp, tmpq
 %endif
-    sub                 t2d, 1<<16
+    sub                 r3d, 1<<8
     jg .v_w32_hloop
 %if WIN64
     POP                  r7
@@ -1290,71 +1271,56 @@
 .hv:
     ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
     ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
+    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
 %assign stack_offset stack_offset - stack_size_padded
 %if cpuflag(ssse3)
+    imul               mxyd, 0x08000800
     WIN64_SPILL_XMM 8
 %else
-    WIN64_SPILL_XMM 10
-%endif
-    movzx                wd, word [t2+wq*2+table_offset(prep, _bilin_hv)]
-%if cpuflag(ssse3)
-    shl                mxyd, 11
-%else
+    or                 mxyd, 1<<16
+    WIN64_SPILL_XMM 9
  %if ARCH_X86_64
-    mova                 m8, [pw_8]
+    mova                 m8, [base+pw_8]
  %else
-  %define m8 [t1-prep_sse2+pw_8]
+  %define                m8  [base+pw_8]
  %endif
     pxor                 m7, m7
 %endif
     movd                 m6, mxyd
-    add                  wq, t2
-    pshuflw              m6, m6, q0000
-%if cpuflag(ssse3)
-    punpcklqdq           m6, m6
-%elif ARCH_X86_64
-    psrlw                m0, m8, 3
-    punpcklwd            m6, m0
-%else
-    punpcklwd            m6, [base+pw_1]
-%endif
-%if ARCH_X86_32
-    mov                  t1, t2 ; save base reg for w4
-%endif
-    lea            stride3q, [strideq*3]
+    add                  wq, r6
+    pshufd               m6, m6, q0000
     jmp                  wq
 .hv_w4:
 %if cpuflag(ssse3)
- %if ARCH_X86_32
-    mova                 m4, [t1-prep_ssse3+bilin_h_shuf4]
- %else
-    mova                 m4, [bilin_h_shuf4]
- %endif
-%endif
+    mova                 m4, [base+bilin_h_shuf4]
+    movddup              m0, [srcq+strideq*0]
+%else
     movhps               m0, [srcq+strideq*0]
+%endif
+    lea                  r3, [strideq*3]
     PSHUFB_BILIN_H4      m0, m4, m3
     PMADDUBSW            m0, m5, m7, m4, 0 ; _ 0
 .hv_w4_loop:
     movq                 m1, [srcq+strideq*1]
     movhps               m1, [srcq+strideq*2]
-    movq                 m2, [srcq+stride3q ]
+    movq                 m2, [srcq+r3       ]
     lea                srcq, [srcq+strideq*4]
     movhps               m2, [srcq+strideq*0]
     PSHUFB_BILIN_H4      m1, m4, m3
     PSHUFB_BILIN_H4      m2, m4, m3
     PMADDUBSW            m1, m5, m7, m4, 0 ; 1 2
-    shufpd               m3, m0, m1, 0x01  ; 0 1
-    mova                 m0, m2
-    PMADDUBSW            m0, m5, m7, m4, 0 ; 3 4
-    shufpd               m2, m1, m0, 0x01  ; 2 3
-    psubw                m1, m3
+    PMADDUBSW            m2, m5, m7, m4, 0 ; 3 4
+    shufpd               m0, m1, 0x01      ; 0 1
+    shufpd               m3, m1, m2, 0x01  ; 2 3
+    psubw                m1, m0
     PMULHRSW             m1, m6, m4, m8, 4
-    paddw                m1, m3
-    psubw                m3, m0, m2
-    PMULHRSW             m3, m6, m4, m8, 4
-    paddw                m3, m2
+    paddw                m1, m0
+    mova                 m0, m2
+    psubw                m2, m3
+    PMULHRSW             m2, m6, m4, m8, 4
+    paddw                m2, m3
     mova        [tmpq+16*0], m1
-    mova        [tmpq+16*1], m3
+    mova        [tmpq+16*1], m2
     add                tmpq, 32
     sub                  hd, 4
     jg .hv_w4_loop
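For the prep hv path, the comments above the .hv label give the vertical rounding applied to those 16-scaled horizontal intermediates; a scalar sketch (illustrative name, not dav1d API):

#include <stdint.h>

/* Scalar model of prep_bilin hv (8bpc), per the comments above: h0/h1 are
 * the 16-scaled horizontal intermediates of two vertically adjacent rows,
 * and the result stays a signed 16-bit intermediate for the compound path. */
static inline int16_t prep_bilin_hv_scalar(int h0, int h1, int my)
{
    return (int16_t)(h0 + ((my * (h1 - h0) + 8) >> 4));
}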
@@ -1365,7 +1331,8 @@
     PMADDUBSW            m0, m5, m7, m4, 0 ; 0
 .hv_w8_loop:
     movu                 m1, [srcq+strideq*1]
-    movu                 m2, [srcq+strideq*2]
+    lea                srcq, [srcq+strideq*2]
+    movu                 m2, [srcq+strideq*0]
     PSHUFB_BILIN_H8      m1, m4
     PSHUFB_BILIN_H8      m2, m4
     PMADDUBSW            m1, m5, m7, m4, 0 ; 1
@@ -1373,68 +1340,40 @@
     psubw                m3, m1, m0
     PMULHRSW             m3, m6, m4, m8, 4
     paddw                m3, m0
-%if notcpuflag(ssse3) && ARCH_X86_64
-    SWAP                 m9, m7
-%endif
-    psubw                m7, m2, m1
-    PMULHRSW             m7, m6, m4, m8, 4
-    paddw                m7, m1
+    mova                 m0, m2
+    psubw                m2, m1
+    PMULHRSW             m2, m6, m4, m8, 4
+    paddw                m2, m1
     mova        [tmpq+16*0], m3
-    mova        [tmpq+16*1], m7
-%if notcpuflag(ssse3) && ARCH_X86_64
-    SWAP                 m7, m9
-%endif
-    movu                 m1, [srcq+stride3q ]
-    lea                srcq, [srcq+strideq*4]
-    movu                 m0, [srcq+strideq*0]
-    PSHUFB_BILIN_H8      m1, m4
-    PSHUFB_BILIN_H8      m0, m4
-    PMADDUBSW            m1, m5, m7, m4, ARCH_X86_32 ; 3
-    PMADDUBSW            m0, m5, m7, m4, 0           ; 4
-    psubw                m3, m1, m2
-    PMULHRSW             m3, m6, m4, m8, 4
-    paddw                m3, m2
-%if notcpuflag(ssse3) && ARCH_X86_64
-    SWAP                 m9, m7
-%endif
-    psubw                m7, m0, m1
-    PMULHRSW             m7, m6, m4, m8, 4
-    paddw                m7, m1
-    mova        [tmpq+16*2], m3
-    mova        [tmpq+16*3], m7
-%if notcpuflag(ssse3)
- %if ARCH_X86_64
-    SWAP                 m7, m9
- %else
-    pxor                 m7, m7
- %endif
-%endif
-    add                tmpq, 16*4
-    sub                  hd, 4
+    mova        [tmpq+16*1], m2
+    add                tmpq, 16*2
+    sub                  hd, 2
     jg .hv_w8_loop
     RET
-.hv_w16:
-    mov                 t2d, hd
-    mov                 t0d, 32
+.hv_w128:
+    lea                 r3d, [hq+(7<<8)]
+    mov                 r5d, 256
     jmp .hv_w16_start
-.hv_w32:
-    lea                 t2d, [hq+(1<<16)]
-    mov                 t0d, 64
-    jmp .hv_w16_start
 .hv_w64:
-    lea                 t2d, [hq+(3<<16)]
-    mov                 t0d, 128
+    lea                 r3d, [hq+(3<<8)]
+    mov                 r5d, 128
     jmp .hv_w16_start
-.hv_w128:
-    lea                 t2d, [hq+(7<<16)]
-    mov                 t0d, 256
+.hv_w32:
+    lea                 r3d, [hq+(1<<8)]
+    mov                 r5d, 64
+    jmp .hv_w16_start
+.hv_w16:
+    xor                 r3d, r3d
+    mov                 r5d, 32
 .hv_w16_start:
+%if ARCH_X86_64 || cpuflag(ssse3)
+    mov                  r6, srcq
+%endif
 %if ARCH_X86_64
  %if WIN64
     PUSH                 r7
  %endif
     mov                  r7, tmpq
-    mov                  r5, srcq
 %endif
 .hv_w16_hloop:
     movu                 m0, [srcq+strideq*0+8*0]
@@ -1459,7 +1398,7 @@
     PMULHRSW             m0, m6, m4, m8, 4
     paddw                m0, m1
     mova        [tmpq+16*1], m0
-    add                tmpq, t0q
+    add                tmpq, r5
     movu                 m0, [srcq+strideq*0+8*0]
     PSHUFB_BILIN_H8      m0, m4
     PMADDUBSW            m0, m5, m7, m4, 0 ; 2a
@@ -1474,24 +1413,30 @@
     PMULHRSW             m2, m6, m4, m8, 4
     paddw                m2, m3
     mova        [tmpq+16*1], m2
-    add                tmpq, t0q
+    add                tmpq, r5
     sub                  hd, 2
     jg .hv_w16_vloop
-    movzx                hd, t2w
+    movzx                hd, r3b
 %if ARCH_X86_64
-    add                  r5, 16
+    add                  r6, 16
     add                  r7, 2*16
-    mov                srcq, r5
+    mov                srcq, r6
     mov                tmpq, r7
+%elif cpuflag(ssse3)
+    mov                tmpq, tmpm
+    add                  r6, 16
+    add                tmpq, 2*16
+    mov                srcq, r6
+    mov                tmpm, tmpq
 %else
-    mov                srcq, srcmp
-    mov                tmpq, tmpmp
+    mov                srcq, srcm
+    mov                tmpq, tmpm
     add                srcq, 16
     add                tmpq, 2*16
-    mov               srcmp, srcq
-    mov               tmpmp, tmpq
+    mov                srcm, srcq
+    mov                tmpm, tmpq
 %endif
-    sub                 t2d, 1<<16
+    sub                 r3d, 1<<8
     jg .hv_w16_hloop
 %if WIN64
     POP                  r7
@@ -1538,13 +1483,9 @@
 %if ARCH_X86_32
  %define base_reg r1
  %define base base_reg-put_ssse3
- %define W32_RESTORE_DSQ mov dsq, dsm
- %define W32_RESTORE_SSQ mov ssq, ssm
 %else
  %define base_reg r8
  %define base 0
- %define W32_RESTORE_DSQ
- %define W32_RESTORE_SSQ
 %endif
 
 cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
@@ -1575,10 +1516,9 @@
     add                  wq, base_reg
 ; put_bilin mangling jump
 %assign stack_offset org_stack_offset
-%if ARCH_X86_32
-    mov                 dsq, dsm
-    mov                 ssq, ssm
-%elif WIN64
+    movifnidn           dsq, dsmp
+    movifnidn           ssq, ssmp
+%if WIN64
     pop                  r8
 %endif
     lea                  r6, [ssq*3]
@@ -1590,7 +1530,7 @@
     test                myd, 0xf00
 %endif
     jnz .hv
-    W32_RESTORE_SSQ
+    movifnidn           ssq, ssmp
     WIN64_SPILL_XMM      12
     cmp                  wd, 4
     jl .h_w2
@@ -1604,11 +1544,10 @@
     shr                 mxd, 16
     sub                srcq, 3
     movzx                wd, word [base_reg+wq*2+table_offset(put, _8tap_h)]
-    movd                 m5, [base_reg+mxq*8+subpel_filters-put_ssse3+0]
-    pshufd               m5, m5, q0000
-    movd                 m6, [base_reg+mxq*8+subpel_filters-put_ssse3+4]
-    pshufd               m6, m6, q0000
+    movq                 m6, [base_reg+mxq*8+subpel_filters-put_ssse3]
     mova                 m7, [base+pw_34] ; 2 + (8 << 2)
+    pshufd               m5, m6, q0000
+    pshufd               m6, m6, q1111
     add                  wq, base_reg
     jmp                  wq
 .h_w2:
@@ -1620,9 +1559,9 @@
     dec                srcq
     mova                 m4, [base+subpel_h_shuf4]
     movd                 m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
-    pshufd               m3, m3, q0000
     mova                 m5, [base+pw_34] ; 2 + (8 << 2)
-    W32_RESTORE_DSQ
+    pshufd               m3, m3, q0000
+    movifnidn           dsq, dsmp
 .h_w2_loop:
     movq                 m0, [srcq+ssq*0]
     movhps               m0, [srcq+ssq*1]
@@ -1633,10 +1572,10 @@
     paddw                m0, m5 ; pw34
     psraw                m0, 6
     packuswb             m0, m0
-    movd                r4d, m0
-    mov        [dstq+dsq*0], r4w
-    shr                 r4d, 16
-    mov        [dstq+dsq*1], r4w
+    movd                r6d, m0
+    mov        [dstq+dsq*0], r6w
+    shr                 r6d, 16
+    mov        [dstq+dsq*1], r6w
     lea                dstq, [dstq+dsq*2]
     sub                  hd, 2
     jg .h_w2_loop
@@ -1649,10 +1588,10 @@
 %endif
     dec                srcq
     movd                 m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
-    pshufd               m3, m3, q0000
-    mova                 m5, [base+pw_34] ; 2 + (8 << 2)
     mova                 m6, [base+subpel_h_shufA]
-    W32_RESTORE_DSQ
+    mova                 m5, [base+pw_34] ; 2 + (8 << 2)
+    pshufd               m3, m3, q0000
+    movifnidn           dsq, dsmp
 .h_w4_loop:
     movq                 m0, [srcq+ssq*0] ; 1
     movq                 m1, [srcq+ssq*1] ; 2
@@ -1672,7 +1611,6 @@
     sub                  hd, 2
     jg .h_w4_loop
     RET
-    ;
 %macro PUT_8TAP_H 4 ; dst/src, tmp[1-3]
  %if ARCH_X86_32
     pshufb              %2, %1, [base+subpel_h_shufB]
@@ -1693,18 +1631,17 @@
     paddw               %1, m7     ; pw34
     psraw               %1, 6
 %endmacro
-    ;
 .h_w8:
-    movu                 m0,     [srcq+ssq*0]
-    movu                 m1,     [srcq+ssq*1]
-    PUT_8TAP_H           m0, m2, m3, m4
+    movu                 m0, [srcq+ssq*0]
+    movu                 m1, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
+    PUT_8TAP_H           m0, m2, m3, m4
     PUT_8TAP_H           m1, m2, m3, m4
     packuswb             m0, m1
 %if ARCH_X86_32
-    movq       [dstq      ], m0
+    movq             [dstq], m0
     add                dstq, dsm
-    movhps     [dstq      ], m0
+    movhps           [dstq], m0
     add                dstq, dsm
 %else
     movq       [dstq+dsq*0], m0
@@ -1714,22 +1651,23 @@
     sub                  hd, 2
     jg .h_w8
     RET
-.h_w16:
-    xor                 r6d, r6d
-    jmp .h_start
-.h_w32:
-    mov                  r6, -16*1
-    jmp .h_start
-.h_w64:
-    mov                  r6, -16*3
-    jmp .h_start
 .h_w128:
-    mov                  r6, -16*7
-.h_start:
-    sub                srcq, r6
-    sub                dstq, r6
-    mov                  r4, r6
-.h_loop:
+    mov                  r4, -16*7
+    jmp .h_w16_start
+.h_w64:
+    mov                  r4, -16*3
+    jmp .h_w16_start
+.h_w32:
+    mov                  r4, -16*1
+    jmp .h_w16_start
+.h_w16:
+    xor                 r4d, r4d
+.h_w16_start:
+    sub                srcq, r4
+    sub                dstq, r4
+.h_w16_loop_v:
+    mov                  r6, r4
+.h_w16_loop_h:
     movu                 m0, [srcq+r6+8*0]
     movu                 m1, [srcq+r6+8*1]
     PUT_8TAP_H           m0, m2, m3, m4
@@ -1736,17 +1674,12 @@
     PUT_8TAP_H           m1, m2, m3, m4
     packuswb             m0, m1
     mova          [dstq+r6], m0
-    add                  r6, mmsize
-    jle .h_loop
+    add                  r6, 16
+    jle .h_w16_loop_h
     add                srcq, ssq
-%if ARCH_X86_32
-    add                dstq, dsm
-%else
-    add                dstq, dsq
-%endif
-    mov                  r6, r4
+    add                dstq, dsmp
     dec                  hd
-    jg .h_loop
+    jg .h_w16_loop_v
     RET
 .v:
 %if ARCH_X86_32
@@ -1754,7 +1687,7 @@
     shr                 ssd, 16
     cmp                  hd, 6
     cmovs               ssd, mxd
-    lea                 ssq, [base_reg+ssq*8+subpel_filters-put_ssse3]
+    movq                 m0, [base_reg+ssq*8+subpel_filters-put_ssse3]
 %else
  %assign stack_offset org_stack_offset
     WIN64_SPILL_XMM      16
@@ -1762,12 +1695,12 @@
     shr                 myd, 16
     cmp                  hd, 6
     cmovs               myd, mxd
-    lea                 myq, [base_reg+myq*8+subpel_filters-put_ssse3]
+    movq                 m0, [base_reg+myq*8+subpel_filters-put_ssse3]
 %endif
     tzcnt               r6d, wd
     movzx               r6d, word [base_reg+r6*2+table_offset(put, _8tap_v)]
+    punpcklwd            m0, m0
     mova                 m7, [base+pw_512]
-    psrlw                m2, m7, 1 ; 0x0100
     add                  r6, base_reg
 %if ARCH_X86_32
  %define            subpel0  [rsp+mmsize*0]
@@ -1775,20 +1708,16 @@
  %define            subpel2  [rsp+mmsize*2]
  %define            subpel3  [rsp+mmsize*3]
 %assign regs_used 2 ; use r1 (ds) as tmp for stack alignment if needed
-    ALLOC_STACK   -mmsize*4
+    ALLOC_STACK       -16*4
 %assign regs_used 7
-    movd                 m0, [ssq+0]
-    pshufb               m0, m2
-    mova            subpel0, m0
-    movd                 m0, [ssq+2]
-    pshufb               m0, m2
-    mova            subpel1, m0
-    movd                 m0, [ssq+4]
-    pshufb               m0, m2
-    mova            subpel2, m0
-    movd                 m0, [ssq+6]
-    pshufb               m0, m2
-    mova            subpel3, m0
+    pshufd               m1, m0, q0000
+    mova            subpel0, m1
+    pshufd               m1, m0, q1111
+    mova            subpel1, m1
+    pshufd               m1, m0, q2222
+    mova            subpel2, m1
+    pshufd               m1, m0, q3333
+    mova            subpel3, m1
     mov                 ssq, [rstk+stack_offset+gprsize*4]
     lea                 ssq, [ssq*3]
     sub                srcq, ssq
@@ -1799,47 +1728,46 @@
  %define            subpel1  m9
  %define            subpel2  m10
  %define            subpel3  m11
-    movd            subpel0, [myq+0]
-    pshufb          subpel0, m2
-    movd            subpel1, [myq+2]
-    pshufb          subpel1, m2
-    movd            subpel2, [myq+4]
-    pshufb          subpel2, m2
-    movd            subpel3, [myq+6]
-    pshufb          subpel3, m2
     lea                ss3q, [ssq*3]
+    pshufd               m8, m0, q0000
     sub                srcq, ss3q
+    pshufd               m9, m0, q1111
+    pshufd              m10, m0, q2222
+    pshufd              m11, m0, q3333
 %endif
     jmp                  r6
 .v_w2:
-    movd                 m2, [srcq+ssq*0]    ; 0
-    pinsrw               m2, [srcq+ssq*1], 2 ; 0 1
-    pinsrw               m2, [srcq+ssq*2], 4 ; 0 1 2
+    movd                 m1, [srcq+ssq*0]
+    movd                 m0, [srcq+ssq*1]
 %if ARCH_X86_32
     lea                srcq, [srcq+ssq*2]
-    add                srcq, ssq
-    pinsrw               m2, [srcq+ssq*0], 6 ; 0 1 2 3
-    add                srcq, ssq
-%else
-    pinsrw               m2, [srcq+ss3q ], 6 ; 0 1 2 3
-    lea                srcq, [srcq+ssq*4]
-%endif
-    movd                 m3, [srcq+ssq*0]    ; 4
-    movd                 m1, [srcq+ssq*1]    ; 5
-    movd                 m0, [srcq+ssq*2]    ; 6
-%if ARCH_X86_32
+    movd                 m2, [srcq+ssq*0]
+    movd                 m5, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
-    add                srcq, ssq
+    movd                 m3, [srcq+ssq*0]
+    movd                 m4, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
 %else
+    movd                 m2, [srcq+ssq*2]
     add                srcq, ss3q
+    movd                 m5, [srcq+ssq*0]
+    movd                 m3, [srcq+ssq*1]
+    movd                 m4, [srcq+ssq*2]
+    add                srcq, ss3q
 %endif
-    punpckldq            m3, m1              ; 4 5 _ _
-    punpckldq            m1, m0              ; 5 6 _ _
-    palignr              m4, m3, m2, 4       ; 1 2 3 4
-    punpcklbw            m3, m1              ; 45 56
-    punpcklbw            m1, m2, m4          ; 01 12
-    punpckhbw            m2, m4              ; 23 34
+    punpcklwd            m1, m0           ; 0 1
+    punpcklwd            m0, m2           ; 1 2
+    punpcklbw            m1, m0           ; 01 12
+    movd                 m0, [srcq+ssq*0]
+    punpcklwd            m2, m5           ; 2 3
+    punpcklwd            m5, m3           ; 3 4
+    punpcklwd            m3, m4           ; 4 5
+    punpcklwd            m4, m0           ; 5 6
+    punpcklbw            m2, m5           ; 23 34
+    punpcklbw            m3, m4           ; 45 56
 .v_w2_loop:
+    movd                 m4, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
     pmaddubsw            m5, m1, subpel0     ; a0 b0
     mova                 m1, m2
     pmaddubsw            m2, subpel1         ; a1 b1
@@ -1847,17 +1775,14 @@
     mova                 m2, m3
     pmaddubsw            m3, subpel2         ; a2 b2
     paddw                m5, m3
-    movd                 m4, [srcq+ssq*0]    ; 7
-    punpckldq            m3, m0, m4          ; 6 7 _ _
-    movd                 m0, [srcq+ssq*1]
-    lea                srcq, [srcq+ssq*2]
-    punpckldq            m4, m0              ; 7 8 _ _
+    punpcklwd            m3, m0, m4          ; 6 7
+    movd                 m0, [srcq+ssq*0]
+    punpcklwd            m4, m0              ; 7 8
     punpcklbw            m3, m4              ; 67 78
     pmaddubsw            m4, m3, subpel3     ; a3 b3
     paddw                m5, m4
     pmulhrsw             m5, m7
     packuswb             m5, m5
-    pshuflw              m5, m5, q2020
     movd                r6d, m5
     mov        [dstq+dsq*0], r6w
     shr                 r6d, 16
@@ -1873,51 +1798,46 @@
 .v_w32:
 .v_w64:
 .v_w128:
-%endif ; ARCH_X86_32
-    lea                 r6d, [wq - 4] ; horizontal loop
-    mov                  r4, dstq
-%if ARCH_X86_32
-%if STACK_ALIGNMENT < mmsize
- %define               srcm [rsp+mmsize*4+gprsize]
+    shl                  wd, 14
+%if STACK_ALIGNMENT < 16
+ %define               dstm [rsp+mmsize*4+gprsize]
+    mov                dstm, dstq
 %endif
-    mov                srcm, srcq
-%else
-    mov                  r7, srcq
-%endif
-    shl                 r6d, (16 - 2)  ; (wq / 4) << 16
-    mov                 r6w, hw
+    lea                 r6d, [hq+wq-(1<<16)]
+    mov                  r4, srcq
 .v_w4_loop0:
-    movd                 m2, [srcq+ssq*0] ; 0
-    movhps               m2, [srcq+ssq*2] ; 0 _ 2
-    movd                 m3, [srcq+ssq*1] ; 1
-%if ARCH_X86_32
-    lea                srcq, [srcq+ssq*2]
-    add                srcq, ssq
-    movhps               m3, [srcq+ssq*0] ; 1 _ 3
-    lea                srcq, [srcq+ssq*1]
-%else
-    movhps               m3, [srcq+ss3q ] ; 1 _ 3
-    lea                srcq, [srcq+ssq*4]
 %endif
-    pshufd               m2, m2, q2020    ; 0 2 0 2
-    pshufd               m3, m3, q2020    ; 1 3 1 3
-    punpckldq            m2, m3           ; 0 1 2 3
-    movd                 m3, [srcq+ssq*0] ; 4
-    movd                 m1, [srcq+ssq*1] ; 5
-    movd                 m0, [srcq+ssq*2] ; 6
+    movd                 m1, [srcq+ssq*0]
+    movd                 m0, [srcq+ssq*1]
 %if ARCH_X86_32
     lea                srcq, [srcq+ssq*2]
-    add                srcq, ssq
+    movd                 m2, [srcq+ssq*0]
+    movd                 m5, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    movd                 m3, [srcq+ssq*0]
+    movd                 m4, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
 %else
+    movd                 m2, [srcq+ssq*2]
     add                srcq, ss3q
+    movd                 m5, [srcq+ssq*0]
+    movd                 m3, [srcq+ssq*1]
+    movd                 m4, [srcq+ssq*2]
+    add                srcq, ss3q
 %endif
-    punpckldq            m3, m1           ; 4 5 _ _
-    punpckldq            m1, m0           ; 5 6 _ _
-    palignr              m4, m3, m2, 4    ; 1 2 3 4
-    punpcklbw            m3, m1           ; 45 56
-    punpcklbw            m1, m2, m4       ; 01 12
-    punpckhbw            m2, m4           ; 23 34
+    punpckldq            m1, m0           ; 0 1
+    punpckldq            m0, m2           ; 1 2
+    punpcklbw            m1, m0           ; 01 12
+    movd                 m0, [srcq+ssq*0]
+    punpckldq            m2, m5           ; 2 3
+    punpckldq            m5, m3           ; 3 4
+    punpckldq            m3, m4           ; 4 5
+    punpckldq            m4, m0           ; 5 6
+    punpcklbw            m2, m5           ; 23 34
+    punpcklbw            m3, m4           ; 45 56
 .v_w4_loop:
+    movd                 m4, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
     pmaddubsw            m5, m1, subpel0  ; a0 b0
     mova                 m1, m2
     pmaddubsw            m2, subpel1      ; a1 b1
@@ -1925,10 +1845,8 @@
     mova                 m2, m3
     pmaddubsw            m3, subpel2      ; a2 b2
     paddw                m5, m3
-    movd                 m4, [srcq+ssq*0]
     punpckldq            m3, m0, m4       ; 6 7 _ _
-    movd                 m0, [srcq+ssq*1]
-    lea                srcq, [srcq+ssq*2]
+    movd                 m0, [srcq+ssq*0]
     punpckldq            m4, m0           ; 7 8 _ _
     punpcklbw            m3, m4           ; 67 78
     pmaddubsw            m4, m3, subpel3  ; a3 b3
@@ -1936,24 +1854,21 @@
     pmulhrsw             m5, m7
     packuswb             m5, m5
     movd       [dstq+dsq*0], m5
-    pshufd               m5, m5, q0101
+    psrlq                m5, 32
     movd       [dstq+dsq*1], m5
     lea                dstq, [dstq+dsq*2]
     sub                  hd, 2
     jg .v_w4_loop
-    mov                  hw, r6w ; reset vertical loop
-    add                  r4, 4
-    mov                dstq, r4
 %if ARCH_X86_32
-    mov                srcq, srcm
-    add                srcq, 4
-    mov                srcm, srcq
-%else
-    add                  r7, 4
-    mov                srcq, r7
-%endif
-    sub                 r6d, 1<<16 ; horizontal--
+    mov                dstq, dstm
+    add                  r4, 4
+    movzx                hd, r6w
+    add                dstq, 4
+    mov                srcq, r4
+    mov                dstm, dstq
+    sub                 r6d, 1<<16
     jg .v_w4_loop0
+%endif
     RET
 %if ARCH_X86_64
 .v_w8:
@@ -1961,56 +1876,51 @@
 .v_w32:
 .v_w64:
 .v_w128:
-    lea                 r6d, [wq - 8] ; horizontal loop
-    mov                  r4, dstq
-    mov                  r7, srcq
-    shl                 r6d, 8 - 3; (wq / 8) << 8
-    mov                 r6b, hb
+    lea                 r6d, [wq*8-64]
+    mov                  r4, srcq
+    mov                  r7, dstq
+    lea                 r6d, [hq+r6*4]
 .v_w8_loop0:
-    movq                 m4, [srcq+ssq*0]   ; 0
-    movq                 m5, [srcq+ssq*1]   ; 1
-    lea                srcq, [srcq+ssq*2]
-    movq                 m6, [srcq+ssq*0]   ; 2
-    movq                 m0, [srcq+ssq*1]   ; 3
-    lea                srcq, [srcq+ssq*2]
-    movq                 m1, [srcq+ssq*0]   ; 4
-    movq                 m2, [srcq+ssq*1]   ; 5
-    lea                srcq, [srcq+ssq*2]   ;
-    movq                 m3, [srcq+ssq*0]   ; 6
-    shufpd               m4, m0, 0x0c
-    shufpd               m5, m1, 0x0c
-    punpcklbw            m1, m4, m5 ; 01
-    punpckhbw            m4, m5     ; 34
-    shufpd               m6, m2, 0x0c
-    punpcklbw            m2, m5, m6 ; 12
-    punpckhbw            m5, m6     ; 45
-    shufpd               m0, m3, 0x0c
-    punpcklbw            m3, m6, m0 ; 23
-    punpckhbw            m6, m0     ; 56
+    movq                 m1, [srcq+ssq*0]
+    movq                 m2, [srcq+ssq*1]
+    movq                 m3, [srcq+ssq*2]
+    add                srcq, ss3q
+    movq                 m4, [srcq+ssq*0]
+    movq                 m5, [srcq+ssq*1]
+    movq                 m6, [srcq+ssq*2]
+    add                srcq, ss3q
+    movq                 m0, [srcq+ssq*0]
+    punpcklbw            m1, m2 ; 01
+    punpcklbw            m2, m3 ; 12
+    punpcklbw            m3, m4 ; 23
+    punpcklbw            m4, m5 ; 34
+    punpcklbw            m5, m6 ; 45
+    punpcklbw            m6, m0 ; 56
 .v_w8_loop:
-    movq                m12, [srcq+ssq*1]   ; 8
+    movq                m13, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
-    movq                m13, [srcq+ssq*0]   ; 9
     pmaddubsw           m14, m1, subpel0 ; a0
-    pmaddubsw           m15, m2, subpel0 ; b0
     mova                 m1, m3
+    pmaddubsw           m15, m2, subpel0 ; b0
     mova                 m2, m4
     pmaddubsw            m3, subpel1 ; a1
+    mova                m12, m0
     pmaddubsw            m4, subpel1 ; b1
+    movq                 m0, [srcq+ssq*0]
     paddw               m14, m3
     paddw               m15, m4
     mova                 m3, m5
-    mova                 m4, m6
     pmaddubsw            m5, subpel2 ; a2
+    mova                 m4, m6
     pmaddubsw            m6, subpel2 ; b2
+    punpcklbw           m12, m13     ; 67
+    punpcklbw           m13, m0      ; 78
     paddw               m14, m5
+    mova                 m5, m12
+    pmaddubsw           m12, subpel3 ; a3
     paddw               m15, m6
-    shufpd               m6, m0, m12, 0x0d
-    shufpd               m0, m12, m13, 0x0c
-    punpcklbw            m5, m6, m0  ; 67
-    punpckhbw            m6, m0      ; 78
-    pmaddubsw           m12, m5, subpel3 ; a3
-    pmaddubsw           m13, m6, subpel3 ; b3
+    mova                 m6, m13
+    pmaddubsw           m13, subpel3 ; b3
     paddw               m14, m12
     paddw               m15, m13
     pmulhrsw            m14, m7
@@ -2021,12 +1931,12 @@
     lea                dstq, [dstq+dsq*2]
     sub                  hd, 2
     jg .v_w8_loop
-    movzx                hd, r6b ; reset vertical loop
     add                  r4, 8
     add                  r7, 8
-    mov                dstq, r4
-    mov                srcq, r7
-    sub                 r6d, 1<<8 ; horizontal--
+    movzx                hd, r6b
+    mov                srcq, r4
+    mov                dstq, r7
+    sub                 r6d, 1<<8
     jg .v_w8_loop0
     RET
 %endif ;ARCH_X86_64
@@ -2051,7 +1961,7 @@
     cmp                  hd, 6
     cmovs               ssd, mxd
     movq                 m0, [base_reg+ssq*8+subpel_filters-put_ssse3]
-    W32_RESTORE_SSQ
+    mov                 ssq, ssmp
     lea                  r6, [ssq*3]
     sub                srcq, r6
  %define           base_reg  r6
@@ -2064,7 +1974,6 @@
  %define           subpelv1  [rsp+mmsize*1]
  %define           subpelv2  [rsp+mmsize*2]
  %define           subpelv3  [rsp+mmsize*3]
-    punpcklqdq           m0, m0
     punpcklbw            m0, m0
     psraw                m0, 8 ; sign-extend
     pshufd               m6, m0, q0000
@@ -2088,7 +1997,6 @@
  %define           subpelv1  m11
  %define           subpelv2  m12
  %define           subpelv3  m13
-    punpcklqdq           m0, m0
     punpcklbw            m0, m0
     psraw                m0, 8 ; sign-extend
     mova                 m8, [base+pw_8192]
@@ -2103,22 +2011,21 @@
     je .hv_w4
 .hv_w2:
     mova                 m6, [base+subpel_h_shuf4]
-    ;
     movq                 m2, [srcq+ssq*0]     ; 0
     movhps               m2, [srcq+ssq*1]     ; 0 _ 1
-    movq                 m0, [srcq+ssq*2]     ; 2
 %if ARCH_X86_32
  %define           w8192reg  [base+pw_8192]
  %define            d512reg  [base+pd_512]
     lea                srcq, [srcq+ssq*2]
-    add                srcq, ssq
-    movhps               m0, [srcq+ssq*0]     ; 2 _ 3
-    lea                srcq, [srcq+ssq*1]
+    movq                 m0, [srcq+ssq*0]     ; 2
+    movhps               m0, [srcq+ssq*1]     ; 2 _ 3
+    lea                srcq, [srcq+ssq*2]
 %else
  %define           w8192reg  m8
  %define            d512reg  m9
-    movhps               m0, [srcq+ss3q ]     ; 2 _ 3
-    lea                srcq, [srcq+ssq*4]
+    movq                 m0, [srcq+ssq*2]     ; 2
+    add                srcq, ss3q
+    movhps               m0, [srcq+ssq*0]     ; 2 _ 3
 %endif
     pshufb               m2, m6 ; 0 ~ 1 ~
     pshufb               m0, m6 ; 2 ~ 3 ~
@@ -2126,16 +2033,16 @@
     pmaddubsw            m0, m7 ; subpel_filters
     phaddw               m2, m0 ; 0 1 2 3
     pmulhrsw             m2, w8192reg
-    ;
+%if ARCH_X86_32
     movq                 m3, [srcq+ssq*0]     ; 4
     movhps               m3, [srcq+ssq*1]     ; 4 _ 5
-    movq                 m0, [srcq+ssq*2]     ; 6
-%if ARCH_X86_32
     lea                srcq, [srcq+ssq*2]
-    add                srcq, ssq
 %else
+    movq                 m3, [srcq+ssq*1]     ; 4
+    movhps               m3, [srcq+ssq*2]     ; 4 _ 5
     add                srcq, ss3q
 %endif
+    movq                 m0, [srcq+ssq*0]     ; 6
     pshufb               m3, m6 ; 4 ~ 5 ~
     pshufb               m0, m6 ; 6 ~
     pmaddubsw            m3, m7 ; subpel_filters
@@ -2142,7 +2049,6 @@
     pmaddubsw            m0, m7 ; subpel_filters
     phaddw               m3, m0 ; 4 5 6 _
     pmulhrsw             m3, w8192reg
-    ;
     palignr              m4, m3, m2, 4; V        1 2 3 4
     punpcklwd            m1, m2, m4   ; V 01 12    0 1 1 2
     punpckhwd            m2, m4       ; V 23 34    2 3 3 4
@@ -2149,6 +2055,11 @@
     pshufd               m0, m3, q2121; V          5 6 5 6
     punpcklwd            m3, m0       ; V 45 56    4 5 5 6
 .hv_w2_loop:
+    movq                 m4, [srcq+ssq*1] ; V 7
+    lea                srcq, [srcq+ssq*2] ; V
+    movhps               m4, [srcq+ssq*0] ; V 7 8
+    pshufb               m4, m6
+    pmaddubsw            m4, m7
     pmaddwd              m5, m1, subpelv0; V a0 b0
     mova                 m1, m2       ; V
     pmaddwd              m2, subpelv1 ; V a1 b1
@@ -2155,14 +2066,9 @@
     paddd                m5, m2       ; V
     mova                 m2, m3       ; V
     pmaddwd              m3, subpelv2 ; a2 b2
-    paddd                m5, m3       ; V
-    movq                 m4, [srcq+ssq*0] ; V 7
-    movhps               m4, [srcq+ssq*1] ; V 7 8
-    lea                srcq, [srcq+ssq*2] ; V
-    pshufb               m4, m6
-    pmaddubsw            m4, m7
     phaddw               m4, m4
     pmulhrsw             m4, w8192reg
+    paddd                m5, m3       ; V
     palignr              m3, m4, m0, 12
     mova                 m0, m4
     punpcklwd            m3, m0           ; V 67 78
@@ -2182,7 +2088,6 @@
     RET
 %undef w8192reg
 %undef d512reg
-    ;
 .hv_w4:
 %define hv4_line_0_0 4
 %define hv4_line_0_1 5
@@ -2194,7 +2099,6 @@
 %define hv4_line_1_1 11
 %define hv4_line_1_2 12
 %define hv4_line_1_3 13
-    ;
 %macro SAVELINE_W4 3
     mova     [rsp+mmsize*hv4_line_%3_%2], %1
 %endmacro
@@ -2201,7 +2105,6 @@
 %macro RESTORELINE_W4 3
     mova     %1, [rsp+mmsize*hv4_line_%3_%2]
 %endmacro
-    ;
 %if ARCH_X86_32
  %define           w8192reg  [base+pw_8192]
  %define            d512reg  [base+pd_512]
@@ -2213,13 +2116,13 @@
     mova                 m6, [base+subpel_h_shuf4]
     movq                 m5, [srcq+ssq*0]   ; 0 _ _ _
     movhps               m5, [srcq+ssq*1]   ; 0 _ 1 _
-    movq                 m4, [srcq+ssq*2]   ; 2 _ _ _
 %if ARCH_X86_32
     lea                srcq, [srcq+ssq*2]
-    add                srcq, ssq
-    movhps               m4, [srcq+ssq*0]   ; 2 _ 3 _
-    add                srcq, ssq
+    movq                 m4, [srcq+ssq*0]   ; 2 _ _ _
+    movhps               m4, [srcq+ssq*1]   ; 2 _ 3 _
+    lea                srcq, [srcq+ssq*2]
 %else
+    movq                 m4, [srcq+ssq*2]   ; 2 _ _ _
     movhps               m4, [srcq+ss3q ]   ; 2 _ 3 _
     lea                srcq, [srcq+ssq*4]
 %endif
@@ -2243,7 +2146,14 @@
     mova                 m6, [base+subpel_h_shuf4]
     movq                 m5, [srcq+ssq*0]   ; 4 _ _ _
     movhps               m5, [srcq+ssq*1]   ; 4 _ 5 _
+%if ARCH_X86_32
+    lea                srcq, [srcq+ssq*2]
+    movq                 m4, [srcq+ssq*0]   ; 6 _ _ _
+    add                srcq, ssq
+%else
     movq                 m4, [srcq+ssq*2]   ; 6 _ _ _
+    add                srcq, ss3q
+%endif
     pshufb               m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
     pshufb               m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
     pmaddubsw            m3, m7 ;H subpel_filters
@@ -2259,13 +2169,6 @@
     pmaddubsw            m0, m7 ;H subpel_filters
     phaddw               m3, m0 ;H 4 5 6 7
     pmulhrsw             m3, w8192reg ;H pw_8192
-    ;
-%if ARCH_X86_32
-    lea                srcq, [srcq+ssq*2]
-    add                srcq, ssq
-%else
-    add                srcq, ss3q
-%endif
     ;process high
     palignr              m4, m3, m2, 4;V 1 2 3 4
     punpcklwd            m1, m2, m4  ; V 01 12
@@ -2293,7 +2196,6 @@
     mova                 m2, m3
     pmaddwd              m3, subpelv2; V a2 b2
     paddd                m5, m3
-    ;
     mova                 m6, [base+subpel_h_shuf4]
     movq                 m4, [srcq+ssq*0] ; 7
     movhps               m4, [srcq+ssq*1] ; 7 _ 8 _
@@ -2325,10 +2227,10 @@
     mova                 m2, m3
     pmaddwd              m3, subpelv2; V a2 b2
     paddd                m5, m3
-    ;
     mova                 m6, [base+subpel_h_shuf4+16]
     movq                 m4, [srcq+ssq*0] ; 7
     movhps               m4, [srcq+ssq*1] ; 7 _ 8 _
+    lea                srcq, [srcq+ssq*2]
     pshufb               m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
     pmaddubsw            m4, m7 ;H subpel_filters
     phaddw               m4, m4 ;H                7 8 7 8
@@ -2340,12 +2242,10 @@
     paddd                m5, d512reg ; pd_512
     paddd                m5, m4
     psrad                m4, m5, 10
-    ;
     RESTORELINE_W4       m5, 5, 0
     packssdw             m5, m4 ; d -> w
     packuswb             m5, m5 ; w -> b
     pshuflw              m5, m5, q3120
-    lea                srcq, [srcq+ssq*2]
     movd       [dstq+dsq*0], m5
     psrlq                m5, 32
     movd       [dstq+dsq*1], m5
@@ -2365,7 +2265,6 @@
 %undef subpelv1
 %undef subpelv2
 %undef subpelv3
-    ;
 .hv_w8:
     %assign stack_offset org_stack_offset
 %define hv8_line_1 0
@@ -2400,7 +2299,7 @@
     mov                 ssq, ssmp
     ALLOC_STACK  -mmsize*13
 %if STACK_ALIGNMENT < 16
- %define               srcm  [rsp+mmsize*13+gprsize*1]
+ %define               dstm  [rsp+mmsize*13+gprsize*1]
  %define                dsm  [rsp+mmsize*13+gprsize*2]
     mov                  r6, [rstk+stack_offset+gprsize*2]
     mov                 dsm, r6
@@ -2420,10 +2319,10 @@
     mova           subpelv2, m4
     mova           subpelv3, m5
     lea                  r6, [ssq*3]
+    mov                dstm, dstq
     sub                srcq, r6
-    mov                srcm, srcq
 %else
-    ALLOC_STACK    mmsize*5, 16
+    ALLOC_STACK        16*5, 16
  %define           subpelh0  m10
  %define           subpelh1  m11
  %define           subpelv0  m12
@@ -2440,7 +2339,6 @@
     movq                 m1, [base_reg+myq*8+subpel_filters-put_ssse3]
     pshufd         subpelh0, m0, q0000
     pshufd         subpelh1, m0, q1111
-    punpcklqdq           m1, m1
     punpcklbw            m1, m1
     psraw                m1, 8 ; sign-extend
     pshufd         subpelv0, m1, q0000
@@ -2448,18 +2346,18 @@
     pshufd         subpelv2, m1, q2222
     pshufd         subpelv3, m1, q3333
     lea                ss3q, [ssq*3]
+    mov                  r7, dstq
     sub                srcq, ss3q
-    mov                  r7, srcq
 %endif
-    lea                 r6d, [wq-4]
-    mov                  r4, dstq
-    shl                 r6d, (16 - 2)
-    mov                 r6w, hw
+    shl                  wd, 14
+    lea                 r6d, [hq+wq-(1<<16)]
+    mov                  r4, srcq
 .hv_w8_loop0:
     movu                 m4, [srcq+ssq*0] ; 0 = _ _
     movu                 m5, [srcq+ssq*1] ; 1 = _ _
+%if ARCH_X86_32
     lea                srcq, [srcq+ssq*2]
-    ;
+%endif
 %macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3]
  %if ARCH_X86_32
     pshufb               %3, %1, [base+subpel_h_shufB]
@@ -2478,7 +2376,6 @@
     paddw                %1, %3      ; A0+C4
     phaddw               %1, %2
 %endmacro
-    ;
 %if ARCH_X86_64
     mova                 m7, [base+subpel_h_shufA]
     mova                 m8, [base+subpel_h_shufB]
@@ -2486,12 +2383,17 @@
 %endif
     HV_H_W8              m4, m1, m2, m3, m7, m8, m9 ; 0 ~ ~ ~
     HV_H_W8              m5, m1, m2, m3, m7, m8, m9 ; 1 ~ ~ ~
+%if ARCH_X86_32
     movu                 m6, [srcq+ssq*0] ; 2 = _ _
     movu                 m0, [srcq+ssq*1] ; 3 = _ _
     lea                srcq, [srcq+ssq*2]
+%else
+    movu                 m6, [srcq+ssq*2] ; 2 = _ _
+    add                srcq, ss3q
+    movu                 m0, [srcq+ssq*0] ; 3 = _ _
+%endif
     HV_H_W8              m6, m1, m2, m3, m7, m8, m9 ; 2 ~ ~ ~
     HV_H_W8              m0, m1, m2, m3, m7, m8, m9 ; 3 ~ ~ ~
-    ;
     mova                 m7, [base+pw_8192]
     pmulhrsw             m4, m7 ; H pw_8192
     pmulhrsw             m5, m7 ; H pw_8192
@@ -2503,11 +2405,16 @@
     SAVELINE_W8           1, m1
     SAVELINE_W8           2, m2
     SAVELINE_W8           3, m3
-    ;
     mova                 m7, [base+subpel_h_shufA]
+%if ARCH_X86_32
     movu                 m4, [srcq+ssq*0]       ; 4 = _ _
     movu                 m5, [srcq+ssq*1]       ; 5 = _ _
     lea                srcq, [srcq+ssq*2]
+%else
+    movu                 m4, [srcq+ssq*1]       ; 4 = _ _
+    movu                 m5, [srcq+ssq*2]       ; 5 = _ _
+    add                srcq, ss3q
+%endif
     movu                 m6, [srcq+ssq*0]       ; 6 = _ _
     HV_H_W8              m4, m1, m2, m3, m7, m8, m9 ; 4 ~ ~ ~
     HV_H_W8              m5, m1, m2, m3, m7, m8, m9 ; 5 ~ ~ ~
@@ -2519,7 +2426,6 @@
     punpcklwd            m4, m0, m1  ; 3 4 ~
     punpcklwd            m5, m1, m2  ; 4 5 ~
     punpcklwd            m6, m2, m3  ; 5 6 ~
-    ;
     SAVELINE_W8           6, m3
     RESTORELINE_W8        1, m1
     RESTORELINE_W8        2, m2
@@ -2603,16 +2509,19 @@
     RESTORELINE_W8        4, m4
     jmp .hv_w8_loop
 .hv_w8_outer:
-    movzx                hd, r6w
-    add                  r4, 4
-    mov                dstq, r4
 %if ARCH_X86_32
-    mov                srcq, srcm
-    add                srcq, 4
-    mov                srcm, srcq
+    mov                dstq, dstm
+    add                  r4, 4
+    movzx                hd, r6w
+    add                dstq, 4
+    mov                srcq, r4
+    mov                dstm, dstq
 %else
+    add                  r4, 4
     add                  r7, 4
-    mov                srcq, r7
+    movzx                hd, r6b
+    mov                srcq, r4
+    mov                dstq, r7
 %endif
     sub                 r6d, 1<<16
     jg .hv_w8_loop0
@@ -2836,7 +2745,7 @@
     add                 mxd, t0d ; 8tap_h, mx, 4tap_h
     imul                myd, mym, 0x010101
     add                 myd, t1d ; 8tap_v, my, 4tap_v
-    movsxd               wq, wm
+    mov                  wd, wm
     movifnidn          srcd, srcm
     movifnidn            hd, hm
     test                mxd, 0xf00
@@ -2846,6 +2755,7 @@
     LEA            base_reg, prep_ssse3
     tzcnt                wd, wd
     movzx                wd, word [base_reg-prep_ssse3+prep_ssse3_table+wq*2]
+    pxor                 m4, m4
     add                  wq, base_reg
     movifnidn       strided, stridem
     lea                  r6, [strideq*3]
@@ -2885,16 +2795,13 @@
     shr                 mxd, 16
     sub                srcq, 3
     movzx                wd, word [base_reg+wq*2+table_offset(prep, _8tap_h)]
-    movd                 m5, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+0]
-    pshufd               m5, m5, q0000
-    movd                 m6, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+4]
-    pshufd               m6, m6, q0000
+    movq                 m6, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX]
 %if cpuflag(ssse3)
     mova                 m7, [base+pw_8192]
+    pshufd               m5, m6, q0000
+    pshufd               m6, m6, q1111
 %else
-    punpcklbw            m5, m5
     punpcklbw            m6, m6
-    psraw                m5, 8
     psraw                m6, 8
  %if ARCH_X86_64
     mova                 m7, [pw_2]
@@ -2902,6 +2809,8 @@
  %else
   %define m15 m4
  %endif
+    pshufd               m5, m6, q1010
+    punpckhqdq           m6, m6
 %endif
     add                  wq, base_reg
     jmp                  wq
@@ -2913,10 +2822,10 @@
 %endif
     dec                srcq
     movd                 m4, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+2]
-    pshufd               m4, m4, q0000
 %if cpuflag(ssse3)
     mova                 m6, [base+pw_8192]
     mova                 m5, [base+subpel_h_shufA]
+    pshufd               m4, m4, q0000
 %else
     mova                 m6, [base+pw_2]
  %if ARCH_X86_64
@@ -2926,6 +2835,7 @@
  %endif
     punpcklbw            m4, m4
     psraw                m4, 8
+    punpcklqdq           m4, m4
 %endif
 %if ARCH_X86_64
     lea            stride3q, [strideq*3]
@@ -3089,11 +2999,14 @@
     shr                 myd, 16
     cmp                  hd, 6
     cmovs               myd, mxd
-    lea                 myq, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
+    movq                 m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
 %if cpuflag(ssse3)
     mova                 m2, [base+pw_512]
-    psrlw                m2, m2, 1 ; 0x0100
     mova                 m7, [base+pw_8192]
+    punpcklwd            m0, m0
+%else
+    punpcklbw            m0, m0
+    psraw                m0, 8
 %endif
 %if ARCH_X86_32
  %define            subpel0  [rsp+mmsize*0]
@@ -3107,20 +3020,16 @@
     ALLOC_STACK   -mmsize*5
  %endif
 %assign regs_used 7
-    movd                 m0, [myq+0]
-    PSHUFB_0X1X          m0, m2
-    mova            subpel0, m0
-    movd                 m0, [myq+2]
-    PSHUFB_0X1X          m0, m2
-    mova            subpel1, m0
-    movd                 m0, [myq+4]
-    PSHUFB_0X1X          m0, m2
-    mova            subpel2, m0
-    movd                 m0, [myq+6]
-    PSHUFB_0X1X          m0, m2
-    mova            subpel3, m0
     mov             strideq, [rstk+stack_offset+gprsize*3]
+    pshufd               m1, m0, q0000
+    mova            subpel0, m1
+    pshufd               m1, m0, q1111
+    mova            subpel1, m1
     lea                  r5, [strideq*3]
+    pshufd               m1, m0, q2222
+    mova            subpel2, m1
+    pshufd               m1, m0, q3333
+    mova            subpel3, m1
     sub                srcq, r5
 %else
  %define            subpel0  m8
@@ -3127,15 +3036,11 @@
  %define            subpel1  m9
  %define            subpel2  m10
  %define            subpel3  m11
-    movd            subpel0, [myq+0]
-    PSHUFB_0X1X     subpel0, m2
-    movd            subpel1, [myq+2]
-    PSHUFB_0X1X     subpel1, m2
-    movd            subpel2, [myq+4]
-    PSHUFB_0X1X     subpel2, m2
-    movd            subpel3, [myq+6]
-    PSHUFB_0X1X     subpel3, m2
+    pshufd               m8, m0, q0000
+    pshufd               m9, m0, q1111
     lea            stride3q, [strideq*3]
+    pshufd              m10, m0, q2222
+    pshufd              m11, m0, q3333
     sub                srcq, stride3q
     cmp                  wd, 8
     jns .v_w8
@@ -3159,35 +3064,34 @@
     mov                 r5w, hw
 .v_w4_loop0:
 %endif
-    movd                 m2, [srcq+strideq*0] ; 0
-    movhps               m2, [srcq+strideq*2] ; 0 _ 2
-    movd                 m3, [srcq+strideq*1] ; 1
+    movd                 m1, [srcq+strideq*0]
+    movd                 m0, [srcq+strideq*1]
 %if ARCH_X86_32
     lea                srcq, [srcq+strideq*2]
-    movhps               m3, [srcq+strideq*1] ; 1 _ 3
+    movd                 m2, [srcq+strideq*0]
+    movd                 m4, [srcq+strideq*1]
     lea                srcq, [srcq+strideq*2]
-%else
-    movhps               m3, [srcq+stride3q ] ; 1 _ 3
-    lea                srcq, [srcq+strideq*4]
-%endif
-    pshufd               m2, m2, q2020    ; 0 2 0 2
-    pshufd               m3, m3, q2020    ; 1 3 1 3
-    punpckldq            m2, m3           ; 0 1 2 3
-    movd                 m3, [srcq+strideq*0] ; 4
-    movd                 m1, [srcq+strideq*1] ; 5
-    movd                 m0, [srcq+strideq*2] ; 6
-%if ARCH_X86_32
+    movd                 m3, [srcq+strideq*0]
+    movd                 m5, [srcq+strideq*1]
     lea                srcq, [srcq+strideq*2]
-    add                srcq, strideq
 %else
+    movd                 m2, [srcq+strideq*2]
     add                srcq, stride3q
+    movd                 m4, [srcq+strideq*0]
+    movd                 m3, [srcq+strideq*1]
+    movd                 m5, [srcq+strideq*2]
+    add                srcq, stride3q
 %endif
-    punpckldq            m3, m1           ; 4 5 _ _
-    punpckldq            m1, m0           ; 5 6 _ _
-    PALIGNR              m4, m3, m2, 4    ; 1 2 3 4
-    punpcklbw            m3, m1           ; 45 56
-    punpcklbw            m1, m2, m4       ; 01 12
-    punpckhbw            m2, m4           ; 23 34
+    punpckldq            m1, m0 ; 0 1
+    punpckldq            m0, m2 ; 1 2
+    punpcklbw            m1, m0 ; 01 12
+    movd                 m0, [srcq+strideq*0]
+    punpckldq            m2, m4 ; 2 3
+    punpckldq            m4, m3 ; 3 4
+    punpckldq            m3, m5 ; 4 5
+    punpckldq            m5, m0 ; 5 6
+    punpcklbw            m2, m4 ; 23 34
+    punpcklbw            m3, m5 ; 45 56
 .v_w4_loop:
 %if ARCH_X86_32 && notcpuflag(ssse3)
     mova                 m7, subpel0
@@ -3208,11 +3112,11 @@
 %endif
     mova                 m2, m3
     PMADDUBSW            m3, subpel2, m6, m4, 0  ; a2 b2
+    movd                 m4, [srcq+strideq*1]
+    lea                srcq, [srcq+strideq*2]
     paddw                m5, m3
-    movd                 m4, [srcq+strideq*0]
     punpckldq            m3, m0, m4       ; 6 7 _ _
-    movd                 m0, [srcq+strideq*1]
-    lea                srcq, [srcq+strideq*2]
+    movd                 m0, [srcq+strideq*0]
     punpckldq            m4, m0           ; 7 8 _ _
     punpcklbw            m3, m4           ; 67 78
 %if notcpuflag(ssse3)
@@ -3242,13 +3146,13 @@
     sub                  hd, 2
     jg .v_w4_loop
 %if ARCH_X86_32
-    mov                  hw, r5w ; reset vertical loop
-    mov                tmpq, tmpm
     mov                srcq, srcm
-    add                tmpq, 8
+    mov                tmpq, tmpm
+    movzx                hd, r5w
     add                srcq, 4
-    mov                tmpm, tmpq
+    add                tmpq, 8
     mov                srcm, srcq
+    mov                tmpm, tmpq
     sub                 r5d, 1<<16 ; horizontal--
     jg .v_w4_loop0
 %endif
@@ -3255,37 +3159,30 @@
     RET
 %if ARCH_X86_64
 .v_w8:
-    lea                 r5d, [wq - 8] ; horizontal loop
+    lea                 r6d, [wq*8-64]
+    mov                  r5, srcq
     mov                  r8, tmpq
-    mov                  r6, srcq
-    shl                 r5d, 8 - 3; (wq / 8) << 8
-    mov                 r5b, hb
+    lea                 r6d, [hq+r6*4]
 .v_w8_loop0:
-    movq                 m4, [srcq+strideq*0]
-    movq                 m5, [srcq+strideq*1]
-    lea                srcq, [srcq+strideq*2]
-    movq                 m6, [srcq+strideq*0]
-    movq                 m0, [srcq+strideq*1]
-    lea                srcq, [srcq+strideq*2]
     movq                 m1, [srcq+strideq*0]
     movq                 m2, [srcq+strideq*1]
-    lea                srcq, [srcq+strideq*2]
-    movq                 m3, [srcq+strideq*0]
-    shufpd               m4, m0, 0x0c
-    shufpd               m5, m1, 0x0c
-    punpcklbw            m1, m4, m5 ; 01
-    punpckhbw            m4, m5     ; 34
-    shufpd               m6, m2, 0x0c
-    punpcklbw            m2, m5, m6 ; 12
-    punpckhbw            m5, m6     ; 45
-    shufpd               m0, m3, 0x0c
-    punpcklbw            m3, m6, m0 ; 23
-    punpckhbw            m6, m0     ; 56
+    movq                 m3, [srcq+strideq*2]
+    add                srcq, stride3q
+    movq                 m4, [srcq+strideq*0]
+    movq                 m5, [srcq+strideq*1]
+    movq                 m6, [srcq+strideq*2]
+    add                srcq, stride3q
+    movq                 m0, [srcq+strideq*0]
+    punpcklbw            m1, m2 ; 01
+    punpcklbw            m2, m3 ; 12
+    punpcklbw            m3, m4 ; 23
+    punpcklbw            m4, m5 ; 34
+    punpcklbw            m5, m6 ; 45
+    punpcklbw            m6, m0 ; 56
 .v_w8_loop:
-%if cpuflag(ssse3)
-    movq                m12, [srcq+strideq*1]
+    movq                m13, [srcq+strideq*1]
     lea                srcq, [srcq+strideq*2]
-    movq                m13, [srcq+strideq*0]
+%if cpuflag(ssse3)
     pmaddubsw           m14, m1, subpel0 ; a0
     pmaddubsw           m15, m2, subpel0 ; b0
     mova                 m1, m3
@@ -3298,64 +3195,59 @@
     mova                 m4, m6
     pmaddubsw            m5, subpel2 ; a2
     pmaddubsw            m6, subpel2 ; b2
+    punpcklbw           m12, m0, m13 ; 67
+    movq                 m0, [srcq+strideq*0]
+    punpcklbw           m13, m0      ; 78
     paddw               m14, m5
+    mova                 m5, m12
+    pmaddubsw           m12, subpel3 ; a3
     paddw               m15, m6
-    shufpd               m6, m0, m12, 0x0d
-    shufpd               m0, m12, m13, 0x0c
-    punpcklbw            m5, m6, m0  ; 67
-    punpckhbw            m6, m0      ; 78
-    pmaddubsw           m12, m5, subpel3 ; a3
-    pmaddubsw           m13, m6, subpel3 ; b3
+    mova                 m6, m13
+    pmaddubsw           m13, subpel3 ; b3
     paddw               m14, m12
     paddw               m15, m13
     pmulhrsw            m14, m7
     pmulhrsw            m15, m7
-    movu        [tmpq+wq*0], m14
-    movu        [tmpq+wq*2], m15
 %else
     mova                m14, m1
     PMADDUBSW           m14, subpel0, m7, m12, 1 ; a0
+    mova                m15, m2
+    PMADDUBSW           m15, subpel0, m7, m12, 0 ; b0
     mova                 m1, m3
     PMADDUBSW            m3, subpel1, m7, m12, 0 ; a1
+    mova                 m2, m4
+    PMADDUBSW            m4, subpel1, m7, m12, 0 ; b1
     paddw               m14, m3
     mova                 m3, m5
     PMADDUBSW            m5, subpel2, m7, m12, 0 ; a2
-    paddw               m14, m5
-    movq                m12, [srcq+strideq*1]
-    lea                srcq, [srcq+strideq*2]
-    movq                m13, [srcq+strideq*0]
-    shufpd              m15, m0, m12, 0x0d
-    shufpd               m0, m12, m13, 0x0c
-    punpcklbw            m5, m15, m0  ; 67
-    punpckhbw           m15, m0       ; 78
-    mova                m13, m5
-    PMADDUBSW           m13, subpel3, m7, m12, 0 ; a3
-    paddw               m14, m13
-    PMULHRSW_8192       m14, m14, [base+pw_2]
-    movu        [tmpq+wq*0], m14
-    mova                m14, m2
-    PMADDUBSW           m14, subpel0, m7, m12, 0 ; b0
-    mova                 m2, m4
-    PMADDUBSW            m4, subpel1, m7, m12, 0 ; b1
-    paddw               m14, m4
+    paddw               m15, m4
     mova                 m4, m6
     PMADDUBSW            m6, subpel2, m7, m12, 0 ; b2
-    paddw               m14, m6
-    mova                 m6, m15
-    PMADDUBSW           m15, subpel3, m7, m12, 0 ; b3
-    paddw               m14, m15
+    paddw               m15, m6
+    punpcklbw           m12, m0, m13 ; 67
+    movq                 m0, [srcq+strideq*0]
+    punpcklbw           m13, m0      ; 78
+    paddw               m14, m5
+    mova                 m5, m12
+    PMADDUBSW           m12, subpel3, m7, m6, 0  ; a3
+    paddw               m14, m12
+    mova                 m6, m13
+    PMADDUBSW           m13, subpel3, m7, m12, 0 ; b3
+    paddw               m15, m13
     PMULHRSW_8192       m14, m14, [base+pw_2]
-    movu        [tmpq+wq*2], m14
+    PMULHRSW_8192       m15, m15, [base+pw_2]
 %endif
+    movu        [tmpq+wq*0], m14
+    movu        [tmpq+wq*2], m15
     lea                tmpq, [tmpq+wq*4]
     sub                  hd, 2
     jg .v_w8_loop
-    movzx                hd, r5b ; reset vertical loop
+    add                  r5, 8
     add                  r8, 16
-    add                  r6, 8
+    movzx                hd, r6b
+    mov                srcq, r5
     mov                tmpq, r8
-    mov                srcq, r6
-    sub                 r5d, 1<<8 ; horizontal--
+    sub                 r6d, 1<<8
     jg .v_w8_loop0
     RET
 %endif ;ARCH_X86_64
@@ -3363,7 +3255,6 @@
 %undef subpel1
 %undef subpel2
 %undef subpel3
-    ;
 .hv:
     %assign stack_offset org_stack_offset
     cmp                  wd, 4
@@ -3466,13 +3357,13 @@
 %endif
     movq                 m5, [srcq+strideq*0]   ; 0 _ _ _
     movhps               m5, [srcq+strideq*1]   ; 0 _ 1 _
-    movq                 m4, [srcq+strideq*2]   ; 2 _ _ _
 %if ARCH_X86_32
     lea                srcq, [srcq+strideq*2]
-    add                srcq, strideq
-    movhps               m4, [srcq+strideq*0]   ; 2 _ 3 _
-    add                srcq, strideq
+    movq                 m4, [srcq+strideq*0]   ; 2 _ _ _
+    movhps               m4, [srcq+strideq*1]   ; 2 _ 3 _
+    lea                srcq, [srcq+strideq*2]
 %else
+    movq                 m4, [srcq+strideq*2]   ; 2 _ _ _
     movhps               m4, [srcq+stride3q ]   ; 2 _ 3 _
     lea                srcq, [srcq+strideq*4]
 %endif
@@ -3506,7 +3397,14 @@
 %endif
     movq                 m5, [srcq+strideq*0]   ; 4 _ _ _
     movhps               m5, [srcq+strideq*1]   ; 4 _ 5 _
+%if ARCH_X86_32
+    lea                srcq, [srcq+strideq*2]
+    movq                 m4, [srcq+strideq*0]   ; 6 _ _ _
+    add                srcq, strideq
+%else
     movq                 m4, [srcq+strideq*2]   ; 6 _ _ _
+    add                srcq, stride3q
+%endif
     PSHUFB_SUBPEL_H_4a   m3, m5, m6, m1, m2, 0    ;H subpel_h_shuf4 4~5~
     PSHUFB_SUBPEL_H_4a   m0, m4, m6, m1, m2, 0    ;H subpel_h_shuf4 6~6~
     PMADDUBSW            m3, m7, m1, m2, 1        ;H subpel_filters
@@ -3531,12 +3429,6 @@
     mova                 m2, [esp+mmsize*4]
  %endif
 %endif
-%if ARCH_X86_32
-    lea                srcq, [srcq+strideq*2]
-    add                srcq, strideq
-%else
-    add                srcq, stride3q
-%endif
     ;process high
     PALIGNR              m4, m3, m2, 4;V 1 2 3 4
     punpcklwd            m1, m2, m4  ; V 01 12
@@ -3572,7 +3464,6 @@
   %define m15 m3
  %endif
 %endif
-    ;
 %if cpuflag(ssse3)
     mova                 m6, [base+subpel_h_shuf4]
 %endif
@@ -3620,7 +3511,6 @@
     mova         [esp+0xA0], m5
  %endif
 %endif
-    ;
 %if cpuflag(ssse3)
     mova                 m6, [base+subpel_h_shuf4+16]
 %endif
@@ -3644,7 +3534,6 @@
     paddd                m5, d32reg ; pd_32
     paddd                m5, m4
     psrad                m4, m5, 6
-    ;
     RESTORELINE_W4       m5, 5, 0
     packssdw             m5, m4
     pshufd               m5, m5, q3120
@@ -3666,7 +3555,6 @@
 %undef subpelv1
 %undef subpelv2
 %undef subpelv3
-    ;
 .hv_w8:
     %assign stack_offset org_stack_offset
 %define hv8_line_1 0
@@ -3699,20 +3587,20 @@
   %define              tmpm  [rsp+mmsize*13+gprsize*1]
   %define              srcm  [rsp+mmsize*13+gprsize*2]
   %define           stridem  [rsp+mmsize*13+gprsize*3]
+    mov                tmpm, tmpq
     mov             stridem, strideq
  %endif
+ %if cpuflag(ssse3)
     pshufd               m0, m1, q0000
     pshufd               m1, m1, q1111
-    punpcklbw            m5, m5
- %if notcpuflag(ssse3)
-    punpcklbw            m0, m0
+ %else
     punpcklbw            m1, m1
- %endif
-    psraw                m5, 8
- %if notcpuflag(ssse3)
-    psraw                m0, 8
     psraw                m1, 8
+    pshufd               m0, m1, q1010
+    punpckhqdq           m1, m1
  %endif
+    punpcklbw            m5, m5
+    psraw                m5, 8
     pshufd               m2, m5, q0000
     pshufd               m3, m5, q1111
     pshufd               m4, m5, q2222
@@ -3742,38 +3630,31 @@
     cmp                  hd, 6
     cmovs               myd, mxd
     movq                 m1, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
+ %if cpuflag(ssse3)
     pshufd         subpelh0, m0, q0000
     pshufd         subpelh1, m0, q1111
-    punpcklbw            m1, m1
- %if notcpuflag(ssse3)
-    punpcklbw      subpelh0, subpelh0
-    punpcklbw      subpelh1, subpelh1
+ %else
+    punpcklbw            m0, m0
+    psraw                m0, 8
+    pshufd         subpelh0, m0, q1010
+    pshufd         subpelh1, m0, q3232
+    mova                 m7, [base+pw_2]
  %endif
+    punpcklbw            m1, m1
     psraw                m1, 8
- %if notcpuflag(ssse3)
-    psraw          subpelh0, 8
-    psraw          subpelh1, 8
- %endif
     pshufd         subpelv0, m1, q0000
     pshufd         subpelv1, m1, q1111
     pshufd         subpelv2, m1, q2222
     pshufd         subpelv3, m1, q3333
- %if notcpuflag(ssse3)
-    mova                 m7, [base+pw_2]
- %endif
     lea            stride3q, [strideq*3]
     sub                srcq, 3
     sub                srcq, stride3q
     mov                  r6, srcq
-%endif
-    lea                 r5d, [wq-4]
-%if ARCH_X86_64
     mov                  r8, tmpq
-%else
-    mov                tmpm, tmpq
 %endif
-    shl                 r5d, (16 - 2)
-    mov                 r5w, hw
+    lea                 r5d, [wq-4]
+    shl                 r5d, 14
+    add                 r5d, hd
 .hv_w8_loop0:
 %if cpuflag(ssse3)
  %if ARCH_X86_64
@@ -3791,24 +3672,24 @@
 %endif
     PREP_8TAP_HV         m4, srcq+strideq*0, m7, m0
     PREP_8TAP_HV         m5, srcq+strideq*1, m7, m0
+%if ARCH_X86_64
+    PREP_8TAP_HV         m6, srcq+strideq*2, m7, m0
+    add                srcq, stride3q
+    PREP_8TAP_HV         m0, srcq+strideq*0, m7, m9
+%else
     lea                srcq, [srcq+strideq*2]
-%if notcpuflag(ssse3)
- %if ARCH_X86_64
-    SWAP                 m9, m4
- %else
+ %if notcpuflag(ssse3)
     mova              [esp], m4
  %endif
-%endif
     PREP_8TAP_HV         m6, srcq+strideq*0, m7, m4
     PREP_8TAP_HV         m0, srcq+strideq*1, m7, m4
     lea                srcq, [srcq+strideq*2]
+%endif
 %if cpuflag(ssse3)
     mova                 m7, [base+pw_8192]
 %else
     mova                 m7, [base+pw_2]
- %if ARCH_X86_64
-    SWAP                 m4, m9
- %else
+ %if ARCH_X86_32
     mova                 m4, [esp]
  %endif
 %endif
@@ -3824,28 +3705,26 @@
     SAVELINE_W8           3, m3
 %if cpuflag(ssse3)
     mova                 m7, [base+subpel_h_shufA]
+%endif
+%if ARCH_X86_64
+    PREP_8TAP_HV         m4, srcq+strideq*1, m8, m9
+    PREP_8TAP_HV         m5, srcq+strideq*2, m8, m9
+    add                srcq, stride3q
+    PREP_8TAP_HV         m6, srcq+strideq*0, m8, m9
 %else
- %if ARCH_X86_64
-    SWAP                 m8, m7
-    SWAP                 m9, m0
- %else
+ %if notcpuflag(ssse3)
     mova         [esp+0x30], m0
  %endif
-%endif
     PREP_8TAP_HV         m4, srcq+strideq*0, m7, m0
     PREP_8TAP_HV         m5, srcq+strideq*1, m7, m0
-    PREP_8TAP_HV         m6, srcq+strideq*2, m7, m0
     lea                srcq, [srcq+strideq*2]
+    PREP_8TAP_HV         m6, srcq+strideq*0, m7, m0
+%endif
 %if cpuflag(ssse3)
     mova                 m7, [base+pw_8192]
-%else
- %if ARCH_X86_64
-    SWAP                 m0, m9
-    SWAP                 m7, m8
- %else
+%elif ARCH_X86_32
     mova                 m0, [esp+0x30]
     mova                 m7, [base+pw_2]
- %endif
 %endif
     PMULHRSW_8192        m1, m4, m7
     PMULHRSW_8192        m2, m5, m7
@@ -3902,8 +3781,8 @@
  %endif
 %endif
     PREP_8TAP_HV         m0, srcq+strideq*1, m5, m6
-    PREP_8TAP_HV         m4, srcq+strideq*2, m5, m6
     lea                srcq, [srcq+strideq*2]
+    PREP_8TAP_HV         m4, srcq+strideq*0, m5, m6
 %if cpuflag(ssse3)
     mova                 m5, [base+pw_8192]
 %else
@@ -3933,19 +3812,20 @@
     RESTORELINE_W8        4, m4
     jmp .hv_w8_loop
 .hv_w8_outer:
-    movzx                hd, r5w
 %if ARCH_X86_32
     mov                srcq, srcm
     mov                tmpq, tmpm
+    movzx                hd, r5w
     add                srcq, 4
     add                tmpq, 8
     mov                srcm, srcq
     mov                tmpm, tmpq
 %else
-    add                  r8, 8
-    mov                tmpq, r8
     add                  r6, 4
+    add                  r8, 8
+    movzx                hd, r5b
     mov                srcq, r6
+    mov                tmpq, r8
 %endif
     sub                 r5d, 1<<16
     jg .hv_w8_loop0
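
For readers following the register bookkeeping in the hunks above: the rewritten .v_w8/.hv_w8 outer loops keep the remaining column count and the row count packed in a single GPR (`lea r6d, [wq*8-64]` / `lea r6d, [hq+r6*4]`, then `movzx hd, r6b` and `sub r6d, 1<<8` per column; the 4-pixel-wide paths use `shl wd, 14` / `movzx hd, r6w` / `sub r6d, 1<<16` the same way). Below is a rough C model of that scheme, not dav1d code, assuming h fits in the low byte as it does for these block sizes:

    #include <assert.h>
    #include <stdint.h>

    /* Rough C model (not dav1d code) of the packed outer-loop counter:
     * low byte = row count h, upper bits = (w/8 - 1) * 256, so
     * "movzx hd, r6b" recovers h and "sub r6d, 1<<8" steps to the next
     * 8-pixel column. Returns how many columns the outer loop walks. */
    static int v_w8_outer_model(int w, int h)
    {
        int cols = 0;
        uint32_t r6 = (uint32_t)h + (uint32_t)(w * 8 - 64) * 4;
        do {
            int rows = (uint8_t)r6;          /* movzx hd, r6b */
            do rows -= 2; while (rows > 0);  /* inner loop emits 2 rows/pass */
            cols++;
            r6 -= 1u << 8;                   /* next 8-pixel column */
        } while ((int32_t)r6 > 0);           /* jg .v_w8_loop0 */
        return cols;
    }

    int main(void)
    {
        assert(v_w8_outer_model(8, 16) == 1);
        assert(v_w8_outer_model(64, 32) == 8);
        assert(v_w8_outer_model(128, 128) == 16);
        return 0;
    }
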
--- a/tests/checkasm/looprestoration.c
+++ b/tests/checkasm/looprestoration.c
@@ -27,6 +27,7 @@
 
 #include "tests/checkasm/checkasm.h"
 
+#include <stdio.h>
 #include <string.h>
 
 #include "src/levels.h"
@@ -33,6 +34,10 @@
 #include "src/looprestoration.h"
 #include "src/tables.h"
 
+static int to_binary(int x) { /* 0-15 -> 0000-1111 */
+    return (x & 1) + 5 * (x & 2) + 25 * (x & 4) + 125 * (x & 8);
+}
+
 static void init_tmp(pixel *buf, const ptrdiff_t stride,
                      const int w, const int h, const int bitdepth_max)
 {
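
The to_binary() helper added in this hunk maps a 4-bit edge mask to a decimal number whose digits mirror the mask bits, so the "%04d" failure message added further down prints the mask in binary form. A standalone sketch (not part of the test suite) confirming the mapping:

    #include <assert.h>

    /* Same formula as the helper above: each mask bit becomes one decimal
     * digit, so printing the result with "%04d" shows the mask in binary. */
    static int to_binary(int x)
    {
        return (x & 1) + 5 * (x & 2) + 25 * (x & 4) + 125 * (x & 8);
    }

    int main(void)
    {
        assert(to_binary(0x0) == 0);    /* "0000" */
        assert(to_binary(0x5) == 101);  /* "0101" */
        assert(to_binary(0xa) == 1010); /* "1010" */
        assert(to_binary(0xf) == 1111); /* "1111" */
        return 0;
    }
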
@@ -47,38 +52,30 @@
     ALIGN_STK_64(pixel, c_dst, 448 * 64,);
     ALIGN_STK_64(pixel, a_dst, 448 * 64,);
     ALIGN_STK_64(pixel, h_edge, 448 * 8,);
+    ALIGN_STK_16(int16_t, filter, 2, [8]);
     pixel left[64][4];
 
     declare_func(void, pixel *dst, ptrdiff_t dst_stride,
                  const pixel (*const left)[4],
                  const pixel *lpf, ptrdiff_t lpf_stride,
-                 int w, int h, const int16_t filterh[7],
-                 const int16_t filterv[7], enum LrEdgeFlags edges
-                 HIGHBD_DECL_SUFFIX);
+                 int w, int h, const int16_t filter[2][8],
+                 enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX);
 
-    for (int pl = 0; pl < 2; pl++) {
-        if (check_func(c->wiener, "wiener_%s_%dbpc",
-                       pl ? "chroma" : "luma", bpc))
-        {
-            int16_t filter[2][3], filter_v[7], filter_h[7];
+    for (int t = 0; t < 2; t++) {
+        if (check_func(c->wiener[t], "wiener_%dtap_%dbpc", t ? 5 : 7, bpc)) {
+            filter[0][0] = filter[0][6] = t ? 0 : (rnd() & 15) - 5;
+            filter[0][1] = filter[0][5] = (rnd() & 31) - 23;
+            filter[0][2] = filter[0][4] = (rnd() & 63) - 17;
+            filter[0][3] = -(filter[0][0] + filter[0][1] + filter[0][2]) * 2;
+#if BITDEPTH != 8
+            filter[0][3] += 128;
+#endif
 
-            filter[0][0] = pl ? 0 : (rnd() & 15) - 5;
-            filter[0][1] = (rnd() & 31) - 23;
-            filter[0][2] = (rnd() & 63) - 17;
-            filter[1][0] = pl ? 0 : (rnd() & 15) - 5;
-            filter[1][1] = (rnd() & 31) - 23;
-            filter[1][2] = (rnd() & 63) - 17;
+            filter[1][0] = filter[1][6] = t ? 0 : (rnd() & 15) - 5;
+            filter[1][1] = filter[1][5] = (rnd() & 31) - 23;
+            filter[1][2] = filter[1][4] = (rnd() & 63) - 17;
+            filter[1][3] = 128 - (filter[1][0] + filter[1][1] + filter[1][2]) * 2;
 
-            filter_h[0] = filter_h[6] = filter[0][0];
-            filter_h[1] = filter_h[5] = filter[0][1];
-            filter_h[2] = filter_h[4] = filter[0][2];
-            filter_h[3] = -((filter_h[0] + filter_h[1] + filter_h[2]) * 2);
-
-            filter_v[0] = filter_v[6] = filter[1][0];
-            filter_v[1] = filter_v[5] = filter[1][1];
-            filter_v[2] = filter_v[4] = filter[1][2];
-            filter_v[3] = -((filter_v[0] + filter_v[1] + filter_v[2]) * 2);
-
             const int base_w = 1 + (rnd() % 384);
             const int base_h = 1 + (rnd() & 63);
             const int bitdepth_max = (1 << bpc) - 1;
@@ -95,17 +92,22 @@
 
                 call_ref(c_dst + 32, 448 * sizeof(pixel), left,
                          h_edge + 32, 448 * sizeof(pixel),
-                         w, h, filter_h, filter_v, edges HIGHBD_TAIL_SUFFIX);
+                         w, h, filter, edges HIGHBD_TAIL_SUFFIX);
                 call_new(a_dst + 32, 448 * sizeof(pixel), left,
                          h_edge + 32, 448 * sizeof(pixel),
-                         w, h, filter_h, filter_v, edges HIGHBD_TAIL_SUFFIX);
-                checkasm_check_pixel(c_dst + 32, 448 * sizeof(pixel),
-                                     a_dst + 32, 448 * sizeof(pixel),
-                                     w, h, "dst");
+                         w, h, filter, edges HIGHBD_TAIL_SUFFIX);
+                if (checkasm_check_pixel(c_dst + 32, 448 * sizeof(pixel),
+                                         a_dst + 32, 448 * sizeof(pixel),
+                                         w, h, "dst"))
+                {
+                    fprintf(stderr, "size = %dx%d, edges = %04d\n",
+                            w, h, to_binary(edges));
+                    break;
+                }
             }
             bench_new(a_dst + 32, 448 * sizeof(pixel), left,
                       h_edge + 32, 448 * sizeof(pixel),
-                      256, 64, filter_h, filter_v, 0xf HIGHBD_TAIL_SUFFIX);
+                      256, 64, filter, 0xf HIGHBD_TAIL_SUFFIX);
         }
     }
 }
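
The rewritten wiener test above packs both filters into one int16_t filter[2][8] array: symmetric taps, the outermost tap forced to 0 for the 5-tap variant, and a derived centre tap so that the horizontal taps sum to 0 (plus 128 on the centre for bitdepths above 8) and the vertical taps sum to 128. A standalone sketch of that construction, with arbitrary values drawn from the same ranges as the rnd() expressions above:

    #include <assert.h>
    #include <stdint.h>

    /* Sketch (not part of the test) of the filter[2][8] layout built above. */
    int main(void)
    {
        int16_t filter[2][8] = { { 0 } };
        filter[0][0] = filter[0][6] = 3;    /* (rnd() & 15) - 5, 0 for 5-tap */
        filter[0][1] = filter[0][5] = -11;  /* (rnd() & 31) - 23 */
        filter[0][2] = filter[0][4] = 20;   /* (rnd() & 63) - 17 */
        filter[0][3] = -(filter[0][0] + filter[0][1] + filter[0][2]) * 2;

        filter[1][0] = filter[1][6] = 0;    /* 5-tap vertical variant */
        filter[1][1] = filter[1][5] = -7;
        filter[1][2] = filter[1][4] = 25;
        filter[1][3] = 128 - (filter[1][0] + filter[1][1] + filter[1][2]) * 2;

        int hsum = 0, vsum = 0;
        for (int i = 0; i < 7; i++) {
            hsum += filter[0][i];
            vsum += filter[1][i];
        }
        assert(hsum == 0);   /* 8 bpc case; +128 on filter[0][3] otherwise */
        assert(vsum == 128);
        return 0;
    }
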
--- /dev/null
+++ b/tests/header_test.c
@@ -1,0 +1,33 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include DAV1D_TEST_HEADER
+
+int main()
+{
+    return 0;
+}
--- a/tests/header_test.c.in
+++ /dev/null
@@ -1,33 +1,0 @@
-/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- *    list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <dav1d/INPUT>
-
-int main()
-{
-    return 0;
-}
--- a/tests/libfuzzer/dav1d_fuzzer.c
+++ b/tests/libfuzzer/dav1d_fuzzer.c
@@ -31,6 +31,7 @@
 #include <stddef.h>
 #include <stdint.h>
 #include <string.h>
+#include <stdlib.h>
 
 #include <dav1d/dav1d.h>
 #include "src/cpu.h"
@@ -38,8 +39,6 @@
 
 #ifdef DAV1D_ALLOC_FAIL
 
-#include <stdlib.h>
-
 #include "alloc_fail.h"
 
 static unsigned djb_xor(const uint8_t * c, size_t len) {
@@ -56,6 +55,39 @@
 
 #define DAV1D_FUZZ_MAX_SIZE 4096 * 4096
 
+// search for "--cpumask xxx" in argv and remove both parameters
+int LLVMFuzzerInitialize(int *argc, char ***argv) {
+    int i = 1;
+    for (; i < *argc; i++) {
+        if (!strcmp((*argv)[i], "--cpumask")) {
+            const char * cpumask = (*argv)[i+1];
+            if (cpumask) {
+                char *end;
+                unsigned res;
+                if (!strncmp(cpumask, "0x", 2)) {
+                    cpumask += 2;
+                    res = (unsigned) strtoul(cpumask, &end, 16);
+                } else {
+                    res = (unsigned) strtoul(cpumask, &end, 0);
+                }
+                if (end != cpumask && !end[0]) {
+                    dav1d_set_cpu_flags_mask(res);
+                }
+            }
+            break;
+        }
+    }
+
+    for (; i < *argc - 2; i++) {
+        (*argv)[i] = (*argv)[i + 2];
+    }
+
+    *argc = i;
+
+    return 0;
+}
+
+
 // expects ivf input
 
 int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size)
@@ -146,12 +178,19 @@
             dav1d_data_unref(&buf);
     }
 
-    do {
-        memset(&pic, 0, sizeof(pic));
-        err = dav1d_get_picture(ctx, &pic);
-        if (err == 0)
-            dav1d_picture_unref(&pic);
-    } while (err != DAV1D_ERR(EAGAIN));
+    memset(&pic, 0, sizeof(pic));
+    if ((err = dav1d_get_picture(ctx, &pic)) == 0) {
+        /* Test calling dav1d_picture_unref() after dav1d_close() */
+        do {
+            Dav1dPicture pic2 = { 0 };
+            if ((err = dav1d_get_picture(ctx, &pic2)) == 0)
+                dav1d_picture_unref(&pic2);
+        } while (err != DAV1D_ERR(EAGAIN));
+
+        dav1d_close(&ctx);
+        dav1d_picture_unref(&pic);
+        return 0;
+    }
 
 cleanup:
     dav1d_flush(ctx);
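
The new LLVMFuzzerInitialize() above lets both the libFuzzer harness and the standalone main() accept a leading "--cpumask <mask>" pair, parse it with strtoul(), and then compact argv so the rest of the program still sees only the input file. A minimal standalone model of that argv rewriting (a sketch, not the fuzzer code itself; the mask value is just an example):

    #include <assert.h>
    #include <string.h>

    /* Model of the argv compaction: locate "--cpumask", skip its value
     * (this is where the mask would be parsed), shift the remaining
     * arguments down by two and shrink argc accordingly. */
    static void strip_cpumask(int *argc, char **argv)
    {
        int i = 1;
        for (; i < *argc; i++)
            if (!strcmp(argv[i], "--cpumask"))
                break;
        for (; i < *argc - 2; i++)
            argv[i] = argv[i + 2];  /* close the two-element gap */
        *argc = i;
    }

    int main(void)
    {
        char *argv[] = { "fuzzer", "--cpumask", "0x1", "case.ivf", 0 };
        int argc = 4;
        strip_cpumask(&argc, argv);
        assert(argc == 2 && !strcmp(argv[1], "case.ivf"));
        return 0;
    }
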
--- a/tests/libfuzzer/dav1d_fuzzer.h
+++ b/tests/libfuzzer/dav1d_fuzzer.h
@@ -31,6 +31,7 @@
 #include <stddef.h>
 #include <stdint.h>
 
+int LLVMFuzzerInitialize(int *argc, char ***argv);
 int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size);
 
 #endif /* DAV1D_TESTS_LIBFUZZER_DAV1D_FUZZER_H */
--- a/tests/libfuzzer/main.c
+++ b/tests/libfuzzer/main.c
@@ -40,7 +40,7 @@
 
 // expects ivf input
 
-int main(const int argc, char *const *const argv) {
+int main(int argc, char *argv[]) {
     int ret = -1;
     FILE *f = NULL;
     int64_t fsize;
@@ -47,6 +47,10 @@
     const char *filename = NULL;
     uint8_t *data = NULL;
     size_t size = 0;
+
+    if (LLVMFuzzerInitialize(&argc, &argv)) {
+        return 1;
+    }
 
     if (argc != 2) {
         fprintf(stdout, "Usage:\n%s fuzzing_testcase.ivf\n", argv[0]);
--- a/tests/libfuzzer/meson.build
+++ b/tests/libfuzzer/meson.build
@@ -72,8 +72,15 @@
 
 objcopy = find_program('objcopy',
                        required: false)
+
+if meson.version().version_compare('<0.56.99')
+    lto = get_option('b_lto') ? 'true' : 'false'
+else
+    lto = get_option('b_lto')
+endif
+
 if (objcopy.found() and
-    not get_option('b_lto') and
+    lto == 'false' and
     get_option('default_library') == 'static' and
     cc.has_function('posix_memalign', prefix : '#include <stdlib.h>', args : test_args))
 
--- a/tests/meson.build
+++ b/tests/meson.build
@@ -31,8 +31,6 @@
     subdir_done()
 endif
 
-libdav1d_nasm_objs_if_needed = []
-
 if is_asm_enabled
     checkasm_sources = files(
         'checkasm/checkasm.c',
@@ -62,25 +60,27 @@
         checkasm_bitdepth_objs += checkasm_bitdepth_lib.extract_all_objects()
     endforeach
 
-    checkasm_nasm_objs = []
+    checkasm_asm_objs = []
+    checkasm_asm_sources = []
     if host_machine.cpu_family() == 'aarch64' or host_machine.cpu() == 'arm64'
-        checkasm_sources += files('checkasm/arm/checkasm_64.S')
+        checkasm_asm_sources += files('checkasm/arm/checkasm_64.S')
     elif host_machine.cpu_family().startswith('arm')
-        checkasm_sources += files('checkasm/arm/checkasm_32.S')
+        checkasm_asm_sources += files('checkasm/arm/checkasm_32.S')
     elif host_machine.cpu_family().startswith('x86')
-        checkasm_nasm_objs = nasm_gen.process(files('checkasm/x86/checkasm.asm'))
+        checkasm_asm_objs += nasm_gen.process(files('checkasm/x86/checkasm.asm'))
     endif
 
-    m_lib = cc.find_library('m', required: false)
-
-    if meson.version().version_compare('< 0.48.999')
-        libdav1d_nasm_objs_if_needed = libdav1d_nasm_objs
+    if use_gaspp
+        checkasm_asm_objs += gaspp_gen.process(checkasm_asm_sources)
+    else
+        checkasm_sources += checkasm_asm_sources
     endif
 
+    m_lib = cc.find_library('m', required: false)
+
     checkasm = executable('checkasm',
         checkasm_sources,
-        checkasm_nasm_objs,
-        libdav1d_nasm_objs_if_needed,
+        checkasm_asm_objs,
 
         objects: [
             checkasm_bitdepth_objs,
@@ -98,7 +98,8 @@
             ],
         )
 
-    test('checkasm', checkasm, is_parallel: false)
+    test('checkasm', checkasm, suite: 'checkasm', is_parallel: false)
+    benchmark('checkasm', checkasm, suite: 'checkasm', timeout: 3600, args: '--bench')
 endif
 
 c99_extension_flag = cc.first_supported_argument(
@@ -110,31 +111,21 @@
 
 # dav1d_api_headers
 foreach header : dav1d_api_headers
-    header_file = '@0@'.format(header).split('/')[-1]
-    target = header_file + '_test'
+    target = header + '_test'
 
-    header_test_source = custom_target(target,
-        output : target + '.c',
-        input : 'header_test.c.in',
-        capture : true,
-        command : ['sed', '-e', 's/INPUT/' + header_file + '/', '@INPUT@']
-    )
-
     header_test_exe = executable(target,
-        header_test_source,
+        'header_test.c',
         include_directories: dav1d_inc_dirs,
-        c_args: [c99_extension_flag],
+        c_args: ['-DDAV1D_TEST_HEADER="@0@"'.format(header), c99_extension_flag],
         build_by_default: true
     )
 
-    test(target, header_test_exe)
+    test(target, header_test_exe, suite: 'headers')
 endforeach
 
 
 # fuzzing binaries
-if meson.version().version_compare('>=0.49')
-    subdir('libfuzzer')
-endif
+subdir('libfuzzer')
 
 # Include dav1d test data repository with additional tests
 if get_option('testdata_tests')
--- /dev/null
+++ b/tools/dav1d.manifest
@@ -1,0 +1,10 @@
+<?xml version="1.0" encoding="utf-8" standalone="yes"?>
+<assembly xmlns="urn:schemas-microsoft-com:asm.v1" manifestVersion="1.0">
+    <assemblyIdentity type="win32" name="VideoLAN.dav1d" version="1.0.0.0"/>
+    <application xmlns="urn:schemas-microsoft-com:asm.v3">
+        <windowsSettings>
+            <longPathAware xmlns="http://schemas.microsoft.com/SMI/2016/WindowsSettings">true</longPathAware>
+            <activeCodePage xmlns="http://schemas.microsoft.com/SMI/2019/WindowsSettings">UTF-8</activeCodePage>
+        </windowsSettings>
+    </application>
+</assembly>
--- /dev/null
+++ b/tools/dav1d.rc.in
@@ -1,0 +1,33 @@
+#define API_VERSION_NUMBER @API_VERSION_MAJOR@,@API_VERSION_MINOR@,@API_VERSION_REVISION@,0
+#define API_VERSION_NUMBER_STR "@API_VERSION_MAJOR@.@API_VERSION_MINOR@.@API_VERSION_REVISION@"
+#define PROJECT_VERSION_NUMBER @PROJECT_VERSION_MAJOR@,@PROJECT_VERSION_MINOR@,@PROJECT_VERSION_REVISION@,0
+#define PROJECT_VERSION_NUMBER_STR "@PROJECT_VERSION_MAJOR@.@PROJECT_VERSION_MINOR@.@PROJECT_VERSION_REVISION@"
+
+#include <windows.h>
+
+1 RT_MANIFEST "dav1d.manifest"
+1 VERSIONINFO
+FILETYPE VFT_APP
+FILEOS VOS_NT_WINDOWS32
+PRODUCTVERSION PROJECT_VERSION_NUMBER
+FILEVERSION API_VERSION_NUMBER
+BEGIN
+  BLOCK "StringFileInfo"
+  BEGIN
+    BLOCK "040904E4"
+    BEGIN
+      VALUE "CompanyName", "VideoLAN"
+      VALUE "ProductName", "dav1d"
+      VALUE "ProductVersion", PROJECT_VERSION_NUMBER_STR
+      VALUE "FileVersion", API_VERSION_NUMBER_STR
+      VALUE "FileDescription", "dav1d " PROJECT_VERSION_NUMBER_STR " - AV1 decoder"
+      VALUE "InternalName", "dav1d"
+      VALUE "OriginalFilename", "dav1d.exe"
+      VALUE "LegalCopyright", "Copyright \251 @COPYRIGHT_YEARS@ VideoLAN and dav1d Authors"
+    END
+  END
+  BLOCK "VarFileInfo"
+  BEGIN
+    VALUE "Translation", 0x409, 1252
+  END
+END
--- a/tools/meson.build
+++ b/tools/meson.build
@@ -77,8 +77,24 @@
     'dav1d_cli_parse.c',
 )
 
+if host_machine.system() == 'windows'
+    rc_file = configure_file(
+        input : 'dav1d.rc.in',
+        output : 'dav1d.rc',
+        configuration : rc_data
+    )
+
+    dav1d_rc_obj = winmod.compile_resources(rc_file,
+       depend_files : files('dav1d.manifest'),
+       include_directories : include_directories('.')
+    )
+else
+    dav1d_rc_obj = []
+endif
+
 dav1d = executable('dav1d',
     dav1d_sources,
+    dav1d_rc_obj,
     rev_target, cli_config_h_target,
 
     link_with : [libdav1d, dav1d_input_objs, dav1d_output_objs],