shithub: dav1d

ref: d88abfec51e91832a64f36f90c7c61ce67a79333
parent: 14072e733465b034644dd08cfaffb3bf7ac0a310
author: Henrik Gramner <[email protected]>
date: Tue Sep 25 12:36:45 EDT 2018

x86: MC AVX2

--- a/meson.build
+++ b/meson.build
@@ -189,20 +189,6 @@
     'src/recon.c'
 )
 
-# Build a helper library for each bitdepth
-bitdepth_objs = []
-foreach bitdepth : dav1d_bitdepths
-    bitdepth_lib = static_library(
-        'dav1d_bitdepth_@0@'.format(bitdepth),
-        libdav1d_tmpl_sources, config_h_target,
-        include_directories: dav1d_inc_dirs,
-        c_args: ['-DBITDEPTH=@0@'.format(bitdepth)] + stackalign_flag,
-        install: false,
-        build_by_default: false,
-    )
-    bitdepth_objs += bitdepth_lib.extract_all_objects()
-endforeach
-
 entrypoints_src = files(
     'src/lib.c',
     'src/thread_task.c'
@@ -241,8 +227,12 @@
     libdav1d_sources += files(
         'src/x86/cpu.c',
     )
+    libdav1d_tmpl_sources += files(
+        'src/x86/mc_init.c',
+    )
     libdav1d_sources_asm = files(
         'src/x86/cpuid.asm',
+        'src/x86/mc.asm',
     )
 
     nasm = find_program('nasm')
@@ -280,6 +270,20 @@
 if host_machine.system() == 'windows'
     libdav1d_sources += files('src/win32/thread.c')
 endif
+
+# Build a helper library for each bitdepth
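+# (moved below the per-architecture source additions so that
+#  libdav1d_tmpl_sources already includes files such as src/x86/mc_init.c)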
+bitdepth_objs = []
+foreach bitdepth : dav1d_bitdepths
+    bitdepth_lib = static_library(
+        'dav1d_bitdepth_@0@'.format(bitdepth),
+        libdav1d_tmpl_sources, config_h_target,
+        include_directories: dav1d_inc_dirs,
+        c_args: ['-DBITDEPTH=@0@'.format(bitdepth)] + stackalign_flag,
+        install: false,
+        build_by_default: false,
+    )
+    bitdepth_objs += bitdepth_lib.extract_all_objects()
+endforeach
 
 libdav1d = library('dav1d',
     libdav1d_sources, rev_target, nasm_objs,
--- a/src/mc.c
+++ b/src/mc.c
@@ -530,4 +530,8 @@
     c->w_mask[2] = w_mask_420_c;
     c->warp8x8  = warp_affine_8x8_c;
     c->warp8x8t = warp_affine_8x8t_c;
+
+#if HAVE_ASM && ARCH_X86
+    bitfn(dav1d_mc_dsp_init_x86)(c);
+#endif
 }
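
The bitfn() call added above is what ties the template to the two bitdepth
helper libraries: each one is compiled with a different -DBITDEPTH value (see
the meson loop), and the macro, defined in dav1d's shared bitdepth header,
appends the matching suffix. A minimal sketch of the expansion, using only the
names declared in the mc.h hunk below:

    /* illustrative expansion only; the real macro lives in common/bitdepth.h */
    #if BITDEPTH == 8
    #define bitfn(x) x##_8bpc   /* bitfn(dav1d_mc_dsp_init_x86) -> dav1d_mc_dsp_init_x86_8bpc  */
    #else
    #define bitfn(x) x##_10bpc  /* bitfn(dav1d_mc_dsp_init_x86) -> dav1d_mc_dsp_init_x86_10bpc */
    #endif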
--- a/src/mc.h
+++ b/src/mc.h
@@ -101,4 +101,7 @@
 void dav1d_mc_dsp_init_8bpc(Dav1dMCDSPContext *c);
 void dav1d_mc_dsp_init_10bpc(Dav1dMCDSPContext *c);
 
+void dav1d_mc_dsp_init_x86_8bpc(Dav1dMCDSPContext *c);
+void dav1d_mc_dsp_init_x86_10bpc(Dav1dMCDSPContext *c);
+
 #endif /* __DAV1D_SRC_MC_H__ */
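
The dispatcher behind these declarations lives in src/x86/mc_init.c, which the
meson hunk adds to libdav1d_tmpl_sources but which is not shown in this patch
view. A minimal sketch of its expected shape, assuming dav1d's usual CPU-flag
helper and context field names (dav1d_get_cpu_flags(), DAV1D_X86_CPU_FLAG_AVX2,
c->mc[]) rather than the file's actual contents:

    #include "src/cpu.h"
    #include "src/mc.h"

    void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
        const unsigned flags = dav1d_get_cpu_flags();   /* assumed helper */
        if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; /* assumed flag name */
    #if BITDEPTH == 8
        /* install the AVX2 entry points from mc.asm, e.g.: */
        c->mc[FILTER_2D_BILINEAR] = dav1d_put_bilin_avx2; /* field/enum names assumed */
        /* ... remaining put/prep/avg/w_avg/mask/w_mask_420 pointers ... */
    #endif
    }
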
--- /dev/null
+++ b/src/x86/mc.asm
@@ -1,0 +1,3035 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64 && UNIX64 ; FIXME: Windows
+
+SECTION_RODATA 32
+
+subpel_h_shuf4: db 0,  1,  2,  3,  1,  2,  3,  4,  8,  9, 10, 11,  9, 10, 11, 12
+                db 2,  3,  4,  5,  3,  4,  5,  6, 10, 11, 12, 13, 11, 12, 13, 14
+subpel_h_shufA: db 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
+subpel_h_shufB: db 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
+subpel_h_shufC: db 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+bilin_h_shuf4:  db 1,  0,  2,  1,  3,  2,  4,  3,  9,  8, 10,  9, 11, 10, 12, 11
+bilin_h_shuf8:  db 1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7
+deint_shuf4:    db 0,  4,  1,  5,  2,  6,  3,  7,  4,  8,  5,  9,  6, 10,  7, 11
+
+pw_8:    times 2 dw 8
+pw_26:   times 2 dw 26
+pw_34:   times 2 dw 34
+pw_258:  times 2 dw 258
+pw_512:  times 2 dw 512
+pw_1024: times 2 dw 1024
+pw_2048: times 2 dw 2048
+pw_8192: times 2 dw 8192
+pd_32:   dd 32
+pd_512:  dd 512
+
+cextern mc_subpel_filters
+%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
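+; biased by one 8-byte entry because mx/my filter indices start at 1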
+
+%macro BIDIR_JMP_TABLE 1-7 4, 8, 16, 32, 64, 128
+    %xdefine %1_table (%%table - 2*4)
+    %xdefine %%prefix mangle(private_prefix %+ _%1)
+    %%table:
+    %rep 6
+        dd %%prefix %+ .w%2 - (%%table - 2*4)
+        %rotate 1
+    %endrep
+%endmacro
+
+BIDIR_JMP_TABLE avg_avx2
+BIDIR_JMP_TABLE w_avg_avx2
+BIDIR_JMP_TABLE mask_avx2
+BIDIR_JMP_TABLE w_mask_420_avx2
+
+%macro BASE_JMP_TABLE 3-*
+    %xdefine %1_%2_table (%%table - %3)
+    %xdefine %%base %1_%2
+    %%table:
+    %rep %0 - 2
+        dw %%base %+ _w%3 - %%base
+        %rotate 1
+    %endrep
+%endmacro
+
+%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_avx2.put)
+%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_avx2.prep)
+
+BASE_JMP_TABLE put,  avx2, 2, 4, 8, 16, 32, 64, 128
+BASE_JMP_TABLE prep, avx2,    4, 8, 16, 32, 64, 128
+
+%macro HV_JMP_TABLE 5-*
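+    ; %1 = put/prep, %2 = filter, %3 = isa suffix, %5+ = widths;
+    ; %4 is a bitmask of which tables to emit: 1 = h, 2 = v, 4 = hv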
+    %xdefine %%prefix mangle(private_prefix %+ _%1_%2_%3)
+    %xdefine %%base %1_%3
+    %assign %%types %4
+    %if %%types & 1
+        %xdefine %1_%2_h_%3_table  (%%h  - %5)
+        %%h:
+        %rep %0 - 4
+            dw %%prefix %+ .h_w%5 - %%base
+            %rotate 1
+        %endrep
+        %rotate 4
+    %endif
+    %if %%types & 2
+        %xdefine %1_%2_v_%3_table  (%%v  - %5)
+        %%v:
+        %rep %0 - 4
+            dw %%prefix %+ .v_w%5 - %%base
+            %rotate 1
+        %endrep
+        %rotate 4
+    %endif
+    %if %%types & 4
+        %xdefine %1_%2_hv_%3_table (%%hv - %5)
+        %%hv:
+        %rep %0 - 4
+            dw %%prefix %+ .hv_w%5 - %%base
+            %rotate 1
+        %endrep
+    %endif
+%endmacro
+
+HV_JMP_TABLE put,  8tap,  avx2, 3, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, 8tap,  avx2, 1,    4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put,  bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, bilin, avx2, 7,    4, 8, 16, 32, 64, 128
+
+%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
+
+SECTION .text
+
+INIT_XMM avx2
+DECLARE_REG_TMP 4, 6, 7
+cglobal put_bilin, 4, 8, 8, dst, ds, src, ss, w, h, mxy
+    movifnidn          mxyd, r6m ; mx
+    lea                  t2, [put_avx2]
+    tzcnt                wd, wm
+    movifnidn            hd, hm
+    test               mxyd, mxyd
+    jnz .h
+    mov                mxyd, r7m ; my
+    test               mxyd, mxyd
+    jnz .v
+.put:
+    movzx                wd, word [t2+wq*2+table_offset(put,)]
+    add                  wq, t2
+    lea                  t1, [ssq*3]
+    lea                  t2, [dsq*3]
+    jmp                  wq
+.put_w2:
+    movzx               t0d, word [srcq+ssq*0]
+    movzx               t1d, word [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    mov        [dstq+dsq*0], t0w
+    mov        [dstq+dsq*1], t1w
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .put_w2
+    RET
+.put_w4:
+    mov                 t0d, [srcq+ssq*0]
+    mov                 t1d, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    mov        [dstq+dsq*0], t0d
+    mov        [dstq+dsq*1], t1d
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .put_w4
+    RET
+.put_w8:
+    movq                 m0, [srcq+ssq*0]
+    movq                 m1, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    movq       [dstq+dsq*0], m0
+    movq       [dstq+dsq*1], m1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .put_w8
+    RET
+.put_w16:
+    movu                 m0, [srcq+ssq*0]
+    movu                 m1, [srcq+ssq*1]
+    movu                 m2, [srcq+ssq*2]
+    movu                 m3, [srcq+t1   ]
+    lea                srcq, [srcq+ssq*4]
+    mova       [dstq+dsq*0], m0
+    mova       [dstq+dsq*1], m1
+    mova       [dstq+dsq*2], m2
+    mova       [dstq+t2   ], m3
+    lea                dstq, [dstq+dsq*4]
+    sub                  hd, 4
+    jg .put_w16
+    RET
+INIT_YMM avx2
+.put_w32:
+    movu                 m0, [srcq+ssq*0]
+    movu                 m1, [srcq+ssq*1]
+    movu                 m2, [srcq+ssq*2]
+    movu                 m3, [srcq+t1   ]
+    lea                srcq, [srcq+ssq*4]
+    mova       [dstq+dsq*0], m0
+    mova       [dstq+dsq*1], m1
+    mova       [dstq+dsq*2], m2
+    mova       [dstq+t2   ], m3
+    lea                dstq, [dstq+dsq*4]
+    sub                  hd, 4
+    jg .put_w32
+    RET
+.put_w64:
+    movu                 m0, [srcq+ssq*0+32*0]
+    movu                 m1, [srcq+ssq*0+32*1]
+    movu                 m2, [srcq+ssq*1+32*0]
+    movu                 m3, [srcq+ssq*1+32*1]
+    lea                srcq, [srcq+ssq*2]
+    mova  [dstq+dsq*0+32*0], m0
+    mova  [dstq+dsq*0+32*1], m1
+    mova  [dstq+dsq*1+32*0], m2
+    mova  [dstq+dsq*1+32*1], m3
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .put_w64
+    RET
+.put_w128:
+    movu                 m0, [srcq+32*0]
+    movu                 m1, [srcq+32*1]
+    movu                 m2, [srcq+32*2]
+    movu                 m3, [srcq+32*3]
+    add                srcq, ssq
+    mova        [dstq+32*0], m0
+    mova        [dstq+32*1], m1
+    mova        [dstq+32*2], m2
+    mova        [dstq+32*3], m3
+    add                dstq, dsq
+    dec                  hd
+    jg .put_w128
+    RET
+.h:
+    ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
+    ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
+    imul               mxyd, 0xff01
+    vbroadcasti128       m4, [bilin_h_shuf8]
+    add                mxyd, 16 << 8
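+    ; mxyd is now ((16-mx) << 8) | mx; pmaddubsw of the {src[x+1], src[x]}
+    ; byte pairs produced by bilin_h_shuf* with this weight word yields
+    ; mx*src[x+1] + (16-mx)*src[x], and pmulhrsw by 2048 does the (+8) >> 4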
+    movd                xm5, mxyd
+    mov                mxyd, r7m ; my
+    vpbroadcastw         m5, xm5
+    test               mxyd, mxyd
+    jnz .hv
+    movzx                wd, word [t2+wq*2+table_offset(put, _bilin_h)]
+    vpbroadcastd         m6, [pw_2048]
+    add                  wq, t2
+    jmp                  wq
+.h_w2:
+    movd                xm0, [srcq+ssq*0]
+    pinsrd              xm0, [srcq+ssq*1], 1
+    lea                srcq, [srcq+ssq*2]
+    pshufb              xm0, xm4
+    pmaddubsw           xm0, xm5
+    pmulhrsw            xm0, xm6
+    packuswb            xm0, xm0
+    pextrw     [dstq+dsq*0], xm0, 0
+    pextrw     [dstq+dsq*1], xm0, 2
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .h_w2
+    RET
+.h_w4:
+    mova                xm4, [bilin_h_shuf4]
+.h_w4_loop:
+    movq                xm0, [srcq+ssq*0]
+    movhps              xm0, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    pshufb              xm0, xm4
+    pmaddubsw           xm0, xm5
+    pmulhrsw            xm0, xm6
+    packuswb            xm0, xm0
+    movd       [dstq+dsq*0], xm0
+    pextrd     [dstq+dsq*1], xm0, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .h_w4_loop
+    RET
+.h_w8:
+    movu                xm0, [srcq+ssq*0]
+    movu                xm1, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    pshufb              xm0, xm4
+    pshufb              xm1, xm4
+    pmaddubsw           xm0, xm5
+    pmaddubsw           xm1, xm5
+    pmulhrsw            xm0, xm6
+    pmulhrsw            xm1, xm6
+    packuswb            xm0, xm1
+    movq       [dstq+dsq*0], xm0
+    movhps     [dstq+dsq*1], xm0
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .h_w8
+    RET
+.h_w16:
+    movu                xm0,     [srcq+ssq*0+8*0]
+    vinserti128          m0, m0, [srcq+ssq*1+8*0], 1
+    movu                xm1,     [srcq+ssq*0+8*1]
+    vinserti128          m1, m1, [srcq+ssq*1+8*1], 1
+    lea                srcq,     [srcq+ssq*2]
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+    pmulhrsw             m0, m6
+    pmulhrsw             m1, m6
+    packuswb             m0, m1
+    mova         [dstq+dsq*0], xm0
+    vextracti128 [dstq+dsq*1], m0, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .h_w16
+    RET
+.h_w32:
+    movu                 m0, [srcq+8*0]
+    movu                 m1, [srcq+8*1]
+    add                srcq, ssq
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+    pmulhrsw             m0, m6
+    pmulhrsw             m1, m6
+    packuswb             m0, m1
+    mova             [dstq], m0
+    add                dstq, dsq
+    dec                  hd
+    jg .h_w32
+    RET
+.h_w64:
+    movu                 m0, [srcq+8*0]
+    movu                 m1, [srcq+8*1]
+    movu                 m2, [srcq+8*4]
+    movu                 m3, [srcq+8*5]
+    add                srcq, ssq
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pshufb               m2, m4
+    pshufb               m3, m4
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+    pmaddubsw            m2, m5
+    pmaddubsw            m3, m5
+    pmulhrsw             m0, m6
+    pmulhrsw             m1, m6
+    pmulhrsw             m2, m6
+    pmulhrsw             m3, m6
+    packuswb             m0, m1
+    packuswb             m2, m3
+    mova        [dstq+32*0], m0
+    mova        [dstq+32*1], m2
+    add                dstq, dsq
+    dec                  hd
+    jg .h_w64
+    RET
+.h_w128:
+    mov                  t1, -32*3
+.h_w128_loop:
+    movu                 m0, [srcq+t1+32*3+8*0]
+    movu                 m1, [srcq+t1+32*3+8*1]
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+    pmulhrsw             m0, m6
+    pmulhrsw             m1, m6
+    packuswb             m0, m1
+    mova     [dstq+t1+32*3], m0
+    add                  t1, 32
+    jle .h_w128_loop
+    add                srcq, ssq
+    add                dstq, dsq
+    dec                  hd
+    jg .h_w128
+    RET
+.v:
+    movzx                wd, word [t2+wq*2+table_offset(put, _bilin_v)]
+    imul               mxyd, 0xff01
+    vpbroadcastd         m7, [pw_2048]
+    add                mxyd, 16 << 8
+    add                  wq, t2
+    movd                xm6, mxyd
+    vpbroadcastw         m6, xm6
+    jmp                  wq
+.v_w2:
+    movd                xm0,      [srcq+ssq*0]
+.v_w2_loop:
+    pinsrw              xm1, xm0, [srcq+ssq*1], 1 ; 0 1
+    lea                srcq,      [srcq+ssq*2]
+    pinsrw              xm0, xm1, [srcq+ssq*0], 0 ; 2 1
+    pshuflw             xm1, xm1, q2301           ; 1 0
+    punpcklbw           xm1, xm0, xm1
+    pmaddubsw           xm1, xm6
+    pmulhrsw            xm1, xm7
+    packuswb            xm1, xm1
+    pextrw     [dstq+dsq*0], xm1, 1
+    pextrw     [dstq+dsq*1], xm1, 0
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .v_w2_loop
+    RET
+.v_w4:
+    movd                xm0, [srcq+ssq*0]
+.v_w4_loop:
+    vpbroadcastd        xm1, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    vpblendd            xm2, xm1, xm0, 0x01 ; 0 1
+    vpbroadcastd        xm0, [srcq+ssq*0]
+    vpblendd            xm1, xm1, xm0, 0x02 ; 1 2
+    punpcklbw           xm1, xm2
+    pmaddubsw           xm1, xm6
+    pmulhrsw            xm1, xm7
+    packuswb            xm1, xm1
+    movd       [dstq+dsq*0], xm1
+    pextrd     [dstq+dsq*1], xm1, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .v_w4_loop
+    RET
+.v_w8:
+    movq                xm0, [srcq+ssq*0]
+.v_w8_loop:
+    vpbroadcastq        xm1, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    vpblendd            xm2, xm1, xm0, 0x03 ; 0 1
+    vpbroadcastq        xm0, [srcq+ssq*0]
+    vpblendd            xm1, xm1, xm0, 0x0c ; 1 2
+    punpcklbw           xm3, xm1, xm2
+    punpckhbw           xm1, xm2
+    pmaddubsw           xm3, xm6
+    pmaddubsw           xm1, xm6
+    pmulhrsw            xm3, xm7
+    pmulhrsw            xm1, xm7
+    packuswb            xm3, xm1
+    movq       [dstq+dsq*0], xm3
+    movhps     [dstq+dsq*1], xm3
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .v_w8_loop
+    RET
+.v_w16:
+    movu                xm0, [srcq+ssq*0]
+.v_w16_loop:
+    vbroadcasti128       m2, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    vpblendd             m3, m2, m0, 0x0f ; 0 1
+    vbroadcasti128       m0, [srcq+ssq*0]
+    vpblendd             m2, m2, m0, 0xf0 ; 1 2
+    punpcklbw            m1, m2, m3
+    punpckhbw            m2, m3
+    pmaddubsw            m1, m6
+    pmaddubsw            m2, m6
+    pmulhrsw             m1, m7
+    pmulhrsw             m2, m7
+    packuswb             m1, m2
+    mova         [dstq+dsq*0], xm1
+    vextracti128 [dstq+dsq*1], m1, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .v_w16_loop
+    RET
+.v_w32:
+%macro PUT_BILIN_V_W32 0
+    movu                 m0, [srcq+ssq*0]
+%%loop:
+    movu                 m4, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    punpcklbw            m1, m4, m0
+    punpckhbw            m3, m4, m0
+    movu                 m0, [srcq+ssq*0]
+    punpcklbw            m2, m0, m4
+    punpckhbw            m4, m0, m4
+    pmaddubsw            m1, m6
+    pmaddubsw            m3, m6
+    pmaddubsw            m2, m6
+    pmaddubsw            m4, m6
+    pmulhrsw             m1, m7
+    pmulhrsw             m3, m7
+    pmulhrsw             m2, m7
+    pmulhrsw             m4, m7
+    packuswb             m1, m3
+    packuswb             m2, m4
+    mova       [dstq+dsq*0], m1
+    mova       [dstq+dsq*1], m2
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg %%loop
+%endmacro
+    PUT_BILIN_V_W32
+    RET
+.v_w64:
+    movu                 m0, [srcq+32*0]
+    movu                 m1, [srcq+32*1]
+.v_w64_loop:
+    add                srcq, ssq
+    movu                 m3, [srcq+32*0]
+    movu                 m4, [srcq+32*1]
+    punpcklbw            m2, m3, m0
+    punpckhbw            m5, m3, m0
+    pmaddubsw            m2, m6
+    pmaddubsw            m5, m6
+    mova                 m0, m3
+    pmulhrsw             m2, m7
+    pmulhrsw             m5, m7
+    packuswb             m2, m5
+    punpcklbw            m3, m4, m1
+    punpckhbw            m5, m4, m1
+    pmaddubsw            m3, m6
+    pmaddubsw            m5, m6
+    mova                 m1, m4
+    pmulhrsw             m3, m7
+    pmulhrsw             m5, m7
+    packuswb             m3, m5
+    mova        [dstq+32*0], m2
+    mova        [dstq+32*1], m3
+    add                dstq, dsq
+    dec                  hd
+    jg .v_w64_loop
+    RET
+.v_w128:
+    mov                  t0, dstq
+    mov                  t1, srcq
+    lea                 t2d, [hq+(3<<8)]
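+    ; t2d packs the remaining 32-pixel column count (bits 8+) with h
+    ; (low byte), which is reloaded from t2b before each column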
+.v_w128_loop:
+    PUT_BILIN_V_W32
+    mov                  hb, t2b
+    add                  t0, 32
+    add                  t1, 32
+    mov                dstq, t0
+    mov                srcq, t1
+    sub                 t2d, 1<<8
+    jg .v_w128_loop
+    RET
+.hv:
+    ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
+    ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
+    movzx                wd, word [t2+wq*2+table_offset(put, _bilin_hv)]
+    shl                mxyd, 11 ; can't shift by 12 due to signed overflow
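+    ; doubling the row difference and taking pmulhw against (my << 11)
+    ; yields (my * diff) >> 4 while staying within signed 16-bit range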
+    vpbroadcastd         m7, [pw_2048]
+    movd                xm6, mxyd
+    add                  wq, t2
+    vpbroadcastw         m6, xm6
+    jmp                  wq
+.hv_w2:
+    vpbroadcastd        xm0, [srcq+ssq*0]
+    pshufb              xm0, xm4
+    pmaddubsw           xm0, xm5
+.hv_w2_loop:
+    movd                xm1, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    pinsrd              xm1, [srcq+ssq*0], 1
+    pshufb              xm1, xm4
+    pmaddubsw           xm1, xm5             ; 1 _ 2 _
+    shufps              xm2, xm0, xm1, q1032 ; 0 _ 1 _
+    mova                xm0, xm1
+    psubw               xm1, xm2
+    paddw               xm1, xm1
+    pmulhw              xm1, xm6
+    paddw               xm1, xm2
+    pmulhrsw            xm1, xm7
+    packuswb            xm1, xm1
+    pextrw     [dstq+dsq*0], xm1, 0
+    pextrw     [dstq+dsq*1], xm1, 2
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .hv_w2_loop
+    RET
+.hv_w4:
+    mova                xm4, [bilin_h_shuf4]
+    movddup             xm0, [srcq+ssq*0]
+    pshufb              xm0, xm4
+    pmaddubsw           xm0, xm5
+.hv_w4_loop:
+    movq                xm1, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    movhps              xm1, [srcq+ssq*0]
+    pshufb              xm1, xm4
+    pmaddubsw           xm1, xm5             ; 1 2
+    shufps              xm2, xm0, xm1, q1032 ; 0 1
+    mova                xm0, xm1
+    psubw               xm1, xm2
+    paddw               xm1, xm1
+    pmulhw              xm1, xm6
+    paddw               xm1, xm2
+    pmulhrsw            xm1, xm7
+    packuswb            xm1, xm1
+    movd       [dstq+dsq*0], xm1
+    pextrd     [dstq+dsq*1], xm1, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .hv_w4_loop
+    RET
+.hv_w8:
+    vbroadcasti128       m0,     [srcq+ssq*0]
+    pshufb               m0, m4
+    pmaddubsw            m0, m5
+.hv_w8_loop:
+    movu                xm1,     [srcq+ssq*1]
+    lea                srcq,     [srcq+ssq*2]
+    vinserti128          m1, m1, [srcq+ssq*0], 1
+    pshufb               m1, m4
+    pmaddubsw            m1, m5           ; 1 2
+    vperm2i128           m2, m0, m1, 0x21 ; 0 1
+    mova                 m0, m1
+    psubw                m1, m2
+    paddw                m1, m1
+    pmulhw               m1, m6
+    paddw                m1, m2
+    pmulhrsw             m1, m7
+    vextracti128        xm2, m1, 1
+    packuswb            xm1, xm2
+    movq       [dstq+dsq*0], xm1
+    movhps     [dstq+dsq*1], xm1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .hv_w8_loop
+    RET
+.hv_w16:
+    movu                 m0,     [srcq+ssq*0+8*0]
+    vinserti128          m0, m0, [srcq+ssq*0+8*1], 1
+    pshufb               m0, m4
+    pmaddubsw            m0, m5
+.hv_w16_loop:
+    movu                xm2,     [srcq+ssq*1+8*0]
+    vinserti128          m2, m2, [srcq+ssq*1+8*1], 1
+    lea                srcq,     [srcq+ssq*2]
+    movu                xm3,     [srcq+ssq*0+8*0]
+    vinserti128          m3, m3, [srcq+ssq*0+8*1], 1
+    pshufb               m2, m4
+    pshufb               m3, m4
+    pmaddubsw            m2, m5
+    psubw                m1, m2, m0
+    paddw                m1, m1
+    pmulhw               m1, m6
+    paddw                m1, m0
+    pmaddubsw            m0, m3, m5
+    psubw                m3, m0, m2
+    paddw                m3, m3
+    pmulhw               m3, m6
+    paddw                m3, m2
+    pmulhrsw             m1, m7
+    pmulhrsw             m3, m7
+    packuswb             m1, m3
+    vpermq               m1, m1, q3120
+    mova         [dstq+dsq*0], xm1
+    vextracti128 [dstq+dsq*1], m1, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .hv_w16_loop
+    RET
+.hv_w32:
+%macro PUT_BILIN_HV_W32 0
+    movu                 m0,     [srcq+8*0]
+    vinserti128          m0, m0, [srcq+8*2], 1
+    movu                 m1,     [srcq+8*1]
+    vinserti128          m1, m1, [srcq+8*3], 1
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+%%loop:
+    add                srcq, ssq
+    movu                xm2,     [srcq+8*1]
+    vinserti128          m2, m2, [srcq+8*3], 1
+    pshufb               m2, m4
+    pmaddubsw            m2, m5
+    psubw                m3, m2, m1
+    paddw                m3, m3
+    pmulhw               m3, m6
+    paddw                m3, m1
+    mova                 m1, m2
+    pmulhrsw             m8, m3, m7
+ASSERT UNIX64 ; using an additional vector register here
+    movu                xm2,     [srcq+8*0]
+    vinserti128          m2, m2, [srcq+8*2], 1
+    pshufb               m2, m4
+    pmaddubsw            m2, m5
+    psubw                m3, m2, m0
+    paddw                m3, m3
+    pmulhw               m3, m6
+    paddw                m3, m0
+    mova                 m0, m2
+    pmulhrsw             m3, m7
+    packuswb             m3, m8
+    mova             [dstq], m3
+    add                dstq, dsq
+    dec                  hd
+    jg %%loop
+%endmacro
+    PUT_BILIN_HV_W32
+    RET
+.hv_w64:
+    mov                  t0, dstq
+    mov                  t1, srcq
+    lea                 t2d, [hq+(1<<8)]
+.hv_w64_loop:
+    PUT_BILIN_HV_W32
+    mov                  hb, t2b
+    add                  t0, 32
+    add                  t1, 32
+    mov                dstq, t0
+    mov                srcq, t1
+    sub                 t2d, 1<<8
+    jg .hv_w64_loop
+    RET
+.hv_w128:
+    mov                  t0, dstq
+    mov                  t1, srcq
+    lea                 t2d, [hq+(3<<8)]
+.hv_w128_loop:
+    PUT_BILIN_HV_W32
+    mov                  hb, t2b
+    add                  t0, 32
+    add                  t1, 32
+    mov                dstq, t0
+    mov                srcq, t1
+    sub                 t2d, 1<<8
+    jg .hv_w128_loop
+    RET
+
+DECLARE_REG_TMP 3, 5, 6
+cglobal prep_bilin, 3, 7, 7, tmp, src, stride, w, h, mxy, stride3
+    movifnidn          mxyd, r5m ; mx
+    lea                  t2, [prep_avx2]
+    tzcnt                wd, wm
+    movifnidn            hd, hm
+    test               mxyd, mxyd
+    jnz .h
+    mov                mxyd, r6m ; my
+    test               mxyd, mxyd
+    jnz .v
+.prep:
+    movzx                wd, word [t2+wq*2+table_offset(prep,)]
+    add                  wq, t2
+    lea            stride3q, [strideq*3]
+    jmp                  wq
+.prep_w4:
+    movd                xm0, [srcq+strideq*0]
+    pinsrd              xm0, [srcq+strideq*1], 1
+    pinsrd              xm0, [srcq+strideq*2], 2
+    pinsrd              xm0, [srcq+stride3q ], 3
+    lea                srcq, [srcq+strideq*4]
+    pmovzxbw             m0, xm0
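+    ; scale to the 16*pixel intermediate precision used by the filtered paths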
+    psllw                m0, 4
+    mova             [tmpq], m0
+    add                tmpq, 32
+    sub                  hd, 4
+    jg .prep_w4
+    RET
+.prep_w8:
+    movq                xm0, [srcq+strideq*0]
+    movhps              xm0, [srcq+strideq*1]
+    movq                xm1, [srcq+strideq*2]
+    movhps              xm1, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    pmovzxbw             m0, xm0
+    pmovzxbw             m1, xm1
+    psllw                m0, 4
+    psllw                m1, 4
+    mova        [tmpq+32*0], m0
+    mova        [tmpq+32*1], m1
+    add                tmpq, 32*2
+    sub                  hd, 4
+    jg .prep_w8
+    RET
+.prep_w16:
+    pmovzxbw             m0, [srcq+strideq*0]
+    pmovzxbw             m1, [srcq+strideq*1]
+    pmovzxbw             m2, [srcq+strideq*2]
+    pmovzxbw             m3, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    psllw                m0, 4
+    psllw                m1, 4
+    psllw                m2, 4
+    psllw                m3, 4
+    mova        [tmpq+32*0], m0
+    mova        [tmpq+32*1], m1
+    mova        [tmpq+32*2], m2
+    mova        [tmpq+32*3], m3
+    add                tmpq, 32*4
+    sub                  hd, 4
+    jg .prep_w16
+    RET
+.prep_w32:
+    pmovzxbw             m0, [srcq+strideq*0+16*0]
+    pmovzxbw             m1, [srcq+strideq*0+16*1]
+    pmovzxbw             m2, [srcq+strideq*1+16*0]
+    pmovzxbw             m3, [srcq+strideq*1+16*1]
+    lea                srcq, [srcq+strideq*2]
+    psllw                m0, 4
+    psllw                m1, 4
+    psllw                m2, 4
+    psllw                m3, 4
+    mova        [tmpq+32*0], m0
+    mova        [tmpq+32*1], m1
+    mova        [tmpq+32*2], m2
+    mova        [tmpq+32*3], m3
+    add                tmpq, 32*4
+    sub                  hd, 2
+    jg .prep_w32
+    RET
+.prep_w64:
+    pmovzxbw             m0, [srcq+16*0]
+    pmovzxbw             m1, [srcq+16*1]
+    pmovzxbw             m2, [srcq+16*2]
+    pmovzxbw             m3, [srcq+16*3]
+    add                srcq, strideq
+    psllw                m0, 4
+    psllw                m1, 4
+    psllw                m2, 4
+    psllw                m3, 4
+    mova        [tmpq+32*0], m0
+    mova        [tmpq+32*1], m1
+    mova        [tmpq+32*2], m2
+    mova        [tmpq+32*3], m3
+    add                tmpq, 32*4
+    dec                  hd
+    jg .prep_w64
+    RET
+.prep_w128:
+    pmovzxbw             m0, [srcq+16*0]
+    pmovzxbw             m1, [srcq+16*1]
+    pmovzxbw             m2, [srcq+16*2]
+    pmovzxbw             m3, [srcq+16*3]
+    psllw                m0, 4
+    psllw                m1, 4
+    psllw                m2, 4
+    psllw                m3, 4
+    mova        [tmpq+32*0], m0
+    mova        [tmpq+32*1], m1
+    mova        [tmpq+32*2], m2
+    mova        [tmpq+32*3], m3
+    pmovzxbw             m0, [srcq+16*4]
+    pmovzxbw             m1, [srcq+16*5]
+    pmovzxbw             m2, [srcq+16*6]
+    pmovzxbw             m3, [srcq+16*7]
+    add                tmpq, 32*8
+    add                srcq, strideq
+    psllw                m0, 4
+    psllw                m1, 4
+    psllw                m2, 4
+    psllw                m3, 4
+    mova        [tmpq-32*4], m0
+    mova        [tmpq-32*3], m1
+    mova        [tmpq-32*2], m2
+    mova        [tmpq-32*1], m3
+    dec                  hd
+    jg .prep_w128
+    RET
+.h:
+    ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
+    ; = (16 - mx) * src[x] + mx * src[x + 1]
+    imul               mxyd, 0xff01
+    vbroadcasti128       m4, [bilin_h_shuf8]
+    add                mxyd, 16 << 8
+    movd                xm5, mxyd
+    mov                mxyd, r6m ; my
+    vpbroadcastw         m5, xm5
+    test               mxyd, mxyd
+    jnz .hv
+    movzx                wd, word [t2+wq*2+table_offset(prep, _bilin_h)]
+    add                  wq, t2
+    lea            stride3q, [strideq*3]
+    jmp                  wq
+.h_w4:
+    vbroadcasti128       m4, [bilin_h_shuf4]
+.h_w4_loop:
+    movq                xm0, [srcq+strideq*0]
+    movhps              xm0, [srcq+strideq*1]
+    movq                xm1, [srcq+strideq*2]
+    movhps              xm1, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    vinserti128          m0, m0, xm1, 1
+    pshufb               m0, m4
+    pmaddubsw            m0, m5
+    mova             [tmpq], m0
+    add                tmpq, 32
+    sub                  hd, 4
+    jg .h_w4_loop
+    RET
+.h_w8:
+    movu                xm0,     [srcq+strideq*0]
+    vinserti128          m0, m0, [srcq+strideq*1], 1
+    movu                xm1,     [srcq+strideq*2]
+    vinserti128          m1, m1, [srcq+stride3q ], 1
+    lea                srcq,     [srcq+strideq*4]
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+    mova        [tmpq+32*0], m0
+    mova        [tmpq+32*1], m1
+    add                tmpq, 32*2
+    sub                  hd, 4
+    jg .h_w8
+    RET
+.h_w16:
+    movu                xm0,     [srcq+strideq*0+8*0]
+    vinserti128          m0, m0, [srcq+strideq*0+8*1], 1
+    movu                xm1,     [srcq+strideq*1+8*0]
+    vinserti128          m1, m1, [srcq+strideq*1+8*1], 1
+    movu                xm2,     [srcq+strideq*2+8*0]
+    vinserti128          m2, m2, [srcq+strideq*2+8*1], 1
+    movu                xm3,     [srcq+stride3q +8*0]
+    vinserti128          m3, m3, [srcq+stride3q +8*1], 1
+    lea                srcq,     [srcq+strideq*4]
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pshufb               m2, m4
+    pshufb               m3, m4
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+    pmaddubsw            m2, m5
+    pmaddubsw            m3, m5
+    mova        [tmpq+32*0], m0
+    mova        [tmpq+32*1], m1
+    mova        [tmpq+32*2], m2
+    mova        [tmpq+32*3], m3
+    add                tmpq, 32*4
+    sub                  hd, 4
+    jg .h_w16
+    RET
+.h_w32:
+    movu                xm0,     [srcq+strideq*0+8*0]
+    vinserti128          m0, m0, [srcq+strideq*0+8*1], 1
+    movu                xm1,     [srcq+strideq*0+8*2]
+    vinserti128          m1, m1, [srcq+strideq*0+8*3], 1
+    movu                xm2,     [srcq+strideq*1+8*0]
+    vinserti128          m2, m2, [srcq+strideq*1+8*1], 1
+    movu                xm3,     [srcq+strideq*1+8*2]
+    vinserti128          m3, m3, [srcq+strideq*1+8*3], 1
+    lea                srcq,     [srcq+strideq*2]
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pshufb               m2, m4
+    pshufb               m3, m4
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+    pmaddubsw            m2, m5
+    pmaddubsw            m3, m5
+    mova        [tmpq+32*0], m0
+    mova        [tmpq+32*1], m1
+    mova        [tmpq+32*2], m2
+    mova        [tmpq+32*3], m3
+    add                tmpq, 32*4
+    sub                  hd, 2
+    jg .h_w32
+    RET
+.h_w64:
+    movu                xm0,     [srcq+8*0]
+    vinserti128          m0, m0, [srcq+8*1], 1
+    movu                xm1,     [srcq+8*2]
+    vinserti128          m1, m1, [srcq+8*3], 1
+    movu                xm2,     [srcq+8*4]
+    vinserti128          m2, m2, [srcq+8*5], 1
+    movu                xm3,     [srcq+8*6]
+    vinserti128          m3, m3, [srcq+8*7], 1
+    add                srcq, strideq
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pshufb               m2, m4
+    pshufb               m3, m4
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+    pmaddubsw            m2, m5
+    pmaddubsw            m3, m5
+    mova        [tmpq+32*0], m0
+    mova        [tmpq+32*1], m1
+    mova        [tmpq+32*2], m2
+    mova        [tmpq+32*3], m3
+    add                tmpq, 32*4
+    dec                  hd
+    jg .h_w64
+    RET
+.h_w128:
+    movu                xm0,     [srcq+8*0]
+    vinserti128          m0, m0, [srcq+8*1], 1
+    movu                xm1,     [srcq+8*2]
+    vinserti128          m1, m1, [srcq+8*3], 1
+    movu                xm2,     [srcq+8*4]
+    vinserti128          m2, m2, [srcq+8*5], 1
+    movu                xm3,     [srcq+8*6]
+    vinserti128          m3, m3, [srcq+8*7], 1
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pshufb               m2, m4
+    pshufb               m3, m4
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+    pmaddubsw            m2, m5
+    pmaddubsw            m3, m5
+    mova        [tmpq+32*0], m0
+    mova        [tmpq+32*1], m1
+    mova        [tmpq+32*2], m2
+    mova        [tmpq+32*3], m3
+    movu                xm0,     [srcq+8* 8]
+    vinserti128          m0, m0, [srcq+8* 9], 1
+    movu                xm1,     [srcq+8*10]
+    vinserti128          m1, m1, [srcq+8*11], 1
+    movu                xm2,     [srcq+8*12]
+    vinserti128          m2, m2, [srcq+8*13], 1
+    movu                xm3,     [srcq+8*14]
+    vinserti128          m3, m3, [srcq+8*15], 1
+    add                tmpq, 32*8
+    add                srcq, strideq
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pshufb               m2, m4
+    pshufb               m3, m4
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+    pmaddubsw            m2, m5
+    pmaddubsw            m3, m5
+    mova        [tmpq-32*4], m0
+    mova        [tmpq-32*3], m1
+    mova        [tmpq-32*2], m2
+    mova        [tmpq-32*1], m3
+    dec                  hd
+    jg .h_w128
+    RET
+.v:
+    movzx                wd, word [t2+wq*2+table_offset(prep, _bilin_v)]
+    imul               mxyd, 0xff01
+    add                mxyd, 16 << 8
+    add                  wq, t2
+    lea            stride3q, [strideq*3]
+    movd                xm6, mxyd
+    vpbroadcastw         m6, xm6
+    jmp                  wq
+.v_w4:
+    movd                xm0, [srcq+strideq*0]
+.v_w4_loop:
+    vpbroadcastd         m1, [srcq+strideq*2]
+    vpbroadcastd        xm2, [srcq+strideq*1]
+    vpbroadcastd         m3, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    vpblendd             m1, m1, m0, 0x05 ; 0 2 2 2
+    vpbroadcastd         m0, [srcq+strideq*0]
+    vpblendd             m3, m3, m2, 0x0f ; 1 1 3 3
+    vpblendd             m2, m1, m0, 0xa0 ; 0 2 2 4
+    vpblendd             m1, m1, m3, 0xaa ; 0 1 2 3
+    vpblendd             m2, m2, m3, 0x55 ; 1 2 3 4
+    punpcklbw            m2, m1
+    pmaddubsw            m2, m6
+    mova             [tmpq], m2
+    add                tmpq, 32
+    sub                  hd, 4
+    jg .v_w4_loop
+    RET
+.v_w8:
+    movq                xm0, [srcq+strideq*0]
+.v_w8_loop:
+    vpbroadcastq         m1, [srcq+strideq*2]
+    vpbroadcastq         m2, [srcq+strideq*1]
+    vpbroadcastq         m3, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    vpblendd             m1, m1, m0, 0x03 ; 0 2 2 2
+    vpbroadcastq         m0, [srcq+strideq*0]
+    vpblendd             m3, m3, m2, 0x33 ; 1 3 1 3
+    vpblendd             m2, m1, m3, 0x0f ; 1 3 2 2
+    vpblendd             m1, m1, m3, 0xf0 ; 0 2 1 3
+    vpblendd             m2, m2, m0, 0xc0 ; 1 3 2 4
+    punpcklbw            m3, m2, m1
+    punpckhbw            m2, m1
+    pmaddubsw            m3, m6
+    pmaddubsw            m2, m6
+    mova        [tmpq+32*0], m3
+    mova        [tmpq+32*1], m2
+    add                tmpq, 32*2
+    sub                  hd, 4
+    jg .v_w8_loop
+    RET
+.v_w16:
+    vbroadcasti128       m0, [srcq+strideq*0]
+.v_w16_loop:
+    vbroadcasti128       m1, [srcq+strideq*2]
+    vbroadcasti128       m2, [srcq+strideq*1]
+    vbroadcasti128       m3, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    shufpd               m4, m0, m1, 0x0c ; 0 2
+    vbroadcasti128       m0, [srcq+strideq*0]
+    shufpd               m2, m2, m3, 0x0c ; 1 3
+    shufpd               m1, m1, m0, 0x0c ; 2 4
+    punpcklbw            m3, m2, m4
+    punpcklbw            m5, m1, m2
+    punpckhbw            m1, m2
+    punpckhbw            m2, m4
+    pmaddubsw            m3, m6
+    pmaddubsw            m5, m6
+    pmaddubsw            m2, m6
+    pmaddubsw            m1, m6
+    mova        [tmpq+32*0], m3
+    mova        [tmpq+32*1], m5
+    mova        [tmpq+32*2], m2
+    mova        [tmpq+32*3], m1
+    add                tmpq, 32*4
+    sub                  hd, 4
+    jg .v_w16_loop
+    RET
+.v_w32:
+    vpermq               m0, [srcq+strideq*0], q3120
+.v_w32_loop:
+    vpermq               m1, [srcq+strideq*1], q3120
+    vpermq               m2, [srcq+strideq*2], q3120
+    vpermq               m3, [srcq+stride3q ], q3120
+    lea                srcq, [srcq+strideq*4]
+    punpcklbw            m4, m1, m0
+    punpckhbw            m5, m1, m0
+    vpermq               m0, [srcq+strideq*0], q3120
+    pmaddubsw            m4, m6
+    pmaddubsw            m5, m6
+    mova        [tmpq+32*0], m4
+    mova        [tmpq+32*1], m5
+    punpcklbw            m4, m2, m1
+    punpckhbw            m5, m2, m1
+    pmaddubsw            m4, m6
+    pmaddubsw            m5, m6
+    mova        [tmpq+32*2], m4
+    mova        [tmpq+32*3], m5
+    add                tmpq, 32*8
+    punpcklbw            m4, m3, m2
+    punpckhbw            m5, m3, m2
+    punpcklbw            m1, m0, m3
+    punpckhbw            m2, m0, m3
+    pmaddubsw            m4, m6
+    pmaddubsw            m5, m6
+    pmaddubsw            m1, m6
+    pmaddubsw            m2, m6
+    mova        [tmpq-32*4], m4
+    mova        [tmpq-32*3], m5
+    mova        [tmpq-32*2], m1
+    mova        [tmpq-32*1], m2
+    sub                  hd, 4
+    jg .v_w32_loop
+    RET
+.v_w64:
+    vpermq               m0, [srcq+strideq*0+32*0], q3120
+    vpermq               m1, [srcq+strideq*0+32*1], q3120
+.v_w64_loop:
+    vpermq               m2, [srcq+strideq*1+32*0], q3120
+    vpermq               m3, [srcq+strideq*1+32*1], q3120
+    lea                srcq, [srcq+strideq*2]
+    punpcklbw            m4, m2, m0
+    punpckhbw            m5, m2, m0
+    pmaddubsw            m4, m6
+    pmaddubsw            m5, m6
+    mova        [tmpq+32*0], m4
+    mova        [tmpq+32*1], m5
+    punpcklbw            m4, m3, m1
+    punpckhbw            m5, m3, m1
+    vpermq               m0, [srcq+strideq*0+32*0], q3120
+    vpermq               m1, [srcq+strideq*0+32*1], q3120
+    pmaddubsw            m4, m6
+    pmaddubsw            m5, m6
+    mova        [tmpq+32*2], m4
+    mova        [tmpq+32*3], m5
+    add                tmpq, 32*8
+    punpcklbw            m4, m0, m2
+    punpckhbw            m5, m0, m2
+    punpcklbw            m2, m1, m3
+    punpckhbw            m3, m1, m3
+    pmaddubsw            m4, m6
+    pmaddubsw            m5, m6
+    pmaddubsw            m2, m6
+    pmaddubsw            m3, m6
+    mova        [tmpq-32*4], m4
+    mova        [tmpq-32*3], m5
+    mova        [tmpq-32*2], m2
+    mova        [tmpq-32*1], m3
+    sub                  hd, 2
+    jg .v_w64_loop
+    RET
+.v_w128:
+    mov                  t0, tmpq
+    mov                  t1, srcq
+    lea                 t2d, [hq+(3<<8)]
+.v_w128_loop0:
+    vpermq               m0, [srcq+strideq*0], q3120
+.v_w128_loop:
+    vpermq               m1, [srcq+strideq*1], q3120
+    lea                srcq, [srcq+strideq*2]
+    punpcklbw            m2, m1, m0
+    punpckhbw            m3, m1, m0
+    vpermq               m0, [srcq+strideq*0], q3120
+    punpcklbw            m4, m0, m1
+    punpckhbw            m5, m0, m1
+    pmaddubsw            m2, m6
+    pmaddubsw            m3, m6
+    pmaddubsw            m4, m6
+    pmaddubsw            m5, m6
+    mova        [tmpq+32*0], m2
+    mova        [tmpq+32*1], m3
+    mova        [tmpq+32*8], m4
+    mova        [tmpq+32*9], m5
+    add                tmpq, 32*16
+    sub                  hd, 2
+    jg .v_w128_loop
+    mov                  hb, t2b
+    add                  t0, 64
+    add                  t1, 32
+    mov                tmpq, t0
+    mov                srcq, t1
+    sub                 t2d, 1<<8
+    jg .v_w128_loop0
+    RET
+.hv:
+    ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
+    ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
+    movzx                wd, word [t2+wq*2+table_offset(prep, _bilin_hv)]
+    shl                mxyd, 11
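+    ; pmulhrsw against (my << 11) computes ((my * diff) + 8) >> 4 directly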
+    movd                xm6, mxyd
+    add                  wq, t2
+    lea            stride3q, [strideq*3]
+    vpbroadcastw         m6, xm6
+    jmp                  wq
+.hv_w4:
+    vbroadcasti128       m4, [bilin_h_shuf4]
+    vpbroadcastq         m0, [srcq+strideq*0]
+    pshufb               m0, m4
+    pmaddubsw            m0, m5
+.hv_w4_loop:
+    movq                xm1, [srcq+strideq*1]
+    movhps              xm1, [srcq+strideq*2]
+    movq                xm2, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    movhps              xm2, [srcq+strideq*0]
+    vinserti128          m1, m1, xm2, 1
+    pshufb               m1, m4
+    pmaddubsw            m1, m5        ; 1 2 3 4
+    vpblendd             m2, m1, m0, 0xc0
+    vpermq               m2, m2, q2103 ; 0 1 2 3
+    mova                 m0, m1
+    psubw                m1, m2
+    pmulhrsw             m1, m6
+    paddw                m1, m2
+    mova             [tmpq], m1
+    add                tmpq, 32
+    sub                  hd, 4
+    jg .hv_w4_loop
+    RET
+.hv_w8:
+    vbroadcasti128       m0,     [srcq+strideq*0]
+    pshufb               m0, m4
+    pmaddubsw            m0, m5
+.hv_w8_loop:
+    movu                xm1,     [srcq+strideq*1]
+    vinserti128          m1, m1, [srcq+strideq*2], 1
+    movu                xm2,     [srcq+stride3q ]
+    lea                srcq,     [srcq+strideq*4]
+    vinserti128          m2, m2, [srcq+strideq*0], 1
+    pshufb               m1, m4
+    pshufb               m2, m4
+    pmaddubsw            m1, m5           ; 1 2
+    vperm2i128           m3, m0, m1, 0x21 ; 0 1
+    pmaddubsw            m0, m2, m5       ; 3 4
+    vperm2i128           m2, m1, m0, 0x21 ; 2 3
+    psubw                m1, m3
+    pmulhrsw             m1, m6
+    paddw                m1, m3
+    psubw                m3, m0, m2
+    pmulhrsw             m3, m6
+    paddw                m3, m2
+    mova        [tmpq+32*0], m1
+    mova        [tmpq+32*1], m3
+    add                tmpq, 32*2
+    sub                  hd, 4
+    jg .hv_w8_loop
+    RET
+.hv_w16:
+    movu                 m0,     [srcq+strideq*0+8*0]
+    vinserti128          m0, m0, [srcq+strideq*0+8*1], 1
+    pshufb               m0, m4
+    pmaddubsw            m0, m5
+.hv_w16_loop:
+    movu                xm1,     [srcq+strideq*1+8*0]
+    vinserti128          m1, m1, [srcq+strideq*1+8*1], 1
+    lea                srcq,     [srcq+strideq*2]
+    movu                xm2,     [srcq+strideq*0+8*0]
+    vinserti128          m2, m2, [srcq+strideq*0+8*1], 1
+    pshufb               m1, m4
+    pshufb               m2, m4
+    pmaddubsw            m1, m5
+    psubw                m3, m1, m0
+    pmulhrsw             m3, m6
+    paddw                m3, m0
+    pmaddubsw            m0, m2, m5
+    psubw                m2, m0, m1
+    pmulhrsw             m2, m6
+    paddw                m2, m1
+    mova        [tmpq+32*0], m3
+    mova        [tmpq+32*1], m2
+    add                tmpq, 32*2
+    sub                  hd, 2
+    jg .hv_w16_loop
+    RET
+.hv_w32:
+    movu                 m0,     [srcq+8*0]
+    vinserti128          m0, m0, [srcq+8*1], 1
+    movu                 m1,     [srcq+8*2]
+    vinserti128          m1, m1, [srcq+8*3], 1
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+.hv_w32_loop:
+    add                srcq, strideq
+    movu                xm2,     [srcq+8*0]
+    vinserti128          m2, m2, [srcq+8*1], 1
+    pshufb               m2, m4
+    pmaddubsw            m2, m5
+    psubw                m3, m2, m0
+    pmulhrsw             m3, m6
+    paddw                m3, m0
+    mova                 m0, m2
+    mova          [tmpq+ 0], m3
+    movu                xm2,     [srcq+8*2]
+    vinserti128          m2, m2, [srcq+8*3], 1
+    pshufb               m2, m4
+    pmaddubsw            m2, m5
+    psubw                m3, m2, m1
+    pmulhrsw             m3, m6
+    paddw                m3, m1
+    mova                 m1, m2
+    mova          [tmpq+32], m3
+    add                tmpq, 32*2
+    dec                  hd
+    jg .hv_w32_loop
+    RET
+.hv_w64:
+    mov                  t0, tmpq
+    mov                  t1, srcq
+    lea                 t2d, [hq+(3<<8)]
+.hv_w64_loop0:
+    movu                 m0,     [srcq+strideq*0+8*0]
+    vinserti128          m0, m0, [srcq+strideq*0+8*1], 1
+    pshufb               m0, m4
+    pmaddubsw            m0, m5
+.hv_w64_loop:
+    movu                xm1,     [srcq+strideq*1+8*0]
+    vinserti128          m1, m1, [srcq+strideq*1+8*1], 1
+    lea                srcq,     [srcq+strideq*2]
+    movu                xm2,     [srcq+strideq*0+8*0]
+    vinserti128          m2, m2, [srcq+strideq*0+8*1], 1
+    pshufb               m1, m4
+    pshufb               m2, m4
+    pmaddubsw            m1, m5
+    psubw                m3, m1, m0
+    pmulhrsw             m3, m6
+    paddw                m3, m0
+    pmaddubsw            m0, m2, m5
+    psubw                m2, m0, m1
+    pmulhrsw             m2, m6
+    paddw                m2, m1
+    mova        [tmpq+32*0], m3
+    add                tmpq, 32*8
+    mova        [tmpq-32*4], m2
+    sub                  hd, 2
+    jg .hv_w64_loop
+    mov                  hb, t2b
+    add                  t0, 32
+    add                  t1, 16
+    mov                tmpq, t0
+    mov                srcq, t1
+    sub                 t2d, 1<<8
+    jg .hv_w64_loop0
+    RET
+.hv_w128:
+    mov                  t0, tmpq
+    mov                  t1, srcq
+    lea                 t2d, [hq+(7<<8)]
+.hv_w128_loop0:
+    movu                 m0,     [srcq+strideq*0+8*0]
+    vinserti128          m0, m0, [srcq+strideq*0+8*1], 1
+    pshufb               m0, m4
+    pmaddubsw            m0, m5
+.hv_w128_loop:
+    movu                xm1,     [srcq+strideq*1+8*0]
+    vinserti128          m1, m1, [srcq+strideq*1+8*1], 1
+    lea                srcq,     [srcq+strideq*2]
+    movu                xm2,     [srcq+strideq*0+8*0]
+    vinserti128          m2, m2, [srcq+strideq*0+8*1], 1
+    pshufb               m1, m4
+    pshufb               m2, m4
+    pmaddubsw            m1, m5
+    psubw                m3, m1, m0
+    pmulhrsw             m3, m6
+    paddw                m3, m0
+    pmaddubsw            m0, m2, m5
+    psubw                m2, m0, m1
+    pmulhrsw             m2, m6
+    paddw                m2, m1
+    mova        [tmpq+32*0], m3
+    mova        [tmpq+32*8], m2
+    add                tmpq, 32*16
+    sub                  hd, 2
+    jg .hv_w128_loop
+    mov                  hb, t2b
+    add                  t0, 32
+    add                  t1, 16
+    mov                tmpq, t0
+    mov                srcq, t1
+    sub                 t2d, 1<<8
+    jg .hv_w128_loop0
+    RET
+
+; int8_t subpel_filters[5][15][8]
+%assign FILTER_REGULAR (0*15 << 16) | 3*15
+%assign FILTER_SMOOTH  (1*15 << 16) | 4*15
+%assign FILTER_SHARP   (2*15 << 16) | 3*15
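+; the low 16 bits select the 4-tap filter set used for w <= 4 (and h <= 4
+; vertically); the high 16 bits select the full 8-tap set
+; (FILTER_SHARP reuses the regular 4-tap set)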
+
+DECLARE_REG_TMP 7, 8
+%macro PUT_8TAP_FN 3 ; type, type_h, type_v
+cglobal put_8tap_%1
+    mov                 t0d, FILTER_%2
+    mov                 t1d, FILTER_%3
+%ifnidn %1, sharp_smooth ; skip the jump in the last filter
+    jmp mangle(private_prefix %+ _put_8tap %+ SUFFIX)
+%endif
+%endmacro
+
+PUT_8TAP_FN regular,        REGULAR, REGULAR
+PUT_8TAP_FN regular_sharp,  REGULAR, SHARP
+PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_FN smooth_regular, SMOOTH,  REGULAR
+PUT_8TAP_FN smooth,         SMOOTH,  SMOOTH
+PUT_8TAP_FN smooth_sharp,   SMOOTH,  SHARP
+PUT_8TAP_FN sharp_regular,  SHARP,   REGULAR
+PUT_8TAP_FN sharp,          SHARP,   SHARP
+PUT_8TAP_FN sharp_smooth,   SHARP,   SMOOTH
+
+cglobal put_8tap, 4, 9, 16, dst, ds, src, ss, w, h, mx, my, ss3
+    imul                mxd, mxm, 0x010101
+    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
+    imul                myd, mym, 0x010101
+    add                 myd, t1d ; 8tap_v, my, 4tap_v
+    lea                  r8, [put_avx2]
+    movsxd               wq, wm
+    movifnidn            hd, hm
+    test                mxd, 0xf00
+    jnz .h
+    test                myd, 0xf00
+    jnz .v
+    tzcnt                wd, wd
+    movzx                wd, word [r8+wq*2+table_offset(put,)]
+    add                  wq, r8
+    lea                  r6, [ssq*3]
+    lea                  r7, [dsq*3]
+    jmp                  wq
+.h:
+    test                myd, 0xf00
+    jnz .hv
+    vpbroadcastd         m5, [pw_34] ; 2 + (8 << 2)
+    cmp                  wd, 4
+    jl .h_w2
+    vbroadcasti128       m6, [subpel_h_shufA]
+    je .h_w4
+    tzcnt                wd, wd
+    vbroadcasti128       m7, [subpel_h_shufB]
+    vbroadcasti128       m8, [subpel_h_shufC]
+    shr                 mxd, 16
+    sub                srcq, 3
+    movzx                wd, word [r8+wq*2+table_offset(put, _8tap_h)]
+    vpbroadcastd         m9, [r8+mxq*8+subpel_filters-put_avx2+0]
+    vpbroadcastd        m10, [r8+mxq*8+subpel_filters-put_avx2+4]
+    add                  wq, r8
+    jmp                  wq
+.h_w2:
+    movzx               mxd, mxb
+    dec                srcq
+    mova                xm4, [subpel_h_shuf4]
+    vpbroadcastd        xm3, [r8+mxq*8+subpel_filters-put_avx2+2]
+.h_w2_loop:
+    movq                xm0, [srcq+ssq*0]
+    movhps              xm0, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    pshufb              xm0, xm4
+    pmaddubsw           xm0, xm3
+    phaddw              xm0, xm0
+    paddw               xm0, xm5
+    psraw               xm0, 6
+    packuswb            xm0, xm0
+    pextrw     [dstq+dsq*0], xm0, 0
+    pextrw     [dstq+dsq*1], xm0, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .h_w2_loop
+    RET
+.h_w4:
+    movzx               mxd, mxb
+    dec                srcq
+    vpbroadcastd        xm3, [r8+mxq*8+subpel_filters-put_avx2+2]
+.h_w4_loop:
+    movq                xm0, [srcq+ssq*0]
+    movq                xm1, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    pshufb              xm0, xm6
+    pshufb              xm1, xm6
+    pmaddubsw           xm0, xm3
+    pmaddubsw           xm1, xm3
+    phaddw              xm0, xm1
+    paddw               xm0, xm5
+    psraw               xm0, 6
+    packuswb            xm0, xm0
+    movd       [dstq+dsq*0], xm0
+    pextrd     [dstq+dsq*1], xm0, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .h_w4_loop
+    RET
+.h_w8:
+%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3]
+    pshufb              m%2, m%1, m7
+    pshufb              m%3, m%1, m8
+    pshufb              m%1, m6
+    pmaddubsw           m%4, m%2, m9
+    pmaddubsw           m%2, m10
+    pmaddubsw           m%3, m10
+    pmaddubsw           m%1, m9
+    paddw               m%3, m%4
+    paddw               m%1, m%2
+    phaddw              m%1, m%3
+    paddw               m%1, m5
+    psraw               m%1, 6
+%endmacro
+    movu                xm0,     [srcq+ssq*0]
+    vinserti128          m0, m0, [srcq+ssq*1], 1
+    lea                srcq,     [srcq+ssq*2]
+    PUT_8TAP_H            0, 1, 2, 3
+    vextracti128        xm1, m0, 1
+    packuswb            xm0, xm1
+    movq       [dstq+dsq*0], xm0
+    movhps     [dstq+dsq*1], xm0
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .h_w8
+    RET
+.h_w16:
+    movu                xm0,     [srcq+ssq*0+8*0]
+    vinserti128          m0, m0, [srcq+ssq*1+8*0], 1
+    movu                xm1,     [srcq+ssq*0+8*1]
+    vinserti128          m1, m1, [srcq+ssq*1+8*1], 1
+    PUT_8TAP_H            0, 2, 3, 4
+    lea                srcq, [srcq+ssq*2]
+    PUT_8TAP_H            1, 2, 3, 4
+    packuswb             m0, m1
+    mova         [dstq+dsq*0], xm0
+    vextracti128 [dstq+dsq*1], m0, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .h_w16
+    RET
+.h_w32:
+    xor                 r6d, r6d
+    jmp .h_start
+.h_w64:
+    mov                  r6, -32*1
+    jmp .h_start
+.h_w128:
+    mov                  r6, -32*3
+.h_start:
+    sub                srcq, r6
+    sub                dstq, r6
+    mov                  r4, r6
+.h_loop:
+    movu                 m0, [srcq+r6+8*0]
+    movu                 m1, [srcq+r6+8*1]
+    PUT_8TAP_H            0, 2, 3, 4
+    PUT_8TAP_H            1, 2, 3, 4
+    packuswb             m0, m1
+    mova          [dstq+r6], m0
+    add                  r6, 32
+    jle .h_loop
+    add                srcq, ssq
+    add                dstq, dsq
+    mov                  r6, r4
+    dec                  hd
+    jg .h_loop
+    RET
+.v:
+    movzx               mxd, myb
+    shr                 myd, 16
+    cmp                  hd, 4
+    cmovle              myd, mxd
+    tzcnt               r6d, wd
+    movzx               r6d, word [r8+r6*2+table_offset(put, _8tap_v)]
+    vpbroadcastd         m7, [pw_512]
+    lea                 myq, [r8+myq*8+subpel_filters-put_avx2]
+    vpbroadcastw         m8, [myq+0]
+    vpbroadcastw         m9, [myq+2]
+    vpbroadcastw        m10, [myq+4]
+    vpbroadcastw        m11, [myq+6]
+    add                  r6, r8
+    lea                ss3q, [ssq*3]
+    sub                srcq, ss3q
+    jmp                  r6
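+    ; The eight vertical taps are kept as four coefficient pairs in m8-m11
+    ; (each vpbroadcastw picks up two adjacent signed byte taps), so every
+    ; pmaddubsw below applies two taps to bytes interleaved from adjacent
+    ; rows; two output rows ("a"/"b") are produced per iteration and
+    ; pmulhrsw with pw_512 performs the final rounded (x + 32) >> 6.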
+.v_w2:
+    movd                xm2, [srcq+ssq*0]
+    pinsrw              xm2, [srcq+ssq*1], 2
+    pinsrw              xm2, [srcq+ssq*2], 4
+    pinsrw              xm2, [srcq+ss3q ], 6 ; 0 1 2 3
+    lea                srcq, [srcq+ssq*4]
+    movd                xm3, [srcq+ssq*0]
+    vpbroadcastd        xm1, [srcq+ssq*1]
+    vpbroadcastd        xm0, [srcq+ssq*2]
+    add                srcq, ss3q
+    vpblendd            xm3, xm3, xm1, 0x02  ; 4 5
+    vpblendd            xm1, xm1, xm0, 0x02  ; 5 6
+    palignr             xm4, xm3, xm2, 4     ; 1 2 3 4
+    punpcklbw           xm3, xm1             ; 45 56
+    punpcklbw           xm1, xm2, xm4        ; 01 12
+    punpckhbw           xm2, xm4             ; 23 34
+.v_w2_loop:
+    pmaddubsw           xm5, xm1, xm8        ; a0 b0
+    mova                xm1, xm2
+    pmaddubsw           xm2, xm9             ; a1 b1
+    paddw               xm5, xm2
+    mova                xm2, xm3
+    pmaddubsw           xm3, xm10            ; a2 b2
+    paddw               xm5, xm3
+    vpbroadcastd        xm4, [srcq+ssq*0]
+    vpblendd            xm3, xm0, xm4, 0x02  ; 6 7
+    vpbroadcastd        xm0, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    vpblendd            xm4, xm4, xm0, 0x02  ; 7 8
+    punpcklbw           xm3, xm4             ; 67 78
+    pmaddubsw           xm4, xm3, xm11       ; a3 b3
+    paddw               xm5, xm4
+    pmulhrsw            xm5, xm7
+    packuswb            xm5, xm5
+    pextrw     [dstq+dsq*0], xm5, 0
+    pextrw     [dstq+dsq*1], xm5, 2
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .v_w2_loop
+    RET
+.v_w4:
+    movd                xm2, [srcq+ssq*0]
+    pinsrd              xm2, [srcq+ssq*1], 1
+    pinsrd              xm2, [srcq+ssq*2], 2
+    pinsrd              xm2, [srcq+ss3q ], 3 ; 0 1 2 3
+    lea                srcq, [srcq+ssq*4]
+    movd                xm3, [srcq+ssq*0]
+    vpbroadcastd        xm1, [srcq+ssq*1]
+    vpbroadcastd        xm0, [srcq+ssq*2]
+    add                srcq, ss3q
+    vpblendd            xm3, xm3, xm1, 0x02  ; 4 5
+    vpblendd            xm1, xm1, xm0, 0x02  ; 5 6
+    palignr             xm4, xm3, xm2, 4     ; 1 2 3 4
+    punpcklbw           xm3, xm1             ; 45 56
+    punpcklbw           xm1, xm2, xm4        ; 01 12
+    punpckhbw           xm2, xm4             ; 23 34
+.v_w4_loop:
+    pmaddubsw           xm5, xm1, xm8        ; a0 b0
+    mova                xm1, xm2
+    pmaddubsw           xm2, xm9             ; a1 b1
+    paddw               xm5, xm2
+    mova                xm2, xm3
+    pmaddubsw           xm3, xm10            ; a2 b2
+    paddw               xm5, xm3
+    vpbroadcastd        xm4, [srcq+ssq*0]
+    vpblendd            xm3, xm0, xm4, 0x02  ; 6 7
+    vpbroadcastd        xm0, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    vpblendd            xm4, xm4, xm0, 0x02  ; 7 8
+    punpcklbw           xm3, xm4             ; 67 78
+    pmaddubsw           xm4, xm3, xm11       ; a3 b3
+    paddw               xm5, xm4
+    pmulhrsw            xm5, xm7
+    packuswb            xm5, xm5
+    movd       [dstq+dsq*0], xm5
+    pextrd     [dstq+dsq*1], xm5, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .v_w4_loop
+    RET
+.v_w8:
+    movq                xm1, [srcq+ssq*0]
+    vpbroadcastq         m4, [srcq+ssq*1]
+    vpbroadcastq         m2, [srcq+ssq*2]
+    vpbroadcastq         m5, [srcq+ss3q ]
+    lea                srcq, [srcq+ssq*4]
+    vpbroadcastq         m3, [srcq+ssq*0]
+    vpbroadcastq         m6, [srcq+ssq*1]
+    vpbroadcastq         m0, [srcq+ssq*2]
+    add                srcq, ss3q
+    vpblendd             m1, m1, m4, 0x30
+    vpblendd             m4, m4, m2, 0x30
+    punpcklbw            m1, m4 ; 01 12
+    vpblendd             m2, m2, m5, 0x30
+    vpblendd             m5, m5, m3, 0x30
+    punpcklbw            m2, m5 ; 23 34
+    vpblendd             m3, m3, m6, 0x30
+    vpblendd             m6, m6, m0, 0x30
+    punpcklbw            m3, m6 ; 45 56
+.v_w8_loop:
+    pmaddubsw            m5, m1, m8  ; a0 b0
+    mova                 m1, m2
+    pmaddubsw            m2, m9      ; a1 b1
+    paddw                m5, m2
+    mova                 m2, m3
+    pmaddubsw            m3, m10     ; a2 b2
+    paddw                m5, m3
+    vpbroadcastq         m4, [srcq+ssq*0]
+    vpblendd             m3, m0, m4, 0x30
+    vpbroadcastq         m0, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    vpblendd             m4, m4, m0, 0x30
+    punpcklbw            m3, m4      ; 67 78
+    pmaddubsw            m4, m3, m11 ; a3 b3
+    paddw                m5, m4
+    pmulhrsw             m5, m7
+    vextracti128        xm4, m5, 1
+    packuswb            xm5, xm4
+    movq       [dstq+dsq*0], xm5
+    movhps     [dstq+dsq*1], xm5
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .v_w8_loop
+    RET
+.v_w16:
+.v_w32:
+.v_w64:
+.v_w128:
+    lea                 r6d, [wq-16]
+    mov                  r4, dstq
+    mov                  r7, srcq
+    shl                 r6d, 4
+    mov                 r6b, hb
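+    ; Both loop counters share r6d: the low byte holds h (restored from r6b
+    ; after each column) and the upper bits track the remaining 16-pixel
+    ; columns, decremented with sub r6d, 1<<8. The other tiled loops below
+    ; reuse the same trick.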
+.v_w16_loop0:
+    vbroadcasti128       m4, [srcq+ssq*0]
+    vbroadcasti128       m5, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    vbroadcasti128       m0, [srcq+ssq*1]
+    vbroadcasti128       m6, [srcq+ssq*0]
+    lea                srcq, [srcq+ssq*2]
+    vbroadcasti128       m1, [srcq+ssq*0]
+    vbroadcasti128       m2, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    vbroadcasti128       m3, [srcq+ssq*0]
+    shufpd               m4, m4, m0, 0x0c
+    shufpd               m5, m5, m1, 0x0c
+    punpcklbw            m1, m4, m5 ; 01
+    punpckhbw            m4, m5     ; 34
+    shufpd               m6, m6, m2, 0x0c
+    punpcklbw            m2, m5, m6 ; 12
+    punpckhbw            m5, m6     ; 45
+    shufpd               m0, m0, m3, 0x0c
+    punpcklbw            m3, m6, m0 ; 23
+    punpckhbw            m6, m0     ; 56
+.v_w16_loop:
+    vbroadcasti128      m12, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    vbroadcasti128      m13, [srcq+ssq*0]
+    pmaddubsw           m14, m1, m8  ; a0
+    pmaddubsw           m15, m2, m8  ; b0
+    mova                 m1, m3
+    mova                 m2, m4
+    pmaddubsw            m3, m9      ; a1
+    pmaddubsw            m4, m9      ; b1
+    paddw               m14, m3
+    paddw               m15, m4
+    mova                 m3, m5
+    mova                 m4, m6
+    pmaddubsw            m5, m10     ; a2
+    pmaddubsw            m6, m10     ; b2
+    paddw               m14, m5
+    paddw               m15, m6
+    shufpd               m6, m0, m12, 0x0d
+    shufpd               m0, m12, m13, 0x0c
+    punpcklbw            m5, m6, m0  ; 67
+    punpckhbw            m6, m0      ; 78
+    pmaddubsw           m12, m5, m11 ; a3
+    pmaddubsw           m13, m6, m11 ; b3
+    paddw               m14, m12
+    paddw               m15, m13
+    pmulhrsw            m14, m7
+    pmulhrsw            m15, m7
+    packuswb            m14, m15
+    vpermq              m14, m14, q3120
+    mova         [dstq+dsq*0], xm14
+    vextracti128 [dstq+dsq*1], m14, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .v_w16_loop
+    mov                  hb, r6b
+    add                  r4, 16
+    add                  r7, 16
+    mov                dstq, r4
+    mov                srcq, r7
+    sub                 r6d, 1<<8
+    jg .v_w16_loop0
+    RET
+.hv:
+    cmp                  wd, 4
+    jg .hv_w8
+    movzx               mxd, mxb
+    dec                srcq
+    vpbroadcastd         m7, [r8+mxq*8+subpel_filters-put_avx2+2]
+    movzx               mxd, myb
+    shr                 myd, 16
+    cmp                  hd, 4
+    cmovle              myd, mxd
+    vpbroadcastq         m0, [r8+myq*8+subpel_filters-put_avx2]
+    lea                ss3q, [ssq*3]
+    sub                srcq, ss3q
+    punpcklbw            m0, m0
+    psraw                m0, 8 ; sign-extend
+    vpbroadcastd         m8, [pw_8192]
+    vpbroadcastd         m9, [pd_512]
+    pshufd              m10, m0, q0000
+    pshufd              m11, m0, q1111
+    pshufd              m12, m0, q2222
+    pshufd              m13, m0, q3333
+    cmp                  wd, 4
+    je .hv_w4
+    vbroadcasti128       m6, [subpel_h_shuf4]
+    movq                xm2, [srcq+ssq*0]
+    movhps              xm2, [srcq+ssq*1]
+    movq                xm0, [srcq+ssq*2]
+    movhps              xm0, [srcq+ss3q ]
+    lea                srcq, [srcq+ssq*4]
+    vpbroadcastq         m3, [srcq+ssq*0]
+    vpbroadcastq         m4, [srcq+ssq*1]
+    vpbroadcastq         m1, [srcq+ssq*2]
+    add                srcq, ss3q
+    vpblendd             m2, m2, m3, 0x30
+    vpblendd             m0, m0, m1, 0x30
+    vpblendd             m2, m2, m4, 0xc0
+    pshufb               m2, m6
+    pshufb               m0, m6
+    pmaddubsw            m2, m7
+    pmaddubsw            m0, m7
+    phaddw               m2, m0
+    pmulhrsw             m2, m8
+    vextracti128        xm3, m2, 1
+    palignr             xm4, xm3, xm2, 4
+    punpcklwd           xm1, xm2, xm4  ; 01 12
+    punpckhwd           xm2, xm4       ; 23 34
+    pshufd              xm0, xm3, q2121
+    punpcklwd           xm3, xm0       ; 45 56
+.hv_w2_loop:
+    pmaddwd             xm5, xm1, xm10 ; a0 b0
+    mova                xm1, xm2
+    pmaddwd             xm2, xm11      ; a1 b1
+    paddd               xm5, xm2
+    mova                xm2, xm3
+    pmaddwd             xm3, xm12      ; a2 b2
+    paddd               xm5, xm3
+    movq                xm4, [srcq+ssq*0]
+    movhps              xm4, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    pshufb              xm4, xm6
+    pmaddubsw           xm4, xm7
+    phaddw              xm4, xm4
+    pmulhrsw            xm4, xm8
+    palignr             xm3, xm4, xm0, 12
+    mova                xm0, xm4
+    punpcklwd           xm3, xm0       ; 67 78
+    pmaddwd             xm4, xm3, xm13 ; a3 b3
+    paddd               xm5, xm9
+    paddd               xm5, xm4
+    psrad               xm5, 10
+    packssdw            xm5, xm5
+    packuswb            xm5, xm5
+    pextrw     [dstq+dsq*0], xm5, 0
+    pextrw     [dstq+dsq*1], xm5, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .hv_w2_loop
+    RET
+.hv_w4:
+    mova                 m6, [subpel_h_shuf4]
+    vpbroadcastq         m2, [srcq+ssq*0]
+    vpbroadcastq         m4, [srcq+ssq*1]
+    vpbroadcastq         m0, [srcq+ssq*2]
+    vpbroadcastq         m5, [srcq+ss3q ]
+    lea                srcq, [srcq+ssq*4]
+    vpbroadcastq         m3, [srcq+ssq*0]
+    vpblendd             m2, m2, m4, 0xcc ; 0 1
+    vpbroadcastq         m4, [srcq+ssq*1]
+    vpbroadcastq         m1, [srcq+ssq*2]
+    add                srcq, ss3q
+    vpblendd             m0, m0, m5, 0xcc ; 2 3
+    vpblendd             m3, m3, m4, 0xcc ; 4 5
+    pshufb               m2, m6
+    pshufb               m0, m6
+    pshufb               m3, m6
+    pshufb               m1, m6
+    pmaddubsw            m2, m7
+    pmaddubsw            m0, m7
+    pmaddubsw            m3, m7
+    pmaddubsw            m1, m7
+    phaddw               m2, m0
+    phaddw               m3, m1
+    pmulhrsw             m2, m8
+    pmulhrsw             m3, m8
+    palignr              m4, m3, m2, 4
+    punpcklwd            m1, m2, m4  ; 01 12
+    punpckhwd            m2, m4      ; 23 34
+    pshufd               m0, m3, q2121
+    punpcklwd            m3, m0      ; 45 56
+.hv_w4_loop:
+    pmaddwd              m5, m1, m10 ; a0 b0
+    mova                 m1, m2
+    pmaddwd              m2, m11     ; a1 b1
+    paddd                m5, m2
+    mova                 m2, m3
+    pmaddwd              m3, m12     ; a2 b2
+    paddd                m5, m3
+    vpbroadcastq         m4, [srcq+ssq*0]
+    vpbroadcastq         m3, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    vpblendd             m4, m4, m3, 0xcc ; 7 8
+    pshufb               m4, m6
+    pmaddubsw            m4, m7
+    phaddw               m4, m4
+    pmulhrsw             m4, m8
+    palignr              m3, m4, m0, 12
+    mova                 m0, m4
+    punpcklwd            m3, m0      ; 67 78
+    pmaddwd              m4, m3, m13 ; a3 b3
+    paddd                m5, m9
+    paddd                m5, m4
+    psrad                m5, 10
+    vextracti128        xm4, m5, 1
+    packssdw            xm5, xm4
+    packuswb            xm5, xm5
+    pshuflw             xm5, xm5, q3120
+    movd       [dstq+dsq*0], xm5
+    pextrd     [dstq+dsq*1], xm5, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .hv_w4_loop
+    RET
+.hv_w8:
+    shr                 mxd, 16
+    sub                srcq, 3
+    vpbroadcastd        m10, [r8+mxq*8+subpel_filters-put_avx2+0]
+    vpbroadcastd        m11, [r8+mxq*8+subpel_filters-put_avx2+4]
+    movzx               mxd, myb
+    shr                 myd, 16
+    cmp                  hd, 4
+    cmovle              myd, mxd
+    vpbroadcastq         m0, [r8+myq*8+subpel_filters-put_avx2]
+    lea                ss3q, [ssq*3]
+    sub                srcq, ss3q
+    punpcklbw            m0, m0
+    psraw                m0, 8 ; sign-extend
+    pshufd              m12, m0, q0000
+    pshufd              m13, m0, q1111
+    pshufd              m14, m0, q2222
+    pshufd              m15, m0, q3333
+    lea                 r6d, [wq-8]
+    mov                  r4, dstq
+    mov                  r7, srcq
+    shl                 r6d, 5
+    mov                 r6b, hb
+.hv_w8_loop0:
+    vbroadcasti128       m7, [subpel_h_shufA]
+    vbroadcasti128       m8, [subpel_h_shufB]
+    vbroadcasti128       m9, [subpel_h_shufC]
+    movu                xm4,     [srcq+ssq*0]
+    movu                xm5,     [srcq+ssq*1]
+    lea                srcq,     [srcq+ssq*2]
+    movu                xm6,     [srcq+ssq*0]
+    vbroadcasti128       m0,     [srcq+ssq*1]
+    lea                srcq,     [srcq+ssq*2]
+    vpblendd             m4, m4, m0, 0xf0        ; 0 3
+    vinserti128          m5, m5, [srcq+ssq*0], 1 ; 1 4
+    vinserti128          m6, m6, [srcq+ssq*1], 1 ; 2 5
+    lea                srcq,     [srcq+ssq*2]
+    vinserti128          m0, m0, [srcq+ssq*0], 1 ; 3 6
+%macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3]
+    pshufb               %3, %1, %6
+    pshufb               %4, %1, %7
+    pshufb               %1, %5
+    pmaddubsw            %2, %3, m10
+    pmaddubsw            %4, m11
+    pmaddubsw            %3, m11
+    pmaddubsw            %1, m10
+    paddw                %2, %4
+    paddw                %1, %3
+    phaddw               %1, %2
+%endmacro
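+; HV_H_W8: same shuffle/pmaddubsw pattern as PUT_8TAP_H, but with the
+; shuffle masks passed as arguments and without the rounding step; the
+; caller scales the 16-bit result with pmulhrsw by pw_8192 (a rounded
+; right-shift by 2) before the vertical pass processes it in words.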
+    HV_H_W8              m4, m1, m2, m3, m7, m8, m9
+    HV_H_W8              m5, m1, m2, m3, m7, m8, m9
+    HV_H_W8              m6, m1, m2, m3, m7, m8, m9
+    HV_H_W8              m0, m1, m2, m3, m7, m8, m9
+    vpbroadcastd         m7, [pw_8192]
+    vpermq               m4, m4, q3120
+    vpermq               m5, m5, q3120
+    vpermq               m6, m6, q3120
+    pmulhrsw             m0, m7
+    pmulhrsw             m4, m7
+    pmulhrsw             m5, m7
+    pmulhrsw             m6, m7
+    vpermq               m7, m0, q3120
+    punpcklwd            m1, m4, m5  ; 01
+    punpckhwd            m4, m5      ; 34
+    punpcklwd            m2, m5, m6  ; 12
+    punpckhwd            m5, m6      ; 45
+    punpcklwd            m3, m6, m7  ; 23
+    punpckhwd            m6, m7      ; 56
+.hv_w8_loop:
+    vextracti128        r6m, m0, 1 ; not enough registers
+    movu                xm0,     [srcq+ssq*1]
+    lea                srcq,     [srcq+ssq*2]
+    vinserti128          m0, m0, [srcq+ssq*0], 1 ; 7 8
+    pmaddwd              m8, m1, m12 ; a0
+    pmaddwd              m9, m2, m12 ; b0
+    mova                 m1, m3
+    mova                 m2, m4
+    pmaddwd              m3, m13     ; a1
+    pmaddwd              m4, m13     ; b1
+    paddd                m8, m3
+    paddd                m9, m4
+    mova                 m3, m5
+    mova                 m4, m6
+    pmaddwd              m5, m14     ; a2
+    pmaddwd              m6, m14     ; b2
+    paddd                m8, m5
+    paddd                m9, m6
+    vbroadcasti128       m6, [subpel_h_shufB]
+    vbroadcasti128       m7, [subpel_h_shufC]
+    vbroadcasti128       m5, [subpel_h_shufA]
+    HV_H_W8              m0, m5, m6, m7, m5, m6, m7
+    vpbroadcastd         m5, [pw_8192]
+    vpbroadcastd         m7, [pd_512]
+    vbroadcasti128       m6, r6m
+    pmulhrsw             m0, m5
+    paddd                m8, m7
+    paddd                m9, m7
+    vpermq               m7, m0, q3120    ; 7 8
+    shufpd               m6, m6, m7, 0x04 ; 6 7
+    punpcklwd            m5, m6, m7  ; 67
+    punpckhwd            m6, m7      ; 78
+    pmaddwd              m7, m5, m15 ; a3
+    paddd                m8, m7
+    pmaddwd              m7, m6, m15 ; b3
+    paddd                m7, m9
+    psrad                m8, 10
+    psrad                m7, 10
+    packssdw             m8, m7
+    vextracti128        xm7, m8, 1
+    packuswb            xm8, xm7
+    pshufd              xm7, xm8, q3120
+    movq       [dstq+dsq*0], xm7
+    movhps     [dstq+dsq*1], xm7
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .hv_w8_loop
+    mov                  hb, r6b
+    add                  r4, 8
+    add                  r7, 8
+    mov                dstq, r4
+    mov                srcq, r7
+    sub                 r6d, 1<<8
+    jg .hv_w8_loop0
+    RET
+
+DECLARE_REG_TMP 6, 7
+%macro PREP_8TAP_FN 3 ; type, type_h, type_v
+cglobal prep_8tap_%1
+    mov                 t0d, FILTER_%2
+    mov                 t1d, FILTER_%3
+%ifnidn %1, sharp_smooth ; skip the jump in the last filter
+    jmp mangle(private_prefix %+ _prep_8tap %+ SUFFIX)
+%endif
+%endmacro
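+; Each prep_8tap_<type> entry point only loads the horizontal/vertical
+; filter-set enums into t0/t1 and jumps into the shared prep_8tap body.
+; sharp_smooth is instantiated last, so its stub omits the jump and
+; simply falls through into prep_8tap below.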
+
+PREP_8TAP_FN regular,        REGULAR, REGULAR
+PREP_8TAP_FN regular_sharp,  REGULAR, SHARP
+PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_FN smooth_regular, SMOOTH,  REGULAR
+PREP_8TAP_FN smooth,         SMOOTH,  SMOOTH
+PREP_8TAP_FN smooth_sharp,   SMOOTH,  SHARP
+PREP_8TAP_FN sharp_regular,  SHARP,   REGULAR
+PREP_8TAP_FN sharp,          SHARP,   SHARP
+PREP_8TAP_FN sharp_smooth,   SHARP,   SMOOTH
+
+cglobal prep_8tap, 3, 8, 16, tmp, src, stride, w, h, mx, my, stride3
+    imul                mxd, mxm, 0x010101
+    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
+    imul                myd, mym, 0x010101
+    add                 myd, t1d ; 8tap_v, my, 4tap_v
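+    ; Multiplying the 8-bit subpel position by 0x010101 replicates it into
+    ; three byte lanes; adding the FILTER_* constant (assumed to contribute
+    ; the filter-set bases in the low and high bytes only) leaves the 4-tap
+    ; filter index, the raw position and the 8-tap filter index in separate
+    ; bytes, hence the 0xf00 tests and the later mxb / mxd>>16 accesses.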
+    lea                  r7, [prep_avx2]
+    movsxd               wq, wm
+    movifnidn            hd, hm
+    test                mxd, 0xf00
+    jnz .h
+    test                myd, 0xf00
+    jnz .v
+    tzcnt                wd, wd
+    movzx                wd, word [r7+wq*2+table_offset(prep,)]
+    add                  wq, r7
+    lea                  r6, [strideq*3]
+    jmp                  wq
+.h:
+    test                myd, 0xf00
+    jnz .hv
+    vbroadcasti128       m5, [subpel_h_shufA]
+    vpbroadcastd         m4, [pw_8192]
+    cmp                  wd, 4
+    je .h_w4
+    tzcnt                wd, wd
+    vbroadcasti128       m6, [subpel_h_shufB]
+    vbroadcasti128       m7, [subpel_h_shufC]
+    shr                 mxd, 16
+    sub                srcq, 3
+    movzx                wd, word [r7+wq*2+table_offset(prep, _8tap_h)]
+    vpbroadcastd         m8, [r7+mxq*8+subpel_filters-prep_avx2+0]
+    vpbroadcastd         m9, [r7+mxq*8+subpel_filters-prep_avx2+4]
+    add                  wq, r7
+    jmp                  wq
+.h_w4:
+    movzx               mxd, mxb
+    dec                srcq
+    vpbroadcastd         m3, [r7+mxq*8+subpel_filters-prep_avx2+2]
+    lea            stride3q, [strideq*3]
+.h_w4_loop:
+    movq                xm0, [srcq+strideq*0]
+    vpbroadcastq         m2, [srcq+strideq*2]
+    movq                xm1, [srcq+strideq*1]
+    vpblendd             m0, m0, m2, 0xf0
+    vpbroadcastq         m2, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    vpblendd             m1, m1, m2, 0xf0
+    pshufb               m0, m5
+    pshufb               m1, m5
+    pmaddubsw            m0, m3
+    pmaddubsw            m1, m3
+    phaddw               m0, m1
+    pmulhrsw             m0, m4
+    mova             [tmpq], m0
+    add                tmpq, 32
+    sub                  hd, 4
+    jg .h_w4_loop
+    RET
+.h_w8:
+%macro PREP_8TAP_H 0
+    pshufb               m1, m0, m6
+    pshufb               m2, m0, m7
+    pshufb               m0, m5
+    pmaddubsw            m3, m1, m8
+    pmaddubsw            m1, m9
+    pmaddubsw            m2, m9
+    pmaddubsw            m0, m8
+    paddw                m2, m3
+    paddw                m0, m1
+    phaddw               m0, m2
+    pmulhrsw             m0, m4
+%endmacro
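+; PREP_8TAP_H mirrors PUT_8TAP_H, but instead of rounding down to pixels
+; it scales the filtered words with pmulhrsw by pw_8192 (a rounded
+; right-shift by 2) and leaves them as 16-bit intermediates in the
+; prep buffer consumed by the compound functions below.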
+    movu                xm0,     [srcq+strideq*0]
+    vinserti128          m0, m0, [srcq+strideq*1], 1
+    lea                srcq,     [srcq+strideq*2]
+    PREP_8TAP_H
+    mova             [tmpq], m0
+    add                tmpq, 32
+    sub                  hd, 2
+    jg .h_w8
+    RET
+.h_w16:
+    movu                xm0,     [srcq+strideq*0+8*0]
+    vinserti128          m0, m0, [srcq+strideq*0+8*1], 1
+    PREP_8TAP_H
+    mova        [tmpq+32*0], m0
+    movu                xm0,     [srcq+strideq*1+8*0]
+    vinserti128          m0, m0, [srcq+strideq*1+8*1], 1
+    lea                srcq, [srcq+strideq*2]
+    PREP_8TAP_H
+    mova        [tmpq+32*1], m0
+    add                tmpq, 64
+    sub                  hd, 2
+    jg .h_w16
+    RET
+.h_w32:
+    xor                 r6d, r6d
+    jmp .h_start
+.h_w64:
+    mov                  r6, -32*1
+    jmp .h_start
+.h_w128:
+    mov                  r6, -32*3
+.h_start:
+    sub                srcq, r6
+    mov                  r5, r6
+.h_loop:
+    movu                xm0,     [srcq+r6+8*0]
+    vinserti128          m0, m0, [srcq+r6+8*1], 1
+    PREP_8TAP_H
+    mova        [tmpq+32*0], m0
+    movu                xm0,     [srcq+r6+8*2]
+    vinserti128          m0, m0, [srcq+r6+8*3], 1
+    PREP_8TAP_H
+    mova        [tmpq+32*1], m0
+    add                tmpq, 64
+    add                  r6, 32
+    jle .h_loop
+    add                srcq, strideq
+    mov                  r6, r5
+    dec                  hd
+    jg .h_loop
+    RET
+.v:
+    movzx               mxd, myb ; Select 4-tap/8-tap filter multipliers.
+    shr                 myd, 16  ; Note that the code is 8-tap only; having
+    cmp                  hd, 4   ; a separate 4-tap code path for (4|8|16)x4
+    cmove               myd, mxd ; had a negligible effect on performance.
+    ; TODO: Would a 6-tap code path be worth it?
+    vpbroadcastd         m7, [pw_8192]
+    lea                 myq, [r7+myq*8+subpel_filters-prep_avx2]
+    vpbroadcastw         m8, [myq+0]
+    vpbroadcastw         m9, [myq+2]
+    vpbroadcastw        m10, [myq+4]
+    vpbroadcastw        m11, [myq+6]
+    lea            stride3q, [strideq*3]
+    sub                srcq, stride3q
+    cmp                  wd, 8
+    jg .v_w16
+    je .v_w8
+.v_w4:
+    movd                xm0, [srcq+strideq*0]
+    vpbroadcastd         m1, [srcq+strideq*2]
+    vpbroadcastd        xm2, [srcq+strideq*1]
+    vpbroadcastd         m3, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    vpblendd             m1, m1, m0, 0x01 ; 0 2 2 _   2 _ _ _
+    vpblendd             m3, m3, m2, 0x03 ; 1 1 3 3   3 3 _ _
+    vpbroadcastd         m0, [srcq+strideq*0]
+    vpbroadcastd         m2, [srcq+strideq*1]
+    vpblendd             m1, m1, m0, 0x68 ; 0 2 2 4   2 4 4 _
+    vpbroadcastd         m0, [srcq+strideq*2]
+    vbroadcasti128       m6, [deint_shuf4]
+    vpblendd             m3, m3, m2, 0xc0 ; 1 1 3 3   3 3 5 5
+    vpblendd             m2, m3, m1, 0x55 ; 0 1 2 3   2 3 4 5
+    vpblendd             m3, m3, m1, 0xaa ; 1 2 3 4   3 4 5 _
+    punpcklbw            m1, m2, m3       ; 01  12    23  34
+    vpblendd             m3, m3, m0, 0x80 ; 1 2 3 4   3 4 5 6
+    punpckhbw            m2, m3           ; 23  34    45  56
+.v_w4_loop:
+    pinsrd              xm0, [srcq+stride3q ], 1
+    lea                srcq, [srcq+strideq*4]
+    vpbroadcastd         m3, [srcq+strideq*0]
+    vpbroadcastd         m4, [srcq+strideq*1]
+    vpblendd             m3, m3, m4, 0x20 ; _ _ 8 _   8 9 _ _
+    vpblendd             m3, m3, m0, 0x03 ; 6 7 8 _   8 9 _ _
+    vpbroadcastd         m0, [srcq+strideq*2]
+    vpblendd             m3, m3, m0, 0x40 ; 6 7 8 _   8 9 a _
+    pshufb               m3, m6           ; 67  78    89  9a
+    pmaddubsw            m4, m1, m8
+    vperm2i128           m1, m2, m3, 0x21 ; 45  56    67  78
+    pmaddubsw            m2, m9
+    paddw                m4, m2
+    mova                 m2, m3
+    pmaddubsw            m3, m11
+    paddw                m3, m4
+    pmaddubsw            m4, m1, m10
+    paddw                m3, m4
+    pmulhrsw             m3, m7
+    mova             [tmpq], m3
+    add                tmpq, 32
+    sub                  hd, 4
+    jg .v_w4_loop
+    RET
+.v_w8:
+    movq                xm1, [srcq+strideq*0]
+    vpbroadcastq         m4, [srcq+strideq*1]
+    vpbroadcastq         m2, [srcq+strideq*2]
+    vpbroadcastq         m5, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    vpbroadcastq         m3, [srcq+strideq*0]
+    vpbroadcastq         m6, [srcq+strideq*1]
+    vpbroadcastq         m0, [srcq+strideq*2]
+    vpblendd             m1, m1, m4, 0x30
+    vpblendd             m4, m4, m2, 0x30
+    punpcklbw            m1, m4 ; 01 12
+    vpblendd             m2, m2, m5, 0x30
+    vpblendd             m5, m5, m3, 0x30
+    punpcklbw            m2, m5 ; 23 34
+    vpblendd             m3, m3, m6, 0x30
+    vpblendd             m6, m6, m0, 0x30
+    punpcklbw            m3, m6 ; 45 56
+.v_w8_loop:
+    vpbroadcastq         m4, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    pmaddubsw            m5, m2, m9  ; a1
+    pmaddubsw            m6, m2, m8  ; b0
+    vpblendd             m2, m0, m4, 0x30
+    vpbroadcastq         m0, [srcq+strideq*0]
+    vpblendd             m4, m4, m0, 0x30
+    punpcklbw            m2, m4      ; 67 78
+    pmaddubsw            m1, m8      ; a0
+    pmaddubsw            m4, m3, m9  ; b1
+    paddw                m5, m1
+    mova                 m1, m3
+    pmaddubsw            m3, m10     ; a2
+    paddw                m6, m4
+    paddw                m5, m3
+    vpbroadcastq         m4, [srcq+strideq*1]
+    vpblendd             m3, m0, m4, 0x30
+    vpbroadcastq         m0, [srcq+strideq*2]
+    vpblendd             m4, m4, m0, 0x30
+    punpcklbw            m3, m4      ; 89 9a
+    pmaddubsw            m4, m2, m11 ; a3
+    paddw                m5, m4
+    pmaddubsw            m4, m2, m10 ; b2
+    paddw                m6, m4
+    pmaddubsw            m4, m3, m11 ; b3
+    paddw                m6, m4
+    pmulhrsw             m5, m7
+    pmulhrsw             m6, m7
+    mova        [tmpq+32*0], m5
+    mova        [tmpq+32*1], m6
+    add                tmpq, 32*2
+    sub                  hd, 4
+    jg .v_w8_loop
+    RET
+.v_w16:
+    lea                 r6d, [wq-16]
+    mov                  r5, tmpq
+    mov                  r7, srcq
+    shl                 r6d, 4
+    mov                 r6b, hb
+.v_w16_loop0:
+    vbroadcasti128       m4, [srcq+strideq*0]
+    vbroadcasti128       m5, [srcq+strideq*1]
+    lea                srcq, [srcq+strideq*2]
+    vbroadcasti128       m0, [srcq+strideq*1]
+    vbroadcasti128       m6, [srcq+strideq*0]
+    lea                srcq, [srcq+strideq*2]
+    vbroadcasti128       m1, [srcq+strideq*0]
+    vbroadcasti128       m2, [srcq+strideq*1]
+    lea                srcq, [srcq+strideq*2]
+    vbroadcasti128       m3, [srcq+strideq*0]
+    shufpd               m4, m4, m0, 0x0c
+    shufpd               m5, m5, m1, 0x0c
+    punpcklbw            m1, m4, m5 ; 01
+    punpckhbw            m4, m5     ; 34
+    shufpd               m6, m6, m2, 0x0c
+    punpcklbw            m2, m5, m6 ; 12
+    punpckhbw            m5, m6     ; 45
+    shufpd               m0, m0, m3, 0x0c
+    punpcklbw            m3, m6, m0 ; 23
+    punpckhbw            m6, m0     ; 56
+.v_w16_loop:
+    vbroadcasti128      m12, [srcq+strideq*1]
+    lea                srcq, [srcq+strideq*2]
+    vbroadcasti128      m13, [srcq+strideq*0]
+    pmaddubsw           m14, m1, m8  ; a0
+    pmaddubsw           m15, m2, m8  ; b0
+    mova                 m1, m3
+    mova                 m2, m4
+    pmaddubsw            m3, m9      ; a1
+    pmaddubsw            m4, m9      ; b1
+    paddw               m14, m3
+    paddw               m15, m4
+    mova                 m3, m5
+    mova                 m4, m6
+    pmaddubsw            m5, m10     ; a2
+    pmaddubsw            m6, m10     ; b2
+    paddw               m14, m5
+    paddw               m15, m6
+    shufpd               m6, m0, m12, 0x0d
+    shufpd               m0, m12, m13, 0x0c
+    punpcklbw            m5, m6, m0  ; 67
+    punpckhbw            m6, m0      ; 78
+    pmaddubsw           m12, m5, m11 ; a3
+    pmaddubsw           m13, m6, m11 ; b3
+    paddw               m14, m12
+    paddw               m15, m13
+    pmulhrsw            m14, m7
+    pmulhrsw            m15, m7
+    mova        [tmpq+wq*0], m14
+    mova        [tmpq+wq*2], m15
+    lea                tmpq, [tmpq+wq*4]
+    sub                  hd, 2
+    jg .v_w16_loop
+    mov                  hb, r6b
+    add                  r5, 32
+    add                  r7, 16
+    mov                tmpq, r5
+    mov                srcq, r7
+    sub                 r6d, 1<<8
+    jg .v_w16_loop0
+    RET
+.hv:
+    cmp                  wd, 4
+    jg .hv_w8
+    movzx               mxd, mxb
+    dec                srcq
+    mova                 m7, [subpel_h_shuf4]
+    vpbroadcastd         m8, [r7+mxq*8+subpel_filters-prep_avx2+2]
+    pmovzxbd             m9, [deint_shuf4]
+    movzx               mxd, myb
+    shr                 myd, 16
+    cmp                  hd, 4
+    cmove               myd, mxd
+    vpbroadcastq         m0, [r7+myq*8+subpel_filters-prep_avx2]
+    lea            stride3q, [strideq*3]
+    sub                srcq, stride3q
+    punpcklbw            m0, m0
+    psraw                m0, 8 ; sign-extend
+    vpbroadcastd        m10, [pw_8192]
+    vpbroadcastd        m11, [pd_32]
+    pshufd              m12, m0, q0000
+    pshufd              m13, m0, q1111
+    pshufd              m14, m0, q2222
+    pshufd              m15, m0, q3333
+    vpbroadcastq         m2, [srcq+strideq*0]
+    vpbroadcastq         m4, [srcq+strideq*1]
+    vpbroadcastq         m0, [srcq+strideq*2]
+    vpbroadcastq         m5, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    vpbroadcastq         m3, [srcq+strideq*0]
+    vpbroadcastq         m6, [srcq+strideq*1]
+    vpbroadcastq         m1, [srcq+strideq*2]
+    vpblendd             m2, m2, m4, 0xcc ; 0 1
+    vpblendd             m0, m0, m5, 0xcc ; 2 3
+    vpblendd             m3, m3, m6, 0xcc ; 4 5
+    pshufb               m2, m7
+    pshufb               m0, m7
+    pshufb               m3, m7
+    pshufb               m1, m7
+    pmaddubsw            m2, m8
+    pmaddubsw            m0, m8
+    pmaddubsw            m3, m8
+    pmaddubsw            m1, m8
+    phaddw               m2, m0
+    phaddw               m3, m1
+    pmulhrsw             m2, m10
+    pmulhrsw             m3, m10
+    palignr              m4, m3, m2, 4
+    punpcklwd            m1, m2, m4  ; 01 12
+    punpckhwd            m2, m4      ; 23 34
+    pshufd               m0, m3, q2121
+    punpcklwd            m3, m0      ; 45 56
+.hv_w4_loop:
+    pmaddwd              m5, m1, m12 ; a0 b0
+    pmaddwd              m6, m2, m12 ; c0 d0
+    pmaddwd              m2, m13     ; a1 b1
+    pmaddwd              m4, m3, m13 ; c1 d1
+    mova                 m1, m3
+    pmaddwd              m3, m14     ; a2 b2
+    paddd                m5, m2
+    vpbroadcastq         m2, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    paddd                m6, m4
+    paddd                m5, m3
+    vpbroadcastq         m4, [srcq+strideq*0]
+    vpbroadcastq         m3, [srcq+strideq*1]
+    vpblendd             m2, m2, m4, 0xcc
+    vpbroadcastq         m4, [srcq+strideq*2]
+    vpblendd             m3, m3, m4, 0xcc
+    pshufb               m2, m7
+    pshufb               m3, m7
+    pmaddubsw            m2, m8
+    pmaddubsw            m3, m8
+    phaddw               m2, m3
+    pmulhrsw             m2, m10
+    palignr              m3, m2, m0, 12
+    mova                 m0, m2
+    punpcklwd            m2, m3, m0  ; 67 78
+    punpckhwd            m3, m0      ; 89 9a
+    pmaddwd              m4, m2, m14 ; c2 d2
+    paddd                m6, m11
+    paddd                m5, m11
+    paddd                m6, m4
+    pmaddwd              m4, m2, m15 ; a3 b3
+    paddd                m5, m4
+    pmaddwd              m4, m3, m15 ; c3 d3
+    paddd                m6, m4
+    psrad                m5, 6
+    psrad                m6, 6
+    packssdw             m5, m6
+    vpermd               m5, m9, m5
+    mova             [tmpq], m5
+    add                tmpq, 32
+    sub                  hd, 4
+    jg .hv_w4_loop
+    RET
+.hv_w8:
+    shr                 mxd, 16
+    sub                srcq, 3
+    vpbroadcastd        m10, [r7+mxq*8+subpel_filters-prep_avx2+0]
+    vpbroadcastd        m11, [r7+mxq*8+subpel_filters-prep_avx2+4]
+    movzx               mxd, myb
+    shr                 myd, 16
+    cmp                  hd, 4
+    cmove               myd, mxd
+    vpbroadcastq         m0, [r7+myq*8+subpel_filters-prep_avx2]
+    lea            stride3q, [strideq*3]
+    sub                srcq, stride3q
+    punpcklbw            m0, m0
+    psraw                m0, 8 ; sign-extend
+    pshufd              m12, m0, q0000
+    pshufd              m13, m0, q1111
+    pshufd              m14, m0, q2222
+    pshufd              m15, m0, q3333
+    lea                 r6d, [wq-8]
+    mov                  r5, tmpq
+    mov                  r7, srcq
+    shl                 r6d, 5
+    mov                 r6b, hb
+.hv_w8_loop0:
+    vbroadcasti128       m7, [subpel_h_shufA]
+    vbroadcasti128       m8, [subpel_h_shufB]
+    vbroadcasti128       m9, [subpel_h_shufC]
+    movu                xm4,     [srcq+strideq*0]
+    movu                xm5,     [srcq+strideq*1]
+    lea                srcq,     [srcq+strideq*2]
+    movu                xm6,     [srcq+strideq*0]
+    vbroadcasti128       m0,     [srcq+strideq*1]
+    lea                srcq,     [srcq+strideq*2]
+    vpblendd             m4, m4, m0, 0xf0            ; 0 3
+    vinserti128          m5, m5, [srcq+strideq*0], 1 ; 1 4
+    vinserti128          m6, m6, [srcq+strideq*1], 1 ; 2 5
+    lea                srcq,     [srcq+strideq*2]
+    vinserti128          m0, m0, [srcq+strideq*0], 1 ; 3 6
+    HV_H_W8              m4, m1, m2, m3, m7, m8, m9
+    HV_H_W8              m5, m1, m2, m3, m7, m8, m9
+    HV_H_W8              m6, m1, m2, m3, m7, m8, m9
+    HV_H_W8              m0, m1, m2, m3, m7, m8, m9
+    vpbroadcastd         m7, [pw_8192]
+    vpermq               m4, m4, q3120
+    vpermq               m5, m5, q3120
+    vpermq               m6, m6, q3120
+    pmulhrsw             m0, m7
+    pmulhrsw             m4, m7
+    pmulhrsw             m5, m7
+    pmulhrsw             m6, m7
+    vpermq               m7, m0, q3120
+    punpcklwd            m1, m4, m5  ; 01
+    punpckhwd            m4, m5      ; 34
+    punpcklwd            m2, m5, m6  ; 12
+    punpckhwd            m5, m6      ; 45
+    punpcklwd            m3, m6, m7  ; 23
+    punpckhwd            m6, m7      ; 56
+.hv_w8_loop:
+    vextracti128     [tmpq], m0, 1 ; not enough registers
+    movu                xm0,     [srcq+strideq*1]
+    lea                srcq,     [srcq+strideq*2]
+    vinserti128          m0, m0, [srcq+strideq*0], 1 ; 7 8
+    pmaddwd              m8, m1, m12 ; a0
+    pmaddwd              m9, m2, m12 ; b0
+    mova                 m1, m3
+    mova                 m2, m4
+    pmaddwd              m3, m13     ; a1
+    pmaddwd              m4, m13     ; b1
+    paddd                m8, m3
+    paddd                m9, m4
+    mova                 m3, m5
+    mova                 m4, m6
+    pmaddwd              m5, m14     ; a2
+    pmaddwd              m6, m14     ; b2
+    paddd                m8, m5
+    paddd                m9, m6
+    vbroadcasti128       m6, [subpel_h_shufB]
+    vbroadcasti128       m7, [subpel_h_shufC]
+    vbroadcasti128       m5, [subpel_h_shufA]
+    HV_H_W8              m0, m5, m6, m7, m5, m6, m7
+    vpbroadcastd         m5, [pw_8192]
+    vpbroadcastd         m7, [pd_32]
+    vbroadcasti128       m6, [tmpq]
+    pmulhrsw             m0, m5
+    paddd                m8, m7
+    paddd                m9, m7
+    vpermq               m7, m0, q3120    ; 7 8
+    shufpd               m6, m6, m7, 0x04 ; 6 7
+    punpcklwd            m5, m6, m7  ; 67
+    punpckhwd            m6, m7      ; 78
+    pmaddwd              m7, m5, m15 ; a3
+    paddd                m8, m7
+    pmaddwd              m7, m6, m15 ; b3
+    paddd                m7, m9
+    psrad                m8, 6
+    psrad                m7, 6
+    packssdw             m8, m7
+    vpermq               m7, m8, q3120
+    mova         [tmpq+wq*0], xm7
+    vextracti128 [tmpq+wq*2], m7, 1
+    lea                tmpq, [tmpq+wq*4]
+    sub                  hd, 2
+    jg .hv_w8_loop
+    mov                  hb, r6b
+    add                  r5, 16
+    add                  r7, 8
+    mov                tmpq, r5
+    mov                srcq, r7
+    sub                 r6d, 1<<8
+    jg .hv_w8_loop0
+    RET
+
+%macro BIDIR_FN 1 ; op
+    %1                    0
+    lea            stride3q, [strideq*3]
+    jmp                  wq
+.w4:
+    vextracti128        xm1, m0, 1
+    movd   [dstq          ], xm0
+    pextrd [dstq+strideq*1], xm0, 1
+    movd   [dstq+strideq*2], xm1
+    pextrd [dstq+stride3q ], xm1, 1
+    cmp                  hd, 4
+    je .ret
+    lea                dstq, [dstq+strideq*4]
+    pextrd [dstq          ], xm0, 2
+    pextrd [dstq+strideq*1], xm0, 3
+    pextrd [dstq+strideq*2], xm1, 2
+    pextrd [dstq+stride3q ], xm1, 3
+    cmp                  hd, 8
+    je .ret
+    %1                    2
+    lea                dstq, [dstq+strideq*4]
+    vextracti128        xm1, m0, 1
+    movd   [dstq          ], xm0
+    pextrd [dstq+strideq*1], xm0, 1
+    movd   [dstq+strideq*2], xm1
+    pextrd [dstq+stride3q ], xm1, 1
+    lea                dstq, [dstq+strideq*4]
+    pextrd [dstq          ], xm0, 2
+    pextrd [dstq+strideq*1], xm0, 3
+    pextrd [dstq+strideq*2], xm1, 2
+    pextrd [dstq+stride3q ], xm1, 3
+.ret:
+    RET
+.w8_loop:
+    %1_INC_PTR            2
+    %1                    0
+    lea                dstq, [dstq+strideq*4]
+.w8:
+    vextracti128        xm1, m0, 1
+    movq   [dstq          ], xm0
+    movq   [dstq+strideq*1], xm1
+    movhps [dstq+strideq*2], xm0
+    movhps [dstq+stride3q ], xm1
+    sub                  hd, 4
+    jg .w8_loop
+    RET
+.w16_loop:
+    %1_INC_PTR            4
+    %1                    0
+    lea                dstq, [dstq+strideq*4]
+.w16:
+    vpermq               m0, m0, q3120
+    mova         [dstq          ], xm0
+    vextracti128 [dstq+strideq*1], m0, 1
+    %1                    2
+    vpermq               m0, m0, q3120
+    mova         [dstq+strideq*2], xm0
+    vextracti128 [dstq+stride3q ], m0, 1
+    sub                  hd, 4
+    jg .w16_loop
+    RET
+.w32_loop:
+    %1_INC_PTR            4
+    %1                    0
+    lea                dstq, [dstq+strideq*2]
+.w32:
+    vpermq               m0, m0, q3120
+    mova             [dstq], m0
+    %1                    2
+    vpermq               m0, m0, q3120
+    mova   [dstq+strideq*1], m0
+    sub                  hd, 2
+    jg .w32_loop
+    RET
+.w64_loop:
+    %1_INC_PTR            4
+    %1                    0
+    add                dstq, strideq
+.w64:
+    vpermq               m0, m0, q3120
+    mova             [dstq], m0
+    %1                    2
+    vpermq               m0, m0, q3120
+    mova          [dstq+32], m0
+    dec                  hd
+    jg .w64_loop
+    RET
+.w128_loop:
+    %1                    0
+    add                dstq, strideq
+.w128:
+    vpermq               m0, m0, q3120
+    mova        [dstq+0*32], m0
+    %1                    2
+    vpermq               m0, m0, q3120
+    mova        [dstq+1*32], m0
+    %1_INC_PTR            8
+    %1                   -4
+    vpermq               m0, m0, q3120
+    mova        [dstq+2*32], m0
+    %1                   -2
+    vpermq               m0, m0, q3120
+    mova        [dstq+3*32], m0
+    dec                  hd
+    jg .w128_loop
+    RET
+%endmacro
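+; BIDIR_FN provides the store loops shared by the compound functions:
+; %1 names a macro (AVG/W_AVG/MASK) that loads two mmsize blocks of 16-bit
+; intermediates from tmp1/tmp2 and leaves 32 packed output bytes in m0,
+; %1_INC_PTR advances the source pointers, and the .w4-.w128 cases only
+; differ in how those 32 bytes are written out at dstq.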
+
+%macro AVG 1 ; src_offset
+    mova                 m0, [tmp1q+(%1+0)*mmsize]
+    paddw                m0, [tmp2q+(%1+0)*mmsize]
+    mova                 m1, [tmp1q+(%1+1)*mmsize]
+    paddw                m1, [tmp2q+(%1+1)*mmsize]
+    pmulhrsw             m0, m2
+    pmulhrsw             m1, m2
+    packuswb             m0, m1
+%endmacro
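+; m2 is pw_1024, so the pmulhrsw above is a rounded shift:
+; dst = (tmp1 + tmp2 + 16) >> 5 per pixel, clamped by packuswb.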
+
+%macro AVG_INC_PTR 1
+    add               tmp1q, %1*mmsize
+    add               tmp2q, %1*mmsize
+%endmacro
+
+cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
+    lea                  r6, [avg_avx2_table]
+    tzcnt                wd, wm
+    movifnidn            hd, hm
+    movsxd               wq, dword [r6+wq*4]
+    vpbroadcastd         m2, [pw_1024+r6-avg_avx2_table]
+    add                  wq, r6
+    BIDIR_FN            AVG
+
+%macro W_AVG 1 ; src_offset
+    ; (a * weight + b * (16 - weight) + 128) >> 8
+    ; = ((a - b) * weight + (b << 4) + 128) >> 8
+    ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4
+    mova                 m0,     [tmp2q+(%1+0)*mmsize]
+    psubw                m2, m0, [tmp1q+(%1+0)*mmsize]
+    mova                 m1,     [tmp2q+(%1+1)*mmsize]
+    psubw                m3, m1, [tmp1q+(%1+1)*mmsize]
+    paddw                m2, m2 ; compensate for the weight only being half
+    paddw                m3, m3 ; of what it should be
+    pmulhw               m2, m4
+    pmulhw               m3, m4
+    paddw                m0, m2
+    paddw                m1, m3
+    pmulhrsw             m0, m5
+    pmulhrsw             m1, m5
+    packuswb             m0, m1
+%endmacro
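+; Expects m4 = -(weight << 11) and m5 = pw_2048 (pmulhrsw = (x + 8) >> 4),
+; as set up below; the weight from r6m applies to the tmp1 block.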
+
+%define W_AVG_INC_PTR AVG_INC_PTR
+
+cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
+    lea                  r6, [w_avg_avx2_table]
+    tzcnt                wd, wm
+    movifnidn            hd, hm
+    vpbroadcastw         m0, r6m ; weight
+    movsxd               wq, dword [r6+wq*4]
+    pxor                 m4, m4
+    psllw                m0, 11 ; can't shift by 12, sign bit must be preserved
+    psubw                m4, m0
+    vpbroadcastd         m5, [pw_2048+r6-w_avg_avx2_table]
+    add                  wq, r6
+    BIDIR_FN          W_AVG
+
+%macro MASK 1 ; src_offset
+    ; (a * m + b * (64 - m) + 512) >> 10
+    ; = ((a - b) * m + (b << 6) + 512) >> 10
+    ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4
+    vpermq               m3,     [maskq+(%1+0)*(mmsize/2)], q3120
+    mova                 m0,     [tmp2q+(%1+0)*mmsize]
+    psubw                m1, m0, [tmp1q+(%1+0)*mmsize]
+    psubb                m3, m4, m3
+    paddw                m1, m1     ; (b - a) << 1
+    paddb                m3, m3
+    punpcklbw            m2, m4, m3 ; -m << 9
+    pmulhw               m1, m2
+    paddw                m0, m1
+    mova                 m1,     [tmp2q+(%1+1)*mmsize]
+    psubw                m2, m1, [tmp1q+(%1+1)*mmsize]
+    paddw                m2, m2
+    punpckhbw            m3, m4, m3
+    pmulhw               m2, m3
+    paddw                m1, m2
+    pmulhrsw             m0, m5
+    pmulhrsw             m1, m5
+    packuswb             m0, m1
+%endmacro
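+; Expects m4 = 0 and m5 = pw_2048. Negating and doubling the 6-bit mask
+; bytes, then interleaving them with zero, forms the -m << 9 words, so the
+; pmulhw/paddw/pmulhrsw sequence implements the formula above with the
+; mask weighting the tmp1 block.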
+
+%macro MASK_INC_PTR 1
+    add               maskq, %1*mmsize/2
+    add               tmp1q, %1*mmsize
+    add               tmp2q, %1*mmsize
+%endmacro
+
+cglobal mask, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
+    lea                  r7, [mask_avx2_table]
+    tzcnt                wd, wm
+    movifnidn            hd, hm
+    mov               maskq, maskmp
+    movsxd               wq, dword [r7+wq*4]
+    pxor                 m4, m4
+    vpbroadcastd         m5, [pw_2048+r7-mask_avx2_table]
+    add                  wq, r7
+    BIDIR_FN           MASK
+
+%macro W_MASK_420 2 ; src_offset, mask_out
+    mova                 m0, [tmp1q+(%1+0)*mmsize]
+    mova                 m1, [tmp2q+(%1+0)*mmsize]
+    psubw                m1, m0
+    pabsw               m%2, m1
+    paddw               m%2, m6
+    psrlw               m%2, 8       ; (abs(tmp1 - tmp2) + 8) >> 8
+    psubusw             m%2, m7, m%2 ; 64 - min(m, 64)
+    psllw                m2, m%2, 10
+    pmulhw               m1, m2
+    paddw                m0, m1
+    mova                 m1, [tmp1q+(%1+1)*mmsize]
+    mova                 m2, [tmp2q+(%1+1)*mmsize]
+    psubw                m2, m1
+    pabsw                m3, m2
+    paddw                m3, m6
+    psrlw                m3, 8
+    psubusw              m3, m7, m3
+    phaddw              m%2, m3
+    psllw                m3, 10
+    pmulhw               m2, m3
+    paddw                m1, m2
+    pmulhrsw             m0, m8
+    pmulhrsw             m1, m8
+    packuswb             m0, m1
+%endmacro
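+; W_MASK_420 blends like MASK but also derives the mask from the
+; intermediates themselves. A rough scalar sketch per pixel
+; (a = *tmp1, b = *tmp2, clip() clamping to [0, 255]):
+;     m   = min(38 + ((abs(a - b) + 8) >> 8), 64)    ; m%2 keeps 64 - m
+;     dst = clip((a * m + b * (64 - m) + 512) >> 10)
+; phaddw pairs horizontally adjacent 64-m values, and the per-width code
+; then folds two rows together and stores (sum + 2 - sign) >> 2 using the
+; pw_258 bias held in m9.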
+
+cglobal w_mask_420, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3
+    lea                  r7, [w_mask_420_avx2_table]
+    tzcnt                wd, wm
+    movifnidn            hd, hm
+    mov               maskq, maskmp
+    vpbroadcastw         m0, r7m ; sign
+    movsxd               wq, dword [r7+wq*4]
+    vpbroadcastd         m6, [pw_8       +r7-w_mask_420_avx2_table]
+    vpbroadcastd         m7, [pw_26      +r7-w_mask_420_avx2_table] ; 64 - 38
+    vpbroadcastd         m8, [pw_2048    +r7-w_mask_420_avx2_table]
+    vpbroadcastd         m9, [pw_258     +r7-w_mask_420_avx2_table] ; 64 * 4 + 2
+    pmovzxbd            m10, [deint_shuf4+r7-w_mask_420_avx2_table]
+    psubw                m9, m0
+    add                  wq, r7
+    W_MASK_420            0, 4
+    lea            stride3q, [strideq*3]
+    jmp                  wq
+.w4:
+    vextracti128        xm1, m0, 1
+    movd   [dstq          ], xm0
+    pextrd [dstq+strideq*1], xm0, 1
+    movd   [dstq+strideq*2], xm1
+    pextrd [dstq+stride3q ], xm1, 1
+    cmp                  hd, 4
+    je .w4_end
+    lea                dstq, [dstq+strideq*4]
+    pextrd [dstq          ], xm0, 2
+    pextrd [dstq+strideq*1], xm0, 3
+    pextrd [dstq+strideq*2], xm1, 2
+    pextrd [dstq+stride3q ], xm1, 3
+    cmp                  hd, 8
+    jg .w4_h16
+.w4_end:
+    vextracti128        xm0, m4, 1
+    vpblendd            xm1, xm4, xm0, 0x05
+    vpblendd            xm4, xm4, xm0, 0x0a
+    pshufd              xm1, xm1, q2301
+    psubw               xm4, xm9, xm4
+    psubw               xm4, xm1
+    psrlw               xm4, 2
+    packuswb            xm4, xm4
+    movq            [maskq], xm4
+    RET
+.w4_h16:
+    W_MASK_420            2, 5
+    lea                dstq, [dstq+strideq*4]
+    phaddd               m4, m5
+    vextracti128        xm1, m0, 1
+    psubw                m4, m9, m4
+    psrlw                m4, 2
+    vpermd               m4, m10, m4
+    vextracti128        xm5, m4, 1
+    packuswb            xm4, xm5
+    movd   [dstq          ], xm0
+    pextrd [dstq+strideq*1], xm0, 1
+    movd   [dstq+strideq*2], xm1
+    pextrd [dstq+stride3q ], xm1, 1
+    lea                dstq, [dstq+strideq*4]
+    pextrd [dstq          ], xm0, 2
+    pextrd [dstq+strideq*1], xm0, 3
+    pextrd [dstq+strideq*2], xm1, 2
+    pextrd [dstq+stride3q ], xm1, 3
+    mova            [maskq], xm4
+    RET
+.w8_loop:
+    add               tmp1q, 2*32
+    add               tmp2q, 2*32
+    W_MASK_420            0, 4
+    lea                dstq, [dstq+strideq*4]
+    add               maskq, 8
+.w8:
+    vextracti128        xm2, m4, 1
+    vextracti128        xm1, m0, 1
+    psubw               xm4, xm9, xm4
+    psubw               xm4, xm2
+    psrlw               xm4, 2
+    packuswb            xm4, xm4
+    movq   [dstq          ], xm0
+    movq   [dstq+strideq*1], xm1
+    movhps [dstq+strideq*2], xm0
+    movhps [dstq+stride3q ], xm1
+    movq            [maskq], xm4
+    sub                  hd, 4
+    jg .w8_loop
+    RET
+.w16_loop:
+    add               tmp1q, 4*32
+    add               tmp2q, 4*32
+    W_MASK_420            0, 4
+    lea                dstq, [dstq+strideq*4]
+    add               maskq, 16
+.w16:
+    vpermq               m0, m0, q3120
+    mova         [dstq          ], xm0
+    vextracti128 [dstq+strideq*1], m0, 1
+    W_MASK_420            2, 5
+    punpckhqdq           m1, m4, m5
+    punpcklqdq           m4, m5
+    psubw                m1, m9, m1
+    psubw                m1, m4
+    psrlw                m1, 2
+    vpermq               m0, m0, q3120
+    packuswb             m1, m1
+    vpermd               m1, m10, m1
+    mova         [dstq+strideq*2], xm0
+    vextracti128 [dstq+stride3q ], m0, 1
+    mova            [maskq], xm1
+    sub                  hd, 4
+    jg .w16_loop
+    RET
+.w32_loop:
+    add               tmp1q, 4*32
+    add               tmp2q, 4*32
+    W_MASK_420            0, 4
+    lea                dstq, [dstq+strideq*2]
+    add               maskq, 16
+.w32:
+    vpermq               m0, m0, q3120
+    mova             [dstq], m0
+    W_MASK_420            2, 5
+    psubw                m4, m9, m4
+    psubw                m4, m5
+    psrlw                m4, 2
+    vpermq               m0, m0, q3120
+    packuswb             m4, m4
+    vpermd               m4, m10, m4
+    mova   [dstq+strideq*1], m0
+    mova            [maskq], xm4
+    sub                  hd, 2
+    jg .w32_loop
+    RET
+.w64_loop_even:
+    psubw               m11, m9, m4
+    psubw               m12, m9, m5
+    dec                  hd
+.w64_loop:
+    add               tmp1q, 4*32
+    add               tmp2q, 4*32
+    W_MASK_420            0, 4
+    add                dstq, strideq
+.w64:
+    vpermq               m0, m0, q3120
+    mova             [dstq], m0
+    W_MASK_420            2, 5
+    vpermq               m0, m0, q3120
+    mova          [dstq+32], m0
+    test                 hd, 1
+    jz .w64_loop_even
+    psubw                m4, m11, m4
+    psubw                m5, m12, m5
+    psrlw                m4, 2
+    psrlw                m5, 2
+    packuswb             m4, m5
+    vpermd               m4, m10, m4
+    mova            [maskq], m4
+    add               maskq, 32
+    dec                  hd
+    jg .w64_loop
+    RET
+.w128_loop_even:
+    psubw               m13, m9, m4
+    psubw               m14, m9, m5
+    dec                  hd
+.w128_loop:
+    W_MASK_420            0, 4
+    add                dstq, strideq
+.w128:
+    vpermq               m0, m0, q3120
+    mova        [dstq+0*32], m0
+    W_MASK_420            2, 5
+    vpermq               m0, m0, q3120
+    mova        [dstq+1*32], m0
+    add               tmp1q, 8*32
+    add               tmp2q, 8*32
+    test                 hd, 1
+    jz .w128_even
+    psubw                m4, m11, m4
+    psubw                m5, m12, m5
+    psrlw                m4, 2
+    psrlw                m5, 2
+    packuswb             m4, m5
+    vpermd               m4, m10, m4
+    mova            [maskq], m4
+    jmp .w128_odd
+.w128_even:
+    psubw               m11, m9, m4
+    psubw               m12, m9, m5
+.w128_odd:
+    W_MASK_420           -4, 4
+    vpermq               m0, m0, q3120
+    mova        [dstq+2*32], m0
+    W_MASK_420           -2, 5
+    vpermq               m0, m0, q3120
+    mova        [dstq+3*32], m0
+    test                 hd, 1
+    jz .w128_loop_even
+    psubw                m4, m13, m4
+    psubw                m5, m14, m5
+    psrlw                m4, 2
+    psrlw                m5, 2
+    packuswb             m4, m5
+    vpermd               m4, m10, m4
+    mova         [maskq+32], m4
+    add               maskq, 64
+    dec                  hd
+    jg .w128_loop
+    RET
+
+%endif ; ARCH_X86_64
--- /dev/null
+++ b/src/x86/mc_init.c
@@ -1,0 +1,97 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "src/mc.h"
+#include "src/x86/cpu.h"
+
+decl_mc_fn(dav1d_put_8tap_regular_avx2);
+decl_mc_fn(dav1d_put_8tap_regular_smooth_avx2);
+decl_mc_fn(dav1d_put_8tap_regular_sharp_avx2);
+decl_mc_fn(dav1d_put_8tap_smooth_avx2);
+decl_mc_fn(dav1d_put_8tap_smooth_regular_avx2);
+decl_mc_fn(dav1d_put_8tap_smooth_sharp_avx2);
+decl_mc_fn(dav1d_put_8tap_sharp_avx2);
+decl_mc_fn(dav1d_put_8tap_sharp_regular_avx2);
+decl_mc_fn(dav1d_put_8tap_sharp_smooth_avx2);
+decl_mc_fn(dav1d_put_bilin_avx2);
+
+decl_mct_fn(dav1d_prep_8tap_regular_avx2);
+decl_mct_fn(dav1d_prep_8tap_regular_smooth_avx2);
+decl_mct_fn(dav1d_prep_8tap_regular_sharp_avx2);
+decl_mct_fn(dav1d_prep_8tap_smooth_avx2);
+decl_mct_fn(dav1d_prep_8tap_smooth_regular_avx2);
+decl_mct_fn(dav1d_prep_8tap_smooth_sharp_avx2);
+decl_mct_fn(dav1d_prep_8tap_sharp_avx2);
+decl_mct_fn(dav1d_prep_8tap_sharp_regular_avx2);
+decl_mct_fn(dav1d_prep_8tap_sharp_smooth_avx2);
+decl_mct_fn(dav1d_prep_bilin_avx2);
+
+decl_avg_fn(dav1d_avg_avx2);
+decl_w_avg_fn(dav1d_w_avg_avx2);
+decl_mask_fn(dav1d_mask_avx2);
+decl_w_mask_fn(dav1d_w_mask_420_avx2);
+
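+/* bitfn() appends the compile-time bitdepth suffix (_8bpc or _10bpc), so
+ * this template is compiled once per bitdepth; only the 8 bpc AVX2
+ * functions exist so far, hence the BITDEPTH == 8 guard below. */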
+void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
+#define init_mc_fn(type, name, suffix) \
+    c->mc[type] = dav1d_put_##name##_##suffix
+#define init_mct_fn(type, name, suffix) \
+    c->mct[type] = dav1d_prep_##name##_##suffix
+    const enum CpuFlags flags = dav1d_get_cpu_flags_x86();
+
+    if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+#if BITDEPTH == 8 && ARCH_X86_64 && !defined(_WIN32) // FIXME: Windows
+    init_mc_fn (FILTER_2D_8TAP_REGULAR,        8tap_regular,        avx2);
+    init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2);
+    init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  avx2);
+    init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2);
+    init_mc_fn (FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         avx2);
+    init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   avx2);
+    init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  avx2);
+    init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   avx2);
+    init_mc_fn (FILTER_2D_8TAP_SHARP,          8tap_sharp,          avx2);
+    init_mc_fn (FILTER_2D_BILINEAR,            bilin,               avx2);
+
+    init_mct_fn(FILTER_2D_8TAP_REGULAR,        8tap_regular,        avx2);
+    init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2);
+    init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  avx2);
+    init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2);
+    init_mct_fn(FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         avx2);
+    init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   avx2);
+    init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  avx2);
+    init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   avx2);
+    init_mct_fn(FILTER_2D_8TAP_SHARP,          8tap_sharp,          avx2);
+    init_mct_fn(FILTER_2D_BILINEAR,            bilin,               avx2);
+
+    c->avg = dav1d_avg_avx2;
+    c->w_avg = dav1d_w_avg_avx2;
+    c->mask = dav1d_mask_avx2;
+    c->w_mask[2] = dav1d_w_mask_420_avx2;
+#endif
+}