shithub: dav1d

Download patch

ref: aec3d25c6be14b06c6fc6de3eb70dc92faaac0fc
parent: 24effd4762c3641444817359248dd84ad5c0eb69
author: Ronald S. Bultje <[email protected]>
date: Fri Oct 5 12:29:22 EDT 2018

Add horizontal loopfilter AVX2 SIMD

--- a/src/lf_apply.c
+++ b/src/lf_apply.c
@@ -49,7 +49,7 @@
     for (int x = 0; x < w; x++) {
         if (!have_left && !x) continue;
         dsp->lf.loop_filter_sb[0][0](&dst[x * 4], ls,
-                                     starty4 ? (const uint32_t[3]) {
+                                     starty4 ? (const uint32_t[4]) {
                                          mask[x][0] >> starty4,
                                          mask[x][1] >> starty4,
                                          mask[x][2] >> starty4,
@@ -98,7 +98,7 @@
     for (int x = 0; x < w; x++) {
         if (!have_left && !x) continue;
         dsp->lf.loop_filter_sb[1][0](&u[x * 4], ls,
-                                     starty4 ? (const uint32_t[2]) {
+                                     starty4 ? (const uint32_t[3]) {
                                          mask[x][0] >> starty4,
                                          mask[x][1] >> starty4,
                                      } : mask[x],
@@ -105,7 +105,7 @@
                                      (const uint8_t(*)[4]) &lvl[x][2], b4_stride,
                                      &f->lf.lim_lut, endy4 - starty4);
         dsp->lf.loop_filter_sb[1][0](&v[x * 4], ls,
-                                     starty4 ? (const uint32_t[2]) {
+                                     starty4 ? (const uint32_t[3]) {
                                          mask[x][0] >> starty4,
                                          mask[x][1] >> starty4,
                                      } : mask[x],
--- a/src/x86/loopfilter.asm
+++ b/src/x86/loopfilter.asm
@@ -63,22 +63,496 @@
     por           %1, %4
 %endmacro
 
-%macro FILTER 1 ; width
+%macro TRANSPOSE_16x4_AND_WRITE_4x32 5
+    ; transpose 16x4
+    punpcklbw    m%5, m%1, m%2
+    punpckhbw    m%1, m%2
+    punpcklbw    m%2, m%3, m%4
+    punpckhbw    m%3, m%4
+    punpcklwd    m%4, m%5, m%2
+    punpckhwd    m%5, m%2
+    punpcklwd    m%2, m%1, m%3
+    punpckhwd    m%1, m%3
+
+    ; write out
+    movd [dstq+strideq*0-2], xm%4
+    pextrd [dstq+strideq*1-2], xm%4, 1
+    pextrd [dstq+strideq*2-2], xm%4, 2
+    pextrd [dstq+stride3q-2], xm%4, 3
+    lea         dstq, [dstq+strideq*4]
+    movd [dstq+strideq*0-2], xm%5
+    pextrd [dstq+strideq*1-2], xm%5, 1
+    pextrd [dstq+strideq*2-2], xm%5, 2
+    pextrd [dstq+stride3q-2], xm%5, 3
+    lea         dstq, [dstq+strideq*4]
+    movd [dstq+strideq*0-2], xm%2
+    pextrd [dstq+strideq*1-2], xm%2, 1
+    pextrd [dstq+strideq*2-2], xm%2, 2
+    pextrd [dstq+stride3q-2], xm%2, 3
+    lea         dstq, [dstq+strideq*4]
+    movd [dstq+strideq*0-2], xm%1
+    pextrd [dstq+strideq*1-2], xm%1, 1
+    pextrd [dstq+strideq*2-2], xm%1, 2
+    pextrd [dstq+stride3q-2], xm%1, 3
+    lea         dstq, [dstq+strideq*4]
+
+    vextracti128 xm%4, m%4, 1
+    vextracti128 xm%5, m%5, 1
+    vextracti128 xm%2, m%2, 1
+    vextracti128 xm%1, m%1, 1
+
+    movd [dstq+strideq*0-2], xm%4
+    pextrd [dstq+strideq*1-2], xm%4, 1
+    pextrd [dstq+strideq*2-2], xm%4, 2
+    pextrd [dstq+stride3q-2], xm%4, 3
+    lea         dstq, [dstq+strideq*4]
+    movd [dstq+strideq*0-2], xm%5
+    pextrd [dstq+strideq*1-2], xm%5, 1
+    pextrd [dstq+strideq*2-2], xm%5, 2
+    pextrd [dstq+stride3q-2], xm%5, 3
+    lea         dstq, [dstq+strideq*4]
+    movd [dstq+strideq*0-2], xm%2
+    pextrd [dstq+strideq*1-2], xm%2, 1
+    pextrd [dstq+strideq*2-2], xm%2, 2
+    pextrd [dstq+stride3q-2], xm%2, 3
+    lea         dstq, [dstq+strideq*4]
+    movd [dstq+strideq*0-2], xm%1
+    pextrd [dstq+strideq*1-2], xm%1, 1
+    pextrd [dstq+strideq*2-2], xm%1, 2
+    pextrd [dstq+stride3q-2], xm%1, 3
+    lea         dstq, [dstq+strideq*4]
+%endmacro
+
+%macro TRANSPOSE_16X16B 3 ; in_load_15_from_mem, out_store_0_in_mem, mem
+%if %1 == 0
+    mova          %3, m15
+%endif
+
+    ; input in m0-15
+    punpcklbw    m15, m0, m1
+    punpckhbw     m0, m1
+    punpcklbw     m1, m2, m3
+    punpckhbw     m2, m3
+    punpcklbw     m3, m4, m5
+    punpckhbw     m4, m5
+    punpcklbw     m5, m6, m7
+    punpckhbw     m6, m7
+    punpcklbw     m7, m8, m9
+    punpckhbw     m8, m9
+    punpcklbw     m9, m10, m11
+    punpckhbw    m10, m11
+    punpcklbw    m11, m12, m13
+    punpckhbw    m12, m13
+    mova         m13, %3
+    mova          %3, m12
+    punpcklbw    m12, m14, m13
+    punpckhbw    m13, m14, m13
+
+    ; interleaved in m15,0,1,2,3,4,5,6,7,8,9,10,11,rsp%3,12,13
+    punpcklwd    m14, m15, m1
+    punpckhwd    m15, m1
+    punpcklwd     m1, m0, m2
+    punpckhwd     m0, m2
+    punpcklwd     m2, m3, m5
+    punpckhwd     m3, m5
+    punpcklwd     m5, m4, m6
+    punpckhwd     m4, m6
+    punpcklwd     m6, m7, m9
+    punpckhwd     m7, m9
+    punpcklwd     m9, m8, m10
+    punpckhwd     m8, m10
+    punpcklwd    m10, m11, m12
+    punpckhwd    m11, m12
+    mova         m12, %3
+    mova          %3, m11
+    punpcklwd    m11, m12, m13
+    punpckhwd    m12, m13
+
+    ; interleaved in m14,15,1,0,2,3,5,4,6,7,9,8,10,rsp%3,11,12
+    punpckldq    m13, m14, m2
+    punpckhdq    m14, m2
+    punpckldq     m2, m15, m3
+    punpckhdq    m15, m3
+    punpckldq     m3, m1, m5
+    punpckhdq     m1, m5
+    punpckldq     m5, m0, m4
+    punpckhdq     m0, m4
+    punpckldq     m4, m6, m10
+    punpckhdq     m6, m10
+    punpckldq    m10, m9, m11
+    punpckhdq     m9, m11
+    punpckldq    m11, m8, m12
+    punpckhdq     m8, m12
+    mova         m12, %3
+    mova          %3, m8
+    punpckldq     m8, m7, m12
+    punpckhdq     m7, m12
+
+    ; interleaved in m13,14,2,15,3,1,5,0,4,6,8,7,10,9,11,rsp%3
+    punpcklqdq   m12, m13, m4
+    punpckhqdq   m13, m4
+    punpcklqdq    m4, m14, m6
+    punpckhqdq   m14, m6
+    punpcklqdq    m6, m2, m8
+    punpckhqdq    m2, m8
+    punpcklqdq    m8, m15, m7
+    punpckhqdq   m15, m7
+    punpcklqdq    m7, m3, m10
+    punpckhqdq    m3, m10
+    punpcklqdq   m10, m1, m9
+    punpckhqdq    m1, m9
+    punpcklqdq    m9, m5, m11
+    punpckhqdq    m5, m11
+    mova         m11, %3
+    mova          %3, m12
+    punpcklqdq   m12, m0, m11
+    punpckhqdq    m0, m11
+%if %2 == 0
+    mova         m11, %3
+%endif
+
+    ; interleaved m11,13,4,14,6,2,8,15,7,3,10,1,9,5,12,0
+    SWAP          0, 11, 1, 13, 5, 2, 4, 6, 8, 7, 15
+    SWAP          3, 14, 12, 9
+%endmacro
+
+%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
+    ; load data
+%ifidn %2, v
+%if %1 == 4
+    lea         tmpq, [dstq+mstrideq*2]
+    mova          m3, [tmpq+strideq*0]          ; p1
+    mova          m4, [tmpq+strideq*1]          ; p0
+    mova          m5, [tmpq+strideq*2]          ; q0
+    mova          m6, [tmpq+stride3q]           ; q1
+%else
+    ; load 6-8 pixels, remainder (for wd=16) will be read inline
+    lea         tmpq, [dstq+mstrideq*4]
+%if %1 != 6
+    mova         m12, [tmpq+strideq*0]
+%endif
+    mova         m13, [tmpq+strideq*1]
+    mova          m3, [tmpq+strideq*2]
+    mova          m4, [tmpq+stride3q]
+    mova          m5, [dstq+strideq*0]
+    mova          m6, [dstq+strideq*1]
+    mova         m14, [dstq+strideq*2]
+%if %1 != 6
+    mova         m15, [dstq+stride3q]
+%endif
+%endif
+%else
+    ; load lines
+%if %1 == 4
+    movd         xm3, [dstq+strideq*0-2]
+    movd         xm4, [dstq+strideq*1-2]
+    movd         xm5, [dstq+strideq*2-2]
+    movd         xm6, [dstq+stride3q -2]
+    lea         tmpq, [dstq+strideq*4]
+    pinsrd       xm3, [tmpq+strideq*0-2], 2
+    pinsrd       xm4, [tmpq+strideq*1-2], 2
+    pinsrd       xm5, [tmpq+strideq*2-2], 2
+    pinsrd       xm6, [tmpq+stride3q -2], 2
+    lea         tmpq, [tmpq+strideq*4]
+    pinsrd       xm3, [tmpq+strideq*0-2], 1
+    pinsrd       xm4, [tmpq+strideq*1-2], 1
+    pinsrd       xm5, [tmpq+strideq*2-2], 1
+    pinsrd       xm6, [tmpq+stride3q -2], 1
+    lea         tmpq, [tmpq+strideq*4]
+    pinsrd       xm3, [tmpq+strideq*0-2], 3
+    pinsrd       xm4, [tmpq+strideq*1-2], 3
+    pinsrd       xm5, [tmpq+strideq*2-2], 3
+    pinsrd       xm6, [tmpq+stride3q -2], 3
+    lea         tmpq, [tmpq+strideq*4]
+    movd        xm12, [tmpq+strideq*0-2]
+    movd        xm13, [tmpq+strideq*1-2]
+    movd        xm14, [tmpq+strideq*2-2]
+    movd        xm15, [tmpq+stride3q -2]
+    lea         tmpq, [tmpq+strideq*4]
+    pinsrd      xm12, [tmpq+strideq*0-2], 2
+    pinsrd      xm13, [tmpq+strideq*1-2], 2
+    pinsrd      xm14, [tmpq+strideq*2-2], 2
+    pinsrd      xm15, [tmpq+stride3q -2], 2
+    lea         tmpq, [tmpq+strideq*4]
+    pinsrd      xm12, [tmpq+strideq*0-2], 1
+    pinsrd      xm13, [tmpq+strideq*1-2], 1
+    pinsrd      xm14, [tmpq+strideq*2-2], 1
+    pinsrd      xm15, [tmpq+stride3q -2], 1
+    lea         tmpq, [tmpq+strideq*4]
+    pinsrd      xm12, [tmpq+strideq*0-2], 3
+    pinsrd      xm13, [tmpq+strideq*1-2], 3
+    pinsrd      xm14, [tmpq+strideq*2-2], 3
+    pinsrd      xm15, [tmpq+stride3q -2], 3
+    vinserti128   m3, xm12, 1
+    vinserti128   m4, xm13, 1
+    vinserti128   m5, xm14, 1
+    vinserti128   m6, xm15, 1
+
+    ; transpose 4x16
+    ; xm3: A-D0,A-D8,A-D4,A-D12
+    ; xm4: A-D1,A-D9,A-D5,A-D13
+    ; xm5: A-D2,A-D10,A-D6,A-D14
+    ; xm6: A-D3,A-D11,A-D7,A-D15
+    punpcklbw     m7, m3, m4
+    punpckhbw     m3, m4
+    punpcklbw     m4, m5, m6
+    punpckhbw     m5, m6
+    ; xm7: A0-1,B0-1,C0-1,D0-1,A8-9,B8-9,C8-9,D8-9
+    ; xm3: A4-5,B4-5,C4-5,D4-5,A12-13,B12-13,C12-13,D12-13
+    ; xm4: A2-3,B2-3,C2-3,D2-3,A10-11,B10-11,C10-11,D10-11
+    ; xm5: A6-7,B6-7,C6-7,D6-7,A14-15,B14-15,C14-15,D14-15
+    punpcklwd     m6, m7, m4
+    punpckhwd     m7, m4
+    punpcklwd     m4, m3, m5
+    punpckhwd     m3, m5
+    ; xm6: A0-3,B0-3,C0-3,D0-3
+    ; xm7: A8-11,B8-11,C8-11,D8-11
+    ; xm4: A4-7,B4-7,C4-7,D4-7
+    ; xm3: A12-15,B12-15,C12-15,D12-15
+    punpckldq     m5, m6, m4
+    punpckhdq     m6, m4
+    punpckldq     m4, m7, m3
+    punpckhdq     m7, m3
+    ; xm5: A0-7,B0-7
+    ; xm6: C0-7,D0-7
+    ; xm4: A8-15,B8-15
+    ; xm7: C8-15,D8-15
+    punpcklqdq    m3, m5, m4
+    punpckhqdq    m4, m5, m4
+    punpcklqdq    m5, m6, m7
+    punpckhqdq    m6, m7
+    ; xm3: A0-15
+    ; xm5: B0-15
+    ; xm4: C0-15
+    ; xm6: D0-15
+%elif %1 == 6 || %1 == 8
+    movq         xm3, [dstq+strideq*0-%1/2]
+    movq         xm4, [dstq+strideq*1-%1/2]
+    movq         xm5, [dstq+strideq*2-%1/2]
+    movq         xm6, [dstq+stride3q -%1/2]
+    lea         tmpq, [dstq+strideq*8]
+    movhps       xm3, [tmpq+strideq*0-%1/2]
+    movhps       xm4, [tmpq+strideq*1-%1/2]
+    movhps       xm5, [tmpq+strideq*2-%1/2]
+    movhps       xm6, [tmpq+stride3q -%1/2]
+    lea         tmpq, [tmpq+strideq*8]
+    movq         xm7, [tmpq+strideq*0-%1/2]
+    movq         xm8, [tmpq+strideq*1-%1/2]
+    movq         xm9, [tmpq+strideq*2-%1/2]
+    movq        xm11, [tmpq+stride3q -%1/2]
+    lea         tmpq, [tmpq+strideq*8]
+    movhps       xm7, [tmpq+strideq*0-%1/2]
+    movhps       xm8, [tmpq+strideq*1-%1/2]
+    movhps       xm9, [tmpq+strideq*2-%1/2]
+    movhps      xm11, [tmpq+stride3q -%1/2]
+    vinserti128   m3, xm7, 1
+    vinserti128   m4, xm8, 1
+    vinserti128   m5, xm9, 1
+    vinserti128   m6, xm11, 1
+    lea         tmpq, [dstq+strideq*4]
+    movq        xm12, [tmpq+strideq*0-%1/2]
+    movq        xm13, [tmpq+strideq*1-%1/2]
+    movq        xm14, [tmpq+strideq*2-%1/2]
+    movq        xm15, [tmpq+stride3q -%1/2]
+    lea         tmpq, [tmpq+strideq*8]
+    movhps      xm12, [tmpq+strideq*0-%1/2]
+    movhps      xm13, [tmpq+strideq*1-%1/2]
+    movhps      xm14, [tmpq+strideq*2-%1/2]
+    movhps      xm15, [tmpq+stride3q -%1/2]
+    lea         tmpq, [tmpq+strideq*8]
+    movq         xm7, [tmpq+strideq*0-%1/2]
+    movq         xm8, [tmpq+strideq*1-%1/2]
+    movq         xm9, [tmpq+strideq*2-%1/2]
+    movq        xm11, [tmpq+stride3q -%1/2]
+    lea         tmpq, [tmpq+strideq*8]
+    movhps       xm7, [tmpq+strideq*0-%1/2]
+    movhps       xm8, [tmpq+strideq*1-%1/2]
+    movhps       xm9, [tmpq+strideq*2-%1/2]
+    movhps      xm11, [tmpq+stride3q -%1/2]
+    vinserti128  m12, xm7, 1
+    vinserti128  m13, xm8, 1
+    vinserti128  m14, xm9, 1
+    vinserti128  m15, xm11, 1
+
+    ; transpose 8x16
+    ; xm3: A-H0,A-H8
+    ; xm4: A-H1,A-H9
+    ; xm5: A-H2,A-H10
+    ; xm6: A-H3,A-H11
+    ; xm12: A-H4,A-H12
+    ; xm13: A-H5,A-H13
+    ; xm14: A-H6,A-H14
+    ; xm15: A-H7,A-H15
+    punpcklbw    m7, m3, m4
+    punpckhbw    m3, m4
+    punpcklbw    m4, m5, m6
+    punpckhbw    m5, m6
+    punpcklbw    m6, m12, m13
+    punpckhbw   m12, m13
+    punpcklbw   m13, m14, m15
+    punpckhbw   m14, m15
+    ; xm7: A0-1,B0-1,C0-1,D0-1,E0-1,F0-1,G0-1,H0-1
+    ; xm3: A8-9,B8-9,C8-9,D8-9,E8-9,F8-9,G8-9,H8-9
+    ; xm4: A2-3,B2-3,C2-3,D2-3,E2-3,F2-3,G2-3,H2-3
+    ; xm5: A10-11,B10-11,C10-11,D10-11,E10-11,F10-11,G10-11,H10-11
+    ; xm6: A4-5,B4-5,C4-5,D4-5,E4-5,F4-5,G4-5,H4-5
+    ; xm12: A12-13,B12-13,C12-13,D12-13,E12-13,F12-13,G12-13,H12-13
+    ; xm13: A6-7,B6-7,C6-7,D6-7,E6-7,F6-7,G6-7,H6-7
+    ; xm14: A14-15,B14-15,C14-15,D14-15,E14-15,F14-15,G14-15,H14-15
+    punpcklwd   m15, m7, m4
+    punpckhwd    m7, m4
+    punpcklwd    m4, m3, m5
+    punpckhwd    m3, m5
+    punpcklwd    m5, m6, m13
+    punpckhwd    m6, m13
+    punpcklwd   m13, m12, m14
+    punpckhwd   m12, m14
+    ; xm15: A0-3,B0-3,C0-3,D0-3
+    ; xm7: E0-3,F0-3,G0-3,H0-3
+    ; xm4: A8-11,B8-11,C8-11,D8-11
+    ; xm3: E8-11,F8-11,G8-11,H8-11
+    ; xm5: A4-7,B4-7,C4-7,D4-7
+    ; xm6: E4-7,F4-7,G4-7,H4-7
+    ; xm13: A12-15,B12-15,C12-15,D12-15
+    ; xm12: E12-15,F12-15,G12-15,H12-15
+    punpckldq   m14, m15, m5
+    punpckhdq   m15, m5
+    punpckldq    m5, m7, m6
+%if %1 != 6
+    punpckhdq    m7, m6
+%endif
+    punpckldq    m6, m4, m13
+    punpckhdq    m4, m13
+    punpckldq   m13, m3, m12
+%if %1 != 6
+    punpckhdq   m12, m3, m12
+%endif
+    ; xm14: A0-7,B0-7
+    ; xm15: C0-7,D0-7
+    ; xm5: E0-7,F0-7
+    ; xm7: G0-7,H0-7
+    ; xm6: A8-15,B8-15
+    ; xm4: C8-15,D8-15
+    ; xm13: E8-15,F8-15
+    ; xm12: G8-15,H8-15
+    punpcklqdq   m3, m14, m6
+    punpckhqdq  m14, m6
+    punpckhqdq   m6, m15, m4
+    punpcklqdq  m15, m4
+    punpcklqdq   m4, m5, m13
+    punpckhqdq  m13, m5, m13
+%if %1 == 8
+    punpcklqdq   m5, m7, m12
+    punpckhqdq  m12, m7, m12
+    ; xm3: A0-15
+    ; xm14: B0-15
+    ; xm15: C0-15
+    ; xm6: D0-15
+    ; xm4: E0-15
+    ; xm13: F0-15
+    ; xm5: G0-15
+    ; xm12: H0-15
+    SWAP         12, 3, 15
+    SWAP         13, 14, 5, 4, 6
+    ; 3,14,15,6,4,13,5,12 -> 12,13,3,4,5,6,14,15
+%else
+    SWAP         13, 3, 14
+    SWAP          6, 4, 15, 5
+    ; 3,14,15,6,4,13 -> 13,3,4,5,6,14
+%endif
+%else
+    ; load and 16x16 transpose. We only use 14 pixels but we'll need the
+    ; remainder at the end for the second transpose
+    movu         xm0, [dstq+strideq*0-8]
+    movu         xm1, [dstq+strideq*1-8]
+    movu         xm2, [dstq+strideq*2-8]
+    movu         xm3, [dstq+stride3q -8]
+    lea         tmpq, [dstq+strideq*4]
+    movu         xm4, [tmpq+strideq*0-8]
+    movu         xm5, [tmpq+strideq*1-8]
+    movu         xm6, [tmpq+strideq*2-8]
+    movu         xm7, [tmpq+stride3q -8]
+    lea         tmpq, [tmpq+strideq*4]
+    movu         xm8, [tmpq+strideq*0-8]
+    movu         xm9, [tmpq+strideq*1-8]
+    movu        xm10, [tmpq+strideq*2-8]
+    movu        xm11, [tmpq+stride3q -8]
+    lea         tmpq, [tmpq+strideq*4]
+    movu        xm12, [tmpq+strideq*0-8]
+    movu        xm13, [tmpq+strideq*1-8]
+    movu        xm14, [tmpq+strideq*2-8]
+    movu        xm15, [tmpq+stride3q -8]
+    lea         tmpq, [tmpq+strideq*4]
+    vinserti128   m0, [tmpq+strideq*0-8], 1
+    vinserti128   m1, [tmpq+strideq*1-8], 1
+    vinserti128   m2, [tmpq+strideq*2-8], 1
+    vinserti128   m3, [tmpq+stride3q -8], 1
+    lea         tmpq, [tmpq+strideq*4]
+    vinserti128   m4, [tmpq+strideq*0-8], 1
+    vinserti128   m5, [tmpq+strideq*1-8], 1
+    vinserti128   m6, [tmpq+strideq*2-8], 1
+    vinserti128   m7, [tmpq+stride3q -8], 1
+    lea         tmpq, [tmpq+strideq*4]
+    vinserti128   m8, [tmpq+strideq*0-8], 1
+    vinserti128   m9, [tmpq+strideq*1-8], 1
+    vinserti128  m10, [tmpq+strideq*2-8], 1
+    vinserti128  m11, [tmpq+stride3q -8], 1
+    lea         tmpq, [tmpq+strideq*4]
+    vinserti128  m12, [tmpq+strideq*0-8], 1
+    vinserti128  m13, [tmpq+strideq*1-8], 1
+    vinserti128  m14, [tmpq+strideq*2-8], 1
+    vinserti128  m15, [tmpq+stride3q -8], 1
+
+    TRANSPOSE_16X16B 0, 1, [rsp+11*32]
+    mova  [rsp+12*32], m1
+    mova  [rsp+13*32], m2
+    mova  [rsp+14*32], m3
+    mova  [rsp+15*32], m12
+    mova  [rsp+16*32], m13
+    mova  [rsp+17*32], m14
+    mova  [rsp+18*32], m15
+    ; 4,5,6,7,8,9,10,11 -> 12,13,3,4,5,6,14,15
+    SWAP           12, 4, 7
+    SWAP           13, 5, 8
+    SWAP            3, 6, 9
+    SWAP           10, 14
+    SWAP           11, 15
+%endif
+%endif
+
+    ; load L/E/I/H
+%ifidn %2, v
     movu          m1, [lq]
     movu          m0, [lq+l_strideq]
+%else
+    movq         xm1, [lq]
+    movq         xm2, [lq+l_strideq*2]
+    movhps       xm1, [lq+l_strideq]
+    movhps       xm2, [lq+l_stride3q]
+    lea           lq, [lq+l_strideq*4]
+    movq        xm10, [lq]
+    movq         xm0, [lq+l_strideq*2]
+    movhps      xm10, [lq+l_strideq]
+    movhps       xm0, [lq+l_stride3q]
+    lea           lq, [lq+l_strideq*4]
+    vinserti128   m1, xm10, 1
+    vinserti128   m2, xm0, 1
+    shufps        m0, m1, m2, q3131
+    shufps        m1, m2, q2020
+%endif
     pxor          m2, m2
-    pcmpeqb       m3, m2, m0
-    pand          m1, m3
+    pcmpeqb      m10, m2, m0
+    pand          m1, m10
     por           m0, m1                        ; l[x][] ? l[x][] : l[x-stride][]
     pshufb        m0, [pb_4x1_4x5_4x9_4x13]     ; l[x][1]
     pcmpeqb      m10, m2, m0                    ; !L
-    pand          m1, m0, [pb_240]
-    psrlq         m1, 4                         ; H
     psrlq         m2, m0, [lutq+128]
     pand          m2, [pb_63]
-    vpbroadcastb  m4, [lutq+136]
-    pminub        m2, m4
+    vpbroadcastb  m1, [lutq+136]
+    pminub        m2, m1
     pmaxub        m2, [pb_1]                    ; I
+    pand          m1, m0, [pb_240]
+    psrlq         m1, 4                         ; H
     paddb         m0, [pb_2]
     paddb         m0, m0
     paddb         m0, m2                        ; E
@@ -86,20 +560,6 @@
     pxor          m2, [pb_128]
     pxor          m0, [pb_128]
 
-%if %1 == 4
-    lea         tmpq, [dstq+mstrideq*2]
-    mova          m3, [tmpq+strideq*0]          ; p1
-    mova          m4, [tmpq+strideq*1]          ; p0
-    mova          m5, [tmpq+strideq*2]          ; q0
-    mova          m6, [tmpq+stride3q]           ; q1
-%else
-    lea         tmpq, [dstq+mstrideq*4]
-    mova          m3, [tmpq+strideq*2]
-    mova          m4, [tmpq+stride3q]
-    mova          m5, [dstq+strideq*0]
-    mova          m6, [dstq+strideq*1]
-%endif
-
     ABSSUB        m8, m3, m4, m9                ; abs(p1-p0)
     pmaxub        m8, m10
     ABSSUB        m9, m5, m6, m10               ; abs(q1-q0)
@@ -111,15 +571,6 @@
     pxor          m7, m8, [pb_128]
     pcmpgtb       m7, m1                        ; hev
 
-%if %1 != 6
-    mova         m12, [tmpq+strideq*0]
-%endif
-    mova         m13, [tmpq+strideq*1]
-    mova         m14, [dstq+strideq*2]
-%if %1 != 6
-    mova         m15, [dstq+stride3q]
-%endif
-
 %if %1 == 6
     ABSSUB        m9, m13, m4, m10              ; abs(p2-p0)
     pmaxub        m9, m8
@@ -179,23 +630,47 @@
     por           m8, m10
 
 %if %1 == 16
+%ifidn %2, v
     lea         tmpq, [dstq+mstrideq*8]
     mova          m0, [tmpq+strideq*1]
+%else
+    mova          m0, [rsp+12*32]
+%endif
     ABSSUB        m1, m0, m4, m2
+%ifidn %2, v
     mova          m0, [tmpq+strideq*2]
+%else
+    mova          m0, [rsp+13*32]
+%endif
     ABSSUB        m2, m0, m4, m10
     pmaxub        m1, m2
+%ifidn %2, v
     mova          m0, [tmpq+stride3q]
+%else
+    mova          m0, [rsp+14*32]
+%endif
     ABSSUB        m2, m0, m4, m10
     pmaxub        m1, m2
+%ifidn %2, v
     lea         tmpq, [dstq+strideq*4]
     mova          m0, [tmpq+strideq*0]
+%else
+    mova          m0, [rsp+15*32]
+%endif
     ABSSUB        m2, m0, m5, m10
     pmaxub        m1, m2
+%ifidn %2, v
     mova          m0, [tmpq+strideq*1]
+%else
+    mova          m0, [rsp+16*32]
+%endif
     ABSSUB        m2, m0, m5, m10
     pmaxub        m1, m2
+%ifidn %2, v
     mova          m0, [tmpq+strideq*2]
+%else
+    mova          m0, [rsp+17*32]
+%endif
     ABSSUB        m2, m0, m5, m10
     pmaxub        m1, m2
     pxor          m1, [pb_128]
@@ -279,10 +754,16 @@
 
 %if %1 == 16
     ; flat16 filter
+%ifidn %2, v
     lea         tmpq, [dstq+mstrideq*8]
     mova          m0, [tmpq+strideq*1]          ; p6
     mova          m2, [tmpq+strideq*2]          ; p5
     mova          m7, [tmpq+stride3q]           ; p4
+%else
+    mova          m0, [rsp+12*32]
+    mova          m2, [rsp+13*32]
+    mova          m7, [rsp+14*32]
+%endif
 
     mova  [rsp+0*32], m9
     mova  [rsp+1*32], m14
@@ -318,7 +799,11 @@
     pand          m8, m1
     pandn         m9, m1, m2
     por           m8, m9
+%ifidn %2, v
     mova [tmpq+strideq*2], m8                   ; p5
+%else
+    mova [rsp+13*32], m8
+%endif
 
     ; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B
     ; write -5
@@ -340,7 +825,11 @@
     pand          m8, m1
     pandn         m9, m1, m7
     por           m8, m9
+%ifidn %2, v
     mova [tmpq+stride3q], m8                    ; p4
+%else
+    mova [rsp+14*32], m8
+%endif
 
     ; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C
     ; write -4
@@ -364,7 +853,11 @@
     pand          m8, m1
     pandn         m9, m1, m12
     por           m8, m9
+%ifidn %2, v
     mova [tmpq+strideq*4], m8                   ; p3
+%else
+    mova [rsp+19*32], m8
+%endif
 
     ; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D
     ; write -3
@@ -392,7 +885,9 @@
 
     ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E
     ; write -2
+%ifidn %2, v
     lea         tmpq, [dstq+strideq*4]
+%endif
     punpcklbw     m8, m0, m4
     punpckhbw     m9, m0, m4
     pmaddubsw     m8, [pb_m1_1]
@@ -399,7 +894,11 @@
     pmaddubsw     m9, [pb_m1_1]
     paddw        m10, m8
     paddw        m11, m9                        ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3
+%ifidn %2, v
     mova          m9, [tmpq+strideq*0]          ; q4
+%else
+    mova          m9, [rsp+15*32]
+%endif
     punpcklbw     m8, m12, m9
     punpckhbw     m9, m12, m9
     pmaddubsw     m8, [pb_m1_1]
@@ -418,7 +917,11 @@
 
     ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F
     ; write -1
+%ifidn %2, v
     mova          m9, [tmpq+strideq*1]          ; q5
+%else
+    mova          m9, [rsp+16*32]
+%endif
     punpcklbw     m8, m0, m5
     punpckhbw     m0, m5
     pmaddubsw     m8, [pb_m1_1]
@@ -444,7 +947,11 @@
 
     ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G
     ; write +0
+%ifidn %2, v
     mova          m0, [tmpq+strideq*2]          ; q6
+%else
+    mova          m0, [rsp+17*32]
+%endif
     paddw        m10, [rsp+3*32]
     paddw        m11, [rsp+4*32]                ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5
     punpcklbw     m8, m3, m0
@@ -517,7 +1024,11 @@
     pand          m8, m1
     pandn         m9, m1, m15
     por           m8, m9
+%ifidn %2, v
     mova [tmpq+mstrideq], m8                    ; q3
+%else
+    mova [rsp+20*32], m8
+%endif
 
     ; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K
     ; write +4
@@ -534,9 +1045,17 @@
     pmulhrsw      m9, m11, [pw_2048]
     packuswb      m8, m9
     pand          m8, m1
+%ifidn %2, v
     pandn         m9, m1, [tmpq+strideq*0]
+%else
+    pandn         m9, m1, [rsp+15*32]
+%endif
     por           m8, m9
+%ifidn %2, v
     mova [tmpq+strideq*0], m8                    ; q4
+%else
+    mova [rsp+15*32], m8
+%endif
 
     ; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L
     ; write +5
@@ -552,16 +1071,25 @@
     pmulhrsw     m11, [pw_2048]
     packuswb     m10, m11
     pand         m10, m1
+%ifidn %2, v
     pandn        m11, m1, [tmpq+strideq*1]
+%else
+    pandn        m11, m1, [rsp+16*32]
+%endif
     por          m10, m11
+%ifidn %2, v
     mova [tmpq+strideq*1], m10                  ; q5
+%else
+    mova [rsp+16*32], m10
+%endif
 
     mova          m9, [rsp+0*32]
+%ifidn %2, v
     lea         tmpq, [dstq+mstrideq*4]
 %endif
+%endif
 %if %1 >= 8
     ; flat8 filter
-
     punpcklbw     m0, m12, m3
     punpckhbw     m1, m12, m3
     pmaddubsw     m2, m0, [pb_3_1]
@@ -583,8 +1111,10 @@
     packuswb      m8, m11
     pand          m8, m9
     pandn        m11, m9, m13
-    por           m8, m11                      ; p2
-    mova [tmpq+strideq*1], m8                  ; p2
+    por          m10, m8, m11                  ; p2
+%ifidn %2, v
+    mova [tmpq+strideq*1], m10                 ; p2
+%endif
 
     pmaddubsw     m8, m0, [pb_m1_1]
     pmaddubsw    m11, m1, [pb_m1_1]
@@ -602,7 +1132,11 @@
     pand          m8, m9
     pandn        m11, m9, m3
     por           m8, m11                       ; p1
+%ifidn %2, v
     mova [tmpq+strideq*2], m8                   ; p1
+%else
+    mova  [rsp+0*32], m8
+%endif
 
     pmaddubsw     m0, [pb_1]
     pmaddubsw     m1, [pb_1]
@@ -620,7 +1154,11 @@
     pand          m8, m9
     pandn        m11, m9, m4
     por           m8, m11                       ; p0
+%ifidn %2, v
     mova [tmpq+stride3q ], m8                   ; p0
+%else
+    mova  [rsp+1*32], m8
+%endif
 
     punpcklbw     m0, m5, m15
     punpckhbw     m1, m5, m15
@@ -639,8 +1177,10 @@
     packuswb      m8, m11
     pand          m8, m9
     pandn        m11, m9, m5
-    por           m8, m11                       ; q0
-    mova [dstq+strideq*0], m8                   ; q0
+    por          m11, m8, m11                   ; q0
+%ifidn %2, v
+    mova [dstq+strideq*0], m11                  ; q0
+%endif
 
     pmaddubsw     m0, [pb_m1_1]
     pmaddubsw     m1, [pb_m1_1]
@@ -647,18 +1187,20 @@
     paddw         m2, m0
     paddw         m7, m1
     punpcklbw     m8, m13, m6
-    punpckhbw    m11, m13, m6
+    punpckhbw    m13, m6
     pmaddubsw     m8, [pb_m1_1]
-    pmaddubsw    m11, [pb_m1_1]
+    pmaddubsw    m13, [pb_m1_1]
     paddw         m2, m8
-    paddw         m7, m11                       ; p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4
+    paddw         m7, m13                       ; p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4
     psrlw         m8, m2, 3
-    psrlw        m11, m7, 3
-    packuswb      m8, m11
+    psrlw        m13, m7, 3
+    packuswb      m8, m13
     pand          m8, m9
-    pandn        m11, m9, m6
-    por           m8, m11                       ; q1
-    mova [dstq+strideq*1], m8                   ; q1
+    pandn        m13, m9, m6
+    por          m13, m8, m13                   ; q1
+%ifidn %2, v
+    mova [dstq+strideq*1], m13                  ; q1
+%endif
 
     punpcklbw     m0, m3, m6
     punpckhbw     m1, m3, m6
@@ -676,9 +1218,152 @@
     psrlw         m7, 3
     packuswb      m2, m7
     pand          m2, m9
-    pandn        m11, m9, m14
-    por           m2, m11                       ; q2
+    pandn         m7, m9, m14
+    por           m2, m7                        ; q2
+%ifidn %2, v
     mova [dstq+strideq*2], m2                   ; q2
+%else
+    mova          m0, [rsp+0*32]
+    mova          m1, [rsp+1*32]
+%if %1 == 8
+    ; 16x8 transpose
+    punpcklbw     m3, m12, m10
+    punpckhbw    m12, m10
+    punpcklbw    m10, m0, m1
+    punpckhbw     m0, m1
+    punpcklbw     m1, m11, m13
+    punpckhbw    m11, m13
+    punpcklbw    m13, m2, m15
+    punpckhbw     m2, m15
+
+    punpcklwd    m15, m3, m10
+    punpckhwd     m3, m10
+    punpcklwd    m10, m12, m0
+    punpckhwd    m12, m0
+    punpcklwd     m0, m1, m13
+    punpckhwd     m1, m13
+    punpcklwd    m13, m11, m2
+    punpckhwd    m11, m2
+
+    punpckldq     m2, m15, m0
+    punpckhdq    m15, m0
+    punpckldq     m0, m3, m1
+    punpckhdq     m3, m1
+    punpckldq     m1, m10, m13
+    punpckhdq    m10, m13
+    punpckldq    m13, m12, m11
+    punpckhdq    m12, m11
+
+    ; write 8x32
+    movq   [dstq+strideq*0-4], xm2
+    movhps [dstq+strideq*1-4], xm2
+    movq   [dstq+strideq*2-4], xm15
+    movhps [dstq+stride3q -4], xm15
+    lea         dstq, [dstq+strideq*4]
+    movq   [dstq+strideq*0-4], xm0
+    movhps [dstq+strideq*1-4], xm0
+    movq   [dstq+strideq*2-4], xm3
+    movhps [dstq+stride3q -4], xm3
+    lea         dstq, [dstq+strideq*4]
+    movq   [dstq+strideq*0-4], xm1
+    movhps [dstq+strideq*1-4], xm1
+    movq   [dstq+strideq*2-4], xm10
+    movhps [dstq+stride3q -4], xm10
+    lea         dstq, [dstq+strideq*4]
+    movq   [dstq+strideq*0-4], xm13
+    movhps [dstq+strideq*1-4], xm13
+    movq   [dstq+strideq*2-4], xm12
+    movhps [dstq+stride3q -4], xm12
+    lea         dstq, [dstq+strideq*4]
+
+    vextracti128  xm2,  m2, 1
+    vextracti128 xm15, m15, 1
+    vextracti128  xm0,  m0, 1
+    vextracti128  xm3,  m3, 1
+    vextracti128  xm1,  m1, 1
+    vextracti128 xm10, m10, 1
+    vextracti128 xm13, m13, 1
+    vextracti128 xm12, m12, 1
+
+    movq   [dstq+strideq*0-4], xm2
+    movhps [dstq+strideq*1-4], xm2
+    movq   [dstq+strideq*2-4], xm15
+    movhps [dstq+stride3q -4], xm15
+    lea         dstq, [dstq+strideq*4]
+    movq   [dstq+strideq*0-4], xm0
+    movhps [dstq+strideq*1-4], xm0
+    movq   [dstq+strideq*2-4], xm3
+    movhps [dstq+stride3q -4], xm3
+    lea         dstq, [dstq+strideq*4]
+    movq   [dstq+strideq*0-4], xm1
+    movhps [dstq+strideq*1-4], xm1
+    movq   [dstq+strideq*2-4], xm10
+    movhps [dstq+stride3q -4], xm10
+    lea         dstq, [dstq+strideq*4]
+    movq   [dstq+strideq*0-4], xm13
+    movhps [dstq+strideq*1-4], xm13
+    movq   [dstq+strideq*2-4], xm12
+    movhps [dstq+stride3q -4], xm12
+    lea         dstq, [dstq+strideq*4]
+%else
+    ; 16x16 transpose and store
+    SWAP           5, 10, 2
+    SWAP           6, 0
+    SWAP           7, 1
+    SWAP           8, 11
+    SWAP           9, 13
+    mova          m0, [rsp+11*32]
+    mova          m1, [rsp+12*32]
+    mova          m2, [rsp+13*32]
+    mova          m3, [rsp+14*32]
+    mova          m4, [rsp+19*32]
+    mova         m11, [rsp+20*32]
+    mova         m12, [rsp+15*32]
+    mova         m13, [rsp+16*32]
+    mova         m14, [rsp+17*32]
+    TRANSPOSE_16X16B 1, 0, [rsp+18*32]
+    movu [dstq+strideq*0-8], xm0
+    movu [dstq+strideq*1-8], xm1
+    movu [dstq+strideq*2-8], xm2
+    movu [dstq+stride3q -8], xm3
+    lea         dstq, [dstq+strideq*4]
+    movu [dstq+strideq*0-8], xm4
+    movu [dstq+strideq*1-8], xm5
+    movu [dstq+strideq*2-8], xm6
+    movu [dstq+stride3q -8], xm7
+    lea         dstq, [dstq+strideq*4]
+    movu [dstq+strideq*0-8], xm8
+    movu [dstq+strideq*1-8], xm9
+    movu [dstq+strideq*2-8], xm10
+    movu [dstq+stride3q -8], xm11
+    lea         dstq, [dstq+strideq*4]
+    movu [dstq+strideq*0-8], xm12
+    movu [dstq+strideq*1-8], xm13
+    movu [dstq+strideq*2-8], xm14
+    movu [dstq+stride3q -8], xm15
+    lea         dstq, [dstq+strideq*4]
+    vextracti128 [dstq+strideq*0-8], m0, 1
+    vextracti128 [dstq+strideq*1-8], m1, 1
+    vextracti128 [dstq+strideq*2-8], m2, 1
+    vextracti128 [dstq+stride3q -8], m3, 1
+    lea         dstq, [dstq+strideq*4]
+    vextracti128 [dstq+strideq*0-8], m4, 1
+    vextracti128 [dstq+strideq*1-8], m5, 1
+    vextracti128 [dstq+strideq*2-8], m6, 1
+    vextracti128 [dstq+stride3q -8], m7, 1
+    lea         dstq, [dstq+strideq*4]
+    vextracti128 [dstq+strideq*0-8], m8, 1
+    vextracti128 [dstq+strideq*1-8], m9, 1
+    vextracti128 [dstq+strideq*2-8], m10, 1
+    vextracti128 [dstq+stride3q -8], m11, 1
+    lea         dstq, [dstq+strideq*4]
+    vextracti128 [dstq+strideq*0-8], m12, 1
+    vextracti128 [dstq+strideq*1-8], m13, 1
+    vextracti128 [dstq+strideq*2-8], m14, 1
+    vextracti128 [dstq+stride3q -8], m15, 1
+    lea         dstq, [dstq+strideq*4]
+%endif
+%endif
 %elif %1 == 6
     ; flat6 filter
 
@@ -698,7 +1383,9 @@
     pand          m2, m9
     pandn        m12, m9, m3
     por           m2, m12
+%ifidn %2, v
     mova [tmpq+strideq*2], m2                   ; p1
+%endif
 
     pmaddubsw     m8, [pb_m1_1]
     pmaddubsw    m11, [pb_m1_1]
@@ -716,23 +1403,27 @@
     pand         m12, m9
     pandn        m13, m9, m4
     por          m12, m13
+%ifidn %2, v
     mova [tmpq+stride3q], m12                   ; p0
+%endif
 
     paddw         m0, m8
     paddw         m1, m11
     punpcklbw     m8, m3, m14
     punpckhbw    m11, m3, m14
-    pmaddubsw    m12, m8, [pb_m1_1]
+    pmaddubsw    m14, m8, [pb_m1_1]
     pmaddubsw    m13, m11, [pb_m1_1]
-    paddw         m0, m12
+    paddw         m0, m14
     paddw         m1, m13
-    pmulhrsw     m12, m0, [pw_4096]
+    pmulhrsw     m14, m0, [pw_4096]
     pmulhrsw     m13, m1, [pw_4096]
-    packuswb     m12, m13
-    pand         m12, m9
+    packuswb     m14, m13
+    pand         m14, m9
     pandn        m13, m9, m5
-    por          m12, m13
-    mova [dstq+strideq*0], m12                  ; q0
+    por          m14, m13
+%ifidn %2, v
+    mova [dstq+strideq*0], m14                  ; q0
+%endif
 
     pmaddubsw     m8, [pb_m1_2]
     pmaddubsw    m11, [pb_m1_2]
@@ -748,19 +1439,27 @@
     pand          m0, m9
     pandn         m9, m6
     por           m0, m9
+%ifidn %2, v
     mova [dstq+strideq*1], m0                   ; q1
 %else
+    TRANSPOSE_16x4_AND_WRITE_4x32 2, 12, 14, 0, 1
+%endif
+%else
+%ifidn %2, v
     mova [tmpq+strideq*0], m3                   ; p1
     mova [tmpq+strideq*1], m4                   ; p0
     mova [tmpq+strideq*2], m5                   ; q0
     mova [tmpq+stride3q ], m6                   ; q1
+%else
+    TRANSPOSE_16x4_AND_WRITE_4x32 3, 4, 5, 6, 7
 %endif
+%endif
 %endmacro
 
 INIT_YMM avx2
-cglobal lpf_v_sb128y, 7, 10, 16, 32 * 11, \
-                      dst, stride, mask, l, l_stride, lut, \
-                      w, stride3, mstride, tmp
+cglobal lpf_v_sb_y, 7, 10, 16, 32 * 11, \
+                    dst, stride, mask, l, l_stride, lut, \
+                    w, stride3, mstride, tmp
     shl    l_strideq, 2
     sub           lq, l_strideq
     mov     mstrideq, strideq
@@ -771,7 +1470,7 @@
     cmp byte [maskq+8], 0                       ; vmask[2]
     je .no_flat16
 
-    FILTER        16
+    FILTER        16, v
     jmp .end
 
 .no_flat16:
@@ -778,7 +1477,7 @@
     cmp byte [maskq+4], 0                       ; vmask[1]
     je .no_flat
 
-    FILTER         8
+    FILTER         8, v
     jmp .end
 
 .no_flat:
@@ -785,7 +1484,7 @@
     cmp byte [maskq+0], 0                       ; vmask[0]
     je .end
 
-    FILTER         4
+    FILTER         4, v
 
 .end:
     add           lq, 32
@@ -796,10 +1495,50 @@
     RET
 
 INIT_YMM avx2
-cglobal lpf_v_sb128uv, 7, 10, 16, \
-                       dst, stride, mask, l, l_stride, lut, \
-                       w, stride3, mstride, tmp
+cglobal lpf_h_sb_y, 7, 10, 16, 32 * 21, \
+                    dst, stride, mask, l, l_stride, lut, \
+                    h, stride3, l_stride3, tmp
     shl    l_strideq, 2
+    sub           lq, 4
+    lea     stride3q, [strideq*3]
+    lea   l_stride3q, [l_strideq*3]
+
+.loop:
+    cmp byte [maskq+8], 0                       ; vmask[2]
+    je .no_flat16
+
+    FILTER        16, h
+    jmp .end
+
+.no_flat16:
+    cmp byte [maskq+4], 0                       ; vmask[1]
+    je .no_flat
+
+    FILTER         8, h
+    jmp .end
+
+.no_flat:
+    cmp byte [maskq+0], 0                       ; vmask[0]
+    je .no_filter
+
+    FILTER         4, h
+    jmp .end
+
+.no_filter:
+    lea         dstq, [dstq+stride3q*8]
+    lea           lq, [lq+l_strideq*8]
+    lea         dstq, [dstq+strideq*8]
+.end:
+    add        maskq, 1
+    sub           hd, 8
+    jg .loop
+    RET
+
+INIT_YMM avx2
+cglobal lpf_v_sb_uv, 7, 10, 16, \
+                     dst, stride, mask, l, l_stride, lut, \
+                     w, stride3, mstride, tmp
+    shl    l_strideq, 2
     sub           lq, l_strideq
     mov     mstrideq, strideq
     neg     mstrideq
@@ -809,7 +1548,7 @@
     cmp byte [maskq+4], 0                       ; vmask[1]
     je .no_flat
 
-    FILTER         6
+    FILTER         6, v
     jmp .end
 
 .no_flat:
@@ -816,7 +1555,7 @@
     cmp byte [maskq+0], 0                       ; vmask[0]
     je .end
 
-    FILTER         4
+    FILTER         4, v
 
 .end:
     add           lq, 32
@@ -823,6 +1562,39 @@
     add         dstq, 32
     add        maskq, 1
     sub           wd, 8
+    jg .loop
+    RET
+
+INIT_YMM avx2
+cglobal lpf_h_sb_uv, 7, 10, 16, \
+                     dst, stride, mask, l, l_stride, lut, \
+                     h, stride3, l_stride3, tmp
+    shl    l_strideq, 2
+    sub           lq, 4
+    lea     stride3q, [strideq*3]
+    lea   l_stride3q, [l_strideq*3]
+
+.loop:
+    cmp byte [maskq+4], 0                       ; vmask[1]
+    je .no_flat
+
+    FILTER         6, h
+    jmp .end
+
+.no_flat:
+    cmp byte [maskq+0], 0                       ; vmask[0]
+    je .no_filter
+
+    FILTER         4, h
+    jmp .end
+
+.no_filter:
+    lea         dstq, [dstq+stride3q*8]
+    lea           lq, [lq+l_strideq*8]
+    lea         dstq, [dstq+strideq*8]
+.end:
+    add        maskq, 1
+    sub           hd, 8
     jg .loop
     RET
 
--- a/src/x86/loopfilter_init.c
+++ b/src/x86/loopfilter_init.c
@@ -28,8 +28,10 @@
 #include "src/cpu.h"
 #include "src/loopfilter.h"
 
-decl_loopfilter_sb_fn(dav1d_lpf_v_sb128y_avx2);
-decl_loopfilter_sb_fn(dav1d_lpf_v_sb128uv_avx2);
+decl_loopfilter_sb_fn(dav1d_lpf_h_sb_y_avx2);
+decl_loopfilter_sb_fn(dav1d_lpf_v_sb_y_avx2);
+decl_loopfilter_sb_fn(dav1d_lpf_h_sb_uv_avx2);
+decl_loopfilter_sb_fn(dav1d_lpf_v_sb_uv_avx2);
 
 void bitfn(dav1d_loop_filter_dsp_init_x86)(Dav1dLoopFilterDSPContext *const c) {
     const unsigned flags = dav1d_get_cpu_flags();
@@ -37,7 +39,9 @@
     if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
 
 #if BITDEPTH == 8 && ARCH_X86_64
-    c->loop_filter_sb[0][1] = dav1d_lpf_v_sb128y_avx2;
-    c->loop_filter_sb[1][1] = dav1d_lpf_v_sb128uv_avx2;
+    c->loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_avx2;
+    c->loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_avx2;
+    c->loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_avx2;
+    c->loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_avx2;
 #endif
 }