ref: 78d27b7d1c923f632bc266470436e7f46a940d70
parent: 3497c4c905f8c85d8c65b28c352ff85dfddd66ed
author: Henrik Gramner <[email protected]>
date: Wed Dec 2 09:10:57 EST 2020

x86: Rewrite wiener SSE2/SSSE3/AVX2 asm

The previous implementation performed two separate passes, one
horizontal and one vertical, with the intermediate values stored in a
buffer on the stack. This caused bad cache thrashing.

By interleaving the horizontal and vertical passes, in combination
with a ring buffer that stores only a few rows at a time, performance
is improved significantly.
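
Conceptually, the new structure looks like the following C sketch
(hypothetical names, a dummy filter in place of the real taps, no edge
handling; an illustration of the ring-buffer rotation rather than the
actual implementation):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    #define MAX_UNIT_W 384 /* matches the per-row stride used by the asm */

    static void wiener_interleaved_sketch(uint8_t *dst, ptrdiff_t dst_stride,
                                          const uint8_t *src, ptrdiff_t src_stride,
                                          int w, int h)
    {
        enum { TAPS = 7 };
        int16_t rows[TAPS][MAX_UNIT_W]; /* ring buffer: one slot per vertical tap */
        int16_t *t[TAPS];               /* t[0] = newest row, t[TAPS-1] = oldest */
        for (int i = 0; i < TAPS; i++) t[i] = rows[i];

        /* prime the buffer with copies of the first row */
        for (int x = 0; x < w; x++) rows[0][x] = src[x];
        for (int i = 1; i < TAPS; i++) memcpy(rows[i], rows[0], w * sizeof(int16_t));

        for (int y = 0; y < h; y++) {
            /* horizontal pass: filter the incoming row into the oldest slot,
             * then rotate the pointers so it becomes the newest slot */
            int16_t *const newest = t[TAPS - 1];
            const uint8_t *const s = src + (ptrdiff_t)y * src_stride;
            for (int x = 0; x < w; x++)
                newest[x] = s[x]; /* stand-in for the real 7-tap horizontal filter */
            memmove(&t[1], &t[0], (TAPS - 1) * sizeof(t[0]));
            t[0] = newest;

            /* vertical pass over the few buffered rows, producing final pixels;
             * the real code centers the taps and handles top/bottom edges */
            for (int x = 0; x < w; x++) {
                int sum = 0;
                for (int i = 0; i < TAPS; i++) sum += t[i][x];
                int px = sum / TAPS; /* stand-in for the real 7-tap vertical filter */
                dst[(ptrdiff_t)y * dst_stride + x] = (uint8_t)px;
            }
        }
    }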

Also split the function into 7-tap and 5-tap versions. The latter is
faster and fairly common (always for chroma, sometimes for luma).
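
The 5-tap case is detected from the filter coefficients, as in this
condensed sketch of the lr_apply_tmpl.c hunk below (wiener_taps_index()
is a hypothetical helper, not part of dav1d):

    #include <stdint.h>

    static int wiener_taps_index(const int16_t filter[2][8])
    {
        /* filter[0] = horizontal taps, filter[1] = vertical taps;
         * index 0 holds the outermost coefficient of each. If both are
         * zero the filter degenerates to 5 taps and the cheaper
         * wiener[1] entry can be used. */
        return !(filter[0][0] | filter[1][0]); /* 0 = 7-tap, 1 = 5-tap */
    }

This mirrors the dispatch added in the patch:
wiener_fn = dsp->lr.wiener[!(filter[0][0] | filter[1][0])];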

--- a/src/arm/looprestoration_init_tmpl.c
+++ b/src/arm/looprestoration_init_tmpl.c
@@ -288,7 +288,7 @@
 
     if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
 
-    c->wiener = wiener_filter_neon;
+    c->wiener[0] = c->wiener[1] = wiener_filter_neon;
     if (bpc <= 10)
         c->selfguided = sgr_filter_neon;
 }
--- a/src/looprestoration.h
+++ b/src/looprestoration.h
@@ -67,7 +67,7 @@
 typedef decl_selfguided_filter_fn(*selfguided_fn);
 
 typedef struct Dav1dLoopRestorationDSPContext {
-    wienerfilter_fn wiener;
+    wienerfilter_fn wiener[2]; /* 7-tap, 5-tap */
     selfguided_fn selfguided;
 } Dav1dLoopRestorationDSPContext;
 
--- a/src/looprestoration_tmpl.c
+++ b/src/looprestoration_tmpl.c
@@ -509,7 +509,7 @@
 }
 
 COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c, int bpc) {
-    c->wiener = wiener_c;
+    c->wiener[0] = c->wiener[1] = wiener_c;
     c->selfguided = selfguided_c;
 
 #if HAVE_ASM
--- a/src/lr_apply_tmpl.c
+++ b/src/lr_apply_tmpl.c
@@ -163,6 +163,7 @@
     int stripe_h = imin((64 - 8 * !y) >> ss_ver, row_h - y);
 
     ALIGN_STK_16(int16_t, filter, 2, [8]);
+    wienerfilter_fn wiener_fn = NULL;
     if (lr->type == DAV1D_RESTORATION_WIENER) {
         filter[0][0] = filter[0][6] = lr->filter_h[0];
         filter[0][1] = filter[0][5] = lr->filter_h[1];
@@ -178,6 +179,8 @@
         filter[1][1] = filter[1][5] = lr->filter_v[1];
         filter[1][2] = filter[1][4] = lr->filter_v[2];
         filter[1][3] = 128 - (filter[1][0] + filter[1][1] + filter[1][2]) * 2;
+
+        wiener_fn = dsp->lr.wiener[!(filter[0][0] | filter[1][0])];
     } else {
         assert(lr->type == DAV1D_RESTORATION_SGRPROJ);
     }
@@ -185,9 +188,9 @@
     while (y + stripe_h <= row_h) {
         // Change HAVE_BOTTOM bit in edges to (y + stripe_h != row_h)
         edges ^= (-(y + stripe_h != row_h) ^ edges) & LR_HAVE_BOTTOM;
-        if (lr->type == DAV1D_RESTORATION_WIENER) {
-            dsp->lr.wiener(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
-                           filter, edges HIGHBD_CALL_SUFFIX);
+        if (wiener_fn) {
+            wiener_fn(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
+                      filter, edges HIGHBD_CALL_SUFFIX);
         } else {
             dsp->lr.selfguided(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
                                lr->sgr_idx, lr->sgr_weights, edges HIGHBD_CALL_SUFFIX);
--- a/src/ppc/looprestoration_init_tmpl.c
+++ b/src/ppc/looprestoration_init_tmpl.c
@@ -332,7 +332,7 @@
     if (!(flags & DAV1D_PPC_CPU_FLAG_VSX)) return;
 
 #if BITDEPTH == 8
-    c->wiener = wiener_filter_vsx;
+    c->wiener[0] = c->wiener[1] = wiener_filter_vsx;
 #endif
 }
 
--- a/src/x86/looprestoration.asm
+++ b/src/x86/looprestoration.asm
@@ -29,20 +29,25 @@
 %if ARCH_X86_64
 
 SECTION_RODATA 32
+
+wiener_shufA:  db  1,  7,  2,  8,  3,  9,  4, 10,  5, 11,  6, 12,  7, 13,  8, 14
+wiener_shufB:  db  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10
+wiener_shufC:  db  6,  5,  7,  6,  8,  7,  9,  8, 10,  9, 11, 10, 12, 11, 13, 12
+wiener_shufD:  db  4, -1,  5, -1,  6, -1,  7, -1,  8, -1,  9, -1, 10, -1, 11, -1
+wiener_l_shuf: db  4,  4,  4,  4,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
+pb_0to31:      db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
+               db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
 pb_right_ext_mask: times 32 db 0xff
                    times 32 db 0
-pb_14x0_1_2: times 14 db 0
-             db 1, 2
-pb_0_to_15_min_n: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13
-                  db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14
-pb_15: times 16 db 15
-pw_16: times 2 dw 16
-pw_256: times 2 dw 256
-pw_2048: times 2 dw 2048
-pw_16380: times 2 dw 16380
-pw_5_6: dw 5, 6
-pd_6: dd 6
-pd_1024: dd 1024
+
+pb_3:          times 4 db 3
+pb_m5:         times 4 db -5
+pw_16:         times 2 dw 16
+pw_256:        times 2 dw 256
+pw_2056:       times 2 dw 2056
+pw_m16380:     times 2 dw -16380
+pw_5_6:        dw 5, 6
+pd_1024:       dd 1024
 pd_0xf0080029: dd 0xf0080029
 pd_0xf00801c7: dd 0xf00801c7
 
@@ -50,277 +55,662 @@
 
 SECTION .text
 
-INIT_YMM avx2
-cglobal wiener_filter_h, 5, 12, 16, dst, left, src, stride, flt, w, h, edge
-    mov        edged, edgem
-    vpbroadcastb m15, [fltq+0]
-    movifnidn     wd, wm
-    vpbroadcastb m14, [fltq+2]
-    mov           hd, hm
-    vpbroadcastb m13, [fltq+4]
-    vpbroadcastw m12, [fltq+6]
-    vpbroadcastd m11, [pw_2048]
-    vpbroadcastd m10, [pw_16380]
-    lea          r11, [pb_right_ext_mask]
+%macro REPX 2-*
+    %xdefine %%f(x) %1
+%rep %0 - 1
+    %rotate 1
+    %%f(%1)
+%endrep
+%endmacro
 
-    DEFINE_ARGS dst, left, src, stride, x, w, h, edge, srcptr, dstptr, xlim
+DECLARE_REG_TMP 4, 9, 7, 11, 12, 13, 14 ; wiener ring buffer pointers
 
-    ; if (edge & has_right) align_w_to_32
-    ; else w -= 32, and use that as limit in x loop
-    test       edgeb, 2 ; has_right
-    jnz .align
-    mov        xlimq, -3
-    jmp .loop
-.align:
-    add           wd, 31
-    and           wd, ~31
-    xor        xlimd, xlimd
-
-    ; main y loop for vertical filter
-.loop:
-    mov      srcptrq, srcq
-    mov      dstptrq, dstq
-    lea           xq, [wq+xlimq]
-
-    ; load left edge pixels
-    test       edgeb, 1 ; have_left
-    jz .emu_left
-    test       leftq, leftq ; left == NULL for the edge-extended bottom/top
-    jz .load_left_combined
-    movd         xm0, [leftq]
-    add        leftq, 4
-    pinsrd       xm0, [srcq], 1
-    pslldq       xm0, 9
-    jmp .left_load_done
-.load_left_combined:
-    movq         xm0, [srcq-3]
-    pslldq       xm0, 10
-    jmp .left_load_done
-.emu_left:
-    movd         xm0, [srcq]
-    pshufb       xm0, [pb_14x0_1_2]
-
-    ; load right edge pixels
-.left_load_done:
-    cmp           xd, 32
-    jg .main_load
-    test          xd, xd
-    jg .load_and_splat
-    je .splat_right
-
-    ; for very small images (w=[1-2]), edge-extend the original cache,
-    ; ugly, but only runs in very odd cases
-    add           wd, wd
-    pshufb       xm0, [r11-pb_right_ext_mask+pb_0_to_15_min_n+wq*8-16]
-    shr           wd, 1
-
-    ; main x loop, mostly this starts in .main_load
-.splat_right:
-    ; no need to load new pixels, just extend them from the (possibly previously
-    ; extended) previous load into m0
-    pshufb       xm1, xm0, [pb_15]
-    jmp .main_loop
-.load_and_splat:
-    ; load new pixels and extend edge for right-most
-    movu          m1, [srcptrq+3]
-    sub          r11, xq
-    movu          m2, [r11-pb_right_ext_mask+pb_right_ext_mask+32]
-    add          r11, xq
-    vpbroadcastb  m3, [srcptrq+2+xq]
-    pand          m1, m2
-    pandn         m3, m2, m3
-    por           m1, m3
-    jmp .main_loop
-.main_load:
-    ; load subsequent line
-    movu          m1, [srcptrq+3]
+INIT_YMM avx2
+cglobal wiener_filter7, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
+                                               lpf_stride, w, edge, flt, h
+    mov           fltq, fltmp
+    mov          edged, r8m
+    mov             wd, wm
+    mov             hd, r6m
+    vbroadcasti128  m6, [wiener_shufA]
+    vpbroadcastb   m11, [fltq+ 0] ; x0 x0
+    vbroadcasti128  m7, [wiener_shufB]
+    vpbroadcastd   m12, [fltq+ 2]
+    vbroadcasti128  m8, [wiener_shufC]
+    packsswb       m12, m12       ; x1 x2
+    vpbroadcastw   m13, [fltq+ 6] ; x3
+    vbroadcasti128  m9, [wiener_shufD]
+    add           lpfq, wq
+    vpbroadcastd   m10, [pw_m16380]
+    lea             t1, [rsp+wq*2+16]
+    vpbroadcastd   m14, [fltq+16] ; y0 y1
+    add           dstq, wq
+    vpbroadcastd   m15, [fltq+20] ; y2 y3
+    neg             wq
+    test         edgeb, 4 ; LR_HAVE_TOP
+    jz .no_top
+    call .h_top
+    add           lpfq, lpf_strideq
+    mov             t6, t1
+    mov             t5, t1
+    add             t1, 384*2
+    call .h_top
+    lea             r7, [lpfq+lpf_strideq*4]
+    mov           lpfq, dstq
+    mov             t4, t1
+    add             t1, 384*2
+    mov      [rsp+8*1], lpf_strideq
+    add             r7, lpf_strideq
+    mov      [rsp+8*0], r7 ; below
+    call .h
+    mov             t3, t1
+    mov             t2, t1
+    dec             hd
+    jz .v1
+    add           lpfq, dst_strideq
+    add             t1, 384*2
+    call .h
+    mov             t2, t1
+    dec             hd
+    jz .v2
+    add           lpfq, dst_strideq
+    add             t1, 384*2
+    call .h
+    dec             hd
+    jz .v3
+.main:
+    lea             t0, [t1+384*2]
 .main_loop:
-    vinserti128   m0, xm1, 1
-
-    palignr       m2, m1, m0, 10
-    palignr       m3, m1, m0, 11
-    palignr       m4, m1, m0, 12
-    palignr       m5, m1, m0, 13
-    palignr       m6, m1, m0, 14
-    palignr       m7, m1, m0, 15
-
-    punpcklbw     m0, m2, m1
-    punpckhbw     m2, m1
-    punpcklbw     m8, m3, m7
-    punpckhbw     m3, m7
-    punpcklbw     m7, m4, m6
-    punpckhbw     m4, m6
-    pxor          m9, m9
-    punpcklbw     m6, m5, m9
-    punpckhbw     m5, m9
-
-    pmaddubsw     m0, m15
-    pmaddubsw     m2, m15
-    pmaddubsw     m8, m14
-    pmaddubsw     m3, m14
-    pmaddubsw     m7, m13
-    pmaddubsw     m4, m13
-    paddw         m0, m8
-    paddw         m2, m3
-    psllw         m8, m6, 7
-    psllw         m3, m5, 7
-    psubw         m8, m10
-    psubw         m3, m10
-    pmullw        m6, m12
-    pmullw        m5, m12
-    paddw         m0, m7
-    paddw         m2, m4
-    paddw         m0, m6
-    paddw         m2, m5
-    ; for a signed overflow to happen we need filter and pixels as follow:
-    ; filter => -5,-23,-17,90,-17,-23,-5
-    ; pixels => 255,255,255,0,255,255,255 or 0,0,0,255,0,0,0
-    ; m0 would fall in the range [-59A6;+59A6] = [A65A;59A6]
-    ; m8 would fall in the range [-3FFC;+3F84] = [C004;3F84]
-    ;  32-bit arithmetic m0+m8 = [-99A2;+992A] = [FFFF665E;992A]
-    ; => signed 16-bit overflow occurs
-    paddsw        m0, m8  ; paddsw clips this range to [-8000;+7FFF]
-    paddsw        m2, m3
-    psraw         m0, 3   ; shift changes the range to [-1000;+FFF]
-    psraw         m2, 3
-    paddw         m0, m11 ; adding back 800 (removed in m8) changes the
-    paddw         m2, m11 ; range to [-800;+17FF] as defined in the spec
-    mova   [dstptrq], xm0 ; (note that adding another 800 would give us
-    mova [dstptrq+16], xm2;  the same range as in the C code => [0;1FFF])
-    vextracti128 [dstptrq+32], m0, 1
-    vextracti128 [dstptrq+48], m2, 1
-    vextracti128 xm0, m1, 1
-    add      srcptrq, 32
-    add      dstptrq, 64
-    sub           xq, 32
-    cmp           xd, 32
-    jg .main_load
-    test          xd, xd
-    jg .load_and_splat
-    cmp           xd, xlimd
-    jg .splat_right
+    call .hv
+    dec             hd
+    jnz .main_loop
+    test         edgeb, 8 ; LR_HAVE_BOTTOM
+    jz .v3
+    mov           lpfq, [rsp+8*0]
+    call .hv_bottom
+    add           lpfq, [rsp+8*1]
+    call .hv_bottom
+.v1:
+    call .v
+    RET
+.no_top:
+    lea             r7, [lpfq+lpf_strideq*4]
+    mov           lpfq, dstq
+    mov      [rsp+8*1], lpf_strideq
+    lea             r7, [r7+lpf_strideq*2]
+    mov      [rsp+8*0], r7
+    call .h
+    mov             t6, t1
+    mov             t5, t1
+    mov             t4, t1
+    mov             t3, t1
+    mov             t2, t1
+    dec             hd
+    jz .v1
+    add           lpfq, dst_strideq
+    add             t1, 384*2
+    call .h
+    mov             t2, t1
+    dec             hd
+    jz .v2
+    add           lpfq, dst_strideq
+    add             t1, 384*2
+    call .h
+    dec             hd
+    jz .v3
+    lea             t0, [t1+384*2]
+    call .hv
+    dec             hd
+    jz .v3
+    add             t0, 384*8
+    call .hv
+    dec             hd
+    jnz .main
+.v3:
+    call .v
+.v2:
+    call .v
+    jmp .v1
+.extend_right:
+    movd           xm2, r10d
+    vpbroadcastd    m0, [pb_3]
+    vpbroadcastd    m1, [pb_m5]
+    vpbroadcastb    m2, xm2
+    movu            m3, [pb_0to31]
+    psubb           m0, m2
+    psubb           m1, m2
+    pminub          m0, m3
+    pminub          m1, m3
+    pshufb          m4, m0
+    pshufb          m5, m1
+    ret
+.h:
+    mov            r10, wq
+    test         edgeb, 1 ; LR_HAVE_LEFT
+    jz .h_extend_left
+    movd           xm4, [leftq]
+    vpblendd        m4, [lpfq+r10-4], 0xfe
+    add          leftq, 4
+    jmp .h_main
+.h_extend_left:
+    vbroadcasti128  m5, [lpfq+r10] ; avoid accessing memory located
+    mova            m4, [lpfq+r10] ; before the start of the buffer
+    palignr         m4, m5, 12
+    pshufb          m4, [wiener_l_shuf]
+    jmp .h_main
+.h_top:
+    mov            r10, wq
+    movu            m4, [lpfq+r10-4]
+    test         edgeb, 1 ; LR_HAVE_LEFT
+    jnz .h_main
+    pshufb          m4, [wiener_l_shuf]
+    jmp .h_main
+.h_loop:
+    movu            m4, [lpfq+r10-4]
+.h_main:
+    movu            m5, [lpfq+r10+4]
+    test         edgeb, 2 ; LR_HAVE_RIGHT
+    jnz .h_have_right
+    cmp           r10d, -34
+    jl .h_have_right
+    call .extend_right
+.h_have_right:
+    pshufb          m0, m4, m6
+    pmaddubsw       m0, m11
+    pshufb          m1, m5, m6
+    pmaddubsw       m1, m11
+    pshufb          m2, m4, m7
+    pmaddubsw       m2, m12
+    pshufb          m3, m5, m7
+    pmaddubsw       m3, m12
+    paddw           m0, m2
+    pshufb          m2, m4, m8
+    pmaddubsw       m2, m12
+    paddw           m1, m3
+    pshufb          m3, m5, m8
+    pmaddubsw       m3, m12
+    pshufb          m4, m9
+    paddw           m0, m2
+    pmullw          m2, m4, m13
+    pshufb          m5, m9
+    paddw           m1, m3
+    pmullw          m3, m5, m13
+    psllw           m4, 7
+    psllw           m5, 7
+    paddw           m4, m10
+    paddw           m5, m10
+    paddw           m0, m2
+    vpbroadcastd    m2, [pw_2056]
+    paddw           m1, m3
+    paddsw          m0, m4
+    paddsw          m1, m5
+    psraw           m0, 3
+    psraw           m1, 3
+    paddw           m0, m2
+    paddw           m1, m2
+    mova [t1+r10*2+ 0], m0
+    mova [t1+r10*2+32], m1
+    add            r10, 32
+    jl .h_loop
+    ret
+ALIGN function_align
+.hv:
+    add           lpfq, dst_strideq
+    mov            r10, wq
+    test         edgeb, 1 ; LR_HAVE_LEFT
+    jz .hv_extend_left
+    movd           xm4, [leftq]
+    vpblendd        m4, [lpfq+r10-4], 0xfe
+    add          leftq, 4
+    jmp .hv_main
+.hv_extend_left:
+    movu            m4, [lpfq+r10-4]
+    pshufb          m4, [wiener_l_shuf]
+    jmp .hv_main
+.hv_bottom:
+    mov            r10, wq
+    test         edgeb, 1 ; LR_HAVE_LEFT
+    jz .hv_extend_left
+.hv_loop:
+    movu            m4, [lpfq+r10-4]
+.hv_main:
+    movu            m5, [lpfq+r10+4]
+    test         edgeb, 2 ; LR_HAVE_RIGHT
+    jnz .hv_have_right
+    cmp           r10d, -34
+    jl .hv_have_right
+    call .extend_right
+.hv_have_right:
+    pshufb          m0, m4, m6
+    pmaddubsw       m0, m11
+    pshufb          m1, m5, m6
+    pmaddubsw       m1, m11
+    pshufb          m2, m4, m7
+    pmaddubsw       m2, m12
+    pshufb          m3, m5, m7
+    pmaddubsw       m3, m12
+    paddw           m0, m2
+    pshufb          m2, m4, m8
+    pmaddubsw       m2, m12
+    paddw           m1, m3
+    pshufb          m3, m5, m8
+    pmaddubsw       m3, m12
+    pshufb          m4, m9
+    paddw           m0, m2
+    pmullw          m2, m4, m13
+    pshufb          m5, m9
+    paddw           m1, m3
+    pmullw          m3, m5, m13
+    psllw           m4, 7
+    psllw           m5, 7
+    paddw           m4, m10
+    paddw           m5, m10
+    paddw           m0, m2
+    paddw           m1, m3
+    mova            m2, [t4+r10*2]
+    paddw           m2, [t2+r10*2]
+    mova            m3, [t3+r10*2]
+    paddsw          m0, m4
+    vpbroadcastd    m4, [pw_2056]
+    paddsw          m1, m5
+    mova            m5, [t5+r10*2]
+    paddw           m5, [t1+r10*2]
+    psraw           m0, 3
+    psraw           m1, 3
+    paddw           m0, m4
+    paddw           m1, m4
+    paddw           m4, m0, [t6+r10*2]
+    mova    [t0+r10*2], m0
+    punpcklwd       m0, m2, m3
+    pmaddwd         m0, m15
+    punpckhwd       m2, m3
+    pmaddwd         m2, m15
+    punpcklwd       m3, m4, m5
+    pmaddwd         m3, m14
+    punpckhwd       m4, m5
+    pmaddwd         m4, m14
+    paddd           m0, m3
+    paddd           m4, m2
+    mova            m2, [t4+r10*2+32]
+    paddw           m2, [t2+r10*2+32]
+    mova            m3, [t3+r10*2+32]
+    mova            m5, [t5+r10*2+32]
+    paddw           m5, [t1+r10*2+32]
+    psrad           m0, 11
+    psrad           m4, 11
+    packssdw        m0, m4
+    paddw           m4, m1, [t6+r10*2+32]
+    mova [t0+r10*2+32], m1
+    punpcklwd       m1, m2, m3
+    pmaddwd         m1, m15
+    punpckhwd       m2, m3
+    pmaddwd         m2, m15
+    punpcklwd       m3, m4, m5
+    pmaddwd         m3, m14
+    punpckhwd       m4, m5
+    pmaddwd         m4, m14
+    paddd           m1, m3
+    paddd           m2, m4
+    psrad           m1, 11
+    psrad           m2, 11
+    packssdw        m1, m2
+    packuswb        m0, m1
+    mova    [dstq+r10], m0
+    add            r10, 32
+    jl .hv_loop
+    mov             t6, t5
+    mov             t5, t4
+    mov             t4, t3
+    mov             t3, t2
+    mov             t2, t1
+    mov             t1, t0
+    mov             t0, t6
+    add           dstq, dst_strideq
+    ret
+.v:
+    mov            r10, wq
+.v_loop:
+    mova            m2, [t4+r10*2+ 0]
+    paddw           m2, [t2+r10*2+ 0]
+    mova            m4, [t3+r10*2+ 0]
+    mova            m6, [t1+r10*2+ 0]
+    paddw           m8, m6, [t6+r10*2+ 0]
+    paddw           m6, [t5+r10*2+ 0]
+    mova            m3, [t4+r10*2+32]
+    paddw           m3, [t2+r10*2+32]
+    mova            m5, [t3+r10*2+32]
+    mova            m7, [t1+r10*2+32]
+    paddw           m9, m7, [t6+r10*2+32]
+    paddw           m7, [t5+r10*2+32]
+    punpcklwd       m0, m2, m4
+    pmaddwd         m0, m15
+    punpckhwd       m2, m4
+    pmaddwd         m2, m15
+    punpcklwd       m4, m8, m6
+    pmaddwd         m4, m14
+    punpckhwd       m6, m8, m6
+    pmaddwd         m6, m14
+    punpcklwd       m1, m3, m5
+    pmaddwd         m1, m15
+    punpckhwd       m3, m5
+    pmaddwd         m3, m15
+    punpcklwd       m5, m9, m7
+    pmaddwd         m5, m14
+    punpckhwd       m7, m9, m7
+    pmaddwd         m7, m14
+    paddd           m0, m4
+    paddd           m2, m6
+    paddd           m1, m5
+    paddd           m3, m7
+    REPX {psrad x, 11}, m0, m2, m1, m3
+    packssdw        m0, m2
+    packssdw        m1, m3
+    packuswb        m0, m1
+    mova    [dstq+r10], m0
+    add            r10, 32
+    jl .v_loop
+    mov             t6, t5
+    mov             t5, t4
+    mov             t4, t3
+    mov             t3, t2
+    mov             t2, t1
+    add           dstq, dst_strideq
+    ret
 
-    add         srcq, strideq
-    add         dstq, 384*2
-    dec           hd
-    jg .loop
+cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
+                                             lpf_stride, w, edge, flt, h
+    mov           fltq, fltmp
+    mov          edged, r8m
+    mov             wd, wm
+    mov             hd, r6m
+    vbroadcasti128  m6, [wiener_shufB]
+    vpbroadcastd   m12, [fltq+ 2]
+    vbroadcasti128  m7, [wiener_shufC]
+    packsswb       m12, m12       ; x1 x2
+    vpbroadcastw   m13, [fltq+ 6] ; x3
+    vbroadcasti128  m8, [wiener_shufD]
+    add           lpfq, wq
+    vpbroadcastd    m9, [pw_m16380]
+    vpbroadcastd   m10, [pw_2056]
+    lea             t1, [rsp+wq*2+16]
+    mova           m11, [wiener_l_shuf]
+    vpbroadcastd   m14, [fltq+16] ; __ y1
+    add           dstq, wq
+    vpbroadcastd   m15, [fltq+20] ; y2 y3
+    neg             wq
+    test         edgeb, 4 ; LR_HAVE_TOP
+    jz .no_top
+    call .h_top
+    add           lpfq, lpf_strideq
+    mov             t4, t1
+    add             t1, 384*2
+    call .h_top
+    lea             r7, [lpfq+lpf_strideq*4]
+    mov           lpfq, dstq
+    mov             t3, t1
+    add             t1, 384*2
+    mov      [rsp+8*1], lpf_strideq
+    add             r7, lpf_strideq
+    mov      [rsp+8*0], r7 ; below
+    call .h
+    mov             t2, t1
+    dec             hd
+    jz .v1
+    add           lpfq, dst_strideq
+    add             t1, 384*2
+    call .h
+    dec             hd
+    jz .v2
+.main:
+    mov             t0, t4
+.main_loop:
+    call .hv
+    dec             hd
+    jnz .main_loop
+    test         edgeb, 8 ; LR_HAVE_BOTTOM
+    jz .v2
+    mov           lpfq, [rsp+8*0]
+    call .hv_bottom
+    add           lpfq, [rsp+8*1]
+    call .hv_bottom
+.end:
     RET
+.no_top:
+    lea             r7, [lpfq+lpf_strideq*4]
+    mov           lpfq, dstq
+    mov      [rsp+8*1], lpf_strideq
+    lea             r7, [r7+lpf_strideq*2]
+    mov      [rsp+8*0], r7
+    call .h
+    mov             t4, t1
+    mov             t3, t1
+    mov             t2, t1
+    dec             hd
+    jz .v1
+    add           lpfq, dst_strideq
+    add             t1, 384*2
+    call .h
+    dec             hd
+    jz .v2
+    lea             t0, [t1+384*2]
+    call .hv
+    dec             hd
+    jz .v2
+    add             t0, 384*6
+    call .hv
+    dec             hd
+    jnz .main
+.v2:
+    call .v
+    mov             t4, t3
+    mov             t3, t2
+    mov             t2, t1
+    add           dstq, dst_strideq
+.v1:
+    call .v
+    jmp .end
+.h:
+    mov            r10, wq
+    test         edgeb, 1 ; LR_HAVE_LEFT
+    jz .h_extend_left
+    movd           xm4, [leftq]
+    vpblendd        m4, [lpfq+r10-4], 0xfe
+    add          leftq, 4
+    jmp .h_main
+.h_extend_left:
+    vbroadcasti128  m5, [lpfq+r10] ; avoid accessing memory located
+    mova            m4, [lpfq+r10] ; before the start of the buffer
+    palignr         m4, m5, 12
+    pshufb          m4, m11
+    jmp .h_main
+.h_top:
+    mov            r10, wq
+    movu            m4, [lpfq+r10-4]
+    test         edgeb, 1 ; LR_HAVE_LEFT
+    jnz .h_main
+    pshufb          m4, m11
+    jmp .h_main
+.h_loop:
+    movu            m4, [lpfq+r10-4]
+.h_main:
+    movu            m5, [lpfq+r10+4]
+    test         edgeb, 2 ; LR_HAVE_RIGHT
+    jnz .h_have_right
+    cmp           r10d, -33
+    jl .h_have_right
+    call mangle(private_prefix %+ _wiener_filter7_avx2).extend_right
+.h_have_right:
+    pshufb          m0, m4, m6
+    pmaddubsw       m0, m12
+    pshufb          m1, m5, m6
+    pmaddubsw       m1, m12
+    pshufb          m2, m4, m7
+    pmaddubsw       m2, m12
+    pshufb          m3, m5, m7
+    pmaddubsw       m3, m12
+    pshufb          m4, m8
+    paddw           m0, m2
+    pmullw          m2, m4, m13
+    pshufb          m5, m8
+    paddw           m1, m3
+    pmullw          m3, m5, m13
+    psllw           m4, 7
+    psllw           m5, 7
+    paddw           m4, m9
+    paddw           m5, m9
+    paddw           m0, m2
+    paddw           m1, m3
+    paddsw          m0, m4
+    paddsw          m1, m5
+    psraw           m0, 3
+    psraw           m1, 3
+    paddw           m0, m10
+    paddw           m1, m10
+    mova [t1+r10*2+ 0], m0
+    mova [t1+r10*2+32], m1
+    add            r10, 32
+    jl .h_loop
+    ret
+ALIGN function_align
+.hv:
+    add           lpfq, dst_strideq
+    mov            r10, wq
+    test         edgeb, 1 ; LR_HAVE_LEFT
+    jz .hv_extend_left
+    movd           xm4, [leftq]
+    vpblendd        m4, [lpfq+r10-4], 0xfe
+    add          leftq, 4
+    jmp .hv_main
+.hv_extend_left:
+    movu            m4, [lpfq+r10-4]
+    pshufb          m4, m11
+    jmp .hv_main
+.hv_bottom:
+    mov            r10, wq
+    test         edgeb, 1 ; LR_HAVE_LEFT
+    jz .hv_extend_left
+.hv_loop:
+    movu            m4, [lpfq+r10-4]
+.hv_main:
+    movu            m5, [lpfq+r10+4]
+    test         edgeb, 2 ; LR_HAVE_RIGHT
+    jnz .hv_have_right
+    cmp           r10d, -33
+    jl .hv_have_right
+    call mangle(private_prefix %+ _wiener_filter7_avx2).extend_right
+.hv_have_right:
+    pshufb          m0, m4, m6
+    pmaddubsw       m0, m12
+    pshufb          m1, m5, m6
+    pmaddubsw       m1, m12
+    pshufb          m2, m4, m7
+    pmaddubsw       m2, m12
+    pshufb          m3, m5, m7
+    pmaddubsw       m3, m12
+    pshufb          m4, m8
+    paddw           m0, m2
+    pmullw          m2, m4, m13
+    pshufb          m5, m8
+    paddw           m1, m3
+    pmullw          m3, m5, m13
+    psllw           m4, 7
+    psllw           m5, 7
+    paddw           m4, m9
+    paddw           m5, m9
+    paddw           m0, m2
+    paddw           m1, m3
+    mova            m2, [t3+r10*2]
+    paddw           m2, [t1+r10*2]
+    mova            m3, [t2+r10*2]
+    paddsw          m0, m4
+    paddsw          m1, m5
+    psraw           m0, 3
+    psraw           m1, 3
+    paddw           m0, m10
+    paddw           m1, m10
+    paddw           m4, m0, [t4+r10*2]
+    mova    [t0+r10*2], m0
+    punpcklwd       m0, m2, m3
+    pmaddwd         m0, m15
+    punpckhwd       m2, m3
+    pmaddwd         m2, m15
+    punpcklwd       m3, m4, m4
+    pmaddwd         m3, m14
+    punpckhwd       m4, m4
+    pmaddwd         m4, m14
+    paddd           m0, m3
+    paddd           m4, m2
+    mova            m2, [t3+r10*2+32]
+    paddw           m2, [t1+r10*2+32]
+    mova            m3, [t2+r10*2+32]
+    psrad           m0, 11
+    psrad           m4, 11
+    packssdw        m0, m4
+    paddw           m4, m1, [t4+r10*2+32]
+    mova [t0+r10*2+32], m1
+    punpcklwd       m1, m2, m3
+    pmaddwd         m1, m15
+    punpckhwd       m2, m3
+    pmaddwd         m2, m15
+    punpcklwd       m3, m4, m4
+    pmaddwd         m3, m14
+    punpckhwd       m4, m4
+    pmaddwd         m4, m14
+    paddd           m1, m3
+    paddd           m2, m4
+    psrad           m1, 11
+    psrad           m2, 11
+    packssdw        m1, m2
+    packuswb        m0, m1
+    mova    [dstq+r10], m0
+    add            r10, 32
+    jl .hv_loop
+    mov             t4, t3
+    mov             t3, t2
+    mov             t2, t1
+    mov             t1, t0
+    mov             t0, t4
+    add           dstq, dst_strideq
+    ret
+.v:
+    mov            r10, wq
+    psrld          m13, m14, 16 ; y1 __
+.v_loop:
+    mova            m6, [t1+r10*2+ 0]
+    paddw           m2, m6, [t3+r10*2+ 0]
+    mova            m4, [t2+r10*2+ 0]
+    mova            m7, [t1+r10*2+32]
+    paddw           m3, m7, [t3+r10*2+32]
+    mova            m5, [t2+r10*2+32]
+    paddw           m6, [t4+r10*2+ 0]
+    paddw           m7, [t4+r10*2+32]
+    punpcklwd       m0, m2, m4
+    pmaddwd         m0, m15
+    punpckhwd       m2, m4
+    pmaddwd         m2, m15
+    punpcklwd       m1, m3, m5
+    pmaddwd         m1, m15
+    punpckhwd       m3, m5
+    pmaddwd         m3, m15
+    punpcklwd       m5, m7, m6
+    pmaddwd         m4, m5, m14
+    punpckhwd       m7, m6
+    pmaddwd         m6, m7, m14
+    pmaddwd         m5, m13
+    pmaddwd         m7, m13
+    paddd           m0, m4
+    paddd           m2, m6
+    paddd           m1, m5
+    paddd           m3, m7
+    REPX {psrad x, 11}, m0, m2, m1, m3
+    packssdw        m0, m2
+    packssdw        m1, m3
+    packuswb        m0, m1
+    mova    [dstq+r10], m0
+    add            r10, 32
+    jl .v_loop
+    ret
 
-cglobal wiener_filter_v, 4, 10, 13, dst, stride, mid, w, h, flt, edge
-    movifnidn   fltq, fltmp
-    mov        edged, edgem
-    movifnidn     hd, hm
-    vpbroadcastd m10, [fltq+16]
-    vpbroadcastd m11, [fltq+20]
-    vpbroadcastd m12, [pd_1024]
-
-    DEFINE_ARGS dst, stride, mid, w, h, ylim, edge, y, mptr, dstptr
-    rorx       ylimd, edged, 2
-    and        ylimd, 2 ; have_bottom
-    sub        ylimd, 3
-
-    ; main x loop for vertical filter, does one column of 16 pixels
-.loop_x:
-    mova          m3, [midq] ; middle line
-
-    ; load top pixels
-    test       edgeb, 4 ; have_top
-    jz .emu_top
-    mova          m0, [midq-384*4]
-    mova          m2, [midq-384*2]
-    mova          m1, m0
-    jmp .load_bottom_pixels
-.emu_top:
-    mova          m0, m3
-    mova          m1, m3
-    mova          m2, m3
-
-    ; load bottom pixels
-.load_bottom_pixels:
-    mov           yd, hd
-    mov        mptrq, midq
-    mov      dstptrq, dstq
-    add           yd, ylimd
-    jg .load_threelines
-
-    ; the remainder here is somewhat messy but only runs in very weird
-    ; circumstances at the bottom of the image in very small blocks (h=[1-3]),
-    ; so performance is not terribly important here...
-    je .load_twolines
-    cmp           yd, -1
-    je .load_oneline
-    ; h == 1 case
-    mova          m5, m3
-    mova          m4, m3
-    mova          m6, m3
-    jmp .loop
-.load_oneline:
-    ; h == 2 case
-    mova          m4, [midq+384*2]
-    mova          m5, m4
-    mova          m6, m4
-    jmp .loop
-.load_twolines:
-    ; h == 3 case
-    mova          m4, [midq+384*2]
-    mova          m5, [midq+384*4]
-    mova          m6, m5
-    jmp .loop
-.load_threelines:
-    ; h > 3 case
-    mova          m4, [midq+384*2]
-    mova          m5, [midq+384*4]
-    ; third line loaded in main loop below
-
-    ; main y loop for vertical filter
-.loop_load:
-    ; load one line into m6. if that pixel is no longer available, do
-    ; nothing, since m6 still has the data from the previous line in it. We
-    ; try to structure the loop so that the common case is evaluated fastest
-    mova          m6, [mptrq+384*6]
-.loop:
-    paddw         m0, m6
-    paddw         m7, m1, m5
-    paddw         m8, m2, m4
-    punpcklwd     m9, m0, m7
-    punpckhwd     m0, m7
-    punpcklwd     m7, m8, m3
-    punpckhwd     m8, m3
-    pmaddwd       m9, m10
-    pmaddwd       m0, m10
-    pmaddwd       m7, m11
-    pmaddwd       m8, m11
-    add        mptrq, 384*2
-    paddd         m7, m9
-    paddd         m0, m8
-    paddd         m7, m12
-    paddd         m0, m12
-    psrad         m7, 11
-    psrad         m0, 11
-    packssdw      m7, m0
-    vextracti128 xm0, m7, 1
-    packuswb     xm7, xm0
-    mova   [dstptrq], xm7
-    ; shift pixels one position
-    mova          m0, m1
-    mova          m1, m2
-    mova          m2, m3
-    mova          m3, m4
-    mova          m4, m5
-    mova          m5, m6
-    add      dstptrq, strideq
-    dec           yd
-    jg .loop_load
-    ; for the bottom pixels, continue using m6 (as extended edge)
-    cmp           yd, ylimd
-    jg .loop
-    add         midq, 32
-    add         dstq, 16
-    sub           wd, 16
-    jg .loop_x
-    RET
-
-INIT_YMM avx2
 cglobal sgr_box3_h, 5, 11, 7, sumsq, sum, left, src, stride, w, h, edge, x, xlim
     mov        xlimd, edgem
     movifnidn     wd, wm
--- a/src/x86/looprestoration_init_tmpl.c
+++ b/src/x86/looprestoration_init_tmpl.c
@@ -31,54 +31,19 @@
 #include "common/intops.h"
 #include "src/tables.h"
 
-// Future potential optimizations:
-// - special chroma versions which don't filter [0]/[6];
-// - running filter_h_avx2 transposed (one col of 32 pixels per iteration, top
-//   to bottom) instead of scanline-ordered should be faster since then the
-//   if (have_left) and similar conditions run only once instead of per line;
-// - filter_v_avx2 currently runs 16 pixels per iteration, it should be possible
-//   to run 32 (like filter_h_avx2), and then all vpermqs can go;
-// - maybe split out the top/bottom filter_h_avx2 from the main body filter_h_avx2,
-//   since then the have_left condition can be inlined;
-// - consider having the wrapper (wiener_filter_${ext}) also in hand-written
-//   assembly, so the setup overhead is minimized.
-
 #define WIENER_FILTER(ext) \
-\
-void dav1d_wiener_filter_h_##ext(int16_t *dst, const pixel (*left)[4], \
-                                 const pixel *src, ptrdiff_t stride, \
-                                 const int16_t filter[2][8], const intptr_t w, \
-                                 int h, enum LrEdgeFlags edges); \
-void dav1d_wiener_filter_v_##ext(pixel *dst, ptrdiff_t stride, \
-                                 const int16_t *mid, int w, int h, \
-                                 const int16_t filter[2][8], \
-                                 enum LrEdgeFlags edges); \
-\
-static void wiener_filter_##ext(pixel *const dst, const ptrdiff_t dst_stride, \
-                                const pixel (*const left)[4], \
-                                const pixel *lpf, const ptrdiff_t lpf_stride, \
-                                const int w, const int h, \
+void dav1d_wiener_filter7_##ext(pixel *const dst, ptrdiff_t dst_stride, \
+                                const pixel (*left)[4], const pixel *lpf, \
+                                ptrdiff_t lpf_stride, int w, int h, \
                                 const int16_t filter[2][8], \
-                                const enum LrEdgeFlags edges) \
-{ \
-    ALIGN_STK_32(int16_t, mid, 68 * 384,); \
-\
-    /* horizontal filter */ \
-    dav1d_wiener_filter_h_##ext(&mid[2 * 384], left, dst, dst_stride, \
-                                filter, w, h, edges); \
-    if (edges & LR_HAVE_TOP) \
-        dav1d_wiener_filter_h_##ext(mid, NULL, lpf, lpf_stride, \
-                                    filter, w, 2, edges); \
-    if (edges & LR_HAVE_BOTTOM) \
-        dav1d_wiener_filter_h_##ext(&mid[(2 + h) * 384], NULL, \
-                                    lpf + 6 * PXSTRIDE(lpf_stride), lpf_stride, \
-                                    filter, w, 2, edges); \
-\
-    dav1d_wiener_filter_v_##ext(dst, dst_stride, &mid[2*384], w, h, filter, edges); \
-}
+                                enum LrEdgeFlags edges); \
+void dav1d_wiener_filter5_##ext(pixel *const dst, ptrdiff_t dst_stride, \
+                                const pixel (*left)[4], const pixel *lpf, \
+                                ptrdiff_t lpf_stride, int w, int h, \
+                                const int16_t filter[2][8], \
+                                enum LrEdgeFlags edges);
 
 #define SGR_FILTER(ext) \
-\
 void dav1d_sgr_box3_h_##ext(int32_t *sumsq, int16_t *sum, \
                             const pixel (*left)[4], \
                             const pixel *src, const ptrdiff_t stride, \
@@ -201,15 +166,13 @@
     } \
 }
 
-#define DEF_LR_FILTERS(ext) \
-WIENER_FILTER(ext) \
-SGR_FILTER(ext)
-
 #if BITDEPTH == 8
 WIENER_FILTER(sse2)
-DEF_LR_FILTERS(ssse3)
+WIENER_FILTER(ssse3)
+SGR_FILTER(ssse3)
 # if ARCH_X86_64
-DEF_LR_FILTERS(avx2)
+WIENER_FILTER(avx2)
+SGR_FILTER(avx2)
 # endif
 #endif
 
@@ -218,18 +181,21 @@
 
     if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
 #if BITDEPTH == 8
-    c->wiener = wiener_filter_sse2;
+    c->wiener[0] = dav1d_wiener_filter7_sse2;
+    c->wiener[1] = dav1d_wiener_filter5_sse2;
 #endif
 
     if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
 #if BITDEPTH == 8
-    c->wiener = wiener_filter_ssse3;
+    c->wiener[0] = dav1d_wiener_filter7_ssse3;
+    c->wiener[1] = dav1d_wiener_filter5_ssse3;
     c->selfguided = sgr_filter_ssse3;
 #endif
 
     if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
 #if BITDEPTH == 8 && ARCH_X86_64
-    c->wiener = wiener_filter_avx2;
+    c->wiener[0] = dav1d_wiener_filter7_avx2;
+    c->wiener[1] = dav1d_wiener_filter5_avx2;
     c->selfguided = sgr_filter_avx2;
 #endif
 }
--- a/src/x86/looprestoration_sse.asm
+++ b/src/x86/looprestoration_sse.asm
@@ -29,34 +29,33 @@
 
 SECTION_RODATA 16
 
-pb_right_ext_mask: times 16 db 0xff
-                   times 16 db 0
-pb_14x0_1_2: times 14 db 0
-             db 1, 2
-pb_0_to_15_min_n: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13
-                  db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14
-pb_unpcklwdw: db 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13
-pb_0: times 16 db 0
-pb_2: times 16 db 2
-pb_3: times 16 db 3
-pb_4: times 16 db 4
-pb_15: times 16 db 15
-pb_0_1: times 8 db 0, 1
-pb_6_7: times 8 db 6, 7
-pb_14_15: times 8 db 14, 15
-pw_1: times 8 dw 1
-pw_16: times 8 dw 16
-pw_128: times 8 dw 128
-pw_255: times 8 dw 255
-pw_256: times 8 dw 256
-pw_2048: times 8 dw 2048
-pw_16380: times 8 dw 16380
-pw_5_6: times 4 dw 5, 6
-pd_1024: times 4 dd 1024
+wiener_init:   db  6,  7,  6,  7,  6,  7,  6,  7,  0,  0,  0,  0,  2,  4,  2,  4
+wiener_shufA:  db  1,  7,  2,  8,  3,  9,  4, 10,  5, 11,  6, 12,  7, 13,  8, 14
+wiener_shufB:  db  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10
+wiener_shufC:  db  6,  5,  7,  6,  8,  7,  9,  8, 10,  9, 11, 10, 12, 11, 13, 12
+wiener_shufD:  db  4, -1,  5, -1,  6, -1,  7, -1,  8, -1,  9, -1, 10, -1, 11, -1
+wiener_l_shuf: db  0,  0,  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
+pb_unpcklwdw:  db  0,  1,  0,  1,  4,  5,  4,  5,  8,  9,  8,  9, 12, 13, 12, 13
+
+pb_right_ext_mask: times 24 db 0xff
+                   times 8 db 0
+pb_0:          times 16 db 0
+pb_3:          times 16 db 3
+pb_15:         times 16 db 15
+pb_0_1:        times 8 db 0, 1
+pb_14_15:      times 8 db 14, 15
+pw_1:          times 8 dw 1
+pw_16:         times 8 dw 16
+pw_128:        times 8 dw 128
+pw_256:        times 8 dw 256
+pw_2048:       times 8 dw 2048
+pw_2056:       times 8 dw 2056
+pw_m16380:     times 8 dw -16380
+pw_5_6:        times 4 dw 5, 6
+pd_1024:       times 4 dd 1024
 %if ARCH_X86_32
-pd_256: times 4 dd 256
-pd_512: times 4 dd 512
-pd_2048: times 4 dd 2048
+pd_512:        times 4 dd 512
+pd_2048:       times 4 dd 2048
 %endif
 pd_0xF0080029: times 4 dd 0xF0080029
 pd_0xF00801C7: times 4 dd 0XF00801C7
@@ -95,539 +94,1037 @@
  %define PIC_sym(sym)   (sym)
 %endif
 
-%macro PALIGNR 4 ; dst, src1, src2, shift
- %if cpuflag(ssse3)
-    palignr       %1, %2, %3, %4
- %else
-  %assign %%i regnumof%+%1 + 1
-  %define %%tmp m %+ %%i
-    psrldq        %1, %3, %4
-    pslldq     %%tmp, %2, 16-%4
-    por           %1, %%tmp
- %endif
-%endmacro
-
-%macro PMADDUBSW 5 ; dst, src, zero, tmp, reset_zero
- %if cpuflag(ssse3)
-    pmaddubsw     %1, %2
- %else
-  %if %5 == 1
-    pxor          %3, %3
-  %endif
-    punpckhbw     %4, %1, %3
-    punpcklbw     %1, %3
-    pmaddwd       %4, %2
-    pmaddwd       %1, %2
-    packssdw      %1, %4
- %endif
-%endmacro
-
-;;;;;;;;;;;;;;;;;;;;;;
-;;      wiener      ;;
-;;;;;;;;;;;;;;;;;;;;;;
-
-%macro WIENER_H 0
+%macro WIENER 0
 %if ARCH_X86_64
-cglobal wiener_filter_h, 5, 15, 16, dst, left, src, stride, flt, w, h, edge
-    mov        edged, edgem
-    movifnidn     wd, wm
-    mov           hd, hm
+DECLARE_REG_TMP 4, 10, 7, 11, 12, 13, 14 ; ring buffer pointers
+cglobal wiener_filter7, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
+                                               lpf_stride, w, edge, flt, h, x
+    %define base 0
+    mov           fltq, fltmp
+    mov          edged, r8m
+    mov             wd, wm
+    mov             hd, r6m
+    movq           m14, [fltq]
+    add           lpfq, wq
+    lea             t1, [rsp+wq*2+16]
+    mova           m15, [pw_2056]
+    add           dstq, wq
+    movq            m7, [fltq+16]
+    neg             wq
+%if cpuflag(ssse3)
+    pshufb         m14, [wiener_init]
+    mova            m8, [wiener_shufA]
+    pshufd         m12, m14, q2222  ; x0 x0
+    mova            m9, [wiener_shufB]
+    pshufd         m13, m14, q3333  ; x1 x2
+    mova           m10, [wiener_shufC]
+    punpcklqdq     m14, m14         ; x3
+    mova           m11, [wiener_shufD]
 %else
-cglobal wiener_filter_h, 5, 7, 8, -84, dst, left, src, stride, flt, w, h, edge
-    mov           r5, edgem
-    mov     [esp+12], r5
-    mov           wd, wm
-    mov           hd, hm
-    SETUP_PIC hd
- %define m15 m0
- %define m14 m1
- %define m13 m2
- %define m12 m3
+    mova           m10, [pw_m16380]
+    punpcklwd      m14, m14
+    pshufd         m11, m14, q0000 ; x0
+    pshufd         m12, m14, q1111 ; x1
+    pshufd         m13, m14, q2222 ; x2
+    pshufd         m14, m14, q3333 ; x3
 %endif
-
-    movq         m15, [fltq]
+%else
+DECLARE_REG_TMP 4, 0, _, 5
 %if cpuflag(ssse3)
-    pshufb       m12, m15, [PIC_sym(pb_6_7)]
-    pshufb       m13, m15, [PIC_sym(pb_4)]
-    pshufb       m14, m15, [PIC_sym(pb_2)]
-    pshufb       m15, m15, [PIC_sym(pb_0)]
+    %define m10         [base+wiener_shufC]
+    %define m11         [base+wiener_shufD]
+    %define stk_off     96
 %else
-    pshuflw      m12, m15, q3333
-    punpcklbw    m15, m15
-    pshufhw      m13, m15, q0000
-    pshuflw      m14, m15, q2222
-    pshuflw      m15, m15, q0000
-    punpcklqdq   m12, m12
-    punpckhqdq   m13, m13
-    punpcklqdq   m14, m14
-    punpcklqdq   m15, m15
-    psraw        m13, 8
-    psraw        m14, 8
-    psraw        m15, 8
+    %define m10         [base+pw_m16380]
+    %define m11         [stk+96]
+    %define stk_off     112
 %endif
-
-%if ARCH_X86_64
-    mova         m11, [pw_2048]
-    mova         m10, [pw_16380]
-    lea          r11, [pb_right_ext_mask]
-
-    DEFINE_ARGS dst, left, src, stride, x, w, h, edge, srcptr, dstptr, xlim
+cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride
+    %define base        r6-pb_right_ext_mask-21
+    %define stk         esp
+    %define dstq        leftq
+    %define edgeb       byte edged
+    %define edged       [stk+ 8]
+    %define dstmp       [stk+12]
+    %define hd    dword [stk+16]
+    %define wq          [stk+20]
+    %define dst_strideq [stk+24]
+    %define leftmp      [stk+28]
+    %define t2          [stk+32]
+    %define t4          [stk+36]
+    %define t5          [stk+40]
+    %define t6          [stk+44]
+    %define m8          [base+wiener_shufA]
+    %define m9          [base+wiener_shufB]
+    %define m12         [stk+48]
+    %define m13         [stk+64]
+    %define m14         [stk+80]
+    %define m15         [base+pw_2056]
+    mov             r1, r7m ; flt
+    mov             r0, r0m ; dst
+    mov             r5, r5m ; w
+    mov           lpfq, lpfm
+    mov             r2, r8m ; edge
+    mov             r4, r6m ; h
+    movq            m3, [r1+ 0]
+    movq            m7, [r1+16]
+    add             r0, r5
+    mov             r1, r1m ; dst_stride
+    add           lpfq, r5
+    mov          edged, r2
+    mov             r2, r2m ; left
+    mov          dstmp, r0
+    lea             t1, [rsp+r5*2+stk_off]
+    mov             hd, r4
+    neg             r5
+    mov    lpf_strideq, lpf_stridem
+    LEA             r6, pb_right_ext_mask+21
+    mov             wq, r5
+    mov    dst_strideq, r1
+    mov         leftmp, r2
+%if cpuflag(ssse3)
+    pshufb          m3, [base+wiener_init]
+    pshufd          m1, m3, q2222
+    pshufd          m2, m3, q3333
+    punpcklqdq      m3, m3
 %else
- %define m10    [PIC_sym(pw_16380)]
- %define m11    [PIC_sym(pw_2048)]
- %define m12    [esp+0x14]
- %define m13    [esp+0x24]
- %define m14    [esp+0x34]
- %define m15    [esp+0x44]
-    mova         m12, m3
-    mova         m13, m2
-    mova         m14, m1
-    mova         m15, m0
-
-    DEFINE_ARGS dst, left, src, stride, x, w, h, edge
- %define srcptrq    srcq
- %define dstptrq    dstq
- %define hd         dword [esp+ 0]
- %define edgeb      byte  [esp+12]
- %define xlimd      dword [esp+16]
+    punpcklwd       m3, m3
+    pshufd          m0, m3, q0000
+    pshufd          m1, m3, q1111
+    pshufd          m2, m3, q2222
+    pshufd          m3, m3, q3333
+    mova           m11, m0
 %endif
-
-    ; if (edge & has_right) align_w_to_16
-    ; else w -= 3, and use that as limit in x loop
-    test       edgeb, 2 ; has_right
-    jnz .align
-    mov        xlimd, -3
-    jmp .loop
-.align:
-    add           wd, 15
-    and           wd, ~15
+    mova           m12, m1
+    mova           m13, m2
+    mova           m14, m3
+%endif
+    pshufd          m6, m7, q0000 ; y0 y1
+    pshufd          m7, m7, q1111 ; y2 y3
+    test         edgeb, 4 ; LR_HAVE_TOP
+    jz .no_top
+    call .h_top
+    add           lpfq, lpf_strideq
+    mov             t6, t1
+    mov             t5, t1
+    add             t1, 384*2
+    call .h_top
+    lea             t3, [lpfq+lpf_strideq*4]
+    mov           lpfq, dstmp
+    mov [rsp+gprsize*1], lpf_strideq
+    add             t3, lpf_strideq
+    mov [rsp+gprsize*0], t3 ; below
+    mov             t4, t1
+    add             t1, 384*2
+    call .h
+    mov             t3, t1
+    mov             t2, t1
+    dec             hd
+    jz .v1
+    add           lpfq, dst_strideq
+    add             t1, 384*2
+    call .h
+    mov             t2, t1
+    dec             hd
+    jz .v2
+    add           lpfq, dst_strideq
+    add             t1, 384*2
+    call .h
+    dec             hd
+    jz .v3
+.main:
+    lea             t0, [t1+384*2]
+.main_loop:
+    call .hv
+    dec             hd
+    jnz .main_loop
+    test         edgeb, 8 ; LR_HAVE_BOTTOM
+    jz .v3
+    mov           lpfq, [rsp+gprsize*0]
+    call .hv_bottom
+    add           lpfq, [rsp+gprsize*1]
+    call .hv_bottom
+.v1:
+    call mangle(private_prefix %+ _wiener_filter7_ssse3).v
+    RET
+.no_top:
+    lea             t3, [lpfq+lpf_strideq*4]
+    mov           lpfq, dstmp
+    mov [rsp+gprsize*1], lpf_strideq
+    lea             t3, [t3+lpf_strideq*2]
+    mov [rsp+gprsize*0], t3
+    call .h
+    mov             t6, t1
+    mov             t5, t1
+    mov             t4, t1
+    mov             t3, t1
+    mov             t2, t1
+    dec             hd
+    jz .v1
+    add           lpfq, dst_strideq
+    add             t1, 384*2
+    call .h
+    mov             t2, t1
+    dec             hd
+    jz .v2
+    add           lpfq, dst_strideq
+    add             t1, 384*2
+    call .h
+    dec             hd
+    jz .v3
+    lea             t0, [t1+384*2]
+    call .hv
+    dec             hd
+    jz .v3
+    add             t0, 384*8
+    call .hv
+    dec             hd
+    jnz .main
+.v3:
+    call mangle(private_prefix %+ _wiener_filter7_ssse3).v
+.v2:
+    call mangle(private_prefix %+ _wiener_filter7_ssse3).v
+    jmp .v1
+.extend_right:
+    movd            m2, [lpfq-4]
 %if ARCH_X86_64
-    xor        xlimd, xlimd
+    push            r0
+    lea             r0, [pb_right_ext_mask+21]
+    movu            m0, [r0+xq+0]
+    movu            m1, [r0+xq+8]
+    pop             r0
 %else
-    mov        xlimd, 0
+    movu            m0, [r6+xq+0]
+    movu            m1, [r6+xq+8]
 %endif
-
-    ; main y loop for vertical filter
-.loop:
-%if ARCH_X86_64
-    mov      srcptrq, srcq
-    mov      dstptrq, dstq
-    lea           xd, [wq+xlimq]
+%if cpuflag(ssse3)
+    pshufb          m2, [base+pb_3]
 %else
-    mov      [esp+8], srcq
-    mov      [esp+4], dstq
-    mov           xd, xlimd
-    add           xd, wd
+    punpcklbw       m2, m2
+    pshuflw         m2, m2, q3333
+    punpcklqdq      m2, m2
 %endif
-
-    ; load left edge pixels
-    test       edgeb, 1 ; have_left
-    jz .emu_left
-    test       leftq, leftq ; left == NULL for the edge-extended bottom/top
-    jz .load_left_combined
-    movd          m0, [leftq]
-    movd          m1, [srcq]
-    punpckldq     m0, m1
-    pslldq        m0, 9
-    add        leftq, 4
-    jmp .left_load_done
-.load_left_combined:
-    movq          m0, [srcq-3]
-    pslldq        m0, 10
-    jmp .left_load_done
-.emu_left:
-    movd          m0, [srcq]
+    pand            m4, m0
+    pand            m5, m1
+    pandn           m0, m2
+    pandn           m1, m2
+    por             m4, m0
+    por             m5, m1
+    ret
+.h:
+    %define stk esp+4 ; offset due to call
+    mov             xq, wq
+    test         edgeb, 1 ; LR_HAVE_LEFT
+    jz .h_extend_left
+    movifnidn    leftq, leftmp
+    mova            m4, [lpfq+xq]
+    movd            m5, [leftq]
+    add          leftq, 4
+    pslldq          m4, 4
+    por             m4, m5
+    movifnidn   leftmp, leftq
+    jmp .h_main
+.h_extend_left:
 %if cpuflag(ssse3)
-    pshufb        m0, [PIC_sym(pb_14x0_1_2)]
+    mova            m4, [lpfq+xq]
+    pshufb          m4, [base+wiener_l_shuf]
 %else
-    pslldq        m1, m0, 13
-    punpcklbw     m0, m0
-    pshuflw       m0, m0, q0000
-    punpcklqdq    m0, m0
-    psrldq        m0, 2
-    por           m0, m1
+    mova            m5, [lpfq+xq]
+    pshufd          m4, m5, q2103
+    punpcklbw       m5, m5
+    punpcklwd       m5, m5
+    movss           m4, m5
 %endif
-
-    ; load right edge pixels
-.left_load_done:
-    cmp           xd, 16
-    jg .main_load
-    test          xd, xd
-    jg .load_and_splat
-    je .splat_right
-
-    ; for very small images (w=[1-2]), edge-extend the original cache,
-    ; ugly, but only runs in very odd cases
+    jmp .h_main
+.h_top:
+    mov             xq, wq
+    test         edgeb, 1 ; LR_HAVE_LEFT
+    jz .h_extend_left
+.h_loop:
+    movu            m4, [lpfq+xq-4]
+.h_main:
+    movu            m5, [lpfq+xq+4]
+    test         edgeb, 2 ; LR_HAVE_RIGHT
+    jnz .h_have_right
+    cmp             xd, -18
+    jl .h_have_right
+    call .extend_right
+.h_have_right:
+%macro %%h7 0
 %if cpuflag(ssse3)
-    add           wd, wd
- %if ARCH_X86_64
-    pshufb        m0, [r11-pb_right_ext_mask+pb_0_to_15_min_n+wq*8-16]
- %else
-    pshufb        m0, [PIC_sym(pb_0_to_15_min_n)+wq*8-16]
- %endif
-    shr           wd, 1
+    pshufb          m0, m4, m8
+    pmaddubsw       m0, m12
+    pshufb          m1, m5, m8
+    pmaddubsw       m1, m12
+    pshufb          m2, m4, m9
+    pmaddubsw       m2, m13
+    pshufb          m3, m5, m9
+    pmaddubsw       m3, m13
+    paddw           m0, m2
+    pshufb          m2, m4, m10
+    pmaddubsw       m2, m13
+    paddw           m1, m3
+    pshufb          m3, m5, m10
+    pmaddubsw       m3, m13
+    pshufb          m4, m11
+    paddw           m0, m2
+    pmullw          m2, m14, m4
+    pshufb          m5, m11
+    paddw           m1, m3
+    pmullw          m3, m14, m5
+    psllw           m4, 7
+    psllw           m5, 7
+    paddw           m0, m2
+    mova            m2, [base+pw_m16380]
+    paddw           m1, m3
+    paddw           m4, m2
+    paddw           m5, m2
+    paddsw          m0, m4
+    paddsw          m1, m5
 %else
-    shl           wd, 4
-    pcmpeqd       m2, m2
-    movd          m3, wd
-    psrldq        m2, 2
-    punpckhbw     m1, m0, m0
-    pshufhw       m1, m1, q1122
-    psllq         m1, m3
-    pand          m0, m2
-    pandn         m2, m1
-    por           m0, m2
-    shr           wd, 4
+    psrldq          m0, m4, 1
+    pslldq          m1, m4, 1
+    pxor            m3, m3
+    punpcklbw       m0, m3
+    punpckhbw       m1, m3
+    paddw           m0, m1
+    pmullw          m0, m11
+    psrldq          m1, m4, 2
+    pslldq          m2, m4, 2
+    punpcklbw       m1, m3
+    punpckhbw       m2, m3
+    paddw           m1, m2
+    pmullw          m1, m12
+    paddw           m0, m1
+    pshufd          m2, m4, q0321
+    punpcklbw       m2, m3
+    pmullw          m1, m14, m2
+    paddw           m0, m1
+    psrldq          m1, m4, 3
+    pslldq          m4, 3
+    punpcklbw       m1, m3
+    punpckhbw       m4, m3
+    paddw           m1, m4
+    pmullw          m1, m13
+    paddw           m0, m1
+    psllw           m2, 7
+    paddw           m2, m10
+    paddsw          m0, m2
+    psrldq          m1, m5, 1
+    pslldq          m2, m5, 1
+    punpcklbw       m1, m3
+    punpckhbw       m2, m3
+    paddw           m1, m2
+    pmullw          m1, m11
+    psrldq          m2, m5, 2
+    pslldq          m4, m5, 2
+    punpcklbw       m2, m3
+    punpckhbw       m4, m3
+    paddw           m2, m4
+    pmullw          m2, m12
+    paddw           m1, m2
+    pshufd          m4, m5, q0321
+    punpcklbw       m4, m3
+    pmullw          m2, m14, m4
+    paddw           m1, m2
+    psrldq          m2, m5, 3
+    pslldq          m5, 3
+    punpcklbw       m2, m3
+    punpckhbw       m5, m3
+    paddw           m2, m5
+    pmullw          m2, m13
+    paddw           m1, m2
+    psllw           m4, 7
+    paddw           m4, m10
+    paddsw          m1, m4
 %endif
-
-    ; main x loop, mostly this starts in .main_load
-.splat_right:
-    ; no need to load new pixels, just extend them from the (possibly previously
-    ; extended) previous load into m0
+%endmacro
+    %%h7
+    psraw           m0, 3
+    psraw           m1, 3
+    paddw           m0, m15
+    paddw           m1, m15
+    mova  [t1+xq*2+ 0], m0
+    mova  [t1+xq*2+16], m1
+    add             xq, 16
+    jl .h_loop
+    ret
+ALIGN function_align
+.hv:
+    add           lpfq, dst_strideq
+    mov             xq, wq
+    test         edgeb, 1 ; LR_HAVE_LEFT
+    jz .hv_extend_left
+    movifnidn    leftq, leftmp
+    mova            m4, [lpfq+xq]
+    movd            m5, [leftq]
+    add          leftq, 4
+    pslldq          m4, 4
+    por             m4, m5
+    movifnidn   leftmp, leftq
+    jmp .hv_main
+.hv_extend_left:
 %if cpuflag(ssse3)
-    pshufb        m1, m0, [PIC_sym(pb_15)]
+    mova            m4, [lpfq+xq]
+    pshufb          m4, [base+wiener_l_shuf]
 %else
-    punpckhbw     m1, m0, m0
-    pshufhw       m1, m1, q3333
-    punpckhqdq    m1, m1
+    mova            m5, [lpfq+xq]
+    pshufd          m4, m5, q2103
+    punpcklbw       m5, m5
+    punpcklwd       m5, m5
+    movss           m4, m5
 %endif
-    jmp .main_loop
-.load_and_splat:
-    ; load new pixels and extend edge for right-most
-    movu          m1, [srcptrq+3]
+    jmp .hv_main
+.hv_bottom:
+    mov             xq, wq
+    test         edgeb, 1 ; LR_HAVE_LEFT
+    jz .hv_extend_left
+.hv_loop:
+    movu            m4, [lpfq+xq-4]
+.hv_main:
+    movu            m5, [lpfq+xq+4]
+    test         edgeb, 2 ; LR_HAVE_RIGHT
+    jnz .hv_have_right
+    cmp             xd, -18
+    jl .hv_have_right
+    call .extend_right
+.hv_have_right:
+    %%h7
 %if ARCH_X86_64
-    sub          r11, xq
-    movu          m2, [r11+16]
-    add          r11, xq
+    mova            m2, [t4+xq*2]
+    paddw           m2, [t2+xq*2]
 %else
-    sub      PIC_reg, xd
-    movu          m2, [PIC_sym(pb_right_ext_mask)+16]
-    add      PIC_reg, xd
+    mov             r2, t4
+    mova            m2, [r2+xq*2]
+    mov             r2, t2
+    paddw           m2, [r2+xq*2]
+    mov             r2, t5
 %endif
-    movd          m3, [srcptrq+2+xq]
-%if cpuflag(ssse3)
-    pshufb        m3, [PIC_sym(pb_0)]
+    mova            m3, [t3+xq*2]
+%if ARCH_X86_64
+    mova            m5, [t5+xq*2]
 %else
-    punpcklbw     m3, m3
-    pshuflw       m3, m3, q0000
-    punpcklqdq    m3, m3
+    mova            m5, [r2+xq*2]
+    mov             r2, t6
 %endif
-    pand          m1, m2
-    pxor          m2, [PIC_sym(pb_right_ext_mask)]
-    pand          m3, m2
-    pxor          m2, [PIC_sym(pb_right_ext_mask)]
-    por           m1, m3
-    jmp .main_loop
-.main_load:
-    ; load subsequent line
-    movu          m1, [srcptrq+3]
-.main_loop:
+    paddw           m5, [t1+xq*2]
+    psraw           m0, 3
+    psraw           m1, 3
+    paddw           m0, m15
+    paddw           m1, m15
 %if ARCH_X86_64
-    PALIGNR       m2, m1, m0, 10
-    PALIGNR       m3, m1, m0, 11
-    PALIGNR       m4, m1, m0, 12
-    PALIGNR       m5, m1, m0, 13
-    PALIGNR       m6, m1, m0, 14
-    PALIGNR       m7, m1, m0, 15
-
-    punpcklbw     m0, m2, m1
-    punpckhbw     m2, m1
-    punpcklbw     m8, m3, m7
-    punpckhbw     m3, m7
-    punpcklbw     m7, m4, m6
-    punpckhbw     m4, m6
-    PMADDUBSW     m0, m15, m6, m9, 1
-    PMADDUBSW     m2, m15, m6, m9, 0
-    PMADDUBSW     m8, m14, m6, m9, 0
-    PMADDUBSW     m3, m14, m6, m9, 0
-    PMADDUBSW     m7, m13, m6, m9, 0
-    PMADDUBSW     m4, m13, m6, m9, 0
-    paddw         m0, m8
-    paddw         m2, m3
- %if cpuflag(ssse3)
-    pxor          m6, m6
- %endif
-    punpcklbw     m3, m5, m6
-    punpckhbw     m5, m6
-    psllw         m8, m3, 7
-    psllw         m6, m5, 7
-    psubw         m8, m10
-    psubw         m6, m10
-    pmullw        m3, m12
-    pmullw        m5, m12
-    paddw         m0, m7
-    paddw         m2, m4
-    paddw         m0, m3
-    paddw         m2, m5
-    paddsw        m0, m8 ; see the avx2 for an explanation
-    paddsw        m2, m6 ; of how the clipping works here
-    psraw         m0, 3
-    psraw         m2, 3
-    paddw         m0, m11
-    paddw         m2, m11
-    mova [dstptrq+ 0], m0
-    mova [dstptrq+16], m2
+    paddw           m4, m0, [t6+xq*2]
 %else
-    PALIGNR       m2, m1, m0, 10
-    punpcklbw     m3, m2, m1
-    punpckhbw     m2, m1
-    PMADDUBSW     m3, m15, m4, m5, 1
-    PMADDUBSW     m2, m15, m4, m5, 0
-    PALIGNR       m4, m1, m0, 11
-    PALIGNR       m5, m1, m0, 15
-    punpcklbw     m6, m4, m5
-    punpckhbw     m4, m5
-    PMADDUBSW     m6, m14, m5, m7, 1
-    PMADDUBSW     m4, m14, m5, m7, 0
-    paddw         m3, m6
-    paddw         m2, m4
-    PALIGNR       m4, m1, m0, 12
-    PALIGNR       m5, m1, m0, 14
-    punpcklbw     m6, m4, m5
-    punpckhbw     m4, m5
-    PMADDUBSW     m6, m13, m5, m7, 1
-    PMADDUBSW     m4, m13, m5, m7, 0
-    paddw         m3, m6
-    paddw         m2, m4
-    PALIGNR       m6, m1, m0, 13
- %if cpuflag(ssse3)
-    pxor          m5, m5
- %endif
-    punpcklbw     m4, m6, m5
-    punpckhbw     m6, m5
-    psllw         m5, m4, 7
-    psllw         m7, m6, 7
-    psubw         m5, m10
-    psubw         m7, m10
-    pmullw        m4, m12
-    pmullw        m6, m12
-    paddw         m3, m4
-    paddw         m2, m6
-    paddsw        m3, m5
-    paddsw        m2, m7
-    psraw         m3, 3
-    psraw         m2, 3
-    paddw         m3, m11
-    paddw         m2, m11
-    mova [dstptrq+ 0], m3
-    mova [dstptrq+16], m2
+    paddw           m4, m0, [r2+xq*2]
+    mov             r2, t4
 %endif
+    mova     [t0+xq*2], m0
+    punpcklwd       m0, m2, m3
+    pmaddwd         m0, m7
+    punpckhwd       m2, m3
+    pmaddwd         m2, m7
+    punpcklwd       m3, m4, m5
+    pmaddwd         m3, m6
+    punpckhwd       m4, m5
+    pmaddwd         m4, m6
+    paddd           m0, m3
+    mova            m3, [t3+xq*2+16]
+    paddd           m4, m2
+%if ARCH_X86_64
+    mova            m2, [t4+xq*2+16]
+    paddw           m2, [t2+xq*2+16]
+    mova            m5, [t5+xq*2+16]
+%else
+    mova            m2, [r2+xq*2+16]
+    mov             r2, t2
+    paddw           m2, [r2+xq*2+16]
+    mov             r2, t5
+    mova            m5, [r2+xq*2+16]
+    mov             r2, t6
+%endif
+    paddw           m5, [t1+xq*2+16]
+    psrad           m0, 11
+    psrad           m4, 11
+    packssdw        m0, m4
+%if ARCH_X86_64
+    paddw           m4, m1, [t6+xq*2+16]
+%else
+    paddw           m4, m1, [r2+xq*2+16]
+    mov           dstq, dstmp
+%endif
+    mova  [t0+xq*2+16], m1
+    punpcklwd       m1, m2, m3
+    pmaddwd         m1, m7
+    punpckhwd       m2, m3
+    pmaddwd         m2, m7
+    punpcklwd       m3, m4, m5
+    pmaddwd         m3, m6
+    punpckhwd       m4, m5
+    pmaddwd         m4, m6
+    paddd           m1, m3
+    paddd           m2, m4
+    psrad           m1, 11
+    psrad           m2, 11
+    packssdw        m1, m2
+    packuswb        m0, m1
+    mova     [dstq+xq], m0
+    add             xq, 16
+    jl .hv_loop
+    add           dstq, dst_strideq
+%if ARCH_X86_64
+    mov             t6, t5
+    mov             t5, t4
+    mov             t4, t3
+    mov             t3, t2
+    mov             t2, t1
+    mov             t1, t0
+    mov             t0, t6
+%else
+    mov          dstmp, dstq
+    mov             r1, t5
+    mov             r2, t4
+    mov             t6, r1
+    mov             t5, r2
+    mov             t4, t3
+    mov             t3, t2
+    mov             t2, t1
+    mov             t1, t0
+    mov             t0, r1
+%endif
+    ret
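
The register shuffle just above (t6 = t5, ..., t1 = t0, t0 = t6) is how the ring of intermediate rows advances after each output row: everything slides down by one slot, and t0 ends up aliasing the new oldest row (t6), whose storage can be overwritten in place because every 16-pixel chunk of .hv reads [t6+xq*2] before it stores [t0+xq*2]. A minimal scalar sketch of the same bookkeeping, with an illustrative pointer array standing in for the t0-t6 registers (not the dav1d API):

    #include <stdint.h>

    /* Illustrative only: pointer rotation at the end of .hv for the 7-tap
     * filter.  t[0] is where the next horizontally filtered row is written;
     * t[1]..t[6] hold the previous rows, newest to oldest. */
    static void wiener7_rotate(int16_t *t[7])
    {
        t[6] = t[5];
        t[5] = t[4];
        t[4] = t[3];
        t[3] = t[2];
        t[2] = t[1];
        t[1] = t[0];
        t[0] = t[6]; /* t0 now shares storage with the oldest live row;  */
                     /* safe because .hv reads [t6] before storing [t0]. */
    }
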
+%if cpuflag(ssse3) ; identical in sse2 and ssse3, so share code
+.v:
+    mov             xq, wq
+.v_loop:
+%if ARCH_X86_64
+    mova            m1, [t4+xq*2]
+    paddw           m1, [t2+xq*2]
+%else
+    mov             r2, t4
+    mova            m1, [r2+xq*2]
+    mov             r2, t2
+    paddw           m1, [r2+xq*2]
+    mov             r2, t6
+%endif
+    mova            m2, [t3+xq*2]
+    mova            m4, [t1+xq*2]
+%if ARCH_X86_64
+    paddw           m3, m4, [t6+xq*2]
+    paddw           m4, [t5+xq*2]
+%else
+    paddw           m3, m4, [r2+xq*2]
+    mov             r2, t5
+    paddw           m4, [r2+xq*2]
+    mov             r2, t4
+%endif
+    punpcklwd       m0, m1, m2
+    pmaddwd         m0, m7
+    punpckhwd       m1, m2
+    pmaddwd         m1, m7
+    punpcklwd       m2, m3, m4
+    pmaddwd         m2, m6
+    punpckhwd       m3, m4
+    pmaddwd         m3, m6
+    paddd           m0, m2
+    paddd           m1, m3
+%if ARCH_X86_64
+    mova            m2, [t4+xq*2+16]
+    paddw           m2, [t2+xq*2+16]
+%else
+    mova            m2, [r2+xq*2+16]
+    mov             r2, t2
+    paddw           m2, [r2+xq*2+16]
+    mov             r2, t6
+%endif
+    mova            m3, [t3+xq*2+16]
+    mova            m5, [t1+xq*2+16]
+%if ARCH_X86_64
+    paddw           m4, m5, [t6+xq*2+16]
+    paddw           m5, [t5+xq*2+16]
+%else
+    paddw           m4, m5, [r2+xq*2+16]
+    mov             r2, t5
+    paddw           m5, [r2+xq*2+16]
+    movifnidn     dstq, dstmp
+%endif
+    psrad           m0, 11
+    psrad           m1, 11
+    packssdw        m0, m1
+    punpcklwd       m1, m2, m3
+    pmaddwd         m1, m7
+    punpckhwd       m2, m3
+    pmaddwd         m2, m7
+    punpcklwd       m3, m4, m5
+    pmaddwd         m3, m6
+    punpckhwd       m4, m5
+    pmaddwd         m4, m6
+    paddd           m1, m3
+    paddd           m2, m4
+    psrad           m1, 11
+    psrad           m2, 11
+    packssdw        m1, m2
+    packuswb        m0, m1
+    mova     [dstq+xq], m0
+    add             xq, 16
+    jl .v_loop
+    add           dstq, dst_strideq
+%if ARCH_X86_64
+    mov             t6, t5
+    mov             t5, t4
+%else
+    mov          dstmp, dstq
+    mov             r1, t5
+    mov             r2, t4
+    mov             t6, r1
+    mov             t5, r2
+%endif
+    mov             t4, t3
+    mov             t3, t2
+    mov             t2, t1
+    ret
+%endif
 
-    mova          m0, m1
-    add      srcptrq, 16
-    add      dstptrq, 32
-    sub           xd, 16
-    cmp           xd, 16
-    jg .main_load
-    test          xd, xd
-    jg .load_and_splat
-    cmp           xd, xlimd
-    jg .splat_right
-
-%if ARCH_X86_32
-    mov         srcq, [esp+8]
-    mov         dstq, [esp+4]
+%if ARCH_X86_64
+cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
+                                             lpf_stride, w, edge, flt, h, x
+    mov           fltq, fltmp
+    mov          edged, r8m
+    mov             wd, wm
+    mov             hd, r6m
+    movq           m14, [fltq]
+    add           lpfq, wq
+    mova            m8, [pw_m16380]
+    lea             t1, [rsp+wq*2+16]
+    mova           m15, [pw_2056]
+    add           dstq, wq
+    movq            m7, [fltq+16]
+    neg             wq
+%if cpuflag(ssse3)
+    pshufb         m14, [wiener_init]
+    mova            m9, [wiener_shufB]
+    pshufd         m13, m14, q3333  ; x1 x2
+    mova           m10, [wiener_shufC]
+    punpcklqdq     m14, m14         ; x3
+    mova           m11, [wiener_shufD]
+    mova           m12, [wiener_l_shuf]
+%else
+    punpcklwd      m14, m14
+    pshufd         m11, m14, q1111 ; x1
+    pshufd         m13, m14, q2222 ; x2
+    pshufd         m14, m14, q3333 ; x3
 %endif
-    add         srcq, strideq
-    add         dstq, 384*2
-    dec           hd
-    jg .loop
+%else
+%if cpuflag(ssse3)
+    %define stk_off     80
+%else
+    %define m11         [stk+80]
+    %define stk_off     96
+%endif
+cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride
+    %define stk         esp
+    %define leftmp      [stk+28]
+    %define m8          [base+pw_m16380]
+    %define m12         [base+wiener_l_shuf]
+    %define m14         [stk+48]
+    mov             r1, r7m ; flt
+    mov             r0, r0m ; dst
+    mov             r5, r5m ; w
+    mov           lpfq, lpfm
+    mov             r2, r8m ; edge
+    mov             r4, r6m ; h
+    movq            m2, [r1+ 0]
+    movq            m7, [r1+16]
+    add             r0, r5
+    mov             r1, r1m ; dst_stride
+    add           lpfq, r5
+    mov          edged, r2
+    mov             r2, r2m ; left
+    mov          dstmp, r0
+    lea             t1, [rsp+r5*2+stk_off]
+    mov             hd, r4
+    neg             r5
+    mov    lpf_strideq, lpf_stridem
+    LEA             r6, pb_right_ext_mask+21
+    mov             wq, r5
+    mov    dst_strideq, r1
+    mov         leftmp, r2
+%if cpuflag(ssse3)
+    pshufb          m2, [base+wiener_init]
+    pshufd          m1, m2, q3333
+    punpcklqdq      m2, m2
+%else
+    punpcklwd       m2, m2
+    pshufd          m0, m2, q1111
+    pshufd          m1, m2, q2222
+    pshufd          m2, m2, q3333
+    mova           m11, m0
+%endif
+    mova           m13, m1
+    mova           m14, m2
+%endif
+    pshufd          m6, m7, q0000 ; __ y1
+    pshufd          m7, m7, q1111 ; y2 y3
+    test         edgeb, 4 ; LR_HAVE_TOP
+    jz .no_top
+    call .h_top
+    add           lpfq, lpf_strideq
+    mov             t4, t1
+    add             t1, 384*2
+    call .h_top
+    lea             xq, [lpfq+lpf_strideq*4]
+    mov           lpfq, dstmp
+    mov             t3, t1
+    add             t1, 384*2
+    mov [rsp+gprsize*1], lpf_strideq
+    add             xq, lpf_strideq
+    mov [rsp+gprsize*0], xq ; below
+    call .h
+    mov             t2, t1
+    dec             hd
+    jz .v1
+    add           lpfq, dst_strideq
+    add             t1, 384*2
+    call .h
+    dec             hd
+    jz .v2
+.main:
+    mov             t0, t4
+.main_loop:
+    call .hv
+    dec             hd
+    jnz .main_loop
+    test         edgeb, 8 ; LR_HAVE_BOTTOM
+    jz .v2
+    mov           lpfq, [rsp+gprsize*0]
+    call .hv_bottom
+    add           lpfq, [rsp+gprsize*1]
+    call .hv_bottom
+.end:
     RET
+.no_top:
+    lea             t3, [lpfq+lpf_strideq*4]
+    mov           lpfq, dstmp
+    mov [rsp+gprsize*1], lpf_strideq
+    lea             t3, [t3+lpf_strideq*2]
+    mov [rsp+gprsize*0], t3
+    call .h
+    mov             t4, t1
+    mov             t3, t1
+    mov             t2, t1
+    dec             hd
+    jz .v1
+    add           lpfq, dst_strideq
+    add             t1, 384*2
+    call .h
+    dec             hd
+    jz .v2
+    lea             t0, [t1+384*2]
+    call .hv
+    dec             hd
+    jz .v2
+    add             t0, 384*6
+    call .hv
+    dec             hd
+    jnz .main
+.v2:
+    call mangle(private_prefix %+ _wiener_filter5_ssse3).v
+    add           dstq, dst_strideq
+    mov             t4, t3
+    mov             t3, t2
+    mov             t2, t1
+    movifnidn    dstmp, dstq
+.v1:
+    call mangle(private_prefix %+ _wiener_filter5_ssse3).v
+    jmp .end
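
Taken together, the prologue and the labels above give the 5-tap function a simple row flow: when LR_HAVE_TOP is set the ring is primed with two .h_top passes over the rows above the unit, the first two body rows get horizontal-only .h passes until the vertical window is full, each remaining row is handled by one interleaved .hv pass, and the final two output rows come either from the rows below the unit (.hv_bottom) or, without LR_HAVE_BOTTOM, from the vertical-only .v2/.v1 path. A loose C outline under those assumptions, with hypothetical stub names and the small-h and .no_top special cases glossed over:

    /* Loose outline only -- the stubs stand in for the real .h/.hv/.v paths. */
    #define LR_HAVE_TOP    4 /* bit tested by "test edgeb, 4" above */
    #define LR_HAVE_BOTTOM 8 /* bit tested by "test edgeb, 8" above */

    static void prime_ring_from_above(void) {} /* two .h_top calls */
    static void horiz_only_row(void)       {} /* .h               */
    static void hv_row(void)               {} /* .hv              */
    static void hv_row_from_below(void)    {} /* .hv_bottom       */
    static void vert_only_row(void)        {} /* .v               */

    static void wiener5_outline(int h, int edges)
    {
        if (edges & LR_HAVE_TOP)
            prime_ring_from_above();
        horiz_only_row();            /* fill the ring ...            */
        horiz_only_row();            /* ... before any row is output */
        for (int y = 0; y < h - 2; y++)
            hv_row();                /* one output row per input row */
        if (edges & LR_HAVE_BOTTOM) {
            hv_row_from_below();
            hv_row_from_below();
        } else {
            vert_only_row();         /* .v2 */
            vert_only_row();         /* .v1 */
        }
    }
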
+.h:
+    %define stk esp+4
+    mov             xq, wq
+    test         edgeb, 1 ; LR_HAVE_LEFT
+    jz .h_extend_left
+    movifnidn    leftq, leftmp
+    mova            m4, [lpfq+xq]
+    movd            m5, [leftq]
+    add          leftq, 4
+    pslldq          m4, 4
+    por             m4, m5
+    movifnidn   leftmp, leftq
+    jmp .h_main
+.h_extend_left:
+%if cpuflag(ssse3)
+    mova            m4, [lpfq+xq]
+    pshufb          m4, m12
+%else
+    mova            m5, [lpfq+xq]
+    pshufd          m4, m5, q2103
+    punpcklbw       m5, m5
+    punpcklwd       m5, m5
+    movss           m4, m5
+%endif
+    jmp .h_main
+.h_top:
+    mov             xq, wq
+    test         edgeb, 1 ; LR_HAVE_LEFT
+    jz .h_extend_left
+.h_loop:
+    movu            m4, [lpfq+xq-4]
+.h_main:
+    movu            m5, [lpfq+xq+4]
+    test         edgeb, 2 ; LR_HAVE_RIGHT
+    jnz .h_have_right
+    cmp             xd, -17
+    jl .h_have_right
+    call mangle(private_prefix %+ _wiener_filter7 %+ SUFFIX).extend_right
+.h_have_right:
+%macro %%h5 0
+%if cpuflag(ssse3)
+    pshufb          m0, m4, m9
+    pmaddubsw       m0, m13
+    pshufb          m1, m5, m9
+    pmaddubsw       m1, m13
+    pshufb          m2, m4, m10
+    pmaddubsw       m2, m13
+    pshufb          m3, m5, m10
+    pmaddubsw       m3, m13
+    pshufb          m4, m11
+    paddw           m0, m2
+    pmullw          m2, m14, m4
+    pshufb          m5, m11
+    paddw           m1, m3
+    pmullw          m3, m14, m5
+    psllw           m4, 7
+    psllw           m5, 7
+    paddw           m4, m8
+    paddw           m5, m8
+    paddw           m0, m2
+    paddw           m1, m3
+    paddsw          m0, m4
+    paddsw          m1, m5
+%else
+    psrldq          m0, m4, 2
+    pslldq          m1, m4, 2
+    pxor            m3, m3
+    punpcklbw       m0, m3
+    punpckhbw       m1, m3
+    paddw           m0, m1
+    pmullw          m0, m11
+    pshufd          m2, m4, q0321
+    punpcklbw       m2, m3
+    pmullw          m1, m14, m2
+    paddw           m0, m1
+    psrldq          m1, m4, 3
+    pslldq          m4, 3
+    punpcklbw       m1, m3
+    punpckhbw       m4, m3
+    paddw           m1, m4
+    pmullw          m1, m13
+    paddw           m0, m1
+    psllw           m2, 7
+    paddw           m2, m8
+    paddsw          m0, m2
+    psrldq          m1, m5, 2
+    pslldq          m4, m5, 2
+    punpcklbw       m1, m3
+    punpckhbw       m4, m3
+    paddw           m1, m4
+    pmullw          m1, m11
+    pshufd          m4, m5, q0321
+    punpcklbw       m4, m3
+    pmullw          m2, m14, m4
+    paddw           m1, m2
+    psrldq          m2, m5, 3
+    pslldq          m5, 3
+    punpcklbw       m2, m3
+    punpckhbw       m5, m3
+    paddw           m2, m5
+    pmullw          m2, m13
+    paddw           m1, m2
+    psllw           m4, 7
+    paddw           m4, m8
+    paddsw          m1, m4
+%endif
 %endmacro
-
-%macro WIENER_V 0
+    %%h5
+    psraw           m0, 3
+    psraw           m1, 3
+    paddw           m0, m15
+    paddw           m1, m15
+    mova  [t1+xq*2+ 0], m0
+    mova  [t1+xq*2+16], m1
+    add             xq, 16
+    jl .h_loop
+    ret
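
In scalar terms, %%h5 plus the psraw/paddw tail above produce, for each pixel, a biased intermediate of roughly the following form. The p[0] << 7 term supplies the +128 part of the center tap (compare the bitdepth-conditional `filter[0][3] += 128` in the checkasm hunk further down), pw_m16380 and pw_2056 are the constants loaded into m8 and m15, and the saturating paddsw is what provides the clipping; the real code does the earlier adds in 16-bit lanes and relies on wraparound, so this sketch is only approximate at the extremes.

    #include <stdint.h>

    /* Approximate scalar model of one %%h5 output (8 bpc); illustrative
     * only.  x1 = filter[1], x2 = filter[2], x3 = filter[3] as passed in,
     * p points at the center pixel, edge handling omitted. */
    static int16_t wiener5_h_px(const uint8_t *p, int x1, int x2, int x3)
    {
        int v = x1 * (p[-2] + p[2]) +
                x2 * (p[-1] + p[1]) +
                x3 *  p[0];
        v += (p[0] << 7) - 16380;   /* psllw 7; paddw pw_m16380     */
        if (v < -32768) v = -32768; /* paddsw saturation = clipping */
        if (v >  32767) v =  32767;
        return (v >> 3) + 2056;     /* psraw 3; paddw pw_2056       */
    }
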
+ALIGN function_align
+.hv:
+    add           lpfq, dst_strideq
+    mov             xq, wq
+    test         edgeb, 1 ; LR_HAVE_LEFT
+    jz .hv_extend_left
+    movifnidn    leftq, leftmp
+    mova            m4, [lpfq+xq]
+    movd            m5, [leftq]
+    add          leftq, 4
+    pslldq          m4, 4
+    por             m4, m5
+    movifnidn   leftmp, leftq
+    jmp .hv_main
+.hv_extend_left:
+%if cpuflag(ssse3)
+    mova            m4, [lpfq+xq]
+    pshufb          m4, m12
+%else
+    mova            m5, [lpfq+xq]
+    pshufd          m4, m5, q2103
+    punpcklbw       m5, m5
+    punpcklwd       m5, m5
+    movss           m4, m5
+%endif
+    jmp .hv_main
+.hv_bottom:
+    mov             xq, wq
+    test         edgeb, 1 ; LR_HAVE_LEFT
+    jz .hv_extend_left
+.hv_loop:
+    movu            m4, [lpfq+xq-4]
+.hv_main:
+    movu            m5, [lpfq+xq+4]
+    test         edgeb, 2 ; LR_HAVE_RIGHT
+    jnz .hv_have_right
+    cmp             xd, -17
+    jl .hv_have_right
+    call mangle(private_prefix %+ _wiener_filter7 %+ SUFFIX).extend_right
+.hv_have_right:
+    %%h5
+    mova            m2, [t3+xq*2]
+    paddw           m2, [t1+xq*2]
+    psraw           m0, 3
+    psraw           m1, 3
+    paddw           m0, m15
+    paddw           m1, m15
 %if ARCH_X86_64
-cglobal wiener_filter_v, 4, 10, 16, dst, stride, mid, w, h, flt, edge
-    mov        edged, edgem
-    movifnidn   fltq, fltmp
-    movifnidn     hd, hm
-    movq         m15, [fltq+16]
-    pshufd       m14, m15, q1111
-    pshufd       m15, m15, q0000
-    mova         m12, [pd_1024]
-
-    DEFINE_ARGS dst, stride, mid, w, h, y, edge, ylim, mptr, dstptr
-
-    mov        ylimd, edged
-    and        ylimd, 8 ; have_bottom
-    shr        ylimd, 2
-    sub        ylimd, 3
+    mova            m3, [t2+xq*2]
+    paddw           m4, m0, [t4+xq*2]
 %else
-cglobal wiener_filter_v, 5, 7, 8, -96, dst, stride, mid, w, h, flt, edge
- %define ylimd [esp+12]
-
-    mov          r5d, edgem
-    and          r5d, 8
-    shr          r5d, 2
-    sub          r5d, 3
-    mov        ylimd, r5d
-    mov         fltq, fltmp
-    mov        edged, edgem
-
-    SETUP_PIC edged
-
-    movq          m0, [fltq+16]
-    pshufd        m1, m0, q1111
-    pshufd        m0, m0, q0000
-    mova  [esp+0x50], m0
-    mova  [esp+0x40], m1
-
-    DEFINE_ARGS dst, stride, mid, w, h, y, edge
- %define mptrq      midq
- %define dstptrq    dstq
- %define edgeb      byte [esp]
+    mov             r2, t2
+    mova            m3, [r2+xq*2]
+    mov             r2, t4
+    paddw           m4, m0, [r2+xq*2]
 %endif
-
-    ; main x loop for vertical filter, does one column of 16 pixels
-.loop_x:
-    mova          m3, [midq] ; middle line
-
-    ; load top pixels
-    test       edgeb, 4 ; have_top
-    jz .emu_top
-    mova          m0, [midq-384*4]
-    mova          m2, [midq-384*2]
-    mova          m1, m0
-    jmp .load_bottom_pixels
-.emu_top:
-    mova          m0, m3
-    mova          m1, m3
-    mova          m2, m3
-
-    ; load bottom pixels
-.load_bottom_pixels:
-    mov           yd, hd
+    mova     [t0+xq*2], m0
+    punpcklwd       m0, m2, m3
+    pmaddwd         m0, m7
+    punpckhwd       m2, m3
+    pmaddwd         m2, m7
+    punpcklwd       m3, m4, m4
+    pmaddwd         m3, m6
+    punpckhwd       m4, m4
+    pmaddwd         m4, m6
+    paddd           m0, m3
+    paddd           m4, m2
+    mova            m2, [t3+xq*2+16]
+    paddw           m2, [t1+xq*2+16]
+    psrad           m0, 11
+    psrad           m4, 11
+    packssdw        m0, m4
 %if ARCH_X86_64
-    mov        mptrq, midq
-    mov      dstptrq, dstq
-    add           yd, ylimd
+    mova            m3, [t2+xq*2+16]
+    paddw           m4, m1, [t4+xq*2+16]
 %else
-    mov      [esp+8], midq
-    mov      [esp+4], dstq
-    add           yd, ylimd
+    paddw           m4, m1, [r2+xq*2+16]
+    mov             r2, t2
+    mova            m3, [r2+xq*2+16]
+    mov           dstq, dstmp
 %endif
-    jg .load_threelines
-
-    ; the remainder here is somewhat messy but only runs in very weird
-    ; circumstances at the bottom of the image in very small blocks (h=[1-3]),
-    ; so performance is not terribly important here...
-    je .load_twolines
-    cmp           yd, -1
-    je .load_oneline
-    ; h == 1 case
-    mova          m5, m3
-    mova          m4, m3
-    mova          m6, m3
-    jmp .loop
-.load_oneline:
-    ; h == 2 case
-    mova          m4, [midq+384*2]
-    mova          m5, m4
-    mova          m6, m4
-    jmp .loop
-.load_twolines:
-    ; h == 3 case
-    mova          m4, [midq+384*2]
-    mova          m5, [midq+384*4]
-    mova          m6, m5
-    jmp .loop
-.load_threelines:
-    ; h > 3 case
-    mova          m4, [midq+384*2]
-    mova          m5, [midq+384*4]
-    ; third line loaded in main loop below
-
-    ; main y loop for vertical filter
-.loop_load:
-    ; load one line into m6. if that pixel is no longer available, do
-    ; nothing, since m6 still has the data from the previous line in it. We
-    ; try to structure the loop so that the common case is evaluated fastest
-    mova          m6, [mptrq+384*6]
-.loop:
+    mova  [t0+xq*2+16], m1
+    punpcklwd       m1, m2, m3
+    pmaddwd         m1, m7
+    punpckhwd       m2, m3
+    pmaddwd         m2, m7
+    punpcklwd       m3, m4, m4
+    pmaddwd         m3, m6
+    punpckhwd       m4, m4
+    pmaddwd         m4, m6
+    paddd           m1, m3
+    paddd           m2, m4
+    psrad           m1, 11
+    psrad           m2, 11
+    packssdw        m1, m2
+    packuswb        m0, m1
+    mova     [dstq+xq], m0
+    add             xq, 16
+    jl .hv_loop
+    add           dstq, dst_strideq
+    mov             t4, t3
+    mov             t3, t2
+    mov             t2, t1
+    mov             t1, t0
+    mov             t0, t4
+    movifnidn    dstmp, dstq
+    ret
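
The vertical half of .hv above is then a plain 5-tap dot product over the biased intermediates: the four rows held in t1-t4 plus the row just computed are weighted with y1/y2/y3 via pmaddwd, shifted down by 11 and packed to bytes; the rounding offset comes out of the pw_2056/pw_m16380 biases applied in the horizontal pass rather than from an explicit add here. A matching scalar sketch, with illustrative names:

    #include <stdint.h>

    /* Approximate scalar model of the vertical combine for one pixel of
     * output row r.  h[0..4] are the intermediate rows r-2..r+2 produced
     * by %%h5; y1 = filter_v[1], y2 = filter_v[2], y3 = filter_v[3]. */
    static uint8_t wiener5_v_px(const int16_t *h[5], int x,
                                int y1, int y2, int y3)
    {
        const int sum = y1 * (h[0][x] + h[4][x]) +  /* rows r-2, r+2 */
                        y2 * (h[1][x] + h[3][x]) +  /* rows r-1, r+1 */
                        y3 *  h[2][x];              /* row  r        */
        const int v = sum >> 11;                    /* psrad 11      */
        return v < 0 ? 0 : v > 255 ? 255 : v;       /* packssdw + packuswb */
    }
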
+%if cpuflag(ssse3)
+.v:
+    mov             xq, wq
+.v_loop:
+    mova            m3, [t1+xq*2]
+    paddw           m1, m3, [t3+xq*2]
 %if ARCH_X86_64
-    paddw         m7, m0, m6
-    paddw         m8, m1, m5
-    paddw         m9, m2, m4
-    punpcklwd    m10, m7, m8
-    punpckhwd     m7, m8
-    punpcklwd    m11, m9, m3
-    punpckhwd     m9, m3
-    pmaddwd      m10, m15
-    pmaddwd       m7, m15
-    pmaddwd      m11, m14
-    pmaddwd       m9, m14
-    paddd        m10, m12
-    paddd         m7, m12
-    paddd        m10, m11
-    paddd         m7, m9
-    psrad        m10, 11
-    psrad         m7, 11
-    packssdw     m10, m7
-    packuswb     m10, m10
-    movq   [dstptrq], m10
+    mova            m2, [t2+xq*2]
+    paddw           m3, [t4+xq*2]
 %else
-    mova  [esp+0x30], m1
-    mova  [esp+0x20], m2
-    mova  [esp+0x10], m3
-    paddw         m0, m6
-    paddw         m1, m5
-    paddw         m2, m4
-    punpcklwd     m7, m2, m3
-    punpckhwd     m2, m3
-    punpcklwd     m3, m0, m1
-    punpckhwd     m0, m1
-    mova          m1, [esp+0x50]
-    pmaddwd       m3, m1
-    pmaddwd       m0, m1
-    mova          m1, [esp+0x40]
-    pmaddwd       m7, m1
-    pmaddwd       m2, m1
-    paddd         m3, [PIC_sym(pd_1024)]
-    paddd         m0, [PIC_sym(pd_1024)]
-    paddd         m3, m7
-    paddd         m0, m2
-    psrad         m3, 11
-    psrad         m0, 11
-    packssdw      m3, m0
-    packuswb      m3, m3
-    movq      [dstq], m3
-    mova          m1, [esp+0x30]
-    mova          m2, [esp+0x20]
-    mova          m3, [esp+0x10]
+    mov             r2, t2
+    mova            m2, [r2+xq*2]
+    mov             r2, t4
+    paddw           m3, [r2+xq*2]
 %endif
-    ; shift pixels one position
-    mova          m0, m1
-    mova          m1, m2
-    mova          m2, m3
-    mova          m3, m4
-    mova          m4, m5
-    mova          m5, m6
-    add        mptrq, 384*2
-    add      dstptrq, strideq
-    dec           yd
-    jg .loop_load
-    ; for the bottom pixels, continue using m6 (as extended edge)
-    cmp           yd, ylimd
-    jg .loop
-
-%if ARCH_X86_32
-    mov         midq, [esp+8]
-    mov         dstq, [esp+4]
+    punpcklwd       m0, m1, m2
+    pmaddwd         m0, m7
+    punpckhwd       m1, m2
+    pmaddwd         m1, m7
+    punpcklwd       m2, m3
+    pmaddwd         m2, m6
+    punpckhwd       m3, m3
+    pmaddwd         m3, m6
+    paddd           m0, m2
+    paddd           m1, m3
+    mova            m4, [t1+xq*2+16]
+    paddw           m2, m4, [t3+xq*2+16]
+%if ARCH_X86_64
+    mova            m3, [t2+xq*2+16]
+    paddw           m4, [t4+xq*2+16]
+%else
+    paddw           m4, [r2+xq*2+16]
+    mov             r2, t2
+    mova            m3, [r2+xq*2+16]
+    mov           dstq, dstmp
 %endif
-    add         midq, 16
-    add         dstq, 8
-    sub           wd, 8
-    jg .loop_x
-    RET
+    psrad           m0, 11
+    psrad           m1, 11
+    packssdw        m0, m1
+    punpcklwd       m1, m2, m3
+    pmaddwd         m1, m7
+    punpckhwd       m2, m3
+    pmaddwd         m2, m7
+    punpcklwd       m3, m4
+    pmaddwd         m3, m6
+    punpckhwd       m4, m4
+    pmaddwd         m4, m6
+    paddd           m1, m3
+    paddd           m2, m4
+    psrad           m1, 11
+    psrad           m2, 11
+    packssdw        m1, m2
+    packuswb        m0, m1
+    mova     [dstq+xq], m0
+    add             xq, 16
+    jl .v_loop
+    ret
+%endif
 %endmacro
 
 INIT_XMM sse2
-WIENER_H
-WIENER_V
+WIENER
 
 INIT_XMM ssse3
-WIENER_H
-WIENER_V
+WIENER
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;      self-guided     ;;
@@ -698,7 +1195,7 @@
     neg           xq
     mov           wq, xq
 %if ARCH_X86_64
-    lea          r10, [pb_right_ext_mask+16]
+    lea          r10, [pb_right_ext_mask+24]
 %endif
 .loop_y:
     mov           xq, wq
@@ -734,7 +1231,7 @@
 %if ARCH_X86_64
     movu          m4, [r10+xq*2]
 %else
-    movu          m4, [PIC_sym(pb_right_ext_mask+16)+xd*2]
+    movu          m4, [PIC_sym(pb_right_ext_mask)+xd*2+24]
 %endif
     pand          m2, m4
     pandn         m4, m3
@@ -1132,7 +1629,7 @@
     psubw         m1, m4                            ; aa
     movq          m0, [srcq]
     XCHG_PIC_REG
-    punpcklbw     m0, [PIC_sym(pb_right_ext_mask)+16]
+    punpcklbw     m0, [PIC_sym(pb_0)]
     punpcklwd     m4, m1, [PIC_sym(pw_16)]
     punpckhwd     m1, [PIC_sym(pw_16)]
     punpcklwd     m2, m0, [PIC_sym(pw_16)]
@@ -1266,7 +1763,7 @@
     lea       sumsqq, [sumsqq+wq*4-4]
     neg           wq
 %if ARCH_X86_64
-    lea          r10, [pb_right_ext_mask+16]
+    lea          r10, [pb_right_ext_mask+24]
 %else
     mov           wm, xd
  %define wq wm
@@ -1313,7 +1810,7 @@
 %if ARCH_X86_64
     movu          m4, [r10+xq*2]
 %else
-    movu          m4, [PIC_sym(pb_right_ext_mask+16)+xd*2]
+    movu          m4, [PIC_sym(pb_right_ext_mask)+xd*2+24]
     XCHG_PIC_REG
 %endif
     pand          m2, m4
@@ -1880,6 +2377,7 @@
 %endif
     RET
 
+%undef t2
 cglobal sgr_weighted2, 4, 7, 12, dst, stride, t1, t2, w, h, wt
     movifnidn     wd, wm
     movd          m0, wtm
--- a/tests/checkasm/looprestoration.c
+++ b/tests/checkasm/looprestoration.c
@@ -27,6 +27,7 @@
 
 #include "tests/checkasm/checkasm.h"
 
+#include <stdio.h>
 #include <string.h>
 
 #include "src/levels.h"
@@ -33,6 +34,10 @@
 #include "src/looprestoration.h"
 #include "src/tables.h"
 
+static int to_binary(int x) { /* 0-15 -> 0000-1111 */
+    return (x & 1) + 5 * (x & 2) + 25 * (x & 4) + 125 * (x & 8);
+}
+
 static void init_tmp(pixel *buf, const ptrdiff_t stride,
                      const int w, const int h, const int bitdepth_max)
 {
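
The to_binary() helper added in the hunk above exists only for the failure message printed further down: it maps the 4-bit edges mask to a decimal number whose digits spell out the binary pattern, so the %04d format shows the edge flags directly. A quick self-contained illustration (the mask value is arbitrary):

    #include <stdio.h>

    /* Same mapping as the to_binary() helper above. */
    static int to_binary(int x) {
        return (x & 1) + 5 * (x & 2) + 25 * (x & 4) + 125 * (x & 8);
    }

    int main(void) {
        /* edges = LR_HAVE_BOTTOM | LR_HAVE_LEFT = 8 | 1 = 9:
         * (9 & 1) + 125 * (9 & 8) = 1 + 1000 = 1001 */
        printf("edges = %04d\n", to_binary(9)); /* prints "edges = 1001" */
        return 0;
    }
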
@@ -56,11 +61,9 @@
                  int w, int h, const int16_t filter[2][8],
                  enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX);
 
-    for (int pl = 0; pl < 2; pl++) {
-        if (check_func(c->wiener, "wiener_%s_%dbpc",
-                       pl ? "chroma" : "luma", bpc))
-        {
-            filter[0][0] = filter[0][6] = pl ? 0 : (rnd() & 15) - 5;
+    for (int t = 0; t < 2; t++) {
+        if (check_func(c->wiener[t], "wiener_%dtap_%dbpc", t ? 5 : 7, bpc)) {
+            filter[0][0] = filter[0][6] = t ? 0 : (rnd() & 15) - 5;
             filter[0][1] = filter[0][5] = (rnd() & 31) - 23;
             filter[0][2] = filter[0][4] = (rnd() & 63) - 17;
             filter[0][3] = -(filter[0][0] + filter[0][1] + filter[0][2]) * 2;
@@ -68,7 +71,7 @@
             filter[0][3] += 128;
 #endif
 
-            filter[1][0] = filter[1][6] = pl ? 0 : (rnd() & 15) - 5;
+            filter[1][0] = filter[1][6] = t ? 0 : (rnd() & 15) - 5;
             filter[1][1] = filter[1][5] = (rnd() & 31) - 23;
             filter[1][2] = filter[1][4] = (rnd() & 63) - 17;
             filter[1][3] = 128 - (filter[1][0] + filter[1][1] + filter[1][2]) * 2;
@@ -93,9 +96,14 @@
                 call_new(a_dst + 32, 448 * sizeof(pixel), left,
                          h_edge + 32, 448 * sizeof(pixel),
                          w, h, filter, edges HIGHBD_TAIL_SUFFIX);
-                checkasm_check_pixel(c_dst + 32, 448 * sizeof(pixel),
-                                     a_dst + 32, 448 * sizeof(pixel),
-                                     w, h, "dst");
+                if (checkasm_check_pixel(c_dst + 32, 448 * sizeof(pixel),
+                                         a_dst + 32, 448 * sizeof(pixel),
+                                         w, h, "dst"))
+                {
+                    fprintf(stderr, "size = %dx%d, edges = %04d\n",
+                            w, h, to_binary(edges));
+                    break;
+                }
             }
             bench_new(a_dst + 32, 448 * sizeof(pixel), left,
                       h_edge + 32, 448 * sizeof(pixel),