ref: 78d27b7d1c923f632bc266470436e7f46a940d70
parent: 3497c4c905f8c85d8c65b28c352ff85dfddd66ed
author: Henrik Gramner <[email protected]>
date: Wed Dec 2 09:10:57 EST 2020
x86: Rewrite wiener SSE2/SSSE3/AVX2 asm

The previous implementation did two separate passes in the horizontal and vertical directions, with the intermediate values being stored in a buffer on the stack. This caused bad cache thrashing.

By interleaving the horizontal and vertical passes, in combination with a ring buffer that stores only a few rows at a time, the performance is improved by a significant amount.

Also split the function into 7-tap and 5-tap versions. The latter is faster and fairly common (always for chroma, sometimes for luma).
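For readers of this patch, the following is a minimal C sketch of the interleaving/ring-buffer idea described above. It is not the actual implementation (that is the hand-written asm in the hunks below); the names (filter_row_h, wiener7_ringbuf), the clamp-based edge handling, the simplified rounding and the w <= 384 assumption are illustration-only, and the real code additionally implements the LR_HAVE_* edge flags and the bias constants (pw_2056, pw_m16380) needed to keep intermediates within 16-bit SIMD range.

    #include <stddef.h>
    #include <stdint.h>

    #define MAX_W 384 /* matches the 384-pixel row stride of the asm ring buffer */

    /* Horizontally filter one row into 16-bit intermediates (>>3 as in the asm,
     * rounding/offset simplified, edges extended by clamping). */
    static void filter_row_h(int16_t *dst, const uint8_t *src, int w,
                             const int16_t fh[8])
    {
        for (int x = 0; x < w; x++) {
            int sum = 0;
            for (int k = 0; k < 7; k++) {
                int xx = x + k - 3;
                if (xx < 0) xx = 0;          /* simplistic edge extension */
                if (xx > w - 1) xx = w - 1;
                sum += fh[k] * src[xx];
            }
            dst[x] = (int16_t)(sum >> 3);
        }
    }

    /* 7-tap Wiener filter with interleaved passes: only 7 horizontally
     * filtered rows are kept resident in a ring buffer, and each iteration
     * produces one output row from them before overwriting the oldest slot. */
    static void wiener7_ringbuf(uint8_t *dst, ptrdiff_t dst_stride,
                                const uint8_t *src, ptrdiff_t src_stride,
                                int w, int h,
                                const int16_t fh[8], const int16_t fv[8])
    {
        int16_t ring[7][MAX_W]; /* slot for input row r is (r + 3) % 7 */

        /* Prime the buffer with rows -3..3, clamped at the picture edges. */
        for (int i = 0; i < 7; i++) {
            int y = i - 3;
            if (y < 0) y = 0;
            if (y > h - 1) y = h - 1;
            filter_row_h(ring[i], &src[y * src_stride], w, fh);
        }
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < w; x++) {
                int sum = 0;
                for (int k = 0; k < 7; k++) /* rows y-3..y+3 live in slots (y+k)%7 */
                    sum += fv[k] * ring[(y + k) % 7][x];
                sum = (sum + 1024) >> 11;   /* total gain 2^14: >>3 above, >>11 here */
                dst[y * dst_stride + x] =
                    (uint8_t)(sum < 0 ? 0 : sum > 255 ? 255 : sum);
            }
            /* Replace the oldest row (y - 3) with the next one needed (y + 4). */
            int next = y + 4 > h - 1 ? h - 1 : y + 4;
            filter_row_h(ring[y % 7], &src[next * src_stride], w, fh);
        }
    }

Compare this with the removed C wrapper in src/x86/looprestoration_init_tmpl.c below, which allocated a 68 * 384 int16_t scratch buffer (~51 KiB) on the stack and ran dav1d_wiener_filter_h/_v over it as two full passes; the ring buffer keeps only 7 (or 5) rows of at most 384 int16_t values live at a time, which is what avoids the cache thrashing described above.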
--- a/src/arm/looprestoration_init_tmpl.c
+++ b/src/arm/looprestoration_init_tmpl.c
@@ -288,7 +288,7 @@
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
- c->wiener = wiener_filter_neon;
+ c->wiener[0] = c->wiener[1] = wiener_filter_neon;
if (bpc <= 10)
c->selfguided = sgr_filter_neon;
}
--- a/src/looprestoration.h
+++ b/src/looprestoration.h
@@ -67,7 +67,7 @@
typedef decl_selfguided_filter_fn(*selfguided_fn);
typedef struct Dav1dLoopRestorationDSPContext {
- wienerfilter_fn wiener;
+ wienerfilter_fn wiener[2]; /* 7-tap, 5-tap */
selfguided_fn selfguided;
} Dav1dLoopRestorationDSPContext;
--- a/src/looprestoration_tmpl.c
+++ b/src/looprestoration_tmpl.c
@@ -509,7 +509,7 @@
}
COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c, int bpc) {
- c->wiener = wiener_c;
+ c->wiener[0] = c->wiener[1] = wiener_c;
c->selfguided = selfguided_c;
#if HAVE_ASM
--- a/src/lr_apply_tmpl.c
+++ b/src/lr_apply_tmpl.c
@@ -163,6 +163,7 @@
int stripe_h = imin((64 - 8 * !y) >> ss_ver, row_h - y);
ALIGN_STK_16(int16_t, filter, 2, [8]);
+ wienerfilter_fn wiener_fn = NULL;
if (lr->type == DAV1D_RESTORATION_WIENER) {
filter[0][0] = filter[0][6] = lr->filter_h[0];
filter[0][1] = filter[0][5] = lr->filter_h[1];
@@ -178,6 +179,8 @@
filter[1][1] = filter[1][5] = lr->filter_v[1];
filter[1][2] = filter[1][4] = lr->filter_v[2];
filter[1][3] = 128 - (filter[1][0] + filter[1][1] + filter[1][2]) * 2;
+
+ wiener_fn = dsp->lr.wiener[!(filter[0][0] | filter[1][0])];
} else {
assert(lr->type == DAV1D_RESTORATION_SGRPROJ);
}
@@ -185,9 +188,9 @@
while (y + stripe_h <= row_h) {
// Change HAVE_BOTTOM bit in edges to (y + stripe_h != row_h)
edges ^= (-(y + stripe_h != row_h) ^ edges) & LR_HAVE_BOTTOM;
- if (lr->type == DAV1D_RESTORATION_WIENER) {
- dsp->lr.wiener(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
- filter, edges HIGHBD_CALL_SUFFIX);
+ if (wiener_fn) {
+ wiener_fn(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
+ filter, edges HIGHBD_CALL_SUFFIX);
} else {
dsp->lr.selfguided(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
lr->sgr_idx, lr->sgr_weights, edges HIGHBD_CALL_SUFFIX);
--- a/src/ppc/looprestoration_init_tmpl.c
+++ b/src/ppc/looprestoration_init_tmpl.c
@@ -332,7 +332,7 @@
if (!(flags & DAV1D_PPC_CPU_FLAG_VSX)) return;
#if BITDEPTH == 8
- c->wiener = wiener_filter_vsx;
+ c->wiener[0] = c->wiener[1] = wiener_filter_vsx;
#endif
}
--- a/src/x86/looprestoration.asm
+++ b/src/x86/looprestoration.asm
@@ -29,20 +29,25 @@
%if ARCH_X86_64
SECTION_RODATA 32
+
+wiener_shufA: db 1, 7, 2, 8, 3, 9, 4, 10, 5, 11, 6, 12, 7, 13, 8, 14
+wiener_shufB: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
+wiener_shufC: db 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13, 12
+wiener_shufD: db 4, -1, 5, -1, 6, -1, 7, -1, 8, -1, 9, -1, 10, -1, 11, -1
+wiener_l_shuf: db 4, 4, 4, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+pb_0to31: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
pb_right_ext_mask: times 32 db 0xff
times 32 db 0
-pb_14x0_1_2: times 14 db 0
- db 1, 2
-pb_0_to_15_min_n: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13
- db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14
-pb_15: times 16 db 15
-pw_16: times 2 dw 16
-pw_256: times 2 dw 256
-pw_2048: times 2 dw 2048
-pw_16380: times 2 dw 16380
-pw_5_6: dw 5, 6
-pd_6: dd 6
-pd_1024: dd 1024
+
+pb_3: times 4 db 3
+pb_m5: times 4 db -5
+pw_16: times 2 dw 16
+pw_256: times 2 dw 256
+pw_2056: times 2 dw 2056
+pw_m16380: times 2 dw -16380
+pw_5_6: dw 5, 6
+pd_1024: dd 1024
pd_0xf0080029: dd 0xf0080029
pd_0xf00801c7: dd 0xf00801c7
@@ -50,277 +55,662 @@
SECTION .text
-INIT_YMM avx2
-cglobal wiener_filter_h, 5, 12, 16, dst, left, src, stride, flt, w, h, edge
- mov edged, edgem
- vpbroadcastb m15, [fltq+0]
- movifnidn wd, wm
- vpbroadcastb m14, [fltq+2]
- mov hd, hm
- vpbroadcastb m13, [fltq+4]
- vpbroadcastw m12, [fltq+6]
- vpbroadcastd m11, [pw_2048]
- vpbroadcastd m10, [pw_16380]
- lea r11, [pb_right_ext_mask]
+%macro REPX 2-*
+ %xdefine %%f(x) %1
+%rep %0 - 1
+ %rotate 1
+ %%f(%1)
+%endrep
+%endmacro
- DEFINE_ARGS dst, left, src, stride, x, w, h, edge, srcptr, dstptr, xlim
+DECLARE_REG_TMP 4, 9, 7, 11, 12, 13, 14 ; wiener ring buffer pointers
- ; if (edge & has_right) align_w_to_32
- ; else w -= 32, and use that as limit in x loop
- test edgeb, 2 ; has_right
- jnz .align
- mov xlimq, -3
- jmp .loop
-.align:
- add wd, 31
- and wd, ~31
- xor xlimd, xlimd
-
- ; main y loop for vertical filter
-.loop:
- mov srcptrq, srcq
- mov dstptrq, dstq
- lea xq, [wq+xlimq]
-
- ; load left edge pixels
- test edgeb, 1 ; have_left
- jz .emu_left
- test leftq, leftq ; left == NULL for the edge-extended bottom/top
- jz .load_left_combined
- movd xm0, [leftq]
- add leftq, 4
- pinsrd xm0, [srcq], 1
- pslldq xm0, 9
- jmp .left_load_done
-.load_left_combined:
- movq xm0, [srcq-3]
- pslldq xm0, 10
- jmp .left_load_done
-.emu_left:
- movd xm0, [srcq]
- pshufb xm0, [pb_14x0_1_2]
-
- ; load right edge pixels
-.left_load_done:
- cmp xd, 32
- jg .main_load
- test xd, xd
- jg .load_and_splat
- je .splat_right
-
- ; for very small images (w=[1-2]), edge-extend the original cache,
- ; ugly, but only runs in very odd cases
- add wd, wd
- pshufb xm0, [r11-pb_right_ext_mask+pb_0_to_15_min_n+wq*8-16]
- shr wd, 1
-
- ; main x loop, mostly this starts in .main_load
-.splat_right:
- ; no need to load new pixels, just extend them from the (possibly previously
- ; extended) previous load into m0
- pshufb xm1, xm0, [pb_15]
- jmp .main_loop
-.load_and_splat:
- ; load new pixels and extend edge for right-most
- movu m1, [srcptrq+3]
- sub r11, xq
- movu m2, [r11-pb_right_ext_mask+pb_right_ext_mask+32]
- add r11, xq
- vpbroadcastb m3, [srcptrq+2+xq]
- pand m1, m2
- pandn m3, m2, m3
- por m1, m3
- jmp .main_loop
-.main_load:
- ; load subsequent line
- movu m1, [srcptrq+3]
+INIT_YMM avx2
+cglobal wiener_filter7, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
+ lpf_stride, w, edge, flt, h
+ mov fltq, fltmp
+ mov edged, r8m
+ mov wd, wm
+ mov hd, r6m
+ vbroadcasti128 m6, [wiener_shufA]
+ vpbroadcastb m11, [fltq+ 0] ; x0 x0
+ vbroadcasti128 m7, [wiener_shufB]
+ vpbroadcastd m12, [fltq+ 2]
+ vbroadcasti128 m8, [wiener_shufC]
+ packsswb m12, m12 ; x1 x2
+ vpbroadcastw m13, [fltq+ 6] ; x3
+ vbroadcasti128 m9, [wiener_shufD]
+ add lpfq, wq
+ vpbroadcastd m10, [pw_m16380]
+ lea t1, [rsp+wq*2+16]
+ vpbroadcastd m14, [fltq+16] ; y0 y1
+ add dstq, wq
+ vpbroadcastd m15, [fltq+20] ; y2 y3
+ neg wq
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, lpf_strideq
+ mov t6, t1
+ mov t5, t1
+ add t1, 384*2
+ call .h_top
+ lea r7, [lpfq+lpf_strideq*4]
+ mov lpfq, dstq
+ mov t4, t1
+ add t1, 384*2
+ mov [rsp+8*1], lpf_strideq
+ add r7, lpf_strideq
+ mov [rsp+8*0], r7 ; below
+ call .h
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, dst_strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, dst_strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+.main:
+ lea t0, [t1+384*2]
.main_loop:
- vinserti128 m0, xm1, 1
-
- palignr m2, m1, m0, 10
- palignr m3, m1, m0, 11
- palignr m4, m1, m0, 12
- palignr m5, m1, m0, 13
- palignr m6, m1, m0, 14
- palignr m7, m1, m0, 15
-
- punpcklbw m0, m2, m1
- punpckhbw m2, m1
- punpcklbw m8, m3, m7
- punpckhbw m3, m7
- punpcklbw m7, m4, m6
- punpckhbw m4, m6
- pxor m9, m9
- punpcklbw m6, m5, m9
- punpckhbw m5, m9
-
- pmaddubsw m0, m15
- pmaddubsw m2, m15
- pmaddubsw m8, m14
- pmaddubsw m3, m14
- pmaddubsw m7, m13
- pmaddubsw m4, m13
- paddw m0, m8
- paddw m2, m3
- psllw m8, m6, 7
- psllw m3, m5, 7
- psubw m8, m10
- psubw m3, m10
- pmullw m6, m12
- pmullw m5, m12
- paddw m0, m7
- paddw m2, m4
- paddw m0, m6
- paddw m2, m5
- ; for a signed overflow to happen we need filter and pixels as follow:
- ; filter => -5,-23,-17,90,-17,-23,-5
- ; pixels => 255,255,255,0,255,255,255 or 0,0,0,255,0,0,0
- ; m0 would fall in the range [-59A6;+59A6] = [A65A;59A6]
- ; m8 would fall in the range [-3FFC;+3F84] = [C004;3F84]
- ; 32-bit arithmetic m0+m8 = [-99A2;+992A] = [FFFF665E;992A]
- ; => signed 16-bit overflow occurs
- paddsw m0, m8 ; paddsw clips this range to [-8000;+7FFF]
- paddsw m2, m3
- psraw m0, 3 ; shift changes the range to [-1000;+FFF]
- psraw m2, 3
- paddw m0, m11 ; adding back 800 (removed in m8) changes the
- paddw m2, m11 ; range to [-800;+17FF] as defined in the spec
- mova [dstptrq], xm0 ; (note that adding another 800 would give us
- mova [dstptrq+16], xm2; the same range as in the C code => [0;1FFF])
- vextracti128 [dstptrq+32], m0, 1
- vextracti128 [dstptrq+48], m2, 1
- vextracti128 xm0, m1, 1
- add srcptrq, 32
- add dstptrq, 64
- sub xq, 32
- cmp xd, 32
- jg .main_load
- test xd, xd
- jg .load_and_splat
- cmp xd, xlimd
- jg .splat_right
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v3
+ mov lpfq, [rsp+8*0]
+ call .hv_bottom
+ add lpfq, [rsp+8*1]
+ call .hv_bottom
+.v1:
+ call .v
+ RET
+.no_top:
+ lea r7, [lpfq+lpf_strideq*4]
+ mov lpfq, dstq
+ mov [rsp+8*1], lpf_strideq
+ lea r7, [r7+lpf_strideq*2]
+ mov [rsp+8*0], r7
+ call .h
+ mov t6, t1
+ mov t5, t1
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, dst_strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, dst_strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v3
+ add t0, 384*8
+ call .hv
+ dec hd
+ jnz .main
+.v3:
+ call .v
+.v2:
+ call .v
+ jmp .v1
+.extend_right:
+ movd xm2, r10d
+ vpbroadcastd m0, [pb_3]
+ vpbroadcastd m1, [pb_m5]
+ vpbroadcastb m2, xm2
+ movu m3, [pb_0to31]
+ psubb m0, m2
+ psubb m1, m2
+ pminub m0, m3
+ pminub m1, m3
+ pshufb m4, m0
+ pshufb m5, m1
+ ret
+.h:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movd xm4, [leftq]
+ vpblendd m4, [lpfq+r10-4], 0xfe
+ add leftq, 4
+ jmp .h_main
+.h_extend_left:
+ vbroadcasti128 m5, [lpfq+r10] ; avoid accessing memory located
+ mova m4, [lpfq+r10] ; before the start of the buffer
+ palignr m4, m5, 12
+ pshufb m4, [wiener_l_shuf]
+ jmp .h_main
+.h_top:
+ mov r10, wq
+ movu m4, [lpfq+r10-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jnz .h_main
+ pshufb m4, [wiener_l_shuf]
+ jmp .h_main
+.h_loop:
+ movu m4, [lpfq+r10-4]
+.h_main:
+ movu m5, [lpfq+r10+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -34
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+ pshufb m0, m4, m6
+ pmaddubsw m0, m11
+ pshufb m1, m5, m6
+ pmaddubsw m1, m11
+ pshufb m2, m4, m7
+ pmaddubsw m2, m12
+ pshufb m3, m5, m7
+ pmaddubsw m3, m12
+ paddw m0, m2
+ pshufb m2, m4, m8
+ pmaddubsw m2, m12
+ paddw m1, m3
+ pshufb m3, m5, m8
+ pmaddubsw m3, m12
+ pshufb m4, m9
+ paddw m0, m2
+ pmullw m2, m4, m13
+ pshufb m5, m9
+ paddw m1, m3
+ pmullw m3, m5, m13
+ psllw m4, 7
+ psllw m5, 7
+ paddw m4, m10
+ paddw m5, m10
+ paddw m0, m2
+ vpbroadcastd m2, [pw_2056]
+ paddw m1, m3
+ paddsw m0, m4
+ paddsw m1, m5
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m2
+ paddw m1, m2
+ mova [t1+r10*2+ 0], m0
+ mova [t1+r10*2+32], m1
+ add r10, 32
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, dst_strideq
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movd xm4, [leftq]
+ vpblendd m4, [lpfq+r10-4], 0xfe
+ add leftq, 4
+ jmp .hv_main
+.hv_extend_left:
+ movu m4, [lpfq+r10-4]
+ pshufb m4, [wiener_l_shuf]
+ jmp .hv_main
+.hv_bottom:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m4, [lpfq+r10-4]
+.hv_main:
+ movu m5, [lpfq+r10+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -34
+ jl .hv_have_right
+ call .extend_right
+.hv_have_right:
+ pshufb m0, m4, m6
+ pmaddubsw m0, m11
+ pshufb m1, m5, m6
+ pmaddubsw m1, m11
+ pshufb m2, m4, m7
+ pmaddubsw m2, m12
+ pshufb m3, m5, m7
+ pmaddubsw m3, m12
+ paddw m0, m2
+ pshufb m2, m4, m8
+ pmaddubsw m2, m12
+ paddw m1, m3
+ pshufb m3, m5, m8
+ pmaddubsw m3, m12
+ pshufb m4, m9
+ paddw m0, m2
+ pmullw m2, m4, m13
+ pshufb m5, m9
+ paddw m1, m3
+ pmullw m3, m5, m13
+ psllw m4, 7
+ psllw m5, 7
+ paddw m4, m10
+ paddw m5, m10
+ paddw m0, m2
+ paddw m1, m3
+ mova m2, [t4+r10*2]
+ paddw m2, [t2+r10*2]
+ mova m3, [t3+r10*2]
+ paddsw m0, m4
+ vpbroadcastd m4, [pw_2056]
+ paddsw m1, m5
+ mova m5, [t5+r10*2]
+ paddw m5, [t1+r10*2]
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m4
+ paddw m1, m4
+ paddw m4, m0, [t6+r10*2]
+ mova [t0+r10*2], m0
+ punpcklwd m0, m2, m3
+ pmaddwd m0, m15
+ punpckhwd m2, m3
+ pmaddwd m2, m15
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m14
+ punpckhwd m4, m5
+ pmaddwd m4, m14
+ paddd m0, m3
+ paddd m4, m2
+ mova m2, [t4+r10*2+32]
+ paddw m2, [t2+r10*2+32]
+ mova m3, [t3+r10*2+32]
+ mova m5, [t5+r10*2+32]
+ paddw m5, [t1+r10*2+32]
+ psrad m0, 11
+ psrad m4, 11
+ packssdw m0, m4
+ paddw m4, m1, [t6+r10*2+32]
+ mova [t0+r10*2+32], m1
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m15
+ punpckhwd m2, m3
+ pmaddwd m2, m15
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m14
+ punpckhwd m4, m5
+ pmaddwd m4, m14
+ paddd m1, m3
+ paddd m2, m4
+ psrad m1, 11
+ psrad m2, 11
+ packssdw m1, m2
+ packuswb m0, m1
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .hv_loop
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t6
+ add dstq, dst_strideq
+ ret
+.v:
+ mov r10, wq
+.v_loop:
+ mova m2, [t4+r10*2+ 0]
+ paddw m2, [t2+r10*2+ 0]
+ mova m4, [t3+r10*2+ 0]
+ mova m6, [t1+r10*2+ 0]
+ paddw m8, m6, [t6+r10*2+ 0]
+ paddw m6, [t5+r10*2+ 0]
+ mova m3, [t4+r10*2+32]
+ paddw m3, [t2+r10*2+32]
+ mova m5, [t3+r10*2+32]
+ mova m7, [t1+r10*2+32]
+ paddw m9, m7, [t6+r10*2+32]
+ paddw m7, [t5+r10*2+32]
+ punpcklwd m0, m2, m4
+ pmaddwd m0, m15
+ punpckhwd m2, m4
+ pmaddwd m2, m15
+ punpcklwd m4, m8, m6
+ pmaddwd m4, m14
+ punpckhwd m6, m8, m6
+ pmaddwd m6, m14
+ punpcklwd m1, m3, m5
+ pmaddwd m1, m15
+ punpckhwd m3, m5
+ pmaddwd m3, m15
+ punpcklwd m5, m9, m7
+ pmaddwd m5, m14
+ punpckhwd m7, m9, m7
+ pmaddwd m7, m14
+ paddd m0, m4
+ paddd m2, m6
+ paddd m1, m5
+ paddd m3, m7
+ REPX {psrad x, 11}, m0, m2, m1, m3
+ packssdw m0, m2
+ packssdw m1, m3
+ packuswb m0, m1
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .v_loop
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ add dstq, dst_strideq
+ ret
- add srcq, strideq
- add dstq, 384*2
- dec hd
- jg .loop
+cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
+ lpf_stride, w, edge, flt, h
+ mov fltq, fltmp
+ mov edged, r8m
+ mov wd, wm
+ mov hd, r6m
+ vbroadcasti128 m6, [wiener_shufB]
+ vpbroadcastd m12, [fltq+ 2]
+ vbroadcasti128 m7, [wiener_shufC]
+ packsswb m12, m12 ; x1 x2
+ vpbroadcastw m13, [fltq+ 6] ; x3
+ vbroadcasti128 m8, [wiener_shufD]
+ add lpfq, wq
+ vpbroadcastd m9, [pw_m16380]
+ vpbroadcastd m10, [pw_2056]
+ lea t1, [rsp+wq*2+16]
+ mova m11, [wiener_l_shuf]
+ vpbroadcastd m14, [fltq+16] ; __ y1
+ add dstq, wq
+ vpbroadcastd m15, [fltq+20] ; y2 y3
+ neg wq
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, lpf_strideq
+ mov t4, t1
+ add t1, 384*2
+ call .h_top
+ lea r7, [lpfq+lpf_strideq*4]
+ mov lpfq, dstq
+ mov t3, t1
+ add t1, 384*2
+ mov [rsp+8*1], lpf_strideq
+ add r7, lpf_strideq
+ mov [rsp+8*0], r7 ; below
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, dst_strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v2
+.main:
+ mov t0, t4
+.main_loop:
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v2
+ mov lpfq, [rsp+8*0]
+ call .hv_bottom
+ add lpfq, [rsp+8*1]
+ call .hv_bottom
+.end:
RET
+.no_top:
+ lea r7, [lpfq+lpf_strideq*4]
+ mov lpfq, dstq
+ mov [rsp+8*1], lpf_strideq
+ lea r7, [r7+lpf_strideq*2]
+ mov [rsp+8*0], r7
+ call .h
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, dst_strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v2
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v2
+ add t0, 384*6
+ call .hv
+ dec hd
+ jnz .main
+.v2:
+ call .v
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ add dstq, dst_strideq
+.v1:
+ call .v
+ jmp .end
+.h:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movd xm4, [leftq]
+ vpblendd m4, [lpfq+r10-4], 0xfe
+ add leftq, 4
+ jmp .h_main
+.h_extend_left:
+ vbroadcasti128 m5, [lpfq+r10] ; avoid accessing memory located
+ mova m4, [lpfq+r10] ; before the start of the buffer
+ palignr m4, m5, 12
+ pshufb m4, m11
+ jmp .h_main
+.h_top:
+ mov r10, wq
+ movu m4, [lpfq+r10-4]
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jnz .h_main
+ pshufb m4, m11
+ jmp .h_main
+.h_loop:
+ movu m4, [lpfq+r10-4]
+.h_main:
+ movu m5, [lpfq+r10+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp r10d, -33
+ jl .h_have_right
+ call mangle(private_prefix %+ _wiener_filter7_avx2).extend_right
+.h_have_right:
+ pshufb m0, m4, m6
+ pmaddubsw m0, m12
+ pshufb m1, m5, m6
+ pmaddubsw m1, m12
+ pshufb m2, m4, m7
+ pmaddubsw m2, m12
+ pshufb m3, m5, m7
+ pmaddubsw m3, m12
+ pshufb m4, m8
+ paddw m0, m2
+ pmullw m2, m4, m13
+ pshufb m5, m8
+ paddw m1, m3
+ pmullw m3, m5, m13
+ psllw m4, 7
+ psllw m5, 7
+ paddw m4, m9
+ paddw m5, m9
+ paddw m0, m2
+ paddw m1, m3
+ paddsw m0, m4
+ paddsw m1, m5
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m10
+ paddw m1, m10
+ mova [t1+r10*2+ 0], m0
+ mova [t1+r10*2+32], m1
+ add r10, 32
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, dst_strideq
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movd xm4, [leftq]
+ vpblendd m4, [lpfq+r10-4], 0xfe
+ add leftq, 4
+ jmp .hv_main
+.hv_extend_left:
+ movu m4, [lpfq+r10-4]
+ pshufb m4, m11
+ jmp .hv_main
+.hv_bottom:
+ mov r10, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m4, [lpfq+r10-4]
+.hv_main:
+ movu m5, [lpfq+r10+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp r10d, -33
+ jl .hv_have_right
+ call mangle(private_prefix %+ _wiener_filter7_avx2).extend_right
+.hv_have_right:
+ pshufb m0, m4, m6
+ pmaddubsw m0, m12
+ pshufb m1, m5, m6
+ pmaddubsw m1, m12
+ pshufb m2, m4, m7
+ pmaddubsw m2, m12
+ pshufb m3, m5, m7
+ pmaddubsw m3, m12
+ pshufb m4, m8
+ paddw m0, m2
+ pmullw m2, m4, m13
+ pshufb m5, m8
+ paddw m1, m3
+ pmullw m3, m5, m13
+ psllw m4, 7
+ psllw m5, 7
+ paddw m4, m9
+ paddw m5, m9
+ paddw m0, m2
+ paddw m1, m3
+ mova m2, [t3+r10*2]
+ paddw m2, [t1+r10*2]
+ mova m3, [t2+r10*2]
+ paddsw m0, m4
+ paddsw m1, m5
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m10
+ paddw m1, m10
+ paddw m4, m0, [t4+r10*2]
+ mova [t0+r10*2], m0
+ punpcklwd m0, m2, m3
+ pmaddwd m0, m15
+ punpckhwd m2, m3
+ pmaddwd m2, m15
+ punpcklwd m3, m4, m4
+ pmaddwd m3, m14
+ punpckhwd m4, m4
+ pmaddwd m4, m14
+ paddd m0, m3
+ paddd m4, m2
+ mova m2, [t3+r10*2+32]
+ paddw m2, [t1+r10*2+32]
+ mova m3, [t2+r10*2+32]
+ psrad m0, 11
+ psrad m4, 11
+ packssdw m0, m4
+ paddw m4, m1, [t4+r10*2+32]
+ mova [t0+r10*2+32], m1
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m15
+ punpckhwd m2, m3
+ pmaddwd m2, m15
+ punpcklwd m3, m4, m4
+ pmaddwd m3, m14
+ punpckhwd m4, m4
+ pmaddwd m4, m14
+ paddd m1, m3
+ paddd m2, m4
+ psrad m1, 11
+ psrad m2, 11
+ packssdw m1, m2
+ packuswb m0, m1
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .hv_loop
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t4
+ add dstq, dst_strideq
+ ret
+.v:
+ mov r10, wq
+ psrld m13, m14, 16 ; y1 __
+.v_loop:
+ mova m6, [t1+r10*2+ 0]
+ paddw m2, m6, [t3+r10*2+ 0]
+ mova m4, [t2+r10*2+ 0]
+ mova m7, [t1+r10*2+32]
+ paddw m3, m7, [t3+r10*2+32]
+ mova m5, [t2+r10*2+32]
+ paddw m6, [t4+r10*2+ 0]
+ paddw m7, [t4+r10*2+32]
+ punpcklwd m0, m2, m4
+ pmaddwd m0, m15
+ punpckhwd m2, m4
+ pmaddwd m2, m15
+ punpcklwd m1, m3, m5
+ pmaddwd m1, m15
+ punpckhwd m3, m5
+ pmaddwd m3, m15
+ punpcklwd m5, m7, m6
+ pmaddwd m4, m5, m14
+ punpckhwd m7, m6
+ pmaddwd m6, m7, m14
+ pmaddwd m5, m13
+ pmaddwd m7, m13
+ paddd m0, m4
+ paddd m2, m6
+ paddd m1, m5
+ paddd m3, m7
+ REPX {psrad x, 11}, m0, m2, m1, m3
+ packssdw m0, m2
+ packssdw m1, m3
+ packuswb m0, m1
+ mova [dstq+r10], m0
+ add r10, 32
+ jl .v_loop
+ ret
-cglobal wiener_filter_v, 4, 10, 13, dst, stride, mid, w, h, flt, edge
- movifnidn fltq, fltmp
- mov edged, edgem
- movifnidn hd, hm
- vpbroadcastd m10, [fltq+16]
- vpbroadcastd m11, [fltq+20]
- vpbroadcastd m12, [pd_1024]
-
- DEFINE_ARGS dst, stride, mid, w, h, ylim, edge, y, mptr, dstptr
- rorx ylimd, edged, 2
- and ylimd, 2 ; have_bottom
- sub ylimd, 3
-
- ; main x loop for vertical filter, does one column of 16 pixels
-.loop_x:
- mova m3, [midq] ; middle line
-
- ; load top pixels
- test edgeb, 4 ; have_top
- jz .emu_top
- mova m0, [midq-384*4]
- mova m2, [midq-384*2]
- mova m1, m0
- jmp .load_bottom_pixels
-.emu_top:
- mova m0, m3
- mova m1, m3
- mova m2, m3
-
- ; load bottom pixels
-.load_bottom_pixels:
- mov yd, hd
- mov mptrq, midq
- mov dstptrq, dstq
- add yd, ylimd
- jg .load_threelines
-
- ; the remainder here is somewhat messy but only runs in very weird
- ; circumstances at the bottom of the image in very small blocks (h=[1-3]),
- ; so performance is not terribly important here...
- je .load_twolines
- cmp yd, -1
- je .load_oneline
- ; h == 1 case
- mova m5, m3
- mova m4, m3
- mova m6, m3
- jmp .loop
-.load_oneline:
- ; h == 2 case
- mova m4, [midq+384*2]
- mova m5, m4
- mova m6, m4
- jmp .loop
-.load_twolines:
- ; h == 3 case
- mova m4, [midq+384*2]
- mova m5, [midq+384*4]
- mova m6, m5
- jmp .loop
-.load_threelines:
- ; h > 3 case
- mova m4, [midq+384*2]
- mova m5, [midq+384*4]
- ; third line loaded in main loop below
-
- ; main y loop for vertical filter
-.loop_load:
- ; load one line into m6. if that pixel is no longer available, do
- ; nothing, since m6 still has the data from the previous line in it. We
- ; try to structure the loop so that the common case is evaluated fastest
- mova m6, [mptrq+384*6]
-.loop:
- paddw m0, m6
- paddw m7, m1, m5
- paddw m8, m2, m4
- punpcklwd m9, m0, m7
- punpckhwd m0, m7
- punpcklwd m7, m8, m3
- punpckhwd m8, m3
- pmaddwd m9, m10
- pmaddwd m0, m10
- pmaddwd m7, m11
- pmaddwd m8, m11
- add mptrq, 384*2
- paddd m7, m9
- paddd m0, m8
- paddd m7, m12
- paddd m0, m12
- psrad m7, 11
- psrad m0, 11
- packssdw m7, m0
- vextracti128 xm0, m7, 1
- packuswb xm7, xm0
- mova [dstptrq], xm7
- ; shift pixels one position
- mova m0, m1
- mova m1, m2
- mova m2, m3
- mova m3, m4
- mova m4, m5
- mova m5, m6
- add dstptrq, strideq
- dec yd
- jg .loop_load
- ; for the bottom pixels, continue using m6 (as extended edge)
- cmp yd, ylimd
- jg .loop
- add midq, 32
- add dstq, 16
- sub wd, 16
- jg .loop_x
- RET
-
-INIT_YMM avx2
cglobal sgr_box3_h, 5, 11, 7, sumsq, sum, left, src, stride, w, h, edge, x, xlim
mov xlimd, edgem
movifnidn wd, wm
--- a/src/x86/looprestoration_init_tmpl.c
+++ b/src/x86/looprestoration_init_tmpl.c
@@ -31,54 +31,19 @@
#include "common/intops.h"
#include "src/tables.h"
-// Future potential optimizations:
-// - special chroma versions which don't filter [0]/[6];
-// - running filter_h_avx2 transposed (one col of 32 pixels per iteration, top
-// to bottom) instead of scanline-ordered should be faster since then the
-// if (have_left) and similar conditions run only once instead of per line;
-// - filter_v_avx2 currently runs 16 pixels per iteration, it should be possible
-// to run 32 (like filter_h_avx2), and then all vpermqs can go;
-// - maybe split out the top/bottom filter_h_avx2 from the main body filter_h_avx2,
-// since then the have_left condition can be inlined;
-// - consider having the wrapper (wiener_filter_${ext}) also in hand-written
-// assembly, so the setup overhead is minimized.
-
#define WIENER_FILTER(ext) \
-\
-void dav1d_wiener_filter_h_##ext(int16_t *dst, const pixel (*left)[4], \
- const pixel *src, ptrdiff_t stride, \
- const int16_t filter[2][8], const intptr_t w, \
- int h, enum LrEdgeFlags edges); \
-void dav1d_wiener_filter_v_##ext(pixel *dst, ptrdiff_t stride, \
- const int16_t *mid, int w, int h, \
- const int16_t filter[2][8], \
- enum LrEdgeFlags edges); \
-\
-static void wiener_filter_##ext(pixel *const dst, const ptrdiff_t dst_stride, \
- const pixel (*const left)[4], \
- const pixel *lpf, const ptrdiff_t lpf_stride, \
- const int w, const int h, \
+void dav1d_wiener_filter7_##ext(pixel *const dst, ptrdiff_t dst_stride, \
+ const pixel (*left)[4], const pixel *lpf, \
+ ptrdiff_t lpf_stride, int w, int h, \
const int16_t filter[2][8], \
- const enum LrEdgeFlags edges) \
-{ \
- ALIGN_STK_32(int16_t, mid, 68 * 384,); \
-\
- /* horizontal filter */ \
- dav1d_wiener_filter_h_##ext(&mid[2 * 384], left, dst, dst_stride, \
- filter, w, h, edges); \
- if (edges & LR_HAVE_TOP) \
- dav1d_wiener_filter_h_##ext(mid, NULL, lpf, lpf_stride, \
- filter, w, 2, edges); \
- if (edges & LR_HAVE_BOTTOM) \
- dav1d_wiener_filter_h_##ext(&mid[(2 + h) * 384], NULL, \
- lpf + 6 * PXSTRIDE(lpf_stride), lpf_stride, \
- filter, w, 2, edges); \
-\
- dav1d_wiener_filter_v_##ext(dst, dst_stride, &mid[2*384], w, h, filter, edges); \
-}
+ enum LrEdgeFlags edges); \
+void dav1d_wiener_filter5_##ext(pixel *const dst, ptrdiff_t dst_stride, \
+ const pixel (*left)[4], const pixel *lpf, \
+ ptrdiff_t lpf_stride, int w, int h, \
+ const int16_t filter[2][8], \
+ enum LrEdgeFlags edges);
#define SGR_FILTER(ext) \
-\
void dav1d_sgr_box3_h_##ext(int32_t *sumsq, int16_t *sum, \
const pixel (*left)[4], \
const pixel *src, const ptrdiff_t stride, \
@@ -201,15 +166,13 @@
} \
}
-#define DEF_LR_FILTERS(ext) \
-WIENER_FILTER(ext) \
-SGR_FILTER(ext)
-
#if BITDEPTH == 8
WIENER_FILTER(sse2)
-DEF_LR_FILTERS(ssse3)
+WIENER_FILTER(ssse3)
+SGR_FILTER(ssse3)
# if ARCH_X86_64
-DEF_LR_FILTERS(avx2)
+WIENER_FILTER(avx2)
+SGR_FILTER(avx2)
# endif
#endif
@@ -218,18 +181,21 @@
if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
#if BITDEPTH == 8
- c->wiener = wiener_filter_sse2;
+ c->wiener[0] = dav1d_wiener_filter7_sse2;
+ c->wiener[1] = dav1d_wiener_filter5_sse2;
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
#if BITDEPTH == 8
- c->wiener = wiener_filter_ssse3;
+ c->wiener[0] = dav1d_wiener_filter7_ssse3;
+ c->wiener[1] = dav1d_wiener_filter5_ssse3;
c->selfguided = sgr_filter_ssse3;
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
#if BITDEPTH == 8 && ARCH_X86_64
- c->wiener = wiener_filter_avx2;
+ c->wiener[0] = dav1d_wiener_filter7_avx2;
+ c->wiener[1] = dav1d_wiener_filter5_avx2;
c->selfguided = sgr_filter_avx2;
#endif
}
--- a/src/x86/looprestoration_sse.asm
+++ b/src/x86/looprestoration_sse.asm
@@ -29,34 +29,33 @@
SECTION_RODATA 16
-pb_right_ext_mask: times 16 db 0xff
- times 16 db 0
-pb_14x0_1_2: times 14 db 0
- db 1, 2
-pb_0_to_15_min_n: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13
- db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14
-pb_unpcklwdw: db 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13
-pb_0: times 16 db 0
-pb_2: times 16 db 2
-pb_3: times 16 db 3
-pb_4: times 16 db 4
-pb_15: times 16 db 15
-pb_0_1: times 8 db 0, 1
-pb_6_7: times 8 db 6, 7
-pb_14_15: times 8 db 14, 15
-pw_1: times 8 dw 1
-pw_16: times 8 dw 16
-pw_128: times 8 dw 128
-pw_255: times 8 dw 255
-pw_256: times 8 dw 256
-pw_2048: times 8 dw 2048
-pw_16380: times 8 dw 16380
-pw_5_6: times 4 dw 5, 6
-pd_1024: times 4 dd 1024
+wiener_init: db 6, 7, 6, 7, 6, 7, 6, 7, 0, 0, 0, 0, 2, 4, 2, 4
+wiener_shufA: db 1, 7, 2, 8, 3, 9, 4, 10, 5, 11, 6, 12, 7, 13, 8, 14
+wiener_shufB: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
+wiener_shufC: db 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13, 12
+wiener_shufD: db 4, -1, 5, -1, 6, -1, 7, -1, 8, -1, 9, -1, 10, -1, 11, -1
+wiener_l_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+pb_unpcklwdw: db 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13
+
+pb_right_ext_mask: times 24 db 0xff
+ times 8 db 0
+pb_0: times 16 db 0
+pb_3: times 16 db 3
+pb_15: times 16 db 15
+pb_0_1: times 8 db 0, 1
+pb_14_15: times 8 db 14, 15
+pw_1: times 8 dw 1
+pw_16: times 8 dw 16
+pw_128: times 8 dw 128
+pw_256: times 8 dw 256
+pw_2048: times 8 dw 2048
+pw_2056: times 8 dw 2056
+pw_m16380: times 8 dw -16380
+pw_5_6: times 4 dw 5, 6
+pd_1024: times 4 dd 1024
%if ARCH_X86_32
-pd_256: times 4 dd 256
-pd_512: times 4 dd 512
-pd_2048: times 4 dd 2048
+pd_512: times 4 dd 512
+pd_2048: times 4 dd 2048
%endif
pd_0xF0080029: times 4 dd 0xF0080029
pd_0xF00801C7: times 4 dd 0XF00801C7
@@ -95,539 +94,1037 @@
%define PIC_sym(sym) (sym)
%endif
-%macro PALIGNR 4 ; dst, src1, src2, shift
- %if cpuflag(ssse3)
- palignr %1, %2, %3, %4
- %else
- %assign %%i regnumof%+%1 + 1
- %define %%tmp m %+ %%i
- psrldq %1, %3, %4
- pslldq %%tmp, %2, 16-%4
- por %1, %%tmp
- %endif
-%endmacro
-
-%macro PMADDUBSW 5 ; dst, src, zero, tmp, reset_zero
- %if cpuflag(ssse3)
- pmaddubsw %1, %2
- %else
- %if %5 == 1
- pxor %3, %3
- %endif
- punpckhbw %4, %1, %3
- punpcklbw %1, %3
- pmaddwd %4, %2
- pmaddwd %1, %2
- packssdw %1, %4
- %endif
-%endmacro
-
-;;;;;;;;;;;;;;;;;;;;;;
-;; wiener ;;
-;;;;;;;;;;;;;;;;;;;;;;
-
-%macro WIENER_H 0
+%macro WIENER 0
%if ARCH_X86_64
-cglobal wiener_filter_h, 5, 15, 16, dst, left, src, stride, flt, w, h, edge
- mov edged, edgem
- movifnidn wd, wm
- mov hd, hm
+DECLARE_REG_TMP 4, 10, 7, 11, 12, 13, 14 ; ring buffer pointers
+cglobal wiener_filter7, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \
+ lpf_stride, w, edge, flt, h, x
+ %define base 0
+ mov fltq, fltmp
+ mov edged, r8m
+ mov wd, wm
+ mov hd, r6m
+ movq m14, [fltq]
+ add lpfq, wq
+ lea t1, [rsp+wq*2+16]
+ mova m15, [pw_2056]
+ add dstq, wq
+ movq m7, [fltq+16]
+ neg wq
+%if cpuflag(ssse3)
+ pshufb m14, [wiener_init]
+ mova m8, [wiener_shufA]
+ pshufd m12, m14, q2222 ; x0 x0
+ mova m9, [wiener_shufB]
+ pshufd m13, m14, q3333 ; x1 x2
+ mova m10, [wiener_shufC]
+ punpcklqdq m14, m14 ; x3
+ mova m11, [wiener_shufD]
%else
-cglobal wiener_filter_h, 5, 7, 8, -84, dst, left, src, stride, flt, w, h, edge
- mov r5, edgem
- mov [esp+12], r5
- mov wd, wm
- mov hd, hm
- SETUP_PIC hd
- %define m15 m0
- %define m14 m1
- %define m13 m2
- %define m12 m3
+ mova m10, [pw_m16380]
+ punpcklwd m14, m14
+ pshufd m11, m14, q0000 ; x0
+ pshufd m12, m14, q1111 ; x1
+ pshufd m13, m14, q2222 ; x2
+ pshufd m14, m14, q3333 ; x3
%endif
-
- movq m15, [fltq]
+%else
+DECLARE_REG_TMP 4, 0, _, 5
%if cpuflag(ssse3)
- pshufb m12, m15, [PIC_sym(pb_6_7)]
- pshufb m13, m15, [PIC_sym(pb_4)]
- pshufb m14, m15, [PIC_sym(pb_2)]
- pshufb m15, m15, [PIC_sym(pb_0)]
+ %define m10 [base+wiener_shufC]
+ %define m11 [base+wiener_shufD]
+ %define stk_off 96
%else
- pshuflw m12, m15, q3333
- punpcklbw m15, m15
- pshufhw m13, m15, q0000
- pshuflw m14, m15, q2222
- pshuflw m15, m15, q0000
- punpcklqdq m12, m12
- punpckhqdq m13, m13
- punpcklqdq m14, m14
- punpcklqdq m15, m15
- psraw m13, 8
- psraw m14, 8
- psraw m15, 8
+ %define m10 [base+pw_m16380]
+ %define m11 [stk+96]
+ %define stk_off 112
%endif
-
-%if ARCH_X86_64
- mova m11, [pw_2048]
- mova m10, [pw_16380]
- lea r11, [pb_right_ext_mask]
-
- DEFINE_ARGS dst, left, src, stride, x, w, h, edge, srcptr, dstptr, xlim
+cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride
+ %define base r6-pb_right_ext_mask-21
+ %define stk esp
+ %define dstq leftq
+ %define edgeb byte edged
+ %define edged [stk+ 8]
+ %define dstmp [stk+12]
+ %define hd dword [stk+16]
+ %define wq [stk+20]
+ %define dst_strideq [stk+24]
+ %define leftmp [stk+28]
+ %define t2 [stk+32]
+ %define t4 [stk+36]
+ %define t5 [stk+40]
+ %define t6 [stk+44]
+ %define m8 [base+wiener_shufA]
+ %define m9 [base+wiener_shufB]
+ %define m12 [stk+48]
+ %define m13 [stk+64]
+ %define m14 [stk+80]
+ %define m15 [base+pw_2056]
+ mov r1, r7m ; flt
+ mov r0, r0m ; dst
+ mov r5, r5m ; w
+ mov lpfq, lpfm
+ mov r2, r8m ; edge
+ mov r4, r6m ; h
+ movq m3, [r1+ 0]
+ movq m7, [r1+16]
+ add r0, r5
+ mov r1, r1m ; dst_stride
+ add lpfq, r5
+ mov edged, r2
+ mov r2, r2m ; left
+ mov dstmp, r0
+ lea t1, [rsp+r5*2+stk_off]
+ mov hd, r4
+ neg r5
+ mov lpf_strideq, lpf_stridem
+ LEA r6, pb_right_ext_mask+21
+ mov wq, r5
+ mov dst_strideq, r1
+ mov leftmp, r2
+%if cpuflag(ssse3)
+ pshufb m3, [base+wiener_init]
+ pshufd m1, m3, q2222
+ pshufd m2, m3, q3333
+ punpcklqdq m3, m3
%else
- %define m10 [PIC_sym(pw_16380)]
- %define m11 [PIC_sym(pw_2048)]
- %define m12 [esp+0x14]
- %define m13 [esp+0x24]
- %define m14 [esp+0x34]
- %define m15 [esp+0x44]
- mova m12, m3
- mova m13, m2
- mova m14, m1
- mova m15, m0
-
- DEFINE_ARGS dst, left, src, stride, x, w, h, edge
- %define srcptrq srcq
- %define dstptrq dstq
- %define hd dword [esp+ 0]
- %define edgeb byte [esp+12]
- %define xlimd dword [esp+16]
+ punpcklwd m3, m3
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ mova m11, m0
%endif
-
- ; if (edge & has_right) align_w_to_16
- ; else w -= 3, and use that as limit in x loop
- test edgeb, 2 ; has_right
- jnz .align
- mov xlimd, -3
- jmp .loop
-.align:
- add wd, 15
- and wd, ~15
+ mova m12, m1
+ mova m13, m2
+ mova m14, m3
+%endif
+ pshufd m6, m7, q0000 ; y0 y1
+ pshufd m7, m7, q1111 ; y2 y3
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, lpf_strideq
+ mov t6, t1
+ mov t5, t1
+ add t1, 384*2
+ call .h_top
+ lea t3, [lpfq+lpf_strideq*4]
+ mov lpfq, dstmp
+ mov [rsp+gprsize*1], lpf_strideq
+ add t3, lpf_strideq
+ mov [rsp+gprsize*0], t3 ; below
+ mov t4, t1
+ add t1, 384*2
+ call .h
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, dst_strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, dst_strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+.main:
+ lea t0, [t1+384*2]
+.main_loop:
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v3
+ mov lpfq, [rsp+gprsize*0]
+ call .hv_bottom
+ add lpfq, [rsp+gprsize*1]
+ call .hv_bottom
+.v1:
+ call mangle(private_prefix %+ _wiener_filter7_ssse3).v
+ RET
+.no_top:
+ lea t3, [lpfq+lpf_strideq*4]
+ mov lpfq, dstmp
+ mov [rsp+gprsize*1], lpf_strideq
+ lea t3, [t3+lpf_strideq*2]
+ mov [rsp+gprsize*0], t3
+ call .h
+ mov t6, t1
+ mov t5, t1
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, dst_strideq
+ add t1, 384*2
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v2
+ add lpfq, dst_strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v3
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v3
+ add t0, 384*8
+ call .hv
+ dec hd
+ jnz .main
+.v3:
+ call mangle(private_prefix %+ _wiener_filter7_ssse3).v
+.v2:
+ call mangle(private_prefix %+ _wiener_filter7_ssse3).v
+ jmp .v1
+.extend_right:
+ movd m2, [lpfq-4]
%if ARCH_X86_64
- xor xlimd, xlimd
+ push r0
+ lea r0, [pb_right_ext_mask+21]
+ movu m0, [r0+xq+0]
+ movu m1, [r0+xq+8]
+ pop r0
%else
- mov xlimd, 0
+ movu m0, [r6+xq+0]
+ movu m1, [r6+xq+8]
%endif
-
- ; main y loop for vertical filter
-.loop:
-%if ARCH_X86_64
- mov srcptrq, srcq
- mov dstptrq, dstq
- lea xd, [wq+xlimq]
+%if cpuflag(ssse3)
+ pshufb m2, [base+pb_3]
%else
- mov [esp+8], srcq
- mov [esp+4], dstq
- mov xd, xlimd
- add xd, wd
+ punpcklbw m2, m2
+ pshuflw m2, m2, q3333
+ punpcklqdq m2, m2
%endif
-
- ; load left edge pixels
- test edgeb, 1 ; have_left
- jz .emu_left
- test leftq, leftq ; left == NULL for the edge-extended bottom/top
- jz .load_left_combined
- movd m0, [leftq]
- movd m1, [srcq]
- punpckldq m0, m1
- pslldq m0, 9
- add leftq, 4
- jmp .left_load_done
-.load_left_combined:
- movq m0, [srcq-3]
- pslldq m0, 10
- jmp .left_load_done
-.emu_left:
- movd m0, [srcq]
+ pand m4, m0
+ pand m5, m1
+ pandn m0, m2
+ pandn m1, m2
+ por m4, m0
+ por m5, m1
+ ret
+.h:
+ %define stk esp+4 ; offset due to call
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movifnidn leftq, leftmp
+ mova m4, [lpfq+xq]
+ movd m5, [leftq]
+ add leftq, 4
+ pslldq m4, 4
+ por m4, m5
+ movifnidn leftmp, leftq
+ jmp .h_main
+.h_extend_left:
%if cpuflag(ssse3)
- pshufb m0, [PIC_sym(pb_14x0_1_2)]
+ mova m4, [lpfq+xq]
+ pshufb m4, [base+wiener_l_shuf]
%else
- pslldq m1, m0, 13
- punpcklbw m0, m0
- pshuflw m0, m0, q0000
- punpcklqdq m0, m0
- psrldq m0, 2
- por m0, m1
+ mova m5, [lpfq+xq]
+ pshufd m4, m5, q2103
+ punpcklbw m5, m5
+ punpcklwd m5, m5
+ movss m4, m5
%endif
-
- ; load right edge pixels
-.left_load_done:
- cmp xd, 16
- jg .main_load
- test xd, xd
- jg .load_and_splat
- je .splat_right
-
- ; for very small images (w=[1-2]), edge-extend the original cache,
- ; ugly, but only runs in very odd cases
+ jmp .h_main
+.h_top:
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m4, [lpfq+xq-4]
+.h_main:
+ movu m5, [lpfq+xq+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp xd, -18
+ jl .h_have_right
+ call .extend_right
+.h_have_right:
+%macro %%h7 0
%if cpuflag(ssse3)
- add wd, wd
- %if ARCH_X86_64
- pshufb m0, [r11-pb_right_ext_mask+pb_0_to_15_min_n+wq*8-16]
- %else
- pshufb m0, [PIC_sym(pb_0_to_15_min_n)+wq*8-16]
- %endif
- shr wd, 1
+ pshufb m0, m4, m8
+ pmaddubsw m0, m12
+ pshufb m1, m5, m8
+ pmaddubsw m1, m12
+ pshufb m2, m4, m9
+ pmaddubsw m2, m13
+ pshufb m3, m5, m9
+ pmaddubsw m3, m13
+ paddw m0, m2
+ pshufb m2, m4, m10
+ pmaddubsw m2, m13
+ paddw m1, m3
+ pshufb m3, m5, m10
+ pmaddubsw m3, m13
+ pshufb m4, m11
+ paddw m0, m2
+ pmullw m2, m14, m4
+ pshufb m5, m11
+ paddw m1, m3
+ pmullw m3, m14, m5
+ psllw m4, 7
+ psllw m5, 7
+ paddw m0, m2
+ mova m2, [base+pw_m16380]
+ paddw m1, m3
+ paddw m4, m2
+ paddw m5, m2
+ paddsw m0, m4
+ paddsw m1, m5
%else
- shl wd, 4
- pcmpeqd m2, m2
- movd m3, wd
- psrldq m2, 2
- punpckhbw m1, m0, m0
- pshufhw m1, m1, q1122
- psllq m1, m3
- pand m0, m2
- pandn m2, m1
- por m0, m2
- shr wd, 4
+ psrldq m0, m4, 1
+ pslldq m1, m4, 1
+ pxor m3, m3
+ punpcklbw m0, m3
+ punpckhbw m1, m3
+ paddw m0, m1
+ pmullw m0, m11
+ psrldq m1, m4, 2
+ pslldq m2, m4, 2
+ punpcklbw m1, m3
+ punpckhbw m2, m3
+ paddw m1, m2
+ pmullw m1, m12
+ paddw m0, m1
+ pshufd m2, m4, q0321
+ punpcklbw m2, m3
+ pmullw m1, m14, m2
+ paddw m0, m1
+ psrldq m1, m4, 3
+ pslldq m4, 3
+ punpcklbw m1, m3
+ punpckhbw m4, m3
+ paddw m1, m4
+ pmullw m1, m13
+ paddw m0, m1
+ psllw m2, 7
+ paddw m2, m10
+ paddsw m0, m2
+ psrldq m1, m5, 1
+ pslldq m2, m5, 1
+ punpcklbw m1, m3
+ punpckhbw m2, m3
+ paddw m1, m2
+ pmullw m1, m11
+ psrldq m2, m5, 2
+ pslldq m4, m5, 2
+ punpcklbw m2, m3
+ punpckhbw m4, m3
+ paddw m2, m4
+ pmullw m2, m12
+ paddw m1, m2
+ pshufd m4, m5, q0321
+ punpcklbw m4, m3
+ pmullw m2, m14, m4
+ paddw m1, m2
+ psrldq m2, m5, 3
+ pslldq m5, 3
+ punpcklbw m2, m3
+ punpckhbw m5, m3
+ paddw m2, m5
+ pmullw m2, m13
+ paddw m1, m2
+ psllw m4, 7
+ paddw m4, m10
+ paddsw m1, m4
%endif
-
- ; main x loop, mostly this starts in .main_load
-.splat_right:
- ; no need to load new pixels, just extend them from the (possibly previously
- ; extended) previous load into m0
+%endmacro
+ %%h7
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m15
+ paddw m1, m15
+ mova [t1+xq*2+ 0], m0
+ mova [t1+xq*2+16], m1
+ add xq, 16
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, dst_strideq
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movifnidn leftq, leftmp
+ mova m4, [lpfq+xq]
+ movd m5, [leftq]
+ add leftq, 4
+ pslldq m4, 4
+ por m4, m5
+ movifnidn leftmp, leftq
+ jmp .hv_main
+.hv_extend_left:
%if cpuflag(ssse3)
- pshufb m1, m0, [PIC_sym(pb_15)]
+ mova m4, [lpfq+xq]
+ pshufb m4, [base+wiener_l_shuf]
%else
- punpckhbw m1, m0, m0
- pshufhw m1, m1, q3333
- punpckhqdq m1, m1
+ mova m5, [lpfq+xq]
+ pshufd m4, m5, q2103
+ punpcklbw m5, m5
+ punpcklwd m5, m5
+ movss m4, m5
%endif
- jmp .main_loop
-.load_and_splat:
- ; load new pixels and extend edge for right-most
- movu m1, [srcptrq+3]
+ jmp .hv_main
+.hv_bottom:
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m4, [lpfq+xq-4]
+.hv_main:
+ movu m5, [lpfq+xq+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp xd, -18
+ jl .hv_have_right
+ call .extend_right
+.hv_have_right:
+ %%h7
%if ARCH_X86_64
- sub r11, xq
- movu m2, [r11+16]
- add r11, xq
+ mova m2, [t4+xq*2]
+ paddw m2, [t2+xq*2]
%else
- sub PIC_reg, xd
- movu m2, [PIC_sym(pb_right_ext_mask)+16]
- add PIC_reg, xd
+ mov r2, t4
+ mova m2, [r2+xq*2]
+ mov r2, t2
+ paddw m2, [r2+xq*2]
+ mov r2, t5
%endif
- movd m3, [srcptrq+2+xq]
-%if cpuflag(ssse3)
- pshufb m3, [PIC_sym(pb_0)]
+ mova m3, [t3+xq*2]
+%if ARCH_X86_64
+ mova m5, [t5+xq*2]
%else
- punpcklbw m3, m3
- pshuflw m3, m3, q0000
- punpcklqdq m3, m3
+ mova m5, [r2+xq*2]
+ mov r2, t6
%endif
- pand m1, m2
- pxor m2, [PIC_sym(pb_right_ext_mask)]
- pand m3, m2
- pxor m2, [PIC_sym(pb_right_ext_mask)]
- por m1, m3
- jmp .main_loop
-.main_load:
- ; load subsequent line
- movu m1, [srcptrq+3]
-.main_loop:
+ paddw m5, [t1+xq*2]
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m15
+ paddw m1, m15
%if ARCH_X86_64
- PALIGNR m2, m1, m0, 10
- PALIGNR m3, m1, m0, 11
- PALIGNR m4, m1, m0, 12
- PALIGNR m5, m1, m0, 13
- PALIGNR m6, m1, m0, 14
- PALIGNR m7, m1, m0, 15
-
- punpcklbw m0, m2, m1
- punpckhbw m2, m1
- punpcklbw m8, m3, m7
- punpckhbw m3, m7
- punpcklbw m7, m4, m6
- punpckhbw m4, m6
- PMADDUBSW m0, m15, m6, m9, 1
- PMADDUBSW m2, m15, m6, m9, 0
- PMADDUBSW m8, m14, m6, m9, 0
- PMADDUBSW m3, m14, m6, m9, 0
- PMADDUBSW m7, m13, m6, m9, 0
- PMADDUBSW m4, m13, m6, m9, 0
- paddw m0, m8
- paddw m2, m3
- %if cpuflag(ssse3)
- pxor m6, m6
- %endif
- punpcklbw m3, m5, m6
- punpckhbw m5, m6
- psllw m8, m3, 7
- psllw m6, m5, 7
- psubw m8, m10
- psubw m6, m10
- pmullw m3, m12
- pmullw m5, m12
- paddw m0, m7
- paddw m2, m4
- paddw m0, m3
- paddw m2, m5
- paddsw m0, m8 ; see the avx2 for an explanation
- paddsw m2, m6 ; of how the clipping works here
- psraw m0, 3
- psraw m2, 3
- paddw m0, m11
- paddw m2, m11
- mova [dstptrq+ 0], m0
- mova [dstptrq+16], m2
+ paddw m4, m0, [t6+xq*2]
%else
- PALIGNR m2, m1, m0, 10
- punpcklbw m3, m2, m1
- punpckhbw m2, m1
- PMADDUBSW m3, m15, m4, m5, 1
- PMADDUBSW m2, m15, m4, m5, 0
- PALIGNR m4, m1, m0, 11
- PALIGNR m5, m1, m0, 15
- punpcklbw m6, m4, m5
- punpckhbw m4, m5
- PMADDUBSW m6, m14, m5, m7, 1
- PMADDUBSW m4, m14, m5, m7, 0
- paddw m3, m6
- paddw m2, m4
- PALIGNR m4, m1, m0, 12
- PALIGNR m5, m1, m0, 14
- punpcklbw m6, m4, m5
- punpckhbw m4, m5
- PMADDUBSW m6, m13, m5, m7, 1
- PMADDUBSW m4, m13, m5, m7, 0
- paddw m3, m6
- paddw m2, m4
- PALIGNR m6, m1, m0, 13
- %if cpuflag(ssse3)
- pxor m5, m5
- %endif
- punpcklbw m4, m6, m5
- punpckhbw m6, m5
- psllw m5, m4, 7
- psllw m7, m6, 7
- psubw m5, m10
- psubw m7, m10
- pmullw m4, m12
- pmullw m6, m12
- paddw m3, m4
- paddw m2, m6
- paddsw m3, m5
- paddsw m2, m7
- psraw m3, 3
- psraw m2, 3
- paddw m3, m11
- paddw m2, m11
- mova [dstptrq+ 0], m3
- mova [dstptrq+16], m2
+ paddw m4, m0, [r2+xq*2]
+ mov r2, t4
%endif
+ mova [t0+xq*2], m0
+ punpcklwd m0, m2, m3
+ pmaddwd m0, m7
+ punpckhwd m2, m3
+ pmaddwd m2, m7
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m6
+ punpckhwd m4, m5
+ pmaddwd m4, m6
+ paddd m0, m3
+ mova m3, [t3+xq*2+16]
+ paddd m4, m2
+%if ARCH_X86_64
+ mova m2, [t4+xq*2+16]
+ paddw m2, [t2+xq*2+16]
+ mova m5, [t5+xq*2+16]
+%else
+ mova m2, [r2+xq*2+16]
+ mov r2, t2
+ paddw m2, [r2+xq*2+16]
+ mov r2, t5
+ mova m5, [r2+xq*2+16]
+ mov r2, t6
+%endif
+ paddw m5, [t1+xq*2+16]
+ psrad m0, 11
+ psrad m4, 11
+ packssdw m0, m4
+%if ARCH_X86_64
+ paddw m4, m1, [t6+xq*2+16]
+%else
+ paddw m4, m1, [r2+xq*2+16]
+ mov dstq, dstmp
+%endif
+ mova [t0+xq*2+16], m1
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m7
+ punpckhwd m2, m3
+ pmaddwd m2, m7
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m6
+ punpckhwd m4, m5
+ pmaddwd m4, m6
+ paddd m1, m3
+ paddd m2, m4
+ psrad m1, 11
+ psrad m2, 11
+ packssdw m1, m2
+ packuswb m0, m1
+ mova [dstq+xq], m0
+ add xq, 16
+ jl .hv_loop
+ add dstq, dst_strideq
+%if ARCH_X86_64
+ mov t6, t5
+ mov t5, t4
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t6
+%else
+ mov dstmp, dstq
+ mov r1, t5
+ mov r2, t4
+ mov t6, r1
+ mov t5, r2
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, r1
+%endif
+ ret
+%if cpuflag(ssse3) ; identical in sse2 and ssse3, so share code
+.v:
+ mov xq, wq
+.v_loop:
+%if ARCH_X86_64
+ mova m1, [t4+xq*2]
+ paddw m1, [t2+xq*2]
+%else
+ mov r2, t4
+ mova m1, [r2+xq*2]
+ mov r2, t2
+ paddw m1, [r2+xq*2]
+ mov r2, t6
+%endif
+ mova m2, [t3+xq*2]
+ mova m4, [t1+xq*2]
+%if ARCH_X86_64
+ paddw m3, m4, [t6+xq*2]
+ paddw m4, [t5+xq*2]
+%else
+ paddw m3, m4, [r2+xq*2]
+ mov r2, t5
+ paddw m4, [r2+xq*2]
+ mov r2, t4
+%endif
+ punpcklwd m0, m1, m2
+ pmaddwd m0, m7
+ punpckhwd m1, m2
+ pmaddwd m1, m7
+ punpcklwd m2, m3, m4
+ pmaddwd m2, m6
+ punpckhwd m3, m4
+ pmaddwd m3, m6
+ paddd m0, m2
+ paddd m1, m3
+%if ARCH_X86_64
+ mova m2, [t4+xq*2+16]
+ paddw m2, [t2+xq*2+16]
+%else
+ mova m2, [r2+xq*2+16]
+ mov r2, t2
+ paddw m2, [r2+xq*2+16]
+ mov r2, t6
+%endif
+ mova m3, [t3+xq*2+16]
+ mova m5, [t1+xq*2+16]
+%if ARCH_X86_64
+ paddw m4, m5, [t6+xq*2+16]
+ paddw m5, [t5+xq*2+16]
+%else
+ paddw m4, m5, [r2+xq*2+16]
+ mov r2, t5
+ paddw m5, [r2+xq*2+16]
+ movifnidn dstq, dstmp
+%endif
+ psrad m0, 11
+ psrad m1, 11
+ packssdw m0, m1
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m7
+ punpckhwd m2, m3
+ pmaddwd m2, m7
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m6
+ punpckhwd m4, m5
+ pmaddwd m4, m6
+ paddd m1, m3
+ paddd m2, m4
+ psrad m1, 11
+ psrad m2, 11
+ packssdw m1, m2
+ packuswb m0, m1
+ mova [dstq+xq], m0
+ add xq, 16
+ jl .v_loop
+ add dstq, dst_strideq
+%if ARCH_X86_64
+ mov t6, t5
+ mov t5, t4
+%else
+ mov dstmp, dstq
+ mov r1, t5
+ mov r2, t4
+ mov t6, r1
+ mov t5, r2
+%endif
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ ret
+%endif
- mova m0, m1
- add srcptrq, 16
- add dstptrq, 32
- sub xd, 16
- cmp xd, 16
- jg .main_load
- test xd, xd
- jg .load_and_splat
- cmp xd, xlimd
- jg .splat_right
-
-%if ARCH_X86_32
- mov srcq, [esp+8]
- mov dstq, [esp+4]
+%if ARCH_X86_64
+cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \
+ lpf_stride, w, edge, flt, h, x
+ mov fltq, fltmp
+ mov edged, r8m
+ mov wd, wm
+ mov hd, r6m
+ movq m14, [fltq]
+ add lpfq, wq
+ mova m8, [pw_m16380]
+ lea t1, [rsp+wq*2+16]
+ mova m15, [pw_2056]
+ add dstq, wq
+ movq m7, [fltq+16]
+ neg wq
+%if cpuflag(ssse3)
+ pshufb m14, [wiener_init]
+ mova m9, [wiener_shufB]
+ pshufd m13, m14, q3333 ; x1 x2
+ mova m10, [wiener_shufC]
+ punpcklqdq m14, m14 ; x3
+ mova m11, [wiener_shufD]
+ mova m12, [wiener_l_shuf]
+%else
+ punpcklwd m14, m14
+ pshufd m11, m14, q1111 ; x1
+ pshufd m13, m14, q2222 ; x2
+ pshufd m14, m14, q3333 ; x3
%endif
- add srcq, strideq
- add dstq, 384*2
- dec hd
- jg .loop
+%else
+%if cpuflag(ssse3)
+ %define stk_off 80
+%else
+ %define m11 [stk+80]
+ %define stk_off 96
+%endif
+cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride
+ %define stk esp
+ %define leftmp [stk+28]
+ %define m8 [base+pw_m16380]
+ %define m12 [base+wiener_l_shuf]
+ %define m14 [stk+48]
+ mov r1, r7m ; flt
+ mov r0, r0m ; dst
+ mov r5, r5m ; w
+ mov lpfq, lpfm
+ mov r2, r8m ; edge
+ mov r4, r6m ; h
+ movq m2, [r1+ 0]
+ movq m7, [r1+16]
+ add r0, r5
+ mov r1, r1m ; dst_stride
+ add lpfq, r5
+ mov edged, r2
+ mov r2, r2m ; left
+ mov dstmp, r0
+ lea t1, [rsp+r5*2+stk_off]
+ mov hd, r4
+ neg r5
+ mov lpf_strideq, lpf_stridem
+ LEA r6, pb_right_ext_mask+21
+ mov wq, r5
+ mov dst_strideq, r1
+ mov leftmp, r2
+%if cpuflag(ssse3)
+ pshufb m2, [base+wiener_init]
+ pshufd m1, m2, q3333
+ punpcklqdq m2, m2
+%else
+ punpcklwd m2, m2
+ pshufd m0, m2, q1111
+ pshufd m1, m2, q2222
+ pshufd m2, m2, q3333
+ mova m11, m0
+%endif
+ mova m13, m1
+ mova m14, m2
+%endif
+ pshufd m6, m7, q0000 ; __ y1
+ pshufd m7, m7, q1111 ; y2 y3
+ test edgeb, 4 ; LR_HAVE_TOP
+ jz .no_top
+ call .h_top
+ add lpfq, lpf_strideq
+ mov t4, t1
+ add t1, 384*2
+ call .h_top
+ lea xq, [lpfq+lpf_strideq*4]
+ mov lpfq, dstmp
+ mov t3, t1
+ add t1, 384*2
+ mov [rsp+gprsize*1], lpf_strideq
+ add xq, lpf_strideq
+ mov [rsp+gprsize*0], xq ; below
+ call .h
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, dst_strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v2
+.main:
+ mov t0, t4
+.main_loop:
+ call .hv
+ dec hd
+ jnz .main_loop
+ test edgeb, 8 ; LR_HAVE_BOTTOM
+ jz .v2
+ mov lpfq, [rsp+gprsize*0]
+ call .hv_bottom
+ add lpfq, [rsp+gprsize*1]
+ call .hv_bottom
+.end:
RET
+.no_top:
+ lea t3, [lpfq+lpf_strideq*4]
+ mov lpfq, dstmp
+ mov [rsp+gprsize*1], lpf_strideq
+ lea t3, [t3+lpf_strideq*2]
+ mov [rsp+gprsize*0], t3
+ call .h
+ mov t4, t1
+ mov t3, t1
+ mov t2, t1
+ dec hd
+ jz .v1
+ add lpfq, dst_strideq
+ add t1, 384*2
+ call .h
+ dec hd
+ jz .v2
+ lea t0, [t1+384*2]
+ call .hv
+ dec hd
+ jz .v2
+ add t0, 384*6
+ call .hv
+ dec hd
+ jnz .main
+.v2:
+ call mangle(private_prefix %+ _wiener_filter5_ssse3).v
+ add dstq, dst_strideq
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ movifnidn dstmp, dstq
+.v1:
+ call mangle(private_prefix %+ _wiener_filter5_ssse3).v
+ jmp .end
+.h:
+ %define stk esp+4
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+ movifnidn leftq, leftmp
+ mova m4, [lpfq+xq]
+ movd m5, [leftq]
+ add leftq, 4
+ pslldq m4, 4
+ por m4, m5
+ movifnidn leftmp, leftq
+ jmp .h_main
+.h_extend_left:
+%if cpuflag(ssse3)
+ mova m4, [lpfq+xq]
+ pshufb m4, m12
+%else
+ mova m5, [lpfq+xq]
+ pshufd m4, m5, q2103
+ punpcklbw m5, m5
+ punpcklwd m5, m5
+ movss m4, m5
+%endif
+ jmp .h_main
+.h_top:
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .h_extend_left
+.h_loop:
+ movu m4, [lpfq+xq-4]
+.h_main:
+ movu m5, [lpfq+xq+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .h_have_right
+ cmp xd, -17
+ jl .h_have_right
+ call mangle(private_prefix %+ _wiener_filter7 %+ SUFFIX).extend_right
+.h_have_right:
+%macro %%h5 0
+%if cpuflag(ssse3)
+ pshufb m0, m4, m9
+ pmaddubsw m0, m13
+ pshufb m1, m5, m9
+ pmaddubsw m1, m13
+ pshufb m2, m4, m10
+ pmaddubsw m2, m13
+ pshufb m3, m5, m10
+ pmaddubsw m3, m13
+ pshufb m4, m11
+ paddw m0, m2
+ pmullw m2, m14, m4
+ pshufb m5, m11
+ paddw m1, m3
+ pmullw m3, m14, m5
+ psllw m4, 7
+ psllw m5, 7
+ paddw m4, m8
+ paddw m5, m8
+ paddw m0, m2
+ paddw m1, m3
+ paddsw m0, m4
+ paddsw m1, m5
+%else
+ psrldq m0, m4, 2
+ pslldq m1, m4, 2
+ pxor m3, m3
+ punpcklbw m0, m3
+ punpckhbw m1, m3
+ paddw m0, m1
+ pmullw m0, m11
+ pshufd m2, m4, q0321
+ punpcklbw m2, m3
+ pmullw m1, m14, m2
+ paddw m0, m1
+ psrldq m1, m4, 3
+ pslldq m4, 3
+ punpcklbw m1, m3
+ punpckhbw m4, m3
+ paddw m1, m4
+ pmullw m1, m13
+ paddw m0, m1
+ psllw m2, 7
+ paddw m2, m8
+ paddsw m0, m2
+ psrldq m1, m5, 2
+ pslldq m4, m5, 2
+ punpcklbw m1, m3
+ punpckhbw m4, m3
+ paddw m1, m4
+ pmullw m1, m11
+ pshufd m4, m5, q0321
+ punpcklbw m4, m3
+ pmullw m2, m14, m4
+ paddw m1, m2
+ psrldq m2, m5, 3
+ pslldq m5, 3
+ punpcklbw m2, m3
+ punpckhbw m5, m3
+ paddw m2, m5
+ pmullw m2, m13
+ paddw m1, m2
+ psllw m4, 7
+ paddw m4, m8
+ paddsw m1, m4
+%endif
%endmacro
-
-%macro WIENER_V 0
+ %%h5
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m15
+ paddw m1, m15
+ mova [t1+xq*2+ 0], m0
+ mova [t1+xq*2+16], m1
+ add xq, 16
+ jl .h_loop
+ ret
+ALIGN function_align
+.hv:
+ add lpfq, dst_strideq
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movifnidn leftq, leftmp
+ mova m4, [lpfq+xq]
+ movd m5, [leftq]
+ add leftq, 4
+ pslldq m4, 4
+ por m4, m5
+ movifnidn leftmp, leftq
+ jmp .hv_main
+.hv_extend_left:
+%if cpuflag(ssse3)
+ mova m4, [lpfq+xq]
+ pshufb m4, m12
+%else
+ mova m5, [lpfq+xq]
+ pshufd m4, m5, q2103
+ punpcklbw m5, m5
+ punpcklwd m5, m5
+ movss m4, m5
+%endif
+ jmp .hv_main
+.hv_bottom:
+ mov xq, wq
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+.hv_loop:
+ movu m4, [lpfq+xq-4]
+.hv_main:
+ movu m5, [lpfq+xq+4]
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp xd, -17
+ jl .hv_have_right
+ call mangle(private_prefix %+ _wiener_filter7 %+ SUFFIX).extend_right
+.hv_have_right:
+ %%h5
+ mova m2, [t3+xq*2]
+ paddw m2, [t1+xq*2]
+ psraw m0, 3
+ psraw m1, 3
+ paddw m0, m15
+ paddw m1, m15
%if ARCH_X86_64
-cglobal wiener_filter_v, 4, 10, 16, dst, stride, mid, w, h, flt, edge
- mov edged, edgem
- movifnidn fltq, fltmp
- movifnidn hd, hm
- movq m15, [fltq+16]
- pshufd m14, m15, q1111
- pshufd m15, m15, q0000
- mova m12, [pd_1024]
-
- DEFINE_ARGS dst, stride, mid, w, h, y, edge, ylim, mptr, dstptr
-
- mov ylimd, edged
- and ylimd, 8 ; have_bottom
- shr ylimd, 2
- sub ylimd, 3
+ mova m3, [t2+xq*2]
+ paddw m4, m0, [t4+xq*2]
%else
-cglobal wiener_filter_v, 5, 7, 8, -96, dst, stride, mid, w, h, flt, edge
- %define ylimd [esp+12]
-
- mov r5d, edgem
- and r5d, 8
- shr r5d, 2
- sub r5d, 3
- mov ylimd, r5d
- mov fltq, fltmp
- mov edged, edgem
-
- SETUP_PIC edged
-
- movq m0, [fltq+16]
- pshufd m1, m0, q1111
- pshufd m0, m0, q0000
- mova [esp+0x50], m0
- mova [esp+0x40], m1
-
- DEFINE_ARGS dst, stride, mid, w, h, y, edge
- %define mptrq midq
- %define dstptrq dstq
- %define edgeb byte [esp]
+ mov r2, t2
+ mova m3, [r2+xq*2]
+ mov r2, t4
+ paddw m4, m0, [r2+xq*2]
%endif
-
- ; main x loop for vertical filter, does one column of 16 pixels
-.loop_x:
- mova m3, [midq] ; middle line
-
- ; load top pixels
- test edgeb, 4 ; have_top
- jz .emu_top
- mova m0, [midq-384*4]
- mova m2, [midq-384*2]
- mova m1, m0
- jmp .load_bottom_pixels
-.emu_top:
- mova m0, m3
- mova m1, m3
- mova m2, m3
-
- ; load bottom pixels
-.load_bottom_pixels:
- mov yd, hd
+ mova [t0+xq*2], m0
+ punpcklwd m0, m2, m3
+ pmaddwd m0, m7
+ punpckhwd m2, m3
+ pmaddwd m2, m7
+ punpcklwd m3, m4, m4
+ pmaddwd m3, m6
+ punpckhwd m4, m4
+ pmaddwd m4, m6
+ paddd m0, m3
+ paddd m4, m2
+ mova m2, [t3+xq*2+16]
+ paddw m2, [t1+xq*2+16]
+ psrad m0, 11
+ psrad m4, 11
+ packssdw m0, m4
%if ARCH_X86_64
- mov mptrq, midq
- mov dstptrq, dstq
- add yd, ylimd
+ mova m3, [t2+xq*2+16]
+ paddw m4, m1, [t4+xq*2+16]
%else
- mov [esp+8], midq
- mov [esp+4], dstq
- add yd, ylimd
+ paddw m4, m1, [r2+xq*2+16]
+ mov r2, t2
+ mova m3, [r2+xq*2+16]
+ mov dstq, dstmp
%endif
- jg .load_threelines
-
- ; the remainder here is somewhat messy but only runs in very weird
- ; circumstances at the bottom of the image in very small blocks (h=[1-3]),
- ; so performance is not terribly important here...
- je .load_twolines
- cmp yd, -1
- je .load_oneline
- ; h == 1 case
- mova m5, m3
- mova m4, m3
- mova m6, m3
- jmp .loop
-.load_oneline:
- ; h == 2 case
- mova m4, [midq+384*2]
- mova m5, m4
- mova m6, m4
- jmp .loop
-.load_twolines:
- ; h == 3 case
- mova m4, [midq+384*2]
- mova m5, [midq+384*4]
- mova m6, m5
- jmp .loop
-.load_threelines:
- ; h > 3 case
- mova m4, [midq+384*2]
- mova m5, [midq+384*4]
- ; third line loaded in main loop below
-
- ; main y loop for vertical filter
-.loop_load:
- ; load one line into m6. if that pixel is no longer available, do
- ; nothing, since m6 still has the data from the previous line in it. We
- ; try to structure the loop so that the common case is evaluated fastest
- mova m6, [mptrq+384*6]
-.loop:
+ mova [t0+xq*2+16], m1
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m7
+ punpckhwd m2, m3
+ pmaddwd m2, m7
+ punpcklwd m3, m4, m4
+ pmaddwd m3, m6
+ punpckhwd m4, m4
+ pmaddwd m4, m6
+ paddd m1, m3
+ paddd m2, m4
+ psrad m1, 11
+ psrad m2, 11
+ packssdw m1, m2
+ packuswb m0, m1
+ mova [dstq+xq], m0
+ add xq, 16
+ jl .hv_loop
+ add dstq, dst_strideq
+ mov t4, t3
+ mov t3, t2
+ mov t2, t1
+ mov t1, t0
+ mov t0, t4
+ movifnidn dstmp, dstq
+ ret
+%if cpuflag(ssse3)
+.v:
+ mov xq, wq
+.v_loop:
+ mova m3, [t1+xq*2]
+ paddw m1, m3, [t3+xq*2]
%if ARCH_X86_64
- paddw m7, m0, m6
- paddw m8, m1, m5
- paddw m9, m2, m4
- punpcklwd m10, m7, m8
- punpckhwd m7, m8
- punpcklwd m11, m9, m3
- punpckhwd m9, m3
- pmaddwd m10, m15
- pmaddwd m7, m15
- pmaddwd m11, m14
- pmaddwd m9, m14
- paddd m10, m12
- paddd m7, m12
- paddd m10, m11
- paddd m7, m9
- psrad m10, 11
- psrad m7, 11
- packssdw m10, m7
- packuswb m10, m10
- movq [dstptrq], m10
+ mova m2, [t2+xq*2]
+ paddw m3, [t4+xq*2]
%else
- mova [esp+0x30], m1
- mova [esp+0x20], m2
- mova [esp+0x10], m3
- paddw m0, m6
- paddw m1, m5
- paddw m2, m4
- punpcklwd m7, m2, m3
- punpckhwd m2, m3
- punpcklwd m3, m0, m1
- punpckhwd m0, m1
- mova m1, [esp+0x50]
- pmaddwd m3, m1
- pmaddwd m0, m1
- mova m1, [esp+0x40]
- pmaddwd m7, m1
- pmaddwd m2, m1
- paddd m3, [PIC_sym(pd_1024)]
- paddd m0, [PIC_sym(pd_1024)]
- paddd m3, m7
- paddd m0, m2
- psrad m3, 11
- psrad m0, 11
- packssdw m3, m0
- packuswb m3, m3
- movq [dstq], m3
- mova m1, [esp+0x30]
- mova m2, [esp+0x20]
- mova m3, [esp+0x10]
+ mov r2, t2
+ mova m2, [r2+xq*2]
+ mov r2, t4
+ paddw m3, [r2+xq*2]
%endif
- ; shift pixels one position
- mova m0, m1
- mova m1, m2
- mova m2, m3
- mova m3, m4
- mova m4, m5
- mova m5, m6
- add mptrq, 384*2
- add dstptrq, strideq
- dec yd
- jg .loop_load
- ; for the bottom pixels, continue using m6 (as extended edge)
- cmp yd, ylimd
- jg .loop
-
-%if ARCH_X86_32
- mov midq, [esp+8]
- mov dstq, [esp+4]
+ punpcklwd m0, m1, m2
+ pmaddwd m0, m7
+ punpckhwd m1, m2
+ pmaddwd m1, m7
+ punpcklwd m2, m3
+ pmaddwd m2, m6
+ punpckhwd m3, m3
+ pmaddwd m3, m6
+ paddd m0, m2
+ paddd m1, m3
+ mova m4, [t1+xq*2+16]
+ paddw m2, m4, [t3+xq*2+16]
+%if ARCH_X86_64
+ mova m3, [t2+xq*2+16]
+ paddw m4, [t4+xq*2+16]
+%else
+ paddw m4, [r2+xq*2+16]
+ mov r2, t2
+ mova m3, [r2+xq*2+16]
+ mov dstq, dstmp
%endif
- add midq, 16
- add dstq, 8
- sub wd, 8
- jg .loop_x
- RET
+ psrad m0, 11
+ psrad m1, 11
+ packssdw m0, m1
+ punpcklwd m1, m2, m3
+ pmaddwd m1, m7
+ punpckhwd m2, m3
+ pmaddwd m2, m7
+ punpcklwd m3, m4
+ pmaddwd m3, m6
+ punpckhwd m4, m4
+ pmaddwd m4, m6
+ paddd m1, m3
+ paddd m2, m4
+ psrad m1, 11
+ psrad m2, 11
+ packssdw m1, m2
+ packuswb m0, m1
+ mova [dstq+xq], m0
+ add xq, 16
+ jl .v_loop
+ ret
+%endif
%endmacro
INIT_XMM sse2
-WIENER_H
-WIENER_V
+WIENER
INIT_XMM ssse3
-WIENER_H
-WIENER_V
+WIENER
;;;;;;;;;;;;;;;;;;;;;;;;;;
;; self-guided ;;
@@ -698,7 +1195,7 @@
neg xq
mov wq, xq
%if ARCH_X86_64
- lea r10, [pb_right_ext_mask+16]
+ lea r10, [pb_right_ext_mask+24]
%endif
.loop_y:
mov xq, wq
@@ -734,7 +1231,7 @@
%if ARCH_X86_64
movu m4, [r10+xq*2]
%else
- movu m4, [PIC_sym(pb_right_ext_mask+16)+xd*2]
+ movu m4, [PIC_sym(pb_right_ext_mask)+xd*2+24]
%endif
pand m2, m4
pandn m4, m3
@@ -1132,7 +1629,7 @@
psubw m1, m4 ; aa
movq m0, [srcq]
XCHG_PIC_REG
- punpcklbw m0, [PIC_sym(pb_right_ext_mask)+16]
+ punpcklbw m0, [PIC_sym(pb_0)]
punpcklwd m4, m1, [PIC_sym(pw_16)]
punpckhwd m1, [PIC_sym(pw_16)]
punpcklwd m2, m0, [PIC_sym(pw_16)]
@@ -1266,7 +1763,7 @@
lea sumsqq, [sumsqq+wq*4-4]
neg wq
%if ARCH_X86_64
- lea r10, [pb_right_ext_mask+16]
+ lea r10, [pb_right_ext_mask+24]
%else
mov wm, xd
%define wq wm
@@ -1313,7 +1810,7 @@
%if ARCH_X86_64
movu m4, [r10+xq*2]
%else
- movu m4, [PIC_sym(pb_right_ext_mask+16)+xd*2]
+ movu m4, [PIC_sym(pb_right_ext_mask)+xd*2+24]
XCHG_PIC_REG
%endif
pand m2, m4
@@ -1880,6 +2377,7 @@
%endif
RET
+%undef t2
cglobal sgr_weighted2, 4, 7, 12, dst, stride, t1, t2, w, h, wt
movifnidn wd, wm
movd m0, wtm
--- a/tests/checkasm/looprestoration.c
+++ b/tests/checkasm/looprestoration.c
@@ -27,6 +27,7 @@
#include "tests/checkasm/checkasm.h"
+#include <stdio.h>
#include <string.h>
#include "src/levels.h"
@@ -33,6 +34,10 @@
#include "src/looprestoration.h"
#include "src/tables.h"
+static int to_binary(int x) { /* 0-15 -> 0000-1111 */
+ return (x & 1) + 5 * (x & 2) + 25 * (x & 4) + 125 * (x & 8);
+}
+
static void init_tmp(pixel *buf, const ptrdiff_t stride,
const int w, const int h, const int bitdepth_max)
{
@@ -56,11 +61,9 @@
int w, int h, const int16_t filter[2][8],
enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX);
- for (int pl = 0; pl < 2; pl++) {
- if (check_func(c->wiener, "wiener_%s_%dbpc",
- pl ? "chroma" : "luma", bpc))
- {
- filter[0][0] = filter[0][6] = pl ? 0 : (rnd() & 15) - 5;
+ for (int t = 0; t < 2; t++) {
+ if (check_func(c->wiener[t], "wiener_%dtap_%dbpc", t ? 5 : 7, bpc)) {
+ filter[0][0] = filter[0][6] = t ? 0 : (rnd() & 15) - 5;
filter[0][1] = filter[0][5] = (rnd() & 31) - 23;
filter[0][2] = filter[0][4] = (rnd() & 63) - 17;
filter[0][3] = -(filter[0][0] + filter[0][1] + filter[0][2]) * 2;
@@ -68,7 +71,7 @@
filter[0][3] += 128;
#endif
- filter[1][0] = filter[1][6] = pl ? 0 : (rnd() & 15) - 5;
+ filter[1][0] = filter[1][6] = t ? 0 : (rnd() & 15) - 5;
filter[1][1] = filter[1][5] = (rnd() & 31) - 23;
filter[1][2] = filter[1][4] = (rnd() & 63) - 17;
filter[1][3] = 128 - (filter[1][0] + filter[1][1] + filter[1][2]) * 2;
@@ -93,9 +96,14 @@
call_new(a_dst + 32, 448 * sizeof(pixel), left,
h_edge + 32, 448 * sizeof(pixel),
w, h, filter, edges HIGHBD_TAIL_SUFFIX);
- checkasm_check_pixel(c_dst + 32, 448 * sizeof(pixel),
- a_dst + 32, 448 * sizeof(pixel),
- w, h, "dst");
+ if (checkasm_check_pixel(c_dst + 32, 448 * sizeof(pixel),
+ a_dst + 32, 448 * sizeof(pixel),
+ w, h, "dst"))
+ {
+ fprintf(stderr, "size = %dx%d, edges = %04d\n",
+ w, h, to_binary(edges));
+ break;
+ }
}
bench_new(a_dst + 32, 448 * sizeof(pixel), left,
h_edge + 32, 448 * sizeof(pixel),