ref: 4687c4696f780207bbb7f8d14038302f0e9c63bd
parent: b73acaa894ac4bb66d04d27919e436c158be7fe8
author: Ronald S. Bultje <[email protected]>
date: Sun Mar 29 11:14:38 EDT 2020
x86: add SSSE3 versions for filmgrain.fguv_32x32xn[422/444]

fguv_32x32xn_8bpc_420_csfl0_c: 14568.2
fguv_32x32xn_8bpc_420_csfl0_ssse3: 1162.3
fguv_32x32xn_8bpc_420_csfl1_c: 10682.0
fguv_32x32xn_8bpc_420_csfl1_ssse3: 910.3
fguv_32x32xn_8bpc_422_csfl0_c: 16370.5
fguv_32x32xn_8bpc_422_csfl0_ssse3: 1202.6
fguv_32x32xn_8bpc_422_csfl1_c: 11333.8
fguv_32x32xn_8bpc_422_csfl1_ssse3: 958.8
fguv_32x32xn_8bpc_444_csfl0_c: 12950.1
fguv_32x32xn_8bpc_444_csfl0_ssse3: 1133.6
fguv_32x32xn_8bpc_444_csfl1_c: 8806.7
fguv_32x32xn_8bpc_444_csfl1_ssse3: 731.0
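The SSSE3 versions are roughly 11-14x faster than the C reference
across the cases listed above.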
--- a/src/x86/film_grain_init_tmpl.c
+++ b/src/x86/film_grain_init_tmpl.c
@@ -34,6 +34,8 @@
decl_generate_grain_uv_fn(dav1d_generate_grain_uv_444_ssse3);
decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_ssse3);
decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_ssse3);
+decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i422_ssse3);
+decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i444_ssse3);
decl_generate_grain_y_fn(dav1d_generate_grain_y_avx2);
decl_generate_grain_uv_fn(dav1d_generate_grain_uv_420_avx2);
@@ -56,6 +58,8 @@
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_generate_grain_uv_444_ssse3;
c->fgy_32x32xn = dav1d_fgy_32x32xn_ssse3;
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_fguv_32x32xn_i420_ssse3;
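+    /* chroma entries are indexed by pixel layout minus 1 (I400 has no chroma) */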
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_fguv_32x32xn_i422_ssse3;
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_fguv_32x32xn_i444_ssse3;
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
--- a/src/x86/film_grain_ssse3.asm
+++ b/src/x86/film_grain_ssse3.asm
@@ -2071,6 +2071,7 @@
.end_hv:
RET
+%macro FGUV_FN 3 ; name, ss_hor, ss_ver
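+; instantiated at the bottom as FGUV_FN 420 (ss_hor=1, ss_ver=1),
+; 422 (1, 0) and 444 (0, 0)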
INIT_XMM ssse3
%if ARCH_X86_32
; fguv_32x32xn_i420_ssse3(dst, src, stride, fg_data, w, scaling, grain_lut, h,
@@ -2077,7 +2078,7 @@
; sby, luma, lstride, uv_pl, is_id)
%if STACK_ALIGNMENT < mmsize
DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8
-cglobal fguv_32x32xn_i420, 0, 7, 8, 0 - (8 * mmsize + (13 + 3) * gprsize), \
+cglobal fguv_32x32xn_i%1, 0, 7, 8, 0 - (8 * mmsize + (13 + 3) * gprsize), \
tmp, src, scaling, h, fg_data, picptr, unused
mov r0, r0m
mov r1, r2m
@@ -2100,7 +2101,7 @@
mov [rsp+8*mmsize+13*gprsize], r2
mov [rsp+8*mmsize+14*gprsize], r4
%else
-cglobal fguv_32x32xn_i420, 0, 7, 8, 8 * mmsize + (4) * gprsize, \
+cglobal fguv_32x32xn_i%1, 0, 7, 8, 8 * mmsize + (4) * gprsize, \
tmp, src, scaling, h, fg_data, picptr, unused
%endif
mov srcq, srcm
@@ -2125,13 +2126,13 @@
%define base r5-pb_mask
mov r5m, r5
%else
-cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
- grain_lut, tmp, sby, luma, lstride, uv_pl, is_id
+cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
+ grain_lut, tmp, sby, luma, lstride, uv_pl, is_id
lea r8, [pb_mask]
%define base r8-pb_mask
%endif
mov r6d, [fg_dataq+FGData.scaling_shift]
- movd m2, [base+byte_blend+3]
+ pcmpeqw m2, m2
movd m3, [base+mul_bits+r6*2-14]
mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
lea tmpd, [r6d*2]
@@ -2143,6 +2144,7 @@
movd m5, [base+min+r6*2]
cmovne r6d, tmpd
movd m4, [base+max+r6*2]
+ psrldq m2, 14+%2
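+    ; m2: all-ones shifted down so only the low 2-%2 byte(s) stay 0xff
+    ; (replaces the old byte_blend load)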
punpcklwd m3, m3
punpcklwd m5, m5
punpcklwd m4, m4
@@ -2157,7 +2159,7 @@
cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
jne .csfl
-%macro FGUV_32x32xN_LOOP 1 ; not-csfl
+%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
%else
@@ -2183,10 +2185,18 @@
test overlapd, overlapd
jz %%no_vertical_overlap
%if ARCH_X86_32
+%if %2
movd m1, [base+pb_23_22]
+%else
+ movd m1, [base+pb_27_17_17_27]
+%endif
mova m0, [base+pw_1024]
%else
+%if %2
movd m1, [pb_23_22]
+%else
+ movd m1, [pb_27_17_17_27]
+%endif
mova m0, [pw_1024]
%endif
pshufd m1, m1, q0000
@@ -2216,7 +2226,9 @@
%define luma_bakq lumaq
mov wq, r4m
+%if %3
shl r10mp, 1
+%endif
%else
DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
unused2, unused3, see, overlap, unused4, src_bak, lstride, luma_bak
@@ -2226,7 +2238,7 @@
mov lumaq, r9mp
lea src_bakq, [srcq+wq]
- lea luma_bakq, [lumaq+wq*2]
+ lea luma_bakq, [lumaq+wq*(1+%2)]
neg wq
sub r0mp, srcq
%if ARCH_X86_32
@@ -2237,7 +2249,7 @@
DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
%else
mov r11mp, src_bakq
- mov r10mp, strideq
+ mov r12mp, strideq
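+    ; stride is stashed in r12m instead of r10m, so r10m keeps the
+    ; lstride argument that the 422/444 paths reload further down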
%endif
%%loop_x:
@@ -2266,8 +2278,8 @@
ror offyd, 8
shr offxd, 12
and offyd, 0xf
- imul offyd, 82
- lea offyq, [offyq+offxq+498] ; offy*stride+offx
+ imul offyd, 164>>%3
+ lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx
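+    ; grain_lut stride is 82 and the border is 3+(6>>ss) in each
+    ; dimension; for 420 this reduces to the old offy*82+offx+498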
%if ARCH_X86_32
DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
@@ -2276,6 +2288,7 @@
h, offxy, see, overlap, unused1, unused2, lstride, luma_bak
%endif
+%%loop_x_odd:
mov hd, r7m
mov grain_lutq, grain_lutmp
%%loop_y:
@@ -2283,6 +2296,7 @@
%if ARCH_X86_32
mov lumaq, r9mp
%endif
+%if %2
mova m4, [lumaq+ 0]
mova m6, [lumaq+16]
mova m0, [srcq]
@@ -2300,9 +2314,20 @@
pmaddubsw m6, m7
pavgw m4, m2
pavgw m6, m2
+%else
+ mova m4, [lumaq]
+ mova m0, [srcq]
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9mp, lumaq
+%endif
+ pxor m2, m2
+%endif
%if %1
+%if %2
packuswb m4, m6 ; luma
+%endif
punpckhbw m6, m4, m0
punpcklbw m4, m0 ; { luma, chroma }
pmaddubsw m6, m14
@@ -2314,6 +2339,9 @@
packuswb m4, m6 ; pack+unpack = clip
punpckhbw m6, m4, m2
punpcklbw m4, m2
+%elif %2 == 0
+ punpckhbw m6, m4, m2
+ punpcklbw m4, m2
%endif
; scaling[luma_src]
@@ -2364,9 +2392,13 @@
add srcq, r2mp
; we already incremented lumaq above
%else
- add srcq, r10mp
+ add srcq, r12mp
+%if %3
lea lumaq, [lumaq+lstrideq*2]
+%else
+ add lumaq, lstrideq
%endif
+%endif
add grain_lutq, 82
dec hw
jg %%loop_y
@@ -2384,12 +2416,27 @@
%else
mov srcq, r11mp
%endif
- lea lumaq, [luma_bakq+wq*2]
+ lea lumaq, [luma_bakq+wq*(1+%2)]
add srcq, wq
%if ARCH_X86_32
mov r4m, wq
mov r9m, lumaq
%endif
+%if %2 == 0
+ ; adjust top_offxy
+%if ARCH_X86_32
+ add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+ add r11d, 16
+%endif
+ add offxyd, 16
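+    ; bit 2 of r8m tracks which 16-pixel half of the 32-column block
+    ; is being processed when chroma is not subsampled horizontally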
+ btc dword r8m, 2
+ jc %%loop_x_even
+ test dword r8m, 2
+ jz %%loop_x_odd
+ jmp %%loop_x_odd_v_overlap
+%%loop_x_even:
+%endif
test dword r8m, 1
jz %%loop_x
@@ -2400,8 +2447,12 @@
; horizontal overlap (without vertical overlap)
%%loop_x_h_overlap:
%if ARCH_X86_32
+%if %2
lea r6, [offxyd+16]
mov [rsp+8*mmsize+0*gprsize], r6
+%else
+ mov [rsp+8*mmsize+0*gprsize], offxyd
+%endif
DEFINE_ARGS luma, src, scaling, see, w, picptr, grain_lut
@@ -2410,8 +2461,12 @@
DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
offx, offy, see, left_offxy, unused1, unused2, lstride
+%if %2
lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx
+%else
+ mov left_offxyd, offyd
%endif
+%endif
mov r6d, seed
or seed, 0xEFF4
shr r6d, 1
@@ -2435,8 +2490,8 @@
ror offyd, 8
shr offxd, 12
and offyd, 0xf
- imul offyd, 82
- lea offyq, [offyq+offxq+498] ; offy*stride+offx
+ imul offyd, 164>>%3
+ lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
%if ARCH_X86_32
DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
@@ -2452,6 +2507,7 @@
%if ARCH_X86_32
mov lumaq, r9mp
%endif
+%if %2
mova m4, [lumaq+ 0]
mova m6, [lumaq+16]
mova m0, [srcq]
@@ -2469,9 +2525,20 @@
pmaddubsw m6, m7
pavgw m4, m2
pavgw m6, m2
+%else
+ mova m4, [lumaq]
+ mova m0, [srcq]
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9mp, lumaq
+%endif
+ pxor m2, m2
+%endif
%if %1
+%if %2
packuswb m4, m6 ; luma
+%endif
punpckhbw m6, m4, m0
punpcklbw m4, m0 ; { luma, chroma }
pmaddubsw m6, m14
@@ -2483,6 +2550,9 @@
packuswb m4, m6 ; pack+unpack = clip
punpckhbw m6, m4, m2
punpcklbw m4, m2
+%elif %2 == 0
+ punpckhbw m6, m4, m2
+ punpcklbw m4, m2
%endif
; scaling[luma_src]
@@ -2547,9 +2617,13 @@
add srcq, r2mp
; lumaq has already been incremented above
%else
- add srcq, r10mp
+ add srcq, r12mp
+%if %3
lea lumaq, [lumaq+lstrideq*2]
+%else
+ add lumaq, lstrideq
%endif
+%endif
add grain_lutq, 82
dec hw
jg %%loop_y_h_overlap
@@ -2567,17 +2641,32 @@
%else
mov srcq, r11mp
%endif
- lea lumaq, [luma_bakq+wq*2]
+ lea lumaq, [luma_bakq+wq*(1+%2)]
add srcq, wq
%if ARCH_X86_32
mov r4m, wq
mov r9m, lumaq
%endif
+%if %2 == 0
+ xor dword r8m, 4
+ ; adjust top_offxyd
+%if ARCH_X86_32
+ add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+ add r11d, 16
+%endif
+ add offxyd, 16
+%endif
; r8m = sbym
test dword r8m, 2
+%if %2
jne %%loop_x_hv_overlap
jmp %%loop_x_h_overlap
+%else
+ jne %%loop_x_odd_v_overlap
+ jmp %%loop_x_odd
+%endif
%%end:
RET
@@ -2612,7 +2701,9 @@
mov r3m, seed
mov wq, r4m
+%if %3
shl r10mp, 1
+%endif
%else
xor seed, sbyd ; (cur_seed << 16) | top_seed
@@ -2624,7 +2715,7 @@
mov lumaq, r9mp
lea src_bakq, [srcq+wq]
- lea luma_bakq, [lumaq+wq*2]
+ lea luma_bakq, [lumaq+wq*(1+%2)]
neg wq
sub r0mp, srcq
%if ARCH_X86_32
@@ -2635,7 +2726,7 @@
DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
%else
mov r11mp, src_bakq
- mov r10mp, strideq
+ mov r12mp, strideq
%endif
%%loop_x_v_overlap:
@@ -2674,9 +2765,9 @@
ror offxd, 12
and offyd, 0xf000f
and offxd, 0xf000f
- imul offyd, 82
+ imul offyd, 164>>%3
; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
- lea offyq, [offyq+offxq+0x10001*498+16*82]
+ lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
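+    ; 0x10001*(...) applies the border to both packed offsets; for 420
+    ; this reduces to the old 0x10001*498+16*82 constant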
%if ARCH_X86_32
DEFINE_ARGS tmp, src, scaling, offxy, h, picptr, top_offxy
@@ -2693,12 +2784,20 @@
DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
%endif
+%%loop_x_odd_v_overlap:
mov hd, r7m
mov grain_lutq, grain_lutmp
+%if ARCH_X86_32
+ mov r5, r5m
+ mova m1, [base+pb_27_17]
+%else
+ mova m1, [pb_27_17]
+%endif
%%loop_y_v_overlap:
%if ARCH_X86_32
mov lumaq, r9mp
%endif
+%if %2
mova m4, [lumaq+ 0]
mova m6, [lumaq+16]
mova m0, [srcq]
@@ -2716,9 +2815,20 @@
pmaddubsw m6, m7
pavgw m4, m2
pavgw m6, m2
+%else
+ mova m4, [lumaq]
+ mova m0, [srcq]
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9mp, lumaq
+%endif
+ pxor m2, m2
+%endif
%if %1
+%if %2
packuswb m4, m6 ; luma
+%endif
punpckhbw m6, m4, m0
punpcklbw m4, m0 ; { luma, chroma }
pmaddubsw m6, m14
@@ -2730,6 +2840,9 @@
packuswb m4, m6 ; pack+unpack = clip
punpckhbw m6, m4, m2
punpcklbw m4, m2
+%elif %2 == 0
+ punpckhbw m6, m4, m2
+ punpcklbw m4, m2
%endif
; scaling[luma_src]
@@ -2740,10 +2853,10 @@
vpgatherdw m7, m4, scalingq, r12, r2
vpgatherdw m5, m6, scalingq, r12, r2
%endif
- pcmpeqw m1, m1
- psrlw m1, 8
- pand m7, m1
- pand m5, m1
+ pcmpeqw m4, m4
+ psrlw m4, 8
+ pand m7, m4
+ pand m5, m4
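+    ; (m4 rather than m1 here: m1 now holds the v-overlap coefficients)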
; grain = grain_lut[offy+y][offx+x]
movu m3, [grain_lutq+offxyq]
@@ -2753,17 +2866,22 @@
%else
movu m4, [grain_lutq+top_offxyq]
%endif
- punpckhbw m1, m4, m3
+ punpckhbw m6, m4, m3
punpcklbw m4, m3
- pmaddubsw m2, m9, m1
+%if %3
+ pmaddubsw m2, m9, m6
pmaddubsw m3, m9, m4
+%else
+ pmaddubsw m2, m1, m6
+ pmaddubsw m3, m1, m4
+%endif
pmulhrsw m2, m8
pmulhrsw m3, m8
packsswb m3, m2
- pxor m1, m1
- pcmpgtb m1, m3
- punpcklbw m2, m3, m1
- punpckhbw m3, m1
+ pxor m6, m6
+ pcmpgtb m6, m3
+ punpcklbw m2, m3, m6
+ punpckhbw m3, m6
; noise = round2(scaling[luma_src] * grain, scaling_shift)
pmullw m2, m7
@@ -2773,7 +2891,7 @@
; unpack chroma_source
pxor m4, m4
- punpckhbw m1, m0, m4
+ punpckhbw m6, m0, m4
punpcklbw m0, m4 ; m0-1: src as word
%if ARCH_X86_32
@@ -2782,12 +2900,12 @@
; dst = clip_pixel(src, noise)
paddw m0, m2
- paddw m1, m3
+ paddw m6, m3
pmaxsw m0, m13
- pmaxsw m1, m13
+ pmaxsw m6, m13
pminsw m0, m12
- pminsw m1, m12
- packuswb m0, m1
+ pminsw m6, m12
+ packuswb m0, m6
movifnidn dstq, dstmp
mova [dstq+srcq], m0
@@ -2797,10 +2915,24 @@
add srcq, r2mp
; lumaq has already been incremented above
%else
- add srcq, r10mp
+ add srcq, r12mp
+%if %3
lea lumaq, [lumaq+lstrideq*2]
+%else
+ add lumaq, lstrideq
%endif
+%endif
add grain_lutq, 82
+%if %3 == 0
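+    ; without vertical subsampling the top overlap spans two rows; bit 16
+    ; of hd flags the second row, which is blended with the pb_17_27 weights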
+ btc hd, 16
+%if ARCH_X86_32
+ mov r5, r5m
+ mova m1, [base+pb_17_27]
+%else
+ mova m1, [pb_17_27]
+%endif
+ jnc %%loop_y_v_overlap
+%endif
jmp %%loop_y
%%end_y_v_overlap:
@@ -2817,7 +2949,7 @@
%else
mov srcq, r11mp
%endif
- lea lumaq, [luma_bakq+wq*2]
+ lea lumaq, [luma_bakq+wq*(1+%2)]
add srcq, wq
%if ARCH_X86_32
mov r4m, wq
@@ -2824,9 +2956,20 @@
mov r9m, lumaq
%endif
+%if %2
; since fg_dataq.overlap is guaranteed to be set, we never jump
; back to .loop_x_v_overlap, and instead always fall-through to
; h+v overlap
+%else
+%if ARCH_X86_32
+ add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+ add top_offxyd, 16
+%endif
+ add offxyd, 16
+ btc dword r8m, 2
+ jnc %%loop_x_odd_v_overlap
+%endif
%%loop_x_hv_overlap:
%if ARCH_X86_32
@@ -2833,9 +2976,13 @@
DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, unused
mov r6, [rsp+8*mmsize+1*gprsize]
+%if %2
lea r0, [r3d+16]
add r6, 16
mov [rsp+8*mmsize+0*gprsize], r0 ; left_offxy
+%else
+ mov [rsp+8*mmsize+0*gprsize], r3 ; left_offxy
+%endif
mov [rsp+8*mmsize+2*gprsize], r6 ; topleft_offxy
DEFINE_ARGS tmp, src, scaling, see, w, picptr, unused
@@ -2846,8 +2993,13 @@
DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
tmp, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride
+%if %2
lea topleft_offxyq, [top_offxyq+16]
lea left_offxyq, [offxyq+16]
+%else
+ mov topleft_offxyq, top_offxyq
+ mov left_offxyq, offxyq
+%endif
; we assume from the block above that bits 8-15 of tmpd are zero'ed
%endif
@@ -2881,9 +3033,9 @@
ror offxd, 12
and offyd, 0xf000f
and offxd, 0xf000f
- imul offyd, 82
+ imul offyd, 164>>%3
; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
- lea offyq, [offyq+offxq+0x10001*498+16*82]
+ lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
%if ARCH_X86_32
DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
@@ -2900,6 +3052,12 @@
mov hd, r7m
mov grain_lutq, grain_lutmp
+%if ARCH_X86_32
+ mov r5, r5m
+ mova m3, [base+pb_27_17]
+%else
+ mova m3, [pb_27_17]
+%endif
%%loop_y_hv_overlap:
; src
%if ARCH_X86_32
@@ -2907,6 +3065,7 @@
mov lumaq, r9mp
%endif
+%if %2
mova m4, [lumaq+ 0]
mova m6, [lumaq+16]
mova m0, [srcq]
@@ -2924,9 +3083,20 @@
pmaddubsw m6, m7
pavgw m4, m2
pavgw m6, m2
+%else
+ mova m4, [lumaq]
+ mova m0, [srcq]
+%if ARCH_X86_32
+ add lumaq, r10mp
+ mov r9mp, lumaq
+%endif
+ pxor m2, m2
+%endif
%if %1
+%if %2
packuswb m4, m6 ; luma
+%endif
punpckhbw m6, m4, m0
punpcklbw m4, m0 ; { luma, chroma }
pmaddubsw m6, m14
@@ -2938,6 +3108,9 @@
packuswb m4, m6 ; pack+unpack = clip
punpckhbw m6, m4, m2
punpcklbw m4, m2
+%elif %2 == 0
+ punpckhbw m6, m4, m2
+ punpcklbw m4, m2
%endif
; scaling[src]
@@ -2946,9 +3119,14 @@
vpgatherdw m5, m6, scalingq, r0, r5
%else
movd m1, [grain_lutq+topleft_offxyq]
+%if %3
vpgatherdw m7, m4, scalingq, r2, r12
vpgatherdw m5, m6, scalingq, r2, r12
+%else
+ vpgatherdw m7, m4, scalingq, r2, r13
+ vpgatherdw m5, m6, scalingq, r2, r13
%endif
+%endif
pcmpeqw m2, m2
psrlw m2, 8
pand m7, m2
@@ -2961,7 +3139,7 @@
movd m1, [grain_lutq+r0]
mov r0, [rsp+8*mmsize+0*gprsize] ; left_offxy
%endif
- movu m3, [grain_lutq+offxyq]
+ movu m2, [grain_lutq+offxyq]
%if ARCH_X86_32
movu m6, [grain_lutq+r5]
movd m4, [grain_lutq+r0]
@@ -2971,23 +3149,32 @@
%endif
; do h interpolation first (so top | top/left -> top, left | cur -> cur)
punpcklbw m1, m6
- punpcklbw m4, m3
+ punpcklbw m4, m2
+%if %2
punpcklwd m4, m1
+%else
+ punpckldq m4, m1
+%endif
pmaddubsw m1, m9, m4
pmulhrsw m1, m8
packsswb m1, m1
- pandn m4, m10, m3
- pandn m3, m10, m6
- psrldq m6, m1, 1
+ pandn m4, m10, m2
+ pandn m2, m10, m6
+ psrldq m6, m1, 2-%2
pand m1, m10
pand m6, m10
por m4, m1
- por m3, m6
+ por m2, m6
; followed by v interpolation (top | cur -> cur)
- punpckhbw m1, m3, m4
- punpcklbw m3, m4
+ punpckhbw m1, m2, m4
+ punpcklbw m2, m4
+%if %3
pmaddubsw m4, m9, m1
- pmaddubsw m1, m9, m3
+ pmaddubsw m1, m9, m2
+%else
+ pmaddubsw m4, m3, m1
+ pmaddubsw m1, m3, m2
+%endif
pmulhrsw m4, m8
pmulhrsw m1, m8
packsswb m1, m4
@@ -3008,17 +3195,17 @@
; unpack chroma source
pxor m4, m4
- punpckhbw m3, m0, m4
+ punpckhbw m5, m0, m4
punpcklbw m0, m4 ; m0-1: src as word
; dst = clip_pixel(src, noise)
paddw m0, m2
- paddw m3, m1
+ paddw m5, m1
pmaxsw m0, m13
- pmaxsw m3, m13
+ pmaxsw m5, m13
pminsw m0, m12
- pminsw m3, m12
- packuswb m0, m3
+ pminsw m5, m12
+ packuswb m0, m5
movifnidn dstq, dstmp
mova [dstq+srcq], m0
@@ -3026,12 +3213,36 @@
add srcq, r2mp
; lumaq has been adjusted above already
%else
- add srcq, r10mp
- lea lumaq, [lumaq+lstrideq*2]
+ add srcq, r12mp
+%if %3
+ lea lumaq, [lumaq+lstrideq*(1+%2)]
+%else
+ add lumaq, r10mp
%endif
+%endif
add grain_lutq, 82
dec hw
+%if %3
jg %%loop_y_h_overlap
+%else
+ jle %%end_y_hv_overlap
+%if ARCH_X86_32
+ mov r5, r5m
+ mova m3, [base+pb_17_27]
+%else
+ mova m3, [pb_17_27]
+%endif
+ btc hd, 16
+ jnc %%loop_y_hv_overlap
+%if ARCH_X86_64
+ mov lstrideq, r10mp
+%endif
+ jmp %%loop_y_h_overlap
+%%end_y_hv_overlap:
+%if ARCH_X86_64
+ mov lstrideq, r10mp
+%endif
+%endif
%if ARCH_X86_32
DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
@@ -3046,18 +3257,44 @@
%else
mov srcq, r11mp
%endif
- lea lumaq, [luma_bakq+wq*2]
+ lea lumaq, [luma_bakq+wq*(1+%2)]
add srcq, wq
%if ARCH_X86_32
mov r4m, wq
mov r9m, lumaq
%endif
+%if %2
jmp %%loop_x_hv_overlap
+%else
+%if ARCH_X86_32
+ add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+ add top_offxyd, 16
+%endif
+ add offxyd, 16
+ xor dword r8m, 4
+ jmp %%loop_x_odd_v_overlap
+%endif
%%end_hv:
RET
%endmacro
- FGUV_32x32xN_LOOP 1
+ %%FGUV_32x32xN_LOOP 1, %2, %3
.csfl:
- FGUV_32x32xN_LOOP 0
+ %%FGUV_32x32xN_LOOP 0, %2, %3
+%endmacro
+
+FGUV_FN 420, 1, 1
+
+%if STACK_ALIGNMENT < mmsize
+DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
+%endif
+
+FGUV_FN 422, 1, 0
+
+%if STACK_ALIGNMENT < mmsize
+DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
+%endif
+
+FGUV_FN 444, 0, 0