shithub: dav1d

Download patch

ref: 4687c4696f780207bbb7f8d14038302f0e9c63bd
parent: b73acaa894ac4bb66d04d27919e436c158be7fe8
author: Ronald S. Bultje <[email protected]>
date: Sun Mar 29 11:14:38 EDT 2020

x86: add SSSE3 versions for filmgrain.fguv_32x32xn[422/444]

fguv_32x32xn_8bpc_420_csfl0_c: 14568.2
fguv_32x32xn_8bpc_420_csfl0_ssse3: 1162.3
fguv_32x32xn_8bpc_420_csfl1_c: 10682.0
fguv_32x32xn_8bpc_420_csfl1_ssse3: 910.3
fguv_32x32xn_8bpc_422_csfl0_c: 16370.5
fguv_32x32xn_8bpc_422_csfl0_ssse3: 1202.6
fguv_32x32xn_8bpc_422_csfl1_c: 11333.8
fguv_32x32xn_8bpc_422_csfl1_ssse3: 958.8
fguv_32x32xn_8bpc_444_csfl0_c: 12950.1
fguv_32x32xn_8bpc_444_csfl0_ssse3: 1133.6
fguv_32x32xn_8bpc_444_csfl1_c: 8806.7
fguv_32x32xn_8bpc_444_csfl1_ssse3: 731.0

--- a/src/x86/film_grain_init_tmpl.c
+++ b/src/x86/film_grain_init_tmpl.c
@@ -34,6 +34,8 @@
 decl_generate_grain_uv_fn(dav1d_generate_grain_uv_444_ssse3);
 decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_ssse3);
 decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_ssse3);
+decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i422_ssse3);
+decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i444_ssse3);
 
 decl_generate_grain_y_fn(dav1d_generate_grain_y_avx2);
 decl_generate_grain_uv_fn(dav1d_generate_grain_uv_420_avx2);
@@ -56,6 +58,8 @@
     c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_generate_grain_uv_444_ssse3;
     c->fgy_32x32xn = dav1d_fgy_32x32xn_ssse3;
     c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_fguv_32x32xn_i420_ssse3;
+    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_fguv_32x32xn_i422_ssse3;
+    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_fguv_32x32xn_i444_ssse3;
 #endif
 
     if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
--- a/src/x86/film_grain_ssse3.asm
+++ b/src/x86/film_grain_ssse3.asm
@@ -2071,6 +2071,7 @@
 .end_hv:
     RET
 
+%macro FGUV_FN 3 ; name, ss_hor, ss_ver
 INIT_XMM ssse3
 %if ARCH_X86_32
 ; fguv_32x32xn_i420_ssse3(dst, src, stride, fg_data, w, scaling, grain_lut, h,
@@ -2077,7 +2078,7 @@
 ;                         sby, luma, lstride, uv_pl, is_id)
 %if STACK_ALIGNMENT < mmsize
 DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8
-cglobal fguv_32x32xn_i420, 0, 7, 8, 0 - (8 * mmsize + (13 + 3) * gprsize), \
+cglobal fguv_32x32xn_i%1, 0, 7, 8, 0 - (8 * mmsize + (13 + 3) * gprsize), \
         tmp, src, scaling, h, fg_data, picptr, unused
     mov              r0, r0m
     mov              r1, r2m
@@ -2100,7 +2101,7 @@
     mov [rsp+8*mmsize+13*gprsize], r2
     mov [rsp+8*mmsize+14*gprsize], r4
 %else
-cglobal fguv_32x32xn_i420, 0, 7, 8, 8 * mmsize + (4) * gprsize, \
+cglobal fguv_32x32xn_i%1, 0, 7, 8, 8 * mmsize + (4) * gprsize, \
         tmp, src, scaling, h, fg_data, picptr, unused
 %endif
     mov            srcq, srcm
@@ -2125,13 +2126,13 @@
 %define base r5-pb_mask
     mov             r5m, r5
 %else
-cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
-                                      grain_lut, tmp, sby, luma, lstride, uv_pl, is_id
+cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
+                                     grain_lut, tmp, sby, luma, lstride, uv_pl, is_id
     lea              r8, [pb_mask]
 %define base r8-pb_mask
 %endif
     mov             r6d, [fg_dataq+FGData.scaling_shift]
-    movd             m2, [base+byte_blend+3]
+    pcmpeqw          m2, m2
     movd             m3, [base+mul_bits+r6*2-14]
     mov             r6d, [fg_dataq+FGData.clip_to_restricted_range]
     lea            tmpd, [r6d*2]
@@ -2143,6 +2144,7 @@
     movd             m5, [base+min+r6*2]
     cmovne          r6d, tmpd
     movd             m4, [base+max+r6*2]
+    psrldq           m2, 14+%2
     punpcklwd        m3, m3
     punpcklwd        m5, m5
     punpcklwd        m4, m4
@@ -2157,7 +2159,7 @@
     cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
     jne .csfl
 
-%macro FGUV_32x32xN_LOOP 1 ; not-csfl
+%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
 %if ARCH_X86_32
     DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
 %else
@@ -2183,10 +2185,18 @@
     test       overlapd, overlapd
     jz %%no_vertical_overlap
 %if ARCH_X86_32
+%if %2
     movd             m1, [base+pb_23_22]
+%else
+    movd             m1, [base+pb_27_17_17_27]
+%endif
     mova             m0, [base+pw_1024]
 %else
+%if %2
     movd             m1, [pb_23_22]
+%else
+    movd             m1, [pb_27_17_17_27]
+%endif
     mova             m0, [pw_1024]
 %endif
     pshufd           m1, m1, q0000
@@ -2216,7 +2226,9 @@
 %define luma_bakq lumaq
 
     mov              wq, r4m
+%if %3
     shl           r10mp, 1
+%endif
 %else
     DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                 unused2, unused3, see, overlap, unused4, src_bak, lstride, luma_bak
@@ -2226,7 +2238,7 @@
 
     mov           lumaq, r9mp
     lea        src_bakq, [srcq+wq]
-    lea       luma_bakq, [lumaq+wq*2]
+    lea       luma_bakq, [lumaq+wq*(1+%2)]
     neg              wq
     sub            r0mp, srcq
 %if ARCH_X86_32
@@ -2237,7 +2249,7 @@
     DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
 %else
     mov           r11mp, src_bakq
-    mov           r10mp, strideq
+    mov           r12mp, strideq
 %endif
 
 %%loop_x:
@@ -2266,8 +2278,8 @@
     ror           offyd, 8
     shr           offxd, 12
     and           offyd, 0xf
-    imul          offyd, 82
-    lea           offyq, [offyq+offxq+498]  ; offy*stride+offx
+    imul          offyd, 164>>%3
+    lea           offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))]  ; offy*stride+offx
 
 %if ARCH_X86_32
     DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
@@ -2276,6 +2288,7 @@
                 h, offxy, see, overlap, unused1, unused2, lstride, luma_bak
 %endif
 
+%%loop_x_odd:
     mov              hd, r7m
     mov      grain_lutq, grain_lutmp
 %%loop_y:
@@ -2283,6 +2296,7 @@
 %if ARCH_X86_32
     mov           lumaq, r9mp
 %endif
+%if %2
     mova             m4, [lumaq+ 0]
     mova             m6, [lumaq+16]
     mova             m0, [srcq]
@@ -2300,9 +2314,20 @@
     pmaddubsw        m6, m7
     pavgw            m4, m2
     pavgw            m6, m2
+%else
+    mova             m4, [lumaq]
+    mova             m0, [srcq]
+%if ARCH_X86_32
+    add           lumaq, r10mp
+    mov            r9mp, lumaq
+%endif
+    pxor             m2, m2
+%endif
 
 %if %1
+%if %2
     packuswb         m4, m6                 ; luma
+%endif
     punpckhbw        m6, m4, m0
     punpcklbw        m4, m0                 ; { luma, chroma }
     pmaddubsw        m6, m14
@@ -2314,6 +2339,9 @@
     packuswb         m4, m6                 ; pack+unpack = clip
     punpckhbw        m6, m4, m2
     punpcklbw        m4, m2
+%elif %2 == 0
+    punpckhbw        m6, m4, m2
+    punpcklbw        m4, m2
 %endif
 
     ; scaling[luma_src]
@@ -2364,9 +2392,13 @@
     add            srcq, r2mp
     ; we already incremented lumaq above
 %else
-    add            srcq, r10mp
+    add            srcq, r12mp
+%if %3
     lea           lumaq, [lumaq+lstrideq*2]
+%else
+    add           lumaq, lstrideq
 %endif
+%endif
     add      grain_lutq, 82
     dec              hw
     jg %%loop_y
@@ -2384,12 +2416,27 @@
 %else
     mov            srcq, r11mp
 %endif
-    lea           lumaq, [luma_bakq+wq*2]
+    lea           lumaq, [luma_bakq+wq*(1+%2)]
     add            srcq, wq
 %if ARCH_X86_32
     mov             r4m, wq
     mov             r9m, lumaq
 %endif
+%if %2 == 0
+    ; adjust top_offxy
+%if ARCH_X86_32
+    add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+    add            r11d, 16
+%endif
+    add          offxyd, 16
+    btc       dword r8m, 2
+    jc %%loop_x_even
+    test      dword r8m, 2
+    jz %%loop_x_odd
+    jmp %%loop_x_odd_v_overlap
+%%loop_x_even:
+%endif
     test      dword r8m, 1
     jz %%loop_x
 
@@ -2400,8 +2447,12 @@
     ; horizontal overlap (without vertical overlap)
 %%loop_x_h_overlap:
 %if ARCH_X86_32
+%if %2
     lea              r6, [offxyd+16]
     mov [rsp+8*mmsize+0*gprsize], r6
+%else
+    mov [rsp+8*mmsize+0*gprsize], offxyd
+%endif
 
     DEFINE_ARGS luma, src, scaling, see, w, picptr, grain_lut
 
@@ -2410,8 +2461,12 @@
     DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                 offx, offy, see, left_offxy, unused1, unused2, lstride
 
+%if %2
     lea     left_offxyd, [offyd+16]         ; previous column's offy*stride+offx
+%else
+    mov     left_offxyd, offyd
 %endif
+%endif
     mov             r6d, seed
     or             seed, 0xEFF4
     shr             r6d, 1
@@ -2435,8 +2490,8 @@
     ror           offyd, 8
     shr           offxd, 12
     and           offyd, 0xf
-    imul          offyd, 82
-    lea           offyq, [offyq+offxq+498]  ; offy*stride+offx
+    imul          offyd, 164>>%3
+    lea           offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)]  ; offy*stride+offx
 
 %if ARCH_X86_32
     DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
@@ -2452,6 +2507,7 @@
 %if ARCH_X86_32
     mov           lumaq, r9mp
 %endif
+%if %2
     mova             m4, [lumaq+ 0]
     mova             m6, [lumaq+16]
     mova             m0, [srcq]
@@ -2469,9 +2525,20 @@
     pmaddubsw        m6, m7
     pavgw            m4, m2
     pavgw            m6, m2
+%else
+    mova             m4, [lumaq]
+    mova             m0, [srcq]
+%if ARCH_X86_32
+    add           lumaq, r10mp
+    mov            r9mp, lumaq
+%endif
+    pxor             m2, m2
+%endif
 
 %if %1
+%if %2
     packuswb         m4, m6                 ; luma
+%endif
     punpckhbw        m6, m4, m0
     punpcklbw        m4, m0                 ; { luma, chroma }
     pmaddubsw        m6, m14
@@ -2483,6 +2550,9 @@
     packuswb         m4, m6                 ; pack+unpack = clip
     punpckhbw        m6, m4, m2
     punpcklbw        m4, m2
+%elif %2 == 0
+    punpckhbw        m6, m4, m2
+    punpcklbw        m4, m2
 %endif
 
     ; scaling[luma_src]
@@ -2547,9 +2617,13 @@
     add            srcq, r2mp
     ; lumaq has already been incremented above
 %else
-    add            srcq, r10mp
+    add            srcq, r12mp
+%if %3
     lea           lumaq, [lumaq+lstrideq*2]
+%else
+    add           lumaq, lstrideq
 %endif
+%endif
     add      grain_lutq, 82
     dec              hw
     jg %%loop_y_h_overlap
@@ -2567,17 +2641,32 @@
 %else
     mov            srcq, r11mp
 %endif
-    lea           lumaq, [luma_bakq+wq*2]
+    lea           lumaq, [luma_bakq+wq*(1+%2)]
     add            srcq, wq
 %if ARCH_X86_32
     mov             r4m, wq
     mov             r9m, lumaq
 %endif
+%if %2 == 0
+    xor       dword r8m, 4
+    ; adjust top_offxyd
+%if ARCH_X86_32
+    add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+    add            r11d, 16
+%endif
+    add          offxyd, 16
+%endif
 
     ; r8m = sbym
     test      dword r8m, 2
+%if %2
     jne %%loop_x_hv_overlap
     jmp %%loop_x_h_overlap
+%else
+    jne %%loop_x_odd_v_overlap
+    jmp %%loop_x_odd
+%endif
 
 %%end:
     RET
@@ -2612,7 +2701,9 @@
 
     mov             r3m, seed
     mov              wq, r4m
+%if %3
     shl           r10mp, 1
+%endif
 %else
     xor            seed, sbyd               ; (cur_seed << 16) | top_seed
 
@@ -2624,7 +2715,7 @@
 
     mov           lumaq, r9mp
     lea        src_bakq, [srcq+wq]
-    lea       luma_bakq, [lumaq+wq*2]
+    lea       luma_bakq, [lumaq+wq*(1+%2)]
     neg              wq
     sub            r0mp, srcq
 %if ARCH_X86_32
@@ -2635,7 +2726,7 @@
     DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
 %else
     mov           r11mp, src_bakq
-    mov           r10mp, strideq
+    mov           r12mp, strideq
 %endif
 
 %%loop_x_v_overlap:
@@ -2674,9 +2765,9 @@
     ror           offxd, 12
     and           offyd, 0xf000f
     and           offxd, 0xf000f
-    imul          offyd, 82
+    imul          offyd, 164>>%3
     ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
-    lea           offyq, [offyq+offxq+0x10001*498+16*82]
+    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
 
 %if ARCH_X86_32
     DEFINE_ARGS tmp, src, scaling, offxy, h, picptr, top_offxy
@@ -2693,12 +2784,20 @@
     DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
 %endif
 
+%%loop_x_odd_v_overlap:
     mov              hd, r7m
     mov      grain_lutq, grain_lutmp
+%if ARCH_X86_32
+    mov              r5, r5m
+    mova             m1, [base+pb_27_17]
+%else
+    mova             m1, [pb_27_17]
+%endif
 %%loop_y_v_overlap:
 %if ARCH_X86_32
     mov           lumaq, r9mp
 %endif
+%if %2
     mova             m4, [lumaq+ 0]
     mova             m6, [lumaq+16]
     mova             m0, [srcq]
@@ -2716,9 +2815,20 @@
     pmaddubsw        m6, m7
     pavgw            m4, m2
     pavgw            m6, m2
+%else
+    mova             m4, [lumaq]
+    mova             m0, [srcq]
+%if ARCH_X86_32
+    add           lumaq, r10mp
+    mov            r9mp, lumaq
+%endif
+    pxor             m2, m2
+%endif
 
 %if %1
+%if %2
     packuswb         m4, m6                 ; luma
+%endif
     punpckhbw        m6, m4, m0
     punpcklbw        m4, m0                 ; { luma, chroma }
     pmaddubsw        m6, m14
@@ -2730,6 +2840,9 @@
     packuswb         m4, m6                 ; pack+unpack = clip
     punpckhbw        m6, m4, m2
     punpcklbw        m4, m2
+%elif %2 == 0
+    punpckhbw        m6, m4, m2
+    punpcklbw        m4, m2
 %endif
 
     ; scaling[luma_src]
@@ -2740,10 +2853,10 @@
     vpgatherdw       m7, m4, scalingq, r12, r2
     vpgatherdw       m5, m6, scalingq, r12, r2
 %endif
-    pcmpeqw          m1, m1
-    psrlw            m1, 8
-    pand             m7, m1
-    pand             m5, m1
+    pcmpeqw          m4, m4
+    psrlw            m4, 8
+    pand             m7, m4
+    pand             m5, m4
 
     ; grain = grain_lut[offy+y][offx+x]
     movu             m3, [grain_lutq+offxyq]
@@ -2753,17 +2866,22 @@
 %else
     movu             m4, [grain_lutq+top_offxyq]
 %endif
-    punpckhbw        m1, m4, m3
+    punpckhbw        m6, m4, m3
     punpcklbw        m4, m3
-    pmaddubsw        m2, m9, m1
+%if %3
+    pmaddubsw        m2, m9, m6
     pmaddubsw        m3, m9, m4
+%else
+    pmaddubsw        m2, m1, m6
+    pmaddubsw        m3, m1, m4
+%endif
     pmulhrsw         m2, m8
     pmulhrsw         m3, m8
     packsswb         m3, m2
-    pxor             m1, m1
-    pcmpgtb          m1, m3
-    punpcklbw        m2, m3, m1
-    punpckhbw        m3, m1
+    pxor             m6, m6
+    pcmpgtb          m6, m3
+    punpcklbw        m2, m3, m6
+    punpckhbw        m3, m6
 
     ; noise = round2(scaling[luma_src] * grain, scaling_shift)
     pmullw           m2, m7
@@ -2773,7 +2891,7 @@
 
     ; unpack chroma_source
     pxor             m4, m4
-    punpckhbw        m1, m0, m4
+    punpckhbw        m6, m0, m4
     punpcklbw        m0, m4                 ; m0-1: src as word
 
 %if ARCH_X86_32
@@ -2782,12 +2900,12 @@
 
     ; dst = clip_pixel(src, noise)
     paddw            m0, m2
-    paddw            m1, m3
+    paddw            m6, m3
     pmaxsw           m0, m13
-    pmaxsw           m1, m13
+    pmaxsw           m6, m13
     pminsw           m0, m12
-    pminsw           m1, m12
-    packuswb         m0, m1
+    pminsw           m6, m12
+    packuswb         m0, m6
     movifnidn      dstq, dstmp
     mova    [dstq+srcq], m0
 
@@ -2797,10 +2915,24 @@
     add            srcq, r2mp
     ; lumaq has already been incremented above
 %else
-    add            srcq, r10mp
+    add            srcq, r12mp
+%if %3
     lea           lumaq, [lumaq+lstrideq*2]
+%else
+    add           lumaq, lstrideq
 %endif
+%endif
     add      grain_lutq, 82
+%if %3 == 0
+    btc              hd, 16
+%if ARCH_X86_32
+    mov              r5, r5m
+    mova             m1, [base+pb_17_27]
+%else
+    mova             m1, [pb_17_27]
+%endif
+    jnc %%loop_y_v_overlap
+%endif
     jmp %%loop_y
 
 %%end_y_v_overlap:
@@ -2817,7 +2949,7 @@
 %else
     mov            srcq, r11mp
 %endif
-    lea           lumaq, [luma_bakq+wq*2]
+    lea           lumaq, [luma_bakq+wq*(1+%2)]
     add            srcq, wq
 %if ARCH_X86_32
     mov             r4m, wq
@@ -2824,9 +2956,20 @@
     mov             r9m, lumaq
 %endif
 
+%if %2
     ; since fg_dataq.overlap is guaranteed to be set, we never jump
     ; back to .loop_x_v_overlap, and instead always fall-through to
     ; h+v overlap
+%else
+%if ARCH_X86_32
+    add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+    add      top_offxyd, 16
+%endif
+    add          offxyd, 16
+    btc       dword r8m, 2
+    jnc %%loop_x_odd_v_overlap
+%endif
 
 %%loop_x_hv_overlap:
 %if ARCH_X86_32
@@ -2833,9 +2976,13 @@
     DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, unused
 
     mov              r6, [rsp+8*mmsize+1*gprsize]
+%if %2
     lea              r0, [r3d+16]
     add              r6, 16
     mov [rsp+8*mmsize+0*gprsize], r0        ; left_offxy
+%else
+    mov [rsp+8*mmsize+0*gprsize], r3        ; left_offxy
+%endif
     mov [rsp+8*mmsize+2*gprsize], r6        ; topleft_offxy
 
     DEFINE_ARGS tmp, src, scaling, see, w, picptr, unused
@@ -2846,8 +2993,13 @@
     DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                 tmp, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride
 
+%if %2
     lea  topleft_offxyq, [top_offxyq+16]
     lea     left_offxyq, [offxyq+16]
+%else
+    mov  topleft_offxyq, top_offxyq
+    mov     left_offxyq, offxyq
+%endif
 
     ; we assume from the block above that bits 8-15 of tmpd are zero'ed
 %endif
@@ -2881,9 +3033,9 @@
     ror           offxd, 12
     and           offyd, 0xf000f
     and           offxd, 0xf000f
-    imul          offyd, 82
+    imul          offyd, 164>>%3
     ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
-    lea           offyq, [offyq+offxq+0x10001*498+16*82]
+    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
 
 %if ARCH_X86_32
     DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
@@ -2900,6 +3052,12 @@
 
     mov              hd, r7m
     mov      grain_lutq, grain_lutmp
+%if ARCH_X86_32
+    mov              r5, r5m
+    mova             m3, [base+pb_27_17]
+%else
+    mova             m3, [pb_27_17]
+%endif
 %%loop_y_hv_overlap:
     ; src
 %if ARCH_X86_32
@@ -2907,6 +3065,7 @@
 
     mov           lumaq, r9mp
 %endif
+%if %2
     mova             m4, [lumaq+ 0]
     mova             m6, [lumaq+16]
     mova             m0, [srcq]
@@ -2924,9 +3083,20 @@
     pmaddubsw        m6, m7
     pavgw            m4, m2
     pavgw            m6, m2
+%else
+    mova             m4, [lumaq]
+    mova             m0, [srcq]
+%if ARCH_X86_32
+    add           lumaq, r10mp
+    mov            r9mp, lumaq
+%endif
+    pxor             m2, m2
+%endif
 
 %if %1
+%if %2
     packuswb         m4, m6                 ; luma
+%endif
     punpckhbw        m6, m4, m0
     punpcklbw        m4, m0                 ; { luma, chroma }
     pmaddubsw        m6, m14
@@ -2938,6 +3108,9 @@
     packuswb         m4, m6                 ; pack+unpack = clip
     punpckhbw        m6, m4, m2
     punpcklbw        m4, m2
+%elif %2 == 0
+    punpckhbw        m6, m4, m2
+    punpcklbw        m4, m2
 %endif
 
     ; scaling[src]
@@ -2946,9 +3119,14 @@
     vpgatherdw       m5, m6, scalingq, r0, r5
 %else
     movd             m1, [grain_lutq+topleft_offxyq]
+%if %3
     vpgatherdw       m7, m4, scalingq, r2, r12
     vpgatherdw       m5, m6, scalingq, r2, r12
+%else
+    vpgatherdw       m7, m4, scalingq, r2, r13
+    vpgatherdw       m5, m6, scalingq, r2, r13
 %endif
+%endif
     pcmpeqw          m2, m2
     psrlw            m2, 8
     pand             m7, m2
@@ -2961,7 +3139,7 @@
     movd             m1, [grain_lutq+r0]
     mov              r0, [rsp+8*mmsize+0*gprsize]       ; left_offxy
 %endif
-    movu             m3, [grain_lutq+offxyq]
+    movu             m2, [grain_lutq+offxyq]
 %if ARCH_X86_32
     movu             m6, [grain_lutq+r5]
     movd             m4, [grain_lutq+r0]
@@ -2971,23 +3149,32 @@
 %endif
     ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
     punpcklbw        m1, m6
-    punpcklbw        m4, m3
+    punpcklbw        m4, m2
+%if %2
     punpcklwd        m4, m1
+%else
+    punpckldq        m4, m1
+%endif
     pmaddubsw        m1, m9, m4
     pmulhrsw         m1, m8
     packsswb         m1, m1
-    pandn            m4, m10, m3
-    pandn            m3, m10, m6
-    psrldq           m6, m1, 1
+    pandn            m4, m10, m2
+    pandn            m2, m10, m6
+    psrldq           m6, m1, 2-%2
     pand             m1, m10
     pand             m6, m10
     por              m4, m1
-    por              m3, m6
+    por              m2, m6
     ; followed by v interpolation (top | cur -> cur)
-    punpckhbw        m1, m3, m4
-    punpcklbw        m3, m4
+    punpckhbw        m1, m2, m4
+    punpcklbw        m2, m4
+%if %3
     pmaddubsw        m4, m9, m1
-    pmaddubsw        m1, m9, m3
+    pmaddubsw        m1, m9, m2
+%else
+    pmaddubsw        m4, m3, m1
+    pmaddubsw        m1, m3, m2
+%endif
     pmulhrsw         m4, m8
     pmulhrsw         m1, m8
     packsswb         m1, m4
@@ -3008,17 +3195,17 @@
 
     ; unpack chroma source
     pxor             m4, m4
-    punpckhbw        m3, m0, m4
+    punpckhbw        m5, m0, m4
     punpcklbw        m0, m4                 ; m0-1: src as word
 
     ; dst = clip_pixel(src, noise)
     paddw            m0, m2
-    paddw            m3, m1
+    paddw            m5, m1
     pmaxsw           m0, m13
-    pmaxsw           m3, m13
+    pmaxsw           m5, m13
     pminsw           m0, m12
-    pminsw           m3, m12
-    packuswb         m0, m3
+    pminsw           m5, m12
+    packuswb         m0, m5
     movifnidn      dstq, dstmp
     mova    [dstq+srcq], m0
 
@@ -3026,12 +3213,36 @@
     add            srcq, r2mp
     ; lumaq has been adjusted above already
 %else
-    add            srcq, r10mp
-    lea           lumaq, [lumaq+lstrideq*2]
+    add            srcq, r12mp
+%if %3
+    lea           lumaq, [lumaq+lstrideq*(1+%2)]
+%else
+    add           lumaq, r10mp
 %endif
+%endif
     add      grain_lutq, 82
     dec              hw
+%if %3
     jg %%loop_y_h_overlap
+%else
+    jle %%end_y_hv_overlap
+%if ARCH_X86_32
+    mov              r5, r5m
+    mova             m3, [base+pb_17_27]
+%else
+    mova             m3, [pb_17_27]
+%endif
+    btc              hd, 16
+    jnc %%loop_y_hv_overlap
+%if ARCH_X86_64
+    mov        lstrideq, r10mp
+%endif
+    jmp %%loop_y_h_overlap
+%%end_y_hv_overlap:
+%if ARCH_X86_64
+    mov        lstrideq, r10mp
+%endif
+%endif
 
 %if ARCH_X86_32
     DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
@@ -3046,18 +3257,44 @@
 %else
     mov            srcq, r11mp
 %endif
-    lea           lumaq, [luma_bakq+wq*2]
+    lea           lumaq, [luma_bakq+wq*(1+%2)]
     add            srcq, wq
 %if ARCH_X86_32
     mov             r4m, wq
     mov             r9m, lumaq
 %endif
+%if %2
     jmp %%loop_x_hv_overlap
+%else
+%if ARCH_X86_32
+    add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+    add      top_offxyd, 16
+%endif
+    add          offxyd, 16
+    xor       dword r8m, 4
+    jmp %%loop_x_odd_v_overlap
+%endif
 
 %%end_hv:
     RET
 %endmacro
 
-    FGUV_32x32xN_LOOP 1
+    %%FGUV_32x32xN_LOOP 1, %2, %3
 .csfl:
-    FGUV_32x32xN_LOOP 0
+    %%FGUV_32x32xN_LOOP 0, %2, %3
+%endmacro
+
+FGUV_FN 420, 1, 1                      ; name=420, ss_hor=1, ss_ver=1 -> emits fguv_32x32xn_i420
+
+%if STACK_ALIGNMENT < mmsize
+DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 ; NOTE(review): presumably resets the arg 0-12 definitions changed by the previous FGUV_FN expansion - confirm against x86inc.asm
+%endif
+
+FGUV_FN 422, 1, 0                      ; name=422, ss_hor=1, ss_ver=0 -> emits fguv_32x32xn_i422
+
+%if STACK_ALIGNMENT < mmsize
+DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 ; NOTE(review): same reset as above, before the next expansion - confirm against x86inc.asm
+%endif
+
+FGUV_FN 444, 0, 0                      ; name=444, ss_hor=0, ss_ver=0 -> emits fguv_32x32xn_i444