ref: f16b43cdfa2f3f2d5af36185819bebf1ca9c806d
dir: /src/x86/film_grain.asm/
; Copyright © 2019, VideoLAN and dav1d authors ; Copyright © 2019, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 32
pw_1024: times 16 dw 1024
; 16-entry byte table used as the pshufb source when deriving the next seed bits
pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0
; indexed with uv*4 in generate_grain_uv_420 (one word pair per value of uv)
pw_seed_xor: times 2 dw 0xb524
             times 2 dw 0x49d8
pd_m65536: dd ~0xffff
pb_23_22: times 2 db 23, 22
pb_1: times 4 db 1
hmul_bits: dw 32768, 16384, 8192, 4096
; indexed by FGData.grain_scale_shift (pmulhrsw multiplier)
round: dw 2048, 1024, 512
; indexed as mul_bits+scaling_shift*2-14 (pmulhrsw multiplier)
mul_bits: dw 256, 128, 64, 32, 16
; indexed as round_vals+ar_coeff_shift*2-12 (or -10 in the lag-3 inner loop)
round_vals: dw 32, 64, 128, 256, 512
; clip bounds, indexed from FGData.clip_to_restricted_range (and is_id for chroma)
max: dw 255, 240, 235
min: dw 0, 16
pb_27_17_17_27: db 27, 17, 17, 27
pw_1: dw 1

; JMP_TABLE name, suffix...
; Emits <name>_table: one dword per suffix holding the offset of the local
; label <name>.ar<suffix> relative to the start of the table.
%macro JMP_TABLE 1-*
    %xdefine %1_table %%table
    %xdefine %%base %1_table
    %xdefine %%prefix mangle(private_prefix %+ _%1)
    %%table:
    %rep %0 - 1
        dd %%prefix %+ .ar%2 - %%base
        %rotate 1
    %endrep
%endmacro

ALIGN 4
JMP_TABLE generate_grain_y_avx2, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_420_avx2, 0, 1, 2, 3

; Field offsets of the film grain parameter structure read by the code below.
; NOTE(review): presumably mirrors a C struct declared elsewhere - keep in sync.
struc FGData
    .seed:                      resd 1
    .num_y_points:              resd 1
    .y_points:                  resb 14 * 2
    .chroma_scaling_from_luma:  resd 1
    .num_uv_points:             resd 2
    .uv_points:                 resb 2 * 10 * 2
    .scaling_shift:             resd 1
    .ar_coeff_lag:              resd 1
    .ar_coeffs_y:               resb 24
    .ar_coeffs_uv:              resb 2 * 28 ; includes padding
    .ar_coeff_shift:            resq 1
    .grain_scale_shift:         resd 1
    .uv_mult:                   resd 2
    .uv_luma_mult:              resd 2
    .uv_offset:                 resd 2
    .overlap_flag:              resd 1
    .clip_to_restricted_range:  resd 1
endstruc

cextern gaussian_sequence

SECTION .text

;------------------------------------------------------------------------------
; generate_grain_y(buf, fg_data)
;
; Phase 1 (.loop): produces 4 bytes per iteration for the byte offsets
; [0, 73*82) of buf (the last iteration stores a full dword). Each iteration
; derives the 4 next 16-bit seeds, gathers gaussian_sequence[seed >> 5]
; (16-bit entries), scales with pmulhrsw by round[grain_scale_shift] and packs
; to signed bytes.
; Phase 2: jumps through generate_grain_y_avx2_table on fg_data.ar_coeff_lag
; to .ar0/.ar1/.ar2/.ar3. .ar0 returns immediately; the others filter 70 rows
; of 76 bytes in place (row pitch 82), starting at row 3, column 3.
;------------------------------------------------------------------------------
INIT_XMM avx2
cglobal generate_grain_y, 2, 9, 16, buf, fg_data
    lea          r4, [pb_mask]
%define base r4-pb_mask
    movq         xm1, [base+rnd_next_upperbit_mask]
    movq         xm4, [base+mul_bits]
    movq         xm7, [base+hmul_bits]
    mov          r2d, [fg_dataq+FGData.grain_scale_shift]
    vpbroadcastw xm8, [base+round+r2*2]
    mova         xm5, [base+pb_mask]
    vpbroadcastw xm0, [fg_dataq+FGData.seed]
    vpbroadcastd xm9, [base+pd_m65536]
    mov          r2, -73*82
    sub          bufq, r2               ; bufq += 73*82; r2 counts up towards 0
    lea          r3, [gaussian_sequence]
.loop:
    pand         xm2, xm0, xm1
    psrlw        xm3, xm2, 10
    por          xm2, xm3               ; bits 0xf, 0x1e, 0x3c and 0x78 are set
    pmullw       xm2, xm4               ; bits 0x0f00 are set
    pshufb       xm2, xm5, xm2          ; set 15th bit for next 4 seeds
    psllq        xm6, xm2, 30
    por          xm2, xm6
    psllq        xm6, xm2, 15
    por          xm2, xm6               ; aggregate each bit into next seed's high bit
    pmulhuw      xm3, xm0, xm7
    por          xm2, xm3               ; 4 next output seeds
    pshuflw      xm0, xm2, q3333
    psrlw        xm2, 5
    pmovzxwd     xm3, xm2
    mova         xm6, xm9               ; gather mask (vpgatherdd clears its mask register)
    vpgatherdd   xm2, [r3+xm3*2], xm6
    pandn        xm2, xm9, xm2          ; keep the low 16 bits of each gathered dword
    packusdw     xm2, xm2
    pmulhrsw     xm2, xm8
    packsswb     xm2, xm2
    movd         [bufq+r2], xm2
    add          r2, 4
    jl .loop

    ; auto-regression code
    movsxd       r2, [fg_dataq+FGData.ar_coeff_lag]
    movsxd       r2, [base+generate_grain_y_avx2_table+r2*4]
    lea          r2, [r2+base+generate_grain_y_avx2_table]
    jmp          r2

.ar1:
    ; lag 1: the three top neighbours are handled 4 pixels at a time in SIMD,
    ; the left neighbour is carried in the scalar register val3
    DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0
    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movsx        cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
    movd         xm4, [fg_dataq+FGData.ar_coeffs_y]
    DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0
    pinsrb       xm4, [pb_1], 3         ; replace cf3 with 1, it multiplies the rounding word
    pmovsxbw     xm4, xm4
    pshufd       xm5, xm4, q1111
    pshufd       xm4, xm4, q0000
    vpbroadcastw xm3, [base+round_vals+shiftq*2-12]    ; rnd
    sub          bufq, 82*73-(82*3+79)
    mov          hd, 70
    mov          mind, -128
    mov          maxd, 127
.y_loop_ar1:
    mov          xq, -76
    movsx        val3d, byte [bufq+xq-1]
.x_loop_ar1:
    pmovsxbw     xm0, [bufq+xq-82-1]    ; top/left
    pmovsxbw     xm2, [bufq+xq-82+0]    ; top
    pmovsxbw     xm1, [bufq+xq-82+1]    ; top/right
    punpcklwd    xm0, xm2
    punpcklwd    xm1, xm3
    pmaddwd      xm0, xm4
    pmaddwd      xm1, xm5
    paddd        xm0, xm1
.x_loop_ar1_inner:
    movd         val0d, xm0
    psrldq       xm0, 4
    imul         val3d, cf3d
    add          val3d, val0d
%if WIN64
    sarx         val3d, val3d, shiftd
%else
    sar          val3d, shiftb
%endif
    movsx        val0d, byte [bufq+xq]
    add          val3d, val0d
    cmp          val3d, maxd
    cmovg        val3d, maxd
    cmp          val3d, mind
    cmovl        val3d, mind
    mov          byte [bufq+xq], val3b
    ; keep val3d in-place as left for next x iteration
    inc          xq
    jz .x_loop_ar1_end
    test         xq, 3
    jnz .x_loop_ar1_inner
    jmp .x_loop_ar1

.x_loop_ar1_end:
    add          bufq, 82
    dec          hd
    jg .y_loop_ar1
.ar0:
    RET

.ar2:
    ; lag 2: coefficients are pre-broadcast as word pairs in xm8-xm13
    DEFINE_ARGS buf, fg_data, shift
    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    vpbroadcastw xm14, [base+round_vals-12+shiftq*2]
    movq         xm15, [base+byte_blend+1]
    pmovsxbw     xm8, [fg_dataq+FGData.ar_coeffs_y+0]    ; cf0-7
    movd         xm9, [fg_dataq+FGData.ar_coeffs_y+8]    ; cf8-11
    pmovsxbw     xm9, xm9
    DEFINE_ARGS buf, fg_data, h, x
    pshufd       xm12, xm9, q0000
    pshufd       xm13, xm9, q1111
    pshufd       xm11, xm8, q3333
    pshufd       xm10, xm8, q2222
    pshufd       xm9, xm8, q1111
    pshufd       xm8, xm8, q0000
    pmovzxwd     xm14, xm14             ; rounding value widened to dwords
    sub          bufq, 82*73-(82*3+79)
    mov          hd, 70
.y_loop_ar2:
    mov          xq, -76

.x_loop_ar2:
    pmovsxbw     xm0, [bufq+xq-82*2-2]  ; y=-2,x=[-2,+5]
    pmovsxbw     xm1, [bufq+xq-82*1-2]  ; y=-1,x=[-2,+5]
    psrldq       xm2, xm0, 2            ; y=-2,x=[-1,+5]
    psrldq       xm3, xm1, 2            ; y=-1,x=[-1,+5]
    psrldq       xm4, xm1, 4            ; y=-1,x=[+0,+5]
    punpcklwd    xm2, xm0, xm2
    punpcklwd    xm3, xm4
    pmaddwd      xm2, xm8
    pmaddwd      xm3, xm11
    paddd        xm2, xm3

    psrldq       xm4, xm0, 4            ; y=-2,x=[+0,+5]
    psrldq       xm5, xm0, 6            ; y=-2,x=[+1,+5]
    psrldq       xm6, xm0, 8            ; y=-2,x=[+2,+5]
    punpcklwd    xm4, xm5
    punpcklwd    xm6, xm1
    psrldq       xm7, xm1, 6            ; y=-1,x=[+1,+5]
    psrldq       xm1, xm1, 8            ; y=-1,x=[+2,+5]
    punpcklwd    xm7, xm1
    pmaddwd      xm4, xm9
    pmaddwd      xm6, xm10
    pmaddwd      xm7, xm12
    paddd        xm4, xm6
    paddd        xm2, xm7
    paddd        xm2, xm4
    paddd        xm2, xm14

    movq         xm0, [bufq+xq-2]       ; y=0,x=[-2,+5]
.x_loop_ar2_inner:
    pmovsxbw     xm1, xm0
    pmaddwd      xm3, xm1, xm13
    paddd        xm3, xm2
    psrldq       xm1, 4                 ; y=0,x=0
    psrldq       xm2, 4                 ; shift top to next pixel
    psrad        xm3, [fg_dataq+FGData.ar_coeff_shift]
    ; don't packssdw since we only care about one value
    paddw        xm3, xm1
    packsswb     xm3, xm3
    pextrb       [bufq+xq], xm3, 0
    pslldq       xm3, 2
    pand         xm3, xm15
    pandn        xm0, xm15, xm0
    por          xm0, xm3               ; blend the new pixel back into the current row
    psrldq       xm0, 1
    inc          xq
    jz .x_loop_ar2_end
    test         xq, 3
    jnz .x_loop_ar2_inner
    jmp .x_loop_ar2

.x_loop_ar2_end:
    add          bufq, 82
    dec          hd
    jg .y_loop_ar2
    RET

.ar3:
    ; lag 3: the broadcast coefficient pairs do not fit in registers, so they
    ; are spilled to 11 stack slots of 16 bytes each
    DEFINE_ARGS buf, fg_data, shift
%if WIN64
    SUB          rsp, 16*12
%assign stack_size_padded (stack_size_padded+16*12)
%assign stack_size (stack_size+16*12)
%else
    ALLOC_STACK 16*12
%endif
    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    vpbroadcastw xm14, [base+round_vals-12+shiftq*2]
    movq         xm15, [base+byte_blend]
    pmovsxbw     xm0, [fg_dataq+FGData.ar_coeffs_y+ 0]   ; cf0-7
    pmovsxbw     xm1, [fg_dataq+FGData.ar_coeffs_y+ 8]   ; cf8-15
    pmovsxbw     xm2, [fg_dataq+FGData.ar_coeffs_y+16]   ; cf16-23
    pshufd       xm9, xm0, q1111
    pshufd       xm10, xm0, q2222
    pshufd       xm11, xm0, q3333
    pshufd       xm0, xm0, q0000
    pshufd       xm6, xm1, q1111
    pshufd       xm7, xm1, q2222
    pshufd       xm8, xm1, q3333
    pshufd       xm1, xm1, q0000
    pshufd       xm3, xm2, q1111
    psrldq       xm13, xm2, 10
    pinsrw       xm2, [pw_1], 5
    pshufd       xm4, xm2, q2222
    pshufd       xm2, xm2, q0000
    pinsrw       xm13, [base+round_vals+shiftq*2-10], 3
    mova         [rsp+ 0*16], xm0
    mova         [rsp+ 1*16], xm9
    mova         [rsp+ 2*16], xm10
    mova         [rsp+ 3*16], xm11
    mova         [rsp+ 4*16], xm1
    mova         [rsp+ 5*16], xm6
    mova         [rsp+ 6*16], xm7
    mova         [rsp+ 7*16], xm8
    mova         [rsp+ 8*16], xm2
    mova         [rsp+ 9*16], xm3
    mova         [rsp+10*16], xm4
    DEFINE_ARGS buf, fg_data, h, x
    sub          bufq, 82*73-(82*3+79)
    mov          hd, 70
.y_loop_ar3:
    mov          xq, -76

.x_loop_ar3:
    movu         xm0, [bufq+xq-82*3-3]  ; y=-3,x=[-3,+12]
    movu         xm1, [bufq+xq-82*2-3]  ; y=-2,x=[-3,+12]
    movu         xm2, [bufq+xq-82*1-3]  ; y=-1,x=[-3,+12]
    ; sign-extend the three rows from bytes to words
    pxor         xm3, xm3
    pcmpgtb      xm6, xm3, xm2
    pcmpgtb      xm5, xm3, xm1
    pcmpgtb      xm4, xm3, xm0
    punpckhbw    xm3, xm0, xm4
    punpcklbw    xm0, xm4
    punpckhbw    xm4, xm1, xm5
    punpcklbw    xm1, xm5
    punpckhbw    xm5, xm2, xm6
    punpcklbw    xm2, xm6

    psrldq       xm6, xm0, 2
    psrldq       xm7, xm0, 4
    psrldq       xm8, xm0, 6
    psrldq       xm9, xm0, 8
    palignr      xm10, xm3, xm0, 10
    palignr      xm11, xm3, xm0, 12

    punpcklwd    xm0, xm6
    punpcklwd    xm7, xm8
    punpcklwd    xm9, xm10
    punpcklwd    xm11, xm1
    pmaddwd      xm0, [rsp+ 0*16]
    pmaddwd      xm7, [rsp+ 1*16]
    pmaddwd      xm9, [rsp+ 2*16]
    pmaddwd      xm11, [rsp+ 3*16]
    paddd        xm0, xm7
    paddd        xm9, xm11
    paddd        xm0, xm9

    psrldq       xm6, xm1, 2
    psrldq       xm7, xm1, 4
    psrldq       xm8, xm1, 6
    psrldq       xm9, xm1, 8
    palignr      xm10, xm4, xm1, 10
    palignr      xm11, xm4, xm1, 12
    psrldq       xm12, xm2, 2

    punpcklwd    xm6, xm7
    punpcklwd    xm8, xm9
    punpcklwd    xm10, xm11
    punpcklwd    xm12, xm2, xm12
    pmaddwd      xm6, [rsp+ 4*16]
    pmaddwd      xm8, [rsp+ 5*16]
    pmaddwd      xm10, [rsp+ 6*16]
    pmaddwd      xm12, [rsp+ 7*16]
    paddd        xm6, xm8
    paddd        xm10, xm12
    paddd        xm6, xm10
    paddd        xm0, xm6

    psrldq       xm6, xm2, 4
    psrldq       xm7, xm2, 6
    psrldq       xm8, xm2, 8
    palignr      xm9, xm5, xm2, 10
    palignr      xm5, xm5, xm2, 12

    punpcklwd    xm6, xm7
    punpcklwd    xm8, xm9
    punpcklwd    xm5, xm14
    pmaddwd      xm6, [rsp+ 8*16]
    pmaddwd      xm8, [rsp+ 9*16]
    pmaddwd      xm5, [rsp+10*16]
    paddd        xm0, xm6
    paddd        xm8, xm5
    paddd        xm0, xm8

    movq         xm1, [bufq+xq-3]       ; y=0,x=[-3,+4]
.x_loop_ar3_inner:
    pmovsxbw     xm2, xm1
    pmaddwd      xm2, xm13
    pshufd       xm3, xm2, q1111
    paddd        xm2, xm3               ; left+cur
    paddd        xm2, xm0               ; add top
    psrldq       xm0, 4
    psrad        xm2, [fg_dataq+FGData.ar_coeff_shift]
    ; don't packssdw since we only care about one value
    packsswb     xm2, xm2
    pextrb       [bufq+xq], xm2, 0
    pslldq       xm2, 3
    pand         xm2, xm15
    pandn        xm1, xm15, xm1
    por          xm1, xm2
    psrldq       xm1, 1
    inc          xq
    jz .x_loop_ar3_end
    test         xq, 3
    jnz .x_loop_ar3_inner
    jmp .x_loop_ar3

.x_loop_ar3_end:
    add          bufq, 82
    dec          hd
    jg .y_loop_ar3
    RET

;------------------------------------------------------------------------------
; generate_grain_uv_420(buf, bufy, fg_data, uv)
;
; Phase 1 (.loop_y/.loop_x): same seed update and gaussian_sequence lookup as
; generate_grain_y, with the initial seed XORed with pw_seed_xor[uv]; writes
; 38 rows of 44 bytes (row pitch 82) into buf.
; Phase 2: jumps through generate_grain_uv_420_avx2_table on
; fg_data.ar_coeff_lag. Every .arN variant processes 35 rows and additionally
; feeds in bufy: each input sample is the sum of 2 horizontally adjacent bytes
; from 2 consecutive bufy rows, scaled by pmulhrsw with hmul_bits[2] (8192),
; while bufy advances 2 rows per output row.
; The per-plane coefficients are read at FGData.ar_coeffs_uv + uv*25.
; NOTE(review): the struc reserves 2*28 bytes "including padding" for
; ar_coeffs_uv; confirm that the C-side per-plane stride really is 25.
;------------------------------------------------------------------------------
INIT_XMM avx2
cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
    lea          r4, [pb_mask]
%define base r4-pb_mask
    movq         xm1, [base+rnd_next_upperbit_mask]
    movq         xm4, [base+mul_bits]
    movq         xm7, [base+hmul_bits]
    mov          r5d, [fg_dataq+FGData.grain_scale_shift]
    vpbroadcastw xm8, [base+round+r5*2]
    mova         xm5, [base+pb_mask]
    vpbroadcastw xm0, [fg_dataq+FGData.seed]
    vpbroadcastw xm9, [base+pw_seed_xor+uvq*4]
    pxor         xm0, xm9
    vpbroadcastd xm9, [base+pd_m65536]
    lea          r6, [gaussian_sequence]
    mov          r7d, 38
    add          bufq, 44
.loop_y:
    mov          r5, -44
.loop_x:
    pand         xm2, xm0, xm1
    psrlw        xm3, xm2, 10
    por          xm2, xm3               ; bits 0xf, 0x1e, 0x3c and 0x78 are set
    pmullw       xm2, xm4               ; bits 0x0f00 are set
    pshufb       xm2, xm5, xm2          ; set 15th bit for next 4 seeds
    psllq        xm6, xm2, 30
    por          xm2, xm6
    psllq        xm6, xm2, 15
    por          xm2, xm6               ; aggregate each bit into next seed's high bit
    pmulhuw      xm3, xm0, xm7
    por          xm2, xm3               ; 4 next output seeds
    pshuflw      xm0, xm2, q3333
    psrlw        xm2, 5
    pmovzxwd     xm3, xm2
    mova         xm6, xm9
    vpgatherdd   xm2, [r6+xm3*2], xm6
    pandn        xm2, xm9, xm2
    packusdw     xm2, xm2
    pmulhrsw     xm2, xm8
    packsswb     xm2, xm2
    movd         [bufq+r5], xm2
    add          r5, 4
    jl .loop_x
    add          bufq, 82
    dec          r7d
    jg .loop_y

    ; auto-regression code
    movsxd       r5, [fg_dataq+FGData.ar_coeff_lag]
    movsxd       r5, [base+generate_grain_uv_420_avx2_table+r5*4]
    lea          r5, [r5+base+generate_grain_uv_420_avx2_table]
    jmp          r5

.ar0:
    ; lag 0: only the luma term is applied; fully vectorised (ymm)
    INIT_YMM avx2
    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
    imul         uvd, 25
    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movd         xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
    movd         xm3, [base+hmul_bits+shiftq*2]
    DEFINE_ARGS buf, bufy, h
    pmovsxbw     xm4, xm4
    vpbroadcastd m7, [pb_1]
    vpbroadcastw m6, [hmul_bits+4]
    vpbroadcastw m4, xm4
    vpbroadcastw m3, xm3
    sub          bufq, 82*38+82-(82*3+41)
    add          bufyq, 3+82*3
    mov          hd, 35
.y_loop_ar0:
    ; first 32 pixels
    movu         xm8, [bufyq]
    movu         xm9, [bufyq+82]
    movu         xm10, [bufyq+16]
    movu         xm11, [bufyq+82+16]
    vinserti128  m8, [bufyq+32], 1
    vinserti128  m9, [bufyq+82+32], 1
    vinserti128  m10, [bufyq+48], 1
    vinserti128  m11, [bufyq+82+48], 1
    pmaddubsw    m8, m7, m8
    pmaddubsw    m9, m7, m9
    pmaddubsw    m10, m7, m10
    pmaddubsw    m11, m7, m11
    paddw        m8, m9
    paddw        m10, m11
    pmulhrsw     m8, m6
    pmulhrsw     m10, m6
    pmullw       m8, m4
    pmullw       m10, m4
    pmulhrsw     m8, m3
    pmulhrsw     m10, m3
    packsswb     m8, m10
    movu         m0, [bufq]
    punpckhbw    m1, m0, m8
    punpcklbw    m0, m8
    pmaddubsw    m1, m7, m1
    pmaddubsw    m0, m7, m0
    packsswb     m0, m1
    movu         [bufq], m0

    ; last 6 pixels
    movu         xm8, [bufyq+32*2]
    movu         xm9, [bufyq+32*2+82]
    pmaddubsw    xm8, xm7, xm8
    pmaddubsw    xm9, xm7, xm9
    paddw        xm8, xm9
    pmulhrsw     xm8, xm6
    pmullw       xm8, xm4
    pmulhrsw     xm8, xm3
    packsswb     xm8, xm8
    movq         xm0, [bufq+32]
    punpcklbw    xm8, xm0
    pmaddubsw    xm8, xm7, xm8
    packsswb     xm8, xm8
    vpblendw     xm0, xm8, xm0, 1000b   ; keep word 3 (bytes 6-7) of the old row untouched
    movq         [bufq+32], xm0

    add          bufq, 82
    add          bufyq, 82*2
    dec          hd
    jg .y_loop_ar0
    RET

.ar1:
    ; lag 1: as in generate_grain_y.ar1, plus one luma coefficient
    INIT_XMM avx2
    DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x, shift
    imul         uvd, 25
    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movsx        cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
    movd         xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
    pinsrb       xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3
    DEFINE_ARGS buf, bufy, h, val0, val3, cf3, min, max, x, shift
    pmovsxbw     xm4, xm4
    pshufd       xm5, xm4, q1111
    pshufd       xm4, xm4, q0000
    pmovsxwd     xm3, [base+round_vals+shiftq*2-12]    ; rnd
    vpbroadcastd xm7, [pb_1]
    vpbroadcastw xm6, [hmul_bits+4]
    vpbroadcastd xm3, xm3
    sub          bufq, 82*38+44-(82*3+41)
    add          bufyq, 79+82*3
    mov          hd, 35
    mov          mind, -128
    mov          maxd, 127
.y_loop_ar1:
    mov          xq, -38
    movsx        val3d, byte [bufq+xq-1]
.x_loop_ar1:
    pmovsxbw     xm0, [bufq+xq-82-1]    ; top/left
    movq         xm8, [bufyq+xq*2]
    movq         xm9, [bufyq+xq*2+82]
    psrldq       xm2, xm0, 2            ; top
    psrldq       xm1, xm0, 4            ; top/right
    pmaddubsw    xm8, xm7, xm8
    pmaddubsw    xm9, xm7, xm9
    paddw        xm8, xm9
    pmulhrsw     xm8, xm6
    punpcklwd    xm0, xm2
    punpcklwd    xm1, xm8
    pmaddwd      xm0, xm4
    pmaddwd      xm1, xm5
    paddd        xm0, xm1
    paddd        xm0, xm3
.x_loop_ar1_inner:
    movd         val0d, xm0
    psrldq       xm0, 4
    imul         val3d, cf3d
    add          val3d, val0d
    sarx         val3d, val3d, shiftd
    movsx        val0d, byte [bufq+xq]
    add          val3d, val0d
    cmp          val3d, maxd
    cmovg        val3d, maxd
    cmp          val3d, mind
    cmovl        val3d, mind
    mov          byte [bufq+xq], val3b
    ; keep val3d in-place as left for next x iteration
    inc          xq
    jz .x_loop_ar1_end
    test         xq, 3
    jnz .x_loop_ar1_inner
    jmp .x_loop_ar1

.x_loop_ar1_end:
    add          bufq, 82
    add          bufyq, 82*2
    dec          hd
    jg .y_loop_ar1
    RET

.ar2:
    ; lag 2: luma and rounding are folded in through xm14/xm15
    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    imul         uvd, 25
    vpbroadcastw xm15, [base+round_vals-12+shiftq*2]
    pmovsxbw     xm8, [fg_dataq+FGData.ar_coeffs_uv+uvq+0]   ; cf0-7
    pmovsxbw     xm9, [fg_dataq+FGData.ar_coeffs_uv+uvq+8]   ; cf8-12
    pinsrw       xm9, [base+pw_1], 5
    vpbroadcastw xm7, [base+hmul_bits+4]
    vpbroadcastd xm6, [base+pb_1]
    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
    pshufd       xm12, xm9, q0000
    pshufd       xm13, xm9, q1111
    pshufd       xm14, xm9, q2222
    pshufd       xm11, xm8, q3333
    pshufd       xm10, xm8, q2222
    pshufd       xm9, xm8, q1111
    pshufd       xm8, xm8, q0000
    sub          bufq, 82*38+44-(82*3+41)
    add          bufyq, 79+82*3
    mov          hd, 35
.y_loop_ar2:
    mov          xq, -38

.x_loop_ar2:
    pmovsxbw     xm0, [bufq+xq-82*2-2]  ; y=-2,x=[-2,+5]
    pmovsxbw     xm1, [bufq+xq-82*1-2]  ; y=-1,x=[-2,+5]
    psrldq       xm2, xm0, 2            ; y=-2,x=[-1,+5]
    psrldq       xm3, xm1, 2            ; y=-1,x=[-1,+5]
    psrldq       xm4, xm1, 4            ; y=-1,x=[+0,+5]
    punpcklwd    xm2, xm0, xm2
    punpcklwd    xm3, xm4
    pmaddwd      xm2, xm8
    pmaddwd      xm3, xm11
    paddd        xm2, xm3

    psrldq       xm4, xm0, 4            ; y=-2,x=[+0,+5]
    psrldq       xm5, xm0, 6            ; y=-2,x=[+1,+5]
    psrldq       xm0, 8                 ; y=-2,x=[+2,+5]
    punpcklwd    xm4, xm5
    punpcklwd    xm0, xm1
    psrldq       xm3, xm1, 6            ; y=-1,x=[+1,+5]
    psrldq       xm1, xm1, 8            ; y=-1,x=[+2,+5]
    punpcklwd    xm3, xm1
    pmaddwd      xm4, xm9
    pmaddwd      xm0, xm10
    pmaddwd      xm3, xm12
    paddd        xm4, xm0
    paddd        xm2, xm3
    paddd        xm2, xm4

    movq         xm0, [bufyq+xq*2]
    movq         xm3, [bufyq+xq*2+82]
    pmaddubsw    xm0, xm6, xm0
    pmaddubsw    xm3, xm6, xm3
    paddw        xm0, xm3
    pmulhrsw     xm0, xm7
    punpcklwd    xm0, xm15
    pmaddwd      xm0, xm14
    paddd        xm2, xm0

    movq         xm0, [bufq+xq-2]       ; y=0,x=[-2,+5]
.x_loop_ar2_inner:
    pmovsxbw     xm0, xm0
    pmaddwd      xm3, xm0, xm13
    paddd        xm3, xm2
    psrldq       xm2, 4                 ; shift top to next pixel
    psrad        xm3, [fg_dataq+FGData.ar_coeff_shift]
    pslldq       xm3, 2
    psrldq       xm0, 2
    paddw        xm3, xm0
    vpblendw     xm0, xm3, 00000010b
    packsswb     xm0, xm0
    pextrb       [bufq+xq], xm0, 1
    inc          xq
    jz .x_loop_ar2_end
    test         xq, 3
    jnz .x_loop_ar2_inner
    jmp .x_loop_ar2

.x_loop_ar2_end:
    add          bufq, 82
    add          bufyq, 82*2
    dec          hd
    jg .y_loop_ar2
    RET

.ar3:
    ; lag 3: 12 stack slots of 16 bytes hold the broadcast coefficient pairs
    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
    SUB          rsp, 16*12
%assign stack_size_padded (stack_size_padded+16*12)
%assign stack_size (stack_size+16*12)
    mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
    imul         uvd, 25
    vpbroadcastw xm14, [base+round_vals-12+shiftq*2]
    pmovsxbw     xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0]   ; cf0-7
    pmovsxbw     xm1, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 8]   ; cf8-15
    pmovsxbw     xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16]   ; cf16-23
    pmovsxbw     xm5, [fg_dataq+FGData.ar_coeffs_uv+uvq+24]   ; cf24 [luma]
    pshufd       xm9, xm0, q1111
    pshufd       xm10, xm0, q2222
    pshufd       xm11, xm0, q3333
    pshufd       xm0, xm0, q0000
    pshufd       xm6, xm1, q1111
    pshufd       xm7, xm1, q2222
    pshufd       xm8, xm1, q3333
    pshufd       xm1, xm1, q0000
    pshufd       xm3, xm2, q1111
    pshufd       xm4, xm2, q2222
    vpbroadcastw xm5, xm5
    vpblendw     xm4, xm5, 10101010b    ; interleave luma cf
    psrldq       xm5, xm2, 10
    pshufd       xm2, xm2, q0000
    pinsrw       xm5, [base+round_vals+shiftq*2-10], 3
    pmovzxwd     xm14, xm14
    mova         [rsp+ 0*16], xm0
    mova         [rsp+ 1*16], xm9
    mova         [rsp+ 2*16], xm10
    mova         [rsp+ 3*16], xm11
    mova         [rsp+ 4*16], xm1
    mova         [rsp+ 5*16], xm6
    mova         [rsp+ 6*16], xm7
    mova         [rsp+ 7*16], xm8
    mova         [rsp+ 8*16], xm2
    mova         [rsp+ 9*16], xm3
    mova         [rsp+10*16], xm4
    mova         [rsp+11*16], xm5
    vpbroadcastd xm13, [base+pb_1]
    vpbroadcastw xm15, [base+hmul_bits+4]
    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
    sub          bufq, 82*38+44-(82*3+41)
    add          bufyq, 79+82*3
    mov          hd, 35
.y_loop_ar3:
    mov          xq, -38

.x_loop_ar3:
    movu         xm0, [bufq+xq-82*3-3]  ; y=-3,x=[-3,+12]
    movu         xm1, [bufq+xq-82*2-3]  ; y=-2,x=[-3,+12]
    movu         xm2, [bufq+xq-82*1-3]  ; y=-1,x=[-3,+12]
    ; sign-extend the three rows from bytes to words
    pxor         xm3, xm3
    pcmpgtb      xm6, xm3, xm2
    pcmpgtb      xm5, xm3, xm1
    pcmpgtb      xm4, xm3, xm0
    punpckhbw    xm3, xm0, xm4
    punpcklbw    xm0, xm4
    punpckhbw    xm4, xm1, xm5
    punpcklbw    xm1, xm5
    punpckhbw    xm5, xm2, xm6
    punpcklbw    xm2, xm6

    psrldq       xm6, xm0, 2
    psrldq       xm7, xm0, 4
    psrldq       xm8, xm0, 6
    psrldq       xm9, xm0, 8
    palignr      xm10, xm3, xm0, 10
    palignr      xm11, xm3, xm0, 12

    punpcklwd    xm0, xm6
    punpcklwd    xm7, xm8
    punpcklwd    xm9, xm10
    punpcklwd    xm11, xm1
    pmaddwd      xm0, [rsp+ 0*16]
    pmaddwd      xm7, [rsp+ 1*16]
    pmaddwd      xm9, [rsp+ 2*16]
    pmaddwd      xm11, [rsp+ 3*16]
    paddd        xm0, xm7
    paddd        xm9, xm11
    paddd        xm0, xm9

    psrldq       xm6, xm1, 2
    psrldq       xm7, xm1, 4
    psrldq       xm8, xm1, 6
    psrldq       xm9, xm1, 8
    palignr      xm10, xm4, xm1, 10
    palignr      xm11, xm4, xm1, 12
    psrldq       xm12, xm2, 2

    punpcklwd    xm6, xm7
    punpcklwd    xm8, xm9
    punpcklwd    xm10, xm11
    punpcklwd    xm12, xm2, xm12
    pmaddwd      xm6, [rsp+ 4*16]
    pmaddwd      xm8, [rsp+ 5*16]
    pmaddwd      xm10, [rsp+ 6*16]
    pmaddwd      xm12, [rsp+ 7*16]
    paddd        xm6, xm8
    paddd        xm10, xm12
    paddd        xm6, xm10
    paddd        xm0, xm6

    psrldq       xm6, xm2, 4
    psrldq       xm7, xm2, 6
    psrldq       xm8, xm2, 8
    palignr      xm9, xm5, xm2, 10
    palignr      xm5, xm5, xm2, 12

    movq         xm1, [bufyq+xq*2]
    movq         xm2, [bufyq+xq*2+82]
    pmaddubsw    xm1, xm13, xm1
    pmaddubsw    xm2, xm13, xm2
    paddw        xm1, xm2
    pmulhrsw     xm1, xm15

    punpcklwd    xm6, xm7
    punpcklwd    xm8, xm9
    punpcklwd    xm5, xm1
    pmaddwd      xm6, [rsp+ 8*16]
    pmaddwd      xm8, [rsp+ 9*16]
    pmaddwd      xm5, [rsp+10*16]
    paddd        xm0, xm6
    paddd        xm8, xm5
    paddd        xm0, xm8
    paddd        xm0, xm14

    movq         xm1, [bufq+xq-3]       ; y=0,x=[-3,+4]
.x_loop_ar3_inner:
    pmovsxbw     xm1, xm1
    pmaddwd      xm2, xm1, [rsp+16*11]
    pshufd       xm3, xm2, q1111
    paddd        xm2, xm3               ; left+cur
    paddd        xm2, xm0               ; add top
    psrldq       xm0, 4
    psrad        xm2, [fg_dataq+FGData.ar_coeff_shift]
    ; don't packssdw, we only care about one value
    pslldq       xm2, 6
    vpblendw     xm1, xm2, 1000b
    packsswb     xm1, xm1
    pextrb       [bufq+xq], xm1, 3
    psrldq       xm1, 1
    inc          xq
    jz .x_loop_ar3_end
    test         xq, 3
    jnz .x_loop_ar3_inner
    jmp .x_loop_ar3

.x_loop_ar3_end:
    add          bufq, 82
    add          bufyq, 82*2
    dec          hd
    jg .y_loop_ar3
    RET

;------------------------------------------------------------------------------
; fgy_32x32xn(dst, src, stride, fg_data, w, scaling, grain_lut, [h], [sby])
;
; Applies grain to src and writes dst, one 32-byte wide column at a time
; (w advances by 32 per column; h rows are read from the hm argument slot).
; Per pixel, as described by the inline comments:
;   noise = round2(scaling[src] * grain, scaling_shift)
;   dst   = clip_pixel(src + noise), bounds from min/max selected by
;           fg_data.clip_to_restricted_range
; The grain_lut offset of every column comes from a 16-bit seed derived from
; sby and fg_data.seed and updated once per column.
; If fg_data.overlap_flag is set, columns after the first blend their leftmost
; grain bytes with the previous column (weights from pb_27_17_17_27), and when
; sby != 0 the first two rows are blended with the block above as well.
; m10 = dword mask 0x000000ff, m11 = pmulhrsw multiplier for scaling_shift,
; m12/m13 = max/min clip bounds.
;------------------------------------------------------------------------------
INIT_YMM avx2
cglobal fgy_32x32xn, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut
    pcmpeqw      m10, m10
    psrld        m10, 24
    mov          r7d, [fg_dataq+FGData.scaling_shift]
    lea          r8, [pb_mask]
%define base r8-pb_mask
    vpbroadcastw m11, [base+mul_bits+r7*2-14]
    mov          r7d, [fg_dataq+FGData.clip_to_restricted_range]
    vpbroadcastw m12, [base+max+r7*4]
    vpbroadcastw m13, [base+min+r7*2]

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap

    mov          overlapd, [fg_dataq+FGData.overlap_flag]
    movifnidn    sbyd, sbym
    test         sbyd, sbyd
    setnz        r7b
    test         r7b, overlapb
    jnz .vertical_overlap

    imul         seed, sbyd, (173 << 24) | 37
    add          seed, (105 << 24) | 178
    rol          seed, 8
    movzx        seed, seew
    xor          seed, [fg_dataq+FGData.seed]

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                unused1, unused2, see, overlap

    lea          src_bakq, [srcq+wq]
    neg          wq
    sub          dstq, srcq             ; dst is addressed as [dstq+srcq] from here on

.loop_x:
    mov          r6d, seed
    or           seed, 0xEFF4
    shr          r6d, 1
    test         seeb, seeh
    lea          seed, [r6+0x8000]
    cmovp        seed, r6d              ; updated seed

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                offx, offy, see, overlap

    mov          offxd, seed
    rorx         offyd, seed, 8
    shr          offxd, 12
    and          offyd, 0xf
    imul         offyd, 164
    lea          offyq, [offyq+offxq*2+747] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                h, offxy, see, overlap

    mov          hd, hm
    mov          grain_lutq, grain_lutmp
.loop_y:
    ; src
    mova         m0, [srcq]
    pxor         m2, m2
    punpckhbw    m1, m0, m2
    punpcklbw    m0, m2                 ; m0-1: src as word
    punpckhwd    m5, m0, m2
    punpcklwd    m4, m0, m2
    punpckhwd    m7, m1, m2
    punpcklwd    m6, m1, m2             ; m4-7: src as dword

    ; scaling[src]
    pcmpeqw      m3, m3
    pcmpeqw      m9, m9
    vpgatherdd   m8, [scalingq+m4], m3
    vpgatherdd   m4, [scalingq+m5], m9
    pcmpeqw      m3, m3
    pcmpeqw      m9, m9
    vpgatherdd   m5, [scalingq+m6], m3
    vpgatherdd   m6, [scalingq+m7], m9
    pand         m8, m10
    pand         m4, m10
    pand         m5, m10
    pand         m6, m10
    packusdw     m8, m4
    packusdw     m5, m6

    ; grain = grain_lut[offy+y][offx+x]
    movu         m3, [grain_lutq+offxyq]
    pcmpgtb      m7, m2, m3
    punpcklbw    m2, m3, m7
    punpckhbw    m3, m7

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmullw       m2, m8
    pmullw       m3, m5
    pmulhrsw     m2, m11
    pmulhrsw     m3, m11

    ; dst = clip_pixel(src, noise)
    paddw        m0, m2
    paddw        m1, m3
    pmaxsw       m0, m13
    pmaxsw       m1, m13
    pminsw       m0, m12
    pminsw       m1, m12
    packuswb     m0, m1
    mova         [dstq+srcq], m0

    add          srcq, strideq
    add          grain_lutq, 82
    dec          hd
    jg .loop_y

    add          wq, 32
    jge .end
    lea          srcq, [src_bakq+wq]
    test         overlapd, overlapd
    jz .loop_x

    ; r8m = sbym
    movd         xm15, [pb_27_17_17_27]
    cmp          dword r8m, 0
    jne .loop_x_hv_overlap

    ; horizontal overlap (without vertical overlap)
    movd         xm14, [pw_1024]
.loop_x_h_overlap:
    mov          r6d, seed
    or           seed, 0xEFF4
    shr          r6d, 1
    test         seeb, seeh
    lea          seed, [r6+0x8000]
    cmovp        seed, r6d              ; updated seed

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                offx, offy, see, left_offxy

    lea          left_offxyd, [offyd+32] ; previous column's offy*stride+offx
    mov          offxd, seed
    rorx         offyd, seed, 8
    shr          offxd, 12
    and          offyd, 0xf
    imul         offyd, 164
    lea          offyq, [offyq+offxq*2+747] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                h, offxy, see, left_offxy

    mov          hd, hm
    mov          grain_lutq, grain_lutmp
.loop_y_h_overlap:
    ; src
    mova         m0, [srcq]
    pxor         m2, m2
    punpckhbw    m1, m0, m2
    punpcklbw    m0, m2                 ; m0-1: src as word
    punpckhwd    m5, m0, m2
    punpcklwd    m4, m0, m2
    punpckhwd    m7, m1, m2
    punpcklwd    m6, m1, m2             ; m4-7: src as dword

    ; scaling[src]
    pcmpeqw      m3, m3
    pcmpeqw      m9, m9
    vpgatherdd   m8, [scalingq+m4], m3
    vpgatherdd   m4, [scalingq+m5], m9
    pcmpeqw      m3, m3
    pcmpeqw      m9, m9
    vpgatherdd   m5, [scalingq+m6], m3
    vpgatherdd   m6, [scalingq+m7], m9
    pand         m8, m10
    pand         m4, m10
    pand         m5, m10
    pand         m6, m10
    packusdw     m8, m4
    packusdw     m5, m6

    ; grain = grain_lut[offy+y][offx+x]
    movu         m3, [grain_lutq+offxyq]
    movd         xm4, [grain_lutq+left_offxyq]
    punpcklbw    xm4, xm3
    pmaddubsw    xm4, xm15, xm4
    pmulhrsw     xm4, xm14
    packsswb     xm4, xm4
    vpblendw     xm4, xm3, 11111110b
    vpblendd     m3, m4, 00001111b
    pcmpgtb      m7, m2, m3
    punpcklbw    m2, m3, m7
    punpckhbw    m3, m7

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmullw       m2, m8
    pmullw       m3, m5
    pmulhrsw     m2, m11
    pmulhrsw     m3, m11

    ; dst = clip_pixel(src, noise)
    paddw        m0, m2
    paddw        m1, m3
    pmaxsw       m0, m13
    pmaxsw       m1, m13
    pminsw       m0, m12
    pminsw       m1, m12
    packuswb     m0, m1
    mova         [dstq+srcq], m0

    add          srcq, strideq
    add          grain_lutq, 82
    dec          hd
    jg .loop_y_h_overlap

    add          wq, 32
    jge .end
    lea          srcq, [src_bakq+wq]

    ; r8m = sbym
    cmp          dword r8m, 0
    jne .loop_x_hv_overlap
    jmp .loop_x_h_overlap

.end:
    RET

.vertical_overlap:
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap

    movzx        sbyd, sbyb
    imul         seed, [fg_dataq+FGData.seed], 0x00010001
    imul         r7d, sbyd, 173 * 0x00010001
    imul         sbyd, 37 * 0x01000100
    add          r7d, (105 << 16) | 188
    add          sbyd, (178 << 24) | (141 << 8)
    and          r7d, 0x00ff00ff
    and          sbyd, 0xff00ff00
    xor          seed, r7d
    xor          seed, sbyd             ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                unused1, unused2, see, overlap

    lea          src_bakq, [srcq+wq]
    neg          wq
    sub          dstq, srcq

    vpbroadcastd m14, [pw_1024]
.loop_x_v_overlap:
    vpbroadcastw m15, [pb_27_17_17_27]

    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov          r6d, seed
    or           seed, 0xeff4eff4
    test         seeb, seeh
    setp         r7b                    ; parity of top_seed
    shr          seed, 16
    shl          r7d, 16
    test         seeb, seeh
    setp         r7b                    ; parity of cur_seed
    or           r6d, 0x00010001
    xor          r7d, r6d
    rorx         seed, r7d, 1           ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                offx, offy, see, overlap, top_offxy

    rorx         offyd, seed, 8
    rorx         offxd, seed, 12
    and          offyd, 0xf000f
    and          offxd, 0xf000f
    imul         offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea          offyq, [offyq+offxq*2+0x10001*747+32*82]

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                h, offxy, see, overlap, top_offxy

    movzx        top_offxyd, offxyw
    shr          offxyd, 16

    mov          hd, hm
    mov          grain_lutq, grain_lutmp
.loop_y_v_overlap:
    ; src
    mova         m0, [srcq]
    pxor         m2, m2
    punpckhbw    m1, m0, m2
    punpcklbw    m0, m2                 ; m0-1: src as word
    punpckhwd    m5, m0, m2
    punpcklwd    m4, m0, m2
    punpckhwd    m7, m1, m2
    punpcklwd    m6, m1, m2             ; m4-7: src as dword

    ; scaling[src]
    pcmpeqw      m3, m3
    pcmpeqw      m9, m9
    vpgatherdd   m8, [scalingq+m4], m3
    vpgatherdd   m4, [scalingq+m5], m9
    pcmpeqw      m3, m3
    pcmpeqw      m9, m9
    vpgatherdd   m5, [scalingq+m6], m3
    vpgatherdd   m6, [scalingq+m7], m9
    pand         m8, m10
    pand         m4, m10
    pand         m5, m10
    pand         m6, m10
    packusdw     m8, m4
    packusdw     m5, m6

    ; grain = grain_lut[offy+y][offx+x]
    movu         m3, [grain_lutq+offxyq]
    movu         m4, [grain_lutq+top_offxyq]
    punpckhbw    m6, m4, m3
    punpcklbw    m4, m3
    pmaddubsw    m6, m15, m6
    pmaddubsw    m4, m15, m4
    pmulhrsw     m6, m14
    pmulhrsw     m4, m14
    packsswb     m3, m4, m6
    pcmpgtb      m7, m2, m3
    punpcklbw    m2, m3, m7
    punpckhbw    m3, m7

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmullw       m2, m8
    pmullw       m3, m5
    pmulhrsw     m2, m11
    pmulhrsw     m3, m11

    ; dst = clip_pixel(src, noise)
    paddw        m0, m2
    paddw        m1, m3
    pmaxsw       m0, m13
    pmaxsw       m1, m13
    pminsw       m0, m12
    pminsw       m1, m12
    packuswb     m0, m1
    mova         [dstq+srcq], m0

    vpbroadcastw m15, [pb_27_17_17_27+2] ; swap weights for second v-overlap line
    add          srcq, strideq
    add          grain_lutq, 82
    dec          hw
    jz .end_y_v_overlap
    ; 2 lines get vertical overlap, then fall back to non-overlap code for
    ; remaining (up to) 30 lines
    xor          hd, 0x10000            ; bit 16 of hd toggles: set after line 1, clear after line 2
    test         hd, 0x10000
    jnz .loop_y_v_overlap
    jmp .loop_y

.end_y_v_overlap:
    add          wq, 32
    jge .end_hv
    lea          srcq, [src_bakq+wq]

    ; since fg_dataq.overlap is guaranteed to be set, we never jump
    ; back to .loop_x_v_overlap, and instead always fall-through to
    ; h+v overlap

    movd         xm15, [pb_27_17_17_27]
.loop_x_hv_overlap:
    vpbroadcastw m8, [pb_27_17_17_27]

    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov          r6d, seed
    or           seed, 0xeff4eff4
    test         seeb, seeh
    setp         r7b                    ; parity of top_seed
    shr          seed, 16
    shl          r7d, 16
    test         seeb, seeh
    setp         r7b                    ; parity of cur_seed
    or           r6d, 0x00010001
    xor          r7d, r6d
    rorx         seed, r7d, 1           ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, top_offxy, topleft_offxy

    lea          topleft_offxyq, [top_offxyq+32]
    lea          left_offxyq, [offyq+32]
    rorx         offyd, seed, 8
    rorx         offxd, seed, 12
    and          offyd, 0xf000f
    and          offxd, 0xf000f
    imul         offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea          offyq, [offyq+offxq*2+0x10001*747+32*82]

    DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
                h, offxy, see, left_offxy, top_offxy, topleft_offxy

    movzx        top_offxyd, offxyw
    shr          offxyd, 16

    mov          hd, hm
    mov          grain_lutq, grain_lutmp
.loop_y_hv_overlap:
    ; src
    mova         m0, [srcq]
    pxor         m2, m2
    punpckhbw    m1, m0, m2
    punpcklbw    m0, m2                 ; m0-1: src as word
    punpckhwd    m5, m0, m2
    punpcklwd    m4, m0, m2
    punpckhwd    m7, m1, m2
    punpcklwd    m6, m1, m2             ; m4-7: src as dword

    ; scaling[src]
    pcmpeqw      m3, m3
    ; FIXME it would be nice to have another register here to do 2 vpgatherdd's in parallel
    vpgatherdd   m9, [scalingq+m4], m3
    pcmpeqw      m3, m3
    vpgatherdd   m4, [scalingq+m5], m3
    pcmpeqw      m3, m3
    vpgatherdd   m5, [scalingq+m6], m3
    pcmpeqw      m3, m3
    vpgatherdd   m6, [scalingq+m7], m3
    pand         m9, m10
    pand         m4, m10
    pand         m5, m10
    pand         m6, m10
    packusdw     m9, m4
    packusdw     m5, m6

    ; grain = grain_lut[offy+y][offx+x]
    movu         m3, [grain_lutq+offxyq]
    movu         m6, [grain_lutq+top_offxyq]
    movd         xm4, [grain_lutq+left_offxyq]
    movd         xm7, [grain_lutq+topleft_offxyq]
    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    punpcklbw    xm4, xm3
    punpcklbw    xm7, xm6
    pmaddubsw    xm4, xm15, xm4
    pmaddubsw    xm7, xm15, xm7
    pmulhrsw     xm4, xm14
    pmulhrsw     xm7, xm14
    packsswb     xm4, xm4
    packsswb     xm7, xm7
    vpblendw     xm4, xm3, 11111110b
    vpblendw     xm7, xm6, 11111110b
    vpblendd     m3, m4, 00001111b
    vpblendd     m6, m7, 00001111b
    ; followed by v interpolation (top | cur -> cur)
    punpckhbw    m7, m6, m3
    punpcklbw    m6, m3
    pmaddubsw    m7, m8, m7
    pmaddubsw    m6, m8, m6
    pmulhrsw     m7, m14
    pmulhrsw     m6, m14
    packsswb     m3, m6, m7
    pcmpgtb      m7, m2, m3
    punpcklbw    m2, m3, m7
    punpckhbw    m3, m7

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmullw       m2, m9
    pmullw       m3, m5
    pmulhrsw     m2, m11
    pmulhrsw     m3, m11

    ; dst = clip_pixel(src, noise)
    paddw        m0, m2
    paddw        m1, m3
    pmaxsw       m0, m13
    pmaxsw       m1, m13
    pminsw       m0, m12
    pminsw       m1, m12
    packuswb     m0, m1
    mova         [dstq+srcq], m0

    vpbroadcastw m8, [pb_27_17_17_27+2] ; swap weights for second v-overlap line
    add          srcq, strideq
    add          grain_lutq, 82
    dec          hw
    jz .end_y_hv_overlap
    ; 2 lines get vertical overlap, then fall back to non-overlap code for
    ; remaining (up to) 30 lines
    xor          hd, 0x10000
    test         hd, 0x10000
    jnz .loop_y_hv_overlap
    jmp .loop_y_h_overlap

.end_y_hv_overlap:
    add          wq, 32
    lea          srcq, [src_bakq+wq]    ; lea leaves the flags of the add intact for the jl
    jl .loop_x_hv_overlap

.end_hv:
    RET

;------------------------------------------------------------------------------
; fguv_32x32xn_i420(dst, src, stride, fg_data, w, scaling, grain_lut,
;                   h, sby, luma, lstride, uv_pl, is_id)
;
; Chroma counterpart of fgy_32x32xn. Each loop iteration handles two chroma
; rows of 16 bytes; the scaling index is built from the luma plane (pairs of
; horizontally adjacent luma bytes, every second luma row).
; The loop body is generated by the FGUV_32x32xN_LOOP macro, whose parameter
; selects the "not-csfl" variant that mixes luma and chroma with
; uv_luma_mult/uv_mult/uv_offset before the scaling lookup.
; NOTE(review): this definition continues past the end of the visible part of
; the file; only the visible part is described here.
;------------------------------------------------------------------------------
cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
                                      grain_lut, h, sby, luma, lstride, uv_pl, is_id
    pcmpeqw      m10, m10
    psrld        m10, 24
    mov          r7d, [fg_dataq+FGData.scaling_shift]
    lea          r8, [pb_mask]
%define base r8-pb_mask
    vpbroadcastw m11, [base+mul_bits+r7*2-14]
    mov          r7d, [fg_dataq+FGData.clip_to_restricted_range]
    mov          r9d, dword is_idm
    vpbroadcastw m13, [base+min+r7*2]
    shlx         r7d, r7d, r9d          ; r7 = clip << is_id: index 0, 1 or 2 into max
    vpbroadcastw m12, [base+max+r7*2]

    cmp          byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
    jne .csfl

%macro FGUV_32x32xN_LOOP 1 ; not-csfl
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap

%if %1
    mov          r7d, dword r11m
    vpbroadcastb m0, [fg_dataq+FGData.uv_mult+r7*4]
    vpbroadcastb m1, [fg_dataq+FGData.uv_luma_mult+r7*4]
    punpcklbw    m14, m1, m0
    vpbroadcastw m15, [fg_dataq+FGData.uv_offset+r7*4]
%else
    vpbroadcastd m14, [pw_1024]
    vpbroadcastd m15, [pb_23_22]
%endif

    mov          overlapd, [fg_dataq+FGData.overlap_flag]
    movifnidn    sbyd, sbym
    test         sbyd, sbyd
    setnz        r7b
    test         r7b, overlapb
    jnz %%vertical_overlap

    imul         seed, sbyd, (173 << 24) | 37
    add          seed, (105 << 24) | 178
    rol          seed, 8
    movzx        seed, seew
    xor          seed, [fg_dataq+FGData.seed]

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                unused2, unused3, see, overlap, unused4, unused5, lstride

    mov          lumaq, r9mp
    lea          r12, [srcq+wq]
    lea          r13, [dstq+wq]
    lea          r14, [lumaq+wq*2]
    mov          r11mp, r12             ; end-of-row src/dst pointers are parked in the
    mov          r12mp, r13             ; r11m/r12m argument slots
    mov          lstrideq, r10mp
    neg          wq

%%loop_x:
    mov          r6d, seed
    or           seed, 0xEFF4
    shr          r6d, 1
    test         seeb, seeh
    lea          seed, [r6+0x8000]
    cmovp        seed, r6d              ; updated seed

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, overlap, unused1, unused2, lstride

    mov          offxd, seed
    rorx         offyd, seed, 8
    shr          offxd, 12
    and          offyd, 0xf
    imul         offyd, 82
    lea          offyq, [offyq+offxq+498] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, overlap, unused1, unused2, lstride

    mov          hd, hm
    mov          grain_lutq, grain_lutmp
%%loop_y:
    ; src
    mova         xm4, [lumaq+lstrideq*0+ 0]
    mova         xm6, [lumaq+lstrideq*0+16]
    mova         xm0, [srcq]
    vpbroadcastd m7, [pb_1]
    vinserti128  m4, [lumaq+lstrideq*2 +0], 1
    vinserti128  m6, [lumaq+lstrideq*2+16], 1
    vinserti128  m0, [srcq+strideq], 1
    pxor         m2, m2
    pmaddubsw    m4, m7
    pmaddubsw    m6, m7
    pavgw        m4, m2
    pavgw        m6, m2

%if %1
    packuswb     m4, m6                 ; luma
    punpckhbw    m6, m4, m0
    punpcklbw    m4, m0                 ; { luma, chroma }
    pmaddubsw    m6, m14
    pmaddubsw    m4, m14
    psraw        m6, 6
    psraw        m4, 6
    paddw        m6, m15
    paddw        m4, m15
    packuswb     m4, m6                 ; pack+unpack = clip
    punpckhbw    m6, m4, m2
    punpcklbw    m4, m2
%endif

    punpckhwd    m5, m4, m2
    punpcklwd    m4, m2
    punpckhwd    m7, m6, m2
    punpcklwd    m6, m2                 ; m4-7: luma_src as dword

    ; scaling[luma_src]
    pcmpeqw      m3, m3
    pcmpeqw      m9, m9
    vpgatherdd   m8, [scalingq+m4], m3
    vpgatherdd   m4, [scalingq+m5], m9
    pcmpeqw      m3, m3
    pcmpeqw      m9, m9
    vpgatherdd   m5, [scalingq+m6], m3
    vpgatherdd   m6, [scalingq+m7], m9
    pand         m8, m10
    pand         m4, m10
    pand         m5, m10
    pand         m6, m10
    packusdw     m8, m4
    packusdw     m5, m6

    ; unpack chroma_source
    punpckhbw    m1, m0, m2
    punpcklbw    m0, m2                 ; m0-1: src as word

    ; grain = grain_lut[offy+y][offx+x]
    movu         xm3, [grain_lutq+offxyq+ 0]
    vinserti128  m3, [grain_lutq+offxyq+82], 1
    pcmpgtb      m7, m2, m3
    punpcklbw    m2, m3, m7
    punpckhbw    m3, m7

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmullw       m2, m8
    pmullw       m3, m5
    pmulhrsw     m2, m11
    pmulhrsw     m3, m11

    ; dst = clip_pixel(src, noise)
    paddw        m0, m2
    paddw        m1, m3
    pmaxsw       m0, m13
    pmaxsw       m1, m13
    pminsw       m0, m12
    pminsw       m1, m12
    packuswb     m0, m1
    mova         [dstq], xm0
    vextracti128 [dstq+strideq], m0, 1

    lea          srcq, [srcq+strideq*2]
    lea          dstq, [dstq+strideq*2]
    lea          lumaq, [lumaq+lstrideq*4]
    add          grain_lutq, 82*2
    sub          hb, 2
    jg %%loop_y

    add          wq, 16
    jge %%end
    mov          srcq, r11mp
    mov          dstq, r12mp
    lea          lumaq, [r14+wq*2]
    add          srcq, wq
    add          dstq, wq
    test         overlapd, overlapd
    jz %%loop_x

    ; r8m = sbym
    cmp          dword r8m, 0
    jne %%loop_x_hv_overlap

    ; horizontal overlap (without vertical overlap)
%%loop_x_h_overlap:
    mov          r6d, seed
    or           seed, 0xEFF4
    shr          r6d, 1
    test         seeb, seeh
    lea          seed, [r6+0x8000]
    cmovp        seed, r6d              ; updated seed

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, unused1, unused2, lstride

    lea          left_offxyd, [offyd+16] ; previous column's offy*stride+offx
    mov          offxd, seed
    rorx         offyd, seed, 8
    shr          offxd, 12
    and          offyd, 0xf
    imul         offyd, 82
    lea          offyq, [offyq+offxq+498] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, left_offxy, unused1, unused2, lstride

    mov          hd, hm
    mov          grain_lutq, grain_lutmp
%%loop_y_h_overlap:
    ; src
    mova         xm4, [lumaq+lstrideq*0+ 0]
    mova         xm6, [lumaq+lstrideq*0+16]
    mova         xm0, [srcq]
    vpbroadcastd m7, [pb_1]
    vinserti128  m4, [lumaq+lstrideq*2 +0], 1
    vinserti128  m6, [lumaq+lstrideq*2+16], 1
    vinserti128  m0, [srcq+strideq], 1
    pxor         m2, m2
    pmaddubsw    m4, m7
    pmaddubsw    m6, m7
    pavgw        m4, m2
    pavgw        m6, m2

%if %1
    packuswb     m4, m6                 ; luma
    punpckhbw    m6, m4, m0
    punpcklbw    m4, m0                 ; { luma, chroma }
    pmaddubsw    m6, m14
    pmaddubsw    m4, m14
    psraw        m6, 6
    psraw        m4, 6
    paddw        m6, m15
    paddw        m4, m15
    packuswb     m4, m6                 ; pack+unpack = clip
    punpckhbw    m6, m4, m2
    punpcklbw    m4, m2
%endif

    punpckhwd    m5, m4, m2
    punpcklwd    m4, m2
    punpckhwd    m7, m6, m2
    punpcklwd    m6, m2                 ; m4-7: luma_src as dword

    ; scaling[luma_src]
    pcmpeqw      m3, m3
    pcmpeqw      m9, m9
    vpgatherdd   m8, [scalingq+m4], m3
    vpgatherdd   m4, [scalingq+m5], m9
    pcmpeqw      m3, m3
    pcmpeqw      m9, m9
    vpgatherdd   m5, [scalingq+m6], m3
    vpgatherdd   m6, [scalingq+m7], m9
    pand         m8, m10
    pand         m4, m10
    pand         m5, m10
    pand         m6, m10
    packusdw     m8, m4
    packusdw     m5, m6

    ; unpack chroma_source
    punpckhbw    m1, m0, m2
    punpcklbw    m0, m2                 ; m0-1: src as word

    ; grain = grain_lut[offy+y][offx+x]
%if %1
    vpbroadcastd m6, [pb_23_22]         ; FIXME
%endif
    movu         xm3, [grain_lutq+offxyq+ 0]
    movd         xm4, [grain_lutq+left_offxyq+ 0]
    vinserti128  m3, [grain_lutq+offxyq+82], 1
    vinserti128  m4, [grain_lutq+left_offxyq+82], 1
    punpcklbw    m4, m3
%if %1
    pmaddubsw    m4, m6, m4
    pmulhrsw     m4, [pw_1024]
%else
    pmaddubsw    m4, m15, m4
    pmulhrsw     m4, m14
%endif
    packsswb     m4, m4
    pcmpeqw      m6, m6                 ; FIXME
    psrldq       m6, 15                 ; FIXME
    vpblendvb    m3, m3, m4, m6
    pcmpgtb      m7, m2, m3
    punpcklbw    m2, m3, m7
    punpckhbw    m3, m7

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmullw       m2, m8
    pmullw       m3, m5
    pmulhrsw     m2, m11
    pmulhrsw     m3, m11

    ; dst = clip_pixel(src, noise)
    paddw        m0, m2
    paddw        m1, m3
    pmaxsw       m0, m13
    pmaxsw       m1, m13
    pminsw       m0, m12
    pminsw       m1, m12
    packuswb     m0, m1
    mova         [dstq], xm0
    vextracti128 [dstq+strideq], m0, 1

    lea          srcq, [srcq+strideq*2]
    lea          dstq, [dstq+strideq*2]
    lea          lumaq, [lumaq+lstrideq*4]
    add          grain_lutq, 82*2
    sub          hb, 2
    jg %%loop_y_h_overlap

    add          wq, 16
    jge %%end
    mov          srcq, r11mp
    mov          dstq, r12mp
    lea          lumaq, [r14+wq*2]
    add          srcq, wq
    add          dstq, wq

    ; r8m = sbym
    cmp          dword r8m, 0
    jne %%loop_x_hv_overlap
    jmp %%loop_x_h_overlap

%%end:
    RET

%%vertical_overlap:
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \
                sby, see, overlap, unused1, unused2, lstride

    movzx        sbyd, sbyb
    imul         seed, [fg_dataq+FGData.seed], 0x00010001
    imul         r7d, sbyd, 173 * 0x00010001
    imul         sbyd, 37 * 0x01000100
    add          r7d, (105 << 16) | 188
    add          sbyd, (178 << 24) | (141 << 8)
    and          r7d, 0x00ff00ff
    and          sbyd, 0xff00ff00
    xor          seed, r7d
    xor          seed, sbyd             ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                unused1, unused2, see, overlap, unused3, unused4, lstride

    mov          lumaq, r9mp
    lea          r12, [srcq+wq]
    lea          r13, [dstq+wq]
    lea          r14, [lumaq+wq*2]
    mov          r11mp, r12
    mov          r12mp, r13
    mov          lstrideq, r10mp
    neg          wq

%%loop_x_v_overlap:
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov          r6d, seed
    or           seed, 0xeff4eff4
    test         seeb, seeh
    setp         r7b                    ; parity of top_seed
    shr          seed, 16
    shl          r7d, 16
test seeb, seeh setp r7b ; parity of cur_seed or r6d, 0x00010001 xor r7d, r6d rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ offx, offy, see, overlap, top_offxy, unused, lstride rorx offyd, seed, 8 rorx offxd, seed, 12 and offyd, 0xf000f and offxd, 0xf000f imul offyd, 82 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyq, [offyq+offxq+0x10001*498+16*82] DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ h, offxy, see, overlap, top_offxy, unused, lstride movzx top_offxyd, offxyw shr offxyd, 16 mov hd, hm mov grain_lutq, grain_lutmp %%loop_y_v_overlap: ; src mova xm4, [lumaq+lstrideq*0+ 0] mova xm6, [lumaq+lstrideq*0+16] mova xm0, [srcq] vpbroadcastd m7, [pb_1] vinserti128 m4, [lumaq+lstrideq*2 +0], 1 vinserti128 m6, [lumaq+lstrideq*2+16], 1 vinserti128 m0, [srcq+strideq], 1 pxor m2, m2 pmaddubsw m4, m7 pmaddubsw m6, m7 pavgw m4, m2 pavgw m6, m2 %if %1 packuswb m4, m6 ; luma punpckhbw m6, m4, m0 punpcklbw m4, m0 ; { luma, chroma } pmaddubsw m6, m14 pmaddubsw m4, m14 psraw m6, 6 psraw m4, 6 paddw m6, m15 paddw m4, m15 packuswb m4, m6 ; pack+unpack = clip punpckhbw m6, m4, m2 punpcklbw m4, m2 %endif punpckhwd m5, m4, m2 punpcklwd m4, m2 punpckhwd m7, m6, m2 punpcklwd m6, m2 ; m4-7: luma_src as dword ; scaling[luma_src] pcmpeqw m3, m3 pcmpeqw m9, m9 vpgatherdd m8, [scalingq+m4], m3 vpgatherdd m4, [scalingq+m5], m9 pcmpeqw m3, m3 pcmpeqw m9, m9 vpgatherdd m5, [scalingq+m6], m3 vpgatherdd m6, [scalingq+m7], m9 pand m8, m10 pand m4, m10 pand m5, m10 pand m6, m10 packusdw m8, m4 packusdw m5, m6 ; unpack chroma_source punpckhbw m1, m0, m2 punpcklbw m0, m2 ; m0-1: src as word ; grain = grain_lut[offy+y][offx+x] %if %1 vpbroadcastd m6, [pb_23_22] %endif movq xm3, [grain_lutq+offxyq] movq xm4, [grain_lutq+top_offxyq] vinserti128 m3, [grain_lutq+offxyq+8], 1 vinserti128 m4, [grain_lutq+top_offxyq+8], 1 punpcklbw m4, m3 %if %1 pmaddubsw m4, m6, m4 pmulhrsw m4, [pw_1024] %else pmaddubsw 
m4, m15, m4 pmulhrsw m4, m14 %endif packsswb m4, m4 vpermq m4, m4, q3120 ; only interpolate first line, insert second line unmodified vinserti128 m3, m4, [grain_lutq+offxyq+82], 1 pcmpgtb m7, m2, m3 punpcklbw m2, m3, m7 punpckhbw m3, m7 ; noise = round2(scaling[luma_src] * grain, scaling_shift) pmullw m2, m8 pmullw m3, m5 pmulhrsw m2, m11 pmulhrsw m3, m11 ; dst = clip_pixel(src, noise) paddw m0, m2 paddw m1, m3 pmaxsw m0, m13 pmaxsw m1, m13 pminsw m0, m12 pminsw m1, m12 packuswb m0, m1 mova [dstq], xm0 vextracti128 [dstq+strideq], m0, 1 sub hb, 2 jl %%end_y_v_overlap lea srcq, [srcq+strideq*2] lea dstq, [dstq+strideq*2] lea lumaq, [lumaq+lstrideq*4] add grain_lutq, 82*2 jmp %%loop_y %%end_y_v_overlap: add wq, 16 jge %%end_hv mov srcq, r11mp mov dstq, r12mp lea lumaq, [r14+wq*2] add srcq, wq add dstq, wq ; since fg_dataq.overlap is guaranteed to be set, we never jump ; back to .loop_x_v_overlap, and instead always fall-through to ; h+v overlap %%loop_x_hv_overlap: ; we assume from the block above that bits 8-15 of r7d are zero'ed mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh setp r7b ; parity of top_seed shr seed, 16 shl r7d, 16 test seeb, seeh setp r7b ; parity of cur_seed or r6d, 0x00010001 xor r7d, r6d rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride lea topleft_offxyq, [top_offxyq+16] lea left_offxyq, [offyq+16] rorx offyd, seed, 8 rorx offxd, seed, 12 and offyd, 0xf000f and offxd, 0xf000f imul offyd, 82 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyq, [offyq+offxq+0x10001*498+16*82] DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride movzx top_offxyd, offxyw shr offxyd, 16 mov hd, hm mov grain_lutq, grain_lutmp %%loop_y_hv_overlap: ; src mova xm4, [lumaq+lstrideq*0+ 0] mova xm6, [lumaq+lstrideq*0+16] mova xm0, [srcq] vpbroadcastd m7, [pb_1] 
vinserti128 m4, [lumaq+lstrideq*2 +0], 1 vinserti128 m6, [lumaq+lstrideq*2+16], 1 vinserti128 m0, [srcq+strideq], 1 pxor m2, m2 pmaddubsw m4, m7 pmaddubsw m6, m7 pavgw m4, m2 pavgw m6, m2 %if %1 packuswb m4, m6 ; luma punpckhbw m6, m4, m0 punpcklbw m4, m0 ; { luma, chroma } pmaddubsw m6, m14 pmaddubsw m4, m14 psraw m6, 6 psraw m4, 6 paddw m6, m15 paddw m4, m15 packuswb m4, m6 ; pack+unpack = clip punpckhbw m6, m4, m2 punpcklbw m4, m2 %endif punpckhwd m5, m4, m2 punpcklwd m4, m2 punpckhwd m7, m6, m2 punpcklwd m6, m2 ; m4-7: src as dword ; scaling[src] pcmpeqw m9, m9 pcmpeqw m3, m3 vpgatherdd m8, [scalingq+m4], m9 vpgatherdd m4, [scalingq+m5], m3 pcmpeqw m9, m9 pcmpeqw m3, m3 vpgatherdd m5, [scalingq+m6], m9 vpgatherdd m6, [scalingq+m7], m3 pand m8, m10 pand m4, m10 pand m5, m10 pand m6, m10 packusdw m8, m4 packusdw m5, m6 ; unpack chroma source punpckhbw m1, m0, m2 punpcklbw m0, m2 ; m0-1: src as word ; grain = grain_lut[offy+y][offx+x] %if %1 vpbroadcastd m9, [pb_23_22] %endif movu xm3, [grain_lutq+offxyq] movq xm6, [grain_lutq+top_offxyq] vinserti128 m3, [grain_lutq+offxyq+82], 1 vinserti128 m6, [grain_lutq+top_offxyq+8], 1 movd xm4, [grain_lutq+left_offxyq] movd xm7, [grain_lutq+topleft_offxyq] vinserti128 m4, [grain_lutq+left_offxyq+82], 1 ; do h interpolation first (so top | top/left -> top, left | cur -> cur) punpcklbw m4, m3 punpcklbw xm7, xm6 %if %1 pmaddubsw m4, m9, m4 pmaddubsw xm7, xm9, xm7 pmulhrsw m4, [pw_1024] pmulhrsw xm7, [pw_1024] %else pmaddubsw m4, m15, m4 pmaddubsw xm7, xm15, xm7 pmulhrsw m4, m14 pmulhrsw xm7, xm14 %endif packsswb m4, m4 packsswb xm7, xm7 pcmpeqw m9, m9 ; this is kind of ugly psrldq m9, 15 vpblendvb m3, m3, m4, m9 shufpd m9, m9, m9, 1110b vpblendvb m6, m6, m7, m9 vpermq m9, m3, q3120 ; followed by v interpolation (top | cur -> cur) punpcklbw m6, m9 %if %1 vpbroadcastd m9, [pb_23_22] pmaddubsw m6, m9, m6 pmulhrsw m6, [pw_1024] %else pmaddubsw m6, m15, m6 pmulhrsw m6, m14 %endif packsswb m6, m6 vpermq m6, m6, q3120 vpblendd m3, m3, 
m6, 00001111b pcmpgtb m7, m2, m3 punpcklbw m2, m3, m7 punpckhbw m3, m7 ; noise = round2(scaling[src] * grain, scaling_shift) pmullw m2, m8 pmullw m3, m5 pmulhrsw m2, m11 pmulhrsw m3, m11 ; dst = clip_pixel(src, noise) paddw m0, m2 paddw m1, m3 pmaxsw m0, m13 pmaxsw m1, m13 pminsw m0, m12 pminsw m1, m12 packuswb m0, m1 mova [dstq], xm0 vextracti128 [dstq+strideq], m0, 1 lea srcq, [srcq+strideq*2] lea dstq, [dstq+strideq*2] lea lumaq, [lumaq+lstrideq*4] add grain_lutq, 82*2 sub hb, 2 jg %%loop_y_h_overlap %%end_y_hv_overlap: add wq, 16 jge %%end_hv mov srcq, r11mp mov dstq, r12mp lea lumaq, [r14+wq*2] add srcq, wq add dstq, wq jmp %%loop_x_hv_overlap %%end_hv: RET %endmacro FGUV_32x32xN_LOOP 1 .csfl: FGUV_32x32xN_LOOP 0 %endif ; ARCH_X86_64