ref: 239b87f071cd016376b196617a896683591b7626
dir: /src/x86/film_grain_ssse3.asm/
; Copyright © 2019, VideoLAN and dav1d authors ; Copyright © 2019, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
%include "ext/x86/x86inc.asm"

SECTION_RODATA

pw_1024: times 8 dw 1024
pb_27_17: times 8 db 27, 17
pb_17_27: times 8 db 17, 27
; 16-entry byte table used with pshufb: entry i is 0x80 when the 4-bit
; index i has an odd number of set bits, 0 otherwise
pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0
; per-plane seed xor, indexed as pw_seed_xor+uv*4 (uv=0 -> 0xb524, uv=1 -> 0x49d8)
pw_seed_xor: times 2 dw 0xb524
             times 2 dw 0x49d8
pb_23_22: times 2 db 23, 22
pb_1: times 4 db 1
; NOTE: the tables below are also indexed past their own end (e.g.
; hmul_bits+shift*2, round_vals-12+shift*2), so their relative order and
; adjacency in memory is relied upon by the code
hmul_bits: dw 32768, 16384, 8192, 4096
round: dw 2048, 1024, 512
mul_bits: dw 256, 128, 64, 32, 16
round_vals: dw 32, 64, 128, 256, 512
max: dw 255, 240, 235
min: dw 0, 16
pw_1: dw 1

; starts 2 bytes before pb_17_27, i.e. at the final "27, 17" pair of
; pb_27_17, giving the byte sequence 27, 17, 17, 27
%define pb_27_17_17_27 pb_17_27 - 2

; JMP_TABLE name, lag0, lag1, ...
; Defines <name>_table and emits one dword per listed lag holding the
; offset of the function-local label .ar<lag> relative to the table start.
%macro JMP_TABLE 1-*
    %xdefine %1_table %%table
    %xdefine %%base %1_table
    %xdefine %%prefix mangle(private_prefix %+ _%1)
    %%table:
    %rep %0 - 1
        dd %%prefix %+ .ar%2 - %%base
        %rotate 1
    %endrep
%endmacro

JMP_TABLE generate_grain_y_ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_420_ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_422_ssse3, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_444_ssse3, 0, 1, 2, 3

; Field offsets of the film grain parameter structure read by this file.
; presumably mirrors a C struct declared elsewhere - keep in sync with it
struc FGData
    .seed:                      resd 1
    .num_y_points:              resd 1
    .y_points:                  resb 14 * 2
    .chroma_scaling_from_luma:  resd 1
    .num_uv_points:             resd 2
    .uv_points:                 resb 2 * 10 * 2
    .scaling_shift:             resd 1
    .ar_coeff_lag:              resd 1
    .ar_coeffs_y:               resb 24
    .ar_coeffs_uv:              resb 2 * 28 ; includes padding
    .ar_coeff_shift:            resq 1
    .grain_scale_shift:         resd 1
    .uv_mult:                   resd 2
    .uv_luma_mult:              resd 2
    .uv_offset:                 resd 2
    .overlap_flag:              resd 1
    .clip_to_restricted_range:  resd 1
endstruc

cextern gaussian_sequence

SECTION .text

; SCRATCH src, dst, slot
; x86-32: spills m<src> to [rsp+slot*mmsize] and makes m<dst> an alias for
;         that stack slot (only 8 vector registers are available there).
; x86-64: simply renames m<src> to m<dst> via SWAP.
%macro SCRATCH 3
%if ARCH_X86_32
    mova        [rsp+%3*mmsize], m%1
%define m%2 [rsp+%3*mmsize]
%else
    SWAP        %1, %2
%endif
%endmacro

;-----------------------------------------------------------------------
; generate_grain_y(buf, fg_data)
;
; Step 1: fills buf with signed bytes, 4 per iteration, covering 73 rows
;         of 82 bytes. Each value is gaussian_sequence[seed >> 5] scaled by
;         pmulhrsw with round[grain_scale_shift] and saturated to int8.
;         NOTE(review): 73*82 is not a multiple of 4, so the last 4-byte
;         store writes 2 bytes past buf+73*82 - assumes the caller's
;         buffer has that much slack; confirm against the caller.
; Step 2: dispatches on fg_data->ar_coeff_lag through the jump table to
;         .ar0/.ar1/.ar2/.ar3. Lag 0 returns immediately; the others
;         filter 70 rows x 76 columns in place, starting at row 3,
;         column 3 (row stride 82).
;-----------------------------------------------------------------------
INIT_XMM ssse3
cglobal generate_grain_y, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data
    LEA         r4, $$
%define base r4-$$
    movq        m1, [base+rnd_next_upperbit_mask]
    movq        m4, [base+mul_bits]
    movq        m7, [base+hmul_bits]
    mov         r2d, [fg_dataq+FGData.grain_scale_shift]
    movd        m2, [base+round+r2*2]
    movd        m0, [fg_dataq+FGData.seed]
    mova        m5, [base+pb_mask]
    pshuflw     m2, m2, q0000
    pshuflw     m0, m0, q0000
    ; r2 runs from -73*82 up to 0; bufq is advanced to the end of the area
    ; so that [bufq+r2] walks forward through it
    mov         r2, -73*82
    sub         bufq, r2
    lea         r3, [base+gaussian_sequence]
.loop:
    ; advance the 16-bit random state 4 steps at once (one per word lane)
    pand        m6, m0, m1
    psrlw       m3, m6, 10
    por         m6, m3                  ; bits 0xf, 0x1e, 0x3c and 0x78 are set
    pmullw      m6, m4                  ; bits 0x0f00 are set
    pshufb      m3, m5, m6              ; set 15th bit for next 4 seeds
    psllq       m6, m3, 30
    por         m3, m6
    psllq       m6, m3, 15
    por         m3, m6                  ; aggregate each bit into next seed's high bit
    pmulhuw     m6, m0, m7
    por         m3, m6                  ; 4 next output seeds
    pshuflw     m0, m3, q3333           ; last seed becomes the state for the next iteration
    psrlw       m3, 5                   ; table index = seed >> 5
    ; gather 4 words gaussian_sequence[idx] (no hardware gather in SSSE3)
%if ARCH_X86_64
    movq        r6, m3
    mov         r8, r6
    movzx       r5d, r6w
    shr         r6d, 16
    shr         r8, 32
    movzx       r7, r8w
    shr         r8, 16
    movd        m6, [r3+r5*2]
    pinsrw      m6, [r3+r6*2], 1
    pinsrw      m6, [r3+r7*2], 2
    pinsrw      m6, [r3+r8*2], 3
%else
    movd        r6, m3
    pshuflw     m3, m3, q3232
    movzx       r5, r6w
    shr         r6, 16
    movd        m6, [r3+r5*2]
    pinsrw      m6, [r3+r6*2], 1
    movd        r6, m3
    movzx       r5, r6w
    shr         r6, 16
    pinsrw      m6, [r3+r5*2], 2
    pinsrw      m6, [r3+r6*2], 3
%endif
    pmulhrsw    m6, m2                  ; rounded scale-down by round[grain_scale_shift]
    packsswb    m6, m6
    movd        [bufq+r2], m6
    add         r2, 4
    jl .loop

    ; auto-regression code
    ; jump to .ar<ar_coeff_lag> via the table of label offsets
    movsxd      r2, [fg_dataq+FGData.ar_coeff_lag]
    movsxd      r2, [base+generate_grain_y_ssse3_table+r2*4]
    lea         r2, [r2+base+generate_grain_y_ssse3_table]
    jmp r2

    ; lag 1: 3 taps from the row above are computed 4 pixels at a time in
    ; SIMD; the left-neighbour tap (cf3) is applied in scalar code because
    ; it depends on the pixel just written
.ar1:
    ; the DEFINE_ARGS permutations place "shift" in the register whose
    ; low byte is cl, as required by "sar val3d, shiftb" below
%if ARCH_X86_32
    DEFINE_ARGS buf, fg_data, cf3, unused, val3, min, max
%elif WIN64
    DEFINE_ARGS shift, fg_data, cf3, buf, val3, min, max, x, val0
    mov         bufq, r0
%else
    DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0
%endif
    movsx       cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
    movd        m4, [fg_dataq+FGData.ar_coeffs_y]
    mov         ecx, [fg_dataq+FGData.ar_coeff_shift]
%if ARCH_X86_32
    ; out of registers: cf3 and the row counter live in argument stack slots
    mov         r1m, cf3d
    DEFINE_ARGS buf, shift, val3, min, max, x, val0
%define hd r0mp
%define cf3d r1mp
%elif WIN64
    DEFINE_ARGS shift, h, cf3, buf, val3, min, max, x, val0
%else
    DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0
%endif
    pxor        m6, m6
    pcmpgtb     m7, m6, m4              ; sign mask for byte -> word extension
    punpcklbw   m4, m7
    pinsrw      m4, [base+pw_1], 3      ; word 3 = 1: multiplier for the rounding term
    pshufd      m5, m4, q1111           ; (cf2, 1) pairs
    pshufd      m4, m4, q0000           ; (cf0, cf1) pairs
    movd        m3, [base+round_vals+shiftq*2-12] ; rnd
    pshuflw     m3, m3, q0000
    sub         bufq, 82*73-(82*3+79)   ; bufq = buf + 82*3 + 79 (x below is negative)
    mov         hd, 70
    mov         mind, -128
    mov         maxd, 127
.y_loop_ar1:
    mov         xq, -76
    movsx       val3d, byte [bufq+xq-1] ; left neighbour of the first pixel
.x_loop_ar1:
    movq        m0, [bufq+xq-82-1]      ; top/left
    pcmpgtb     m7, m6, m0
    punpcklbw   m0, m7
    psrldq      m2, m0, 2               ; top
    psrldq      m1, m0, 4               ; top/right
    punpcklwd   m0, m2
    punpcklwd   m1, m3
    pmaddwd     m0, m4                  ; cf0*topleft + cf1*top
    pmaddwd     m1, m5                  ; cf2*topright + rnd
    paddd       m0, m1                  ; 4 dword sums, one per output pixel
.x_loop_ar1_inner:
    movd        val0d, m0
    psrldq      m0, 4
    imul        val3d, cf3d
    add         val3d, val0d
    sar         val3d, shiftb
    movsx       val0d, byte [bufq+xq]
    add         val3d, val0d
    ; clamp to [mind, maxd]
    cmp         val3d, maxd
    cmovns      val3d, maxd
    cmp         val3d, mind
    cmovs       val3d, mind
    mov         byte [bufq+xq], val3b
    ; keep val3d in-place as left for next x iteration
    inc         xq
    jz .x_loop_ar1_end
    test        xq, 3
    jnz .x_loop_ar1_inner               ; still inside the current group of 4
    jmp .x_loop_ar1
.x_loop_ar1_end:
    add         bufq, 82
    dec         hd
    jg .y_loop_ar1
.ar0:
    RET

    ; lag 2: 10 taps from the two rows above (SIMD, 4 pixels at a time)
    ; plus 2 taps from the current row (cf10-11), applied per pixel
.ar2:
%if ARCH_X86_32
%assign stack_offset_old stack_offset
    ALLOC_STACK -16*8
%endif
    DEFINE_ARGS buf, fg_data, shift
    mov         shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movd        m6, [base+round_vals-12+shiftq*2]
    movd        m7, [base+byte_blend+1]
    SCRATCH     7, 15, 7                ; m15 = byte mask selecting the pixel to replace
    movq        m0, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-7
    movd        m1, [fg_dataq+FGData.ar_coeffs_y+8] ; cf8-11
    pxor        m7, m7
    pshuflw     m6, m6, q0000
    punpcklwd   m6, m7                  ; rounding constant as dwords
    pcmpgtb     m4, m7, m0
    pcmpgtb     m5, m7, m1
    punpcklbw   m0, m4
    punpcklbw   m1, m5
    DEFINE_ARGS buf, fg_data, h, x
    ; broadcast each coefficient pair across a full register
    pshufd      m4, m1, q0000
    pshufd      m5, m1, q1111
    pshufd      m3, m0, q3333
    pshufd      m2, m0, q2222
    pshufd      m1, m0, q1111
    pshufd      m0, m0, q0000
    SCRATCH     0, 8, 0
    SCRATCH     1, 9, 1
    SCRATCH     2, 10, 2
    SCRATCH     3, 11, 3
    SCRATCH     4, 12, 4
    SCRATCH     5, 13, 5
    SCRATCH     6, 14, 6
    sub         bufq, 82*73-(82*3+79)
    mov         hd, 70
.y_loop_ar2:
    mov         xq, -76
.x_loop_ar2:
    movq        m0, [bufq+xq-82*2-2]    ; y=-2,x=[-2,+5]
    movhps      m0, [bufq+xq-82*1-2]    ; y=-1,x=[-2,+5]
    pcmpgtb     m2, m7, m0
    punpckhbw   m1, m0, m2
    punpcklbw   m0, m2
    psrldq      m5, m0, 2               ; y=-2,x=[-1,+5]
    psrldq      m3, m1, 2               ; y=-1,x=[-1,+5]
    psrldq      m4, m1, 4               ; y=-1,x=[+0,+5]
    punpcklwd   m2, m0, m5
    punpcklwd   m3, m4
    pmaddwd     m2, m8
    pmaddwd     m3, m11
    paddd       m2, m3
    psrldq      m4, m0, 4               ; y=-2,x=[+0,+5]
    psrldq      m5, m0, 6               ; y=-2,x=[+1,+5]
    psrldq      m6, m0, 8               ; y=-2,x=[+2,+5]
    punpcklwd   m4, m5
    punpcklwd   m6, m1
    psrldq      m5, m1, 6               ; y=-1,x=[+1,+5]
    psrldq      m1, m1, 8               ; y=-1,x=[+2,+5]
    punpcklwd   m5, m1
    pmaddwd     m4, m9
    pmaddwd     m6, m10
    pmaddwd     m5, m12
    paddd       m4, m6
    paddd       m2, m5
    paddd       m2, m4
    paddd       m2, m14                 ; + rounding constant
    movq        m0, [bufq+xq-2]         ; y=0,x=[-2,+5]
.x_loop_ar2_inner:
    pcmpgtb     m4, m7, m0
    punpcklbw   m1, m0, m4
    pmaddwd     m3, m1, m13
    paddd       m3, m2
    psrldq      m1, 4                   ; y=0,x=0
    psrldq      m2, 4                   ; shift top to next pixel
    psrad       m3, [fg_dataq+FGData.ar_coeff_shift]
    ; don't packssdw since we only care about one value
    paddw       m3, m1
    packsswb    m3, m3
    pslldq      m3, 2
    pand        m3, m15
    pandn       m1, m15, m0
    por         m0, m1, m3
    psrldq      m0, 1
    ; overwrite 2 pixels, but that's ok
    movd        [bufq+xq-1], m0
    inc         xq
    jz .x_loop_ar2_end
    test        xq, 3
    jnz .x_loop_ar2_inner
    jmp .x_loop_ar2
.x_loop_ar2_end:
    add         bufq, 82
    dec         hd
    jg .y_loop_ar2
    RET

    ; lag 3: 21 taps from the three rows above (SIMD) plus 3 taps from the
    ; current row (cf21-23), applied per pixel. Coefficient pairs that do
    ; not fit in registers are kept in 6 stack slots at [rsp+0..5*16].
.ar3:
    DEFINE_ARGS buf, fg_data, shift
%if ARCH_X86_32
%assign stack_offset stack_offset_old
    ALLOC_STACK -16*14
%elif WIN64
    SUB         rsp, 16*6
%assign stack_size_padded (stack_size_padded+16*6)
%assign stack_size (stack_size+16*6)
%else
    ALLOC_STACK -16*6
%endif
    mov         shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movd        m6, [base+round_vals-12+shiftq*2]
    movd        m7, [base+byte_blend]
    movu        m0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15
    movq        m2, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23
    pxor        m3, m3
    pcmpgtb     m4, m3, m0
    pcmpgtb     m3, m2
    pshuflw     m6, m6, q0000
    SCRATCH     6, 14, 12               ; m14 = rounding constant (words)
    SCRATCH     7, 15, 13               ; m15 = byte mask selecting the pixel to replace
    punpckhbw   m1, m0, m4
    punpcklbw   m0, m4
    punpcklbw   m2, m3
    pshufd      m3, m0, q1111
    pshufd      m4, m0, q2222
    pshufd      m5, m0, q3333
    pshufd      m0, m0, q0000
    mova        [rsp+ 0*16], m0
    mova        [rsp+ 1*16], m3
    mova        [rsp+ 2*16], m4
    mova        [rsp+ 3*16], m5
    pshufd      m6, m1, q1111
    pshufd      m7, m1, q2222
    pshufd      m5, m1, q3333
    pshufd      m1, m1, q0000
    pshufd      m3, m2, q1111
    psrldq      m0, m2, 10              ; cf21-23 moved down to words 0-2
    pinsrw      m2, [base+pw_1], 5      ; pairs cf20 with 1 (multiplier for the rounding term)
    pshufd      m4, m2, q2222
    pshufd      m2, m2, q0000
    pinsrw      m0, [base+round_vals+shiftq*2-10], 3
    mova        [rsp+ 4*16], m1
    mova        [rsp+ 5*16], m6
    SCRATCH     7, 8, 6
    SCRATCH     5, 9, 7
    SCRATCH     2, 10, 8
    SCRATCH     3, 11, 9
    SCRATCH     4, 12, 10
    SCRATCH     0, 13, 11
    DEFINE_ARGS buf, fg_data, h, x
    sub         bufq, 82*73-(82*3+79)
    mov         hd, 70
.y_loop_ar3:
    mov         xq, -76
.x_loop_ar3:
    movu        m0, [bufq+xq-82*3-3]    ; y=-3,x=[-3,+12]
    pxor        m3, m3
    pcmpgtb     m3, m0
    punpckhbw   m2, m0, m3
    punpcklbw   m0, m3
    psrldq      m5, m0, 2
    psrldq      m6, m0, 4
    psrldq      m7, m0, 6
    punpcklwd   m4, m0, m5
    punpcklwd   m6, m7
    pmaddwd     m4, [rsp+ 0*16]
    pmaddwd     m6, [rsp+ 1*16]
    paddd       m4, m6
    movu        m1, [bufq+xq-82*2-3]    ; y=-2,x=[-3,+12]
    pxor        m5, m5
    pcmpgtb     m5, m1
    punpckhbw   m3, m1, m5
    punpcklbw   m1, m5
    palignr     m6, m2, m0, 10
    palignr     m7, m2, m0, 12
    psrldq      m0, 8
    punpcklwd   m0, m6
    punpcklwd   m7, m1
    pmaddwd     m0, [rsp+ 2*16]
    pmaddwd     m7, [rsp+ 3*16]
    paddd       m0, m7
    paddd       m0, m4
    psrldq      m4, m1, 2
    psrldq      m5, m1, 4
    psrldq      m6, m1, 6
    psrldq      m7, m1, 8
    punpcklwd   m4, m5
    punpcklwd   m6, m7
    pmaddwd     m4, [rsp+ 4*16]
    pmaddwd     m6, [rsp+ 5*16]
    paddd       m4, m6
    paddd       m0, m4
    movu        m2, [bufq+xq-82*1-3]    ; y=-1,x=[-3,+12]
    pxor        m7, m7
    pcmpgtb     m7, m2
    punpckhbw   m5, m2, m7
    punpcklbw   m2, m7
    palignr     m7, m3, m1, 10
    palignr     m3, m1, 12
    psrldq      m1, m2, 2
    punpcklwd   m7, m3
    punpcklwd   m3, m2, m1
    pmaddwd     m7, m8
    pmaddwd     m3, m9
    paddd       m7, m3
    paddd       m0, m7
    psrldq      m6, m2, 4
    psrldq      m1, m2, 6
    psrldq      m3, m2, 8
    palignr     m4, m5, m2, 10
    palignr     m5, m5, m2, 12
    punpcklwd   m6, m1
    punpcklwd   m3, m4
    punpcklwd   m5, m14                 ; pair last top pixel with the rounding constant
    pmaddwd     m6, m10
    pmaddwd     m3, m11
    pmaddwd     m5, m12
    paddd       m0, m6
    paddd       m3, m5
    paddd       m0, m3
    movq        m1, [bufq+xq-3]         ; y=0,x=[-3,+4]
.x_loop_ar3_inner:
    pxor        m5, m5
    pcmpgtb     m5, m1
    punpcklbw   m2, m1, m5
    pmaddwd     m2, m13
    pshufd      m3, m2, q1111
    paddd       m2, m3                  ; left+cur
    paddd       m2, m0                  ; add top
    psrldq      m0, 4
    psrad       m2, [fg_dataq+FGData.ar_coeff_shift]
    ; don't packssdw since we only care about one value
    packsswb    m2, m2
    pslldq      m2, 3
    pand        m2, m15
    pandn       m3, m15, m1
    por         m1, m2, m3
    movd        [bufq+xq-3], m1
    psrldq      m1, 1
    inc         xq
    jz .x_loop_ar3_end
    test        xq, 3
    jnz .x_loop_ar3_inner
    jmp .x_loop_ar3
.x_loop_ar3_end:
    add         bufq, 82
    dec         hd
    jg .y_loop_ar3
    RET

;-----------------------------------------------------------------------
; generate_grain_uv_fn ss_name, ss_x, ss_y
;
; Emits generate_grain_uv_<ss_name>(buf, bufy, fg_data, uv).
;
; Step 1: random fill of buf as in generate_grain_y, with the seed xor'ed
;         with pw_seed_xor[uv]. With ss_x set, 73-35*ss_y rows of 44 bytes
;         are written (row stride 82); otherwise 73*82 bytes are written
;         linearly, 4 at a time (same 2-byte overrun as the luma version).
; Step 2: auto-regression selected by ar_coeff_lag. It covers 70-35*ss_y
;         rows and 76 >> ss_x columns, starting at row 3, column 3. Each
;         variant also adds a term from the luma grain in bufy; with
;         subsampling the luma samples are first summed over 2 (ss_x) or
;         2x2 (ss_x and ss_y) neighbours and scaled down with rounding.
;         uv*28 is the byte offset of the plane's coefficient set within
;         ar_coeffs_uv.
;-----------------------------------------------------------------------
%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y
INIT_XMM ssse3
cglobal generate_grain_uv_%1, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv
    movifnidn   r2, r2mp
    movifnidn   r3, r3mp
    LEA         r4, $$
%define base r4-$$
    movq        m1, [base+rnd_next_upperbit_mask]
    movq        m4, [base+mul_bits]
    movq        m7, [base+hmul_bits]
    mov         r5d, [fg_dataq+FGData.grain_scale_shift]
    movd        m6, [base+round+r5*2]
    mova        m5, [base+pb_mask]
    movd        m0, [fg_dataq+FGData.seed]
    movd        m2, [base+pw_seed_xor+uvq*4]
    pxor        m0, m2
    pshuflw     m6, m6, q0000
    pshuflw     m0, m0, q0000
    lea         r6, [base+gaussian_sequence]
%if %2
    ; horizontally subsampled: row counter in r7d (x86-64) or in the
    ; 4th argument's stack slot (x86-32)
%if ARCH_X86_64
    mov         r7d, 73-35*%3
%else
    mov         r3mp, 73-35*%3
%endif
    add         bufq, 44
.loop_y:
    mov         r5, -44
.loop_x:
%else
    mov         r5, -82*73
    sub         bufq, r5
.loop:
%endif
    pand        m2, m0, m1
    psrlw       m3, m2, 10
    por         m2, m3                  ; bits 0xf, 0x1e, 0x3c and 0x78 are set
    pmullw      m2, m4                  ; bits 0x0f00 are set
    pshufb      m3, m5, m2              ; set 15th bit for next 4 seeds
    psllq       m2, m3, 30
    por         m3, m2
    psllq       m2, m3, 15
    por         m3, m2                  ; aggregate each bit into next seed's high bit
    pmulhuw     m2, m0, m7
    por         m2, m3                  ; 4 next output seeds
    pshuflw     m0, m2, q3333
    psrlw       m2, 5
%if ARCH_X86_64
    movd        r9d, m2
    pshuflw     m2, m2, q3232
    movzx       r8, r9w
    shr         r9, 16
    movd        m3, [r6+r8*2]
    pinsrw      m3, [r6+r9*2], 1
    movd        r9d, m2
    movzx       r8, r9w
    shr         r9, 16
    pinsrw      m3, [r6+r8*2], 2
    pinsrw      m3, [r6+r9*2], 3
%else
    ; r1/r2 are used as temporaries here; r2 (fg_data) is reloaded after the loop
    movd        r2, m2
    pshuflw     m2, m2, q3232
    movzx       r1, r2w
    shr         r2, 16
    movd        m3, [r6+r1*2]
    pinsrw      m3, [r6+r2*2], 1
    movd        r2, m2
    movzx       r1, r2w
    shr         r2, 16
    pinsrw      m3, [r6+r1*2], 2
    pinsrw      m3, [r6+r2*2], 3
%endif
    pmulhrsw    m3, m6
    packsswb    m3, m3
    movd        [bufq+r5], m3
    add         r5, 4
%if %2
    jl .loop_x
    add         bufq, 82
%if ARCH_X86_64
    dec         r7d
%else
    dec         r3mp
%endif
    jg .loop_y
%else
    jl .loop
%endif
%if ARCH_X86_32
    mov         r2, r2mp
%endif

    ; auto-regression code
    movsxd      r5, [fg_dataq+FGData.ar_coeff_lag]
    movsxd      r5, [base+generate_grain_uv_%1_ssse3_table+r5*4]
    lea         r5, [r5+base+generate_grain_uv_%1_ssse3_table]
    jmp r5

    ; lag 0: no chroma neighbours, only the luma term:
    ; chroma += rounded shift of (luma * coefficient 0)
.ar0:
    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
    movifnidn   bufyq, bufymp
%if ARCH_X86_32
%assign stack_offset_old stack_offset
    ALLOC_STACK -2*16
%endif
    imul        uvd, 28
    mov         shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movd        m5, [fg_dataq+FGData.ar_coeffs_uv+uvq]
    ; pmulhrsw multiplier implementing the rounded right shift; for larger
    ; shift values this index reads into the tables that follow hmul_bits
    movd        m4, [base+hmul_bits+shiftq*2]
    DEFINE_ARGS buf, bufy, h, x
    pxor        m0, m0
    pcmpgtb     m0, m5
    punpcklbw   m5, m0
    movd        m7, [base+pb_1]
%if %2
    movd        m6, [base+hmul_bits+2+%3*2]
%endif
    pshuflw     m5, m5, q0000
    pshuflw     m4, m4, q0000
    pshufd      m7, m7, q0000
%if %2
    pshuflw     m6, m6, q0000
%endif
    punpcklqdq  m5, m5
    punpcklqdq  m4, m4
%if %2
    punpcklqdq  m6, m6
%endif
    ; m8 = mask with the low 12 (or 6 when subsampled) bytes clear: used on
    ; the last vector of a row so only those pixels are replaced
    pcmpeqw     m1, m1
    pslldq      m1, 12>>%2
    SCRATCH     1, 8, 0
    SCRATCH     4, 9, 1
%if %2
    sub         bufq, 82*(73-35*%3)+82-(82*3+41)
%else
    sub         bufq, 82*70-3
%endif
    add         bufyq, 3+82*3
    mov         hd, 70-35*%3
.y_loop_ar0:
    xor         xd, xd
.x_loop_ar0:
    ; first 32 pixels
%if %2
    movu        m1, [bufyq+xq*2]
%if %3
    movu        m2, [bufyq+xq*2+82]
%endif
    movu        m3, [bufyq+xq*2+16]
%if %3
    movu        m4, [bufyq+xq*2+82+16]
%endif
    pmaddubsw   m0, m7, m1              ; sum horizontal luma pairs
%if %3
    pmaddubsw   m1, m7, m2
%endif
    pmaddubsw   m2, m7, m3
%if %3
    pmaddubsw   m3, m7, m4
    paddw       m0, m1                  ; add the second luma row
    paddw       m2, m3
%endif
    pmulhrsw    m0, m6                  ; rounded scale-down of the luma sum
    pmulhrsw    m2, m6
%else
    movu        m0, [bufyq+xq]
    pxor        m6, m6
    pcmpgtb     m6, m0
    punpckhbw   m2, m0, m6
    punpcklbw   m0, m6
%endif
    pmullw      m0, m5
    pmullw      m2, m5
    pmulhrsw    m0, m9
    pmulhrsw    m2, m9
    movu        m1, [bufq+xq]
    pxor        m4, m4
    pcmpgtb     m4, m1
    punpckhbw   m3, m1, m4
%if %2
    punpcklbw   m1, m4
    paddw       m2, m3
    paddw       m0, m1
%else
    ; m1 must stay intact (original bytes) for the masked store of the last vector
    punpcklbw   m6, m1, m4
    paddw       m2, m3
    paddw       m0, m6
%endif
    packsswb    m0, m2
%if %2
    movu        [bufq+xq], m0
    add         xd, 16
    cmp         xd, 32
    jl .x_loop_ar0
    ; last 6/12 pixels
    movu        m1, [bufyq+xq*(1+%2)]
%if %3
    movu        m2, [bufyq+xq*2+82]
%endif
    pmaddubsw   m0, m7, m1
%if %3
    pmaddubsw   m1, m7, m2
    paddw       m0, m1
%endif
    pmulhrsw    m0, m6
    pmullw      m0, m5
    pmulhrsw    m0, m9
    movq        m1, [bufq+xq]
    pxor        m4, m4
    pcmpgtb     m4, m1
    punpcklbw   m2, m1, m4
    paddw       m0, m2
    packsswb    m0, m0
    pandn       m2, m8, m0              ; new values where the mask is clear
    pand        m1, m8                  ; original bytes elsewhere
    por         m2, m1
    movq        [bufq+xq], m2
%else
    add         xd, 16
    cmp         xd, 80
    je .y_loop_final_ar0
    movu        [bufq+xq-16], m0
    jmp .x_loop_ar0
.y_loop_final_ar0:
    pandn       m2, m8, m0
    pand        m1, m8
    por         m2, m1
    movu        [bufq+xq-16], m2
%endif
    add         bufq, 82
    add         bufyq, 82<<%3
    dec         hd
    jg .y_loop_ar0
    RET

    ; lag 1: top-left/top/top-right chroma taps and the luma tap in SIMD;
    ; the left-neighbour tap (cf3) in scalar code
.ar1:
%if ARCH_X86_32
%assign stack_offset stack_offset_old
%assign stack_size_padded 0
%xdefine rstk rsp
%endif
    DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x
    imul        uvd, 28
    movsx       cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
    ; load starting one byte early, overwrite bytes 4-5 with cf4-5, then
    ; shift down by one byte (psrldq below): result is cf0, cf1, cf2, cf4
    movd        m4, [fg_dataq+FGData.ar_coeffs_uv+uvq-1]
    pinsrw      m4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 2
%if ARCH_X86_32
    mov         r3mp, cf3d
    DEFINE_ARGS buf, shift, fg_data, val3, min, max, x
%elif WIN64
    DEFINE_ARGS shift, bufy, fg_data, buf, val3, cf3, min, max, x
    mov         bufq, r0
%else
    DEFINE_ARGS buf, bufy, fg_data, shift, val3, cf3, min, max, x
%endif
    mov         shiftd, [fg_dataq+FGData.ar_coeff_shift]
    movd        m3, [base+round_vals+shiftq*2-12] ; rnd
%if %2
    movd        m7, [base+pb_1]
    movd        m6, [base+hmul_bits+2+%3*2]
%endif
    psrldq      m4, 1
%if ARCH_X86_32
    DEFINE_ARGS buf, shift, val0, val3, min, max, x
%elif WIN64
    DEFINE_ARGS shift, bufy, h, buf, val3, cf3, min, max, x, val0
%else
    DEFINE_ARGS buf, bufy, h, shift, val3, cf3, min, max, x, val0
%endif
    pxor        m5, m5
    punpcklwd   m3, m5
%if %2
    punpcklwd   m6, m6
%endif
    pcmpgtb     m5, m4
    punpcklbw   m4, m5
    pshufd      m5, m4, q1111           ; (cf2, cf4) pairs
    pshufd      m4, m4, q0000           ; (cf0, cf1) pairs
    pshufd      m3, m3, q0000           ; rounding constant as dwords
%if %2
    pshufd      m7, m7, q0000
    pshufd      m6, m6, q0000
    sub         bufq, 82*(73-35*%3)+44-(82*3+41)
%else
    sub         bufq, 82*69+3
%endif
%if ARCH_X86_32
    ; bufy pointer and row counter live in argument stack slots
    add         r1mp, 79+82*3
    mov         r0mp, 70-35*%3
%else
    add         bufyq, 79+82*3
    mov         hd, 70-35*%3
%endif
    mov         mind, -128
    mov         maxd, 127
.y_loop_ar1:
    mov         xq, -(76>>%2)
    movsx       val3d, byte [bufq+xq-1]
.x_loop_ar1:
%if %2
%if ARCH_X86_32
    mov         r2, r1mp
    movq        m0, [r2+xq*2]
%if %3
    movq        m1, [r2+xq*2+82]
%endif
%else
    movq        m0, [bufyq+xq*2]
%if %3
    movq        m1, [bufyq+xq*2+82]
%endif
%endif
    pmaddubsw   m2, m7, m0
%if %3
    pmaddubsw   m0, m7, m1
    paddw       m2, m0
%endif
    pmulhrsw    m2, m6
%else
%if ARCH_X86_32
    mov         r2, r1mp
    movd        m2, [r2+xq]
%else
    movd        m2, [bufyq+xq]
%endif
    pxor        m0, m0
    pcmpgtb     m0, m2
    punpcklbw   m2, m0
%endif
    movq        m0, [bufq+xq-82-1]      ; top/left
    pxor        m1, m1
    pcmpgtb     m1, m0
    punpcklbw   m0, m1
    psrldq      m1, m0, 4               ; top/right
    punpcklwd   m1, m2
    psrldq      m2, m0, 2               ; top
    punpcklwd   m0, m2
    pmaddwd     m0, m4
    pmaddwd     m1, m5
    paddd       m0, m1
    paddd       m0, m3
.x_loop_ar1_inner:
    movd        val0d, m0
    psrldq      m0, 4
%if ARCH_X86_32
    imul        val3d, r3mp
%else
    imul        val3d, cf3d
%endif
    add         val3d, val0d
    sar         val3d, shiftb
    movsx       val0d, byte [bufq+xq]
    add         val3d, val0d
    ; clamp to [mind, maxd]
    cmp         val3d, maxd
    cmovns      val3d, maxd
    cmp         val3d, mind
    cmovs       val3d, mind
    mov         byte [bufq+xq], val3b
    ; keep val3d in-place as left for next x iteration
    inc         xq
    jz .x_loop_ar1_end
    test        xq, 3
    jnz .x_loop_ar1_inner
    jmp .x_loop_ar1
.x_loop_ar1_end:
    add         bufq, 82
%if ARCH_X86_32
    add         r1mp, 82<<%3
    dec         r0mp
%else
    add         bufyq, 82<<%3
    dec         hd
%endif
    jg .y_loop_ar1
    RET

    ; lag 2: 10 taps from the two rows above plus the luma tap in SIMD,
    ; 2 taps from the current row applied per pixel
.ar2:
%if ARCH_X86_32
%assign stack_offset stack_offset_old
%assign stack_size_padded 0
%xdefine rstk rsp
    ALLOC_STACK -8*16
%endif
    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
    movifnidn   bufyq, bufymp
    mov         shiftd, [fg_dataq+FGData.ar_coeff_shift]
    imul        uvd, 28
    movd        m7, [base+round_vals-12+shiftq*2]
    movu        m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-12
    pxor        m2, m2
    pcmpgtb     m2, m0
    punpckhbw   m1, m0, m2
    punpcklbw   m0, m2
    pinsrw      m1, [base+pw_1], 5      ; pairs cf12 with 1 (multiplier for the rounding term)
    punpcklwd   m7, m7
    pshufd      m7, m7, q0000
    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
    pshufd      m4, m1, q0000
    pshufd      m5, m1, q1111
    pshufd      m6, m1, q2222
    pshufd      m3, m0, q3333
    pshufd      m2, m0, q2222
    pshufd      m1, m0, q1111
    pshufd      m0, m0, q0000
    SCRATCH     0, 8, 0
    SCRATCH     1, 9, 1
    SCRATCH     2, 10, 2
    SCRATCH     3, 11, 3
    SCRATCH     4, 12, 4
    SCRATCH     5, 13, 5
    SCRATCH     6, 14, 6
    SCRATCH     7, 15, 7
%if %2
    movd        m7, [base+hmul_bits+2+%3*2]
    movd        m6, [base+pb_1]
    punpcklwd   m7, m7
    pshufd      m6, m6, q0000
    pshufd      m7, m7, q0000
    sub         bufq, 82*(73-35*%3)+44-(82*3+41)
%else
    sub         bufq, 82*69+3
%endif
    add         bufyq, 79+82*3
    mov         hd, 70-35*%3
.y_loop_ar2:
    mov         xq, -(76>>%2)
.x_loop_ar2:
    pxor        m2, m2
    movq        m0, [bufq+xq-82*2-2]    ; y=-2,x=[-2,+5]
    movhps      m0, [bufq+xq-82*1-2]    ; y=-1,x=[-2,+5]
    pcmpgtb     m2, m0
    punpckhbw   m1, m0, m2
    punpcklbw   m0, m2
    psrldq      m5, m0, 2               ; y=-2,x=[-1,+5]
    psrldq      m3, m1, 2               ; y=-1,x=[-1,+5]
    psrldq      m4, m1, 4               ; y=-1,x=[+0,+5]
    punpcklwd   m2, m0, m5
    punpcklwd   m3, m4
    pmaddwd     m2, m8
    pmaddwd     m3, m11
    paddd       m2, m3
    psrldq      m4, m0, 4               ; y=-2,x=[+0,+5]
    psrldq      m5, m0, 6               ; y=-2,x=[+1,+5]
    psrldq      m0, 8                   ; y=-2,x=[+2,+5]
    punpcklwd   m4, m5
    punpcklwd   m0, m1
    psrldq      m3, m1, 6               ; y=-1,x=[+1,+5]
    psrldq      m1, m1, 8               ; y=-1,x=[+2,+5]
    punpcklwd   m3, m1
    pmaddwd     m4, m9
    pmaddwd     m0, m10
    pmaddwd     m3, m12
    paddd       m4, m0
    paddd       m2, m3
    paddd       m2, m4
%if %2
    movq        m1, [bufyq+xq*2]
%if %3
    movq        m3, [bufyq+xq*2+82]
%endif
    pmaddubsw   m0, m6, m1
%if %3
    pmaddubsw   m1, m6, m3
    paddw       m0, m1
%endif
    pmulhrsw    m0, m7
%else
    movd        m0, [bufyq+xq]
    pxor        m1, m1
    pcmpgtb     m1, m0
    punpcklbw   m0, m1
%endif
    punpcklwd   m0, m15                 ; pair luma with the rounding constant
    pmaddwd     m0, m14
    paddd       m2, m0
    movq        m0, [bufq+xq-2]         ; y=0,x=[-2,+5]
    pxor        m4, m4
    movd        m5, [base+byte_blend+1]
    punpcklbw   m5, m5                  ; word mask selecting the current pixel
.x_loop_ar2_inner:
    pcmpgtb     m1, m4, m0
    punpcklbw   m0, m1
    pmaddwd     m3, m0, m13
    paddd       m3, m2
    psrldq      m2, 4                   ; shift top to next pixel
    psrad       m3, [fg_dataq+FGData.ar_coeff_shift]
    pslldq      m3, 4
    pand        m3, m5
    paddw       m0, m3
    packsswb    m0, m0
    movd        [bufq+xq-2], m0
    psrldq      m0, 1
    inc         xq
    jz .x_loop_ar2_end
    test        xq, 3
    jnz .x_loop_ar2_inner
    jmp .x_loop_ar2
.x_loop_ar2_end:
    add         bufq, 82
    add         bufyq, 82<<%3
    dec         hd
    jg .y_loop_ar2
    RET

    ; lag 3: 21 taps from the three rows above plus the luma tap in SIMD,
    ; 3 taps from the current row applied per pixel. Seven coefficient
    ; pairs are kept in stack slots at [rsp+0..6*16].
.ar3:
%if ARCH_X86_32
%assign stack_offset stack_offset_old
%assign stack_size_padded 0
%xdefine rstk rsp
%endif
    DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
    movifnidn   bufyq, bufymp
%if ARCH_X86_32
    ALLOC_STACK -15*16
%else
    SUB         rsp, 16*7
%assign stack_size_padded (stack_size_padded+16*7)
%assign stack_size (stack_size+16*7)
%endif
    mov         shiftd, [fg_dataq+FGData.ar_coeff_shift]
    imul        uvd, 28
    movu        m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-15
    pxor        m3, m3
    pcmpgtb     m3, m0
    punpckhbw   m1, m0, m3
    punpcklbw   m0, m3
    pshufd      m2, m0, q1111
    pshufd      m3, m0, q2222
    pshufd      m4, m0, q3333
    pshufd      m0, m0, q0000
    pshufd      m5, m1, q1111
    pshufd      m6, m1, q2222
    pshufd      m7, m1, q3333
    pshufd      m1, m1, q0000
    mova        [rsp+ 0*16], m0
    mova        [rsp+ 1*16], m2
    mova        [rsp+ 2*16], m3
    mova        [rsp+ 3*16], m4
    mova        [rsp+ 4*16], m1
    mova        [rsp+ 5*16], m5
    mova        [rsp+ 6*16], m6
    SCRATCH     7, 8, 7
    movu        m2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-24 [24=luma]
    pxor        m4, m4
    pcmpgtb     m4, m2
    punpckhbw   m5, m2, m4
    punpcklbw   m2, m4
    pshufd      m4, m2, q3232
    punpcklwd   m3, m4, m5
    pshuflw     m5, m4, q3321
    pshufd      m4, m3, q0000
    pshufd      m3, m2, q1111
    pshufd      m2, m2, q0000
    pinsrw      m5, [base+round_vals+shiftq*2-10], 3
    SCRATCH     2, 9, 8
    SCRATCH     3, 10, 9
    SCRATCH     4, 11, 10
    SCRATCH     5, 12, 11
    movd        m2, [base+round_vals-12+shiftq*2]
%if %2
    movd        m1, [base+pb_1]
    movd        m3, [base+hmul_bits+2+%3*2]
%endif
    pxor        m0, m0
    punpcklwd   m2, m0
%if %2
    punpcklwd   m3, m3
%endif
    pshufd      m2, m2, q0000
%if %2
    pshufd      m1, m1, q0000
    pshufd      m3, m3, q0000
    SCRATCH     1, 13, 12
%endif
    SCRATCH     2, 14, 13
%if %2
    SCRATCH     3, 15, 14
%endif
    DEFINE_ARGS buf, bufy, fg_data, h, unused, x
%if %2
    sub         bufq, 82*(73-35*%3)+44-(82*3+41)
%else
    sub         bufq, 82*69+3
%endif
    add         bufyq, 79+82*3
    mov         hd, 70-35*%3
.y_loop_ar3:
    mov         xq, -(76>>%2)
.x_loop_ar3:
    movu        m0, [bufq+xq-82*3-3]    ; y=-3,x=[-3,+12]
    pxor        m4, m4
    pcmpgtb     m4, m0
    punpckhbw   m3, m0, m4
    punpcklbw   m0, m4
    psrldq      m5, m0, 2
    psrldq      m6, m0, 4
    psrldq      m7, m0, 6
    punpcklwd   m4, m0, m5
    punpcklwd   m6, m7
    pmaddwd     m4, [rsp+ 0*16]
    pmaddwd     m6, [rsp+ 1*16]
    paddd       m4, m6
    palignr     m2, m3, m0, 10
    palignr     m3, m0, 12
    psrldq      m0, 8
    movu        m1, [bufq+xq-82*2-3]    ; y=-2,x=[-3,+12]
    pxor        m6, m6
    pcmpgtb     m6, m1
    punpckhbw   m5, m1, m6
    punpcklbw   m1, m6
    punpcklwd   m0, m2
    punpcklwd   m3, m1
    pmaddwd     m0, [rsp+ 2*16]
    pmaddwd     m3, [rsp+ 3*16]
    paddd       m0, m3
    paddd       m0, m4
    movu        m2, [bufq+xq-82*1-3]    ; y=-1,x=[-3,+12]
    pxor        m7, m7
    pcmpgtb     m7, m2
    punpckhbw   m6, m2, m7
    punpcklbw   m2, m7
    palignr     m3, m5, m1, 10
    palignr     m5, m1, 12
    psrldq      m4, m2, 2
    punpcklwd   m3, m5
    punpcklwd   m5, m2, m4
    pmaddwd     m3, [rsp+ 6*16]
    pmaddwd     m5, m8
    paddd       m3, m5
    paddd       m0, m3
    psrldq      m3, m1, 2
    psrldq      m4, m1, 4
    psrldq      m5, m1, 6
    psrldq      m1, 8
    punpcklwd   m3, m4
    punpcklwd   m5, m1
    pmaddwd     m3, [rsp+ 4*16]
    pmaddwd     m5, [rsp+ 5*16]
    paddd       m3, m5
    paddd       m0, m3
%if %2
    movq        m1, [bufyq+xq*2]
%if %3
    movq        m3, [bufyq+xq*2+82]
%endif
    pmaddubsw   m7, m13, m1
%if %3
    pmaddubsw   m5, m13, m3
    paddw       m7, m5
%endif
    pmulhrsw    m7, m15
%else
    movd        m7, [bufyq+xq]
    pxor        m1, m1
    pcmpgtb     m1, m7
    punpcklbw   m7, m1
%endif
    psrldq      m1, m2, 4
    psrldq      m3, m2, 6
    palignr     m4, m6, m2, 10
    palignr     m6, m2, 12
    psrldq      m2, 8
    punpcklwd   m1, m3
    punpcklwd   m2, m4
    punpcklwd   m6, m7                  ; pair last top pixel with the luma term
    pmaddwd     m1, m9
    pmaddwd     m2, m10
    pmaddwd     m6, m11
    paddd       m1, m2
    paddd       m0, m6
    paddd       m0, m1
    paddd       m0, m14                 ; + rounding constant
    movq        m1, [bufq+xq-3]         ; y=0,x=[-3,+4]
    pxor        m4, m4
    movd        m5, [base+byte_blend]
.x_loop_ar3_inner:
    pcmpgtb     m2, m4, m1
    punpcklbw   m3, m1, m2
    pmaddwd     m2, m3, m12
    pshufd      m3, m2, q1111
    paddd       m2, m3                  ; left+cur
    paddd       m2, m0                  ; add top
    psrldq      m0, 4
    psrad       m2, [fg_dataq+FGData.ar_coeff_shift]
    ; don't packssdw, we only care about one value
    packsswb    m2, m2
    pandn       m3, m5, m1
    pslld       m2, 24
    pand        m2, m5
    por         m1, m2, m3
    movd        [bufq+xq-3], m1
    psrldq      m1, 1
    inc         xq
    jz .x_loop_ar3_end
    test        xq, 3
    jnz .x_loop_ar3_inner
    jmp .x_loop_ar3
.x_loop_ar3_end:
    add         bufq, 82
    add         bufyq, 82<<%3
    dec         hd
    jg .y_loop_ar3
    RET
%endmacro

generate_grain_uv_fn 420, 1, 1
generate_grain_uv_fn 422, 1, 0
generate_grain_uv_fn 444, 0, 0

; vpgatherdw dst, src, base, tmp_gpr1, tmp_gpr2 [, tmp_xmm]
; Software gather: for each of the 8 word lanes of src, loads a word from
; [base + lane value] (byte offset, not scaled) into the same lane of dst.
; Clobbers both GPR temporaries. When no tmp_xmm is given, src itself is
; used as the shuffle temporary and is destroyed.
%macro vpgatherdw 5-6 ; dst, src, base, tmp_gpr[x2], tmp_xmm_reg
%assign %%idx 0
%define %%tmp %2
%if %0 == 6
%define %%tmp %6
%endif
%rep 4
%if %%idx == 0
    movd        %5 %+ d, %2
    pshuflw     %%tmp, %2, q3232
%else
    movd        %5 %+ d, %%tmp
%if %%idx == 2
    punpckhqdq  %%tmp, %%tmp
%elif %%idx == 4
    psrlq       %%tmp, 32
%endif
%endif
    movzx       %4 %+ d, %5 %+ w
    shr         %5 %+ d, 16
%if %%idx == 0
    movd        %1, [%3+%4]
%else
    pinsrw      %1, [%3+%4], %%idx + 0
%endif
    pinsrw      %1, [%3+%5], %%idx + 1
%assign %%idx %%idx+2
%endrep
%endmacro

INIT_XMM ssse3
; fgy_32x32xn(dst, src, stride, fg_data, w, scaling, grain_lut, h, sby)
%if ARCH_X86_32
%if STACK_ALIGNMENT < mmsize
cglobal fgy_32x32xn, 0, 7, 16, 0 - (6 * mmsize + (9 + 3) * gprsize), \
        dst, src, scaling, unused1, fg_data, picptr, unused2
    ; copy stack arguments to new position post-alignment, so that we
    ; don't have to keep the old stack location in a separate register
    mov         r0, r0m
    mov         r1, r2m
    mov         r2, r4m
    mov         r3, r6m
    mov         r4, r7m
    mov         r5, r8m
    mov         [rsp+6*mmsize+ 3*gprsize], r0
    mov         [rsp+6*mmsize+ 5*gprsize], r1
    mov         [rsp+6*mmsize+ 7*gprsize], r2
    mov         [rsp+6*mmsize+ 9*gprsize], r3
    mov         [rsp+6*mmsize+10*gprsize], r4
    mov         [rsp+6*mmsize+11*gprsize], r5
%else
cglobal fgy_32x32xn, 0, 7, 16, 6 * mmsize + (3 + 1) * gprsize, \
        dst, src, scaling, unused1, fg_data, picptr, unused2
%endif mov srcq, srcm mov fg_dataq, r3m mov scalingq, r5m %if STACK_ALIGNMENT < mmsize %define r0m [rsp+6*mmsize+ 3*gprsize] %define r1m [rsp+6*mmsize+ 4*gprsize] %define r2m [rsp+6*mmsize+ 5*gprsize] %define r3m [rsp+6*mmsize+ 6*gprsize] %define r4m [rsp+6*mmsize+ 7*gprsize] %define r5m [rsp+6*mmsize+ 8*gprsize] %define r6m [rsp+6*mmsize+ 9*gprsize] %define r7m [rsp+6*mmsize+10*gprsize] %define r8m [rsp+6*mmsize+11*gprsize] %endif LEA r5, pb_mask %define base r5-pb_mask mov r5m, picptrq %else cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut lea r7, [pb_mask] %define base r7-pb_mask %endif mov r6d, [fg_dataq+FGData.scaling_shift] movd m3, [base+mul_bits+r6*2-14] mov r6d, [fg_dataq+FGData.clip_to_restricted_range] pcmpeqw m2, m2 psrldq m2, 14 movd m4, [base+max+r6*4] movd m5, [base+min+r6*2] punpcklwd m3, m3 punpcklwd m4, m4 punpcklwd m5, m5 pshufd m3, m3, q0000 pshufd m4, m4, q0000 pshufd m5, m5, q0000 SCRATCH 2, 10, 0 SCRATCH 3, 11, 1 SCRATCH 4, 12, 2 SCRATCH 5, 13, 3 %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap %else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap %endif mov sbyd, r8m mov overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1 test overlapd, overlapd jz .no_vertical_overlap mova m6, [base+pw_1024] movd m7, [base+pb_27_17_17_27] SCRATCH 6, 14, 4 SCRATCH 7, 15, 5 test sbyd, sbyd jnz .vertical_overlap ; fall-through .no_vertical_overlap: mov r8m, overlapd %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused imul seed, (173 << 24) | 37 %else imul seed, sbyd, (173 << 24) | 37 %endif add seed, (105 << 24) | 178 rol seed, 8 movzx seed, seew xor seed, [fg_dataq+FGData.seed] %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak mov r3m, seed mov wq, r4m %else DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ unused1, unused2, see, unused3 %endif lea src_bakq, [srcq+wq] neg wq sub dstmp, 
srcq %if ARCH_X86_32 mov r1m, src_bakq mov r4m, wq DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3 %endif .loop_x: %if ARCH_X86_32 mov seed, r3m %endif mov r6d, seed or seed, 0xEFF4 shr r6d, 1 test seeb, seeh lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed %if ARCH_X86_32 mov r3m, seed DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx mov offxd, offyd %else DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ offx, offy, see, unused mov offyd, seed mov offxd, seed %endif ror offyd, 8 shr offxd, 12 and offyd, 0xf imul offyd, 164 lea offyq, [offyq+offxq*2+747] ; offy*stride+offx %if ARCH_X86_32 ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr, ; r6m=grain_lut, r7m=h, r8m=overlap_v|h DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut %else DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ h, offxy, see, unused %endif .loop_x_odd: mov hd, r7m mov grain_lutq, grain_lutmp .loop_y: ; src mova m0, [srcq] pxor m2, m2 punpckhbw m1, m0, m2 punpcklbw m0, m2 ; m0-1: src as word ; scaling[src] %if ARCH_X86_32 vpgatherdw m4, m0, scalingq, r0, r5, m3 vpgatherdw m5, m1, scalingq, r0, r5, m3 %else vpgatherdw m4, m0, scalingq, r12, r13, m3 vpgatherdw m5, m1, scalingq, r12, r13, m3 %endif pcmpeqw m3, m3 psrlw m3, 8 pand m4, m3 pand m5, m3 ; grain = grain_lut[offy+y][offx+x] movu m3, [grain_lutq+offxyq] pcmpgtb m7, m2, m3 punpcklbw m2, m3, m7 punpckhbw m3, m7 ; noise = round2(scaling[src] * grain, scaling_shift) pmullw m2, m4 pmullw m3, m5 pmulhrsw m2, m11 pmulhrsw m3, m11 ; dst = clip_pixel(src, noise) paddw m0, m2 paddw m1, m3 pmaxsw m0, m13 pmaxsw m1, m13 pminsw m0, m12 pminsw m1, m12 packuswb m0, m1 movifnidn dstq, dstmp mova [dstq+srcq], m0 add srcq, r2mp add grain_lutq, 82 dec hd jg .loop_y %if ARCH_X86_32 add r4mp, 16 %else add wq, 16 %endif jge .end %if ARCH_X86_32 mov srcq, r1mp add srcq, r4mp %else lea srcq, [src_bakq+wq] %endif btc dword r8m, 2 jc .next_blk add offxyd, 16 test dword r8m, 2 ; r8m & 2 
= have_top_overlap jz .loop_x_odd %if ARCH_X86_32 add dword [rsp+6*mmsize+1*gprsize], 16 %else add r11d, 16 ; top_offxyd %endif jnz .loop_x_odd_v_overlap .next_blk: test dword r8m, 1 jz .loop_x test dword r8m, 2 jnz .loop_x_hv_overlap ; horizontal overlap (without vertical overlap) .loop_x_h_overlap: %if ARCH_X86_32 ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr, ; r6m=grain_lut, r7m=h, r8m=overlap_v|h DEFINE_ARGS dst, src, scaling, offxy, unused1, unused2, unused3 add offxyd, 16 ; left_offxyd mov [rsp+6*mmsize+0*gprsize], offxyd DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3 mov seed, r3m %else DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ offx, offy, see, left_offxy lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx %endif mov r6d, seed or seed, 0xEFF4 shr r6d, 1 test seeb, seeh lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed %if ARCH_X86_32 mov r3m, seed DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx mov offxd, offyd %else mov offyd, seed mov offxd, seed %endif ror offyd, 8 shr offxd, 12 and offyd, 0xf imul offyd, 164 lea offyq, [offyq+offxq*2+747] ; offy*stride+offx %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut %else DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ h, offxy, see, left_offxy %endif mov hd, r7m mov grain_lutq, grain_lutmp .loop_y_h_overlap: ; src mova m0, [srcq] pxor m2, m2 punpckhbw m1, m0, m2 punpcklbw m0, m2 ; m0-1: src as word ; scaling[src] %if ARCH_X86_32 vpgatherdw m4, m0, scalingq, r0, r5, m3 vpgatherdw m5, m1, scalingq, r0, r5, m3 %else vpgatherdw m4, m0, scalingq, r12, r13, m3 vpgatherdw m5, m1, scalingq, r12, r13, m3 %endif pcmpeqw m3, m3 psrlw m3, 8 pand m4, m3 pand m5, m3 ; grain = grain_lut[offy+y][offx+x] movu m3, [grain_lutq+offxyq] %if ARCH_X86_32 mov r5, [rsp+6*mmsize+0*gprsize] movd m7, [grain_lutq+r5] %else movd m7, [grain_lutq+left_offxyq] %endif punpcklbw m7, m3 pmaddubsw m6, m15, m7 pmulhrsw m6, m14 
packsswb m6, m6 pand m6, m10 pandn m7, m10, m3 por m6, m7 pcmpgtb m2, m6 punpcklbw m7, m6, m2 punpckhbw m6, m2 ; noise = round2(scaling[src] * grain, scaling_shift) pmullw m7, m4 pmullw m6, m5 pmulhrsw m7, m11 pmulhrsw m6, m11 ; dst = clip_pixel(src, noise) paddw m0, m7 paddw m1, m6 pmaxsw m0, m13 pmaxsw m1, m13 pminsw m0, m12 pminsw m1, m12 packuswb m0, m1 movifnidn dstq, dstmp mova [dstq+srcq], m0 add srcq, r2mp add grain_lutq, 82 dec hd jg .loop_y_h_overlap %if ARCH_X86_32 add r4mp, 16 %else add wq, 16 %endif jge .end %if ARCH_X86_32 mov srcq, r1m add srcq, r4m %else lea srcq, [src_bakq+wq] %endif xor dword r8m, 4 add offxyd, 16 ; since this half-block had left-overlap, the next does not test dword r8m, 2 ; have_top_overlap jz .loop_x_odd %if ARCH_X86_32 add dword [rsp+6*mmsize+1*gprsize], 16 %else add r11d, 16 ; top_offxyd %endif jmp .loop_x_odd_v_overlap .end: RET .vertical_overlap: %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap %else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap %endif or overlapd, 2 ; top_overlap: overlap & 2 mov r8m, overlapd movzx sbyd, sbyb %if ARCH_X86_32 imul r4, [fg_dataq+FGData.seed], 0x00010001 DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused %else imul seed, [fg_dataq+FGData.seed], 0x00010001 %endif imul tmpd, sbyd, 173 * 0x00010001 imul sbyd, 37 * 0x01000100 add tmpd, (105 << 16) | 188 add sbyd, (178 << 24) | (141 << 8) and tmpd, 0x00ff00ff and sbyd, 0xff00ff00 xor seed, tmpd %if ARCH_X86_32 xor sbyd, seed ; (cur_seed << 16) | top_seed DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak mov r3m, seed mov wq, r4m %else xor seed, sbyd ; (cur_seed << 16) | top_seed DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ tmp, unused2, see, unused3 %endif lea src_bakq, [srcq+wq] neg wq sub dstmp, srcq %if ARCH_X86_32 mov r1m, src_bakq mov r4m, wq DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 %endif .loop_x_v_overlap: %if ARCH_X86_32 mov 
seed, r3m %endif ; we assume from the block above that bits 8-15 of tmpd are zero'ed, ; because of the 'and tmpd, 0x00ff00ff' above mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh setp tmpb ; parity of top_seed shr seed, 16 shl tmpd, 16 test seeb, seeh setp tmpb ; parity of cur_seed or r6d, 0x00010001 xor tmpd, r6d mov seed, tmpd ror seed, 1 ; updated (cur_seed << 16) | top_seed %if ARCH_X86_32 mov r3m, seed DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx mov offxd, offyd %else DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ offx, offy, see, unused, top_offxy mov offyd, seed mov offxd, seed %endif ror offyd, 8 ror offxd, 12 and offyd, 0xf000f and offxd, 0xf000f imul offyd, 164 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyq, [offyq+offxq*2+0x10001*747+32*82] %if ARCH_X86_32 DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut %else DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ h, offxy, see, unused, top_offxy %endif movzx top_offxyd, offxyw %if ARCH_X86_32 mov [rsp+6*mmsize+1*gprsize], top_offxyd DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut %endif shr offxyd, 16 .loop_x_odd_v_overlap: %if ARCH_X86_32 mov r5, r5m lea r5, [base+pb_27_17] mov [rsp+5*mmsize+8], r5 %else mova m8, [pb_27_17] %endif mov hd, r7m mov grain_lutq, grain_lutmp .loop_y_v_overlap: ; src mova m0, [srcq] pxor m2, m2 punpckhbw m1, m0, m2 punpcklbw m0, m2 ; m0-1: src as word ; scaling[src] %if ARCH_X86_32 vpgatherdw m4, m0, scalingq, r0, r5, m3 vpgatherdw m5, m1, scalingq, r0, r5, m3 %else vpgatherdw m4, m0, scalingq, r12, r13, m3 vpgatherdw m5, m1, scalingq, r12, r13, m3 %endif pcmpeqw m3, m3 psrlw m3, 8 pand m4, m3 pand m5, m3 ; grain = grain_lut[offy+y][offx+x] movu m3, [grain_lutq+offxyq] %if ARCH_X86_32 mov r5, [rsp+6*mmsize+1*gprsize] movu m7, [grain_lutq+r5] %else movu m7, [grain_lutq+top_offxyq] %endif punpckhbw m6, m7, m3 punpcklbw m7, m3 %if ARCH_X86_32 mov r5, [rsp+5*mmsize+8] pmaddubsw m3, [r5], 
m6 pmaddubsw m6, [r5], m7 %else pmaddubsw m3, m8, m6 pmaddubsw m6, m8, m7 %endif pmulhrsw m3, m14 pmulhrsw m6, m14 packsswb m6, m3 pcmpgtb m7, m2, m6 punpcklbw m2, m6, m7 punpckhbw m6, m7 ; noise = round2(scaling[src] * grain, scaling_shift) pmullw m2, m4 pmullw m6, m5 pmulhrsw m2, m11 pmulhrsw m6, m11 ; dst = clip_pixel(src, noise) paddw m0, m2 paddw m1, m6 pmaxsw m0, m13 pmaxsw m1, m13 pminsw m0, m12 pminsw m1, m12 packuswb m0, m1 movifnidn dstq, dstmp mova [dstq+srcq], m0 %if ARCH_X86_32 add dword [rsp+5*mmsize+8], mmsize %else mova m8, [pb_17_27] %endif add srcq, r2mp add grain_lutq, 82 dec hw jz .end_y_v_overlap ; 2 lines get vertical overlap, then fall back to non-overlap code for ; remaining (up to) 30 lines btc hd, 16 jnc .loop_y_v_overlap jmp .loop_y .end_y_v_overlap: %if ARCH_X86_32 add r4mp, 16 %else add wq, 16 %endif jge .end_hv %if ARCH_X86_32 mov srcq, r1mp add srcq, r4mp %else lea srcq, [src_bakq+wq] %endif btc dword r8m, 2 jc .loop_x_hv_overlap add offxyd, 16 %if ARCH_X86_32 add dword [rsp+6*mmsize+1*gprsize], 16 %else add top_offxyd, 16 %endif jmp .loop_x_odd_v_overlap .loop_x_hv_overlap: %if ARCH_X86_32 mov r5, r5m lea r5, [base+pb_27_17] mov [rsp+5*mmsize+8], r5 DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, src_bak mov r5, [rsp+6*mmsize+1*gprsize] mov r4, offxyd add r5, 16 add r4, 16 mov [rsp+6*mmsize+2*gprsize], r5 ; topleft_offxy mov [rsp+6*mmsize+0*gprsize], r4 ; left_offxy DEFINE_ARGS tmp, src, scaling, see, w, picptr, src_bak xor tmpd, tmpd mov seed, r3m %else mova m8, [pb_27_17] DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ tmp, unused2, see, unused3 ; we assume from the block above that bits 8-15 of tmpd are zero'ed %endif mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh setp tmpb ; parity of top_seed shr seed, 16 shl tmpd, 16 test seeb, seeh setp tmpb ; parity of cur_seed or r6d, 0x00010001 xor tmpd, r6d mov seed, tmpd ror seed, 1 ; updated (cur_seed << 16) | top_seed %if ARCH_X86_32 mov r3m, seed DEFINE_ARGS dst, 
src, scaling, offy, unused1, unused2, offx mov offxd, offyd %else DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ offx, offy, see, left_offxy, top_offxy, topleft_offxy lea topleft_offxyq, [top_offxyq+16] lea left_offxyq, [offyq+16] mov offyd, seed mov offxd, seed %endif ror offyd, 8 ror offxd, 12 and offyd, 0xf000f and offxd, 0xf000f imul offyd, 164 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyq, [offyq+offxq*2+0x10001*747+32*82] %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut movzx r5, offxyw ; top_offxy mov [rsp+6*mmsize+1*gprsize], r5 %else DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ h, offxy, see, left_offxy, top_offxy, topleft_offxy movzx top_offxyd, offxyw %endif shr offxyd, 16 mov hd, r7m mov grain_lutq, grain_lutmp .loop_y_hv_overlap: ; grain = grain_lut[offy+y][offx+x] movu m3, [grain_lutq+offxyq] %if ARCH_X86_32 mov r5, [rsp+6*mmsize+1*gprsize] ; top_offxy mov r0, [rsp+6*mmsize+0*gprsize] ; left_offxy movu m6, [grain_lutq+r5] mov r5, [rsp+6*mmsize+2*gprsize] ; topleft_offxy movd m4, [grain_lutq+r0] movd m7, [grain_lutq+r5] %else movu m6, [grain_lutq+top_offxyq] movd m4, [grain_lutq+left_offxyq] movd m7, [grain_lutq+topleft_offxyq] %endif ; do h interpolation first (so top | top/left -> top, left | cur -> cur) punpcklbw m4, m3 punpcklbw m7, m6 pmaddubsw m2, m15, m4 pmaddubsw m4, m15, m7 pmulhrsw m2, m14 pmulhrsw m4, m14 packsswb m2, m2 packsswb m4, m4 pand m2, m10 pand m4, m10 pandn m7, m10, m3 pandn m3, m10, m6 por m7, m2 por m3, m4 ; followed by v interpolation (top | cur -> cur) punpckhbw m4, m3, m7 punpcklbw m3, m7 %if ARCH_X86_32 mov r5, [rsp+5*mmsize+8] pmaddubsw m7, [r5], m4 pmaddubsw m4, [r5], m3 %else pmaddubsw m7, m8, m4 pmaddubsw m4, m8, m3 %endif pmulhrsw m7, m14 pmulhrsw m4, m14 packsswb m4, m7 pxor m2, m2 pcmpgtb m7, m2, m4 punpcklbw m3, m4, m7 punpckhbw m4, m7 ; src mova m0, [srcq] punpckhbw m1, m0, m2 punpcklbw m0, m2 ; m0-1: src as word ; scaling[src] %if 
ARCH_X86_32 vpgatherdw m5, m0, scalingq, r0, r5, m7 vpgatherdw m6, m1, scalingq, r0, r5, m7 %else vpgatherdw m5, m0, scalingq, r13, r14, m7 vpgatherdw m6, m1, scalingq, r13, r14, m7 %endif pcmpeqw m7, m7 psrlw m7, 8 pand m5, m7 pand m6, m7 ; noise = round2(scaling[src] * grain, scaling_shift) pmullw m3, m5 pmullw m4, m6 pmulhrsw m3, m11 pmulhrsw m4, m11 ; dst = clip_pixel(src, noise) paddw m0, m3 paddw m1, m4 pmaxsw m0, m13 pmaxsw m1, m13 pminsw m0, m12 pminsw m1, m12 packuswb m0, m1 movifnidn dstq, dstmp mova [dstq+srcq], m0 %if ARCH_X86_32 add dword [rsp+5*mmsize+8], mmsize %else mova m8, [pb_17_27] %endif add srcq, r2mp add grain_lutq, 82 dec hw jz .end_y_hv_overlap ; 2 lines get vertical overlap, then fall back to non-overlap code for ; remaining (up to) 30 lines btc hd, 16 jnc .loop_y_hv_overlap jmp .loop_y_h_overlap .end_y_hv_overlap: %if ARCH_X86_32 add r4mp, 16 %else add wq, 16 %endif jge .end_hv %if ARCH_X86_32 mov srcq, r1m add srcq, r4m %else lea srcq, [src_bakq+wq] %endif xor dword r8m, 4 add offxyd, 16 %if ARCH_X86_32 add dword [rsp+6*mmsize+1*gprsize], 16 %else add top_offxyd, 16 %endif jmp .loop_x_odd_v_overlap .end_hv: RET

;------------------------------------------------------------------------------
; FGUV_FN name, ss_hor, ss_ver
;
; Emits fguv_32x32xn_i<name> (SSSE3), the chroma film-grain kernel. Work is
; done in 16-pixel-wide columns; for every row the body computes (see the
; step comments inside the loops):
;     noise = round2(scaling[luma_src] * grain, scaling_shift)
;     dst   = clip_pixel(src, noise)
; %2/%3 (ss_hor/ss_ver) are the horizontal/vertical subsampling flags; the
; macro is instantiated at the bottom as 420 (1, 1), 422 (1, 0), 444 (0, 0).
;
; Constants parked with SCRATCH (on x86-32 these are stack slots, on x86-64
; they are registers):
;   m10 = byte mask with the low (2 - ss_hor) bytes set; selects the pixels
;         that receive the left-overlap blend
;   m11 = mul_bits[scaling_shift - 7] broadcast to every word
;   m12 = upper clip bound (table 'max'), m13 = lower clip bound (table 'min')
;   m14 = { uv_luma_mult, uv_mult } byte pairs, m15 = uv_offset
;         (only set up in the %1 != 0 expansion of the loop macro)
;   m8  = pw_1024, m9 = overlap blend weights (only when overlap_flag != 0)
;------------------------------------------------------------------------------
%macro FGUV_FN 3 ; name, ss_hor, ss_ver
INIT_XMM ssse3
%if ARCH_X86_32
; fguv_32x32xn_i420_ssse3(dst, src, stride, fg_data, w, scaling, grain_lut, h,
;                         sby, luma, lstride, uv_pl, is_id)
%if STACK_ALIGNMENT < mmsize
DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8
cglobal fguv_32x32xn_i%1, 0, 7, 8, 0 - (8 * mmsize + (13 + 3) * gprsize), \
                tmp, src, scaling, h, fg_data, picptr, unused
    ; copy the incoming stack arguments into this function's own stack area;
    ; the r*m names are re-pointed at these copies by the %defines below.
    ; Slots 4/6/8 (r1m/r3m/r5m) are not filled here; r12m (is_id) is only
    ; kept in r3, which is tested further down.
    mov         r0, r0m
    mov         r1, r2m
    mov         r2, r4m
    mov         r3, r6m
    mov         r4, r7m
    mov         [rsp+8*mmsize+3*gprsize], r0
    mov         [rsp+8*mmsize+5*gprsize], r1
    mov         [rsp+8*mmsize+7*gprsize], r2
    mov         [rsp+8*mmsize+9*gprsize], r3
    mov         [rsp+8*mmsize+10*gprsize], r4

    mov         r0, r8m
    mov         r1, r9m
    mov         r2, r10m
    mov         r4, r11m
    mov         r3, r12m
    mov         [rsp+8*mmsize+11*gprsize], r0
    mov         [rsp+8*mmsize+12*gprsize], r1
    mov         [rsp+8*mmsize+13*gprsize], r2
    mov         [rsp+8*mmsize+14*gprsize], r4
%else
cglobal fguv_32x32xn_i%1, 0, 7, 8, 8 * mmsize + (4) * gprsize, \
                tmp, src, scaling, h, fg_data, picptr, unused
%endif
    mov         srcq, srcm
    mov         fg_dataq, r3m
    mov         scalingq, r5m
%if STACK_ALIGNMENT < mmsize
%define r0m [rsp+8*mmsize+ 3*gprsize]
%define r1m [rsp+8*mmsize+ 4*gprsize]
%define r2m [rsp+8*mmsize+ 5*gprsize]
%define r3m [rsp+8*mmsize+ 6*gprsize]
%define r4m [rsp+8*mmsize+ 7*gprsize]
%define r5m [rsp+8*mmsize+ 8*gprsize]
%define r6m [rsp+8*mmsize+ 9*gprsize]
%define r7m [rsp+8*mmsize+10*gprsize]
%define r8m [rsp+8*mmsize+11*gprsize]
%define r9m [rsp+8*mmsize+12*gprsize]
%define r10m [rsp+8*mmsize+13*gprsize]
%define r11m [rsp+8*mmsize+14*gprsize]
%define r12m [rsp+8*mmsize+15*gprsize]
%endif
    LEA         r5, pb_mask
%define base r5-pb_mask
    ; keep the constant base pointer in the r5m slot so it can be reloaded
    ; with 'mov r5, r5m' after r5 has been used for something else
    mov         r5m, r5
%else
cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
                grain_lut, tmp, sby, luma, lstride, uv_pl, is_id
    lea         r8, [pb_mask]
%define base r8-pb_mask
%endif
    mov         r6d, [fg_dataq+FGData.scaling_shift]
    pcmpeqw     m2, m2                  ; all ones, turned into the m10 mask below
    movd        m3, [base+mul_bits+r6*2-14] ; mul_bits[scaling_shift - 7]
    mov         r6d, [fg_dataq+FGData.clip_to_restricted_range]
    lea         tmpd, [r6d*2]
    ; the flags set here are consumed by cmovne below; the movd in between
    ; does not modify them
%if ARCH_X86_32 && STACK_ALIGNMENT < mmsize
    test        r3, r3
%else
    cmp         dword r12m, 0 ; is_idm
%endif
    movd        m5, [base+min+r6*2]     ; min[clip_to_restricted_range]
    cmovne      r6d, tmpd               ; is_id != 0: use index 2 * clip instead
    movd        m4, [base+max+r6*2]
    psrldq      m2, 14+%2               ; leaves only the low (2 - ss_hor) bytes set
    punpcklwd   m3, m3
    punpcklwd   m5, m5
    punpcklwd   m4, m4
    pshufd      m3, m3, q0000
    pshufd      m5, m5, q0000
    pshufd      m4, m4, q0000
    SCRATCH     2, 10, 0
    SCRATCH     3, 11, 1
    SCRATCH     4, 12, 2
    SCRATCH     5, 13, 3

    ; the loop macro is expanded twice below: first with %1 = 1 (reached by
    ; falling through), then at .csfl with %1 = 0
    cmp         byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
    jne .csfl

%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
%endif

%if %1
    ; r11m is the uv_pl argument; it indexes the per-plane multiplier/offset
    ; arrays in FGData
    mov         r6d, dword r11m
    movd        m0, [fg_dataq+FGData.uv_mult+r6*4]
    movd        m1, [fg_dataq+FGData.uv_luma_mult+r6*4]
    punpcklbw   m6, m1, m0              ; { uv_luma_mult, uv_mult } byte pair
    movd        m7, [fg_dataq+FGData.uv_offset+r6*4]
    punpcklwd   m6, m6
    punpcklwd   m7, m7
    pshufd      m6, m6, q0000
    pshufd      m7, m7, q0000
    SCRATCH     6, 14, 4
    SCRATCH     7, 15, 5
%endif

    mov         sbyd, r8m
    mov         overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1
    test        overlapd, overlapd
    jz %%no_vertical_overlap
    ; overlap enabled: load the blend weights (m9) and the pw_1024 rounding
    ; multiplier (m8); pmulhrsw by 1024 is a rounded shift right by 5
%if ARCH_X86_32
%if %2
    movd        m1, [base+pb_23_22]
%else
    movd        m1, [base+pb_27_17_17_27]
%endif
    mova        m0, [base+pw_1024]
%else
%if %2
    movd        m1, [pb_23_22]
%else
    movd        m1, [pb_27_17_17_27]
%endif
    mova        m0, [pw_1024]
%endif
    pshufd      m1, m1, q0000
    SCRATCH     0, 8, 6
    SCRATCH     1, 9, 7
    test        sbyd, sbyd
    jnz %%vertical_overlap
    ; fall-through

%%no_vertical_overlap:
    ; from here on the r8m slot (originally the sby argument) holds the
    ; overlap flags
    mov         r8m, overlapd
%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap
    imul        seed, (173 << 24) | 37
%else
    imul        seed, sbyd, (173 << 24) | 37
%endif
    add         seed, (105 << 24) | 178
    rol         seed, 8
    movzx       seed, seew
    xor         seed, [fg_dataq+FGData.seed]
%if ARCH_X86_32
    mov         r3m, seed

    DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak
%define luma_bakq lumaq

    mov         wq, r4m
%if %3
    shl         r10mp, 1                ; double the stored luma stride when ss_ver
%endif
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                unused2, unused3, see, overlap, unused4, src_bak, lstride, luma_bak

    mov         lstrideq, r10mp
%endif

    ; src_bak/luma_bak point at the end of the row and w becomes a negative
    ; counter that runs up to zero in steps of 16; the dst slot is turned
    ; into (dst - src) so that [dstq+srcq] addresses the destination row
    mov         lumaq, r9mp
    lea         src_bakq, [srcq+wq]
    lea         luma_bakq, [lumaq+wq*(1+%2)]
    neg         wq
    sub         r0mp, srcq
%if ARCH_X86_32
    mov         r1m, src_bakq
    mov         r11m, luma_bakq
    mov         r4m, wq

    DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
%else
    ; src_bak and stride are kept in stack slots and read back from there
    ; (r11mp/r12mp) inside the loops
    mov         r11mp, src_bakq
    mov         r12mp, strideq
%endif

%%loop_x:
%if ARCH_X86_32
    mov         seed, r3m
%endif
    ; 16-bit LFSR step: after forcing every bit of 0xEFF4 to 1, the parity
    ; flag of (low byte & high byte) reflects bit0 ^ bit1 ^ bit3 ^ bit12 of
    ; the old seed; that bit becomes bit 15 of (seed >> 1)
    mov         r6d, seed
    or          seed, 0xEFF4
    shr         r6d, 1
    test        seeb, seeh
    lea         seed, [r6+0x8000]
    cmovp       seed, r6d ; updated seed
%if ARCH_X86_32
    mov         r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx

    mov         offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, overlap, unused1, unused2, lstride

    mov         offyd, seed
    mov         offxd, seed
%endif
    ; offy = (seed >> 8) & 15, offx = seed >> 12
    ror         offyd, 8
    shr         offxd, 12
    and         offyd, 0xf
    imul        offyd, 164>>%3
    lea         offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx

%if ARCH_X86_32
    DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, overlap, unused1, unused2, lstride, luma_bak
%endif

%%loop_x_odd:
    mov         hd, r7m
    mov         grain_lutq, grain_lutmp
%%loop_y:
    ; src
%if ARCH_X86_32
    mov         lumaq, r9mp
%endif
%if %2
    ; horizontally subsampled chroma: 32 luma bytes are reduced to 16 words
    ; by summing adjacent byte pairs (pmaddubsw with pb_1) and halving with
    ; rounding (pavgw against zero)
    mova        m4, [lumaq+ 0]
    mova        m6, [lumaq+16]
    mova        m0, [srcq]
%if ARCH_X86_32
    add         lumaq, r10mp
    mov         r9mp, lumaq
    mov         r5, r5m
    movd        m7, [base+pb_1]
%else
    movd        m7, [pb_1]
%endif
    pshufd      m7, m7, q0000
    pxor        m2, m2
    pmaddubsw   m4, m7
    pmaddubsw   m6, m7
    pavgw       m4, m2
    pavgw       m6, m2
%else
    mova        m4, [lumaq]
    mova        m0, [srcq]
%if ARCH_X86_32
    add         lumaq, r10mp
    mov         r9mp, lumaq
%endif
    pxor        m2, m2
%endif

%if %1
    ; not-csfl: the scaling index is
    ;   clip(((luma * uv_luma_mult + chroma * uv_mult) >> 6) + uv_offset)
%if %2
    packuswb    m4, m6                  ; luma
%endif
    punpckhbw   m6, m4, m0
    punpcklbw   m4, m0                  ; { luma, chroma }
    pmaddubsw   m6, m14
    pmaddubsw   m4, m14
    psraw       m6, 6
    psraw       m4, 6
    paddw       m6, m15
    paddw       m4, m15
    packuswb    m4, m6                  ; pack+unpack = clip
    punpckhbw   m6, m4, m2
    punpcklbw   m4, m2
%elif %2 == 0
    punpckhbw   m6, m4, m2
    punpcklbw   m4, m2
%endif

    ; scaling[luma_src]
    ; NOTE(review): the two GPR operands are presumably scratch registers
    ; clobbered by vpgatherdw (its definition is outside this section)
%if ARCH_X86_32
    vpgatherdw  m7, m4, scalingq, r0, r5
    vpgatherdw  m5, m6, scalingq, r0, r5
%else
    vpgatherdw  m7, m4, scalingq, r12, r2
    vpgatherdw  m5, m6, scalingq, r12, r2
%endif
    ; keep only the low byte of every gathered word
    pcmpeqw     m1, m1
    psrlw       m1, 8
    pand        m7, m1
    pand        m5, m1

    ; unpack chroma_source
    punpckhbw   m1, m0, m2
    punpcklbw   m0, m2                  ; m0-1: src as word

    ; grain = grain_lut[offy+y][offx+x]
    movu        m3, [grain_lutq+offxyq+ 0]
    pcmpgtb     m6, m2, m3              ; sign mask (m2 is zero here)
    punpcklbw   m2, m3, m6              ; sign-extend grain bytes to words
    punpckhbw   m3, m6

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmullw      m2, m7
    pmullw      m3, m5
    pmulhrsw    m2, m11
    pmulhrsw    m3, m11

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif

    ; dst = clip_pixel(src, noise)
    paddw       m0, m2
    paddw       m1, m3
    pmaxsw      m0, m13
    pmaxsw      m1, m13
    pminsw      m0, m12
    pminsw      m1, m12
    packuswb    m0, m1
    movifnidn   dstq, dstmp
    mova        [dstq+srcq], m0

%if ARCH_X86_32
    add         srcq, r2mp
    ; we already incremented lumaq above
%else
    add         srcq, r12mp
%if %3
    lea         lumaq, [lumaq+lstrideq*2]
%else
    add         lumaq, lstrideq
%endif
%endif
    add         grain_lutq, 82          ; next grain_lut row
    dec         hw
    jg %%loop_y

    ; advance to the next 16-pixel column
%if ARCH_X86_32
    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut

    mov         wq, r4m
%endif
    add         wq, 16
    jge %%end
%if ARCH_X86_32
    mov         srcq, r1mp
    mov         lumaq, r11mp
%else
    mov         srcq, r11mp
%endif
    lea         lumaq, [luma_bakq+wq*(1+%2)]
    add         srcq, wq
%if ARCH_X86_32
    mov         r4m, wq
    mov         r9m, lumaq
%endif
%if %2 == 0
    ; adjust top_offxy
%if ARCH_X86_32
    add         dword [rsp+8*mmsize+1*gprsize], 16
%else
    add         r11d, 16
%endif
    add         offxyd, 16
    ; bit 2 of the flags word in r8m is toggled once per 16-pixel column;
    ; btc leaves its previous value in CF
    btc         dword r8m, 2
    jc %%loop_x_even
    test        dword r8m, 2
    jz %%loop_x_odd
    jmp %%loop_x_odd_v_overlap
%%loop_x_even:
%endif
    test        dword r8m, 1            ; left overlap enabled?
    jz %%loop_x

    ; r8m is the sby argument slot, which now holds the overlap flags
    ; (bit 1 = top overlap)
    test        dword r8m, 2
    jne %%loop_x_hv_overlap

    ; horizontal overlap (without vertical overlap)
%%loop_x_h_overlap:
%if ARCH_X86_32
%if %2
    lea         r6, [offxyd+16]
    mov         [rsp+8*mmsize+0*gprsize], r6
%else
    mov         [rsp+8*mmsize+0*gprsize], offxyd
%endif

    DEFINE_ARGS luma, src, scaling, see, w, picptr, grain_lut

    mov         seed, r3m
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, unused1, unused2, lstride

%if %2
    lea         left_offxyd, [offyd+16] ; previous column's offy*stride+offx
%else
    mov         left_offxyd, offyd
%endif
%endif
    ; same seed update as at %%loop_x
    mov         r6d, seed
    or          seed, 0xEFF4
    shr         r6d, 1
    test        seeb, seeh
    lea         seed, [r6+0x8000]
    cmovp       seed, r6d ; updated seed

%if ARCH_X86_32
    mov         r3m, seed

    DEFINE_ARGS luma, src, scaling, offy, w, picptr, offx

    mov         offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, unused1, unused2, lstride

    mov         offyd, seed
    mov         offxd, seed
%endif
    ror         offyd, 8
    shr         offxd, 12
    and         offyd, 0xf
    imul        offyd, 164>>%3
    lea         offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx

%if ARCH_X86_32
    DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, left_offxy, unused1, unused2, lstride, luma_bak
%endif

    mov         hd, r7m
    mov         grain_lutq, grain_lutmp
%%loop_y_h_overlap:
    ; src
%if ARCH_X86_32
    mov         lumaq, r9mp
%endif
%if %2
    mova        m4, [lumaq+ 0]
    mova        m6, [lumaq+16]
    mova        m0, [srcq]
%if ARCH_X86_32
    add         lumaq, r10mp
    mov         r9mp, lumaq
    mov         r5, r5m
    movd        m7, [base+pb_1]
%else
    movd        m7, [pb_1]
%endif
    pshufd      m7, m7, q0000
    pxor        m2, m2
    pmaddubsw   m4, m7
    pmaddubsw   m6, m7
    pavgw       m4, m2
    pavgw       m6, m2
%else
    mova        m4, [lumaq]
    mova        m0, [srcq]
%if ARCH_X86_32
    add         lumaq, r10mp
    mov         r9mp, lumaq
%endif
    pxor        m2, m2
%endif

%if %1
%if %2
    packuswb    m4, m6                  ; luma
%endif
    punpckhbw   m6, m4, m0
    punpcklbw   m4, m0                  ; { luma, chroma }
    pmaddubsw   m6, m14
    pmaddubsw   m4, m14
    psraw       m6, 6
    psraw       m4, 6
    paddw       m6, m15
    paddw       m4, m15
    packuswb    m4, m6                  ; pack+unpack = clip
    punpckhbw   m6, m4, m2
    punpcklbw   m4, m2
%elif %2 == 0
    punpckhbw   m6, m4, m2
    punpcklbw   m4, m2
%endif

    ; scaling[luma_src]
%if ARCH_X86_32
    vpgatherdw  m7, m4, scalingq, r0, r5
    vpgatherdw  m5, m6, scalingq, r0, r5
%else
    vpgatherdw  m7, m4, scalingq, r12, r2
    vpgatherdw  m5, m6, scalingq, r12, r2
%endif
    pcmpeqw     m1, m1
    psrlw       m1, 8
    pand        m7, m1
    pand        m5, m1

    ; unpack chroma_source
    punpckhbw   m1, m0, m2
    punpcklbw   m0, m2                  ; m0-1: src as word

    ; grain = grain_lut[offy+y][offx+x]
    movu        m3, [grain_lutq+offxyq+ 0]
%if ARCH_X86_32
    mov         r0, [rsp+8*mmsize+0*gprsize]
    movd        m4, [grain_lutq+r0+ 0]
%else
    movd        m4, [grain_lutq+left_offxyq+ 0]
%endif
    ; blend the leftmost pixels with the previous column's grain:
    ; weighted sum of { left, cur } with the m9 byte weights, rounded by the
    ; pw_1024 multiply, then merged into the current grain under the m10 mask
    punpcklbw   m2, m4, m3
    pmaddubsw   m4, m9, m2
    pmulhrsw    m4, m8
    packsswb    m4, m4
    pand        m4, m10
    pandn       m2, m10, m3
    por         m3, m4, m2
    pxor        m4, m4
    pcmpgtb     m4, m3
    punpcklbw   m2, m3, m4
    punpckhbw   m3, m4

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmullw      m2, m7
    pmullw      m3, m5
    pmulhrsw    m2, m11
    pmulhrsw    m3, m11

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif

    ; dst = clip_pixel(src, noise)
    paddw       m0, m2
    paddw       m1, m3
    pmaxsw      m0, m13
    pmaxsw      m1, m13
    pminsw      m0, m12
    pminsw      m1, m12
    packuswb    m0, m1
    movifnidn   dstq, dstmp
    mova        [dstq+srcq], m0

%if ARCH_X86_32
    add         srcq, r2mp
    ; lumaq has already been incremented above
%else
    add         srcq, r12mp
%if %3
    lea         lumaq, [lumaq+lstrideq*2]
%else
    add         lumaq, lstrideq
%endif
%endif
    add         grain_lutq, 82
    dec         hw
    jg %%loop_y_h_overlap

%if ARCH_X86_32
    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut

    mov         wq, r4m
%endif
    add         wq, 16
    jge %%end
%if ARCH_X86_32
    mov         srcq, r1mp
    mov         lumaq, r11mp
%else
    mov         srcq, r11mp
%endif
    lea         lumaq, [luma_bakq+wq*(1+%2)]
    add         srcq, wq
%if ARCH_X86_32
    mov         r4m, wq
    mov         r9m, lumaq
%endif
%if %2 == 0
    xor         dword r8m, 4
    ; adjust top_offxyd
%if ARCH_X86_32
    add         dword [rsp+8*mmsize+1*gprsize], 16
%else
    add         r11d, 16
%endif
    add         offxyd, 16
%endif

    ; r8m is the sby argument slot, which now holds the overlap flags
    ; (bit 1 = top overlap)
    test        dword r8m, 2
%if %2
    jne %%loop_x_hv_overlap
    jmp %%loop_x_h_overlap
%else
    jne %%loop_x_odd_v_overlap
    jmp %%loop_x_odd
%endif

%%end:
    RET

%%vertical_overlap:
%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
%else
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap
%endif

    or          overlapd, 2 ; top_overlap: overlap & 2
    mov         r8m, overlapd           ; the sby slot now holds the overlap flags
    movzx       sbyd, sbyb
    ; two 16-bit seeds are derived and carried together in one 32-bit
    ; register, packed as noted on the final xor below
%if ARCH_X86_32
    imul        r4, [fg_dataq+FGData.seed], 0x00010001
    DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused
%else
    imul        seed, [fg_dataq+FGData.seed], 0x00010001
%endif
    imul        tmpd, sbyd, 173 * 0x00010001
    imul        sbyd, 37 * 0x01000100
    add         tmpd, (105 << 16) | 188
    add         sbyd, (178 << 24) | (141 << 8)
    and         tmpd, 0x00ff00ff
    and         sbyd, 0xff00ff00
    xor         seed, tmpd
%if ARCH_X86_32
    xor         sbyd, seed ; (cur_seed << 16) | top_seed

    DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak

    mov         r3m, seed
    mov         wq, r4m
%if %3
    shl         r10mp, 1
%endif
%else
    xor         seed, sbyd ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                tmp, unused2, see, overlap, unused3, src_bak, lstride, luma_bak

    mov         lstrideq, r10mp
%endif

    mov         lumaq, r9mp
    lea         src_bakq, [srcq+wq]
    lea         luma_bakq, [lumaq+wq*(1+%2)]
    neg         wq
    sub         r0mp, srcq
%if ARCH_X86_32
    mov         r1m, src_bakq
    mov         r11m, luma_bakq
    mov         r4m, wq

    DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
%else
    mov         r11mp, src_bakq
    mov         r12mp, strideq
%endif

%%loop_x_v_overlap:
%if ARCH_X86_32
    mov         seed, r3m
    xor         tmpd, tmpd
%endif
    ; we assume from the block above that bits 8-15 of tmpd are zero'ed
    ; (both packed 16-bit seeds are stepped at once; the two setp results
    ; supply the new top bit of each half)
    mov         r6d, seed
    or          seed, 0xeff4eff4
    test        seeb, seeh
    setp        tmpb ; parity of top_seed
    shr         seed, 16
    shl         tmpd, 16
    test        seeb, seeh
    setp        tmpb ; parity of cur_seed
    or          r6d, 0x00010001
    xor         tmpd, r6d
    mov         seed, tmpd
    ror         seed, 1 ; updated (cur_seed << 16) | top_seed

%if ARCH_X86_32
    mov         r3m, seed

    DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx

    mov         offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, overlap, top_offxy, unused, lstride

    mov         offxd, seed
    mov         offyd, seed
%endif
    ror         offyd, 8
    ror         offxd, 12
    and         offyd, 0xf000f
    and         offxd, 0xf000f
    imul        offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea         offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

%if ARCH_X86_32
    DEFINE_ARGS tmp, src, scaling, offxy, h, picptr, top_offxy
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, overlap, top_offxy, unused, lstride, luma_bak
%endif

    ; split the packed pair: low half -> top_offxy, high half -> offxy
    movzx       top_offxyd, offxyw
    shr         offxyd, 16
%if ARCH_X86_32
    mov         [rsp+8*mmsize+1*gprsize], top_offxyd

    DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
%endif

%%loop_x_odd_v_overlap:
    mov         hd, r7m
    mov         grain_lutq, grain_lutmp
    ; m1 = vertical blend weights for the first overlapped row
    ; (only read when ss_ver == 0; with ss_ver the m9 weights are used)
%if ARCH_X86_32
    mov         r5, r5m
    mova        m1, [base+pb_27_17]
%else
    mova        m1, [pb_27_17]
%endif
%%loop_y_v_overlap:
%if ARCH_X86_32
    mov         lumaq, r9mp
%endif
%if %2
    mova        m4, [lumaq+ 0]
    mova        m6, [lumaq+16]
    mova        m0, [srcq]
%if ARCH_X86_32
    add         lumaq, r10mp
    mov         r9mp, lumaq
    mov         r5, r5m
    movd        m7, [base+pb_1]
%else
    movd        m7, [pb_1]
%endif
    pshufd      m7, m7, q0000
    pxor        m2, m2
    pmaddubsw   m4, m7
    pmaddubsw   m6, m7
    pavgw       m4, m2
    pavgw       m6, m2
%else
    mova        m4, [lumaq]
    mova        m0, [srcq]
%if ARCH_X86_32
    add         lumaq, r10mp
    mov         r9mp, lumaq
%endif
    pxor        m2, m2
%endif

%if %1
%if %2
    packuswb    m4, m6                  ; luma
%endif
    punpckhbw   m6, m4, m0
    punpcklbw   m4, m0                  ; { luma, chroma }
    pmaddubsw   m6, m14
    pmaddubsw   m4, m14
    psraw       m6, 6
    psraw       m4, 6
    paddw       m6, m15
    paddw       m4, m15
    packuswb    m4, m6                  ; pack+unpack = clip
    punpckhbw   m6, m4, m2
    punpcklbw   m4, m2
%elif %2 == 0
    punpckhbw   m6, m4, m2
    punpcklbw   m4, m2
%endif

    ; scaling[luma_src]
%if ARCH_X86_32
    vpgatherdw  m7, m4, scalingq, r0, r5
    vpgatherdw  m5, m6, scalingq, r0, r5
%else
    vpgatherdw  m7, m4, scalingq, r12, r2
    vpgatherdw  m5, m6, scalingq, r12, r2
%endif
    pcmpeqw     m4, m4
    psrlw       m4, 8
    pand        m7, m4
    pand        m5, m4

    ; grain = grain_lut[offy+y][offx+x]
    movu        m3, [grain_lutq+offxyq]
%if ARCH_X86_32
    mov         r0, [rsp+8*mmsize+1*gprsize]
    movu        m4, [grain_lutq+r0]
%else
    movu        m4, [grain_lutq+top_offxyq]
%endif
    ; vertical blend of { top, cur } grain byte pairs, rounded by the
    ; pw_1024 multiply
    punpckhbw   m6, m4, m3
    punpcklbw   m4, m3
%if %3
    pmaddubsw   m2, m9, m6
    pmaddubsw   m3, m9, m4
%else
    pmaddubsw   m2, m1, m6
    pmaddubsw   m3, m1, m4
%endif
    pmulhrsw    m2, m8
    pmulhrsw    m3, m8
    packsswb    m3, m2
    pxor        m6, m6
    pcmpgtb     m6, m3
    punpcklbw   m2, m3, m6
    punpckhbw   m3, m6

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    pmullw      m2, m7
    pmullw      m3, m5
    pmulhrsw    m2, m11
    pmulhrsw    m3, m11

    ; unpack chroma_source
    pxor        m4, m4
    punpckhbw   m6, m0, m4
    punpcklbw   m0, m4                  ; m0-1: src as word

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif

    ; dst = clip_pixel(src, noise)
    paddw       m0, m2
    paddw       m6, m3
    pmaxsw      m0, m13
    pmaxsw      m6, m13
    pminsw      m0, m12
    pminsw      m6, m12
    packuswb    m0, m6
    movifnidn   dstq, dstmp
    mova        [dstq+srcq], m0

    dec         hw
    je %%end_y_v_overlap
%if ARCH_X86_32
    add         srcq, r2mp
    ; lumaq has already been incremented above
%else
    add         srcq, r12mp
%if %3
    lea         lumaq, [lumaq+lstrideq*2]
%else
    add         lumaq, lstrideq
%endif
%endif
    add         grain_lutq, 82
%if %3 == 0
    ; bit 16 of h (untouched by 'dec hw') toggles between the two
    ; overlapped rows: the first btc yields CF=0 and loops once more with
    ; the second-row weights, the second yields CF=1 and falls through to
    ; the non-overlap row loop. The mov/mova in between leave CF intact.
    btc         hd, 16
%if ARCH_X86_32
    mov         r5, r5m
    mova        m1, [base+pb_17_27]
%else
    mova        m1, [pb_17_27]
%endif
    jnc %%loop_y_v_overlap
%endif
    jmp %%loop_y

%%end_y_v_overlap:
%if ARCH_X86_32
    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut

    mov         wq, r4m
%endif
    add         wq, 16
    jge %%end_hv
%if ARCH_X86_32
    mov         srcq, r1mp
    mov         lumaq, r11mp
%else
    mov         srcq, r11mp
%endif
    lea         lumaq, [luma_bakq+wq*(1+%2)]
    add         srcq, wq
%if ARCH_X86_32
    mov         r4m, wq
    mov         r9m, lumaq
%endif

%if %2
    ; since fg_dataq.overlap is guaranteed to be set, we never jump
    ; back to .loop_x_v_overlap, and instead always fall-through to
    ; h+v overlap
%else
%if ARCH_X86_32
    add         dword [rsp+8*mmsize+1*gprsize], 16
%else
    add         top_offxyd, 16
%endif
    add         offxyd, 16
    btc         dword r8m, 2
    jnc %%loop_x_odd_v_overlap
%endif

%%loop_x_hv_overlap:
    ; remember the previous column's offsets as left/topleft before new
    ; offsets are derived from the next seed
%if ARCH_X86_32
    DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, unused

    mov         r6, [rsp+8*mmsize+1*gprsize]
%if %2
    lea         r0, [r3d+16]
    add         r6, 16
    mov         [rsp+8*mmsize+0*gprsize], r0 ; left_offxy
%else
    mov         [rsp+8*mmsize+0*gprsize], r3 ; left_offxy
%endif
    mov         [rsp+8*mmsize+2*gprsize], r6 ; topleft_offxy

    DEFINE_ARGS tmp, src, scaling, see, w, picptr, unused

    mov         seed, r3m
    xor         tmpd, tmpd
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                tmp, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride

%if %2
    lea         topleft_offxyq, [top_offxyq+16]
    lea         left_offxyq, [offxyq+16]
%else
    mov         topleft_offxyq, top_offxyq
    mov         left_offxyq, offxyq
%endif

    ; we assume from the block above that bits 8-15 of tmpd are zero'ed
%endif
    mov         r6d, seed
    or          seed, 0xeff4eff4
    test        seeb, seeh
    setp        tmpb ; parity of top_seed
    shr         seed, 16
    shl         tmpd, 16
    test        seeb, seeh
    setp        tmpb ; parity of cur_seed
    or          r6d, 0x00010001
    xor         tmpd, r6d
    mov         seed, tmpd
    ror         seed, 1 ; updated (cur_seed << 16) | top_seed

%if ARCH_X86_32
    mov         r3m, seed

    DEFINE_ARGS tmp, src, scaling, offy, w, picptr, offx

    mov         offxd, offyd
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride

    mov         offxd, seed
    mov         offyd, seed
%endif
    ror         offyd, 8
    ror         offxd, 12
    and         offyd, 0xf000f
    and         offxd, 0xf000f
    imul        offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea         offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

%if ARCH_X86_32
    DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
%else
    DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride, luma_bak
%endif

    movzx       top_offxyd, offxyw
    shr         offxyd, 16
%if ARCH_X86_32
    mov         [rsp+8*mmsize+1*gprsize], top_offxyd
%endif

    mov         hd, r7m
    mov         grain_lutq, grain_lutmp
    ; m3 = vertical blend weights for the first overlapped row
    ; (only read when ss_ver == 0)
%if ARCH_X86_32
    mov         r5, r5m
    mova        m3, [base+pb_27_17]
%else
    mova        m3, [pb_27_17]
%endif
%%loop_y_hv_overlap:
    ; src
%if ARCH_X86_32
    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut

    mov         lumaq, r9mp
%endif
%if %2
    mova        m4, [lumaq+ 0]
    mova        m6, [lumaq+16]
    mova        m0, [srcq]
%if ARCH_X86_32
    add         lumaq, r10mp
    mov         r9mp, lumaq
    mov         r5, r5m
    movd        m7, [base+pb_1]
%else
    movd        m7, [pb_1]
%endif
    pshufd      m7, m7, q0000
    pxor        m2, m2
    pmaddubsw   m4, m7
    pmaddubsw   m6, m7
    pavgw       m4, m2
    pavgw       m6, m2
%else
    mova        m4, [lumaq]
    mova        m0, [srcq]
%if ARCH_X86_32
    add         lumaq, r10mp
    mov         r9mp, lumaq
%endif
    pxor        m2, m2
%endif

%if %1
%if %2
    packuswb    m4, m6                  ; luma
%endif
    punpckhbw   m6, m4, m0
    punpcklbw   m4, m0                  ; { luma, chroma }
    pmaddubsw   m6, m14
    pmaddubsw   m4, m14
    psraw       m6, 6
    psraw       m4, 6
    paddw       m6, m15
    paddw       m4, m15
    packuswb    m4, m6                  ; pack+unpack = clip
    punpckhbw   m6, m4, m2
    punpcklbw   m4, m2
%elif %2 == 0
    punpckhbw   m6, m4, m2
    punpcklbw   m4, m2
%endif

    ; scaling[src]
%if ARCH_X86_32
    vpgatherdw  m7, m4, scalingq, r0, r5
    vpgatherdw  m5, m6, scalingq, r0, r5
%else
    ; the top-left grain is fetched before the gathers.
    ; NOTE(review): presumably because the gather scratch registers alias
    ; topleft_offxy (ss_ver) or lstride (otherwise) under the current
    ; DEFINE_ARGS; that would explain why lstride is reloaded from r10mp
    ; after the loop in the ss_ver == 0 case - confirm against the
    ; vpgatherdw definition
    movd        m1, [grain_lutq+topleft_offxyq]
%if %3
    vpgatherdw  m7, m4, scalingq, r2, r12
    vpgatherdw  m5, m6, scalingq, r2, r12
%else
    vpgatherdw  m7, m4, scalingq, r2, r13
    vpgatherdw  m5, m6, scalingq, r2, r13
%endif
%endif
    pcmpeqw     m2, m2
    psrlw       m2, 8
    pand        m7, m2
    pand        m5, m2

    ; grain = grain_lut[offy+y][offx+x]
%if ARCH_X86_32
    mov         r0, [rsp+8*mmsize+2*gprsize] ; topleft_offxy
    mov         r5, [rsp+8*mmsize+1*gprsize] ; top_offxy
    movd        m1, [grain_lutq+r0]
    mov         r0, [rsp+8*mmsize+0*gprsize] ; left_offxy
%endif
    movu        m2, [grain_lutq+offxyq]
%if ARCH_X86_32
    movu        m6, [grain_lutq+r5]
    movd        m4, [grain_lutq+r0]
%else
    movu        m6, [grain_lutq+top_offxyq]
    movd        m4, [grain_lutq+left_offxyq]
%endif
    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    punpcklbw   m1, m6
    punpcklbw   m4, m2
%if %2
    punpcklwd   m4, m1
%else
    punpckldq   m4, m1
%endif
    pmaddubsw   m1, m9, m4
    pmulhrsw    m1, m8
    packsswb    m1, m1
    pandn       m4, m10, m2
    pandn       m2, m10, m6
    psrldq      m6, m1, 2-%2
    pand        m1, m10
    pand        m6, m10
    por         m4, m1
    por         m2, m6
    ; followed by v interpolation (top | cur -> cur)
    punpckhbw   m1, m2, m4
    punpcklbw   m2, m4
%if %3
    pmaddubsw   m4, m9, m1
    pmaddubsw   m1, m9, m2
%else
    pmaddubsw   m4, m3, m1
    pmaddubsw   m1, m3, m2
%endif
    pmulhrsw    m4, m8
    pmulhrsw    m1, m8
    packsswb    m1, m4
    pxor        m4, m4
    pcmpgtb     m4, m1
    punpcklbw   m2, m1, m4
    punpckhbw   m1, m4

    ; noise = round2(scaling[src] * grain, scaling_shift)
    pmullw      m2, m7
    pmullw      m1, m5
    pmulhrsw    m2, m11
    pmulhrsw    m1, m11

%if ARCH_X86_32
    DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif

    ; unpack chroma source
    pxor        m4, m4
    punpckhbw   m5, m0, m4
    punpcklbw   m0, m4                  ; m0-1: src as word

    ; dst = clip_pixel(src, noise)
    paddw       m0, m2
    paddw       m5, m1
    pmaxsw      m0, m13
    pmaxsw      m5, m13
    pminsw      m0, m12
    pminsw      m5, m12
    packuswb    m0, m5
    movifnidn   dstq, dstmp
    mova        [dstq+srcq], m0

%if ARCH_X86_32
    add         srcq, r2mp
    ; lumaq has been adjusted above already
%else
    add         srcq, r12mp
%if %3
    lea         lumaq, [lumaq+lstrideq*(1+%2)]
%else
    add         lumaq, r10mp
%endif
%endif
    add         grain_lutq, 82
    dec         hw
%if %3
    ; with ss_ver only one row gets the vertical blend; the remaining rows
    ; continue in the horizontal-overlap row loop
    jg %%loop_y_h_overlap
%else
    jle %%end_y_hv_overlap
%if ARCH_X86_32
    mov         r5, r5m
    mova        m3, [base+pb_17_27]
%else
    mova        m3, [pb_17_27]
%endif
    ; bit 16 of h toggles between the two overlapped rows (CF = old bit)
    btc         hd, 16
    jnc %%loop_y_hv_overlap
%if ARCH_X86_64
    mov         lstrideq, r10mp
%endif
    jmp %%loop_y_h_overlap
%%end_y_hv_overlap:
%if ARCH_X86_64
    mov         lstrideq, r10mp
%endif
%endif

%if ARCH_X86_32
    DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut

    mov         wq, r4m
%endif
    add         wq, 16
    jge %%end_hv
%if ARCH_X86_32
    mov         srcq, r1mp
    mov         lumaq, r11mp
%else
    mov         srcq, r11mp
%endif
    lea         lumaq, [luma_bakq+wq*(1+%2)]
    add         srcq, wq
%if ARCH_X86_32
    mov         r4m, wq
    mov         r9m, lumaq
%endif
%if %2
    jmp %%loop_x_hv_overlap
%else
%if ARCH_X86_32
    add         dword [rsp+8*mmsize+1*gprsize], 16
%else
    add         top_offxyd, 16
%endif
    add         offxyd, 16
    xor         dword r8m, 4
    jmp %%loop_x_odd_v_overlap
%endif

%%end_hv:
    RET
%endmacro

    ; first expansion (%1 = 1): reached when chroma_scaling_from_luma == 0
    %%FGUV_32x32xN_LOOP 1, %2, %3
.csfl:
    ; second expansion (%1 = 0): chroma_scaling_from_luma != 0
    %%FGUV_32x32xN_LOOP 0, %2, %3
%endmacro

; instantiate the kernel for each chroma layout: name, ss_hor, ss_ver
FGUV_FN 420, 1, 1

%if STACK_ALIGNMENT < mmsize
DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
%endif

FGUV_FN 422, 1, 0

%if STACK_ALIGNMENT < mmsize
DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
%endif

FGUV_FN 444, 0, 0