ref: 5462c2a80de6f7f6c0b1b0d20cbe571b09510a19
parent: 40891aab9bc2f60bdbef16d1a499f36944e1738d
author: Victorien Le Couviour--Tuffet <[email protected]>
date: Wed Dec 11 19:24:02 EST 2019
x86: add prep_bilin AVX512 asm

------------------------------------------
mct_bilinear_w4_0_8bpc_avx2: 3.8
mct_bilinear_w4_0_8bpc_avx512icl: 3.7
---------------------
mct_bilinear_w8_0_8bpc_avx2: 5.0
mct_bilinear_w8_0_8bpc_avx512icl: 4.8
---------------------
mct_bilinear_w16_0_8bpc_avx2: 8.5
mct_bilinear_w16_0_8bpc_avx512icl: 7.1
---------------------
mct_bilinear_w32_0_8bpc_avx2: 29.5
mct_bilinear_w32_0_8bpc_avx512icl: 17.1
---------------------
mct_bilinear_w64_0_8bpc_avx2: 68.1
mct_bilinear_w64_0_8bpc_avx512icl: 34.7
---------------------
mct_bilinear_w128_0_8bpc_avx2: 180.5
mct_bilinear_w128_0_8bpc_avx512icl: 138.0
------------------------------------------
mct_bilinear_w4_h_8bpc_avx2: 4.0
mct_bilinear_w4_h_8bpc_avx512icl: 3.9
---------------------
mct_bilinear_w8_h_8bpc_avx2: 5.3
mct_bilinear_w8_h_8bpc_avx512icl: 5.0
---------------------
mct_bilinear_w16_h_8bpc_avx2: 11.7
mct_bilinear_w16_h_8bpc_avx512icl: 7.5
---------------------
mct_bilinear_w32_h_8bpc_avx2: 41.8
mct_bilinear_w32_h_8bpc_avx512icl: 20.3
---------------------
mct_bilinear_w64_h_8bpc_avx2: 94.9
mct_bilinear_w64_h_8bpc_avx512icl: 35.0
---------------------
mct_bilinear_w128_h_8bpc_avx2: 240.1
mct_bilinear_w128_h_8bpc_avx512icl: 143.8
------------------------------------------
mct_bilinear_w4_v_8bpc_avx2: 4.1
mct_bilinear_w4_v_8bpc_avx512icl: 4.0
---------------------
mct_bilinear_w8_v_8bpc_avx2: 6.0
mct_bilinear_w8_v_8bpc_avx512icl: 5.4
---------------------
mct_bilinear_w16_v_8bpc_avx2: 10.3
mct_bilinear_w16_v_8bpc_avx512icl: 8.9
---------------------
mct_bilinear_w32_v_8bpc_avx2: 29.5
mct_bilinear_w32_v_8bpc_avx512icl: 25.9
---------------------
mct_bilinear_w64_v_8bpc_avx2: 64.3
mct_bilinear_w64_v_8bpc_avx512icl: 41.3
---------------------
mct_bilinear_w128_v_8bpc_avx2: 198.2
mct_bilinear_w128_v_8bpc_avx512icl: 139.6
------------------------------------------
mct_bilinear_w4_hv_8bpc_avx2: 5.6
mct_bilinear_w4_hv_8bpc_avx512icl: 5.2
---------------------
mct_bilinear_w8_hv_8bpc_avx2: 8.3
mct_bilinear_w8_hv_8bpc_avx512icl: 7.0
---------------------
mct_bilinear_w16_hv_8bpc_avx2: 19.4
mct_bilinear_w16_hv_8bpc_avx512icl: 12.1
---------------------
mct_bilinear_w32_hv_8bpc_avx2: 69.1
mct_bilinear_w32_hv_8bpc_avx512icl: 32.5
---------------------
mct_bilinear_w64_hv_8bpc_avx2: 164.4
mct_bilinear_w64_hv_8bpc_avx512icl: 71.1
---------------------
mct_bilinear_w128_hv_8bpc_avx2: 405.2
mct_bilinear_w128_hv_8bpc_avx512icl: 193.1
------------------------------------------
--- a/meson.build
+++ b/meson.build
@@ -268,12 +268,12 @@
if get_option('stack_alignment') > 0
stack_alignment = get_option('stack_alignment')
elif host_machine.cpu_family() == 'x86_64'
- if cc.has_argument('-mpreferred-stack-boundary=5')
- stackalign_flag = ['-mpreferred-stack-boundary=5']
+ if cc.has_argument('-mpreferred-stack-boundary=6')
+ stackalign_flag = ['-mpreferred-stack-boundary=6']
stackrealign_flag = ['-mincoming-stack-boundary=4']
stack_alignment = 32
- elif cc.has_argument('-mstack-alignment=32')
- stackalign_flag = ['-mstack-alignment=32']
+ elif cc.has_argument('-mstack-alignment=64')
+ stackalign_flag = ['-mstack-alignment=64']
stackrealign_flag = ['-mstackrealign']
stack_alignment = 32
else
--- a/src/internal.h
+++ b/src/internal.h
@@ -288,7 +288,7 @@
uint16_t al_pal[2 /* a/l */][32 /* bx/y4 */][3 /* plane */][8 /* palette_idx */];
uint8_t pal_sz_uv[2 /* a/l */][32 /* bx4/by4 */];
uint8_t txtp_map[32 * 32]; // inter-only
- ALIGN(union, 32) {
+ ALIGN(union, 64) {
struct {
union {
uint8_t lap_8bpc [128 * 32];
--- a/src/lib.c
+++ b/src/lib.c
@@ -152,7 +152,7 @@
f->c = c;
f->lf.last_sharpness = -1;
f->n_tc = s->n_tile_threads;
- f->tc = dav1d_alloc_aligned(sizeof(*f->tc) * s->n_tile_threads, 32);
+ f->tc = dav1d_alloc_aligned(sizeof(*f->tc) * s->n_tile_threads, 64);
if (!f->tc) goto error;
memset(f->tc, 0, sizeof(*f->tc) * s->n_tile_threads);
if (f->n_tc > 1) {
--- a/src/x86/mc.asm
+++ b/src/x86/mc.asm
@@ -27,7 +27,7 @@
%if ARCH_X86_64
-SECTION_RODATA 32
+SECTION_RODATA 64
; dav1d_obmc_masks[] with 64-x interleaved
obmc_masks: db 0, 0, 0, 0
@@ -46,19 +46,41 @@
db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2
db 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0
-warp_8x8_shufA: db 0, 2, 4, 6, 1, 3, 5, 7, 1, 3, 5, 7, 2, 4, 6, 8
- db 4, 6, 8, 10, 5, 7, 9, 11, 5, 7, 9, 11, 6, 8, 10, 12
-warp_8x8_shufB: db 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9, 4, 6, 8, 10
- db 6, 8, 10, 12, 7, 9, 11, 13, 7, 9, 11, 13, 8, 10, 12, 14
-subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12
- db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14
-subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
-subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
-subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
-bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11
-bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
-deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11
-blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
+warp_8x8_shufA: db 0, 2, 4, 6, 1, 3, 5, 7, 1, 3, 5, 7, 2, 4, 6, 8
+ db 4, 6, 8, 10, 5, 7, 9, 11, 5, 7, 9, 11, 6, 8, 10, 12
+warp_8x8_shufB: db 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9, 4, 6, 8, 10
+ db 6, 8, 10, 12, 7, 9, 11, 13, 7, 9, 11, 13, 8, 10, 12, 14
+subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12
+ db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14
+subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11
+bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
+bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7
+bilin_h_perm16: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
+ db 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15
+ db 33, 32, 34, 33, 35, 34, 36, 35, 37, 36, 38, 37, 39, 38, 40, 39
+ db 41, 40, 42, 41, 43, 42, 44, 43, 45, 44, 46, 45, 47, 46, 48, 47
+bilin_h_perm32: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
+ db 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15
+ db 17, 16, 18, 17, 19, 18, 20, 19, 21, 20, 22, 21, 23, 22, 24, 23
+ db 25, 24, 26, 25, 27, 26, 28, 27, 29, 28, 30, 29, 31, 30, 32, 31
+bilin_v_perm8: db 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7
+ db 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23
+ db 32, 80, 33, 81, 34, 82, 35, 83, 36, 84, 37, 85, 38, 86, 39, 87
+ db 64, 32, 65, 33, 66, 34, 67, 35, 68, 36, 69, 37, 70, 38, 71, 39
+bilin_v_perm16: db 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7
+ db 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15
+ db 64, 16, 65, 17, 66, 18, 67, 19, 68, 20, 69, 21, 70, 22, 71, 23
+ db 72, 24, 73, 25, 74, 26, 75, 27, 76, 28, 77, 29, 78, 30, 79, 31
+bilin_v_perm32: db 64, 0, 65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7
+ db 72, 8, 73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15
+ db 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23
+ db 88, 24, 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31
+bilin_v_perm64: dq 0, 4, 1, 5, 2, 6, 3, 7
+deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11
+blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
pb_64: times 4 db 64
pw_34: times 2 dw 34
@@ -108,9 +130,11 @@
%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_avx2.put)
%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_avx2.prep)
+%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_avx512icl.prep)
-BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128
-BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128
+BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128
+BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128
+BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128
%macro HV_JMP_TABLE 5-*
%xdefine %%prefix mangle(private_prefix %+ _%1_%2_%3)
@@ -144,10 +168,11 @@
%endif
%endmacro
-HV_JMP_TABLE put, 8tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128
-HV_JMP_TABLE prep, 8tap, avx2, 1, 4, 8, 16, 32, 64, 128
-HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128
-HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put, 8tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, 8tap, avx2, 1, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128
%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
@@ -729,10 +754,11 @@
lea t2d, [hq+(3<<8)]
jmp .hv_w32gt
+%macro PREP_BILIN 0
DECLARE_REG_TMP 3, 5, 6
cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
movifnidn mxyd, r5m ; mx
- lea t2, [prep_avx2]
+ lea t2, [prep%+SUFFIX]
tzcnt wd, wm
movifnidn hd, hm
test mxyd, mxyd
@@ -751,9 +777,9 @@
pinsrd xm0, [srcq+strideq*2], 2
pinsrd xm0, [srcq+stride3q ], 3
lea srcq, [srcq+strideq*4]
- pmovzxbw m0, xm0
- psllw m0, 4
- mova [tmpq], m0
+ pmovzxbw ym0, xm0
+ psllw ym0, 4
+ mova [tmpq], ym0
add tmpq, 32
sub hd, 4
jg .prep_w4
@@ -760,6 +786,16 @@
RET
.prep_w8:
movq xm0, [srcq+strideq*0]
+%if cpuflag(avx512)
+ movq xm1, [srcq+strideq*1]
+ vinserti128 ym0, [srcq+strideq*2], 1
+ vinserti128 ym1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ punpcklqdq ym0, ym1
+ pmovzxbw m0, ym0
+ psllw m0, 4
+ mova [tmpq], m0
+%else
movhps xm0, [srcq+strideq*1]
movq xm1, [srcq+strideq*2]
movhps xm1, [srcq+stride3q ]
@@ -770,83 +806,120 @@
psllw m1, 4
mova [tmpq+32*0], m0
mova [tmpq+32*1], m1
+%endif
add tmpq, 32*2
sub hd, 4
jg .prep_w8
RET
.prep_w16:
+%if cpuflag(avx512)
+ movu xm0, [srcq+strideq*0]
+ vinserti128 ym0, [srcq+strideq*1], 1
+ movu xm1, [srcq+strideq*2]
+ vinserti128 ym1, [srcq+stride3q ], 1
+ pmovzxbw m0, ym0
+ pmovzxbw m1, ym1
+%else
pmovzxbw m0, [srcq+strideq*0]
pmovzxbw m1, [srcq+strideq*1]
pmovzxbw m2, [srcq+strideq*2]
pmovzxbw m3, [srcq+stride3q ]
+%endif
lea srcq, [srcq+strideq*4]
psllw m0, 4
psllw m1, 4
+%if notcpuflag(avx512)
psllw m2, 4
psllw m3, 4
- mova [tmpq+32*0], m0
- mova [tmpq+32*1], m1
+%endif
+ mova [tmpq+mmsize*0], m0
+ mova [tmpq+mmsize*1], m1
+%if notcpuflag(avx512)
mova [tmpq+32*2], m2
mova [tmpq+32*3], m3
+%endif
add tmpq, 32*4
sub hd, 4
jg .prep_w16
RET
.prep_w32:
+%if cpuflag(avx512)
+ pmovzxbw m0, [srcq+strideq*0]
+ pmovzxbw m1, [srcq+strideq*1]
+ pmovzxbw m2, [srcq+strideq*2]
+ pmovzxbw m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+%else
pmovzxbw m0, [srcq+strideq*0+16*0]
pmovzxbw m1, [srcq+strideq*0+16*1]
pmovzxbw m2, [srcq+strideq*1+16*0]
pmovzxbw m3, [srcq+strideq*1+16*1]
lea srcq, [srcq+strideq*2]
+%endif
psllw m0, 4
psllw m1, 4
psllw m2, 4
psllw m3, 4
- mova [tmpq+32*0], m0
- mova [tmpq+32*1], m1
- mova [tmpq+32*2], m2
- mova [tmpq+32*3], m3
- add tmpq, 32*4
- sub hd, 2
+ mova [tmpq+mmsize*0], m0
+ mova [tmpq+mmsize*1], m1
+ mova [tmpq+mmsize*2], m2
+ mova [tmpq+mmsize*3], m3
+ add tmpq, mmsize*4
+ sub hd, mmsize*4/(32*2)
jg .prep_w32
RET
.prep_w64:
+%if cpuflag(avx512)
+ pmovzxbw m0, [srcq+strideq*0+32*0]
+ pmovzxbw m1, [srcq+strideq*0+32*1]
+ pmovzxbw m2, [srcq+strideq*1+32*0]
+ pmovzxbw m3, [srcq+strideq*1+32*1]
+ lea srcq, [srcq+strideq*2]
+%else
pmovzxbw m0, [srcq+16*0]
pmovzxbw m1, [srcq+16*1]
pmovzxbw m2, [srcq+16*2]
pmovzxbw m3, [srcq+16*3]
add srcq, strideq
+%endif
psllw m0, 4
psllw m1, 4
psllw m2, 4
psllw m3, 4
- mova [tmpq+32*0], m0
- mova [tmpq+32*1], m1
- mova [tmpq+32*2], m2
- mova [tmpq+32*3], m3
- add tmpq, 32*4
+ mova [tmpq+mmsize*0], m0
+ mova [tmpq+mmsize*1], m1
+ mova [tmpq+mmsize*2], m2
+ mova [tmpq+mmsize*3], m3
+ add tmpq, mmsize*4
+%if cpuflag(avx512)
+ sub hd, 2
+%else
dec hd
+%endif
jg .prep_w64
RET
.prep_w128:
- pmovzxbw m0, [srcq+16*0]
- pmovzxbw m1, [srcq+16*1]
- pmovzxbw m2, [srcq+16*2]
- pmovzxbw m3, [srcq+16*3]
+ pmovzxbw m0, [srcq+(mmsize/2)*0]
+ pmovzxbw m1, [srcq+(mmsize/2)*1]
+ pmovzxbw m2, [srcq+(mmsize/2)*2]
+ pmovzxbw m3, [srcq+(mmsize/2)*3]
psllw m0, 4
psllw m1, 4
psllw m2, 4
psllw m3, 4
- mova [tmpq+32*0], m0
- mova [tmpq+32*1], m1
- mova [tmpq+32*2], m2
- mova [tmpq+32*3], m3
+ mova [tmpq+mmsize*0], m0
+ mova [tmpq+mmsize*1], m1
+ mova [tmpq+mmsize*2], m2
+ mova [tmpq+mmsize*3], m3
+%if notcpuflag(avx512)
pmovzxbw m0, [srcq+16*4]
pmovzxbw m1, [srcq+16*5]
pmovzxbw m2, [srcq+16*6]
pmovzxbw m3, [srcq+16*7]
+%endif
add tmpq, 32*8
add srcq, strideq
+%if notcpuflag(avx512)
psllw m0, 4
psllw m1, 4
psllw m2, 4
@@ -855,6 +928,7 @@
mova [tmpq-32*3], m1
mova [tmpq-32*2], m2
mova [tmpq-32*1], m3
+%endif
dec hd
jg .prep_w128
RET
@@ -862,11 +936,15 @@
; 16 * src[x] + (mx * (src[x + 1] - src[x]))
; = (16 - mx) * src[x] + mx * src[x + 1]
imul mxyd, 0xff01
- vbroadcasti128 m4, [bilin_h_shuf8]
add mxyd, 16 << 8
+%if cpuflag(avx512)
+ vpbroadcastw m5, mxyd
+%else
movd xm5, mxyd
- mov mxyd, r6m ; my
+ vbroadcasti128 m4, [bilin_h_shuf8]
vpbroadcastw m5, xm5
+%endif
+ mov mxyd, r6m ; my
test mxyd, mxyd
jnz .hv
movzx wd, word [t2+wq*2+table_offset(prep, _bilin_h)]
@@ -874,7 +952,7 @@
lea stride3q, [strideq*3]
jmp wq
.h_w4:
- vbroadcasti128 m4, [bilin_h_shuf4]
+ vbroadcasti128 ym4, [bilin_h_shuf4]
.h_w4_loop:
movq xm0, [srcq+strideq*0]
movhps xm0, [srcq+strideq*1]
@@ -881,40 +959,70 @@
movq xm1, [srcq+strideq*2]
movhps xm1, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
- vinserti128 m0, m0, xm1, 1
- pshufb m0, m4
- pmaddubsw m0, m5
- mova [tmpq], m0
+ vinserti128 ym0, xm1, 1
+ pshufb ym0, ym4
+ pmaddubsw ym0, ym5
+ mova [tmpq], ym0
add tmpq, 32
sub hd, 4
jg .h_w4_loop
RET
.h_w8:
- movu xm0, [srcq+strideq*0]
- vinserti128 m0, m0, [srcq+strideq*1], 1
- movu xm1, [srcq+strideq*2]
- vinserti128 m1, m1, [srcq+stride3q ], 1
- lea srcq, [srcq+strideq*4]
+%if cpuflag(avx512)
+ vbroadcasti128 m4, [bilin_h_shuf8]
+.h_w8_loop:
+ movu xm0, [srcq+strideq*0]
+ vinserti128 ym0, [srcq+strideq*1], 1
+ vinserti128 m0, [srcq+strideq*2], 2
+ vinserti128 m0, [srcq+stride3q ], 3
+ lea srcq, [srcq+strideq*4]
pshufb m0, m4
+ pmaddubsw m0, m5
+ mova [tmpq+64*0], m0
+%else
+.h_w8_loop:
+ movu xm0, [srcq+strideq*0]
+ vinserti128 m0, [srcq+strideq*1], 1
+ movu xm1, [srcq+strideq*2]
+ vinserti128 m1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ pshufb m0, m4
pshufb m1, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
mova [tmpq+32*0], m0
mova [tmpq+32*1], m1
+%endif
add tmpq, 32*2
sub hd, 4
- jg .h_w8
+ jg .h_w8_loop
RET
.h_w16:
- movu xm0, [srcq+strideq*0+8*0]
- vinserti128 m0, m0, [srcq+strideq*0+8*1], 1
- movu xm1, [srcq+strideq*1+8*0]
- vinserti128 m1, m1, [srcq+strideq*1+8*1], 1
- movu xm2, [srcq+strideq*2+8*0]
- vinserti128 m2, m2, [srcq+strideq*2+8*1], 1
- movu xm3, [srcq+stride3q +8*0]
- vinserti128 m3, m3, [srcq+stride3q +8*1], 1
- lea srcq, [srcq+strideq*4]
+%if cpuflag(avx512icl)
+ mova m4, [bilin_h_perm16]
+.h_w16_loop:
+ movu ym0, [srcq+strideq*0]
+ vinserti32x8 m0, [srcq+strideq*1], 1
+ movu ym1, [srcq+strideq*2]
+ vinserti32x8 m1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ vpermb m0, m4, m0
+ vpermb m1, m4, m1
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+%else
+.h_w16_loop:
+ movu xm0, [srcq+strideq*0+8*0]
+ vinserti128 m0, [srcq+strideq*0+8*1], 1
+ movu xm1, [srcq+strideq*1+8*0]
+ vinserti128 m1, [srcq+strideq*1+8*1], 1
+ movu xm2, [srcq+strideq*2+8*0]
+ vinserti128 m2, [srcq+strideq*2+8*1], 1
+ movu xm3, [srcq+stride3q +8*0]
+ vinserti128 m3, [srcq+stride3q +8*1], 1
+ lea srcq, [srcq+strideq*4]
pshufb m0, m4
pshufb m1, m4
pshufb m2, m4
@@ -927,93 +1035,133 @@
mova [tmpq+32*1], m1
mova [tmpq+32*2], m2
mova [tmpq+32*3], m3
+%endif
add tmpq, 32*4
sub hd, 4
- jg .h_w16
+ jg .h_w16_loop
RET
.h_w32:
- movu xm0, [srcq+strideq*0+8*0]
- vinserti128 m0, m0, [srcq+strideq*0+8*1], 1
- movu xm1, [srcq+strideq*0+8*2]
- vinserti128 m1, m1, [srcq+strideq*0+8*3], 1
- movu xm2, [srcq+strideq*1+8*0]
- vinserti128 m2, m2, [srcq+strideq*1+8*1], 1
- movu xm3, [srcq+strideq*1+8*2]
- vinserti128 m3, m3, [srcq+strideq*1+8*3], 1
- lea srcq, [srcq+strideq*2]
+%if cpuflag(avx512icl)
+ mova m4, [bilin_h_perm32]
+.h_w32_loop:
+ vpermb m0, m4, [srcq+strideq*0]
+ vpermb m1, m4, [srcq+strideq*1]
+ vpermb m2, m4, [srcq+strideq*2]
+ vpermb m3, m4, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+%else
+.h_w32_loop:
+ movu xm0, [srcq+strideq*0+8*0]
+ vinserti128 m0, [srcq+strideq*0+8*1], 1
+ movu xm1, [srcq+strideq*0+8*2]
+ vinserti128 m1, [srcq+strideq*0+8*3], 1
+ movu xm2, [srcq+strideq*1+8*0]
+ vinserti128 m2, [srcq+strideq*1+8*1], 1
+ movu xm3, [srcq+strideq*1+8*2]
+ vinserti128 m3, [srcq+strideq*1+8*3], 1
+ lea srcq, [srcq+strideq*2]
pshufb m0, m4
pshufb m1, m4
pshufb m2, m4
pshufb m3, m4
+%endif
pmaddubsw m0, m5
pmaddubsw m1, m5
pmaddubsw m2, m5
pmaddubsw m3, m5
- mova [tmpq+32*0], m0
- mova [tmpq+32*1], m1
- mova [tmpq+32*2], m2
- mova [tmpq+32*3], m3
- add tmpq, 32*4
- sub hd, 2
- jg .h_w32
+ mova [tmpq+mmsize*0], m0
+ mova [tmpq+mmsize*1], m1
+ mova [tmpq+mmsize*2], m2
+ mova [tmpq+mmsize*3], m3
+ add tmpq, mmsize*4
+ sub hd, mmsize*4/(32*2)
+ jg .h_w32_loop
RET
.h_w64:
- movu xm0, [srcq+8*0]
- vinserti128 m0, m0, [srcq+8*1], 1
- movu xm1, [srcq+8*2]
- vinserti128 m1, m1, [srcq+8*3], 1
- movu xm2, [srcq+8*4]
- vinserti128 m2, m2, [srcq+8*5], 1
- movu xm3, [srcq+8*6]
- vinserti128 m3, m3, [srcq+8*7], 1
+%if cpuflag(avx512icl)
+ mova m4, [bilin_h_perm32]
+.h_w64_loop:
+ vpermb m0, m4, [srcq+strideq*0+32*0]
+ vpermb m1, m4, [srcq+strideq*0+32*1]
+ vpermb m2, m4, [srcq+strideq*1+32*0]
+ vpermb m3, m4, [srcq+strideq*1+32*1]
+ lea srcq, [srcq+strideq*2]
+%else
+.h_w64_loop:
+ movu xm0, [srcq+8*0]
+ vinserti128 m0, [srcq+8*1], 1
+ movu xm1, [srcq+8*2]
+ vinserti128 m1, [srcq+8*3], 1
+ movu xm2, [srcq+8*4]
+ vinserti128 m2, [srcq+8*5], 1
+ movu xm3, [srcq+8*6]
+ vinserti128 m3, [srcq+8*7], 1
add srcq, strideq
pshufb m0, m4
pshufb m1, m4
pshufb m2, m4
pshufb m3, m4
+%endif
pmaddubsw m0, m5
pmaddubsw m1, m5
pmaddubsw m2, m5
pmaddubsw m3, m5
- mova [tmpq+32*0], m0
- mova [tmpq+32*1], m1
- mova [tmpq+32*2], m2
- mova [tmpq+32*3], m3
- add tmpq, 32*4
+ mova [tmpq+mmsize*0], m0
+ mova [tmpq+mmsize*1], m1
+ mova [tmpq+mmsize*2], m2
+ mova [tmpq+mmsize*3], m3
+ add tmpq, mmsize*4
+%if cpuflag(avx512)
+ sub hd, 2
+%else
dec hd
- jg .h_w64
+%endif
+ jg .h_w64_loop
RET
.h_w128:
- movu xm0, [srcq+8*0]
- vinserti128 m0, m0, [srcq+8*1], 1
- movu xm1, [srcq+8*2]
- vinserti128 m1, m1, [srcq+8*3], 1
- movu xm2, [srcq+8*4]
- vinserti128 m2, m2, [srcq+8*5], 1
- movu xm3, [srcq+8*6]
- vinserti128 m3, m3, [srcq+8*7], 1
+%if cpuflag(avx512icl)
+ mova m4, [bilin_h_perm32]
+.h_w128_loop:
+ vpermb m0, m4, [srcq+32*0]
+ vpermb m1, m4, [srcq+32*1]
+ vpermb m2, m4, [srcq+32*2]
+ vpermb m3, m4, [srcq+32*3]
+%else
+.h_w128_loop:
+ movu xm0, [srcq+8*0]
+ vinserti128 m0, [srcq+8*1], 1
+ movu xm1, [srcq+8*2]
+ vinserti128 m1, [srcq+8*3], 1
+ movu xm2, [srcq+8*4]
+ vinserti128 m2, [srcq+8*5], 1
+ movu xm3, [srcq+8*6]
+ vinserti128 m3, [srcq+8*7], 1
pshufb m0, m4
pshufb m1, m4
pshufb m2, m4
pshufb m3, m4
+%endif
pmaddubsw m0, m5
pmaddubsw m1, m5
pmaddubsw m2, m5
pmaddubsw m3, m5
- mova [tmpq+32*0], m0
- mova [tmpq+32*1], m1
- mova [tmpq+32*2], m2
- mova [tmpq+32*3], m3
- movu xm0, [srcq+8* 8]
- vinserti128 m0, m0, [srcq+8* 9], 1
- movu xm1, [srcq+8*10]
- vinserti128 m1, m1, [srcq+8*11], 1
- movu xm2, [srcq+8*12]
- vinserti128 m2, m2, [srcq+8*13], 1
- movu xm3, [srcq+8*14]
- vinserti128 m3, m3, [srcq+8*15], 1
+ mova [tmpq+mmsize*0], m0
+ mova [tmpq+mmsize*1], m1
+ mova [tmpq+mmsize*2], m2
+ mova [tmpq+mmsize*3], m3
+%if notcpuflag(avx512)
+ movu xm0, [srcq+8* 8]
+ vinserti128 m0, [srcq+8* 9], 1
+ movu xm1, [srcq+8*10]
+ vinserti128 m1, [srcq+8*11], 1
+ movu xm2, [srcq+8*12]
+ vinserti128 m2, [srcq+8*13], 1
+ movu xm3, [srcq+8*14]
+ vinserti128 m3, [srcq+8*15], 1
+%endif
add tmpq, 32*8
add srcq, strideq
+%if notcpuflag(avx512)
pshufb m0, m4
pshufb m1, m4
pshufb m2, m4
@@ -1026,8 +1174,9 @@
mova [tmpq-32*3], m1
mova [tmpq-32*2], m2
mova [tmpq-32*1], m3
+%endif
dec hd
- jg .h_w128
+ jg .h_w128_loop
RET
.v:
WIN64_SPILL_XMM 7
@@ -1036,10 +1185,28 @@
add mxyd, 16 << 8
add wq, t2
lea stride3q, [strideq*3]
+%if cpuflag(avx512)
+ vpbroadcastw m6, mxyd
+%else
movd xm6, mxyd
vpbroadcastw m6, xm6
+%endif
jmp wq
.v_w4:
+%if cpuflag(avx512)
+ vpbroadcastd xm0, [srcq+strideq*0]
+ mov r3d, 0x29
+ vbroadcasti128 ym3, [bilin_v_shuf4]
+ kmovb k1, r3d
+.v_w4_loop:
+ vpblendmd xm1{k1}, xm0, [srcq+strideq*1] {1to4} ; __01 ____
+ vpbroadcastd ym2, [srcq+strideq*2]
+ vpbroadcastd ym2{k1}, [srcq+stride3q ] ; __2_ 23__
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastd ym0, [srcq+strideq*0]
+ punpckhqdq ym2{k1}, ym1, ym0 ; 012_ 234_
+ pshufb ym2, ym3
+%else
movd xm0, [srcq+strideq*0]
.v_w4_loop:
vpbroadcastd m1, [srcq+strideq*2]
@@ -1053,15 +1220,31 @@
vpblendd m1, m1, m3, 0xaa ; 0 1 2 3
vpblendd m2, m2, m3, 0x55 ; 1 2 3 4
punpcklbw m2, m1
- pmaddubsw m2, m6
- mova [tmpq], m2
+%endif
+ pmaddubsw ym2, ym6
+ mova [tmpq], ym2
add tmpq, 32
sub hd, 4
jg .v_w4_loop
RET
.v_w8:
+%if cpuflag(avx512icl)
+ mova m5, [bilin_v_perm8]
+ vbroadcasti128 ym0, [srcq+strideq*0]
+%else
movq xm0, [srcq+strideq*0]
+%endif
.v_w8_loop:
+%if cpuflag(avx512icl)
+ vinserti128 ym1, ym0, [srcq+strideq*1], 1
+ vpbroadcastq ym0, [srcq+strideq*2]
+ vinserti128 m1, [srcq+stride3q ], 2
+ lea srcq, [srcq+strideq*4]
+ vinserti128 ym0, [srcq+strideq*0], 0
+ vpermt2b m1, m5, m0
+ pmaddubsw m1, m6
+ mova [tmpq], m1
+%else
vpbroadcastq m1, [srcq+strideq*2]
vpbroadcastq m2, [srcq+strideq*1]
vpbroadcastq m3, [srcq+stride3q ]
@@ -1078,11 +1261,28 @@
pmaddubsw m2, m6
mova [tmpq+32*0], m3
mova [tmpq+32*1], m2
+%endif
add tmpq, 32*2
sub hd, 4
jg .v_w8_loop
RET
.v_w16:
+%if cpuflag(avx512icl)
+ mova m5, [bilin_v_perm16]
+ movu xm0, [srcq+strideq*0]
+.v_w16_loop:
+ movu xm2, [srcq+strideq*2]
+ vinserti128 ym1, ym0, [srcq+strideq*1], 1
+ vpermt2b m1, m5, m2
+ vinserti128 ym2, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ movu xm0, [srcq+strideq*0]
+ vpermt2b m2, m5, m0
+ pmaddubsw m1, m6
+ pmaddubsw m2, m6
+ mova [tmpq+64*0], m1
+ mova [tmpq+64*1], m2
+%else
vbroadcasti128 m0, [srcq+strideq*0]
.v_w16_loop:
vbroadcasti128 m1, [srcq+strideq*2]
@@ -1089,10 +1289,10 @@
vbroadcasti128 m2, [srcq+strideq*1]
vbroadcasti128 m3, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
- shufpd m4, m0, m1, 0x0c ; 0 2
+ shufpd m4, m0, m1, 0x0c ; 0 2 ; 0l2l 0h2h
vbroadcasti128 m0, [srcq+strideq*0]
- shufpd m2, m2, m3, 0x0c ; 1 3
- shufpd m1, m1, m0, 0x0c ; 2 4
+ shufpd m2, m2, m3, 0x0c ; 1 3 ; 1l3l 1h3h
+ shufpd m1, m1, m0, 0x0c ; 2 4 ; 2l4l 2h4h
punpcklbw m3, m2, m4
punpcklbw m5, m1, m2
punpckhbw m1, m2
@@ -1105,30 +1305,54 @@
mova [tmpq+32*1], m5
mova [tmpq+32*2], m2
mova [tmpq+32*3], m1
+%endif
add tmpq, 32*4
sub hd, 4
jg .v_w16_loop
RET
.v_w32:
- vpermq m0, [srcq+strideq*0], q3120
+%if cpuflag(avx512icl)
+ mova m5, [bilin_v_perm32]
+ movu ym0, [srcq+strideq*0]
.v_w32_loop:
- vpermq m1, [srcq+strideq*1], q3120
- vpermq m2, [srcq+strideq*2], q3120
- vpermq m3, [srcq+stride3q ], q3120
+ movu ym2, [srcq+strideq*1]
+ movu ym3, [srcq+strideq*2]
+ movu ym4, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
+ vpermt2b m0, m5, m2
+ vpermt2b m2, m5, m3
+ vpermt2b m3, m5, m4
+ pmaddubsw m1, m0, m6
+ movu ym0, [srcq+strideq*0]
+ vpermt2b m4, m5, m0
+ pmaddubsw m2, m6
+ pmaddubsw m3, m6
+ pmaddubsw m4, m6
+ mova [tmpq+64*0], m1
+ mova [tmpq+64*1], m2
+ mova [tmpq+64*2], m3
+ mova [tmpq+64*3], m4
+ add tmpq, 64*4
+%else
+ vpermq ym0, [srcq+strideq*0], q3120
+.v_w32_loop:
+ vpermq ym1, [srcq+strideq*1], q3120
+ vpermq ym2, [srcq+strideq*2], q3120
+ vpermq ym3, [srcq+stride3q ], q3120
+ lea srcq, [srcq+strideq*4]
punpcklbw m4, m1, m0
punpckhbw m5, m1, m0
- vpermq m0, [srcq+strideq*0], q3120
+ vpermq ym0, [srcq+strideq*0], q3120
pmaddubsw m4, m6
pmaddubsw m5, m6
- mova [tmpq+32*0], m4
- mova [tmpq+32*1], m5
+ mova [tmpq+32*0], ym4
+ mova [tmpq+32*1], ym5
punpcklbw m4, m2, m1
punpckhbw m5, m2, m1
pmaddubsw m4, m6
pmaddubsw m5, m6
- mova [tmpq+32*2], m4
- mova [tmpq+32*3], m5
+ mova [tmpq+32*2], ym4
+ mova [tmpq+32*3], ym5
add tmpq, 32*8
punpcklbw m4, m3, m2
punpckhbw m5, m3, m2
@@ -1142,10 +1366,32 @@
mova [tmpq-32*3], m5
mova [tmpq-32*2], m1
mova [tmpq-32*1], m2
+%endif
sub hd, 4
jg .v_w32_loop
RET
.v_w64:
+%if cpuflag(avx512)
+ mova m5, [bilin_v_perm64]
+ vpermq m0, m5, [srcq+strideq*0]
+.v_w64_loop:
+ vpermq m1, m5, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ punpcklbw m4, m1, m0
+ punpckhbw m2, m1, m0
+ vpermq m0, m5, [srcq+strideq*0]
+ punpcklbw m3, m0, m1
+ punpckhbw m1, m0, m1
+ pmaddubsw m4, m6
+ pmaddubsw m2, m6
+ pmaddubsw m3, m6
+ pmaddubsw m1, m6
+ mova [tmpq+64*0], m4
+ mova [tmpq+64*1], m2
+ mova [tmpq+64*2], m3
+ mova [tmpq+64*3], m1
+ add tmpq, 64*4
+%else
vpermq m0, [srcq+strideq*0+32*0], q3120
vpermq m1, [srcq+strideq*0+32*1], q3120
.v_w64_loop:
@@ -1179,10 +1425,49 @@
mova [tmpq-32*3], m5
mova [tmpq-32*2], m2
mova [tmpq-32*1], m3
+%endif
sub hd, 2
jg .v_w64_loop
RET
.v_w128:
+%if cpuflag(avx512)
+ mova m5, [bilin_v_perm64]
+ vpermq m0, m5, [srcq+strideq*0+ 0]
+ vpermq m1, m5, [srcq+strideq*0+64]
+.v_w128_loop:
+ vpermq m2, m5, [srcq+strideq*1+ 0]
+ vpermq m3, m5, [srcq+strideq*1+64]
+ lea srcq, [srcq+strideq*2]
+ punpcklbw m4, m2, m0
+ punpckhbw m0, m2, m0
+ pmaddubsw m4, m6
+ pmaddubsw m0, m6
+ mova [tmpq+64*0], m4
+ mova [tmpq+64*1], m0
+ punpcklbw m4, m3, m1
+ punpckhbw m1, m3, m1
+ pmaddubsw m4, m6
+ pmaddubsw m1, m6
+ mova [tmpq+64*2], m4
+ mova [tmpq+64*3], m1
+ vpermq m0, m5, [srcq+strideq*0+ 0]
+ vpermq m1, m5, [srcq+strideq*0+64]
+ punpcklbw m4, m0, m2
+ punpckhbw m2, m0, m2
+ pmaddubsw m4, m6
+ pmaddubsw m2, m6
+ mova [tmpq+64*4], m4
+ mova [tmpq+64*5], m2
+ punpcklbw m4, m1, m3
+ punpckhbw m3, m1, m3
+ pmaddubsw m4, m6
+ pmaddubsw m3, m6
+ mova [tmpq+64*6], m4
+ mova [tmpq+64*7], m3
+ add tmpq, 64*8
+ sub hd, 2
+ jg .v_w128_loop
+%else
mov t0, tmpq
mov t1, srcq
lea t2d, [hq+(3<<8)]
@@ -1214,6 +1499,7 @@
mov srcq, t1
sub t2d, 1<<8
jg .v_w128_loop0
+%endif
RET
.hv:
; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
@@ -1222,16 +1508,20 @@
WIN64_SPILL_XMM 7
movzx wd, word [t2+wq*2+table_offset(prep, _bilin_hv)]
shl mxyd, 11
+%if cpuflag(avx512)
+ vpbroadcastw m6, mxyd
+%else
movd xm6, mxyd
+ vpbroadcastw m6, xm6
+%endif
add wq, t2
lea stride3q, [strideq*3]
- vpbroadcastw m6, xm6
jmp wq
.hv_w4:
- vbroadcasti128 m4, [bilin_h_shuf4]
- vpbroadcastq m0, [srcq+strideq*0]
- pshufb m0, m4
- pmaddubsw m0, m5
+ vbroadcasti128 ym4, [bilin_h_shuf4]
+ vpbroadcastq ym0, [srcq+strideq*0]
+ pshufb ym0, ym4
+ pmaddubsw ym0, ym5
.hv_w4_loop:
movq xm1, [srcq+strideq*1]
movhps xm1, [srcq+strideq*2]
@@ -1238,26 +1528,47 @@
movq xm2, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
movhps xm2, [srcq+strideq*0]
- vinserti128 m1, m1, xm2, 1
- pshufb m1, m4
- pmaddubsw m1, m5 ; 1 2 3 4
- vpblendd m2, m1, m0, 0xc0
- vpermq m2, m2, q2103 ; 0 1 2 3
- mova m0, m1
- psubw m1, m2
- pmulhrsw m1, m6
- paddw m1, m2
- mova [tmpq], m1
+ vinserti128 ym1, xm2, 1
+ pshufb ym1, ym4
+ pmaddubsw ym1, ym5 ; 1 2 3 4
+%if cpuflag(avx512)
+ valignq ym2, ym1, ym0, 3 ; 0 1 2 3
+%else
+ vpblendd ym2, ym1, ym0, 0xc0
+ vpermq ym2, ym2, q2103 ; 0 1 2 3
+%endif
+ mova ym0, ym1
+ psubw ym1, ym2
+ pmulhrsw ym1, ym6
+ paddw ym1, ym2
+ mova [tmpq], ym1
add tmpq, 32
sub hd, 4
jg .hv_w4_loop
RET
.hv_w8:
- vbroadcasti128 m0, [srcq+strideq*0]
+%if cpuflag(avx512)
+ vbroadcasti128 m4, [bilin_h_shuf8]
+%endif
+ vbroadcasti128 m0, [srcq+strideq*0]
pshufb m0, m4
pmaddubsw m0, m5
.hv_w8_loop:
- movu xm1, [srcq+strideq*1]
+ movu xm1, [srcq+strideq*1]
+%if cpuflag(avx512)
+ vinserti128 ym1, [srcq+strideq*2], 1
+ vinserti128 m1, [srcq+stride3q ], 2
+ lea srcq, [srcq+strideq*4]
+ vinserti128 m1, [srcq+strideq*0], 3
+ pshufb m1, m4
+ pmaddubsw m1, m5 ; 1 2 3 4
+ valignq m2, m1, m0, 6 ; 0 1 2 3
+ mova m0, m1
+ psubw m1, m2
+ pmulhrsw m1, m6
+ paddw m1, m2
+ mova [tmpq], m1
+%else
vinserti128 m1, m1, [srcq+strideq*2], 1
movu xm2, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
@@ -1276,21 +1587,49 @@
paddw m3, m2
mova [tmpq+32*0], m1
mova [tmpq+32*1], m3
+%endif
add tmpq, 32*2
sub hd, 4
jg .hv_w8_loop
RET
.hv_w16:
- movu m0, [srcq+strideq*0+8*0]
- vinserti128 m0, m0, [srcq+strideq*0+8*1], 1
+%if cpuflag(avx512icl)
+ mova m4, [bilin_h_perm16]
+ vbroadcasti32x8 m0, [srcq+strideq*0]
+ vpermb m0, m4, m0
+%else
+ movu xm0, [srcq+strideq*0+8*0]
+ vinserti128 m0, [srcq+strideq*0+8*1], 1
pshufb m0, m4
+%endif
pmaddubsw m0, m5
.hv_w16_loop:
- movu xm1, [srcq+strideq*1+8*0]
- vinserti128 m1, m1, [srcq+strideq*1+8*1], 1
- lea srcq, [srcq+strideq*2]
- movu xm2, [srcq+strideq*0+8*0]
- vinserti128 m2, m2, [srcq+strideq*0+8*1], 1
+%if cpuflag(avx512icl)
+ movu ym1, [srcq+strideq*1]
+ vinserti32x8 m1, [srcq+strideq*2], 1
+ movu ym2, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vinserti32x8 m2, [srcq+strideq*0], 1
+ vpermb m1, m4, m1
+ vpermb m2, m4, m2
+ pmaddubsw m1, m5 ; 1 2
+ vshufi32x4 m3, m0, m1, q1032 ; 0 1
+ pmaddubsw m0, m2, m5 ; 3 4
+ vshufi32x4 m2, m1, m0, q1032 ; 2 3
+ psubw m1, m3
+ pmulhrsw m1, m6
+ paddw m1, m3
+ psubw m3, m0, m2
+ pmulhrsw m3, m6
+ paddw m3, m2
+ mova [tmpq+64*0], m1
+ mova [tmpq+64*1], m3
+%else
+ movu xm1, [srcq+strideq*1+8*0]
+ vinserti128 m1, [srcq+strideq*1+8*1], 1
+ lea srcq, [srcq+strideq*2]
+ movu xm2, [srcq+strideq*0+8*0]
+ vinserti128 m2, [srcq+strideq*0+8*1], 1
pshufb m1, m4
pshufb m2, m4
pmaddubsw m1, m5
@@ -1303,15 +1642,37 @@
paddw m2, m1
mova [tmpq+32*0], m3
mova [tmpq+32*1], m2
- add tmpq, 32*2
- sub hd, 2
+%endif
+ add tmpq, mmsize*2
+ sub hd, mmsize*2/(16*2)
jg .hv_w16_loop
RET
.hv_w32:
- movu m0, [srcq+8*0]
- vinserti128 m0, m0, [srcq+8*1], 1
- movu m1, [srcq+8*2]
- vinserti128 m1, m1, [srcq+8*3], 1
+%if cpuflag(avx512icl)
+ mova m4, [bilin_h_perm32]
+ vpermb m0, m4, [srcq+strideq*0]
+ pmaddubsw m0, m5
+.hv_w32_loop:
+ vpermb m1, m4, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vpermb m2, m4, [srcq+strideq*0]
+ pmaddubsw m1, m5
+ psubw m3, m1, m0
+ pmulhrsw m3, m6
+ paddw m3, m0
+ pmaddubsw m0, m2, m5
+ psubw m2, m0, m1
+ pmulhrsw m2, m6
+ paddw m2, m1
+ mova [tmpq+64*0], m3
+ mova [tmpq+64*1], m2
+ add tmpq, 64*2
+ sub hd, 2
+%else
+ movu xm0, [srcq+8*0]
+ vinserti128 m0, [srcq+8*1], 1
+ movu xm1, [srcq+8*2]
+ vinserti128 m1, [srcq+8*3], 1
pshufb m0, m4
pshufb m1, m4
pmaddubsw m0, m5
@@ -1338,14 +1699,41 @@
mova [tmpq+32], m3
add tmpq, 32*2
dec hd
+%endif
jg .hv_w32_loop
RET
.hv_w64:
+%if cpuflag(avx512icl)
+ mova m4, [bilin_h_perm32]
+ vpermb m0, m4, [srcq+32*0]
+ vpermb m1, m4, [srcq+32*1]
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+.hv_w64_loop:
+ add srcq, strideq
+ vpermb m2, m4, [srcq+32*0]
+ vpermb m3, m4, [srcq+32*1]
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ psubw m7, m2, m0
+ psubw m8, m3, m1
+ pmulhrsw m7, m6
+ pmulhrsw m8, m6
+ paddw m7, m0
+ paddw m8, m1
+ mova [tmpq+ 0], m7
+ mova [tmpq+64], m8
+ mova m0, m2
+ mova m1, m3
+ add tmpq, 64*2
+ dec hd
+ jg .hv_w64_loop
+%else
mov t0, tmpq
mov t1, srcq
lea t2d, [hq+(3<<8)]
.hv_w64_loop0:
- movu m0, [srcq+strideq*0+8*0]
+ movu xm0, [srcq+strideq*0+8*0]
vinserti128 m0, m0, [srcq+strideq*0+8*1], 1
pshufb m0, m4
pmaddubsw m0, m5
@@ -1377,13 +1765,58 @@
mov srcq, t1
sub t2d, 1<<8
jg .hv_w64_loop0
+%endif
RET
.hv_w128:
+%if cpuflag(avx512icl)
+ mova m4, [bilin_h_perm32]
+ vpermb m0, m4, [srcq+32*0]
+ vpermb m1, m4, [srcq+32*1]
+ vpermb m2, m4, [srcq+32*2]
+ vpermb m3, m4, [srcq+32*3]
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+.hv_w128_loop:
+ add srcq, strideq
+ vpermb m7, m4, [srcq+32*0]
+ vpermb m8, m4, [srcq+32*1]
+ vpermb m9, m4, [srcq+32*2]
+ vpermb m10, m4, [srcq+32*3]
+ pmaddubsw m7, m5
+ pmaddubsw m8, m5
+ pmaddubsw m9, m5
+ pmaddubsw m10, m5
+ psubw m11, m7, m0
+ psubw m12, m8, m1
+ psubw m13, m9, m2
+ psubw m14, m10, m3
+ pmulhrsw m11, m6
+ pmulhrsw m12, m6
+ pmulhrsw m13, m6
+ pmulhrsw m14, m6
+ paddw m11, m0
+ paddw m12, m1
+ paddw m13, m2
+ paddw m14, m3
+ mova [tmpq+64*0], m11
+ mova [tmpq+64*1], m12
+ mova [tmpq+64*2], m13
+ mova [tmpq+64*3], m14
+ mova m0, m7
+ mova m1, m8
+ mova m2, m9
+ mova m3, m10
+ add tmpq, 64*4
+ dec hd
+ jg .hv_w128_loop
+%else
mov t0, tmpq
mov t1, srcq
lea t2d, [hq+(7<<8)]
.hv_w128_loop0:
- movu m0, [srcq+strideq*0+8*0]
+ movu xm0, [srcq+strideq*0+8*0]
vinserti128 m0, m0, [srcq+strideq*0+8*1], 1
pshufb m0, m4
pmaddubsw m0, m5
@@ -1409,13 +1842,21 @@
sub hd, 2
jg .hv_w128_loop
mov hb, t2b
- add t0, 32
- add t1, 16
+ add t0, mmsize
+ add t1, mmsize/2
mov tmpq, t0
mov srcq, t1
sub t2d, 1<<8
jg .hv_w128_loop0
+%endif
RET
+%endmacro
+
+INIT_ZMM avx512icl
+PREP_BILIN
+
+INIT_YMM avx2
+PREP_BILIN
; int8_t subpel_filters[5][15][8]
%assign FILTER_REGULAR (0*15 << 16) | 3*15
--- a/src/x86/mc_init_tmpl.c
+++ b/src/x86/mc_init_tmpl.c
@@ -67,6 +67,7 @@
decl_mct_fn(dav1d_prep_8tap_sharp_regular_ssse3);
decl_mct_fn(dav1d_prep_8tap_sharp_smooth_avx2);
decl_mct_fn(dav1d_prep_8tap_sharp_smooth_ssse3);
+decl_mct_fn(dav1d_prep_bilin_avx512icl);
decl_mct_fn(dav1d_prep_bilin_avx2);
decl_mct_fn(dav1d_prep_bilin_ssse3);
@@ -202,5 +203,12 @@
c->warp8x8t = dav1d_warp_affine_8x8t_avx2;
c->emu_edge = dav1d_emu_edge_avx2;
+#endif
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL))
+ return;
+
+#if BITDEPTH == 8 && ARCH_X86_64
+ init_mct_fn(FILTER_2D_BILINEAR, bilin, avx512icl);
#endif
}