ref: e2aa2d1446d7c8af0d6d00b3e8d91563aeac36bd
parent: 4903d87b73b5bc7bd7fe949034666dc2bc4512af
author: David Michael Barr <[email protected]>
date: Wed Nov 28 16:00:49 EST 2018
x86: Add chroma-from-luma AC 4:2:0 AVX2 asm
--- a/src/x86/ipred.asm
+++ b/src/x86/ipred.asm
@@ -68,7 +68,9 @@
db 6, 6, 6, 6, 2, 2, 2, 2, 4, 4, 4, 4, 0, 0, 0, 0
pb_1: times 4 db 1
+pb_2: times 4 db 2 ; byte multiplier for pmaddubsw in ipred_cfl_ac_420 (each adjacent byte pair becomes 2*(a+b))
pb_128: times 4 db 128
+pw_1: times 2 dw 1 ; word multiplier for pmaddwd in ipred_cfl_ac_420 (sums adjacent word pairs into dwords)
pw_8: times 2 dw 8
pw_128: times 2 dw 128
pw_255: times 2 dw 255
@@ -1781,6 +1783,101 @@
add wq, t0
movifnidn acq, acmp
jmp wq
+
+cglobal ipred_cfl_ac_420, 6, 10, 5, ac, y, stride, wpad, hpad, w, h ; chroma-from-luma AC, 4:2:0: 2x2-subsample y into 16-bit ac, pad right/bottom, then subtract the rounded mean
+ shl wpadd, 2 ; wpad *= 4, so it counts output columns
+ shl hpadd, 2 ; hpad *= 4, so it counts output rows
+ mov r9d, hm ; r9d = h
+ mov r6d, wd ; r6d = w
+ movsxd wq, wd ; widen w to 64 bits for address arithmetic
+ add yq, strideq ; yq = lower source row of the first row pair
+ mov r7, acq ; r7 = output write pointer
+ sub r6d, wpadd ; r6d = w - 4*wpad: columns computed from source pixels
+ sub r9d, hpadd ; r9d = h - 4*hpad: rows computed from source pixels
+ mov r8d, r9d ; r8d = row counter for .dec_rows; r9d is kept for .wpad_rows
+ vpbroadcastd xm2, [pb_2] ; xm2 = 2 in every byte
+.dec_rows: ; one output row per iteration, built from two source rows
+ mov r3, yq ; NOTE(review): r3/r4 presumably alias wpad/hpad under x86inc; both are dead by now - confirm
+ xor r4, r4 ; r4 = output column index
+ sub r3, strideq ; r3 = upper source row, yq = lower source row
+.dec_cols: ; four output words per iteration
+ movq xm0, [r3+r4*2] ; 8 source bytes from the upper row
+ movq xm1, [yq+r4*2] ; 8 source bytes from the lower row
+ pmaddubsw xm0, xm2 ; words = 2*(p[2i] + p[2i+1]) for the upper row
+ pmaddubsw xm1, xm2 ; same for the lower row
+ paddw xm0, xm1 ; words = 2 * sum of each 2x2 block (8x the block average)
+ movq [r7+r4*2], xm0 ; store four output words
+ add r4, 4 ; advance four output columns
+ cmp r6d, r4d ; continue while column index < w - 4*wpad
+ jg .dec_cols
+ lea r7, [r7+wq*2] ; next output row (w words of 2 bytes)
+ lea yq, [yq+strideq*2] ; advance the source by two rows
+ dec r8d ; one fewer row to compute
+ jg .dec_rows
+ cmp r6d, wd ; every column computed?
+ je .wpad_end ; yes: no right-edge padding needed
+ mov r7, acq ; restart at the first output row
+ lea r1, [r6q+r6q] ; r1 = byte offset of the first padded column (overwrites r1, which the code no longer needs as y)
+.wpad_rows: ; extend each computed row to the full width w
+ vpbroadcastw xm0, [r7+r1-2] ; replicate the last computed word of this row
+ mov r2q, r6q ; r2 = first padded column index (stride is no longer needed)
+.wpad_cols: ; four padded words per iteration
+ movq [r7+r2q*2], xm0 ; write four copies of the edge value
+ add r2q, 4 ; advance four output columns
+ cmp wd, r2d ; continue while column index < w
+ jg .wpad_cols
+ lea r7, [r7+wq*2] ; next output row
+ dec r9d ; one iteration per computed row
+ jg .wpad_rows
+.wpad_end: ; on either path r7 now points just past the last computed row
+ bsf r3d, hm ; r3d = index of lowest set bit of h (log2(h), assuming h is a power of two - TODO confirm)
+ shlx r6d, wd, r3d ; r6d = w << r3d, i.e. w*h under that assumption
+ neg wd ; w = -w
+ bsf r3d, r6d ; r3d = index of lowest set bit of r6d; used below as the averaging shift
+ movsxd wq, wd ; sign-extend -w to 64 bits
+ add wq, wq ; wq = -2*w: byte offset back to the previous output row
+ movsxd r2q, r6d ; r2 = total element count
+ lea r2q, [acq+r2q*2] ; r2 = end of the output buffer
+.hpad_loop: ; fill the remaining rows by copying the row directly above
+ cmp r2q, r7 ; reached the end of the buffer?
+ jbe .hpad_end
+ mov r1, [r7+wq] ; four words from one row up
+ add r7, 8 ; advance the write pointer by 8 bytes
+ mov [r7-8], r1 ; store them at the position just passed
+ jmp .hpad_loop
+.hpad_end:
+ mov r1, acq ; r1 = read pointer for the summation pass
+ pxor m1, m1 ; m1 = dword accumulators, cleared
+ vpbroadcastd m3, [pw_1] ; m3 = 1 in every word
+.sum_loop: ; accumulate all ac values; the pointer advances 32 bytes per iteration
+ movdqu m0, [r1] ; load the next vector of words
+ add r1, 32 ; advance the read pointer
+ cmp r2q, r1 ; sets flags for the ja below; the vector ops in between leave flags untouched
+ pmaddwd m0, m3 ; add adjacent word pairs into dwords
+ paddd m1, m0 ; accumulate partial sums
+ ja .sum_loop ; continue while the read pointer is below the buffer end
+ vextracti128 xm0, m1, 1 ; upper 128 bits of the accumulator
+ sar r6d, 1 ; r6d = element count / 2: rounding bias
+ movd xm4, r6d ; bias into a vector register
+ mov r6d, r3d ; r6d = shift count from the second bsf
+ paddd xm0, xm1 ; fold upper half onto lower half: four partial sums
+ punpckhqdq xm1, xm0, xm0 ; upper two partial sums moved to the low qword
+ paddd xm1, xm0 ; dword0 = s0+s2, dword1 = s1+s3
+ vbroadcastss xm0, xm4 ; bias in every dword
+ psrlq xm2, xm1, 32 ; dword0 = s1+s3
+ movq xm4, r6q ; shift count for psrld
+ paddd xm0, xm2 ; dword0 = bias + s1+s3
+ paddd xm0, xm1 ; dword0 = bias + total sum
+ psrld xm0, xm4 ; dword0 = (total + bias) >> shift: rounded mean when the count is a power of two
+ vpbroadcastw m0, xm0 ; low word (the mean) in every word lane
+.sub_loop: ; subtract the mean from every ac value, 32 bytes per iteration
+ movdqu m1, [acq] ; load the next vector of words
+ add acq, 32 ; advance the pointer
+ psubw m1, m0 ; remove the mean
+ movdqu [acq-32], m1 ; store back in place
+ cmp r2q, acq ; continue while the pointer is below the buffer end
+ ja .sub_loop
+ RET
cglobal pal_pred, 4, 6, 5, dst, stride, pal, idx, w, h
vbroadcasti128 m4, [palq]
--- a/src/x86/ipred_init_tmpl.c
+++ b/src/x86/ipred_init_tmpl.c
@@ -45,6 +45,8 @@
decl_cfl_pred_fn(dav1d_ipred_cfl_top_avx2);
decl_cfl_pred_fn(dav1d_ipred_cfl_left_avx2);
+decl_cfl_ac_fn(dav1d_ipred_cfl_ac_420_avx2);
+
decl_pal_pred_fn(dav1d_pal_pred_avx2);
void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c) {
@@ -69,6 +71,8 @@
c->cfl_pred[DC_128_PRED] = dav1d_ipred_cfl_128_avx2;
c->cfl_pred[TOP_DC_PRED] = dav1d_ipred_cfl_top_avx2;
c->cfl_pred[LEFT_DC_PRED] = dav1d_ipred_cfl_left_avx2;
+
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_ipred_cfl_ac_420_avx2;
c->pal_pred = dav1d_pal_pred_avx2;
#endif