ref: e4d0bd4131b01efe1fa9454da415145c8749410a
parent: 701f88b9e3b60a9578ab1e98803c37561e41f466
author: Henrik Gramner <[email protected]>
date: Mon Oct 8 11:39:33 EDT 2018
x86: Add paeth intra prediction AVX2 asm
--- a/src/x86/ipred.asm
+++ b/src/x86/ipred.asm
@@ -28,8 +28,12 @@
%if ARCH_X86_64
-SECTION_RODATA
+SECTION_RODATA 32 ; 32 = section alignment (x86inc); paeth_shuf below is loaded with mova into a 256-bit register
+paeth_shuf: db 7, 7, 7, 7, 3, 3, 3, 3, 6, 6, 6, 6, 2, 2, 2, 2 ; pshufb control, low lane: source bytes 7, 3, 6, 2, each repeated 4x
+ db 5, 5, 5, 5, 1, 1, 1, 1, 4, 4, 4, 4, 0, 0, 0, 0 ; high lane: source bytes 5, 1, 4, 0, each repeated 4x
+
+pb_1: times 4 db 1 ; byte constant 1, broadcast into m4 by ipred_paeth
pb_128: times 4 db 128
%macro JMP_TABLE 3-*
@@ -44,10 +48,11 @@
%define ipred_dc_splat_avx2_table (ipred_dc_avx2_table + 10*4)
-JMP_TABLE ipred_dc, avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
- s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
-JMP_TABLE ipred_dc_left, avx2, h4, h8, h16, h32, h64
-JMP_TABLE ipred_h, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_paeth, avx2, w4, w8, w16, w32, w64 ; per-width entry points of ipred_paeth; read as 32-bit entries indexed by tzcnt(w)
+JMP_TABLE ipred_dc, avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
+ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
+JMP_TABLE ipred_dc_left, avx2, h4, h8, h16, h32, h64
+JMP_TABLE ipred_h, avx2, w4, w8, w16, w32, w64
SECTION .text
@@ -394,6 +399,148 @@
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w64
+ RET
+
+%macro PAETH 2 ; top, ldiff (register numbers). In: m3 = left, m4 = pb_1, m5 = topleft. Out: m0 = predicted pixels. Clobbers m1, m2.
+ pavgb m1, m%1, m3 ; Calculating tldiff normally requires
+ pxor m0, m%1, m3 ; 10-bit intermediates, but we can do it
+ pand m0, m4 ; in 8-bit with some tricks which avoids
+ psubusb m2, m5, m1 ; having to unpack everything to 16-bit.
+ psubb m1, m0 ; m1 = (top + left) >> 1: rounded-up average minus the parity bit held in m0
+ psubusb m1, m5 ; m1 = max(floor_avg - topleft, 0)
+ por m1, m2 ; at most one of the two saturated differences is nonzero, so OR merges them
+ paddusb m1, m1 ; double with saturation at 255
+ por m1, m0 ; min(tldiff, 255)
+ psubusb m2, m5, m3 ; max(topleft - left, 0)
+ psubusb m0, m3, m5 ; max(left - topleft, 0)
+ por m2, m0 ; tdiff
+ pminub m2, m%2 ; m2 = min(tdiff, ldiff)
+ pcmpeqb m0, m%2, m2 ; ldiff <= tdiff
+ vpblendvb m0, m%1, m3, m0 ; left where ldiff <= tdiff, otherwise top
+ pminub m1, m2 ; m1 = min(tldiff, ldiff, tdiff)
+ pcmpeqb m1, m2 ; ldiff <= tldiff || tdiff <= tldiff
+ vpblendvb m0, m5, m0, m1 ; keep the left/top choice where that holds, otherwise topleft
+%endmacro
+
+cglobal ipred_paeth, 3, 6, 9, dst, stride, tl, w, h ; Paeth intra prediction; x86inc cglobal: 3 args loaded, 6 GPRs, 9 vector regs (m0-m8)
+ lea r5, [ipred_paeth_avx2_table] ; r5 = jump table address, also the anchor for addressing the rodata constants
+ tzcnt wd, wm ; wd = trailing zero count of w (log2 for the power-of-two widths in the table)
+ vpbroadcastb m5, [tlq] ; topleft
+ movifnidn hd, hm ; x86inc: load h from its argument location unless it is already in hd
+ movsxd wq, [r5+wq*4] ; wq = sign-extended 32-bit table entry for this width
+ vpbroadcastd m4, [r5-ipred_paeth_avx2_table+pb_1] ; m4 = 1 in every byte (parity mask used by PAETH)
+ add wq, r5 ; table entries are relative to the table address
+ jmp wq ; dispatch to the .wN entry selected by the table
+.w4: ; width 4: 8 rows per iteration
+ vpbroadcastd m6, [tlq+1] ; top (the 4 top pixels repeated in every dword)
+ mova m8, [r5-ipred_paeth_avx2_table+paeth_shuf] ; m8 = shuffle that spreads left pixels across rows
+ lea r3, [strideq*3] ; r3 = 3 * stride
+ psubusb m7, m5, m6 ; max(topleft - top, 0)
+ psubusb m0, m6, m5 ; max(top - topleft, 0)
+ por m7, m0 ; ldiff
+.w4_loop:
+ sub tlq, 8 ; step down to the next 8 left pixels (read from below topleft)
+ vpbroadcastq m3, [tlq] ; the same 8 bytes in every qword
+ pshufb m3, m8 ; left (each dword = one row's left pixel x4; rows 0,4,1,5 | 2,6,3,7)
+ PAETH 6, 7 ; m0 = 8 predicted rows of 4 pixels
+ vextracti128 xm1, m0, 1 ; xm1 = upper 128 bits of m0
+ movd [dstq+strideq*0], xm0 ; row 0
+ pextrd [dstq+strideq*1], xm0, 2 ; row 1
+ movd [dstq+strideq*2], xm1 ; row 2
+ pextrd [dstq+r3 ], xm1, 2 ; row 3
+ cmp hd, 4 ; remaining height is exactly 4: rows 4-7 must not be written
+ je .ret
+ lea dstq, [dstq+strideq*4] ; advance 4 rows
+ pextrd [dstq+strideq*0], xm0, 1 ; row 4
+ pextrd [dstq+strideq*1], xm0, 3 ; row 5
+ pextrd [dstq+strideq*2], xm1, 1 ; row 6
+ pextrd [dstq+r3 ], xm1, 3 ; row 7
+ lea dstq, [dstq+strideq*4] ; advance 4 more rows
+ sub hd, 8 ; 8 rows done
+ jg .w4_loop
+.ret:
+ RET
+ALIGN function_align
+.w8: ; width 8: 4 rows per iteration
+ vpbroadcastq m6, [tlq+1] ; top (the 8 top pixels repeated in every qword)
+ mova m8, [r5-ipred_paeth_avx2_table+paeth_shuf] ; same shuffle as .w4, applied to a dword broadcast here
+ lea r3, [strideq*3] ; r3 = 3 * stride
+ psubusb m7, m5, m6 ; max(topleft - top, 0)
+ psubusb m0, m6, m5 ; max(top - topleft, 0)
+ por m7, m0 ; ldiff
+.w8_loop:
+ sub tlq, 4 ; step down to the next 4 left pixels
+ vpbroadcastd m3, [tlq] ; the same 4 bytes in every dword
+ pshufb m3, m8 ; left (each qword = one row's left pixel x8; rows 0,1 | 2,3)
+ PAETH 6, 7 ; m0 = 4 predicted rows of 8 pixels
+ vextracti128 xm1, m0, 1 ; xm1 = upper 128 bits of m0
+ movq [dstq+strideq*0], xm0 ; row 0
+ movhps [dstq+strideq*1], xm0 ; row 1
+ movq [dstq+strideq*2], xm1 ; row 2
+ movhps [dstq+r3 ], xm1 ; row 3
+ lea dstq, [dstq+strideq*4] ; advance 4 rows
+ sub hd, 4 ; 4 rows done
+ jg .w8_loop
+ RET
+ALIGN function_align
+.w16: ; width 16: 2 rows per iteration
+ vbroadcasti128 m6, [tlq+1] ; top (the 16 top pixels in both 128-bit lanes)
+ mova xm8, xm4 ; lower half = 1, upper half = 0
+ psubusb m7, m5, m6 ; max(topleft - top, 0)
+ psubusb m0, m6, m5 ; max(top - topleft, 0)
+ por m7, m0 ; ldiff
+.w16_loop:
+ sub tlq, 2 ; step down to the next 2 left pixels
+ vpbroadcastd m3, [tlq] ; only bytes 0 and 1 of the loaded dword are selected below
+ pshufb m3, m8 ; left (low lane = byte 1 -> row 0, high lane = byte 0 -> row 1)
+ PAETH 6, 7 ; m0 = 2 predicted rows of 16 pixels
+ mova [dstq+strideq*0], xm0 ; row 0
+ vextracti128 [dstq+strideq*1], m0, 1 ; row 1
+ lea dstq, [dstq+strideq*2] ; advance 2 rows
+ sub hd, 2 ; 2 rows done
+ jg .w16_loop
+ RET
+ALIGN function_align
+.w32: ; width 32: 1 row per iteration
+ movu m6, [tlq+1] ; top (32 pixels)
+ psubusb m7, m5, m6 ; max(topleft - top, 0)
+ psubusb m0, m6, m5 ; max(top - topleft, 0)
+ por m7, m0 ; ldiff
+.w32_loop:
+ dec tlq ; step down to the next left pixel
+ vpbroadcastb m3, [tlq] ; left pixel of this row in every byte
+ PAETH 6, 7 ; m0 = one predicted row of 32 pixels
+ mova [dstq], m0 ; store the row
+ add dstq, strideq ; next row
+ dec hd
+ jg .w32_loop
+ RET
+ALIGN function_align
+.w64: ; width 64: 1 row per iteration, handled as two 32-pixel halves
+ movu m6, [tlq+ 1] ; top, columns 0-31
+ movu m7, [tlq+33] ; top, columns 32-63
+%if WIN64
+ movaps r4m, xmm9 ; m9 is beyond the 9 vector regs declared in cglobal, so save xmm9 by hand; NOTE(review): reuses the stack home of argument h (already loaded into hd), assumes 16 aligned writable bytes there
+%endif
+ psubusb m8, m5, m6 ; max(topleft - top, 0), columns 0-31
+ psubusb m0, m6, m5 ; max(top - topleft, 0), columns 0-31
+ psubusb m9, m5, m7 ; max(topleft - top, 0), columns 32-63
+ psubusb m1, m7, m5 ; max(top - topleft, 0), columns 32-63
+ por m8, m0 ; ldiff, columns 0-31
+ por m9, m1 ; ldiff, columns 32-63
+.w64_loop:
+ dec tlq ; step down to the next left pixel
+ vpbroadcastb m3, [tlq] ; left pixel of this row in every byte
+ PAETH 6, 8 ; m0 = columns 0-31 of this row
+ mova [dstq+32*0], m0
+ PAETH 7, 9 ; m0 = columns 32-63 of this row
+ mova [dstq+32*1], m0
+ add dstq, strideq ; next row
+ dec hd
+ jg .w64_loop
+%if WIN64
+ movaps xmm9, r4m ; restore the xmm9 value saved on entry to .w64
+%endif
 RET
%endif
--- a/src/x86/ipred_init.c
+++ b/src/x86/ipred_init.c
@@ -34,6 +34,7 @@
decl_angular_ipred_fn(dav1d_ipred_dc_left_avx2);
decl_angular_ipred_fn(dav1d_ipred_h_avx2);
decl_angular_ipred_fn(dav1d_ipred_v_avx2);
+decl_angular_ipred_fn(dav1d_ipred_paeth_avx2); /* AVX2 Paeth predictor, implemented in src/x86/ipred.asm */
void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
@@ -47,5 +48,6 @@
c->intra_pred[LEFT_DC_PRED] = dav1d_ipred_dc_left_avx2;
c->intra_pred[HOR_PRED] = dav1d_ipred_h_avx2;
c->intra_pred[VERT_PRED] = dav1d_ipred_v_avx2;
+ c->intra_pred[PAETH_PRED] = dav1d_ipred_paeth_avx2;
#endif
}