ref: 205b723e569947d0fd0d2a65e23a55c424a119a3
parent: 33ce38293c908a94ab376193c96657a30807503c
author: Henrik Gramner <[email protected]>
date: Fri Jan 18 14:52:20 EST 2019
Add SGR optimizations
--- a/src/looprestoration_tmpl.c
+++ b/src/looprestoration_tmpl.c
@@ -446,11 +446,11 @@
const unsigned p = imax(a * n - b * b, 0);
const unsigned z = (p * s + (1 << 19)) >> 20;
+ const unsigned x = dav1d_sgr_x_by_x[imin(z, 255)]; // index clamped to the last table entry (255)
- const int x = dav1d_sgr_x_by_xplus1[imin(z, 255)];
// This is where we invert A and B, so that B is of size coef.
- AA[i] = (((1U << 8) - x) * BB[i] * sgr_one_by_x + (1 << 11)) >> 12;
- BB[i] = x;
+ AA[i] = (x * BB[i] * sgr_one_by_x + (1 << 11)) >> 12; // new x equals the old (256 - x), so the explicit subtraction is gone; rounded >> 12
+ BB[i] = 256 - x; // same value the old code stored as x (new table entry = 256 - old table entry)
}
AA += step * REST_UNIT_STRIDE;
BB += step * REST_UNIT_STRIDE;
--- a/src/tables.c
+++ b/src/tables.c
@@ -502,25 +502,25 @@
{ 2, 0, 22, -1 },
};
-const int dav1d_sgr_x_by_xplus1[256] = {
- 1, 128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239,
- 240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247,
- 248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250,
- 250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252,
- 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253,
- 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,
- 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254,
- 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
- 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
- 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
- 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
- 254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- 256,
+const uint8_t dav1d_sgr_x_by_x[256] = { // entry i == 256 - (removed dav1d_sgr_x_by_xplus1[i]); range 0..255, so one byte per entry suffices
+ 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17,
+ 16, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9,
+ 8, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6,
+ 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 0
};
const int8_t ALIGN(dav1d_mc_subpel_filters[5][15][8], 8) = {
--- a/src/tables.h
+++ b/src/tables.h
@@ -107,7 +107,7 @@
extern const Dav1dWarpedMotionParams dav1d_default_wm_params;
extern const int16_t dav1d_sgr_params[16][4];
-extern const int dav1d_sgr_x_by_xplus1[256];
+extern const uint8_t dav1d_sgr_x_by_x[256]; // byte-sized replacement for dav1d_sgr_x_by_xplus1; the asm gathers from it at byte granularity
extern const int8_t dav1d_mc_subpel_filters[5][15][8];
extern const int8_t dav1d_mc_warp_filter[193][8];
--- a/src/x86/looprestoration.asm
+++ b/src/x86/looprestoration.asm
@@ -42,14 +42,12 @@
pw_16380: times 2 dw 16380
pw_0_128: dw 0, 128
pw_5_6: dw 5, 6
-pw_82: times 2 dw 82
-pw_91_5: dw 91, 5
pd_6: dd 6
-pd_255: dd 255
pd_1024: dd 1024
-pd_0x80000: dd 0x80000
+pd_0xf0080029: dd 0xf0080029 ; sgr_calc_ab2: lo word 0x0029 = 41 (multiplier), hi word 0xf008 = rounding (+8) and clamp/index bias (0xf000)
+pd_0xf00801c7: dd 0xf00801c7 ; sgr_calc_ab1: lo word 0x01c7 = 455 (multiplier), hi word 0xf008 = rounding (+8) and clamp/index bias (0xf000)
-cextern sgr_x_by_xplus1
+cextern sgr_x_by_x
SECTION .text
@@ -477,76 +475,65 @@
RET
INIT_YMM avx2
-cglobal sgr_calc_ab1, 4, 6, 14, a, b, w, h, s
+cglobal sgr_calc_ab1, 4, 6, 11, a, b, w, h, s ; vector register count drops from 14 to 11: only m0-m10 are used below
sub aq, (384+16-1)*4
sub bq, (384+16-1)*2
add hd, 2
- lea r5, [sgr_x_by_xplus1]
- pxor m6, m6
- vpbroadcastd m7, [pw_91_5]
+ lea r5, [sgr_x_by_x-0xf03] ; pre-biased base: gather indices carry +0xf00, and the dword load starts 3 bytes early so the wanted byte is the top byte
%ifidn sd, sm
- movd xm8, sd
- vpbroadcastd m8, xm8
+ movd xm6, sd
+ vpbroadcastd m6, xm6 ; m6 = s in every dword
%else
- vpbroadcastd m8, sm
+ vpbroadcastd m6, sm ; m6 = s in every dword
%endif
- vpbroadcastd m9, [pd_0x80000]
- vpbroadcastd m10, [pd_255]
- psrad m12, m9, 8 ; pd_2048
- psrad m11, m9, 11 ; pd_256
- pcmpeqb m13, m13
+ vpbroadcastd m8, [pd_0xf00801c7] ; per dword: lo word = 455, hi word = 0xf008
+ vpbroadcastd m9, [pw_256] ; pw_256 is defined outside this hunk; presumably 256 in every word (consistent with the pd_2048 note below)
+ pcmpeqb m7, m7 ; all ones, copied into m5 as the gather mask
+ psrld m10, m9, 13 ; pd_2048 (rounding term for the final >> 12)
DEFINE_ARGS a, b, w, h, x
+
.loop_y:
mov xq, -2
.loop_x:
- movu xm0, [aq+xq*4+ 0]
- movu xm1, [aq+xq*4+16]
- vinserti128 m0, [aq+xq*4+ 0+(384+16)*4], 1
- vinserti128 m1, [aq+xq*4+16+(384+16)*4], 1
- movu xm2, [bq+xq*2]
- vinserti128 m2, [bq+xq*2+(384+16)*2], 1
- pslld m3, m0, 3
- pslld m4, m1, 3
- paddd m3, m0 ; aa * 9 [first half]
- paddd m4, m1 ; aa * 9 [second half]
- punpcklwd m0, m6, m2
- punpckhwd m2, m6, m2
- pmaddwd m1, m0, m0
- pmaddwd m5, m2, m2
- pmaddwd m0, m7
- pmaddwd m2, m7
- psubd m3, m1 ; p = aa * 9 - bb * bb [first half]
- psubd m4, m5 ; p = aa * 9 - bb * bb [second half]
- pmulld m3, m8
- pmulld m4, m8
- paddd m3, m9
- paddd m4, m9
- psrld m3, 20 ; z [first half]
- psrld m4, 20 ; z [second half]
- pminsd m3, m10
- pminsd m4, m10
- mova m5, m13
- vpgatherdd m1, [r5+m3*4], m5 ; xx [first half]
- mova m5, m13
- vpgatherdd m3, [r5+m4*4], m5 ; xx [second half]
- psubd m5, m11, m1
- psubd m4, m11, m3
- packssdw m1, m3
- pmullw m5, m7
- pmullw m4, m7
- pmaddwd m5, m0
- pmaddwd m4, m2
- paddd m5, m12
- paddd m4, m12
- psrad m5, 12
- psrad m4, 12
- movu [bq+xq*2], xm1
- vextracti128 [bq+xq*2+(384+16)*2], m1, 1
- movu [aq+xq*4+ 0], xm5
- movu [aq+xq*4+16], xm4
- vextracti128 [aq+xq*4+ 0+(384+16)*4], m5, 1
- vextracti128 [aq+xq*4+16+(384+16)*4], m4, 1
-
+ pmovzxwd m0, [bq+xq*2] ; bb, row 0: 8 words zero-extended to dwords
+ pmovzxwd m1, [bq+xq*2+(384+16)*2] ; bb, row 1 (one (384+16)-element stride further)
+ movu m2, [aq+xq*4] ; aa, row 0: 8 dwords
+ movu m3, [aq+xq*4+(384+16)*4] ; aa, row 1
+ pslld m4, m2, 3 ; aa * 8
+ pslld m5, m3, 3
+ paddd m2, m4 ; aa * 9
+ paddd m3, m5
+ pmaddwd m4, m0, m0 ; bb * bb (hi word of each dword is 0 after zero-extension)
+ pmaddwd m5, m1, m1
+ pmaddwd m0, m8 ; bb * 455; the 0xf008 hi word multiplies the zero hi word of bb. NOTE(review): 455 presumably equals sgr_one_by_x for this filter - confirm
+ pmaddwd m1, m8
+ psubd m2, m4 ; p = aa * 9 - bb * bb
+ psubd m3, m5
+ pmulld m2, m6 ; p * s
+ pmulld m3, m6
+ paddusw m2, m8 ; hi word += 0xf008, saturating: +8 is the (1 << 19) rounding, 0xf000 makes any z > 255 saturate to 0xffff; lo word change is shifted out below
+ paddusw m3, m8
+ psrld m2, 20 ; 0xf00 + min(z, 255)
+ psrld m3, 20
+ mova m5, m7 ; reload the all-ones mask (the gather clears its mask register)
+ vpgatherdd m4, [r5+m2], m5 ; dword at sgr_x_by_x + z - 3, table byte ends up in bits 24-31; NOTE(review): for z < 3 the load starts up to 3 bytes before the table
+ mova m5, m7
+ vpgatherdd m2, [r5+m3], m5
+ psrld m4, 24 ; x = sgr_x_by_x[z], row 0
+ psrld m2, 24 ; x = sgr_x_by_x[z], row 1
+ pmulld m0, m4 ; x * bb * 455
+ pmulld m1, m2
+ packssdw m4, m2 ; x as words; packs per 128-bit lane, so qwords are row0 lo, row1 lo, row0 hi, row1 hi
+ psubw m4, m9, m4 ; 256 - x (given m9 holds 256 per word)
+ vpermq m4, m4, q3120 ; swap the middle qwords: low 128 bits = row 0, high 128 bits = row 1
+ paddd m0, m10 ; + 2048 (rounding)
+ paddd m1, m10
+ psrld m0, 12 ; (x * bb * 455 + 2048) >> 12
+ psrld m1, 12
+ movu [bq+xq*2], xm4 ; b row 0 <- 256 - x
+ vextracti128 [bq+xq*2+(384+16)*2], m4, 1 ; b row 1 <- 256 - x
+ movu [aq+xq*4], m0 ; a row 0 <- scaled product
+ movu [aq+xq*4+(384+16)*4], m1 ; a row 1 <- scaled product
add xd, 8
cmp xd, wd
jl .loop_x
@@ -903,78 +890,67 @@
jmp .loop_y_noload
INIT_YMM avx2
-cglobal sgr_calc_ab2, 4, 6, 14, a, b, w, h, s
+cglobal sgr_calc_ab2, 4, 6, 11, a, b, w, h, s ; vector register count drops from 14 to 11: only m0-m10 are used below
sub aq, (384+16-1)*4
sub bq, (384+16-1)*2
add hd, 2
- lea r5, [sgr_x_by_xplus1]
- pxor m6, m6
- vpbroadcastd m7, [pw_82]
+ lea r5, [sgr_x_by_x-0xf03] ; pre-biased base: gather indices carry +0xf00, and the dword load starts 3 bytes early so the wanted byte is the top byte
%ifidn sd, sm
- movd xm8, sd
- vpbroadcastd m8, xm8
+ movd xm6, sd
+ vpbroadcastd m6, xm6 ; m6 = s in every dword
%else
- vpbroadcastd m8, sm
+ vpbroadcastd m6, sm ; m6 = s in every dword
%endif
- vpbroadcastd m9, [pd_0x80000]
- vpbroadcastd m10, [pd_255]
- psrad m12, m9, 8 ; pd_2048
- psrad m11, m9, 11 ; pd_256
- pcmpeqb m13, m13
+ vpbroadcastd m8, [pd_0xf0080029] ; per dword: lo word = 41, hi word = 0xf008
+ vpbroadcastd m9, [pw_256] ; pw_256 is defined outside this hunk; presumably 256 in every word (consistent with the pd_512 note below)
+ pcmpeqb m7, m7 ; all ones, copied into m5 as the gather mask
+ psrld m10, m9, 15 ; pd_512 (rounding term for the final >> 10)
DEFINE_ARGS a, b, w, h, x
.loop_y:
mov xq, -2
.loop_x:
- movu xm0, [aq+xq*4+ 0]
- movu xm1, [aq+xq*4+16]
- vinserti128 m0, [aq+xq*4+32], 1
- vinserti128 m1, [aq+xq*4+48], 1
- movu m2, [bq+xq*2]
- pslld m3, m0, 5 ; aa * 32 [first half]
- pslld m4, m1, 5 ; aa * 32 [second half]
- paddd m3, m0 ; aa * 33 [first half]
- paddd m4, m1 ; aa * 33 [first half]
- pslld m0, 3 ; aa * 8 [first half]
- pslld m1, 3 ; aa * 8 [second half]
- psubd m3, m0 ; aa * 25 [first half]
- psubd m4, m1 ; aa * 25 [second half]
- punpcklwd m0, m2, m6
- punpckhwd m2, m6
- pmaddwd m1, m0, m0
- pmaddwd m5, m2, m2
- paddw m0, m0
- paddw m2, m2
- psubd m3, m1 ; p = aa * 25 - bb * bb [first half]
- psubd m4, m5 ; p = aa * 25 - bb * bb [second half]
- pmulld m3, m8
- pmulld m4, m8
- paddd m3, m9
- paddd m4, m9
- psrld m3, 20 ; z [first half]
- psrld m4, 20 ; z [second half]
- pminsd m3, m10
- pminsd m4, m10
- mova m5, m13
- vpgatherdd m1, [r5+m3*4], m5 ; xx [first half]
- mova m5, m13
- vpgatherdd m3, [r5+m4*4], m5 ; xx [second half]
- psubd m5, m11, m1
- psubd m4, m11, m3
- packssdw m1, m3
- pmullw m5, m7
- pmullw m4, m7
- pmaddwd m5, m0
- pmaddwd m4, m2
- paddd m5, m12
- paddd m4, m12
- psrad m5, 12
- psrad m4, 12
- movu [bq+xq*2], m1
- movu [aq+xq*4+ 0], xm5
- movu [aq+xq*4+16], xm4
- vextracti128 [aq+xq*4+32], m5, 1
- vextracti128 [aq+xq*4+48], m4, 1
-
+ pmovzxwd m0, [bq+xq*2+ 0] ; bb, elements 0-7: words zero-extended to dwords
+ pmovzxwd m1, [bq+xq*2+16] ; bb, elements 8-15
+ movu m2, [aq+xq*4+ 0] ; aa, elements 0-7
+ movu m3, [aq+xq*4+32] ; aa, elements 8-15
+ pslld m4, m2, 3 ; aa * 8
+ pslld m5, m3, 3
+ paddd m2, m4 ; aa * 9
+ paddd m3, m5
+ paddd m4, m4 ; aa * 16
+ paddd m5, m5
+ paddd m2, m4 ; aa * 25
+ paddd m3, m5
+ pmaddwd m4, m0, m0 ; bb * bb (hi word of each dword is 0 after zero-extension)
+ pmaddwd m5, m1, m1
+ psubd m2, m4 ; p = aa * 25 - bb * bb
+ psubd m3, m5
+ pmulld m2, m6 ; p * s
+ pmulld m3, m6
+ paddusw m2, m8 ; hi word += 0xf008, saturating: +8 is the (1 << 19) rounding, 0xf000 makes any z > 255 saturate to 0xffff; lo word change is shifted out below
+ paddusw m3, m8
+ psrld m2, 20 ; 0xf00 + min(z, 255)
+ psrld m3, 20
+ mova m5, m7 ; reload the all-ones mask (the gather clears its mask register)
+ vpgatherdd m4, [r5+m2], m5 ; dword at sgr_x_by_x + z - 3, table byte ends up in bits 24-31; NOTE(review): for z < 3 the load starts up to 3 bytes before the table
+ mova m5, m7
+ vpgatherdd m2, [r5+m3], m5
+ psrld m4, 24 ; x = sgr_x_by_x[z], elements 0-7
+ psrld m2, 24 ; x = sgr_x_by_x[z], elements 8-15
+ packssdw m3, m4, m2 ; x as words; packs per 128-bit lane, so qwords are el 0-3, 8-11, 4-7, 12-15
+ pmullw m4, m8 ; lo word: x * 41; hi word: 0 * 0xf008 = 0
+ pmullw m2, m8
+ psubw m3, m9, m3 ; 256 - x (given m9 holds 256 per word)
+ vpermq m3, m3, q3120 ; swap the middle qwords to restore element order 0-15
+ pmaddwd m0, m4 ; bb * (x * 41)
+ pmaddwd m1, m2
+ paddd m0, m10 ; + 512 (rounding)
+ paddd m1, m10
+ psrld m0, 10 ; (bb * x * 41 + 512) >> 10; NOTE(review): matches the C ">> 12" form only if sgr_one_by_x is 4 * 41 = 164 here - confirm
+ psrld m1, 10
+ movu [bq+xq*2], m3 ; b <- 256 - x, 16 words
+ movu [aq+xq*4+ 0], m0 ; a <- scaled product, elements 0-7
+ movu [aq+xq*4+32], m1 ; a <- scaled product, elements 8-15
add xd, 16
cmp xd, wd
jl .loop_x