ref: 059ad248d6b8b5a6115875acd9631746c8c3dc34
parent: 12a64ec7a0f6be0315e5e4036d0c4608038846c6
author: Henrik Gramner <[email protected]>
date: Sat May 23 12:50:12 EDT 2020
x86: Add minor looprestoration asm optimizations

Eliminate store forwarding stalls.
Use shorter instruction encodings where possible.
Misc. tweaks.
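
As a side note, here is a minimal C sketch of the sgr_weighted2 interface
change (the weight values below are hypothetical; the packing expression is
the one used in looprestoration_init_tmpl.c). Packing both 16-bit weights into
a single uint32_t means the value is stored and reloaded with matching widths,
instead of two 16-bit stores being reloaded by one 32-bit broadcast on the asm
side, which is presumably the store-forwarding stall referred to above:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        const int16_t sgr_wt[2] = { 31, 13 }; /* hypothetical weights */

        /* Old interface: an int16_t[2] built on the stack and passed by
         * pointer; the asm then reloaded it with a single 32-bit broadcast
         * spanning both 16-bit stores. */
        const int16_t wt_arr[2] = { sgr_wt[0], 128 - sgr_wt[0] - sgr_wt[1] };

        /* New interface: both weights packed into one uint32_t scalar. */
        const uint32_t wt = ((128 - sgr_wt[0] - sgr_wt[1]) << 16) |
                            (uint16_t)sgr_wt[0];

        printf("array: { %d, %d }  packed: 0x%08" PRIx32 "\n",
               wt_arr[0], wt_arr[1], wt);
        return 0;
    }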
--- a/src/x86/looprestoration.asm
+++ b/src/x86/looprestoration.asm
@@ -51,9 +51,12 @@
SECTION .text
INIT_YMM avx2
-cglobal wiener_filter_h, 8, 12, 16, dst, left, src, stride, fh, w, h, edge
+cglobal wiener_filter_h, 5, 12, 16, dst, left, src, stride, fh, w, h, edge
+ mov edged, edgem
vpbroadcastb m15, [fhq+0]
+ movifnidn wd, wm
vpbroadcastb m14, [fhq+2]
+ mov hd, hm
vpbroadcastb m13, [fhq+4]
vpbroadcastw m12, [fhq+6]
vpbroadcastd m11, [pw_2048]
@@ -64,7 +67,7 @@
; if (edge & has_right) align_w_to_32
; else w -= 32, and use that as limit in x loop
- test edged, 2 ; has_right
+ test edgeb, 2 ; has_right
jnz .align
mov xlimq, -3
jmp .loop
@@ -80,7 +83,7 @@
lea xq, [wq+xlimq]
; load left edge pixels
- test edged, 1 ; have_left
+ test edgeb, 1 ; have_left
jz .emu_left
test leftq, leftq ; left == NULL for the edge-extended bottom/top
jz .load_left_combined
@@ -203,17 +206,19 @@
jg .loop
RET
-cglobal wiener_filter_v, 7, 10, 16, dst, stride, mid, w, h, fv, edge
- vpbroadcastd m14, [fvq+4]
- vpbroadcastd m15, [fvq]
- vpbroadcastd m13, [pw_0_128]
- paddw m14, m13
+cglobal wiener_filter_v, 4, 10, 13, dst, stride, mid, w, h, fv, edge
+ movifnidn fvq, fvmp
+ mov edged, edgem
+ movifnidn hd, hm
+ vpbroadcastd m10, [fvq]
+ vpbroadcastd m11, [fvq+4]
+ vpbroadcastd m0, [pw_0_128]
vpbroadcastd m12, [pd_1024]
DEFINE_ARGS dst, stride, mid, w, h, ylim, edge, y, mptr, dstptr
- mov ylimd, edged
- and ylimd, 8 ; have_bottom
- shr ylimd, 2
+ rorx ylimd, edged, 2
+ paddw m11, m0
+ and ylimd, 2 ; have_bottom
sub ylimd, 3
; main x loop for vertical filter, does one column of 16 pixels
@@ -221,7 +226,7 @@
mova m3, [midq] ; middle line
; load top pixels
- test edged, 4 ; have_top
+ test edgeb, 4 ; have_top
jz .emu_top
mova m0, [midq-384*4]
mova m2, [midq-384*2]
@@ -276,27 +281,28 @@
; try to structure the loop so that the common case is evaluated fastest
mova m6, [mptrq+384*6]
.loop:
- paddw m7, m0, m6
- paddw m8, m1, m5
- paddw m9, m2, m4
- punpcklwd m10, m7, m8
- punpckhwd m7, m8
- punpcklwd m11, m9, m3
- punpckhwd m9, m3
- pmaddwd m10, m15
- pmaddwd m7, m15
- pmaddwd m11, m14
- pmaddwd m9, m14
- paddd m10, m11
+ paddw m0, m6
+ paddw m7, m1, m5
+ paddw m8, m2, m4
+ punpcklwd m9, m0, m7
+ punpckhwd m0, m7
+ punpcklwd m7, m8, m3
+ punpckhwd m8, m3
+ pmaddwd m9, m10
+ pmaddwd m0, m10
+ pmaddwd m7, m11
+ pmaddwd m8, m11
+ add mptrq, 384*2
paddd m7, m9
- paddd m10, m12
+ paddd m0, m8
paddd m7, m12
- psrad m10, 11
+ paddd m0, m12
psrad m7, 11
- packssdw m10, m7
- packuswb m10, m10
- vpermq m10, m10, q3120
- mova [dstptrq], xm10
+ psrad m0, 11
+ packssdw m7, m0
+ vextracti128 xm0, m7, 1
+ packuswb xm7, xm0
+ mova [dstptrq], xm7
; shift pixels one position
mova m0, m1
mova m1, m2
@@ -305,44 +311,44 @@
mova m4, m5
mova m5, m6
add dstptrq, strideq
- add mptrq, 384*2
dec yd
jg .loop_load
; for the bottom pixels, continue using m6 (as extended edge)
cmp yd, ylimd
jg .loop
-
- add dstq, 16
add midq, 32
+ add dstq, 16
sub wd, 16
jg .loop_x
RET
INIT_YMM avx2
-cglobal sgr_box3_h, 8, 11, 8, sumsq, sum, left, src, stride, w, h, edge, x, xlim
- mov xlimd, edged
+cglobal sgr_box3_h, 5, 11, 7, sumsq, sum, left, src, stride, w, h, edge, x, xlim
+ mov xlimd, edgem
+ movifnidn wd, wm
+ mov hd, hm
+ mov edged, xlimd
and xlimd, 2 ; have_right
- add wd, xlimd
- xor xlimd, 2 ; 2*!have_right
- jnz .no_right
- add wd, 15
+ jz .no_right
+ add wd, 2+15
and wd, ~15
.no_right:
+ lea r10, [pb_right_ext_mask+32]
+ xor xlimd, 2 ; 2*!have_right
pxor m1, m1
- lea srcq, [srcq+wq]
+ add srcq, wq
lea sumq, [sumq+wq*2-2]
lea sumsqq, [sumsqq+wq*4-4]
neg wq
- lea r10, [pb_right_ext_mask+32]
.loop_y:
mov xq, wq
; load left
- test edged, 1 ; have_left
+ test edgeb, 1 ; have_left
jz .no_left
test leftq, leftq
jz .load_left_from_main
- pinsrw xm0, [leftq+2], 7
+ vpbroadcastw xm0, [leftq+2]
add leftq, 4
jmp .expand_x
.no_left:
@@ -349,7 +355,7 @@
vpbroadcastb xm0, [srcq+xq]
jmp .expand_x
.load_left_from_main:
- pinsrw xm0, [srcq+xq-2], 7
+ vpbroadcastw xm0, [srcq+xq-2]
.expand_x:
punpckhbw xm0, xm1
@@ -359,8 +365,8 @@
.partial_load_and_extend:
vpbroadcastb m3, [srcq-1]
pmovzxbw m2, [srcq+xq]
- punpcklbw m3, m1
movu m4, [r10+xq*2]
+ punpcklbw m3, m1
pand m2, m4
pandn m4, m3
por m2, m4
@@ -380,22 +386,21 @@
punpcklwd m5, m3, m2
punpckhwd m6, m3, m2
paddw m3, m4
- punpcklwd m7, m4, m1
+ punpcklwd m0, m4, m1
punpckhwd m4, m1
pmaddwd m5, m5
pmaddwd m6, m6
- pmaddwd m7, m7
+ pmaddwd m0, m0
pmaddwd m4, m4
- paddd m5, m7
- paddd m6, m4
paddw m3, m2
+ paddd m5, m0
+ vextracti128 xm0, m2, 1
+ paddd m6, m4
movu [sumq+xq*2], m3
- movu [sumsqq+xq*4+ 0], xm5
- movu [sumsqq+xq*4+16], xm6
+ movu [sumsqq+xq*4+ 0], xm5
+ movu [sumsqq+xq*4+16], xm6
vextracti128 [sumsqq+xq*4+32], m5, 1
vextracti128 [sumsqq+xq*4+48], m6, 1
-
- vextracti128 xm0, m2, 1
add xq, 16
; if x <= -16 we can reload more pixels
@@ -418,25 +423,25 @@
RET
INIT_YMM avx2
-cglobal sgr_box3_v, 5, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
+cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
+ movifnidn edged, edgem
mov xq, -2
- mov ylimd, edged
- and ylimd, 8 ; have_bottom
- shr ylimd, 2
+ rorx ylimd, edged, 2
+ and ylimd, 2 ; have_bottom
sub ylimd, 2 ; -2 if have_bottom=0, else 0
.loop_x:
lea yd, [hq+ylimq+2]
lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
lea sum_ptrq, [sumq+xq*2+2-(384+16)*2]
- test edged, 4 ; have_top
+ test edgeb, 4 ; have_top
jnz .load_top
movu m0, [sumsq_ptrq+(384+16)*4*1]
movu m1, [sumsq_ptrq+(384+16)*4*1+32]
+ movu m6, [sum_ptrq+(384+16)*2*1]
mova m2, m0
mova m3, m1
mova m4, m0
mova m5, m1
- movu m6, [sum_ptrq+(384+16)*2*1]
mova m7, m6
mova m8, m6
jmp .loop_y_noload
@@ -550,8 +555,10 @@
RET
INIT_YMM avx2
-cglobal sgr_finish_filter1, 7, 13, 16, t, src, stride, a, b, w, h, \
+cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \
tmp_ptr, src_ptr, a_ptr, b_ptr, x, y
+ movifnidn wd, wm
+ mov hd, hm
vpbroadcastd m15, [pw_16]
xor xd, xd
.loop_x:
@@ -654,45 +661,53 @@
RET
INIT_YMM avx2
-cglobal sgr_weighted1, 6, 6, 7, dst, stride, t, w, h, wt
- movd xm0, wtd
- vpbroadcastw m0, xm0
- psllw m0, 4
+cglobal sgr_weighted1, 4, 6, 6, dst, stride, t, w, h, wt
+%ifidn wtd, wtm
+ shl wtd, 4
+ movd xm5, wtd
+ vpbroadcastw m5, xm5
+%else
+ vpbroadcastw m5, wtm
+ mov hd, hm
+ psllw m5, 4
+%endif
DEFINE_ARGS dst, stride, t, w, h, idx
.loop_y:
xor idxd, idxd
.loop_x:
- mova m1, [tq+idxq*2+ 0]
- mova m4, [tq+idxq*2+32]
+ mova m0, [tq+idxq*2+ 0]
+ mova m1, [tq+idxq*2+32]
pmovzxbw m2, [dstq+idxq+ 0]
- pmovzxbw m5, [dstq+idxq+16]
- psllw m3, m2, 4
- psllw m6, m5, 4
- psubw m1, m3
- psubw m4, m6
- pmulhrsw m1, m0
- pmulhrsw m4, m0
- paddw m1, m2
- paddw m4, m5
- packuswb m1, m4
- vpermq m1, m1, q3120
- mova [dstq+idxq], m1
+ pmovzxbw m3, [dstq+idxq+16]
+ psllw m4, m2, 4
+ psubw m0, m4
+ psllw m4, m3, 4
+ psubw m1, m4
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ vpermq m0, m0, q3120
+ mova [dstq+idxq], m0
add idxd, 32
cmp idxd, wd
jl .loop_x
+ add tq, 384*2
add dstq, strideq
- add tq, 384 * 2
dec hd
jg .loop_y
RET
INIT_YMM avx2
-cglobal sgr_box5_h, 8, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xlim
- test edged, 2 ; have_right
+cglobal sgr_box5_h, 5, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xlim
+ mov edged, edgem
+ movifnidn wd, wm
+ mov hd, hm
+ test edgeb, 2 ; have_right
jz .no_right
xor xlimd, xlimd
- add wd, 2
- add wd, 15
+ add wd, 2+15
and wd, ~15
jmp .right_done
.no_right:
@@ -699,30 +714,30 @@
mov xlimd, 3
sub wd, 1
.right_done:
+ lea r10, [pb_right_ext_mask+32]
pxor m1, m1
lea srcq, [srcq+wq+1]
lea sumq, [sumq+wq*2-2]
lea sumsqq, [sumsqq+wq*4-4]
neg wq
- lea r10, [pb_right_ext_mask+32]
.loop_y:
mov xq, wq
; load left
- test edged, 1 ; have_left
+ test edgeb, 1 ; have_left
jz .no_left
test leftq, leftq
jz .load_left_from_main
- movd xm0, [leftq]
- pinsrd xm0, [srcq+xq-1], 1
- pslldq xm0, 11
+ vpbroadcastd xm2, [leftq]
+ movd xm0, [srcq+xq-1]
add leftq, 4
+ palignr xm0, xm2, 1
jmp .expand_x
.no_left:
vpbroadcastb xm0, [srcq+xq-1]
jmp .expand_x
.load_left_from_main:
- pinsrd xm0, [srcq+xq-4], 3
+ vpbroadcastd xm0, [srcq+xq-4]
.expand_x:
punpckhbw xm0, xm1
@@ -734,8 +749,8 @@
.partial_load_and_extend:
vpbroadcastb m3, [srcq-1]
pmovzxbw m2, [srcq+xq]
- punpcklbw m3, m1
movu m4, [r10+xq*2]
+ punpcklbw m3, m1
pand m2, m4
pandn m4, m3
por m2, m4
@@ -775,8 +790,8 @@
paddd m7, m9
paddd m3, m5
movu [sumq+xq*2], m0
- movu [sumsqq+xq*4+ 0], xm7
- movu [sumsqq+xq*4+16], xm3
+ movu [sumsqq+xq*4+ 0], xm7
+ movu [sumsqq+xq*4+16], xm3
vextracti128 [sumsqq+xq*4+32], m7, 1
vextracti128 [sumsqq+xq*4+48], m3, 1
@@ -795,28 +810,29 @@
cmp xd, xlimd
jl .right_extend
+ add srcq, strideq
add sumsqq, (384+16)*4
add sumq, (384+16)*2
- add srcq, strideq
dec hd
jg .loop_y
RET
INIT_YMM avx2
-cglobal sgr_box5_v, 5, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
+cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
+ movifnidn edged, edgem
mov xq, -2
- mov ylimd, edged
- and ylimd, 8 ; have_bottom
- shr ylimd, 2
+ rorx ylimd, edged, 2
+ and ylimd, 2 ; have_bottom
sub ylimd, 3 ; -3 if have_bottom=0, else -1
.loop_x:
lea yd, [hq+ylimq+2]
lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
lea sum_ptrq, [sumq+xq*2+2-(384+16)*2]
- test edged, 4 ; have_top
+ test edgeb, 4 ; have_top
jnz .load_top
movu m0, [sumsq_ptrq+(384+16)*4*1]
movu m1, [sumsq_ptrq+(384+16)*4*1+32]
+ movu m10, [sum_ptrq+(384+16)*2*1]
mova m2, m0
mova m3, m1
mova m4, m0
@@ -823,7 +839,6 @@
mova m5, m1
mova m6, m0
mova m7, m1
- movu m10, [sum_ptrq+(384+16)*2*1]
mova m11, m10
mova m12, m10
mova m13, m10
@@ -833,10 +848,10 @@
movu m1, [sumsq_ptrq-(384+16)*4*1+32] ; l3/4sq [right]
movu m4, [sumsq_ptrq-(384+16)*4*0] ; l2sq [left]
movu m5, [sumsq_ptrq-(384+16)*4*0+32] ; l2sq [right]
- mova m2, m0
- mova m3, m1
movu m10, [sum_ptrq-(384+16)*2*1] ; l3/4
movu m12, [sum_ptrq-(384+16)*2*0] ; l2
+ mova m2, m0
+ mova m3, m1
mova m11, m10
.loop_y:
movu m6, [sumsq_ptrq+(384+16)*4*1] ; l1sq [left]
@@ -967,8 +982,10 @@
RET
INIT_YMM avx2
-cglobal sgr_finish_filter2, 7, 13, 13, t, src, stride, a, b, w, h, \
+cglobal sgr_finish_filter2, 5, 13, 13, t, src, stride, a, b, w, h, \
tmp_ptr, src_ptr, a_ptr, b_ptr, x, y
+ movifnidn wd, wm
+ mov hd, hm
vpbroadcastd m9, [pw_5_6]
vpbroadcastd m12, [pw_256]
psrlw m11, m12, 1 ; pw_128
@@ -1084,8 +1101,10 @@
RET
INIT_YMM avx2
-cglobal sgr_weighted2, 7, 7, 11, dst, stride, t1, t2, w, h, wt
- vpbroadcastd m0, [wtq]
+cglobal sgr_weighted2, 4, 7, 11, dst, stride, t1, t2, w, h, wt
+ movifnidn wd, wm
+ movifnidn hd, hm
+ vpbroadcastd m0, wtm
vpbroadcastd m10, [pd_1024]
DEFINE_ARGS dst, stride, t1, t2, w, h, idx
.loop_y:
--- a/src/x86/looprestoration_init_tmpl.c
+++ b/src/x86/looprestoration_init_tmpl.c
@@ -169,7 +169,7 @@
void dav1d_sgr_weighted2_##ext(pixel *dst, const ptrdiff_t stride, \
const coef *t1, const coef *t2, \
const int w, const int h, \
- const int16_t wt[2]); \
+ const uint32_t wt); \
\
static void sgr_filter_##ext(pixel *const dst, const ptrdiff_t dst_stride, \
const pixel (*const left)[4], \
@@ -194,7 +194,7 @@
w, h, dav1d_sgr_params[sgr_idx][2], edges); \
dav1d_sgr_filter1_##ext(tmp2, dst, dst_stride, left, lpf, lpf_stride, \
w, h, dav1d_sgr_params[sgr_idx][3], edges); \
- const int16_t wt[2] = { sgr_wt[0], 128 - sgr_wt[0] - sgr_wt[1] }; \
+ const uint32_t wt = ((128 - sgr_wt[0] - sgr_wt[1]) << 16) | (uint16_t) sgr_wt[0]; \
dav1d_sgr_weighted2_##ext(dst, dst_stride, tmp1, tmp2, w, h, wt); \
} \
}
--- a/src/x86/looprestoration_ssse3.asm
+++ b/src/x86/looprestoration_ssse3.asm
@@ -188,13 +188,13 @@
%define srcptrq srcq
%define dstptrq dstq
%define hd dword [esp+ 0]
- %define edged dword [esp+12]
+ %define edgeb byte [esp+12]
%define xlimd dword [esp+16]
%endif
; if (edge & has_right) align_w_to_16
; else w -= 3, and use that as limit in x loop
- test edged, 2 ; has_right
+ test edgeb, 2 ; has_right
jnz .align
mov xlimd, -3
jmp .loop
@@ -221,7 +221,7 @@
%endif
; load left edge pixels
- test edged, 1 ; have_left
+ test edgeb, 1 ; have_left
jz .emu_left
test leftq, leftq ; left == NULL for the edge-extended bottom/top
jz .load_left_combined
@@ -477,7 +477,7 @@
DEFINE_ARGS dst, stride, mid, w, h, y, edge
%define mptrq midq
%define dstptrq dstq
- %define edged dword [esp]
+ %define edgeb byte [esp]
%endif
; main x loop for vertical filter, does one column of 16 pixels
@@ -485,7 +485,7 @@
mova m3, [midq] ; middle line
; load top pixels
- test edged, 4 ; have_top
+ test edgeb, 4 ; have_top
jz .emu_top
mova m0, [midq-384*4]
mova m2, [midq-384*2]
@@ -604,8 +604,8 @@
mova m3, m4
mova m4, m5
mova m5, m6
- add dstptrq, strideq
add mptrq, 384*2
+ add dstptrq, strideq
dec yd
jg .loop_load
; for the bottom pixels, continue using m6 (as extended edge)
@@ -616,8 +616,8 @@
mov midq, [esp+8]
mov dstq, [esp+4]
%endif
- add dstq, 8
add midq, 16
+ add dstq, 8
sub wd, 8
jg .loop_x
RET
@@ -679,7 +679,7 @@
%define wq r0m
%define xlimd r1m
%define hd hmp
- %define edged edgemp
+ %define edgeb byte edgem
mov r6, edgem
and r6, 2 ; have_right
@@ -706,7 +706,7 @@
mov xq, wq
; load left
- test edged, 1 ; have_left
+ test edgeb, 1 ; have_left
jz .no_left
test leftq, leftq
jz .load_left_from_main
@@ -795,11 +795,13 @@
cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim
movifnidn edged, edgem
%else
-cglobal sgr_box3_v, 5, 7, 8, -28, sumsq, sum, w, h, edge, x, y
+cglobal sgr_box3_v, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y
%define sumsq_baseq dword [esp+0]
%define sum_baseq dword [esp+4]
%define ylimd dword [esp+8]
%define m8 [esp+12]
+ mov edged, r4m
+ mov hd, r3m
%endif
mov xq, -2
%if ARCH_X86_64
@@ -812,7 +814,7 @@
.loop_x:
mov sumsqq, sumsq_baseq
mov sumq, sum_baseq
- lea yd, [hd+ylimd+2]
+ lea yd, [hq+ylimq+2]
%else
mov yd, edged
and yd, 8 ; have_bottom
@@ -824,12 +826,12 @@
.loop_x:
mov sumsqd, sumsq_baseq
mov sumd, sum_baseq
- lea yd, [hd+2]
+ lea yd, [hq+2]
add yd, ylimd
%endif
lea sumsqq, [sumsqq+xq*4+4-(384+16)*4]
lea sumq, [sumq+xq*2+2-(384+16)*2]
- test edged, 4 ; have_top
+ test edgeb, 4 ; have_top
jnz .load_top
movu m0, [sumsqq+(384+16)*4*1]
movu m1, [sumsqq+(384+16)*4*1+16]
@@ -1180,10 +1182,10 @@
psubd m3, [aq-(384+16)*4*2+16] ; a:ctr+bottom [second half]
%endif
+ add srcq, strideq
add aq, (384+16)*4
add bq, (384+16)*2
add tq, 384*2
- add srcq, strideq
dec yd
jg .loop_y
add xd, 8
@@ -1237,7 +1239,7 @@
mova m11, [pb_0_1]
%else
cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
- %define edged edgemp
+ %define edgeb byte edgem
%define wd xd
%define wq wd
%define wm r5m
@@ -1249,7 +1251,7 @@
%define m11 [PIC_sym(pb_0_1)]
%endif
- test edged, 2 ; have_right
+ test edgeb, 2 ; have_right
jz .no_right
xor xlimd, xlimd
add wd, 2
@@ -1275,7 +1277,7 @@
.loop_y:
mov xq, wq
; load left
- test edged, 1 ; have_left
+ test edgeb, 1 ; have_left
jz .no_left
test leftq, leftq
jz .load_left_from_main
@@ -1401,9 +1403,9 @@
cmp xd, xlimd
jl .right_extend
+ add srcq, strideq
add sumsqq, (384+16)*4
add sumq, (384+16)*2
- add srcq, strideq
dec hd
jg .loop_y
%if ARCH_X86_32
@@ -1434,7 +1436,7 @@
lea yd, [hd+ylimd+2]
lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
lea sum_ptrq, [ sumq+xq*2+2-(384+16)*2]
- test edged, 4 ; have_top
+ test edgeb, 4 ; have_top
jnz .load_top
movu m0, [sumsq_ptrq+(384+16)*4*1]
movu m1, [sumsq_ptrq+(384+16)*4*1+16]
@@ -1520,7 +1522,7 @@
lea yd, [ylimd+2]
add yd, hm
lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
- test dword edgem, 4 ; have_top
+ test byte edgem, 4 ; have_top
jnz .sumsq_load_top
movu m0, [sumsq_ptrq+(384+16)*4*1]
movu m1, [sumsq_ptrq+(384+16)*4*1+16]
@@ -1582,7 +1584,7 @@
lea yd, [ylimd+2]
add yd, hm
lea sum_ptrq, [sumq+xq*2+2-(384+16)*2]
- test dword edgem, 4 ; have_top
+ test byte edgem, 4 ; have_top
jnz .sum_load_top
movu m0, [sum_ptrq+(384+16)*2*1]
mova m1, m0
@@ -1882,7 +1884,7 @@
cglobal sgr_weighted2, 4, 7, 12, dst, stride, t1, t2, w, h, wt
movifnidn wd, wm
- mov wtq, wtmp
+ movd m0, wtm
%if ARCH_X86_64
movifnidn hd, hm
mova m10, [pd_1024]
@@ -1892,7 +1894,6 @@
%define m10 [PIC_sym(pd_1024)]
%define m11 m7
%endif
- movd m0, [wtq]
pshufd m0, m0, 0
DEFINE_ARGS dst, stride, t1, t2, w, h, idx
%if ARCH_X86_32