ref: a91a03b0e143f77bd647b481aff034eb3315ee7b
parent: 713aa34c421726719a2e9676ba2d0d5af99512bf
author: Victorien Le Couviour--Tuffet <[email protected]>
date: Wed Sep 18 05:25:57 EDT 2019
x86: add warp_affine SSE4 and SSSE3 asm

------------------------------------------
x86_64: warp_8x8_8bpc_c: 1773.4
x86_32: warp_8x8_8bpc_c: 1740.4
----------
x86_64: warp_8x8_8bpc_ssse3: 317.5
x86_32: warp_8x8_8bpc_ssse3: 378.4
----------
x86_64: warp_8x8_8bpc_sse4: 303.7
x86_32: warp_8x8_8bpc_sse4: 367.7
----------
x86_64: warp_8x8_8bpc_avx2: 224.9
------------------------------------------
x86_64: warp_8x8t_8bpc_c: 1664.6
x86_32: warp_8x8t_8bpc_c: 1674.0
----------
x86_64: warp_8x8t_8bpc_ssse3: 320.7
x86_32: warp_8x8t_8bpc_ssse3: 379.5
----------
x86_64: warp_8x8t_8bpc_sse4: 304.8
x86_32: warp_8x8t_8bpc_sse4: 369.8
----------
x86_64: warp_8x8t_8bpc_avx2: 228.5
------------------------------------------
--- a/src/x86/mc_init_tmpl.c
+++ b/src/x86/mc_init_tmpl.c
@@ -88,7 +88,11 @@
decl_blend_dir_fn(dav1d_blend_h_ssse3);
decl_warp8x8_fn(dav1d_warp_affine_8x8_avx2);
+decl_warp8x8_fn(dav1d_warp_affine_8x8_sse4);
+decl_warp8x8_fn(dav1d_warp_affine_8x8_ssse3);
decl_warp8x8t_fn(dav1d_warp_affine_8x8t_avx2);
+decl_warp8x8t_fn(dav1d_warp_affine_8x8t_sse4);
+decl_warp8x8t_fn(dav1d_warp_affine_8x8t_ssse3);
decl_emu_edge_fn(dav1d_emu_edge_avx2);
decl_emu_edge_fn(dav1d_emu_edge_ssse3);
@@ -134,7 +138,19 @@
c->blend = dav1d_blend_ssse3;
c->blend_v = dav1d_blend_v_ssse3;
c->blend_h = dav1d_blend_h_ssse3;
+
+ c->warp8x8 = dav1d_warp_affine_8x8_ssse3;
+ c->warp8x8t = dav1d_warp_affine_8x8t_ssse3;
+
c->emu_edge = dav1d_emu_edge_ssse3;
+#endif
+
+    if (!(flags & DAV1D_X86_CPU_FLAG_SSE41))
+ return;
+
+#if BITDEPTH == 8
+ c->warp8x8 = dav1d_warp_affine_8x8_sse4;
+ c->warp8x8t = dav1d_warp_affine_8x8t_sse4;
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2))
--- a/src/x86/mc_ssse3.asm
+++ b/src/x86/mc_ssse3.asm
@@ -44,6 +44,10 @@
db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9
db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2
+warp_8x8_shufA: db 0, 2, 4, 6, 1, 3, 5, 7, 1, 3, 5, 7, 2, 4, 6, 8
+warp_8x8_shufB: db 4, 6, 8, 10, 5, 7, 9, 11, 5, 7, 9, 11, 6, 8, 10, 12
+warp_8x8_shufC: db 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9, 4, 6, 8, 10
+warp_8x8_shufD: db 6, 8, 10, 12, 7, 9, 11, 13, 7, 9, 11, 13, 8, 10, 12, 14
blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12
db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14
@@ -53,17 +57,18 @@
bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11
bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
-pb_64: times 16 db 64
-pw_8: times 8 dw 8
-pw_26: times 8 dw 26
-pw_34: times 8 dw 34
-pw_512: times 8 dw 512
-pw_1024: times 8 dw 1024
-pw_2048: times 8 dw 2048
-pw_6903: times 8 dw 6903
-pw_8192: times 8 dw 8192
-pd_32: times 4 dd 32
-pd_512: times 4 dd 512
+pb_64: times 16 db 64
+pw_8: times 8 dw 8
+pw_26: times 8 dw 26
+pw_34: times 8 dw 34
+pw_512: times 8 dw 512
+pw_1024: times 8 dw 1024
+pw_2048: times 8 dw 2048
+pw_6903: times 8 dw 6903
+pw_8192: times 8 dw 8192
+pd_32: times 4 dd 32
+pd_512: times 4 dd 512
+pd_32768: times 4 dd 32768
pw_258: times 2 dw 258
@@ -146,6 +151,8 @@
%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
+cextern mc_warp_filter
+
SECTION .text
INIT_XMM ssse3
@@ -3301,6 +3308,580 @@
sub r5d, 1<<16
jg .hv_w8_loop0
RET
+
+%if ARCH_X86_32
+ %macro SAVE_ALPHA_BETA 0
+ mov alpham, alphad
+ mov betam, betad
+ %endmacro
+
+ %macro SAVE_DELTA_GAMMA 0
+ mov deltam, deltad
+ mov gammam, gammad
+ %endmacro
+
+ %macro LOAD_ALPHA_BETA_MX 0
+ mov mym, myd
+ mov alphad, alpham
+ mov betad, betam
+ mov mxd, mxm
+ %endmacro
+
+ %macro LOAD_DELTA_GAMMA_MY 0
+ mov mxm, mxd
+ mov deltad, deltam
+ mov gammad, gammam
+ mov myd, mym
+ %endmacro
+
+ %define PIC_reg r2
+ %define PIC_base_offset $$
+ %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)
+%else
+ %define SAVE_ALPHA_BETA
+ %define SAVE_DELTA_GAMMA
+ %define PIC_sym(sym) sym
+%endif
+
+%if ARCH_X86_32
+ %if STACK_ALIGNMENT < required_stack_alignment
+ %assign copy_args 8*4
+ %else
+ %assign copy_args 0
+ %endif
+%endif
+
+%macro RELOC_ARGS 0
+ %if copy_args
+ mov r0, r0m
+ mov r1, r1m
+ mov r2, r2m
+ mov r3, r3m
+ mov r5, r5m
+ mov dstm, r0
+ mov dsm, r1
+ mov srcm, r2
+ mov ssm, r3
+ mov mxm, r5
+ mov r0, r6m
+ mov mym, r0
+ %endif
+%endmacro
+
+%macro BLENDHWDW 2 ; blend high words from dwords, src1, src2
+ %if cpuflag(sse4)
+ pblendw %1, %2, 0xAA
+ %else
+ pand %2, m10
+ por %1, %2
+ %endif
+%endmacro
+
+%macro WARP_V 10 ; dst0, dst1, 0, 2, 4, 6, 1, 3, 5, 7
+    ; Can be done using gathers, but that's terribly slow on many CPUs
+ %if ARCH_X86_32
+ %define m8 m4
+ %define m9 m5
+ %define m14 m6
+ %define m15 m7
+ %define m11 m7
+ pxor m11, m11
+ %endif
+ lea tmp1d, [myq+deltaq*4]
+ lea tmp2d, [myq+deltaq*1]
+ shr myd, 10
+ shr tmp1d, 10
+ movq m2, [filterq+myq *8] ; a
+ movq m8, [filterq+tmp1q*8] ; e
+ lea tmp1d, [tmp2q+deltaq*4]
+ lea myd, [tmp2q+deltaq*1]
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movq m3, [filterq+tmp2q*8] ; b
+ movq m0, [filterq+tmp1q*8] ; f
+ punpcklwd m2, m3
+ punpcklwd m8, m0
+ lea tmp1d, [myq+deltaq*4]
+ lea tmp2d, [myq+deltaq*1]
+ shr myd, 10
+ shr tmp1d, 10
+ movq m0, [filterq+myq *8] ; c
+ movq m9, [filterq+tmp1q*8] ; g
+ lea tmp1d, [tmp2q+deltaq*4]
+ lea myd, [tmp2q+gammaq] ; my += gamma
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movq m3, [filterq+tmp2q*8] ; d
+ movq m1, [filterq+tmp1q*8] ; h
+ punpcklwd m0, m3
+ punpcklwd m9, m1
+ punpckldq m1, m2, m0
+ punpckhdq m2, m0
+ punpcklbw m0, m11, m1 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8
+ punpckhbw m3, m11, m1 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8
+ punpcklbw m1, m11, m2 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8
+ punpckhbw m14, m11, m2 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8
+ pmaddwd m0, %3
+ pmaddwd m3, %5
+ pmaddwd m1, %7
+ pmaddwd m14, %9
+ paddd m0, m3
+ paddd m1, m14
+ paddd m0, m1
+ mova %1, m0
+ %if ARCH_X86_64
+ SWAP m3, m14
+ %endif
+ punpckldq m0, m8, m9
+ punpckhdq m8, m9
+ punpcklbw m1, m11, m0 ; e0 e2 f0 f2 g0 g2 h0 h2 << 8
+ punpckhbw m14, m11, m0 ; e4 e6 f4 f6 g4 g6 h4 h6 << 8
+ punpcklbw m2, m11, m8 ; e1 e3 f1 f3 g1 g3 h1 h3 << 8
+ punpckhbw m15, m11, m8 ; e5 e7 f5 f7 g5 g7 h5 h7 << 8
+ pmaddwd m1, %4
+ pmaddwd m14, %6
+ pmaddwd m2, %8
+ pmaddwd m15, %10
+ paddd m1, m14
+ paddd m2, m15
+ paddd m1, m2
+ mova %2, m1
+ %if ARCH_X86_64
+ SWAP m14, m3
+ %endif
+%endmacro
+
+%if ARCH_X86_64
+ %define counterd r4d
+%else
+ %if copy_args == 0
+ %define counterd dword r4m
+ %else
+ %define counterd dword [esp+stack_size-4*7]
+ %endif
+%endif
+
+%macro WARP_AFFINE_8X8T 0
+%if ARCH_X86_64
+cglobal warp_affine_8x8t, 6, 14, 16, 0x90, tmp, ts
+%else
+cglobal warp_affine_8x8t, 0, 7, 16, -0x130-copy_args, tmp, ts
+ %if copy_args
+ %define tmpm [esp+stack_size-4*1]
+ %define tsm [esp+stack_size-4*2]
+ %endif
+%endif
+ call mangle(private_prefix %+ _warp_affine_8x8_%+cpuname).main
+.loop:
+%if ARCH_X86_32
+ %define m12 m4
+ %define m13 m5
+ %define m14 m6
+ %define m15 m7
+ mova m12, [esp+0xC0]
+ mova m13, [esp+0xD0]
+ mova m14, [esp+0xE0]
+ mova m15, [esp+0xF0]
+%endif
+ psrad m12, 13
+ psrad m13, 13
+ psrad m14, 13
+ psrad m15, 13
+ packssdw m12, m13
+ packssdw m14, m15
+ mova m13, [PIC_sym(pw_8192)]
+ pmulhrsw m12, m13 ; (x + (1 << 6)) >> 7
+ pmulhrsw m14, m13
+ mova [tmpq+tsq*0], m12
+ mova [tmpq+tsq*2], m14
+ dec counterd
+ jz mangle(private_prefix %+ _warp_affine_8x8_%+cpuname).end
+%if ARCH_X86_32
+ mov tmpm, tmpd
+ mov r0, [esp+0x100]
+ mov r1, [esp+0x104]
+%endif
+ call mangle(private_prefix %+ _warp_affine_8x8_%+cpuname).main2
+ lea tmpq, [tmpq+tsq*4]
+ jmp .loop
+%endmacro
+
+%macro WARP_AFFINE_8X8 0
+%if ARCH_X86_64
+cglobal warp_affine_8x8, 6, 14, 16, 0x90, \
+ dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \
+ filter, tmp1, delta, my, gamma
+%else
+cglobal warp_affine_8x8, 0, 7, 16, -0x130-copy_args, \
+ dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \
+ filter, tmp1, delta, my, gamma
+ %define alphaq r0
+ %define alphad r0
+ %define alpham [esp+gprsize+0x100]
+ %define betaq r1
+ %define betad r1
+ %define betam [esp+gprsize+0x104]
+ %define deltaq r0
+ %define deltad r0
+ %define deltam [esp+gprsize+0x108]
+ %define gammaq r1
+ %define gammad r1
+ %define gammam [esp+gprsize+0x10C]
+ %define filterq r3
+ %define tmp1q r4
+ %define tmp1d r4
+ %define tmp1m [esp+gprsize+0x110]
+ %define myq r5
+ %define myd r5
+ %define mym r6m
+ %if copy_args
+ %define dstm [esp+stack_size-4*1]
+ %define dsm [esp+stack_size-4*2]
+ %define srcm [esp+stack_size-4*3]
+ %define ssm [esp+stack_size-4*4]
+ %define mxm [esp+stack_size-4*5]
+ %define mym [esp+stack_size-4*6]
+ %endif
+%endif
+ call .main
+ jmp .start
+.loop:
+%if ARCH_X86_32
+ mov dstm, dstd
+ mov alphad, [esp+0x100]
+ mov betad, [esp+0x104]
+%endif
+ call .main2
+ lea dstq, [dstq+dsq*2]
+.start:
+%if cpuflag(ssse3)
+ %if ARCH_X86_64
+ mova m10, [PIC_sym(pw_8192)]
+ %else
+ %define m10 [PIC_sym(pw_8192)]
+ %endif
+%endif
+%if ARCH_X86_32
+ %define m12 m5
+ %define m13 m6
+ mova m12, [esp+0xC0]
+ mova m13, [esp+0xD0]
+%endif
+%if cpuflag(sse4)
+ %if ARCH_X86_32
+ %define m11 m4
+ pxor m11, m11
+ %endif
+ psrad m12, 18
+ psrad m13, 18
+ packusdw m12, m13
+ pavgw m12, m11 ; (x + (1 << 10)) >> 11
+%else
+ psrad m12, 17
+ psrad m13, 17
+ packssdw m12, m13
+ pmulhrsw m12, m10 ; (x + (1 << 10)) >> 11
+%endif
+%if ARCH_X86_32
+ %define m14 m6
+ %define m15 m7
+ mova m14, [esp+0xE0]
+ mova m15, [esp+0xF0]
+%endif
+%if cpuflag(sse4)
+ psrad m14, 18
+ psrad m15, 18
+ packusdw m14, m15
+ pavgw m14, m11 ; (x + (1 << 10)) >> 11
+%else
+ psrad m14, 17
+ psrad m15, 17
+ packssdw m14, m15
+ pmulhrsw m14, m10 ; (x + (1 << 10)) >> 11
+%endif
+ packuswb m12, m14
+ movq [dstq+dsq*0], m12
+ movhps [dstq+dsq*1], m12
+ dec counterd
+ jg .loop
+.end:
+ RET
+ALIGN function_align
+.main:
+%assign stack_offset stack_offset+gprsize
+%if ARCH_X86_32
+ %assign stack_size stack_size+4
+ %if copy_args
+ %assign stack_offset stack_offset-4
+ %endif
+ RELOC_ARGS
+ LEA PIC_reg, $$
+ %define PIC_mem [esp+gprsize+0x114]
+ mov abcdd, abcdm
+ %if copy_args == 0
+ mov ssd, ssm
+ mov mxd, mxm
+ %endif
+ mov PIC_mem, PIC_reg
+ mov srcd, srcm
+%endif
+ movsx deltad, word [abcdq+2*2]
+ movsx gammad, word [abcdq+2*3]
+ lea tmp1d, [deltaq*3]
+ sub gammad, tmp1d ; gamma -= delta*3
+ SAVE_DELTA_GAMMA
+%if ARCH_X86_32
+ mov abcdd, abcdm
+%endif
+ movsx alphad, word [abcdq+2*0]
+ movsx betad, word [abcdq+2*1]
+ lea tmp1q, [ssq*3+3]
+ add mxd, 512+(64<<10)
+ lea tmp2d, [alphaq*3]
+ sub srcq, tmp1q ; src -= src_stride*3 + 3
+%if ARCH_X86_32
+ mov srcm, srcd
+ mov PIC_reg, PIC_mem
+%endif
+ sub betad, tmp2d ; beta -= alpha*3
+ lea filterq, [PIC_sym(mc_warp_filter)]
+%if ARCH_X86_64
+ mov myd, r6m
+ pxor m11, m11
+%endif
+ call .h
+ psrld m2, m0, 16
+ psrld m3, m1, 16
+%if ARCH_X86_32
+ mova [esp+gprsize+0x10], m3
+%endif
+ call .h
+ psrld m4, m0, 16
+ psrld m5, m1, 16
+%if ARCH_X86_32
+ mova [esp+gprsize+0x20], m4
+ mova [esp+gprsize+0x30], m5
+%endif
+ call .h
+%if ARCH_X86_64
+ %define blendmask [rsp+gprsize+0x80]
+%else
+ mova m3, [esp+gprsize+0x10]
+ %define blendmask [esp+gprsize+0x120]
+ %define m10 m7
+%endif
+ pcmpeqd m10, m10
+ pslld m10, 16
+ mova blendmask, m10
+ BLENDHWDW m2, m0 ; 0
+ BLENDHWDW m3, m1 ; 2
+ mova [rsp+gprsize+0x00], m2
+ mova [rsp+gprsize+0x10], m3
+ call .h
+%if ARCH_X86_32
+ mova m4, [esp+gprsize+0x20]
+ mova m5, [esp+gprsize+0x30]
+%endif
+ mova m10, blendmask
+ BLENDHWDW m4, m0 ; 1
+ BLENDHWDW m5, m1 ; 3
+ mova [rsp+gprsize+0x20], m4
+ mova [rsp+gprsize+0x30], m5
+ call .h
+%if ARCH_X86_32
+ mova m3, [esp+gprsize+0x10]
+ %define m10 m5
+%endif
+ psrld m6, m2, 16
+ psrld m7, m3, 16
+ mova m10, blendmask
+ BLENDHWDW m6, m0 ; 2
+ BLENDHWDW m7, m1 ; 4
+ mova [rsp+gprsize+0x40], m6
+ mova [rsp+gprsize+0x50], m7
+ call .h
+%if ARCH_X86_32
+ mova m4, [esp+gprsize+0x20]
+ mova m5, [esp+gprsize+0x30]
+%endif
+ psrld m2, m4, 16
+ psrld m3, m5, 16
+ mova m10, blendmask
+ BLENDHWDW m2, m0 ; 3
+ BLENDHWDW m3, m1 ; 5
+ mova [rsp+gprsize+0x60], m2
+ mova [rsp+gprsize+0x70], m3
+ call .h
+%if ARCH_X86_32
+ mova m6, [esp+gprsize+0x40]
+ mova m7, [esp+gprsize+0x50]
+ %define m10 m7
+%endif
+ psrld m4, m6, 16
+ psrld m5, m7, 16
+ mova m10, blendmask
+ BLENDHWDW m4, m0 ; 4
+ BLENDHWDW m5, m1 ; 6
+%if ARCH_X86_64
+ add myd, 512+(64<<10)
+ mova m6, m2
+ mova m7, m3
+%else
+ mova [esp+gprsize+0x80], m4
+ mova [esp+gprsize+0x90], m5
+ add dword mym, 512+(64<<10)
+%endif
+ mov counterd, 4
+ SAVE_ALPHA_BETA
+.main2:
+ call .h
+%if ARCH_X86_32
+ mova m6, [esp+gprsize+0x60]
+ mova m7, [esp+gprsize+0x70]
+ %define m10 m5
+%endif
+ psrld m6, 16
+ psrld m7, 16
+ mova m10, blendmask
+ BLENDHWDW m6, m0 ; 5
+ BLENDHWDW m7, m1 ; 7
+%if ARCH_X86_64
+ WARP_V m12, m13, [rsp+gprsize+0x00], [rsp+gprsize+0x10], \
+ m4, m5, \
+ [rsp+gprsize+0x20], [rsp+gprsize+0x30], \
+ m6, m7
+%else
+ mova [esp+gprsize+0xA0], m6
+ mova [esp+gprsize+0xB0], m7
+ LOAD_DELTA_GAMMA_MY
+ WARP_V [esp+gprsize+0xC0], [esp+gprsize+0xD0], \
+ [esp+gprsize+0x00], [esp+gprsize+0x10], \
+ [esp+gprsize+0x80], [esp+gprsize+0x90], \
+ [esp+gprsize+0x20], [esp+gprsize+0x30], \
+ [esp+gprsize+0xA0], [esp+gprsize+0xB0]
+ LOAD_ALPHA_BETA_MX
+%endif
+ call .h
+ mova m2, [rsp+gprsize+0x40]
+ mova m3, [rsp+gprsize+0x50]
+%if ARCH_X86_32
+ mova m4, [rsp+gprsize+0x80]
+ mova m5, [rsp+gprsize+0x90]
+ %define m10 m7
+%endif
+ mova [rsp+gprsize+0x00], m2
+ mova [rsp+gprsize+0x10], m3
+ mova [rsp+gprsize+0x40], m4
+ mova [rsp+gprsize+0x50], m5
+ psrld m4, 16
+ psrld m5, 16
+ mova m10, blendmask
+ BLENDHWDW m4, m0 ; 6
+ BLENDHWDW m5, m1 ; 8
+%if ARCH_X86_64
+ WARP_V m14, m15, [rsp+gprsize+0x20], [rsp+gprsize+0x30], \
+ m6, m7, \
+ [rsp+gprsize+0x00], [rsp+gprsize+0x10], \
+ m4, m5
+%else
+ mova [esp+gprsize+0x80], m4
+ mova [esp+gprsize+0x90], m5
+ LOAD_DELTA_GAMMA_MY
+ WARP_V [esp+gprsize+0xE0], [esp+gprsize+0xF0], \
+ [esp+gprsize+0x20], [esp+gprsize+0x30], \
+ [esp+gprsize+0xA0], [esp+gprsize+0xB0], \
+ [esp+gprsize+0x00], [esp+gprsize+0x10], \
+ [esp+gprsize+0x80], [esp+gprsize+0x90]
+ mov mym, myd
+ mov dstd, dstm
+ mov dsd, dsm
+ mov mxd, mxm
+%endif
+ mova m2, [rsp+gprsize+0x60]
+ mova m3, [rsp+gprsize+0x70]
+%if ARCH_X86_32
+ mova m6, [esp+gprsize+0xA0]
+ mova m7, [esp+gprsize+0xB0]
+%endif
+ mova [rsp+gprsize+0x20], m2
+ mova [rsp+gprsize+0x30], m3
+ mova [rsp+gprsize+0x60], m6
+ mova [rsp+gprsize+0x70], m7
+ ret
+ALIGN function_align
+.h:
+%if ARCH_X86_32
+ %define m8 m3
+ %define m9 m4
+ %define m10 m5
+ %define m14 m6
+ %define m15 m7
+%endif
+ lea tmp1d, [mxq+alphaq*4]
+ lea tmp2d, [mxq+alphaq*1]
+%if ARCH_X86_32
+ %assign stack_offset stack_offset+4
+ %assign stack_size stack_size+4
+ %define PIC_mem [esp+gprsize*2+0x114]
+ mov PIC_mem, PIC_reg
+ mov srcd, srcm
+%endif
+ movu m10, [srcq]
+%if ARCH_X86_32
+ add srcd, ssm
+ mov srcm, srcd
+ mov PIC_reg, PIC_mem
+%else
+ add srcq, ssq
+%endif
+ shr mxd, 10
+ shr tmp1d, 10
+ movq m1, [filterq+mxq *8] ; 0 X
+ movq m8, [filterq+tmp1q*8] ; 4 X
+ lea tmp1d, [tmp2q+alphaq*4]
+ lea mxd, [tmp2q+alphaq*1]
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movhps m1, [filterq+tmp2q*8] ; 0 1
+ movhps m8, [filterq+tmp1q*8] ; 4 5
+ lea tmp1d, [mxq+alphaq*4]
+ lea tmp2d, [mxq+alphaq*1]
+ shr mxd, 10
+ shr tmp1d, 10
+ movq m14, [filterq+mxq *8] ; 2 X
+ movq m9, [filterq+tmp1q*8] ; 6 X
+ lea tmp1d, [tmp2q+alphaq*4]
+ lea mxd, [tmp2q+betaq] ; mx += beta
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movhps m14, [filterq+tmp2q*8] ; 2 3
+ movhps m9, [filterq+tmp1q*8] ; 6 7
+ pshufb m0, m10, [PIC_sym(warp_8x8_shufA)]
+ pmaddubsw m0, m1
+ pshufb m1, m10, [PIC_sym(warp_8x8_shufB)]
+ pmaddubsw m1, m8
+ pshufb m15, m10, [PIC_sym(warp_8x8_shufC)]
+ pmaddubsw m15, m14
+ pshufb m10, m10, [PIC_sym(warp_8x8_shufD)]
+ pmaddubsw m10, m9
+ mova m14, [PIC_sym(pw_8192)]
+ mova m9, [PIC_sym(pd_32768)]
+ phaddw m0, m15
+ phaddw m1, m10
+ pmaddwd m0, m14 ; 17-bit intermediate, upshifted by 13
+ pmaddwd m1, m14
+ paddd m0, m9 ; rounded 14-bit result in upper 16 bits of dword
+ paddd m1, m9
+ ret
+%endmacro
+
+INIT_XMM sse4
+WARP_AFFINE_8X8
+WARP_AFFINE_8X8T
+
+INIT_XMM ssse3
+WARP_AFFINE_8X8
+WARP_AFFINE_8X8T
%if WIN64
DECLARE_REG_TMP 6, 4