shithub: dav1d

ref: 6044a1ae4a606ec5940bbc28ada4a5727c9358dc
parent: 08184fd81d9e04f5cf6e8a92dc0461295c863e0a
author: Francois Cartegnie <[email protected]>
date: Fri Nov 23 06:37:58 EST 2018

add SSSE3 avg/w_avg/mask

Adaptation of the avx2 code
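
For reference, the three new entry points implement the bidirectional compound
formulas spelled out in the asm comments: a plain average, a weighted average
and a per-pixel masked average of two int16 intermediate buffers into 8-bit
pixels. Below is a rough scalar sketch for 8-bit content; the function and
helper names are illustrative rather than dav1d's actual C templates, and the
avg() rounding is inferred from the pw_1024/pmulhrsw pair in the asm.

    #include <stddef.h>
    #include <stdint.h>

    static uint8_t clip_pixel(int v) {
        return v < 0 ? 0 : v > 255 ? 255 : v;
    }

    /* avg: (a + b + 16) >> 5 with a = tmp1[x], b = tmp2[x]
     * (pmulhrsw against pw_1024 is (x + 16) >> 5) */
    static void avg_ref(uint8_t *dst, ptrdiff_t stride,
                        const int16_t *tmp1, const int16_t *tmp2,
                        int w, int h)
    {
        for (int y = 0; y < h; y++, dst += stride, tmp1 += w, tmp2 += w)
            for (int x = 0; x < w; x++)
                dst[x] = clip_pixel((tmp1[x] + tmp2[x] + 16) >> 5);
    }

    /* w_avg: (a * weight + b * (16 - weight) + 128) >> 8 */
    static void w_avg_ref(uint8_t *dst, ptrdiff_t stride,
                          const int16_t *tmp1, const int16_t *tmp2,
                          int w, int h, int weight)
    {
        for (int y = 0; y < h; y++, dst += stride, tmp1 += w, tmp2 += w)
            for (int x = 0; x < w; x++)
                dst[x] = clip_pixel((tmp1[x] * weight +
                                     tmp2[x] * (16 - weight) + 128) >> 8);
    }

    /* mask: (a * m + b * (64 - m) + 512) >> 10, m is a per-pixel 6-bit weight */
    static void mask_ref(uint8_t *dst, ptrdiff_t stride,
                         const int16_t *tmp1, const int16_t *tmp2,
                         int w, int h, const uint8_t *msk)
    {
        for (int y = 0; y < h; y++, dst += stride, tmp1 += w, tmp2 += w, msk += w)
            for (int x = 0; x < w; x++)
                dst[x] = clip_pixel((tmp1[x] * msk[x] +
                                     tmp2[x] * (64 - msk[x]) + 512) >> 10);
    }

The tmp buffers and the mask are consumed contiguously (rows of width w back to
back), which is why the asm only ever advances tmp1q/tmp2q/maskq by multiples
of mmsize.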

--- a/src/meson.build
+++ b/src/meson.build
@@ -121,6 +121,7 @@
             'x86/loopfilter.asm',
             'x86/looprestoration.asm',
             'x86/mc.asm',
+            'x86/mc_ssse3.asm',
         )
 
         # Compile the ASM sources with NASM
--- a/src/x86/mc_init_tmpl.c
+++ b/src/x86/mc_init_tmpl.c
@@ -51,8 +51,11 @@
 decl_mct_fn(dav1d_prep_bilin_avx2);
 
 decl_avg_fn(dav1d_avg_avx2);
+decl_avg_fn(dav1d_avg_ssse3);
 decl_w_avg_fn(dav1d_w_avg_avx2);
+decl_w_avg_fn(dav1d_w_avg_ssse3);
 decl_mask_fn(dav1d_mask_avx2);
+decl_mask_fn(dav1d_mask_ssse3);
 decl_w_mask_fn(dav1d_w_mask_420_avx2);
 decl_blend_fn(dav1d_blend_avx2);
 decl_blend_dir_fn(dav1d_blend_v_avx2);
@@ -70,7 +73,18 @@
     c->mct[type] = dav1d_prep_##name##_##suffix
     const unsigned flags = dav1d_get_cpu_flags();
 
-    if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+    if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3))
+        return;
+
+#if BITDEPTH == 8 && ARCH_X86_64
+    c->avg = dav1d_avg_ssse3;
+    c->w_avg = dav1d_w_avg_ssse3;
+    c->mask = dav1d_mask_ssse3;
+#endif
+
+    if (!(flags & DAV1D_X86_CPU_FLAG_AVX2))
+        return;
 
 #if BITDEPTH == 8 && ARCH_X86_64
     init_mc_fn (FILTER_2D_8TAP_REGULAR,        8tap_regular,        avx2);
--- /dev/null
+++ b/src/x86/mc_ssse3.asm
@@ -1,0 +1,251 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; Copyright © 2018, VideoLabs
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 16
+
+pw_1024: times 8 dw 1024
+pw_2048: times 8 dw 2048
+
+%macro BIDIR_JMP_TABLE 1-*
+    ; the %xdefine expressions are expanded at definition time; %%table is the macro-local label defined below
+    %xdefine %1_table (%%table - 2*%2)
+    %xdefine %%base %1_table
+    %xdefine %%prefix mangle(private_prefix %+ _%1)
+    ; dynamically generated label
+    %%table:
+    %rep %0 - 1 ; one entry per width argument
+        dd %%prefix %+ .w%2 - %%base
+        %rotate 1
+    %endrep
+%endmacro
+
+BIDIR_JMP_TABLE avg_ssse3,        4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_avg_ssse3,      4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE mask_ssse3,       4, 8, 16, 32, 64, 128
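+; each table entry is the 32-bit offset of the matching .w4/.w8/... label
+; relative to %1_table; the prologues below index it with tzcnt(w)*4, so the
+; 2*%2 bias in %1_table (8 for a first width of 4) maps w=4 to the first entry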
+
+SECTION .text
+
+INIT_XMM ssse3
+
+%if WIN64
+DECLARE_REG_TMP 6, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+
+%macro BIDIR_FN 1 ; op
+    %1                    0
+    lea            stride3q, [strideq*3]
+    jmp                  wq
+.w4_loop:
+    %1_INC_PTR            2
+    %1                    0
+    lea                dstq, [dstq+strideq*4]
+.w4: ; 4-pixel rows: store one dword of m0 per row
+    movd   [dstq          ], m0      ; row 0: dw[0]
+    pshuflw              m1, m0, q1032 ; swap dw[0] and dw[1] in the low qword
+    movd   [dstq+strideq*1], m1      ; row 1: dw[1]
+    punpckhqdq           m0, m0      ; move dw[3:2] down into dw[1:0]
+    movd   [dstq+strideq*2], m0      ; row 2: dw[2]
+    psrlq                m0, 32      ; bring dw[3] into the low dword
+    movd   [dstq+stride3q ], m0      ; row 3: dw[3]
+    sub                  hd, 4
+    jg .w4_loop
+    RET
+.w8_loop:
+    %1_INC_PTR            2
+    %1                    0
+    lea                dstq, [dstq+strideq*2]
+.w8:
+    movq   [dstq          ], m0
+    movhps [dstq+strideq*1], m0
+    sub                  hd, 2
+    jg .w8_loop
+    RET
+.w16_loop:
+    %1_INC_PTR            2
+    %1                    0
+    lea                dstq, [dstq+strideq]
+.w16:
+    mova   [dstq          ], m0
+    dec                  hd
+    jg .w16_loop
+    RET
+.w32_loop:
+    %1_INC_PTR            4
+    %1                    0
+    lea                dstq, [dstq+strideq]
+.w32:
+    mova   [dstq          ], m0
+    %1                    2
+    mova   [dstq + 16     ], m0
+    dec                  hd
+    jg .w32_loop
+    RET
+.w64_loop:
+    %1_INC_PTR            8
+    %1                    0
+    add                dstq, strideq
+.w64:
+    %assign i 0
+    %rep 4
+    mova   [dstq + i*16   ], m0
+    %assign i i+1
+    %if i < 4
+    %1                    2*i
+    %endif
+    %endrep
+    dec                  hd
+    jg .w64_loop
+    RET
+.w128_loop:
+    %1_INC_PTR            16
+    %1                    0
+    add                dstq, strideq
+.w128:
+    %assign i 0
+    %rep 8
+    mova   [dstq + i*16   ], m0
+    %assign i i+1
+    %if i < 8
+    %1                    2*i
+    %endif
+    %endrep
+    dec                  hd
+    jg .w128_loop
+    RET
+%endmacro
+
+%macro AVG 1 ; src_offset
+    ; writes the rounded average of the int16 coefs in tmp1 and tmp2 as uint8 pixels
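+    ; i.e. dst = (tmp1 + tmp2 + 16) >> 5, clamped to [0, 255]; pmulhrsw with pw_1024 computes (x + 16) >> 5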
+    mova                 m0, [tmp1q+(%1+0)*mmsize] ; load 8 int16 coefs from tmp1
+    paddw                m0, [tmp2q+(%1+0)*mmsize] ; add the matching 8 coefs from tmp2
+    mova                 m1, [tmp1q+(%1+1)*mmsize]
+    paddw                m1, [tmp2q+(%1+1)*mmsize]
+    pmulhrsw             m0, m2
+    pmulhrsw             m1, m2
+    packuswb             m0, m1 ; pack the 16 results in m0/m1 to 8-bit pixels with unsigned saturation
+%endmacro
+
+%macro AVG_INC_PTR 1
+    add               tmp1q, %1*mmsize
+    add               tmp2q, %1*mmsize
+%endmacro
+
+cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
+    lea                  r6, [avg_ssse3_table]
+    tzcnt                wd, wm ; trailing zero count of the width (log2(w))
+    movifnidn            hd, hm ; load h into a register if it is not there already
+    movsxd               wq, dword [r6+wq*4] ; sign-extend the jump table entry for this width into wq
+    mova                 m2, [pw_1024+r6-avg_ssse3_table] ; rounding constant: pmulhrsw with 1024 is (x + 16) >> 5
+    add                  wq, r6
+    BIDIR_FN            AVG
+
+%macro W_AVG 1 ; src_offset
+    ; (a * weight + b * (16 - weight) + 128) >> 8
+    ; = ((a - b) * weight + (b << 4) + 128) >> 8
+    ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4
+    mova                 m0,     [tmp2q+(%1+0)*mmsize]
+    psubw                m2, m0, [tmp1q+(%1+0)*mmsize]
+    mova                 m1,     [tmp2q+(%1+1)*mmsize]
+    psubw                m3, m1, [tmp1q+(%1+1)*mmsize]
+    paddw                m2, m2 ; compensate for the weight only being half
+    paddw                m3, m3 ; of what it should be
+    pmulhw               m2, m4 ; (b-a) * (-weight << 12)
+    pmulhw               m3, m4 ; (b-a) * (-weight << 12)
+    paddw                m0, m2 ; ((b-a) * -weight) + b
+    paddw                m1, m3
+    pmulhrsw             m0, m5
+    pmulhrsw             m1, m5
+    packuswb             m0, m1
+%endmacro
+
+%define W_AVG_INC_PTR AVG_INC_PTR
+
+cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
+    lea                  r6, [w_avg_ssse3_table]
+    tzcnt                wd, wm
+    movifnidn            hd, hm
+    movd                 m0, r6m ; weight (7th argument)
+    pshuflw              m0, m0, q0000
+    punpcklqdq           m0, m0  ; broadcast the 16-bit weight to all lanes
+    movsxd               wq, dword [r6+wq*4]
+    pxor                 m4, m4
+    psllw                m0, 11 ; can't shift by 12, sign bit must be preserved
+    psubw                m4, m0
+    mova                 m5, [pw_2048+r6-w_avg_ssse3_table]
+    add                  wq, r6
+    BIDIR_FN          W_AVG
+
+%macro MASK 1 ; src_offset
+    ; (a * m + b * (64 - m) + 512) >> 10
+    ; = ((a - b) * m + (b << 6) + 512) >> 10
+    ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4
+    mova                 m3,     [maskq+(%1+0)*(mmsize/2)]
+    mova                 m0,     [tmp2q+(%1+0)*mmsize] ; b
+    psubw                m1, m0, [tmp1q+(%1+0)*mmsize] ; b - a
+    mova                 m6, m3      ; m
+    psubb                m3, m4, m6  ; -m
+    paddw                m1, m1     ; (b - a) << 1
+    paddb                m3, m3     ; -m << 1
+    punpcklbw            m2, m4, m3 ; -m << 9 (<< 8 when ext as uint16)
+    pmulhw               m1, m2     ; (-m * (b - a)) << 10
+    paddw                m0, m1     ; + b
+    mova                 m1,     [tmp2q+(%1+1)*mmsize] ; b
+    psubw                m2, m1, [tmp1q+(%1+1)*mmsize] ; b - a
+    paddw                m2, m2  ; (b - a) << 1
+    mova                 m6, m3  ; (-m << 1)
+    punpckhbw            m3, m4, m6 ; (-m << 9)
+    pmulhw               m2, m3 ; (-m * (b - a)) << 10
+    paddw                m1, m2 ; + b
+    pmulhrsw             m0, m5 ; round
+    pmulhrsw             m1, m5 ; round
+    packuswb             m0, m1 ; pack the 16-bit results to 8-bit pixels with unsigned saturation
+%endmacro
+
+%macro MASK_INC_PTR 1
+    add               maskq, %1*mmsize/2
+    add               tmp1q, %1*mmsize
+    add               tmp2q, %1*mmsize
+%endmacro
+
+cglobal mask, 4, 8, 7, dst, stride, tmp1, tmp2, w, h, mask, stride3
+    lea                  r7, [mask_ssse3_table]
+    tzcnt                wd, wm
+    movifnidn            hd, hm
+    mov               maskq, maskmp
+    movsxd               wq, dword [r7+wq*4]
+    pxor                 m4, m4
+    mova                 m5, [pw_2048+r7-mask_ssse3_table]
+    add                  wq, r7
+    BIDIR_FN           MASK
+
+%endif ; ARCH_X86_64