shithub: dav1d

Download patch

ref: 46a3fd20e032a740061e222414c4145310893593
parent: e2c6d0295c58c9f1c9ce6570e993530b6bc94b68
author: Ronald S. Bultje <[email protected]>
date: Fri Oct 26 13:00:39 EDT 2018

Add a 4x4 cdef_filter AVX2 implementation

cdef_filter_4x4_8bpc_c: 2273.6
cdef_filter_4x4_8bpc_avx2: 113.6

Decoding time reduces to 15.51s for the first 1000 frames of chimera 1080p,
from 23.1s before cdef_filter SIMD or 17.86s with only the 8x8 cdef_filter
SIMD.

--- a/src/x86/cdef.asm
+++ b/src/x86/cdef.asm
@@ -56,16 +56,76 @@
 
 SECTION .text
 
+%macro ACCUMULATE_TAP 6 ; tap_offset, shift, strength, mul_tap, w, stride (row pitch in bytes)
+    ; load p0/p1: the pixels at +off and -off relative to the rows at stkq
+    movsx         offq, byte [dirq+kq+%1]       ; off1: signed byte, scaled by 2 below (16-bit pixels)
+%if %5 == 4
+    movq           xm5, [stkq+offq*2+%6*0]      ; p0, w == 4: row 0 -> low half of xm5
+    movq           xm6, [stkq+offq*2+%6*2]      ; row 2 -> low half of xm6
+    movhps         xm5, [stkq+offq*2+%6*1]      ; row 1 -> high half of xm5
+    movhps         xm6, [stkq+offq*2+%6*3]      ; row 3 -> high half of xm6
+    vinserti128     m5, xm6, 1                  ; rows 2/3 -> upper lane of m5
+%else
+    movu           xm5, [stkq+offq*2+%6*0]      ; p0: row 0 -> lower lane
+    vinserti128     m5, [stkq+offq*2+%6*1], 1   ; row 1 -> upper lane
+%endif
+    neg           offq                          ; -off1
+%if %5 == 4
+    movq           xm6, [stkq+offq*2+%6*0]      ; p1, w == 4: row 0 -> low half of xm6
+    movq           xm9, [stkq+offq*2+%6*2]      ; row 2 -> low half of xm9 (scratch)
+    movhps         xm6, [stkq+offq*2+%6*1]      ; row 1 -> high half of xm6
+    movhps         xm9, [stkq+offq*2+%6*3]      ; row 3 -> high half of xm9
+    vinserti128     m6, xm9, 1                  ; rows 2/3 -> upper lane of m6
+%else
+    movu           xm6, [stkq+offq*2+%6*0]      ; p1: row 0 -> lower lane
+    vinserti128     m6, [stkq+offq*2+%6*1], 1   ; row 1 -> upper lane
+%endif
+    pcmpeqw         m9, m14, m5                 ; mask of p0 lanes equal to m14 (0x7fff in cdef_filter_fn)
+    pcmpeqw        m10, m14, m6                 ; mask of p1 lanes equal to m14
+    pandn           m9, m5                      ; p0 with the m14-valued lanes zeroed
+    pandn          m10, m6                      ; p1 with the m14-valued lanes zeroed
+    pmaxsw          m7, m9                      ; max after p0 (uses the zeroed copy)
+    pminsw          m8, m5                      ; min after p0 (uses unmasked p0)
+    pmaxsw          m7, m10                     ; max after p1 (uses the zeroed copy)
+    pminsw          m8, m6                      ; min after p1 (uses unmasked p1)
+
+    ; accumulate sum[m15] over p0/p1 (m4 = px and m13 = 0 are set by the invoking code)
+    psubw           m5, m4                      ; diff_p0(p0 - px)
+    psubw           m6, m4                      ; diff_p1(p1 - px)
+    pabsw           m9, m5                      ; abs(diff_p0)
+    pabsw          m10, m6                      ; abs(diff_p1)
+    psraw          m11, m9,  %2                 ; abs(diff_p0) >> shift
+    psraw          m12, m10, %2                 ; abs(diff_p1) >> shift
+    psubw          m11, %3, m11                 ; strength - (abs(diff_p0) >> shift)
+    psubw          m12, %3, m12                 ; strength - (abs(diff_p1) >> shift)
+    pmaxsw         m11, m13                     ; clamp below at m13
+    pmaxsw         m12, m13                     ; clamp below at m13
+    pminsw         m11, m9                      ; limit to abs(diff_p0)
+    pminsw         m12, m10                     ; limit to abs(diff_p1)
+    psignw         m11, m5                      ; constrain(diff_p0): reapply sign of diff_p0
+    psignw         m12, m6                      ; constrain(diff_p1): reapply sign of diff_p1
+    pmullw         m11, %4                      ; constrain(diff_p0) * mul_tap (pri or sec tap)
+    pmullw         m12, %4                      ; constrain(diff_p1) * mul_tap (pri or sec tap)
+    paddw          m15, m11                     ; sum += weighted p0 term
+    paddw          m15, m12                     ; sum += weighted p1 term
+%endmacro
+
+%macro cdef_filter_fn 3 ; w, h, stride
 INIT_YMM avx2
-cglobal cdef_filter_8x8, 4, 9, 16, 26 * 16, dst, stride, left, top, \
-                                            pri, sec, stride3, dst4, edge
-%define px rsp+32+2*32
+cglobal cdef_filter_%1x%2, 4, 9, 16, 2 * 16 + (%2+4)*%3, \
+                           dst, stride, left, top, pri, sec, stride3, dst4, edge
+%define px rsp+2*16+2*%3
     pcmpeqw        m14, m14
     psrlw          m14, 1                   ; 0x7fff
     mov          edged, r8m
 
     ; prepare pixel buffers - body/right
+%if %1 == 4
+    INIT_XMM avx2
+%endif
+%if %2 == 8
     lea          dst4q, [dstq+strideq*4]
+%endif
     lea       stride3q, [strideq*3]
     test         edged, 2                   ; have_right
     jz .no_right
@@ -73,48 +133,70 @@
     pmovzxbw        m2, [dstq+strideq*1]
     pmovzxbw        m3, [dstq+strideq*2]
     pmovzxbw        m4, [dstq+stride3q]
-    movu     [px+0*32], m1
-    movu     [px+1*32], m2
-    movu     [px+2*32], m3
-    movu     [px+3*32], m4
+    mova     [px+0*%3], m1
+    mova     [px+1*%3], m2
+    mova     [px+2*%3], m3
+    mova     [px+3*%3], m4
+%if %2 == 8
     pmovzxbw        m1, [dst4q+strideq*0]
     pmovzxbw        m2, [dst4q+strideq*1]
     pmovzxbw        m3, [dst4q+strideq*2]
     pmovzxbw        m4, [dst4q+stride3q]
-    movu     [px+4*32], m1
-    movu     [px+5*32], m2
-    movu     [px+6*32], m3
-    movu     [px+7*32], m4
+    mova     [px+4*%3], m1
+    mova     [px+5*%3], m2
+    mova     [px+6*%3], m3
+    mova     [px+7*%3], m4
+%endif
     jmp .body_done
 .no_right:
+%if %1 == 4
+    movd           xm1, [dstq+strideq*0]
+    movd           xm2, [dstq+strideq*2]
+    pinsrd         xm1, [dstq+strideq*1], 1
+    pinsrd         xm2, [dstq+stride3q], 1
+    pmovzxbw       xm1, xm1
+    pmovzxbw       xm2, xm2
+    movq     [px+0*%3], xm1
+    movhps   [px+1*%3], xm1
+    movq     [px+2*%3], xm2
+    movhps   [px+3*%3], xm2
+%else
     pmovzxbw       xm1, [dstq+strideq*0]
     pmovzxbw       xm2, [dstq+strideq*1]
     pmovzxbw       xm3, [dstq+strideq*2]
     pmovzxbw       xm4, [dstq+stride3q]
-    movu     [px+0*32], xm1
-    movu     [px+1*32], xm2
-    movu     [px+2*32], xm3
-    movu     [px+3*32], xm4
-    movd  [px+0*32+16], xm14
-    movd  [px+1*32+16], xm14
-    movd  [px+2*32+16], xm14
-    movd  [px+3*32+16], xm14
+    mova     [px+0*%3], xm1
+    mova     [px+1*%3], xm2
+    mova     [px+2*%3], xm3
+    mova     [px+3*%3], xm4
+%endif
+    movd [px+0*%3+%1*2], xm14
+    movd [px+1*%3+%1*2], xm14
+    movd [px+2*%3+%1*2], xm14
+    movd [px+3*%3+%1*2], xm14
+%if %2 == 8
+    ; FIXME w == 4
+    movd [px+0*%3+%1*2], xm14
+    movd [px+1*%3+%1*2], xm14
+    movd [px+2*%3+%1*2], xm14
+    movd [px+3*%3+%1*2], xm14
     pmovzxbw       xm1, [dst4q+strideq*0]
     pmovzxbw       xm2, [dst4q+strideq*1]
     pmovzxbw       xm3, [dst4q+strideq*2]
     pmovzxbw       xm4, [dst4q+stride3q]
-    movu     [px+4*32], xm1
-    movu     [px+5*32], xm2
-    movu     [px+6*32], xm3
-    movu     [px+7*32], xm4
-    movd  [px+4*32+16], xm14
-    movd  [px+5*32+16], xm14
-    movd  [px+6*32+16], xm14
-    movd  [px+7*32+16], xm14
+    mova     [px+4*%3], xm1
+    mova     [px+5*%3], xm2
+    mova     [px+6*%3], xm3
+    mova     [px+7*%3], xm4
+    movd [px+4*%3+%1*2], xm14
+    movd [px+5*%3+%1*2], xm14
+    movd [px+6*%3+%1*2], xm14
+    movd [px+7*%3+%1*2], xm14
+%endif
 .body_done:
 
     ; top
-    DEFINE_ARGS dst, stride, left, top2, pri, sec, top1, dummy, edge
+    DEFINE_ARGS dst, stride, left, top2, pri, sec, stride3, top1, edge
     test         edged, 4                    ; have_top
     jz .no_top
     mov          top1q, [top2q+0*gprsize]
@@ -123,18 +205,18 @@
     jz .top_no_left
     test         edged, 2                    ; have_right
     jz .top_no_right
-    pmovzxbw        m1, [top1q-4]
-    pmovzxbw        m2, [top2q-4]
-    movu   [px-2*32-8], m1
-    movu   [px-1*32-8], m2
+    pmovzxbw        m1, [top1q-(%1/2)]
+    pmovzxbw        m2, [top2q-(%1/2)]
+    movu  [px-2*%3-%1], m1
+    movu  [px-1*%3-%1], m2
     jmp .top_done
 .top_no_right:
-    pmovzxbw        m1, [top1q-8]
-    pmovzxbw        m2, [top2q-8]
-    movu  [px-2*32-16], m1
-    movu  [px-1*32-16], m2
-    movd  [px-2*32+16], xm14
-    movd  [px-1*32+16], xm14
+    pmovzxbw        m1, [top1q-%1]
+    pmovzxbw        m2, [top2q-%1]
+    movu [px-2*%3-%1*2], m1
+    movu [px-1*%3-%1*2], m2
+    movd [px-2*%3+%1*2], xm14
+    movd [px-1*%3+%1*2], xm14
     jmp .top_done
 .top_no_left:
     test         edged, 2                   ; have_right
@@ -141,24 +223,32 @@
     jz .top_no_left_right
     pmovzxbw        m1, [top1q]
     pmovzxbw        m2, [top2q]
-    movu   [px-2*32+0], m1
-    movu   [px-1*32+0], m2
-    movd   [px-2*32-4], xm14
-    movd   [px-1*32-4], xm14
+    mova   [px-2*%3+0], m1
+    mova   [px-1*%3+0], m2
+    movd   [px-2*%3-4], xm14
+    movd   [px-1*%3-4], xm14
     jmp .top_done
 .top_no_left_right:
+%if %1 == 4
+    movd           xm1, [top1q]
+    pinsrd         xm1, [top2q], 1
+    pmovzxbw       xm1, xm1
+    movq   [px-2*%3+0], xm1
+    movhps [px-1*%3+0], xm1
+%else
     pmovzxbw       xm1, [top1q]
     pmovzxbw       xm2, [top2q]
-    movu   [px-2*32+0], xm1
-    movu   [px-1*32+0], xm2
-    movd   [px-2*32-4], xm14
-    movd   [px-1*32-4], xm14
-    movd  [px-2*32+16], xm14
-    movd  [px-1*32+16], xm14
+    mova   [px-2*%3+0], xm1
+    mova   [px-1*%3+0], xm2
+%endif
+    movd   [px-2*%3-4], xm14
+    movd   [px-1*%3-4], xm14
+    movd [px-2*%3+%1*2], xm14
+    movd [px-1*%3+%1*2], xm14
     jmp .top_done
 .no_top:
-    movu   [px-2*32-8], m14
-    movu   [px-1*32-8], m14
+    movu   [px-2*%3-%1], m14
+    movu   [px-1*%3-%1], m14
 .top_done:
 
     ; left
@@ -165,49 +255,57 @@
     test         edged, 1                   ; have_left
     jz .no_left
     pmovzxbw       xm1, [leftq+ 0]
+%if %2 == 8
     pmovzxbw       xm2, [leftq+ 8]
+%endif
     movd   [px+0*32-4], xm1
     pextrd [px+1*32-4], xm1, 1
     pextrd [px+2*32-4], xm1, 2
     pextrd [px+3*32-4], xm1, 3
+%if %2 == 8
     movd   [px+4*32-4], xm2
     pextrd [px+5*32-4], xm2, 1
     pextrd [px+6*32-4], xm2, 2
     pextrd [px+7*32-4], xm2, 3
+%endif
     jmp .left_done
 .no_left:
-    movd   [px+0*32-4], xm14
-    movd   [px+1*32-4], xm14
-    movd   [px+2*32-4], xm14
-    movd   [px+3*32-4], xm14
-    movd   [px+4*32-4], xm14
-    movd   [px+5*32-4], xm14
-    movd   [px+6*32-4], xm14
-    movd   [px+7*32-4], xm14
+    movd   [px+0*%3-4], xm14
+    movd   [px+1*%3-4], xm14
+    movd   [px+2*%3-4], xm14
+    movd   [px+3*%3-4], xm14
+%if %2 == 8
+    movd   [px+4*%3-4], xm14
+    movd   [px+5*%3-4], xm14
+    movd   [px+6*%3-4], xm14
+    movd   [px+7*%3-4], xm14
+%endif
 .left_done:
 
     ; bottom
-    DEFINE_ARGS dst, stride, dst8, dummy1, pri, sec, dummy2, dummy3, edge
+    DEFINE_ARGS dst, stride, dst8, dummy1, pri, sec, stride3, dummy3, edge
     test         edged, 8                   ; have_bottom
     jz .no_bottom
-    lea          dst8q, [dstq+8*strideq]
+    lea          dst8q, [dstq+%2*strideq]
     test         edged, 1                   ; have_left
     jz .bottom_no_left
     test         edged, 2                   ; have_right
     jz .bottom_no_right
-    pmovzxbw        m1, [dst8q-4]
-    pmovzxbw        m2, [dst8q+strideq-4]
-    movu   [px+8*32-8], m1
-    movu   [px+9*32-8], m2
+    pmovzxbw        m1, [dst8q-(%1/2)]
+    pmovzxbw        m2, [dst8q+strideq-(%1/2)]
+    movu   [px+(%2+0)*%3-%1], m1
+    movu   [px+(%2+1)*%3-%1], m2
     jmp .bottom_done
 .bottom_no_right:
-    pmovzxbw        m1, [dst8q-8]
-    pmovzxbw        m2, [dst8q+strideq-8]
-    movu  [px+8*32-16], m1
-    movu  [px+9*32-16], m2
-    movd  [px+7*32+16], xm14                ; overwritten by previous movu
-    movd  [px+8*32+16], xm14
-    movd  [px+9*32+16], xm14
+    pmovzxbw        m1, [dst8q-%1]
+    pmovzxbw        m2, [dst8q+strideq-%1]
+    movu  [px+(%2+0)*%3-%1*2], m1
+    movu  [px+(%2+1)*%3-%1*2], m2
+%if %1 == 8
+    movd  [px+(%2-1)*%3+%1*2], xm14                ; overwritten by previous movu
+%endif
+    movd  [px+(%2+0)*%3+%1*2], xm14
+    movd  [px+(%2+1)*%3+%1*2], xm14
     jmp .bottom_done
 .bottom_no_left:
     test          edged, 2                  ; have_right
@@ -214,28 +312,37 @@
     jz .bottom_no_left_right
     pmovzxbw        m1, [dst8q]
     pmovzxbw        m2, [dst8q+strideq]
-    movu   [px+8*32+0], m1
-    movu   [px+9*32+0], m2
-    movd   [px+8*32-4], xm14
-    movd   [px+9*32-4], xm14
+    mova   [px+(%2+0)*%3+0], m1
+    mova   [px+(%2+1)*%3+0], m2
+    movd   [px+(%2+0)*%3-4], xm14
+    movd   [px+(%2+1)*%3-4], xm14
     jmp .bottom_done
 .bottom_no_left_right:
+%if %1 == 4
+    movd           xm1, [dst8q]
+    pinsrd         xm1, [dst8q+strideq], 1
+    pmovzxbw       xm1, xm1
+    movq   [px+(%2+0)*%3+0], xm1
+    movhps [px+(%2+1)*%3+0], xm1
+%else
     pmovzxbw       xm1, [dst8q]
     pmovzxbw       xm2, [dst8q+strideq]
-    movu   [px+8*32+0], xm1
-    movu   [px+9*32+0], xm2
-    movd   [px+8*32-4], xm14
-    movd   [px+9*32-4], xm14
-    movd  [px+8*32+16], xm14
-    movd  [px+9*32+16], xm14
+    mova   [px+(%2+0)*%3+0], xm1
+    mova   [px+(%2+1)*%3+0], xm2
+%endif
+    movd   [px+(%2+0)*%3-4], xm14
+    movd   [px+(%2+1)*%3-4], xm14
+    movd  [px+(%2+0)*%3+%1*2], xm14
+    movd  [px+(%2+1)*%3+%1*2], xm14
     jmp .bottom_done
 .no_bottom:
-    movu   [px+8*32-8], m14
-    movu   [px+9*32-8], m14
+    movu   [px+(%2+0)*%3-%1], m14
+    movu   [px+(%2+1)*%3-%1], m14
 .bottom_done:
 
     ; actual filter
-    DEFINE_ARGS dst, stride, pridmp, damping, pri, sec, secdmp
+    INIT_YMM avx2
+    DEFINE_ARGS dst, stride, pridmp, damping, pri, sec, stride3, secdmp
 %undef edged
     movifnidn     prid, prim
     movifnidn     secd, secm
@@ -258,7 +365,7 @@
     mov        [rsp+8], secdmpq                 ; sec_shift
 
     ; pri/sec_taps[k] [4 total]
-    DEFINE_ARGS dst, stride, tap, dummy, pri, sec
+    DEFINE_ARGS dst, stride, tap, dummy, pri, sec, stride3
     movd           xm0, prid
     movd           xm1, secd
     vpbroadcastw    m0, xm0                     ; pri_strength
@@ -270,17 +377,31 @@
     lea           secq, [tapq+secq*4+8]         ; sec_taps
 
     ; off1/2/3[k] [6 total] from [tapq+16+(dir+0/2/6)*2+k]
-    DEFINE_ARGS dst, stride, tap, dir, pri, sec
+    DEFINE_ARGS dst, stride, tap, dir, pri, sec, stride3
     mov           dird, r6m
     lea           tapq, [tapq+dirq*2+16]
-    DEFINE_ARGS dst, stride, dir, h, pri, sec, stk, off, k
-    mov             hd, 4
+%if %1*%2*2/mmsize > 1
+    DEFINE_ARGS dst, stride, dir, stk, pri, sec, h, off, k
+    mov             hd, %1*%2*2/mmsize
+%else
+    DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, off, k
+%endif
     lea           stkq, [px]
     pxor           m13, m13
+%if %1*%2*2/mmsize > 1
 .v_loop:
+%endif
     mov             kd, 1
-    mova           xm4, [stkq+32*0]             ; px
-    vinserti128     m4, [stkq+32*1], 1
+%if %1 == 4
+    movq           xm4, [stkq+%3*0]
+    movhps         xm4, [stkq+%3*1]
+    movq           xm5, [stkq+%3*2]
+    movhps         xm5, [stkq+%3*3]
+    vinserti128     m4, xm5, 1
+%else
+    mova           xm4, [stkq+%3*0]             ; px
+    vinserti128     m4, [stkq+%3*1], 1
+%endif
     pxor           m15, m15                     ; sum
     mova            m7, m4                      ; max
     mova            m8, m4                      ; min
@@ -288,48 +409,10 @@
     vpbroadcastw    m2, [priq+kq*2]             ; pri_taps
     vpbroadcastw    m3, [secq+kq*2]             ; sec_taps
 
-%macro ACCUMULATE_TAP 4 ; tap_offset, shift, strength, mul_tap
-    ; load p0/p1
-    movsx         offq, byte [dirq+kq+%1]       ; off1
-    movu           xm5, [stkq+offq*2+32*0]      ; p0
-    vinserti128     m5, [stkq+offq*2+32*1], 1
-    neg           offq                          ; -off1
-    movu           xm6, [stkq+offq*2+32*0]      ; p1
-    vinserti128     m6, [stkq+offq*2+32*1], 1
-    pcmpeqw         m9, m14, m5
-    pcmpeqw        m10, m14, m6
-    pandn           m9, m5
-    pandn          m10, m6
-    pmaxsw          m7, m9                      ; max after p0
-    pminsw          m8, m5                      ; min after p0
-    pmaxsw          m7, m10                     ; max after p1
-    pminsw          m8, m6                      ; min after p1
+    ACCUMULATE_TAP 0*2, [rsp+0], m0, m2, %1, %3
+    ACCUMULATE_TAP 2*2, [rsp+8], m1, m3, %1, %3
+    ACCUMULATE_TAP 6*2, [rsp+8], m1, m3, %1, %3
 
-    ; accumulate sum[m15] over p0/p1
-    psubw           m5, m4                      ; diff_p0(p0 - px)
-    psubw           m6, m4                      ; diff_p1(p1 - px)
-    pabsw           m9, m5
-    pabsw          m10, m6
-    psraw          m11, m9,  %2
-    psraw          m12, m10, %2
-    psubw          m11, %3, m11
-    psubw          m12, %3, m12
-    pmaxsw         m11, m13
-    pmaxsw         m12, m13
-    pminsw         m11, m9
-    pminsw         m12, m10
-    psignw         m11, m5                      ; constrain(diff_p0)
-    psignw         m12, m6                      ; constrain(diff_p1)
-    pmullw         m11, %4                      ; constrain(diff_p0) * pri_taps
-    pmullw         m12, %4                      ; constrain(diff_p1) * pri_taps
-    paddw          m15, m11
-    paddw          m15, m12
-%endmacro
-
-    ACCUMULATE_TAP 0*2, [rsp+0], m0, m2
-    ACCUMULATE_TAP 2*2, [rsp+8], m1, m3
-    ACCUMULATE_TAP 6*2, [rsp+8], m1, m3
-
     dec             kq
     jge .k_loop
 
@@ -342,14 +425,28 @@
     pmaxsw          m4, m8
     packuswb        m4, m4
     vextracti128   xm5, m4, 1
+%if %1 == 4
+    movd [dstq+strideq*0], xm4
+    pextrd [dstq+strideq*1], xm4, 1
+    movd [dstq+strideq*2], xm5
+    pextrd [dstq+stride3q], xm5, 1
+%else
     movq [dstq+strideq*0], xm4
     movq [dstq+strideq*1], xm5
+%endif
+
+%if %1*%2*2/mmsize > 1
     lea           dstq, [dstq+strideq*2]
-    add           stkq, 32*2
+    add           stkq, %3*2
     dec             hd
     jg .v_loop
+%endif
 
     RET
+%endmacro
+
+cdef_filter_fn 8, 8, 32                     ; emits cdef_filter_8x8: w, h, stack-buffer row pitch in bytes
+cdef_filter_fn 4, 4, 32                     ; emits cdef_filter_4x4: w, h, stack-buffer row pitch in bytes
 
 INIT_YMM avx2
 cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
--- a/src/x86/cdef_init_tmpl.c
+++ b/src/x86/cdef_init_tmpl.c
@@ -29,6 +29,8 @@
 #include "src/cdef.h"
 
 decl_cdef_fn(dav1d_cdef_filter_8x8_avx2);
+decl_cdef_fn(dav1d_cdef_filter_4x4_avx2);
+
 decl_cdef_dir_fn(dav1d_cdef_dir_avx2);
 
 void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
@@ -39,5 +41,6 @@
 #if BITDEPTH == 8 && ARCH_X86_64
     c->dir = dav1d_cdef_dir_avx2;
     c->fb[0] = dav1d_cdef_filter_8x8_avx2;
+    c->fb[2] = dav1d_cdef_filter_4x4_avx2;
 #endif
 }