shithub: dav1d

--- a/src/x86/itx_ssse3.asm

+++ b/src/x86/itx_ssse3.asm

@@ -6097,7 +6097,7 @@

-cglobal inv_txfm_add_dct_dct_64x16, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2

+cglobal inv_txfm_add_dct_dct_64x16, 4, 6, 8, 16*132, dst, stride, coeff, eob, tx2

 %if ARCH_X86_32

     LEA                     r5, $$

 %endif

@@ -6186,7 +6186,9 @@

 %endmacro

 cglobal idct_64x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2

-    mov                     r3, 2

+    mov                    r3d, 2

+    mov  [rsp+gprsize*2+16*67], dstq

+    lea                   dstq, [rsp+gprsize+16*68]

 .pass1_loop:

     LOAD_4ROWS     coeffq+32*0, 32*8

@@ -6277,7 +6279,7 @@

     jmp   m(idct_8x8_internal).pass1_end1

 .pass1_end4:

-    SAVE_8ROWS    coeffq+32*32, 32

+    SAVE_8ROWS       dstq+32*0, 32

     LOAD_8ROWS   rsp+gprsize+16*43, 16

     mova    [rsp+gprsize+16*0], m7

     mova                    m7, [o(pw_8192)]

@@ -6285,7 +6287,7 @@

     jmp   m(idct_8x8_internal).pass1_end1

 .pass1_end5:

-    SAVE_8ROWS    coeffq+32*40, 32

+    SAVE_8ROWS       dstq+32*8, 32

     LOAD_8ROWS   rsp+gprsize+16*51, 16

     mova    [rsp+gprsize+16*0], m7

     mova                    m7, [o(pw_8192)]

@@ -6293,7 +6295,7 @@

     jmp   m(idct_8x8_internal).pass1_end1

 .pass1_end6:

-    SAVE_8ROWS    coeffq+32*48, 32

+    SAVE_8ROWS      dstq+32*16, 32

     LOAD_8ROWS   rsp+gprsize+16*59, 16

     mova    [rsp+gprsize+16*0], m7

     mova                    m7, [o(pw_8192)]

@@ -6301,20 +6303,20 @@

     jmp   m(idct_8x8_internal).pass1_end1

 .pass1_end7:

-    SAVE_8ROWS    coeffq+32*56, 32

+    SAVE_8ROWS      dstq+32*24, 32

     add                 coeffq, 16

-    dec                     r3

+    add                   dstq, 16

+    dec                    r3d

     jg .pass1_loop

 .pass2:

+    mov                   dstq, [rsp+gprsize*2+16*67]

     sub                 coeffq, 32

-    mov                     r3, 8

-    lea                     r4, [dstq+8]

-    mov  [rsp+gprsize*2+16*67], r4

+    mov                    r3d, 4

 .pass2_loop:

-    mov  [rsp+gprsize*1+16*67], r3

+    mov  [rsp+gprsize*1+16*67], r3d

     LOAD_4ROWS     coeffq+16*0, 32*2

     LOAD_4ROWS_H   coeffq+16*1, 32*2

@@ -6341,13 +6343,47 @@

     REPX  {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15

     add                 coeffq, 16*16

-    mov                     r3, [rsp+gprsize*1+16*67]

+    mov                    r3d, [rsp+gprsize*1+16*67]

     mov                   dstq, [rsp+gprsize*2+16*67]

-    lea                     r4, [dstq+8]

-    mov  [rsp+gprsize*2+16*67], r4

-    dec                     r3

+    add                   dstq, 8

+    mov  [rsp+gprsize*2+16*67], dstq

+    dec                    r3d

     jg .pass2_loop

+    mov                    r3d, 4  ; second pass-2 loop: counter for 4 iterations of .pass2_loop2

+    lea                 coeffq, [rsp+gprsize+16*68]  ; read from the stack buffer pass 1 wrote through dstq (same base address)

+.pass2_loop2:

+    mov  [rsp+gprsize*1+16*67], r3d  ; spill the loop counter: r3 is reused below to hold dstq

+    LOAD_4ROWS     coeffq+16*0, 32*2

+    LOAD_4ROWS_H   coeffq+16*1, 32*2

+    call  m(idct_8x8_internal).main

+    SAVE_7ROWS    rsp+gprsize+16*3, 16  ; park results in stack scratch; read back by LOAD_8ROWS at .end2

+    LOAD_4ROWS     coeffq+16*2, 32*2

+    LOAD_4ROWS_H   coeffq+16*3, 32*2

+    call m(idct_16x8_internal).main

+    mov                    r3, dstq  ; remember the current dst pointer; restored at .end2

+    lea                  tx2q, [o(m(idct_64x16_internal).end2)]  ; tx2q = .end2; presumably idct_8x8_internal.end jumps to tx2q when done - TODO confirm

+    lea                  dstq, [dstq+strideq*8]  ; dstq += 8*stride for this first .end pass

+    jmp  m(idct_8x8_internal).end

+.end2:

+    LOAD_8ROWS   rsp+gprsize+16*3, 16  ; reload from the scratch area written by SAVE_7ROWS above

+    mova   [rsp+gprsize+16*0], m7  ; m7 goes to stack slot 0; presumably where idct_8x8_internal.end expects it - TODO confirm

+    lea                  tx2q, [o(m(idct_64x16_internal).end3)]  ; tx2q = .end3 for the second .end pass

+    mov                  dstq, r3  ; back to the dst pointer saved before the first .end pass

+    jmp  m(idct_8x8_internal).end

+.end3:

+    add                 coeffq, 16*16  ; step coeffq to the next 16*16-byte chunk of the stack buffer

+    mov                    r3d, [rsp+gprsize*1+16*67]  ; reload the spilled loop counter

+    mov                   dstq, [rsp+gprsize*2+16*67]  ; reload the dst pointer kept on the stack

+    add                   dstq, 8  ; advance dst by 8 for the next iteration

+    mov  [rsp+gprsize*2+16*67], dstq  ; store the advanced dst pointer back to its stack slot

+    dec                    r3d

+    jg .pass2_loop2  ; loop while the counter is still > 0

ret

--- a/tests/checkasm/itx.c

+++ b/tests/checkasm/itx.c

@@ -158,6 +158,8 @@

         eob += rnd() % (n - eob - 1);

     for (n = eob + 1; n < sw * sh; n++)

         coeff[scan[n]] = 0;

+    for (; n < 32 * 32; n++)

+        coeff[n] = rnd();

     return eob;

@@ -224,7 +226,7 @@

     Dav1dInvTxfmDSPContext c;

     bitfn(dav1d_itx_dsp_init)(&c);

-    ALIGN_STK_32(coef, coeff, 3, [32 * 32]);

+    ALIGN_STK_32(coef, coeff, 2, [32 * 32]);

     ALIGN_STK_32(pixel, c_dst, 64 * 64,);

     ALIGN_STK_32(pixel, a_dst, 64 * 64,);

@@ -245,7 +247,6 @@

         const enum RectTxfmSize tx = txfm_size_order[i];

         const int w = dav1d_txfm_dimensions[tx].w * 4;

         const int h = dav1d_txfm_dimensions[tx].h * 4;

-        const int sw = imin(w, 32), sh = imin(h, 32);

         const int subsh_max = subsh_iters[imax(dav1d_txfm_dimensions[tx].lw,

                                                dav1d_txfm_dimensions[tx].lh)];

@@ -263,24 +264,22 @@

                     const int bitdepth_max = 0xff;

 #endif

                     const int eob = ftx(coeff[0], tx, txtp, w, h, subsh, bitdepth_max);

+                    memcpy(coeff[1], coeff[0], sizeof(*coeff));

                     for (int j = 0; j < w * h; j++)

                         c_dst[j] = a_dst[j] = rnd() & bitdepth_max;

-                    memcpy(coeff[1], coeff[0], sw * sh * sizeof(**coeff));

-                    memcpy(coeff[2], coeff[0], sw * sh * sizeof(**coeff));

                     call_ref(c_dst, w * sizeof(*c_dst), coeff[0], eob

                              HIGHBD_TAIL_SUFFIX);

                     call_new(a_dst, w * sizeof(*c_dst), coeff[1], eob

                              HIGHBD_TAIL_SUFFIX);

                     if (memcmp(c_dst, a_dst, w * h * sizeof(*c_dst)) ||

-                        memcmp(coeff[0], coeff[1], sw * sh * sizeof(**coeff)))

+                        memcmp(coeff[0], coeff[1], sizeof(*coeff)))

                         fail();

-                    bench_new(a_dst, w * sizeof(*c_dst), coeff[2], eob

+                    bench_new(a_dst, w * sizeof(*c_dst), coeff[0], eob

                               HIGHBD_TAIL_SUFFIX);

         report("add_%dx%d", w, h);