shithub: dav1d

ref: 70b66ff13fc7f082e43777a8c9fa4c0a2ace685e
parent: a62c445d842d7e459062fad8468aceb8f5efaef4
author: Henrik Gramner <[email protected]>
date: Tue Aug 20 14:59:32 EDT 2019

Optimize coef ctx calculations
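
The coefficient context helpers move from src/env.h into
src/recon_tmpl.c so they can be inlined and specialized at their
call sites. get_dc_sign_ctx gets a dedicated code path per
transform size, replacing the generic merge loop with wide loads
and multiply-based horizontal byte sums. get_coef_nz_ctx and
get_br_ctx merge into get_lo_ctx, which derives the magnitude for
the high-token context as a byproduct of computing the low-token
one. decode_coefs specializes its inner loop per TxClass through
the DECODE_COEFS_CLASS macro and transposes the levels buffer for
TX_CLASS_H to reduce stride and padding requirements. The per-size
dav1d_nz_map_ctx_offset tables collapse into three shape classes
(w == h, w > h, w < h) as dav1d_lo_ctx_offsets, the skip contexts
become a symmetric dav1d_skip_ctx lookup, and the scratch levels
buffer shrinks from 36*36 to 32*34 bytes.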
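The dc-sign paths lean on a SWAR trick: mask out the two sign bits
of each context byte, then let a 0x01...01 multiply accumulate all
byte lanes into the top byte. A minimal standalone sketch of that
idiom (hsum_bytes64 is a hypothetical helper, not part of the
patch), valid only while the running per-byte sums stay below 256,
which holds here because each masked lane value is small:

    #include <assert.h>
    #include <stdint.h>

    /* Sum all eight byte lanes of v into the top byte: byte k of the
     * product is the sum of bytes 0..k of v, so the top byte is the
     * total, provided no per-byte sum exceeds 255. */
    static unsigned hsum_bytes64(const uint64_t v) {
        return (unsigned) ((v * 0x0101010101010101ULL) >> 56);
    }

    int main(void) {
        assert(hsum_bytes64(0x0000000002010300ULL) == 2 + 1 + 3 + 0);
        return 0;
    }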
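The MERGE_CTX fold in the new get_skip_ctx is the other recurring
idiom: load several per-4x4 context bytes at once and OR-fold them
down to a single byte before clamping. A minimal sketch of the
32-bit case (or_fold32 is a hypothetical name, not part of the
patch):

    #include <assert.h>
    #include <stdint.h>

    static unsigned or_fold32(uint32_t v) {
        v |= v >> 16;    /* merge the two 16-bit halves */
        v |= v >> 8;     /* merge the remaining two bytes */
        return v & 0x3F; /* bits 0-5 hold the merged levels */
    }

    int main(void) {
        /* byte lanes 0x21, 0x00, 0x04, 0x03 union to 0x27 */
        assert(or_fold32(0x03040021U) == 0x27);
        return 0;
    }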

--- a/include/common/intops.h
+++ b/include/common/intops.h
@@ -40,6 +40,14 @@
     return a < b ? a : b;
 }
 
+static inline unsigned umax(const unsigned a, const unsigned b) {
+    return a > b ? a : b;
+}
+
+static inline unsigned umin(const unsigned a, const unsigned b) {
+    return a < b ? a : b;
+}
+
 static inline int iclip(const int v, const int min, const int max) {
     return v < min ? min : v > max ? max : v;
 }
--- a/src/env.h
+++ b/src/env.h
@@ -469,180 +469,6 @@
     }
 }
 
-static inline int get_coef_skip_ctx(const TxfmInfo *const t_dim,
-                                    const enum BlockSize bs,
-                                    const uint8_t *const a,
-                                    const uint8_t *const l,
-                                    const int chroma,
-                                    const enum Dav1dPixelLayout layout)
-{
-    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
-
-    if (chroma) {
-        const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
-        const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
-        const int not_one_blk = b_dim[2] - (!!b_dim[2] && ss_hor) > t_dim->lw ||
-                                b_dim[3] - (!!b_dim[3] && ss_ver) > t_dim->lh;
-        int ca, cl;
-
-#define MERGE_CTX(dir, type, mask) \
-        c##dir = !!((*(const type *) dir) & mask); \
-        break
-        switch (t_dim->lw) {
-        case TX_4X4:   MERGE_CTX(a, uint8_t,  0x3F);
-        case TX_8X8:   MERGE_CTX(a, uint16_t, 0x3F3F);
-        case TX_16X16: MERGE_CTX(a, uint32_t, 0x3F3F3F3FU);
-        case TX_32X32: MERGE_CTX(a, uint64_t, 0x3F3F3F3F3F3F3F3FULL);
-        default: abort();
-        }
-        switch (t_dim->lh) {
-        case TX_4X4:   MERGE_CTX(l, uint8_t,  0x3F);
-        case TX_8X8:   MERGE_CTX(l, uint16_t, 0x3F3F);
-        case TX_16X16: MERGE_CTX(l, uint32_t, 0x3F3F3F3FU);
-        case TX_32X32: MERGE_CTX(l, uint64_t, 0x3F3F3F3F3F3F3F3FULL);
-        default: abort();
-        }
-#undef MERGE_CTX
-
-        return 7 + not_one_blk * 3 + ca + cl;
-    } else if (b_dim[2] == t_dim->lw && b_dim[3] == t_dim->lh) {
-        return 0;
-    } else {
-        static const uint8_t skip_contexts[5][5] = {
-            { 1, 2, 2, 2, 3 },
-            { 1, 4, 4, 4, 5 },
-            { 1, 4, 4, 4, 5 },
-            { 1, 4, 4, 4, 5 },
-            { 1, 4, 4, 4, 6 }
-        };
-        uint64_t la, ll;
-
-#define MERGE_CTX(dir, type, tx) do { \
-            l##dir = *(const type *) dir; \
-            if (tx == TX_64X64) \
-                l##dir |= *(const type *) &dir[sizeof(type)]; \
-            if (tx >= TX_32X32) l##dir |= l##dir >> 32; \
-            if (tx >= TX_16X16) l##dir |= l##dir >> 16; \
-            if (tx >= TX_8X8)   l##dir |= l##dir >> 8; \
-            l##dir &= 0x3F; \
-        } while (0); \
-        break
-        switch (t_dim->lw) {
-        case TX_4X4:   MERGE_CTX(a, uint8_t,  TX_4X4);
-        case TX_8X8:   MERGE_CTX(a, uint16_t, TX_8X8);
-        case TX_16X16: MERGE_CTX(a, uint32_t, TX_16X16);
-        case TX_32X32: MERGE_CTX(a, uint64_t, TX_32X32);
-        case TX_64X64: MERGE_CTX(a, uint64_t, TX_64X64);
-        }
-        switch (t_dim->lh) {
-        case TX_4X4:   MERGE_CTX(l, uint8_t,  TX_4X4);
-        case TX_8X8:   MERGE_CTX(l, uint16_t, TX_8X8);
-        case TX_16X16: MERGE_CTX(l, uint32_t, TX_16X16);
-        case TX_32X32: MERGE_CTX(l, uint64_t, TX_32X32);
-        case TX_64X64: MERGE_CTX(l, uint64_t, TX_64X64);
-        }
-#undef MERGE_CTX
-
-        const int max = imin((int) (la | ll), 4);
-        const int min = imin(imin((int) la, (int) ll), 4);
-
-        return skip_contexts[min][max];
-    }
-}
-
-static inline int get_coef_nz_ctx(uint8_t *const levels,
-                                  const enum RectTxfmSize tx,
-                                  const enum TxClass tx_class,
-                                  const int x, const int y,
-                                  const ptrdiff_t stride)
-{
-    static const uint8_t offsets[3][5][2 /* x, y */] = {
-        [TX_CLASS_2D] = {
-            { 0, 1 }, { 1, 0 }, { 2, 0 }, { 0, 2 }, { 1, 1 }
-        }, [TX_CLASS_V] = {
-            { 0, 1 }, { 1, 0 }, { 0, 2 }, { 0, 3 }, { 0, 4 }
-        }, [TX_CLASS_H] = {
-            { 0, 1 }, { 1, 0 }, { 2, 0 }, { 3, 0 }, { 4, 0 }
-        }
-    };
-    const uint8_t (*const off)[2] = offsets[tx_class];
-    int mag = 0;
-    for (int i = 0; i < 5; i++)
-        mag += imin(levels[(x + off[i][0]) * stride + (y + off[i][1])], 3);
-    const int ctx = imin((mag + 1) >> 1, 4);
-    if (tx_class == TX_CLASS_2D) {
-        return dav1d_nz_map_ctx_offset[tx][imin(y, 4)][imin(x, 4)] + ctx;
-    } else {
-        return 26 + imin((tx_class == TX_CLASS_V) ? y : x, 2) * 5 + ctx;
-    }
-}
-
-static inline int get_dc_sign_ctx(const TxfmInfo *const t_dim,
-                                  const uint8_t *const a,
-                                  const uint8_t *const l)
-{
-    uint64_t sa, sl;
-
-#define MERGE_CTX(dir, type, tx, mask) do { \
-        s##dir = ((*(const type *) dir) >> 6) & mask; \
-        if (tx == TX_64X64) \
-            s##dir += ((*(const type *) &dir[sizeof(type)]) >> 6) & mask; \
-        if (tx >= TX_32X32) s##dir += s##dir >> 32; \
-        if (tx >= TX_16X16) s##dir += s##dir >> 16; \
-        if (tx >= TX_8X8)   s##dir += s##dir >> 8; \
-    } while (0); \
-    break
-    switch (t_dim->lw) {
-    case TX_4X4:   MERGE_CTX(a, uint8_t,  TX_4X4,   0x03);
-    case TX_8X8:   MERGE_CTX(a, uint16_t, TX_8X8,   0x0303);
-    case TX_16X16: MERGE_CTX(a, uint32_t, TX_16X16, 0x03030303U);
-    case TX_32X32: MERGE_CTX(a, uint64_t, TX_32X32, 0x0303030303030303ULL);
-    case TX_64X64: MERGE_CTX(a, uint64_t, TX_64X64, 0x0303030303030303ULL);
-    }
-    switch (t_dim->lh) {
-    case TX_4X4:   MERGE_CTX(l, uint8_t,  TX_4X4,   0x03);
-    case TX_8X8:   MERGE_CTX(l, uint16_t, TX_8X8,   0x0303);
-    case TX_16X16: MERGE_CTX(l, uint32_t, TX_16X16, 0x03030303U);
-    case TX_32X32: MERGE_CTX(l, uint64_t, TX_32X32, 0x0303030303030303ULL);
-    case TX_64X64: MERGE_CTX(l, uint64_t, TX_64X64, 0x0303030303030303ULL);
-    }
-#undef MERGE_CTX
-    const int s = ((int) ((sa + sl) & 0xFF)) - (t_dim->w + t_dim->h);
-
-    return s < 0 ? 1 : s > 0 ? 2 : 0;
-}
-
-static inline int get_br_ctx(const uint8_t *const levels,
-                             const int ac, const enum TxClass tx_class,
-                             const int x, const int y,
-                             const ptrdiff_t stride)
-{
-    int mag = 0;
-    static const uint8_t offsets_from_txclass[3][3][2] = {
-        [TX_CLASS_2D] = { { 0, 1 }, { 1, 0 }, { 1, 1 } },
-        [TX_CLASS_H]  = { { 0, 1 }, { 1, 0 }, { 0, 2 } },
-        [TX_CLASS_V]  = { { 0, 1 }, { 1, 0 }, { 2, 0 } }
-    };
-    const uint8_t (*const offsets)[2] = offsets_from_txclass[tx_class];
-    for (int i = 0; i < 3; i++)
-        mag += levels[(x + offsets[i][1]) * stride + y + offsets[i][0]];
-
-    mag = imin((mag + 1) >> 1, 6);
-    if (!ac) return mag;
-    switch (tx_class) {
-    case TX_CLASS_2D:
-        if (y < 2 && x < 2) return mag + 7;
-        break;
-    case TX_CLASS_H:
-        if (x == 0) return mag + 7;
-        break;
-    case TX_CLASS_V:
-        if (y == 0) return mag + 7;
-        break;
-    }
-    return mag + 14;
-}
-
 static inline mv get_gmv_2d(const Dav1dWarpedMotionParams *const gmv,
                             const int bx4, const int by4,
                             const int bw4, const int bh4,
--- a/src/internal.h
+++ b/src/internal.h
@@ -309,14 +309,14 @@
             uint16_t edge_16bpc[257];
         };
         struct {
-            uint8_t pal_idx[2 * 64 * 64];
             union {
+                uint8_t levels[32 * 34];
                 struct {
                     uint8_t pal_order[64][8];
                     uint8_t pal_ctx[64];
                 };
-                uint8_t levels[36 * 36];
             };
+            uint8_t pal_idx[2 * 64 * 64];
             uint16_t pal[3 /* plane */][8 /* palette_idx */];
         };
         int16_t ac[32 * 32];
--- a/src/recon_tmpl.c
+++ b/src/recon_tmpl.c
@@ -46,16 +46,273 @@
 #include "src/tables.h"
 #include "src/wedge.h"
 
-static unsigned read_golomb(MsacContext *const msac) {
+static inline unsigned read_golomb(MsacContext *const msac) {
     int len = 0;
     unsigned val = 1;
 
     while (!dav1d_msac_decode_bool_equi(msac) && len < 32) len++;
-    while (len--) val = (val << 1) | dav1d_msac_decode_bool_equi(msac);
+    while (len--) val = (val << 1) + dav1d_msac_decode_bool_equi(msac);
 
     return val - 1;
 }
 
+static inline unsigned get_skip_ctx(const TxfmInfo *const t_dim,
+                                    const enum BlockSize bs,
+                                    const uint8_t *const a,
+                                    const uint8_t *const l,
+                                    const int chroma,
+                                    const enum Dav1dPixelLayout layout)
+{
+    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+
+    if (chroma) {
+        const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
+        const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
+        const int not_one_blk = b_dim[2] - (!!b_dim[2] && ss_hor) > t_dim->lw ||
+                                b_dim[3] - (!!b_dim[3] && ss_ver) > t_dim->lh;
+        int ca, cl;
+
+#define MERGE_CTX(dir, type, mask) \
+        c##dir = !!((*(const type *) dir) & mask); \
+        break
+
+        switch (t_dim->lw) {
+        case TX_4X4:   MERGE_CTX(a, uint8_t,  0x3F);
+        case TX_8X8:   MERGE_CTX(a, uint16_t, 0x3F3F);
+        case TX_16X16: MERGE_CTX(a, uint32_t, 0x3F3F3F3FU);
+        case TX_32X32: MERGE_CTX(a, uint64_t, 0x3F3F3F3F3F3F3F3FULL);
+        default: assert(0);
+        }
+        switch (t_dim->lh) {
+        case TX_4X4:   MERGE_CTX(l, uint8_t,  0x3F);
+        case TX_8X8:   MERGE_CTX(l, uint16_t, 0x3F3F);
+        case TX_16X16: MERGE_CTX(l, uint32_t, 0x3F3F3F3FU);
+        case TX_32X32: MERGE_CTX(l, uint64_t, 0x3F3F3F3F3F3F3F3FULL);
+        default: assert(0);
+        }
+#undef MERGE_CTX
+
+        return 7 + not_one_blk * 3 + ca + cl;
+    } else if (b_dim[2] == t_dim->lw && b_dim[3] == t_dim->lh) {
+        return 0;
+    } else {
+        unsigned la, ll;
+
+#define MERGE_CTX(dir, type, tx) \
+        if (tx == TX_64X64) { \
+            uint64_t tmp = *(const uint64_t *) dir; \
+            tmp |= *(const uint64_t *) &dir[8]; \
+            l##dir = (unsigned) (tmp >> 32) | (unsigned) tmp; \
+        } else \
+            l##dir = *(const type *) dir; \
+        if (tx == TX_32X32) l##dir |= *(const type *) &dir[sizeof(type)]; \
+        if (tx >= TX_16X16) l##dir |= l##dir >> 16; \
+        if (tx >= TX_8X8)   l##dir |= l##dir >> 8; \
+        break
+
+        switch (t_dim->lw) {
+        case TX_4X4:   MERGE_CTX(a, uint8_t,  TX_4X4);
+        case TX_8X8:   MERGE_CTX(a, uint16_t, TX_8X8);
+        case TX_16X16: MERGE_CTX(a, uint32_t, TX_16X16);
+        case TX_32X32: MERGE_CTX(a, uint32_t, TX_32X32);
+        case TX_64X64: MERGE_CTX(a, uint32_t, TX_64X64);
+        default: assert(0);
+        }
+        switch (t_dim->lh) {
+        case TX_4X4:   MERGE_CTX(l, uint8_t,  TX_4X4);
+        case TX_8X8:   MERGE_CTX(l, uint16_t, TX_8X8);
+        case TX_16X16: MERGE_CTX(l, uint32_t, TX_16X16);
+        case TX_32X32: MERGE_CTX(l, uint32_t, TX_32X32);
+        case TX_64X64: MERGE_CTX(l, uint32_t, TX_64X64);
+        default: assert(0);
+        }
+#undef MERGE_CTX
+
+        return dav1d_skip_ctx[umin(la & 0x3F, 4)][umin(ll & 0x3F, 4)];
+    }
+}
+
+static inline unsigned get_dc_sign_ctx(const int /*enum RectTxfmSize*/ tx,
+                                       const uint8_t *const a,
+                                       const uint8_t *const l)
+{
+    uint64_t mask = 0xC0C0C0C0C0C0C0C0ULL, mul = 0x0101010101010101ULL;
+    int s;
+
+#if ARCH_X86_64 && defined(__GNUC__)
+    /* Coerce compilers into producing better code. For some reason
+     * every x86-64 compiler is awful at handling 64-bit constants. */
+    __asm__("" : "+r"(mask), "+r"(mul));
+#endif
+
+    switch(tx) {
+    case TX_4X4: {
+        int t = *(const uint8_t *) a >> 6;
+        t    += *(const uint8_t *) l >> 6;
+        s = t - 1 - 1;
+        break;
+    }
+    case TX_8X8: {
+        uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
+        t         += *(const uint16_t *) l & (uint32_t) mask;
+        t *= 0x04040404U;
+        s = (int) (t >> 24) - 2 - 2;
+        break;
+    }
+    case TX_16X16: {
+        uint32_t t = (*(const uint32_t *) a & (uint32_t) mask) >> 6;
+        t         += (*(const uint32_t *) l & (uint32_t) mask) >> 6;
+        t *= (uint32_t) mul;
+        s = (int) (t >> 24) - 4 - 4;
+        break;
+    }
+    case TX_32X32: {
+        uint64_t t = (*(const uint64_t *) a & mask) >> 6;
+        t         += (*(const uint64_t *) l & mask) >> 6;
+        t *= mul;
+        s = (int) (t >> 56) - 8 - 8;
+        break;
+    }
+    case TX_64X64: {
+        uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;
+        t         += (*(const uint64_t *) &a[8] & mask) >> 6;
+        t         += (*(const uint64_t *) &l[0] & mask) >> 6;
+        t         += (*(const uint64_t *) &l[8] & mask) >> 6;
+        t *= mul;
+        s = (int) (t >> 56) - 16 - 16;
+        break;
+    }
+    case RTX_4X8: {
+        uint32_t t = *(const uint8_t  *) a & (uint32_t) mask;
+        t         += *(const uint16_t *) l & (uint32_t) mask;
+        t *= 0x04040404U;
+        s = (int) (t >> 24) - 1 - 2;
+        break;
+    }
+    case RTX_8X4: {
+        uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
+        t         += *(const uint8_t  *) l & (uint32_t) mask;
+        t *= 0x04040404U;
+        s = (int) (t >> 24) - 2 - 1;
+        break;
+    }
+    case RTX_8X16: {
+        uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
+        t         += *(const uint32_t *) l & (uint32_t) mask;
+        t = (t >> 6) * (uint32_t) mul;
+        s = (int) (t >> 24) - 2 - 4;
+        break;
+    }
+    case RTX_16X8: {
+        uint32_t t = *(const uint32_t *) a & (uint32_t) mask;
+        t         += *(const uint16_t *) l & (uint32_t) mask;
+        t = (t >> 6) * (uint32_t) mul;
+        s = (int) (t >> 24) - 4 - 2;
+        break;
+    }
+    case RTX_16X32: {
+        uint64_t t = *(const uint32_t *) a & (uint32_t) mask;
+        t         += *(const uint64_t *) l & mask;
+        t = (t >> 6) * mul;
+        s = (int) (t >> 56) - 4 - 8;
+        break;
+    }
+    case RTX_32X16: {
+        uint64_t t = *(const uint64_t *) a & mask;
+        t         += *(const uint32_t *) l & (uint32_t) mask;
+        t = (t >> 6) * mul;
+        s = (int) (t >> 56) - 8 - 4;
+        break;
+    }
+    case RTX_32X64: {
+        uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;
+        t         += (*(const uint64_t *) &l[0] & mask) >> 6;
+        t         += (*(const uint64_t *) &l[8] & mask) >> 6;
+        t *= mul;
+        s = (int) (t >> 56) - 8 - 16;
+        break;
+    }
+    case RTX_64X32: {
+        uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;
+        t         += (*(const uint64_t *) &a[8] & mask) >> 6;
+        t         += (*(const uint64_t *) &l[0] & mask) >> 6;
+        t *= mul;
+        s = (int) (t >> 56) - 16 - 8;
+        break;
+    }
+    case RTX_4X16: {
+        uint32_t t = *(const uint8_t  *) a & (uint32_t) mask;
+        t         += *(const uint32_t *) l & (uint32_t) mask;
+        t = (t >> 6) * (uint32_t) mul;
+        s = (int) (t >> 24) - 1 - 4;
+        break;
+    }
+    case RTX_16X4: {
+        uint32_t t = *(const uint32_t *) a & (uint32_t) mask;
+        t         += *(const uint8_t  *) l & (uint32_t) mask;
+        t = (t >> 6) * (uint32_t) mul;
+        s = (int) (t >> 24) - 4 - 1;
+        break;
+    }
+    case RTX_8X32: {
+        uint64_t t = *(const uint16_t *) a & (uint32_t) mask;
+        t         += *(const uint64_t *) l & mask;
+        t = (t >> 6) * mul;
+        s = (int) (t >> 56) - 2 - 8;
+        break;
+    }
+    case RTX_32X8: {
+        uint64_t t = *(const uint64_t *) a & mask;
+        t         += *(const uint16_t *) l & (uint32_t) mask;
+        t = (t >> 6) * mul;
+        s = (int) (t >> 56) - 8 - 2;
+        break;
+    }
+    case RTX_16X64: {
+        uint64_t t = *(const uint32_t *) a & (uint32_t) mask;
+        t         += *(const uint64_t *) &l[0] & mask;
+        t = (t >> 6) + ((*(const uint64_t *) &l[8] & mask) >> 6);
+        t *= mul;
+        s = (int) (t >> 56) - 4 - 16;
+        break;
+    }
+    case RTX_64X16: {
+        uint64_t t = *(const uint64_t *) &a[0] & mask;
+        t         += *(const uint32_t *) l & (uint32_t) mask;
+        t = (t >> 6) + ((*(const uint64_t *) &a[8] & mask) >> 6);
+        t *= mul;
+        s = (int) (t >> 56) - 16 - 4;
+        break;
+    }
+    default: assert(0);
+    }
+
+    return (s != 0) + (s > 0);
+}
+
+static inline unsigned get_lo_ctx(const uint8_t *const levels,
+                                  const enum TxClass tx_class,
+                                  unsigned *const hi_mag,
+                                  const uint8_t (*const ctx_offsets)[5],
+                                  const unsigned x, const unsigned y,
+                                  const ptrdiff_t stride)
+{
+    unsigned mag = levels[0 * stride + 1] + levels[1 * stride + 0];
+    unsigned offset;
+    if (tx_class == TX_CLASS_2D) {
+        mag += levels[1 * stride + 1];
+        *hi_mag = mag;
+        mag += levels[0 * stride + 2] + levels[2 * stride + 0];
+        offset = ctx_offsets[umin(y, 4)][umin(x, 4)];
+    } else {
+        mag += levels[0 * stride + 2];
+        *hi_mag = mag;
+        mag += levels[0 * stride + 3] + levels[0 * stride + 4];
+        offset = 26 + (y > 1 ? 10 : y * 5);
+    }
+    return offset + (mag > 512 ? 4 : (mag + 64) >> 7);
+}
+
 static int decode_coefs(Dav1dTileContext *const t,
                         uint8_t *const a, uint8_t *const l,
                         const enum RectTxfmSize tx, const enum BlockSize bs,
@@ -74,7 +331,7 @@
         printf("Start: r=%d\n", ts->msac.rng);
 
     // does this block have any non-zero coefficients
-    const int sctx = get_coef_skip_ctx(t_dim, bs, a, l, chroma, f->cur.p.layout);
+    const int sctx = get_skip_ctx(t_dim, bs, a, l, chroma, f->cur.p.layout);
     const int all_skip = dav1d_msac_decode_bool_adapt(&ts->msac,
                              ts->cdf.coef.skip[t_dim->ctx][sctx]);
     if (dbg)
@@ -175,91 +432,126 @@
     }
 
     // base tokens
-    uint16_t (*const br_cdf)[4] =
-        ts->cdf.coef.br_tok[imin(t_dim->ctx, 3)][chroma];
+    uint16_t (*const eob_cdf)[4] = ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma];
+    uint16_t (*const hi_cdf)[4] = ts->cdf.coef.br_tok[imin(t_dim->ctx, 3)][chroma];
     const uint16_t *const scan = dav1d_scans[tx][tx_class];
     int dc_tok;
 
     if (eob) {
-        uint8_t *const levels = t->scratch.levels;
+        uint16_t (*const lo_cdf)[4] = ts->cdf.coef.base_tok[t_dim->ctx][chroma];
+        uint8_t *const levels = t->scratch.levels; // bits 0-5: tok, 6-7: lo_tok
         const int sw = imin(t_dim->w, 8), sh = imin(t_dim->h, 8);
-        const ptrdiff_t stride = 4 * (sh + 1);
-        memset(levels, 0, stride * 4 * (sw + 1));
-        const int shift = 2 + imin(t_dim->lh, 3), mask = 4 * sh - 1;
+        const unsigned shift = 2 + imin(t_dim->lh, 3), mask = 4 * sh - 1;
 
-        { // eob
-            const int rc = scan[eob], x = rc >> shift, y = rc & mask;
+        /* eob */
+        unsigned rc = scan[eob], x = rc >> shift, y = rc & mask;
+        unsigned ctx = 1 + (eob > sw * sh * 2) + (eob > sw * sh * 4);
+        int eob_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[ctx], 2);
+        int tok = eob_tok + 1;
+        int level_tok = tok * 0x41;
+        unsigned mag;
+        if (dbg)
+            printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n",
+                   t_dim->ctx, chroma, ctx, eob, rc, tok, ts->msac.rng);
 
-            const int ctx = 1 + (eob > sw * sh * 2) + (eob > sw * sh * 4);
-            uint16_t *const lo_cdf = ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma][ctx];
+#define DECODE_COEFS_CLASS(tx_class) \
+        if (eob_tok == 2) { \
+            ctx = (tx_class == TX_CLASS_2D ? (x | y) > 1 : \
+                   tx_class == TX_CLASS_H ? x != 0 : y != 0) ? 14 : 7; \
+            tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
+            level_tok = tok + (3 << 6); \
+            if (dbg) \
+                printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
+                       imin(t_dim->ctx, 3), chroma, ctx, eob, rc, tok, \
+                       ts->msac.rng); \
+        } \
+        cf[rc] = tok; \
+        if (tx_class == TX_CLASS_H) \
+            /* Transposing reduces the stride and padding requirements */ \
+            levels[y * stride + x] = (uint8_t) level_tok; \
+        else \
+            levels[x * stride + y] = (uint8_t) level_tok; \
+        for (int i = eob - 1; i > 0; i--) { /* ac */ \
+            if (tx_class == TX_CLASS_H) \
+                rc = i, x = rc & mask, y = rc >> shift; \
+            else \
+                rc = scan[i], x = rc >> shift, y = rc & mask; \
+            assert(x < 32 && y < 32); \
+            uint8_t *const level = levels + x * stride + y; \
+            ctx = get_lo_ctx(level, tx_class, &mag, lo_ctx_offsets, x, y, stride); \
+            if (tx_class == TX_CLASS_2D) \
+                y |= x; \
+            tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \
+            level_tok = tok * 0x41; \
+            if (dbg) \
+                printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
+                       t_dim->ctx, chroma, ctx, i, rc, tok, ts->msac.rng); \
+            if (tok == 3) { \
+                mag &= 63; \
+                ctx = (y > (tx_class == TX_CLASS_2D) ? 14 : 7) + \
+                      (mag > 12 ? 6 : (mag + 1) >> 1); \
+                tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
+                level_tok = tok + (3 << 6); \
+                if (dbg) \
+                    printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
+                           imin(t_dim->ctx, 3), chroma, ctx, i, rc, tok, \
+                           ts->msac.rng); \
+            } \
+            cf[rc] = tok; \
+            *level = (uint8_t) level_tok; \
+        } \
+        /* dc */ \
+        ctx = (tx_class == TX_CLASS_2D) ? 0 : \
+            get_lo_ctx(levels, tx_class, &mag, lo_ctx_offsets, 0, 0, stride); \
+        dc_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \
+        if (dbg) \
+            printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n", \
+                   t_dim->ctx, chroma, ctx, dc_tok, ts->msac.rng); \
+        if (dc_tok == 3) { \
+            if (tx_class == TX_CLASS_2D) \
+                mag = levels[0 * stride + 1] + levels[1 * stride + 0] + \
+                      levels[1 * stride + 1]; \
+            mag &= 63; \
+            ctx = mag > 12 ? 6 : (mag + 1) >> 1; \
+            dc_tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
+            if (dbg) \
+                printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n", \
+                       imin(t_dim->ctx, 3), chroma, dc_tok, ts->msac.rng); \
+        } \
+        break
 
-            int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf, 2);
-            int tok = 1 + tok_br;
-            if (dbg)
-                printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n",
-                       t_dim->ctx, chroma, ctx, eob, rc, tok, ts->msac.rng);
-
-            if (tok_br == 2) {
-                const int br_ctx = get_br_ctx(levels, 1, tx_class, x, y, stride);
-                tok = dav1d_msac_decode_hi_tok(&ts->msac, br_cdf[br_ctx]);
-                if (dbg)
-                    printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n",
-                           imin(t_dim->ctx, 3), chroma, br_ctx, eob, rc, tok,
-                           ts->msac.rng);
-            }
-
-            cf[rc] = tok;
-            levels[x * stride + y] = (uint8_t) tok;
+        switch (tx_class) {
+        case TX_CLASS_2D: {
+            const unsigned nonsquare_tx = tx >= RTX_4X8;
+            const uint8_t (*const lo_ctx_offsets)[5] =
+                dav1d_lo_ctx_offsets[nonsquare_tx + (tx & nonsquare_tx)];
+            const ptrdiff_t stride = 4 * sh;
+            memset(levels, 0, stride * (4 * sw + 2));
+            DECODE_COEFS_CLASS(TX_CLASS_2D);
         }
-        for (int i = eob - 1; i > 0; i--) { // ac
-            const int rc = scan[i], x = rc >> shift, y = rc & mask;
-
-            // lo tok
-            const int ctx = get_coef_nz_ctx(levels, tx, tx_class, x, y, stride);
-            uint16_t *const lo_cdf = ts->cdf.coef.base_tok[t_dim->ctx][chroma][ctx];
-            int tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf, 3);
-            if (dbg)
-                printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n",
-                       t_dim->ctx, chroma, ctx, i, rc, tok, ts->msac.rng);
-
-            if (tok == 3) {
-                const int br_ctx = get_br_ctx(levels, 1, tx_class, x, y, stride);
-                tok = dav1d_msac_decode_hi_tok(&ts->msac, br_cdf[br_ctx]);
-                if (dbg)
-                    printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n",
-                           imin(t_dim->ctx, 3), chroma, br_ctx, i, rc, tok,
-                           ts->msac.rng);
-            }
-            cf[rc] = tok;
-            levels[x * stride + y] = (uint8_t) tok;
+        case TX_CLASS_H: {
+#define lo_ctx_offsets NULL
+            const ptrdiff_t stride = 16;
+            memset(levels, 0, stride * (4 * sh + 2));
+            DECODE_COEFS_CLASS(TX_CLASS_H);
         }
-        { // dc
-            const int ctx = (tx_class != TX_CLASS_2D) ?
-                get_coef_nz_ctx(levels, tx, tx_class, 0, 0, stride) : 0;
-            uint16_t *const lo_cdf = ts->cdf.coef.base_tok[t_dim->ctx][chroma][ctx];
-            dc_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf, 3);
-            if (dbg)
-                printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n",
-                       t_dim->ctx, chroma, ctx, dc_tok, ts->msac.rng);
-
-            if (dc_tok == 3) {
-                const int br_ctx = get_br_ctx(levels, 0, tx_class, 0, 0, stride);
-                dc_tok = dav1d_msac_decode_hi_tok(&ts->msac, br_cdf[br_ctx]);
-                if (dbg)
-                    printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n",
-                           imin(t_dim->ctx, 3), chroma, dc_tok, ts->msac.rng);
-            }
+        case TX_CLASS_V: {
+            const ptrdiff_t stride = 16;
+            memset(levels, 0, stride * (4 * sw + 2));
+            DECODE_COEFS_CLASS(TX_CLASS_V);
         }
+#undef lo_ctx_offsets
+#undef DECODE_COEFS_CLASS
+        default: assert(0);
+        }
     } else { // dc-only
-        uint16_t *const lo_cdf = ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma][0];
-        int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf, 2);
+        int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[0], 2);
         dc_tok = 1 + tok_br;
         if (dbg)
             printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n",
                    t_dim->ctx, chroma, 0, dc_tok, ts->msac.rng);
-
         if (tok_br == 2) {
-            dc_tok = dav1d_msac_decode_hi_tok(&ts->msac, br_cdf[0]);
+            dc_tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[0]);
             if (dbg)
                 printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n",
                        imin(t_dim->ctx, 3), chroma, dc_tok, ts->msac.rng);
@@ -276,7 +568,7 @@
     unsigned cul_level = 0;
 
     if (dc_tok) { // dc
-        const int dc_sign_ctx = get_dc_sign_ctx(t_dim, a, l);
+        const int dc_sign_ctx = get_dc_sign_ctx(tx, a, l);
         uint16_t *const dc_sign_cdf =
             ts->cdf.coef.dc_sign[chroma][dc_sign_ctx];
         const int sign = dav1d_msac_decode_bool_adapt(&ts->msac, dc_sign_cdf);
@@ -328,7 +620,7 @@
     }
 
     // context
-    *res_ctx = imin(cul_level, 63) | dc_sign;
+    *res_ctx = umin(cul_level, 63) | dc_sign;
 
     return eob;
 }
--- a/src/tables.c
+++ b/src/tables.c
@@ -272,119 +272,34 @@
     [BS_4x4  ]   = 0,
 };
 
-const uint8_t dav1d_nz_map_ctx_offset[N_RECT_TX_SIZES][5][5] = {
-    [TX_4X4] = {
-        { 0, 1, 6, 6 },
-        { 1, 6, 6, 21 },
-        { 6, 6, 21, 21 },
-        { 6, 21, 21, 21 },
-    }, [TX_8X8] = {
-        { 0, 1, 6, 6, 21 },
-        { 1, 6, 6, 21, 21 },
-        { 6, 6, 21, 21, 21 },
-        { 6, 21, 21, 21, 21 },
-        { 21, 21, 21, 21, 21 }
-    }, [TX_16X16] = {
-        { 0, 1, 6, 6, 21 },
-        { 1, 6, 6, 21, 21 },
-        { 6, 6, 21, 21, 21 },
-        { 6, 21, 21, 21, 21 },
-        { 21, 21, 21, 21, 21 }
-    }, [TX_32X32] = {
-        { 0, 1, 6, 6, 21 },
-        { 1, 6, 6, 21, 21 },
-        { 6, 6, 21, 21, 21 },
-        { 6, 21, 21, 21, 21 },
-        { 21, 21, 21, 21, 21 }
-    }, [TX_64X64] = {
-        { 0, 1, 6, 6, 21 },
-        { 1, 6, 6, 21, 21 },
-        { 6, 6, 21, 21, 21 },
-        { 6, 21, 21, 21, 21 },
-        { 21, 21, 21, 21, 21 }
-    }, [RTX_4X8] = {
-        { 0, 11, 11, 11 },
-        { 11, 11, 11, 11 },
-        { 6, 6, 21, 21 },
-        { 6, 21, 21, 21 },
-        { 21, 21, 21, 21 }
-    }, [RTX_8X4] = {
-        { 0, 16, 6, 6, 21 },
-        { 16, 16, 6, 21, 21 },
+const uint8_t dav1d_lo_ctx_offsets[3][5][5] = {
+    { /* w == h */
+        {  0,  1,  6,  6, 21 },
+        {  1,  6,  6, 21, 21 },
+        {  6,  6, 21, 21, 21 },
+        {  6, 21, 21, 21, 21 },
+        { 21, 21, 21, 21, 21 },
+    }, { /* w > h */
+        {  0, 16,  6,  6, 21 },
+        { 16, 16,  6, 21, 21 },
         { 16, 16, 21, 21, 21 },
         { 16, 16, 21, 21, 21 },
-    }, [RTX_8X16] = {
-        { 0, 11, 11, 11, 11 },
-        { 11, 11, 11, 11, 11 },
-        { 6, 6, 21, 21, 21 },
-        { 6, 21, 21, 21, 21 },
-        { 21, 21, 21, 21, 21 }
-    }, [RTX_16X8] = {
-        { 0, 16, 6, 6, 21 },
-        { 16, 16, 6, 21, 21 },
         { 16, 16, 21, 21, 21 },
-        { 16, 16, 21, 21, 21 },
-        { 16, 16, 21, 21, 21 }
-    }, [RTX_16X32] = {
-        { 0, 11, 11, 11, 11 },
+    }, { /* w < h */
+        {  0, 11, 11, 11, 11 },
         { 11, 11, 11, 11, 11 },
-        { 6, 6, 21, 21, 21 },
-        { 6, 21, 21, 21, 21 },
-        { 21, 21, 21, 21, 21 }
-    }, [RTX_32X16] = {
-        { 0, 16, 6, 6, 21 },
-        { 16, 16, 6, 21, 21 },
-        { 16, 16, 21, 21, 21 },
-        { 16, 16, 21, 21, 21 },
-        { 16, 16, 21, 21, 21 }
-    }, [RTX_32X64] = {
-        { 0, 11, 11, 11, 11 },
-        { 11, 11, 11, 11, 11 },
-        { 6, 6, 21, 21, 21 },
-        { 6, 21, 21, 21, 21 },
-        { 21, 21, 21, 21, 21 }
-    }, [RTX_64X32] = {
-        { 0, 16, 6, 6, 21 },
-        { 16, 16, 6, 21, 21 },
-        { 16, 16, 21, 21, 21 },
-        { 16, 16, 21, 21, 21 },
-        { 16, 16, 21, 21, 21 }
-    }, [RTX_4X16] = {
-        { 0, 11, 11, 11 },
-        { 11, 11, 11, 11 },
-        { 6, 6, 21, 21 },
-        { 6, 21, 21, 21 },
-        { 21, 21, 21, 21 }
-    }, [RTX_16X4] = {
-        { 0, 16, 6, 6, 21 },
-        { 16, 16, 6, 21, 21 },
-        { 16, 16, 21, 21, 21 },
-        { 16, 16, 21, 21, 21 },
-    }, [RTX_8X32] = {
-        { 0, 11, 11, 11, 11 },
-        { 11, 11, 11, 11, 11 },
-        { 6, 6, 21, 21, 21 },
-        { 6, 21, 21, 21, 21 },
-        { 21, 21, 21, 21, 21 }
-    }, [RTX_32X8] = {
-        { 0, 16, 6, 6, 21 },
-        { 16, 16, 6, 21, 21 },
-        { 16, 16, 21, 21, 21 },
-        { 16, 16, 21, 21, 21 },
-        { 16, 16, 21, 21, 21 }
-    }, [RTX_16X64] = {
-        { 0, 11, 11, 11, 11 },
-        { 11, 11, 11, 11, 11 },
-        { 6, 6, 21, 21, 21 },
-        { 6, 21, 21, 21, 21 },
-        { 21, 21, 21, 21, 21 }
-    }, [RTX_64X16] = {
-        { 0, 16, 6, 6, 21 },
-        { 16, 16, 6, 21, 21 },
-        { 16, 16, 21, 21, 21 },
-        { 16, 16, 21, 21, 21 },
-        { 16, 16, 21, 21, 21 }
-    }
+        {  6,  6, 21, 21, 21 },
+        {  6, 21, 21, 21, 21 },
+        { 21, 21, 21, 21, 21 },
+    },
+};
+
+const uint8_t dav1d_skip_ctx[5][5] = {
+    { 1, 2, 2, 2, 3 },
+    { 2, 4, 4, 4, 5 },
+    { 2, 4, 4, 4, 5 },
+    { 2, 4, 4, 4, 5 },
+    { 3, 5, 5, 5, 6 },
 };
 
 const uint8_t /* enum TxClass */ dav1d_tx_type_class[N_TX_TYPES_PLUS_LL] = {
--- a/src/tables.h
+++ b/src/tables.h
@@ -57,7 +57,8 @@
 
 extern const uint8_t dav1d_filter_mode_to_y_mode[5];
 extern const uint8_t dav1d_ymode_size_context[N_BS_SIZES];
-extern const uint8_t dav1d_nz_map_ctx_offset[N_RECT_TX_SIZES][5][5];
+extern const uint8_t dav1d_lo_ctx_offsets[3][5][5];
+extern const uint8_t dav1d_skip_ctx[5][5];
 extern const uint8_t /* enum TxClass */
                      dav1d_tx_type_class[N_TX_TYPES_PLUS_LL];
 extern const uint8_t /* enum Filter2d */