ref: 1e0f5fed0c530ac7dce561e2334ef3a0f7475507
author: Sigrid Solveig Haflínudóttir <[email protected]>
date: Tue Aug 1 22:21:33 EDT 2023
a first
--- /dev/null
+++ b/LICENSE
@@ -1,0 +1,1 @@
+A mix of public domain and other stuff, check each file.
--- /dev/null
+++ b/README.md
@@ -1,0 +1,7 @@
+# jxl
+
+A JPEG XL decoder for Plan 9.
+
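+## Usage
+
+Reads a JPEG XL image from standard input and writes an uncompressed
+Plan 9 image(6) file to standard output (assuming the binary is
+installed as `jxl`):
+
+	jxl <in.jxl >out.bit
+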
+## BUGS
+
+Decoding is slow.
--- /dev/null
+++ b/builtins.amd64.s
@@ -1,0 +1,13 @@
+TEXT __builtin_ctz(SB),1,$0
+ BYTE $0x0F; BYTE $0xBC; BYTE $0xC5 /* BSFL RARG, AX */
+ RET
+
+TEXT __builtin_clz(SB),1,$0
+ BYTE $0x0F; BYTE $0xBD; BYTE $0xC5 /* BSRL RARG, AX */
+ XORL $31, AX
+ RET
+
+TEXT __builtin_clzll(SB),1,$0
+ BYTE $0x48; BYTE $0x0F; BYTE $0xBD; BYTE $0xC5 /* BSRQ RARG, AX */
+ XORL $63, AX
+ RET
--- /dev/null
+++ b/builtins.arm64.s
@@ -1,0 +1,12 @@
+TEXT __builtin_ctz(SB),1,$0
+ RBITW R0, R0
+ CLZW R0, R0
+ RETURN
+
+TEXT __builtin_clz(SB),1,$0
+ CLZW R0, R0
+ RETURN
+
+TEXT __builtin_clzll(SB),1,$0
+ CLZ R0, R0
+ RETURN
--- /dev/null
+++ b/builtins.c
@@ -1,0 +1,19 @@
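+/*
+ * portable C fallbacks for the gcc-style builtins that j40.h calls;
+ * builtins.amd64.s and builtins.arm64.s provide assembly versions for
+ * those architectures.  unlike BSF/BSR, these are well defined at x == 0.
+ */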
+int
+__builtin_clz(unsigned int x)
+{
+ unsigned int r;
+ if(x == 0)
+ return 32;
+ for(r = 0; (x & (1UL<<31)) == 0; x <<= 1, r++);
+ return r;
+}
+
+int
+__builtin_clzll(unsigned long long x)
+{
+ unsigned long long r;
+ if(x == 0)
+ return 64;
+ for(r = 0; (x & (1ULL<<63)) == 0; x <<= 1, r++);
+ return r;
+}
--- /dev/null
+++ b/cbrtf.c
@@ -1,0 +1,66 @@
+/* origin: FreeBSD /usr/src/lib/msun/src/s_cbrtf.c */
+/*
+ * Conversion to float by Ian Lance Taylor, Cygnus Support, [email protected].
+ * Debugged and optimized by Bruce D. Evans.
+ */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+/* cbrtf(x)
+ * Return cube root of x
+ */
+
+#include <u.h>
+#include <libc.h>
+
+static const unsigned
+B1 = 709958130, /* B1 = (127-127.0/3-0.03306235651)*2**23 */
+B2 = 642849266; /* B2 = (127-127.0/3-24/3-0.03306235651)*2**23 */
+
+float cbrtf(float x)
+{
+ double r,T;
+ union {float f; u32int i;} u = {x};
+ u32int hx = u.i & 0x7fffffff;
+
+ if (hx >= 0x7f800000) /* cbrt(NaN,INF) is itself */
+ return x + x;
+
+ /* rough cbrt to 5 bits */
+ if (hx < 0x00800000) { /* zero or subnormal? */
+ if (hx == 0)
+ return x; /* cbrt(+-0) is itself */
+		u.f = x*16777216.0f; /* 0x1p24f */
+ hx = u.i & 0x7fffffff;
+ hx = hx/3 + B2;
+ } else
+ hx = hx/3 + B1;
+ u.i &= 0x80000000;
+ u.i |= hx;
+
+ /*
+ * First step Newton iteration (solving t*t-x/t == 0) to 16 bits. In
+ * double precision so that its terms can be arranged for efficiency
+ * without causing overflow or underflow.
+ */
+ T = u.f;
+ r = T*T*T;
+ T = T*((double)x+x+r)/(x+r+r);
+
+ /*
+ * Second step Newton iteration to 47 bits. In double precision for
+ * efficiency and accuracy.
+ */
+ r = T*T*T;
+ T = T*((double)x+x+r)/(x+r+r);
+
+ /* rounding to 24 bits is perfect in round-to-nearest mode */
+ return T;
+}
--- /dev/null
+++ b/decode.c
@@ -1,0 +1,90 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+#include <thread.h>
+#define J40_CONFIRM_THAT_THIS_IS_EXPERIMENTAL_AND_POTENTIALLY_UNSAFE
+#define J40_IMPLEMENTATION
+#define J40_INLINE
+int __builtin_clz(unsigned int x);
+int __builtin_clzll(unsigned long long x);
+float cbrtf(float x);
+float ldexpf(float x, int n);
+float hypotf(float x, float y);
+#include "j40.h"
+
+static void
+usage(void)
+{
+	fprint(2, "usage: %s\n", argv0);
+	threadexitsall("usage");
+}
+
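+/*
+ * read all of fd f into a single buffer, growing it geometrically;
+ * returns nil (with errstr set) on read error or empty input.
+ */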
+static u8int *
+readall(int f, int *outsz)
+{
+ int bufsz, sz, n;
+ u8int *s;
+
+ bufsz = 65535;
+ s = nil;
+ for(sz = 0;; sz += n){
+ if(bufsz-sz < 65536){
+ bufsz *= 2;
+			if((s = realloc(s, bufsz)) == nil)
+				sysfatal("realloc: %r");
+ }
+ if((n = readn(f, s+sz, bufsz-sz)) < 1)
+ break;
+ }
+ if(n < 0 || sz < 1){
+ if(n == 0)
+ werrstr("empty");
+ free(s);
+ return nil;
+ }
+ *outsz = sz;
+
+ return s;
+}
+
+void
+threadmain(int argc, char **argv)
+{
+ j40_pixels_u8x4 pixels;
+ u8int *p, *in, x;
+ j40_image image;
+ j40_frame frame;
+ int sz, y, n;
+ Biobuf *b;
+
+ ARGBEGIN{
+ default:
+ usage();
+ }ARGEND
+
+ if((in = readall(0, &sz)) == nil || (b = Bfdopen(1, OWRITE)) == nil)
+ sysfatal("%r");
+	j40_from_memory(&image, in, sz, free);
+ j40_output_format(&image, J40_RGBA, J40_U8X4);
+
+ if(j40_next_frame(&image)){
+ frame = j40_current_frame(&image);
+ pixels = j40_frame_pixels_u8x4(&frame, J40_RGBA);
+ Bprint(b, "%11s %11d %11d %11d %11d ", "a8r8g8b8", 0, 0, pixels.width, pixels.height);
+ for(y = 0; y < pixels.height; ++y){
+ p = (u8int*)j40_row_u8x4(pixels, y);
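+		/* j40 emits R,G,B,A bytes per pixel; image(6) stores a8r8g8b8
+		 * little-endian, i.e. B,G,R,A, so swap red and blue */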
+ for(n = 0; n < 4*pixels.width; n += 4){
+ x = p[n+0];
+ p[n+0] = p[n+2];
+ p[n+2] = x;
+ }
+ Bwrite(b, p, 4*pixels.width);
+ }
+ }
+
+ if(j40_error(&image))
+ sysfatal("%s", j40_error_string(&image));
+
+	j40_free(&image); // also frees all memory associated with j40_frame etc.
+ Bterm(b);
+
+ threadexitsall(nil);
+}
--- /dev/null
+++ b/hypotf.c
@@ -1,0 +1,43 @@
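+/*
+ * scaled hypot in the style of musl's hypotf: very large or very small
+ * operands are rescaled by 2^90 or 2^-90 first, so that the intermediate
+ * x*x + y*y can neither overflow nor underflow.
+ */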
+#include <stdint.h>
+#include <math.h>
+
+float
+hypotf(float x, float y)
+{
+	union {float f; uint32_t i;} ux = {x}, uy = {y}, ut;
+	float z;
+	union {
+		float f;
+		uint32_t x;
+	}oneP[] = {
+		{.x = 0x6c800000}, /* 0x1p+90f */
+		{.x = 0x12800000}, /* 0x1p-90f */
+	};
+
+ ux.i &= -1U>>1;
+ uy.i &= -1U>>1;
+ if (ux.i < uy.i) {
+ ut = ux;
+ ux = uy;
+ uy = ut;
+ }
+
+ x = ux.f;
+ y = uy.f;
+ if (uy.i == 0xff<<23)
+ return y;
+ if (ux.i >= 0xff<<23 || uy.i == 0 || ux.i - uy.i >= 25<<23)
+ return x + y;
+
+ z = 1;
+ if (ux.i >= (0x7f+60)<<23) {
+ z = oneP[0].f;
+ x *= oneP[1].f;
+ y *= oneP[1].f;
+ } else if (uy.i < (0x7f-60)<<23) {
+ z = oneP[1].f;
+ x *= oneP[0].f;
+ y *= oneP[0].f;
+ }
+ return z*sqrtf((double)x*x + (double)y*y);
+}
--- /dev/null
+++ b/j40.h
@@ -1,0 +1,8505 @@
+// J40: Independent, self-contained JPEG XL decoder
+// Kang Seonghoon, version 2270 (2022-09), Public Domain
+// https://github.com/lifthrasiir/j40
+//
+// This is a decoder for the JPEG XL (ISO/IEC 18181) image format. It intends to be a fully
+// compatible reimplementation of the reference implementation, libjxl, and also serves as
+// verification that the specification allows for an independent implementation besides libjxl.
+//
+// The following is a simple but complete converter from JPEG XL to Portable Arbitrary Map format:
+//
+/* -------------------------------------------------------------------------------- //
+#define J40_IMPLEMENTATION // only a SINGLE file should have this
+#include "j40.h" // you also need to define a macro for experimental versions; follow the error.
+#include <stdio.h>
+#include <stdarg.h> // for va_*
+
+static int oops(const char *fmt, ...) {
+ va_list args;
+ va_start(args, fmt);
+ vfprintf(stderr, fmt, args);
+ va_end(args);
+ return 1;
+}
+
+int main(int argc, char **argv) {
+ if (argc < 3) return oops("Usage: %s input.jxl output.pam\n", argv[0]);
+
+ FILE *out = fopen(argv[2], "wb");
+ if (!out) return oops("Error: Cannot open an output file.\n");
+
+ j40_image image;
+ j40_from_file(&image, argv[1]); // or: j40_from_memory(&image, buf, bufsize, freefunc);
+ j40_output_format(&image, J40_RGBA, J40_U8X4);
+
+ // JPEG XL supports animation, so `j40_next_frame` calls can be called multiple times
+ if (j40_next_frame(&image)) {
+ j40_frame frame = j40_current_frame(&image);
+ j40_pixels_u8x4 pixels = j40_frame_pixels_u8x4(&frame, J40_RGBA);
+ fprintf(out,
+ "P7\n"
+ "WIDTH %d\n"
+ "HEIGHT %d\n"
+ "DEPTH 4\n"
+ "MAXVAL 255\n"
+ "TUPLTYPE RGB_ALPHA\n"
+ "ENDHDR\n",
+ pixels.width, pixels.height);
+ for (int y = 0; y < pixels.height; ++y) {
+ fwrite(j40_row_u8x4(pixels, y), 4, pixels.width, out);
+ }
+ }
+
+ // J40 stops once the first error is encountered; its error can be checked at the very end
+ if (j40_error(&image)) return oops("Error: %s\n", j40_error_string(&image));
+ if (ferror(out)) return oops("Error: Cannot fully write to the output file.\n");
+
+	j40_free(&image); // also frees all memory associated with j40_frame etc.
+ fclose(out);
+ return 0;
+}
+// -------------------------------------------------------------------------------- */
+
+////////////////////////////////////////////////////////////////////////////////
+// preamble (only reachable via the user `#include`)
+
+// controls whether each `#if`-`#endif` section in this file should be included or not.
+// there are multiple purposes of this macro:
+// - `J40__RECURSING` is always defined after the first ever `#include`, so that:
+// - the preamble will precede every other code in the typical usage, and
+// - the preamble won't be included twice.
+// - `J40__RECURSING` is either 0 (public) or -1 (internal) depending on the logical visibility,
+// so that the preamble can choose whether to include the internal code or not.
+// - larger values (>= 100) are used to repeat a specific section of code with
+// slightly different parameters, i.e. templated code.
+// - one value (currently 9999) is reserved and used to ignore subsequent top-level `#include`s.
+#ifndef J40__RECURSING
+
+#define J40_VERSION 2270 // (fractional gregorian year - 2000) * 100, with a liberal rounding
+
+#ifndef J40_CONFIRM_THAT_THIS_IS_EXPERIMENTAL_AND_POTENTIALLY_UNSAFE
+#error "Please #define J40_CONFIRM_THAT_THIS_IS_EXPERIMENTAL_AND_POTENTIALLY_UNSAFE to use J40. Proceed at your own risk."
+#endif
+
+//#define J40_DEBUG
+
+#ifndef J40_FILENAME // should be provided if this file has a different name than `j40.h`
+#define J40_FILENAME "j40.h"
+#endif
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <limits.h> // CHAR_BIT is checked below even without J40_IMPLEMENTATION
+
+#ifdef J40_IMPLEMENTATION
+ #define J40__IMPLEMENTATION_INCLUDED
+ #include <string.h>
+ #include <math.h>
+ #include <limits.h>
+ #include <errno.h>
+ #include <stdio.h>
+ #ifdef J40_DEBUG
+ #include <assert.h>
+ #endif
+ #ifndef J40__EXPOSE_INTERNALS
+ #define J40__EXPOSE_INTERNALS
+ #endif
+#endif
+
+#ifdef J40__EXPOSE_INTERNALS
+ #define J40__RECURSING (-1)
+#else
+ #define J40__RECURSING 0
+#endif
+
+// we don't care about secure CRT, which is only marginally safe and not even compatible with C11
+#ifdef _MSC_VER
+ #pragma warning(push)
+ #pragma warning(disable: 4996)
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#endif // !defined J40__RECURSING
+
+#if J40__RECURSING == 9999 // enabled only when the header file is included the second time or more
+ #if !defined J40__IMPLEMENTATION_INCLUDED && defined J40_IMPLEMENTATION
+ #error "J40 is included with J40_IMPLEMENTATION defined, but it was already included without it so it would have been ignored!"
+ #endif
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// public platform macros
+
+#if J40__RECURSING <= 0
+
+// just in case:
+#if CHAR_BIT != 8 // in fact, pretty much every file processing wouldn't make sense if CHAR_BIT > 8
+ #error "J40 requires CHAR_BIT == 8"
+#endif
+
+#ifndef J40_STATIC_ASSERT
+	#if __STDC_VERSION__ >= 201112L // _Static_assert is a C11 feature
+ #define J40_STATIC_ASSERT(cond, msg) _Static_assert(cond, #msg)
+ #else
+ #define J40_STATIC_ASSERT(cond, msg) typedef char j40__##msg[(cond) ? 1 : -1]
+ #endif
+#endif // !defined J40_STATIC_ASSERT
+
+// just in case again, because it is still possible for them to have padding bits (that we needn't):
+J40_STATIC_ASSERT(sizeof(uint8_t) == 1, uint8_t_should_have_no_padding_bits);
+J40_STATIC_ASSERT(sizeof(uint16_t) == 2, uint16_t_should_have_no_padding_bits);
+J40_STATIC_ASSERT(sizeof(uint32_t) == 4, uint32_t_should_have_no_padding_bits);
+J40_STATIC_ASSERT(sizeof(uint64_t) == 8, uint64_t_should_have_no_padding_bits);
+
+#ifndef J40_API
+ #define J40_API // TODO
+#endif
+
+#endif // J40__RECURSING <= 0
+
+////////////////////////////////////////////////////////////////////////////////
+// public API
+
+#if J40__RECURSING <= 0
+
+// an error code. zero indicates success; any non-zero value indicates an error condition.
+// user callbacks can also emit error codes, which should not exceed `J40_MIN_RESERVED_ERR`.
+// it can be interpreted as a four-letter code, but such encoding is not guaranteed.
+typedef uint32_t j40_err;
+#define J40_MIN_RESERVED_ERR (j40_err) (1 << 24) // anything below this can be used freely
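+// e.g. a truncated input is reported as 0x73687274, the four ASCII bytes "shrt".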
+
+typedef struct {
+ // either J40__IMAGE_MAGIC, (J40__IMAGE_ERR_MAGIC ^ origin) or (J40__IMAGE_OPEN_ERR_MAGIC ^ origin)
+ uint32_t magic;
+ union {
+ struct j40__inner *inner; // if magic == J40__IMAGE_MAGIC
+ j40_err err; // if magic == J40__IMAGE_ERR_MAGIC
+ int saved_errno; // if magic == J40__IMAGE_OPEN_ERR_MAGIC (err is assumed to be `open`)
+ } u;
+} j40_image;
+
+typedef struct {
+ uint32_t magic; // should be J40__FRAME_MAGIC or J40__FRAME_ERR_MAGIC
+ uint32_t reserved;
+ struct j40__inner *inner;
+} j40_frame;
+
+typedef void (*j40_memory_free_func)(void *data);
+
+// pixel formats
+//rsvd: J40_U8 0x0f0f
+//rsvd: J40_U16 0x0f17
+//rsvd: J40_U32 0x0f1b
+//rsvd: J40_U64 0x0f1d
+//rsvd: J40_F32 0x0f1e
+//rsvd: J40_U8X3 0x0f27
+//rsvd: J40_U16X3 0x0f2b
+//rsvd: J40_U32X3 0x0f2d
+//rsvd: J40_F32X3 0x0f2e
+#define J40_U8X4 0x0f33
+//rsvd: J40_U16X4 0x0f35
+//rsvd: J40_U32X4 0x0f36
+//rsvd: J40_F32X4 0x0f39
+
+// color types
+//rsvd: J40_RED 0x170f
+//rsvd: J40_GREEN 0x1717
+//rsvd: J40_BLUE 0x171b
+//rsvd: J40_LUMI 0x171d
+//rsvd: J40_ALPHA 0x171e
+//rsvd: J40_CYAN 0x1727
+//rsvd: J40_YELLOW 0x172b
+//rsvd: J40_MAGENTA 0x172d
+//rsvd: J40_BLACK 0x172e
+//rsvd: J40_JPEG_Y 0x1733
+//rsvd: J40_JPEG_CB 0x1735
+//rsvd: J40_JPEG_CR 0x1736
+//rsvd: J40_OPSIN_X 0x1739
+//rsvd: J40_OPSIN_Y 0x173a
+//rsvd: J40_OPSIN_B 0x173c
+//rsvd: J40_RED_BEFORE_CT 0x1747
+//rsvd: J40_GREEN_BEFORE_CT 0x174b
+//rsvd: J40_BLUE_BEFORE_CT 0x174d
+//rsvd: J40_RGB 0x174e
+//rsvd: J40_BGR 0x1753
+#define J40_RGBA 0x1755
+//rsvd: J40_ARGB 0x1756
+//rsvd: J40_BGRA 0x1759
+//rsvd: J40_ABGR 0x175a
+
+J40_API j40_err j40_error(const j40_image *image);
+J40_API const char *j40_error_string(const j40_image *image);
+
+J40_API j40_err j40_from_memory(j40_image *image, void *buf, size_t size, j40_memory_free_func freefunc);
+J40_API j40_err j40_from_file(j40_image *image, const char *path);
+
+J40_API j40_err j40_output_format(j40_image *image, int32_t channel, int32_t format);
+
+J40_API int j40_next_frame(j40_image *image);
+J40_API j40_frame j40_current_frame(j40_image *image);
+
+#define J40__DEFINE_PIXELS(type, suffix) \
+ typedef struct { \
+ int32_t width, height; \
+ int32_t stride_bytes; \
+ const void *data; \
+ } j40_pixels_##suffix; \
+ J40_API j40_pixels_##suffix j40_frame_pixels_##suffix(const j40_frame *frame, int32_t channel); \
+ J40_API const type *j40_row_##suffix(j40_pixels_##suffix pixels, int32_t y)
+
+typedef uint8_t /*j40_u8x3[3],*/ j40_u8x4[4];
+//typedef uint16_t j40_u16x3[3], j40_u16x4[4];
+//typedef uint32_t j40_u32x3[3], j40_u32x4[4];
+typedef float /*j40_f32x3[3],*/ j40_f32x4[4]; // TODO temporary, API will be available later
+
+//J40__DEFINE_PIXELS(uint8_t, u8); // j40_pixels_u8, j40_frame_pixels_u8, j40_row_u8
+//J40__DEFINE_PIXELS(uint16_t, u16); // j40_pixels_u16, j40_frame_pixels_u16, j40_row_u16
+//J40__DEFINE_PIXELS(uint32_t, u32); // j40_pixels_u32, j40_frame_pixels_u32, j40_row_u32
+//J40__DEFINE_PIXELS(uint64_t, u64); // j40_pixels_u64, j40_frame_pixels_u64, j40_row_u64
+//J40__DEFINE_PIXELS(float, f32); // j40_pixels_f32, j40_frame_pixels_f32, j40_row_f32
+//J40__DEFINE_PIXELS(j40_u8x3, u8x3); // j40_pixels_u8x3, j40_frame_pixels_u8x3, j40_row_u8x3
+//J40__DEFINE_PIXELS(j40_u16x3, u16x3); // j40_pixels_u16x3, j40_frame_pixels_u16x3, j40_row_u16x3
+//J40__DEFINE_PIXELS(j40_u32x3, u32x3); // j40_pixels_u32x3, j40_frame_pixels_u32x3, j40_row_u32x3
+//J40__DEFINE_PIXELS(j40_f32x3, f32x3); // j40_pixels_f32x3, j40_frame_pixels_f32x3, j40_row_f32x3
+J40__DEFINE_PIXELS(j40_u8x4, u8x4); // j40_pixels_u8x4, j40_frame_pixels_u8x4, j40_row_u8x4
+//J40__DEFINE_PIXELS(j40_u16x4, u16x4); // j40_pixels_u16x4, j40_frame_pixels_u16x4, j40_row_u16x4
+//J40__DEFINE_PIXELS(j40_u32x4, u32x4); // j40_pixels_u32x4, j40_frame_pixels_u32x4, j40_row_u32x4
+//J40__DEFINE_PIXELS(j40_f32x4, f32x4); // j40_pixels_f32x4, j40_frame_pixels_f32x4, j40_row_f32x4
+
+J40_API void j40_free(j40_image *image);
+
+#endif // J40__RECURSING <= 0
+
+////////////////////////////////////////////////////////////////////////////////
+//////////////////////// internal code starts from here ////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+
+#if J40__RECURSING < 0
+
+// comment convention:
+// "SPEC" comments are used for incorrect, ambiguous or misleading specification issues.
+// "TODO spec" comments are roughly same, but not yet fully confirmed & reported.
+
+////////////////////////////////////////////////////////////////////////////////
+// private platform macros
+
+#ifdef __has_attribute // since GCC 5.0.0 and clang 2.9.0
+ #if __has_attribute(always_inline)
+ #define J40__HAS_ALWAYS_INLINE_ATTR 1
+ #endif
+ #if __has_attribute(warn_unused_result)
+ #define J40__HAS_WARN_UNUSED_RESULT_ATTR 1
+ #endif
+#endif
+
+#ifdef __has_builtin // since GCC 10.0.0 and clang 1.0.0 (which thus requires no version check)
+ #if __has_builtin(__builtin_expect)
+ #define J40__HAS_BUILTIN_EXPECT 1
+ #endif
+ #if __has_builtin(__builtin_add_overflow)
+ #define J40__HAS_BUILTIN_ADD_OVERFLOW 1
+ #endif
+ #if __has_builtin(__builtin_sub_overflow)
+ #define J40__HAS_BUILTIN_SUB_OVERFLOW 1
+ #endif
+ #if __has_builtin(__builtin_mul_overflow)
+ #define J40__HAS_BUILTIN_MUL_OVERFLOW 1
+ #endif
+ #if __has_builtin(__builtin_unreachable)
+ #define J40__HAS_BUILTIN_UNREACHABLE 1
+ #endif
+ #if __has_builtin(__builtin_assume_aligned)
+ #define J40__HAS_BUILTIN_ASSUME_ALIGNED 1
+ #endif
+#endif
+
+// clang (among many others) fakes GCC version by default, but we handle clang separately
+#if defined __GNUC__ && !defined __clang__
+ #define J40__GCC_VER (__GNUC__ * 0x10000 + __GNUC_MINOR__ * 0x100 + __GNUC_PATCHLEVEL__)
+#else
+ #define J40__GCC_VER 0
+#endif
+
+#ifdef __clang__
+ #define J40__CLANG_VER (__clang_major__ * 0x10000 + __clang_minor__ * 0x100 + __clang_patchlevel__)
+#else
+ #define J40__CLANG_VER 0
+#endif
+
+#ifndef J40_STATIC
+ #define J40_STATIC static
+#endif
+
+#ifndef J40_INLINE
+ #define J40_INLINE J40_STATIC inline
+#endif
+
+#ifndef J40_ALWAYS_INLINE
+ #if J40__HAS_ALWAYS_INLINE_ATTR || J40__GCC_VER >= 0x30100 || J40__CLANG_VER >= 0x10000
+ #define J40_ALWAYS_INLINE __attribute__((always_inline)) J40_INLINE
+ #elif defined _MSC_VER
+ #define J40_ALWAYS_INLINE __forceinline
+ #else
+ #define J40_ALWAYS_INLINE J40_INLINE
+ #endif
+#endif // !defined J40_ALWAYS_INLINE
+
+#ifndef J40_RESTRICT
+ #if __STDC_VERSION__ >= 199901L
+ #define J40_RESTRICT restrict
+	#elif defined __GNUC__ || _MSC_VER >= 1900 // since pretty much every GCC/Clang and VS 2015
+ #define J40_RESTRICT __restrict
+ #else
+ #define J40_RESTRICT
+ #endif
+#endif // !defined J40_RESTRICT
+
+// most structs in J40 are designed to be zero-initialized, and this avoids useless warnings
+#if defined __cplusplus /*|| __STDC_VERSION__ >= 2023xxL*/
+ #define J40__INIT {}
+#else
+ #define J40__INIT {0}
+#endif
+
+#ifndef J40_NODISCARD
+ #if __cplusplus >= 201703L /*|| __STDC_VERSION__ >= 2023xxL */
+ #define J40_NODISCARD [[nodiscard]] // since C++17 and C23
+ #elif J40__HAS_WARN_UNUSED_RESULT_ATTR || J40__GCC_VER >= 0x30400 || J40__CLANG_VER >= 0x10000
+ // this is stronger than [[nodiscard]] in that it's much harder to suppress; we're okay with that
+ #define J40_NODISCARD __attribute__((warn_unused_result)) // since GCC 3.4 and clang 1.0.0
+ #else
+ #define J40_NODISCARD
+ #endif
+#endif // !defined J40_NODISCARD
+
+#ifndef J40_MAYBE_UNUSED
+ #if __cplusplus >= 201703L /*|| __STDC_VERSION__ >= 2023xxL */
+ #define J40_MAYBE_UNUSED [[maybe_unused]] // since C++17 and C23
+ #elif J40__GCC_VER >= 0x30000 || J40__CLANG_VER >= 0x10000
+ #define J40_MAYBE_UNUSED __attribute__((unused)) // since GCC 2.95 or earlier (!) and clang 1.0.0
+ #else
+ #define J40_MAYBE_UNUSED
+ #endif
+#endif
+
+// rule of thumb: sparingly use them, except for the obvious error cases
+#ifndef J40_EXPECT
+ #if J40__HAS_BUILTIN_EXPECT || J40__GCC_VER >= 0x30000
+ #define J40_EXPECT(p, v) __builtin_expect(p, v)
+ #else
+ #define J40_EXPECT(p, v) (p)
+ #endif
+#endif // !defined J40_EXPECT
+#ifndef J40_LIKELY
+ #define J40_LIKELY(p) J40_EXPECT(!!(p), 1)
+#endif
+#ifndef J40_UNLIKELY
+ #define J40_UNLIKELY(p) J40_EXPECT(!!(p), 0)
+#endif
+
+#if !defined J40_ADD_OVERFLOW && (J40__HAS_BUILTIN_ADD_OVERFLOW || J40__GCC_VER >= 0x50000)
+ #define J40_ADD_OVERFLOW(a, b, res) __builtin_add_overflow(a, b, res)
+#endif
+#if !defined J40_SUB_OVERFLOW && (J40__HAS_BUILTIN_SUB_OVERFLOW || J40__GCC_VER >= 0x50000)
+ #define J40_SUB_OVERFLOW(a, b, res) __builtin_sub_overflow(a, b, res)
+#endif
+#if !defined J40_MUL_OVERFLOW && (J40__HAS_BUILTIN_MUL_OVERFLOW || J40__GCC_VER >= 0x50000)
+ #define J40_MUL_OVERFLOW(a, b, res) __builtin_mul_overflow(a, b, res)
+#endif
+
+#if !defined J40_MALLOC && !defined J40_CALLOC && !defined J40_REALLOC && !defined J40_FREE
+ #define J40_MALLOC malloc
+ #define J40_CALLOC calloc
+ #define J40_REALLOC realloc
+ #define J40_FREE free
+#elif !(defined J40_MALLOC && defined J40_CALLOC && defined J40_REALLOC && defined J40_FREE)
+ #error "J40_MALLOC, J40_CALLOC, J40_REALLOC and J40_FREE should be provided altogether."
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// state
+
+// bit and logical buffer. this is most frequently accessed and thus available without indirection.
+//
+// the bit buffer (`nbits` least significant bits of `bits`) is the least significant bits available
+// for decoding, and the logical buffer [ptr, end) corresponds to subsequent bits.
+// the logical buffer is guaranteed to be all in the codestream (which is not always true if
+// the file uses a container).
+//
+// when the bit buffer has been exhausted the next byte from the logical buffer is consumed and
+// appended at the *top* of the bit buffer. when the logical buffer has been exhausted
+// higher layers (first backing buffer, then container, and finally source) should be consulted.
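+//
+// e.g. with nbits == 5, the next byte consumed from [ptr, end) is appended as bits 5..12.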
+typedef struct j40__bits_st {
+ int32_t nbits; // [0, 64]
+ uint64_t bits;
+ uint8_t *ptr, *end;
+} j40__bits_st;
+
+// a common context ("state") for all internal functions.
+// this bears a strong similarity with `struct j40__inner` type in the API layer which would be
+// introduced much later. there are multiple reasons for this split:
+// - `j40__st` is designed to be in the stack, so it doesn't take up much stack space.
+// - `j40__st` allows for partial initialization of subsystems, which makes testing much easier.
+// - `j40__st` only holds things relevant to decoding, while `j40__inner` has API contexts.
+// - there can be multiple `j40__st` for multi-threaded decoding.
+typedef struct {
+ j40_err err; // first error code encountered, or 0
+ int saved_errno;
+ int cannot_retry; // a fatal error was encountered and no more additional input will fix it
+
+ // different subsystems make use of additional contexts, all accessible from here.
+ struct j40__bits_st bits; // very frequently accessed, thus inlined here
+ struct j40__source_st *source;
+ struct j40__container_st *container;
+ struct j40__buffer_st *buffer;
+ struct j40__image_st *image;
+ struct j40__frame_st *frame;
+ struct j40__lf_group_st *lf_group;
+ const struct j40__limits *limits;
+} j40__st;
+
+////////////////////////////////////////////////////////////////////////////////
+// error handling and memory allocation
+
+#ifdef J40_DEBUG
+ #define J40__ASSERT(cond) assert(cond)
+ #define J40__UNREACHABLE() J40__ASSERT(0)
+#elif J40__HAS_BUILTIN_UNREACHABLE || J40__GCC_VER >= 0x40500
+ #define J40__ASSERT(cond) (J40_UNLIKELY(!(cond)) ? __builtin_unreachable() : (void) 0)
+ #define J40__UNREACHABLE() __builtin_unreachable()
+#else
+ #define J40__ASSERT(cond) ((void) (cond))
+ #define J40__UNREACHABLE() ((void) 0) // TODO also check for MSVC __assume
+#endif
+
+// J40_NODISCARD should be before `static` or `inline`
+#define J40__STATIC_RETURNS_ERR J40_NODISCARD J40_STATIC j40_err
+#define J40__INLINE_RETURNS_ERR J40_NODISCARD J40_INLINE j40_err
+
+#define J40__4(s) \
+ (j40_err) (((uint32_t) (s)[0] << 24) | ((uint32_t) (s)[1] << 16) | ((uint32_t) (s)[2] << 8) | (uint32_t) (s)[3])
+#define J40__ERR(s) j40__set_error(st, J40__4(s))
+#define J40__SHOULD(cond, s) do { \
+ if (J40_UNLIKELY(st->err)) goto J40__ON_ERROR; \
+ if (J40_UNLIKELY((cond) == 0)) { j40__set_error(st, J40__4(s)); goto J40__ON_ERROR; } \
+ } while (0)
+#define J40__RAISE(s) do { j40__set_error(st, J40__4(s)); goto J40__ON_ERROR; } while (0)
+#define J40__RAISE_DELAYED() do { if (J40_UNLIKELY(st->err)) goto J40__ON_ERROR; } while (0)
+#define J40__TRY(expr) do { if (J40_UNLIKELY(expr)) { J40__ASSERT(st->err); goto J40__ON_ERROR; } } while (0)
+
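+// a minimal illustrative sketch of how these macros combine (`j40__example` is hypothetical):
+//   J40__STATIC_RETURNS_ERR j40__example(j40__st *st) {
+//       uint8_t *buf = NULL;
+//       J40__SHOULD(buf = (uint8_t*) j40__malloc(8, 1), "!mem"); // raise "!mem" on failure
+//       J40__TRY(j40__read_from_source(st, buf, 8)); // propagate an already-raised error
+//   J40__ON_ERROR:
+//       j40__free(buf);
+//       return st->err; // 0 if nothing was raised
+//   }
+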
+// this *should* use casting because C/C++ don't allow comparison between pointers
+// that came from different arrays at all: https://stackoverflow.com/a/39161283
+#define J40__INBOUNDS(ptr, start, size) ((uintptr_t) (ptr) - (uintptr_t) (start) <= (uintptr_t) (size))
+
+#define J40__TRY_MALLOC(type, ptr, num) \
+ do { \
+ type *newptr = (type*) j40__malloc(num, sizeof(type)); \
+ J40__SHOULD(*(ptr) = newptr, "!mem"); \
+ } while (0)
+
+#define J40__TRY_CALLOC(type, ptr, num) \
+ do { \
+ type *newptr = (type*) j40__calloc(num, sizeof(type)); \
+ J40__SHOULD(*(ptr) = newptr, "!mem"); \
+ } while (0)
+
+#define J40__TRY_REALLOC32(type, ptr, len, cap) \
+ do { \
+ type *newptr = (type*) j40__realloc32(st, *(ptr), sizeof(type), len, cap); \
+ if (J40_LIKELY(newptr)) *(ptr) = newptr; else goto J40__ON_ERROR; \
+ } while (0)
+
+#define J40__TRY_REALLOC64(type, ptr, len, cap) \
+ do { \
+ type *newptr = (type*) j40__realloc64(st, *(ptr), sizeof(type), len, cap); \
+ if (J40_LIKELY(newptr)) *(ptr) = newptr; else goto J40__ON_ERROR; \
+ } while (0)
+
+J40_STATIC j40_err j40__set_error(j40__st *st, j40_err err);
+J40_STATIC void *j40__malloc(size_t num, size_t size);
+J40_STATIC void *j40__calloc(size_t num, size_t size);
+J40_STATIC void *j40__realloc32(j40__st *st, void *ptr, size_t itemsize, int32_t len, int32_t *cap);
+J40_STATIC void *j40__realloc64(j40__st *st, void *ptr, size_t itemsize, int64_t len, int64_t *cap);
+J40_STATIC void j40__free(void *ptr);
+
+#ifdef J40_IMPLEMENTATION
+
+J40_STATIC j40_err j40__set_error(j40__st *st, j40_err err) {
+ if (err != J40__4("shrt")) st->cannot_retry = 1;
+ if (!st->err) st->err = err;
+ return err;
+}
+
+J40_STATIC void *j40__malloc(size_t num, size_t size) {
+ if (size == 0 || num > SIZE_MAX / size) return NULL;
+ return J40_MALLOC(num * size);
+}
+
+J40_STATIC void *j40__calloc(size_t num, size_t size) {
+ return J40_CALLOC(num, size);
+}
+
+J40_STATIC void *j40__realloc32(j40__st *st, void *ptr, size_t itemsize, int32_t len, int32_t *cap) {
+ void *newptr;
+ uint32_t newcap;
+ size_t newsize;
+ J40__ASSERT(len >= 0);
+ if (len <= *cap) return ptr;
+ newcap = (uint32_t) *cap * 2;
+ if (newcap > (uint32_t) INT32_MAX) newcap = (uint32_t) INT32_MAX;
+ if (newcap < (uint32_t) len) newcap = (uint32_t) len;
+ J40__SHOULD(newcap <= SIZE_MAX / itemsize, "!mem");
+ newsize = (size_t) (itemsize * newcap);
+ J40__SHOULD(newptr = ptr ? J40_REALLOC(ptr, newsize) : J40_MALLOC(newsize), "!mem");
+ *cap = (int32_t) newcap;
+ return newptr;
+J40__ON_ERROR:
+ return NULL;
+}
+
+J40_STATIC void *j40__realloc64(j40__st *st, void *ptr, size_t itemsize, int64_t len, int64_t *cap) {
+ void *newptr;
+ uint64_t newcap;
+ size_t newsize;
+ J40__ASSERT(len >= 0);
+ if (len <= *cap) return ptr;
+ newcap = (uint64_t) *cap * 2;
+ if (newcap > (uint64_t) INT64_MAX) newcap = (uint64_t) INT64_MAX;
+ if (newcap < (uint64_t) len) newcap = (uint64_t) len;
+ J40__SHOULD(newcap <= SIZE_MAX / itemsize, "!mem");
+ newsize = (size_t) (itemsize * newcap);
+ J40__SHOULD(newptr = ptr ? J40_REALLOC(ptr, newsize) : J40_MALLOC(newsize), "!mem");
+ *cap = (int64_t) newcap;
+ return newptr;
+J40__ON_ERROR:
+ return NULL;
+}
+
+J40_STATIC void j40__free(void *ptr) {
+ J40_FREE(ptr);
+}
+
+#endif // defined J40_IMPLEMENTATION
+
+////////////////////////////////////////////////////////////////////////////////
+// utility
+
+#define J40__CONCAT_(a,b) a##b
+#define J40__CONCAT(a,b) a##b
+#define J40__CONCAT3(a,b,c) a##b##c
+
+// `j40__(foo, X)` and its uppercase version is `j40__foo` followed by a macro `J40__V` expanded;
+// this greatly simplifies the construction of templated names.
+#define J40__PARAMETRIC_NAME_(prefix, x, J40__V) J40__CONCAT3(prefix, x, J40__V)
+#define j40__(x, V) J40__PARAMETRIC_NAME_(j40__, x, J40__CONCAT(J40__, V))
+#define J40__(x, V) J40__PARAMETRIC_NAME_(J40__, x, J40__CONCAT(J40__, V))
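+// e.g. with `#define J40__N 32` in effect, `j40__(min,N)` expands to `j40__min32`.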
+
+J40_ALWAYS_INLINE int32_t j40__unpack_signed(int32_t x);
+J40_ALWAYS_INLINE int64_t j40__unpack_signed64(int64_t x);
+J40_ALWAYS_INLINE int32_t j40__ceil_div32(int32_t x, int32_t y);
+J40_ALWAYS_INLINE int64_t j40__ceil_div64(int64_t x, int64_t y);
+J40_ALWAYS_INLINE float j40__minf(float x, float y);
+J40_ALWAYS_INLINE float j40__maxf(float x, float y);
+J40_ALWAYS_INLINE int j40__surely_nonzero(float x);
+
+#ifdef J40_IMPLEMENTATION
+
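+// inverse of the zigzag mapping: 0, 1, 2, 3, 4, ... decode to 0, -1, 1, -2, 2, ...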
+J40_ALWAYS_INLINE int32_t j40__unpack_signed(int32_t x) {
+ return (int32_t) (x & 1 ? -(x / 2 + 1) : x / 2);
+}
+J40_ALWAYS_INLINE int64_t j40__unpack_signed64(int64_t x) {
+ return (int64_t) (x & 1 ? -(x / 2 + 1) : x / 2);
+}
+
+// equivalent to ceil(x / y), assuming x >= 0 and y > 0
+J40_ALWAYS_INLINE int32_t j40__ceil_div32(int32_t x, int32_t y) { return (x + y - 1) / y; }
+J40_ALWAYS_INLINE int64_t j40__ceil_div64(int64_t x, int64_t y) { return (x + y - 1) / y; }
+
+J40_ALWAYS_INLINE float j40__minf(float x, float y) { return (x < y ? x : y); }
+J40_ALWAYS_INLINE float j40__maxf(float x, float y) { return (x > y ? x : y); }
+
+// used to guard against division by zero
+J40_ALWAYS_INLINE int j40__surely_nonzero(float x) {
+ return isfinite(x) && fabs(x) >= 1e-8f;
+}
+
+#ifdef _MSC_VER // required for j40__floor/ceil_lgN implementations
+
+#include <intrin.h>
+
+#pragma intrinsic(_BitScanReverse)
+J40_ALWAYS_INLINE int j40__clz32(uint32_t x) {
+ unsigned long index;
+ return _BitScanReverse(&index, x) ? 31 - (int) index : 32;
+}
+
+J40_ALWAYS_INLINE int j40__clz16(uint16_t x) { return j40__clz32(x) - 16; }
+
+// _BitScanReverse64 is not available at all in x86-32, so we need to detour
+#if defined _M_X64 || defined _M_IA64
+#pragma intrinsic(_BitScanReverse64)
+J40_ALWAYS_INLINE int j40__clz64(uint64_t x) {
+ unsigned long index;
+ return _BitScanReverse64(&index, x) ? 63 - (int) index : 64;
+}
+#else
+J40_ALWAYS_INLINE int j40__clz64(uint64_t x) {
+ return x >> 32 ? j40__clz32((uint32_t) (x >> 32)) : 32 + j40__clz32((uint32_t) x);
+}
+#endif // defined _M_X64 || defined _M_IA64
+
+#endif // defined _MSC_VER
+
+#endif // defined J40_IMPLEMENTATION
+
+// ----------------------------------------
+// recursion for bit-dependent math functions
+#undef J40__RECURSING
+#define J40__RECURSING 100
+#define J40__N 16
+#include J40_FILENAME
+#define J40__N 32
+#include J40_FILENAME
+#define J40__N 64
+#include J40_FILENAME
+#undef J40__RECURSING
+#define J40__RECURSING (-1)
+
+#endif // J40__RECURSING < 0
+#if J40__RECURSING == 100
+ #define j40__intN J40__CONCAT3(int, J40__N, _t)
+ #define j40__uintN J40__CONCAT3(uint, J40__N, _t)
+ #define J40__INTN_MAX J40__CONCAT3(INT, J40__N, _MAX)
+ #define J40__INTN_MIN J40__CONCAT3(INT, J40__N, _MIN)
+// ----------------------------------------
+
+J40_ALWAYS_INLINE j40__intN j40__(floor_avg,N)(j40__intN x, j40__intN y);
+J40_ALWAYS_INLINE j40__intN j40__(abs,N)(j40__intN x);
+J40_ALWAYS_INLINE j40__intN j40__(min,N)(j40__intN x, j40__intN y);
+J40_ALWAYS_INLINE j40__intN j40__(max,N)(j40__intN x, j40__intN y);
+
+// returns 1 if overflow or underflow didn't occur
+J40_ALWAYS_INLINE int j40__(add,N)(j40__intN x, j40__intN y, j40__intN *out);
+J40_ALWAYS_INLINE int j40__(sub,N)(j40__intN x, j40__intN y, j40__intN *out);
+J40_ALWAYS_INLINE int j40__(mul,N)(j40__intN x, j40__intN y, j40__intN *out);
+J40_ALWAYS_INLINE int j40__(add_fallback,N)(j40__intN x, j40__intN y, j40__intN *out);
+J40_ALWAYS_INLINE int j40__(sub_fallback,N)(j40__intN x, j40__intN y, j40__intN *out);
+J40_ALWAYS_INLINE int j40__(mul_fallback,N)(j40__intN x, j40__intN y, j40__intN *out);
+J40_ALWAYS_INLINE j40__intN j40__(clamp_add,N)(j40__intN x, j40__intN y);
+J40_ALWAYS_INLINE j40__intN j40__(clamp_mul,N)(j40__intN x, j40__intN y);
+
+#ifdef J40_IMPLEMENTATION
+
+// same as `(a + b) >> 1` but doesn't overflow, useful for tight loops with autovectorization;
+// note that the identity only holds for non-negative inputs
+// https://devblogs.microsoft.com/oldnewthing/20220207-00/?p=106223
+J40_ALWAYS_INLINE j40__intN j40__(floor_avg,N)(j40__intN x, j40__intN y) {
+ return (j40__intN) (x / 2 + y / 2 + (x & y & 1));
+}
+
+J40_ALWAYS_INLINE j40__intN j40__(abs,N)(j40__intN x) {
+ return (j40__intN) (x < 0 ? -x : x);
+}
+J40_ALWAYS_INLINE j40__intN j40__(min,N)(j40__intN x, j40__intN y) {
+ return (j40__intN) (x < y ? x : y);
+}
+J40_ALWAYS_INLINE j40__intN j40__(max,N)(j40__intN x, j40__intN y) {
+ return (j40__intN) (x > y ? x : y);
+}
+
+J40_ALWAYS_INLINE int j40__(add,N)(j40__intN x, j40__intN y, j40__intN *out) {
+#ifdef J40_ADD_OVERFLOW
+ // gcc/clang extension uses an opposite convention, which is unnatural to use with J40__SHOULD
+ return !J40_ADD_OVERFLOW(x, y, out);
+#else
+ return j40__(add_fallback,N)(x, y, out);
+#endif
+}
+
+J40_ALWAYS_INLINE int j40__(sub,N)(j40__intN x, j40__intN y, j40__intN *out) {
+#ifdef J40_SUB_OVERFLOW
+ return !J40_SUB_OVERFLOW(x, y, out);
+#else
+ return j40__(sub_fallback,N)(x, y, out);
+#endif
+}
+
+J40_ALWAYS_INLINE int j40__(mul,N)(j40__intN x, j40__intN y, j40__intN *out) {
+#ifdef J40_MUL_OVERFLOW
+ return !J40_MUL_OVERFLOW(x, y, out);
+#else
+ return j40__(mul_fallback,N)(x, y, out);
+#endif
+}
+
+J40_ALWAYS_INLINE int j40__(add_fallback,N)(j40__intN x, j40__intN y, j40__intN *out) {
+ if (J40_UNLIKELY((x > 0 && y > J40__INTN_MAX - x) || (x < 0 && y < J40__INTN_MIN - x))) {
+ return 0;
+ } else {
+ *out = (j40__intN) (x + y);
+ return 1;
+ }
+}
+
+J40_ALWAYS_INLINE int j40__(sub_fallback,N)(j40__intN x, j40__intN y, j40__intN *out) {
+ if (J40_UNLIKELY((y < 0 && x > J40__INTN_MAX + y) || (y > 0 && x < J40__INTN_MIN + y))) {
+ return 0;
+ } else {
+ *out = (j40__intN) (x - y);
+ return 1;
+ }
+}
+
+J40_ALWAYS_INLINE int j40__(mul_fallback,N)(j40__intN x, j40__intN y, j40__intN *out) {
+ if (J40_UNLIKELY(
+ x > 0 ?
+ (y > 0 ? x > J40__INTN_MAX / y : y < J40__INTN_MIN / x) :
+ (y > 0 ? x < J40__INTN_MIN / y : y != 0 && x < J40__INTN_MAX / y)
+ )) {
+ return 0;
+ } else {
+ *out = (j40__intN) (x * y);
+ return 1;
+ }
+}
+
+J40_ALWAYS_INLINE j40__intN j40__(clamp_add,N)(j40__intN x, j40__intN y) {
+ j40__intN out;
+ return j40__(add,N)(x, y, &out) ? out : J40__INTN_MAX;
+}
+
+J40_ALWAYS_INLINE j40__intN j40__(clamp_mul,N)(j40__intN x, j40__intN y) {
+ j40__intN out;
+ return j40__(mul,N)(x, y, &out) ? out : J40__INTN_MAX;
+}
+
+#endif // defined J40_IMPLEMENTATION
+
+#ifdef _MSC_VER
+ #define J40__CLZN j40__(clz, N)
+#else
+ #define J40__UINTN_MAX J40__CONCAT3(UINT, J40__N, _MAX)
+ #if UINT_MAX == J40__UINTN_MAX
+ #define J40__CLZN __builtin_clz
+ #elif ULONG_MAX == J40__UINTN_MAX
+ #define J40__CLZN __builtin_clzl
+ #elif ULLONG_MAX == J40__UINTN_MAX
+ #define J40__CLZN __builtin_clzll
+ #endif
+ #undef J40__UINTN_MAX
+#endif // !defined _MSC_VER
+#ifdef J40__CLZN
+ J40_ALWAYS_INLINE int j40__(floor_lg,N)(j40__uintN x);
+ J40_ALWAYS_INLINE int j40__(ceil_lg,N)(j40__uintN x);
+
+ #ifdef J40_IMPLEMENTATION
+ // both requires x to be > 0
+ J40_ALWAYS_INLINE int j40__(floor_lg,N)(j40__uintN x) {
+ return J40__N - 1 - J40__CLZN(x);
+ }
+ J40_ALWAYS_INLINE int j40__(ceil_lg,N)(j40__uintN x) {
+ return x > 1 ? J40__N - J40__CLZN(x - 1) : 0;
+ }
+ #endif
+
+ #undef J40__CLZN
+#endif
+
+// ----------------------------------------
+// end of recursion
+ #undef j40__intN
+ #undef j40__uintN
+ #undef J40__INTN_MAX
+ #undef J40__INTN_MIN
+ #undef J40__N
+#endif // J40__RECURSING == 100
+#if J40__RECURSING < 0
+// ----------------------------------------
+
+////////////////////////////////////////////////////////////////////////////////
+// aligned pointers
+
+#ifndef J40_ASSUME_ALIGNED
+ #if J40__HAS_BUILTIN_ASSUME_ALIGNED || J40__GCC_VER >= 0x40700
+ #define J40_ASSUME_ALIGNED(p, align) __builtin_assume_aligned(p, align)
+ #else
+ #define J40_ASSUME_ALIGNED(p, align) (p)
+ #endif
+#endif // !defined J40_ASSUME_ALIGNED
+
+J40_ALWAYS_INLINE void *j40__alloc_aligned(size_t sz, size_t align, size_t *outmisalign);
+J40_ALWAYS_INLINE void j40__free_aligned(void *ptr, size_t align, size_t misalign);
+
+J40_MAYBE_UNUSED J40_STATIC void *j40__alloc_aligned_fallback(size_t sz, size_t align, size_t *outmisalign);
+J40_MAYBE_UNUSED J40_STATIC void j40__free_aligned_fallback(void *ptr, size_t align, size_t misalign);
+
+#ifdef J40_IMPLEMENTATION
+
+#if _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600
+ J40_ALWAYS_INLINE void *j40__alloc_aligned(size_t sz, size_t align, size_t *outmisalign) {
+ void *ptr = NULL;
+ *outmisalign = 0;
+ return posix_memalign(&ptr, align, sz) ? NULL : ptr;
+ }
+ J40_ALWAYS_INLINE void j40__free_aligned(void *ptr, size_t align, size_t misalign) {
+ (void) align; (void) misalign;
+ free(ptr); // important: do not use j40_free!
+ }
+#elif defined _ISOC11_SOURCE
+ J40_ALWAYS_INLINE void *j40__alloc_aligned(size_t sz, size_t align, size_t *outmisalign) {
+ if (sz > SIZE_MAX / align * align) return NULL; // overflow
+ *outmisalign = 0;
+ return aligned_alloc(align, (sz + align - 1) / align * align);
+ }
+ J40_ALWAYS_INLINE void j40__free_aligned(void *ptr, size_t align, size_t misalign) {
+ (void) align; (void) misalign;
+ free(ptr); // important: do not use j40_free!
+ }
+#else
+ J40_ALWAYS_INLINE void *j40__alloc_aligned(size_t sz, size_t align, size_t *outmisalign) {
+ return j40__alloc_aligned_fallback(sz, align, outmisalign);
+ }
+ J40_ALWAYS_INLINE void j40__free_aligned(void *ptr, size_t align, size_t misalign) {
+ j40__free_aligned_fallback(ptr, align, misalign);
+ }
+#endif
+
+// a fallback implementation; the caller should store the misalign amount [0, align) separately.
+// used when the platform doesn't provide aligned malloc at all, or the platform implementation
+// is not necessarily better; e.g. MSVC _aligned_malloc has the same amount of overhead as of Win10
+J40_MAYBE_UNUSED J40_STATIC void *j40__alloc_aligned_fallback(size_t sz, size_t align, size_t *outmisalign) {
+	// while this is almost surely an overestimate (it could be improved if we knew the malloc
+	// alignment), there is no standard way to compute a better estimate in C99, so this is inevitable.
+ size_t maxmisalign = align - 1, misalign;
+ void *ptr;
+ if (sz > SIZE_MAX - maxmisalign) return NULL; // overflow
+ ptr = J40_MALLOC(sz + maxmisalign);
+ if (!ptr) return NULL;
+ misalign = align - (uintptr_t) ptr % align;
+ if (misalign == align) misalign = 0;
+ *outmisalign = misalign;
+ return (void*) ((uintptr_t) ptr + misalign);
+}
+
+J40_MAYBE_UNUSED J40_STATIC void j40__free_aligned_fallback(void *ptr, size_t align, size_t misalign) {
+ if (!ptr) return;
+ J40__ASSERT((uintptr_t) ptr % align == 0);
+ j40__free((void*) ((uintptr_t) ptr - misalign));
+}
+
+#endif // defined J40_IMPLEMENTATION
+
+////////////////////////////////////////////////////////////////////////////////
+// two-dimensional view
+
+typedef struct { int32_t logw, logh; float *J40_RESTRICT ptr; } j40__view_f32;
+
+J40_ALWAYS_INLINE j40__view_f32 j40__make_view_f32(int32_t logw, int32_t logh, float *J40_RESTRICT ptr);
+J40_ALWAYS_INLINE void j40__adapt_view_f32(j40__view_f32 *outv, int32_t logw, int32_t logh);
+J40_ALWAYS_INLINE void j40__reshape_view_f32(j40__view_f32 *outv, int32_t logw, int32_t logh);
+J40_ALWAYS_INLINE void j40__copy_view_f32(j40__view_f32 *outv, const j40__view_f32 inv);
+J40_ALWAYS_INLINE void j40__transpose_view_f32(j40__view_f32 *outv, const j40__view_f32 inv);
+J40_ALWAYS_INLINE void j40__oddeven_columns_to_halves_f32(j40__view_f32 *outv, const j40__view_f32 inv);
+J40_ALWAYS_INLINE void j40__oddeven_rows_to_halves_f32(j40__view_f32 *outv, const j40__view_f32 inv);
+J40_MAYBE_UNUSED J40_STATIC void j40__print_view_f32(j40__view_f32 v, const char *name, const char *file, int32_t line);
+
+#ifdef J40_IMPLEMENTATION
+
+J40_ALWAYS_INLINE j40__view_f32 j40__make_view_f32(int32_t logw, int32_t logh, float *J40_RESTRICT ptr) {
+ j40__view_f32 ret = { logw, logh, ptr };
+ return ret;
+}
+
+J40_ALWAYS_INLINE void j40__adapt_view_f32(j40__view_f32 *outv, int32_t logw, int32_t logh) {
+ J40__ASSERT(outv->logw + outv->logh >= logw + logh);
+ outv->logw = logw;
+ outv->logh = logh;
+}
+
+J40_ALWAYS_INLINE void j40__reshape_view_f32(j40__view_f32 *outv, int32_t logw, int32_t logh) {
+ J40__ASSERT(outv->logw + outv->logh == logw + logh);
+ outv->logw = logw;
+ outv->logh = logh;
+}
+
+J40_ALWAYS_INLINE void j40__copy_view_f32(j40__view_f32 *outv, const j40__view_f32 inv) {
+ int32_t x, y;
+ float *outptr = outv->ptr;
+ j40__adapt_view_f32(outv, inv.logw, inv.logh);
+ for (y = 0; y < (1 << inv.logh); ++y) for (x = 0; x < (1 << inv.logw); ++x) {
+ outptr[y << inv.logw | x] = inv.ptr[y << inv.logw | x];
+ }
+}
+
+J40_ALWAYS_INLINE void j40__transpose_view_f32(j40__view_f32 *outv, const j40__view_f32 inv) {
+ int32_t x, y;
+ float *outptr = outv->ptr;
+ j40__adapt_view_f32(outv, inv.logh, inv.logw);
+ for (y = 0; y < (1 << inv.logh); ++y) for (x = 0; x < (1 << inv.logw); ++x) {
+ outptr[x << inv.logh | y] = inv.ptr[y << inv.logw | x];
+ }
+}
+
+// shuffles columns 01234567 into 02461357 and so on
+J40_ALWAYS_INLINE void j40__oddeven_columns_to_halves_f32(j40__view_f32 *outv, const j40__view_f32 inv) {
+ int32_t x, y;
+ float *outptr = outv->ptr;
+ J40__ASSERT(inv.logw > 0);
+ j40__adapt_view_f32(outv, inv.logw, inv.logh);
+ for (y = 0; y < (1 << inv.logh); ++y) for (x = 0; x < (1 << inv.logw); ++x) {
+ int32_t outx = ((x & 1) << (inv.logw - 1)) | (x >> 1);
+ outptr[y << inv.logw | outx] = inv.ptr[y << inv.logw | x];
+ }
+}
+
+// shuffles rows 01234567 into 02461357 and so on
+J40_ALWAYS_INLINE void j40__oddeven_rows_to_halves_f32(j40__view_f32 *outv, const j40__view_f32 inv) {
+ int32_t x, y;
+ float *outptr = outv->ptr;
+ J40__ASSERT(inv.logh > 0);
+ j40__adapt_view_f32(outv, inv.logw, inv.logh);
+ for (y = 0; y < (1 << inv.logh); ++y) {
+ int32_t outy = ((y & 1) << (inv.logh - 1)) | (y >> 1);
+ for (x = 0; x < (1 << inv.logw); ++x) outptr[outy << inv.logw | x] = inv.ptr[y << inv.logw | x];
+ }
+}
+
+#define J40__AT(view, x, y) \
+ (J40__ASSERT(0 <= (x) && (x) < (1 << (view).logw) && 0 <= (y) && (y) < (1 << (view).logh)), \
+ (view).ptr + ((y) << (view).logw | (x)))
+
+#define J40__VIEW_FOREACH(view, y, x, v) \
+ for (y = 0; y < (1 << (view).logh); ++y) \
+ for (x = 0; x < (1 << (view).logw) && (v = (view).ptr + (y << (view).logw | x), 1); ++x)
+
+J40_MAYBE_UNUSED J40_STATIC void j40__print_view_f32(j40__view_f32 v, const char *name, const char *file, int32_t line) {
+ int32_t x, y;
+ printf(".--- %s:%d: %s (w=%d h=%d @%p)", file, line, name, 1 << v.logw, 1 << v.logh, v.ptr);
+ for (y = 0; y < (1 << v.logh); ++y) {
+ printf("\n|");
+ for (x = 0; x < (1 << v.logw); ++x) printf(" %f", *J40__AT(v, x, y));
+ }
+ printf("\n'--- %s:%d\n", file, line);
+}
+
+#endif // defined J40_IMPLEMENTATION
+
+#define j40__print_view_f32(v) j40__print_view_f32(v, #v, __FILE__, __LINE__)
+
+////////////////////////////////////////////////////////////////////////////////
+// plane
+
+enum {
+ J40__PLANE_U8 = (uint8_t) 0x20,
+ J40__PLANE_U16 = (uint8_t) 0x21,
+ J40__PLANE_I16 = (uint8_t) 0x41,
+ J40__PLANE_U32 = (uint8_t) 0x22,
+ J40__PLANE_I32 = (uint8_t) 0x42,
+ J40__PLANE_F32 = (uint8_t) 0x62,
+ J40__PLANE_EMPTY = (uint8_t) 0xe0, // should have width=0 and height=0
+};
+
+#define J40__PIXELS_ALIGN 32
+
+typedef struct {
+ uint8_t type; // 0 means uninitialized (all fields besides from pixels are considered garbage)
+ uint8_t misalign;
+ int8_t vshift, hshift;
+ int32_t width, height;
+ int32_t stride_bytes; // the number of *bytes* between each row
+ uintptr_t pixels;
+} j40__plane;
+
+#define J40__TYPED_PIXELS(plane, y, typeconst, pixel_t) \
+ (J40__ASSERT((plane)->type == typeconst), \
+ J40__ASSERT(0 <= (y) && (y) < (plane)->height), \
+ (pixel_t*) J40_ASSUME_ALIGNED( \
+ (void*) ((char*) (plane)->pixels + (size_t) (plane)->stride_bytes * (size_t) (y)), \
+ J40__PIXELS_ALIGN))
+
+#define J40__U8_PIXELS(plane, y) J40__TYPED_PIXELS(plane, y, J40__PLANE_U8, uint8_t)
+#define J40__U16_PIXELS(plane, y) J40__TYPED_PIXELS(plane, y, J40__PLANE_U16, uint16_t)
+#define J40__I16_PIXELS(plane, y) J40__TYPED_PIXELS(plane, y, J40__PLANE_I16, int16_t)
+#define J40__U32_PIXELS(plane, y) J40__TYPED_PIXELS(plane, y, J40__PLANE_U32, uint32_t)
+#define J40__I32_PIXELS(plane, y) J40__TYPED_PIXELS(plane, y, J40__PLANE_I32, int32_t)
+#define J40__F32_PIXELS(plane, y) J40__TYPED_PIXELS(plane, y, J40__PLANE_F32, float)
+
+#define J40__PLANE_PIXEL_SIZE(plane) (1 << ((plane)->type & 31))
+#define J40__PLANE_STRIDE(plane) ((plane)->stride_bytes >> ((plane)->type & 31))
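+// the low 5 bits of a plane type encode log2(bytes per pixel);
+// e.g. J40__PLANE_F32 is 0x62, so its pixels are 1 << (0x62 & 31) == 4 bytes wide.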
+
+enum {
+ J40__PLANE_CLEAR = 1 << 0,
+ // for public facing planes, we always add padding to prevent misconception
+ J40__PLANE_FORCE_PAD = 1 << 1,
+};
+
+J40__STATIC_RETURNS_ERR j40__init_plane(
+ j40__st *st, uint8_t type, int32_t width, int32_t height, int flags, j40__plane *out
+);
+J40_STATIC void j40__init_empty_plane(j40__plane *out);
+J40_STATIC int j40__plane_all_equal_sized(const j40__plane *begin, const j40__plane *end);
+// returns that type if all planes have the same type, otherwise returns 0
+J40_STATIC uint8_t j40__plane_all_equal_typed(const j40__plane *begin, const j40__plane *end);
+J40_STATIC uint8_t j40__plane_all_equal_typed_or_empty(const j40__plane *begin, const j40__plane *end);
+J40_STATIC void j40__free_plane(j40__plane *plane);
+
+#ifdef J40_IMPLEMENTATION
+
+J40__STATIC_RETURNS_ERR j40__init_plane(
+ j40__st *st, uint8_t type, int32_t width, int32_t height, int flags, j40__plane *out
+) {
+ int32_t pixel_size = 1 << (type & 31);
+ void *pixels;
+ int32_t stride_bytes;
+ size_t total, misalign;
+
+ out->type = 0;
+ J40__ASSERT(width > 0 && height > 0);
+
+ J40__SHOULD(j40__mul32(width, pixel_size, &stride_bytes), "bigg");
+ if (flags & J40__PLANE_FORCE_PAD) J40__SHOULD(j40__add32(stride_bytes, 1, &stride_bytes), "bigg");
+ J40__SHOULD(
+ j40__mul32(j40__ceil_div32(stride_bytes, J40__PIXELS_ALIGN), J40__PIXELS_ALIGN, &stride_bytes),
+ "bigg");
+ J40__SHOULD((size_t) stride_bytes <= SIZE_MAX / (uint32_t) height, "bigg");
+ total = (size_t) stride_bytes * (size_t) height;
+ J40__SHOULD(pixels = j40__alloc_aligned(total, J40__PIXELS_ALIGN, &misalign), "!mem");
+
+ out->stride_bytes = stride_bytes;
+ out->width = width;
+ out->height = height;
+ out->type = type;
+ out->vshift = out->hshift = 0;
+ out->misalign = (uint8_t) misalign;
+ out->pixels = (uintptr_t) pixels;
+ if (flags & J40__PLANE_CLEAR) memset(pixels, 0, total);
+
+J40__ON_ERROR:
+ return st->err;
+}
+
+// an empty plane can arise from inverse modular transform, but it can be a bug as well,
+// hence a separate function and separate type.
+J40_STATIC void j40__init_empty_plane(j40__plane *out) {
+ out->type = J40__PLANE_EMPTY;
+ out->stride_bytes = 0;
+ out->width = out->height = 0;
+ out->vshift = out->hshift = 0;
+ out->misalign = 0;
+ out->pixels = (uintptr_t) (void*) 0;
+}
+
+J40_STATIC int j40__plane_all_equal_sized(const j40__plane *begin, const j40__plane *end) {
+ j40__plane c;
+ int shift_should_match;
+ if (begin >= end) return 0; // do not allow edge cases
+ c = *begin;
+ shift_should_match = (begin->vshift >= 0 && begin->hshift >= 0);
+ while (++begin < end) {
+ if (c.width != begin->width || c.height != begin->height) return 0;
+ // even though the sizes match, different shifts can't be mixed as per the spec
+ if (shift_should_match) {
+ if (c.vshift >= 0 && c.hshift >= 0 && (c.vshift != begin->vshift || c.hshift != begin->hshift)) return 0;
+ }
+ }
+ return 1;
+}
+
+J40_STATIC uint8_t j40__plane_all_equal_typed(const j40__plane *begin, const j40__plane *end) {
+ uint8_t type;
+ if (begin >= end) return 0;
+ type = begin->type;
+ while (++begin < end) {
+ if (begin->type != type) return 0;
+ }
+ return type;
+}
+
+J40_STATIC uint8_t j40__plane_all_equal_typed_or_empty(const j40__plane *begin, const j40__plane *end) {
+ uint8_t type;
+ if (begin >= end) return 0;
+ type = begin->type;
+ while (++begin < end) {
+ // allow empty plane to pass this test; if all planes are empty, will return J40__PLANE_EMPTY
+ if (type == J40__PLANE_EMPTY) type = begin->type;
+ if (begin->type != J40__PLANE_EMPTY && begin->type != type) return 0;
+ }
+ return type;
+}
+
+J40_STATIC void j40__free_plane(j40__plane *plane) {
+ // we don't touch pixels if plane is zero-initialized via memset, because while `plane->type` is
+ // definitely zero in this case `(void*) plane->pixels` might NOT be a null pointer!
+ if (plane->type && plane->type != J40__PLANE_EMPTY) {
+ j40__free_aligned((void*) plane->pixels, J40__PIXELS_ALIGN, plane->misalign);
+ }
+ plane->width = plane->height = plane->stride_bytes = 0;
+ plane->type = 0;
+ plane->vshift = plane->hshift = 0;
+ plane->misalign = 0;
+ plane->pixels = (uintptr_t) (void*) 0;
+}
+
+#endif // defined J40_IMPLEMENTATION
+
+////////////////////////////////////////////////////////////////////////////////
+// limits
+
+typedef struct j40__limits {
+ int64_t pixels; // <= 2^61
+ int32_t width; // <= 2^31
+ int32_t height; // <= 2^30
+
+ uint64_t icc_output_size; // < 2^63
+ int32_t bpp; // <= 64
+ int ec_black_allowed;
+
+ int32_t num_extra_channels; // <= 4096
+ int needs_modular_16bit_buffers;
+ int32_t nb_transforms; // <= 273
+
+ int32_t nb_channels_tr; // # of modular channels after transform
+ int32_t tree_depth; // distance between root and leaf nodes
+ int64_t zf_pixels; // total # of pixels in a run of zero-duration frames, unlimited if zero
+} j40__limits;
+
+#ifdef J40_IMPLEMENTATION
+
+J40_STATIC const j40__limits J40__MAIN_LV5_LIMITS = {
+ /*.pixels =*/ 1 << 28, /*.width =*/ 1 << 18, /*.height =*/ 1 << 18,
+ /*.icc_output_size =*/ 1u << 22, /*.bpp =*/ 16, /*.ec_black_allowed =*/ 0,
+ /*.num_extra_channels =*/ 4, /*.needs_modular_16bit_buffers =*/ 1, /*.nb_transforms =*/ 8,
+ /*.nb_channels_tr =*/ 256, /*.tree_depth =*/ 64, /*.zf_pixels =*/ 1 << 28,
+};
+
+#if 0
+J40_STATIC j40__limits J40__MAIN_LV10_LIMITS = {
+ /*.pixels =*/ (int64_t) 1 << 40, /*.width =*/ 1 << 30, /*.height =*/ 1 << 30,
+ /*.icc_output_size =*/ 1u << 28, /*.bpp =*/ 32, /*.ec_black_allowed =*/ 1,
+ /*.num_extra_channels =*/ 256, /*.needs_modular_16bit_buffers =*/ 0, /*.nb_transforms =*/ 512,
+ /*.nb_channels_tr =*/ 1 << 16, /*.tree_depth =*/ 2048, /*.zf_pixels =*/ 0,
+};
+#endif
+
+#endif // defined J40_IMPLEMENTATION
+
+//extern const j40__limits J40__MAIN_LV5_LIMITS/*, J40__MAIN_LV10_LIMITS*/;
+
+////////////////////////////////////////////////////////////////////////////////
+// input source
+
+typedef int (*j40_source_read_func)(uint8_t *buf, int64_t fileoff, size_t maxsize, size_t *size, void *data);
+typedef int (*j40_source_seek_func)(int64_t fileoff, void *data);
+typedef void (*j40_source_free_func)(void *data); // intentionally same to j40_memory_free_func
+
+typedef struct j40__source_st {
+ j40_source_read_func read_func;
+ j40_source_seek_func seek_func;
+ j40_source_free_func free_func;
+ void *data;
+
+ int64_t fileoff; // absolute file offset, assumed to be 0 at the initialization
+ int64_t fileoff_limit; // fileoff can't exceed this; otherwise will behave as if EOF has occurred
+} j40__source_st;
+
+J40__STATIC_RETURNS_ERR j40__init_memory_source(
+ j40__st *st, uint8_t *buf, size_t size, j40_memory_free_func freefunc, j40__source_st *source
+);
+J40__STATIC_RETURNS_ERR j40__init_file_source(j40__st *st, const char *path, j40__source_st *source);
+J40__STATIC_RETURNS_ERR j40__try_read_from_source(
+ j40__st *st, uint8_t *buf, int64_t minsize, int64_t maxsize, int64_t *size
+);
+J40__STATIC_RETURNS_ERR j40__read_from_source(j40__st *st, uint8_t *buf, int64_t size);
+J40__STATIC_RETURNS_ERR j40__seek_from_source(j40__st *st, int64_t fileoff);
+J40_STATIC void j40__free_source(j40__source_st *source);
+
+#ifdef J40_IMPLEMENTATION
+
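+// no bounds check is needed here: j40__try_read_from_source clamps maxsize to fileoff_limit,
+// which j40__init_memory_source sets to the buffer size.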
+J40_STATIC int j40__memory_source_read(uint8_t *buf, int64_t fileoff, size_t maxsize, size_t *size, void *data) {
+ uint8_t *mem = (uint8_t*) data;
+ memcpy(buf, mem + fileoff, maxsize);
+ *size = maxsize;
+ return 0;
+}
+
+J40__STATIC_RETURNS_ERR j40__init_memory_source(
+ j40__st *st, uint8_t *buf, size_t size, j40_memory_free_func freefunc, j40__source_st *source
+) {
+ J40__SHOULD(size <= (uint64_t) INT64_MAX, "flen");
+ source->read_func = j40__memory_source_read;
+ source->seek_func = NULL;
+ source->free_func = freefunc;
+ source->data = buf;
+ source->fileoff = 0;
+ source->fileoff_limit = (int64_t) size;
+J40__ON_ERROR:
+ return st->err;
+}
+
+J40_STATIC int j40__file_source_read(uint8_t *buf, int64_t fileoff, size_t maxsize, size_t *size, void *data) {
+ FILE *fp = (FILE*) data;
+ size_t read;
+
+ (void) fileoff;
+ read = fread(buf, 1, maxsize, fp);
+ if (read > 0) {
+ *size = read;
+ return 0;
+ } else if (feof(fp)) {
+ *size = 0;
+ return 0;
+ } else {
+ return 1;
+ }
+}
+
+J40_STATIC int j40__file_source_seek(int64_t fileoff, void *data) {
+ FILE *fp = (FILE*) data;
+ if (fileoff < 0) return 1;
+ if (fileoff <= LONG_MAX) {
+ if (fseek(fp, (long) fileoff, SEEK_SET) != 0) return 1;
+ } else {
+ if (fseek(fp, LONG_MAX, SEEK_SET) != 0) return 1;
+ fileoff -= LONG_MAX;
+ while (fileoff >= LONG_MAX) {
+ if (fseek(fp, LONG_MAX, SEEK_CUR) != 0) return 1;
+ fileoff -= LONG_MAX;
+ }
+ if (fseek(fp, (long) fileoff, SEEK_CUR) != 0) return 1;
+ }
+ return 0;
+}
+
+J40_STATIC void j40__file_source_free(void *data) {
+ FILE *fp = (FILE*) data;
+ fclose(fp);
+}
+
+J40__STATIC_RETURNS_ERR j40__init_file_source(j40__st *st, const char *path, j40__source_st *source) {
+ FILE *fp;
+ int saved_errno;
+
+ saved_errno = errno;
+ errno = 0;
+ fp = fopen(path, "rb");
+ if (!fp) {
+ st->saved_errno = errno;
+ if (errno == 0) errno = saved_errno;
+ J40__RAISE("open");
+ }
+ errno = saved_errno;
+
+ source->read_func = j40__file_source_read;
+ source->seek_func = j40__file_source_seek;
+ source->free_func = j40__file_source_free;
+ source->data = fp;
+ source->fileoff = 0;
+ source->fileoff_limit = ((uint64_t) INT64_MAX < SIZE_MAX ? INT64_MAX : (int64_t) SIZE_MAX);
+ return 0;
+
+J40__ON_ERROR:
+ if (fp) fclose(fp);
+ return st->err;
+}
+
+J40__STATIC_RETURNS_ERR j40__try_read_from_source(
+ j40__st *st, uint8_t *buf, int64_t minsize, int64_t maxsize, int64_t *size
+) {
+ j40__source_st *source = st->source;
+ int64_t read_size = 0;
+ int saved_errno = errno;
+ errno = 0;
+ *size = 0;
+
+ J40__ASSERT(0 <= minsize && minsize <= maxsize);
+ J40__ASSERT(0 <= source->fileoff && source->fileoff <= source->fileoff_limit);
+
+ // clamp maxsize if fileoff_limit is set
+ J40__ASSERT((uint64_t) source->fileoff_limit <= SIZE_MAX); // so maxsize fits in size_t
+ if (maxsize > source->fileoff_limit - source->fileoff) {
+ maxsize = source->fileoff_limit - source->fileoff;
+ J40__SHOULD(minsize <= maxsize, "shrt"); // `minsize` bytes can't be read due to virtual EOF
+ }
+
+ while (read_size < maxsize) {
+ size_t added_size;
+ if (J40_UNLIKELY(source->read_func(
+ buf + read_size, source->fileoff, (size_t) (maxsize - read_size), &added_size, source->data
+ ))) {
+ st->saved_errno = errno;
+ if (errno == 0) errno = saved_errno;
+ J40__RAISE("read");
+ }
+ if (added_size == 0) break; // EOF or blocking condition
+ J40__SHOULD(added_size <= (uint64_t) INT64_MAX, "flen");
+ read_size += (int64_t) added_size;
+		J40__SHOULD(j40__add64(source->fileoff, (int64_t) added_size, &source->fileoff), "flen");
+ }
+
+ J40__SHOULD(read_size >= minsize, "shrt");
+ errno = saved_errno;
+ *size = read_size;
+J40__ON_ERROR:
+ return st->err;
+}
+
+J40__STATIC_RETURNS_ERR j40__read_from_source(j40__st *st, uint8_t *buf, int64_t size) {
+ int64_t read_size;
+ return j40__try_read_from_source(st, buf, size, size, &read_size);
+}
+
+J40__STATIC_RETURNS_ERR j40__seek_from_source(j40__st *st, int64_t fileoff) {
+ j40__source_st *source = st->source;
+
+ J40__ASSERT(fileoff >= 0);
+ if (fileoff == source->fileoff) return 0;
+
+ fileoff = j40__min64(fileoff, source->fileoff_limit);
+
+	// the memory source's read function always receives the current fileoff, so seeking is a no-op
+ if (source->seek_func) {
+ int saved_errno = errno;
+ errno = 0;
+
+ if (J40_UNLIKELY(source->seek_func(fileoff, source->data))) {
+ st->saved_errno = errno;
+ if (errno == 0) errno = saved_errno;
+ J40__RAISE("seek");
+ }
+
+ errno = saved_errno;
+ }
+
+ source->fileoff = fileoff;
+J40__ON_ERROR:
+ return st->err;
+}
+
+J40_STATIC void j40__free_source(j40__source_st *source) {
+ if (source->free_func) source->free_func(source->data);
+ source->read_func = NULL;
+ source->seek_func = NULL;
+ source->free_func = NULL;
+ source->data = NULL;
+}
+
+#endif // defined J40_IMPLEMENTATION
+
+////////////////////////////////////////////////////////////////////////////////
+// container
+
+typedef struct { int64_t codeoff, fileoff; } j40__map;
+
+enum j40__container_flags {
+ // if set, initial jxl & ftyp boxes have been read
+ J40__CONTAINER_CONFIRMED = 1 << 0,
+
+ // currently seen box types, as they have a cardinality and positional requirement
+ J40__SEEN_JXLL = 1 << 1, // at most once, before jxlc/jxlp
+ J40__SEEN_JXLI = 1 << 2, // at most once
+ J40__SEEN_JXLC = 1 << 3, // precludes jxlp, at most once
+ J40__SEEN_JXLP = 1 << 4, // precludes jxlc
+
+ // if set, no more jxlc/jxlp boxes are allowed (and map no longer changes)
+ J40__NO_MORE_CODESTREAM_BOX = 1 << 5,
+
+ // if set, there is an implied entry for `map[nmap]`. this is required when the last
+ // codestream box has an unknown length and thus it extends to the (unknown) end of file.
+ J40__IMPLIED_LAST_MAP_ENTRY = 1 << 6,
+
+ // if set, there is no more box past `map[nmap-1]` (or an implied `map[nmap]` if any)
+ J40__NO_MORE_BOX = 1 << 7,
+};
+
+typedef struct j40__container_st {
+ int flags; // bitset of `enum j40__container_flags`
+ // map[0..nmap) encodes two arrays C[i] = map[i].codeoff and F[i] = map[i].fileoff,
+ // so that codestream offsets [C[k], C[k+1]) map to file offsets [F[k], F[k] + (C[k+1] - C[k])).
+ // all codestream offsets less than the largest C[i] are 1-to-1 mapped to file offsets.
+ //
+ // the last entry, in particular F[nmap-1], has multiple interpretations.
+ // if the mapping is still being built, F[nmap-1] is the start of the next box to be read.
+ // if an implicit map entry flag is set, F[nmap] = L and C[nmap] = C[nmap-1] + (L - F[nmap-1])
+ // where L is the file length (which is not directly available).
+ j40__map *map;
+ int32_t nmap, map_cap;
+} j40__container_st;
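+
+// worked example (hypothetical): after the 32-byte signature, two jxlp boxes of 48 and 28
+// bytes carry codestream payloads at file offsets [44, 80) and [92, 108). the map is then
+//   C[] = { 0, 36, 52 }, F[] = { 44, 92, 108 }, nmap = 3,
+// and codestream offset 40 maps to file offset F[1] + (40 - C[1]) = 96.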
+
+J40_ALWAYS_INLINE uint32_t j40__u32be(uint8_t *p);
+J40__STATIC_RETURNS_ERR j40__box_header(j40__st *st, uint32_t *type, int64_t *size);
+J40__STATIC_RETURNS_ERR j40__container(j40__st *st, int64_t wanted_codeoff);
+J40_STATIC int32_t j40__search_codestream_offset(const j40__st *st, int64_t codeoff);
+J40__STATIC_RETURNS_ERR j40__map_codestream_offset(j40__st *st, int64_t codeoff, int64_t *fileoff);
+J40_STATIC void j40__free_container(j40__container_st *container);
+
+#ifdef J40_IMPLEMENTATION
+
+J40_ALWAYS_INLINE uint32_t j40__u32be(uint8_t *p) {
+ return ((uint32_t) p[0] << 24) | ((uint32_t) p[1] << 16) | ((uint32_t) p[2] << 8) | (uint32_t) p[3];
+}
+
+// size is < 0 if EOF, or INT64_MAX if the box extends indefinitely until the end of file
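+// e.g. the header bytes 00 00 00 14 66 74 79 70 parse to size32 = 0x14 and type `ftyp`,
+// so *size = 0x14 - 8 = 12 content bytes; size32 == 0 instead yields *size = INT64_MAX.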
+J40__STATIC_RETURNS_ERR j40__box_header(j40__st *st, uint32_t *type, int64_t *size) {
+ uint8_t buf[8];
+ uint32_t size32;
+ uint64_t size64;
+ int64_t headersize;
+
+ J40__TRY(j40__try_read_from_source(st, buf, 0, 8, &headersize));
+ if (headersize == 0) {
+ *size = -1;
+ return 0;
+ }
+ J40__SHOULD(headersize == 8, "shrt"); // if not EOF, the full header should have been read
+
+ size32 = j40__u32be(buf);
+ *type = j40__u32be(buf + 4);
+ if (size32 == 0) {
+ *size = INT64_MAX;
+ } else if (size32 == 1) {
+ J40__TRY(j40__read_from_source(st, buf, 8));
+ size64 = ((uint64_t) j40__u32be(buf) << 32) | (uint64_t) j40__u32be(buf + 4);
+ J40__SHOULD(size64 >= 16, "boxx");
+ J40__SHOULD(size64 <= INT64_MAX, "flen");
+ *size = (int64_t) size64 - 16;
+ } else {
+ J40__SHOULD(size32 >= 8, "boxx");
+ *size = (int64_t) size32 - 8;
+ }
+
+J40__ON_ERROR:
+ return st->err;
+}
+
+// scans as many boxes as required to map the given codestream offset (an inclusive limit).
+// this is done on a best-effort basis, so even after this call
+// `j40__map_codestream_offset(st, wanted_codeoff)` may still fail.
+J40__STATIC_RETURNS_ERR j40__container(j40__st *st, int64_t wanted_codeoff) {
+ static const uint8_t JXL_BOX[12] = { // type `JXL `, value 0D 0A 87 0A
+ 0x00, 0x00, 0x00, 0x0c, 0x4a, 0x58, 0x4c, 0x20, 0x0d, 0x0a, 0x87, 0x0a,
+ }, FTYP_BOX[20] = { // type `ftyp`, brand `jxl `, version 0, only compatible w/ brand `jxl `
+ 0x00, 0x00, 0x00, 0x14, 0x66, 0x74, 0x79, 0x70, 0x6a, 0x78, 0x6c, 0x20,
+ 0x00, 0x00, 0x00, 0x00, 0x6a, 0x78, 0x6c, 0x20,
+ };
+
+ j40__source_st *source = st->source;
+ j40__container_st *c = st->container;
+ uint8_t buf[32];
+
+ if (!c->map) {
+ c->map_cap = 8;
+ c->nmap = 1;
+ J40__TRY_MALLOC(j40__map, &c->map, (size_t) c->map_cap);
+ c->map[0].codeoff = c->map[0].fileoff = 0; // fileoff will be updated
+ }
+
+ // immediately return if given codeoff is already mappable
+ if (c->flags & J40__IMPLIED_LAST_MAP_ENTRY) return 0;
+ if (wanted_codeoff < c->map[c->nmap - 1].codeoff) return 0;
+
+ // read the file header (if not yet read) and skip to the next box header
+ if (c->flags & J40__CONTAINER_CONFIRMED) {
+ J40__TRY(j40__seek_from_source(st, c->map[c->nmap - 1].fileoff));
+ } else {
+ J40__TRY(j40__seek_from_source(st, 0));
+
+ J40__TRY(j40__read_from_source(st, buf, 2));
+ if (buf[0] == 0xff && buf[1] == 0x0a) { // bare codestream
+ c->flags = J40__CONTAINER_CONFIRMED | J40__IMPLIED_LAST_MAP_ENTRY;
+ return 0;
+ }
+
+ J40__SHOULD(buf[0] == JXL_BOX[0] && buf[1] == JXL_BOX[1], "!jxl");
+ J40__TRY(j40__read_from_source(st, buf, sizeof(JXL_BOX) + sizeof(FTYP_BOX) - 2));
+ J40__SHOULD(memcmp(buf, JXL_BOX + 2, sizeof(JXL_BOX) - 2) == 0, "!jxl");
+ J40__SHOULD(memcmp(buf + (sizeof(JXL_BOX) - 2), FTYP_BOX, sizeof(FTYP_BOX)) == 0, "ftyp");
+ c->flags |= J40__CONTAINER_CONFIRMED;
+ c->map[0].fileoff = source->fileoff;
+ }
+
+ while (wanted_codeoff >= c->map[c->nmap - 1].codeoff) {
+ uint32_t type;
+ int64_t size;
+ int codestream_box = 0;
+
+ J40__TRY(j40__box_header(st, &type, &size));
+ if (size < 0) break;
+
+ // TODO the ordering rule for jxll/jxli may change in the future version of 18181-2
+ switch (type) {
+ case 0x6a786c6c: // jxll: codestream level
+ J40__SHOULD(!(c->flags & J40__SEEN_JXLL), "box?");
+ c->flags |= J40__SEEN_JXLL;
+ break;
+
+ case 0x6a786c69: // jxli: frame index
+ J40__SHOULD(!(c->flags & J40__SEEN_JXLI), "box?");
+ c->flags |= J40__SEEN_JXLI;
+ break;
+
+ case 0x6a786c63: // jxlc: single codestream
+ J40__SHOULD(!(c->flags & J40__NO_MORE_CODESTREAM_BOX), "box?");
+ J40__SHOULD(!(c->flags & (J40__SEEN_JXLP | J40__SEEN_JXLC)), "box?");
+ c->flags |= J40__SEEN_JXLC | J40__NO_MORE_CODESTREAM_BOX;
+ codestream_box = 1;
+ break;
+
+ case 0x6a786c70: // jxlp: partial codestreams
+ J40__SHOULD(!(c->flags & J40__NO_MORE_CODESTREAM_BOX), "box?");
+ J40__SHOULD(!(c->flags & J40__SEEN_JXLC), "box?");
+ c->flags |= J40__SEEN_JXLP;
+ codestream_box = 1;
+ J40__SHOULD(size >= 4, "jxlp");
+ J40__TRY(j40__read_from_source(st, buf, 4));
+ // TODO the partial codestream index is ignored right now
+ if (buf[0] >> 7) c->flags |= J40__NO_MORE_CODESTREAM_BOX; // the high bit marks the final jxlp box
+ if (size < INT64_MAX) size -= 4;
+ break;
+
+ case 0x62726f62: // brob: brotli-compressed box
+ J40__SHOULD(size > 4, "brot"); // Brotli stream is never empty so 4 is also out
+ J40__TRY(j40__read_from_source(st, buf, 4));
+ type = j40__u32be(buf);
+ J40__SHOULD(type != 0x62726f62 /*brob*/ && (type >> 8) != 0x6a786c /*jxl*/, "brot");
+ if (size < INT64_MAX) size -= 4;
+ break;
+ } // other boxes have no additional requirements and are simply skipped
+
+ // this box has an indeterminate size and thus there is no more box following
+ if (size == INT64_MAX) {
+ if (codestream_box) c->flags |= J40__IMPLIED_LAST_MAP_ENTRY;
+ c->flags |= J40__NO_MORE_BOX;
+ break;
+ }
+
+ if (codestream_box) {
+ // add a new entry. at this point C[nmap-1] is the first codestream offset in this box
+ // and F[nmap-1] points to the beginning of this box, which should be updated to
+ // the beginning of the box *contents*.
+ J40__TRY_REALLOC32(j40__map, &c->map, c->nmap + 1, &c->map_cap);
+ c->map[c->nmap - 1].fileoff = source->fileoff;
+ J40__SHOULD(j40__add64(c->map[c->nmap - 1].codeoff, size, &c->map[c->nmap].codeoff), "flen");
+ // the corresponding F[nmap] is set by the common path below, once nmap is incremented.
+ J40__SHOULD(j40__add32(c->nmap, 1, &c->nmap), "flen");
+ }
+
+ // always maintain F[nmap-1] as the beginning of the next box (and seek to that point).
+ // this box's header has already been consumed, so F[nmap-1] is updated even if the seek fails.
+ J40__SHOULD(j40__add64(source->fileoff, size, &c->map[c->nmap - 1].fileoff), "flen");
+ J40__TRY(j40__seek_from_source(st, c->map[c->nmap - 1].fileoff));
+ }
+
+ // now the EOF has been reached or the last box had an indeterminate size.
+ // EOF condition can be recovered (i.e. we can add more boxes to get it correctly decoded)
+ // so it's not a hard error, but we can't recover from an indeterminately sized box.
+ if ((c->flags & J40__NO_MORE_BOX) && !(c->flags & (J40__SEEN_JXLC | J40__SEEN_JXLP))) {
+ st->cannot_retry = 1;
+ J40__RAISE("shrt");
+ }
+
+J40__ON_ERROR:
+ return st->err;
+}
+
+// returns i such that codeoff is in [C[i], C[i+1]), or nmap-1 if there is no such map entry
+J40_STATIC int32_t j40__search_codestream_offset(const j40__st *st, int64_t codeoff) {
+ j40__map *map = st->container->map;
+ int32_t nmap = st->container->nmap, i;
+ J40__ASSERT(map && nmap > 0);
+ // TODO use a binary search instead
+ for (i = 1; i < nmap; ++i) {
+ if (codeoff < map[i].codeoff) break;
+ }
+ return i - 1;
+}
+
+J40__STATIC_RETURNS_ERR j40__map_codestream_offset(j40__st *st, int64_t codeoff, int64_t *fileoff) {
+ j40__map *map = st->container->map;
+ int32_t nmap = st->container->nmap, i;
+
+ i = j40__search_codestream_offset(st, codeoff);
+ if (i < nmap - 1) {
+ J40__ASSERT(codeoff - map[i].codeoff < map[i+1].fileoff - map[i].fileoff);
+ *fileoff = map[i].fileoff + (codeoff - map[i].codeoff); // thus this never overflows
+ } else if (st->container->flags & J40__IMPLIED_LAST_MAP_ENTRY) {
+ J40__SHOULD(j40__add64(map[nmap-1].fileoff, codeoff - map[nmap-1].codeoff, fileoff), "flen");
+ } else if (st->container->flags & J40__NO_MORE_CODESTREAM_BOX) {
+ // TODO is this valid to do? j40__end_of_frame depends on this.
+ if (codeoff == map[nmap-1].codeoff) {
+ *fileoff = map[nmap-1].fileoff;
+ } else {
+ st->cannot_retry = 1;
+ J40__RAISE("shrt");
+ }
+ } else {
+ J40__RAISE("shrt");
+ }
+
+J40__ON_ERROR:
+ return st->err;
+}
+
+J40_STATIC void j40__free_container(j40__container_st *container) {
+ j40__free(container->map);
+ container->map = NULL;
+ container->nmap = container->map_cap = 0;
+}
+
+#endif // defined J40_IMPLEMENTATION
+
+////////////////////////////////////////////////////////////////////////////////
+// backing buffer
+
+typedef struct j40__buffer_st {
+ uint8_t *buf;
+ int64_t size, capacity;
+
+ int64_t next_codeoff; // the codestream offset right past the backing buffer (i.e. `buf[size]`)
+ int64_t codeoff_limit; // codestream offset can't exceed this; used for per-section decoding
+
+ j40__bits_st checkpoint; // the earliest point that the parser can ever backtrack
+} j40__buffer_st;
+
+J40__STATIC_RETURNS_ERR j40__init_buffer(j40__st *st, int64_t codeoff, int64_t codeoff_limit);
+J40__STATIC_RETURNS_ERR j40__refill_buffer(j40__st *st);
+J40__STATIC_RETURNS_ERR j40__seek_buffer(j40__st *st, int64_t codeoff);
+J40_STATIC int64_t j40__codestream_offset(const j40__st *st);
+J40_MAYBE_UNUSED J40_STATIC int64_t j40__bits_read(const j40__st *st);
+J40_STATIC void j40__free_buffer(j40__buffer_st *buffer);
+
+#ifdef J40_IMPLEMENTATION
+
+#define J40__INITIAL_BUFSIZE 0x10000
+
+J40__STATIC_RETURNS_ERR j40__init_buffer(j40__st *st, int64_t codeoff, int64_t codeoff_limit) {
+ j40__bits_st *bits = &st->bits, *checkpoint = &st->buffer->checkpoint;
+ j40__buffer_st *buffer = st->buffer;
+
+ J40__ASSERT(!buffer->buf);
+ J40__TRY_MALLOC(uint8_t, &buffer->buf, J40__INITIAL_BUFSIZE);
+ bits->ptr = bits->end = buffer->buf;
+ buffer->size = 0;
+ buffer->capacity = J40__INITIAL_BUFSIZE;
+ buffer->next_codeoff = codeoff;
+ buffer->codeoff_limit = codeoff_limit;
+ bits->bits = 0;
+ bits->nbits = 0;
+ *checkpoint = *bits;
+J40__ON_ERROR:
+ return st->err;
+}
+
+J40__STATIC_RETURNS_ERR j40__refill_buffer(j40__st *st) {
+ j40__bits_st *bits = &st->bits, *checkpoint = &st->buffer->checkpoint;
+ j40__buffer_st *buffer = st->buffer;
+ j40__container_st *container = st->container;
+ int64_t available, wanted_codeoff;
+ int32_t i;
+
+ J40__ASSERT(J40__INBOUNDS(bits->ptr, buffer->buf, buffer->size));
+ J40__ASSERT(J40__INBOUNDS(checkpoint->ptr, buffer->buf, buffer->size));
+ J40__ASSERT(checkpoint->ptr <= bits->ptr);
+
+ // trim the committed portion from the backing buffer
+ if (checkpoint->ptr > buffer->buf) {
+ int64_t committed_size = (int64_t) (checkpoint->ptr - buffer->buf);
+ J40__ASSERT(committed_size <= buffer->size); // so committed_size can't overflow
+ // this also can't overflow, because buffer->size never exceeds SIZE_MAX
+ memmove(buffer->buf, checkpoint->ptr, (size_t) (buffer->size - committed_size));
+ buffer->size -= committed_size;
+ bits->ptr -= committed_size;
+ bits->end -= committed_size;
+ checkpoint->ptr = buffer->buf;
+ }
+
+ // if there is no room left in the backing buffer, it's time to grow it
+ if (buffer->size == buffer->capacity) {
+ int64_t newcap = j40__clamp_add64(buffer->capacity, buffer->capacity);
+ ptrdiff_t relptr = bits->ptr - buffer->buf;
+ J40__TRY_REALLOC64(uint8_t, &buffer->buf, newcap, &buffer->capacity);
+ bits->ptr = buffer->buf + relptr;
+ checkpoint->ptr = buffer->buf;
+ }
+
+ wanted_codeoff = j40__min64(buffer->codeoff_limit,
+ j40__clamp_add64(buffer->next_codeoff, buffer->capacity - buffer->size));
+ available = wanted_codeoff - buffer->next_codeoff;
+ --wanted_codeoff; // ensure that this is inclusive, i.e. the last byte offset *allowed*
+
+ // do the initial mapping if no map is available
+ if (!container->map) J40__TRY(j40__container(st, wanted_codeoff));
+
+ i = j40__search_codestream_offset(st, buffer->next_codeoff);
+ while (available > 0) {
+ j40__map *map = container->map;
+ int32_t nmap = container->nmap;
+ int64_t fileoff, readable_size, read_size;
+
+ if (i < nmap - 1) {
+ int64_t box_size = map[i+1].codeoff - map[i].codeoff;
+ J40__ASSERT(box_size > 0);
+ readable_size = j40__min64(available, map[i+1].codeoff - buffer->next_codeoff);
+ J40__ASSERT(buffer->next_codeoff - map[i].codeoff < map[i+1].fileoff - map[i].fileoff);
+ fileoff = map[i].fileoff + (buffer->next_codeoff - map[i].codeoff); // thus can't overflow
+ } else if (container->flags & J40__IMPLIED_LAST_MAP_ENTRY) {
+ readable_size = available;
+ J40__SHOULD(
+ j40__add64(map[i].fileoff, buffer->next_codeoff - map[nmap-1].codeoff, &fileoff),
+ "flen");
+ } else {
+ // we have reached past the last mapped box, but there may be more boxes to map
+ J40__TRY(j40__container(st, wanted_codeoff));
+ if (nmap == container->nmap && !(container->flags & J40__IMPLIED_LAST_MAP_ENTRY)) {
+ break; // no additional box mapped, nothing can be done
+ }
+ continue;
+ }
+ J40__ASSERT(readable_size > 0);
+
+ J40__TRY(j40__seek_from_source(st, fileoff));
+ J40__TRY(j40__try_read_from_source(st, buffer->buf + buffer->size, 0, readable_size, &read_size));
+ if (read_size == 0) break; // EOF or blocking condition, can't continue
+
+ buffer->size += read_size;
+ J40__SHOULD(j40__add64(buffer->next_codeoff, read_size, &buffer->next_codeoff), "flen");
+ bits->end = checkpoint->end = buffer->buf + buffer->size;
+ available -= read_size;
+ if (read_size == readable_size) ++i; // this mapped range is exhausted; otherwise retry the incomplete read
+ }
+
+J40__ON_ERROR:
+ return st->err;
+}
+
+J40__STATIC_RETURNS_ERR j40__seek_buffer(j40__st *st, int64_t codeoff) {
+ int64_t reusable_size = st->buffer->next_codeoff - codeoff, fileoff;
+ st->bits.bits = 0;
+ st->bits.nbits = 0;
+ if (0 < reusable_size && reusable_size <= st->buffer->size) {
+ st->bits.ptr = st->buffer->buf + (st->buffer->size - reusable_size);
+ st->bits.end = st->buffer->buf + st->buffer->size;
+ } else {
+ st->bits.ptr = st->bits.end = st->buffer->buf;
+ st->buffer->size = 0;
+ st->buffer->next_codeoff = codeoff;
+ J40__TRY(j40__map_codestream_offset(st, codeoff, &fileoff));
+ J40__TRY(j40__seek_from_source(st, fileoff));
+ }
+J40__ON_ERROR:
+ return st->err;
+}
+
+J40_STATIC int64_t j40__codestream_offset(const j40__st *st) {
+ J40__ASSERT(st->bits.nbits % 8 == 0);
+ return st->buffer->next_codeoff - st->buffer->size + (st->bits.ptr - st->buffer->buf) - st->bits.nbits / 8;
+}
+
+// diagnostic only, doesn't check for overflow or anything
+J40_MAYBE_UNUSED J40_STATIC int64_t j40__bits_read(const j40__st *st) {
+ int32_t nbytes = j40__ceil_div32(st->bits.nbits, 8), nbits = 8 * nbytes - st->bits.nbits;
+ // the codestream offset for the byte that contains the first bit to read
+ int64_t codeoff = st->buffer->next_codeoff - st->buffer->size + (st->bits.ptr - st->buffer->buf) - nbytes;
+ j40__map map = st->container->map[j40__search_codestream_offset(st, codeoff)];
+ return (map.fileoff + (codeoff - map.codeoff)) * 8 + nbits;
+}
+
+J40_STATIC void j40__free_buffer(j40__buffer_st *buffer) {
+ j40__free(buffer->buf);
+ buffer->buf = NULL;
+ buffer->size = buffer->capacity = 0;
+}
+
+#endif // defined J40_IMPLEMENTATION
+
+////////////////////////////////////////////////////////////////////////////////
+// bitstream
+
+J40__STATIC_RETURNS_ERR j40__always_refill(j40__st *st, int32_t n);
+// ensure that st->bits.nbits is at least n, pulling as many bytes as possible into st->bits.bits if not
+#define j40__refill(st, n) (J40_UNLIKELY(st->bits.nbits < (n)) ? j40__always_refill(st, n) : st->err)
+
+J40__INLINE_RETURNS_ERR j40__zero_pad_to_byte(j40__st *st);
+J40__STATIC_RETURNS_ERR j40__skip(j40__st *st, int64_t n);
+
+J40_INLINE int32_t j40__u(j40__st *st, int32_t n);
+J40_INLINE int64_t j40__64u(j40__st *st, int32_t n);
+J40_INLINE int32_t j40__u32(
+ j40__st *st,
+ int32_t o0, int32_t n0, int32_t o1, int32_t n1,
+ int32_t o2, int32_t n2, int32_t o3, int32_t n3
+);
+J40_INLINE int64_t j40__64u32(
+ j40__st *st,
+ int32_t o0, int32_t n0, int32_t o1, int32_t n1,
+ int32_t o2, int32_t n2, int32_t o3, int32_t n3
+);
+J40_STATIC uint64_t j40__u64(j40__st *st);
+J40_INLINE int32_t j40__enum(j40__st *st);
+J40_INLINE float j40__f16(j40__st *st);
+J40_INLINE int32_t j40__u8(j40__st *st);
+J40_INLINE int32_t j40__at_most(j40__st *st, int32_t max);
+J40__STATIC_RETURNS_ERR j40__no_more_bytes(j40__st *st);
+
+#ifdef J40_IMPLEMENTATION
+
+J40__STATIC_RETURNS_ERR j40__always_refill(j40__st *st, int32_t n) {
+ static const int32_t NBITS = 64;
+ j40__bits_st *bits = &st->bits;
+
+ J40__ASSERT(0 <= n && n < NBITS);
+ while (1) {
+ int32_t consumed = (NBITS - bits->nbits) >> 3;
+ if (J40_LIKELY(bits->end - bits->ptr >= consumed)) {
+ // fast case: consume `consumed` bytes from the logical buffer
+ J40__ASSERT(bits->nbits <= NBITS - 8);
+ do {
+ bits->bits |= (uint64_t) *bits->ptr++ << bits->nbits;
+ bits->nbits += 8;
+ } while (bits->nbits <= NBITS - 8);
+ break;
+ }
+
+ // slow case: the logical buffer has been exhausted, try to refill the backing buffer
+ while (bits->ptr < bits->end) {
+ bits->bits |= (uint64_t) *bits->ptr++ << bits->nbits;
+ bits->nbits += 8;
+ }
+ if (bits->nbits > NBITS - 8) break;
+
+ J40__SHOULD(st->buffer, "shrt");
+ J40__TRY(j40__refill_buffer(st));
+ if (bits->end == bits->ptr) { // no possibility to read more bits
+ if (bits->nbits >= n) break;
+ J40__RAISE("shrt");
+ }
+ // otherwise now we have possibly more bits to refill, try again
+ }
+
+J40__ON_ERROR:
+ return st->err;
+}
+
+J40__INLINE_RETURNS_ERR j40__zero_pad_to_byte(j40__st *st) {
+ int32_t n = st->bits.nbits & 7;
+ if (st->bits.bits & ((1u << n) - 1)) return J40__ERR("pad0");
+ st->bits.bits >>= n;
+ st->bits.nbits -= n;
+ return st->err;
+}
+
+J40__STATIC_RETURNS_ERR j40__skip(j40__st *st, int64_t n) {
+ j40__bits_st *bits = &st->bits;
+ int64_t bytes;
+ if (bits->nbits >= n) {
+ bits->bits >>= (int32_t) n;
+ bits->nbits -= (int32_t) n;
+ } else {
+ n -= bits->nbits;
+ bits->bits = 0;
+ bits->nbits = 0;
+ }
+ bytes = n >> 3;
+ // TODO honor containers
+ if (bits->end - bits->ptr < (int64_t) bytes) return J40__ERR("shrt");
+ bits->ptr += bytes;
+ n &= 7;
+ if (j40__refill(st, (int32_t) n)) return st->err;
+ bits->bits >>= (int32_t) n;
+ bits->nbits -= (int32_t) n;
+ return st->err;
+}
+
+J40_INLINE int32_t j40__u(j40__st *st, int32_t n) {
+ int32_t ret;
+ J40__ASSERT(0 <= n && n <= 31);
+ if (j40__refill(st, n)) return 0;
+ ret = (int32_t) (st->bits.bits & ((1u << n) - 1));
+ st->bits.bits >>= n;
+ st->bits.nbits -= n;
+ return ret;
+}
+
+J40_INLINE int64_t j40__64u(j40__st *st, int32_t n) {
+ int64_t ret;
+ J40__ASSERT(0 <= n && n <= 63);
+ if (j40__refill(st, n)) return 0;
+ ret = (int64_t) (st->bits.bits & (((uint64_t) 1u << n) - 1));
+ st->bits.bits >>= n;
+ st->bits.nbits -= n;
+ return ret;
+}
+
+J40_INLINE int32_t j40__u32(
+ j40__st *st,
+ int32_t o0, int32_t n0, int32_t o1, int32_t n1,
+ int32_t o2, int32_t n2, int32_t o3, int32_t n3
+) {
+ const int32_t o[4] = { o0, o1, o2, o3 };
+ const int32_t n[4] = { n0, n1, n2, n3 };
+ int32_t sel;
+ J40__ASSERT(0 <= n0 && n0 <= 30 && o0 <= 0x7fffffff - (1 << n0));
+ J40__ASSERT(0 <= n1 && n1 <= 30 && o1 <= 0x7fffffff - (1 << n1));
+ J40__ASSERT(0 <= n2 && n2 <= 30 && o2 <= 0x7fffffff - (1 << n2));
+ J40__ASSERT(0 <= n3 && n3 <= 30 && o3 <= 0x7fffffff - (1 << n3));
+ sel = j40__u(st, 2);
+ return j40__u(st, n[sel]) + o[sel];
+}
+
+J40_INLINE int64_t j40__64u32(
+ j40__st *st,
+ int32_t o0, int32_t n0, int32_t o1, int32_t n1,
+ int32_t o2, int32_t n2, int32_t o3, int32_t n3
+) {
+ const int32_t o[4] = { o0, o1, o2, o3 };
+ const int32_t n[4] = { n0, n1, n2, n3 };
+ int32_t sel;
+ J40__ASSERT(0 <= n0 && n0 <= 62);
+ J40__ASSERT(0 <= n1 && n1 <= 62);
+ J40__ASSERT(0 <= n2 && n2 <= 62);
+ J40__ASSERT(0 <= n3 && n3 <= 62);
+ sel = j40__u(st, 2);
+ return j40__64u(st, n[sel]) + (int64_t) o[sel];
+}
+
+J40_STATIC uint64_t j40__u64(j40__st *st) {
+ int32_t sel = j40__u(st, 2), shift;
+ uint64_t ret = (uint64_t) j40__u(st, sel * 4);
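+ // for sel < 3 the value offsets are 0, 1 and 17; 17u >> (8 - sel * 4) evaluates to exactly those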
+ if (sel < 3) {
+ ret += 17u >> (8 - sel * 4);
+ } else {
+ for (shift = 12; shift < 64 && j40__u(st, 1); shift += 8) {
+ ret |= (uint64_t) j40__u(st, shift < 56 ? 8 : 64 - shift) << shift;
+ }
+ }
+ return ret;
+}
+
+J40_INLINE int32_t j40__enum(j40__st *st) {
+ int32_t ret = j40__u32(st, 0, 0, 1, 0, 2, 4, 18, 6);
+ // the spec says it should be 64, but the largest enum value in use is 18 (kHLG);
+ // we have to reject unknown enum values anyway so we use a smaller limit to avoid overflow
+ if (ret >= 31) return J40__ERR("enum"), 0;
+ return ret;
+}
+
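+// binary16 decoding: 1 sign bit, 5 exponent bits, 10 mantissa bits. e.g. the bits 0x3c00
+// have biased_exp 15 and a zero mantissa, decoding to (0x400 | 0) * 2^(15 - 25) = 1.0f.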
+J40_INLINE float j40__f16(j40__st *st) {
+ int32_t bits = j40__u(st, 16);
+ int32_t biased_exp = (bits >> 10) & 0x1f;
+ if (biased_exp == 31) return J40__ERR("!fin"), 0.0f;
+ // subnormal halves (biased_exp 0) scale by 2^-24; normals carry the implicit 0x400 bit
+ return (bits >> 15 ? -1 : 1) * ldexpf((float) ((bits & 0x3ff) | (biased_exp > 0 ? 0x400 : 0)), biased_exp > 0 ? biased_exp - 25 : -24);
+}
+
+J40_INLINE int32_t j40__u8(j40__st *st) { // ANS distribution decoding only
+ if (j40__u(st, 1)) {
+ int32_t n = j40__u(st, 3);
+ return j40__u(st, n) + (1 << n);
+ } else {
+ return 0;
+ }
+}
+
+// equivalent to u(ceil(log2(max + 1))), decodes [0, max] with the minimal number of bits
+J40_INLINE int32_t j40__at_most(j40__st *st, int32_t max) {
+ int32_t v = max > 0 ? j40__u(st, j40__ceil_lg32((uint32_t) max + 1)) : 0;
+ if (v > max) return J40__ERR("rnge"), 0;
+ return v;
+}
+
+// ensures that we have reached the end of file or advertised section with proper padding
+J40__STATIC_RETURNS_ERR j40__no_more_bytes(j40__st *st) {
+ J40__TRY(j40__zero_pad_to_byte(st));
+ J40__SHOULD(st->bits.nbits == 0 && st->bits.ptr == st->bits.end, "excs");
+J40__ON_ERROR:
+ return st->err;
+}
+
+#endif // defined J40_IMPLEMENTATION
+
+////////////////////////////////////////////////////////////////////////////////
+// prefix code
+
+J40__STATIC_RETURNS_ERR j40__prefix_code_tree(
+ j40__st *st, int32_t l2size, int32_t *out_fast_len, int32_t *out_max_len, int32_t **out_table
+);
+J40_INLINE int32_t j40__prefix_code(j40__st *st, int32_t fast_len, int32_t max_len, const int32_t *table);
+
+#ifdef J40_IMPLEMENTATION
+
+// a prefix code tree is represented by max_len (max code length), fast_len (explained below),
+// and an int32_t table either statically or dynamically constructed.
+// table[0] .. table[(1 << fast_len) - 1] are a lookup table for first fast_len bits.
+// each entry is either a direct entry (positive),
+// or an index to the first overflow entry (negative, the actual index is -table[i]).
+//
+// subsequent overflow entries are used for codes with the length > fast_len;
+// the decoder scans overflow entries in order, stopping at the first match.
+// the last overflow entry is implicitly matched, so the table is constructed to guarantee a match.
+//
+// a direct or overflow entry format:
+// - bits 0..3: codeword length (direct), or codeword length minus fast_len (overflow)
+// - bits 4..15: codeword, skipping the first fast_len bits, ordered like st->bits.bits (overflow only)
+// - bits 16..30: corresponding alphabet symbol
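+//
+// worked example (hypothetical): code lengths {1, 2, 2} for symbols {a, b, c} with
+// fast_len = max_len = 2 yield a 4-entry LUT indexed by the low two bits of st->bits.bits:
+// indices 00 and 10 hold (a << 16) | 1 (only one bit is consumed),
+// index 01 holds (b << 16) | 2 and index 11 holds (c << 16) | 2.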
+
+enum { J40__MAX_TYPICAL_FAST_LEN = 7 }; // limit fast_len for typical cases
+enum { J40__MAX_TABLE_GROWTH = 2 }; // we can afford 2x the table size if beneficial though
+
+// read a prefix code tree, as specified in RFC 7932 section 3
+J40__STATIC_RETURNS_ERR j40__prefix_code_tree(
+ j40__st *st, int32_t l2size, int32_t *out_fast_len, int32_t *out_max_len, int32_t **out_table
+) {
+ static const uint8_t REV5[32] = {
+ 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30,
+ 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31,
+ };
+
+ // for ordinary cases we have three different prefix codes:
+ // layer 0 (fixed): up to 4 bits, decoding into 0..5, used L1SIZE = 18 times
+ // layer 1: up to 5 bits, decoding into 0..17, used l2size times
+ // layer 2: up to 15 bits, decoding into 0..l2size-1
+ enum { L1SIZE = 18, L0MAXLEN = 4, L1MAXLEN = 5, L2MAXLEN = 15 };
+ enum { L1CODESUM = 1 << L1MAXLEN, L2CODESUM = 1 << L2MAXLEN };
+ static const int32_t L0TABLE[1 << L0MAXLEN] = {
+ 0x00002, 0x40002, 0x30002, 0x20003, 0x00002, 0x40002, 0x30002, 0x10004,
+ 0x00002, 0x40002, 0x30002, 0x20003, 0x00002, 0x40002, 0x30002, 0x50004,
+ };
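+ // each entry is (symbol << 16) | length; e.g. 0x20003 decodes symbol 2 by consuming 3 bits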
+ static const uint8_t L1ZIGZAG[L1SIZE] = {1,2,3,4,0,5,17,6,16,7,8,9,10,11,12,13,14,15};
+
+ int32_t l1lengths[L1SIZE] = {0}, *l2lengths = NULL;
+ int32_t l1counts[L1MAXLEN + 1] = {0}, l2counts[L2MAXLEN + 1] = {0};
+ int32_t l1starts[L1MAXLEN + 1], l2starts[L2MAXLEN + 1], l2overflows[L2MAXLEN + 1];
+ int32_t l1table[1 << L1MAXLEN] = {0}, *l2table = NULL;
+ int32_t total, code, hskip, fast_len, i, j;
+
+ J40__ASSERT(l2size > 0 && l2size <= 0x8000);
+ if (l2size == 1) { // SPEC missing this case
+ *out_fast_len = *out_max_len = 0;
+ J40__TRY_MALLOC(int32_t, out_table, 1);
+ (*out_table)[0] = 0;
+ return 0;
+ }
+
+ hskip = j40__u(st, 2);
+ if (hskip == 1) { // simple prefix codes (section 3.4)
+ static const struct { int8_t maxlen, sortfrom, sortto, len[8], symref[8]; } TEMPLATES[5] = {
+ { 3, 2, 4, {1,2,1,3,1,2,1,3}, {0,1,0,2,0,1,0,3} }, // NSYM=4 tree-select 1 (1233)
+ { 0, 0, 0, {0}, {0} }, // NSYM=1 (0)
+ { 1, 0, 2, {1,1}, {0,1} }, // NSYM=2 (11)
+ { 2, 1, 3, {1,2,1,2}, {0,1,0,2} }, // NSYM=3 (122)
+ { 2, 0, 4, {2,2,2,2}, {0,1,2,3} }, // NSYM=4 tree-select 0 (2222)
+ };
+ int32_t nsym = j40__u(st, 2) + 1, syms[4], tmp;
+ for (i = 0; i < nsym; ++i) {
+ syms[i] = j40__at_most(st, l2size - 1);
+ for (j = 0; j < i; ++j) J40__SHOULD(syms[i] != syms[j], "hufd");
+ }
+ if (nsym == 4 && j40__u(st, 1)) nsym = 0; // tree-select
+ J40__RAISE_DELAYED();
+
+ // symbols of the equal length have to be sorted
+ for (i = TEMPLATES[nsym].sortfrom + 1; i < TEMPLATES[nsym].sortto; ++i) {
+ for (j = i; j > TEMPLATES[nsym].sortfrom && syms[j - 1] > syms[j]; --j) {
+ tmp = syms[j - 1];
+ syms[j - 1] = syms[j];
+ syms[j] = tmp;
+ }
+ }
+
+ *out_fast_len = *out_max_len = TEMPLATES[nsym].maxlen;
+ J40__TRY_MALLOC(int32_t, out_table, 1u << *out_max_len);
+ for (i = 0; i < (1 << *out_max_len); ++i) {
+ (*out_table)[i] = (syms[TEMPLATES[nsym].symref[i]] << 16) | (int32_t) TEMPLATES[nsym].len[i];
+ }
+ return 0;
+ }
+
+ // complex prefix codes (section 3.5): read layer 1 code lengths using the layer 0 code
+ total = 0;
+ for (i = l1counts[0] = hskip; i < L1SIZE && total < L1CODESUM; ++i) {
+ l1lengths[L1ZIGZAG[i]] = code = j40__prefix_code(st, L0MAXLEN, L0MAXLEN, L0TABLE);
+ ++l1counts[code];
+ if (code) total += L1CODESUM >> code;
+ }
+ J40__SHOULD(total == L1CODESUM || l1counts[0] == i - 1, "hufd"); // a complete code, or a single degenerate code
+
+ // construct the layer 1 tree
+ if (l1counts[0] == i - 1) { // special case: a single code repeats as many as possible
+ for (i = 0; !l1lengths[i]; ++i); // always terminates: exactly one length is non-zero here
+ for (code = 0; code < L1CODESUM; ++code) l1table[code] = i << 16; // symbol i, consuming zero bits
+ l1lengths[i] = 0;
+ } else {
+ l1starts[1] = 0;
+ for (i = 2; i <= L1MAXLEN; ++i) {
+ l1starts[i] = l1starts[i - 1] + (l1counts[i - 1] << (L1MAXLEN - (i - 1)));
+ }
+ for (i = 0; i < L1SIZE; ++i) {
+ int32_t n = l1lengths[i], *start = &l1starts[n];
+ if (n == 0) continue;
+ for (code = (int32_t) REV5[*start]; code < L1CODESUM; code += 1 << n) {
+ l1table[code] = (i << 16) | n;
+ }
+ *start += L1CODESUM >> n;
+ }
+ }
+
+ { // read layer 2 code lengths using the layer 1 code
+ int32_t prev = 8, rep, prev_rep = 0; // prev_rep: prev repeat count of 16(pos)/17(neg) so far
+ J40__TRY_CALLOC(int32_t, &l2lengths, (size_t) l2size);
+ for (i = total = 0; i < l2size && total < L2CODESUM; ) {
+ code = j40__prefix_code(st, L1MAXLEN, L1MAXLEN, l1table);
+ if (code < 16) {
+ l2lengths[i++] = code;
+ ++l2counts[code];
+ if (code) {
+ total += L2CODESUM >> code;
+ prev = code;
+ }
+ prev_rep = 0;
+ } else if (code == 16) { // repeat non-zero 3+u(2) times
+ // instead of keeping the current repeat count, we calculate a difference
+ // between the previous and current repeat count and directly apply the delta
+ if (prev_rep < 0) prev_rep = 0;
+ rep = (prev_rep > 0 ? 4 * prev_rep - 5 : 3) + j40__u(st, 2);
+ J40__SHOULD(i + (rep - prev_rep) <= l2size, "hufd");
+ total += (L2CODESUM * (rep - prev_rep)) >> prev;
+ l2counts[prev] += rep - prev_rep;
+ for (; prev_rep < rep; ++prev_rep) l2lengths[i++] = prev;
+ } else { // code == 17: repeat zero 3+u(3) times
+ if (prev_rep > 0) prev_rep = 0;
+ rep = (prev_rep < 0 ? 8 * prev_rep + 13 : -3) - j40__u(st, 3);
+ J40__SHOULD(i + (prev_rep - rep) <= l2size, "hufd");
+ for (; prev_rep > rep; --prev_rep) l2lengths[i++] = 0;
+ }
+ J40__RAISE_DELAYED();
+ }
+ J40__SHOULD(total == L2CODESUM, "hufd");
+ }
+
+ // determine the layer 2 lookup table size
+ l2starts[1] = 0;
+ *out_max_len = 1;
+ for (i = 2; i <= L2MAXLEN; ++i) {
+ l2starts[i] = l2starts[i - 1] + (l2counts[i - 1] << (L2MAXLEN - (i - 1)));
+ if (l2counts[i]) *out_max_len = i;
+ }
+ if (*out_max_len <= J40__MAX_TYPICAL_FAST_LEN) {
+ fast_len = *out_max_len;
+ J40__TRY_MALLOC(int32_t, &l2table, 1u << fast_len);
+ } else {
+ // if the distribution is flat enough, the typical fast_len limit may be slow because
+ // most LUT entries would be overflow refs and we would hit the slow path in most cases.
+ // we therefore calculate the table size for the typical fast_len first,
+ // then grow fast_len as long as the table stays within the specified growth factor.
+ int32_t size, size_limit, size_used;
+ fast_len = J40__MAX_TYPICAL_FAST_LEN;
+ size = 1 << fast_len;
+ for (i = fast_len + 1; i <= *out_max_len; ++i) size += l2counts[i];
+ size_used = size;
+ size_limit = size * J40__MAX_TABLE_GROWTH;
+ for (i = fast_len + 1; i <= *out_max_len; ++i) {
+ size = size + (1 << (i - 1)) - l2counts[i]; // the LUT doubles and the length-i overflow entries disappear
+ if (size <= size_limit) {
+ size_used = size;
+ fast_len = i;
+ }
+ }
+ l2overflows[fast_len + 1] = 1 << fast_len;
+ for (i = fast_len + 2; i <= *out_max_len; ++i) l2overflows[i] = l2overflows[i - 1] + l2counts[i - 1];
+ J40__TRY_MALLOC(int32_t, &l2table, (size_t) (size_used + 1));
+ // this entry should be unreachable, but should work as a stopper if there happens to be a logic bug
+ l2table[size_used] = 0;
+ }
+
+ // fill the layer 2 table
+ for (i = 0; i < l2size; ++i) {
+ int32_t n = l2lengths[i], *start = &l2starts[n];
+ if (n == 0) continue;
+ code = ((int32_t) REV5[*start & 31] << 10) |
+ ((int32_t) REV5[*start >> 5 & 31] << 5) |
+ ((int32_t) REV5[*start >> 10]);
+ if (n <= fast_len) {
+ for (; code < (1 << fast_len); code += 1 << n) l2table[code] = (i << 16) | n;
+ *start += L2CODESUM >> n;
+ } else {
+ // there should be exactly one code which is a LUT-covered prefix plus all zeroes;
+ // in the canonical Huffman tree that code would be in the first overflow entry
+ if ((code >> fast_len) == 0) l2table[code] = -l2overflows[n];
+ *start += L2CODESUM >> n;
+ l2table[l2overflows[n]++] = (i << 16) | (code >> fast_len << 4) | (n - fast_len);
+ }
+ }
+
+ *out_fast_len = fast_len;
+ *out_table = l2table;
+ j40__free(l2lengths);
+ return 0;
+
+J40__ON_ERROR:
+ j40__free(l2lengths);
+ j40__free(l2table);
+ return st->err;
+}
+
+J40_STATIC int32_t j40__match_overflow(j40__st *st, int32_t fast_len, const int32_t *table) {
+ int32_t entry, code, code_len;
+ st->bits.nbits -= fast_len;
+ st->bits.bits >>= fast_len;
+ do {
+ entry = *table++;
+ code = (entry >> 4) & 0xfff;
+ code_len = entry & 15;
+ } while (code != (int32_t) (st->bits.bits & ((1u << code_len) - 1)));
+ return entry;
+}
+
+J40_INLINE int32_t j40__prefix_code(j40__st *st, int32_t fast_len, int32_t max_len, const int32_t *table) {
+ int32_t entry, code_len;
+ // this is not `j40__refill(st, max_len)` because it should be able to handle codes
+ // at the very end of file or section and shorter than max_len bits; in that case
+ // the bit buffer will correctly contain a short code padded with zeroes.
+ if (st->bits.nbits < max_len && j40__always_refill(st, 0)) return 0;
+ entry = table[st->bits.bits & ((1u << fast_len) - 1)];
+ if (entry < 0 && fast_len < max_len) entry = j40__match_overflow(st, fast_len, table - entry);
+ code_len = entry & 15;
+ st->bits.nbits -= code_len;
+ st->bits.bits >>= code_len;
+ if (st->bits.nbits < 0) { // too many bits read from the bit buffer
+ st->bits.nbits = 0;
+ J40__ASSERT(st->bits.bits == 0);
+ J40__ERR("shrt");
+ }
+ return entry >> 16;
+}
+
+#endif // defined J40_IMPLEMENTATION
+
+////////////////////////////////////////////////////////////////////////////////
+// hybrid integer encoding
+
+// token < 2^split_exp is interpreted as is.
+// otherwise (token - 2^split_exp) is split into NNHHHLLL where config determines H/L lengths.
+// then MMMMM = u(NN + split_exp - H/L lengths) is read; the decoded value is 1HHHMMMMMLLL.
+typedef struct {
+ int8_t split_exp; // [0, 15]
+ int8_t msb_in_token, lsb_in_token; // msb_in_token + lsb_in_token <= split_exp
+ // TODO 2^30 bound is arbitrary, codestream may require larger values for some edge cases
+ int32_t max_token; // upper bound of token s.t. `j40__hybrid_int(token)` is < 2^30
+} j40__hybrid_int_config;
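+
+// worked example (hypothetical config): with split_exp = 4 and msb_in_token = lsb_in_token = 1,
+// token 21 = 16 + 0b101 splits into NN = 1, HHH = 0, LLL = 1, so midbits = (4 - 2) + 1 = 3;
+// if the subsequent u(3) reads 0b110 the decoded value is 0b1_0_110_1 = 45 (1, HHH, MMMMM, LLL).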
+
+J40__STATIC_RETURNS_ERR j40__read_hybrid_int_config(
+ j40__st *st, int32_t log_alpha_size, j40__hybrid_int_config *out
+);
+J40_INLINE int32_t j40__hybrid_int(j40__st *st, int32_t token, j40__hybrid_int_config config);
+
+#ifdef J40_IMPLEMENTATION
+
+J40__STATIC_RETURNS_ERR j40__read_hybrid_int_config(
+ j40__st *st, int32_t log_alpha_size, j40__hybrid_int_config *out
+) {
+ J40__ASSERT(log_alpha_size <= 15);
+ out->split_exp = (int8_t) j40__at_most(st, log_alpha_size);
+ if (out->split_exp != log_alpha_size) {
+ out->msb_in_token = (int8_t) j40__at_most(st, out->split_exp);
+ out->lsb_in_token = (int8_t) j40__at_most(st, out->split_exp - out->msb_in_token);
+ } else {
+ out->msb_in_token = out->lsb_in_token = 0;
+ }
+ out->max_token =
+ (1 << out->split_exp) + ((30 - out->split_exp) << (out->lsb_in_token + out->msb_in_token)) - 1;
+ return st->err;
+}
+
+J40_INLINE int32_t j40__hybrid_int(j40__st *st, int32_t token, j40__hybrid_int_config config) {
+ int32_t midbits, lo, mid, hi, top, bits_in_token, split = 1 << config.split_exp;
+ if (token < split) return token;
+ if (token > config.max_token) {
+ token = config.max_token;
+ J40__ERR("iovf");
+ }
+ bits_in_token = config.msb_in_token + config.lsb_in_token;
+ midbits = config.split_exp - bits_in_token + ((token - split) >> bits_in_token);
+ mid = j40__u(st, midbits);
+ top = 1 << config.msb_in_token;
+ lo = token & ((1 << config.lsb_in_token) - 1);
+ hi = (token >> config.lsb_in_token) & (top - 1);
+ return ((top | hi) << (midbits + config.lsb_in_token)) | ((mid << config.lsb_in_token) | lo);
+}
+
+#endif // defined J40_IMPLEMENTATION
+
+////////////////////////////////////////////////////////////////////////////////
+// rANS alias table
+
+enum {
+ J40__DIST_BITS = 12,
+ J40__ANS_INIT_STATE = 0x130000
+};
+
+// the alias table of size N is conceptually an array of N buckets with probability 1/N each,
+// where each bucket corresponds to at most two symbols distinguished by the cutoff point.
+// this is done by rearranging symbols so that every symbol boundary falls into distinct buckets.
+// so it allows *any* distribution of N symbols to be decoded in a constant time after the setup.
+// the table is not unique though, so the spec needs to specify the exact construction algorithm.
+//
+// input range: 0 cutoff bucket_size
+// +-----------|----------------------------+
+// output symbol: | i | symbol | <- bucket i
+// +-----------|----------------------------+
+// output range: 0 cutoff|offset offset+bucket_size
+typedef struct { int16_t cutoff, offset_or_next, symbol; } j40__alias_bucket;
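+
+// worked example (tiny, not a spec-valid alphabet size): D = {7, 5, 3, 1} over 4 buckets of
+// size 4 settles as
+//   bucket 0: cutoff 4 (decodes symbol 0 for every pos)
+//   bucket 1: cutoff 2, symbol 0, offset 3
+//   bucket 2: cutoff 3, symbol 0, offset 1
+//   bucket 3: cutoff 1, symbol 1, offset 1
+// so symbol 0 covers the outputs {0..3}, {5, 6} and {4} (= [0, 7)), and symbol 1 covers
+// {0, 1} and {2, 3, 4} (= [0, 5)), exactly matching D.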
+
+J40__STATIC_RETURNS_ERR j40__init_alias_map(
+ j40__st *st, const int16_t *D, int32_t log_alpha_size, j40__alias_bucket **out
+);
+J40_STATIC int32_t j40__ans_code(
+ j40__st *st, uint32_t *state, int32_t log_bucket_size,
+ const int16_t *D, const j40__alias_bucket *aliases
+);
+
+#ifdef J40_IMPLEMENTATION
+
+J40__STATIC_RETURNS_ERR j40__init_alias_map(
+ j40__st *st, const int16_t *D, int32_t log_alpha_size, j40__alias_bucket **out
+) {
+ int16_t log_bucket_size = (int16_t) (J40__DIST_BITS - log_alpha_size);
+ int16_t bucket_size = (int16_t) (1 << log_bucket_size);
+ int16_t table_size = (int16_t) (1 << log_alpha_size);
+ j40__alias_bucket *buckets = NULL;
+ // the underfull and overfull stacks are implicit linked lists; u/o resp. is the top index,
+ // buckets[u/o].next is the second-to-top index and so on. an index -1 indicates the bottom.
+ int16_t u = -1, o = -1, i, j;
+
+ J40__ASSERT(5 <= log_alpha_size && log_alpha_size <= 8);
+ J40__TRY_MALLOC(j40__alias_bucket, &buckets, 1u << log_alpha_size);
+
+ for (i = 0; i < table_size && !D[i]; ++i);
+ for (j = (int16_t) (i + 1); j < table_size && !D[j]; ++j);
+ if (i < table_size && j >= table_size) { // D[i] is the only non-zero probability
+ for (j = 0; j < table_size; ++j) {
+ buckets[j].symbol = i;
+ buckets[j].offset_or_next /*offset*/ = (int16_t) (j << log_bucket_size);
+ buckets[j].cutoff = 0;
+ }
+ *out = buckets;
+ return 0;
+ }
+
+ // each bucket is either settled (fields fully set) or unsettled (only `cutoff` is set).
+ // unsettled buckets are either in the underfull stack, in which case `cutoff < bucket_size`,
+ // or in the overfull stack, in which case `cutoff > bucket_size`. other fields are left
+ // unused, so `offset` in settled buckets is aliased to `next` in unsettled buckets.
+ // when rearranging results in buckets with `cutoff == bucket_size`,
+ // final fields are set and they become settled; eventually every bucket has to be settled.
+ for (i = 0; i < table_size; ++i) {
+ int16_t cutoff = D[i];
+ buckets[i].cutoff = cutoff;
+ if (cutoff > bucket_size) {
+ buckets[i].offset_or_next /*next*/ = o;
+ o = i;
+ } else if (cutoff < bucket_size) {
+ buckets[i].offset_or_next /*next*/ = u;
+ u = i;
+ } else { // immediately settled
+ buckets[i].symbol = i;
+ buckets[i].offset_or_next /*offset*/ = 0;
+ }
+ }
+
+ while (o >= 0) {
+ int16_t by, tmp;
+ J40__ASSERT(u >= 0);
+ by = (int16_t) (bucket_size - buckets[u].cutoff);
+ // move the input range [cutoff[o] - by, cutoff[o]] of the bucket o into
+ // the input range [cutoff[u], bucket_size] of the bucket u (which is settled after this)
+ tmp = buckets[u].offset_or_next /*next*/;
+ buckets[o].cutoff = (int16_t) (buckets[o].cutoff - by);
+ buckets[u].symbol = o;
+ buckets[u].offset_or_next /*offset*/ = (int16_t) (buckets[o].cutoff - buckets[u].cutoff);
+ u = tmp;
+ if (buckets[o].cutoff < bucket_size) { // o is now underfull, move to the underfull stack
+ tmp = buckets[o].offset_or_next /*next*/;
+ buckets[o].offset_or_next /*next*/ = u;
+ u = o;
+ o = tmp;
+ } else if (buckets[o].cutoff == bucket_size) { // o is also settled
+ tmp = buckets[o].offset_or_next /*next*/;
+ buckets[o].offset_or_next /*offset*/ = 0;
+ o = tmp;
+ }
+ }
+
+ J40__ASSERT(u < 0);
+ *out = buckets;
+ return 0;
+
+J40__ON_ERROR:
+ j40__free(buckets);
+ return st->err;
+}
+
+J40_STATIC int32_t j40__ans_code(
+ j40__st *st, uint32_t *state, int32_t log_bucket_size,
+ const int16_t *D, const j40__alias_bucket *aliases
+) {
+ if (*state == 0) {
+ *state = (uint32_t) j40__u(st, 16);
+ *state |= (uint32_t) j40__u(st, 16) << 16;
+ }
+ {
+ int32_t index = (int32_t) (*state & 0xfff);
+ int32_t i = index >> log_bucket_size;
+ int32_t pos = index & ((1 << log_bucket_size) - 1);
+ const j40__alias_bucket *bucket = &aliases[i];
+ int32_t symbol = pos < bucket->cutoff ? i : bucket->symbol;
+ int32_t offset = pos < bucket->cutoff ? 0 : bucket->offset_or_next /*offset*/;
+ J40__ASSERT(D[symbol] != 0);
+ *state = (uint32_t) D[symbol] * (*state >> 12) + (uint32_t) offset + (uint32_t) pos;
+ if (*state < (1u << 16)) *state = (*state << 16) | (uint32_t) j40__u(st, 16);
+ return symbol;
+ }
+}
+
+#endif // defined J40_IMPLEMENTATION
+
+////////////////////////////////////////////////////////////////////////////////
+// entropy code
+
+typedef union {
+ j40__hybrid_int_config config;
+ struct {
+ j40__hybrid_int_config config;
+ int32_t count;
+ } init; // only used during the initialization
+ struct {
+ j40__hybrid_int_config config;
+ int16_t *D;
+ j40__alias_bucket *aliases;
+ } ans; // if parent use_prefix_code is false
+ struct {
+ j40__hybrid_int_config config;
+ int16_t fast_len, max_len;
+ int32_t *table;
+ } prefix; // if parent use_prefix_code is true
+} j40__code_cluster;
+
+typedef struct {
+ int32_t num_dist;
+ int lz77_enabled, use_prefix_code;
+ int32_t min_symbol, min_length;
+ int32_t log_alpha_size; // only used when use_prefix_code is false
+ int32_t num_clusters; // in [1, min(num_dist, 256)]
+ uint8_t *cluster_map; // each in [0, num_clusters)
+ j40__hybrid_int_config lz_len_config;
+ j40__code_cluster *clusters;
+} j40__code_spec;
+
+typedef struct {
+ const j40__code_spec *spec;
+ // LZ77 states
+ int32_t num_to_copy, copy_pos, num_decoded;
+ int32_t window_cap, *window;
+ // ANS state (SPEC there is a single such state throughout the whole ANS stream)
+ uint32_t ans_state; // 0 if uninitialized
+} j40__code_st;
+
+// the dist_mult argument to j40__code should be clamped to this value in order to prevent overflow
+#define J40__MAX_DIST_MULT (1 << 21)
+
+J40__STATIC_RETURNS_ERR j40__cluster_map(
+ j40__st *st, int32_t num_dist, int32_t max_allowed, int32_t *num_clusters, uint8_t **outmap
+);
+J40__STATIC_RETURNS_ERR j40__ans_table(j40__st *st, int32_t log_alpha_size, int16_t **outtable);
+J40__STATIC_RETURNS_ERR j40__read_code_spec(j40__st *st, int32_t num_dist, j40__code_spec *spec);
+J40_STATIC void j40__init_code(j40__code_st *code, const j40__code_spec *spec);
+J40_STATIC int32_t j40__entropy_code_cluster(
+ j40__st *st, int use_prefix_code, int32_t log_alpha_size,
+ j40__code_cluster *cluster, uint32_t *ans_state
+);
+J40_STATIC int32_t j40__code(j40__st *st, int32_t ctx, int32_t dist_mult, j40__code_st *code);
+J40_STATIC void j40__free_code(j40__code_st *code);
+J40__STATIC_RETURNS_ERR j40__finish_and_free_code(j40__st *st, j40__code_st *code);
+J40_STATIC void j40__free_code_spec(j40__code_spec *spec);
+
+#ifdef J40_IMPLEMENTATION
+
+J40__STATIC_RETURNS_ERR j40__cluster_map(
+ j40__st *st, int32_t num_dist, int32_t max_allowed, int32_t *num_clusters, uint8_t **outmap
+) {
+ j40__code_spec codespec = J40__INIT; // cluster map might be recursively coded
+ j40__code_st code = J40__INIT;
+ uint32_t seen[8] = {0};
+ uint8_t *map = NULL;
+ int32_t i, j;
+
+ J40__ASSERT(num_dist > 0);
+ J40__ASSERT(max_allowed >= 1 && max_allowed <= 256);
+ if (max_allowed > num_dist) max_allowed = num_dist;
+
+ if (num_dist == 1) { // SPEC impossible in Brotli but possible (and unspecified) in JPEG XL
+ *num_clusters = 1;
+ J40__TRY_CALLOC(uint8_t, outmap, 1);
+ return 0;
+ }
+
+ *outmap = NULL;
+ J40__TRY_MALLOC(uint8_t, &map, (size_t) num_dist);
+
+ if (j40__u(st, 1)) { // is_simple (# clusters < 8)
+ int32_t nbits = j40__u(st, 2);
+ for (i = 0; i < num_dist; ++i) {
+ map[i] = (uint8_t) j40__u(st, nbits);
+ J40__SHOULD((int32_t) map[i] < max_allowed, "clst");
+ }
+ } else {
+ int use_mtf = j40__u(st, 1);
+
+ // TODO while num_dist is limited to 1, there is still a possibility of unbounded recursion
+ // when each code spec introduces its own LZ77 distribution; libjxl doesn't allow LZ77
+ // when the cluster map reads only two entries, which is technically incorrect but
+ // easier to adopt in the current structure of J40 as well.
+ J40__TRY(j40__read_code_spec(st, num_dist <= 2 ? -1 : 1, &codespec));
+ j40__init_code(&code, &codespec);
+ for (i = 0; i < num_dist; ++i) {
+ int32_t index = j40__code(st, 0, 0, &code); // SPEC context (always 0) is missing
+ J40__SHOULD(index < max_allowed, "clst");
+ map[i] = (uint8_t) index;
+ }
+ J40__TRY(j40__finish_and_free_code(st, &code));
+ j40__free_code_spec(&codespec);
+
+ if (use_mtf) {
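+ // inverse move-to-front: each decoded index j selects mtf[j], which is emitted and
+ // then moved to the front of the list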
+ uint8_t mtf[256], moved;
+ for (i = 0; i < 256; ++i) mtf[i] = (uint8_t) i;
+ for (i = 0; i < num_dist; ++i) {
+ j = map[i];
+ map[i] = moved = mtf[j];
+ for (; j > 0; --j) mtf[j] = mtf[j - 1];
+ mtf[0] = moved;
+ }
+ }
+ }
+
+ // verify cluster_map and determine the implicit num_clusters
+ for (i = 0; i < num_dist; ++i) seen[map[i] >> 5] |= (uint32_t) 1 << (map[i] & 31);
+ for (i = 0; i < 256 && (seen[i >> 5] >> (i & 31) & 1); ++i);
+ *num_clusters = i; // the first unset position or 256 if none
+ for (; i < 256 && !(seen[i >> 5] >> (i & 31) & 1); ++i);
+ J40__SHOULD(i == 256, "clst"); // no more set position beyond num_clusters
+ J40__ASSERT(*num_clusters > 0);
+
+ *outmap = map;
+ return 0;
+
+J40__ON_ERROR:
+ j40__free(map);
+ j40__free_code(&code);
+ j40__free_code_spec(&codespec);
+ return st->err;
+}
+
+J40__STATIC_RETURNS_ERR j40__ans_table(j40__st *st, int32_t log_alpha_size, int16_t **outtable) {
+ enum { DISTBITS = J40__DIST_BITS, DISTSUM = 1 << DISTBITS };
+ int32_t table_size = 1 << log_alpha_size;
+ int32_t i;
+ int16_t *D = NULL;
+
+ J40__TRY_MALLOC(int16_t, &D, (size_t) table_size);
+
+ switch (j40__u(st, 2)) { // two Bool() calls combined into u(2), so bits are swapped
+ case 1: { // true -> false case: one entry
+ int32_t v = j40__u8(st);
+ memset(D, 0, sizeof(int16_t) * (size_t) table_size);
+ J40__SHOULD(v < table_size, "ansd");
+ D[v] = DISTSUM;
+ break;
+ }
+
+ case 3: { // true -> true case: two entries
+ int32_t v1 = j40__u8(st);
+ int32_t v2 = j40__u8(st);
+ J40__SHOULD(v1 != v2 && v1 < table_size && v2 < table_size, "ansd");
+ memset(D, 0, sizeof(int16_t) * (size_t) table_size);
+ D[v1] = (int16_t) j40__u(st, DISTBITS);
+ D[v2] = (int16_t) (DISTSUM - D[v1]);
+ break;
+ }
+
+ case 2: { // false -> true case: evenly distribute to first `alpha_size` entries
+ int32_t alpha_size = j40__u8(st) + 1;
+ int16_t d = (int16_t) (DISTSUM / alpha_size);
+ int16_t bias_size = (int16_t) (DISTSUM % alpha_size);
+ J40__SHOULD(alpha_size <= table_size, "ansd");
+ for (i = 0; i < bias_size; ++i) D[i] = (int16_t) (d + 1);
+ for (; i < alpha_size; ++i) D[i] = d;
+ for (; i < table_size; ++i) D[i] = 0;
+ break;
+ }
+
+ case 0: { // false -> false case: bit counts + RLE
+ int32_t len, shift, alpha_size, omit_log, omit_pos, code, total, n;
+ int32_t ncodes, codes[259]; // exponents if >= 0, negated repeat count if < 0
+
+ len = j40__u(st, 1) ? j40__u(st, 1) ? j40__u(st, 1) ? 3 : 2 : 1 : 0;
+ shift = j40__u(st, len) + (1 << len) - 1;
+ J40__SHOULD(shift <= 13, "ansd");
+ alpha_size = j40__u8(st) + 3;
+
+ omit_log = -1; // there should be at least one non-RLE code
+ for (i = ncodes = 0; i < alpha_size; ) {
+ static const int32_t TABLE[] = { // reinterpretation of kLogCountLut
+ 0xa0003, -16, 0x70003, 0x30004, 0x60003, 0x80003, 0x90003, 0x50004,
+ 0xa0003, 0x40004, 0x70003, 0x10004, 0x60003, 0x80003, 0x90003, 0x20004,
+ 0x00011, 0xb0022, 0xc0003, 0xd0043, // overflow for ...0001
+ };
+ code = j40__prefix_code(st, 4, 7, TABLE);
+ if (code < 13) {
+ ++i;
+ codes[ncodes++] = code;
+ if (omit_log < code) omit_log = code;
+ } else {
+ i += code = j40__u8(st) + 4;
+ codes[ncodes++] = -code;
+ }
+ }
+ J40__SHOULD(i == alpha_size && omit_log >= 0, "ansd");
+
+ omit_pos = -1;
+ for (i = n = total = 0; i < ncodes && n < table_size; ++i) {
+ code = codes[i];
+ if (code < 0) { // repeat
+ int16_t prev = n > 0 ? D[n - 1] : 0;
+ J40__SHOULD(prev >= 0, "ansd"); // implicit D[n] followed by RLE
+ code = j40__min32(-code, table_size - n);
+ total += (int32_t) prev * code;
+ while (code-- > 0) D[n++] = prev;
+ } else if (code == omit_log) { // the first longest D[n] is "omitted" (implicit)
+ omit_pos = n;
+ omit_log = -1; // this branch runs at most once
+ D[n++] = -1;
+ } else if (code < 2) {
+ total += code;
+ D[n++] = (int16_t) code;
+ } else {
+ int32_t bitcount;
+ --code;
+ bitcount = j40__min32(j40__max32(0, shift - ((DISTBITS - code) >> 1)), code);
+ code = (1 << code) + (j40__u(st, bitcount) << (code - bitcount));
+ total += code;
+ D[n++] = (int16_t) code;
+ }
+ }
+ for (; n < table_size; ++n) D[n] = 0;
+ J40__SHOULD(omit_pos >= 0, "ansd");
+ J40__SHOULD(total <= DISTSUM, "ansd");
+ D[omit_pos] = (int16_t) (DISTSUM - total);
+ break;
+ }
+
+ default: J40__UNREACHABLE();
+ }
+
+ *outtable = D;
+ return 0;
+
+J40__ON_ERROR:
+ j40__free(D);
+ return st->err;
+}
+
+// num_dist can be negative (and its absolute value is used) when a further use of LZ77 is disallowed
+J40__STATIC_RETURNS_ERR j40__read_code_spec(j40__st *st, int32_t num_dist, j40__code_spec *spec) {
+ int32_t i;
+ int allow_lz77;
+
+ J40__ASSERT(num_dist != 0);
+ allow_lz77 = (num_dist > 0);
+ num_dist = j40__abs32(num_dist);
+
+ spec->cluster_map = NULL;
+ spec->clusters = NULL;
+
+ // LZ77Params
+ spec->lz77_enabled = j40__u(st, 1);
+ if (spec->lz77_enabled) {
+ J40__SHOULD(allow_lz77, "lz77");
+ spec->min_symbol = j40__u32(st, 224, 0, 512, 0, 4096, 0, 8, 15);
+ spec->min_length = j40__u32(st, 3, 0, 4, 0, 5, 2, 9, 8);
+ J40__TRY(j40__read_hybrid_int_config(st, 8, &spec->lz_len_config));
+ ++num_dist; // num_dist - 1 is a synthesized LZ77 distance distribution
+ } else {
+ spec->min_symbol = spec->min_length = 0x7fffffff;
+ }
+
+ // cluster_map: a mapping from context IDs to actual distributions
+ J40__TRY(j40__cluster_map(st, num_dist, 256, &spec->num_clusters, &spec->cluster_map));
+
+ J40__TRY_CALLOC(j40__code_cluster, &spec->clusters, (size_t) spec->num_clusters);
+
+ spec->use_prefix_code = j40__u(st, 1);
+ if (spec->use_prefix_code) {
+ for (i = 0; i < spec->num_clusters; ++i) { // SPEC the count is off by one
+ J40__TRY(j40__read_hybrid_int_config(st, 15, &spec->clusters[i].config));
+ }
+
+ for (i = 0; i < spec->num_clusters; ++i) {
+ if (j40__u(st, 1)) {
+ int32_t n = j40__u(st, 4);
+ spec->clusters[i].init.count = 1 + (1 << n) + j40__u(st, n);
+ J40__SHOULD(spec->clusters[i].init.count <= (1 << 15), "hufd");
+ } else {
+ spec->clusters[i].init.count = 1;
+ }
+ }
+
+ // SPEC this should happen after reading *all* count[i]
+ for (i = 0; i < spec->num_clusters; ++i) {
+ j40__code_cluster *c = &spec->clusters[i];
+ int32_t fast_len, max_len;
+ J40__TRY(j40__prefix_code_tree(st, c->init.count, &fast_len, &max_len, &c->prefix.table));
+ c->prefix.fast_len = (int16_t) fast_len;
+ c->prefix.max_len = (int16_t) max_len;
+ }
+ } else {
+ spec->log_alpha_size = 5 + j40__u(st, 2);
+ for (i = 0; i < spec->num_clusters; ++i) { // SPEC the count is off by one
+ J40__TRY(j40__read_hybrid_int_config(st, spec->log_alpha_size, &spec->clusters[i].config));
+ }
+
+ for (i = 0; i < spec->num_clusters; ++i) {
+ j40__code_cluster *c = &spec->clusters[i];
+ J40__TRY(j40__ans_table(st, spec->log_alpha_size, &c->ans.D));
+ J40__TRY(j40__init_alias_map(st, c->ans.D, spec->log_alpha_size, &c->ans.aliases));
+ }
+ }
+
+ spec->num_dist = num_dist;
+ return 0;
+
+J40__ON_ERROR:
+ j40__free_code_spec(spec);
+ return st->err;
+}
+
+J40_STATIC void j40__init_code(j40__code_st *code, const j40__code_spec *spec) {
+ code->spec = spec;
+ code->num_to_copy = code->copy_pos = code->num_decoded = 0;
+ code->window_cap = 0;
+ code->window = 0;
+ code->ans_state = 0;
+}
+
+J40_STATIC int32_t j40__entropy_code_cluster(
+ j40__st *st, int use_prefix_code, int32_t log_alpha_size,
+ j40__code_cluster *cluster, uint32_t *ans_state
+) {
+ if (use_prefix_code) {
+ return j40__prefix_code(st, cluster->prefix.fast_len, cluster->prefix.max_len, cluster->prefix.table);
+ } else {
+ return j40__ans_code(st, ans_state, J40__DIST_BITS - log_alpha_size, cluster->ans.D, cluster->ans.aliases);
+ }
+}
+
+// aka DecodeHybridVarLenUint
+J40_STATIC int32_t j40__code(j40__st *st, int32_t ctx, int32_t dist_mult, j40__code_st *code) {
+ static const int32_t MASK = 0xfffff;
+
+ const j40__code_spec *spec = code->spec;
+ int32_t token, distance, log_alpha_size;
+ j40__code_cluster *cluster;
+ int use_prefix_code;
+
+ if (code->num_to_copy > 0) {
+ J40__ASSERT(code->window); // because this can't be the initial token and lz77_enabled is true
+ --code->num_to_copy;
+ return code->window[code->num_decoded++ & MASK] = code->window[code->copy_pos++ & MASK];
+ }
+
+ J40__ASSERT(ctx < spec->num_dist);
+ use_prefix_code = spec->use_prefix_code;
+ log_alpha_size = spec->log_alpha_size;
+ cluster = &spec->clusters[spec->cluster_map[ctx]];
+ token = j40__entropy_code_cluster(st, use_prefix_code, log_alpha_size, cluster, &code->ans_state);
+ if (token >= spec->min_symbol) { // this is large enough if lz77_enabled is false
+ j40__code_cluster *lz_cluster = &spec->clusters[spec->cluster_map[spec->num_dist - 1]];
+ int32_t num_to_copy = j40__hybrid_int(st, token - spec->min_symbol, spec->lz_len_config) + spec->min_length;
+ token = j40__entropy_code_cluster(st, use_prefix_code, log_alpha_size, lz_cluster, &code->ans_state);
+ distance = j40__hybrid_int(st, token, lz_cluster->config);
+ if (st->err) return 0;
+ if (!dist_mult) {
+ ++distance;
+ } else if (distance >= 120) {
+ distance -= 119;
+ } else {
+ static const uint8_t SPECIAL_DISTANCES[120] = { // {a,b} encoded as (a+7)*16+b
+ 0x71, 0x80, 0x81, 0x61, 0x72, 0x90, 0x82, 0x62, 0x91, 0x51, 0x92, 0x52,
+ 0x73, 0xa0, 0x83, 0x63, 0xa1, 0x41, 0x93, 0x53, 0xa2, 0x42, 0x74, 0xb0,
+ 0x84, 0x64, 0xb1, 0x31, 0xa3, 0x43, 0x94, 0x54, 0xb2, 0x32, 0x75, 0xa4,
+ 0x44, 0xb3, 0x33, 0xc0, 0x85, 0x65, 0xc1, 0x21, 0x95, 0x55, 0xc2, 0x22,
+ 0xb4, 0x34, 0xa5, 0x45, 0xc3, 0x23, 0x76, 0xd0, 0x86, 0x66, 0xd1, 0x11,
+ 0x96, 0x56, 0xd2, 0x12, 0xb5, 0x35, 0xc4, 0x24, 0xa6, 0x46, 0xd3, 0x13,
+ 0x77, 0xe0, 0x87, 0x67, 0xc5, 0x25, 0xe1, 0x01, 0xb6, 0x36, 0xd4, 0x14,
+ 0x97, 0x57, 0xe2, 0x02, 0xa7, 0x47, 0xe3, 0x03, 0xc6, 0x26, 0xd5, 0x15,
+ 0xf0, 0xb7, 0x37, 0xe4, 0x04, 0xf1, 0xf2, 0xd6, 0x16, 0xf3, 0xc7, 0x27,
+ 0xe5, 0x05, 0xf4, 0xd7, 0x17, 0xe6, 0x06, 0xf5, 0xe7, 0x07, 0xf6, 0xf7,
+ };
+ int32_t special = (int32_t) SPECIAL_DISTANCES[distance];
+ J40__ASSERT(dist_mult <= J40__MAX_DIST_MULT);
+ // TODO spec bug: distance can be as low as -6 when dist_mult = 1 and distance =
+ // dist_mult * 1 - 7; libjxl clamps it to the minimum of 1, so we do the same here
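+ // e.g. SPECIAL_DISTANCES[0] = 0x71 encodes {a,b} = {0,1}, so the computed distance is just dist_mult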
+ distance = j40__max32(1, ((special >> 4) - 7) + dist_mult * (special & 7));
+ }
+ distance = j40__min32(j40__min32(distance, code->num_decoded), 1 << 20);
+ code->copy_pos = code->num_decoded - distance;
+ if (J40_UNLIKELY(distance == 0)) {
+ // TODO spec bug: this is possible when num_decoded == 0 (or a non-positive special
+ // distance, handled above) and libjxl acts as if `window[i]` is initially filled with 0
+ J40__ASSERT(code->num_decoded == 0 && !code->window);
+ code->window = (int32_t*) j40__calloc(1u << 20, sizeof(int32_t));
+ if (!code->window) return J40__ERR("!mem"), 0;
+ }
+ J40__ASSERT(num_to_copy > 0);
+ code->num_to_copy = num_to_copy - 1;
+ return code->window[code->num_decoded++ & MASK] = code->window[code->copy_pos++ & MASK];
+ }
+
+ token = j40__hybrid_int(st, token, cluster->config);
+ if (st->err) return 0;
+ if (spec->lz77_enabled) {
+ if (!code->window) { // XXX should be dynamically resized
+ code->window = (int32_t*) j40__malloc(sizeof(int32_t) << 20);
+ if (!code->window) return J40__ERR("!mem"), 0;
+ }
+ code->window[code->num_decoded++ & MASK] = token;
+ }
+ return token;
+}
+
+J40_STATIC void j40__free_code(j40__code_st *code) {
+ j40__free(code->window);
+ code->window = NULL;
+ code->window_cap = 0;
+}
+
+J40__STATIC_RETURNS_ERR j40__finish_and_free_code(j40__st *st, j40__code_st *code) {
+ if (!code->spec->use_prefix_code) {
+ if (code->ans_state) {
+ J40__SHOULD(code->ans_state == J40__ANS_INIT_STATE, "ans?");
+ } else { // edge case: if no symbols have been read the state has to be read at this point
+ J40__SHOULD(j40__u(st, 16) == (J40__ANS_INIT_STATE & 0xffff), "ans?");
+ J40__SHOULD(j40__u(st, 16) == (J40__ANS_INIT_STATE >> 16), "ans?");
+ }
+ }
+ // it's explicitly allowed that num_to_copy can be > 0 at the end of stream
+J40__ON_ERROR:
+ j40__free_code(code);
+ return st->err;
+}
+
+J40_STATIC void j40__free_code_spec(j40__code_spec *spec) {
+ int32_t i;
+ if (spec->clusters) {
+ for (i = 0; i < spec->num_clusters; ++i) {
+ if (spec->use_prefix_code) {
+ j40__free(spec->clusters[i].prefix.table);
+ } else {
+ j40__free(spec->clusters[i].ans.D);
+ j40__free(spec->clusters[i].ans.aliases);
+ }
+ }
+ j40__free(spec->clusters);
+ spec->clusters = NULL;
+ }
+ j40__free(spec->cluster_map);
+ spec->cluster_map = NULL;
+}
+
+#endif // defined J40_IMPLEMENTATION
+
+////////////////////////////////////////////////////////////////////////////////
+// image header & metadata
+
+enum {
+ J40__CHROMA_WHITE = 0, J40__CHROMA_RED = 1,
+ J40__CHROMA_GREEN = 2, J40__CHROMA_BLUE = 3,
+};
+
+enum j40__ec_type {
+ J40__EC_ALPHA = 0, J40__EC_DEPTH = 1, J40__EC_SPOT_COLOUR = 2,
+ J40__EC_SELECTION_MASK = 3, J40__EC_BLACK = 4, J40__EC_CFA = 5,
+ J40__EC_THERMAL = 6, J40__EC_NON_OPTIONAL = 15, J40__EC_OPTIONAL = 16,
+};
+
+typedef struct {
+ enum j40__ec_type type;
+ int32_t bpp, exp_bits, dim_shift, name_len;
+ char *name;
+ union {
+ int alpha_associated;
+ struct { float red, green, blue, solidity; } spot;
+ int32_t cfa_channel;
+ } data;
+} j40__ec_info;
+
+enum j40__orientation {
+ J40__ORIENT_TL = 1, J40__ORIENT_TR = 2, J40__ORIENT_BR = 3, J40__ORIENT_BL = 4,
+ J40__ORIENT_LT = 5, J40__ORIENT_RT = 6, J40__ORIENT_RB = 7, J40__ORIENT_LB = 8,
+};
+
+enum j40__cspace {
+ J40__CS_CHROMA = 'c', J40__CS_GREY = 'g', J40__CS_XYB = 'x',
+};
+
+enum { // for `j40__image_st.gamma_or_tf`
+ J40__TF_709 = -1, J40__TF_UNKNOWN = -2, J40__TF_LINEAR = -8, J40__TF_SRGB = -13,
+ J40__TF_PQ = -16, J40__TF_DCI = -17, J40__TF_HLG = -18,
+ J40__GAMMA_MAX = 10000000,
+};
+
+enum j40__render_intent {
+ J40__INTENT_PERC = 0, J40__INTENT_REL = 1, J40__INTENT_SAT = 2, J40__INTENT_ABS = 3
+};
+
+typedef struct j40__image_st {
+ int32_t width, height;
+ enum j40__orientation orientation;
+ int32_t intr_width, intr_height; // 0 if not specified
+ int bpp, exp_bits;
+
+ int32_t anim_tps_num, anim_tps_denom; // num=denom=0 if not animated
+ int64_t anim_nloops; // 0 if infinity
+ int anim_have_timecodes;
+
+ char *icc;
+ size_t iccsize;
+ enum j40__cspace cspace;
+ float cpoints[4 /*J40__CHROMA_xxx*/][2 /*x=0, y=1*/]; // only for J40__CS_CHROMA
+ int32_t gamma_or_tf; // gamma if > 0, transfer function if <= 0
+ enum j40__render_intent render_intent;
+ float intensity_target, min_nits; // 0 < min_nits <= intensity_target
+ float linear_below; // absolute (nits) if >= 0; a negated ratio of max display brightness if [-1,0]
+
+ int modular_16bit_buffers;
+ int num_extra_channels;
+ j40__ec_info *ec_info;
+ int xyb_encoded;
+ float opsin_inv_mat[3][3], opsin_bias[3], quant_bias[3 /*xyb*/], quant_bias_num;
+ int want_icc;
+} j40__image_st;
+
+J40__STATIC_RETURNS_ERR j40__signature(j40__st *st);
+J40__STATIC_RETURNS_ERR j40__size_header(j40__st *st, int32_t *outw, int32_t *outh);
+J40__STATIC_RETURNS_ERR j40__bit_depth(j40__st *st, int32_t *outbpp, int32_t *outexpbits);
+J40__STATIC_RETURNS_ERR j40__name(j40__st *st, int32_t *outlen, char **outbuf);
+J40__STATIC_RETURNS_ERR j40__customxy(j40__st *st, float xy[2]);
+J40__STATIC_RETURNS_ERR j40__extensions(j40__st *st);
+J40__STATIC_RETURNS_ERR j40__image_metadata(j40__st *st);
+J40_STATIC void j40__free_image_state(j40__image_st *im);
+
+#ifdef J40_IMPLEMENTATION
+
+J40__STATIC_RETURNS_ERR j40__signature(j40__st *st) {
+ int32_t sig = j40__u(st, 16);
+ J40__SHOULD(sig == 0x0aff, "!jxl"); // FF 0A in the byte sequence
+J40__ON_ERROR:
+ return st->err;
+}
+
+J40__STATIC_RETURNS_ERR j40__size_header(j40__st *st, int32_t *outw, int32_t *outh) {
+ int32_t div8 = j40__u(st, 1);
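+ // (j40__u32 reads a 2-bit selector choosing one of the four (offset, nbits)
+ // pairs that follow; here the height is coded as one of 1 + u(9), 1 + u(13),
+ // 1 + u(18) or 1 + u(30))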
+ *outh = div8 ? (j40__u(st, 5) + 1) * 8 : j40__u32(st, 1, 9, 1, 13, 1, 18, 1, 30);
+ switch (j40__u(st, 3)) { // ratio
+ case 0: *outw = div8 ? (j40__u(st, 5) + 1) * 8 : j40__u32(st, 1, 9, 1, 13, 1, 18, 1, 30); break;
+ case 1: *outw = *outh; break;
+ case 2: *outw = (int32_t) ((uint64_t) *outh * 6 / 5); break;
+ case 3: *outw = (int32_t) ((uint64_t) *outh * 4 / 3); break;
+ case 4: *outw = (int32_t) ((uint64_t) *outh * 3 / 2); break;
+ case 5: *outw = (int32_t) ((uint64_t) *outh * 16 / 9); break;
+ case 6: *outw = (int32_t) ((uint64_t) *outh * 5 / 4); break;
+ case 7:
+ // height is at most 2^30, so width is at most 2^31 which requires uint32_t.
+ // but in order to avoid bugs we rarely use unsigned integers, so we just reject it.
+ // this should not be a problem as the Main profile Level 10 (the largest profile)
+ // already limits height to at most 2^30.
+ J40__SHOULD(*outh < 0x40000000, "bigg");
+ *outw = *outh * 2;
+ break;
+ default: J40__UNREACHABLE();
+ }
+J40__ON_ERROR:
+ return st->err;
+}
+
+J40__STATIC_RETURNS_ERR j40__bit_depth(j40__st *st, int32_t *outbpp, int32_t *outexpbits) {
+ if (j40__u(st, 1)) { // float_sample
+ int32_t mantissa_bits;
+ *outbpp = j40__u32(st, 32, 0, 16, 0, 24, 0, 1, 6);
+ *outexpbits = j40__u(st, 4) + 1;
+ mantissa_bits = *outbpp - *outexpbits - 1;
+ J40__SHOULD(2 <= mantissa_bits && mantissa_bits <= 23, "bpp?");
+ J40__SHOULD(2 <= *outexpbits && *outexpbits <= 8, "exp?"); // implies bpp in [5,32] when combined
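+ // (e.g. bpp = 32 with exp_bits = 8 gives mantissa_bits = 23, i.e. IEEE binary32)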
+ } else {
+ *outbpp = j40__u32(st, 8, 0, 10, 0, 12, 0, 1, 6);
+ *outexpbits = 0;
+ J40__SHOULD(1 <= *outbpp && *outbpp <= 31, "bpp?");
+ }
+J40__ON_ERROR:
+ return st->err;
+}
+
+J40__STATIC_RETURNS_ERR j40__name(j40__st *st, int32_t *outlen, char **outbuf) {
+ char *buf = NULL;
+ int32_t i, c, cc, len;
+ len = j40__u32(st, 0, 0, 0, 4, 16, 5, 48, 10);
+ if (len > 0) {
+ J40__TRY_MALLOC(char, &buf, (size_t) len + 1);
+ for (i = 0; i < len; ++i) {
+ buf[i] = (char) j40__u(st, 8);
+ J40__RAISE_DELAYED();
+ }
+ buf[len] = 0;
+ for (i = 0; i < len; ) { // UTF-8 verification
+ c = (uint8_t) buf[i++];
+ cc = (uint8_t) buf[i]; // always accessible thanks to null-termination
+ c = c < 0x80 ? 0 : c < 0xc2 ? -1 : c < 0xe0 ? 1 :
+ c < 0xf0 ? (c == 0xe0 ? cc >= 0xa0 : c == 0xed ? cc < 0xa0 : 1) ? 2 : -1 :
+ c < 0xf5 ? (c == 0xf0 ? cc >= 0x90 : c == 0xf4 ? cc < 0x90 : 1) ? 3 : -1 : -1;
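+ // (c is now the number of continuation bytes, or -1 for an invalid lead byte;
+ // the E0/ED and F0/F4 cases reject overlong encodings, surrogates and code
+ // points above U+10FFFF by constraining the first continuation byte)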
+ J40__SHOULD(c >= 0 && i + c <= len, "name");
+ while (c-- > 0) J40__SHOULD((buf[i++] & 0xc0) == 0x80, "name");
+ }
+ *outbuf = buf;
+ } else {
+ J40__RAISE_DELAYED();
+ *outbuf = NULL;
+ }
+ *outlen = len;
+ return 0;
+J40__ON_ERROR:
+ j40__free(buf);
+ return st->err;
+}
+
+J40__STATIC_RETURNS_ERR j40__customxy(j40__st *st, float xy[2]) {
+ xy[0] = (float)j40__unpack_signed(j40__u32(st, 0, 19, 0x80000, 19, 0x100000, 20, 0x200000, 21)) / 100000.0f;
+ xy[1] = (float)j40__unpack_signed(j40__u32(st, 0, 19, 0x80000, 19, 0x100000, 20, 0x200000, 21)) / 100000.0f;
+ return st->err;
+}
+
+J40__STATIC_RETURNS_ERR j40__extensions(j40__st *st) {
+ uint64_t extensions = j40__u64(st);
+ int64_t nbits = 0;
+ int32_t i;
+ for (i = 0; i < 64; ++i) {
+ if (extensions >> i & 1) {
+ uint64_t n = j40__u64(st);
+ J40__RAISE_DELAYED();
+ J40__SHOULD(n <= (uint64_t) INT64_MAX && j40__add64(nbits, (int64_t) n, &nbits), "flen");
+ }
+ }
+ return j40__skip(st, nbits);
+J40__ON_ERROR:
+ return st->err;
+}
+
+J40__STATIC_RETURNS_ERR j40__image_metadata(j40__st *st) {
+ static const float SRGB_CHROMA[4][2] = { // default chromaticity (kD65, kSRGB)
+ {0.3127f, 0.3290f}, {0.639998686f, 0.330010138f},
+ {0.300003784f, 0.600003357f}, {0.150002046f, 0.059997204f},
+ };
+ static const float OPSIN_INV_MAT[3][3] = { // default opsin inverse matrix
+ {11.031566901960783f, -9.866943921568629f, -0.16462299647058826f},
+ {-3.254147380392157f, 4.418770392156863f, -0.16462299647058826f},
+ {-3.6588512862745097f, 2.7129230470588235f, 1.9459282392156863f},
+ };
+
+ j40__image_st *im = st->image;
+ int32_t i, j;
+
+ im->orientation = J40__ORIENT_TL;
+ im->intr_width = 0;
+ im->intr_height = 0;
+ im->bpp = 8;
+ im->exp_bits = 0;
+ im->anim_tps_num = 0;
+ im->anim_tps_denom = 0;
+ im->anim_nloops = 0;
+ im->anim_have_timecodes = 0;
+ im->icc = NULL;
+ im->iccsize = 0;
+ im->cspace = J40__CS_CHROMA;
+ memcpy(im->cpoints, SRGB_CHROMA, sizeof SRGB_CHROMA);
+ im->gamma_or_tf = J40__TF_SRGB;
+ im->render_intent = J40__INTENT_REL;
+ im->intensity_target = 255.0f;
+ im->min_nits = 0.0f;
+ im->linear_below = 0.0f;
+ im->modular_16bit_buffers = 1;
+ im->xyb_encoded = 1;
+ memcpy(im->opsin_inv_mat, OPSIN_INV_MAT, sizeof OPSIN_INV_MAT);
+ im->opsin_bias[0] = im->opsin_bias[1] = im->opsin_bias[2] = -0.0037930732552754493f;
+ im->quant_bias[0] = 1.0f - 0.05465007330715401f;
+ im->quant_bias[1] = 1.0f - 0.07005449891748593f;
+ im->quant_bias[2] = 1.0f - 0.049935103337343655f;
+ im->quant_bias_num = 0.145f;
+
+ J40__TRY(j40__size_header(st, &im->width, &im->height));
+ J40__SHOULD(im->width <= st->limits->width && im->height <= st->limits->height, "slim");
+ J40__SHOULD((int64_t) im->width * im->height <= st->limits->pixels, "slim");
+
+ if (!j40__u(st, 1)) { // !all_default
+ int32_t extra_fields = j40__u(st, 1);
+ if (extra_fields) {
+ im->orientation = (enum j40__orientation) (j40__u(st, 3) + 1);
+ if (j40__u(st, 1)) { // have_intr_size
+ J40__TRY(j40__size_header(st, &im->intr_width, &im->intr_height));
+ }
+ if (j40__u(st, 1)) { // have_preview
+ J40__RAISE("TODO: preview");
+ }
+ if (j40__u(st, 1)) { // have_animation
+ im->anim_tps_num = j40__u32(st, 100, 0, 1000, 0, 1, 10, 1, 30);
+ im->anim_tps_denom = j40__u32(st, 1, 0, 1001, 0, 1, 8, 1, 10);
+ im->anim_nloops = j40__64u32(st, 0, 0, 0, 3, 0, 16, 0, 32);
+ im->anim_have_timecodes = j40__u(st, 1);
+ }
+ }
+ J40__TRY(j40__bit_depth(st, &im->bpp, &im->exp_bits));
+ J40__SHOULD(im->bpp <= st->limits->bpp, "fbpp");
+ im->modular_16bit_buffers = j40__u(st, 1);
+ J40__SHOULD(im->modular_16bit_buffers || !st->limits->needs_modular_16bit_buffers, "fm32");
+ im->num_extra_channels = j40__u32(st, 0, 0, 1, 0, 2, 4, 1, 12);
+ J40__SHOULD(im->num_extra_channels <= st->limits->num_extra_channels, "elim");
+ J40__TRY_CALLOC(j40__ec_info, &im->ec_info, (size_t) im->num_extra_channels);
+ for (i = 0; i < im->num_extra_channels; ++i) im->ec_info[i].name = NULL;
+ for (i = 0; i < im->num_extra_channels; ++i) {
+ j40__ec_info *ec = &im->ec_info[i];
+ if (j40__u(st, 1)) { // d_alpha
+ ec->type = J40__EC_ALPHA;
+ ec->bpp = 8;
+ ec->exp_bits = ec->dim_shift = ec->name_len = 0;
+ ec->data.alpha_associated = 0;
+ } else {
+ ec->type = (enum j40__ec_type) j40__enum(st);
+ J40__TRY(j40__bit_depth(st, &ec->bpp, &ec->exp_bits));
+ ec->dim_shift = j40__u32(st, 0, 0, 3, 0, 4, 0, 1, 3);
+ J40__TRY(j40__name(st, &ec->name_len, &ec->name));
+ switch (ec->type) {
+ case J40__EC_ALPHA:
+ ec->data.alpha_associated = j40__u(st, 1);
+ break;
+ case J40__EC_SPOT_COLOUR:
+ ec->data.spot.red = j40__f16(st);
+ ec->data.spot.green = j40__f16(st);
+ ec->data.spot.blue = j40__f16(st);
+ ec->data.spot.solidity = j40__f16(st);
+ break;
+ case J40__EC_CFA:
+ ec->data.cfa_channel = j40__u32(st, 1, 0, 0, 2, 3, 4, 19, 8);
+ break;
+ case J40__EC_BLACK:
+ J40__SHOULD(st->limits->ec_black_allowed, "fblk");
+ break;
+ case J40__EC_DEPTH: case J40__EC_SELECTION_MASK:
+ case J40__EC_THERMAL: case J40__EC_NON_OPTIONAL: case J40__EC_OPTIONAL:
+ break;
+ default: J40__RAISE("ect?");
+ }
+ }
+ J40__SHOULD(ec->bpp <= st->limits->bpp, "fbpp");
+ J40__RAISE_DELAYED();
+ }
+ im->xyb_encoded = j40__u(st, 1);
+ if (!j40__u(st, 1)) { // ColourEncoding.all_default
+ enum cspace { CS_RGB = 0, CS_GREY = 1, CS_XYB = 2, CS_UNKNOWN = 3 } cspace;
+ enum { WP_D65 = 1, WP_CUSTOM = 2, WP_E = 10, WP_DCI = 11 };
+ enum { PR_SRGB = 1, PR_CUSTOM = 2, PR_2100 = 9, PR_P3 = 11 };
+ im->want_icc = j40__u(st, 1);
+ cspace = (enum cspace) j40__enum(st);
+ switch (cspace) {
+ case CS_RGB: case CS_UNKNOWN: im->cspace = J40__CS_CHROMA; break;
+ case CS_GREY: im->cspace = J40__CS_GREY; break;
+ case CS_XYB: im->cspace = J40__CS_XYB; break;
+ default: J40__RAISE("csp?");
+ }
+ // TODO: should verify cspace grayness with ICC grayness
+ if (!im->want_icc) {
+ if (cspace != CS_XYB) {
+ static const float E[2] = {1/3.f, 1/3.f}, DCI[2] = {0.314f, 0.351f},
+ BT2100[3][2] = {{0.708f, 0.292f}, {0.170f, 0.797f}, {0.131f, 0.046f}},
+ P3[3][2] = {{0.680f, 0.320f}, {0.265f, 0.690f}, {0.150f, 0.060f}};
+ switch (j40__enum(st)) {
+ case WP_D65: break; // default
+ case WP_CUSTOM: J40__TRY(j40__customxy(st, im->cpoints[J40__CHROMA_WHITE])); break;
+ case WP_E: memcpy(im->cpoints + J40__CHROMA_WHITE, E, sizeof E); break;
+ case WP_DCI: memcpy(im->cpoints + J40__CHROMA_WHITE, DCI, sizeof DCI); break;
+ default: J40__RAISE("wpt?");
+ }
+ if (cspace != CS_GREY) {
+ switch (j40__enum(st)) {
+ case PR_SRGB: break; // default
+ case PR_CUSTOM:
+ J40__TRY(j40__customxy(st, im->cpoints[J40__CHROMA_RED]));
+ J40__TRY(j40__customxy(st, im->cpoints[J40__CHROMA_GREEN]));
+ J40__TRY(j40__customxy(st, im->cpoints[J40__CHROMA_BLUE]));
+ break;
+ case PR_2100: memcpy(im->cpoints + J40__CHROMA_RED, BT2100, sizeof BT2100); break;
+ case PR_P3: memcpy(im->cpoints + J40__CHROMA_RED, P3, sizeof P3); break;
+ default: J40__RAISE("prm?");
+ }
+ }
+ }
+ if (j40__u(st, 1)) { // have_gamma
+ im->gamma_or_tf = j40__u(st, 24);
+ J40__SHOULD(im->gamma_or_tf > 0 && im->gamma_or_tf <= J40__GAMMA_MAX, "gama");
+ if (cspace == CS_XYB) J40__SHOULD(im->gamma_or_tf == 3333333, "gama");
+ } else {
+ im->gamma_or_tf = -j40__enum(st);
+ J40__SHOULD((
+ 1 << -J40__TF_709 | 1 << -J40__TF_UNKNOWN | 1 << -J40__TF_LINEAR |
+ 1 << -J40__TF_SRGB | 1 << -J40__TF_PQ | 1 << -J40__TF_DCI |
+ 1 << -J40__TF_HLG
+ ) >> -im->gamma_or_tf & 1, "tfn?");
+ }
+ im->render_intent = (enum j40__render_intent) j40__enum(st);
+ J40__SHOULD((
+ 1 << J40__INTENT_PERC | 1 << J40__INTENT_REL |
+ 1 << J40__INTENT_SAT | 1 << J40__INTENT_ABS
+ ) >> im->render_intent & 1, "itt?");
+ }
+ }
+ if (extra_fields) {
+ if (!j40__u(st, 1)) { // ToneMapping.all_default
+ int relative_to_max_display;
+ im->intensity_target = j40__f16(st);
+ J40__SHOULD(im->intensity_target > 0, "tone");
+ im->min_nits = j40__f16(st);
+ J40__SHOULD(0 < im->min_nits && im->min_nits <= im->intensity_target, "tone");
+ relative_to_max_display = j40__u(st, 1);
+ im->linear_below = j40__f16(st);
+ if (relative_to_max_display) {
+ J40__SHOULD(0 <= im->linear_below && im->linear_below <= 1, "tone");
+ im->linear_below *= -1.0f;
+ } else {
+ J40__SHOULD(0 <= im->linear_below, "tone");
+ }
+ }
+ }
+ J40__TRY(j40__extensions(st));
+ }
+ if (!j40__u(st, 1)) { // !default_m
+ int32_t cw_mask;
+ if (im->xyb_encoded) {
+ for (i = 0; i < 3; ++i) for (j = 0; j < 3; ++j) im->opsin_inv_mat[i][j] = j40__f16(st);
+ for (i = 0; i < 3; ++i) im->opsin_bias[i] = j40__f16(st);
+ for (i = 0; i < 3; ++i) im->quant_bias[i] = j40__f16(st);
+ im->quant_bias_num = j40__f16(st);
+ }
+ cw_mask = j40__u(st, 3);
+ if (cw_mask & 1) {
+ J40__RAISE("TODO: up2_weight");
+ }
+ if (cw_mask & 2) {
+ J40__RAISE("TODO: up4_weight");
+ }
+ if (cw_mask & 4) {
+ J40__RAISE("TODO: up8_weight");
+ }
+ }
+ J40__RAISE_DELAYED();
+ return 0;
+
+J40__ON_ERROR:
+ return st->err;
+}
+
+J40_STATIC void j40__free_image_state(j40__image_st *im) {
+ int32_t i;
+ if (im->ec_info) {
+ for (i = 0; i < im->num_extra_channels; ++i) j40__free(im->ec_info[i].name);
+ j40__free(im->ec_info);
+ im->ec_info = NULL;
+ }
+ j40__free(im->icc);
+ im->icc = NULL;
+ im->num_extra_channels = 0;
+}
+
+#endif // defined J40_IMPLEMENTATION
+
+////////////////////////////////////////////////////////////////////////////////
+// ICC
+
+J40_STATIC uint64_t j40__icc_varint(j40__st *st, uint64_t *index, uint64_t size, j40__code_st *code);
+J40__STATIC_RETURNS_ERR j40__icc(j40__st *st);
+
+#ifdef J40_IMPLEMENTATION
+
+J40_STATIC uint64_t j40__icc_varint(j40__st *st, uint64_t *index, uint64_t size, j40__code_st *code) {
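+ // the ICC payload uses a little-endian base-128 varint: 7 bits per byte, with
+ // the high bit set on continuation bytes; e.g. the bytes 85 02 decode to
+ // 5 + (2 << 7) = 261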
+ uint64_t value = 0;
+ int32_t shift = 0;
+ do {
+ int32_t b;
+ if ((*index)++ >= size) return J40__ERR("icc?"), 0u;
+ b = j40__code(st, 0, 0, code);
+ value |= (uint64_t) (b & 0x7f) << shift;
+ if (b < 128) return value;
+ shift += 7;
+ } while (shift < 63);
+ return J40__ERR("vint"), 0u;
+}
+
+J40__STATIC_RETURNS_ERR j40__icc(j40__st *st) {
+ uint64_t enc_size, output_size, index;
+ j40__code_spec codespec = J40__INIT;
+ j40__code_st code = J40__INIT;
+ int32_t byte = 0, prev = 0, pprev = 0, ctx;
+
+ enc_size = j40__u64(st);
+ J40__TRY(j40__read_code_spec(st, 41, &codespec));
+ j40__init_code(&code, &codespec);
+
+ index = 0;
+ output_size = j40__icc_varint(st, &index, enc_size, &code);
+ J40__SHOULD(output_size <= st->limits->icc_output_size, "plim");
+
+ // SPEC it is still possible that enc_size is too large while output_size is within the limit.
+ // the current spec allows for an arbitrarily large enc_size for the fixed output_size, because
+ // some commands can generate zero output bytes, but as libjxl never emits such commands and
+ // already (wrongly) assumes that a single command byte can generate at least one output byte,
+ // J40 instead chose to forbid such commands. with this restriction in place valid enc_size
+ // can never exceed 21 times output_size, so this is what we are checking for.
+ J40__SHOULD(output_size >= enc_size / 21, "icc?");
+
+ for (; index < enc_size; ++index) {
+ pprev = prev;
+ prev = byte;
+ ctx = 0;
+ if (index > 128) {
+ if (prev < 16) ctx = prev < 2 ? prev + 3 : 5;
+ else if (prev > 240) ctx = 6 + (prev == 255);
+ else if (97 <= (prev | 32) && (prev | 32) <= 122) ctx = 1;
+ else if (prev == 44 || prev == 46 || (48 <= prev && prev < 58)) ctx = 2;
+ else ctx = 8;
+ if (pprev < 16) ctx += 2 * 8;
+ else if (pprev > 240) ctx += 3 * 8;
+ else if (97 <= (pprev | 32) && (pprev | 32) <= 122) ctx += 0 * 8;
+ else if (pprev == 44 || pprev == 46 || (48 <= pprev && pprev < 58)) ctx += 1 * 8;
+ else ctx += 4 * 8;
+ }
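+ // (e.g. with prev = 'c' and pprev = 0 this gives ctx = 1 + 2 * 8 = 17)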
+ byte = j40__code(st, ctx, 0, &code);
+ //printf("%zd/%zd: %zd ctx=%d byte=%#x %c\n", index, enc_size, j40__bits_read(st), ctx, (int)byte, 0x20 <= byte && byte < 0x7f ? byte : ' '); fflush(stdout);
+ J40__RAISE_DELAYED();
+ // TODO actually interpret them
+ }
+ J40__TRY(j40__finish_and_free_code(st, &code));
+ j40__free_code_spec(&codespec);
+
+ //size_t commands_size = j40__varint(st);
+
+ /*
+ static const char PREDICTIONS[] = {
+ '*', '*', '*', '*', 0, 0, 0, 0, 4, 0, 0, 0, 'm', 'n', 't', 'r',
+ 'R', 'G', 'B', ' ', 'X', 'Y', 'Z', ' ', 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 'a', 'c', 's', 'p', 0, '@', '@', '@', 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 246, 214, 0, 1, 0, 0, 0, 0, 211, 45,
+ '#', '#', '#', '#',
+ };
+ char pred = i < sizeof(PREDICTIONS) ? PREDICTIONS[i] : 0;
+ switch (pred) {
+ case '*': pred = output_size[i]; break;
+ case '#': pred = header[i - 76]; break;
+ case '@':
+ switch (header[40]) {
+ case 'A': pred = "APPL"[i - 40]; break;
+ case 'M': pred = "MSFT"[i - 40]; break;
+ case 'S':
+ switch (i < 41 ? 0 : header[41]) {
+ case 'G': pred = "SGI "[i - 40]; break;
+ case 'U': pred = "SUNW"[i - 40]; break;
+ }
+ break;
+ }
+ break;
+ }
+ */
+
+ return 0;
+
+J40__ON_ERROR:
+ j40__free_code(&code);
+ j40__free_code_spec(&codespec);
+ return st->err;
+}
+
+#endif // defined J40_IMPLEMENTATION
+
+////////////////////////////////////////////////////////////////////////////////
+// MA tree
+
+enum { J40__NUM_PRED = 14 };
+
+typedef union {
+ struct {
+ int32_t prop; // < 0, ~prop is the property index (e.g. -1 = channel index)
+ int32_t value;
+ int32_t leftoff, rightoff; // relative to the current node
+ } branch;
+ struct {
+ int32_t ctx; // >= 0
+ int32_t predictor;
+ int32_t offset, multiplier;
+ } leaf;
+} j40__tree_node;
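+ // decoding walks the tree from node 0: while at a branch node, the selected
+ // property value is compared against `value`, continuing at node + leftoff
+ // when it is larger and at node + rightoff otherwise (see j40__modular_channel)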
+
+J40__STATIC_RETURNS_ERR j40__tree(
+ j40__st *st, int32_t max_tree_size, j40__tree_node **tree, j40__code_spec *codespec
+);
+
+#ifdef J40_IMPLEMENTATION
+
+J40__STATIC_RETURNS_ERR j40__tree(
+ j40__st *st, int32_t max_tree_size, j40__tree_node **tree, j40__code_spec *codespec
+) {
+ j40__code_st code = J40__INIT;
+ j40__tree_node *t = NULL;
+ int32_t tree_idx = 0, tree_cap = 8;
+ int32_t ctx_id = 0, nodes_left = 1;
+ int32_t depth = 0, nodes_upto_this_depth = 1;
+
+ J40__ASSERT(max_tree_size <= (1 << 26)); // codestream limit; the actual limit should be smaller
+
+ J40__TRY(j40__read_code_spec(st, 6, codespec));
+ j40__init_code(&code, codespec);
+ J40__TRY_MALLOC(j40__tree_node, &t, (size_t) tree_cap);
+ while (nodes_left-- > 0) { // depth-first, left-to-right ordering
+ j40__tree_node *n;
+ int32_t prop, val, shift;
+
+ // the beginning of new tree depth; all `nodes_left` nodes are in this depth at the moment
+ if (tree_idx == nodes_upto_this_depth) {
+ J40__SHOULD(++depth <= st->limits->tree_depth, "tlim");
+ nodes_upto_this_depth += nodes_left + 1;
+ }
+
+ prop = j40__code(st, 1, 0, &code);
+ J40__TRY_REALLOC32(j40__tree_node, &t, tree_idx + 1, &tree_cap);
+ n = &t[tree_idx++];
+ if (prop > 0) {
+ n->branch.prop = -prop;
+ n->branch.value = j40__unpack_signed(j40__code(st, 0, 0, &code));
+ n->branch.leftoff = ++nodes_left;
+ n->branch.rightoff = ++nodes_left;
+ } else {
+ n->leaf.ctx = ctx_id++;
+ n->leaf.predictor = j40__code(st, 2, 0, &code);
+ n->leaf.offset = j40__unpack_signed(j40__code(st, 3, 0, &code));
+ shift = j40__code(st, 4, 0, &code);
+ J40__SHOULD(shift < 31, "tree");
+ val = j40__code(st, 5, 0, &code);
+ J40__SHOULD(((val + 1) >> (31 - shift)) == 0, "tree");
+ n->leaf.multiplier = (val + 1) << shift;
+ }
+
+ J40__SHOULD(tree_idx + nodes_left <= max_tree_size, "tlim");
+ }
+ J40__ASSERT(tree_idx == nodes_upto_this_depth);
+ J40__TRY(j40__finish_and_free_code(st, &code));
+
+ j40__free_code_spec(codespec);
+ memset(codespec, 0, sizeof(*codespec)); // XXX is it required?
+ J40__TRY(j40__read_code_spec(st, ctx_id, codespec));
+ *tree = t;
+ return 0;
+
+J40__ON_ERROR:
+ j40__free(t);
+ j40__free_code(&code);
+ j40__free_code_spec(codespec);
+ return st->err;
+}
+
+#endif // defined J40_IMPLEMENTATION
+
+////////////////////////////////////////////////////////////////////////////////
+// modular header
+
+enum j40__transform_id {
+ J40__TR_RCT = 0, J40__TR_PALETTE = 1, J40__TR_SQUEEZE = 2
+};
+
+typedef union {
+ enum j40__transform_id tr;
+ struct {
+ enum j40__transform_id tr; // = J40__TR_RCT
+ int32_t begin_c, type;
+ } rct;
+ struct {
+ enum j40__transform_id tr; // = J40__TR_PALETTE
+ int32_t begin_c, num_c, nb_colours, nb_deltas, d_pred;
+ } pal;
+ // this is nested in the bitstream, but flattened here.
+ // nb_transforms gets updated accordingly, but remains bounded (the maximum is 80808)
+ struct {
+ enum j40__transform_id tr; // = J40__TR_SQUEEZE
+ int implicit; // if true, no explicit parameters given in the bitstream
+ int horizontal, in_place;
+ int32_t begin_c, num_c;
+ } sq;
+} j40__transform;
+
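+ // parameters for the self-correcting (weighted) predictor, read from WPHeader;
+ // the defaults are p1 = 16, p2 = 10, p3 = {7,7,7,0,0} and w = {13,12,12,12}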
+typedef struct { int8_t p1, p2, p3[5], w[4]; } j40__wp_params;
+
+typedef struct {
+ int use_global_tree;
+ j40__wp_params wp;
+ int32_t nb_transforms;
+ j40__transform *transform;
+ j40__tree_node *tree; // owned only if use_global_tree is false
+ j40__code_spec codespec;
+ j40__code_st code;
+ int32_t num_channels, nb_meta_channels;
+ j40__plane *channel; // should use the same type, either i16 or i32
+ int32_t dist_mult; // min(max(non-meta channel width), J40__MAX_DIST_MULT)
+} j40__modular;
+
+J40_STATIC void j40__init_modular_common(j40__modular *m);
+J40__STATIC_RETURNS_ERR j40__init_modular(
+ j40__st *st, int32_t num_channels, const int32_t *w, const int32_t *h, j40__modular *m
+);
+J40__STATIC_RETURNS_ERR j40__init_modular_for_global(
+ j40__st *st, int frame_is_modular, int frame_do_ycbcr,
+ int32_t frame_log_upsampling, const int32_t *frame_ec_log_upsampling,
+ int32_t frame_width, int32_t frame_height, j40__modular *m
+);
+J40__STATIC_RETURNS_ERR j40__init_modular_for_pass_group(
+ j40__st *st, int32_t num_gm_channels, int32_t gw, int32_t gh,
+ int32_t minshift, int32_t maxshift, const j40__modular *gm, j40__modular *m
+);
+J40_STATIC void j40__combine_modular_from_pass_group(
+ int32_t num_gm_channels, int32_t gy, int32_t gx,
+ int32_t minshift, int32_t maxshift, const j40__modular *gm, j40__modular *m
+);
+J40__STATIC_RETURNS_ERR j40__modular_header(
+ j40__st *st, j40__tree_node *global_tree, const j40__code_spec *global_codespec,
+ j40__modular *m
+);
+J40__STATIC_RETURNS_ERR j40__allocate_modular(j40__st *st, j40__modular *m);
+J40_STATIC void j40__free_modular(j40__modular *m);
+
+#ifdef J40_IMPLEMENTATION
+
+J40_STATIC void j40__init_modular_common(j40__modular *m) {
+ m->transform = NULL;
+ m->tree = NULL;
+ memset(&m->codespec, 0, sizeof(j40__code_spec));
+ memset(&m->code, 0, sizeof(j40__code_st));
+ m->code.spec = &m->codespec;
+ m->channel = NULL;
+}
+
+J40__STATIC_RETURNS_ERR j40__init_modular(
+ j40__st *st, int32_t num_channels, const int32_t *w, const int32_t *h, j40__modular *m
+) {
+ int32_t i;
+
+ j40__init_modular_common(m);
+ m->num_channels = num_channels;
+ J40__ASSERT(num_channels > 0);
+ J40__TRY_CALLOC(j40__plane, &m->channel, (size_t) num_channels);
+ for (i = 0; i < num_channels; ++i) {
+ m->channel[i].width = w[i];
+ m->channel[i].height = h[i];
+ m->channel[i].hshift = m->channel[i].vshift = 0;
+ }
+J40__ON_ERROR:
+ return st->err;
+}
+
+J40__STATIC_RETURNS_ERR j40__init_modular_for_global(
+ j40__st *st, int frame_is_modular, int frame_do_ycbcr,
+ int32_t frame_log_upsampling, const int32_t *frame_ec_log_upsampling,
+ int32_t frame_width, int32_t frame_height, j40__modular *m
+) {
+ j40__image_st *im = st->image;
+ int32_t i;
+
+ j40__init_modular_common(m);
+ m->num_channels = im->num_extra_channels;
+ if (frame_is_modular) { // SPEC the condition is negated
+ m->num_channels += (!frame_do_ycbcr && !im->xyb_encoded && im->cspace == J40__CS_GREY ? 1 : 3);
+ }
+ if (m->num_channels == 0) return 0;
+
+ J40__TRY_CALLOC(j40__plane, &m->channel, (size_t) m->num_channels);
+ for (i = 0; i < im->num_extra_channels; ++i) {
+ int32_t log_upsampling = (frame_ec_log_upsampling ? frame_ec_log_upsampling[i] : 0) + im->ec_info[i].dim_shift;
+ J40__SHOULD(log_upsampling >= frame_log_upsampling, "usmp");
+ J40__SHOULD(log_upsampling == 0, "TODO: upsampling is not yet supported");
+ m->channel[i].width = frame_width;
+ m->channel[i].height = frame_height;
+ m->channel[i].hshift = m->channel[i].vshift = 0;
+ }
+ for (; i < m->num_channels; ++i) {
+ m->channel[i].width = frame_width;
+ m->channel[i].height = frame_height;
+ m->channel[i].hshift = m->channel[i].vshift = 0;
+ }
+ return 0;
+
+J40__ON_ERROR:
+ j40__free(m->channel);
+ m->channel = NULL;
+ return st->err;
+}
+
+J40__STATIC_RETURNS_ERR j40__init_modular_for_pass_group(
+ j40__st *st, int32_t num_gm_channels, int32_t gw, int32_t gh,
+ int32_t minshift, int32_t maxshift, const j40__modular *gm, j40__modular *m
+) {
+ int32_t i, max_channels;
+
+ j40__init_modular_common(m);
+ m->num_channels = 0;
+ max_channels = gm->num_channels - num_gm_channels;
+ J40__ASSERT(max_channels >= 0);
+ J40__TRY_CALLOC(j40__plane, &m->channel, (size_t) max_channels);
+ for (i = num_gm_channels; i < gm->num_channels; ++i) {
+ j40__plane *gc = &gm->channel[i], *c = &m->channel[m->num_channels];
+ if (gc->hshift < 3 || gc->vshift < 3) {
+ J40__ASSERT(gc->hshift >= 0 && gc->vshift >= 0);
+ (void) minshift; (void) maxshift;
+ // TODO check minshift/maxshift!!!
+ c->hshift = gc->hshift;
+ c->vshift = gc->vshift;
+ c->width = gw >> gc->hshift; // TODO is this correct? should be ceil?
+ c->height = gh >> gc->vshift;
+ ++m->num_channels;
+ }
+ }
+ if (m->num_channels == 0) {
+ j40__free(m->channel);
+ m->channel = NULL;
+ }
+J40__ON_ERROR:
+ return st->err;
+}
+
+J40_STATIC void j40__combine_modular_from_pass_group(
+ int32_t num_gm_channels, int32_t gy, int32_t gx,
+ int32_t minshift, int32_t maxshift, const j40__modular *gm, j40__modular *m
+) {
+ int32_t gcidx, cidx, y, gx0, gy0;
+ for (gcidx = num_gm_channels, cidx = 0; gcidx < gm->num_channels; ++gcidx) {
+ j40__plane *gc = &gm->channel[gcidx], *c = &m->channel[cidx];
+ J40__ASSERT(gc->type == c->type);
+ if (gc->hshift < 3 || gc->vshift < 3) {
+ size_t pixel_size = (size_t) J40__PLANE_PIXEL_SIZE(gc);
+ size_t gc_stride = (size_t) gc->stride_bytes, c_stride = (size_t) c->stride_bytes;
+ (void) minshift; (void) maxshift;
+ // TODO check minshift/maxshift!!!
+ J40__ASSERT(gc->hshift == c->hshift && gc->vshift == c->vshift);
+ gx0 = gx >> gc->hshift;
+ gy0 = gy >> gc->vshift;
+ J40__ASSERT(gx0 + c->width <= gc->width && gy0 + c->height <= gc->height);
+ for (y = 0; y < c->height; ++y) {
+ memcpy(
+ (void*) (gc->pixels + gc_stride * (size_t) (gy0 + y) + pixel_size * (size_t) gx0),
+ (void*) (c->pixels + c_stride * (size_t) y),
+ pixel_size * (size_t) c->width);
+ }
+ ++cidx;
+ }
+ }
+ J40__ASSERT(cidx == m->num_channels);
+}
+
+J40__STATIC_RETURNS_ERR j40__modular_header(
+ j40__st *st, j40__tree_node *global_tree, const j40__code_spec *global_codespec,
+ j40__modular *m
+) {
+ j40__plane *channel = m->channel;
+ int32_t num_channels = m->num_channels, nb_meta_channels = 0;
+ // note: channel_cap is the upper bound of # channels during inverse transform, and since
+ // we don't shrink the channel list we don't ever need reallocation in j40__inverse_transform!
+ int32_t channel_cap = m->num_channels, transform_cap;
+ int32_t i, j;
+
+ J40__ASSERT(num_channels > 0);
+
+ m->use_global_tree = j40__u(st, 1);
+ J40__SHOULD(!m->use_global_tree || global_tree, "mtre");
+
+ { // WPHeader
+ int default_wp = j40__u(st, 1);
+ m->wp.p1 = default_wp ? 16 : (int8_t) j40__u(st, 5);
+ m->wp.p2 = default_wp ? 10 : (int8_t) j40__u(st, 5);
+ for (i = 0; i < 5; ++i) m->wp.p3[i] = default_wp ? 7 * (i < 3) : (int8_t) j40__u(st, 5);
+ for (i = 0; i < 4; ++i) m->wp.w[i] = default_wp ? 12 + (i < 1) : (int8_t) j40__u(st, 4);
+ }
+
+ transform_cap = m->nb_transforms = j40__u32(st, 0, 0, 1, 0, 2, 4, 18, 8);
+ J40__SHOULD(m->nb_transforms <= st->limits->nb_transforms, "xlim");
+ J40__TRY_MALLOC(j40__transform, &m->transform, (size_t) transform_cap);
+ for (i = 0; i < m->nb_transforms; ++i) {
+ j40__transform *tr = &m->transform[i];
+ int32_t num_sq;
+
+ tr->tr = (enum j40__transform_id) j40__u(st, 2);
+ switch (tr->tr) {
+ // RCT: [begin_c, begin_c+3) -> [begin_c, begin_c+3)
+ case J40__TR_RCT: {
+ int32_t begin_c = tr->rct.begin_c = j40__u32(st, 0, 3, 8, 6, 72, 10, 1096, 13);
+ int32_t type = tr->rct.type = j40__u32(st, 6, 0, 0, 2, 2, 4, 10, 6);
+ J40__SHOULD(type < 42, "rctt");
+ J40__SHOULD(begin_c + 3 <= num_channels, "rctc");
+ J40__SHOULD(begin_c >= nb_meta_channels || begin_c + 3 <= nb_meta_channels, "rctc");
+ J40__SHOULD(j40__plane_all_equal_sized(channel + begin_c, channel + begin_c + 3), "rtcd");
+ break;
+ }
+
+ // Palette: [begin_c, end_c) -> palette 0 (meta, nb_colours by num_c) + index begin_c+1
+ case J40__TR_PALETTE: {
+ j40__plane input;
+ int32_t begin_c = tr->pal.begin_c = j40__u32(st, 0, 3, 8, 6, 72, 10, 1096, 13);
+ int32_t num_c = tr->pal.num_c = j40__u32(st, 1, 0, 3, 0, 4, 0, 1, 13);
+ int32_t end_c = begin_c + num_c;
+ int32_t nb_colours = tr->pal.nb_colours = j40__u32(st, 0, 8, 256, 10, 1280, 12, 5376, 16);
+ tr->pal.nb_deltas = j40__u32(st, 0, 0, 1, 8, 257, 10, 1281, 16);
+ tr->pal.d_pred = j40__u(st, 4);
+ J40__SHOULD(tr->pal.d_pred < J40__NUM_PRED, "palp");
+ J40__SHOULD(end_c <= num_channels, "palc");
+ if (begin_c < nb_meta_channels) { // num_c meta channels -> 2 meta channels (palette + index)
+ J40__SHOULD(end_c <= nb_meta_channels, "palc");
+ nb_meta_channels += 2 - num_c;
+ } else { // num_c color channels -> 1 meta channel (palette) + 1 color channel (index)
+ nb_meta_channels += 1;
+ }
+ J40__SHOULD(j40__plane_all_equal_sized(channel + begin_c, channel + end_c), "pald");
+ // inverse palette transform always requires one more channel slot
+ J40__TRY_REALLOC32(j40__plane, &channel, num_channels + 1, &channel_cap);
+ input = channel[begin_c];
+ memmove(channel + 1, channel, sizeof(*channel) * (size_t) begin_c);
+ memmove(channel + begin_c + 2, channel + end_c, sizeof(*channel) * (size_t) (num_channels - end_c));
+ channel[0].width = nb_colours;
+ channel[0].height = num_c;
+ channel[0].hshift = 0; // SPEC missing
+ channel[0].vshift = -1;
+ channel[begin_c + 1] = input;
+ num_channels += 2 - num_c;
+ break;
+ }
+
+ // Squeeze:
+ case J40__TR_SQUEEZE: {
+ num_sq = j40__u32(st, 0, 0, 1, 4, 9, 6, 41, 8);
+ if (num_sq == 0) {
+ tr->sq.implicit = 1;
+ } else {
+ J40__TRY_REALLOC32(j40__transform, &m->transform, m->nb_transforms + num_sq - 1, &transform_cap);
+ for (j = 0; j < num_sq; ++j) {
+ tr = &m->transform[i + j];
+ tr->sq.tr = J40__TR_SQUEEZE;
+ tr->sq.implicit = 0;
+ tr->sq.horizontal = j40__u(st, 1);
+ tr->sq.in_place = j40__u(st, 1);
+ tr->sq.begin_c = j40__u32(st, 0, 3, 8, 6, 72, 10, 1096, 13);
+ tr->sq.num_c = j40__u32(st, 1, 0, 2, 0, 3, 0, 4, 4);
+ }
+ i += num_sq - 1;
+ m->nb_transforms += num_sq - 1;
+ }
+ J40__RAISE("TODO: squeeze channel effects");
+ break;
+ }
+
+ default: J40__RAISE("xfm?");
+ }
+ J40__RAISE_DELAYED();
+ }
+
+ J40__SHOULD(num_channels <= st->limits->nb_channels_tr, "xlim");
+
+ if (m->use_global_tree) {
+ m->tree = global_tree;
+ memcpy(&m->codespec, global_codespec, sizeof(j40__code_spec));
+ } else {
+ int32_t max_tree_size = 1024;
+ for (i = 0; i < num_channels; ++i) {
+ max_tree_size = j40__clamp_add32(max_tree_size,
+ j40__clamp_mul32(channel[i].width, channel[i].height));
+ }
+ max_tree_size = j40__min32(1 << 20, max_tree_size);
+ J40__TRY(j40__tree(st, max_tree_size, &m->tree, &m->codespec));
+ }
+ j40__init_code(&m->code, &m->codespec);
+
+ m->channel = channel;
+ m->num_channels = num_channels;
+ m->nb_meta_channels = nb_meta_channels;
+ m->dist_mult = 0;
+ for (i = nb_meta_channels; i < num_channels; ++i) {
+ m->dist_mult = j40__max32(m->dist_mult, channel[i].width);
+ }
+ m->dist_mult = j40__min32(m->dist_mult, J40__MAX_DIST_MULT);
+ return 0;
+
+J40__ON_ERROR:
+ j40__free(channel);
+ j40__free(m->transform);
+ if (!m->use_global_tree) {
+ j40__free(m->tree);
+ j40__free_code_spec(&m->codespec);
+ }
+ m->num_channels = 0;
+ m->channel = NULL;
+ m->transform = NULL;
+ m->tree = NULL;
+ memset(&m->codespec, 0, sizeof(j40__code_spec));
+ return st->err;
+}
+
+J40__STATIC_RETURNS_ERR j40__allocate_modular(j40__st *st, j40__modular *m) {
+ uint8_t pixel_type = (uint8_t) (st->image->modular_16bit_buffers ? J40__PLANE_I16 : J40__PLANE_I32);
+ int32_t i;
+ for (i = 0; i < m->num_channels; ++i) {
+ j40__plane *c = &m->channel[i];
+ if (c->width > 0 && c->height > 0) {
+ J40__TRY(j40__init_plane(st, pixel_type, c->width, c->height, J40__PLANE_FORCE_PAD, c));
+ } else { // possible when, for example, palette with only synthetic colors (nb_colours == 0)
+ j40__init_empty_plane(c);
+ }
+ }
+J40__ON_ERROR:
+ return st->err;
+}
+
+J40_STATIC void j40__free_modular(j40__modular *m) {
+ int32_t i;
+ j40__free_code(&m->code);
+ if (!m->use_global_tree) {
+ j40__free(m->tree);
+ j40__free_code_spec(&m->codespec);
+ }
+ if (m->channel) {
+ for (i = 0; i < m->num_channels; ++i) j40__free_plane(&m->channel[i]);
+ j40__free(m->channel);
+ m->channel = NULL;
+ }
+ j40__free(m->transform);
+ m->use_global_tree = 0;
+ m->tree = NULL;
+ memset(&m->codespec, 0, sizeof(j40__code_spec));
+ m->transform = NULL;
+ m->num_channels = 0;
+}
+
+#endif // defined J40_IMPLEMENTATION
+
+////////////////////////////////////////////////////////////////////////////////
+// modular prediction
+
+J40__STATIC_RETURNS_ERR j40__modular_channel(j40__st *st, j40__modular *m, int32_t cidx, int64_t sidx);
+
+#ifdef J40_IMPLEMENTATION
+static const int32_t J40__24DIVP1[64] = { // [i] = floor(2^24 / (i+1))
+ 0x1000000, 0x800000, 0x555555, 0x400000, 0x333333, 0x2aaaaa, 0x249249, 0x200000,
+ 0x1c71c7, 0x199999, 0x1745d1, 0x155555, 0x13b13b, 0x124924, 0x111111, 0x100000,
+ 0xf0f0f, 0xe38e3, 0xd7943, 0xccccc, 0xc30c3, 0xba2e8, 0xb2164, 0xaaaaa,
+ 0xa3d70, 0x9d89d, 0x97b42, 0x92492, 0x8d3dc, 0x88888, 0x84210, 0x80000,
+ 0x7c1f0, 0x78787, 0x75075, 0x71c71, 0x6eb3e, 0x6bca1, 0x69069, 0x66666,
+ 0x63e70, 0x61861, 0x5f417, 0x5d174, 0x5b05b, 0x590b2, 0x57262, 0x55555,
+ 0x53978, 0x51eb8, 0x50505, 0x4ec4e, 0x4d487, 0x4bda1, 0x4a790, 0x49249,
+ 0x47dc1, 0x469ee, 0x456c7, 0x44444, 0x4325c, 0x42108, 0x41041, 0x40000,
+};
+#endif
+
+// ----------------------------------------
+// recursion for modular buffer sizes (16/32)
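+ // (the file includes itself twice via J40_FILENAME so that the code between
+ // the J40__RECURSING == 200 markers is instantiated for both pixel widths;
+ // J40__P is the buffer width in bits and J40__2P the doubled intermediate width)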
+#undef J40__RECURSING
+#define J40__RECURSING 200
+#define J40__P 16
+#define J40__2P 32
+#include J40_FILENAME
+#define J40__P 32
+#define J40__2P 64
+#include J40_FILENAME
+#undef J40__RECURSING
+#define J40__RECURSING (-1)
+
+#endif // J40__RECURSING < 0
+#if J40__RECURSING == 200
+ #define j40__intP J40__CONCAT3(int, J40__P, _t)
+ #define j40__int2P J40__CONCAT3(int, J40__2P, _t)
+ #define j40__uint2P J40__CONCAT3(uint, J40__2P, _t)
+ #define J40__PIXELS J40__CONCAT3(J40__I, J40__P, _PIXELS)
+// ----------------------------------------
+
+typedef struct {
+ int32_t width;
+ j40__wp_params params;
+ j40__int2P (*errors)[5], pred[5]; // [0..3] = sub-predictions, [4] = final prediction
+ j40__int2P trueerrw, trueerrn, trueerrnw, trueerrne;
+} j40__(wp,2P);
+
+typedef struct { j40__intP w, n, nw, ne, nn, nee, ww, nww; } j40__(neighbors,P);
+J40_ALWAYS_INLINE j40__(neighbors,P) j40__(init_neighbors,P)(const j40__plane *plane, int32_t x, int32_t y);
+
+J40_INLINE j40__int2P j40__(gradient,2P)(j40__int2P w, j40__int2P n, j40__int2P nw);
+J40__STATIC_RETURNS_ERR j40__(init_wp,2P)(j40__st *st, j40__wp_params params, int32_t width, j40__(wp,2P) *wp);
+J40_STATIC void j40__(wp_before_predict_internal,2P)(
+ j40__(wp,2P) *wp, int32_t x, int32_t y,
+ j40__intP pw, j40__intP pn, j40__intP pnw, j40__intP pne, j40__intP pnn
+);
+J40_INLINE void j40__(wp_before_predict,2P)(j40__(wp,2P) *wp, int32_t x, int32_t y, j40__(neighbors,P) *p);
+J40_INLINE j40__int2P j40__(predict,2P)(
+ j40__st *st, int32_t pred, const j40__(wp,2P) *wp, const j40__(neighbors,P) *p
+);
+J40_INLINE void j40__(wp_after_predict,2P)(j40__(wp,2P) *wp, int32_t x, int32_t y, j40__int2P val);
+J40_STATIC void j40__(reset_wp,2P)(j40__(wp,2P) *wp);
+J40_STATIC void j40__(free_wp,2P)(j40__(wp,2P) *wp);
+J40__STATIC_RETURNS_ERR j40__(modular_channel,P)(j40__st *st, j40__modular *m, int32_t cidx, int64_t sidx);
+
+#ifdef J40_IMPLEMENTATION
+
+J40_ALWAYS_INLINE j40__(neighbors,P) j40__(init_neighbors,P)(const j40__plane *plane, int32_t x, int32_t y) {
+ j40__(neighbors,P) p;
+ const j40__intP *pixels = J40__PIXELS(plane, y);
+ int32_t width = plane->width, stride = J40__PLANE_STRIDE(plane);
+
+ /* NN
+ * |
+ * v
+ * NWW NW _ N <- NE <- NEE
+ * | | /|
+ * v v |/
+ * WW -> W ` C
+ *
+ * A -> B means that if A doesn't exist B is used instead.
+ * if the pixel at the end of this chain doesn't exist either, 0 is used.
+ */
+ p.w = x > 0 ? pixels[x - 1] : y > 0 ? pixels[x - stride] : 0;
+ p.n = y > 0 ? pixels[x - stride] : p.w;
+ p.nw = x > 0 && y > 0 ? pixels[(x - 1) - stride] : p.w;
+ p.ne = x + 1 < width && y > 0 ? pixels[(x + 1) - stride] : p.n;
+ p.nn = y > 1 ? pixels[x - 2 * stride] : p.n;
+ p.nee = x + 2 < width && y > 0 ? pixels[(x + 2) - stride] : p.ne;
+ p.ww = x > 1 ? pixels[x - 2] : p.w;
+ p.nww = x > 1 && y > 0 ? pixels[(x - 2) - stride] : p.ww;
+ return p;
+}
+
+J40_INLINE j40__int2P j40__(gradient,2P)(j40__int2P w, j40__int2P n, j40__int2P nw) {
+ j40__int2P lo = j40__(min,2P)(w, n), hi = j40__(max,2P)(w, n);
+ return j40__(min,2P)(j40__(max,2P)(lo, w + n - nw), hi);
+}
+
+J40__STATIC_RETURNS_ERR j40__(init_wp,2P)(j40__st *st, j40__wp_params params, int32_t width, j40__(wp,2P) *wp) {
+ typedef j40__int2P j40__i2Px5[5];
+ int32_t i;
+ J40__ASSERT(width > 0);
+ wp->width = width;
+ wp->params = params;
+ J40__TRY_CALLOC(j40__i2Px5, &wp->errors, (size_t) width * 2);
+ for (i = 0; i < 5; ++i) wp->pred[i] = 0;
+ wp->trueerrw = wp->trueerrn = wp->trueerrnw = wp->trueerrne = 0;
+J40__ON_ERROR:
+ return st->err;
+}
+
+// also works when wp is zero-initialized (in which case does nothing)
+J40_STATIC void j40__(wp_before_predict_internal,2P)(
+ j40__(wp,2P) *wp, int32_t x, int32_t y,
+ j40__intP pw, j40__intP pn, j40__intP pnw, j40__intP pne, j40__intP pnn
+) {
+ typedef j40__int2P int2P_t;
+ typedef j40__uint2P uint2P_t;
+
+ static const int2P_t ZERO[4] = {0, 0, 0, 0};
+
+ int2P_t (*err)[5], (*nerr)[5];
+ int2P_t w[4], wsum, sum;
+ int32_t logw, i;
+ const int2P_t *errw, *errn, *errnw, *errne, *errww, *errw2;
+
+ if (!wp->errors) return;
+
+ err = wp->errors + (y & 1 ? wp->width : 0);
+ nerr = wp->errors + (y & 1 ? 0 : wp->width);
+
+ // SPEC edge cases are handled differently from the spec; in particular some pixels are
+ // added twice to err_sum and require special care (errw2 below)
+ errw = x > 0 ? err[x - 1] : ZERO;
+ errn = y > 0 ? nerr[x] : ZERO;
+ errnw = x > 0 && y > 0 ? nerr[x - 1] : errn;
+ errne = x + 1 < wp->width && y > 0 ? nerr[x + 1] : errn;
+ errww = x > 1 ? err[x - 2] : ZERO;
+ errw2 = x + 1 < wp->width ? ZERO : errw;
+
+ // SPEC again, edge cases are handled differently
+ wp->trueerrw = x > 0 ? err[x - 1][4] : 0;
+ wp->trueerrn = y > 0 ? nerr[x][4] : 0;
+ wp->trueerrnw = x > 0 && y > 0 ? nerr[x - 1][4] : wp->trueerrn;
+ wp->trueerrne = x + 1 < wp->width && y > 0 ? nerr[x + 1][4] : wp->trueerrn;
+
+ // (expr << 3) is used throughout wp, but it's UB when expr is negative
+ wp->pred[0] = (pw + pne - pn) * 8;
+ wp->pred[1] = pn * 8 - (((wp->trueerrw + wp->trueerrn + wp->trueerrne) * wp->params.p1) >> 5);
+ wp->pred[2] = pw * 8 - (((wp->trueerrw + wp->trueerrn + wp->trueerrnw) * wp->params.p2) >> 5);
+ wp->pred[3] = pn * 8 - // SPEC negated (was `+`)
+ ((wp->trueerrnw * wp->params.p3[0] + wp->trueerrn * wp->params.p3[1] +
+ wp->trueerrne * wp->params.p3[2] + (pnn - pn) * 8 * wp->params.p3[3] +
+ (pnw - pw) * 8 * wp->params.p3[4]) >> 5);
+ for (i = 0; i < 4; ++i) {
+ int2P_t errsum = errn[i] + errw[i] + errnw[i] + errww[i] + errne[i] + errw2[i];
+ int32_t shift = j40__max32(j40__(floor_lg,2P)((uint2P_t) errsum + 1) - 5, 0);
+ // SPEC missing the final `>> shift`
+ w[i] = (int2P_t) (4 + ((int64_t) wp->params.w[i] * J40__24DIVP1[errsum >> shift] >> shift));
+ }
+ logw = j40__(floor_lg,2P)((uint2P_t) (w[0] + w[1] + w[2] + w[3])) - 4;
+ wsum = sum = 0;
+ for (i = 0; i < 4; ++i) {
+ wsum += w[i] >>= logw;
+ sum += wp->pred[i] * w[i];
+ }
+ // SPEC missing `- 1` before scaling
+ wp->pred[4] = (int2P_t) (((int64_t) sum + (wsum >> 1) - 1) * J40__24DIVP1[wsum - 1] >> 24);
+ if (((wp->trueerrn ^ wp->trueerrw) | (wp->trueerrn ^ wp->trueerrnw)) <= 0) {
+ int2P_t lo = j40__(min,2P)(pw, j40__(min,2P)(pn, pne)) * 8; // SPEC missing shifts
+ int2P_t hi = j40__(max,2P)(pw, j40__(max,2P)(pn, pne)) * 8;
+ wp->pred[4] = j40__(min,2P)(j40__(max,2P)(lo, wp->pred[4]), hi);
+ }
+}
+
+J40_INLINE void j40__(wp_before_predict,2P)(
+ j40__(wp,2P) *wp, int32_t x, int32_t y, j40__(neighbors,P) *p
+) {
+ j40__(wp_before_predict_internal,2P)(wp, x, y, p->w, p->n, p->nw, p->ne, p->nn);
+}
+
+J40_INLINE j40__int2P j40__(predict,2P)(
+ j40__st *st, int32_t pred, const j40__(wp,2P) *wp, const j40__(neighbors,P) *p
+) {
+ switch (pred) {
+ case 0: return 0;
+ case 1: return p->w;
+ case 2: return p->n;
+ case 3: return (p->w + p->n) / 2;
+ case 4: return j40__(abs,2P)(p->n - p->nw) < j40__(abs,2P)(p->w - p->nw) ? p->w : p->n;
+ case 5: return j40__(gradient,2P)(p->w, p->n, p->nw);
+ case 6: return (wp->pred[4] + 3) >> 3;
+ case 7: return p->ne;
+ case 8: return p->nw;
+ case 9: return p->ww;
+ case 10: return (p->w + p->nw) / 2;
+ case 11: return (p->n + p->nw) / 2;
+ case 12: return (p->n + p->ne) / 2;
+ case 13: return (6 * p->n - 2 * p->nn + 7 * p->w + p->ww + p->nee + 3 * p->ne + 8) / 16;
+ default: return J40__ERR("pred"), 0;
+ }
+}
+
+// also works when wp is zero-initialized (in which case does nothing)
+J40_INLINE void j40__(wp_after_predict,2P)(j40__(wp,2P) *wp, int32_t x, int32_t y, j40__int2P val) {
+ if (wp->errors) {
+ j40__int2P *err = wp->errors[(y & 1 ? wp->width : 0) + x];
+ int32_t i;
+ // SPEC approximated differently from the spec
+ for (i = 0; i < 4; ++i) err[i] = (j40__(abs,2P)(wp->pred[i] - val * 8) + 3) >> 3;
+ err[4] = wp->pred[4] - val * 8; // SPEC this is a *signed* difference
+ }
+}
+
+// also works when wp is zero-initialized (in which case does nothing)
+J40_STATIC void j40__(reset_wp,2P)(j40__(wp,2P) *wp) {
+ int32_t i;
+ if (wp->errors) memset(wp->errors, 0, (size_t) wp->width * 2 * sizeof(j40__int2P[5]));
+ for (i = 0; i < 5; ++i) wp->pred[i] = 0;
+ wp->trueerrw = wp->trueerrn = wp->trueerrnw = wp->trueerrne = 0;
+}
+
+J40_STATIC void j40__(free_wp,2P)(j40__(wp,2P) *wp) {
+ j40__free(wp->errors);
+ wp->errors = NULL;
+ wp->width = 0;
+}
+
+J40__STATIC_RETURNS_ERR j40__(modular_channel,P)(
+ j40__st *st, j40__modular *m, int32_t cidx, int64_t sidx
+) {
+ typedef j40__intP intP_t;
+ typedef j40__int2P int2P_t;
+
+ j40__plane *c = &m->channel[cidx];
+ int32_t width = c->width, height = c->height;
+ int32_t y, x, i;
+ int32_t nrefcmap, *refcmap = NULL; // refcmap[i] is a channel index for properties (16..19)+4*i
+ j40__(wp,2P) wp = J40__INIT;
+
+ J40__ASSERT(m->tree); // caller should set this to the global tree if not given
+ J40__ASSERT(c->type == J40__(PLANE_I,P));
+
+ { // determine whether to use weighted predictor (expensive)
+ int32_t lasttree = 0, use_wp = 0;
+ for (i = 0; i <= lasttree && !use_wp; ++i) {
+ if (m->tree[i].branch.prop < 0) {
+ use_wp |= ~m->tree[i].branch.prop == 15;
+ lasttree = j40__max32(lasttree,
+ i + j40__max32(m->tree[i].branch.leftoff, m->tree[i].branch.rightoff));
+ } else {
+ use_wp |= m->tree[i].leaf.predictor == 6;
+ }
+ }
+ if (use_wp) J40__TRY(j40__(init_wp,2P)(st, m->wp, width, &wp));
+ }
+
+ // compute indices for additional "previous channel" properties
+ // SPEC incompatible channels are skipped and never result in unusable but numbered properties
+ J40__TRY_MALLOC(int32_t, &refcmap, (size_t) cidx);
+ nrefcmap = 0;
+ for (i = cidx - 1; i >= 0; --i) {
+ j40__plane *refc = &m->channel[i];
+ if (c->width != refc->width || c->height != refc->height) continue;
+ if (c->hshift != refc->hshift || c->vshift != refc->vshift) continue;
+ refcmap[nrefcmap++] = i;
+ }
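+ // (property 16 + 4k .. 19 + 4k then refers to refcmap[k]; bit 1 of the
+ // property index enables subtraction of the clamped gradient of rC's
+ // neighbors and bit 0 the absolute value, as handled in the default case below)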
+
+ for (y = 0; y < height; ++y) {
+ intP_t *outpixels = J40__PIXELS(c, y);
+ for (x = 0; x < width; ++x) {
+ j40__tree_node *n = m->tree;
+ j40__(neighbors,P) p = j40__(init_neighbors,P)(c, x, y);
+ int2P_t val;
+
+ // wp should be calculated before any property testing due to max_error (property 15)
+ j40__(wp_before_predict,2P)(&wp, x, y, &p);
+
+ while (n->branch.prop < 0) {
+ int32_t refcidx;
+ j40__plane *refc;
+
+ switch (~n->branch.prop) {
+ case 0: val = cidx; break;
+ case 1: val = (int2P_t) sidx; break; // TODO check overflow
+ case 2: val = y; break;
+ case 3: val = x; break;
+ case 4: val = j40__(abs,2P)(p.n); break;
+ case 5: val = j40__(abs,2P)(p.w); break;
+ case 6: val = p.n; break;
+ case 7: val = p.w; break;
+ case 8: val = x > 0 ? p.w - (p.ww + p.nw - p.nww) : p.w; break;
+ case 9: val = p.w + p.n - p.nw; break;
+ case 10: val = p.w - p.nw; break;
+ case 11: val = p.nw - p.n; break;
+ case 12: val = p.n - p.ne; break;
+ case 13: val = p.n - p.nn; break;
+ case 14: val = p.w - p.ww; break;
+ case 15: // requires use_wp; otherwise will be 0
+ val = wp.trueerrw;
+ if (j40__(abs,2P)(val) < j40__(abs,2P)(wp.trueerrn)) val = wp.trueerrn;
+ if (j40__(abs,2P)(val) < j40__(abs,2P)(wp.trueerrnw)) val = wp.trueerrnw;
+ if (j40__(abs,2P)(val) < j40__(abs,2P)(wp.trueerrne)) val = wp.trueerrne;
+ break;
+ default:
+ refcidx = (~n->branch.prop - 16) / 4;
+ J40__SHOULD(refcidx < nrefcmap, "trec");
+ refc = &m->channel[refcmap[refcidx]];
+ J40__ASSERT(c->width == refc->width && c->height == refc->height);
+ val = J40__PIXELS(refc, y)[x]; // rC
+ if (~n->branch.prop & 2) {
+ int2P_t rw = x > 0 ? J40__PIXELS(refc, y)[x - 1] : 0;
+ int2P_t rn = y > 0 ? J40__PIXELS(refc, y - 1)[x] : rw;
+ int2P_t rnw = x > 0 && y > 0 ? J40__PIXELS(refc, y - 1)[x - 1] : rw;
+ val -= j40__(gradient,2P)(rw, rn, rnw);
+ }
+ if (~n->branch.prop & 1) val = j40__(abs,2P)(val);
+ break;
+ }
+ n += val > n->branch.value ? n->branch.leftoff : n->branch.rightoff;
+ }
+
+ val = j40__code(st, n->leaf.ctx, m->dist_mult, &m->code);
+ // TODO can overflow at any operator and the bound is incorrect anyway
+ val = j40__unpack_signed((int32_t) val) * n->leaf.multiplier + n->leaf.offset;
+ val += j40__(predict,2P)(st, n->leaf.predictor, &wp, &p);
+ J40__SHOULD(J40__CONCAT3(INT, J40__P, _MIN) <= val && val <= J40__CONCAT3(INT, J40__P, _MAX), "povf"); // bound depends on the buffer width
+ outpixels[x] = (intP_t) val;
+ j40__(wp_after_predict,2P)(&wp, x, y, val);
+ }
+ }
+
+ j40__(free_wp,2P)(&wp);
+ j40__free(refcmap);
+ return 0;
+
+J40__ON_ERROR:
+ j40__(free_wp,2P)(&wp);
+ j40__free(refcmap);
+ j40__free_plane(c);
+ return st->err;
+}
+
+#endif // defined J40_IMPLEMENTATION
+
+// ----------------------------------------
+// end of recursion
+ #undef j40__intP
+ #undef j40__int2P
+ #undef j40__uint2P
+ #undef J40__PIXELS
+ #undef J40__P
+ #undef J40__2P
+#endif // J40__RECURSING == 200
+#if J40__RECURSING < 0
+// ----------------------------------------
+
+#ifdef J40_IMPLEMENTATION
+J40__STATIC_RETURNS_ERR j40__modular_channel(j40__st *st, j40__modular *m, int32_t cidx, int64_t sidx) {
+ switch (m->channel[cidx].type) {
+ case J40__PLANE_I16: return j40__modular_channel16(st, m, cidx, sidx);
+ case J40__PLANE_I32: return j40__modular_channel32(st, m, cidx, sidx);
+ case J40__PLANE_EMPTY: return 0;
+ default: J40__UNREACHABLE(); return 0;
+ }
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// modular (inverse) transform
+
+J40__STATIC_RETURNS_ERR j40__inverse_transform(j40__st *st, j40__modular *m);
+
+#ifdef J40_IMPLEMENTATION
+#define J40__X(x,y,z) {x,y,z}, {-(x),-(y),-(z)}
+#define J40__XX(a,b,c,d,e,f) J40__X a, J40__X b, J40__X c, J40__X d, J40__X e, J40__X f
+static const int16_t J40__PALETTE_DELTAS[144][3] = { // the first entry is a duplicate and skipped
+ J40__XX((0, 0, 0), (4, 4, 4), (11, 0, 0), (0, 0, -13), (0, -12, 0), (-10, -10, -10)),
+ J40__XX((-18, -18, -18), (-27, -27, -27), (-18, -18, 0), (0, 0, -32), (-32, 0, 0), (-37, -37, -37)),
+ J40__XX((0, -32, -32), (24, 24, 45), (50, 50, 50), (-45, -24, -24), (-24, -45, -45), (0, -24, -24)),
+ J40__XX((-34, -34, 0), (-24, 0, -24), (-45, -45, -24), (64, 64, 64), (-32, 0, -32), (0, -32, 0)),
+ J40__XX((-32, 0, 32), (-24, -45, -24), (45, 24, 45), (24, -24, -45), (-45, -24, 24), (80, 80, 80)),
+ J40__XX((64, 0, 0), (0, 0, -64), (0, -64, -64), (-24, -24, 45), (96, 96, 96), (64, 64, 0)),
+ J40__XX((45, -24, -24), (34, -34, 0), (112, 112, 112), (24, -45, -45), (45, 45, -24), (0, -32, 32)),
+ J40__XX((24, -24, 45), (0, 96, 96), (45, -24, 24), (24, -45, -24), (-24, -45, 24), (0, -64, 0)),
+ J40__XX((96, 0, 0), (128, 128, 128), (64, 0, 64), (144, 144, 144), (96, 96, 0), (-36, -36, 36)),
+ J40__XX((45, -24, -45), (45, -45, -24), (0, 0, -96), (0, 128, 128), (0, 96, 0), (45, 24, -45)),
+ J40__XX((-128, 0, 0), (24, -45, 24), (-45, 24, -45), (64, 0, -64), (64, -64, -64), (96, 0, 96)),
+ J40__XX((45, -45, 24), (24, 45, -45), (64, 64, -64), (128, 128, 0), (0, 0, -128), (-24, 45, -45)),
+};
+#undef J40__X
+#undef J40__XX
+#endif // defined J40_IMPLEMENTATION
+
+// ----------------------------------------
+// recursion for modular inverse transform
+#undef J40__RECURSING
+#define J40__RECURSING 300
+#define J40__P 16
+#define J40__2P 32
+#include J40_FILENAME
+#define J40__P 32
+#define J40__2P 64
+#include J40_FILENAME
+#undef J40__RECURSING
+#define J40__RECURSING (-1)
+
+#endif // J40__RECURSING < 0
+#if J40__RECURSING == 300
+ #define j40__intP J40__CONCAT3(int, J40__P, _t)
+ #define j40__int2P J40__CONCAT3(int, J40__2P, _t)
+ #define J40__PIXELS J40__CONCAT3(J40__I, J40__P, _PIXELS)
+// ----------------------------------------
+
+J40_STATIC void j40__(inverse_rct,P)(j40__modular *m, const j40__transform *tr);
+J40__STATIC_RETURNS_ERR j40__(inverse_palette,P)(j40__st *st, j40__modular *m, const j40__transform *tr);
+
+#ifdef J40_IMPLEMENTATION
+
+J40_STATIC void j40__(inverse_rct,P)(j40__modular *m, const j40__transform *tr) {
+ typedef j40__intP intP_t;
+ typedef j40__int2P int2P_t;
+
+ // SPEC permutation pseudocode is missing parentheses; better done with a LUT anyway
+ static const uint8_t PERMUTATIONS[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{1,0,2},{2,1,0}};
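+ // (tr->rct.type encodes permutation * 7 + arithmetic type, which is why the
+ // code below splits it with / 7 and % 7; type < 42 = 6 * 7 was checked in
+ // j40__modular_header)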
+
+ j40__plane c[3];
+ int32_t x, y, i;
+
+ J40__ASSERT(tr->tr == J40__TR_RCT);
+ for (i = 0; i < 3; ++i) c[i] = m->channel[tr->rct.begin_c + i];
+ J40__ASSERT(j40__plane_all_equal_sized(c, c + 3));
+
+ // it is possible that input planes are empty, in which case we do nothing (not even shuffling)
+ if (c->type == J40__PLANE_EMPTY) {
+ J40__ASSERT(j40__plane_all_equal_typed(c, c + 3) == J40__PLANE_EMPTY);
+ return;
+ } else {
+ J40__ASSERT(j40__plane_all_equal_typed(c, c + 3) == J40__(PLANE_I,P));
+ }
+
+ // TODO detect overflow
+ switch (tr->rct.type % 7) {
+ case 0: break;
+ case 1:
+ for (y = 0; y < c->height; ++y) {
+ intP_t *pp0 = J40__PIXELS(&c[0], y), *pp2 = J40__PIXELS(&c[2], y);
+ for (x = 0; x < c->width; ++x) pp2[x] = (intP_t) (pp2[x] + pp0[x]);
+ }
+ break;
+ case 2:
+ for (y = 0; y < c->height; ++y) {
+ intP_t *pp0 = J40__PIXELS(&c[0], y), *pp1 = J40__PIXELS(&c[1], y);
+ for (x = 0; x < c->width; ++x) pp1[x] = (intP_t) (pp1[x] + pp0[x]);
+ }
+ break;
+ case 3:
+ for (y = 0; y < c->height; ++y) {
+ intP_t *pp0 = J40__PIXELS(&c[0], y), *pp1 = J40__PIXELS(&c[1], y), *pp2 = J40__PIXELS(&c[2], y);
+ for (x = 0; x < c->width; ++x) {
+ pp1[x] = (intP_t) (pp1[x] + pp0[x]);
+ pp2[x] = (intP_t) (pp2[x] + pp0[x]);
+ }
+ }
+ break;
+ case 4:
+ for (y = 0; y < c->height; ++y) {
+ intP_t *pp0 = J40__PIXELS(&c[0], y), *pp1 = J40__PIXELS(&c[1], y), *pp2 = J40__PIXELS(&c[2], y);
+ for (x = 0; x < c->width; ++x) pp1[x] = (intP_t) (pp1[x] + j40__(floor_avg,P)(pp0[x], pp2[x]));
+ }
+ break;
+ case 5:
+ for (y = 0; y < c->height; ++y) {
+ intP_t *pp0 = J40__PIXELS(&c[0], y), *pp1 = J40__PIXELS(&c[1], y), *pp2 = J40__PIXELS(&c[2], y);
+ for (x = 0; x < c->width; ++x) {
+ // TODO avoid int2P_t if possible
+ pp1[x] = (intP_t) ((int2P_t) pp1[x] + pp0[x] + (pp2[x] >> 1));
+ pp2[x] = (intP_t) (pp2[x] + pp0[x]);
+ }
+ }
+ break;
+ case 6: // YCgCo
+ for (y = 0; y < c->height; ++y) {
+ intP_t *pp0 = J40__PIXELS(&c[0], y), *pp1 = J40__PIXELS(&c[1], y), *pp2 = J40__PIXELS(&c[2], y);
+ for (x = 0; x < c->width; ++x) {
+ // TODO avoid int2P_t if possible
+ int2P_t tmp = (int2P_t) pp0[x] - ((int2P_t) pp2[x] >> 1);
+ int2P_t p1 = (int2P_t) pp2[x] + tmp;
+ int2P_t p2 = tmp - ((int2P_t) pp1[x] >> 1);
+ pp0[x] = (intP_t) (p2 + pp1[x]);
+ pp1[x] = (intP_t) p1;
+ pp2[x] = (intP_t) p2;
+ }
+ }
+ break;
+ default: J40__UNREACHABLE();
+ }
+
+ for (i = 0; i < 3; ++i) {
+ m->channel[tr->rct.begin_c + PERMUTATIONS[tr->rct.type / 7][i]] = c[i];
+ }
+}
+
+J40__STATIC_RETURNS_ERR j40__(inverse_palette,P)(
+ j40__st *st, j40__modular *m, const j40__transform *tr
+) {
+ typedef j40__intP intP_t;
+ typedef j40__int2P int2P_t;
+
+ // `first` is the index of the index channel; restored color channels will be at indices [first,last],
+ // where the original index channel is relocated to the index `last` and then repurposed.
+ // the palette meta channel 0 will be removed at the very end.
+ int32_t first = tr->pal.begin_c + 1, last = tr->pal.begin_c + tr->pal.num_c, bpp = st->image->bpp;
+ int32_t i, j, y, x;
+ j40__plane *idxc;
+ int32_t width = m->channel[first].width, height = m->channel[first].height;
+ int use_pred = tr->pal.nb_deltas > 0, use_wp = use_pred && tr->pal.d_pred == 6;
+ j40__(wp,2P) wp = J40__INIT;
+
+ J40__ASSERT(tr->tr == J40__TR_PALETTE);
+
+ // since we never shrink m->channel, we know there is enough capacity for intermediate transform
+ memmove(m->channel + last, m->channel + first, sizeof(j40__plane) * (size_t) (m->num_channels - first));
+ m->num_channels += last - first;
+ idxc = &m->channel[last];
+
+ for (i = first; i < last; ++i) m->channel[i].type = 0;
+ if (idxc->type == J40__PLANE_EMPTY) {
+ // index channel is empty; all resulting output channels would be empty
+ for (i = first; i < last; ++i) j40__init_empty_plane(&m->channel[i]);
+ } else {
+ for (i = first; i < last; ++i) {
+ J40__TRY(j40__init_plane(st, J40__(PLANE_I,P), width, height, 0, &m->channel[i]));
+ }
+ }
+
+ if (use_wp) J40__TRY(j40__(init_wp,2P)(st, m->wp, width, &wp));
+
+ for (i = 0; i < tr->pal.num_c; ++i) {
+ // palette channel can be also empty
+ intP_t *palp = tr->pal.nb_colours > 0 ? J40__PIXELS(&m->channel[0], i) : NULL;
+ j40__plane *c = &m->channel[first + i];
+ for (y = 0; y < height; ++y) {
+ // SPEC pseudocode accidentally overwrites the index channel
+ intP_t *idxline = J40__PIXELS(idxc, y);
+ intP_t *line = J40__PIXELS(c, y);
+ for (x = 0; x < width; ++x) {
+ intP_t idx = idxline[x], val;
+ int is_delta = idx < tr->pal.nb_deltas;
+ if (idx < 0) { // hard-coded delta for first 3 channels, otherwise 0
+ if (i < 3) {
+ idx = (intP_t) (~idx % 143); // say no to 1's complement
+ val = J40__PALETTE_DELTAS[idx + 1][i];
+ if (bpp > 8) val = (intP_t) (val << (j40__min32(bpp, 24) - 8));
+ } else {
+ val = 0;
+ }
+ } else if (idx < tr->pal.nb_colours) {
+ val = palp[idx];
+ } else { // synthesized from (idx - nb_colours)
+ idx = (intP_t) (idx - tr->pal.nb_colours);
+ if (idx < 64) { // idx == ..YX in base 4 -> {(X+0.5)/4, (Y+0.5)/4, ...}
+ val = (intP_t) ((i < 3 ? idx >> (2 * i) : 0) * (((int2P_t) 1 << bpp) - 1) / 4 +
+ ((int2P_t) 1 << j40__max32(0, bpp - 3)));
+ } else { // idx + 64 == ..ZYX in base 5 -> {X/4, Y/4, Z/4, ...}
+ val = (intP_t) (idx - 64);
+ for (j = 0; j < i; ++j) val = (intP_t) (val / 5);
+ val = (intP_t) ((val % 5) * ((1 << bpp) - 1) / 4);
+ }
+ }
+ if (use_pred) {
+ j40__(neighbors,P) p = j40__(init_neighbors,P)(c, x, y);
+ j40__(wp_before_predict,2P)(&wp, x, y, &p);
+ // TODO handle overflow
+ if (is_delta) val = (intP_t) (val + j40__(predict,2P)(st, tr->pal.d_pred, &wp, &p));
+ j40__(wp_after_predict,2P)(&wp, x, y, val);
+ }
+ line[x] = val;
+ }
+ }
+ j40__(reset_wp,2P)(&wp);
+ }
+
+ j40__(free_wp,2P)(&wp);
+ j40__free_plane(&m->channel[0]);
+ memmove(m->channel, m->channel + 1, sizeof(j40__plane) * (size_t) --m->num_channels);
+ return 0;
+
+J40__ON_ERROR:
+ j40__(free_wp,2P)(&wp);
+ return st->err;
+}
+
+#endif // defined J40_IMPLEMENTATION
+
+// ----------------------------------------
+// end of recursion
+ #undef j40__intP
+ #undef j40__int2P
+ #undef J40__PIXELS
+ #undef J40__P
+ #undef J40__2P
+#endif // J40__RECURSING == 300
+#if J40__RECURSING < 0
+// ----------------------------------------
+
+#ifdef J40_IMPLEMENTATION
+J40__STATIC_RETURNS_ERR j40__inverse_transform(j40__st *st, j40__modular *m) {
+ int32_t i;
+
+ if (m->num_channels == 0) return 0;
+
+ switch (j40__plane_all_equal_typed_or_empty(m->channel, m->channel + m->num_channels)) {
+ case J40__PLANE_I16:
+ for (i = m->nb_transforms - 1; i >= 0; --i) {
+ const j40__transform *tr = &m->transform[i];
+ switch (tr->tr) {
+ case J40__TR_RCT: j40__inverse_rct16(m, tr); break;
+ case J40__TR_PALETTE: J40__TRY(j40__inverse_palette16(st, m, tr)); break;
+ case J40__TR_SQUEEZE: J40__RAISE("TODO: squeeze inverse transformation"); break;
+ default: J40__UNREACHABLE();
+ }
+ }
+ break;
+
+ case J40__PLANE_I32:
+ for (i = m->nb_transforms - 1; i >= 0; --i) {
+ const j40__transform *tr = &m->transform[i];
+ switch (tr->tr) {
+ case J40__TR_RCT: j40__inverse_rct32(m, tr); break;
+ case J40__TR_PALETTE: J40__TRY(j40__inverse_palette32(st, m, tr)); break;
+ case J40__TR_SQUEEZE: J40__RAISE("TODO: squeeze inverse transformation"); break;
+ default: J40__UNREACHABLE();
+ }
+ }
+ break;
+
+ default: // while *some* channels can be empty, it is impossible that all channels are empty
+ J40__UNREACHABLE();
+ }
+
+J40__ON_ERROR:
+ return st->err;
+}
+#endif // defined J40_IMPLEMENTATION
+
+////////////////////////////////////////////////////////////////////////////////
+// dequantization matrix and coefficient orders
+
+enum {
+ J40__NUM_DCT_SELECT = 27, // the number of all possible varblock types (DctSelect)
+ J40__NUM_DCT_PARAMS = 17, // the number of parameters, some shared by multiple DctSelects
+ J40__NUM_ORDERS = 13, // the number of distinct varblock dimensions & orders, after transposition
+};
+
+enum j40__dq_matrix_mode { // the number of params per channel follows:
+ J40__DQ_ENC_LIBRARY = 0, // 0
+ J40__DQ_ENC_HORNUSS = 1, // 3 (params)
+ J40__DQ_ENC_DCT2 = 2, // 6 (params)
+ J40__DQ_ENC_DCT4 = 3, // 2 (params) + n (dct_params)
+ // TODO DCT4x8 uses an undefined name "parameters" (should be "params")
+ J40__DQ_ENC_DCT4X8 = 4, // 1 (params) + n (dct_params)
+ J40__DQ_ENC_AFV = 5, // 9 (params) + n (dct_params) + m (dct4x4_params)
+ J40__DQ_ENC_DCT = 6, // n (params)
+ // all other modes eventually decode to:
+ J40__DQ_ENC_RAW = 7, // n rows * m columns, with the top-left 1/8 by 1/8 unused
+};
+
+typedef struct {
+ enum j40__dq_matrix_mode mode;
+ int16_t n, m;
+ j40_f32x4 *params; // the last element of each row is unused
+} j40__dq_matrix;
+
+J40__STATIC_RETURNS_ERR j40__read_dq_matrix(
+ j40__st *st, int32_t rows, int32_t columns, int64_t raw_sidx,
+ j40__tree_node *global_tree, const j40__code_spec *global_codespec, j40__dq_matrix *dqmat
+);
+J40_INLINE float j40__interpolate(float pos, int32_t c, const j40_f32x4 *bands, int32_t len);
+J40__STATIC_RETURNS_ERR j40__interpolation_bands(
+ j40__st *st, const j40_f32x4 *params, int32_t nparams, j40_f32x4 *out
+);
+J40_STATIC void j40__dct_quant_weights(
+ int32_t rows, int32_t columns, const j40_f32x4 *bands, int32_t len, j40_f32x4 *out
+);
+J40__STATIC_RETURNS_ERR j40__load_dq_matrix(j40__st *st, int32_t idx, j40__dq_matrix *dqmat);
+J40_STATIC void j40__free_dq_matrix(j40__dq_matrix *dqmat);
+J40__STATIC_RETURNS_ERR j40__natural_order(j40__st *st, int32_t log_rows, int32_t log_columns, int32_t **out);
+
+#ifdef J40_IMPLEMENTATION
+
+typedef struct { int8_t log_rows, log_columns, param_idx, order_idx; } j40__dct_select;
+static const j40__dct_select J40__DCT_SELECT[J40__NUM_DCT_SELECT] = {
+ // hereafter DCTnm refers to DCT(2^n)x(2^m) in the spec
+ /*DCT33*/ {3, 3, 0, 0}, /*Hornuss*/ {3, 3, 1, 1}, /*DCT11*/ {3, 3, 2, 1}, /*DCT22*/ {3, 3, 3, 1},
+ /*DCT44*/ {4, 4, 4, 2}, /*DCT55*/ {5, 5, 5, 3}, /*DCT43*/ {4, 3, 6, 4}, /*DCT34*/ {3, 4, 6, 4},
+ /*DCT53*/ {5, 3, 7, 5}, /*DCT35*/ {3, 5, 7, 5}, /*DCT54*/ {5, 4, 8, 6}, /*DCT45*/ {4, 5, 8, 6},
+ /*DCT23*/ {3, 3, 9, 1}, /*DCT32*/ {3, 3, 9, 1}, /*AFV0*/ {3, 3, 10, 1}, /*AFV1*/ {3, 3, 10, 1},
+ /*AFV2*/ {3, 3, 10, 1}, /*AFV3*/ {3, 3, 10, 1}, /*DCT66*/ {6, 6, 11, 7}, /*DCT65*/ {6, 5, 12, 8},
+ /*DCT56*/ {5, 6, 12, 8}, /*DCT77*/ {7, 7, 13, 9}, /*DCT76*/ {7, 6, 14, 10}, /*DCT67*/ {6, 7, 14, 10},
+ /*DCT88*/ {8, 8, 15, 11}, /*DCT87*/ {8, 7, 16, 12}, /*DCT78*/ {7, 8, 16, 12},
+};
+
+static const struct j40__dct_params {
+ int8_t log_rows, log_columns, def_offset, def_mode, def_n, def_m;
+} J40__DCT_PARAMS[J40__NUM_DCT_PARAMS] = {
+ /*DCT33*/ {3, 3, 0, J40__DQ_ENC_DCT, 6, 0}, /*Hornuss*/ {3, 3, 6, J40__DQ_ENC_HORNUSS, 0, 0},
+ /*DCT11*/ {3, 3, 9, J40__DQ_ENC_DCT2, 0, 0}, /*DCT22*/ {3, 3, 15, J40__DQ_ENC_DCT4, 4, 0},
+ /*DCT44*/ {4, 4, 21, J40__DQ_ENC_DCT, 7, 0}, /*DCT55*/ {5, 5, 28, J40__DQ_ENC_DCT, 8, 0},
+ /*DCT34*/ {3, 4, 36, J40__DQ_ENC_DCT, 7, 0}, /*DCT35*/ {3, 5, 43, J40__DQ_ENC_DCT, 8, 0},
+ /*DCT45*/ {4, 5, 51, J40__DQ_ENC_DCT, 8, 0}, /*DCT23*/ {3, 3, 59, J40__DQ_ENC_DCT4X8, 4, 0},
+ /*AFV*/ {3, 3, 64, J40__DQ_ENC_AFV, 4, 4}, /*DCT66*/ {6, 6, 81, J40__DQ_ENC_DCT, 8, 0},
+ /*DCT56*/ {5, 6, 89, J40__DQ_ENC_DCT, 8, 0}, /*DCT77*/ {7, 7, 97, J40__DQ_ENC_DCT, 8, 0},
+ /*DCT67*/ {6, 7, 105, J40__DQ_ENC_DCT, 8, 0}, /*DCT88*/ {8, 8, 113, J40__DQ_ENC_DCT, 8, 0},
+ /*DCT78*/ {7, 8, 121, J40__DQ_ENC_DCT, 8, 0},
+};
+
+#define J40__DCT4X4_DCT_PARAMS \
+ {2200.0f, 392.0f, 112.0f}, {0.0f, 0.0f, -0.25f}, {0.0f, 0.0f, -0.25f}, {0.0f, 0.0f, -0.5f} // (4)
+#define J40__DCT4X8_DCT_PARAMS \
+ {2198.050556016380522f, 764.3655248643528689f, 527.107573587542228f}, \
+ {-0.96269623020744692f, -0.92630200888366945f, -1.4594385811273854f}, \
+ {-0.76194253026666783f, -0.9675229603596517f, -1.450082094097871593f}, \
+ {-0.6551140670773547f, -0.27845290869168118f, -1.5843722511996204f} // (4)
+#define J40__LARGE_DCT_PARAMS(mult) \
+ /* it turns out that the first sets of parameters for larger DCTs have the same ratios */ \
+ {mult * 23629.073922049845f, mult * 8611.3238710010046f, mult * 4492.2486445538634f}, \
+ {-1.025f, -0.3041958212306401f, -1.2f}, {-0.78f, 0.3633036457487539f, -1.2f}, \
+ {-0.65012f, -0.35660379990111464f, -0.8f}, {-0.19041574084286472f, -0.3443074455424403f, -0.7f}, \
+ {-0.20819395464f, -0.33699592683512467f, -0.7f}, {-0.421064f, -0.30180866526242109f, -0.4f}, \
+ {-0.32733845535848671f, -0.27321683125358037f, -0.5f} // (8)
+static const float J40__LIBRARY_DCT_PARAMS[129][4] = {
+ // DCT33 dct_params (n=6) (SPEC some values are incorrect)
+ {3150.0f, 560.0f, 512.0f}, {0.0f, 0.0f, -2.0f}, {-0.4f, -0.3f, -1.0f},
+ {-0.4f, -0.3f, 0.0f}, {-0.4f, -0.3f, -1.0f}, {-2.0f, -0.3f, -2.0f},
+ // Hornuss params (3)
+ {280.0f, 60.0f, 18.0f}, {3160.0f, 864.0f, 200.0f}, {3160.0f, 864.0f, 200.0f},
+ // DCT11 params (6)
+ {3840.0f, 960.0f, 640.0f}, {2560.0f, 640.0f, 320.0f}, {1280.0f, 320.0f, 128.0f},
+ {640.0f, 180.0f, 64.0f}, {480.0f, 140.0f, 32.0f}, {300.0f, 120.0f, 16.0f},
+ // DCT22 params (2) + dct_params (n=4) (TODO spec bug: some values are incorrect)
+ {1.0f, 1.0f, 1.0f}, {1.0f, 1.0f, 1.0f}, J40__DCT4X4_DCT_PARAMS,
+ // DCT44 dct_params (n=7)
+ {8996.8725711814115328f, 3191.48366296844234752f, 1157.50408145487200256f},
+ {-1.3000777393353804f, -0.67424582104194355f, -2.0531423165804414f},
+ {-0.49424529824571225f, -0.80745813428471001f, -1.4f},
+ {-0.439093774457103443f, -0.44925837484843441f, -0.50687130033378396f},
+ {-0.6350101832695744f, -0.35865440981033403f, -0.42708730624733904f},
+ {-0.90177264050827612f, -0.31322389111877305f, -1.4856834539296244f},
+ {-1.6162099239887414f, -0.37615025315725483f, -4.9209142884401604f},
+ // DCT55 dct_params (n=8)
+ {15718.40830982518931456f, 7305.7636810695983104f, 3803.53173721215041536f},
+ {-1.025f, -0.8041958212306401f, -3.060733579805728f},
+ {-0.98f, -0.7633036457487539f, -2.0413270132490346f},
+ {-0.9012f, -0.55660379990111464f, -2.0235650159727417f},
+ {-0.4f, -0.49785304658857626f, -0.5495389509954993f},
+ {-0.48819395464f, -0.43699592683512467f, -0.4f},
+ {-0.421064f, -0.40180866526242109f, -0.4f},
+ {-0.27f, -0.27321683125358037f, -0.3f},
+ // DCT34 dct_params (n=7)
+ {7240.7734393502f, 1448.15468787004f, 506.854140754517f},
+ {-0.7f, -0.5f, -1.4f}, {-0.7f, -0.5f, -0.2f}, {-0.2f, -0.5f, -0.5f},
+ {-0.2f, -0.2f, -0.5f}, {-0.2f, -0.2f, -1.5f}, {-0.5f, -0.2f, -3.6f},
+ // DCT35 dct_params (n=8)
+ {16283.2494710648897f, 5089.15750884921511936f, 3397.77603275308720128f},
+ {-1.7812845336559429f, -0.320049391452786891f, -0.321327362693153371f},
+ {-1.6309059012653515f, -0.35362849922161446f, -0.34507619223117997f},
+ {-1.0382179034313539f, -0.30340000000000003f, -0.70340000000000003f},
+ {-0.85f, -0.61f, -0.9f}, {-0.7f, -0.5f, -1.0f}, {-0.9f, -0.5f, -1.0f},
+ {-1.2360638576849587f, -0.6f, -1.1754605576265209f},
+ // DCT45 dct_params (n=8)
+ {13844.97076442300573f, 4798.964084220744293f, 1807.236946760964614f},
+ {-0.97113799999999995f, -0.61125308982767057f, -1.2f},
+ {-0.658f, -0.83770786552491361f, -1.2f}, {-0.42026f, -0.79014862079498627f, -0.7f},
+ {-0.22712f, -0.2692727459704829f, -0.7f}, {-0.2206f, -0.38272769465388551f, -0.7f},
+ {-0.226f, -0.22924222653091453f, -0.4f}, {-0.6f, -0.20719098826199578f, -0.5f},
+ // DCT23 params (1) + dct_params (n=4)
+ {1.0f, 1.0f, 1.0f}, J40__DCT4X8_DCT_PARAMS,
+ // AFV params (9) + dct_params (n=4) + dct4x4_params (m=4)
+ // (SPEC params & dct_params are swapped; TODO spec bug: dct4x4_params are also incorrect)
+ {3072.0f, 1024.0f, 384.0f}, {3072.0f, 1024.0f, 384.0f}, {256.0f, 50.0f, 12.0f},
+ {256.0f, 50.0f, 12.0f}, {256.0f, 50.0f, 12.0f}, {414.0f, 58.0f, 22.0f},
+ {0.0f, 0.0f, -0.25f}, {0.0f, 0.0f, -0.25f}, {0.0f, 0.0f, -0.25f},
+ J40__DCT4X8_DCT_PARAMS, J40__DCT4X4_DCT_PARAMS,
+
+ J40__LARGE_DCT_PARAMS(0.9f), // DCT66 dct_params (n=8)
+ J40__LARGE_DCT_PARAMS(0.65f), // DCT56 dct_params (n=8)
+ J40__LARGE_DCT_PARAMS(1.8f), // DCT77 dct_params (n=8)
+ J40__LARGE_DCT_PARAMS(1.3f), // DCT67 dct_params (n=8)
+ J40__LARGE_DCT_PARAMS(3.6f), // DCT88 dct_params (n=8)
+ J40__LARGE_DCT_PARAMS(2.6f), // DCT78 dct_params (n=8)
+};
+
+static const int8_t J40__LOG_ORDER_SIZE[J40__NUM_ORDERS][2] = {
+ {3,3}, {3,3}, {4,4}, {5,5}, {3,4}, {3,5}, {4,5}, {6,6}, {5,6}, {7,7}, {6,7}, {8,8}, {7,8},
+};
+
+J40__STATIC_RETURNS_ERR j40__read_dq_matrix(
+ j40__st *st, int32_t rows, int32_t columns, int64_t raw_sidx,
+ j40__tree_node *global_tree, const j40__code_spec *global_codespec, j40__dq_matrix *dqmat
+) {
+ j40__modular m = J40__INIT;
+ int32_t c, i, j;
+
+ dqmat->mode = (enum j40__dq_matrix_mode) j40__u(st, 3);
+ dqmat->params = NULL;
+ if (dqmat->mode == J40__DQ_ENC_RAW) { // read as a modular image
+ float denom, inv_denom;
+ int32_t w[3], h[3], x, y;
+
+ denom = j40__f16(st);
+ // TODO spec bug: ZeroPadToByte isn't required at this point
+ J40__SHOULD(j40__surely_nonzero(denom), "dqm0");
+ inv_denom = 1.0f / denom;
+
+ w[0] = w[1] = w[2] = columns;
+ h[0] = h[1] = h[2] = rows;
+ J40__TRY(j40__init_modular(st, 3, w, h, &m));
+ J40__TRY(j40__modular_header(st, global_tree, global_codespec, &m));
+ J40__TRY(j40__allocate_modular(st, &m));
+ for (c = 0; c < m.num_channels; ++c) J40__TRY(j40__modular_channel(st, &m, c, raw_sidx));
+ J40__TRY(j40__finish_and_free_code(st, &m.code));
+ J40__TRY(j40__inverse_transform(st, &m));
+
+ J40__TRY_MALLOC(j40_f32x4, &dqmat->params, (size_t) (rows * columns));
+ for (c = 0; c < 3; ++c) {
+ if (m.channel[c].type == J40__PLANE_I16) {
+ for (y = 0; y < rows; ++y) {
+ int16_t *pixels = J40__I16_PIXELS(&m.channel[c], y);
+ for (x = 0; x < columns; ++x) {
+ dqmat->params[y * columns + x][c] = (float) pixels[x] * inv_denom;
+ }
+ }
+ } else {
+ for (y = 0; y < rows; ++y) {
+ int32_t *pixels = J40__I32_PIXELS(&m.channel[c], y);
+ for (x = 0; x < columns; ++x) {
+ dqmat->params[y * columns + x][c] = (float) pixels[x] * inv_denom;
+ }
+ }
+ }
+ }
+
+ j40__free_modular(&m);
+ dqmat->n = (int16_t) rows;
+ dqmat->m = (int16_t) columns;
+ } else {
+ static const struct how {
+ int8_t requires8x8; // 1 if 8x8 matrix is required
+ int8_t nparams; // the number of fixed parameters
+ int8_t nscaled; // params[0..nscaled-1] should be scaled by 64
+ int8_t ndctparams; // the number of calls to ReadDctParams
+ } HOW[7] = {{0,0,0,0}, {1,3,3,0}, {1,6,6,0}, {1,2,2,1}, {1,1,0,1}, {1,9,6,2}, {1,0,0,1}};
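+ // e.g. HOW[J40__DQ_ENC_DCT4] = {1,2,2,1}: requires an 8x8 matrix, reads 2 fixed
+ // params (both scaled by 64) followed by one ReadDctParams parameter group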
+ struct how how = HOW[dqmat->mode];
+ int32_t paramsize = how.nparams + how.ndctparams * 16, paramidx = how.nparams;
+ if (how.requires8x8) J40__SHOULD(rows == 8 && columns == 8, "dqm?");
+ if (paramsize) {
+ J40__TRY_MALLOC(j40_f32x4, &dqmat->params, (size_t) paramsize);
+ for (c = 0; c < 3; ++c) for (j = 0; j < how.nparams; ++j) {
+ dqmat->params[j][c] = j40__f16(st) * (j < how.nscaled ? 64.0f : 1.0f);
+ }
+ for (i = 0; i < how.ndctparams; ++i) { // ReadDctParams
+ int32_t n = *(i == 0 ? &dqmat->n : &dqmat->m) = (int16_t) (j40__u(st, 4) + 1);
+ for (c = 0; c < 3; ++c) for (j = 0; j < n; ++j) {
+ dqmat->params[paramidx + j][c] = j40__f16(st) * (j == 0 ? 64.0f : 1.0f);
+ }
+ paramidx += n;
+ }
+ }
+ J40__RAISE_DELAYED();
+ }
+ return 0;
+
+J40__ON_ERROR:
+ j40__free(dqmat->params);
+ dqmat->params = NULL;
+ j40__free_modular(&m);
+ return st->err;
+}
+
+// piecewise exponential interpolation where pos is in [0,1], mapping pos = k/(len-1) to bands[k]
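+// e.g. with len == 3 and pos == 0.75, scaled_pos == 1.5, so the result is
+// bands[1] * (bands[2]/bands[1])^0.5, the geometric mean of bands[1] and bands[2]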
+J40_INLINE float j40__interpolate(float pos, int32_t c, const j40_f32x4 *bands, int32_t len) {
+ float scaled_pos, frac_idx, a, b;
+ int32_t scaled_idx;
+ if (len == 1) return bands[0][c];
+ scaled_pos = pos * (float) (len - 1);
+ scaled_idx = (int32_t) scaled_pos;
+ frac_idx = scaled_pos - (float) scaled_idx;
+ a = bands[scaled_idx][c];
+ b = bands[scaled_idx + 1][c];
+ return a * powf(b / a, frac_idx);
+}
+
+J40__STATIC_RETURNS_ERR j40__interpolation_bands(
+ j40__st *st, const j40_f32x4 *params, int32_t nparams, j40_f32x4 *out
+) {
+ int32_t i, c;
+ for (c = 0; c < 3; ++c) {
+ // TODO spec bug: loops for x & y are independent of the loop for i (bands)
+ // TODO spec bug: `bands(i)` should be strictly larger than 0 (not merely no less than 0), and for every i >= 0 (not only i > 0)
+ out[0][c] = params[0][c];
+ J40__SHOULD(out[0][c] > 0, "band");
+ for (i = 1; i < nparams; ++i) {
+ float v = params[i][c];
+ out[i][c] = v > 0 ? out[i - 1][c] * (1.0f + v) : out[i - 1][c] / (1.0f - v);
+ J40__SHOULD(out[i][c] > 0, "band");
+ }
+ }
+J40__ON_ERROR:
+ return st->err;
+}
+
+J40_STATIC void j40__dct_quant_weights(
+ int32_t rows, int32_t columns, const j40_f32x4 *bands, int32_t len, j40_f32x4 *out
+) {
+ float inv_rows_m1 = 1.0f / (float) (rows - 1), inv_columns_m1 = 1.0f / (float) (columns - 1);
+ int32_t x, y, c;
+ for (c = 0; c < 3; ++c) {
+ for (y = 0; y < rows; ++y) for (x = 0; x < columns; ++x) {
+ static const float INV_SQRT2 = 1.0f / 1.414214562373095f; // 1/(sqrt(2) + 1e-6)
+ float d = hypotf((float) x * inv_columns_m1, (float) y * inv_rows_m1);
+ // TODO spec issue: num_bands doesn't exist (probably len)
+ out[y * columns + x][c] = j40__interpolate(d * INV_SQRT2, c, bands, len);
+ }
+ }
+}
+
+// TODO spec issue: VarDCT uses the (row, column) notation, not the (x, y) notation; explicitly note this
+// TODO spec improvement: spec can provide computed matrices for default parameters to aid verification
+J40__STATIC_RETURNS_ERR j40__load_dq_matrix(j40__st *st, int32_t idx, j40__dq_matrix *dqmat) {
+ enum { MAX_BANDS = 15 };
+ const struct j40__dct_params dct = J40__DCT_PARAMS[idx];
+ enum j40__dq_matrix_mode mode;
+ int32_t rows, columns, n, m;
+ const j40_f32x4 *params;
+ j40_f32x4 *raw = NULL, bands[MAX_BANDS], scratch[64];
+ int32_t x, y, i, c;
+
+ mode = dqmat->mode;
+ if (mode == J40__DQ_ENC_RAW) {
+ return 0; // nothing to do
+ } else if (mode == J40__DQ_ENC_LIBRARY) {
+ mode = (enum j40__dq_matrix_mode) dct.def_mode;
+ n = dct.def_n;
+ m = dct.def_m;
+ params = J40__LIBRARY_DCT_PARAMS + dct.def_offset;
+ } else {
+ n = dqmat->n;
+ m = dqmat->m;
+ params = dqmat->params;
+ }
+
+ rows = 1 << dct.log_rows;
+ columns = 1 << dct.log_columns;
+ J40__TRY_MALLOC(j40_f32x4, &raw, (size_t) (rows * columns));
+
+ switch (mode) {
+ case J40__DQ_ENC_DCT:
+ J40__TRY(j40__interpolation_bands(st, params, n, bands));
+ j40__dct_quant_weights(rows, columns, bands, n, raw);
+ break;
+
+ case J40__DQ_ENC_DCT4:
+ J40__ASSERT(rows == 8 && columns == 8);
+ J40__ASSERT(n <= MAX_BANDS);
+ J40__TRY(j40__interpolation_bands(st, params + 2, n, bands));
+ j40__dct_quant_weights(4, 4, bands, n, scratch);
+ for (c = 0; c < 3; ++c) {
+ for (y = 0; y < 8; ++y) for (x = 0; x < 8; ++x) {
+ raw[y * 8 + x][c] = scratch[(y / 2) * 4 + (x / 2)][c];
+ }
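+ // note: 001/010/011 here (and in the Hornuss and DCT4x8 cases below) are octal,
+ // encoding the (row, column) positions (0,1), (1,0) and (1,1)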
+ raw[001][c] /= params[0][c];
+ raw[010][c] /= params[0][c];
+ raw[011][c] /= params[1][c];
+ }
+ break;
+
+ case J40__DQ_ENC_DCT2:
+ J40__ASSERT(rows == 8 && columns == 8);
+ for (c = 0; c < 3; ++c) {
+ static const int8_t MAP[64] = {
+ // TODO spec issue: coefficient (0,0) is unspecified; this means it shouldn't be touched
+ 0,0,2,2,4,4,4,4,
+ 0,1,2,2,4,4,4,4,
+ 2,2,3,3,4,4,4,4,
+ 2,2,3,3,4,4,4,4,
+ 4,4,4,4,5,5,5,5,
+ 4,4,4,4,5,5,5,5,
+ 4,4,4,4,5,5,5,5,
+ 4,4,4,4,5,5,5,5,
+ };
+ for (i = 0; i < 64; ++i) raw[i][c] = params[MAP[i]][c];
+ raw[0][c] = -1.0f;
+ }
+ break;
+
+ case J40__DQ_ENC_HORNUSS:
+ J40__ASSERT(rows == 8 && columns == 8);
+ for (c = 0; c < 3; ++c) {
+ for (i = 0; i < 64; ++i) raw[i][c] = params[0][c];
+ raw[000][c] = 1.0f;
+ raw[001][c] = raw[010][c] = params[1][c];
+ raw[011][c] = params[2][c];
+ }
+ break;
+
+ case J40__DQ_ENC_DCT4X8:
+ J40__ASSERT(rows == 8 && columns == 8);
+ J40__ASSERT(n <= MAX_BANDS);
+ J40__TRY(j40__interpolation_bands(st, params + 1, n, bands));
+ // TODO spec bug: 4 rows by 8 columns, not 8 rows by 4 columns (compare with AFV weights4x8)
+ // the position (x, y Idiv 2) is also confusing, since it's using the (x, y) notation
+ j40__dct_quant_weights(4, 8, bands, n, scratch);
+ for (c = 0; c < 3; ++c) {
+ for (y = 0; y < 8; ++y) for (x = 0; x < 8; ++x) {
+ raw[y * 8 + x][c] = scratch[(y / 2) * 8 + x][c];
+ }
+ raw[001][c] /= params[0][c];
+ }
+ break;
+
+ case J40__DQ_ENC_AFV:
+ J40__ASSERT(rows == 8 && columns == 8);
+ J40__ASSERT(n <= MAX_BANDS && m <= MAX_BANDS);
+ J40__TRY(j40__interpolation_bands(st, params + 9, n, bands));
+ j40__dct_quant_weights(4, 8, bands, n, scratch);
+ J40__TRY(j40__interpolation_bands(st, params + 9 + n, m, bands));
+ j40__dct_quant_weights(4, 4, bands, m, scratch + 32);
+ J40__TRY(j40__interpolation_bands(st, params + 5, 4, bands));
+ for (c = 0; c < 3; ++c) {
+ // TODO spec bug: this value can never be 1 because that would result in an
+ // out-of-bounds access in j40__interpolate; libjxl avoids this by adding 1e-6 to the denominator
+ static const float FREQS[12] = { // precomputed values of (freqs[i] - lo) / (hi - lo + 1e-6)
+ 0.000000000f, 0.373436417f, 0.320380100f, 0.379332596f, 0.066671353f, 0.259756761f,
+ 0.530035651f, 0.789731061f, 0.149436598f, 0.559318823f, 0.669198646f, 0.999999917f,
+ };
+ scratch[0][c] = params[0][c]; // replaces the top-left corner of weights4x8
+ scratch[32][c] = params[1][c]; // replaces the top-left corner of weights4x4
+ for (i = 0; i < 12; ++i) scratch[i + 48][c] = j40__interpolate(FREQS[i], c, bands, 4);
+ scratch[60][c] = 1.0f;
+ for (i = 0; i < 3; ++i) scratch[i + 61][c] = params[i + 2][c];
+ }
+ for (c = 0; c < 3; ++c) {
+ // TODO spec bug: `weight(...)` uses multiple conflicting notations
+ static const int8_t MAP[64] = {
+ // 1..31 from weights4x8, 33..47 from weights4x4, 48..59 interpolated,
+ // 0/32/61..63 directly from parameters, 60 fixed to 1.0
+ 60, 32, 62, 33, 48, 34, 49, 35,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 61, 36, 63, 37, 50, 38, 51, 39,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 52, 40, 53, 41, 54, 42, 55, 43,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 56, 44, 57, 45, 58, 46, 59, 47,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ };
+ for (i = 0; i < 64; ++i) raw[i][c] = scratch[MAP[i]][c];
+ }
+ break;
+
+ default: J40__UNREACHABLE();
+ }
+
+ j40__free(dqmat->params);
+ dqmat->mode = J40__DQ_ENC_RAW;
+ dqmat->n = (int16_t) rows;
+ dqmat->m = (int16_t) columns;
+ dqmat->params = raw;
+ return 0;
+
+J40__ON_ERROR:
+ j40__free(raw);
+ return st->err;
+}
+
+J40_STATIC void j40__free_dq_matrix(j40__dq_matrix *dqmat) {
+ if (dqmat->mode != J40__DQ_ENC_LIBRARY) j40__free(dqmat->params);
+ dqmat->mode = J40__DQ_ENC_LIBRARY;
+ dqmat->params = NULL;
+}
+
+J40__STATIC_RETURNS_ERR j40__natural_order(j40__st *st, int32_t log_rows, int32_t log_columns, int32_t **out) {
+ int32_t size = 1 << (log_rows + log_columns), log_slope = log_columns - log_rows;
+ int32_t rows8 = 1 << (log_rows - 3), columns8 = 1 << (log_columns - 3);
+ int32_t *order = NULL;
+ int32_t y, x, key1, o;
+
+ J40__ASSERT(8 >= log_columns && log_columns >= log_rows && log_rows >= 3);
+
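+ // for an 8x8 block (log_rows == log_columns == 3) the first loop emits only the DC
+ // position, and the diagonal sweep below reduces to the familiar JPEG zigzag order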
+ J40__TRY_MALLOC(int32_t, &order, (size_t) size);
+
+ o = 0;
+ for (y = 0; y < rows8; ++y) for (x = 0; x < columns8; ++x) {
+ order[o++] = y << log_columns | x;
+ }
+
+ // d e..
+ // +---------/-/- each diagonal is identified by an integer
+ // | |/ / / key1 = scaled_x + scaled_y = x + y * 2^log_slope,
+ // |_a_b_c_| / / and covers at least one cell when:
+ // |/ / / / / / / 2^(log_columns - 3) <= key1 < 2^(log_columns + 1) - 2^log_slope.
+ for (key1 = 1 << (log_columns - 3); o < size; ++key1) {
+ // place initial endpoints to leftmost and topmost edges, then fix out-of-bounds later
+ // place initial endpoints on the leftmost and topmost edges, then fix any out-of-bounds later
+ if (x1 >= (1 << log_columns)) {
+ int32_t excess = j40__ceil_div32(x1 - ((1 << log_columns) - 1), 1 << log_slope);
+ x1 -= excess << log_slope;
+ y1 += excess;
+ J40__ASSERT(x1 >= 0 && y1 < (1 << log_rows));
+ }
+ if (y0 >= (1 << log_rows)) {
+ int32_t excess = y0 - ((1 << log_rows) - 1);
+ x0 += excess << log_slope;
+ y0 -= excess;
+ J40__ASSERT(x0 < (1 << log_columns) && y0 >= 0);
+ }
+ J40__ASSERT(o + (y0 - y1 + 1) <= size);
+ if (key1 & 1) {
+ for (x = x1, y = y1; x >= x0; x -= 1 << log_slope, ++y) {
+ // skip the already covered top-left LLF region
+ if (y >= rows8 || x >= columns8) order[o++] = y << log_columns | x;
+ }
+ } else {
+ for (x = x0, y = y0; x <= x1; x += 1 << log_slope, --y) {
+ if (y >= rows8 || x >= columns8) order[o++] = y << log_columns | x;
+ }
+ }
+ }
+ J40__ASSERT(o == size);
+
+ *out = order;
+ return 0;
+
+J40__ON_ERROR:
+ j40__free(order);
+ return st->err;
+}
+
+#endif // defined J40_IMPLEMENTATION
+
+////////////////////////////////////////////////////////////////////////////////
+// frame context
+
+enum {
+ J40__MAX_PASSES = 11,
+};
+
+enum {
+ J40__BLEND_REPLACE = 0, // new
+ J40__BLEND_ADD = 1, // old + new
+ J40__BLEND_BLEND = 2, // new + old * (1 - new alpha) or equivalent, optionally clamped
+ J40__BLEND_MUL_ADD = 3, // old + new * alpha or equivalent, optionally clamped
+ J40__BLEND_MUL = 4, // old * new, optionally clamped
+};
+typedef struct {
+ int8_t mode, alpha_chan, clamp, src_ref_frame;
+} j40__blend_info;
+
+enum j40__frame_type {
+ J40__FRAME_REGULAR = 0, J40__FRAME_LF = 1, J40__FRAME_REFONLY = 2, J40__FRAME_REGULAR_SKIPPROG = 3
+};
+
+typedef struct j40__frame_st {
+ int is_last;
+ enum j40__frame_type type;
+ int is_modular; // VarDCT if false
+ int has_noise, has_patches, has_splines, use_lf_frame, skip_adapt_lf_smooth;
+ int do_ycbcr;
+ int32_t jpeg_upsampling; // [0] | [1] << 2 | [2] << 4
+ int32_t log_upsampling, *ec_log_upsampling;
+ int32_t group_size_shift;
+ int32_t x_qm_scale, b_qm_scale;
+ int32_t num_passes;
+ int8_t shift[J40__MAX_PASSES];
+ int8_t log_ds[J40__MAX_PASSES + 1]; // pass i shift range is [log_ds[i+1], log_ds[i])
+ int32_t lf_level;
+ int32_t x0, y0, width, height;
+ int32_t grows, gcolumns, ggrows, ggcolumns;
+ // there can be at most (2^23 + 146)^2 groups and (2^20 + 29)^2 LF groups in a single frame
+ int64_t num_groups, num_lf_groups;
+ int64_t duration, timecode;
+ j40__blend_info blend_info, *ec_blend_info;
+ int32_t save_as_ref;
+ int save_before_ct;
+ int32_t name_len;
+ char *name;
+ struct {
+ int enabled;
+ float weights[3 /*xyb*/][2 /*0=weight1 (cardinal/center), 1=weight2 (diagonal/center)*/];
+ } gab;
+ struct {
+ int32_t iters;
+ float sharp_lut[8], channel_scale[3];
+ float quant_mul, pass0_sigma_scale, pass2_sigma_scale, border_sad_mul, sigma_for_modular;
+ } epf;
+ // TODO spec bug: m_*_lf_unscaled are wildly incorrect, both in default values and scaling
+ float m_lf_scaled[3 /*xyb*/];
+ j40__tree_node *global_tree;
+ j40__code_spec global_codespec;
+
+ // modular only, available after LfGlobal (local groups are always pasted into gmodular)
+ j40__modular gmodular;
+ int32_t num_gm_channels; // <= gmodular.num_channels
+
+ // vardct only, available after LfGlobal
+ int32_t global_scale, quant_lf;
+ int32_t lf_thr[3 /*xyb*/][15], qf_thr[15];
+ int32_t nb_lf_thr[3 /*xyb*/], nb_qf_thr;
+ uint8_t *block_ctx_map;
+ int32_t block_ctx_size, nb_block_ctx;
+ float inv_colour_factor;
+ int32_t x_factor_lf, b_factor_lf;
+ float base_corr_x, base_corr_b;
+
+ // vardct only, available after HfGlobal/HfPass
+ int32_t dct_select_used, dct_select_loaded; // i-th bit for DctSelect i
+ int32_t order_used, order_loaded; // i-th bit for order i
+ j40__dq_matrix dq_matrix[J40__NUM_DCT_PARAMS];
+ int32_t num_hf_presets;
+ // Lehmer code + sentinel (-1) before actual coefficient decoding,
+ // either properly computed or discarded due to non-use later (can be NULL in that case)
+ int32_t *orders[J40__MAX_PASSES][J40__NUM_ORDERS][3 /*xyb*/];
+ j40__code_spec coeff_codespec[J40__MAX_PASSES];
+} j40__frame_st;
+
+J40_STATIC void j40__free_frame_state(j40__frame_st *f);
+
+#ifdef J40_IMPLEMENTATION
+
+J40_STATIC void j40__free_frame_state(j40__frame_st *f) {
+ int32_t i, j, k;
+ j40__free(f->ec_log_upsampling);
+ j40__free(f->ec_blend_info);
+ j40__free(f->name);
+ j40__free(f->global_tree);
+ j40__free_code_spec(&f->global_codespec);
+ j40__free_modular(&f->gmodular);
+ j40__free(f->block_ctx_map);
+ for (i = 0; i < J40__NUM_DCT_PARAMS; ++i) j40__free_dq_matrix(&f->dq_matrix[i]);
+ for (i = 0; i < J40__MAX_PASSES; ++i) {
+ for (j = 0; j < J40__NUM_ORDERS; ++j) {
+ for (k = 0; k < 3; ++k) {
+ j40__free(f->orders[i][j][k]);
+ f->orders[i][j][k] = NULL;
+ }
+ }
+ j40__free_code_spec(&f->coeff_codespec[i]);
+ }
+ f->ec_log_upsampling = NULL;
+ f->ec_blend_info = NULL;
+ f->name = NULL;
+ f->global_tree = NULL;
+ f->block_ctx_map = NULL;
+}
+
+#endif // defined J40_IMPLEMENTATION
+
+////////////////////////////////////////////////////////////////////////////////
+// frame header
+
+J40__STATIC_RETURNS_ERR j40__frame_header(j40__st *st);
+
+#ifdef J40_IMPLEMENTATION
+
+J40__STATIC_RETURNS_ERR j40__frame_header(j40__st *st) {
+ j40__image_st *im = st->image;
+ j40__frame_st *f = st->frame;
+ int32_t i, j;
+
+ f->is_last = 1;
+ f->type = J40__FRAME_REGULAR;
+ f->is_modular = 0;
+ f->has_noise = f->has_patches = f->has_splines = f->use_lf_frame = f->skip_adapt_lf_smooth = 0;
+ f->do_ycbcr = 0;
+ f->jpeg_upsampling = 0;
+ f->log_upsampling = 0;
+ f->ec_log_upsampling = NULL;
+ f->group_size_shift = 8;
+ f->x_qm_scale = 3;
+ f->b_qm_scale = 2;
+ f->num_passes = 1;
+ f->shift[0] = 0; // last pass if default
+ f->log_ds[0] = 3; f->log_ds[1] = 0; // last pass if default
+ f->lf_level = 0;
+ f->x0 = f->y0 = 0;
+ f->width = im->width;
+ f->height = im->height;
+ f->duration = f->timecode = 0;
+ f->blend_info.mode = J40__BLEND_REPLACE;
+ f->blend_info.alpha_chan = 0; // XXX set to the actual alpha channel
+ f->blend_info.clamp = 0;
+ f->blend_info.src_ref_frame = 0;
+ f->ec_blend_info = NULL;
+ f->save_as_ref = 0;
+ f->save_before_ct = 1;
+ f->name_len = 0;
+ f->name = NULL;
+ f->gab.enabled = 1;
+ f->gab.weights[0][0] = f->gab.weights[1][0] = f->gab.weights[2][0] = 0.115169525f;
+ f->gab.weights[0][1] = f->gab.weights[1][1] = f->gab.weights[2][1] = 0.061248592f;
+ f->epf.iters = 2;
+ for (i = 0; i < 8; ++i) f->epf.sharp_lut[i] = (float) i / 7.0f;
+ f->epf.channel_scale[0] = 40.0f;
+ f->epf.channel_scale[1] = 5.0f;
+ f->epf.channel_scale[2] = 3.5f;
+ f->epf.quant_mul = 0.46f;
+ f->epf.pass0_sigma_scale = 0.9f;
+ f->epf.pass2_sigma_scale = 6.5f;
+ f->epf.border_sad_mul = 2.0f / 3.0f;
+ f->epf.sigma_for_modular = 1.0f;
+ // TODO spec bug: default values for m_*_lf_unscaled should be reciprocals of the listed values
+ f->m_lf_scaled[0] = 1.0f / 4096.0f;
+ f->m_lf_scaled[1] = 1.0f / 512.0f;
+ f->m_lf_scaled[2] = 1.0f / 256.0f;
+ f->global_tree = NULL;
+ memset(&f->global_codespec, 0, sizeof(j40__code_spec));
+ memset(&f->gmodular, 0, sizeof(j40__modular));
+ f->block_ctx_map = NULL;
+ f->inv_colour_factor = 1 / 84.0f;
+ f->x_factor_lf = 0;
+ f->b_factor_lf = 0;
+ f->base_corr_x = 0.0f;
+ f->base_corr_b = 1.0f;
+ f->dct_select_used = f->dct_select_loaded = 0;
+ f->order_used = f->order_loaded = 0;
+ memset(f->dq_matrix, 0, sizeof(f->dq_matrix));
+ memset(f->orders, 0, sizeof(f->orders));
+ memset(f->coeff_codespec, 0, sizeof(f->coeff_codespec));
+
+ J40__TRY(j40__zero_pad_to_byte(st));
+
+ if (!j40__u(st, 1)) { // !all_default
+ int full_frame = 1;
+ uint64_t flags;
+ f->type = (enum j40__frame_type) j40__u(st, 2);
+ f->is_modular = j40__u(st, 1);
+ flags = j40__u64(st);
+ f->has_noise = (int) (flags & 1);
+ f->has_patches = (int) (flags >> 1 & 1);
+ f->has_splines = (int) (flags >> 4 & 1);
+ f->use_lf_frame = (int) (flags >> 5 & 1);
+ f->skip_adapt_lf_smooth = (int) (flags >> 7 & 1);
+ if (!im->xyb_encoded) f->do_ycbcr = j40__u(st, 1);
+ if (!f->use_lf_frame) {
+ if (f->do_ycbcr) f->jpeg_upsampling = j40__u(st, 6); // yes, we are lazy
+ f->log_upsampling = j40__u(st, 2);
+ J40__SHOULD(f->log_upsampling == 0, "TODO: upsampling is not yet implemented");
+ J40__TRY_MALLOC(int32_t, &f->ec_log_upsampling, (size_t) im->num_extra_channels);
+ for (i = 0; i < im->num_extra_channels; ++i) {
+ f->ec_log_upsampling[i] = j40__u(st, 2);
+ J40__SHOULD(f->ec_log_upsampling[i] == 0, "TODO: upsampling is not yet implemented");
+ }
+ }
+ if (f->is_modular) {
+ f->group_size_shift = 7 + j40__u(st, 2);
+ } else if (im->xyb_encoded) {
+ f->x_qm_scale = j40__u(st, 3);
+ f->b_qm_scale = j40__u(st, 3);
+ }
+ if (f->type != J40__FRAME_REFONLY) {
+ f->num_passes = j40__u32(st, 1, 0, 2, 0, 3, 0, 4, 3);
+ if (f->num_passes > 1) {
+ // SPEC this part is especially flaky and the spec and libjxl don't agree with each other.
+ // we do the most sensible thing that is still compatible with libjxl:
+ // - downsample should be decreasing (or stay the same)
+ // - last_pass should be strictly increasing and last_pass[0] (if any) should be 0
+ // see also https://github.com/libjxl/libjxl/issues/1401
+ int8_t log_ds[4];
+ int32_t ppass = 0, num_ds = j40__u32(st, 0, 0, 1, 0, 2, 0, 3, 1);
+ J40__SHOULD(num_ds < f->num_passes, "pass");
+ for (i = 0; i < f->num_passes - 1; ++i) f->shift[i] = (int8_t) j40__u(st, 2);
+ f->shift[f->num_passes - 1] = 0;
+ for (i = 0; i < num_ds; ++i) {
+ log_ds[i] = (int8_t) j40__u(st, 2);
+ if (i > 0) J40__SHOULD(log_ds[i - 1] >= log_ds[i], "pass");
+ }
+ for (i = 0; i < num_ds; ++i) {
+ int32_t pass = j40__u32(st, 0, 0, 1, 0, 2, 0, 0, 3);
+ J40__SHOULD(i > 0 ? ppass < pass && pass < f->num_passes : pass == 0, "pass");
+ while (ppass < pass) f->log_ds[++ppass] = i > 0 ? log_ds[i - 1] : 3;
+ }
+ while (ppass < f->num_passes) f->log_ds[++ppass] = i > 0 ? log_ds[num_ds - 1] : 3;
+ }
+ }
+ if (f->type == J40__FRAME_LF) {
+ f->lf_level = j40__u(st, 2) + 1;
+ } else if (j40__u(st, 1)) { // have_crop
+ if (f->type != J40__FRAME_REFONLY) { // SPEC missing UnpackSigned
+ f->x0 = j40__unpack_signed(j40__u32(st, 0, 8, 256, 11, 2304, 14, 18688, 30));
+ f->y0 = j40__unpack_signed(j40__u32(st, 0, 8, 256, 11, 2304, 14, 18688, 30));
+ }
+ f->width = j40__u32(st, 0, 8, 256, 11, 2304, 14, 18688, 30);
+ f->height = j40__u32(st, 0, 8, 256, 11, 2304, 14, 18688, 30);
+ J40__SHOULD(f->width <= st->limits->width && f->height <= st->limits->height, "slim");
+ J40__SHOULD((int64_t) f->width * f->height <= st->limits->pixels, "slim");
+ full_frame = f->x0 <= 0 && f->y0 <= 0 &&
+ f->width + f->x0 >= im->width && f->height + f->y0 >= im->height;
+ }
+ if (f->type == J40__FRAME_REGULAR || f->type == J40__FRAME_REGULAR_SKIPPROG) {
+ J40__TRY_MALLOC(j40__blend_info, &f->ec_blend_info, (size_t) im->num_extra_channels);
+ for (i = -1; i < im->num_extra_channels; ++i) {
+ j40__blend_info *blend = i < 0 ? &f->blend_info : &f->ec_blend_info[i];
+ blend->mode = (int8_t) j40__u32(st, 0, 0, 1, 0, 2, 0, 3, 2);
+ if (im->num_extra_channels > 0) {
+ if (blend->mode == J40__BLEND_BLEND || blend->mode == J40__BLEND_MUL_ADD) {
+ blend->alpha_chan = (int8_t) j40__u32(st, 0, 0, 1, 0, 2, 0, 3, 3);
+ blend->clamp = (int8_t) j40__u(st, 1);
+ } else if (blend->mode == J40__BLEND_MUL) {
+ blend->clamp = (int8_t) j40__u(st, 1);
+ }
+ }
+ if (!full_frame || blend->mode != J40__BLEND_REPLACE) {
+ blend->src_ref_frame = (int8_t) j40__u(st, 2);
+ }
+ }
+ if (im->anim_tps_denom) { // have_animation stored implicitly
+ f->duration = j40__64u32(st, 0, 0, 1, 0, 0, 8, 0, 32);
+ if (im->anim_have_timecodes) {
+ f->timecode = j40__64u(st, 32);
+ }
+ }
+ f->is_last = j40__u(st, 1);
+ } else {
+ f->is_last = 0;
+ }
+ if (f->type != J40__FRAME_LF && !f->is_last) f->save_as_ref = j40__u(st, 2);
+ // SPEC this condition is essentially swapped with the default value in the spec
+ if (f->type == J40__FRAME_REFONLY || (
+ full_frame &&
+ (f->type == J40__FRAME_REGULAR || f->type == J40__FRAME_REGULAR_SKIPPROG) &&
+ f->blend_info.mode == J40__BLEND_REPLACE &&
+ (f->duration == 0 || f->save_as_ref != 0) &&
+ !f->is_last
+ )) {
+ f->save_before_ct = j40__u(st, 1);
+ } else {
+ f->save_before_ct = (f->type == J40__FRAME_LF);
+ }
+ J40__TRY(j40__name(st, &f->name_len, &f->name));
+ { // RestorationFilter
+ int restoration_all_default = j40__u(st, 1);
+ f->gab.enabled = restoration_all_default ? 1 : j40__u(st, 1);
+ if (f->gab.enabled) {
+ if (j40__u(st, 1)) { // gab_custom
+ for (i = 0; i < 3; ++i) {
+ for (j = 0; j < 2; ++j) f->gab.weights[i][j] = j40__f16(st);
+ }
+ }
+ }
+ f->epf.iters = restoration_all_default ? 2 : j40__u(st, 2);
+ if (f->epf.iters) {
+ if (!f->is_modular && j40__u(st, 1)) { // epf_sharp_custom
+ for (i = 0; i < 8; ++i) f->epf.sharp_lut[i] = j40__f16(st);
+ }
+ if (j40__u(st, 1)) { // epf_weight_custom
+ for (i = 0; i < 3; ++i) f->epf.channel_scale[i] = j40__f16(st);
+ J40__TRY(j40__skip(st, 32)); // ignored
+ }
+ if (j40__u(st, 1)) { // epf_sigma_custom
+ if (!f->is_modular) f->epf.quant_mul = j40__f16(st);
+ f->epf.pass0_sigma_scale = j40__f16(st);
+ f->epf.pass2_sigma_scale = j40__f16(st);
+ f->epf.border_sad_mul = j40__f16(st);
+ }
+ if (f->epf.iters && f->is_modular) f->epf.sigma_for_modular = j40__f16(st);
+ }
+ if (!restoration_all_default) J40__TRY(j40__extensions(st));
+ }
+ J40__TRY(j40__extensions(st));
+ }
+ J40__RAISE_DELAYED();
+
+ if (im->xyb_encoded && im->want_icc) f->save_before_ct = 1; // ignores the decoded bit
+ f->grows = j40__ceil_div32(f->height, 1 << f->group_size_shift);
+ f->gcolumns = j40__ceil_div32(f->width, 1 << f->group_size_shift);
+ f->num_groups = (int64_t) f->grows * f->gcolumns;
+ f->ggrows = j40__ceil_div32(f->height, 8 << f->group_size_shift);
+ f->ggcolumns = j40__ceil_div32(f->width, 8 << f->group_size_shift);
+ f->num_lf_groups = (int64_t) f->ggrows * f->ggcolumns;
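+ // e.g. a 1024x768 frame with the default group_size_shift of 8 uses 256x256 groups,
+ // so 4 x 3 = 12 groups and a single 2048x2048 LF group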
+ return 0;
+
+J40__ON_ERROR:
+ j40__free(f->ec_log_upsampling);
+ j40__free(f->ec_blend_info);
+ j40__free(f->name);
+ f->ec_log_upsampling = NULL;
+ f->ec_blend_info = NULL;
+ f->name = NULL;
+ return st->err;
+}
+
+#endif // defined J40_IMPLEMENTATION
+
+////////////////////////////////////////////////////////////////////////////////
+// frame TOC (table of contents)
+
+typedef struct {
+ int64_t idx; // either LF group index (pass < 0) or group index (pass >= 0)
+ int64_t codeoff;
+ int32_t size;
+ int32_t pass; // pass number, or negative if this is an LF group section
+} j40__section;
+
+typedef struct {
+ // if nonzero, there is only single section of this size and other fields are ignored
+ int32_t single_size;
+
+ // LfGlobal and HfGlobal are common dependencies of other sections, and handled separately
+ int64_t lf_global_codeoff, hf_global_codeoff;
+ int32_t lf_global_size, hf_global_size;
+
+ // other sections are ordered by codeoff, unless the earlier section needs the later section
+ // for decoding, in which case the order gets swapped
+ int64_t nsections, nsections_read;
+ j40__section *sections;
+ int64_t end_codeoff;
+} j40__toc;
+
+J40__STATIC_RETURNS_ERR j40__permutation(
+ j40__st *st, j40__code_st *code, int32_t size, int32_t skip, int32_t **out
+);
+J40_INLINE void j40__apply_permutation(void *targetbuf, void *temp, size_t elemsize, const int32_t *lehmer);
+J40__STATIC_RETURNS_ERR j40__read_toc(j40__st *st, j40__toc *toc);
+J40_STATIC void j40__free_toc(j40__toc *toc);
+
+#ifdef J40_IMPLEMENTATION
+
+// also used in j40__hf_global; out is terminated by a sentinel (-1) or NULL if empty
+// TODO permutation may have to handle more than 2^31 entries
+J40__STATIC_RETURNS_ERR j40__permutation(
+ j40__st *st, j40__code_st *code, int32_t size, int32_t skip, int32_t **out
+) {
+ int32_t *arr = NULL;
+ int32_t i, prev, end;
+
+ J40__ASSERT(code->spec->num_dist == 8 + !!code->spec->lz77_enabled);
+
+ // SPEC this is the number of integers to read, not the last offset to read (can differ when skip > 0)
+ end = j40__code(st, j40__min32(7, j40__ceil_lg32((uint32_t) size + 1)), 0, code);
+ J40__SHOULD(end <= size - skip, "perm"); // SPEC missing
+ if (end == 0) {
+ *out = NULL;
+ return 0;
+ }
+
+ J40__TRY_MALLOC(int32_t, &arr, (size_t) (end + 1));
+ prev = 0;
+ for (i = 0; i < end; ++i) {
+ prev = arr[i] = j40__code(st, j40__min32(7, j40__ceil_lg32((uint32_t) prev + 1)), 0, code);
+ J40__SHOULD(prev < size - (skip + i), "perm"); // SPEC missing
+ }
+ arr[end] = -1; // sentinel
+ *out = arr;
+ return 0;
+
+J40__ON_ERROR:
+ j40__free(arr);
+ return st->err;
+}
+
+// target is pre-shifted by skip
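+// e.g. the Lehmer code {2, 0, 1} (plus the -1 sentinel) applied to {A,B,C,D}
+// selects the 3rd remaining element (C), then the 1st (A), then the 2nd of {B,D},
+// producing {C,A,D,B}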
+J40_INLINE void j40__apply_permutation(
+ void *targetbuf, void *temp, size_t elemsize, const int32_t *lehmer
+) {
+ char *target = (char*) targetbuf;
+ if (!lehmer) return;
+ while (*lehmer >= 0) {
+ size_t x = (size_t) *lehmer++;
+ memcpy(temp, target + elemsize * x, elemsize);
+ memmove(target + elemsize, target, elemsize * x);
+ memcpy(target, temp, elemsize);
+ target += elemsize;
+ }
+}
+
+J40_STATIC int j40__compare_section(const void *a, const void *b) {
+ const j40__section *aa = (const j40__section*) a, *bb = (const j40__section*) b;
+ return aa->codeoff < bb->codeoff ? -1 : aa->codeoff > bb->codeoff ? 1 : 0;
+}
+
+J40__STATIC_RETURNS_ERR j40__read_toc(j40__st *st, j40__toc *toc) {
+ j40__frame_st *f = st->frame;
+
+ int64_t nsections = f->num_passes == 1 && f->num_groups == 1 ? 1 :
+ 1 /*lf_global*/ + f->num_lf_groups /*lf_group*/ +
+ 1 /*hf_global + hf_pass*/ + f->num_passes * f->num_groups /*group_pass*/;
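+ // e.g. 2 passes over 12 groups with a single LF group gives 1 + 1 + 1 + 24 = 27 sections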
+ int64_t nsections2;
+ j40__section *sections = NULL, *sections2 = NULL, temp;
+
+ // interleaved linked lists for each LF group; for each LF group `gg` there are three cases:
+ // - no relocated section if `relocs[gg].next == 0` (initial state).
+ // - a single relocated section `relocs[gg].section` if `relocs[gg].next < 0`.
+ // - 2+ relocated sections `relocs[k].section`, where `k` starts at `gg` and
+ //   follows `next` links until a negative value ends the chain.
+ struct reloc { int64_t next; j40__section section; } *relocs = NULL;
+ int64_t nrelocs, relocs_cap;
+
+ int32_t *lehmer = NULL;
+ j40__code_spec codespec = J40__INIT;
+ j40__code_st code = J40__INIT;
+ int64_t i, nremoved;
+ int32_t pass;
+
+ // TODO remove int32_t restrictions
+ J40__SHOULD((uint64_t) nsections <= SIZE_MAX && nsections <= INT32_MAX, "flen");
+
+ if (j40__u(st, 1)) { // permuted
+ J40__TRY(j40__read_code_spec(st, 8, &codespec));
+ j40__init_code(&code, &codespec);
+ J40__TRY(j40__permutation(st, &code, (int32_t) nsections, 0, &lehmer));
+ J40__TRY(j40__finish_and_free_code(st, &code));
+ j40__free_code_spec(&codespec);
+ }
+ J40__TRY(j40__zero_pad_to_byte(st));
+
+ // single section case: no allocation required
+ if (nsections == 1) {
+ toc->single_size = j40__u32(st, 0, 10, 1024, 14, 17408, 22, 4211712, 30);
+ J40__TRY(j40__zero_pad_to_byte(st));
+ toc->lf_global_codeoff = toc->hf_global_codeoff = 0;
+ toc->lf_global_size = toc->hf_global_size = 0;
+ toc->nsections = toc->nsections_read = 0;
+ toc->sections = NULL;
+ J40__SHOULD(j40__add64(j40__codestream_offset(st), toc->single_size, &toc->end_codeoff), "flen");
+ j40__free(lehmer);
+ return 0;
+ }
+
+ J40__TRY_MALLOC(j40__section, §ions, (size_t) nsections);
+ for (i = 0; i < nsections; ++i) {
+ sections[i].size = j40__u32(st, 0, 10, 1024, 14, 17408, 22, 4211712, 30);
+ }
+ J40__TRY(j40__zero_pad_to_byte(st));
+
+ sections[0].codeoff = j40__codestream_offset(st); // all TOC offsets are relative to this point
+ for (i = 1; i < nsections; ++i) {
+ J40__SHOULD(j40__add64(sections[i-1].codeoff, sections[i-1].size, §ions[i].codeoff), "flen");
+ }
+ J40__SHOULD(j40__add64(sections[i-1].codeoff, sections[i-1].size, &toc->end_codeoff), "flen");
+
+ if (lehmer) {
+ j40__apply_permutation(sections, &temp, sizeof(j40__section), lehmer);
+ j40__free(lehmer);
+ lehmer = NULL;
+ }
+
+ toc->lf_global_codeoff = sections[0].codeoff;
+ toc->lf_global_size = sections[0].size;
+ sections[0].codeoff = -1;
+ for (i = 0; i < f->num_lf_groups; ++i) {
+ sections[i + 1].pass = -1;
+ sections[i + 1].idx = i;
+ }
+ toc->hf_global_codeoff = sections[f->num_lf_groups + 1].codeoff;
+ toc->hf_global_size = sections[f->num_lf_groups + 1].size;
+ sections[f->num_lf_groups + 1].codeoff = -1;
+ for (pass = 0; pass < f->num_passes; ++pass) {
+ int64_t sectionid = 1 + f->num_lf_groups + 1 + pass * f->num_groups;
+ for (i = 0; i < f->num_groups; ++i) {
+ sections[sectionid + i].pass = pass;
+ sections[sectionid + i].idx = i;
+ }
+ }
+
+ // any group section depending on a later LF group section is temporarily moved to relocs
+ {
+ int32_t ggrow, ggcolumn;
+
+ J40__TRY_CALLOC(struct reloc, &relocs, (size_t) f->num_lf_groups);
+ nrelocs = relocs_cap = f->num_lf_groups;
+
+ for (ggrow = 0; ggrow < f->ggrows; ++ggrow) for (ggcolumn = 0; ggcolumn < f->ggcolumns; ++ggcolumn) {
+ int64_t ggidx = (int64_t) ggrow * f->ggcolumns + ggcolumn, ggsection = 1 + ggidx;
+ int64_t ggcodeoff = sections[ggsection].codeoff;
+ int64_t gsection_base =
+ 1 + f->num_lf_groups + 1 + (int64_t) (ggrow * 8) * f->gcolumns + (ggcolumn * 8);
+ int32_t grows_in_gg = j40__min32((ggrow + 1) * 8, f->grows) - ggrow * 8;
+ int32_t gcolumns_in_gg = j40__min32((ggcolumn + 1) * 8, f->gcolumns) - ggcolumn * 8;
+ int32_t grow_in_gg, gcolumn_in_gg;
+
+ for (pass = 0; pass < f->num_passes; ++pass) {
+ for (grow_in_gg = 0; grow_in_gg < grows_in_gg; ++grow_in_gg) {
+ for (gcolumn_in_gg = 0; gcolumn_in_gg < gcolumns_in_gg; ++gcolumn_in_gg) {
+ int64_t gsection = gsection_base + pass * f->num_groups +
+ (grow_in_gg * f->gcolumns + gcolumn_in_gg);
+ if (sections[gsection].codeoff > ggcodeoff) continue;
+ if (relocs[ggidx].next) {
+ J40__TRY_REALLOC64(struct reloc, &relocs, nrelocs + 1, &relocs_cap);
+ relocs[nrelocs] = relocs[ggidx];
+ relocs[ggidx].next = nrelocs++;
+ } else {
+ relocs[ggidx].next = -1;
+ }
+ relocs[ggidx].section = sections[gsection];
+ sections[gsection].codeoff = -1;
+ }
+ }
+ }
+ }
+ }
+
+ // remove any section whose codeoff is -1, then sort the remainder by codeoff
+ for (i = nremoved = 0; i < nsections; ++i) {
+ if (sections[i].codeoff < 0) {
+ ++nremoved;
+ } else {
+ sections[i - nremoved] = sections[i];
+ }
+ }
+ qsort(sections, (size_t) (nsections - nremoved), sizeof(j40__section), j40__compare_section);
+
+ // copy sections to sections2, but insert any relocated sections after corresponding LF group section
+ J40__TRY_MALLOC(j40__section, §ions2, (size_t) nsections);
+ nsections2 = 0;
+ for (i = 0; i < nsections - nremoved; ++i) {
+ int64_t j, first_reloc_off;
+ sections2[nsections2++] = sections[i];
+ if (sections[i].pass >= 0) continue;
+ j = sections[i].idx;
+ if (!relocs[j].next) continue;
+ first_reloc_off = nsections2;
+ while (j >= 0) {
+ sections2[nsections2++] = relocs[j].section;
+ j = relocs[j].next;
+ }
+ qsort(sections2 + first_reloc_off, (size_t) (nsections2 - first_reloc_off),
+ sizeof(j40__section), j40__compare_section);
+ }
+
+ toc->sections = sections2;
+ toc->nsections = nsections2;
+ toc->nsections_read = 0;
+ J40__ASSERT(nsections2 == nsections - 2); // excludes LfGlobal and HfGlobal
+
+ j40__free(sections);
+ j40__free(relocs);
+ j40__free(lehmer);
+ j40__free_code(&code);
+ j40__free_code_spec(&codespec);
+ return 0;
+
+J40__ON_ERROR:
+ j40__free(sections);
+ j40__free(sections2);
+ j40__free(relocs);
+ j40__free(lehmer);
+ j40__free_code(&code);
+ j40__free_code_spec(&codespec);
+ return st->err;
+}
+
+J40_STATIC void j40__free_toc(j40__toc *toc) {
+ j40__free(toc->sections);
+ toc->sections = NULL;
+}
+
+#endif // defined J40_IMPLEMENTATION
+
+////////////////////////////////////////////////////////////////////////////////
+// DCT
+
+// both use `in` as a scratch space as well, so `in` will be altered after return
+J40_STATIC void j40__forward_dct_unscaled(
+ float *J40_RESTRICT out, float *J40_RESTRICT in, int32_t t, int32_t rep
+);
+J40_STATIC void j40__inverse_dct(
+ float *J40_RESTRICT out, float *J40_RESTRICT in, int32_t t, int32_t rep
+);
+
+J40_STATIC void j40__forward_dct2d_scaled_for_llf(
+ float *J40_RESTRICT buf, float *J40_RESTRICT scratch, int32_t log_rows, int32_t log_columns
+);
+J40_STATIC void j40__inverse_dct2d(
+ float *J40_RESTRICT buf, float *J40_RESTRICT scratch, int32_t log_rows, int32_t log_columns
+);
+
+J40_STATIC void j40__inverse_dct11(float *buf);
+J40_STATIC void j40__inverse_dct22(float *buf);
+J40_STATIC void j40__inverse_hornuss(float *buf);
+J40_STATIC void j40__inverse_dct32(float *buf);
+J40_STATIC void j40__inverse_dct23(float *buf);
+J40_STATIC void j40__inverse_afv22(float *J40_RESTRICT out, float *J40_RESTRICT in);
+J40_STATIC void j40__inverse_afv(float *buf, int flipx, int flipy);
+
+#ifdef J40_IMPLEMENTATION
+
+// this is more or less a direct translation of mcos2/3 algorithms described in:
+// Perera, S. M., & Liu, J. (2018). Lowest Complexity Self-Recursive Radix-2 DCT II/III Algorithms.
+// SIAM Journal on Matrix Analysis and Applications, 39(2), 664--682.
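+// (in the paper's terms, mcos2 is the unscaled forward DCT-II and mcos3 its inverse DCT-III)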
+
+// [(1<<n) + k] = 1/(2 cos((k+0.5)/2^(n+1) pi)) for n >= 1 and 0 <= k < 2^n
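+// e.g. [2] (n=1, k=0) = 1/(2 cos(pi/8)) = 0.54119610f, the DCT-4 constant below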
+J40_STATIC const float J40__HALF_SECANTS[256] = {
+ 0, 0, // unused
+ 0.54119610f, 1.30656296f, // n=1 for DCT-4
+ 0.50979558f, 0.60134489f, 0.89997622f, 2.56291545f, // n=2 for DCT-8
+ // n=3 for DCT-16
+ 0.50241929f, 0.52249861f, 0.56694403f, 0.64682178f, 0.78815462f, 1.06067769f, 1.72244710f, 5.10114862f,
+ // n=4 for DCT-32
+ 0.50060300f, 0.50547096f, 0.51544731f, 0.53104259f, 0.55310390f, 0.58293497f, 0.62250412f, 0.67480834f,
+ 0.74453627f, 0.83934965f, 0.97256824f, 1.16943993f, 1.48416462f, 2.05778101f, 3.40760842f, 10.1900081f,
+ // n=5 for DCT-64
+ 0.50015064f, 0.50135845f, 0.50378873f, 0.50747117f, 0.51245148f, 0.51879271f, 0.52657732f, 0.53590982f,
+ 0.54692044f, 0.55976981f, 0.57465518f, 0.59181854f, 0.61155735f, 0.63423894f, 0.66031981f, 0.69037213f,
+ 0.72512052f, 0.76549416f, 0.81270209f, 0.86834472f, 0.93458360f, 1.01440826f, 1.11207162f, 1.23383274f,
+ 1.38929396f, 1.59397228f, 1.87467598f, 2.28205007f, 2.92462843f, 4.08461108f, 6.79675071f, 20.3738782f,
+ // n=6 for DCT-128
+ 0.50003765f, 0.50033904f, 0.50094272f, 0.50185052f, 0.50306519f, 0.50459044f, 0.50643095f, 0.50859242f,
+ 0.51108159f, 0.51390633f, 0.51707566f, 0.52059987f, 0.52449054f, 0.52876071f, 0.53342493f, 0.53849944f,
+ 0.54400225f, 0.54995337f, 0.55637499f, 0.56329167f, 0.57073059f, 0.57872189f, 0.58729894f, 0.59649876f,
+ 0.60636246f, 0.61693573f, 0.62826943f, 0.64042034f, 0.65345190f, 0.66743520f, 0.68245013f, 0.69858665f,
+ 0.71594645f, 0.73464482f, 0.75481294f, 0.77660066f, 0.80017990f, 0.82574877f, 0.85353675f, 0.88381100f,
+ 0.91688445f, 0.95312587f, 0.99297296f, 1.03694904f, 1.08568506f, 1.13994868f, 1.20068326f, 1.26906117f,
+ 1.34655763f, 1.43505509f, 1.53699410f, 1.65559652f, 1.79520522f, 1.96181785f, 2.16395782f, 2.41416000f,
+ 2.73164503f, 3.14746219f, 3.71524274f, 4.53629094f, 5.82768838f, 8.15384860f, 13.5842903f, 40.7446881f,
+ // n=7 for DCT-256
+ 0.50000941f, 0.50008472f, 0.50023540f, 0.50046156f, 0.50076337f, 0.50114106f, 0.50159492f, 0.50212529f,
+ 0.50273257f, 0.50341722f, 0.50417977f, 0.50502081f, 0.50594098f, 0.50694099f, 0.50802161f, 0.50918370f,
+ 0.51042817f, 0.51175599f, 0.51316821f, 0.51466598f, 0.51625048f, 0.51792302f, 0.51968494f, 0.52153769f,
+ 0.52348283f, 0.52552196f, 0.52765682f, 0.52988922f, 0.53222108f, 0.53465442f, 0.53719139f, 0.53983424f,
+ 0.54258533f, 0.54544717f, 0.54842239f, 0.55151375f, 0.55472418f, 0.55805673f, 0.56151465f, 0.56510131f,
+ 0.56882030f, 0.57267538f, 0.57667051f, 0.58080985f, 0.58509780f, 0.58953898f, 0.59413825f, 0.59890075f,
+ 0.60383188f, 0.60893736f, 0.61422320f, 0.61969575f, 0.62536172f, 0.63122819f, 0.63730265f, 0.64359303f,
+ 0.65010770f, 0.65685553f, 0.66384594f, 0.67108889f, 0.67859495f, 0.68637535f, 0.69444203f, 0.70280766f,
+ 0.71148577f, 0.72049072f, 0.72983786f, 0.73954355f, 0.74962527f, 0.76010172f, 0.77099290f, 0.78232026f,
+ 0.79410679f, 0.80637720f, 0.81915807f, 0.83247799f, 0.84636782f, 0.86086085f, 0.87599311f, 0.89180358f,
+ 0.90833456f, 0.92563200f, 0.94374590f, 0.96273078f, 0.98264619f, 1.00355728f, 1.02553551f, 1.04865941f,
+ 1.07301549f, 1.09869926f, 1.12581641f, 1.15448427f, 1.18483336f, 1.21700940f, 1.25117548f, 1.28751481f,
+ 1.32623388f, 1.36756626f, 1.41177723f, 1.45916930f, 1.51008903f, 1.56493528f, 1.62416951f, 1.68832855f,
+ 1.75804061f, 1.83404561f, 1.91722116f, 2.00861611f, 2.10949453f, 2.22139378f, 2.34620266f, 2.48626791f,
+ 2.64454188f, 2.82479140f, 3.03189945f, 3.27231159f, 3.55471533f, 3.89110779f, 4.29853753f, 4.80207601f,
+ 5.44016622f, 6.27490841f, 7.41356676f, 9.05875145f, 11.6446273f, 16.3000231f, 27.1639777f, 81.4878422f,
+};
+
+// TODO spec bug: ScaleF doesn't match the current libjxl! it turns out that this is actually
+// a set of factors for the Arai, Agui, Nakajima DCT & IDCT algorithm, which was only used in
+// older versions of libjxl (both the current libjxl and J40 use Perera-Liu), and it is
+// not even a resampling algorithm to begin with.
+//
+// [(1<<N) + k] = 1 / (cos(k/2^(4+N) pi) * cos(k/2^(3+N) pi) * cos(k/2^(2+N) pi) * 2^N)
+// for N >= 0 and 0 <= k < 2^N; in the row labels below, N counts the entries of
+// each row (i.e. 2^N here) and n = 8N is the corresponding full DCT size
+J40_STATIC const float J40__LF2LLF_SCALES[64] = {
+ 0, // unused
+ 1.00000000f, // N=1, n=8
+ 0.50000000f, 0.55446868f, // N=2, n=16
+ 0.25000000f, 0.25644002f, 0.27723434f, 0.31763984f, // N=4, n=32
+ // N=8, n=64
+ 0.12500000f, 0.12579419f, 0.12822001f, 0.13241272f, 0.13861717f, 0.14722207f, 0.15881992f, 0.17431123f,
+ // N=16, n=128
+ 0.06250000f, 0.06259894f, 0.06289709f, 0.06339849f, 0.06411001f, 0.06504154f, 0.06620636f, 0.06762155f,
+ 0.06930858f, 0.07129412f, 0.07361103f, 0.07629973f, 0.07940996f, 0.08300316f, 0.08715562f, 0.09196277f,
+ // N=32, n=256
+ 0.03125000f, 0.03126236f, 0.03129947f, 0.03136146f, 0.03144855f, 0.03156101f, 0.03169925f, 0.03186372f,
+ 0.03205500f, 0.03227376f, 0.03252077f, 0.03279691f, 0.03310318f, 0.03344071f, 0.03381077f, 0.03421478f,
+ 0.03465429f, 0.03513107f, 0.03564706f, 0.03620441f, 0.03680552f, 0.03745302f, 0.03814986f, 0.03889931f,
+ 0.03970498f, 0.04057091f, 0.04150158f, 0.04250201f, 0.04357781f, 0.04473525f, 0.04598138f, 0.04732417f,
+};
+
+#define J40__SQRT2 1.4142135623730951f
+
+#define J40__DCT_ARGS float *J40_RESTRICT out, float *J40_RESTRICT in, int32_t t
+#define J40__REPEAT1() for (r1 = 0; r1 < rep1 * rep2; r1 += rep2)
+#define J40__REPEAT2() for (r2 = 0; r2 < rep2; ++r2)
+#define J40__IN(i) in[(i) * stride + r1 + r2]
+#define J40__OUT(i) out[(i) * stride + r1 + r2]
+
+J40_ALWAYS_INLINE void j40__forward_dct_core(
+ J40__DCT_ARGS, int32_t rep1, int32_t rep2,
+ void (*half_forward_dct)(J40__DCT_ARGS, int32_t rep1, int32_t rep2)
+) {
+ int32_t r1, r2, i, N = 1 << t, stride = rep1 * rep2;
+
+ // out[0..N) = W^c_N H_N in[0..N)
+ J40__REPEAT1() {
+ for (i = 0; i < N / 2; ++i) {
+ float mult = J40__HALF_SECANTS[N / 2 + i];
+ J40__REPEAT2() {
+ float x = J40__IN(i), y = J40__IN(N - i - 1);
+ J40__OUT(i) = x + y;
+ J40__OUT(N / 2 + i) = (x - y) * mult;
+ }
+ }
+ }
+
+ // in[0..N/2) = mcos2(out[0..N/2), N/2)
+ // in[N/2..N) = mcos2(out[N/2..N), N/2)
+ half_forward_dct(in, out, t - 1, rep1, rep2);
+ half_forward_dct(in + N / 2 * stride, out + N / 2 * stride, t - 1, rep1, rep2);
+
+ // out[0,2..N) = in[0..N/2)
+ J40__REPEAT1() for (i = 0; i < N / 2; ++i) J40__REPEAT2() {
+ J40__OUT(i * 2) = J40__IN(i);
+ }
+
+ // out[1,3..N) = B_(N/2) in[N/2..N)
+ J40__REPEAT1() {
+ J40__REPEAT2() J40__OUT(1) = J40__SQRT2 * J40__IN(N / 2) + J40__IN(N / 2 + 1);
+ for (i = 1; i < N / 2 - 1; ++i) {
+ J40__REPEAT2() J40__OUT(i * 2 + 1) = J40__IN(N / 2 + i) + J40__IN(N / 2 + i + 1);
+ }
+ J40__REPEAT2() J40__OUT(N - 1) = J40__IN(N - 1);
+ }
+}
+
+J40_ALWAYS_INLINE void j40__inverse_dct_core(
+ J40__DCT_ARGS, int32_t rep1, int32_t rep2,
+ void (*half_inverse_dct)(J40__DCT_ARGS, int32_t rep1, int32_t rep2)
+) {
+ int32_t r1, r2, i, N = 1 << t, stride = rep1 * rep2;
+
+ // out[0..N/2) = in[0,2..N)
+ J40__REPEAT1() {
+ for (i = 0; i < N / 2; ++i) {
+ J40__REPEAT2() J40__OUT(i) = J40__IN(i * 2);
+ }
+ }
+
+ // out[N/2..N) = (B_(N/2))^T in[1,3..N)
+ J40__REPEAT1() {
+ J40__REPEAT2() J40__OUT(N / 2) = J40__SQRT2 * J40__IN(1);
+ for (i = 1; i < N / 2; ++i) {
+ J40__REPEAT2() J40__OUT(N / 2 + i) = J40__IN(i * 2 - 1) + J40__IN(i * 2 + 1);
+ }
+ }
+
+ // in[0..N/2) = mcos3(out[0..N/2), N/2)
+ // in[N/2..N) = mcos3(out[N/2..N), N/2)
+ half_inverse_dct(in, out, t - 1, rep1, rep2);
+ half_inverse_dct(in + N / 2 * stride, out + N / 2 * stride, t - 1, rep1, rep2);
+
+ // out[0..N) = (H_N)^T W^c_N in[0..N)
+ J40__REPEAT1() {
+ for (i = 0; i < N / 2; ++i) {
+ float mult = J40__HALF_SECANTS[N / 2 + i];
+ J40__REPEAT2() {
+ float x = J40__IN(i), y = J40__IN(N / 2 + i);
+ // this might look wasteful, but modern compilers can optimize them into FMA,
+ // which can actually be faster than a single multiplication (TODO verify this)
+ J40__OUT(i) = x + y * mult;
+ J40__OUT(N - i - 1) = x - y * mult;
+ }
+ }
+ }
+}
+
+J40_ALWAYS_INLINE void j40__dct2(J40__DCT_ARGS, int32_t rep1, int32_t rep2) {
+ int32_t r1, r2, stride = rep1 * rep2;
+ J40__ASSERT(t == 1); (void) t;
+ J40__REPEAT1() J40__REPEAT2() {
+ float x = J40__IN(0), y = J40__IN(1);
+ J40__OUT(0) = x + y;
+ J40__OUT(1) = x - y;
+ }
+}
+
+J40_ALWAYS_INLINE void j40__forward_dct4(J40__DCT_ARGS, int32_t rep1, int32_t rep2) {
+ J40__ASSERT(t == 2); (void) t;
+ j40__forward_dct_core(out, in, 2, rep1, rep2, j40__dct2);
+}
+
+J40_STATIC void j40__forward_dct_recur(J40__DCT_ARGS, int32_t rep1, int32_t rep2) {
+ if (t < 4) {
+ J40__ASSERT(t == 3);
+ j40__forward_dct_core(out, in, 3, rep1, rep2, j40__forward_dct4);
+ } else {
+ j40__forward_dct_core(out, in, t, rep1, rep2, j40__forward_dct_recur);
+ }
+}
+
+J40_STATIC void j40__forward_dct_recur_x8(J40__DCT_ARGS, int32_t rep1, int32_t rep2) {
+ J40__ASSERT(rep2 == 8); (void) rep2;
+ if (t < 4) {
+ J40__ASSERT(t == 3);
+ j40__forward_dct_core(out, in, 3, rep1, 8, j40__forward_dct4);
+ } else {
+ j40__forward_dct_core(out, in, t, rep1, 8, j40__forward_dct_recur_x8);
+ }
+}
+
+// this omits the final division by (1 << t)!
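+// (j40__forward_dct2d_scaled_for_llf compensates for this via J40__LF2LLF_SCALES,
+// whose entries carry the 1/2^N factor)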
+J40_STATIC void j40__forward_dct_unscaled(J40__DCT_ARGS, int32_t rep) {
+ if (t <= 0) {
+ memcpy(out, in, sizeof(float) * (size_t) rep);
+ } else if (rep % 8 == 0) {
+ if (t == 1) j40__dct2(out, in, 1, rep / 8, 8);
+ else if (t == 2) j40__forward_dct4(out, in, 2, rep / 8, 8);
+ else j40__forward_dct_recur_x8(out, in, t, rep / 8, 8);
+ } else {
+ if (t == 1) j40__dct2(out, in, 1, rep, 1);
+ else if (t == 2) j40__forward_dct4(out, in, 2, rep, 1);
+ else j40__forward_dct_recur(out, in, t, rep, 1);
+ }
+}
+
+J40_ALWAYS_INLINE void j40__forward_dct_unscaled_view(j40__view_f32 *outv, j40__view_f32 *inv) {
+ j40__adapt_view_f32(outv, inv->logw, inv->logh);
+ j40__forward_dct_unscaled(outv->ptr, inv->ptr, inv->logh, 1 << inv->logw);
+}
+
+J40_ALWAYS_INLINE void j40__inverse_dct4(J40__DCT_ARGS, int32_t rep1, int32_t rep2) {
+ J40__ASSERT(t == 2); (void) t;
+ j40__inverse_dct_core(out, in, 2, rep1, rep2, j40__dct2);
+}
+
+J40_STATIC void j40__inverse_dct_recur(J40__DCT_ARGS, int32_t rep1, int32_t rep2) {
+ if (t < 4) {
+ J40__ASSERT(t == 3);
+ j40__inverse_dct_core(out, in, 3, rep1, rep2, j40__inverse_dct4);
+ } else {
+ j40__inverse_dct_core(out, in, t, rep1, rep2, j40__inverse_dct_recur);
+ }
+}
+
+J40_STATIC void j40__inverse_dct_recur_x8(J40__DCT_ARGS, int32_t rep1, int32_t rep2) {
+ J40__ASSERT(rep2 == 8); (void) rep2;
+ if (t < 4) {
+ J40__ASSERT(t == 3);
+ j40__inverse_dct_core(out, in, 3, rep1, 8, j40__inverse_dct4);
+ } else {
+ j40__inverse_dct_core(out, in, t, rep1, 8, j40__inverse_dct_recur_x8);
+ }
+}
+
+J40_STATIC void j40__inverse_dct(J40__DCT_ARGS, int32_t rep) {
+ if (t <= 0) {
+ memcpy(out, in, sizeof(float) * (size_t) rep);
+ } else if (rep % 8 == 0) {
+ if (t == 1) j40__dct2(out, in, 1, rep / 8, 8);
+ else if (t == 2) j40__inverse_dct4(out, in, 2, rep / 8, 8);
+ else j40__inverse_dct_recur_x8(out, in, t, rep / 8, 8);
+ } else {
+ if (t == 1) j40__dct2(out, in, 1, rep, 1);
+ else if (t == 2) j40__inverse_dct4(out, in, 2, rep, 1);
+ else j40__inverse_dct_recur(out, in, t, rep, 1);
+ }
+}
+
+J40_ALWAYS_INLINE void j40__inverse_dct_view(j40__view_f32 *outv, j40__view_f32 *inv) {
+ j40__adapt_view_f32(outv, inv->logw, inv->logh);
+ j40__inverse_dct(outv->ptr, inv->ptr, inv->logh, 1 << inv->logw);
+}
+
+#undef J40__DCT_ARGS
+#undef J40__IN
+#undef J40__OUT
+
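+// 2D DCT via the usual row-column decomposition: a 1D DCT along one axis, a transpose,
+// then a 1D DCT along the other axis, followed by per-coefficient scaling. the final
+// transpose-or-copy ensures that the output view always satisfies logw >= logh
+// (asserted below).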
+J40_STATIC void j40__forward_dct2d_scaled_for_llf(
+ float *J40_RESTRICT buf, float *J40_RESTRICT scratch, int32_t log_rows, int32_t log_columns
+) {
+ j40__view_f32 bufv = j40__make_view_f32(log_columns, log_rows, buf);
+ j40__view_f32 scratchv = j40__make_view_f32(log_columns, log_rows, scratch);
+ float *p;
+ int32_t x, y;
+
+ j40__forward_dct_unscaled_view(&scratchv, &bufv);
+ j40__transpose_view_f32(&bufv, scratchv);
+ j40__forward_dct_unscaled_view(&scratchv, &bufv);
+ // TODO spec bug (I.6.5): the pseudocode only works correctly when C > R;
+ // the condition itself can be eliminated by inlining DCT_2D though
+ J40__VIEW_FOREACH(scratchv, y, x, p) {
+		// hopefully the compiler will factor the second multiplication out of the inner loop (TODO verify this)
+ *p *= J40__LF2LLF_SCALES[(1 << scratchv.logw) + x] * J40__LF2LLF_SCALES[(1 << scratchv.logh) + y];
+ }
+ // TODO spec improvement (I.6.3 note): given the pseudocode, it might be better to
+ // state that the DCT result *always* has C <= R, transposing as necessary.
+ if (log_columns > log_rows) {
+ j40__transpose_view_f32(&bufv, scratchv);
+ } else {
+ j40__copy_view_f32(&bufv, scratchv);
+ }
+ J40__ASSERT(bufv.logw == j40__max32(log_columns, log_rows));
+ J40__ASSERT(bufv.logh == j40__min32(log_columns, log_rows));
+}
+
+J40_STATIC void j40__inverse_dct2d(
+ float *J40_RESTRICT buf, float *J40_RESTRICT scratch, int32_t log_rows, int32_t log_columns
+) {
+ j40__view_f32 bufv;
+ j40__view_f32 scratchv = j40__make_view_f32(log_columns, log_rows, scratch);
+
+ if (log_columns > log_rows) {
+ // TODO spec improvement: coefficients start being transposed, note this as well
+ bufv = j40__make_view_f32(log_columns, log_rows, buf);
+ j40__transpose_view_f32(&scratchv, bufv);
+ } else {
+ bufv = j40__make_view_f32(log_rows, log_columns, buf);
+ j40__copy_view_f32(&scratchv, bufv);
+ }
+ j40__inverse_dct_view(&bufv, &scratchv);
+ j40__transpose_view_f32(&scratchv, bufv);
+ j40__inverse_dct_view(&bufv, &scratchv);
+ J40__ASSERT(bufv.logw == log_columns && bufv.logh == log_rows);
+}
+
+// a single iteration of AuxIDCT2x2
+J40_ALWAYS_INLINE void j40__aux_inverse_dct11(float *out, float *in, int32_t x, int32_t y, int32_t S2) {
+ int32_t p = y * 8 + x, q = (y * 2) * 8 + (x * 2);
+ float c00 = in[p], c01 = in[p + S2], c10 = in[p + S2 * 8], c11 = in[p + S2 * 9];
+ out[q + 000] = c00 + c01 + c10 + c11; // r00
+ out[q + 001] = c00 + c01 - c10 - c11; // r01
+ out[q + 010] = c00 - c01 + c10 - c11; // r10
+ out[q + 011] = c00 - c01 - c10 + c11; // r11
+}
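+// (the octal offsets above are deliberate: 000, 001, 010 and 011 are 0, 1, 8 and 9,
+// i.e. the four cells of the 2x2 output block within an 8-pixel-wide row)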
+
+J40_STATIC void j40__inverse_dct11(float *buf) {
+ float scratch[64];
+ int32_t x, y;
+
+ // TODO spec issue: only the "top-left" SxS cells, not "top"
+ j40__aux_inverse_dct11(buf, buf, 0, 0, 1); // updates buf[(0..1)*8+(0..1)]
+	// updates scratch[(0..3)*8+(0..3)], copying the other elements from buf verbatim
+ memcpy(scratch, buf, sizeof(float) * 64);
+ for (y = 0; y < 2; ++y) for (x = 0; x < 2; ++x) j40__aux_inverse_dct11(scratch, buf, x, y, 2);
+ // updates the entire buf
+ for (y = 0; y < 4; ++y) for (x = 0; x < 4; ++x) j40__aux_inverse_dct11(buf, scratch, x, y, 4);
+}
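+// (each pass above doubles the resolution of the top-left corner:
+// the 1x1 DC becomes 2x2 in buf, then 4x4 in scratch, then the full 8x8 back in buf)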
+
+J40_STATIC void j40__inverse_dct22(float *buf) {
+ float scratch[64];
+ int32_t x, y;
+
+ j40__aux_inverse_dct11(buf, buf, 0, 0, 1);
+ // after the top-left inverse DCT2x2, four 4x4 submatrices are formed and IDCTed individually.
+ // IDCT itself requires transposition and the final matrices are stitched in a different way,
+	// but it turns out that the IDCT can be done in place, requiring only the final stitching.
+ //
+ // input after transposition output
+ // a1 a2 b1 b2 c1 c2 d1 d2 a1 a3 e1 e3 i1 i3 m1 m3 a1 e1 i1 m1 a2 e2 i2 m2
+ // a3 a4 b3 b4 c3 c4 d3 d4 a2 a4 e2 e4 i2 i4 m2 m4 b1 f1 j1 n1 b2 f2 j2 n2
+ // e1 e2 f1 f2 g1 g2 h1 h2 b1 b3 f1 f3 j1 j3 n1 n3 c1 g1 k1 o1 c2 g2 k2 o2
+ // e3 e4 f3 f4 g3 g4 h3 h4 ---> b2 b4 f2 f4 j2 j4 n2 n4 ---> d1 k1 l1 p1 d2 k2 l2 p2
+ // i1 i2 j1 j2 k1 k2 l1 l2 c1 c3 g1 g3 k1 k3 o1 o3 a3 e3 i3 m3 a4 e4 i4 m4
+ // i3 i4 j3 j4 k3 k4 l3 l4 c2 c4 g2 g4 k2 k4 o2 o4 b3 f3 j3 n3 b4 f4 j4 n4
+ // m1 m2 n1 n2 o1 o2 p1 p2 d1 d3 h1 h3 l1 l3 p1 p3 c3 g3 k3 o3 c4 g4 k4 o4
+ // m3 m4 n3 n4 o3 o4 p3 p4 d2 d4 h2 h4 l2 l4 p2 p4 d3 k3 l3 p3 d4 k4 l4 p4
+ //
+ // TODO spec issue: notationally `sample` is a *4-dimensional* array, which is not very clear
+ j40__inverse_dct(scratch, buf, 2, 16); // columnar IDCT for a#-m#, b#-n#, c#-o# and d#-p#
+ for (y = 0; y < 8; ++y) for (x = 0; x < 8; ++x) buf[x * 8 + y] = scratch[y * 8 + x];
+ j40__inverse_dct(scratch, buf, 2, 16); // columnar IDCT for a#-d#, e#-h#, i#-l# and m#-p#
+ for (y = 0; y < 4; ++y) for (x = 0; x < 4; ++x) {
+ buf[y * 8 + x] = scratch[(y * 2) * 8 + (x * 2)];
+ buf[y * 8 + (x + 4)] = scratch[(y * 2 + 1) * 8 + (x * 2)];
+ buf[(y + 4) * 8 + x] = scratch[(y * 2) * 8 + (x * 2 + 1)];
+ buf[(y + 4) * 8 + (x + 4)] = scratch[(y * 2 + 1) * 8 + (x * 2 + 1)];
+ }
+}
+
+J40_STATIC void j40__inverse_hornuss(float *buf) {
+ float scratch[64];
+ int32_t x, y, ix, iy;
+ memcpy(scratch, buf, sizeof(float) * 64);
+ j40__aux_inverse_dct11(scratch, buf, 0, 0, 1); // updates scratch[(0..1)*8+(0..1)]
+ for (y = 0; y < 2; ++y) for (x = 0; x < 2; ++x) {
+ int32_t pos00 = y * 8 + x, pos11 = (y + 2) * 8 + (x + 2);
+ float rsum[4] = {0}, sample11;
+ for (iy = 0; iy < 4; ++iy) for (ix = 0; ix < 4; ++ix) {
+ rsum[ix] += scratch[(y + iy * 2) * 8 + (x + ix * 2)];
+ }
+ // conceptually (SUM rsum[i]) = residual_sum + coefficients(x, y) in the spec
+ sample11 = scratch[pos00] - (rsum[0] + rsum[1] + rsum[2] + rsum[3] - scratch[pos00]) * 0.0625f;
+ scratch[pos00] = scratch[pos11];
+ scratch[pos11] = 0.0f;
+ for (iy = 0; iy < 4; ++iy) for (ix = 0; ix < 4; ++ix) {
+ buf[(4 * y + iy) * 8 + (4 * x + ix)] = scratch[(y + iy * 2) * 8 + (x + ix * 2)] + sample11;
+ }
+ }
+}
+
+J40_STATIC void j40__inverse_dct32(float *buf) {
+ float scratch[64], tmp;
+ j40__view_f32 bufv = j40__make_view_f32(3, 3, buf);
+ j40__view_f32 scratchv = j40__make_view_f32(3, 3, scratch);
+
+	// coefficients form two 4-row x 8-column matrices, one from even and one from odd rows;
+ // note that this is NOT 8 rows x 4 columns, because of transposition
+ // TODO spec issue: inconsistent naming between coeffs_8x4 and coeffs_4x8
+ tmp = *J40__AT(bufv, 0, 0) + *J40__AT(bufv, 0, 1);
+ *J40__AT(bufv, 0, 1) = *J40__AT(bufv, 0, 0) - *J40__AT(bufv, 0, 1);
+ *J40__AT(bufv, 0, 0) = tmp;
+ j40__reshape_view_f32(&bufv, 4, 2);
+ j40__inverse_dct_view(&scratchv, &bufv);
+ j40__reshape_view_f32(&scratchv, 3, 3);
+ j40__transpose_view_f32(&bufv, scratchv);
+ j40__inverse_dct_view(&scratchv, &bufv);
+ j40__oddeven_columns_to_halves_f32(&bufv, scratchv);
+ J40__ASSERT(bufv.logw == 3 && bufv.logh == 3);
+}
+
+J40_STATIC void j40__inverse_dct23(float *buf) {
+ float scratch[64];
+ j40__view_f32 bufv = j40__make_view_f32(3, 3, buf);
+ j40__view_f32 scratchv = j40__make_view_f32(3, 3, scratch);
+
+	// coefficients form two 4-row x 8-column matrices, one from even and one from odd rows
+ j40__copy_view_f32(&scratchv, bufv);
+ *J40__AT(scratchv, 0, 0) = *J40__AT(bufv, 0, 0) + *J40__AT(bufv, 0, 1);
+ *J40__AT(scratchv, 0, 1) = *J40__AT(bufv, 0, 0) - *J40__AT(bufv, 0, 1);
+ j40__transpose_view_f32(&bufv, scratchv);
+ j40__inverse_dct_view(&scratchv, &bufv);
+ j40__transpose_view_f32(&bufv, scratchv);
+ j40__reshape_view_f32(&bufv, 4, 2);
+ j40__inverse_dct_view(&scratchv, &bufv);
+ j40__reshape_view_f32(&scratchv, 3, 3);
+ j40__oddeven_rows_to_halves_f32(&bufv, scratchv);
+ J40__ASSERT(bufv.logw == 3 && bufv.logh == 3);
+}
+
+// TODO spec issue: the input is a 4x4 matrix but indexed like a 1-dimensional array
+J40_STATIC void j40__inverse_afv22(float *J40_RESTRICT out, float *J40_RESTRICT in) {
+ static const float AFV_BASIS[256] = { // AFVBasis in the specification, but transposed
+ 0.25000000f, 0.87690293f, 0.00000000f, 0.00000000f,
+ 0.00000000f, -0.41053776f, 0.00000000f, 0.00000000f,
+ 0.00000000f, 0.00000000f, 0.00000000f, 0.00000000f,
+ 0.00000000f, 0.00000000f, 0.00000000f, 0.00000000f,
+ 0.25000000f, 0.22065181f, 0.00000000f, 0.00000000f,
+ -0.70710678f, 0.62354854f, 0.00000000f, 0.00000000f,
+ 0.00000000f, 0.00000000f, 0.00000000f, 0.00000000f,
+ 0.00000000f, 0.00000000f, 0.00000000f, 0.00000000f,
+ 0.25000000f, -0.10140050f, 0.40670076f, -0.21255748f,
+ 0.00000000f, -0.06435072f, -0.45175566f, -0.30468475f,
+ 0.30179295f, 0.40824829f, 0.17478670f, -0.21105601f,
+ -0.14266085f, -0.13813540f, -0.17437603f, 0.11354987f,
+ 0.25000000f, -0.10140050f, 0.44444817f, 0.30854971f,
+ 0.00000000f, -0.06435072f, 0.15854504f, 0.51126161f,
+ 0.25792363f, 0.00000000f, 0.08126112f, 0.18567181f,
+ -0.34164468f, 0.33022826f, 0.07027907f, -0.07417505f,
+ 0.25000000f, 0.22065181f, 0.00000000f, 0.00000000f,
+ 0.70710678f, 0.62354854f, 0.00000000f, 0.00000000f,
+ 0.00000000f, 0.00000000f, 0.00000000f, 0.00000000f,
+ 0.00000000f, 0.00000000f, 0.00000000f, 0.00000000f,
+ 0.25000000f, -0.10140050f, 0.00000000f, 0.47067023f,
+ 0.00000000f, -0.06435072f, -0.04038515f, 0.00000000f,
+ 0.16272340f, 0.00000000f, 0.00000000f, 0.00000000f,
+ 0.73674975f, 0.08755115f, -0.29210266f, 0.19402893f,
+ 0.25000000f, -0.10140050f, 0.19574399f, -0.16212052f,
+ 0.00000000f, -0.06435072f, 0.00741823f, -0.29048013f,
+ 0.09520023f, 0.00000000f, -0.36753980f, 0.49215859f,
+ 0.24627108f, -0.07946707f, 0.36238173f, -0.43519050f,
+ 0.25000000f, -0.10140050f, 0.29291001f, 0.00000000f,
+ 0.00000000f, -0.06435072f, 0.39351034f, -0.06578702f,
+ 0.00000000f, -0.40824829f, -0.30788221f, -0.38525014f,
+ -0.08574019f, -0.46133749f, 0.00000000f, 0.21918685f,
+ 0.25000000f, -0.10140050f, -0.40670076f, -0.21255748f,
+ 0.00000000f, -0.06435072f, -0.45175566f, 0.30468475f,
+ 0.30179295f, -0.40824829f, -0.17478670f, 0.21105601f,
+ -0.14266085f, -0.13813540f, -0.17437603f, 0.11354987f,
+ 0.25000000f, -0.10140050f, -0.19574399f, -0.16212052f,
+ 0.00000000f, -0.06435072f, 0.00741823f, 0.29048013f,
+ 0.09520023f, 0.00000000f, 0.36753980f, -0.49215859f,
+ 0.24627108f, -0.07946707f, 0.36238173f, -0.43519050f,
+ 0.25000000f, -0.10140050f, 0.00000000f, -0.47067023f,
+ 0.00000000f, -0.06435072f, 0.11074166f, 0.00000000f,
+ -0.16272340f, 0.00000000f, 0.00000000f, 0.00000000f,
+ 0.14883399f, 0.49724647f, 0.29210266f, 0.55504438f,
+ 0.25000000f, -0.10140050f, 0.11379074f, -0.14642919f,
+ 0.00000000f, -0.06435072f, 0.08298163f, -0.23889774f,
+ -0.35312385f, -0.40824829f, 0.48266891f, 0.17419413f,
+ -0.04768680f, 0.12538059f, -0.43266080f, -0.25468277f,
+ 0.25000000f, -0.10140050f, -0.44444817f, 0.30854971f,
+ 0.00000000f, -0.06435072f, 0.15854504f, -0.51126161f,
+ 0.25792363f, 0.00000000f, -0.08126112f, -0.18567181f,
+ -0.34164468f, 0.33022826f, 0.07027907f, -0.07417505f,
+ 0.25000000f, -0.10140050f, -0.29291001f, 0.00000000f,
+ 0.00000000f, -0.06435072f, 0.39351034f, 0.06578702f,
+ 0.00000000f, 0.40824829f, 0.30788221f, 0.38525014f,
+ -0.08574019f, -0.46133749f, 0.00000000f, 0.21918685f,
+ 0.25000000f, -0.10140050f, -0.11379074f, -0.14642919f,
+ 0.00000000f, -0.06435072f, 0.08298163f, 0.23889774f,
+ -0.35312385f, 0.40824829f, -0.48266891f, -0.17419413f,
+ -0.04768680f, 0.12538059f, -0.43266080f, -0.25468277f,
+ 0.25000000f, -0.10140050f, 0.00000000f, 0.42511496f,
+ 0.00000000f, -0.06435072f, -0.45175566f, 0.00000000f,
+ -0.60358590f, 0.00000000f, 0.00000000f, 0.00000000f,
+ -0.14266085f, -0.13813540f, 0.34875205f, 0.11354987f,
+ };
+
+ int32_t i, j;
+ for (i = 0; i < 16; ++i) {
+ float sum = 0.0f;
+ for (j = 0; j < 16; ++j) sum += in[j] * AFV_BASIS[i * 16 + j];
+ out[i] = sum;
+ }
+}
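+// (the above is a plain dense 16x16 matrix-vector product, i.e. 256 multiply-adds per
+// AFV block; probably too small to benefit from a fast transform)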
+
+J40_STATIC void j40__inverse_afv(float *buf, int flipx, int flipy) {
+ // input flipx/y=0/0 flipx/y=1/0 flipx/y=0/1 flipx/y=1/1
+ // _______ +-----+-----+ +-----+-----+ +-----------+ +-----------+
+ // |_|_|_|_| |' | | | | '| | | | |
+ // |_|_|_|_| ---> |AFV22|DCT22| |DCT22|AFV22| | DCT23 | | DCT23 |
+ // |_|_|_|_| +-----+-----+ +-----+-----+ +-----+-----+ +-----+-----+
+ // |_|_|_|_| | DCT23 | | DCT23 | |AFV22|DCT22| |DCT22|AFV22|
+ // | | | | |. | | | | .|
+ // (2x2 each) +-----------+ +-----------+ +-----+-----+ +-----+-----+
+ //
+	// coefficients are divided into 16 2x2 blocks, where the two top coefficients go to AFV22
+	// and DCT22 respectively and the two bottom coefficients go to DCT23.
+ // all three corresponding DC coefficients are in the top-left block and handled specially.
+ // AFV22 samples are then flipped so that the top-left cell is moved to the corner (dots above).
+ //
+ // TODO spec issue: identifiers have `*` in place of `x`
+
+ float scratch[64];
+	// buf23/buf32 etc. refer to the same memory region; the numbers refer to the intended dimensions
+ float *bufafv = buf, *buf22 = buf + 16, *buf23 = buf + 32, *buf32 = buf23;
+ float *scratchafv = scratch, *scratch22 = scratch + 16, *scratch23 = scratch + 32, *scratch32 = scratch23;
+ int32_t x, y;
+
+ J40__ASSERT(flipx == !!flipx && flipy == !!flipy);
+
+ for (y = 0; y < 8; y += 2) for (x = 0; x < 8; ++x) {
+ // AFV22 coefficients to scratch[0..16), DCT22 coefficients to scratch[16..32)
+ scratch[(x % 2) * 16 + (y / 2) * 4 + (x / 2)] = buf[y * 8 + x];
+ }
+ for (y = 1; y < 8; y += 2) for (x = 0; x < 8; ++x) {
+ // DCT23 coefficients to scratch[32..64) = scratch32[0..32), after transposition
+ scratch32[x * 4 + (y / 2)] = buf[y * 8 + x];
+ }
+ scratchafv[0] = (buf[0] + buf[1] + buf[8]) * 4.0f;
+ scratch22[0] = buf[0] - buf[1] + buf[8]; // TODO spec bug: x and y are swapped
+ scratch32[0] = buf[0] - buf[8]; // TODO spec bug: x and y are swapped
+
+ j40__inverse_afv22(bufafv, scratchafv);
+ j40__inverse_dct(buf22, scratch22, 2, 4);
+ j40__inverse_dct(buf32, scratch32, 3, 4);
+
+ for (y = 0; y < 4; ++y) {
+ for (x = 0; x < 4; ++x) scratchafv[y * 4 + x] = bufafv[y * 4 + x]; // AFV22, as is
+ for (x = 0; x < 4; ++x) scratch22[x * 4 + y] = buf22[y * 4 + x]; // DCT22, transposed
+ }
+ for (y = 0; y < 8; ++y) {
+ for (x = 0; x < 4; ++x) scratch23[x * 8 + y] = buf32[y * 4 + x]; // DCT23, transposed
+ }
+
+ j40__inverse_dct(buf22, scratch22, 2, 4);
+ j40__inverse_dct(buf23, scratch23, 2, 8);
+ memcpy(scratch + 16, buf + 16, sizeof(float) * 48);
+
+ for (y = 0; y < 4; ++y) {
+ static const int8_t FLIP_FOR_AFV[2][4] = {{0, 1, 2, 3}, {7, 6, 5, 4}};
+ int32_t afv22pos = FLIP_FOR_AFV[flipy][y] * 8;
+ int32_t dct22pos = (flipy * 4 + y) * 8 + (!flipx * 4);
+ int32_t dct23pos = (!flipy * 4 + y) * 8;
+ for (x = 0; x < 4; ++x) buf[afv22pos + FLIP_FOR_AFV[flipx][x]] = scratchafv[y * 4 + x];
+ for (x = 0; x < 4; ++x) buf[dct22pos + x] = scratch22[y * 4 + x];
+ // TODO spec issue: samples_4x4 should be samples_4x8
+ for (x = 0; x < 8; ++x) buf[dct23pos + x] = scratch23[y * 8 + x];
+ }
+}
+
+#endif // defined J40_IMPLEMENTATION
+
+////////////////////////////////////////////////////////////////////////////////
+// LfGlobal: additional image features, HF block context, global tree, extra channels
+
+J40__STATIC_RETURNS_ERR j40__lf_global(j40__st *st);
+
+#ifdef J40_IMPLEMENTATION
+
+J40__STATIC_RETURNS_ERR j40__lf_global(j40__st *st) {
+ j40__frame_st *f = st->frame;
+ int32_t sidx = 0;
+ int32_t i, j;
+
+ if (f->has_patches) J40__RAISE("TODO: patches");
+ if (f->has_splines) J40__RAISE("TODO: splines");
+ if (f->has_noise) J40__RAISE("TODO: noise");
+
+ if (!j40__u(st, 1)) { // LfChannelDequantization.all_default
+ // TODO spec bug: missing division by 128
+ for (i = 0; i < 3; ++i) f->m_lf_scaled[i] = j40__f16(st) / 128.0f;
+ }
+
+ if (!f->is_modular) {
+ f->global_scale = j40__u32(st, 1, 11, 2049, 11, 4097, 12, 8193, 16);
+ f->quant_lf = j40__u32(st, 16, 0, 1, 5, 1, 8, 1, 16);
+
+ // HF block context
+ if (j40__u(st, 1)) {
+ static const uint8_t DEFAULT_BLKCTX[] = {
+ 0, 1, 2, 2, 3, 3, 4, 5, 6, 6, 6, 6, 6,
+ 7, 8, 9, 9, 10, 11, 12, 13, 14, 14, 14, 14, 14,
+ 7, 8, 9, 9, 10, 11, 12, 13, 14, 14, 14, 14, 14,
+ };
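+			// (the default map above has 39 = 13 x 3 entries with a maximum value of 14,
+			// hence nb_block_ctx = 15 below)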
+ f->block_ctx_size = sizeof(DEFAULT_BLKCTX) / sizeof(*DEFAULT_BLKCTX);
+ J40__TRY_MALLOC(uint8_t, &f->block_ctx_map, sizeof(DEFAULT_BLKCTX));
+ memcpy(f->block_ctx_map, DEFAULT_BLKCTX, sizeof(DEFAULT_BLKCTX));
+ f->nb_qf_thr = f->nb_lf_thr[0] = f->nb_lf_thr[1] = f->nb_lf_thr[2] = 0; // SPEC is implicit
+ f->nb_block_ctx = 15;
+ } else {
+ J40__RAISE_DELAYED();
+ f->block_ctx_size = 39; // SPEC not 27
+ for (i = 0; i < 3; ++i) {
+ f->nb_lf_thr[i] = j40__u(st, 4);
+				// TODO spec question: should this be sorted? (the current code is okay with that)
+ for (j = 0; j < f->nb_lf_thr[i]; ++j) {
+ f->lf_thr[i][j] = (int32_t) j40__unpack_signed64(j40__64u32(st, 0, 4, 16, 8, 272, 16, 65808, 32));
+ }
+ f->block_ctx_size *= f->nb_lf_thr[i] + 1; // SPEC is off by one
+ }
+ f->nb_qf_thr = j40__u(st, 4);
+ // TODO spec bug: both qf_thr[i] and HfMul should be incremented
+ for (i = 0; i < f->nb_qf_thr; ++i) f->qf_thr[i] = j40__u32(st, 0, 2, 4, 3, 12, 5, 44, 8) + 1;
+ f->block_ctx_size *= f->nb_qf_thr + 1; // SPEC is off by one
+ // block_ctx_size <= 39*15^4 and never overflows
+ J40__SHOULD(f->block_ctx_size <= 39 * 64, "hfbc"); // SPEC limit is not 21*64
+ J40__TRY(j40__cluster_map(st, f->block_ctx_size, 16, &f->nb_block_ctx, &f->block_ctx_map));
+ }
+
+ if (!j40__u(st, 1)) { // LfChannelCorrelation.all_default
+ f->inv_colour_factor = 1.0f / (float) j40__u32(st, 84, 0, 256, 0, 2, 8, 258, 16);
+ f->base_corr_x = j40__f16(st);
+ f->base_corr_b = j40__f16(st);
+ f->x_factor_lf = j40__u(st, 8) - 127;
+ f->b_factor_lf = j40__u(st, 8) - 127;
+ }
+ }
+
+ // we need f->gmodular.num_channels for j40__tree
+ J40__TRY(j40__init_modular_for_global(st, f->is_modular, f->do_ycbcr,
+ f->log_upsampling, f->ec_log_upsampling, f->width, f->height, &f->gmodular));
+
+ if (j40__u(st, 1)) { // global tree present
+ int32_t max_tree_size = j40__min32(1 << 22,
+ 1024 + j40__clamp_mul32(j40__clamp_mul32(f->width, f->height), f->gmodular.num_channels) / 16);
+ J40__TRY(j40__tree(st, max_tree_size, &f->global_tree, &f->global_codespec));
+ }
+
+ if (f->gmodular.num_channels > 0) {
+ J40__TRY(j40__modular_header(st, f->global_tree, &f->global_codespec, &f->gmodular));
+ J40__TRY(j40__allocate_modular(st, &f->gmodular));
+ if (f->width <= (1 << f->group_size_shift) && f->height <= (1 << f->group_size_shift)) {
+ f->num_gm_channels = f->gmodular.num_channels;
+ } else {
+ f->num_gm_channels = f->gmodular.nb_meta_channels;
+ }
+ for (i = 0; i < f->num_gm_channels; ++i) {
+ J40__TRY(j40__modular_channel(st, &f->gmodular, i, sidx));
+ }
+ J40__TRY(j40__finish_and_free_code(st, &f->gmodular.code));
+ } else {
+ f->num_gm_channels = 0;
+ }
+
+J40__ON_ERROR:
+ return st->err;
+}
+
+#endif // defined J40_IMPLEMENTATION
+
+////////////////////////////////////////////////////////////////////////////////
+// LfGroup: downsampled LF image (optionally smoothed), varblock information
+
+typedef struct {
+ int32_t coeffoff_qfidx; // offset to coeffs (always a multiple of 64) | qf index (always < 16)
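+	// (e.g. offset 448 with qf index 3 is stored as 448 | 3 == 451, and is decoded
+	// with & ~15 and & 15 respectively; see j40__hf_coeffs)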
+ union {
+ int32_t m1; // HfMul - 1 during j40__hf_metadata, to avoid overflow at this stage
+ float inv; // 1 / HfMul after j40__hf_metadata
+ } hfmul;
+ // DctSelect is embedded in blocks
+} j40__varblock;
+
+typedef struct j40__lf_group_st {
+ int64_t idx;
+
+ int32_t left, top;
+ int32_t width, height; // <= 8192
+ int32_t width8, height8; // <= 1024
+ int32_t width64, height64; // <= 128
+
+ // contained group indices: [gidx + gstride * y, gidx + gstride * y + gcolumns) for each row
+ int64_t gidx, grows, gcolumns, gstride;
+
+ j40__plane xfromy, bfromy; // width64 x height64 each
+ j40__plane sharpness; // width8 x height8
+
+ int32_t nb_varblocks; // <= 2^20 (TODO spec issue: named nb_blocks)
+ // bits 0..19: varblock index [0, nb_varblocks)
+ // bits 20..24: DctSelect + 2, or 1 if not the top-left corner (0 is reserved for unused block)
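+	// (e.g. the top-left cell of varblock #5 with DctSelect 3 holds (3 + 2) << 20 | 5;
+	// this field is filled in by j40__hf_metadata)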
+ j40__plane blocks; // width8 x height8
+ j40__varblock *varblocks; // [nb_varblocks]
+
+ float *llfcoeffs[3]; // [width8*height8] each
+ // TODO coeffs can be integers before dequantization
+ float *coeffs[3]; // [width8*height8*64] each, aligned
+ #define J40__COEFFS_ALIGN 64
+ uint8_t coeffs_misalign[3];
+
+ // precomputed lf_idx
+ j40__plane lfindices; // [width8*height8]
+
+ int loaded;
+} j40__lf_group_st;
+
+J40__STATIC_RETURNS_ERR j40__lf_quant(
+ j40__st *st, int32_t extra_prec, j40__modular *m, j40__lf_group_st *gg, j40__plane outlfquant[3]
+);
+J40__STATIC_RETURNS_ERR j40__hf_metadata(
+ j40__st *st, int32_t nb_varblocks,
+ j40__modular *m, const j40__plane lfquant[3], j40__lf_group_st *gg
+);
+J40__STATIC_RETURNS_ERR j40__lf_group(j40__st *st, j40__lf_group_st *gg);
+J40_STATIC void j40__free_lf_group(j40__lf_group_st *gg);
+
+// ----------------------------------------
+// recursion for LF dequantization operations
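+// (the header #includes itself with J40__P set to 16 and then 32, stamping out
+// j40__dequant_lf16/32 and j40__add_thresholds16/32 from the templates below)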
+#undef J40__RECURSING
+#define J40__RECURSING 400
+#define J40__P 16
+#include J40_FILENAME
+#define J40__P 32
+#include J40_FILENAME
+#undef J40__RECURSING
+#define J40__RECURSING (-1)
+
+#endif // J40__RECURSING < 0
+#if J40__RECURSING == 400
+ #define j40__intP J40__CONCAT3(int, J40__P, _t)
+ #define J40__PIXELS J40__CONCAT3(J40__I, J40__P, _PIXELS)
+// ----------------------------------------
+
+#ifdef J40_IMPLEMENTATION
+
+// out(x, y) = in(x, y) * mult (after type conversion)
+J40_STATIC void j40__(dequant_lf,P)(const j40__plane *in, float mult, j40__plane *out) {
+ int32_t x, y;
+ J40__ASSERT(in->type == J40__(PLANE_I,P) && out->type == J40__PLANE_F32);
+ J40__ASSERT(in->width <= out->width && in->height <= out->height);
+ for (y = 0; y < in->height; ++y) {
+ j40__intP *inpixels = J40__PIXELS(in, y);
+ float *outpixels = J40__F32_PIXELS(out, y);
+ for (x = 0; x < in->width; ++x) outpixels[x] = (float) inpixels[x] * mult;
+ }
+}
+
+// plane(x, y) += # of lf_thr[i] s.t. in(x, y) > lf_thr[i]
+J40_STATIC void j40__(add_thresholds,P)(
+ j40__plane *plane, const j40__plane *in, const int32_t *lf_thr, int32_t nb_lf_thr
+) {
+ int32_t x, y, i;
+ J40__ASSERT(in->type == J40__(PLANE_I,P) && plane->type == J40__PLANE_U8);
+ J40__ASSERT(in->width <= plane->width && in->height <= plane->height);
+ for (y = 0; y < plane->height; ++y) {
+ j40__intP *inpixels = J40__PIXELS(in, y);
+ uint8_t *pixels = J40__U8_PIXELS(plane, y);
+ for (i = 0; i < nb_lf_thr; ++i) {
+ int32_t threshold = lf_thr[i];
+ for (x = 0; x < in->width; ++x) {
+ pixels[x] = (uint8_t) (pixels[x] + (inpixels[x] > threshold));
+ }
+ }
+ }
+}
+
+#endif // defined J40_IMPLEMENTATION
+
+// ----------------------------------------
+// end of recursion
+ #undef j40__intP
+ #undef J40__PIXELS
+ #undef J40__P
+#endif // J40__RECURSING == 400
+#if J40__RECURSING < 0
+// ----------------------------------------
+
+#ifdef J40_IMPLEMENTATION
+
+J40_ALWAYS_INLINE void j40__dequant_lf(const j40__plane *in, float mult, j40__plane *out) {
+ switch (in->type) {
+ case J40__PLANE_I16: j40__dequant_lf16(in, mult, out); break;
+ case J40__PLANE_I32: j40__dequant_lf32(in, mult, out); break;
+ default: J40__UNREACHABLE();
+ }
+}
+
+J40_ALWAYS_INLINE void j40__add_thresholds(
+ j40__plane *plane, const j40__plane *in, const int32_t *lf_thr, int32_t nb_lf_thr
+) {
+ switch (in->type) {
+ case J40__PLANE_I16: j40__add_thresholds16(plane, in, lf_thr, nb_lf_thr); break;
+ case J40__PLANE_I32: j40__add_thresholds32(plane, in, lf_thr, nb_lf_thr); break;
+ default: J40__UNREACHABLE();
+ }
+}
+
+J40_STATIC void j40__multiply_each_u8(j40__plane *plane, int32_t mult) {
+ int32_t x, y;
+ J40__ASSERT(plane->type == J40__PLANE_U8);
+ for (y = 0; y < plane->height; ++y) {
+ uint8_t *pixels = J40__U8_PIXELS(plane, y);
+ for (x = 0; x < plane->width; ++x) pixels[x] = (uint8_t) (pixels[x] * mult);
+ }
+}
+
+J40__STATIC_RETURNS_ERR j40__smooth_lf(j40__st *st, j40__lf_group_st *gg, j40__plane lfquant[3]) {
+ static const float W0 = 0.05226273532324128f, W1 = 0.20345139757231578f, W2 = 0.0334829185968739f;
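+	// note that the 3x3 kernel formed below (W0 center, W1 edges, W2 corners) is
+	// normalized: W0 + 4 * W1 + 4 * W2 == 1 up to rounding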
+
+ j40__frame_st *f = st->frame;
+ int32_t ggw8 = gg->width8, ggh8 = gg->height8;
+ float *linebuf = NULL, *nline[3], *line[3];
+ float inv_m_lf[3];
+ int32_t x, y, c;
+
+ for (c = 0; c < 3; ++c) {
+ // TODO spec bug: missing 2^16 scaling
+ inv_m_lf[c] = (float) (f->global_scale * f->quant_lf) / f->m_lf_scaled[c] / 65536.0f;
+ }
+
+ J40__TRY_MALLOC(float, &linebuf, (size_t) (ggw8 * 6));
+ for (c = 0; c < 3; ++c) {
+ nline[c] = linebuf + (c + 3) * ggw8; // intentionally uninitialized
+ line[c] = linebuf + c * ggw8; // row 0
+ memcpy(line[c], J40__F32_PIXELS(&lfquant[c], 0), sizeof(float) * (size_t) ggw8);
+ }
+
+ for (y = 1; y < ggh8 - 1; ++y) {
+ float *outline[3], *sline[3];
+ for (c = 0; c < 3; ++c) {
+ float *temp = nline[c];
+ nline[c] = line[c];
+ line[c] = temp;
+ outline[c] = J40__F32_PIXELS(&lfquant[c], y);
+ sline[c] = J40__F32_PIXELS(&lfquant[c], y + 1);
+ memcpy(line[c], outline[c], sizeof(float) * (size_t) ggw8);
+ }
+ for (x = 1; x < ggw8 - 1; ++x) {
+ float wa[3], diff[3], gap = 0.5f;
+ for (c = 0; c < 3; ++c) {
+ wa[c] =
+ (nline[c][x - 1] * W2 + nline[c][x] * W1 + nline[c][x + 1] * W2) +
+ ( line[c][x - 1] * W1 + line[c][x] * W0 + line[c][x + 1] * W1) +
+ (sline[c][x - 1] * W2 + sline[c][x] * W1 + sline[c][x + 1] * W2);
+ diff[c] = fabsf(wa[c] - line[c][x]) * inv_m_lf[c];
+ if (gap < diff[c]) gap = diff[c];
+ }
+ gap = j40__maxf(0.0f, 3.0f - 4.0f * gap);
+ // TODO spec bug: s (sample) and wa (weighted average) are swapped in the final formula
+ for (c = 0; c < 3; ++c) outline[c][x] = (wa[c] - line[c][x]) * gap + line[c][x];
+ }
+ }
+
+J40__ON_ERROR:
+ j40__free(linebuf);
+ return st->err;
+}
+
+J40__STATIC_RETURNS_ERR j40__lf_quant(
+ j40__st *st, int32_t extra_prec, j40__modular *m, j40__lf_group_st *gg, j40__plane outlfquant[3]
+) {
+ static const int32_t YXB2XYB[3] = {1, 0, 2}; // TODO spec bug: this reordering is missing
+
+ j40__frame_st *f = st->frame;
+ int32_t ggw8 = gg->width8, ggh8 = gg->height8;
+ j40__plane *channel[3], lfquant[3] = {J40__INIT}, lfindices = J40__INIT;
+ int32_t c;
+
+ J40__ASSERT(j40__plane_all_equal_sized(m->channel, m->channel + 3));
+
+ for (c = 0; c < 3; ++c) J40__TRY(j40__init_plane(st, J40__PLANE_F32, ggw8, ggh8, 0, &lfquant[c]));
+ J40__TRY(j40__init_plane(st, J40__PLANE_U8, ggw8, ggh8, J40__PLANE_CLEAR, &lfindices));
+
+ // extract LfQuant from m and populate lfindices
+ for (c = 0; c < 3; ++c) {
+ // TODO spec bug: missing 2^16 scaling
+ float mult_lf = f->m_lf_scaled[c] / (float) (f->global_scale * f->quant_lf) * (float) (65536 >> extra_prec);
+ channel[c] = &m->channel[YXB2XYB[c]];
+ j40__dequant_lf(channel[c], mult_lf, &lfquant[c]);
+ }
+ j40__add_thresholds(&lfindices, channel[0], f->lf_thr[0], f->nb_lf_thr[0]);
+ j40__multiply_each_u8(&lfindices, f->nb_lf_thr[0] + 1);
+ j40__add_thresholds(&lfindices, channel[2], f->lf_thr[2], f->nb_lf_thr[2]);
+ j40__multiply_each_u8(&lfindices, f->nb_lf_thr[2] + 1);
+ j40__add_thresholds(&lfindices, channel[1], f->lf_thr[1], f->nb_lf_thr[1]);
+
+ // apply smoothing to LfQuant
+ if (!f->skip_adapt_lf_smooth) J40__TRY(j40__smooth_lf(st, gg, lfquant));
+
+ memcpy(outlfquant, lfquant, sizeof(j40__plane) * 3);
+ gg->lfindices = lfindices;
+ return 0;
+
+J40__ON_ERROR:
+ for (c = 0; c < 3; ++c) j40__free_plane(&lfquant[c]);
+ j40__free_plane(&lfindices);
+ return st->err;
+}
+
+J40__STATIC_RETURNS_ERR j40__hf_metadata(
+ j40__st *st, int32_t nb_varblocks,
+ j40__modular *m, const j40__plane lfquant[3], j40__lf_group_st *gg
+) {
+ j40__frame_st *f = st->frame;
+ j40__plane blocks = J40__INIT;
+ j40__varblock *varblocks = NULL;
+ float *coeffs[3 /*xyb*/] = {NULL}, *llfcoeffs[3 /*xyb*/] = {NULL};
+ size_t coeffs_misalign[3] = {0};
+ int32_t log_gsize8 = f->group_size_shift - 3;
+ int32_t ggw8 = gg->width8, ggh8 = gg->height8;
+ int32_t voff, coeffoff;
+ int32_t x0, y0, x1, y1, i, j, c;
+
+ gg->xfromy = m->channel[0];
+ gg->bfromy = m->channel[1];
+ gg->sharpness = m->channel[3];
+ memset(&m->channel[0], 0, sizeof(j40__plane));
+ memset(&m->channel[1], 0, sizeof(j40__plane));
+ memset(&m->channel[3], 0, sizeof(j40__plane));
+
+ J40__TRY(j40__init_plane(st, J40__PLANE_I32, ggw8, ggh8, J40__PLANE_CLEAR, &blocks));
+ J40__TRY_MALLOC(j40__varblock, &varblocks, (size_t) nb_varblocks);
+ for (c = 0; c < 3; ++c) { // TODO account for chroma subsampling
+ J40__TRY_MALLOC(float, &llfcoeffs[c], (size_t) (ggw8 * ggh8));
+ J40__SHOULD(
+ coeffs[c] = (float*) j40__alloc_aligned(
+ sizeof(float) * (size_t) (ggw8 * ggh8 * 64), J40__COEFFS_ALIGN, &coeffs_misalign[c]),
+ "!mem");
+ for (i = 0; i < ggw8 * ggh8 * 64; ++i) coeffs[c][i] = 0.0f;
+ }
+
+ // temporarily use coeffoff_qfidx to store DctSelect
+ if (m->channel[2].type == J40__PLANE_I16) {
+ int16_t *blockinfo0 = J40__I16_PIXELS(&m->channel[2], 0);
+ int16_t *blockinfo1 = J40__I16_PIXELS(&m->channel[2], 1);
+ for (i = 0; i < nb_varblocks; ++i) {
+ varblocks[i].coeffoff_qfidx = blockinfo0[i];
+ varblocks[i].hfmul.m1 = blockinfo1[i];
+ }
+ } else {
+ int32_t *blockinfo0 = J40__I32_PIXELS(&m->channel[2], 0);
+ int32_t *blockinfo1 = J40__I32_PIXELS(&m->channel[2], 1);
+ for (i = 0; i < nb_varblocks; ++i) {
+ varblocks[i].coeffoff_qfidx = blockinfo0[i];
+ varblocks[i].hfmul.m1 = blockinfo1[i];
+ }
+ }
+
+ // place varblocks
+ voff = coeffoff = 0;
+ for (y0 = 0; y0 < ggh8; ++y0) for (x0 = 0; x0 < ggw8; ++x0) {
+ int32_t dctsel, log_vh, log_vw, vh8, vw8;
+ const j40__dct_select *dct;
+ if (J40__I32_PIXELS(&blocks, y0)[x0]) continue;
+
+ J40__SHOULD(voff < nb_varblocks, "vblk"); // TODO spec issue: missing
+ dctsel = varblocks[voff].coeffoff_qfidx;
+ J40__SHOULD(0 <= dctsel && dctsel < J40__NUM_DCT_SELECT, "dct?");
+ dct = &J40__DCT_SELECT[dctsel];
+ f->dct_select_used |= 1 << dctsel;
+ f->order_used |= 1 << dct->order_idx;
+ varblocks[voff].coeffoff_qfidx = coeffoff;
+ J40__ASSERT(coeffoff % 64 == 0);
+
+ log_vh = dct->log_rows;
+ log_vw = dct->log_columns;
+ J40__ASSERT(log_vh >= 3 && log_vw >= 3 && log_vh <= 8 && log_vw <= 8);
+ vw8 = 1 << (log_vw - 3);
+ vh8 = 1 << (log_vh - 3);
+ x1 = x0 + vw8 - 1;
+ y1 = y0 + vh8 - 1;
+ // SPEC the first available block in raster order SHOULD be the top-left corner of
+ // the next varblock, otherwise it's an error (no retry required)
+ J40__SHOULD(x1 < ggw8 && (x0 >> log_gsize8) == (x1 >> log_gsize8), "vblk");
+ J40__SHOULD(y1 < ggh8 && (y0 >> log_gsize8) == (y1 >> log_gsize8), "vblk");
+
+ for (i = 0; i < vh8; ++i) {
+ int32_t *blockrow = J40__I32_PIXELS(&blocks, y0 + i);
+ for (j = 0; j < vw8; ++j) blockrow[x0 + j] = 1 << 20 | voff;
+ }
+ J40__I32_PIXELS(&blocks, y0)[x0] = (dctsel + 2) << 20 | voff;
+
+ // compute LLF coefficients from dequantized LF
+ if (log_vw <= 3 && log_vh <= 3) {
+ for (c = 0; c < 3; ++c) llfcoeffs[c][coeffoff >> 6] = J40__F32_PIXELS(&lfquant[c], y0)[x0];
+ } else {
+ float scratch[1024]; // DCT256x256 requires 32x32
+ for (c = 0; c < 3; ++c) {
+ float *llfcoeffs_c = llfcoeffs[c] + (coeffoff >> 6);
+ for (i = 0; i < vh8; ++i) {
+ float *lfquantrow = J40__F32_PIXELS(&lfquant[c], y0 + i);
+ for (j = 0; j < vw8; ++j) llfcoeffs_c[i * vw8 + j] = lfquantrow[x0 + j];
+ }
+ // TODO spec bug: DctSelect type IDENTIFY [sic] no longer exists
+ // TODO spec issue: DCT8x8 doesn't need this
+ j40__forward_dct2d_scaled_for_llf(llfcoeffs_c, scratch, log_vh - 3, log_vw - 3);
+ }
+ }
+
+ coeffoff += 1 << (log_vw + log_vh);
+ ++voff;
+ }
+ J40__SHOULD(voff == nb_varblocks, "vblk"); // TODO spec issue: missing
+	// TODO neither libjxl nor the spec checks for coeffoff == ggw8 * ggh8, but they probably should?
+
+ // compute qf_idx and hfmul.inv for later use
+ J40__ASSERT(f->nb_qf_thr < 16);
+ for (j = 0; j < f->nb_qf_thr; ++j) {
+ for (i = 0; i < nb_varblocks; ++i) {
+ varblocks[i].coeffoff_qfidx += varblocks[i].hfmul.m1 >= f->qf_thr[j];
+ }
+ }
+ for (i = 0; i < nb_varblocks; ++i) {
+ varblocks[i].hfmul.inv = 1.0f / ((float) varblocks[i].hfmul.m1 + 1.0f);
+ }
+
+ gg->nb_varblocks = nb_varblocks;
+ gg->blocks = blocks;
+ gg->varblocks = varblocks;
+ for (c = 0; c < 3; ++c) {
+ gg->llfcoeffs[c] = llfcoeffs[c];
+ gg->coeffs[c] = coeffs[c];
+ gg->coeffs_misalign[c] = (uint8_t) coeffs_misalign[c];
+ }
+ return 0;
+
+J40__ON_ERROR:
+ j40__free_plane(&blocks);
+ j40__free(varblocks);
+ for (c = 0; c < 3; ++c) {
+ j40__free_aligned(coeffs[c], J40__COEFFS_ALIGN, coeffs_misalign[c]);
+ j40__free(llfcoeffs[c]);
+ }
+ return st->err;
+}
+
+J40__STATIC_RETURNS_ERR j40__lf_group(j40__st *st, j40__lf_group_st *gg) {
+ j40__frame_st *f = st->frame;
+ int64_t ggidx = gg->idx;
+ int64_t sidx0 = 1 + ggidx, sidx1 = 1 + f->num_lf_groups + ggidx, sidx2 = 1 + 2 * f->num_lf_groups + ggidx;
+ j40__plane lfquant[3] = {J40__INIT};
+ j40__modular m = J40__INIT;
+ int32_t i;
+
+ // TODO factor into j40__init_modular_for_lf_group
+ for (i = f->num_gm_channels; i < f->gmodular.num_channels; ++i) {
+ j40__plane *c = &f->gmodular.channel[i];
+ if (c->hshift >= 3 && c->vshift >= 3) {
+ (void) sidx1;
+ J40__RAISE("TODO: ModularLfGroup decoding should continue here");
+ }
+ }
+
+ if (!f->is_modular) {
+ int32_t ggw8 = gg->width8, ggh8 = gg->height8;
+ int32_t ggw64 = gg->width64, ggh64 = gg->height64;
+ int32_t w[4], h[4], nb_varblocks;
+
+ J40__ASSERT(ggw8 <= 1024 && ggh8 <= 1024);
+
+ // LfQuant
+ if (!f->use_lf_frame) {
+ int32_t extra_prec = j40__u(st, 2), c;
+ J40__SHOULD(f->jpeg_upsampling == 0, "TODO: subimage w/h depends on jpeg_upsampling");
+ w[0] = w[1] = w[2] = ggw8;
+ h[0] = h[1] = h[2] = ggh8;
+ J40__TRY(j40__init_modular(st, 3, w, h, &m));
+ J40__TRY(j40__modular_header(st, f->global_tree, &f->global_codespec, &m));
+ J40__TRY(j40__allocate_modular(st, &m));
+ for (c = 0; c < m.num_channels; ++c) J40__TRY(j40__modular_channel(st, &m, c, sidx0));
+ J40__TRY(j40__finish_and_free_code(st, &m.code));
+ J40__TRY(j40__inverse_transform(st, &m));
+ // TODO spec issue: this modular image is independent of bpp/float_sample/etc.
+ // TODO spec bug: channels are in the YXB order
+ J40__TRY(j40__lf_quant(st, extra_prec, &m, gg, lfquant));
+ j40__free_modular(&m);
+ } else {
+ J40__RAISE("TODO: persist lfquant and use it in later frames");
+ }
+
+ // HF metadata
+ // SPEC nb_block is off by one
+ nb_varblocks = j40__u(st, j40__ceil_lg32((uint32_t) (ggw8 * ggh8))) + 1; // at most 2^20
+ w[0] = w[1] = ggw64; h[0] = h[1] = ggh64; // XFromY, BFromY
+ w[2] = nb_varblocks; h[2] = 2; // BlockInfo
+ w[3] = ggw8; h[3] = ggh8; // Sharpness
+ J40__TRY(j40__init_modular(st, 4, w, h, &m));
+ J40__TRY(j40__modular_header(st, f->global_tree, &f->global_codespec, &m));
+ J40__TRY(j40__allocate_modular(st, &m));
+ for (i = 0; i < m.num_channels; ++i) J40__TRY(j40__modular_channel(st, &m, i, sidx2));
+ J40__TRY(j40__finish_and_free_code(st, &m.code));
+ J40__TRY(j40__inverse_transform(st, &m));
+ J40__TRY(j40__hf_metadata(st, nb_varblocks, &m, lfquant, gg));
+ j40__free_modular(&m);
+ for (i = 0; i < 3; ++i) j40__free_plane(&lfquant[i]);
+ }
+
+ return 0;
+
+J40__ON_ERROR:
+ j40__free_modular(&m);
+ for (i = 0; i < 3; ++i) j40__free_plane(&lfquant[i]);
+ if (gg) j40__free_lf_group(gg);
+ return st->err;
+}
+
+J40_STATIC void j40__free_lf_group(j40__lf_group_st *gg) {
+ int32_t i;
+ for (i = 0; i < 3; ++i) {
+ j40__free(gg->llfcoeffs[i]);
+ j40__free_aligned(gg->coeffs[i], J40__COEFFS_ALIGN, gg->coeffs_misalign[i]);
+ gg->llfcoeffs[i] = NULL;
+ gg->coeffs[i] = NULL;
+ }
+ j40__free_plane(&gg->xfromy);
+ j40__free_plane(&gg->bfromy);
+ j40__free_plane(&gg->sharpness);
+ j40__free_plane(&gg->blocks);
+ j40__free_plane(&gg->lfindices);
+ j40__free(gg->varblocks);
+ gg->varblocks = NULL;
+}
+
+#endif // defined J40_IMPLEMENTATION
+
+////////////////////////////////////////////////////////////////////////////////
+// HfGlobal and HfPass
+
+J40__STATIC_RETURNS_ERR j40__hf_global(j40__st *st);
+
+#ifdef J40_IMPLEMENTATION
+
+// reads both HfGlobal and HfPass (SPEC they form a single group)
+J40__STATIC_RETURNS_ERR j40__hf_global(j40__st *st) {
+ j40__frame_st *f = st->frame;
+ int64_t sidx_base = 1 + 3 * f->num_lf_groups;
+ j40__code_spec codespec = J40__INIT;
+ j40__code_st code = J40__INIT;
+ int32_t i, j, c;
+
+ J40__ASSERT(!f->is_modular);
+
+ // dequantization matrices
+ if (!j40__u(st, 1)) {
+		// TODO spec improvement: encoding modes 1..5 are only valid for 0-3/9-10 since they require an 8x8 matrix; explicitly note this
+ for (i = 0; i < J40__NUM_DCT_PARAMS; ++i) { // SPEC not 11, should be 17
+ const struct j40__dct_params dct = J40__DCT_PARAMS[i];
+ int32_t rows = 1 << (int32_t) dct.log_rows, columns = 1 << (int32_t) dct.log_columns;
+ J40__TRY(j40__read_dq_matrix(st, rows, columns, sidx_base + i,
+ f->global_tree, &f->global_codespec, &f->dq_matrix[i]));
+ }
+ }
+
+	// TODO is it possible that num_hf_presets > num_groups? otherwise j40__at_most would be better
+ f->num_hf_presets = j40__u(st, j40__ceil_lg32((uint32_t) f->num_groups)) + 1;
+ J40__RAISE_DELAYED();
+
+ // HfPass
+ for (i = 0; i < f->num_passes; ++i) {
+ int32_t used_orders = j40__u32(st, 0x5f, 0, 0x13, 0, 0, 0, 0, 13);
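+		// (used_orders is one of the presets 0x5f and 0x13, zero, or a raw 13-bit mask)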
+ if (used_orders > 0) {
+ J40__TRY(j40__read_code_spec(st, 8, &codespec));
+ j40__init_code(&code, &codespec);
+ }
+ for (j = 0; j < J40__NUM_ORDERS; ++j) {
+ if (used_orders >> j & 1) {
+ int32_t size = 1 << (J40__LOG_ORDER_SIZE[j][0] + J40__LOG_ORDER_SIZE[j][1]);
+ for (c = 0; c < 3; ++c) { // SPEC this loop is omitted
+ J40__TRY(j40__permutation(st, &code, size, size / 64, &f->orders[i][j][c]));
+ }
+ }
+ }
+ if (used_orders > 0) {
+ J40__TRY(j40__finish_and_free_code(st, &code));
+ j40__free_code_spec(&codespec);
+ }
+
+ J40__TRY(j40__read_code_spec(st, 495 * f->nb_block_ctx * f->num_hf_presets, &f->coeff_codespec[i]));
+ }
+
+J40__ON_ERROR:
+ j40__free_code(&code);
+ j40__free_code_spec(&codespec);
+ return st->err;
+}
+
+#endif // defined J40_IMPLEMENTATION
+
+////////////////////////////////////////////////////////////////////////////////
+// PassGroup
+
+J40__STATIC_RETURNS_ERR j40__hf_coeffs(
+ j40__st *st, int32_t ctxoff, int32_t pass,
+ int32_t gx_in_gg, int32_t gy_in_gg, int32_t gw, int32_t gh, j40__lf_group_st *gg
+);
+J40__STATIC_RETURNS_ERR j40__pass_group(
+ j40__st *st, int32_t pass, int32_t gx_in_gg, int32_t gy_in_gg, int32_t gw, int32_t gh, int64_t gidx,
+ j40__lf_group_st *gg
+);
+
+#ifdef J40_IMPLEMENTATION
+
+J40__STATIC_RETURNS_ERR j40__hf_coeffs(
+ j40__st *st, int32_t ctxoff, int32_t pass,
+ int32_t gx_in_gg, int32_t gy_in_gg, int32_t gw, int32_t gh, j40__lf_group_st *gg
+) {
+ typedef int8_t j40_i8x3[3];
+ const j40__frame_st *f = st->frame;
+ int32_t gw8 = j40__ceil_div32(gw, 8), gh8 = j40__ceil_div32(gh, 8);
+ int8_t (*nonzeros)[3] = NULL;
+ j40__code_st code = J40__INIT;
+ int32_t lfidx_size = (f->nb_lf_thr[0] + 1) * (f->nb_lf_thr[1] + 1) * (f->nb_lf_thr[2] + 1);
+ int32_t x8, y8, i, j, c_yxb;
+
+ J40__ASSERT(gx_in_gg % 8 == 0 && gy_in_gg % 8 == 0);
+
+ j40__init_code(&code, &f->coeff_codespec[pass]);
+
+	// TODO spec bug: there are *three* NonZeros, one for each channel
+ J40__TRY_MALLOC(j40_i8x3, &nonzeros, (size_t) (gw8 * gh8));
+
+ for (y8 = 0; y8 < gh8; ++y8) for (x8 = 0; x8 < gw8; ++x8) {
+ const j40__dct_select *dct;
+ // TODO spec issue: missing x and y (here called x8 and y8)
+ int32_t ggx8 = x8 + gx_in_gg / 8, ggy8 = y8 + gy_in_gg / 8, nzpos = y8 * gw8 + x8;
+ int32_t voff = J40__I32_PIXELS(&gg->blocks, ggy8)[ggx8], dctsel = voff >> 20;
+ int32_t log_rows, log_columns, log_size;
+ int32_t coeffoff, qfidx, lfidx, bctx0, bctxc;
+
+ if (dctsel < 2) continue; // not top-left block
+ dctsel -= 2;
+ voff &= 0xfffff;
+ J40__ASSERT(dctsel < J40__NUM_DCT_SELECT);
+ dct = &J40__DCT_SELECT[dctsel];
+ log_rows = dct->log_rows;
+ log_columns = dct->log_columns;
+ log_size = log_rows + log_columns;
+
+ coeffoff = gg->varblocks[voff].coeffoff_qfidx & ~15;
+ qfidx = gg->varblocks[voff].coeffoff_qfidx & 15;
+ // TODO spec improvement: explain why lf_idx is separately calculated
+ // (answer: can be efficiently precomputed via vectorization)
+ lfidx = J40__U8_PIXELS(&gg->lfindices, ggy8)[ggx8];
+ bctx0 = (dct->order_idx * (f->nb_qf_thr + 1) + qfidx) * lfidx_size + lfidx;
+ bctxc = 13 * (f->nb_qf_thr + 1) * lfidx_size;
+
+ // unlike most places, this uses the YXB order
+ for (c_yxb = 0; c_yxb < 3; ++c_yxb) {
+ static const int32_t YXB2XYB[3] = {1, 0, 2};
+ static const int8_t TWICE_COEFF_FREQ_CTX[64] = { // pre-multiplied by 2, [0] is unused
+ -1, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28,
+ 30, 30, 32, 32, 34, 34, 36, 36, 38, 38, 40, 40, 42, 42, 44, 44,
+ 46, 46, 46, 46, 48, 48, 48, 48, 50, 50, 50, 50, 52, 52, 52, 52,
+ 54, 54, 54, 54, 56, 56, 56, 56, 58, 58, 58, 58, 60, 60, 60, 60,
+ };
+ // TODO spec bug: CoeffNumNonzeroContext[9] should be 123, not 23
+ static const int16_t TWICE_COEFF_NNZ_CTX[64] = { // pre-multiplied by 2
+ 0, 0, 62, 124, 124, 186, 186, 186, 186, 246, 246, 246, 246, 304, 304, 304,
+ 304, 304, 304, 304, 304, 360, 360, 360, 360, 360, 360, 360, 360, 360, 360, 360,
+ 360, 412, 412, 412, 412, 412, 412, 412, 412, 412, 412, 412, 412, 412, 412, 412,
+ 412, 412, 412, 412, 412, 412, 412, 412, 412, 412, 412, 412, 412, 412, 412, 412,
+ };
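+			// (both tables are pre-multiplied by 2 so that `prev` below, which is 0 or 1,
+			// can simply be added to pick between two adjacent contexts)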
+
+ int32_t c = YXB2XYB[c_yxb];
+ float *coeffs = gg->coeffs[c] + coeffoff;
+ int32_t *order = f->orders[pass][dct->order_idx][c];
+ int32_t bctx = f->block_ctx_map[bctx0 + bctxc * c_yxb]; // BlockContext()
+ int32_t nz, nzctx, cctx, qnz, prev;
+
+ // orders should have been already converted from Lehmer code
+ J40__ASSERT(order && ((f->order_loaded >> dct->order_idx) & 1));
+
+ // predict and read the number of non-zero coefficients
+ nz = x8 > 0 ?
+ (y8 > 0 ? (nonzeros[nzpos - 1][c] + nonzeros[nzpos - gw8][c] + 1) >> 1 : nonzeros[nzpos - 1][c]) :
+ (y8 > 0 ? nonzeros[nzpos - gw8][c] : 32);
+ // TODO spec improvement: `predicted` can never exceed 63 in NonZerosContext(),
+ // so better to make it a normative assertion instead of clamping
+			// TODO spec question: then why is the predicted value of 64 reserved in the contexts?
+ J40__ASSERT(nz < 64);
+ nzctx = ctxoff + bctx + (nz < 8 ? nz : 4 + nz / 2) * f->nb_block_ctx;
+ nz = j40__code(st, nzctx, 0, &code);
+ // TODO spec issue: missing
+ J40__SHOULD(nz <= (63 << (log_size - 6)), "coef");
+
+ qnz = j40__ceil_div32(nz, 1 << (log_size - 6)); // [0, 64)
+ for (i = 0; i < (1 << (log_rows - 3)); ++i) {
+ for (j = 0; j < (1 << (log_columns - 3)); ++j) {
+ nonzeros[nzpos + i * gw8 + j][c] = (int8_t) qnz;
+ }
+ }
+ cctx = ctxoff + 458 * bctx + 37 * f->nb_block_ctx;
+
+ prev = (nz <= (1 << (log_size - 4))); // TODO spec bug: swapped condition
+ // TODO spec issue: missing size (probably W*H)
+ for (i = 1 << (log_size - 6); nz > 0 && i < (1 << log_size); ++i) {
+ int32_t ctx = cctx +
+ TWICE_COEFF_NNZ_CTX[j40__ceil_div32(nz, 1 << (log_size - 6))] +
+ TWICE_COEFF_FREQ_CTX[i >> (log_size - 6)] + prev;
+ // TODO spec question: can this overflow?
+ // unlike modular there is no guarantee about "buffers" or anything similar here
+ int32_t ucoeff = j40__code(st, ctx, 0, &code);
+ // TODO int-to-float conversion, is it okay?
+ coeffs[order[i]] += (float) j40__unpack_signed(ucoeff);
+ // TODO spec issue: normative indicator has changed from [[...]] to a long comment
+ nz -= prev = (ucoeff != 0);
+ }
+ J40__SHOULD(nz == 0, "coef"); // TODO spec issue: missing
+ }
+ }
+
+ J40__TRY(j40__finish_and_free_code(st, &code));
+ j40__free(nonzeros);
+ return 0;
+
+J40__ON_ERROR:
+ j40__free_code(&code);
+ j40__free(nonzeros);
+ return st->err;
+}
+
+J40__STATIC_RETURNS_ERR j40__pass_group(
+ j40__st *st, int32_t pass, int32_t gx_in_gg, int32_t gy_in_gg, int32_t gw, int32_t gh, int64_t gidx,
+ j40__lf_group_st *gg
+) {
+ j40__frame_st *f = st->frame;
+	// SPEC "the number of tables" is fixed, no matter how many RAW quant tables there are
+ int64_t sidx = 1 + 3 * f->num_lf_groups + J40__NUM_DCT_PARAMS + pass * f->num_groups + gidx;
+ j40__modular m = J40__INIT;
+ int32_t i;
+
+ if (!f->is_modular) {
+ int32_t ctxoff;
+		// TODO spec issue: this offset is referred to later, so it should be monospaced
+ ctxoff = 495 * f->nb_block_ctx * j40__u(st, j40__ceil_lg32((uint32_t) f->num_hf_presets));
+ J40__TRY(j40__hf_coeffs(st, ctxoff, pass, gx_in_gg, gy_in_gg, gw, gh, gg));
+ }
+
+ J40__TRY(j40__init_modular_for_pass_group(st, f->num_gm_channels, gw, gh, 0, 3, &f->gmodular, &m));
+ if (m.num_channels > 0) {
+ J40__TRY(j40__modular_header(st, f->global_tree, &f->global_codespec, &m));
+ J40__TRY(j40__allocate_modular(st, &m));
+ for (i = 0; i < m.num_channels; ++i) J40__TRY(j40__modular_channel(st, &m, i, sidx));
+ J40__TRY(j40__finish_and_free_code(st, &m.code));
+ J40__TRY(j40__inverse_transform(st, &m));
+ j40__combine_modular_from_pass_group(f->num_gm_channels,
+ gg->top + gy_in_gg, gg->left + gx_in_gg, 0, 3, &f->gmodular, &m);
+ j40__free_modular(&m);
+ }
+
+ return 0;
+
+J40__ON_ERROR:
+ j40__free_modular(&m);
+ return st->err;
+}
+
+#endif // defined J40_IMPLEMENTATION
+
+////////////////////////////////////////////////////////////////////////////////
+// coefficients to samples
+
+J40_STATIC void j40__dequant_hf(j40__st *st, j40__lf_group_st *gg);
+J40__STATIC_RETURNS_ERR j40__combine_vardct_from_lf_group(j40__st *st, const j40__lf_group_st *gg);
+
+#ifdef J40_IMPLEMENTATION
+
+J40_STATIC void j40__dequant_hf(j40__st *st, j40__lf_group_st *gg) {
+ // QM_SCALE[i] = 0.8^(i - 2)
+ static const float QM_SCALE[8] = {1.5625f, 1.25f, 1.0f, 0.8f, 0.64f, 0.512f, 0.4096f, 0.32768f};
+
+ j40__frame_st *f = st->frame;
+ int32_t ggw8 = gg->width8, ggh8 = gg->height8;
+ float x_qm_scale, b_qm_scale, quant_bias_num = st->image->quant_bias_num, *quant_bias = st->image->quant_bias;
+ int32_t x8, y8, c, i;
+
+ J40__ASSERT(f->x_qm_scale >= 0 && f->x_qm_scale < 8);
+ J40__ASSERT(f->b_qm_scale >= 0 && f->b_qm_scale < 8);
+ x_qm_scale = QM_SCALE[f->x_qm_scale];
+ b_qm_scale = QM_SCALE[f->b_qm_scale];
+
+ for (y8 = 0; y8 < ggh8; ++y8) for (x8 = 0; x8 < ggw8; ++x8) {
+ const j40__dct_select *dct;
+ const j40__dq_matrix *dqmat;
+ int32_t voff = J40__I32_PIXELS(&gg->blocks, y8)[x8], dctsel = voff >> 20, size;
+ float mult[3 /*xyb*/];
+
+ if (dctsel < 2) continue; // not top-left block
+ voff &= 0xfffff;
+ dct = &J40__DCT_SELECT[dctsel - 2];
+ size = 1 << (dct->log_rows + dct->log_columns);
+ // TODO spec bug: spec says mult[1] = HfMul, should be 2^16 / (global_scale * HfMul)
+ mult[1] = 65536.0f / (float) f->global_scale * gg->varblocks[voff].hfmul.inv;
+ mult[0] = mult[1] * x_qm_scale;
+ mult[2] = mult[1] * b_qm_scale;
+ dqmat = &f->dq_matrix[dct->param_idx];
+ J40__ASSERT(dqmat->mode == J40__DQ_ENC_RAW); // should have been already loaded
+
+ for (c = 0; c < 3; ++c) {
+ float *coeffs = gg->coeffs[c] + (gg->varblocks[voff].coeffoff_qfidx & ~15);
+ for (i = 0; i < size; ++i) { // LLF positions are left unused and can be clobbered
+ // TODO spec issue: "quant" is a variable name and should be monospaced
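+				// dequantization bias: quantized values of +-1 are scaled by quant_bias[c],
+				// while larger values are nudged toward zero by quant_bias_num / value;
+				// presumably this counters the tendency of true coefficients to lie closer
+				// to zero than the bin center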
+ if (-1.0f <= coeffs[i] && coeffs[i] <= 1.0f) {
+ coeffs[i] *= quant_bias[c]; // TODO coeffs[i] is integer at this point?
+ } else {
+ coeffs[i] -= quant_bias_num / coeffs[i];
+ }
+ coeffs[i] *= mult[c] / dqmat->params[i][c]; // TODO precompute this
+ }
+ }
+ }
+}
+
+J40__STATIC_RETURNS_ERR j40__combine_vardct_from_lf_group(j40__st *st, const j40__lf_group_st *gg) {
+ j40__image_st *im = st->image;
+ j40__frame_st *f = st->frame;
+ int32_t ggw8 = gg->width8, ggh8 = gg->height8;
+ int32_t ggw = gg->width, ggh = gg->height;
+ float kx_lf, kb_lf, cbrt_opsin_bias[3 /*xyb*/];
+ float *scratch = NULL, *scratch2, *samples[3] = {0};
+ int32_t x8, y8, x, y, i, c;
+
+ for (c = 0; c < 3; ++c) {
+ J40__TRY_MALLOC(float, &samples[c], (size_t) (ggw * ggh));
+ }
+ // TODO allocates the same amount of memory regardless of transformations used
+ J40__TRY_MALLOC(float, &scratch, 2 * 65536);
+ scratch2 = scratch + 65536;
+
+ kx_lf = f->base_corr_x + (float) f->x_factor_lf * f->inv_colour_factor;
+ kb_lf = f->base_corr_b + (float) f->b_factor_lf * f->inv_colour_factor;
+
+ for (y8 = 0; y8 < ggh8; ++y8) for (x8 = 0; x8 < ggw8; ++x8) {
+ const j40__dct_select *dct;
+ int32_t voff = J40__I32_PIXELS(&gg->blocks, y8)[x8], dctsel = voff >> 20;
+ int32_t size, effvw, effvh, vw8, vh8, samplepos;
+ int32_t coeffoff;
+ float *coeffs[3 /*xyb*/], *llfcoeffs[3 /*xyb*/], kx_hf, kb_hf;
+
+ if (dctsel < 2) continue; // not top-left block
+ dctsel -= 2;
+ voff &= 0xfffff;
+ dct = &J40__DCT_SELECT[dctsel];
+ size = 1 << (dct->log_rows + dct->log_columns);
+ coeffoff = gg->varblocks[voff].coeffoff_qfidx & ~15;
+ for (c = 0; c < 3; ++c) {
+ coeffs[c] = gg->coeffs[c] + coeffoff;
+ llfcoeffs[c] = gg->llfcoeffs[c] + (coeffoff >> 6);
+ }
+
+		// TODO spec bug: x_factor and b_factor (for HF) are constant within the same varblock,
+		// even when the varblock spans multiple 64x64 rectangles
+ kx_hf = f->base_corr_x + f->inv_colour_factor * (gg->xfromy.type == J40__PLANE_I16 ?
+ (float) J40__I16_PIXELS(&gg->xfromy, y8 / 8)[x8 / 8] :
+ (float) J40__I32_PIXELS(&gg->xfromy, y8 / 8)[x8 / 8]);
+ kb_hf = f->base_corr_b + f->inv_colour_factor * (gg->bfromy.type == J40__PLANE_I16 ?
+ (float) J40__I16_PIXELS(&gg->bfromy, y8 / 8)[x8 / 8] :
+ (float) J40__I32_PIXELS(&gg->bfromy, y8 / 8)[x8 / 8]);
+
+ effvh = j40__min32(ggh - y8 * 8, 1 << dct->log_rows);
+ effvw = j40__min32(ggw - x8 * 8, 1 << dct->log_columns);
+ samplepos = (y8 * 8) * ggw + (x8 * 8);
+ // this is for LLF coefficients, which may have been transposed
+ vh8 = 1 << (j40__min32(dct->log_rows, dct->log_columns) - 3);
+ vw8 = 1 << (j40__max32(dct->log_rows, dct->log_columns) - 3);
+
+ for (c = 0; c < 3; ++c) {
+ // chroma from luma (CfL), overwrite LLF coefficients on the way
+			// TODO skip CfL if there's a subsampled channel
+ switch (c) {
+ case 0: // X
+ for (i = 0; i < size; ++i) scratch[i] = coeffs[0][i] + coeffs[1][i] * kx_hf;
+ for (y = 0; y < vh8; ++y) for (x = 0; x < vw8; ++x) {
+ scratch[y * vw8 * 8 + x] = llfcoeffs[0][y * vw8 + x] + llfcoeffs[1][y * vw8 + x] * kx_lf;
+ }
+ break;
+ case 1: // Y
+ for (i = 0; i < size; ++i) scratch[i] = coeffs[1][i];
+ for (y = 0; y < vh8; ++y) for (x = 0; x < vw8; ++x) {
+ scratch[y * vw8 * 8 + x] = llfcoeffs[1][y * vw8 + x];
+ }
+ break;
+ case 2: // B
+ for (i = 0; i < size; ++i) scratch[i] = coeffs[2][i] + coeffs[1][i] * kb_hf;
+ for (y = 0; y < vh8; ++y) for (x = 0; x < vw8; ++x) {
+ scratch[y * vw8 * 8 + x] = llfcoeffs[2][y * vw8 + x] + llfcoeffs[1][y * vw8 + x] * kb_lf;
+ }
+ break;
+ default: J40__UNREACHABLE();
+ }
+
+ // inverse DCT
+ switch (dctsel) {
+ case 1: j40__inverse_hornuss(scratch); break; // Hornuss
+ case 2: j40__inverse_dct11(scratch); break; // DCT11
+ case 3: j40__inverse_dct22(scratch); break; // DCT22
+ case 12: j40__inverse_dct23(scratch); break; // DCT23
+ case 13: j40__inverse_dct32(scratch); break; // DCT32
+ case 14: j40__inverse_afv(scratch, 0, 0); break; // AFV0
+ case 15: j40__inverse_afv(scratch, 1, 0); break; // AFV1
+ case 16: j40__inverse_afv(scratch, 0, 1); break; // AFV2
+ case 17: j40__inverse_afv(scratch, 1, 1); break; // AFV3
+ default: // every other DCTnm where n, m >= 3
+ j40__inverse_dct2d(scratch, scratch2, dct->log_rows, dct->log_columns);
+ break;
+ }
+
+			if (0) { // TODO display borders for debugging
+ for (x = 0; x < (1<<dct->log_columns); ++x) scratch[x] = 1.0f - (float) ((dctsel >> x) & 1);
+ for (y = 0; y < (1<<dct->log_rows); ++y) scratch[y << dct->log_columns] = 1.0f - (float) ((dctsel >> y) & 1);
+ }
+
+ // reposition samples into the rectangular grid
+			// TODO spec issue: samples overflowing the image bounds (due to non-8n dimensions) are probably ignored
+ for (y = 0; y < effvh; ++y) for (x = 0; x < effvw; ++x) {
+ samples[c][samplepos + y * ggw + x] = scratch[y << dct->log_columns | x];
+ }
+ }
+ }
+
+	// samples are now correctly positioned; copy them to the modular buffer
+ // TODO this is highly ad hoc, should be moved to rendering
+ for (c = 0; c < 3; ++c) cbrt_opsin_bias[c] = cbrtf(im->opsin_bias[c]);
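+	// undo the XYB opsin transform: X is the L-M opponent axis, so the mixed signals
+	// below are L' = Y + X and M' = Y - X; each channel is then cubed (the opsin
+	// encoding is roughly a cube root) and scaled by the intensity target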
+ for (y = 0; y < ggh; ++y) for (x = 0; x < ggw; ++x) {
+ int32_t pos = y * ggw + x;
+ float p[3] = {
+ samples[1][pos] + samples[0][pos],
+ samples[1][pos] - samples[0][pos],
+ samples[2][pos],
+ };
+ float itscale = 255.0f / im->intensity_target;
+ for (c = 0; c < 3; ++c) {
+ float pp = p[c] - cbrt_opsin_bias[c];
+ samples[c][pos] = (pp * pp * pp + im->opsin_bias[c]) * itscale;
+ }
+ }
+ for (c = 0; c < 3; ++c) {
+ if (f->gmodular.channel[c].type == J40__PLANE_I16) {
+ for (y = 0; y < ggh; ++y) {
+ int16_t *pixels = J40__I16_PIXELS(&f->gmodular.channel[c], gg->top + y);
+ for (x = 0; x < ggw; ++x) {
+ int32_t p = y * ggw + x;
+ float v =
+ samples[0][p] * im->opsin_inv_mat[c][0] +
+ samples[1][p] * im->opsin_inv_mat[c][1] +
+ samples[2][p] * im->opsin_inv_mat[c][2];
+					// TODO very, very slow; different approximations may be needed per bpp range
+ v = (v <= 0.0031308f ? 12.92f * v : 1.055f * powf(v, 1.0f / 2.4f) - 0.055f); // to sRGB
+ // TODO overflow check
+ pixels[gg->left + x] = (int16_t) ((float) ((1 << im->bpp) - 1) * v + 0.5f);
+ }
+ }
+ } else {
+ J40__RAISE("TODO: don't keep this here");
+ }
+ }
+
+J40__ON_ERROR:
+ j40__free(scratch);
+ for (c = 0; c < 3; ++c) j40__free(samples[c]);
+ return st->err;
+}
+
+#endif // defined J40_IMPLEMENTATION
+
+////////////////////////////////////////////////////////////////////////////////
+// restoration filters
+
+J40__STATIC_RETURNS_ERR j40__gaborish(j40__st *st, j40__plane channels[3 /*xyb*/]);
+
+J40_STATIC int32_t j40__mirror1d(int32_t coord, int32_t size);
+J40_STATIC void j40__epf_distance(const j40__plane *in, int32_t dx, int32_t dy, j40__plane *out);
+J40__STATIC_RETURNS_ERR j40__epf_recip_sigmas(j40__st *st, const j40__lf_group_st *gg, j40__plane *out);
+J40__STATIC_RETURNS_ERR j40__epf_step(
+ j40__st *st, j40__plane channels[3], float sigma_scale, const j40__plane *recip_sigmas,
+ int32_t nkernels, const int32_t (*kernels)[2], j40__plane (*distances)[3], int dist_uses_cross,
+ const j40__lf_group_st *gg
+);
+J40__STATIC_RETURNS_ERR j40__epf(j40__st *st, j40__plane channels[3], const j40__lf_group_st *gg);
+
+#ifdef J40_IMPLEMENTATION
+
+// TODO spec issue: restoration filters are applied to the entire image,
+// even though their parameters are separately signaled via multiple groups or LF groups!
+
+J40__STATIC_RETURNS_ERR j40__gaborish(j40__st *st, j40__plane channels[3 /*xyb*/]) {
+ j40__frame_st *f = st->frame;
+ int32_t width, height;
+ int32_t c, x, y;
+ float *linebuf = NULL, *nline, *line;
+
+ if (!f->gab.enabled) return 0;
+
+ J40__ASSERT(j40__plane_all_equal_sized(channels, channels + 3));
+ J40__ASSERT(j40__plane_all_equal_typed(channels, channels + 3) == J40__PLANE_F32);
+ width = channels->width;
+ height = channels->height;
+
+ J40__TRY_MALLOC(float, &linebuf, (size_t) (width * 2));
+
+ for (c = 0; c < 3; ++c) {
+ float w0 = 1.0f, w1 = f->gab.weights[c][0], w2 = f->gab.weights[c][1];
+ float wsum = w0 + w1 * 4 + w2 * 4;
+ J40__SHOULD(j40__surely_nonzero(wsum), "gab0");
+ w0 /= wsum; w1 /= wsum; w2 /= wsum;
+
+ nline = linebuf + width; // intentionally uninitialized
+ line = linebuf; // row -1 (= row 0 after mirroring)
+ memcpy(line, J40__F32_PIXELS(&channels[c], 0), sizeof(float) * (size_t) width);
+
+ for (y = 0; y < height; ++y) {
+ float *sline, *outline, *temp = nline;
+ nline = line;
+ line = temp;
+ sline = y + 1 < height ? J40__F32_PIXELS(&channels[c], y + 1) : line;
+ outline = J40__F32_PIXELS(&channels[c], y);
+ memcpy(line, outline, sizeof(float) * (size_t) width);
+
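+			// the mirrored column -1 duplicates column 0 (and likewise at the right edge),
+			// so the out-of-bounds weights fold into the adjacent columns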
+ outline[0] =
+ nline[0] * (w2 + w1) + nline[1] * w2 +
+ line[0] * (w1 + w0) + line[1] * w1 +
+ sline[0] * (w2 + w1) + sline[1] * w2;
+ for (x = 1; x < width - 1; ++x) {
+ outline[x] =
+ nline[x - 1] * w2 + nline[x] * w1 + nline[x + 1] * w2 +
+ line[x - 1] * w1 + line[x] * w0 + line[x + 1] * w1 +
+ sline[x - 1] * w2 + sline[x] * w1 + sline[x + 1] * w2;
+ }
+ if (width > 1) {
+ outline[width - 1] =
+ nline[width - 2] * w2 + nline[width - 1] * (w1 + w2) +
+ line[width - 2] * w1 + line[width - 1] * (w0 + w1) +
+ sline[width - 2] * w2 + sline[width - 1] * (w1 + w2);
+ }
+ }
+ }
+
+J40__ON_ERROR:
+ j40__free(linebuf);
+ return st->err;
+}
+
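+// maps an out-of-range coordinate back into [0, size) by mirroring across the
+// boundaries without repeating edge samples; e.g. j40__mirror1d(-2, 5) == 1 and
+// j40__mirror1d(6, 5) == 3. the loop handles coordinates more than `size` away.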
+J40_STATIC int32_t j40__mirror1d(int32_t coord, int32_t size) {
+ while (1) {
+ if (coord < 0) coord = -coord - 1;
+ else if (coord >= size) coord = size * 2 - 1 - coord;
+ else return coord;
+ }
+}
+
+// computes out(x + 1, y + 1) = abs(in(x, y) - in(x + dx, y + dy)), up to mirroring.
+// used to compute DistanceStep* functions; an increased border is required for correctness.
+J40_STATIC void j40__epf_distance(const j40__plane *in, int32_t dx, int32_t dy, j40__plane *out) {
+ int32_t width = in->width, height = in->height;
+ int32_t x, y, xlo, xhi;
+
+ J40__ASSERT(width + 2 == out->width && height + 2 == out->height);
+ J40__ASSERT(in->type == J40__PLANE_F32 && out->type == J40__PLANE_F32);
+ J40__ASSERT(-2 <= dx && dx <= 2 && -2 <= dy && dy <= 2);
+
+ xlo = (dx > 0 ? 0 : -dx);
+ xhi = (dx < 0 ? width : width - dx);
+
+ // TODO spec issue: `[[(ix, iy) in coords]]` should be normative comments
+ // TODO spec issue: `ix` and `iy` not defined in DistanceStep2, should be 0
+
+ for (y = -1; y <= height; ++y) {
+ int32_t refy = j40__mirror1d(y, height), offy = j40__mirror1d(y + dy, height);
+ float *refpixels = J40__F32_PIXELS(in, refy);
+ float *offpixels = J40__F32_PIXELS(in, offy);
+ float *outpixels = J40__F32_PIXELS(out, y + 1) + 1;
+
+ for (x = -1; x < xlo; ++x) {
+ outpixels[x] = fabsf(refpixels[j40__mirror1d(x, width)] - offpixels[j40__mirror1d(x + dx, width)]);
+ }
+ for (; x < xhi; ++x) {
+ outpixels[x] = fabsf(refpixels[x] - offpixels[x + dx]);
+ }
+ for (; x <= width; ++x) {
+ outpixels[x] = fabsf(refpixels[j40__mirror1d(x, width)] - offpixels[j40__mirror1d(x + dx, width)]);
+ }
+ }
+}
+
+static const float J40__SIGMA_THRESHOLD = 0.3f;
+
+// computes f(sigma) for each block, where f(x) = 1/x if x >= J40__SIGMA_THRESHOLD
+// and f(x) < 0 otherwise (a negative value marks blocks that skip filtering).
+// note that `inv_sigma` in the spec is not the same as `1/sigma`, hence the different name.
+J40__STATIC_RETURNS_ERR j40__epf_recip_sigmas(j40__st *st, const j40__lf_group_st *gg, j40__plane *out) {
+ j40__frame_st *f = st->frame;
+ int32_t ggw8 = gg->width8, ggh8 = gg->height8;
+ float inv_quant_sharp_lut[8]; // 1 / (epf_quant_mul * epf_sharp_lut[i])
+ int32_t x8, y8, i;
+
+ J40__TRY(j40__init_plane(st, J40__PLANE_F32, gg->width8, gg->height8, J40__PLANE_FORCE_PAD, out));
+
+ for (i = 0; i < 8; ++i) {
+ float quant_sharp_lut = f->epf.quant_mul * f->epf.sharp_lut[i];
+ J40__SHOULD(j40__surely_nonzero(quant_sharp_lut), "epf0");
+ inv_quant_sharp_lut[i] = 1.0f / quant_sharp_lut;
+ }
+
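+	// fold all sharpness values into an upper bound; any value outside 0..7
+	// (negative values set high bits via the unsigned cast) is rejected below.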
+ if (gg->sharpness.type == J40__PLANE_I16) {
+ uint16_t sharpness_ub = 0;
+ for (y8 = 0; y8 < ggh8; ++y8) {
+ int16_t *sharpness = J40__I16_PIXELS(&gg->sharpness, y8);
+ float *recip_sigmas = J40__F32_PIXELS(out, y8);
+ for (x8 = 0; x8 < ggw8; ++x8) {
+ sharpness_ub |= (uint16_t) sharpness[x8];
+ recip_sigmas[x8] = inv_quant_sharp_lut[sharpness[x8] & 7];
+ }
+ }
+ J40__SHOULD(sharpness_ub < 8, "shrp");
+ } else {
+ uint32_t sharpness_ub = 0;
+ for (y8 = 0; y8 < ggh8; ++y8) {
+ int32_t *sharpness = J40__I32_PIXELS(&gg->sharpness, y8);
+ float *recip_sigmas = J40__F32_PIXELS(out, y8);
+ for (x8 = 0; x8 < ggw8; ++x8) {
+ sharpness_ub |= (uint32_t) sharpness[x8];
+ recip_sigmas[x8] = inv_quant_sharp_lut[sharpness[x8] & 7];
+ }
+ }
+ J40__SHOULD(sharpness_ub < 8, "shrp");
+ }
+
+ for (y8 = 0; y8 < ggh8; ++y8) {
+ int32_t *blocks = J40__I32_PIXELS(&gg->blocks, y8);
+ float *recip_sigmas = J40__F32_PIXELS(out, y8);
+ for (x8 = 0; x8 < ggw8; ++x8) {
+ int32_t voff = blocks[x8] & 0xfffff;
+ recip_sigmas[x8] *= gg->varblocks[voff].hfmul.inv;
+ if (recip_sigmas[x8] > 1.0f / J40__SIGMA_THRESHOLD) recip_sigmas[x8] = -1.0f;
+ }
+ }
+
+ return 0;
+
+J40__ON_ERROR:
+ j40__free_plane(out);
+ return st->err;
+}
+
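+// a single edge-preserving filter pass. every kernel offset contributes
+// weight = max(0, 1 + dist * inv_sigma_times_pos_mult), where dist is a
+// channel-weighted sum of absolute differences (a 5-point cross when
+// dist_uses_cross is set, a single sample otherwise) and the center pixel
+// always has weight 1; each output sample is the resulting weighted average.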
+J40__STATIC_RETURNS_ERR j40__epf_step(
+ j40__st *st, j40__plane channels[3], float sigma_scale, const j40__plane *recip_sigmas,
+ int32_t nkernels, const int32_t (*kernels)[2], j40__plane (*distances)[3], int dist_uses_cross,
+ const j40__lf_group_st *gg
+) {
+ enum { NKERNELS = 12 }; // except for the center
+
+ j40__frame_st *f = st->frame;
+ int32_t ggw8 = gg->width8, ggh8 = gg->height8, width = gg->width, height = gg->height;
+ int32_t stride = width + 4, cstride = stride * 3;
+ int32_t borderx[4] = {-2, -1, width, width + 1}, mirrorx[4];
+ float *linebuf = NULL, *lines[5][3]; // [y+2][c] for row y in the channel c, with mirrored borders
+ float *recip_sigmas_for_modular = NULL; // only used for modular
+ float border_sigma_scale;
+ int32_t x, y, c, k, i;
+
+ J40__ASSERT(nkernels <= NKERNELS);
+
+ J40__ASSERT(j40__plane_all_equal_sized(channels, channels + 3));
+ J40__ASSERT(j40__plane_all_equal_typed(channels, channels + 3) == J40__PLANE_F32);
+ J40__ASSERT(channels->width == width && channels->height == height);
+
+ if (recip_sigmas) {
+ J40__ASSERT(recip_sigmas->width == ggw8 && recip_sigmas->height == ggh8);
+ J40__ASSERT(recip_sigmas->type == J40__PLANE_F32);
+ } else {
+ float recip_sigma;
+ J40__SHOULD(j40__surely_nonzero(f->epf.sigma_for_modular), "epf0");
+
+ // sigma is fixed for modular, so if this is below the threshold no filtering happens
+ if (f->epf.sigma_for_modular < J40__SIGMA_THRESHOLD) return 0;
+
+ J40__TRY_MALLOC(float, &recip_sigmas_for_modular, (size_t) ggw8);
+ recip_sigma = 1.0f / f->epf.sigma_for_modular;
+ for (x = 0; x < ggw8; ++x) recip_sigmas_for_modular[x] = recip_sigma;
+ }
+
+	sigma_scale *= -1.9330952441687859f; // 1.65 * 4 * (sqrt(0.5) - 1); negative, so weights shrink as distances grow
+ border_sigma_scale = sigma_scale * f->epf.border_sad_mul;
+
+ for (c = 0; c < 3; ++c) {
+ for (k = 0; k < nkernels; ++k) {
+ j40__epf_distance(&channels[c], kernels[k][0], kernels[k][1], &distances[k][c]);
+ }
+ }
+
+ for (i = 0; i < 4; ++i) mirrorx[i] = j40__mirror1d(borderx[i], width);
+
+ J40__TRY_MALLOC(float, &linebuf, (size_t) (cstride * 4));
+ for (c = 0; c < 3; ++c) {
+ int32_t ym2 = j40__mirror1d(-2, height), ym1 = j40__mirror1d(-1, height);
+		for (i = 0; i < 4; ++i) lines[i][c] = linebuf + cstride * c + stride * i + 2; // +2 so indices -2..width+1 are valid
+ memcpy(lines[1][c], J40__F32_PIXELS(&channels[c], ym2), sizeof(float) * (size_t) width);
+ memcpy(lines[2][c], J40__F32_PIXELS(&channels[c], ym1), sizeof(float) * (size_t) width);
+ memcpy(lines[3][c], J40__F32_PIXELS(&channels[c], 0), sizeof(float) * (size_t) width);
+		for (i = 0; i < 4; ++i) {
+			lines[1][c][borderx[i]] = lines[1][c][mirrorx[i]];
+			lines[2][c][borderx[i]] = lines[2][c][mirrorx[i]];
+			lines[3][c][borderx[i]] = lines[3][c][mirrorx[i]];
+		}
+ }
+
+ for (y = 0; y < height; ++y) {
+ int32_t y1 = j40__mirror1d(y + 1, height), y2 = j40__mirror1d(y + 2, height);
+ float *outline[3];
+ float *recip_sigma_row =
+ recip_sigmas ? J40__F32_PIXELS(recip_sigmas, y / 8) : recip_sigmas_for_modular;
+ float *distance_rows[NKERNELS][3][3] = {{{0}}}; // [kernel_idx][dy+1][c]
+
+ for (c = 0; c < 3; ++c) {
+ float *temp = lines[0][c];
+ lines[0][c] = lines[1][c];
+ lines[1][c] = lines[2][c];
+ lines[2][c] = lines[3][c];
+ lines[3][c] = temp;
+ lines[4][c] = J40__F32_PIXELS(&channels[c], y2);
+ outline[c] = J40__F32_PIXELS(&channels[c], y);
+
+ memcpy(lines[3][c], J40__F32_PIXELS(&channels[c], y1), sizeof(float) * (size_t) width);
+ for (i = 0; i < 4; ++i) lines[3][c][borderx[i]] = lines[3][c][mirrorx[i]];
+
+ for (k = 0; k < nkernels; ++k) {
+ for (i = 0; i < 3; ++i) {
+ distance_rows[k][i][c] = J40__F32_PIXELS(&distances[k][c], y + i);
+ }
+ }
+ }
+
+ for (x = 0; x < width; ++x) {
+ float recip_sigma = recip_sigma_row[x / 8], inv_sigma_times_pos_mult;
+ float sum_weights, sum_channels[3];
+
+ if (recip_sigma < 0.0f) {
+ x += 7; // this and at most 7 subsequent pixels will be skipped anyway
+ continue;
+ }
+
+ // TODO spec issue: "either coordinate" refers to both x and y (i.e. "borders")
+ // according to the source code
+ if ((((x + 1) | (y + 1)) & 7) < 2) {
+ inv_sigma_times_pos_mult = recip_sigma * border_sigma_scale;
+ } else {
+ inv_sigma_times_pos_mult = recip_sigma * sigma_scale;
+ }
+
+			// kernels[*] do not include the center, whose distance is always 0
+ sum_weights = 1.0f;
+ for (c = 0; c < 3; ++c) sum_channels[c] = lines[2][c][x];
+
+ if (dist_uses_cross) {
+ for (k = 0; k < nkernels; ++k) {
+ float dist = 0.0f;
+ for (c = 0; c < 3; ++c) {
+ dist += f->epf.channel_scale[c] * (
+ distance_rows[k][1][c][x + 1] +
+ distance_rows[k][1][c][x + 0] + distance_rows[k][0][c][x + 1] +
+ distance_rows[k][2][c][x + 1] + distance_rows[k][1][c][x + 2]);
+ }
+ float weight = j40__maxf(0.0f, 1.0f + dist * inv_sigma_times_pos_mult);
+ sum_weights += weight;
+ for (c = 0; c < 3; ++c) {
+ sum_channels[c] += lines[2 + kernels[k][0]][c][x + kernels[k][1]] * weight;
+ }
+ }
+ } else {
+ for (k = 0; k < nkernels; ++k) {
+ float dist = 0.0f;
+ for (c = 0; c < 3; ++c) {
+ dist += f->epf.channel_scale[c] * distance_rows[k][1][c][x + 1];
+ }
+ float weight = j40__maxf(0.0f, 1.0f + dist * inv_sigma_times_pos_mult);
+ sum_weights += weight;
+ for (c = 0; c < 3; ++c) {
+ sum_channels[c] += lines[2 + kernels[k][0]][c][x + kernels[k][1]] * weight;
+ }
+ }
+ }
+
+ for (c = 0; c < 3; ++c) outline[c][x] = sum_channels[c] / sum_weights;
+ }
+ }
+
+J40__ON_ERROR:
+ j40__free(recip_sigmas_for_modular);
+ j40__free(linebuf);
+ return st->err;
+}
+
+J40__STATIC_RETURNS_ERR j40__epf(j40__st *st, j40__plane channels[3], const j40__lf_group_st *gg) {
+	static const int32_t KERNELS12[][2] = { // 0 < L1 distance <= 2 (step 0)
+		{-2,0}, {-1,-1}, {-1,0}, {-1,1}, {0,-2}, {0,-1}, {0,1}, {0,2}, {1,-1}, {1,0}, {1,1}, {2,0},
+	}, KERNELS4[][2] = { // 0 < L1 distance <= 1 (steps 1 and 2)
+		{0,-1}, {-1,0}, {1,0}, {0,1},
+	};
+
+ j40__frame_st *f = st->frame;
+ j40__plane recip_sigmas_ = J40__INIT, *recip_sigmas;
+ j40__plane distances[12][3] = J40__INIT;
+ int32_t k, c, maxnkernels = 0;
+
+ if (f->epf.iters <= 0) return 0;
+
+ if (!f->is_modular) {
+ recip_sigmas = &recip_sigmas_;
+ J40__TRY(j40__epf_recip_sigmas(st, gg, recip_sigmas));
+ } else {
+ recip_sigmas = NULL;
+ }
+
+	// TODO the current implementation takes up to 36x (12 kernels x 3 channels) the input image size in memory!
+ maxnkernels = f->epf.iters >= 3 ? 12 : 4;
+ for (k = 0; k < maxnkernels; ++k) for (c = 0; c < 3; ++c) {
+ J40__TRY(j40__init_plane(
+ st, J40__PLANE_F32, channels[c].width + 2, channels[c].height + 2, 0, &distances[k][c]));
+ }
+
+ if (f->epf.iters >= 3) { // step 0
+ J40__TRY(j40__epf_step(
+ st, channels, f->epf.pass0_sigma_scale, recip_sigmas, 12, KERNELS12, distances, 1, gg));
+ }
+ if (f->epf.iters >= 1) { // step 1
+ J40__TRY(j40__epf_step(st, channels, 1.0f, recip_sigmas, 4, KERNELS4, distances, 1, gg));
+ }
+ if (f->epf.iters >= 2) { // step 2
+ J40__TRY(j40__epf_step(
+ st, channels, f->epf.pass2_sigma_scale, recip_sigmas, 4, KERNELS4, distances, 0, gg));
+ }
+
+J40__ON_ERROR:
+ if (recip_sigmas) j40__free_plane(recip_sigmas);
+ for (k = 0; k < maxnkernels; ++k) for (c = 0; c < 3; ++c) j40__free_plane(&distances[k][c]);
+ return st->err;
+}
+
+#endif // J40_IMPLEMENTATION
+
+////////////////////////////////////////////////////////////////////////////////
+// frame parsing primitives
+
+struct j40__group_info {
+ int64_t ggidx;
+ int32_t gx_in_gg, gy_in_gg;
+ int32_t gw, gh;
+};
+
+typedef struct {
+ j40__st *parent; // can be NULL if not initialized
+ j40__st st;
+ j40__buffer_st buffer;
+} j40__section_st;
+
+J40__STATIC_RETURNS_ERR j40__allocate_lf_groups(j40__st *st, j40__lf_group_st **out);
+J40__STATIC_RETURNS_ERR j40__prepare_dq_matrices(j40__st *st);
+J40__STATIC_RETURNS_ERR j40__prepare_orders(j40__st *st);
+J40_ALWAYS_INLINE struct j40__group_info j40__group_info(j40__frame_st *f, int64_t gidx);
+
+J40__STATIC_RETURNS_ERR j40__init_section_state(
+ j40__st **stptr, j40__section_st *sst, int64_t codeoff, int32_t size
+);
+J40__STATIC_RETURNS_ERR j40__finish_section_state(j40__st **stptr, j40__section_st *sst, j40_err err);
+
+J40__STATIC_RETURNS_ERR j40__lf_global_in_section(j40__st *st, const j40__toc *toc);
+J40__STATIC_RETURNS_ERR j40__hf_global_in_section(j40__st *st, const j40__toc *toc);
+J40__STATIC_RETURNS_ERR j40__lf_or_pass_group_in_section(j40__st *st, j40__toc *toc, j40__lf_group_st *ggs);
+
+J40__STATIC_RETURNS_ERR j40__combine_vardct(j40__st *st, j40__lf_group_st *ggs);
+
+#ifdef J40_IMPLEMENTATION
+
+J40__STATIC_RETURNS_ERR j40__allocate_lf_groups(j40__st *st, j40__lf_group_st **out) {
+ j40__frame_st *f = st->frame;
+ j40__lf_group_st *ggs = NULL;
+ int32_t ggsize = 8 << f->group_size_shift, gsize = 1 << f->group_size_shift;
+ int32_t ggx, ggy, ggidx = 0, gidx = 0, gstride = j40__ceil_div32(f->width, gsize);
+
+ J40__TRY_CALLOC(j40__lf_group_st, &ggs, (size_t) f->num_lf_groups);
+
+ for (ggy = 0; ggy < f->height; ggy += ggsize) {
+ int32_t ggh = j40__min32(ggsize, f->height - ggy);
+ int32_t grows = j40__ceil_div32(ggh, gsize);
+ for (ggx = 0; ggx < f->width; ggx += ggsize, ++ggidx) {
+ j40__lf_group_st *gg = &ggs[ggidx];
+ int32_t ggw = j40__min32(ggsize, f->width - ggx);
+ int32_t gcolumns = j40__ceil_div32(ggw, gsize);
+ gg->idx = ggidx;
+ gg->left = ggx; gg->top = ggy;
+ gg->width = ggw; gg->height = ggh;
+ gg->width8 = j40__ceil_div32(ggw, 8); gg->height8 = j40__ceil_div32(ggh, 8);
+ gg->width64 = j40__ceil_div32(ggw, 64); gg->height64 = j40__ceil_div32(ggh, 64);
+ gg->gidx = gidx + (ggx >> f->group_size_shift);
+ gg->grows = grows;
+ gg->gcolumns = gcolumns;
+ gg->gstride = gstride;
+ }
+ gidx += grows * gstride;
+ }
+
+ J40__ASSERT(f->num_lf_groups == ggidx);
+ J40__ASSERT(f->num_groups == gidx);
+ *out = ggs;
+J40__ON_ERROR:
+ return st->err;
+}
+
+J40__STATIC_RETURNS_ERR j40__prepare_dq_matrices(j40__st *st) {
+ j40__frame_st *f = st->frame;
+ int32_t dct_select_not_loaded = f->dct_select_used & ~f->dct_select_loaded;
+ int32_t i;
+ if (!dct_select_not_loaded) return 0;
+ for (i = 0; i < J40__NUM_DCT_SELECT; ++i) {
+ if (dct_select_not_loaded >> i & 1) {
+ const j40__dct_select *dct = &J40__DCT_SELECT[i];
+ int32_t param_idx = dct->param_idx;
+ J40__TRY(j40__load_dq_matrix(st, param_idx, &f->dq_matrix[param_idx]));
+ f->dct_select_loaded |= 1 << i;
+ }
+ }
+J40__ON_ERROR:
+ return st->err;
+}
+
+J40__STATIC_RETURNS_ERR j40__prepare_orders(j40__st *st) {
+ j40__frame_st *f = st->frame;
+ int32_t order_not_loaded = f->order_used & ~f->order_loaded;
+ int32_t pass, i, c;
+ if (!order_not_loaded) return 0;
+ for (i = 0; i < J40__NUM_ORDERS; ++i) {
+ if (order_not_loaded >> i & 1) {
+ int32_t log_rows = J40__LOG_ORDER_SIZE[i][0];
+ int32_t log_columns = J40__LOG_ORDER_SIZE[i][1];
+ int32_t *order, temp, skip = 1 << (log_rows + log_columns - 6);
+ for (pass = 0; pass < f->num_passes; ++pass) for (c = 0; c < 3; ++c) {
+ J40__TRY(j40__natural_order(st, log_rows, log_columns, &order));
+ j40__apply_permutation(order + skip, &temp, sizeof(int32_t), f->orders[pass][i][c]);
+ j40__free(f->orders[pass][i][c]);
+ f->orders[pass][i][c] = order;
+ }
+ f->order_loaded |= 1 << i;
+ }
+ }
+J40__ON_ERROR:
+ return st->err;
+}
+
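+// for illustration: with a 1000x600 frame and group_size_shift == 8 (256x256
+// groups, hence 2048x2048 LF groups), f->gcolumns == 4 and f->ggcolumns == 1,
+// so gidx == 7 maps to row 1 and column 3, giving ggidx == 0,
+// (gx_in_gg, gy_in_gg) == (768, 256) and (gw, gh) == (232, 256).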
+J40_ALWAYS_INLINE struct j40__group_info j40__group_info(j40__frame_st *f, int64_t gidx) {
+ struct j40__group_info info;
+ int32_t shift = f->group_size_shift;
+ int64_t row, column;
+ J40__ASSERT(0 <= gidx && gidx < f->num_groups);
+ row = gidx / f->gcolumns;
+ column = gidx % f->gcolumns;
+ info.ggidx = (row / 8) * f->ggcolumns + (column / 8);
+ info.gx_in_gg = (int32_t) (column % 8) << shift;
+ info.gy_in_gg = (int32_t) (row % 8) << shift;
+ info.gw = (int32_t) (j40__min64(f->width, (column + 1) << shift) - (column << shift));
+ info.gh = (int32_t) (j40__min64(f->height, (row + 1) << shift) - (row << shift));
+ return info;
+}
+
+// creates a new per-section state `sst` which is identical to `*stptr` except for `buffer`,
+// then ensures that only codestream offsets [codeoff, codeoff + size) are available to `sst`
+// and updates `stptr` to point to `sst`, which should be restored with `j40__finish_section_state`.
+J40__STATIC_RETURNS_ERR j40__init_section_state(
+ j40__st **stptr, j40__section_st *sst, int64_t codeoff, int32_t size
+) {
+ static const j40__buffer_st BUFFER_INIT = J40__INIT;
+ j40__st *st = *stptr;
+ int64_t fileoff, codeoff_limit;
+
+ sst->parent = NULL;
+
+ J40__ASSERT(codeoff <= INT64_MAX - size);
+ J40__TRY(j40__map_codestream_offset(st, codeoff, &fileoff));
+ J40__SHOULD(j40__add64(codeoff, size, &codeoff_limit), "flen");
+
+ J40__TRY(j40__seek_from_source(st, fileoff)); // doesn't alter st->buffer
+
+ sst->st = *st;
+ sst->buffer = BUFFER_INIT;
+ sst->st.buffer = &sst->buffer;
+ J40__TRY(j40__init_buffer(&sst->st, codeoff, codeoff_limit));
+
+J40__ON_ERROR:
+ sst->parent = st;
+ *stptr = &sst->st;
+ return st->err;
+}
+
+J40__STATIC_RETURNS_ERR j40__finish_section_state(j40__st **stptr, j40__section_st *sst, j40_err err) {
+ j40__st *st;
+
+ if (!sst->parent) return err;
+ J40__ASSERT(*stptr == &sst->st);
+
+ if (err) {
+ *stptr = st = sst->parent;
+ J40__ASSERT(sst->st.err == err);
+ st->err = err;
+ st->saved_errno = sst->st.saved_errno;
+ st->cannot_retry = sst->st.cannot_retry;
+ // TODO `shrt` is not recoverable if this section is not the last section read
+ } else {
+ st = &sst->st;
+ J40__ASSERT(!st->err);
+ J40__TRY(j40__no_more_bytes(st));
+ }
+
+J40__ON_ERROR:
+ *stptr = st = sst->parent;
+ j40__free_buffer(&sst->buffer);
+
+ // ensure that other subsystems can't be accidentally deallocated
+ sst->parent = NULL;
+ sst->st.source = NULL;
+ sst->st.container = NULL;
+ sst->st.buffer = NULL;
+ sst->st.image = NULL;
+ sst->st.frame = NULL;
+
+ return st->err;
+}
+
+J40__STATIC_RETURNS_ERR j40__lf_global_in_section(j40__st *st, const j40__toc *toc) {
+ j40__section_st sst = J40__INIT;
+ if (!toc->single_size) {
+ J40__TRY(j40__init_section_state(&st, &sst, toc->lf_global_codeoff, toc->lf_global_size));
+ }
+ J40__TRY(j40__finish_section_state(&st, &sst, j40__lf_global(st)));
+J40__ON_ERROR:
+ return st->err;
+}
+
+J40__STATIC_RETURNS_ERR j40__hf_global_in_section(j40__st *st, const j40__toc *toc) {
+ j40__section_st sst = J40__INIT;
+ if (st->frame->is_modular) {
+ J40__SHOULD(toc->hf_global_size == 0, "excs");
+ } else {
+ if (!toc->single_size) {
+ J40__TRY(j40__init_section_state(&st, &sst, toc->hf_global_codeoff, toc->hf_global_size));
+ }
+ J40__TRY(j40__finish_section_state(&st, &sst, j40__hf_global(st)));
+ }
+J40__ON_ERROR:
+ return st->err;
+}
+
+J40__STATIC_RETURNS_ERR j40__lf_or_pass_group_in_section(j40__st *st, j40__toc *toc, j40__lf_group_st *ggs) {
+ j40__section section = toc->sections[toc->nsections_read];
+ j40__section_st sst = J40__INIT;
+
+ if (section.pass < 0) { // LF group
+ j40__lf_group_st *gg = &ggs[section.idx];
+ J40__TRY(j40__init_section_state(&st, &sst, section.codeoff, section.size));
+ J40__TRY(j40__finish_section_state(&st, &sst, j40__lf_group(st, gg)));
+ gg->loaded = 1;
+ J40__TRY(j40__prepare_dq_matrices(st));
+ J40__TRY(j40__prepare_orders(st));
+ } else { // pass group
+ struct j40__group_info info = j40__group_info(st->frame, section.idx);
+ j40__lf_group_st *gg = &ggs[info.ggidx];
+ J40__ASSERT(gg->loaded); // j40__read_toc should have taken care of this
+ J40__TRY(j40__init_section_state(&st, &sst, section.codeoff, section.size));
+ J40__TRY(j40__finish_section_state(&st, &sst, j40__pass_group(
+ st, section.pass, info.gx_in_gg, info.gy_in_gg, info.gw, info.gh, section.idx, gg)));
+ }
+
+ ++toc->nsections_read;
+
+J40__ON_ERROR:
+ return st->err;
+}
+
+J40__STATIC_RETURNS_ERR j40__combine_vardct(j40__st *st, j40__lf_group_st *ggs) {
+ j40__frame_st *f = st->frame;
+ int64_t i;
+
+ // TODO pretty incorrect to do this
+ J40__SHOULD(!f->do_ycbcr && st->image->cspace != J40__CS_GREY, "TODO: we don't yet do YCbCr or gray");
+ J40__SHOULD(st->image->modular_16bit_buffers, "TODO: !modular_16bit_buffers");
+ f->gmodular.num_channels = 3;
+ J40__TRY_CALLOC(j40__plane, &f->gmodular.channel, 3);
+ for (i = 0; i < f->gmodular.num_channels; ++i) {
+ J40__TRY(j40__init_plane(
+ st, J40__PLANE_I16, f->width, f->height, J40__PLANE_FORCE_PAD, &f->gmodular.channel[i]));
+ }
+ for (i = 0; i < f->num_lf_groups; ++i) {
+ j40__dequant_hf(st, &ggs[i]);
+ J40__TRY(j40__combine_vardct_from_lf_group(st, &ggs[i]));
+ }
+
+J40__ON_ERROR:
+ return st->err;
+}
+
+J40__STATIC_RETURNS_ERR j40__end_of_frame(j40__st *st, const j40__toc *toc) {
+ J40__TRY(j40__zero_pad_to_byte(st));
+ if (toc->single_size) {
+ int64_t codeoff = j40__codestream_offset(st);
+ if (codeoff < toc->end_codeoff) {
+ st->cannot_retry = 1;
+ J40__RAISE("shrt");
+ } else {
+ J40__SHOULD(codeoff == toc->end_codeoff, "excs");
+ }
+ } else {
+ J40__TRY(j40__seek_buffer(st, toc->end_codeoff));
+ }
+J40__ON_ERROR:
+ return st->err;
+}
+
+#endif // defined J40_IMPLEMENTATION
+
+////////////////////////////////////////////////////////////////////////////////
+// rendering (currently very limited)
+
+J40__STATIC_RETURNS_ERR j40__render_to_u8x4_rgba(j40__st *st, j40__plane *out);
+
+#ifdef J40_IMPLEMENTATION
+
+J40__STATIC_RETURNS_ERR j40__render_to_u8x4_rgba(j40__st *st, j40__plane *out) {
+ j40__image_st *im = st->image;
+ j40__frame_st *f = st->frame;
+ j40__plane *c[4], rgba = J40__INIT;
+ int32_t maxpixel, maxpixel2;
+ int32_t i, x, y;
+
+ J40__SHOULD(im->modular_16bit_buffers, "TODO: specialize for 32-bit");
+ J40__SHOULD(im->bpp >= 8, "TODO: does not yet support <8bpp");
+ J40__SHOULD(im->exp_bits == 0, "TODO: float samples not yet supported");
+ J40__SHOULD(!(!f->do_ycbcr && im->xyb_encoded && im->cspace == J40__CS_GREY),
+ "TODO: direct luma encoding not yet supported");
+
+ J40__ASSERT(f->gmodular.num_channels >= 3);
+ for (i = 0; i < 3; ++i) c[i] = &f->gmodular.channel[i];
+ c[3] = NULL;
+ for (i = 3; i < f->gmodular.num_channels; ++i) {
+ j40__ec_info *ec = &im->ec_info[i - 3];
+ if (ec->type == J40__EC_ALPHA) {
+ J40__SHOULD(ec->bpp == im->bpp && ec->exp_bits == im->exp_bits,
+ "TODO: alpha channel has different bpp or sample type from color channels");
+ J40__SHOULD(ec->dim_shift == 0, "TODO: subsampled alpha not yet supported");
+ J40__SHOULD(!ec->data.alpha_associated, "TODO: associated alpha not yet supported");
+ c[3] = &f->gmodular.channel[i];
+ break;
+ }
+ }
+
+ J40__SHOULD(f->width < INT32_MAX / 4, "bigg");
+ J40__TRY(j40__init_plane(st, J40__PLANE_U8, f->width * 4, f->height, J40__PLANE_FORCE_PAD, &rgba));
+
+ maxpixel = (1 << im->bpp) - 1;
+ maxpixel2 = (1 << (im->bpp - 1));
+ for (y = 0; y < f->height; ++y) {
+ int16_t *pixels[4];
+ uint8_t *outpixels = J40__U8_PIXELS(&rgba, y);
+ for (i = 0; i < 4; ++i) pixels[i] = c[i] ? J40__I16_PIXELS(c[i], y) : NULL;
+ for (x = 0; x < f->width; ++x) {
+ for (i = 0; i < 4; ++i) {
+ // TODO optimize
+ int32_t p = j40__min32(j40__max32(0, pixels[i] ? pixels[i][x] : maxpixel), maxpixel);
+ outpixels[x * 4 + i] = (uint8_t) ((p * 255 + maxpixel2) / maxpixel);
+ }
+ }
+ }
+
+ *out = rgba;
+ return 0;
+
+J40__ON_ERROR:
+ j40__free_plane(&rgba);
+ return st->err;
+}
+
+#endif // defined J40_IMPLEMENTATION
+
+////////////////////////////////////////////////////////////////////////////////
+// API utilities
+
+// we don't trust callers and do the basic check ourselves
+#define J40__IMAGE_MAGIC ((uint32_t) 0x7867ae21) // crc32("j40_image")
+#define J40__IMAGE_ERR_MAGIC ((uint32_t) 0xb26a48aa) // crc32("j40_image with error")
+#define J40__IMAGE_OPEN_ERR_MAGIC ((uint32_t) 0x02c2eb6d) // crc32("j40_image with open error")
+#define J40__FRAME_MAGIC ((uint32_t) 0x08a296b3) // crc32("j40_frame")
+#define J40__FRAME_ERR_MAGIC ((uint32_t) 0x16351564) // crc32("j40_frame with error")
+#define J40__INNER_MAGIC ((uint32_t) 0x5009e1c4) // crc32("j40__inner")
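+// the two `j40_image` error magics are XORed with the origin value of the failing
+// API call, so that j40__check_image and j40_error_string can recover both the
+// error kind and its origin without a live j40__inner.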
+
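+// an X-macro listing every public API entry point: each X(name, suffix) expands
+// to one J40__ORIGIN_* enum value and one "name" entry in J40__ORIGIN_NAMES below,
+// which j40_error_string uses to report where an error originated.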
+#define J40__FOREACH_API(X) \
+ X(from_file,) \
+ X(from_memory,) \
+ /* the last origin that can use alternative magic numbers, see J40__ORIGIN_LAST_ALT_MAGIC */ \
+ X(output_format,) \
+ X(next_frame,) \
+ X(current_frame,) \
+ X(frame_pixels,_*) \
+ X(error_string,) \
+ X(free,) \
+
+typedef enum { // each API defines its origin value; they don't have to be stable
+ J40__ORIGIN_NONE = 0,
+ J40__ORIGIN_NEXT, // for j40_free; the next call will be the actual origin
+#define J40__ORIGIN_ENUM_VALUE(origin, suffix) J40__ORIGIN_##origin,
+ J40__FOREACH_API(J40__ORIGIN_ENUM_VALUE)
+ J40__ORIGIN_MAX,
+ J40__ORIGIN_LAST_ALT_MAGIC = J40__ORIGIN_from_memory,
+} j40__origin;
+
+static const char *J40__ORIGIN_NAMES[] = {
+ "(unknown)",
+ NULL,
+#define J40__ORIGIN_NAME(origin, suffix) #origin #suffix,
+ J40__FOREACH_API(J40__ORIGIN_NAME)
+};
+
+static const struct { char err[5]; const char *msg, *suffix; } J40__ERROR_STRINGS[] = {
+ { "Upt0", "`path` parameter is NULL", NULL },
+ { "Ubf0", "`buf` parameter is NULL", NULL },
+ { "Uch?", "Bad `channel` parameter", NULL },
+ { "Ufm?", "Bad `format` parameter", NULL },
+ { "Uof?", "Bad `channel` and `format` combination", NULL },
+ { "Urnd", "Frame is not yet rendered", NULL },
+ { "Ufre", "Trying to reuse already freed image", NULL },
+ { "!mem", "Out of memory", NULL },
+ { "!jxl", "The JPEG XL signature is not found", NULL },
+ { "open", "Failed to open file", NULL },
+ { "bigg", "Image dimensions are too large to handle", NULL },
+ { "flen", "File is too lengthy to handle", NULL },
+ { "shrt", "Premature end of file", NULL },
+ { "slim", "Image size limit reached", NULL },
+ { "elim", "Extra channel number limit reached", NULL },
+ { "xlim", "Modular transform limit reached", NULL },
+ { "tlim", "Meta-adaptive tree size or depth limit reached", NULL },
+ { "plim", "ICC profile length limit reached", NULL },
+ { "fbpp", "Given bits per pixel value is disallowed", NULL }, // "f" stands for "forbidden"
+ { "fblk", "Black extra channel is disallowed", NULL },
+ { "fm32", "32-bit buffers for modular encoding are disallowed", NULL },
+ { "TODO", "Unimplemented feature encountered", NULL }, // TODO remove this when ready
+ { "TEST", "Testing-only error occurred", NULL },
+};
+
+// an API-level twin of `j40__st`; see the `j40__st` documentation for the rationale behind the split.
+typedef struct j40__inner {
+ uint32_t magic; // should be J40__INNER_MAGIC
+
+ //j40__mutex mutex;
+
+ j40__origin origin; // error origin
+	// same as those in j40__st
+ j40_err err;
+ int saved_errno;
+ int cannot_retry;
+
+ #define J40__ERRBUF_LEN 256
+ char errbuf[J40__ERRBUF_LEN];
+
+	int state; // used in j40__advance
+
+ // subsystem contexts; copied to and from j40__st whenever needed
+ struct j40__bits_st bits;
+ struct j40__source_st source;
+ struct j40__container_st container;
+ struct j40__buffer_st buffer;
+ struct j40__image_st image;
+ struct j40__frame_st frame;
+ struct j40__lf_group_st *lf_groups; // [frame.num_lf_groups]
+
+ j40__toc toc;
+
+ int rendered;
+ j40__plane rendered_rgba;
+} j40__inner;
+
+J40__STATIC_RETURNS_ERR j40__set_alt_magic(
+ j40_err err, int saved_errno, j40__origin origin, j40_image *image
+);
+J40__STATIC_RETURNS_ERR j40__set_magic(j40__inner *inner, j40_image *image);
+
+J40_STATIC j40_err j40__check_image(j40_image *image, j40__origin neworigin, j40__inner **outinner);
+#define J40__CHECK_IMAGE() do { \
+ j40_err err = j40__check_image((j40_image*) image, ORIGIN, &inner); \
+ if (err) return err; \
+ } while (0)
+#define J40__SET_INNER_ERR(s) (inner->origin = ORIGIN, inner->err = J40__4(s))
+
+J40_STATIC void j40__init_state(j40__st *st, j40__inner *inner);
+J40_STATIC void j40__save_state(j40__st *st, j40__inner *inner, j40__origin origin);
+
+J40__STATIC_RETURNS_ERR j40__advance(j40__inner *inner, j40__origin origin/*, int32_t until*/);
+
+J40_STATIC void j40__free_inner(j40__inner *inner);
+
+#ifdef J40_IMPLEMENTATION
+
+J40__STATIC_RETURNS_ERR j40__set_alt_magic(
+ j40_err err, int saved_errno, j40__origin origin, j40_image *image
+) {
+ if (err == J40__4("open")) {
+ image->magic = J40__IMAGE_OPEN_ERR_MAGIC ^ (uint32_t) origin;
+ image->u.saved_errno = saved_errno;
+ return err;
+ } else {
+ image->magic = J40__IMAGE_ERR_MAGIC ^ (uint32_t) origin;
+ return image->u.err = err;
+ }
+}
+
+J40__STATIC_RETURNS_ERR j40__set_magic(j40__inner *inner, j40_image *image) {
+ image->magic = J40__IMAGE_MAGIC;
+ image->u.inner = inner;
+ inner->magic = J40__INNER_MAGIC;
+ return 0;
+}
+
+J40_STATIC j40_err j40__check_image(j40_image *image, j40__origin neworigin, j40__inner **outinner) {
+ *outinner = NULL;
+ if (!image) return J40__4("Uim0");
+ if (image->magic != J40__IMAGE_MAGIC) {
+ uint32_t origin = image->magic ^ J40__IMAGE_ERR_MAGIC;
+ if (0 < origin && origin <= J40__ORIGIN_LAST_ALT_MAGIC) {
+ if (origin == J40__ORIGIN_NEXT && neworigin) image->magic = J40__IMAGE_ERR_MAGIC ^ neworigin;
+ return image->u.err;
+ }
+ origin = image->magic ^ J40__IMAGE_OPEN_ERR_MAGIC;
+ if (0 < origin && origin <= J40__ORIGIN_LAST_ALT_MAGIC) return J40__4("open");
+ return J40__4("Uim?");
+ }
+ if (!image->u.inner || image->u.inner->magic != J40__INNER_MAGIC) return J40__4("Uim?");
+ *outinner = image->u.inner;
+ return image->u.inner->err; // TODO handle cannot_retry in a better way
+}
+
+J40_STATIC void j40__init_state(j40__st *st, j40__inner *inner) {
+ st->err = 0;
+ st->saved_errno = 0;
+ st->cannot_retry = 0;
+ st->bits = inner->buffer.checkpoint;
+ st->source = &inner->source;
+ st->container = &inner->container;
+ st->buffer = &inner->buffer;
+ st->image = &inner->image;
+ st->frame = &inner->frame;
+ st->limits = &J40__MAIN_LV5_LIMITS;
+}
+
+J40_STATIC void j40__save_state(j40__st *st, j40__inner *inner, j40__origin origin) {
+ if (st->err) {
+ inner->origin = origin;
+ inner->err = st->err;
+ inner->saved_errno = st->saved_errno;
+ inner->cannot_retry = st->cannot_retry;
+ } else {
+ inner->buffer.checkpoint = st->bits;
+ }
+}
+
+// TODO expose this with a proper interface
+J40__STATIC_RETURNS_ERR j40__advance(j40__inner *inner, j40__origin origin/*, int32_t until*/) {
+ j40__st stbuf, *st = &stbuf;
+ j40__frame_st *f;
+ j40_err err;
+
+ j40__init_state(st, inner);
+
+	// a lesser-known coroutine hack with some tweaks.
+	// see https://www.chiark.greenend.org.uk/~sgtatham/coroutines.html for the basic concepts.
+	//
+	// it is EXTREMELY important to remember that any `J40__YIELD_AFTER` call may fail,
+	// in which case the next call to `j40__advance` will restart right after the last
+	// successful `J40__YIELD_AFTER` call. therefore any code between two `J40__YIELD_AFTER`
+	// calls can run multiple times! if you don't want this, move that code into a separate
+	// function. for the same reason, this block can't contain any variable declaration
+	// or assignment.
+ #define J40__YIELD_AFTER(expr) \
+ do { \
+ err = (expr); \
+ j40__save_state(st, inner, origin); \
+ if (err) return err; \
+		inner->state = __LINE__; /* thus each line can have at most one J40__YIELD_AFTER() call */ \
+ /* fall through */ \
+ case __LINE__:; \
+ } while (0)
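+	// for illustration, a line `J40__YIELD_AFTER(j40__signature(st));` expands to
+	// roughly the following, with __LINE__ fixed to that line's number:
+	//
+	//     err = j40__signature(st);
+	//     j40__save_state(st, inner, origin);
+	//     if (err) return err;
+	//     inner->state = __LINE__;
+	//     case __LINE__:;
+	//
+	// so a failed call returns to the caller, and the next j40__advance call jumps
+	// right back here via the `switch (inner->state)` below.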
+
+ f = st->frame;
+ switch (inner->state) {
+ case 0: // initial state
+
+ J40__YIELD_AFTER(j40__init_buffer(st, 0, INT64_MAX));
+ J40__YIELD_AFTER(j40__signature(st));
+ J40__YIELD_AFTER(j40__image_metadata(st));
+
+ if (st->image->want_icc) {
+ J40__YIELD_AFTER(j40__icc(st));
+ }
+
+	{ // TODO should really be a loop if we ever support multiple frames
+ J40__YIELD_AFTER(j40__frame_header(st));
+ if (!f->is_last) J40__YIELD_AFTER(J40__ERR("TODO: multiple frames"));
+ if (f->type != J40__FRAME_REGULAR) J40__YIELD_AFTER(J40__ERR("TODO: non-regular frame"));
+ J40__YIELD_AFTER(j40__read_toc(st, &inner->toc));
+
+ J40__YIELD_AFTER(j40__lf_global_in_section(st, &inner->toc));
+ J40__YIELD_AFTER(j40__hf_global_in_section(st, &inner->toc));
+
+ J40__YIELD_AFTER(j40__allocate_lf_groups(st, &inner->lf_groups));
+
+ if (inner->toc.single_size) {
+ J40__ASSERT(f->num_lf_groups == 1 && f->num_groups == 1 && f->num_passes == 1);
+ J40__YIELD_AFTER(j40__lf_group(st, &inner->lf_groups[0]));
+ J40__YIELD_AFTER(j40__prepare_dq_matrices(st));
+ J40__YIELD_AFTER(j40__prepare_orders(st));
+ J40__YIELD_AFTER(j40__pass_group(st, 0, 0, 0, f->width, f->height, 0, &inner->lf_groups[0]));
+ J40__YIELD_AFTER(j40__zero_pad_to_byte(st));
+ } else {
+ while (inner->toc.nsections_read < inner->toc.nsections) {
+ J40__YIELD_AFTER(j40__lf_or_pass_group_in_section(st, &inner->toc, inner->lf_groups));
+ }
+ }
+
+ J40__YIELD_AFTER(j40__end_of_frame(st, &inner->toc));
+
+ J40__YIELD_AFTER(j40__inverse_transform(st, &f->gmodular));
+ if (!f->is_modular) J40__YIELD_AFTER(j40__combine_vardct(st, inner->lf_groups));
+ }
+
+ J40__YIELD_AFTER(j40__no_more_bytes(st));
+ break;
+
+ default: J40__UNREACHABLE();
+ }
+
+ return 0;
+}
+
+J40_STATIC void j40__free_inner(j40__inner *inner) {
+ int64_t i, num_lf_groups = inner->frame.num_lf_groups;
+ j40__free_source(&inner->source);
+ j40__free_container(&inner->container);
+ j40__free_buffer(&inner->buffer);
+ j40__free_image_state(&inner->image);
+ j40__free_frame_state(&inner->frame);
+ if (inner->lf_groups) {
+ for (i = 0; i < num_lf_groups; ++i) j40__free_lf_group(&inner->lf_groups[i]);
+ free(inner->lf_groups);
+ }
+ j40__free_toc(&inner->toc);
+ j40__free_plane(&inner->rendered_rgba);
+ j40__free(inner);
+}
+
+#endif // defined J40_IMPLEMENTATION
+
+////////////////////////////////////////////////////////////////////////////////
+// public API (implementation)
+
+#ifdef J40_IMPLEMENTATION
+
+J40_API j40_err j40_error(const j40_image *image) {
+ j40__inner *inner; // ignored
+ // do not alter image->magic even for Ufre
+ return j40__check_image((j40_image *) image, J40__ORIGIN_NONE, &inner);
+}
+
+J40_API const char *j40_error_string(const j40_image *image) {
+ static char static_errbuf[J40__ERRBUF_LEN];
+ uint32_t origin = J40__ORIGIN_NONE;
+ j40_err err = 0;
+ const char *msg, *suffix;
+ char *buf = NULL;
+ int saved_errno = 0;
+ int32_t i, corrupted_image = 0;
+
+ if (!image) {
+ snprintf(static_errbuf, J40__ERRBUF_LEN, "`image` parameter is NULL during j40_error_string");
+ return static_errbuf;
+ }
+ if (image->magic == J40__IMAGE_MAGIC) {
+ if (image->u.inner && image->u.inner->magic == J40__INNER_MAGIC) {
+ origin = image->u.inner->origin;
+ err = image->u.inner->err;
+ buf = image->u.inner->errbuf;
+ saved_errno = image->u.inner->saved_errno;
+ } else {
+ corrupted_image = 1;
+ }
+ } else {
+ origin = image->magic ^ J40__IMAGE_ERR_MAGIC;
+ if (0 < origin && origin <= J40__ORIGIN_LAST_ALT_MAGIC) {
+ err = image->u.err;
+ buf = static_errbuf;
+ saved_errno = 0;
+ // do not alter image->magic even for Ufre, but the message will be altered accordingly
+ if (origin == J40__ORIGIN_NEXT) origin = J40__ORIGIN_error_string;
+ } else {
+ origin = image->magic ^ J40__IMAGE_OPEN_ERR_MAGIC;
+ if (0 < origin && origin <= J40__ORIGIN_LAST_ALT_MAGIC) {
+ err = J40__4("open");
+ buf = static_errbuf;
+ saved_errno = image->u.saved_errno;
+ } else {
+ corrupted_image = 1;
+ }
+ }
+ }
+ if (corrupted_image) {
+ snprintf(static_errbuf, J40__ERRBUF_LEN,
+ "`image` parameter is found corrupted during j40_error_string");
+ return static_errbuf;
+ }
+
+ // TODO acquire a spinlock for buf if threaded
+
+ msg = NULL;
+ suffix = "";
+ for (i = 0; i < (int32_t) (sizeof(J40__ERROR_STRINGS) / sizeof(*J40__ERROR_STRINGS)); ++i) {
+ if (err == J40__4(J40__ERROR_STRINGS[i].err)) {
+ msg = J40__ERROR_STRINGS[i].msg;
+ if (J40__ERROR_STRINGS[i].suffix) suffix = J40__ERROR_STRINGS[i].suffix;
+ break;
+ }
+ }
+ if (!msg) {
+ snprintf(buf, J40__ERRBUF_LEN, "Decoding failed (%c%c%c%c) during j40_%s",
+ err >> 24 & 0xff, err >> 16 & 0xff, err >> 8 & 0xff, err & 0xff, J40__ORIGIN_NAMES[origin]);
+ } else if (saved_errno) {
+ snprintf(buf, J40__ERRBUF_LEN, "%s during j40_%s%s: %s",
+ msg, J40__ORIGIN_NAMES[origin], suffix, strerror(saved_errno));
+ } else {
+ snprintf(buf, J40__ERRBUF_LEN, "%s during j40_%s%s", msg, J40__ORIGIN_NAMES[origin], suffix);
+ }
+ return buf;
+}
+
+J40_API j40_err j40_from_memory(j40_image *image, void *buf, size_t size, j40_memory_free_func freefunc) {
+ static const j40__origin ORIGIN = J40__ORIGIN_from_memory;
+ j40__inner *inner;
+ j40__st stbuf, *st = &stbuf;
+
+ if (!image) return J40__4("Uim0");
+ if (!buf) return j40__set_alt_magic(J40__4("Ubf0"), 0, ORIGIN, image);
+
+ inner = (j40__inner*) j40__calloc(1, sizeof(j40__inner));
+ if (!inner) return j40__set_alt_magic(J40__4("!mem"), 0, ORIGIN, image);
+
+ j40__init_state(st, inner);
+ if (j40__init_memory_source(st, (uint8_t*) buf, size, freefunc, &inner->source)) {
+ j40__free_inner(inner);
+ return j40__set_alt_magic(st->err, st->saved_errno, ORIGIN, image);
+ } else {
+ J40__ASSERT(!st->err);
+ return j40__set_magic(inner, image);
+ }
+}
+
+J40_API j40_err j40_from_file(j40_image *image, const char *path) {
+ static const j40__origin ORIGIN = J40__ORIGIN_from_file;
+ j40__inner *inner;
+ j40__st stbuf, *st = &stbuf;
+
+ if (!image) return J40__4("Uim0");
+ if (!path) return j40__set_alt_magic(J40__4("Upt0"), 0, ORIGIN, image);
+
+ inner = (j40__inner*) j40__calloc(1, sizeof(j40__inner));
+ if (!inner) return j40__set_alt_magic(J40__4("!mem"), 0, ORIGIN, image);
+
+ j40__init_state(st, inner);
+ if (j40__init_file_source(st, path, &inner->source)) {
+ j40__free_inner(inner);
+ return j40__set_alt_magic(st->err, st->saved_errno, ORIGIN, image);
+ } else {
+ J40__ASSERT(!st->err);
+ return j40__set_magic(inner, image);
+ }
+}
+
+J40_API j40_err j40_output_format(j40_image *image, int32_t channel, int32_t format) {
+ static const j40__origin ORIGIN = J40__ORIGIN_output_format;
+ j40__inner *inner;
+
+ J40__CHECK_IMAGE();
+
+ // TODO implement multiple output formats
+ if (channel != J40_RGBA) return J40__SET_INNER_ERR("Uch?");
+ if (format != J40_U8X4) return J40__SET_INNER_ERR("Ufm?");
+ if (!(channel == J40_RGBA && format == J40_U8X4)) return J40__SET_INNER_ERR("Uof?");
+
+ return 0;
+}
+
+J40_API int j40_next_frame(j40_image *image) {
+ static const j40__origin ORIGIN = J40__ORIGIN_next_frame;
+ j40__inner *inner;
+ j40__st stbuf;
+ j40_err err;
+
+ err = j40__check_image(image, ORIGIN, &inner);
+ if (err) return 0; // does NOT return err!
+
+ err = j40__advance(inner, ORIGIN);
+ if (err) return 0;
+
+ // we don't yet have multiple frames, so the second j40_next_frame call always returns 0
+ if (inner->rendered) return 0;
+
+ j40__init_state(&stbuf, inner);
+ err = j40__render_to_u8x4_rgba(&stbuf, &inner->rendered_rgba);
+ if (err) {
+ inner->origin = ORIGIN;
+ inner->err = err;
+ return 0;
+ }
+ inner->rendered = 1;
+ return 1;
+}
+
+J40_API j40_frame j40_current_frame(j40_image *image) {
+ static const j40__origin ORIGIN = J40__ORIGIN_current_frame;
+ j40__inner *inner;
+ j40_frame frame;
+ j40_err err;
+
+ err = j40__check_image(image, ORIGIN, &inner);
+ frame.magic = J40__FRAME_ERR_MAGIC;
+ frame.reserved = 0;
+ frame.inner = inner;
+ if (err) return frame;
+
+ if (!inner->rendered) {
+		if (!j40_next_frame(image)) { // if j40_next_frame hasn't been called, implicitly call it
+ if (inner->err) return frame; // at this point we are sure that inner exists
+ }
+ }
+
+ frame.magic = J40__FRAME_MAGIC;
+ return frame;
+}
+
+J40_API j40_pixels_u8x4 j40_frame_pixels_u8x4(const j40_frame *frame, int32_t channel) {
+ static const j40__origin ORIGIN = J40__ORIGIN_frame_pixels;
+
+ // on error, return this placeholder image (TODO should this include an error message?)
+ #define J40__U8X4_THIRD(a,b,c,d,e,f,g) 255,0,0,a*255, 255,0,0,b*255, 255,0,0,c*255, \
+ 255,0,0,d*255, 255,0,0,e*255, 255,0,0,f*255, 255,0,0,g*255
+ #define J40__U8X4_ROW(aa,bb,cc) J40__U8X4_THIRD aa, J40__U8X4_THIRD bb, J40__U8X4_THIRD cc
+ static const uint8_t ERROR_PIXELS_DATA[] = {
+ J40__U8X4_ROW((1,1,1,1,1,1,1),(1,1,1,1,1,1,1),(1,1,1,1,1,1,1)),
+ J40__U8X4_ROW((1,0,0,0,1,1,1),(1,1,1,1,1,1,1),(1,1,1,1,1,1,1)),
+ J40__U8X4_ROW((1,0,1,1,1,1,1),(1,1,1,1,1,1,1),(1,1,1,1,1,1,1)),
+ J40__U8X4_ROW((1,0,0,0,1,0,0),(0,1,0,0,0,1,0),(0,0,1,0,0,0,1)),
+ J40__U8X4_ROW((1,0,1,1,1,0,1),(1,1,0,1,1,1,0),(1,0,1,0,1,1,1)),
+ J40__U8X4_ROW((1,0,0,0,1,0,1),(1,1,0,1,1,1,0),(0,0,1,0,1,1,1)),
+ J40__U8X4_ROW((1,1,1,1,1,1,1),(1,1,1,1,1,1,1),(1,1,1,1,1,1,1)),
+ };
+ static const j40_pixels_u8x4 ERROR_PIXELS = {21, 7, 21 * 4, ERROR_PIXELS_DATA};
+
+ j40__inner *inner;
+ j40_pixels_u8x4 pixels;
+
+ if (!frame || frame->magic != J40__FRAME_MAGIC) return ERROR_PIXELS;
+ inner = frame->inner;
+ if (!inner || inner->magic != J40__INNER_MAGIC) return ERROR_PIXELS;
+
+ // TODO support more channels
+ if (channel != J40_RGBA) return ERROR_PIXELS;
+
+ // TODO this condition is impossible under the current API
+ if (!inner->rendered) return J40__SET_INNER_ERR("Urnd"), ERROR_PIXELS;
+
+ J40__ASSERT(inner->rendered_rgba.width % 4 == 0);
+ pixels.width = inner->rendered_rgba.width / 4;
+ pixels.height = inner->rendered_rgba.height;
+ pixels.stride_bytes = inner->rendered_rgba.stride_bytes;
+ pixels.data = (void*) inner->rendered_rgba.pixels;
+ return pixels;
+}
+
+J40_API const j40_u8x4 *j40_row_u8x4(j40_pixels_u8x4 pixels, int32_t y) {
+ J40__ASSERT(0 <= y && y < pixels.height);
+ J40__ASSERT(pixels.stride_bytes > 0);
+ J40__ASSERT(pixels.data);
+ return (const j40_u8x4*) ((const char*) pixels.data + (size_t) pixels.stride_bytes * (size_t) y);
+}
+
+J40_API void j40_free(j40_image *image) {
+ j40__inner *inner;
+ j40__check_image(image, J40__ORIGIN_free, &inner);
+ if (inner) j40__free_inner(inner);
+ image->magic = J40__IMAGE_ERR_MAGIC ^ J40__ORIGIN_NEXT;
+ image->u.err = J40__4("Ufre");
+}
+
+#endif // defined J40_IMPLEMENTATION
+
+////////////////////////////////////////////////////////////////////////////////
+#endif // J40__RECURSING < 0 // internal code ends here //
+////////////////////////////////////////////////////////////////////////////////
+
+#if J40__RECURSING <= 0
+
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef _MSC_VER
+ #pragma warning(pop)
+#endif
+
+// prevents double `#include`s---we can't really use `#pragma once` or simple `#ifndef` guards...
+#undef J40__RECURSING
+#define J40__RECURSING 9999
+
+#endif // J40__RECURSING <= 0
+
+////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////// end of file //////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+
+// vim: noet ts=4 st=4 sts=4 sw=4 list colorcolumn=100
--- /dev/null
+++ b/ldexpf.c
@@ -1,0 +1,39 @@
+#include <math.h>
+#include <stdint.h>
+
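+/*
+ * computes x * 2^n in scalbn style: overly large |n| is first reduced by
+ * multiplying by 2^127 (or by 2^-126 * 2^24 when scaling down, which keeps
+ * precision across the subnormal range) at most twice, and the remaining
+ * exponent is applied by building 2^n directly from its IEEE 754 bit pattern.
+ */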
+float ldexpf(float x, int n)
+{
+ union {float f; uint32_t i;} u;
+ float y = x;
+	union {
+		float f;
+		uint32_t x;
+	} oneP[] = {
+		{.x = 0x7f000000}, /* 0x1p+127f */
+		{.x = 0x800000},   /* 0x1p-126f */
+		{.x = 0x4b800000}, /* 0x1p+24f */
+	};
+
+ if (n > 127) {
+ y *= oneP[0].f;
+ n -= 127;
+ if (n > 127) {
+ y *= oneP[0].f;
+ n -= 127;
+ if (n > 127)
+ n = 127;
+ }
+ } else if (n < -126) {
+ y *= oneP[1].f * oneP[2].f;
+ n += 126 - 24;
+ if (n < -126) {
+ y *= oneP[1].f * oneP[2].f;
+ n += 126 - 24;
+ if (n < -126)
+ n = -126;
+ }
+ }
+ u.i = (uint32_t)(0x7f+n)<<23;
+ x = y * u.f;
+ return x;
+}
--- /dev/null
+++ b/mkfile
@@ -1,0 +1,17 @@
+</$objtype/mkfile
+
+TARG=jxl
+BIN=/$objtype/bin
+CFLAGS=$CFLAGS -p -I/sys/include/npe
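+# use the hand-written builtins.$objtype.s when one exists for this architecture,
+# otherwise fall back to the portable C versions in builtins.c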
+OFILES=\
+ builtins`{test -f builtins.$objtype.s && echo -n .$objtype}.$O\
+ cbrtf.$O\
+ hypotf.$O\
+ ldexpf.$O\
+ decode.$O\
+
+default:V: all
+
+decode.$O: j40.h
+
+</sys/src/cmd/mkone