shithub: jbig2

Download patch

ref: c3134491a3010fcce440a45968407c6511766671
parent: 060a2afad426347a9d63544f8ff7a5c2c96ae8e1
author: Robin Watts <[email protected]>
date: Tue Jan 21 13:16:37 EST 2020

Optimise jbig2_compose_image.

Work at byte level rather than bit level, and use static inline
templating methods to generate optimised versions for each
different operator.

--- a/jbig2_image.c
+++ b/jbig2_image.c
@@ -164,134 +164,249 @@
     return image;
 }
 
-/* composite one jbig2_image onto another
-   slow but general version */
-static int
-jbig2_image_compose_unopt(Jbig2Ctx *ctx, Jbig2Image *dst, Jbig2Image *src, int x, int y, Jbig2ComposeOp op)
+static void
+template_image_compose_opt(const uint8_t * JBIG2_RESTRICT ss, uint8_t * JBIG2_RESTRICT dd, int early, int late, uint8_t leftmask, uint8_t rightmask, uint32_t bytewidth_, uint32_t h, uint32_t shift, uint32_t dstride, uint32_t sstride, Jbig2ComposeOp op)
 {
-    uint32_t i, j;
-    uint32_t sw = src->width;
-    uint32_t sh = src->height;
-    uint32_t sx = 0;
-    uint32_t sy = 0;
+    int i;
+    uint32_t j;
+    int bytewidth = (int)bytewidth_;
 
-    /* clip to the dst image boundaries */
-    if (x < 0) {
-        sx += -x;
-        if (sw < (uint32_t) -x)
-            sw = 0;
-        else
-            sw -= -x;
-        x = 0;
+    if (bytewidth == 1) {
+        for (j = 0; j < h; j++) {
+            /* Only 1 byte! */
+            uint8_t v = (((early ? 0 : ss[0]<<8) | (late ? 0 : ss[1]))>>shift);
+            if (op == JBIG2_COMPOSE_OR)
+                *dd |= v & leftmask;
+            else if (op == JBIG2_COMPOSE_AND)
+                *dd &= (v & leftmask) | ~leftmask;
+            else if (op == JBIG2_COMPOSE_XOR)
+                *dd ^= v & leftmask;
+            else if (op == JBIG2_COMPOSE_XNOR)
+                *dd ^= (~v) & leftmask;
+            else /* Replace */
+                *dd = (v & leftmask) | (*dd & ~leftmask);
+	    dd += dstride;
+	    ss += sstride;
+	}
+	return;
     }
-    if (y < 0) {
-        sy += -y;
-        if (sh < (uint32_t) -y)
-            sh = 0;
-        else
-            sh -= -y;
-        y = 0;
+    bytewidth -= 2;
+    if (shift == 0) {
+        ss++;
+        for (j = 0; j < h; j++) {
+            /* Left byte */
+            const uint8_t * JBIG2_RESTRICT s = ss;
+            uint8_t * JBIG2_RESTRICT d = dd;
+            if (op == JBIG2_COMPOSE_OR)
+                *d++ |= *s++ & leftmask;
+            else if (op == JBIG2_COMPOSE_AND)
+                *d++ &= (*s++ & leftmask) | ~leftmask;
+            else if (op == JBIG2_COMPOSE_XOR)
+                *d++ ^= *s++ & leftmask;
+            else if (op == JBIG2_COMPOSE_XNOR)
+                *d++ ^= (~*s++) & leftmask;
+            else /* Replace */
+                *d = (*s++ & leftmask) | (*d & ~leftmask), d++;
+            /* Central run */
+            for (i = bytewidth; i != 0; i--)
+	    {
+                if (op == JBIG2_COMPOSE_OR)
+                    *d++ |= *s++;
+                else if (op == JBIG2_COMPOSE_AND)
+                    *d++ &= *s++;
+                else if (op == JBIG2_COMPOSE_XOR)
+                    *d++ ^= *s++;
+                else if (op == JBIG2_COMPOSE_XNOR)
+                    *d++ ^= ~*s++;
+                else /* Replace */
+                    *d++ = *s++;
+	    }
+	    /* Right byte */
+            if (op == JBIG2_COMPOSE_OR)
+                *d |= *s & rightmask;
+            else if (op == JBIG2_COMPOSE_AND)
+                *d &= (*s & rightmask) | ~rightmask;
+            else if (op == JBIG2_COMPOSE_XOR)
+                *d ^= *s & rightmask;
+            else if (op == JBIG2_COMPOSE_XNOR)
+                *d ^= (~*s) & rightmask;
+            else /* Replace */
+                *d = (*s & rightmask) | (*d & ~rightmask);
+	    dd += dstride;
+	    ss += sstride;
+	}
+    } else {
+        for (j = 0; j < h; j++) {
+            /* Left byte */
+            const uint8_t * JBIG2_RESTRICT s = ss;
+            uint8_t * JBIG2_RESTRICT d = dd;
+            uint8_t s0, s1, v;
+            s0 = early ? 0 : *s;
+	    s++;
+	    s1 = *s++;
+            v = ((s0<<8) | s1)>>shift;
+            if (op == JBIG2_COMPOSE_OR)
+                *d++ |= v & leftmask;
+            else if (op == JBIG2_COMPOSE_AND)
+                *d++ &= (v & leftmask) | ~leftmask;
+            else if (op == JBIG2_COMPOSE_XOR)
+                *d++ ^= v & leftmask;
+            else if (op == JBIG2_COMPOSE_XNOR)
+                *d++ ^= (~v) & leftmask;
+            else /* Replace */
+                *d = (v & leftmask) | (*d & ~leftmask), d++;
+            /* Central run */
+            for (i = bytewidth; i > 0; i--)
+	    {
+                s0 = s1; s1 = *s++;
+		v = ((s0<<8) | s1)>>shift;
+                if (op == JBIG2_COMPOSE_OR)
+                    *d++ |= v;
+                else if (op == JBIG2_COMPOSE_AND)
+                    *d++ &= v;
+                else if (op == JBIG2_COMPOSE_XOR)
+                    *d++ ^= v;
+                else if (op == JBIG2_COMPOSE_XNOR)
+                    *d++ ^= ~v;
+                else /* Replace */
+                    *d++ = v;
+	    }
+	    /* Right byte */
+	    s0 = s1; s1 = (late ? 0 : *s);
+	    v = (((s0<<8) | s1)>>shift);
+            if (op == JBIG2_COMPOSE_OR)
+                *d |= v & rightmask;
+            else if (op == JBIG2_COMPOSE_AND)
+                *d &= (v & rightmask) | ~rightmask;
+            else if (op == JBIG2_COMPOSE_XOR)
+                *d ^= v & rightmask;
+            else if (op == JBIG2_COMPOSE_XNOR)
+                *d ^= ~v & rightmask;
+            else /* Replace */
+                *d = (v & rightmask) | (*d & ~rightmask);
+	    dd += dstride;
+	    ss += sstride;
+	}
     }
-    if ((uint32_t) x + sw >= dst->width) {
-        if (dst->width >= (uint32_t) x)
-            sw = dst->width - x;
-        else
-            sw = 0;
-    }
-    if ((uint32_t) y + sh >= dst->height) {
-        if (dst->height >= (uint32_t) y)
-            sh = dst->height - y;
-        else
-            sh = 0;
-    }
+}
 
-    switch (op) {
-    case JBIG2_COMPOSE_OR:
-        for (j = 0; j < sh; j++) {
-            for (i = 0; i < sw; i++) {
-                jbig2_image_set_pixel(dst, i + x, j + y, jbig2_image_get_pixel(src, i + sx, j + sy) | jbig2_image_get_pixel(dst, i + x, j + y));
-            }
-        }
-        break;
-    case JBIG2_COMPOSE_AND:
-        for (j = 0; j < sh; j++) {
-            for (i = 0; i < sw; i++) {
-                jbig2_image_set_pixel(dst, i + x, j + y, jbig2_image_get_pixel(src, i + sx, j + sy) & jbig2_image_get_pixel(dst, i + x, j + y));
-            }
-        }
-        break;
-    case JBIG2_COMPOSE_XOR:
-        for (j = 0; j < sh; j++) {
-            for (i = 0; i < sw; i++) {
-                jbig2_image_set_pixel(dst, i + x, j + y, jbig2_image_get_pixel(src, i + sx, j + sy) ^ jbig2_image_get_pixel(dst, i + x, j + y));
-            }
-        }
-        break;
-    case JBIG2_COMPOSE_XNOR:
-        for (j = 0; j < sh; j++) {
-            for (i = 0; i < sw; i++) {
-                jbig2_image_set_pixel(dst, i + x, j + y, (jbig2_image_get_pixel(src, i + sx, j + sy) == jbig2_image_get_pixel(dst, i + x, j + y)));
-            }
-        }
-        break;
-    case JBIG2_COMPOSE_REPLACE:
-        for (j = 0; j < sh; j++) {
-            for (i = 0; i < sw; i++) {
-                jbig2_image_set_pixel(dst, i + x, j + y, jbig2_image_get_pixel(src, i + sx, j + sy));
-            }
-        }
-        break;
-    }
+static void
+jbig2_image_compose_opt_OR(const uint8_t *s, uint8_t *d, int early, int late, uint8_t mask, uint8_t rightmask, uint32_t bytewidth, uint32_t h, uint32_t shift, uint32_t dstride, uint32_t sstride)
+{
+    if (early || late)
+        template_image_compose_opt(s, d, early, late, mask, rightmask, bytewidth, h, shift, dstride, sstride, JBIG2_COMPOSE_OR);
+    else
+        template_image_compose_opt(s, d, 0, 0, mask, rightmask, bytewidth, h, shift, dstride, sstride, JBIG2_COMPOSE_OR);
+}
 
-    return 0;
+static void
+jbig2_image_compose_opt_AND(const uint8_t *s, uint8_t *d, int early, int late, uint8_t mask, uint8_t rightmask, uint32_t bytewidth, uint32_t h, uint32_t shift, uint32_t dstride, uint32_t sstride)
+{
+    if (early || late)
+        template_image_compose_opt(s, d, early, late, mask, rightmask, bytewidth, h, shift, dstride, sstride, JBIG2_COMPOSE_AND);
+    else
+        template_image_compose_opt(s, d, 0, 0, mask, rightmask, bytewidth, h, shift, dstride, sstride, JBIG2_COMPOSE_AND);
 }
 
+static void
+jbig2_image_compose_opt_XOR(const uint8_t *s, uint8_t *d, int early, int late, uint8_t mask, uint8_t rightmask, uint32_t bytewidth, uint32_t h, uint32_t shift, uint32_t dstride, uint32_t sstride)
+{
+    if (early || late)
+        template_image_compose_opt(s, d, early, late, mask, rightmask, bytewidth, h, shift, dstride, sstride, JBIG2_COMPOSE_XOR);
+    else
+        template_image_compose_opt(s, d, 0, 0, mask, rightmask, bytewidth, h, shift, dstride, sstride, JBIG2_COMPOSE_XOR);
+}
+
+static void
+jbig2_image_compose_opt_XNOR(const uint8_t *s, uint8_t *d, int early, int late, uint8_t mask, uint8_t rightmask, uint32_t bytewidth, uint32_t h, uint32_t shift, uint32_t dstride, uint32_t sstride)
+{
+    if (early || late)
+        template_image_compose_opt(s, d, early, late, mask, rightmask, bytewidth, h, shift, dstride, sstride, JBIG2_COMPOSE_XNOR);
+    else
+        template_image_compose_opt(s, d, 0, 0, mask, rightmask, bytewidth, h, shift, dstride, sstride, JBIG2_COMPOSE_XNOR);
+}
+
+static void
+jbig2_image_compose_opt_REPLACE(const uint8_t *s, uint8_t *d, int early, int late, uint8_t mask, uint8_t rightmask, uint32_t bytewidth, uint32_t h, uint32_t shift, uint32_t dstride, uint32_t sstride)
+{
+    if (early || late)
+        template_image_compose_opt(s, d, early, late, mask, rightmask, bytewidth, h, shift, dstride, sstride, JBIG2_COMPOSE_REPLACE);
+    else
+        template_image_compose_opt(s, d, 0, 0, mask, rightmask, bytewidth, h, shift, dstride, sstride, JBIG2_COMPOSE_REPLACE);
+}
+
 /* composite one jbig2_image onto another */
 int
 jbig2_image_compose(Jbig2Ctx *ctx, Jbig2Image *dst, Jbig2Image *src, int x, int y, Jbig2ComposeOp op)
 {
-    uint32_t i, j;
     uint32_t w, h;
-    uint32_t leftbyte, rightbyte;
     uint32_t shift;
-    uint8_t *s, *ss;
-    uint8_t *d, *dd;
-    uint8_t mask, rightmask;
+    uint32_t leftbyte;
+    uint8_t *ss;
+    uint8_t *dd;
+    uint8_t leftmask, rightmask;
+    int early = x >= 0;
+    int late;
+    uint32_t bytewidth;
+    uint32_t syoffset = 0;
 
     if (src == NULL)
         return 0;
 
-    /* The optimized code for the OR operator below doesn't
-       handle the source image partially placed outside the
-       destination (above and/or to the left). The affected
-       intersection of the destination is computed correctly,
-       however the correct subset of the source image is not
-       chosen. Instead the upper left corner of the source image
-       is always used.
+    /* This code takes a src image and combines it onto dst at offset (x,y), with operation op. */
 
-       In the unoptimized version that handles all operators
-       (including OR) the correct subset of the source image is
-       chosen.
+    /* Data is packed msb first within a byte, so with bits numbered: 01234567.
+     * Second byte is: 89abcdef. So to combine into a run, we use:
+     *       (s[0]<<8) | s[1] == 0123456789abcdef.
+     * To read from src into dst at offset 3, we need to read:
+     *    read:      0123456789abcdef...
+     *    write:  0123456798abcdef...
+     * In general, to read from src and write into dst at offset x, we need to shift
+     * down by (x&7) bits to allow for bit alignment. So shift = x&7.
+     * So the 'central' part of our runs will see us doing:
+     *   *d++ op= ((s[0]<<8)|s[1])>>shift;
+     * with special cases on the left and right edges of the run to mask.
+     * With the left hand edge, we have to be careful not to 'underread' the start of
+     * the src image; this is what the early flag is about. Similarly we have to be
+     * careful not to read off the right hand edge; this is what the late flag is for.
+     */
 
-       The workaround is to check whether the x/y coordinates to
-       the composition operator are negative and in this case use
-       the unoptimized implementation.
-
-       TODO: Fix the optimized OR implementation if possible. */
-    if (op != JBIG2_COMPOSE_OR || x < 0 || y < 0) {
-        /* hand off the the general routine */
-        return jbig2_image_compose_unopt(ctx, dst, src, x, y, op);
-    }
-
-    /* optimized code for the prevalent OR operator */
-
     /* clip */
     w = src->width;
     h = src->height;
-    ss = src->data;
+    shift = (x & 7);
+    ss = src->data - early;
 
-    w = ((uint32_t) x + w < dst->width) ? w : ((dst->width >= (uint32_t) x) ? dst->width - (uint32_t) x : 0);
-    h = ((uint32_t) y + h < dst->height) ? h : ((dst->height >= (uint32_t) y) ? dst->height - (uint32_t) y : 0);
+    if (x < 0) {
+        if (w < (uint32_t) -x)
+            w = 0;
+	else
+            w += x;
+	ss += (-x-1)>>3;
+        x = 0;
+    }
+    if (y < 0) {
+        if (h < (uint32_t) -y)
+            h = 0;
+        else
+            h += y;
+	syoffset = -y * src->stride;
+        y = 0;
+    }
+    if ((uint32_t)x + w > dst->width)
+    {
+        if (dst->width < (uint32_t)x)
+            w = 0;
+        else
+            w = dst->width - x;
+    }
+    if ((uint32_t)y + h > dst->height)
+    {
+        if (dst->height < (uint32_t)y)
+            h = 0;
+        else
+            h = dst->height - y;
+    }
 #ifdef JBIG2_DEBUG
     jbig2_error(ctx, JBIG2_SEVERITY_DEBUG, -1, "compositing %dx%d at (%d, %d) after clipping", w, h, x, y);
 #endif
@@ -305,55 +420,32 @@
     }
 
     leftbyte = (uint32_t) x >> 3;
-    rightbyte = ((uint32_t) x + w - 1) >> 3;
-    shift = x & 7;
+    dd = dst->data + y * dst->stride + leftbyte;
+    bytewidth = (((uint32_t) x + w - 1) >> 3) - leftbyte + 1;
+    leftmask = 255>>(x&7);
+    rightmask = (((x+w)&7) == 0) ? 255 : ~(255>>((x+w)&7));
+    if (bytewidth == 1)
+        leftmask &= rightmask;
+    late = (ss + bytewidth >= src->data + ((src->width+7)>>3));
+    ss += syoffset;
 
-    /* general OR case */
-    s = ss;
-    d = dd = dst->data + y * dst->stride + leftbyte;
-    if (d < dst->data ||
-        leftbyte > dst->stride ||
-        d - leftbyte + (size_t) h * dst->stride > dst->data + (size_t) dst->height * dst->stride ||
-        s - leftbyte + (size_t) (h - 1) * src->stride + rightbyte > src->data + (size_t) src->height * src->stride) {
-        return jbig2_error(ctx, JBIG2_SEVERITY_FATAL, -1, "preventing heap overflow in jbig2_image_compose");
-    }
-    if (leftbyte == rightbyte) {
-        mask = 0x100 - (0x100 >> w);
-        for (j = 0; j < h; j++) {
-            *d |= (*s & mask) >> shift;
-            d += dst->stride;
-            s += src->stride;
-        }
-    } else if (shift == 0) {
-        rightmask = (w & 7) ? 0x100 - (1 << (8 - (w & 7))) : 0xFF;
-        for (j = 0; j < h; j++) {
-            for (i = leftbyte; i < rightbyte; i++)
-                *d++ |= *s++;
-            *d |= *s & rightmask;
-            d = (dd += dst->stride);
-            s = (ss += src->stride);
-        }
-    } else {
-        bool overlap = (((w + 7) >> 3) < ((x + w + 7) >> 3) - (x >> 3));
-
-        mask = 0x100 - (1 << shift);
-        if (overlap)
-            rightmask = (0x100 - (0x100 >> ((x + w) & 7))) >> (8 - shift);
-        else
-            rightmask = 0x100 - (0x100 >> (w & 7));
-        for (j = 0; j < h; j++) {
-            *d++ |= (*s & mask) >> shift;
-            for (i = leftbyte; i < rightbyte - 1; i++) {
-                *d |= ((*s++ & ~mask) << (8 - shift));
-                *d++ |= ((*s & mask) >> shift);
-            }
-            if (overlap)
-                *d |= (*s & rightmask) << (8 - shift);
-            else
-                *d |= ((s[0] & ~mask) << (8 - shift)) | ((s[1] & rightmask) >> shift);
-            d = (dd += dst->stride);
-            s = (ss += src->stride);
-        }
+    switch(op)
+    {
+    case JBIG2_COMPOSE_OR:
+        jbig2_image_compose_opt_OR(ss, dd, early, late, leftmask, rightmask, bytewidth, h, shift, dst->stride, src->stride);
+        break;
+    case JBIG2_COMPOSE_AND:
+        jbig2_image_compose_opt_AND(ss, dd, early, late, leftmask, rightmask, bytewidth, h, shift, dst->stride, src->stride);
+        break;
+    case JBIG2_COMPOSE_XOR:
+        jbig2_image_compose_opt_XOR(ss, dd, early, late, leftmask, rightmask, bytewidth, h, shift, dst->stride, src->stride);
+        break;
+    case JBIG2_COMPOSE_XNOR:
+        jbig2_image_compose_opt_XNOR(ss, dd, early, late, leftmask, rightmask, bytewidth, h, shift, dst->stride, src->stride);
+        break;
+    case JBIG2_COMPOSE_REPLACE:
+        jbig2_image_compose_opt_REPLACE(ss, dd, early, late, leftmask, rightmask, bytewidth, h, shift, dst->stride, src->stride);
+        break;
     }
 
     return 0;
--- a/jbig2_priv.h
+++ b/jbig2_priv.h
@@ -116,11 +116,7 @@
 
 #define jbig2_renew(ctx, p, t, size) ((t *)jbig2_realloc(ctx->allocator, (p), size, sizeof(t)))
 
-int jbig2_error(Jbig2Ctx *ctx, Jbig2Severity severity, int32_t seg_idx, const char *fmt, ...)
-#ifdef __GNUC__
-    __attribute__ ((format (__printf__, 4, 5)))
-#endif
-    ;
+int jbig2_error(Jbig2Ctx *ctx, Jbig2Severity severity, int32_t seg_idx, const char *fmt, ...);
 
 /* The word stream design is a compromise between simplicity and
    trying to amortize the number of method calls. Each ::get_next_word
@@ -136,5 +132,16 @@
 Jbig2WordStream *jbig2_word_stream_buf_new(Jbig2Ctx *ctx, const byte *data, size_t size);
 
 void jbig2_word_stream_buf_free(Jbig2Ctx *ctx, Jbig2WordStream *ws);
+
+/* restrict is standard in C99, but not in all C++ compilers. */
+#if defined (__STDC_VERSION_) && (__STDC_VERSION__ >= 199901L) /* C99 */
+#define JBIG2_RESTRICT restrict
+#elif defined(_MSC_VER) && (_MSC_VER >= 1600) /* MSVC 10 or newer */
+#define JBIG2_RESTRICT __restrict
+#elif defined(__GNUC__) && (__GNUC__ >= 3) /* GCC 3 or newer */
+#define JBIG2_RESTRICT __restrict
+#else /* Unknown or ancient */
+#define JBIG2_RESTRICT
+#endif
 
 #endif /* _JBIG2_PRIV_H */