shithub: dav1d

Download patch

ref: f2ee7a2a430992816e20b9d3a364492abf8fbdb8
parent: d6e9cb0603817437aea15849228255661e19b2b6
author: Henrik Gramner <[email protected]>
date: Fri Sep 28 07:20:35 EDT 2018

Add 'checkasm' asm testing/benchmarking framework

Some of the code originally written by, or based by code written by,
the following authors who have agreed to relicense it to 2-clause BSD:

  Anton Mitrofanov
  Diego Biurrun
  Janne Grunau
  Loren Merritt
  Luca Barbato
  Martin Storsjö
  Michael Niedermayer

--- /dev/null
+++ b/src/cpu.h
@@ -1,0 +1,40 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __DAV1D_SRC_CPU_H__
+#define __DAV1D_SRC_CPU_H__
+
+#include "config.h"
+
+#if ARCH_X86
+#include "src/x86/cpu.h"
+#else
+#define dav1d_get_cpu_flags 0
+#define dav1d_set_cpu_flags_mask(mask) while (0)
+#endif
+
+#endif /* __DAV1D_SRC_CPU_H__ */
--- a/src/x86/cpu.c
+++ b/src/x86/cpu.c
@@ -32,9 +32,9 @@
 void dav1d_cpu_cpuid(uint32_t *info, int leaf);
 uint64_t dav1d_cpu_xgetbv(int xcr);
 
-static enum CpuFlags get_cpu_flags_x86(void) {
+static unsigned get_cpu_flags(void) {
     uint32_t info[4], n_ids;
-    enum CpuFlags flags = 0;
+    unsigned flags = 0;
 
     dav1d_cpu_cpuid(info, 0);
     n_ids = info[0];
@@ -66,13 +66,19 @@
     return flags;
 }
 
-enum CpuFlags dav1d_get_cpu_flags_x86(void) {
-    static enum CpuFlags flags;
+static unsigned flags_mask = -1;
+
+unsigned dav1d_get_cpu_flags(void) {
+    static unsigned flags;
     static uint8_t checked = 0;
 
     if (!checked) {
-        flags = get_cpu_flags_x86();
+        flags = get_cpu_flags();
         checked = 1;
     }
-    return flags;
+    return flags & flags_mask;
+}
+
+void dav1d_set_cpu_flags_mask(const unsigned mask) {
+    flags_mask = mask;
 }
--- a/src/x86/cpu.h
+++ b/src/x86/cpu.h
@@ -40,6 +40,7 @@
     DAV1D_X86_CPU_FLAG_AVX512 = 1 << 8, /* F + CD + BW + DQ + VL */
 };
 
-enum CpuFlags dav1d_get_cpu_flags_x86(void);
+unsigned dav1d_get_cpu_flags(void);
+void dav1d_set_cpu_flags_mask(unsigned mask);
 
 #endif /* __DAV1D_SRC_X86_CPU_H__ */
--- a/src/x86/mc_init.c
+++ b/src/x86/mc_init.c
@@ -25,10 +25,8 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include "config.h"
-
+#include "src/cpu.h"
 #include "src/mc.h"
-#include "src/x86/cpu.h"
 
 decl_mc_fn(dav1d_put_8tap_regular_avx2);
 decl_mc_fn(dav1d_put_8tap_regular_smooth_avx2);
@@ -62,7 +60,7 @@
     c->mc[type] = dav1d_put_##name##_##suffix
 #define init_mct_fn(type, name, suffix) \
     c->mct[type] = dav1d_prep_##name##_##suffix
-    const enum CpuFlags flags = dav1d_get_cpu_flags_x86();
+    const unsigned flags = dav1d_get_cpu_flags();
 
     if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
 
--- /dev/null
+++ b/tests/checkasm/checkasm.c
@@ -1,0 +1,578 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <math.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+
+#include "src/cpu.h"
+
+#include "tests/checkasm/checkasm.h"
+
+#ifdef _WIN32
+#include <windows.h>
+#define COLOR_RED    FOREGROUND_RED
+#define COLOR_GREEN  FOREGROUND_GREEN
+#define COLOR_YELLOW (FOREGROUND_RED|FOREGROUND_GREEN)
+#else
+#include <unistd.h>
+#define COLOR_RED    1
+#define COLOR_GREEN  2
+#define COLOR_YELLOW 3
+#endif
+
+/* List of tests to invoke */
+static const struct {
+    const char *name;
+    void (*func)(void);
+} tests[] = {
+    { NULL }
+};
+
+/* List of cpu flags to check */
+static const struct {
+    const char *name;
+    const char *suffix;
+    unsigned flag;
+} cpus[] = {
+#if ARCH_X86
+    { "SSE",     "sse",    DAV1D_X86_CPU_FLAG_SSE },
+    { "SSE2",    "sse2",   DAV1D_X86_CPU_FLAG_SSE2 },
+    { "SSE3",    "sse3",   DAV1D_X86_CPU_FLAG_SSE3 },
+    { "SSSE3",   "ssse3",  DAV1D_X86_CPU_FLAG_SSSE3 },
+    { "SSE4.1",  "sse4",   DAV1D_X86_CPU_FLAG_SSE41 },
+    { "SSE4.2",  "sse42",  DAV1D_X86_CPU_FLAG_SSE42 },
+    { "AVX",     "avx",    DAV1D_X86_CPU_FLAG_AVX },
+    { "AVX2",    "avx2",   DAV1D_X86_CPU_FLAG_AVX2 },
+    { "AVX-512", "avx512", DAV1D_X86_CPU_FLAG_AVX512 },
+#endif
+    { NULL }
+};
+
+typedef struct CheckasmFuncVersion {
+    struct CheckasmFuncVersion *next;
+    void *func;
+    int ok;
+    unsigned cpu;
+    int iterations;
+    uint64_t cycles;
+} CheckasmFuncVersion;
+
+/* Binary search tree node */
+typedef struct CheckasmFunc {
+    struct CheckasmFunc *child[2];
+    CheckasmFuncVersion versions;
+    uint8_t color; /* 0 = red, 1 = black */
+    char name[1];
+} CheckasmFunc;
+
+/* Internal state */
+static struct {
+    CheckasmFunc *funcs;
+    CheckasmFunc *current_func;
+    CheckasmFuncVersion *current_func_ver;
+    const char *current_test_name;
+    const char *bench_pattern;
+    int bench_pattern_len;
+    int num_checked;
+    int num_failed;
+    int nop_time;
+    unsigned cpu_flag;
+    const char *cpu_flag_name;
+    const char *test_name;
+} state;
+
+/* float compare support code */
+typedef union {
+    float f;
+    uint32_t i;
+} intfloat;
+
+static int is_negative(const intfloat u) {
+    return u.i >> 31;
+}
+
+int float_near_ulp(const float a, const float b, const unsigned max_ulp) {
+    intfloat x, y;
+
+    x.f = a;
+    y.f = b;
+
+    if (is_negative(x) != is_negative(y)) {
+        // handle -0.0 == +0.0
+        return a == b;
+    }
+
+    if (llabs((int64_t)x.i - y.i) <= max_ulp)
+        return 1;
+
+    return 0;
+}
+
+int float_near_ulp_array(const float *const a, const float *const b,
+                         const unsigned max_ulp, const int len)
+{
+    for (int i = 0; i < len; i++)
+        if (!float_near_ulp(a[i], b[i], max_ulp))
+            return 0;
+
+    return 1;
+}
+
+int float_near_abs_eps(const float a, const float b, const float eps) {
+    return fabsf(a - b) < eps;
+}
+
+int float_near_abs_eps_array(const float *const a, const float *const b,
+                             const float eps, const int len)
+{
+    for (int i = 0; i < len; i++)
+        if (!float_near_abs_eps(a[i], b[i], eps))
+            return 0;
+
+    return 1;
+}
+
+int float_near_abs_eps_ulp(const float a, const float b, const float eps,
+                           const unsigned max_ulp)
+{
+    return float_near_ulp(a, b, max_ulp) || float_near_abs_eps(a, b, eps);
+}
+
+int float_near_abs_eps_array_ulp(const float *const a, const float *const b,
+                                 const float eps, const unsigned max_ulp,
+                                 const int len)
+{
+    for (int i = 0; i < len; i++)
+        if (!float_near_abs_eps_ulp(a[i], b[i], eps, max_ulp))
+            return 0;
+
+    return 1;
+}
+
+/* Print colored text to stderr if the terminal supports it */
+static void color_printf(const int color, const char *const fmt, ...) {
+    static int8_t use_color = -1;
+    va_list arg;
+
+#ifdef _WIN32
+    static HANDLE con;
+    static WORD org_attributes;
+
+    if (use_color < 0) {
+        CONSOLE_SCREEN_BUFFER_INFO con_info;
+        con = GetStdHandle(STD_ERROR_HANDLE);
+        if (con && con != INVALID_HANDLE_VALUE &&
+            GetConsoleScreenBufferInfo(con, &con_info))
+        {
+            org_attributes = con_info.wAttributes;
+            use_color = 1;
+        } else
+            use_color = 0;
+    }
+    if (use_color)
+        SetConsoleTextAttribute(con, (org_attributes & 0xfff0) |
+                                (color & 0x0f));
+#else
+    if (use_color < 0) {
+        const char *const term = getenv("TERM");
+        use_color = term && strcmp(term, "dumb") && isatty(2);
+    }
+    if (use_color)
+        fprintf(stderr, "\x1b[%d;3%dm", (color & 0x08) >> 3, color & 0x07);
+#endif
+
+    va_start(arg, fmt);
+    vfprintf(stderr, fmt, arg);
+    va_end(arg);
+
+    if (use_color) {
+#ifdef _WIN32
+        SetConsoleTextAttribute(con, org_attributes);
+#else
+        fprintf(stderr, "\x1b[0m");
+#endif
+    }
+}
+
+/* Deallocate a tree */
+static void destroy_func_tree(CheckasmFunc *const f) {
+    if (f) {
+        CheckasmFuncVersion *v = f->versions.next;
+        while (v) {
+            CheckasmFuncVersion *next = v->next;
+            free(v);
+            v = next;
+        }
+
+        destroy_func_tree(f->child[0]);
+        destroy_func_tree(f->child[1]);
+        free(f);
+    }
+}
+
+/* Allocate a zero-initialized block, clean up and exit on failure */
+static void *checkasm_malloc(const size_t size) {
+    void *const ptr = calloc(1, size);
+    if (!ptr) {
+        fprintf(stderr, "checkasm: malloc failed\n");
+        destroy_func_tree(state.funcs);
+        exit(1);
+    }
+    return ptr;
+}
+
+/* Get the suffix of the specified cpu flag */
+static const char *cpu_suffix(const unsigned cpu) {
+    for (int i = sizeof(cpus) / sizeof(*cpus) - 2; i >= 0; i--)
+        if (cpu & cpus[i].flag)
+            return cpus[i].suffix;
+
+    return "c";
+}
+
+#ifdef readtime
+static int cmp_nop(const void *a, const void *b) {
+    return *(const uint16_t*)a - *(const uint16_t*)b;
+}
+
+/* Measure the overhead of the timing code (in decicycles) */
+static int measure_nop_time(void) {
+    uint16_t nops[10000];
+    int i, nop_sum = 0;
+
+    for (i = 0; i < 10000; i++) {
+        uint64_t t = readtime();
+        nops[i] = readtime() - t;
+    }
+
+    qsort(nops, 10000, sizeof(uint16_t), cmp_nop);
+    for (i = 2500; i < 7500; i++)
+        nop_sum += nops[i];
+
+    return nop_sum / 500;
+}
+
+/* Print benchmark results */
+static void print_benchs(const CheckasmFunc *const f) {
+    if (f) {
+        print_benchs(f->child[0]);
+
+        /* Only print functions with at least one assembly version */
+        if (f->versions.cpu || f->versions.next) {
+            const CheckasmFuncVersion *v = &f->versions;
+            do {
+                if (v->iterations) {
+                    int decicycles = (10*v->cycles/v->iterations -
+                                      state.nop_time) / 4;
+                    printf("%s_%s: %d.%d\n", f->name, cpu_suffix(v->cpu),
+                           decicycles/10, decicycles%10);
+                }
+            } while ((v = v->next));
+        }
+
+        print_benchs(f->child[1]);
+    }
+}
+#endif
+
+#define is_digit(x) ((x) >= '0' && (x) <= '9')
+
+/* ASCIIbetical sort except preserving natural order for numbers */
+static int cmp_func_names(const char *a, const char *b) {
+    const char *const start = a;
+    int ascii_diff, digit_diff;
+
+    for (; !(ascii_diff = *(const unsigned char*)a -
+                          *(const unsigned char*)b) && *a; a++, b++);
+    for (; is_digit(*a) && is_digit(*b); a++, b++);
+
+    if (a > start && is_digit(a[-1]) &&
+        (digit_diff = is_digit(*a) - is_digit(*b)))
+    {
+        return digit_diff;
+    }
+
+    return ascii_diff;
+}
+
+/* Perform a tree rotation in the specified direction and return the new root */
+static CheckasmFunc *rotate_tree(CheckasmFunc *const f, const int dir) {
+    CheckasmFunc *const r = f->child[dir^1];
+    f->child[dir^1] = r->child[dir];
+    r->child[dir] = f;
+    r->color = f->color;
+    f->color = 0;
+    return r;
+}
+
+#define is_red(f) ((f) && !(f)->color)
+
+/* Balance a left-leaning red-black tree at the specified node */
+static void balance_tree(CheckasmFunc **root) {
+    CheckasmFunc *const f = *root;
+
+    if (is_red(f->child[0]) && is_red(f->child[1])) {
+        f->color ^= 1;
+        f->child[0]->color = f->child[1]->color = 1;
+    }
+    else if (!is_red(f->child[0]) && is_red(f->child[1]))
+        *root = rotate_tree(f, 0); /* Rotate left */
+    else if (is_red(f->child[0]) && is_red(f->child[0]->child[0]))
+        *root = rotate_tree(f, 1); /* Rotate right */
+}
+
+/* Get a node with the specified name, creating it if it doesn't exist */
+static CheckasmFunc *get_func(CheckasmFunc **root, const char *const name) {
+    CheckasmFunc *f = *root;
+
+    if (f) {
+        /* Search the tree for a matching node */
+        int cmp = cmp_func_names(name, f->name);
+        if (cmp) {
+            f = get_func(&f->child[cmp > 0], name);
+
+            /* Rebalance the tree on the way up if a new node was inserted */
+            if (!f->versions.func)
+                balance_tree(root);
+        }
+    } else {
+        /* Allocate and insert a new node into the tree */
+        const int name_length = strlen(name);
+        f = *root = checkasm_malloc(sizeof(CheckasmFunc) + name_length);
+        memcpy(f->name, name, name_length + 1);
+    }
+
+    return f;
+}
+
+/* Perform tests and benchmarks for the specified
+ * cpu flag if supported by the host */
+static void check_cpu_flag(const char *const name, unsigned flag) {
+    const unsigned old_cpu_flag = state.cpu_flag;
+
+    flag |= old_cpu_flag;
+    dav1d_set_cpu_flags_mask(flag);
+    state.cpu_flag = dav1d_get_cpu_flags();
+
+    if (!flag || state.cpu_flag != old_cpu_flag) {
+        state.cpu_flag_name = name;
+        for (int i = 0; tests[i].func; i++) {
+            if (state.test_name && strcmp(tests[i].name, state.test_name))
+                continue;
+            state.current_test_name = tests[i].name;
+            tests[i].func();
+        }
+    }
+}
+
+/* Print the name of the current CPU flag, but only do it once */
+static void print_cpu_name(void) {
+    if (state.cpu_flag_name) {
+        color_printf(COLOR_YELLOW, "%s:\n", state.cpu_flag_name);
+        state.cpu_flag_name = NULL;
+    }
+}
+
+int main(int argc, char *argv[]) {
+#ifdef readtime
+    unsigned int seed = readtime();
+#else
+    unsigned int seed = time(NULL);
+#endif
+    int ret = 0;
+
+    /*if (!tests[0].func || !cpus[0].flag) {
+        fprintf(stderr, "checkasm: no tests to perform\n");
+        return 0;
+    }*/
+
+    while (argc > 1) {
+        if (!strncmp(argv[1], "--bench", 7)) {
+#ifndef readtime
+            fprintf(stderr,
+                    "checkasm: --bench is not supported on your system\n");
+            return 1;
+#endif
+            if (argv[1][7] == '=') {
+                state.bench_pattern = argv[1] + 8;
+                state.bench_pattern_len = strlen(state.bench_pattern);
+            } else
+                state.bench_pattern = "";
+        } else if (!strncmp(argv[1], "--test=", 7)) {
+            state.test_name = argv[1] + 7;
+        } else {
+            seed = strtoul(argv[1], NULL, 10);
+        }
+
+        argc--;
+        argv++;
+    }
+
+    fprintf(stderr, "checkasm: using random seed %u\n", seed);
+    srand(seed);
+
+    check_cpu_flag(NULL, 0);
+    for (int i = 0; cpus[i].flag; i++)
+        check_cpu_flag(cpus[i].name, cpus[i].flag);
+
+    if (state.num_failed) {
+        fprintf(stderr, "checkasm: %d of %d tests have failed\n",
+                state.num_failed, state.num_checked);
+        ret = 1;
+    } else {
+        fprintf(stderr, "checkasm: all %d tests passed\n", state.num_checked);
+#ifdef readtime
+        if (state.bench_pattern) {
+            state.nop_time = measure_nop_time();
+            printf("nop: %d.%d\n", state.nop_time/10, state.nop_time%10);
+            print_benchs(state.funcs);
+        }
+#endif
+    }
+
+    destroy_func_tree(state.funcs);
+    return ret;
+}
+
+/* Decide whether or not the specified function needs to be tested and
+ * allocate/initialize data structures if needed. Returns a pointer to a
+ * reference function if the function should be tested, otherwise NULL */
+void *checkasm_check_func(void *const func, const char *const name, ...) {
+    char name_buf[256];
+    va_list arg;
+
+    va_start(arg, name);
+    const int name_length = vsnprintf(name_buf, sizeof(name_buf), name, arg);
+    va_end(arg);
+
+    if (!func || name_length <= 0 || name_length >= sizeof(name_buf))
+        return NULL;
+
+    state.current_func = get_func(&state.funcs, name_buf);
+    state.funcs->color = 1;
+    CheckasmFuncVersion *v = &state.current_func->versions;
+    void *ref = func;
+
+    if (v->func) {
+        CheckasmFuncVersion *prev;
+        do {
+            /* Only test functions that haven't already been tested */
+            if (v->func == func)
+                return NULL;
+
+            if (v->ok)
+                ref = v->func;
+
+            prev = v;
+        } while ((v = v->next));
+
+        v = prev->next = checkasm_malloc(sizeof(CheckasmFuncVersion));
+    }
+
+    v->func = func;
+    v->ok = 1;
+    v->cpu = state.cpu_flag;
+    state.current_func_ver = v;
+
+    if (state.cpu_flag)
+        state.num_checked++;
+
+    return ref;
+}
+
+/* Decide whether or not the current function needs to be benchmarked */
+int checkasm_bench_func(void) {
+    return !state.num_failed && state.bench_pattern &&
+           !strncmp(state.current_func->name, state.bench_pattern,
+                    state.bench_pattern_len);
+}
+
+/* Indicate that the current test has failed */
+void checkasm_fail_func(const char *const msg, ...) {
+    if (state.current_func_ver->cpu && state.current_func_ver->ok) {
+        va_list arg;
+
+        print_cpu_name();
+        fprintf(stderr, "   %s_%s (", state.current_func->name,
+                cpu_suffix(state.current_func_ver->cpu));
+        va_start(arg, msg);
+        vfprintf(stderr, msg, arg);
+        va_end(arg);
+        fprintf(stderr, ")\n");
+
+        state.current_func_ver->ok = 0;
+        state.num_failed++;
+    }
+}
+
+/* Update benchmark results of the current function */
+void checkasm_update_bench(const int iterations, const uint64_t cycles) {
+    state.current_func_ver->iterations += iterations;
+    state.current_func_ver->cycles += cycles;
+}
+
+/* Print the outcome of all tests performed since
+ * the last time this function was called */
+void checkasm_report(const char *const name, ...) {
+    static int prev_checked, prev_failed, max_length;
+
+    if (state.num_checked > prev_checked) {
+        int pad_length = max_length + 4;
+        va_list arg;
+
+        print_cpu_name();
+        pad_length -= fprintf(stderr, " - %s.", state.current_test_name);
+        va_start(arg, name);
+        pad_length -= vfprintf(stderr, name, arg);
+        va_end(arg);
+        fprintf(stderr, "%*c", imax(pad_length, 0) + 2, '[');
+
+        if (state.num_failed == prev_failed)
+            color_printf(COLOR_GREEN, "OK");
+        else
+            color_printf(COLOR_RED, "FAILED");
+        fprintf(stderr, "]\n");
+
+        prev_checked = state.num_checked;
+        prev_failed  = state.num_failed;
+    } else if (!state.cpu_flag) {
+        /* Calculate the amount of padding required
+         * to make the output vertically aligned */
+        int length = strlen(state.current_test_name);
+        va_list arg;
+
+        va_start(arg, name);
+        length += vsnprintf(NULL, 0, name, arg);
+        va_end(arg);
+
+        if (length > max_length)
+            max_length = length;
+    }
+}
--- /dev/null
+++ b/tests/checkasm/checkasm.h
@@ -1,0 +1,157 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __DAV1D_TESTS_CHECKASM_CHECKASM_H
+#define __DAV1D_TESTS_CHECKASM_CHECKASM_H
+
+#include "config.h"
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "include/common/attributes.h"
+#include "include/common/intops.h"
+
+void *checkasm_check_func(void *func, const char *name, ...);
+int checkasm_bench_func(void);
+void checkasm_fail_func(const char *msg, ...);
+void checkasm_update_bench(int iterations, uint64_t cycles);
+void checkasm_report(const char *name, ...);
+
+/* float compare utilities */
+int float_near_ulp(float a, float b, unsigned max_ulp);
+int float_near_abs_eps(float a, float b, float eps);
+int float_near_abs_eps_ulp(float a, float b, float eps, unsigned max_ulp);
+int float_near_ulp_array(const float *a, const float *b, unsigned max_ulp,
+                         int len);
+int float_near_abs_eps_array(const float *a, const float *b, float eps,
+                             int len);
+int float_near_abs_eps_array_ulp(const float *a, const float *b, float eps,
+                                 unsigned max_ulp, int len);
+
+static void *func_ref, *func_new;
+
+#define BENCH_RUNS (1 << 12) /* Trade-off between accuracy and speed */
+
+/* Decide whether or not the specified function needs to be tested */
+#define check_func(func, ...)\
+    (func_ref = checkasm_check_func((func_new = func), __VA_ARGS__))
+
+/* Declare the function prototype. The first argument is the return value,
+ * the remaining arguments are the function parameters. Naming parameters
+ * is optional. */
+#define declare_func(ret, ...)\
+    declare_new(ret, __VA_ARGS__) typedef ret func_type(__VA_ARGS__)
+
+/* Indicate that the current test has failed */
+#define fail() checkasm_fail_func("%s:%d", __FILE__, __LINE__)
+
+/* Print the test outcome */
+#define report checkasm_report
+
+/* Call the reference function */
+#define call_ref(...) ((func_type *)func_ref)(__VA_ARGS__)
+
+#if HAVE_ASM
+#if ARCH_X86
+#ifdef _MSC_VER
+#include <intrin.h>
+#define readtime() (_mm_lfence(), __rdtsc())
+#else
+static inline uint64_t readtime(void) {
+    uint32_t eax, edx;
+    __asm__ __volatile__("lfence\nrdtsc" : "=a"(eax), "=d"(edx));
+    return (((uint64_t)edx) << 32) | eax;
+}
+#define readtime readtime
+#endif
+
+/* Verifies that clobbered callee-saved registers
+ * are properly saved and restored */
+void checkasm_checked_call(void *func, ...);
+
+#if ARCH_X86_64
+/* Evil hack: detect incorrect assumptions that 32-bit ints are zero-extended
+ * to 64-bit. This is done by clobbering the stack with junk around the stack
+ * pointer and calling the assembly function through checked_call() with added
+ * dummy arguments which forces all real arguments to be passed on the stack
+ * and not in registers. For 32-bit arguments the upper half of the 64-bit
+ * register locations on the stack will now contain junk which will cause
+ * misbehaving functions to either produce incorrect output or segfault. Note
+ * that even though this works extremely well in practice, it's technically
+ * not guaranteed and false negatives is theoretically possible, but there
+ * can never be any false positives. */
+void checkasm_stack_clobber(uint64_t clobber, ...);
+#define declare_new(ret, ...)\
+    ret (*checked_call)(void *, int, int, int, int, int, __VA_ARGS__) =\
+    (void *)checkasm_checked_call;
+#define CLOB (UINT64_C(0xdeadbeefdeadbeef))
+#define call_new(...)\
+    (checkasm_stack_clobber(CLOB, CLOB, CLOB, CLOB, CLOB, CLOB, CLOB,\
+                            CLOB, CLOB, CLOB, CLOB, CLOB, CLOB, CLOB,\
+                            CLOB, CLOB, CLOB, CLOB, CLOB, CLOB, CLOB),\
+     checked_call(func_new, 0, 0, 0, 0, 0, __VA_ARGS__))
+#elif ARCH_X86_32
+#define declare_new(ret, ...)\
+    ret (*checked_call)(void *, __VA_ARGS__) = (void *)checkasm_checked_call;
+#define call_new(...) checked_call(func_new, __VA_ARGS__)
+#endif
+#else
+#define declare_new(ret, ...)
+/* Call the function */
+#define call_new(...) ((func_type *)func_new)(__VA_ARGS__)
+#endif
+#endif
+
+/* Benchmark the function */
+#ifdef readtime
+#define bench_new(...)\
+    do {\
+        if (checkasm_bench_func()) {\
+            func_type *tfunc = func_new;\
+            uint64_t tsum = 0;\
+            int ti, tcount = 0;\
+            for (ti = 0; ti < BENCH_RUNS; ti++) {\
+                uint64_t t = readtime();\
+                tfunc(__VA_ARGS__);\
+                tfunc(__VA_ARGS__);\
+                tfunc(__VA_ARGS__);\
+                tfunc(__VA_ARGS__);\
+                t = readtime() - t;\
+                if (t*tcount <= tsum*4 && ti > 0) {\
+                    tsum += t;\
+                    tcount++;\
+                }\
+            }\
+            checkasm_update_bench(tcount, tsum);\
+        }\
+    } while (0)
+#else
+#define bench_new(...) while (0)
+#endif
+
+#endif /* __DAV1D_TESTS_CHECKASM_CHECKASM_H */
--- /dev/null
+++ b/tests/checkasm/x86/checkasm.asm
@@ -1,0 +1,212 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%define private_prefix checkasm
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA
+
+error_message: db "failed to preserve register", 0
+
+%if ARCH_X86_64
+; just random numbers to reduce the chance of incidental match
+ALIGN 16
+x6:  dq 0x1a1b2550a612b48c,0x79445c159ce79064
+x7:  dq 0x2eed899d5a28ddcd,0x86b2536fcd8cf636
+x8:  dq 0xb0856806085e7943,0x3f2bf84fc0fcca4e
+x9:  dq 0xacbd382dcf5b8de2,0xd229e1f5b281303f
+x10: dq 0x71aeaff20b095fd9,0xab63e2e11fa38ed9
+x11: dq 0x89b0c0765892729a,0x77d410d5c42c882d
+x12: dq 0xc45ea11a955d8dd5,0x24b3c1d2a024048b
+x13: dq 0x2e8ec680de14b47c,0xdd7b8919edd42786
+x14: dq 0x135ce6888fa02cbf,0x11e53e2b2ac655ef
+x15: dq 0x011ff554472a7a10,0x6de8f4c914c334d5
+n7:  dq 0x21f86d66c8ca00ce
+n8:  dq 0x75b6ba21077c48ad
+n9:  dq 0xed56bb2dcb3c7736
+n10: dq 0x8bda43d3fd1a7e06
+n11: dq 0xb64a9c9e5d318408
+n12: dq 0xdf9a54b303f1d3a3
+n13: dq 0x4a75479abd64e097
+n14: dq 0x249214109d5d1c88
+%endif
+
+SECTION .text
+
+cextern fail_func
+
+; max number of args used by any asm function.
+; (max_args % 4) must equal 3 for stack alignment
+%define max_args 15
+
+%if ARCH_X86_64
+
+;-----------------------------------------------------------------------------
+; int checkasm_stack_clobber(uint64_t clobber, ...)
+;-----------------------------------------------------------------------------
+cglobal stack_clobber, 1,2
+    ; Clobber the stack with junk below the stack pointer
+    %define argsize (max_args+6)*8
+    SUB  rsp, argsize
+    mov   r1, argsize-8
+.loop:
+    mov [rsp+r1], r0
+    sub   r1, 8
+    jge .loop
+    ADD  rsp, argsize
+    RET
+
+%if WIN64
+    %assign free_regs 7
+    DECLARE_REG_TMP 4
+%else
+    %assign free_regs 9
+    DECLARE_REG_TMP 7
+%endif
+
+;-----------------------------------------------------------------------------
+; void checkasm_checked_call(void *func, ...)
+;-----------------------------------------------------------------------------
+INIT_XMM
+cglobal checked_call, 2,15,16,max_args*8+8
+    mov  t0, r0
+
+    ; All arguments have been pushed on the stack instead of registers in
+    ; order to test for incorrect assumptions that 32-bit ints are
+    ; zero-extended to 64-bit.
+    mov  r0, r6mp
+    mov  r1, r7mp
+    mov  r2, r8mp
+    mov  r3, r9mp
+%if UNIX64
+    mov  r4, r10mp
+    mov  r5, r11mp
+    %assign i 6
+    %rep max_args-6
+        mov  r9, [rsp+stack_offset+(i+1)*8]
+        mov  [rsp+(i-6)*8], r9
+        %assign i i+1
+    %endrep
+%else ; WIN64
+    %assign i 4
+    %rep max_args-4
+        mov  r9, [rsp+stack_offset+(i+7)*8]
+        mov  [rsp+i*8], r9
+        %assign i i+1
+    %endrep
+
+    ; Move possible floating-point arguments to the correct registers
+    movq m0, r0
+    movq m1, r1
+    movq m2, r2
+    movq m3, r3
+
+    %assign i 6
+    %rep 16-6
+        mova m %+ i, [x %+ i]
+        %assign i i+1
+    %endrep
+%endif
+
+%assign i 14
+%rep 15-free_regs
+    mov r %+ i, [n %+ i]
+    %assign i i-1
+%endrep
+    call t0
+%assign i 14
+%rep 15-free_regs
+    xor r %+ i, [n %+ i]
+    or  r14, r %+ i
+    %assign i i-1
+%endrep
+
+%if WIN64
+    %assign i 6
+    %rep 16-6
+        pxor m %+ i, [x %+ i]
+        por  m6, m %+ i
+        %assign i i+1
+    %endrep
+    packsswb m6, m6
+    movq r5, m6
+    or  r14, r5
+%endif
+
+    ; Call fail_func() with a descriptive message to mark it as a failure
+    ; if the called function didn't preserve all callee-saved registers.
+    ; Save the return value located in rdx:rax first to prevent clobbering.
+    jz .ok
+    mov  r9, rax
+    mov r10, rdx
+    lea  r0, [error_message]
+    xor eax, eax
+    call fail_func
+    mov rdx, r10
+    mov rax, r9
+.ok:
+    RET
+
+%else
+
+; just random numbers to reduce the chance of incidental match
+%define n3 dword 0x6549315c
+%define n4 dword 0xe02f3e23
+%define n5 dword 0xb78d0d1d
+%define n6 dword 0x33627ba7
+
+;-----------------------------------------------------------------------------
+; void checkasm_checked_call(void *func, ...)
+;-----------------------------------------------------------------------------
+cglobal checked_call, 1,7
+    mov  r3, n3
+    mov  r4, n4
+    mov  r5, n5
+    mov  r6, n6
+%rep max_args
+    PUSH dword [esp+20+max_args*4]
+%endrep
+    call r0
+    xor  r3, n3
+    xor  r4, n4
+    xor  r5, n5
+    xor  r6, n6
+    or   r3, r4
+    or   r5, r6
+    or   r3, r5
+    jz .ok
+    mov  r3, eax
+    mov  r4, edx
+    lea  r0, [error_message]
+    mov [esp], r0
+    call fail_func
+    mov  edx, r4
+    mov  eax, r3
+.ok:
+    add  esp, max_args*4
+    RET
+
+%endif ; ARCH_X86_64