shithub: hj264

ref: 71e5eaf30c76e8bbd82fe4c9592d4406d7fa1c08
author: Sigrid Solveig Haflínudóttir <[email protected]>
date: Tue Jul 13 08:08:57 EDT 2021

THAT'S A FIRST

--- /dev/null
+++ b/LICENSE
@@ -1,0 +1,1 @@
+Public domain.
--- /dev/null
+++ b/README.md
@@ -1,0 +1,3 @@
+# hj264
+
+H.264 encoder for Plan 9. WIP.
--- /dev/null
+++ b/hj264.c
@@ -1,0 +1,372 @@
+#define MINIH264_IMPLEMENTATION
+#define H264E_MAX_THREADS 7
+#include "minih264e.h"
+#include <thread.h>
+#include <bio.h>
+#include <draw.h>
+#include <memdraw.h>
+#include <tos.h>
+
+#define max(a,b) ((a)>(b)?(a):(b))
+#define min(a,b) ((a)<(b)?(a):(b))
+#define clp(v,a,b) min((b), max((v),(a)))
+#define align(p,a) (void*)((((uintptr)p - 1) | (a-1)) + 1) /* round up to a multiple of a (power of two) */
+
+enum {
+	Align = 64,
+	Maxquality = 10,
+	Gop = 20,
+};
+
+typedef struct Hjob Hjob;
+typedef struct Hjthread Hjthread;
+typedef struct Hj264 Hj264;
+
+struct Hjob {
+	void (*run)(void *);
+	void *arg;
+};
+
+struct Hjthread {
+	int id;
+	Channel *job;
+	Channel *done;
+};
+
+struct Hj264 {
+	H264E_persist_t *persist;
+	H264E_scratch_t *scratch;
+	H264E_run_param_t rp;
+	H264E_io_yuv_t yuv;
+	Hjthread threads[H264E_MAX_THREADS];
+	Hjob jobs[H264E_MAX_THREADS];
+	int nthreads;
+	u8int buf[1];
+};
+
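+/*
+ * Convert an XRGB32 image (b,g,r,x byte order) to planar YUV 4:2:0.
+ * Rows are handled in pairs: the even row contributes Y, U and V
+ * (chroma is taken from the top-left pixel of each 2x2 block),
+ * the odd row contributes Y only.
+ */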
+static void
+xrgb2yuv(u8int *src, int stride, int h, H264E_io_yuv_t *io)
+{
+	int x, y, r, g, b;
+	u8int *bgrx, *yuv[3];
+
+	yuv[0] = io->yuv[0];
+	yuv[1] = io->yuv[1];
+	yuv[2] = io->yuv[2];
+
+	for(y = 0; y < h;){
+		bgrx = &src[y * stride];
+		for(x = 0; x < stride/4;){
+			b = bgrx[0];
+			g = bgrx[1];
+			r = bgrx[2];
+			bgrx += 4;
+/* BT.601 integer approximation; this is not the "full" swing, just sayin' */
+#define YY ((( 66*r + 129*g +  25*b + 128) >> 8) +  16)
+#define UU (((-38*r -  74*g + 112*b + 128) >> 8) + 128)
+#define VV (((112*r -  94*g -  18*b + 128) >> 8) + 128)
+			yuv[0][x] = YY;
+			yuv[1][x/2] = UU;
+			yuv[2][x/2] = VV;
+			x++;
+
+			b = bgrx[0];
+			g = bgrx[1];
+			r = bgrx[2];
+			bgrx += 4;
+			yuv[0][x] = YY;
+			x++;
+		}
+		yuv[0] += io->stride[0];
+		y++;
+
+		for(x = 0; x < stride/4;){
+			b = bgrx[0];
+			g = bgrx[1];
+			r = bgrx[2];
+			bgrx += 4;
+			yuv[0][x] = YY;
+			x++;
+#undef YY
+#undef UU
+#undef VV
+		}
+		yuv[0] += io->stride[0];
+		yuv[1] += io->stride[1];
+		yuv[2] += io->stride[1];
+		y++;
+	}
+}
+
+static void
+threadf(void *p)
+{
+	Hjthread *t;
+	Hjob *j;
+	Channel *job, *done;
+
+	t = p;
+	threadsetname("hj264/%d", t->id);
+
+	job = t->job;
+	done = t->done;
+	for(sendp(done, nil); (j = recvp(job)) != nil; sendp(done, j))
+		j->run(j->arg);
+
+	chanfree(done);
+	chanfree(job);
+
+	threadexits(nil);
+}
+
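+/*
+ * Implements the H264E_create_param_t.run_func_in_thread callback:
+ * hands out up to nthreads jobs per batch to the worker procs and
+ * waits for each batch to finish before dispatching the next.
+ */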
+static void
+hjobsrun(void *p, void (*run)(void *), void **arg, int njob)
+{
+	int n, t;
+	Hj264 *h;
+	Hjob *j;
+
+	h = p;
+	for(n = 0; n < njob;){
+		for(t = 0; t < h->nthreads && n < njob; t++, n++){
+			j = &h->jobs[t];
+			j->run = run;
+			j->arg = arg[n];
+			sendp(h->threads[t].job, j);
+		}
+
+		for(t--; t >= 0; t--)
+			recvp(h->threads[t].done);
+	}
+}
+
+static int
+hj264_encode(Hj264 *h, u8int **data, int *sz)
+{
+	int e;
+
+	if((e = H264E_encode(h->persist, h->scratch, &h->rp, &h->yuv, data, sz)) != 0){
+		werrstr("H264E_encode: error %d", e);
+		return -1;
+	}
+
+	return 0;
+}
+
+static Hj264 *
+hj264new(int nthreads, int denoise, int kbps, int ww, int hh)
+{
+	int i, e, szscratch, szpersist, szyuv;
+	H264E_create_param_t cp;
+	Hjthread *t;
+	u8int *p;
+	Hj264 *h;
+
+	nthreads = clp(nthreads, 1, H264E_MAX_THREADS);
+
+	memset(&cp, 0, sizeof(cp));
+	cp.num_layers = 1;
+	cp.gop = Gop;
+	cp.max_threads = nthreads;
+	cp.const_input_flag = 1;
+	cp.temporal_denoise_flag = denoise;
+	cp.vbv_size_bytes = kbps*1000/8*2; /* 2 seconds */
+	cp.width = ww;
+	cp.height = hh;
+
+	if((e = H264E_sizeof(&cp, &szpersist, &szscratch)) != 0){
+		werrstr("H264E_sizeof: error %d", e);
+		return nil;
+	}
+
+	/* YUV buffers and strides must be rounded up to a multiple of 16 (macroblock size) */
+	ww = ((ww-1) | 15) + 1;
+	hh = ((hh-1) | 15) + 1;
+	szyuv = ww*hh*3/2;
+	if((h = calloc(1, sizeof(*h) + Align+szyuv + Align+szpersist + Align+szscratch)) == nil)
+		return nil;
+
+	p = align(h->buf, Align);
+	h->yuv.yuv[0] = p;
+	h->yuv.stride[0] = ww;
+	h->yuv.yuv[1] = p + ww*hh;
+	h->yuv.stride[1] = ww/2;
+	h->yuv.yuv[2] = p + ww*hh*5/4;
+	h->yuv.stride[2] = ww/2;
+	h->persist = align(p+szyuv, Align);
+	h->scratch = align(h->persist+szpersist, Align);
+
+	cp.token = h;
+	cp.run_func_in_thread = hjobsrun;
+	H264E_init(h->persist, &cp);
+
+	h->nthreads = nthreads;
+	for(i = 0; i < nthreads; i++){
+		t = &h->threads[i];
+		t->id = i;
+		t->job = chancreate(sizeof(void*), 0);
+		t->done = chancreate(sizeof(void*), 0);
+		proccreate(threadf, t, mainstacksize);
+		recvp(t->done);
+	}
+
+	return h;
+}
+
+static void
+hj264free(Hj264 *h)
+{
+	int i;
+
+	for(i = 0; i < h->nthreads; i++){
+		chanclose(h->threads[i].done);
+		chanclose(h->threads[i].job);
+	}
+
+	free(h);
+}
+
+static uvlong
+nanosec(void)
+{
+	static uvlong fasthz, xstart;
+	uvlong x, div;
+
+	if(fasthz == ~0ULL)
+		return nsec() - xstart;
+
+	if(fasthz == 0){
+		if(_tos->cyclefreq){
+			cycles(&xstart);
+			fasthz = _tos->cyclefreq;
+		} else {
+			xstart = nsec();
+			fasthz = ~0ULL;
+			fprint(2, "cyclefreq not available, falling back to nsec()\n");
+			fprint(2, "you might want to disable aux/timesync\n");
+			return 0;
+		}
+	}
+	cycles(&x);
+	x -= xstart;
+
+	/* this is ugly: scale x up (and div down) as far as overflow allows,
+	 * so x/(fasthz/div) approximates x*1e9/fasthz with decent precision */
+	for(div = 1000000000ULL; x < 0x1999999999999999ULL && div > 1 ; div /= 10ULL, x *= 10ULL);
+
+	return x / (fasthz / div);
+}
+
+static void
+usage(void)
+{
+	fprint(2, "usage: %s [-d] [-f FPS] [-n THREADS] [-k KBPS] [-q 0…10] [-Q QP]\n", argv0);
+	threadexitsall("usage");
+}
+
+int
+main(int argc, char **argv)
+{
+	int nthreads, fps, kbps, denoise, quality, qp;
+	int ww, hh, in, sz, srcsz, nframes;
+	uvlong start, end;
+	u8int *data, *src;
+	Memimage *im;
+	Biobuf out;
+	Hj264 *h;
+	char *s;
+
+	/* use NPROC-1 threads by default */
+	nthreads = ((s = getenv("NPROC")) != nil) ? atoi(s)-1 : 1;
+	denoise = 0;
+	quality = 10;
+	kbps = 0;
+	fps = 30;
+	qp = 33;
+	ARGBEGIN{
+	case 'd':
+		denoise++;
+		break;
+	case 'f':
+		fps = atoi(EARGF(usage()));
+		break;
+	case 'k':
+		kbps = atoi(EARGF(usage()));
+		break;
+	case 'n':
+		nthreads = atoi(EARGF(usage()));
+		break;
+	case 'q':
+		quality = atoi(EARGF(usage()));
+		break;
+	case 'Q':
+		qp = atoi(EARGF(usage()));
+		break;
+	default:
+		usage();
+	}ARGEND
+
+	if(argc < 1)
+		usage();
+	if((in = open(*argv, OREAD)) < 0)
+		sysfatal("input: %r");
+	if(Binit(&out, 1, OWRITE) < 0)
+		sysfatal("Binit failed: %r");
+
+	memimageinit();
+	nanosec();
+
+	if(quality > Maxquality)
+		quality = Maxquality;
+	if(kbps < 0)
+		kbps = 0;
+
+	src = nil;
+	srcsz = 0;
+	h = nil;
+	start = nanosec();
+	for(nframes = 0;; nframes++){
+		seek(in, 0, 0);
+		if((im = readmemimage(in)) == nil)
+			break;
+		ww = Dx(im->r);
+		hh = Dy(im->r);
+
+		if(h == nil){
+			srcsz = Dy(im->r)*(2+bytesperline(im->r, im->depth));
+			if((src = malloc(srcsz)) == nil)
+				sysfatal("memory");
+			unloadmemimage(im, im->r, src, srcsz);
+
+			if((h = hj264new(nthreads, denoise, kbps, ww, hh)) == nil)
+				sysfatal("hj264new: %r");
+			h->rp.encode_speed = Maxquality - quality;
+			h->rp.qp_min = h->rp.qp_max = qp;
+			if(kbps > 0){
+				h->rp.qp_min = 10;
+				h->rp.qp_max = 50;
+				h->rp.desired_frame_bytes = kbps*1000/8/fps;
+			}
+		}
+
+		unloadmemimage(im, im->r, src, srcsz);
+		xrgb2yuv(src, bytesperline(im->r, im->depth), Dy(im->r), &h->yuv);
+		freememimage(im);
+
+		if(hj264_encode(h, &data, &sz) != 0)
+			sysfatal("hj264_encode: %r");
+		if(Bwrite(&out, data, sz) != sz)
+			break;
+		if(nanosec() - start > 4000000000ULL)
+			break;
+	}
+	end = nanosec();
+	fprint(2, "%d fps\n", (int)(nframes / ((end - start)/1000000000ULL)));
+
+	/* FIXME flush on note */
+	Bflush(&out);
+	hj264free(h);
+
+	threadexitsall(nil);
+
+	return 0;
+}
--- /dev/null
+++ b/minih264e.h
@@ -1,0 +1,11718 @@
+#ifndef MINIH264_H
+#define MINIH264_H
+/*
+    https://github.com/lieff/minih264
+    To the extent possible under law, the author(s) have dedicated all copyright and related and neighboring rights to this software to the public domain worldwide.
+    This software is distributed without any warranty.
+    See <http://creativecommons.org/publicdomain/zero/1.0/>.
+*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef H264E_SVC_API
+#   define H264E_SVC_API 1
+#endif
+
+#ifndef H264E_MAX_THREADS
+#   define H264E_MAX_THREADS 4
+#endif
+
+/**
+*   API return error codes
+*/
+#define H264E_STATUS_SUCCESS                0
+#define H264E_STATUS_BAD_ARGUMENT           1
+#define H264E_STATUS_BAD_PARAMETER          2
+#define H264E_STATUS_BAD_FRAME_TYPE         3
+#define H264E_STATUS_SIZE_NOT_MULTIPLE_16   4
+#define H264E_STATUS_SIZE_NOT_MULTIPLE_2    5
+#define H264E_STATUS_BAD_LUMA_ALIGN         6
+#define H264E_STATUS_BAD_LUMA_STRIDE        7
+#define H264E_STATUS_BAD_CHROMA_ALIGN       8
+#define H264E_STATUS_BAD_CHROMA_STRIDE      9
+
+/**
+*   Frame type definitions
+*   - The sequence must start with a key (IDR) frame.
+*   - P (Predicted) frames are the most efficiently coded.
+*   - Droppable frames may be safely removed from the bitstream and used
+*     for frame-rate scalability.
+*   - Golden and Recovery frames are used for error recovery. These
+*     frames use the "long-term reference" for prediction, and
+*     can be decoded if the P-frame sequence is interrupted.
+*     They act similarly to key frames, but are coded more efficiently.
+*
+*   Type        Refers to   Saved as long-term  Saved as short-term
+*   ---------------------------------------------------------------
+*   Key (IDR) : N/A         Yes                 Yes                |
+*   Golden    : long-term   Yes                 Yes                |
+*   Recovery  : long-term   No                  Yes                |
+*   P         : short-term  No                  Yes                |
+*   Droppable : short-term  No                  No                 |
+*                                                                  |
+*   Example sequence:        K   P   P   G   D   P   R   D   K     |
+*   long-term reference       1K  1K  1K  4G  4G  4G  4G  4G  9K   |
+*                             /         \ /         \         /    |
+*   coded frame             1K  2P  3P  4G  5D  6P  7R  8D  9K     |
+*                             \ / \ / \   \ /   / \   \ /     \    |
+*   short-term reference      1K  2P  3P  4G  4G  6P  7R  7R  9K   |
+*
+*/
+#define H264E_FRAME_TYPE_DEFAULT    0       // Frame type set according to GOP size
+#define H264E_FRAME_TYPE_KEY        6       // Random access point: SPS+PPS+Intra frame
+#define H264E_FRAME_TYPE_I          5       // Intra frame: updates long & short-term reference
+#define H264E_FRAME_TYPE_GOLDEN     4       // Use and update long-term reference
+#define H264E_FRAME_TYPE_RECOVERY   3       // Use long-term reference, updates short-term reference
+#define H264E_FRAME_TYPE_P          2       // Use and update short-term reference
+#define H264E_FRAME_TYPE_DROPPABLE  1       // Use short-term reference, don't update anything
+#define H264E_FRAME_TYPE_CUSTOM     99      // Application specifies reference frame
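+
+// A minimal sketch: forcing a random access point for one particular frame
+// by overriding the GOP pattern in the run-time parameters:
+//
+//      H264E_run_param_t rp = {0};
+//      rp.frame_type = H264E_FRAME_TYPE_KEY;   // emit SPS+PPS+Intra frame
+//      // pass &rp to H264E_encode() for the frame that should be the key frame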
+
+/**
+*   Speed preset index.
+*   Currently used values are 0, 1, 8 and 9
+*/
+#define H264E_SPEED_SLOWEST         0       // All coding tools enabled, including denoise filter
+#define H264E_SPEED_BALANCED        5
+#define H264E_SPEED_FASTEST         10      // Minimum tools enabled
+
+/**
+*   Creations parameters
+*/
+typedef struct H264E_create_param_tag
+{
+    // Frame width: must be multiple of 16
+    int width;
+
+    // Frame height: must be multiple of 16
+    int height;
+
+    // GOP size == key frame period
+    // If 0: no key frames generated except 1st frame (infinite GOP)
+    // If 1: Only intra-frames produced
+    int gop;
+
+    // Video Buffer Verifier size, bytes
+    // If 0: the VBV model is disabled
+    // Note that this value defines the H.264 Level
+    int vbv_size_bytes;
+
+    // If set: transparent frames are produced on VBV overflow
+    // If not set: VBV overflow is ignored, producing a bitrate larger than specified
+    int vbv_overflow_empty_frame_flag;
+
+    // If set: keep the minimum bitrate using stuffing, preventing VBV underflow
+    // If not set: ignore VBV underflow, producing a bitrate smaller than specified
+    int vbv_underflow_stuffing_flag;
+
+    // If set: control bitrate at macroblock-level (better bitrate precision)
+    // If not set: control bitrate at frame-level (better quality)
+    int fine_rate_control_flag;
+
+    // If set: don't change input, but allocate additional frame buffer
+    // If not set: use input as a scratch
+    int const_input_flag;
+
+    // If 0: golden, recovery, and custom frames are disabled
+    // If >0: specifies the number of persistent frame buffers used
+    int max_long_term_reference_frames;
+
+    int enableNEON;
+
+    // If set: enable temporal noise suppression
+    int temporal_denoise_flag;
+
+    int sps_id;
+
+#if H264E_SVC_API
+    //          SVC extension
+    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    // Number of SVC layers:
+    // 1 = AVC
+    // 2 = SVC with 2-layers of spatial scalability
+    int num_layers;
+
+    // If set, SVC extension layer will use predictors from base layer
+    // (sometimes can slightly increase efficiency)
+    int inter_layer_pred_flag;
+#endif
+
+#if H264E_MAX_THREADS
+    //           Multi-thread extension
+    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    // Maximum threads, supported by the callback
+    int max_threads;
+
+    // Opaque token, passed to callback
+    void *token;
+
+    // Application-supplied callback function.
+    // This callback runs the given jobs by calling the provided job_func(),
+    // passing job_data[i] to each one.
+    //
+    // The h264e_thread_pool_run() can be used here, example:
+    //
+    //      int max_threads = 4;
+    //      void *thread_pool = h264e_thread_pool_init(max_threads);
+    //
+    //      H264E_create_param_t par;
+    //      par.max_threads = max_threads;
+    //      par.token = thread_pool;
+    //      par.run_func_in_thread = h264e_thread_pool_run;
+    //
+    // The reason to use double callbacks is to avoid mixing portable and
+    // system-dependent code, and to avoid a close() function in the encoder API.
+    //
+    void (*run_func_in_thread)(void *token, void (*job_func)(void*), void *job_data[], int njobs);
+#endif
+
+} H264E_create_param_t;
+
+/**
+*   Run-time parameters
+*/
+typedef struct H264E_run_param_tag
+{
+    // Variable indicating the speed/quality tradeoff
+    // 0 means best quality
+    int encode_speed;
+
+    // Frame type override: one of H264E_FRAME_TYPE_* values
+    // if 0: GOP pattern defined by create_param::gop value
+    int frame_type;
+
+    // Used only if frame_type == H264E_FRAME_TYPE_CUSTOM
+    // Reference long-term frame index [1..max_long_term_reference_frames]
+    // 0 = use previous frame (short-term)
+    // -1 = IDR frame, kill all long-term frames
+    int long_term_idx_use;
+
+    // Used only if frame_type == H264E_FRAME_TYPE_CUSTOM
+    // Store decoded frame in long-term buffer with given index in the
+    // range [1..max_long_term_reference_frames]
+    // 0 = save to short-term buffer
+    // -1 = Don't save frame (droppable)
+    int long_term_idx_update;
+
+    // Target frame size. Typically = bitrate/framerate
+    int desired_frame_bytes;
+
+    // Minimum quantizer value, 10 indicates good quality
+    // range: [10; qp_max]
+    int qp_min;
+
+    // Maximum quantizer value, 51 indicates very bad quality
+    // range: [qp_min; 51]
+    int qp_max;
+
+    // Desired NALU size. A NALU is produced as soon as its size exceeds this value
+    // if 0: the frame is coded with a single NALU
+    int desired_nalu_bytes;
+
+    // Optional NALU notification callback, called by the encoder
+    // as soon as NALU encoding completes.
+    void (*nalu_callback)(
+        const unsigned char *nalu_data, // Coded NALU data, w/o start code
+        int sizeof_nalu_data,           // Size of NALU data
+        void *token                     // optional transparent token
+        );
+
+    // token to pass to NALU callback
+    void *nalu_callback_token;
+
+} H264E_run_param_t;
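+
+// A minimal sketch of filling the run-time parameters for bitrate-controlled
+// encoding (bitrate_kbps and fps are illustrative application variables):
+//
+//      H264E_run_param_t rp;
+//      memset(&rp, 0, sizeof(rp));
+//      rp.encode_speed = H264E_SPEED_BALANCED;
+//      rp.qp_min = 10;
+//      rp.qp_max = 50;
+//      rp.desired_frame_bytes = bitrate_kbps*1000/8/fps;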
+
+/**
+*    Planar YUV420 descriptor
+*/
+typedef struct H264E_io_yuv_tag
+{
+    // Pointers to 3 pixel planes of YUV image
+    unsigned char *yuv[3];
+    // Stride for each image plane
+    int stride[3];
+} H264E_io_yuv_t;
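+
+// A minimal sketch of describing a W x H 4:2:0 frame stored in one contiguous
+// buffer (W, H and buf are illustrative; W and H assumed multiples of 16):
+//
+//      H264E_io_yuv_t frame;
+//      frame.yuv[0] = buf;            frame.stride[0] = W;
+//      frame.yuv[1] = buf + W*H;      frame.stride[1] = W/2;
+//      frame.yuv[2] = buf + W*H*5/4;  frame.stride[2] = W/2;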
+
+typedef struct H264E_persist_tag H264E_persist_t;
+typedef struct H264E_scratch_tag H264E_scratch_t;
+
+/**
+*   Return persistent and scratch memory requirements
+*   for given encoding options.
+*
+*   Return value:
+*       - zero in case of success
+*       - an error code (H264E_STATUS_*) if it fails
+*
+*   example:
+*
+*   int sizeof_persist, sizeof_scratch, error;
+*   H264E_persist_t * enc;
+*   H264E_scratch_t * scratch;
+*
+*   error = H264E_sizeof(param, &sizeof_persist, &sizeof_scratch);
+*   if (!error)
+*   {
+*       enc     = malloc(sizeof_persist);
+*       scratch = malloc(sizeof_scratch);
+*       error = H264E_init(enc, param);
+*   }
+*/
+int H264E_sizeof(
+    const H264E_create_param_t *param,  ///< Encoder creation parameters
+    int *sizeof_persist,                ///< [OUT] Size of persistent RAM
+    int *sizeof_scratch                 ///< [OUT] Size of scratch RAM
+);
+
+/**
+*   Initialize encoding session
+*
+*   Return value:
+*       - zero in case of success
+*       - an error code (H264E_STATUS_*) if it fails
+*/
+int H264E_init(
+    H264E_persist_t *enc,               ///< Encoder object
+    const H264E_create_param_t *param   ///< Encoder creation parameters
+);
+
+/**
+*   Encode single video frame
+*
+*   Output buffer is in the scratch RAM
+*
+*   Return value:
+*       - zero in case of success
+*       - an error code (H264E_STATUS_*) if it fails
+*/
+int H264E_encode(
+    H264E_persist_t *enc,               ///< Encoder object
+    H264E_scratch_t *scratch,           ///< Scratch memory
+    const H264E_run_param_t *run_param, ///< run-time parameters
+    H264E_io_yuv_t *frame,              ///< Input video frame
+    unsigned char **coded_data,         ///< [OUT] Pointer to coded data
+    int *sizeof_coded_data              ///< [OUT] Size of coded data
+);
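+
+// A minimal sketch of an encode call (error handling trimmed; enc, scratch,
+// rp and frame set up as in the examples above, out_file is illustrative):
+//
+//      unsigned char *coded;
+//      int coded_size;
+//      if (H264E_encode(enc, scratch, &rp, &frame, &coded, &coded_size) == H264E_STATUS_SUCCESS)
+//          fwrite(coded, 1, coded_size, out_file);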
+
+/**
+*   This is a "hack" function to set internal rate-control state
+*   Note that the encoder allows the application to completely override rate control,
+*   so this function should be used only by lazy coders who just want to change the
+*   VBV size without implementing custom rate-control.
+*
+*   Note that the H.264 level is defined by the VBV size at initialization.
+*/
+void H264E_set_vbv_state(
+    H264E_persist_t *enc,               ///< Encoder object
+    int vbv_size_bytes,                 ///< New VBV size
+    int vbv_fullness_bytes              ///< New VBV fullness, -1 = no change
+);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //MINIH264_H
+
+#if defined(MINIH264_IMPLEMENTATION) && !defined(MINIH264_IMPLEMENTATION_GUARD)
+#define MINIH264_IMPLEMENTATION_GUARD
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+/************************************************************************/
+/*      Build configuration                                             */
+/************************************************************************/
+#ifndef H264E_ENABLE_DENOISE
+#define H264E_ENABLE_DENOISE 1 // Built-in noise suppressor
+#endif
+
+#ifndef MAX_LONG_TERM_FRAMES
+#define MAX_LONG_TERM_FRAMES 8 // Max long-term frames count
+#endif
+
+#if !defined(MINIH264_ONLY_SIMD) && (defined(_M_X64) || defined(_M_ARM64) || defined(__x86_64__) || defined(__aarch64__))
+/* x64 always has SSE2, arm64 always has NEON, no need for generic code */
+#define MINIH264_ONLY_SIMD
+#endif /* SIMD checks... */
+
+#if (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))) || ((defined(__i386__) || defined(__x86_64__)) && defined(__SSE2__))
+#define H264E_ENABLE_SSE2 1
+#if defined(_MSC_VER)
+#include <intrin.h>
+#else
+#include <emmintrin.h>
+#endif
+#elif defined(__ARM_NEON) || defined(__aarch64__)
+#define H264E_ENABLE_NEON 1
+#include <arm_neon.h>
+#else
+#ifdef MINIH264_ONLY_SIMD
+#error MINIH264_ONLY_SIMD used, but SSE/NEON not enabled
+#endif
+#endif
+
+#ifndef MINIH264_ONLY_SIMD
+#define H264E_ENABLE_PLAIN_C 1
+#endif
+
+#define H264E_CONFIGS_COUNT ((H264E_ENABLE_SSE2) + (H264E_ENABLE_PLAIN_C) + (H264E_ENABLE_NEON))
+
+#if defined(__ARMCC_VERSION) || defined(_WIN32) || defined(__EMSCRIPTEN__)
+#define __BYTE_ORDER 0
+#define __BIG_ENDIAN 1
+#elif defined(__linux__) || defined(__CYGWIN__)
+#include <endian.h>
+#elif defined(__APPLE__)
+#include <libkern/OSByteOrder.h>
+#define __BYTE_ORDER BYTE_ORDER
+#define __BIG_ENDIAN BIG_ENDIAN
+#elif defined(__OpenBSD__) || defined(__NetBSD__) || defined(__FreeBSD__) || defined(__DragonFly__)
+#include <sys/endian.h>
+#elif __plan9__
+#define __LITTLE_ENDIAN 1234
+#define __BIG_ENDIAN 4321
+#define __BYTE_ORDER __LITTLE_ENDIAN
+#else
+#error platform not supported
+#endif
+
+#if defined(__aarch64__) && defined(__clang__)
+// uintptr_t broken with aarch64 clang on ubuntu 18
+#define uintptr_t unsigned long
+#endif
+#if defined(__arm__) && defined(__clang__)
+#include <arm_acle.h>
+#elif defined(__arm__) && defined(__GNUC__) && !defined(__ARMCC_VERSION)
+static inline unsigned int __usad8(unsigned int val1, unsigned int val2)
+{
+    unsigned int result;
+    __asm__ volatile ("usad8 %0, %1, %2\n\t"
+                      : "=r" (result)
+                      : "r" (val1), "r" (val2));
+    return result;
+}
+
+static inline unsigned int __usada8(unsigned int val1, unsigned int val2, unsigned int val3)
+{
+    unsigned int result;
+    __asm__ volatile ("usada8 %0, %1, %2, %3\n\t"
+                      : "=r" (result)
+                      : "r" (val1), "r" (val2), "r" (val3));
+    return result;
+}
+
+static inline unsigned int __sadd16(unsigned int val1, unsigned int val2)
+{
+    unsigned int result;
+    __asm__ volatile ("sadd16 %0, %1, %2\n\t"
+                      : "=r" (result)
+                      : "r" (val1), "r" (val2));
+    return result;
+}
+
+static inline unsigned int __ssub16(unsigned int val1, unsigned int val2)
+{
+    unsigned int result;
+    __asm__ volatile ("ssub16 %0, %1, %2\n\t"
+                      : "=r" (result)
+                      : "r" (val1), "r" (val2));
+    return result;
+}
+
+static inline unsigned int __clz(unsigned int val1)
+{
+    unsigned int result;
+    __asm__ volatile ("clz %0, %1\n\t"
+                      : "=r" (result)
+                      : "r" (val1));
+    return result;
+}
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif  //__cplusplus
+
+#if defined(_MSC_VER) && _MSC_VER >= 1400
+#   define h264e_restrict __restrict
+#elif defined(__arm__)
+#   define h264e_restrict __restrict
+#else
+#   define h264e_restrict
+#endif
+#if defined(_MSC_VER)
+#   define ALIGN(n) __declspec(align(n))
+#   define ALIGN2(n)
+#else
+#   define ALIGN(n)
+#   define ALIGN2(n) __attribute__((aligned(n)))
+#endif
+
+#if __GNUC__ || __clang__
+typedef int int_u __attribute__ ((__aligned__ (1)));
+#else
+typedef int int_u;
+#endif
+
+#ifndef MAX
+#   define MAX(x, y) ((x) > (y) ? (x) : (y))
+#endif
+
+#ifndef MIN
+#   define MIN(x, y) ((x) < (y) ? (x) : (y))
+#endif
+
+#ifndef ABS
+#   define ABS(x)    ((x) >= 0 ? (x) : -(x))
+#endif
+
+#define IS_ALIGNED(p, n) (!((uintptr_t)(p) & (uintptr_t)((n) - 1)))
+
+// bit-stream
+#if __BYTE_ORDER == __BIG_ENDIAN
+#   define SWAP32(x) (uint32_t)(x)
+#else
+#ifdef _MSC_VER
+#   define SWAP32(x) _byteswap_ulong(x)
+#elif defined(__GNUC__) || defined(__clang__)
+#   define SWAP32(x) __builtin_bswap32(x)
+#else
+#   define SWAP32(x) (uint32_t)((((x) >> 24) & 0xFF) | (((x) >> 8) & 0xFF00) | (((x) << 8) & 0xFF0000) | ((x & 0xFF) << 24))
+#endif
+#endif
+
+#define BS_OPEN(bs) uint32_t cache = bs->cache; int shift = bs->shift; uint32_t *buf = bs->buf;
+#define BS_CLOSE(bs) bs->cache = cache; bs->shift = shift; bs->buf = buf;
+#define BS_PUT(n, val)      \
+if ((shift -= n) < 0)       \
+{                           \
+    cache |= val >> -shift; \
+    *buf++ = SWAP32(cache); \
+    shift += 32;            \
+    cache = 0;              \
+}                           \
+cache |= (uint32_t)val << shift;
+
+// Quantizer-dequantizer modes
+#define QDQ_MODE_INTRA_4   2       // intra 4x4
+#define QDQ_MODE_INTER     8       // inter
+#define QDQ_MODE_INTRA_16  (8 + 1) // intra 16x16
+#define QDQ_MODE_CHROMA    (4 + 1) // chroma
+
+// put most frequently used bits to lsb, to use these as look-up tables
+#define AVAIL_TR    8
+#define AVAIL_TL    4
+#define AVAIL_L     2
+#define AVAIL_T     1
+
+typedef uint8_t     pix_t;
+typedef uint32_t    bs_item_t;
+
+/**
+*   Output bitstream
+*/
+typedef struct
+{
+    int         shift;  // bit position in the cache
+    uint32_t    cache;  // bit cache
+    bs_item_t    *buf;  // current position
+    bs_item_t  *origin; // initial position
+} bs_t;
+
+/**
+*   Tuple for motion vector, or height/width representation
+*/
+typedef union
+{
+    struct
+    {
+        int16_t x;      // horizontal or width
+        int16_t y;      // vertical or height
+    } s;
+    int32_t u32;        // packed representation
+} point_t;
+
+/**
+*   Rectangle
+*/
+typedef struct
+{
+    point_t tl;         // top-left corner
+    point_t br;         // bottom-right corner
+} rectangle_t;
+
+/**
+*   Quantized/dequantized representation for 4x4 block
+*/
+typedef struct
+{
+    int16_t qv[16];     // quantized coefficient
+    int16_t dq[16];     // dequantized
+} quant_t;
+
+/**
+*   Scratch RAM, used only for current MB encoding
+*/
+typedef struct H264E_scratch_tag
+{
+    pix_t mb_pix_inp[256];          // Input MB (cached)
+    pix_t mb_pix_store[4*256];      // Prediction variants
+
+    // Quantized/dequantized
+    int16_t dcy[16];                // Y DC
+    quant_t qy[16];                 // Y 16x4x4 blocks
+
+    int16_t dcu[16];                // U DC: 4 used + align
+    quant_t qu[4];                  // U 4x4x4 blocks
+
+    int16_t dcv[16];                // V DC: 4 used + align
+    quant_t qv[4];                  // V 4x4x4 blocks
+
+    // Quantized DC:
+    int16_t quant_dc[16];           // Y
+    int16_t quant_dc_u[4];          // U
+    int16_t quant_dc_v[4];          // V
+
+    uint16_t nz_mask;               // Bit flags for non-zero 4x4 blocks
+} scratch_t;
+
+/**
+*   Deblock filter frame context
+*/
+typedef struct
+{
+    // Motion vectors for 4x4 MB internal sub-blocks, top and left border,
+    // 5x5 array without top-left cell:
+    //     T0 T1 T2 T4
+    //  L0 i0 i1 i2 i3
+    //  L1 ...
+    //  ......
+    //
+    point_t df_mv[5*5 - 1];         // MV for current macroblock and neighbors
+    uint8_t *df_qp;                 // QP for current row of macroblocks
+    int8_t *mb_type;                // Macroblock type for current row of macroblocks
+    uint32_t nzflag;                // Bit flags for non-zero 4x4 blocks (left neighbors)
+
+    // Huffman and deblock use different nnz...
+    uint8_t *df_nzflag;             // Bit flags for non-zero 4x4 blocks (top neighbors), only 4 bits used
+} deblock_filter_t;
+
+/**
+*    Deblock filter parameters for current MB
+*/
+typedef struct
+{
+    uint32_t strength32[4*2];       // Strength for 4 columns and 4 rows
+    uint8_t tc0[16*2];              // TC0 parameter for 4 columns and 4 rows
+    uint8_t alpha[2*2];             // alpha for border/internals
+    uint8_t beta[2*2];              // beta for border/internals
+} deblock_params_t;
+
+/**
+*   Persistent RAM
+*/
+typedef struct H264E_persist_tag
+{
+    H264E_create_param_t param;     // Copy of create parameters
+    H264E_io_yuv_t inp;             // Input picture
+
+    struct
+    {
+        int pic_init_qp;            // Initial QP
+    } sps;
+
+    struct
+    {
+        int num;                    // Frame number
+        int nmbx;                   // Frame width, macroblocks
+        int nmby;                   // Frame height, macroblocks
+        int nmb;                    // Number of macroblocks in frame
+        int w;                      // Frame width, pixels
+        int h;                      // Frame height, pixels
+        rectangle_t mv_limit;       // Frame MV limits = frame + border extension
+        rectangle_t mv_qpel_limit;  // Reduced MV limits for qpel interpolation filter
+        int cropping_flag;          // Cropping indicator
+    } frame;
+
+    struct
+    {
+        int type;                   // Current slice type (I/P)
+        int start_mb_num;           // # of 1st MB in the current slice
+    } slice;
+
+    struct
+    {
+        int x;                      // MB x position (in MB's)
+        int y;                      // MB y position (in MB's)
+        int num;                    // MB number
+        int skip_run;               // Skip run count
+
+        // according to table 7-13
+        // -1 = skip, 0 = P16x16, 1 = P16x8, 2=P8x16, 3 = P8x8, 5 = I4x4, >=6 = I16x16
+        int type;                   // MB type
+
+        struct
+        {
+            int pred_mode_luma;     // Intra 16x16 prediction mode
+        } i16;
+
+        int8_t i4x4_mode[16];       // Intra 4x4 prediction modes
+
+        int cost;                   // Best coding cost
+        int avail;                  // Neighbor availability flags
+        point_t mvd[16];            // Delta-MV for each 4x4 sub-part
+        point_t mv[16];             // MV for each 4x4 sub-part
+
+        point_t mv_skip_pred;       // Skip MV predictor
+    } mb;
+
+    H264E_io_yuv_t ref;             // Current reference picture
+    H264E_io_yuv_t dec;             // Reconstructed current macroblock
+#if H264E_ENABLE_DENOISE
+    H264E_io_yuv_t denoise;         // Noise suppression filter
+#endif
+
+    unsigned char *lt_yuv[MAX_LONG_TERM_FRAMES][3]; // Long-term reference pictures
+    unsigned char lt_used[MAX_LONG_TERM_FRAMES];    // Long-term "used" flags
+
+    struct
+    {
+        int qp;                     // Current QP
+        int vbv_bits;               // Current VBV fullness, bits
+        int qp_smooth;              // Averaged QP
+        int dqp_smooth;             // Adaptive QP adjustment, account for "compressibility"
+        int max_dqp;                // Worst-case DQP, for long-term reference QP adjustment
+
+        int bit_budget;             // Frame bit budget
+        int prev_qp;                // Previous MB QP
+        int prev_err;               // Accumulated coded size error
+        int stable_count;           // Stable/not stable state machine
+
+        int vbv_target_level;       // Desired VBV fullness after frame encode
+
+        // Quantizer data, passed to low-level functions
+        // layout:
+        // multiplier_quant0, multiplier_dequant0,
+        // multiplier_quant2, multiplier_dequant2,
+        // multiplier_quant1, multiplier_dequant1,
+        // rounding_factor_pos,
+        // zero_thr_inter
+        // zero_thr_inter2
+        // ... and same data for chroma
+        //uint16_t qdat[2][(6 + 4)];
+#define OFFS_RND_INTER 6
+#define OFFS_RND_INTRA 7
+#define OFFS_THR_INTER 8
+#define OFFS_THR2_INTER 9
+#define OFFS_THR_1_OFF 10
+#define OFFS_THR_2_OFF 18
+#define OFFS_QUANT_VECT 26
+#define OFFS_DEQUANT_VECT 34
+        //struct
+        //{
+        //    uint16_t qdq[6];
+        //    uint16_t rnd[2]; // inter/intra
+        //    uint16_t thr[2]; // thresholds
+        //    uint16_t zero_thr[2][8];
+        //    uint16_t qfull[8];
+        //    uint16_t dqfull[8];
+        //} qdat[2];
+        uint16_t qdat[2][6 + 2 + 2 + 8 + 8 + 8 + 8];
+    } rc;
+
+    deblock_filter_t df;            // Deblock filter
+
+    // Speed/quality trade-off
+    struct
+    {
+        int disable_deblock;        // Disable deblock filter flags
+    } speed;
+
+    int most_recent_ref_frame_idx;  // Last updated long-term reference
+
+    // predictors contexts
+    point_t *mv_pred;               // MV for left&top 4x4 blocks
+    uint8_t *nnz;                   // Number of non-zero coeffs per 4x4 block for left&top
+    int32_t *i4x4mode;              // Intra 4x4 mode for left&top
+    pix_t *top_line;                // left&top neighbor pixels
+
+    // output data
+    uint8_t *out;                   // Output data storage (pointer to scratch RAM!)
+    unsigned int out_pos;           // Output byte position
+    bs_t bs[1];                     // Output bitbuffer
+
+    scratch_t *scratch;             // Pointer to scratch RAM
+#if H264E_MAX_THREADS > 1
+    scratch_t *scratch_store[H264E_MAX_THREADS];   // Pointer to scratch RAM
+    int sizeof_scaratch;
+#endif
+    H264E_run_param_t run_param;    // Copy of run-time parameters
+
+    // Consecutive IDR's must have different idr_pic_id,
+    // unless there are some P between them
+    uint8_t next_idr_pic_id;
+
+    pix_t *pbest;                   // Macroblock best predictor
+    pix_t *ptest;                   // Macroblock predictor under test
+
+    point_t mv_clusters[2];         // MV clusterization for prediction
+
+    // Flag to track short-term reference buffer, for MMCO 1 command
+    int short_term_used;
+
+#if H264E_SVC_API
+    //svc ext
+    int   current_layer;
+    int   adaptive_base_mode_flag;
+    void *enc_next;
+#endif
+
+} h264e_enc_t;
+
+#ifdef __cplusplus
+}
+#endif //__cplusplus
+/************************************************************************/
+/*      Constants                                                       */
+/************************************************************************/
+
+// Tunable constants can be adjusted by the "training" application
+#ifndef ADJUSTABLE
+#   define ADJUSTABLE static const
+#endif
+
+// Huffman encode tables
+#define CODE8(val, len) (uint8_t)((val << 4) + len)
+#define CODE(val, len) (uint8_t)((val << 4) + (len - 1))
+
+const uint8_t h264e_g_run_before[57] =
+{
+    15, 17, 20, 24, 29, 35, 42, 42, 42, 42, 42, 42, 42, 42, 42,
+    /**** Table #  0 size  2 ****/
+    CODE8(1, 1), CODE8(0, 1),
+    /**** Table #  1 size  3 ****/
+    CODE8(1, 1), CODE8(1, 2), CODE8(0, 2),
+    /**** Table #  2 size  4 ****/
+    CODE8(3, 2), CODE8(2, 2), CODE8(1, 2), CODE8(0, 2),
+    /**** Table #  3 size  5 ****/
+    CODE8(3, 2), CODE8(2, 2), CODE8(1, 2), CODE8(1, 3), CODE8(0, 3),
+    /**** Table #  4 size  6 ****/
+    CODE8(3, 2), CODE8(2, 2), CODE8(3, 3), CODE8(2, 3), CODE8(1, 3), CODE8(0, 3),
+    /**** Table #  5 size  7 ****/
+    CODE8(3, 2), CODE8(0, 3), CODE8(1, 3), CODE8(3, 3), CODE8(2, 3), CODE8(5, 3), CODE8(4, 3),
+    /**** Table #  6 size 15 ****/
+    CODE8(7, 3), CODE8(6, 3), CODE8(5, 3), CODE8(4, 3), CODE8(3, 3), CODE8(2,  3), CODE8(1,  3), CODE8(1, 4),
+    CODE8(1, 5), CODE8(1, 6), CODE8(1, 7), CODE8(1, 8), CODE8(1, 9), CODE8(1, 10), CODE8(1, 11),
+};
+
+const uint8_t h264e_g_total_zeros_cr_2x2[12] =
+{
+    3, 7, 10,
+    /**** Table #  0 size  4 ****/
+    CODE8(1, 1), CODE8(1, 2), CODE8(1, 3), CODE8(0, 3),
+    /**** Table #  1 size  3 ****/
+    CODE8(1, 1), CODE8(1, 2), CODE8(0, 2),
+    /**** Table #  2 size  2 ****/
+    CODE8(1, 1), CODE8(0, 1),
+};
+
+const uint8_t h264e_g_total_zeros[150] =
+{
+    15, 31, 46, 60, 73, 85, 96, 106, 115, 123, 130, 136, 141, 145, 148,
+    /**** Table #  0 size 16 ****/
+    CODE8(1, 1), CODE8(3, 3), CODE8(2, 3), CODE8(3, 4), CODE8(2, 4), CODE8(3, 5), CODE8(2, 5), CODE8(3, 6),
+    CODE8(2, 6), CODE8(3, 7), CODE8(2, 7), CODE8(3, 8), CODE8(2, 8), CODE8(3, 9), CODE8(2, 9), CODE8(1, 9),
+    /**** Table #  1 size 15 ****/
+    CODE8(7, 3), CODE8(6, 3), CODE8(5, 3), CODE8(4, 3), CODE8(3, 3), CODE8(5, 4), CODE8(4, 4), CODE8(3, 4),
+    CODE8(2, 4), CODE8(3, 5), CODE8(2, 5), CODE8(3, 6), CODE8(2, 6), CODE8(1, 6), CODE8(0, 6),
+    /**** Table #  2 size 14 ****/
+    CODE8(5, 4), CODE8(7, 3), CODE8(6, 3), CODE8(5, 3), CODE8(4, 4), CODE8(3, 4), CODE8(4, 3), CODE8(3, 3),
+    CODE8(2, 4), CODE8(3, 5), CODE8(2, 5), CODE8(1, 6), CODE8(1, 5), CODE8(0, 6),
+    /**** Table #  3 size 13 ****/
+    CODE8(3, 5), CODE8(7, 3), CODE8(5, 4), CODE8(4, 4), CODE8(6, 3), CODE8(5, 3), CODE8(4, 3), CODE8(3, 4),
+    CODE8(3, 3), CODE8(2, 4), CODE8(2, 5), CODE8(1, 5), CODE8(0, 5),
+    /**** Table #  4 size 12 ****/
+    CODE8(5, 4), CODE8(4, 4), CODE8(3, 4), CODE8(7, 3), CODE8(6, 3), CODE8(5, 3), CODE8(4, 3), CODE8(3, 3),
+    CODE8(2, 4), CODE8(1, 5), CODE8(1, 4), CODE8(0, 5),
+    /**** Table #  5 size 11 ****/
+    CODE8(1, 6), CODE8(1, 5), CODE8(7, 3), CODE8(6, 3), CODE8(5, 3), CODE8(4, 3), CODE8(3, 3), CODE8(2, 3),
+    CODE8(1, 4), CODE8(1, 3), CODE8(0, 6),
+    /**** Table #  6 size 10 ****/
+    CODE8(1, 6), CODE8(1, 5), CODE8(5, 3), CODE8(4, 3), CODE8(3, 3), CODE8(3, 2), CODE8(2, 3), CODE8(1, 4),
+    CODE8(1, 3), CODE8(0, 6),
+    /**** Table #  7 size  9 ****/
+    CODE8(1, 6), CODE8(1, 4), CODE8(1, 5), CODE8(3, 3), CODE8(3, 2), CODE8(2, 2), CODE8(2, 3), CODE8(1, 3),
+    CODE8(0, 6),
+    /**** Table #  8 size  8 ****/
+    CODE8(1, 6), CODE8(0, 6), CODE8(1, 4), CODE8(3, 2), CODE8(2, 2), CODE8(1, 3), CODE8(1, 2), CODE8(1, 5),
+    /**** Table #  9 size  7 ****/
+    CODE8(1, 5), CODE8(0, 5), CODE8(1, 3), CODE8(3, 2), CODE8(2, 2), CODE8(1, 2), CODE8(1, 4),
+    /**** Table # 10 size  6 ****/
+    CODE8(0, 4), CODE8(1, 4), CODE8(1, 3), CODE8(2, 3), CODE8(1, 1), CODE8(3, 3),
+    /**** Table # 11 size  5 ****/
+    CODE8(0, 4), CODE8(1, 4), CODE8(1, 2), CODE8(1, 1), CODE8(1, 3),
+    /**** Table # 12 size  4 ****/
+    CODE8(0, 3), CODE8(1, 3), CODE8(1, 1), CODE8(1, 2),
+    /**** Table # 13 size  3 ****/
+    CODE8(0, 2), CODE8(1, 2), CODE8(1, 1),
+    /**** Table # 14 size  2 ****/
+    CODE8(0, 1), CODE8(1, 1),
+};
+
+const uint8_t h264e_g_coeff_token[277 + 18] =
+{
+    17 + 18, 17 + 18,
+    82 + 18, 82 + 18,
+    147 + 18, 147 + 18, 147 + 18, 147 + 18,
+    212 + 18, 212 + 18, 212 + 18, 212 + 18, 212 + 18, 212 + 18, 212 + 18, 212 + 18, 212 + 18,
+    0 + 18,
+    /**** Table #  4 size 17 ****/     // offs: 0
+    CODE(1, 2), CODE(1, 1), CODE(1, 3), CODE(5, 6), CODE(7, 6), CODE(6, 6), CODE(2, 7), CODE(0, 7), CODE(4, 6),
+    CODE(3, 7), CODE(2, 8), CODE(0, 0), CODE(3, 6), CODE(3, 8), CODE(0, 0), CODE(0, 0), CODE(2, 6),
+    /**** Table #  0 size 65 ****/     // offs: 17
+    CODE( 1,  1), CODE( 1,  2), CODE( 1,  3), CODE( 3,  5), CODE( 5,  6), CODE( 4,  6), CODE( 5,  7), CODE( 3,  6),
+    CODE( 7,  8), CODE( 6,  8), CODE( 5,  8), CODE( 4,  7), CODE( 7,  9), CODE( 6,  9), CODE( 5,  9), CODE( 4,  8),
+    CODE( 7, 10), CODE( 6, 10), CODE( 5, 10), CODE( 4,  9), CODE( 7, 11), CODE( 6, 11), CODE( 5, 11), CODE( 4, 10),
+    CODE(15, 13), CODE(14, 13), CODE(13, 13), CODE( 4, 11), CODE(11, 13), CODE(10, 13), CODE( 9, 13), CODE(12, 13),
+    CODE( 8, 13), CODE(14, 14), CODE(13, 14), CODE(12, 14), CODE(15, 14), CODE(10, 14), CODE( 9, 14), CODE( 8, 14),
+    CODE(11, 14), CODE(14, 15), CODE(13, 15), CODE(12, 15), CODE(15, 15), CODE(10, 15), CODE( 9, 15), CODE( 8, 15),
+    CODE(11, 15), CODE( 1, 15), CODE(13, 16), CODE(12, 16), CODE(15, 16), CODE(14, 16), CODE( 9, 16), CODE( 8, 16),
+    CODE(11, 16), CODE(10, 16), CODE( 5, 16), CODE( 0,  0), CODE( 7, 16), CODE( 6, 16), CODE( 0,  0), CODE( 0,  0), CODE( 4, 16),
+    /**** Table #  1 size 65 ****/     // offs: 82
+    CODE( 3,  2), CODE( 2,  2), CODE( 3,  3), CODE( 5,  4), CODE(11,  6), CODE( 7,  5), CODE( 9,  6), CODE( 4,  4),
+    CODE( 7,  6), CODE(10,  6), CODE( 5,  6), CODE( 6,  5), CODE( 7,  7), CODE( 6,  6), CODE( 5,  7), CODE( 8,  6),
+    CODE( 7,  8), CODE( 6,  7), CODE( 5,  8), CODE( 4,  6), CODE( 4,  8), CODE( 6,  8), CODE( 5,  9), CODE( 4,  7),
+    CODE( 7,  9), CODE( 6,  9), CODE(13, 11), CODE( 4,  9), CODE(15, 11), CODE(14, 11), CODE( 9, 11), CODE(12, 11),
+    CODE(11, 11), CODE(10, 11), CODE(13, 12), CODE( 8, 11), CODE(15, 12), CODE(14, 12), CODE( 9, 12), CODE(12, 12),
+    CODE(11, 12), CODE(10, 12), CODE(13, 13), CODE(12, 13), CODE( 8, 12), CODE(14, 13), CODE( 9, 13), CODE( 8, 13),
+    CODE(15, 13), CODE(10, 13), CODE( 6, 13), CODE( 1, 13), CODE(11, 13), CODE(11, 14), CODE(10, 14), CODE( 4, 14),
+    CODE( 7, 13), CODE( 8, 14), CODE( 5, 14), CODE( 0,  0), CODE( 9, 14), CODE( 6, 14), CODE( 0,  0), CODE( 0,  0), CODE( 7, 14),
+    /**** Table #  2 size 65 ****/     // offs: 147
+    CODE(15,  4), CODE(14,  4), CODE(13,  4), CODE(12,  4), CODE(15,  6), CODE(15,  5), CODE(14,  5), CODE(11,  4),
+    CODE(11,  6), CODE(12,  5), CODE(11,  5), CODE(10,  4), CODE( 8,  6), CODE(10,  5), CODE( 9,  5), CODE( 9,  4),
+    CODE(15,  7), CODE( 8,  5), CODE(13,  6), CODE( 8,  4), CODE(11,  7), CODE(14,  6), CODE( 9,  6), CODE(13,  5),
+    CODE( 9,  7), CODE(10,  6), CODE(13,  7), CODE(12,  6), CODE( 8,  7), CODE(14,  7), CODE(10,  7), CODE(12,  7),
+    CODE(15,  8), CODE(14,  8), CODE(13,  8), CODE(12,  8), CODE(11,  8), CODE(10,  8), CODE( 9,  8), CODE( 8,  8),
+    CODE(15,  9), CODE(14,  9), CODE(13,  9), CODE(12,  9), CODE(11,  9), CODE(10,  9), CODE( 9,  9), CODE(10, 10),
+    CODE( 8,  9), CODE( 7,  9), CODE(11, 10), CODE( 6, 10), CODE(13, 10), CODE(12, 10), CODE( 7, 10), CODE( 2, 10),
+    CODE( 9, 10), CODE( 8, 10), CODE( 3, 10), CODE( 0,  0), CODE( 5, 10), CODE( 4, 10), CODE( 0,  0), CODE( 0,  0), CODE( 1, 10),
+    /**** Table #  3 size 65 ****/     // offs: 212
+     3,  1,  6, 11,  0,  5, 10, 15,  4,  9, 14, 19,  8, 13, 18, 23, 12, 17, 22, 27, 16, 21, 26, 31, 20, 25, 30, 35,
+    24, 29, 34, 39, 28, 33, 38, 43, 32, 37, 42, 47, 36, 41, 46, 51, 40, 45, 50, 55, 44, 49, 54, 59, 48, 53, 58, 63,
+    52, 57, 62,  0, 56, 61,  0,  0, 60
+};
+
+/*
+    Block scan order
+    0 1 4 5
+    2 3 6 7
+    8 9 C D
+    A B E F
+*/
+static const uint8_t decode_block_scan[16] = { 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15 };
+
+static const uint8_t qpy2qpc[52] = {  // todo: [0 - 9] not used
+    0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12,
+   13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+   26, 27, 28, 29, 29, 30, 31, 32, 32, 33, 34, 34, 35,
+   35, 36, 36, 37, 37, 37, 38, 38, 38, 39, 39, 39, 39,
+};
+
+/**
+*   Rate-control LUT for intra/inter macroblocks: number of bits per macroblock for given QP
+*   Estimated experimentally
+*/
+static const uint16_t bits_per_mb[2][42 - 1] =
+{
+    // 10                                                          20                                                          30                                                          40                                                          50
+    { 664,  597,  530,  484,  432,  384,  341,  297,  262,  235,  198,  173,  153,  131,  114,  102,   84,   74,   64,   54,   47,   42,   35,   31,   26,   22,   20,   17,   15,   13,   12,   10,    9,    9,    7,    7,    6,    5,    4,    1,    1}, // P
+    {1057,  975,  925,  868,  803,  740,  694,  630,  586,  547,  496,  457,  420,  378,  345,  318,  284,  258,  234,  210,  190,  178,  155,  141,  129,  115,  102,   95,   82,   75,   69,   60,   55,   51,   45,   41,   40,   35,   31,   28,   24}  // I
+};
+
+/**
+*   Deblock filter constants:
+*   <alpha> <thr[1]> <thr[2]> <thr[3]> <beta>
+*/
+static const uint8_t g_a_tc0_b[52 - 10][5] = {
+    {  0,  0,  0,  0,  0},  // 10
+    {  0,  0,  0,  0,  0},  // 11
+    {  0,  0,  0,  0,  0},  // 12
+    {  0,  0,  0,  0,  0},  // 13
+    {  0,  0,  0,  0,  0},  // 14
+    {  0,  0,  0,  0,  0},  // 15
+    {  4,  0,  0,  0,  2},
+    {  4,  0,  0,  1,  2},
+    {  5,  0,  0,  1,  2},
+    {  6,  0,  0,  1,  3},
+    {  7,  0,  0,  1,  3},
+    {  8,  0,  1,  1,  3},
+    {  9,  0,  1,  1,  3},
+    { 10,  1,  1,  1,  4},
+    { 12,  1,  1,  1,  4},
+    { 13,  1,  1,  1,  4},
+    { 15,  1,  1,  1,  6},
+    { 17,  1,  1,  2,  6},
+    { 20,  1,  1,  2,  7},
+    { 22,  1,  1,  2,  7},
+    { 25,  1,  1,  2,  8},
+    { 28,  1,  2,  3,  8},
+    { 32,  1,  2,  3,  9},
+    { 36,  2,  2,  3,  9},
+    { 40,  2,  2,  4, 10},
+    { 45,  2,  3,  4, 10},
+    { 50,  2,  3,  4, 11},
+    { 56,  3,  3,  5, 11},
+    { 63,  3,  4,  6, 12},
+    { 71,  3,  4,  6, 12},
+    { 80,  4,  5,  7, 13},
+    { 90,  4,  5,  8, 13},
+    {101,  4,  6,  9, 14},
+    {113,  5,  7, 10, 14},
+    {127,  6,  8, 11, 15},
+    {144,  6,  8, 13, 15},
+    {162,  7, 10, 14, 16},
+    {182,  8, 11, 16, 16},
+    {203,  9, 12, 18, 17},
+    {226, 10, 13, 20, 17},
+    {255, 11, 15, 23, 18},
+    {255, 13, 17, 25, 18},
+};
+
+/************************************************************************/
+/*  Adjustable encoder parameters. Initial MIN_QP values never used     */
+/************************************************************************/
+
+ADJUSTABLE uint16_t g_rnd_inter[] = {
+    11665, 11665, 11665, 11665, 11665, 11665, 11665, 11665, 11665, 11665,
+    11665, 12868, 14071, 15273, 16476,
+    17679, 17740, 17801, 17863, 17924,
+    17985, 17445, 16904, 16364, 15823,
+    15283, 15198, 15113, 15027, 14942,
+    14857, 15667, 16478, 17288, 18099,
+    18909, 19213, 19517, 19822, 20126,
+    20430, 16344, 12259, 8173, 4088,
+    4088, 4088, 4088, 4088, 4088,
+    4088, 4088,
+};
+
+ADJUSTABLE uint16_t g_thr_inter[] = {
+    31878, 31878, 31878, 31878, 31878, 31878, 31878, 31878, 31878, 31878,
+    31878, 33578, 35278, 36978, 38678,
+    40378, 41471, 42563, 43656, 44748,
+    45841, 46432, 47024, 47615, 48207,
+    48798, 49354, 49911, 50467, 51024,
+    51580, 51580, 51580, 51580, 51580,
+    51580, 52222, 52864, 53506, 54148,
+    54790, 45955, 37120, 28286, 19451,
+    10616, 9326, 8036, 6745, 5455,
+    4165, 4165,
+};
+
+ADJUSTABLE uint16_t g_thr_inter2[] = {
+    45352, 45352, 45352, 45352, 45352, 45352, 45352, 45352, 45352, 45352,
+    45352, 41100, 36848, 32597, 28345,
+    24093, 25904, 27715, 29525, 31336,
+    33147, 33429, 33711, 33994, 34276,
+    34558, 32902, 31246, 29590, 27934,
+    26278, 26989, 27700, 28412, 29123,
+    29834, 29038, 28242, 27445, 26649,
+    25853, 23440, 21028, 18615, 16203,
+    13790, 11137, 8484, 5832, 3179,
+    526, 526,
+};
+
+ADJUSTABLE uint16_t g_skip_thr_inter[52] =
+{
+    45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+    45, 45, 45, 44, 44,
+    44, 40, 37, 33, 30,
+    26, 32, 38, 45, 51,
+    57, 58, 58, 59, 59,
+    60, 66, 73, 79, 86,
+    92, 95, 98, 100, 103,
+    106, 200, 300, 400, 500,
+    600, 700, 800, 900, 1000,
+    1377, 1377,
+};
+
+ADJUSTABLE uint16_t g_lambda_q4[52] =
+{
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 13, 11, 10, 8,
+    7, 11, 15, 20, 24,
+    28, 30, 31, 33, 34,
+    36, 48, 60, 71, 83,
+    95, 95, 95, 96, 96,
+    96, 113, 130, 147, 164,
+    181, 401, 620, 840, 1059,
+    1279, 1262, 1246, 1229, 1213,
+    1196, 1196,
+};
+ADJUSTABLE uint16_t g_lambda_mv_q4[52] =
+{
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 14, 15, 15, 16,
+    17, 18, 20, 21, 23,
+    24, 28, 32, 37, 41,
+    45, 53, 62, 70, 79,
+    87, 105, 123, 140, 158,
+    176, 195, 214, 234, 253,
+    272, 406, 541, 675, 810,
+    944, 895, 845, 796, 746,
+    697, 697,
+};
+
+ADJUSTABLE uint16_t g_skip_thr_i4x4[52] =
+{
+    0,1,2,3,4,5,6,7,8,9,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+    44, 44, 44, 44, 44, 44, 44, 44, 44, 44,
+    68, 68, 68, 68, 68, 68, 68, 68, 68, 68,
+    100, 100,
+};
+
+ADJUSTABLE uint16_t g_deadzonei[] = {
+    3419, 3419, 3419, 3419, 3419, 3419, 3419, 3419, 3419, 3419,
+    30550, 8845, 14271, 19698, 25124,
+    30550, 29556, 28562, 27569, 26575,
+    25581, 25284, 24988, 24691, 24395,
+    24098, 24116, 24134, 24153, 24171,
+    24189, 24010, 23832, 23653, 23475,
+    23296, 23569, 23842, 24115, 24388,
+    24661, 19729, 14797, 9865, 4933,
+    24661, 3499, 6997, 10495, 13993,
+    17491, 17491,
+};
+
+ADJUSTABLE uint16_t g_lambda_i4_q4[] = {
+    27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+    27, 31, 34, 38, 41,
+    45, 76, 106, 137, 167,
+    198, 220, 243, 265, 288,
+    310, 347, 384, 421, 458,
+    495, 584, 673, 763, 852,
+    941, 1053, 1165, 1276, 1388,
+    1500, 1205, 910, 614, 319,
+    5000, 1448, 2872, 4296, 5720,
+    7144, 7144,
+};
+
+ADJUSTABLE uint16_t g_lambda_i16_q4[] = {
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0,
+    0, 3, 7, 10, 14,
+    17, 14, 10, 7, 3,
+    50, 20, 39, 59, 78,
+    98, 94, 89, 85, 80,
+    76, 118, 161, 203, 246,
+    288, 349, 410, 470, 531,
+    592, 575, 558, 540, 523,
+    506, 506,
+};
+
+const uint8_t g_diff_to_gainQ8[256] =
+{
+    0, 16, 25, 32, 37, 41, 44, 48, 50, 53, 55, 57, 59, 60, 62, 64, 65,
+    66, 67, 69, 70, 71, 72, 73, 74, 75, 76, 76, 77, 78, 79, 80, 80,
+    81, 82, 82, 83, 83, 84, 85, 85, 86, 86, 87, 87, 88, 88, 89, 89,
+    90, 90, 91, 91, 92, 92, 92, 93, 93, 94, 94, 94, 95, 95, 96, 96,
+    96, 97, 97, 97, 98, 98, 98, 99, 99, 99, 99, 100, 100, 100, 101, 101,
+    101, 102, 102, 102, 102, 103, 103, 103, 103, 104, 104, 104, 104, 105, 105, 105,
+    105, 106, 106, 106, 106, 106, 107, 107, 107, 107, 108, 108, 108, 108, 108, 109,
+    109, 109, 109, 109, 110, 110, 110, 110, 110, 111, 111, 111, 111, 111, 112, 112,
+    112, 112, 112, 112, 113, 113, 113, 113, 113, 113, 114, 114, 114, 114, 114, 114,
+    115, 115, 115, 115, 115, 115, 115, 116, 116, 116, 116, 116, 116, 117, 117, 117,
+    117, 117, 117, 117, 118, 118, 118, 118, 118, 118, 118, 118, 119, 119, 119, 119,
+    119, 119, 119, 119, 120, 120, 120, 120, 120, 120, 120, 120, 121, 121, 121, 121,
+    121, 121, 121, 121, 122, 122, 122, 122, 122, 122, 122, 122, 122, 123, 123, 123,
+    123, 123, 123, 123, 123, 123, 124, 124, 124, 124, 124, 124, 124, 124, 124, 125,
+    125, 125, 125, 125, 125, 125, 125, 125, 125, 126, 126, 126, 126, 126, 126, 126,
+    126, 126, 126, 126, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 128,
+};
+
+#if H264E_ENABLE_SSE2 && !defined(MINIH264_ASM)
+#define BS_BITS 32
+
+static void h264e_bs_put_bits_sse2(bs_t *bs, unsigned n, unsigned val)
+{
+    assert(!(val >> n));
+    bs->shift -= n;
+    assert((unsigned)n <= 32);
+    if (bs->shift < 0)
+    {
+        assert(-bs->shift < 32);
+        bs->cache |= val >> -bs->shift;
+        *bs->buf++ = SWAP32(bs->cache);
+        bs->shift = 32 + bs->shift;
+        bs->cache = 0;
+    }
+    bs->cache |= val << bs->shift;
+}
+
+static void h264e_bs_flush_sse2(bs_t *bs)
+{
+    *bs->buf = SWAP32(bs->cache);
+}
+
+static unsigned h264e_bs_get_pos_bits_sse2(const bs_t *bs)
+{
+    unsigned pos_bits = (unsigned)((bs->buf - bs->origin)*BS_BITS);
+    pos_bits += BS_BITS - bs->shift;
+    assert((int)pos_bits >= 0);
+    return pos_bits;
+}
+
+static unsigned h264e_bs_byte_align_sse2(bs_t *bs)
+{
+    int pos = h264e_bs_get_pos_bits_sse2(bs);
+    h264e_bs_put_bits_sse2(bs, -pos & 7, 0);
+    return pos + (-pos & 7);
+}
+
+/**
+*   Golomb code
+*   0 => 1
+*   1 => 01 0
+*   2 => 01 1
+*   3 => 001 00
+*   4 => 001 01
+*
+*   [0]     => 1
+*   [1..2]  => 01x
+*   [3..6]  => 001xx
+*   [7..14] => 0001xxx
+*
+*/
+static void h264e_bs_put_golomb_sse2(bs_t *bs, unsigned val)
+{
+    int size;
+#if defined(_MSC_VER)
+    unsigned long nbit;
+    _BitScanReverse(&nbit, val + 1);
+    size = 1 + nbit;
+#else
+    size = 32 - __builtin_clz(val + 1);
+#endif
+    h264e_bs_put_bits_sse2(bs, 2*size - 1, val + 1);
+}
+
+/**
+*   signed Golomb code.
+*   mapping to unsigned code:
+*       0 => 0
+*       1 => 1
+*      -1 => 2
+*       2 => 3
+*      -2 => 4
+*       3 => 5
+*      -3 => 6
+*/
+static void h264e_bs_put_sgolomb_sse2(bs_t *bs, int val)
+{
+    val = 2*val - 1;
+    val ^= val >> 31;
+    h264e_bs_put_golomb_sse2(bs, val);
+}
+
+static void h264e_bs_init_bits_sse2(bs_t *bs, void *data)
+{
+    bs->origin = data;
+    bs->buf = bs->origin;
+    bs->shift = BS_BITS;
+    bs->cache = 0;
+}
+
+static unsigned __clz_cavlc(unsigned v)
+{
+#if defined(_MSC_VER)
+    unsigned long nbit;
+    _BitScanReverse(&nbit, v);
+    return 31 - nbit;
+#else
+    return __builtin_clz(v);
+#endif
+}
+
+static void h264e_vlc_encode_sse2(bs_t *bs, int16_t *quant, int maxNumCoeff, uint8_t *nz_ctx)
+{
+    int nnz_context, nlevels, nnz; // nnz = nlevels + trailing_ones
+    unsigned trailing_ones = 0;
+    unsigned trailing_ones_sign = 0;
+    uint8_t runs[16];
+    uint8_t *prun = runs;
+    int16_t *levels;
+    int cloop = maxNumCoeff;
+    int v, drun;
+    unsigned zmask;
+    BS_OPEN(bs)
+
+    ALIGN(16) int16_t zzquant[16] ALIGN2(16);
+    levels = zzquant + ((maxNumCoeff == 4) ? 4 : 16);
+    if (maxNumCoeff != 4)
+    {
+        __m128i y0, y1;
+        __m128i x0 = _mm_load_si128((__m128i *)quant);
+        __m128i x1 = _mm_load_si128((__m128i *)(quant + 8));
+#define SWAP_XMM(x, i, j)     { int t0 = _mm_extract_epi16(x, i); int t1 = _mm_extract_epi16(x, j); x = _mm_insert_epi16(x, t0, j); x = _mm_insert_epi16(x, t1, i); }
+#define SWAP_XMM2(x, y, i, j) { int t0 = _mm_extract_epi16(x, i); int t1 = _mm_extract_epi16(y, j); y = _mm_insert_epi16(y, t0, j); x = _mm_insert_epi16(x, t1, i); }
+        SWAP_XMM(x0, 3, 4);
+        SWAP_XMM(x1, 3, 4);
+        SWAP_XMM2(x0, x1, 5, 2);
+        x0 = _mm_shufflelo_epi16(x0, 0 + (3 << 2) + (1 << 4) + (2 << 6));
+        x0 = _mm_shufflehi_epi16(x0, 2 + (0 << 2) + (3 << 4) + (1 << 6));
+        x1 = _mm_shufflelo_epi16(x1, 2 + (0 << 2) + (3 << 4) + (1 << 6));
+        x1 = _mm_shufflehi_epi16(x1, 1 + (2 << 2) + (0 << 4) + (3 << 6));
+        y0 = _mm_unpacklo_epi64(x0, x1);
+        y1 = _mm_unpackhi_epi64(x0, x1);
+        y0 = _mm_slli_epi16(y0, 1);
+        y1 = _mm_slli_epi16(y1, 1);
+        zmask = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_packs_epi16(y0, y1), _mm_setzero_si128()));
+        _mm_store_si128((__m128i *)zzquant, y0);
+        _mm_store_si128((__m128i *)(zzquant + 8), y1);
+
+        if (maxNumCoeff == 15)
+            zmask |= 1;
+        zmask = (~zmask) << 16;
+
+        v = 15;
+        drun = (maxNumCoeff == 16) ? 1 : 0;
+    } else
+    {
+        __m128i x0 = _mm_loadl_epi64((__m128i *)quant);
+        x0 = _mm_slli_epi16(x0, 1);
+        zmask = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_packs_epi16(x0, x0), _mm_setzero_si128()));
+        _mm_storel_epi64((__m128i *)zzquant, x0);
+        zmask = (~zmask) << 28;
+        drun = 1;
+        v = 3;
+    }
+
+    if (zmask)
+    {
+        do
+        {
+            int i = __clz_cavlc(zmask);
+            *--levels = zzquant[v -= i];
+            *prun++ = (uint8_t)(v + drun);
+            zmask <<= (i + 1);
+            v--;
+        } while(zmask);
+        quant = zzquant + ((maxNumCoeff == 4) ? 4 : 16);
+        nnz = (int)(quant - levels);
+
+        cloop = MIN(3, nnz);
+        levels = quant - 1;
+        do
+        {
+            if ((unsigned)(*levels + 2) > 4u)
+            {
+                break;
+            }
+            trailing_ones_sign = (trailing_ones_sign << 1) | (*levels-- < 0);
+            trailing_ones++;
+        } while (--cloop);
+    } else
+    {
+        nnz = trailing_ones = 0;
+    }
+    nlevels = nnz - trailing_ones;
+
+    nnz_context = nz_ctx[-1] + nz_ctx[1];
+
+    nz_ctx[0] = (uint8_t)nnz;
+    if (nnz_context <= 34)
+    {
+        nnz_context = (nnz_context + 1) >> 1;
+    }
+    nnz_context &= 31;
+
+    // 9.2.1 Parsing process for total number of transform coefficient levels and trailing ones
+    {
+        int off = h264e_g_coeff_token[nnz_context];
+        unsigned n = 6, val = h264e_g_coeff_token[off + trailing_ones + 4*nlevels];
+        if (off != 230)
+        {
+            n = (val & 15) + 1;
+            val >>= 4;
+        }
+        BS_PUT(n, val);
+    }
+
+    if (nnz)
+    {
+        if (trailing_ones)
+        {
+            BS_PUT(trailing_ones, trailing_ones_sign);
+        }
+        if (nlevels)
+        {
+            int vlcnum = 1;
+            int sym_len, prefix_len;
+
+            int sym = *levels-- - 2;
+            if (sym < 0) sym = -3 - sym;
+            if (sym >= 6) vlcnum++;
+            if (trailing_ones < 3)
+            {
+                sym -= 2;
+                if (nnz > 10)
+                {
+                    sym_len = 1;
+                    prefix_len = sym >> 1;
+                    if (prefix_len >= 15)
+                    {
+                        // or vlcnum = 1;  goto escape;
+                        prefix_len = 15;
+                        sym_len = 12;
+                    }
+                    sym -= prefix_len << 1;
+                    // bypass vlcnum advance due to sym -= 2; above
+                    goto loop_enter;
+                }
+            }
+
+            if (sym < 14)
+            {
+                prefix_len = sym;
+                sym = 0; // to avoid side effect in bitbuf
+                sym_len = 0;
+            } else if (sym < 30)
+            {
+                prefix_len = 14;
+                sym_len = 4;
+                sym -= 14;
+            } else
+            {
+                vlcnum = 1;
+                goto escape;
+            }
+            goto loop_enter;
+
+            for (;;)
+            {
+                sym_len = vlcnum;
+                prefix_len = sym >> vlcnum;
+                if (prefix_len >= 15)
+                {
+escape:
+                    prefix_len = 15;
+                    sym_len = 12;
+                }
+                sym -= prefix_len << vlcnum;
+
+                if (prefix_len >= 3 && vlcnum < 6) vlcnum++;
+loop_enter:
+                sym |= 1 << sym_len;
+                sym_len += prefix_len+1;
+                BS_PUT(sym_len, (unsigned)sym);
+                if (!--nlevels) break;
+                sym = *levels-- - 2;
+                if (sym < 0) sym = -3 - sym;
+            }
+        }
+
+        if (nnz < maxNumCoeff)
+        {
+            const uint8_t *vlc = (maxNumCoeff == 4) ? h264e_g_total_zeros_cr_2x2 : h264e_g_total_zeros;
+            uint8_t *run = runs;
+            int run_prev = *run++;
+            int nzeros = run_prev - nnz;
+            int zeros_left = 2*nzeros - 1;
+            int ctx = nnz - 1;
+            run[nnz - 1] = (uint8_t)maxNumCoeff; // terminator
+            for(;;)
+            {
+                int t;
+                //encode_huff8(bs, vlc, ctx, nzeros);
+
+                unsigned val = vlc[vlc[ctx] + nzeros];
+                unsigned n = val & 15;
+                val >>= 4;
+                BS_PUT(n, val);
+
+                zeros_left -= nzeros;
+                if (zeros_left < 0)
+                {
+                    break;
+                }
+
+                t = *run++;
+                nzeros = run_prev - t - 1;
+                if (nzeros < 0)
+                {
+                    break;
+                }
+                run_prev = t;
+                assert(zeros_left < 14);
+                vlc = h264e_g_run_before;
+                ctx = zeros_left;
+            }
+        }
+    }
+    BS_CLOSE(bs);
+}
+
+#define MM_LOAD_8TO16_2(p) _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(p)), _mm_setzero_si128())
+static __inline __m128i subabs128_16(__m128i a, __m128i b)
+{
+    return _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
+}
+static __inline __m128i clone2x16(const void *p)
+{
+    __m128i tmp = MM_LOAD_8TO16_2(p);
+    return _mm_unpacklo_epi16(tmp, tmp);
+}
+static __inline __m128i subabs128(__m128i a, __m128i b)
+{
+    return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
+}
+
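+// Transpose an 8x8 block of bytes; lets the vertical deblocking filters reuse the horizontal code paths.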
+static void transpose8x8_sse(uint8_t *dst, int dst_stride, uint8_t *src, int src_stride)
+{
+    __m128i a = _mm_loadl_epi64((__m128i *)(src));
+    __m128i b = _mm_loadl_epi64((__m128i *)(src += src_stride));
+    __m128i c = _mm_loadl_epi64((__m128i *)(src += src_stride));
+    __m128i d = _mm_loadl_epi64((__m128i *)(src += src_stride));
+    __m128i e = _mm_loadl_epi64((__m128i *)(src += src_stride));
+    __m128i f = _mm_loadl_epi64((__m128i *)(src += src_stride));
+    __m128i g = _mm_loadl_epi64((__m128i *)(src += src_stride));
+    __m128i h = _mm_loadl_epi64((__m128i *)(src += src_stride));
+
+    __m128i p0 = _mm_unpacklo_epi8(a,b);  // b7 a7 b6 a6 ... b0 a0
+    __m128i p1 = _mm_unpacklo_epi8(c,d);  // d7 c7 d6 c6 ... d0 c0
+    __m128i p2 = _mm_unpacklo_epi8(e,f);  // f7 e7 f6 e6 ... f0 e0
+    __m128i p3 = _mm_unpacklo_epi8(g,h);  // h7 g7 h6 g6 ... h0 g0
+
+    __m128i q0 = _mm_unpacklo_epi16(p0, p1);  // d3c3 b3a3 ... d0c0 b0a0
+    __m128i q1 = _mm_unpackhi_epi16(p0, p1);  // d7c7 b7a7 ... d4c4 b4a4
+    __m128i q2 = _mm_unpacklo_epi16(p2, p3);  // h3g3 f3e3 ... h0g0 f0e0
+    __m128i q3 = _mm_unpackhi_epi16(p2, p3);  // h7g7 f7e7 ... h4g4 f4e4
+
+    __m128i r0 = _mm_unpacklo_epi32(q0, q2);  // h1g1f1e1 d1c1b1a1 h0g0f0e0 d0c0b0a0
+    __m128i r1 = _mm_unpackhi_epi32(q0, q2);  // h3g3f3e3 d3c3b3a3 h2g2f2e2 d2c2b2a2
+    __m128i r2 = _mm_unpacklo_epi32(q1, q3);
+    __m128i r3 = _mm_unpackhi_epi32(q1, q3);
+    _mm_storel_epi64((__m128i *)(dst), r0); dst += dst_stride; _mm_storel_epi64((__m128i *)(dst), _mm_unpackhi_epi64(r0, r0)); dst += dst_stride;
+    _mm_storel_epi64((__m128i *)(dst), r1); dst += dst_stride; _mm_storel_epi64((__m128i *)(dst), _mm_unpackhi_epi64(r1, r1)); dst += dst_stride;
+    _mm_storel_epi64((__m128i *)(dst), r2); dst += dst_stride; _mm_storel_epi64((__m128i *)(dst), _mm_unpackhi_epi64(r2, r2)); dst += dst_stride;
+    _mm_storel_epi64((__m128i *)(dst), r3); dst += dst_stride; _mm_storel_epi64((__m128i *)(dst), _mm_unpackhi_epi64(r3, r3)); dst += dst_stride;
+}
+
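+// Filter one horizontal chroma edge (8 samples, 16-bit math):
+// bS < 4 clips the delta to tc0 + 1, bS == 4 applies the strong filter (2*p1 + p0 + q1 + 2) >> 2.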
+static void deblock_chroma_h_s4_sse(uint8_t *pq0, int stride, const void* threshold, int alpha, int beta, uint32_t argstr)
+{
+    __m128i thr, str, d;
+    __m128i p1 = MM_LOAD_8TO16_2(pq0 - 2*stride);
+    __m128i p0 = MM_LOAD_8TO16_2(pq0 - stride);
+    __m128i q0 = MM_LOAD_8TO16_2(pq0);
+    __m128i q1 = MM_LOAD_8TO16_2(pq0 + stride);
+    __m128i zero = _mm_setzero_si128();
+    __m128i _alpha = _mm_set1_epi16((short)alpha);
+    __m128i _beta = _mm_set1_epi16((short)beta);
+    __m128i tmp;
+
+    str =                    _mm_cmplt_epi16(subabs128_16(p0, q0), _alpha);
+    str = _mm_and_si128(str, _mm_cmplt_epi16(_mm_max_epi16(subabs128_16(p1, p0), subabs128_16(q1, q0)), _beta));
+
+    if ((uint8_t)argstr != 4)
+    {
+        d = _mm_srai_epi16(_mm_add_epi16(_mm_sub_epi16(_mm_add_epi16(_mm_slli_epi16(_mm_sub_epi16(q0, p0), 2), p1), q1),_mm_set1_epi16(4)), 3);
+        thr = _mm_add_epi16(clone2x16(threshold), _mm_set1_epi16(1));
+        d = _mm_min_epi16(_mm_max_epi16(d, _mm_sub_epi16(zero, thr)), thr);
+
+        tmp = _mm_unpacklo_epi8(_mm_cvtsi32_si128(argstr), _mm_setzero_si128());
+        tmp = _mm_unpacklo_epi16(tmp, tmp);
+
+//        str = _mm_and_si128(str, _mm_cmpgt_epi16(clone2x16(strength), zero));
+        str = _mm_and_si128(str, _mm_cmpgt_epi16(tmp, zero));
+        d = _mm_and_si128(str, d);
+        p0 = _mm_add_epi16(p0, d);
+        q0 = _mm_sub_epi16(q0, d);
+    } else
+    {
+        __m128i pq = _mm_add_epi16(p1, q1);
+        __m128i newp = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(pq, p1), p0), 1);
+        __m128i newq = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(pq, q1), q0), 1);
+        p0 = _mm_xor_si128(_mm_and_si128(_mm_xor_si128(_mm_avg_epu16(newp,zero), p0), str), p0);
+        q0 = _mm_xor_si128(_mm_and_si128(_mm_xor_si128(_mm_avg_epu16(newq,zero), q0), str), q0);
+    }
+    _mm_storel_epi64((__m128i*)(pq0 - stride), _mm_packus_epi16(p0, zero));
+    _mm_storel_epi64((__m128i*)(pq0         ), _mm_packus_epi16(q0, zero));
+}
+
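+// Vertical chroma edge: gather the p1 p0 q0 q1 columns into a temporary 8x4 block,
+// run the horizontal filter on it, then write the modified p0/q0 columns back.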
+static void deblock_chroma_v_s4_sse(uint8_t *pix, int stride, const void* threshold, int alpha, int beta, uint32_t str)
+{
+    uint8_t t8x4[8*4];
+    int i;
+    uint8_t *p = pix - 2;
+    __m128i t0 =_mm_unpacklo_epi16(
+        _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int_u*)p),              _mm_cvtsi32_si128(*(int_u*)(p + stride))),
+        _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int_u*)(p + 2*stride)), _mm_cvtsi32_si128(*(int_u*)(p + 3*stride)))
+        );
+    __m128i t1 =_mm_unpacklo_epi16(
+        _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int_u*)(p + 4*stride)), _mm_cvtsi32_si128(*(int_u*)(p + 5*stride))),
+        _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int_u*)(p + 6*stride)), _mm_cvtsi32_si128(*(int_u*)(p + 7*stride)))
+        );
+    __m128i p1 = _mm_unpacklo_epi32(t0, t1);
+    __m128i p0 = _mm_shuffle_epi32 (p1, 0x4E); // 01001110b
+    __m128i q0 = _mm_unpackhi_epi32(t0, t1);
+    __m128i q1 = _mm_shuffle_epi32 (q0, 0x4E);
+    _mm_storel_epi64((__m128i*)(t8x4), p1);
+    _mm_storel_epi64((__m128i*)(t8x4 + 8), p0);
+    _mm_storel_epi64((__m128i*)(t8x4 + 16), q0);
+    _mm_storel_epi64((__m128i*)(t8x4 + 24), q1);
+    deblock_chroma_h_s4_sse(t8x4 + 16, 8, threshold, alpha, beta, str);
+
+    for (i = 0; i < 8; i++)
+    {
+        pix[-1] = t8x4[8  + i];
+        pix[ 0] = t8x4[16 + i];
+        pix += stride;
+    }
+}
+
+#define CMP_BETA(p, q, beta)   _mm_cmpeq_epi8(_mm_subs_epu8(_mm_subs_epu8(p, q), beta), _mm_subs_epu8(_mm_subs_epu8(q, p), beta))
+#define CMP_1(p, q, beta)     (_mm_subs_epu8(subabs128(p, q), beta))
+
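+// Strong (bS == 4) filter for a horizontal luma edge: 16 pixels handled as two 8-pixel halves
+// in 16-bit precision; the long filter variants are used where |p0-q0| < (alpha>>2)+2 and the
+// corresponding |p2-p0| / |q2-q0| < beta conditions hold.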
+static void deblock_luma_h_s4_sse(uint8_t *pix, int stride, int alpha, int beta)
+{
+    int ccloop = 2;
+    do
+    {
+        __m128i p3 = MM_LOAD_8TO16_2(pix - 4*stride);
+        __m128i p2 = MM_LOAD_8TO16_2(pix - 3*stride);
+        __m128i p1 = MM_LOAD_8TO16_2(pix - 2*stride);
+        __m128i p0 = MM_LOAD_8TO16_2(pix - stride);
+        __m128i q0 = MM_LOAD_8TO16_2(pix);
+        __m128i q1 = MM_LOAD_8TO16_2(pix + stride);
+        __m128i q2 = MM_LOAD_8TO16_2(pix + 2*stride);
+        __m128i q3 = MM_LOAD_8TO16_2(pix + 3*stride);
+        __m128i zero = _mm_setzero_si128();
+        __m128i _alpha = _mm_set1_epi16((short)alpha);
+        __m128i _quarteralpha = _mm_set1_epi16((short)((alpha >> 2) + 2));
+        __m128i _beta = _mm_set1_epi16((short)beta);
+        __m128i ap_less_beta;
+        __m128i aq_less_beta;
+        __m128i str;
+        __m128i pq;
+        __m128i short_p;
+        __m128i short_q;
+        __m128i long_p;
+        __m128i long_q;
+        __m128i t;
+        __m128i p0q0_less__quarteralpha;
+
+        __m128i absdif_p0_q0 = subabs128_16(p0, q0);
+        __m128i p0_plus_q0 = _mm_add_epi16(_mm_add_epi16(p0, q0), _mm_set1_epi16(2));
+
+        // if (abs_p0_q0 < alpha && abs_p1_p0 < beta && abs_q1_q0 < beta)
+        str = _mm_cmplt_epi16(absdif_p0_q0, _alpha);
+        //str = _mm_and_si128(str, _mm_cmplt_epi16(subabs128_16(p1, p0), _beta));
+        //str = _mm_and_si128(str, _mm_cmplt_epi16(subabs128_16(q1, q0), _beta));
+        str = _mm_and_si128(str, _mm_cmplt_epi16(_mm_max_epi16(subabs128_16(p1, p0), subabs128_16(q1, q0)), _beta));
+        p0q0_less__quarteralpha = _mm_and_si128(_mm_cmplt_epi16(absdif_p0_q0, _quarteralpha), str);
+
+        //int short_p = (2*p1 + p0 + q1 + 2);
+        //int short_q = (2*q1 + q0 + p1 + 2);
+        short_p = _mm_avg_epu8(_mm_avg_epu8(p0, q1),p1);
+        pq = _mm_add_epi16(_mm_add_epi16(p1, q1), _mm_set1_epi16(2));
+        short_p = _mm_add_epi16(_mm_add_epi16(pq, p1), p0);
+        short_q = _mm_add_epi16(_mm_add_epi16(pq, q1), q0);
+
+        ap_less_beta = _mm_and_si128(_mm_cmplt_epi16(subabs128_16(p2, p0), _beta), p0q0_less__quarteralpha);
+        t = _mm_add_epi16(_mm_add_epi16(p2, p1), p0_plus_q0);
+        // short_p += t - p1 + q0;
+        long_p = _mm_srai_epi16(_mm_add_epi16(_mm_sub_epi16(_mm_add_epi16(short_p, t), p1), q0), 1);
+
+        _mm_storel_epi64((__m128i*)(pix - 2*stride), _mm_packus_epi16(_mm_or_si128(_mm_and_si128(ap_less_beta, _mm_srai_epi16(t, 2)), _mm_andnot_si128(ap_less_beta, p1)), zero));
+        t = _mm_add_epi16(_mm_add_epi16(_mm_slli_epi16(_mm_add_epi16(p3, p2), 1), t), _mm_set1_epi16(2));
+        _mm_storel_epi64((__m128i*)(pix - 3*stride), _mm_packus_epi16(_mm_or_si128(_mm_and_si128(ap_less_beta, _mm_srai_epi16(t, 3)), _mm_andnot_si128(ap_less_beta, p2)), zero));
+
+        aq_less_beta = _mm_and_si128(_mm_cmplt_epi16(subabs128_16(q2, q0), _beta), p0q0_less__quarteralpha);
+        t = _mm_add_epi16(_mm_add_epi16(q2, q1), p0_plus_q0);
+        long_q = _mm_srai_epi16(_mm_add_epi16(_mm_sub_epi16(_mm_add_epi16(short_q, t), q1), p0), 1);
+        _mm_storel_epi64((__m128i*)(pix + 1*stride), _mm_packus_epi16(_mm_or_si128(_mm_and_si128(aq_less_beta, _mm_srai_epi16(t, 2)), _mm_andnot_si128(aq_less_beta, q1)), zero));
+
+        t = _mm_add_epi16(_mm_add_epi16(_mm_slli_epi16(_mm_add_epi16(q3, q2), 1), t), _mm_set1_epi16(2));
+        _mm_storel_epi64((__m128i*)(pix + 2*stride), _mm_packus_epi16(_mm_or_si128(_mm_and_si128(aq_less_beta, _mm_srai_epi16(t, 3)), _mm_andnot_si128(aq_less_beta, q2)), zero));
+
+        short_p = _mm_srai_epi16(_mm_or_si128(_mm_and_si128(ap_less_beta, long_p), _mm_andnot_si128(ap_less_beta, short_p)), 2);
+        short_q = _mm_srai_epi16(_mm_or_si128(_mm_and_si128(aq_less_beta, long_q), _mm_andnot_si128(aq_less_beta, short_q)), 2);
+
+        _mm_storel_epi64((__m128i*)(pix - stride), _mm_packus_epi16(_mm_or_si128(_mm_and_si128(str, short_p), _mm_andnot_si128(str, p0)), zero));
+        _mm_storel_epi64((__m128i*)(pix         ), _mm_packus_epi16(_mm_or_si128(_mm_and_si128(str, short_q), _mm_andnot_si128(str, q0)), zero));
+
+        pix += 8;
+    } while (--ccloop);
+}
+
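+// Strong (bS == 4) filter for a vertical luma edge: transpose the 16x8 edge neighbourhood into
+// scratch, filter it horizontally, then transpose back.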
+static void deblock_luma_v_s4_sse(uint8_t *pix, int stride, int alpha, int beta)
+{
+    __m128i scratch[8];
+    uint8_t *s = pix - 4;
+    uint8_t *dst = (uint8_t *)scratch;
+    int cloop = 2;
+    do
+    {
+        transpose8x8_sse(dst, 16, s, stride);
+        s += 8*stride;
+        dst += 8;
+    } while(--cloop);
+
+    deblock_luma_h_s4_sse((uint8_t *)(scratch+4), 16, alpha, beta);
+    s = pix - 4;
+    dst = (uint8_t *)scratch;
+    cloop = 2;
+    do
+    {
+        transpose8x8_sse(s, stride, dst, 16);
+        s += 8*stride;
+        dst += 8;
+    } while(--cloop);
+}
+
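+// Normal (bS < 4) filter for a horizontal luma edge, done entirely in 8-bit saturating
+// arithmetic; the clipped delta is built from averages using the identities below: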
+// (a-b) >> 1s == ((a + ~b + 1) >> 1u) - 128;
+//
+// delta = (((q0-p0)<<2) + (p1-q1) + 4) >> 3
+//       = (4*q0 - 4*p0 + p1 - q1 + 4) >> 3
+//       = ((p1-p0) - (q1-q0) - 3*(p0-q0) + 4) >> 3
+//       = ((p1-p0) - (q1-q0) - 3*p0 + 3*q0 + 4) >> 3
+//      ~= (((p1-p0)-p0)>>1 - ((q1-q0)-q0)>>1 - p0 + q0 + 2) >> 2
+//      ~= ((((p1-p0)-p0)>>1 - p0)>>1 - (((q1-q0)-q0)>>1 - q0)>>1 + 1) >> 1
+static void deblock_luma_h_s3_sse(uint8_t *h264e_restrict pix, int stride, int alpha, int beta, const void* threshold, uint32_t strength)
+{
+    __m128i p1 = _mm_loadu_si128((__m128i *)(pix - 2*stride));
+    __m128i p0 = _mm_loadu_si128((__m128i *)(pix - stride));
+    __m128i q0 = _mm_loadu_si128((__m128i *)pix);
+    __m128i q1 = _mm_loadu_si128((__m128i *)(pix + stride));
+    __m128i maskp, maskq, zeromask, thr;
+    __m128i tc0tmp, p2, q2, p0q0avg, _beta;
+
+#define HALFSUM(x, y) _mm_sub_epi8(_mm_avg_epu8(x, y), _mm_and_si128(_mm_xor_si128(y, x), _mm_set1_epi8(1))) // truncating average: (x + y) >> 1
+
+    // if (ABS(p0-q0) - alpha) ...
+    zeromask = _mm_subs_epu8(subabs128(p0, q0), _mm_set1_epi8((int8_t)(alpha - 1)));
+    //  & (ABS(p1-p0) - beta) & (ABS(q1-q0) - beta)
+    _beta = _mm_set1_epi8((int8_t)(beta - 1));
+    zeromask = _mm_or_si128(zeromask, _mm_subs_epu8(_mm_max_epu8(subabs128(p1, p0), subabs128(q1, q0)), _beta));
+    zeromask = _mm_cmpeq_epi8(zeromask, _mm_setzero_si128());
+
+    {
+        __m128i str_x = _mm_cvtsi32_si128(strength);
+        str_x = _mm_unpacklo_epi8(str_x, str_x);
+        str_x = _mm_cmpgt_epi8(_mm_unpacklo_epi8(str_x, str_x), _mm_setzero_si128());
+        zeromask = _mm_and_si128(zeromask, str_x);
+    }
+
+    thr = _mm_cvtsi32_si128(*(int*)threshold);//_mm_loadl_epi64((__m128i *)(threshold));
+    thr = _mm_unpacklo_epi8(thr, thr);
+    thr = _mm_unpacklo_epi8(thr, thr);
+    thr = _mm_and_si128(thr, zeromask);
+
+    p2 = _mm_loadu_si128((__m128i *)(pix - 3*stride));
+    maskp = CMP_BETA(p2, p0, _beta);
+    tc0tmp = _mm_and_si128(thr, maskp);
+    p0q0avg = _mm_avg_epu8(p0, q0);     // (p0+q0+1)>>1
+    _mm_storeu_si128((__m128i *)(pix - 2*stride), _mm_min_epu8(_mm_max_epu8(HALFSUM(p2, p0q0avg), _mm_subs_epu8(p1, tc0tmp)), _mm_adds_epu8(p1, tc0tmp)));
+
+    q2 = _mm_loadu_si128((__m128i *)(pix + 2*stride));
+    maskq = CMP_BETA(q2, q0, _beta);
+    tc0tmp = _mm_and_si128(thr, maskq);
+    _mm_storeu_si128((__m128i *)(pix + stride),  _mm_min_epu8(_mm_max_epu8(HALFSUM(q2, p0q0avg), _mm_subs_epu8(q1, tc0tmp)), _mm_adds_epu8(q1, tc0tmp)));
+
+    thr = _mm_sub_epi8(thr, maskp);
+    thr = _mm_sub_epi8(thr, maskq);
+    thr = _mm_and_si128(thr, zeromask);
+
+    {
+    __m128i ff = _mm_set1_epi8(0xff);
+    __m128i part1 = _mm_avg_epu8(q0, _mm_xor_si128(p0, ff));
+    __m128i part2 = _mm_avg_epu8(p1, _mm_xor_si128(q1, ff));
+    __m128i carry = _mm_and_si128(_mm_xor_si128(p0, q0), _mm_set1_epi8(1));
+    __m128i d = _mm_adds_epu8(part1, _mm_avg_epu8(_mm_avg_epu8(part2, _mm_set1_epi8(3)), carry));
+    __m128i delta_p = _mm_subs_epu8(d, _mm_set1_epi8((char)(128 + 33)));
+    __m128i delta_n = _mm_subs_epu8(_mm_set1_epi8((char)(128 + 33)), d);
+    delta_p = _mm_min_epu8(delta_p, thr);
+    delta_n = _mm_min_epu8(delta_n, thr);
+
+    q0 =  _mm_adds_epu8(_mm_subs_epu8(q0, delta_p), delta_n);
+    p0 =  _mm_subs_epu8(_mm_adds_epu8(p0, delta_p), delta_n);
+
+    _mm_storeu_si128 ((__m128i *)(pix - stride), p0);
+    _mm_storeu_si128 ((__m128i *)pix,            q0);
+    }
+}
+
+static void deblock_luma_v_s3_sse(uint8_t *pix, int stride, int alpha, int beta, const void* thr, uint32_t strength)
+{
+    __m128i scratch[8];
+    uint8_t *s = pix - 4;
+    uint8_t *dst = (uint8_t *)scratch;
+    int cloop = 2;
+    do
+    {
+        transpose8x8_sse(dst, 16, s, stride);
+        s += 8*stride;
+        dst += 8;
+    } while(--cloop);
+
+    deblock_luma_h_s3_sse((uint8_t*)(scratch + 4), 16, alpha, beta, thr, strength);
+    s = pix - 4;
+    dst = (uint8_t *)scratch;
+    cloop = 2;
+    do
+    {
+        transpose8x8_sse(s, stride, dst, 16);
+        s += 8*stride;
+        dst += 8;
+    } while(--cloop);
+}
+
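+// Deblock the chroma edges of one macroblock: two vertical edges, then two horizontal ones;
+// alpha/beta[0] and [2] apply to the macroblock-boundary edges, [1] and [3] to the internal
+// edges, and edges with zero strength or zero alpha are skipped.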
+static void h264e_deblock_chroma_sse2(uint8_t *pix, int32_t stride, const deblock_params_t *par)
+{
+    const uint8_t *alpha = par->alpha;
+    const uint8_t *beta = par->beta;
+    const uint8_t *thr = par->tc0;
+    const uint8_t *strength = (uint8_t *)par->strength32;
+    int a, b, x, y;
+    a = alpha[0];
+    b = beta[0];
+    for (x = 0; x < 16; x += 8)
+    {
+        uint32_t str = *(uint32_t*)&strength[x];
+        if (str && a)
+        {
+            deblock_chroma_v_s4_sse(pix + (x >> 1), stride, thr + x, a, b, str);
+        }
+        a = alpha[1];
+        b = beta[1];
+    }
+    thr += 16;
+    strength += 16;
+    a = alpha[2];
+    b = beta[2];
+    for (y = 0; y < 16; y += 8)
+    {
+        uint32_t str = *(uint32_t*)&strength[y];
+        if (str && a)
+        {
+            deblock_chroma_h_s4_sse(pix, stride, thr + y, a, b, str);
+        }
+        pix += 4*stride;
+        a = alpha[3];
+        b = beta[3];
+    }
+}
+
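+// Deblock the luma edges of one macroblock: four vertical, then four horizontal edges;
+// bS == 4 takes the strong filter, other non-zero strengths the normal (tc0-clipped) filter.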
+static void h264e_deblock_luma_sse2(uint8_t *pix, int32_t stride, const deblock_params_t *par)
+{
+    const uint8_t *alpha = par->alpha;
+    const uint8_t *beta = par->beta;
+    const uint8_t *thr = par->tc0;
+    const uint8_t *strength = (uint8_t *)par->strength32;
+    int a, b, x, y;
+    a = alpha[0];
+    b = beta[0];
+    for (x = 0; x < 16; x += 4)
+    {
+        uint32_t str = *(uint32_t*)&strength[x];
+        if ((uint8_t)str == 4)
+        {
+            deblock_luma_v_s4_sse(pix + x, stride, a, b);
+        } else if (str && a)
+        {
+            deblock_luma_v_s3_sse(pix + x, stride, a, b, thr + x, str);
+        }
+        a = alpha[1];
+        b = beta[1];
+    }
+    thr += 16;
+    strength += 16;
+    a = alpha[2];
+    b = beta[2];
+    for (y = 0; y < 16; y += 4)
+    {
+        uint32_t str = *(uint32_t*)&strength[y];
+        if ((uint8_t)str == 4)
+        {
+            deblock_luma_h_s4_sse(pix, stride, a, b);
+        } else if (str && a)
+        {
+            deblock_luma_h_s3_sse(pix, stride, a, b, thr + y, str);
+        }
+        a = alpha[3];
+        b = beta[3];
+        pix += 4*stride;
+    }
+}
+
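+// Temporal denoiser: blend the current frame into the previous-frame buffer with a per-pixel
+// gain derived from the pixel and neighbourhood differences; the SSE path approximates the
+// g_diff_to_gainQ8[] lookup of the scalar tail with a fast log2 (float-exponent) trick.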
+static void h264e_denoise_run_sse2(unsigned char *frm, unsigned char *frmprev, int w, int h_arg, int stride_frm, int stride_frmprev)
+{
+#define MM_LOAD_8TO16(p) _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(p)), zero)
+    int cloop, h = h_arg;
+    __m128i zero = _mm_setzero_si128();
+    __m128i exp  = _mm_set1_epi32(0x7F800000);
+
+    w -= 2;
+    h -= 2;
+    if (w <= 2 || h <= 2)
+    {
+        return;
+    }
+
+    do
+    {
+        unsigned char *pf = frm += stride_frm;
+        unsigned char *pp = frmprev += stride_frmprev;
+        cloop = w >> 3;
+        pp[-stride_frmprev] = *pf++;
+        pp++;
+
+        while (cloop--)
+        {
+            __m128 float_val;
+            __m128i log_neighbour, log_d;
+            __m128i log_neighbour_h, log_neighbour_l, log_d_h, log_d_l;
+            __m128i a, b;
+            __m128i gain;
+            __m128i abs_d, abs_neighbour;
+            a = MM_LOAD_8TO16(pf);
+            b = MM_LOAD_8TO16(pp);
+            abs_d   = _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
+            a = MM_LOAD_8TO16(pf-stride_frm);
+            a = _mm_add_epi16(a, MM_LOAD_8TO16(pf - 1));
+            a = _mm_add_epi16(a, MM_LOAD_8TO16(pf + 1));
+            a = _mm_add_epi16(a, MM_LOAD_8TO16(pf + stride_frm));
+            b = MM_LOAD_8TO16(pp-stride_frmprev);
+            b = _mm_add_epi16(b, MM_LOAD_8TO16(pp - 1));
+            b = _mm_add_epi16(b, MM_LOAD_8TO16(pp + 1));
+            b = _mm_add_epi16(b, MM_LOAD_8TO16(pp + stride_frmprev));
+
+            abs_neighbour = _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
+
+            abs_neighbour = _mm_srai_epi16(abs_neighbour, 2);
+
+            abs_d = _mm_add_epi16(abs_d, _mm_set1_epi16(1));
+            abs_neighbour = _mm_add_epi16(abs_neighbour, _mm_set1_epi16(1));
+
+            float_val = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(abs_neighbour, zero), 16), 16));
+            float_val = _mm_mul_ps(float_val, float_val);
+            float_val = _mm_mul_ps(float_val, float_val);
+            float_val = _mm_mul_ps(float_val, float_val);
+            float_val = _mm_mul_ps(float_val, float_val);
+            log_neighbour_l  = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(float_val), exp), 23), _mm_set1_epi32(127));
+
+            float_val = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(abs_neighbour, zero), 16), 16));
+            float_val = _mm_mul_ps(float_val, float_val);
+            float_val = _mm_mul_ps(float_val, float_val);
+            float_val = _mm_mul_ps(float_val, float_val);
+            float_val = _mm_mul_ps(float_val, float_val);
+            log_neighbour_h  = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(float_val), exp), 23), _mm_set1_epi32(127));
+
+            float_val = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(abs_d, zero), 16), 16));
+            float_val = _mm_mul_ps(float_val, float_val);
+            float_val = _mm_mul_ps(float_val, float_val);
+            float_val = _mm_mul_ps(float_val, float_val);
+            float_val = _mm_mul_ps(float_val, float_val);
+            log_d_l = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(float_val), exp), 23), _mm_set1_epi32(127));
+
+            float_val = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(abs_d, zero), 16), 16));
+            float_val = _mm_mul_ps(float_val, float_val);
+            float_val = _mm_mul_ps(float_val, float_val);
+            float_val = _mm_mul_ps(float_val, float_val);
+            float_val = _mm_mul_ps(float_val, float_val);
+            log_d_h = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(float_val), exp), 23), _mm_set1_epi32(127));
+
+            log_d = _mm_packs_epi32(log_d_l, log_d_h);
+            log_neighbour = _mm_packs_epi32(log_neighbour_l, log_neighbour_h);
+
+            log_neighbour = _mm_slli_epi16(log_neighbour, 8);
+            log_neighbour = _mm_adds_epu16(log_neighbour, log_neighbour);
+            log_neighbour = _mm_adds_epu16(log_neighbour, log_neighbour);
+            log_neighbour = _mm_srli_epi16(log_neighbour, 8);
+
+            log_neighbour = _mm_subs_epu16(_mm_set1_epi16(255), log_neighbour);
+            log_d = _mm_subs_epu16(_mm_set1_epi16(255), log_d);
+
+            gain = _mm_mullo_epi16(log_d, log_neighbour);
+
+            a = MM_LOAD_8TO16(pf);
+            b = MM_LOAD_8TO16(pp);
+{
+            __m128i s;
+            __m128i gain_inv;
+            gain_inv = _mm_sub_epi16(_mm_set1_epi8((char)255), gain);
+            s = _mm_add_epi16(_mm_mulhi_epu16(a, gain_inv), _mm_mulhi_epu16(b, gain));
+            b = _mm_mullo_epi16(b, gain);
+            a = _mm_mullo_epi16(a, gain_inv);
+            a = _mm_sub_epi16(_mm_avg_epu16(a, b), _mm_and_si128(_mm_xor_si128(a, b), _mm_set1_epi16(1)));
+            a = _mm_avg_epu16(_mm_srli_epi16(a, 14), _mm_set1_epi16(0));
+            a = _mm_add_epi16(a, s);
+            _mm_storel_epi64((__m128i *)(pp-stride_frmprev), _mm_packus_epi16(a,zero));
+}
+            pf += 8;
+            pp += 8;
+        }
+
+        cloop = w & 7;
+        while (cloop--)
+        {
+            int d, neighbourhood;
+            unsigned g, gd, gn, out_val;
+            d = pf[0] - pp[0];
+            neighbourhood  = pf[-1]      - pp[-1];
+            neighbourhood += pf[+1]      - pp[+1];
+            neighbourhood += pf[-stride_frm] - pp[-stride_frmprev];
+            neighbourhood += pf[+stride_frm] - pp[+stride_frmprev];
+
+            if (d < 0)
+            {
+                d = -d;
+            }
+            if (neighbourhood < 0)
+            {
+                neighbourhood = -neighbourhood;
+            }
+            neighbourhood >>= 2;
+
+            gd = g_diff_to_gainQ8[d];
+            gn = g_diff_to_gainQ8[neighbourhood];
+
+            gn <<= 2;
+            if (gn > 255)
+            {
+                gn = 255;
+            }
+
+            gn = 255 - gn;
+            gd = 255 - gd;
+            g = gn*gd;  // Q8*Q8 = Q16;
+
+            //out_val = ((pp[0]*g ) >> 16) + (((0xffff-g)*pf[0] ) >> 16);
+            out_val = (pp[0]*g + (0xffff-g)*pf[0]  + (1<<15)) >> 16;
+
+            assert(out_val <= 255);
+
+            pp[-stride_frmprev] = (unsigned char)out_val;
+
+            pf++, pp++;
+        }
+        pp[-stride_frmprev] = *pf++;
+    } while(--h);
+
+    memcpy(frmprev + stride_frmprev, frm + stride_frm, w+2);
+    h = h_arg - 2;
+    do
+    {
+        memcpy(frmprev, frmprev - stride_frmprev, w+2);
+        frmprev -= stride_frmprev;
+    } while(--h);
+    memcpy(frmprev, frm - stride_frm*(h_arg-2), w+2);
+}
+
+#define IS_NULL(p) ((p) < (pix_t *)(uintptr_t)32)
+
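+// DC prediction: average the available left/top neighbours (1 << log_side samples each),
+// or 128 if neither is available; the result is replicated into all four bytes.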
+static uint32_t intra_predict_dc_sse(const pix_t *left, const pix_t *top, int log_side)
+{
+    unsigned dc = 0, side = 1u << log_side, round = 0;
+    __m128i sum = _mm_setzero_si128();
+    if (!IS_NULL(left))
+    {
+        int cloop = side;
+        round += side >> 1;
+        do
+        {
+            sum = _mm_add_epi64(sum, _mm_sad_epu8(_mm_cvtsi32_si128(*(int*)left), _mm_setzero_si128()));
+            left += 4;
+        } while (cloop -= 4);
+    }
+    if (!IS_NULL(top))
+    {
+        int cloop = side;
+        round += side >> 1;
+        do
+        {
+            sum = _mm_add_epi64(sum, _mm_sad_epu8(_mm_cvtsi32_si128(*(int*)top), _mm_setzero_si128()));
+            top += 4;
+        } while (cloop -= 4);
+    }
+    dc = _mm_cvtsi128_si32(sum);
+    dc += round;
+    if (round == side) dc >>= 1;
+    dc >>= log_side;
+    if (!round) dc = 128;
+    return dc * 0x01010101;
+}
+
+/*
+ * Note: To make the code more readable we refer to the neighboring pixels
+ * in variables named as below:
+ *
+ *    UL U0 U1 U2 U3 U4 U5 U6 U7
+ *    L0 xx xx xx xx
+ *    L1 xx xx xx xx
+ *    L2 xx xx xx xx
+ *    L3 xx xx xx xx
+ */
+#define UL edge[-1]
+#define U0 edge[0]
+#define T1 edge[1]
+#define U2 edge[2]
+#define U3 edge[3]
+#define U4 edge[4]
+#define U5 edge[5]
+#define U6 edge[6]
+#define U7 edge[7]
+#define L0 edge[-2]
+#define L1 edge[-3]
+#define L2 edge[-4]
+#define L3 edge[-5]
+
+static void h264e_intra_predict_16x16_sse2(pix_t *predict,  const pix_t *left, const pix_t *top, int mode)
+{
+    int cloop = 16;
+    if (mode < 1)
+    {
+        __m128i a = _mm_load_si128((__m128i *)top);
+        do
+        {
+            _mm_store_si128((__m128i *)predict, a);
+            predict += 16;
+        } while(--cloop);
+    } else if (mode == 1)
+    {
+        const __m128i c1111 = _mm_set1_epi8(1);
+        do
+        {
+            _mm_store_si128((__m128i *)predict, _mm_shuffle_epi32(_mm_mul_epu32(_mm_cvtsi32_si128(*left++), c1111), 0));
+            predict += 16;
+        } while(--cloop);
+    } else //if (mode == 2)
+    {
+        __m128i dc128;
+        int dc = intra_predict_dc_sse(left, top, 4);
+        dc128 = _mm_shuffle_epi32(_mm_cvtsi32_si128(dc), 0);
+        do
+        {
+            _mm_store_si128((__m128i *)predict, dc128);
+            predict += 16;
+        } while(--cloop);
+    }
+}
+
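+// Chroma intra prediction for both components side by side in a 16-wide buffer:
+// mode 0 = vertical, mode 1 = horizontal, mode 2 = DC computed per 4x4 sub-block.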
+static void h264e_intra_predict_chroma_sse2(pix_t *predict, const pix_t *left, const pix_t *top, int mode)
+{
+    int cloop = 8;
+    if (mode < 1)
+    {
+        __m128i a = _mm_load_si128((__m128i *)top);
+        do
+        {
+            _mm_store_si128((__m128i *)predict, a);
+            predict += 16;
+        } while(--cloop);
+    } else if (mode == 1)
+    {
+        do
+        {
+            __m128i t = _mm_unpacklo_epi32(_mm_cvtsi32_si128(left[0]*0x01010101u), _mm_cvtsi32_si128(left[8]*0x01010101u));
+            t = _mm_unpacklo_epi32(t, t);
+            _mm_store_si128((__m128i *)predict, t);
+            left++;
+            predict += 16;
+        } while(--cloop);
+    } else //if (mode == 2)
+    {
+        // mode 2: DC, one value per 4x4 sub-block
+        uint32_t *d = (uint32_t*)predict;
+        __m128i *d128 = (__m128i *)predict;
+        __m128i tmp;
+        cloop = 2;
+        do
+        {
+            d[0] = d[1] = d[16] = intra_predict_dc_sse(left, top, 2);
+            d[17] = intra_predict_dc_sse(left + 4, top + 4, 2);
+            if (!IS_NULL(top))
+            {
+                d[1] = intra_predict_dc_sse(NULL, top + 4, 2);
+            }
+            if (!IS_NULL(left))
+            {
+                d[16] = intra_predict_dc_sse(NULL, left + 4, 2);
+            }
+            d += 2;
+            left += 8;
+            top += 8;
+        } while(--cloop);
+        tmp = _mm_load_si128(d128++);
+        _mm_store_si128(d128++, tmp);
+        _mm_store_si128(d128++, tmp);
+        _mm_store_si128(d128++, tmp);
+        tmp = _mm_load_si128(d128++);
+        _mm_store_si128(d128++, tmp);
+        _mm_store_si128(d128++, tmp);
+        _mm_store_si128(d128++, tmp);
+    }
+}
+
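+// Try the 4x4 intra modes allowed by 'avail' (DC always; modes 0/3/7 need the top row,
+// 1/8 the left column, 4/5/6 top + left + top-left), keep the lowest SAD with a penalty for
+// modes other than mpred, write the winning predictor to blockpred and return mode | (SAD << 4).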
+static int h264e_intra_choose_4x4_sse2(const pix_t *blockin, pix_t *blockpred, int avail, const pix_t *edge, int mpred, int penalty)
+{
+    int best_m = 0;
+    int sad, best_sad = 0x10000;
+
+    __m128i b0 = _mm_loadl_epi64((__m128i *)blockin);
+    __m128i b1 = _mm_loadl_epi64((__m128i *)(blockin + 16));
+    __m128i b2 = _mm_loadl_epi64((__m128i *)(blockin + 32));
+    __m128i b3 = _mm_loadl_epi64((__m128i *)(blockin + 48));
+    __m128i c  = _mm_unpacklo_epi32(b0, b1);
+    __m128i d  = _mm_unpacklo_epi32(b2, b3);
+    __m128i sse_blockin = _mm_unpacklo_epi64(c, d);
+    __m128i t, t0, t1, t2, res, sad128, best128;
+
+#define TEST(mode) sad128 = _mm_sad_epu8(res, sse_blockin);                 \
+            sad128 = _mm_adds_epu16 (sad128, _mm_shuffle_epi32(sad128, 2)); \
+            sad  = _mm_cvtsi128_si32(sad128);                               \
+            if (mode != mpred) sad += penalty;                              \
+            if (sad < best_sad)                                             \
+            {                                                               \
+                best128 = res;                                              \
+                best_sad = sad;                                             \
+                best_m = mode;                                              \
+            }
+
+    __m128i border = _mm_loadu_si128((__m128i *)(&L3));
+    int topright = 0x01010101u*U7;
+
+    if (!(avail & AVAIL_TR))
+    {
+        topright = 0x01010101u*U3;
+        //border = _mm_insert_epi32 (border, topright, 2);
+        border = _mm_insert_epi16 (border, topright, 4);
+        border = _mm_insert_epi16 (border, topright, 5);
+    }
+    //border = _mm_insert_epi32 (border, topright, 3);
+    border = _mm_insert_epi16 (border, topright, 6);
+    border = _mm_insert_epi16 (border, topright, 7);
+
+    // DC
+    {
+        unsigned dc = 0, round = 0;
+
+        if (avail & AVAIL_L)
+        {
+            dc += _mm_cvtsi128_si32(_mm_sad_epu8(_mm_and_si128(border, _mm_set_epi32(0, 0, 0, ~0)), _mm_setzero_si128()));
+            round += 2;
+        }
+        if (avail & AVAIL_T)
+        {
+            dc += _mm_cvtsi128_si32(_mm_sad_epu8(_mm_and_si128(_mm_srli_si128(border, 5), _mm_set_epi32(0, 0, 0, ~0)), _mm_setzero_si128()));
+            round += 2;
+        }
+        dc += round;
+        if (round == 4) dc >>= 1;
+        dc >>= 2;
+        if (!round) dc = 128;
+        t = _mm_cvtsi32_si128(dc * 0x01010101);
+        t = _mm_unpacklo_epi32(t, t);
+        best128 =_mm_unpacklo_epi32(t, t);
+
+        //TEST(2)
+        sad128 = _mm_sad_epu8(best128, sse_blockin);
+        sad128 = _mm_adds_epu16 (sad128, _mm_shuffle_epi32(sad128, 2));
+        best_sad = _mm_cvtsi128_si32(sad128);
+
+        if (2 != mpred) best_sad += penalty;
+        best_m = 2;
+    }
+
+    if (avail & AVAIL_T)
+    {
+        t = _mm_srli_si128(border, 5);
+        t = _mm_unpacklo_epi32(t, t);
+        res =  _mm_unpacklo_epi32(t, t);
+        TEST(0)
+
+        t0 = _mm_srli_si128(border, 5);
+        t1 = _mm_srli_si128(border, 6);
+        t2 = _mm_srli_si128(border, 7);
+        t = _mm_sub_epi8(_mm_avg_epu8(t0, t2), _mm_and_si128(_mm_xor_si128(t0, t2), _mm_set1_epi8(1)));
+        t = _mm_avg_epu8(t, t1);
+        t2 = _mm_unpacklo_epi32(t, _mm_srli_si128(t, 1));
+
+        res = _mm_unpacklo_epi64(t2, _mm_unpacklo_epi32(_mm_srli_si128(t, 2), _mm_srli_si128(t, 3)));
+        TEST(3)
+
+        t0 = _mm_avg_epu8(t0,t1);
+        t0  = _mm_unpacklo_epi32(t0, _mm_srli_si128(t0, 1));
+        res = _mm_unpacklo_epi32(t0, t2);
+        TEST(7)
+    }
+
+    if (avail & AVAIL_L)
+    {
+        int ext;
+        t = _mm_unpacklo_epi8(border, border);
+        t = _mm_shufflelo_epi16(t, 3 + (2 << 2) + (1 << 4) + (0 << 6));
+        res = _mm_unpacklo_epi8(t, t);
+        TEST(1)
+
+        t0 = _mm_unpacklo_epi8(border, _mm_setzero_si128());
+        t0 = _mm_shufflelo_epi16(t0, 3 + (2 << 2) + (1 << 4) + (0 << 6));
+        t0 = _mm_packus_epi16(t0, t0);       // 0 1 2 3
+
+        t1 = _mm_unpacklo_epi8(t0, t0);      // 0 0 1 1 2 2 3 3
+
+        ext = _mm_extract_epi16(t1, 3);
+        t0 = _mm_insert_epi16 (t0, ext, 2);  // 0 1 2 3 3 3
+        t1 = _mm_insert_epi16 (t1, ext, 4);  // 0 0 1 1 2 2 3 3 33
+        t2 = _mm_slli_si128(t0, 2);          // x x 0 1 2 3 3 3
+        t = _mm_sub_epi8(_mm_avg_epu8(t0, t2), _mm_and_si128(_mm_xor_si128(t0, t2), _mm_set1_epi8(1)));
+        // 0 1 2 3 3 3
+        // x x 0 1 2 3
+        t = _mm_unpacklo_epi8(t2, t);
+        // 0   1   2   3   3   3
+        // x   x   0   1   2   3
+        // x   x   0   1   2   3
+        t = _mm_avg_epu8(t, _mm_slli_si128(t1, 2));
+        // 0 0 1 1 2 2 3 3
+
+        res = _mm_unpacklo_epi32(_mm_srli_si128(t, 4), _mm_srli_si128(t, 6));
+        //res = _mm_insert_epi32 (res, ext|(ext<<16),3);
+        res = _mm_insert_epi16 (res, ext, 6);
+        res = _mm_insert_epi16 (res, ext, 7);
+        TEST(8)
+    }
+
+    if ((avail & (AVAIL_T | AVAIL_L | AVAIL_TL)) == (AVAIL_T | AVAIL_L | AVAIL_TL))
+    {
+        int t16;
+        t0 = _mm_srli_si128(border, 1);
+        t1 = _mm_srli_si128(border, 2);
+        t = _mm_sub_epi8(_mm_avg_epu8(border, t1), _mm_and_si128(_mm_xor_si128(border, t1), _mm_set1_epi8(1)));
+        t = _mm_avg_epu8(t, t0);
+
+        res = _mm_unpacklo_epi64(_mm_unpacklo_epi32(_mm_srli_si128(t, 3), _mm_srli_si128(t, 2)), _mm_unpacklo_epi32(_mm_srli_si128(t, 1), t));
+        TEST(4)
+
+        t1 = _mm_unpacklo_epi8(t2 = _mm_avg_epu8(t0,border), t);
+        t1 = _mm_unpacklo_epi32(t1, _mm_srli_si128(t1, 2));
+        res = _mm_shuffle_epi32(t1, 3 | (2 << 2) | (1 << 4) | (0 << 6));
+        res = _mm_insert_epi16 (res, _mm_extract_epi16 (t, 2), 1);
+        TEST(6)
+
+        t = _mm_srli_si128(t, 1);
+        res = _mm_unpacklo_epi32(_mm_srli_si128(t2, 4), _mm_srli_si128(t, 2));
+        t2 =  _mm_insert_epi16 (t2, t16 = _mm_extract_epi16 (t, 0), 1);
+        t  =  _mm_insert_epi16 (t, (t16 << 8), 0);
+        res = _mm_unpacklo_epi64(res, _mm_unpacklo_epi32(_mm_srli_si128(t2, 3), _mm_srli_si128(t, 1)));
+        TEST(5)
+    }
+
+    ((uint32_t *)blockpred)[ 0] = _mm_extract_epi16(best128, 0) | ((unsigned)_mm_extract_epi16(best128, 1) << 16);
+    ((uint32_t *)blockpred)[ 4] = _mm_extract_epi16(best128, 2) | ((unsigned)_mm_extract_epi16(best128, 3) << 16);
+    ((uint32_t *)blockpred)[ 8] = _mm_extract_epi16(best128, 4) | ((unsigned)_mm_extract_epi16(best128, 5) << 16);
+    ((uint32_t *)blockpred)[12] = _mm_extract_epi16(best128, 6) | ((unsigned)_mm_extract_epi16(best128, 7) << 16);
+
+    return best_m + (best_sad << 4);    // pack result
+}
+
+#define MM_LOAD_8TO16(p) _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(p)), zero)
+#define MM_LOAD_REG(p, sh) _mm_unpacklo_epi8(_mm_srli_si128(p, sh), zero)
+#define __inline
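+// Copy a w x h block (w = 16 or 8) from src into the fixed 16-byte-stride scratch layout.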
+static __inline void copy_wh_sse(const uint8_t *src, int src_stride, uint8_t *h264e_restrict dst, int w, int h)
+{
+    assert(h % 4 == 0);
+    if (w == 16)
+    {
+        do
+        {
+            _mm_store_si128((__m128i *)dst, _mm_loadu_si128((__m128i *)src)); src += src_stride; dst += 16;
+            _mm_store_si128((__m128i *)dst, _mm_loadu_si128((__m128i *)src)); src += src_stride; dst += 16;
+            _mm_store_si128((__m128i *)dst, _mm_loadu_si128((__m128i *)src)); src += src_stride; dst += 16;
+            _mm_store_si128((__m128i *)dst, _mm_loadu_si128((__m128i *)src)); src += src_stride; dst += 16;
+            _mm_store_si128((__m128i *)dst, _mm_loadu_si128((__m128i *)src)); src += src_stride; dst += 16;
+            _mm_store_si128((__m128i *)dst, _mm_loadu_si128((__m128i *)src)); src += src_stride; dst += 16;
+            _mm_store_si128((__m128i *)dst, _mm_loadu_si128((__m128i *)src)); src += src_stride; dst += 16;
+            _mm_store_si128((__m128i *)dst, _mm_loadu_si128((__m128i *)src)); src += src_stride; dst += 16;
+        } while(h -= 8);
+    } else //if (w == 8)
+    {
+        do
+        {
+            _mm_storel_epi64((__m128i *)dst, _mm_loadl_epi64((__m128i *)src)); src += src_stride; dst += 16;
+            _mm_storel_epi64((__m128i *)dst, _mm_loadl_epi64((__m128i *)src)); src += src_stride; dst += 16;
+            _mm_storel_epi64((__m128i *)dst, _mm_loadl_epi64((__m128i *)src)); src += src_stride; dst += 16;
+            _mm_storel_epi64((__m128i *)dst, _mm_loadl_epi64((__m128i *)src)); src += src_stride; dst += 16;
+        } while(h -= 4);
+    }
+}
+
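+// Half-pel diagonal interpolation: 6-tap horizontal filter into 16-bit scratch (h + 5 rows),
+// then the 6-tap vertical filter over the scratch with rounding back to 8 bits.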
+static __inline void hpel_lpf_diag_sse(const uint8_t *src, int src_stride, uint8_t *h264e_restrict dst, int w, int h)
+{
+    ALIGN(16) int16_t scratch[21 * 16] ALIGN2(16);  /* 21 rows by 16 pixels per row */
+
+    /*
+     * Intermediate values are horizontal half-pel samples,
+     * starting at (0.5, -2) at the top and extending to (0.5, height + 3) at the bottom;
+     * scratch holds a 2D array of size (w) x (h + 5)
+     */
+    __m128i zero = _mm_setzero_si128();
+    __m128i c32,c5 = _mm_set1_epi16(5);
+    int cloop = h + 5;
+    int16_t *h264e_restrict dst16 = scratch;
+    const int16_t *src16 = scratch + 2*16;
+    src -= 2*src_stride;
+    if (w == 8)
+    {
+        src16 = scratch + 2*8;
+        do
+        {
+            __m128i inp = _mm_loadu_si128((__m128i*)(src - 2));
+            _mm_store_si128((__m128i*)dst16, _mm_add_epi16(
+                _mm_mullo_epi16(
+                    _mm_sub_epi16(
+                        _mm_slli_epi16(
+                            _mm_add_epi16(MM_LOAD_REG(inp, 2), MM_LOAD_REG(inp, 3)),
+                            2),
+                        _mm_add_epi16(MM_LOAD_REG(inp, 1), MM_LOAD_REG(inp, 4))),
+                    c5),
+                _mm_add_epi16(_mm_unpacklo_epi8(inp, zero), MM_LOAD_REG(inp, 5))
+            ));
+            src += src_stride;
+            dst16 += 8;
+        } while (--cloop);
+
+        c32 = _mm_set1_epi16(32);
+        cloop = h;
+        do
+        {
+            // (20*x2 - 5*x1 + x0 + 512) >> 10 =>
+            // (16*x2 + 4*x2 - 4*x1 - x1 + x0 + 512) >> 10 =>
+            // ((((x0 - x1) >> 2) + (x2 - x1)) >> 2) + x2 + 32 >> 6
+            __m128i x1 = _mm_add_epi16(_mm_load_si128((__m128i*)(src16 - 1*8)), _mm_load_si128((__m128i*)(src16 + 2*8)));
+            __m128i x2 = _mm_add_epi16(_mm_load_si128((__m128i*)(src16 - 0*8)), _mm_load_si128((__m128i*)(src16 + 1*8)));
+            _mm_storel_epi64((__m128i*)dst,
+                _mm_packus_epi16(
+                    _mm_srai_epi16(
+                        _mm_add_epi16(
+                            _mm_srai_epi16(
+                                _mm_sub_epi16(
+                                    _mm_srai_epi16(
+                                        _mm_sub_epi16(
+                                            _mm_add_epi16(_mm_load_si128((__m128i*)(src16 - 2*8)), _mm_load_si128((__m128i*)(src16 + 3*8))),
+                                            x1),
+                                        2),
+                                    _mm_sub_epi16(x1, x2)),
+                                2),
+                            _mm_add_epi16(x2, c32)),
+                        6),
+                    zero));
+            src16 += 8;
+            dst += 16;
+        } while(--cloop);
+    } else
+    {
+        do
+        {
+            _mm_store_si128((__m128i*)dst16, _mm_add_epi16(
+                _mm_mullo_epi16(
+                    _mm_sub_epi16(
+                        _mm_slli_epi16(
+                            _mm_add_epi16(MM_LOAD_8TO16(src - 0), MM_LOAD_8TO16(src + 1)),
+                            2),
+                        _mm_add_epi16(MM_LOAD_8TO16(src - 1), MM_LOAD_8TO16(src + 2))),
+                    c5),
+                _mm_add_epi16(MM_LOAD_8TO16(src - 2), MM_LOAD_8TO16(src + 3))
+            ));
+            _mm_store_si128((__m128i*)(dst16 + 8), _mm_add_epi16(
+                _mm_mullo_epi16(
+                    _mm_sub_epi16(
+                        _mm_slli_epi16(
+                            _mm_add_epi16(MM_LOAD_8TO16(src + 8 - 0), MM_LOAD_8TO16(src + 8 + 1)),
+                            2),
+                        _mm_add_epi16(MM_LOAD_8TO16(src + 8 - 1), MM_LOAD_8TO16(src + 8 + 2))),
+                    c5),
+                _mm_add_epi16(MM_LOAD_8TO16(src + 8 - 2), MM_LOAD_8TO16(src + 8 + 3))
+            ));
+            src += src_stride;
+            dst16 += 8*2;
+        } while (--cloop);
+
+        c32 = _mm_set1_epi16(32);
+        cloop = 2*h;
+        do
+        {
+            // (20*x2 - 5*x1 + x0 + 512) >> 10 =>
+            // (16*x2 + 4*x2 - 4*x1 - x1 + x0 + 512) >> 10 =>
+            // ((((x0 - x1) >> 2) + (x2 - x1)) >> 2) + x2 + 32 >> 6
+            __m128i x1 = _mm_add_epi16(_mm_load_si128((__m128i*)(src16 - 1*16)), _mm_load_si128((__m128i*)(src16 + 2*16)));
+            __m128i x2 = _mm_add_epi16(_mm_load_si128((__m128i*)(src16 - 0*16)), _mm_load_si128((__m128i*)(src16 + 1*16)));
+            _mm_storel_epi64((__m128i*)dst,
+                _mm_packus_epi16(
+                    _mm_srai_epi16(
+                        _mm_add_epi16(
+                            _mm_srai_epi16(
+                                _mm_sub_epi16(
+                                    _mm_srai_epi16(
+                                        _mm_sub_epi16(
+                                            _mm_add_epi16(_mm_load_si128((__m128i*)(src16 - 2*16)), _mm_load_si128((__m128i*)(src16 + 3*16))),
+                                            x1),
+                                        2),
+                                    _mm_sub_epi16(x1, x2)),
+                                2),
+                            _mm_add_epi16(x2, c32)),
+                        6),
+                    zero));
+            src16 += 8;
+            dst += 8;
+        } while(--cloop);
+    }
+}
+
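+// Half-pel horizontal interpolation: 6-tap filter (1,-5,20,20,-5,1) with (+16) >> 5 rounding.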
+static __inline void hpel_lpf_hor_sse(const uint8_t *src, int src_stride, uint8_t *h264e_restrict dst, int w, int h)
+{
+    __m128i zero = _mm_setzero_si128();
+    const __m128i five = _mm_set1_epi16(5);
+    if (w == 8)
+    {
+        do
+        {
+            __m128i inp = _mm_loadu_si128((__m128i*)(src - 2));
+            _mm_storel_epi64((__m128i*)dst, _mm_packus_epi16(
+                _mm_srai_epi16(
+                    _mm_add_epi16(
+                        _mm_add_epi16(
+                            _mm_mullo_epi16(
+                                _mm_sub_epi16(
+                                    _mm_slli_epi16(_mm_add_epi16(MM_LOAD_REG(inp, 2), MM_LOAD_REG(inp, 3)), 2),
+                                    _mm_add_epi16(MM_LOAD_REG(inp, 1), MM_LOAD_REG(inp, 4))),
+                                 five),
+                            _mm_add_epi16(_mm_unpacklo_epi8(inp, zero), MM_LOAD_REG(inp, 5))),
+                        _mm_set1_epi16(16)),
+                    5),
+                zero));
+            src += src_stride;
+            dst += 16;
+        } while (--h);
+    } else do
+    {
+        __m128i inp = _mm_loadu_si128((__m128i*)(src - 2));
+        _mm_storel_epi64((__m128i*)dst, _mm_packus_epi16(
+            _mm_srai_epi16(
+                _mm_add_epi16(
+                    _mm_add_epi16(
+                        _mm_mullo_epi16(
+                            _mm_sub_epi16(
+                                _mm_slli_epi16(_mm_add_epi16(MM_LOAD_REG(inp, 2), MM_LOAD_REG(inp, 3)), 2),
+                                _mm_add_epi16(MM_LOAD_REG(inp, 1), MM_LOAD_REG(inp, 4))),
+                             five),
+                        _mm_add_epi16(_mm_unpacklo_epi8(inp, zero), MM_LOAD_REG(inp, 5))),
+                    _mm_set1_epi16(16)),
+                5),
+            zero));
+        inp = _mm_loadu_si128((__m128i*)(src + 8 - 2));
+        _mm_storel_epi64((__m128i*)(dst + 8), _mm_packus_epi16(
+            _mm_srai_epi16(
+                _mm_add_epi16(
+                    _mm_add_epi16(
+                        _mm_mullo_epi16(
+                            _mm_sub_epi16(
+                                _mm_slli_epi16(_mm_add_epi16(MM_LOAD_REG(inp, 2), MM_LOAD_REG(inp, 3)), 2),
+                                _mm_add_epi16(MM_LOAD_REG(inp, 1), MM_LOAD_REG(inp, 4))),
+                             five),
+                        _mm_add_epi16(_mm_unpacklo_epi8(inp, zero), MM_LOAD_REG(inp, 5))),
+                    _mm_set1_epi16(16)),
+                5),
+            zero));
+        src += src_stride;
+        dst += 16;
+    } while (--h);
+}
+
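+// Half-pel vertical interpolation: the same 6-tap filter applied down the columns, 8 columns at a time.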
+static __inline void hpel_lpf_ver_sse(const uint8_t *src, int src_stride, uint8_t *h264e_restrict dst, int w, int h)
+{
+    __m128i zero = _mm_setzero_si128();
+    __m128i five = _mm_set1_epi16(5);
+    __m128i const16 = _mm_set1_epi16(16);
+
+    do
+    {
+        int cloop = h;
+        do
+        {
+            _mm_storel_epi64((__m128i*)dst, _mm_packus_epi16(
+                _mm_srai_epi16(
+                    _mm_add_epi16(
+                        _mm_add_epi16(
+                            _mm_mullo_epi16(
+                                _mm_sub_epi16(
+                                     _mm_slli_epi16(_mm_add_epi16(MM_LOAD_8TO16(src - 0*src_stride), MM_LOAD_8TO16(src + 1*src_stride)), 2),
+                                    _mm_add_epi16(MM_LOAD_8TO16(src - 1*src_stride), MM_LOAD_8TO16(src + 2*src_stride))),
+                                five),
+                            _mm_add_epi16(MM_LOAD_8TO16(src - 2*src_stride), MM_LOAD_8TO16(src + 3*src_stride))),
+                        const16),
+                    5),
+                zero));
+            src += src_stride;
+            dst += 16;
+        } while(--cloop);
+        src += 8 - src_stride*h;
+        dst += 8 - 16*h;
+    } while ((w -= 8) > 0);
+}
+
+static void average_16x16_unalign_sse(uint8_t *dst, const uint8_t *src, int src_stride)
+{
+    __m128i *d = (__m128i *)dst;
+    _mm_store_si128(d, _mm_avg_epu8(_mm_load_si128(d), _mm_loadu_si128((__m128i *)src))); src += src_stride; d++;
+    _mm_store_si128(d, _mm_avg_epu8(_mm_load_si128(d), _mm_loadu_si128((__m128i *)src))); src += src_stride; d++;
+    _mm_store_si128(d, _mm_avg_epu8(_mm_load_si128(d), _mm_loadu_si128((__m128i *)src))); src += src_stride; d++;
+    _mm_store_si128(d, _mm_avg_epu8(_mm_load_si128(d), _mm_loadu_si128((__m128i *)src))); src += src_stride; d++;
+    _mm_store_si128(d, _mm_avg_epu8(_mm_load_si128(d), _mm_loadu_si128((__m128i *)src))); src += src_stride; d++;
+    _mm_store_si128(d, _mm_avg_epu8(_mm_load_si128(d), _mm_loadu_si128((__m128i *)src))); src += src_stride; d++;
+    _mm_store_si128(d, _mm_avg_epu8(_mm_load_si128(d), _mm_loadu_si128((__m128i *)src))); src += src_stride; d++;
+    _mm_store_si128(d, _mm_avg_epu8(_mm_load_si128(d), _mm_loadu_si128((__m128i *)src))); src += src_stride; d++;
+    _mm_store_si128(d, _mm_avg_epu8(_mm_load_si128(d), _mm_loadu_si128((__m128i *)src))); src += src_stride; d++;
+    _mm_store_si128(d, _mm_avg_epu8(_mm_load_si128(d), _mm_loadu_si128((__m128i *)src))); src += src_stride; d++;
+    _mm_store_si128(d, _mm_avg_epu8(_mm_load_si128(d), _mm_loadu_si128((__m128i *)src))); src += src_stride; d++;
+    _mm_store_si128(d, _mm_avg_epu8(_mm_load_si128(d), _mm_loadu_si128((__m128i *)src))); src += src_stride; d++;
+    _mm_store_si128(d, _mm_avg_epu8(_mm_load_si128(d), _mm_loadu_si128((__m128i *)src))); src += src_stride; d++;
+    _mm_store_si128(d, _mm_avg_epu8(_mm_load_si128(d), _mm_loadu_si128((__m128i *)src))); src += src_stride; d++;
+    _mm_store_si128(d, _mm_avg_epu8(_mm_load_si128(d), _mm_loadu_si128((__m128i *)src))); src += src_stride; d++;
+    _mm_store_si128(d, _mm_avg_epu8(_mm_load_si128(d), _mm_loadu_si128((__m128i *)src))); src += src_stride; d++;
+}
+
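+// Average two aligned 16-byte-stride blocks (w = 16 or 8) into dst; used for the quarter-pel positions.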
+static void h264e_qpel_average_wh_align_sse2(const uint8_t *src0, const uint8_t *src1, uint8_t *h264e_restrict dst, point_t wh)
+{
+    int w = wh.s.x;
+    int h = wh.s.y;
+    __m128i *d = (__m128i *)dst;
+    const __m128i *s0 = (const __m128i *)src0;
+    const __m128i *s1 = (const __m128i *)src1;
+    if (w == 16)
+    {
+        do
+        {
+            _mm_store_si128(d++, _mm_avg_epu8(_mm_load_si128(s0++), _mm_load_si128(s1++)));
+            _mm_store_si128(d++, _mm_avg_epu8(_mm_load_si128(s0++), _mm_load_si128(s1++)));
+            _mm_store_si128(d++, _mm_avg_epu8(_mm_load_si128(s0++), _mm_load_si128(s1++)));
+            _mm_store_si128(d++, _mm_avg_epu8(_mm_load_si128(s0++), _mm_load_si128(s1++)));
+            _mm_store_si128(d++, _mm_avg_epu8(_mm_load_si128(s0++), _mm_load_si128(s1++)));
+            _mm_store_si128(d++, _mm_avg_epu8(_mm_load_si128(s0++), _mm_load_si128(s1++)));
+            _mm_store_si128(d++, _mm_avg_epu8(_mm_load_si128(s0++), _mm_load_si128(s1++)));
+            _mm_store_si128(d++, _mm_avg_epu8(_mm_load_si128(s0++), _mm_load_si128(s1++)));
+        } while((h -= 8) > 0);
+    } else
+    {
+        do
+        {
+            _mm_storel_epi64(d++, _mm_avg_epu8(_mm_loadl_epi64(s0++), _mm_loadl_epi64(s1++)));
+            _mm_storel_epi64(d++, _mm_avg_epu8(_mm_loadl_epi64(s0++), _mm_loadl_epi64(s1++)));
+            _mm_storel_epi64(d++, _mm_avg_epu8(_mm_loadl_epi64(s0++), _mm_loadl_epi64(s1++)));
+            _mm_storel_epi64(d++, _mm_avg_epu8(_mm_loadl_epi64(s0++), _mm_loadl_epi64(s1++)));
+            _mm_storel_epi64(d++, _mm_avg_epu8(_mm_loadl_epi64(s0++), _mm_loadl_epi64(s1++)));
+            _mm_storel_epi64(d++, _mm_avg_epu8(_mm_loadl_epi64(s0++), _mm_loadl_epi64(s1++)));
+            _mm_storel_epi64(d++, _mm_avg_epu8(_mm_loadl_epi64(s0++), _mm_loadl_epi64(s1++)));
+            _mm_storel_epi64(d++, _mm_avg_epu8(_mm_loadl_epi64(s0++), _mm_loadl_epi64(s1++)));
+        } while((h -= 8) > 0);
+    }
+}
+
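+// Quarter-pel luma interpolation: run the half-pel filters selected by (dx,dy) per the table
+// below and average two results (or a half-pel result with the integer-pel input) when needed.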
+static void h264e_qpel_interpolate_luma_sse2(const uint8_t *src, int src_stride, uint8_t *h264e_restrict dst, point_t wh, point_t dxdy)
+{
+    ALIGN(16) uint8_t scratch[16*16] ALIGN2(16);
+//    src += ((dx + 1) >> 2) + ((dy + 1) >> 2)*src_stride;            // dx == 3 ? next row; dy == 3 ? next line
+//    dxdy              actions: Horizontal, Vertical, Diagonal, Average
+//    0 1 2 3 +1        -   ha    h    ha+
+//    1                 va  hva   hda  hv+a
+//    2                 v   vda   d    v+da
+//    3                 va+ h+va h+da  h+v+a
+//    +stride
+    int32_t pos = 1 << (dxdy.s.x + 4*dxdy.s.y);
+    uint8_t *h264e_restrict dst0 = dst;
+
+    if (pos == 1)
+    {
+        copy_wh_sse(src, src_stride, dst, wh.s.x, wh.s.y);
+        return;
+    }
+    if (pos & 0xe0ee) // 1110 0000 1110 1110
+    {
+        hpel_lpf_hor_sse(src + ((dxdy.s.y + 1) >> 2)*src_stride, src_stride, dst, wh.s.x, wh.s.y);
+        dst = scratch;
+    }
+    if (pos & 0xbbb0) // 1011 1011 1011 0000
+    {
+        hpel_lpf_ver_sse(src + ((dxdy.s.x + 1) >> 2), src_stride, dst, wh.s.x, wh.s.y);
+        dst = scratch;
+    }
+    if (pos & 0x4e40) // 0100 1110 0100 0000
+    {
+        hpel_lpf_diag_sse(src, src_stride, dst, wh.s.x, wh.s.y);
+        dst = scratch;
+    }
+    if (pos & 0xfafa) // 1111 1010 1111 1010
+    {
+        assert(wh.s.x == 16 && wh.s.y == 16);
+        if (pos & 0xeae0)// 1110 1010 1110 0000
+        {
+            point_t p;
+            p.u32 = 16 + (16 << 16);
+            h264e_qpel_average_wh_align_sse2(scratch, dst0, dst0, p);
+        } else
+        {
+            src += ((dxdy.s.x + 1) >> 2) + ((dxdy.s.y + 1) >> 2)*src_stride;
+            average_16x16_unalign_sse(dst0, src, src_stride);
+        }
+    }
+}
+
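+// Chroma 1/8-pel interpolation: bilinear with weights (8-dx)(8-dy), dx(8-dy), (8-dx)dy and
+// dx*dy, with fast paths for a pure copy and for purely horizontal/vertical offsets.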
+static void h264e_qpel_interpolate_chroma_sse2(const uint8_t *src, int src_stride, uint8_t *h264e_restrict dst, point_t wh, point_t dxdy)
+{
+    __m128i zero = _mm_setzero_si128();
+    int w = wh.s.x;
+    int h = wh.s.y;
+    __m128i a, b, c, d;
+
+//        __m128i a = _mm_set1_epi16((short)((8-dx) * (8-dy)));
+//        __m128i b = _mm_set1_epi16((short)(dx * (8-dy)));
+//        __m128i c = _mm_set1_epi16((short)((8-dx) * dy));
+//        __m128i d = _mm_set1_epi16((short)(dx * dy));
+    __m128i c8 = _mm_set1_epi16(8);
+    __m128i y,x = _mm_cvtsi32_si128(dxdy.u32);
+    x = _mm_unpacklo_epi16(x, x);
+    x = _mm_unpacklo_epi32(x, x);
+    y = _mm_unpackhi_epi64(x, x);
+    x = _mm_unpacklo_epi64(x, x);
+    a = _mm_mullo_epi16(_mm_sub_epi16(c8, x), _mm_sub_epi16(c8, y));
+    b = _mm_mullo_epi16(x, _mm_sub_epi16(c8, y));
+    c = _mm_mullo_epi16(_mm_sub_epi16(c8, x), y);
+    d = _mm_mullo_epi16(x, y);
+
+    if (!dxdy.u32)
+    {
+        // 10%
+        if (w == 8) do
+        {
+            _mm_storel_epi64((__m128i *)dst, _mm_loadl_epi64((__m128i *)src)); src += src_stride; dst += 16;
+            _mm_storel_epi64((__m128i *)dst, _mm_loadl_epi64((__m128i *)src)); src += src_stride; dst += 16;
+            _mm_storel_epi64((__m128i *)dst, _mm_loadl_epi64((__m128i *)src)); src += src_stride; dst += 16;
+            _mm_storel_epi64((__m128i *)dst, _mm_loadl_epi64((__m128i *)src)); src += src_stride; dst += 16;
+        } while(h -= 4);
+        else
+        {
+            do
+            {
+                *(int *)dst = *(int_u *)src; src += src_stride; dst += 16;
+                *(int *)dst = *(int_u *)src; src += src_stride; dst += 16;
+                *(int *)dst = *(int_u *)src; src += src_stride; dst += 16;
+                *(int *)dst = *(int_u *)src; src += src_stride; dst += 16;
+            } while(h -= 4);
+        }
+    } else
+    if (!dxdy.s.x || !dxdy.s.y)
+    {
+        // 40%
+        int dsrc = dxdy.s.x?1:src_stride;
+        c = _mm_or_si128(c,b);
+
+        if (w==8)
+        {
+            do
+            {
+                _mm_storel_epi64((__m128i *)dst,
+                _mm_packus_epi16(
+                    _mm_srai_epi16(
+                        _mm_add_epi16(
+                            _mm_add_epi16(
+                                    _mm_mullo_epi16(a, MM_LOAD_8TO16(src)),
+                                    _mm_mullo_epi16(c, MM_LOAD_8TO16(src + dsrc))),
+                            _mm_set1_epi16(32)),
+                        6),
+                    zero)) ;
+                dst += 16;
+                src += src_stride;
+            } while (--h);
+        } else
+        {
+            do
+            {
+                *(int* )(dst) = _mm_cvtsi128_si32 (
+                _mm_packus_epi16(
+                    _mm_srai_epi16(
+                        _mm_add_epi16(
+                            _mm_add_epi16(
+                                    _mm_mullo_epi16(a, MM_LOAD_8TO16(src)),
+                                    _mm_mullo_epi16(c, MM_LOAD_8TO16(src + dsrc))),
+                            _mm_set1_epi16(32)),
+                        6),
+                    zero));
+                dst += 16;
+                src += src_stride;
+            } while (--h);
+        }
+    } else
+    {
+        // 50%
+        if (w == 8)
+        {
+            __m128i x1,x0;
+            x0 = _mm_loadl_epi64((__m128i*)(src));
+            x1 = _mm_loadl_epi64((__m128i*)(src + 1));
+            x0 = _mm_unpacklo_epi8(x0, zero);
+            x1 = _mm_unpacklo_epi8(x1, zero);
+            do
+            {
+                __m128i y0, y1;
+                src += src_stride;
+                y0 = _mm_loadl_epi64((__m128i*)(src));
+                y1 = _mm_loadl_epi64((__m128i*)(src + 1));
+                y0 = _mm_unpacklo_epi8(y0, zero);
+                y1 = _mm_unpacklo_epi8(y1, zero);
+                _mm_storel_epi64((__m128i *)dst,
+                    _mm_packus_epi16(
+                        _mm_srai_epi16(
+                            _mm_add_epi16(
+                                _mm_add_epi16(
+                                    _mm_add_epi16(
+                                        _mm_mullo_epi16(x0, a),
+                                        _mm_mullo_epi16(x1, b)),
+                                    _mm_add_epi16(
+                                        _mm_mullo_epi16(y0, c),
+                                        _mm_mullo_epi16(y1, d))),
+                                _mm_set1_epi16(32)),
+                            6),
+                        zero));
+                x0 = y0;
+                x1 = y1;
+                dst += 16;
+            } while (--h);
+        } else
+        {
+            // TODO: load 32!
+            __m128i x1, x0 = MM_LOAD_8TO16(src);
+            do
+            {
+                src += src_stride;
+                x1 = MM_LOAD_8TO16(src);
+                *(int*)(dst) = _mm_cvtsi128_si32(
+                    _mm_packus_epi16(
+                        _mm_srai_epi16(
+                            _mm_add_epi16(
+                                _mm_add_epi16(
+                                    _mm_add_epi16(
+                                        _mm_mullo_epi16(x0, a),
+                                        _mm_mullo_epi16(_mm_srli_si128(x0, 2), b)),
+                                    _mm_add_epi16(
+                                        _mm_mullo_epi16(x1, c),
+                                        _mm_mullo_epi16(_mm_srli_si128(x1, 2), d))),
+                                _mm_set1_epi16(32)),
+                            6),
+                        zero));
+                x0 = x1;
+                dst += 16;
+            } while (--h);
+        }
+    }
+}
+
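+// 16x16 SAD against a packed reference block (16-byte rows): sad[0..3] receive the
+// four 8x8 quadrant SADs (top-left, top-right, bottom-left, bottom-right); the sum is returned.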
+static int h264e_sad_mb_unlaign_8x8_sse2(const pix_t *a, int a_stride, const pix_t *b, int sad[4])
+{
+    __m128i *mb = (__m128i *)b;
+    __m128i s01, s23;
+    s01 = _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++)); a += a_stride;
+    s01 = _mm_add_epi64(s01, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+    s01 = _mm_add_epi64(s01, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+    s01 = _mm_add_epi64(s01, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+    s01 = _mm_add_epi64(s01, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+    s01 = _mm_add_epi64(s01, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+    s01 = _mm_add_epi64(s01, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+    s01 = _mm_add_epi64(s01, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+
+    s23 = _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++)); a += a_stride;
+    s23 = _mm_add_epi64(s23, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+    s23 = _mm_add_epi64(s23, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+    s23 = _mm_add_epi64(s23, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+    s23 = _mm_add_epi64(s23, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+    s23 = _mm_add_epi64(s23, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+    s23 = _mm_add_epi64(s23, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+    s23 = _mm_add_epi64(s23, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+
+    sad[0] = _mm_cvtsi128_si32(s01);
+    sad[1] = _mm_extract_epi16(s01, 4);
+    sad[2] = _mm_cvtsi128_si32(s23);
+    sad[3] = _mm_extract_epi16(s23, 4);
+    return sad[0] + sad[1] + sad[2] + sad[3];
+}
+
+
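+// SAD of an 8x8, 8x16, 16x8 or 16x16 block against a packed reference block with 16-byte rows.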
+static int h264e_sad_mb_unlaign_wh_sse2(const pix_t *a, int a_stride, const pix_t *b, point_t wh)
+{
+    __m128i *mb = (__m128i *)b;
+    __m128i s;
+
+    assert(wh.s.x == 8 || wh.s.x == 16);
+    assert(wh.s.y == 8 || wh.s.y == 16);
+
+    if (wh.s.x == 8)
+    {
+        s =                  _mm_sad_epu8(_mm_loadl_epi64((__m128i *)a), _mm_loadl_epi64(mb++));  a += a_stride;
+        s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadl_epi64((__m128i *)a), _mm_loadl_epi64(mb++))); a += a_stride;
+        s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadl_epi64((__m128i *)a), _mm_loadl_epi64(mb++))); a += a_stride;
+        s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadl_epi64((__m128i *)a), _mm_loadl_epi64(mb++))); a += a_stride;
+        s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadl_epi64((__m128i *)a), _mm_loadl_epi64(mb++))); a += a_stride;
+        s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadl_epi64((__m128i *)a), _mm_loadl_epi64(mb++))); a += a_stride;
+        s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadl_epi64((__m128i *)a), _mm_loadl_epi64(mb++))); a += a_stride;
+        s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadl_epi64((__m128i *)a), _mm_loadl_epi64(mb++))); a += a_stride;
+
+        if (wh.s.y == 16)
+        {
+            s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadl_epi64((__m128i *)a), _mm_loadl_epi64(mb++))); a += a_stride;
+            s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadl_epi64((__m128i *)a), _mm_loadl_epi64(mb++))); a += a_stride;
+            s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadl_epi64((__m128i *)a), _mm_loadl_epi64(mb++))); a += a_stride;
+            s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadl_epi64((__m128i *)a), _mm_loadl_epi64(mb++))); a += a_stride;
+            s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadl_epi64((__m128i *)a), _mm_loadl_epi64(mb++))); a += a_stride;
+            s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadl_epi64((__m128i *)a), _mm_loadl_epi64(mb++))); a += a_stride;
+            s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadl_epi64((__m128i *)a), _mm_loadl_epi64(mb++))); a += a_stride;
+            s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadl_epi64((__m128i *)a), _mm_loadl_epi64(mb++))); a += a_stride;
+        }
+        return _mm_extract_epi16 (s, 0);
+    }
+
+    s =                  _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++));  a += a_stride;
+    s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+    s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+    s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+    s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+    s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+    s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+    s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+
+    if (wh.s.y == 16)
+    {
+        s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+        s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+        s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+        s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+        s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+        s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+        s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+        s = _mm_add_epi16(s, _mm_sad_epu8(_mm_loadu_si128((__m128i *)a), _mm_loadu_si128(mb++))); a += a_stride;
+    }
+
+    s = _mm_adds_epu16(s, _mm_shuffle_epi32(s, 2));
+    return _mm_cvtsi128_si32(s);
+}
+
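+// Copy an 8x8 block from a packed source (16-byte rows) to dst with the given stride.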
+static void h264e_copy_8x8_sse2(pix_t *d, int d_stride, const pix_t *s)
+{
+    assert(IS_ALIGNED(d, 8));
+    assert(IS_ALIGNED(s, 8));
+    _mm_storel_epi64((__m128i*)(d), _mm_loadl_epi64((__m128i*)(s))); s += 16; d += d_stride;
+    _mm_storel_epi64((__m128i*)(d), _mm_loadl_epi64((__m128i*)(s))); s += 16; d += d_stride;
+    _mm_storel_epi64((__m128i*)(d), _mm_loadl_epi64((__m128i*)(s))); s += 16; d += d_stride;
+    _mm_storel_epi64((__m128i*)(d), _mm_loadl_epi64((__m128i*)(s))); s += 16; d += d_stride;
+    _mm_storel_epi64((__m128i*)(d), _mm_loadl_epi64((__m128i*)(s))); s += 16; d += d_stride;
+    _mm_storel_epi64((__m128i*)(d), _mm_loadl_epi64((__m128i*)(s))); s += 16; d += d_stride;
+    _mm_storel_epi64((__m128i*)(d), _mm_loadl_epi64((__m128i*)(s))); s += 16; d += d_stride;
+    _mm_storel_epi64((__m128i*)(d), _mm_loadl_epi64((__m128i*)(s)));
+}
+
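+// Copy a 16x16 block row by row with unaligned 128-bit loads and stores.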
+static void h264e_copy_16x16_sse2(pix_t *d, int d_stride, const pix_t *s, int s_stride)
+{
+    assert(IS_ALIGNED(d, 8));
+    assert(IS_ALIGNED(s, 8));
+    _mm_storeu_si128((__m128i*)(d), _mm_loadu_si128((__m128i*)(s))); s += s_stride; d += d_stride;
+    _mm_storeu_si128((__m128i*)(d), _mm_loadu_si128((__m128i*)(s))); s += s_stride; d += d_stride;
+    _mm_storeu_si128((__m128i*)(d), _mm_loadu_si128((__m128i*)(s))); s += s_stride; d += d_stride;
+    _mm_storeu_si128((__m128i*)(d), _mm_loadu_si128((__m128i*)(s))); s += s_stride; d += d_stride;
+    _mm_storeu_si128((__m128i*)(d), _mm_loadu_si128((__m128i*)(s))); s += s_stride; d += d_stride;
+    _mm_storeu_si128((__m128i*)(d), _mm_loadu_si128((__m128i*)(s))); s += s_stride; d += d_stride;
+    _mm_storeu_si128((__m128i*)(d), _mm_loadu_si128((__m128i*)(s))); s += s_stride; d += d_stride;
+    _mm_storeu_si128((__m128i*)(d), _mm_loadu_si128((__m128i*)(s))); s += s_stride; d += d_stride;
+    _mm_storeu_si128((__m128i*)(d), _mm_loadu_si128((__m128i*)(s))); s += s_stride; d += d_stride;
+    _mm_storeu_si128((__m128i*)(d), _mm_loadu_si128((__m128i*)(s))); s += s_stride; d += d_stride;
+    _mm_storeu_si128((__m128i*)(d), _mm_loadu_si128((__m128i*)(s))); s += s_stride; d += d_stride;
+    _mm_storeu_si128((__m128i*)(d), _mm_loadu_si128((__m128i*)(s))); s += s_stride; d += d_stride;
+    _mm_storeu_si128((__m128i*)(d), _mm_loadu_si128((__m128i*)(s))); s += s_stride; d += d_stride;
+    _mm_storeu_si128((__m128i*)(d), _mm_loadu_si128((__m128i*)(s))); s += s_stride; d += d_stride;
+    _mm_storeu_si128((__m128i*)(d), _mm_loadu_si128((__m128i*)(s))); s += s_stride; d += d_stride;
+    _mm_storeu_si128((__m128i*)(d), _mm_loadu_si128((__m128i*)(s)));
+}
+
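+// Extend the reconstructed picture into its guard band: replicate the top and
+// bottom rows 'guard' times, then the leftmost/rightmost pixel of every row (corners included).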
+static void h264e_copy_borders_sse2(unsigned char *pic, int w, int h, int guard)
+{
+    int rowbytes = w + 2*guard;
+    int topbot = 2;
+    pix_t *s = pic;
+    pix_t *d = pic - guard*rowbytes;
+    assert(guard == 8 || guard == 16);
+    assert((w % 8) == 0);
+    do
+    {
+        int cloop = w;
+        do
+        {
+            __m128i t = _mm_loadu_si128((__m128i*)(s));
+            _mm_storeu_si128((__m128i*)d, t); d += rowbytes;
+            _mm_storeu_si128((__m128i*)d, t); d += rowbytes;
+            _mm_storeu_si128((__m128i*)d, t); d += rowbytes;
+            _mm_storeu_si128((__m128i*)d, t); d += rowbytes;
+            _mm_storeu_si128((__m128i*)d, t); d += rowbytes;
+            _mm_storeu_si128((__m128i*)d, t); d += rowbytes;
+            _mm_storeu_si128((__m128i*)d, t); d += rowbytes;
+            _mm_storeu_si128((__m128i*)d, t); d += rowbytes;
+            if (guard == 16)
+            {
+                _mm_storeu_si128((__m128i*)d, t); d += rowbytes;
+                _mm_storeu_si128((__m128i*)d, t); d += rowbytes;
+                _mm_storeu_si128((__m128i*)d, t); d += rowbytes;
+                _mm_storeu_si128((__m128i*)d, t); d += rowbytes;
+                _mm_storeu_si128((__m128i*)d, t); d += rowbytes;
+                _mm_storeu_si128((__m128i*)d, t); d += rowbytes;
+                _mm_storeu_si128((__m128i*)d, t); d += rowbytes;
+                _mm_storeu_si128((__m128i*)d, t); d += rowbytes;
+            }
+            s += 16;
+            d += 16 - guard*rowbytes;
+        } while((cloop -= 16) > 0);
+        s = pic + (h - 1)*rowbytes;
+        d = s + rowbytes;
+    } while(--topbot);
+
+    {
+        pix_t *s0 = pic - guard*rowbytes;
+        pix_t *s1 = pic - guard*rowbytes + w - 1;
+        int cloop = 2*guard + h;
+        if (guard == 8) do
+        {
+            _mm_storel_epi64((__m128i*)(s0-8), _mm_set1_epi8(*s0));
+            _mm_storel_epi64((__m128i*)(s1+1), _mm_set1_epi8(*s1));
+            s0 += rowbytes;
+            s1 += rowbytes;
+        } while(--cloop); else do
+        {
+            _mm_storeu_si128((__m128i*)(s0-16), _mm_set1_epi8(*s0));
+            _mm_storeu_si128((__m128i*)(s1+1), _mm_set1_epi8(*s1));
+            s0 += rowbytes;
+            s1 += rowbytes;
+        } while(--cloop);
+
+    }
+}
+
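+// In-place 2-D 4x4 Hadamard transform of 16 int16 values (used for the luma DC coefficients).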
+static void hadamar4_2d_sse(int16_t *x)
+{
+    __m128i a = _mm_loadl_epi64((__m128i*)x);
+    __m128i b = _mm_loadl_epi64((__m128i*)(x + 4));
+    __m128i c = _mm_loadl_epi64((__m128i*)(x + 8));
+    __m128i d = _mm_loadl_epi64((__m128i*)(x + 12));
+
+    __m128i u0 = _mm_add_epi16(a, c);
+    __m128i u1 = _mm_sub_epi16(a, c);
+    __m128i u2 = _mm_add_epi16(b, d);
+    __m128i u3 = _mm_sub_epi16(b, d);
+    __m128i v0 = _mm_add_epi16(u0, u2);
+    __m128i v3 = _mm_sub_epi16(u0, u2);
+    __m128i v1 = _mm_add_epi16(u1, u3);
+    __m128i v2 = _mm_sub_epi16(u1, u3);
+
+    //    v0: a0 a1 a2 a3
+    //    v1: b0 ......
+    //    v2: c0 ......
+    //    v3: d0 d1 .. d3
+    //
+    __m128i t0 = _mm_unpacklo_epi16(v0, v1);    // a0, b0, a1, b1, a2, b2, a3, b3
+    __m128i t2 = _mm_unpacklo_epi16(v2, v3);    // c0, d0, c1, d1, c2, d2, c3, d3
+    a = _mm_unpacklo_epi32(t0, t2);    // a0, b0, c0, d0, a1, b1, c1, d1
+    c = _mm_unpackhi_epi32(t0, t2);    // a2, b2, c2, d2, a3, b3, c3, d3
+    u0 = _mm_add_epi16(a, c); // u0 u2
+    u1 = _mm_sub_epi16(a, c); // u1 u3
+    v0 = _mm_unpacklo_epi64(u0, u1); // u0 u1
+    v1 = _mm_unpackhi_epi64(u0, u1); // u2 u3
+    u0 = _mm_add_epi16(v0, v1); // v0 v1
+    u1 = _mm_sub_epi16(v0, v1); // v3 v2
+
+    v1 = _mm_shuffle_epi32(u1, 0x4e); // u2 u3      01001110
+    _mm_store_si128((__m128i*)x, u0);
+    _mm_store_si128((__m128i*)(x + 8), v1);
+
+}
+
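+// Scale each transformed DC value by 'dequant' and write it back as coefficient 0 of its 4x4 block.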
+static void dequant_dc_sse(quant_t *q, int16_t *qval, int dequant, int n)
+{
+    do q++->dq[0] = (int16_t)(*qval++*(int16_t)dequant); while (--n);
+}
+
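+// Quantize DC values: (v*quant + round) >> 18 with sign-dependent rounding;
+// the result is stored both in place and in 'deq'.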
+static void quant_dc_sse(int16_t *qval, int16_t *deq, int16_t quant, int n, int round_q18)
+{
+    int r_minus = (1 << 18) - round_q18;
+    do
+    {
+        int v = *qval;
+        int r = v < 0 ? r_minus : round_q18;
+        *deq++ = *qval++ = (v * quant + r) >> 18;
+    } while (--n);
+}
+
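+// In-place 2x2 Hadamard transform of the four chroma DC coefficients.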
+static void hadamar2_2d_sse(int16_t *x)
+{
+    int a = x[0];
+    int b = x[1];
+    int c = x[2];
+    int d = x[3];
+    x[0] = (int16_t)(a + b + c + d);
+    x[1] = (int16_t)(a - b + c - d);
+    x[2] = (int16_t)(a + b - c - d);
+    x[3] = (int16_t)(a - b - c + d);
+}
+
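+// Luma DC path: Hadamard-transform the 16 DC values stored just before q,
+// quantize them, transform back and dequantize into the blocks.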
+static void h264e_quant_luma_dc_sse2(quant_t *q, int16_t *deq, const uint16_t *qdat)
+{
+    int16_t *tmp = ((int16_t*)q) - 16;
+    hadamar4_2d_sse(tmp);
+    quant_dc_sse(tmp, deq, qdat[0], 16, 0x20000);//0x15555);
+    hadamar4_2d_sse(tmp);
+    assert(!(qdat[1] & 3));
+    // dirty trick here: shift without rounding, since it has no effect for qp >= 10 (or, to be precise, for qp >= 9)
+    dequant_dc_sse(q, tmp, qdat[1] >> 2, 16);
+}
+
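+// Chroma DC path: 2x2 Hadamard, quantize, inverse 2x2 Hadamard, dequantize;
+// returns 1 if any quantized DC coefficient is nonzero.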
+static int h264e_quant_chroma_dc_sse2(quant_t *q, int16_t *deq, const uint16_t *qdat)
+{
+    int16_t *tmp = ((int16_t*)q) - 16;
+    hadamar2_2d_sse(tmp);
+    quant_dc_sse(tmp, deq, (int16_t)(qdat[0] << 1), 4, 0xAAAA);
+    hadamar2_2d_sse(tmp);
+    assert(!(qdat[1] & 1));
+    dequant_dc_sse(q, tmp, qdat[1] >> 1, 4);
+    return !!(tmp[0] | tmp[1] | tmp[2] | tmp[3]);
+}
+
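+// Returns 1 when every |coefficient| of a 4x4 block is within its dead-zone
+// threshold (coefficient 0 is ignored when i0 is set).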
+static int is_zero_sse(const int16_t *dat, int i0, const uint16_t *thr)
+{
+    __m128i t = _mm_loadu_si128((__m128i*)(thr));
+    __m128i d = _mm_load_si128((__m128i*)(dat));
+    __m128i z = _mm_setzero_si128();
+    __m128i m, sign;
+    if (i0) d = _mm_insert_epi16 (d, 0, 0);
+
+    sign = _mm_cmpgt_epi16(z, d);
+    d = _mm_sub_epi16(_mm_xor_si128(d, sign), sign);
+
+    m = _mm_cmpgt_epi16(d, t);
+    d = _mm_loadu_si128((__m128i*)(dat + 8));
+    sign = _mm_cmpgt_epi16(z, d);
+    d = _mm_sub_epi16(_mm_xor_si128(d, sign), sign);
+    m = _mm_or_si128(m, _mm_cmpgt_epi16(d, t));
+    return !_mm_movemask_epi8(m);
+}
+
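+// Same test for an 8x8 area: the four 4x4 blocks at offsets 0, 1, 4 and 5 of the block array.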
+static int is_zero4_sse(const quant_t *q, int i0, const uint16_t *thr)
+{
+    return is_zero_sse(q[0].dq, i0, thr) &&
+           is_zero_sse(q[1].dq, i0, thr) &&
+           is_zero_sse(q[4].dq, i0, thr) &&
+           is_zero_sse(q[5].dq, i0, thr);
+}
+
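+// Forward 4x4 transform of (input - prediction) for all blocks of the plane,
+// followed by quantization and in-place dequantization; DC values are split out
+// for intra-16/chroma modes, and blocks below the dead-zone thresholds may be
+// zeroed. Returns a bitmask of blocks with nonzero quantized coefficients.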
+static int h264e_transform_sub_quant_dequant_sse2(const pix_t *inp, const pix_t *pred, int inp_stride, int mode, quant_t *q, const uint16_t *qdat)
+{
+    int crow = mode >> 1;
+    int ccol = crow;
+    int i, i0 = mode & 1;
+    int nz_block_mask = 0;
+    int zmask = 0;
+    quant_t *q_0 = q;
+
+    int y, x;
+    for (y = 0; y < crow; y++)
+    {
+        for (x = 0; x < ccol; x += 2)
+        {
+            const pix_t *pinp  = inp  + inp_stride*4*y + 4*x;
+            const pix_t *ppred = pred +         16*4*y + 4*x;
+
+            __m128i d0, d1, d2, d3;
+            __m128i t0, t1, t2, t3;
+            __m128i q0, q1, q2, q3;
+            __m128i zero = _mm_setzero_si128();
+            __m128i inp8 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)pinp),  zero);
+            __m128i pred8 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)ppred), zero);
+
+            d0 =_mm_sub_epi16(inp8, pred8);
+            pinp += inp_stride;
+            inp8 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)pinp),  zero);
+            pred8 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(ppred + 16)), zero);
+            d1 =_mm_sub_epi16(inp8, pred8);
+            pinp += inp_stride;
+            inp8 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)pinp),  zero);
+            pred8 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(ppred + 32)), zero);
+            d2 =_mm_sub_epi16(inp8, pred8);
+            pinp += inp_stride;
+            inp8 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)pinp),  zero);
+            pred8 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(ppred + 48)), zero);
+            d3 =_mm_sub_epi16(inp8, pred8);
+            t0 = _mm_add_epi16(d0, d3);
+            t1 = _mm_sub_epi16(d0, d3);
+            t2 = _mm_add_epi16(d1, d2);
+            t3 = _mm_sub_epi16(d1, d2);
+            q0 = _mm_add_epi16(t0, t2);
+            q1 = _mm_add_epi16(_mm_add_epi16(t1, t1), t3);
+            q2 = _mm_sub_epi16(t0, t2);
+            q3 = _mm_sub_epi16(t1, _mm_add_epi16(t3, t3));
+
+            //    q0: a0 a1 ....... a7
+            //    q1: b0 .............
+            //    q2: c0 .............
+            //    q3: d0 d1 ....... d7
+            //
+            t0 = _mm_unpacklo_epi16(q0, q1);    // a0, b0, a1, b1, a2, b2, a3, b3
+            t1 = _mm_unpackhi_epi16(q0, q1);    // a4, b4, a5, b5, a6, b6, a7, b7
+            t2 = _mm_unpacklo_epi16(q2, q3);    // c0, d0
+            t3 = _mm_unpackhi_epi16(q2, q3);    // c4, d4
+
+            q0 = _mm_unpacklo_epi32(t0, t2);    // a0, b0, c0, d0, a1, b1, c1, d1
+            q1 = _mm_unpackhi_epi32(t0, t2);    // a2, b2,
+            q2 = _mm_unpacklo_epi32(t1, t3);    // a4, b4
+            q3 = _mm_unpackhi_epi32(t1, t3);    // a6, b6
+
+            d0 = _mm_unpacklo_epi64(q0, q2);    // a0, b0, c0, d0, a4, b4, c4, d4
+            d1 = _mm_unpackhi_epi64(q0, q2);    // a1, b1, c1, d1
+            d2 = _mm_unpacklo_epi64(q1, q3);    // a2, b2,
+            d3 = _mm_unpackhi_epi64(q1, q3);    // a3, b3,
+
+            t0 = _mm_add_epi16(d0, d3);
+            t1 = _mm_sub_epi16(d0, d3);
+            t2 = _mm_add_epi16(d1, d2);
+            t3 = _mm_sub_epi16(d1, d2);
+            q0 = _mm_add_epi16(t0, t2);
+            q1 = _mm_add_epi16(_mm_add_epi16(t1, t1), t3);
+            q2 = _mm_sub_epi16(t0, t2);
+            q3 = _mm_sub_epi16(t1, _mm_add_epi16(t3, t3));
+
+            _mm_storel_epi64((__m128i*)(q[0].dq    ), q0);
+            _mm_storel_epi64((__m128i*)(q[0].dq + 4), q1);
+            _mm_storel_epi64((__m128i*)(q[0].dq + 8), q2);
+            _mm_storel_epi64((__m128i*)(q[0].dq + 12), q3);
+            if (ccol > 1)
+            {
+                q0 = _mm_unpackhi_epi64(q0, q0); _mm_storel_epi64((__m128i*)(q[1].dq    ), q0);
+                q1 = _mm_unpackhi_epi64(q1, q1); _mm_storel_epi64((__m128i*)(q[1].dq + 4), q1);
+                q2 = _mm_unpackhi_epi64(q2, q2); _mm_storel_epi64((__m128i*)(q[1].dq + 8), q2);
+                q3 = _mm_unpackhi_epi64(q3, q3); _mm_storel_epi64((__m128i*)(q[1].dq + 12), q3);
+            }
+            q += 2;
+        }
+    }
+    q = q_0;
+    crow = mode >> 1;
+    ccol = crow;
+
+    if (mode & 1) // QDQ_MODE_INTRA_16 || QDQ_MODE_CHROMA
+    {
+        int cloop = (mode >> 1)*(mode >> 1);
+        short *dc = ((short *)q) - 16;
+        quant_t *pq = q;
+        do
+        {
+            *dc++ = pq->dq[0];
+            pq++;
+        } while (--cloop);
+    }
+
+    if (mode == QDQ_MODE_INTER || mode == QDQ_MODE_CHROMA)
+    {
+        for (i = 0; i < crow*ccol; i++)
+        {
+            if (is_zero_sse(q[i].dq, i0, qdat + OFFS_THR_1_OFF))
+            {
+                zmask |= (1 << i);
+            }
+        }
+
+        if (mode == QDQ_MODE_INTER)
+        {
+            if ((~zmask & 0x0033) && is_zero4_sse(q +  0, i0, qdat + OFFS_THR_2_OFF)) zmask |= 0x33;
+            if ((~zmask & 0x00CC) && is_zero4_sse(q +  2, i0, qdat + OFFS_THR_2_OFF)) zmask |= (0x33 << 2);
+            if ((~zmask & 0x3300) && is_zero4_sse(q +  8, i0, qdat + OFFS_THR_2_OFF)) zmask |= (0x33 << 8);
+            if ((~zmask & 0xCC00) && is_zero4_sse(q + 10, i0, qdat + OFFS_THR_2_OFF)) zmask |= (0x33 << 10);
+        }
+    }
+
+    do
+    {
+        do
+        {
+            int nz_mask = 0;
+            if (zmask & 1)
+            {
+                _mm_store_si128((__m128i*)(q->qv),     _mm_setzero_si128());
+                _mm_store_si128((__m128i*)(q->qv) + 1, _mm_setzero_si128());
+            } else
+            {
+                int16_t *qv_tmp = q->qv;//[16];
+                __m128i t;
+                const __m128i const_q  = _mm_loadu_si128((__m128i*)(qdat + OFFS_QUANT_VECT));
+                const __m128i const_dq = _mm_loadu_si128((__m128i*)(qdat + OFFS_DEQUANT_VECT));
+
+                __m128i src = _mm_load_si128((__m128i*)(q[0].dq));
+                __m128i r = _mm_xor_si128(_mm_set1_epi16(qdat[OFFS_RND_INTER]), _mm_cmpgt_epi16(_mm_setzero_si128(), src));
+                __m128i lo = _mm_mullo_epi16(src, const_q);
+                __m128i hi = _mm_mulhi_epi16(src, const_q);
+                __m128i dst0 = _mm_unpacklo_epi16(lo, hi);
+                __m128i dst1 = _mm_unpackhi_epi16(lo, hi);
+                dst0 = _mm_srai_epi32(_mm_add_epi32(dst0, _mm_unpacklo_epi16(r, _mm_setzero_si128())), 16);
+                dst1 = _mm_srai_epi32(_mm_add_epi32(dst1, _mm_unpackhi_epi16(r, _mm_setzero_si128())), 16);
+                dst0 = _mm_packs_epi32(dst0, dst1);
+                _mm_store_si128((__m128i*)(qv_tmp), dst0);
+
+                t = _mm_cmpeq_epi16(_mm_setzero_si128(), dst0);
+                nz_mask = _mm_movemask_epi8( _mm_packs_epi16(t, t)) & 0xff;
+                dst0 = _mm_mullo_epi16(dst0, const_dq);
+                _mm_store_si128((__m128i*)(q[0].dq), dst0);
+
+
+                src = _mm_load_si128((__m128i*)(q[0].dq + 8));
+                r = _mm_xor_si128(_mm_set1_epi16(qdat[OFFS_RND_INTER]), _mm_cmpgt_epi16(_mm_setzero_si128(), src));
+                lo = _mm_mullo_epi16(src, const_q);
+                hi = _mm_mulhi_epi16(src, const_q);
+                dst0 = _mm_unpacklo_epi16(lo, hi);
+                dst1 = _mm_unpackhi_epi16(lo, hi);
+
+                dst0 = _mm_srai_epi32(_mm_add_epi32(dst0, _mm_unpacklo_epi16(r, _mm_setzero_si128())), 16);
+                dst1 = _mm_srai_epi32(_mm_add_epi32(dst1, _mm_unpackhi_epi16(r, _mm_setzero_si128())), 16);
+                dst0 = _mm_packs_epi32(dst0, dst1);
+                _mm_store_si128((__m128i*)(qv_tmp + 8), dst0);
+
+                t = _mm_cmpeq_epi16(_mm_setzero_si128(), dst0);
+                nz_mask |= _mm_movemask_epi8( _mm_packs_epi16(t, t)) << 8;
+                dst0 = _mm_mullo_epi16(dst0, const_dq);
+                _mm_store_si128((__m128i*)(q[0].dq + 8), dst0);
+                nz_mask = ~nz_mask & 0xffff;
+                if (i0)
+                {
+                    nz_mask &= ~1;
+                }
+            }
+
+            zmask >>= 1;
+            nz_block_mask <<= 1;
+            if (nz_mask)
+                nz_block_mask |= 1;
+            q++;
+        } while (--ccol);
+        ccol = mode >> 1;
+    } while (--crow);
+    return nz_block_mask;
+}
+
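+// Inverse 4x4 transform and reconstruction: add the dequantized residual to
+// the prediction for each 4x4 block; blocks whose mask bit is clear are copied
+// straight from the prediction.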
+static void h264e_transform_add_sse2(pix_t *out, int out_stride, const pix_t *pred, quant_t *q, int side, int32_t mask)
+{
+    int crow = side;
+    int ccol = crow;
+
+    assert(IS_ALIGNED(out, 4));
+    assert(IS_ALIGNED(pred, 4));
+    assert(!(out_stride % 4));
+
+    do
+    {
+        do
+        {
+            if (mask >= 0)
+            {
+                // copy 4x4
+                pix_t *dst = out;
+                *(uint32_t*)dst = *(uint32_t*)(pred + 0 * 16); dst += out_stride;
+                *(uint32_t*)dst = *(uint32_t*)(pred + 1 * 16); dst += out_stride;
+                *(uint32_t*)dst = *(uint32_t*)(pred + 2 * 16); dst += out_stride;
+                *(uint32_t*)dst = *(uint32_t*)(pred + 3 * 16);
+            }
+            else
+            {
+                __m128i zero = _mm_setzero_si128();
+                __m128i c32 = _mm_set1_epi16(32);
+                __m128i d0, d1, d2, d3;
+                __m128i e0, e1, e2, e3;
+                d0 = _mm_load_si128((__m128i*)(q->dq + 0));
+                d2 = _mm_load_si128((__m128i*)(q->dq + 8));
+                d1 = _mm_unpackhi_epi64(d0, d2);
+                d3 = _mm_unpackhi_epi64(d2, d0);
+
+                e0 = _mm_add_epi16(d0, d2);
+                e1 = _mm_sub_epi16(d0, d2);
+
+                e2 = _mm_srai_epi16(d1, 1);
+                e2 = _mm_sub_epi16(e2, d3);
+                e3 = _mm_srai_epi16(d3, 1);
+                e3 = _mm_add_epi16(e3, d1);
+
+                d0 = _mm_add_epi16(e0, e3);
+                d1 = _mm_add_epi16(e1, e2);
+                d2 = _mm_sub_epi16(e1, e2);
+                d3 = _mm_sub_epi16(e0, e3);
+
+                e1 = _mm_unpacklo_epi16(d0, d1);    // a0, b0, a1, b1, a2, b2, a3, b3
+                e3 = _mm_unpacklo_epi16(d2, d3);    // c0, d0
+
+                e0 = _mm_unpacklo_epi32(e1, e3);    // a0, b0, c0, d0, a1, b1, c1, d1
+                e2 = _mm_unpackhi_epi32(e1, e3);    // a2, b2,
+
+                e1 = _mm_unpackhi_epi64(e0, e2);
+                e3 = _mm_unpackhi_epi64(e2, e0);
+
+                d0 = _mm_add_epi16(e0, e2);
+                d1 = _mm_sub_epi16(e0, e2);
+                d2 = _mm_srai_epi16(e1, 1);
+                d2 = _mm_sub_epi16(d2, e3);
+                d3 = _mm_srai_epi16(e3, 1);
+                d3 = _mm_add_epi16(d3, e1);
+
+                // Pack 4x64 to 2x128
+                e0 = _mm_unpacklo_epi64(d0, d1);
+                e1 = _mm_unpacklo_epi64(d3, d2);
+
+                e0 = _mm_add_epi16(e0, c32);
+                d0 = _mm_srai_epi16(_mm_add_epi16(e0, e1), 6);
+                d3 = _mm_srai_epi16(_mm_sub_epi16(e0, e1), 6);
+                // Unpack back to 4x64
+                d1 = _mm_unpackhi_epi64(d0, zero);
+                d2 = _mm_unpackhi_epi64(d3, zero);
+
+                *(int* )(out)                = _mm_cvtsi128_si32(_mm_packus_epi16(_mm_add_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(pred +  0)), zero), d0), zero));
+                *(int* )(out + 1*out_stride) = _mm_cvtsi128_si32(_mm_packus_epi16(_mm_add_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(pred + 16)), zero), d1), zero));
+                *(int* )(out + 2*out_stride) = _mm_cvtsi128_si32(_mm_packus_epi16(_mm_add_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(pred + 32)), zero), d2), zero));
+                *(int* )(out + 3*out_stride) = _mm_cvtsi128_si32(_mm_packus_epi16(_mm_add_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(pred + 48)), zero), d3), zero));
+
+            }
+            mask = (uint32_t)mask << 1;
+            q++;
+            out += 4;
+            pred += 4;
+        } while (--ccol);
+        ccol = side;
+        out += 4*(out_stride - ccol);
+        pred += 4*(16 - ccol);
+    } while (--crow);
+}
+#endif
+
+#if H264E_ENABLE_NEON && !defined(MINIH264_ASM)
+#define TR32(x, y) tr0 = vtrnq_u32(vreinterpretq_u32_u8(x), vreinterpretq_u32_u8(y)); x = vreinterpretq_u8_u32(tr0.val[0]); y = vreinterpretq_u8_u32(tr0.val[1]);
+#define TR16(x, y) tr1 = vtrnq_u16(vreinterpretq_u16_u8(x), vreinterpretq_u16_u8(y)); x = vreinterpretq_u8_u16(tr1.val[0]); y = vreinterpretq_u8_u16(tr1.val[1]);
+#define TR8(x, y)  tr2 = vtrnq_u8((x), (y)); x = (tr2.val[0]); y = (tr2.val[1]);
+
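+// Normal-strength (bS < 4) deblocking of a vertical luma edge: 16 rows of 8
+// pixels are loaded and transposed so the filter works on rows, then transposed
+// back and stored.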
+static void deblock_luma_v_neon(uint8_t *pix, int stride, int alpha, int beta, const uint8_t *pthr, const uint8_t *pstr)
+{
+    uint8x16_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14, q15;
+    uint8x16_t tmp;
+    uint32x4x2_t tr0;
+    uint16x8x2_t tr1;
+    uint8x16x2_t tr2;
+    q8 = vcombine_u8(vld1_u8(pix - 4), vld1_u8(pix - 4 + 8*stride)); pix += stride;
+    q9 = vcombine_u8(vld1_u8(pix - 4), vld1_u8(pix - 4 + 8*stride)); pix += stride;
+    q10= vcombine_u8(vld1_u8(pix - 4), vld1_u8(pix - 4 + 8*stride)); pix += stride;
+    q11= vcombine_u8(vld1_u8(pix - 4), vld1_u8(pix - 4 + 8*stride)); pix += stride;
+    q12= vcombine_u8(vld1_u8(pix - 4), vld1_u8(pix - 4 + 8*stride)); pix += stride;
+    q13= vcombine_u8(vld1_u8(pix - 4), vld1_u8(pix - 4 + 8*stride)); pix += stride;
+    q14= vcombine_u8(vld1_u8(pix - 4), vld1_u8(pix - 4 + 8*stride)); pix += stride;
+    q15= vcombine_u8(vld1_u8(pix - 4), vld1_u8(pix - 4 + 8*stride)); pix += stride;
+
+    TR32(q8,  q12);
+    TR32(q9,  q13);
+    TR32(q10, q14);
+    TR32(q11, q15);
+    TR16(q8,  q10);
+    TR16(q9,  q11);
+    TR16(q12, q14);
+    TR16(q13, q15);
+    TR8(q8,   q9 );
+    TR8(q10,  q11);
+    TR8(q12,  q13);
+    TR8(q14,  q15);
+
+    q1  = vabdq_u8(q11, q12);
+    q2  = vcltq_u8(q1, vdupq_n_u8(alpha));
+    q1  = vcltq_u8(vmaxq_u8(vabdq_u8(q11, q10), vabdq_u8(q12, q13)), vdupq_n_u8(beta));
+    q2  = vandq_u8(q2, q1);
+
+    tmp = vreinterpretq_u8_u32(vdupq_n_u32(*(uint32_t*)pstr));
+    tmp = vzipq_u8(tmp, tmp).val[0];
+    tmp = vzipq_u8(tmp, tmp).val[0];
+    q1  = tmp;
+
+    q1  = vcgtq_s8(vreinterpretq_s8_u8(q1), vdupq_n_s8(0));
+    q2  = vandq_u8(q2, q1);
+    q7 = vhsubq_u8(q10, q13);
+    q7 = vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_u8(q7), 1));
+    q0 = veorq_u8(q12, q11);
+    q6 = vandq_u8(vdupq_n_u8(1), q0);
+
+    q0 = vhsubq_u8(q12, q11); // (q0 - p0) >> 1
+
+    q7 = vreinterpretq_u8_s8(vrhaddq_s8(vreinterpretq_s8_u8(q7), vreinterpretq_s8_u8(q6))); //((p1-q1)>>2 + carry + 1) >> 1
+    q7 = vreinterpretq_u8_s8(vqaddq_s8(vreinterpretq_s8_u8(q0),  vreinterpretq_s8_u8(q7))); //=delta = (((q0-p0)<<2) + (p1-q1) + 4) >> 3;
+    q7 = vandq_u8(q7, q2);
+
+    tmp = vreinterpretq_u8_u32(vdupq_n_u32(*(uint32_t*)pthr));
+    tmp = vzipq_u8(tmp, tmp).val[0];
+    tmp = vzipq_u8(tmp, tmp).val[0];
+    q1  = tmp;
+
+    q1  = vandq_u8(q2, q1);
+
+    q0 = vabdq_u8(q9,  q11); // ap = ABS(p2 - p0);
+    q0 = vcltq_u8(q0,  vdupq_n_u8(beta)); //sp = (ap - beta) >> 31;
+    q4 = vandq_u8(q0,  q2);  // & sp
+    q0 = vabdq_u8(q14, q12); //aq = ABS(q2 - q0);
+    q0 = vcltq_u8(q0,  vdupq_n_u8(beta))  ;//sq = (aq - beta) >> 31;
+    q3 = vandq_u8(q0,  q2);  //  & sq
+
+    q0  = vrhaddq_u8(q11, q12);//((p0+q0+1)>>1)
+    q0  = vhaddq_u8 (q0,  q9 );//((p2 + ((p0+q0+1)>>1))>>1)
+    q5  = vandq_u8  (q1,  q4 );
+    q6  = vqaddq_u8 (q10, q5 );//{p1+thr}
+    q0  = vminq_u8  (q0,  q6 );
+    q6  = vqsubq_u8 (q10, q5 );//{p1-thr}
+    q10 = vmaxq_u8  (q0,  q6 );
+
+    q0  = vrhaddq_u8(q11, q12);// ;((p0+q0+1)>>1)
+    q0  = vhaddq_u8 (q0,  q14);// ;((q2 + ((p0+q0+1)>>1))>>1)
+    q5  = vandq_u8  (q1,  q3 );
+    q6  = vqaddq_u8 (q13, q5 );// ;{q1+thr}
+    q0  = vminq_u8  (q0,  q6 );
+    q6  = vqsubq_u8 (q13, q5 );// ;{q1-thr}
+    q13 = vmaxq_u8  (q0,  q6 );
+
+    q1  = vreinterpretq_u8_s8(vsubq_s8(vreinterpretq_s8_u8(q1), vreinterpretq_s8_u8(q3)));
+    q1  = vreinterpretq_u8_s8(vsubq_s8(vreinterpretq_s8_u8(q1), vreinterpretq_s8_u8(q4))); //tC = thr - sp - sq;
+    q1  = vandq_u8(q1, q2);// ; set thr = 0 if str==0
+
+    q6  = veorq_u8(q6, q6);
+    q5  = vreinterpretq_u8_s8(vmaxq_s8(vreinterpretq_s8_u8(q6), vreinterpretq_s8_u8(q7))); //delta > 0
+    q7  = vreinterpretq_u8_s8(vsubq_s8(vreinterpretq_s8_u8(q6), vreinterpretq_s8_u8(q7)));
+    q6  = vreinterpretq_u8_s8(vmaxq_s8(vreinterpretq_s8_u8(q6), vreinterpretq_s8_u8(q7))); //-(delta < 0)
+    q5  =  vminq_u8(q1, q5);
+    q6  =  vminq_u8(q1, q6);
+
+    q11 = vqaddq_u8(q11, q5);
+    q11 = vqsubq_u8(q11, q6);
+    q12 = vqsubq_u8(q12, q5);
+    q12 = vqaddq_u8(q12, q6);
+
+    TR8(q8,   q9 );
+    TR8(q10,  q11);
+    TR8(q12,  q13);
+    TR8(q14,  q15);
+    TR16(q8,  q10);
+    TR16(q9,  q11);
+    TR16(q12, q14);
+    TR16(q13, q15);
+    TR32(q8,  q12);
+    TR32(q9,  q13);
+    TR32(q10, q14);
+    TR32(q11, q15);
+
+    pix -= 8*stride + 4;
+    vst1_u8(pix, vget_low_u8(q8));  pix += stride;
+    vst1_u8(pix, vget_low_u8(q9));  pix += stride;
+    vst1_u8(pix, vget_low_u8(q10)); pix += stride;
+    vst1_u8(pix, vget_low_u8(q11)); pix += stride;
+    vst1_u8(pix, vget_low_u8(q12)); pix += stride;
+    vst1_u8(pix, vget_low_u8(q13)); pix += stride;
+    vst1_u8(pix, vget_low_u8(q14)); pix += stride;
+    vst1_u8(pix, vget_low_u8(q15)); pix += stride;
+
+    vst1_u8(pix, vget_high_u8(q8)); pix += stride;
+    vst1_u8(pix, vget_high_u8(q9)); pix += stride;
+    vst1_u8(pix, vget_high_u8(q10)); pix += stride;
+    vst1_u8(pix, vget_high_u8(q11)); pix += stride;
+    vst1_u8(pix, vget_high_u8(q12)); pix += stride;
+    vst1_u8(pix, vget_high_u8(q13)); pix += stride;
+    vst1_u8(pix, vget_high_u8(q14)); pix += stride;
+    vst1_u8(pix, vget_high_u8(q15)); pix += stride;
+}
+
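+// Strong (bS == 4) deblocking of a horizontal luma edge, 16 pixels wide.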
+static void deblock_luma_h_s4_neon(uint8_t *pix, int stride, int alpha, int beta)
+{
+    uint8x16_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14, q15, vspill0, vspill1;
+    q8  = vld1q_u8(pix - 4*stride);
+    q9  = vld1q_u8(pix - 3*stride);
+    q10 = vld1q_u8(pix - 2*stride);
+    q11 = vld1q_u8(pix - 1*stride);
+    q12 = vld1q_u8(pix);
+    q13 = vld1q_u8(pix + 1*stride);
+    q14 = vld1q_u8(pix + 2*stride);
+    q15 = vld1q_u8(pix + 3*stride);
+    q0  = vabdq_u8(q11, q12);
+    q2  = vcltq_u8(q0, vdupq_n_u8(alpha));
+    q2  = vandq_u8(q2, vcltq_u8(vabdq_u8(q11, q10), vdupq_n_u8(beta)));
+    q2  = vandq_u8(q2, vcltq_u8(vabdq_u8(q12, q13), vdupq_n_u8(beta)));
+    q1  = vandq_u8(q2, vcltq_u8(q0, vdupq_n_u8(((alpha >> 2) + 2))));
+    q0  = vandq_u8(q1, vcltq_u8(vabdq_u8(q9,  q11), vdupq_n_u8(beta)));
+    q3  = vandq_u8(q1, vcltq_u8(vabdq_u8(q14, q12), vdupq_n_u8(beta)));
+    q4 = vhaddq_u8(q9,  q10);
+    q5 = vhaddq_u8(q11, q12);
+    q6 = vsubq_u8(vrhaddq_u8(q9,  q10), q4);
+    q7 = vsubq_u8(vrhaddq_u8(q11, q12), q5);
+    q6 = vhaddq_u8(q6, q7);
+    q7 = vrhaddq_u8(q4, q8);
+    q4 = vhaddq_u8(q4, q8);
+    q7 = vsubq_u8(q7, q4);
+    q6 = vaddq_u8(q6, q7);
+    q7 = vrhaddq_u8(q5, q9);
+    q5 = vhaddq_u8(q5, q9);
+    q7 = vsubq_u8(q7, q5);
+    q6 = vhaddq_u8(q6, q7);
+
+    q7 = vrhaddq_u8(q4, q5);
+    q4 = vhaddq_u8(q4, q5);
+    q7 = vsubq_u8(q7, q4);
+    q6 = vrhaddq_u8(q6, q7);
+    q4 = vaddq_u8(q4, q6);
+    vspill0 =  vbslq_u8(q0, q4, q9);   // VMOV        q6,     q9   VBIT        q6,     q4,     q0
+
+    q4 = vhaddq_u8(q14, q13);
+    q5 = vhaddq_u8(q12, q11);
+    q6 = vsubq_u8(vrhaddq_u8(q14, q13), q4);
+    q7 = vsubq_u8(vrhaddq_u8(q12, q11), q5);
+    q6 = vhaddq_u8(q6, q7);
+    q7 = vrhaddq_u8(q4, q15);
+    q4 = vhaddq_u8(q4, q15);
+    q7 = vsubq_u8(q7, q4);
+    q6 = vaddq_u8(q6, q7);
+    q7 = vrhaddq_u8(q5, q14);
+    q5 = vhaddq_u8(q5, q14);
+    q7 = vsubq_u8(q7, q5);
+    q6 = vhaddq_u8(q6, q7);
+
+    q7 = vrhaddq_u8(q4, q5);
+    q4 = vhaddq_u8(q4, q5);
+    q7 = vsubq_u8(q7, q4);
+    q6 = vrhaddq_u8(q6, q7);
+    q4 = vaddq_u8(q4, q6);
+    vspill1 =  vbslq_u8(q3, q4, q14);   //     VMOV        q6,     q14    VBIT        q6,     q4,     q3
+
+    q1 = vhaddq_u8 (q9,  q13);
+    q4 = vrhaddq_u8(q1,  q10);
+    q5 = vrhaddq_u8(q11, q12);
+    q6 = vhaddq_u8 (q1,  q10);
+    q7 = vhaddq_u8 (q11, q12);
+    q4 = vhaddq_u8 (q4,  q5);
+    q6 = vrhaddq_u8(q6,  q7);
+    q1 = vrhaddq_u8(q4,  q6);
+    q4 = vrhaddq_u8(q9,  q10);
+    q5 = vrhaddq_u8(q11, q12);
+    q6 = vhaddq_u8 (q9,  q10);
+    q7 = vhaddq_u8 (q11, q12);
+    q4 = vhaddq_u8 (q4,  q5);
+    q6 = vrhaddq_u8(q6,  q7);
+    q4 = vrhaddq_u8(q4,  q6);
+    q5 = vhaddq_u8 (q11, q13);
+    q5 = vrhaddq_u8(q5,  q10);
+
+    q1 = vbslq_u8(q0, q1, q5); //VBIF        q1,     q5,     q0
+    q0 = vbslq_u8(q0, q4, q10);//VBSL        q0,     q4,     q10
+
+    q7 = vhaddq_u8 (q14, q10);
+    q4 = vrhaddq_u8(q7,  q13);
+    q5 = vrhaddq_u8(q11, q12);
+    q6 = vhaddq_u8 (q7,  q13);
+    q7 = vhaddq_u8 (q11, q12);
+    q4 = vhaddq_u8 (q4,  q5 );
+    q6 = vrhaddq_u8(q6,  q7 );
+    q4 = vrhaddq_u8(q4,  q6 );
+    q6 = vrhaddq_u8(q14, q13);
+    q5 = vrhaddq_u8(q11, q12);
+    q5 = vhaddq_u8 (q6,  q5 );
+    q6 = vhaddq_u8 (q14, q13);
+    q7 = vhaddq_u8 (q11, q12);
+    q6 = vrhaddq_u8(q6,  q7 );
+    q5 = vrhaddq_u8(q5,  q6 );
+    q6 = vhaddq_u8 (q12, q10);
+    q6 = vrhaddq_u8(q6,  q13);
+
+    q4 = vbslq_u8(q3, q4, q6); //    VBIF        q4,     q6,     q3    ;q0
+    q3 = vbslq_u8(q3, q5, q13);//    VBSL        q3,     q5,     q13   ;q1
+
+    q10 = vbslq_u8(q2, q0, q10);
+    q11 = vbslq_u8(q2, q1, q11);
+    q12 = vbslq_u8(q2, q4, q12);
+    q13 = vbslq_u8(q2, q3, q13);
+
+    vst1q_u8(pix - 3*stride, vspill0);
+    vst1q_u8(pix - 2*stride, q10);
+    vst1q_u8(pix - 1*stride, q11);
+    vst1q_u8(pix           , q12);
+    vst1q_u8(pix + 1*stride, q13);
+    vst1q_u8(pix + 2*stride, vspill1);
+
+}
+
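+// Strong (bS == 4) deblocking of a vertical luma edge: transpose, filter, transpose back.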
+static void deblock_luma_v_s4_neon(uint8_t *pix, int stride, int alpha, int beta)
+{
+    uint32x4x2_t tr0;
+    uint16x8x2_t tr1;
+    uint8x16x2_t tr2;
+    uint8x16_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, q13, q14, q15, vspill0, vspill1;
+    q8 = vcombine_u8(vld1_u8(pix - 4), vld1_u8(pix - 4 + 8*stride)); pix += stride;
+    q9 = vcombine_u8(vld1_u8(pix - 4), vld1_u8(pix - 4 + 8*stride)); pix += stride;
+    q10= vcombine_u8(vld1_u8(pix - 4), vld1_u8(pix - 4 + 8*stride)); pix += stride;
+    q11= vcombine_u8(vld1_u8(pix - 4), vld1_u8(pix - 4 + 8*stride)); pix += stride;
+    q12= vcombine_u8(vld1_u8(pix - 4), vld1_u8(pix - 4 + 8*stride)); pix += stride;
+    q13= vcombine_u8(vld1_u8(pix - 4), vld1_u8(pix - 4 + 8*stride)); pix += stride;
+    q14= vcombine_u8(vld1_u8(pix - 4), vld1_u8(pix - 4 + 8*stride)); pix += stride;
+    q15= vcombine_u8(vld1_u8(pix - 4), vld1_u8(pix - 4 + 8*stride)); pix += stride;
+
+    TR32(q8,  q12);
+    TR32(q9,  q13);
+    TR32(q10, q14);
+    TR32(q11, q15);
+    TR16(q8,  q10);
+    TR16(q9,  q11);
+    TR16(q12, q14);
+    TR16(q13, q15);
+    TR8(q8,   q9 );
+    TR8(q10,  q11);
+    TR8(q12,  q13);
+    TR8(q14,  q15);
+
+    q0 = vabdq_u8(q11, q12);
+    q2 = vcltq_u8(q0, vdupq_n_u8(alpha));
+    q2 = vandq_u8(q2, vcltq_u8(vabdq_u8(q11,    q10), vdupq_n_u8(beta)));
+    q2 = vandq_u8(q2, vcltq_u8(vabdq_u8(q12,    q13), vdupq_n_u8(beta)));
+    q1 = vandq_u8(q2, vcltq_u8(q0, vdupq_n_u8(((alpha >> 2) + 2))));
+    q0 = vandq_u8(q1, vcltq_u8(vabdq_u8(q9,     q11), vdupq_n_u8(beta)));
+    q3 = vandq_u8(q1, vcltq_u8(vabdq_u8(q14,    q12), vdupq_n_u8(beta)));
+    q4 = vhaddq_u8(q9,  q10);
+    q5 = vhaddq_u8(q11, q12);
+    q6 = vsubq_u8(vrhaddq_u8(q9,  q10), q4);
+    q7 = vsubq_u8(vrhaddq_u8(q11, q12), q5);
+    q6 = vhaddq_u8(q6, q7);
+    q7 = vrhaddq_u8(q4, q8);
+    q4 = vhaddq_u8(q4, q8);
+    q7 = vsubq_u8(q7, q4);
+    q6 = vaddq_u8(q6, q7);
+    q7 = vrhaddq_u8(q5, q9);
+    q5 = vhaddq_u8(q5, q9);
+    q7 = vsubq_u8(q7, q5);
+    q6 = vhaddq_u8(q6, q7);
+
+    q7 = vrhaddq_u8(q4, q5);
+    q4 = vhaddq_u8(q4, q5);
+    q7 = vsubq_u8(q7, q4);
+    q6 = vrhaddq_u8(q6, q7);
+    q4 = vaddq_u8(q4, q6);
+    vspill0 =  vbslq_u8(q0, q4, q9);   // VMOV        q6,     q9   VBIT        q6,     q4,     q0
+
+    q4 = vhaddq_u8(q14, q13);
+    q5 = vhaddq_u8(q12, q11);
+    q6 = vsubq_u8(vrhaddq_u8(q14, q13), q4);
+    q7 = vsubq_u8(vrhaddq_u8(q12, q11), q5);
+    q6 = vhaddq_u8(q6, q7);
+    q7 = vrhaddq_u8(q4, q15);
+    q4 = vhaddq_u8(q4, q15);
+    q7 = vsubq_u8(q7, q4);
+    q6 = vaddq_u8(q6, q7);
+    q7 = vrhaddq_u8(q5, q14);
+    q5 = vhaddq_u8(q5, q14);
+    q7 = vsubq_u8(q7, q5);
+    q6 = vhaddq_u8(q6, q7);
+
+    q7 = vrhaddq_u8(q4, q5);
+    q4 = vhaddq_u8(q4, q5);
+    q7 = vsubq_u8(q7, q4);
+    q6 = vrhaddq_u8(q6, q7);
+    q4 = vaddq_u8(q4, q6);
+    vspill1 =  vbslq_u8(q3, q4, q14);   //     VMOV        q6,     q14    VBIT        q6,     q4,     q3
+
+    q1 = vhaddq_u8 (q9,  q13);
+    q4 = vrhaddq_u8(q1,  q10);
+    q5 = vrhaddq_u8(q11, q12);
+    q6 = vhaddq_u8 (q1,  q10);
+    q7 = vhaddq_u8 (q11, q12);
+    q4 = vhaddq_u8 (q4,  q5);
+    q6 = vrhaddq_u8(q6,  q7);
+    q1 = vrhaddq_u8(q4,  q6);
+    q4 = vrhaddq_u8(q9,  q10);
+    q5 = vrhaddq_u8(q11, q12);
+    q6 = vhaddq_u8 (q9,  q10);
+    q7 = vhaddq_u8 (q11, q12);
+    q4 = vhaddq_u8 (q4,  q5);
+    q6 = vrhaddq_u8(q6,  q7);
+    q4 = vrhaddq_u8(q4,  q6);
+    q5 = vhaddq_u8 (q11, q13);
+    q5 = vrhaddq_u8(q5,  q10);
+
+    q1 = vbslq_u8(q0, q1, q5); //VBIF        q1,     q5,     q0
+    q0 = vbslq_u8(q0, q4, q10);//VBSL        q0,     q4,     q10
+
+    q7 = vhaddq_u8 (q14, q10);
+    q4 = vrhaddq_u8(q7,  q13);
+    q5 = vrhaddq_u8(q11, q12);
+    q6 = vhaddq_u8 (q7,  q13);
+    q7 = vhaddq_u8 (q11, q12);
+    q4 = vhaddq_u8 (q4,  q5 );
+    q6 = vrhaddq_u8(q6,  q7 );
+    q4 = vrhaddq_u8(q4,  q6 );
+    q6 = vrhaddq_u8(q14, q13);
+    q5 = vrhaddq_u8(q11, q12);
+    q5 = vhaddq_u8 (q6,  q5 );
+    q6 = vhaddq_u8 (q14, q13);
+    q7 = vhaddq_u8 (q11, q12);
+    q6 = vrhaddq_u8(q6,  q7 );
+    q5 = vrhaddq_u8(q5,  q6 );
+    q6 = vhaddq_u8 (q12, q10);
+    q6 = vrhaddq_u8(q6,  q13);
+
+    q4 = vbslq_u8(q3, q4, q6); //    VBIF        q4,     q6,     q3    ;q0
+    q3 = vbslq_u8(q3, q5, q13);//    VBSL        q3,     q5,     q13   ;q1
+
+    q10 = vbslq_u8(q2, q0, q10);
+    q11 = vbslq_u8(q2, q1, q11);
+    q12 = vbslq_u8(q2, q4, q12);
+    q13 = vbslq_u8(q2, q3, q13);
+
+    q9 = vspill0;
+    q14 = vspill1;
+
+    TR8(q8,   q9 );
+    TR8(q10,  q11);
+    TR8(q12,  q13);
+    TR8(q14,  q15);
+    TR16(q8,  q10);
+    TR16(q9,  q11);
+    TR16(q12, q14);
+    TR16(q13, q15);
+    TR32(q8,  q12);
+    TR32(q9,  q13);
+    TR32(q10, q14);
+    TR32(q11, q15);
+
+    pix -= 8*stride + 4;
+    vst1_u8(pix, vget_low_u8(q8)); pix += stride;
+    vst1_u8(pix, vget_low_u8(q9)); pix += stride;
+    vst1_u8(pix, vget_low_u8(q10)); pix += stride;
+    vst1_u8(pix, vget_low_u8(q11)); pix += stride;
+    vst1_u8(pix, vget_low_u8(q12)); pix += stride;
+    vst1_u8(pix, vget_low_u8(q13)); pix += stride;
+    vst1_u8(pix, vget_low_u8(q14)); pix += stride;
+    vst1_u8(pix, vget_low_u8(q15)); pix += stride;
+
+    vst1_u8(pix, vget_high_u8(q8)); pix += stride;
+    vst1_u8(pix, vget_high_u8(q9)); pix += stride;
+    vst1_u8(pix, vget_high_u8(q10)); pix += stride;
+    vst1_u8(pix, vget_high_u8(q11)); pix += stride;
+    vst1_u8(pix, vget_high_u8(q12)); pix += stride;
+    vst1_u8(pix, vget_high_u8(q13)); pix += stride;
+    vst1_u8(pix, vget_high_u8(q14)); pix += stride;
+    vst1_u8(pix, vget_high_u8(q15)); pix += stride;
+}
+
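+// Normal-strength (bS < 4) deblocking of a horizontal luma edge.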
+static void deblock_luma_h_neon(uint8_t *pix, int stride, int alpha, int beta, const uint8_t *pthr, const uint8_t *pstr)
+{
+    uint8x16_t q0, q1, q2, q3, q4, q5, q6, q7, q9, q10, q11, q12, q13, q14;
+    uint8x16_t tmp;
+
+    q9  = vld1q_u8(pix - 3*stride);
+    q10 = vld1q_u8(pix - 2*stride);
+    q11 = vld1q_u8(pix - 1*stride);
+    q12 = vld1q_u8(pix);
+    q13 = vld1q_u8(pix + 1*stride);
+    q14 = vld1q_u8(pix + 2*stride);
+
+    q1  = vabdq_u8(q11, q12);
+    q2  = vcltq_u8(q1, vdupq_n_u8(alpha));
+    q1  = vcltq_u8(vmaxq_u8(vabdq_u8(q11, q10), vabdq_u8(q12, q13)), vdupq_n_u8(beta));
+    q2  = vandq_u8(q2, q1);
+
+    tmp = vreinterpretq_u8_u32(vdupq_n_u32(*(uint32_t*)pstr));
+    tmp = vzipq_u8(tmp, tmp).val[0];
+    tmp = vzipq_u8(tmp, tmp).val[0];
+    q1  = tmp;
+
+    q1  = vcgtq_s8(vreinterpretq_s8_u8(q1), vdupq_n_s8(0));
+    q2  = vandq_u8(q2, q1);
+    q7 = vhsubq_u8(q10, q13);
+    q7 = vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_u8(q7), 1));
+    q0 = veorq_u8(q12, q11);
+    q6 = vandq_u8(vdupq_n_u8(1), q0);
+
+    q0 = vhsubq_u8(q12, q11); // (q0 - p0) >> 1
+
+    q7 = vreinterpretq_u8_s8(vrhaddq_s8(vreinterpretq_s8_u8(q7), vreinterpretq_s8_u8(q6))); //((p1-q1)>>2 + carry + 1) >> 1
+    q7 = vreinterpretq_u8_s8(vqaddq_s8(vreinterpretq_s8_u8(q0),  vreinterpretq_s8_u8(q7))); //=delta = (((q0-p0)<<2) + (p1-q1) + 4) >> 3;
+    q7 = vandq_u8(q7, q2);
+
+    tmp = vreinterpretq_u8_u32(vdupq_n_u32(*(uint32_t*)pthr));
+    tmp = vzipq_u8(tmp, tmp).val[0];
+    tmp = vzipq_u8(tmp, tmp).val[0];
+    q1  = tmp;
+
+    q1  = vandq_u8(q2, q1);
+
+    q0 = vabdq_u8(q9,  q11); // ap = ABS(p2 - p0);
+    q0 = vcltq_u8(q0,  vdupq_n_u8(beta)); //sp = (ap - beta) >> 31;
+    q4 = vandq_u8(q0,  q2); // & sp
+    q0 = vabdq_u8(q14, q12);//aq = ABS(q2 - q0);
+    q0 = vcltq_u8(q0,  vdupq_n_u8(beta));//sq = (aq - beta) >> 31;
+    q3 = vandq_u8(q0,  q2); // & sq
+
+    q0  = vrhaddq_u8(q11, q12);//((p0+q0+1)>>1)
+    q0  = vhaddq_u8 (q0,  q9 );//((p2 + ((p0+q0+1)>>1))>>1)
+    q5  = vandq_u8  (q1,  q4 );
+    q6  = vqaddq_u8 (q10, q5 );//{p1+thr}
+    q0  = vminq_u8  (q0,  q6 );
+    q6  = vqsubq_u8 (q10, q5 );//{p1-thr}
+    q10 = vmaxq_u8  (q0,  q6 );
+
+    q0   = vrhaddq_u8(q11, q12);// ;((p0+q0+1)>>1)
+    q0   = vhaddq_u8 (q0,  q14);// ;((q2 + ((p0+q0+1)>>1))>>1)
+    q5   = vandq_u8  (q1,  q3 );
+    q6   = vqaddq_u8 (q13, q5 );// ;{q1+thr}
+    q0   = vminq_u8  (q0,  q6 );
+    q6   = vqsubq_u8 (q13, q5 );// ;{q1-thr}
+    q13  = vmaxq_u8  (q0,  q6 );
+
+    q1  = vreinterpretq_u8_s8(vsubq_s8(vreinterpretq_s8_u8(q1), vreinterpretq_s8_u8(q3)));
+    q1  = vreinterpretq_u8_s8(vsubq_s8(vreinterpretq_s8_u8(q1), vreinterpretq_s8_u8(q4))); //tC = thr - sp - sq;
+    q1  = vandq_u8(q1, q2);// ; set thr = 0 if str==0
+
+    q6  = veorq_u8(q6, q6);
+    q5  = vreinterpretq_u8_s8(vmaxq_s8(vreinterpretq_s8_u8(q6), vreinterpretq_s8_u8(q7))); //delta > 0
+    q7  = vreinterpretq_u8_s8(vsubq_s8(vreinterpretq_s8_u8(q6), vreinterpretq_s8_u8(q7)));
+    q6  = vreinterpretq_u8_s8(vmaxq_s8(vreinterpretq_s8_u8(q6), vreinterpretq_s8_u8(q7))); //-(delta < 0)
+    q5  =  vminq_u8(q1, q5);
+    q6  =  vminq_u8(q1, q6);
+
+    q11 = vqaddq_u8(q11, q5);
+    q11 = vqsubq_u8(q11, q6);
+    q12 = vqsubq_u8(q12, q5);
+    q12 = vqaddq_u8(q12, q6);
+
+    vst1q_u8(pix - 2*stride, q10);
+    vst1q_u8(pix - 1*stride, q11);
+    vst1q_u8(pix           , q12);
+    vst1q_u8(pix + 1*stride, q13);
+}
+
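+// Chroma deblocking of a vertical edge: 8 rows of 2+2 pixels around the edge
+// are gathered via transposes, filtered, and scattered back.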
+static void deblock_chroma_v_neon(uint8_t *pix, int32_t stride, int a, int b, const uint8_t *thr, const uint8_t *str)
+{
+    int32x2_t d16 = vld1_s32((int32_t*)(pix - 2 + 0*stride));
+    int32x2_t d18 = vld1_s32((int32_t*)(pix - 2 + 1*stride));
+    int32x2_t d20 = vld1_s32((int32_t*)(pix - 2 + 2*stride));
+    int32x2_t d22 = vld1_s32((int32_t*)(pix - 2 + 3*stride));
+    int32x2_t d17 = vld1_s32((int32_t*)(pix - 2 + 4*stride));
+    int32x2_t d19 = vld1_s32((int32_t*)(pix - 2 + 5*stride));
+    int32x2_t d21 = vld1_s32((int32_t*)(pix - 2 + 6*stride));
+    int32x2_t d23 = vld1_s32((int32_t*)(pix - 2 + 7*stride));
+    int32x2x2_t tr0 = vtrn_s32(d16, d17);
+    int32x2x2_t tr1 = vtrn_s32(d18, d19);
+    int32x2x2_t tr2 = vtrn_s32(d20, d21);
+    int32x2x2_t tr3 = vtrn_s32(d22, d23);
+    int16x8x2_t tr4 = vtrnq_s16(vreinterpretq_s16_s32(vcombine_s32(tr0.val[0], tr0.val[1])), vreinterpretq_s16_s32(vcombine_s32(tr2.val[0], tr2.val[1])));
+    int16x8x2_t tr5 = vtrnq_s16(vreinterpretq_s16_s32(vcombine_s32(tr1.val[0], tr1.val[1])), vreinterpretq_s16_s32(vcombine_s32(tr3.val[0], tr3.val[1])));
+    uint8x16x2_t tr6 = vtrnq_u8(vreinterpretq_u8_s16(tr4.val[0]), vreinterpretq_u8_s16(tr5.val[0]));
+    uint8x16x2_t tr7 = vtrnq_u8(vreinterpretq_u8_s16(tr4.val[1]), vreinterpretq_u8_s16(tr5.val[1]));
+
+{
+    uint8x16_t q8  = tr6.val[0];
+    uint8x16_t q9  = tr6.val[1];
+    uint8x16_t q10 = tr7.val[0];
+    uint8x16_t q11 = tr7.val[1];
+
+    uint8x16_t q1  = vabdq_u8(q9, q10);
+    uint8x16_t q2  = vcltq_u8(q1, vdupq_n_u8(a));
+    uint8x16_t q4  = vmaxq_u8(vabdq_u8(q10, q11), vabdq_u8(q8, q9));
+    uint8x16_t q0;
+    uint8x16_t q3;
+    uint8x16_t q6;
+     int8x16_t q4s;
+     int8x16_t q7;
+    uint8x16_t q7u;
+    uint8x16_t q5;
+    uint8x16_t vstr = vld1q_u8(str);
+    uint8x16_t vthr = vld1q_u8(thr);
+
+    q4 = vcltq_u8(q4, vdupq_n_u8(b));
+    q2 = vandq_u8(q2, q4);
+    q1 = vzipq_u8(vstr, vstr).val[0];
+    q3 = vcgtq_s8(vreinterpretq_s8_u8(q1), vdupq_n_s8(0));
+    q1 = vshrq_n_u8(q1, 2);
+    q1 = vcgtq_s8(vreinterpretq_s8_u8(q1), vdupq_n_s8(0));
+    q2 = vandq_u8(q2, q3);
+
+    q0 = vzipq_u8(vthr, vthr).val[0];
+    q0 = vaddq_u8(q0, vdupq_n_u8(1));
+    q0 = vandq_u8(q0, q2);
+
+    q7 = vshrq_n_s8(vreinterpretq_s8_u8(vhsubq_u8(q8, q11)), 1);
+    q6 = vandq_u8(vdupq_n_u8(1), veorq_u8(q10, q9));
+    q4 = vhsubq_u8(q10, q9);
+    q7 = vrhaddq_s8(q7, vreinterpretq_s8_u8(q6));
+    q7 = vqaddq_s8(vreinterpretq_s8_u8(q4), q7);
+
+    q4s = vdupq_n_s8(0);
+    q5 = vreinterpretq_u8_s8(vmaxq_s8(q4s,               q7));
+    q4 = vreinterpretq_u8_s8(vmaxq_s8(q4s, vsubq_s8(q4s, q7)));
+    q5 = vminq_u8(q0, q5);
+    q4 = vminq_u8(q0, q4);
+
+    q0 = vqaddq_u8(q9,  q5);
+    q0 = vqsubq_u8(q0,  q4);
+    q3 = vqsubq_u8(q10, q5);
+    q3 = vqaddq_u8(q3,  q4);
+
+    q6  = vrhaddq_u8(vhaddq_u8(q9, q11), q8);
+    q7u = vrhaddq_u8(vhaddq_u8(q8, q10), q11);
+
+    q0 = vbslq_u8(q1,  q6, q0 );
+    q3 = vbslq_u8(q1, q7u, q3 );
+    q9 = vbslq_u8(q2,  q0, q9 );
+    q10= vbslq_u8(q2,  q3, q10);
+
+    tr6 = vtrnq_u8(q8,  q9);
+    tr7 = vtrnq_u8(q10, q11);
+
+    tr4 = vtrnq_s16(vreinterpretq_s16_u8(tr6.val[0]), vreinterpretq_s16_u8(tr7.val[0]));
+    tr5 = vtrnq_s16(vreinterpretq_s16_u8(tr6.val[1]), vreinterpretq_s16_u8(tr7.val[1]));
+
+    tr0 = vtrn_s32(vget_low_s32(vreinterpretq_s32_s16(tr4.val[0])), vget_high_s32(vreinterpretq_s32_s16(tr4.val[0])));
+    tr1 = vtrn_s32(vget_low_s32(vreinterpretq_s32_s16(tr5.val[0])), vget_high_s32(vreinterpretq_s32_s16(tr5.val[0])));
+    tr2 = vtrn_s32(vget_low_s32(vreinterpretq_s32_s16(tr4.val[1])), vget_high_s32(vreinterpretq_s32_s16(tr4.val[1])));
+    tr3 = vtrn_s32(vget_low_s32(vreinterpretq_s32_s16(tr5.val[1])), vget_high_s32(vreinterpretq_s32_s16(tr5.val[1])));
+
+#if 0
+    // unaligned store fools Android NDK 15 optimizer
+    *(int32_t*)(uint8_t*)(pix - 2 + 0*stride) = vget_lane_s32(tr0.val[0], 0);
+    *(int32_t*)(uint8_t*)(pix - 2 + 1*stride) = vget_lane_s32(tr1.val[0], 0);
+    *(int32_t*)(uint8_t*)(pix - 2 + 2*stride) = vget_lane_s32(tr2.val[0], 0);
+    *(int32_t*)(uint8_t*)(pix - 2 + 3*stride) = vget_lane_s32(tr3.val[0], 0);
+    *(int32_t*)(uint8_t*)(pix - 2 + 4*stride) = vget_lane_s32(tr0.val[1], 0);
+    *(int32_t*)(uint8_t*)(pix - 2 + 5*stride) = vget_lane_s32(tr1.val[1], 0);
+    *(int32_t*)(uint8_t*)(pix - 2 + 6*stride) = vget_lane_s32(tr2.val[1], 0);
+    *(int32_t*)(uint8_t*)(pix - 2 + 7*stride) = vget_lane_s32(tr3.val[1], 0);
+#else
+    vst1_lane_s16((int16_t*)(pix - 2 + 0*stride),     vreinterpret_s16_s32(tr0.val[0]), 0);
+    vst1_lane_s16((int16_t*)(pix - 2 + 0*stride) + 1, vreinterpret_s16_s32(tr0.val[0]), 1);
+    vst1_lane_s16((int16_t*)(pix - 2 + 1*stride),     vreinterpret_s16_s32(tr1.val[0]), 0);
+    vst1_lane_s16((int16_t*)(pix - 2 + 1*stride) + 1, vreinterpret_s16_s32(tr1.val[0]), 1);
+    vst1_lane_s16((int16_t*)(pix - 2 + 2*stride),     vreinterpret_s16_s32(tr2.val[0]), 0);
+    vst1_lane_s16((int16_t*)(pix - 2 + 2*stride) + 1, vreinterpret_s16_s32(tr2.val[0]), 1);
+    vst1_lane_s16((int16_t*)(pix - 2 + 3*stride),     vreinterpret_s16_s32(tr3.val[0]), 0);
+    vst1_lane_s16((int16_t*)(pix - 2 + 3*stride) + 1, vreinterpret_s16_s32(tr3.val[0]), 1);
+    vst1_lane_s16((int16_t*)(pix - 2 + 4*stride),     vreinterpret_s16_s32(tr0.val[1]), 0);
+    vst1_lane_s16((int16_t*)(pix - 2 + 4*stride) + 1, vreinterpret_s16_s32(tr0.val[1]), 1);
+    vst1_lane_s16((int16_t*)(pix - 2 + 5*stride),     vreinterpret_s16_s32(tr1.val[1]), 0);
+    vst1_lane_s16((int16_t*)(pix - 2 + 5*stride) + 1, vreinterpret_s16_s32(tr1.val[1]), 1);
+    vst1_lane_s16((int16_t*)(pix - 2 + 6*stride),     vreinterpret_s16_s32(tr2.val[1]), 0);
+    vst1_lane_s16((int16_t*)(pix - 2 + 6*stride) + 1, vreinterpret_s16_s32(tr2.val[1]), 1);
+    vst1_lane_s16((int16_t*)(pix - 2 + 7*stride),     vreinterpret_s16_s32(tr3.val[1]), 0);
+    vst1_lane_s16((int16_t*)(pix - 2 + 7*stride) + 1, vreinterpret_s16_s32(tr3.val[1]), 1);
+#endif
+}
+}
+
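+// Chroma deblocking of a horizontal edge; only the row above and the row below
+// the edge are modified.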
+static void deblock_chroma_h_neon(uint8_t *pix, int32_t stride, int a, int b, const uint8_t *thr, const uint8_t *str)
+{
+    uint8x16_t q0;
+    uint8x16_t q8  = vld1q_u8(pix - 2*stride);
+    uint8x16_t q9  = vld1q_u8(pix - 1*stride);
+    uint8x16_t q10 = vld1q_u8(pix);
+    uint8x16_t q11 = vld1q_u8(pix + stride);
+    uint8x16_t q1  = vabdq_u8(q9, q10);
+    uint8x16_t q2  = vcltq_u8(q1, vdupq_n_u8(a));
+    uint8x16_t q4  = vmaxq_u8(vabdq_u8(q10, q11), vabdq_u8(q8, q9));
+    uint8x16_t q3;
+    uint8x16_t q6;
+     int8x16_t q4s;
+     int8x16_t q7;
+    uint8x16_t q7u;
+    uint8x16_t q5;
+    uint8x16_t vstr = vld1q_u8(str);
+    uint8x16_t vthr = vld1q_u8(thr);
+
+    q4 = vcltq_u8(q4, vdupq_n_u8(b));
+    q2 = vandq_u8(q2, q4);
+    q1 = vzipq_u8(vstr, vstr).val[0];
+    q3 = vcgtq_s8(vreinterpretq_s8_u8(q1), vdupq_n_s8(0));
+    q1 = vshrq_n_u8(q1, 2);
+    q1 = vcgtq_s8(vreinterpretq_s8_u8(q1), vdupq_n_s8(0));
+    q2 = vandq_u8(q2, q3);
+
+    q0 = vzipq_u8(vthr, vthr).val[0];
+    q0 = vaddq_u8(q0, vdupq_n_u8(1));
+    q0 = vandq_u8(q0, q2);
+
+    q7 = vshrq_n_s8(vreinterpretq_s8_u8(vhsubq_u8(q8, q11)), 1);
+    q6 = vandq_u8(vdupq_n_u8(1), veorq_u8(q10, q9));
+    q4 = vhsubq_u8(q10, q9);
+    q7 = vrhaddq_s8(q7, vreinterpretq_s8_u8(q6));
+    q7 = vqaddq_s8(vreinterpretq_s8_u8(q4), q7);
+
+    q4s = vdupq_n_s8(0);
+    q5 = vreinterpretq_u8_s8(vmaxq_s8(q4s,               q7));
+    q4 = vreinterpretq_u8_s8(vmaxq_s8(q4s, vsubq_s8(q4s, q7)));
+    q5 = vminq_u8(q0, q5);
+    q4 = vminq_u8(q0, q4);
+
+    q0 = vqaddq_u8(q9,  q5);
+    q0 = vqsubq_u8(q0,  q4);
+    q3 = vqsubq_u8(q10, q5);
+    q3 = vqaddq_u8(q3,  q4);
+
+    q6  = vrhaddq_u8(vhaddq_u8(q9, q11), q8);
+    q7u = vrhaddq_u8(vhaddq_u8(q8, q10), q11);
+
+    q0 = vbslq_u8(q1,  q6, q0 );
+    q3 = vbslq_u8(q1, q7u, q3 );
+    q9 = vbslq_u8(q2,  q0, q9 );
+    q10= vbslq_u8(q2,  q3, q10);
+
+    vst1_u8(pix - stride, vget_low_u8(q9));
+    vst1_u8(pix,          vget_low_u8(q10));
+}
+
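+// Deblock one macroblock's chroma: two vertical edges, then two horizontal
+// edges, using the per-edge alpha/beta and strength values from 'par'.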
+static void h264e_deblock_chroma_neon(uint8_t *pix, int32_t stride, const deblock_params_t *par)
+{
+    const uint8_t *alpha = par->alpha;
+    const uint8_t *beta = par->beta;
+    const uint8_t *thr = par->tc0;
+    const uint8_t *strength = (uint8_t *)par->strength32;
+    int a, b, x, y;
+    a = alpha[0];
+    b = beta[0];
+    for (x = 0; x < 16; x += 8)
+    {
+        uint32_t str = *(uint32_t*)&strength[x];
+        if (str && a)
+        {
+            deblock_chroma_v_neon(pix + (x >> 1), stride, a, b, thr + x, strength + x);
+        }
+        a = alpha[1];
+        b = beta[1];
+    }
+    thr += 16;
+    strength += 16;
+    a = alpha[2];
+    b = beta[2];
+    for (y = 0; y < 16; y += 8)
+    {
+        uint32_t str = *(uint32_t*)&strength[y];
+        if (str && a)
+        {
+            deblock_chroma_h_neon(pix, stride, a, b, thr + y, strength + y);
+        }
+        pix += 4*stride;
+        a = alpha[3];
+        b = beta[3];
+    }
+}
+
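+/*
+ * Deblock the luma of one macroblock: four vertical 4-pixel edges, then four
+ * horizontal ones.  Edges with boundary strength 4 (intra edges) use the
+ * strong filter, other nonzero strengths use the normal clipped filter, and
+ * edges with zero strength or zero alpha are skipped.
+ */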
+static void h264e_deblock_luma_neon(uint8_t *pix, int32_t stride, const deblock_params_t *par)
+{
+    const uint8_t *alpha = par->alpha;
+    const uint8_t *beta = par->beta;
+    const uint8_t *thr = par->tc0;
+    const uint8_t *strength = (uint8_t *)par->strength32;
+    int a = alpha[0];
+    int b = beta[0];
+    int x, y;
+    for (x = 0; x < 16; x += 4)
+    {
+        uint32_t str = *(uint32_t*)&strength[x];
+        if ((uint8_t)str == 4)
+        {
+            deblock_luma_v_s4_neon(pix + x, stride, a, b);
+        } else if (str && a)
+        {
+            deblock_luma_v_neon(pix + x, stride, a, b, thr + x, strength + x);
+        }
+        a = alpha[1];
+        b = beta[1];
+    }
+    a = alpha[2];
+    b = beta[2];
+    thr += 16;
+    strength += 16;
+    for (y = 0; y < 16; y += 4)
+    {
+        uint32_t str = *(uint32_t*)&strength[y];
+        if ((uint8_t)str == 4)
+        {
+            deblock_luma_h_s4_neon(pix, stride, a, b);
+        } else if (str && a)
+        {
+            deblock_luma_h_neon(pix, stride, a, b, thr + y, strength + y);
+        }
+        a = alpha[3];
+        b = beta[3];
+        pix += 4*stride;
+    }
+}
+
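+/*
+ * Temporal denoise.  Each interior pixel of the current frame is blended
+ * with the previous frame using a gain derived from the pixel difference
+ * and from the difference of the 4-neighbour sums: small differences pull
+ * the output towards the previous frame, large ones keep the new pixel.
+ * The NEON loop replaces the g_diff_to_gainQ8[] lookup of the scalar tail
+ * with a fixed-point log2-style estimate built from vclsq_s16 and repeated
+ * squaring.  Results are staged one row up in frmprev; the border rows and
+ * columns are copied unfiltered and the staging is undone by the memcpy()s
+ * at the end.
+ */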
+static void h264e_denoise_run_neon(unsigned char *frm, unsigned char *frmprev, int w, int h_arg, int stride_frm, int stride_frmprev)
+{
+    int cloop, h = h_arg;
+    if (w <= 2 || h <= 2)
+    {
+        return;
+    }
+    w -= 2;
+    h -= 2;
+
+    do
+    {
+        unsigned char *pf = frm += stride_frm;
+        unsigned char *pp = frmprev += stride_frmprev;
+        cloop = w;
+        pp[-stride_frmprev] = *pf++;
+        pp++;
+
+        for (;cloop >= 8; cloop -= 8, pf += 8, pp += 8)
+        {
+            uint16x8_t vp0w;
+            uint32x4_t vpr0;
+            uint32x4_t vpr1;
+            uint16x8_t vf0w;
+            int16x8_t vcls, vt, vcl, vgn, vgd;
+            uint16x8_t vg;
+            uint8x8_t vf0 = vld1_u8(pf);
+            uint8x8_t vft = vld1_u8(pf - stride_frm);
+            uint8x8_t vfb = vld1_u8(pf + stride_frm);
+            uint8x8_t vfl = vld1_u8(pf - 1);
+            uint8x8_t vfr = vld1_u8(pf + 1);
+            uint8x8_t vp0 = vld1_u8(pp);
+            uint8x8_t vpt = vld1_u8(pp - stride_frmprev);
+            uint8x8_t vpb = vld1_u8(pp + stride_frmprev);
+            uint8x8_t vpl = vld1_u8(pp - 1);
+            uint8x8_t vpr = vld1_u8(pp + 1);
+            uint16x8_t vd  = vabdl_u8(vf0, vp0);
+            uint16x8_t vfs = vaddw_u8(vaddw_u8(vaddl_u8(vft, vfb), vfl), vfr);
+            uint16x8_t vps = vaddw_u8(vaddw_u8(vaddl_u8(vpt, vpb), vpl), vpr);
+            uint16x8_t vneighbourhood = vshrq_n_u16(vabdq_u16(vfs, vps), 2);
+
+            vt = vaddq_s16(vreinterpretq_s16_u16(vd), vdupq_n_s16(1));
+
+            vt = vqshlq_n_s16(vt, 7);
+            vcls = vclsq_s16(vt);
+            vt = vshlq_s16(vt, vcls);
+            vt = vqdmulhq_s16(vt,vt);                             // 1
+
+            vcl = vclsq_s16(vt);
+            vt = vshlq_s16(vt, vcl);
+            vcls = vaddq_s16(vaddq_s16(vcls, vcls), vcl);
+            vt = vqdmulhq_s16(vt,vt);                             // 2
+            vcl = vclsq_s16(vt);
+            vt = vshlq_s16(vt, vcl);
+            vcls = vaddq_s16(vaddq_s16(vcls, vcls), vcl);
+            vt = vqdmulhq_s16(vt,vt);                             // 3
+            vcl = vclsq_s16(vt);
+            vt = vshlq_s16(vt, vcl);
+            vcls = vaddq_s16(vaddq_s16(vcls, vcls), vcl);
+            vt = vqdmulhq_s16(vt,vt);                             // 4
+            vcl = vclsq_s16(vt);
+            // vt = vshlq_s16(vt, vcl);
+            vcls = vaddq_s16(vaddq_s16(vcls, vcls), vcl);
+
+            vgd = vsubq_s16(vdupq_n_s16(127), vcls);
+
+            // same gain computation as above, applied to the neighbourhood difference
+            vt = vaddq_s16(vreinterpretq_s16_u16(vneighbourhood), vdupq_n_s16(1));
+            
+            vt = vqshlq_n_s16(vt, 7);
+            vcls = vclsq_s16(vt);
+            vt = vshlq_s16(vt, vcls);
+            vt = vqdmulhq_s16(vt,vt);                             // 1
+            vcl = vclsq_s16(vt);
+            vt = vshlq_s16(vt, vcl);
+            vcls = vaddq_s16(vaddq_s16(vcls, vcls), vcl);
+            vt = vqdmulhq_s16(vt,vt);                             // 2
+            vcl = vclsq_s16(vt);
+            vt = vshlq_s16(vt, vcl);
+            vcls = vaddq_s16(vaddq_s16(vcls, vcls), vcl);
+            vt = vqdmulhq_s16(vt,vt);                             // 3
+            vcl = vclsq_s16(vt);
+            vt = vshlq_s16(vt, vcl);
+            vcls = vaddq_s16(vaddq_s16(vcls, vcls), vcl);
+            vt = vqdmulhq_s16(vt,vt);                             // 4
+            vcl = vclsq_s16(vt);
+            // vt = vshlq_s16(vt, vcl);
+            vcls = vaddq_s16(vaddq_s16(vcls, vcls), vcl);
+
+            vgn = vsubq_s16(vdupq_n_s16(127), vcls);
+
+            vgn = vreinterpretq_s16_u16(vshrq_n_u16(vqshlq_n_u16(vreinterpretq_u16_s16(vgn), 10), 8));            // <<=2, saturated
+
+            vgd = vsubq_s16(vdupq_n_s16(255), vgd);
+            vgn = vsubq_s16(vdupq_n_s16(255), vgn);
+
+            //vst1_u8(pp - stride_frmprev, vreinterpret_u8_s8(vmovn_s16(vgn)));
+            //vst1_u8(pp - stride_frmprev, vreinterpret_u8_s8(vmovn_s16(vreinterpretq_s16_u16(vneighbourhood))));
+            //vst1_u8(pp - stride_frmprev, vp0);
+
+            vg  = vmulq_u16(vreinterpretq_u16_s16(vgn), vreinterpretq_u16_s16(vgd));
+
+            vp0w = vmovl_u8(vp0);
+            vpr0 = vmull_u16(vget_low_u16(vp0w), vget_low_u16(vg));
+            vpr1 = vmull_u16(vget_high_u16(vp0w), vget_high_u16(vg));
+            vg = vreinterpretq_u16_s16(vsubq_s16(vreinterpretq_s16_u8(vdupq_n_u8(255)), vreinterpretq_s16_u16(vg)));
+
+            vf0w = vmovl_u8(vf0);
+            vpr0 = vmlal_u16(vpr0, vget_low_u16(vf0w), vget_low_u16(vg));
+            vpr1 = vmlal_u16(vpr1, vget_high_u16(vf0w), vget_high_u16(vg));
+
+            vst1_u8(pp - stride_frmprev, vmovn_u16(vcombine_u16(vrshrn_n_u32(vpr0, 16), vrshrn_n_u32(vpr1, 16))));
+        }                    
+
+        while (cloop--)
+        {
+            int d, neighbourhood;
+            unsigned g, gd, gn, out_val;
+            d = pf[0] - pp[0];
+            neighbourhood  = pf[-1] - pp[-1];
+            neighbourhood += pf[+1] - pp[+1];
+            neighbourhood += pf[-stride_frm] - pp[-stride_frmprev];
+            neighbourhood += pf[+stride_frm] - pp[+stride_frmprev];
+
+            if (d < 0) 
+            {
+                d = -d;
+            }
+            if (neighbourhood < 0) 
+            {
+                neighbourhood = -neighbourhood;
+            }
+            neighbourhood >>= 2;
+
+            gd = g_diff_to_gainQ8[d];
+            gn = g_diff_to_gainQ8[neighbourhood];
+
+            gn <<= 2;
+            if (gn > 255) 
+            {
+                gn = 255;
+            }
+
+            gn = 255 - gn;
+            gd = 255 - gd;
+            g = gn*gd;  // Q8*Q8 = Q16;
+
+            //out_val = ((pp[0]*g ) >> 16) + (((0xffff-g)*pf[0] ) >> 16);
+            //out_val = ((pp[0]*g + (1<<15)) >> 16) + (((0xffff-g)*pf[0]  + (1<<15)) >> 16);
+            out_val = (pp[0]*g + (0xffff - g)*pf[0]  + (1 << 15)) >> 16;
+            
+            assert(out_val <= 255);
+            
+            pp[-stride_frmprev] = (unsigned char)out_val;
+            //pp[-stride_frmprev] = gn;
+            //pp[-stride_frmprev] = neighbourhood;
+            //pp[-stride_frmprev] = pp[0];
+
+            pf++, pp++;
+        } 
+
+        pp[-stride_frmprev] = *pf++;
+    } while(--h);
+
+    memcpy(frmprev + stride_frmprev, frm + stride_frm, w + 2);
+    h = h_arg - 2;
+    do
+    {
+        memcpy(frmprev, frmprev - stride_frmprev, w + 2);
+        frmprev -= stride_frmprev;
+    } while(--h);
+    memcpy(frmprev, frm - stride_frm*(h_arg - 2), w + 2);
+}
+
+#undef IS_NULL
+#define IS_NULL(p) ((p) < (pix_t *)32)
+
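+/*
+ * DC prediction for a 4x4 block: the rounded mean of whichever of the four
+ * left and four top neighbours are present, or 128 when neither side is
+ * available.  The value is replicated into all four bytes of the result so
+ * a row can be stored with a single 32-bit write.
+ */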
+static uint32_t intra_predict_dc4_neon(const pix_t *left, const pix_t *top)
+{
+    unsigned dc = 0, side = 4, round = 0;
+    uint32x2_t s = vdup_n_u32(0);
+
+    if (!IS_NULL(left))
+    {
+        s = vpaddl_u16(vpaddl_u8(vld1_u8(left)));
+        round += side >> 1;
+    }
+    if (!IS_NULL(top))
+    {
+        s = vadd_u32(s, vpaddl_u16(vpaddl_u8(vld1_u8(top))));
+        round += side >> 1;
+    }
+    dc = vget_lane_u32(s, 0);
+
+    dc += round;
+    if (round == side) dc >>= 1;
+    dc >>= 2;
+    if (!round) dc = 128;
+    return dc * 0x01010101;
+}
+
+static uint8x16_t intra_predict_dc16_neon(const pix_t *left, const pix_t *top)
+{
+    unsigned dc = 0, side = 16, round = 0;
+
+    if (!IS_NULL(left))
+    {
+        uint8x16_t v = vld1q_u8(left);
+        uint64x2_t s = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v)));
+        uint64x1_t q = vadd_u64(vget_high_u64(s), vget_low_u64(s));
+        dc += vget_lane_u32(vreinterpret_u32_u64(q), 0);
+        round += side >> 1;
+    }
+    if (!IS_NULL(top))
+    {
+        uint8x16_t v = vld1q_u8(top);
+        uint64x2_t s = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v)));
+        uint64x1_t q = vadd_u64(vget_high_u64(s), vget_low_u64(s));
+        dc += vget_lane_u32(vreinterpret_u32_u64(q), 0);
+        round += side >> 1;
+    }
+    dc += round;
+    if (round == side) dc >>= 1;
+    dc >>= 4;
+    if (!round) dc = 128;
+    return vdupq_n_u8(dc);
+}
+
+/*
+ * Note: To make the code more readable we refer to the neighboring pixels
+ * in variables named as below:
+ *
+ *    UL U0 U1 U2 U3 U4 U5 U6 U7
+ *    L0 xx xx xx xx
+ *    L1 xx xx xx xx
+ *    L2 xx xx xx xx
+ *    L3 xx xx xx xx
+ */
+#define UL edge[-1] 
+#define U0 edge[0] 
+#define T1 edge[1] 
+#define U2 edge[2] 
+#define U3 edge[3] 
+#define U4 edge[4] 
+#define U5 edge[5] 
+#define U6 edge[6] 
+#define U7 edge[7] 
+#define L0 edge[-2]
+#define L1 edge[-3]
+#define L2 edge[-4]
+#define L3 edge[-5]
+
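+/*
+ * 16x16 intra prediction: mode 0 replicates the top row (vertical), mode 1
+ * replicates the left column (horizontal), anything else fills the block
+ * with the DC value.
+ */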
+static void h264e_intra_predict_16x16_neon(pix_t *predict, const pix_t *left, const pix_t *top, int mode)
+{
+    int cloop = 4;
+    uint32_t *d = (uint32_t*)predict;
+    uint32x4_t v;
+    assert(IS_ALIGNED(predict, 4));
+    assert(IS_ALIGNED(top, 4));
+    if (mode != 1)
+    {
+        if (mode < 1)
+        {
+            v = vld1q_u32((uint32_t*)top);
+        } else //(mode == 2)
+        {
+            v = vreinterpretq_u32_u8(intra_predict_dc16_neon(left, top));
+        }
+        do
+        {
+            vst1q_u32(d, v); d += 4;
+            vst1q_u32(d, v); d += 4;
+            vst1q_u32(d, v); d += 4;
+            vst1q_u32(d, v); d += 4;
+        } while (--cloop);
+    } else //if (mode == 1)
+    {
+        do
+        {
+            vst1q_u8((uint8_t*)d, vdupq_n_u8(*left++)); d += 4;
+            vst1q_u8((uint8_t*)d, vdupq_n_u8(*left++)); d += 4;
+            vst1q_u8((uint8_t*)d, vdupq_n_u8(*left++)); d += 4;
+            vst1q_u8((uint8_t*)d, vdupq_n_u8(*left++)); d += 4;
+        } while (--cloop);
+    }
+}
+
+static void h264e_intra_predict_chroma_neon(pix_t *predict, const pix_t *left, const pix_t *top, int mode)
+{
+    int cloop = 8;
+    uint32_t *d = (uint32_t*)predict;
+    uint32x4_t v;
+    assert(IS_ALIGNED(predict, 4));
+    assert(IS_ALIGNED(top, 4));
+    if (mode < 1)
+    {
+        v = vld1q_u32((uint32_t*)top);
+        vst1q_u32(d, v); d += 4;
+        vst1q_u32(d, v); d += 4;
+        vst1q_u32(d, v); d += 4;
+        vst1q_u32(d, v); d += 4;
+        vst1q_u32(d, v); d += 4;
+        vst1q_u32(d, v); d += 4;
+        vst1q_u32(d, v); d += 4;
+        vst1q_u32(d, v); d += 4;
+    } else if (mode == 1)
+    {
+        do 
+        {
+            v = vreinterpretq_u32_u8(vcombine_u8(vdup_n_u8(left[0]), vdup_n_u8(left[8])));
+            vst1q_u32(d, v); d += 4;
+            left++;
+        } while(--cloop);
+    } else //if (mode == 2)
+    {
+        int ccloop = 2;
+        cloop = 2;
+        do
+        {
+            d[0] = d[1] = d[16] = intra_predict_dc4_neon(left, top);
+            d[17] = intra_predict_dc4_neon(left + 4, top + 4);
+            if (!IS_NULL(top))
+            {
+                d[1] = intra_predict_dc4_neon(NULL, top + 4);
+            }
+            if (!IS_NULL(left))
+            {
+                d[16] = intra_predict_dc4_neon(NULL, left + 4);
+            }
+            d += 2;
+            left += 8;
+            top += 8;
+        } while(--cloop);
+
+        do
+        {
+            v = vld1q_u32(d - 4);
+            vst1q_u32(d, v); d += 4;
+            vst1q_u32(d, v); d += 4;
+            vst1q_u32(d, v); d += 4;
+            d += 4;
+        } while(--ccloop);
+    }
+}
+
+static __inline int vsad_neon(uint8x16_t a, uint8x16_t b)
+{
+    uint64x2_t s = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vabdq_u8(a, b))));
+    uint64x1_t q = vadd_u64(vget_high_u64(s), vget_low_u64(s));
+    return vget_lane_u32(vreinterpret_u32_u64(q), 0);
+}
+
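+/*
+ * Try the 4x4 intra prediction modes allowed by 'avail' (DC always, the
+ * directional modes only when the required top/left/top-left neighbours
+ * exist) and keep the one with the smallest SAD against the source block;
+ * modes other than the most probable one (mpred) pay 'penalty'.  The winning
+ * prediction is written to blockpred and the return value packs the mode in
+ * the low 4 bits with the SAD above it.
+ */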
+static int h264e_intra_choose_4x4_neon(const pix_t *blockin, pix_t *blockpred, int avail, const pix_t *edge, int mpred, int penalty)
+{
+    int sad, best_sad, best_m = 2;
+
+    uint32_t r0, r1, r2, r3;
+    uint8x16_t vx, vt, vr, vpred, v1, v2, v8, v9, q1, q2, q10, q11, q12;
+    uint8x8_t d2, d3;
+
+    r0 = ((uint32_t *)blockin)[ 0];
+    r1 = ((uint32_t *)blockin)[ 4];
+    r2 = ((uint32_t *)blockin)[ 8];
+    r3 = ((uint32_t *)blockin)[12];
+    vr = vcombine_u8(vcreate_u8(((uint64_t)r1 << 32) | r0), vcreate_u8(((uint64_t)r3 << 32) | r2));
+
+#define VTEST(mode) sad = vsad_neon(vr,vx);    \
+            if (mode != mpred) sad += penalty; \
+            if (sad < best_sad)                \
+            {                                  \
+                vpred = vx;                    \
+                best_sad = sad;                \
+                best_m = mode;                 \
+            }
+
+    // DC
+    vx = vdupq_n_u8(intra_predict_dc4_neon((avail & AVAIL_L) ? &L3 : 0, (avail & AVAIL_T) ? &U0 : 0));
+
+    best_sad = vsad_neon(vx, vr);
+    if (2 != mpred) 
+    {   
+        best_sad += penalty;
+    }
+    vpred = vx;
+
+    vt = vld1q_u8(&L3);
+    vt = vreinterpretq_u8_u32(vsetq_lane_u32(U7*0x01010101, vreinterpretq_u32_u8(vt), 3));
+    if (avail & AVAIL_T)
+    {
+        uint32x2_t t2;
+        if (!(avail & AVAIL_TR))
+        {
+            vt = vcombine_u8(vget_low_u8(vt), vdup_n_u8(U3));
+        }
+
+        vx =  vreinterpretq_u8_u32(vdupq_n_u32(*(uint32_t*)&U0));
+        VTEST(0);
+
+        vx = vt;
+        vx = vrhaddq_u8(vhaddq_u8(vextq_u8(vx, vx, 5), vextq_u8(vx, vx, 7)), vextq_u8(vx, vx, 6));
+
+        v1 = vextq_u8(vx, vx, 1);
+        d2 = vext_u8(vget_low_u8(vx), vget_low_u8(vx), 2);
+        d3 = vext_u8(vget_low_u8(vx), vget_low_u8(vx), 3);
+        vx = vreinterpretq_u8_u32(vcombine_u32(
+            t2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(vx)), vreinterpret_u32_u8(vget_low_u8(v1))).val[0], 
+            vzip_u32(vreinterpret_u32_u8(d2), vreinterpret_u32_u8(d3)).val[0]));
+        VTEST(3);
+
+        vx = vt;
+        vx = vrhaddq_u8(vextq_u8(vt, vt, 5), vextq_u8(vt, vt, 6));
+        vx = vreinterpretq_u8_u32(vzipq_u32(vreinterpretq_u32_u8(vx), vreinterpretq_u32_u8(vextq_u8(vx, vx, 1))).val[0]);
+        vx = vreinterpretq_u8_u32(vzipq_u32(vreinterpretq_u32_u8(vx), 
+        vreinterpretq_u32_u8(vcombine_u8(vreinterpret_u8_u32(t2), vget_high_u8(vextq_u8(vt, vt, 7))))).val[0]);
+
+        VTEST(7);
+    }
+
+    if (avail & AVAIL_L)
+    {
+        vx = vrev32q_u8(vt);
+        vx = vzipq_u8(vx, vx).val[0];
+        vx = vzipq_u8(vx, vx).val[0];
+        VTEST(1);
+
+        v2 = vrev32q_u8(vt);
+        v8 = vrev32q_u8(vt);
+        vx = vrev32q_u8(vt);
+        v8 = vzipq_u8(vx, vx).val[0];
+        {
+            int tmp = vgetq_lane_u16(vreinterpretq_u16_u8(v8), 3);
+            v2 = vreinterpretq_u8_u16(vsetq_lane_u16(tmp, vreinterpretq_u16_u8(v2), 2));
+            v8 = vreinterpretq_u8_u16(vsetq_lane_u16(tmp, vreinterpretq_u16_u8(v8), 4));
+            v9 = vextq_u8(v2, v2, 14);
+            v9 = vzipq_u8(v9, vhaddq_u8(v9, v2)).val[0]; 
+            v9 = vrhaddq_u8(v9, vextq_u8(v8, v8, 14));
+            tmp |= tmp << 16;
+            vx = vreinterpretq_u8_u32(vzipq_u32(vreinterpretq_u32_u8(vextq_u8(v9, v9, 4)),
+                                                vreinterpretq_u32_u8(vextq_u8(v9, v9, 6))).val[0]);
+            vx = vreinterpretq_u8_u32(vsetq_lane_u32(tmp, vreinterpretq_u32_u8(vx), 3));
+        }
+        VTEST(8);
+    }
+
+    if ((avail & (AVAIL_T | AVAIL_L | AVAIL_TL)) == (AVAIL_T | AVAIL_L | AVAIL_TL))
+    {
+        uint32x2x2_t pair;
+        uint8x8_t d4, d6;
+        int lr;
+        q11 = q2 = vrhaddq_u8(vhaddq_u8(vt, vextq_u8(vt, vt, 2)), q10 = vextq_u8(vt, vt, 1));
+        d4 = vget_low_u8(q2);
+        d6 = vreinterpret_u8_u32(vzip_u32(vreinterpret_u32_u8(vext_u8(d4, d4, 3)), vreinterpret_u32_u8(vext_u8(d4, d4, 1))).val[0]);
+        d4 = vreinterpret_u8_u32(vzip_u32(vreinterpret_u32_u8(vext_u8(d4, d4, 2)), vreinterpret_u32_u8(d4)).val[0]);
+        pair = vzip_u32(vreinterpret_u32_u8(d6), vreinterpret_u32_u8(d4));
+        vx = vcombine_u8(vreinterpret_u8_u32(pair.val[0]), vreinterpret_u8_u32(pair.val[1]));
+        VTEST(4);
+
+        vx  = q12 = vrhaddq_u8(vt, q10);
+        q1  = vzipq_u8(vx, q11).val[0];
+        q1  = vreinterpretq_u8_u32(vzipq_u32(vreinterpretq_u32_u8(q1), vreinterpretq_u32_u8(vextq_u8(q1, q1, 2))).val[0]);
+        q1  = vreinterpretq_u8_u32(vrev64q_u32(vreinterpretq_u32_u8(q1)));
+        vx  = vcombine_u8(vget_high_u8(q1), vget_low_u8(q1));
+        vx = vreinterpretq_u8_u16(
+            vsetq_lane_u16(vgetq_lane_u16(vreinterpretq_u16_u8(q11), 2), vreinterpretq_u16_u8(vx), 1));
+        VTEST(6);
+
+        q11 = vextq_u8(q11, q11, 1);
+        q1  = vextq_u8(q12, q12, 4);
+        q2  = vextq_u8(q11, q11, 2);
+        q1  = vreinterpretq_u8_u32(vzipq_u32(vreinterpretq_u32_u8(q1), vreinterpretq_u32_u8(q2)).val[0]);
+        q12 = vreinterpretq_u8_u16(vsetq_lane_u16(lr = vgetq_lane_u16(vreinterpretq_u16_u8(q11), 0), vreinterpretq_u16_u8(q12), 1));
+        q11 = vreinterpretq_u8_u16(vsetq_lane_u16((lr << 8) & 0xffff, vreinterpretq_u16_u8(q11), 0));
+        vx = vcombine_u8(vget_low_u8(q1), vreinterpret_u8_u32(vzip_u32(
+            vreinterpret_u32_u8(vext_u8(vget_low_u8(q12), vget_low_u8(q12), 3)),
+            vreinterpret_u32_u8(vext_u8(vget_low_u8(q11), vget_low_u8(q11), 1))
+            ).val[0]));
+        VTEST(5);
+    }
+
+    vst1q_lane_u32(((uint32_t *)blockpred) + 0, vreinterpretq_u32_u8(vpred ), 0);
+    vst1q_lane_u32(((uint32_t *)blockpred) + 4, vreinterpretq_u32_u8(vpred ), 1);
+    vst1q_lane_u32(((uint32_t *)blockpred) + 8, vreinterpretq_u32_u8(vpred ), 2);
+    vst1q_lane_u32(((uint32_t *)blockpred) +12, vreinterpretq_u32_u8(vpred ), 3);
+    return best_m + (best_sad << 4); // pack result
+}
+
+static void copy_wh_neon(const uint8_t *src, int src_stride, uint8_t *h264e_restrict dst, int w, int h)
+{
+    if (w == 4)
+    {
+        do
+        {
+            *(int32_t*)dst = *(int32_t*)src; dst += 16; src += src_stride;
+            *(int32_t*)dst = *(int32_t*)src; dst += 16; src += src_stride;
+            *(int32_t*)dst = *(int32_t*)src; dst += 16; src += src_stride;
+            *(int32_t*)dst = *(int32_t*)src; dst += 16; src += src_stride;
+        } while (h -= 4);
+    } else if (w == 8)
+    {
+        do
+        {
+            vst1_u8(dst, vld1_u8(src)); dst += 16; src += src_stride;
+            vst1_u8(dst, vld1_u8(src)); dst += 16; src += src_stride;
+            vst1_u8(dst, vld1_u8(src)); dst += 16; src += src_stride;
+            vst1_u8(dst, vld1_u8(src)); dst += 16; src += src_stride;
+        } while (h -= 4);
+    } else
+    {
+        do
+        {
+            uint8x16_t v0, v1, v2, v3;
+            v0 = vld1q_u8(src); src += src_stride;
+            v1 = vld1q_u8(src); src += src_stride;
+            v2 = vld1q_u8(src); src += src_stride;
+            v3 = vld1q_u8(src); src += src_stride;
+
+            vst1q_u8(dst, v0); dst += 16; 
+            vst1q_u8(dst, v1); dst += 16; 
+            vst1q_u8(dst, v2); dst += 16; 
+            vst1q_u8(dst, v3); dst += 16; 
+        } while (h -= 4);
+    }
+}
+
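+/*
+ * Horizontal half-pel interpolation with the H.264 6-tap filter
+ * (1,-5,20,20,-5,1): sums are rounded, divided by 32 and saturated to
+ * 8 bits; output rows are packed with a fixed stride of 16.
+ */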
+static void hpel_lpf_hor_neon(const uint8_t *src, int src_stride, uint8_t *h264e_restrict dst, int w, int h)
+{
+    uint8x8_t c5 = vdup_n_u8(5);
+    uint8x8_t c20 = vshl_n_u8(c5, 2);
+    if (w == 16)
+    {
+        do
+        {
+            uint8x16_t s0 = vld1q_u8(src - 2);
+            uint8x16_t s1 = vld1q_u8(src - 2 + 16);
+            uint8x16_t v0 = s0;
+            uint8x16_t v1 = vextq_u8(s0, s1, 1);
+            uint8x16_t v2 = vextq_u8(s0, s1, 2);
+            uint8x16_t v3 = vextq_u8(s0, s1, 3);
+            uint8x16_t v4 = vextq_u8(s0, s1, 4);
+            uint8x16_t v5 = vextq_u8(s0, s1, 5);
+
+            uint16x8_t q, s = vaddl_u8(vget_low_u8(v0), vget_low_u8(v5));
+            s = vmlsl_u8(s, vget_low_u8(v1), c5);
+            s = vmlsl_u8(s, vget_low_u8(v4), c5);
+            s = vmlal_u8(s, vget_low_u8(v2), c20);
+            s = vmlal_u8(s, vget_low_u8(v3), c20);
+
+            q = vaddl_u8(vget_high_u8(v0), vget_high_u8(v5));
+            q = vmlsl_u8(q, vget_high_u8(v1), c5);
+            q = vmlsl_u8(q, vget_high_u8(v4), c5);
+            q = vmlal_u8(q, vget_high_u8(v2), c20);
+            q = vmlal_u8(q, vget_high_u8(v3), c20);
+
+            vst1q_u8(dst, vcombine_u8(
+                vqrshrun_n_s16(vreinterpretq_s16_u16(s), 5),
+                vqrshrun_n_s16(vreinterpretq_s16_u16(q), 5)));
+
+            dst += 16;
+            src += src_stride;
+        } while (--h);
+    } else
+    {
+        do
+        {
+            uint8x16_t line = vld1q_u8(src - 2);
+            uint8x8_t s0 = vget_low_u8(line);
+            uint8x8_t s1 = vget_high_u8(line);
+            uint8x8_t v0 = s0;
+            uint8x8_t v1 = vext_u8(s0, s1, 1);
+            uint8x8_t v2 = vext_u8(s0, s1, 2);
+            uint8x8_t v3 = vext_u8(s0, s1, 3);
+            uint8x8_t v4 = vext_u8(s0, s1, 4);
+            uint8x8_t v5 = vext_u8(s0, s1, 5);
+
+            uint16x8_t s = vaddl_u8(v0, v5);
+            s = vmlsl_u8(s, v1, c5);
+            s = vmlsl_u8(s, v4, c5);
+            s = vmlal_u8(s, v2, c20);
+            s = vmlal_u8(s, v3, c20);
+
+            vst1_u8(dst, vqrshrun_n_s16(vreinterpretq_s16_u16(s), 5));
+
+            dst += 16;
+            src += src_stride;
+        } while (--h);
+    }
+}
+
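+/*
+ * Same 6-tap horizontal filter, but the unrounded 16-bit sums are kept as
+ * intermediate data for the vertical pass of the diagonal case below.
+ */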
+static void hpel_lpf_hor16_neon(const uint8_t *src, int src_stride, int16_t *h264e_restrict dst, int w, int h)
+{
+    uint8x8_t c5 = vdup_n_u8(5);
+    uint8x8_t c20 = vshl_n_u8(c5, 2);
+    if (w == 16)
+    {
+        do
+        {
+            uint8x16_t s0 = vld1q_u8(src - 2);
+            uint8x16_t s1 = vld1q_u8(src - 2 + 16);
+            uint8x16_t v0 = s0;
+            uint8x16_t v1 = vextq_u8(s0, s1, 1);
+            uint8x16_t v2 = vextq_u8(s0, s1, 2);
+            uint8x16_t v3 = vextq_u8(s0, s1, 3);
+            uint8x16_t v4 = vextq_u8(s0, s1, 4);
+            uint8x16_t v5 = vextq_u8(s0, s1, 5);
+
+            uint16x8_t q, s = vaddl_u8(vget_low_u8(v0), vget_low_u8(v5));
+            s = vmlsl_u8(s, vget_low_u8(v1), c5);
+            s = vmlsl_u8(s, vget_low_u8(v4), c5);
+            s = vmlal_u8(s, vget_low_u8(v2), c20);
+            s = vmlal_u8(s, vget_low_u8(v3), c20);
+
+            q = vaddl_u8(vget_high_u8(v0), vget_high_u8(v5));
+            q = vmlsl_u8(q, vget_high_u8(v1), c5);
+            q = vmlsl_u8(q, vget_high_u8(v4), c5);
+            q = vmlal_u8(q, vget_high_u8(v2), c20);
+            q = vmlal_u8(q, vget_high_u8(v3), c20);
+
+            vst1q_s16(dst, vreinterpretq_s16_u16(s));
+            vst1q_s16(dst + 8, vreinterpretq_s16_u16(q));
+
+            dst += 16;
+            src += src_stride;
+        } while (--h);
+    } else
+    {
+        do
+        {
+            uint8x16_t line = vld1q_u8(src - 2);
+            uint8x8_t s0 = vget_low_u8(line);
+            uint8x8_t s1 = vget_high_u8(line);
+            uint8x8_t v0 = s0;
+            uint8x8_t v1 = vext_u8(s0, s1,  1);
+            uint8x8_t v2 = vext_u8(s0, s1, 2);
+            uint8x8_t v3 = vext_u8(s0, s1, 3);
+            uint8x8_t v4 = vext_u8(s0, s1, 4);
+            uint8x8_t v5 = vext_u8(s0, s1, 5);
+
+            uint16x8_t s = vaddl_u8(v0, v5);
+            s = vmlsl_u8(s, v1, c5);
+            s = vmlsl_u8(s, v4, c5);
+            s = vmlal_u8(s, v2, c20);
+            s = vmlal_u8(s, v3, c20);
+
+            vst1q_s16(dst, vreinterpretq_s16_u16(s));
+
+            dst += 16;
+            src += src_stride;
+        } while (--h);
+    }
+}
+
+static void hpel_lpf_ver_neon(const uint8_t *src, int src_stride, uint8_t *h264e_restrict dst, int w, int h)
+{
+    uint8x8_t c5 = vdup_n_u8(5);
+    uint8x8_t c20 = vshl_n_u8(c5, 2);
+
+    if (w == 16)
+    {
+        uint8x16_t v0 = vld1q_u8(src - 2*src_stride);
+        uint8x16_t v1 = vld1q_u8(src - 1*src_stride);
+        uint8x16_t v2 = vld1q_u8(src);
+        uint8x16_t v3 = vld1q_u8(src + 1*src_stride);
+        uint8x16_t v4 = vld1q_u8(src + 2*src_stride);
+        do
+        {
+            uint8x16_t v5 = vld1q_u8(src + 3*src_stride);
+            uint16x8_t q, s = vaddl_u8(vget_low_u8(v0), vget_low_u8(v5));
+            s = vmlsl_u8(s, vget_low_u8(v1), c5);
+            s = vmlsl_u8(s, vget_low_u8(v4), c5);
+            s = vmlal_u8(s, vget_low_u8(v2), c20);
+            s = vmlal_u8(s, vget_low_u8(v3), c20);
+
+            q = vaddl_u8(vget_high_u8(v0), vget_high_u8(v5));
+            q = vmlsl_u8(q, vget_high_u8(v1), c5);
+            q = vmlsl_u8(q, vget_high_u8(v4), c5);
+            q = vmlal_u8(q, vget_high_u8(v2), c20);
+            q = vmlal_u8(q, vget_high_u8(v3), c20);
+
+            vst1q_u8(dst, vcombine_u8(
+                vqrshrun_n_s16(vreinterpretq_s16_u16(s), 5),
+                vqrshrun_n_s16(vreinterpretq_s16_u16(q), 5)));
+            dst += 16;
+            src += src_stride;
+            v0 = v1;
+            v1 = v2;
+            v2 = v3;
+            v3 = v4;
+            v4 = v5;
+        } while (--h);
+    } else
+    {
+        uint8x8_t v0 = vld1_u8(src - 2*src_stride);
+        uint8x8_t v1 = vld1_u8(src - 1*src_stride);
+        uint8x8_t v2 = vld1_u8(src);
+        uint8x8_t v3 = vld1_u8(src + 1*src_stride);
+        uint8x8_t v4 = vld1_u8(src + 2*src_stride);
+        do
+        {
+            uint8x8_t v5 = vld1_u8(src + 3*src_stride);
+            uint16x8_t s = vaddl_u8(v0, v5);
+            s = vmlsl_u8(s, v1, c5);
+            s = vmlsl_u8(s, v4, c5);
+            s = vmlal_u8(s, v2, c20);
+            s = vmlal_u8(s, v3, c20);
+
+            vst1_u8(dst, vqrshrun_n_s16(vreinterpretq_s16_u16(s), 5));
+            dst += 16;
+            src += src_stride;
+            v0 = v1;
+            v1 = v2;
+            v2 = v3;
+            v3 = v4;
+            v4 = v5;
+        } while (--h);
+    }
+}
+
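+/*
+ * Vertical pass over the 16-bit intermediate rows produced above: the same
+ * 6-tap kernel is applied using shifts instead of multiplies and the result
+ * is rounded back to 8 bits with a final shift by 6, giving the /1024 the
+ * two-pass diagonal filter needs.
+ */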
+static void hpel_lpf_ver16_neon(const int16_t *src, uint8_t *h264e_restrict dst, int w, int h)
+{
+    do
+    {
+        int cloop = h;
+        int16x8_t v0 = vld1q_s16(src);
+        int16x8_t v1 = vld1q_s16(src + 16);
+        int16x8_t v2 = vld1q_s16(src + 16*2);
+        int16x8_t v3 = vld1q_s16(src + 16*3);
+        int16x8_t v4 = vld1q_s16(src + 16*4);
+        do
+        {
+            int16x8_t v5 = vld1q_s16(src+16*5);
+
+            int16x8_t s0 = vaddq_s16(v0, v5);
+            int16x8_t s1 = vaddq_s16(v1, v4);
+            int16x8_t s2 = vaddq_s16(v2, v3);
+
+            int16x8_t vs = vshrq_n_s16(vsubq_s16(s0, s1), 2);
+            int16x8_t vq = vsubq_s16(s2, s1);
+            s0 = vshrq_n_s16(vaddq_s16(vq, vs), 2);
+            s0 = vaddq_s16(s0, s2);
+
+            vst1_u8(dst, vqrshrun_n_s16(s0, 6));
+
+            dst += 16;
+            src += 16;
+            v0 = v1;
+            v1 = v2;
+            v2 = v3;
+            v3 = v4;
+            v4 = v5;
+        } while (--cloop);
+
+        src -= 16*h - 8;
+        dst -= 16*h - 8;
+    } while (w -= 8);
+}
+
+static void hpel_lpf_diag_neon(const uint8_t *src, int src_stride, uint8_t *h264e_restrict dst, int w, int h)
+{
+    ALIGN(16) int16_t scratch[21 * 16] ALIGN2(16);  /* 21 rows by 16 pixels per row */
+
+    /*
+     * Intermediate values will be 1/2 pel at Horizontal direction
+     * Starting at (0.5, -2) at top extending to (0.5, height + 3) at bottom
+     * scratch contains a 2D array of size (w)X(h + 5)
+     */
+    hpel_lpf_hor16_neon(src - 2*src_stride, src_stride, scratch, w, h + 5);
+    hpel_lpf_ver16_neon(scratch, dst, w, h);
+}
+
+static void average_16x16_unalign_neon(uint8_t *dst, const uint8_t *src, int src_stride)
+{
+    vst1q_u8(dst, vrhaddq_u8(vld1q_u8(dst), vld1q_u8(src)));  src += src_stride; dst += 16;
+    vst1q_u8(dst, vrhaddq_u8(vld1q_u8(dst), vld1q_u8(src)));  src += src_stride; dst += 16;
+    vst1q_u8(dst, vrhaddq_u8(vld1q_u8(dst), vld1q_u8(src)));  src += src_stride; dst += 16;
+    vst1q_u8(dst, vrhaddq_u8(vld1q_u8(dst), vld1q_u8(src)));  src += src_stride; dst += 16;
+    vst1q_u8(dst, vrhaddq_u8(vld1q_u8(dst), vld1q_u8(src)));  src += src_stride; dst += 16;
+    vst1q_u8(dst, vrhaddq_u8(vld1q_u8(dst), vld1q_u8(src)));  src += src_stride; dst += 16;
+    vst1q_u8(dst, vrhaddq_u8(vld1q_u8(dst), vld1q_u8(src)));  src += src_stride; dst += 16;
+    vst1q_u8(dst, vrhaddq_u8(vld1q_u8(dst), vld1q_u8(src)));  src += src_stride; dst += 16;
+    vst1q_u8(dst, vrhaddq_u8(vld1q_u8(dst), vld1q_u8(src)));  src += src_stride; dst += 16;
+    vst1q_u8(dst, vrhaddq_u8(vld1q_u8(dst), vld1q_u8(src)));  src += src_stride; dst += 16;
+    vst1q_u8(dst, vrhaddq_u8(vld1q_u8(dst), vld1q_u8(src)));  src += src_stride; dst += 16;
+    vst1q_u8(dst, vrhaddq_u8(vld1q_u8(dst), vld1q_u8(src)));  src += src_stride; dst += 16;
+    vst1q_u8(dst, vrhaddq_u8(vld1q_u8(dst), vld1q_u8(src)));  src += src_stride; dst += 16;
+    vst1q_u8(dst, vrhaddq_u8(vld1q_u8(dst), vld1q_u8(src)));  src += src_stride; dst += 16;
+    vst1q_u8(dst, vrhaddq_u8(vld1q_u8(dst), vld1q_u8(src)));  src += src_stride; dst += 16;
+    vst1q_u8(dst, vrhaddq_u8(vld1q_u8(dst), vld1q_u8(src)));  src += src_stride; dst += 16;
+}
+
+static void h264e_qpel_average_wh_align_neon(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, point_t wh)
+{
+    int w = wh.s.x;
+    int h = wh.s.y;
+    int cloop = h;
+    if (w == 8)
+    {
+        do
+        {
+            vst1_u8(dst, vrhadd_u8(vld1_u8(src0), vld1_u8(src1)));
+            dst += 16;
+            src0 += 16;
+            src1 += 16;
+        } while (--cloop);
+    } else
+    {
+        do
+        {
+            vst1q_u8(dst, vrhaddq_u8(vld1q_u8(src0), vld1q_u8(src1)));
+            dst += 16;
+            src0 += 16;
+            src1 += 16;
+        } while (--cloop);
+    }
+}
+
+static void h264e_qpel_interpolate_luma_neon(const uint8_t *src, int src_stride, uint8_t *h264e_restrict dst, point_t wh, point_t dxdy)
+{
+//    src += ((dx + 1) >> 2) + ((dy + 1) >> 2)*src_stride;            // dx == 3 ? next pixel; dy == 3 ? next line
+//    dxdy              actions: Horizontal, Vertical, Diagonal, Average
+//    0 1 2 3 +1        -   ha    h    ha+
+//    1                 va  hva   hda  hv+a
+//    2                 v   vda   d    v+da
+//    3                 va+ h+va h+da  h+v+a
+//    +stride
+    int32_t pos = 1 << (dxdy.s.x + 4*dxdy.s.y);
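+    /* e.g. dxdy=(2,0) sets bit 2 of pos, so only the horizontal half-pel
+       filter runs; dxdy=(1,0) additionally averages that result with the
+       integer-position pixel; dxdy=(2,2) selects the diagonal filter alone. */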
+
+    if (pos == 1)
+    {
+        copy_wh_neon(src, src_stride, dst, wh.s.x, wh.s.y);
+    } else
+    {
+        ALIGN(16) uint8_t scratch[16*16] ALIGN2(16);
+        int dstused = 0;
+        if (pos & 0xe0ee)// 1110 0000 1110 1110
+        {
+            hpel_lpf_hor_neon(src + ((pos & 0xe000) ? src_stride : 0), src_stride, dst, wh.s.x, wh.s.y);
+            dstused++;
+        }
+        if (pos & 0xbbb0)// 1011 1011 1011 0000
+        {
+            hpel_lpf_ver_neon(src + ((pos & 0x8880) ? 1 : 0), src_stride, dstused ? scratch : dst, wh.s.x, wh.s.y);
+            dstused++;
+        }
+        if (pos & 0x4e40)// 0100 1110 0100 0000
+        {
+            hpel_lpf_diag_neon(src, src_stride, dstused ? scratch : dst, wh.s.x, wh.s.y);
+            dstused++;
+        }
+        if (pos & 0xfafa)// 1111 1010 1111 1010
+        {
+            assert(wh.s.x == 16 && wh.s.y == 16);
+            if (dstused == 2)
+            {
+                point_t p;
+
+                src = scratch;
+                src_stride = 16;
+                p.u32 = 16 + (16 << 16);
+
+                h264e_qpel_average_wh_align_neon(src, dst, dst, p);
+            } else
+            {
+                src += ((dxdy.s.x + 1) >> 2) + ((dxdy.s.y + 1) >> 2)*src_stride;
+                average_16x16_unalign_neon(dst, src, src_stride);
+            }
+        }
+    }
+}
+
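+/*
+ * Chroma motion-compensated interpolation: a bilinear blend of the four
+ * surrounding samples weighted by the 1/8-pel offsets (a=(8-dx)(8-dy),
+ * b=dx(8-dy), c=(8-dx)dy, d=dx*dy), with the weighted sum rounded and
+ * divided by 64; a zero fractional vector degenerates to a plain copy.
+ */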
+static void h264e_qpel_interpolate_chroma_neon(const uint8_t *src, int src_stride, uint8_t *h264e_restrict dst, point_t wh, point_t dxdy)
+{
+    /* if fractional mv is not (0, 0) */
+    if (dxdy.u32)
+    {
+        uint8x8_t v8 = vdup_n_u8(8);
+        uint8x8_t vx = vdup_n_u8(dxdy.s.x);
+        uint8x8_t vy = vdup_n_u8(dxdy.s.y);
+        uint8x8_t v8x = vsub_u8(v8, vx);
+        uint8x8_t v8y = vsub_u8(v8, vy);
+        uint8x8_t va = vmul_u8(v8x, v8y);
+        uint8x8_t vb = vmul_u8(vx, v8y);
+        uint8x8_t vc = vmul_u8(v8x, vy);
+        uint8x8_t vd = vmul_u8(vx, vy);
+        int h = wh.s.y;
+        if (wh.s.x == 8)
+        {
+            uint8x16_t vt0 = vld1q_u8(src);
+            uint8x16_t vt1 = vextq_u8(vt0, vt0, 1);
+            src += src_stride;
+            do
+            {
+                uint8x16_t vb0 = vld1q_u8(src);
+                uint8x16_t vb1 = vextq_u8(vb0, vb0, 1);
+                uint16x8_t vs = vmull_u8(vget_low_u8(vt0), va);
+                vs = vmlal_u8(vs, vget_low_u8(vt1), vb);
+                vs = vmlal_u8(vs, vget_low_u8(vb0), vc);
+                vs = vmlal_u8(vs, vget_low_u8(vb1), vd);
+                vst1_u8(dst, vqrshrun_n_s16(vreinterpretq_s16_u16(vs), 6));
+                vt0 = vb0;
+                vt1 = vb1;
+                dst += 16;
+                src += src_stride;
+             } while(--h);
+         } else
+         {
+            uint8x8_t vt0 = vld1_u8(src);
+            uint8x8_t vt1 = vext_u8(vt0, vt0, 1);
+            src += src_stride;
+            do
+            {
+                uint8x8_t vb0 = vld1_u8(src);
+                uint8x8_t vb1 = vext_u8(vb0, vb0, 1);
+                uint16x8_t vs = vmull_u8(vt0, va);
+                vs = vmlal_u8(vs, vt1, vb);
+                vs = vmlal_u8(vs, vb0, vc);
+                vs = vmlal_u8(vs, vb1, vd);
+                *(int32_t*)dst = vget_lane_s32(vreinterpret_s32_u8(vqrshrun_n_s16(vreinterpretq_s16_u16(vs), 6)), 0);
+                vt0 = vb0;
+                vt1 = vb1;
+                dst += 16;
+                src += src_stride;
+             } while(--h);
+         }
+    } else
+    {
+        copy_wh_neon(src, src_stride, dst, wh.s.x, wh.s.y);
+    }
+}
+
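+/*
+ * SAD of a 16x16 block (stride a_stride) against a reference packed with
+ * stride 16.  _sad[] also receives the four 8x8 sub-block sums (top-left,
+ * top-right, bottom-left, bottom-right); the return value is their total.
+ */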
+static int h264e_sad_mb_unlaign_8x8_neon(const pix_t *a, int a_stride, const pix_t *b, int _sad[4])
+{
+    uint16x8_t s0, s1;
+    uint8x16_t va, vb;
+    int cloop = 2, sum = 0;
+    do
+    {
+        va = vld1q_u8(a), vb = vld1q_u8(b);  a += a_stride, b += 16;
+        s0 = vabdl_u8(    vget_low_u8(va), vget_low_u8(vb));   s1 = vabdl_u8(    vget_high_u8(va), vget_high_u8(vb)); 
+        va = vld1q_u8(a), vb = vld1q_u8(b);  a += a_stride, b += 16;
+        s0 = vabal_u8(s0, vget_low_u8(va), vget_low_u8(vb));   s1 = vabal_u8(s1, vget_high_u8(va), vget_high_u8(vb)); 
+        va = vld1q_u8(a), vb = vld1q_u8(b);  a += a_stride, b += 16;
+        s0 = vabal_u8(s0, vget_low_u8(va), vget_low_u8(vb));   s1 = vabal_u8(s1, vget_high_u8(va), vget_high_u8(vb)); 
+        va = vld1q_u8(a), vb = vld1q_u8(b);  a += a_stride, b += 16;
+        s0 = vabal_u8(s0, vget_low_u8(va), vget_low_u8(vb));   s1 = vabal_u8(s1, vget_high_u8(va), vget_high_u8(vb)); 
+        va = vld1q_u8(a), vb = vld1q_u8(b);  a += a_stride, b += 16;
+        s0 = vabal_u8(s0, vget_low_u8(va), vget_low_u8(vb));   s1 = vabal_u8(s1, vget_high_u8(va), vget_high_u8(vb)); 
+        va = vld1q_u8(a), vb = vld1q_u8(b);  a += a_stride, b += 16;
+        s0 = vabal_u8(s0, vget_low_u8(va), vget_low_u8(vb));   s1 = vabal_u8(s1, vget_high_u8(va), vget_high_u8(vb)); 
+        va = vld1q_u8(a), vb = vld1q_u8(b);  a += a_stride, b += 16;
+        s0 = vabal_u8(s0, vget_low_u8(va), vget_low_u8(vb));   s1 = vabal_u8(s1, vget_high_u8(va), vget_high_u8(vb)); 
+        va = vld1q_u8(a), vb = vld1q_u8(b);  a += a_stride, b += 16;
+        s0 = vabal_u8(s0, vget_low_u8(va), vget_low_u8(vb));   s1 = vabal_u8(s1, vget_high_u8(va), vget_high_u8(vb)); 
+        {
+        uint32x4_t v0 = vpaddlq_u16(s0);
+        uint64x2_t v1 = vpaddlq_u32(v0);
+        sum += _sad[0] = (int)(vgetq_lane_u64(v1, 0)+vgetq_lane_u64(v1, 1));
+        v0 = vpaddlq_u16(s1);
+        v1 = vpaddlq_u32(v0);
+        sum += _sad[1] = (int)(vgetq_lane_u64(v1, 0)+vgetq_lane_u64(v1, 1));
+        _sad += 2;
+        }
+    } while(--cloop);
+    return sum;
+}
+
+static int h264e_sad_mb_unlaign_wh_neon(const pix_t *a, int a_stride, const pix_t *b, point_t wh)
+{
+    uint16x8_t s0, s1;
+    uint8x16_t va, vb;
+    int cloop = wh.s.y/8, sum = 0;
+    if (wh.s.x == 16)
+    {
+        do
+        {
+            va = vld1q_u8(a), vb = vld1q_u8(b);  a += a_stride, b += 16;
+            s0 = vabdl_u8(    vget_low_u8(va), vget_low_u8(vb));   s1 = vabdl_u8(    vget_high_u8(va), vget_high_u8(vb)); 
+            va = vld1q_u8(a), vb = vld1q_u8(b);  a += a_stride, b += 16;
+            s0 = vabal_u8(s0, vget_low_u8(va), vget_low_u8(vb));   s1 = vabal_u8(s1, vget_high_u8(va), vget_high_u8(vb)); 
+            va = vld1q_u8(a), vb = vld1q_u8(b);  a += a_stride, b += 16;
+            s0 = vabal_u8(s0, vget_low_u8(va), vget_low_u8(vb));   s1 = vabal_u8(s1, vget_high_u8(va), vget_high_u8(vb)); 
+            va = vld1q_u8(a), vb = vld1q_u8(b);  a += a_stride, b += 16;
+            s0 = vabal_u8(s0, vget_low_u8(va), vget_low_u8(vb));   s1 = vabal_u8(s1, vget_high_u8(va), vget_high_u8(vb)); 
+            va = vld1q_u8(a), vb = vld1q_u8(b);  a += a_stride, b += 16;
+            s0 = vabal_u8(s0, vget_low_u8(va), vget_low_u8(vb));   s1 = vabal_u8(s1, vget_high_u8(va), vget_high_u8(vb)); 
+            va = vld1q_u8(a), vb = vld1q_u8(b);  a += a_stride, b += 16;
+            s0 = vabal_u8(s0, vget_low_u8(va), vget_low_u8(vb));   s1 = vabal_u8(s1, vget_high_u8(va), vget_high_u8(vb)); 
+            va = vld1q_u8(a), vb = vld1q_u8(b);  a += a_stride, b += 16;
+            s0 = vabal_u8(s0, vget_low_u8(va), vget_low_u8(vb));   s1 = vabal_u8(s1, vget_high_u8(va), vget_high_u8(vb)); 
+            va = vld1q_u8(a), vb = vld1q_u8(b);  a += a_stride, b += 16;
+            s0 = vabal_u8(s0, vget_low_u8(va), vget_low_u8(vb));   s1 = vabal_u8(s1, vget_high_u8(va), vget_high_u8(vb)); 
+
+            uint32x4_t v0 = vpaddlq_u16(s0);
+            uint64x2_t v1 = vpaddlq_u32(v0);
+            sum += vgetq_lane_u64(v1, 0) + vgetq_lane_u64(v1, 1);
+
+            v0 = vpaddlq_u16(s1);
+            v1 = vpaddlq_u32(v0);
+            sum += vgetq_lane_u64(v1, 0) + vgetq_lane_u64(v1, 1);
+        } while(--cloop);
+    } else
+    {
+        do
+        {
+            va = vld1q_u8(a), vb = vld1q_u8(b);  a += a_stride, b += 16;
+            s0 = vabdl_u8(    vget_low_u8(va), vget_low_u8(vb));
+            va = vld1q_u8(a), vb = vld1q_u8(b);  a += a_stride, b += 16;
+            s0 = vabal_u8(s0, vget_low_u8(va), vget_low_u8(vb));
+            va = vld1q_u8(a), vb = vld1q_u8(b);  a += a_stride, b += 16;
+            s0 = vabal_u8(s0, vget_low_u8(va), vget_low_u8(vb));
+            va = vld1q_u8(a), vb = vld1q_u8(b);  a += a_stride, b += 16;
+            s0 = vabal_u8(s0, vget_low_u8(va), vget_low_u8(vb));
+            va = vld1q_u8(a), vb = vld1q_u8(b);  a += a_stride, b += 16;
+            s0 = vabal_u8(s0, vget_low_u8(va), vget_low_u8(vb));
+            va = vld1q_u8(a), vb = vld1q_u8(b);  a += a_stride, b += 16;
+            s0 = vabal_u8(s0, vget_low_u8(va), vget_low_u8(vb));
+            va = vld1q_u8(a), vb = vld1q_u8(b);  a += a_stride, b += 16;
+            s0 = vabal_u8(s0, vget_low_u8(va), vget_low_u8(vb));
+            va = vld1q_u8(a), vb = vld1q_u8(b);  a += a_stride, b += 16;
+            s0 = vabal_u8(s0, vget_low_u8(va), vget_low_u8(vb));
+
+            uint32x4_t v0 = vpaddlq_u16(s0);
+            uint64x2_t v1 = vpaddlq_u32(v0);
+            sum += vgetq_lane_u64(v1, 0) + vgetq_lane_u64(v1, 1);
+        } while(--cloop);
+    }
+    return sum;
+}
+
+static void h264e_copy_8x8_neon(pix_t *d, int d_stride, const pix_t *s)
+{
+    vst1_u8(d, vld1_u8(s)); s += 16;  d += d_stride;
+    vst1_u8(d, vld1_u8(s)); s += 16;  d += d_stride;
+    vst1_u8(d, vld1_u8(s)); s += 16;  d += d_stride;
+    vst1_u8(d, vld1_u8(s)); s += 16;  d += d_stride;
+
+    vst1_u8(d, vld1_u8(s)); s += 16;  d += d_stride;
+    vst1_u8(d, vld1_u8(s)); s += 16;  d += d_stride;
+    vst1_u8(d, vld1_u8(s)); s += 16;  d += d_stride;
+    vst1_u8(d, vld1_u8(s)); s += 16;  d += d_stride;
+}
+
+
+static void h264e_copy_16x16_neon(pix_t *d, int d_stride, const pix_t *s, int s_stride)
+{
+    assert(!((unsigned)d & 7));
+    assert(!((unsigned)s & 7));
+    vst1q_u8(d, vld1q_u8(s)); s += s_stride; d += d_stride;
+    vst1q_u8(d, vld1q_u8(s)); s += s_stride; d += d_stride;
+    vst1q_u8(d, vld1q_u8(s)); s += s_stride; d += d_stride;
+    vst1q_u8(d, vld1q_u8(s)); s += s_stride; d += d_stride;
+
+    vst1q_u8(d, vld1q_u8(s)); s += s_stride; d += d_stride;
+    vst1q_u8(d, vld1q_u8(s)); s += s_stride; d += d_stride;
+    vst1q_u8(d, vld1q_u8(s)); s += s_stride; d += d_stride;
+    vst1q_u8(d, vld1q_u8(s)); s += s_stride; d += d_stride;
+
+    vst1q_u8(d, vld1q_u8(s)); s += s_stride; d += d_stride;
+    vst1q_u8(d, vld1q_u8(s)); s += s_stride; d += d_stride;
+    vst1q_u8(d, vld1q_u8(s)); s += s_stride; d += d_stride;
+    vst1q_u8(d, vld1q_u8(s)); s += s_stride; d += d_stride;
+
+    vst1q_u8(d, vld1q_u8(s)); s += s_stride; d += d_stride;
+    vst1q_u8(d, vld1q_u8(s)); s += s_stride; d += d_stride;
+    vst1q_u8(d, vld1q_u8(s)); s += s_stride; d += d_stride;
+    vst1q_u8(d, vld1q_u8(s)); s += s_stride; d += d_stride;
+}
+
+// Keep intermediate data in transposed format.
+// Save transpose for vectorized implementation
+// TODO: TRANSPOSE_BLOCK==0 broken
+#define TRANSPOSE_BLOCK     0
+#define UNZIGSAG_IN_QUANT   0
+
+#define SUM_DIF(a, b) { int t = a + b; b = a - b; a = t; }
+
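+/*
+ * In-place 4x4 Hadamard transform of the luma DC coefficients, applied to
+ * both dimensions: butterflies, a transpose, then butterflies again.
+ */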
+static void hadamar4_2d_neon(int16_t *x)
+{
+    int16x8_t q0 = vld1q_s16(x);
+    int16x8_t q1 = vld1q_s16(x + 8);
+    int16x8_t s = vaddq_s16(q0, q1);
+    int16x8_t d = vsubq_s16(q0, q1);
+    int16x8_t q2 = vcombine_s16(vget_low_s16(s), vget_low_s16(d));
+    int16x8_t q3 = vcombine_s16(vget_high_s16(s), vget_high_s16(d));
+    q0 = vaddq_s16(q2, q3);
+    d  = vsubq_s16(q2, q3);
+    q1 = vcombine_s16(vget_high_s16(d), vget_low_s16(d));
+{
+    int16x4x2_t t0 = vtrn_s16(vget_low_s16(q0), vget_high_s16(q0));
+    int16x4x2_t t1 = vtrn_s16(vget_low_s16(q1), vget_high_s16(q1));
+    int32x4x2_t tq = vtrnq_s32(vreinterpretq_s32_s16(vcombine_s16(t0.val[0], t0.val[1])), vreinterpretq_s32_s16(vcombine_s16(t1.val[0], t1.val[1])));
+
+    q0 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(tq.val[0])), vget_high_s16(vreinterpretq_s16_s32(tq.val[0])));
+    q1 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(tq.val[1])), vget_high_s16(vreinterpretq_s16_s32(tq.val[1])));
+
+    s = vaddq_s16(q0, q1);
+    d = vsubq_s16(q0, q1);
+    q2 = vcombine_s16(vget_low_s16(s), vget_low_s16(d));
+    q3 = vcombine_s16(vget_high_s16(s), vget_high_s16(d));
+    q0 = vaddq_s16(q2, q3);
+    d = vsubq_s16(q2, q3);
+    q1 = vcombine_s16(vget_high_s16(d), vget_low_s16(d));
+    vst1q_s16(x, q0);
+    vst1q_s16(x + 8, q1);
+}
+}
+
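+/* Write each dequantized DC value back into the DC slot (dq[0]) of its 4x4 block. */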
+static void dequant_dc_neon(quant_t *q, int16_t *qval, int dequant, int n)
+{
+    do q++->dq[0] = (int16_t)(*qval++*(int16_t)dequant); while (--n);
+}
+
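+/*
+ * Quantize DC coefficients: multiply by 'quant' in Q18 with a rounding term
+ * mirrored for negative values; the results go back in place and a
+ * scan-ordered copy (iscan16/iscan4) is written to deq[].
+ */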
+static void quant_dc_neon(int16_t *qval, int16_t *deq, int16_t quant, int n, int round_q18)
+{
+#if 1
+    int r_minus =  (1 << 18) - round_q18;
+    static const uint8_t iscan16[16] = {0, 2, 3, 9, 1, 4, 8, 10, 5, 7, 11, 14, 6, 12, 13, 15};
+    static const uint8_t iscan4[4] = {0, 1, 2, 3};
+    const uint8_t *scan = n == 4 ? iscan4 : iscan16;
+    do
+    {
+        int v = *qval;
+        int r = v < 0 ? r_minus : round_q18;
+        deq[*scan++] = *qval++ = (v * quant + r) >> 18;
+    } while (--n);
+#else
+    int r_minus =  (1 << 18) - round_q18;
+    do
+    {
+        int v = *qval;
+        int r = v < 0 ? r_minus : round_q18;
+        *deq++ = *qval++ = (v * quant + r) >> 18;
+    } while (--n);
+#endif
+}
+
+static void hadamar2_2d_neon(int16_t *x)
+{
+    int a = x[0];
+    int b = x[1];
+    int c = x[2];
+    int d = x[3];
+    x[0] = (int16_t)(a + b + c + d);
+    x[1] = (int16_t)(a - b + c - d);
+    x[2] = (int16_t)(a + b - c - d);
+    x[3] = (int16_t)(a - b - c + d);
+}
+
+static void h264e_quant_luma_dc_neon(quant_t *q, int16_t *deq, const uint16_t *qdat)
+{
+    int16_t *tmp = ((int16_t*)q) - 16;
+    hadamar4_2d_neon(tmp);
+    quant_dc_neon(tmp, deq, qdat[0], 16, 0x20000);//0x15555);
+    hadamar4_2d_neon(tmp);
+    assert(!(qdat[1] & 3));
+    // dirty trick here: shift without rounding, since it has no effect for qp >= 10 (or, to be precise, for qp >= 9)
+    dequant_dc_neon(q, tmp, qdat[1] >> 2, 16);
+}
+
+static int h264e_quant_chroma_dc_neon(quant_t *q, int16_t *deq, const uint16_t *qdat)
+{
+    int16_t *tmp = ((int16_t*)q) - 16;
+    hadamar2_2d_neon(tmp);
+    quant_dc_neon(tmp, deq, (int16_t)(qdat[0] << 1), 4, 0xAAAA);
+    hadamar2_2d_neon(tmp);
+    assert(!(qdat[1] & 1));
+    dequant_dc_neon(q, tmp, qdat[1] >> 1, 4);
+    return !!(tmp[0] | tmp[1] | tmp[2] | tmp[3]);
+}
+
+#define TRANSFORM(x0, x1, x2, x3, p, s) { \
+    int t0 = x0 + x3;                     \
+    int t1 = x0 - x3;                     \
+    int t2 = x1 + x2;                     \
+    int t3 = x1 - x2;                     \
+    (p)[  0] = (int16_t)(t0 + t2);        \
+    (p)[  s] = (int16_t)(t1*2 + t3);      \
+    (p)[2*s] = (int16_t)(t0 - t2);        \
+    (p)[3*s] = (int16_t)(t1 - t3*2);      \
+}
+
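+/*
+ * Forward 4x4 transform of the residual (input minus prediction).  The
+ * TRANSPOSE_BLOCK variant is the plain row/column version built on the
+ * TRANSFORM macro; the NEON path below fuses the subtraction, both
+ * transform passes and the transposes.
+ */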
+static void FwdTransformResidual4x42_neon(const uint8_t *inp, const uint8_t *pred, uint32_t inp_stride, int16_t *out)
+{
+#if TRANSPOSE_BLOCK
+    int i;
+    int16_t tmp[16];
+    // Transform columns
+    for (i = 0; i < 4; i++, pred++, inp++)
+    {
+        int f0 = inp[0] - pred[0];
+        int f1 = inp[1*inp_stride] - pred[1*16];
+        int f2 = inp[2*inp_stride] - pred[2*16];
+        int f3 = inp[3*inp_stride] - pred[3*16];
+        TRANSFORM(f0, f1, f2, f3, tmp + i*4, 1);
+    }
+    // Transform rows
+    for (i = 0; i < 4; i++)
+    {
+        int d0 = tmp[i + 0];
+        int d1 = tmp[i + 4];
+        int d2 = tmp[i + 8];
+        int d3 = tmp[i + 12];
+        TRANSFORM(d0, d1, d2, d3, out + i, 4);
+    }
+#else
+    /* Transform rows */
+    uint8x8_t inp0  = vreinterpret_u8_s32(vtrn_s32(vreinterpret_s32_u8(vld1_u8(inp)),  vreinterpret_s32_u8(vld1_u8(inp + inp_stride))).val[0]);
+    uint8x8_t inp1  = vreinterpret_u8_s32(vtrn_s32(vreinterpret_s32_u8(vld1_u8(inp + 2*inp_stride)), vreinterpret_s32_u8(vld1_u8(inp + 3*inp_stride))).val[0]);
+    uint8x8_t pred0 = vreinterpret_u8_s32(vtrn_s32(vreinterpret_s32_u8(vld1_u8(pred)),  vreinterpret_s32_u8(vld1_u8(pred + 16))).val[0]);
+    uint8x8_t pred1 = vreinterpret_u8_s32(vtrn_s32(vreinterpret_s32_u8(vld1_u8(pred + 2*16)), vreinterpret_s32_u8(vld1_u8(pred + 3*16))).val[0]);
+    int16x8_t q0 = vreinterpretq_s16_u16(vsubl_u8(inp0, pred0));
+    int16x8_t q1 = vreinterpretq_s16_u16(vsubl_u8(inp1, pred1));
+
+    int16x4x2_t  t0 = vtrn_s16(vget_low_s16(q0), vget_high_s16(q0));
+    int16x4x2_t  t1 = vtrn_s16(vget_low_s16(q1), vget_high_s16(q1));
+    int32x4x2_t  tq = vtrnq_s32(vreinterpretq_s32_s16(vcombine_s16(t0.val[0], t0.val[1])), vreinterpretq_s32_s16(vcombine_s16(t1.val[0], t1.val[1])));
+
+    int16x4_t d4 = vadd_s16(vget_low_s16(vreinterpretq_s16_s32(tq.val[0])), vget_high_s16(vreinterpretq_s16_s32(tq.val[1])));
+    int16x4_t d5 = vsub_s16(vget_low_s16(vreinterpretq_s16_s32(tq.val[0])), vget_high_s16(vreinterpretq_s16_s32(tq.val[1])));
+    int16x4_t d6 = vadd_s16(vget_high_s16(vreinterpretq_s16_s32(tq.val[0])), vget_low_s16(vreinterpretq_s16_s32(tq.val[1])));
+    int16x4_t d7 = vsub_s16(vget_high_s16(vreinterpretq_s16_s32(tq.val[0])), vget_low_s16(vreinterpretq_s16_s32(tq.val[1])));
+    int16x8_t q2 = vcombine_s16(d4, d5);
+    int16x8_t q3 = vcombine_s16(d6, d7);
+    q0 = vaddq_s16(q2, q3);
+    q0 = vcombine_s16(vget_low_s16(q0), vadd_s16(vget_high_s16(q0), d5));
+    q1 = vsubq_s16(q2, q3);
+    q1 = vcombine_s16(vget_low_s16(q1), vsub_s16(vget_high_s16(q1), d7));
+
+    t0 = vtrn_s16(vget_low_s16(q0), vget_high_s16(q0));
+    t1 = vtrn_s16(vget_low_s16(q1), vget_high_s16(q1));
+    tq = vtrnq_s32(vreinterpretq_s32_s16(vcombine_s16(t0.val[0], t0.val[1])), vreinterpretq_s32_s16(vcombine_s16(t1.val[0], t1.val[1])));
+
+    d4 = vadd_s16(vget_low_s16(vreinterpretq_s16_s32(tq.val[0])), vget_high_s16(vreinterpretq_s16_s32(tq.val[1])));
+    d5 = vsub_s16(vget_low_s16(vreinterpretq_s16_s32(tq.val[0])), vget_high_s16(vreinterpretq_s16_s32(tq.val[1])));
+    d6 = vadd_s16(vget_high_s16(vreinterpretq_s16_s32(tq.val[0])), vget_low_s16(vreinterpretq_s16_s32(tq.val[1])));
+    d7 = vsub_s16(vget_high_s16(vreinterpretq_s16_s32(tq.val[0])), vget_low_s16(vreinterpretq_s16_s32(tq.val[1])));
+    q2 = vcombine_s16(d4, d5);
+    q3 = vcombine_s16(d6, d7);
+    q0 = vaddq_s16(q2, q3);
+    q0 = vcombine_s16(vget_low_s16(q0), vadd_s16(vget_high_s16(q0), d5));
+    q1 = vsubq_s16(q2, q3);
+    q1 = vcombine_s16(vget_low_s16(q1), vsub_s16(vget_high_s16(q1), d7));
+
+    vst1q_s16(out, q0);
+    vst1q_s16(out + 8, q1);
+#endif
+}
+
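+/*
+ * Inverse 4x4 transform: two butterfly passes with a transpose in between
+ * rebuild the residual from the dequantized coefficients, the prediction is
+ * added (scaled by 64) and the sum is rounded and clipped back to 8-bit
+ * pixels with a shift by 6.
+ */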
+static void TransformResidual4x4_neon(const int16_t *pSrc, const pix_t *pred, pix_t *out, int out_stride)
+{
+    int16x4_t e0, e1, e2, e3;
+    int16x4_t f0, f1, f2, f3;
+    int16x4_t g0, g1, g2, g3;
+    int16x4_t h0, h1, h2, h3;
+    int16x4_t d0 = vld1_s16(pSrc);
+    int16x4_t d1 = vld1_s16(pSrc + 4);
+    int16x4_t d2 = vld1_s16(pSrc + 8);
+    int16x4_t d3 = vld1_s16(pSrc + 12);
+    int16x4x2_t dd0 = vtrn_s16(d0, d1);
+    int16x4x2_t dd1 = vtrn_s16(d2, d3);
+    int32x4x2_t d = vtrnq_s32(vreinterpretq_s32_s16(vcombine_s16(dd0.val[0], dd0.val[1])), vreinterpretq_s32_s16(vcombine_s16(dd1.val[0], dd1.val[1])));
+    d0 = vreinterpret_s16_s32(vget_low_s32(d.val[0]));
+    d1 = vreinterpret_s16_s32(vget_high_s32(d.val[0]));
+    d2 = vreinterpret_s16_s32(vget_low_s32(d.val[1]));
+    d3 = vreinterpret_s16_s32(vget_high_s32(d.val[1]));
+
+    e0 = vadd_s16(d0, d2);
+    e1 = vsub_s16(d0, d2);
+    e2 = vsub_s16(vshr_n_s16(d1, 1), d3);
+    e3 = vadd_s16(d1, vshr_n_s16(d3, 1));
+    f0 = vadd_s16(e0, e3);
+    f1 = vadd_s16(e1, e2);
+    f2 = vsub_s16(e1, e2);
+    f3 = vsub_s16(e0, e3);
+
+    dd0 = vtrn_s16(f0, f1);
+    dd1 = vtrn_s16(f2, f3);
+    d = vtrnq_s32(vreinterpretq_s32_s16(vcombine_s16(dd0.val[0], dd0.val[1])), vreinterpretq_s32_s16(vcombine_s16(dd1.val[0], dd1.val[1])));
+    f0 = vreinterpret_s16_s32(vget_low_s32(d.val[0]));
+    f1 = vreinterpret_s16_s32(vget_high_s32(d.val[0]));
+    f2 = vreinterpret_s16_s32(vget_low_s32(d.val[1]));
+    f3 = vreinterpret_s16_s32(vget_high_s32(d.val[1]));
+
+    g0 = vadd_s16(f0, f2);
+    g1 = vsub_s16(f0, f2);
+    g2 = vsub_s16(vshr_n_s16(f1, 1), f3);
+    g3 = vadd_s16(f1, vshr_n_s16(f3, 1));
+    h0 = vadd_s16(g0, g3);
+    h1 = vadd_s16(g1, g2);
+    h2 = vsub_s16(g1, g2);
+    h3 = vsub_s16(g0, g3);
+
+    {
+        uint8x8_t inp0 = vreinterpret_u8_s32(vtrn_s32(vreinterpret_s32_u8(vld1_u8(pred)),  vreinterpret_s32_u8(vld1_u8(pred + 16))).val[0]);
+        uint8x8_t inp1 = vreinterpret_u8_s32(vtrn_s32(vreinterpret_s32_u8(vld1_u8(pred + 2*16)), vreinterpret_s32_u8(vld1_u8(pred + 3*16))).val[0]);
+        int16x8_t a0 = vaddq_s16(vcombine_s16(h0, h1), vreinterpretq_s16_u16(vshll_n_u8(inp0, 6)));
+        int16x8_t a1 = vaddq_s16(vcombine_s16(h2, h3), vreinterpretq_s16_u16(vshll_n_u8(inp1, 6)));
+        uint8x8_t r0 = vqrshrun_n_s16(a0, 6);
+        uint8x8_t r1 = vqrshrun_n_s16(a1, 6);
+        *(uint32_t*)(&out[0*out_stride]) = vget_lane_u32(vreinterpret_u32_u8(r0), 0);
+        *(uint32_t*)(&out[1*out_stride]) = vget_lane_u32(vreinterpret_u32_u8(r0), 1);
+        *(uint32_t*)(&out[2*out_stride]) = vget_lane_u32(vreinterpret_u32_u8(r1), 0);
+        *(uint32_t*)(&out[3*out_stride]) = vget_lane_u32(vreinterpret_u32_u8(r1), 1);
+    }
+}
+
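+/*
+ * Early-skip helpers: a 4x4 block whose absolute coefficients are all at or
+ * below the threshold is treated as zero; i0 != 0 excludes the first (DC)
+ * coefficient from the test, and zero_smallq_neon() builds the per-block
+ * skip mask for a whole partition.
+ */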
+static int is_zero_neon(const int16_t *dat, int i0, const uint16_t *thr)
+{
+    static const uint16x8_t g_ign_first = { 0, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff };
+    int16x8_t v0 = vabsq_s16(*(int16x8_t *)dat);
+    int16x8_t v1 = vabsq_s16(*(int16x8_t *)(dat + 8));
+    int16x8_t t = *(int16x8_t *)thr;
+    uint16x8_t m0 = vcgtq_s16(v0, t);
+    uint16x8_t m1 = vcgtq_s16(v1, t);
+    if (i0)
+        m0 = vandq_u16(m0, g_ign_first);
+    m0 = vorrq_u16(m0, m1);
+    uint16x4_t m4 = vorr_u16(vget_low_u16(m0), vget_high_u16(m0));
+    return !(vget_lane_u32(vreinterpret_u32_u16(m4), 0) | vget_lane_u32(vreinterpret_u32_u16(m4), 1));
+}
+
+static int is_zero4_neon(const quant_t *q, int i0, const uint16_t *thr)
+{
+    return is_zero_neon(q[0].dq, i0, thr) &&
+           is_zero_neon(q[1].dq, i0, thr) &&
+           is_zero_neon(q[4].dq, i0, thr) &&
+           is_zero_neon(q[5].dq, i0, thr);
+}
+
+static int zero_smallq_neon(quant_t *q, int mode, const uint16_t *qdat)
+{
+    int zmask = 0;
+    int i, i0 = mode & 1, n = mode >> 1;
+    if (mode == QDQ_MODE_INTER || mode == QDQ_MODE_CHROMA)
+    {
+        for (i = 0; i < n*n; i++)
+        {
+            if (is_zero_neon(q[i].dq, i0, qdat + OFFS_THR_1_OFF))
+            {
+                zmask |= (1 << i); //9.19
+            }
+        }
+        if (mode == QDQ_MODE_INTER)   //8.27
+        {
+            if ((~zmask & 0x0033) && is_zero4_neon(q +  0, i0, qdat + OFFS_THR_2_OFF)) zmask |= 0x33;
+            if ((~zmask & 0x00CC) && is_zero4_neon(q +  2, i0, qdat + OFFS_THR_2_OFF)) zmask |= (0x33 << 2);
+            if ((~zmask & 0x3300) && is_zero4_neon(q +  8, i0, qdat + OFFS_THR_2_OFF)) zmask |= (0x33 << 8);
+            if ((~zmask & 0xCC00) && is_zero4_neon(q + 10, i0, qdat + OFFS_THR_2_OFF)) zmask |= (0x33 << 10);
+        }
+    }
+    return zmask;
+}
+
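+/*
+ * Quantize the 4x4 blocks of a partition (ccol x crow of them).  Blocks
+ * flagged in zmask are cleared outright; the rest are quantized, the
+ * dequantized values are stored back into q->dq, a scan-reordered copy goes
+ * into q->qv via a byte-shuffle table, and a nonzero mask is accumulated in
+ * nz_block_mask.  With (mode & 1) the DC coefficient is preserved, since it
+ * is handled by the DC-specific routines above.
+ */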
+static int quantize_neon(quant_t *q, int mode, const uint16_t *qdat, int zmask)
+{
+#if UNZIGSAG_IN_QUANT
+#if TRANSPOSE_BLOCK
+    //         ; Zig-zag scan      Transposed zig-zag
+    //         ;    0 1 5 6        0 2 3 9
+    //         ;    2 4 7 C        1 4 8 A
+    //         ;    3 8 B D        5 7 B E
+    //         ;    9 A E F        6 C D F
+    static const unsigned char iscan16[16] = {0, 2, 3, 9, 1, 4, 8, 10, 5, 7, 11, 14, 6, 12, 13, 15};
+#else
+    static const unsigned char iscan16[16] = {0, 1, 5, 6, 2, 4, 7, 12, 3, 8, 11, 13, 9, 10, 14, 15};
+#endif
+#endif
+    int ccol, crow, nz_block_mask = 0;
+    ccol = mode >> 1;
+    crow = ccol;
+    do
+    {
+        do
+        {
+            int nz_mask = 0;
+
+            if (zmask & 1)
+            {
+                int32_t *p = (int32_t *)q->qv;
+                *p++ = 0; *p++ = 0; *p++ = 0; *p++ = 0;
+                *p++ = 0; *p++ = 0; *p++ = 0; *p++ = 0;
+            } else
+            {
+                static const uint8_t iscan16_neon [] = {
+                    0x00,0x01,0x02,0x03,0x08,0x09,0x10,0x11,
+                    0x0A,0x0B,0x04,0x05,0x06,0x07,0x0C,0x0D,
+                    0x12,0x13,0x18,0x19,0x1A,0x1B,0x14,0x15,
+                    0x0E,0x0F,0x16,0x17,0x1C,0x1D,0x1E,0x1F};
+                static const uint16_t imask16_neon [] = {
+                    0x0001,0x0002,0x0004,0x0008,
+                    0x0010,0x0020,0x0040,0x0080,
+                    0x0100,0x0200,0x0400,0x0800,
+                    0x1000,0x2000,0x4000,0x8000};
+                short save = 0;
+                uint8x16_t q8,q9;
+                int16x8_t q0 = vld1q_s16(q->dq);
+                int16x8_t q1 = vld1q_s16(q->dq + 8);
+                uint16x8_t r =  vdupq_n_u16(qdat[OFFS_RND_INTER]);
+                uint16x8_t r0 = veorq_u16(r, vcltq_s16(q0, vdupq_n_s16(0)));
+                uint16x8_t r1 = veorq_u16(r, vcltq_s16(q1, vdupq_n_s16(0)));
+                int16x4_t d4, d5, d6, d7;
+                int16x4_t d22, d23, d24, d25;
+                int16x4_t d26, d27, d28, d29;
+
+                d4 = d6 = vdup_n_s16(qdat[2]);
+                d5 = d7 = vdup_n_s16(qdat[3]);
+                d4 = vset_lane_s16(qdat[0], d4, 0);
+                d4 = vset_lane_s16(qdat[0], d4, 2);
+                d5 = vset_lane_s16(qdat[1], d5, 0);
+                d5 = vset_lane_s16(qdat[1], d5, 2);
+                d6 = vset_lane_s16(qdat[4], d6, 1);
+                d6 = vset_lane_s16(qdat[4], d6, 3);
+                d7 = vset_lane_s16(qdat[5], d7, 1);
+                d7 = vset_lane_s16(qdat[5], d7, 3);
+
+                d22 = vqshrn_n_s32(vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vmull_s16(vget_low_s16(q0), d4)), vget_low_u16(r0))), 16);
+                d26 = vmul_s16(d22, d5);
+                d23 = vqshrn_n_s32(vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vmull_s16(vget_high_s16(q0), d6)), vget_high_u16(r0))), 16);
+                d27 = vmul_s16(d23, d7);
+                d24 = vqshrn_n_s32(vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vmull_s16(vget_low_s16(q1), d4)), vget_low_u16(r1))), 16);
+                d28 = vmul_s16(d24, d5);
+                d25 = vqshrn_n_s32(vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vmull_s16(vget_high_s16(q1), d6)), vget_high_u16(r1))), 16);
+                d29 = vmul_s16(d25, d7);
+                if (mode & 1)
+                {
+                    save = q->dq[0];
+                }
+                vst1q_s16(q->dq,     vcombine_s16(d26, d27));
+                vst1q_s16(q->dq + 8, vcombine_s16(d28, d29));
+                if (mode & 1)
+                {
+                    q->dq[0] = save;
+                }
+
+                if (mode & 1)
+                {
+                    save = q->qv[0];
+                }
+                q8 = vld1q_u8(iscan16_neon);
+                q9 = vld1q_u8(iscan16_neon + 16);
+
+                {
+// vtbl4_u8 is marked unavailable for iOS arm64, use wider versions there.
+#if defined(__APPLE__) && defined(__aarch64__) &&  defined(__apple_build_version__)
+                uint8x16x2_t vlut;
+                vlut.val[0] = vreinterpretq_u8_s16(vcombine_s16(d22, d23));
+                vlut.val[1] = vreinterpretq_u8_s16(vcombine_s16(d24, d25));
+                vst1_s16(q->qv + 0, d4 = vreinterpret_s16_u8(vqtbl2_u8(vlut, vget_low_u8(q8))));
+                vst1_s16(q->qv + 4, d5 = vreinterpret_s16_u8(vqtbl2_u8(vlut, vget_high_u8(q8))));
+                vst1_s16(q->qv + 8, d6 = vreinterpret_s16_u8(vqtbl2_u8(vlut, vget_low_u8(q9))));
+                vst1_s16(q->qv +12, d7 = vreinterpret_s16_u8(vqtbl2_u8(vlut, vget_high_u8(q9))));
+#else
+                uint8x8x4_t vlut;
+                vlut.val[0] = vreinterpret_u8_s16(d22);
+                vlut.val[1] = vreinterpret_u8_s16(d23);
+                vlut.val[2] = vreinterpret_u8_s16(d24);
+                vlut.val[3] = vreinterpret_u8_s16(d25);
+                vst1_s16(q->qv + 0, d4 = vreinterpret_s16_u8(vtbl4_u8(vlut, vget_low_u8(q8))));
+                vst1_s16(q->qv + 4, d5 = vreinterpret_s16_u8(vtbl4_u8(vlut, vget_high_u8(q8))));
+                vst1_s16(q->qv + 8, d6 = vreinterpret_s16_u8(vtbl4_u8(vlut, vget_low_u8(q9))));
+                vst1_s16(q->qv +12, d7 = vreinterpret_s16_u8(vtbl4_u8(vlut, vget_high_u8(q9))));
+#endif
+                }
+                {
+                    uint16x8_t bm0 = vld1q_u16(imask16_neon);
+                    uint16x8_t bm1 = vld1q_u16(imask16_neon + 8);
+                    uint16x4_t m;
+                    bm0 = vandq_u16(bm0, vceqq_s16(vcombine_s16(d4, d5), vdupq_n_s16(0)));
+                    bm1 = vandq_u16(bm1, vceqq_s16(vcombine_s16(d6, d7), vdupq_n_s16(0)));
+                    bm0 = vorrq_u16(bm0, bm1);
+                    m = vorr_u16(vget_low_u16(bm0), vget_high_u16(bm0));
+                    m = vpadd_u16(m, m);
+                    m = vpadd_u16(m, m);
+                    nz_mask = vget_lane_u16(vmvn_u16(m), 0);
+                }
+
+                if (mode & 1)
+                {
+                    q->qv[0] = save;
+                    nz_mask &= ~1;
+                }
+            }
+
+            zmask >>= 1;
+            nz_block_mask <<= 1;
+            if (nz_mask)
+                nz_block_mask |= 1;
+            q++;
+        } while (--ccol);
+        ccol = mode >> 1;
+    } while (--crow);
+    return nz_block_mask;
+}
+
+static void transform_neon(const pix_t *inp, const pix_t *pred, int inp_stride, int mode, quant_t *q)
+{
+    int crow = mode >> 1;
+    int ccol = crow;
+
+    do
+    {
+        do
+        {
+            FwdTransformResidual4x42_neon(inp, pred, inp_stride, q->dq);
+            q++;
+            inp += 4;
+            pred += 4;
+        } while (--ccol);
+        ccol = mode >> 1;
+        inp += 4*(inp_stride - ccol);
+        pred += 4*(16 - ccol);
+    } while (--crow);
+}
+
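+// Forward path for one macroblock plane: 4x4 transform of the inp-pred residual,
+// DC coefficients copied to the 16 shorts just below q when they are coded
+// separately (mode&1), then dead-zone thresholding and quantization.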
+static int h264e_transform_sub_quant_dequant_neon(const pix_t *inp, const pix_t *pred, int inp_stride, int mode, quant_t *q, const uint16_t *qdat)
+{
+    int zmask;
+    transform_neon(inp, pred, inp_stride, mode, q);
+    if (mode & 1) // QDQ_MODE_INTRA_16 || QDQ_MODE_CHROMA
+    {
+        int cloop = (mode >> 1)*(mode >> 1);
+        short *dc = ((short *)q) - 16;
+        quant_t *pq = q;
+        do
+        {
+            *dc++ = pq->dq[0];
+            pq++;
+        } while (--cloop);
+    }
+    zmask = zero_smallq_neon(q, mode, qdat);
+    return quantize_neon(q, mode, qdat, zmask);
+}
+
+static void h264e_transform_add_neon(pix_t *out, int out_stride, const pix_t *pred, quant_t *q, int side, int32_t mask)
+{
+    int crow = side;
+    int ccol = crow;
+
+    assert(!((unsigned)out % 4));
+    assert(!((unsigned)pred % 4));
+    assert(!(out_stride % 4));
+    do
+    {
+        do
+        {
+            if (mask >= 0)
+            {
+                // copy 4x4
+                pix_t *dst = out;
+                *(uint32_t*)dst = *(uint32_t*)(pred + 0 * 16); dst += out_stride;
+                *(uint32_t*)dst = *(uint32_t*)(pred + 1 * 16); dst += out_stride;
+                *(uint32_t*)dst = *(uint32_t*)(pred + 2 * 16); dst += out_stride;
+                *(uint32_t*)dst = *(uint32_t*)(pred + 3 * 16);
+            } else
+            {
+                TransformResidual4x4_neon(q->dq, pred, out, out_stride);
+            }
+            mask <<= 1;
+            q++;
+            out += 4;
+            pred += 4;
+        } while (--ccol);
+        ccol = side;
+        out += 4*(out_stride - ccol);
+        pred += 4*(16 - ccol);
+    } while (--crow);
+}
+#endif
+
+#if H264E_ENABLE_PLAIN_C
+
+static uint8_t byteclip_deblock(int x)
+{
+    if (x > 255)
+    {
+        return 255;
+    }
+    if (x < 0)
+    {
+        return 0;
+    }
+    return (uint8_t)x;
+}
+
+static int clip_range(int range, int src)
+{
+    if (src > range)
+    {
+        src = range;
+    }
+    if (src < -range)
+    {
+        src = -range;
+    }
+    return src;
+}
+
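+// Filter one chroma edge sample pair (p1 p0 | q0 q1): skip if the boundary
+// strength is 0 or the edge activity exceeds alpha/beta; bS < 4 applies the
+// clipped delta with tC = tc0 + 1, bS == 4 the strong 2-tap smoothing.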
+static void deblock_chroma(uint8_t *pix, int stride, int alpha, int beta, int thr, int strength)
+{
+    int p1, p0, q0, q1;
+    int delta;
+
+    if (strength == 0)
+    {
+        return;
+    }
+
+    p1 = pix[-2*stride];
+    p0 = pix[-1*stride];
+    q0 = pix[ 0*stride];
+    q1 = pix[ 1*stride];
+
+    if (ABS(p0 - q0) >= alpha || ABS(p1 - p0) >= beta || ABS(q1 - q0) >= beta)
+    {
+        return;
+    }
+
+    if (strength < 4)
+    {
+        int tC = thr + 1;
+        delta = (((q0 - p0)*4) + (p1 - q1) + 4) >> 3;
+        delta = clip_range(tC, delta);
+        pix[-1*stride] = byteclip_deblock(p0 + delta);
+        pix[ 0*stride] = byteclip_deblock(q0 - delta);
+    } else
+    {
+        pix[-1*stride] = (pix_t)((2*p1 + p0 + q1 + 2) >> 2);
+        pix[ 0*stride] = (pix_t)((2*q1 + q0 + p1 + 2) >> 2);
+    }
+}
+
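+// Normal-strength (bS 1..3) luma filter across a vertical edge: 4 groups of 4
+// rows, each with its own tc0/strength. (x - beta) >> 31 yields an all-ones mask
+// when x < beta, so the p1/q1 corrections are applied branch-free.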
+static void deblock_luma_v(uint8_t *pix, int stride, int alpha, int beta, const uint8_t *pthr, const uint8_t *pstr)
+{
+    int p2, p1, p0, q0, q1, q2, thr;
+    int ap, aq, delta, cloop, i;
+    for (i = 0; i < 4; i++)
+    {
+        cloop = 4;
+        if (pstr[i])
+        {
+            thr = pthr[i];
+            do
+            {
+                p1 = pix[-2];
+                p0 = pix[-1];
+                q0 = pix[ 0];
+                q1 = pix[ 1];
+
+                //if (ABS(p0 - q0) < alpha && ABS(p1 - p0) < beta && ABS(q1 - q0) < beta)
+                if (((ABS(p0 - q0) - alpha) & (ABS(p1 - p0) - beta) & (ABS(q1 - q0) - beta)) < 0)
+                {
+                    int tC, sp, sq, d2;
+                    // avoid conditional branches
+                    p2 = pix[-3];
+                    q2 = pix[ 2];
+                    ap = ABS(p2 - p0);
+                    aq = ABS(q2 - q0);
+                    delta = (((q0 - p0)*4) + (p1 - q1) + 4) >> 3;
+
+                    sp = (ap - beta) >> 31;
+                    sq = (aq - beta) >> 31;
+                    d2 = (((p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1) & sp;
+                    d2 = clip_range(thr, d2);
+                    pix[-2] = (pix_t)(p1 + d2);
+                    d2 = (((q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1) & sq;
+                    d2 = clip_range(thr, d2);
+                    pix[ 1] = (pix_t)(q1 + d2);
+                    tC = thr - sp - sq;
+                    delta = clip_range(tC, delta);
+                    pix[-1] = byteclip_deblock(p0 + delta);
+                    pix[ 0] = byteclip_deblock(q0 - delta);
+                }
+                pix += stride;
+            } while (--cloop);
+        } else
+        {
+            pix += 4*stride;
+        }
+    }
+}
+
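+// Strong (bS == 4) luma filter across a horizontal edge, 16 columns: when the
+// edge is flat enough (|p0-q0| < alpha/4 + 2 and low side activity) the long
+// 5/3-tap smoothing is used, otherwise the short 3-tap average.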
+static void deblock_luma_h_s4(uint8_t *pix, int stride, int alpha, int beta)
+{
+    int p3, p2, p1, p0, q0, q1, q2, q3;
+    int ap, aq, cloop = 16;
+    do
+    {
+        int abs_p0_q0, abs_p1_p0, abs_q1_q0;
+        p1 = pix[-2*stride];
+        p0 = pix[-1*stride];
+        q0 = pix[ 0*stride];
+        q1 = pix[ 1*stride];
+        abs_p0_q0 = ABS(p0 - q0);
+        abs_p1_p0 = ABS(p1 - p0);
+        abs_q1_q0 = ABS(q1 - q0);
+        if (abs_p0_q0 < alpha && abs_p1_p0 < beta && abs_q1_q0 < beta)
+        {
+            int short_p = (2*p1 + p0 + q1 + 2);
+            int short_q = (2*q1 + q0 + p1 + 2);
+
+            if (abs_p0_q0 < ((alpha>>2)+2))
+            {
+                p2 = pix[-3*stride];
+                q2 = pix[ 2*stride];
+                ap = ABS(p2 - p0);
+                aq = ABS(q2 - q0);
+                if (ap < beta)
+                {
+                    int t = p2 + p1 + p0 + q0 + 2;
+                    p3 = pix[-4*stride];
+                    short_p += t - p1 + q0; // -> (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
+                    short_p >>= 1;
+                    pix[-2*stride] = (pix_t)(t >> 2);
+                    pix[-3*stride] = (pix_t)((2*p3 + 2*p2 + t + 2) >> 3); // == (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
+                }
+                if (aq < beta)
+                {
+                    int t = q2 + q1 + p0 + q0 + 2;
+                    q3 = pix[ 3*stride];
+                    short_q += (t - q1 + p0); // -> (q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4) >> 3
+                    short_q >>= 1;
+                    pix[ 1*stride] = (pix_t)(t >> 2);
+                    pix[ 2*stride] = (pix_t)((2*q3 + 2*q2 + t + 2) >> 3); // == (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3
+                }
+            }
+            pix[-1*stride] = (pix_t)(short_p >> 2);
+            pix[ 0*stride] = (pix_t)(short_q >> 2);
+        }
+        pix += 1;
+    } while (--cloop);
+}
+
+static void deblock_luma_v_s4(uint8_t *pix, int stride, int alpha, int beta)
+{
+    int p3, p2, p1, p0, q0, q1, q2, q3;
+    int ap, aq, cloop = 16;
+    do
+    {
+        p2 = pix[-3];
+        p1 = pix[-2];
+        p0 = pix[-1];
+        q0 = pix[ 0];
+        q1 = pix[ 1];
+        q2 = pix[ 2];
+        if (ABS(p0 - q0) < alpha && ABS(p1 - p0) < beta && ABS(q1 - q0) < beta)
+        {
+            ap = ABS(p2 - p0);
+            aq = ABS(q2 - q0);
+
+            if (ap < beta && ABS(p0 - q0) < ((alpha >> 2) + 2))
+            {
+                p3 = pix[-4];
+                pix[-1] = (pix_t)((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3);
+                pix[-2] = (pix_t)((p2 + p1 + p0 + q0 + 2) >> 2);
+                pix[-3] = (pix_t)((2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3);
+            } else
+            {
+                pix[-1] = (pix_t)((2*p1 + p0 + q1 + 2) >> 2);
+            }
+
+            if (aq < beta && ABS(p0 - q0) < ((alpha >> 2) + 2))
+            {
+                q3 = pix[ 3];
+                pix[ 0] = (pix_t)((q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4) >> 3);
+                pix[ 1] = (pix_t)((q2 + q1 + p0 + q0 + 2) >> 2);
+                pix[ 2] = (pix_t)((2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3);
+            } else
+            {
+                pix[ 0] = (pix_t)((2*q1 + q0 + p1 + 2) >> 2);
+            }
+        }
+        pix += stride;
+    } while (--cloop);
+}
+
+static void deblock_luma_h(uint8_t *pix, int stride, int alpha, int beta, const uint8_t *pthr, const uint8_t *pstr)
+{
+    int p2, p1, p0, q0, q1, q2;
+    int ap, aq, delta, i;
+    for (i = 0; i < 4; i++)
+    {
+        if (pstr[i])
+        {
+            int cloop = 4;
+            int thr = pthr[i];
+            do
+            {
+                p1 = pix[-2*stride];
+                p0 = pix[-1*stride];
+                q0 = pix[ 0*stride];
+                q1 = pix[ 1*stride];
+
+                //if (ABS(p0-q0) < alpha && ABS(p1-p0) < beta && ABS(q1-q0) < beta)
+                if (((ABS(p0-q0) - alpha) & (ABS(p1-p0) - beta) & (ABS(q1-q0) - beta)) < 0)
+                {
+                    int tC, sp, sq, d2;
+                    p2 = pix[-3*stride];
+                    q2 = pix[ 2*stride];
+                    ap = ABS(p2 - p0);
+                    aq = ABS(q2 - q0);
+                    delta = (((q0 - p0)*4) + (p1 - q1) + 4) >> 3;
+
+                    sp = (ap - beta) >> 31;
+                    d2 = (((p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1) & sp;
+                    d2 = clip_range(thr, d2);
+                    pix[-2*stride] = (pix_t)(p1 + d2);
+
+                    sq = (aq - beta) >> 31;
+                    d2 = (((q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1) & sq;
+                    d2 = clip_range(thr, d2);
+                    pix[ 1*stride] = (pix_t)(q1 + d2);
+
+                    tC = thr - sp - sq;
+                    delta = clip_range(tC, delta);
+
+                    pix[-1*stride] = byteclip_deblock(p0 + delta);
+                    pix[ 0*stride] = byteclip_deblock(q0 - delta);
+                }
+                pix += 1;
+            } while (--cloop);
+        } else
+        {
+            pix += 4;
+        }
+    }
+}
+
+static void deblock_chroma_v(uint8_t *pix, int32_t stride, int a, int b, const uint8_t *thr, const uint8_t *str)
+{
+    int i;
+    for (i = 0; i < 8; i++)
+    {
+        deblock_chroma(pix, 1, a, b, thr[i >> 1], str[i >> 1]);
+        pix += stride;
+    }
+}
+
+static void deblock_chroma_h(uint8_t *pix, int32_t stride, int a, int b, const uint8_t *thr, const uint8_t *str)
+{
+    int i;
+    for (i = 0; i < 8; i++)
+    {
+        deblock_chroma(pix, stride, a, b, thr[i >> 1], str[i >> 1]);
+        pix += 1;
+    }
+}
+
+static void h264e_deblock_chroma(uint8_t *pix, int32_t stride, const deblock_params_t *par)
+{
+    const uint8_t *alpha = par->alpha;
+    const uint8_t *beta  = par->beta;
+    const uint8_t *thr   = par->tc0;
+    const uint8_t *strength = (uint8_t *)par->strength32;
+    int a,b,x,y;
+    a = alpha[0];
+    b = beta[0];
+    for (x = 0; x < 16; x += 8)
+    {
+        uint32_t str = *(uint32_t*)&strength[x];
+        if (str && a)
+        {
+            deblock_chroma_v(pix + (x >> 1), stride, a, b, thr + x, strength + x);
+        }
+        a = alpha[1];
+        b = beta[1];
+    }
+    thr += 16;
+    strength += 16;
+    a = alpha[2];
+    b = beta[2];
+    for (y = 0; y < 16; y += 8)
+    {
+        uint32_t str = *(uint32_t*)&strength[y];
+        if (str && a)
+        {
+            deblock_chroma_h(pix, stride, a, b, thr + y, strength + y);
+        }
+        pix += 4*stride;
+        a = alpha[3];
+        b = beta[3];
+    }
+}
+
+static void h264e_deblock_luma(uint8_t *pix, int32_t stride, const deblock_params_t *par)
+{
+    const uint8_t *alpha = par->alpha;
+    const uint8_t *beta  = par->beta;
+    const uint8_t *thr   = par->tc0;
+    const uint8_t *strength = (uint8_t *)par->strength32;
+    int a = alpha[0];
+    int b = beta[0];
+    int x, y;
+    for (x = 0; x < 16; x += 4)
+    {
+        uint32_t str = *(uint32_t*)&strength[x];
+        if ((uint8_t)str == 4)
+        {
+            deblock_luma_v_s4(pix + x, stride, a, b);
+        } else if (str && a)
+        {
+            deblock_luma_v(pix + x, stride, a, b, thr + x, strength + x);
+        }
+        a = alpha[1];
+        b = beta[1];
+    }
+    a = alpha[2];
+    b = beta[2];
+    thr += 16;
+    strength += 16;
+    for (y = 0; y < 16; y += 4)
+    {
+        uint32_t str = *(uint32_t*)&strength[y];
+        if ((uint8_t)str == 4)
+        {
+            deblock_luma_h_s4(pix, stride, a, b);
+        } else if (str && a)
+        {
+            deblock_luma_h(pix, stride, a, b, thr + y, strength + y);
+        }
+        a = alpha[3];
+        b = beta[3];
+        pix += 4*stride;
+    }
+}
+
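+// Temporal denoiser: blend each interior pixel of the current frame with the
+// previous one, with a gain from g_diff_to_gainQ8 driven by the pixel difference
+// and the 4-neighbourhood difference; the denoised result accumulates in frmprev.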
+static void h264e_denoise_run(unsigned char *frm, unsigned char *frmprev, int w, int h_arg, int stride_frm, int stride_frmprev)
+{
+    int cloop, h = h_arg;
+    if (w <= 2 || h <= 2)
+    {
+        return;
+    }
+    w -= 2;
+    h -= 2;
+
+    do
+    {
+        unsigned char *pf = frm += stride_frm;
+        unsigned char *pp = frmprev += stride_frmprev;
+        cloop = w;
+        pp[-stride_frmprev] = *pf++;
+        pp++;
+        do
+        {
+            int d, neighbourhood;
+            unsigned g, gd, gn, out_val;
+            d = pf[0] - pp[0];
+            neighbourhood  = pf[-1]      - pp[-1];
+            neighbourhood += pf[+1]      - pp[+1];
+            neighbourhood += pf[-stride_frm] - pp[-stride_frmprev];
+            neighbourhood += pf[+stride_frm] - pp[+stride_frmprev];
+
+            if (d < 0)
+            {
+                d = -d;
+            }
+            if (neighbourhood < 0)
+            {
+                neighbourhood = -neighbourhood;
+            }
+            neighbourhood >>= 2;
+
+            gd = g_diff_to_gainQ8[d];
+            gn = g_diff_to_gainQ8[neighbourhood];
+
+            gn <<= 2;
+            if (gn > 255)
+            {
+                gn = 255;
+            }
+
+            gn = 255 - gn;
+            gd = 255 - gd;
+            g = gn*gd;  // Q8*Q8 = Q16;
+
+            //out_val = ((pp[0]*g ) >> 16) + (((0xffff-g)*pf[0] ) >> 16);
+            //out_val = ((pp[0]*g + (1<<15)) >> 16) + (((0xffff-g)*pf[0]  + (1<<15)) >> 16);
+            out_val = (pp[0]*g + (0xffff - g)*pf[0]  + (1 << 15)) >> 16;
+
+            assert(out_val <= 255);
+
+            pp[-stride_frmprev] = (unsigned char)out_val;
+            //pp[-stride_frmprev] = gd;//(unsigned char)((neighbourhood+1)>255?255:(neighbourhood+1));
+
+            pf++, pp++;
+        } while (--cloop);
+
+        pp[-stride_frmprev] = *pf;
+    } while(--h);
+
+    memcpy(frmprev + stride_frmprev, frm + stride_frm, w + 2);
+    h = h_arg - 2;
+    do
+    {
+        memcpy(frmprev, frmprev - stride_frmprev, w + 2);
+        frmprev -= stride_frmprev;
+    } while(--h);
+    memcpy(frmprev, frm - stride_frm*(h_arg - 2), w + 2);
+}
+
+#undef IS_NULL
+#define IS_NULL(p) ((p) < (pix_t *)32)
+
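+// DC predictor: average the available left/top neighbours (either may be NULL),
+// fall back to 128 when neither is available, and replicate the result into all
+// four bytes of the returned word.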
+static uint32_t intra_predict_dc(const pix_t *left, const pix_t *top, int log_side)
+{
+    unsigned dc = 0, side = 1u << log_side, round = 0;
+    do
+    {
+        if (!IS_NULL(left))
+        {
+            int cloop = side;
+            round += side >> 1;
+            do
+            {
+                dc += *left++;
+                dc += *left++;
+                dc += *left++;
+                dc += *left++;
+            } while(cloop -= 4);
+        }
+        left = top;
+        top = NULL;
+    } while (left);
+    dc += round;
+    if (round == side)
+        dc >>= 1;
+    dc >>= log_side;
+    if (!round) dc = 128;
+    return dc * 0x01010101;
+}
+
+/*
+ * Note: To make the code more readable we refer to the neighboring pixels
+ * in variables named as below:
+ *
+ *    UL U0 U1 U2 U3 U4 U5 U6 U7
+ *    L0 xx xx xx xx
+ *    L1 xx xx xx xx
+ *    L2 xx xx xx xx
+ *    L3 xx xx xx xx
+ */
+#define UL edge[-1]
+#define U0 edge[0]
+#define T1 edge[1]
+#define U2 edge[2]
+#define U3 edge[3]
+#define U4 edge[4]
+#define U5 edge[5]
+#define U6 edge[6]
+#define U7 edge[7]
+#define L0 edge[-2]
+#define L1 edge[-3]
+#define L2 edge[-4]
+#define L3 edge[-5]
+
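+// 16x16 intra prediction into a 16-byte-stride block: mode 0 copies the row
+// above (vertical), mode 1 replicates the left column (horizontal), any other
+// mode uses the DC average; plane prediction is not produced here.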
+static void h264e_intra_predict_16x16(pix_t *predict,  const pix_t *left, const pix_t *top, int mode)
+{
+    int cloop = 16;
+    uint32_t *d = (uint32_t*)predict;
+    assert(IS_ALIGNED(predict, 4));
+    assert(IS_ALIGNED(top, 4));
+    if (mode != 1)
+    {
+        uint32_t t0, t1, t2, t3;
+        if (mode < 1)
+        {
+            t0 = ((uint32_t*)top)[0];
+            t1 = ((uint32_t*)top)[1];
+            t2 = ((uint32_t*)top)[2];
+            t3 = ((uint32_t*)top)[3];
+        } else //(mode == 2)
+        {
+            t0 = t1 = t2 = t3 = intra_predict_dc(left, top, 4);
+        }
+        do
+        {
+            *d++ = t0;
+            *d++ = t1;
+            *d++ = t2;
+            *d++ = t3;
+        } while (--cloop);
+    } else //if (mode == 1)
+    {
+        do
+        {
+            uint32_t val = *left++ * 0x01010101u;
+            *d++ = val;
+            *d++ = val;
+            *d++ = val;
+            *d++ = val;
+        } while (--cloop);
+    }
+}
+
+static void h264e_intra_predict_chroma(pix_t *predict, const pix_t *left, const pix_t *top, int mode)
+{
+    int cloop = 8;
+    uint32_t *d = (uint32_t*)predict;
+    assert(IS_ALIGNED(predict, 4));
+    assert(IS_ALIGNED(top, 4));
+    if (mode < 1)
+    {
+        uint32_t t0, t1, t2, t3;
+        t0 = ((uint32_t*)top)[0];
+        t1 = ((uint32_t*)top)[1];
+        t2 = ((uint32_t*)top)[2];
+        t3 = ((uint32_t*)top)[3];
+        do
+        {
+            *d++ = t0;
+            *d++ = t1;
+            *d++ = t2;
+            *d++ = t3;
+        } while (--cloop);
+    } else if (mode == 1)
+    {
+        do
+        {
+            uint32_t u = left[0] * 0x01010101u;
+            uint32_t v = left[8] * 0x01010101u;
+            d[0] = u;
+            d[1] = u;
+            d[2] = v;
+            d[3] = v;
+            d += 4;
+            left++;
+        } while(--cloop);
+    } else //if (mode == 2)
+    {
+        int ccloop = 2;
+        cloop = 2;
+        do
+        {
+            d[0] = d[1] = d[16] = intra_predict_dc(left, top, 2);
+            d[17] = intra_predict_dc(left + 4, top + 4, 2);
+            if (!IS_NULL(top))
+            {
+                d[1] = intra_predict_dc(NULL, top + 4, 2);
+            }
+            if (!IS_NULL(left))
+            {
+                d[16] = intra_predict_dc(NULL, left + 4, 2);
+            }
+            d += 2;
+            left += 8;
+            top += 8;
+        } while(--cloop);
+
+        do
+        {
+            cloop = 12;
+            do
+            {
+                *d = d[-4];
+                d++;
+            } while(--cloop);
+            d += 4;
+        } while(--ccloop);
+    }
+}
+
+static int pix_sad_4(uint32_t r0, uint32_t r1, uint32_t r2, uint32_t r3,
+                     uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3)
+{
+#if defined(__arm__)
+    int sad = __usad8(r0, x0);
+    sad = __usada8(r1, x1, sad);
+    sad = __usada8(r2, x2, sad);
+    sad = __usada8(r3, x3, sad);
+    return sad;
+#else
+    int c, sad = 0;
+    for (c = 0; c < 4; c++)
+    {
+        int d = (r0 & 0xff) - (x0 & 0xff); r0 >>= 8; x0 >>= 8;
+        sad += ABS(d);
+    }
+    for (c = 0; c < 4; c++)
+    {
+        int d = (r1 & 0xff) - (x1 & 0xff); r1 >>= 8; x1 >>= 8;
+        sad += ABS(d);
+    }
+    for (c = 0; c < 4; c++)
+    {
+        int d = (r2 & 0xff) - (x2 & 0xff); r2 >>= 8; x2 >>= 8;
+        sad += ABS(d);
+    }
+    for (c = 0; c < 4; c++)
+    {
+        int d = (r3 & 0xff) - (x3 & 0xff); r3 >>= 8; x3 >>= 8;
+        sad += ABS(d);
+    }
+    return sad;
+#endif
+}
+
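+// Try the 4x4 intra modes permitted by the available neighbours (DC always;
+// 0/3/7 need the top row, 1/8 the left column, 4/5/6 top+left+top-left), keep
+// the best predictor in blockpred, and return best_mode + (best_sad << 4).
+// Modes other than the most probable one (mpred) pay 'penalty'.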
+static int h264e_intra_choose_4x4(const pix_t *blockin, pix_t *blockpred, int avail, const pix_t *edge, int mpred, int penalty)
+{
+    int sad, best_sad, best_m = 2;
+
+    uint32_t r0, r1, r2, r3;
+    uint32_t x0, x1, x2, x3, x;
+
+    r0 = ((uint32_t *)blockin)[ 0];
+    r1 = ((uint32_t *)blockin)[ 4];
+    r2 = ((uint32_t *)blockin)[ 8];
+    r3 = ((uint32_t *)blockin)[12];
+#undef TEST
+#define TEST(mode) sad = pix_sad_4(r0, r1, r2, r3, x0, x1, x2, x3); \
+        if (mode != mpred) sad += penalty;    \
+        if (sad < best_sad)                   \
+        {                                     \
+            ((uint32_t *)blockpred)[ 0] = x0; \
+            ((uint32_t *)blockpred)[ 4] = x1; \
+            ((uint32_t *)blockpred)[ 8] = x2; \
+            ((uint32_t *)blockpred)[12] = x3; \
+            best_sad = sad;                   \
+            best_m = mode;                    \
+        }
+
+    // DC
+    x0 = x1 = x2 = x3 = intra_predict_dc((avail & AVAIL_L) ? &L3 : 0, (avail & AVAIL_T) ? &U0 : 0, 2);
+    best_sad = pix_sad_4(r0, r1, r2, r3, x0, x1, x2, x3);
+    if (2 != mpred)
+    {
+        best_sad += penalty;
+    }
+    ((uint32_t *)blockpred)[ 0] = x0;
+    ((uint32_t *)blockpred)[ 4] = x1;
+    ((uint32_t *)blockpred)[ 8] = x2;
+    ((uint32_t *)blockpred)[12] = x3;
+
+
+    if (avail & AVAIL_T)
+    {
+        uint32_t save = *(uint32_t*)&U4;
+        if (!(avail & AVAIL_TR))
+        {
+            *(uint32_t*)&U4 = U3*0x01010101u;
+        }
+
+        x0 = x1 = x2 = x3 = *(uint32_t*)&U0;
+        TEST(0)
+
+        x  = ((U6 + 3u*U7      + 2u) >> 2) << 24;
+        x |= ((U5 + 2u*U6 + U7 + 2u) >> 2) << 16;
+        x |= ((U4 + 2u*U5 + U6 + 2u) >> 2) << 8;
+        x |= ((U3 + 2u*U4 + U5 + 2u) >> 2);
+
+        x3 = x;
+        x = (x << 8) | ((U2 + 2u*U3 + U4 + 2u) >> 2);
+        x2 = x;
+        x = (x << 8) | ((T1 + 2u*U2 + U3 + 2u) >> 2);
+        x1 = x;
+        x = (x << 8) | ((U0 + 2u*T1 + U2 + 2u) >> 2);
+        x0 = x;
+        TEST(3)
+
+        x3 = x1;
+        x1 = x0;
+
+        x  = ((U4 + U5 + 1u) >> 1) << 24;
+        x |= ((U3 + U4 + 1u) >> 1) << 16;
+        x |= ((U2 + U3 + 1u) >> 1) << 8;
+        x |= ((T1 + U2 + 1u) >> 1);
+        x2 = x;
+        x = (x << 8) | ((U0 + T1 + 1) >> 1);
+        x0 = x;
+        TEST(7)
+
+        *(uint32_t*)&U4 = save;
+    }
+
+    if (avail & AVAIL_L)
+    {
+        x0 = 0x01010101u * L0;
+        x1 = 0x01010101u * L1;
+        x2 = 0x01010101u * L2;
+        x3 = 0x01010101u * L3;
+        TEST(1)
+
+        x = x3;
+        x <<= 16;
+        x |= ((L2 + 3u*L3 + 2u) >> 2) << 8;
+        x |= ((L2 + L3 + 1u) >> 1);
+        x2 = x;
+        x <<= 16;
+        x |= ((L1 + 2u*L2 + L3 + 2u) >> 2) << 8;
+        x |= ((L1 + L2 + 1u) >> 1);
+        x1 = x;
+        x <<= 16;
+        x |= ((L0 + 2u*L1 + L2 + 2u) >> 2) << 8;
+        x |= ((L0 + L1 + 1u) >> 1);
+        x0 = x;
+        TEST(8)
+    }
+
+    if ((avail & (AVAIL_T | AVAIL_L | AVAIL_TL)) == (AVAIL_T | AVAIL_L | AVAIL_TL))
+    {
+        uint32_t line0, line3;
+        x  = ((U3 + 2u*U2 + T1 + 2u) >> 2) << 24;
+        x |= ((U2 + 2u*T1 + U0 + 2u) >> 2) << 16;
+        x |= ((T1 + 2u*U0 + UL + 2u) >> 2) << 8;
+        x |= ((U0 + 2u*UL + L0 + 2u) >> 2);
+        line0 = x;
+        x0 = x;
+        x = (x << 8) | ((UL + 2u*L0 + L1 + 2u) >> 2);
+        x1 = x;
+        x = (x << 8) | ((L0 + 2u*L1 + L2 + 2u) >> 2);
+        x2 = x;
+        x = (x << 8) | ((L1 + 2u*L2 + L3 + 2u) >> 2);
+        x3 = x;
+        line3 = x;
+        TEST(4)
+
+        x = x0 << 8;
+        x |= ((UL + L0 + 1u) >> 1);
+        x0 = x;
+        x <<= 8;
+        x |= (line3 >> 16) & 0xff;
+        x <<= 8;
+        x |= ((L0 + L1 + 1u) >> 1);
+        x1 = x;
+        x <<= 8;
+        x |= (line3 >> 8) & 0xff;
+        x <<= 8;
+        x |= ((L1 + L2 + 1u) >> 1);
+        x2 = x;
+        x <<= 8;
+        x |= line3 & 0xff;
+        x <<= 8;
+        x |= ((L2 + L3 + 1u) >> 1);
+        x3 = x;
+        TEST(6)
+
+        x1 = line0;
+        x3 = (x1 << 8) | ((line3 >> 8) & 0xFF);
+
+        x  = ((U2 + U3 + 1u) >> 1) << 24;
+        x |= ((T1 + U2 + 1u) >> 1) << 16;
+        x |= ((U0 + T1 + 1u) >> 1) << 8;
+        x |= ((UL + U0 + 1u) >> 1);
+        x0 = x;
+        x = (x << 8) | ((line3 >> 16) & 0xFF);
+        x2 = x;
+        TEST(5)
+    }
+    return best_m + (best_sad << 4);
+}
+
+static uint8_t byteclip(int x)
+{
+    if (x > 255) x = 255;
+    if (x < 0) x = 0;
+    return (uint8_t)x;
+}
+
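+// 6-tap (1,-5,20,20,-5,1) half-pel interpolation filter; s selects horizontal
+// (s = 1) or vertical (s = stride) taps.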
+static int hpel_lpf(const uint8_t *p, int s)
+{
+    return p[0] - 5*p[s] + 20*p[2*s] + 20*p[3*s] - 5*p[4*s] + p[5*s];
+}
+
+static void copy_wh(const uint8_t *src, int src_stride, uint8_t *dst, int w, int h)
+{
+    int x, y;
+    for (y = 0; y < h; y++)
+    {
+        for (x = 0; x < w; x++)
+        {
+            dst [x] = src [x];
+        }
+        dst += 16;
+        src += src_stride;
+    }
+}
+
+static void hpel_lpf_diag(const uint8_t *src, int src_stride, uint8_t *h264e_restrict dst, int w, int h)
+{
+    ALIGN(16) int16_t scratch[21 * 16] ALIGN2(16);  /* 21 rows by 16 pixels per row */
+
+    /*
+     * The intermediate values are horizontal half-pel samples, covering two
+     * rows above the block and three rows below it, so scratch holds a
+     * (w) x (h + 5) array.
+     */
+    int y, x;
+    for (y = 0; y < h + 5; y++)
+    {
+        for (x = 0; x < w; x++)
+        {
+            scratch[y * w + x] = (int16_t)hpel_lpf(src + (y - 2) * src_stride + (x - 2), 1);
+        }
+    }
+
+    /* Vertical interpolate */
+    for (y = 0; y < h; y++)
+    {
+        for (x = 0; x < w; x++)
+        {
+            int pos = y * w + x;
+            int HalfCoeff =
+                scratch [pos] -
+                5 * scratch [pos + 1 * w] +
+                20 * scratch [pos + 2 * w] +
+                20 * scratch [pos + 3 * w] -
+                5 * scratch [pos + 4 * w] +
+                scratch [pos + 5 * w];
+
+            HalfCoeff = byteclip((HalfCoeff + 512) >> 10);
+
+            dst [y * 16 + x] = (uint8_t)HalfCoeff;
+        }
+    }
+}
+
+static void hpel_lpf_hor(const uint8_t *src, int src_stride, uint8_t *h264e_restrict dst, int w, int h)
+{
+    int x, y;
+    for (y = 0; y < h; y++)
+    {
+        for (x = 0; x < w; x++)
+        {
+            dst [y * 16 + x] = byteclip((hpel_lpf(src + y * src_stride + (x - 2), 1) + 16) >> 5);
+        }
+    }
+}
+
+static void hpel_lpf_ver(const uint8_t *src, int src_stride, uint8_t *h264e_restrict dst, int w, int h)
+{
+    int y, x;
+    for (y = 0; y < h; y++)
+    {
+        for (x = 0; x < w; x++)
+        {
+            dst [y * 16 + x] = byteclip((hpel_lpf(src + (y - 2) * src_stride + x, src_stride) + 16) >> 5);
+        }
+    }
+}
+
+static void average_16x16_unalign(uint8_t *dst, const uint8_t *src1, int src1_stride)
+{
+    int x, y;
+    for (y = 0; y < 16; y++)
+    {
+        for (x = 0; x < 16; x++)
+        {
+            dst[y * 16 + x] = (uint8_t)(((uint32_t)dst [y * 16 + x] + src1[y*src1_stride + x] + 1) >> 1);
+        }
+    }
+}
+
+static void h264e_qpel_average_wh_align(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, point_t wh)
+{
+    int w = wh.s.x;
+    int h = wh.s.y;
+    int x, y;
+    for (y = 0; y < h; y++)
+    {
+        for (x = 0; x < w; x++)
+        {
+            dst[y * 16 + x] = (uint8_t)((src0[y * 16 + x] + src1[y * 16 + x] + 1) >> 1);
+        }
+    }
+}
+
+static void h264e_qpel_interpolate_luma(const uint8_t *src, int src_stride, uint8_t *h264e_restrict dst, point_t wh, point_t dxdy)
+{
+    ALIGN(16) uint8_t scratch[16*16] ALIGN2(16);
+    //  src += ((dx + 1) >> 2) + ((dy + 1) >> 2)*src_stride;            // dx == 3 ? next pixel; dy == 3 ? next row
+    //  dxdy              actions: Horizontal, Vertical, Diagonal, Average
+    //  0 1 2 3 +1        -   ha    h    ha+
+    //  1                 va  hva   hda  hv+a
+    //  2                 v   vda   d    v+da
+    //  3                 va+ h+va h+da  h+v+a
+    //  +stride
+    int32_t pos = 1 << (dxdy.s.x + 4*dxdy.s.y);
+    int dstused = 0;
+
+    if (pos == 1)
+    {
+        copy_wh(src, src_stride, dst, wh.s.x, wh.s.y);
+        return;
+    }
+    if (pos & 0xe0ee)// 1110 0000 1110 1110
+    {
+        hpel_lpf_hor(src + ((pos & 0xe000) ? src_stride : 0), src_stride, dst, wh.s.x, wh.s.y);
+        dstused++;
+    }
+    if (pos & 0xbbb0)// 1011 1011 1011 0000
+    {
+        hpel_lpf_ver(src + ((pos & 0x8880) ? 1 : 0), src_stride, dstused ? scratch : dst, wh.s.x, wh.s.y);
+        dstused++;
+    }
+    if (pos & 0x4e40)// 0100 1110 0100 0000
+    {
+        hpel_lpf_diag(src, src_stride, dstused ? scratch : dst, wh.s.x, wh.s.y);
+        dstused++;
+    }
+    if (pos & 0xfafa)// 1111 1010 1111 1010
+    {
+        assert(wh.s.x == 16 && wh.s.y == 16);
+        if (dstused == 2)
+        {
+            point_t p;
+
+            src = scratch;
+            p.u32 = 16 + (16<<16);
+
+            h264e_qpel_average_wh_align(src, dst, dst, p);
+            return;
+        } else
+        {
+            src += ((dxdy.s.x + 1) >> 2) + ((dxdy.s.y + 1) >> 2)*src_stride;
+        }
+        average_16x16_unalign(dst, src, src_stride);
+    }
+}
+
+static void h264e_qpel_interpolate_chroma(const uint8_t *src, int src_stride, uint8_t *h264e_restrict dst, point_t wh, point_t dxdy)
+{
+    /* if fractional mv is not (0, 0) */
+    if (dxdy.u32)
+    {
+        int a = (8 - dxdy.s.x) * (8 - dxdy.s.y);
+        int b = dxdy.s.x * (8 - dxdy.s.y);
+        int c = (8 - dxdy.s.x) * dxdy.s.y;
+        int d = dxdy.s.x * dxdy.s.y;
+        int h = wh.s.y;
+        do
+        {
+            int x;
+            for (x = 0; x < wh.s.x; x++)
+            {
+                dst[x] = (uint8_t)((
+                   a * src[             x] + b * src[             x + 1] +
+                   c * src[src_stride + x] + d * src[src_stride + x + 1] +
+                   32) >> 6);
+            }
+            dst += 16;
+            src += src_stride;
+        } while (--h);
+    } else
+    {
+        copy_wh(src, src_stride, dst, wh.s.x, wh.s.y);
+    }
+}
+
+static int sad_block(const pix_t *a, int a_stride, const pix_t *b, int b_stride, int w, int h)
+{
+    int r, c, sad = 0;
+    for (r = 0; r < h; r++)
+    {
+        for (c = 0; c < w; c++)
+        {
+            int d = a[c] - b[c];
+            sad += ABS(d);
+        }
+        a += a_stride;
+        b += b_stride;
+    }
+    return sad;
+}
+
+static int h264e_sad_mb_unlaign_8x8(const pix_t *a, int a_stride, const pix_t *b, int sad[4])
+{
+    sad[0] = sad_block(a,     a_stride, b,     16, 8, 8);
+    sad[1] = sad_block(a + 8, a_stride, b + 8, 16, 8, 8);
+    a += 8*a_stride;
+    b += 8*16;
+    sad[2] = sad_block(a,     a_stride, b,     16, 8, 8);
+    sad[3] = sad_block(a + 8, a_stride, b + 8, 16, 8, 8);
+    return sad[0] + sad[1] + sad[2] + sad[3];
+}
+
+static int h264e_sad_mb_unlaign_wh(const pix_t *a, int a_stride, const pix_t *b, point_t wh)
+{
+    return sad_block(a, a_stride, b, 16, wh.s.x, wh.s.y);
+}
+
+static void h264e_copy_8x8(pix_t *d, int d_stride, const pix_t *s)
+{
+    int cloop = 8;
+    assert(IS_ALIGNED(d, 8));
+    assert(IS_ALIGNED(s, 8));
+    do
+    {
+        int a = ((const int*)s)[0];
+        int b = ((const int*)s)[1];
+        ((int*)d)[0] = a;
+        ((int*)d)[1] = b;
+        s += 16;
+        d += d_stride;
+    } while(--cloop);
+}
+
+static void h264e_copy_16x16(pix_t *d, int d_stride, const pix_t *s, int s_stride)
+{
+    int cloop = 16;
+    assert(IS_ALIGNED(d, 8));
+    assert(IS_ALIGNED(s, 8));
+    do
+    {
+        int a = ((const int*)s)[0];
+        int b = ((const int*)s)[1];
+        int x = ((const int*)s)[2];
+        int y = ((const int*)s)[3];
+        ((int*)d)[0] = a;
+        ((int*)d)[1] = b;
+        ((int*)d)[2] = x;
+        ((int*)d)[3] = y;
+        s += s_stride;
+        d += d_stride;
+    } while(--cloop);
+}
+#endif /* H264E_ENABLE_PLAIN_C */
+
+#if H264E_ENABLE_PLAIN_C || (H264E_ENABLE_NEON && !defined(MINIH264_ASM))
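+// Pad the frame: replicate the left/right edge pixels into the guard columns,
+// then copy the first/last padded rows into the guard rows above and below, so
+// motion compensation may safely read outside the picture.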
+static void h264e_copy_borders(unsigned char *pic, int w, int h, int guard)
+{
+    int r, rowbytes = w + 2*guard;
+    unsigned char *d = pic - guard;
+    for (r = 0; r < h; r++, d += rowbytes)
+    {
+        memset(d, d[guard], guard);
+        memset(d + rowbytes - guard, d[rowbytes - guard - 1], guard);
+    }
+    d = pic - guard - guard*rowbytes;
+    for (r = 0; r < guard; r++)
+    {
+        memcpy(d, pic - guard, rowbytes);
+        memcpy(d + (guard + h)*rowbytes, pic - guard + (h - 1)*rowbytes, rowbytes);
+        d += rowbytes;
+    }
+}
+#endif /* H264E_ENABLE_PLAIN_C || (H264E_ENABLE_NEON && !defined(MINIH264_ASM)) */
+
+#if H264E_ENABLE_PLAIN_C
+#undef TRANSPOSE_BLOCK
+#define TRANSPOSE_BLOCK     1
+#define UNZIGSAG_IN_QUANT   0
+#define SUM_DIF(a, b) { int t = a + b; b = a - b; a = t; }
+
+static int clip_byte(int x)
+{
+    if (x > 255)
+    {
+        x = 255;
+    } else if (x < 0)
+    {
+        x = 0;
+    }
+    return x;
+}
+
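+// In-place 4x4 Hadamard transform (rows, then columns), used for the luma DC
+// coefficients.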
+static void hadamar4_2d(int16_t *x)
+{
+    int s = 1;
+    int sback = 1;
+    int16_t tmp[16];
+    int16_t *out = tmp;
+    int16_t *p = x;
+    do
+    {
+        int cloop = 4;
+        do
+        {
+            int a, b, c, d;
+            a = *p; p += 4;//s;
+            b = *p; p += 4;//s;
+            c = *p; p += 4;//s;
+            d = *p; p -= 11;//sback;
+            SUM_DIF(a, c);
+            SUM_DIF(b, d);
+            SUM_DIF(a, b);
+            SUM_DIF(c, d);
+
+            *out = (int16_t)a; out += s;
+            *out = (int16_t)c; out += s;
+            *out = (int16_t)d; out += s;
+            *out = (int16_t)b; out += sback;
+        } while (--cloop);
+        s = 5 - s;
+        sback = -11;
+        out = x;
+        p = tmp;
+    } while (s != 1);
+}
+
+static void dequant_dc(quant_t *q, int16_t *qval, int dequant, int n)
+{
+    do q++->dq[0] = (int16_t)(*qval++ * (int16_t)dequant); while (--n);
+}
+
+static void quant_dc(int16_t *qval, int16_t *deq, int16_t quant, int n, int round_q18)
+{
+#if UNZIGSAG_IN_QUANT
+    int r_minus =  (1 << 18) - round_q18;
+    static const uint8_t iscan16[16] = {0, 1, 5, 6, 2, 4, 7, 12, 3, 8, 11, 13, 9, 10, 14, 15};
+    static const uint8_t iscan4[4] = {0, 1, 2, 3};
+    const uint8_t *scan = n == 4 ? iscan4 : iscan16;
+    do
+    {
+        int v = *qval;
+        int r = v < 0 ? r_minus : round_q18;
+        deq[*scan++] = *qval++ = (v * quant + r) >> 18;
+    } while (--n);
+#else
+    int r_minus =  (1<<18) - round_q18;
+    do
+    {
+        int v = *qval;
+        int r = v < 0 ? r_minus : round_q18;
+        *deq++ = *qval++ = (v * quant + r) >> 18;
+    } while (--n);
+#endif
+}
+
+static void hadamar2_2d(int16_t *x)
+{
+    int a = x[0];
+    int b = x[1];
+    int c = x[2];
+    int d = x[3];
+    x[0] = (int16_t)(a + b + c + d);
+    x[1] = (int16_t)(a - b + c - d);
+    x[2] = (int16_t)(a + b - c - d);
+    x[3] = (int16_t)(a - b - c + d);
+}
+
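+// Luma DC path: Hadamard-transform the 16 DC coefficients stashed just below q,
+// quantize them, inverse-Hadamard, then scatter the dequantized values back into
+// dq[0] of every 4x4 block.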
+static void h264e_quant_luma_dc(quant_t *q, int16_t *deq, const uint16_t *qdat)
+{
+    int16_t *tmp = ((int16_t*)q) - 16;
+    hadamar4_2d(tmp);
+    quant_dc(tmp, deq, qdat[0], 16, 0x20000); //0x15555
+    hadamar4_2d(tmp);
+    assert(!(qdat[1] & 3));
+    // dirty trick here: shift w/o rounding, since it has no effect for qp >= 10 (or, to be precise, for qp >= 9)
+    dequant_dc(q, tmp, qdat[1] >> 2, 16);
+}
+
+static int h264e_quant_chroma_dc(quant_t *q, int16_t *deq, const uint16_t *qdat)
+{
+    int16_t *tmp = ((int16_t*)q) - 16;
+    hadamar2_2d(tmp);
+    quant_dc(tmp, deq, (int16_t)(qdat[0] << 1), 4, 0xAAAA);
+    hadamar2_2d(tmp);
+    assert(!(qdat[1] & 1));
+    dequant_dc(q, tmp, qdat[1] >> 1, 4);
+    return !!(tmp[0] | tmp[1] | tmp[2] | tmp[3]);
+}
+
+static const uint8_t g_idx2quant[16] =
+{
+    0, 2, 0, 2,
+    2, 4, 2, 4,
+    0, 2, 0, 2,
+    2, 4, 2, 4
+};
+
+#define TRANSFORM(x0, x1, x2, x3, p, s) { \
+    int t0 = x0 + x3;                     \
+    int t1 = x0 - x3;                     \
+    int t2 = x1 + x2;                     \
+    int t3 = x1 - x2;                     \
+    (p)[  0] = (int16_t)(t0 + t2);        \
+    (p)[  s] = (int16_t)(t1*2 + t3);      \
+    (p)[2*s] = (int16_t)(t0 - t2);        \
+    (p)[3*s] = (int16_t)(t1 - t3*2);      \
+}
+
+static void FwdTransformResidual4x42(const uint8_t *inp, const uint8_t *pred,
+    uint32_t inp_stride, int16_t *out)
+{
+    int i;
+    int16_t tmp[16];
+
+#if TRANSPOSE_BLOCK
+    // Transform columns
+    for (i = 0; i < 4; i++, pred++, inp++)
+    {
+        int f0 = inp[0] - pred[0];
+        int f1 = inp[1*inp_stride] - pred[1*16];
+        int f2 = inp[2*inp_stride] - pred[2*16];
+        int f3 = inp[3*inp_stride] - pred[3*16];
+        TRANSFORM(f0, f1, f2, f3, tmp + i*4, 1);
+    }
+    // Transform rows
+    for (i = 0; i < 4; i++)
+    {
+        int d0 = tmp[i + 0];
+        int d1 = tmp[i + 4];
+        int d2 = tmp[i + 8];
+        int d3 = tmp[i + 12];
+        TRANSFORM(d0, d1, d2, d3, out + i, 4);
+    }
+
+#else
+    /* Transform rows */
+    for (i = 0; i < 16; i += 4)
+    {
+        int d0 = inp[0] - pred[0];
+        int d1 = inp[1] - pred[1];
+        int d2 = inp[2] - pred[2];
+        int d3 = inp[3] - pred[3];
+        TRANSFORM(d0, d1, d2, d3, tmp + i, 1);
+        pred += 16;
+        inp += inp_stride;
+    }
+
+    /* Transform columns */
+    for (i = 0; i < 4; i++)
+    {
+        int f0 = tmp[i + 0];
+        int f1 = tmp[i + 4];
+        int f2 = tmp[i + 8];
+        int f3 = tmp[i + 12];
+        TRANSFORM(f0, f1, f2, f3, out + i, 4);
+    }
+#endif
+}
+
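+// In-place inverse 4x4 transform with the final (x + 32) >> 6 rounding; the
+// input is read transposed when TRANSPOSE_BLOCK is set, matching the layout
+// produced by the forward transform.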
+static void TransformResidual4x4(int16_t *pSrc)
+{
+    int i;
+    int16_t tmp[16];
+
+    /* Transform rows */
+    for (i = 0; i < 16; i += 4)
+    {
+#if TRANSPOSE_BLOCK
+        int d0 = pSrc[(i >> 2) + 0];
+        int d1 = pSrc[(i >> 2) + 4];
+        int d2 = pSrc[(i >> 2) + 8];
+        int d3 = pSrc[(i >> 2) + 12];
+#else
+        int d0 = pSrc[i + 0];
+        int d1 = pSrc[i + 1];
+        int d2 = pSrc[i + 2];
+        int d3 = pSrc[i + 3];
+#endif
+        int e0 = d0 + d2;
+        int e1 = d0 - d2;
+        int e2 = (d1 >> 1) - d3;
+        int e3 = d1 + (d3 >> 1);
+        int f0 = e0 + e3;
+        int f1 = e1 + e2;
+        int f2 = e1 - e2;
+        int f3 = e0 - e3;
+        tmp[i + 0] = (int16_t)f0;
+        tmp[i + 1] = (int16_t)f1;
+        tmp[i + 2] = (int16_t)f2;
+        tmp[i + 3] = (int16_t)f3;
+    }
+
+    /* Transform columns */
+    for (i = 0; i < 4; i++)
+    {
+        int f0 = tmp[i + 0];
+        int f1 = tmp[i + 4];
+        int f2 = tmp[i + 8];
+        int f3 = tmp[i + 12];
+        int g0 = f0 + f2;
+        int g1 = f0 - f2;
+        int g2 = (f1 >> 1) - f3;
+        int g3 = f1 + (f3 >> 1);
+        int h0 = g0 + g3;
+        int h1 = g1 + g2;
+        int h2 = g1 - g2;
+        int h3 = g0 - g3;
+        pSrc[i + 0] = (int16_t)((h0 + 32) >> 6);
+        pSrc[i + 4] = (int16_t)((h1 + 32) >> 6);
+        pSrc[i + 8] = (int16_t)((h2 + 32) >> 6);
+        pSrc[i + 12] = (int16_t)((h3 + 32) >> 6);
+    }
+}
+
+static int is_zero(const int16_t *dat, int i0, const uint16_t *thr)
+{
+    int i;
+    for (i = i0; i < 16; i++)
+    {
+        if ((unsigned)(dat[i] + thr[i & 7]) > (unsigned)2*thr[i & 7])
+        {
+            return 0;
+        }
+    }
+    return 1;
+}
+
+static int is_zero4(const quant_t *q, int i0, const uint16_t *thr)
+{
+    return is_zero(q[0].dq, i0, thr) &&
+           is_zero(q[1].dq, i0, thr) &&
+           is_zero(q[4].dq, i0, thr) &&
+           is_zero(q[5].dq, i0, thr);
+}
+
+static int zero_smallq(quant_t *q, int mode, const uint16_t *qdat)
+{
+    int zmask = 0;
+    int i, i0 = mode & 1, n = mode >> 1;
+    if (mode == QDQ_MODE_INTER || mode == QDQ_MODE_CHROMA)
+    {
+        for (i = 0; i < n*n; i++)
+        {
+            if (is_zero(q[i].dq, i0, qdat + OFFS_THR_1_OFF))
+            {
+                zmask |= (1 << i); //9.19
+            }
+        }
+        if (mode == QDQ_MODE_INTER)   //8.27
+        {
+            if ((~zmask & 0x0033) && is_zero4(q +  0, i0, qdat + OFFS_THR_2_OFF)) zmask |= 0x33;
+            if ((~zmask & 0x00CC) && is_zero4(q +  2, i0, qdat + OFFS_THR_2_OFF)) zmask |= (0x33 << 2);
+            if ((~zmask & 0x3300) && is_zero4(q +  8, i0, qdat + OFFS_THR_2_OFF)) zmask |= (0x33 << 8);
+            if ((~zmask & 0xCC00) && is_zero4(q + 10, i0, qdat + OFFS_THR_2_OFF)) zmask |= (0x33 << 10);
+        }
+    }
+    return zmask;
+}
+
+static int quantize(quant_t *q, int mode, const uint16_t *qdat, int zmask)
+{
+#if UNZIGSAG_IN_QUANT
+#if TRANSPOSE_BLOCK
+    // ; Zig-zag scan      Transposed zig-zag
+    // ;    0 1 5 6        0 2 3 9
+    // ;    2 4 7 C        1 4 8 A
+    // ;    3 8 B D        5 7 B E
+    // ;    9 A E F        6 C D F
+    static const unsigned char iscan16[16] = { 0, 2, 3, 9, 1, 4, 8, 10, 5, 7, 11, 14, 6, 12, 13, 15 };
+#else
+    static const unsigned char iscan16[16] = { 0, 1, 5, 6, 2, 4, 7, 12, 3, 8, 11, 13, 9, 10, 14, 15 };
+#endif
+#endif
+    int i, i0 = mode & 1, ccol, crow;
+    int nz_block_mask = 0;
+    ccol = mode >> 1;
+    crow = ccol;
+    do
+    {
+        do
+        {
+            int nz_mask = 0;
+
+            if (zmask & 1)
+            {
+                int32_t *p = (int32_t *)q->qv;
+                *p++ = 0; *p++ = 0; *p++ = 0; *p++ = 0;
+                *p++ = 0; *p++ = 0; *p++ = 0; *p++ = 0;
+                USED(p);
+            } else
+            {
+                for (i = i0; i < 16; i++)
+                {
+                    int off = g_idx2quant[i];
+                    int v, round = qdat[OFFS_RND_INTER];
+
+                    if (q->dq[i] < 0) round = 0xFFFF - round;
+
+                    v = (q->dq[i]*qdat[off] + round) >> 16;
+#if UNZIGSAG_IN_QUANT
+                    if (v)
+                        nz_mask |= 1 << iscan16[i];
+                    q->qv[iscan16[i]] = (int16_t)v;
+#else
+                    if (v)
+                        nz_mask |= 1 << i;
+                    q->qv[i] = (int16_t)v;
+#endif
+                    q->dq[i] = (int16_t)(v*qdat[off + 1]);
+                }
+            }
+
+            zmask >>= 1;
+            nz_block_mask <<= 1;
+            if (nz_mask)
+                nz_block_mask |= 1;
+            q++;
+        } while (--ccol);
+        ccol = mode >> 1;
+    } while (--crow);
+    return nz_block_mask;
+}
+
+static void transform(const pix_t *inp, const pix_t *pred, int inp_stride, int mode, quant_t *q)
+{
+    int crow = mode >> 1;
+    int ccol = crow;
+
+    do
+    {
+        do
+        {
+            FwdTransformResidual4x42(inp, pred, inp_stride, q->dq);
+            q++;
+            inp += 4;
+            pred += 4;
+        } while (--ccol);
+        ccol = mode >> 1;
+        inp += 4*(inp_stride - ccol);
+        pred += 4*(16 - ccol);
+    } while (--crow);
+}
+
+static int h264e_transform_sub_quant_dequant(const pix_t *inp, const pix_t *pred, int inp_stride, int mode, quant_t *q, const uint16_t *qdat)
+{
+    int zmask;
+    transform(inp, pred, inp_stride, mode, q);
+    if (mode & 1) // QDQ_MODE_INTRA_16 || QDQ_MODE_CHROMA
+    {
+        int cloop = (mode >> 1)*(mode >> 1);
+        short *dc = ((short *)q) - 16;
+        quant_t *pq = q;
+        do
+        {
+            *dc++ = pq->dq[0];
+            pq++;
+        } while (--cloop);
+    }
+    zmask = zero_smallq(q, mode, qdat);
+    return quantize(q, mode, qdat, zmask);
+}
+
+static void h264e_transform_add(pix_t *out, int out_stride, const pix_t *pred, quant_t *q, int side, int32_t mask)
+{
+    int crow = side;
+    int ccol = crow;
+
+    assert(IS_ALIGNED(out, 4));
+    assert(IS_ALIGNED(pred, 4));
+    assert(!(out_stride % 4));
+
+    do
+    {
+        do
+        {
+            if (mask >= 0)
+            {
+                // copy 4x4
+                pix_t *dst = out;
+                *(uint32_t*)dst = *(uint32_t*)(pred + 0 * 16); dst += out_stride;
+                *(uint32_t*)dst = *(uint32_t*)(pred + 1 * 16); dst += out_stride;
+                *(uint32_t*)dst = *(uint32_t*)(pred + 2 * 16); dst += out_stride;
+                *(uint32_t*)dst = *(uint32_t*)(pred + 3 * 16);
+            } else
+            {
+                int i, j;
+                TransformResidual4x4(q->dq);
+                for (j = 0; j < 4; j++)
+                {
+                    for (i = 0; i < 4; i++)
+                    {
+                        int Value = q->dq[j * 4 + i] + pred[j * 16 + i];
+                        out[j * out_stride + i] = (pix_t)clip_byte(Value);
+                    }
+                }
+            }
+            mask = (uint32_t)mask << 1;
+            q++;
+            out += 4;
+            pred += 4;
+        } while (--ccol);
+        ccol = side;
+        out += 4*(out_stride - ccol);
+        pred += 4*(16 - ccol);
+    } while (--crow);
+}
+#endif /* H264E_ENABLE_PLAIN_C */
+
+#if H264E_ENABLE_PLAIN_C || (H264E_ENABLE_NEON && !defined(MINIH264_ASM))
+
+#define BS_BITS 32
+
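+// Bit writer: bits accumulate MSB-first in a 32-bit cache; whenever the cache
+// fills it is written to the output buffer through SWAP32 (big-endian order).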
+static void h264e_bs_put_bits(bs_t *bs, unsigned n, unsigned val)
+{
+    assert(!(val >> n));
+    bs->shift -= n;
+    assert((unsigned)n <= 32);
+    if (bs->shift < 0)
+    {
+        assert(-bs->shift < 32);
+        bs->cache |= val >> -bs->shift;
+        *bs->buf++ = SWAP32(bs->cache);
+        bs->shift = 32 + bs->shift;
+        bs->cache = 0;
+    }
+    bs->cache |= val << bs->shift;
+}
+
+static void h264e_bs_flush(bs_t *bs)
+{
+    *bs->buf = SWAP32(bs->cache);
+}
+
+static unsigned h264e_bs_get_pos_bits(const bs_t *bs)
+{
+    unsigned pos_bits = (unsigned)((bs->buf - bs->origin)*BS_BITS);
+    pos_bits += BS_BITS - bs->shift;
+    assert((int)pos_bits >= 0);
+    return pos_bits;
+}
+
+static unsigned h264e_bs_byte_align(bs_t *bs)
+{
+    int pos = h264e_bs_get_pos_bits(bs);
+    h264e_bs_put_bits(bs, -pos & 7, 0);
+    return pos + (-pos & 7);
+}
+
+/**
+*   Golomb code
+*   0 => 1
+*   1 => 01 0
+*   2 => 01 1
+*   3 => 001 00
+*   4 => 001 01
+*
+*   [0]     => 1
+*   [1..2]  => 01x
+*   [3..6]  => 001xx
+*   [7..14] => 0001xxx
+*
+*/
+static void h264e_bs_put_golomb(bs_t *bs, unsigned val)
+{
+#ifdef __arm__
+    int size = 32 - __clz(val + 1);
+#else
+    int size = 0;
+    unsigned t = val + 1;
+    do
+    {
+        size++;
+    } while (t >>= 1);
+#endif
+    h264e_bs_put_bits(bs, 2*size - 1, val + 1);
+}
+
+/**
+*   signed Golomb code.
+*   mapping to unsigned code:
+*       0 => 0
+*       1 => 1
+*      -1 => 2
+*       2 => 3
+*      -2 => 4
+*       3 => 5
+*      -3 => 6
+*/
+static void h264e_bs_put_sgolomb(bs_t *bs, int val)
+{
+    val = 2*val - 1;
+    val ^= val >> 31;
+    h264e_bs_put_golomb(bs, val);
+}
+
+static void h264e_bs_init_bits(bs_t *bs, void *data)
+{
+    bs->origin = data;
+    bs->buf = bs->origin;
+    bs->shift = BS_BITS;
+    bs->cache = 0;
+}
+
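+// CAVLC residual coding (9.2): collect non-zero levels and runs in reverse scan
+// order, emit coeff_token (context from the left/top neighbour nnz counts),
+// trailing-one signs, the remaining levels with adaptive suffix length, then
+// total_zeros and run_before.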
+static void h264e_vlc_encode(bs_t *bs, int16_t *quant, int maxNumCoeff, uint8_t *nz_ctx)
+{
+    int nnz_context, nlevels, nnz; // nnz = nlevels + trailing_ones
+    int trailing_ones = 0;
+    int trailing_ones_sign = 0;
+    uint8_t runs[16];
+    uint8_t *prun = runs;
+    int16_t *levels;
+    int cloop = maxNumCoeff; USED(cloop);
+    BS_OPEN(bs)
+
+#if H264E_ENABLE_SSE2 || (H264E_ENABLE_PLAIN_C && !H264E_ENABLE_NEON)
+    // this branch is used in the SSE and plain-C configurations
+    int16_t zzquant[16];
+    levels = zzquant + ((maxNumCoeff == 4) ? 4 : 16);
+    if (maxNumCoeff != 4)
+    {
+        int v;
+        if (maxNumCoeff == 16)
+        {
+            v = quant[15]*2; if (v) *--levels = (int16_t)v, *prun++ = 16;
+            v = quant[11]*2; if (v) *--levels = (int16_t)v, *prun++ = 15;
+            v = quant[14]*2; if (v) *--levels = (int16_t)v, *prun++ = 14;
+            v = quant[13]*2; if (v) *--levels = (int16_t)v, *prun++ = 13;
+            v = quant[10]*2; if (v) *--levels = (int16_t)v, *prun++ = 12;
+            v = quant[ 7]*2; if (v) *--levels = (int16_t)v, *prun++ = 11;
+            v = quant[ 3]*2; if (v) *--levels = (int16_t)v, *prun++ = 10;
+            v = quant[ 6]*2; if (v) *--levels = (int16_t)v, *prun++ =  9;
+            v = quant[ 9]*2; if (v) *--levels = (int16_t)v, *prun++ =  8;
+            v = quant[12]*2; if (v) *--levels = (int16_t)v, *prun++ =  7;
+            v = quant[ 8]*2; if (v) *--levels = (int16_t)v, *prun++ =  6;
+            v = quant[ 5]*2; if (v) *--levels = (int16_t)v, *prun++ =  5;
+            v = quant[ 2]*2; if (v) *--levels = (int16_t)v, *prun++ =  4;
+            v = quant[ 1]*2; if (v) *--levels = (int16_t)v, *prun++ =  3;
+            v = quant[ 4]*2; if (v) *--levels = (int16_t)v, *prun++ =  2;
+            v = quant[ 0]*2; if (v) *--levels = (int16_t)v, *prun++ =  1;
+        } else
+        {
+            v = quant[15]*2; if (v) *--levels = (int16_t)v, *prun++ = 15;
+            v = quant[11]*2; if (v) *--levels = (int16_t)v, *prun++ = 14;
+            v = quant[14]*2; if (v) *--levels = (int16_t)v, *prun++ = 13;
+            v = quant[13]*2; if (v) *--levels = (int16_t)v, *prun++ = 12;
+            v = quant[10]*2; if (v) *--levels = (int16_t)v, *prun++ = 11;
+            v = quant[ 7]*2; if (v) *--levels = (int16_t)v, *prun++ = 10;
+            v = quant[ 3]*2; if (v) *--levels = (int16_t)v, *prun++ =  9;
+            v = quant[ 6]*2; if (v) *--levels = (int16_t)v, *prun++ =  8;
+            v = quant[ 9]*2; if (v) *--levels = (int16_t)v, *prun++ =  7;
+            v = quant[12]*2; if (v) *--levels = (int16_t)v, *prun++ =  6;
+            v = quant[ 8]*2; if (v) *--levels = (int16_t)v, *prun++ =  5;
+            v = quant[ 5]*2; if (v) *--levels = (int16_t)v, *prun++ =  4;
+            v = quant[ 2]*2; if (v) *--levels = (int16_t)v, *prun++ =  3;
+            v = quant[ 1]*2; if (v) *--levels = (int16_t)v, *prun++ =  2;
+            v = quant[ 4]*2; if (v) *--levels = (int16_t)v, *prun++ =  1;
+        }
+    } else
+    {
+        int v;
+        v = quant[ 3]*2; if (v) *--levels = (int16_t)v, *prun++ = 4;
+        v = quant[ 2]*2; if (v) *--levels = (int16_t)v, *prun++ = 3;
+        v = quant[ 1]*2; if (v) *--levels = (int16_t)v, *prun++ = 2;
+        v = quant[ 0]*2; if (v) *--levels = (int16_t)v, *prun++ = 1;
+    }
+    USED(prun);
+    quant = zzquant + ((maxNumCoeff == 4) ? 4 : 16);
+    nnz = (int)(quant - levels);
+#else
+    quant += (maxNumCoeff == 4) ? 4 : 16;
+    levels = quant;
+    do
+    {
+        int v = *--quant;
+        if (v)
+        {
+            *--levels = v*2;
+            *prun++ = cloop;
+        }
+    } while (--cloop);
+    quant += maxNumCoeff;
+    nnz = quant - levels;
+#endif
+
+    if (nnz)
+    {
+        cloop = MIN(3, nnz);
+        levels = quant - 1;
+        do
+        {
+            if ((unsigned)(*levels + 2) > 4u)
+            {
+                break;
+            }
+            trailing_ones_sign = (trailing_ones_sign << 1) | (*levels-- < 0);
+            trailing_ones++;
+        } while (--cloop);
+    }
+    nlevels = nnz - trailing_ones;
+
+    nnz_context = nz_ctx[-1] + nz_ctx[1];
+
+    nz_ctx[0] = (uint8_t)nnz;
+    if (nnz_context <= 34)
+    {
+        nnz_context = (nnz_context + 1) >> 1;
+    }
+    nnz_context &= 31;
+
+    // 9.2.1 Parsing process for total number of transform coefficient levels and trailing ones
+    {
+        int off = h264e_g_coeff_token[nnz_context];
+        int n = 6, val = h264e_g_coeff_token[off + trailing_ones + 4*nlevels];
+        if (off != 230)
+        {
+            n = (val & 15) + 1;
+            val >>= 4;
+        }
+        BS_PUT(n, val);
+    }
+
+    if (nnz)
+    {
+        if (trailing_ones)
+        {
+            BS_PUT(trailing_ones, trailing_ones_sign);
+        }
+        if (nlevels)
+        {
+            int vlcnum = 1;
+            int sym_len, prefix_len;
+
+            int sym = *levels-- - 2;
+            if (sym < 0) sym = -3 - sym;
+            if (sym >= 6) vlcnum++;
+            if (trailing_ones < 3)
+            {
+                sym -= 2;
+                if (nnz > 10)
+                {
+                    sym_len = 1;
+                    prefix_len = sym >> 1;
+                    if (prefix_len >= 15)
+                    {
+                        // or vlcnum = 1;  goto escape;
+                        prefix_len = 15;
+                        sym_len = 12;
+                    }
+                    sym -= prefix_len << 1;
+                    // bypass vlcnum advance due to sym -= 2; above
+                    goto loop_enter;
+                }
+            }
+
+            if (sym < 14)
+            {
+                prefix_len = sym;
+                sym = 0; // to avoid side effect in bitbuf
+                sym_len = 0;
+            } else if (sym < 30)
+            {
+                prefix_len = 14;
+                sym_len = 4;
+                sym -= 14;
+            } else
+            {
+                vlcnum = 1;
+                goto escape;
+            }
+            goto loop_enter;
+
+            for (;;)
+            {
+                sym_len = vlcnum;
+                prefix_len = sym >> vlcnum;
+                if (prefix_len >= 15)
+                {
+escape:
+                    prefix_len = 15;
+                    sym_len = 12;
+                }
+                sym -= prefix_len << vlcnum;
+
+                if (prefix_len >= 3 && vlcnum < 6)
+                    vlcnum++;
+loop_enter:
+                sym |= 1 << sym_len;
+                sym_len += prefix_len + 1;
+                BS_PUT(sym_len, sym);
+                if (!--nlevels) break;
+                sym = *levels-- - 2;
+                if (sym < 0) sym = -3 - sym;
+            }
+        }
+
+        if (nnz < maxNumCoeff)
+        {
+            const uint8_t *vlc = (maxNumCoeff == 4) ? h264e_g_total_zeros_cr_2x2 : h264e_g_total_zeros;
+            uint8_t *run = runs;
+            int run_prev = *run++;
+            int nzeros = run_prev - nnz;
+            int zeros_left = 2*nzeros - 1;
+            int ctx = nnz - 1;
+            run[nnz - 1] = (uint8_t)maxNumCoeff; // terminator
+            for (;;)
+            {
+                int t;
+
+                int val = vlc[vlc[ctx] + nzeros];
+                int n = val & 15;
+                val >>= 4;
+                BS_PUT(n, val);
+
+                zeros_left -= nzeros;
+                if (zeros_left < 0)
+                {
+                    break;
+                }
+
+                t = *run++;
+                nzeros = run_prev - t - 1;
+                if (nzeros < 0)
+                {
+                    break;
+                }
+                run_prev = t;
+                assert(zeros_left < 14);
+                vlc = h264e_g_run_before;
+                ctx = zeros_left;
+            }
+        }
+    }
+    BS_CLOSE(bs);
+}
+#endif /* H264E_ENABLE_PLAIN_C || (H264E_ENABLE_NEON && !defined(MINIH264_ASM)) */
+
+#if H264E_SVC_API
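+/**
+*   Small unsigned shift-and-subtract division over 16 bit positions;
+*   note: only valid while the quotient fits in 16 bits (a sketch of the contract, inferred from the loop bound)
+*/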
+static uint32_t udiv32(uint32_t n, uint32_t d)
+{
+    uint32_t q = 0, r = n, N = 16;
+    do
+    {
+        N--;
+        if ((r >> N) >= d)
+        {
+            r -= (d << N);
+            q += (1 << N);
+        }
+    } while (N);
+    return q;
+}
+
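+/**
+*   Copy an 8x8 pixel block, two 32-bit words per row; both pointers must be 8-byte aligned
+*/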
+static void h264e_copy_8x8_s(pix_t *d, int d_stride, const pix_t *s, int s_stride)
+{
+    int cloop = 8;
+    assert(!((unsigned)(uintptr_t)d & 7));
+    assert(!((unsigned)(uintptr_t)s & 7));
+    do
+    {
+        int a = ((const int*)s)[0];
+        int b = ((const int*)s)[1];
+        ((int*)d)[0] = a;
+        ((int*)d)[1] = b;
+        s += s_stride;
+        d += d_stride;
+    } while(--cloop);
+}
+
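+/**
+*   Bilinear frame downsampling in Q12 fixed point; the last output column/row is
+*   replicated to fill the uncropped part of the destination
+*/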
+static void h264e_frame_downsampling(uint8_t *out, int wo, int ho,
+    const uint8_t *src, int wi, int hi, int wo_Crop, int ho_Crop, int wi_Crop, int hi_Crop)
+{
+#define Q_BILIN 12
+#define ONE_BILIN (1<<Q_BILIN)
+    int r, c;
+    int scaleh = udiv32(hi_Crop<<Q_BILIN, ho_Crop);
+    int scalew = udiv32(wi_Crop<<Q_BILIN, wo_Crop);
+
+    for (r = 0; r < ho_Crop; r++)
+    {
+        int dy = r*scaleh + (scaleh >> 2);
+        int y = dy >> Q_BILIN;
+        dy = dy & (ONE_BILIN - 1);
+
+        for (c = 0; c < wo_Crop; c++)
+        {
+            int dx = c*scalew + (scalew >> 2);
+            //          int dx = c*scalew;
+            int x = dx >> Q_BILIN;
+            const uint8_t *s0, *s1;
+            uint8_t s00, s01, s10, s11;
+            dx &= (ONE_BILIN - 1);
+
+
+            s1 = s0 = src + x + y*wi;
+            if (y < hi - 1)
+            {
+                s1 = s0 + wi;
+            }
+
+            s00 = s01 = s0[0];
+            s10 = s11 = s1[0];
+            if (x < wi - 1)
+            {
+                s01 = s0[1];
+                s11 = s1[1];
+            }
+
+            *out++ =(uint8_t) ((((s11*dx + s10*(ONE_BILIN - dx)) >> (Q_BILIN - 1))*dy +
+                ((s01*dx + s00*(ONE_BILIN - dx)) >> (Q_BILIN - 1))*(ONE_BILIN - dy) + (1 << (Q_BILIN + 1 - 1))) >> (Q_BILIN + 1));
+        }
+        if (wo > wo_Crop) //copy border
+        {
+            int cloop = wo - wo_Crop;
+            uint8_t border = out[-1];
+            do
+            {
+                *out++ = border;
+            } while(--cloop);
+        }
+    }
+
+    // copy bottom
+    {
+        int cloop = (ho - ho_Crop) * wo;
+        if (cloop > 0)
+        {
+            do
+            {
+                *out = out[-wo];
+                out++;
+            } while(--cloop);
+        }
+    }
+}
+
+static int clip(int val, int max)
+{
+    if (val < 0) return 0;
+    if (val > max) return max;
+    return val;
+}
+
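+// 16-phase 4-tap luma upsampling filter; each row of taps sums to 32 (Q5)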
+static const int8_t g_filter16_luma[16][4] =
+{
+    {  0, 32,  0,  0 },
+    { -1, 32,  2, -1 },
+    { -2, 31,  4, -1 },
+    { -3, 30,  6, -1 },
+    { -3, 28,  8, -1 },
+    { -4, 26, 11, -1 },
+    { -4, 24, 14, -2 },
+    { -3, 22, 16, -3 },
+    { -3, 19, 19, -3 },
+    { -3, 16, 22, -3 },
+    { -2, 14, 24, -4 },
+    { -1, 11, 26, -4 },
+    { -1,  8, 28, -3 },
+    { -1,  6, 30, -3 },
+    { -1,  4, 31, -2 },
+    { -1,  2, 32, -1 }
+};
+
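+/**
+*   Separable upsampling for the SVC path: for each output column a horizontal pass fills a
+*   temporary column buffer, then a vertical pass filters down that column
+*   (2-tap bilinear for chroma, the 4-tap g_filter16_luma filter for luma)
+*/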
+static void h264e_intra_upsampling(int srcw, int srch, int dstw, int dsth, int is_chroma,
+    const uint8_t *arg_src, int src_stride, uint8_t *arg_dst, int dst_stride)
+{
+    int i, j;
+    //===== set position calculation parameters =====
+    int shift_x = 16;//(m_iLevelIdc <= 30 ? 16 : 31 - CeilLog2(iBaseW));
+    int shift_y = 16;//(m_iLevelIdc <= 30 ? 16 : 31 - CeilLog2(iBaseH));
+    int step_x  = udiv32(((unsigned int)srcw << shift_x) + (dstw >> 1), dstw);
+    int step_y  = udiv32(((unsigned int)srch << shift_y) + (dsth >> 1), dsth);
+    int start_x = udiv32((srcw << (shift_x - 1 - is_chroma)) + (dstw >> 1), dstw) + (1 << (shift_x - 5));
+    int start_y = udiv32((srch << (shift_y - 1 - is_chroma)) + (dsth >> 1), dsth) + (1 << (shift_y - 5));
+    int16_t *temp16 = (short*)(arg_dst + dst_stride*dsth) + 4;  // scratch column placed just past the destination instead of malloc((iBaseH)*sizeof(short)); the reference frame has a 1-macroblock border
+
+    if (is_chroma)
+    {
+        int xpos = start_x - (4 << 12);
+        for (i = 0; i < dstw; i++, xpos += step_x)
+        {
+            const uint8_t* src = arg_src;
+            int xfrac  = (xpos >> 12) & 15;
+            int xint = xpos >> 16;
+            int m0 = clip(xint + 0, srcw - 1);
+            int m1 = clip(xint + 1, srcw - 1);
+            for( j = 0; j < srch ; j++ )
+            {
+                temp16[j] = (int16_t)(src[m1]*xfrac + src[m0]*(16 - xfrac));
+                src += src_stride;
+            }
+            temp16[-1] = temp16[0];
+            temp16[srch] = temp16[srch-1];
+
+            //========== vertical upsampling ===========
+            {
+                int16_t* src16 = temp16;
+                uint8_t* dst = arg_dst + i;
+                int ypos = start_y - (4 << 12);
+                for (j = 0; j < dsth; j++)
+                {
+                    int yfrac = (ypos >> 12) & 15;
+                    int yint  = (ypos >> 16);
+                    int acc = yfrac*src16[yint + 1] + (16 - yfrac)*src16[yint + 0];
+                    acc = (acc + 128) >> 8;
+                    *dst = (int8_t)acc;
+                    dst += dst_stride;
+                    ypos += step_y;
+                }
+            }
+        }
+    } else
+    {
+        int xpos = start_x - (8 << 12);
+        for (i = 0; i < dstw; i++, xpos += step_x)
+        {
+            const uint8_t *src = arg_src;
+            int xfrac    = (xpos >> 12) & 15;
+            int xint   = xpos >> 16;
+            int m0 = clip(xint - 1, srcw - 1);
+            int m1 = clip(xint    , srcw - 1);
+            int m2 = clip(xint + 1, srcw - 1);
+            int m3 = clip(xint + 2, srcw - 1);
+            //========== horizontal upsampling ===========
+            for( j = 0; j < srch ; j++ )
+            {
+                int acc = 0;
+                acc += g_filter16_luma[xfrac][0] * src[m0];
+                acc += g_filter16_luma[xfrac][1] * src[m1];
+                acc += g_filter16_luma[xfrac][2] * src[m2];
+                acc += g_filter16_luma[xfrac][3] * src[m3];
+                temp16[j] = (int16_t)acc;
+                src += src_stride;
+            }
+            temp16[-2] = temp16[-1] = temp16[0];
+            temp16[srch + 1] = temp16[srch] = temp16[srch - 1];
+
+            //========== vertical upsampling ===========
+            {
+                int16_t *src16 = temp16;
+                uint8_t *dst = arg_dst + i;
+                int ypos = start_y - (8 << 12);
+
+                for (j = 0; j < dsth; j++)
+                {
+                    int yfrac = (ypos >> 12) & 15;
+                    int yint = ypos >> 16;
+                    int acc = 512;
+                    acc += g_filter16_luma[yfrac][0] * src16[yint + 0 - 1];
+                    acc += g_filter16_luma[yfrac][1] * src16[yint + 1 - 1];
+                    acc += g_filter16_luma[yfrac][2] * src16[yint + 2 - 1];
+                    acc += g_filter16_luma[yfrac][3] * src16[yint + 3 - 1];
+                    acc >>= 10;
+                    if (acc < 0)
+                    {
+                        acc = 0;
+                    }
+                    if (acc > 255)
+                    {
+                        acc = 255;
+                    }
+                    *dst = (int8_t)acc;
+                    dst += dst_stride;
+                    ypos += step_y;
+                }
+            }
+        }
+    }
+}
+#endif /* H264E_SVC_API */
+
+// Experimental code branch:
+// Rate-control takes into account that long-term references compress worse than short-term ones
+#define H264E_RATE_CONTROL_GOLDEN_FRAMES 1
+
+/************************************************************************/
+/*      Constants (can't be changed)                                    */
+/************************************************************************/
+
+#define MIN_QP          10   // Minimum QP
+
+#define MVPRED_MEDIAN   1
+#define MVPRED_L        2
+#define MVPRED_U        3
+#define MVPRED_UR       4
+#define MV_NA           0x8000
+#define AVAIL(mv)       ((mv).u32 != MV_NA)
+
+#define SLICE_TYPE_P    0
+#define SLICE_TYPE_I    2
+
+#define NNZ_NA          64
+
+#define MAX_MV_CAND     20
+
+#define STARTCODE_4BYTES 4
+
+#define SCALABLE_BASELINE 83
+
+/************************************************************************/
+/*      Hardcoded params (can be changed at compile time)               */
+/************************************************************************/
+#define ALPHA_OFS       0       // Deblock alpha offset
+#define BETA_OFS        0       // Deblock beta offset
+#define DQP_CHROMA      0       // chroma delta QP
+
+#define MV_RANGE        32      // Motion vector search range, pixels
+#define MV_GUARD        14      // Out-of-frame MV's restriction, pixels
+
+/************************************************************************/
+/*      Code shortcuts                                                  */
+/************************************************************************/
+#define U(n,v) h264e_bs_put_bits(enc->bs, n, v)
+#define U1(v)  h264e_bs_put_bits(enc->bs, 1, v)
+#define UE(v)  h264e_bs_put_golomb(enc->bs, v)
+#define SE(v)  h264e_bs_put_sgolomb(enc->bs, v)
+#define SWAP(datatype, a, b) { datatype _ = a; a = b; b = _; }
+#define SQR(x) ((x)*(x))
+#define SQRP(pnt) SQR(pnt.s.x) + SQR(pnt.s.y)
+#define SMOOTH(smth, p) smth.s.x = (63*smth.s.x + p.s.x + 32) >> 6;  smth.s.y = (63*smth.s.y + p.s.y + 32) >> 6;
+#define MUL_LAMBDA(x, lambda) ((x)*(lambda) >> 4)
+
+/************************************************************************/
+/*      Optimized code fallback                                         */
+/************************************************************************/
+
+#if defined(MINIH264_ASM)
+#include "asm/minih264e_asm.h"
+#endif
+#if H264E_ENABLE_NEON && defined(MINIH264_ASM)
+#   define h264e_bs_put_bits_neon      h264e_bs_put_bits_arm11
+#   define h264e_bs_flush_neon         h264e_bs_flush_arm11
+#   define h264e_bs_get_pos_bits_neon  h264e_bs_get_pos_bits_arm11
+#   define h264e_bs_byte_align_neon    h264e_bs_byte_align_arm11
+#   define h264e_bs_put_golomb_neon    h264e_bs_put_golomb_arm11
+#   define h264e_bs_put_sgolomb_neon   h264e_bs_put_sgolomb_arm11
+#   define h264e_bs_init_bits_neon     h264e_bs_init_bits_arm11
+#   define h264e_vlc_encode_neon       h264e_vlc_encode_arm11
+#elif H264E_ENABLE_NEON
+#   define h264e_bs_put_bits_neon      h264e_bs_put_bits
+#   define h264e_bs_flush_neon         h264e_bs_flush
+#   define h264e_bs_get_pos_bits_neon  h264e_bs_get_pos_bits
+#   define h264e_bs_byte_align_neon    h264e_bs_byte_align
+#   define h264e_bs_put_golomb_neon    h264e_bs_put_golomb
+#   define h264e_bs_put_sgolomb_neon   h264e_bs_put_sgolomb
+#   define h264e_bs_init_bits_neon     h264e_bs_init_bits
+#   define h264e_vlc_encode_neon       h264e_vlc_encode
+#   define h264e_copy_borders_neon     h264e_copy_borders
+#endif
+
+/************************************************************************/
+/*      Declare exported functions for each configuration               */
+/************************************************************************/
+#if !H264E_CONFIGS_COUNT
+#   error no build configuration defined
+#elif H264E_CONFIGS_COUNT == 1
+//  Exactly one configuration: append config suffix to exported names
+#   if H264E_ENABLE_NEON
+#       define MAP_NAME(name) name##_neon
+#   endif
+#   if H264E_ENABLE_SSE2
+#       define MAP_NAME(name) name##_sse2
+#   endif
+#else //if H264E_CONFIGS_COUNT > 1
+//  Several configurations: use Virtual Functions Table (VFT)
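+//  (the H264E_API X-macro list below is expanded once to declare the function-pointer
+//   fields of vft_t, and once per enabled build config to emit the matching initializers,
+//   with the _neon / _sse2 suffix appended where appropriate)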
+typedef struct
+{
+#   define  H264E_API(type, name, args) type (*name) args;
+// h264e_qpel
+H264E_API(void, h264e_qpel_interpolate_chroma, (const uint8_t *src,int src_stride, uint8_t *h264e_restrict dst,point_t wh, point_t dxdy))
+H264E_API(void, h264e_qpel_interpolate_luma, (const uint8_t *src,int src_stride, uint8_t *h264e_restrict dst,point_t wh, point_t dxdy))
+H264E_API(void, h264e_qpel_average_wh_align, (const uint8_t *p0, const uint8_t *p1, uint8_t *h264e_restrict d, point_t wh))
+// h264e_deblock
+H264E_API(void, h264e_deblock_chroma, (uint8_t *pSrcDst, int32_t srcdstStep, const deblock_params_t *par))
+H264E_API(void, h264e_deblock_luma, (uint8_t *pSrcDst, int32_t srcdstStep, const deblock_params_t *par))
+// h264e_intra
+H264E_API(void, h264e_intra_predict_chroma,  (pix_t *predict, const pix_t *left, const pix_t *top, int mode))
+H264E_API(void, h264e_intra_predict_16x16, (pix_t *predict, const pix_t *left, const pix_t *top, int mode))
+H264E_API(int,  h264e_intra_choose_4x4, (const pix_t *blockin, pix_t *blockpred, int avail, const pix_t *edge, int mpred, int penalty))
+// h264e_cavlc
+H264E_API(void,     h264e_bs_put_bits, (bs_t *bs, unsigned n, unsigned val))
+H264E_API(void,     h264e_bs_flush, (bs_t *bs))
+H264E_API(unsigned, h264e_bs_get_pos_bits, (const bs_t *bs))
+H264E_API(unsigned, h264e_bs_byte_align, (bs_t *bs))
+H264E_API(void,     h264e_bs_put_golomb, (bs_t *bs, unsigned val))
+H264E_API(void,     h264e_bs_put_sgolomb, (bs_t *bs, int val))
+H264E_API(void,     h264e_bs_init_bits, (bs_t *bs, void *data))
+H264E_API(void,     h264e_vlc_encode, (bs_t *bs, int16_t *quant, int maxNumCoeff, uint8_t *nz_ctx))
+// h264e_sad
+H264E_API(int,  h264e_sad_mb_unlaign_8x8, (const pix_t *a, int a_stride, const pix_t *b, int sad[4]))
+H264E_API(int,  h264e_sad_mb_unlaign_wh, (const pix_t *a, int a_stride, const pix_t *b, point_t wh))
+H264E_API(void, h264e_copy_8x8, (pix_t *d, int d_stride, const pix_t *s))
+H264E_API(void, h264e_copy_16x16, (pix_t *d, int d_stride, const pix_t *s, int s_stride))
+H264E_API(void, h264e_copy_borders, (unsigned char *pic, int w, int h, int guard))
+// h264e_transform
+H264E_API(void, h264e_transform_add, (pix_t *out, int out_stride, const pix_t *pred, quant_t *q, int side, int32_t mask))
+H264E_API(int,  h264e_transform_sub_quant_dequant, (const pix_t *inp, const pix_t *pred, int inp_stride, int mode, quant_t *q, const uint16_t *qdat))
+H264E_API(void, h264e_quant_luma_dc, (quant_t *q, int16_t *deq, const uint16_t *qdat))
+H264E_API(int,  h264e_quant_chroma_dc, (quant_t *q, int16_t *deq, const uint16_t *qdat))
+// h264e_denoise
+H264E_API(void, h264e_denoise_run, (unsigned char *frm, unsigned char *frmprev, int w, int h, int stride_frm, int stride_frmprev))
+#   undef H264E_API
+} vft_t;
+
+// pointer to the active VFT, selected at run time by init_vft()
+static const vft_t *g_vft;
+
+// const VFT for each supported build config
+#if H264E_ENABLE_PLAIN_C
+static const vft_t g_vft_plain_c =
+{
+#define  H264E_API(type, name, args) name,
+// h264e_qpel
+H264E_API(void, h264e_qpel_interpolate_chroma, (const uint8_t *src,int src_stride, uint8_t *h264e_restrict dst,point_t wh, point_t dxdy))
+H264E_API(void, h264e_qpel_interpolate_luma, (const uint8_t *src,int src_stride, uint8_t *h264e_restrict dst,point_t wh, point_t dxdy))
+H264E_API(void, h264e_qpel_average_wh_align, (const uint8_t *p0, const uint8_t *p1, uint8_t *h264e_restrict d, point_t wh))
+// h264e_deblock
+H264E_API(void, h264e_deblock_chroma, (uint8_t *pSrcDst, int32_t srcdstStep, const deblock_params_t *par))
+H264E_API(void, h264e_deblock_luma, (uint8_t *pSrcDst, int32_t srcdstStep, const deblock_params_t *par))
+// h264e_intra
+H264E_API(void, h264e_intra_predict_chroma,  (pix_t *predict, const pix_t *left, const pix_t *top, int mode))
+H264E_API(void, h264e_intra_predict_16x16, (pix_t *predict, const pix_t *left, const pix_t *top, int mode))
+H264E_API(int,  h264e_intra_choose_4x4, (const pix_t *blockin, pix_t *blockpred, int avail, const pix_t *edge, int mpred, int penalty))
+// h264e_cavlc
+H264E_API(void,     h264e_bs_put_bits, (bs_t *bs, unsigned n, unsigned val))
+H264E_API(void,     h264e_bs_flush, (bs_t *bs))
+H264E_API(unsigned, h264e_bs_get_pos_bits, (const bs_t *bs))
+H264E_API(unsigned, h264e_bs_byte_align, (bs_t *bs))
+H264E_API(void,     h264e_bs_put_golomb, (bs_t *bs, unsigned val))
+H264E_API(void,     h264e_bs_put_sgolomb, (bs_t *bs, int val))
+H264E_API(void,     h264e_bs_init_bits, (bs_t *bs, void *data))
+H264E_API(void,     h264e_vlc_encode, (bs_t *bs, int16_t *quant, int maxNumCoeff, uint8_t *nz_ctx))
+// h264e_sad
+H264E_API(int,  h264e_sad_mb_unlaign_8x8, (const pix_t *a, int a_stride, const pix_t *b, int sad[4]))
+H264E_API(int,  h264e_sad_mb_unlaign_wh, (const pix_t *a, int a_stride, const pix_t *b, point_t wh))
+H264E_API(void, h264e_copy_8x8, (pix_t *d, int d_stride, const pix_t *s))
+H264E_API(void, h264e_copy_16x16, (pix_t *d, int d_stride, const pix_t *s, int s_stride))
+H264E_API(void, h264e_copy_borders, (unsigned char *pic, int w, int h, int guard))
+// h264e_transform
+H264E_API(void, h264e_transform_add, (pix_t *out, int out_stride, const pix_t *pred, quant_t *q, int side, int32_t mask))
+H264E_API(int,  h264e_transform_sub_quant_dequant, (const pix_t *inp, const pix_t *pred, int inp_stride, int mode, quant_t *q, const uint16_t *qdat))
+H264E_API(void, h264e_quant_luma_dc, (quant_t *q, int16_t *deq, const uint16_t *qdat))
+H264E_API(int,  h264e_quant_chroma_dc, (quant_t *q, int16_t *deq, const uint16_t *qdat))
+// h264e_denoise
+H264E_API(void, h264e_denoise_run, (unsigned char *frm, unsigned char *frmprev, int w, int h, int stride_frm, int stride_frmprev))
+#undef H264E_API
+};
+#endif
+#if H264E_ENABLE_NEON
+static const vft_t g_vft_neon =
+{
+#define  H264E_API(type, name, args) name##_neon,
+// h264e_qpel
+H264E_API(void, h264e_qpel_interpolate_chroma, (const uint8_t *src,int src_stride, uint8_t *h264e_restrict dst,point_t wh, point_t dxdy))
+H264E_API(void, h264e_qpel_interpolate_luma, (const uint8_t *src,int src_stride, uint8_t *h264e_restrict dst,point_t wh, point_t dxdy))
+H264E_API(void, h264e_qpel_average_wh_align, (const uint8_t *p0, const uint8_t *p1, uint8_t *h264e_restrict d, point_t wh))
+// h264e_deblock
+H264E_API(void, h264e_deblock_chroma, (uint8_t *pSrcDst, int32_t srcdstStep, const deblock_params_t *par))
+H264E_API(void, h264e_deblock_luma, (uint8_t *pSrcDst, int32_t srcdstStep, const deblock_params_t *par))
+// h264e_intra
+H264E_API(void, h264e_intra_predict_chroma,  (pix_t *predict, const pix_t *left, const pix_t *top, int mode))
+H264E_API(void, h264e_intra_predict_16x16, (pix_t *predict, const pix_t *left, const pix_t *top, int mode))
+H264E_API(int,  h264e_intra_choose_4x4, (const pix_t *blockin, pix_t *blockpred, int avail, const pix_t *edge, int mpred, int penalty))
+// h264e_cavlc
+H264E_API(void,     h264e_bs_put_bits, (bs_t *bs, unsigned n, unsigned val))
+H264E_API(void,     h264e_bs_flush, (bs_t *bs))
+H264E_API(unsigned, h264e_bs_get_pos_bits, (const bs_t *bs))
+H264E_API(unsigned, h264e_bs_byte_align, (bs_t *bs))
+H264E_API(void,     h264e_bs_put_golomb, (bs_t *bs, unsigned val))
+H264E_API(void,     h264e_bs_put_sgolomb, (bs_t *bs, int val))
+H264E_API(void,     h264e_bs_init_bits, (bs_t *bs, void *data))
+H264E_API(void,     h264e_vlc_encode, (bs_t *bs, int16_t *quant, int maxNumCoeff, uint8_t *nz_ctx))
+// h264e_sad
+H264E_API(int,  h264e_sad_mb_unlaign_8x8, (const pix_t *a, int a_stride, const pix_t *b, int sad[4]))
+H264E_API(int,  h264e_sad_mb_unlaign_wh, (const pix_t *a, int a_stride, const pix_t *b, point_t wh))
+H264E_API(void, h264e_copy_8x8, (pix_t *d, int d_stride, const pix_t *s))
+H264E_API(void, h264e_copy_16x16, (pix_t *d, int d_stride, const pix_t *s, int s_stride))
+H264E_API(void, h264e_copy_borders, (unsigned char *pic, int w, int h, int guard))
+// h264e_transform
+H264E_API(void, h264e_transform_add, (pix_t *out, int out_stride, const pix_t *pred, quant_t *q, int side, int32_t mask))
+H264E_API(int,  h264e_transform_sub_quant_dequant, (const pix_t *inp, const pix_t *pred, int inp_stride, int mode, quant_t *q, const uint16_t *qdat))
+H264E_API(void, h264e_quant_luma_dc, (quant_t *q, int16_t *deq, const uint16_t *qdat))
+H264E_API(int,  h264e_quant_chroma_dc, (quant_t *q, int16_t *deq, const uint16_t *qdat))
+// h264e_denoise
+H264E_API(void, h264e_denoise_run, (unsigned char *frm, unsigned char *frmprev, int w, int h, int stride_frm, int stride_frmprev))
+#undef H264E_API
+};
+#endif
+#if H264E_ENABLE_SSE2
+static const vft_t g_vft_sse2 =
+{
+#define  H264E_API(type, name, args) name##_sse2,
+// h264e_qpel
+H264E_API(void, h264e_qpel_interpolate_chroma, (const uint8_t *src,int src_stride, uint8_t *h264e_restrict dst,point_t wh, point_t dxdy))
+H264E_API(void, h264e_qpel_interpolate_luma, (const uint8_t *src,int src_stride, uint8_t *h264e_restrict dst,point_t wh, point_t dxdy))
+H264E_API(void, h264e_qpel_average_wh_align, (const uint8_t *p0, const uint8_t *p1, uint8_t *h264e_restrict d, point_t wh))
+// h264e_deblock
+H264E_API(void, h264e_deblock_chroma, (uint8_t *pSrcDst, int32_t srcdstStep, const deblock_params_t *par))
+H264E_API(void, h264e_deblock_luma, (uint8_t *pSrcDst, int32_t srcdstStep, const deblock_params_t *par))
+// h264e_intra
+H264E_API(void, h264e_intra_predict_chroma,  (pix_t *predict, const pix_t *left, const pix_t *top, int mode))
+H264E_API(void, h264e_intra_predict_16x16, (pix_t *predict, const pix_t *left, const pix_t *top, int mode))
+H264E_API(int,  h264e_intra_choose_4x4, (const pix_t *blockin, pix_t *blockpred, int avail, const pix_t *edge, int mpred, int penalty))
+// h264e_cavlc
+H264E_API(void,     h264e_bs_put_bits, (bs_t *bs, unsigned n, unsigned val))
+H264E_API(void,     h264e_bs_flush, (bs_t *bs))
+H264E_API(unsigned, h264e_bs_get_pos_bits, (const bs_t *bs))
+H264E_API(unsigned, h264e_bs_byte_align, (bs_t *bs))
+H264E_API(void,     h264e_bs_put_golomb, (bs_t *bs, unsigned val))
+H264E_API(void,     h264e_bs_put_sgolomb, (bs_t *bs, int val))
+H264E_API(void,     h264e_bs_init_bits, (bs_t *bs, void *data))
+H264E_API(void,     h264e_vlc_encode, (bs_t *bs, int16_t *quant, int maxNumCoeff, uint8_t *nz_ctx))
+// h264e_sad
+H264E_API(int,  h264e_sad_mb_unlaign_8x8, (const pix_t *a, int a_stride, const pix_t *b, int sad[4]))
+H264E_API(int,  h264e_sad_mb_unlaign_wh, (const pix_t *a, int a_stride, const pix_t *b, point_t wh))
+H264E_API(void, h264e_copy_8x8, (pix_t *d, int d_stride, const pix_t *s))
+H264E_API(void, h264e_copy_16x16, (pix_t *d, int d_stride, const pix_t *s, int s_stride))
+H264E_API(void, h264e_copy_borders, (unsigned char *pic, int w, int h, int guard))
+// h264e_transform
+H264E_API(void, h264e_transform_add, (pix_t *out, int out_stride, const pix_t *pred, quant_t *q, int side, int32_t mask))
+H264E_API(int,  h264e_transform_sub_quant_dequant, (const pix_t *inp, const pix_t *pred, int inp_stride, int mode, quant_t *q, const uint16_t *qdat))
+H264E_API(void, h264e_quant_luma_dc, (quant_t *q, int16_t *deq, const uint16_t *qdat))
+H264E_API(int,  h264e_quant_chroma_dc, (quant_t *q, int16_t *deq, const uint16_t *qdat))
+// h264e_denoise
+H264E_API(void, h264e_denoise_run, (unsigned char *frm, unsigned char *frmprev, int w, int h, int stride_frm, int stride_frmprev))
+#undef H264E_API
+};
+#endif
+
+/************************************************************************/
+/*      Code to detect CPU features and init VFT                        */
+/************************************************************************/
+
+#if H264E_ENABLE_SSE2
+#if defined(_MSC_VER)
+#define minih264_cpuid __cpuid
+#else
+static __inline__ __attribute__((always_inline)) void minih264_cpuid(int CPUInfo[], const int InfoType)
+{
+#if defined(__PIC__)
+    __asm__ __volatile__(
+#if defined(__x86_64__)
+        "push %%rbx\n"
+        "cpuid\n"
+        "xchgl %%ebx, %1\n"
+        "pop  %%rbx\n"
+#else /* defined(__x86_64__) */
+        "xchgl %%ebx, %1\n"
+        "cpuid\n"
+        "xchgl %%ebx, %1\n"
+#endif /* defined(__x86_64__) */
+        : "=a" (CPUInfo[0]), "=r" (CPUInfo[1]), "=c" (CPUInfo[2]), "=d" (CPUInfo[3])
+        : "a" (InfoType));
+#else /* defined(__PIC__) */
+    __asm__ __volatile__(
+        "cpuid"
+        : "=a" (CPUInfo[0]), "=b" (CPUInfo[1]), "=c" (CPUInfo[2]), "=d" (CPUInfo[3])
+        : "a" (InfoType));
+#endif /* defined(__PIC__)*/
+}
+#endif /* defined(_MSC_VER) */
+
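+// CPUID leaf 1: EDX bit 26 indicates SSE2 support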
+static int CPU_have_SSE2()
+{
+    int CPUInfo[4];
+    minih264_cpuid(CPUInfo, 0);
+    if (CPUInfo[0] > 0)
+    {
+        minih264_cpuid(CPUInfo, 1);
+        if (CPUInfo[3] & (1 << 26))
+            return 1;
+    }
+    return 0;
+}
+#endif
+
+static void init_vft(int enableNEON)
+{
+#if H264E_ENABLE_PLAIN_C
+    g_vft = &g_vft_plain_c;
+#endif
+    (void)enableNEON;
+#if H264E_ENABLE_NEON
+    if (enableNEON)
+        g_vft = &g_vft_neon;
+    else
+        g_vft = &g_vft_plain_c;
+#endif
+#if H264E_ENABLE_SSE2
+    if (CPU_have_SSE2())
+    {
+        g_vft = &g_vft_sse2;
+    }
+#endif
+}
+
+#define MAP_NAME(name) g_vft->name
+
+#endif
+
+#ifdef MAP_NAME
+#   define h264e_qpel_interpolate_chroma     MAP_NAME(h264e_qpel_interpolate_chroma)
+#   define h264e_qpel_interpolate_luma       MAP_NAME(h264e_qpel_interpolate_luma)
+#   define h264e_qpel_average_wh_align       MAP_NAME(h264e_qpel_average_wh_align)
+#   define h264e_deblock_chroma              MAP_NAME(h264e_deblock_chroma)
+#   define h264e_deblock_luma                MAP_NAME(h264e_deblock_luma)
+#   define h264e_intra_predict_chroma        MAP_NAME(h264e_intra_predict_chroma)
+#   define h264e_intra_predict_16x16         MAP_NAME(h264e_intra_predict_16x16)
+#   define h264e_intra_choose_4x4            MAP_NAME(h264e_intra_choose_4x4)
+#   define h264e_bs_put_bits                 MAP_NAME(h264e_bs_put_bits)
+#   define h264e_bs_flush                    MAP_NAME(h264e_bs_flush)
+#   define h264e_bs_get_pos_bits             MAP_NAME(h264e_bs_get_pos_bits)
+#   define h264e_bs_byte_align               MAP_NAME(h264e_bs_byte_align)
+#   define h264e_bs_put_golomb               MAP_NAME(h264e_bs_put_golomb)
+#   define h264e_bs_put_sgolomb              MAP_NAME(h264e_bs_put_sgolomb)
+#   define h264e_bs_init_bits                MAP_NAME(h264e_bs_init_bits)
+#   define h264e_vlc_encode                  MAP_NAME(h264e_vlc_encode)
+#   define h264e_sad_mb_unlaign_8x8          MAP_NAME(h264e_sad_mb_unlaign_8x8)
+#   define h264e_sad_mb_unlaign_wh           MAP_NAME(h264e_sad_mb_unlaign_wh)
+#   define h264e_copy_8x8                    MAP_NAME(h264e_copy_8x8)
+#   define h264e_copy_16x16                  MAP_NAME(h264e_copy_16x16)
+#   define h264e_copy_borders                MAP_NAME(h264e_copy_borders)
+#   define h264e_transform_add               MAP_NAME(h264e_transform_add)
+#   define h264e_transform_sub_quant_dequant MAP_NAME(h264e_transform_sub_quant_dequant)
+#   define h264e_quant_luma_dc               MAP_NAME(h264e_quant_luma_dc)
+#   define h264e_quant_chroma_dc             MAP_NAME(h264e_quant_chroma_dc)
+#   define h264e_denoise_run                 MAP_NAME(h264e_denoise_run)
+#endif
+
+/************************************************************************/
+/*      Arithmetics                                                     */
+/************************************************************************/
+
+#ifndef __arm__
+/**
+*   Count of leading zeroes
+*/
+static unsigned __clz(unsigned v)
+{
+#if defined(_MSC_VER)
+    unsigned long nbit;
+    _BitScanReverse(&nbit, v);
+    return 31 - nbit;
+#elif defined(__GNUC__) || defined(__clang__) || defined(__aarch64__)
+    return __builtin_clz(v);
+#else
+    unsigned clz = 32;
+    assert(v);
+    do
+    {
+        clz--;
+    } while (v >>= 1);
+    return clz;
+#endif
+}
+#endif
+
+/**
+*   Size of unsigned Golomb code
+*/
+static int bitsize_ue(int v)
+{
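+    // exp-Golomb code length: v = 0 -> 1 bit, v = 1..2 -> 3 bits, v = 3..6 -> 5 bits, v = 7..14 -> 7 bits, ...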
+    return 2*(32 - __clz(v + 1)) - 1;
+}
+
+/**
+*   Size of signed Golomb code
+*/
+static int bits_se(int v)
+{
+    v = 2*v - 1;
+    v ^= v >> 31;
+    return bitsize_ue(v);
+}
+
+/**
+*   Multiply 32x32 Q16
+*/
+static uint32_t mul32x32shr16(uint32_t x, uint32_t y)
+{
+    uint32_t r = (x >> 16) * (y & 0xFFFFu) + x * (y >> 16) + ((y & 0xFFFFu) * (x & 0xFFFFu) >> 16);
+    //assert(r == (uint32_t)((__int64)x*y>>16));
+    return r;
+}
+
+/**
+*   Integer division, producing Q16 output
+*/
+static uint32_t div_q16(uint32_t numer, uint32_t denum)
+{
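+    // the denominator is first normalized to [0.5, 1) in Q16, then both operands are repeatedly
+    // scaled by (2.0 - d) until d converges to ~1.0 (0xffff); numer then holds the Q16 quotient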
+    unsigned f = 1 << __clz(denum);
+    do
+    {
+        denum = denum * f >> 16;
+        numer = mul32x32shr16(numer, f);
+        f = ((1 << 17) - denum);
+    } while (denum  != 0xffff);
+    return numer;
+}
+
+/************************************************************************/
+/*      Motion Vector arithmetics                                       */
+/************************************************************************/
+
+static point_t point(int x, int y)
+{
+    point_t p;
+    p.u32 = ((unsigned)y << 16) | ((unsigned)x & 0xFFFF);    // assumes little-endian
+    return p;
+}
+
+static int mv_is_zero(point_t p)
+{
+    return !p.u32;
+}
+
+static int mv_equal(point_t p0, point_t p1)
+{
+    return (p0.u32 == p1.u32);
+}
+
+/**
+*   check that difference between given MV's components is greater than 3
+*/
+static int mv_differs3(point_t p0, point_t p1)
+{
+    return ABS(p0.s.x - p1.s.x) > 3 || ABS(p0.s.y - p1.s.y) > 3;
+}
+
+static point_t mv_add(point_t a, point_t b)
+{
+#if defined(__arm__)
+    a.u32 = __sadd16(a.u32, b.u32);
+#elif H264E_ENABLE_SSE2 && (H264E_CONFIGS_COUNT == 1)
+    a.u32 = _mm_cvtsi128_si32(_mm_add_epi16(_mm_cvtsi32_si128(a.u32), _mm_cvtsi32_si128(b.u32)));
+#else
+    a.s.x += b.s.x;
+    a.s.y += b.s.y;
+#endif
+    return a;
+}
+
+static point_t mv_sub(point_t a, point_t b)
+{
+#if defined(__arm__)
+    a.u32 = __ssub16(a.u32, b.u32);
+#elif H264E_ENABLE_SSE2 && (H264E_CONFIGS_COUNT == 1)
+    a.u32 = _mm_cvtsi128_si32(_mm_sub_epi16(_mm_cvtsi32_si128(a.u32), _mm_cvtsi32_si128(b.u32)));
+#else
+    a.s.x -= b.s.x;
+    a.s.y -= b.s.y;
+#endif
+    return a;
+}
+
+static void mv_clip(point_t *h264e_restrict p, const rectangle_t *range)
+{
+    p->s.x = MAX(p->s.x, range->tl.s.x);
+    p->s.x = MIN(p->s.x, range->br.s.x);
+    p->s.y = MAX(p->s.y, range->tl.s.y);
+    p->s.y = MIN(p->s.y, range->br.s.y);
+}
+
+static int mv_in_rect(point_t p, const rectangle_t *r)
+{
+    return (p.s.y >= r->tl.s.y && p.s.y <= r->br.s.y && p.s.x >= r->tl.s.x && p.s.x <= r->br.s.x);
+}
+
+static point_t mv_round_qpel(point_t p)
+{
+    return point((p.s.x + 1) & ~3, (p.s.y + 1) & ~3);
+}
+
+/************************************************************************/
+/*      Misc macroblock helper functions                                */
+/************************************************************************/
+/**
+*   @return current macroblock input luma pixels
+*/
+static pix_t *mb_input_luma(h264e_enc_t *enc)
+{
+    return enc->inp.yuv[0] + (enc->mb.x + enc->mb.y*enc->inp.stride[0])*16;
+}
+
+/**
+*   @return current macroblock input chroma pixels
+*/
+static pix_t *mb_input_chroma(h264e_enc_t *enc, int uv)
+{
+    return enc->inp.yuv[uv] + (enc->mb.x + enc->mb.y*enc->inp.stride[uv])*8;
+}
+
+/**
+*   @return absolute MV for current macroblock for given MV
+*/
+static point_t mb_abs_mv(h264e_enc_t *enc, point_t mv)
+{
+    return mv_add(mv, point(enc->mb.x*64, enc->mb.y*64));
+}
+
+/************************************************************************/
+/*      Pixel copy functions                                            */
+/************************************************************************/
+/**
+*   Copy incomplete (cropped) macroblock pixels with borders extension
+*/
+static void pix_copy_cropped_mb(pix_t *d, int d_stride, const pix_t *s, int s_stride, int w, int h)
+{
+    int nbottom = d_stride - h; // assume dst is a square d_stride x d_stride block
+    s_stride -= w;
+    do
+    {
+        int cloop = w;
+        pix_t last;
+        do
+        {
+            last = *s++;
+            *d++ = last;
+        } while (--cloop);
+        cloop = d_stride - w;
+        if (cloop) do
+        {
+            *d++ = last;    // extend row
+        } while (--cloop);
+        s += s_stride;
+    } while (--h);
+    s = d - d_stride;
+    if (nbottom) do
+    {
+        memcpy(d, s, d_stride);  // extend columns
+        d += d_stride;
+    } while (--nbottom);
+}
+
+/**
+*   Copy one image component
+*/
+static void pix_copy_pic(pix_t *dst, int dst_stride, pix_t *src, int src_stride, int w, int h)
+{
+    do
+    {
+        memcpy(dst, src, w);
+        dst += dst_stride;
+        src += src_stride;
+    } while (--h);
+}
+
+/**
+*   Copy reconstructed frame to reference buffer, with border extension
+*/
+static void pix_copy_recon_pic_to_ref(h264e_enc_t *enc)
+{
+    int c, h = enc->frame.h, w = enc->frame.w, guard = 16;
+    for (c = 0; c < 3; c++)
+    {
+        if (enc->param.const_input_flag)
+        {
+            SWAP(pix_t*, enc->ref.yuv[c], enc->dec.yuv[c]);
+        } else
+        {
+            pix_copy_pic(enc->ref.yuv[c], w + 2*guard, enc->dec.yuv[c], w, w, h);
+        }
+
+        h264e_copy_borders(enc->ref.yuv[c], w, h, guard);
+        if (!c) guard >>= 1, w >>= 1, h >>= 1;
+    }
+}
+
+/************************************************************************/
+/*      Median MV predictor                                             */
+/************************************************************************/
+
+/**
+*   @return neighbors availability flags for current macroblock
+*/
+static int mb_avail_flag(const h264e_enc_t *enc)
+{
+    int nmb = enc->mb.num;
+    int flag = nmb >= enc->slice.start_mb_num + enc->frame.nmbx;
+    if (nmb >= enc->slice.start_mb_num + enc->frame.nmbx - 1 && enc->mb.x != enc->frame.nmbx-1)
+    {
+        flag += AVAIL_TR;
+    }
+    if (nmb != enc->slice.start_mb_num && enc->mb.x)
+    {
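+    // T1s: scan backwards for up to 3 trailing +/-1 coefficients (levels are stored doubled,
+    // so a +/-1 coefficient appears as +/-2) and record their signs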
+        flag += AVAIL_L;
+    }
+    if (nmb > enc->slice.start_mb_num + enc->frame.nmbx && enc->mb.x)
+    {
+        flag += AVAIL_TL;
+    }
+    return flag;
+}
+
+/**
+*   @return median of 3 given integers
+*/
+#if !(H264E_ENABLE_SSE2 && (H264E_CONFIGS_COUNT == 1))
+static int me_median_of_3(int a, int b, int c)
+{
+    return MAX(MIN(MAX(a, b), c), MIN(a, b));
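+    // coeff_token VLC context: rounded average of the left and top neighbours' nonzero counts;
+    // neighbours marked NNZ_NA (unavailable) drop out after the & 31 mask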
+}
+#endif
+
+/**
+*   @return median of 3 given motion vectors
+*/
+static point_t point_median_of_3(point_t a, point_t b, point_t c)
+{
+#if H264E_ENABLE_SSE2 && (H264E_CONFIGS_COUNT == 1)
+    __m128i a2 = _mm_cvtsi32_si128(a.u32);
+    __m128i b2 = _mm_cvtsi32_si128(b.u32);
+    point_t med;
+    med.u32 = _mm_cvtsi128_si32(_mm_max_epi16(_mm_min_epi16(_mm_max_epi16(a2, b2), _mm_cvtsi32_si128(c.u32)), _mm_min_epi16(a2, b2)));
+    return med;
+#else
+    return point(me_median_of_3(a.s.x, b.s.x, c.s.x),
+                 me_median_of_3(a.s.y, b.s.y, c.s.y));
+#endif
+}
+
+/**
+*   Save state of the MV predictor
+*/
+static void me_mv_medianpredictor_save_ctx(h264e_enc_t *enc, point_t *ctx)
+{
+    int i;
+    point_t *mvtop = enc->mv_pred + 8 + enc->mb.x*4;
+    for (i = 0; i < 4; i++)
+    {
+        *ctx++ = enc->mv_pred[i];
+        *ctx++ = enc->mv_pred[4 + i];
+        *ctx++ = mvtop[i];
+    }
+}
+
+/**
+*   Restore state of the MV predictor
+*/
+static void me_mv_medianpredictor_restore_ctx(h264e_enc_t *enc, const point_t *ctx)
+{
+    int i;
+    point_t *mvtop = enc->mv_pred + 8 + enc->mb.x*4;
+    for (i = 0; i < 4; i++)
+    {
+        enc->mv_pred[i] = *ctx++;
+        enc->mv_pred[4 + i] = *ctx++;
+        mvtop[i] = *ctx++;
+    }
+}
+
+/**
+*   Put motion vector to the deblock filter matrix.
+*   x,y,w,h refers to 4x4 blocks within 16x16 macroblock, and should be in the range [0,4]
+*/
+static void me_mv_dfmatrix_put(point_t *dfmv, int x, int y, int w, int h, point_t mv)
+{
+    int i;
+    assert(y < 4 && x < 4);
+
+    dfmv += y*5 + x + 5;   // 5x5 matrix without left-top cell
+    do
+    {
+        for (i = 0; i < w; i++)
+        {
+            dfmv[i] = mv;
+        }
+        dfmv += 5;
+    } while (--h);
+}
+
+/**
+*   Use given motion vector for prediction
+*/
+static void me_mv_medianpredictor_put(h264e_enc_t *enc, int x, int y, int w, int h, point_t mv)
+{
+    int i;
+    point_t *mvtop = enc->mv_pred + 8 + enc->mb.x*4;
+    assert(y < 4 && x < 4);
+
+    enc->mv_pred[4 + y] = mvtop[x + w-1]; // top-left corner = top-right corner
+    for (i = 1; i < h; i++)
+    {
+        enc->mv_pred[4 + y + i] = mv;     // top-left corner(s) for next row(s) = this
+    }
+    for (i = 0; i < h; i++)
+    {
+        enc->mv_pred[y + i] = mv;         // left = this
+    }
+    for (i = 0; i < w; i++)
+    {
+        mvtop[x + i] = mv;                // top = this
+    }
+}
+
+/**
+*   Motion vector median predictor for non-skip macroblock, as defined in the standard
+*/
+static point_t me_mv_medianpredictor_get(const h264e_enc_t *enc, point_t xy, point_t wh)
+{
+    int x = xy.s.x >> 2;
+    int y = xy.s.y >> 2;
+    int w = wh.s.x >> 2;
+    int h = wh.s.y >> 2;
+    int mvPredType = MVPRED_MEDIAN;
+    point_t a, b, c, d, ret = point(0, 0);
+    point_t *mvtop = enc->mv_pred + 8 + enc->mb.x*4;
+    int flag = enc->mb.avail;
+
+    assert(y < 4);
+    assert(x < 4);
+    assert(w <= 4);
+    assert(h <= 4);
+
+    a = enc->mv_pred[y];
+    b = mvtop[x];
+    c = mvtop[x + w];
+    d = enc->mv_pred[4 + y];
+
+    if (!x)
+    {
+        if (!(flag & AVAIL_L))
+        {
+            a.u32 = MV_NA;
+        }
+        if (!(flag & AVAIL_TL))
+        {
+            d.u32 = MV_NA;
+        }
+    }
+    if (!y)
+    {
+        if (!(flag & AVAIL_T))
+        {
+            b.u32 = MV_NA;
+            if (x + w < 4)
+            {
+                c.u32 = MV_NA;
+            }
+            if (x > 0)
+            {
+                d.u32 = MV_NA;
+            }
+        }
+        if (!(flag & AVAIL_TL) && !x)
+        {
+            d.u32 = MV_NA;
+        }
+        if (!(flag & AVAIL_TR) && x + w == 4)
+        {
+            c.u32 = MV_NA;
+        }
+    }
+
+    if (x + w == 4 && (!(flag & AVAIL_TR) || y))
+    {
+        c = d;
+    }
+
+    if (AVAIL(a) && !AVAIL(b) && !AVAIL(c))
+    {
+        mvPredType = MVPRED_L;
+    } else if (!AVAIL(a) && AVAIL(b) && !AVAIL(c))
+    {
+        mvPredType = MVPRED_U;
+    } else if (!AVAIL(a) && !AVAIL(b) && AVAIL(c))
+    {
+        mvPredType = MVPRED_UR;
+    }
+
+    // Directional predictions
+    if (w == 2 && h == 4)
+    {
+        if (x == 0)
+        {
+            if (AVAIL(a))
+            {
+                mvPredType = MVPRED_L;
+            }
+        } else
+        {
+            if (AVAIL(c))
+            {
+                mvPredType = MVPRED_UR;
+            }
+        }
+    } else if (w == 4 && h == 2)
+    {
+        if (y == 0)
+        {
+            if (AVAIL(b))
+            {
+                mvPredType = MVPRED_U;
+            }
+        } else
+        {
+            if (AVAIL(a))
+            {
+                mvPredType = MVPRED_L;
+            }
+        }
+    }
+
+    switch(mvPredType)
+    {
+    default:
+    case MVPRED_MEDIAN:
+        if (!(AVAIL(b) || AVAIL(c)))
+        {
+            if (AVAIL(a))
+            {
+                ret = a;
+            }
+        } else
+        {
+            if (!AVAIL(a))
+            {
+                a = ret;
+            }
+            if (!AVAIL(b))
+            {
+                b = ret;
+            }
+            if (!AVAIL(c))
+            {
+                c = ret;
+            }
+            ret = point_median_of_3(a, b, c);
+        }
+        break;
+    case MVPRED_L:
+        if (AVAIL(a))
+        {
+            ret = a;
+        }
+        break;
+    case MVPRED_U:
+        if (AVAIL(b))
+        {
+            ret = b;
+        }
+        break;
+    case MVPRED_UR:
+        if (AVAIL(c))
+        {
+            ret = c;
+        }
+        break;
+    }
+    return ret;
+}
+
+/**
+*   Motion vector median predictor for skip macroblock
+*/
+static point_t me_mv_medianpredictor_get_skip(h264e_enc_t *enc)
+{
+    point_t pred_16x16 = me_mv_medianpredictor_get(enc, point(0, 0),  point(16, 16));
+    enc->mb.mv_skip_pred = point(0, 0);
+    if (!(~enc->mb.avail & (AVAIL_L | AVAIL_T)))
+    {
+        point_t *mvtop = enc->mv_pred + 8 + enc->mb.x*4;
+        if (!mv_is_zero(enc->mv_pred[0]) && !mv_is_zero(mvtop[0]))
+        {
+            enc->mb.mv_skip_pred = pred_16x16;
+        }
+    }
+    return pred_16x16;
+}
+
+/**
+*   Get starting points candidates for MV search
+*/
+static int me_mv_medianpredictor_get_cand(const h264e_enc_t *enc, point_t *mv)
+{
+    point_t *mv0 = mv;
+    point_t *mvtop = enc->mv_pred + 8 + enc->mb.x*4;
+    int flag = enc->mb.avail;
+    *mv++ = point(0, 0);
+    if ((flag & AVAIL_L) && AVAIL(enc->mv_pred[0]))
+    {
+        *mv++ = enc->mv_pred[0];
+    }
+    if ((flag & AVAIL_T) && AVAIL(mvtop[0]))
+    {
+        *mv++ = mvtop[0];
+    }
+    if ((flag & AVAIL_TR) && AVAIL(mvtop[4]))
+    {
+        *mv++ = mvtop[4];
+    }
+    return (int)(mv - mv0);
+}
+
+
+/************************************************************************/
+/*      NAL encoding                                                    */
+/************************************************************************/
+
+/**
+*   Count # of escapes, i.e. binary strings 0000 0000  0000 0000  0000 00xx
+*   (a 0x03 emulation-prevention byte is inserted between such a 00 00 pair and the byte <= 3 that follows)
+*   P(escape) = 2^-22
+*   E(run_between_escapes) = 2^21 ~= 2 MB
+*/
+static int nal_count_esc(const uint8_t *s, int n)
+{
+    int i, cnt_esc = 0, cntz = 0;
+    for (i = 0; i < n; i++)
+    {
+        uint8_t byte = *s++;
+        if (cntz == 2 && byte <= 3)
+        {
+            cnt_esc++;
+            cntz = 0;
+        }
+
+        if (byte)
+        {
+            cntz = 0;
+        } else
+        {
+            cntz++;
+        }
+    }
+    return cnt_esc;
+}
+
+/**
+*   Put NAL escape codes to the output bitstream
+*/
+static int nal_put_esc(uint8_t *d, const uint8_t *s, int n)
+{
+    int i, j = 0, cntz = 0;
+    for (i = 0; i < n; i++)
+    {
+        uint8_t byte = *s++;
+        if (cntz == 2 && byte <= 3)
+        {
+            d[j++] = 3;
+            cntz = 0;
+        }
+
+        if (byte)
+        {
+            cntz = 0;
+        } else
+        {
+            cntz++;
+        }
+        d[j++] = byte;
+    }
+    assert(d + j <= s);
+    return j;
+}
+
+/**
+*   Init NAL encoding
+*/
+static void nal_start(h264e_enc_t *enc, int nal_hdr)
+{
+    uint8_t *d = enc->out + enc->out_pos;
+    d[0] = d[1] = d[2] = 0; d[3] = 1; // start code
+    enc->out_pos += STARTCODE_4BYTES;
+    d += STARTCODE_4BYTES + (-(int)enc->out_pos & 3);   // 4-bytes align for bitbuffer
+    assert(IS_ALIGNED(d, 4));
+    h264e_bs_init_bits(enc->bs, d);
+    U(8, nal_hdr);
+}
+
+/**
+*   Finalize NAL encoding
+*/
+static void nal_end(h264e_enc_t *enc)
+{
+    int cnt_esc, bs_bytes;
+    uint8_t *nal = enc->out + enc->out_pos;
+
+    U1(1); // stop bit
+    bs_bytes = h264e_bs_byte_align(enc->bs) >> 3;
+    h264e_bs_flush(enc->bs);
+
+    // count # of escape bytes to insert
+    cnt_esc = nal_count_esc((unsigned char*)enc->bs->origin, bs_bytes);
+
+    if ((uint8_t *)enc->bs->origin != nal + cnt_esc)
+    {
+        // make free space for escapes and remove align bytes
+        memmove(nal + cnt_esc, enc->bs->origin, bs_bytes);
+    }
+    if (cnt_esc)
+    {
+        // insert escape bytes
+        bs_bytes = nal_put_esc(nal, nal + cnt_esc, bs_bytes);
+    }
+    if (enc->run_param.nalu_callback)
+    {
+        // Call application-supplied callback
+        enc->run_param.nalu_callback(nal, bs_bytes, enc->run_param.nalu_callback_token);
+    }
+    enc->out_pos += bs_bytes;
+}
+
+
+/************************************************************************/
+/*      Top-level syntax elements (SPS,PPS,Slice)                       */
+/************************************************************************/
+
+/**
+*   Encode Sequence Parameter Set (SPS)
+*   ref: [1] 7.3.2.1.1
+*/
+
+//temp global
+#define dependency_id 1
+#define quality_id 0
+#define default_base_mode_flag 0
+#define log2_max_frame_num_minus4 1
+
+static void encode_sps(h264e_enc_t *enc, int profile_idc)
+{
+    struct limit_t
+    {
+        uint8_t level;
+        uint8_t constrains;
+        uint16_t max_fs;
+        uint16_t max_vbvdiv5;
+        uint32_t max_dpb;
+    };
+    static const struct limit_t limit [] = {
+        {10, 0xE0, 99,    175/5, 396},
+        {10, 0xF0, 99,    350/5, 396},
+        {11, 0xE0, 396,   500/5, 900},
+        {12, 0xE0, 396,   1000/5, 2376},
+        {13, 0xE0, 396,   2000/5, 2376},
+        {20, 0xE0, 396,   2000/5, 2376},
+        {21, 0xE0, 792,   4000/5, 4752},
+        {22, 0xE0, 1620,  4000/5, 8100},
+        {30, 0xE0, 1620,  10000/5, 8100},
+        {31, 0xE0, 3600,  14000/5, 18000},
+        {32, 0xE0, 5120,  20000/5, 20480},
+        {40, 0xE0, 8192,  25000/5, 32768},
+        {41, 0xE0, 8192,  62500/5, 32768},
+        {42, 0xE0, 8704,  62500/5, 34816},
+        {50, 0xE0, 22080, 135000/5, 110400},
+        {51, 0xE0, 36864, 240000/5, 184320}
+    };
+    const struct limit_t *plim = limit;
+
+    while (plim->level < 51 && (enc->frame.nmb > plim->max_fs ||
+        enc->param.vbv_size_bytes > plim->max_vbvdiv5*(5*1000/8) ||
+        (unsigned)(enc->frame.nmb*(enc->param.max_long_term_reference_frames + 1)) > plim->max_dpb))
+    {
+        plim++;
+    }
+
+    nal_start(enc, 0x67 | (profile_idc == SCALABLE_BASELINE)*8);
+    U(8, profile_idc);  // profile, 66 = baseline
+    U(8, plim->constrains & ((profile_idc!= SCALABLE_BASELINE)*4));     // no constrains
+    U(8, plim->level);
+    //U(5, 0x1B);       // sps_id|log2_max_frame_num_minus4|pic_order_cnt_type
+    //UE(0);  // sps_id 1
+    UE(enc->param.sps_id);
+
+#if H264E_SVC_API
+    if(profile_idc== SCALABLE_BASELINE)
+    {
+        UE(1); //chroma_format_idc
+        UE(0); //bit_depth_luma_minus8
+        UE(0); //bit_depth_chroma_minus8)
+        U1(0); //qpprime_y_zero_transform_bypass_flag
+        U1(0); //seq_scaling_matrix_present_flag
+    }
+#endif
+    UE(log2_max_frame_num_minus4);  // log2_max_frame_num_minus4 = 1
+    UE(2);  // pic_order_cnt_type         011
+    UE(1 + enc->param.max_long_term_reference_frames);  // num_ref_frames
+    U1(0);                                      // gaps_in_frame_num_value_allowed_flag);
+    UE(((enc->param.width + 15) >> 4) - 1);     // pic_width_in_mbs_minus1
+    UE(((enc->param.height + 15) >> 4) - 1);    // pic_height_in_map_units_minus1
+    U(3, 6 + enc->frame.cropping_flag);         // frame_mbs_only_flag|direct_8x8_inference_flag|frame_cropping_flag
+//    U1(1);  // frame_mbs_only_flag
+//    U1(1);  // direct_8x8_inference_flag
+//    U1(frame_cropping_flag);  // frame_cropping_flag
+    if (enc->frame.cropping_flag)
+    {
+        UE(0);                                          // frame_crop_left_offset
+        UE((enc->frame.w - enc->param.width) >> 1);     // frame_crop_right_offset
+        UE(0);                                          // frame_crop_top_offset
+        UE((enc->frame.h - enc->param.height) >> 1);    // frame_crop_bottom_offset
+    }
+    U1(0);      // vui_parameters_present_flag
+
+#if H264E_SVC_API
+    if(profile_idc == SCALABLE_BASELINE)
+    {
+        U1(1);  //(inter_layer_deblocking_filter_control_present_flag); //inter_layer_deblocking_filter_control_present_flag
+        U(2,0); //extended_spatial_scalability
+        U1(0);  //chroma_phase_x_plus1_flag
+        U(2,0); //chroma_phase_y_plus1
+
+    /*    if( sps->sps_ext.extended_spatial_scalability == 1 )
+        {
+            //if( ChromaArrayType > 0 )
+            {
+                put_bits( s, 1,0);
+                put_bits( s, 2,0); ///
+            }
+            put_bits_se( s, sps->sps_ext.seq_scaled_ref_layer_left_offset );
+            put_bits_se( s, sps->sps_ext.seq_scaled_ref_layer_top_offset );
+            put_bits_se( s, sps->sps_ext.seq_scaled_ref_layer_right_offset );
+            put_bits_se( s, sps->sps_ext.seq_scaled_ref_layer_bottom_offset );
+        }*/
+        U1(0); //seq_tcoeff_level_prediction_flag
+        U1(1); //slice_header_restriction_flag
+        U1(0); //svc_vui_parameters_present_flag
+        U1(0); //additional_extension2_flag
+    }
+#endif
+    nal_end(enc);
+}
+
+/**
+*   Encode Picture Parameter Set (PPS)
+*   ref: [1] 7.3.2.2
+*/
+static void encode_pps(h264e_enc_t *enc, int pps_id)
+{
+    nal_start(enc, 0x68);
+ //   U(10, 0x338);       // constant shortcut:
+    UE(enc->param.sps_id*4 + pps_id);  // pic_parameter_set_id         1
+    UE(enc->param.sps_id);  // seq_parameter_set_id         1
+    U1(0);  // entropy_coding_mode_flag     0
+    U1(0);  // pic_order_present_flag       0
+    UE(0);  // num_slice_groups_minus1      1
+    UE(0);  // num_ref_idx_l0_active_minus1 1
+    UE(0);  // num_ref_idx_l1_active_minus1 1
+    U1(0);  // weighted_pred_flag           0
+    U(2,0); // weighted_bipred_idc          00
+    SE(enc->sps.pic_init_qp - 26);  // pic_init_qp_minus26
+#if DQP_CHROMA
+    SE(0);  // pic_init_qs_minus26                    1
+    SE(DQP_CHROMA);  // chroma_qp_index_offset        1
+    U1(1);  // deblocking_filter_control_present_flag 1
+    U1(0);  // constrained_intra_pred_flag            0
+    U1(0);  // redundant_pic_cnt_present_flag         0
+#else
+    U(5, 0x1C);         // constant shortcut:
+//     SE(0);  // pic_init_qs_minus26                    1
+//     SE(0);  // chroma_qp_index_offset                 1
+//     U1(1);  // deblocking_filter_control_present_flag 1
+//     U1(0);  // constrained_intra_pred_flag            0
+//     U1(0);  // redundant_pic_cnt_present_flag         0
+#endif
+    nal_end(enc);
+}
+
+/**
+*   Encode Slice Header
+*   ref: [1] 7.3.3
+*/
+static void encode_slice_header(h264e_enc_t *enc, int frame_type, int long_term_idx_use, int long_term_idx_update, int pps_id, int enc_type)
+{
+    // slice reset
+    enc->slice.start_mb_num = enc->mb.num;
+    enc->mb.skip_run = 0;
+    memset(enc->i4x4mode, -1, (enc->frame.nmbx + 1)*4);
+    memset(enc->nnz, NNZ_NA, (enc->frame.nmbx + 1)*8);    // DF ignores slice borders, but uses its own nnz's
+
+    if (enc_type == 0)
+    {
+#if H264E_SVC_API
+        if (enc->param.num_layers > 1)
+        {
+            //need prefix nal for compatibility base layer with h264
+            nal_start(enc, 14 | 0x40);
+            //if((nal_unit_type == NAL_UNIT_TYPE_PREFIX_SCALABLE_EXT ) ||nal_unit_type == NAL_UNIT_TYPE_RBSP_SCALABLE_EXT))
+            {
+                //reserved_one_bit = 1    idr_flag                    priority_id
+                U(8, (1 << 7) | ((frame_type == H264E_FRAME_TYPE_KEY) << 6) | 0);
+                U1(1);   //no_inter_layer_pred_flag
+                U(3, 0); //dependency_id
+                U(4, quality_id); //quality_id
+                //reserved_three_2bits = 3!
+                U(3, 0); //temporal_id
+                U1(1); //use_ref_base_pic_flag
+                U1(0); //discardable_flag
+                U1(1); //output_flag
+                U(2, 3);
+
+                U1(0); //store_ref_base_pic_flag
+                if (!(frame_type == H264E_FRAME_TYPE_KEY))
+                {
+                    U1(0); //adaptive_ref_base_pic_marking_mode_flag  u(1)
+                }
+
+                U1(0); //prefix_nal_unit_additional_extension_flag 2 u(1)
+
+                //put_bits_rbsp_trailing( s );
+            }
+            nal_end(enc);
+        }
+#endif //#if H264E_SVC_API
+        nal_start(enc, (frame_type == H264E_FRAME_TYPE_KEY ? 5 : 1) | (long_term_idx_update >= 0 ? 0x60 : 0));
+    }
+#if H264E_SVC_API
+    else
+    {
+        nal_start(enc, (20 | (long_term_idx_update >= 0 ? 0x60 : 0)));  //RBSP_SCALABLE_EXT = 20
+        //nal_unit_type 20 or 14
+        {
+            //reserved_one_bit = 1    idr_flag                    priority_id
+            U(8, (1 << 7) | ((frame_type == H264E_FRAME_TYPE_KEY) << 6) | 0);
+            U1(!enc->param.inter_layer_pred_flag); //no_inter_layer_pred_flag
+            U(3, dependency_id); //dependency_id
+            U(4, quality_id);    //quality_id
+            //reserved_three_2bits = 3!!!
+            U(3, 0); //temporal_id
+            U1(0); //use_ref_base_pic_flag
+            U1(1); //discardable_flag
+            U1(1); //output_flag
+            U(2, 3);
+        }
+    }
+#endif
+
+    UE(enc->slice.start_mb_num);        // first_mb_in_slice
+    UE(enc->slice.type);                // slice_type
+    //U(1+4, 16 + (enc->frame.num&15));   // pic_parameter_set_id | frame_num
+    UE(pps_id);                           // pic_parameter_set_id
+    U(4 + log2_max_frame_num_minus4, enc->frame.num & ((1 << (log2_max_frame_num_minus4 + 4)) - 1)); // frame_num
+    if (frame_type == H264E_FRAME_TYPE_KEY)
+    {
+        UE(enc->next_idr_pic_id);       // idr_pic_id
+    }
+    //!!!  if !quality_id && enc->slice.type == SLICE_TYPE_P  put_bit(s, 0); // num_ref_idx_active_override_flag = 0
+    if(!quality_id)
+    {
+        if (((enc_type != 0)) && enc->slice.type == SLICE_TYPE_P)
+        {
+            //U1(0);
+        }
+        if (enc->slice.type == SLICE_TYPE_P)// if( slice_type == P  | |  slice_type ==  SP  | |  slice_type  = =  B )
+        {
+            int ref_pic_list_modification_flag_l0 = long_term_idx_use > 0;
+            //U1(0);                      // num_ref_idx_active_override_flag
+            // ref_pic_list_modification()
+            U(2, ref_pic_list_modification_flag_l0); // num_ref_idx_active_override_flag | ref_pic_list_modification_flag_l0
+            if (ref_pic_list_modification_flag_l0)
+            {
+                // Table 7-7
+                UE(2);      // long_term_pic_num is present and specifies the long-term picture number for a reference picture
+                UE(long_term_idx_use - 1); // long_term_pic_num
+                UE(3);      // End loop
+            }
+        }
+
+        if (long_term_idx_update >= 0)
+        {
+            //dec_ref_pic_marking( )
+            if (frame_type == H264E_FRAME_TYPE_KEY)
+            {
+                //U1(0);                                      // no_output_of_prior_pics_flag
+                //U1(enc->param.enable_golden_frames_flag);   // long_term_reference_flag
+                U(2, enc->param.max_long_term_reference_frames > 0);   // no_output_of_prior_pics_flag | long_term_reference_flag
+            } else
+            {
+                int adaptive_ref_pic_marking_mode_flag = long_term_idx_update > 0;//(frame_type == H264E_FRAME_TYPE_GOLDEN);
+                U1(adaptive_ref_pic_marking_mode_flag);
+                if (adaptive_ref_pic_marking_mode_flag)
+                {
+                    // Table 7-9
+                    if (enc->short_term_used)
+                    {
+                        UE(1);  // unmark short
+                        UE(0);  // unmark short
+                    }
+                    if (enc->lt_used[long_term_idx_update - 1])
+                    {
+                        UE(2);  // Mark a long-term reference picture as "unused for reference"
+                        UE(long_term_idx_update - 1); // index
+                    } else
+                    {
+                        UE(4);  // Specify the maximum long-term frame index
+                        UE(enc->param.max_long_term_reference_frames);    // [0,max-1]+1
+                    }
+                    UE(6);  // Mark the current picture as "used for long-term reference"
+                    UE(long_term_idx_update - 1);   // index
+                    UE(0);  // End loop
+                }
+            }
+        }
+    }
+    SE(enc->rc.prev_qp - enc->sps.pic_init_qp);     // slice_qp_delta
+#if H264E_MAX_THREADS
+    if (enc->param.max_threads > 1)
+    {
+        UE(enc->speed.disable_deblock ? 1 : 2);
+    } else
+#endif
+    {
+        UE(enc->speed.disable_deblock);             // disable deblock
+    }
+
+    if (enc->speed.disable_deblock != 1)
+    {
+#if ALPHA_OFS || BETA_OFS
+        SE(ALPHA_OFS/2);                            // slice_alpha_c0_offset_div2
+        SE(BETA_OFS/2);                             // slice_beta_offset_div2
+#else
+        U(2, 3);
+#endif
+    }
+
+#if H264E_SVC_API
+    if (enc_type != 0)
+    {
+        enc->adaptive_base_mode_flag = enc->param.inter_layer_pred_flag;
+        if (enc->param.inter_layer_pred_flag && !quality_id)
+        {
+            UE(16*(dependency_id - 1));
+            //if(1)//(inter_layer_deblocking_filter_control_present_flag)
+            {
+                UE(0);//disable_inter_layer_deblocking_filter_idc
+                UE(0);
+                UE(0);
+            }
+            /*if( sh->disable_inter_layer_deblocking_filter_idc != 1 )
+            {
+                put_bits_se(s, sh->slice_alpha_c0_offset_div2);
+                put_bits_se(s, sh->slice_beta_offset_div2);
+            }*/
+            U1(0); // constrained_intra_resampling_flag 2 u(1)
+        }
+        if (enc->param.inter_layer_pred_flag)
+        {
+            U1(0); //slice_skip_flag u(1)
+            {
+                U1(enc->adaptive_base_mode_flag); // 2 u(1)
+                if (!enc->adaptive_base_mode_flag)
+                    U1(default_base_mode_flag); // 2 u(1)
+                if (!default_base_mode_flag)
+                {
+                    U1(0); //adaptive_motion_prediction_flag) // 2 u(1)
+                    U1(0); //sh->default_motion_prediction_flag// 2 u(1)
+                }
+                U1(0); //adaptive_residual_prediction_flag // 2 u(1)
+                U1(0); //default_residual_prediction_flag // 2 u(1)
+            }
+        }
+    }
+#endif // #if H264E_SVC_API
+}
+
+/**
+*   Macroblock transform, quantization and bitstream encoding
+*/
+static void mb_write(h264e_enc_t *enc, int enc_type, int base_mode)
+{
+    int i, uv, mb_type, cbpc, cbpl, cbp;
+    scratch_t *qv = enc->scratch;
+    //int base_mode = enc_type > 0 ? 1 : 0;
+    int mb_type_svc = base_mode ? -2 : enc->mb.type;
+    int intra16x16_flag = mb_type_svc >= 6;// && !base_mode;
+    uint8_t nz[9];
+    uint8_t *nnz_top = enc->nnz + 8 + enc->mb.x*8;
+    uint8_t *nnz_left = enc->nnz;
+
+    if (enc->mb.type != 5)
+    {
+        enc->i4x4mode[0] = enc->i4x4mode[enc->mb.x + 1] = 0x02020202;
+    }
+
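+    // Rebuild the deblock non-zero-coefficient flag map for this macroblock: the previous
+    // macroblock's right-edge flags shift into the left-neighbor column, and the bottom-row
+    // flags saved from the macroblock above (df_nzflag[]) become the top-neighbor row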
+    enc->df.nzflag = ((enc->df.nzflag >> 4) & 0x84210) | enc->df.df_nzflag[enc->mb.x];
+    for (i = 0; i < 4; i++)
+    {
+        nz[5 + i] = nnz_top[i];
+        nnz_top[i] = 0;
+        nz[3 - i] = nnz_left[i];
+        nnz_left[i] = 0;
+    }
+
+l_skip:
+    if (enc->mb.type == -1)
+    {
+        // encode skip macroblock
+        assert(enc->slice.type != SLICE_TYPE_I);
+
+        // Increment run count
+        enc->mb.skip_run++;
+
+        // Update predictors
+        *(uint32_t*)(nnz_top + 4) = *(uint32_t*)(nnz_left + 4) = 0; // set chroma NNZ to 0
+        me_mv_medianpredictor_put(enc, 0, 0, 4, 4, enc->mb.mv[0]);
+        me_mv_dfmatrix_put(enc->df.df_mv, 0, 0, 4, 4, enc->mb.mv[0]);
+
+        // Update reference with reconstructed pixels
+        h264e_copy_16x16(enc->dec.yuv[0], enc->dec.stride[0], enc->pbest, 16);
+        h264e_copy_8x8(enc->dec.yuv[1], enc->dec.stride[1], enc->ptest);
+        h264e_copy_8x8(enc->dec.yuv[2], enc->dec.stride[2], enc->ptest + 8);
+    } else
+    {
+        if (enc->mb.type != 5)
+        {
+            unsigned nz_mask;
+            nz_mask = h264e_transform_sub_quant_dequant(qv->mb_pix_inp, enc->pbest, 16, intra16x16_flag ? QDQ_MODE_INTRA_16 : QDQ_MODE_INTER, qv->qy, enc->rc.qdat[0]);
+            enc->scratch->nz_mask = (uint16_t)nz_mask;
+            if (intra16x16_flag)
+            {
+                h264e_quant_luma_dc(qv->qy, qv->quant_dc, enc->rc.qdat[0]);
+                nz_mask = 0xFFFF;
+            }
+            h264e_transform_add(enc->dec.yuv[0], enc->dec.stride[0], enc->pbest, qv->qy, 4, nz_mask << 16);
+        }
+
+        // Coded Block Pattern for luma
+        cbpl = 0;
+        if (enc->scratch->nz_mask & 0xCC00) cbpl |= 1;
+        if (enc->scratch->nz_mask & 0x3300) cbpl |= 2;
+        if (enc->scratch->nz_mask & 0x00CC) cbpl |= 4;
+        if (enc->scratch->nz_mask & 0x0033) cbpl |= 8;
+
+        // Coded Block Pattern for chroma
+        cbpc = 0;
+        for (uv = 1; uv < 3; uv++)
+        {
+            pix_t *pred = enc->ptest + (uv - 1)*8;
+            pix_t *pix_mb_uv = mb_input_chroma(enc, uv);
+            int dc_flag, inp_stride = enc->inp.stride[uv];
+            unsigned nz_mask;
+            quant_t *pquv = (uv == 1) ? qv->qu : qv->qv;
+
+            if (enc->frame.cropping_flag && ((enc->mb.x + 1)*16  > enc->param.width || (enc->mb.y + 1)*16  > enc->param.height))
+            {
+                pix_copy_cropped_mb(enc->scratch->mb_pix_inp, 8, pix_mb_uv, enc->inp.stride[uv],
+                    MIN(8, enc->param.width/2  - enc->mb.x*8),
+                    MIN(8, enc->param.height/2 - enc->mb.y*8)
+                    );
+                pix_mb_uv = enc->scratch->mb_pix_inp;
+                inp_stride = 8;
+            }
+
+            nz_mask = h264e_transform_sub_quant_dequant(pix_mb_uv, pred, inp_stride, QDQ_MODE_CHROMA, pquv, enc->rc.qdat[1]);
+
+            if (nz_mask)
+            {
+                cbpc = 2;
+            }
+
+            cbpc |= dc_flag = h264e_quant_chroma_dc(pquv, uv == 1 ? qv->quant_dc_u : qv->quant_dc_v, enc->rc.qdat[1]);
+
+            if (!(dc_flag | nz_mask))
+            {
+                h264e_copy_8x8(enc->dec.yuv[uv], enc->dec.stride[uv], pred);
+            } else
+            {
+                if (dc_flag)
+                {
+                    for (i = 0; i < 4; i++)
+                    {
+                        if (~nz_mask & (8 >> i))
+                        {
+                            memset(pquv[i].dq + 1, 0, (16 - 1)*sizeof(int16_t));
+                        }
+                    }
+                    nz_mask = 15;
+                }
+                h264e_transform_add(enc->dec.yuv[uv], enc->dec.stride[uv], pred, pquv, 2, nz_mask << 28);
+            }
+        }
+        cbpc = MIN(cbpc, 2);
+
+        // Rollback to skip
+        if (!(enc->mb.type | cbpl | cbpc) && // Inter prediction, all-zero after quantization
+            mv_equal(enc->mb.mv[0], enc->mb.mv_skip_pred)) // MV == MV predictor for skip
+        {
+            enc->mb.type = -1;
+            goto l_skip;
+        }
+
+        mb_type = enc->mb.type;
+        if (mb_type_svc >= 6)   // intra 16x16
+        {
+            if (cbpl)
+            {
+                cbpl = 15;
+            }
+            mb_type += enc->mb.i16.pred_mode_luma + cbpc*4 + (cbpl ? 12 : 0);
+        }
+        if (mb_type >= 5 && enc->slice.type == SLICE_TYPE_I)    // Intra in I slice
+        {
+            mb_type -= 5;
+        }
+
+        if (enc->slice.type != SLICE_TYPE_I)
+        {
+            UE(enc->mb.skip_run);
+            enc->mb.skip_run = 0;
+        }
+
+        (void)enc_type;
+#if H264E_SVC_API
+        if (enc->adaptive_base_mode_flag && enc_type > 0)
+            U1(base_mode);
+#endif
+
+        if (!base_mode)
+            UE(mb_type);
+
+        if (enc->mb.type == 3) // 8x8
+        {
+            for (i = 0; i < 4; i++)
+            {
+                UE(0);
+            }
+            // 0 = 8x8
+            // 1 = 8x4
+            // 2 = 4x8
+            // 3 = 4x4
+        }
+
+        if (!base_mode)
+        {
+            if (enc->mb.type >= 5)   // intra
+            {
+                int pred_mode_chroma;
+                if (enc->mb.type == 5)  // intra 4x4
+                {
+                    for (i = 0; i < 16; i++)
+                    {
+                        int m = enc->mb.i4x4_mode[decode_block_scan[i]];
+                        int nbits =  4;
+                        if (m < 0)
+                        {
+                            m = nbits = 1;
+                        }
+                        U(nbits, m);
+                    }
+                }
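+                // Luma Intra_16x16 modes are 0=V, 1=H, 2=DC, 3=Plane, while intra chroma
+                // modes are 0=DC, 1=H, 2=V, 3=Plane: the XOR below swaps the even values
+                // so the luma mode can be reused as intra_chroma_pred_mode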
+                pred_mode_chroma = enc->mb.i16.pred_mode_luma;
+                if (!(pred_mode_chroma&1))
+                {
+                    pred_mode_chroma ^= 2;
+                }
+                UE(pred_mode_chroma);
+                me_mv_medianpredictor_put(enc, 0, 0, 4, 4, point(MV_NA,0));
+            } else
+            {
+                int part, x = 0, y = 0;
+                int dx = (enc->mb.type & 2) ? 2 : 4;
+                int dy = (enc->mb.type & 1) ? 2 : 4;
+                for (part = 0;;part++)
+                {
+                    SE(enc->mb.mvd[part].s.x);
+                    SE(enc->mb.mvd[part].s.y);
+                    me_mv_medianpredictor_put(enc, x, y, dx, dy, enc->mb.mv[part]);
+                    me_mv_dfmatrix_put(enc->df.df_mv, x, y, dx, dy, enc->mb.mv[part]);
+                    x = (x + dx) & 3;
+                    if (!x)
+                    {
+                        y = (y + dy) & 3;
+                        if (!y)
+                        {
+                            break;
+                        }
+                    }
+                }
+            }
+        }
+        cbp = cbpl + (cbpc << 4);
+        /*temp for test up-sample filter*/
+        /*if(base_mode)
+        {
+            cbp = 0;
+            cbpl=0;
+            cbpc = 0;
+        }*/
+        if (mb_type_svc < 6)
+        {
+            // encode cbp 9.1.2 Mapping process for coded block pattern
+            static const uint8_t cbp2code[2][48] = {
+                {3, 29, 30, 17, 31, 18, 37,  8, 32, 38, 19,  9, 20, 10, 11,  2, 16, 33, 34, 21, 35, 22, 39,  4,
+                36, 40, 23,  5, 24,  6,  7,  1, 41, 42, 43, 25, 44, 26, 46, 12, 45, 47, 27, 13, 28, 14, 15,  0},
+                {0,  2,  3,  7,  4,  8, 17, 13,  5, 18,  9, 14, 10, 15, 16, 11,  1, 32, 33, 36, 34, 37, 44, 40,
+                35, 45, 38, 41, 39, 42, 43, 19,  6, 24, 25, 20, 26, 21, 46, 28, 27, 47, 22, 29, 23, 30, 31, 12}
+            };
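+            // Row [0] is the Intra_4x4 mapping and row [1] the Inter mapping of
+            // coded_block_pattern to Exp-Golomb codeNum (Table 9-4); the index below
+            // selects [1] for inter macroblocks (mb_type_svc < 5) and [0] for Intra_4x4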
+            UE(cbp2code[mb_type_svc < 5][cbp]);
+        }
+
+        if (cbp || (mb_type_svc >= 6))
+        {
+            SE(enc->rc.qp - enc->rc.prev_qp);
+            enc->rc.prev_qp = enc->rc.qp;
+        }
+
+        // *** Huffman encoding ***
+
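+        // nz[] caches the non-zero coefficient counts of the left and top neighbor 4x4 blocks;
+        // h264e_vlc_encode() reads the neighboring counts through the passed pointer for the
+        // CAVLC nC context and stores the current block's count back into it (see *pnz below)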
+        // 1. Encode Luma DC (intra 16x16 only)
+        if (intra16x16_flag)
+        {
+            h264e_vlc_encode(enc->bs, qv->quant_dc, 16, nz + 4);
+        }
+
+        // 2. Encode luma residual (only if CBP non-zero)
+        if (cbpl)
+        {
+            for (i = 0; i < 16; i++)
+            {
+                int j = decode_block_scan[i];
+                if (cbp & (1 << (i >> 2)))
+                {
+                    uint8_t *pnz = nz + 4 + (j & 3) - (j >> 2);
+                    h264e_vlc_encode(enc->bs, qv->qy[j].qv, 16 - intra16x16_flag, pnz);
+                    if (*pnz)
+                    {
+                        enc->df.nzflag |= 1 << (5 + (j & 3) + 5*(j >> 2));
+                    }
+                } else
+                {
+                    nz[4 + (j & 3) - (j >> 2)] = 0;
+                }
+            }
+            for (i = 0; i < 4; i++)
+            {
+                nnz_top[i] = nz[1 + i];
+                nnz_left[i] = nz[7 - i];
+            }
+        }
+
+        // 3. Encode chroma
+        if (cbpc)
+        {
+            uint8_t nzcdc[3];
+            nzcdc[0] = nzcdc[2] = 17;   // dummy neighbors, indicating chroma DC
+            // 3.1. Encode chroma DC
+            for (uv = 1; uv < 3; uv++)
+            {
+                h264e_vlc_encode(enc->bs, uv == 1 ? qv->quant_dc_u : qv->quant_dc_v, 4, nzcdc + 1);
+            }
+
+            // 3.2. Encode chroma residual
+            if (cbpc > 1)
+            {
+                for (uv = 1; uv < 3; uv++)
+                {
+                    uint8_t nzc[5];
+                    int nnz_off = (uv == 1 ? 4 : 6);
+                    quant_t *pquv = uv == 1 ? qv->qu : qv->qv;
+                    for (i = 0; i < 2; i++)
+                    {
+                        nzc[3 + i] = nnz_top[nnz_off + i] ;
+                        nzc[1 - i] = nnz_left[nnz_off + i];
+                    }
+                    for (i = 0; i < 4; i++)
+                    {
+                        int k = 2 + (i & 1) - (i >> 1);
+                        h264e_vlc_encode(enc->bs, pquv[i].qv, 15, nzc + k);
+                    }
+                    for (i = 0; i < 2; i++)
+                    {
+                        nnz_top[nnz_off + i]  = nzc[1 + i];
+                        nnz_left[nnz_off + i] = nzc[3 - i];
+                    }
+                }
+            }
+        }
+        if (cbpc != 2)
+        {
+            *(uint32_t*)(nnz_top+4) = *(uint32_t*)(nnz_left+4) = 0; // set chroma NNZ to 0
+        }
+    }
+
+    // Save top & left lines
+    for (uv = 0; uv < 3; uv++)
+    {
+        int off = 0, n = uv ? 8 : 16;
+        pix_t *top = enc->top_line + 48 + enc->mb.x*32;
+        pix_t *left = enc->top_line;
+        pix_t *mb = enc->dec.yuv[uv];
+
+        if (uv)
+        {
+            off = 8 + uv*8;
+        }
+        top  += off;
+        left += off;
+
+        enc->top_line[32 + uv] = top[n - 1];
+        for (i = 0; i < n; i++)
+        {
+            left[i] = mb[n - 1 + i*enc->dec.stride[uv]];
+            top[i] = mb[(n - 1)*enc->dec.stride[uv] + i];
+        }
+    }
+}
+
+/************************************************************************/
+/*      Intra mode encoding                                             */
+/************************************************************************/
+/**
+*   Estimate cost of 4x4 intra predictor
+*/
+static void intra_choose_4x4(h264e_enc_t *enc)
+{
+    int i, n, a, nz_mask = 0, avail = mb_avail_flag(enc);
+    scratch_t *qv = enc->scratch;
+    pix_t *mb_dec = enc->dec.yuv[0];
+    pix_t *dec = enc->ptest;
+    int cost =  g_lambda_i4_q4[enc->rc.qp];// + MUL_LAMBDA(16, g_lambda_q4[enc->rc.qp]);    // 4x4 cost: at least 16 bits + penalty
+
+    uint32_t edge_store[(3 + 16 + 1 + 16 + 4)/4 + 2]; // pad for SSE
+    pix_t *edge = ((pix_t*)edge_store) + 3 + 16 + 1;
+    uint32_t *edge32 = (uint32_t *)edge;              // alias
+    const uint32_t *top32 = (const uint32_t*)(enc->top_line + 48 + enc->mb.x*32);
+    pix_t *left = enc->top_line;
+
+    edge[-1] = enc->top_line[32];
+    for (i = 0; i < 16; i++)
+    {
+        edge[-2 - i] = left[i];
+    }
+    for (i = 0; i < 4; i++)
+    {
+        edge32[i] = top32[i];
+    }
+    edge32[4] = top32[8];
+
+    for (n = 0; n < 16; n++)
+    {
+        static const uint8_t block2avail[16] = {
+            0x07, 0x23, 0x23, 0x2b, 0x9b, 0x77, 0xff, 0x77, 0x9b, 0xff, 0xff, 0x77, 0x9b, 0x77, 0xff, 0x77,
+        };
+        pix_t *block;
+        pix_t *blockin;
+        int sad, mpred, mode;
+        int r = n >> 2;
+        int c = n & 3;
+        int8_t *ctx_l = (int8_t *)enc->i4x4mode + r;
+        int8_t *ctx_t = (int8_t *)enc->i4x4mode + 4 + enc->mb.x*4 + c;
+        edge = ((pix_t*)edge_store) + 3 + 16 + 1 + 4*c - 4*r;
+
+        a = avail;
+        a &= block2avail[n];
+        a |= block2avail[n] >> 4;
+
+        if (!(block2avail[n] & AVAIL_TL)) // TL replace
+        {
+            if ((n <= 3 && (avail & AVAIL_T)) ||
+                (n  > 3 && (avail & AVAIL_L)))
+            {
+                a |= AVAIL_TL;
+            }
+        }
+        if (n < 3 && (avail & AVAIL_T))
+        {
+            a |= AVAIL_TR;
+        }
+
+        blockin = enc->scratch->mb_pix_inp + (c + r*16)*4;
+        block = dec + (c + r*16)*4;
+
+        mpred = MIN(*ctx_l, *ctx_t);
+        if (mpred < 0)
+        {
+            mpred = 2;
+        }
+
+        sad = h264e_intra_choose_4x4(blockin, block, a, edge, mpred, MUL_LAMBDA(3, g_lambda_q4[enc->rc.qp]));
+        mode = sad & 15;
+        sad >>= 4;
+
+        *ctx_l = *ctx_t = (int8_t)mode;
+        if (mode == mpred)
+        {
+            mode = -1;
+        } else if (mode > mpred)
+        {
+            mode--;
+        }
+        enc->mb.i4x4_mode[n] = (int8_t)mode;
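+        // Stored value matches what mb_write() expects: -1 means "use the predicted mode"
+        // (one prev_intra4x4_pred_mode_flag bit), otherwise it is rem_intra4x4_pred_mode,
+        // hence the decrement when the chosen mode lies above the predictor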
+
+        nz_mask <<= 1;
+        if (sad > g_skip_thr_i4x4[enc->rc.qp])
+        {
+            //  skipping the transform on low SAD gains only about 2% for all-intra coding at QP40;
+            //  for other QPs the gain is minimal, so the SAD check is not used
+            nz_mask |= h264e_transform_sub_quant_dequant(blockin, block, 16, QDQ_MODE_INTRA_4, qv->qy + n, enc->rc.qdat[0]);
+
+            if (nz_mask & 1)
+            {
+                h264e_transform_add(block, 16, block, qv->qy + n, 1, ~0);
+            }
+        } else
+        {
+            memset((qv->qy+n), 0, sizeof(qv->qy[0]));
+        }
+
+        cost += sad;
+
+        edge[2] = block[3];
+        edge[1] = block[3 + 16];
+        edge[0] = block[3 + 16*2];
+        *(uint32_t*)&edge[-4] = *(uint32_t*)&block[16*3];
+    }
+    enc->scratch->nz_mask = (uint16_t)nz_mask;
+
+    if (cost < enc->mb.cost)
+    {
+        enc->mb.cost = cost;
+        enc->mb.type = 5;   // intra 4x4
+        h264e_copy_16x16(mb_dec, enc->dec.stride[0], dec, 16);  // restore reference
+    }
+}
+
+/**
+*   Choose 16x16 prediction mode, most suitable for given gradient
+*/
+static int intra_estimate_16x16(pix_t *p, int s, int avail, int qp)
+{
+    static const uint8_t mode_i16x16_valid[8] = { 4, 5, 6, 7, 4, 5, 6, 15 };
+    int p00 = p[0];
+    int p01 = p[15];
+    int p10 = p[15*s + 0];
+    int p11 = p[15*s + 15];
+    int v = mode_i16x16_valid[avail & (AVAIL_T + AVAIL_L + AVAIL_TL)];
+    // better than above on low bitrates
+    int dx = ABS(p00 - p01) + ABS(p10 - p11) + ABS((int)p[8*s] - (int)p[8*s + 15]);
+    int dy = ABS(p00 - p10) + ABS(p01 - p11) + ABS((int)p[8] - (int)p[15*s + 8]);
+
+    if ((dx > 30 + 3*dy && dy < (100 + 50 - qp)
+        //|| (/*dx < 50 &&*/ dy <= 12)
+        ) && (v & 1))
+        return 0;
+    else if (dy > 30 + 3*dx && dx < (100 + 50 - qp) && (v & (1 << 1)))
+        return 1;
+    else
+        return 2;
+}
+
+/**
+*   Estimate cost of 16x16 intra predictor
+*
+*   for foreman@qp10
+*
+*   12928 - [0-3], [0]
+*   12963 - [0-2], [0]
+*   12868 - [0-2], [0-3]
+*   12878 - [0-2], [0-2]
+*   12834 - [0-3], [0-3]
+*sad
+*   13182
+*heuristic
+*   13063
+*
+*/
+static void intra_choose_16x16(h264e_enc_t *enc, pix_t *left, pix_t *top, int avail)
+{
+    int sad, sad4[4];
+    // heuristic mode decision
+    enc->mb.i16.pred_mode_luma = intra_estimate_16x16(enc->scratch->mb_pix_inp, 16, avail, enc->rc.qp);
+
+    // run chosen predictor
+    h264e_intra_predict_16x16(enc->ptest, left, top, enc->mb.i16.pred_mode_luma);
+
+    // coding cost
+    sad = h264e_sad_mb_unlaign_8x8(enc->scratch->mb_pix_inp, 16, enc->ptest, sad4)        // SAD
+        + MUL_LAMBDA(bitsize_ue(enc->mb.i16.pred_mode_luma + 1), g_lambda_q4[enc->rc.qp]) // side-info penalty
+        + g_lambda_i16_q4[enc->rc.qp];                                                    // block kind penalty
+
+    if (sad < enc->mb.cost)
+    {
+        enc->mb.cost = sad;
+        enc->mb.type = 6;
+        SWAP(pix_t*, enc->pbest, enc->ptest);
+    }
+}
+
+/************************************************************************/
+/*      Inter mode encoding                                             */
+/************************************************************************/
+
+/**
+*   Sub-pel luma interpolation
+*/
+static void interpolate_luma(const pix_t *ref, int stride, point_t mv, point_t wh, pix_t *dst)
+{
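+    // Split the quarter-pel MV: the integer part advances the reference pointer,
+    // the two low bits of each component select the sub-pel interpolation phase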
+    ref += (mv.s.y >> 2) * stride + (mv.s.x >> 2);
+    mv.u32 &= 0x000030003;
+    h264e_qpel_interpolate_luma(ref, stride, dst, wh, mv);
+}
+
+/**
+*   Sub-pel chroma interpolation
+*/
+static void interpolate_chroma(h264e_enc_t *enc, point_t mv)
+{
+    int i;
+    for (i = 1; i < 3; i++)
+    {
+        point_t wh;
+        int part = 0, x = 0, y = 0;
+        wh.s.x = (enc->mb.type & 2) ? 4 : 8;
+        wh.s.y = (enc->mb.type & 1) ? 4 : 8;
+        if (enc->mb.type == -1) // skip
+        {
+            wh.s.x = wh.s.y = 8;
+        }
+
+        for (;;part++)
+        {
+            pix_t *ref;
+            mv = mb_abs_mv(enc, enc->mb.mv[part]);
+            ref = enc->ref.yuv[i] + ((mv.s.y >> 3) + y)*enc->ref.stride[i] + (mv.s.x >> 3) + x;
+            mv.u32 &= 0x00070007;
+            h264e_qpel_interpolate_chroma(ref, enc->ref.stride[i], enc->ptest + (i - 1)*8 + 16*y + x, wh, mv);
+            x = (x + wh.s.x) & 7;
+            if (!x)
+            {
+                y = (y + wh.s.y) & 7;
+                if (!y)
+                {
+                    break;
+                }
+            }
+        }
+    }
+}
+
+/**
+*   RD cost of given MV
+*/
+static int me_mv_cost(point_t mv, point_t mv_pred, int qp)
+{
+    int nb = bits_se(mv.s.x - mv_pred.s.x) + bits_se(mv.s.y - mv_pred.s.y);
+    return MUL_LAMBDA(nb, g_lambda_mv_q4[qp]);
+}
+
+/**
+*   RD cost of given MV candidate (TODO)
+*/
+#define me_mv_cand_cost me_mv_cost
+//static int me_mv_cand_cost(point_t mv, point_t mv_pred, int qp)
+//{
+//    int nb = bits_se(mv.s.x - mv_pred.s.x) + bits_se(mv.s.y - mv_pred.s.y);
+//    return MUL_LAMBDA(nb, g_lambda_mv_q4[qp]);
+//}
+
+
+/**
+*   Modified full-pel motion search with small diamond algorithm
+*   note: diamond implemented with small modifications, trading speed for precision
+*/
+static int me_search_diamond(h264e_enc_t *enc, const pix_t *ref, const pix_t *b, int rowbytes, point_t *mv,
+    const rectangle_t *range, int qp, point_t mv_pred, int min_sad, point_t wh, pix_t *scratch, pix_t **ppbest, int store_bytes)
+{
+    // cache map           cache moves
+    //      3              0   x->1
+    //      *              1   x->0
+    //  1 * x * 0          2   x->3
+    //      *              3   x->2
+    //      2                   ^1
+
+    //   cache double moves:
+    //           prev               prev
+    //      x ->   0   ->   3   ==>   3   =>   1
+    //      x ->   0   ->   2   ==>   2   =>   1
+    //      x ->   0   ->   0   ==>   0   =>   1
+    //      x ->   0   ->   1   - impossible
+    //   prev SAD(n) is (n+4)
+    //
+
+    static const point_t dir2mv[] = {{{4, 0}},{{-4, 0}},{{0, 4}},{{0, -4}}};
+    union
+    {
+        uint16_t cache[8];
+        uint32_t cache32[4];
+    } sad;
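+    // cache[0..3] hold the SADs of the four neighbors of the current center (0xffff = not
+    // evaluated yet); cache[4..7] keep the previous center's neighbors so one point can be
+    // reused after a move (see "cache double moves" above)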
+
+    int dir, cloop, dir_prev, cost;
+    point_t v;
+
+    assert(mv_in_rect(*mv, range));
+
+restart:
+    dir = 0;                // start gradient descend with direction dir2mv[0]
+    cloop = 4;              // try 4 directions
+    dir_prev = -1;          // not yet moved
+
+    // reset SAD cache
+    sad.cache32[0] = sad.cache32[1] = sad.cache32[2] = sad.cache32[3] = ~0u;
+
+    // 1. Full-pel ME with small diamond modification:
+    // center point moved immediately as soon as new minimum found
+    do
+    {
+        assert(dir >= 0 && dir < 4);
+
+        // Try next point. Avoid out-of-range moves
+        v = mv_add(*mv, dir2mv[dir]);
+        //if (mv_in_rect(v, range) && sad.cache[dir] == (uint16_t)~0u)
+        if (mv_in_rect(v, range) && sad.cache[dir] == 0xffffu)
+        {
+            cost = h264e_sad_mb_unlaign_wh(ref + ((v.s.y*rowbytes + v.s.x) >> 2), rowbytes, b, wh);
+            //cost += me_mv_cost(*mv, mv_pred, qp);
+            cost += me_mv_cost(v, mv_pred, qp);
+            sad.cache[dir] = (uint16_t)cost;
+            if (cost < min_sad)
+            {
+                // This point is better than center: move this point to center and continue
+                int corner = ~0;
+                if (dir_prev >= 0)                      // have previous move
+                {                                       // save cache point, which can be used in next iteration
+                    corner = sad.cache[4 + dir];        // see "cache double moves" above
+                }
+                sad.cache32[2] = sad.cache32[0];        // save current cache to 'previous'
+                sad.cache32[3] = sad.cache32[1];
+                sad.cache32[0] = sad.cache32[1] = ~0u;  // reset current cache
+                if (dir_prev >= 0)                      // but if have previous move
+                {                                       // one cache point can be reused from previous iteration
+                    sad.cache[dir_prev^1] = (uint16_t)corner; // see "cache double moves" above
+                }
+                sad.cache[dir^1] = (uint16_t)min_sad;   // the previous center becomes a neighbor
+                dir_prev = dir;                         // save this direction
+                dir--;                                  // start next iteration with the same direction
+                cloop = 4 + 1;                          // and try 4 directions (+1 for do-while loop)
+                *mv = v;                                // Save best point found
+                min_sad = cost;                         // and its SAD
+            }
+        }
+        dir = (dir + 1) & 3;                            // cycle search directions
+    } while(--cloop);
+
+    // 2. Optional: Try diagonal step
+    //if (1)
+    {
+        int primary_dir   = sad.cache[3] >= sad.cache[2] ? 2 : 3;
+        int secondary_dir = sad.cache[1] >= sad.cache[0] ? 0 : 1;
+        if (sad.cache[primary_dir] < sad.cache[secondary_dir])
+        {
+            SWAP(int, secondary_dir, primary_dir);
+        }
+
+        v = mv_add(dir2mv[secondary_dir], dir2mv[primary_dir]);
+        v = mv_add(*mv, v);
+        //cost = (uint16_t)~0u;
+        if (mv_in_rect(v, range))
+        {
+            cost = h264e_sad_mb_unlaign_wh(ref + ((v.s.y*rowbytes + v.s.x) >> 2), rowbytes, b, wh);
+            cost += me_mv_cost(v, mv_pred, qp);
+            if (cost < min_sad)
+            {
+                *mv = v;//mv_add(*mv, v);
+                min_sad = cost;
+                goto restart;
+            }
+        }
+    }
+
+    interpolate_luma(ref, rowbytes, *mv, wh, scratch);    // Plain NxM copy can be used
+    *ppbest = scratch;
+
+    // 3. Fractional pel search
+    if (enc->run_param.encode_speed < 9 && mv_in_rect(*mv, &enc->frame.mv_qpel_limit))
+    {
+        point_t vbest = *mv;
+        pix_t *pbest = scratch;
+        pix_t *hpel  = scratch + store_bytes;
+        pix_t *hpel1 = scratch + ((store_bytes == 8) ? 256 : 2*store_bytes);
+        pix_t *hpel2 = hpel1 + store_bytes;
+
+        int i, sad_test;
+        point_t primary_qpel, secondary_qpel, vdiag;
+
+        unsigned minsad1 = sad.cache[1];
+        unsigned minsad2 = sad.cache[3];
+        secondary_qpel = point(-1, 0);
+        primary_qpel = point(0, -1);
+        if (sad.cache[3] >= sad.cache[2])
+            primary_qpel = point(0, 1), minsad2 = sad.cache[2];
+        if (sad.cache[1] >= sad.cache[0])
+            secondary_qpel = point(1, 0), minsad1 = sad.cache[0];
+
+        if (minsad2 > minsad1)
+        {
+            SWAP(point_t, secondary_qpel, primary_qpel);
+        }
+
+        //     ============> primary
+        //     |00 01 02
+        //     |10 11 12
+        //     |20    22
+        //     V
+        //     secondary
+        vdiag = mv_add(primary_qpel, secondary_qpel);
+
+        for (i = 0; i < 7; i++)
+        {
+            pix_t *ptest;
+            switch(i)
+            {
+            case 0:
+                // 02 = interpolate primary half-pel
+                v = mv_add(*mv, mv_add(primary_qpel, primary_qpel));
+                interpolate_luma(ref, rowbytes, v, wh, ptest = hpel1);
+                break;
+            case 1:
+                // 01 q-pel = (00 + 02)/2
+                v = mv_add(*mv, primary_qpel);
+                h264e_qpel_average_wh_align(scratch, hpel1, ptest = hpel, wh);
+                break;
+            case 2:
+                // 20 = interpolate secondary half-pel
+                v = mv_add(*mv, mv_add(secondary_qpel, secondary_qpel));
+                interpolate_luma(ref, rowbytes, v, wh, ptest = hpel2);
+                break;
+            case 3:
+                // 10 q-pel = (00 + 20)/2
+                hpel  = scratch + store_bytes; if (pbest == hpel) hpel = scratch;
+                v = mv_add(*mv, secondary_qpel);
+                h264e_qpel_average_wh_align(scratch, hpel2, ptest = hpel, wh);
+                break;
+            case 4:
+                // 11 q-pel = (02 + 20)/2
+                hpel  = scratch + store_bytes; if (pbest == hpel) hpel = scratch;
+                v = mv_add(*mv, vdiag);
+                h264e_qpel_average_wh_align(hpel1, hpel2, ptest = hpel, wh);
+                break;
+            case 5:
+                // 22 = interpolate center half-pel
+                if (pbest == hpel2) hpel2 = scratch, hpel = scratch + store_bytes;
+                v = mv_add(*mv, mv_add(vdiag, vdiag));
+                interpolate_luma(ref, rowbytes, v, wh, ptest = hpel2);
+                break;
+            case 6:
+            default:
+                // 12 q-pel = (02 + 22)/2
+                hpel  = scratch + store_bytes; if (pbest == hpel) hpel = scratch;
+                v = mv_add(*mv, mv_add(primary_qpel, vdiag));
+                h264e_qpel_average_wh_align(hpel2, hpel1, ptest = hpel, wh);
+                break;
+            }
+
+            sad_test = h264e_sad_mb_unlaign_wh(ptest, 16, b, wh) + me_mv_cost(v, mv_pred, qp);
+            if (sad_test < min_sad)
+            {
+                min_sad = sad_test;
+                vbest = v;
+                pbest = ptest;
+            }
+        }
+
+        *mv = vbest;
+        *ppbest = pbest;
+    }
+    return min_sad;
+}
+
+/**
+*   Set range for MV search
+*/
+static void me_mv_set_range(point_t *pnt, rectangle_t *range, const rectangle_t *mv_limit, int mby)
+{
+    // clip start point
+    rectangle_t r = *mv_limit;
+    r.tl.s.y = (int16_t)(MAX(r.tl.s.y, mby - 63*4));
+    r.br.s.y = (int16_t)(MIN(r.br.s.y, mby + 63*4));
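+    // the +/-63*4 quarter-pel clamp above presumably keeps the vertical MV component
+    // within the tighter vertical range allowed at lower H.264 levels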
+    mv_clip(pnt, &r);
+    range->tl = mv_add(*pnt, point(-MV_RANGE*4, -MV_RANGE*4));
+    range->br = mv_add(*pnt, point(+MV_RANGE*4, +MV_RANGE*4));
+    // clip search range
+    mv_clip(&range->tl, &r);
+    mv_clip(&range->br, &r);
+}
+
+/**
+*   Remove duplicates from MV candidates list
+*/
+static int me_mv_refine_cand(point_t *p, int n)
+{
+    int i, j, k;
+    p[0] = mv_round_qpel(p[0]);
+    for (j = 1, k = 1; j < n; j++)
+    {
+        point_t mv = mv_round_qpel(p[j]);
+        for (i = 0; i < k; i++)
+        {
+            // TODO
+            //if (!mv_differs3(mv, p[i], 3*4))
+            //if (!mv_differs3(mv, p[i], 1*4))
+            //if (!mv_differs3(mv, p[i], 3))
+            if (mv_equal(mv, p[i]))
+                break;
+        }
+        if (i == k)
+            p[k++] = mv;
+    }
+    return k;
+}
+
+/**
+*   Choose candidates for inter MB partitioning (16x8,8x16 or 8x8),
+*   using SAD's for 8x8 sub-blocks
+*/
+static void mb_inter_partition(/*const */int sad[4], int mode[4])
+{
+/*
+    slope
+        |[ 1  1]| _ |[ 1 -1]|
+        |[-1 -1]|   |[ 1 -1]|
+        indicates v/h gradient: big negative = vertical prediction; big positive = horizontal
+
+    skew
+        |[ 1  0]| _ |[ 0 -1]|
+        |[ 0 -1]|   |[ 1  0]|
+        indicates diagonal gradient: big negative = diagonal down right
+*/
+    int p00 = sad[0];
+    int p01 = sad[1];
+    int p10 = sad[2];
+    int p11 = sad[3];
+    int sum = p00 + p01 + p10 + p11;
+    int slope = ABS((p00 - p10) + (p01 - p11)) - ABS((p00 - p01) + (p10 - p11));
+    int skew = ABS(p11 - p00) - ABS(p10 - p01);
+
+    if (slope >  (sum >> 4))
+    {
+        mode[1] = 1;    // try 16x8 partition
+    }
+    if (slope < -(sum >> 4))
+    {
+        mode[2] = 1;    // try 8x16 partition
+    }
+    if (ABS(skew) > (sum >> 4) && ABS(slope) <= (sum >> 4))
+    {
+        mode[3] = 1;    // try 8x8 partition
+    }
+}
+
+/**
+*   Online MV clustering to "long" and "short" clusters
+*   Estimate mean "long" and "short" vectors
+*/
+static void mv_clusters_update(h264e_enc_t *enc, point_t mv)
+{
+    int mv_norm = SQRP(mv);
+    int n0 = SQRP(enc->mv_clusters[0]);
+    int n1 = SQRP(enc->mv_clusters[1]);
+    if (mv_norm < n1)
+    {
+        // "short" is shorter than "long"
+        SMOOTH(enc->mv_clusters[0], mv);
+    }
+    if (mv_norm >= n0)
+    {
+        // "long" is longer than "short"
+        SMOOTH(enc->mv_clusters[1], mv);
+    }
+}
+
+/**
+*   Choose inter mode: skip/coded, ME partition, find MV
+*/
+static void inter_choose_mode(h264e_enc_t *enc)
+{
+    int prefered_modes[4] = { 1, 0, 0, 0 };
+    point_t mv_skip, mv_skip_a, mv_cand[MAX_MV_CAND];
+    point_t mv_pred_16x16 = me_mv_medianpredictor_get_skip(enc);
+    point_t mv_best = point(MV_NA, 0); // avoid warning
+
+    int sad, sad_skip = 0x7FFFFFFF, sad_best = 0x7FFFFFFF;
+    int off, i, j = 0, ncand = 0;
+    int cand_sad4[MAX_MV_CAND][4];
+    const pix_t *ref_yuv = enc->ref.yuv[0];
+    int ref_stride = enc->ref.stride[0];
+    int mv_cand_cost_best = 0;
+    mv_skip = enc->mb.mv_skip_pred;
+    mv_skip_a = mb_abs_mv(enc, mv_skip);
+
+    for (i = 0; i < 4; i++)
+    {
+        enc->df.df_mv[4 + 5*i].u32 = enc->mv_pred[i].u32;
+        enc->df.df_mv[i].u32       = enc->mv_pred[8 + 4*enc->mb.x + i].u32;
+    }
+
+    // Try skip mode
+    if (mv_in_rect(mv_skip_a, &enc->frame.mv_qpel_limit))
+    {
+        int *sad4 = cand_sad4[0];
+        interpolate_luma(ref_yuv, ref_stride, mv_skip_a, point(16, 16), enc->ptest);
+        sad_skip = h264e_sad_mb_unlaign_8x8(enc->scratch->mb_pix_inp, 16, enc->ptest, sad4);
+
+        if (MAX(MAX(sad4[0], sad4[1]), MAX(sad4[2], sad4[3])) < g_skip_thr_inter[enc->rc.qp])
+        {
+            int uv, sad_uv;
+
+            SWAP(pix_t*, enc->pbest, enc->ptest);
+            enc->mb.type = -1;
+            enc->mb.mv[0] = mv_skip;
+            enc->mb.cost = 0;
+            interpolate_chroma(enc, mv_skip_a);
+
+            // Check that chroma SAD is not too big for the skip
+            for (uv = 1; uv <= 2; uv++)
+            {
+                pix_t *pred = enc->ptest + (uv - 1)*8;
+                pix_t *pix_mb_uv = mb_input_chroma(enc, uv);
+                int inp_stride = enc->inp.stride[uv];
+
+                if (enc->frame.cropping_flag && ((enc->mb.x + 1)*16  > enc->param.width || (enc->mb.y + 1)*16  > enc->param.height))
+                {
+                    // Speculative read beyond frame borders: make local copy of the macroblock.
+                    // TODO: same code used in mb_write() and mb_encode()
+                    pix_copy_cropped_mb(enc->scratch->mb_pix_store, 8, pix_mb_uv, enc->inp.stride[uv],
+                        MIN(8, enc->param.width/2  - enc->mb.x*8),
+                        MIN(8, enc->param.height/2 - enc->mb.y*8));
+                    pix_mb_uv = enc->scratch->mb_pix_store;
+                    inp_stride = 8;
+                }
+
+                sad_uv = h264e_sad_mb_unlaign_wh(pix_mb_uv, inp_stride, pred, point(8, 8));
+                if (sad_uv >= g_skip_thr_inter[enc->rc.qp])
+                {
+                    break;
+                }
+            }
+            if (uv == 3)
+            {
+                return;
+            }
+        }
+
+        if (enc->run_param.encode_speed < 1) // enable 8x16, 16x8 and 8x8 partitions
+        {
+            mb_inter_partition(sad4, prefered_modes);
+        }
+
+        //sad_skip += me_mv_cost(mv_skip, mv_pred_16x16, enc->rc.qp);
+
+        // Skip SAD is too big: use the skip predictor as a diamond search start point candidate
+        mv_best = mv_round_qpel(mv_skip);
+        mv_cand[ncand++] = mv_best;
+        if (!((mv_skip.s.x | mv_skip.s.y) & 3))
+        {
+            sad_best = sad_skip;//+ me_mv_cost(mv_best, mv_pred_16x16, enc->rc.qp)
+            mv_cand_cost_best = me_mv_cand_cost(mv_skip, mv_pred_16x16, enc->rc.qp);
+            //mv_cand_cost_best = me_mv_cand_cost(mv_skip, point(0,0), enc->rc.qp);
+            j = 1;
+        }
+    }
+
+    mv_cand[ncand++] = mv_pred_16x16;
+    ncand += me_mv_medianpredictor_get_cand(enc, mv_cand + ncand);
+
+    if (enc->mb.x <= 0)
+    {
+        mv_cand[ncand++] = point(8*4, 0);
+    }
+    if (enc->mb.y <= 0)
+    {
+        mv_cand[ncand++] = point(0, 8*4);
+    }
+
+    mv_cand[ncand++] = enc->mv_clusters[0];
+    mv_cand[ncand++] = enc->mv_clusters[1];
+
+    assert(ncand <= MAX_MV_CAND);
+    ncand = me_mv_refine_cand(mv_cand, ncand);
+
+    for (/*j = 0*/; j < ncand; j++)
+    {
+        point_t mv = mb_abs_mv(enc, mv_cand[j]);
+        if (mv_in_rect(mv, &enc->frame.mv_limit))
+        {
+            int mv_cand_cost = me_mv_cand_cost(mv_cand[j], mv_pred_16x16, enc->rc.qp);
+
+            int *sad4 = cand_sad4[j];
+            off = ((mv.s.y + 0) >> 2)*ref_stride + ((mv.s.x + 0) >> 2);
+            sad = h264e_sad_mb_unlaign_8x8(ref_yuv + off, ref_stride, enc->scratch->mb_pix_inp, sad4);
+
+            if (enc->run_param.encode_speed < 1) // enable 8x16, 16x8 and 8x8 partitions
+            {
+                mb_inter_partition(sad4, prefered_modes);
+            }
+
+            if (sad + mv_cand_cost < sad_best + mv_cand_cost_best)
+            //if (sad < sad_best)
+            {
+                mv_cand_cost_best = mv_cand_cost;
+                sad_best = sad;
+                mv_best = mv_cand[j];
+            }
+        }
+    }
+
+    sad_best += me_mv_cost(mv_best, mv_pred_16x16, enc->rc.qp);
+
+    {
+        int mb_type;
+        point_t wh, part, mvpred_ctx[12], part_mv[4][16], part_mvd[4][16];
+        pix_t *store = enc->scratch->mb_pix_store;
+        pix_t *pred_best = store, *pred_test = store + 256;
+
+#define MAX8X8_MODES 4
+        me_mv_medianpredictor_save_ctx(enc, mvpred_ctx);
+        enc->mb.cost = 0xffffff;
+        for (mb_type = 0; mb_type < MAX8X8_MODES; mb_type++)
+        {
+            static const int nbits[4] = { 1, 4, 4, 12 };
+            int imv = 0;
+            int part_sad = MUL_LAMBDA(nbits[mb_type], g_lambda_q4[enc->rc.qp]);
+
+            if (!prefered_modes[mb_type]) continue;
+
+            wh.s.x = (mb_type & 2) ? 8 : 16;
+            wh.s.y = (mb_type & 1) ? 8 : 16;
+            part = point(0, 0);
+            for (;;)
+            {
+                rectangle_t range;
+                pix_t *diamond_out;
+                point_t mv, mv_pred, mvabs = mb_abs_mv(enc, mv_best);
+                me_mv_set_range(&mvabs, &range, &enc->frame.mv_limit, enc->mb.y*16*4 + part.s.y*4);
+
+                mv_pred = me_mv_medianpredictor_get(enc, part, wh);
+
+                if (mb_type)
+                {
+                    mvabs = mv_round_qpel(mb_abs_mv(enc, mv_pred));
+                    me_mv_set_range(&mvabs, &range, &enc->frame.mv_limit, enc->mb.y*16*4 + part.s.y*4);
+                    off = ((mvabs.s.y >> 2) + part.s.y)*ref_stride + ((mvabs.s.x >> 2) + part.s.x);
+                    sad_best = h264e_sad_mb_unlaign_wh(ref_yuv + off, ref_stride, enc->scratch->mb_pix_inp + part.s.y*16 + part.s.x, wh)
+                        + me_mv_cost(mvabs,
+                        //mv_pred,
+                        mb_abs_mv(enc, mv_pred),
+                        enc->rc.qp);
+                }
+
+                part_sad += me_search_diamond(enc, ref_yuv + part.s.y*ref_stride + part.s.x,
+                    enc->scratch->mb_pix_inp + part.s.y*16 + part.s.x, ref_stride, &mvabs, &range, enc->rc.qp,
+                    mb_abs_mv(enc, mv_pred), sad_best, wh,
+                    store, &diamond_out, mb_type ? (mb_type == 2 ? 8 : 128) : 256);
+
+                if (!mb_type)
+                {
+                    pred_test = diamond_out;
+                    if (pred_test < store + 2*256)
+                    {
+                        pred_best = (pred_test == store ? store + 256 : store);
+                        store += 2*256;
+                    } else
+                    {
+                        pred_best = (pred_test == (store + 512) ? store + 512 + 256 : store + 512);
+                    }
+                } else
+                {
+                    h264e_copy_8x8(pred_test + part.s.y*16 + part.s.x, 16, diamond_out);
+                    if (mb_type < 3)
+                    {
+                        int part_off = (wh.s.x >> 4)*8 + (wh.s.y >> 4)*8*16;
+                        h264e_copy_8x8(pred_test + part_off + part.s.y*16 + part.s.x, 16, diamond_out + part_off);
+                    }
+                }
+
+                mv = mv_sub(mvabs, point(enc->mb.x*16*4, enc->mb.y*16*4));
+
+                part_mvd[mb_type][imv] = mv_sub(mv, mv_pred);
+                part_mv[mb_type][imv++] = mv;
+
+                me_mv_medianpredictor_put(enc, part.s.x >> 2, part.s.y >> 2, wh.s.x >> 2, wh.s.y >> 2, mv);
+
+                part.s.x = (part.s.x + wh.s.x) & 15;
+                if (!part.s.x)
+                {
+                    part.s.y = (part.s.y + wh.s.y) & 15;
+                    if (!part.s.y) break;
+                }
+            }
+
+            me_mv_medianpredictor_restore_ctx(enc, mvpred_ctx);
+
+            if (part_sad < enc->mb.cost)
+            {
+                SWAP(pix_t*, pred_best, pred_test);
+                enc->mb.cost = part_sad;
+                enc->mb.type = mb_type;
+            }
+        }
+        enc->pbest = pred_best;
+        enc->ptest = pred_test;
+        memcpy(enc->mb.mv,  part_mv [enc->mb.type], 16*sizeof(point_t));
+        memcpy(enc->mb.mvd, part_mvd[enc->mb.type], 16*sizeof(point_t));
+
+        if (enc->mb.cost > sad_skip)
+        {
+            enc->mb.type = 0;
+            enc->mb.cost = sad_skip + me_mv_cand_cost(mv_skip, mv_pred_16x16, enc->rc.qp);
+            enc->mb.mv [0] = mv_skip;
+            enc->mb.mvd[0] = mv_sub(mv_skip, mv_pred_16x16);
+
+            assert(mv_in_rect(mv_skip_a, &enc->frame.mv_qpel_limit)) ;
+            interpolate_luma(ref_yuv, ref_stride, mv_skip_a, point(16, 16), enc->pbest);
+            interpolate_chroma(enc, mv_skip_a);
+        }
+    }
+}
+
+/************************************************************************/
+/*      Deblock filter                                                  */
+/************************************************************************/
+#define MB_FLAG_SVC_INTRA 1
+#define MB_FLAG_SLICE_START_DEBLOCK_2 2
+
+/**
+*   Set deblock filter strength
+*/
+static void df_strength(deblock_filter_t *df, int mb_type, int mbx, uint8_t *strength, int IntraBLFlag)
+{
+    uint8_t *sv = strength;
+    uint8_t *sh = strength + 16;
+    int flag = df->nzflag;
+    df->df_nzflag[mbx] = (uint8_t)(flag >> 20);
+    /*
+        nzflag represents the macroblock and its neighbors with 24 flag bits:
+        0 1 2 3
+      4 5 6 7 8
+      A B C D E
+      F G H I J
+      K L M N O
+    */
+    (void)IntraBLFlag;
+#if H264E_SVC_API
+    if (IntraBLFlag & MB_FLAG_SVC_INTRA)
+    {
+        int ccloop = 4;
+        do
+        {
+            int cloop = 4;
+            do
+            {
+                int v = 0;
+                if (flag & 3 << 4)
+                {
+                    v = 1;
+                }
+
+                *sv = (uint8_t)v; sv += 4;
+
+                v = 0;
+                if (flag & 33)
+                {
+                    v = 1;
+                }
+
+                *sh++ = (uint8_t)v;
+
+                flag >>= 1;
+
+            } while(--cloop);
+            flag >>= 1;
+            sv -= 15;
+
+        } while(--ccloop);
+    } else
+#endif
+    {
+        if (mb_type < 5)
+        {
+            int ccloop = 4;
+            point_t *mv = df->df_mv;
+            do
+            {
+                int cloop = 4;
+                do
+                {
+                    int v = 0;
+                    if (flag & 3 << 4)
+                    {
+                        v = 2;
+                    } else if (mv_differs3(mv[4], mv[5]))
+                    {
+                        v = 1;
+                    }
+                    *sv = (uint8_t)v; sv += 4;
+
+                    v = 0;
+                    if (flag & 33)
+                    {
+                        v = 2;
+                    } else if (mv_differs3(mv[0], mv[5]))
+                    {
+                        v = 1;
+                    }
+                    *sh++ = (uint8_t)v;
+
+                    flag >>= 1;
+                    mv++;
+                } while(--cloop);
+                flag >>= 1;
+                sv -= 15;
+                mv++;
+            } while(--ccloop);
+        } else
+        {
+            // Deblock mode #3 (intra)
+            ((uint32_t*)(sv))[1] = ((uint32_t*)(sv))[2] = ((uint32_t*)(sv))[3] =             // for inner columns
+            ((uint32_t*)(sh))[1] = ((uint32_t*)(sh))[2] = ((uint32_t*)(sh))[3] = 0x03030303; // for inner rows
+        }
+        if ((mb_type >= 5 || df->mb_type[mbx - 1] >= 5)) // speculative read
+        {
+            ((uint32_t*)(strength))[0] = 0x04040404;    // Deblock mode #4 (strong intra) for left column
+        }
+        if ((mb_type >= 5 || df->mb_type[mbx    ] >= 5))
+        {
+            ((uint32_t*)(strength))[4] = 0x04040404;    // Deblock mode #4 (strong intra) for top row
+        }
+    }
+    df->mb_type[mbx] = (int8_t)mb_type;
+}
+
+/**
+*   Run deblock for current macroblock
+*/
+static void mb_deblock(deblock_filter_t *df, int mb_type, int qp_this, int mbx, int mby, H264E_io_yuv_t *mbyuv, int IntraBLFlag)
+{
+    int i, cr, qp, qp_left, qp_top;
+    deblock_params_t par;
+    uint8_t *alpha = par.alpha; //[2*2];
+    uint8_t *beta  = par.beta;  //[2*2];
+    uint32_t *strength32  = par.strength32; //[4*2]; // == uint8_t strength[16*2];
+    uint8_t *strength = (uint8_t *)strength32;
+    uint8_t *tc0 = par.tc0; //[16*2];
+
+    df_strength(df, mb_type, mbx, strength, IntraBLFlag);
+    if (!mbx || (IntraBLFlag & MB_FLAG_SLICE_START_DEBLOCK_2))
+    {
+        strength32[0] = 0;
+    }
+
+    if (!mby)
+    {
+        strength32[4] = 0;
+    }
+
+    qp_top = df->df_qp[mbx];
+    qp_left = df->df_qp[mbx - 1];
+    df->df_qp[mbx] = (uint8_t)qp_this;
+
+    cr = 0;
+    for (;;)
+    {
+        const uint8_t *lut;
+        if (*((uint32_t*)strength))
+        {
+            qp = (qp_left + qp_this + 1) >> 1;
+            lut = g_a_tc0_b[-10 + qp + ALPHA_OFS];
+            alpha[0] = lut[0];
+            beta[0]  = lut[4 + (BETA_OFS - ALPHA_OFS)*5];
+            for (i = 0; i < 4; i++) tc0[i] = lut[strength[i]];
+        }
+        if (*((uint32_t*)(strength + 16)))
+        {
+            qp = (qp_top + qp_this + 1) >> 1;
+            lut = g_a_tc0_b[-10 + qp + ALPHA_OFS];
+
+            alpha[2]  = lut[0];
+            beta[2] = lut[4 + (BETA_OFS - ALPHA_OFS)*5];
+            for (i = 0; i < 4; i++) tc0[16 + i] = lut[strength[16 + i]];
+        }
+
+        lut = g_a_tc0_b[-10 + qp_this + ALPHA_OFS];
+        alpha[3] = alpha[1] = lut[0];
+        beta[3] = beta[1] = lut[4 + (BETA_OFS - ALPHA_OFS)*5];
+        for (i = 4; i < 16; i++)
+        {
+            tc0[i] = lut[strength[i]];
+            tc0[16 + i] = lut[strength[16 + i]];
+        }
+        if (cr)
+        {
+            int *t = (int *)tc0;
+            t[1] = t[2];         // TODO: needed only for OMX
+            t[5] = t[6];
+            i = 2;
+            do
+            {
+                h264e_deblock_chroma(mbyuv->yuv[i], mbyuv->stride[i], &par);
+            } while (--i);
+            break;
+        }
+        h264e_deblock_luma(mbyuv->yuv[0], mbyuv->stride[0], &par);
+
+        qp_this = qpy2qpc[qp_this + DQP_CHROMA];
+        qp_left = qpy2qpc[qp_left + DQP_CHROMA];
+        qp_top = qpy2qpc[qp_top + DQP_CHROMA];
+        cr++;
+    }
+}
+
+/************************************************************************/
+/*      Macroblock encoding                                             */
+/************************************************************************/
+/**
+*   Macroblock encoding
+*/
+static void mb_encode(h264e_enc_t *enc, int enc_type)
+{
+    pix_t *top = enc->top_line + 48 + enc->mb.x*32;
+    pix_t *left = enc->top_line;
+    int avail = enc->mb.avail = mb_avail_flag(enc);
+    int base_mode = 0;
+
+    if (enc->frame.cropping_flag && ((enc->mb.x + 1)*16 > enc->param.width || (enc->mb.y + 1)*16 > enc->param.height))
+    {
+        pix_copy_cropped_mb(enc->scratch->mb_pix_inp, 16, mb_input_luma(enc), enc->inp.stride[0],
+             MIN(16, enc->param.width  - enc->mb.x*16),
+             MIN(16, enc->param.height - enc->mb.y*16));
+    } else
+    {
+        // cache input macroblock
+        h264e_copy_16x16(enc->scratch->mb_pix_inp, 16, mb_input_luma(enc), enc->inp.stride[0]);
+    }
+
+    if (!(avail & AVAIL_L)) left = NULL;
+    if (!(avail & AVAIL_T)) top  = NULL;
+
+    enc->pbest = enc->scratch->mb_pix_store;
+    enc->ptest = enc->pbest + 256;
+    enc->mb.type = 0;
+    enc->mb.cost = 0x7FFFFFFF;
+
+    if (enc->slice.type == SLICE_TYPE_P)
+    {
+        inter_choose_mode(enc);
+    }
+#if H264E_SVC_API
+    else if (enc_type > 0 && enc->param.inter_layer_pred_flag)
+    {
+        base_mode = 1;
+        enc->mb.type = 6;
+        h264e_copy_16x16(enc->pbest, 16, (enc->ref.yuv[0] + (enc->mb.x + enc->mb.y*enc->ref.stride[0])*16), enc->ref.stride[0]);
+        h264e_copy_8x8_s(enc->ptest, 16, (enc->ref.yuv[1] + (enc->mb.x + enc->mb.y*enc->ref.stride[1])*8), enc->ref.stride[1]);
+        h264e_copy_8x8_s(enc->ptest + 8, 16, (enc->ref.yuv[2] + (enc->mb.x + enc->mb.y*enc->ref.stride[2])*8), enc->ref.stride[2]);
+
+        goto _WRITE_MB;
+    }
+#endif
+
+    if (enc->mb.type >= 0)
+    {
+        intra_choose_16x16(enc, left, top, avail);
+        if (enc->run_param.encode_speed < 2 || enc->slice.type != SLICE_TYPE_P) // enable intra4x4 on P slices
+        {
+            intra_choose_4x4(enc);
+        }
+    }
+
+    if (enc->mb.type < 5)
+    {
+        mv_clusters_update(enc, enc->mb.mv[0]);
+    }
+
+    if (enc->mb.type >= 5)
+    {
+        pix_t *pred = enc->ptest;
+        h264e_intra_predict_chroma(pred, left + 16, top + 16, enc->mb.i16.pred_mode_luma);
+    } else
+    {
+        interpolate_chroma(enc, mb_abs_mv(enc, enc->mb.mv[0]));
+    }
+
+#if H264E_SVC_API
+_WRITE_MB:
+#endif
+    mb_write(enc, enc_type, base_mode);
+
+    if (!enc->speed.disable_deblock)
+    {
+        int mbx = enc->mb.x;
+        int mby = enc->mb.y;
+#if H264E_MAX_THREADS
+        if (enc->param.max_threads > 1)
+        {   // Avoid deblock across slice border
+            if (enc->mb.num < enc->slice.start_mb_num + enc->frame.nmbx)
+                mby = 0;
+            if (enc->mb.num == enc->slice.start_mb_num)
+            {
+                base_mode |= MB_FLAG_SLICE_START_DEBLOCK_2;
+            }
+        }
+#endif
+        mb_deblock(&enc->df, enc->mb.type, enc->rc.prev_qp, mbx, mby, &enc->dec, base_mode);
+    }
+}
+
+
+/************************************************************************/
+/*      Rate-control                                                    */
+/************************************************************************/
+
+/**
+*   @return zero threshold for given rounding offset
+*/
+static uint16_t rc_rnd2thr(int round, int q)
+{
+    int b, thr = 0;
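+    // Greedy MSB-first bit search: keep each candidate bit of thr only while (thr|b)*q
+    // still fits below 0x10000 - round, i.e. find the largest Q16 threshold whose
+    // scaled value stays within the complement of the rounding offset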
+    for (b = 0x8000; b; b >>= 1)
+    {
+        int t = (thr | b)*q;
+        if (t <= 0x10000 - round)  // TODO: suspected error: should this be '<' ?
+        {
+            thr |= b;
+        }
+    }
+    return (uint16_t)thr;
+}
+
+/**
+*   Set quantizer constants (deadzone and rounding) for given QP
+*/
+static void rc_set_qp(h264e_enc_t *enc, int qp)
+{
+    qp = MIN(qp, enc->run_param.qp_max);
+    qp = MAX(qp, enc->run_param.qp_min);
+    qp = MIN(qp, 51);   // avoid VC2010 static analyzer warning
+
+    if (enc->rc.qp != qp)
+    {
+        static const int16_t g_quant_coeff[6*6] =
+        {
+            //    0         2         1
+            13107, 10, 8066, 13, 5243, 16,
+            11916, 11, 7490, 14, 4660, 18,
+            10082, 13, 6554, 16, 4194, 20,
+             9362, 14, 5825, 18, 3647, 23,
+             8192, 16, 5243, 20, 3355, 25,
+             7282, 18, 4559, 23, 2893, 29
+            // 0 2 0 2
+            // 2 1 2 1
+            // 0 2 0 2
+            // 2 1 2 1
+        };
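+        // Each pair is (quantization multiplier, dequantization scale) for one of the three
+        // coefficient classes of the 4x4 transform, one row per QP%6; the class of each
+        // position follows the 0/2/1 pattern sketched above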
+
+        int cloop = 2;
+        enc->rc.qp = qp;
+
+        do
+        {
+            uint16_t *qdat0 = enc->rc.qdat[2 - cloop];
+            uint16_t *qdat  = enc->rc.qdat[2 - cloop];
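+            // qp*86 >> 9 equals qp/6 for every qp in the 0..51 range used here
+            // (86/512 ~ 1/6), avoiding an integer division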
+            int qp_div6 = qp*86 >> 9;
+            int qp_mod6 = qp - qp_div6*6;
+            const int16_t *quant_coeff = g_quant_coeff + qp_mod6*6; // TODO: need to calculate qp%6*6
+            int i = 3;
+
+            // Quant/dequant multiplier
+            do
+            {
+                *qdat++ = *quant_coeff++ << 1 >> qp_div6;
+                *qdat++ = *quant_coeff++ << qp_div6;
+            } while(--i);
+
+            // quantizer deadzone for P & chroma
+            *qdat++ = enc->slice.type == SLICE_TYPE_P ? g_rnd_inter[qp] : g_deadzonei[qp];
+            // quantizer deadzone for I
+            *qdat++ = g_deadzonei[qp];
+
+            *qdat++ = g_thr_inter[qp]  - 0x7fff;
+            *qdat++ = g_thr_inter2[qp] - 0x7fff;
+
+            qdat[0] = qdat[2] = rc_rnd2thr(g_thr_inter[qp] - 0x7fff, qdat0[0]);
+            qdat[1] = qdat[3] =
+            qdat[4] = qdat[6] = rc_rnd2thr(g_thr_inter[qp] - 0x7fff, qdat0[2]);
+            qdat[5] = qdat[7] = rc_rnd2thr(g_thr_inter[qp] - 0x7fff, qdat0[4]);
+            qdat += 8;
+            qdat[0] = qdat[2] = rc_rnd2thr(g_thr_inter2[qp] - 0x7fff, qdat0[0]);
+            qdat[1] = qdat[3] =
+            qdat[4] = qdat[6] = rc_rnd2thr(g_thr_inter2[qp] - 0x7fff, qdat0[2]);
+            qdat[5] = qdat[7] = rc_rnd2thr(g_thr_inter2[qp] - 0x7fff, qdat0[4]);
+            qdat += 8;
+            qdat[0] = qdat[2] = qdat0[0];
+            qdat[1] = qdat[3] =
+            qdat[4] = qdat[6] = qdat0[2];
+            qdat[5] = qdat[7] = qdat0[4];
+            qdat += 8;
+            qdat[0] = qdat[2] = qdat0[1];
+            qdat[1] = qdat[3] =
+            qdat[4] = qdat[6] = qdat0[3];
+            qdat[5] = qdat[7] = qdat0[5];
+
+            qp = qpy2qpc[qp + DQP_CHROMA];
+        } while (--cloop);
+    }
+}
+
+/**
+*   Estimate frame bit budget and QP
+*
+*   How bit budget allocated?
+*   ~~~~~~~~~~~~~~~~~~~~~~~~~
+*   1. Estimate target size of I and P macroblock, assuming same quality
+*   2. Estimate I peak size
+*   3. Estimate desired stationary VBV level
+*
+*/
+static int rc_frame_start(h264e_enc_t *enc, int is_intra, int is_refers_to_long_term)
+{
+    unsigned np = MIN(enc->param.gop - 1u, 63u);
+    int nmb = enc->frame.nmb;
+
+    int qp = -1, add_bits, bit_budget = enc->run_param.desired_frame_bytes*8;
+    int nominal_p, gop_bits, stationary_vbv_level;
+    uint32_t peak_factor_q16;
+
+    // Estimate QP
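+    // pick the smallest qp whose expected GOP size (np P-frames plus one I-frame,
+    // estimated per macroblock from bits_per_mb[]) fits within np+1 average frame budgets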
+    do
+    {
+        qp++;
+        gop_bits = bits_per_mb[0][qp]*np + bits_per_mb[1][qp];
+    } while (gop_bits*nmb > (int)(np + 1)*enc->run_param.desired_frame_bytes*8 && qp < 40);
+
+    /*
+    *   desired*gop = i + p*(gop-1);   i/p = alpha;
+    *   p = desired * gop / (gop-1+alpha) and i = p*alpha or i = (desired-p)*gop + p;
+    */
+    peak_factor_q16 = div_q16(bits_per_mb[1][qp] << 16, bits_per_mb[0][qp] << 16);
+    if (np)
+    {
+        uint32_t ratio_q16 = div_q16((np + 1) << 16, (np << 16) + peak_factor_q16);
+        nominal_p = mul32x32shr16(enc->run_param.desired_frame_bytes*8, ratio_q16);
+    } else
+    {
+        nominal_p = 0;
+    }
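+    // Illustrative numbers only: with gop = 20 and an I/P size ratio alpha of about 4,
+    // nominal_p is roughly 20/23 of the desired frame bits and the I-frame budget about 3.5x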
+
+    stationary_vbv_level = MIN(enc->param.vbv_size_bytes*8 >> 4, enc->run_param.desired_frame_bytes*8);
+
+    if (is_intra)
+    {
+        int nominal_i = mul32x32shr16(nominal_p, peak_factor_q16);
+        add_bits = nominal_i - bit_budget;
+    }
+#if H264E_RATE_CONTROL_GOLDEN_FRAMES
+    else if (is_refers_to_long_term)
+    {
+        int d_qp = enc->rc.max_dqp - enc->rc.dqp_smooth;
+        unsigned peak_factor_golden_q16;
+        int nominal_golden;
+        d_qp = MAX(d_qp, 2);
+        d_qp = MIN(d_qp, 12);
+        d_qp = d_qp * 4 * 85 >> 8;  // approximately d_qp * 16 / 12
+
+        peak_factor_golden_q16 = (peak_factor_q16 - (1 << 16)) * d_qp >> 4;
+        nominal_golden = nominal_p + mul32x32shr16(nominal_p, peak_factor_golden_q16);
+        add_bits = nominal_golden - bit_budget;
+    }
+#endif
+    else
+    {
+        add_bits = nominal_p - bit_budget;
+
+        // drift to stationary level
+        if (enc->param.vbv_size_bytes)
+        {
+            add_bits += (enc->rc.vbv_target_level - enc->rc.vbv_bits) >> 4;
+        }
+    }
+    if (enc->param.vbv_size_bytes)
+    {
+        add_bits = MIN(add_bits, (enc->param.vbv_size_bytes*8*7 >> 3) - enc->rc.vbv_bits);
+    }
+
+    bit_budget += add_bits;
+    bit_budget = MIN(bit_budget, enc->run_param.desired_frame_bytes*8*16);
+    bit_budget = MAX(bit_budget, enc->run_param.desired_frame_bytes*8 >> 2);
+
+#if H264E_RATE_CONTROL_GOLDEN_FRAMES
+    if (is_intra || is_refers_to_long_term)
+#else
+    if (is_intra)
+#endif
+    {
+        // Increase VBV target level due to the I-frame load: this avoids QP adaptation after the I-frame
+        enc->rc.vbv_target_level = enc->rc.vbv_bits + bit_budget - enc->run_param.desired_frame_bytes*8;
+    }
+
+    // Slow drift of VBV target to stationary level...
+    enc->rc.vbv_target_level -= enc->run_param.desired_frame_bytes*8 - nominal_p;
+
+    // ...until stationary level reached
+    enc->rc.vbv_target_level = MAX(enc->rc.vbv_target_level, stationary_vbv_level);
+
+    enc->rc.bit_budget = bit_budget;
+
+    if (enc->param.fine_rate_control_flag && enc->frame.num)
+    {
+        qp = enc->rc.qp_smooth >> 8;
+    } else
+    {
+
+#if H264E_RATE_CONTROL_GOLDEN_FRAMES
+        if (is_refers_to_long_term)
+        {
+            for (qp = 0; qp < 42 - 1; qp++)
+            {
+                if (((bits_per_mb[0][qp] + bits_per_mb[1][qp]) >> 1)*nmb < bit_budget)
+                    break;
+            }
+        } else
+#endif
+        {
+            const uint16_t *bits = bits_per_mb[!!is_intra];
+            for (qp = 0; qp < 42 - 1; qp++)
+            {
+                if (bits[qp]*nmb < bit_budget)
+                {
+                    break;
+                }
+            }
+        }
+        qp += MIN_QP;
+
+#if H264E_RATE_CONTROL_GOLDEN_FRAMES
+        if (is_refers_to_long_term)
+        {
+            int dqp = MAX(enc->rc.max_dqp, enc->rc.dqp_smooth);
+            dqp  = MIN(dqp, enc->rc.dqp_smooth + 6);
+            qp += dqp;
+            qp = MAX(enc->rc.prev_qp, qp);
+        } else
+#endif
+        {
+            qp += enc->rc.dqp_smooth;
+        }
+
+        // If the reference frame has a high QP, motion compensation is less effective, so the QP should be increased
+        if (enc->rc.prev_qp > qp + 1)
+        {
+            qp = (enc->rc.prev_qp + qp + 1)/2;
+        }
+    }
+
+    enc->rc.qp = 0; // force
+    rc_set_qp(enc, qp);
+    qp = enc->rc.qp;
+
+    enc->rc.qp_smooth = qp << 8;
+    enc->rc.prev_qp = qp;
+
+    return (enc->rc.vbv_bits > enc->param.vbv_size_bytes*8);
+}
+
+/**
+*   Update rate-control state after frame encode
+*/
+static void rc_frame_end(h264e_enc_t *enc, int intra_flag, int skip_flag, int is_refers_to_long_term)
+{
+    // 1. Update the adaptive QP offset
+    if (!skip_flag /*&& !is_refers_to_long_term*/)
+    {
+        int qp, nmb = enc->frame.nmb;
+        // a posteriori QP estimation
+        for (qp = 0; qp != 41 && bits_per_mb[intra_flag][qp]*nmb > (int)enc->out_pos*8 - 32; qp++) {/*no action*/}
+
+        qp += MIN_QP;
+
+        if (!is_refers_to_long_term)
+        {
+            if ((enc->rc.qp_smooth >> 8) - enc->rc.dqp_smooth < qp - 1)
+            {
+                enc->rc.dqp_smooth--;
+            } else if ((enc->rc.qp_smooth >> 8) - enc->rc.dqp_smooth > qp + 1)
+            {
+                enc->rc.dqp_smooth++;
+            }
+        }
+        if (intra_flag || is_refers_to_long_term)
+        {
+            enc->rc.max_dqp = enc->rc.dqp_smooth;
+        } else
+        {
+            enc->rc.max_dqp = MAX(enc->rc.max_dqp, (enc->rc.qp_smooth >> 8) - qp);
+        }
+    }
+
+    // 2. Update VBV model state
+    enc->rc.vbv_bits += enc->out_pos*8 - enc->run_param.desired_frame_bytes*8;
+
+    // 3. If VBV model used, handle overflow/underflow
+    if (enc->param.vbv_size_bytes)
+    {
+        if (enc->rc.vbv_bits < 0)       // VBV underflow
+        {
+            if (enc->param.vbv_underflow_stuffing_flag)
+            {
+                // put stuffing ('filler data')
+                nal_start(enc, 12); // filler_data_rbsp
+                do
+                {
+                    U(8, 0xFF);
+                    enc->rc.vbv_bits += 8;
+                } while (enc->rc.vbv_bits < 0);
+                nal_end(enc);
+            } else
+            {
+                // ignore underflow
+                enc->rc.vbv_bits = 0;
+            }
+        }
+        if (enc->rc.vbv_bits > enc->param.vbv_size_bytes*8) // VBV overflow
+        {
+            if (!enc->param.vbv_overflow_empty_frame_flag)
+            {
+                // ignore overflow
+                enc->rc.vbv_bits = enc->param.vbv_size_bytes*8;
+            }
+        }
+    } else
+    {
+        enc->rc.vbv_bits = 0;
+    }
+}
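+
+/*
+*   Example of the VBV update above (illustrative numbers only): with
+*   desired_frame_bytes = 2500, a frame that actually coded 4000 bytes raises
+*   vbv_bits by (4000 - 2500)*8 = 12000 bits; subsequent frames then get a
+*   smaller bit budget (and hence a higher QP) until the buffer drains back
+*   to the target level.
+*/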
+
+/**
+*   Update rate-control state after macroblock encode, set QP for next MB
+*/
+static void rc_mb_end(h264e_enc_t *enc)
+{
+    // bits used / MBs coded = bit budget / total MBs
+    int bits_coded = h264e_bs_get_pos_bits(enc->bs) + enc->out_pos*8 + 1;
+    int mb_coded = enc->mb.num; // after increment: 1, 2, ...
+    int err = bits_coded*enc->frame.nmb - enc->rc.bit_budget*mb_coded;
+    int d_err = err - enc->rc.prev_err;
+    int qp = enc->rc.qp;
+    assert(enc->mb.num);
+    enc->rc.prev_err = err;
+
+    if (err > 0 && d_err > 0)
+    {   // Increasing risk of overflow
+        if (enc->rc.stable_count < 3)
+        {
+            qp++;                       // State not stable: increase QP
+        }
+        enc->rc.stable_count = 0;       // Set state to "not stable"
+    } else if (err < 0 && d_err < 0)
+    {   // Increasing risk of underflow
+        if (enc->rc.stable_count < 3)
+        {
+            qp--;
+        }
+        enc->rc.stable_count = 0;
+    } else
+    {   // Stable state
+        enc->rc.stable_count++;
+    }
+    enc->rc.qp_smooth += qp - (enc->rc.qp_smooth >> 8);
+    qp = MIN(qp, enc->rc.prev_qp + 3);
+    qp = MAX(qp, enc->rc.prev_qp - 3);
+    rc_set_qp(enc, qp);
+}
+
+/************************************************************************/
+/*      Top-level API                                                   */
+/************************************************************************/
+
+#define ALIGN_128BIT(p) (void *)((uintptr_t)(((char*)(p)) + 15) & ~(uintptr_t)15)
+#define ALLOC(ptr, size) p = ALIGN_128BIT(p); if (enc) ptr = (void *)p; p += size;
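+
+/*
+*   Note: ALLOC is used in two passes over the same code path: when enc is
+*   NULL only the cursor p is advanced, so the caller learns how many bytes
+*   are needed; when enc is a real encoder the aligned pointer is also stored
+*   into the given member.
+*/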
+
+/**
+*   Internal allocator for persistent RAM
+*/
+static int enc_alloc(h264e_enc_t *enc, const H264E_create_param_t *par, unsigned char *p, int inp_buf_flag)
+{
+    unsigned char *p0 = p;
+    int nmbx = (par->width  + 15) >> 4;
+    int nmby = (par->height + 15) >> 4;
+    int nref_frames = 1 + par->max_long_term_reference_frames + par->const_input_flag;
+#if H264E_ENABLE_DENOISE
+    nref_frames += !!par->temporal_denoise_flag;
+#endif
+    ALLOC(enc->ref.yuv[0], ((nmbx + 2) * (nmby + 2) * 384) * nref_frames);
+    (void)inp_buf_flag;
+#if H264E_SVC_API
+    if (inp_buf_flag)
+    {
+        ALLOC(enc->inp.yuv[0], ((nmbx)*(nmby)*384)); /* input buffer for base layer */
+    }
+#endif
+    return (int)((p - p0) + 15) & ~15u;
+}
+
+/**
+*   Internal allocator for scratch RAM
+*/
+static int enc_alloc_scratch(h264e_enc_t *enc, const H264E_create_param_t *par, unsigned char *p)
+{
+    unsigned char *p0 = p;
+    int nmbx = (par->width  + 15) >> 4;
+    int nmby = (par->height + 15) >> 4;
+    ALLOC(enc->scratch, sizeof(scratch_t));
+    ALLOC(enc->out, nmbx * nmby * (384 + 2 + 10) * 3/2);
+
+    ALLOC(enc->nnz, nmbx*8 + 8);
+    ALLOC(enc->mv_pred, (nmbx*4 + 8)*sizeof(point_t));
+    ALLOC(enc->i4x4mode, nmbx*4 + 4);
+    ALLOC(enc->df.df_qp, nmbx);
+    ALLOC(enc->df.mb_type, nmbx);
+    ALLOC(enc->df.df_nzflag, nmbx);
+    ALLOC(enc->top_line, nmbx*32 + 32 + 16);
+    return (int)(p - p0);
+}
+
+/**
+*   Setup H264E_io_yuv_t structures
+*/
+static pix_t *io_yuv_set_pointers(pix_t *base, H264E_io_yuv_t *frm, int w, int h)
+{
+    int s = w + (16 + 16);    // guards
+    int i, guard = 16;
+    for (i = 0; i < 3; i++)
+    {
+        frm->stride[i] = s;
+        frm->yuv[i] = base + (s + 1)*guard;
+        base += s*(h + 2*guard);
+        if (!i) guard >>= 1, s >>= 1, h >>= 1;
+    }
+    return base;
+}
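+
+/*
+*   Layout sketch (illustrative numbers only): for a 320x240 frame the luma
+*   plane gets stride 320+32 = 352 and spans 352*(240+32) samples including
+*   the 16-sample guard band; the two chroma planes follow with stride 176,
+*   height 120 and an 8-sample guard each.
+*/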
+
+/**
+*   Verify encoder creation parameters. Return an error code, or 0 if the parameters are valid.
+*/
+static int enc_check_create_params(const H264E_create_param_t *par)
+{
+    if (!par)
+    {
+        return H264E_STATUS_BAD_ARGUMENT;   // NULL argument
+    }
+    if ((int)(par->vbv_size_bytes | par->gop) < 0)
+    {
+        return H264E_STATUS_BAD_PARAMETER;  // negative GOP or VBV size
+    }
+    if (par->width <= 0 || par->height <= 0)
+    {
+        return H264E_STATUS_BAD_PARAMETER;  // non-positive frame size
+    }
+    if ((unsigned)(par->const_input_flag | par->fine_rate_control_flag |
+        par->vbv_overflow_empty_frame_flag | par->vbv_underflow_stuffing_flag) > 1)
+    {
+        return H264E_STATUS_BAD_PARAMETER;  // Any flag is not 0 or 1
+    }
+    if ((unsigned)par->max_long_term_reference_frames > MAX_LONG_TERM_FRAMES)
+    {
+        return H264E_STATUS_BAD_PARAMETER;  // Too many long-term reference frames requested
+    }
+    if ((par->width | par->height) & 1)
+    {
+        return H264E_STATUS_SIZE_NOT_MULTIPLE_2; // frame size must be multiple of 2
+    }
+    if (((par->width | par->height) & 15) && !par->const_input_flag)
+    {
+        // if input buffer reused as scratch (par->const_input_flag == 0)
+        // frame size must be multiple of 16
+        return H264E_STATUS_SIZE_NOT_MULTIPLE_16;
+    }
+    return H264E_STATUS_SUCCESS;
+}
+
+static int H264E_sizeof_one(const H264E_create_param_t *par, int *sizeof_persist, int *sizeof_scratch, int inp_buf_flag)
+{
+    int error = enc_check_create_params(par);
+    if (!sizeof_persist || !sizeof_scratch)
+    {
+        error = H264E_STATUS_BAD_ARGUMENT;
+    }
+    if (error)
+    {
+        return error;
+    }
+
+    *sizeof_persist = enc_alloc(NULL, par, (void*)(uintptr_t)1, inp_buf_flag) + sizeof(h264e_enc_t);
+#if H264E_MAX_THREADS > 1
+    *sizeof_scratch = enc_alloc_scratch(NULL, par, (void*)(uintptr_t)1) * (par->max_threads + 1);
+#else
+    *sizeof_scratch = enc_alloc_scratch(NULL, par, (void*)(uintptr_t)1);
+#endif
+    return error;
+}
+
+static int H264E_init_one(h264e_enc_t *enc, const H264E_create_param_t *opt, int inp_buf_flag)
+{
+    pix_t *base;
+#if H264E_CONFIGS_COUNT > 1
+    init_vft(opt->enableNEON);
+#endif
+    memset(enc, 0, sizeof(*enc));
+
+    enc->frame.nmbx = (opt->width  + 15) >> 4;
+    enc->frame.nmby = (opt->height + 15) >> 4;
+    enc->frame.nmb = enc->frame.nmbx*enc->frame.nmby;
+    enc->frame.w = enc->frame.nmbx*16;
+    enc->frame.h = enc->frame.nmby*16;
+    enc->frame.mv_limit.tl = point(-MV_GUARD*4, -MV_GUARD*4);
+    enc->frame.mv_qpel_limit.tl = mv_add(enc->frame.mv_limit.tl, point(4*4, 4*4));
+    enc->frame.mv_limit.br = point((enc->frame.nmbx*16 - (16 - MV_GUARD))*4, (enc->frame.nmby*16 - (16 - MV_GUARD))*4);
+    enc->frame.mv_qpel_limit.br = mv_add(enc->frame.mv_limit.br, point(-4*4, -4*4));
+    enc->frame.cropping_flag = !!((opt->width | opt->height) & 15);
+    enc->param = *opt;
+
+    enc_alloc(enc, opt, (void*)(enc + 1), inp_buf_flag);
+
+#if H264E_SVC_API
+    if (inp_buf_flag)
+    {
+        enc->inp.yuv[1] = enc->inp.yuv[0] + enc->frame.w*enc->frame.h;
+        enc->inp.yuv[2] = enc->inp.yuv[1] + enc->frame.w*enc->frame.h/4;
+        enc->inp.stride[0] = enc->frame.w;
+        enc->inp.stride[1] = enc->frame.w/2;
+        enc->inp.stride[2] = enc->frame.w/2;
+        enc->dec = enc->inp;
+    }
+#endif
+
+    base = io_yuv_set_pointers(enc->ref.yuv[0], &enc->ref, enc->frame.nmbx*16, enc->frame.nmby*16);
+#if H264E_ENABLE_DENOISE
+    if (enc->param.temporal_denoise_flag)
+    {
+        pix_t *p = base;
+        base = io_yuv_set_pointers(base, &enc->denoise, enc->frame.nmbx*16, enc->frame.nmby*16);
+        while (p < base) *p++ = 0;
+    }
+#endif
+    if (enc->param.const_input_flag)
+    {
+        base = io_yuv_set_pointers(base, &enc->dec, enc->frame.nmbx*16, enc->frame.nmby*16);
+    }
+    if (enc->param.max_long_term_reference_frames)
+    {
+        H264E_io_yuv_t t;
+        int i;
+        for (i = 0; i < enc->param.max_long_term_reference_frames; i++)
+        {
+            base = io_yuv_set_pointers(base, &t, enc->frame.nmbx*16, enc->frame.nmby*16);
+            enc->lt_yuv[i][0] = t.yuv[0];
+            enc->lt_yuv[i][1] = t.yuv[1];
+            enc->lt_yuv[i][2] = t.yuv[2];
+        }
+    }
+    return H264E_STATUS_SUCCESS;
+}
+
+/**
+*   Encoder initialization
+*   See header file for details.
+*/
+int H264E_init(h264e_enc_t *enc, const H264E_create_param_t *opt)
+{
+    h264e_enc_t *enc_curr = enc;
+    int i, ret;
+    (void)i;
+
+    ret = H264E_init_one(enc_curr, opt, 0);
+
+#if H264E_SVC_API
+    for (i = opt->num_layers; i > 1; i--)
+    {
+        H264E_create_param_t opt_next = enc_curr->param;
+        int sizeof_persist = 0, sizeof_scratch = 0;
+
+        opt_next.const_input_flag = 0;
+        opt_next.temporal_denoise_flag = 0;
+        opt_next.width =  opt_next.width >> 1;
+        opt_next.width += opt_next.width & 1;
+        opt_next.height = opt_next.height >> 1;
+        opt_next.height+= opt_next.height & 1;
+
+        opt_next.vbv_size_bytes <<= 2;
+
+        H264E_sizeof_one(&enc_curr->param, &sizeof_persist, &sizeof_scratch, 1);
+        enc_curr = enc_curr->enc_next = (h264e_enc_t *)((char *)enc_curr + sizeof_persist);
+
+        ret = H264E_init_one(enc_curr, &opt_next, 1);
+        if (ret)
+            break;
+    }
+#endif
+    return ret;
+}
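+
+/*
+*   Typical call sequence (sketch only, hypothetical variable names):
+*
+*       int szp = 0, szs = 0;
+*       H264E_sizeof(&create_param, &szp, &szs);
+*       H264E_persist_t *persist = malloc(szp);
+*       H264E_scratch_t *scratch = malloc(szs);
+*       H264E_init(persist, &create_param);
+*/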
+
+static void encode_slice(h264e_enc_t *enc, int frame_type, int long_term_idx_use, int long_term_idx_update, int pps_id, int enc_type)
+{
+    int i, k;
+    encode_slice_header(enc, frame_type, long_term_idx_use, long_term_idx_update, pps_id, enc_type);
+    // encode frame
+    do
+    {   // encode row
+        do
+        {   // encode macroblock
+            if (enc->run_param.desired_nalu_bytes &&
+                h264e_bs_get_pos_bits(enc->bs) > enc->run_param.desired_nalu_bytes*8u)
+            {
+                // start new slice
+                nal_end(enc);
+                encode_slice_header(enc, frame_type, long_term_idx_use, long_term_idx_update, pps_id, enc_type);
+            }
+
+            mb_encode(enc, enc_type);
+
+            enc->dec.yuv[0] += 16;
+            enc->dec.yuv[1] += 8;
+            enc->dec.yuv[2] += 8;
+
+            enc->mb.num++;  // before rc_mb_end
+            if (enc->param.fine_rate_control_flag)
+            {
+                rc_mb_end(enc);
+            }
+        } while (++enc->mb.x < enc->frame.nmbx);
+
+        for (i = 0, k = 16; i < 3; i++, k = 8)
+        {
+            enc->dec.yuv[i] += k*(enc->dec.stride[i] - enc->frame.nmbx);
+        }
+
+        // start new row
+        enc->mb.x = 0;
+        *((uint32_t*)(enc->nnz)) = *((uint32_t*)(enc->nnz + 4)) = 0x01010101 * NNZ_NA; // left edge of NNZ predictor
+        enc->i4x4mode[0] = -1;
+
+    } while (++enc->mb.y < enc->frame.nmby);
+
+    if (enc->mb.skip_run)
+    {
+        UE(enc->mb.skip_run);
+    }
+
+    nal_end(enc);
+    for (i = 0, k = 16; i < 3; i++, k = 8)
+    {
+        enc->dec.yuv[i] -= k*enc->dec.stride[i]*enc->frame.nmby;
+    }
+}
+
+#if H264E_MAX_THREADS
+typedef struct
+{
+    H264E_persist_t *enc;
+    int frame_type, long_term_idx_use, long_term_idx_update, pps_id, enc_type;
+} h264_enc_slice_thread_params_t;
+
+static void encode_slice_thread_simple(void *arg)
+{
+    h264_enc_slice_thread_params_t *h = (h264_enc_slice_thread_params_t*)arg;
+    encode_slice(h->enc, h->frame_type, h->long_term_idx_use, h->long_term_idx_update, h->pps_id, h->enc_type);
+}
+#endif
+
+static int H264E_encode_one(H264E_persist_t *enc, const H264E_run_param_t *opt,
+    int long_term_idx_use, int is_refers_to_long_term, int long_term_idx_update,
+    int frame_type, int pps_id, int enc_type)
+{
+    int i, k;
+    // slice reset
+    enc->slice.type = (long_term_idx_use < 0 ? SLICE_TYPE_I : SLICE_TYPE_P);
+    rc_frame_start(enc, (long_term_idx_use < 0) ? 1 : 0, is_refers_to_long_term);
+
+    enc->mb.x = enc->mb.y = enc->mb.num = 0;
+
+    if (long_term_idx_use > 0)
+    {
+        // Activate long-term reference buffer
+        for (i = 0; i < 3; i++)
+        {
+            SWAP(pix_t*, enc->ref.yuv[i], enc->lt_yuv[long_term_idx_use - 1][i]);
+        }
+    }
+
+    if (enc->param.vbv_size_bytes && !long_term_idx_use && long_term_idx_update <= 0 &&
+        enc->rc.vbv_bits - enc->run_param.desired_frame_bytes*8 > enc->param.vbv_size_bytes*8)
+    {
+        // encode transparent frame on VBV overflow
+        encode_slice_header(enc, frame_type, long_term_idx_use, long_term_idx_update, pps_id, enc_type);
+        enc->mb.skip_run = enc->frame.nmb;
+        UE(enc->mb.skip_run);
+        nal_end(enc);
+        for (i = 0, k = 16; i < 3; i++, k = 8)
+        {
+            pix_copy_pic(enc->dec.yuv[i], enc->dec.stride[i], enc->ref.yuv[i], enc->ref.stride[i], enc->frame.nmbx*k, enc->frame.nmby*k);
+        }
+    } else
+    {
+#if H264E_MAX_THREADS
+        if (enc->param.max_threads > 1)
+        {
+            H264E_persist_t enc_thr[H264E_MAX_THREADS];
+            int sizeof_scratch = enc_alloc_scratch(NULL, &enc->param, (void*)(uintptr_t)1);
+            unsigned char *scratch_base = ((unsigned char*)enc->scratch) + sizeof_scratch;
+            int mby = 0;
+            int ithr;
+            int nmby = enc->frame.nmby;
+            void *savep[3];
+            for (i = 0; i < 3; i++)
+            {
+                savep[i] = enc->dec.yuv[i];
+            }
+
+            for (ithr = 0; ithr < enc->param.max_threads; ithr++)
+            {
+                enc_thr[ithr] = *enc;
+                enc_thr[ithr].mb.y = mby;
+                enc_thr[ithr].mb.num = mby*enc->frame.nmbx;
+                mby += (enc->frame.nmby - mby) / (enc->param.max_threads - ithr);
+                enc_thr[ithr].frame.nmby = mby;
+                enc_thr[ithr].rc.bit_budget /= enc->param.max_threads;
+                enc_thr[ithr].frame.nmb = enc_thr[ithr].frame.nmbx * enc_thr[ithr].frame.nmby;
+
+                for (i = 0, k = 16; i < 3; i++, k = 8)
+                {
+                    enc_thr[ithr].dec.yuv[i] += k*enc->dec.stride[i]*enc_thr[ithr].mb.y;
+                }
+
+                scratch_base += enc_alloc_scratch(enc_thr + ithr, &enc->param, scratch_base);
+                enc_thr[ithr].out_pos = 0;
+                h264e_bs_init_bits(enc_thr[ithr].bs, enc_thr[ithr].out);
+            }
+
+            {
+                h264_enc_slice_thread_params_t thread_par[H264E_MAX_THREADS];
+                void *args[H264E_MAX_THREADS];
+                for (i = 0; i < enc->param.max_threads; i++)
+                {
+                    thread_par[i].enc = enc_thr + i;
+                    thread_par[i].frame_type = frame_type;
+                    thread_par[i].long_term_idx_use = long_term_idx_use;
+                    thread_par[i].long_term_idx_update = long_term_idx_update;
+                    thread_par[i].pps_id = pps_id;
+                    thread_par[i].enc_type = enc_type;
+                    args[i] = thread_par + i;
+                }
+                enc->param.run_func_in_thread(enc->param.token, encode_slice_thread_simple, args, enc->param.max_threads);
+            }
+
+            for (i = 0; i < enc->param.max_threads; i++)
+            {
+                memcpy(enc->out + enc->out_pos, enc_thr[i].out, enc_thr[i].out_pos);
+                enc->out_pos += enc_thr[i].out_pos;
+            }
+            enc->frame.nmby = nmby;
+            for (i = 0; i < 3; i++)
+            {
+                enc->dec.yuv[i] = savep[i];
+            }
+        } else
+#endif
+        {
+            encode_slice(enc, frame_type, long_term_idx_use, long_term_idx_update, pps_id, enc_type);
+        }
+    }
+
+    // Set flags for AMM state machine for standard compliance
+    if (frame_type == H264E_FRAME_TYPE_KEY)
+    {
+        // Reset long-term reference frames
+        memset(enc->lt_used, 0, sizeof(enc->lt_used));
+        // Assume that this frame is not short-term (has effect only if AMM is used)
+        enc->short_term_used = 0;
+    }
+    if (long_term_idx_update > 0)
+    {
+        enc->lt_used[long_term_idx_update - 1] = 1;
+    } else if (long_term_idx_update == 0)
+    {
+        enc->short_term_used = 1;
+    }
+
+    rc_frame_end(enc, long_term_idx_use == -1, enc->mb.skip_run == enc->frame.nmb, is_refers_to_long_term);
+
+    if (long_term_idx_use > 0)
+    {
+        // deactivate long-term reference
+        for (i = 0; i < 3; i++)
+        {
+            SWAP(pix_t*, enc->ref.yuv[i], enc->lt_yuv[long_term_idx_use - 1][i]);
+        }
+    }
+
+    if (long_term_idx_update != -1)
+    {
+        pix_copy_recon_pic_to_ref(enc);
+
+        if (++enc->frame.num >= enc->param.gop && enc->param.gop && (opt->frame_type == H264E_FRAME_TYPE_DEFAULT))
+        {
+            enc->frame.num = 0;     // trigger to encode IDR on next call
+        }
+
+        if (long_term_idx_update > 0)
+        {
+            for (i = 0; i < 3; i++)
+            {
+                SWAP(pix_t*, enc->ref.yuv[i], enc->lt_yuv[long_term_idx_update - 1][i]);
+            }
+        }
+    }
+
+    return H264E_STATUS_SUCCESS;
+}
+
+static int check_parameters_align(const H264E_create_param_t *opt, const H264E_io_yuv_t *in)
+{
+    int i;
+    int min_align = 0;
+#if H264E_ENABLE_NEON || H264E_ENABLE_SSE2
+    min_align = 7;
+#endif
+    if (opt->const_input_flag && opt->temporal_denoise_flag)
+    {
+        min_align = 0;
+    }
+    for (i = 0; i < 3; i++)
+    {
+        if (((uintptr_t)in->yuv[i]) & min_align)
+        {
+            return i ? H264E_STATUS_BAD_CHROMA_ALIGN : H264E_STATUS_BAD_LUMA_ALIGN;
+        }
+        if (in->stride[i] & min_align)
+        {
+            return i ? H264E_STATUS_BAD_CHROMA_STRIDE : H264E_STATUS_BAD_LUMA_STRIDE;
+        }
+    }
+    return H264E_STATUS_SUCCESS;
+}
+
+/**
+*   Top-level encode function
+*   See header file for details.
+*/
+int H264E_encode(H264E_persist_t *enc, H264E_scratch_t *scratch, const H264E_run_param_t *opt,
+    H264E_io_yuv_t *in, unsigned char **coded_data, int *sizeof_coded_data)
+{
+    int i;
+    int frame_type;
+    int long_term_idx_use;
+    int long_term_idx_update;
+    int is_refers_to_long_term;
+    int error;
+
+    error = check_parameters_align(&enc->param, in);
+    if (error)
+    {
+        return error;
+    }
+    (void)i;
+    i = enc_alloc_scratch(enc, &enc->param, (unsigned char*)scratch);
+#if H264E_SVC_API
+    {
+        H264E_persist_t *e = enc->enc_next;
+        while (e)
+        {
+            i += enc_alloc_scratch(e, &enc->param, ((unsigned char*)scratch) + i);
+            e = e->enc_next;
+        }
+    }
+#endif
+
+    enc->inp = *in;
+
+#if H264E_ENABLE_DENOISE
+    // 1. Run optional denoise filter
+    if (enc->param.temporal_denoise_flag && opt->encode_speed < 2)
+    {
+        int sh = 0;
+        for (i = 0; i < 3; i++)
+        {
+            h264e_denoise_run(in->yuv[i], enc->denoise.yuv[i],  enc->param.width >> sh, enc->param.height >> sh, in->stride[i], enc->denoise.stride[i]);
+            enc->inp.yuv[i] = enc->denoise.yuv[i];
+            enc->inp.stride[i] = enc->denoise.stride[i];
+            sh = 1;
+        }
+    }
+#endif
+
+    enc->out_pos = 0;   // reset output bitbuffer position
+
+    if (opt)
+    {
+        enc->run_param = *opt;  // local copy of run-time parameters
+    }
+    opt = &enc->run_param;      // refer to local copy
+
+    // silently clamp out-of-range QP limits to the legal range
+    if (!enc->run_param.qp_max || enc->run_param.qp_max > 51)
+    {
+        enc->run_param.qp_max = 51;
+    }
+    if (!enc->run_param.qp_min || enc->run_param.qp_min < MIN_QP)
+    {
+        enc->run_param.qp_min = MIN_QP;
+    }
+
+    enc->speed.disable_deblock = (opt->encode_speed == 8 || opt->encode_speed == 10);
+
+    if (!enc->param.const_input_flag)
+    {
+        // if the input frame can be re-used as scratch, set the reconstructed frame to the input
+        enc->dec = *in;
+    }
+
+    // Set default frame type
+    frame_type = opt->frame_type;
+    if (frame_type == H264E_FRAME_TYPE_DEFAULT)
+    {
+        frame_type = enc->frame.num ? H264E_FRAME_TYPE_P : H264E_FRAME_TYPE_KEY;
+    }
+    // Estimate long-term indexes from frame type
+    // index 0 means "short-term" reference
+    // index -1 means "not used"
+    switch (frame_type)
+    {
+    default:
+    case H264E_FRAME_TYPE_I:        long_term_idx_use = -1; long_term_idx_update = 0; break;
+    case H264E_FRAME_TYPE_KEY:      long_term_idx_use = -1; long_term_idx_update = enc->param.max_long_term_reference_frames > 0; break;
+    case H264E_FRAME_TYPE_GOLDEN:   long_term_idx_use =  1; long_term_idx_update = 1; break;
+    case H264E_FRAME_TYPE_RECOVERY: long_term_idx_use =  1; long_term_idx_update = 0; break;
+    case H264E_FRAME_TYPE_P:        long_term_idx_use =  enc->most_recent_ref_frame_idx; long_term_idx_update =  0; break;
+    case H264E_FRAME_TYPE_DROPPABLE:long_term_idx_use =  enc->most_recent_ref_frame_idx; long_term_idx_update = -1; break;
+    case H264E_FRAME_TYPE_CUSTOM:   long_term_idx_use =  opt->long_term_idx_use; long_term_idx_update = opt->long_term_idx_update;
+        if (!long_term_idx_use)
+        {
+            long_term_idx_use = enc->most_recent_ref_frame_idx;
+        }
+        if (long_term_idx_use < 0)
+        {
+            // hack: redefine frame type, always encode IDR
+            frame_type = H264E_FRAME_TYPE_KEY;
+        }
+        break;
+    }
+
+#if H264E_RATE_CONTROL_GOLDEN_FRAMES
+    is_refers_to_long_term = (long_term_idx_use != enc->most_recent_ref_frame_idx && long_term_idx_use >= 0);
+#else
+    is_refers_to_long_term = 0;
+#endif
+
+    if (long_term_idx_update >= 0)
+    {
+        enc->most_recent_ref_frame_idx = long_term_idx_update;
+    }
+    if (frame_type == H264E_FRAME_TYPE_KEY)
+    {
+        int pic_init_qp = 30;
+        pic_init_qp = MIN(pic_init_qp, enc->run_param.qp_max);
+        pic_init_qp = MAX(pic_init_qp, enc->run_param.qp_min);
+
+        // temporary: only two layers are supported
+        enc->sps.pic_init_qp = pic_init_qp;
+        enc->next_idr_pic_id ^= 1;
+        enc->frame.num = 0;
+
+#if H264E_SVC_API
+        if (enc->param.num_layers > 1)
+        {
+            H264E_persist_t *enc_base = enc->enc_next;
+            enc_base->sps.pic_init_qp = pic_init_qp;
+            enc_base->next_idr_pic_id ^= 1;
+            enc_base->frame.num = 0;
+
+            enc_base->out = enc->out;
+            enc_base->out_pos = 0;
+            encode_sps(enc_base, 66);
+            encode_pps(enc_base, 0);
+
+            enc->out_pos += enc_base->out_pos;
+            encode_sps(enc, 83);
+            encode_pps(enc, 1);
+        } else
+#endif
+        {
+            encode_sps(enc, 66);
+            encode_pps(enc, 0);
+        }
+    } else
+    {
+        if (!enc->sps.pic_init_qp)
+        {
+            return H264E_STATUS_BAD_FRAME_TYPE;
+        }
+        if (long_term_idx_use > enc->param.max_long_term_reference_frames ||
+            long_term_idx_update > enc->param.max_long_term_reference_frames ||
+            long_term_idx_use > MAX_LONG_TERM_FRAMES)
+        {
+            return H264E_STATUS_BAD_FRAME_TYPE;
+        }
+    }
+
+#if H264E_SVC_API
+    if (enc->param.num_layers > 1)
+    {
+        H264E_persist_t *enc_base = enc->enc_next;
+        int sh = 0;
+
+        enc_base->run_param = enc->run_param;
+        enc_base->run_param.desired_frame_bytes = enc->run_param.desired_frame_bytes >> 2;
+
+        for (i = 0; i < 3; i++)
+        {
+            h264e_frame_downsampling(enc_base->inp.yuv[i], enc_base->inp.stride[i], enc_base->frame.h >> sh,
+                in->yuv[i], in->stride[i], enc->param.height >> sh, enc_base->param.width >> sh,
+                enc_base->param.height >> sh, enc->param.width >> sh, enc->param.height >> sh);
+            sh = 1;
+        }
+
+        enc_base->scratch = enc->scratch;
+        enc_base->out = enc->out + enc->out_pos;
+        enc_base->out_pos = 0;
+
+        H264E_encode_one(enc_base, &enc_base->run_param, long_term_idx_use, is_refers_to_long_term, long_term_idx_update,
+            frame_type, enc->param.sps_id*4 + 0, 0);
+
+        enc->out_pos += enc_base->out_pos;
+
+        if ((frame_type == H264E_FRAME_TYPE_I || frame_type == H264E_FRAME_TYPE_KEY) && enc->param.inter_layer_pred_flag)
+        {
+            for (i = 0, sh = 0; i < 3; i++, sh = 1)
+            {
+                h264e_intra_upsampling(enc_base->frame.w >> sh, enc_base->frame.h >> sh, enc->frame.w >> sh, enc->frame.h >> sh,
+                    sh, enc_base->dec.yuv[i], enc_base->dec.stride[i], enc->ref.yuv[i], enc->ref.stride[i]);
+            }
+        }
+
+        memset(enc->df.df_nzflag, 0, enc->frame.nmbx);
+        H264E_encode_one(enc, opt, long_term_idx_use, is_refers_to_long_term, long_term_idx_update,
+            frame_type, enc->param.sps_id*4 + 1, 20);
+    } else
+#endif // H264E_SVC_API
+    {
+        H264E_encode_one(enc, opt, long_term_idx_use, is_refers_to_long_term, long_term_idx_update,
+            frame_type, enc->param.sps_id*4 + 0, 0);
+    }
+
+    *sizeof_coded_data = enc->out_pos;
+    *coded_data = enc->out;
+    return H264E_STATUS_SUCCESS;
+}
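+
+/*
+*   Per-frame usage sketch (hypothetical variable names):
+*
+*       H264E_io_yuv_t io;
+*       H264E_run_param_t rp = {0};
+*       unsigned char *coded; int ncoded;
+*       io.yuv[0] = y; io.stride[0] = width;
+*       io.yuv[1] = u; io.stride[1] = width/2;
+*       io.yuv[2] = v; io.stride[2] = width/2;
+*       rp.frame_type = H264E_FRAME_TYPE_DEFAULT;
+*       if (H264E_encode(persist, scratch, &rp, &io, &coded, &ncoded) == H264E_STATUS_SUCCESS)
+*           write(fd, coded, ncoded);    // one frame worth of NAL units
+*/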
+
+/**
+*   Return persistent and scratch memory requirements
+*   for given encoding options.
+*   See header file for details.
+*/
+int H264E_sizeof(const H264E_create_param_t *par, int *sizeof_persist, int *sizeof_scratch)
+{
+    int i;
+    int error = H264E_sizeof_one(par, sizeof_persist, sizeof_scratch, 0);
+    (void)i;
+#if H264E_SVC_API
+    for (i = par->num_layers; i > 1; i--)
+    {
+        H264E_create_param_t opt_next = *par;
+        opt_next.const_input_flag = 1;
+        opt_next.temporal_denoise_flag = 0;
+        opt_next.width   = opt_next.width >> 1;
+        opt_next.width  += opt_next.width & 1;
+        opt_next.height  = opt_next.height >> 1;
+        opt_next.height += opt_next.height & 1;
+        *sizeof_persist += enc_alloc(NULL, par, (void*)(uintptr_t)1, 1) + sizeof(h264e_enc_t);
+#if H264E_MAX_THREADS > 1
+        *sizeof_scratch += enc_alloc_scratch(NULL, par, (void*)(uintptr_t)1) * (H264E_MAX_THREADS + 1);
+#else
+        *sizeof_scratch += enc_alloc_scratch(NULL, par, (void*)(uintptr_t)1);
+#endif
+    }
+#endif
+    return error;
+}
+
+/**
+*   Set VBV size and fullness
+*   See header file for details.
+*/
+void H264E_set_vbv_state(
+    H264E_persist_t *enc,
+    int vbv_size_bytes,     //< New VBV size
+    int vbv_fullness_bytes  //< New VBV fullness, -1 = no change
+)
+{
+    if (enc)
+    {
+        enc->param.vbv_size_bytes = vbv_size_bytes;
+        if (vbv_fullness_bytes >= 0)
+        {
+            enc->rc.vbv_bits = vbv_fullness_bytes*8;
+            enc->rc.vbv_target_level = enc->rc.vbv_bits;
+        }
+    }
+}
+#endif
--- /dev/null
+++ b/mkfile
@@ -1,0 +1,17 @@
+</$objtype/mkfile
+
+CFLAGS=$CFLAGS -p -I/sys/include/npe -D__plan9__
+BIN=/$objtype/bin/video
+TARG=hj264
+
+HFILES=\
+	minih264e.h\
+
+UPDATE=$HFILES
+
+OFILES=\
+	hj264.$O\
+
+default:V: all
+
+</sys/src/cmd/mkone