shithub: openh264

Download patch

ref: 4f4adcec4940dd2f96ed4e7d9804f8fd3118e9f8
parent: 59b3b428a25f386def4253f6f049c8fbc6096090
parent: 605f2fb858f0db8c808a547fea97bb315f463b5d
author: dongzha <[email protected]>
date: Thu Jan 29 04:28:57 EST 2015

Merge pull request #1772 from mstorsjo/mc-unify

Move the MC implementation to the common library

--- a/codec/build/iOS/common/common.xcodeproj/project.pbxproj
+++ b/codec/build/iOS/common/common.xcodeproj/project.pbxproj
@@ -19,6 +19,7 @@
 		4CE443D918B722CD0017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE443D818B722CD0017DF25 /* Foundation.framework */; };
 		53C1C9BC193F0FB000404D8F /* expand_pic.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 53C1C9BB193F0FB000404D8F /* expand_pic.cpp */; };
 		5BA8F2C019603F5F00011CE4 /* common_tables.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 5BA8F2BF19603F5F00011CE4 /* common_tables.cpp */; };
+		5BDD15ED1A79027600B6CA2E /* mc.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 5BDD15EC1A79027600B6CA2E /* mc.cpp */; };
 		F0B204F918FD23BF005DA23F /* copy_mb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F0B204F818FD23BF005DA23F /* copy_mb.cpp */; };
 		F556A8241906673900E156A8 /* arm_arch64_common_macro.S in Sources */ = {isa = PBXBuildFile; fileRef = F556A8221906673900E156A8 /* arm_arch64_common_macro.S */; };
 		F556A8251906673900E156A8 /* expand_picture_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = F556A8231906673900E156A8 /* expand_picture_aarch64_neon.S */; };
@@ -54,7 +55,6 @@
 		4C3406BA18D96EA600DFA14A /* deblocking_common.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = deblocking_common.h; sourceTree = "<group>"; };
 		4C3406BD18D96EA600DFA14A /* ls_defines.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ls_defines.h; sourceTree = "<group>"; };
 		4C3406BE18D96EA600DFA14A /* macros.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = macros.h; sourceTree = "<group>"; };
-		4C3406BF18D96EA600DFA14A /* mc_common.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mc_common.h; sourceTree = "<group>"; };
 		4C3406C018D96EA600DFA14A /* measure_time.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = measure_time.h; sourceTree = "<group>"; };
 		4C3406C118D96EA600DFA14A /* typedefs.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = typedefs.h; sourceTree = "<group>"; };
 		4C3406C218D96EA600DFA14A /* WelsThreadLib.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = WelsThreadLib.h; sourceTree = "<group>"; };
@@ -70,6 +70,8 @@
 		53C1C9BB193F0FB000404D8F /* expand_pic.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = expand_pic.cpp; sourceTree = "<group>"; };
 		5BA8F2BE19603F3500011CE4 /* wels_common_defs.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = wels_common_defs.h; sourceTree = "<group>"; };
 		5BA8F2BF19603F5F00011CE4 /* common_tables.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = common_tables.cpp; sourceTree = "<group>"; };
+		5BDD15EB1A79026A00B6CA2E /* mc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mc.h; sourceTree = "<group>"; };
+		5BDD15EC1A79027600B6CA2E /* mc.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mc.cpp; sourceTree = "<group>"; };
 		F0B204F718FD23B6005DA23F /* copy_mb.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = copy_mb.h; sourceTree = "<group>"; };
 		F0B204F818FD23BF005DA23F /* copy_mb.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = copy_mb.cpp; sourceTree = "<group>"; };
 		F556A8221906673900E156A8 /* arm_arch64_common_macro.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = arm_arch64_common_macro.S; path = arm64/arm_arch64_common_macro.S; sourceTree = "<group>"; };
@@ -123,7 +125,7 @@
 				4C3406BA18D96EA600DFA14A /* deblocking_common.h */,
 				4C3406BD18D96EA600DFA14A /* ls_defines.h */,
 				4C3406BE18D96EA600DFA14A /* macros.h */,
-				4C3406BF18D96EA600DFA14A /* mc_common.h */,
+				5BDD15EB1A79026A00B6CA2E /* mc.h */,
 				4C3406C018D96EA600DFA14A /* measure_time.h */,
 				4C3406C118D96EA600DFA14A /* typedefs.h */,
 				5BA8F2BE19603F3500011CE4 /* wels_common_defs.h */,
@@ -143,6 +145,7 @@
 				4C3406C518D96EA600DFA14A /* crt_util_safe_x.cpp */,
 				53C1C9BB193F0FB000404D8F /* expand_pic.cpp */,
 				4C3406C618D96EA600DFA14A /* deblocking_common.cpp */,
+				5BDD15EC1A79027600B6CA2E /* mc.cpp */,
 				4C3406C818D96EA600DFA14A /* WelsThreadLib.cpp */,
 			);
 			path = src;
@@ -253,6 +256,7 @@
 				F5B8D82D190757290037849A /* mc_aarch64_neon.S in Sources */,
 				4C3406C918D96EA600DFA14A /* arm_arch_common_macro.S in Sources */,
 				F556A8241906673900E156A8 /* arm_arch64_common_macro.S in Sources */,
+				5BDD15ED1A79027600B6CA2E /* mc.cpp in Sources */,
 				F5AC94FF193EB7D800F58154 /* deblocking_aarch64_neon.S in Sources */,
 				4C3406CE18D96EA600DFA14A /* crt_util_safe_x.cpp in Sources */,
 				F791965919D3BE2200F60C6B /* intra_pred_common.cpp in Sources */,
--- a/codec/build/iOS/dec/welsdec/welsdec.xcodeproj/project.pbxproj
+++ b/codec/build/iOS/dec/welsdec/welsdec.xcodeproj/project.pbxproj
@@ -20,7 +20,6 @@
 		4CE4469318BC5EAB0017DF25 /* fmo.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4467018BC5EAA0017DF25 /* fmo.cpp */; };
 		4CE4469418BC5EAB0017DF25 /* get_intra_predictor.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4467118BC5EAA0017DF25 /* get_intra_predictor.cpp */; };
 		4CE4469518BC5EAB0017DF25 /* manage_dec_ref.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4467218BC5EAA0017DF25 /* manage_dec_ref.cpp */; };
-		4CE4469618BC5EAB0017DF25 /* mc.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4467318BC5EAA0017DF25 /* mc.cpp */; };
 		4CE4469718BC5EAB0017DF25 /* mem_align.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4467418BC5EAA0017DF25 /* mem_align.cpp */; };
 		4CE4469818BC5EAB0017DF25 /* memmgr_nal_unit.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4467518BC5EAA0017DF25 /* memmgr_nal_unit.cpp */; };
 		4CE4469918BC5EAB0017DF25 /* mv_pred.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4467618BC5EAA0017DF25 /* mv_pred.cpp */; };
@@ -72,7 +71,6 @@
 		4CE4465318BC5EAA0017DF25 /* get_intra_predictor.h */ = {isa = PBXFileReference; fileEncoding = 4; indentWidth = 4; lastKnownFileType = sourcecode.c.h; path = get_intra_predictor.h; sourceTree = "<group>"; tabWidth = 4; usesTabs = 0; };
 		4CE4465418BC5EAA0017DF25 /* manage_dec_ref.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = manage_dec_ref.h; sourceTree = "<group>"; };
 		4CE4465518BC5EAA0017DF25 /* mb_cache.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mb_cache.h; sourceTree = "<group>"; };
-		4CE4465618BC5EAA0017DF25 /* mc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mc.h; sourceTree = "<group>"; };
 		4CE4465718BC5EAA0017DF25 /* mem_align.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mem_align.h; sourceTree = "<group>"; };
 		4CE4465818BC5EAA0017DF25 /* memmgr_nal_unit.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = memmgr_nal_unit.h; sourceTree = "<group>"; };
 		4CE4465918BC5EAA0017DF25 /* mv_pred.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mv_pred.h; sourceTree = "<group>"; };
@@ -98,7 +96,6 @@
 		4CE4467018BC5EAA0017DF25 /* fmo.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = fmo.cpp; sourceTree = "<group>"; };
 		4CE4467118BC5EAA0017DF25 /* get_intra_predictor.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = get_intra_predictor.cpp; sourceTree = "<group>"; };
 		4CE4467218BC5EAA0017DF25 /* manage_dec_ref.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = manage_dec_ref.cpp; sourceTree = "<group>"; };
-		4CE4467318BC5EAA0017DF25 /* mc.cpp */ = {isa = PBXFileReference; fileEncoding = 4; indentWidth = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mc.cpp; sourceTree = "<group>"; tabWidth = 1; usesTabs = 0; wrapsLines = 1; };
 		4CE4467418BC5EAA0017DF25 /* mem_align.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mem_align.cpp; sourceTree = "<group>"; };
 		4CE4467518BC5EAA0017DF25 /* memmgr_nal_unit.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = memmgr_nal_unit.cpp; sourceTree = "<group>"; };
 		4CE4467618BC5EAA0017DF25 /* mv_pred.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mv_pred.cpp; sourceTree = "<group>"; };
@@ -214,7 +211,6 @@
 				4CE4465318BC5EAA0017DF25 /* get_intra_predictor.h */,
 				4CE4465418BC5EAA0017DF25 /* manage_dec_ref.h */,
 				4CE4465518BC5EAA0017DF25 /* mb_cache.h */,
-				4CE4465618BC5EAA0017DF25 /* mc.h */,
 				4CE4465718BC5EAA0017DF25 /* mem_align.h */,
 				4CE4465818BC5EAA0017DF25 /* memmgr_nal_unit.h */,
 				4CE4465918BC5EAA0017DF25 /* mv_pred.h */,
@@ -251,7 +247,6 @@
 				4CE4467018BC5EAA0017DF25 /* fmo.cpp */,
 				4CE4467118BC5EAA0017DF25 /* get_intra_predictor.cpp */,
 				4CE4467218BC5EAA0017DF25 /* manage_dec_ref.cpp */,
-				4CE4467318BC5EAA0017DF25 /* mc.cpp */,
 				4CE4467418BC5EAA0017DF25 /* mem_align.cpp */,
 				4CE4467518BC5EAA0017DF25 /* memmgr_nal_unit.cpp */,
 				4CE4467618BC5EAA0017DF25 /* mv_pred.cpp */,
@@ -369,7 +364,6 @@
 				4CBC1B81194AC4E100214D9E /* intra_pred_aarch64_neon.S in Sources */,
 				4CE4469018BC5EAB0017DF25 /* decoder_core.cpp in Sources */,
 				4CE447AE18BC6BE90017DF25 /* intra_pred_neon.S in Sources */,
-				4CE4469618BC5EAB0017DF25 /* mc.cpp in Sources */,
 				4CE4469C18BC5EAB0017DF25 /* rec_mb.cpp in Sources */,
 				4CE4468B18BC5EAB0017DF25 /* bit_stream.cpp in Sources */,
 				4CE4468D18BC5EAB0017DF25 /* decode_mb_aux.cpp in Sources */,
--- a/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj
+++ b/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj
@@ -24,7 +24,6 @@
 		4CE4471318BC605C0017DF25 /* encoder_data_tables.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE446E218BC605C0017DF25 /* encoder_data_tables.cpp */; };
 		4CE4471418BC605C0017DF25 /* encoder_ext.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE446E318BC605C0017DF25 /* encoder_ext.cpp */; };
 		4CE4471618BC605C0017DF25 /* get_intra_predictor.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE446E518BC605C0017DF25 /* get_intra_predictor.cpp */; };
-		4CE4471718BC605C0017DF25 /* mc.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE446E618BC605C0017DF25 /* mc.cpp */; };
 		4CE4471818BC605C0017DF25 /* md.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE446E718BC605C0017DF25 /* md.cpp */; };
 		4CE4471918BC605C0017DF25 /* memory_align.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE446E818BC605C0017DF25 /* memory_align.cpp */; };
 		4CE4471A18BC605C0017DF25 /* mv_pred.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE446E918BC605C0017DF25 /* mv_pred.cpp */; };
@@ -93,7 +92,6 @@
 		4CE446B518BC605C0017DF25 /* extern.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = extern.h; sourceTree = "<group>"; };
 		4CE446B618BC605C0017DF25 /* get_intra_predictor.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = get_intra_predictor.h; sourceTree = "<group>"; };
 		4CE446B718BC605C0017DF25 /* mb_cache.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mb_cache.h; sourceTree = "<group>"; };
-		4CE446B818BC605C0017DF25 /* mc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mc.h; sourceTree = "<group>"; };
 		4CE446B918BC605C0017DF25 /* md.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = md.h; sourceTree = "<group>"; };
 		4CE446BA18BC605C0017DF25 /* memory_align.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = memory_align.h; sourceTree = "<group>"; };
 		4CE446BB18BC605C0017DF25 /* mt_defs.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mt_defs.h; sourceTree = "<group>"; };
@@ -135,7 +133,6 @@
 		4CE446E218BC605C0017DF25 /* encoder_data_tables.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = encoder_data_tables.cpp; sourceTree = "<group>"; };
 		4CE446E318BC605C0017DF25 /* encoder_ext.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = encoder_ext.cpp; sourceTree = "<group>"; };
 		4CE446E518BC605C0017DF25 /* get_intra_predictor.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = get_intra_predictor.cpp; sourceTree = "<group>"; };
-		4CE446E618BC605C0017DF25 /* mc.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mc.cpp; sourceTree = "<group>"; };
 		4CE446E718BC605C0017DF25 /* md.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = md.cpp; sourceTree = "<group>"; };
 		4CE446E818BC605C0017DF25 /* memory_align.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = memory_align.cpp; sourceTree = "<group>"; };
 		4CE446E918BC605C0017DF25 /* mv_pred.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mv_pred.cpp; sourceTree = "<group>"; };
@@ -276,7 +273,6 @@
 				4CE446B518BC605C0017DF25 /* extern.h */,
 				4CE446B618BC605C0017DF25 /* get_intra_predictor.h */,
 				4CE446B718BC605C0017DF25 /* mb_cache.h */,
-				4CE446B818BC605C0017DF25 /* mc.h */,
 				4CE446B918BC605C0017DF25 /* md.h */,
 				4CE446BA18BC605C0017DF25 /* memory_align.h */,
 				4CE446BB18BC605C0017DF25 /* mt_defs.h */,
@@ -328,7 +324,6 @@
 				4CE446E218BC605C0017DF25 /* encoder_data_tables.cpp */,
 				4CE446E318BC605C0017DF25 /* encoder_ext.cpp */,
 				4CE446E518BC605C0017DF25 /* get_intra_predictor.cpp */,
-				4CE446E618BC605C0017DF25 /* mc.cpp */,
 				4CE446E718BC605C0017DF25 /* md.cpp */,
 				4CE446E818BC605C0017DF25 /* memory_align.cpp */,
 				4CE446E918BC605C0017DF25 /* mv_pred.cpp */,
@@ -455,7 +450,6 @@
 				4CE4470E18BC605C0017DF25 /* au_set.cpp in Sources */,
 				F5BE8005196B913200ED02ED /* memory_aarch64_neon.S in Sources */,
 				4CBC1B83194ACBB400214D9E /* intra_pred_aarch64_neon.S in Sources */,
-				4CE4471718BC605C0017DF25 /* mc.cpp in Sources */,
 				F7E9994519EBD1E9009B1021 /* svc_set_mb_syn_cabac.cpp in Sources */,
 				F5617A50196A833A006E2B20 /* reconstruct_aarch64_neon.S in Sources */,
 				4CE4472918BC605C0017DF25 /* svc_set_mb_syn_cavlc.cpp in Sources */,
--- a/codec/build/win32/dec/WelsDecCore.vcproj
+++ b/codec/build/win32/dec/WelsDecCore.vcproj
@@ -744,14 +744,10 @@
 					>
 				</File>
 				<File
-					RelativePath="..\..\..\decoder\core\inc\mc.h"
+					RelativePath="..\..\..\common\inc\mc.h"
 					>
 				</File>
 				<File
-					RelativePath="..\..\..\common\inc\mc_common.h"
-					>
-				</File>
-				<File
 					RelativePath="..\..\..\common\inc\measure_time.h"
 					>
 				</File>
@@ -909,7 +905,7 @@
 					>
 				</File>
 				<File
-					RelativePath="..\..\..\decoder\core\src\mc.cpp"
+					RelativePath="..\..\..\common\src\mc.cpp"
 					>
 				</File>
 				<File
--- a/codec/build/win32/enc/WelsEncCore.vcproj
+++ b/codec/build/win32/enc/WelsEncCore.vcproj
@@ -386,7 +386,7 @@
 				>
 			</File>
 			<File
-				RelativePath="..\..\..\encoder\core\src\mc.cpp"
+				RelativePath="..\..\..\common\src\mc.cpp"
 				>
 			</File>
 			<File
@@ -563,11 +563,7 @@
 				>
 			</File>
 			<File
-				RelativePath="..\..\..\encoder\core\inc\mc.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\..\common\inc\mc_common.h"
+				RelativePath="..\..\..\common\inc\mc.h"
 				>
 			</File>
 			<File
--- /dev/null
+++ b/codec/common/inc/mc.h
@@ -1,0 +1,300 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef MC_H
+#define MC_H
+
+#include "typedefs.h"
+
+typedef void (*PWelsMcFunc) (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                             int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight);
+
+typedef void (*PWelsLumaHalfpelMcFunc) (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                        int32_t iWidth, int32_t iHeight);
+typedef void (*PWelsSampleAveragingFunc) (uint8_t*, int32_t, const uint8_t*, int32_t, const uint8_t*, int32_t,
+    int32_t, int32_t);
+
+typedef struct TagMcFunc {
+  PWelsLumaHalfpelMcFunc      pfLumaHalfpelHor;
+  PWelsLumaHalfpelMcFunc      pfLumaHalfpelVer;
+  PWelsLumaHalfpelMcFunc      pfLumaHalfpelCen;
+  PWelsMcFunc                 pMcChromaFunc;
+
+  PWelsMcFunc                 pMcLumaFunc;
+  PWelsSampleAveragingFunc    pfSampleAveraging;
+} SMcFunc;
+
+namespace WelsCommon {
+
+void InitMcFunc (SMcFunc* pMcFunc, uint32_t iCpu);
+
+} // namespace WelsCommon
+
+
+#if defined(__cplusplus)
+extern "C" {
+#endif//__cplusplus
+
+#if defined(HAVE_NEON)
+void McCopyWidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+
+void McCopyWidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+
+void McCopyWidthEq16_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+
+void McChromaWidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                            int32_t* pWeights, int32_t iHeight);
+
+void McChromaWidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                            int32_t* pWeights, int32_t iHeight);
+
+void PixelAvgWidthEq16_neon (uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
+void PixelAvgWidthEq8_neon (uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
+void PixelAvgWidthEq4_neon (uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
+
+void McHorVer01WidthEq16_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                               int32_t iHeight);
+void McHorVer01WidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iHeight);
+void McHorVer01WidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iHeight);
+void McHorVer03WidthEq16_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                               int32_t iHeight);
+void McHorVer03WidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iHeight);
+void McHorVer03WidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iHeight);
+
+void McHorVer10WidthEq16_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                               int32_t iHeight);
+void McHorVer10WidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iHeight);
+void McHorVer10WidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iHeight);
+void McHorVer30WidthEq16_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                               int32_t iHeight);
+void McHorVer30WidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iHeight);
+void McHorVer30WidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iHeight);
+
+// Horizontal filter producing the half-sample position, i.e. location (2, 0) in quarter-sample units
+void McHorVer20WidthEq16_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                               int32_t iHeight);
+void McHorVer20WidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iHeight);
+void McHorVer20WidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iHeight);
+
+// Vertical filter producing the half-sample position, i.e. location (0, 2) in quarter-sample units
+void McHorVer02WidthEq16_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                               int32_t iHeight);
+void McHorVer02WidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iHeight);
+void McHorVer02WidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iHeight);
+
+// Horizontal and vertical filters producing the half-sample position, i.e. location (2, 2) in quarter-sample units
+void McHorVer22WidthEq16_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                               int32_t iHeight);
+void McHorVer22WidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iHeight);
+void McHorVer22WidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iHeight);
+
+void PixStrideAvgWidthEq16_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA,
+                                 const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
+void PixStrideAvgWidthEq8_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA,
+                                const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
+
+void McHorVer20Width17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                             int32_t iHeight);// width+1
+void McHorVer20Width9_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                            int32_t iHeight);// width+1
+
+void McHorVer02Height17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iHeight);// height+1
+void McHorVer02Height9_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                             int32_t iHeight);// height+1
+
+void McHorVer22Width17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                             int32_t iHeight);//width+1&&height+1
+void McHorVer22Width9_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                            int32_t iHeight);//width+1&&height+1
+#endif
+
+#if defined(HAVE_NEON_AARCH64)
+void McCopyWidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                  int32_t iHeight);
+void McCopyWidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                  int32_t iHeight);
+void McCopyWidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                   int32_t iHeight);
+void McChromaWidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                    int32_t* pWeights, int32_t iHeight);
+void McChromaWidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                    int32_t* pWeights, int32_t iHeight);
+void PixelAvgWidthEq16_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
+                                     const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
+void PixelAvgWidthEq8_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
+                                    const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
+void PixelAvgWidthEq4_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
+                                    const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
+void McHorVer01WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                       int32_t iHeight);
+void McHorVer01WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iHeight);
+void McHorVer01WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iHeight);
+void McHorVer03WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                       int32_t iHeight);
+void McHorVer03WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iHeight);
+void McHorVer03WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iHeight);
+void McHorVer10WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                       int32_t iHeight);
+void McHorVer10WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iHeight);
+void McHorVer10WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iHeight);
+void McHorVer30WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                       int32_t iHeight);
+void McHorVer30WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iHeight);
+void McHorVer30WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iHeight);
+// Horizontal filter producing the half-sample position, i.e. location (2, 0) in quarter-sample units
+void McHorVer20WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                       int32_t iHeight);
+void McHorVer20WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iHeight);
+void McHorVer20WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iHeight);
+// Vertical filter producing the half-sample position, i.e. location (0, 2) in quarter-sample units
+void McHorVer02WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                       int32_t iHeight);
+void McHorVer02WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iHeight);
+void McHorVer02WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iHeight);
+// Horizontal and vertical filters producing the half-sample position, i.e. location (2, 2) in quarter-sample units
+void McHorVer22WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                       int32_t iHeight);
+void McHorVer22WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iHeight);
+void McHorVer22WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iHeight);
+void PixStrideAvgWidthEq16_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA,
+    const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
+void PixStrideAvgWidthEq8_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA,
+                                        const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
+void McHorVer20Width17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                     int32_t iHeight);// width+1
+void McHorVer20Width9_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                    int32_t iHeight);// width+1
+void McHorVer02Height17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iHeight);// height+1
+void McHorVer02Height9_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                     int32_t iHeight);// height+1
+void McHorVer22Width17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                     int32_t iHeight);//width+1&&height+1
+void McHorVer22Width9_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                    int32_t iHeight);//width+1&&height+1
+#endif
+
+#if defined(X86_ASM)
+//***************************************************************************//
+//                       MMXEXT declarations                                 //
+//***************************************************************************//
+void McHorVer20WidthEq4_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                             int32_t iHeight);
+void McChromaWidthEq4_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                           const uint8_t* kpABCD, int32_t iHeight); // kpABCD: chroma interpolation weights, presumably the four A/B/C/D bytes -- confirm against the caller
+void McCopyWidthEq4_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                         int32_t iHeight);
+void McCopyWidthEq8_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                         int32_t iHeight);
+void PixelAvgWidthEq4_mmx (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
+                           const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight); // destination first, then two sources with independent strides
+void PixelAvgWidthEq8_mmx (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
+                           const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight); // destination first, then two sources with independent strides
+
+//***************************************************************************//
+//                       SSE2 declarations                                   //
+//***************************************************************************//
+void McChromaWidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                            const uint8_t* kpABCD, int32_t iHeight); // kpABCD: chroma interpolation weights, presumably the four A/B/C/D bytes -- confirm against the caller
+void McCopyWidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                           int32_t iHeight);
+void McHorVer20WidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iHeight);
+void McHorVer20WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                               int32_t iHeight);
+void McHorVer02WidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iHeight);
+void McHorVer22Width8HorFirst_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                    int32_t iHeight);
+void McHorVer22Width8VerLastAlign_sse2 (const uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride,
+                                        int32_t iWidth, int32_t iHeight); // reads an intermediate tap buffer (pTap); presumably the aligned-access variant -- confirm
+void McHorVer22Width8VerLastUnAlign_sse2 (const uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride,
+    int32_t iWidth, int32_t iHeight); // reads an intermediate tap buffer (pTap); presumably the unaligned-access variant -- confirm
+
+void PixelAvgWidthEq16_sse2 (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
+                             const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight); // destination first, then two sources with independent strides
+
+void McHorVer20Width9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                int32_t iWidth,
+                                int32_t iHeight); // width is passed explicitly; 9 or 17 per the function name
+
+void McHorVer02Height9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iWidth,
+                                 int32_t iHeight); // height is passed explicitly; 9 or 17 per the function name
+
+void McHorVer22HorFirst_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pTap, int32_t iTapStride,
+                              int32_t iWidth,
+                              int32_t iHeight); // writes its result into the tap buffer pTap rather than into a picture destination
+
+//***************************************************************************//
+//                       SSSE3 declarations                                  //
+//***************************************************************************//
+
+void McChromaWidthEq8_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                             const uint8_t* kpABCD, int32_t iHeight); // kpABCD: chroma interpolation weights, presumably the four A/B/C/D bytes -- confirm against the caller
+
+#endif //X86_ASM
+
+#if defined(__cplusplus)
+}
+#endif//__cplusplus
+
+#endif//MC_H
--- a/codec/common/inc/mc_common.h
+++ /dev/null
@@ -1,275 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#ifndef MC_COMMON_H
-#define MC_COMMON_H
-
-#include "typedefs.h"
-
-#if defined(__cplusplus)
-extern "C" {
-#endif//__cplusplus
-
-#if defined(HAVE_NEON)
-void McCopyWidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-
-void McCopyWidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-
-void McCopyWidthEq16_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-
-void McChromaWidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                            int32_t* pWeights, int32_t iHeight);
-
-void McChromaWidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                            int32_t* pWeights, int32_t iHeight);
-
-void PixelAvgWidthEq16_neon (uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
-void PixelAvgWidthEq8_neon (uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
-void PixelAvgWidthEq4_neon (uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
-
-void McHorVer01WidthEq16_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                               int32_t iHeight);
-void McHorVer01WidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                              int32_t iHeight);
-void McHorVer01WidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                              int32_t iHeight);
-void McHorVer03WidthEq16_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                               int32_t iHeight);
-void McHorVer03WidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                              int32_t iHeight);
-void McHorVer03WidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                              int32_t iHeight);
-
-void McHorVer10WidthEq16_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                               int32_t iHeight);
-void McHorVer10WidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                              int32_t iHeight);
-void McHorVer10WidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                              int32_t iHeight);
-void McHorVer30WidthEq16_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                               int32_t iHeight);
-void McHorVer30WidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                              int32_t iHeight);
-void McHorVer30WidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                              int32_t iHeight);
-
-//horizontal filter to gain half sample, that is (2, 0) location in quarter sample
-void McHorVer20WidthEq16_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                               int32_t iHeight);
-void McHorVer20WidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                              int32_t iHeight);
-void McHorVer20WidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                              int32_t iHeight);
-
-//vertical filter to gain half sample, that is (0, 2) location in quarter sample
-void McHorVer02WidthEq16_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                               int32_t iHeight);
-void McHorVer02WidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                              int32_t iHeight);
-void McHorVer02WidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                              int32_t iHeight);
-
-//horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample
-void McHorVer22WidthEq16_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                               int32_t iHeight);
-void McHorVer22WidthEq8_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                              int32_t iHeight);
-void McHorVer22WidthEq4_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                              int32_t iHeight);
-
-void PixStrideAvgWidthEq16_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA,
-                                 const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
-void PixStrideAvgWidthEq8_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA,
-                                const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
-
-void McHorVer20Width17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                             int32_t iHeight);// width+1
-void McHorVer20Width9_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                            int32_t iHeight);// width+1
-
-void McHorVer02Height17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                              int32_t iHeight);// height+1
-void McHorVer02Height9_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                             int32_t iHeight);// height+1
-
-void McHorVer22Width17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                             int32_t iHeight);//width+1&&height+1
-void McHorVer22Width9_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                            int32_t iHeight);//width+1&&height+1
-#endif
-
-#if defined(HAVE_NEON_AARCH64)
-void McCopyWidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                  int32_t iHeight);
-void McCopyWidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                  int32_t iHeight);
-void McCopyWidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                   int32_t iHeight);
-void McChromaWidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                    int32_t* pWeights, int32_t iHeight);
-void McChromaWidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                    int32_t* pWeights, int32_t iHeight);
-void PixelAvgWidthEq16_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
-                                     const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
-void PixelAvgWidthEq8_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
-                                    const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
-void PixelAvgWidthEq4_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
-                                    const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
-void McHorVer01WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                       int32_t iHeight);
-void McHorVer01WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iHeight);
-void McHorVer01WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iHeight);
-void McHorVer03WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                       int32_t iHeight);
-void McHorVer03WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iHeight);
-void McHorVer03WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iHeight);
-void McHorVer10WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                       int32_t iHeight);
-void McHorVer10WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iHeight);
-void McHorVer10WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iHeight);
-void McHorVer30WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                       int32_t iHeight);
-void McHorVer30WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iHeight);
-void McHorVer30WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iHeight);
-//horizontal filter to gain half sample, that is (2, 0) location in quarter sample
-void McHorVer20WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                       int32_t iHeight);
-void McHorVer20WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iHeight);
-void McHorVer20WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iHeight);
-//vertical filter to gain half sample, that is (0, 2) location in quarter sample
-void McHorVer02WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                       int32_t iHeight);
-void McHorVer02WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iHeight);
-void McHorVer02WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iHeight);
-//horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample
-void McHorVer22WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                       int32_t iHeight);
-void McHorVer22WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iHeight);
-void McHorVer22WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iHeight);
-void PixStrideAvgWidthEq16_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA,
-    const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
-void PixStrideAvgWidthEq8_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA,
-                                        const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
-void McHorVer20Width17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                     int32_t iHeight);// width+1
-void McHorVer20Width9_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                    int32_t iHeight);// width+1
-void McHorVer02Height17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iHeight);// height+1
-void McHorVer02Height9_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                     int32_t iHeight);// height+1
-void McHorVer22Width17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                     int32_t iHeight);//width+1&&height+1
-void McHorVer22Width9_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                    int32_t iHeight);//width+1&&height+1
-#endif
-
-#if defined(X86_ASM)
-//***************************************************************************//
-//                       MMXEXT definition                                   //
-//***************************************************************************//
-void McHorVer20WidthEq4_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                             int32_t iHeight);
-void McChromaWidthEq4_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                           const uint8_t* kpABCD, int32_t iHeight);
-void McCopyWidthEq4_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                         int32_t iHeight);
-void McCopyWidthEq8_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                         int32_t iHeight);
-void PixelAvgWidthEq4_mmx (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
-                           const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
-void PixelAvgWidthEq8_mmx (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
-                           const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
-
-//***************************************************************************//
-//                       SSE2 definition                                     //
-//***************************************************************************//
-void McChromaWidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                            const uint8_t* kpABCD, int32_t iHeight);
-void McCopyWidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                           int32_t iHeight);
-void McHorVer20WidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                              int32_t iHeight);
-void McHorVer20WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                               int32_t iHeight);
-void McHorVer02WidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                              int32_t iHeight);
-void McHorVer22Width8HorFirst_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                    int32_t iHeight);
-void McHorVer22Width8VerLastAlign_sse2 (const uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride,
-                                        int32_t iWidth, int32_t iHeight);
-void McHorVer22Width8VerLastUnAlign_sse2 (const uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iWidth, int32_t iHeight);
-
-void PixelAvgWidthEq16_sse2 (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
-                             const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
-
-void McHorVer20Width9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                int32_t iWidth,
-                                int32_t iHeight);
-
-void McHorVer02Height9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                 int32_t iWidth,
-                                 int32_t iHeight);
-
-void McHorVer22HorFirst_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pTap, int32_t iTapStride,
-                              int32_t iWidth,
-                              int32_t iHeight);
-
-//***************************************************************************//
-//                       SSSE3 definition                                    //
-//***************************************************************************//
-
-void McChromaWidthEq8_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                             const uint8_t* kpABCD, int32_t iHeight);
-
-#endif //X86_ASM
-
-#if defined(__cplusplus)
-}
-#endif//__cplusplus
-
-#endif//MC_COMMON_H
--- /dev/null
+++ b/codec/common/src/mc.cpp
@@ -1,0 +1,1328 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	mc.cpp
+ *
+ * \brief	Interfaces implementation for motion compensation
+ *
+ * \date	03/17/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#include "mc.h"
+
+#include "cpu_core.h"
+#include "ls_defines.h"
+#include "macros.h"
+
+typedef void (*PMcChromaWidthExtFunc) (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                       const uint8_t* kpABCD, int32_t iHeight); // pointer to a chroma MC routine that takes a weight pointer kpABCD
+typedef void (*PWelsSampleWidthAveragingFunc) (uint8_t*, int32_t, const uint8_t*, int32_t, const uint8_t*,
+    int32_t, int32_t); // same parameter types as PixelAvg_c minus iWidth: dst, dst stride, src A, stride A, src B, stride B, height
+typedef void (*PWelsMcWidthHeightFunc) (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                        int32_t iWidth, int32_t iHeight); // pointer to an MC routine that receives both width and height
+
+namespace WelsCommon {
+
+/*------------------weights for chroma fractional-pixel interpolation------------------*/
+//iA = (8 - dx) * (8 - dy);
+//iB = dx * (8 - dy);
+//iC = (8 - dx) * dy;
+//iD = dx * dy;
+static const uint8_t g_kuiABCD[8][8][4] = {	// indexed as [dy][dx]; each entry is {iA, iB, iC, iD}, and the four weights always sum to 64
+  {
+    {64, 0, 0, 0}, {56, 8, 0, 0}, {48, 16, 0, 0}, {40, 24, 0, 0},
+    {32, 32, 0, 0}, {24, 40, 0, 0}, {16, 48, 0, 0}, {8, 56, 0, 0}
+  },
+  {
+    {56, 0, 8, 0}, {49, 7, 7, 1}, {42, 14, 6, 2}, {35, 21, 5, 3},
+    {28, 28, 4, 4}, {21, 35, 3, 5}, {14, 42, 2, 6}, {7, 49, 1, 7}
+  },
+  {
+    {48, 0, 16, 0}, {42, 6, 14, 2}, {36, 12, 12, 4}, {30, 18, 10, 6},
+    {24, 24, 8, 8}, {18, 30, 6, 10}, {12, 36, 4, 12}, {6, 42, 2, 14}
+  },
+  {
+    {40, 0, 24, 0}, {35, 5, 21, 3}, {30, 10, 18, 6}, {25, 15, 15, 9},
+    {20, 20, 12, 12}, {15, 25, 9, 15}, {10, 30, 6, 18}, {5, 35, 3, 21}
+  },
+  {
+    {32, 0, 32, 0}, {28, 4, 28, 4}, {24, 8, 24, 8}, {20, 12, 20, 12},
+    {16, 16, 16, 16}, {12, 20, 12, 20}, {8, 24, 8, 24}, {4, 28, 4, 28}
+  },
+  {
+    {24, 0, 40, 0}, {21, 3, 35, 5}, {18, 6, 30, 10}, {15, 9, 25, 15},
+    {12, 12, 20, 20}, {9, 15, 15, 25}, {6, 18, 10, 30}, {3, 21, 5, 35}
+  },
+  {
+    {16, 0, 48, 0}, {14, 2, 42, 6}, {12, 4, 36, 12}, {10, 6, 30, 18},
+    {8, 8, 24, 24}, {6, 10, 18, 30}, {4, 12, 12, 36}, {2, 14, 6, 42}
+  },
+  {
+    {8, 0, 56, 0}, {7, 1, 49, 7}, {6, 2, 42, 14}, {5, 3, 35, 21},
+    {4, 4, 28, 28}, {3, 5, 21, 35}, {2, 6, 14, 42}, {1, 7, 7, 49}
+  }
+};
+
+//***************************************************************************//
+//                          C code implementation                            //
+//***************************************************************************//
+static inline void McCopyWidthEq2_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                     int32_t iHeight) { // C copy of a 2-pixel-wide block: one LD16/ST16A2 transfer per row, iHeight rows
+  int32_t i; // row counter
+  for (i = 0; i < iHeight; i++) { // iWidth == 2 only for chroma
+    ST16A2 (pDst, LD16 (pSrc)); // presumably a 16-bit load and a 2-byte-aligned 16-bit store (macros defined outside this file) -- confirm
+    pDst += iDstStride; // advance to the next destination row
+    pSrc += iSrcStride; // advance to the next source row
+  }
+}
+
+static inline void McCopyWidthEq4_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                     int32_t iHeight) { // C copy of a 4-pixel-wide block: one LD32/ST32A4 transfer per row, iHeight rows
+  int32_t i; // row counter
+  for (i = 0; i < iHeight; i++) {
+    ST32A4 (pDst, LD32 (pSrc)); // presumably a 32-bit load and a 4-byte-aligned 32-bit store (macros defined outside this file) -- confirm
+    pDst += iDstStride; // advance to the next destination row
+    pSrc += iSrcStride; // advance to the next source row
+  }
+}
+
+static inline void McCopyWidthEq8_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                     int32_t iHeight) { // C copy of an 8-pixel-wide block: one LD64/ST64A8 transfer per row, iHeight rows
+  int32_t i; // row counter
+  for (i = 0; i < iHeight; i++) {
+    ST64A8 (pDst, LD64 (pSrc)); // presumably a 64-bit load and an 8-byte-aligned 64-bit store (macros defined outside this file) -- confirm
+    pDst += iDstStride; // advance to the next destination row
+    pSrc += iSrcStride; // advance to the next source row
+  }
+}
+
+static inline void McCopyWidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                      int32_t iHeight) { // C copy of a 16-pixel-wide block: two LD64/ST64A8 transfers per row, iHeight rows
+  int32_t i; // row counter
+  for (i = 0; i < iHeight; i++) {
+    ST64A8 (pDst  , LD64 (pSrc)); // bytes 0..7 of the row
+    ST64A8 (pDst + 8, LD64 (pSrc + 8)); // bytes 8..15 of the row
+    pDst += iDstStride; // advance to the next destination row
+    pSrc += iSrcStride; // advance to the next source row
+  }
+}
+
+//--------------------Luma sample MC------------------//
+
+static inline int32_t HorFilterInput16bit_c (const int16_t* pSrc) { // applies the 6-tap kernel (1, -5, 20, 20, -5, 1) to pSrc[0..5]; no rounding or shift is done here
+  int32_t iPix05 = pSrc[0] + pSrc[5]; // outer pair, weight 1
+  int32_t iPix14 = pSrc[1] + pSrc[4]; // middle pair, weight -5
+  int32_t iPix23 = pSrc[2] + pSrc[3]; // inner pair, weight 20
+
+  return (iPix05 - (iPix14 * 5) + (iPix23 * 20));
+}
+// 6-tap kernel (1, -5, 20, 20, -5, 1) over 8-bit samples spaced kiOffset apart: horizontal when kiOffset == 1, vertical when kiOffset == iSrcStride
+static inline int32_t FilterInput8bitWithStride_c (const uint8_t* pSrc, const int32_t kiOffset) {
+  const int32_t kiOffset1 = kiOffset; // distance to the neighbouring sample
+  const int32_t kiOffset2 = (kiOffset << 1); // two samples away
+  const int32_t kiOffset3 = kiOffset + kiOffset2; // three samples away
+  const uint32_t kuiPix05   = * (pSrc - kiOffset2) + * (pSrc + kiOffset3); // outer pair (positions -2 and +3), weight 1
+  const uint32_t kuiPix14   = * (pSrc - kiOffset1) + * (pSrc + kiOffset2); // middle pair (positions -1 and +2), weight -5
+  const uint32_t kuiPix23   = * (pSrc) + * (pSrc + kiOffset1); // inner pair (positions 0 and +1), weight 20
+
+  return (kuiPix05 - ((kuiPix14 << 2) + kuiPix14) + (kuiPix23 << 4) + (kuiPix23 << 2)); // 5x = 4x + x, 20x = 16x + 4x; computed in uint32_t, so a negative sum wraps before the conversion to the int32_t return type
+}
+
+static inline void PixelAvg_c (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
+                               const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) { // writes the rounded-up average of two iWidth x iHeight blocks into pDst
+  int32_t i, j; // i: row, j: column
+  for (i = 0; i < iHeight; i++) {
+    for (j = 0; j < iWidth; j++) {
+      pDst[j] = (pSrcA[j] + pSrcB[j] + 1) >> 1; // the +1 makes the halving round up on ties
+    }
+    pDst  += iDstStride; // each of the three buffers advances by its own stride
+    pSrcA += iSrcAStride;
+    pSrcB += iSrcBStride;
+  }
+}
+static inline void McCopy_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                             int32_t iHeight) { // dispatches a block copy to the fixed-width routine that matches iWidth
+  if (iWidth == 16)
+    McCopyWidthEq16_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 8)
+    McCopyWidthEq8_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 4)
+    McCopyWidthEq4_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else // iWidth is expected to be 2 here; any other unmatched width is also handled by the 2-wide copy
+    McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+}
+
+// horizontal 6-tap filtering that produces the half sample at position (2, 0) in quarter-sample units
+static inline void McHorVer20_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iWidth,
+                                 int32_t iHeight) {
+  int32_t i, j; // i: row, j: column
+  for (i = 0; i < iHeight; i++) {
+    for (j = 0; j < iWidth; j++) {
+      pDst[j] = WelsClip1 ((FilterInput8bitWithStride_c (pSrc + j, 1) + 16) >> 5); // offset 1 = horizontal taps (reads pSrc[j-2 .. j+3]); +16 then >>5 rounds the sum/32; WelsClip1 presumably clamps to the 8-bit range -- confirm
+    }
+    pDst += iDstStride; // next destination row
+    pSrc += iSrcStride; // next source row
+  }
+}
+
+// vertical 6-tap filtering that produces the half sample at position (0, 2) in quarter-sample units
+static inline void McHorVer02_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iWidth,
+                                 int32_t iHeight) {
+  int32_t i, j; // i: row, j: column
+  for (i = 0; i < iHeight; i++) {
+    for (j = 0; j < iWidth; j++) {
+      pDst[j] = WelsClip1 ((FilterInput8bitWithStride_c (pSrc + j, iSrcStride) + 16) >> 5); // offset iSrcStride = vertical taps (reads rows -2 .. +3 of column j); +16 then >>5 rounds the sum/32; WelsClip1 presumably clamps to the 8-bit range -- confirm
+    }
+    pDst += iDstStride; // next destination row
+    pSrc += iSrcStride; // next source row
+  }
+}
+
+// vertical then horizontal 6-tap filtering that produces the half sample at position (2, 2) in quarter-sample units
+static inline void McHorVer22_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iWidth,
+                                 int32_t iHeight) {
+  int16_t iTmp[17 + 5]; // one row of unrounded vertical filter sums; holds iWidth + 5 entries, so iWidth must not exceed 17
+  int32_t i, j, k; // i: row, j: intermediate column, k: output column
+
+  for (i = 0; i < iHeight; i++) {
+    for (j = 0; j < iWidth + 5; j++) {
+      iTmp[j] = FilterInput8bitWithStride_c (pSrc - 2 + j, iSrcStride); // vertical pass over columns -2 .. iWidth+2, the span the horizontal taps need
+    }
+    for (k = 0; k < iWidth; k++) {
+      pDst[k] = WelsClip1 ((HorFilterInput16bit_c (&iTmp[k]) + 512) >> 10); // horizontal pass over iTmp[k..k+5]; +512 then >>10 rounds the two-pass sum/1024; WelsClip1 presumably clamps to the 8-bit range -- confirm
+    }
+    pSrc += iSrcStride; // next source row
+    pDst += iDstStride; // next destination row
+  }
+}
+
+/////////////////////luma MC//////////////////////////
+//quarter-sample position (0, 1): average of the source block and its vertical half-sample block
+static inline void McHorVer01_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iWidth, int32_t iHeight) {
+  uint8_t uiVerHalf[16 * 16]; //scratch block, written with 16 bytes per row
+  McHorVer02_c (pSrc, iSrcStride, uiVerHalf, 16, iWidth, iHeight);
+  PixelAvg_c (pDst, iDstStride, pSrc, iSrcStride, uiVerHalf, 16, iWidth, iHeight);
+}
+//quarter-sample position (0, 3): average of the source block one row down and the vertical half-sample block
+static inline void McHorVer03_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iWidth, int32_t iHeight) {
+  uint8_t uiVerHalf[16 * 16]; //scratch block, written with 16 bytes per row
+  McHorVer02_c (pSrc, iSrcStride, uiVerHalf, 16, iWidth, iHeight);
+  PixelAvg_c (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, uiVerHalf, 16, iWidth, iHeight);
+}
+//quarter-sample position (1, 0): average of the source block and its horizontal half-sample block
+static inline void McHorVer10_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iWidth, int32_t iHeight) {
+  uint8_t uiHorHalf[16 * 16]; //scratch block, written with 16 bytes per row
+  McHorVer20_c (pSrc, iSrcStride, uiHorHalf, 16, iWidth, iHeight);
+  PixelAvg_c (pDst, iDstStride, pSrc, iSrcStride, uiHorHalf, 16, iWidth, iHeight);
+}
+//quarter-sample position (1, 1): average of the horizontal and the vertical half-sample blocks
+static inline void McHorVer11_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iWidth, int32_t iHeight) {
+  uint8_t uiHorHalf[16 * 16]; //both scratch blocks are written with 16 bytes per row
+  uint8_t uiVerHalf[16 * 16];
+  McHorVer20_c (pSrc, iSrcStride, uiHorHalf, 16, iWidth, iHeight);
+  McHorVer02_c (pSrc, iSrcStride, uiVerHalf, 16, iWidth, iHeight);
+  PixelAvg_c (pDst, iDstStride, uiHorHalf, 16, uiVerHalf, 16, iWidth, iHeight);
+}
+//quarter-sample position (1, 2): average of the vertical half-sample block and the centre (2, 2) block
+static inline void McHorVer12_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iWidth, int32_t iHeight) {
+  uint8_t uiVerHalf[16 * 16]; //both scratch blocks are written with 16 bytes per row
+  uint8_t uiCentre[16 * 16];
+  McHorVer02_c (pSrc, iSrcStride, uiVerHalf, 16, iWidth, iHeight);
+  McHorVer22_c (pSrc, iSrcStride, uiCentre, 16, iWidth, iHeight);
+  PixelAvg_c (pDst, iDstStride, uiVerHalf, 16, uiCentre, 16, iWidth, iHeight);
+}
+//quarter-sample position (1, 3): horizontal half-sample block taken one row down, averaged with the vertical one
+static inline void McHorVer13_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iWidth, int32_t iHeight) {
+  uint8_t uiHorHalf[16 * 16]; //both scratch blocks are written with 16 bytes per row
+  uint8_t uiVerHalf[16 * 16];
+  McHorVer20_c (pSrc + iSrcStride, iSrcStride, uiHorHalf, 16, iWidth, iHeight);
+  McHorVer02_c (pSrc, iSrcStride, uiVerHalf, 16, iWidth, iHeight);
+  PixelAvg_c (pDst, iDstStride, uiHorHalf, 16, uiVerHalf, 16, iWidth, iHeight);
+}
+//quarter-sample position (2, 1): average of the horizontal half-sample block and the centre (2, 2) block
+static inline void McHorVer21_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iWidth, int32_t iHeight) {
+  uint8_t uiHorHalf[16 * 16]; //both scratch blocks are written with 16 bytes per row
+  uint8_t uiCentre[16 * 16];
+  McHorVer20_c (pSrc, iSrcStride, uiHorHalf, 16, iWidth, iHeight);
+  McHorVer22_c (pSrc, iSrcStride, uiCentre, 16, iWidth, iHeight);
+  PixelAvg_c (pDst, iDstStride, uiHorHalf, 16, uiCentre, 16, iWidth, iHeight);
+}
+//quarter-sample position (2, 3): horizontal half-sample block taken one row down, averaged with the centre block
+static inline void McHorVer23_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iWidth, int32_t iHeight) {
+  uint8_t uiHorHalf[16 * 16]; //both scratch blocks are written with 16 bytes per row
+  uint8_t uiCentre[16 * 16];
+  McHorVer20_c (pSrc + iSrcStride, iSrcStride, uiHorHalf, 16, iWidth, iHeight);
+  McHorVer22_c (pSrc, iSrcStride, uiCentre, 16, iWidth, iHeight);
+  PixelAvg_c (pDst, iDstStride, uiHorHalf, 16, uiCentre, 16, iWidth, iHeight);
+}
+//quarter-sample position (3, 0): average of the source block one column right and the horizontal half-sample block
+static inline void McHorVer30_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iWidth, int32_t iHeight) {
+  uint8_t uiHorHalf[16 * 16]; //scratch block, written with 16 bytes per row
+  McHorVer20_c (pSrc, iSrcStride, uiHorHalf, 16, iWidth, iHeight);
+  PixelAvg_c (pDst, iDstStride, pSrc + 1, iSrcStride, uiHorHalf, 16, iWidth, iHeight);
+}
+//quarter-sample position (3, 1): horizontal half-sample block averaged with the vertical one taken one column right
+static inline void McHorVer31_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iWidth, int32_t iHeight) {
+  uint8_t uiHorHalf[16 * 16]; //both scratch blocks are written with 16 bytes per row
+  uint8_t uiVerHalf[16 * 16];
+  McHorVer20_c (pSrc, iSrcStride, uiHorHalf, 16, iWidth, iHeight);
+  McHorVer02_c (pSrc + 1, iSrcStride, uiVerHalf, 16, iWidth, iHeight);
+  PixelAvg_c (pDst, iDstStride, uiHorHalf, 16, uiVerHalf, 16, iWidth, iHeight);
+}
+//quarter-sample position (3, 2): vertical half-sample block taken one column right, averaged with the centre block
+static inline void McHorVer32_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iWidth, int32_t iHeight) {
+  uint8_t uiVerHalf[16 * 16]; //both scratch blocks are written with 16 bytes per row
+  uint8_t uiCentre[16 * 16];
+  McHorVer02_c (pSrc + 1, iSrcStride, uiVerHalf, 16, iWidth, iHeight);
+  McHorVer22_c (pSrc, iSrcStride, uiCentre, 16, iWidth, iHeight);
+  PixelAvg_c (pDst, iDstStride, uiVerHalf, 16, uiCentre, 16, iWidth, iHeight);
+}
+//quarter-sample position (3, 3): horizontal half sample one row down averaged with vertical half sample one column right
+static inline void McHorVer33_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iWidth, int32_t iHeight) {
+  uint8_t uiHorHalf[16 * 16]; //both scratch blocks are written with 16 bytes per row
+  uint8_t uiVerHalf[16 * 16];
+  McHorVer20_c (pSrc + iSrcStride, iSrcStride, uiHorHalf, 16, iWidth, iHeight);
+  McHorVer02_c (pSrc + 1, iSrcStride, uiVerHalf, 16, iWidth, iHeight);
+  PixelAvg_c (pDst, iDstStride, uiHorHalf, 16, uiVerHalf, 16, iWidth, iHeight);
+}
+
+//luma MC, C version: pSrc already includes the motion-vector offset; the low two bits of iMvX / iMvY pick the routine
+void McLuma_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+               int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
+  static const PWelsMcWidthHeightFunc kpLumaFuncs[4][4] = { //indexed as [x fraction][y fraction]
+    {McCopy_c,      McHorVer01_c, McHorVer02_c, McHorVer03_c},
+    {McHorVer10_c,  McHorVer11_c, McHorVer12_c, McHorVer13_c},
+    {McHorVer20_c,  McHorVer21_c, McHorVer22_c, McHorVer23_c},
+    {McHorVer30_c,  McHorVer31_c, McHorVer32_c, McHorVer33_c},
+  };
+  const int32_t kiFracX = iMvX & 0x03;
+  const int32_t kiFracY = iMvY & 0x03;
+  kpLumaFuncs[kiFracX][kiFracY] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+}
+
+//chroma MC for a fractional motion vector: every output pixel is a weighted sum of its 2x2 source
+//neighbourhood, using the four weights at g_kuiABCD[iMvY & 7][iMvX & 7], rounded with +32 and >> 6
+static inline void McChromaWithFragMv_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+    int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
+  const uint8_t* pWeights = g_kuiABCD[iMvY & 0x07][iMvX & 0x07];
+  const int32_t kiTopLeft = pWeights[0], kiTopRight = pWeights[1];
+  const int32_t kiBotLeft = pWeights[2], kiBotRight = pWeights[3];
+  const uint8_t* pTop = pSrc;              //current source row
+  const uint8_t* pBot = pSrc + iSrcStride; //source row below it
+  for (int32_t iRow = 0; iRow < iHeight; ++iRow) {
+    for (int32_t iCol = 0; iCol < iWidth; ++iCol) {
+      const int32_t kiSum = kiTopLeft * pTop[iCol] + kiTopRight * pTop[iCol + 1] + kiBotLeft * pBot[iCol] + kiBotRight * pBot[iCol + 1];
+      pDst[iCol] = (kiSum + 32) >> 6;
+    }
+    pDst += iDstStride;
+    pTop  = pBot;
+    pBot += iSrcStride;
+  }
+}
+
+//chroma MC, C version; pSrc already includes the motion-vector offset.
+//the low three bits of iMvX / iMvY are tested: when all of them are zero the block is simply copied
+void McChroma_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                 int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
+  const bool kbHasFraction = ((iMvX | iMvY) & 0x07) != 0;
+  if (kbHasFraction) {
+    McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
+    return;
+  }
+  McCopy_c (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+}
+
+#if defined(X86_ASM)
+//***************************************************************************//
+//                       SSE2 implementation                      //
+//***************************************************************************//
+static inline void McHorVer22WidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+    int32_t iHeight) { //(2, 2) position for an 8-pixel-wide block: two assembly passes over 16-bit intermediates
+  ENFORCE_STACK_ALIGN_2D (int16_t, iHorTaps, 21, 8, 16)
+  McHorVer22Width8HorFirst_sse2 (pSrc - 2, iSrcStride, (uint8_t*)iHorTaps, 16, iHeight + 5); //iHeight + 5 rows, 16 bytes each
+  McHorVer22Width8VerLastAlign_sse2 ((uint8_t*)iHorTaps, 16, pDst, iDstStride, 8, iHeight);
+}
+
+static inline void McHorVer02WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+    int32_t iHeight) { //16-wide (0, 2) filter built from the 8-wide routine
+  for (int32_t iOffset = 0; iOffset < 16; iOffset += 8) //left 8 columns first, then the right 8
+    McHorVer02WidthEq8_sse2 (pSrc + iOffset, iSrcStride, pDst + iOffset, iDstStride, iHeight);
+}
+
+static inline void McHorVer22WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+    int32_t iHeight) { //16-wide (2, 2) filter built from the 8-wide routine
+  for (int32_t iOffset = 0; iOffset < 16; iOffset += 8) //left 8 columns first, then the right 8
+    McHorVer22WidthEq8_sse2 (pSrc + iOffset, iSrcStride, pDst + iOffset, iDstStride, iHeight);
+}
+void McHorVer22Width9Or17Height9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+    int32_t iWidth, int32_t iHeight) { //(2, 2) position; per the name, meant for widths / heights of 9 or 17
+  ENFORCE_STACK_ALIGN_2D (int16_t, pHorTaps, 22, 24, 16)
+  uint8_t* pTapBytes = (uint8_t*)pHorTaps; //the assembly routines address the 16-bit taps in bytes, 48 per row
+  McHorVer22HorFirst_sse2 (pSrc - 2, iSrcStride, pTapBytes, 48, iWidth, iHeight + 5);
+  McHorVer22Width8VerLastAlign_sse2 (pTapBytes, 48, pDst, iDstStride, iWidth - 1, iHeight);
+  McHorVer22Width8VerLastUnAlign_sse2 (pTapBytes + 2 * (iWidth - 8), 48, pDst + iWidth - 8, iDstStride, 8, iHeight); //last 8 columns
+}
+
+//block copy for the x86 build, dispatched on the block width:
+//SSE2 routine for width 16, MMX routines for widths 8 and 4,
+//and the C 2-pixel-wide routine for every other width
+static inline void McCopy_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                int32_t iWidth, int32_t iHeight) {
+  switch (iWidth) {
+  case 16: McCopyWidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  case 8:  McCopyWidthEq8_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  case 4:  McCopyWidthEq4_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  default: McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  }
+}
+
+//(2, 0) half-sample filter, x86 build: SSE2 for widths 16 and 8, the 4-wide MMX routine for every other width
+static inline void McHorVer20_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                    int32_t iWidth, int32_t iHeight) {
+  switch (iWidth) {
+  case 16: McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  case 8:  McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  default: McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  }
+}
+
+//(0, 2) half-sample filter, x86 build: SSE2 for widths 16 and 8, the C routine at width 4 for every other width
+static inline void McHorVer02_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                    int32_t iWidth, int32_t iHeight) {
+  switch (iWidth) {
+  case 16: McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  case 8:  McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  default: McHorVer02_c (pSrc, iSrcStride, pDst, iDstStride, 4, iHeight); break;
+  }
+}
+
+//(2, 2) half-sample filter, x86 build: SSE2 for widths 16 and 8, the C routine at width 4 for every other width
+static inline void McHorVer22_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                    int32_t iWidth, int32_t iHeight) {
+  switch (iWidth) {
+  case 16: McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  case 8:  McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  default: McHorVer22_c (pSrc, iSrcStride, pDst, iDstStride, 4, iHeight); break;
+  }
+}
+
+//quarter-sample position (0, 1), x86 build: the vertical half-sample block (via the McHorVer02_sse2 width
+//dispatcher) is averaged with the source block; a width other than 16 or 8 is treated as 4
+static inline void McHorVer01_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                    int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerHalf, 256, 16);
+  McHorVer02_sse2 (pSrc, iSrcStride, pVerHalf, 16, iWidth, iHeight);
+  if (iWidth == 16) {
+    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc, iSrcStride, pVerHalf, 16, iHeight);
+  } else if (iWidth == 8) {
+    PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrc, iSrcStride, pVerHalf, 16, iHeight);
+  } else {
+    PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrc, iSrcStride, pVerHalf, 16, iHeight);
+  }
+}
+//quarter-sample position (0, 3), x86 build: the vertical half-sample block (via the McHorVer02_sse2 width
+//dispatcher) is averaged with the source block one row down; a width other than 16 or 8 is treated as 4
+static inline void McHorVer03_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                    int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerHalf, 256, 16);
+  McHorVer02_sse2 (pSrc, iSrcStride, pVerHalf, 16, iWidth, iHeight);
+  if (iWidth == 16) {
+    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pVerHalf, 16, iHeight);
+  } else if (iWidth == 8) {
+    PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pVerHalf, 16, iHeight);
+  } else {
+    PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pVerHalf, 16, iHeight);
+  }
+}
+//quarter-sample position (1, 0), x86 build: the horizontal half-sample block (via the McHorVer20_sse2 width
+//dispatcher) is averaged with the source block; a width other than 16 or 8 is treated as 4
+static inline void McHorVer10_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                    int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorHalf, 256, 16);
+  McHorVer20_sse2 (pSrc, iSrcStride, pHorHalf, 16, iWidth, iHeight);
+  if (iWidth == 16) {
+    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc, iSrcStride, pHorHalf, 16, iHeight);
+  } else if (iWidth == 8) {
+    PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrc, iSrcStride, pHorHalf, 16, iHeight);
+  } else {
+    PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrc, iSrcStride, pHorHalf, 16, iHeight);
+  }
+}
+//quarter-sample position (1, 1), x86 build: average of the horizontal and the vertical half-sample blocks.
+//both blocks come from the width dispatchers (16 bytes per scratch row) and are averaged with the routine
+//matching the block width; a width other than 16 or 8 is treated as 4
+static inline void McHorVer11_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                    int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorHalf, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerHalf, 256, 16);
+
+  McHorVer20_sse2 (pSrc, iSrcStride, pHorHalf, 16, iWidth, iHeight);
+  McHorVer02_sse2 (pSrc, iSrcStride, pVerHalf, 16, iWidth, iHeight);
+  if (iWidth == 16) {
+    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorHalf, 16, pVerHalf, 16, iHeight);
+  } else if (iWidth == 8) {
+    PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorHalf, 16, pVerHalf, 16, iHeight);
+  } else {
+    PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorHalf, 16, pVerHalf, 16, iHeight);
+  }
+}
+//quarter-sample position (1, 2), x86 build: average of the vertical half-sample block and the centre (2, 2) block.
+//both blocks come from the width dispatchers (16 bytes per scratch row) and are averaged with the routine
+//matching the block width; a width other than 16 or 8 is treated as 4
+static inline void McHorVer12_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                    int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerHalf, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pCentre, 256, 16);
+
+  McHorVer02_sse2 (pSrc, iSrcStride, pVerHalf, 16, iWidth, iHeight);
+  McHorVer22_sse2 (pSrc, iSrcStride, pCentre, 16, iWidth, iHeight);
+  if (iWidth == 16) {
+    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pVerHalf, 16, pCentre, 16, iHeight);
+  } else if (iWidth == 8) {
+    PixelAvgWidthEq8_mmx (pDst, iDstStride, pVerHalf, 16, pCentre, 16, iHeight);
+  } else {
+    PixelAvgWidthEq4_mmx (pDst, iDstStride, pVerHalf, 16, pCentre, 16, iHeight);
+  }
+}
+//quarter-sample position (1, 3), x86 build: the horizontal half-sample block taken one row down is averaged
+//with the vertical half-sample block. both come from the width dispatchers (16 bytes per scratch row);
+//a width other than 16 or 8 is treated as 4
+static inline void McHorVer13_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                    int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorHalf, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerHalf, 256, 16);
+
+  McHorVer20_sse2 (pSrc + iSrcStride, iSrcStride, pHorHalf, 16, iWidth, iHeight);
+  McHorVer02_sse2 (pSrc, iSrcStride, pVerHalf, 16, iWidth, iHeight);
+  if (iWidth == 16) {
+    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorHalf, 16, pVerHalf, 16, iHeight);
+  } else if (iWidth == 8) {
+    PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorHalf, 16, pVerHalf, 16, iHeight);
+  } else {
+    PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorHalf, 16, pVerHalf, 16, iHeight);
+  }
+}
+//quarter-sample position (2, 1), x86 build: average of the horizontal half-sample block and the centre (2, 2)
+//block. both come from the width dispatchers (16 bytes per scratch row);
+//a width other than 16 or 8 is treated as 4
+static inline void McHorVer21_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                    int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorHalf, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pCentre, 256, 16);
+
+  McHorVer20_sse2 (pSrc, iSrcStride, pHorHalf, 16, iWidth, iHeight);
+  McHorVer22_sse2 (pSrc, iSrcStride, pCentre, 16, iWidth, iHeight);
+  if (iWidth == 16) {
+    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorHalf, 16, pCentre, 16, iHeight);
+  } else if (iWidth == 8) {
+    PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorHalf, 16, pCentre, 16, iHeight);
+  } else {
+    PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorHalf, 16, pCentre, 16, iHeight);
+  }
+}
+//quarter-sample position (2, 3), x86 build: the horizontal half-sample block taken one row down is averaged
+//with the centre (2, 2) block. both come from the width dispatchers (16 bytes per scratch row);
+//a width other than 16 or 8 is treated as 4
+static inline void McHorVer23_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                    int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorHalf, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pCentre, 256, 16);
+
+  McHorVer20_sse2 (pSrc + iSrcStride, iSrcStride, pHorHalf, 16, iWidth, iHeight);
+  McHorVer22_sse2 (pSrc, iSrcStride, pCentre, 16, iWidth, iHeight);
+  if (iWidth == 16) {
+    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorHalf, 16, pCentre, 16, iHeight);
+  } else if (iWidth == 8) {
+    PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorHalf, 16, pCentre, 16, iHeight);
+  } else {
+    PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorHalf, 16, pCentre, 16, iHeight);
+  }
+}
+//quarter-sample position (3, 0), x86 build: the horizontal half-sample block (via the McHorVer20_sse2 width
+//dispatcher) is averaged with the source block one column right; a width other than 16 or 8 is treated as 4
+static inline void McHorVer30_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                    int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorHalf, 256, 16);
+  McHorVer20_sse2 (pSrc, iSrcStride, pHorHalf, 16, iWidth, iHeight);
+  if (iWidth == 16) {
+    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc + 1, iSrcStride, pHorHalf, 16, iHeight);
+  } else if (iWidth == 8) {
+    PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrc + 1, iSrcStride, pHorHalf, 16, iHeight);
+  } else {
+    PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrc + 1, iSrcStride, pHorHalf, 16, iHeight);
+  }
+}
+//quarter-sample position (3, 1), x86 build: the horizontal half-sample block is averaged with the vertical
+//half-sample block taken one column right. both come from the width dispatchers (16 bytes per scratch row);
+//a width other than 16 or 8 is treated as 4
+static inline void McHorVer31_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                    int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorHalf, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerHalf, 256, 16);
+
+  McHorVer20_sse2 (pSrc, iSrcStride, pHorHalf, 16, iWidth, iHeight);
+  McHorVer02_sse2 (pSrc + 1, iSrcStride, pVerHalf, 16, iWidth, iHeight);
+  if (iWidth == 16) {
+    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorHalf, 16, pVerHalf, 16, iHeight);
+  } else if (iWidth == 8) {
+    PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorHalf, 16, pVerHalf, 16, iHeight);
+  } else {
+    PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorHalf, 16, pVerHalf, 16, iHeight);
+  }
+}
+//quarter-sample position (3, 2), x86 build: the vertical half-sample block taken one column right is averaged
+//with the centre (2, 2) block. both come from the width dispatchers (16 bytes per scratch row);
+//a width other than 16 or 8 is treated as 4
+static inline void McHorVer32_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                    int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerHalf, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pCentre, 256, 16);
+
+  McHorVer02_sse2 (pSrc + 1, iSrcStride, pVerHalf, 16, iWidth, iHeight);
+  McHorVer22_sse2 (pSrc, iSrcStride, pCentre, 16, iWidth, iHeight);
+  if (iWidth == 16) {
+    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pVerHalf, 16, pCentre, 16, iHeight);
+  } else if (iWidth == 8) {
+    PixelAvgWidthEq8_mmx (pDst, iDstStride, pVerHalf, 16, pCentre, 16, iHeight);
+  } else {
+    PixelAvgWidthEq4_mmx (pDst, iDstStride, pVerHalf, 16, pCentre, 16, iHeight);
+  }
+}
+//quarter-sample position (3, 3), x86 build: the horizontal half-sample block taken one row down is averaged
+//with the vertical half-sample block taken one column right. both come from the width dispatchers
+//(16 bytes per scratch row); a width other than 16 or 8 is treated as 4
+static inline void McHorVer33_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                    int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorHalf, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerHalf, 256, 16);
+
+  McHorVer20_sse2 (pSrc + iSrcStride, iSrcStride, pHorHalf, 16, iWidth, iHeight);
+  McHorVer02_sse2 (pSrc + 1, iSrcStride, pVerHalf, 16, iWidth, iHeight);
+  if (iWidth == 16) {
+    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorHalf, 16, pVerHalf, 16, iHeight);
+  } else if (iWidth == 8) {
+    PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorHalf, 16, pVerHalf, 16, iHeight);
+  } else {
+    PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorHalf, 16, pVerHalf, 16, iHeight);
+  }
+}
+
+//luma MC, x86 build: pSrc already includes the motion-vector offset; the low two bits of iMvX / iMvY pick the routine
+void McLuma_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                  int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
+  static const PWelsMcWidthHeightFunc kpLumaFuncs[4][4] = { //indexed as [x fraction][y fraction]
+    {McCopy_sse2,     McHorVer01_sse2, McHorVer02_sse2, McHorVer03_sse2},
+    {McHorVer10_sse2, McHorVer11_sse2, McHorVer12_sse2, McHorVer13_sse2},
+    {McHorVer20_sse2, McHorVer21_sse2, McHorVer22_sse2, McHorVer23_sse2},
+    {McHorVer30_sse2, McHorVer31_sse2, McHorVer32_sse2, McHorVer33_sse2},
+  };
+  const int32_t kiFracX = iMvX & 0x03;
+  const int32_t kiFracY = iMvY & 0x03;
+  kpLumaFuncs[kiFracX][kiFracY] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+}
+
+//chroma MC, SSE2 build: no fraction -> plain copy; width 2 -> C routine; any other width -> table lookup by width
+void McChroma_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                    int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
+  static const PMcChromaWidthExtFunc kpWidthFuncs[2] = {
+    McChromaWidthEq4_mmx,  //iWidth >> 3 == 0, e.g. width 4
+    McChromaWidthEq8_sse2  //iWidth >> 3 == 1, e.g. width 8
+  };
+  const int32_t kiFracX = iMvX & 0x07; //low three bits of each motion-vector component
+  const int32_t kiFracY = iMvY & 0x07;
+  if ((kiFracX | kiFracY) == 0) {
+    McCopy_sse2 (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+  } else if (iWidth == 2) {
+    McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
+  } else {
+    kpWidthFuncs[iWidth >> 3] (pSrc, iSrcStride, pDst, iDstStride, g_kuiABCD[kiFracY][kiFracX], iHeight);
+  }
+}
+
+//chroma MC, SSSE3 build: no fraction -> plain copy; width 2 -> C routine; any other width -> table lookup by width
+void McChroma_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                     int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
+  static const PMcChromaWidthExtFunc kpWidthFuncs[2] = {
+    McChromaWidthEq4_mmx,   //iWidth >> 3 == 0, e.g. width 4
+    McChromaWidthEq8_ssse3  //iWidth >> 3 == 1, e.g. width 8
+  };
+  const int32_t kiFracX = iMvX & 0x07; //low three bits of each motion-vector component
+  const int32_t kiFracY = iMvY & 0x07;
+  if ((kiFracX | kiFracY) == 0) {
+    McCopy_sse2 (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+  } else if (iWidth == 2) {
+    McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
+  } else {
+    kpWidthFuncs[iWidth >> 3] (pSrc, iSrcStride, pDst, iDstStride, g_kuiABCD[kiFracY][kiFracX], iHeight);
+  }
+}
+
+//averages two source blocks into pDst with the routine picked by iWidth >> 4:
+//index 0 (e.g. width 8) is the 8-wide MMX version, index 1 (e.g. width 16) the 16-wide SSE2 version
+void PixelAvg_sse2 (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
+                    const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) {
+  static const PWelsSampleWidthAveragingFunc kpAvgByWidth[2] = { PixelAvgWidthEq8_mmx, PixelAvgWidthEq16_sse2 };
+  const int32_t kiIdx = iWidth >> 4;
+  kpAvgByWidth[kiIdx] (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight);
+}
+
+#endif //X86_ASM
+//***************************************************************************//
+//                       NEON implementation                      //
+//***************************************************************************//
+#if defined(HAVE_NEON)
+void McHorVer20Width9Or17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                int32_t iWidth, int32_t iHeight) {
+  switch (iWidth) { //(2, 0) filter for 9- or 17-pixel-wide blocks, NEON build
+  case 17: McHorVer20Width17_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  default: McHorVer20Width9_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break; //any width but 17 is treated as 9
+  }
+}
+void McHorVer02Height9Or17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                 int32_t iWidth, int32_t iHeight) {
+  switch (iWidth) { //(0, 2) filter, NEON build; the routine is chosen from the width, not from iHeight
+  case 16: McHorVer02Height17_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  default: McHorVer02Height9_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break; //any width but 16 is treated as 8
+  }
+}
+void McHorVer22Width9Or17Height9Or17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+    int32_t iWidth, int32_t iHeight) {
+  switch (iWidth) { //(2, 2) filter for 9- or 17-pixel-wide blocks, NEON build
+  case 17: McHorVer22Width17_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  default: McHorVer22Width9_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break; //any width but 17 is treated as 9
+  }
+}
+//block copy for the NEON build, dispatched on the block width: NEON routines for widths 16, 8 and 4,
+//and the C 2-pixel-wide routine for every other width
+void McCopy_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                  int32_t iWidth, int32_t iHeight) {
+  switch (iWidth) {
+  case 16: McCopyWidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  case 8:  McCopyWidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  case 4:  McCopyWidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  default: McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  }
+}
+void McHorVer20_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
+  switch (iWidth) { //(2, 0) half-sample filter, NEON build, dispatched on the block width
+  case 16: McHorVer20WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  case 8:  McHorVer20WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  case 4:  McHorVer20WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  default: break; //any other width: no routine is called
+  }
+}
+void McHorVer02_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
+  switch (iWidth) { //(0, 2) half-sample filter, NEON build, dispatched on the block width
+  case 16: McHorVer02WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  case 8:  McHorVer02WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  case 4:  McHorVer02WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  default: break; //any other width: no routine is called
+  }
+}
+void McHorVer22_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
+  switch (iWidth) { //(2, 2) half-sample filter, NEON build, dispatched on the block width
+  case 16: McHorVer22WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  case 8:  McHorVer22WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  case 4:  McHorVer22WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  default: break; //any other width: no routine is called
+  }
+}
+
+void McHorVer01_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
+  switch (iWidth) { //quarter-sample position (0, 1), NEON build, dispatched on the block width
+  case 16: McHorVer01WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  case 8:  McHorVer01WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  case 4:  McHorVer01WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  default: break; //any other width: no routine is called
+  }
+}
+void McHorVer03_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
+  switch (iWidth) { //quarter-sample position (0, 3), NEON build, dispatched on the block width
+  case 16: McHorVer03WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  case 8:  McHorVer03WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  case 4:  McHorVer03WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  default: break; //any other width: no routine is called
+  }
+}
+void McHorVer10_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
+  switch (iWidth) { //quarter-sample position (1, 0), NEON build, dispatched on the block width
+  case 16: McHorVer10WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  case 8:  McHorVer10WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  case 4:  McHorVer10WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  default: break; //any other width: no routine is called
+  }
+}
+//quarter-sample position (1, 1), NEON build: average of the horizontal and the vertical half-sample blocks.
+//both blocks come from the NEON width dispatchers (16 bytes per scratch row); for a width other than
+//16, 8 or 4 neither the dispatchers nor this function call any routine
+void McHorVer11_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorHalf, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerHalf, 256, 16);
+
+  McHorVer20_neon (pSrc, iSrcStride, pHorHalf, 16, iWidth, iHeight);
+  McHorVer02_neon (pSrc, iSrcStride, pVerHalf, 16, iWidth, iHeight);
+  if (iWidth == 16) {
+    PixelAvgWidthEq16_neon (pDst, iDstStride, pHorHalf, pVerHalf, iHeight);
+  } else if (iWidth == 8) {
+    PixelAvgWidthEq8_neon (pDst, iDstStride, pHorHalf, pVerHalf, iHeight);
+  } else if (iWidth == 4) {
+    PixelAvgWidthEq4_neon (pDst, iDstStride, pHorHalf, pVerHalf, iHeight);
+  }
+}
+//quarter-sample position (1, 2), NEON build: average of the vertical half-sample block and the centre (2, 2)
+//block. both come from the NEON width dispatchers (16 bytes per scratch row); for a width other than
+//16, 8 or 4 neither the dispatchers nor this function call any routine
+void McHorVer12_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerHalf, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pCentre, 256, 16);
+
+  McHorVer02_neon (pSrc, iSrcStride, pVerHalf, 16, iWidth, iHeight);
+  McHorVer22_neon (pSrc, iSrcStride, pCentre, 16, iWidth, iHeight);
+  if (iWidth == 16) {
+    PixelAvgWidthEq16_neon (pDst, iDstStride, pVerHalf, pCentre, iHeight);
+  } else if (iWidth == 8) {
+    PixelAvgWidthEq8_neon (pDst, iDstStride, pVerHalf, pCentre, iHeight);
+  } else if (iWidth == 4) {
+    PixelAvgWidthEq4_neon (pDst, iDstStride, pVerHalf, pCentre, iHeight);
+  }
+}
+// Luma MC for fractional position (x, y) = (1, 3) of the McLuma_neon [x][y] table.
+// The horizontal half-pel result (McHorVer20*) is taken one row down (pSrc + iSrcStride), the
+// vertical half-pel result (McHorVer02*) at pSrc; PixelAvgWidthEq*_neon merges both into pDst.
+// Widths other than 16, 8 and 4 leave pDst untouched.
+void McHorVer13_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+  const int32_t kiTmpStride = 16; // row pitch used for both scratch buffers
+  const uint8_t* pSrcNextRow = pSrc + iSrcStride;
+  switch (iWidth) {
+  case 16:
+    McHorVer20WidthEq16_neon (pSrcNextRow, iSrcStride, pHorTmp, kiTmpStride, iHeight);
+    McHorVer02WidthEq16_neon (pSrc, iSrcStride, pVerTmp, kiTmpStride, iHeight);
+    PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
+    break;
+  case 8:
+    McHorVer20WidthEq8_neon (pSrcNextRow, iSrcStride, pHorTmp, kiTmpStride, iHeight);
+    McHorVer02WidthEq8_neon (pSrc, iSrcStride, pVerTmp, kiTmpStride, iHeight);
+    PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
+    break;
+  case 4:
+    McHorVer20WidthEq4_neon (pSrcNextRow, iSrcStride, pHorTmp, kiTmpStride, iHeight);
+    McHorVer02WidthEq4_neon (pSrc, iSrcStride, pVerTmp, kiTmpStride, iHeight);
+    PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
+    break;
+  default: // unsupported width
+    break;
+  }
+}
+// Luma MC for fractional position (x, y) = (2, 1) of the McLuma_neon [x][y] table.
+// The horizontal half-pel result (McHorVer20*) and the centre half-pel result (McHorVer22*),
+// both taken at pSrc, are produced in scratch buffers and merged into pDst by PixelAvgWidthEq*_neon.
+// Widths other than 16, 8 and 4 leave pDst untouched.
+void McHorVer21_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
+  const int32_t kiTmpStride = 16; // row pitch used for both scratch buffers
+  switch (iWidth) {
+  case 16:
+    McHorVer20WidthEq16_neon (pSrc, iSrcStride, pHorTmp, kiTmpStride, iHeight);
+    McHorVer22WidthEq16_neon (pSrc, iSrcStride, pCtrTmp, kiTmpStride, iHeight);
+    PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
+    break;
+  case 8:
+    McHorVer20WidthEq8_neon (pSrc, iSrcStride, pHorTmp, kiTmpStride, iHeight);
+    McHorVer22WidthEq8_neon (pSrc, iSrcStride, pCtrTmp, kiTmpStride, iHeight);
+    PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
+    break;
+  case 4:
+    McHorVer20WidthEq4_neon (pSrc, iSrcStride, pHorTmp, kiTmpStride, iHeight);
+    McHorVer22WidthEq4_neon (pSrc, iSrcStride, pCtrTmp, kiTmpStride, iHeight);
+    PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
+    break;
+  default: // unsupported width
+    break;
+  }
+}
+// Luma MC for fractional position (x, y) = (2, 3) of the McLuma_neon [x][y] table.
+// The horizontal half-pel result (McHorVer20*) is taken one row down (pSrc + iSrcStride), the
+// centre half-pel result (McHorVer22*) at pSrc; PixelAvgWidthEq*_neon merges both into pDst.
+// Widths other than 16, 8 and 4 leave pDst untouched.
+void McHorVer23_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
+  const int32_t kiTmpStride = 16; // row pitch used for both scratch buffers
+  const uint8_t* pSrcNextRow = pSrc + iSrcStride;
+  switch (iWidth) {
+  case 16:
+    McHorVer20WidthEq16_neon (pSrcNextRow, iSrcStride, pHorTmp, kiTmpStride, iHeight);
+    McHorVer22WidthEq16_neon (pSrc, iSrcStride, pCtrTmp, kiTmpStride, iHeight);
+    PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
+    break;
+  case 8:
+    McHorVer20WidthEq8_neon (pSrcNextRow, iSrcStride, pHorTmp, kiTmpStride, iHeight);
+    McHorVer22WidthEq8_neon (pSrc, iSrcStride, pCtrTmp, kiTmpStride, iHeight);
+    PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
+    break;
+  case 4:
+    McHorVer20WidthEq4_neon (pSrcNextRow, iSrcStride, pHorTmp, kiTmpStride, iHeight);
+    McHorVer22WidthEq4_neon (pSrc, iSrcStride, pCtrTmp, kiTmpStride, iHeight);
+    PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
+    break;
+  default: // unsupported width
+    break;
+  }
+}
+// Luma MC for fractional position (x, y) = (3, 0) of the McLuma_neon [x][y] table.
+// Dispatches on iWidth to the matching NEON routine; widths other than 16, 8 and 4 do nothing.
+void McHorVer30_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
+  switch (iWidth) {
+  case 16: McHorVer30WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  case 8: McHorVer30WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  case 4: McHorVer30WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  default: break; // unsupported width: nothing is written
+  }
+}
+// Luma MC for fractional position (x, y) = (3, 1) of the McLuma_neon [x][y] table.
+// The horizontal half-pel result (McHorVer20*) is taken at pSrc, the vertical half-pel result
+// (McHorVer02*) one column to the right (pSrc + 1); PixelAvgWidthEq*_neon merges both into pDst.
+// Widths other than 16, 8 and 4 leave pDst untouched.
+void McHorVer31_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+  const int32_t kiTmpStride = 16; // row pitch used for both scratch buffers
+  const uint8_t* pSrcNextCol = pSrc + 1;
+  switch (iWidth) {
+  case 16:
+    McHorVer20WidthEq16_neon (pSrc, iSrcStride, pHorTmp, kiTmpStride, iHeight);
+    McHorVer02WidthEq16_neon (pSrcNextCol, iSrcStride, pVerTmp, kiTmpStride, iHeight);
+    PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
+    break;
+  case 8:
+    McHorVer20WidthEq8_neon (pSrc, iSrcStride, pHorTmp, kiTmpStride, iHeight);
+    McHorVer02WidthEq8_neon (pSrcNextCol, iSrcStride, pVerTmp, kiTmpStride, iHeight);
+    PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
+    break;
+  case 4:
+    McHorVer20WidthEq4_neon (pSrc, iSrcStride, pHorTmp, kiTmpStride, iHeight);
+    McHorVer02WidthEq4_neon (pSrcNextCol, iSrcStride, pVerTmp, kiTmpStride, iHeight);
+    PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
+    break;
+  default: // unsupported width
+    break;
+  }
+}
+// Luma MC for fractional position (x, y) = (3, 2) of the McLuma_neon [x][y] table.
+// The vertical half-pel result (McHorVer02*) is taken one column to the right (pSrc + 1), the
+// centre half-pel result (McHorVer22*) at pSrc; PixelAvgWidthEq*_neon merges both into pDst.
+// Widths other than 16, 8 and 4 leave pDst untouched.
+void McHorVer32_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
+  const int32_t kiTmpStride = 16; // row pitch used for both scratch buffers
+  const uint8_t* pSrcNextCol = pSrc + 1;
+  switch (iWidth) {
+  case 16:
+    McHorVer02WidthEq16_neon (pSrcNextCol, iSrcStride, pVerTmp, kiTmpStride, iHeight);
+    McHorVer22WidthEq16_neon (pSrc, iSrcStride, pCtrTmp, kiTmpStride, iHeight);
+    PixelAvgWidthEq16_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
+    break;
+  case 8:
+    McHorVer02WidthEq8_neon (pSrcNextCol, iSrcStride, pVerTmp, kiTmpStride, iHeight);
+    McHorVer22WidthEq8_neon (pSrc, iSrcStride, pCtrTmp, kiTmpStride, iHeight);
+    PixelAvgWidthEq8_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
+    break;
+  case 4:
+    McHorVer02WidthEq4_neon (pSrcNextCol, iSrcStride, pVerTmp, kiTmpStride, iHeight);
+    McHorVer22WidthEq4_neon (pSrc, iSrcStride, pCtrTmp, kiTmpStride, iHeight);
+    PixelAvgWidthEq4_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
+    break;
+  default: // unsupported width
+    break;
+  }
+}
+// Luma MC for fractional position (x, y) = (3, 3) of the McLuma_neon [x][y] table.
+// The horizontal half-pel result (McHorVer20*) is taken one row down (pSrc + iSrcStride), the
+// vertical half-pel result (McHorVer02*) one column to the right (pSrc + 1);
+// PixelAvgWidthEq*_neon merges both into pDst.
+// Widths other than 16, 8 and 4 leave pDst untouched.
+void McHorVer33_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                      int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+  const int32_t kiTmpStride = 16; // row pitch used for both scratch buffers
+  const uint8_t* pSrcNextRow = pSrc + iSrcStride;
+  const uint8_t* pSrcNextCol = pSrc + 1;
+  switch (iWidth) {
+  case 16:
+    McHorVer20WidthEq16_neon (pSrcNextRow, iSrcStride, pHorTmp, kiTmpStride, iHeight);
+    McHorVer02WidthEq16_neon (pSrcNextCol, iSrcStride, pVerTmp, kiTmpStride, iHeight);
+    PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
+    break;
+  case 8:
+    McHorVer20WidthEq8_neon (pSrcNextRow, iSrcStride, pHorTmp, kiTmpStride, iHeight);
+    McHorVer02WidthEq8_neon (pSrcNextCol, iSrcStride, pVerTmp, kiTmpStride, iHeight);
+    PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
+    break;
+  case 4:
+    McHorVer20WidthEq4_neon (pSrcNextRow, iSrcStride, pHorTmp, kiTmpStride, iHeight);
+    McHorVer02WidthEq4_neon (pSrcNextCol, iSrcStride, pVerTmp, kiTmpStride, iHeight);
+    PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
+    break;
+  default: // unsupported width
+    break;
+  }
+}
+
+// Luma motion-compensation entry point for the ARM NEON back end.
+// Only the two low bits of iMvX and iMvY are consulted; together they pick one of 16
+// interpolation routines. pSrc is used exactly as passed in: the integer part of the vector
+// (iMv >> 2) is not applied to it inside this function.
+void McLuma_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                  int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
+  // First index: x fraction, second index: y fraction.
+  static const PWelsMcWidthHeightFunc kpfLumaMc[4][4] = {
+    { McCopy_neon,     McHorVer01_neon, McHorVer02_neon, McHorVer03_neon },
+    { McHorVer10_neon, McHorVer11_neon, McHorVer12_neon, McHorVer13_neon },
+    { McHorVer20_neon, McHorVer21_neon, McHorVer22_neon, McHorVer23_neon },
+    { McHorVer30_neon, McHorVer31_neon, McHorVer32_neon, McHorVer33_neon },
+  };
+  const int32_t kiFracX = iMvX & 0x03;
+  const int32_t kiFracY = iMvY & 0x03;
+  kpfLumaMc[kiFracX][kiFracY] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+}
+// Chroma motion-compensation entry point for the ARM NEON back end.
+// A zero vector is served by a plain block copy. Otherwise the three low bits of each vector
+// component select a weight set from g_kuiABCD[y][x] for the 8- and 4-wide NEON routines.
+// Any width other than 8 or 4 (2 is the expected case) goes through the C implementations.
+void McChroma_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                    int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
+  if (0 == iMvX && 0 == iMvY) {
+    switch (iWidth) {
+    case 8: McCopyWidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+    case 4: McCopyWidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+    default: McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight); break; // here iWidth == 2
+    }
+    return;
+  }
+
+  const int32_t kiD8x = iMvX & 0x07;
+  const int32_t kiD8y = iMvY & 0x07;
+  int32_t* pWeights = (int32_t*) (g_kuiABCD[kiD8y][kiD8x]);
+  switch (iWidth) {
+  case 8: McChromaWidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, pWeights, iHeight); break;
+  case 4: McChromaWidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, pWeights, iHeight); break;
+  default: // here iWidth == 2
+    McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
+    break;
+  }
+}
+// Sample-averaging entry point for the ARM NEON back end: forwards to the 8-wide or 16-wide
+// strided routine. The table index is iWidth >> 4, so 8 maps to slot 0 and 16 to slot 1.
+// There is no range check; a width whose shifted value is not 0 or 1 indexes outside the table.
+void PixelAvg_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
+                    const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) {
+  static const PWelsSampleWidthAveragingFunc kpfAvgByWidth[2] = {
+    PixStrideAvgWidthEq8_neon,  // iWidth >> 4 == 0
+    PixStrideAvgWidthEq16_neon  // iWidth >> 4 == 1
+  };
+  const PWelsSampleWidthAveragingFunc pfAverage = kpfAvgByWidth[iWidth >> 4];
+  pfAverage (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight);
+}
+#endif
+#if defined(HAVE_NEON_AARCH64)
+// Horizontal half-pel luma filter for AArch64 NEON (installed as pfLumaHalfpelHor by InitMcFunc).
+// iWidth == 17 selects the 17-wide routine; every other value is handled as the 9-wide case.
+void McHorVer20Width9Or17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                        int32_t iWidth, int32_t iHeight) {
+  if (iWidth != 17) { // expected to be 9
+    McHorVer20Width9_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    return;
+  }
+  McHorVer20Width17_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+}
+// Vertical half-pel luma filter for AArch64 NEON (installed as pfLumaHalfpelVer by InitMcFunc).
+// The choice is made on iWidth, not iHeight: 16 selects the Height17 routine, every other value
+// is handled by the Height9 routine.
+void McHorVer02Height9Or17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+    int32_t iWidth, int32_t iHeight) {
+  if (iWidth != 16) { // expected to be 8
+    McHorVer02Height9_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    return;
+  }
+  McHorVer02Height17_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+}
+// Centre half-pel luma filter for AArch64 NEON (installed as pfLumaHalfpelCen by InitMcFunc).
+// iWidth == 17 selects the 17-wide routine; every other value is handled as the 9-wide case.
+void McHorVer22Width9Or17Height9Or17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
+    int32_t iDstStride,
+    int32_t iWidth, int32_t iHeight) {
+  if (iWidth != 17) { // expected to be 9
+    McHorVer22Width9_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+    return;
+  }
+  McHorVer22Width17_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+}
+// Block copy used for fractional position (x, y) = (0, 0) of the McLuma_AArch64_neon table.
+// Widths 16, 8 and 4 use the AArch64 NEON copies; any other width falls back to McCopyWidthEq2_c.
+void McCopy_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                          int32_t iWidth, int32_t iHeight) {
+  switch (iWidth) {
+  case 16: McCopyWidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  case 8: McCopyWidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  case 4: McCopyWidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  default: McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  }
+}
+// Luma MC for fractional position (x, y) = (2, 0) of the McLuma_AArch64_neon [x][y] table.
+// Dispatches on iWidth to the matching AArch64 NEON routine; other widths do nothing.
+void McHorVer20_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iWidth, int32_t iHeight) {
+  switch (iWidth) {
+  case 16: McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  case 8: McHorVer20WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  case 4: McHorVer20WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  default: break; // unsupported width: nothing is written
+  }
+}
+// Luma MC for fractional position (x, y) = (0, 2) of the McLuma_AArch64_neon [x][y] table.
+// Dispatches on iWidth to the matching AArch64 NEON routine; other widths do nothing.
+void McHorVer02_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iWidth, int32_t iHeight) {
+  switch (iWidth) {
+  case 16: McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  case 8: McHorVer02WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  case 4: McHorVer02WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  default: break; // unsupported width: nothing is written
+  }
+}
+// Luma MC for fractional position (x, y) = (2, 2) of the McLuma_AArch64_neon [x][y] table.
+// Dispatches on iWidth to the matching AArch64 NEON routine; other widths do nothing.
+void McHorVer22_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iWidth, int32_t iHeight) {
+  switch (iWidth) {
+  case 16: McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  case 8: McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  case 4: McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  default: break; // unsupported width: nothing is written
+  }
+}
+
+// Luma MC for fractional position (x, y) = (0, 1) of the McLuma_AArch64_neon [x][y] table.
+// Dispatches on iWidth to the matching AArch64 NEON routine; other widths do nothing.
+void McHorVer01_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iWidth, int32_t iHeight) {
+  switch (iWidth) {
+  case 16: McHorVer01WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  case 8: McHorVer01WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  case 4: McHorVer01WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  default: break; // unsupported width: nothing is written
+  }
+}
+// Luma MC for fractional position (x, y) = (0, 3) of the McLuma_AArch64_neon [x][y] table.
+// Dispatches on iWidth to the matching AArch64 NEON routine; other widths do nothing.
+void McHorVer03_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iWidth, int32_t iHeight) {
+  switch (iWidth) {
+  case 16: McHorVer03WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  case 8: McHorVer03WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  case 4: McHorVer03WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  default: break; // unsupported width: nothing is written
+  }
+}
+// Luma MC for fractional position (x, y) = (1, 0) of the McLuma_AArch64_neon [x][y] table.
+// Dispatches on iWidth to the matching AArch64 NEON routine; other widths do nothing.
+void McHorVer10_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iWidth, int32_t iHeight) {
+  switch (iWidth) {
+  case 16: McHorVer10WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  case 8: McHorVer10WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  case 4: McHorVer10WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  default: break; // unsupported width: nothing is written
+  }
+}
+// Luma MC for fractional position (x, y) = (1, 1) of the McLuma_AArch64_neon [x][y] table.
+// The horizontal half-pel result (McHorVer20*) and the vertical half-pel result (McHorVer02*),
+// both taken at pSrc, are produced in scratch buffers and merged into pDst by
+// PixelAvgWidthEq*_AArch64_neon. Widths other than 16, 8 and 4 leave pDst untouched.
+void McHorVer11_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+  const int32_t kiTmpStride = 16; // row pitch used for both scratch buffers
+  switch (iWidth) {
+  case 16:
+    McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pHorTmp, kiTmpStride, iHeight);
+    McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, pVerTmp, kiTmpStride, iHeight);
+    PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, kiTmpStride, pVerTmp, kiTmpStride, iHeight);
+    break;
+  case 8:
+    McHorVer20WidthEq8_AArch64_neon (pSrc, iSrcStride, pHorTmp, kiTmpStride, iHeight);
+    McHorVer02WidthEq8_AArch64_neon (pSrc, iSrcStride, pVerTmp, kiTmpStride, iHeight);
+    PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, kiTmpStride, pVerTmp, kiTmpStride, iHeight);
+    break;
+  case 4:
+    McHorVer20WidthEq4_AArch64_neon (pSrc, iSrcStride, pHorTmp, kiTmpStride, iHeight);
+    McHorVer02WidthEq4_AArch64_neon (pSrc, iSrcStride, pVerTmp, kiTmpStride, iHeight);
+    PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, kiTmpStride, pVerTmp, kiTmpStride, iHeight);
+    break;
+  default: // unsupported width
+    break;
+  }
+}
+// Luma MC for fractional position (x, y) = (1, 2) of the McLuma_AArch64_neon [x][y] table.
+// The vertical half-pel result (McHorVer02*) and the centre half-pel result (McHorVer22*),
+// both taken at pSrc, are produced in scratch buffers and merged into pDst by
+// PixelAvgWidthEq*_AArch64_neon. Widths other than 16, 8 and 4 leave pDst untouched.
+void McHorVer12_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
+  const int32_t kiTmpStride = 16; // row pitch used for both scratch buffers
+  switch (iWidth) {
+  case 16:
+    McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, pVerTmp, kiTmpStride, iHeight);
+    McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pCtrTmp, kiTmpStride, iHeight);
+    PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pVerTmp, kiTmpStride, pCtrTmp, kiTmpStride, iHeight);
+    break;
+  case 8:
+    McHorVer02WidthEq8_AArch64_neon (pSrc, iSrcStride, pVerTmp, kiTmpStride, iHeight);
+    McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pCtrTmp, kiTmpStride, iHeight);
+    PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pVerTmp, kiTmpStride, pCtrTmp, kiTmpStride, iHeight);
+    break;
+  case 4:
+    McHorVer02WidthEq4_AArch64_neon (pSrc, iSrcStride, pVerTmp, kiTmpStride, iHeight);
+    McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pCtrTmp, kiTmpStride, iHeight);
+    PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pVerTmp, kiTmpStride, pCtrTmp, kiTmpStride, iHeight);
+    break;
+  default: // unsupported width
+    break;
+  }
+}
+// Luma MC for fractional position (x, y) = (1, 3) of the McLuma_AArch64_neon [x][y] table.
+// The horizontal half-pel result (McHorVer20*) is taken one row down (pSrc + iSrcStride), the
+// vertical half-pel result (McHorVer02*) at pSrc; PixelAvgWidthEq*_AArch64_neon merges both
+// into pDst. Widths other than 16, 8 and 4 leave pDst untouched.
+void McHorVer13_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+  const int32_t kiTmpStride = 16; // row pitch used for both scratch buffers
+  const uint8_t* pSrcNextRow = pSrc + iSrcStride;
+  switch (iWidth) {
+  case 16:
+    McHorVer20WidthEq16_AArch64_neon (pSrcNextRow, iSrcStride, pHorTmp, kiTmpStride, iHeight);
+    McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, pVerTmp, kiTmpStride, iHeight);
+    PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, kiTmpStride, pVerTmp, kiTmpStride, iHeight);
+    break;
+  case 8:
+    McHorVer20WidthEq8_AArch64_neon (pSrcNextRow, iSrcStride, pHorTmp, kiTmpStride, iHeight);
+    McHorVer02WidthEq8_AArch64_neon (pSrc, iSrcStride, pVerTmp, kiTmpStride, iHeight);
+    PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, kiTmpStride, pVerTmp, kiTmpStride, iHeight);
+    break;
+  case 4:
+    McHorVer20WidthEq4_AArch64_neon (pSrcNextRow, iSrcStride, pHorTmp, kiTmpStride, iHeight);
+    McHorVer02WidthEq4_AArch64_neon (pSrc, iSrcStride, pVerTmp, kiTmpStride, iHeight);
+    PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, kiTmpStride, pVerTmp, kiTmpStride, iHeight);
+    break;
+  default: // unsupported width
+    break;
+  }
+}
+// Luma MC for fractional position (x, y) = (2, 1) of the McLuma_AArch64_neon [x][y] table.
+// The horizontal half-pel result (McHorVer20*) and the centre half-pel result (McHorVer22*),
+// both taken at pSrc, are produced in scratch buffers and merged into pDst by
+// PixelAvgWidthEq*_AArch64_neon. Widths other than 16, 8 and 4 leave pDst untouched.
+void McHorVer21_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
+  const int32_t kiTmpStride = 16; // row pitch used for both scratch buffers
+  switch (iWidth) {
+  case 16:
+    McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pHorTmp, kiTmpStride, iHeight);
+    McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pCtrTmp, kiTmpStride, iHeight);
+    PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, kiTmpStride, pCtrTmp, kiTmpStride, iHeight);
+    break;
+  case 8:
+    McHorVer20WidthEq8_AArch64_neon (pSrc, iSrcStride, pHorTmp, kiTmpStride, iHeight);
+    McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pCtrTmp, kiTmpStride, iHeight);
+    PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, kiTmpStride, pCtrTmp, kiTmpStride, iHeight);
+    break;
+  case 4:
+    McHorVer20WidthEq4_AArch64_neon (pSrc, iSrcStride, pHorTmp, kiTmpStride, iHeight);
+    McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pCtrTmp, kiTmpStride, iHeight);
+    PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, kiTmpStride, pCtrTmp, kiTmpStride, iHeight);
+    break;
+  default: // unsupported width
+    break;
+  }
+}
+// Luma MC for fractional position (x, y) = (2, 3) of the McLuma_AArch64_neon [x][y] table.
+// The horizontal half-pel result (McHorVer20*) is taken one row down (pSrc + iSrcStride), the
+// centre half-pel result (McHorVer22*) at pSrc; PixelAvgWidthEq*_AArch64_neon merges both
+// into pDst. Widths other than 16, 8 and 4 leave pDst untouched.
+void McHorVer23_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
+  const int32_t kiTmpStride = 16; // row pitch used for both scratch buffers
+  const uint8_t* pSrcNextRow = pSrc + iSrcStride;
+  switch (iWidth) {
+  case 16:
+    McHorVer20WidthEq16_AArch64_neon (pSrcNextRow, iSrcStride, pHorTmp, kiTmpStride, iHeight);
+    McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pCtrTmp, kiTmpStride, iHeight);
+    PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, kiTmpStride, pCtrTmp, kiTmpStride, iHeight);
+    break;
+  case 8:
+    McHorVer20WidthEq8_AArch64_neon (pSrcNextRow, iSrcStride, pHorTmp, kiTmpStride, iHeight);
+    McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pCtrTmp, kiTmpStride, iHeight);
+    PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, kiTmpStride, pCtrTmp, kiTmpStride, iHeight);
+    break;
+  case 4:
+    McHorVer20WidthEq4_AArch64_neon (pSrcNextRow, iSrcStride, pHorTmp, kiTmpStride, iHeight);
+    McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pCtrTmp, kiTmpStride, iHeight);
+    PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, kiTmpStride, pCtrTmp, kiTmpStride, iHeight);
+    break;
+  default: // unsupported width
+    break;
+  }
+}
+// Luma MC for fractional position (x, y) = (3, 0) of the McLuma_AArch64_neon [x][y] table.
+// Dispatches on iWidth to the matching AArch64 NEON routine; other widths do nothing.
+void McHorVer30_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iWidth, int32_t iHeight) {
+  switch (iWidth) {
+  case 16: McHorVer30WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  case 8: McHorVer30WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  case 4: McHorVer30WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+  default: break; // unsupported width: nothing is written
+  }
+}
+// Luma MC for fractional position (x, y) = (3, 1) of the McLuma_AArch64_neon [x][y] table.
+// The horizontal half-pel result (McHorVer20*) is taken at pSrc, the vertical half-pel result
+// (McHorVer02*) one column to the right (pSrc + 1); PixelAvgWidthEq*_AArch64_neon merges both
+// into pDst. Widths other than 16, 8 and 4 leave pDst untouched.
+void McHorVer31_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+  const int32_t kiTmpStride = 16; // row pitch used for both scratch buffers
+  const uint8_t* pSrcNextCol = pSrc + 1;
+  switch (iWidth) {
+  case 16:
+    McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pHorTmp, kiTmpStride, iHeight);
+    McHorVer02WidthEq16_AArch64_neon (pSrcNextCol, iSrcStride, pVerTmp, kiTmpStride, iHeight);
+    PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, kiTmpStride, pVerTmp, kiTmpStride, iHeight);
+    break;
+  case 8:
+    McHorVer20WidthEq8_AArch64_neon (pSrc, iSrcStride, pHorTmp, kiTmpStride, iHeight);
+    McHorVer02WidthEq8_AArch64_neon (pSrcNextCol, iSrcStride, pVerTmp, kiTmpStride, iHeight);
+    PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, kiTmpStride, pVerTmp, kiTmpStride, iHeight);
+    break;
+  case 4:
+    McHorVer20WidthEq4_AArch64_neon (pSrc, iSrcStride, pHorTmp, kiTmpStride, iHeight);
+    McHorVer02WidthEq4_AArch64_neon (pSrcNextCol, iSrcStride, pVerTmp, kiTmpStride, iHeight);
+    PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, kiTmpStride, pVerTmp, kiTmpStride, iHeight);
+    break;
+  default: // unsupported width
+    break;
+  }
+}
+// Luma MC for fractional position (x, y) = (3, 2) of the McLuma_AArch64_neon [x][y] table.
+// The vertical half-pel result (McHorVer02*) is taken one column to the right (pSrc + 1), the
+// centre half-pel result (McHorVer22*) at pSrc; PixelAvgWidthEq*_AArch64_neon merges both
+// into pDst. Widths other than 16, 8 and 4 leave pDst untouched.
+void McHorVer32_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
+  const int32_t kiTmpStride = 16; // row pitch used for both scratch buffers
+  const uint8_t* pSrcNextCol = pSrc + 1;
+  switch (iWidth) {
+  case 16:
+    McHorVer02WidthEq16_AArch64_neon (pSrcNextCol, iSrcStride, pVerTmp, kiTmpStride, iHeight);
+    McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pCtrTmp, kiTmpStride, iHeight);
+    PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pVerTmp, kiTmpStride, pCtrTmp, kiTmpStride, iHeight);
+    break;
+  case 8:
+    McHorVer02WidthEq8_AArch64_neon (pSrcNextCol, iSrcStride, pVerTmp, kiTmpStride, iHeight);
+    McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pCtrTmp, kiTmpStride, iHeight);
+    PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pVerTmp, kiTmpStride, pCtrTmp, kiTmpStride, iHeight);
+    break;
+  case 4:
+    McHorVer02WidthEq4_AArch64_neon (pSrcNextCol, iSrcStride, pVerTmp, kiTmpStride, iHeight);
+    McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pCtrTmp, kiTmpStride, iHeight);
+    PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pVerTmp, kiTmpStride, pCtrTmp, kiTmpStride, iHeight);
+    break;
+  default: // unsupported width
+    break;
+  }
+}
+// Luma MC for fractional position (x, y) = (3, 3) of the McLuma_AArch64_neon [x][y] table.
+// The horizontal half-pel result (McHorVer20*) is taken one row down (pSrc + iSrcStride), the
+// vertical half-pel result (McHorVer02*) one column to the right (pSrc + 1);
+// PixelAvgWidthEq*_AArch64_neon merges both into pDst.
+// Widths other than 16, 8 and 4 leave pDst untouched.
+void McHorVer33_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+  const int32_t kiTmpStride = 16; // row pitch used for both scratch buffers
+  const uint8_t* pSrcNextRow = pSrc + iSrcStride;
+  const uint8_t* pSrcNextCol = pSrc + 1;
+  switch (iWidth) {
+  case 16:
+    McHorVer20WidthEq16_AArch64_neon (pSrcNextRow, iSrcStride, pHorTmp, kiTmpStride, iHeight);
+    McHorVer02WidthEq16_AArch64_neon (pSrcNextCol, iSrcStride, pVerTmp, kiTmpStride, iHeight);
+    PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, kiTmpStride, pVerTmp, kiTmpStride, iHeight);
+    break;
+  case 8:
+    McHorVer20WidthEq8_AArch64_neon (pSrcNextRow, iSrcStride, pHorTmp, kiTmpStride, iHeight);
+    McHorVer02WidthEq8_AArch64_neon (pSrcNextCol, iSrcStride, pVerTmp, kiTmpStride, iHeight);
+    PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, kiTmpStride, pVerTmp, kiTmpStride, iHeight);
+    break;
+  case 4:
+    McHorVer20WidthEq4_AArch64_neon (pSrcNextRow, iSrcStride, pHorTmp, kiTmpStride, iHeight);
+    McHorVer02WidthEq4_AArch64_neon (pSrcNextCol, iSrcStride, pVerTmp, kiTmpStride, iHeight);
+    PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, kiTmpStride, pVerTmp, kiTmpStride, iHeight);
+    break;
+  default: // unsupported width
+    break;
+  }
+}
+
+// Luma motion-compensation entry point for the AArch64 NEON back end.
+// Only the two low bits of iMvX and iMvY are consulted; together they pick one of 16
+// interpolation routines. pSrc is used exactly as passed in: the integer part of the vector
+// (iMv >> 2) is not applied to it inside this function.
+void McLuma_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                          int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
+  // First index: x fraction, second index: y fraction.
+  static const PWelsMcWidthHeightFunc kpfLumaMc[4][4] = {
+    { McCopy_AArch64_neon,     McHorVer01_AArch64_neon, McHorVer02_AArch64_neon, McHorVer03_AArch64_neon },
+    { McHorVer10_AArch64_neon, McHorVer11_AArch64_neon, McHorVer12_AArch64_neon, McHorVer13_AArch64_neon },
+    { McHorVer20_AArch64_neon, McHorVer21_AArch64_neon, McHorVer22_AArch64_neon, McHorVer23_AArch64_neon },
+    { McHorVer30_AArch64_neon, McHorVer31_AArch64_neon, McHorVer32_AArch64_neon, McHorVer33_AArch64_neon },
+  };
+  const int32_t kiFracX = iMvX & 0x03;
+  const int32_t kiFracY = iMvY & 0x03;
+  kpfLumaMc[kiFracX][kiFracY] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+}
+// Chroma motion-compensation entry point for the AArch64 NEON back end.
+// A zero vector is served by a plain block copy. Otherwise the three low bits of each vector
+// component select a weight set from g_kuiABCD[y][x] for the 8- and 4-wide NEON routines.
+// Any width other than 8 or 4 (2 is the expected case) goes through the C implementations.
+void McChroma_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                            int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
+  if (0 == iMvX && 0 == iMvY) {
+    switch (iWidth) {
+    case 8: McCopyWidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+    case 4: McCopyWidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break;
+    default: McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight); break; // here iWidth == 2
+    }
+    return;
+  }
+
+  const int32_t kiD8x = iMvX & 0x07;
+  const int32_t kiD8y = iMvY & 0x07;
+  int32_t* pWeights = (int32_t*) (g_kuiABCD[kiD8y][kiD8x]);
+  switch (iWidth) {
+  case 8: McChromaWidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, pWeights, iHeight); break;
+  case 4: McChromaWidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, pWeights, iHeight); break;
+  default: // here iWidth == 2
+    McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
+    break;
+  }
+}
+// Sample-averaging entry point for the AArch64 NEON back end: forwards to the 8-wide or 16-wide
+// strided routine. The table index is iWidth >> 4, so 8 maps to slot 0 and 16 to slot 1.
+// There is no range check; a width whose shifted value is not 0 or 1 indexes outside the table.
+void PixelAvg_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
+                            const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) {
+  static const PWelsSampleWidthAveragingFunc kpfAvgByWidth[2] = {
+    PixStrideAvgWidthEq8_AArch64_neon,  // iWidth >> 4 == 0
+    PixStrideAvgWidthEq16_AArch64_neon  // iWidth >> 4 == 1
+  };
+  const PWelsSampleWidthAveragingFunc pfAverage = kpfAvgByWidth[iWidth >> 4];
+  pfAverage (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight);
+}
+#endif
+
+// Populates *pMcFuncs with the motion-compensation routines to use.
+// The plain C versions are installed first. Each SIMD back end that was compiled in then
+// overrides them when its bit is set in uiCpuFlag; a later section overrides an earlier one.
+void InitMcFunc (SMcFunc* pMcFuncs, uint32_t uiCpuFlag) {
+  // Baseline: C implementations.
+  pMcFuncs->pMcLumaFunc       = McLuma_c;
+  pMcFuncs->pMcChromaFunc     = McChroma_c;
+  pMcFuncs->pfSampleAveraging = PixelAvg_c;
+  pMcFuncs->pfLumaHalfpelHor  = McHorVer20_c;
+  pMcFuncs->pfLumaHalfpelVer  = McHorVer02_c;
+  pMcFuncs->pfLumaHalfpelCen  = McHorVer22_c;
+
+#if defined (X86_ASM)
+  if ((uiCpuFlag & WELS_CPU_SSE2) != 0) {
+    pMcFuncs->pMcLumaFunc       = McLuma_sse2;
+    pMcFuncs->pMcChromaFunc     = McChroma_sse2;
+    pMcFuncs->pfSampleAveraging = PixelAvg_sse2;
+    pMcFuncs->pfLumaHalfpelHor  = McHorVer20Width9Or17_sse2;
+    pMcFuncs->pfLumaHalfpelVer  = McHorVer02Height9Or17_sse2;
+    pMcFuncs->pfLumaHalfpelCen  = McHorVer22Width9Or17Height9Or17_sse2;
+  }
+
+  // SSSE3 replaces only the chroma routine.
+  if ((uiCpuFlag & WELS_CPU_SSSE3) != 0) {
+    pMcFuncs->pMcChromaFunc = McChroma_ssse3;
+  }
+#endif //(X86_ASM)
+
+#if defined(HAVE_NEON)
+  if ((uiCpuFlag & WELS_CPU_NEON) != 0) {
+    pMcFuncs->pMcLumaFunc       = McLuma_neon;
+    pMcFuncs->pMcChromaFunc     = McChroma_neon;
+    pMcFuncs->pfSampleAveraging = PixelAvg_neon;
+    pMcFuncs->pfLumaHalfpelHor  = McHorVer20Width9Or17_neon;            // width + 1: 9 or 17
+    pMcFuncs->pfLumaHalfpelVer  = McHorVer02Height9Or17_neon;           // height + 1: 9 or 17
+    pMcFuncs->pfLumaHalfpelCen  = McHorVer22Width9Or17Height9Or17_neon; // width + 1 and height + 1
+  }
+#endif
+#if defined(HAVE_NEON_AARCH64)
+  if ((uiCpuFlag & WELS_CPU_NEON) != 0) {
+    pMcFuncs->pMcLumaFunc       = McLuma_AArch64_neon;
+    pMcFuncs->pMcChromaFunc     = McChroma_AArch64_neon;
+    pMcFuncs->pfSampleAveraging = PixelAvg_AArch64_neon;
+    pMcFuncs->pfLumaHalfpelHor  = McHorVer20Width9Or17_AArch64_neon;            // width + 1: 9 or 17
+    pMcFuncs->pfLumaHalfpelVer  = McHorVer02Height9Or17_AArch64_neon;           // height + 1: 9 or 17
+    pMcFuncs->pfLumaHalfpelCen  = McHorVer22Width9Or17Height9Or17_AArch64_neon; // width + 1 and height + 1
+  }
+#endif
+}
+} // namespace WelsCommon
--- a/codec/common/targets.mk
+++ b/codec/common/targets.mk
@@ -7,6 +7,7 @@
 	$(COMMON_SRCDIR)/src/deblocking_common.cpp\
 	$(COMMON_SRCDIR)/src/expand_pic.cpp\
 	$(COMMON_SRCDIR)/src/intra_pred_common.cpp\
+	$(COMMON_SRCDIR)/src/mc.cpp\
 	$(COMMON_SRCDIR)/src/sad_common.cpp\
 	$(COMMON_SRCDIR)/src/utils.cpp\
 	$(COMMON_SRCDIR)/src/welsCodecTrace.cpp\
--- a/codec/decoder/core/inc/decoder_context.h
+++ b/codec/decoder/core/inc/decoder_context.h
@@ -55,6 +55,7 @@
 #include "crt_util_safe_x.h"
 #include "mb_cache.h"
 #include "expand_pic.h"
+#include "mc.h"
 
 namespace WelsDec {
 #define MAX_PRED_MODE_ID_I16x16  3
@@ -141,13 +142,6 @@
 uint8_t				uiLongRefCount[LIST_A];	// dependend on ref pic module
 int32_t				iMaxLongTermFrameIdx;
 } SRefPic, *PRefPic;
-
-typedef void (*PWelsMcFunc) (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                             int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight);
-typedef struct TagMcFunc {
-PWelsMcFunc pMcLumaFunc;
-PWelsMcFunc pMcChromaFunc;
-} SMcFunc;
 
 typedef void (*PCopyFunc) (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);
 typedef struct TagCopyFunc {
--- a/codec/decoder/core/inc/mc.h
+++ /dev/null
@@ -1,50 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#ifndef WELS_MC_H__
-#define WELS_MC_H__
-
-#include "wels_const.h"
-#include "macros.h"
-#include "decoder_context.h"
-#include "mc_common.h"
-
-namespace WelsDec {
-
-typedef void (*PMcChromaWidthExtFunc) (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                       const uint8_t* kpABCD, int32_t iHeight);
-
-void InitMcFunc (SMcFunc* pMcFunc, int32_t iCpu);
-
-} // namespace WelsDec
-
-#endif//WELS_MC_H__
--- a/codec/decoder/core/src/mc.cpp
+++ /dev/null
@@ -1,1205 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file	mc.c
- *
- * \brief	Interfaces implementation for motion compensation
- *
- * \date	03/17/2009 Created
- *
- *************************************************************************************
- */
-
-#include "mc.h"
-
-#include "cpu_core.h"
-
-namespace WelsDec {
-
-/*------------------weight for chroma fraction pixel interpolation------------------*/
-//iA = (8 - dx) * (8 - dy);
-//iB = dx * (8 - dy);
-//iC = (8 - dx) * dy;
-//iD = dx * dy
-static const uint8_t g_kuiABCD[8][8][4] = {	//g_kA[dy][dx], g_kB[dy][dx], g_kC[dy][dx], g_kD[dy][dx]
-  {
-    {64, 0, 0, 0}, {56, 8, 0, 0}, {48, 16, 0, 0}, {40, 24, 0, 0},
-    {32, 32, 0, 0}, {24, 40, 0, 0}, {16, 48, 0, 0}, {8, 56, 0, 0}
-  },
-  {
-    {56, 0, 8, 0}, {49, 7, 7, 1}, {42, 14, 6, 2}, {35, 21, 5, 3},
-    {28, 28, 4, 4}, {21, 35, 3, 5}, {14, 42, 2, 6}, {7, 49, 1, 7}
-  },
-  {
-    {48, 0, 16, 0}, {42, 6, 14, 2}, {36, 12, 12, 4}, {30, 18, 10, 6},
-    {24, 24, 8, 8}, {18, 30, 6, 10}, {12, 36, 4, 12}, {6, 42, 2, 14}
-  },
-  {
-    {40, 0, 24, 0}, {35, 5, 21, 3}, {30, 10, 18, 6}, {25, 15, 15, 9},
-    {20, 20, 12, 12}, {15, 25, 9, 15}, {10, 30, 6, 18}, {5, 35, 3, 21}
-  },
-  {
-    {32, 0, 32, 0}, {28, 4, 28, 4}, {24, 8, 24, 8}, {20, 12, 20, 12},
-    {16, 16, 16, 16}, {12, 20, 12, 20}, {8, 24, 8, 24}, {4, 28, 4, 28}
-  },
-  {
-    {24, 0, 40, 0}, {21, 3, 35, 5}, {18, 6, 30, 10}, {15, 9, 25, 15},
-    {12, 12, 20, 20}, {9, 15, 15, 25}, {6, 18, 10, 30}, {3, 21, 5, 35}
-  },
-  {
-    {16, 0, 48, 0}, {14, 2, 42, 6}, {12, 4, 36, 12}, {10, 6, 30, 18},
-    {8, 8, 24, 24}, {6, 10, 18, 30}, {4, 12, 12, 36}, {2, 14, 6, 42}
-  },
-  {
-    {8, 0, 56, 0}, {7, 1, 49, 7}, {6, 2, 42, 14}, {5, 3, 35, 21},
-    {4, 4, 28, 28}, {3, 5, 21, 35}, {2, 6, 14, 42}, {1, 7, 7, 49}
-  }
-};
-
-typedef void (*PWelsMcWidthHeightFunc) (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                        int32_t iWidth, int32_t iHeight);
-
-//***************************************************************************//
-//                          C code implementation                            //
-//***************************************************************************//
-static inline void McCopyWidthEq2_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                     int32_t iHeight) {
-  int32_t i;
-  for (i = 0; i < iHeight; i++) { // iWidth == 2 only for chroma
-    ST16A2 (pDst, LD16 (pSrc));
-    pDst += iDstStride;
-    pSrc += iSrcStride;
-  }
-}
-
-static inline void McCopyWidthEq4_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                     int32_t iHeight) {
-  int32_t i;
-  for (i = 0; i < iHeight; i++) {
-    ST32A4 (pDst, LD32 (pSrc));
-    pDst += iDstStride;
-    pSrc += iSrcStride;
-  }
-}
-
-static inline void McCopyWidthEq8_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                     int32_t iHeight) {
-  int32_t i;
-  for (i = 0; i < iHeight; i++) {
-    ST64A8 (pDst, LD64 (pSrc));
-    pDst += iDstStride;
-    pSrc += iSrcStride;
-  }
-}
-
-static inline void McCopyWidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iHeight) {
-  int32_t i;
-  for (i = 0; i < iHeight; i++) {
-    ST64A8 (pDst  , LD64 (pSrc));
-    ST64A8 (pDst + 8, LD64 (pSrc + 8));
-    pDst += iDstStride;
-    pSrc += iSrcStride;
-  }
-}
-
-//--------------------Luma sample MC------------------//
-
-static inline int32_t HorFilterInput16bit_c (const int16_t* pSrc) {
-  int32_t iPix05 = pSrc[0] + pSrc[5];
-  int32_t iPix14 = pSrc[1] + pSrc[4];
-  int32_t iPix23 = pSrc[2] + pSrc[3];
-
-  return (iPix05 - (iPix14 * 5) + (iPix23 * 20));
-}
-// h: iOffset=1 / v: iOffset=iSrcStride
-static inline int32_t FilterInput8bitWithStride_c (const uint8_t* pSrc, const int32_t kiOffset) {
-  const int32_t kiOffset1 = kiOffset;
-  const int32_t kiOffset2 = (kiOffset << 1);
-  const int32_t kiOffset3 = kiOffset + kiOffset2;
-  const uint32_t kuiPix05   = * (pSrc - kiOffset2) + * (pSrc + kiOffset3);
-  const uint32_t kuiPix14   = * (pSrc - kiOffset1) + * (pSrc + kiOffset2);
-  const uint32_t kuiPix23   = * (pSrc) + * (pSrc + kiOffset1);
-
-  return (kuiPix05 - ((kuiPix14 << 2) + kuiPix14) + (kuiPix23 << 4) + (kuiPix23 << 2));
-}
-
-static inline void PixelAvg_c (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
-                               const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) {
-  int32_t i, j;
-  for (i = 0; i < iHeight; i++) {
-    for (j = 0; j < iWidth; j++) {
-      pDst[j] = (pSrcA[j] + pSrcB[j] + 1) >> 1;
-    }
-    pDst  += iDstStride;
-    pSrcA += iSrcAStride;
-    pSrcB += iSrcBStride;
-  }
-}
-static inline void McCopy_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                             int32_t iHeight) {
-  if (iWidth == 16)
-    McCopyWidthEq16_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 8)
-    McCopyWidthEq8_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 4)
-    McCopyWidthEq4_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else //here iWidth == 2
-    McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-
-static inline void McHorVer20_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                 int32_t iWidth,
-                                 int32_t iHeight) {
-  int32_t i, j;
-  for (i = 0; i < iHeight; i++) {
-    for (j = 0; j < iWidth; j++) {
-      pDst[j] = WelsClip1 ((FilterInput8bitWithStride_c (pSrc + j, 1) + 16) >> 5);
-    }
-    pDst += iDstStride;
-    pSrc += iSrcStride;
-  }
-}
-
-static inline void McHorVer02_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                 int32_t iWidth,
-                                 int32_t iHeight) {
-  int32_t i, j;
-  for (i = 0; i < iHeight; i++) {
-    for (j = 0; j < iWidth; j++) {
-      pDst[j] = WelsClip1 ((FilterInput8bitWithStride_c (pSrc + j, iSrcStride) + 16) >> 5);
-    }
-    pDst += iDstStride;
-    pSrc += iSrcStride;
-  }
-}
-
-static inline void McHorVer22_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                 int32_t iWidth,
-                                 int32_t iHeight) {
-  int16_t iTmp[16 + 5]; //16
-  int32_t i, j, k;
-
-  for (i = 0; i < iHeight; i++) {
-    for (j = 0; j < iWidth + 5; j++) {
-      iTmp[j] = FilterInput8bitWithStride_c (pSrc - 2 + j, iSrcStride);
-    }
-    for (k = 0; k < iWidth; k++) {
-      pDst[k] = WelsClip1 ((HorFilterInput16bit_c (&iTmp[k]) + 512) >> 10);
-    }
-    pSrc += iSrcStride;
-    pDst += iDstStride;
-  }
-}
-
-/////////////////////luma MC//////////////////////////
-static inline void McHorVer01_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                 int32_t iWidth,
-                                 int32_t iHeight) {
-  uint8_t uiTmp[256];
-  McHorVer02_c (pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);
-  PixelAvg_c (pDst, iDstStride, pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);
-}
-static inline void McHorVer03_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                 int32_t iWidth,
-                                 int32_t iHeight) {
-  uint8_t uiTmp[256];
-  McHorVer02_c (pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);
-  PixelAvg_c (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, uiTmp, 16, iWidth, iHeight);
-}
-static inline void McHorVer10_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                 int32_t iWidth,
-                                 int32_t iHeight) {
-  uint8_t uiTmp[256];
-  McHorVer20_c (pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);
-  PixelAvg_c (pDst, iDstStride, pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight);
-}
-static inline void McHorVer11_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                 int32_t iWidth,
-                                 int32_t iHeight) {
-  uint8_t uiHorTmp[256];
-  uint8_t uiVerTmp[256];
-  McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
-  McHorVer02_c (pSrc, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
-  PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight);
-}
-static inline void McHorVer12_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                 int32_t iWidth,
-                                 int32_t iHeight) {
-  uint8_t uiVerTmp[256];
-  uint8_t uiCtrTmp[256];
-  McHorVer02_c (pSrc, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
-  McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight);
-  PixelAvg_c (pDst, iDstStride, uiVerTmp, 16, uiCtrTmp, 16, iWidth, iHeight);
-}
-static inline void McHorVer13_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                 int32_t iWidth,
-                                 int32_t iHeight) {
-  uint8_t uiHorTmp[256];
-  uint8_t uiVerTmp[256];
-  McHorVer20_c (pSrc + iSrcStride, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
-  McHorVer02_c (pSrc, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
-  PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight);
-}
-static inline void McHorVer21_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                 int32_t iWidth,
-                                 int32_t iHeight) {
-  uint8_t uiHorTmp[256];
-  uint8_t uiCtrTmp[256];
-  McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
-  McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight);
-  PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiCtrTmp, 16, iWidth, iHeight);
-}
-static inline void McHorVer23_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                 int32_t iWidth,
-                                 int32_t iHeight) {
-  uint8_t uiHorTmp[256];
-  uint8_t uiCtrTmp[256];
-  McHorVer20_c (pSrc + iSrcStride, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
-  McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight);
-  PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiCtrTmp, 16, iWidth, iHeight);
-}
-static inline void McHorVer30_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                 int32_t iWidth,
-                                 int32_t iHeight) {
-  uint8_t uiHorTmp[256];
-  McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
-  PixelAvg_c (pDst, iDstStride, pSrc + 1, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
-}
-static inline void McHorVer31_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                 int32_t iWidth,
-                                 int32_t iHeight) {
-  uint8_t uiHorTmp[256];
-  uint8_t uiVerTmp[256];
-  McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
-  McHorVer02_c (pSrc + 1, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
-  PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight);
-}
-static inline void McHorVer32_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                 int32_t iWidth,
-                                 int32_t iHeight) {
-  uint8_t uiVerTmp[256];
-  uint8_t uiCtrTmp[256];
-  McHorVer02_c (pSrc + 1, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
-  McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight);
-  PixelAvg_c (pDst, iDstStride, uiVerTmp, 16, uiCtrTmp, 16, iWidth, iHeight);
-}
-static inline void McHorVer33_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                 int32_t iWidth,
-                                 int32_t iHeight) {
-  uint8_t uiHorTmp[256];
-  uint8_t uiVerTmp[256];
-  McHorVer20_c (pSrc + iSrcStride, iSrcStride, uiHorTmp, 16, iWidth, iHeight);
-  McHorVer02_c (pSrc + 1, iSrcStride, uiVerTmp, 16, iWidth, iHeight);
-  PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight);
-}
-
-void McLuma_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-               int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
-//pSrc has been added the offset of mv
-{
-  static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { //[x][y]
-    {McCopy_c,      McHorVer01_c, McHorVer02_c, McHorVer03_c},
-    {McHorVer10_c,  McHorVer11_c, McHorVer12_c, McHorVer13_c},
-    {McHorVer20_c,  McHorVer21_c, McHorVer22_c, McHorVer23_c},
-    {McHorVer30_c,  McHorVer31_c, McHorVer32_c, McHorVer33_c},
-  };
-
-  pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
-}
-
-static inline void McChromaWithFragMv_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
-  int32_t i, j;
-  int32_t iA, iB, iC, iD;
-  const uint8_t* pSrcNext = pSrc + iSrcStride;
-  const uint8_t* pABCD = g_kuiABCD[iMvY & 0x07][iMvX & 0x07];
-  iA = pABCD[0];
-  iB = pABCD[1];
-  iC = pABCD[2];
-  iD = pABCD[3];
-  for (i = 0; i < iHeight; i++) {
-    for (j = 0; j < iWidth; j++) {
-      pDst[j] = (iA * pSrc[j] + iB * pSrc[j + 1] + iC * pSrcNext[j] + iD * pSrcNext[j + 1] + 32) >> 6;
-    }
-    pDst     += iDstStride;
-    pSrc      = pSrcNext;
-    pSrcNext += iSrcStride;
-  }
-}
-
-void McChroma_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                 int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
-//pSrc has been added the offset of mv
-{
-  const int32_t kiD8x = iMvX & 0x07;
-  const int32_t kiD8y = iMvY & 0x07;
-  if (0 == kiD8x && 0 == kiD8y)
-    McCopy_c (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
-  else
-    McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
-}
-
-#if defined(X86_ASM)
-//***************************************************************************//
-//                       SSE2 implement                          //
-//***************************************************************************//
-static inline void McHorVer22WidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_2D (int16_t, iTap, 21, 8, 16)
-  McHorVer22Width8HorFirst_sse2 (pSrc - 2, iSrcStride, (uint8_t*)iTap, 16, iHeight + 5);
-  McHorVer22Width8VerLastAlign_sse2 ((uint8_t*)iTap, 16, pDst, iDstStride, 8, iHeight);
-}
-
-static inline void McHorVer02WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iHeight) {
-  McHorVer02WidthEq8_sse2 (pSrc,     iSrcStride, pDst,     iDstStride, iHeight);
-  McHorVer02WidthEq8_sse2 (&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight);
-}
-
-static inline void McHorVer22WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iHeight) {
-  McHorVer22WidthEq8_sse2 (pSrc,     iSrcStride, pDst,     iDstStride, iHeight);
-  McHorVer22WidthEq8_sse2 (&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight);
-}
-
-static inline void McCopy_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                int32_t iWidth,
-                                int32_t iHeight) {
-  if (iWidth == 16)
-    McCopyWidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 8)
-    McCopyWidthEq8_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 4)
-    McCopyWidthEq4_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else
-    McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-
-static inline void McHorVer20_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                    int32_t iWidth, int32_t iHeight) {
-  if (iWidth == 16)
-    McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 8)
-    McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else
-    McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-
-static inline void McHorVer02_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                    int32_t iWidth, int32_t iHeight) {
-  if (iWidth == 16)
-    McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 8)
-    McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else
-    McHorVer02_c (pSrc, iSrcStride, pDst, iDstStride, 4, iHeight);
-}
-
-static inline void McHorVer22_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                    int32_t iWidth, int32_t iHeight) {
-  if (iWidth == 16)
-    McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 8)
-    McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else
-    McHorVer22_c (pSrc, iSrcStride, pDst, iDstStride, 4, iHeight);
-}
-
-static inline void McHorVer01_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                    int32_t iWidth, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16);
-  if (iWidth == 16) {
-    McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
-    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
-  } else if (iWidth == 8) {
-    McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
-    PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
-  } else {
-    McHorVer02_c (pSrc, iSrcStride, pTmp, 16, 4, iHeight);
-    PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
-  }
-}
-static inline void McHorVer03_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                    int32_t iWidth, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16);
-  if (iWidth == 16) {
-    McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
-    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
-  } else if (iWidth == 8) {
-    McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
-    PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
-  } else {
-    McHorVer02_c (pSrc, iSrcStride, pTmp, 16, 4, iHeight);
-    PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
-  }
-}
-static inline void McHorVer10_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                    int32_t iWidth, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16);
-  if (iWidth == 16) {
-    McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
-    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
-  } else if (iWidth == 8) {
-    McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
-    PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
-  } else {
-    McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pTmp, 16, iHeight);
-    PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
-  }
-}
-static inline void McHorVer11_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                    int32_t iWidth, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
-  if (iWidth == 16) {
-    McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pVerTmp, 16, iHeight);
-    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-  } else if (iWidth == 8) {
-    McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pVerTmp, 16, iHeight);
-    PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-  } else {
-    McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02_c (pSrc, iSrcStride, pVerTmp, 16, 4, iHeight);
-    PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-  }
-}
-static inline void McHorVer12_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                    int32_t iWidth, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
-  if (iWidth == 16) {
-    McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pVerTmp, 16, iHeight);
-    McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
-  } else if (iWidth == 8) {
-    McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pVerTmp, 16, iHeight);
-    McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-    PixelAvgWidthEq8_mmx (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
-  } else {
-    McHorVer02_c (pSrc, iSrcStride, pVerTmp, 16, 4, iHeight);
-    McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight);
-    PixelAvgWidthEq4_mmx (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
-  }
-}
-static inline void McHorVer13_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                    int32_t iWidth, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
-  if (iWidth == 16) {
-    McHorVer20WidthEq16_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02WidthEq16_sse2 (pSrc,            iSrcStride, pVerTmp, 16, iHeight);
-    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-  } else if (iWidth == 8) {
-    McHorVer20WidthEq8_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02WidthEq8_sse2 (pSrc,            iSrcStride, pVerTmp, 16, iHeight);
-    PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-  } else {
-    McHorVer20WidthEq4_mmx (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02_c (pSrc,            iSrcStride, pVerTmp, 16, 4 , iHeight);
-    PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-  }
-}
-static inline void McHorVer21_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                    int32_t iWidth, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
-  if (iWidth == 16) {
-    McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
-  } else if (iWidth == 8) {
-    McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-    PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
-  } else {
-    McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight);
-    PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
-  }
-}
-static inline void McHorVer23_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                    int32_t iWidth, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
-  if (iWidth == 16) {
-    McHorVer20WidthEq16_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer22WidthEq16_sse2 (pSrc,            iSrcStride, pCtrTmp, 16, iHeight);
-    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
-  } else if (iWidth == 8) {
-    McHorVer20WidthEq8_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer22WidthEq8_sse2 (pSrc,            iSrcStride, pCtrTmp, 16, iHeight);
-    PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
-  } else {
-    McHorVer20WidthEq4_mmx (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer22_c (pSrc,            iSrcStride, pCtrTmp, 16, 4, iHeight);
-    PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
-  }
-}
-static inline void McHorVer30_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                    int32_t iWidth, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
-  if (iWidth == 16) {
-    McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
-    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight);
-  } else if (iWidth == 8) {
-    McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
-    PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight);
-  } else {
-    McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pHorTmp, 16, iHeight);
-    PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight);
-  }
-}
-static inline void McHorVer31_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                    int32_t iWidth, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
-  if (iWidth == 16) {
-    McHorVer20WidthEq16_sse2 (pSrc,   iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02WidthEq16_sse2 (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
-    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-  } else if (iWidth == 8) {
-    McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02WidthEq8_sse2 (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
-    PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-  } else {
-    McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02_c (pSrc + 1, iSrcStride, pVerTmp, 16, 4, iHeight);
-    PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-  }
-}
-static inline void McHorVer32_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                    int32_t iWidth, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
-  if (iWidth == 16) {
-    McHorVer02WidthEq16_sse2 (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
-    McHorVer22WidthEq16_sse2 (pSrc,   iSrcStride, pCtrTmp, 16, iHeight);
-    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
-  } else if (iWidth == 8) {
-    McHorVer02WidthEq8_sse2 (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
-    McHorVer22WidthEq8_sse2 (pSrc,   iSrcStride, pCtrTmp, 16, iHeight);
-    PixelAvgWidthEq8_mmx (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
-  } else {
-    McHorVer02_c (pSrc + 1, iSrcStride, pVerTmp, 16, 4, iHeight);
-    McHorVer22_c (pSrc,   iSrcStride, pCtrTmp, 16, 4, iHeight);
-    PixelAvgWidthEq4_mmx (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
-  }
-}
-static inline void McHorVer33_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                    int32_t iWidth, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
-  if (iWidth == 16) {
-    McHorVer20WidthEq16_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02WidthEq16_sse2 (pSrc + 1,          iSrcStride, pVerTmp, 16, iHeight);
-    PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-  } else if (iWidth == 8) {
-    McHorVer20WidthEq8_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02WidthEq8_sse2 (pSrc + 1,          iSrcStride, pVerTmp, 16, iHeight);
-    PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-  } else {
-    McHorVer20WidthEq4_mmx (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02_c (pSrc + 1,          iSrcStride, pVerTmp, 16, 4, iHeight);
-    PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-  }
-}
-
-void McLuma_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                  int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
-//pSrc has been added the offset of mv
-{
-  static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { //[x][y]
-    {McCopy_sse2,     McHorVer01_sse2, McHorVer02_sse2, McHorVer03_sse2},
-    {McHorVer10_sse2, McHorVer11_sse2, McHorVer12_sse2, McHorVer13_sse2},
-    {McHorVer20_sse2, McHorVer21_sse2, McHorVer22_sse2, McHorVer23_sse2},
-    {McHorVer30_sse2, McHorVer31_sse2, McHorVer32_sse2, McHorVer33_sse2},
-  };
-
-  pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
-}
-
-void McChroma_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                    int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
-  static const PMcChromaWidthExtFunc kpMcChromaWidthFuncs[2] = {
-    McChromaWidthEq4_mmx,
-    McChromaWidthEq8_sse2
-  };
-  const int32_t kiD8x = iMvX & 0x07;
-  const int32_t kiD8y = iMvY & 0x07;
-  if (kiD8x == 0 && kiD8y == 0) {
-    McCopy_sse2 (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
-    return;
-  }
-  if (iWidth != 2) {
-    kpMcChromaWidthFuncs[iWidth >> 3] (pSrc, iSrcStride, pDst, iDstStride, g_kuiABCD[kiD8y][kiD8x], iHeight);
-  } else
-    McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
-}
-
-#endif //X86_ASM
-//***************************************************************************//
-//                       NEON implementation                      //
-//***************************************************************************//
-#if defined(HAVE_NEON)
-void McCopy_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                  int32_t iWidth, int32_t iHeight) {
-  if (16 == iWidth)
-    McCopyWidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (8 == iWidth)
-    McCopyWidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (4 == iWidth)
-    McCopyWidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else
-    McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-void McHorVer20_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                      int32_t iWidth, int32_t iHeight) {
-  if (iWidth == 16)
-    McHorVer20WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 8)
-    McHorVer20WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 4)
-    McHorVer20WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-void McHorVer02_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                      int32_t iWidth, int32_t iHeight) {
-  if (iWidth == 16)
-    McHorVer02WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 8)
-    McHorVer02WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 4)
-    McHorVer02WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-void McHorVer22_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                      int32_t iWidth, int32_t iHeight) {
-  if (iWidth == 16)
-    McHorVer22WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 8)
-    McHorVer22WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 4)
-    McHorVer22WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-
-void McHorVer01_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                      int32_t iWidth, int32_t iHeight) {
-  if (iWidth == 16)
-    McHorVer01WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 8)
-    McHorVer01WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 4)
-    McHorVer01WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-void McHorVer03_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                      int32_t iWidth, int32_t iHeight) {
-  if (iWidth == 16)
-    McHorVer03WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 8)
-    McHorVer03WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 4)
-    McHorVer03WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-void McHorVer10_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                      int32_t iWidth, int32_t iHeight) {
-  if (iWidth == 16)
-    McHorVer10WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 8)
-    McHorVer10WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 4)
-    McHorVer10WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-void McHorVer11_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                      int32_t iWidth, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
-  if (iWidth == 16) {
-    McHorVer20WidthEq16_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02WidthEq16_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
-    PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
-  } else if (iWidth == 8) {
-    McHorVer20WidthEq8_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02WidthEq8_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
-    PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
-  } else if (iWidth == 4) {
-    McHorVer20WidthEq4_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02WidthEq4_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
-    PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
-  }
-}
-void McHorVer12_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                      int32_t iWidth, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
-  if (iWidth == 16) {
-    McHorVer02WidthEq16_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
-    McHorVer22WidthEq16_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-    PixelAvgWidthEq16_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
-  } else if (iWidth == 8) {
-    McHorVer02WidthEq8_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
-    McHorVer22WidthEq8_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-    PixelAvgWidthEq8_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
-  } else if (iWidth == 4) {
-    McHorVer02WidthEq4_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
-    McHorVer22WidthEq4_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-    PixelAvgWidthEq4_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
-  }
-}
-void McHorVer13_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                      int32_t iWidth, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
-  if (iWidth == 16) {
-    McHorVer20WidthEq16_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02WidthEq16_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
-    PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
-  } else if (iWidth == 8) {
-    McHorVer20WidthEq8_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02WidthEq8_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
-    PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
-  } else if (iWidth == 4) {
-    McHorVer20WidthEq4_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02WidthEq4_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
-    PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
-  }
-}
-void McHorVer21_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                      int32_t iWidth, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
-  if (iWidth == 16) {
-    McHorVer20WidthEq16_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer22WidthEq16_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-    PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
-  } else if (iWidth == 8) {
-    McHorVer20WidthEq8_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer22WidthEq8_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-    PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
-  } else if (iWidth == 4) {
-    McHorVer20WidthEq4_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer22WidthEq4_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-    PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
-  }
-}
-void McHorVer23_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                      int32_t iWidth, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
-  if (iWidth == 16) {
-    McHorVer20WidthEq16_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer22WidthEq16_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-    PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
-  } else if (iWidth == 8) {
-    McHorVer20WidthEq8_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer22WidthEq8_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-    PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
-  } else if (iWidth == 4) {
-    McHorVer20WidthEq4_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer22WidthEq4_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-    PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
-  }
-}
-void McHorVer30_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                      int32_t iWidth, int32_t iHeight) {
-  if (iWidth == 16)
-    McHorVer30WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 8)
-    McHorVer30WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 4)
-    McHorVer30WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-void McHorVer31_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                      int32_t iWidth, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
-  if (iWidth == 16) {
-    McHorVer20WidthEq16_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02WidthEq16_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
-    PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
-  } else if (iWidth == 8) {
-    McHorVer20WidthEq8_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02WidthEq8_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
-    PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
-  } else if (iWidth == 4) {
-    McHorVer20WidthEq4_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02WidthEq4_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
-    PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
-  }
-}
-void McHorVer32_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                      int32_t iWidth, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
-  if (iWidth == 16) {
-    McHorVer02WidthEq16_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
-    McHorVer22WidthEq16_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-    PixelAvgWidthEq16_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
-  } else if (iWidth == 8) {
-    McHorVer02WidthEq8_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
-    McHorVer22WidthEq8_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-    PixelAvgWidthEq8_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
-  } else if (iWidth == 4) {
-    McHorVer02WidthEq4_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
-    McHorVer22WidthEq4_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-    PixelAvgWidthEq4_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
-  }
-}
-void McHorVer33_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                      int32_t iWidth, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
-  if (iWidth == 16) {
-    McHorVer20WidthEq16_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02WidthEq16_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
-    PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
-  } else if (iWidth == 8) {
-    McHorVer20WidthEq8_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02WidthEq8_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
-    PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
-  } else if (iWidth == 4) {
-    McHorVer20WidthEq4_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02WidthEq4_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
-    PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
-  }
-}
-
-void McLuma_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                  int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
-  static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { //[x][y]
-    {McCopy_neon,  McHorVer01_neon, McHorVer02_neon,    McHorVer03_neon},
-    {McHorVer10_neon, McHorVer11_neon, McHorVer12_neon, McHorVer13_neon},
-    {McHorVer20_neon,    McHorVer21_neon, McHorVer22_neon,    McHorVer23_neon},
-    {McHorVer30_neon, McHorVer31_neon, McHorVer32_neon, McHorVer33_neon},
-  };
-  //	pSrc += (iMvY >> 2) * iSrcStride + (iMvX >> 2);
-  pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
-}
-void McChroma_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                    int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
-  if (0 == iMvX && 0 == iMvY) {
-    if (8 == iWidth)
-      McCopyWidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-    else if (iWidth == 4)
-      McCopyWidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-    else //here iWidth == 2
-      McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  } else {
-    const int32_t kiD8x = iMvX & 0x07;
-    const int32_t kiD8y = iMvY & 0x07;
-    if (8 == iWidth)
-      McChromaWidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight);
-    else if (4 == iWidth)
-      McChromaWidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight);
-    else //here iWidth == 2
-      McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
-  }
-}
-#endif
-#if defined(HAVE_NEON_AARCH64)
-void McCopy_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                          int32_t iWidth, int32_t iHeight) {
-  if (16 == iWidth)
-    McCopyWidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (8 == iWidth)
-    McCopyWidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (4 == iWidth)
-    McCopyWidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else
-    McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-void McHorVer20_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                              int32_t iWidth, int32_t iHeight) {
-  if (iWidth == 16)
-    McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 8)
-    McHorVer20WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 4)
-    McHorVer20WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-void McHorVer02_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                              int32_t iWidth, int32_t iHeight) {
-  if (iWidth == 16)
-    McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 8)
-    McHorVer02WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 4)
-    McHorVer02WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-void McHorVer22_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                              int32_t iWidth, int32_t iHeight) {
-  if (iWidth == 16)
-    McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 8)
-    McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 4)
-    McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-
-void McHorVer01_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                              int32_t iWidth, int32_t iHeight) {
-  if (iWidth == 16)
-    McHorVer01WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 8)
-    McHorVer01WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 4)
-    McHorVer01WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-void McHorVer03_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                              int32_t iWidth, int32_t iHeight) {
-  if (iWidth == 16)
-    McHorVer03WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 8)
-    McHorVer03WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 4)
-    McHorVer03WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-void McHorVer10_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                              int32_t iWidth, int32_t iHeight) {
-  if (iWidth == 16)
-    McHorVer10WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 8)
-    McHorVer10WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 4)
-    McHorVer10WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-void McHorVer11_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                              int32_t iWidth, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
-  if (iWidth == 16) {
-    McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
-    PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-  } else if (iWidth == 8) {
-    McHorVer20WidthEq8_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02WidthEq8_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
-    PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-  } else if (iWidth == 4) {
-    McHorVer20WidthEq4_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02WidthEq4_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
-    PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-  }
-}
-void McHorVer12_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                              int32_t iWidth, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
-  if (iWidth == 16) {
-    McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
-    McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-    PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
-  } else if (iWidth == 8) {
-    McHorVer02WidthEq8_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
-    McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-    PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
-  } else if (iWidth == 4) {
-    McHorVer02WidthEq4_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
-    McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-    PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
-  }
-}
-void McHorVer13_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                              int32_t iWidth, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
-  if (iWidth == 16) {
-    McHorVer20WidthEq16_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
-    PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-  } else if (iWidth == 8) {
-    McHorVer20WidthEq8_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02WidthEq8_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
-    PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-  } else if (iWidth == 4) {
-    McHorVer20WidthEq4_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02WidthEq4_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight);
-    PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-  }
-}
-void McHorVer21_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                              int32_t iWidth, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
-  if (iWidth == 16) {
-    McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-    PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
-  } else if (iWidth == 8) {
-    McHorVer20WidthEq8_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-    PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
-  } else if (iWidth == 4) {
-    McHorVer20WidthEq4_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-    PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
-  }
-}
-void McHorVer23_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                              int32_t iWidth, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
-  if (iWidth == 16) {
-    McHorVer20WidthEq16_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-    PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
-  } else if (iWidth == 8) {
-    McHorVer20WidthEq8_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-    PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
-  } else if (iWidth == 4) {
-    McHorVer20WidthEq4_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-    PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
-  }
-}
-void McHorVer30_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                              int32_t iWidth, int32_t iHeight) {
-  if (iWidth == 16)
-    McHorVer30WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 8)
-    McHorVer30WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 4)
-    McHorVer30WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-void McHorVer31_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                              int32_t iWidth, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
-  if (iWidth == 16) {
-    McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02WidthEq16_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
-    PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-  } else if (iWidth == 8) {
-    McHorVer20WidthEq8_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02WidthEq8_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
-    PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-  } else if (iWidth == 4) {
-    McHorVer20WidthEq4_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02WidthEq4_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
-    PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-  }
-}
-void McHorVer32_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                              int32_t iWidth, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
-  if (iWidth == 16) {
-    McHorVer02WidthEq16_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
-    McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-    PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
-  } else if (iWidth == 8) {
-    McHorVer02WidthEq8_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
-    McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-    PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
-  } else if (iWidth == 4) {
-    McHorVer02WidthEq4_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
-    McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
-    PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
-  }
-}
-void McHorVer33_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                              int32_t iWidth, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
-  if (iWidth == 16) {
-    McHorVer20WidthEq16_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02WidthEq16_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
-    PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-  } else if (iWidth == 8) {
-    McHorVer20WidthEq8_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02WidthEq8_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
-    PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-  } else if (iWidth == 4) {
-    McHorVer20WidthEq4_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
-    McHorVer02WidthEq4_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
-    PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
-  }
-}
-
-void McLuma_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                          int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
-  static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { //[x][y]
-    {McCopy_AArch64_neon,  McHorVer01_AArch64_neon, McHorVer02_AArch64_neon,    McHorVer03_AArch64_neon},
-    {McHorVer10_AArch64_neon, McHorVer11_AArch64_neon, McHorVer12_AArch64_neon, McHorVer13_AArch64_neon},
-    {McHorVer20_AArch64_neon,    McHorVer21_AArch64_neon, McHorVer22_AArch64_neon,    McHorVer23_AArch64_neon},
-    {McHorVer30_AArch64_neon, McHorVer31_AArch64_neon, McHorVer32_AArch64_neon, McHorVer33_AArch64_neon},
-  };
-  //	pSrc += (iMvY >> 2) * iSrcStride + (iMvX >> 2);
-  pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
-}
-void McChroma_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                            int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
-  if (0 == iMvX && 0 == iMvY) {
-    if (8 == iWidth)
-      McCopyWidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-    else if (iWidth == 4)
-      McCopyWidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-    else //here iWidth == 2
-      McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  } else {
-    const int32_t kiD8x = iMvX & 0x07;
-    const int32_t kiD8y = iMvY & 0x07;
-    if (8 == iWidth)
-      McChromaWidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight);
-    else if (4 == iWidth)
-      McChromaWidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight);
-    else //here iWidth == 2
-      McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
-  }
-}
-#endif
-
-void InitMcFunc (SMcFunc* pMcFunc, int32_t iCpu) {
-  pMcFunc->pMcLumaFunc   = McLuma_c;
-  pMcFunc->pMcChromaFunc = McChroma_c;
-
-#ifdef	HAVE_NEON
-  if (iCpu & WELS_CPU_NEON) {
-    pMcFunc->pMcLumaFunc	  = McLuma_neon;
-    pMcFunc->pMcChromaFunc  = McChroma_neon;
-  }
-#endif
-#ifdef	HAVE_NEON_AARCH64
-  if (iCpu & WELS_CPU_NEON) {
-    pMcFunc->pMcLumaFunc	  = McLuma_AArch64_neon;
-    pMcFunc->pMcChromaFunc  = McChroma_AArch64_neon;
-  }
-#endif
-#if defined (X86_ASM)
-  if (iCpu & WELS_CPU_SSE2) {
-    pMcFunc->pMcLumaFunc   = McLuma_sse2;
-    pMcFunc->pMcChromaFunc = McChroma_sse2;
-  }
-#endif //(X86_ASM)
-}
-
-} // namespace WelsDec
--- a/codec/decoder/targets.mk
+++ b/codec/decoder/targets.mk
@@ -13,7 +13,6 @@
 	$(DECODER_SRCDIR)/core/src/fmo.cpp\
 	$(DECODER_SRCDIR)/core/src/get_intra_predictor.cpp\
 	$(DECODER_SRCDIR)/core/src/manage_dec_ref.cpp\
-	$(DECODER_SRCDIR)/core/src/mc.cpp\
 	$(DECODER_SRCDIR)/core/src/mem_align.cpp\
 	$(DECODER_SRCDIR)/core/src/memmgr_nal_unit.cpp\
 	$(DECODER_SRCDIR)/core/src/mv_pred.cpp\
--- a/codec/encoder/core/inc/mc.h
+++ /dev/null
@@ -1,51 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-//macroblock.h
-#ifndef WELS_MC_H__
-#define WELS_MC_H__
-
-#include <string.h>
-#include "typedefs.h"
-#include "wels_const.h"
-#include "macros.h"
-#include "wels_func_ptr_def.h"
-#include "mc_common.h"
-
-/////////////////////luma MC//////////////////////////
-//x y means dx(mv[0] & 3) and dy(mv[1] & 3)
-
-namespace WelsEnc {
-void WelsInitMcFuncs (SMcFunc* pMcFuncs, uint32_t uiCpuFlag);
-
-}
-#endif//WELS_MC_H__
--- a/codec/encoder/core/inc/wels_func_ptr_def.h
+++ b/codec/encoder/core/inc/wels_func_ptr_def.h
@@ -44,6 +44,7 @@
 #include "expand_pic.h"
 #include "rc.h"
 #include "IWelsVP.h"
+#include "mc.h"
 
 namespace WelsEnc {
 
@@ -73,25 +74,6 @@
 typedef int32_t (*PQuantizationSkipFunc) (int16_t* pDct, int16_t iFF,  int16_t iMF);
 typedef int32_t (*PQuantizationHadamardFunc) (int16_t* pRes, const int16_t kiFF, int16_t iMF, int16_t* pDct,
     int16_t* pBlock);
-
-typedef void (*PWelsMcFunc) (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                             int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight);
-
-typedef void (*PWelsLumaHalfpelMcFunc) (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                        int32_t iWidth, int32_t iHeight);
-typedef void (*PWelsLumaQuarpelMcFunc) (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                        int32_t iHeight);
-typedef void (*PWelsSampleAveragingFunc) (uint8_t*, int32_t, const uint8_t*, int32_t, const uint8_t*, int32_t, int32_t);
-
-typedef struct TagMcFunc {
-  PWelsLumaHalfpelMcFunc      pfLumaHalfpelHor;
-  PWelsLumaHalfpelMcFunc      pfLumaHalfpelVer;
-  PWelsLumaHalfpelMcFunc      pfLumaHalfpelCen;
-  PWelsMcFunc                         pfChromaMc;
-
-  PWelsLumaQuarpelMcFunc      pfLumaQuarpelMc[16];
-  PWelsSampleAveragingFunc    pfSampleAveraging[2];
-} SMcFunc;
 
 typedef void (*PLumaDeblockingLT4Func) (uint8_t* iSampleY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* iTc);
 typedef void (*PLumaDeblockingEQ4Func) (uint8_t* iSampleY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
--- a/codec/encoder/core/src/encoder.cpp
+++ b/codec/encoder/core/src/encoder.cpp
@@ -209,7 +209,7 @@
   /* Motion compensation */
   /*init pixel average function*/
   /*get one column or row pixel when refinement*/
-  WelsInitMcFuncs (&pFuncList->sMcFuncs, uiCpuFlag);
+  InitMcFunc (&pFuncList->sMcFuncs, uiCpuFlag);
   InitCoeffFunc (pFuncList,uiCpuFlag,pParam->iEntropyCodingModeFlag);
 
   WelsInitEncodingFuncs (pFuncList, uiCpuFlag);
--- a/codec/encoder/core/src/mc.cpp
+++ /dev/null
@@ -1,848 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file	mc.c
- *
- * \brief	Interfaces implementation for motion compensation
- *
- * \date	03/17/2009 Created
- *
- *************************************************************************************
- */
-
-#include "mc.h"
-#include "cpu_core.h"
-
-namespace WelsEnc {
-/*------------------weight for chroma fraction pixel interpolation------------------*/
-//kuiA = (8 - dx) * (8 - dy);
-//kuiB = dx * (8 - dy);
-//kuiC = (8 - dx) * dy;
-//kuiD = dx * dy
-static const uint8_t g_kuiABCD[8][8][4] = { ////g_kuiA[dy][dx], g_kuiB[dy][dx], g_kuiC[dy][dx], g_kuiD[dy][dx]
-  {
-    {64, 0, 0, 0}, {56, 8, 0, 0}, {48, 16, 0, 0}, {40, 24, 0, 0},
-    {32, 32, 0, 0}, {24, 40, 0, 0}, {16, 48, 0, 0}, {8, 56, 0, 0}
-  },
-  {
-    {56, 0, 8, 0}, {49, 7, 7, 1}, {42, 14, 6, 2}, {35, 21, 5, 3},
-    {28, 28, 4, 4}, {21, 35, 3, 5}, {14, 42, 2, 6}, {7, 49, 1, 7}
-  },
-  {
-    {48, 0, 16, 0}, {42, 6, 14, 2}, {36, 12, 12, 4}, {30, 18, 10, 6},
-    {24, 24, 8, 8}, {18, 30, 6, 10}, {12, 36, 4, 12}, {6, 42, 2, 14}
-  },
-  {
-    {40, 0, 24, 0}, {35, 5, 21, 3}, {30, 10, 18, 6}, {25, 15, 15, 9},
-    {20, 20, 12, 12}, {15, 25, 9, 15}, {10, 30, 6, 18}, {5, 35, 3, 21}
-  },
-  {
-    {32, 0, 32, 0}, {28, 4, 28, 4}, {24, 8, 24, 8}, {20, 12, 20, 12},
-    {16, 16, 16, 16}, {12, 20, 12, 20}, {8, 24, 8, 24}, {4, 28, 4, 28}
-  },
-  {
-    {24, 0, 40, 0}, {21, 3, 35, 5}, {18, 6, 30, 10}, {15, 9, 25, 15},
-    {12, 12, 20, 20}, {9, 15, 15, 25}, {6, 18, 10, 30}, {3, 21, 5, 35}
-  },
-  {
-    {16, 0, 48, 0}, {14, 2, 42, 6}, {12, 4, 36, 12}, {10, 6, 30, 18},
-    {8, 8, 24, 24}, {6, 10, 18, 30}, {4, 12, 12, 36}, {2, 14, 6, 42}
-  },
-  {
-    {8, 0, 56, 0}, {7, 1, 49, 7}, {6, 2, 42, 14}, {5, 3, 35, 21},
-    {4, 4, 28, 28}, {3, 5, 21, 35}, {2, 6, 14, 42}, {1, 7, 7, 49}
-  }
-};
-
-//***************************************************************************//
-//                          C code implementation                            //
-//***************************************************************************//
-static inline void McCopyWidthEq4_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                     int32_t iHeight) {
-  int32_t i;
-  for (i = 0; i < iHeight; i++) {
-    memcpy (pDst, pSrc, 4);	// confirmed_safe_unsafe_usage
-    pDst += iDstStride;
-    pSrc += iSrcStride;
-  }
-}
-
-static inline void McCopyWidthEq8_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                     int32_t iHeight)
-
-{
-  int32_t i;
-  for (i = 0; i < iHeight; i++) {
-    memcpy (pDst, pSrc, 8);	// confirmed_safe_unsafe_usage
-    pDst += iDstStride;
-    pSrc += iSrcStride;
-  }
-}
-static inline void McCopyWidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                      int32_t iHeight) {
-  int32_t i;
-  for (i = 0; i < iHeight; i++) {
-    memcpy (pDst, pSrc, 16);	// confirmed_safe_unsafe_usage
-    pDst += iDstStride;
-    pSrc += iSrcStride;
-  }
-}
-
-//--------------------Luma sample MC------------------//
-static inline int32_t HorFilter_c (const uint8_t* pSrc) {
-  int32_t iPix05 = pSrc[-2] + pSrc[3];
-  int32_t iPix14 = pSrc[-1] + pSrc[2];
-  int32_t iPix23 = pSrc[ 0] + pSrc[1];
-
-  return (iPix05 - ((iPix14 << 2) + iPix14) + (iPix23 << 4) + (iPix23 << 2));
-}
-
-static inline int32_t HorFilterInput16bit1_c (const int16_t* pSrc) {
-  int32_t iPix05 = pSrc[0] + pSrc[5];
-  int32_t iPix14 = pSrc[1] + pSrc[4];
-  int32_t iPix23 = pSrc[2] + pSrc[3];
-
-  return (iPix05 - ((iPix14 << 2) + iPix14) + (iPix23 << 4) + (iPix23 << 2));
-}
-static inline int32_t VerFilter_c (const uint8_t* pSrc, const int32_t kiSrcStride) {
-  const int32_t kiLine1	= kiSrcStride;
-  const int32_t kiLine2	= (kiSrcStride << 1);
-  const int32_t kiLine3 = kiLine1 + kiLine2;
-  const uint32_t kuiPix05 = * (pSrc - kiLine2) + * (pSrc + kiLine3);
-  const uint32_t kuiPix14 = * (pSrc - kiLine1) + * (pSrc + kiLine2);
-  const uint32_t kuiPix23 = * (pSrc) + * (pSrc + kiLine1);
-
-  return (kuiPix05 - ((kuiPix14 << 2) + kuiPix14) + (kuiPix23 << 4) + (kuiPix23 << 2));
-}
-
-static inline void PixelAvgWidthEq8_c (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
-                                       const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight) {
-  int32_t i, j;
-  for (i = 0; i < iHeight; i++) {
-    for (j = 0; j < 8; j++) {
-      pDst[j] = (pSrcA[j] + pSrcB[j] + 1) >> 1;
-    }
-    pDst  += iDstStride;
-    pSrcA += iSrcAStride;
-    pSrcB += iSrcBStride;
-  }
-}
-static inline void PixelAvgWidthEq16_c (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
-                                        const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight) {
-  int32_t i, j;
-  for (i = 0; i < iHeight; i++) {
-    for (j = 0; j < 16; j++) {
-      pDst[j] = (pSrcA[j] + pSrcB[j] + 1) >> 1;
-    }
-    pDst  += iDstStride;
-    pSrcA += iSrcAStride;
-    pSrcB += iSrcBStride;
-  }
-}
-
-//horizontal filter to gain half sample, that is (2, 0) location in quarter sample
-static inline void McHorVer20WidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iHeight) {
-  int32_t i, j;
-  for (i = 0; i < iHeight; i++) {
-    for (j = 0; j < 16; j++) {
-      pDst[j] = WelsClip1 ((HorFilter_c (pSrc + j) + 16) >> 5);
-    }
-    pDst += iDstStride;
-    pSrc += iSrcStride;
-  }
-}
-//vertical filter to gain half sample, that is (0, 2) location in quarter sample
-static inline void McHorVer02WidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iHeight) {
-  int32_t i, j;
-  for (i = 0; i < iHeight; i++) {
-    for (j = 0; j < 16; j++) {
-      pDst[j] = WelsClip1 ((VerFilter_c (pSrc + j, iSrcStride) + 16) >> 5);
-    }
-    pDst += iDstStride;
-    pSrc += iSrcStride;
-  }
-}
-//horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample
-static inline void McHorVer22WidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iHeight) {
-  int16_t pTmp[16 + 5] = {0}; //16
-  int32_t i, j, k;
-
-  for (i = 0; i < iHeight; i++) {
-    for (j = 0; j < 16 + 5; j++) {
-      pTmp[j] = VerFilter_c (pSrc - 2 + j, iSrcStride);
-    }
-    for (k = 0; k < 16; k++) {
-      pDst[k] = WelsClip1 ((HorFilterInput16bit1_c (&pTmp[k]) + 512) >> 10);
-    }
-    pSrc += iSrcStride;
-    pDst += iDstStride;
-  }
-}
-
-/////////////////////luma MC//////////////////////////
-
-static inline void McHorVer01WidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16)
-
-  McHorVer02WidthEq16_c (pSrc, iSrcStride, pTmp, 16, iHeight);
-  PixelAvgWidthEq16_c (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
-}
-static inline void McHorVer03WidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16)
-
-  McHorVer02WidthEq16_c (pSrc, iSrcStride, pTmp, 16, iHeight);
-  PixelAvgWidthEq16_c (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
-}
-static inline void McHorVer10WidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16)
-
-  McHorVer20WidthEq16_c (pSrc, iSrcStride, pTmp, 16, iHeight);
-  PixelAvgWidthEq16_c (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
-}
-static inline void McHorVer11WidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
-
-  McHorVer20WidthEq16_c (pSrc, iSrcStride, pTmp, 16, iHeight);
-  McHorVer02WidthEq16_c (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_c (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
-}
-static inline void McHorVer12WidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
-
-  McHorVer02WidthEq16_c (pSrc, iSrcStride, pTmp, 16, iHeight);
-  McHorVer22WidthEq16_c (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_c (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
-}
-static inline void McHorVer13WidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
-
-  McHorVer20WidthEq16_c (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
-  McHorVer02WidthEq16_c (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_c (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
-}
-static inline void McHorVer21WidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
-
-  McHorVer20WidthEq16_c (pSrc, iSrcStride, pTmp, 16, iHeight);
-  McHorVer22WidthEq16_c (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_c (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
-}
-static inline void McHorVer23WidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
-
-  McHorVer20WidthEq16_c (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
-  McHorVer22WidthEq16_c (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_c (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
-}
-static inline void McHorVer30WidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16)
-
-  McHorVer20WidthEq16_c (pSrc, iSrcStride, pTmp, 16, iHeight);
-  PixelAvgWidthEq16_c (pDst, iDstStride, pSrc + 1, iSrcStride, pTmp, 16, iHeight);
-}
-static inline void McHorVer31WidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
-
-  McHorVer20WidthEq16_c (pSrc, iSrcStride, pTmp, 16, iHeight);
-  McHorVer02WidthEq16_c (pSrc + 1, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_c (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
-}
-static inline void McHorVer32WidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
-
-  McHorVer02WidthEq16_c (pSrc + 1, iSrcStride, pTmp, 16, iHeight);
-  McHorVer22WidthEq16_c (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_c (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
-}
-static inline void McHorVer33WidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
-
-  McHorVer20WidthEq16_c (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
-  McHorVer02WidthEq16_c (pSrc + 1, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_c (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
-}
-
-static inline void McHorVer20_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                 int32_t iWidth,
-                                 int32_t iHeight) {
-  int32_t i, j;
-  for (i = 0; i < iHeight; i++) {
-    for (j = 0; j < iWidth; j++) {
-      pDst[j] = WelsClip1 ((HorFilter_c (pSrc + j) + 16) >> 5);
-    }
-    pDst += iDstStride;
-    pSrc += iSrcStride;
-  }
-}
-//vertical filter to gain half sample, that is (0, 2) location in quarter sample
-static inline void McHorVer02_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                 int32_t iWidth,
-                                 int32_t iHeight) {
-  int32_t i, j;
-  for (i = 0; i < iHeight; i++) {
-    for (j = 0; j < iWidth; j++) {
-      pDst[j] = WelsClip1 ((VerFilter_c (pSrc + j, iSrcStride) + 16) >> 5);
-    }
-    pDst += iDstStride;
-    pSrc += iSrcStride;
-  }
-}
-//horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample
-static inline void McHorVer22_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                 int32_t iWidth,
-                                 int32_t iHeight) {
-  int16_t pTmp[17 + 5] = {0}; //w+1
-  int32_t i, j, k;
-
-  for (i = 0; i < iHeight; i++) {
-    for (j = 0; j < iWidth + 5; j++) {
-      pTmp[j] = VerFilter_c (pSrc - 2 + j, iSrcStride);
-    }
-    for (k = 0; k < iWidth; k++) {
-      pDst[k] = WelsClip1 ((HorFilterInput16bit1_c (&pTmp[k]) + 512) >> 10);
-    }
-    pSrc += iSrcStride;
-    pDst += iDstStride;
-  }
-}
-static inline void McCopy_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                             int32_t iHeight) {
-  int32_t i;
-  if (iWidth == 16)
-    McCopyWidthEq16_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 8)
-    McCopyWidthEq8_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 4)
-    McCopyWidthEq4_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else {
-    for (i = 0; i < iHeight; i++) {
-      memcpy (pDst, pSrc, iWidth);	// confirmed_safe_unsafe_usage
-      pDst += iDstStride;
-      pSrc += iSrcStride;
-    }
-  }
-}
-
-void McChroma_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                 int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
-//pSrc has been added the offset of mv
-{
-  const int32_t kiDx = iMvX & 0x07;
-  const int32_t kiDy = iMvY & 0x07;
-
-  if (0 == kiDx && 0 == kiDy) {
-    McCopy_c (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
-  } else {
-    const int32_t kiDA = g_kuiABCD[kiDy][kiDx][0];
-    const int32_t kiDB = g_kuiABCD[kiDy][kiDx][1];
-    const int32_t kiDC = g_kuiABCD[kiDy][kiDx][2];
-    const int32_t kiDD = g_kuiABCD[kiDy][kiDx][3];
-
-    int32_t i, j;
-
-    const uint8_t* pSrcNext = pSrc + iSrcStride;
-
-    for (i = 0; i < iHeight; i++) {
-      for (j = 0; j < iWidth; j++) {
-        pDst[j] = (kiDA * pSrc[j] + kiDB * pSrc[j + 1] + kiDC * pSrcNext[j] + kiDD * pSrcNext[j + 1] + 32) >> 6;
-      }
-      pDst += iDstStride;
-      pSrc = pSrcNext;
-      pSrcNext += iSrcStride;
-    }
-  }
-}
-//***************************************************************************//
-//                       MMXEXT and SSE2 implementation                      //
-//***************************************************************************//
-#if defined(X86_ASM)
-
-static inline void McHorVer22WidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_2D (int16_t, pTap, 21, 8, 16)
-  McHorVer22Width8HorFirst_sse2 (pSrc - 2, iSrcStride, (uint8_t*)pTap, 16, iHeight + 5);
-  McHorVer22Width8VerLastAlign_sse2 ((uint8_t*)pTap, 16, pDst, iDstStride, 8, iHeight);
-}
-
-//2010.2.5
-
-static inline void McHorVer02WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* PDst, int32_t iDstStride,
-    int32_t iHeight) {
-  McHorVer02WidthEq8_sse2 (pSrc,     iSrcStride, PDst,     iDstStride, iHeight);
-  McHorVer02WidthEq8_sse2 (&pSrc[8], iSrcStride, &PDst[8], iDstStride, iHeight);
-}
-static inline void McHorVer22WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iHeight) {
-  McHorVer22WidthEq8_sse2 (pSrc,     iSrcStride, pDst,     iDstStride, iHeight);
-  McHorVer22WidthEq8_sse2 (&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight);
-}
-void McHorVer22Width9Or17Height9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iWidth,
-    int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_2D (int16_t, pTap, 22, 24, 16)
-  int32_t tmp1 = 2 * (iWidth - 8);
-  McHorVer22HorFirst_sse2 (pSrc - 2, iSrcStride, (uint8_t*)pTap, 48, iWidth, iHeight + 5);
-  McHorVer22Width8VerLastAlign_sse2 ((uint8_t*)pTap,  48, pDst, iDstStride, iWidth - 1, iHeight);
-  McHorVer22Width8VerLastUnAlign_sse2 ((uint8_t*)pTap + tmp1,  48, pDst + iWidth - 8, iDstStride, 8, iHeight);
-}
-
-static inline void McHorVer01WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16)
-
-  McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
-  PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
-}
-static inline void McHorVer03WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16)
-
-  McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
-  PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
-}
-static inline void McHorVer10WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16)
-
-  McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
-  PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
-}
-static inline void McHorVer11WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
-
-  McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
-  McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_sse2 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
-}
-static inline void McHorVer12WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
-
-  McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
-  McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_sse2 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
-}
-static inline void McHorVer13WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
-
-  McHorVer20WidthEq16_sse2 (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
-  McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_sse2 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
-}
-static inline void McHorVer21WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
-
-  McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
-  McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_sse2 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
-}
-static inline void McHorVer23WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
-
-  McHorVer20WidthEq16_sse2 (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
-  McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_sse2 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
-}
-static inline void McHorVer30WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16)
-
-  McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
-  PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc + 1, iSrcStride, pTmp, 16, iHeight);
-}
-static inline void McHorVer31WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
-
-  McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight);
-  McHorVer02WidthEq16_sse2 (pSrc + 1, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_sse2 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
-}
-static inline void McHorVer32WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
-
-  McHorVer02WidthEq16_sse2 (pSrc + 1, iSrcStride, pTmp, 16, iHeight);
-  McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_sse2 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
-}
-static inline void McHorVer33WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
-
-  McHorVer20WidthEq16_sse2 (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
-  McHorVer02WidthEq16_sse2 (pSrc + 1, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_sse2 (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
-}
-
-static inline void McCopy_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                int32_t iWidth, int32_t iHeight) {
-  int32_t i;
-  if (iWidth == 16)
-    McCopyWidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 8)
-    McCopyWidthEq8_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else if (iWidth == 4)
-    McCopyWidthEq4_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else {
-    for (i = 0; i < iHeight; i++) {
-      memcpy (pDst, pSrc, iWidth);	// confirmed_safe_unsafe_usage
-      pDst += iDstStride;
-      pSrc += iSrcStride;
-    }
-  }
-}
-
-typedef void (*McChromaWidthEqx) (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                  const uint8_t* pABCD, int32_t iHeigh);
-void McChroma_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                    int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
-  const int32_t kiD8x = iMvX & 0x07;
-  const int32_t kiD8y = iMvY & 0x07;
-  static const McChromaWidthEqx kpfFuncs[2] = {
-    McChromaWidthEq4_mmx,
-    McChromaWidthEq8_sse2
-  };
-
-  if (0 == kiD8x && 0 == kiD8y) {
-    McCopy_sse2 (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
-  } else {
-    kpfFuncs[ (iWidth >> 3)] (pSrc, iSrcStride, pDst, iDstStride, g_kuiABCD[kiD8y][kiD8x], iHeight);
-  }
-}
-
-void McChroma_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                     int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
-  const int32_t kiD8x = iMvX & 0x07;
-  const int32_t kiD8y = iMvY & 0x07;
-
-  static const McChromaWidthEqx kpfFuncs[2] = {
-    McChromaWidthEq4_mmx,
-    McChromaWidthEq8_ssse3
-  };
-  if (0 == kiD8x && 0 == kiD8y) {
-    McCopy_sse2 (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
-  } else {
-    kpfFuncs[ (iWidth >> 3)] (pSrc, iSrcStride, pDst, iDstStride, g_kuiABCD[kiD8y][kiD8x], iHeight);
-  }
-
-}
-
-#endif //X86_ASM
-
-//***************************************************************************//
-//                       NEON implementation                      //
-//***************************************************************************//
-#if defined(HAVE_NEON)
-void McHorVer20Width9Or17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                int32_t iWidth, int32_t iHeight) {
-  if (iWidth == 17)
-    McHorVer20Width17_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else //if (iWidth == 9)
-    McHorVer20Width9_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-void McHorVer02Height9Or17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                 int32_t iWidth, int32_t iHeight) {
-  if (iWidth == 16)
-    McHorVer02Height17_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else //if (iWidth == 8)
-    McHorVer02Height9_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-void McHorVer22Width9Or17Height9Or17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iWidth, int32_t iHeight) {
-  if (iWidth == 17)
-    McHorVer22Width17_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else //if (iWidth == 9)
-    McHorVer22Width9_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-void EncMcHorVer11_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
-  McHorVer20WidthEq16_neon (pSrc, iSrcStride, pTmp, 16, iHeight);
-  McHorVer02WidthEq16_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_neon (pDst, iDstStride, pTmp, &pTmp[256], iHeight);
-}
-void EncMcHorVer12_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
-  McHorVer02WidthEq16_neon (pSrc, iSrcStride, pTmp, 16, iHeight);
-  McHorVer22WidthEq16_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_neon (pDst, iDstStride, pTmp, &pTmp[256], iHeight);
-}
-void EncMcHorVer13_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
-  McHorVer20WidthEq16_neon (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
-  McHorVer02WidthEq16_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_neon (pDst, iDstStride, pTmp, &pTmp[256], iHeight);
-}
-void EncMcHorVer21_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
-  McHorVer20WidthEq16_neon (pSrc, iSrcStride, pTmp, 16, iHeight);
-  McHorVer22WidthEq16_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_neon (pDst, iDstStride, pTmp, &pTmp[256], iHeight);
-}
-void EncMcHorVer23_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
-  McHorVer20WidthEq16_neon (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
-  McHorVer22WidthEq16_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_neon (pDst, iDstStride, pTmp, &pTmp[256], iHeight);
-}
-void EncMcHorVer31_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
-  McHorVer20WidthEq16_neon (pSrc, iSrcStride, pTmp, 16, iHeight);
-  McHorVer02WidthEq16_neon (pSrc + 1, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_neon (pDst, iDstStride, pTmp, &pTmp[256], iHeight);
-}
-void EncMcHorVer32_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
-  McHorVer02WidthEq16_neon (pSrc + 1, iSrcStride, pTmp, 16, iHeight);
-  McHorVer22WidthEq16_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_neon (pDst, iDstStride, pTmp, &pTmp[256], iHeight);
-}
-void EncMcHorVer33_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
-  McHorVer20WidthEq16_neon (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
-  McHorVer02WidthEq16_neon (pSrc + 1, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_neon (pDst, iDstStride, pTmp, &pTmp[256], iHeight);
-}
-void EncMcChroma_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                       int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
-  const int32_t kiD8x = iMvX & 0x07;
-  const int32_t kiD8y = iMvY & 0x07;
-  if (0 == kiD8x && 0 == kiD8y) {
-    if (8 == iWidth)
-      McCopyWidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-    else // iWidth == 4
-      McCopyWidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  } else {
-    if (8 == iWidth)
-      McChromaWidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight);
-    else //if(4 == iWidth)
-      McChromaWidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight);
-  }
-}
-#endif
-
-#if defined(HAVE_NEON_AARCH64)
-void McHorVer20Width9Or17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                        int32_t iWidth, int32_t iHeight) {
-  if (iWidth == 17)
-    McHorVer20Width17_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else //if (iWidth == 9)
-    McHorVer20Width9_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-void McHorVer02Height9Or17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-    int32_t iWidth, int32_t iHeight) {
-  if (iWidth == 16)
-    McHorVer02Height17_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else //if (iWidth == 8)
-    McHorVer02Height9_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-void McHorVer22Width9Or17Height9Or17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
-    int32_t iDstStride,
-    int32_t iWidth, int32_t iHeight) {
-  if (iWidth == 17)
-    McHorVer22Width17_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  else //if (iWidth == 9)
-    McHorVer22Width9_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-}
-void EncMcHorVer11_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                 int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
-  McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pTmp, 16, iHeight);
-  McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
-}
-void EncMcHorVer12_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                 int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
-  McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, pTmp, 16, iHeight);
-  McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
-}
-void EncMcHorVer13_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                 int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
-  McHorVer20WidthEq16_AArch64_neon (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
-  McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
-}
-void EncMcHorVer21_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                 int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
-  McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pTmp, 16, iHeight);
-  McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
-}
-void EncMcHorVer23_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                 int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
-  McHorVer20WidthEq16_AArch64_neon (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
-  McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
-}
-void EncMcHorVer31_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                 int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
-  McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pTmp, 16, iHeight);
-  McHorVer02WidthEq16_AArch64_neon (pSrc + 1, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
-}
-void EncMcHorVer32_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                 int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
-  McHorVer02WidthEq16_AArch64_neon (pSrc + 1, iSrcStride, pTmp, 16, iHeight);
-  McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
-}
-void EncMcHorVer33_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                 int32_t iHeight) {
-  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 512, 16)
-  McHorVer20WidthEq16_AArch64_neon (pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
-  McHorVer02WidthEq16_AArch64_neon (pSrc + 1, iSrcStride, &pTmp[256], 16, iHeight);
-  PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pTmp, 16, &pTmp[256], 16, iHeight);
-}
-void EncMcChroma_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                               int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
-  const int32_t kiD8x = iMvX & 0x07;
-  const int32_t kiD8y = iMvY & 0x07;
-  if (0 == kiD8x && 0 == kiD8y) {
-    if (8 == iWidth)
-      McCopyWidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-    else // iWidth == 4
-      McCopyWidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight);
-  } else {
-    if (8 == iWidth)
-      McChromaWidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight);
-    else //if(4 == iWidth)
-      McChromaWidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight);
-  }
-}
-#endif
-
-void WelsInitMcFuncs (SMcFunc* pMcFuncs, uint32_t uiCpuFlag) {
-  static const PWelsSampleAveragingFunc pfPixAvgFunc[2] = {PixelAvgWidthEq8_c, PixelAvgWidthEq16_c};
-
-  static const PWelsLumaQuarpelMcFunc pWelsMcFuncWidthEq16[16] = { //[y*4+x]
-    McCopyWidthEq16_c,     McHorVer10WidthEq16_c, McHorVer20WidthEq16_c, McHorVer30WidthEq16_c,
-    McHorVer01WidthEq16_c, McHorVer11WidthEq16_c, McHorVer21WidthEq16_c, McHorVer31WidthEq16_c,
-    McHorVer02WidthEq16_c, McHorVer12WidthEq16_c, McHorVer22WidthEq16_c, McHorVer32WidthEq16_c,
-    McHorVer03WidthEq16_c, McHorVer13WidthEq16_c, McHorVer23WidthEq16_c, McHorVer33WidthEq16_c
-  };
-#if defined (X86_ASM)
-  static const PWelsLumaQuarpelMcFunc pWelsMcFuncWidthEq16_sse2[16] = {
-    McCopyWidthEq16_sse2,     McHorVer10WidthEq16_sse2, McHorVer20WidthEq16_sse2, McHorVer30WidthEq16_sse2,
-    McHorVer01WidthEq16_sse2, McHorVer11WidthEq16_sse2, McHorVer21WidthEq16_sse2, McHorVer31WidthEq16_sse2,
-    McHorVer02WidthEq16_sse2, McHorVer12WidthEq16_sse2, McHorVer22WidthEq16_sse2, McHorVer32WidthEq16_sse2,
-    McHorVer03WidthEq16_sse2, McHorVer13WidthEq16_sse2, McHorVer23WidthEq16_sse2, McHorVer33WidthEq16_sse2
-  };
-#endif
-#if defined(HAVE_NEON)
-  static const PWelsLumaQuarpelMcFunc pWelsMcFuncWidthEq16_neon[16] = { //[x][y]
-    McCopyWidthEq16_neon,        McHorVer10WidthEq16_neon,   McHorVer20WidthEq16_neon,    McHorVer30WidthEq16_neon,
-    McHorVer01WidthEq16_neon,    EncMcHorVer11_neon,         EncMcHorVer21_neon,          EncMcHorVer31_neon,
-    McHorVer02WidthEq16_neon,    EncMcHorVer12_neon,         McHorVer22WidthEq16_neon,    EncMcHorVer32_neon,
-    McHorVer03WidthEq16_neon,    EncMcHorVer13_neon,         EncMcHorVer23_neon,          EncMcHorVer33_neon
-  };
-#endif
-#if defined(HAVE_NEON_AARCH64)
-  static const PWelsLumaQuarpelMcFunc pWelsMcFuncWidthEq16_AArch64_neon[16] = { //[x][y]
-    McCopyWidthEq16_AArch64_neon,        McHorVer10WidthEq16_AArch64_neon,   McHorVer20WidthEq16_AArch64_neon,    McHorVer30WidthEq16_AArch64_neon,
-    McHorVer01WidthEq16_AArch64_neon,    EncMcHorVer11_AArch64_neon,         EncMcHorVer21_AArch64_neon,          EncMcHorVer31_AArch64_neon,
-    McHorVer02WidthEq16_AArch64_neon,    EncMcHorVer12_AArch64_neon,         McHorVer22WidthEq16_AArch64_neon,    EncMcHorVer32_AArch64_neon,
-    McHorVer03WidthEq16_AArch64_neon,    EncMcHorVer13_AArch64_neon,         EncMcHorVer23_AArch64_neon,          EncMcHorVer33_AArch64_neon
-  };
-#endif
-  pMcFuncs->pfLumaHalfpelHor = McHorVer20_c;
-  pMcFuncs->pfLumaHalfpelVer = McHorVer02_c;
-  pMcFuncs->pfLumaHalfpelCen = McHorVer22_c;
-  memcpy (pMcFuncs->pfSampleAveraging, pfPixAvgFunc, sizeof (pfPixAvgFunc));
-  pMcFuncs->pfChromaMc	= McChroma_c;
-  memcpy (pMcFuncs->pfLumaQuarpelMc, pWelsMcFuncWidthEq16, sizeof (pWelsMcFuncWidthEq16));
-#if defined (X86_ASM)
-  if (uiCpuFlag & WELS_CPU_SSE2) {
-    pMcFuncs->pfLumaHalfpelHor = McHorVer20Width9Or17_sse2;
-    pMcFuncs->pfLumaHalfpelVer = McHorVer02Height9Or17_sse2;
-    pMcFuncs->pfLumaHalfpelCen = McHorVer22Width9Or17Height9Or17_sse2;
-    pMcFuncs->pfSampleAveraging[0] = PixelAvgWidthEq8_mmx;
-    pMcFuncs->pfSampleAveraging[1] = PixelAvgWidthEq16_sse2;
-    pMcFuncs->pfChromaMc = McChroma_sse2;
-    memcpy (pMcFuncs->pfLumaQuarpelMc, pWelsMcFuncWidthEq16_sse2, sizeof (pWelsMcFuncWidthEq16_sse2));
-  }
-
-  if (uiCpuFlag & WELS_CPU_SSSE3) {
-    pMcFuncs->pfChromaMc = McChroma_ssse3;
-  }
-
-#endif //(X86_ASM)
-
-#if defined(HAVE_NEON)
-  if (uiCpuFlag & WELS_CPU_NEON) {
-    memcpy (pMcFuncs->pfLumaQuarpelMc, pWelsMcFuncWidthEq16_neon, sizeof (pWelsMcFuncWidthEq16_neon));
-    pMcFuncs->pfChromaMc	= EncMcChroma_neon;
-    pMcFuncs->pfSampleAveraging[0] = PixStrideAvgWidthEq8_neon;
-    pMcFuncs->pfSampleAveraging[1] = PixStrideAvgWidthEq16_neon;
-    pMcFuncs->pfLumaHalfpelHor = McHorVer20Width9Or17_neon;//iWidth+1:8/16
-    pMcFuncs->pfLumaHalfpelVer = McHorVer02Height9Or17_neon;//heigh+1:8/16
-    pMcFuncs->pfLumaHalfpelCen = McHorVer22Width9Or17Height9Or17_neon;//iWidth+1/heigh+1
-  }
-#endif
-#if defined(HAVE_NEON_AARCH64)
-  if (uiCpuFlag & WELS_CPU_NEON) {
-    memcpy (pMcFuncs->pfLumaQuarpelMc, pWelsMcFuncWidthEq16_AArch64_neon,
-            sizeof (pWelsMcFuncWidthEq16_AArch64_neon));
-    pMcFuncs->pfChromaMc	= EncMcChroma_AArch64_neon;
-    pMcFuncs->pfSampleAveraging[0] = PixStrideAvgWidthEq8_AArch64_neon;
-    pMcFuncs->pfSampleAveraging[1] = PixStrideAvgWidthEq16_AArch64_neon;
-    pMcFuncs->pfLumaHalfpelHor = McHorVer20Width9Or17_AArch64_neon;//iWidth+1:8/16
-    pMcFuncs->pfLumaHalfpelVer = McHorVer02Height9Or17_AArch64_neon;//heigh+1:8/16
-    pMcFuncs->pfLumaHalfpelCen = McHorVer22Width9Or17Height9Or17_AArch64_neon;//iWidth+1/heigh+1
-  }
-#endif
-}
-}
--- a/codec/encoder/core/src/md.cpp
+++ b/codec/encoder/core/src/md.cpp
@@ -531,15 +531,14 @@
 
 inline void MeRefineQuarPixel (SWelsFuncPtrList* pFunc, SWelsME* pMe, SMeRefinePointer* pMeRefine,
                                const int32_t kiWidth, const int32_t kiHeight, SQuarRefineParams* pParams, int32_t iStrideEnc) {
-  PWelsSampleAveragingFunc* pSampleAvg	= pFunc->sMcFuncs.pfSampleAveraging;
-  const int32_t kiAvgIndex		= kiWidth >> 4;
+  PWelsSampleAveragingFunc pSampleAvg	= pFunc->sMcFuncs.pfSampleAveraging;
   int32_t iCurCost;
   uint8_t* pEncMb				= pMe->pEncMb;
   uint8_t* pTmp				= NULL;
   const uint8_t kuiPixel		= pMe->uiBlockSize;
 
-  pSampleAvg[kiAvgIndex] (pMeRefine->pQuarPixTmp, ME_REFINE_BUF_STRIDE, pParams->pSrcA[0], ME_REFINE_BUF_STRIDE,
-                          pParams->pSrcB[0], pParams->iStrideA, kiHeight);
+  pSampleAvg (pMeRefine->pQuarPixTmp, ME_REFINE_BUF_STRIDE, pParams->pSrcA[0], ME_REFINE_BUF_STRIDE,
+              pParams->pSrcB[0], pParams->iStrideA, kiWidth, kiHeight);
 
   iCurCost = CALC_COST (pMeRefine->pQuarPixTmp, pParams->iLms[0]);
   if (iCurCost < pParams->iBestCost) {
@@ -547,8 +546,8 @@
     SWITCH_BEST_TMP_BUF (pMeRefine->pQuarPixBest, pMeRefine->pQuarPixTmp);
   }
   //=========================(0, 1)=======================//
-  pSampleAvg[kiAvgIndex] (pMeRefine->pQuarPixTmp, ME_REFINE_BUF_STRIDE, pParams->pSrcA[1],
-                          ME_REFINE_BUF_STRIDE, pParams->pSrcB[1], pParams->iStrideA, kiHeight);
+  pSampleAvg (pMeRefine->pQuarPixTmp, ME_REFINE_BUF_STRIDE, pParams->pSrcA[1],
+              ME_REFINE_BUF_STRIDE, pParams->pSrcB[1], pParams->iStrideA, kiWidth, kiHeight);
   iCurCost = CALC_COST (pMeRefine->pQuarPixTmp, pParams->iLms[1]);
   if (iCurCost < pParams->iBestCost) {
     pParams->iBestQuarPix = ME_QUAR_PIXEL_BOTTOM;
@@ -555,8 +554,8 @@
     SWITCH_BEST_TMP_BUF (pMeRefine->pQuarPixBest, pMeRefine->pQuarPixTmp);
   }
   //==========================(-1, 0)=========================//
-  pSampleAvg[kiAvgIndex] (pMeRefine->pQuarPixTmp, ME_REFINE_BUF_STRIDE, pParams->pSrcA[2],
-                          ME_REFINE_BUF_STRIDE, pParams->pSrcB[2], pParams->iStrideB, kiHeight);
+  pSampleAvg (pMeRefine->pQuarPixTmp, ME_REFINE_BUF_STRIDE, pParams->pSrcA[2],
+              ME_REFINE_BUF_STRIDE, pParams->pSrcB[2], pParams->iStrideB, kiWidth, kiHeight);
   iCurCost = CALC_COST (pMeRefine->pQuarPixTmp, pParams->iLms[2]);
   if (iCurCost < pParams->iBestCost) {
     pParams->iBestQuarPix = ME_QUAR_PIXEL_LEFT;
@@ -563,8 +562,8 @@
     SWITCH_BEST_TMP_BUF (pMeRefine->pQuarPixBest, pMeRefine->pQuarPixTmp);
   }
   //==========================(1, 0)=========================//
-  pSampleAvg[kiAvgIndex] (pMeRefine->pQuarPixTmp, ME_REFINE_BUF_STRIDE, pParams->pSrcA[3],
-                          ME_REFINE_BUF_STRIDE,	pParams->pSrcB[3], pParams->iStrideB,  kiHeight);
+  pSampleAvg (pMeRefine->pQuarPixTmp, ME_REFINE_BUF_STRIDE, pParams->pSrcA[3],
+              ME_REFINE_BUF_STRIDE,	pParams->pSrcB[3], pParams->iStrideB,  kiWidth, kiHeight);
 
   iCurCost = CALC_COST (pMeRefine->pQuarPixTmp, pParams->iLms[3]);
   if (iCurCost < pParams->iBestCost) {
--- a/codec/encoder/core/src/svc_base_layer_md.cpp
+++ b/codec/encoder/core/src/svc_base_layer_md.cpp
@@ -1247,9 +1247,9 @@
     pDstCr	= pMbCache->pMemPredChroma + 64;
   }
   //MC
-  pFunc->sMcFuncs.pfLumaQuarpelMc[0] (pRefLuma, iLineSizeY, pDstLuma, 16, 16);
-  pFunc->sMcFuncs.pfChromaMc (pRefCb, iLineSizeUV, pDstCb, 8, sMvp.iMvX, sMvp.iMvY, 8, 8); //Cb
-  pFunc->sMcFuncs.pfChromaMc (pRefCr, iLineSizeUV, pDstCr, 8, sMvp.iMvX, sMvp.iMvY, 8, 8); //Cr
+  pFunc->sMcFuncs.pMcLumaFunc (pRefLuma, iLineSizeY, pDstLuma, 16, 0, 0, 16, 16);
+  pFunc->sMcFuncs.pMcChromaFunc (pRefCb, iLineSizeUV, pDstCb, 8, sMvp.iMvX, sMvp.iMvY, 8, 8); //Cb
+  pFunc->sMcFuncs.pMcChromaFunc (pRefCr, iLineSizeUV, pDstCr, 8, sMvp.iMvX, sMvp.iMvY, 8, 8); //Cr
 
   pCurMb->uiCbp = 0;
   pMbCache->bCollocatedPredFlag = true;
@@ -1313,7 +1313,6 @@
   uint8_t* pDstCr   = pMbCache->pSkipMb + 256 + 64;
 
   SMVUnitXY sMvp = { 0 };
-  uint8_t uiMvpIdx;
   int32_t n;
 
   int32_t iEncStride		= pCurLayer->iEncStride[0];
@@ -1343,19 +1342,18 @@
 
   //luma
   pRefLuma += sQpelMvp.iMvY * iLineSizeY + sQpelMvp.iMvX;
-  uiMvpIdx = ((sMvp.iMvY & 0x03) << 2) + (sMvp.iMvX & 0x03);
-  pFunc->sMcFuncs.pfLumaQuarpelMc[uiMvpIdx] (pRefLuma, iLineSizeY, pDstLuma, 16, 16);
+  pFunc->sMcFuncs.pMcLumaFunc (pRefLuma, iLineSizeY, pDstLuma, 16, sMvp.iMvX, sMvp.iMvY, 16, 16);
   iSadCostLuma    = pFunc->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16] (pMbCache->SPicData.pEncMb[0],
                     pCurLayer->iEncStride[0], pDstLuma, 16);
 
   const int32_t iStrideUV = (sQpelMvp.iMvY >> 1) * iLineSizeUV + (sQpelMvp.iMvX >> 1);
   pRefCb += iStrideUV;
-  pFunc->sMcFuncs.pfChromaMc (pRefCb, iLineSizeUV, pDstCb, 8, sMvp.iMvX, sMvp.iMvY, 8, 8); //Cb
+  pFunc->sMcFuncs.pMcChromaFunc (pRefCb, iLineSizeUV, pDstCb, 8, sMvp.iMvX, sMvp.iMvY, 8, 8); //Cb
   iSadCostChroma  = pFunc->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8] (pMbCache->SPicData.pEncMb[1],
                     pCurLayer->iEncStride[1], pDstCb, 8);
 
   pRefCr += iStrideUV;
-  pFunc->sMcFuncs.pfChromaMc (pRefCr, iLineSizeUV, pDstCr, 8, sMvp.iMvX, sMvp.iMvY, 8, 8); //Cr
+  pFunc->sMcFuncs.pMcChromaFunc (pRefCr, iLineSizeUV, pDstCr, 8, sMvp.iMvX, sMvp.iMvY, 8, 8); //Cr
   iSadCostChroma += pFunc->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8] (pMbCache->SPicData.pEncMb[2],
                     pCurLayer->iEncStride[2], pDstCr, 8);
 
@@ -1463,8 +1461,8 @@
     iMvStride = (pMv->iMvY >> 3) * iLineSizeRefUV + (pMv->iMvX >> 3);
     pTmpRefCb = pRefCb + iMvStride;
     pTmpRefCr = pRefCr + iMvStride;
-    pEncCtx->pFuncList->sMcFuncs.pfChromaMc (pTmpRefCb, iLineSizeRefUV, pDstCb, 8, pMv->iMvX, pMv->iMvY, 8, 8); //Cb
-    pEncCtx->pFuncList->sMcFuncs.pfChromaMc (pTmpRefCr, iLineSizeRefUV, pDstCr, 8, pMv->iMvX, pMv->iMvY, 8, 8); //Cr
+    pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCb, iLineSizeRefUV, pDstCb, 8, pMv->iMvX, pMv->iMvY, 8, 8); //Cb
+    pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCr, iLineSizeRefUV, pDstCr, 8, pMv->iMvX, pMv->iMvY, 8, 8); //Cr
 
     pWelsMd->iCostSkipMb = pEncCtx->pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16] (pMbCache->SPicData.pEncMb[0],
                            pCurDqLayer->iEncStride[0], pDstLuma, 16);
@@ -1498,8 +1496,8 @@
       pTmpRefCr = pRefCr + iRefBlk4Stride + iMvStride;
       pTmpDstCb = pDstCb + iDstBlk4Stride;
       pTmpDstCr = pDstCr + iDstBlk4Stride;
-      pEncCtx->pFuncList->sMcFuncs.pfChromaMc (pTmpRefCb, iLineSizeRefUV, pTmpDstCb, 8, pMv->iMvX, pMv->iMvY, 8, 4); //Cb
-      pEncCtx->pFuncList->sMcFuncs.pfChromaMc (pTmpRefCr, iLineSizeRefUV, pTmpDstCr, 8, pMv->iMvX, pMv->iMvY, 8, 4); //Cr
+      pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCb, iLineSizeRefUV, pTmpDstCb, 8, pMv->iMvX, pMv->iMvY, 8, 4); //Cb
+      pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCr, iLineSizeRefUV, pTmpDstCr, 8, pMv->iMvX, pMv->iMvY, 8, 4); //Cr
     }
     break;
 
@@ -1526,8 +1524,8 @@
       pTmpRefCr = pRefCr + iRefBlk4Stride + iMvStride;
       pTmpDstCb = pDstCb + iRefBlk4Stride;
       pTmpDstCr = pDstCr + iRefBlk4Stride;
-      pEncCtx->pFuncList->sMcFuncs.pfChromaMc (pTmpRefCb, iLineSizeRefUV, pTmpDstCb, 8, pMv->iMvX, pMv->iMvY, 4, 8); //Cb
-      pEncCtx->pFuncList->sMcFuncs.pfChromaMc (pTmpRefCr, iLineSizeRefUV, pTmpDstCr, 8, pMv->iMvX, pMv->iMvY, 4, 8); //Cr
+      pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCb, iLineSizeRefUV, pTmpDstCb, 8, pMv->iMvX, pMv->iMvY, 4, 8); //Cb
+      pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCr, iLineSizeRefUV, pTmpDstCr, 8, pMv->iMvX, pMv->iMvY, 4, 8); //Cr
     }
     break;
 
@@ -1560,8 +1558,8 @@
       pTmpDstCb = pDstCb + iDstBlk4Stride;
       pTmpRefCr = pRefCr + iRefBlk4Stride;
       pTmpDstCr = pDstCr + iDstBlk4Stride;
-      pEncCtx->pFuncList->sMcFuncs.pfChromaMc (pTmpRefCb + iMvStride, iLineSizeRefUV, pTmpDstCb, 8, pMv->iMvX, pMv->iMvY, 4, 4); //Cb
-      pEncCtx->pFuncList->sMcFuncs.pfChromaMc (pTmpRefCr + iMvStride, iLineSizeRefUV, pTmpDstCr, 8, pMv->iMvX, pMv->iMvY, 4, 4); //Cr
+      pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCb + iMvStride, iLineSizeRefUV, pTmpDstCb, 8, pMv->iMvX, pMv->iMvY, 4, 4); //Cb
+      pEncCtx->pFuncList->sMcFuncs.pMcChromaFunc (pTmpRefCr + iMvStride, iLineSizeRefUV, pTmpDstCr, 8, pMv->iMvX, pMv->iMvY, 4, 4); //Cr
 
     }
     break;
--- a/codec/encoder/core/src/svc_mode_decision.cpp
+++ b/codec/encoder/core/src/svc_mode_decision.cpp
@@ -414,9 +414,9 @@
     pDstCr	= pMbCache->pMemPredChroma + 64;
   }
   //MC
-  pFunc->sMcFuncs.pfLumaQuarpelMc[0] (pRefLuma + iOffsetY, iLineSizeY, pDstLuma, 16, 16);
-  pFunc->sMcFuncs.pfChromaMc (pRefCb + iOffsetUV, iLineSizeUV, pDstCb, 8, sMvp.iMvX, sMvp.iMvY, 8, 8);
-  pFunc->sMcFuncs.pfChromaMc (pRefCr + iOffsetUV, iLineSizeUV, pDstCr, 8, sMvp.iMvX, sMvp.iMvY, 8, 8);
+  pFunc->sMcFuncs.pMcLumaFunc (pRefLuma + iOffsetY, iLineSizeY, pDstLuma, 16, 0, 0, 16, 16);
+  pFunc->sMcFuncs.pMcChromaFunc (pRefCb + iOffsetUV, iLineSizeUV, pDstCb, 8, sMvp.iMvX, sMvp.iMvY, 8, 8);
+  pFunc->sMcFuncs.pMcChromaFunc (pRefCr + iOffsetUV, iLineSizeUV, pDstCr, 8, sMvp.iMvX, sMvp.iMvY, 8, 8);
 
   pCurMb->uiCbp = 0;
   pWelsMd->iCostLuma = 0;
--- a/codec/encoder/targets.mk
+++ b/codec/encoder/targets.mk
@@ -8,7 +8,6 @@
 	$(ENCODER_SRCDIR)/core/src/encoder_data_tables.cpp\
 	$(ENCODER_SRCDIR)/core/src/encoder_ext.cpp\
 	$(ENCODER_SRCDIR)/core/src/get_intra_predictor.cpp\
-	$(ENCODER_SRCDIR)/core/src/mc.cpp\
 	$(ENCODER_SRCDIR)/core/src/md.cpp\
 	$(ENCODER_SRCDIR)/core/src/memory_align.cpp\
 	$(ENCODER_SRCDIR)/core/src/mv_pred.cpp\
--- a/test/build/win32/codec_ut/codec_unittest.vcproj
+++ b/test/build/win32/codec_ut/codec_unittest.vcproj
@@ -623,42 +623,6 @@
 				</FileConfiguration>
 			</File>
 			<File
-				RelativePath="..\..\..\decoder\DecUT_MotionCompensation.cpp"
-				>
-				<FileConfiguration
-					Name="Debug|Win32"
-					>
-					<Tool
-						Name="VCCLCompilerTool"
-						AdditionalIncludeDirectories="..\..\..\..\codec\api\svc;..\..\..\..\gtest\include;..\..\..\;..\..\..\..\codec\decoder\plus\inc;..\..\..\..\codec\common\inc;..\..\..\..\codec\decoder\core\inc;$(NOINHERIT)"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Debug|x64"
-					>
-					<Tool
-						Name="VCCLCompilerTool"
-						AdditionalIncludeDirectories="..\..\..\..\codec\api\svc;..\..\..\..\gtest\include;..\..\..\;..\..\..\..\codec\decoder\plus\inc;..\..\..\..\codec\common\inc;..\..\..\..\codec\decoder\core\inc;$(NOINHERIT)"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Release|Win32"
-					>
-					<Tool
-						Name="VCCLCompilerTool"
-						AdditionalIncludeDirectories="..\..\..\..\codec\api\svc;..\..\..\..\gtest\include;..\..\..\;..\..\..\..\codec\decoder\plus\inc;..\..\..\..\codec\common\inc;..\..\..\..\codec\decoder\core\inc;$(NOINHERIT)"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Release|x64"
-					>
-					<Tool
-						Name="VCCLCompilerTool"
-						AdditionalIncludeDirectories="..\..\..\..\codec\api\svc;..\..\..\..\gtest\include;..\..\..\;..\..\..\..\codec\decoder\plus\inc;..\..\..\..\codec\common\inc;..\..\..\..\codec\decoder\core\inc;$(NOINHERIT)"
-					/>
-				</FileConfiguration>
-			</File>
-			<File
 				RelativePath="..\..\..\decoder\DecUT_ParseSyntax.cpp"
 				>
 				<FileConfiguration
--- a/test/decoder/DecUT_MotionCompensation.cpp
+++ /dev/null
@@ -1,39 +1,0 @@
-#include <gtest/gtest.h>
-#include "codec_def.h"
-#include "mc.h"
-#include "cpu.h"
-using namespace WelsDec;
-
-#define LUMA_FUNC(funcs, src, srcstride, dst, dststride, mvx, mvy, width, height) \
-  sMcFunc.pMcLumaFunc (src, srcstride, dst, dststride, mvx, mvy, width, height)
-
-#define CHROMA_FUNC sMcFunc.pMcChromaFunc
-
-#include "mc_test_common.h"
-
-DEF_MCCOPYTEST (, 2, 2)
-DEF_MCCOPYTEST (, 2, 4)
-DEF_MCCOPYTEST (, 4, 2)
-DEF_MCCOPYTEST (, 4, 4)
-DEF_MCCOPYTEST (, 4, 8)
-DEF_MCCOPYTEST (, 8, 4)
-DEF_MCCOPYTEST (, 8, 8)
-DEF_MCCOPYTEST (, 16, 8)
-DEF_MCCOPYTEST (, 8, 16)
-DEF_MCCOPYTEST (, 16, 16)
-
-DEF_LUMA_MCTEST (, 4, 4)
-DEF_LUMA_MCTEST (, 4, 8)
-DEF_LUMA_MCTEST (, 8, 4)
-DEF_LUMA_MCTEST (, 8, 8)
-DEF_LUMA_MCTEST (, 16, 8)
-DEF_LUMA_MCTEST (, 8, 16)
-DEF_LUMA_MCTEST (, 16, 16)
-
-DEF_CHROMA_MCTEST (, 2, 2)
-DEF_CHROMA_MCTEST (, 2, 4)
-DEF_CHROMA_MCTEST (, 4, 2)
-DEF_CHROMA_MCTEST (, 4, 4)
-DEF_CHROMA_MCTEST (, 4, 8)
-DEF_CHROMA_MCTEST (, 8, 4)
-DEF_CHROMA_MCTEST (, 8, 8)
--- a/test/decoder/targets.mk
+++ b/test/decoder/targets.mk
@@ -6,7 +6,6 @@
 	$(DECODER_UNITTEST_SRCDIR)/DecUT_ErrorConcealment.cpp\
 	$(DECODER_UNITTEST_SRCDIR)/DecUT_IdctResAddPred.cpp\
 	$(DECODER_UNITTEST_SRCDIR)/DecUT_IntraPrediction.cpp\
-	$(DECODER_UNITTEST_SRCDIR)/DecUT_MotionCompensation.cpp\
 	$(DECODER_UNITTEST_SRCDIR)/DecUT_ParseSyntax.cpp\
 	$(DECODER_UNITTEST_SRCDIR)/DecUT_PredMv.cpp\
 
--- a/test/encoder/EncUT_MotionCompensation.cpp
+++ b/test/encoder/EncUT_MotionCompensation.cpp
@@ -1,37 +1,288 @@
 #include <gtest/gtest.h>
 #include "codec_def.h"
+#include "macros.h"
 #include "mc.h"
 #include "cpu.h"
-using namespace WelsEnc;
+using namespace WelsCommon;
 
-static void McLumaFunc (SMcFunc* pFuncs, const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                        int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
-  uint8_t uiMvpIdx = ((iMvY & 0x03) << 2) + (iMvX & 0x03);
-  ASSERT_EQ (iWidth, 16);
-  pFuncs->pfLumaQuarpelMc[uiMvpIdx] (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+#define MC_BUFF_SRC_STRIDE 32
+#define MC_BUFF_DST_STRIDE 32
+#define MC_BUFF_HEIGHT 30
+
+/**********************MC Unit Test Anchor Code Begin******************************/
+static const bool bQpelNeeded[4][4] = { // [iMvY & 3][iMvX & 3]: true when MCLumaAnchor averages two planes
+  { false, true, false, true },
+  { true,  true,  true, true },
+  { false, true, false, true },
+  { true,  true,  true, true }
+};
+static const int32_t iHpelRef0Array[4][4] = { // [iMvY & 3][iMvX & 3]: index into pSrc[] of the first plane read
+  { 0, 1, 1, 1 },
+  { 0, 1, 1, 1 },
+  { 2, 3, 3, 3 },
+  { 0, 1, 1, 1 }
+};
+static const int32_t iHpelRef1Array[4][4] = { // [iMvY & 3][iMvX & 3]: index into pSrc[] of the plane averaged in
+  { 0, 0, 0, 0 },
+  { 2, 2, 3, 2 },
+  { 2, 2, 3, 2 },
+  { 2, 2, 3, 2 }
+};
+#define FILTER6TAP(pPixBuff, x, iStride) ((pPixBuff)[(x)-2*(iStride)] + (pPixBuff)[(x)+3*(iStride)] - 5*((pPixBuff)[(x)-(iStride)] + (pPixBuff)[(x)+2*(iStride)]) + 20*((pPixBuff)[(x)] + (pPixBuff)[(x)+(iStride)])) // unnormalized 6-tap sum, weights (1,-5,20,20,-5,1); arguments parenthesized so expressions are safe
+static inline uint8_t Clip255 (int32_t x) { // clamp x to [0, 255]; explicit compares avoid shifting a negative value
+  return (uint8_t) (x < 0 ? 0 : (x > 255 ? 255 : x));
 }
 
-#define InitMcFunc WelsInitMcFuncs
+static void MCCopyAnchor (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                          int32_t iHeight) {
+  // Reference block copy: iHeight rows of iWidth bytes, each buffer advancing by its own stride.
+  const uint8_t* pSrcRow = pSrc;
+  uint8_t* pDstRow = pDst;
+  for (int32_t iRow = 0; iRow < iHeight; iRow++, pSrcRow += iSrcStride, pDstRow += iDstStride)
+    memcpy (pDstRow, pSrcRow, iWidth * sizeof (uint8_t));
+}
 
-#define LUMA_FUNC(funcs, src, srcstride, dst, dststride, mvx, mvy, width, height) \
-  McLumaFunc (funcs, src, srcstride, dst, dststride, mvx, mvy, width, height)
+static void MCHalfPelFilterAnchor (uint8_t* pDstH, uint8_t* pDstV, uint8_t* pDstHV, uint8_t* pSrc,
+                                   int32_t iStride, int32_t iWidth, int32_t iHeight, int16_t* pBuf) {
+  for (int32_t iRow = 0; iRow < iHeight; iRow++) { // one row of each output plane per iteration
+    for (int32_t iCol = 0; iCol < iWidth; iCol++) // pDstH: 6-tap along the row, rounded by (+16) >> 5
+      pDstH[iCol] = Clip255 ((FILTER6TAP (pSrc, iCol, 1) + 16) >> 5);
+    for (int32_t iCol = -2; iCol < iWidth + 3; iCol++) { // pDstV: 6-tap down the column, columns [-2, iWidth + 3)
+      const int32_t iVertSum = FILTER6TAP (pSrc, iCol, iStride);
+      pDstV[iCol] = Clip255 ((iVertSum + 16) >> 5);
+      pBuf[iCol + 2] = iVertSum; // keep the unrounded vertical sum for the pDstHV pass below
+    }
+    for (int32_t iCol = 0; iCol < iWidth; iCol++) // pDstHV: 6-tap across the vertical sums, rounded by (+512) >> 10
+      pDstHV[iCol] = Clip255 ((FILTER6TAP (pBuf + 2, iCol, 1) + 512) >> 10);
+    pDstH  += iStride;
+    pDstV  += iStride;
+    pDstHV += iStride;
+    pSrc   += iStride;
+  }
+}
 
-#define CHROMA_FUNC sMcFunc.pfChromaMc
+static void PixelAvgAnchor (uint8_t* pDst,  int32_t iDstStride,
+                            uint8_t* pSrc1, int32_t iSrc1Stride,
+                            uint8_t* pSrc2, int32_t iSrc2Stride, int32_t iWidth, int32_t iHeight) {
+  for (int32_t iRow = 0; iRow < iHeight; iRow++) { // rounded average per sample: (a + b + 1) >> 1
+    uint8_t* pOut = pDst + iRow * iDstStride;
+    const uint8_t* pA = pSrc1 + iRow * iSrc1Stride;
+    const uint8_t* pB = pSrc2 + iRow * iSrc2Stride;
+    for (int32_t iCol = 0; iCol < iWidth; iCol++)
+      pOut[iCol] = (pA[iCol] + pB[iCol] + 1) >> 1;
+  }
+}
 
-#include "mc_test_common.h"
+static void MCLumaAnchor (uint8_t* pDst, int32_t iDstStride, uint8_t* pSrc[4], int32_t iSrcStride,
+                          int32_t iMvX, int32_t iMvY, int32_t iWidth, int32_t iHeight) {
+  int32_t iMvXIdx = iMvX & 3; // low two bits of the MV select the table column
+  int32_t iMvYIdx = iMvY & 3; // low two bits of the MV select the table row
+  int32_t iOffset = (iMvY >> 2) * iSrcStride + (iMvX >> 2); // whole-sample displacement inside a plane
+  uint8_t* pSrc1 = pSrc[iHpelRef0Array[iMvYIdx][iMvXIdx]] + iOffset + ((iMvYIdx) == 3) * iSrcStride; // one row lower when iMvYIdx == 3
 
-DEF_MCCOPYTEST (Enc, 16, 8)
-DEF_MCCOPYTEST (Enc, 16, 16)
+  if (bQpelNeeded[iMvYIdx][iMvXIdx]) { // two planes are averaged for this MV fraction
+    uint8_t* pSrc2 = pSrc[iHpelRef1Array[iMvYIdx][iMvXIdx]] + iOffset + ((iMvXIdx) == 3); // one column right when iMvXIdx == 3
+    PixelAvgAnchor (pDst, iDstStride, pSrc1, iSrcStride, pSrc2, iSrcStride, iWidth, iHeight);
+  } else { // a single plane already holds the result: plain copy
+    MCCopyAnchor (pSrc1, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+  }
+}
 
-DEF_LUMA_MCTEST (Enc, 16, 8)
-DEF_LUMA_MCTEST (Enc, 16, 16)
+static void MCChromaAnchor (uint8_t* pDstU, uint8_t* pDstV, int32_t iDstStride, uint8_t* pSrc, int32_t iSrcStride,
+                            int32_t iMvX, int32_t iMvY, int32_t iWidth, int32_t iHeight) {
+  uint8_t* pSrcTmp;
+  pSrc += (iMvY >> 3) * iSrcStride + (iMvX >> 3) * 2; // whole-sample part of the MV; 2 bytes per column (U at even, V at odd offsets)
+  pSrcTmp = &pSrc[iSrcStride]; // row directly below pSrc
 
-DEF_CHROMA_MCTEST (Enc, 4, 2)
-DEF_CHROMA_MCTEST (Enc, 4, 4)
-DEF_CHROMA_MCTEST (Enc, 4, 8)
-DEF_CHROMA_MCTEST (Enc, 8, 4)
-DEF_CHROMA_MCTEST (Enc, 8, 8)
+  int32_t iMvXIdx = iMvX & 0x07; // low three bits: horizontal interpolation position (0..7)
+  int32_t iMvYIdx = iMvY & 0x07; // low three bits: vertical interpolation position (0..7)
+  int32_t iBiPara0 = (8 - iMvXIdx) * (8 - iMvYIdx); // weight of the top-left sample; the four weights sum to 64
+  int32_t iBiPara1 = iMvXIdx    * (8 - iMvYIdx); // weight of the top-right sample
+  int32_t iBiPara2 = (8 - iMvXIdx) * iMvYIdx; // weight of the bottom-left sample
+  int32_t iBiPara3 = iMvXIdx    * iMvYIdx; // weight of the bottom-right sample
+  for (int32_t y = 0; y < iHeight; y++) {
+    for (int32_t x = 0; x < iWidth; x++) {
+      pDstU[x] = (iBiPara0 * pSrc[2 * x]  + iBiPara1 * pSrc[2 * x + 2] +
+                  iBiPara2 * pSrcTmp[2 * x] + iBiPara3 * pSrcTmp[2 * x + 2] + 32) >> 6; // (+32) >> 6: rounded division by 64
+      pDstV[x] = (iBiPara0 * pSrc[2 * x + 1]  + iBiPara1 * pSrc[2 * x + 3] +
+                  iBiPara2 * pSrcTmp[2 * x + 1] + iBiPara3 * pSrcTmp[2 * x + 3] + 32) >> 6; // same weights on the odd-offset samples
+    }
+    pSrc   = pSrcTmp; // the lower row becomes the upper row of the next iteration
+    pSrcTmp += iSrcStride;
+    pDstU += iDstStride;
+    pDstV += iDstStride;
+  }
+}
 
+/**********************MC Unit Test OPENH264 Code Begin******************************/
+// DEF_MCCOPYTEST(iW, iH): checks that pMcLumaFunc with a (0, 0) motion vector matches MCCopyAnchor
+// for an iW x iH block, once with no CPU flags and once with the flags from WelsCPUFeatureDetect.
+#define DEF_MCCOPYTEST(iW,iH) \
+TEST(McCopy_c,iW##x##iH) \
+{ \
+    SMcFunc sMcFunc; \
+    int32_t iCpuCores = 1; \
+    uint32_t uiCpuFlag; \
+    for (int32_t iPass = 0; iPass < 2; iPass++) { \
+      /* pass 0: no CPU flags; pass 1: detected CPU features */ \
+      if (iPass == 0) { \
+        uiCpuFlag = 0; \
+      } else { \
+        uiCpuFlag = WelsCPUFeatureDetect (&iCpuCores); \
+      } \
+      InitMcFunc (&sMcFunc, uiCpuFlag); \
+      uint8_t uSrcAnchor[MC_BUFF_HEIGHT][MC_BUFF_SRC_STRIDE]; \
+      uint8_t uSrcTest[MC_BUFF_HEIGHT][MC_BUFF_SRC_STRIDE]; \
+      ENFORCE_STACK_ALIGN_2D (uint8_t, uDstAnchor, MC_BUFF_HEIGHT, MC_BUFF_DST_STRIDE, 16); \
+      ENFORCE_STACK_ALIGN_2D (uint8_t, uDstTest, MC_BUFF_HEIGHT, MC_BUFF_DST_STRIDE, 16); \
+      /* fill both source buffers with the same random samples */ \
+      for (int32_t iRow = 0; iRow < MC_BUFF_HEIGHT; iRow++) { \
+        for (int32_t iCol = 0; iCol < MC_BUFF_SRC_STRIDE; iCol++) { \
+          uSrcAnchor[iRow][iCol] = uSrcTest[iRow][iCol] = rand() % 256; \
+        } \
+      } \
+      /* zero both destinations so bytes outside the block compare equal */ \
+      memset (uDstAnchor, 0, sizeof (uint8_t) * MC_BUFF_HEIGHT * MC_BUFF_DST_STRIDE); \
+      memset (uDstTest, 0, sizeof (uint8_t) * MC_BUFF_HEIGHT * MC_BUFF_DST_STRIDE); \
+      /* reference copy vs. the function under test */ \
+      MCCopyAnchor (uSrcAnchor[0], MC_BUFF_SRC_STRIDE, uDstAnchor[0], MC_BUFF_DST_STRIDE, iW, iH); \
+      sMcFunc.pMcLumaFunc (uSrcTest[0], MC_BUFF_SRC_STRIDE, uDstTest[0], MC_BUFF_DST_STRIDE, 0, 0, iW, iH); \
+      /* compare the full buffers, not just the iW x iH block */ \
+      for (int32_t iRow = 0; iRow < MC_BUFF_HEIGHT; iRow++) { \
+        for (int32_t iCol = 0; iCol < MC_BUFF_DST_STRIDE; iCol++) { \
+          ASSERT_EQ (uDstAnchor[iRow][iCol], uDstTest[iRow][iCol]); \
+        } \
+      } \
+    } \
+}
+
+DEF_MCCOPYTEST (2, 2)
+DEF_MCCOPYTEST (2, 4)
+DEF_MCCOPYTEST (4, 2)
+DEF_MCCOPYTEST (4, 4)
+DEF_MCCOPYTEST (4, 8)
+DEF_MCCOPYTEST (8, 4)
+DEF_MCCOPYTEST (8, 8)
+DEF_MCCOPYTEST (16, 8)
+DEF_MCCOPYTEST (8, 16)
+DEF_MCCOPYTEST (16, 16)
+
+#define DEF_LUMA_MCTEST(iW,iH) /* Generates gtest case McHorVer.<iW>x<iH>: compares sMcFunc.pMcLumaFunc with the anchor routines (MCHalfPelFilterAnchor + MCLumaAnchor) for every (a, b) in 0..3 x 0..3. */ \
+TEST(McHorVer,iW##x##iH)  \
+{                       \
+    for (int32_t a = 0; a < 4; a++) { /* a, b are passed just before iW, iH to both routines; presumably the quarter-pel MV fractions - confirm */ \
+    for (int32_t b = 0; b < 4; b++) { \
+    SMcFunc sMcFunc;  \
+    uint8_t uSrcAnchor[4][MC_BUFF_HEIGHT][MC_BUFF_SRC_STRIDE]; /* plane 0: random source; planes 1..3: handed to MCHalfPelFilterAnchor as its three output planes */ \
+    uint8_t uSrcTest[MC_BUFF_HEIGHT][MC_BUFF_SRC_STRIDE];      \
+    ENFORCE_STACK_ALIGN_2D(uint8_t, uDstAnchor, MC_BUFF_HEIGHT, MC_BUFF_DST_STRIDE, 16); \
+    ENFORCE_STACK_ALIGN_2D(uint8_t, uDstTest, MC_BUFF_HEIGHT, MC_BUFF_DST_STRIDE, 16); \
+    uint8_t* uSrcInputAnchor[4];                              \
+    int16_t pBuf[MC_BUFF_DST_STRIDE]; /* scratch row handed (offset by 4) to MCHalfPelFilterAnchor */ \
+    uSrcInputAnchor[0] = &uSrcAnchor[0][4][4]; /* every plane is addressed from row 4, column 4, leaving a margin before the block */ \
+    uSrcInputAnchor[1] = &uSrcAnchor[1][4][4]; \
+    uSrcInputAnchor[2] = &uSrcAnchor[2][4][4]; \
+    uSrcInputAnchor[3] = &uSrcAnchor[3][4][4]; \
+    for(int32_t j=0;j<MC_BUFF_HEIGHT;j++)   \
+    {\
+      for(int32_t i=0;i<MC_BUFF_SRC_STRIDE;i++)   \
+      {\
+        uSrcAnchor[0][j][i] = uSrcTest[j][i] = rand()%256; /* identical random bytes for the anchor source and the tested source */ \
+      }\
+    }\
+    int32_t iCpuCores = 1; \
+    uint32_t uiCpuFlag;\
+    for(int32_t k =0; k<2; k++) /* k==0: CPU flag 0; k==1: flags reported by WelsCPUFeatureDetect */ \
+    {\
+      if(k==0)\
+      {\
+        uiCpuFlag = 0;\
+      }else \
+      {\
+        uiCpuFlag = WelsCPUFeatureDetect (&iCpuCores); \
+      }\
+      InitMcFunc(&sMcFunc,uiCpuFlag);\
+      memset(uDstAnchor,0,sizeof(uint8_t)*MC_BUFF_HEIGHT*MC_BUFF_DST_STRIDE); /* both destinations start zeroed so bytes neither routine writes compare equal */ \
+      memset(uDstTest,0,sizeof(uint8_t)*MC_BUFF_HEIGHT*MC_BUFF_DST_STRIDE); \
+      MCHalfPelFilterAnchor(uSrcInputAnchor[1],uSrcInputAnchor[2],uSrcInputAnchor[3],uSrcInputAnchor[0],MC_BUFF_SRC_STRIDE,iW+1,iH+1,pBuf+4); \
+      MCLumaAnchor(uDstAnchor[0],MC_BUFF_DST_STRIDE,uSrcInputAnchor,MC_BUFF_SRC_STRIDE,a,b,iW,iH); \
+      sMcFunc.pMcLumaFunc(&uSrcTest[4][4],MC_BUFF_SRC_STRIDE,uDstTest[0],MC_BUFF_DST_STRIDE,a,b,iW,iH);\
+      for(int32_t j=0;j<MC_BUFF_HEIGHT;j++) /* compare the whole buffers, not only the iW x iH block */ \
+      {                                                                             \
+          for(int32_t i=0;i<MC_BUFF_DST_STRIDE;i++)                                  \
+          {                                                                           \
+              ASSERT_EQ(uDstAnchor[j][i],uDstTest[j][i]);                              \
+          }                                                                             \
+      }                                                                                \
+    }\
+    }\
+    }\
+}
+
+
+DEF_LUMA_MCTEST (4, 4) /* each line below expands DEF_LUMA_MCTEST once for one (iW, iH) pair */
+DEF_LUMA_MCTEST (4, 8)
+DEF_LUMA_MCTEST (8, 4)
+DEF_LUMA_MCTEST (8, 8)
+DEF_LUMA_MCTEST (16, 8)
+DEF_LUMA_MCTEST (8, 16)
+DEF_LUMA_MCTEST (16, 16)
+
+#define DEF_CHROMA_MCTEST(iW,iH) /* Generates gtest case McChroma.<iW>x<iH>: compares sMcFunc.pMcChromaFunc with MCChromaAnchor for every (a, b) in 0..7 x 0..7. */ \
+TEST(McChroma,iW##x##iH)  \
+{                       \
+    for (int32_t a = 0; a < 8; a++) { /* a, b are passed just before iW, iH to both routines; presumably the eighth-pel MV fractions - confirm */ \
+    for (int32_t b = 0; b < 8; b++) { \
+    SMcFunc sMcFunc;  \
+    uint8_t uSrcAnchor[MC_BUFF_HEIGHT][MC_BUFF_SRC_STRIDE*2]; /* double-width rows: test bytes sit in the even columns, and MC_BUFF_SRC_STRIDE*2 is the stride given to MCChromaAnchor */ \
+    uint8_t uSrcTest[MC_BUFF_HEIGHT][MC_BUFF_SRC_STRIDE];      \
+    ENFORCE_STACK_ALIGN_2D(uint8_t, uDstAnchor1, MC_BUFF_HEIGHT, MC_BUFF_DST_STRIDE, 16); \
+    ENFORCE_STACK_ALIGN_2D(uint8_t, uDstAnchor2, MC_BUFF_HEIGHT, MC_BUFF_DST_STRIDE, 16); \
+    ENFORCE_STACK_ALIGN_2D(uint8_t, uDstTest, MC_BUFF_HEIGHT, MC_BUFF_DST_STRIDE, 16); \
+    for(int32_t j=0;j<MC_BUFF_HEIGHT;j++)   \
+    {\
+      for(int32_t i=0;i<MC_BUFF_SRC_STRIDE;i++)   \
+      {\
+        uSrcAnchor[j][i*2] = uSrcAnchor[j][i*2+1] = uSrcTest[j][i] = rand()%256; /* odd columns are filled too: MCChromaAnchor derives uDstAnchor2 from them and must not read uninitialized bytes */ \
+      }\
+    }\
+    int32_t iCpuCores = 1; \
+    uint32_t uiCpuFlag;\
+    for(int32_t k =0; k<2; k++) /* k==0: CPU flag 0; k==1: flags reported by WelsCPUFeatureDetect */ \
+    {\
+      if(k==0)\
+      {\
+        uiCpuFlag = 0;\
+      }else \
+      {\
+        uiCpuFlag = WelsCPUFeatureDetect (&iCpuCores); \
+      }\
+      InitMcFunc(&sMcFunc,uiCpuFlag);\
+      memset(uDstAnchor1,0,sizeof(uint8_t)*MC_BUFF_HEIGHT*MC_BUFF_DST_STRIDE); /* all destinations start zeroed so bytes neither routine writes compare equal */ \
+      memset(uDstAnchor2,0,sizeof(uint8_t)*MC_BUFF_HEIGHT*MC_BUFF_DST_STRIDE); \
+      memset(uDstTest,0,sizeof(uint8_t)*MC_BUFF_HEIGHT*MC_BUFF_DST_STRIDE);     \
+      MCChromaAnchor(uDstAnchor1[0],uDstAnchor2[0],MC_BUFF_DST_STRIDE,uSrcAnchor[0],MC_BUFF_SRC_STRIDE*2,a,b,iW,iH); \
+      sMcFunc.pMcChromaFunc(uSrcTest[0],MC_BUFF_SRC_STRIDE,uDstTest[0],MC_BUFF_DST_STRIDE,a,b,iW,iH);\
+      for(int32_t j=0;j<MC_BUFF_HEIGHT;j++) /* only uDstAnchor1 is compared, over the whole buffer; uDstAnchor2 is computed but not checked */ \
+      {                                                                             \
+          for(int32_t i=0;i<MC_BUFF_DST_STRIDE;i++)                                  \
+          {                                                                           \
+              ASSERT_EQ(uDstAnchor1[j][i],uDstTest[j][i]);                             \
+          }                                                                             \
+      }                                                                                 \
+    }\
+    }\
+    }\
+}
+
+DEF_CHROMA_MCTEST (2, 2) /* each line below expands DEF_CHROMA_MCTEST once for one (iW, iH) pair */
+DEF_CHROMA_MCTEST (2, 4)
+DEF_CHROMA_MCTEST (4, 2)
+DEF_CHROMA_MCTEST (4, 4)
+DEF_CHROMA_MCTEST (4, 8)
+DEF_CHROMA_MCTEST (8, 4)
+DEF_CHROMA_MCTEST (8, 8)
+
 TEST (EncMcAvg, PixelAvg) {
   SMcFunc sMcFunc;
   for (int32_t k = 0; k < 2; k++) {
@@ -39,7 +290,7 @@
       int32_t width = 8 << w;
       int32_t height = 16;
       uint32_t uiCpuFlag = k == 0 ? 0 : WelsCPUFeatureDetect (NULL);
-      WelsInitMcFuncs (&sMcFunc, uiCpuFlag);
+      InitMcFunc (&sMcFunc, uiCpuFlag);
       uint8_t uSrc1[MC_BUFF_HEIGHT][MC_BUFF_SRC_STRIDE];
       uint8_t uSrc2[MC_BUFF_HEIGHT][MC_BUFF_SRC_STRIDE];
       ENFORCE_STACK_ALIGN_2D (uint8_t, uDstAnchor, MC_BUFF_HEIGHT, MC_BUFF_DST_STRIDE, 16);
@@ -52,8 +303,8 @@
       }
       PixelAvgAnchor (uDstAnchor[0], MC_BUFF_DST_STRIDE, uSrc1[0], MC_BUFF_SRC_STRIDE, uSrc2[0], MC_BUFF_SRC_STRIDE, width,
                       height);
-      sMcFunc.pfSampleAveraging[w] (uDstTest[0], MC_BUFF_DST_STRIDE, uSrc1[0], MC_BUFF_SRC_STRIDE, uSrc2[0],
-                                    MC_BUFF_SRC_STRIDE, height);
+      sMcFunc.pfSampleAveraging (uDstTest[0], MC_BUFF_DST_STRIDE, uSrc1[0], MC_BUFF_SRC_STRIDE, uSrc2[0],
+                                 MC_BUFF_SRC_STRIDE, width, height);
       for (int32_t j = 0; j < height; j++) {
         for (int32_t i = 0; i < width; i++) {
           ASSERT_EQ (uDstAnchor[j][i], uDstTest[j][i]);
@@ -88,7 +339,7 @@
       }
 
       uint32_t uiCpuFlag = k == 0 ? 0 : WelsCPUFeatureDetect (NULL);
-      WelsInitMcFuncs (&sMcFunc, uiCpuFlag);
+      InitMcFunc (&sMcFunc, uiCpuFlag);
 
       MCHalfPelFilterAnchor (uAnchors[1], uAnchors[2], uAnchors[3], uAnchors[0], MC_BUFF_SRC_STRIDE, width + 1, height + 1, pBuf + 4);
       sMcFunc.pfLumaHalfpelHor (&uSrcTest[4][4], MC_BUFF_SRC_STRIDE, uDstTest[0], MC_BUFF_DST_STRIDE, width + 1, height);
--- a/test/mc_test_common.h
+++ /dev/null
@@ -1,249 +1,0 @@
-#define MC_BUFF_SRC_STRIDE 32
-#define MC_BUFF_DST_STRIDE 32
-#define MC_BUFF_HEIGHT 30
-
-/**********************MC Unit Test Anchor Code Begin******************************/
-static bool bQpelNeeded[4][4] = {
-  { false, true, false, true },
-  { true,  true,  true, true },
-  { false, true, false, true },
-  { true,  true,  true, true }
-};
-static int32_t iHpelRef0Array[4][4] = {
-  { 0, 1, 1, 1 },
-  { 0, 1, 1, 1 },
-  { 2, 3, 3, 3 },
-  { 0, 1, 1, 1 }
-};
-static int32_t iHpelRef1Array[4][4] = {
-  { 0, 0, 0, 0 },
-  { 2, 2, 3, 2 },
-  { 2, 2, 3, 2 },
-  { 2, 2, 3, 2 }
-};
-#define FILTER6TAP(pPixBuff, x, iStride) ((pPixBuff)[x-2*iStride] + (pPixBuff)[x+3*iStride] - 5*((pPixBuff)[x-iStride] + (pPixBuff)[x+2*iStride]) + 20*((pPixBuff)[x] + (pPixBuff)[x+iStride]))
-static inline uint8_t Clip255 (int32_t x) {
-  return ((x & ~255) ? (-x) >> 31 & 255 : x);
-}
-
-static void MCCopyAnchor (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                          int32_t iHeight) {
-  for (int32_t y = 0; y < iHeight; y++) {
-    memcpy (pDst, pSrc, iWidth * sizeof (uint8_t));
-    pSrc += iSrcStride;
-    pDst += iDstStride;
-  }
-}
-
-static void MCHalfPelFilterAnchor (uint8_t* pDstH, uint8_t* pDstV, uint8_t* pDstHV, uint8_t* pSrc,
-                                   int32_t iStride, int32_t iWidth, int32_t iHeight, int16_t* pBuf) {
-  for (int32_t y = 0; y < iHeight; y++) {
-    for (int32_t x = 0; x < iWidth; x++)
-      pDstH[x] = Clip255 ((FILTER6TAP (pSrc, x, 1) + 16) >> 5);
-    for (int32_t x = -2; x < iWidth + 3; x++) {
-      int32_t v = FILTER6TAP (pSrc, x, iStride);
-      pDstV[x] = Clip255 ((v + 16) >> 5);
-      pBuf[x + 2] = v;
-    }
-    for (int32_t x = 0; x < iWidth; x++)
-      pDstHV[x] = Clip255 ((FILTER6TAP (pBuf + 2, x, 1) + 512) >> 10);
-    pDstH += iStride;
-    pDstV += iStride;
-    pDstHV += iStride;
-    pSrc += iStride;
-  }
-}
-
-static void PixelAvgAnchor (uint8_t* pDst,  int32_t iDstStride,
-                            uint8_t* pSrc1, int32_t iSrc1Stride,
-                            uint8_t* pSrc2, int32_t iSrc2Stride, int32_t iWidth, int32_t iHeight) {
-  for (int32_t y = 0; y < iHeight; y++) {
-    for (int32_t x = 0; x < iWidth; x++)
-      pDst[x] = (pSrc1[x] + pSrc2[x] + 1) >> 1;
-    pDst  += iDstStride;
-    pSrc1 += iSrc1Stride;
-    pSrc2 += iSrc2Stride;
-  }
-}
-
-static void MCLumaAnchor (uint8_t* pDst, int32_t iDstStride, uint8_t* pSrc[4], int32_t iSrcStride,
-                          int32_t iMvX, int32_t iMvY, int32_t iWidth, int32_t iHeight) {
-  int32_t iMvXIdx = iMvX & 3;
-  int32_t iMvYIdx = iMvY & 3;
-  int32_t iOffset = (iMvY >> 2) * iSrcStride + (iMvX >> 2);
-  uint8_t* pSrc1 = pSrc[iHpelRef0Array[iMvYIdx][iMvXIdx]] + iOffset + ((iMvYIdx) == 3) * iSrcStride;
-
-  if (bQpelNeeded[iMvYIdx][iMvXIdx]) {
-    uint8_t* pSrc2 = pSrc[iHpelRef1Array[iMvYIdx][iMvXIdx]] + iOffset + ((iMvXIdx) == 3);
-    PixelAvgAnchor (pDst, iDstStride, pSrc1, iSrcStride, pSrc2, iSrcStride, iWidth, iHeight);
-  } else {
-    MCCopyAnchor (pSrc1, iSrcStride, pDst, iDstStride, iWidth, iHeight);
-  }
-}
-
-static void MCChromaAnchor (uint8_t* pDstU, uint8_t* pDstV, int32_t iDstStride, uint8_t* pSrc, int32_t iSrcStride,
-                            int32_t iMvX, int32_t iMvY, int32_t iWidth, int32_t iHeight) {
-  uint8_t* pSrcTmp;
-  pSrc += (iMvY >> 3) * iSrcStride + (iMvX >> 3) * 2;
-  pSrcTmp = &pSrc[iSrcStride];
-
-  int32_t iMvXIdx = iMvX & 0x07;
-  int32_t iMvYIdx = iMvY & 0x07;
-  int32_t iBiPara0 = (8 - iMvXIdx) * (8 - iMvYIdx);
-  int32_t iBiPara1 = iMvXIdx    * (8 - iMvYIdx);
-  int32_t iBiPara2 = (8 - iMvXIdx) * iMvYIdx;
-  int32_t iBiPara3 = iMvXIdx    * iMvYIdx;
-  for (int32_t y = 0; y < iHeight; y++) {
-    for (int32_t x = 0; x < iWidth; x++) {
-      pDstU[x] = (iBiPara0 * pSrc[2 * x]  + iBiPara1 * pSrc[2 * x + 2] +
-                  iBiPara2 * pSrcTmp[2 * x] + iBiPara3 * pSrcTmp[2 * x + 2] + 32) >> 6;
-      pDstV[x] = (iBiPara0 * pSrc[2 * x + 1]  + iBiPara1 * pSrc[2 * x + 3] +
-                  iBiPara2 * pSrcTmp[2 * x + 1] + iBiPara3 * pSrcTmp[2 * x + 3] + 32) >> 6;
-    }
-    pSrc   = pSrcTmp;
-    pSrcTmp += iSrcStride;
-    pDstU += iDstStride;
-    pDstV += iDstStride;
-  }
-}
-
-/**********************MC Unit Test OPENH264 Code Begin******************************/
-#define DEF_MCCOPYTEST(pfx, iW,iH) \
-TEST(pfx##McCopy_c,iW##x##iH) \
-{                             \
-    SMcFunc sMcFunc;      \
-    int32_t iCpuCores = 1; \
-    uint32_t uiCpuFlag;\
-    for(int32_t k =0; k<2; k++)\
-    {\
-      if(k==0)\
-      {\
-        uiCpuFlag = 0;\
-      }else \
-      {\
-        uiCpuFlag = WelsCPUFeatureDetect (&iCpuCores); \
-      }\
-      InitMcFunc(&sMcFunc, uiCpuFlag); \
-      uint8_t uSrcAnchor[MC_BUFF_HEIGHT][MC_BUFF_SRC_STRIDE]; \
-      uint8_t uSrcTest[MC_BUFF_HEIGHT][MC_BUFF_SRC_STRIDE];    \
-      ENFORCE_STACK_ALIGN_2D(uint8_t, uDstAnchor, MC_BUFF_HEIGHT, MC_BUFF_DST_STRIDE, 16); \
-      ENFORCE_STACK_ALIGN_2D(uint8_t, uDstTest, MC_BUFF_HEIGHT, MC_BUFF_DST_STRIDE, 16); \
-      for(int32_t j=0;j<MC_BUFF_HEIGHT;j++)                    \
-      {                                                         \
-        for(int32_t i=0;i<MC_BUFF_SRC_STRIDE;i++)                  \
-        {                                                       \
-          uSrcAnchor[j][i] = uSrcTest[j][i] = rand()%256;      \
-        }                                                         \
-      }                                                              \
-      memset(uDstAnchor,0,sizeof(uint8_t)*MC_BUFF_HEIGHT*MC_BUFF_DST_STRIDE);\
-      memset(uDstTest,0,sizeof(uint8_t)*MC_BUFF_HEIGHT*MC_BUFF_DST_STRIDE);  \
-      MCCopyAnchor(uSrcAnchor[0],MC_BUFF_SRC_STRIDE,uDstAnchor[0],MC_BUFF_DST_STRIDE,iW,iH);   \
-      LUMA_FUNC(&sMcFunc,uSrcTest[0],MC_BUFF_SRC_STRIDE,uDstTest[0],MC_BUFF_DST_STRIDE,0,0,iW,iH); \
-      for(int32_t j=0;j<MC_BUFF_HEIGHT;j++)   \
-      {                                                                             \
-        for(int32_t i=0;i<MC_BUFF_DST_STRIDE;i++)                                  \
-        {                                                                           \
-          ASSERT_EQ(uDstAnchor[j][i],uDstTest[j][i]);                              \
-        }                                                                             \
-      }                                                                                 \
-    }\
-}
-
-#define DEF_LUMA_MCTEST(pfx,iW,iH) \
-TEST(pfx##McHorVer,iW##x##iH)  \
-{                       \
-    for (int32_t a = 0; a < 4; a++) { \
-    for (int32_t b = 0; b < 4; b++) { \
-    SMcFunc sMcFunc;  \
-    uint8_t uSrcAnchor[4][MC_BUFF_HEIGHT][MC_BUFF_SRC_STRIDE]; \
-    uint8_t uSrcTest[MC_BUFF_HEIGHT][MC_BUFF_SRC_STRIDE];      \
-    ENFORCE_STACK_ALIGN_2D(uint8_t, uDstAnchor, MC_BUFF_HEIGHT, MC_BUFF_DST_STRIDE, 16); \
-    ENFORCE_STACK_ALIGN_2D(uint8_t, uDstTest, MC_BUFF_HEIGHT, MC_BUFF_DST_STRIDE, 16); \
-    uint8_t* uSrcInputAnchor[4];                              \
-    int16_t pBuf[MC_BUFF_DST_STRIDE]; \
-    uSrcInputAnchor[0] = &uSrcAnchor[0][4][4]; \
-    uSrcInputAnchor[1] = &uSrcAnchor[1][4][4]; \
-    uSrcInputAnchor[2] = &uSrcAnchor[2][4][4]; \
-    uSrcInputAnchor[3] = &uSrcAnchor[3][4][4]; \
-    for(int32_t j=0;j<MC_BUFF_HEIGHT;j++)   \
-    {\
-      for(int32_t i=0;i<MC_BUFF_SRC_STRIDE;i++)   \
-      {\
-        uSrcAnchor[0][j][i] = uSrcTest[j][i] = rand()%256;  \
-      }\
-    }\
-    int32_t iCpuCores = 1; \
-    uint32_t uiCpuFlag;\
-    for(int32_t k =0; k<2; k++)\
-    {\
-      if(k==0)\
-      {\
-        uiCpuFlag = 0;\
-      }else \
-      {\
-        uiCpuFlag = WelsCPUFeatureDetect (&iCpuCores); \
-      }\
-      InitMcFunc(&sMcFunc,uiCpuFlag);\
-      memset(uDstAnchor,0,sizeof(uint8_t)*MC_BUFF_HEIGHT*MC_BUFF_DST_STRIDE); \
-      memset(uDstTest,0,sizeof(uint8_t)*MC_BUFF_HEIGHT*MC_BUFF_DST_STRIDE); \
-      MCHalfPelFilterAnchor(uSrcInputAnchor[1],uSrcInputAnchor[2],uSrcInputAnchor[3],uSrcInputAnchor[0],MC_BUFF_SRC_STRIDE,iW+1,iH+1,pBuf+4); \
-      MCLumaAnchor(uDstAnchor[0],MC_BUFF_DST_STRIDE,uSrcInputAnchor,MC_BUFF_SRC_STRIDE,a,b,iW,iH); \
-      LUMA_FUNC(&sMcFunc,&uSrcTest[4][4],MC_BUFF_SRC_STRIDE,uDstTest[0],MC_BUFF_DST_STRIDE,a,b,iW,iH);\
-      for(int32_t j=0;j<MC_BUFF_HEIGHT;j++)   \
-      {                                                                             \
-          for(int32_t i=0;i<MC_BUFF_DST_STRIDE;i++)                                  \
-          {                                                                           \
-              ASSERT_EQ(uDstAnchor[j][i],uDstTest[j][i]);                              \
-          }                                                                             \
-      }                                                                                \
-    }\
-    }\
-    }\
-}
-
-#define DEF_CHROMA_MCTEST(pfx,iW,iH) \
-TEST(pfx##McChroma,iW##x##iH)  \
-{                       \
-    for (int32_t a = 0; a < 8; a++) { \
-    for (int32_t b = 0; b < 8; b++) { \
-    SMcFunc sMcFunc;  \
-    uint8_t uSrcAnchor[MC_BUFF_HEIGHT][MC_BUFF_SRC_STRIDE*2]; \
-    uint8_t uSrcTest[MC_BUFF_HEIGHT][MC_BUFF_SRC_STRIDE];      \
-    ENFORCE_STACK_ALIGN_2D(uint8_t, uDstAnchor1, MC_BUFF_HEIGHT, MC_BUFF_DST_STRIDE, 16); \
-    ENFORCE_STACK_ALIGN_2D(uint8_t, uDstAnchor2, MC_BUFF_HEIGHT, MC_BUFF_DST_STRIDE, 16); \
-    ENFORCE_STACK_ALIGN_2D(uint8_t, uDstTest, MC_BUFF_HEIGHT, MC_BUFF_DST_STRIDE, 16); \
-    for(int32_t j=0;j<MC_BUFF_HEIGHT;j++)   \
-    {\
-      for(int32_t i=0;i<MC_BUFF_SRC_STRIDE;i++)   \
-      {\
-        uSrcAnchor[j][i*2] = uSrcTest[j][i] = rand()%256;  \
-      }\
-    }\
-    int32_t iCpuCores = 1; \
-    uint32_t uiCpuFlag;\
-    for(int32_t k =0; k<2; k++)\
-    {\
-      if(k==0)\
-      {\
-        uiCpuFlag = 0;\
-      }else \
-      {\
-        uiCpuFlag = WelsCPUFeatureDetect (&iCpuCores); \
-      }\
-      InitMcFunc(&sMcFunc,uiCpuFlag);\
-      memset(uDstAnchor1,0,sizeof(uint8_t)*MC_BUFF_HEIGHT*MC_BUFF_DST_STRIDE); \
-      memset(uDstAnchor2,0,sizeof(uint8_t)*MC_BUFF_HEIGHT*MC_BUFF_DST_STRIDE); \
-      memset(uDstTest,0,sizeof(uint8_t)*MC_BUFF_HEIGHT*MC_BUFF_DST_STRIDE);     \
-      MCChromaAnchor(uDstAnchor1[0],uDstAnchor2[0],MC_BUFF_DST_STRIDE,uSrcAnchor[0],MC_BUFF_SRC_STRIDE*2,a,b,iW,iH); \
-      CHROMA_FUNC(uSrcTest[0],MC_BUFF_SRC_STRIDE,uDstTest[0],MC_BUFF_DST_STRIDE,a,b,iW,iH);\
-      for(int32_t j=0;j<MC_BUFF_HEIGHT;j++)   \
-      {                                                                             \
-          for(int32_t i=0;i<MC_BUFF_DST_STRIDE;i++)                                  \
-          {                                                                           \
-              ASSERT_EQ(uDstAnchor1[j][i],uDstTest[j][i]);                             \
-          }                                                                             \
-      }                                                                                 \
-    }\
-    }\
-    }\
-}